]> git.ipfire.org Git - thirdparty/gcc.git/blame - gcc/config/i386/i386-expand.cc
x86: Handle V16BF in ix86_avx256_split_vector_move_misalign
[thirdparty/gcc.git] / gcc / config / i386 / i386-expand.cc
CommitLineData
7adcbafe 1/* Copyright (C) 1988-2022 Free Software Foundation, Inc.
2bf6d935
ML
2
3This file is part of GCC.
4
5GCC is free software; you can redistribute it and/or modify
6it under the terms of the GNU General Public License as published by
7the Free Software Foundation; either version 3, or (at your option)
8any later version.
9
10GCC is distributed in the hope that it will be useful,
11but WITHOUT ANY WARRANTY; without even the implied warranty of
12MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13GNU General Public License for more details.
14
15You should have received a copy of the GNU General Public License
16along with GCC; see the file COPYING3. If not see
17<http://www.gnu.org/licenses/>. */
18
19#define IN_TARGET_CODE 1
20
21#include "config.h"
22#include "system.h"
23#include "coretypes.h"
24#include "backend.h"
25#include "rtl.h"
26#include "tree.h"
27#include "memmodel.h"
28#include "gimple.h"
29#include "cfghooks.h"
30#include "cfgloop.h"
31#include "df.h"
32#include "tm_p.h"
33#include "stringpool.h"
34#include "expmed.h"
35#include "optabs.h"
36#include "regs.h"
37#include "emit-rtl.h"
38#include "recog.h"
39#include "cgraph.h"
40#include "diagnostic.h"
41#include "cfgbuild.h"
42#include "alias.h"
43#include "fold-const.h"
44#include "attribs.h"
45#include "calls.h"
46#include "stor-layout.h"
47#include "varasm.h"
48#include "output.h"
49#include "insn-attr.h"
50#include "flags.h"
51#include "except.h"
52#include "explow.h"
53#include "expr.h"
54#include "cfgrtl.h"
55#include "common/common-target.h"
56#include "langhooks.h"
57#include "reload.h"
58#include "gimplify.h"
59#include "dwarf2.h"
60#include "tm-constrs.h"
2bf6d935
ML
61#include "cselib.h"
62#include "sched-int.h"
63#include "opts.h"
64#include "tree-pass.h"
65#include "context.h"
66#include "pass_manager.h"
67#include "target-globals.h"
68#include "gimple-iterator.h"
69#include "tree-vectorizer.h"
70#include "shrink-wrap.h"
71#include "builtins.h"
72#include "rtl-iter.h"
73#include "tree-iterator.h"
74#include "dbgcnt.h"
75#include "case-cfn-macros.h"
76#include "dojump.h"
77#include "fold-const-call.h"
78#include "tree-vrp.h"
79#include "tree-ssanames.h"
80#include "selftest.h"
81#include "selftest-rtl.h"
82#include "print-rtl.h"
83#include "intl.h"
84#include "ifcvt.h"
85#include "symbol-summary.h"
86#include "ipa-prop.h"
87#include "ipa-fnsummary.h"
88#include "wide-int-bitmask.h"
89#include "tree-vector-builder.h"
90#include "debug.h"
91#include "dwarf2out.h"
92#include "i386-options.h"
93#include "i386-builtins.h"
94#include "i386-expand.h"
95
96/* Split one or more double-mode RTL references into pairs of half-mode
97 references. The RTL can be REG, offsettable MEM, integer constant, or
98 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
99 split and "num" is its length. lo_half and hi_half are output arrays
100 that parallel "operands". */
101
102void
103split_double_mode (machine_mode mode, rtx operands[],
104 int num, rtx lo_half[], rtx hi_half[])
105{
106 machine_mode half_mode;
107 unsigned int byte;
deeedbad
JJ
108 rtx mem_op = NULL_RTX;
109 int mem_num = 0;
2bf6d935
ML
110
111 switch (mode)
112 {
113 case E_TImode:
114 half_mode = DImode;
115 break;
116 case E_DImode:
117 half_mode = SImode;
118 break;
58d6eea0 119 case E_P2HImode:
120 half_mode = HImode;
121 break;
122 case E_P2QImode:
123 half_mode = QImode;
124 break;
2bf6d935
ML
125 default:
126 gcc_unreachable ();
127 }
128
129 byte = GET_MODE_SIZE (half_mode);
130
131 while (num--)
132 {
133 rtx op = operands[num];
134
135 /* simplify_subreg refuse to split volatile memory addresses,
136 but we still have to handle it. */
137 if (MEM_P (op))
138 {
deeedbad
JJ
139 if (mem_op && rtx_equal_p (op, mem_op))
140 {
141 lo_half[num] = lo_half[mem_num];
142 hi_half[num] = hi_half[mem_num];
143 }
144 else
145 {
146 mem_op = op;
147 mem_num = num;
148 lo_half[num] = adjust_address (op, half_mode, 0);
149 hi_half[num] = adjust_address (op, half_mode, byte);
150 }
2bf6d935
ML
151 }
152 else
153 {
154 lo_half[num] = simplify_gen_subreg (half_mode, op,
155 GET_MODE (op) == VOIDmode
156 ? mode : GET_MODE (op), 0);
d39fbed7
UB
157
158 rtx tmp = simplify_gen_subreg (half_mode, op,
159 GET_MODE (op) == VOIDmode
160 ? mode : GET_MODE (op), byte);
161 /* simplify_gen_subreg will return NULL RTX for the
162 high half of the paradoxical subreg. */
163 hi_half[num] = tmp ? tmp : gen_reg_rtx (half_mode);
2bf6d935
ML
164 }
165 }
166}
167
16aafa31
RS
168/* Emit the double word assignment DST = { LO, HI }. */
169
170void
171split_double_concat (machine_mode mode, rtx dst, rtx lo, rtx hi)
172{
173 rtx dlo, dhi;
174 int deleted_move_count = 0;
175 split_double_mode (mode, &dst, 1, &dlo, &dhi);
176 if (!rtx_equal_p (dlo, hi))
177 {
178 if (!rtx_equal_p (dlo, lo))
179 emit_move_insn (dlo, lo);
180 else
181 deleted_move_count++;
182 if (!rtx_equal_p (dhi, hi))
183 emit_move_insn (dhi, hi);
184 else
185 deleted_move_count++;
186 }
187 else if (!rtx_equal_p (lo, dhi))
188 {
189 if (!rtx_equal_p (dhi, hi))
190 emit_move_insn (dhi, hi);
191 else
192 deleted_move_count++;
193 if (!rtx_equal_p (dlo, lo))
194 emit_move_insn (dlo, lo);
195 else
196 deleted_move_count++;
197 }
198 else if (mode == TImode)
199 emit_insn (gen_swapdi (dlo, dhi));
200 else
201 emit_insn (gen_swapsi (dlo, dhi));
202
203 if (deleted_move_count == 2)
204 emit_note (NOTE_INSN_DELETED);
205}
206
207
2bf6d935
ML
208/* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
209 for the target. */
210
211void
212ix86_expand_clear (rtx dest)
213{
214 rtx tmp;
215
216 /* We play register width games, which are only valid after reload. */
217 gcc_assert (reload_completed);
218
219 /* Avoid HImode and its attendant prefix byte. */
220 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
221 dest = gen_rtx_REG (SImode, REGNO (dest));
222 tmp = gen_rtx_SET (dest, const0_rtx);
223
224 if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
225 {
226 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
227 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
228 }
229
230 emit_insn (tmp);
231}
232
edafb35b
L
233/* Return true if V can be broadcasted from an integer of WIDTH bits
234 which is returned in VAL_BROADCAST. Otherwise, return false. */
235
236static bool
237ix86_broadcast (HOST_WIDE_INT v, unsigned int width,
238 HOST_WIDE_INT &val_broadcast)
239{
240 wide_int val = wi::uhwi (v, HOST_BITS_PER_WIDE_INT);
241 val_broadcast = wi::extract_uhwi (val, 0, width);
242 for (unsigned int i = width; i < HOST_BITS_PER_WIDE_INT; i += width)
243 {
244 HOST_WIDE_INT each = wi::extract_uhwi (val, i, width);
245 if (val_broadcast != each)
246 return false;
247 }
248 val_broadcast = sext_hwi (val_broadcast, width);
249 return true;
250}
251
252/* Convert the CONST_WIDE_INT operand OP to broadcast in MODE. */
253
254static rtx
255ix86_convert_const_wide_int_to_broadcast (machine_mode mode, rtx op)
256{
257 /* Don't use integer vector broadcast if we can't move from GPR to SSE
258 register directly. */
259 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
260 return nullptr;
261
262 /* Convert CONST_WIDE_INT to a non-standard SSE constant integer
263 broadcast only if vector broadcast is available. */
264 if (!TARGET_AVX
265 || !CONST_WIDE_INT_P (op)
266 || standard_sse_constant_p (op, mode))
267 return nullptr;
268
269 HOST_WIDE_INT val = CONST_WIDE_INT_ELT (op, 0);
270 HOST_WIDE_INT val_broadcast;
271 scalar_int_mode broadcast_mode;
272 if (TARGET_AVX2
273 && ix86_broadcast (val, GET_MODE_BITSIZE (QImode),
274 val_broadcast))
275 broadcast_mode = QImode;
276 else if (TARGET_AVX2
277 && ix86_broadcast (val, GET_MODE_BITSIZE (HImode),
278 val_broadcast))
279 broadcast_mode = HImode;
280 else if (ix86_broadcast (val, GET_MODE_BITSIZE (SImode),
281 val_broadcast))
282 broadcast_mode = SImode;
283 else if (TARGET_64BIT
284 && ix86_broadcast (val, GET_MODE_BITSIZE (DImode),
285 val_broadcast))
286 broadcast_mode = DImode;
287 else
288 return nullptr;
289
290 /* Check if OP can be broadcasted from VAL. */
291 for (int i = 1; i < CONST_WIDE_INT_NUNITS (op); i++)
292 if (val != CONST_WIDE_INT_ELT (op, i))
293 return nullptr;
294
295 unsigned int nunits = (GET_MODE_SIZE (mode)
296 / GET_MODE_SIZE (broadcast_mode));
297 machine_mode vector_mode;
298 if (!mode_for_vector (broadcast_mode, nunits).exists (&vector_mode))
299 gcc_unreachable ();
300 rtx target = ix86_gen_scratch_sse_rtx (vector_mode);
301 bool ok = ix86_expand_vector_init_duplicate (false, vector_mode,
302 target,
303 GEN_INT (val_broadcast));
304 gcc_assert (ok);
305 target = lowpart_subreg (mode, target, vector_mode);
306 return target;
307}
308
2bf6d935
ML
309void
310ix86_expand_move (machine_mode mode, rtx operands[])
311{
312 rtx op0, op1;
313 rtx tmp, addend = NULL_RTX;
314 enum tls_model model;
315
316 op0 = operands[0];
317 op1 = operands[1];
318
be39636d
RS
319 /* Avoid complex sets of likely spilled hard registers before reload. */
320 if (!ix86_hardreg_mov_ok (op0, op1))
321 {
322 tmp = gen_reg_rtx (mode);
323 operands[0] = tmp;
324 ix86_expand_move (mode, operands);
325 operands[0] = op0;
326 operands[1] = tmp;
327 op1 = tmp;
328 }
329
2bf6d935
ML
330 switch (GET_CODE (op1))
331 {
332 case CONST:
333 tmp = XEXP (op1, 0);
334
335 if (GET_CODE (tmp) != PLUS
336 || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
337 break;
338
339 op1 = XEXP (tmp, 0);
340 addend = XEXP (tmp, 1);
341 /* FALLTHRU */
342
343 case SYMBOL_REF:
344 model = SYMBOL_REF_TLS_MODEL (op1);
345
346 if (model)
347 op1 = legitimize_tls_address (op1, model, true);
348 else if (ix86_force_load_from_GOT_p (op1))
349 {
350 /* Load the external function address via GOT slot to avoid PLT. */
351 op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
352 (TARGET_64BIT
353 ? UNSPEC_GOTPCREL
354 : UNSPEC_GOT));
355 op1 = gen_rtx_CONST (Pmode, op1);
356 op1 = gen_const_mem (Pmode, op1);
357 set_mem_alias_set (op1, ix86_GOT_alias_set ());
358 }
359 else
360 {
361 tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
362 if (tmp)
363 {
364 op1 = tmp;
365 if (!addend)
366 break;
367 }
368 else
369 {
370 op1 = operands[1];
371 break;
372 }
373 }
374
375 if (addend)
376 {
377 op1 = force_operand (op1, NULL_RTX);
378 op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
379 op0, 1, OPTAB_DIRECT);
380 }
381 else
382 op1 = force_operand (op1, op0);
383
384 if (op1 == op0)
385 return;
386
387 op1 = convert_to_mode (mode, op1, 1);
388
389 default:
390 break;
391 }
392
393 if ((flag_pic || MACHOPIC_INDIRECT)
394 && symbolic_operand (op1, mode))
395 {
396 if (TARGET_MACHO && !TARGET_64BIT)
397 {
398#if TARGET_MACHO
399 /* dynamic-no-pic */
400 if (MACHOPIC_INDIRECT)
401 {
402 rtx temp = (op0 && REG_P (op0) && mode == Pmode)
403 ? op0 : gen_reg_rtx (Pmode);
404 op1 = machopic_indirect_data_reference (op1, temp);
405 if (MACHOPIC_PURE)
406 op1 = machopic_legitimize_pic_address (op1, mode,
407 temp == op1 ? 0 : temp);
408 }
409 if (op0 != op1 && GET_CODE (op0) != MEM)
410 {
411 rtx insn = gen_rtx_SET (op0, op1);
412 emit_insn (insn);
413 return;
414 }
415 if (GET_CODE (op0) == MEM)
416 op1 = force_reg (Pmode, op1);
417 else
418 {
419 rtx temp = op0;
420 if (GET_CODE (temp) != REG)
421 temp = gen_reg_rtx (Pmode);
422 temp = legitimize_pic_address (op1, temp);
423 if (temp == op0)
424 return;
425 op1 = temp;
426 }
427 /* dynamic-no-pic */
428#endif
429 }
430 else
431 {
432 if (MEM_P (op0))
433 op1 = force_reg (mode, op1);
434 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
435 {
436 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
437 op1 = legitimize_pic_address (op1, reg);
438 if (op0 == op1)
439 return;
440 op1 = convert_to_mode (mode, op1, 1);
441 }
442 }
443 }
444 else
445 {
446 if (MEM_P (op0)
447 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
448 || !push_operand (op0, mode))
449 && MEM_P (op1))
450 op1 = force_reg (mode, op1);
451
452 if (push_operand (op0, mode)
453 && ! general_no_elim_operand (op1, mode))
454 op1 = copy_to_mode_reg (mode, op1);
455
456 /* Force large constants in 64bit compilation into register
457 to get them CSEed. */
458 if (can_create_pseudo_p ()
459 && (mode == DImode) && TARGET_64BIT
460 && immediate_operand (op1, mode)
461 && !x86_64_zext_immediate_operand (op1, VOIDmode)
462 && !register_operand (op0, mode)
463 && optimize)
464 op1 = copy_to_mode_reg (mode, op1);
465
edafb35b 466 if (can_create_pseudo_p ())
2bf6d935 467 {
edafb35b 468 if (CONST_DOUBLE_P (op1))
2bf6d935 469 {
edafb35b
L
470 /* If we are loading a floating point constant to a
471 register, force the value to memory now, since we'll
472 get better code out the back end. */
473
474 op1 = validize_mem (force_const_mem (mode, op1));
475 if (!register_operand (op0, mode))
476 {
477 rtx temp = gen_reg_rtx (mode);
478 emit_insn (gen_rtx_SET (temp, op1));
479 emit_move_insn (op0, temp);
480 return;
481 }
482 }
483 else if (GET_MODE_SIZE (mode) >= 16)
484 {
485 rtx tmp = ix86_convert_const_wide_int_to_broadcast
486 (GET_MODE (op0), op1);
487 if (tmp != nullptr)
488 op1 = tmp;
2bf6d935
ML
489 }
490 }
491 }
492
493 emit_insn (gen_rtx_SET (op0, op1));
494}
495
a6291d88 496/* OP is a memref of CONST_VECTOR, return scalar constant mem
497 if CONST_VECTOR is a vec_duplicate, else return NULL. */
edafb35b 498static rtx
a6291d88 499ix86_broadcast_from_constant (machine_mode mode, rtx op)
edafb35b
L
500{
501 int nunits = GET_MODE_NUNITS (mode);
502 if (nunits < 2)
503 return nullptr;
504
505 /* Don't use integer vector broadcast if we can't move from GPR to SSE
506 register directly. */
a6291d88 507 if (!TARGET_INTER_UNIT_MOVES_TO_VEC
508 && INTEGRAL_MODE_P (mode))
edafb35b
L
509 return nullptr;
510
511 /* Convert CONST_VECTOR to a non-standard SSE constant integer
512 broadcast only if vector broadcast is available. */
513 if (!(TARGET_AVX2
514 || (TARGET_AVX
515 && (GET_MODE_INNER (mode) == SImode
a6291d88 516 || GET_MODE_INNER (mode) == DImode))
517 || FLOAT_MODE_P (mode))
edafb35b
L
518 || standard_sse_constant_p (op, mode))
519 return nullptr;
520
a6291d88 521 /* Don't broadcast from a 64-bit integer constant in 32-bit mode.
522 We can still put 64-bit integer constant in memory when
523 avx512 embed broadcast is available. */
524 if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT
525 && (!TARGET_AVX512F
526 || (GET_MODE_SIZE (mode) < 64 && !TARGET_AVX512VL)))
edafb35b
L
527 return nullptr;
528
f7cad1a0
JJ
529 if (GET_MODE_INNER (mode) == TImode)
530 return nullptr;
531
edafb35b
L
532 rtx constant = get_pool_constant (XEXP (op, 0));
533 if (GET_CODE (constant) != CONST_VECTOR)
534 return nullptr;
535
536 /* There could be some rtx like
537 (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
538 but with "*.LC1" refer to V2DI constant vector. */
539 if (GET_MODE (constant) != mode)
540 {
541 constant = simplify_subreg (mode, constant, GET_MODE (constant),
542 0);
543 if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR)
544 return nullptr;
545 }
546
547 rtx first = XVECEXP (constant, 0, 0);
548
549 for (int i = 1; i < nunits; ++i)
550 {
551 rtx tmp = XVECEXP (constant, 0, i);
552 /* Vector duplicate value. */
553 if (!rtx_equal_p (tmp, first))
554 return nullptr;
555 }
556
557 return first;
558}
559
2bf6d935
ML
560void
561ix86_expand_vector_move (machine_mode mode, rtx operands[])
562{
563 rtx op0 = operands[0], op1 = operands[1];
564 /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
565 psABI since the biggest alignment is 4 byte for IA MCU psABI. */
566 unsigned int align = (TARGET_IAMCU
567 ? GET_MODE_BITSIZE (mode)
568 : GET_MODE_ALIGNMENT (mode));
569
570 if (push_operand (op0, VOIDmode))
571 op0 = emit_move_resolve_push (mode, op0);
572
573 /* Force constants other than zero into memory. We do not know how
574 the instructions used to build constants modify the upper 64 bits
575 of the register, once we have that information we may be able
576 to handle some of them more efficiently. */
577 if (can_create_pseudo_p ()
578 && (CONSTANT_P (op1)
579 || (SUBREG_P (op1)
580 && CONSTANT_P (SUBREG_REG (op1))))
581 && ((register_operand (op0, mode)
582 && !standard_sse_constant_p (op1, mode))
583 /* ix86_expand_vector_move_misalign() does not like constants. */
584 || (SSE_REG_MODE_P (mode)
585 && MEM_P (op0)
586 && MEM_ALIGN (op0) < align)))
587 {
588 if (SUBREG_P (op1))
589 {
590 machine_mode imode = GET_MODE (SUBREG_REG (op1));
591 rtx r = force_const_mem (imode, SUBREG_REG (op1));
592 if (r)
593 r = validize_mem (r);
594 else
595 r = force_reg (imode, SUBREG_REG (op1));
596 op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
597 }
598 else
edafb35b
L
599 {
600 machine_mode mode = GET_MODE (op0);
601 rtx tmp = ix86_convert_const_wide_int_to_broadcast
602 (mode, op1);
603 if (tmp == nullptr)
604 op1 = validize_mem (force_const_mem (mode, op1));
605 else
606 op1 = tmp;
607 }
608 }
609
610 if (can_create_pseudo_p ()
611 && GET_MODE_SIZE (mode) >= 16
a6291d88 612 && VECTOR_MODE_P (mode)
edafb35b
L
613 && (MEM_P (op1)
614 && SYMBOL_REF_P (XEXP (op1, 0))
615 && CONSTANT_POOL_ADDRESS_P (XEXP (op1, 0))))
616 {
a6291d88 617 rtx first = ix86_broadcast_from_constant (mode, op1);
edafb35b
L
618 if (first != nullptr)
619 {
620 /* Broadcast to XMM/YMM/ZMM register from an integer
a6291d88 621 constant or scalar mem. */
6e5401e8 622 op1 = gen_reg_rtx (mode);
a6291d88 623 if (FLOAT_MODE_P (mode)
624 || (!TARGET_64BIT && GET_MODE_INNER (mode) == DImode))
6e5401e8 625 first = force_const_mem (GET_MODE_INNER (mode), first);
edafb35b
L
626 bool ok = ix86_expand_vector_init_duplicate (false, mode,
627 op1, first);
628 gcc_assert (ok);
629 emit_move_insn (op0, op1);
630 return;
631 }
2bf6d935
ML
632 }
633
634 /* We need to check memory alignment for SSE mode since attribute
635 can make operands unaligned. */
636 if (can_create_pseudo_p ()
637 && SSE_REG_MODE_P (mode)
638 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
639 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
640 {
641 rtx tmp[2];
642
643 /* ix86_expand_vector_move_misalign() does not like both
644 arguments in memory. */
645 if (!register_operand (op0, mode)
646 && !register_operand (op1, mode))
09dba016
L
647 {
648 rtx scratch = ix86_gen_scratch_sse_rtx (mode);
649 emit_move_insn (scratch, op1);
650 op1 = scratch;
651 }
2bf6d935
ML
652
653 tmp[0] = op0; tmp[1] = op1;
654 ix86_expand_vector_move_misalign (mode, tmp);
655 return;
656 }
657
fad14a02
RS
658 /* Special case TImode to V1TImode conversions, via V2DI. */
659 if (mode == V1TImode
660 && SUBREG_P (op1)
661 && GET_MODE (SUBREG_REG (op1)) == TImode
662 && TARGET_64BIT && TARGET_SSE
663 && can_create_pseudo_p ())
664 {
665 rtx tmp = gen_reg_rtx (V2DImode);
666 rtx lo = gen_reg_rtx (DImode);
667 rtx hi = gen_reg_rtx (DImode);
668 emit_move_insn (lo, gen_lowpart (DImode, SUBREG_REG (op1)));
669 emit_move_insn (hi, gen_highpart (DImode, SUBREG_REG (op1)));
670 emit_insn (gen_vec_concatv2di (tmp, lo, hi));
671 emit_move_insn (op0, gen_lowpart (V1TImode, tmp));
672 return;
673 }
674
97c32001
RS
675 /* If operand0 is a hard register, make operand1 a pseudo. */
676 if (can_create_pseudo_p ()
677 && !ix86_hardreg_mov_ok (op0, op1))
678 {
679 rtx tmp = gen_reg_rtx (GET_MODE (op0));
680 emit_move_insn (tmp, op1);
681 emit_move_insn (op0, tmp);
682 return;
683 }
684
2bf6d935
ML
685 /* Make operand1 a register if it isn't already. */
686 if (can_create_pseudo_p ()
687 && !register_operand (op0, mode)
688 && !register_operand (op1, mode))
689 {
7f4c3943
L
690 rtx tmp = ix86_gen_scratch_sse_rtx (GET_MODE (op0));
691 emit_move_insn (tmp, op1);
692 emit_move_insn (op0, tmp);
2bf6d935
ML
693 return;
694 }
695
696 emit_insn (gen_rtx_SET (op0, op1));
697}
698
699/* Split 32-byte AVX unaligned load and store if needed. */
700
701static void
702ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
703{
704 rtx m;
705 rtx (*extract) (rtx, rtx, rtx);
706 machine_mode mode;
707
708 if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
709 || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
710 {
711 emit_insn (gen_rtx_SET (op0, op1));
712 return;
713 }
714
715 rtx orig_op0 = NULL_RTX;
716 mode = GET_MODE (op0);
717 switch (GET_MODE_CLASS (mode))
718 {
719 case MODE_VECTOR_INT:
720 case MODE_INT:
721 if (mode != V32QImode)
722 {
723 if (!MEM_P (op0))
724 {
725 orig_op0 = op0;
726 op0 = gen_reg_rtx (V32QImode);
727 }
728 else
729 op0 = gen_lowpart (V32QImode, op0);
730 op1 = gen_lowpart (V32QImode, op1);
731 mode = V32QImode;
732 }
733 break;
734 case MODE_VECTOR_FLOAT:
735 break;
736 default:
737 gcc_unreachable ();
738 }
739
740 switch (mode)
741 {
742 default:
743 gcc_unreachable ();
744 case E_V32QImode:
745 extract = gen_avx_vextractf128v32qi;
746 mode = V16QImode;
747 break;
60d1d296
L
748 case E_V16BFmode:
749 extract = gen_avx_vextractf128v16bf;
750 mode = V8BFmode;
751 break;
d959312b
L
752 case E_V16HFmode:
753 extract = gen_avx_vextractf128v16hf;
754 mode = V8HFmode;
755 break;
2bf6d935
ML
756 case E_V8SFmode:
757 extract = gen_avx_vextractf128v8sf;
758 mode = V4SFmode;
759 break;
760 case E_V4DFmode:
761 extract = gen_avx_vextractf128v4df;
762 mode = V2DFmode;
763 break;
764 }
765
766 if (MEM_P (op1))
767 {
768 rtx r = gen_reg_rtx (mode);
769 m = adjust_address (op1, mode, 0);
770 emit_move_insn (r, m);
771 m = adjust_address (op1, mode, 16);
772 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
773 emit_move_insn (op0, r);
774 }
775 else if (MEM_P (op0))
776 {
777 m = adjust_address (op0, mode, 0);
778 emit_insn (extract (m, op1, const0_rtx));
779 m = adjust_address (op0, mode, 16);
780 emit_insn (extract (m, copy_rtx (op1), const1_rtx));
781 }
782 else
783 gcc_unreachable ();
784
785 if (orig_op0)
786 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
787}
788
789/* Implement the movmisalign patterns for SSE. Non-SSE modes go
790 straight to ix86_expand_vector_move. */
791/* Code generation for scalar reg-reg moves of single and double precision data:
792 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
793 movaps reg, reg
794 else
795 movss reg, reg
796 if (x86_sse_partial_reg_dependency == true)
797 movapd reg, reg
798 else
799 movsd reg, reg
800
801 Code generation for scalar loads of double precision data:
802 if (x86_sse_split_regs == true)
803 movlpd mem, reg (gas syntax)
804 else
805 movsd mem, reg
806
807 Code generation for unaligned packed loads of single precision data
808 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
809 if (x86_sse_unaligned_move_optimal)
810 movups mem, reg
811
812 if (x86_sse_partial_reg_dependency == true)
813 {
814 xorps reg, reg
815 movlps mem, reg
816 movhps mem+8, reg
817 }
818 else
819 {
820 movlps mem, reg
821 movhps mem+8, reg
822 }
823
824 Code generation for unaligned packed loads of double precision data
825 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
826 if (x86_sse_unaligned_move_optimal)
827 movupd mem, reg
828
829 if (x86_sse_split_regs == true)
830 {
831 movlpd mem, reg
832 movhpd mem+8, reg
833 }
834 else
835 {
836 movsd mem, reg
837 movhpd mem+8, reg
838 }
839 */
840
841void
842ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
843{
844 rtx op0, op1, m;
845
846 op0 = operands[0];
847 op1 = operands[1];
848
849 /* Use unaligned load/store for AVX512 or when optimizing for size. */
850 if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
851 {
852 emit_insn (gen_rtx_SET (op0, op1));
853 return;
854 }
855
856 if (TARGET_AVX)
857 {
858 if (GET_MODE_SIZE (mode) == 32)
859 ix86_avx256_split_vector_move_misalign (op0, op1);
860 else
861 /* Always use 128-bit mov<mode>_internal pattern for AVX. */
862 emit_insn (gen_rtx_SET (op0, op1));
863 return;
864 }
865
866 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
867 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
868 {
869 emit_insn (gen_rtx_SET (op0, op1));
870 return;
871 }
872
873 /* ??? If we have typed data, then it would appear that using
874 movdqu is the only way to get unaligned data loaded with
875 integer type. */
876 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
877 {
878 emit_insn (gen_rtx_SET (op0, op1));
879 return;
880 }
881
882 if (MEM_P (op1))
883 {
884 if (TARGET_SSE2 && mode == V2DFmode)
885 {
886 rtx zero;
887
888 /* When SSE registers are split into halves, we can avoid
889 writing to the top half twice. */
890 if (TARGET_SSE_SPLIT_REGS)
891 {
892 emit_clobber (op0);
893 zero = op0;
894 }
895 else
896 {
897 /* ??? Not sure about the best option for the Intel chips.
898 The following would seem to satisfy; the register is
899 entirely cleared, breaking the dependency chain. We
900 then store to the upper half, with a dependency depth
901 of one. A rumor has it that Intel recommends two movsd
902 followed by an unpacklpd, but this is unconfirmed. And
903 given that the dependency depth of the unpacklpd would
904 still be one, I'm not sure why this would be better. */
905 zero = CONST0_RTX (V2DFmode);
906 }
907
908 m = adjust_address (op1, DFmode, 0);
909 emit_insn (gen_sse2_loadlpd (op0, zero, m));
910 m = adjust_address (op1, DFmode, 8);
911 emit_insn (gen_sse2_loadhpd (op0, op0, m));
912 }
913 else
914 {
915 rtx t;
916
917 if (mode != V4SFmode)
918 t = gen_reg_rtx (V4SFmode);
919 else
920 t = op0;
921
922 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
923 emit_move_insn (t, CONST0_RTX (V4SFmode));
924 else
925 emit_clobber (t);
926
927 m = adjust_address (op1, V2SFmode, 0);
928 emit_insn (gen_sse_loadlps (t, t, m));
929 m = adjust_address (op1, V2SFmode, 8);
930 emit_insn (gen_sse_loadhps (t, t, m));
931 if (mode != V4SFmode)
932 emit_move_insn (op0, gen_lowpart (mode, t));
933 }
934 }
935 else if (MEM_P (op0))
936 {
937 if (TARGET_SSE2 && mode == V2DFmode)
938 {
939 m = adjust_address (op0, DFmode, 0);
940 emit_insn (gen_sse2_storelpd (m, op1));
941 m = adjust_address (op0, DFmode, 8);
942 emit_insn (gen_sse2_storehpd (m, op1));
943 }
944 else
945 {
946 if (mode != V4SFmode)
947 op1 = gen_lowpart (V4SFmode, op1);
948
949 m = adjust_address (op0, V2SFmode, 0);
950 emit_insn (gen_sse_storelps (m, op1));
951 m = adjust_address (op0, V2SFmode, 8);
952 emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
953 }
954 }
955 else
956 gcc_unreachable ();
957}
958
b74ebb2a
L
959/* Move bits 64:95 to bits 32:63. */
960
961void
962ix86_move_vector_high_sse_to_mmx (rtx op)
963{
964 rtx mask = gen_rtx_PARALLEL (VOIDmode,
965 gen_rtvec (4, GEN_INT (0), GEN_INT (2),
966 GEN_INT (0), GEN_INT (0)));
967 rtx dest = lowpart_subreg (V4SImode, op, GET_MODE (op));
968 op = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
969 rtx insn = gen_rtx_SET (dest, op);
970 emit_insn (insn);
971}
972
973/* Split MMX pack with signed/unsigned saturation with SSE/SSE2. */
974
975void
976ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
977{
978 rtx op0 = operands[0];
979 rtx op1 = operands[1];
980 rtx op2 = operands[2];
981
982 machine_mode dmode = GET_MODE (op0);
983 machine_mode smode = GET_MODE (op1);
984 machine_mode inner_dmode = GET_MODE_INNER (dmode);
985 machine_mode inner_smode = GET_MODE_INNER (smode);
986
987 /* Get the corresponding SSE mode for destination. */
988 int nunits = 16 / GET_MODE_SIZE (inner_dmode);
989 machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode),
990 nunits).require ();
991 machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode),
992 nunits / 2).require ();
993
994 /* Get the corresponding SSE mode for source. */
995 nunits = 16 / GET_MODE_SIZE (inner_smode);
996 machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode),
997 nunits).require ();
998
999 /* Generate SSE pack with signed/unsigned saturation. */
1000 rtx dest = lowpart_subreg (sse_dmode, op0, GET_MODE (op0));
1001 op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
1002 op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));
1003
1004 op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
1005 op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
1006 rtx insn = gen_rtx_SET (dest, gen_rtx_VEC_CONCAT (sse_dmode,
1007 op1, op2));
1008 emit_insn (insn);
1009
1010 ix86_move_vector_high_sse_to_mmx (op0);
1011}
1012
6e9fffcf
L
1013/* Split MMX punpcklXX/punpckhXX with SSE punpcklXX. */
1014
1015void
1016ix86_split_mmx_punpck (rtx operands[], bool high_p)
1017{
1018 rtx op0 = operands[0];
1019 rtx op1 = operands[1];
1020 rtx op2 = operands[2];
1021 machine_mode mode = GET_MODE (op0);
1022 rtx mask;
1023 /* The corresponding SSE mode. */
1024 machine_mode sse_mode, double_sse_mode;
1025
1026 switch (mode)
1027 {
be8749f9 1028 case E_V4QImode:
6e9fffcf
L
1029 case E_V8QImode:
1030 sse_mode = V16QImode;
1031 double_sse_mode = V32QImode;
1032 mask = gen_rtx_PARALLEL (VOIDmode,
1033 gen_rtvec (16,
1034 GEN_INT (0), GEN_INT (16),
1035 GEN_INT (1), GEN_INT (17),
1036 GEN_INT (2), GEN_INT (18),
1037 GEN_INT (3), GEN_INT (19),
1038 GEN_INT (4), GEN_INT (20),
1039 GEN_INT (5), GEN_INT (21),
1040 GEN_INT (6), GEN_INT (22),
1041 GEN_INT (7), GEN_INT (23)));
1042 break;
1043
1044 case E_V4HImode:
be8749f9 1045 case E_V2HImode:
6e9fffcf
L
1046 sse_mode = V8HImode;
1047 double_sse_mode = V16HImode;
1048 mask = gen_rtx_PARALLEL (VOIDmode,
1049 gen_rtvec (8,
1050 GEN_INT (0), GEN_INT (8),
1051 GEN_INT (1), GEN_INT (9),
1052 GEN_INT (2), GEN_INT (10),
1053 GEN_INT (3), GEN_INT (11)));
1054 break;
1055
1056 case E_V2SImode:
1057 sse_mode = V4SImode;
1058 double_sse_mode = V8SImode;
1059 mask = gen_rtx_PARALLEL (VOIDmode,
1060 gen_rtvec (4,
1061 GEN_INT (0), GEN_INT (4),
1062 GEN_INT (1), GEN_INT (5)));
1063 break;
1064
a325bdd1
PB
1065 case E_V2SFmode:
1066 sse_mode = V4SFmode;
1067 double_sse_mode = V8SFmode;
1068 mask = gen_rtx_PARALLEL (VOIDmode,
1069 gen_rtvec (4,
1070 GEN_INT (0), GEN_INT (4),
1071 GEN_INT (1), GEN_INT (5)));
1072 break;
1073
6e9fffcf
L
1074 default:
1075 gcc_unreachable ();
1076 }
1077
1078 /* Generate SSE punpcklXX. */
1079 rtx dest = lowpart_subreg (sse_mode, op0, GET_MODE (op0));
1080 op1 = lowpart_subreg (sse_mode, op1, GET_MODE (op1));
1081 op2 = lowpart_subreg (sse_mode, op2, GET_MODE (op2));
1082
1083 op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2);
1084 op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask);
1085 rtx insn = gen_rtx_SET (dest, op2);
1086 emit_insn (insn);
1087
be8749f9 1088 /* Move high bits to low bits. */
6e9fffcf
L
1089 if (high_p)
1090 {
a325bdd1
PB
1091 if (sse_mode == V4SFmode)
1092 {
1093 mask = gen_rtx_PARALLEL (VOIDmode,
1094 gen_rtvec (4, GEN_INT (2), GEN_INT (3),
1095 GEN_INT (4), GEN_INT (5)));
1096 op2 = gen_rtx_VEC_CONCAT (V8SFmode, dest, dest);
1097 op1 = gen_rtx_VEC_SELECT (V4SFmode, op2, mask);
1098 }
1099 else
1100 {
be8749f9
UB
1101 int sz = GET_MODE_SIZE (mode);
1102
1103 if (sz == 4)
1104 mask = gen_rtx_PARALLEL (VOIDmode,
1105 gen_rtvec (4, GEN_INT (1), GEN_INT (0),
1106 GEN_INT (0), GEN_INT (1)));
1107 else if (sz == 8)
1108 mask = gen_rtx_PARALLEL (VOIDmode,
1109 gen_rtvec (4, GEN_INT (2), GEN_INT (3),
1110 GEN_INT (0), GEN_INT (1)));
1111 else
1112 gcc_unreachable ();
1113
a325bdd1
PB
1114 dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
1115 op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
1116 }
1117
6e9fffcf
L
1118 insn = gen_rtx_SET (dest, op1);
1119 emit_insn (insn);
1120 }
1121}
1122
2bf6d935
ML
1123/* Helper function of ix86_fixup_binary_operands to canonicalize
1124 operand order. Returns true if the operands should be swapped. */
1125
1126static bool
1127ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
1128 rtx operands[])
1129{
1130 rtx dst = operands[0];
1131 rtx src1 = operands[1];
1132 rtx src2 = operands[2];
1133
1134 /* If the operation is not commutative, we can't do anything. */
1135 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
1136 && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
1137 return false;
1138
1139 /* Highest priority is that src1 should match dst. */
1140 if (rtx_equal_p (dst, src1))
1141 return false;
1142 if (rtx_equal_p (dst, src2))
1143 return true;
1144
1145 /* Next highest priority is that immediate constants come second. */
1146 if (immediate_operand (src2, mode))
1147 return false;
1148 if (immediate_operand (src1, mode))
1149 return true;
1150
1151 /* Lowest priority is that memory references should come second. */
1152 if (MEM_P (src2))
1153 return false;
1154 if (MEM_P (src1))
1155 return true;
1156
1157 return false;
1158}
1159
1160
1161/* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
1162 destination to use for the operation. If different from the true
1163 destination in operands[0], a copy operation will be required. */
1164
1165rtx
1166ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
1167 rtx operands[])
1168{
1169 rtx dst = operands[0];
1170 rtx src1 = operands[1];
1171 rtx src2 = operands[2];
1172
1173 /* Canonicalize operand order. */
1174 if (ix86_swap_binary_operands_p (code, mode, operands))
1175 {
1176 /* It is invalid to swap operands of different modes. */
1177 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
1178
1179 std::swap (src1, src2);
1180 }
1181
1182 /* Both source operands cannot be in memory. */
1183 if (MEM_P (src1) && MEM_P (src2))
1184 {
1185 /* Optimization: Only read from memory once. */
1186 if (rtx_equal_p (src1, src2))
1187 {
1188 src2 = force_reg (mode, src2);
1189 src1 = src2;
1190 }
1191 else if (rtx_equal_p (dst, src1))
1192 src2 = force_reg (mode, src2);
1193 else
1194 src1 = force_reg (mode, src1);
1195 }
1196
1197 /* If the destination is memory, and we do not have matching source
1198 operands, do things in registers. */
1199 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
1200 dst = gen_reg_rtx (mode);
1201
1202 /* Source 1 cannot be a constant. */
1203 if (CONSTANT_P (src1))
1204 src1 = force_reg (mode, src1);
1205
1206 /* Source 1 cannot be a non-matching memory. */
1207 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
1208 src1 = force_reg (mode, src1);
1209
1210 /* Improve address combine. */
1211 if (code == PLUS
1212 && GET_MODE_CLASS (mode) == MODE_INT
1213 && MEM_P (src2))
1214 src2 = force_reg (mode, src2);
1215
1216 operands[1] = src1;
1217 operands[2] = src2;
1218 return dst;
1219}
1220
1221/* Similarly, but assume that the destination has already been
1222 set up properly. */
1223
1224void
1225ix86_fixup_binary_operands_no_copy (enum rtx_code code,
1226 machine_mode mode, rtx operands[])
1227{
1228 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
1229 gcc_assert (dst == operands[0]);
1230}
1231
1232/* Attempt to expand a binary operator. Make the expansion closer to the
1233 actual machine, then just general_operand, which will allow 3 separate
1234 memory references (one output, two input) in a single insn. */
1235
1236void
1237ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
1238 rtx operands[])
1239{
1240 rtx src1, src2, dst, op, clob;
1241
1242 dst = ix86_fixup_binary_operands (code, mode, operands);
1243 src1 = operands[1];
1244 src2 = operands[2];
1245
1246 /* Emit the instruction. */
1247
1248 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
1249
1250 if (reload_completed
1251 && code == PLUS
1252 && !rtx_equal_p (dst, src1))
1253 {
1254 /* This is going to be an LEA; avoid splitting it later. */
1255 emit_insn (op);
1256 }
1257 else
1258 {
1259 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1260 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1261 }
1262
1263 /* Fix up the destination if needed. */
1264 if (dst != operands[0])
1265 emit_move_insn (operands[0], dst);
1266}
1267
1268/* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
1269 the given OPERANDS. */
1270
1271void
1272ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
1273 rtx operands[])
1274{
1275 rtx op1 = NULL_RTX, op2 = NULL_RTX;
1276 if (SUBREG_P (operands[1]))
1277 {
1278 op1 = operands[1];
1279 op2 = operands[2];
1280 }
1281 else if (SUBREG_P (operands[2]))
1282 {
1283 op1 = operands[2];
1284 op2 = operands[1];
1285 }
1286 /* Optimize (__m128i) d | (__m128i) e and similar code
1287 when d and e are float vectors into float vector logical
1288 insn. In C/C++ without using intrinsics there is no other way
1289 to express vector logical operation on float vectors than
1290 to cast them temporarily to integer vectors. */
1291 if (op1
1292 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
1293 && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
1294 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
1295 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
1296 && SUBREG_BYTE (op1) == 0
1297 && (GET_CODE (op2) == CONST_VECTOR
1298 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
1299 && SUBREG_BYTE (op2) == 0))
1300 && can_create_pseudo_p ())
1301 {
1302 rtx dst;
1303 switch (GET_MODE (SUBREG_REG (op1)))
1304 {
1305 case E_V4SFmode:
1306 case E_V8SFmode:
1307 case E_V16SFmode:
1308 case E_V2DFmode:
1309 case E_V4DFmode:
1310 case E_V8DFmode:
1311 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
1312 if (GET_CODE (op2) == CONST_VECTOR)
1313 {
1314 op2 = gen_lowpart (GET_MODE (dst), op2);
1315 op2 = force_reg (GET_MODE (dst), op2);
1316 }
1317 else
1318 {
1319 op1 = operands[1];
1320 op2 = SUBREG_REG (operands[2]);
1321 if (!vector_operand (op2, GET_MODE (dst)))
1322 op2 = force_reg (GET_MODE (dst), op2);
1323 }
1324 op1 = SUBREG_REG (op1);
1325 if (!vector_operand (op1, GET_MODE (dst)))
1326 op1 = force_reg (GET_MODE (dst), op1);
1327 emit_insn (gen_rtx_SET (dst,
1328 gen_rtx_fmt_ee (code, GET_MODE (dst),
1329 op1, op2)));
1330 emit_move_insn (operands[0], gen_lowpart (mode, dst));
1331 return;
1332 default:
1333 break;
1334 }
1335 }
1336 if (!vector_operand (operands[1], mode))
1337 operands[1] = force_reg (mode, operands[1]);
1338 if (!vector_operand (operands[2], mode))
1339 operands[2] = force_reg (mode, operands[2]);
1340 ix86_fixup_binary_operands_no_copy (code, mode, operands);
1341 emit_insn (gen_rtx_SET (operands[0],
1342 gen_rtx_fmt_ee (code, mode, operands[1],
1343 operands[2])));
1344}
1345
1346/* Return TRUE or FALSE depending on whether the binary operator meets the
1347 appropriate constraints. */
1348
1349bool
1350ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
1351 rtx operands[3])
1352{
1353 rtx dst = operands[0];
1354 rtx src1 = operands[1];
1355 rtx src2 = operands[2];
1356
1357 /* Both source operands cannot be in memory. */
7026bb95 1358 if ((MEM_P (src1) || bcst_mem_operand (src1, mode))
1359 && (MEM_P (src2) || bcst_mem_operand (src2, mode)))
2bf6d935
ML
1360 return false;
1361
1362 /* Canonicalize operand order for commutative operators. */
1363 if (ix86_swap_binary_operands_p (code, mode, operands))
1364 std::swap (src1, src2);
1365
1366 /* If the destination is memory, we must have a matching source operand. */
1367 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
1368 return false;
1369
1370 /* Source 1 cannot be a constant. */
1371 if (CONSTANT_P (src1))
1372 return false;
1373
1374 /* Source 1 cannot be a non-matching memory. */
1375 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
1376 /* Support "andhi/andsi/anddi" as a zero-extending move. */
1377 return (code == AND
1378 && (mode == HImode
1379 || mode == SImode
1380 || (TARGET_64BIT && mode == DImode))
1381 && satisfies_constraint_L (src2));
1382
1383 return true;
1384}
1385
1386/* Attempt to expand a unary operator. Make the expansion closer to the
1387 actual machine, then just general_operand, which will allow 2 separate
1388 memory references (one output, one input) in a single insn. */
1389
1390void
1391ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
1392 rtx operands[])
1393{
1394 bool matching_memory = false;
1395 rtx src, dst, op, clob;
1396
1397 dst = operands[0];
1398 src = operands[1];
1399
1400 /* If the destination is memory, and we do not have matching source
1401 operands, do things in registers. */
1402 if (MEM_P (dst))
1403 {
1404 if (rtx_equal_p (dst, src))
1405 matching_memory = true;
1406 else
1407 dst = gen_reg_rtx (mode);
1408 }
1409
1410 /* When source operand is memory, destination must match. */
1411 if (MEM_P (src) && !matching_memory)
1412 src = force_reg (mode, src);
1413
1414 /* Emit the instruction. */
1415
1416 op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
1417
1418 if (code == NOT)
1419 emit_insn (op);
1420 else
1421 {
1422 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1423 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1424 }
1425
1426 /* Fix up the destination if needed. */
1427 if (dst != operands[0])
1428 emit_move_insn (operands[0], dst);
1429}
1430
1431/* Predict just emitted jump instruction to be taken with probability PROB. */
1432
1433static void
1434predict_jump (int prob)
1435{
1436 rtx_insn *insn = get_last_insn ();
1437 gcc_assert (JUMP_P (insn));
1438 add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
1439}
1440
1441/* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
1442 divisor are within the range [0-255]. */
1443
1444void
1445ix86_split_idivmod (machine_mode mode, rtx operands[],
40c81f84 1446 bool unsigned_p)
2bf6d935
ML
1447{
1448 rtx_code_label *end_label, *qimode_label;
1449 rtx div, mod;
1450 rtx_insn *insn;
1451 rtx scratch, tmp0, tmp1, tmp2;
1452 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
2bf6d935 1453
2b399dba
UB
1454 operands[2] = force_reg (mode, operands[2]);
1455 operands[3] = force_reg (mode, operands[3]);
1456
2bf6d935
ML
1457 switch (mode)
1458 {
1459 case E_SImode:
1460 if (GET_MODE (operands[0]) == SImode)
1461 {
1462 if (GET_MODE (operands[1]) == SImode)
40c81f84 1463 gen_divmod4_1 = unsigned_p ? gen_udivmodsi4_1 : gen_divmodsi4_1;
2bf6d935
ML
1464 else
1465 gen_divmod4_1
40c81f84 1466 = unsigned_p ? gen_udivmodsi4_zext_2 : gen_divmodsi4_zext_2;
2bf6d935
ML
1467 }
1468 else
ea298f7a
UB
1469 gen_divmod4_1
1470 = unsigned_p ? gen_udivmodsi4_zext_1 : gen_divmodsi4_zext_1;
2bf6d935 1471 break;
ea298f7a 1472
2bf6d935 1473 case E_DImode:
40c81f84 1474 gen_divmod4_1 = unsigned_p ? gen_udivmoddi4_1 : gen_divmoddi4_1;
2bf6d935 1475 break;
ea298f7a 1476
2bf6d935
ML
1477 default:
1478 gcc_unreachable ();
1479 }
1480
1481 end_label = gen_label_rtx ();
1482 qimode_label = gen_label_rtx ();
1483
1484 scratch = gen_reg_rtx (mode);
1485
1486 /* Use 8bit unsigned divimod if dividend and divisor are within
1487 the range [0-255]. */
1488 emit_move_insn (scratch, operands[2]);
1489 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
1490 scratch, 1, OPTAB_DIRECT);
ea298f7a 1491 emit_insn (gen_test_ccno_1 (mode, scratch, GEN_INT (-0x100)));
2bf6d935
ML
1492 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
1493 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
1494 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
1495 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
1496 pc_rtx);
1497 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
1498 predict_jump (REG_BR_PROB_BASE * 50 / 100);
1499 JUMP_LABEL (insn) = qimode_label;
1500
1501 /* Generate original signed/unsigned divimod. */
e9539592
UB
1502 emit_insn (gen_divmod4_1 (operands[0], operands[1],
1503 operands[2], operands[3]));
2bf6d935
ML
1504
1505 /* Branch to the end. */
1506 emit_jump_insn (gen_jump (end_label));
1507 emit_barrier ();
1508
1509 /* Generate 8bit unsigned divide. */
1510 emit_label (qimode_label);
1511 /* Don't use operands[0] for result of 8bit divide since not all
1512 registers support QImode ZERO_EXTRACT. */
1513 tmp0 = lowpart_subreg (HImode, scratch, mode);
1514 tmp1 = lowpart_subreg (HImode, operands[2], mode);
1515 tmp2 = lowpart_subreg (QImode, operands[3], mode);
1516 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
1517
40c81f84 1518 if (unsigned_p)
2bf6d935 1519 {
40c81f84
UB
1520 div = gen_rtx_UDIV (mode, operands[2], operands[3]);
1521 mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
2bf6d935
ML
1522 }
1523 else
1524 {
40c81f84
UB
1525 div = gen_rtx_DIV (mode, operands[2], operands[3]);
1526 mod = gen_rtx_MOD (mode, operands[2], operands[3]);
2bf6d935
ML
1527 }
1528 if (mode == SImode)
1529 {
1530 if (GET_MODE (operands[0]) != SImode)
1531 div = gen_rtx_ZERO_EXTEND (DImode, div);
1532 if (GET_MODE (operands[1]) != SImode)
1533 mod = gen_rtx_ZERO_EXTEND (DImode, mod);
1534 }
1535
1536 /* Extract remainder from AH. */
e9539592
UB
1537 scratch = gen_lowpart (GET_MODE (operands[1]), scratch);
1538 tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]), scratch,
1539 GEN_INT (8), GEN_INT (8));
1540 insn = emit_move_insn (operands[1], tmp1);
2bf6d935
ML
1541 set_unique_reg_note (insn, REG_EQUAL, mod);
1542
1543 /* Zero extend quotient from AL. */
1544 tmp1 = gen_lowpart (QImode, tmp0);
ea298f7a
UB
1545 insn = emit_insn (gen_extend_insn
1546 (operands[0], tmp1,
1547 GET_MODE (operands[0]), QImode, 1));
2bf6d935
ML
1548 set_unique_reg_note (insn, REG_EQUAL, div);
1549
1550 emit_label (end_label);
1551}
1552
1553/* Emit x86 binary operand CODE in mode MODE, where the first operand
1554 matches destination. RTX includes clobber of FLAGS_REG. */
1555
1556void
1557ix86_emit_binop (enum rtx_code code, machine_mode mode,
1558 rtx dst, rtx src)
1559{
1560 rtx op, clob;
1561
1562 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
1563 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1564
1565 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1566}
1567
1568/* Return true if regno1 def is nearest to the insn. */
1569
1570static bool
1571find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
1572{
1573 rtx_insn *prev = insn;
1574 rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
1575
1576 if (insn == start)
1577 return false;
1578 while (prev && prev != start)
1579 {
1580 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
1581 {
1582 prev = PREV_INSN (prev);
1583 continue;
1584 }
1585 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
1586 return true;
1587 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
1588 return false;
1589 prev = PREV_INSN (prev);
1590 }
1591
1592 /* None of the regs is defined in the bb. */
1593 return false;
1594}
1595
d58a66aa
JJ
1596/* INSN_UID of the last insn emitted by zero store peephole2s. */
1597int ix86_last_zero_store_uid;
1598
2bf6d935
ML
1599/* Split lea instructions into a sequence of instructions
1600 which are executed on ALU to avoid AGU stalls.
1601 It is assumed that it is allowed to clobber flags register
1602 at lea position. */
1603
1604void
1605ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
1606{
1607 unsigned int regno0, regno1, regno2;
1608 struct ix86_address parts;
1609 rtx target, tmp;
1610 int ok, adds;
1611
1612 ok = ix86_decompose_address (operands[1], &parts);
1613 gcc_assert (ok);
1614
1615 target = gen_lowpart (mode, operands[0]);
1616
1617 regno0 = true_regnum (target);
1618 regno1 = INVALID_REGNUM;
1619 regno2 = INVALID_REGNUM;
1620
1621 if (parts.base)
1622 {
1623 parts.base = gen_lowpart (mode, parts.base);
1624 regno1 = true_regnum (parts.base);
1625 }
1626
1627 if (parts.index)
1628 {
1629 parts.index = gen_lowpart (mode, parts.index);
1630 regno2 = true_regnum (parts.index);
1631 }
1632
1633 if (parts.disp)
1634 parts.disp = gen_lowpart (mode, parts.disp);
1635
1636 if (parts.scale > 1)
1637 {
1638 /* Case r1 = r1 + ... */
1639 if (regno1 == regno0)
1640 {
1641 /* If we have a case r1 = r1 + C * r2 then we
1642 should use multiplication which is very
1643 expensive. Assume cost model is wrong if we
1644 have such case here. */
1645 gcc_assert (regno2 != regno0);
1646
1647 for (adds = parts.scale; adds > 0; adds--)
1648 ix86_emit_binop (PLUS, mode, target, parts.index);
1649 }
1650 else
1651 {
1652 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
1653 if (regno0 != regno2)
1654 emit_insn (gen_rtx_SET (target, parts.index));
1655
d55ce33a
JJ
1656 /* Use shift for scaling, but emit it as MULT instead
1657 to avoid it being immediately peephole2 optimized back
1658 into lea. */
1659 ix86_emit_binop (MULT, mode, target, GEN_INT (parts.scale));
2bf6d935
ML
1660
1661 if (parts.base)
1662 ix86_emit_binop (PLUS, mode, target, parts.base);
1663
1664 if (parts.disp && parts.disp != const0_rtx)
1665 ix86_emit_binop (PLUS, mode, target, parts.disp);
1666 }
1667 }
1668 else if (!parts.base && !parts.index)
1669 {
1670 gcc_assert(parts.disp);
1671 emit_insn (gen_rtx_SET (target, parts.disp));
1672 }
1673 else
1674 {
1675 if (!parts.base)
1676 {
1677 if (regno0 != regno2)
1678 emit_insn (gen_rtx_SET (target, parts.index));
1679 }
1680 else if (!parts.index)
1681 {
1682 if (regno0 != regno1)
1683 emit_insn (gen_rtx_SET (target, parts.base));
1684 }
1685 else
1686 {
1687 if (regno0 == regno1)
1688 tmp = parts.index;
1689 else if (regno0 == regno2)
1690 tmp = parts.base;
1691 else
1692 {
1693 rtx tmp1;
1694
1695 /* Find better operand for SET instruction, depending
1696 on which definition is farther from the insn. */
1697 if (find_nearest_reg_def (insn, regno1, regno2))
1698 tmp = parts.index, tmp1 = parts.base;
1699 else
1700 tmp = parts.base, tmp1 = parts.index;
1701
1702 emit_insn (gen_rtx_SET (target, tmp));
1703
1704 if (parts.disp && parts.disp != const0_rtx)
1705 ix86_emit_binop (PLUS, mode, target, parts.disp);
1706
1707 ix86_emit_binop (PLUS, mode, target, tmp1);
1708 return;
1709 }
1710
1711 ix86_emit_binop (PLUS, mode, target, tmp);
1712 }
1713
1714 if (parts.disp && parts.disp != const0_rtx)
1715 ix86_emit_binop (PLUS, mode, target, parts.disp);
1716 }
1717}
1718
1719/* Post-reload splitter for converting an SF or DFmode value in an
1720 SSE register into an unsigned SImode. */
1721
1722void
1723ix86_split_convert_uns_si_sse (rtx operands[])
1724{
1725 machine_mode vecmode;
1726 rtx value, large, zero_or_two31, input, two31, x;
1727
1728 large = operands[1];
1729 zero_or_two31 = operands[2];
1730 input = operands[3];
1731 two31 = operands[4];
1732 vecmode = GET_MODE (large);
1733 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
1734
1735 /* Load up the value into the low element. We must ensure that the other
1736 elements are valid floats -- zero is the easiest such value. */
1737 if (MEM_P (input))
1738 {
1739 if (vecmode == V4SFmode)
1740 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
1741 else
1742 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
1743 }
1744 else
1745 {
1746 input = gen_rtx_REG (vecmode, REGNO (input));
1747 emit_move_insn (value, CONST0_RTX (vecmode));
1748 if (vecmode == V4SFmode)
1749 emit_insn (gen_sse_movss (value, value, input));
1750 else
1751 emit_insn (gen_sse2_movsd (value, value, input));
1752 }
1753
1754 emit_move_insn (large, two31);
1755 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
1756
1757 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
1758 emit_insn (gen_rtx_SET (large, x));
1759
1760 x = gen_rtx_AND (vecmode, zero_or_two31, large);
1761 emit_insn (gen_rtx_SET (zero_or_two31, x));
1762
1763 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
1764 emit_insn (gen_rtx_SET (value, x));
1765
1766 large = gen_rtx_REG (V4SImode, REGNO (large));
1767 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
1768
1769 x = gen_rtx_REG (V4SImode, REGNO (value));
1770 if (vecmode == V4SFmode)
1771 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
1772 else
1773 emit_insn (gen_sse2_cvttpd2dq (x, value));
1774 value = x;
1775
1776 emit_insn (gen_xorv4si3 (value, value, large));
1777}
1778
1779static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
1780 machine_mode mode, rtx target,
1781 rtx var, int one_var);
1782
1783/* Convert an unsigned DImode value into a DFmode, using only SSE.
1784 Expects the 64-bit DImode to be supplied in a pair of integral
1785 registers. Requires SSE2; will use SSE3 if available. For x86_32,
1786 -mfpmath=sse, !optimize_size only. */
1787
1788void
1789ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
1790{
1791 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
1792 rtx int_xmm, fp_xmm;
1793 rtx biases, exponents;
1794 rtx x;
1795
1796 int_xmm = gen_reg_rtx (V4SImode);
1797 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
1798 emit_insn (gen_movdi_to_sse (int_xmm, input));
1799 else if (TARGET_SSE_SPLIT_REGS)
1800 {
1801 emit_clobber (int_xmm);
1802 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
1803 }
1804 else
1805 {
1806 x = gen_reg_rtx (V2DImode);
1807 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
1808 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
1809 }
1810
1811 x = gen_rtx_CONST_VECTOR (V4SImode,
1812 gen_rtvec (4, GEN_INT (0x43300000UL),
1813 GEN_INT (0x45300000UL),
1814 const0_rtx, const0_rtx));
1815 exponents = validize_mem (force_const_mem (V4SImode, x));
1816
1817 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
1818 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
1819
1820 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
1821 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
1822 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
1823 (0x1.0p84 + double(fp_value_hi_xmm)).
1824 Note these exponents differ by 32. */
1825
1826 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
1827
1828 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
1829 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
1830 real_ldexp (&bias_lo_rvt, &dconst1, 52);
1831 real_ldexp (&bias_hi_rvt, &dconst1, 84);
1832 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
1833 x = const_double_from_real_value (bias_hi_rvt, DFmode);
1834 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
1835 biases = validize_mem (force_const_mem (V2DFmode, biases));
1836 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
1837
1838 /* Add the upper and lower DFmode values together. */
1839 if (TARGET_SSE3)
1840 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
1841 else
1842 {
1843 x = copy_to_mode_reg (V2DFmode, fp_xmm);
1844 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
1845 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
1846 }
1847
1848 ix86_expand_vector_extract (false, target, fp_xmm, 0);
1849}
1850
1851/* Not used, but eases macroization of patterns. */
1852void
1853ix86_expand_convert_uns_sixf_sse (rtx, rtx)
1854{
1855 gcc_unreachable ();
1856}
1857
0cda606d
UB
1858static rtx ix86_expand_sse_fabs (rtx op0, rtx *smask);
1859
2bf6d935
ML
1860/* Convert an unsigned SImode value into a DFmode. Only currently used
1861 for SSE, but applicable anywhere. */
1862
1863void
1864ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
1865{
1866 REAL_VALUE_TYPE TWO31r;
1867 rtx x, fp;
1868
1869 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
1870 NULL, 1, OPTAB_DIRECT);
1871
1872 fp = gen_reg_rtx (DFmode);
1873 emit_insn (gen_floatsidf2 (fp, x));
1874
1875 real_ldexp (&TWO31r, &dconst1, 31);
1876 x = const_double_from_real_value (TWO31r, DFmode);
1877
1878 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
0cda606d
UB
1879
1880 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
1881 if (HONOR_SIGNED_ZEROS (DFmode) && flag_rounding_math)
1882 x = ix86_expand_sse_fabs (x, NULL);
1883
2bf6d935
ML
1884 if (x != target)
1885 emit_move_insn (target, x);
1886}
1887
1888/* Convert a signed DImode value into a DFmode. Only used for SSE in
1889 32-bit mode; otherwise we have a direct convert instruction. */
1890
1891void
1892ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
1893{
1894 REAL_VALUE_TYPE TWO32r;
1895 rtx fp_lo, fp_hi, x;
1896
1897 fp_lo = gen_reg_rtx (DFmode);
1898 fp_hi = gen_reg_rtx (DFmode);
1899
1900 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
1901
1902 real_ldexp (&TWO32r, &dconst1, 32);
1903 x = const_double_from_real_value (TWO32r, DFmode);
1904 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
1905
1906 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
1907
1908 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
1909 0, OPTAB_DIRECT);
1910 if (x != target)
1911 emit_move_insn (target, x);
1912}
1913
1914/* Convert an unsigned SImode value into a SFmode, using only SSE.
1915 For x86_32, -mfpmath=sse, !optimize_size only. */
1916void
1917ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
1918{
1919 REAL_VALUE_TYPE ONE16r;
1920 rtx fp_hi, fp_lo, int_hi, int_lo, x;
1921
1922 real_ldexp (&ONE16r, &dconst1, 16);
1923 x = const_double_from_real_value (ONE16r, SFmode);
1924 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
1925 NULL, 0, OPTAB_DIRECT);
1926 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
1927 NULL, 0, OPTAB_DIRECT);
1928 fp_hi = gen_reg_rtx (SFmode);
1929 fp_lo = gen_reg_rtx (SFmode);
1930 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
1931 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
ad9fcb96
L
1932 if (TARGET_FMA)
1933 {
1934 x = validize_mem (force_const_mem (SFmode, x));
1935 fp_hi = gen_rtx_FMA (SFmode, fp_hi, x, fp_lo);
1936 emit_move_insn (target, fp_hi);
1937 }
1938 else
1939 {
1940 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
1941 0, OPTAB_DIRECT);
1942 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
1943 0, OPTAB_DIRECT);
1944 if (!rtx_equal_p (target, fp_hi))
1945 emit_move_insn (target, fp_hi);
1946 }
2bf6d935
ML
1947}
1948
1949/* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
1950 a vector of unsigned ints VAL to vector of floats TARGET. */
1951
1952void
1953ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
1954{
1955 rtx tmp[8];
1956 REAL_VALUE_TYPE TWO16r;
1957 machine_mode intmode = GET_MODE (val);
1958 machine_mode fltmode = GET_MODE (target);
1959 rtx (*cvt) (rtx, rtx);
1960
1961 if (intmode == V4SImode)
1962 cvt = gen_floatv4siv4sf2;
1963 else
1964 cvt = gen_floatv8siv8sf2;
1965 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
1966 tmp[0] = force_reg (intmode, tmp[0]);
1967 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
1968 OPTAB_DIRECT);
1969 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
1970 NULL_RTX, 1, OPTAB_DIRECT);
1971 tmp[3] = gen_reg_rtx (fltmode);
1972 emit_insn (cvt (tmp[3], tmp[1]));
1973 tmp[4] = gen_reg_rtx (fltmode);
1974 emit_insn (cvt (tmp[4], tmp[2]));
1975 real_ldexp (&TWO16r, &dconst1, 16);
1976 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
1977 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
ad9fcb96
L
1978 if (TARGET_FMA)
1979 {
1980 tmp[6] = gen_rtx_FMA (fltmode, tmp[4], tmp[5], tmp[3]);
1981 emit_move_insn (target, tmp[6]);
1982 }
1983 else
1984 {
1985 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5],
1986 NULL_RTX, 1, OPTAB_DIRECT);
1987 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6],
1988 target, 1, OPTAB_DIRECT);
1989 if (tmp[7] != target)
1990 emit_move_insn (target, tmp[7]);
1991 }
2bf6d935
ML
1992}
1993
1994/* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
1995 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
1996 This is done by doing just signed conversion if < 0x1p31, and otherwise by
1997 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
1998
1999rtx
2000ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
2001{
2002 REAL_VALUE_TYPE TWO31r;
2003 rtx two31r, tmp[4];
2004 machine_mode mode = GET_MODE (val);
2005 machine_mode scalarmode = GET_MODE_INNER (mode);
2006 machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
2007 rtx (*cmp) (rtx, rtx, rtx, rtx);
2008 int i;
2009
2010 for (i = 0; i < 3; i++)
2011 tmp[i] = gen_reg_rtx (mode);
2012 real_ldexp (&TWO31r, &dconst1, 31);
2013 two31r = const_double_from_real_value (TWO31r, scalarmode);
2014 two31r = ix86_build_const_vector (mode, 1, two31r);
2015 two31r = force_reg (mode, two31r);
2016 switch (mode)
2017 {
2018 case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
2019 case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
2020 case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
2021 case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
2022 default: gcc_unreachable ();
2023 }
2024 tmp[3] = gen_rtx_LE (mode, two31r, val);
2025 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
2026 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
2027 0, OPTAB_DIRECT);
2028 if (intmode == V4SImode || TARGET_AVX2)
2029 *xorp = expand_simple_binop (intmode, ASHIFT,
2030 gen_lowpart (intmode, tmp[0]),
2031 GEN_INT (31), NULL_RTX, 0,
2032 OPTAB_DIRECT);
2033 else
2034 {
6a556ba4 2035 rtx two31 = gen_int_mode (HOST_WIDE_INT_1U << 31, SImode);
2bf6d935
ML
2036 two31 = ix86_build_const_vector (intmode, 1, two31);
2037 *xorp = expand_simple_binop (intmode, AND,
2038 gen_lowpart (intmode, tmp[0]),
2039 two31, NULL_RTX, 0,
2040 OPTAB_DIRECT);
2041 }
2042 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
2043 0, OPTAB_DIRECT);
2044}
2045
2046/* Generate code for floating point ABS or NEG. */
2047
2048void
2049ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
2050 rtx operands[])
2051{
f359611b 2052 rtx set, dst, src;
2bf6d935
ML
2053 bool use_sse = false;
2054 bool vector_mode = VECTOR_MODE_P (mode);
2055 machine_mode vmode = mode;
f359611b 2056 rtvec par;
2bf6d935 2057
75a97b59
L
2058 if (vector_mode || mode == TFmode || mode == HFmode)
2059 {
2060 use_sse = true;
2061 if (mode == HFmode)
2062 vmode = V8HFmode;
2063 }
2bf6d935
ML
2064 else if (TARGET_SSE_MATH)
2065 {
2066 use_sse = SSE_FLOAT_MODE_P (mode);
2067 if (mode == SFmode)
2068 vmode = V4SFmode;
2069 else if (mode == DFmode)
2070 vmode = V2DFmode;
2071 }
2072
2bf6d935
ML
2073 dst = operands[0];
2074 src = operands[1];
2075
2076 set = gen_rtx_fmt_e (code, mode, src);
2077 set = gen_rtx_SET (dst, set);
2078
f359611b 2079 if (use_sse)
2bf6d935 2080 {
f359611b 2081 rtx mask, use, clob;
2bf6d935 2082
f359611b
UB
2083 /* NEG and ABS performed with SSE use bitwise mask operations.
2084 Create the appropriate mask now. */
2085 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
2bf6d935 2086 use = gen_rtx_USE (VOIDmode, mask);
94f687bd 2087 if (vector_mode || mode == TFmode)
2bf6d935
ML
2088 par = gen_rtvec (2, set, use);
2089 else
2090 {
2091 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
2092 par = gen_rtvec (3, set, use, clob);
2093 }
2bf6d935
ML
2094 }
2095 else
f359611b
UB
2096 {
2097 rtx clob;
2098
2099 /* Changing of sign for FP values is doable using integer unit too. */
2100 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
2101 par = gen_rtvec (2, set, clob);
2102 }
2103
2104 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
2105}
2106
2107/* Deconstruct a floating point ABS or NEG operation
2108 with integer registers into integer operations. */
2109
2110void
2111ix86_split_fp_absneg_operator (enum rtx_code code, machine_mode mode,
2112 rtx operands[])
2113{
2114 enum rtx_code absneg_op;
2115 rtx dst, set;
2116
2117 gcc_assert (operands_match_p (operands[0], operands[1]));
2118
2119 switch (mode)
2120 {
2121 case E_SFmode:
2122 dst = gen_lowpart (SImode, operands[0]);
2123
2124 if (code == ABS)
2125 {
2126 set = gen_int_mode (0x7fffffff, SImode);
2127 absneg_op = AND;
2128 }
2129 else
2130 {
2131 set = gen_int_mode (0x80000000, SImode);
2132 absneg_op = XOR;
2133 }
2134 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
2135 break;
2136
2137 case E_DFmode:
2138 if (TARGET_64BIT)
2139 {
2140 dst = gen_lowpart (DImode, operands[0]);
2141 dst = gen_rtx_ZERO_EXTRACT (DImode, dst, const1_rtx, GEN_INT (63));
2142
2143 if (code == ABS)
2144 set = const0_rtx;
2145 else
2146 set = gen_rtx_NOT (DImode, dst);
2147 }
2148 else
2149 {
2150 dst = gen_highpart (SImode, operands[0]);
2151
2152 if (code == ABS)
2153 {
2154 set = gen_int_mode (0x7fffffff, SImode);
2155 absneg_op = AND;
2156 }
2157 else
2158 {
2159 set = gen_int_mode (0x80000000, SImode);
2160 absneg_op = XOR;
2161 }
2162 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
2163 }
2164 break;
2165
2166 case E_XFmode:
2167 dst = gen_rtx_REG (SImode,
2168 REGNO (operands[0]) + (TARGET_64BIT ? 1 : 2));
2169 if (code == ABS)
2170 {
2171 set = GEN_INT (0x7fff);
2172 absneg_op = AND;
2173 }
2174 else
2175 {
2176 set = GEN_INT (0x8000);
2177 absneg_op = XOR;
2178 }
2179 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
2180 break;
2181
2182 default:
2183 gcc_unreachable ();
2184 }
2185
2186 set = gen_rtx_SET (dst, set);
2187
2188 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
2189 rtvec par = gen_rtvec (2, set, clob);
2190
2191 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
2bf6d935
ML
2192}
2193
2194/* Expand a copysign operation. Special case operand 0 being a constant. */
2195
2196void
2197ix86_expand_copysign (rtx operands[])
2198{
2199 machine_mode mode, vmode;
7e691189 2200 rtx dest, vdest, op0, op1, mask, op2, op3;
2bf6d935 2201
60efb1fe 2202 mode = GET_MODE (operands[0]);
2bf6d935 2203
75a97b59
L
2204 if (mode == HFmode)
2205 vmode = V8HFmode;
2206 else if (mode == SFmode)
2bf6d935
ML
2207 vmode = V4SFmode;
2208 else if (mode == DFmode)
2209 vmode = V2DFmode;
987a3082 2210 else if (mode == TFmode)
2bf6d935 2211 vmode = mode;
987a3082
UB
2212 else
2213 gcc_unreachable ();
2214
60efb1fe 2215 if (rtx_equal_p (operands[1], operands[2]))
2bf6d935 2216 {
60efb1fe 2217 emit_move_insn (operands[0], operands[1]);
2bf6d935
ML
2218 return;
2219 }
2220
7e691189
JJ
2221 dest = operands[0];
2222 vdest = lowpart_subreg (vmode, dest, mode);
2223 if (vdest == NULL_RTX)
2224 vdest = gen_reg_rtx (vmode);
2225 else
2226 dest = NULL_RTX;
2227 op1 = lowpart_subreg (vmode, force_reg (mode, operands[2]), mode);
60efb1fe 2228 mask = ix86_build_signbit_mask (vmode, 0, 0);
2bf6d935 2229
60efb1fe 2230 if (CONST_DOUBLE_P (operands[1]))
2bf6d935 2231 {
60efb1fe 2232 op0 = simplify_unary_operation (ABS, mode, operands[1], mode);
2233 /* Optimize for 0, simplify b = copy_signf (0.0f, a) to b = mask & a. */
2234 if (op0 == CONST0_RTX (mode))
2bf6d935 2235 {
7e691189
JJ
2236 emit_move_insn (vdest, gen_rtx_AND (vmode, mask, op1));
2237 if (dest)
2238 emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
60efb1fe 2239 return;
2bf6d935 2240 }
2bf6d935 2241
60efb1fe 2242 if (GET_MODE_SIZE (mode) < 16)
2243 op0 = ix86_build_const_vector (vmode, false, op0);
2244 op0 = force_reg (vmode, op0);
2bf6d935 2245 }
60efb1fe 2246 else
7e691189 2247 op0 = lowpart_subreg (vmode, force_reg (mode, operands[1]), mode);
60efb1fe 2248
2249 op2 = gen_reg_rtx (vmode);
2250 op3 = gen_reg_rtx (vmode);
2251 emit_move_insn (op2, gen_rtx_AND (vmode,
2252 gen_rtx_NOT (vmode, mask),
2253 op0));
2254 emit_move_insn (op3, gen_rtx_AND (vmode, mask, op1));
7e691189
JJ
2255 emit_move_insn (vdest, gen_rtx_IOR (vmode, op2, op3));
2256 if (dest)
2257 emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
2bf6d935
ML
2258}
2259
2260/* Expand an xorsign operation. */
2261
2262void
2263ix86_expand_xorsign (rtx operands[])
2264{
2bf6d935 2265 machine_mode mode, vmode;
7e691189 2266 rtx dest, vdest, op0, op1, mask, x, temp;
2bf6d935
ML
2267
2268 dest = operands[0];
2269 op0 = operands[1];
2270 op1 = operands[2];
2271
2272 mode = GET_MODE (dest);
2273
75a97b59
L
2274 if (mode == HFmode)
2275 vmode = V8HFmode;
2276 else if (mode == SFmode)
987a3082 2277 vmode = V4SFmode;
2bf6d935 2278 else if (mode == DFmode)
987a3082 2279 vmode = V2DFmode;
2bf6d935
ML
2280 else
2281 gcc_unreachable ();
2282
7485a525 2283 temp = gen_reg_rtx (vmode);
2bf6d935
ML
2284 mask = ix86_build_signbit_mask (vmode, 0, 0);
2285
7e691189 2286 op1 = lowpart_subreg (vmode, force_reg (mode, op1), mode);
7485a525
JJ
2287 x = gen_rtx_AND (vmode, op1, mask);
2288 emit_insn (gen_rtx_SET (temp, x));
2bf6d935 2289
7e691189 2290 op0 = lowpart_subreg (vmode, force_reg (mode, op0), mode);
7485a525 2291 x = gen_rtx_XOR (vmode, temp, op0);
652bef70 2292
7e691189
JJ
2293 vdest = lowpart_subreg (vmode, dest, mode);
2294 if (vdest == NULL_RTX)
2295 vdest = gen_reg_rtx (vmode);
2296 else
2297 dest = NULL_RTX;
2298 emit_insn (gen_rtx_SET (vdest, x));
2299
2300 if (dest)
2301 emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
2bf6d935
ML
2302}
2303
2304static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1);
2305
2306void
2307ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
2308{
2309 machine_mode mode = GET_MODE (op0);
2310 rtx tmp;
2311
2312 /* Handle special case - vector comparsion with boolean result, transform
2313 it using ptest instruction. */
850a13d7 2314 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
2315 || mode == OImode)
2bf6d935
ML
2316 {
2317 rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
2318 machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
2319
2320 gcc_assert (code == EQ || code == NE);
850a13d7 2321
2322 if (mode == OImode)
2323 {
2324 op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
2325 op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
2326 mode = p_mode;
2327 }
2bf6d935
ML
2328 /* Generate XOR since we can't check that one operand is zero vector. */
2329 tmp = gen_reg_rtx (mode);
2330 emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
2331 tmp = gen_lowpart (p_mode, tmp);
2332 emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
2333 gen_rtx_UNSPEC (CCmode,
2334 gen_rtvec (2, tmp, tmp),
2335 UNSPEC_PTEST)));
2336 tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
2337 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2338 gen_rtx_LABEL_REF (VOIDmode, label),
2339 pc_rtx);
2340 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2341 return;
2342 }
2343
2344 switch (mode)
2345 {
a6841211 2346 case E_HFmode:
2bf6d935
ML
2347 case E_SFmode:
2348 case E_DFmode:
2349 case E_XFmode:
2350 case E_QImode:
2351 case E_HImode:
2352 case E_SImode:
2353 simple:
2354 tmp = ix86_expand_compare (code, op0, op1);
2355 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2356 gen_rtx_LABEL_REF (VOIDmode, label),
2357 pc_rtx);
2358 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2359 return;
2360
2361 case E_DImode:
2362 if (TARGET_64BIT)
2363 goto simple;
2bf6d935
ML
2364 /* FALLTHRU */
2365 case E_TImode:
43201f2c
RS
2366 /* DI and TI mode equality/inequality comparisons may be performed
2367 on SSE registers. Avoid splitting them, except when optimizing
2368 for size. */
2369 if ((code == EQ || code == NE)
2370 && !optimize_insn_for_size_p ())
2371 goto simple;
2372
2bf6d935
ML
2373 /* Expand DImode branch into multiple compare+branch. */
2374 {
2375 rtx lo[2], hi[2];
2376 rtx_code_label *label2;
2377 enum rtx_code code1, code2, code3;
2378 machine_mode submode;
2379
2380 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
2381 {
2382 std::swap (op0, op1);
2383 code = swap_condition (code);
2384 }
2385
2386 split_double_mode (mode, &op0, 1, lo+0, hi+0);
2387 split_double_mode (mode, &op1, 1, lo+1, hi+1);
2388
2389 submode = mode == DImode ? SImode : DImode;
2390
43201f2c 2391 /* If we are doing less-than or greater-or-equal-than,
2bf6d935
ML
2392 op1 is a constant and the low word is zero, then we can just
2393 examine the high word. Similarly for low word -1 and
2394 less-or-equal-than or greater-than. */
2395
2396 if (CONST_INT_P (hi[1]))
2397 switch (code)
2398 {
2399 case LT: case LTU: case GE: case GEU:
2400 if (lo[1] == const0_rtx)
2401 {
2402 ix86_expand_branch (code, hi[0], hi[1], label);
2403 return;
2404 }
2405 break;
2406 case LE: case LEU: case GT: case GTU:
2407 if (lo[1] == constm1_rtx)
2408 {
2409 ix86_expand_branch (code, hi[0], hi[1], label);
2410 return;
2411 }
2412 break;
2413 default:
2414 break;
2415 }
2416
2417 /* Emulate comparisons that do not depend on Zero flag with
2418 double-word subtraction. Note that only Overflow, Sign
2419 and Carry flags are valid, so swap arguments and condition
2420 of comparisons that would otherwise test Zero flag. */
2421
2422 switch (code)
2423 {
2424 case LE: case LEU: case GT: case GTU:
2425 std::swap (lo[0], lo[1]);
2426 std::swap (hi[0], hi[1]);
2427 code = swap_condition (code);
2428 /* FALLTHRU */
2429
2430 case LT: case LTU: case GE: case GEU:
2431 {
2bf6d935 2432 bool uns = (code == LTU || code == GEU);
987a3082
UB
2433 rtx (*sbb_insn) (machine_mode, rtx, rtx, rtx)
2434 = uns ? gen_sub3_carry_ccc : gen_sub3_carry_ccgz;
2bf6d935
ML
2435
2436 if (!nonimmediate_operand (lo[0], submode))
2437 lo[0] = force_reg (submode, lo[0]);
2438 if (!x86_64_general_operand (lo[1], submode))
2439 lo[1] = force_reg (submode, lo[1]);
2440
2441 if (!register_operand (hi[0], submode))
2442 hi[0] = force_reg (submode, hi[0]);
2443 if ((uns && !nonimmediate_operand (hi[1], submode))
2444 || (!uns && !x86_64_general_operand (hi[1], submode)))
2445 hi[1] = force_reg (submode, hi[1]);
2446
987a3082 2447 emit_insn (gen_cmp_1 (submode, lo[0], lo[1]));
2bf6d935 2448
987a3082
UB
2449 tmp = gen_rtx_SCRATCH (submode);
2450 emit_insn (sbb_insn (submode, tmp, hi[0], hi[1]));
2bf6d935 2451
987a3082 2452 tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
2bf6d935
ML
2453 ix86_expand_branch (code, tmp, const0_rtx, label);
2454 return;
2455 }
2456
2457 default:
2458 break;
2459 }
2460
2461 /* Otherwise, we need two or three jumps. */
2462
2463 label2 = gen_label_rtx ();
2464
2465 code1 = code;
2466 code2 = swap_condition (code);
2467 code3 = unsigned_condition (code);
2468
2469 switch (code)
2470 {
2471 case LT: case GT: case LTU: case GTU:
2472 break;
2473
2474 case LE: code1 = LT; code2 = GT; break;
2475 case GE: code1 = GT; code2 = LT; break;
2476 case LEU: code1 = LTU; code2 = GTU; break;
2477 case GEU: code1 = GTU; code2 = LTU; break;
2478
2479 case EQ: code1 = UNKNOWN; code2 = NE; break;
2480 case NE: code2 = UNKNOWN; break;
2481
2482 default:
2483 gcc_unreachable ();
2484 }
2485
2486 /*
2487 * a < b =>
2488 * if (hi(a) < hi(b)) goto true;
2489 * if (hi(a) > hi(b)) goto false;
2490 * if (lo(a) < lo(b)) goto true;
2491 * false:
2492 */
2493
2494 if (code1 != UNKNOWN)
2495 ix86_expand_branch (code1, hi[0], hi[1], label);
2496 if (code2 != UNKNOWN)
2497 ix86_expand_branch (code2, hi[0], hi[1], label2);
2498
2499 ix86_expand_branch (code3, lo[0], lo[1], label);
2500
2501 if (code2 != UNKNOWN)
2502 emit_label (label2);
2503 return;
2504 }
2505
2506 default:
2507 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
2508 goto simple;
2509 }
2510}
2511
2512/* Figure out whether to use unordered fp comparisons. */
2513
2514static bool
2515ix86_unordered_fp_compare (enum rtx_code code)
2516{
2517 if (!TARGET_IEEE_FP)
2518 return false;
2519
2520 switch (code)
2521 {
2bf6d935
ML
2522 case LT:
2523 case LE:
d6038777
UB
2524 case GT:
2525 case GE:
2526 case LTGT:
2bf6d935
ML
2527 return false;
2528
2529 case EQ:
2530 case NE:
2531
2bf6d935
ML
2532 case UNORDERED:
2533 case ORDERED:
2534 case UNLT:
2535 case UNLE:
2536 case UNGT:
2537 case UNGE:
2538 case UNEQ:
2539 return true;
2540
2541 default:
2542 gcc_unreachable ();
2543 }
2544}
2545
2546/* Return a comparison we can do and that it is equivalent to
2547 swap_condition (code) apart possibly from orderedness.
2548 But, never change orderedness if TARGET_IEEE_FP, returning
2549 UNKNOWN in that case if necessary. */
2550
2551static enum rtx_code
2552ix86_fp_swap_condition (enum rtx_code code)
2553{
2554 switch (code)
2555 {
2556 case GT: /* GTU - CF=0 & ZF=0 */
2557 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
2558 case GE: /* GEU - CF=0 */
2559 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
2560 case UNLT: /* LTU - CF=1 */
2561 return TARGET_IEEE_FP ? UNKNOWN : GT;
2562 case UNLE: /* LEU - CF=1 | ZF=1 */
2563 return TARGET_IEEE_FP ? UNKNOWN : GE;
2564 default:
2565 return swap_condition (code);
2566 }
2567}
2568
2569/* Return cost of comparison CODE using the best strategy for performance.
2570 All following functions do use number of instructions as a cost metrics.
2571 In future this should be tweaked to compute bytes for optimize_size and
2572 take into account performance of various instructions on various CPUs. */
2573
2574static int
2575ix86_fp_comparison_cost (enum rtx_code code)
2576{
2577 int arith_cost;
2578
2579 /* The cost of code using bit-twiddling on %ah. */
2580 switch (code)
2581 {
2582 case UNLE:
2583 case UNLT:
2584 case LTGT:
2585 case GT:
2586 case GE:
2587 case UNORDERED:
2588 case ORDERED:
2589 case UNEQ:
2590 arith_cost = 4;
2591 break;
2592 case LT:
2593 case NE:
2594 case EQ:
2595 case UNGE:
2596 arith_cost = TARGET_IEEE_FP ? 5 : 4;
2597 break;
2598 case LE:
2599 case UNGT:
2600 arith_cost = TARGET_IEEE_FP ? 6 : 4;
2601 break;
2602 default:
2603 gcc_unreachable ();
2604 }
2605
2606 switch (ix86_fp_comparison_strategy (code))
2607 {
2608 case IX86_FPCMP_COMI:
2609 return arith_cost > 4 ? 3 : 2;
2610 case IX86_FPCMP_SAHF:
2611 return arith_cost > 4 ? 4 : 3;
2612 default:
2613 return arith_cost;
2614 }
2615}
2616
2617/* Swap, force into registers, or otherwise massage the two operands
2618 to a fp comparison. The operands are updated in place; the new
2619 comparison code is returned. */
2620
2621static enum rtx_code
2622ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
2623{
2624 bool unordered_compare = ix86_unordered_fp_compare (code);
2625 rtx op0 = *pop0, op1 = *pop1;
2626 machine_mode op_mode = GET_MODE (op0);
a6841211 2627 bool is_sse = SSE_FLOAT_MODE_SSEMATH_OR_HF_P (op_mode);
2bf6d935
ML
2628
2629 /* All of the unordered compare instructions only work on registers.
2630 The same is true of the fcomi compare instructions. The XFmode
2631 compare instructions require registers except when comparing
2632 against zero or when converting operand 1 from fixed point to
2633 floating point. */
2634
2635 if (!is_sse
2636 && (unordered_compare
2637 || (op_mode == XFmode
2638 && ! (standard_80387_constant_p (op0) == 1
2639 || standard_80387_constant_p (op1) == 1)
2640 && GET_CODE (op1) != FLOAT)
2641 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
2642 {
2643 op0 = force_reg (op_mode, op0);
2644 op1 = force_reg (op_mode, op1);
2645 }
2646 else
2647 {
2648 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
2649 things around if they appear profitable, otherwise force op0
2650 into a register. */
2651
2652 if (standard_80387_constant_p (op0) == 0
2653 || (MEM_P (op0)
2654 && ! (standard_80387_constant_p (op1) == 0
2655 || MEM_P (op1))))
2656 {
2657 enum rtx_code new_code = ix86_fp_swap_condition (code);
2658 if (new_code != UNKNOWN)
2659 {
2660 std::swap (op0, op1);
2661 code = new_code;
2662 }
2663 }
2664
2665 if (!REG_P (op0))
2666 op0 = force_reg (op_mode, op0);
2667
2668 if (CONSTANT_P (op1))
2669 {
2670 int tmp = standard_80387_constant_p (op1);
2671 if (tmp == 0)
2672 op1 = validize_mem (force_const_mem (op_mode, op1));
2673 else if (tmp == 1)
2674 {
2675 if (TARGET_CMOVE)
2676 op1 = force_reg (op_mode, op1);
2677 }
2678 else
2679 op1 = force_reg (op_mode, op1);
2680 }
2681 }
2682
2683 /* Try to rearrange the comparison to make it cheaper. */
2684 if (ix86_fp_comparison_cost (code)
2685 > ix86_fp_comparison_cost (swap_condition (code))
2686 && (REG_P (op1) || can_create_pseudo_p ()))
2687 {
2688 std::swap (op0, op1);
2689 code = swap_condition (code);
2690 if (!REG_P (op0))
2691 op0 = force_reg (op_mode, op0);
2692 }
2693
2694 *pop0 = op0;
2695 *pop1 = op1;
2696 return code;
2697}
2698
2699/* Generate insn patterns to do a floating point compare of OPERANDS. */
2700
2701static rtx
2702ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)
2703{
2704 bool unordered_compare = ix86_unordered_fp_compare (code);
2705 machine_mode cmp_mode;
2706 rtx tmp, scratch;
2707
2708 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
2709
2710 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
2711 if (unordered_compare)
2712 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
2713
2714 /* Do fcomi/sahf based test when profitable. */
2715 switch (ix86_fp_comparison_strategy (code))
2716 {
2717 case IX86_FPCMP_COMI:
2718 cmp_mode = CCFPmode;
2719 emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
2720 break;
2721
2722 case IX86_FPCMP_SAHF:
2723 cmp_mode = CCFPmode;
2724 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2725 scratch = gen_reg_rtx (HImode);
2726 emit_insn (gen_rtx_SET (scratch, tmp));
2727 emit_insn (gen_x86_sahf_1 (scratch));
2728 break;
2729
2730 case IX86_FPCMP_ARITH:
2731 cmp_mode = CCNOmode;
2732 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2733 scratch = gen_reg_rtx (HImode);
2734 emit_insn (gen_rtx_SET (scratch, tmp));
2735
2736 /* In the unordered case, we have to check C2 for NaN's, which
2737 doesn't happen to work out to anything nice combination-wise.
2738 So do some bit twiddling on the value we've got in AH to come
2739 up with an appropriate set of condition codes. */
2740
2741 switch (code)
2742 {
2743 case GT:
2744 case UNGT:
2745 if (code == GT || !TARGET_IEEE_FP)
2746 {
2747 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2748 code = EQ;
2749 }
2750 else
2751 {
2752 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2753 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2754 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
2755 cmp_mode = CCmode;
2756 code = GEU;
2757 }
2758 break;
2759 case LT:
2760 case UNLT:
2761 if (code == LT && TARGET_IEEE_FP)
2762 {
2763 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2764 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
2765 cmp_mode = CCmode;
2766 code = EQ;
2767 }
2768 else
2769 {
2770 emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
2771 code = NE;
2772 }
2773 break;
2774 case GE:
2775 case UNGE:
2776 if (code == GE || !TARGET_IEEE_FP)
2777 {
2778 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
2779 code = EQ;
2780 }
2781 else
2782 {
2783 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2784 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
2785 code = NE;
2786 }
2787 break;
2788 case LE:
2789 case UNLE:
2790 if (code == LE && TARGET_IEEE_FP)
2791 {
2792 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2793 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2794 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2795 cmp_mode = CCmode;
2796 code = LTU;
2797 }
2798 else
2799 {
2800 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2801 code = NE;
2802 }
2803 break;
2804 case EQ:
2805 case UNEQ:
2806 if (code == EQ && TARGET_IEEE_FP)
2807 {
2808 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2809 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2810 cmp_mode = CCmode;
2811 code = EQ;
2812 }
2813 else
2814 {
2815 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2816 code = NE;
2817 }
2818 break;
2819 case NE:
2820 case LTGT:
2821 if (code == NE && TARGET_IEEE_FP)
2822 {
2823 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2824 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
2825 GEN_INT (0x40)));
2826 code = NE;
2827 }
2828 else
2829 {
2830 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2831 code = EQ;
2832 }
2833 break;
2834
2835 case UNORDERED:
2836 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2837 code = NE;
2838 break;
2839 case ORDERED:
2840 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2841 code = EQ;
2842 break;
2843
2844 default:
2845 gcc_unreachable ();
2846 }
2847 break;
2848
2849 default:
2850 gcc_unreachable();
2851 }
2852
2853 /* Return the test that should be put into the flags user, i.e.
2854 the bcc, scc, or cmov instruction. */
2855 return gen_rtx_fmt_ee (code, VOIDmode,
2856 gen_rtx_REG (cmp_mode, FLAGS_REG),
2857 const0_rtx);
2858}
2859
2860/* Generate insn patterns to do an integer compare of OPERANDS. */
2861
2862static rtx
2863ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
2864{
2865 machine_mode cmpmode;
2866 rtx tmp, flags;
2867
86403f4e
UB
2868 /* Swap operands to emit carry flag comparison. */
2869 if ((code == GTU || code == LEU)
2870 && nonimmediate_operand (op1, VOIDmode))
2871 {
2872 std::swap (op0, op1);
2873 code = swap_condition (code);
2874 }
2875
2bf6d935
ML
2876 cmpmode = SELECT_CC_MODE (code, op0, op1);
2877 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
2878
2879 /* This is very simple, but making the interface the same as in the
2880 FP case makes the rest of the code easier. */
2881 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
2882 emit_insn (gen_rtx_SET (flags, tmp));
2883
2884 /* Return the test that should be put into the flags user, i.e.
2885 the bcc, scc, or cmov instruction. */
2886 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
2887}
2888
2889static rtx
2890ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
2891{
2892 rtx ret;
2893
2894 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
2895 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
2896
2897 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
2898 {
2899 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
2900 ret = ix86_expand_fp_compare (code, op0, op1);
2901 }
2902 else
2903 ret = ix86_expand_int_compare (code, op0, op1);
2904
2905 return ret;
2906}
2907
2908void
2909ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
2910{
2911 rtx ret;
2912
2913 gcc_assert (GET_MODE (dest) == QImode);
2914
2915 ret = ix86_expand_compare (code, op0, op1);
2916 PUT_MODE (ret, QImode);
2917 emit_insn (gen_rtx_SET (dest, ret));
2918}
2919
463d9108
JJ
2920/* Expand floating point op0 <=> op1, i.e.
2921 dest = op0 == op1 ? 0 : op0 < op1 ? -1 : op0 > op1 ? 1 : 2. */
2922
2923void
2924ix86_expand_fp_spaceship (rtx dest, rtx op0, rtx op1)
2925{
2926 gcc_checking_assert (ix86_fp_comparison_strategy (GT) != IX86_FPCMP_ARITH);
2927 rtx gt = ix86_expand_fp_compare (GT, op0, op1);
2928 rtx l0 = gen_label_rtx ();
2929 rtx l1 = gen_label_rtx ();
2930 rtx l2 = TARGET_IEEE_FP ? gen_label_rtx () : NULL_RTX;
2931 rtx lend = gen_label_rtx ();
2932 rtx tmp;
2933 rtx_insn *jmp;
2934 if (l2)
2935 {
2936 rtx un = gen_rtx_fmt_ee (UNORDERED, VOIDmode,
2937 gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx);
2938 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, un,
2939 gen_rtx_LABEL_REF (VOIDmode, l2), pc_rtx);
2940 jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2941 add_reg_br_prob_note (jmp, profile_probability:: very_unlikely ());
2942 }
2943 rtx eq = gen_rtx_fmt_ee (UNEQ, VOIDmode,
2944 gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx);
2945 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, eq,
2946 gen_rtx_LABEL_REF (VOIDmode, l0), pc_rtx);
2947 jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2948 add_reg_br_prob_note (jmp, profile_probability::unlikely ());
2949 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, gt,
2950 gen_rtx_LABEL_REF (VOIDmode, l1), pc_rtx);
2951 jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2952 add_reg_br_prob_note (jmp, profile_probability::even ());
2953 emit_move_insn (dest, constm1_rtx);
2954 emit_jump (lend);
2955 emit_label (l0);
2956 emit_move_insn (dest, const0_rtx);
2957 emit_jump (lend);
2958 emit_label (l1);
2959 emit_move_insn (dest, const1_rtx);
2960 emit_jump (lend);
2961 if (l2)
2962 {
2963 emit_label (l2);
2964 emit_move_insn (dest, const2_rtx);
2965 }
2966 emit_label (lend);
2967}
2968
2bf6d935
ML
2969/* Expand comparison setting or clearing carry flag. Return true when
2970 successful and set pop for the operation. */
2971static bool
2972ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
2973{
2974 machine_mode mode
2975 = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
2976
2977 /* Do not handle double-mode compares that go through special path. */
2978 if (mode == (TARGET_64BIT ? TImode : DImode))
2979 return false;
2980
2981 if (SCALAR_FLOAT_MODE_P (mode))
2982 {
2983 rtx compare_op;
2984 rtx_insn *compare_seq;
2985
2986 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
2987
2988 /* Shortcut: following common codes never translate
2989 into carry flag compares. */
2990 if (code == EQ || code == NE || code == UNEQ || code == LTGT
2991 || code == ORDERED || code == UNORDERED)
2992 return false;
2993
2994 /* These comparisons require zero flag; swap operands so they won't. */
2995 if ((code == GT || code == UNLE || code == LE || code == UNGT)
2996 && !TARGET_IEEE_FP)
2997 {
2998 std::swap (op0, op1);
2999 code = swap_condition (code);
3000 }
3001
3002 /* Try to expand the comparison and verify that we end up with
3003 carry flag based comparison. This fails to be true only when
3004 we decide to expand comparison using arithmetic that is not
3005 too common scenario. */
3006 start_sequence ();
3007 compare_op = ix86_expand_fp_compare (code, op0, op1);
3008 compare_seq = get_insns ();
3009 end_sequence ();
3010
3011 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
3012 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
3013 else
3014 code = GET_CODE (compare_op);
3015
3016 if (code != LTU && code != GEU)
3017 return false;
3018
3019 emit_insn (compare_seq);
3020 *pop = compare_op;
3021 return true;
3022 }
3023
3024 if (!INTEGRAL_MODE_P (mode))
3025 return false;
3026
3027 switch (code)
3028 {
3029 case LTU:
3030 case GEU:
3031 break;
3032
3033 /* Convert a==0 into (unsigned)a<1. */
3034 case EQ:
3035 case NE:
3036 if (op1 != const0_rtx)
3037 return false;
3038 op1 = const1_rtx;
3039 code = (code == EQ ? LTU : GEU);
3040 break;
3041
3042 /* Convert a>b into b<a or a>=b-1. */
3043 case GTU:
3044 case LEU:
3045 if (CONST_INT_P (op1))
3046 {
3047 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
3048 /* Bail out on overflow. We still can swap operands but that
3049 would force loading of the constant into register. */
3050 if (op1 == const0_rtx
3051 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
3052 return false;
3053 code = (code == GTU ? GEU : LTU);
3054 }
3055 else
3056 {
3057 std::swap (op0, op1);
3058 code = (code == GTU ? LTU : GEU);
3059 }
3060 break;
3061
3062 /* Convert a>=0 into (unsigned)a<0x80000000. */
3063 case LT:
3064 case GE:
3065 if (mode == DImode || op1 != const0_rtx)
3066 return false;
3067 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
3068 code = (code == LT ? GEU : LTU);
3069 break;
3070 case LE:
3071 case GT:
3072 if (mode == DImode || op1 != constm1_rtx)
3073 return false;
3074 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
3075 code = (code == LE ? GEU : LTU);
3076 break;
3077
3078 default:
3079 return false;
3080 }
3081 /* Swapping operands may cause constant to appear as first operand. */
3082 if (!nonimmediate_operand (op0, VOIDmode))
3083 {
3084 if (!can_create_pseudo_p ())
3085 return false;
3086 op0 = force_reg (mode, op0);
3087 }
3088 *pop = ix86_expand_compare (code, op0, op1);
3089 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
3090 return true;
3091}
3092
3093/* Expand conditional increment or decrement using adb/sbb instructions.
3094 The default case using setcc followed by the conditional move can be
3095 done by generic code. */
3096bool
3097ix86_expand_int_addcc (rtx operands[])
3098{
3099 enum rtx_code code = GET_CODE (operands[1]);
3100 rtx flags;
987a3082 3101 rtx (*insn) (machine_mode, rtx, rtx, rtx, rtx, rtx);
2bf6d935
ML
3102 rtx compare_op;
3103 rtx val = const0_rtx;
3104 bool fpcmp = false;
3105 machine_mode mode;
3106 rtx op0 = XEXP (operands[1], 0);
3107 rtx op1 = XEXP (operands[1], 1);
3108
3109 if (operands[3] != const1_rtx
3110 && operands[3] != constm1_rtx)
3111 return false;
3112 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
3113 return false;
3114 code = GET_CODE (compare_op);
3115
3116 flags = XEXP (compare_op, 0);
3117
3118 if (GET_MODE (flags) == CCFPmode)
3119 {
3120 fpcmp = true;
3121 code = ix86_fp_compare_code_to_integer (code);
3122 }
3123
3124 if (code != LTU)
3125 {
3126 val = constm1_rtx;
3127 if (fpcmp)
3128 PUT_CODE (compare_op,
3129 reverse_condition_maybe_unordered
3130 (GET_CODE (compare_op)));
3131 else
3132 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
3133 }
3134
3135 mode = GET_MODE (operands[0]);
3136
3137 /* Construct either adc or sbb insn. */
3138 if ((code == LTU) == (operands[3] == constm1_rtx))
987a3082 3139 insn = gen_sub3_carry;
2bf6d935 3140 else
987a3082
UB
3141 insn = gen_add3_carry;
3142
3143 emit_insn (insn (mode, operands[0], operands[2], val, flags, compare_op));
2bf6d935
ML
3144
3145 return true;
3146}
3147
3148bool
3149ix86_expand_int_movcc (rtx operands[])
3150{
3151 enum rtx_code code = GET_CODE (operands[1]), compare_code;
3152 rtx_insn *compare_seq;
3153 rtx compare_op;
3154 machine_mode mode = GET_MODE (operands[0]);
3155 bool sign_bit_compare_p = false;
f1652e33 3156 bool negate_cc_compare_p = false;
2bf6d935
ML
3157 rtx op0 = XEXP (operands[1], 0);
3158 rtx op1 = XEXP (operands[1], 1);
1ceddd74
JJ
3159 rtx op2 = operands[2];
3160 rtx op3 = operands[3];
2bf6d935
ML
3161
3162 if (GET_MODE (op0) == TImode
3163 || (GET_MODE (op0) == DImode
3164 && !TARGET_64BIT))
3165 return false;
3166
3167 start_sequence ();
3168 compare_op = ix86_expand_compare (code, op0, op1);
3169 compare_seq = get_insns ();
3170 end_sequence ();
3171
3172 compare_code = GET_CODE (compare_op);
3173
3174 if ((op1 == const0_rtx && (code == GE || code == LT))
3175 || (op1 == constm1_rtx && (code == GT || code == LE)))
3176 sign_bit_compare_p = true;
3177
1ceddd74
JJ
3178 /* op0 == op1 ? op0 : op3 is equivalent to op0 == op1 ? op1 : op3,
3179 but if op1 is a constant, the latter form allows more optimizations,
3180 either through the last 2 ops being constant handling, or the one
3181 constant and one variable cases. On the other side, for cmov the
3182 former might be better as we don't need to load the constant into
3183 another register. */
3184 if (code == EQ && CONST_INT_P (op1) && rtx_equal_p (op0, op2))
3185 op2 = op1;
3186 /* Similarly for op0 != op1 ? op2 : op0 and op0 != op1 ? op2 : op1. */
3187 else if (code == NE && CONST_INT_P (op1) && rtx_equal_p (op0, op3))
3188 op3 = op1;
3189
2bf6d935
ML
3190 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
3191 HImode insns, we'd be swallowed in word prefix ops. */
3192
3193 if ((mode != HImode || TARGET_FAST_PREFIX)
3194 && (mode != (TARGET_64BIT ? TImode : DImode))
1ceddd74
JJ
3195 && CONST_INT_P (op2)
3196 && CONST_INT_P (op3))
2bf6d935
ML
3197 {
3198 rtx out = operands[0];
1ceddd74
JJ
3199 HOST_WIDE_INT ct = INTVAL (op2);
3200 HOST_WIDE_INT cf = INTVAL (op3);
2bf6d935
ML
3201 HOST_WIDE_INT diff;
3202
f1652e33
RS
3203 if ((mode == SImode
3204 || (TARGET_64BIT && mode == DImode))
3205 && (GET_MODE (op0) == SImode
3206 || (TARGET_64BIT && GET_MODE (op0) == DImode)))
3207 {
3208 /* Special case x != 0 ? -1 : y. */
3209 if (code == NE && op1 == const0_rtx && ct == -1)
3210 {
3211 negate_cc_compare_p = true;
3212 std::swap (ct, cf);
3213 code = EQ;
3214 }
3215 else if (code == EQ && op1 == const0_rtx && cf == -1)
3216 negate_cc_compare_p = true;
3217 }
3218
2bf6d935
ML
3219 diff = ct - cf;
3220 /* Sign bit compares are better done using shifts than we do by using
3221 sbb. */
3222 if (sign_bit_compare_p
f1652e33 3223 || negate_cc_compare_p
2bf6d935
ML
3224 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
3225 {
3226 /* Detect overlap between destination and compare sources. */
3227 rtx tmp = out;
3228
f1652e33
RS
3229 if (negate_cc_compare_p)
3230 {
3231 if (GET_MODE (op0) == DImode)
3232 emit_insn (gen_x86_negdi_ccc (gen_reg_rtx (DImode), op0));
3233 else
3234 emit_insn (gen_x86_negsi_ccc (gen_reg_rtx (SImode),
3235 gen_lowpart (SImode, op0)));
3236
3237 tmp = gen_reg_rtx (mode);
3238 if (mode == DImode)
3239 emit_insn (gen_x86_movdicc_0_m1_neg (tmp));
3240 else
3241 emit_insn (gen_x86_movsicc_0_m1_neg (gen_lowpart (SImode,
3242 tmp)));
3243 }
3244 else if (!sign_bit_compare_p)
2bf6d935
ML
3245 {
3246 rtx flags;
3247 bool fpcmp = false;
3248
3249 compare_code = GET_CODE (compare_op);
3250
3251 flags = XEXP (compare_op, 0);
3252
3253 if (GET_MODE (flags) == CCFPmode)
3254 {
3255 fpcmp = true;
3256 compare_code
3257 = ix86_fp_compare_code_to_integer (compare_code);
3258 }
3259
3260 /* To simplify rest of code, restrict to the GEU case. */
3261 if (compare_code == LTU)
3262 {
3263 std::swap (ct, cf);
3264 compare_code = reverse_condition (compare_code);
3265 code = reverse_condition (code);
3266 }
3267 else
3268 {
3269 if (fpcmp)
3270 PUT_CODE (compare_op,
3271 reverse_condition_maybe_unordered
3272 (GET_CODE (compare_op)));
3273 else
3274 PUT_CODE (compare_op,
3275 reverse_condition (GET_CODE (compare_op)));
3276 }
3277 diff = ct - cf;
3278
cb1758d9 3279 if (reg_overlap_mentioned_p (out, compare_op))
2bf6d935
ML
3280 tmp = gen_reg_rtx (mode);
3281
3282 if (mode == DImode)
3283 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
3284 else
3285 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
3286 flags, compare_op));
3287 }
3288 else
3289 {
3290 if (code == GT || code == GE)
3291 code = reverse_condition (code);
3292 else
3293 {
3294 std::swap (ct, cf);
3295 diff = ct - cf;
3296 }
3297 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
3298 }
3299
3300 if (diff == 1)
3301 {
3302 /*
3303 * cmpl op0,op1
3304 * sbbl dest,dest
3305 * [addl dest, ct]
3306 *
3307 * Size 5 - 8.
3308 */
3309 if (ct)
3310 tmp = expand_simple_binop (mode, PLUS,
3311 tmp, GEN_INT (ct),
3312 copy_rtx (tmp), 1, OPTAB_DIRECT);
3313 }
3314 else if (cf == -1)
3315 {
3316 /*
3317 * cmpl op0,op1
3318 * sbbl dest,dest
3319 * orl $ct, dest
3320 *
3321 * Size 8.
3322 */
3323 tmp = expand_simple_binop (mode, IOR,
3324 tmp, GEN_INT (ct),
3325 copy_rtx (tmp), 1, OPTAB_DIRECT);
3326 }
3327 else if (diff == -1 && ct)
3328 {
3329 /*
3330 * cmpl op0,op1
3331 * sbbl dest,dest
3332 * notl dest
3333 * [addl dest, cf]
3334 *
3335 * Size 8 - 11.
3336 */
3337 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3338 if (cf)
3339 tmp = expand_simple_binop (mode, PLUS,
3340 copy_rtx (tmp), GEN_INT (cf),
3341 copy_rtx (tmp), 1, OPTAB_DIRECT);
3342 }
3343 else
3344 {
3345 /*
3346 * cmpl op0,op1
3347 * sbbl dest,dest
3348 * [notl dest]
3349 * andl cf - ct, dest
3350 * [addl dest, ct]
3351 *
3352 * Size 8 - 11.
3353 */
3354
3355 if (cf == 0)
3356 {
3357 cf = ct;
3358 ct = 0;
3359 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3360 }
3361
3362 tmp = expand_simple_binop (mode, AND,
3363 copy_rtx (tmp),
3364 gen_int_mode (cf - ct, mode),
3365 copy_rtx (tmp), 1, OPTAB_DIRECT);
3366 if (ct)
3367 tmp = expand_simple_binop (mode, PLUS,
3368 copy_rtx (tmp), GEN_INT (ct),
3369 copy_rtx (tmp), 1, OPTAB_DIRECT);
3370 }
3371
3372 if (!rtx_equal_p (tmp, out))
3373 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
3374
3375 return true;
3376 }
3377
3378 if (diff < 0)
3379 {
3380 machine_mode cmp_mode = GET_MODE (op0);
3381 enum rtx_code new_code;
3382
3383 if (SCALAR_FLOAT_MODE_P (cmp_mode))
3384 {
3385 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3386
8f17461b
UB
3387 /* We may be reversing a non-trapping
3388 comparison to a trapping comparison. */
3389 if (HONOR_NANS (cmp_mode) && flag_trapping_math
3390 && code != EQ && code != NE
3391 && code != ORDERED && code != UNORDERED)
3392 new_code = UNKNOWN;
3393 else
3394 new_code = reverse_condition_maybe_unordered (code);
2bf6d935
ML
3395 }
3396 else
3397 new_code = ix86_reverse_condition (code, cmp_mode);
3398 if (new_code != UNKNOWN)
3399 {
3400 std::swap (ct, cf);
3401 diff = -diff;
3402 code = new_code;
3403 }
3404 }
3405
3406 compare_code = UNKNOWN;
3407 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
3408 && CONST_INT_P (op1))
3409 {
3410 if (op1 == const0_rtx
3411 && (code == LT || code == GE))
3412 compare_code = code;
3413 else if (op1 == constm1_rtx)
3414 {
3415 if (code == LE)
3416 compare_code = LT;
3417 else if (code == GT)
3418 compare_code = GE;
3419 }
3420 }
3421
3422 /* Optimize dest = (op0 < 0) ? -1 : cf. */
3423 if (compare_code != UNKNOWN
3424 && GET_MODE (op0) == GET_MODE (out)
3425 && (cf == -1 || ct == -1))
3426 {
3427 /* If lea code below could be used, only optimize
3428 if it results in a 2 insn sequence. */
3429
3430 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
3431 || diff == 3 || diff == 5 || diff == 9)
3432 || (compare_code == LT && ct == -1)
3433 || (compare_code == GE && cf == -1))
3434 {
3435 /*
3436 * notl op1 (if necessary)
3437 * sarl $31, op1
3438 * orl cf, op1
3439 */
3440 if (ct != -1)
3441 {
3442 cf = ct;
3443 ct = -1;
3444 code = reverse_condition (code);
3445 }
3446
3447 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3448
3449 out = expand_simple_binop (mode, IOR,
3450 out, GEN_INT (cf),
3451 out, 1, OPTAB_DIRECT);
3452 if (out != operands[0])
3453 emit_move_insn (operands[0], out);
3454
3455 return true;
3456 }
3457 }
3458
3459
3460 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
3461 || diff == 3 || diff == 5 || diff == 9)
3462 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
3463 && (mode != DImode
3464 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
3465 {
3466 /*
3467 * xorl dest,dest
3468 * cmpl op1,op2
3469 * setcc dest
3470 * lea cf(dest*(ct-cf)),dest
3471 *
3472 * Size 14.
3473 *
3474 * This also catches the degenerate setcc-only case.
3475 */
3476
3477 rtx tmp;
3478 int nops;
3479
3480 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3481
3482 nops = 0;
3483 /* On x86_64 the lea instruction operates on Pmode, so we need
3484 to get arithmetics done in proper mode to match. */
3485 if (diff == 1)
3486 tmp = copy_rtx (out);
3487 else
3488 {
3489 rtx out1;
3490 out1 = copy_rtx (out);
3491 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
3492 nops++;
3493 if (diff & 1)
3494 {
3495 tmp = gen_rtx_PLUS (mode, tmp, out1);
3496 nops++;
3497 }
3498 }
3499 if (cf != 0)
3500 {
c3185b64 3501 tmp = plus_constant (mode, tmp, cf);
2bf6d935
ML
3502 nops++;
3503 }
3504 if (!rtx_equal_p (tmp, out))
3505 {
3506 if (nops == 1)
3507 out = force_operand (tmp, copy_rtx (out));
3508 else
3509 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
3510 }
3511 if (!rtx_equal_p (out, operands[0]))
3512 emit_move_insn (operands[0], copy_rtx (out));
3513
3514 return true;
3515 }
3516
3517 /*
3518 * General case: Jumpful:
3519 * xorl dest,dest cmpl op1, op2
3520 * cmpl op1, op2 movl ct, dest
3521 * setcc dest jcc 1f
3522 * decl dest movl cf, dest
3523 * andl (cf-ct),dest 1:
3524 * addl ct,dest
3525 *
3526 * Size 20. Size 14.
3527 *
3528 * This is reasonably steep, but branch mispredict costs are
3529 * high on modern cpus, so consider failing only if optimizing
3530 * for space.
3531 */
3532
3533 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3534 && BRANCH_COST (optimize_insn_for_speed_p (),
3535 false) >= 2)
3536 {
3537 if (cf == 0)
3538 {
3539 machine_mode cmp_mode = GET_MODE (op0);
3540 enum rtx_code new_code;
3541
3542 if (SCALAR_FLOAT_MODE_P (cmp_mode))
3543 {
3544 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3545
8f17461b
UB
3546 /* We may be reversing a non-trapping
3547 comparison to a trapping comparison. */
3548 if (HONOR_NANS (cmp_mode) && flag_trapping_math
3549 && code != EQ && code != NE
3550 && code != ORDERED && code != UNORDERED)
3551 new_code = UNKNOWN;
3552 else
3553 new_code = reverse_condition_maybe_unordered (code);
3554
2bf6d935
ML
3555 }
3556 else
3557 {
3558 new_code = ix86_reverse_condition (code, cmp_mode);
3559 if (compare_code != UNKNOWN && new_code != UNKNOWN)
3560 compare_code = reverse_condition (compare_code);
3561 }
3562
3563 if (new_code != UNKNOWN)
3564 {
3565 cf = ct;
3566 ct = 0;
3567 code = new_code;
3568 }
3569 }
3570
3571 if (compare_code != UNKNOWN)
3572 {
3573 /* notl op1 (if needed)
3574 sarl $31, op1
3575 andl (cf-ct), op1
3576 addl ct, op1
3577
3578 For x < 0 (resp. x <= -1) there will be no notl,
3579 so if possible swap the constants to get rid of the
3580 complement.
3581 True/false will be -1/0 while code below (store flag
3582 followed by decrement) is 0/-1, so the constants need
3583 to be exchanged once more. */
3584
3585 if (compare_code == GE || !cf)
3586 {
3587 code = reverse_condition (code);
3588 compare_code = LT;
3589 }
3590 else
3591 std::swap (ct, cf);
3592
3593 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3594 }
3595 else
3596 {
3597 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3598
3599 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
3600 constm1_rtx,
3601 copy_rtx (out), 1, OPTAB_DIRECT);
3602 }
3603
3604 out = expand_simple_binop (mode, AND, copy_rtx (out),
3605 gen_int_mode (cf - ct, mode),
3606 copy_rtx (out), 1, OPTAB_DIRECT);
3607 if (ct)
3608 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
3609 copy_rtx (out), 1, OPTAB_DIRECT);
3610 if (!rtx_equal_p (out, operands[0]))
3611 emit_move_insn (operands[0], copy_rtx (out));
3612
3613 return true;
3614 }
3615 }
3616
3617 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3618 {
3619 /* Try a few things more with specific constants and a variable. */
3620
3621 optab op;
3622 rtx var, orig_out, out, tmp;
3623
3624 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
3625 return false;
3626
1ceddd74
JJ
3627 operands[2] = op2;
3628 operands[3] = op3;
3629
2bf6d935
ML
3630 /* If one of the two operands is an interesting constant, load a
3631 constant with the above and mask it in with a logical operation. */
3632
3633 if (CONST_INT_P (operands[2]))
3634 {
3635 var = operands[3];
3636 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
3637 operands[3] = constm1_rtx, op = and_optab;
3638 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
3639 operands[3] = const0_rtx, op = ior_optab;
3640 else
3641 return false;
3642 }
3643 else if (CONST_INT_P (operands[3]))
3644 {
3645 var = operands[2];
3646 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
e4ced0b6
RS
3647 {
3648 /* For smin (x, 0), expand as "x < 0 ? x : 0" instead of
3649 "x <= 0 ? x : 0" to enable sign_bit_compare_p. */
3650 if (code == LE && op1 == const0_rtx && rtx_equal_p (op0, var))
3651 operands[1] = simplify_gen_relational (LT, VOIDmode,
3652 GET_MODE (op0),
3653 op0, const0_rtx);
3654
3655 operands[2] = constm1_rtx;
3656 op = and_optab;
3657 }
2bf6d935
ML
3658 else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
3659 operands[2] = const0_rtx, op = ior_optab;
3660 else
3661 return false;
3662 }
3663 else
3664 return false;
3665
3666 orig_out = operands[0];
3667 tmp = gen_reg_rtx (mode);
3668 operands[0] = tmp;
3669
3670 /* Recurse to get the constant loaded. */
3671 if (!ix86_expand_int_movcc (operands))
3672 return false;
3673
3674 /* Mask in the interesting variable. */
3675 out = expand_binop (mode, op, var, tmp, orig_out, 0,
3676 OPTAB_WIDEN);
3677 if (!rtx_equal_p (out, orig_out))
3678 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
3679
3680 return true;
3681 }
3682
3683 /*
3684 * For comparison with above,
3685 *
3686 * movl cf,dest
3687 * movl ct,tmp
3688 * cmpl op1,op2
3689 * cmovcc tmp,dest
3690 *
3691 * Size 15.
3692 */
3693
3694 if (! nonimmediate_operand (operands[2], mode))
3695 operands[2] = force_reg (mode, operands[2]);
3696 if (! nonimmediate_operand (operands[3], mode))
3697 operands[3] = force_reg (mode, operands[3]);
3698
3699 if (! register_operand (operands[2], VOIDmode)
3700 && (mode == QImode
3701 || ! register_operand (operands[3], VOIDmode)))
3702 operands[2] = force_reg (mode, operands[2]);
3703
3704 if (mode == QImode
3705 && ! register_operand (operands[3], VOIDmode))
3706 operands[3] = force_reg (mode, operands[3]);
3707
3708 emit_insn (compare_seq);
3709 emit_insn (gen_rtx_SET (operands[0],
3710 gen_rtx_IF_THEN_ELSE (mode,
3711 compare_op, operands[2],
3712 operands[3])));
3713 return true;
3714}
3715
3716/* Detect conditional moves that exactly match min/max operational
3717 semantics. Note that this is IEEE safe, as long as we don't
3718 interchange the operands.
3719
3720 Returns FALSE if this conditional move doesn't match a MIN/MAX,
3721 and TRUE if the operation is successful and instructions are emitted. */
3722
3723static bool
3724ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
3725 rtx cmp_op1, rtx if_true, rtx if_false)
3726{
3727 machine_mode mode;
3728 bool is_min;
3729 rtx tmp;
3730
3731 if (code == LT)
3732 ;
3733 else if (code == UNGE)
3734 std::swap (if_true, if_false);
3735 else
3736 return false;
3737
3738 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
3739 is_min = true;
3740 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
3741 is_min = false;
3742 else
3743 return false;
3744
3745 mode = GET_MODE (dest);
3746
3747 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
3748 but MODE may be a vector mode and thus not appropriate. */
3749 if (!flag_finite_math_only || flag_signed_zeros)
3750 {
3751 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
3752 rtvec v;
3753
3754 if_true = force_reg (mode, if_true);
3755 v = gen_rtvec (2, if_true, if_false);
3756 tmp = gen_rtx_UNSPEC (mode, v, u);
3757 }
3758 else
3759 {
3760 code = is_min ? SMIN : SMAX;
3761 if (MEM_P (if_true) && MEM_P (if_false))
3762 if_true = force_reg (mode, if_true);
3763 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
3764 }
3765
3766 emit_insn (gen_rtx_SET (dest, tmp));
3767 return true;
3768}
3769
8b905e9b
HL
3770/* Return true if MODE is valid for vector compare to mask register,
3771 Same result for conditionl vector move with mask register. */
3772static bool
3773ix86_valid_mask_cmp_mode (machine_mode mode)
3774{
3775 /* XOP has its own vector conditional movement. */
a8654147 3776 if (TARGET_XOP && !TARGET_AVX512F)
8b905e9b
HL
3777 return false;
3778
0d788c35 3779 /* HFmode only supports vcmpsh whose dest is mask register. */
3780 if (TARGET_AVX512FP16 && mode == HFmode)
3781 return true;
3782
8b905e9b
HL
3783 /* AVX512F is needed for mask operation. */
3784 if (!(TARGET_AVX512F && VECTOR_MODE_P (mode)))
3785 return false;
3786
3787 /* AVX512BW is needed for vector QI/HImode,
3788 AVX512VL is needed for 128/256-bit vector. */
3789 machine_mode inner_mode = GET_MODE_INNER (mode);
3790 int vector_size = GET_MODE_SIZE (mode);
3791 if ((inner_mode == QImode || inner_mode == HImode) && !TARGET_AVX512BW)
3792 return false;
3793
3794 return vector_size == 64 || TARGET_AVX512VL;
3795}
3796
8d0737d8 3797/* Return true if integer mask comparison should be used. */
3798static bool
3799ix86_use_mask_cmp_p (machine_mode mode, machine_mode cmp_mode,
3800 rtx op_true, rtx op_false)
3801{
92f372f0
UB
3802 int vector_size = GET_MODE_SIZE (mode);
3803
0d788c35 3804 if (cmp_mode == HFmode)
3805 return true;
3806 else if (vector_size < 16)
92f372f0
UB
3807 return false;
3808 else if (vector_size == 64)
8d0737d8 3809 return true;
9ce50028
HW
3810 else if (GET_MODE_INNER (cmp_mode) == HFmode)
3811 return true;
8d0737d8 3812
3813 /* When op_true is NULL, op_false must be NULL, or vice versa. */
3814 gcc_assert (!op_true == !op_false);
3815
3816 /* When op_true/op_false is NULL or cmp_mode is not valid mask cmp mode,
3817 vector dest is required. */
3818 if (!op_true || !ix86_valid_mask_cmp_mode (cmp_mode))
3819 return false;
3820
3821 /* Exclude those that could be optimized in ix86_expand_sse_movcc. */
3822 if (op_false == CONST0_RTX (mode)
3823 || op_true == CONST0_RTX (mode)
3824 || (INTEGRAL_MODE_P (mode)
3825 && (op_true == CONSTM1_RTX (mode)
3826 || op_false == CONSTM1_RTX (mode))))
3827 return false;
3828
3829 return true;
3830}
3831
2bf6d935
ML
3832/* Expand an SSE comparison. Return the register with the result. */
3833
3834static rtx
3835ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
3836 rtx op_true, rtx op_false)
3837{
3838 machine_mode mode = GET_MODE (dest);
3839 machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
3840
3841 /* In general case result of comparison can differ from operands' type. */
3842 machine_mode cmp_mode;
3843
3844 /* In AVX512F the result of comparison is an integer mask. */
3845 bool maskcmp = false;
3846 rtx x;
3847
8d0737d8 3848 if (ix86_use_mask_cmp_p (mode, cmp_ops_mode, op_true, op_false))
2bf6d935
ML
3849 {
3850 unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
2bf6d935 3851 maskcmp = true;
8b905e9b 3852 cmp_mode = nbits > 8 ? int_mode_for_size (nbits, 0).require () : E_QImode;
2bf6d935
ML
3853 }
3854 else
3855 cmp_mode = cmp_ops_mode;
3856
3857 cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
3858
a86b3453 3859 bool (*op1_predicate)(rtx, machine_mode)
2bf6d935
ML
3860 = VECTOR_MODE_P (cmp_ops_mode) ? vector_operand : nonimmediate_operand;
3861
3862 if (!op1_predicate (cmp_op1, cmp_ops_mode))
3863 cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
3864
3865 if (optimize
3866 || (maskcmp && cmp_mode != mode)
3867 || (op_true && reg_overlap_mentioned_p (dest, op_true))
3868 || (op_false && reg_overlap_mentioned_p (dest, op_false)))
3869 dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
3870
99e4891e 3871 if (maskcmp)
3872 {
3873 bool ok = ix86_expand_mask_vec_cmp (dest, code, cmp_op0, cmp_op1);
3874 gcc_assert (ok);
3875 return dest;
3876 }
3877
2bf6d935
ML
3878 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
3879
8d0737d8 3880 if (cmp_mode != mode)
2bf6d935
ML
3881 {
3882 x = force_reg (cmp_ops_mode, x);
3883 convert_move (dest, x, false);
3884 }
3885 else
3886 emit_insn (gen_rtx_SET (dest, x));
3887
3888 return dest;
3889}
3890
b5193e35
UB
3891/* Emit x86 binary operand CODE in mode MODE for SSE vector
3892 instructions that can be performed using GP registers. */
3893
3894static void
3895ix86_emit_vec_binop (enum rtx_code code, machine_mode mode,
3896 rtx dst, rtx src1, rtx src2)
3897{
3898 rtx tmp;
3899
3900 tmp = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
3901
3902 if (GET_MODE_SIZE (mode) <= GET_MODE_SIZE (SImode)
3903 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
3904 {
3905 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
3906 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
3907 }
3908
3909 emit_insn (tmp);
3910}
3911
2bf6d935
ML
3912/* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
3913 operations. This is used for both scalar and vector conditional moves. */
3914
3915void
3916ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
3917{
3918 machine_mode mode = GET_MODE (dest);
3919 machine_mode cmpmode = GET_MODE (cmp);
f4a2cecd 3920 rtx x;
2bf6d935 3921
9b5d50b7 3922 /* Simplify trivial VEC_COND_EXPR to avoid ICE in pr97506. */
3923 if (rtx_equal_p (op_true, op_false))
3924 {
3925 emit_move_insn (dest, op_true);
3926 return;
3927 }
3928
2bf6d935
ML
3929 /* If we have an integer mask and FP value then we need
3930 to cast mask to FP mode. */
3931 if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
3932 {
3933 cmp = force_reg (cmpmode, cmp);
3934 cmp = gen_rtx_SUBREG (mode, cmp, 0);
3935 }
3936
8d0737d8 3937 /* In AVX512F the result of comparison is an integer mask. */
3938 if (mode != cmpmode
3939 && GET_MODE_CLASS (cmpmode) == MODE_INT)
2bf6d935 3940 {
8d0737d8 3941 gcc_assert (ix86_valid_mask_cmp_mode (mode));
0d788c35 3942 /* Using scalar/vector move with mask register. */
8b905e9b
HL
3943 cmp = force_reg (cmpmode, cmp);
3944 /* Optimize for mask zero. */
3945 op_true = (op_true != CONST0_RTX (mode)
3946 ? force_reg (mode, op_true) : op_true);
3947 op_false = (op_false != CONST0_RTX (mode)
3948 ? force_reg (mode, op_false) : op_false);
3949 if (op_true == CONST0_RTX (mode))
2bf6d935 3950 {
ee78c20e 3951 if (cmpmode == E_DImode && !TARGET_64BIT)
f4a2cecd
UB
3952 {
3953 x = gen_reg_rtx (cmpmode);
3954 emit_insn (gen_knotdi (x, cmp));
3955 }
ee78c20e 3956 else
f4a2cecd
UB
3957 x = expand_simple_unop (cmpmode, NOT, cmp, NULL, 1);
3958 cmp = x;
8b905e9b
HL
3959 /* Reverse op_true op_false. */
3960 std::swap (op_true, op_false);
2bf6d935 3961 }
8b905e9b 3962
0d788c35 3963 if (mode == HFmode)
3964 emit_insn (gen_movhf_mask (dest, op_true, op_false, cmp));
3965 else
f4a2cecd
UB
3966 emit_insn (gen_rtx_SET (dest,
3967 gen_rtx_VEC_MERGE (mode,
3968 op_true, op_false, cmp)));
8b905e9b 3969 return;
2bf6d935 3970 }
f4a2cecd
UB
3971
3972 if (vector_all_ones_operand (op_true, mode)
3973 && op_false == CONST0_RTX (mode))
2bf6d935 3974 {
f4a2cecd 3975 emit_move_insn (dest, cmp);
2bf6d935
ML
3976 return;
3977 }
3978 else if (op_false == CONST0_RTX (mode))
3979 {
f4a2cecd
UB
3980 x = expand_simple_binop (mode, AND, cmp, op_true,
3981 dest, 1, OPTAB_DIRECT);
3982 if (x != dest)
3983 emit_move_insn (dest, x);
2bf6d935
ML
3984 return;
3985 }
3986 else if (op_true == CONST0_RTX (mode))
3987 {
3988 op_false = force_reg (mode, op_false);
3989 x = gen_rtx_NOT (mode, cmp);
b5193e35 3990 ix86_emit_vec_binop (AND, mode, dest, x, op_false);
2bf6d935
ML
3991 return;
3992 }
f4a2cecd 3993 else if (vector_all_ones_operand (op_true, mode))
2bf6d935 3994 {
f4a2cecd
UB
3995 x = expand_simple_binop (mode, IOR, cmp, op_false,
3996 dest, 1, OPTAB_DIRECT);
3997 if (x != dest)
3998 emit_move_insn (dest, x);
2bf6d935
ML
3999 return;
4000 }
f4a2cecd
UB
4001
4002 if (TARGET_XOP)
2bf6d935
ML
4003 {
4004 op_true = force_reg (mode, op_true);
4005
f1693741
UB
4006 if (GET_MODE_SIZE (mode) < 16
4007 || !nonimmediate_operand (op_false, mode))
2bf6d935
ML
4008 op_false = force_reg (mode, op_false);
4009
f4a2cecd
UB
4010 emit_insn (gen_rtx_SET (dest,
4011 gen_rtx_IF_THEN_ELSE (mode, cmp,
4012 op_true, op_false)));
2bf6d935
ML
4013 return;
4014 }
4015
4016 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
f4a2cecd 4017 machine_mode blend_mode = mode;
2bf6d935 4018
f4a2cecd
UB
4019 if (GET_MODE_SIZE (mode) < 16
4020 || !vector_operand (op_true, mode))
2bf6d935
ML
4021 op_true = force_reg (mode, op_true);
4022
4023 op_false = force_reg (mode, op_false);
4024
4025 switch (mode)
4026 {
b1f7fd8a
UB
4027 case E_V2SFmode:
4028 if (TARGET_SSE4_1)
f4a2cecd 4029 gen = gen_mmx_blendvps;
b1f7fd8a 4030 break;
2bf6d935
ML
4031 case E_V4SFmode:
4032 if (TARGET_SSE4_1)
4033 gen = gen_sse4_1_blendvps;
4034 break;
4035 case E_V2DFmode:
4036 if (TARGET_SSE4_1)
4037 gen = gen_sse4_1_blendvpd;
4038 break;
4039 case E_SFmode:
4040 if (TARGET_SSE4_1)
f4a2cecd 4041 gen = gen_sse4_1_blendvss;
2bf6d935
ML
4042 break;
4043 case E_DFmode:
4044 if (TARGET_SSE4_1)
f4a2cecd 4045 gen = gen_sse4_1_blendvsd;
2bf6d935 4046 break;
5795ec0e
UB
4047 case E_V8QImode:
4048 case E_V4HImode:
4049 case E_V2SImode:
4050 if (TARGET_SSE4_1)
4051 {
820ac79e 4052 gen = gen_mmx_pblendvb_v8qi;
f4a2cecd 4053 blend_mode = V8QImode;
5795ec0e
UB
4054 }
4055 break;
2df9d3c5
UB
4056 case E_V4QImode:
4057 case E_V2HImode:
4058 if (TARGET_SSE4_1)
4059 {
820ac79e 4060 gen = gen_mmx_pblendvb_v4qi;
f4a2cecd 4061 blend_mode = V4QImode;
2df9d3c5
UB
4062 }
4063 break;
820ac79e
UB
4064 case E_V2QImode:
4065 if (TARGET_SSE4_1)
f4a2cecd 4066 gen = gen_mmx_pblendvb_v2qi;
820ac79e 4067 break;
2bf6d935
ML
4068 case E_V16QImode:
4069 case E_V8HImode:
9e2a82e1 4070 case E_V8HFmode:
6910cad5 4071 case E_V8BFmode:
2bf6d935
ML
4072 case E_V4SImode:
4073 case E_V2DImode:
793f847b 4074 case E_V1TImode:
2bf6d935
ML
4075 if (TARGET_SSE4_1)
4076 {
4077 gen = gen_sse4_1_pblendvb;
f4a2cecd 4078 blend_mode = V16QImode;
2bf6d935
ML
4079 }
4080 break;
4081 case E_V8SFmode:
4082 if (TARGET_AVX)
4083 gen = gen_avx_blendvps256;
4084 break;
4085 case E_V4DFmode:
4086 if (TARGET_AVX)
4087 gen = gen_avx_blendvpd256;
4088 break;
4089 case E_V32QImode:
4090 case E_V16HImode:
9e2a82e1 4091 case E_V16HFmode:
6910cad5 4092 case E_V16BFmode:
2bf6d935
ML
4093 case E_V8SImode:
4094 case E_V4DImode:
4095 if (TARGET_AVX2)
4096 {
4097 gen = gen_avx2_pblendvb;
f4a2cecd 4098 blend_mode = V32QImode;
2bf6d935
ML
4099 }
4100 break;
4101
4102 case E_V64QImode:
4103 gen = gen_avx512bw_blendmv64qi;
4104 break;
4105 case E_V32HImode:
4106 gen = gen_avx512bw_blendmv32hi;
4107 break;
9e2a82e1 4108 case E_V32HFmode:
4109 gen = gen_avx512bw_blendmv32hf;
4110 break;
6910cad5 4111 case E_V32BFmode:
4112 gen = gen_avx512bw_blendmv32bf;
4113 break;
2bf6d935
ML
4114 case E_V16SImode:
4115 gen = gen_avx512f_blendmv16si;
4116 break;
4117 case E_V8DImode:
4118 gen = gen_avx512f_blendmv8di;
4119 break;
4120 case E_V8DFmode:
4121 gen = gen_avx512f_blendmv8df;
4122 break;
4123 case E_V16SFmode:
4124 gen = gen_avx512f_blendmv16sf;
4125 break;
4126
4127 default:
4128 break;
4129 }
4130
4131 if (gen != NULL)
4132 {
f4a2cecd
UB
4133 if (blend_mode == mode)
4134 x = dest;
4135 else
4136 {
4137 x = gen_reg_rtx (blend_mode);
4138 op_false = gen_lowpart (blend_mode, op_false);
4139 op_true = gen_lowpart (blend_mode, op_true);
4140 cmp = gen_lowpart (blend_mode, cmp);
4141 }
4142
4143 emit_insn (gen (x, op_false, op_true, cmp));
4144
4145 if (x != dest)
4146 emit_move_insn (dest, gen_lowpart (mode, x));
2bf6d935
ML
4147 }
4148 else
4149 {
f4a2cecd 4150 rtx t2, t3;
2bf6d935 4151
f4a2cecd
UB
4152 t2 = expand_simple_binop (mode, AND, op_true, cmp,
4153 NULL, 1, OPTAB_DIRECT);
2bf6d935 4154
f4a2cecd 4155 t3 = gen_reg_rtx (mode);
2bf6d935 4156 x = gen_rtx_NOT (mode, cmp);
b5193e35 4157 ix86_emit_vec_binop (AND, mode, t3, x, op_false);
2bf6d935 4158
f4a2cecd
UB
4159 x = expand_simple_binop (mode, IOR, t3, t2,
4160 dest, 1, OPTAB_DIRECT);
4161 if (x != dest)
4162 emit_move_insn (dest, x);
2bf6d935
ML
4163 }
4164}
4165
4166/* Swap, force into registers, or otherwise massage the two operands
4167 to an sse comparison with a mask result. Thus we differ a bit from
4168 ix86_prepare_fp_compare_args which expects to produce a flags result.
4169
4170 The DEST operand exists to help determine whether to commute commutative
4171 operators. The POP0/POP1 operands are updated in place. The new
4172 comparison code is returned, or UNKNOWN if not implementable. */
4173
4174static enum rtx_code
4175ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
4176 rtx *pop0, rtx *pop1)
4177{
4178 switch (code)
4179 {
4180 case LTGT:
4181 case UNEQ:
4182 /* AVX supports all the needed comparisons. */
4183 if (TARGET_AVX)
4184 break;
4185 /* We have no LTGT as an operator. We could implement it with
4186 NE & ORDERED, but this requires an extra temporary. It's
4187 not clear that it's worth it. */
4188 return UNKNOWN;
4189
4190 case LT:
4191 case LE:
4192 case UNGT:
4193 case UNGE:
4194 /* These are supported directly. */
4195 break;
4196
4197 case EQ:
4198 case NE:
4199 case UNORDERED:
4200 case ORDERED:
4201 /* AVX has 3 operand comparisons, no need to swap anything. */
4202 if (TARGET_AVX)
4203 break;
4204 /* For commutative operators, try to canonicalize the destination
4205 operand to be first in the comparison - this helps reload to
4206 avoid extra moves. */
4207 if (!dest || !rtx_equal_p (dest, *pop1))
4208 break;
4209 /* FALLTHRU */
4210
4211 case GE:
4212 case GT:
4213 case UNLE:
4214 case UNLT:
4215 /* These are not supported directly before AVX, and furthermore
4216 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
4217 comparison operands to transform into something that is
4218 supported. */
4219 std::swap (*pop0, *pop1);
4220 code = swap_condition (code);
4221 break;
4222
4223 default:
4224 gcc_unreachable ();
4225 }
4226
4227 return code;
4228}
4229
4230/* Expand a floating-point conditional move. Return true if successful. */
4231
4232bool
4233ix86_expand_fp_movcc (rtx operands[])
4234{
4235 machine_mode mode = GET_MODE (operands[0]);
4236 enum rtx_code code = GET_CODE (operands[1]);
4237 rtx tmp, compare_op;
4238 rtx op0 = XEXP (operands[1], 0);
4239 rtx op1 = XEXP (operands[1], 1);
4240
a6841211 4241 if (SSE_FLOAT_MODE_SSEMATH_OR_HF_P (mode))
2bf6d935
ML
4242 {
4243 machine_mode cmode;
4244
4245 /* Since we've no cmove for sse registers, don't force bad register
4246 allocation just to gain access to it. Deny movcc when the
4247 comparison mode doesn't match the move mode. */
4248 cmode = GET_MODE (op0);
4249 if (cmode == VOIDmode)
4250 cmode = GET_MODE (op1);
4251 if (cmode != mode)
4252 return false;
4253
4254 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
4255 if (code == UNKNOWN)
4256 return false;
4257
4258 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
4259 operands[2], operands[3]))
4260 return true;
4261
4262 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
4263 operands[2], operands[3]);
4264 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
4265 return true;
4266 }
4267
4268 if (GET_MODE (op0) == TImode
4269 || (GET_MODE (op0) == DImode
4270 && !TARGET_64BIT))
4271 return false;
4272
4273 /* The floating point conditional move instructions don't directly
4274 support conditions resulting from a signed integer comparison. */
4275
4276 compare_op = ix86_expand_compare (code, op0, op1);
4277 if (!fcmov_comparison_operator (compare_op, VOIDmode))
4278 {
4279 tmp = gen_reg_rtx (QImode);
4280 ix86_expand_setcc (tmp, code, op0, op1);
4281
4282 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
4283 }
4284
4285 emit_insn (gen_rtx_SET (operands[0],
4286 gen_rtx_IF_THEN_ELSE (mode, compare_op,
4287 operands[2], operands[3])));
4288
4289 return true;
4290}
4291
4292/* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
4293
4294static int
4295ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
4296{
4297 switch (code)
4298 {
4299 case EQ:
4300 return 0;
4301 case LT:
4302 case LTU:
4303 return 1;
4304 case LE:
4305 case LEU:
4306 return 2;
4307 case NE:
4308 return 4;
4309 case GE:
4310 case GEU:
4311 return 5;
4312 case GT:
4313 case GTU:
4314 return 6;
4315 default:
4316 gcc_unreachable ();
4317 }
4318}
4319
4320/* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
4321
4322static int
4323ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
4324{
4325 switch (code)
4326 {
4327 case EQ:
4328 return 0x00;
4329 case NE:
4330 return 0x04;
4331 case GT:
4332 return 0x0e;
4333 case LE:
4334 return 0x02;
4335 case GE:
4336 return 0x0d;
4337 case LT:
4338 return 0x01;
4339 case UNLE:
4340 return 0x0a;
4341 case UNLT:
4342 return 0x09;
4343 case UNGE:
4344 return 0x05;
4345 case UNGT:
4346 return 0x06;
4347 case UNEQ:
4348 return 0x18;
4349 case LTGT:
4350 return 0x0c;
4351 case ORDERED:
4352 return 0x07;
4353 case UNORDERED:
4354 return 0x03;
4355 default:
4356 gcc_unreachable ();
4357 }
4358}
4359
4360/* Return immediate value to be used in UNSPEC_PCMP
4361 for comparison CODE in MODE. */
4362
4363static int
4364ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
4365{
4366 if (FLOAT_MODE_P (mode))
4367 return ix86_fp_cmp_code_to_pcmp_immediate (code);
4368 return ix86_int_cmp_code_to_pcmp_immediate (code);
4369}
4370
4371/* Expand AVX-512 vector comparison. */
4372
4373bool
99e4891e 4374ix86_expand_mask_vec_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1)
2bf6d935 4375{
99e4891e 4376 machine_mode mask_mode = GET_MODE (dest);
4377 machine_mode cmp_mode = GET_MODE (cmp_op0);
2bf6d935
ML
4378 rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
4379 int unspec_code;
4380 rtx unspec;
4381
4382 switch (code)
4383 {
4384 case LEU:
4385 case GTU:
4386 case GEU:
4387 case LTU:
4388 unspec_code = UNSPEC_UNSIGNED_PCMP;
4389 break;
4390
4391 default:
4392 unspec_code = UNSPEC_PCMP;
4393 }
4394
99e4891e 4395 unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, cmp_op0, cmp_op1, imm),
2bf6d935 4396 unspec_code);
99e4891e 4397 emit_insn (gen_rtx_SET (dest, unspec));
2bf6d935
ML
4398
4399 return true;
4400}
4401
4402/* Expand fp vector comparison. */
4403
4404bool
4405ix86_expand_fp_vec_cmp (rtx operands[])
4406{
4407 enum rtx_code code = GET_CODE (operands[1]);
4408 rtx cmp;
4409
4410 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
4411 &operands[2], &operands[3]);
4412 if (code == UNKNOWN)
4413 {
4414 rtx temp;
4415 switch (GET_CODE (operands[1]))
4416 {
4417 case LTGT:
4418 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
4419 operands[3], NULL, NULL);
4420 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
4421 operands[3], NULL, NULL);
4422 code = AND;
4423 break;
4424 case UNEQ:
4425 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
4426 operands[3], NULL, NULL);
4427 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
4428 operands[3], NULL, NULL);
4429 code = IOR;
4430 break;
4431 default:
4432 gcc_unreachable ();
4433 }
4434 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
4435 OPTAB_DIRECT);
4436 }
4437 else
4438 cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
8d0737d8 4439 NULL, NULL);
2bf6d935
ML
4440
4441 if (operands[0] != cmp)
4442 emit_move_insn (operands[0], cmp);
4443
4444 return true;
4445}
4446
4447static rtx
4448ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
4449 rtx op_true, rtx op_false, bool *negate)
4450{
4451 machine_mode data_mode = GET_MODE (dest);
4452 machine_mode mode = GET_MODE (cop0);
4453 rtx x;
4454
4455 *negate = false;
4456
4457 /* XOP supports all of the comparisons on all 128-bit vector int types. */
4458 if (TARGET_XOP
6c67afaf
UB
4459 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT
4460 && GET_MODE_SIZE (mode) <= 16)
2bf6d935 4461 ;
8b905e9b
HL
4462 /* AVX512F supports all of the comparsions
4463 on all 128/256/512-bit vector int types. */
8d0737d8 4464 else if (ix86_use_mask_cmp_p (data_mode, mode, op_true, op_false))
8b905e9b 4465 ;
2bf6d935
ML
4466 else
4467 {
4468 /* Canonicalize the comparison to EQ, GT, GTU. */
4469 switch (code)
4470 {
4471 case EQ:
4472 case GT:
4473 case GTU:
4474 break;
4475
4476 case NE:
4477 case LE:
4478 case LEU:
4479 code = reverse_condition (code);
4480 *negate = true;
4481 break;
4482
4483 case GE:
4484 case GEU:
4485 code = reverse_condition (code);
4486 *negate = true;
4487 /* FALLTHRU */
4488
4489 case LT:
4490 case LTU:
4491 std::swap (cop0, cop1);
4492 code = swap_condition (code);
4493 break;
4494
4495 default:
4496 gcc_unreachable ();
4497 }
4498
4499 /* Only SSE4.1/SSE4.2 supports V2DImode. */
4500 if (mode == V2DImode)
4501 {
4502 switch (code)
4503 {
4504 case EQ:
4505 /* SSE4.1 supports EQ. */
4506 if (!TARGET_SSE4_1)
4507 return NULL;
4508 break;
4509
4510 case GT:
4511 case GTU:
4512 /* SSE4.2 supports GT/GTU. */
4513 if (!TARGET_SSE4_2)
4514 return NULL;
4515 break;
4516
4517 default:
4518 gcc_unreachable ();
4519 }
4520 }
4521
4522 rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode);
4523 rtx opfalse = op_false ? op_false : CONST0_RTX (data_mode);
4524 if (*negate)
4525 std::swap (optrue, opfalse);
4526
4527 /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when
4528 not using integer masks into min (x, y) == x ? -1 : 0 (i.e.
4529 min (x, y) == x). While we add one instruction (the minimum),
4530 we remove the need for two instructions in the negation, as the
4531 result is done this way.
4532 When using masks, do it for SI/DImode element types, as it is shorter
4533 than the two subtractions. */
4534 if ((code != EQ
4535 && GET_MODE_SIZE (mode) != 64
4536 && vector_all_ones_operand (opfalse, data_mode)
4537 && optrue == CONST0_RTX (data_mode))
4538 || (code == GTU
4539 && GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4
4540 /* Don't do it if not using integer masks and we'd end up with
4541 the right values in the registers though. */
4542 && (GET_MODE_SIZE (mode) == 64
4543 || !vector_all_ones_operand (optrue, data_mode)
4544 || opfalse != CONST0_RTX (data_mode))))
4545 {
4546 rtx (*gen) (rtx, rtx, rtx) = NULL;
4547
4548 switch (mode)
4549 {
4550 case E_V16SImode:
4551 gen = (code == GTU) ? gen_uminv16si3 : gen_sminv16si3;
4552 break;
4553 case E_V8DImode:
4554 gen = (code == GTU) ? gen_uminv8di3 : gen_sminv8di3;
4555 cop0 = force_reg (mode, cop0);
4556 cop1 = force_reg (mode, cop1);
4557 break;
4558 case E_V32QImode:
4559 if (TARGET_AVX2)
4560 gen = (code == GTU) ? gen_uminv32qi3 : gen_sminv32qi3;
4561 break;
4562 case E_V16HImode:
4563 if (TARGET_AVX2)
4564 gen = (code == GTU) ? gen_uminv16hi3 : gen_sminv16hi3;
4565 break;
4566 case E_V8SImode:
4567 if (TARGET_AVX2)
4568 gen = (code == GTU) ? gen_uminv8si3 : gen_sminv8si3;
4569 break;
4570 case E_V4DImode:
4571 if (TARGET_AVX512VL)
4572 {
4573 gen = (code == GTU) ? gen_uminv4di3 : gen_sminv4di3;
4574 cop0 = force_reg (mode, cop0);
4575 cop1 = force_reg (mode, cop1);
4576 }
4577 break;
4578 case E_V16QImode:
4579 if (code == GTU && TARGET_SSE2)
4580 gen = gen_uminv16qi3;
4581 else if (code == GT && TARGET_SSE4_1)
4582 gen = gen_sminv16qi3;
4583 break;
f3661f2d
UB
4584 case E_V8QImode:
4585 if (code == GTU && TARGET_SSE2)
4586 gen = gen_uminv8qi3;
4587 else if (code == GT && TARGET_SSE4_1)
4588 gen = gen_sminv8qi3;
4589 break;
2df9d3c5
UB
4590 case E_V4QImode:
4591 if (code == GTU && TARGET_SSE2)
4592 gen = gen_uminv4qi3;
4593 else if (code == GT && TARGET_SSE4_1)
4594 gen = gen_sminv4qi3;
4595 break;
04a74555
UB
4596 case E_V2QImode:
4597 if (code == GTU && TARGET_SSE2)
4598 gen = gen_uminv2qi3;
4599 else if (code == GT && TARGET_SSE4_1)
4600 gen = gen_sminv2qi3;
4601 break;
2bf6d935
ML
4602 case E_V8HImode:
4603 if (code == GTU && TARGET_SSE4_1)
4604 gen = gen_uminv8hi3;
4605 else if (code == GT && TARGET_SSE2)
4606 gen = gen_sminv8hi3;
4607 break;
f3661f2d
UB
4608 case E_V4HImode:
4609 if (code == GTU && TARGET_SSE4_1)
4610 gen = gen_uminv4hi3;
4611 else if (code == GT && TARGET_SSE2)
4612 gen = gen_sminv4hi3;
4613 break;
2df9d3c5
UB
4614 case E_V2HImode:
4615 if (code == GTU && TARGET_SSE4_1)
4616 gen = gen_uminv2hi3;
4617 else if (code == GT && TARGET_SSE2)
4618 gen = gen_sminv2hi3;
4619 break;
2bf6d935
ML
4620 case E_V4SImode:
4621 if (TARGET_SSE4_1)
4622 gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3;
4623 break;
f3661f2d
UB
4624 case E_V2SImode:
4625 if (TARGET_SSE4_1)
4626 gen = (code == GTU) ? gen_uminv2si3 : gen_sminv2si3;
4627 break;
2bf6d935
ML
4628 case E_V2DImode:
4629 if (TARGET_AVX512VL)
4630 {
4631 gen = (code == GTU) ? gen_uminv2di3 : gen_sminv2di3;
4632 cop0 = force_reg (mode, cop0);
4633 cop1 = force_reg (mode, cop1);
4634 }
4635 break;
4636 default:
4637 break;
4638 }
4639
4640 if (gen)
4641 {
4642 rtx tem = gen_reg_rtx (mode);
4643 if (!vector_operand (cop0, mode))
4644 cop0 = force_reg (mode, cop0);
4645 if (!vector_operand (cop1, mode))
4646 cop1 = force_reg (mode, cop1);
4647 *negate = !*negate;
4648 emit_insn (gen (tem, cop0, cop1));
4649 cop1 = tem;
4650 code = EQ;
4651 }
4652 }
4653
4654 /* Unsigned parallel compare is not supported by the hardware.
4655 Play some tricks to turn this into a signed comparison
4656 against 0. */
4657 if (code == GTU)
4658 {
4659 cop0 = force_reg (mode, cop0);
4660
4661 switch (mode)
4662 {
4663 case E_V16SImode:
4664 case E_V8DImode:
4665 case E_V8SImode:
4666 case E_V4DImode:
4667 case E_V4SImode:
f3661f2d 4668 case E_V2SImode:
2bf6d935
ML
4669 case E_V2DImode:
4670 {
4671 rtx t1, t2, mask;
83bc5e44 4672
2bf6d935
ML
4673 /* Subtract (-(INT MAX) - 1) from both operands to make
4674 them signed. */
4675 mask = ix86_build_signbit_mask (mode, true, false);
4676 t1 = gen_reg_rtx (mode);
83bc5e44 4677 emit_insn (gen_sub3_insn (t1, cop0, mask));
2bf6d935
ML
4678
4679 t2 = gen_reg_rtx (mode);
83bc5e44 4680 emit_insn (gen_sub3_insn (t2, cop1, mask));
2bf6d935
ML
4681
4682 cop0 = t1;
4683 cop1 = t2;
4684 code = GT;
4685 }
4686 break;
4687
4688 case E_V64QImode:
4689 case E_V32HImode:
4690 case E_V32QImode:
4691 case E_V16HImode:
4692 case E_V16QImode:
f3661f2d 4693 case E_V8QImode:
2df9d3c5 4694 case E_V4QImode:
04a74555 4695 case E_V2QImode:
2bf6d935 4696 case E_V8HImode:
f3661f2d 4697 case E_V4HImode:
2df9d3c5 4698 case E_V2HImode:
2bf6d935
ML
4699 /* Perform a parallel unsigned saturating subtraction. */
4700 x = gen_reg_rtx (mode);
83bc5e44
UB
4701 emit_insn (gen_rtx_SET
4702 (x, gen_rtx_US_MINUS (mode, cop0, cop1)));
2bf6d935
ML
4703 cop0 = x;
4704 cop1 = CONST0_RTX (mode);
4705 code = EQ;
4706 *negate = !*negate;
4707 break;
4708
4709 default:
4710 gcc_unreachable ();
4711 }
4712 }
4713 }
4714
4715 if (*negate)
4716 std::swap (op_true, op_false);
4717
4718 /* Allow the comparison to be done in one mode, but the movcc to
4719 happen in another mode. */
4720 if (data_mode == mode)
4721 {
4722 x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
4723 op_true, op_false);
4724 }
4725 else
4726 {
4727 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
4728 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
4729 op_true, op_false);
4730 if (GET_MODE (x) == mode)
4731 x = gen_lowpart (data_mode, x);
4732 }
4733
4734 return x;
4735}
4736
4737/* Expand integer vector comparison. */
4738
4739bool
4740ix86_expand_int_vec_cmp (rtx operands[])
4741{
4742 rtx_code code = GET_CODE (operands[1]);
4743 bool negate = false;
4744 rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
4745 operands[3], NULL, NULL, &negate);
4746
4747 if (!cmp)
4748 return false;
4749
4750 if (negate)
4751 cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
4752 CONST0_RTX (GET_MODE (cmp)),
4753 NULL, NULL, &negate);
4754
4755 gcc_assert (!negate);
4756
4757 if (operands[0] != cmp)
4758 emit_move_insn (operands[0], cmp);
4759
4760 return true;
4761}
4762
4763/* Expand a floating-point vector conditional move; a vcond operation
4764 rather than a movcc operation. */
4765
4766bool
4767ix86_expand_fp_vcond (rtx operands[])
4768{
4769 enum rtx_code code = GET_CODE (operands[3]);
4770 rtx cmp;
4771
4772 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
4773 &operands[4], &operands[5]);
4774 if (code == UNKNOWN)
4775 {
4776 rtx temp;
4777 switch (GET_CODE (operands[3]))
4778 {
4779 case LTGT:
4780 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
4781 operands[5], operands[0], operands[0]);
4782 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
4783 operands[5], operands[1], operands[2]);
4784 code = AND;
4785 break;
4786 case UNEQ:
4787 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
4788 operands[5], operands[0], operands[0]);
4789 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
4790 operands[5], operands[1], operands[2]);
4791 code = IOR;
4792 break;
4793 default:
4794 gcc_unreachable ();
4795 }
4796 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
4797 OPTAB_DIRECT);
4798 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
4799 return true;
4800 }
4801
4802 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
4803 operands[5], operands[1], operands[2]))
4804 return true;
4805
4806 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
4807 operands[1], operands[2]);
4808 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
4809 return true;
4810}
4811
4812/* Expand a signed/unsigned integral vector conditional move. */
4813
4814bool
4815ix86_expand_int_vcond (rtx operands[])
4816{
4817 machine_mode data_mode = GET_MODE (operands[0]);
4818 machine_mode mode = GET_MODE (operands[4]);
4819 enum rtx_code code = GET_CODE (operands[3]);
4820 bool negate = false;
4821 rtx x, cop0, cop1;
4822
4823 cop0 = operands[4];
4824 cop1 = operands[5];
4825
4826 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
4827 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
4828 if ((code == LT || code == GE)
4829 && data_mode == mode
4830 && cop1 == CONST0_RTX (mode)
4831 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
4832 && GET_MODE_UNIT_SIZE (data_mode) > 1
4833 && GET_MODE_UNIT_SIZE (data_mode) <= 8
4834 && (GET_MODE_SIZE (data_mode) == 16
4835 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
4836 {
4837 rtx negop = operands[2 - (code == LT)];
4838 int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
4839 if (negop == CONST1_RTX (data_mode))
4840 {
4841 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
4842 operands[0], 1, OPTAB_DIRECT);
4843 if (res != operands[0])
4844 emit_move_insn (operands[0], res);
4845 return true;
4846 }
4847 else if (GET_MODE_INNER (data_mode) != DImode
4848 && vector_all_ones_operand (negop, data_mode))
4849 {
4850 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
4851 operands[0], 0, OPTAB_DIRECT);
4852 if (res != operands[0])
4853 emit_move_insn (operands[0], res);
4854 return true;
4855 }
4856 }
4857
4858 if (!nonimmediate_operand (cop1, mode))
4859 cop1 = force_reg (mode, cop1);
4860 if (!general_operand (operands[1], data_mode))
4861 operands[1] = force_reg (data_mode, operands[1]);
4862 if (!general_operand (operands[2], data_mode))
4863 operands[2] = force_reg (data_mode, operands[2]);
4864
4865 x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
4866 operands[1], operands[2], &negate);
4867
4868 if (!x)
4869 return false;
4870
4871 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
4872 operands[2-negate]);
4873 return true;
4874}
4875
4876static bool
4877ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
4878 struct expand_vec_perm_d *d)
4879{
4880 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
4881 expander, so args are either in d, or in op0, op1 etc. */
4882 machine_mode mode = GET_MODE (d ? d->op0 : op0);
4883 machine_mode maskmode = mode;
4884 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
4885
4886 switch (mode)
4887 {
faf2b6bc 4888 case E_V16QImode:
4889 if (TARGET_AVX512VL && TARGET_AVX512VBMI)
4890 gen = gen_avx512vl_vpermt2varv16qi3;
4891 break;
4892 case E_V32QImode:
4893 if (TARGET_AVX512VL && TARGET_AVX512VBMI)
4894 gen = gen_avx512vl_vpermt2varv32qi3;
4895 break;
4896 case E_V64QImode:
4897 if (TARGET_AVX512VBMI)
4898 gen = gen_avx512bw_vpermt2varv64qi3;
4899 break;
2bf6d935
ML
4900 case E_V8HImode:
4901 if (TARGET_AVX512VL && TARGET_AVX512BW)
4902 gen = gen_avx512vl_vpermt2varv8hi3;
4903 break;
4904 case E_V16HImode:
4905 if (TARGET_AVX512VL && TARGET_AVX512BW)
4906 gen = gen_avx512vl_vpermt2varv16hi3;
4907 break;
2bf6d935
ML
4908 case E_V32HImode:
4909 if (TARGET_AVX512BW)
4910 gen = gen_avx512bw_vpermt2varv32hi3;
4911 break;
4912 case E_V4SImode:
4913 if (TARGET_AVX512VL)
4914 gen = gen_avx512vl_vpermt2varv4si3;
4915 break;
4916 case E_V8SImode:
4917 if (TARGET_AVX512VL)
4918 gen = gen_avx512vl_vpermt2varv8si3;
4919 break;
4920 case E_V16SImode:
4921 if (TARGET_AVX512F)
4922 gen = gen_avx512f_vpermt2varv16si3;
4923 break;
4924 case E_V4SFmode:
4925 if (TARGET_AVX512VL)
4926 {
4927 gen = gen_avx512vl_vpermt2varv4sf3;
4928 maskmode = V4SImode;
4929 }
4930 break;
4931 case E_V8SFmode:
4932 if (TARGET_AVX512VL)
4933 {
4934 gen = gen_avx512vl_vpermt2varv8sf3;
4935 maskmode = V8SImode;
4936 }
4937 break;
4938 case E_V16SFmode:
4939 if (TARGET_AVX512F)
4940 {
4941 gen = gen_avx512f_vpermt2varv16sf3;
4942 maskmode = V16SImode;
4943 }
4944 break;
4945 case E_V2DImode:
4946 if (TARGET_AVX512VL)
4947 gen = gen_avx512vl_vpermt2varv2di3;
4948 break;
4949 case E_V4DImode:
4950 if (TARGET_AVX512VL)
4951 gen = gen_avx512vl_vpermt2varv4di3;
4952 break;
4953 case E_V8DImode:
4954 if (TARGET_AVX512F)
4955 gen = gen_avx512f_vpermt2varv8di3;
4956 break;
4957 case E_V2DFmode:
4958 if (TARGET_AVX512VL)
4959 {
4960 gen = gen_avx512vl_vpermt2varv2df3;
4961 maskmode = V2DImode;
4962 }
4963 break;
4964 case E_V4DFmode:
4965 if (TARGET_AVX512VL)
4966 {
4967 gen = gen_avx512vl_vpermt2varv4df3;
4968 maskmode = V4DImode;
4969 }
4970 break;
4971 case E_V8DFmode:
4972 if (TARGET_AVX512F)
4973 {
4974 gen = gen_avx512f_vpermt2varv8df3;
4975 maskmode = V8DImode;
4976 }
4977 break;
4978 default:
4979 break;
4980 }
4981
4982 if (gen == NULL)
4983 return false;
4984
4985 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
4986 expander, so args are either in d, or in op0, op1 etc. */
4987 if (d)
4988 {
4989 rtx vec[64];
4990 target = d->target;
4991 op0 = d->op0;
4992 op1 = d->op1;
4993 for (int i = 0; i < d->nelt; ++i)
4994 vec[i] = GEN_INT (d->perm[i]);
4995 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
4996 }
4997
4998 emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
4999 return true;
5000}
5001
5002/* Expand a variable vector permutation. */
5003
5004void
5005ix86_expand_vec_perm (rtx operands[])
5006{
5007 rtx target = operands[0];
5008 rtx op0 = operands[1];
5009 rtx op1 = operands[2];
5010 rtx mask = operands[3];
5011 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
5012 machine_mode mode = GET_MODE (op0);
5013 machine_mode maskmode = GET_MODE (mask);
5014 int w, e, i;
5015 bool one_operand_shuffle = rtx_equal_p (op0, op1);
5016
5017 /* Number of elements in the vector. */
5018 w = GET_MODE_NUNITS (mode);
5019 e = GET_MODE_UNIT_SIZE (mode);
5020 gcc_assert (w <= 64);
5021
be072bfa
HW
5022 /* For HF mode vector, convert it to HI using subreg. */
5023 if (GET_MODE_INNER (mode) == HFmode)
5024 {
5025 machine_mode orig_mode = mode;
5026 mode = mode_for_vector (HImode, w).require ();
5027 target = lowpart_subreg (mode, target, orig_mode);
5028 op0 = lowpart_subreg (mode, op0, orig_mode);
5029 op1 = lowpart_subreg (mode, op1, orig_mode);
5030 }
5031
2bf6d935
ML
5032 if (TARGET_AVX512F && one_operand_shuffle)
5033 {
5034 rtx (*gen) (rtx, rtx, rtx) = NULL;
5035 switch (mode)
5036 {
5037 case E_V16SImode:
5038 gen =gen_avx512f_permvarv16si;
5039 break;
5040 case E_V16SFmode:
5041 gen = gen_avx512f_permvarv16sf;
5042 break;
5043 case E_V8DImode:
5044 gen = gen_avx512f_permvarv8di;
5045 break;
5046 case E_V8DFmode:
5047 gen = gen_avx512f_permvarv8df;
5048 break;
5049 default:
5050 break;
5051 }
5052 if (gen != NULL)
5053 {
5054 emit_insn (gen (target, op0, mask));
5055 return;
5056 }
5057 }
5058
5059 if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
5060 return;
5061
5062 if (TARGET_AVX2)
5063 {
5064 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
5065 {
5066 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
5067 an constant shuffle operand. With a tiny bit of effort we can
5068 use VPERMD instead. A re-interpretation stall for V4DFmode is
5069 unfortunate but there's no avoiding it.
5070 Similarly for V16HImode we don't have instructions for variable
5071 shuffling, while for V32QImode we can use after preparing suitable
5072 masks vpshufb; vpshufb; vpermq; vpor. */
5073
5074 if (mode == V16HImode)
5075 {
5076 maskmode = mode = V32QImode;
5077 w = 32;
5078 e = 1;
5079 }
5080 else
5081 {
5082 maskmode = mode = V8SImode;
5083 w = 8;
5084 e = 4;
5085 }
5086 t1 = gen_reg_rtx (maskmode);
5087
5088 /* Replicate the low bits of the V4DImode mask into V8SImode:
5089 mask = { A B C D }
5090 t1 = { A A B B C C D D }. */
5091 for (i = 0; i < w / 2; ++i)
5092 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
5093 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
5094 vt = force_reg (maskmode, vt);
5095 mask = gen_lowpart (maskmode, mask);
5096 if (maskmode == V8SImode)
5097 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
5098 else
5099 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
5100
5101 /* Multiply the shuffle indicies by two. */
5102 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
5103 OPTAB_DIRECT);
5104
5105 /* Add one to the odd shuffle indicies:
5106 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
5107 for (i = 0; i < w / 2; ++i)
5108 {
5109 vec[i * 2] = const0_rtx;
5110 vec[i * 2 + 1] = const1_rtx;
5111 }
5112 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
5113 vt = validize_mem (force_const_mem (maskmode, vt));
5114 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
5115 OPTAB_DIRECT);
5116
5117 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
5118 operands[3] = mask = t1;
5119 target = gen_reg_rtx (mode);
5120 op0 = gen_lowpart (mode, op0);
5121 op1 = gen_lowpart (mode, op1);
5122 }
5123
5124 switch (mode)
5125 {
5126 case E_V8SImode:
5127 /* The VPERMD and VPERMPS instructions already properly ignore
5128 the high bits of the shuffle elements. No need for us to
5129 perform an AND ourselves. */
5130 if (one_operand_shuffle)
5131 {
5132 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
5133 if (target != operands[0])
5134 emit_move_insn (operands[0],
5135 gen_lowpart (GET_MODE (operands[0]), target));
5136 }
5137 else
5138 {
5139 t1 = gen_reg_rtx (V8SImode);
5140 t2 = gen_reg_rtx (V8SImode);
5141 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
5142 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
5143 goto merge_two;
5144 }
5145 return;
5146
5147 case E_V8SFmode:
5148 mask = gen_lowpart (V8SImode, mask);
5149 if (one_operand_shuffle)
5150 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
5151 else
5152 {
5153 t1 = gen_reg_rtx (V8SFmode);
5154 t2 = gen_reg_rtx (V8SFmode);
5155 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
5156 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
5157 goto merge_two;
5158 }
5159 return;
5160
5161 case E_V4SImode:
5162 /* By combining the two 128-bit input vectors into one 256-bit
5163 input vector, we can use VPERMD and VPERMPS for the full
5164 two-operand shuffle. */
5165 t1 = gen_reg_rtx (V8SImode);
5166 t2 = gen_reg_rtx (V8SImode);
5167 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
5168 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
5169 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
5170 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
5171 return;
5172
5173 case E_V4SFmode:
5174 t1 = gen_reg_rtx (V8SFmode);
5175 t2 = gen_reg_rtx (V8SImode);
5176 mask = gen_lowpart (V4SImode, mask);
5177 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
5178 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
5179 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
5180 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
5181 return;
5182
5183 case E_V32QImode:
5184 t1 = gen_reg_rtx (V32QImode);
5185 t2 = gen_reg_rtx (V32QImode);
5186 t3 = gen_reg_rtx (V32QImode);
5187 vt2 = GEN_INT (-128);
5188 vt = gen_const_vec_duplicate (V32QImode, vt2);
5189 vt = force_reg (V32QImode, vt);
5190 for (i = 0; i < 32; i++)
5191 vec[i] = i < 16 ? vt2 : const0_rtx;
5192 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
5193 vt2 = force_reg (V32QImode, vt2);
5194 /* From mask create two adjusted masks, which contain the same
5195 bits as mask in the low 7 bits of each vector element.
5196 The first mask will have the most significant bit clear
5197 if it requests element from the same 128-bit lane
5198 and MSB set if it requests element from the other 128-bit lane.
5199 The second mask will have the opposite values of the MSB,
5200 and additionally will have its 128-bit lanes swapped.
5201 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
5202 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
5203 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
5204 stands for other 12 bytes. */
5205 /* The bit whether element is from the same lane or the other
5206 lane is bit 4, so shift it up by 3 to the MSB position. */
5207 t5 = gen_reg_rtx (V4DImode);
5208 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
5209 GEN_INT (3)));
5210 /* Clear MSB bits from the mask just in case it had them set. */
5211 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
5212 /* After this t1 will have MSB set for elements from other lane. */
5213 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
5214 /* Clear bits other than MSB. */
5215 emit_insn (gen_andv32qi3 (t1, t1, vt));
5216 /* Or in the lower bits from mask into t3. */
5217 emit_insn (gen_iorv32qi3 (t3, t1, t2));
5218 /* And invert MSB bits in t1, so MSB is set for elements from the same
5219 lane. */
5220 emit_insn (gen_xorv32qi3 (t1, t1, vt));
5221 /* Swap 128-bit lanes in t3. */
5222 t6 = gen_reg_rtx (V4DImode);
5223 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
5224 const2_rtx, GEN_INT (3),
5225 const0_rtx, const1_rtx));
5226 /* And or in the lower bits from mask into t1. */
5227 emit_insn (gen_iorv32qi3 (t1, t1, t2));
5228 if (one_operand_shuffle)
5229 {
5230 /* Each of these shuffles will put 0s in places where
5231 element from the other 128-bit lane is needed, otherwise
5232 will shuffle in the requested value. */
5233 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
5234 gen_lowpart (V32QImode, t6)));
5235 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
5236 /* For t3 the 128-bit lanes are swapped again. */
5237 t7 = gen_reg_rtx (V4DImode);
5238 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
5239 const2_rtx, GEN_INT (3),
5240 const0_rtx, const1_rtx));
5241 /* And oring both together leads to the result. */
5242 emit_insn (gen_iorv32qi3 (target, t1,
5243 gen_lowpart (V32QImode, t7)));
5244 if (target != operands[0])
5245 emit_move_insn (operands[0],
5246 gen_lowpart (GET_MODE (operands[0]), target));
5247 return;
5248 }
5249
5250 t4 = gen_reg_rtx (V32QImode);
5251 /* Similarly to the above one_operand_shuffle code,
5252 just for repeated twice for each operand. merge_two:
5253 code will merge the two results together. */
5254 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
5255 gen_lowpart (V32QImode, t6)));
5256 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
5257 gen_lowpart (V32QImode, t6)));
5258 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
5259 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
5260 t7 = gen_reg_rtx (V4DImode);
5261 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
5262 const2_rtx, GEN_INT (3),
5263 const0_rtx, const1_rtx));
5264 t8 = gen_reg_rtx (V4DImode);
5265 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
5266 const2_rtx, GEN_INT (3),
5267 const0_rtx, const1_rtx));
5268 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
5269 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
5270 t1 = t4;
5271 t2 = t3;
5272 goto merge_two;
5273
5274 default:
5275 gcc_assert (GET_MODE_SIZE (mode) <= 16);
5276 break;
5277 }
5278 }
5279
5280 if (TARGET_XOP)
5281 {
5282 /* The XOP VPPERM insn supports three inputs. By ignoring the
5283 one_operand_shuffle special case, we avoid creating another
5284 set of constant vectors in memory. */
5285 one_operand_shuffle = false;
5286
5287 /* mask = mask & {2*w-1, ...} */
5288 vt = GEN_INT (2*w - 1);
5289 }
5290 else
5291 {
5292 /* mask = mask & {w-1, ...} */
5293 vt = GEN_INT (w - 1);
5294 }
5295
5296 vt = gen_const_vec_duplicate (maskmode, vt);
5297 mask = expand_simple_binop (maskmode, AND, mask, vt,
5298 NULL_RTX, 0, OPTAB_DIRECT);
5299
5300 /* For non-QImode operations, convert the word permutation control
5301 into a byte permutation control. */
5302 if (mode != V16QImode)
5303 {
5304 mask = expand_simple_binop (maskmode, ASHIFT, mask,
5305 GEN_INT (exact_log2 (e)),
5306 NULL_RTX, 0, OPTAB_DIRECT);
5307
5308 /* Convert mask to vector of chars. */
5309 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
5310
5311 /* Replicate each of the input bytes into byte positions:
5312 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
5313 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
5314 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
5315 for (i = 0; i < 16; ++i)
5316 vec[i] = GEN_INT (i/e * e);
5317 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
5318 vt = validize_mem (force_const_mem (V16QImode, vt));
5319 if (TARGET_XOP)
5320 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
5321 else
5322 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
5323
5324 /* Convert it into the byte positions by doing
5325 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
5326 for (i = 0; i < 16; ++i)
5327 vec[i] = GEN_INT (i % e);
5328 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
5329 vt = validize_mem (force_const_mem (V16QImode, vt));
5330 emit_insn (gen_addv16qi3 (mask, mask, vt));
5331 }
5332
5333 /* The actual shuffle operations all operate on V16QImode. */
5334 op0 = gen_lowpart (V16QImode, op0);
5335 op1 = gen_lowpart (V16QImode, op1);
5336
5337 if (TARGET_XOP)
5338 {
5339 if (GET_MODE (target) != V16QImode)
5340 target = gen_reg_rtx (V16QImode);
5341 emit_insn (gen_xop_pperm (target, op0, op1, mask));
5342 if (target != operands[0])
5343 emit_move_insn (operands[0],
5344 gen_lowpart (GET_MODE (operands[0]), target));
5345 }
5346 else if (one_operand_shuffle)
5347 {
5348 if (GET_MODE (target) != V16QImode)
5349 target = gen_reg_rtx (V16QImode);
5350 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
5351 if (target != operands[0])
5352 emit_move_insn (operands[0],
5353 gen_lowpart (GET_MODE (operands[0]), target));
5354 }
5355 else
5356 {
5357 rtx xops[6];
5358 bool ok;
5359
5360 /* Shuffle the two input vectors independently. */
5361 t1 = gen_reg_rtx (V16QImode);
5362 t2 = gen_reg_rtx (V16QImode);
5363 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
5364 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
5365
5366 merge_two:
5367 /* Then merge them together. The key is whether any given control
5368 element contained a bit set that indicates the second word. */
5369 mask = operands[3];
5370 vt = GEN_INT (w);
5371 if (maskmode == V2DImode && !TARGET_SSE4_1)
5372 {
5373 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
5374 more shuffle to convert the V2DI input mask into a V4SI
5375 input mask. At which point the masking that expand_int_vcond
5376 will work as desired. */
5377 rtx t3 = gen_reg_rtx (V4SImode);
5378 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
5379 const0_rtx, const0_rtx,
5380 const2_rtx, const2_rtx));
5381 mask = t3;
5382 maskmode = V4SImode;
5383 e = w = 4;
5384 }
5385
5386 vt = gen_const_vec_duplicate (maskmode, vt);
5387 vt = force_reg (maskmode, vt);
5388 mask = expand_simple_binop (maskmode, AND, mask, vt,
5389 NULL_RTX, 0, OPTAB_DIRECT);
5390
5391 if (GET_MODE (target) != mode)
5392 target = gen_reg_rtx (mode);
5393 xops[0] = target;
5394 xops[1] = gen_lowpart (mode, t2);
5395 xops[2] = gen_lowpart (mode, t1);
5396 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
5397 xops[4] = mask;
5398 xops[5] = vt;
5399 ok = ix86_expand_int_vcond (xops);
5400 gcc_assert (ok);
5401 if (target != operands[0])
5402 emit_move_insn (operands[0],
5403 gen_lowpart (GET_MODE (operands[0]), target));
5404 }
5405}
5406
5407/* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is
5408 true if we should do zero extension, else sign extension. HIGH_P is
5409 true if we want the N/2 high elements, else the low elements. */
5410
5411void
5412ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
5413{
5414 machine_mode imode = GET_MODE (src);
5415 rtx tmp;
5416
5417 if (TARGET_SSE4_1)
5418 {
5419 rtx (*unpack)(rtx, rtx);
5420 rtx (*extract)(rtx, rtx) = NULL;
5421 machine_mode halfmode = BLKmode;
5422
5423 switch (imode)
5424 {
5425 case E_V64QImode:
5426 if (unsigned_p)
5427 unpack = gen_avx512bw_zero_extendv32qiv32hi2;
5428 else
5429 unpack = gen_avx512bw_sign_extendv32qiv32hi2;
5430 halfmode = V32QImode;
5431 extract
5432 = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
5433 break;
5434 case E_V32QImode:
5435 if (unsigned_p)
5436 unpack = gen_avx2_zero_extendv16qiv16hi2;
5437 else
5438 unpack = gen_avx2_sign_extendv16qiv16hi2;
5439 halfmode = V16QImode;
5440 extract
5441 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
5442 break;
5443 case E_V32HImode:
5444 if (unsigned_p)
5445 unpack = gen_avx512f_zero_extendv16hiv16si2;
5446 else
5447 unpack = gen_avx512f_sign_extendv16hiv16si2;
5448 halfmode = V16HImode;
5449 extract
5450 = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
5451 break;
5452 case E_V16HImode:
5453 if (unsigned_p)
5454 unpack = gen_avx2_zero_extendv8hiv8si2;
5455 else
5456 unpack = gen_avx2_sign_extendv8hiv8si2;
5457 halfmode = V8HImode;
5458 extract
5459 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
5460 break;
5461 case E_V16SImode:
5462 if (unsigned_p)
5463 unpack = gen_avx512f_zero_extendv8siv8di2;
5464 else
5465 unpack = gen_avx512f_sign_extendv8siv8di2;
5466 halfmode = V8SImode;
5467 extract
5468 = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
5469 break;
5470 case E_V8SImode:
5471 if (unsigned_p)
5472 unpack = gen_avx2_zero_extendv4siv4di2;
5473 else
5474 unpack = gen_avx2_sign_extendv4siv4di2;
5475 halfmode = V4SImode;
5476 extract
5477 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
5478 break;
5479 case E_V16QImode:
5480 if (unsigned_p)
5481 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
5482 else
5483 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
5484 break;
5485 case E_V8HImode:
5486 if (unsigned_p)
5487 unpack = gen_sse4_1_zero_extendv4hiv4si2;
5488 else
5489 unpack = gen_sse4_1_sign_extendv4hiv4si2;
5490 break;
5491 case E_V4SImode:
5492 if (unsigned_p)
5493 unpack = gen_sse4_1_zero_extendv2siv2di2;
5494 else
5495 unpack = gen_sse4_1_sign_extendv2siv2di2;
5496 break;
836328b2
UB
5497 case E_V8QImode:
5498 if (unsigned_p)
5499 unpack = gen_sse4_1_zero_extendv4qiv4hi2;
5500 else
5501 unpack = gen_sse4_1_sign_extendv4qiv4hi2;
5502 break;
5503 case E_V4HImode:
5504 if (unsigned_p)
5505 unpack = gen_sse4_1_zero_extendv2hiv2si2;
5506 else
5507 unpack = gen_sse4_1_sign_extendv2hiv2si2;
5508 break;
663a014e
UB
5509 case E_V4QImode:
5510 if (unsigned_p)
5511 unpack = gen_sse4_1_zero_extendv2qiv2hi2;
5512 else
5513 unpack = gen_sse4_1_sign_extendv2qiv2hi2;
5514 break;
2bf6d935
ML
5515 default:
5516 gcc_unreachable ();
5517 }
5518
5519 if (GET_MODE_SIZE (imode) >= 32)
5520 {
5521 tmp = gen_reg_rtx (halfmode);
5522 emit_insn (extract (tmp, src));
5523 }
5524 else if (high_p)
5525 {
836328b2
UB
5526 switch (GET_MODE_SIZE (imode))
5527 {
5528 case 16:
5529 /* Shift higher 8 bytes to lower 8 bytes. */
5530 tmp = gen_reg_rtx (V1TImode);
5531 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
5532 GEN_INT (64)));
5533 break;
5534 case 8:
5535 /* Shift higher 4 bytes to lower 4 bytes. */
5536 tmp = gen_reg_rtx (V1DImode);
5537 emit_insn (gen_mmx_lshrv1di3 (tmp, gen_lowpart (V1DImode, src),
5538 GEN_INT (32)));
5539 break;
663a014e
UB
5540 case 4:
5541 /* Shift higher 2 bytes to lower 2 bytes. */
5542 tmp = gen_reg_rtx (V1SImode);
5543 emit_insn (gen_mmx_lshrv1si3 (tmp, gen_lowpart (V1SImode, src),
5544 GEN_INT (16)));
5545 break;
836328b2
UB
5546 default:
5547 gcc_unreachable ();
5548 }
5549
2bf6d935
ML
5550 tmp = gen_lowpart (imode, tmp);
5551 }
5552 else
5553 tmp = src;
5554
5555 emit_insn (unpack (dest, tmp));
5556 }
5557 else
5558 {
5559 rtx (*unpack)(rtx, rtx, rtx);
5560
5561 switch (imode)
5562 {
5563 case E_V16QImode:
5564 if (high_p)
5565 unpack = gen_vec_interleave_highv16qi;
5566 else
5567 unpack = gen_vec_interleave_lowv16qi;
5568 break;
5569 case E_V8HImode:
5570 if (high_p)
5571 unpack = gen_vec_interleave_highv8hi;
5572 else
5573 unpack = gen_vec_interleave_lowv8hi;
5574 break;
5575 case E_V4SImode:
5576 if (high_p)
5577 unpack = gen_vec_interleave_highv4si;
5578 else
5579 unpack = gen_vec_interleave_lowv4si;
5580 break;
836328b2
UB
5581 case E_V8QImode:
5582 if (high_p)
5583 unpack = gen_mmx_punpckhbw;
5584 else
5585 unpack = gen_mmx_punpcklbw;
5586 break;
5587 case E_V4HImode:
5588 if (high_p)
5589 unpack = gen_mmx_punpckhwd;
5590 else
5591 unpack = gen_mmx_punpcklwd;
5592 break;
663a014e
UB
5593 case E_V4QImode:
5594 if (high_p)
5595 unpack = gen_mmx_punpckhbw_low;
5596 else
5597 unpack = gen_mmx_punpcklbw_low;
5598 break;
2bf6d935
ML
5599 default:
5600 gcc_unreachable ();
5601 }
5602
5603 if (unsigned_p)
5604 tmp = force_reg (imode, CONST0_RTX (imode));
5605 else
5606 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
5607 src, pc_rtx, pc_rtx);
5608
5609 rtx tmp2 = gen_reg_rtx (imode);
5610 emit_insn (unpack (tmp2, src, tmp));
5611 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
5612 }
5613}
5614
faf2b6bc 5615/* Return true if mem is pool constant which contains a const_vector
5616 perm index, assign the index to PERM. */
5617bool
5618ix86_extract_perm_from_pool_constant (int* perm, rtx mem)
5619{
5620 machine_mode mode = GET_MODE (mem);
5621 int nelt = GET_MODE_NUNITS (mode);
5622
5623 if (!INTEGRAL_MODE_P (mode))
5624 return false;
5625
5626 /* Needs to be constant pool. */
5627 if (!(MEM_P (mem))
5628 || !SYMBOL_REF_P (XEXP (mem, 0))
5629 || !CONSTANT_POOL_ADDRESS_P (XEXP (mem, 0)))
5630 return false;
5631
5632 rtx constant = get_pool_constant (XEXP (mem, 0));
5633
5634 if (GET_CODE (constant) != CONST_VECTOR)
5635 return false;
5636
5637 /* There could be some rtx like
5638 (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
5639 but with "*.LC1" refer to V2DI constant vector. */
5640 if (GET_MODE (constant) != mode)
5641 {
5642 constant = simplify_subreg (mode, constant, GET_MODE (constant), 0);
5643
5644 if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR)
5645 return false;
5646 }
5647
5648 for (int i = 0; i != nelt; i++)
5649 perm[i] = UINTVAL (XVECEXP (constant, 0, i));
5650
5651 return true;
5652}
5653
2bf6d935
ML
5654/* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
5655 but works for floating pointer parameters and nonoffsetable memories.
5656 For pushes, it returns just stack offsets; the values will be saved
5657 in the right order. Maximally three parts are generated. */
5658
5659static int
5660ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
5661{
5662 int size;
5663
5664 if (!TARGET_64BIT)
5665 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
5666 else
5667 size = (GET_MODE_SIZE (mode) + 4) / 8;
5668
5669 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
5670 gcc_assert (size >= 2 && size <= 4);
5671
5672 /* Optimize constant pool reference to immediates. This is used by fp
5673 moves, that force all constants to memory to allow combining. */
5674 if (MEM_P (operand) && MEM_READONLY_P (operand))
5675 operand = avoid_constant_pool_reference (operand);
5676
5677 if (MEM_P (operand) && !offsettable_memref_p (operand))
5678 {
5679 /* The only non-offsetable memories we handle are pushes. */
5680 int ok = push_operand (operand, VOIDmode);
5681
5682 gcc_assert (ok);
5683
5684 operand = copy_rtx (operand);
5685 PUT_MODE (operand, word_mode);
5686 parts[0] = parts[1] = parts[2] = parts[3] = operand;
5687 return size;
5688 }
5689
5690 if (GET_CODE (operand) == CONST_VECTOR)
5691 {
5692 scalar_int_mode imode = int_mode_for_mode (mode).require ();
5693 /* Caution: if we looked through a constant pool memory above,
5694 the operand may actually have a different mode now. That's
5695 ok, since we want to pun this all the way back to an integer. */
5696 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
5697 gcc_assert (operand != NULL);
5698 mode = imode;
5699 }
5700
5701 if (!TARGET_64BIT)
5702 {
5703 if (mode == DImode)
5704 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
5705 else
5706 {
5707 int i;
5708
5709 if (REG_P (operand))
5710 {
5711 gcc_assert (reload_completed);
5712 for (i = 0; i < size; i++)
5713 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
5714 }
5715 else if (offsettable_memref_p (operand))
5716 {
5717 operand = adjust_address (operand, SImode, 0);
5718 parts[0] = operand;
5719 for (i = 1; i < size; i++)
5720 parts[i] = adjust_address (operand, SImode, 4 * i);
5721 }
5722 else if (CONST_DOUBLE_P (operand))
5723 {
5724 const REAL_VALUE_TYPE *r;
5725 long l[4];
5726
5727 r = CONST_DOUBLE_REAL_VALUE (operand);
5728 switch (mode)
5729 {
5730 case E_TFmode:
5731 real_to_target (l, r, mode);
5732 parts[3] = gen_int_mode (l[3], SImode);
5733 parts[2] = gen_int_mode (l[2], SImode);
5734 break;
5735 case E_XFmode:
5736 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
5737 long double may not be 80-bit. */
5738 real_to_target (l, r, mode);
5739 parts[2] = gen_int_mode (l[2], SImode);
5740 break;
5741 case E_DFmode:
5742 REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
5743 break;
5744 default:
5745 gcc_unreachable ();
5746 }
5747 parts[1] = gen_int_mode (l[1], SImode);
5748 parts[0] = gen_int_mode (l[0], SImode);
5749 }
5750 else
5751 gcc_unreachable ();
5752 }
5753 }
5754 else
5755 {
5756 if (mode == TImode)
5757 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
5758 if (mode == XFmode || mode == TFmode)
5759 {
5760 machine_mode upper_mode = mode==XFmode ? SImode : DImode;
5761 if (REG_P (operand))
5762 {
5763 gcc_assert (reload_completed);
5764 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
5765 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
5766 }
5767 else if (offsettable_memref_p (operand))
5768 {
5769 operand = adjust_address (operand, DImode, 0);
5770 parts[0] = operand;
5771 parts[1] = adjust_address (operand, upper_mode, 8);
5772 }
5773 else if (CONST_DOUBLE_P (operand))
5774 {
5775 long l[4];
5776
5777 real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
5778
5779 /* real_to_target puts 32-bit pieces in each long. */
5780 parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
5781 | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
5782 << 32), DImode);
5783
5784 if (upper_mode == SImode)
5785 parts[1] = gen_int_mode (l[2], SImode);
5786 else
5787 parts[1]
5788 = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
5789 | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
5790 << 32), DImode);
5791 }
5792 else
5793 gcc_unreachable ();
5794 }
5795 }
5796
5797 return size;
5798}
5799
5800/* Emit insns to perform a move or push of DI, DF, XF, and TF values.
5801 Return false when normal moves are needed; true when all required
5802 insns have been emitted. Operands 2-4 contain the input values
5803 int the correct order; operands 5-7 contain the output values. */
5804
5805void
5806ix86_split_long_move (rtx operands[])
5807{
5808 rtx part[2][4];
5809 int nparts, i, j;
5810 int push = 0;
5811 int collisions = 0;
5812 machine_mode mode = GET_MODE (operands[0]);
5813 bool collisionparts[4];
5814
5815 /* The DFmode expanders may ask us to move double.
5816 For 64bit target this is single move. By hiding the fact
5817 here we simplify i386.md splitters. */
5818 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
5819 {
5820 /* Optimize constant pool reference to immediates. This is used by
5821 fp moves, that force all constants to memory to allow combining. */
5822
5823 if (MEM_P (operands[1])
5824 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
5825 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
5826 operands[1] = get_pool_constant (XEXP (operands[1], 0));
5827 if (push_operand (operands[0], VOIDmode))
5828 {
5829 operands[0] = copy_rtx (operands[0]);
5830 PUT_MODE (operands[0], word_mode);
5831 }
5832 else
5833 operands[0] = gen_lowpart (DImode, operands[0]);
5834 operands[1] = gen_lowpart (DImode, operands[1]);
5835 emit_move_insn (operands[0], operands[1]);
5836 return;
5837 }
5838
5839 /* The only non-offsettable memory we handle is push. */
5840 if (push_operand (operands[0], VOIDmode))
5841 push = 1;
5842 else
5843 gcc_assert (!MEM_P (operands[0])
5844 || offsettable_memref_p (operands[0]));
5845
5846 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
5847 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
5848
5849 /* When emitting push, take care for source operands on the stack. */
5850 if (push && MEM_P (operands[1])
5851 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
5852 {
5853 rtx src_base = XEXP (part[1][nparts - 1], 0);
5854
5855 /* Compensate for the stack decrement by 4. */
5856 if (!TARGET_64BIT && nparts == 3
5857 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
5858 src_base = plus_constant (Pmode, src_base, 4);
5859
5860 /* src_base refers to the stack pointer and is
5861 automatically decreased by emitted push. */
5862 for (i = 0; i < nparts; i++)
5863 part[1][i] = change_address (part[1][i],
5864 GET_MODE (part[1][i]), src_base);
5865 }
5866
5867 /* We need to do copy in the right order in case an address register
5868 of the source overlaps the destination. */
5869 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
5870 {
5871 rtx tmp;
5872
5873 for (i = 0; i < nparts; i++)
5874 {
5875 collisionparts[i]
5876 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
5877 if (collisionparts[i])
5878 collisions++;
5879 }
5880
5881 /* Collision in the middle part can be handled by reordering. */
5882 if (collisions == 1 && nparts == 3 && collisionparts [1])
5883 {
5884 std::swap (part[0][1], part[0][2]);
5885 std::swap (part[1][1], part[1][2]);
5886 }
5887 else if (collisions == 1
5888 && nparts == 4
5889 && (collisionparts [1] || collisionparts [2]))
5890 {
5891 if (collisionparts [1])
5892 {
5893 std::swap (part[0][1], part[0][2]);
5894 std::swap (part[1][1], part[1][2]);
5895 }
5896 else
5897 {
5898 std::swap (part[0][2], part[0][3]);
5899 std::swap (part[1][2], part[1][3]);
5900 }
5901 }
5902
5903 /* If there are more collisions, we can't handle it by reordering.
5904 Do an lea to the last part and use only one colliding move. */
5905 else if (collisions > 1)
5906 {
5907 rtx base, addr;
5908
5909 collisions = 1;
5910
5911 base = part[0][nparts - 1];
5912
5913 /* Handle the case when the last part isn't valid for lea.
5914 Happens in 64-bit mode storing the 12-byte XFmode. */
5915 if (GET_MODE (base) != Pmode)
5916 base = gen_rtx_REG (Pmode, REGNO (base));
5917
5918 addr = XEXP (part[1][0], 0);
5919 if (TARGET_TLS_DIRECT_SEG_REFS)
5920 {
5921 struct ix86_address parts;
5922 int ok = ix86_decompose_address (addr, &parts);
5923 gcc_assert (ok);
5924 /* It is not valid to use %gs: or %fs: in lea. */
5925 gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
5926 }
5927 emit_insn (gen_rtx_SET (base, addr));
5928 part[1][0] = replace_equiv_address (part[1][0], base);
5929 for (i = 1; i < nparts; i++)
5930 {
5931 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
5932 part[1][i] = replace_equiv_address (part[1][i], tmp);
5933 }
5934 }
5935 }
5936
5937 if (push)
5938 {
5939 if (!TARGET_64BIT)
5940 {
5941 if (nparts == 3)
5942 {
5943 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
d9330fb5 5944 emit_insn (gen_add2_insn (stack_pointer_rtx, GEN_INT (-4)));
2bf6d935
ML
5945 emit_move_insn (part[0][2], part[1][2]);
5946 }
5947 else if (nparts == 4)
5948 {
5949 emit_move_insn (part[0][3], part[1][3]);
5950 emit_move_insn (part[0][2], part[1][2]);
5951 }
5952 }
5953 else
5954 {
5955 /* In 64bit mode we don't have 32bit push available. In case this is
5956 register, it is OK - we will just use larger counterpart. We also
5957 retype memory - these comes from attempt to avoid REX prefix on
5958 moving of second half of TFmode value. */
5959 if (GET_MODE (part[1][1]) == SImode)
5960 {
5961 switch (GET_CODE (part[1][1]))
5962 {
5963 case MEM:
5964 part[1][1] = adjust_address (part[1][1], DImode, 0);
5965 break;
5966
5967 case REG:
5968 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
5969 break;
5970
5971 default:
5972 gcc_unreachable ();
5973 }
5974
5975 if (GET_MODE (part[1][0]) == SImode)
5976 part[1][0] = part[1][1];
5977 }
5978 }
5979 emit_move_insn (part[0][1], part[1][1]);
5980 emit_move_insn (part[0][0], part[1][0]);
5981 return;
5982 }
5983
5984 /* Choose correct order to not overwrite the source before it is copied. */
5985 if ((REG_P (part[0][0])
5986 && REG_P (part[1][1])
5987 && (REGNO (part[0][0]) == REGNO (part[1][1])
5988 || (nparts == 3
5989 && REGNO (part[0][0]) == REGNO (part[1][2]))
5990 || (nparts == 4
5991 && REGNO (part[0][0]) == REGNO (part[1][3]))))
5992 || (collisions > 0
5993 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
5994 {
5995 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
5996 {
5997 operands[2 + i] = part[0][j];
5998 operands[6 + i] = part[1][j];
5999 }
6000 }
6001 else
6002 {
6003 for (i = 0; i < nparts; i++)
6004 {
6005 operands[2 + i] = part[0][i];
6006 operands[6 + i] = part[1][i];
6007 }
6008 }
6009
6010 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
6011 if (optimize_insn_for_size_p ())
6012 {
6013 for (j = 0; j < nparts - 1; j++)
6014 if (CONST_INT_P (operands[6 + j])
6015 && operands[6 + j] != const0_rtx
6016 && REG_P (operands[2 + j]))
6017 for (i = j; i < nparts - 1; i++)
6018 if (CONST_INT_P (operands[7 + i])
6019 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
6020 operands[7 + i] = operands[2 + j];
6021 }
6022
6023 for (i = 0; i < nparts; i++)
6024 emit_move_insn (operands[2 + i], operands[6 + i]);
6025
6026 return;
6027}
6028
6029/* Helper function of ix86_split_ashl used to generate an SImode/DImode
6030 left shift by a constant, either using a single shift or
6031 a sequence of add instructions. */
6032
6033static void
6034ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
6035{
2bf6d935
ML
6036 if (count == 1
6037 || (count * ix86_cost->add <= ix86_cost->shift_const
6038 && !optimize_insn_for_size_p ()))
6039 {
2bf6d935 6040 while (count-- > 0)
83bc5e44 6041 emit_insn (gen_add2_insn (operand, operand));
2bf6d935
ML
6042 }
6043 else
6044 {
83bc5e44
UB
6045 rtx (*insn)(rtx, rtx, rtx);
6046
2bf6d935
ML
6047 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
6048 emit_insn (insn (operand, operand, GEN_INT (count)));
6049 }
6050}
6051
6052void
6053ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
6054{
6055 rtx (*gen_ashl3)(rtx, rtx, rtx);
6056 rtx (*gen_shld)(rtx, rtx, rtx);
6057 int half_width = GET_MODE_BITSIZE (mode) >> 1;
987a3082 6058 machine_mode half_mode;
2bf6d935
ML
6059
6060 rtx low[2], high[2];
6061 int count;
6062
6063 if (CONST_INT_P (operands[2]))
6064 {
6065 split_double_mode (mode, operands, 2, low, high);
6066 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
6067
6068 if (count >= half_width)
6069 {
6070 emit_move_insn (high[0], low[1]);
6071 emit_move_insn (low[0], const0_rtx);
6072
6073 if (count > half_width)
6074 ix86_expand_ashl_const (high[0], count - half_width, mode);
6075 }
6076 else
6077 {
6078 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
6079
6080 if (!rtx_equal_p (operands[0], operands[1]))
6081 emit_move_insn (operands[0], operands[1]);
6082
6083 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
6084 ix86_expand_ashl_const (low[0], count, mode);
6085 }
6086 return;
6087 }
6088
6089 split_double_mode (mode, operands, 1, low, high);
987a3082 6090 half_mode = mode == DImode ? SImode : DImode;
2bf6d935
ML
6091
6092 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
6093
6094 if (operands[1] == const1_rtx)
6095 {
6096 /* Assuming we've chosen a QImode capable registers, then 1 << N
6097 can be done with two 32/64-bit shifts, no branches, no cmoves. */
6098 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
6099 {
6100 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
6101
6102 ix86_expand_clear (low[0]);
6103 ix86_expand_clear (high[0]);
6104 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
6105
6106 d = gen_lowpart (QImode, low[0]);
6107 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
6108 s = gen_rtx_EQ (QImode, flags, const0_rtx);
6109 emit_insn (gen_rtx_SET (d, s));
6110
6111 d = gen_lowpart (QImode, high[0]);
6112 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
6113 s = gen_rtx_NE (QImode, flags, const0_rtx);
6114 emit_insn (gen_rtx_SET (d, s));
6115 }
6116
6117 /* Otherwise, we can get the same results by manually performing
6118 a bit extract operation on bit 5/6, and then performing the two
6119 shifts. The two methods of getting 0/1 into low/high are exactly
6120 the same size. Avoiding the shift in the bit extract case helps
6121 pentium4 a bit; no one else seems to care much either way. */
6122 else
6123 {
2bf6d935
ML
6124 rtx (*gen_lshr3)(rtx, rtx, rtx);
6125 rtx (*gen_and3)(rtx, rtx, rtx);
6126 rtx (*gen_xor3)(rtx, rtx, rtx);
6127 HOST_WIDE_INT bits;
6128 rtx x;
6129
6130 if (mode == DImode)
6131 {
2bf6d935
ML
6132 gen_lshr3 = gen_lshrsi3;
6133 gen_and3 = gen_andsi3;
6134 gen_xor3 = gen_xorsi3;
6135 bits = 5;
6136 }
6137 else
6138 {
2bf6d935
ML
6139 gen_lshr3 = gen_lshrdi3;
6140 gen_and3 = gen_anddi3;
6141 gen_xor3 = gen_xordi3;
6142 bits = 6;
6143 }
6144
6145 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
6146 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
6147 else
6148 x = gen_lowpart (half_mode, operands[2]);
6149 emit_insn (gen_rtx_SET (high[0], x));
6150
6151 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
6152 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
6153 emit_move_insn (low[0], high[0]);
6154 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
6155 }
6156
6157 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
6158 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
6159 return;
6160 }
6161
6162 if (operands[1] == constm1_rtx)
6163 {
6164 /* For -1 << N, we can avoid the shld instruction, because we
6165 know that we're shifting 0...31/63 ones into a -1. */
6166 emit_move_insn (low[0], constm1_rtx);
6167 if (optimize_insn_for_size_p ())
6168 emit_move_insn (high[0], low[0]);
6169 else
6170 emit_move_insn (high[0], constm1_rtx);
6171 }
6172 else
6173 {
6174 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
6175
6176 if (!rtx_equal_p (operands[0], operands[1]))
6177 emit_move_insn (operands[0], operands[1]);
6178
6179 split_double_mode (mode, operands, 1, low, high);
6180 emit_insn (gen_shld (high[0], low[0], operands[2]));
6181 }
6182
6183 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
6184
6185 if (TARGET_CMOVE && scratch)
6186 {
2bf6d935 6187 ix86_expand_clear (scratch);
987a3082
UB
6188 emit_insn (gen_x86_shift_adj_1
6189 (half_mode, high[0], low[0], operands[2], scratch));
2bf6d935
ML
6190 }
6191 else
987a3082 6192 emit_insn (gen_x86_shift_adj_2 (half_mode, high[0], low[0], operands[2]));
2bf6d935
ML
6193}
6194
6195void
6196ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
6197{
6198 rtx (*gen_ashr3)(rtx, rtx, rtx)
6199 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
6200 rtx (*gen_shrd)(rtx, rtx, rtx);
6201 int half_width = GET_MODE_BITSIZE (mode) >> 1;
6202
6203 rtx low[2], high[2];
6204 int count;
6205
6206 if (CONST_INT_P (operands[2]))
6207 {
6208 split_double_mode (mode, operands, 2, low, high);
6209 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
6210
6211 if (count == GET_MODE_BITSIZE (mode) - 1)
6212 {
6213 emit_move_insn (high[0], high[1]);
6214 emit_insn (gen_ashr3 (high[0], high[0],
6215 GEN_INT (half_width - 1)));
6216 emit_move_insn (low[0], high[0]);
6217
6218 }
6219 else if (count >= half_width)
6220 {
6221 emit_move_insn (low[0], high[1]);
6222 emit_move_insn (high[0], low[0]);
6223 emit_insn (gen_ashr3 (high[0], high[0],
6224 GEN_INT (half_width - 1)));
6225
6226 if (count > half_width)
6227 emit_insn (gen_ashr3 (low[0], low[0],
6228 GEN_INT (count - half_width)));
6229 }
6230 else
6231 {
6232 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6233
6234 if (!rtx_equal_p (operands[0], operands[1]))
6235 emit_move_insn (operands[0], operands[1]);
6236
6237 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
6238 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
6239 }
6240 }
6241 else
6242 {
987a3082
UB
6243 machine_mode half_mode;
6244
2bf6d935
ML
6245 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6246
6247 if (!rtx_equal_p (operands[0], operands[1]))
6248 emit_move_insn (operands[0], operands[1]);
6249
6250 split_double_mode (mode, operands, 1, low, high);
987a3082 6251 half_mode = mode == DImode ? SImode : DImode;
2bf6d935
ML
6252
6253 emit_insn (gen_shrd (low[0], high[0], operands[2]));
6254 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
6255
6256 if (TARGET_CMOVE && scratch)
6257 {
2bf6d935
ML
6258 emit_move_insn (scratch, high[0]);
6259 emit_insn (gen_ashr3 (scratch, scratch,
6260 GEN_INT (half_width - 1)));
987a3082
UB
6261 emit_insn (gen_x86_shift_adj_1
6262 (half_mode, low[0], high[0], operands[2], scratch));
2bf6d935
ML
6263 }
6264 else
987a3082
UB
6265 emit_insn (gen_x86_shift_adj_3
6266 (half_mode, low[0], high[0], operands[2]));
2bf6d935
ML
6267 }
6268}
6269
6270void
6271ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
6272{
6273 rtx (*gen_lshr3)(rtx, rtx, rtx)
6274 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
6275 rtx (*gen_shrd)(rtx, rtx, rtx);
6276 int half_width = GET_MODE_BITSIZE (mode) >> 1;
6277
6278 rtx low[2], high[2];
6279 int count;
6280
6281 if (CONST_INT_P (operands[2]))
6282 {
6283 split_double_mode (mode, operands, 2, low, high);
6284 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
6285
6286 if (count >= half_width)
6287 {
6288 emit_move_insn (low[0], high[1]);
6289 ix86_expand_clear (high[0]);
6290
6291 if (count > half_width)
6292 emit_insn (gen_lshr3 (low[0], low[0],
6293 GEN_INT (count - half_width)));
6294 }
6295 else
6296 {
6297 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6298
6299 if (!rtx_equal_p (operands[0], operands[1]))
6300 emit_move_insn (operands[0], operands[1]);
6301
6302 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
6303 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
6304 }
6305 }
6306 else
6307 {
987a3082
UB
6308 machine_mode half_mode;
6309
2bf6d935
ML
6310 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6311
6312 if (!rtx_equal_p (operands[0], operands[1]))
6313 emit_move_insn (operands[0], operands[1]);
6314
6315 split_double_mode (mode, operands, 1, low, high);
987a3082 6316 half_mode = mode == DImode ? SImode : DImode;
2bf6d935
ML
6317
6318 emit_insn (gen_shrd (low[0], high[0], operands[2]));
6319 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
6320
6321 if (TARGET_CMOVE && scratch)
6322 {
2bf6d935 6323 ix86_expand_clear (scratch);
987a3082
UB
6324 emit_insn (gen_x86_shift_adj_1
6325 (half_mode, low[0], high[0], operands[2], scratch));
2bf6d935
ML
6326 }
6327 else
987a3082
UB
6328 emit_insn (gen_x86_shift_adj_2
6329 (half_mode, low[0], high[0], operands[2]));
2bf6d935
ML
6330 }
6331}
6332
1188cf5f
RS
6333/* Expand move of V1TI mode register X to a new TI mode register. */
6334static rtx
6335ix86_expand_v1ti_to_ti (rtx x)
6336{
6337 rtx result = gen_reg_rtx (TImode);
a5d269f0
RS
6338 if (TARGET_SSE2)
6339 {
51e9e8a2 6340 rtx temp = force_reg (V2DImode, gen_lowpart (V2DImode, x));
a5d269f0
RS
6341 rtx lo = gen_lowpart (DImode, result);
6342 emit_insn (gen_vec_extractv2didi (lo, temp, const0_rtx));
6343 rtx hi = gen_highpart (DImode, result);
6344 emit_insn (gen_vec_extractv2didi (hi, temp, const1_rtx));
6345 }
6346 else
6347 emit_move_insn (result, gen_lowpart (TImode, x));
1188cf5f
RS
6348 return result;
6349}
6350
6351/* Expand move of TI mode register X to a new V1TI mode register. */
6352static rtx
6353ix86_expand_ti_to_v1ti (rtx x)
6354{
1188cf5f
RS
6355 if (TARGET_SSE2)
6356 {
6357 rtx lo = gen_lowpart (DImode, x);
6358 rtx hi = gen_highpart (DImode, x);
6359 rtx tmp = gen_reg_rtx (V2DImode);
6360 emit_insn (gen_vec_concatv2di (tmp, lo, hi));
51e9e8a2 6361 return force_reg (V1TImode, gen_lowpart (V1TImode, tmp));
1188cf5f 6362 }
51e9e8a2
RS
6363
6364 return force_reg (V1TImode, gen_lowpart (V1TImode, x));
1188cf5f
RS
6365}
6366
6b8b2557 6367/* Expand V1TI mode shift (of rtx_code CODE) by constant. */
1188cf5f
RS
6368void
6369ix86_expand_v1ti_shift (enum rtx_code code, rtx operands[])
6b8b2557 6370{
6b8b2557
RS
6371 rtx op1 = force_reg (V1TImode, operands[1]);
6372
1188cf5f
RS
6373 if (!CONST_INT_P (operands[2]))
6374 {
6375 rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
6376 rtx tmp2 = gen_reg_rtx (TImode);
6377 rtx (*shift) (rtx, rtx, rtx)
6378 = (code == ASHIFT) ? gen_ashlti3 : gen_lshrti3;
6379 emit_insn (shift (tmp2, tmp1, operands[2]));
6380 rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
6381 emit_move_insn (operands[0], tmp3);
6382 return;
6383 }
6384
6385 HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
6386
6b8b2557
RS
6387 if (bits == 0)
6388 {
6389 emit_move_insn (operands[0], op1);
6390 return;
6391 }
6392
6393 if ((bits & 7) == 0)
6394 {
6395 rtx tmp = gen_reg_rtx (V1TImode);
6396 if (code == ASHIFT)
1188cf5f 6397 emit_insn (gen_sse2_ashlv1ti3 (tmp, op1, GEN_INT (bits)));
6b8b2557
RS
6398 else
6399 emit_insn (gen_sse2_lshrv1ti3 (tmp, op1, GEN_INT (bits)));
6400 emit_move_insn (operands[0], tmp);
6401 return;
6402 }
6403
6404 rtx tmp1 = gen_reg_rtx (V1TImode);
6405 if (code == ASHIFT)
6406 emit_insn (gen_sse2_ashlv1ti3 (tmp1, op1, GEN_INT (64)));
6407 else
6408 emit_insn (gen_sse2_lshrv1ti3 (tmp1, op1, GEN_INT (64)));
6409
6410 /* tmp2 is operands[1] shifted by 64, in V2DImode. */
51e9e8a2 6411 rtx tmp2 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
6b8b2557
RS
6412
6413 /* tmp3 will be the V2DImode result. */
6414 rtx tmp3 = gen_reg_rtx (V2DImode);
6415
6416 if (bits > 64)
6417 {
6418 if (code == ASHIFT)
6419 emit_insn (gen_ashlv2di3 (tmp3, tmp2, GEN_INT (bits - 64)));
6420 else
6421 emit_insn (gen_lshrv2di3 (tmp3, tmp2, GEN_INT (bits - 64)));
6422 }
6423 else
6424 {
6425 /* tmp4 is operands[1], in V2DImode. */
51e9e8a2 6426 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
6b8b2557
RS
6427
6428 rtx tmp5 = gen_reg_rtx (V2DImode);
6429 if (code == ASHIFT)
6430 emit_insn (gen_ashlv2di3 (tmp5, tmp4, GEN_INT (bits)));
6431 else
6432 emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
6433
6434 rtx tmp6 = gen_reg_rtx (V2DImode);
6435 if (code == ASHIFT)
6436 emit_insn (gen_lshrv2di3 (tmp6, tmp2, GEN_INT (64 - bits)));
6437 else
6438 emit_insn (gen_ashlv2di3 (tmp6, tmp2, GEN_INT (64 - bits)));
6439
6440 emit_insn (gen_iorv2di3 (tmp3, tmp5, tmp6));
6441 }
6442
6443 /* Convert the result back to V1TImode and store in operands[0]. */
51e9e8a2 6444 rtx tmp7 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
6b8b2557
RS
6445 emit_move_insn (operands[0], tmp7);
6446}
6447
6448/* Expand V1TI mode rotate (of rtx_code CODE) by constant. */
1188cf5f
RS
6449void
6450ix86_expand_v1ti_rotate (enum rtx_code code, rtx operands[])
6b8b2557 6451{
6b8b2557
RS
6452 rtx op1 = force_reg (V1TImode, operands[1]);
6453
1188cf5f
RS
6454 if (!CONST_INT_P (operands[2]))
6455 {
6456 rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
6457 rtx tmp2 = gen_reg_rtx (TImode);
6458 rtx (*rotate) (rtx, rtx, rtx)
6459 = (code == ROTATE) ? gen_rotlti3 : gen_rotrti3;
6460 emit_insn (rotate (tmp2, tmp1, operands[2]));
6461 rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
6462 emit_move_insn (operands[0], tmp3);
6463 return;
6464 }
6465
6466 HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
6467
6b8b2557
RS
6468 if (bits == 0)
6469 {
6470 emit_move_insn (operands[0], op1);
6471 return;
6472 }
6473
6474 if (code == ROTATERT)
6475 bits = 128 - bits;
6476
6477 if ((bits & 31) == 0)
6478 {
6b8b2557 6479 rtx tmp2 = gen_reg_rtx (V4SImode);
51e9e8a2 6480 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6b8b2557
RS
6481 if (bits == 32)
6482 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x93)));
6483 else if (bits == 64)
6484 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x4e)));
6485 else
6486 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x39)));
51e9e8a2 6487 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp2));
6b8b2557
RS
6488 return;
6489 }
6490
6491 if ((bits & 7) == 0)
6492 {
6493 rtx tmp1 = gen_reg_rtx (V1TImode);
6494 rtx tmp2 = gen_reg_rtx (V1TImode);
6495 rtx tmp3 = gen_reg_rtx (V1TImode);
6496
6497 emit_insn (gen_sse2_ashlv1ti3 (tmp1, op1, GEN_INT (bits)));
6498 emit_insn (gen_sse2_lshrv1ti3 (tmp2, op1, GEN_INT (128 - bits)));
6499 emit_insn (gen_iorv1ti3 (tmp3, tmp1, tmp2));
6500 emit_move_insn (operands[0], tmp3);
6501 return;
6502 }
6503
51e9e8a2 6504 rtx op1_v4si = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6b8b2557
RS
6505
6506 rtx lobits;
6507 rtx hibits;
6508
6509 switch (bits >> 5)
6510 {
6511 case 0:
6512 lobits = op1_v4si;
6513 hibits = gen_reg_rtx (V4SImode);
6514 emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x93)));
6515 break;
6516
6517 case 1:
6518 lobits = gen_reg_rtx (V4SImode);
6519 hibits = gen_reg_rtx (V4SImode);
6520 emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x93)));
6521 emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x4e)));
6522 break;
6523
6524 case 2:
6525 lobits = gen_reg_rtx (V4SImode);
6526 hibits = gen_reg_rtx (V4SImode);
6527 emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x4e)));
6528 emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x39)));
6529 break;
6530
6531 default:
6532 lobits = gen_reg_rtx (V4SImode);
6533 emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x39)));
6534 hibits = op1_v4si;
6535 break;
6536 }
6537
6538 rtx tmp1 = gen_reg_rtx (V4SImode);
6539 rtx tmp2 = gen_reg_rtx (V4SImode);
6540 rtx tmp3 = gen_reg_rtx (V4SImode);
6b8b2557
RS
6541
6542 emit_insn (gen_ashlv4si3 (tmp1, lobits, GEN_INT (bits & 31)));
6543 emit_insn (gen_lshrv4si3 (tmp2, hibits, GEN_INT (32 - (bits & 31))));
6544 emit_insn (gen_iorv4si3 (tmp3, tmp1, tmp2));
51e9e8a2
RS
6545
6546 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp3));
6b8b2557
RS
6547}
6548
1188cf5f
RS
6549/* Expand V1TI mode ashiftrt by constant. */
6550void
6551ix86_expand_v1ti_ashiftrt (rtx operands[])
6552{
6553 rtx op1 = force_reg (V1TImode, operands[1]);
6554
6555 if (!CONST_INT_P (operands[2]))
6556 {
6557 rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
6558 rtx tmp2 = gen_reg_rtx (TImode);
6559 emit_insn (gen_ashrti3 (tmp2, tmp1, operands[2]));
6560 rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
6561 emit_move_insn (operands[0], tmp3);
6562 return;
6563 }
6564
6565 HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
6566
6567 if (bits == 0)
6568 {
6569 emit_move_insn (operands[0], op1);
6570 return;
6571 }
6572
6573 if (bits == 127)
6574 {
6575 /* Two operations. */
51e9e8a2 6576 rtx tmp1 = force_reg(V4SImode, gen_lowpart (V4SImode, op1));
1188cf5f 6577 rtx tmp2 = gen_reg_rtx (V4SImode);
1188cf5f
RS
6578 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6579
6580 rtx tmp3 = gen_reg_rtx (V4SImode);
6581 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6582
51e9e8a2 6583 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp3));
1188cf5f
RS
6584 return;
6585 }
6586
6587 if (bits == 64)
6588 {
6589 /* Three operations. */
51e9e8a2 6590 rtx tmp1 = force_reg(V4SImode, gen_lowpart (V4SImode, op1));
1188cf5f 6591 rtx tmp2 = gen_reg_rtx (V4SImode);
1188cf5f
RS
6592 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6593
6594 rtx tmp3 = gen_reg_rtx (V4SImode);
6595 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6596
51e9e8a2
RS
6597 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
6598 rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
1188cf5f 6599 rtx tmp6 = gen_reg_rtx (V2DImode);
1188cf5f
RS
6600 emit_insn (gen_vec_interleave_highv2di (tmp6, tmp4, tmp5));
6601
51e9e8a2 6602 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
1188cf5f
RS
6603 return;
6604 }
6605
6606 if (bits == 96)
6607 {
6608 /* Three operations. */
51e9e8a2 6609 rtx tmp1 = force_reg(V4SImode, gen_lowpart (V4SImode, op1));
1188cf5f 6610 rtx tmp2 = gen_reg_rtx (V4SImode);
1188cf5f
RS
6611 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (31)));
6612
51e9e8a2
RS
6613 rtx tmp3 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
6614 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp2));
1188cf5f 6615 rtx tmp5 = gen_reg_rtx (V2DImode);
1188cf5f
RS
6616 emit_insn (gen_vec_interleave_highv2di (tmp5, tmp3, tmp4));
6617
51e9e8a2 6618 rtx tmp6 = force_reg(V4SImode, gen_lowpart (V4SImode, tmp5));
1188cf5f 6619 rtx tmp7 = gen_reg_rtx (V4SImode);
1188cf5f
RS
6620 emit_insn (gen_sse2_pshufd (tmp7, tmp6, GEN_INT (0xfd)));
6621
51e9e8a2
RS
6622 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp7));
6623 return;
6624 }
6625
6626 if (bits >= 111)
6627 {
6628 /* Three operations. */
6629 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6630 rtx tmp2 = gen_reg_rtx (V4SImode);
6631 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits - 96)));
6632
6633 rtx tmp3 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
6634 rtx tmp4 = gen_reg_rtx (V8HImode);
6635 emit_insn (gen_sse2_pshufhw (tmp4, tmp3, GEN_INT (0xfe)));
6636
6637 rtx tmp5 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp4));
6638 rtx tmp6 = gen_reg_rtx (V4SImode);
6639 emit_insn (gen_sse2_pshufd (tmp6, tmp5, GEN_INT (0xfe)));
6640
6641 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
1188cf5f
RS
6642 return;
6643 }
6644
6645 if (TARGET_AVX2 || TARGET_SSE4_1)
6646 {
6647 /* Three operations. */
6648 if (bits == 32)
6649 {
51e9e8a2 6650 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
1188cf5f 6651 rtx tmp2 = gen_reg_rtx (V4SImode);
1188cf5f
RS
6652 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (31)));
6653
6654 rtx tmp3 = gen_reg_rtx (V1TImode);
6655 emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (32)));
6656
6657 if (TARGET_AVX2)
6658 {
51e9e8a2 6659 rtx tmp4 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp3));
1188cf5f 6660 rtx tmp5 = gen_reg_rtx (V4SImode);
1188cf5f
RS
6661 emit_insn (gen_avx2_pblenddv4si (tmp5, tmp2, tmp4,
6662 GEN_INT (7)));
6663
51e9e8a2 6664 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp5));
1188cf5f
RS
6665 }
6666 else
6667 {
51e9e8a2
RS
6668 rtx tmp4 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
6669 rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
1188cf5f 6670 rtx tmp6 = gen_reg_rtx (V8HImode);
1188cf5f
RS
6671 emit_insn (gen_sse4_1_pblendw (tmp6, tmp4, tmp5,
6672 GEN_INT (0x3f)));
6673
51e9e8a2 6674 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
1188cf5f
RS
6675 }
6676 return;
6677 }
6678
6679 /* Three operations. */
6680 if (bits == 8 || bits == 16 || bits == 24)
6681 {
51e9e8a2 6682 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
1188cf5f 6683 rtx tmp2 = gen_reg_rtx (V4SImode);
1188cf5f
RS
6684 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
6685
6686 rtx tmp3 = gen_reg_rtx (V1TImode);
6687 emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (bits)));
6688
6689 if (TARGET_AVX2)
6690 {
51e9e8a2 6691 rtx tmp4 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp3));
1188cf5f 6692 rtx tmp5 = gen_reg_rtx (V4SImode);
1188cf5f
RS
6693 emit_insn (gen_avx2_pblenddv4si (tmp5, tmp2, tmp4,
6694 GEN_INT (7)));
6695
51e9e8a2 6696 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp5));
1188cf5f
RS
6697 }
6698 else
6699 {
51e9e8a2
RS
6700 rtx tmp4 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
6701 rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
1188cf5f 6702 rtx tmp6 = gen_reg_rtx (V8HImode);
1188cf5f
RS
6703 emit_insn (gen_sse4_1_pblendw (tmp6, tmp4, tmp5,
6704 GEN_INT (0x3f)));
6705
51e9e8a2 6706 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
1188cf5f
RS
6707 }
6708 return;
6709 }
6710 }
6711
6712 if (bits > 96)
6713 {
6714 /* Four operations. */
51e9e8a2 6715 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
1188cf5f 6716 rtx tmp2 = gen_reg_rtx (V4SImode);
1188cf5f
RS
6717 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits - 96)));
6718
6719 rtx tmp3 = gen_reg_rtx (V4SImode);
6720 emit_insn (gen_ashrv4si3 (tmp3, tmp1, GEN_INT (31)));
6721
51e9e8a2
RS
6722 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp2));
6723 rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
1188cf5f 6724 rtx tmp6 = gen_reg_rtx (V2DImode);
1188cf5f
RS
6725 emit_insn (gen_vec_interleave_highv2di (tmp6, tmp4, tmp5));
6726
51e9e8a2 6727 rtx tmp7 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp6));
1188cf5f 6728 rtx tmp8 = gen_reg_rtx (V4SImode);
1188cf5f
RS
6729 emit_insn (gen_sse2_pshufd (tmp8, tmp7, GEN_INT (0xfd)));
6730
51e9e8a2 6731 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp8));
1188cf5f
RS
6732 return;
6733 }
6734
6735 if (TARGET_SSE4_1 && (bits == 48 || bits == 80))
6736 {
6737 /* Four operations. */
51e9e8a2 6738 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
1188cf5f 6739 rtx tmp2 = gen_reg_rtx (V4SImode);
1188cf5f
RS
6740 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6741
6742 rtx tmp3 = gen_reg_rtx (V4SImode);
6743 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6744
6745 rtx tmp4 = gen_reg_rtx (V1TImode);
6746 emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (bits)));
6747
51e9e8a2
RS
6748 rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
6749 rtx tmp6 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp4));
1188cf5f 6750 rtx tmp7 = gen_reg_rtx (V8HImode);
1188cf5f
RS
6751 emit_insn (gen_sse4_1_pblendw (tmp7, tmp5, tmp6,
6752 GEN_INT (bits == 48 ? 0x1f : 0x07)));
6753
51e9e8a2 6754 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp7));
1188cf5f
RS
6755 return;
6756 }
6757
6758 if ((bits & 7) == 0)
6759 {
6760 /* Five operations. */
51e9e8a2 6761 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
1188cf5f 6762 rtx tmp2 = gen_reg_rtx (V4SImode);
1188cf5f
RS
6763 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6764
6765 rtx tmp3 = gen_reg_rtx (V4SImode);
6766 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6767
6768 rtx tmp4 = gen_reg_rtx (V1TImode);
6769 emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (bits)));
6770
51e9e8a2 6771 rtx tmp5 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
1188cf5f 6772 rtx tmp6 = gen_reg_rtx (V1TImode);
1188cf5f
RS
6773 emit_insn (gen_sse2_ashlv1ti3 (tmp6, tmp5, GEN_INT (128 - bits)));
6774
51e9e8a2
RS
6775 rtx tmp7 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
6776 rtx tmp8 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp6));
1188cf5f 6777 rtx tmp9 = gen_reg_rtx (V2DImode);
1188cf5f
RS
6778 emit_insn (gen_iorv2di3 (tmp9, tmp7, tmp8));
6779
51e9e8a2 6780 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp9));
1188cf5f
RS
6781 return;
6782 }
6783
6784 if (TARGET_AVX2 && bits < 32)
6785 {
6786 /* Six operations. */
51e9e8a2 6787 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
1188cf5f 6788 rtx tmp2 = gen_reg_rtx (V4SImode);
1188cf5f
RS
6789 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
6790
6791 rtx tmp3 = gen_reg_rtx (V1TImode);
6792 emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (64)));
6793
51e9e8a2 6794 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
1188cf5f 6795 rtx tmp5 = gen_reg_rtx (V2DImode);
1188cf5f
RS
6796 emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
6797
51e9e8a2 6798 rtx tmp6 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
1188cf5f 6799 rtx tmp7 = gen_reg_rtx (V2DImode);
1188cf5f
RS
6800 emit_insn (gen_ashlv2di3 (tmp7, tmp6, GEN_INT (64 - bits)));
6801
6802 rtx tmp8 = gen_reg_rtx (V2DImode);
6803 emit_insn (gen_iorv2di3 (tmp8, tmp5, tmp7));
6804
51e9e8a2 6805 rtx tmp9 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp8));
1188cf5f 6806 rtx tmp10 = gen_reg_rtx (V4SImode);
1188cf5f
RS
6807 emit_insn (gen_avx2_pblenddv4si (tmp10, tmp2, tmp9, GEN_INT (7)));
6808
51e9e8a2 6809 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp10));
1188cf5f
RS
6810 return;
6811 }
6812
6813 if (TARGET_SSE4_1 && bits < 15)
6814 {
6815 /* Six operations. */
51e9e8a2 6816 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
1188cf5f 6817 rtx tmp2 = gen_reg_rtx (V4SImode);
1188cf5f
RS
6818 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
6819
6820 rtx tmp3 = gen_reg_rtx (V1TImode);
6821 emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (64)));
6822
51e9e8a2 6823 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
1188cf5f 6824 rtx tmp5 = gen_reg_rtx (V2DImode);
1188cf5f
RS
6825 emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
6826
51e9e8a2 6827 rtx tmp6 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
1188cf5f 6828 rtx tmp7 = gen_reg_rtx (V2DImode);
1188cf5f
RS
6829 emit_insn (gen_ashlv2di3 (tmp7, tmp6, GEN_INT (64 - bits)));
6830
6831 rtx tmp8 = gen_reg_rtx (V2DImode);
6832 emit_insn (gen_iorv2di3 (tmp8, tmp5, tmp7));
6833
51e9e8a2
RS
6834 rtx tmp9 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
6835 rtx tmp10 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp8));
1188cf5f 6836 rtx tmp11 = gen_reg_rtx (V8HImode);
1188cf5f
RS
6837 emit_insn (gen_sse4_1_pblendw (tmp11, tmp9, tmp10, GEN_INT (0x3f)));
6838
51e9e8a2 6839 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp11));
1188cf5f
RS
6840 return;
6841 }
6842
6843 if (bits == 1)
6844 {
6845 /* Eight operations. */
6846 rtx tmp1 = gen_reg_rtx (V1TImode);
6847 emit_insn (gen_sse2_lshrv1ti3 (tmp1, op1, GEN_INT (64)));
6848
51e9e8a2 6849 rtx tmp2 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
1188cf5f 6850 rtx tmp3 = gen_reg_rtx (V2DImode);
1188cf5f
RS
6851 emit_insn (gen_lshrv2di3 (tmp3, tmp2, GEN_INT (1)));
6852
51e9e8a2 6853 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
1188cf5f 6854 rtx tmp5 = gen_reg_rtx (V2DImode);
1188cf5f
RS
6855 emit_insn (gen_ashlv2di3 (tmp5, tmp4, GEN_INT (63)));
6856
6857 rtx tmp6 = gen_reg_rtx (V2DImode);
6858 emit_insn (gen_iorv2di3 (tmp6, tmp3, tmp5));
6859
6860 rtx tmp7 = gen_reg_rtx (V2DImode);
6861 emit_insn (gen_lshrv2di3 (tmp7, tmp2, GEN_INT (63)));
6862
51e9e8a2 6863 rtx tmp8 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp7));
1188cf5f 6864 rtx tmp9 = gen_reg_rtx (V4SImode);
1188cf5f
RS
6865 emit_insn (gen_sse2_pshufd (tmp9, tmp8, GEN_INT (0xbf)));
6866
51e9e8a2 6867 rtx tmp10 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp9));
1188cf5f 6868 rtx tmp11 = gen_reg_rtx (V2DImode);
1188cf5f
RS
6869 emit_insn (gen_ashlv2di3 (tmp11, tmp10, GEN_INT (31)));
6870
6871 rtx tmp12 = gen_reg_rtx (V2DImode);
6872 emit_insn (gen_iorv2di3 (tmp12, tmp6, tmp11));
6873
51e9e8a2 6874 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp12));
1188cf5f
RS
6875 return;
6876 }
6877
6878 if (bits > 64)
6879 {
6880 /* Eight operations. */
51e9e8a2 6881 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
1188cf5f 6882 rtx tmp2 = gen_reg_rtx (V4SImode);
1188cf5f
RS
6883 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6884
6885 rtx tmp3 = gen_reg_rtx (V4SImode);
6886 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6887
6888 rtx tmp4 = gen_reg_rtx (V1TImode);
6889 emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (64)));
6890
51e9e8a2 6891 rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
1188cf5f 6892 rtx tmp6 = gen_reg_rtx (V2DImode);
1188cf5f
RS
6893 emit_insn (gen_lshrv2di3 (tmp6, tmp5, GEN_INT (bits - 64)));
6894
51e9e8a2 6895 rtx tmp7 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
1188cf5f 6896 rtx tmp8 = gen_reg_rtx (V1TImode);
1188cf5f
RS
6897 emit_insn (gen_sse2_ashlv1ti3 (tmp8, tmp7, GEN_INT (64)));
6898
51e9e8a2 6899 rtx tmp9 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
1188cf5f 6900 rtx tmp10 = gen_reg_rtx (V2DImode);
1188cf5f
RS
6901 emit_insn (gen_ashlv2di3 (tmp10, tmp9, GEN_INT (128 - bits)));
6902
51e9e8a2 6903 rtx tmp11 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp8));
1188cf5f 6904 rtx tmp12 = gen_reg_rtx (V2DImode);
1188cf5f
RS
6905 emit_insn (gen_iorv2di3 (tmp12, tmp10, tmp11));
6906
6907 rtx tmp13 = gen_reg_rtx (V2DImode);
6908 emit_insn (gen_iorv2di3 (tmp13, tmp6, tmp12));
6909
51e9e8a2 6910 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp13));
1188cf5f
RS
6911 }
6912 else
6913 {
6914 /* Nine operations. */
51e9e8a2 6915 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
1188cf5f 6916 rtx tmp2 = gen_reg_rtx (V4SImode);
1188cf5f
RS
6917 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6918
6919 rtx tmp3 = gen_reg_rtx (V4SImode);
6920 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6921
6922 rtx tmp4 = gen_reg_rtx (V1TImode);
6923 emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (64)));
6924
51e9e8a2 6925 rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
1188cf5f 6926 rtx tmp6 = gen_reg_rtx (V2DImode);
1188cf5f
RS
6927 emit_insn (gen_lshrv2di3 (tmp6, tmp5, GEN_INT (bits)));
6928
51e9e8a2 6929 rtx tmp7 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
1188cf5f 6930 rtx tmp8 = gen_reg_rtx (V2DImode);
1188cf5f
RS
6931 emit_insn (gen_ashlv2di3 (tmp8, tmp7, GEN_INT (64 - bits)));
6932
6933 rtx tmp9 = gen_reg_rtx (V2DImode);
6934 emit_insn (gen_iorv2di3 (tmp9, tmp6, tmp8));
6935
51e9e8a2 6936 rtx tmp10 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
1188cf5f 6937 rtx tmp11 = gen_reg_rtx (V1TImode);
1188cf5f
RS
6938 emit_insn (gen_sse2_ashlv1ti3 (tmp11, tmp10, GEN_INT (64)));
6939
51e9e8a2 6940 rtx tmp12 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp11));
1188cf5f 6941 rtx tmp13 = gen_reg_rtx (V2DImode);
1188cf5f
RS
6942 emit_insn (gen_ashlv2di3 (tmp13, tmp12, GEN_INT (64 - bits)));
6943
6944 rtx tmp14 = gen_reg_rtx (V2DImode);
6945 emit_insn (gen_iorv2di3 (tmp14, tmp9, tmp13));
6946
51e9e8a2 6947 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp14));
1188cf5f
RS
6948 }
6949}
6950
2bf6d935
ML
6951/* Return mode for the memcpy/memset loop counter. Prefer SImode over
6952 DImode for constant loop counts. */
6953
6954static machine_mode
6955counter_mode (rtx count_exp)
6956{
6957 if (GET_MODE (count_exp) != VOIDmode)
6958 return GET_MODE (count_exp);
6959 if (!CONST_INT_P (count_exp))
6960 return Pmode;
6961 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
6962 return DImode;
6963 return SImode;
6964}
6965
6966/* When ISSETMEM is FALSE, output simple loop to move memory pointer to SRCPTR
6967 to DESTPTR via chunks of MODE unrolled UNROLL times, overall size is COUNT
6968 specified in bytes. When ISSETMEM is TRUE, output the equivalent loop to set
6969 memory by VALUE (supposed to be in MODE).
6970
6971 The size is rounded down to whole number of chunk size moved at once.
6972 SRCMEM and DESTMEM provide MEMrtx to feed proper aliasing info. */
6973
6974
6975static void
76715c32 6976expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
2bf6d935
ML
6977 rtx destptr, rtx srcptr, rtx value,
6978 rtx count, machine_mode mode, int unroll,
6979 int expected_size, bool issetmem)
6980{
6981 rtx_code_label *out_label, *top_label;
6982 rtx iter, tmp;
6983 machine_mode iter_mode = counter_mode (count);
6984 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
6985 rtx piece_size = GEN_INT (piece_size_n);
6986 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
6987 rtx size;
6988 int i;
6989
6990 top_label = gen_label_rtx ();
6991 out_label = gen_label_rtx ();
6992 iter = gen_reg_rtx (iter_mode);
6993
6994 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
6995 NULL, 1, OPTAB_DIRECT);
6996 /* Those two should combine. */
6997 if (piece_size == const1_rtx)
6998 {
6999 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
7000 true, out_label);
7001 predict_jump (REG_BR_PROB_BASE * 10 / 100);
7002 }
7003 emit_move_insn (iter, const0_rtx);
7004
7005 emit_label (top_label);
7006
7007 tmp = convert_modes (Pmode, iter_mode, iter, true);
7008
7009 /* This assert could be relaxed - in this case we'll need to compute
7010 smallest power of two, containing in PIECE_SIZE_N and pass it to
7011 offset_address. */
7012 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
7013 destmem = offset_address (destmem, tmp, piece_size_n);
7014 destmem = adjust_address (destmem, mode, 0);
7015
7016 if (!issetmem)
7017 {
7018 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
7019 srcmem = adjust_address (srcmem, mode, 0);
7020
7021 /* When unrolling for chips that reorder memory reads and writes,
7022 we can save registers by using single temporary.
7023 Also using 4 temporaries is overkill in 32bit mode. */
7024 if (!TARGET_64BIT && 0)
7025 {
7026 for (i = 0; i < unroll; i++)
7027 {
7028 if (i)
7029 {
7030 destmem = adjust_address (copy_rtx (destmem), mode,
7031 GET_MODE_SIZE (mode));
7032 srcmem = adjust_address (copy_rtx (srcmem), mode,
7033 GET_MODE_SIZE (mode));
7034 }
7035 emit_move_insn (destmem, srcmem);
7036 }
7037 }
7038 else
7039 {
7040 rtx tmpreg[4];
7041 gcc_assert (unroll <= 4);
7042 for (i = 0; i < unroll; i++)
7043 {
7044 tmpreg[i] = gen_reg_rtx (mode);
7045 if (i)
7046 srcmem = adjust_address (copy_rtx (srcmem), mode,
7047 GET_MODE_SIZE (mode));
7048 emit_move_insn (tmpreg[i], srcmem);
7049 }
7050 for (i = 0; i < unroll; i++)
7051 {
7052 if (i)
7053 destmem = adjust_address (copy_rtx (destmem), mode,
7054 GET_MODE_SIZE (mode));
7055 emit_move_insn (destmem, tmpreg[i]);
7056 }
7057 }
7058 }
7059 else
7060 for (i = 0; i < unroll; i++)
7061 {
7062 if (i)
7063 destmem = adjust_address (copy_rtx (destmem), mode,
7064 GET_MODE_SIZE (mode));
7065 emit_move_insn (destmem, value);
7066 }
7067
7068 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
7069 true, OPTAB_LIB_WIDEN);
7070 if (tmp != iter)
7071 emit_move_insn (iter, tmp);
7072
7073 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
7074 true, top_label);
7075 if (expected_size != -1)
7076 {
7077 expected_size /= GET_MODE_SIZE (mode) * unroll;
7078 if (expected_size == 0)
7079 predict_jump (0);
7080 else if (expected_size > REG_BR_PROB_BASE)
7081 predict_jump (REG_BR_PROB_BASE - 1);
7082 else
7083 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2)
7084 / expected_size);
7085 }
7086 else
7087 predict_jump (REG_BR_PROB_BASE * 80 / 100);
7088 iter = ix86_zero_extend_to_Pmode (iter);
7089 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
7090 true, OPTAB_LIB_WIDEN);
7091 if (tmp != destptr)
7092 emit_move_insn (destptr, tmp);
7093 if (!issetmem)
7094 {
7095 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
7096 true, OPTAB_LIB_WIDEN);
7097 if (tmp != srcptr)
7098 emit_move_insn (srcptr, tmp);
7099 }
7100 emit_label (out_label);
7101}
7102
7103/* Divide COUNTREG by SCALE. */
7104static rtx
7105scale_counter (rtx countreg, int scale)
7106{
7107 rtx sc;
7108
7109 if (scale == 1)
7110 return countreg;
7111 if (CONST_INT_P (countreg))
7112 return GEN_INT (INTVAL (countreg) / scale);
7113 gcc_assert (REG_P (countreg));
7114
7115 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
7116 GEN_INT (exact_log2 (scale)),
7117 NULL, 1, OPTAB_DIRECT);
7118 return sc;
7119}
7120
7121/* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
7122 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
7123 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
7124 For setmem case, VALUE is a promoted to a wider size ORIG_VALUE.
7125 ORIG_VALUE is the original value passed to memset to fill the memory with.
7126 Other arguments have same meaning as for previous function. */
7127
7128static void
76715c32 7129expand_set_or_cpymem_via_rep (rtx destmem, rtx srcmem,
2bf6d935
ML
7130 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
7131 rtx count,
7132 machine_mode mode, bool issetmem)
7133{
7134 rtx destexp;
7135 rtx srcexp;
7136 rtx countreg;
7137 HOST_WIDE_INT rounded_count;
7138
7139 /* If possible, it is shorter to use rep movs.
7140 TODO: Maybe it is better to move this logic to decide_alg. */
7141 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
bf24f4ec 7142 && !TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
2bf6d935
ML
7143 && (!issetmem || orig_value == const0_rtx))
7144 mode = SImode;
7145
7146 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
7147 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
7148
7149 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
7150 GET_MODE_SIZE (mode)));
7151 if (mode != QImode)
7152 {
7153 destexp = gen_rtx_ASHIFT (Pmode, countreg,
7154 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
7155 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
7156 }
7157 else
7158 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
7159 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
7160 {
7161 rounded_count
7162 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
7163 destmem = shallow_copy_rtx (destmem);
7164 set_mem_size (destmem, rounded_count);
7165 }
7166 else if (MEM_SIZE_KNOWN_P (destmem))
7167 clear_mem_size (destmem);
7168
7169 if (issetmem)
7170 {
7171 value = force_reg (mode, gen_lowpart (mode, value));
7172 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
7173 }
7174 else
7175 {
7176 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
7177 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
7178 if (mode != QImode)
7179 {
7180 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
7181 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
7182 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
7183 }
7184 else
7185 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
7186 if (CONST_INT_P (count))
7187 {
7188 rounded_count
7189 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
7190 srcmem = shallow_copy_rtx (srcmem);
7191 set_mem_size (srcmem, rounded_count);
7192 }
7193 else
7194 {
7195 if (MEM_SIZE_KNOWN_P (srcmem))
7196 clear_mem_size (srcmem);
7197 }
7198 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
7199 destexp, srcexp));
7200 }
7201}
7202
7203/* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
7204 DESTMEM.
7205 SRC is passed by pointer to be updated on return.
7206 Return value is updated DST. */
7207static rtx
7208emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
7209 HOST_WIDE_INT size_to_move)
7210{
c3185b64 7211 rtx dst = destmem, src = *srcmem, tempreg;
2bf6d935
ML
7212 enum insn_code code;
7213 machine_mode move_mode;
7214 int piece_size, i;
7215
7216 /* Find the widest mode in which we could perform moves.
7217 Start with the biggest power of 2 less than SIZE_TO_MOVE and half
7218 it until move of such size is supported. */
7219 piece_size = 1 << floor_log2 (size_to_move);
7220 while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
7221 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
7222 {
7223 gcc_assert (piece_size > 1);
7224 piece_size >>= 1;
7225 }
7226
7227 /* Find the corresponding vector mode with the same size as MOVE_MODE.
7228 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
7229 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
7230 {
7231 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
7232 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
7233 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
7234 {
7235 move_mode = word_mode;
7236 piece_size = GET_MODE_SIZE (move_mode);
7237 code = optab_handler (mov_optab, move_mode);
7238 }
7239 }
7240 gcc_assert (code != CODE_FOR_nothing);
7241
7242 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
7243 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
7244
7245 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
7246 gcc_assert (size_to_move % piece_size == 0);
c3185b64 7247
2bf6d935
ML
7248 for (i = 0; i < size_to_move; i += piece_size)
7249 {
7250 /* We move from memory to memory, so we'll need to do it via
7251 a temporary register. */
7252 tempreg = gen_reg_rtx (move_mode);
7253 emit_insn (GEN_FCN (code) (tempreg, src));
7254 emit_insn (GEN_FCN (code) (dst, tempreg));
7255
7256 emit_move_insn (destptr,
c3185b64 7257 plus_constant (Pmode, copy_rtx (destptr), piece_size));
2bf6d935 7258 emit_move_insn (srcptr,
c3185b64 7259 plus_constant (Pmode, copy_rtx (srcptr), piece_size));
2bf6d935
ML
7260
7261 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
7262 piece_size);
7263 src = adjust_automodify_address_nv (src, move_mode, srcptr,
7264 piece_size);
7265 }
7266
7267 /* Update DST and SRC rtx. */
7268 *srcmem = src;
7269 return dst;
7270}
7271
7272/* Helper function for the string operations below. Dest VARIABLE whether
7273 it is aligned to VALUE bytes. If true, jump to the label. */
7274
7275static rtx_code_label *
7276ix86_expand_aligntest (rtx variable, int value, bool epilogue)
7277{
7278 rtx_code_label *label = gen_label_rtx ();
7279 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
7280 if (GET_MODE (variable) == DImode)
7281 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
7282 else
7283 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
7284 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
7285 1, label);
7286 if (epilogue)
7287 predict_jump (REG_BR_PROB_BASE * 50 / 100);
7288 else
7289 predict_jump (REG_BR_PROB_BASE * 90 / 100);
7290 return label;
7291}
7292
7293
7294/* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
7295
7296static void
76715c32 7297expand_cpymem_epilogue (rtx destmem, rtx srcmem,
2bf6d935
ML
7298 rtx destptr, rtx srcptr, rtx count, int max_size)
7299{
7300 rtx src, dest;
7301 if (CONST_INT_P (count))
7302 {
7303 HOST_WIDE_INT countval = INTVAL (count);
7304 HOST_WIDE_INT epilogue_size = countval % max_size;
7305 int i;
7306
7307 /* For now MAX_SIZE should be a power of 2. This assert could be
7308 relaxed, but it'll require a bit more complicated epilogue
7309 expanding. */
7310 gcc_assert ((max_size & (max_size - 1)) == 0);
7311 for (i = max_size; i >= 1; i >>= 1)
7312 {
7313 if (epilogue_size & i)
7314 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
7315 }
7316 return;
7317 }
7318 if (max_size > 8)
7319 {
7320 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
7321 count, 1, OPTAB_DIRECT);
76715c32 7322 expand_set_or_cpymem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
2bf6d935
ML
7323 count, QImode, 1, 4, false);
7324 return;
7325 }
7326
7327 /* When there are stringops, we can cheaply increase dest and src pointers.
7328 Otherwise we save code size by maintaining offset (zero is readily
7329 available from preceding rep operation) and using x86 addressing modes.
7330 */
7331 if (TARGET_SINGLE_STRINGOP)
7332 {
7333 if (max_size > 4)
7334 {
7335 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
7336 src = change_address (srcmem, SImode, srcptr);
7337 dest = change_address (destmem, SImode, destptr);
7338 emit_insn (gen_strmov (destptr, dest, srcptr, src));
7339 emit_label (label);
7340 LABEL_NUSES (label) = 1;
7341 }
7342 if (max_size > 2)
7343 {
7344 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
7345 src = change_address (srcmem, HImode, srcptr);
7346 dest = change_address (destmem, HImode, destptr);
7347 emit_insn (gen_strmov (destptr, dest, srcptr, src));
7348 emit_label (label);
7349 LABEL_NUSES (label) = 1;
7350 }
7351 if (max_size > 1)
7352 {
7353 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
7354 src = change_address (srcmem, QImode, srcptr);
7355 dest = change_address (destmem, QImode, destptr);
7356 emit_insn (gen_strmov (destptr, dest, srcptr, src));
7357 emit_label (label);
7358 LABEL_NUSES (label) = 1;
7359 }
7360 }
7361 else
7362 {
7363 rtx offset = force_reg (Pmode, const0_rtx);
7364 rtx tmp;
7365
7366 if (max_size > 4)
7367 {
7368 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
7369 src = change_address (srcmem, SImode, srcptr);
7370 dest = change_address (destmem, SImode, destptr);
7371 emit_move_insn (dest, src);
7372 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
7373 true, OPTAB_LIB_WIDEN);
7374 if (tmp != offset)
7375 emit_move_insn (offset, tmp);
7376 emit_label (label);
7377 LABEL_NUSES (label) = 1;
7378 }
7379 if (max_size > 2)
7380 {
7381 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
7382 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
7383 src = change_address (srcmem, HImode, tmp);
7384 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
7385 dest = change_address (destmem, HImode, tmp);
7386 emit_move_insn (dest, src);
7387 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
7388 true, OPTAB_LIB_WIDEN);
7389 if (tmp != offset)
7390 emit_move_insn (offset, tmp);
7391 emit_label (label);
7392 LABEL_NUSES (label) = 1;
7393 }
7394 if (max_size > 1)
7395 {
7396 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
7397 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
7398 src = change_address (srcmem, QImode, tmp);
7399 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
7400 dest = change_address (destmem, QImode, tmp);
7401 emit_move_insn (dest, src);
7402 emit_label (label);
7403 LABEL_NUSES (label) = 1;
7404 }
7405 }
7406}
7407
7408/* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
7409 with value PROMOTED_VAL.
7410 SRC is passed by pointer to be updated on return.
7411 Return value is updated DST. */
7412static rtx
7413emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
7414 HOST_WIDE_INT size_to_move)
7415{
c3185b64 7416 rtx dst = destmem;
2bf6d935
ML
7417 enum insn_code code;
7418 machine_mode move_mode;
7419 int piece_size, i;
7420
7421 /* Find the widest mode in which we could perform moves.
7422 Start with the biggest power of 2 less than SIZE_TO_MOVE and half
7423 it until move of such size is supported. */
7424 move_mode = GET_MODE (promoted_val);
7425 if (move_mode == VOIDmode)
7426 move_mode = QImode;
7427 if (size_to_move < GET_MODE_SIZE (move_mode))
7428 {
7429 unsigned int move_bits = size_to_move * BITS_PER_UNIT;
7430 move_mode = int_mode_for_size (move_bits, 0).require ();
7431 promoted_val = gen_lowpart (move_mode, promoted_val);
7432 }
7433 piece_size = GET_MODE_SIZE (move_mode);
7434 code = optab_handler (mov_optab, move_mode);
7435 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
7436
7437 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
7438
7439 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
7440 gcc_assert (size_to_move % piece_size == 0);
c3185b64 7441
2bf6d935
ML
7442 for (i = 0; i < size_to_move; i += piece_size)
7443 {
7444 if (piece_size <= GET_MODE_SIZE (word_mode))
7445 {
7446 emit_insn (gen_strset (destptr, dst, promoted_val));
7447 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
7448 piece_size);
7449 continue;
7450 }
7451
7452 emit_insn (GEN_FCN (code) (dst, promoted_val));
7453
7454 emit_move_insn (destptr,
c3185b64 7455 plus_constant (Pmode, copy_rtx (destptr), piece_size));
2bf6d935
ML
7456
7457 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
7458 piece_size);
7459 }
7460
7461 /* Update DST rtx. */
7462 return dst;
7463}
7464/* Output code to set at most count & (max_size - 1) bytes starting by DEST. */
7465static void
7466expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
7467 rtx count, int max_size)
7468{
7469 count = expand_simple_binop (counter_mode (count), AND, count,
7470 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
76715c32 7471 expand_set_or_cpymem_via_loop (destmem, NULL, destptr, NULL,
2bf6d935
ML
7472 gen_lowpart (QImode, value), count, QImode,
7473 1, max_size / 2, true);
7474}
7475
7476/* Output code to set at most count & (max_size - 1) bytes starting by DEST. */
7477static void
7478expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
7479 rtx count, int max_size)
7480{
7481 rtx dest;
7482
7483 if (CONST_INT_P (count))
7484 {
7485 HOST_WIDE_INT countval = INTVAL (count);
7486 HOST_WIDE_INT epilogue_size = countval % max_size;
7487 int i;
7488
7489 /* For now MAX_SIZE should be a power of 2. This assert could be
7490 relaxed, but it'll require a bit more complicated epilogue
7491 expanding. */
7492 gcc_assert ((max_size & (max_size - 1)) == 0);
7493 for (i = max_size; i >= 1; i >>= 1)
7494 {
7495 if (epilogue_size & i)
7496 {
7497 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
7498 destmem = emit_memset (destmem, destptr, vec_value, i);
7499 else
7500 destmem = emit_memset (destmem, destptr, value, i);
7501 }
7502 }
7503 return;
7504 }
7505 if (max_size > 32)
7506 {
7507 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
7508 return;
7509 }
7510 if (max_size > 16)
7511 {
7512 rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
7513 if (TARGET_64BIT)
7514 {
7515 dest = change_address (destmem, DImode, destptr);
7516 emit_insn (gen_strset (destptr, dest, value));
7517 dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
7518 emit_insn (gen_strset (destptr, dest, value));
7519 }
7520 else
7521 {
7522 dest = change_address (destmem, SImode, destptr);
7523 emit_insn (gen_strset (destptr, dest, value));
7524 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
7525 emit_insn (gen_strset (destptr, dest, value));
7526 dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
7527 emit_insn (gen_strset (destptr, dest, value));
7528 dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
7529 emit_insn (gen_strset (destptr, dest, value));
7530 }
7531 emit_label (label);
7532 LABEL_NUSES (label) = 1;
7533 }
7534 if (max_size > 8)
7535 {
7536 rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
7537 if (TARGET_64BIT)
7538 {
7539 dest = change_address (destmem, DImode, destptr);
7540 emit_insn (gen_strset (destptr, dest, value));
7541 }
7542 else
7543 {
7544 dest = change_address (destmem, SImode, destptr);
7545 emit_insn (gen_strset (destptr, dest, value));
7546 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
7547 emit_insn (gen_strset (destptr, dest, value));
7548 }
7549 emit_label (label);
7550 LABEL_NUSES (label) = 1;
7551 }
7552 if (max_size > 4)
7553 {
7554 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
7555 dest = change_address (destmem, SImode, destptr);
7556 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
7557 emit_label (label);
7558 LABEL_NUSES (label) = 1;
7559 }
7560 if (max_size > 2)
7561 {
7562 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
7563 dest = change_address (destmem, HImode, destptr);
7564 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
7565 emit_label (label);
7566 LABEL_NUSES (label) = 1;
7567 }
7568 if (max_size > 1)
7569 {
7570 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
7571 dest = change_address (destmem, QImode, destptr);
7572 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
7573 emit_label (label);
7574 LABEL_NUSES (label) = 1;
7575 }
7576}
7577
7578/* Adjust COUNTER by the VALUE. */
7579static void
7580ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
7581{
83bc5e44 7582 emit_insn (gen_add2_insn (countreg, GEN_INT (-value)));
2bf6d935
ML
7583}
7584
7585/* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
7586 DESTMEM to align it to DESIRED_ALIGNMENT. Original alignment is ALIGN.
7587 Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
7588 ignored.
7589 Return value is updated DESTMEM. */
7590
7591static rtx
76715c32 7592expand_set_or_cpymem_prologue (rtx destmem, rtx srcmem,
2bf6d935
ML
7593 rtx destptr, rtx srcptr, rtx value,
7594 rtx vec_value, rtx count, int align,
7595 int desired_alignment, bool issetmem)
7596{
7597 int i;
7598 for (i = 1; i < desired_alignment; i <<= 1)
7599 {
7600 if (align <= i)
7601 {
7602 rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
7603 if (issetmem)
7604 {
7605 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
7606 destmem = emit_memset (destmem, destptr, vec_value, i);
7607 else
7608 destmem = emit_memset (destmem, destptr, value, i);
7609 }
7610 else
7611 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
7612 ix86_adjust_counter (count, i);
7613 emit_label (label);
7614 LABEL_NUSES (label) = 1;
7615 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
7616 }
7617 }
7618 return destmem;
7619}
7620
7621/* Test if COUNT&SIZE is nonzero and if so, expand movme
7622 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
7623 and jump to DONE_LABEL. */
7624static void
76715c32 7625expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem,
2bf6d935
ML
7626 rtx destptr, rtx srcptr,
7627 rtx value, rtx vec_value,
7628 rtx count, int size,
7629 rtx done_label, bool issetmem)
7630{
7631 rtx_code_label *label = ix86_expand_aligntest (count, size, false);
7632 machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
7633 rtx modesize;
7634 int n;
7635
7636 /* If we do not have vector value to copy, we must reduce size. */
7637 if (issetmem)
7638 {
7639 if (!vec_value)
7640 {
7641 if (GET_MODE (value) == VOIDmode && size > 8)
7642 mode = Pmode;
7643 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
7644 mode = GET_MODE (value);
7645 }
7646 else
7647 mode = GET_MODE (vec_value), value = vec_value;
7648 }
7649 else
7650 {
7651 /* Choose appropriate vector mode. */
7652 if (size >= 32)
7653 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
7654 else if (size >= 16)
7655 mode = TARGET_SSE ? V16QImode : DImode;
7656 srcmem = change_address (srcmem, mode, srcptr);
7657 }
7658 destmem = change_address (destmem, mode, destptr);
7659 modesize = GEN_INT (GET_MODE_SIZE (mode));
7660 gcc_assert (GET_MODE_SIZE (mode) <= size);
7661 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
7662 {
7663 if (issetmem)
7664 emit_move_insn (destmem, gen_lowpart (mode, value));
7665 else
7666 {
7667 emit_move_insn (destmem, srcmem);
7668 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
7669 }
7670 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
7671 }
7672
7673 destmem = offset_address (destmem, count, 1);
7674 destmem = offset_address (destmem, GEN_INT (-2 * size),
7675 GET_MODE_SIZE (mode));
7676 if (!issetmem)
7677 {
7678 srcmem = offset_address (srcmem, count, 1);
7679 srcmem = offset_address (srcmem, GEN_INT (-2 * size),
7680 GET_MODE_SIZE (mode));
7681 }
7682 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
7683 {
7684 if (issetmem)
7685 emit_move_insn (destmem, gen_lowpart (mode, value));
7686 else
7687 {
7688 emit_move_insn (destmem, srcmem);
7689 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
7690 }
7691 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
7692 }
7693 emit_jump_insn (gen_jump (done_label));
7694 emit_barrier ();
7695
7696 emit_label (label);
7697 LABEL_NUSES (label) = 1;
7698}
7699
7700/* Handle small memcpy (up to SIZE that is supposed to be small power of 2.
7701 and get ready for the main memcpy loop by copying iniital DESIRED_ALIGN-ALIGN
7702 bytes and last SIZE bytes adjusitng DESTPTR/SRCPTR/COUNT in a way we can
7703 proceed with an loop copying SIZE bytes at once. Do moves in MODE.
7704 DONE_LABEL is a label after the whole copying sequence. The label is created
7705 on demand if *DONE_LABEL is NULL.
7706 MIN_SIZE is minimal size of block copied. This value gets adjusted for new
7707 bounds after the initial copies.
7708
7709 DESTMEM/SRCMEM are memory expressions pointing to the copies block,
7710 DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicate whether
7711 we will dispatch to a library call for large blocks.
7712
7713 In pseudocode we do:
7714
7715 if (COUNT < SIZE)
7716 {
7717 Assume that SIZE is 4. Bigger sizes are handled analogously
7718 if (COUNT & 4)
7719 {
7720 copy 4 bytes from SRCPTR to DESTPTR
7721 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
7722 goto done_label
7723 }
7724 if (!COUNT)
7725 goto done_label;
7726 copy 1 byte from SRCPTR to DESTPTR
7727 if (COUNT & 2)
7728 {
7729 copy 2 bytes from SRCPTR to DESTPTR
7730 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
7731 }
7732 }
7733 else
7734 {
7735 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
7736 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
7737
7738 OLD_DESPTR = DESTPTR;
7739 Align DESTPTR up to DESIRED_ALIGN
7740 SRCPTR += DESTPTR - OLD_DESTPTR
7741 COUNT -= DEST_PTR - OLD_DESTPTR
7742 if (DYNAMIC_CHECK)
7743 Round COUNT down to multiple of SIZE
7744 << optional caller supplied zero size guard is here >>
7745 << optional caller supplied dynamic check is here >>
7746 << caller supplied main copy loop is here >>
7747 }
7748 done_label:
7749 */
7750static void
76715c32 7751expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
2bf6d935
ML
7752 rtx *destptr, rtx *srcptr,
7753 machine_mode mode,
7754 rtx value, rtx vec_value,
7755 rtx *count,
7756 rtx_code_label **done_label,
7757 int size,
7758 int desired_align,
7759 int align,
7760 unsigned HOST_WIDE_INT *min_size,
7761 bool dynamic_check,
7762 bool issetmem)
7763{
7764 rtx_code_label *loop_label = NULL, *label;
7765 int n;
7766 rtx modesize;
7767 int prolog_size = 0;
7768 rtx mode_value;
7769
7770 /* Chose proper value to copy. */
7771 if (issetmem && VECTOR_MODE_P (mode))
7772 mode_value = vec_value;
7773 else
7774 mode_value = value;
7775 gcc_assert (GET_MODE_SIZE (mode) <= size);
7776
7777 /* See if block is big or small, handle small blocks. */
7778 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
7779 {
7780 int size2 = size;
7781 loop_label = gen_label_rtx ();
7782
7783 if (!*done_label)
7784 *done_label = gen_label_rtx ();
7785
7786 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
7787 1, loop_label);
7788 size2 >>= 1;
7789
7790 /* Handle sizes > 3. */
7791 for (;size2 > 2; size2 >>= 1)
76715c32 7792 expand_small_cpymem_or_setmem (destmem, srcmem,
2bf6d935
ML
7793 *destptr, *srcptr,
7794 value, vec_value,
7795 *count,
7796 size2, *done_label, issetmem);
7797 /* Nothing to copy? Jump to DONE_LABEL if so */
7798 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
7799 1, *done_label);
7800
7801 /* Do a byte copy. */
7802 destmem = change_address (destmem, QImode, *destptr);
7803 if (issetmem)
7804 emit_move_insn (destmem, gen_lowpart (QImode, value));
7805 else
7806 {
7807 srcmem = change_address (srcmem, QImode, *srcptr);
7808 emit_move_insn (destmem, srcmem);
7809 }
7810
7811 /* Handle sizes 2 and 3. */
7812 label = ix86_expand_aligntest (*count, 2, false);
7813 destmem = change_address (destmem, HImode, *destptr);
7814 destmem = offset_address (destmem, *count, 1);
7815 destmem = offset_address (destmem, GEN_INT (-2), 2);
7816 if (issetmem)
7817 emit_move_insn (destmem, gen_lowpart (HImode, value));
7818 else
7819 {
7820 srcmem = change_address (srcmem, HImode, *srcptr);
7821 srcmem = offset_address (srcmem, *count, 1);
7822 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
7823 emit_move_insn (destmem, srcmem);
7824 }
7825
7826 emit_label (label);
7827 LABEL_NUSES (label) = 1;
7828 emit_jump_insn (gen_jump (*done_label));
7829 emit_barrier ();
7830 }
7831 else
7832 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
7833 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
7834
7835 /* Start memcpy for COUNT >= SIZE. */
7836 if (loop_label)
7837 {
7838 emit_label (loop_label);
7839 LABEL_NUSES (loop_label) = 1;
7840 }
7841
7842 /* Copy first desired_align bytes. */
7843 if (!issetmem)
7844 srcmem = change_address (srcmem, mode, *srcptr);
7845 destmem = change_address (destmem, mode, *destptr);
7846 modesize = GEN_INT (GET_MODE_SIZE (mode));
7847 for (n = 0; prolog_size < desired_align - align; n++)
7848 {
7849 if (issetmem)
7850 emit_move_insn (destmem, mode_value);
7851 else
7852 {
7853 emit_move_insn (destmem, srcmem);
7854 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
7855 }
7856 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
7857 prolog_size += GET_MODE_SIZE (mode);
7858 }
7859
7860
7861 /* Copy last SIZE bytes. */
7862 destmem = offset_address (destmem, *count, 1);
7863 destmem = offset_address (destmem,
7864 GEN_INT (-size - prolog_size),
7865 1);
7866 if (issetmem)
7867 emit_move_insn (destmem, mode_value);
7868 else
7869 {
7870 srcmem = offset_address (srcmem, *count, 1);
7871 srcmem = offset_address (srcmem,
7872 GEN_INT (-size - prolog_size),
7873 1);
7874 emit_move_insn (destmem, srcmem);
7875 }
7876 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
7877 {
7878 destmem = offset_address (destmem, modesize, 1);
7879 if (issetmem)
7880 emit_move_insn (destmem, mode_value);
7881 else
7882 {
7883 srcmem = offset_address (srcmem, modesize, 1);
7884 emit_move_insn (destmem, srcmem);
7885 }
7886 }
7887
7888 /* Align destination. */
7889 if (desired_align > 1 && desired_align > align)
7890 {
7891 rtx saveddest = *destptr;
7892
7893 gcc_assert (desired_align <= size);
7894 /* Align destptr up, place it to new register. */
7895 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
7896 GEN_INT (prolog_size),
7897 NULL_RTX, 1, OPTAB_DIRECT);
7898 if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
7899 REG_POINTER (*destptr) = 1;
7900 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
7901 GEN_INT (-desired_align),
7902 *destptr, 1, OPTAB_DIRECT);
7903 /* See how many bytes we skipped. */
7904 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
7905 *destptr,
7906 saveddest, 1, OPTAB_DIRECT);
7907 /* Adjust srcptr and count. */
7908 if (!issetmem)
7909 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
7910 saveddest, *srcptr, 1, OPTAB_DIRECT);
7911 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
7912 saveddest, *count, 1, OPTAB_DIRECT);
7913 /* We copied at most size + prolog_size. */
7914 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
7915 *min_size
7916 = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
7917 else
7918 *min_size = 0;
7919
7920 /* Our loops always round down the block size, but for dispatch to
7921 library we need precise value. */
7922 if (dynamic_check)
7923 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
7924 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
7925 }
7926 else
7927 {
7928 gcc_assert (prolog_size == 0);
7929 /* Decrease count, so we won't end up copying last word twice. */
7930 if (!CONST_INT_P (*count))
7931 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
7932 constm1_rtx, *count, 1, OPTAB_DIRECT);
7933 else
7934 *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
7935 (unsigned HOST_WIDE_INT)size));
7936 if (*min_size)
7937 *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
7938 }
7939}
7940
7941
7942/* This function is like the previous one, except here we know how many bytes
7943 need to be copied. That allows us to update alignment not only of DST, which
7944 is returned, but also of SRC, which is passed as a pointer for that
7945 reason. */
7946static rtx
76715c32 7947expand_set_or_cpymem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
2bf6d935
ML
7948 rtx srcreg, rtx value, rtx vec_value,
7949 int desired_align, int align_bytes,
7950 bool issetmem)
7951{
7952 rtx src = NULL;
7953 rtx orig_dst = dst;
7954 rtx orig_src = NULL;
7955 int piece_size = 1;
7956 int copied_bytes = 0;
7957
7958 if (!issetmem)
7959 {
7960 gcc_assert (srcp != NULL);
7961 src = *srcp;
7962 orig_src = src;
7963 }
7964
7965 for (piece_size = 1;
7966 piece_size <= desired_align && copied_bytes < align_bytes;
7967 piece_size <<= 1)
7968 {
7969 if (align_bytes & piece_size)
7970 {
7971 if (issetmem)
7972 {
7973 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
7974 dst = emit_memset (dst, destreg, vec_value, piece_size);
7975 else
7976 dst = emit_memset (dst, destreg, value, piece_size);
7977 }
7978 else
7979 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
7980 copied_bytes += piece_size;
7981 }
7982 }
7983 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
7984 set_mem_align (dst, desired_align * BITS_PER_UNIT);
7985 if (MEM_SIZE_KNOWN_P (orig_dst))
7986 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
7987
7988 if (!issetmem)
7989 {
7990 int src_align_bytes = get_mem_align_offset (src, desired_align
7991 * BITS_PER_UNIT);
7992 if (src_align_bytes >= 0)
7993 src_align_bytes = desired_align - src_align_bytes;
7994 if (src_align_bytes >= 0)
7995 {
7996 unsigned int src_align;
7997 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
7998 {
7999 if ((src_align_bytes & (src_align - 1))
8000 == (align_bytes & (src_align - 1)))
8001 break;
8002 }
8003 if (src_align > (unsigned int) desired_align)
8004 src_align = desired_align;
8005 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
8006 set_mem_align (src, src_align * BITS_PER_UNIT);
8007 }
8008 if (MEM_SIZE_KNOWN_P (orig_src))
8009 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
8010 *srcp = src;
8011 }
8012
8013 return dst;
8014}
8015
8016/* Return true if ALG can be used in current context.
8017 Assume we expand memset if MEMSET is true. */
8018static bool
8019alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
8020{
8021 if (alg == no_stringop)
8022 return false;
8023 if (alg == vector_loop)
8024 return TARGET_SSE || TARGET_AVX;
8025 /* Algorithms using the rep prefix want at least edi and ecx;
8026 additionally, memset wants eax and memcpy wants esi. Don't
8027 consider such algorithms if the user has appropriated those
8028 registers for their own purposes, or if we have a non-default
8029 address space, since some string insns cannot override the segment. */
8030 if (alg == rep_prefix_1_byte
8031 || alg == rep_prefix_4_byte
8032 || alg == rep_prefix_8_byte)
8033 {
8034 if (have_as)
8035 return false;
8036 if (fixed_regs[CX_REG]
8037 || fixed_regs[DI_REG]
8038 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
8039 return false;
8040 }
8041 return true;
8042}
8043
8044/* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
8045static enum stringop_alg
8046decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
8047 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
8048 bool memset, bool zero_memset, bool have_as,
8049 int *dynamic_check, bool *noalign, bool recur)
8050{
8051 const struct stringop_algs *algs;
8052 bool optimize_for_speed;
8053 int max = 0;
8054 const struct processor_costs *cost;
8055 int i;
8056 bool any_alg_usable_p = false;
8057
8058 *noalign = false;
8059 *dynamic_check = -1;
8060
8061 /* Even if the string operation call is cold, we still might spend a lot
8062 of time processing large blocks. */
8063 if (optimize_function_for_size_p (cfun)
8064 || (optimize_insn_for_size_p ()
8065 && (max_size < 256
8066 || (expected_size != -1 && expected_size < 256))))
8067 optimize_for_speed = false;
8068 else
8069 optimize_for_speed = true;
8070
8071 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
8072 if (memset)
8073 algs = &cost->memset[TARGET_64BIT != 0];
8074 else
8075 algs = &cost->memcpy[TARGET_64BIT != 0];
8076
8077 /* See maximal size for user defined algorithm. */
8078 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
8079 {
8080 enum stringop_alg candidate = algs->size[i].alg;
8081 bool usable = alg_usable_p (candidate, memset, have_as);
8082 any_alg_usable_p |= usable;
8083
8084 if (candidate != libcall && candidate && usable)
8085 max = algs->size[i].max;
8086 }
8087
8088 /* If expected size is not known but max size is small enough
8089 so inline version is a win, set expected size into
8090 the range. */
8091 if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
8092 && expected_size == -1)
8093 expected_size = min_size / 2 + max_size / 2;
8094
8095 /* If user specified the algorithm, honor it if possible. */
8096 if (ix86_stringop_alg != no_stringop
8097 && alg_usable_p (ix86_stringop_alg, memset, have_as))
8098 return ix86_stringop_alg;
8099 /* rep; movq or rep; movl is the smallest variant. */
8100 else if (!optimize_for_speed)
8101 {
8102 *noalign = true;
8103 if (!count || (count & 3) || (memset && !zero_memset))
8104 return alg_usable_p (rep_prefix_1_byte, memset, have_as)
8105 ? rep_prefix_1_byte : loop_1_byte;
8106 else
8107 return alg_usable_p (rep_prefix_4_byte, memset, have_as)
8108 ? rep_prefix_4_byte : loop;
8109 }
8110 /* Very tiny blocks are best handled via the loop, REP is expensive to
8111 setup. */
8112 else if (expected_size != -1 && expected_size < 4)
8113 return loop_1_byte;
8114 else if (expected_size != -1)
8115 {
8116 enum stringop_alg alg = libcall;
8117 bool alg_noalign = false;
8118 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
8119 {
8120 /* We get here if the algorithms that were not libcall-based
8121 were rep-prefix based and we are unable to use rep prefixes
8122 based on global register usage. Break out of the loop and
8123 use the heuristic below. */
8124 if (algs->size[i].max == 0)
8125 break;
8126 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
8127 {
8128 enum stringop_alg candidate = algs->size[i].alg;
8129
8130 if (candidate != libcall
8131 && alg_usable_p (candidate, memset, have_as))
8132 {
8133 alg = candidate;
8134 alg_noalign = algs->size[i].noalign;
8135 }
8136 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
8137 last non-libcall inline algorithm. */
8138 if (TARGET_INLINE_ALL_STRINGOPS)
8139 {
8140 /* When the current size is best to be copied by a libcall,
8141 but we are still forced to inline, run the heuristic below
8142 that will pick code for medium sized blocks. */
8143 if (alg != libcall)
8144 {
8145 *noalign = alg_noalign;
8146 return alg;
8147 }
8148 else if (!any_alg_usable_p)
8149 break;
8150 }
bf24f4ec
L
8151 else if (alg_usable_p (candidate, memset, have_as)
8152 && !(TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
8153 && candidate == rep_prefix_1_byte
8154 /* NB: If min_size != max_size, size is
8155 unknown. */
8156 && min_size != max_size))
2bf6d935
ML
8157 {
8158 *noalign = algs->size[i].noalign;
8159 return candidate;
8160 }
8161 }
8162 }
8163 }
8164 /* When asked to inline the call anyway, try to pick meaningful choice.
8165 We look for maximal size of block that is faster to copy by hand and
8166 take blocks of at most of that size guessing that average size will
8167 be roughly half of the block.
8168
8169 If this turns out to be bad, we might simply specify the preferred
8170 choice in ix86_costs. */
8171 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
8172 && (algs->unknown_size == libcall
8173 || !alg_usable_p (algs->unknown_size, memset, have_as)))
8174 {
8175 enum stringop_alg alg;
8176 HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
8177
8178 /* If there aren't any usable algorithms or if recursing already,
8179 then recursing on smaller sizes or same size isn't going to
8180 find anything. Just return the simple byte-at-a-time copy loop. */
8181 if (!any_alg_usable_p || recur)
8182 {
8183 /* Pick something reasonable. */
8184 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
8185 *dynamic_check = 128;
8186 return loop_1_byte;
8187 }
8188 alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
8189 zero_memset, have_as, dynamic_check, noalign, true);
8190 gcc_assert (*dynamic_check == -1);
8191 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
8192 *dynamic_check = max;
8193 else
8194 gcc_assert (alg != libcall);
8195 return alg;
8196 }
8197 return (alg_usable_p (algs->unknown_size, memset, have_as)
8198 ? algs->unknown_size : libcall);
8199}
8200
8201/* Decide on alignment. We know that the operand is already aligned to ALIGN
8202 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
8203static int
8204decide_alignment (int align,
8205 enum stringop_alg alg,
8206 int expected_size,
8207 machine_mode move_mode)
8208{
8209 int desired_align = 0;
8210
8211 gcc_assert (alg != no_stringop);
8212
8213 if (alg == libcall)
8214 return 0;
8215 if (move_mode == VOIDmode)
8216 return 0;
8217
8218 desired_align = GET_MODE_SIZE (move_mode);
8219 /* PentiumPro has special logic triggering for 8 byte aligned blocks.
8220 copying whole cacheline at once. */
f23881fc 8221 if (TARGET_CPU_P (PENTIUMPRO)
2bf6d935
ML
8222 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
8223 desired_align = 8;
8224
8225 if (optimize_size)
8226 desired_align = 1;
8227 if (desired_align < align)
8228 desired_align = align;
8229 if (expected_size != -1 && expected_size < 4)
8230 desired_align = align;
8231
8232 return desired_align;
8233}
8234
8235
8236/* Helper function for memcpy. For QImode value 0xXY produce
8237 0xXYXYXYXY of wide specified by MODE. This is essentially
8238 a * 0x10101010, but we can do slightly better than
8239 synth_mult by unwinding the sequence by hand on CPUs with
8240 slow multiply. */
8241static rtx
8242promote_duplicated_reg (machine_mode mode, rtx val)
8243{
8244 machine_mode valmode = GET_MODE (val);
8245 rtx tmp;
8246 int nops = mode == DImode ? 3 : 2;
8247
8248 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
8249 if (val == const0_rtx)
8250 return copy_to_mode_reg (mode, CONST0_RTX (mode));
8251 if (CONST_INT_P (val))
8252 {
8253 HOST_WIDE_INT v = INTVAL (val) & 255;
8254
8255 v |= v << 8;
8256 v |= v << 16;
8257 if (mode == DImode)
8258 v |= (v << 16) << 16;
8259 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
8260 }
8261
8262 if (valmode == VOIDmode)
8263 valmode = QImode;
8264 if (valmode != QImode)
8265 val = gen_lowpart (QImode, val);
8266 if (mode == QImode)
8267 return val;
8268 if (!TARGET_PARTIAL_REG_STALL)
8269 nops--;
8270 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
8271 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
8272 <= (ix86_cost->shift_const + ix86_cost->add) * nops
8273 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
8274 {
8275 rtx reg = convert_modes (mode, QImode, val, true);
8276 tmp = promote_duplicated_reg (mode, const1_rtx);
8277 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
8278 OPTAB_DIRECT);
8279 }
8280 else
8281 {
8282 rtx reg = convert_modes (mode, QImode, val, true);
8283
8284 if (!TARGET_PARTIAL_REG_STALL)
e9539592 8285 emit_insn (gen_insv_1 (mode, reg, reg));
2bf6d935
ML
8286 else
8287 {
8288 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
8289 NULL, 1, OPTAB_DIRECT);
8290 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1,
8291 OPTAB_DIRECT);
8292 }
8293 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
8294 NULL, 1, OPTAB_DIRECT);
8295 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
8296 if (mode == SImode)
8297 return reg;
8298 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
8299 NULL, 1, OPTAB_DIRECT);
8300 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
8301 return reg;
8302 }
8303}
8304
8305/* Duplicate value VAL using promote_duplicated_reg into maximal size that will
8306 be needed by main loop copying SIZE_NEEDED chunks and prologue getting
8307 alignment from ALIGN to DESIRED_ALIGN. */
8308static rtx
8309promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
8310 int align)
8311{
8312 rtx promoted_val;
8313
8314 if (TARGET_64BIT
8315 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
8316 promoted_val = promote_duplicated_reg (DImode, val);
8317 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
8318 promoted_val = promote_duplicated_reg (SImode, val);
8319 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
8320 promoted_val = promote_duplicated_reg (HImode, val);
8321 else
8322 promoted_val = val;
8323
8324 return promoted_val;
8325}
8326
8327/* Copy the address to a Pmode register. This is used for x32 to
8328 truncate DImode TLS address to a SImode register. */
8329
8330static rtx
8331ix86_copy_addr_to_reg (rtx addr)
8332{
8333 rtx reg;
8334 if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
8335 {
8336 reg = copy_addr_to_reg (addr);
8337 REG_POINTER (reg) = 1;
8338 return reg;
8339 }
8340 else
8341 {
8342 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
8343 reg = copy_to_mode_reg (DImode, addr);
8344 REG_POINTER (reg) = 1;
8345 return gen_rtx_SUBREG (SImode, reg, 0);
8346 }
8347}
8348
8349/* Expand string move (memcpy) ot store (memset) operation. Use i386 string
8350 operations when profitable. The code depends upon architecture, block size
8351 and alignment, but always has one of the following overall structures:
8352
8353 Aligned move sequence:
8354
8355 1) Prologue guard: Conditional that jumps up to epilogues for small
8356 blocks that can be handled by epilogue alone. This is faster
8357 but also needed for correctness, since prologue assume the block
8358 is larger than the desired alignment.
8359
8360 Optional dynamic check for size and libcall for large
8361 blocks is emitted here too, with -minline-stringops-dynamically.
8362
8363 2) Prologue: copy first few bytes in order to get destination
8364 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
8365 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
8366 copied. We emit either a jump tree on power of two sized
8367 blocks, or a byte loop.
8368
8369 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
8370 with specified algorithm.
8371
8372 4) Epilogue: code copying tail of the block that is too small to be
8373 handled by main body (or up to size guarded by prologue guard).
8374
8375 Misaligned move sequence
8376
8377 1) missaligned move prologue/epilogue containing:
8378 a) Prologue handling small memory blocks and jumping to done_label
8379 (skipped if blocks are known to be large enough)
8380 b) Signle move copying first DESIRED_ALIGN-ALIGN bytes if alignment is
8381 needed by single possibly misaligned move
8382 (skipped if alignment is not needed)
8383 c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
8384
8385 2) Zero size guard dispatching to done_label, if needed
8386
8387 3) dispatch to library call, if needed,
8388
8389 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
8390 with specified algorithm. */
8391bool
76715c32 8392ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
2bf6d935
ML
8393 rtx align_exp, rtx expected_align_exp,
8394 rtx expected_size_exp, rtx min_size_exp,
8395 rtx max_size_exp, rtx probable_max_size_exp,
8396 bool issetmem)
8397{
8398 rtx destreg;
8399 rtx srcreg = NULL;
8400 rtx_code_label *label = NULL;
8401 rtx tmp;
8402 rtx_code_label *jump_around_label = NULL;
8403 HOST_WIDE_INT align = 1;
8404 unsigned HOST_WIDE_INT count = 0;
8405 HOST_WIDE_INT expected_size = -1;
8406 int size_needed = 0, epilogue_size_needed;
8407 int desired_align = 0, align_bytes = 0;
8408 enum stringop_alg alg;
8409 rtx promoted_val = NULL;
8410 rtx vec_promoted_val = NULL;
8411 bool force_loopy_epilogue = false;
8412 int dynamic_check;
8413 bool need_zero_guard = false;
8414 bool noalign;
8415 machine_mode move_mode = VOIDmode;
8416 machine_mode wider_mode;
8417 int unroll_factor = 1;
8418 /* TODO: Once value ranges are available, fill in proper data. */
8419 unsigned HOST_WIDE_INT min_size = 0;
8420 unsigned HOST_WIDE_INT max_size = -1;
8421 unsigned HOST_WIDE_INT probable_max_size = -1;
8422 bool misaligned_prologue_used = false;
8423 bool have_as;
8424
8425 if (CONST_INT_P (align_exp))
8426 align = INTVAL (align_exp);
8427 /* i386 can do misaligned access on reasonably increased cost. */
8428 if (CONST_INT_P (expected_align_exp)
8429 && INTVAL (expected_align_exp) > align)
8430 align = INTVAL (expected_align_exp);
8431 /* ALIGN is the minimum of destination and source alignment, but we care here
8432 just about destination alignment. */
8433 else if (!issetmem
8434 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
8435 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
8436
8437 if (CONST_INT_P (count_exp))
8438 {
8439 min_size = max_size = probable_max_size = count = expected_size
8440 = INTVAL (count_exp);
8441 /* When COUNT is 0, there is nothing to do. */
8442 if (!count)
8443 return true;
8444 }
8445 else
8446 {
8447 if (min_size_exp)
8448 min_size = INTVAL (min_size_exp);
8449 if (max_size_exp)
8450 max_size = INTVAL (max_size_exp);
8451 if (probable_max_size_exp)
8452 probable_max_size = INTVAL (probable_max_size_exp);
8453 if (CONST_INT_P (expected_size_exp))
8454 expected_size = INTVAL (expected_size_exp);
8455 }
8456
8457 /* Make sure we don't need to care about overflow later on. */
8458 if (count > (HOST_WIDE_INT_1U << 30))
8459 return false;
8460
8461 have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
8462 if (!issetmem)
8463 have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
8464
8465 /* Step 0: Decide on preferred algorithm, desired alignment and
8466 size of chunks to be copied by main loop. */
8467 alg = decide_alg (count, expected_size, min_size, probable_max_size,
8468 issetmem,
8469 issetmem && val_exp == const0_rtx, have_as,
8470 &dynamic_check, &noalign, false);
8471
8472 if (dump_file)
8473 fprintf (dump_file, "Selected stringop expansion strategy: %s\n",
8474 stringop_alg_names[alg]);
8475
8476 if (alg == libcall)
8477 return false;
8478 gcc_assert (alg != no_stringop);
8479
8480 /* For now vector-version of memset is generated only for memory zeroing, as
8481 creating of promoted vector value is very cheap in this case. */
8482 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
8483 alg = unrolled_loop;
8484
8485 if (!count)
8486 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
8487 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
8488 if (!issetmem)
8489 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
8490
8491 unroll_factor = 1;
8492 move_mode = word_mode;
8493 switch (alg)
8494 {
8495 case libcall:
8496 case no_stringop:
8497 case last_alg:
8498 gcc_unreachable ();
8499 case loop_1_byte:
8500 need_zero_guard = true;
8501 move_mode = QImode;
8502 break;
8503 case loop:
8504 need_zero_guard = true;
8505 break;
8506 case unrolled_loop:
8507 need_zero_guard = true;
8508 unroll_factor = (TARGET_64BIT ? 4 : 2);
8509 break;
8510 case vector_loop:
8511 need_zero_guard = true;
8512 unroll_factor = 4;
8513 /* Find the widest supported mode. */
8514 move_mode = word_mode;
8515 while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode)
8516 && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
8517 move_mode = wider_mode;
8518
586bbef1 8519 if (TARGET_AVX256_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 128)
2bf6d935
ML
8520 move_mode = TImode;
8521
8522 /* Find the corresponding vector mode with the same size as MOVE_MODE.
8523 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
8524 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
8525 {
8526 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
8527 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
8528 || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
8529 move_mode = word_mode;
8530 }
8531 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
8532 break;
8533 case rep_prefix_8_byte:
8534 move_mode = DImode;
8535 break;
8536 case rep_prefix_4_byte:
8537 move_mode = SImode;
8538 break;
8539 case rep_prefix_1_byte:
8540 move_mode = QImode;
8541 break;
8542 }
8543 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
8544 epilogue_size_needed = size_needed;
8545
8546 /* If we are going to call any library calls conditionally, make sure any
8547 pending stack adjustment happen before the first conditional branch,
8548 otherwise they will be emitted before the library call only and won't
8549 happen from the other branches. */
8550 if (dynamic_check != -1)
8551 do_pending_stack_adjust ();
8552
8553 desired_align = decide_alignment (align, alg, expected_size, move_mode);
8554 if (!TARGET_ALIGN_STRINGOPS || noalign)
8555 align = desired_align;
8556
8557 /* Step 1: Prologue guard. */
8558
8559 /* Alignment code needs count to be in register. */
8560 if (CONST_INT_P (count_exp) && desired_align > align)
8561 {
8562 if (INTVAL (count_exp) > desired_align
8563 && INTVAL (count_exp) > size_needed)
8564 {
8565 align_bytes
8566 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
8567 if (align_bytes <= 0)
8568 align_bytes = 0;
8569 else
8570 align_bytes = desired_align - align_bytes;
8571 }
8572 if (align_bytes == 0)
8573 count_exp = force_reg (counter_mode (count_exp), count_exp);
8574 }
8575 gcc_assert (desired_align >= 1 && align >= 1);
8576
8577 /* Misaligned move sequences handle both prologue and epilogue at once.
8578 Default code generation results in a smaller code for large alignments
8579 and also avoids redundant job when sizes are known precisely. */
8580 misaligned_prologue_used
8581 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
8582 && MAX (desired_align, epilogue_size_needed) <= 32
8583 && desired_align <= epilogue_size_needed
8584 && ((desired_align > align && !align_bytes)
8585 || (!count && epilogue_size_needed > 1)));
8586
8587 /* Do the cheap promotion to allow better CSE across the
8588 main loop and epilogue (ie one load of the big constant in the
8589 front of all code.
8590 For now the misaligned move sequences do not have fast path
8591 without broadcasting. */
8592 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
8593 {
8594 if (alg == vector_loop)
8595 {
8596 gcc_assert (val_exp == const0_rtx);
8597 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
8598 promoted_val = promote_duplicated_reg_to_size (val_exp,
8599 GET_MODE_SIZE (word_mode),
8600 desired_align, align);
8601 }
8602 else
8603 {
8604 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
8605 desired_align, align);
8606 }
8607 }
8608 /* Misaligned move sequences handles both prologues and epilogues at once.
8609 Default code generation results in smaller code for large alignments and
8610 also avoids redundant job when sizes are known precisely. */
8611 if (misaligned_prologue_used)
8612 {
8613 /* Misaligned move prologue handled small blocks by itself. */
76715c32 8614 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves
2bf6d935
ML
8615 (dst, src, &destreg, &srcreg,
8616 move_mode, promoted_val, vec_promoted_val,
8617 &count_exp,
8618 &jump_around_label,
8619 desired_align < align
8620 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
8621 desired_align, align, &min_size, dynamic_check, issetmem);
8622 if (!issetmem)
8623 src = change_address (src, BLKmode, srcreg);
8624 dst = change_address (dst, BLKmode, destreg);
8625 set_mem_align (dst, desired_align * BITS_PER_UNIT);
8626 epilogue_size_needed = 0;
8627 if (need_zero_guard
8628 && min_size < (unsigned HOST_WIDE_INT) size_needed)
8629 {
8630 /* It is possible that we copied enough so the main loop will not
8631 execute. */
8632 gcc_assert (size_needed > 1);
8633 if (jump_around_label == NULL_RTX)
8634 jump_around_label = gen_label_rtx ();
8635 emit_cmp_and_jump_insns (count_exp,
8636 GEN_INT (size_needed),
8637 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
8638 if (expected_size == -1
8639 || expected_size < (desired_align - align) / 2 + size_needed)
8640 predict_jump (REG_BR_PROB_BASE * 20 / 100);
8641 else
8642 predict_jump (REG_BR_PROB_BASE * 60 / 100);
8643 }
8644 }
8645 /* Ensure that alignment prologue won't copy past end of block. */
8646 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
8647 {
8648 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
8649 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
8650 Make sure it is power of 2. */
8651 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
8652
8653 /* To improve performance of small blocks, we jump around the VAL
8654 promoting mode. This mean that if the promoted VAL is not constant,
8655 we might not use it in the epilogue and have to use byte
8656 loop variant. */
8657 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
8658 force_loopy_epilogue = true;
8659 if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
8660 || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
8661 {
8662 /* If main algorithm works on QImode, no epilogue is needed.
8663 For small sizes just don't align anything. */
8664 if (size_needed == 1)
8665 desired_align = align;
8666 else
8667 goto epilogue;
8668 }
8669 else if (!count
8670 && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
8671 {
8672 label = gen_label_rtx ();
8673 emit_cmp_and_jump_insns (count_exp,
8674 GEN_INT (epilogue_size_needed),
8675 LTU, 0, counter_mode (count_exp), 1, label);
8676 if (expected_size == -1 || expected_size < epilogue_size_needed)
8677 predict_jump (REG_BR_PROB_BASE * 60 / 100);
8678 else
8679 predict_jump (REG_BR_PROB_BASE * 20 / 100);
8680 }
8681 }
8682
8683 /* Emit code to decide on runtime whether library call or inline should be
8684 used. */
8685 if (dynamic_check != -1)
8686 {
8687 if (!issetmem && CONST_INT_P (count_exp))
8688 {
8689 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
8690 {
8691 emit_block_copy_via_libcall (dst, src, count_exp);
8692 count_exp = const0_rtx;
8693 goto epilogue;
8694 }
8695 }
8696 else
8697 {
8698 rtx_code_label *hot_label = gen_label_rtx ();
8699 if (jump_around_label == NULL_RTX)
8700 jump_around_label = gen_label_rtx ();
8701 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
8702 LEU, 0, counter_mode (count_exp),
8703 1, hot_label);
8704 predict_jump (REG_BR_PROB_BASE * 90 / 100);
8705 if (issetmem)
8706 set_storage_via_libcall (dst, count_exp, val_exp);
8707 else
8708 emit_block_copy_via_libcall (dst, src, count_exp);
8709 emit_jump (jump_around_label);
8710 emit_label (hot_label);
8711 }
8712 }
8713
8714 /* Step 2: Alignment prologue. */
8715 /* Do the expensive promotion once we branched off the small blocks. */
8716 if (issetmem && !promoted_val)
8717 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
8718 desired_align, align);
8719
8720 if (desired_align > align && !misaligned_prologue_used)
8721 {
8722 if (align_bytes == 0)
8723 {
8724 /* Except for the first move in prologue, we no longer know
8725 constant offset in aliasing info. It don't seems to worth
8726 the pain to maintain it for the first move, so throw away
8727 the info early. */
8728 dst = change_address (dst, BLKmode, destreg);
8729 if (!issetmem)
8730 src = change_address (src, BLKmode, srcreg);
76715c32 8731 dst = expand_set_or_cpymem_prologue (dst, src, destreg, srcreg,
2bf6d935
ML
8732 promoted_val, vec_promoted_val,
8733 count_exp, align, desired_align,
8734 issetmem);
8735 /* At most desired_align - align bytes are copied. */
8736 if (min_size < (unsigned)(desired_align - align))
8737 min_size = 0;
8738 else
8739 min_size -= desired_align - align;
8740 }
8741 else
8742 {
8743 /* If we know how many bytes need to be stored before dst is
8744 sufficiently aligned, maintain aliasing info accurately. */
76715c32 8745 dst = expand_set_or_cpymem_constant_prologue (dst, &src, destreg,
2bf6d935
ML
8746 srcreg,
8747 promoted_val,
8748 vec_promoted_val,
8749 desired_align,
8750 align_bytes,
8751 issetmem);
8752
8753 count_exp = plus_constant (counter_mode (count_exp),
8754 count_exp, -align_bytes);
8755 count -= align_bytes;
8756 min_size -= align_bytes;
8757 max_size -= align_bytes;
8758 }
8759 if (need_zero_guard
8760 && min_size < (unsigned HOST_WIDE_INT) size_needed
8761 && (count < (unsigned HOST_WIDE_INT) size_needed
8762 || (align_bytes == 0
8763 && count < ((unsigned HOST_WIDE_INT) size_needed
8764 + desired_align - align))))
8765 {
8766 /* It is possible that we copied enough so the main loop will not
8767 execute. */
8768 gcc_assert (size_needed > 1);
8769 if (label == NULL_RTX)
8770 label = gen_label_rtx ();
8771 emit_cmp_and_jump_insns (count_exp,
8772 GEN_INT (size_needed),
8773 LTU, 0, counter_mode (count_exp), 1, label);
8774 if (expected_size == -1
8775 || expected_size < (desired_align - align) / 2 + size_needed)
8776 predict_jump (REG_BR_PROB_BASE * 20 / 100);
8777 else
8778 predict_jump (REG_BR_PROB_BASE * 60 / 100);
8779 }
8780 }
8781 if (label && size_needed == 1)
8782 {
8783 emit_label (label);
8784 LABEL_NUSES (label) = 1;
8785 label = NULL;
8786 epilogue_size_needed = 1;
8787 if (issetmem)
8788 promoted_val = val_exp;
8789 }
8790 else if (label == NULL_RTX && !misaligned_prologue_used)
8791 epilogue_size_needed = size_needed;
8792
8793 /* Step 3: Main loop. */
8794
8795 switch (alg)
8796 {
8797 case libcall:
8798 case no_stringop:
8799 case last_alg:
8800 gcc_unreachable ();
8801 case loop_1_byte:
8802 case loop:
8803 case unrolled_loop:
76715c32 8804 expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg, promoted_val,
2bf6d935
ML
8805 count_exp, move_mode, unroll_factor,
8806 expected_size, issetmem);
8807 break;
8808 case vector_loop:
76715c32 8809 expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg,
2bf6d935
ML
8810 vec_promoted_val, count_exp, move_mode,
8811 unroll_factor, expected_size, issetmem);
8812 break;
8813 case rep_prefix_8_byte:
8814 case rep_prefix_4_byte:
8815 case rep_prefix_1_byte:
76715c32 8816 expand_set_or_cpymem_via_rep (dst, src, destreg, srcreg, promoted_val,
2bf6d935
ML
8817 val_exp, count_exp, move_mode, issetmem);
8818 break;
8819 }
8820 /* Adjust properly the offset of src and dest memory for aliasing. */
8821 if (CONST_INT_P (count_exp))
8822 {
8823 if (!issetmem)
8824 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
8825 (count / size_needed) * size_needed);
8826 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
8827 (count / size_needed) * size_needed);
8828 }
8829 else
8830 {
8831 if (!issetmem)
8832 src = change_address (src, BLKmode, srcreg);
8833 dst = change_address (dst, BLKmode, destreg);
8834 }
8835
8836 /* Step 4: Epilogue to copy the remaining bytes. */
8837 epilogue:
8838 if (label)
8839 {
8840 /* When the main loop is done, COUNT_EXP might hold original count,
8841 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
8842 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
8843 bytes. Compensate if needed. */
8844
8845 if (size_needed < epilogue_size_needed)
8846 {
8847 tmp = expand_simple_binop (counter_mode (count_exp), AND, count_exp,
8848 GEN_INT (size_needed - 1), count_exp, 1,
8849 OPTAB_DIRECT);
8850 if (tmp != count_exp)
8851 emit_move_insn (count_exp, tmp);
8852 }
8853 emit_label (label);
8854 LABEL_NUSES (label) = 1;
8855 }
8856
8857 if (count_exp != const0_rtx && epilogue_size_needed > 1)
8858 {
8859 if (force_loopy_epilogue)
8860 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
8861 epilogue_size_needed);
8862 else
8863 {
8864 if (issetmem)
8865 expand_setmem_epilogue (dst, destreg, promoted_val,
8866 vec_promoted_val, count_exp,
8867 epilogue_size_needed);
8868 else
76715c32 8869 expand_cpymem_epilogue (dst, src, destreg, srcreg, count_exp,
2bf6d935
ML
8870 epilogue_size_needed);
8871 }
8872 }
8873 if (jump_around_label)
8874 emit_label (jump_around_label);
8875 return true;
8876}
8877
3edc21af
L
8878/* Expand cmpstrn or memcmp. */
8879
8880bool
8881ix86_expand_cmpstrn_or_cmpmem (rtx result, rtx src1, rtx src2,
8882 rtx length, rtx align, bool is_cmpstrn)
8883{
4052c05e
L
8884 /* Expand strncmp and memcmp only with -minline-all-stringops since
8885 "repz cmpsb" can be much slower than strncmp and memcmp functions
8886 implemented with vector instructions, see
8887
8888 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43052
8889 */
8890 if (!TARGET_INLINE_ALL_STRINGOPS)
3edc21af
L
8891 return false;
8892
8893 /* Can't use this if the user has appropriated ecx, esi or edi. */
8894 if (fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG])
8895 return false;
8896
8897 if (is_cmpstrn)
8898 {
8899 /* For strncmp, length is the maximum length, which can be larger
8900 than actual string lengths. We can expand the cmpstrn pattern
8901 to "repz cmpsb" only if one of the strings is a constant so
8902 that expand_builtin_strncmp() can write the length argument to
8903 be the minimum of the const string length and the actual length
8904 argument. Otherwise, "repz cmpsb" may pass the 0 byte. */
8905 tree t1 = MEM_EXPR (src1);
8906 tree t2 = MEM_EXPR (src2);
8907 if (!((t1 && TREE_CODE (t1) == MEM_REF
8908 && TREE_CODE (TREE_OPERAND (t1, 0)) == ADDR_EXPR
8909 && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t1, 0), 0))
8910 == STRING_CST))
8911 || (t2 && TREE_CODE (t2) == MEM_REF
8912 && TREE_CODE (TREE_OPERAND (t2, 0)) == ADDR_EXPR
8913 && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t2, 0), 0))
8914 == STRING_CST))))
8915 return false;
8916 }
3edc21af
L
8917
8918 rtx addr1 = copy_addr_to_reg (XEXP (src1, 0));
8919 rtx addr2 = copy_addr_to_reg (XEXP (src2, 0));
8920 if (addr1 != XEXP (src1, 0))
8921 src1 = replace_equiv_address_nv (src1, addr1);
8922 if (addr2 != XEXP (src2, 0))
8923 src2 = replace_equiv_address_nv (src2, addr2);
8924
8925 /* NB: Make a copy of the data length to avoid changing the original
8926 data length by cmpstrnqi patterns. */
8927 length = ix86_zero_extend_to_Pmode (length);
8928 rtx lengthreg = gen_reg_rtx (Pmode);
8929 emit_move_insn (lengthreg, length);
8930
8931 /* If we are testing strict equality, we can use known alignment to
8932 good advantage. This may be possible with combine, particularly
8933 once cc0 is dead. */
8934 if (CONST_INT_P (length))
8935 {
8936 if (length == const0_rtx)
8937 {
8938 emit_move_insn (result, const0_rtx);
8939 return true;
8940 }
8941 emit_insn (gen_cmpstrnqi_nz_1 (addr1, addr2, lengthreg, align,
8942 src1, src2));
8943 }
8944 else
8945 {
8946 emit_insn (gen_cmp_1 (Pmode, lengthreg, lengthreg));
8947 emit_insn (gen_cmpstrnqi_1 (addr1, addr2, lengthreg, align,
8948 src1, src2));
8949 }
8950
8951 rtx out = gen_lowpart (QImode, result);
8952 emit_insn (gen_cmpintqi (out));
8953 emit_move_insn (result, gen_rtx_SIGN_EXTEND (SImode, out));
8954
8955 return true;
8956}
2bf6d935
ML
8957
8958/* Expand the appropriate insns for doing strlen if not just doing
8959 repnz; scasb
8960
8961 out = result, initialized with the start address
8962 align_rtx = alignment of the address.
8963 scratch = scratch register, initialized with the startaddress when
8964 not aligned, otherwise undefined
8965
8966 This is just the body. It needs the initializations mentioned above and
8967 some address computing at the end. These things are done in i386.md. */
8968
8969static void
8970ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
8971{
8972 int align;
8973 rtx tmp;
8974 rtx_code_label *align_2_label = NULL;
8975 rtx_code_label *align_3_label = NULL;
8976 rtx_code_label *align_4_label = gen_label_rtx ();
8977 rtx_code_label *end_0_label = gen_label_rtx ();
8978 rtx mem;
8979 rtx tmpreg = gen_reg_rtx (SImode);
8980 rtx scratch = gen_reg_rtx (SImode);
8981 rtx cmp;
8982
8983 align = 0;
8984 if (CONST_INT_P (align_rtx))
8985 align = INTVAL (align_rtx);
8986
8987 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
8988
8989 /* Is there a known alignment and is it less than 4? */
8990 if (align < 4)
8991 {
8992 rtx scratch1 = gen_reg_rtx (Pmode);
8993 emit_move_insn (scratch1, out);
8994 /* Is there a known alignment and is it not 2? */
8995 if (align != 2)
8996 {
8997 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
8998 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
8999
9000 /* Leave just the 3 lower bits. */
9001 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
9002 NULL_RTX, 0, OPTAB_WIDEN);
9003
9004 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
9005 Pmode, 1, align_4_label);
9006 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
9007 Pmode, 1, align_2_label);
9008 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
9009 Pmode, 1, align_3_label);
9010 }
9011 else
9012 {
9013 /* Since the alignment is 2, we have to check 2 or 0 bytes;
9014 check if is aligned to 4 - byte. */
9015
9016 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
9017 NULL_RTX, 0, OPTAB_WIDEN);
9018
9019 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
9020 Pmode, 1, align_4_label);
9021 }
9022
9023 mem = change_address (src, QImode, out);
9024
9025 /* Now compare the bytes. */
9026
9027 /* Compare the first n unaligned byte on a byte per byte basis. */
9028 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
9029 QImode, 1, end_0_label);
9030
9031 /* Increment the address. */
d9330fb5 9032 emit_insn (gen_add2_insn (out, const1_rtx));
2bf6d935
ML
9033
9034 /* Not needed with an alignment of 2 */
9035 if (align != 2)
9036 {
9037 emit_label (align_2_label);
9038
9039 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
9040 end_0_label);
9041
d9330fb5 9042 emit_insn (gen_add2_insn (out, const1_rtx));
2bf6d935
ML
9043
9044 emit_label (align_3_label);
9045 }
9046
9047 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
9048 end_0_label);
9049
d9330fb5 9050 emit_insn (gen_add2_insn (out, const1_rtx));
2bf6d935
ML
9051 }
9052
9053 /* Generate loop to check 4 bytes at a time. It is not a good idea to
9054 align this loop. It gives only huge programs, but does not help to
9055 speed up. */
9056 emit_label (align_4_label);
9057
9058 mem = change_address (src, SImode, out);
9059 emit_move_insn (scratch, mem);
d9330fb5 9060 emit_insn (gen_add2_insn (out, GEN_INT (4)));
2bf6d935
ML
9061
9062 /* This formula yields a nonzero result iff one of the bytes is zero.
9063 This saves three branches inside loop and many cycles. */
9064
9065 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
9066 emit_insn (gen_one_cmplsi2 (scratch, scratch));
9067 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
9068 emit_insn (gen_andsi3 (tmpreg, tmpreg,
9069 gen_int_mode (0x80808080, SImode)));
9070 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
9071 align_4_label);
9072
9073 if (TARGET_CMOVE)
9074 {
9075 rtx reg = gen_reg_rtx (SImode);
9076 rtx reg2 = gen_reg_rtx (Pmode);
9077 emit_move_insn (reg, tmpreg);
9078 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
9079
9080 /* If zero is not in the first two bytes, move two bytes forward. */
9081 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
9082 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
9083 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
9084 emit_insn (gen_rtx_SET (tmpreg,
9085 gen_rtx_IF_THEN_ELSE (SImode, tmp,
9086 reg,
9087 tmpreg)));
9088 /* Emit lea manually to avoid clobbering of flags. */
c3185b64 9089 emit_insn (gen_rtx_SET (reg2, plus_constant (Pmode, out, 2)));
2bf6d935
ML
9090
9091 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
9092 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
9093 emit_insn (gen_rtx_SET (out,
9094 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
9095 reg2,
9096 out)));
9097 }
9098 else
9099 {
9100 rtx_code_label *end_2_label = gen_label_rtx ();
9101 /* Is zero in the first two bytes? */
9102
9103 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
9104 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
9105 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
9106 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
9107 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
9108 pc_rtx);
9109 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
9110 JUMP_LABEL (tmp) = end_2_label;
9111
9112 /* Not in the first two. Move two bytes forward. */
9113 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
d9330fb5 9114 emit_insn (gen_add2_insn (out, const2_rtx));
2bf6d935
ML
9115
9116 emit_label (end_2_label);
9117
9118 }
9119
9120 /* Avoid branch in fixing the byte. */
9121 tmpreg = gen_lowpart (QImode, tmpreg);
9122 emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
9123 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
9124 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
d9330fb5 9125 emit_insn (gen_sub3_carry (Pmode, out, out, GEN_INT (3), tmp, cmp));
2bf6d935
ML
9126
9127 emit_label (end_0_label);
9128}
9129
9130/* Expand strlen. */
9131
9132bool
9133ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
9134{
9135if (TARGET_UNROLL_STRLEN
9136 && TARGET_INLINE_ALL_STRINGOPS
9137 && eoschar == const0_rtx
9138 && optimize > 1)
9139 {
9140 /* The generic case of strlen expander is long. Avoid it's
9141 expanding unless TARGET_INLINE_ALL_STRINGOPS. */
9142 rtx addr = force_reg (Pmode, XEXP (src, 0));
9143 /* Well it seems that some optimizer does not combine a call like
9144 foo(strlen(bar), strlen(bar));
9145 when the move and the subtraction is done here. It does calculate
9146 the length just once when these instructions are done inside of
9147 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
9148 often used and I use one fewer register for the lifetime of
9149 output_strlen_unroll() this is better. */
9150
9151 emit_move_insn (out, addr);
9152
9153 ix86_expand_strlensi_unroll_1 (out, src, align);
9154
9155 /* strlensi_unroll_1 returns the address of the zero at the end of
9156 the string, like memchr(), so compute the length by subtracting
9157 the start address. */
d9330fb5 9158 emit_insn (gen_sub2_insn (out, addr));
2bf6d935
ML
9159 return true;
9160 }
9161 else
9162 return false;
9163}
9164
9165/* For given symbol (function) construct code to compute address of it's PLT
9166 entry in large x86-64 PIC model. */
9167
9168static rtx
9169construct_plt_address (rtx symbol)
9170{
9171 rtx tmp, unspec;
9172
9173 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
9174 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
9175 gcc_assert (Pmode == DImode);
9176
9177 tmp = gen_reg_rtx (Pmode);
9178 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
9179
9180 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
d9330fb5 9181 emit_insn (gen_add2_insn (tmp, pic_offset_table_rtx));
2bf6d935
ML
9182 return tmp;
9183}
9184
9185/* Additional registers that are clobbered by SYSV calls. */
9186
9187static int const x86_64_ms_sysv_extra_clobbered_registers
9188 [NUM_X86_64_MS_CLOBBERED_REGS] =
9189{
9190 SI_REG, DI_REG,
9191 XMM6_REG, XMM7_REG,
9192 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
9193 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
9194};
9195
9196rtx_insn *
9197ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
9198 rtx callarg2,
9199 rtx pop, bool sibcall)
9200{
9201 rtx vec[3];
9202 rtx use = NULL, call;
9203 unsigned int vec_len = 0;
9204 tree fndecl;
9205
9206 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
9207 {
9208 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
9209 if (fndecl
9210 && (lookup_attribute ("interrupt",
9211 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
a9c697b8 9212 error ("interrupt service routine cannot be called directly");
2bf6d935
ML
9213 }
9214 else
9215 fndecl = NULL_TREE;
9216
9217 if (pop == const0_rtx)
9218 pop = NULL;
9219 gcc_assert (!TARGET_64BIT || !pop);
9220
41bd1b19 9221 rtx addr = XEXP (fnaddr, 0);
2bf6d935
ML
9222 if (TARGET_MACHO && !TARGET_64BIT)
9223 {
9224#if TARGET_MACHO
9225 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
9226 fnaddr = machopic_indirect_call_target (fnaddr);
9227#endif
9228 }
9229 else
9230 {
9231 /* Static functions and indirect calls don't need the pic register. Also,
9232 check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
9233 it an indirect call. */
2bf6d935
ML
9234 if (flag_pic
9235 && GET_CODE (addr) == SYMBOL_REF
f7854b90 9236 && ix86_call_use_plt_p (addr))
2bf6d935
ML
9237 {
9238 if (flag_plt
9239 && (SYMBOL_REF_DECL (addr) == NULL_TREE
9240 || !lookup_attribute ("noplt",
9241 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
9242 {
9243 if (!TARGET_64BIT
9244 || (ix86_cmodel == CM_LARGE_PIC
9245 && DEFAULT_ABI != MS_ABI))
9246 {
9247 use_reg (&use, gen_rtx_REG (Pmode,
9248 REAL_PIC_OFFSET_TABLE_REGNUM));
9249 if (ix86_use_pseudo_pic_reg ())
9250 emit_move_insn (gen_rtx_REG (Pmode,
9251 REAL_PIC_OFFSET_TABLE_REGNUM),
9252 pic_offset_table_rtx);
9253 }
9254 }
9255 else if (!TARGET_PECOFF && !TARGET_MACHO)
9256 {
69157fe7
JJ
9257 if (TARGET_64BIT
9258 && ix86_cmodel == CM_LARGE_PIC
9259 && DEFAULT_ABI != MS_ABI)
9260 {
9261 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
9262 UNSPEC_GOT);
9263 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
9264 fnaddr = force_reg (Pmode, fnaddr);
9265 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, fnaddr);
9266 }
9267 else if (TARGET_64BIT)
2bf6d935
ML
9268 {
9269 fnaddr = gen_rtx_UNSPEC (Pmode,
9270 gen_rtvec (1, addr),
9271 UNSPEC_GOTPCREL);
9272 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
9273 }
9274 else
9275 {
9276 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
9277 UNSPEC_GOT);
9278 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
9279 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
9280 fnaddr);
9281 }
9282 fnaddr = gen_const_mem (Pmode, fnaddr);
9283 /* Pmode may not be the same as word_mode for x32, which
9284 doesn't support indirect branch via 32-bit memory slot.
9285 Since x32 GOT slot is 64 bit with zero upper 32 bits,
9286 indirect branch via x32 GOT slot is OK. */
9287 if (GET_MODE (fnaddr) != word_mode)
9288 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
9289 fnaddr = gen_rtx_MEM (QImode, fnaddr);
9290 }
9291 }
9292 }
9293
9294 /* Skip setting up RAX register for -mskip-rax-setup when there are no
9295 parameters passed in vector registers. */
9296 if (TARGET_64BIT
9297 && (INTVAL (callarg2) > 0
9298 || (INTVAL (callarg2) == 0
9299 && (TARGET_SSE || !flag_skip_rax_setup))))
9300 {
9301 rtx al = gen_rtx_REG (QImode, AX_REG);
9302 emit_move_insn (al, callarg2);
9303 use_reg (&use, al);
9304 }
9305
9306 if (ix86_cmodel == CM_LARGE_PIC
9307 && !TARGET_PECOFF
9308 && MEM_P (fnaddr)
9309 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
9310 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
9311 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
9312 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
9313 branch via x32 GOT slot is OK. */
9314 else if (!(TARGET_X32
9315 && MEM_P (fnaddr)
9316 && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
9317 && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
9318 && (sibcall
9319 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
9320 : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
9321 {
9322 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
9323 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
9324 }
9325
9326 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
9327
9328 if (retval)
9329 call = gen_rtx_SET (retval, call);
9330 vec[vec_len++] = call;
9331
9332 if (pop)
9333 {
9334 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
9335 pop = gen_rtx_SET (stack_pointer_rtx, pop);
9336 vec[vec_len++] = pop;
9337 }
9338
9339 if (cfun->machine->no_caller_saved_registers
9340 && (!fndecl
9341 || (!TREE_THIS_VOLATILE (fndecl)
9342 && !lookup_attribute ("no_caller_saved_registers",
9343 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
9344 {
9345 static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
9346 bool is_64bit_ms_abi = (TARGET_64BIT
9347 && ix86_function_abi (fndecl) == MS_ABI);
9348 char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
9349
9350 /* If there are no caller-saved registers, add all registers
9351 that are clobbered by the call which returns. */
9352 for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
9353 if (!fixed_regs[i]
9354 && (ix86_call_used_regs[i] == 1
9355 || (ix86_call_used_regs[i] & c_mask))
9356 && !STACK_REGNO_P (i)
9357 && !MMX_REGNO_P (i))
9358 clobber_reg (&use,
9359 gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
9360 }
9361 else if (TARGET_64BIT_MS_ABI
9362 && (!callarg2 || INTVAL (callarg2) != -2))
9363 {
9364 unsigned i;
9365
9366 for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
9367 {
9368 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
9369 machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
9370
9371 clobber_reg (&use, gen_rtx_REG (mode, regno));
9372 }
9373
9374 /* Set here, but it may get cleared later. */
9375 if (TARGET_CALL_MS2SYSV_XLOGUES)
9376 {
9377 if (!TARGET_SSE)
9378 ;
9379
9380 /* Don't break hot-patched functions. */
9381 else if (ix86_function_ms_hook_prologue (current_function_decl))
9382 ;
9383
9384 /* TODO: Cases not yet examined. */
9385 else if (flag_split_stack)
9386 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
9387
9388 else
9389 {
9390 gcc_assert (!reload_completed);
9391 cfun->machine->call_ms2sysv = true;
9392 }
9393 }
9394 }
9395
41bd1b19
IS
9396 if (TARGET_MACHO && TARGET_64BIT && !sibcall
9397 && ((GET_CODE (addr) == SYMBOL_REF && !SYMBOL_REF_LOCAL_P (addr))
9398 || !fndecl || TREE_PUBLIC (fndecl)))
9399 {
9400 /* We allow public functions defined in a TU to bind locally for PIC
9401 code (the default) on 64bit Mach-O.
9402 If such functions are not inlined, we cannot tell at compile-time if
9403 they will be called via the lazy symbol resolver (this can depend on
9404 options given at link-time). Therefore, we must assume that the lazy
9405 resolver could be used which clobbers R11 and R10. */
9406 clobber_reg (&use, gen_rtx_REG (DImode, R11_REG));
9407 clobber_reg (&use, gen_rtx_REG (DImode, R10_REG));
9408 }
9409
2bf6d935
ML
9410 if (vec_len > 1)
9411 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
9412 rtx_insn *call_insn = emit_call_insn (call);
9413 if (use)
9414 CALL_INSN_FUNCTION_USAGE (call_insn) = use;
9415
9416 return call_insn;
9417}
9418
9419/* Split simple return with popping POPC bytes from stack to indirect
9420 branch with stack adjustment . */
9421
9422void
9423ix86_split_simple_return_pop_internal (rtx popc)
9424{
9425 struct machine_function *m = cfun->machine;
9426 rtx ecx = gen_rtx_REG (SImode, CX_REG);
9427 rtx_insn *insn;
9428
9429 /* There is no "pascal" calling convention in any 64bit ABI. */
9430 gcc_assert (!TARGET_64BIT);
9431
9432 insn = emit_insn (gen_pop (ecx));
9433 m->fs.cfa_offset -= UNITS_PER_WORD;
9434 m->fs.sp_offset -= UNITS_PER_WORD;
9435
9436 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
9437 x = gen_rtx_SET (stack_pointer_rtx, x);
9438 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
9439 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
9440 RTX_FRAME_RELATED_P (insn) = 1;
9441
9442 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc);
9443 x = gen_rtx_SET (stack_pointer_rtx, x);
9444 insn = emit_insn (x);
9445 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
9446 RTX_FRAME_RELATED_P (insn) = 1;
9447
9448 /* Now return address is in ECX. */
9449 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
9450}
9451
9452/* Errors in the source file can cause expand_expr to return const0_rtx
9453 where we expect a vector. To avoid crashing, use one of the vector
9454 clear instructions. */
9455
9456static rtx
9457safe_vector_operand (rtx x, machine_mode mode)
9458{
9459 if (x == const0_rtx)
9460 x = CONST0_RTX (mode);
9461 return x;
9462}
9463
9464/* Subroutine of ix86_expand_builtin to take care of binop insns. */
9465
9466static rtx
9467ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
9468{
9469 rtx pat;
9470 tree arg0 = CALL_EXPR_ARG (exp, 0);
9471 tree arg1 = CALL_EXPR_ARG (exp, 1);
9472 rtx op0 = expand_normal (arg0);
9473 rtx op1 = expand_normal (arg1);
9474 machine_mode tmode = insn_data[icode].operand[0].mode;
9475 machine_mode mode0 = insn_data[icode].operand[1].mode;
9476 machine_mode mode1 = insn_data[icode].operand[2].mode;
9477
9478 if (VECTOR_MODE_P (mode0))
9479 op0 = safe_vector_operand (op0, mode0);
9480 if (VECTOR_MODE_P (mode1))
9481 op1 = safe_vector_operand (op1, mode1);
9482
9483 if (optimize || !target
9484 || GET_MODE (target) != tmode
9485 || !insn_data[icode].operand[0].predicate (target, tmode))
9486 target = gen_reg_rtx (tmode);
9487
9488 if (GET_MODE (op1) == SImode && mode1 == TImode)
9489 {
9490 rtx x = gen_reg_rtx (V4SImode);
9491 emit_insn (gen_sse2_loadd (x, op1));
9492 op1 = gen_lowpart (TImode, x);
9493 }
9494
9495 if (!insn_data[icode].operand[1].predicate (op0, mode0))
9496 op0 = copy_to_mode_reg (mode0, op0);
9497 if (!insn_data[icode].operand[2].predicate (op1, mode1))
9498 op1 = copy_to_mode_reg (mode1, op1);
9499
9500 pat = GEN_FCN (icode) (target, op0, op1);
9501 if (! pat)
9502 return 0;
9503
9504 emit_insn (pat);
9505
9506 return target;
9507}
9508
9509/* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
9510
9511static rtx
9512ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
9513 enum ix86_builtin_func_type m_type,
9514 enum rtx_code sub_code)
9515{
9516 rtx pat;
715a8bc8 9517 unsigned int i, nargs;
2bf6d935
ML
9518 bool comparison_p = false;
9519 bool tf_p = false;
9520 bool last_arg_constant = false;
9521 int num_memory = 0;
715a8bc8 9522 rtx xops[4];
2bf6d935
ML
9523
9524 machine_mode tmode = insn_data[icode].operand[0].mode;
9525
9526 switch (m_type)
9527 {
9528 case MULTI_ARG_4_DF2_DI_I:
9529 case MULTI_ARG_4_DF2_DI_I1:
9530 case MULTI_ARG_4_SF2_SI_I:
9531 case MULTI_ARG_4_SF2_SI_I1:
9532 nargs = 4;
9533 last_arg_constant = true;
9534 break;
9535
9536 case MULTI_ARG_3_SF:
9537 case MULTI_ARG_3_DF:
9538 case MULTI_ARG_3_SF2:
9539 case MULTI_ARG_3_DF2:
9540 case MULTI_ARG_3_DI:
9541 case MULTI_ARG_3_SI:
9542 case MULTI_ARG_3_SI_DI:
9543 case MULTI_ARG_3_HI:
9544 case MULTI_ARG_3_HI_SI:
9545 case MULTI_ARG_3_QI:
9546 case MULTI_ARG_3_DI2:
9547 case MULTI_ARG_3_SI2:
9548 case MULTI_ARG_3_HI2:
9549 case MULTI_ARG_3_QI2:
9550 nargs = 3;
9551 break;
9552
9553 case MULTI_ARG_2_SF:
9554 case MULTI_ARG_2_DF:
9555 case MULTI_ARG_2_DI:
9556 case MULTI_ARG_2_SI:
9557 case MULTI_ARG_2_HI:
9558 case MULTI_ARG_2_QI:
9559 nargs = 2;
9560 break;
9561
9562 case MULTI_ARG_2_DI_IMM:
9563 case MULTI_ARG_2_SI_IMM:
9564 case MULTI_ARG_2_HI_IMM:
9565 case MULTI_ARG_2_QI_IMM:
9566 nargs = 2;
9567 last_arg_constant = true;
9568 break;
9569
9570 case MULTI_ARG_1_SF:
9571 case MULTI_ARG_1_DF:
9572 case MULTI_ARG_1_SF2:
9573 case MULTI_ARG_1_DF2:
9574 case MULTI_ARG_1_DI:
9575 case MULTI_ARG_1_SI:
9576 case MULTI_ARG_1_HI:
9577 case MULTI_ARG_1_QI:
9578 case MULTI_ARG_1_SI_DI:
9579 case MULTI_ARG_1_HI_DI:
9580 case MULTI_ARG_1_HI_SI:
9581 case MULTI_ARG_1_QI_DI:
9582 case MULTI_ARG_1_QI_SI:
9583 case MULTI_ARG_1_QI_HI:
9584 nargs = 1;
9585 break;
9586
9587 case MULTI_ARG_2_DI_CMP:
9588 case MULTI_ARG_2_SI_CMP:
9589 case MULTI_ARG_2_HI_CMP:
9590 case MULTI_ARG_2_QI_CMP:
9591 nargs = 2;
9592 comparison_p = true;
9593 break;
9594
9595 case MULTI_ARG_2_SF_TF:
9596 case MULTI_ARG_2_DF_TF:
9597 case MULTI_ARG_2_DI_TF:
9598 case MULTI_ARG_2_SI_TF:
9599 case MULTI_ARG_2_HI_TF:
9600 case MULTI_ARG_2_QI_TF:
9601 nargs = 2;
9602 tf_p = true;
9603 break;
9604
9605 default:
9606 gcc_unreachable ();
9607 }
9608
9609 if (optimize || !target
9610 || GET_MODE (target) != tmode
9611 || !insn_data[icode].operand[0].predicate (target, tmode))
9612 target = gen_reg_rtx (tmode);
9613 else if (memory_operand (target, tmode))
9614 num_memory++;
9615
715a8bc8 9616 gcc_assert (nargs <= ARRAY_SIZE (xops));
2bf6d935
ML
9617
9618 for (i = 0; i < nargs; i++)
9619 {
9620 tree arg = CALL_EXPR_ARG (exp, i);
9621 rtx op = expand_normal (arg);
9622 int adjust = (comparison_p) ? 1 : 0;
9623 machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
9624
9625 if (last_arg_constant && i == nargs - 1)
9626 {
9627 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
9628 {
9629 enum insn_code new_icode = icode;
9630 switch (icode)
9631 {
9632 case CODE_FOR_xop_vpermil2v2df3:
9633 case CODE_FOR_xop_vpermil2v4sf3:
9634 case CODE_FOR_xop_vpermil2v4df3:
9635 case CODE_FOR_xop_vpermil2v8sf3:
9636 error ("the last argument must be a 2-bit immediate");
9637 return gen_reg_rtx (tmode);
9638 case CODE_FOR_xop_rotlv2di3:
9639 new_icode = CODE_FOR_rotlv2di3;
9640 goto xop_rotl;
9641 case CODE_FOR_xop_rotlv4si3:
9642 new_icode = CODE_FOR_rotlv4si3;
9643 goto xop_rotl;
9644 case CODE_FOR_xop_rotlv8hi3:
9645 new_icode = CODE_FOR_rotlv8hi3;
9646 goto xop_rotl;
9647 case CODE_FOR_xop_rotlv16qi3:
9648 new_icode = CODE_FOR_rotlv16qi3;
9649 xop_rotl:
9650 if (CONST_INT_P (op))
9651 {
9652 int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
9653 op = GEN_INT (INTVAL (op) & mask);
9654 gcc_checking_assert
9655 (insn_data[icode].operand[i + 1].predicate (op, mode));
9656 }
9657 else
9658 {
9659 gcc_checking_assert
9660 (nargs == 2
9661 && insn_data[new_icode].operand[0].mode == tmode
9662 && insn_data[new_icode].operand[1].mode == tmode
9663 && insn_data[new_icode].operand[2].mode == mode
9664 && insn_data[new_icode].operand[0].predicate
9665 == insn_data[icode].operand[0].predicate
9666 && insn_data[new_icode].operand[1].predicate
9667 == insn_data[icode].operand[1].predicate);
9668 icode = new_icode;
9669 goto non_constant;
9670 }
9671 break;
9672 default:
9673 gcc_unreachable ();
9674 }
9675 }
9676 }
9677 else
9678 {
9679 non_constant:
9680 if (VECTOR_MODE_P (mode))
9681 op = safe_vector_operand (op, mode);
9682
9683 /* If we aren't optimizing, only allow one memory operand to be
9684 generated. */
9685 if (memory_operand (op, mode))
9686 num_memory++;
9687
9688 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
9689
9690 if (optimize
9691 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
9692 || num_memory > 1)
9693 op = force_reg (mode, op);
9694 }
9695
715a8bc8 9696 xops[i] = op;
2bf6d935
ML
9697 }
9698
9699 switch (nargs)
9700 {
9701 case 1:
715a8bc8 9702 pat = GEN_FCN (icode) (target, xops[0]);
2bf6d935
ML
9703 break;
9704
9705 case 2:
9706 if (tf_p)
715a8bc8 9707 pat = GEN_FCN (icode) (target, xops[0], xops[1],
2bf6d935
ML
9708 GEN_INT ((int)sub_code));
9709 else if (! comparison_p)
715a8bc8 9710 pat = GEN_FCN (icode) (target, xops[0], xops[1]);
2bf6d935
ML
9711 else
9712 {
9713 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
715a8bc8 9714 xops[0], xops[1]);
2bf6d935 9715
715a8bc8 9716 pat = GEN_FCN (icode) (target, cmp_op, xops[0], xops[1]);
2bf6d935
ML
9717 }
9718 break;
9719
9720 case 3:
715a8bc8 9721 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
2bf6d935
ML
9722 break;
9723
9724 case 4:
715a8bc8 9725 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2], xops[3]);
2bf6d935
ML
9726 break;
9727
9728 default:
9729 gcc_unreachable ();
9730 }
9731
9732 if (! pat)
9733 return 0;
9734
9735 emit_insn (pat);
9736 return target;
9737}
9738
9739/* Subroutine of ix86_expand_args_builtin to take care of scalar unop
9740 insns with vec_merge. */
9741
9742static rtx
9743ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
9744 rtx target)
9745{
9746 rtx pat;
9747 tree arg0 = CALL_EXPR_ARG (exp, 0);
9748 rtx op1, op0 = expand_normal (arg0);
9749 machine_mode tmode = insn_data[icode].operand[0].mode;
9750 machine_mode mode0 = insn_data[icode].operand[1].mode;
9751
9752 if (optimize || !target
9753 || GET_MODE (target) != tmode
9754 || !insn_data[icode].operand[0].predicate (target, tmode))
9755 target = gen_reg_rtx (tmode);
9756
9757 if (VECTOR_MODE_P (mode0))
9758 op0 = safe_vector_operand (op0, mode0);
9759
9760 if ((optimize && !register_operand (op0, mode0))
9761 || !insn_data[icode].operand[1].predicate (op0, mode0))
9762 op0 = copy_to_mode_reg (mode0, op0);
9763
9764 op1 = op0;
9765 if (!insn_data[icode].operand[2].predicate (op1, mode0))
9766 op1 = copy_to_mode_reg (mode0, op1);
9767
9768 pat = GEN_FCN (icode) (target, op0, op1);
9769 if (! pat)
9770 return 0;
9771 emit_insn (pat);
9772 return target;
9773}
9774
9775/* Subroutine of ix86_expand_builtin to take care of comparison insns. */
9776
9777static rtx
9778ix86_expand_sse_compare (const struct builtin_description *d,
9779 tree exp, rtx target, bool swap)
9780{
9781 rtx pat;
9782 tree arg0 = CALL_EXPR_ARG (exp, 0);
9783 tree arg1 = CALL_EXPR_ARG (exp, 1);
9784 rtx op0 = expand_normal (arg0);
9785 rtx op1 = expand_normal (arg1);
9786 rtx op2;
9787 machine_mode tmode = insn_data[d->icode].operand[0].mode;
9788 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
9789 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
9790 enum rtx_code comparison = d->comparison;
9791
9792 if (VECTOR_MODE_P (mode0))
9793 op0 = safe_vector_operand (op0, mode0);
9794 if (VECTOR_MODE_P (mode1))
9795 op1 = safe_vector_operand (op1, mode1);
9796
9797 /* Swap operands if we have a comparison that isn't available in
9798 hardware. */
9799 if (swap)
9800 std::swap (op0, op1);
9801
9802 if (optimize || !target
9803 || GET_MODE (target) != tmode
9804 || !insn_data[d->icode].operand[0].predicate (target, tmode))
9805 target = gen_reg_rtx (tmode);
9806
9807 if ((optimize && !register_operand (op0, mode0))
9808 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
9809 op0 = copy_to_mode_reg (mode0, op0);
9810 if ((optimize && !register_operand (op1, mode1))
9811 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
9812 op1 = copy_to_mode_reg (mode1, op1);
9813
9814 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
9815 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
9816 if (! pat)
9817 return 0;
9818 emit_insn (pat);
9819 return target;
9820}
9821
ae69e6f6 9822/* Subroutine of ix86_sse_comi and ix86_sse_comi_round to take care of
9823 * ordered EQ or unordered NE, generate PF jump. */
9824
9825static rtx
9826ix86_ssecom_setcc (const enum rtx_code comparison,
9827 bool check_unordered, machine_mode mode,
9828 rtx set_dst, rtx target)
9829{
9830
9831 rtx_code_label *label = NULL;
9832
9833 /* NB: For ordered EQ or unordered NE, check ZF alone isn't sufficient
9834 with NAN operands. */
9835 if (check_unordered)
9836 {
9837 gcc_assert (comparison == EQ || comparison == NE);
9838
9839 rtx flag = gen_rtx_REG (CCFPmode, FLAGS_REG);
9840 label = gen_label_rtx ();
9841 rtx tmp = gen_rtx_fmt_ee (UNORDERED, VOIDmode, flag, const0_rtx);
9842 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
9843 gen_rtx_LABEL_REF (VOIDmode, label),
9844 pc_rtx);
9845 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
9846 }
9847
9848 /* NB: Set CCFPmode and check a different CCmode which is in subset
9849 of CCFPmode. */
9850 if (GET_MODE (set_dst) != mode)
9851 {
9852 gcc_assert (mode == CCAmode || mode == CCCmode
9853 || mode == CCOmode || mode == CCPmode
9854 || mode == CCSmode || mode == CCZmode);
9855 set_dst = gen_rtx_REG (mode, FLAGS_REG);
9856 }
9857
9858 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
9859 gen_rtx_fmt_ee (comparison, QImode,
9860 set_dst,
9861 const0_rtx)));
9862
9863 if (label)
9864 emit_label (label);
9865
9866 return SUBREG_REG (target);
9867}
9868
2bf6d935
ML
9869/* Subroutine of ix86_expand_builtin to take care of comi insns. */
9870
9871static rtx
9872ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
9873 rtx target)
9874{
ae69e6f6 9875 rtx pat, set_dst;
2bf6d935
ML
9876 tree arg0 = CALL_EXPR_ARG (exp, 0);
9877 tree arg1 = CALL_EXPR_ARG (exp, 1);
9878 rtx op0 = expand_normal (arg0);
9879 rtx op1 = expand_normal (arg1);
ae69e6f6 9880 enum insn_code icode = d->icode;
9881 const struct insn_data_d *insn_p = &insn_data[icode];
9882 machine_mode mode0 = insn_p->operand[0].mode;
9883 machine_mode mode1 = insn_p->operand[1].mode;
2bf6d935
ML
9884
9885 if (VECTOR_MODE_P (mode0))
9886 op0 = safe_vector_operand (op0, mode0);
9887 if (VECTOR_MODE_P (mode1))
9888 op1 = safe_vector_operand (op1, mode1);
9889
ae69e6f6 9890 enum rtx_code comparison = d->comparison;
9891 rtx const_val = const0_rtx;
9892
9893 bool check_unordered = false;
9894 machine_mode mode = CCFPmode;
9895 switch (comparison)
9896 {
9897 case LE: /* -> GE */
9898 case LT: /* -> GT */
9899 std::swap (op0, op1);
9900 comparison = swap_condition (comparison);
9901 /* FALLTHRU */
9902 case GT:
9903 case GE:
9904 break;
9905 case EQ:
9906 check_unordered = true;
9907 mode = CCZmode;
9908 break;
9909 case NE:
9910 check_unordered = true;
9911 mode = CCZmode;
9912 const_val = const1_rtx;
9913 break;
9914 default:
9915 gcc_unreachable ();
9916 }
9917
2bf6d935 9918 target = gen_reg_rtx (SImode);
ae69e6f6 9919 emit_move_insn (target, const_val);
2bf6d935
ML
9920 target = gen_rtx_SUBREG (QImode, target, 0);
9921
9922 if ((optimize && !register_operand (op0, mode0))
ae69e6f6 9923 || !insn_p->operand[0].predicate (op0, mode0))
2bf6d935
ML
9924 op0 = copy_to_mode_reg (mode0, op0);
9925 if ((optimize && !register_operand (op1, mode1))
ae69e6f6 9926 || !insn_p->operand[1].predicate (op1, mode1))
2bf6d935
ML
9927 op1 = copy_to_mode_reg (mode1, op1);
9928
ae69e6f6 9929 pat = GEN_FCN (icode) (op0, op1);
2bf6d935
ML
9930 if (! pat)
9931 return 0;
2bf6d935 9932
ae69e6f6 9933 set_dst = SET_DEST (pat);
9934 emit_insn (pat);
9935 return ix86_ssecom_setcc (comparison, check_unordered, mode,
9936 set_dst, target);
2bf6d935
ML
9937}
9938
9939/* Subroutines of ix86_expand_args_builtin to take care of round insns. */
9940
9941static rtx
9942ix86_expand_sse_round (const struct builtin_description *d, tree exp,
9943 rtx target)
9944{
9945 rtx pat;
9946 tree arg0 = CALL_EXPR_ARG (exp, 0);
9947 rtx op1, op0 = expand_normal (arg0);
9948 machine_mode tmode = insn_data[d->icode].operand[0].mode;
9949 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
9950
9951 if (optimize || target == 0
9952 || GET_MODE (target) != tmode
9953 || !insn_data[d->icode].operand[0].predicate (target, tmode))
9954 target = gen_reg_rtx (tmode);
9955
9956 if (VECTOR_MODE_P (mode0))
9957 op0 = safe_vector_operand (op0, mode0);
9958
9959 if ((optimize && !register_operand (op0, mode0))
9960 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
9961 op0 = copy_to_mode_reg (mode0, op0);
9962
9963 op1 = GEN_INT (d->comparison);
9964
9965 pat = GEN_FCN (d->icode) (target, op0, op1);
9966 if (! pat)
9967 return 0;
9968 emit_insn (pat);
9969 return target;
9970}
9971
9972static rtx
9973ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
9974 tree exp, rtx target)
9975{
9976 rtx pat;
9977 tree arg0 = CALL_EXPR_ARG (exp, 0);
9978 tree arg1 = CALL_EXPR_ARG (exp, 1);
9979 rtx op0 = expand_normal (arg0);
9980 rtx op1 = expand_normal (arg1);
9981 rtx op2;
9982 machine_mode tmode = insn_data[d->icode].operand[0].mode;
9983 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
9984 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
9985
9986 if (optimize || target == 0
9987 || GET_MODE (target) != tmode
9988 || !insn_data[d->icode].operand[0].predicate (target, tmode))
9989 target = gen_reg_rtx (tmode);
9990
9991 op0 = safe_vector_operand (op0, mode0);
9992 op1 = safe_vector_operand (op1, mode1);
9993
9994 if ((optimize && !register_operand (op0, mode0))
9995 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
9996 op0 = copy_to_mode_reg (mode0, op0);
9997 if ((optimize && !register_operand (op1, mode1))
9998 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
9999 op1 = copy_to_mode_reg (mode1, op1);
10000
10001 op2 = GEN_INT (d->comparison);
10002
10003 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
10004 if (! pat)
10005 return 0;
10006 emit_insn (pat);
10007 return target;
10008}
10009
10010/* Subroutine of ix86_expand_builtin to take care of ptest insns. */
10011
10012static rtx
10013ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
10014 rtx target)
10015{
10016 rtx pat;
10017 tree arg0 = CALL_EXPR_ARG (exp, 0);
10018 tree arg1 = CALL_EXPR_ARG (exp, 1);
10019 rtx op0 = expand_normal (arg0);
10020 rtx op1 = expand_normal (arg1);
10021 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
10022 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
10023 enum rtx_code comparison = d->comparison;
10024
10025 if (VECTOR_MODE_P (mode0))
10026 op0 = safe_vector_operand (op0, mode0);
10027 if (VECTOR_MODE_P (mode1))
10028 op1 = safe_vector_operand (op1, mode1);
10029
10030 target = gen_reg_rtx (SImode);
10031 emit_move_insn (target, const0_rtx);
10032 target = gen_rtx_SUBREG (QImode, target, 0);
10033
10034 if ((optimize && !register_operand (op0, mode0))
10035 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
10036 op0 = copy_to_mode_reg (mode0, op0);
10037 if ((optimize && !register_operand (op1, mode1))
10038 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
10039 op1 = copy_to_mode_reg (mode1, op1);
10040
10041 pat = GEN_FCN (d->icode) (op0, op1);
10042 if (! pat)
10043 return 0;
10044 emit_insn (pat);
10045 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10046 gen_rtx_fmt_ee (comparison, QImode,
10047 SET_DEST (pat),
10048 const0_rtx)));
10049
10050 return SUBREG_REG (target);
10051}
10052
10053/* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
10054
10055static rtx
10056ix86_expand_sse_pcmpestr (const struct builtin_description *d,
10057 tree exp, rtx target)
10058{
10059 rtx pat;
10060 tree arg0 = CALL_EXPR_ARG (exp, 0);
10061 tree arg1 = CALL_EXPR_ARG (exp, 1);
10062 tree arg2 = CALL_EXPR_ARG (exp, 2);
10063 tree arg3 = CALL_EXPR_ARG (exp, 3);
10064 tree arg4 = CALL_EXPR_ARG (exp, 4);
10065 rtx scratch0, scratch1;
10066 rtx op0 = expand_normal (arg0);
10067 rtx op1 = expand_normal (arg1);
10068 rtx op2 = expand_normal (arg2);
10069 rtx op3 = expand_normal (arg3);
10070 rtx op4 = expand_normal (arg4);
10071 machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
10072
10073 tmode0 = insn_data[d->icode].operand[0].mode;
10074 tmode1 = insn_data[d->icode].operand[1].mode;
10075 modev2 = insn_data[d->icode].operand[2].mode;
10076 modei3 = insn_data[d->icode].operand[3].mode;
10077 modev4 = insn_data[d->icode].operand[4].mode;
10078 modei5 = insn_data[d->icode].operand[5].mode;
10079 modeimm = insn_data[d->icode].operand[6].mode;
10080
10081 if (VECTOR_MODE_P (modev2))
10082 op0 = safe_vector_operand (op0, modev2);
10083 if (VECTOR_MODE_P (modev4))
10084 op2 = safe_vector_operand (op2, modev4);
10085
10086 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
10087 op0 = copy_to_mode_reg (modev2, op0);
10088 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
10089 op1 = copy_to_mode_reg (modei3, op1);
10090 if ((optimize && !register_operand (op2, modev4))
10091 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
10092 op2 = copy_to_mode_reg (modev4, op2);
10093 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
10094 op3 = copy_to_mode_reg (modei5, op3);
10095
10096 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
10097 {
10098 error ("the fifth argument must be an 8-bit immediate");
10099 return const0_rtx;
10100 }
10101
10102 if (d->code == IX86_BUILTIN_PCMPESTRI128)
10103 {
10104 if (optimize || !target
10105 || GET_MODE (target) != tmode0
10106 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
10107 target = gen_reg_rtx (tmode0);
10108
10109 scratch1 = gen_reg_rtx (tmode1);
10110
10111 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
10112 }
10113 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
10114 {
10115 if (optimize || !target
10116 || GET_MODE (target) != tmode1
10117 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
10118 target = gen_reg_rtx (tmode1);
10119
10120 scratch0 = gen_reg_rtx (tmode0);
10121
10122 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
10123 }
10124 else
10125 {
10126 gcc_assert (d->flag);
10127
10128 scratch0 = gen_reg_rtx (tmode0);
10129 scratch1 = gen_reg_rtx (tmode1);
10130
10131 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
10132 }
10133
10134 if (! pat)
10135 return 0;
10136
10137 emit_insn (pat);
10138
10139 if (d->flag)
10140 {
10141 target = gen_reg_rtx (SImode);
10142 emit_move_insn (target, const0_rtx);
10143 target = gen_rtx_SUBREG (QImode, target, 0);
10144
10145 emit_insn
10146 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10147 gen_rtx_fmt_ee (EQ, QImode,
10148 gen_rtx_REG ((machine_mode) d->flag,
10149 FLAGS_REG),
10150 const0_rtx)));
10151 return SUBREG_REG (target);
10152 }
10153 else
10154 return target;
10155}
10156
10157
10158/* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
10159
10160static rtx
10161ix86_expand_sse_pcmpistr (const struct builtin_description *d,
10162 tree exp, rtx target)
10163{
10164 rtx pat;
10165 tree arg0 = CALL_EXPR_ARG (exp, 0);
10166 tree arg1 = CALL_EXPR_ARG (exp, 1);
10167 tree arg2 = CALL_EXPR_ARG (exp, 2);
10168 rtx scratch0, scratch1;
10169 rtx op0 = expand_normal (arg0);
10170 rtx op1 = expand_normal (arg1);
10171 rtx op2 = expand_normal (arg2);
10172 machine_mode tmode0, tmode1, modev2, modev3, modeimm;
10173
10174 tmode0 = insn_data[d->icode].operand[0].mode;
10175 tmode1 = insn_data[d->icode].operand[1].mode;
10176 modev2 = insn_data[d->icode].operand[2].mode;
10177 modev3 = insn_data[d->icode].operand[3].mode;
10178 modeimm = insn_data[d->icode].operand[4].mode;
10179
10180 if (VECTOR_MODE_P (modev2))
10181 op0 = safe_vector_operand (op0, modev2);
10182 if (VECTOR_MODE_P (modev3))
10183 op1 = safe_vector_operand (op1, modev3);
10184
10185 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
10186 op0 = copy_to_mode_reg (modev2, op0);
10187 if ((optimize && !register_operand (op1, modev3))
10188 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
10189 op1 = copy_to_mode_reg (modev3, op1);
10190
10191 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
10192 {
10193 error ("the third argument must be an 8-bit immediate");
10194 return const0_rtx;
10195 }
10196
10197 if (d->code == IX86_BUILTIN_PCMPISTRI128)
10198 {
10199 if (optimize || !target
10200 || GET_MODE (target) != tmode0
10201 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
10202 target = gen_reg_rtx (tmode0);
10203
10204 scratch1 = gen_reg_rtx (tmode1);
10205
10206 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
10207 }
10208 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
10209 {
10210 if (optimize || !target
10211 || GET_MODE (target) != tmode1
10212 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
10213 target = gen_reg_rtx (tmode1);
10214
10215 scratch0 = gen_reg_rtx (tmode0);
10216
10217 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
10218 }
10219 else
10220 {
10221 gcc_assert (d->flag);
10222
10223 scratch0 = gen_reg_rtx (tmode0);
10224 scratch1 = gen_reg_rtx (tmode1);
10225
10226 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
10227 }
10228
10229 if (! pat)
10230 return 0;
10231
10232 emit_insn (pat);
10233
10234 if (d->flag)
10235 {
10236 target = gen_reg_rtx (SImode);
10237 emit_move_insn (target, const0_rtx);
10238 target = gen_rtx_SUBREG (QImode, target, 0);
10239
10240 emit_insn
10241 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10242 gen_rtx_fmt_ee (EQ, QImode,
10243 gen_rtx_REG ((machine_mode) d->flag,
10244 FLAGS_REG),
10245 const0_rtx)));
10246 return SUBREG_REG (target);
10247 }
10248 else
10249 return target;
10250}
10251
10252/* Fixup modeless constants to fit required mode. */
10253
10254static rtx
10255fixup_modeless_constant (rtx x, machine_mode mode)
10256{
10257 if (GET_MODE (x) == VOIDmode)
10258 x = convert_to_mode (mode, x, 1);
10259 return x;
10260}
10261
10262/* Subroutine of ix86_expand_builtin to take care of insns with
10263 variable number of operands. */
10264
10265static rtx
10266ix86_expand_args_builtin (const struct builtin_description *d,
10267 tree exp, rtx target)
10268{
10269 rtx pat, real_target;
10270 unsigned int i, nargs;
10271 unsigned int nargs_constant = 0;
10272 unsigned int mask_pos = 0;
10273 int num_memory = 0;
715a8bc8 10274 rtx xops[6];
2bf6d935
ML
10275 bool second_arg_count = false;
10276 enum insn_code icode = d->icode;
10277 const struct insn_data_d *insn_p = &insn_data[icode];
10278 machine_mode tmode = insn_p->operand[0].mode;
10279 machine_mode rmode = VOIDmode;
10280 bool swap = false;
10281 enum rtx_code comparison = d->comparison;
10282
10283 switch ((enum ix86_builtin_func_type) d->flag)
10284 {
10285 case V2DF_FTYPE_V2DF_ROUND:
10286 case V4DF_FTYPE_V4DF_ROUND:
10287 case V8DF_FTYPE_V8DF_ROUND:
10288 case V4SF_FTYPE_V4SF_ROUND:
10289 case V8SF_FTYPE_V8SF_ROUND:
10290 case V16SF_FTYPE_V16SF_ROUND:
84bcefd5 10291 case V8HF_FTYPE_V8HF_ROUND:
10292 case V16HF_FTYPE_V16HF_ROUND:
10293 case V32HF_FTYPE_V32HF_ROUND:
2bf6d935
ML
10294 case V4SI_FTYPE_V4SF_ROUND:
10295 case V8SI_FTYPE_V8SF_ROUND:
10296 case V16SI_FTYPE_V16SF_ROUND:
10297 return ix86_expand_sse_round (d, exp, target);
10298 case V4SI_FTYPE_V2DF_V2DF_ROUND:
10299 case V8SI_FTYPE_V4DF_V4DF_ROUND:
10300 case V16SI_FTYPE_V8DF_V8DF_ROUND:
10301 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
10302 case INT_FTYPE_V8SF_V8SF_PTEST:
10303 case INT_FTYPE_V4DI_V4DI_PTEST:
10304 case INT_FTYPE_V4DF_V4DF_PTEST:
10305 case INT_FTYPE_V4SF_V4SF_PTEST:
10306 case INT_FTYPE_V2DI_V2DI_PTEST:
10307 case INT_FTYPE_V2DF_V2DF_PTEST:
10308 return ix86_expand_sse_ptest (d, exp, target);
10309 case FLOAT128_FTYPE_FLOAT128:
10310 case FLOAT_FTYPE_FLOAT:
10311 case INT_FTYPE_INT:
10312 case UINT_FTYPE_UINT:
10313 case UINT16_FTYPE_UINT16:
10314 case UINT64_FTYPE_INT:
10315 case UINT64_FTYPE_UINT64:
10316 case INT64_FTYPE_INT64:
10317 case INT64_FTYPE_V4SF:
10318 case INT64_FTYPE_V2DF:
10319 case INT_FTYPE_V16QI:
10320 case INT_FTYPE_V8QI:
10321 case INT_FTYPE_V8SF:
10322 case INT_FTYPE_V4DF:
10323 case INT_FTYPE_V4SF:
10324 case INT_FTYPE_V2DF:
10325 case INT_FTYPE_V32QI:
10326 case V16QI_FTYPE_V16QI:
10327 case V8SI_FTYPE_V8SF:
10328 case V8SI_FTYPE_V4SI:
10329 case V8HI_FTYPE_V8HI:
10330 case V8HI_FTYPE_V16QI:
10331 case V8QI_FTYPE_V8QI:
10332 case V8SF_FTYPE_V8SF:
10333 case V8SF_FTYPE_V8SI:
10334 case V8SF_FTYPE_V4SF:
10335 case V8SF_FTYPE_V8HI:
10336 case V4SI_FTYPE_V4SI:
10337 case V4SI_FTYPE_V16QI:
10338 case V4SI_FTYPE_V4SF:
10339 case V4SI_FTYPE_V8SI:
10340 case V4SI_FTYPE_V8HI:
10341 case V4SI_FTYPE_V4DF:
10342 case V4SI_FTYPE_V2DF:
10343 case V4HI_FTYPE_V4HI:
10344 case V4DF_FTYPE_V4DF:
10345 case V4DF_FTYPE_V4SI:
10346 case V4DF_FTYPE_V4SF:
10347 case V4DF_FTYPE_V2DF:
10348 case V4SF_FTYPE_V4SF:
10349 case V4SF_FTYPE_V4SI:
10350 case V4SF_FTYPE_V8SF:
10351 case V4SF_FTYPE_V4DF:
10352 case V4SF_FTYPE_V8HI:
10353 case V4SF_FTYPE_V2DF:
10354 case V2DI_FTYPE_V2DI:
10355 case V2DI_FTYPE_V16QI:
10356 case V2DI_FTYPE_V8HI:
10357 case V2DI_FTYPE_V4SI:
10358 case V2DF_FTYPE_V2DF:
10359 case V2DF_FTYPE_V4SI:
10360 case V2DF_FTYPE_V4DF:
10361 case V2DF_FTYPE_V4SF:
10362 case V2DF_FTYPE_V2SI:
10363 case V2SI_FTYPE_V2SI:
10364 case V2SI_FTYPE_V4SF:
10365 case V2SI_FTYPE_V2SF:
10366 case V2SI_FTYPE_V2DF:
10367 case V2SF_FTYPE_V2SF:
10368 case V2SF_FTYPE_V2SI:
10369 case V32QI_FTYPE_V32QI:
10370 case V32QI_FTYPE_V16QI:
10371 case V16HI_FTYPE_V16HI:
10372 case V16HI_FTYPE_V8HI:
10373 case V8SI_FTYPE_V8SI:
10374 case V16HI_FTYPE_V16QI:
10375 case V8SI_FTYPE_V16QI:
10376 case V4DI_FTYPE_V16QI:
10377 case V8SI_FTYPE_V8HI:
10378 case V4DI_FTYPE_V8HI:
10379 case V4DI_FTYPE_V4SI:
10380 case V4DI_FTYPE_V2DI:
10381 case UQI_FTYPE_UQI:
10382 case UHI_FTYPE_UHI:
10383 case USI_FTYPE_USI:
10384 case USI_FTYPE_UQI:
10385 case USI_FTYPE_UHI:
10386 case UDI_FTYPE_UDI:
10387 case UHI_FTYPE_V16QI:
10388 case USI_FTYPE_V32QI:
10389 case UDI_FTYPE_V64QI:
10390 case V16QI_FTYPE_UHI:
10391 case V32QI_FTYPE_USI:
10392 case V64QI_FTYPE_UDI:
10393 case V8HI_FTYPE_UQI:
10394 case V16HI_FTYPE_UHI:
10395 case V32HI_FTYPE_USI:
10396 case V4SI_FTYPE_UQI:
10397 case V8SI_FTYPE_UQI:
10398 case V4SI_FTYPE_UHI:
10399 case V8SI_FTYPE_UHI:
10400 case UQI_FTYPE_V8HI:
10401 case UHI_FTYPE_V16HI:
10402 case USI_FTYPE_V32HI:
10403 case UQI_FTYPE_V4SI:
10404 case UQI_FTYPE_V8SI:
10405 case UHI_FTYPE_V16SI:
10406 case UQI_FTYPE_V2DI:
10407 case UQI_FTYPE_V4DI:
10408 case UQI_FTYPE_V8DI:
10409 case V16SI_FTYPE_UHI:
10410 case V2DI_FTYPE_UQI:
10411 case V4DI_FTYPE_UQI:
10412 case V16SI_FTYPE_INT:
10413 case V16SF_FTYPE_V8SF:
10414 case V16SI_FTYPE_V8SI:
10415 case V16SF_FTYPE_V4SF:
10416 case V16SI_FTYPE_V4SI:
10417 case V16SI_FTYPE_V16SF:
10418 case V16SI_FTYPE_V16SI:
10419 case V64QI_FTYPE_V64QI:
10420 case V32HI_FTYPE_V32HI:
10421 case V16SF_FTYPE_V16SF:
10422 case V8DI_FTYPE_UQI:
10423 case V8DI_FTYPE_V8DI:
10424 case V8DF_FTYPE_V4DF:
10425 case V8DF_FTYPE_V2DF:
10426 case V8DF_FTYPE_V8DF:
10427 case V4DI_FTYPE_V4DI:
4f0e90fa
HL
10428 case V16HI_FTYPE_V16SF:
10429 case V8HI_FTYPE_V8SF:
10430 case V8HI_FTYPE_V4SF:
2bf6d935
ML
10431 nargs = 1;
10432 break;
10433 case V4SF_FTYPE_V4SF_VEC_MERGE:
10434 case V2DF_FTYPE_V2DF_VEC_MERGE:
10435 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
10436 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
10437 case V16QI_FTYPE_V16QI_V16QI:
10438 case V16QI_FTYPE_V8HI_V8HI:
b96cb2ca 10439 case V16HF_FTYPE_V16HF_V16HF:
2bf6d935
ML
10440 case V16SF_FTYPE_V16SF_V16SF:
10441 case V8QI_FTYPE_V8QI_V8QI:
10442 case V8QI_FTYPE_V4HI_V4HI:
10443 case V8HI_FTYPE_V8HI_V8HI:
10444 case V8HI_FTYPE_V16QI_V16QI:
10445 case V8HI_FTYPE_V4SI_V4SI:
b96cb2ca 10446 case V8HF_FTYPE_V8HF_V8HF:
2bf6d935
ML
10447 case V8SF_FTYPE_V8SF_V8SF:
10448 case V8SF_FTYPE_V8SF_V8SI:
10449 case V8DF_FTYPE_V8DF_V8DF:
10450 case V4SI_FTYPE_V4SI_V4SI:
10451 case V4SI_FTYPE_V8HI_V8HI:
10452 case V4SI_FTYPE_V2DF_V2DF:
10453 case V4HI_FTYPE_V4HI_V4HI:
10454 case V4HI_FTYPE_V8QI_V8QI:
10455 case V4HI_FTYPE_V2SI_V2SI:
10456 case V4DF_FTYPE_V4DF_V4DF:
10457 case V4DF_FTYPE_V4DF_V4DI:
10458 case V4SF_FTYPE_V4SF_V4SF:
10459 case V4SF_FTYPE_V4SF_V4SI:
10460 case V4SF_FTYPE_V4SF_V2SI:
10461 case V4SF_FTYPE_V4SF_V2DF:
10462 case V4SF_FTYPE_V4SF_UINT:
10463 case V4SF_FTYPE_V4SF_DI:
10464 case V4SF_FTYPE_V4SF_SI:
10465 case V2DI_FTYPE_V2DI_V2DI:
10466 case V2DI_FTYPE_V16QI_V16QI:
10467 case V2DI_FTYPE_V4SI_V4SI:
10468 case V2DI_FTYPE_V2DI_V16QI:
10469 case V2SI_FTYPE_V2SI_V2SI:
10470 case V2SI_FTYPE_V4HI_V4HI:
10471 case V2SI_FTYPE_V2SF_V2SF:
10472 case V2DF_FTYPE_V2DF_V2DF:
10473 case V2DF_FTYPE_V2DF_V4SF:
10474 case V2DF_FTYPE_V2DF_V2DI:
10475 case V2DF_FTYPE_V2DF_DI:
10476 case V2DF_FTYPE_V2DF_SI:
10477 case V2DF_FTYPE_V2DF_UINT:
10478 case V2SF_FTYPE_V2SF_V2SF:
10479 case V1DI_FTYPE_V1DI_V1DI:
10480 case V1DI_FTYPE_V8QI_V8QI:
10481 case V1DI_FTYPE_V2SI_V2SI:
10482 case V32QI_FTYPE_V16HI_V16HI:
10483 case V16HI_FTYPE_V8SI_V8SI:
10484 case V64QI_FTYPE_V64QI_V64QI:
10485 case V32QI_FTYPE_V32QI_V32QI:
10486 case V16HI_FTYPE_V32QI_V32QI:
10487 case V16HI_FTYPE_V16HI_V16HI:
10488 case V8SI_FTYPE_V4DF_V4DF:
10489 case V8SI_FTYPE_V8SI_V8SI:
10490 case V8SI_FTYPE_V16HI_V16HI:
10491 case V4DI_FTYPE_V4DI_V4DI:
10492 case V4DI_FTYPE_V8SI_V8SI:
6bb0776e 10493 case V4DI_FTYPE_V32QI_V32QI:
2bf6d935
ML
10494 case V8DI_FTYPE_V64QI_V64QI:
10495 if (comparison == UNKNOWN)
10496 return ix86_expand_binop_builtin (icode, exp, target);
10497 nargs = 2;
10498 break;
10499 case V4SF_FTYPE_V4SF_V4SF_SWAP:
10500 case V2DF_FTYPE_V2DF_V2DF_SWAP:
10501 gcc_assert (comparison != UNKNOWN);
10502 nargs = 2;
10503 swap = true;
10504 break;
10505 case V16HI_FTYPE_V16HI_V8HI_COUNT:
10506 case V16HI_FTYPE_V16HI_SI_COUNT:
10507 case V8SI_FTYPE_V8SI_V4SI_COUNT:
10508 case V8SI_FTYPE_V8SI_SI_COUNT:
10509 case V4DI_FTYPE_V4DI_V2DI_COUNT:
10510 case V4DI_FTYPE_V4DI_INT_COUNT:
10511 case V8HI_FTYPE_V8HI_V8HI_COUNT:
10512 case V8HI_FTYPE_V8HI_SI_COUNT:
10513 case V4SI_FTYPE_V4SI_V4SI_COUNT:
10514 case V4SI_FTYPE_V4SI_SI_COUNT:
10515 case V4HI_FTYPE_V4HI_V4HI_COUNT:
10516 case V4HI_FTYPE_V4HI_SI_COUNT:
10517 case V2DI_FTYPE_V2DI_V2DI_COUNT:
10518 case V2DI_FTYPE_V2DI_SI_COUNT:
10519 case V2SI_FTYPE_V2SI_V2SI_COUNT:
10520 case V2SI_FTYPE_V2SI_SI_COUNT:
10521 case V1DI_FTYPE_V1DI_V1DI_COUNT:
10522 case V1DI_FTYPE_V1DI_SI_COUNT:
10523 nargs = 2;
10524 second_arg_count = true;
10525 break;
10526 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
10527 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
10528 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
10529 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
10530 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
10531 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
10532 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
10533 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
10534 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
10535 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
10536 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
10537 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
10538 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
10539 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
10540 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
10541 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
10542 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
10543 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
10544 nargs = 4;
10545 second_arg_count = true;
10546 break;
10547 case UINT64_FTYPE_UINT64_UINT64:
10548 case UINT_FTYPE_UINT_UINT:
10549 case UINT_FTYPE_UINT_USHORT:
10550 case UINT_FTYPE_UINT_UCHAR:
10551 case UINT16_FTYPE_UINT16_INT:
10552 case UINT8_FTYPE_UINT8_INT:
10553 case UQI_FTYPE_UQI_UQI:
10554 case UHI_FTYPE_UHI_UHI:
10555 case USI_FTYPE_USI_USI:
10556 case UDI_FTYPE_UDI_UDI:
10557 case V16SI_FTYPE_V8DF_V8DF:
4f0e90fa
HL
10558 case V32HI_FTYPE_V16SF_V16SF:
10559 case V16HI_FTYPE_V8SF_V8SF:
10560 case V8HI_FTYPE_V4SF_V4SF:
10561 case V16HI_FTYPE_V16SF_UHI:
10562 case V8HI_FTYPE_V8SF_UQI:
10563 case V8HI_FTYPE_V4SF_UQI:
2bf6d935
ML
10564 nargs = 2;
10565 break;
10566 case V2DI_FTYPE_V2DI_INT_CONVERT:
10567 nargs = 2;
10568 rmode = V1TImode;
10569 nargs_constant = 1;
10570 break;
10571 case V4DI_FTYPE_V4DI_INT_CONVERT:
10572 nargs = 2;
10573 rmode = V2TImode;
10574 nargs_constant = 1;
10575 break;
10576 case V8DI_FTYPE_V8DI_INT_CONVERT:
10577 nargs = 2;
10578 rmode = V4TImode;
10579 nargs_constant = 1;
10580 break;
10581 case V8HI_FTYPE_V8HI_INT:
10582 case V8HI_FTYPE_V8SF_INT:
10583 case V16HI_FTYPE_V16SF_INT:
10584 case V8HI_FTYPE_V4SF_INT:
10585 case V8SF_FTYPE_V8SF_INT:
10586 case V4SF_FTYPE_V16SF_INT:
10587 case V16SF_FTYPE_V16SF_INT:
10588 case V4SI_FTYPE_V4SI_INT:
10589 case V4SI_FTYPE_V8SI_INT:
10590 case V4HI_FTYPE_V4HI_INT:
10591 case V4DF_FTYPE_V4DF_INT:
10592 case V4DF_FTYPE_V8DF_INT:
10593 case V4SF_FTYPE_V4SF_INT:
10594 case V4SF_FTYPE_V8SF_INT:
10595 case V2DI_FTYPE_V2DI_INT:
10596 case V2DF_FTYPE_V2DF_INT:
10597 case V2DF_FTYPE_V4DF_INT:
10598 case V16HI_FTYPE_V16HI_INT:
10599 case V8SI_FTYPE_V8SI_INT:
10600 case V16SI_FTYPE_V16SI_INT:
10601 case V4SI_FTYPE_V16SI_INT:
10602 case V4DI_FTYPE_V4DI_INT:
10603 case V2DI_FTYPE_V4DI_INT:
10604 case V4DI_FTYPE_V8DI_INT:
2bf6d935
ML
10605 case UQI_FTYPE_UQI_UQI_CONST:
10606 case UHI_FTYPE_UHI_UQI:
10607 case USI_FTYPE_USI_UQI:
10608 case UDI_FTYPE_UDI_UQI:
10609 nargs = 2;
10610 nargs_constant = 1;
10611 break;
10612 case V16QI_FTYPE_V16QI_V16QI_V16QI:
10613 case V8SF_FTYPE_V8SF_V8SF_V8SF:
10614 case V4DF_FTYPE_V4DF_V4DF_V4DF:
10615 case V4SF_FTYPE_V4SF_V4SF_V4SF:
10616 case V2DF_FTYPE_V2DF_V2DF_V2DF:
10617 case V32QI_FTYPE_V32QI_V32QI_V32QI:
10618 case UHI_FTYPE_V16SI_V16SI_UHI:
10619 case UQI_FTYPE_V8DI_V8DI_UQI:
10620 case V16HI_FTYPE_V16SI_V16HI_UHI:
10621 case V16QI_FTYPE_V16SI_V16QI_UHI:
10622 case V16QI_FTYPE_V8DI_V16QI_UQI:
4204740f 10623 case V32HF_FTYPE_V32HF_V32HF_USI:
2bf6d935
ML
10624 case V16SF_FTYPE_V16SF_V16SF_UHI:
10625 case V16SF_FTYPE_V4SF_V16SF_UHI:
10626 case V16SI_FTYPE_SI_V16SI_UHI:
10627 case V16SI_FTYPE_V16HI_V16SI_UHI:
10628 case V16SI_FTYPE_V16QI_V16SI_UHI:
10629 case V8SF_FTYPE_V4SF_V8SF_UQI:
10630 case V4DF_FTYPE_V2DF_V4DF_UQI:
10631 case V8SI_FTYPE_V4SI_V8SI_UQI:
10632 case V8SI_FTYPE_SI_V8SI_UQI:
10633 case V4SI_FTYPE_V4SI_V4SI_UQI:
10634 case V4SI_FTYPE_SI_V4SI_UQI:
10635 case V4DI_FTYPE_V2DI_V4DI_UQI:
10636 case V4DI_FTYPE_DI_V4DI_UQI:
10637 case V2DI_FTYPE_V2DI_V2DI_UQI:
10638 case V2DI_FTYPE_DI_V2DI_UQI:
10639 case V64QI_FTYPE_V64QI_V64QI_UDI:
10640 case V64QI_FTYPE_V16QI_V64QI_UDI:
10641 case V64QI_FTYPE_QI_V64QI_UDI:
10642 case V32QI_FTYPE_V32QI_V32QI_USI:
10643 case V32QI_FTYPE_V16QI_V32QI_USI:
10644 case V32QI_FTYPE_QI_V32QI_USI:
10645 case V16QI_FTYPE_V16QI_V16QI_UHI:
10646 case V16QI_FTYPE_QI_V16QI_UHI:
10647 case V32HI_FTYPE_V8HI_V32HI_USI:
10648 case V32HI_FTYPE_HI_V32HI_USI:
10649 case V16HI_FTYPE_V8HI_V16HI_UHI:
10650 case V16HI_FTYPE_HI_V16HI_UHI:
10651 case V8HI_FTYPE_V8HI_V8HI_UQI:
10652 case V8HI_FTYPE_HI_V8HI_UQI:
4204740f 10653 case V16HF_FTYPE_V16HF_V16HF_UHI:
2bf6d935
ML
10654 case V8SF_FTYPE_V8HI_V8SF_UQI:
10655 case V4SF_FTYPE_V8HI_V4SF_UQI:
bd610db0 10656 case V8SI_FTYPE_V8HF_V8SI_UQI:
5a744e50 10657 case V8SF_FTYPE_V8HF_V8SF_UQI:
2bf6d935
ML
10658 case V8SI_FTYPE_V8SF_V8SI_UQI:
10659 case V4SI_FTYPE_V4SF_V4SI_UQI:
bd610db0 10660 case V4SI_FTYPE_V8HF_V4SI_UQI:
5a744e50 10661 case V4SF_FTYPE_V8HF_V4SF_UQI:
bd610db0 10662 case V4DI_FTYPE_V8HF_V4DI_UQI:
2bf6d935 10663 case V4DI_FTYPE_V4SF_V4DI_UQI:
bd610db0 10664 case V2DI_FTYPE_V8HF_V2DI_UQI:
2bf6d935 10665 case V2DI_FTYPE_V4SF_V2DI_UQI:
4204740f 10666 case V8HF_FTYPE_V8HF_V8HF_UQI:
081070bc 10667 case V8HF_FTYPE_V8HF_V8HF_V8HF:
be0e4c32 10668 case V8HF_FTYPE_V8HI_V8HF_UQI:
10669 case V8HF_FTYPE_V8SI_V8HF_UQI:
5a744e50 10670 case V8HF_FTYPE_V8SF_V8HF_UQI:
be0e4c32 10671 case V8HF_FTYPE_V4SI_V8HF_UQI:
5a744e50 10672 case V8HF_FTYPE_V4SF_V8HF_UQI:
be0e4c32 10673 case V8HF_FTYPE_V4DI_V8HF_UQI:
5a744e50 10674 case V8HF_FTYPE_V4DF_V8HF_UQI:
be0e4c32 10675 case V8HF_FTYPE_V2DI_V8HF_UQI:
5a744e50 10676 case V8HF_FTYPE_V2DF_V8HF_UQI:
2bf6d935
ML
10677 case V4SF_FTYPE_V4DI_V4SF_UQI:
10678 case V4SF_FTYPE_V2DI_V4SF_UQI:
10679 case V4DF_FTYPE_V4DI_V4DF_UQI:
5a744e50 10680 case V4DF_FTYPE_V8HF_V4DF_UQI:
10681 case V2DF_FTYPE_V8HF_V2DF_UQI:
2bf6d935
ML
10682 case V2DF_FTYPE_V2DI_V2DF_UQI:
10683 case V16QI_FTYPE_V8HI_V16QI_UQI:
10684 case V16QI_FTYPE_V16HI_V16QI_UHI:
10685 case V16QI_FTYPE_V4SI_V16QI_UQI:
10686 case V16QI_FTYPE_V8SI_V16QI_UQI:
bd610db0 10687 case V8HI_FTYPE_V8HF_V8HI_UQI:
2bf6d935
ML
10688 case V8HI_FTYPE_V4SI_V8HI_UQI:
10689 case V8HI_FTYPE_V8SI_V8HI_UQI:
10690 case V16QI_FTYPE_V2DI_V16QI_UQI:
10691 case V16QI_FTYPE_V4DI_V16QI_UQI:
10692 case V8HI_FTYPE_V2DI_V8HI_UQI:
10693 case V8HI_FTYPE_V4DI_V8HI_UQI:
10694 case V4SI_FTYPE_V2DI_V4SI_UQI:
10695 case V4SI_FTYPE_V4DI_V4SI_UQI:
10696 case V32QI_FTYPE_V32HI_V32QI_USI:
10697 case UHI_FTYPE_V16QI_V16QI_UHI:
10698 case USI_FTYPE_V32QI_V32QI_USI:
10699 case UDI_FTYPE_V64QI_V64QI_UDI:
10700 case UQI_FTYPE_V8HI_V8HI_UQI:
10701 case UHI_FTYPE_V16HI_V16HI_UHI:
10702 case USI_FTYPE_V32HI_V32HI_USI:
10703 case UQI_FTYPE_V4SI_V4SI_UQI:
10704 case UQI_FTYPE_V8SI_V8SI_UQI:
10705 case UQI_FTYPE_V2DI_V2DI_UQI:
10706 case UQI_FTYPE_V4DI_V4DI_UQI:
10707 case V4SF_FTYPE_V2DF_V4SF_UQI:
10708 case V4SF_FTYPE_V4DF_V4SF_UQI:
10709 case V16SI_FTYPE_V16SI_V16SI_UHI:
10710 case V16SI_FTYPE_V4SI_V16SI_UHI:
10711 case V2DI_FTYPE_V4SI_V2DI_UQI:
10712 case V2DI_FTYPE_V8HI_V2DI_UQI:
10713 case V2DI_FTYPE_V16QI_V2DI_UQI:
10714 case V4DI_FTYPE_V4DI_V4DI_UQI:
10715 case V4DI_FTYPE_V4SI_V4DI_UQI:
10716 case V4DI_FTYPE_V8HI_V4DI_UQI:
10717 case V4DI_FTYPE_V16QI_V4DI_UQI:
10718 case V4DI_FTYPE_V4DF_V4DI_UQI:
10719 case V2DI_FTYPE_V2DF_V2DI_UQI:
10720 case V4SI_FTYPE_V4DF_V4SI_UQI:
10721 case V4SI_FTYPE_V2DF_V4SI_UQI:
10722 case V4SI_FTYPE_V8HI_V4SI_UQI:
10723 case V4SI_FTYPE_V16QI_V4SI_UQI:
10724 case V4DI_FTYPE_V4DI_V4DI_V4DI:
10725 case V8DF_FTYPE_V2DF_V8DF_UQI:
10726 case V8DF_FTYPE_V4DF_V8DF_UQI:
10727 case V8DF_FTYPE_V8DF_V8DF_UQI:
10728 case V8SF_FTYPE_V8SF_V8SF_UQI:
10729 case V8SF_FTYPE_V8SI_V8SF_UQI:
10730 case V4DF_FTYPE_V4DF_V4DF_UQI:
10731 case V4SF_FTYPE_V4SF_V4SF_UQI:
10732 case V2DF_FTYPE_V2DF_V2DF_UQI:
10733 case V2DF_FTYPE_V4SF_V2DF_UQI:
10734 case V2DF_FTYPE_V4SI_V2DF_UQI:
10735 case V4SF_FTYPE_V4SI_V4SF_UQI:
10736 case V4DF_FTYPE_V4SF_V4DF_UQI:
10737 case V4DF_FTYPE_V4SI_V4DF_UQI:
10738 case V8SI_FTYPE_V8SI_V8SI_UQI:
10739 case V8SI_FTYPE_V8HI_V8SI_UQI:
10740 case V8SI_FTYPE_V16QI_V8SI_UQI:
10741 case V8DF_FTYPE_V8SI_V8DF_UQI:
10742 case V8DI_FTYPE_DI_V8DI_UQI:
10743 case V16SF_FTYPE_V8SF_V16SF_UHI:
10744 case V16SI_FTYPE_V8SI_V16SI_UHI:
be0e4c32 10745 case V16HF_FTYPE_V16HI_V16HF_UHI:
081070bc 10746 case V16HF_FTYPE_V16HF_V16HF_V16HF:
bd610db0 10747 case V16HI_FTYPE_V16HF_V16HI_UHI:
2bf6d935
ML
10748 case V16HI_FTYPE_V16HI_V16HI_UHI:
10749 case V8HI_FTYPE_V16QI_V8HI_UQI:
10750 case V16HI_FTYPE_V16QI_V16HI_UHI:
10751 case V32HI_FTYPE_V32HI_V32HI_USI:
10752 case V32HI_FTYPE_V32QI_V32HI_USI:
10753 case V8DI_FTYPE_V16QI_V8DI_UQI:
10754 case V8DI_FTYPE_V2DI_V8DI_UQI:
10755 case V8DI_FTYPE_V4DI_V8DI_UQI:
10756 case V8DI_FTYPE_V8DI_V8DI_UQI:
10757 case V8DI_FTYPE_V8HI_V8DI_UQI:
10758 case V8DI_FTYPE_V8SI_V8DI_UQI:
10759 case V8HI_FTYPE_V8DI_V8HI_UQI:
10760 case V8SI_FTYPE_V8DI_V8SI_UQI:
10761 case V4SI_FTYPE_V4SI_V4SI_V4SI:
10762 case V16SI_FTYPE_V16SI_V16SI_V16SI:
10763 case V8DI_FTYPE_V8DI_V8DI_V8DI:
10764 case V32HI_FTYPE_V32HI_V32HI_V32HI:
10765 case V2DI_FTYPE_V2DI_V2DI_V2DI:
10766 case V16HI_FTYPE_V16HI_V16HI_V16HI:
10767 case V8SI_FTYPE_V8SI_V8SI_V8SI:
10768 case V8HI_FTYPE_V8HI_V8HI_V8HI:
4f0e90fa
HL
10769 case V32HI_FTYPE_V16SF_V16SF_USI:
10770 case V16HI_FTYPE_V8SF_V8SF_UHI:
10771 case V8HI_FTYPE_V4SF_V4SF_UQI:
10772 case V16HI_FTYPE_V16SF_V16HI_UHI:
10773 case V8HI_FTYPE_V8SF_V8HI_UQI:
10774 case V8HI_FTYPE_V4SF_V8HI_UQI:
10775 case V16SF_FTYPE_V16SF_V32HI_V32HI:
10776 case V8SF_FTYPE_V8SF_V16HI_V16HI:
10777 case V4SF_FTYPE_V4SF_V8HI_V8HI:
2bf6d935
ML
10778 nargs = 3;
10779 break;
10780 case V32QI_FTYPE_V32QI_V32QI_INT:
10781 case V16HI_FTYPE_V16HI_V16HI_INT:
10782 case V16QI_FTYPE_V16QI_V16QI_INT:
10783 case V4DI_FTYPE_V4DI_V4DI_INT:
10784 case V8HI_FTYPE_V8HI_V8HI_INT:
10785 case V8SI_FTYPE_V8SI_V8SI_INT:
10786 case V8SI_FTYPE_V8SI_V4SI_INT:
10787 case V8SF_FTYPE_V8SF_V8SF_INT:
10788 case V8SF_FTYPE_V8SF_V4SF_INT:
10789 case V4SI_FTYPE_V4SI_V4SI_INT:
10790 case V4DF_FTYPE_V4DF_V4DF_INT:
10791 case V16SF_FTYPE_V16SF_V16SF_INT:
10792 case V16SF_FTYPE_V16SF_V4SF_INT:
10793 case V16SI_FTYPE_V16SI_V4SI_INT:
10794 case V4DF_FTYPE_V4DF_V2DF_INT:
10795 case V4SF_FTYPE_V4SF_V4SF_INT:
10796 case V2DI_FTYPE_V2DI_V2DI_INT:
10797 case V4DI_FTYPE_V4DI_V2DI_INT:
10798 case V2DF_FTYPE_V2DF_V2DF_INT:
10799 case UQI_FTYPE_V8DI_V8UDI_INT:
10800 case UQI_FTYPE_V8DF_V8DF_INT:
10801 case UQI_FTYPE_V2DF_V2DF_INT:
10802 case UQI_FTYPE_V4SF_V4SF_INT:
10803 case UHI_FTYPE_V16SI_V16SI_INT:
10804 case UHI_FTYPE_V16SF_V16SF_INT:
10805 case V64QI_FTYPE_V64QI_V64QI_INT:
10806 case V32HI_FTYPE_V32HI_V32HI_INT:
10807 case V16SI_FTYPE_V16SI_V16SI_INT:
10808 case V8DI_FTYPE_V8DI_V8DI_INT:
10809 nargs = 3;
10810 nargs_constant = 1;
10811 break;
10812 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
10813 nargs = 3;
10814 rmode = V4DImode;
10815 nargs_constant = 1;
10816 break;
10817 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
10818 nargs = 3;
10819 rmode = V2DImode;
10820 nargs_constant = 1;
10821 break;
10822 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
10823 nargs = 3;
10824 rmode = DImode;
10825 nargs_constant = 1;
10826 break;
10827 case V2DI_FTYPE_V2DI_UINT_UINT:
10828 nargs = 3;
10829 nargs_constant = 2;
10830 break;
10831 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
10832 nargs = 3;
10833 rmode = V8DImode;
10834 nargs_constant = 1;
10835 break;
10836 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
10837 nargs = 5;
10838 rmode = V8DImode;
10839 mask_pos = 2;
10840 nargs_constant = 1;
10841 break;
10842 case QI_FTYPE_V8DF_INT_UQI:
10843 case QI_FTYPE_V4DF_INT_UQI:
10844 case QI_FTYPE_V2DF_INT_UQI:
10845 case HI_FTYPE_V16SF_INT_UHI:
10846 case QI_FTYPE_V8SF_INT_UQI:
10847 case QI_FTYPE_V4SF_INT_UQI:
8486e9f2 10848 case QI_FTYPE_V8HF_INT_UQI:
10849 case HI_FTYPE_V16HF_INT_UHI:
10850 case SI_FTYPE_V32HF_INT_USI:
2bf6d935
ML
10851 case V4SI_FTYPE_V4SI_V4SI_UHI:
10852 case V8SI_FTYPE_V8SI_V8SI_UHI:
10853 nargs = 3;
10854 mask_pos = 1;
10855 nargs_constant = 1;
10856 break;
10857 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
10858 nargs = 5;
10859 rmode = V4DImode;
10860 mask_pos = 2;
10861 nargs_constant = 1;
10862 break;
10863 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
10864 nargs = 5;
10865 rmode = V2DImode;
10866 mask_pos = 2;
10867 nargs_constant = 1;
10868 break;
10869 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
10870 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
10871 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
10872 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
10873 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
10874 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
10875 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
10876 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
10877 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
10878 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
10879 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
10880 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
10881 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
10882 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
10883 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
10884 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
bd7a34ef 10885 case V32HF_FTYPE_V32HF_V32HF_V32HF_USI:
2bf6d935
ML
10886 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
10887 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
10888 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
10889 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
10890 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
10891 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
10892 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
10893 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
10894 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
10895 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
10896 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
10897 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
10898 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
10899 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
10900 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
10901 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
10902 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
081070bc 10903 case V16HF_FTYPE_V16HF_V16HF_V16HF_UQI:
bd7a34ef 10904 case V16HF_FTYPE_V16HF_V16HF_V16HF_UHI:
2bf6d935
ML
10905 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
10906 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
10907 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
10908 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
10909 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
10910 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
10911 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
bd7a34ef 10912 case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI:
2bf6d935
ML
10913 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
10914 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
10915 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
10916 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
10917 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
10918 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
10919 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
10920 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
10921 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
10922 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
10923 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
4f0e90fa
HL
10924 case V32HI_FTYPE_V16SF_V16SF_V32HI_USI:
10925 case V16HI_FTYPE_V8SF_V8SF_V16HI_UHI:
10926 case V8HI_FTYPE_V4SF_V4SF_V8HI_UQI:
2bf6d935
ML
10927 nargs = 4;
10928 break;
10929 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
10930 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
10931 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
10932 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
10933 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
10934 nargs = 4;
10935 nargs_constant = 1;
10936 break;
10937 case UQI_FTYPE_V4DI_V4DI_INT_UQI:
10938 case UQI_FTYPE_V8SI_V8SI_INT_UQI:
10939 case QI_FTYPE_V4DF_V4DF_INT_UQI:
10940 case QI_FTYPE_V8SF_V8SF_INT_UQI:
0f200733 10941 case UHI_FTYPE_V16HF_V16HF_INT_UHI:
2bf6d935
ML
10942 case UQI_FTYPE_V2DI_V2DI_INT_UQI:
10943 case UQI_FTYPE_V4SI_V4SI_INT_UQI:
10944 case UQI_FTYPE_V2DF_V2DF_INT_UQI:
10945 case UQI_FTYPE_V4SF_V4SF_INT_UQI:
0f200733 10946 case UQI_FTYPE_V8HF_V8HF_INT_UQI:
2bf6d935
ML
10947 case UDI_FTYPE_V64QI_V64QI_INT_UDI:
10948 case USI_FTYPE_V32QI_V32QI_INT_USI:
10949 case UHI_FTYPE_V16QI_V16QI_INT_UHI:
10950 case USI_FTYPE_V32HI_V32HI_INT_USI:
0f200733 10951 case USI_FTYPE_V32HF_V32HF_INT_USI:
2bf6d935
ML
10952 case UHI_FTYPE_V16HI_V16HI_INT_UHI:
10953 case UQI_FTYPE_V8HI_V8HI_INT_UQI:
2bf6d935
ML
10954 nargs = 4;
10955 mask_pos = 1;
10956 nargs_constant = 1;
10957 break;
10958 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
10959 nargs = 4;
10960 nargs_constant = 2;
10961 break;
10962 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
10963 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
4f0e90fa
HL
10964 case V16SF_FTYPE_V16SF_V32HI_V32HI_UHI:
10965 case V8SF_FTYPE_V8SF_V16HI_V16HI_UQI:
10966 case V4SF_FTYPE_V4SF_V8HI_V8HI_UQI:
2bf6d935
ML
10967 nargs = 4;
10968 break;
10969 case UQI_FTYPE_V8DI_V8DI_INT_UQI:
10970 case UHI_FTYPE_V16SI_V16SI_INT_UHI:
10971 mask_pos = 1;
10972 nargs = 4;
10973 nargs_constant = 1;
10974 break;
10975 case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
10976 case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
10977 case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
10978 case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
10979 case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
10980 case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
10981 case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
10982 case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
10983 case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
10984 case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
10985 case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
10986 case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
10987 case V32HI_FTYPE_V32HI_INT_V32HI_USI:
10988 case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
10989 case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
10990 case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
10991 case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
10992 case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
10993 case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
10994 case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
10995 case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
10996 case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
10997 case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
10998 case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
10999 case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
8bed7617 11000 case V16HF_FTYPE_V16HF_INT_V16HF_UHI:
11001 case V8HF_FTYPE_V8HF_INT_V8HF_UQI:
2bf6d935
ML
11002 case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
11003 case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
11004 case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
11005 case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
11006 case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
11007 nargs = 4;
11008 mask_pos = 2;
11009 nargs_constant = 1;
11010 break;
11011 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
11012 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
11013 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
11014 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
11015 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
11016 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
11017 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
11018 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
11019 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
11020 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
11021 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
11022 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
11023 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
11024 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
11025 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
11026 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
11027 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
11028 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
11029 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
11030 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
11031 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
11032 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
11033 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
11034 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
11035 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
11036 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
11037 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
11038 nargs = 5;
11039 mask_pos = 2;
11040 nargs_constant = 1;
11041 break;
11042 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
11043 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
11044 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
11045 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
11046 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
11047 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
11048 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
11049 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
11050 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
11051 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
11052 nargs = 5;
11053 mask_pos = 1;
11054 nargs_constant = 1;
11055 break;
11056 case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
11057 case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
11058 case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
11059 case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT:
11060 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT:
11061 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT:
11062 case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT:
11063 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT:
11064 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT:
11065 case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT:
11066 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT:
11067 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT:
11068 nargs = 5;
11069 mask_pos = 1;
11070 nargs_constant = 2;
11071 break;
11072
11073 default:
11074 gcc_unreachable ();
11075 }
11076
715a8bc8 11077 gcc_assert (nargs <= ARRAY_SIZE (xops));
2bf6d935
ML
11078
11079 if (comparison != UNKNOWN)
11080 {
11081 gcc_assert (nargs == 2);
11082 return ix86_expand_sse_compare (d, exp, target, swap);
11083 }
11084
11085 if (rmode == VOIDmode || rmode == tmode)
11086 {
11087 if (optimize
11088 || target == 0
11089 || GET_MODE (target) != tmode
11090 || !insn_p->operand[0].predicate (target, tmode))
11091 target = gen_reg_rtx (tmode);
11092 else if (memory_operand (target, tmode))
11093 num_memory++;
11094 real_target = target;
11095 }
11096 else
11097 {
11098 real_target = gen_reg_rtx (tmode);
11099 target = lowpart_subreg (rmode, real_target, tmode);
11100 }
11101
11102 for (i = 0; i < nargs; i++)
11103 {
11104 tree arg = CALL_EXPR_ARG (exp, i);
11105 rtx op = expand_normal (arg);
11106 machine_mode mode = insn_p->operand[i + 1].mode;
11107 bool match = insn_p->operand[i + 1].predicate (op, mode);
11108
11109 if (second_arg_count && i == 1)
11110 {
11111 /* SIMD shift insns take either an 8-bit immediate or
11112 register as count. But builtin functions take int as
11113 count. If count doesn't match, we put it in register.
11114 The instructions are using 64-bit count, if op is just
11115 32-bit, zero-extend it, as negative shift counts
11116 are undefined behavior and zero-extension is more
11117 efficient. */
11118 if (!match)
11119 {
11120 if (SCALAR_INT_MODE_P (GET_MODE (op)))
11121 op = convert_modes (mode, GET_MODE (op), op, 1);
11122 else
11123 op = lowpart_subreg (mode, op, GET_MODE (op));
11124 if (!insn_p->operand[i + 1].predicate (op, mode))
11125 op = copy_to_reg (op);
11126 }
11127 }
11128 else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
11129 (!mask_pos && (nargs - i) <= nargs_constant))
11130 {
11131 if (!match)
11132 switch (icode)
11133 {
11134 case CODE_FOR_avx_vinsertf128v4di:
11135 case CODE_FOR_avx_vextractf128v4di:
11136 error ("the last argument must be an 1-bit immediate");
11137 return const0_rtx;
11138
11139 case CODE_FOR_avx512f_cmpv8di3_mask:
11140 case CODE_FOR_avx512f_cmpv16si3_mask:
11141 case CODE_FOR_avx512f_ucmpv8di3_mask:
11142 case CODE_FOR_avx512f_ucmpv16si3_mask:
11143 case CODE_FOR_avx512vl_cmpv4di3_mask:
11144 case CODE_FOR_avx512vl_cmpv8si3_mask:
11145 case CODE_FOR_avx512vl_ucmpv4di3_mask:
11146 case CODE_FOR_avx512vl_ucmpv8si3_mask:
11147 case CODE_FOR_avx512vl_cmpv2di3_mask:
11148 case CODE_FOR_avx512vl_cmpv4si3_mask:
11149 case CODE_FOR_avx512vl_ucmpv2di3_mask:
11150 case CODE_FOR_avx512vl_ucmpv4si3_mask:
11151 error ("the last argument must be a 3-bit immediate");
11152 return const0_rtx;
11153
11154 case CODE_FOR_sse4_1_roundsd:
11155 case CODE_FOR_sse4_1_roundss:
11156
11157 case CODE_FOR_sse4_1_roundpd:
11158 case CODE_FOR_sse4_1_roundps:
11159 case CODE_FOR_avx_roundpd256:
11160 case CODE_FOR_avx_roundps256:
11161
11162 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
11163 case CODE_FOR_sse4_1_roundps_sfix:
11164 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
11165 case CODE_FOR_avx_roundps_sfix256:
11166
11167 case CODE_FOR_sse4_1_blendps:
11168 case CODE_FOR_avx_blendpd256:
11169 case CODE_FOR_avx_vpermilv4df:
11170 case CODE_FOR_avx_vpermilv4df_mask:
11171 case CODE_FOR_avx512f_getmantv8df_mask:
11172 case CODE_FOR_avx512f_getmantv16sf_mask:
8486e9f2 11173 case CODE_FOR_avx512vl_getmantv16hf_mask:
2bf6d935
ML
11174 case CODE_FOR_avx512vl_getmantv8sf_mask:
11175 case CODE_FOR_avx512vl_getmantv4df_mask:
8486e9f2 11176 case CODE_FOR_avx512fp16_getmantv8hf_mask:
2bf6d935
ML
11177 case CODE_FOR_avx512vl_getmantv4sf_mask:
11178 case CODE_FOR_avx512vl_getmantv2df_mask:
11179 case CODE_FOR_avx512dq_rangepv8df_mask_round:
11180 case CODE_FOR_avx512dq_rangepv16sf_mask_round:
11181 case CODE_FOR_avx512dq_rangepv4df_mask:
11182 case CODE_FOR_avx512dq_rangepv8sf_mask:
11183 case CODE_FOR_avx512dq_rangepv2df_mask:
11184 case CODE_FOR_avx512dq_rangepv4sf_mask:
11185 case CODE_FOR_avx_shufpd256_mask:
11186 error ("the last argument must be a 4-bit immediate");
11187 return const0_rtx;
11188
11189 case CODE_FOR_sha1rnds4:
11190 case CODE_FOR_sse4_1_blendpd:
11191 case CODE_FOR_avx_vpermilv2df:
11192 case CODE_FOR_avx_vpermilv2df_mask:
11193 case CODE_FOR_xop_vpermil2v2df3:
11194 case CODE_FOR_xop_vpermil2v4sf3:
11195 case CODE_FOR_xop_vpermil2v4df3:
11196 case CODE_FOR_xop_vpermil2v8sf3:
11197 case CODE_FOR_avx512f_vinsertf32x4_mask:
11198 case CODE_FOR_avx512f_vinserti32x4_mask:
11199 case CODE_FOR_avx512f_vextractf32x4_mask:
11200 case CODE_FOR_avx512f_vextracti32x4_mask:
11201 case CODE_FOR_sse2_shufpd:
11202 case CODE_FOR_sse2_shufpd_mask:
11203 case CODE_FOR_avx512dq_shuf_f64x2_mask:
11204 case CODE_FOR_avx512dq_shuf_i64x2_mask:
11205 case CODE_FOR_avx512vl_shuf_i32x4_mask:
11206 case CODE_FOR_avx512vl_shuf_f32x4_mask:
11207 error ("the last argument must be a 2-bit immediate");
11208 return const0_rtx;
11209
11210 case CODE_FOR_avx_vextractf128v4df:
11211 case CODE_FOR_avx_vextractf128v8sf:
11212 case CODE_FOR_avx_vextractf128v8si:
11213 case CODE_FOR_avx_vinsertf128v4df:
11214 case CODE_FOR_avx_vinsertf128v8sf:
11215 case CODE_FOR_avx_vinsertf128v8si:
11216 case CODE_FOR_avx512f_vinsertf64x4_mask:
11217 case CODE_FOR_avx512f_vinserti64x4_mask:
11218 case CODE_FOR_avx512f_vextractf64x4_mask:
11219 case CODE_FOR_avx512f_vextracti64x4_mask:
11220 case CODE_FOR_avx512dq_vinsertf32x8_mask:
11221 case CODE_FOR_avx512dq_vinserti32x8_mask:
11222 case CODE_FOR_avx512vl_vinsertv4df:
11223 case CODE_FOR_avx512vl_vinsertv4di:
11224 case CODE_FOR_avx512vl_vinsertv8sf:
11225 case CODE_FOR_avx512vl_vinsertv8si:
11226 error ("the last argument must be a 1-bit immediate");
11227 return const0_rtx;
11228
11229 case CODE_FOR_avx_vmcmpv2df3:
11230 case CODE_FOR_avx_vmcmpv4sf3:
11231 case CODE_FOR_avx_cmpv2df3:
11232 case CODE_FOR_avx_cmpv4sf3:
11233 case CODE_FOR_avx_cmpv4df3:
11234 case CODE_FOR_avx_cmpv8sf3:
11235 case CODE_FOR_avx512f_cmpv8df3_mask:
11236 case CODE_FOR_avx512f_cmpv16sf3_mask:
11237 case CODE_FOR_avx512f_vmcmpv2df3_mask:
11238 case CODE_FOR_avx512f_vmcmpv4sf3_mask:
0f200733 11239 case CODE_FOR_avx512bw_cmpv32hf3_mask:
11240 case CODE_FOR_avx512vl_cmpv16hf3_mask:
11241 case CODE_FOR_avx512fp16_cmpv8hf3_mask:
2bf6d935
ML
11242 error ("the last argument must be a 5-bit immediate");
11243 return const0_rtx;
11244
11245 default:
11246 switch (nargs_constant)
11247 {
11248 case 2:
11249 if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
11250 (!mask_pos && (nargs - i) == nargs_constant))
11251 {
11252 error ("the next to last argument must be an 8-bit immediate");
11253 break;
11254 }
11255 /* FALLTHRU */
11256 case 1:
11257 error ("the last argument must be an 8-bit immediate");
11258 break;
11259 default:
11260 gcc_unreachable ();
11261 }
11262 return const0_rtx;
11263 }
11264 }
11265 else
11266 {
11267 if (VECTOR_MODE_P (mode))
11268 op = safe_vector_operand (op, mode);
11269
11270 /* If we aren't optimizing, only allow one memory operand to
11271 be generated. */
11272 if (memory_operand (op, mode))
11273 num_memory++;
11274
11275 op = fixup_modeless_constant (op, mode);
11276
11277 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
11278 {
11279 if (optimize || !match || num_memory > 1)
11280 op = copy_to_mode_reg (mode, op);
11281 }
11282 else
11283 {
11284 op = copy_to_reg (op);
11285 op = lowpart_subreg (mode, op, GET_MODE (op));
11286 }
11287 }
11288
715a8bc8 11289 xops[i] = op;
2bf6d935
ML
11290 }
11291
11292 switch (nargs)
11293 {
11294 case 1:
715a8bc8 11295 pat = GEN_FCN (icode) (real_target, xops[0]);
2bf6d935
ML
11296 break;
11297 case 2:
715a8bc8 11298 pat = GEN_FCN (icode) (real_target, xops[0], xops[1]);
2bf6d935
ML
11299 break;
11300 case 3:
715a8bc8 11301 pat = GEN_FCN (icode) (real_target, xops[0], xops[1], xops[2]);
2bf6d935
ML
11302 break;
11303 case 4:
715a8bc8
UB
11304 pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
11305 xops[2], xops[3]);
2bf6d935
ML
11306 break;
11307 case 5:
715a8bc8
UB
11308 pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
11309 xops[2], xops[3], xops[4]);
2bf6d935
ML
11310 break;
11311 case 6:
715a8bc8
UB
11312 pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
11313 xops[2], xops[3], xops[4], xops[5]);
2bf6d935
ML
11314 break;
11315 default:
11316 gcc_unreachable ();
11317 }
11318
11319 if (! pat)
11320 return 0;
11321
11322 emit_insn (pat);
11323 return target;
11324}
11325
11326/* Transform pattern of following layout:
11327 (set A
11328 (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
11329 )
11330 into:
11331 (set (A B)) */
11332
11333static rtx
11334ix86_erase_embedded_rounding (rtx pat)
11335{
11336 if (GET_CODE (pat) == INSN)
11337 pat = PATTERN (pat);
11338
11339 gcc_assert (GET_CODE (pat) == SET);
11340 rtx src = SET_SRC (pat);
11341 gcc_assert (XVECLEN (src, 0) == 2);
11342 rtx p0 = XVECEXP (src, 0, 0);
11343 gcc_assert (GET_CODE (src) == UNSPEC
11344 && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
11345 rtx res = gen_rtx_SET (SET_DEST (pat), p0);
11346 return res;
11347}
11348
11349/* Subroutine of ix86_expand_round_builtin to take care of comi insns
11350 with rounding. */
11351static rtx
11352ix86_expand_sse_comi_round (const struct builtin_description *d,
11353 tree exp, rtx target)
11354{
11355 rtx pat, set_dst;
11356 tree arg0 = CALL_EXPR_ARG (exp, 0);
11357 tree arg1 = CALL_EXPR_ARG (exp, 1);
11358 tree arg2 = CALL_EXPR_ARG (exp, 2);
11359 tree arg3 = CALL_EXPR_ARG (exp, 3);
11360 rtx op0 = expand_normal (arg0);
11361 rtx op1 = expand_normal (arg1);
11362 rtx op2 = expand_normal (arg2);
11363 rtx op3 = expand_normal (arg3);
11364 enum insn_code icode = d->icode;
11365 const struct insn_data_d *insn_p = &insn_data[icode];
11366 machine_mode mode0 = insn_p->operand[0].mode;
11367 machine_mode mode1 = insn_p->operand[1].mode;
2bf6d935
ML
11368
11369 /* See avxintrin.h for values. */
467e9f38 11370 static const enum rtx_code comparisons[32] =
2bf6d935 11371 {
467e9f38
L
11372 EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
11373 UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED,
11374 EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
11375 UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED
2bf6d935 11376 };
467e9f38
L
11377 static const bool ordereds[32] =
11378 {
11379 true, true, true, false, false, false, false, true,
11380 false, false, false, true, true, true, true, false,
11381 true, true, true, false, false, false, false, true,
11382 false, false, false, true, true, true, true, false
11383 };
11384 static const bool non_signalings[32] =
2bf6d935
ML
11385 {
11386 true, false, false, true, true, false, false, true,
11387 true, false, false, true, true, false, false, true,
11388 false, true, true, false, false, true, true, false,
11389 false, true, true, false, false, true, true, false
11390 };
11391
11392 if (!CONST_INT_P (op2))
11393 {
11394 error ("the third argument must be comparison constant");
11395 return const0_rtx;
11396 }
11397 if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
11398 {
11399 error ("incorrect comparison mode");
11400 return const0_rtx;
11401 }
11402
11403 if (!insn_p->operand[2].predicate (op3, SImode))
11404 {
11405 error ("incorrect rounding operand");
11406 return const0_rtx;
11407 }
11408
2bf6d935
ML
11409 if (VECTOR_MODE_P (mode0))
11410 op0 = safe_vector_operand (op0, mode0);
11411 if (VECTOR_MODE_P (mode1))
11412 op1 = safe_vector_operand (op1, mode1);
11413
467e9f38
L
11414 enum rtx_code comparison = comparisons[INTVAL (op2)];
11415 bool ordered = ordereds[INTVAL (op2)];
11416 bool non_signaling = non_signalings[INTVAL (op2)];
11417 rtx const_val = const0_rtx;
11418
11419 bool check_unordered = false;
11420 machine_mode mode = CCFPmode;
11421 switch (comparison)
11422 {
11423 case ORDERED:
11424 if (!ordered)
11425 {
11426 /* NB: Use CCSmode/NE for _CMP_TRUE_UQ/_CMP_TRUE_US. */
11427 if (!non_signaling)
11428 ordered = true;
11429 mode = CCSmode;
11430 }
11431 else
11432 {
11433 /* NB: Use CCPmode/NE for _CMP_ORD_Q/_CMP_ORD_S. */
11434 if (non_signaling)
11435 ordered = false;
11436 mode = CCPmode;
11437 }
11438 comparison = NE;
11439 break;
11440 case UNORDERED:
11441 if (ordered)
11442 {
11443 /* NB: Use CCSmode/EQ for _CMP_FALSE_OQ/_CMP_FALSE_OS. */
11444 if (non_signaling)
11445 ordered = false;
11446 mode = CCSmode;
11447 }
11448 else
11449 {
11450 /* NB: Use CCPmode/NE for _CMP_UNORD_Q/_CMP_UNORD_S. */
11451 if (!non_signaling)
11452 ordered = true;
11453 mode = CCPmode;
11454 }
11455 comparison = EQ;
11456 break;
11457
11458 case LE: /* -> GE */
11459 case LT: /* -> GT */
11460 case UNGE: /* -> UNLE */
11461 case UNGT: /* -> UNLT */
11462 std::swap (op0, op1);
11463 comparison = swap_condition (comparison);
11464 /* FALLTHRU */
11465 case GT:
11466 case GE:
11467 case UNEQ:
11468 case UNLT:
11469 case UNLE:
11470 case LTGT:
11471 /* These are supported by CCFPmode. NB: Use ordered/signaling
11472 COMI or unordered/non-signaling UCOMI. Both set ZF, PF, CF
11473 with NAN operands. */
11474 if (ordered == non_signaling)
11475 ordered = !ordered;
11476 break;
11477 case EQ:
11478 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
11479 _CMP_EQ_OQ/_CMP_EQ_OS. */
11480 check_unordered = true;
11481 mode = CCZmode;
11482 break;
11483 case NE:
11484 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
11485 _CMP_NEQ_UQ/_CMP_NEQ_US. */
11486 gcc_assert (!ordered);
11487 check_unordered = true;
11488 mode = CCZmode;
11489 const_val = const1_rtx;
11490 break;
11491 default:
11492 gcc_unreachable ();
11493 }
11494
2bf6d935 11495 target = gen_reg_rtx (SImode);
467e9f38 11496 emit_move_insn (target, const_val);
2bf6d935
ML
11497 target = gen_rtx_SUBREG (QImode, target, 0);
11498
11499 if ((optimize && !register_operand (op0, mode0))
11500 || !insn_p->operand[0].predicate (op0, mode0))
11501 op0 = copy_to_mode_reg (mode0, op0);
11502 if ((optimize && !register_operand (op1, mode1))
11503 || !insn_p->operand[1].predicate (op1, mode1))
11504 op1 = copy_to_mode_reg (mode1, op1);
11505
467e9f38
L
11506 /*
11507 1. COMI: ordered and signaling.
11508 2. UCOMI: unordered and non-signaling.
11509 */
11510 if (non_signaling)
11511 icode = (icode == CODE_FOR_sse_comi_round
11512 ? CODE_FOR_sse_ucomi_round
11513 : CODE_FOR_sse2_ucomi_round);
2bf6d935
ML
11514
11515 pat = GEN_FCN (icode) (op0, op1, op3);
11516 if (! pat)
11517 return 0;
11518
11519 /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
11520 if (INTVAL (op3) == NO_ROUND)
11521 {
11522 pat = ix86_erase_embedded_rounding (pat);
11523 if (! pat)
11524 return 0;
11525
11526 set_dst = SET_DEST (pat);
11527 }
11528 else
11529 {
11530 gcc_assert (GET_CODE (pat) == SET);
11531 set_dst = SET_DEST (pat);
11532 }
11533
11534 emit_insn (pat);
467e9f38 11535
ae69e6f6 11536 return ix86_ssecom_setcc (comparison, check_unordered, mode,
11537 set_dst, target);
2bf6d935
ML
11538}
11539
11540static rtx
11541ix86_expand_round_builtin (const struct builtin_description *d,
11542 tree exp, rtx target)
11543{
11544 rtx pat;
11545 unsigned int i, nargs;
715a8bc8 11546 rtx xops[6];
2bf6d935
ML
11547 enum insn_code icode = d->icode;
11548 const struct insn_data_d *insn_p = &insn_data[icode];
11549 machine_mode tmode = insn_p->operand[0].mode;
11550 unsigned int nargs_constant = 0;
11551 unsigned int redundant_embed_rnd = 0;
11552
11553 switch ((enum ix86_builtin_func_type) d->flag)
11554 {
11555 case UINT64_FTYPE_V2DF_INT:
11556 case UINT64_FTYPE_V4SF_INT:
3069a2e5 11557 case UINT64_FTYPE_V8HF_INT:
2bf6d935
ML
11558 case UINT_FTYPE_V2DF_INT:
11559 case UINT_FTYPE_V4SF_INT:
3069a2e5 11560 case UINT_FTYPE_V8HF_INT:
2bf6d935
ML
11561 case INT64_FTYPE_V2DF_INT:
11562 case INT64_FTYPE_V4SF_INT:
3069a2e5 11563 case INT64_FTYPE_V8HF_INT:
2bf6d935
ML
11564 case INT_FTYPE_V2DF_INT:
11565 case INT_FTYPE_V4SF_INT:
3069a2e5 11566 case INT_FTYPE_V8HF_INT:
2bf6d935
ML
11567 nargs = 2;
11568 break;
bd7a34ef 11569 case V32HF_FTYPE_V32HF_V32HF_INT:
71838266 11570 case V8HF_FTYPE_V8HF_V8HF_INT:
3069a2e5 11571 case V8HF_FTYPE_V8HF_INT_INT:
11572 case V8HF_FTYPE_V8HF_UINT_INT:
11573 case V8HF_FTYPE_V8HF_INT64_INT:
11574 case V8HF_FTYPE_V8HF_UINT64_INT:
2bf6d935
ML
11575 case V4SF_FTYPE_V4SF_UINT_INT:
11576 case V4SF_FTYPE_V4SF_UINT64_INT:
11577 case V2DF_FTYPE_V2DF_UINT64_INT:
11578 case V4SF_FTYPE_V4SF_INT_INT:
11579 case V4SF_FTYPE_V4SF_INT64_INT:
11580 case V2DF_FTYPE_V2DF_INT64_INT:
11581 case V4SF_FTYPE_V4SF_V4SF_INT:
11582 case V2DF_FTYPE_V2DF_V2DF_INT:
11583 case V4SF_FTYPE_V4SF_V2DF_INT:
11584 case V2DF_FTYPE_V2DF_V4SF_INT:
11585 nargs = 3;
11586 break;
11587 case V8SF_FTYPE_V8DF_V8SF_QI_INT:
11588 case V8DF_FTYPE_V8DF_V8DF_QI_INT:
bd610db0 11589 case V32HI_FTYPE_V32HF_V32HI_USI_INT:
2bf6d935 11590 case V8SI_FTYPE_V8DF_V8SI_QI_INT:
bd610db0 11591 case V8DI_FTYPE_V8HF_V8DI_UQI_INT:
2bf6d935
ML
11592 case V8DI_FTYPE_V8DF_V8DI_QI_INT:
11593 case V8SF_FTYPE_V8DI_V8SF_QI_INT:
11594 case V8DF_FTYPE_V8DI_V8DF_QI_INT:
5a744e50 11595 case V8DF_FTYPE_V8HF_V8DF_UQI_INT:
11596 case V16SF_FTYPE_V16HF_V16SF_UHI_INT:
be0e4c32 11597 case V32HF_FTYPE_V32HI_V32HF_USI_INT:
4204740f 11598 case V32HF_FTYPE_V32HF_V32HF_USI_INT:
081070bc 11599 case V32HF_FTYPE_V32HF_V32HF_V32HF_INT:
2bf6d935
ML
11600 case V16SF_FTYPE_V16SF_V16SF_HI_INT:
11601 case V8DI_FTYPE_V8SF_V8DI_QI_INT:
11602 case V16SF_FTYPE_V16SI_V16SF_HI_INT:
11603 case V16SI_FTYPE_V16SF_V16SI_HI_INT:
bd610db0 11604 case V16SI_FTYPE_V16HF_V16SI_UHI_INT:
be0e4c32 11605 case V16HF_FTYPE_V16SI_V16HF_UHI_INT:
2bf6d935
ML
11606 case V8DF_FTYPE_V8SF_V8DF_QI_INT:
11607 case V16SF_FTYPE_V16HI_V16SF_HI_INT:
11608 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
11609 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
be0e4c32 11610 case V8HF_FTYPE_V8DI_V8HF_UQI_INT:
5a744e50 11611 case V8HF_FTYPE_V8DF_V8HF_UQI_INT:
11612 case V16HF_FTYPE_V16SF_V16HF_UHI_INT:
3c9de0a9 11613 case V8HF_FTYPE_V8HF_V8HF_V8HF_INT:
2bf6d935
ML
11614 nargs = 4;
11615 break;
11616 case V4SF_FTYPE_V4SF_V4SF_INT_INT:
11617 case V2DF_FTYPE_V2DF_V2DF_INT_INT:
11618 nargs_constant = 2;
11619 nargs = 4;
11620 break;
11621 case INT_FTYPE_V4SF_V4SF_INT_INT:
11622 case INT_FTYPE_V2DF_V2DF_INT_INT:
11623 return ix86_expand_sse_comi_round (d, exp, target);
11624 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
11625 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
11626 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
90429b96 11627 case V4SF_FTYPE_V8HF_V4SF_V4SF_UQI_INT:
2bf6d935 11628 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
081070bc 11629 case V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT:
bd7a34ef 11630 case V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT:
90429b96 11631 case V2DF_FTYPE_V8HF_V2DF_V2DF_UQI_INT:
2bf6d935
ML
11632 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
11633 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
93103603 11634 case V2DF_FTYPE_V2DF_V4SF_V2DF_UQI_INT:
2bf6d935
ML
11635 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
11636 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
93103603 11637 case V4SF_FTYPE_V4SF_V2DF_V4SF_UQI_INT:
71838266 11638 case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT:
90429b96 11639 case V8HF_FTYPE_V2DF_V8HF_V8HF_UQI_INT:
11640 case V8HF_FTYPE_V4SF_V8HF_V8HF_UQI_INT:
2bf6d935
ML
11641 nargs = 5;
11642 break;
8bed7617 11643 case V32HF_FTYPE_V32HF_INT_V32HF_USI_INT:
2bf6d935
ML
11644 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
11645 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
93103603
SP
11646 case V8DF_FTYPE_V8DF_INT_V8DF_UQI_INT:
11647 case V16SF_FTYPE_V16SF_INT_V16SF_UHI_INT:
2bf6d935
ML
11648 nargs_constant = 4;
11649 nargs = 5;
11650 break;
11651 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
11652 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
11653 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
11654 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
0f200733 11655 case USI_FTYPE_V32HF_V32HF_INT_USI_INT:
11656 case UQI_FTYPE_V8HF_V8HF_INT_UQI_INT:
2bf6d935
ML
11657 nargs_constant = 3;
11658 nargs = 5;
11659 break;
11660 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
11661 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
11662 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
11663 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
11664 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
11665 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
8bed7617 11666 case V8HF_FTYPE_V8HF_V8HF_INT_V8HF_UQI_INT:
2bf6d935
ML
11667 nargs = 6;
11668 nargs_constant = 4;
11669 break;
11670 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
11671 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
11672 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
11673 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
11674 nargs = 6;
11675 nargs_constant = 3;
11676 break;
11677 default:
11678 gcc_unreachable ();
11679 }
715a8bc8 11680 gcc_assert (nargs <= ARRAY_SIZE (xops));
2bf6d935
ML
11681
11682 if (optimize
11683 || target == 0
11684 || GET_MODE (target) != tmode
11685 || !insn_p->operand[0].predicate (target, tmode))
11686 target = gen_reg_rtx (tmode);
11687
11688 for (i = 0; i < nargs; i++)
11689 {
11690 tree arg = CALL_EXPR_ARG (exp, i);
11691 rtx op = expand_normal (arg);
11692 machine_mode mode = insn_p->operand[i + 1].mode;
11693 bool match = insn_p->operand[i + 1].predicate (op, mode);
11694
11695 if (i == nargs - nargs_constant)
11696 {
11697 if (!match)
11698 {
11699 switch (icode)
11700 {
11701 case CODE_FOR_avx512f_getmantv8df_mask_round:
11702 case CODE_FOR_avx512f_getmantv16sf_mask_round:
8486e9f2 11703 case CODE_FOR_avx512bw_getmantv32hf_mask_round:
2bf6d935
ML
11704 case CODE_FOR_avx512f_vgetmantv2df_round:
11705 case CODE_FOR_avx512f_vgetmantv2df_mask_round:
11706 case CODE_FOR_avx512f_vgetmantv4sf_round:
11707 case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
8486e9f2 11708 case CODE_FOR_avx512f_vgetmantv8hf_mask_round:
2bf6d935
ML
11709 error ("the immediate argument must be a 4-bit immediate");
11710 return const0_rtx;
11711 case CODE_FOR_avx512f_cmpv8df3_mask_round:
11712 case CODE_FOR_avx512f_cmpv16sf3_mask_round:
11713 case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
11714 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
0f200733 11715 case CODE_FOR_avx512f_vmcmpv8hf3_mask_round:
11716 case CODE_FOR_avx512bw_cmpv32hf3_mask_round:
2bf6d935
ML
11717 error ("the immediate argument must be a 5-bit immediate");
11718 return const0_rtx;
11719 default:
11720 error ("the immediate argument must be an 8-bit immediate");
11721 return const0_rtx;
11722 }
11723 }
11724 }
11725 else if (i == nargs-1)
11726 {
11727 if (!insn_p->operand[nargs].predicate (op, SImode))
11728 {
11729 error ("incorrect rounding operand");
11730 return const0_rtx;
11731 }
11732
11733 /* If there is no rounding use normal version of the pattern. */
11734 if (INTVAL (op) == NO_ROUND)
2f9529fc
HW
11735 {
11736 /* Skip erasing embedded rounding for below expanders who
11737 generates multiple insns. In ix86_erase_embedded_rounding
11738 the pattern will be transformed to a single set, and emit_insn
11739 appends the set insead of insert it to chain. So the insns
11740 emitted inside define_expander would be ignored. */
11741 switch (icode)
11742 {
11743 case CODE_FOR_avx512bw_fmaddc_v32hf_mask1_round:
11744 case CODE_FOR_avx512bw_fcmaddc_v32hf_mask1_round:
11745 case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask1_round:
11746 case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask1_round:
11747 case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask3_round:
11748 case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask3_round:
11749 redundant_embed_rnd = 0;
11750 break;
11751 default:
11752 redundant_embed_rnd = 1;
11753 break;
11754 }
11755 }
2bf6d935
ML
11756 }
11757 else
11758 {
11759 if (VECTOR_MODE_P (mode))
11760 op = safe_vector_operand (op, mode);
11761
11762 op = fixup_modeless_constant (op, mode);
11763
11764 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
11765 {
11766 if (optimize || !match)
11767 op = copy_to_mode_reg (mode, op);
11768 }
11769 else
11770 {
11771 op = copy_to_reg (op);
11772 op = lowpart_subreg (mode, op, GET_MODE (op));
11773 }
11774 }
11775
715a8bc8 11776 xops[i] = op;
2bf6d935
ML
11777 }
11778
11779 switch (nargs)
11780 {
11781 case 1:
715a8bc8 11782 pat = GEN_FCN (icode) (target, xops[0]);
2bf6d935
ML
11783 break;
11784 case 2:
715a8bc8 11785 pat = GEN_FCN (icode) (target, xops[0], xops[1]);
2bf6d935
ML
11786 break;
11787 case 3:
715a8bc8 11788 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
2bf6d935
ML
11789 break;
11790 case 4:
715a8bc8
UB
11791 pat = GEN_FCN (icode) (target, xops[0], xops[1],
11792 xops[2], xops[3]);
2bf6d935
ML
11793 break;
11794 case 5:
715a8bc8
UB
11795 pat = GEN_FCN (icode) (target, xops[0], xops[1],
11796 xops[2], xops[3], xops[4]);
2bf6d935
ML
11797 break;
11798 case 6:
715a8bc8
UB
11799 pat = GEN_FCN (icode) (target, xops[0], xops[1],
11800 xops[2], xops[3], xops[4], xops[5]);
2bf6d935
ML
11801 break;
11802 default:
11803 gcc_unreachable ();
11804 }
11805
11806 if (!pat)
11807 return 0;
11808
11809 if (redundant_embed_rnd)
11810 pat = ix86_erase_embedded_rounding (pat);
11811
11812 emit_insn (pat);
11813 return target;
11814}
11815
11816/* Subroutine of ix86_expand_builtin to take care of special insns
11817 with variable number of operands. */
11818
11819static rtx
11820ix86_expand_special_args_builtin (const struct builtin_description *d,
11821 tree exp, rtx target)
11822{
11823 tree arg;
11824 rtx pat, op;
11825 unsigned int i, nargs, arg_adjust, memory;
11826 bool aligned_mem = false;
715a8bc8 11827 rtx xops[3];
2bf6d935 11828 enum insn_code icode = d->icode;
2bf6d935
ML
11829 const struct insn_data_d *insn_p = &insn_data[icode];
11830 machine_mode tmode = insn_p->operand[0].mode;
11831 enum { load, store } klass;
11832
11833 switch ((enum ix86_builtin_func_type) d->flag)
11834 {
11835 case VOID_FTYPE_VOID:
11836 emit_insn (GEN_FCN (icode) (target));
11837 return 0;
11838 case VOID_FTYPE_UINT64:
11839 case VOID_FTYPE_UNSIGNED:
11840 nargs = 0;
11841 klass = store;
11842 memory = 0;
11843 break;
11844
11845 case INT_FTYPE_VOID:
11846 case USHORT_FTYPE_VOID:
11847 case UINT64_FTYPE_VOID:
11848 case UINT_FTYPE_VOID:
299a53d7 11849 case UINT8_FTYPE_VOID:
2bf6d935
ML
11850 case UNSIGNED_FTYPE_VOID:
11851 nargs = 0;
11852 klass = load;
11853 memory = 0;
11854 break;
11855 case UINT64_FTYPE_PUNSIGNED:
11856 case V2DI_FTYPE_PV2DI:
11857 case V4DI_FTYPE_PV4DI:
11858 case V32QI_FTYPE_PCCHAR:
11859 case V16QI_FTYPE_PCCHAR:
11860 case V8SF_FTYPE_PCV4SF:
11861 case V8SF_FTYPE_PCFLOAT:
11862 case V4SF_FTYPE_PCFLOAT:
11863 case V4DF_FTYPE_PCV2DF:
11864 case V4DF_FTYPE_PCDOUBLE:
11865 case V2DF_FTYPE_PCDOUBLE:
11866 case VOID_FTYPE_PVOID:
11867 case V8DI_FTYPE_PV8DI:
11868 nargs = 1;
11869 klass = load;
11870 memory = 0;
11871 switch (icode)
11872 {
11873 case CODE_FOR_sse4_1_movntdqa:
11874 case CODE_FOR_avx2_movntdqa:
11875 case CODE_FOR_avx512f_movntdqa:
11876 aligned_mem = true;
11877 break;
11878 default:
11879 break;
11880 }
11881 break;
11882 case VOID_FTYPE_PV2SF_V4SF:
11883 case VOID_FTYPE_PV8DI_V8DI:
11884 case VOID_FTYPE_PV4DI_V4DI:
11885 case VOID_FTYPE_PV2DI_V2DI:
11886 case VOID_FTYPE_PCHAR_V32QI:
11887 case VOID_FTYPE_PCHAR_V16QI:
11888 case VOID_FTYPE_PFLOAT_V16SF:
11889 case VOID_FTYPE_PFLOAT_V8SF:
11890 case VOID_FTYPE_PFLOAT_V4SF:
11891 case VOID_FTYPE_PDOUBLE_V8DF:
11892 case VOID_FTYPE_PDOUBLE_V4DF:
11893 case VOID_FTYPE_PDOUBLE_V2DF:
11894 case VOID_FTYPE_PLONGLONG_LONGLONG:
11895 case VOID_FTYPE_PULONGLONG_ULONGLONG:
11896 case VOID_FTYPE_PUNSIGNED_UNSIGNED:
11897 case VOID_FTYPE_PINT_INT:
11898 nargs = 1;
11899 klass = store;
11900 /* Reserve memory operand for target. */
715a8bc8 11901 memory = ARRAY_SIZE (xops);
2bf6d935
ML
11902 switch (icode)
11903 {
11904 /* These builtins and instructions require the memory
11905 to be properly aligned. */
11906 case CODE_FOR_avx_movntv4di:
11907 case CODE_FOR_sse2_movntv2di:
11908 case CODE_FOR_avx_movntv8sf:
11909 case CODE_FOR_sse_movntv4sf:
11910 case CODE_FOR_sse4a_vmmovntv4sf:
11911 case CODE_FOR_avx_movntv4df:
11912 case CODE_FOR_sse2_movntv2df:
11913 case CODE_FOR_sse4a_vmmovntv2df:
11914 case CODE_FOR_sse2_movntidi:
11915 case CODE_FOR_sse_movntq:
11916 case CODE_FOR_sse2_movntisi:
11917 case CODE_FOR_avx512f_movntv16sf:
11918 case CODE_FOR_avx512f_movntv8df:
11919 case CODE_FOR_avx512f_movntv8di:
11920 aligned_mem = true;
11921 break;
11922 default:
11923 break;
11924 }
11925 break;
11926 case VOID_FTYPE_PVOID_PCVOID:
11927 nargs = 1;
11928 klass = store;
11929 memory = 0;
11930
11931 break;
11932 case V4SF_FTYPE_V4SF_PCV2SF:
11933 case V2DF_FTYPE_V2DF_PCDOUBLE:
11934 nargs = 2;
11935 klass = load;
11936 memory = 1;
11937 break;
11938 case V8SF_FTYPE_PCV8SF_V8SI:
11939 case V4DF_FTYPE_PCV4DF_V4DI:
11940 case V4SF_FTYPE_PCV4SF_V4SI:
11941 case V2DF_FTYPE_PCV2DF_V2DI:
11942 case V8SI_FTYPE_PCV8SI_V8SI:
11943 case V4DI_FTYPE_PCV4DI_V4DI:
11944 case V4SI_FTYPE_PCV4SI_V4SI:
11945 case V2DI_FTYPE_PCV2DI_V2DI:
11946 case VOID_FTYPE_INT_INT64:
11947 nargs = 2;
11948 klass = load;
11949 memory = 0;
11950 break;
11951 case VOID_FTYPE_PV8DF_V8DF_UQI:
11952 case VOID_FTYPE_PV4DF_V4DF_UQI:
11953 case VOID_FTYPE_PV2DF_V2DF_UQI:
11954 case VOID_FTYPE_PV16SF_V16SF_UHI:
11955 case VOID_FTYPE_PV8SF_V8SF_UQI:
11956 case VOID_FTYPE_PV4SF_V4SF_UQI:
11957 case VOID_FTYPE_PV8DI_V8DI_UQI:
11958 case VOID_FTYPE_PV4DI_V4DI_UQI:
11959 case VOID_FTYPE_PV2DI_V2DI_UQI:
11960 case VOID_FTYPE_PV16SI_V16SI_UHI:
11961 case VOID_FTYPE_PV8SI_V8SI_UQI:
11962 case VOID_FTYPE_PV4SI_V4SI_UQI:
11963 case VOID_FTYPE_PV64QI_V64QI_UDI:
11964 case VOID_FTYPE_PV32HI_V32HI_USI:
11965 case VOID_FTYPE_PV32QI_V32QI_USI:
11966 case VOID_FTYPE_PV16QI_V16QI_UHI:
11967 case VOID_FTYPE_PV16HI_V16HI_UHI:
11968 case VOID_FTYPE_PV8HI_V8HI_UQI:
11969 switch (icode)
11970 {
11971 /* These builtins and instructions require the memory
11972 to be properly aligned. */
11973 case CODE_FOR_avx512f_storev16sf_mask:
11974 case CODE_FOR_avx512f_storev16si_mask:
11975 case CODE_FOR_avx512f_storev8df_mask:
11976 case CODE_FOR_avx512f_storev8di_mask:
11977 case CODE_FOR_avx512vl_storev8sf_mask:
11978 case CODE_FOR_avx512vl_storev8si_mask:
11979 case CODE_FOR_avx512vl_storev4df_mask:
11980 case CODE_FOR_avx512vl_storev4di_mask:
11981 case CODE_FOR_avx512vl_storev4sf_mask:
11982 case CODE_FOR_avx512vl_storev4si_mask:
11983 case CODE_FOR_avx512vl_storev2df_mask:
11984 case CODE_FOR_avx512vl_storev2di_mask:
11985 aligned_mem = true;
11986 break;
11987 default:
11988 break;
11989 }
11990 /* FALLTHRU */
11991 case VOID_FTYPE_PV8SF_V8SI_V8SF:
11992 case VOID_FTYPE_PV4DF_V4DI_V4DF:
11993 case VOID_FTYPE_PV4SF_V4SI_V4SF:
11994 case VOID_FTYPE_PV2DF_V2DI_V2DF:
11995 case VOID_FTYPE_PV8SI_V8SI_V8SI:
11996 case VOID_FTYPE_PV4DI_V4DI_V4DI:
11997 case VOID_FTYPE_PV4SI_V4SI_V4SI:
11998 case VOID_FTYPE_PV2DI_V2DI_V2DI:
11999 case VOID_FTYPE_PV8SI_V8DI_UQI:
12000 case VOID_FTYPE_PV8HI_V8DI_UQI:
12001 case VOID_FTYPE_PV16HI_V16SI_UHI:
4a948703 12002 case VOID_FTYPE_PUDI_V8DI_UQI:
2bf6d935
ML
12003 case VOID_FTYPE_PV16QI_V16SI_UHI:
12004 case VOID_FTYPE_PV4SI_V4DI_UQI:
4a948703 12005 case VOID_FTYPE_PUDI_V2DI_UQI:
12006 case VOID_FTYPE_PUDI_V4DI_UQI:
12007 case VOID_FTYPE_PUSI_V2DI_UQI:
2bf6d935 12008 case VOID_FTYPE_PV8HI_V8SI_UQI:
4a948703 12009 case VOID_FTYPE_PUDI_V4SI_UQI:
12010 case VOID_FTYPE_PUSI_V4DI_UQI:
12011 case VOID_FTYPE_PUHI_V2DI_UQI:
12012 case VOID_FTYPE_PUDI_V8SI_UQI:
12013 case VOID_FTYPE_PUSI_V4SI_UQI:
2bf6d935
ML
12014 case VOID_FTYPE_PCHAR_V64QI_UDI:
12015 case VOID_FTYPE_PCHAR_V32QI_USI:
12016 case VOID_FTYPE_PCHAR_V16QI_UHI:
12017 case VOID_FTYPE_PSHORT_V32HI_USI:
12018 case VOID_FTYPE_PSHORT_V16HI_UHI:
12019 case VOID_FTYPE_PSHORT_V8HI_UQI:
12020 case VOID_FTYPE_PINT_V16SI_UHI:
12021 case VOID_FTYPE_PINT_V8SI_UQI:
12022 case VOID_FTYPE_PINT_V4SI_UQI:
12023 case VOID_FTYPE_PINT64_V8DI_UQI:
12024 case VOID_FTYPE_PINT64_V4DI_UQI:
12025 case VOID_FTYPE_PINT64_V2DI_UQI:
12026 case VOID_FTYPE_PDOUBLE_V8DF_UQI:
12027 case VOID_FTYPE_PDOUBLE_V4DF_UQI:
12028 case VOID_FTYPE_PDOUBLE_V2DF_UQI:
12029 case VOID_FTYPE_PFLOAT_V16SF_UHI:
12030 case VOID_FTYPE_PFLOAT_V8SF_UQI:
12031 case VOID_FTYPE_PFLOAT_V4SF_UQI:
c4d423c7 12032 case VOID_FTYPE_PCFLOAT16_V8HF_UQI:
2bf6d935
ML
12033 case VOID_FTYPE_PV32QI_V32HI_USI:
12034 case VOID_FTYPE_PV16QI_V16HI_UHI:
4a948703 12035 case VOID_FTYPE_PUDI_V8HI_UQI:
2bf6d935
ML
12036 nargs = 2;
12037 klass = store;
12038 /* Reserve memory operand for target. */
715a8bc8 12039 memory = ARRAY_SIZE (xops);
2bf6d935
ML
12040 break;
12041 case V4SF_FTYPE_PCV4SF_V4SF_UQI:
12042 case V8SF_FTYPE_PCV8SF_V8SF_UQI:
12043 case V16SF_FTYPE_PCV16SF_V16SF_UHI:
12044 case V4SI_FTYPE_PCV4SI_V4SI_UQI:
12045 case V8SI_FTYPE_PCV8SI_V8SI_UQI:
12046 case V16SI_FTYPE_PCV16SI_V16SI_UHI:
12047 case V2DF_FTYPE_PCV2DF_V2DF_UQI:
12048 case V4DF_FTYPE_PCV4DF_V4DF_UQI:
12049 case V8DF_FTYPE_PCV8DF_V8DF_UQI:
12050 case V2DI_FTYPE_PCV2DI_V2DI_UQI:
12051 case V4DI_FTYPE_PCV4DI_V4DI_UQI:
12052 case V8DI_FTYPE_PCV8DI_V8DI_UQI:
12053 case V64QI_FTYPE_PCV64QI_V64QI_UDI:
12054 case V32HI_FTYPE_PCV32HI_V32HI_USI:
12055 case V32QI_FTYPE_PCV32QI_V32QI_USI:
12056 case V16QI_FTYPE_PCV16QI_V16QI_UHI:
12057 case V16HI_FTYPE_PCV16HI_V16HI_UHI:
12058 case V8HI_FTYPE_PCV8HI_V8HI_UQI:
12059 switch (icode)
12060 {
12061 /* These builtins and instructions require the memory
12062 to be properly aligned. */
12063 case CODE_FOR_avx512f_loadv16sf_mask:
12064 case CODE_FOR_avx512f_loadv16si_mask:
12065 case CODE_FOR_avx512f_loadv8df_mask:
12066 case CODE_FOR_avx512f_loadv8di_mask:
12067 case CODE_FOR_avx512vl_loadv8sf_mask:
12068 case CODE_FOR_avx512vl_loadv8si_mask:
12069 case CODE_FOR_avx512vl_loadv4df_mask:
12070 case CODE_FOR_avx512vl_loadv4di_mask:
12071 case CODE_FOR_avx512vl_loadv4sf_mask:
12072 case CODE_FOR_avx512vl_loadv4si_mask:
12073 case CODE_FOR_avx512vl_loadv2df_mask:
12074 case CODE_FOR_avx512vl_loadv2di_mask:
12075 case CODE_FOR_avx512bw_loadv64qi_mask:
12076 case CODE_FOR_avx512vl_loadv32qi_mask:
12077 case CODE_FOR_avx512vl_loadv16qi_mask:
12078 case CODE_FOR_avx512bw_loadv32hi_mask:
12079 case CODE_FOR_avx512vl_loadv16hi_mask:
12080 case CODE_FOR_avx512vl_loadv8hi_mask:
12081 aligned_mem = true;
12082 break;
12083 default:
12084 break;
12085 }
12086 /* FALLTHRU */
12087 case V64QI_FTYPE_PCCHAR_V64QI_UDI:
12088 case V32QI_FTYPE_PCCHAR_V32QI_USI:
12089 case V16QI_FTYPE_PCCHAR_V16QI_UHI:
12090 case V32HI_FTYPE_PCSHORT_V32HI_USI:
12091 case V16HI_FTYPE_PCSHORT_V16HI_UHI:
12092 case V8HI_FTYPE_PCSHORT_V8HI_UQI:
12093 case V16SI_FTYPE_PCINT_V16SI_UHI:
12094 case V8SI_FTYPE_PCINT_V8SI_UQI:
12095 case V4SI_FTYPE_PCINT_V4SI_UQI:
12096 case V8DI_FTYPE_PCINT64_V8DI_UQI:
12097 case V4DI_FTYPE_PCINT64_V4DI_UQI:
12098 case V2DI_FTYPE_PCINT64_V2DI_UQI:
12099 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
12100 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
12101 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
12102 case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
12103 case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
12104 case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
c4d423c7 12105 case V8HF_FTYPE_PCFLOAT16_V8HF_UQI:
2bf6d935
ML
12106 nargs = 3;
12107 klass = load;
12108 memory = 0;
12109 break;
2bf6d935
ML
12110 default:
12111 gcc_unreachable ();
12112 }
12113
715a8bc8 12114 gcc_assert (nargs <= ARRAY_SIZE (xops));
2bf6d935
ML
12115
12116 if (klass == store)
12117 {
12118 arg = CALL_EXPR_ARG (exp, 0);
12119 op = expand_normal (arg);
12120 gcc_assert (target == 0);
12121 if (memory)
12122 {
12123 op = ix86_zero_extend_to_Pmode (op);
12124 target = gen_rtx_MEM (tmode, op);
12125 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
12126 on it. Try to improve it using get_pointer_alignment,
12127 and if the special builtin is one that requires strict
12128 mode alignment, also from it's GET_MODE_ALIGNMENT.
12129 Failure to do so could lead to ix86_legitimate_combined_insn
12130 rejecting all changes to such insns. */
12131 unsigned int align = get_pointer_alignment (arg);
12132 if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
12133 align = GET_MODE_ALIGNMENT (tmode);
12134 if (MEM_ALIGN (target) < align)
12135 set_mem_align (target, align);
12136 }
12137 else
12138 target = force_reg (tmode, op);
12139 arg_adjust = 1;
12140 }
12141 else
12142 {
12143 arg_adjust = 0;
12144 if (optimize
12145 || target == 0
12146 || !register_operand (target, tmode)
12147 || GET_MODE (target) != tmode)
12148 target = gen_reg_rtx (tmode);
12149 }
12150
12151 for (i = 0; i < nargs; i++)
12152 {
12153 machine_mode mode = insn_p->operand[i + 1].mode;
2bf6d935
ML
12154
12155 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
12156 op = expand_normal (arg);
2bf6d935 12157
776a37f6 12158 if (i == memory)
2bf6d935 12159 {
776a37f6 12160 /* This must be the memory operand. */
12161 op = ix86_zero_extend_to_Pmode (op);
12162 op = gen_rtx_MEM (mode, op);
12163 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
12164 on it. Try to improve it using get_pointer_alignment,
12165 and if the special builtin is one that requires strict
12166 mode alignment, also from it's GET_MODE_ALIGNMENT.
12167 Failure to do so could lead to ix86_legitimate_combined_insn
12168 rejecting all changes to such insns. */
12169 unsigned int align = get_pointer_alignment (arg);
12170 if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
12171 align = GET_MODE_ALIGNMENT (mode);
12172 if (MEM_ALIGN (op) < align)
12173 set_mem_align (op, align);
2bf6d935
ML
12174 }
12175 else
12176 {
776a37f6 12177 /* This must be register. */
12178 if (VECTOR_MODE_P (mode))
12179 op = safe_vector_operand (op, mode);
2bf6d935 12180
776a37f6 12181 op = fixup_modeless_constant (op, mode);
2bf6d935 12182
b6efffa5 12183 /* NB: 3-operands load implied it's a mask load or v{p}expand*,
35c4c67e 12184 and that mask operand shoud be at the end.
12185 Keep all-ones mask which would be simplified by the expander. */
12186 if (nargs == 3 && i == 2 && klass == load
b6efffa5 12187 && constm1_operand (op, mode)
12188 && insn_p->operand[i].predicate (op, mode))
35c4c67e 12189 ;
12190 else if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
776a37f6 12191 op = copy_to_mode_reg (mode, op);
12192 else
12193 {
12194 op = copy_to_reg (op);
12195 op = lowpart_subreg (mode, op, GET_MODE (op));
2bf6d935
ML
12196 }
12197 }
12198
715a8bc8 12199 xops[i]= op;
2bf6d935
ML
12200 }
12201
12202 switch (nargs)
12203 {
12204 case 0:
12205 pat = GEN_FCN (icode) (target);
12206 break;
12207 case 1:
715a8bc8 12208 pat = GEN_FCN (icode) (target, xops[0]);
2bf6d935
ML
12209 break;
12210 case 2:
715a8bc8 12211 pat = GEN_FCN (icode) (target, xops[0], xops[1]);
2bf6d935
ML
12212 break;
12213 case 3:
715a8bc8 12214 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
2bf6d935
ML
12215 break;
12216 default:
12217 gcc_unreachable ();
12218 }
12219
12220 if (! pat)
12221 return 0;
715a8bc8 12222
2bf6d935
ML
12223 emit_insn (pat);
12224 return klass == store ? 0 : target;
12225}
12226
12227/* Return the integer constant in ARG. Constrain it to be in the range
12228 of the subparts of VEC_TYPE; issue an error if not. */
12229
12230static int
12231get_element_number (tree vec_type, tree arg)
12232{
12233 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
12234
12235 if (!tree_fits_uhwi_p (arg)
12236 || (elt = tree_to_uhwi (arg), elt > max))
12237 {
a9c697b8
MS
12238 error ("selector must be an integer constant in the range "
12239 "[0, %wi]", max);
2bf6d935
ML
12240 return 0;
12241 }
12242
12243 return elt;
12244}
12245
12246/* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
12247 ix86_expand_vector_init. We DO have language-level syntax for this, in
12248 the form of (type){ init-list }. Except that since we can't place emms
12249 instructions from inside the compiler, we can't allow the use of MMX
12250 registers unless the user explicitly asks for it. So we do *not* define
12251 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
12252 we have builtins invoked by mmintrin.h that gives us license to emit
12253 these sorts of instructions. */
12254
12255static rtx
12256ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
12257{
12258 machine_mode tmode = TYPE_MODE (type);
12259 machine_mode inner_mode = GET_MODE_INNER (tmode);
12260 int i, n_elt = GET_MODE_NUNITS (tmode);
12261 rtvec v = rtvec_alloc (n_elt);
12262
12263 gcc_assert (VECTOR_MODE_P (tmode));
12264 gcc_assert (call_expr_nargs (exp) == n_elt);
12265
12266 for (i = 0; i < n_elt; ++i)
12267 {
12268 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
12269 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
12270 }
12271
12272 if (!target || !register_operand (target, tmode))
12273 target = gen_reg_rtx (tmode);
12274
12275 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
12276 return target;
12277}
12278
12279/* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
12280 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
12281 had a language-level syntax for referencing vector elements. */
12282
12283static rtx
12284ix86_expand_vec_ext_builtin (tree exp, rtx target)
12285{
12286 machine_mode tmode, mode0;
12287 tree arg0, arg1;
12288 int elt;
12289 rtx op0;
12290
12291 arg0 = CALL_EXPR_ARG (exp, 0);
12292 arg1 = CALL_EXPR_ARG (exp, 1);
12293
12294 op0 = expand_normal (arg0);
12295 elt = get_element_number (TREE_TYPE (arg0), arg1);
12296
12297 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
12298 mode0 = TYPE_MODE (TREE_TYPE (arg0));
12299 gcc_assert (VECTOR_MODE_P (mode0));
12300
12301 op0 = force_reg (mode0, op0);
12302
12303 if (optimize || !target || !register_operand (target, tmode))
12304 target = gen_reg_rtx (tmode);
12305
12306 ix86_expand_vector_extract (true, target, op0, elt);
12307
12308 return target;
12309}
12310
12311/* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
12312 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
12313 a language-level syntax for referencing vector elements. */
12314
12315static rtx
12316ix86_expand_vec_set_builtin (tree exp)
12317{
12318 machine_mode tmode, mode1;
12319 tree arg0, arg1, arg2;
12320 int elt;
12321 rtx op0, op1, target;
12322
12323 arg0 = CALL_EXPR_ARG (exp, 0);
12324 arg1 = CALL_EXPR_ARG (exp, 1);
12325 arg2 = CALL_EXPR_ARG (exp, 2);
12326
12327 tmode = TYPE_MODE (TREE_TYPE (arg0));
12328 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
12329 gcc_assert (VECTOR_MODE_P (tmode));
12330
12331 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
12332 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
12333 elt = get_element_number (TREE_TYPE (arg0), arg2);
12334
12335 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
12336 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
12337
12338 op0 = force_reg (tmode, op0);
12339 op1 = force_reg (mode1, op1);
12340
12341 /* OP0 is the source of these builtin functions and shouldn't be
12342 modified. Create a copy, use it and return it as target. */
12343 target = gen_reg_rtx (tmode);
12344 emit_move_insn (target, op0);
12345 ix86_expand_vector_set (true, target, op1, elt);
12346
12347 return target;
12348}
12349
823b3b79 12350/* Return true if the necessary isa options for this builtin exist,
12351 else false.
12352 fcode = DECL_MD_FUNCTION_CODE (fndecl); */
12353bool
12354ix86_check_builtin_isa_match (unsigned int fcode,
12355 HOST_WIDE_INT* pbisa,
12356 HOST_WIDE_INT* pbisa2)
2bf6d935 12357{
2bf6d935
ML
12358 HOST_WIDE_INT isa = ix86_isa_flags;
12359 HOST_WIDE_INT isa2 = ix86_isa_flags2;
12360 HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
12361 HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
12362 /* The general case is we require all the ISAs specified in bisa{,2}
12363 to be enabled.
12364 The exceptions are:
12365 OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
12366 OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
12367 OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
ca813880 12368 (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL) or
12369 OPTION_MASK_ISA2_AVXVNNI
a13d6ec8
JJ
12370 where for each such pair it is sufficient if either of the ISAs is
12371 enabled, plus if it is ored with other options also those others.
12372 OPTION_MASK_ISA_MMX in bisa is satisfied also if TARGET_MMX_WITH_SSE. */
2bf6d935
ML
12373 if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
12374 == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
12375 && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0)
12376 isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A);
db3f0d21 12377
2bf6d935
ML
12378 if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
12379 == (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
12380 && (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0)
12381 isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32);
db3f0d21 12382
2bf6d935
ML
12383 if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
12384 == (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
12385 && (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0)
12386 isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4);
6058b874 12387
ca813880 12388 if ((((bisa & (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
12389 == (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
12390 || (bisa2 & OPTION_MASK_ISA2_AVXVNNI) != 0)
12391 && (((isa & (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
12392 == (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
12393 || (isa2 & OPTION_MASK_ISA2_AVXVNNI) != 0))
12394 {
12395 isa |= OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL;
12396 isa2 |= OPTION_MASK_ISA2_AVXVNNI;
12397 }
12398
db3f0d21
UB
12399 if ((bisa & OPTION_MASK_ISA_MMX) && !TARGET_MMX && TARGET_MMX_WITH_SSE
12400 /* __builtin_ia32_maskmovq requires MMX registers. */
6058b874 12401 && fcode != IX86_BUILTIN_MASKMOVQ)
a13d6ec8
JJ
12402 {
12403 bisa &= ~OPTION_MASK_ISA_MMX;
12404 bisa |= OPTION_MASK_ISA_SSE2;
ecfdb16c 12405 }
6058b874 12406
823b3b79 12407 if (pbisa)
12408 *pbisa = bisa;
12409 if (pbisa2)
12410 *pbisa2 = bisa2;
12411
12412 return (bisa & isa) == bisa && (bisa2 & isa2) == bisa2;
12413}
12414
12415/* Expand an expression EXP that calls a built-in function,
12416 with result going to TARGET if that's convenient
12417 (and in mode MODE if that's convenient).
12418 SUBTARGET may be used as the target for computing one of EXP's operands.
12419 IGNORE is nonzero if the value is to be ignored. */
12420
12421rtx
12422ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
12423 machine_mode mode, int ignore)
12424{
12425 size_t i;
12426 enum insn_code icode, icode2;
12427 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
12428 tree arg0, arg1, arg2, arg3, arg4;
12429 rtx op0, op1, op2, op3, op4, pat, pat2, insn;
12430 machine_mode mode0, mode1, mode2, mode3, mode4;
12431 unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
12432 HOST_WIDE_INT bisa, bisa2;
12433
12434 /* For CPU builtins that can be folded, fold first and expand the fold. */
12435 switch (fcode)
12436 {
12437 case IX86_BUILTIN_CPU_INIT:
12438 {
12439 /* Make it call __cpu_indicator_init in libgcc. */
12440 tree call_expr, fndecl, type;
12441 type = build_function_type_list (integer_type_node, NULL_TREE);
12442 fndecl = build_fn_decl ("__cpu_indicator_init", type);
12443 call_expr = build_call_expr (fndecl, 0);
12444 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
12445 }
12446 case IX86_BUILTIN_CPU_IS:
12447 case IX86_BUILTIN_CPU_SUPPORTS:
12448 {
12449 tree arg0 = CALL_EXPR_ARG (exp, 0);
12450 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
12451 gcc_assert (fold_expr != NULL_TREE);
12452 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
12453 }
12454 }
12455
12456 if (!ix86_check_builtin_isa_match (fcode, &bisa, &bisa2))
2bf6d935
ML
12457 {
12458 bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT;
12459 if (TARGET_ABI_X32)
12460 bisa |= OPTION_MASK_ABI_X32;
12461 else
12462 bisa |= OPTION_MASK_ABI_64;
12463 char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
46e6341f
JJ
12464 (enum fpmath_unit) 0,
12465 (enum prefer_vector_width) 0,
654cd743 12466 PVW_NONE, PVW_NONE,
46e6341f 12467 false, add_abi_p);
2bf6d935
ML
12468 if (!opts)
12469 error ("%qE needs unknown isa option", fndecl);
12470 else
12471 {
12472 gcc_assert (opts != NULL);
12473 error ("%qE needs isa option %s", fndecl, opts);
12474 free (opts);
12475 }
12476 return expand_call (exp, target, ignore);
12477 }
12478
12479 switch (fcode)
12480 {
12481 case IX86_BUILTIN_MASKMOVQ:
12482 case IX86_BUILTIN_MASKMOVDQU:
12483 icode = (fcode == IX86_BUILTIN_MASKMOVQ
12484 ? CODE_FOR_mmx_maskmovq
12485 : CODE_FOR_sse2_maskmovdqu);
12486 /* Note the arg order is different from the operand order. */
12487 arg1 = CALL_EXPR_ARG (exp, 0);
12488 arg2 = CALL_EXPR_ARG (exp, 1);
12489 arg0 = CALL_EXPR_ARG (exp, 2);
12490 op0 = expand_normal (arg0);
12491 op1 = expand_normal (arg1);
12492 op2 = expand_normal (arg2);
12493 mode0 = insn_data[icode].operand[0].mode;
12494 mode1 = insn_data[icode].operand[1].mode;
12495 mode2 = insn_data[icode].operand[2].mode;
12496
12497 op0 = ix86_zero_extend_to_Pmode (op0);
12498 op0 = gen_rtx_MEM (mode1, op0);
12499
12500 if (!insn_data[icode].operand[0].predicate (op0, mode0))
12501 op0 = copy_to_mode_reg (mode0, op0);
12502 if (!insn_data[icode].operand[1].predicate (op1, mode1))
12503 op1 = copy_to_mode_reg (mode1, op1);
12504 if (!insn_data[icode].operand[2].predicate (op2, mode2))
12505 op2 = copy_to_mode_reg (mode2, op2);
12506 pat = GEN_FCN (icode) (op0, op1, op2);
12507 if (! pat)
12508 return 0;
12509 emit_insn (pat);
12510 return 0;
12511
12512 case IX86_BUILTIN_LDMXCSR:
12513 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
12514 target = assign_386_stack_local (SImode, SLOT_TEMP);
12515 emit_move_insn (target, op0);
12516 emit_insn (gen_sse_ldmxcsr (target));
12517 return 0;
12518
12519 case IX86_BUILTIN_STMXCSR:
12520 target = assign_386_stack_local (SImode, SLOT_TEMP);
12521 emit_insn (gen_sse_stmxcsr (target));
12522 return copy_to_mode_reg (SImode, target);
12523
12524 case IX86_BUILTIN_CLFLUSH:
12525 arg0 = CALL_EXPR_ARG (exp, 0);
12526 op0 = expand_normal (arg0);
12527 icode = CODE_FOR_sse2_clflush;
12528 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12529 op0 = ix86_zero_extend_to_Pmode (op0);
12530
12531 emit_insn (gen_sse2_clflush (op0));
12532 return 0;
12533
12534 case IX86_BUILTIN_CLWB:
12535 arg0 = CALL_EXPR_ARG (exp, 0);
12536 op0 = expand_normal (arg0);
12537 icode = CODE_FOR_clwb;
12538 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12539 op0 = ix86_zero_extend_to_Pmode (op0);
12540
12541 emit_insn (gen_clwb (op0));
12542 return 0;
12543
12544 case IX86_BUILTIN_CLFLUSHOPT:
12545 arg0 = CALL_EXPR_ARG (exp, 0);
12546 op0 = expand_normal (arg0);
12547 icode = CODE_FOR_clflushopt;
12548 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12549 op0 = ix86_zero_extend_to_Pmode (op0);
12550
12551 emit_insn (gen_clflushopt (op0));
12552 return 0;
12553
12554 case IX86_BUILTIN_MONITOR:
12555 case IX86_BUILTIN_MONITORX:
12556 arg0 = CALL_EXPR_ARG (exp, 0);
12557 arg1 = CALL_EXPR_ARG (exp, 1);
12558 arg2 = CALL_EXPR_ARG (exp, 2);
12559 op0 = expand_normal (arg0);
12560 op1 = expand_normal (arg1);
12561 op2 = expand_normal (arg2);
12562 if (!REG_P (op0))
12563 op0 = ix86_zero_extend_to_Pmode (op0);
12564 if (!REG_P (op1))
12565 op1 = copy_to_mode_reg (SImode, op1);
12566 if (!REG_P (op2))
12567 op2 = copy_to_mode_reg (SImode, op2);
12568
12569 emit_insn (fcode == IX86_BUILTIN_MONITOR
a963ca40
UB
12570 ? gen_sse3_monitor (Pmode, op0, op1, op2)
12571 : gen_monitorx (Pmode, op0, op1, op2));
2bf6d935
ML
12572 return 0;
12573
12574 case IX86_BUILTIN_MWAIT:
12575 arg0 = CALL_EXPR_ARG (exp, 0);
12576 arg1 = CALL_EXPR_ARG (exp, 1);
12577 op0 = expand_normal (arg0);
12578 op1 = expand_normal (arg1);
12579 if (!REG_P (op0))
12580 op0 = copy_to_mode_reg (SImode, op0);
12581 if (!REG_P (op1))
12582 op1 = copy_to_mode_reg (SImode, op1);
12583 emit_insn (gen_sse3_mwait (op0, op1));
12584 return 0;
12585
12586 case IX86_BUILTIN_MWAITX:
12587 arg0 = CALL_EXPR_ARG (exp, 0);
12588 arg1 = CALL_EXPR_ARG (exp, 1);
12589 arg2 = CALL_EXPR_ARG (exp, 2);
12590 op0 = expand_normal (arg0);
12591 op1 = expand_normal (arg1);
12592 op2 = expand_normal (arg2);
12593 if (!REG_P (op0))
12594 op0 = copy_to_mode_reg (SImode, op0);
12595 if (!REG_P (op1))
12596 op1 = copy_to_mode_reg (SImode, op1);
12597 if (!REG_P (op2))
12598 op2 = copy_to_mode_reg (SImode, op2);
12599 emit_insn (gen_mwaitx (op0, op1, op2));
12600 return 0;
12601
12602 case IX86_BUILTIN_UMONITOR:
12603 arg0 = CALL_EXPR_ARG (exp, 0);
12604 op0 = expand_normal (arg0);
12605
12606 op0 = ix86_zero_extend_to_Pmode (op0);
987a3082 12607 emit_insn (gen_umonitor (Pmode, op0));
2bf6d935
ML
12608 return 0;
12609
12610 case IX86_BUILTIN_UMWAIT:
12611 case IX86_BUILTIN_TPAUSE:
12612 arg0 = CALL_EXPR_ARG (exp, 0);
12613 arg1 = CALL_EXPR_ARG (exp, 1);
12614 op0 = expand_normal (arg0);
12615 op1 = expand_normal (arg1);
12616
12617 if (!REG_P (op0))
12618 op0 = copy_to_mode_reg (SImode, op0);
12619
12620 op1 = force_reg (DImode, op1);
12621
12622 if (TARGET_64BIT)
12623 {
12624 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
12625 NULL, 1, OPTAB_DIRECT);
12626 switch (fcode)
12627 {
12628 case IX86_BUILTIN_UMWAIT:
12629 icode = CODE_FOR_umwait_rex64;
12630 break;
12631 case IX86_BUILTIN_TPAUSE:
12632 icode = CODE_FOR_tpause_rex64;
12633 break;
12634 default:
12635 gcc_unreachable ();
12636 }
12637
12638 op2 = gen_lowpart (SImode, op2);
12639 op1 = gen_lowpart (SImode, op1);
12640 pat = GEN_FCN (icode) (op0, op1, op2);
12641 }
12642 else
12643 {
12644 switch (fcode)
12645 {
12646 case IX86_BUILTIN_UMWAIT:
12647 icode = CODE_FOR_umwait;
12648 break;
12649 case IX86_BUILTIN_TPAUSE:
12650 icode = CODE_FOR_tpause;
12651 break;
12652 default:
12653 gcc_unreachable ();
12654 }
12655 pat = GEN_FCN (icode) (op0, op1);
12656 }
12657
12658 if (!pat)
12659 return 0;
12660
12661 emit_insn (pat);
12662
12663 if (target == 0
12664 || !register_operand (target, QImode))
12665 target = gen_reg_rtx (QImode);
12666
12667 pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
12668 const0_rtx);
12669 emit_insn (gen_rtx_SET (target, pat));
12670
12671 return target;
12672
299a53d7 12673 case IX86_BUILTIN_TESTUI:
12674 emit_insn (gen_testui ());
12675
12676 if (target == 0
12677 || !register_operand (target, QImode))
12678 target = gen_reg_rtx (QImode);
12679
12680 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
12681 const0_rtx);
12682 emit_insn (gen_rtx_SET (target, pat));
12683
12684 return target;
12685
2bf6d935
ML
12686 case IX86_BUILTIN_CLZERO:
12687 arg0 = CALL_EXPR_ARG (exp, 0);
12688 op0 = expand_normal (arg0);
12689 if (!REG_P (op0))
12690 op0 = ix86_zero_extend_to_Pmode (op0);
a963ca40 12691 emit_insn (gen_clzero (Pmode, op0));
2bf6d935
ML
12692 return 0;
12693
12694 case IX86_BUILTIN_CLDEMOTE:
12695 arg0 = CALL_EXPR_ARG (exp, 0);
12696 op0 = expand_normal (arg0);
12697 icode = CODE_FOR_cldemote;
12698 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12699 op0 = ix86_zero_extend_to_Pmode (op0);
12700
12701 emit_insn (gen_cldemote (op0));
12702 return 0;
12703
632a2f50 12704 case IX86_BUILTIN_LOADIWKEY:
12705 {
12706 arg0 = CALL_EXPR_ARG (exp, 0);
12707 arg1 = CALL_EXPR_ARG (exp, 1);
12708 arg2 = CALL_EXPR_ARG (exp, 2);
12709 arg3 = CALL_EXPR_ARG (exp, 3);
12710
12711 op0 = expand_normal (arg0);
12712 op1 = expand_normal (arg1);
12713 op2 = expand_normal (arg2);
12714 op3 = expand_normal (arg3);
12715
12716 if (!REG_P (op0))
12717 op0 = copy_to_mode_reg (V2DImode, op0);
12718 if (!REG_P (op1))
12719 op1 = copy_to_mode_reg (V2DImode, op1);
12720 if (!REG_P (op2))
12721 op2 = copy_to_mode_reg (V2DImode, op2);
12722 if (!REG_P (op3))
12723 op3 = copy_to_mode_reg (SImode, op3);
12724
12725 emit_insn (gen_loadiwkey (op0, op1, op2, op3));
12726
12727 return 0;
12728 }
12729
12730 case IX86_BUILTIN_AESDEC128KLU8:
12731 icode = CODE_FOR_aesdec128klu8;
12732 goto aesdecenc_expand;
12733
12734 case IX86_BUILTIN_AESDEC256KLU8:
12735 icode = CODE_FOR_aesdec256klu8;
12736 goto aesdecenc_expand;
12737
12738 case IX86_BUILTIN_AESENC128KLU8:
12739 icode = CODE_FOR_aesenc128klu8;
12740 goto aesdecenc_expand;
12741
12742 case IX86_BUILTIN_AESENC256KLU8:
12743 icode = CODE_FOR_aesenc256klu8;
12744
12745 aesdecenc_expand:
12746
12747 arg0 = CALL_EXPR_ARG (exp, 0); // __m128i *odata
12748 arg1 = CALL_EXPR_ARG (exp, 1); // __m128i idata
12749 arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
12750
12751 op0 = expand_normal (arg0);
12752 op1 = expand_normal (arg1);
12753 op2 = expand_normal (arg2);
12754
12755 if (!address_operand (op0, V2DImode))
12756 {
12757 op0 = convert_memory_address (Pmode, op0);
12758 op0 = copy_addr_to_reg (op0);
12759 }
12760 op0 = gen_rtx_MEM (V2DImode, op0);
12761
12762 if (!REG_P (op1))
12763 op1 = copy_to_mode_reg (V2DImode, op1);
12764
12765 if (!address_operand (op2, VOIDmode))
12766 {
12767 op2 = convert_memory_address (Pmode, op2);
12768 op2 = copy_addr_to_reg (op2);
12769 }
12770 op2 = gen_rtx_MEM (BLKmode, op2);
12771
12772 emit_insn (GEN_FCN (icode) (op1, op1, op2));
12773
12774 if (target == 0)
12775 target = gen_reg_rtx (QImode);
12776
1aeefa57
HW
12777 /* NB: For aesenc/aesdec keylocker insn, ZF will be set when runtime
12778 error occurs. Then the output should be cleared for safety. */
12779 rtx_code_label *ok_label;
12780 rtx tmp;
12781
12782 tmp = gen_rtx_REG (CCZmode, FLAGS_REG);
12783 pat = gen_rtx_EQ (QImode, tmp, const0_rtx);
12784 ok_label = gen_label_rtx ();
12785 emit_cmp_and_jump_insns (tmp, const0_rtx, NE, 0, GET_MODE (tmp),
12786 true, ok_label);
12787 /* Usually the runtime error seldom occur, so predict OK path as
12788 hotspot to optimize it as fallthrough block. */
12789 predict_jump (REG_BR_PROB_BASE * 90 / 100);
12790
12791 emit_insn (gen_rtx_SET (op1, const0_rtx));
632a2f50 12792
1aeefa57
HW
12793 emit_label (ok_label);
12794 emit_insn (gen_rtx_SET (target, pat));
632a2f50 12795 emit_insn (gen_rtx_SET (op0, op1));
12796
12797 return target;
12798
12799 case IX86_BUILTIN_AESDECWIDE128KLU8:
12800 icode = CODE_FOR_aesdecwide128klu8;
12801 goto wideaesdecenc_expand;
12802
12803 case IX86_BUILTIN_AESDECWIDE256KLU8:
12804 icode = CODE_FOR_aesdecwide256klu8;
12805 goto wideaesdecenc_expand;
12806
12807 case IX86_BUILTIN_AESENCWIDE128KLU8:
12808 icode = CODE_FOR_aesencwide128klu8;
12809 goto wideaesdecenc_expand;
12810
12811 case IX86_BUILTIN_AESENCWIDE256KLU8:
12812 icode = CODE_FOR_aesencwide256klu8;
12813
12814 wideaesdecenc_expand:
12815
12816 rtx xmm_regs[8];
12817 rtx op;
12818
12819 arg0 = CALL_EXPR_ARG (exp, 0); // __m128i * odata
12820 arg1 = CALL_EXPR_ARG (exp, 1); // const __m128i * idata
12821 arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
12822
12823 op0 = expand_normal (arg0);
12824 op1 = expand_normal (arg1);
12825 op2 = expand_normal (arg2);
12826
12827 if (!address_operand (op2, VOIDmode))
12828 {
12829 op2 = convert_memory_address (Pmode, op2);
12830 op2 = copy_addr_to_reg (op2);
12831 }
12832 op2 = gen_rtx_MEM (BLKmode, op2);
12833
12834 for (i = 0; i < 8; i++)
12835 {
12836 xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
12837
12838 op = gen_rtx_MEM (V2DImode,
12839 plus_constant (Pmode, op1, (i * 16)));
12840
12841 emit_move_insn (xmm_regs[i], op);
12842 }
12843
12844 emit_insn (GEN_FCN (icode) (op2));
12845
12846 if (target == 0)
12847 target = gen_reg_rtx (QImode);
12848
1aeefa57
HW
12849 tmp = gen_rtx_REG (CCZmode, FLAGS_REG);
12850 pat = gen_rtx_EQ (QImode, tmp, const0_rtx);
12851 ok_label = gen_label_rtx ();
12852 emit_cmp_and_jump_insns (tmp, const0_rtx, NE, 0, GET_MODE (tmp),
12853 true, ok_label);
12854 predict_jump (REG_BR_PROB_BASE * 90 / 100);
12855
12856 for (i = 0; i < 8; i++)
12857 emit_insn (gen_rtx_SET (xmm_regs[i], const0_rtx));
12858
12859 emit_label (ok_label);
632a2f50 12860 emit_insn (gen_rtx_SET (target, pat));
12861
12862 for (i = 0; i < 8; i++)
12863 {
12864 op = gen_rtx_MEM (V2DImode,
12865 plus_constant (Pmode, op0, (i * 16)));
12866 emit_move_insn (op, xmm_regs[i]);
12867 }
12868
12869 return target;
12870
12871 case IX86_BUILTIN_ENCODEKEY128U32:
12872 {
12873 rtx op, xmm_regs[7];
12874
12875 arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
12876 arg1 = CALL_EXPR_ARG (exp, 1); // __m128i key
12877 arg2 = CALL_EXPR_ARG (exp, 2); // void *h
12878
12879 op0 = expand_normal (arg0);
12880 op1 = expand_normal (arg1);
12881 op2 = expand_normal (arg2);
12882
12883 if (!REG_P (op0))
12884 op0 = copy_to_mode_reg (SImode, op0);
12885
12886 op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
12887 emit_move_insn (op, op1);
12888
12889 for (i = 0; i < 3; i++)
12890 xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
12891
12892 if (target == 0)
12893 target = gen_reg_rtx (SImode);
12894
12895 emit_insn (gen_encodekey128u32 (target, op0));
12896
12897 for (i = 0; i < 3; i++)
12898 {
12899 op = gen_rtx_MEM (V2DImode,
12900 plus_constant (Pmode, op2, (i * 16)));
12901 emit_move_insn (op, xmm_regs[i]);
12902 }
12903
12904 return target;
12905 }
12906 case IX86_BUILTIN_ENCODEKEY256U32:
12907 {
12908 rtx op, xmm_regs[7];
12909
12910 arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
12911 arg1 = CALL_EXPR_ARG (exp, 1); // __m128i keylow
12912 arg2 = CALL_EXPR_ARG (exp, 2); // __m128i keyhi
12913 arg3 = CALL_EXPR_ARG (exp, 3); // void *h
12914
12915 op0 = expand_normal (arg0);
12916 op1 = expand_normal (arg1);
12917 op2 = expand_normal (arg2);
12918 op3 = expand_normal (arg3);
12919
12920 if (!REG_P (op0))
12921 op0 = copy_to_mode_reg (SImode, op0);
12922
12923 /* Force to use xmm0, xmm1 for keylow, keyhi*/
12924 op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
12925 emit_move_insn (op, op1);
12926 op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (1));
12927 emit_move_insn (op, op2);
12928
12929 for (i = 0; i < 4; i++)
12930 xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
12931
12932 if (target == 0)
12933 target = gen_reg_rtx (SImode);
12934
12935 emit_insn (gen_encodekey256u32 (target, op0));
12936
12937 for (i = 0; i < 4; i++)
12938 {
12939 op = gen_rtx_MEM (V2DImode,
12940 plus_constant (Pmode, op3, (i * 16)));
12941 emit_move_insn (op, xmm_regs[i]);
12942 }
12943
12944 return target;
12945 }
12946
2bf6d935
ML
12947 case IX86_BUILTIN_VEC_INIT_V2SI:
12948 case IX86_BUILTIN_VEC_INIT_V4HI:
12949 case IX86_BUILTIN_VEC_INIT_V8QI:
12950 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
12951
12952 case IX86_BUILTIN_VEC_EXT_V2DF:
12953 case IX86_BUILTIN_VEC_EXT_V2DI:
12954 case IX86_BUILTIN_VEC_EXT_V4SF:
12955 case IX86_BUILTIN_VEC_EXT_V4SI:
12956 case IX86_BUILTIN_VEC_EXT_V8HI:
12957 case IX86_BUILTIN_VEC_EXT_V2SI:
12958 case IX86_BUILTIN_VEC_EXT_V4HI:
12959 case IX86_BUILTIN_VEC_EXT_V16QI:
12960 return ix86_expand_vec_ext_builtin (exp, target);
12961
12962 case IX86_BUILTIN_VEC_SET_V2DI:
12963 case IX86_BUILTIN_VEC_SET_V4SF:
12964 case IX86_BUILTIN_VEC_SET_V4SI:
12965 case IX86_BUILTIN_VEC_SET_V8HI:
12966 case IX86_BUILTIN_VEC_SET_V4HI:
12967 case IX86_BUILTIN_VEC_SET_V16QI:
12968 return ix86_expand_vec_set_builtin (exp);
12969
12970 case IX86_BUILTIN_NANQ:
12971 case IX86_BUILTIN_NANSQ:
12972 return expand_call (exp, target, ignore);
12973
12974 case IX86_BUILTIN_RDPID:
12975
12976 op0 = gen_reg_rtx (word_mode);
12977
12978 if (TARGET_64BIT)
12979 {
12980 insn = gen_rdpid_rex64 (op0);
12981 op0 = convert_to_mode (SImode, op0, 1);
12982 }
12983 else
12984 insn = gen_rdpid (op0);
12985
12986 emit_insn (insn);
12987
12988 if (target == 0
12989 || !register_operand (target, SImode))
12990 target = gen_reg_rtx (SImode);
12991
12992 emit_move_insn (target, op0);
12993 return target;
12994
e21b52af
HL
12995 case IX86_BUILTIN_2INTERSECTD512:
12996 case IX86_BUILTIN_2INTERSECTQ512:
12997 case IX86_BUILTIN_2INTERSECTD256:
12998 case IX86_BUILTIN_2INTERSECTQ256:
12999 case IX86_BUILTIN_2INTERSECTD128:
13000 case IX86_BUILTIN_2INTERSECTQ128:
13001 arg0 = CALL_EXPR_ARG (exp, 0);
13002 arg1 = CALL_EXPR_ARG (exp, 1);
13003 arg2 = CALL_EXPR_ARG (exp, 2);
13004 arg3 = CALL_EXPR_ARG (exp, 3);
13005 op0 = expand_normal (arg0);
13006 op1 = expand_normal (arg1);
13007 op2 = expand_normal (arg2);
13008 op3 = expand_normal (arg3);
13009
13010 if (!address_operand (op0, VOIDmode))
13011 {
13012 op0 = convert_memory_address (Pmode, op0);
13013 op0 = copy_addr_to_reg (op0);
13014 }
13015 if (!address_operand (op1, VOIDmode))
13016 {
13017 op1 = convert_memory_address (Pmode, op1);
13018 op1 = copy_addr_to_reg (op1);
13019 }
13020
13021 switch (fcode)
13022 {
13023 case IX86_BUILTIN_2INTERSECTD512:
13024 mode4 = P2HImode;
13025 icode = CODE_FOR_avx512vp2intersect_2intersectv16si;
13026 break;
13027 case IX86_BUILTIN_2INTERSECTQ512:
13028 mode4 = P2QImode;
13029 icode = CODE_FOR_avx512vp2intersect_2intersectv8di;
13030 break;
13031 case IX86_BUILTIN_2INTERSECTD256:
13032 mode4 = P2QImode;
13033 icode = CODE_FOR_avx512vp2intersect_2intersectv8si;
13034 break;
13035 case IX86_BUILTIN_2INTERSECTQ256:
13036 mode4 = P2QImode;
13037 icode = CODE_FOR_avx512vp2intersect_2intersectv4di;
13038 break;
13039 case IX86_BUILTIN_2INTERSECTD128:
13040 mode4 = P2QImode;
13041 icode = CODE_FOR_avx512vp2intersect_2intersectv4si;
13042 break;
13043 case IX86_BUILTIN_2INTERSECTQ128:
13044 mode4 = P2QImode;
13045 icode = CODE_FOR_avx512vp2intersect_2intersectv2di;
13046 break;
13047 default:
13048 gcc_unreachable ();
13049 }
13050
13051 mode2 = insn_data[icode].operand[1].mode;
13052 mode3 = insn_data[icode].operand[2].mode;
13053 if (!insn_data[icode].operand[1].predicate (op2, mode2))
13054 op2 = copy_to_mode_reg (mode2, op2);
13055 if (!insn_data[icode].operand[2].predicate (op3, mode3))
13056 op3 = copy_to_mode_reg (mode3, op3);
13057
13058 op4 = gen_reg_rtx (mode4);
13059 emit_insn (GEN_FCN (icode) (op4, op2, op3));
13060 mode0 = mode4 == P2HImode ? HImode : QImode;
13061 emit_move_insn (gen_rtx_MEM (mode0, op0),
13062 gen_lowpart (mode0, op4));
13063 emit_move_insn (gen_rtx_MEM (mode0, op1),
13064 gen_highpart (mode0, op4));
13065
13066 return 0;
13067
2bf6d935
ML
13068 case IX86_BUILTIN_RDPMC:
13069 case IX86_BUILTIN_RDTSC:
13070 case IX86_BUILTIN_RDTSCP:
13071 case IX86_BUILTIN_XGETBV:
13072
13073 op0 = gen_reg_rtx (DImode);
13074 op1 = gen_reg_rtx (DImode);
13075
13076 if (fcode == IX86_BUILTIN_RDPMC)
13077 {
13078 arg0 = CALL_EXPR_ARG (exp, 0);
13079 op2 = expand_normal (arg0);
13080 if (!register_operand (op2, SImode))
13081 op2 = copy_to_mode_reg (SImode, op2);
13082
13083 insn = (TARGET_64BIT
13084 ? gen_rdpmc_rex64 (op0, op1, op2)
13085 : gen_rdpmc (op0, op2));
13086 emit_insn (insn);
13087 }
13088 else if (fcode == IX86_BUILTIN_XGETBV)
13089 {
13090 arg0 = CALL_EXPR_ARG (exp, 0);
13091 op2 = expand_normal (arg0);
13092 if (!register_operand (op2, SImode))
13093 op2 = copy_to_mode_reg (SImode, op2);
13094
13095 insn = (TARGET_64BIT
13096 ? gen_xgetbv_rex64 (op0, op1, op2)
13097 : gen_xgetbv (op0, op2));
13098 emit_insn (insn);
13099 }
13100 else if (fcode == IX86_BUILTIN_RDTSC)
13101 {
13102 insn = (TARGET_64BIT
13103 ? gen_rdtsc_rex64 (op0, op1)
13104 : gen_rdtsc (op0));
13105 emit_insn (insn);
13106 }
13107 else
13108 {
13109 op2 = gen_reg_rtx (SImode);
13110
13111 insn = (TARGET_64BIT
13112 ? gen_rdtscp_rex64 (op0, op1, op2)
13113 : gen_rdtscp (op0, op2));
13114 emit_insn (insn);
13115
13116 arg0 = CALL_EXPR_ARG (exp, 0);
13117 op4 = expand_normal (arg0);
13118 if (!address_operand (op4, VOIDmode))
13119 {
13120 op4 = convert_memory_address (Pmode, op4);
13121 op4 = copy_addr_to_reg (op4);
13122 }
13123 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
13124 }
13125
13126 if (target == 0
13127 || !register_operand (target, DImode))
13128 target = gen_reg_rtx (DImode);
13129
13130 if (TARGET_64BIT)
13131 {
13132 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
13133 op1, 1, OPTAB_DIRECT);
13134 op0 = expand_simple_binop (DImode, IOR, op0, op1,
13135 op0, 1, OPTAB_DIRECT);
13136 }
13137
13138 emit_move_insn (target, op0);
13139 return target;
13140
6a10feda
XG
13141 case IX86_BUILTIN_ENQCMD:
13142 case IX86_BUILTIN_ENQCMDS:
2bf6d935
ML
13143 case IX86_BUILTIN_MOVDIR64B:
13144
13145 arg0 = CALL_EXPR_ARG (exp, 0);
13146 arg1 = CALL_EXPR_ARG (exp, 1);
13147 op0 = expand_normal (arg0);
13148 op1 = expand_normal (arg1);
13149
13150 op0 = ix86_zero_extend_to_Pmode (op0);
13151 if (!address_operand (op1, VOIDmode))
13152 {
13153 op1 = convert_memory_address (Pmode, op1);
13154 op1 = copy_addr_to_reg (op1);
13155 }
13156 op1 = gen_rtx_MEM (XImode, op1);
13157
6a10feda
XG
13158 if (fcode == IX86_BUILTIN_MOVDIR64B)
13159 {
13160 emit_insn (gen_movdir64b (Pmode, op0, op1));
13161 return 0;
13162 }
13163 else
13164 {
44320665
UB
13165 if (target == 0
13166 || !register_operand (target, SImode))
13167 target = gen_reg_rtx (SImode);
6a10feda 13168
6a10feda
XG
13169 emit_move_insn (target, const0_rtx);
13170 target = gen_rtx_SUBREG (QImode, target, 0);
13171
44320665
UB
13172 int unspecv = (fcode == IX86_BUILTIN_ENQCMD
13173 ? UNSPECV_ENQCMD
13174 : UNSPECV_ENQCMDS);
13175 icode = code_for_enqcmd (unspecv, Pmode);
13176 emit_insn (GEN_FCN (icode) (op0, op1));
6a10feda 13177
44320665
UB
13178 emit_insn
13179 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
13180 gen_rtx_fmt_ee (EQ, QImode,
13181 gen_rtx_REG (CCZmode, FLAGS_REG),
13182 const0_rtx)));
6a10feda
XG
13183 return SUBREG_REG (target);
13184 }
2bf6d935
ML
13185
13186 case IX86_BUILTIN_FXSAVE:
13187 case IX86_BUILTIN_FXRSTOR:
13188 case IX86_BUILTIN_FXSAVE64:
13189 case IX86_BUILTIN_FXRSTOR64:
13190 case IX86_BUILTIN_FNSTENV:
13191 case IX86_BUILTIN_FLDENV:
13192 mode0 = BLKmode;
13193 switch (fcode)
13194 {
13195 case IX86_BUILTIN_FXSAVE:
13196 icode = CODE_FOR_fxsave;
13197 break;
13198 case IX86_BUILTIN_FXRSTOR:
13199 icode = CODE_FOR_fxrstor;
13200 break;
13201 case IX86_BUILTIN_FXSAVE64:
13202 icode = CODE_FOR_fxsave64;
13203 break;
13204 case IX86_BUILTIN_FXRSTOR64:
13205 icode = CODE_FOR_fxrstor64;
13206 break;
13207 case IX86_BUILTIN_FNSTENV:
13208 icode = CODE_FOR_fnstenv;
13209 break;
13210 case IX86_BUILTIN_FLDENV:
13211 icode = CODE_FOR_fldenv;
13212 break;
13213 default:
13214 gcc_unreachable ();
13215 }
13216
13217 arg0 = CALL_EXPR_ARG (exp, 0);
13218 op0 = expand_normal (arg0);
13219
13220 if (!address_operand (op0, VOIDmode))
13221 {
13222 op0 = convert_memory_address (Pmode, op0);
13223 op0 = copy_addr_to_reg (op0);
13224 }
13225 op0 = gen_rtx_MEM (mode0, op0);
13226
13227 pat = GEN_FCN (icode) (op0);
13228 if (pat)
13229 emit_insn (pat);
13230 return 0;
13231
13232 case IX86_BUILTIN_XSETBV:
13233 arg0 = CALL_EXPR_ARG (exp, 0);
13234 arg1 = CALL_EXPR_ARG (exp, 1);
13235 op0 = expand_normal (arg0);
13236 op1 = expand_normal (arg1);
13237
13238 if (!REG_P (op0))
13239 op0 = copy_to_mode_reg (SImode, op0);
13240
13241 op1 = force_reg (DImode, op1);
13242
13243 if (TARGET_64BIT)
13244 {
13245 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
13246 NULL, 1, OPTAB_DIRECT);
13247
13248 icode = CODE_FOR_xsetbv_rex64;
13249
13250 op2 = gen_lowpart (SImode, op2);
13251 op1 = gen_lowpart (SImode, op1);
13252 pat = GEN_FCN (icode) (op0, op1, op2);
13253 }
13254 else
13255 {
13256 icode = CODE_FOR_xsetbv;
13257
13258 pat = GEN_FCN (icode) (op0, op1);
13259 }
13260 if (pat)
13261 emit_insn (pat);
13262 return 0;
13263
13264 case IX86_BUILTIN_XSAVE:
13265 case IX86_BUILTIN_XRSTOR:
13266 case IX86_BUILTIN_XSAVE64:
13267 case IX86_BUILTIN_XRSTOR64:
13268 case IX86_BUILTIN_XSAVEOPT:
13269 case IX86_BUILTIN_XSAVEOPT64:
13270 case IX86_BUILTIN_XSAVES:
13271 case IX86_BUILTIN_XRSTORS:
13272 case IX86_BUILTIN_XSAVES64:
13273 case IX86_BUILTIN_XRSTORS64:
13274 case IX86_BUILTIN_XSAVEC:
13275 case IX86_BUILTIN_XSAVEC64:
13276 arg0 = CALL_EXPR_ARG (exp, 0);
13277 arg1 = CALL_EXPR_ARG (exp, 1);
13278 op0 = expand_normal (arg0);
13279 op1 = expand_normal (arg1);
13280
13281 if (!address_operand (op0, VOIDmode))
13282 {
13283 op0 = convert_memory_address (Pmode, op0);
13284 op0 = copy_addr_to_reg (op0);
13285 }
13286 op0 = gen_rtx_MEM (BLKmode, op0);
13287
13288 op1 = force_reg (DImode, op1);
13289
13290 if (TARGET_64BIT)
13291 {
13292 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
13293 NULL, 1, OPTAB_DIRECT);
13294 switch (fcode)
13295 {
13296 case IX86_BUILTIN_XSAVE:
13297 icode = CODE_FOR_xsave_rex64;
13298 break;
13299 case IX86_BUILTIN_XRSTOR:
13300 icode = CODE_FOR_xrstor_rex64;
13301 break;
13302 case IX86_BUILTIN_XSAVE64:
13303 icode = CODE_FOR_xsave64;
13304 break;
13305 case IX86_BUILTIN_XRSTOR64:
13306 icode = CODE_FOR_xrstor64;
13307 break;
13308 case IX86_BUILTIN_XSAVEOPT:
13309 icode = CODE_FOR_xsaveopt_rex64;
13310 break;
13311 case IX86_BUILTIN_XSAVEOPT64:
13312 icode = CODE_FOR_xsaveopt64;
13313 break;
13314 case IX86_BUILTIN_XSAVES:
13315 icode = CODE_FOR_xsaves_rex64;
13316 break;
13317 case IX86_BUILTIN_XRSTORS:
13318 icode = CODE_FOR_xrstors_rex64;
13319 break;
13320 case IX86_BUILTIN_XSAVES64:
13321 icode = CODE_FOR_xsaves64;
13322 break;
13323 case IX86_BUILTIN_XRSTORS64:
13324 icode = CODE_FOR_xrstors64;
13325 break;
13326 case IX86_BUILTIN_XSAVEC:
13327 icode = CODE_FOR_xsavec_rex64;
13328 break;
13329 case IX86_BUILTIN_XSAVEC64:
13330 icode = CODE_FOR_xsavec64;
13331 break;
13332 default:
13333 gcc_unreachable ();
13334 }
13335
13336 op2 = gen_lowpart (SImode, op2);
13337 op1 = gen_lowpart (SImode, op1);
13338 pat = GEN_FCN (icode) (op0, op1, op2);
13339 }
13340 else
13341 {
13342 switch (fcode)
13343 {
13344 case IX86_BUILTIN_XSAVE:
13345 icode = CODE_FOR_xsave;
13346 break;
13347 case IX86_BUILTIN_XRSTOR:
13348 icode = CODE_FOR_xrstor;
13349 break;
13350 case IX86_BUILTIN_XSAVEOPT:
13351 icode = CODE_FOR_xsaveopt;
13352 break;
13353 case IX86_BUILTIN_XSAVES:
13354 icode = CODE_FOR_xsaves;
13355 break;
13356 case IX86_BUILTIN_XRSTORS:
13357 icode = CODE_FOR_xrstors;
13358 break;
13359 case IX86_BUILTIN_XSAVEC:
13360 icode = CODE_FOR_xsavec;
13361 break;
13362 default:
13363 gcc_unreachable ();
13364 }
13365 pat = GEN_FCN (icode) (op0, op1);
13366 }
13367
13368 if (pat)
13369 emit_insn (pat);
13370 return 0;
13371
13372 case IX86_BUILTIN_LLWPCB:
13373 arg0 = CALL_EXPR_ARG (exp, 0);
13374 op0 = expand_normal (arg0);
2398c206
UB
13375
13376 if (!register_operand (op0, Pmode))
2bf6d935 13377 op0 = ix86_zero_extend_to_Pmode (op0);
2398c206 13378 emit_insn (gen_lwp_llwpcb (Pmode, op0));
2bf6d935
ML
13379 return 0;
13380
13381 case IX86_BUILTIN_SLWPCB:
2bf6d935 13382 if (!target
2398c206 13383 || !register_operand (target, Pmode))
2bf6d935 13384 target = gen_reg_rtx (Pmode);
2398c206 13385 emit_insn (gen_lwp_slwpcb (Pmode, target));
2bf6d935
ML
13386 return target;
13387
2398c206
UB
13388 case IX86_BUILTIN_LWPVAL32:
13389 case IX86_BUILTIN_LWPVAL64:
13390 case IX86_BUILTIN_LWPINS32:
13391 case IX86_BUILTIN_LWPINS64:
13392 mode = ((fcode == IX86_BUILTIN_LWPVAL32
13393 || fcode == IX86_BUILTIN_LWPINS32)
13394 ? SImode : DImode);
13395
13396 if (fcode == IX86_BUILTIN_LWPVAL32
13397 || fcode == IX86_BUILTIN_LWPVAL64)
13398 icode = code_for_lwp_lwpval (mode);
13399 else
13400 icode = code_for_lwp_lwpins (mode);
13401
13402 arg0 = CALL_EXPR_ARG (exp, 0);
13403 arg1 = CALL_EXPR_ARG (exp, 1);
13404 arg2 = CALL_EXPR_ARG (exp, 2);
13405 op0 = expand_normal (arg0);
13406 op1 = expand_normal (arg1);
13407 op2 = expand_normal (arg2);
13408 mode0 = insn_data[icode].operand[0].mode;
13409
13410 if (!insn_data[icode].operand[0].predicate (op0, mode0))
13411 op0 = copy_to_mode_reg (mode0, op0);
13412 if (!insn_data[icode].operand[1].predicate (op1, SImode))
13413 op1 = copy_to_mode_reg (SImode, op1);
13414
13415 if (!CONST_INT_P (op2))
13416 {
13417 error ("the last argument must be a 32-bit immediate");
13418 return const0_rtx;
13419 }
13420
13421 emit_insn (GEN_FCN (icode) (op0, op1, op2));
13422
13423 if (fcode == IX86_BUILTIN_LWPINS32
13424 || fcode == IX86_BUILTIN_LWPINS64)
13425 {
13426 if (target == 0
13427 || !nonimmediate_operand (target, QImode))
13428 target = gen_reg_rtx (QImode);
13429
13430 pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
13431 const0_rtx);
13432 emit_insn (gen_rtx_SET (target, pat));
13433
13434 return target;
13435 }
13436 else
13437 return 0;
13438
2bf6d935
ML
13439 case IX86_BUILTIN_BEXTRI32:
13440 case IX86_BUILTIN_BEXTRI64:
9e026191
UB
13441 mode = (fcode == IX86_BUILTIN_BEXTRI32 ? SImode : DImode);
13442
2bf6d935
ML
13443 arg0 = CALL_EXPR_ARG (exp, 0);
13444 arg1 = CALL_EXPR_ARG (exp, 1);
13445 op0 = expand_normal (arg0);
13446 op1 = expand_normal (arg1);
9e026191 13447
2bf6d935 13448 if (!CONST_INT_P (op1))
9e026191
UB
13449 {
13450 error ("last argument must be an immediate");
13451 return const0_rtx;
13452 }
2bf6d935 13453 else
9e026191
UB
13454 {
13455 unsigned char lsb_index = UINTVAL (op1);
13456 unsigned char length = UINTVAL (op1) >> 8;
13457
13458 unsigned char bitsize = GET_MODE_BITSIZE (mode);
13459
13460 icode = code_for_tbm_bextri (mode);
2bf6d935
ML
13461
13462 mode1 = insn_data[icode].operand[1].mode;
13463 if (!insn_data[icode].operand[1].predicate (op0, mode1))
13464 op0 = copy_to_mode_reg (mode1, op0);
13465
13466 mode0 = insn_data[icode].operand[0].mode;
13467 if (target == 0
13468 || !register_operand (target, mode0))
13469 target = gen_reg_rtx (mode0);
13470
9e026191
UB
13471 if (length == 0 || lsb_index >= bitsize)
13472 {
13473 emit_move_insn (target, const0_rtx);
13474 return target;
13475 }
13476
13477 if (length + lsb_index > bitsize)
13478 length = bitsize - lsb_index;
13479
13480 op1 = GEN_INT (length);
13481 op2 = GEN_INT (lsb_index);
13482
13483 emit_insn (GEN_FCN (icode) (target, op0, op1, op2));
13484 return target;
13485 }
2bf6d935
ML
13486
13487 case IX86_BUILTIN_RDRAND16_STEP:
9e026191 13488 mode = HImode;
2bf6d935
ML
13489 goto rdrand_step;
13490
13491 case IX86_BUILTIN_RDRAND32_STEP:
9e026191 13492 mode = SImode;
2bf6d935
ML
13493 goto rdrand_step;
13494
13495 case IX86_BUILTIN_RDRAND64_STEP:
9e026191 13496 mode = DImode;
2bf6d935
ML
13497
13498rdrand_step:
13499 arg0 = CALL_EXPR_ARG (exp, 0);
13500 op1 = expand_normal (arg0);
13501 if (!address_operand (op1, VOIDmode))
13502 {
13503 op1 = convert_memory_address (Pmode, op1);
13504 op1 = copy_addr_to_reg (op1);
13505 }
13506
9e026191
UB
13507 op0 = gen_reg_rtx (mode);
13508 emit_insn (gen_rdrand (mode, op0));
2bf6d935 13509
9e026191 13510 emit_move_insn (gen_rtx_MEM (mode, op1), op0);
2bf6d935 13511
9e026191 13512 op1 = force_reg (SImode, const1_rtx);
2bf6d935
ML
13513
13514 /* Emit SImode conditional move. */
9e026191 13515 if (mode == HImode)
2bf6d935
ML
13516 {
13517 if (TARGET_ZERO_EXTEND_WITH_AND
13518 && optimize_function_for_speed_p (cfun))
13519 {
13520 op2 = force_reg (SImode, const0_rtx);
13521
13522 emit_insn (gen_movstricthi
13523 (gen_lowpart (HImode, op2), op0));
13524 }
13525 else
13526 {
13527 op2 = gen_reg_rtx (SImode);
13528
13529 emit_insn (gen_zero_extendhisi2 (op2, op0));
13530 }
13531 }
9e026191 13532 else if (mode == SImode)
2bf6d935
ML
13533 op2 = op0;
13534 else
13535 op2 = gen_rtx_SUBREG (SImode, op0, 0);
13536
13537 if (target == 0
13538 || !register_operand (target, SImode))
13539 target = gen_reg_rtx (SImode);
13540
13541 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
13542 const0_rtx);
13543 emit_insn (gen_rtx_SET (target,
13544 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
13545 return target;
13546
13547 case IX86_BUILTIN_RDSEED16_STEP:
9e026191 13548 mode = HImode;
2bf6d935
ML
13549 goto rdseed_step;
13550
13551 case IX86_BUILTIN_RDSEED32_STEP:
9e026191 13552 mode = SImode;
2bf6d935
ML
13553 goto rdseed_step;
13554
13555 case IX86_BUILTIN_RDSEED64_STEP:
9e026191 13556 mode = DImode;
2bf6d935
ML
13557
13558rdseed_step:
13559 arg0 = CALL_EXPR_ARG (exp, 0);
13560 op1 = expand_normal (arg0);
13561 if (!address_operand (op1, VOIDmode))
13562 {
13563 op1 = convert_memory_address (Pmode, op1);
13564 op1 = copy_addr_to_reg (op1);
13565 }
13566
9e026191
UB
13567 op0 = gen_reg_rtx (mode);
13568 emit_insn (gen_rdseed (mode, op0));
2bf6d935 13569
9e026191 13570 emit_move_insn (gen_rtx_MEM (mode, op1), op0);
2bf6d935
ML
13571
13572 op2 = gen_reg_rtx (QImode);
13573
13574 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
13575 const0_rtx);
13576 emit_insn (gen_rtx_SET (op2, pat));
13577
13578 if (target == 0
13579 || !register_operand (target, SImode))
13580 target = gen_reg_rtx (SImode);
13581
13582 emit_insn (gen_zero_extendqisi2 (target, op2));
13583 return target;
13584
13585 case IX86_BUILTIN_SBB32:
13586 icode = CODE_FOR_subborrowsi;
13587 icode2 = CODE_FOR_subborrowsi_0;
13588 mode0 = SImode;
13589 mode1 = DImode;
13590 mode2 = CCmode;
13591 goto handlecarry;
13592
13593 case IX86_BUILTIN_SBB64:
13594 icode = CODE_FOR_subborrowdi;
13595 icode2 = CODE_FOR_subborrowdi_0;
13596 mode0 = DImode;
13597 mode1 = TImode;
13598 mode2 = CCmode;
13599 goto handlecarry;
13600
13601 case IX86_BUILTIN_ADDCARRYX32:
13602 icode = CODE_FOR_addcarrysi;
13603 icode2 = CODE_FOR_addcarrysi_0;
13604 mode0 = SImode;
13605 mode1 = DImode;
13606 mode2 = CCCmode;
13607 goto handlecarry;
13608
13609 case IX86_BUILTIN_ADDCARRYX64:
13610 icode = CODE_FOR_addcarrydi;
13611 icode2 = CODE_FOR_addcarrydi_0;
13612 mode0 = DImode;
13613 mode1 = TImode;
13614 mode2 = CCCmode;
13615
13616 handlecarry:
13617 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
13618 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
13619 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
13620 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
13621
13622 op1 = expand_normal (arg0);
13623 if (!integer_zerop (arg0))
13624 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
13625
13626 op2 = expand_normal (arg1);
13627 if (!register_operand (op2, mode0))
13628 op2 = copy_to_mode_reg (mode0, op2);
13629
13630 op3 = expand_normal (arg2);
13631 if (!register_operand (op3, mode0))
13632 op3 = copy_to_mode_reg (mode0, op3);
13633
13634 op4 = expand_normal (arg3);
13635 if (!address_operand (op4, VOIDmode))
13636 {
13637 op4 = convert_memory_address (Pmode, op4);
13638 op4 = copy_addr_to_reg (op4);
13639 }
13640
13641 op0 = gen_reg_rtx (mode0);
13642 if (integer_zerop (arg0))
13643 {
13644 /* If arg0 is 0, optimize right away into add or sub
13645 instruction that sets CCCmode flags. */
13646 op1 = gen_rtx_REG (mode2, FLAGS_REG);
13647 emit_insn (GEN_FCN (icode2) (op0, op2, op3));
13648 }
13649 else
13650 {
13651 /* Generate CF from input operand. */
13652 emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
13653
13654 /* Generate instruction that consumes CF. */
13655 op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
13656 pat = gen_rtx_LTU (mode1, op1, const0_rtx);
13657 pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
13658 emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
13659 }
13660
13661 /* Return current CF value. */
13662 if (target == 0)
13663 target = gen_reg_rtx (QImode);
13664
13665 pat = gen_rtx_LTU (QImode, op1, const0_rtx);
13666 emit_insn (gen_rtx_SET (target, pat));
13667
13668 /* Store the result. */
13669 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
13670
13671 return target;
13672
13673 case IX86_BUILTIN_READ_FLAGS:
b60bc913
JJ
13674 if (ignore)
13675 return const0_rtx;
13676
2bf6d935
ML
13677 emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
13678
13679 if (optimize
13680 || target == NULL_RTX
13681 || !nonimmediate_operand (target, word_mode)
13682 || GET_MODE (target) != word_mode)
13683 target = gen_reg_rtx (word_mode);
13684
13685 emit_insn (gen_pop (target));
13686 return target;
13687
13688 case IX86_BUILTIN_WRITE_FLAGS:
13689
13690 arg0 = CALL_EXPR_ARG (exp, 0);
13691 op0 = expand_normal (arg0);
13692 if (!general_no_elim_operand (op0, word_mode))
13693 op0 = copy_to_mode_reg (word_mode, op0);
13694
13695 emit_insn (gen_push (op0));
13696 emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
13697 return 0;
13698
13699 case IX86_BUILTIN_KTESTC8:
13700 icode = CODE_FOR_ktestqi;
13701 mode3 = CCCmode;
13702 goto kortest;
13703
13704 case IX86_BUILTIN_KTESTZ8:
13705 icode = CODE_FOR_ktestqi;
13706 mode3 = CCZmode;
13707 goto kortest;
13708
13709 case IX86_BUILTIN_KTESTC16:
13710 icode = CODE_FOR_ktesthi;
13711 mode3 = CCCmode;
13712 goto kortest;
13713
13714 case IX86_BUILTIN_KTESTZ16:
13715 icode = CODE_FOR_ktesthi;
13716 mode3 = CCZmode;
13717 goto kortest;
13718
13719 case IX86_BUILTIN_KTESTC32:
13720 icode = CODE_FOR_ktestsi;
13721 mode3 = CCCmode;
13722 goto kortest;
13723
13724 case IX86_BUILTIN_KTESTZ32:
13725 icode = CODE_FOR_ktestsi;
13726 mode3 = CCZmode;
13727 goto kortest;
13728
13729 case IX86_BUILTIN_KTESTC64:
13730 icode = CODE_FOR_ktestdi;
13731 mode3 = CCCmode;
13732 goto kortest;
13733
13734 case IX86_BUILTIN_KTESTZ64:
13735 icode = CODE_FOR_ktestdi;
13736 mode3 = CCZmode;
13737 goto kortest;
13738
13739 case IX86_BUILTIN_KORTESTC8:
13740 icode = CODE_FOR_kortestqi;
13741 mode3 = CCCmode;
13742 goto kortest;
13743
13744 case IX86_BUILTIN_KORTESTZ8:
13745 icode = CODE_FOR_kortestqi;
13746 mode3 = CCZmode;
13747 goto kortest;
13748
13749 case IX86_BUILTIN_KORTESTC16:
13750 icode = CODE_FOR_kortesthi;
13751 mode3 = CCCmode;
13752 goto kortest;
13753
13754 case IX86_BUILTIN_KORTESTZ16:
13755 icode = CODE_FOR_kortesthi;
13756 mode3 = CCZmode;
13757 goto kortest;
13758
13759 case IX86_BUILTIN_KORTESTC32:
13760 icode = CODE_FOR_kortestsi;
13761 mode3 = CCCmode;
13762 goto kortest;
13763
13764 case IX86_BUILTIN_KORTESTZ32:
13765 icode = CODE_FOR_kortestsi;
13766 mode3 = CCZmode;
13767 goto kortest;
13768
13769 case IX86_BUILTIN_KORTESTC64:
13770 icode = CODE_FOR_kortestdi;
13771 mode3 = CCCmode;
13772 goto kortest;
13773
13774 case IX86_BUILTIN_KORTESTZ64:
13775 icode = CODE_FOR_kortestdi;
13776 mode3 = CCZmode;
13777
13778 kortest:
13779 arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
13780 arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
13781 op0 = expand_normal (arg0);
13782 op1 = expand_normal (arg1);
13783
13784 mode0 = insn_data[icode].operand[0].mode;
13785 mode1 = insn_data[icode].operand[1].mode;
13786
13787 if (GET_MODE (op0) != VOIDmode)
13788 op0 = force_reg (GET_MODE (op0), op0);
13789
13790 op0 = gen_lowpart (mode0, op0);
13791
13792 if (!insn_data[icode].operand[0].predicate (op0, mode0))
13793 op0 = copy_to_mode_reg (mode0, op0);
13794
13795 if (GET_MODE (op1) != VOIDmode)
13796 op1 = force_reg (GET_MODE (op1), op1);
13797
13798 op1 = gen_lowpart (mode1, op1);
13799
13800 if (!insn_data[icode].operand[1].predicate (op1, mode1))
13801 op1 = copy_to_mode_reg (mode1, op1);
13802
13803 target = gen_reg_rtx (QImode);
13804
13805 /* Emit kortest. */
13806 emit_insn (GEN_FCN (icode) (op0, op1));
13807 /* And use setcc to return result from flags. */
13808 ix86_expand_setcc (target, EQ,
13809 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
13810 return target;
13811
13812 case IX86_BUILTIN_GATHERSIV2DF:
13813 icode = CODE_FOR_avx2_gathersiv2df;
13814 goto gather_gen;
13815 case IX86_BUILTIN_GATHERSIV4DF:
13816 icode = CODE_FOR_avx2_gathersiv4df;
13817 goto gather_gen;
13818 case IX86_BUILTIN_GATHERDIV2DF:
13819 icode = CODE_FOR_avx2_gatherdiv2df;
13820 goto gather_gen;
13821 case IX86_BUILTIN_GATHERDIV4DF:
13822 icode = CODE_FOR_avx2_gatherdiv4df;
13823 goto gather_gen;
13824 case IX86_BUILTIN_GATHERSIV4SF:
13825 icode = CODE_FOR_avx2_gathersiv4sf;
13826 goto gather_gen;
13827 case IX86_BUILTIN_GATHERSIV8SF:
13828 icode = CODE_FOR_avx2_gathersiv8sf;
13829 goto gather_gen;
13830 case IX86_BUILTIN_GATHERDIV4SF:
13831 icode = CODE_FOR_avx2_gatherdiv4sf;
13832 goto gather_gen;
13833 case IX86_BUILTIN_GATHERDIV8SF:
13834 icode = CODE_FOR_avx2_gatherdiv8sf;
13835 goto gather_gen;
13836 case IX86_BUILTIN_GATHERSIV2DI:
13837 icode = CODE_FOR_avx2_gathersiv2di;
13838 goto gather_gen;
13839 case IX86_BUILTIN_GATHERSIV4DI:
13840 icode = CODE_FOR_avx2_gathersiv4di;
13841 goto gather_gen;
13842 case IX86_BUILTIN_GATHERDIV2DI:
13843 icode = CODE_FOR_avx2_gatherdiv2di;
13844 goto gather_gen;
13845 case IX86_BUILTIN_GATHERDIV4DI:
13846 icode = CODE_FOR_avx2_gatherdiv4di;
13847 goto gather_gen;
13848 case IX86_BUILTIN_GATHERSIV4SI:
13849 icode = CODE_FOR_avx2_gathersiv4si;
13850 goto gather_gen;
13851 case IX86_BUILTIN_GATHERSIV8SI:
13852 icode = CODE_FOR_avx2_gathersiv8si;
13853 goto gather_gen;
13854 case IX86_BUILTIN_GATHERDIV4SI:
13855 icode = CODE_FOR_avx2_gatherdiv4si;
13856 goto gather_gen;
13857 case IX86_BUILTIN_GATHERDIV8SI:
13858 icode = CODE_FOR_avx2_gatherdiv8si;
13859 goto gather_gen;
13860 case IX86_BUILTIN_GATHERALTSIV4DF:
13861 icode = CODE_FOR_avx2_gathersiv4df;
13862 goto gather_gen;
13863 case IX86_BUILTIN_GATHERALTDIV8SF:
13864 icode = CODE_FOR_avx2_gatherdiv8sf;
13865 goto gather_gen;
13866 case IX86_BUILTIN_GATHERALTSIV4DI:
13867 icode = CODE_FOR_avx2_gathersiv4di;
13868 goto gather_gen;
13869 case IX86_BUILTIN_GATHERALTDIV8SI:
13870 icode = CODE_FOR_avx2_gatherdiv8si;
13871 goto gather_gen;
13872 case IX86_BUILTIN_GATHER3SIV16SF:
13873 icode = CODE_FOR_avx512f_gathersiv16sf;
13874 goto gather_gen;
13875 case IX86_BUILTIN_GATHER3SIV8DF:
13876 icode = CODE_FOR_avx512f_gathersiv8df;
13877 goto gather_gen;
13878 case IX86_BUILTIN_GATHER3DIV16SF:
13879 icode = CODE_FOR_avx512f_gatherdiv16sf;
13880 goto gather_gen;
13881 case IX86_BUILTIN_GATHER3DIV8DF:
13882 icode = CODE_FOR_avx512f_gatherdiv8df;
13883 goto gather_gen;
13884 case IX86_BUILTIN_GATHER3SIV16SI:
13885 icode = CODE_FOR_avx512f_gathersiv16si;
13886 goto gather_gen;
13887 case IX86_BUILTIN_GATHER3SIV8DI:
13888 icode = CODE_FOR_avx512f_gathersiv8di;
13889 goto gather_gen;
13890 case IX86_BUILTIN_GATHER3DIV16SI:
13891 icode = CODE_FOR_avx512f_gatherdiv16si;
13892 goto gather_gen;
13893 case IX86_BUILTIN_GATHER3DIV8DI:
13894 icode = CODE_FOR_avx512f_gatherdiv8di;
13895 goto gather_gen;
13896 case IX86_BUILTIN_GATHER3ALTSIV8DF:
13897 icode = CODE_FOR_avx512f_gathersiv8df;
13898 goto gather_gen;
13899 case IX86_BUILTIN_GATHER3ALTDIV16SF:
13900 icode = CODE_FOR_avx512f_gatherdiv16sf;
13901 goto gather_gen;
13902 case IX86_BUILTIN_GATHER3ALTSIV8DI:
13903 icode = CODE_FOR_avx512f_gathersiv8di;
13904 goto gather_gen;
13905 case IX86_BUILTIN_GATHER3ALTDIV16SI:
13906 icode = CODE_FOR_avx512f_gatherdiv16si;
13907 goto gather_gen;
13908 case IX86_BUILTIN_GATHER3SIV2DF:
13909 icode = CODE_FOR_avx512vl_gathersiv2df;
13910 goto gather_gen;
13911 case IX86_BUILTIN_GATHER3SIV4DF:
13912 icode = CODE_FOR_avx512vl_gathersiv4df;
13913 goto gather_gen;
13914 case IX86_BUILTIN_GATHER3DIV2DF:
13915 icode = CODE_FOR_avx512vl_gatherdiv2df;
13916 goto gather_gen;
13917 case IX86_BUILTIN_GATHER3DIV4DF:
13918 icode = CODE_FOR_avx512vl_gatherdiv4df;
13919 goto gather_gen;
13920 case IX86_BUILTIN_GATHER3SIV4SF:
13921 icode = CODE_FOR_avx512vl_gathersiv4sf;
13922 goto gather_gen;
13923 case IX86_BUILTIN_GATHER3SIV8SF:
13924 icode = CODE_FOR_avx512vl_gathersiv8sf;
13925 goto gather_gen;
13926 case IX86_BUILTIN_GATHER3DIV4SF:
13927 icode = CODE_FOR_avx512vl_gatherdiv4sf;
13928 goto gather_gen;
13929 case IX86_BUILTIN_GATHER3DIV8SF:
13930 icode = CODE_FOR_avx512vl_gatherdiv8sf;
13931 goto gather_gen;
13932 case IX86_BUILTIN_GATHER3SIV2DI:
13933 icode = CODE_FOR_avx512vl_gathersiv2di;
13934 goto gather_gen;
13935 case IX86_BUILTIN_GATHER3SIV4DI:
13936 icode = CODE_FOR_avx512vl_gathersiv4di;
13937 goto gather_gen;
13938 case IX86_BUILTIN_GATHER3DIV2DI:
13939 icode = CODE_FOR_avx512vl_gatherdiv2di;
13940 goto gather_gen;
13941 case IX86_BUILTIN_GATHER3DIV4DI:
13942 icode = CODE_FOR_avx512vl_gatherdiv4di;
13943 goto gather_gen;
13944 case IX86_BUILTIN_GATHER3SIV4SI:
13945 icode = CODE_FOR_avx512vl_gathersiv4si;
13946 goto gather_gen;
13947 case IX86_BUILTIN_GATHER3SIV8SI:
13948 icode = CODE_FOR_avx512vl_gathersiv8si;
13949 goto gather_gen;
13950 case IX86_BUILTIN_GATHER3DIV4SI:
13951 icode = CODE_FOR_avx512vl_gatherdiv4si;
13952 goto gather_gen;
13953 case IX86_BUILTIN_GATHER3DIV8SI:
13954 icode = CODE_FOR_avx512vl_gatherdiv8si;
13955 goto gather_gen;
13956 case IX86_BUILTIN_GATHER3ALTSIV4DF:
13957 icode = CODE_FOR_avx512vl_gathersiv4df;
13958 goto gather_gen;
13959 case IX86_BUILTIN_GATHER3ALTDIV8SF:
13960 icode = CODE_FOR_avx512vl_gatherdiv8sf;
13961 goto gather_gen;
13962 case IX86_BUILTIN_GATHER3ALTSIV4DI:
13963 icode = CODE_FOR_avx512vl_gathersiv4di;
13964 goto gather_gen;
13965 case IX86_BUILTIN_GATHER3ALTDIV8SI:
13966 icode = CODE_FOR_avx512vl_gatherdiv8si;
13967 goto gather_gen;
13968 case IX86_BUILTIN_SCATTERSIV16SF:
13969 icode = CODE_FOR_avx512f_scattersiv16sf;
13970 goto scatter_gen;
13971 case IX86_BUILTIN_SCATTERSIV8DF:
13972 icode = CODE_FOR_avx512f_scattersiv8df;
13973 goto scatter_gen;
13974 case IX86_BUILTIN_SCATTERDIV16SF:
13975 icode = CODE_FOR_avx512f_scatterdiv16sf;
13976 goto scatter_gen;
13977 case IX86_BUILTIN_SCATTERDIV8DF:
13978 icode = CODE_FOR_avx512f_scatterdiv8df;
13979 goto scatter_gen;
13980 case IX86_BUILTIN_SCATTERSIV16SI:
13981 icode = CODE_FOR_avx512f_scattersiv16si;
13982 goto scatter_gen;
13983 case IX86_BUILTIN_SCATTERSIV8DI:
13984 icode = CODE_FOR_avx512f_scattersiv8di;
13985 goto scatter_gen;
13986 case IX86_BUILTIN_SCATTERDIV16SI:
13987 icode = CODE_FOR_avx512f_scatterdiv16si;
13988 goto scatter_gen;
13989 case IX86_BUILTIN_SCATTERDIV8DI:
13990 icode = CODE_FOR_avx512f_scatterdiv8di;
13991 goto scatter_gen;
13992 case IX86_BUILTIN_SCATTERSIV8SF:
13993 icode = CODE_FOR_avx512vl_scattersiv8sf;
13994 goto scatter_gen;
13995 case IX86_BUILTIN_SCATTERSIV4SF:
13996 icode = CODE_FOR_avx512vl_scattersiv4sf;
13997 goto scatter_gen;
13998 case IX86_BUILTIN_SCATTERSIV4DF:
13999 icode = CODE_FOR_avx512vl_scattersiv4df;
14000 goto scatter_gen;
14001 case IX86_BUILTIN_SCATTERSIV2DF:
14002 icode = CODE_FOR_avx512vl_scattersiv2df;
14003 goto scatter_gen;
14004 case IX86_BUILTIN_SCATTERDIV8SF:
14005 icode = CODE_FOR_avx512vl_scatterdiv8sf;
14006 goto scatter_gen;
14007 case IX86_BUILTIN_SCATTERDIV4SF:
14008 icode = CODE_FOR_avx512vl_scatterdiv4sf;
14009 goto scatter_gen;
14010 case IX86_BUILTIN_SCATTERDIV4DF:
14011 icode = CODE_FOR_avx512vl_scatterdiv4df;
14012 goto scatter_gen;
14013 case IX86_BUILTIN_SCATTERDIV2DF:
14014 icode = CODE_FOR_avx512vl_scatterdiv2df;
14015 goto scatter_gen;
14016 case IX86_BUILTIN_SCATTERSIV8SI:
14017 icode = CODE_FOR_avx512vl_scattersiv8si;
14018 goto scatter_gen;
14019 case IX86_BUILTIN_SCATTERSIV4SI:
14020 icode = CODE_FOR_avx512vl_scattersiv4si;
14021 goto scatter_gen;
14022 case IX86_BUILTIN_SCATTERSIV4DI:
14023 icode = CODE_FOR_avx512vl_scattersiv4di;
14024 goto scatter_gen;
14025 case IX86_BUILTIN_SCATTERSIV2DI:
14026 icode = CODE_FOR_avx512vl_scattersiv2di;
14027 goto scatter_gen;
14028 case IX86_BUILTIN_SCATTERDIV8SI:
14029 icode = CODE_FOR_avx512vl_scatterdiv8si;
14030 goto scatter_gen;
14031 case IX86_BUILTIN_SCATTERDIV4SI:
14032 icode = CODE_FOR_avx512vl_scatterdiv4si;
14033 goto scatter_gen;
14034 case IX86_BUILTIN_SCATTERDIV4DI:
14035 icode = CODE_FOR_avx512vl_scatterdiv4di;
14036 goto scatter_gen;
14037 case IX86_BUILTIN_SCATTERDIV2DI:
14038 icode = CODE_FOR_avx512vl_scatterdiv2di;
14039 goto scatter_gen;
14040 case IX86_BUILTIN_GATHERPFDPD:
14041 icode = CODE_FOR_avx512pf_gatherpfv8sidf;
14042 goto vec_prefetch_gen;
14043 case IX86_BUILTIN_SCATTERALTSIV8DF:
14044 icode = CODE_FOR_avx512f_scattersiv8df;
14045 goto scatter_gen;
14046 case IX86_BUILTIN_SCATTERALTDIV16SF:
14047 icode = CODE_FOR_avx512f_scatterdiv16sf;
14048 goto scatter_gen;
14049 case IX86_BUILTIN_SCATTERALTSIV8DI:
14050 icode = CODE_FOR_avx512f_scattersiv8di;
14051 goto scatter_gen;
14052 case IX86_BUILTIN_SCATTERALTDIV16SI:
14053 icode = CODE_FOR_avx512f_scatterdiv16si;
14054 goto scatter_gen;
14055 case IX86_BUILTIN_SCATTERALTSIV4DF:
14056 icode = CODE_FOR_avx512vl_scattersiv4df;
14057 goto scatter_gen;
14058 case IX86_BUILTIN_SCATTERALTDIV8SF:
14059 icode = CODE_FOR_avx512vl_scatterdiv8sf;
14060 goto scatter_gen;
14061 case IX86_BUILTIN_SCATTERALTSIV4DI:
14062 icode = CODE_FOR_avx512vl_scattersiv4di;
14063 goto scatter_gen;
14064 case IX86_BUILTIN_SCATTERALTDIV8SI:
14065 icode = CODE_FOR_avx512vl_scatterdiv8si;
14066 goto scatter_gen;
14067 case IX86_BUILTIN_SCATTERALTSIV2DF:
14068 icode = CODE_FOR_avx512vl_scattersiv2df;
14069 goto scatter_gen;
14070 case IX86_BUILTIN_SCATTERALTDIV4SF:
14071 icode = CODE_FOR_avx512vl_scatterdiv4sf;
14072 goto scatter_gen;
14073 case IX86_BUILTIN_SCATTERALTSIV2DI:
14074 icode = CODE_FOR_avx512vl_scattersiv2di;
14075 goto scatter_gen;
14076 case IX86_BUILTIN_SCATTERALTDIV4SI:
14077 icode = CODE_FOR_avx512vl_scatterdiv4si;
14078 goto scatter_gen;
14079 case IX86_BUILTIN_GATHERPFDPS:
14080 icode = CODE_FOR_avx512pf_gatherpfv16sisf;
14081 goto vec_prefetch_gen;
14082 case IX86_BUILTIN_GATHERPFQPD:
14083 icode = CODE_FOR_avx512pf_gatherpfv8didf;
14084 goto vec_prefetch_gen;
14085 case IX86_BUILTIN_GATHERPFQPS:
14086 icode = CODE_FOR_avx512pf_gatherpfv8disf;
14087 goto vec_prefetch_gen;
14088 case IX86_BUILTIN_SCATTERPFDPD:
14089 icode = CODE_FOR_avx512pf_scatterpfv8sidf;
14090 goto vec_prefetch_gen;
14091 case IX86_BUILTIN_SCATTERPFDPS:
14092 icode = CODE_FOR_avx512pf_scatterpfv16sisf;
14093 goto vec_prefetch_gen;
14094 case IX86_BUILTIN_SCATTERPFQPD:
14095 icode = CODE_FOR_avx512pf_scatterpfv8didf;
14096 goto vec_prefetch_gen;
14097 case IX86_BUILTIN_SCATTERPFQPS:
14098 icode = CODE_FOR_avx512pf_scatterpfv8disf;
14099 goto vec_prefetch_gen;
14100
14101 gather_gen:
14102 rtx half;
14103 rtx (*gen) (rtx, rtx);
14104
14105 arg0 = CALL_EXPR_ARG (exp, 0);
14106 arg1 = CALL_EXPR_ARG (exp, 1);
14107 arg2 = CALL_EXPR_ARG (exp, 2);
14108 arg3 = CALL_EXPR_ARG (exp, 3);
14109 arg4 = CALL_EXPR_ARG (exp, 4);
14110 op0 = expand_normal (arg0);
14111 op1 = expand_normal (arg1);
14112 op2 = expand_normal (arg2);
14113 op3 = expand_normal (arg3);
14114 op4 = expand_normal (arg4);
14115 /* Note the arg order is different from the operand order. */
14116 mode0 = insn_data[icode].operand[1].mode;
14117 mode2 = insn_data[icode].operand[3].mode;
14118 mode3 = insn_data[icode].operand[4].mode;
14119 mode4 = insn_data[icode].operand[5].mode;
14120
14121 if (target == NULL_RTX
14122 || GET_MODE (target) != insn_data[icode].operand[0].mode
14123 || !insn_data[icode].operand[0].predicate (target,
14124 GET_MODE (target)))
14125 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
14126 else
14127 subtarget = target;
14128
14129 switch (fcode)
14130 {
14131 case IX86_BUILTIN_GATHER3ALTSIV8DF:
14132 case IX86_BUILTIN_GATHER3ALTSIV8DI:
14133 half = gen_reg_rtx (V8SImode);
14134 if (!nonimmediate_operand (op2, V16SImode))
14135 op2 = copy_to_mode_reg (V16SImode, op2);
14136 emit_insn (gen_vec_extract_lo_v16si (half, op2));
14137 op2 = half;
14138 break;
14139 case IX86_BUILTIN_GATHER3ALTSIV4DF:
14140 case IX86_BUILTIN_GATHER3ALTSIV4DI:
14141 case IX86_BUILTIN_GATHERALTSIV4DF:
14142 case IX86_BUILTIN_GATHERALTSIV4DI:
14143 half = gen_reg_rtx (V4SImode);
14144 if (!nonimmediate_operand (op2, V8SImode))
14145 op2 = copy_to_mode_reg (V8SImode, op2);
14146 emit_insn (gen_vec_extract_lo_v8si (half, op2));
14147 op2 = half;
14148 break;
14149 case IX86_BUILTIN_GATHER3ALTDIV16SF:
14150 case IX86_BUILTIN_GATHER3ALTDIV16SI:
14151 half = gen_reg_rtx (mode0);
14152 if (mode0 == V8SFmode)
14153 gen = gen_vec_extract_lo_v16sf;
14154 else
14155 gen = gen_vec_extract_lo_v16si;
14156 if (!nonimmediate_operand (op0, GET_MODE (op0)))
14157 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
14158 emit_insn (gen (half, op0));
14159 op0 = half;
14160 op3 = lowpart_subreg (QImode, op3, HImode);
14161 break;
14162 case IX86_BUILTIN_GATHER3ALTDIV8SF:
14163 case IX86_BUILTIN_GATHER3ALTDIV8SI:
14164 case IX86_BUILTIN_GATHERALTDIV8SF:
14165 case IX86_BUILTIN_GATHERALTDIV8SI:
14166 half = gen_reg_rtx (mode0);
14167 if (mode0 == V4SFmode)
14168 gen = gen_vec_extract_lo_v8sf;
14169 else
14170 gen = gen_vec_extract_lo_v8si;
14171 if (!nonimmediate_operand (op0, GET_MODE (op0)))
14172 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
14173 emit_insn (gen (half, op0));
14174 op0 = half;
14175 if (VECTOR_MODE_P (GET_MODE (op3)))
14176 {
14177 half = gen_reg_rtx (mode0);
14178 if (!nonimmediate_operand (op3, GET_MODE (op3)))
14179 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
14180 emit_insn (gen (half, op3));
14181 op3 = half;
14182 }
14183 break;
14184 default:
14185 break;
14186 }
14187
14188 /* Force memory operand only with base register here. But we
14189 don't want to do it on memory operand for other builtin
14190 functions. */
14191 op1 = ix86_zero_extend_to_Pmode (op1);
14192
14193 if (!insn_data[icode].operand[1].predicate (op0, mode0))
14194 op0 = copy_to_mode_reg (mode0, op0);
14195 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
14196 op1 = copy_to_mode_reg (Pmode, op1);
14197 if (!insn_data[icode].operand[3].predicate (op2, mode2))
14198 op2 = copy_to_mode_reg (mode2, op2);
14199
14200 op3 = fixup_modeless_constant (op3, mode3);
14201
14202 if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
14203 {
14204 if (!insn_data[icode].operand[4].predicate (op3, mode3))
14205 op3 = copy_to_mode_reg (mode3, op3);
14206 }
14207 else
14208 {
14209 op3 = copy_to_reg (op3);
14210 op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
14211 }
14212 if (!insn_data[icode].operand[5].predicate (op4, mode4))
14213 {
14214 error ("the last argument must be scale 1, 2, 4, 8");
14215 return const0_rtx;
14216 }
14217
14218 /* Optimize. If mask is known to have all high bits set,
14219 replace op0 with pc_rtx to signal that the instruction
14220 overwrites the whole destination and doesn't use its
14221 previous contents. */
14222 if (optimize)
14223 {
14224 if (TREE_CODE (arg3) == INTEGER_CST)
14225 {
14226 if (integer_all_onesp (arg3))
14227 op0 = pc_rtx;
14228 }
14229 else if (TREE_CODE (arg3) == VECTOR_CST)
14230 {
14231 unsigned int negative = 0;
14232 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
14233 {
14234 tree cst = VECTOR_CST_ELT (arg3, i);
14235 if (TREE_CODE (cst) == INTEGER_CST
14236 && tree_int_cst_sign_bit (cst))
14237 negative++;
14238 else if (TREE_CODE (cst) == REAL_CST
14239 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
14240 negative++;
14241 }
14242 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
14243 op0 = pc_rtx;
14244 }
14245 else if (TREE_CODE (arg3) == SSA_NAME
14246 && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
14247 {
14248 /* Recognize also when mask is like:
14249 __v2df src = _mm_setzero_pd ();
14250 __v2df mask = _mm_cmpeq_pd (src, src);
14251 or
14252 __v8sf src = _mm256_setzero_ps ();
14253 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
14254 as that is a cheaper way to load all ones into
14255 a register than having to load a constant from
14256 memory. */
14257 gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
14258 if (is_gimple_call (def_stmt))
14259 {
14260 tree fndecl = gimple_call_fndecl (def_stmt);
14261 if (fndecl
14262 && fndecl_built_in_p (fndecl, BUILT_IN_MD))
4d732405 14263 switch (DECL_MD_FUNCTION_CODE (fndecl))
2bf6d935
ML
14264 {
14265 case IX86_BUILTIN_CMPPD:
14266 case IX86_BUILTIN_CMPPS:
14267 case IX86_BUILTIN_CMPPD256:
14268 case IX86_BUILTIN_CMPPS256:
14269 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
14270 break;
14271 /* FALLTHRU */
14272 case IX86_BUILTIN_CMPEQPD:
14273 case IX86_BUILTIN_CMPEQPS:
14274 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
14275 && initializer_zerop (gimple_call_arg (def_stmt,
14276 1)))
14277 op0 = pc_rtx;
14278 break;
14279 default:
14280 break;
14281 }
14282 }
14283 }
14284 }
14285
14286 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
14287 if (! pat)
14288 return const0_rtx;
14289 emit_insn (pat);
14290
14291 switch (fcode)
14292 {
14293 case IX86_BUILTIN_GATHER3DIV16SF:
14294 if (target == NULL_RTX)
14295 target = gen_reg_rtx (V8SFmode);
14296 emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
14297 break;
14298 case IX86_BUILTIN_GATHER3DIV16SI:
14299 if (target == NULL_RTX)
14300 target = gen_reg_rtx (V8SImode);
14301 emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
14302 break;
14303 case IX86_BUILTIN_GATHER3DIV8SF:
14304 case IX86_BUILTIN_GATHERDIV8SF:
14305 if (target == NULL_RTX)
14306 target = gen_reg_rtx (V4SFmode);
14307 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
14308 break;
14309 case IX86_BUILTIN_GATHER3DIV8SI:
14310 case IX86_BUILTIN_GATHERDIV8SI:
14311 if (target == NULL_RTX)
14312 target = gen_reg_rtx (V4SImode);
14313 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
14314 break;
14315 default:
14316 target = subtarget;
14317 break;
14318 }
14319 return target;
14320
14321 scatter_gen:
14322 arg0 = CALL_EXPR_ARG (exp, 0);
14323 arg1 = CALL_EXPR_ARG (exp, 1);
14324 arg2 = CALL_EXPR_ARG (exp, 2);
14325 arg3 = CALL_EXPR_ARG (exp, 3);
14326 arg4 = CALL_EXPR_ARG (exp, 4);
14327 op0 = expand_normal (arg0);
14328 op1 = expand_normal (arg1);
14329 op2 = expand_normal (arg2);
14330 op3 = expand_normal (arg3);
14331 op4 = expand_normal (arg4);
14332 mode1 = insn_data[icode].operand[1].mode;
14333 mode2 = insn_data[icode].operand[2].mode;
14334 mode3 = insn_data[icode].operand[3].mode;
14335 mode4 = insn_data[icode].operand[4].mode;
14336
14337 /* Scatter instruction stores operand op3 to memory with
14338 indices from op2 and scale from op4 under writemask op1.
14339 If index operand op2 has more elements then source operand
14340 op3 one need to use only its low half. And vice versa. */
14341 switch (fcode)
14342 {
14343 case IX86_BUILTIN_SCATTERALTSIV8DF:
14344 case IX86_BUILTIN_SCATTERALTSIV8DI:
14345 half = gen_reg_rtx (V8SImode);
14346 if (!nonimmediate_operand (op2, V16SImode))
14347 op2 = copy_to_mode_reg (V16SImode, op2);
14348 emit_insn (gen_vec_extract_lo_v16si (half, op2));
14349 op2 = half;
14350 break;
14351 case IX86_BUILTIN_SCATTERALTDIV16SF:
14352 case IX86_BUILTIN_SCATTERALTDIV16SI:
14353 half = gen_reg_rtx (mode3);
14354 if (mode3 == V8SFmode)
14355 gen = gen_vec_extract_lo_v16sf;
14356 else
14357 gen = gen_vec_extract_lo_v16si;
14358 if (!nonimmediate_operand (op3, GET_MODE (op3)))
14359 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
14360 emit_insn (gen (half, op3));
14361 op3 = half;
14362 break;
14363 case IX86_BUILTIN_SCATTERALTSIV4DF:
14364 case IX86_BUILTIN_SCATTERALTSIV4DI:
14365 half = gen_reg_rtx (V4SImode);
14366 if (!nonimmediate_operand (op2, V8SImode))
14367 op2 = copy_to_mode_reg (V8SImode, op2);
14368 emit_insn (gen_vec_extract_lo_v8si (half, op2));
14369 op2 = half;
14370 break;
14371 case IX86_BUILTIN_SCATTERALTDIV8SF:
14372 case IX86_BUILTIN_SCATTERALTDIV8SI:
14373 half = gen_reg_rtx (mode3);
14374 if (mode3 == V4SFmode)
14375 gen = gen_vec_extract_lo_v8sf;
14376 else
14377 gen = gen_vec_extract_lo_v8si;
14378 if (!nonimmediate_operand (op3, GET_MODE (op3)))
14379 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
14380 emit_insn (gen (half, op3));
14381 op3 = half;
14382 break;
14383 case IX86_BUILTIN_SCATTERALTSIV2DF:
14384 case IX86_BUILTIN_SCATTERALTSIV2DI:
14385 if (!nonimmediate_operand (op2, V4SImode))
14386 op2 = copy_to_mode_reg (V4SImode, op2);
14387 break;
14388 case IX86_BUILTIN_SCATTERALTDIV4SF:
14389 case IX86_BUILTIN_SCATTERALTDIV4SI:
14390 if (!nonimmediate_operand (op3, GET_MODE (op3)))
14391 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
14392 break;
14393 default:
14394 break;
14395 }
14396
14397 /* Force memory operand only with base register here. But we
14398 don't want to do it on memory operand for other builtin
14399 functions. */
14400 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
14401
14402 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
14403 op0 = copy_to_mode_reg (Pmode, op0);
14404
14405 op1 = fixup_modeless_constant (op1, mode1);
14406
14407 if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
14408 {
14409 if (!insn_data[icode].operand[1].predicate (op1, mode1))
14410 op1 = copy_to_mode_reg (mode1, op1);
14411 }
14412 else
14413 {
14414 op1 = copy_to_reg (op1);
14415 op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
14416 }
14417
14418 if (!insn_data[icode].operand[2].predicate (op2, mode2))
14419 op2 = copy_to_mode_reg (mode2, op2);
14420
14421 if (!insn_data[icode].operand[3].predicate (op3, mode3))
14422 op3 = copy_to_mode_reg (mode3, op3);
14423
14424 if (!insn_data[icode].operand[4].predicate (op4, mode4))
14425 {
14426 error ("the last argument must be scale 1, 2, 4, 8");
14427 return const0_rtx;
14428 }
14429
14430 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
14431 if (! pat)
14432 return const0_rtx;
14433
14434 emit_insn (pat);
14435 return 0;
14436
14437 vec_prefetch_gen:
14438 arg0 = CALL_EXPR_ARG (exp, 0);
14439 arg1 = CALL_EXPR_ARG (exp, 1);
14440 arg2 = CALL_EXPR_ARG (exp, 2);
14441 arg3 = CALL_EXPR_ARG (exp, 3);
14442 arg4 = CALL_EXPR_ARG (exp, 4);
14443 op0 = expand_normal (arg0);
14444 op1 = expand_normal (arg1);
14445 op2 = expand_normal (arg2);
14446 op3 = expand_normal (arg3);
14447 op4 = expand_normal (arg4);
14448 mode0 = insn_data[icode].operand[0].mode;
14449 mode1 = insn_data[icode].operand[1].mode;
14450 mode3 = insn_data[icode].operand[3].mode;
14451 mode4 = insn_data[icode].operand[4].mode;
14452
14453 op0 = fixup_modeless_constant (op0, mode0);
14454
14455 if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
14456 {
14457 if (!insn_data[icode].operand[0].predicate (op0, mode0))
14458 op0 = copy_to_mode_reg (mode0, op0);
14459 }
14460 else
14461 {
14462 op0 = copy_to_reg (op0);
14463 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
14464 }
14465
14466 if (!insn_data[icode].operand[1].predicate (op1, mode1))
14467 op1 = copy_to_mode_reg (mode1, op1);
14468
14469 /* Force memory operand only with base register here. But we
14470 don't want to do it on memory operand for other builtin
14471 functions. */
14472 op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
14473
14474 if (!insn_data[icode].operand[2].predicate (op2, Pmode))
14475 op2 = copy_to_mode_reg (Pmode, op2);
14476
14477 if (!insn_data[icode].operand[3].predicate (op3, mode3))
14478 {
14479 error ("the forth argument must be scale 1, 2, 4, 8");
14480 return const0_rtx;
14481 }
14482
14483 if (!insn_data[icode].operand[4].predicate (op4, mode4))
14484 {
14485 error ("incorrect hint operand");
14486 return const0_rtx;
14487 }
14488
14489 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
14490 if (! pat)
14491 return const0_rtx;
14492
14493 emit_insn (pat);
14494
14495 return 0;
14496
14497 case IX86_BUILTIN_XABORT:
14498 icode = CODE_FOR_xabort;
14499 arg0 = CALL_EXPR_ARG (exp, 0);
14500 op0 = expand_normal (arg0);
14501 mode0 = insn_data[icode].operand[0].mode;
14502 if (!insn_data[icode].operand[0].predicate (op0, mode0))
14503 {
14504 error ("the argument to %<xabort%> intrinsic must "
14505 "be an 8-bit immediate");
14506 return const0_rtx;
14507 }
14508 emit_insn (gen_xabort (op0));
14509 return 0;
14510
b5034abb
UB
14511 case IX86_BUILTIN_RDSSPD:
14512 case IX86_BUILTIN_RDSSPQ:
14513 mode = (fcode == IX86_BUILTIN_RDSSPD ? SImode : DImode);
14514
14515 if (target == 0
14516 || !register_operand (target, mode))
14517 target = gen_reg_rtx (mode);
14518
14519 op0 = force_reg (mode, const0_rtx);
14520
14521 emit_insn (gen_rdssp (mode, target, op0));
14522 return target;
14523
14524 case IX86_BUILTIN_INCSSPD:
14525 case IX86_BUILTIN_INCSSPQ:
14526 mode = (fcode == IX86_BUILTIN_INCSSPD ? SImode : DImode);
14527
14528 arg0 = CALL_EXPR_ARG (exp, 0);
14529 op0 = expand_normal (arg0);
14530
14531 op0 = force_reg (mode, op0);
14532
14533 emit_insn (gen_incssp (mode, op0));
14534 return 0;
14535
83927c63
HW
14536 case IX86_BUILTIN_HRESET:
14537 icode = CODE_FOR_hreset;
14538 arg0 = CALL_EXPR_ARG (exp, 0);
14539 op0 = expand_normal (arg0);
14540 op0 = force_reg (SImode, op0);
14541 emit_insn (gen_hreset (op0));
14542 return 0;
14543
2bf6d935
ML
14544 case IX86_BUILTIN_RSTORSSP:
14545 case IX86_BUILTIN_CLRSSBSY:
14546 arg0 = CALL_EXPR_ARG (exp, 0);
14547 op0 = expand_normal (arg0);
14548 icode = (fcode == IX86_BUILTIN_RSTORSSP
b5034abb
UB
14549 ? CODE_FOR_rstorssp
14550 : CODE_FOR_clrssbsy);
14551
2bf6d935
ML
14552 if (!address_operand (op0, VOIDmode))
14553 {
b5034abb
UB
14554 op0 = convert_memory_address (Pmode, op0);
14555 op0 = copy_addr_to_reg (op0);
2bf6d935 14556 }
b5034abb 14557 emit_insn (GEN_FCN (icode) (gen_rtx_MEM (DImode, op0)));
2bf6d935
ML
14558 return 0;
14559
14560 case IX86_BUILTIN_WRSSD:
14561 case IX86_BUILTIN_WRSSQ:
14562 case IX86_BUILTIN_WRUSSD:
14563 case IX86_BUILTIN_WRUSSQ:
b5034abb
UB
14564 mode = ((fcode == IX86_BUILTIN_WRSSD
14565 || fcode == IX86_BUILTIN_WRUSSD)
14566 ? SImode : DImode);
14567
2bf6d935
ML
14568 arg0 = CALL_EXPR_ARG (exp, 0);
14569 op0 = expand_normal (arg0);
14570 arg1 = CALL_EXPR_ARG (exp, 1);
14571 op1 = expand_normal (arg1);
b5034abb 14572
2bf6d935 14573 op0 = force_reg (mode, op0);
b5034abb 14574
2bf6d935
ML
14575 if (!address_operand (op1, VOIDmode))
14576 {
b5034abb
UB
14577 op1 = convert_memory_address (Pmode, op1);
14578 op1 = copy_addr_to_reg (op1);
2bf6d935 14579 }
b5034abb
UB
14580 op1 = gen_rtx_MEM (mode, op1);
14581
44320665
UB
14582 icode = ((fcode == IX86_BUILTIN_WRSSD
14583 || fcode == IX86_BUILTIN_WRSSQ)
14584 ? code_for_wrss (mode)
14585 : code_for_wruss (mode));
14586 emit_insn (GEN_FCN (icode) (op0, op1));
14587
2bf6d935
ML
14588 return 0;
14589
14590 default:
14591 break;
14592 }
14593
14594 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
14595 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
14596 {
14597 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
14598 return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
14599 target);
14600 }
14601
fd5d5794
UB
14602 if (fcode >= IX86_BUILTIN__BDESC_PURE_ARGS_FIRST
14603 && fcode <= IX86_BUILTIN__BDESC_PURE_ARGS_LAST)
14604 {
14605 i = fcode - IX86_BUILTIN__BDESC_PURE_ARGS_FIRST;
14606 return ix86_expand_special_args_builtin (bdesc_pure_args + i, exp,
14607 target);
14608 }
14609
2bf6d935
ML
14610 if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
14611 && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
14612 {
14613 i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
14614 rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
14615 rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
14616 rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
14617 int masked = 1;
14618 machine_mode mode, wide_mode, nar_mode;
14619
14620 nar_mode = V4SFmode;
14621 mode = V16SFmode;
14622 wide_mode = V64SFmode;
14623 fcn_mask = gen_avx5124fmaddps_4fmaddps_mask;
14624 fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
14625
14626 switch (fcode)
14627 {
14628 case IX86_BUILTIN_4FMAPS:
14629 fcn = gen_avx5124fmaddps_4fmaddps;
14630 masked = 0;
14631 goto v4fma_expand;
14632
14633 case IX86_BUILTIN_4DPWSSD:
14634 nar_mode = V4SImode;
14635 mode = V16SImode;
14636 wide_mode = V64SImode;
14637 fcn = gen_avx5124vnniw_vp4dpwssd;
14638 masked = 0;
14639 goto v4fma_expand;
14640
14641 case IX86_BUILTIN_4DPWSSDS:
14642 nar_mode = V4SImode;
14643 mode = V16SImode;
14644 wide_mode = V64SImode;
14645 fcn = gen_avx5124vnniw_vp4dpwssds;
14646 masked = 0;
14647 goto v4fma_expand;
14648
14649 case IX86_BUILTIN_4FNMAPS:
14650 fcn = gen_avx5124fmaddps_4fnmaddps;
14651 masked = 0;
14652 goto v4fma_expand;
14653
14654 case IX86_BUILTIN_4FNMAPS_MASK:
14655 fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask;
14656 fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
14657 goto v4fma_expand;
14658
14659 case IX86_BUILTIN_4DPWSSD_MASK:
14660 nar_mode = V4SImode;
14661 mode = V16SImode;
14662 wide_mode = V64SImode;
14663 fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask;
14664 fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
14665 goto v4fma_expand;
14666
14667 case IX86_BUILTIN_4DPWSSDS_MASK:
14668 nar_mode = V4SImode;
14669 mode = V16SImode;
14670 wide_mode = V64SImode;
14671 fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask;
14672 fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
14673 goto v4fma_expand;
14674
14675 case IX86_BUILTIN_4FMAPS_MASK:
14676 {
14677 tree args[4];
14678 rtx ops[4];
14679 rtx wide_reg;
14680 rtx accum;
14681 rtx addr;
14682 rtx mem;
14683
14684v4fma_expand:
14685 wide_reg = gen_reg_rtx (wide_mode);
14686 for (i = 0; i < 4; i++)
14687 {
14688 args[i] = CALL_EXPR_ARG (exp, i);
14689 ops[i] = expand_normal (args[i]);
14690
14691 emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
14692 ops[i]);
14693 }
14694
14695 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
14696 accum = force_reg (mode, accum);
14697
14698 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
14699 addr = force_reg (Pmode, addr);
14700
14701 mem = gen_rtx_MEM (nar_mode, addr);
14702
14703 target = gen_reg_rtx (mode);
14704
14705 emit_move_insn (target, accum);
14706
14707 if (! masked)
14708 emit_insn (fcn (target, accum, wide_reg, mem));
14709 else
14710 {
14711 rtx merge, mask;
14712 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
14713
14714 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
14715
14716 if (CONST_INT_P (mask))
14717 mask = fixup_modeless_constant (mask, HImode);
14718
14719 mask = force_reg (HImode, mask);
14720
14721 if (GET_MODE (mask) != HImode)
14722 mask = gen_rtx_SUBREG (HImode, mask, 0);
14723
14724 /* If merge is 0 then we're about to emit z-masked variant. */
14725 if (const0_operand (merge, mode))
14726 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
14727 /* If merge is the same as accum then emit merge-masked variant. */
14728 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
14729 {
14730 merge = force_reg (mode, merge);
14731 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
14732 }
14733 /* Merge with something unknown might happen if we z-mask w/ -O0. */
14734 else
14735 {
14736 target = gen_reg_rtx (mode);
14737 emit_move_insn (target, merge);
14738 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
14739 }
14740 }
14741 return target;
14742 }
14743
14744 case IX86_BUILTIN_4FNMASS:
14745 fcn = gen_avx5124fmaddps_4fnmaddss;
14746 masked = 0;
14747 goto s4fma_expand;
14748
14749 case IX86_BUILTIN_4FMASS:
14750 fcn = gen_avx5124fmaddps_4fmaddss;
14751 masked = 0;
14752 goto s4fma_expand;
14753
14754 case IX86_BUILTIN_4FNMASS_MASK:
14755 fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
14756 fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
14757 goto s4fma_expand;
14758
14759 case IX86_BUILTIN_4FMASS_MASK:
14760 {
14761 tree args[4];
14762 rtx ops[4];
14763 rtx wide_reg;
14764 rtx accum;
14765 rtx addr;
14766 rtx mem;
14767
14768 fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
14769 fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
14770
14771s4fma_expand:
14772 mode = V4SFmode;
14773 wide_reg = gen_reg_rtx (V64SFmode);
14774 for (i = 0; i < 4; i++)
14775 {
14776 rtx tmp;
14777 args[i] = CALL_EXPR_ARG (exp, i);
14778 ops[i] = expand_normal (args[i]);
14779
14780 tmp = gen_reg_rtx (SFmode);
14781 emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
14782
14783 emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
14784 gen_rtx_SUBREG (V16SFmode, tmp, 0));
14785 }
14786
14787 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
14788 accum = force_reg (V4SFmode, accum);
14789
14790 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
14791 addr = force_reg (Pmode, addr);
14792
14793 mem = gen_rtx_MEM (V4SFmode, addr);
14794
14795 target = gen_reg_rtx (V4SFmode);
14796
14797 emit_move_insn (target, accum);
14798
14799 if (! masked)
14800 emit_insn (fcn (target, accum, wide_reg, mem));
14801 else
14802 {
14803 rtx merge, mask;
14804 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
14805
14806 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
14807
14808 if (CONST_INT_P (mask))
14809 mask = fixup_modeless_constant (mask, QImode);
14810
14811 mask = force_reg (QImode, mask);
14812
14813 if (GET_MODE (mask) != QImode)
14814 mask = gen_rtx_SUBREG (QImode, mask, 0);
14815
14816 /* If merge is 0 then we're about to emit z-masked variant. */
14817 if (const0_operand (merge, mode))
14818 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
14819 /* If merge is the same as accum then emit merge-masked
14820 variant. */
14821 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
14822 {
14823 merge = force_reg (mode, merge);
14824 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
14825 }
14826 /* Merge with something unknown might happen if we z-mask
14827 w/ -O0. */
14828 else
14829 {
14830 target = gen_reg_rtx (mode);
14831 emit_move_insn (target, merge);
14832 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
14833 }
14834 }
14835 return target;
14836 }
14837 case IX86_BUILTIN_RDPID:
14838 return ix86_expand_special_args_builtin (bdesc_args + i, exp,
14839 target);
14840 case IX86_BUILTIN_FABSQ:
14841 case IX86_BUILTIN_COPYSIGNQ:
14842 if (!TARGET_SSE)
14843 /* Emit a normal call if SSE isn't available. */
14844 return expand_call (exp, target, ignore);
14845 /* FALLTHRU */
14846 default:
14847 return ix86_expand_args_builtin (bdesc_args + i, exp, target);
14848 }
14849 }
14850
14851 if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
14852 && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
14853 {
14854 i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
14855 return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
14856 }
14857
14858 if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
14859 && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
14860 {
14861 i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
14862 return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
14863 }
14864
14865 if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
14866 && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
14867 {
14868 i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
14869 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
14870 }
14871
14872 if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
14873 && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
14874 {
14875 i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
14876 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
14877 }
14878
14879 if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
14880 && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
14881 {
14882 i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
14883 const struct builtin_description *d = bdesc_multi_arg + i;
14884 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
14885 (enum ix86_builtin_func_type)
14886 d->flag, d->comparison);
14887 }
14888
14889 if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
14890 && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
14891 {
14892 i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
14893 return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
14894 target);
14895 }
14896
2bf6d935
ML
14897 gcc_unreachable ();
14898}
14899
14900/* A subroutine of ix86_expand_vector_init_duplicate. Tries to
14901 fill target with val via vec_duplicate. */
14902
14903static bool
14904ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
14905{
14906 bool ok;
14907 rtx_insn *insn;
14908 rtx dup;
14909
14910 /* First attempt to recognize VAL as-is. */
14911 dup = gen_vec_duplicate (mode, val);
14912 insn = emit_insn (gen_rtx_SET (target, dup));
14913 if (recog_memoized (insn) < 0)
14914 {
14915 rtx_insn *seq;
14916 machine_mode innermode = GET_MODE_INNER (mode);
14917 rtx reg;
14918
14919 /* If that fails, force VAL into a register. */
14920
14921 start_sequence ();
14922 reg = force_reg (innermode, val);
14923 if (GET_MODE (reg) != innermode)
14924 reg = gen_lowpart (innermode, reg);
14925 SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
14926 seq = get_insns ();
14927 end_sequence ();
14928 if (seq)
14929 emit_insn_before (seq, insn);
14930
14931 ok = recog_memoized (insn) >= 0;
14932 gcc_assert (ok);
14933 }
14934 return true;
14935}
14936
14937/* Get a vector mode of the same size as the original but with elements
14938 twice as wide. This is only guaranteed to apply to integral vectors. */
14939
14940static machine_mode
14941get_mode_wider_vector (machine_mode o)
14942{
e53b6e56 14943 /* ??? Rely on the ordering that genmodes.cc gives to vectors. */
2bf6d935
ML
14944 machine_mode n = GET_MODE_WIDER_MODE (o).require ();
14945 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
14946 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
14947 return n;
14948}
14949
14950static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
14951static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
14952
14953/* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
14954 with all elements equal to VAR. Return true if successful. */
14955
51c30227 14956bool
2bf6d935
ML
14957ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
14958 rtx target, rtx val)
14959{
14960 bool ok;
14961
14962 switch (mode)
14963 {
14964 case E_V2SImode:
14965 case E_V2SFmode:
14966 if (!mmx_ok)
14967 return false;
14968 /* FALLTHRU */
14969
14970 case E_V4DFmode:
14971 case E_V4DImode:
14972 case E_V8SFmode:
14973 case E_V8SImode:
14974 case E_V2DFmode:
14975 case E_V2DImode:
14976 case E_V4SFmode:
14977 case E_V4SImode:
14978 case E_V16SImode:
14979 case E_V8DImode:
14980 case E_V16SFmode:
14981 case E_V8DFmode:
14982 return ix86_vector_duplicate_value (mode, target, val);
14983
14984 case E_V4HImode:
14985 if (!mmx_ok)
14986 return false;
14987 if (TARGET_SSE || TARGET_3DNOW_A)
14988 {
14989 rtx x;
14990
14991 val = gen_lowpart (SImode, val);
14992 x = gen_rtx_TRUNCATE (HImode, val);
14993 x = gen_rtx_VEC_DUPLICATE (mode, x);
14994 emit_insn (gen_rtx_SET (target, x));
14995 return true;
14996 }
14997 goto widen;
14998
8d7dae0e
UB
14999 case E_V2HImode:
15000 if (TARGET_SSE2)
15001 {
15002 rtx x;
15003
15004 val = gen_lowpart (SImode, val);
15005 x = gen_rtx_TRUNCATE (HImode, val);
15006 x = gen_rtx_VEC_DUPLICATE (mode, x);
15007 emit_insn (gen_rtx_SET (target, x));
15008 return true;
15009 }
15010 return false;
15011
2bf6d935 15012 case E_V8QImode:
64735dc9 15013 case E_V4QImode:
2bf6d935
ML
15014 if (!mmx_ok)
15015 return false;
15016 goto widen;
15017
15018 case E_V8HImode:
7a54d3de 15019 case E_V8HFmode:
6910cad5 15020 case E_V8BFmode:
2bf6d935
ML
15021 if (TARGET_AVX2)
15022 return ix86_vector_duplicate_value (mode, target, val);
15023
15024 if (TARGET_SSE2)
15025 {
15026 struct expand_vec_perm_d dperm;
15027 rtx tmp1, tmp2;
15028
15029 permute:
15030 memset (&dperm, 0, sizeof (dperm));
15031 dperm.target = target;
15032 dperm.vmode = mode;
15033 dperm.nelt = GET_MODE_NUNITS (mode);
15034 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
15035 dperm.one_operand_p = true;
15036
7a54d3de 15037 if (mode == V8HFmode)
e2385690
HW
15038 {
15039 tmp1 = force_reg (HFmode, val);
15040 tmp2 = gen_reg_rtx (mode);
15041 emit_insn (gen_vec_setv8hf_0 (tmp2, CONST0_RTX (mode), tmp1));
15042 tmp1 = gen_lowpart (mode, tmp2);
15043 }
7a54d3de
UB
15044 else
15045 {
15046 /* Extend to SImode using a paradoxical SUBREG. */
15047 tmp1 = gen_reg_rtx (SImode);
15048 emit_move_insn (tmp1, gen_lowpart (SImode, val));
15049
15050 /* Insert the SImode value as
15051 low element of a V4SImode vector. */
15052 tmp2 = gen_reg_rtx (V4SImode);
15053 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
15054 tmp1 = gen_lowpart (mode, tmp2);
15055 }
2bf6d935 15056
7a54d3de 15057 emit_move_insn (dperm.op0, tmp1);
2bf6d935
ML
15058 ok = (expand_vec_perm_1 (&dperm)
15059 || expand_vec_perm_broadcast_1 (&dperm));
15060 gcc_assert (ok);
15061 return ok;
15062 }
15063 goto widen;
15064
15065 case E_V16QImode:
15066 if (TARGET_AVX2)
15067 return ix86_vector_duplicate_value (mode, target, val);
15068
15069 if (TARGET_SSE2)
15070 goto permute;
15071 goto widen;
15072
15073 widen:
15074 /* Replicate the value once into the next wider mode and recurse. */
15075 {
15076 machine_mode smode, wsmode, wvmode;
15077 rtx x;
15078
15079 smode = GET_MODE_INNER (mode);
15080 wvmode = get_mode_wider_vector (mode);
15081 wsmode = GET_MODE_INNER (wvmode);
15082
15083 val = convert_modes (wsmode, smode, val, true);
20a2c8ac
UB
15084
15085 if (smode == QImode && !TARGET_PARTIAL_REG_STALL)
15086 emit_insn (gen_insv_1 (wsmode, val, val));
15087 else
15088 {
15089 x = expand_simple_binop (wsmode, ASHIFT, val,
15090 GEN_INT (GET_MODE_BITSIZE (smode)),
15091 NULL_RTX, 1, OPTAB_LIB_WIDEN);
15092 val = expand_simple_binop (wsmode, IOR, val, x, x, 1,
15093 OPTAB_LIB_WIDEN);
15094 }
2bf6d935
ML
15095
15096 x = gen_reg_rtx (wvmode);
15097 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
15098 gcc_assert (ok);
15099 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
15100 return ok;
15101 }
15102
15103 case E_V16HImode:
7a54d3de 15104 case E_V16HFmode:
6910cad5 15105 case E_V16BFmode:
2bf6d935
ML
15106 case E_V32QImode:
15107 if (TARGET_AVX2)
15108 return ix86_vector_duplicate_value (mode, target, val);
15109 else
15110 {
7a54d3de
UB
15111 machine_mode hvmode = (mode == V16HImode ? V8HImode
15112 : mode == V16HFmode ? V8HFmode
15113 : V16QImode);
2bf6d935
ML
15114 rtx x = gen_reg_rtx (hvmode);
15115
15116 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
15117 gcc_assert (ok);
15118
15119 x = gen_rtx_VEC_CONCAT (mode, x, x);
15120 emit_insn (gen_rtx_SET (target, x));
15121 }
15122 return true;
15123
2bf6d935 15124 case E_V32HImode:
7a54d3de 15125 case E_V32HFmode:
6910cad5 15126 case E_V32BFmode:
7a54d3de 15127 case E_V64QImode:
2bf6d935
ML
15128 if (TARGET_AVX512BW)
15129 return ix86_vector_duplicate_value (mode, target, val);
15130 else
15131 {
7a54d3de
UB
15132 machine_mode hvmode = (mode == V32HImode ? V16HImode
15133 : mode == V32HFmode ? V16HFmode
6910cad5 15134 : mode == V32BFmode ? V16BFmode
7a54d3de 15135 : V32QImode);
2bf6d935
ML
15136 rtx x = gen_reg_rtx (hvmode);
15137
15138 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
15139 gcc_assert (ok);
15140
15141 x = gen_rtx_VEC_CONCAT (mode, x, x);
15142 emit_insn (gen_rtx_SET (target, x));
15143 }
15144 return true;
15145
15146 default:
15147 return false;
15148 }
15149}
15150
15151/* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
15152 whose ONE_VAR element is VAR, and other elements are zero. Return true
15153 if successful. */
15154
15155static bool
15156ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
15157 rtx target, rtx var, int one_var)
15158{
15159 machine_mode vsimode;
15160 rtx new_target;
15161 rtx x, tmp;
15162 bool use_vector_set = false;
15163 rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;
15164
15165 switch (mode)
15166 {
15167 case E_V2DImode:
15168 /* For SSE4.1, we normally use vector set. But if the second
15169 element is zero and inter-unit moves are OK, we use movq
15170 instead. */
15171 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
15172 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
15173 && one_var == 0));
15174 break;
15175 case E_V16QImode:
15176 case E_V4SImode:
15177 case E_V4SFmode:
15178 use_vector_set = TARGET_SSE4_1;
15179 break;
15180 case E_V8HImode:
15181 use_vector_set = TARGET_SSE2;
c4d423c7 15182 gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0
15183 ? gen_vec_setv8hi_0 : NULL;
2bf6d935 15184 break;
8a0eb0cd
UB
15185 case E_V8QImode:
15186 use_vector_set = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
15187 break;
2bf6d935
ML
15188 case E_V4HImode:
15189 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
15190 break;
64735dc9
UB
15191 case E_V4QImode:
15192 use_vector_set = TARGET_SSE4_1;
15193 break;
2bf6d935 15194 case E_V32QImode:
c4d423c7 15195 use_vector_set = TARGET_AVX;
15196 break;
2bf6d935
ML
15197 case E_V16HImode:
15198 use_vector_set = TARGET_AVX;
c4d423c7 15199 gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0
15200 ? gen_vec_setv16hi_0 : NULL;
2bf6d935
ML
15201 break;
15202 case E_V8SImode:
15203 use_vector_set = TARGET_AVX;
15204 gen_vec_set_0 = gen_vec_setv8si_0;
15205 break;
15206 case E_V8SFmode:
15207 use_vector_set = TARGET_AVX;
15208 gen_vec_set_0 = gen_vec_setv8sf_0;
15209 break;
15210 case E_V4DFmode:
15211 use_vector_set = TARGET_AVX;
15212 gen_vec_set_0 = gen_vec_setv4df_0;
15213 break;
15214 case E_V4DImode:
15215 /* Use ix86_expand_vector_set in 64bit mode only. */
15216 use_vector_set = TARGET_AVX && TARGET_64BIT;
15217 gen_vec_set_0 = gen_vec_setv4di_0;
15218 break;
15219 case E_V16SImode:
15220 use_vector_set = TARGET_AVX512F && one_var == 0;
15221 gen_vec_set_0 = gen_vec_setv16si_0;
15222 break;
15223 case E_V16SFmode:
15224 use_vector_set = TARGET_AVX512F && one_var == 0;
15225 gen_vec_set_0 = gen_vec_setv16sf_0;
15226 break;
15227 case E_V8DFmode:
15228 use_vector_set = TARGET_AVX512F && one_var == 0;
15229 gen_vec_set_0 = gen_vec_setv8df_0;
15230 break;
15231 case E_V8DImode:
15232 /* Use ix86_expand_vector_set in 64bit mode only. */
15233 use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
15234 gen_vec_set_0 = gen_vec_setv8di_0;
15235 break;
9e2a82e1 15236 case E_V8HFmode:
15237 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15238 gen_vec_set_0 = gen_vec_setv8hf_0;
15239 break;
15240 case E_V16HFmode:
15241 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15242 gen_vec_set_0 = gen_vec_setv16hf_0;
15243 break;
15244 case E_V32HFmode:
15245 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15246 gen_vec_set_0 = gen_vec_setv32hf_0;
15247 break;
6910cad5 15248 case E_V8BFmode:
15249 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15250 gen_vec_set_0 = gen_vec_setv8bf_0;
15251 break;
15252 case E_V16BFmode:
15253 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15254 gen_vec_set_0 = gen_vec_setv16bf_0;
15255 break;
15256 case E_V32BFmode:
15257 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15258 gen_vec_set_0 = gen_vec_setv32bf_0;
15259 break;
c4d423c7 15260 case E_V32HImode:
15261 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15262 gen_vec_set_0 = gen_vec_setv32hi_0;
2bf6d935
ML
15263 default:
15264 break;
15265 }
15266
15267 if (use_vector_set)
15268 {
15269 if (gen_vec_set_0 && one_var == 0)
15270 {
15271 var = force_reg (GET_MODE_INNER (mode), var);
15272 emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
15273 return true;
15274 }
15275 emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
15276 var = force_reg (GET_MODE_INNER (mode), var);
15277 ix86_expand_vector_set (mmx_ok, target, var, one_var);
15278 return true;
15279 }
15280
15281 switch (mode)
15282 {
15283 case E_V2SFmode:
15284 case E_V2SImode:
15285 if (!mmx_ok)
15286 return false;
15287 /* FALLTHRU */
15288
15289 case E_V2DFmode:
15290 case E_V2DImode:
15291 if (one_var != 0)
15292 return false;
15293 var = force_reg (GET_MODE_INNER (mode), var);
15294 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
15295 emit_insn (gen_rtx_SET (target, x));
15296 return true;
15297
15298 case E_V4SFmode:
15299 case E_V4SImode:
15300 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
15301 new_target = gen_reg_rtx (mode);
15302 else
15303 new_target = target;
15304 var = force_reg (GET_MODE_INNER (mode), var);
15305 x = gen_rtx_VEC_DUPLICATE (mode, var);
15306 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
15307 emit_insn (gen_rtx_SET (new_target, x));
15308 if (one_var != 0)
15309 {
15310 /* We need to shuffle the value to the correct position, so
15311 create a new pseudo to store the intermediate result. */
15312
15313 /* With SSE2, we can use the integer shuffle insns. */
15314 if (mode != V4SFmode && TARGET_SSE2)
15315 {
15316 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
15317 const1_rtx,
15318 GEN_INT (one_var == 1 ? 0 : 1),
15319 GEN_INT (one_var == 2 ? 0 : 1),
15320 GEN_INT (one_var == 3 ? 0 : 1)));
15321 if (target != new_target)
15322 emit_move_insn (target, new_target);
15323 return true;
15324 }
15325
15326 /* Otherwise convert the intermediate result to V4SFmode and
15327 use the SSE1 shuffle instructions. */
15328 if (mode != V4SFmode)
15329 {
15330 tmp = gen_reg_rtx (V4SFmode);
15331 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
15332 }
15333 else
15334 tmp = new_target;
15335
15336 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
15337 const1_rtx,
15338 GEN_INT (one_var == 1 ? 0 : 1),
15339 GEN_INT (one_var == 2 ? 0+4 : 1+4),
15340 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
15341
15342 if (mode != V4SFmode)
15343 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
15344 else if (tmp != target)
15345 emit_move_insn (target, tmp);
15346 }
15347 else if (target != new_target)
15348 emit_move_insn (target, new_target);
15349 return true;
15350
15351 case E_V8HImode:
15352 case E_V16QImode:
15353 vsimode = V4SImode;
15354 goto widen;
15355 case E_V4HImode:
15356 case E_V8QImode:
15357 if (!mmx_ok)
15358 return false;
15359 vsimode = V2SImode;
15360 goto widen;
15361 widen:
15362 if (one_var != 0)
15363 return false;
15364
15365 /* Zero extend the variable element to SImode and recurse. */
15366 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
15367
15368 x = gen_reg_rtx (vsimode);
15369 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
15370 var, one_var))
15371 gcc_unreachable ();
15372
15373 emit_move_insn (target, gen_lowpart (mode, x));
15374 return true;
15375
15376 default:
15377 return false;
15378 }
15379}
15380
15381/* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
15382 consisting of the values in VALS. It is known that all elements
15383 except ONE_VAR are constants. Return true if successful. */
15384
15385static bool
15386ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
15387 rtx target, rtx vals, int one_var)
15388{
15389 rtx var = XVECEXP (vals, 0, one_var);
15390 machine_mode wmode;
15391 rtx const_vec, x;
15392
15393 const_vec = copy_rtx (vals);
15394 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
15395 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
15396
15397 switch (mode)
15398 {
15399 case E_V2DFmode:
15400 case E_V2DImode:
15401 case E_V2SFmode:
15402 case E_V2SImode:
15403 /* For the two element vectors, it's just as easy to use
15404 the general case. */
15405 return false;
15406
15407 case E_V4DImode:
15408 /* Use ix86_expand_vector_set in 64bit mode only. */
15409 if (!TARGET_64BIT)
15410 return false;
15411 /* FALLTHRU */
9e2a82e1 15412 case E_V8HFmode:
15413 case E_V16HFmode:
6910cad5 15414 case E_V8BFmode:
15415 case E_V16BFmode:
2bf6d935
ML
15416 case E_V4DFmode:
15417 case E_V8SFmode:
15418 case E_V8SImode:
15419 case E_V16HImode:
15420 case E_V32QImode:
15421 case E_V4SFmode:
15422 case E_V4SImode:
15423 case E_V8HImode:
15424 case E_V4HImode:
15425 break;
15426
15427 case E_V16QImode:
15428 if (TARGET_SSE4_1)
15429 break;
15430 wmode = V8HImode;
15431 goto widen;
15432 case E_V8QImode:
8a0eb0cd
UB
15433 if (TARGET_MMX_WITH_SSE && TARGET_SSE4_1)
15434 break;
2bf6d935
ML
15435 wmode = V4HImode;
15436 goto widen;
64735dc9
UB
15437 case E_V4QImode:
15438 if (TARGET_SSE4_1)
15439 break;
15440 wmode = V2HImode;
2bf6d935
ML
15441 widen:
15442 /* There's no way to set one QImode entry easily. Combine
15443 the variable value with its adjacent constant value, and
15444 promote to an HImode set. */
15445 x = XVECEXP (vals, 0, one_var ^ 1);
15446 if (one_var & 1)
15447 {
15448 var = convert_modes (HImode, QImode, var, true);
15449 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
15450 NULL_RTX, 1, OPTAB_LIB_WIDEN);
15451 x = GEN_INT (INTVAL (x) & 0xff);
15452 }
15453 else
15454 {
15455 var = convert_modes (HImode, QImode, var, true);
15456 x = gen_int_mode (UINTVAL (x) << 8, HImode);
15457 }
15458 if (x != const0_rtx)
15459 var = expand_simple_binop (HImode, IOR, var, x, var,
15460 1, OPTAB_LIB_WIDEN);
15461
15462 x = gen_reg_rtx (wmode);
15463 emit_move_insn (x, gen_lowpart (wmode, const_vec));
15464 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
15465
15466 emit_move_insn (target, gen_lowpart (mode, x));
15467 return true;
15468
15469 default:
15470 return false;
15471 }
15472
15473 emit_move_insn (target, const_vec);
15474 ix86_expand_vector_set (mmx_ok, target, var, one_var);
15475 return true;
15476}
15477
15478/* A subroutine of ix86_expand_vector_init_general. Use vector
15479 concatenate to handle the most general case: all values variable,
15480 and none identical. */
15481
15482static void
15483ix86_expand_vector_init_concat (machine_mode mode,
15484 rtx target, rtx *ops, int n)
15485{
1aeecaf5
HL
15486 machine_mode half_mode = VOIDmode;
15487 rtx half[2];
2bf6d935
ML
15488 rtvec v;
15489 int i, j;
15490
15491 switch (n)
15492 {
15493 case 2:
15494 switch (mode)
15495 {
9e2a82e1 15496 case E_V32HFmode:
15497 half_mode = V16HFmode;
15498 break;
6910cad5 15499 case E_V32BFmode:
15500 half_mode = V16BFmode;
15501 break;
2bf6d935 15502 case E_V16SImode:
1aeecaf5 15503 half_mode = V8SImode;
2bf6d935
ML
15504 break;
15505 case E_V16SFmode:
1aeecaf5 15506 half_mode = V8SFmode;
2bf6d935
ML
15507 break;
15508 case E_V8DImode:
1aeecaf5 15509 half_mode = V4DImode;
2bf6d935
ML
15510 break;
15511 case E_V8DFmode:
1aeecaf5 15512 half_mode = V4DFmode;
2bf6d935 15513 break;
9e2a82e1 15514 case E_V16HFmode:
15515 half_mode = V8HFmode;
15516 break;
6910cad5 15517 case E_V16BFmode:
15518 half_mode = V8BFmode;
15519 break;
2bf6d935 15520 case E_V8SImode:
1aeecaf5 15521 half_mode = V4SImode;
2bf6d935
ML
15522 break;
15523 case E_V8SFmode:
1aeecaf5 15524 half_mode = V4SFmode;
2bf6d935
ML
15525 break;
15526 case E_V4DImode:
1aeecaf5 15527 half_mode = V2DImode;
2bf6d935
ML
15528 break;
15529 case E_V4DFmode:
1aeecaf5 15530 half_mode = V2DFmode;
2bf6d935
ML
15531 break;
15532 case E_V4SImode:
1aeecaf5 15533 half_mode = V2SImode;
2bf6d935
ML
15534 break;
15535 case E_V4SFmode:
1aeecaf5 15536 half_mode = V2SFmode;
2bf6d935
ML
15537 break;
15538 case E_V2DImode:
1aeecaf5 15539 half_mode = DImode;
2bf6d935
ML
15540 break;
15541 case E_V2SImode:
1aeecaf5 15542 half_mode = SImode;
2bf6d935
ML
15543 break;
15544 case E_V2DFmode:
1aeecaf5 15545 half_mode = DFmode;
2bf6d935
ML
15546 break;
15547 case E_V2SFmode:
1aeecaf5 15548 half_mode = SFmode;
2bf6d935
ML
15549 break;
15550 default:
15551 gcc_unreachable ();
15552 }
15553
1aeecaf5
HL
15554 if (!register_operand (ops[1], half_mode))
15555 ops[1] = force_reg (half_mode, ops[1]);
15556 if (!register_operand (ops[0], half_mode))
15557 ops[0] = force_reg (half_mode, ops[0]);
2bf6d935
ML
15558 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
15559 ops[1])));
15560 break;
15561
15562 case 4:
15563 switch (mode)
15564 {
15565 case E_V4DImode:
1aeecaf5 15566 half_mode = V2DImode;
2bf6d935
ML
15567 break;
15568 case E_V4DFmode:
1aeecaf5 15569 half_mode = V2DFmode;
2bf6d935
ML
15570 break;
15571 case E_V4SImode:
1aeecaf5 15572 half_mode = V2SImode;
2bf6d935
ML
15573 break;
15574 case E_V4SFmode:
1aeecaf5 15575 half_mode = V2SFmode;
2bf6d935
ML
15576 break;
15577 default:
15578 gcc_unreachable ();
15579 }
15580 goto half;
15581
15582 case 8:
15583 switch (mode)
15584 {
15585 case E_V8DImode:
1aeecaf5 15586 half_mode = V4DImode;
2bf6d935
ML
15587 break;
15588 case E_V8DFmode:
1aeecaf5 15589 half_mode = V4DFmode;
2bf6d935
ML
15590 break;
15591 case E_V8SImode:
1aeecaf5 15592 half_mode = V4SImode;
2bf6d935
ML
15593 break;
15594 case E_V8SFmode:
1aeecaf5 15595 half_mode = V4SFmode;
2bf6d935
ML
15596 break;
15597 default:
15598 gcc_unreachable ();
15599 }
15600 goto half;
15601
15602 case 16:
15603 switch (mode)
15604 {
15605 case E_V16SImode:
1aeecaf5 15606 half_mode = V8SImode;
2bf6d935
ML
15607 break;
15608 case E_V16SFmode:
1aeecaf5 15609 half_mode = V8SFmode;
2bf6d935
ML
15610 break;
15611 default:
15612 gcc_unreachable ();
15613 }
15614 goto half;
15615
15616half:
15617 /* FIXME: We process inputs backward to help RA. PR 36222. */
15618 i = n - 1;
1aeecaf5 15619 for (j = 1; j != -1; j--)
2bf6d935 15620 {
1aeecaf5
HL
15621 half[j] = gen_reg_rtx (half_mode);
15622 switch (n >> 1)
2bf6d935 15623 {
1aeecaf5
HL
15624 case 2:
15625 v = gen_rtvec (2, ops[i-1], ops[i]);
15626 i -= 2;
15627 break;
15628 case 4:
15629 v = gen_rtvec (4, ops[i-3], ops[i-2], ops[i-1], ops[i]);
15630 i -= 4;
15631 break;
15632 case 8:
15633 v = gen_rtvec (8, ops[i-7], ops[i-6], ops[i-5], ops[i-4],
15634 ops[i-3], ops[i-2], ops[i-1], ops[i]);
15635 i -= 8;
15636 break;
15637 default:
15638 gcc_unreachable ();
2bf6d935 15639 }
1aeecaf5
HL
15640 ix86_expand_vector_init (false, half[j],
15641 gen_rtx_PARALLEL (half_mode, v));
2bf6d935 15642 }
1aeecaf5
HL
15643
15644 ix86_expand_vector_init_concat (mode, target, half, 2);
2bf6d935
ML
15645 break;
15646
15647 default:
15648 gcc_unreachable ();
15649 }
15650}
15651
15652/* A subroutine of ix86_expand_vector_init_general. Use vector
15653 interleave to handle the most general case: all values variable,
15654 and none identical. */
15655
15656static void
15657ix86_expand_vector_init_interleave (machine_mode mode,
15658 rtx target, rtx *ops, int n)
15659{
15660 machine_mode first_imode, second_imode, third_imode, inner_mode;
15661 int i, j;
9e2a82e1 15662 rtx op, op0, op1;
2bf6d935
ML
15663 rtx (*gen_load_even) (rtx, rtx, rtx);
15664 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
15665 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
15666
15667 switch (mode)
15668 {
9e2a82e1 15669 case E_V8HFmode:
7fc4d600 15670 gen_load_even = gen_vec_interleave_lowv8hf;
9e2a82e1 15671 gen_interleave_first_low = gen_vec_interleave_lowv4si;
15672 gen_interleave_second_low = gen_vec_interleave_lowv2di;
15673 inner_mode = HFmode;
15674 first_imode = V4SImode;
15675 second_imode = V2DImode;
15676 third_imode = VOIDmode;
15677 break;
6910cad5 15678 case E_V8BFmode:
15679 gen_load_even = gen_vec_interleave_lowv8bf;
15680 gen_interleave_first_low = gen_vec_interleave_lowv4si;
15681 gen_interleave_second_low = gen_vec_interleave_lowv2di;
15682 inner_mode = BFmode;
15683 first_imode = V4SImode;
15684 second_imode = V2DImode;
15685 third_imode = VOIDmode;
15686 break;
2bf6d935
ML
15687 case E_V8HImode:
15688 gen_load_even = gen_vec_setv8hi;
15689 gen_interleave_first_low = gen_vec_interleave_lowv4si;
15690 gen_interleave_second_low = gen_vec_interleave_lowv2di;
15691 inner_mode = HImode;
15692 first_imode = V4SImode;
15693 second_imode = V2DImode;
15694 third_imode = VOIDmode;
15695 break;
15696 case E_V16QImode:
15697 gen_load_even = gen_vec_setv16qi;
15698 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
15699 gen_interleave_second_low = gen_vec_interleave_lowv4si;
15700 inner_mode = QImode;
15701 first_imode = V8HImode;
15702 second_imode = V4SImode;
15703 third_imode = V2DImode;
15704 break;
15705 default:
15706 gcc_unreachable ();
15707 }
15708
15709 for (i = 0; i < n; i++)
15710 {
9e2a82e1 15711 op = ops [i + i];
6910cad5 15712 if (inner_mode == HFmode || inner_mode == BFmode)
9e2a82e1 15713 {
7fc4d600 15714 rtx even, odd;
6910cad5 15715 /* Use vpuncklwd to pack 2 HFmode or BFmode. */
15716 machine_mode vec_mode =
15717 (inner_mode == HFmode) ? V8HFmode : V8BFmode;
15718 op0 = gen_reg_rtx (vec_mode);
15719 even = lowpart_subreg (vec_mode,
15720 force_reg (inner_mode, op), inner_mode);
15721 odd = lowpart_subreg (vec_mode,
15722 force_reg (inner_mode, ops[i + i + 1]),
15723 inner_mode);
7fc4d600 15724 emit_insn (gen_load_even (op0, even, odd));
9e2a82e1 15725 }
7fc4d600 15726 else
15727 {
15728 /* Extend the odd elment to SImode using a paradoxical SUBREG. */
15729 op0 = gen_reg_rtx (SImode);
15730 emit_move_insn (op0, gen_lowpart (SImode, op));
9e2a82e1 15731
7fc4d600 15732 /* Insert the SImode value as low element of V4SImode vector. */
15733 op1 = gen_reg_rtx (V4SImode);
15734 op0 = gen_rtx_VEC_MERGE (V4SImode,
15735 gen_rtx_VEC_DUPLICATE (V4SImode,
15736 op0),
15737 CONST0_RTX (V4SImode),
15738 const1_rtx);
15739 emit_insn (gen_rtx_SET (op1, op0));
2bf6d935 15740
7fc4d600 15741 /* Cast the V4SImode vector back to a vector in orignal mode. */
15742 op0 = gen_reg_rtx (mode);
15743 emit_move_insn (op0, gen_lowpart (mode, op1));
2bf6d935 15744
7fc4d600 15745 /* Load even elements into the second position. */
15746 emit_insn (gen_load_even (op0,
15747 force_reg (inner_mode,
15748 ops[i + i + 1]),
15749 const1_rtx));
15750 }
2bf6d935
ML
15751
15752 /* Cast vector to FIRST_IMODE vector. */
15753 ops[i] = gen_reg_rtx (first_imode);
15754 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
15755 }
15756
15757 /* Interleave low FIRST_IMODE vectors. */
15758 for (i = j = 0; i < n; i += 2, j++)
15759 {
15760 op0 = gen_reg_rtx (first_imode);
15761 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
15762
15763 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
15764 ops[j] = gen_reg_rtx (second_imode);
15765 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
15766 }
15767
15768 /* Interleave low SECOND_IMODE vectors. */
15769 switch (second_imode)
15770 {
15771 case E_V4SImode:
15772 for (i = j = 0; i < n / 2; i += 2, j++)
15773 {
15774 op0 = gen_reg_rtx (second_imode);
15775 emit_insn (gen_interleave_second_low (op0, ops[i],
15776 ops[i + 1]));
15777
15778 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
15779 vector. */
15780 ops[j] = gen_reg_rtx (third_imode);
15781 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
15782 }
15783 second_imode = V2DImode;
15784 gen_interleave_second_low = gen_vec_interleave_lowv2di;
15785 /* FALLTHRU */
15786
15787 case E_V2DImode:
15788 op0 = gen_reg_rtx (second_imode);
15789 emit_insn (gen_interleave_second_low (op0, ops[0],
15790 ops[1]));
15791
15792 /* Cast the SECOND_IMODE vector back to a vector on original
15793 mode. */
15794 emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
15795 break;
15796
15797 default:
15798 gcc_unreachable ();
15799 }
15800}
15801
15802/* A subroutine of ix86_expand_vector_init. Handle the most general case:
15803 all values variable, and none identical. */
15804
15805static void
15806ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
15807 rtx target, rtx vals)
15808{
15809 rtx ops[64], op0, op1, op2, op3, op4, op5;
15810 machine_mode half_mode = VOIDmode;
15811 machine_mode quarter_mode = VOIDmode;
15812 int n, i;
15813
15814 switch (mode)
15815 {
15816 case E_V2SFmode:
15817 case E_V2SImode:
15818 if (!mmx_ok && !TARGET_SSE)
15819 break;
15820 /* FALLTHRU */
15821
15822 case E_V16SImode:
15823 case E_V16SFmode:
15824 case E_V8DFmode:
15825 case E_V8DImode:
15826 case E_V8SFmode:
15827 case E_V8SImode:
15828 case E_V4DFmode:
15829 case E_V4DImode:
15830 case E_V4SFmode:
15831 case E_V4SImode:
15832 case E_V2DFmode:
15833 case E_V2DImode:
15834 n = GET_MODE_NUNITS (mode);
15835 for (i = 0; i < n; i++)
15836 ops[i] = XVECEXP (vals, 0, i);
15837 ix86_expand_vector_init_concat (mode, target, ops, n);
15838 return;
15839
15840 case E_V2TImode:
15841 for (i = 0; i < 2; i++)
15842 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
15843 op0 = gen_reg_rtx (V4DImode);
15844 ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
15845 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
15846 return;
15847
15848 case E_V4TImode:
15849 for (i = 0; i < 4; i++)
15850 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
15851 ops[4] = gen_reg_rtx (V4DImode);
15852 ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
15853 ops[5] = gen_reg_rtx (V4DImode);
15854 ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
15855 op0 = gen_reg_rtx (V8DImode);
15856 ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
15857 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
15858 return;
15859
15860 case E_V32QImode:
15861 half_mode = V16QImode;
15862 goto half;
15863
15864 case E_V16HImode:
15865 half_mode = V8HImode;
15866 goto half;
15867
9e2a82e1 15868 case E_V16HFmode:
15869 half_mode = V8HFmode;
15870 goto half;
15871
6910cad5 15872 case E_V16BFmode:
15873 half_mode = V8BFmode;
15874 goto half;
15875
2bf6d935
ML
15876half:
15877 n = GET_MODE_NUNITS (mode);
15878 for (i = 0; i < n; i++)
15879 ops[i] = XVECEXP (vals, 0, i);
15880 op0 = gen_reg_rtx (half_mode);
15881 op1 = gen_reg_rtx (half_mode);
15882 ix86_expand_vector_init_interleave (half_mode, op0, ops,
15883 n >> 2);
15884 ix86_expand_vector_init_interleave (half_mode, op1,
15885 &ops [n >> 1], n >> 2);
15886 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
15887 return;
15888
15889 case E_V64QImode:
15890 quarter_mode = V16QImode;
15891 half_mode = V32QImode;
15892 goto quarter;
15893
15894 case E_V32HImode:
15895 quarter_mode = V8HImode;
15896 half_mode = V16HImode;
15897 goto quarter;
15898
9e2a82e1 15899 case E_V32HFmode:
15900 quarter_mode = V8HFmode;
15901 half_mode = V16HFmode;
15902 goto quarter;
15903
6910cad5 15904 case E_V32BFmode:
15905 quarter_mode = V8BFmode;
15906 half_mode = V16BFmode;
15907 goto quarter;
15908
2bf6d935
ML
15909quarter:
15910 n = GET_MODE_NUNITS (mode);
15911 for (i = 0; i < n; i++)
15912 ops[i] = XVECEXP (vals, 0, i);
15913 op0 = gen_reg_rtx (quarter_mode);
15914 op1 = gen_reg_rtx (quarter_mode);
15915 op2 = gen_reg_rtx (quarter_mode);
15916 op3 = gen_reg_rtx (quarter_mode);
15917 op4 = gen_reg_rtx (half_mode);
15918 op5 = gen_reg_rtx (half_mode);
15919 ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
15920 n >> 3);
15921 ix86_expand_vector_init_interleave (quarter_mode, op1,
15922 &ops [n >> 2], n >> 3);
15923 ix86_expand_vector_init_interleave (quarter_mode, op2,
15924 &ops [n >> 1], n >> 3);
15925 ix86_expand_vector_init_interleave (quarter_mode, op3,
15926 &ops [(n >> 1) | (n >> 2)], n >> 3);
15927 emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
15928 emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
15929 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
15930 return;
15931
15932 case E_V16QImode:
15933 if (!TARGET_SSE4_1)
15934 break;
15935 /* FALLTHRU */
15936
15937 case E_V8HImode:
15938 if (!TARGET_SSE2)
15939 break;
15940
15941 /* Don't use ix86_expand_vector_init_interleave if we can't
15942 move from GPR to SSE register directly. */
15943 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
15944 break;
9e2a82e1 15945 /* FALLTHRU */
15946
15947 case E_V8HFmode:
6910cad5 15948 case E_V8BFmode:
2bf6d935
ML
15949
15950 n = GET_MODE_NUNITS (mode);
15951 for (i = 0; i < n; i++)
15952 ops[i] = XVECEXP (vals, 0, i);
15953 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
15954 return;
15955
15956 case E_V4HImode:
15957 case E_V8QImode:
8d7dae0e
UB
15958
15959 case E_V2HImode:
64735dc9 15960 case E_V4QImode:
2bf6d935
ML
15961 break;
15962
15963 default:
15964 gcc_unreachable ();
15965 }
15966
15967 {
15968 int i, j, n_elts, n_words, n_elt_per_word;
8d7dae0e 15969 machine_mode tmp_mode, inner_mode;
2bf6d935
ML
15970 rtx words[4], shift;
15971
8d7dae0e
UB
15972 tmp_mode = (GET_MODE_SIZE (mode) < UNITS_PER_WORD) ? SImode : word_mode;
15973
2bf6d935
ML
15974 inner_mode = GET_MODE_INNER (mode);
15975 n_elts = GET_MODE_NUNITS (mode);
8d7dae0e 15976 n_words = GET_MODE_SIZE (mode) / GET_MODE_SIZE (tmp_mode);
2bf6d935
ML
15977 n_elt_per_word = n_elts / n_words;
15978 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
15979
15980 for (i = 0; i < n_words; ++i)
15981 {
15982 rtx word = NULL_RTX;
15983
15984 for (j = 0; j < n_elt_per_word; ++j)
15985 {
15986 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
8d7dae0e 15987 elt = convert_modes (tmp_mode, inner_mode, elt, true);
2bf6d935
ML
15988
15989 if (j == 0)
15990 word = elt;
15991 else
15992 {
8d7dae0e 15993 word = expand_simple_binop (tmp_mode, ASHIFT, word, shift,
e1a74058 15994 NULL_RTX, 1, OPTAB_LIB_WIDEN);
8d7dae0e 15995 word = expand_simple_binop (tmp_mode, IOR, word, elt,
e1a74058 15996 NULL_RTX, 1, OPTAB_LIB_WIDEN);
2bf6d935
ML
15997 }
15998 }
15999
16000 words[i] = word;
16001 }
16002
16003 if (n_words == 1)
16004 emit_move_insn (target, gen_lowpart (mode, words[0]));
16005 else if (n_words == 2)
16006 {
16007 rtx tmp = gen_reg_rtx (mode);
16008 emit_clobber (tmp);
8d7dae0e
UB
16009 emit_move_insn (gen_lowpart (tmp_mode, tmp), words[0]);
16010 emit_move_insn (gen_highpart (tmp_mode, tmp), words[1]);
2bf6d935
ML
16011 emit_move_insn (target, tmp);
16012 }
16013 else if (n_words == 4)
16014 {
16015 rtx tmp = gen_reg_rtx (V4SImode);
8d7dae0e 16016 gcc_assert (tmp_mode == SImode);
2bf6d935
ML
16017 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
16018 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
16019 emit_move_insn (target, gen_lowpart (mode, tmp));
16020 }
16021 else
16022 gcc_unreachable ();
16023 }
16024}
16025
16026/* Initialize vector TARGET via VALS. Suppress the use of MMX
16027 instructions unless MMX_OK is true. */
16028
16029void
16030ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
16031{
16032 machine_mode mode = GET_MODE (target);
16033 machine_mode inner_mode = GET_MODE_INNER (mode);
16034 int n_elts = GET_MODE_NUNITS (mode);
16035 int n_var = 0, one_var = -1;
16036 bool all_same = true, all_const_zero = true;
16037 int i;
16038 rtx x;
16039
16040 /* Handle first initialization from vector elts. */
16041 if (n_elts != XVECLEN (vals, 0))
16042 {
16043 rtx subtarget = target;
16044 x = XVECEXP (vals, 0, 0);
16045 gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
16046 if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
16047 {
16048 rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
b7dd2e4e
JJ
16049 if (inner_mode == QImode
16050 || inner_mode == HImode
575191b9 16051 || inner_mode == TImode
6910cad5 16052 || inner_mode == HFmode
16053 || inner_mode == BFmode)
2bf6d935
ML
16054 {
16055 unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
b7dd2e4e
JJ
16056 scalar_mode elt_mode = inner_mode == TImode ? DImode : SImode;
16057 n_bits /= GET_MODE_SIZE (elt_mode);
16058 mode = mode_for_vector (elt_mode, n_bits).require ();
16059 inner_mode = mode_for_vector (elt_mode, n_bits / 2).require ();
2bf6d935
ML
16060 ops[0] = gen_lowpart (inner_mode, ops[0]);
16061 ops[1] = gen_lowpart (inner_mode, ops[1]);
16062 subtarget = gen_reg_rtx (mode);
16063 }
16064 ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
16065 if (subtarget != target)
16066 emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
16067 return;
16068 }
16069 gcc_unreachable ();
16070 }
16071
16072 for (i = 0; i < n_elts; ++i)
16073 {
16074 x = XVECEXP (vals, 0, i);
16075 if (!(CONST_SCALAR_INT_P (x)
16076 || CONST_DOUBLE_P (x)
16077 || CONST_FIXED_P (x)))
16078 n_var++, one_var = i;
16079 else if (x != CONST0_RTX (inner_mode))
16080 all_const_zero = false;
16081 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
16082 all_same = false;
16083 }
16084
16085 /* Constants are best loaded from the constant pool. */
16086 if (n_var == 0)
16087 {
16088 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
16089 return;
16090 }
16091
16092 /* If all values are identical, broadcast the value. */
16093 if (all_same
16094 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
16095 XVECEXP (vals, 0, 0)))
16096 return;
16097
16098 /* Values where only one field is non-constant are best loaded from
16099 the pool and overwritten via move later. */
16100 if (n_var == 1)
16101 {
16102 if (all_const_zero
16103 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
16104 XVECEXP (vals, 0, one_var),
16105 one_var))
16106 return;
16107
16108 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
16109 return;
16110 }
16111
16112 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
16113}
16114
287cc750 16115/* Implemented as
16116 V setg (V v, int idx, T val)
16117 {
16118 V idxv = (V){idx, idx, idx, idx, idx, idx, idx, idx};
16119 V valv = (V){val, val, val, val, val, val, val, val};
16120 V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == idxv);
16121 v = (v & ~mask) | (valv & mask);
16122 return v;
16123 }. */
16124void
16125ix86_expand_vector_set_var (rtx target, rtx val, rtx idx)
16126{
16127 rtx vec[64];
16128 machine_mode mode = GET_MODE (target);
16129 machine_mode cmp_mode = mode;
16130 int n_elts = GET_MODE_NUNITS (mode);
16131 rtx valv,idxv,constv,idx_tmp;
16132 bool ok = false;
16133
16134 /* 512-bits vector byte/word broadcast and comparison only available
16135 under TARGET_AVX512BW, break 512-bits vector into two 256-bits vector
16136 when without TARGET_AVX512BW. */
6910cad5 16137 if ((mode == V32HImode || mode == V32HFmode || mode == V32BFmode
16138 || mode == V64QImode)
7a54d3de 16139 && !TARGET_AVX512BW)
287cc750 16140 {
16141 gcc_assert (TARGET_AVX512F);
16142 rtx vhi, vlo, idx_hi;
16143 machine_mode half_mode;
16144 rtx (*extract_hi)(rtx, rtx);
16145 rtx (*extract_lo)(rtx, rtx);
16146
16147 if (mode == V32HImode)
16148 {
16149 half_mode = V16HImode;
16150 extract_hi = gen_vec_extract_hi_v32hi;
16151 extract_lo = gen_vec_extract_lo_v32hi;
16152 }
7a54d3de
UB
16153 else if (mode == V32HFmode)
16154 {
16155 half_mode = V16HFmode;
16156 extract_hi = gen_vec_extract_hi_v32hf;
16157 extract_lo = gen_vec_extract_lo_v32hf;
16158 }
6910cad5 16159 else if (mode == V32BFmode)
16160 {
16161 half_mode = V16BFmode;
16162 extract_hi = gen_vec_extract_hi_v32bf;
16163 extract_lo = gen_vec_extract_lo_v32bf;
16164 }
287cc750 16165 else
16166 {
16167 half_mode = V32QImode;
16168 extract_hi = gen_vec_extract_hi_v64qi;
16169 extract_lo = gen_vec_extract_lo_v64qi;
16170 }
16171
16172 vhi = gen_reg_rtx (half_mode);
16173 vlo = gen_reg_rtx (half_mode);
16174 idx_hi = gen_reg_rtx (GET_MODE (idx));
16175 emit_insn (extract_hi (vhi, target));
16176 emit_insn (extract_lo (vlo, target));
16177 vec[0] = idx_hi;
16178 vec[1] = idx;
16179 vec[2] = GEN_INT (n_elts/2);
16180 ix86_expand_binary_operator (MINUS, GET_MODE (idx), vec);
16181 ix86_expand_vector_set_var (vhi, val, idx_hi);
16182 ix86_expand_vector_set_var (vlo, val, idx);
16183 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, vlo, vhi)));
16184 return;
16185 }
16186
16187 if (FLOAT_MODE_P (GET_MODE_INNER (mode)))
16188 {
16189 switch (mode)
16190 {
16191 case E_V2DFmode:
16192 cmp_mode = V2DImode;
16193 break;
16194 case E_V4DFmode:
16195 cmp_mode = V4DImode;
16196 break;
16197 case E_V8DFmode:
16198 cmp_mode = V8DImode;
16199 break;
20a2c8ac
UB
16200 case E_V2SFmode:
16201 cmp_mode = V2SImode;
16202 break;
287cc750 16203 case E_V4SFmode:
16204 cmp_mode = V4SImode;
16205 break;
16206 case E_V8SFmode:
16207 cmp_mode = V8SImode;
16208 break;
16209 case E_V16SFmode:
16210 cmp_mode = V16SImode;
16211 break;
9e2a82e1 16212 case E_V8HFmode:
16213 cmp_mode = V8HImode;
16214 break;
16215 case E_V16HFmode:
16216 cmp_mode = V16HImode;
16217 break;
16218 case E_V32HFmode:
16219 cmp_mode = V32HImode;
16220 break;
6910cad5 16221 case E_V8BFmode:
16222 cmp_mode = V8HImode;
16223 break;
16224 case E_V16BFmode:
16225 cmp_mode = V16HImode;
16226 break;
16227 case E_V32BFmode:
16228 cmp_mode = V32HImode;
16229 break;
287cc750 16230 default:
16231 gcc_unreachable ();
16232 }
16233 }
16234
16235 for (int i = 0; i != n_elts; i++)
16236 vec[i] = GEN_INT (i);
16237 constv = gen_rtx_CONST_VECTOR (cmp_mode, gen_rtvec_v (n_elts, vec));
16238 valv = gen_reg_rtx (mode);
16239 idxv = gen_reg_rtx (cmp_mode);
16240 idx_tmp = convert_to_mode (GET_MODE_INNER (cmp_mode), idx, 1);
16241
20a2c8ac
UB
16242 ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE,
16243 mode, valv, val);
287cc750 16244 gcc_assert (ok);
20a2c8ac
UB
16245 ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE,
16246 cmp_mode, idxv, idx_tmp);
287cc750 16247 gcc_assert (ok);
16248 vec[0] = target;
16249 vec[1] = valv;
16250 vec[2] = target;
16251 vec[3] = gen_rtx_EQ (mode, idxv, constv);
16252 vec[4] = idxv;
16253 vec[5] = constv;
16254 ok = ix86_expand_int_vcond (vec);
16255 gcc_assert (ok);
16256}
16257
2bf6d935
ML
16258void
16259ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
16260{
16261 machine_mode mode = GET_MODE (target);
16262 machine_mode inner_mode = GET_MODE_INNER (mode);
16263 machine_mode half_mode;
16264 bool use_vec_merge = false;
7fc4d600 16265 bool blendm_const = false;
2bf6d935 16266 rtx tmp;
6910cad5 16267 static rtx (*gen_extract[8][2]) (rtx, rtx)
2bf6d935
ML
16268 = {
16269 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
16270 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
16271 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
16272 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
16273 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
9e2a82e1 16274 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df },
6910cad5 16275 { gen_vec_extract_lo_v16hf, gen_vec_extract_hi_v16hf },
16276 { gen_vec_extract_lo_v16bf, gen_vec_extract_hi_v16bf }
2bf6d935 16277 };
6910cad5 16278 static rtx (*gen_insert[8][2]) (rtx, rtx, rtx)
2bf6d935
ML
16279 = {
16280 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
16281 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
16282 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
16283 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
16284 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
9e2a82e1 16285 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df },
16286 { gen_vec_set_lo_v16hf, gen_vec_set_hi_v16hf },
6910cad5 16287 { gen_vec_set_lo_v16bf, gen_vec_set_hi_v16bf },
2bf6d935
ML
16288 };
16289 int i, j, n;
16290 machine_mode mmode = VOIDmode;
16291 rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
16292
16293 switch (mode)
16294 {
2bf6d935 16295 case E_V2SImode:
f15c7bd1
UB
16296 use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
16297 if (use_vec_merge)
16298 break;
16299 /* FALLTHRU */
16300
16301 case E_V2SFmode:
2bf6d935
ML
16302 if (mmx_ok)
16303 {
16304 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
16305 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
16306 if (elt == 0)
16307 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
16308 else
16309 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
16310 emit_insn (gen_rtx_SET (target, tmp));
16311 return;
16312 }
16313 break;
16314
16315 case E_V2DImode:
16316 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
16317 if (use_vec_merge)
16318 break;
16319
16320 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
16321 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
16322 if (elt == 0)
16323 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
16324 else
16325 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
16326 emit_insn (gen_rtx_SET (target, tmp));
16327 return;
16328
16329 case E_V2DFmode:
ac173024
L
16330 /* NB: For ELT == 0, use standard scalar operation patterns which
16331 preserve the rest of the vector for combiner:
16332
16333 (vec_merge:V2DF
16334 (vec_duplicate:V2DF (reg:DF))
16335 (reg:V2DF)
16336 (const_int 1))
16337 */
16338 if (elt == 0)
16339 goto do_vec_merge;
16340
2bf6d935
ML
16341 {
16342 rtx op0, op1;
16343
16344 /* For the two element vectors, we implement a VEC_CONCAT with
16345 the extraction of the other element. */
16346
16347 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
16348 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
16349
16350 if (elt == 0)
16351 op0 = val, op1 = tmp;
16352 else
16353 op0 = tmp, op1 = val;
16354
16355 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
16356 emit_insn (gen_rtx_SET (target, tmp));
16357 }
16358 return;
16359
16360 case E_V4SFmode:
16361 use_vec_merge = TARGET_SSE4_1;
16362 if (use_vec_merge)
16363 break;
16364
16365 switch (elt)
16366 {
16367 case 0:
16368 use_vec_merge = true;
16369 break;
16370
16371 case 1:
16372 /* tmp = target = A B C D */
16373 tmp = copy_to_reg (target);
16374 /* target = A A B B */
16375 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
16376 /* target = X A B B */
16377 ix86_expand_vector_set (false, target, val, 0);
16378 /* target = A X C D */
16379 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
16380 const1_rtx, const0_rtx,
16381 GEN_INT (2+4), GEN_INT (3+4)));
16382 return;
16383
16384 case 2:
16385 /* tmp = target = A B C D */
16386 tmp = copy_to_reg (target);
16387 /* tmp = X B C D */
16388 ix86_expand_vector_set (false, tmp, val, 0);
16389 /* target = A B X D */
16390 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
16391 const0_rtx, const1_rtx,
16392 GEN_INT (0+4), GEN_INT (3+4)));
16393 return;
16394
16395 case 3:
16396 /* tmp = target = A B C D */
16397 tmp = copy_to_reg (target);
16398 /* tmp = X B C D */
16399 ix86_expand_vector_set (false, tmp, val, 0);
16400 /* target = A B X D */
16401 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
16402 const0_rtx, const1_rtx,
16403 GEN_INT (2+4), GEN_INT (0+4)));
16404 return;
16405
16406 default:
16407 gcc_unreachable ();
16408 }
16409 break;
16410
16411 case E_V4SImode:
16412 use_vec_merge = TARGET_SSE4_1;
16413 if (use_vec_merge)
16414 break;
16415
16416 /* Element 0 handled by vec_merge below. */
16417 if (elt == 0)
16418 {
16419 use_vec_merge = true;
16420 break;
16421 }
16422
16423 if (TARGET_SSE2)
16424 {
16425 /* With SSE2, use integer shuffles to swap element 0 and ELT,
16426 store into element 0, then shuffle them back. */
16427
16428 rtx order[4];
16429
16430 order[0] = GEN_INT (elt);
16431 order[1] = const1_rtx;
16432 order[2] = const2_rtx;
16433 order[3] = GEN_INT (3);
16434 order[elt] = const0_rtx;
16435
16436 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
16437 order[1], order[2], order[3]));
16438
16439 ix86_expand_vector_set (false, target, val, 0);
16440
16441 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
16442 order[1], order[2], order[3]));
16443 }
16444 else
16445 {
16446 /* For SSE1, we have to reuse the V4SF code. */
16447 rtx t = gen_reg_rtx (V4SFmode);
16448 emit_move_insn (t, gen_lowpart (V4SFmode, target));
16449 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
16450 emit_move_insn (target, gen_lowpart (mode, t));
16451 }
16452 return;
16453
16454 case E_V8HImode:
7eb961d8 16455 case E_V8HFmode:
6910cad5 16456 case E_V8BFmode:
5883e567 16457 case E_V2HImode:
2bf6d935
ML
16458 use_vec_merge = TARGET_SSE2;
16459 break;
16460 case E_V4HImode:
16461 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
16462 break;
16463
16464 case E_V16QImode:
5883e567 16465 case E_V4QImode:
2bf6d935
ML
16466 use_vec_merge = TARGET_SSE4_1;
16467 break;
16468
16469 case E_V8QImode:
f15c7bd1 16470 use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
2bf6d935
ML
16471 break;
16472
16473 case E_V32QImode:
16474 half_mode = V16QImode;
16475 j = 0;
16476 n = 16;
16477 goto half;
16478
9e2a82e1 16479 case E_V16HFmode:
6910cad5 16480 case E_V16BFmode:
1f759dbd 16481 /* For ELT == 0, vec_setv8hf_0 can save 1 vpbroadcastw. */
16482 if (TARGET_AVX2 && elt != 0)
7fc4d600 16483 {
16484 mmode = SImode;
6910cad5 16485 gen_blendm = ((mode == E_V16HFmode) ? gen_avx2_pblendph_1
16486 : gen_avx2_pblendbf_1);
7fc4d600 16487 blendm_const = true;
16488 break;
16489 }
16490 else
16491 {
6910cad5 16492 half_mode = ((mode == E_V16HFmode) ? V8HFmode : V8BFmode);
16493 j = ((mode == E_V16HFmode) ? 6 : 7);
7fc4d600 16494 n = 8;
16495 goto half;
16496 }
9e2a82e1 16497
2bf6d935
ML
16498 case E_V16HImode:
16499 half_mode = V8HImode;
16500 j = 1;
16501 n = 8;
16502 goto half;
16503
16504 case E_V8SImode:
16505 half_mode = V4SImode;
16506 j = 2;
16507 n = 4;
16508 goto half;
16509
16510 case E_V4DImode:
16511 half_mode = V2DImode;
16512 j = 3;
16513 n = 2;
16514 goto half;
16515
16516 case E_V8SFmode:
16517 half_mode = V4SFmode;
16518 j = 4;
16519 n = 4;
16520 goto half;
16521
16522 case E_V4DFmode:
16523 half_mode = V2DFmode;
16524 j = 5;
16525 n = 2;
16526 goto half;
16527
16528half:
16529 /* Compute offset. */
16530 i = elt / n;
16531 elt %= n;
16532
16533 gcc_assert (i <= 1);
16534
16535 /* Extract the half. */
16536 tmp = gen_reg_rtx (half_mode);
16537 emit_insn (gen_extract[j][i] (tmp, target));
16538
16539 /* Put val in tmp at elt. */
16540 ix86_expand_vector_set (false, tmp, val, elt);
16541
16542 /* Put it back. */
16543 emit_insn (gen_insert[j][i] (target, target, tmp));
16544 return;
16545
16546 case E_V8DFmode:
16547 if (TARGET_AVX512F)
16548 {
16549 mmode = QImode;
16550 gen_blendm = gen_avx512f_blendmv8df;
16551 }
16552 break;
16553
16554 case E_V8DImode:
16555 if (TARGET_AVX512F)
16556 {
16557 mmode = QImode;
16558 gen_blendm = gen_avx512f_blendmv8di;
16559 }
16560 break;
16561
16562 case E_V16SFmode:
16563 if (TARGET_AVX512F)
16564 {
16565 mmode = HImode;
16566 gen_blendm = gen_avx512f_blendmv16sf;
16567 }
16568 break;
16569
16570 case E_V16SImode:
16571 if (TARGET_AVX512F)
16572 {
16573 mmode = HImode;
16574 gen_blendm = gen_avx512f_blendmv16si;
16575 }
16576 break;
16577
9e2a82e1 16578 case E_V32HFmode:
16579 if (TARGET_AVX512BW)
16580 {
16581 mmode = SImode;
16582 gen_blendm = gen_avx512bw_blendmv32hf;
16583 }
16584 break;
6910cad5 16585 case E_V32BFmode:
16586 if (TARGET_AVX512BW)
16587 {
16588 mmode = SImode;
16589 gen_blendm = gen_avx512bw_blendmv32bf;
16590 }
16591 break;
2bf6d935
ML
16592 case E_V32HImode:
16593 if (TARGET_AVX512BW)
16594 {
16595 mmode = SImode;
16596 gen_blendm = gen_avx512bw_blendmv32hi;
16597 }
16598 else if (TARGET_AVX512F)
16599 {
16600 half_mode = E_V8HImode;
16601 n = 8;
16602 goto quarter;
16603 }
16604 break;
16605
16606 case E_V64QImode:
16607 if (TARGET_AVX512BW)
16608 {
16609 mmode = DImode;
16610 gen_blendm = gen_avx512bw_blendmv64qi;
16611 }
16612 else if (TARGET_AVX512F)
16613 {
16614 half_mode = E_V16QImode;
16615 n = 16;
16616 goto quarter;
16617 }
16618 break;
16619
16620quarter:
16621 /* Compute offset. */
16622 i = elt / n;
16623 elt %= n;
16624
16625 gcc_assert (i <= 3);
16626
16627 {
16628 /* Extract the quarter. */
16629 tmp = gen_reg_rtx (V4SImode);
16630 rtx tmp2 = gen_lowpart (V16SImode, target);
16631 rtx mask = gen_reg_rtx (QImode);
16632
16633 emit_move_insn (mask, constm1_rtx);
16634 emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i),
16635 tmp, mask));
16636
16637 tmp2 = gen_reg_rtx (half_mode);
16638 emit_move_insn (tmp2, gen_lowpart (half_mode, tmp));
16639 tmp = tmp2;
16640
16641 /* Put val in tmp at elt. */
16642 ix86_expand_vector_set (false, tmp, val, elt);
16643
16644 /* Put it back. */
16645 tmp2 = gen_reg_rtx (V16SImode);
16646 rtx tmp3 = gen_lowpart (V16SImode, target);
16647 mask = gen_reg_rtx (HImode);
16648 emit_move_insn (mask, constm1_rtx);
16649 tmp = gen_lowpart (V4SImode, tmp);
16650 emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i),
16651 tmp3, mask));
16652 emit_move_insn (target, gen_lowpart (mode, tmp2));
16653 }
16654 return;
16655
16656 default:
16657 break;
16658 }
16659
16660 if (mmode != VOIDmode)
16661 {
16662 tmp = gen_reg_rtx (mode);
16663 emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
7fc4d600 16664 rtx merge_mask = gen_int_mode (HOST_WIDE_INT_1U << elt, mmode);
2bf6d935
ML
16665 /* The avx512*_blendm<mode> expanders have different operand order
16666 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
16667 elements where the mask is set and second input operand otherwise,
16668 in {sse,avx}*_*blend* the first input operand is used for elements
16669 where the mask is clear and second input operand otherwise. */
7fc4d600 16670 if (!blendm_const)
16671 merge_mask = force_reg (mmode, merge_mask);
16672 emit_insn (gen_blendm (target, target, tmp, merge_mask));
2bf6d935
ML
16673 }
16674 else if (use_vec_merge)
16675 {
ac173024 16676do_vec_merge:
2bf6d935
ML
16677 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
16678 tmp = gen_rtx_VEC_MERGE (mode, tmp, target,
16679 GEN_INT (HOST_WIDE_INT_1U << elt));
16680 emit_insn (gen_rtx_SET (target, tmp));
16681 }
16682 else
16683 {
16684 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
16685
16686 emit_move_insn (mem, target);
16687
16688 tmp = adjust_address (mem, inner_mode, elt * GET_MODE_SIZE (inner_mode));
16689 emit_move_insn (tmp, val);
16690
16691 emit_move_insn (target, mem);
16692 }
16693}
16694
16695void
16696ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
16697{
16698 machine_mode mode = GET_MODE (vec);
16699 machine_mode inner_mode = GET_MODE_INNER (mode);
16700 bool use_vec_extr = false;
16701 rtx tmp;
16702
16703 switch (mode)
16704 {
16705 case E_V2SImode:
5fbc8ab4
UB
16706 use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
16707 if (use_vec_extr)
16708 break;
16709 /* FALLTHRU */
16710
2bf6d935
ML
16711 case E_V2SFmode:
16712 if (!mmx_ok)
16713 break;
16714 /* FALLTHRU */
16715
16716 case E_V2DFmode:
16717 case E_V2DImode:
16718 case E_V2TImode:
16719 case E_V4TImode:
16720 use_vec_extr = true;
16721 break;
16722
16723 case E_V4SFmode:
16724 use_vec_extr = TARGET_SSE4_1;
16725 if (use_vec_extr)
16726 break;
16727
16728 switch (elt)
16729 {
16730 case 0:
16731 tmp = vec;
16732 break;
16733
16734 case 1:
16735 case 3:
16736 tmp = gen_reg_rtx (mode);
16737 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
16738 GEN_INT (elt), GEN_INT (elt),
16739 GEN_INT (elt+4), GEN_INT (elt+4)));
16740 break;
16741
16742 case 2:
16743 tmp = gen_reg_rtx (mode);
16744 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
16745 break;
16746
16747 default:
16748 gcc_unreachable ();
16749 }
16750 vec = tmp;
16751 use_vec_extr = true;
16752 elt = 0;
16753 break;
16754
16755 case E_V4SImode:
16756 use_vec_extr = TARGET_SSE4_1;
16757 if (use_vec_extr)
16758 break;
16759
16760 if (TARGET_SSE2)
16761 {
16762 switch (elt)
16763 {
16764 case 0:
16765 tmp = vec;
16766 break;
16767
16768 case 1:
16769 case 3:
16770 tmp = gen_reg_rtx (mode);
16771 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
16772 GEN_INT (elt), GEN_INT (elt),
16773 GEN_INT (elt), GEN_INT (elt)));
16774 break;
16775
16776 case 2:
16777 tmp = gen_reg_rtx (mode);
16778 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
16779 break;
16780
16781 default:
16782 gcc_unreachable ();
16783 }
16784 vec = tmp;
16785 use_vec_extr = true;
16786 elt = 0;
16787 }
16788 else
16789 {
16790 /* For SSE1, we have to reuse the V4SF code. */
16791 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
16792 gen_lowpart (V4SFmode, vec), elt);
16793 return;
16794 }
16795 break;
16796
16797 case E_V8HImode:
7a54d3de 16798 case E_V8HFmode:
6910cad5 16799 case E_V8BFmode:
5883e567 16800 case E_V2HImode:
2bf6d935
ML
16801 use_vec_extr = TARGET_SSE2;
16802 break;
16803 case E_V4HImode:
16804 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
16805 break;
16806
16807 case E_V16QImode:
16808 use_vec_extr = TARGET_SSE4_1;
f66e6e2b
JJ
16809 if (!use_vec_extr
16810 && TARGET_SSE2
16811 && elt == 0
16812 && (optimize_insn_for_size_p () || TARGET_INTER_UNIT_MOVES_FROM_VEC))
16813 {
16814 tmp = gen_reg_rtx (SImode);
16815 ix86_expand_vector_extract (false, tmp, gen_lowpart (V4SImode, vec),
16816 0);
16817 emit_insn (gen_rtx_SET (target, gen_lowpart (QImode, tmp)));
16818 return;
16819 }
2bf6d935 16820 break;
5883e567
UB
16821 case E_V4QImode:
16822 use_vec_extr = TARGET_SSE4_1;
16823 break;
2bf6d935
ML
16824
16825 case E_V8SFmode:
16826 if (TARGET_AVX)
16827 {
16828 tmp = gen_reg_rtx (V4SFmode);
16829 if (elt < 4)
16830 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
16831 else
16832 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
16833 ix86_expand_vector_extract (false, target, tmp, elt & 3);
16834 return;
16835 }
16836 break;
16837
16838 case E_V4DFmode:
16839 if (TARGET_AVX)
16840 {
16841 tmp = gen_reg_rtx (V2DFmode);
16842 if (elt < 2)
16843 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
16844 else
16845 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
16846 ix86_expand_vector_extract (false, target, tmp, elt & 1);
16847 return;
16848 }
16849 break;
16850
16851 case E_V32QImode:
16852 if (TARGET_AVX)
16853 {
16854 tmp = gen_reg_rtx (V16QImode);
16855 if (elt < 16)
16856 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
16857 else
16858 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
16859 ix86_expand_vector_extract (false, target, tmp, elt & 15);
16860 return;
16861 }
16862 break;
16863
16864 case E_V16HImode:
16865 if (TARGET_AVX)
16866 {
16867 tmp = gen_reg_rtx (V8HImode);
16868 if (elt < 8)
16869 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
16870 else
16871 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
16872 ix86_expand_vector_extract (false, target, tmp, elt & 7);
16873 return;
16874 }
16875 break;
16876
16877 case E_V8SImode:
16878 if (TARGET_AVX)
16879 {
16880 tmp = gen_reg_rtx (V4SImode);
16881 if (elt < 4)
16882 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
16883 else
16884 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
16885 ix86_expand_vector_extract (false, target, tmp, elt & 3);
16886 return;
16887 }
16888 break;
16889
16890 case E_V4DImode:
16891 if (TARGET_AVX)
16892 {
16893 tmp = gen_reg_rtx (V2DImode);
16894 if (elt < 2)
16895 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
16896 else
16897 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
16898 ix86_expand_vector_extract (false, target, tmp, elt & 1);
16899 return;
16900 }
16901 break;
16902
16903 case E_V32HImode:
16904 if (TARGET_AVX512BW)
16905 {
16906 tmp = gen_reg_rtx (V16HImode);
16907 if (elt < 16)
16908 emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
16909 else
16910 emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
16911 ix86_expand_vector_extract (false, target, tmp, elt & 15);
16912 return;
16913 }
16914 break;
16915
16916 case E_V64QImode:
16917 if (TARGET_AVX512BW)
16918 {
16919 tmp = gen_reg_rtx (V32QImode);
16920 if (elt < 32)
16921 emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
16922 else
16923 emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
16924 ix86_expand_vector_extract (false, target, tmp, elt & 31);
16925 return;
16926 }
16927 break;
16928
16929 case E_V16SFmode:
16930 tmp = gen_reg_rtx (V8SFmode);
16931 if (elt < 8)
16932 emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
16933 else
16934 emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
16935 ix86_expand_vector_extract (false, target, tmp, elt & 7);
16936 return;
16937
16938 case E_V8DFmode:
16939 tmp = gen_reg_rtx (V4DFmode);
16940 if (elt < 4)
16941 emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
16942 else
16943 emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
16944 ix86_expand_vector_extract (false, target, tmp, elt & 3);
16945 return;
16946
16947 case E_V16SImode:
16948 tmp = gen_reg_rtx (V8SImode);
16949 if (elt < 8)
16950 emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
16951 else
16952 emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
16953 ix86_expand_vector_extract (false, target, tmp, elt & 7);
16954 return;
16955
16956 case E_V8DImode:
16957 tmp = gen_reg_rtx (V4DImode);
16958 if (elt < 4)
16959 emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
16960 else
16961 emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
16962 ix86_expand_vector_extract (false, target, tmp, elt & 3);
16963 return;
16964
9e2a82e1 16965 case E_V32HFmode:
6910cad5 16966 case E_V32BFmode:
7a54d3de
UB
16967 if (TARGET_AVX512BW)
16968 {
6910cad5 16969 tmp = (mode == E_V32HFmode
16970 ? gen_reg_rtx (V16HFmode)
16971 : gen_reg_rtx (V16BFmode));
7a54d3de 16972 if (elt < 16)
6910cad5 16973 emit_insn (maybe_gen_vec_extract_lo (mode, tmp, vec));
7a54d3de 16974 else
6910cad5 16975 emit_insn (maybe_gen_vec_extract_hi (mode, tmp, vec));
7a54d3de
UB
16976 ix86_expand_vector_extract (false, target, tmp, elt & 15);
16977 return;
16978 }
16979 break;
9e2a82e1 16980
16981 case E_V16HFmode:
6910cad5 16982 case E_V16BFmode:
7a54d3de
UB
16983 if (TARGET_AVX)
16984 {
6910cad5 16985 tmp = (mode == E_V16HFmode
16986 ? gen_reg_rtx (V8HFmode)
16987 : gen_reg_rtx (V8BFmode));
7a54d3de 16988 if (elt < 8)
6910cad5 16989 emit_insn (maybe_gen_vec_extract_lo (mode, tmp, vec));
7a54d3de 16990 else
6910cad5 16991 emit_insn (maybe_gen_vec_extract_hi (mode, tmp, vec));
7a54d3de
UB
16992 ix86_expand_vector_extract (false, target, tmp, elt & 7);
16993 return;
16994 }
9e2a82e1 16995 break;
16996
2bf6d935 16997 case E_V8QImode:
5fbc8ab4 16998 use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
2bf6d935 16999 /* ??? Could extract the appropriate HImode element and shift. */
5fbc8ab4
UB
17000 break;
17001
2bf6d935
ML
17002 default:
17003 break;
17004 }
17005
17006 if (use_vec_extr)
17007 {
17008 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
17009 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
17010
17011 /* Let the rtl optimizers know about the zero extension performed. */
17012 if (inner_mode == QImode || inner_mode == HImode)
17013 {
97c32001 17014 rtx reg = gen_reg_rtx (SImode);
2bf6d935 17015 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
97c32001
RS
17016 emit_move_insn (reg, tmp);
17017 tmp = gen_lowpart (inner_mode, reg);
17018 SUBREG_PROMOTED_VAR_P (tmp) = 1;
17019 SUBREG_PROMOTED_SET (tmp, 1);
2bf6d935
ML
17020 }
17021
97c32001 17022 emit_move_insn (target, tmp);
2bf6d935
ML
17023 }
17024 else
17025 {
17026 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
17027
17028 emit_move_insn (mem, vec);
17029
17030 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
17031 emit_move_insn (target, tmp);
17032 }
17033}
17034
17035/* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
17036 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
17037 The upper bits of DEST are undefined, though they shouldn't cause
17038 exceptions (some bits from src or all zeros are ok). */
17039
17040static void
17041emit_reduc_half (rtx dest, rtx src, int i)
17042{
17043 rtx tem, d = dest;
17044 switch (GET_MODE (src))
17045 {
17046 case E_V4SFmode:
17047 if (i == 128)
17048 tem = gen_sse_movhlps (dest, src, src);
17049 else
17050 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
17051 GEN_INT (1 + 4), GEN_INT (1 + 4));
17052 break;
17053 case E_V2DFmode:
17054 tem = gen_vec_interleave_highv2df (dest, src, src);
17055 break;
73c535a0 17056 case E_V4QImode:
17057 d = gen_reg_rtx (V1SImode);
17058 tem = gen_mmx_lshrv1si3 (d, gen_lowpart (V1SImode, src),
17059 GEN_INT (i / 2));
17060 break;
77ca2cfc 17061 case E_V4HImode:
17062 d = gen_reg_rtx (V1DImode);
17063 tem = gen_mmx_lshrv1di3 (d, gen_lowpart (V1DImode, src),
17064 GEN_INT (i / 2));
17065 break;
2bf6d935
ML
17066 case E_V16QImode:
17067 case E_V8HImode:
3540429b 17068 case E_V8HFmode:
2bf6d935
ML
17069 case E_V4SImode:
17070 case E_V2DImode:
17071 d = gen_reg_rtx (V1TImode);
17072 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
17073 GEN_INT (i / 2));
17074 break;
17075 case E_V8SFmode:
17076 if (i == 256)
17077 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
17078 else
17079 tem = gen_avx_shufps256 (dest, src, src,
17080 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
17081 break;
17082 case E_V4DFmode:
17083 if (i == 256)
17084 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
17085 else
17086 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
17087 break;
17088 case E_V32QImode:
17089 case E_V16HImode:
3540429b 17090 case E_V16HFmode:
2bf6d935
ML
17091 case E_V8SImode:
17092 case E_V4DImode:
17093 if (i == 256)
17094 {
17095 if (GET_MODE (dest) != V4DImode)
17096 d = gen_reg_rtx (V4DImode);
17097 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
17098 gen_lowpart (V4DImode, src),
17099 const1_rtx);
17100 }
17101 else
17102 {
17103 d = gen_reg_rtx (V2TImode);
17104 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
17105 GEN_INT (i / 2));
17106 }
17107 break;
17108 case E_V64QImode:
17109 case E_V32HImode:
3540429b 17110 case E_V32HFmode:
bee27152
JJ
17111 if (i < 64)
17112 {
17113 d = gen_reg_rtx (V4TImode);
17114 tem = gen_avx512bw_lshrv4ti3 (d, gen_lowpart (V4TImode, src),
17115 GEN_INT (i / 2));
17116 break;
17117 }
17118 /* FALLTHRU */
2bf6d935
ML
17119 case E_V16SImode:
17120 case E_V16SFmode:
17121 case E_V8DImode:
17122 case E_V8DFmode:
17123 if (i > 128)
17124 tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
bee27152
JJ
17125 gen_lowpart (V16SImode, src),
17126 gen_lowpart (V16SImode, src),
17127 GEN_INT (0x4 + (i == 512 ? 4 : 0)),
17128 GEN_INT (0x5 + (i == 512 ? 4 : 0)),
17129 GEN_INT (0x6 + (i == 512 ? 4 : 0)),
17130 GEN_INT (0x7 + (i == 512 ? 4 : 0)),
17131 GEN_INT (0xC), GEN_INT (0xD),
17132 GEN_INT (0xE), GEN_INT (0xF),
17133 GEN_INT (0x10), GEN_INT (0x11),
17134 GEN_INT (0x12), GEN_INT (0x13),
17135 GEN_INT (0x14), GEN_INT (0x15),
17136 GEN_INT (0x16), GEN_INT (0x17));
2bf6d935
ML
17137 else
17138 tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
bee27152
JJ
17139 gen_lowpart (V16SImode, src),
17140 GEN_INT (i == 128 ? 0x2 : 0x1),
17141 GEN_INT (0x3),
17142 GEN_INT (0x3),
17143 GEN_INT (0x3),
17144 GEN_INT (i == 128 ? 0x6 : 0x5),
17145 GEN_INT (0x7),
17146 GEN_INT (0x7),
17147 GEN_INT (0x7),
17148 GEN_INT (i == 128 ? 0xA : 0x9),
17149 GEN_INT (0xB),
17150 GEN_INT (0xB),
17151 GEN_INT (0xB),
17152 GEN_INT (i == 128 ? 0xE : 0xD),
17153 GEN_INT (0xF),
17154 GEN_INT (0xF),
17155 GEN_INT (0xF));
2bf6d935
ML
17156 break;
17157 default:
17158 gcc_unreachable ();
17159 }
17160 emit_insn (tem);
17161 if (d != dest)
17162 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
17163}
17164
17165/* Expand a vector reduction. FN is the binary pattern to reduce;
17166 DEST is the destination; IN is the input vector. */
17167
17168void
17169ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
17170{
17171 rtx half, dst, vec = in;
17172 machine_mode mode = GET_MODE (in);
17173 int i;
17174
17175 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
17176 if (TARGET_SSE4_1
17177 && mode == V8HImode
17178 && fn == gen_uminv8hi3)
17179 {
17180 emit_insn (gen_sse4_1_phminposuw (dest, in));
17181 return;
17182 }
17183
17184 for (i = GET_MODE_BITSIZE (mode);
17185 i > GET_MODE_UNIT_BITSIZE (mode);
17186 i >>= 1)
17187 {
17188 half = gen_reg_rtx (mode);
17189 emit_reduc_half (half, vec, i);
17190 if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
17191 dst = dest;
17192 else
17193 dst = gen_reg_rtx (mode);
17194 emit_insn (fn (dst, half, vec));
17195 vec = dst;
17196 }
17197}
17198
17199/* Output code to perform a conditional jump to LABEL, if C2 flag in
17200 FP status register is set. */
17201
17202void
17203ix86_emit_fp_unordered_jump (rtx label)
17204{
17205 rtx reg = gen_reg_rtx (HImode);
17206 rtx_insn *insn;
17207 rtx temp;
17208
17209 emit_insn (gen_x86_fnstsw_1 (reg));
17210
17211 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
17212 {
17213 emit_insn (gen_x86_sahf_1 (reg));
17214
17215 temp = gen_rtx_REG (CCmode, FLAGS_REG);
17216 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
17217 }
17218 else
17219 {
17220 emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
17221
17222 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
17223 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
17224 }
17225
17226 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
17227 gen_rtx_LABEL_REF (VOIDmode, label),
17228 pc_rtx);
17229 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
17230 predict_jump (REG_BR_PROB_BASE * 10 / 100);
17231 JUMP_LABEL (insn) = label;
17232}
17233
17234/* Output code to perform an sinh XFmode calculation. */
17235
152f243f
JJ
17236void
17237ix86_emit_i387_sinh (rtx op0, rtx op1)
2bf6d935
ML
17238{
17239 rtx e1 = gen_reg_rtx (XFmode);
17240 rtx e2 = gen_reg_rtx (XFmode);
17241 rtx scratch = gen_reg_rtx (HImode);
17242 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
17243 rtx half = const_double_from_real_value (dconsthalf, XFmode);
17244 rtx cst1, tmp;
17245 rtx_code_label *jump_label = gen_label_rtx ();
17246 rtx_insn *insn;
17247
17248 /* scratch = fxam (op1) */
17249 emit_insn (gen_fxamxf2_i387 (scratch, op1));
17250
17251 /* e1 = expm1 (|op1|) */
17252 emit_insn (gen_absxf2 (e2, op1));
17253 emit_insn (gen_expm1xf2 (e1, e2));
17254
17255 /* e2 = e1 / (e1 + 1.0) + e1 */
17256 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17257 emit_insn (gen_addxf3 (e2, e1, cst1));
17258 emit_insn (gen_divxf3 (e2, e1, e2));
17259 emit_insn (gen_addxf3 (e2, e2, e1));
17260
17261 /* flags = signbit (op1) */
17262 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
17263
17264 /* if (flags) then e2 = -e2 */
17265 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
17266 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
17267 gen_rtx_LABEL_REF (VOIDmode, jump_label),
17268 pc_rtx);
17269 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17270 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17271 JUMP_LABEL (insn) = jump_label;
17272
17273 emit_insn (gen_negxf2 (e2, e2));
17274
17275 emit_label (jump_label);
17276 LABEL_NUSES (jump_label) = 1;
17277
17278 /* op0 = 0.5 * e2 */
17279 half = force_reg (XFmode, half);
17280 emit_insn (gen_mulxf3 (op0, e2, half));
17281}
17282
17283/* Output code to perform an cosh XFmode calculation. */
17284
152f243f
JJ
17285void
17286ix86_emit_i387_cosh (rtx op0, rtx op1)
2bf6d935
ML
17287{
17288 rtx e1 = gen_reg_rtx (XFmode);
17289 rtx e2 = gen_reg_rtx (XFmode);
17290 rtx half = const_double_from_real_value (dconsthalf, XFmode);
17291 rtx cst1;
17292
17293 /* e1 = exp (op1) */
17294 emit_insn (gen_expxf2 (e1, op1));
17295
17296 /* e2 = e1 + 1.0 / e1 */
17297 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17298 emit_insn (gen_divxf3 (e2, cst1, e1));
17299 emit_insn (gen_addxf3 (e2, e1, e2));
17300
17301 /* op0 = 0.5 * e2 */
17302 half = force_reg (XFmode, half);
17303 emit_insn (gen_mulxf3 (op0, e2, half));
17304}
17305
17306/* Output code to perform an tanh XFmode calculation. */
17307
152f243f
JJ
17308void
17309ix86_emit_i387_tanh (rtx op0, rtx op1)
2bf6d935
ML
17310{
17311 rtx e1 = gen_reg_rtx (XFmode);
17312 rtx e2 = gen_reg_rtx (XFmode);
17313 rtx scratch = gen_reg_rtx (HImode);
17314 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
17315 rtx cst2, tmp;
17316 rtx_code_label *jump_label = gen_label_rtx ();
17317 rtx_insn *insn;
17318
17319 /* scratch = fxam (op1) */
17320 emit_insn (gen_fxamxf2_i387 (scratch, op1));
17321
17322 /* e1 = expm1 (-|2 * op1|) */
17323 emit_insn (gen_addxf3 (e2, op1, op1));
17324 emit_insn (gen_absxf2 (e2, e2));
17325 emit_insn (gen_negxf2 (e2, e2));
17326 emit_insn (gen_expm1xf2 (e1, e2));
17327
17328 /* e2 = e1 / (e1 + 2.0) */
17329 cst2 = force_reg (XFmode, CONST2_RTX (XFmode));
17330 emit_insn (gen_addxf3 (e2, e1, cst2));
17331 emit_insn (gen_divxf3 (e2, e1, e2));
17332
17333 /* flags = signbit (op1) */
17334 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
17335
17336 /* if (!flags) then e2 = -e2 */
17337 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
17338 gen_rtx_NE (VOIDmode, flags, const0_rtx),
17339 gen_rtx_LABEL_REF (VOIDmode, jump_label),
17340 pc_rtx);
17341 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17342 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17343 JUMP_LABEL (insn) = jump_label;
17344
17345 emit_insn (gen_negxf2 (e2, e2));
17346
17347 emit_label (jump_label);
17348 LABEL_NUSES (jump_label) = 1;
17349
17350 emit_move_insn (op0, e2);
17351}
17352
17353/* Output code to perform an asinh XFmode calculation. */
17354
152f243f
JJ
17355void
17356ix86_emit_i387_asinh (rtx op0, rtx op1)
2bf6d935
ML
17357{
17358 rtx e1 = gen_reg_rtx (XFmode);
17359 rtx e2 = gen_reg_rtx (XFmode);
17360 rtx scratch = gen_reg_rtx (HImode);
17361 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
17362 rtx cst1, tmp;
17363 rtx_code_label *jump_label = gen_label_rtx ();
17364 rtx_insn *insn;
17365
17366 /* e2 = sqrt (op1^2 + 1.0) + 1.0 */
17367 emit_insn (gen_mulxf3 (e1, op1, op1));
17368 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17369 emit_insn (gen_addxf3 (e2, e1, cst1));
17370 emit_insn (gen_sqrtxf2 (e2, e2));
17371 emit_insn (gen_addxf3 (e2, e2, cst1));
17372
17373 /* e1 = e1 / e2 */
17374 emit_insn (gen_divxf3 (e1, e1, e2));
17375
17376 /* scratch = fxam (op1) */
17377 emit_insn (gen_fxamxf2_i387 (scratch, op1));
17378
17379 /* e1 = e1 + |op1| */
17380 emit_insn (gen_absxf2 (e2, op1));
17381 emit_insn (gen_addxf3 (e1, e1, e2));
17382
17383 /* e2 = log1p (e1) */
17384 ix86_emit_i387_log1p (e2, e1);
17385
17386 /* flags = signbit (op1) */
17387 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
17388
17389 /* if (flags) then e2 = -e2 */
17390 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
17391 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
17392 gen_rtx_LABEL_REF (VOIDmode, jump_label),
17393 pc_rtx);
17394 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17395 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17396 JUMP_LABEL (insn) = jump_label;
17397
17398 emit_insn (gen_negxf2 (e2, e2));
17399
17400 emit_label (jump_label);
17401 LABEL_NUSES (jump_label) = 1;
17402
17403 emit_move_insn (op0, e2);
17404}
17405
17406/* Output code to perform an acosh XFmode calculation. */
17407
152f243f
JJ
17408void
17409ix86_emit_i387_acosh (rtx op0, rtx op1)
2bf6d935
ML
17410{
17411 rtx e1 = gen_reg_rtx (XFmode);
17412 rtx e2 = gen_reg_rtx (XFmode);
17413 rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17414
17415 /* e2 = sqrt (op1 + 1.0) */
17416 emit_insn (gen_addxf3 (e2, op1, cst1));
17417 emit_insn (gen_sqrtxf2 (e2, e2));
17418
17419 /* e1 = sqrt (op1 - 1.0) */
17420 emit_insn (gen_subxf3 (e1, op1, cst1));
17421 emit_insn (gen_sqrtxf2 (e1, e1));
17422
17423 /* e1 = e1 * e2 */
17424 emit_insn (gen_mulxf3 (e1, e1, e2));
17425
17426 /* e1 = e1 + op1 */
17427 emit_insn (gen_addxf3 (e1, e1, op1));
17428
17429 /* op0 = log (e1) */
17430 emit_insn (gen_logxf2 (op0, e1));
17431}
17432
17433/* Output code to perform an atanh XFmode calculation. */
17434
152f243f
JJ
17435void
17436ix86_emit_i387_atanh (rtx op0, rtx op1)
2bf6d935
ML
17437{
17438 rtx e1 = gen_reg_rtx (XFmode);
17439 rtx e2 = gen_reg_rtx (XFmode);
17440 rtx scratch = gen_reg_rtx (HImode);
17441 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
17442 rtx half = const_double_from_real_value (dconsthalf, XFmode);
17443 rtx cst1, tmp;
17444 rtx_code_label *jump_label = gen_label_rtx ();
17445 rtx_insn *insn;
17446
17447 /* scratch = fxam (op1) */
17448 emit_insn (gen_fxamxf2_i387 (scratch, op1));
17449
17450 /* e2 = |op1| */
17451 emit_insn (gen_absxf2 (e2, op1));
17452
17453 /* e1 = -(e2 + e2) / (e2 + 1.0) */
17454 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17455 emit_insn (gen_addxf3 (e1, e2, cst1));
17456 emit_insn (gen_addxf3 (e2, e2, e2));
17457 emit_insn (gen_negxf2 (e2, e2));
17458 emit_insn (gen_divxf3 (e1, e2, e1));
17459
17460 /* e2 = log1p (e1) */
17461 ix86_emit_i387_log1p (e2, e1);
17462
17463 /* flags = signbit (op1) */
17464 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
17465
17466 /* if (!flags) then e2 = -e2 */
17467 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
17468 gen_rtx_NE (VOIDmode, flags, const0_rtx),
17469 gen_rtx_LABEL_REF (VOIDmode, jump_label),
17470 pc_rtx);
17471 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17472 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17473 JUMP_LABEL (insn) = jump_label;
17474
17475 emit_insn (gen_negxf2 (e2, e2));
17476
17477 emit_label (jump_label);
17478 LABEL_NUSES (jump_label) = 1;
17479
17480 /* op0 = 0.5 * e2 */
17481 half = force_reg (XFmode, half);
17482 emit_insn (gen_mulxf3 (op0, e2, half));
17483}
17484
17485/* Output code to perform a log1p XFmode calculation. */
17486
152f243f
JJ
17487void
17488ix86_emit_i387_log1p (rtx op0, rtx op1)
2bf6d935
ML
17489{
17490 rtx_code_label *label1 = gen_label_rtx ();
17491 rtx_code_label *label2 = gen_label_rtx ();
17492
17493 rtx tmp = gen_reg_rtx (XFmode);
17494 rtx res = gen_reg_rtx (XFmode);
17495 rtx cst, cstln2, cst1;
17496 rtx_insn *insn;
17497
d481d137
JJ
17498 /* The emit_jump call emits pending stack adjust, make sure it is emitted
17499 before the conditional jump, otherwise the stack adjustment will be
17500 only conditional. */
17501 do_pending_stack_adjust ();
17502
2bf6d935
ML
17503 cst = const_double_from_real_value
17504 (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode);
17505 cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */
17506
17507 emit_insn (gen_absxf2 (tmp, op1));
17508
17509 cst = force_reg (XFmode, cst);
17510 ix86_expand_branch (GE, tmp, cst, label1);
17511 predict_jump (REG_BR_PROB_BASE * 10 / 100);
17512 insn = get_last_insn ();
17513 JUMP_LABEL (insn) = label1;
17514
17515 emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2));
17516 emit_jump (label2);
17517
17518 emit_label (label1);
17519 LABEL_NUSES (label1) = 1;
17520
17521 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17522 emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1)));
17523 emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2));
17524
17525 emit_label (label2);
17526 LABEL_NUSES (label2) = 1;
17527
17528 emit_move_insn (op0, res);
17529}
17530
17531/* Emit code for round calculation. */
152f243f
JJ
17532void
17533ix86_emit_i387_round (rtx op0, rtx op1)
2bf6d935
ML
17534{
17535 machine_mode inmode = GET_MODE (op1);
17536 machine_mode outmode = GET_MODE (op0);
17537 rtx e1 = gen_reg_rtx (XFmode);
17538 rtx e2 = gen_reg_rtx (XFmode);
17539 rtx scratch = gen_reg_rtx (HImode);
17540 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
17541 rtx half = const_double_from_real_value (dconsthalf, XFmode);
17542 rtx res = gen_reg_rtx (outmode);
17543 rtx_code_label *jump_label = gen_label_rtx ();
17544 rtx (*floor_insn) (rtx, rtx);
17545 rtx (*neg_insn) (rtx, rtx);
17546 rtx_insn *insn;
17547 rtx tmp;
17548
17549 switch (inmode)
17550 {
17551 case E_SFmode:
17552 case E_DFmode:
17553 tmp = gen_reg_rtx (XFmode);
17554
17555 emit_insn (gen_rtx_SET (tmp, gen_rtx_FLOAT_EXTEND (XFmode, op1)));
17556 op1 = tmp;
17557 break;
17558 case E_XFmode:
17559 break;
17560 default:
17561 gcc_unreachable ();
17562 }
17563
17564 switch (outmode)
17565 {
17566 case E_SFmode:
17567 floor_insn = gen_frndintxf2_floor;
17568 neg_insn = gen_negsf2;
17569 break;
17570 case E_DFmode:
17571 floor_insn = gen_frndintxf2_floor;
17572 neg_insn = gen_negdf2;
17573 break;
17574 case E_XFmode:
17575 floor_insn = gen_frndintxf2_floor;
17576 neg_insn = gen_negxf2;
17577 break;
17578 case E_HImode:
17579 floor_insn = gen_lfloorxfhi2;
17580 neg_insn = gen_neghi2;
17581 break;
17582 case E_SImode:
17583 floor_insn = gen_lfloorxfsi2;
17584 neg_insn = gen_negsi2;
17585 break;
17586 case E_DImode:
17587 floor_insn = gen_lfloorxfdi2;
17588 neg_insn = gen_negdi2;
17589 break;
17590 default:
17591 gcc_unreachable ();
17592 }
17593
17594 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
17595
17596 /* scratch = fxam(op1) */
17597 emit_insn (gen_fxamxf2_i387 (scratch, op1));
17598
17599 /* e1 = fabs(op1) */
17600 emit_insn (gen_absxf2 (e1, op1));
17601
17602 /* e2 = e1 + 0.5 */
17603 half = force_reg (XFmode, half);
17604 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (XFmode, e1, half)));
17605
17606 /* res = floor(e2) */
17607 switch (outmode)
17608 {
17609 case E_SFmode:
17610 case E_DFmode:
17611 {
17612 tmp = gen_reg_rtx (XFmode);
17613
17614 emit_insn (floor_insn (tmp, e2));
17615 emit_insn (gen_rtx_SET (res,
17616 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp),
17617 UNSPEC_TRUNC_NOOP)));
17618 }
17619 break;
17620 default:
17621 emit_insn (floor_insn (res, e2));
17622 }
17623
17624 /* flags = signbit(a) */
17625 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
17626
17627 /* if (flags) then res = -res */
17628 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
17629 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
17630 gen_rtx_LABEL_REF (VOIDmode, jump_label),
17631 pc_rtx);
17632 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17633 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17634 JUMP_LABEL (insn) = jump_label;
17635
17636 emit_insn (neg_insn (res, res));
17637
17638 emit_label (jump_label);
17639 LABEL_NUSES (jump_label) = 1;
17640
17641 emit_move_insn (op0, res);
17642}
17643
17644/* Output code to perform a Newton-Rhapson approximation of a single precision
17645 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
17646
152f243f
JJ
17647void
17648ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
2bf6d935
ML
17649{
17650 rtx x0, x1, e0, e1;
17651
17652 x0 = gen_reg_rtx (mode);
17653 e0 = gen_reg_rtx (mode);
17654 e1 = gen_reg_rtx (mode);
17655 x1 = gen_reg_rtx (mode);
17656
17657 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
17658
17659 b = force_reg (mode, b);
17660
17661 /* x0 = rcp(b) estimate */
17662 if (mode == V16SFmode || mode == V8DFmode)
17663 {
17664 if (TARGET_AVX512ER)
17665 {
17666 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
17667 UNSPEC_RCP28)));
17668 /* res = a * x0 */
17669 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
17670 return;
17671 }
17672 else
17673 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
17674 UNSPEC_RCP14)));
17675 }
17676 else
17677 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
17678 UNSPEC_RCP)));
17679
17680 /* e0 = x0 * b */
17681 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
17682
17683 /* e0 = x0 * e0 */
17684 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
17685
17686 /* e1 = x0 + x0 */
17687 emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
17688
17689 /* x1 = e1 - e0 */
17690 emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
17691
17692 /* res = a * x1 */
17693 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
17694}
17695
17696/* Output code to perform a Newton-Rhapson approximation of a
17697 single precision floating point [reciprocal] square root. */
17698
152f243f
JJ
17699void
17700ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
2bf6d935
ML
17701{
17702 rtx x0, e0, e1, e2, e3, mthree, mhalf;
17703 REAL_VALUE_TYPE r;
17704 int unspec;
17705
17706 x0 = gen_reg_rtx (mode);
17707 e0 = gen_reg_rtx (mode);
17708 e1 = gen_reg_rtx (mode);
17709 e2 = gen_reg_rtx (mode);
17710 e3 = gen_reg_rtx (mode);
17711
17712 if (TARGET_AVX512ER && mode == V16SFmode)
17713 {
17714 if (recip)
17715 /* res = rsqrt28(a) estimate */
17716 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
17717 UNSPEC_RSQRT28)));
17718 else
17719 {
17720 /* x0 = rsqrt28(a) estimate */
17721 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
17722 UNSPEC_RSQRT28)));
17723 /* res = rcp28(x0) estimate */
17724 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
17725 UNSPEC_RCP28)));
17726 }
17727 return;
17728 }
17729
17730 real_from_integer (&r, VOIDmode, -3, SIGNED);
17731 mthree = const_double_from_real_value (r, SFmode);
17732
17733 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
17734 mhalf = const_double_from_real_value (r, SFmode);
17735 unspec = UNSPEC_RSQRT;
17736
17737 if (VECTOR_MODE_P (mode))
17738 {
17739 mthree = ix86_build_const_vector (mode, true, mthree);
17740 mhalf = ix86_build_const_vector (mode, true, mhalf);
17741 /* There is no 512-bit rsqrt. There is however rsqrt14. */
17742 if (GET_MODE_SIZE (mode) == 64)
17743 unspec = UNSPEC_RSQRT14;
17744 }
17745
17746 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
17747 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
17748
17749 a = force_reg (mode, a);
17750
17751 /* x0 = rsqrt(a) estimate */
17752 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
17753 unspec)));
17754
17755 /* If (a == 0.0) Filter out infinity to prevent NaN for sqrt(0.0). */
17756 if (!recip)
17757 {
17758 rtx zero = force_reg (mode, CONST0_RTX(mode));
17759 rtx mask;
17760
17761 /* Handle masked compare. */
17762 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
17763 {
17764 mask = gen_reg_rtx (HImode);
17765 /* Imm value 0x4 corresponds to not-equal comparison. */
17766 emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
17767 emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
17768 }
17769 else
17770 {
17771 mask = gen_reg_rtx (mode);
17772 emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
17773 emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
17774 }
17775 }
17776
fab263ab
L
17777 mthree = force_reg (mode, mthree);
17778
2bf6d935
ML
17779 /* e0 = x0 * a */
17780 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
2bf6d935 17781
a6645a82
L
17782 unsigned vector_size = GET_MODE_SIZE (mode);
17783 if (TARGET_FMA
17784 || (TARGET_AVX512F && vector_size == 64)
17785 || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
fab263ab
L
17786 emit_insn (gen_rtx_SET (e2,
17787 gen_rtx_FMA (mode, e0, x0, mthree)));
17788 else
17789 {
17790 /* e1 = e0 * x0 */
17791 emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
17792
17793 /* e2 = e1 - 3. */
17794 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
17795 }
2bf6d935
ML
17796
17797 mhalf = force_reg (mode, mhalf);
17798 if (recip)
17799 /* e3 = -.5 * x0 */
17800 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
17801 else
17802 /* e3 = -.5 * e0 */
17803 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
17804 /* ret = e2 * e3 */
17805 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
17806}
17807
17808/* Expand fabs (OP0) and return a new rtx that holds the result. The
17809 mask for masking out the sign-bit is stored in *SMASK, if that is
17810 non-null. */
17811
17812static rtx
17813ix86_expand_sse_fabs (rtx op0, rtx *smask)
17814{
17815 machine_mode vmode, mode = GET_MODE (op0);
17816 rtx xa, mask;
17817
17818 xa = gen_reg_rtx (mode);
17819 if (mode == SFmode)
17820 vmode = V4SFmode;
17821 else if (mode == DFmode)
17822 vmode = V2DFmode;
17823 else
17824 vmode = mode;
17825 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
17826 if (!VECTOR_MODE_P (mode))
17827 {
17828 /* We need to generate a scalar mode mask in this case. */
17829 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
17830 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
17831 mask = gen_reg_rtx (mode);
17832 emit_insn (gen_rtx_SET (mask, tmp));
17833 }
17834 emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
17835
17836 if (smask)
17837 *smask = mask;
17838
17839 return xa;
17840}
17841
17842/* Expands a comparison of OP0 with OP1 using comparison code CODE,
17843 swapping the operands if SWAP_OPERANDS is true. The expanded
17844 code is a forward jump to a newly created label in case the
17845 comparison is true. The generated label rtx is returned. */
17846static rtx_code_label *
17847ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
17848 bool swap_operands)
17849{
17850 bool unordered_compare = ix86_unordered_fp_compare (code);
17851 rtx_code_label *label;
17852 rtx tmp, reg;
17853
17854 if (swap_operands)
17855 std::swap (op0, op1);
17856
17857 label = gen_label_rtx ();
17858 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
17859 if (unordered_compare)
17860 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
17861 reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
17862 emit_insn (gen_rtx_SET (reg, tmp));
17863 tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
17864 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
17865 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
17866 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17867 JUMP_LABEL (tmp) = label;
17868
17869 return label;
17870}
17871
17872/* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
17873 using comparison code CODE. Operands are swapped for the comparison if
17874 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
17875static rtx
17876ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
17877 bool swap_operands)
17878{
17879 rtx (*insn)(rtx, rtx, rtx, rtx);
17880 machine_mode mode = GET_MODE (op0);
17881 rtx mask = gen_reg_rtx (mode);
17882
17883 if (swap_operands)
17884 std::swap (op0, op1);
17885
17886 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
17887
17888 emit_insn (insn (mask, op0, op1,
17889 gen_rtx_fmt_ee (code, mode, op0, op1)));
17890 return mask;
17891}
17892
17893/* Expand copysign from SIGN to the positive value ABS_VALUE
17894 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
17895 the sign-bit. */
17896
17897static void
17898ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
17899{
17900 machine_mode mode = GET_MODE (sign);
17901 rtx sgn = gen_reg_rtx (mode);
17902 if (mask == NULL_RTX)
17903 {
17904 machine_mode vmode;
17905
17906 if (mode == SFmode)
17907 vmode = V4SFmode;
17908 else if (mode == DFmode)
17909 vmode = V2DFmode;
17910 else
17911 vmode = mode;
17912
17913 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
17914 if (!VECTOR_MODE_P (mode))
17915 {
17916 /* We need to generate a scalar mode mask in this case. */
17917 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
17918 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
17919 mask = gen_reg_rtx (mode);
17920 emit_insn (gen_rtx_SET (mask, tmp));
17921 }
17922 }
17923 else
17924 mask = gen_rtx_NOT (mode, mask);
17925 emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
17926 emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
17927}
17928
17929/* Expand SSE sequence for computing lround from OP1 storing
17930 into OP0. */
17931
17932void
17933ix86_expand_lround (rtx op0, rtx op1)
17934{
17935 /* C code for the stuff we're doing below:
d2754fbb
UB
17936 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
17937 return (long)tmp;
2bf6d935
ML
17938 */
17939 machine_mode mode = GET_MODE (op1);
17940 const struct real_format *fmt;
17941 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
17942 rtx adj;
17943
17944 /* load nextafter (0.5, 0.0) */
17945 fmt = REAL_MODE_FORMAT (mode);
17946 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
17947 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
17948
17949 /* adj = copysign (0.5, op1) */
17950 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
17951 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
17952
17953 /* adj = op1 + adj */
17954 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
17955
17956 /* op0 = (imode)adj */
17957 expand_fix (op0, adj, 0);
17958}
17959
17960/* Expand SSE2 sequence for computing lround from OPERAND1 storing
17961 into OPERAND0. */
17962
17963void
17964ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
17965{
17966 /* C code for the stuff we're doing below (for do_floor):
17967 xi = (long)op1;
d2754fbb
UB
17968 xi -= (double)xi > op1 ? 1 : 0;
17969 return xi;
2bf6d935
ML
17970 */
17971 machine_mode fmode = GET_MODE (op1);
17972 machine_mode imode = GET_MODE (op0);
17973 rtx ireg, freg, tmp;
17974 rtx_code_label *label;
17975
17976 /* reg = (long)op1 */
17977 ireg = gen_reg_rtx (imode);
17978 expand_fix (ireg, op1, 0);
17979
17980 /* freg = (double)reg */
17981 freg = gen_reg_rtx (fmode);
17982 expand_float (freg, ireg, 0);
17983
17984 /* ireg = (freg > op1) ? ireg - 1 : ireg */
17985 label = ix86_expand_sse_compare_and_jump (UNLE,
17986 freg, op1, !do_floor);
17987 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
17988 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
17989 emit_move_insn (ireg, tmp);
17990
17991 emit_label (label);
17992 LABEL_NUSES (label) = 1;
17993
17994 emit_move_insn (op0, ireg);
17995}
17996
17997/* Generate and return a rtx of mode MODE for 2**n where n is the number
17998 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
17999
18000static rtx
18001ix86_gen_TWO52 (machine_mode mode)
18002{
d2754fbb 18003 const struct real_format *fmt;
2bf6d935
ML
18004 REAL_VALUE_TYPE TWO52r;
18005 rtx TWO52;
18006
d2754fbb
UB
18007 fmt = REAL_MODE_FORMAT (mode);
18008 real_2expN (&TWO52r, fmt->p - 1, mode);
2bf6d935
ML
18009 TWO52 = const_double_from_real_value (TWO52r, mode);
18010 TWO52 = force_reg (mode, TWO52);
18011
18012 return TWO52;
18013}
18014
18015/* Expand rint rounding OPERAND1 and storing the result in OPERAND0. */
18016
18017void
18018ix86_expand_rint (rtx operand0, rtx operand1)
18019{
18020 /* C code for the stuff we're doing below:
18021 xa = fabs (operand1);
d2754fbb 18022 if (!isless (xa, 2**52))
2bf6d935 18023 return operand1;
d2754fbb
UB
18024 two52 = 2**52;
18025 if (flag_rounding_math)
2bf6d935
ML
18026 {
18027 two52 = copysign (two52, operand1);
18028 xa = operand1;
18029 }
d2754fbb
UB
18030 xa = xa + two52 - two52;
18031 return copysign (xa, operand1);
2bf6d935
ML
18032 */
18033 machine_mode mode = GET_MODE (operand0);
81615bb0 18034 rtx res, xa, TWO52, mask;
2bf6d935
ML
18035 rtx_code_label *label;
18036
d2754fbb
UB
18037 TWO52 = ix86_gen_TWO52 (mode);
18038
18039 /* Temporary for holding the result, initialized to the input
18040 operand to ease control flow. */
18041 res = copy_to_reg (operand1);
2bf6d935
ML
18042
18043 /* xa = abs (operand1) */
18044 xa = ix86_expand_sse_fabs (res, &mask);
18045
18046 /* if (!isless (xa, TWO52)) goto label; */
2bf6d935
ML
18047 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18048
2bf6d935
ML
18049 if (flag_rounding_math)
18050 {
81615bb0 18051 ix86_sse_copysign_to_positive (TWO52, TWO52, res, mask);
2bf6d935
ML
18052 xa = res;
18053 }
18054
81615bb0
UB
18055 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
18056 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
18057
18058 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
18059 if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
18060 xa = ix86_expand_sse_fabs (xa, NULL);
2bf6d935
ML
18061
18062 ix86_sse_copysign_to_positive (res, xa, res, mask);
18063
18064 emit_label (label);
18065 LABEL_NUSES (label) = 1;
18066
18067 emit_move_insn (operand0, res);
18068}
18069
36d387f2
UB
18070/* Expand SSE2 sequence for computing floor or ceil
18071 from OPERAND1 storing into OPERAND0. */
2bf6d935
ML
18072void
18073ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
18074{
18075 /* C code for the stuff we expand below.
18076 double xa = fabs (x), x2;
d2754fbb
UB
18077 if (!isless (xa, TWO52))
18078 return x;
2bf6d935 18079 x2 = (double)(long)x;
337ed0eb 18080
2bf6d935
ML
18081 Compensate. Floor:
18082 if (x2 > x)
18083 x2 -= 1;
18084 Compensate. Ceil:
18085 if (x2 < x)
18086 x2 += 1;
337ed0eb 18087
2bf6d935
ML
18088 if (HONOR_SIGNED_ZEROS (mode))
18089 return copysign (x2, x);
18090 return x2;
18091 */
18092 machine_mode mode = GET_MODE (operand0);
18093 rtx xa, xi, TWO52, tmp, one, res, mask;
18094 rtx_code_label *label;
18095
18096 TWO52 = ix86_gen_TWO52 (mode);
18097
18098 /* Temporary for holding the result, initialized to the input
18099 operand to ease control flow. */
d2754fbb 18100 res = copy_to_reg (operand1);
2bf6d935
ML
18101
18102 /* xa = abs (operand1) */
18103 xa = ix86_expand_sse_fabs (res, &mask);
18104
18105 /* if (!isless (xa, TWO52)) goto label; */
18106 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18107
18108 /* xa = (double)(long)x */
d2754fbb 18109 xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
2bf6d935
ML
18110 expand_fix (xi, res, 0);
18111 expand_float (xa, xi, 0);
18112
18113 /* generate 1.0 */
18114 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
18115
18116 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
18117 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
18118 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
18119 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
18120 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
2bf6d935 18121 if (HONOR_SIGNED_ZEROS (mode))
337ed0eb
UB
18122 {
18123 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
18124 if (do_floor && flag_rounding_math)
18125 tmp = ix86_expand_sse_fabs (tmp, NULL);
18126
18127 ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
18128 }
18129 emit_move_insn (res, tmp);
2bf6d935
ML
18130
18131 emit_label (label);
18132 LABEL_NUSES (label) = 1;
18133
18134 emit_move_insn (operand0, res);
18135}
18136
36d387f2
UB
18137/* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
18138 into OPERAND0 without relying on DImode truncation via cvttsd2siq
18139 that is only available on 64bit targets. */
2bf6d935 18140void
36d387f2 18141ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
2bf6d935
ML
18142{
18143 /* C code for the stuff we expand below.
d2754fbb
UB
18144 double xa = fabs (x), x2;
18145 if (!isless (xa, TWO52))
18146 return x;
18147 xa = xa + TWO52 - TWO52;
18148 x2 = copysign (xa, x);
337ed0eb 18149
36d387f2 18150 Compensate. Floor:
d2754fbb
UB
18151 if (x2 > x)
18152 x2 -= 1;
36d387f2 18153 Compensate. Ceil:
d2754fbb
UB
18154 if (x2 < x)
18155 x2 += 1;
337ed0eb 18156
36d387f2
UB
18157 if (HONOR_SIGNED_ZEROS (mode))
18158 x2 = copysign (x2, x);
18159 return x2;
2bf6d935
ML
18160 */
18161 machine_mode mode = GET_MODE (operand0);
36d387f2 18162 rtx xa, TWO52, tmp, one, res, mask;
2bf6d935
ML
18163 rtx_code_label *label;
18164
18165 TWO52 = ix86_gen_TWO52 (mode);
18166
18167 /* Temporary for holding the result, initialized to the input
18168 operand to ease control flow. */
d2754fbb 18169 res = copy_to_reg (operand1);
2bf6d935
ML
18170
18171 /* xa = abs (operand1) */
18172 xa = ix86_expand_sse_fabs (res, &mask);
18173
18174 /* if (!isless (xa, TWO52)) goto label; */
18175 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18176
36d387f2
UB
18177 /* xa = xa + TWO52 - TWO52; */
18178 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
18179 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
2bf6d935 18180
36d387f2
UB
18181 /* xa = copysign (xa, operand1) */
18182 ix86_sse_copysign_to_positive (xa, xa, res, mask);
2bf6d935 18183
36d387f2
UB
18184 /* generate 1.0 */
18185 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
2bf6d935 18186
36d387f2
UB
18187 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
18188 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
18189 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
18190 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
18191 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
337ed0eb
UB
18192 if (HONOR_SIGNED_ZEROS (mode))
18193 {
18194 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
18195 if (do_floor && flag_rounding_math)
18196 tmp = ix86_expand_sse_fabs (tmp, NULL);
18197
18198 ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
18199 }
36d387f2 18200 emit_move_insn (res, tmp);
2bf6d935
ML
18201
18202 emit_label (label);
18203 LABEL_NUSES (label) = 1;
18204
18205 emit_move_insn (operand0, res);
18206}
18207
36d387f2
UB
18208/* Expand SSE sequence for computing trunc
18209 from OPERAND1 storing into OPERAND0. */
2bf6d935
ML
18210void
18211ix86_expand_trunc (rtx operand0, rtx operand1)
18212{
18213 /* C code for SSE variant we expand below.
d2754fbb
UB
18214 double xa = fabs (x), x2;
18215 if (!isless (xa, TWO52))
18216 return x;
18217 x2 = (double)(long)x;
2bf6d935
ML
18218 if (HONOR_SIGNED_ZEROS (mode))
18219 return copysign (x2, x);
18220 return x2;
18221 */
18222 machine_mode mode = GET_MODE (operand0);
18223 rtx xa, xi, TWO52, res, mask;
18224 rtx_code_label *label;
18225
18226 TWO52 = ix86_gen_TWO52 (mode);
18227
18228 /* Temporary for holding the result, initialized to the input
18229 operand to ease control flow. */
d2754fbb 18230 res = copy_to_reg (operand1);
2bf6d935
ML
18231
18232 /* xa = abs (operand1) */
18233 xa = ix86_expand_sse_fabs (res, &mask);
18234
18235 /* if (!isless (xa, TWO52)) goto label; */
18236 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18237
97d3ddcf 18238 /* xa = (double)(long)x */
d2754fbb 18239 xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
2bf6d935 18240 expand_fix (xi, res, 0);
97d3ddcf 18241 expand_float (xa, xi, 0);
2bf6d935
ML
18242
18243 if (HONOR_SIGNED_ZEROS (mode))
97d3ddcf
UB
18244 ix86_sse_copysign_to_positive (xa, xa, res, mask);
18245
18246 emit_move_insn (res, xa);
2bf6d935
ML
18247
18248 emit_label (label);
18249 LABEL_NUSES (label) = 1;
18250
18251 emit_move_insn (operand0, res);
18252}
18253
18254/* Expand SSE sequence for computing trunc from OPERAND1 storing
36d387f2
UB
18255 into OPERAND0 without relying on DImode truncation via cvttsd2siq
18256 that is only available on 64bit targets. */
2bf6d935
ML
18257void
18258ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
18259{
18260 machine_mode mode = GET_MODE (operand0);
c142ae5e 18261 rtx xa, xa2, TWO52, tmp, one, res, mask;
2bf6d935
ML
18262 rtx_code_label *label;
18263
18264 /* C code for SSE variant we expand below.
d2754fbb
UB
18265 double xa = fabs (x), x2;
18266 if (!isless (xa, TWO52))
18267 return x;
18268 xa2 = xa + TWO52 - TWO52;
2bf6d935 18269 Compensate:
d2754fbb
UB
18270 if (xa2 > xa)
18271 xa2 -= 1.0;
18272 x2 = copysign (xa2, x);
18273 return x2;
2bf6d935
ML
18274 */
18275
18276 TWO52 = ix86_gen_TWO52 (mode);
18277
18278 /* Temporary for holding the result, initialized to the input
18279 operand to ease control flow. */
d2754fbb 18280 res =copy_to_reg (operand1);
2bf6d935
ML
18281
18282 /* xa = abs (operand1) */
c142ae5e 18283 xa = ix86_expand_sse_fabs (res, &mask);
2bf6d935
ML
18284
18285 /* if (!isless (xa, TWO52)) goto label; */
18286 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18287
c142ae5e
UB
18288 /* xa2 = xa + TWO52 - TWO52; */
18289 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
18290 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
2bf6d935
ML
18291
18292 /* generate 1.0 */
18293 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
18294
c142ae5e
UB
18295 /* Compensate: xa2 = xa2 - (xa2 > xa ? 1 : 0) */
18296 tmp = ix86_expand_sse_compare_mask (UNGT, xa2, xa, false);
18297 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
2bf6d935 18298 tmp = expand_simple_binop (mode, MINUS,
c142ae5e
UB
18299 xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
18300 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
81615bb0 18301 if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
c142ae5e 18302 tmp = ix86_expand_sse_fabs (tmp, NULL);
2bf6d935 18303
c142ae5e
UB
18304 /* res = copysign (xa2, operand1) */
18305 ix86_sse_copysign_to_positive (res, tmp, res, mask);
2bf6d935
ML
18306
18307 emit_label (label);
18308 LABEL_NUSES (label) = 1;
18309
18310 emit_move_insn (operand0, res);
18311}
18312
36d387f2
UB
18313/* Expand SSE sequence for computing round
18314 from OPERAND1 storing into OPERAND0. */
2bf6d935
ML
18315void
18316ix86_expand_round (rtx operand0, rtx operand1)
18317{
18318 /* C code for the stuff we're doing below:
d2754fbb
UB
18319 double xa = fabs (x);
18320 if (!isless (xa, TWO52))
18321 return x;
18322 xa = (double)(long)(xa + nextafter (0.5, 0.0));
18323 return copysign (xa, x);
2bf6d935
ML
18324 */
18325 machine_mode mode = GET_MODE (operand0);
18326 rtx res, TWO52, xa, xi, half, mask;
18327 rtx_code_label *label;
18328 const struct real_format *fmt;
18329 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
18330
18331 /* Temporary for holding the result, initialized to the input
18332 operand to ease control flow. */
d2754fbb 18333 res = copy_to_reg (operand1);
2bf6d935
ML
18334
18335 TWO52 = ix86_gen_TWO52 (mode);
18336 xa = ix86_expand_sse_fabs (res, &mask);
18337 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18338
18339 /* load nextafter (0.5, 0.0) */
18340 fmt = REAL_MODE_FORMAT (mode);
18341 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
18342 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
18343
18344 /* xa = xa + 0.5 */
18345 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
18346 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
18347
18348 /* xa = (double)(int64_t)xa */
d2754fbb 18349 xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
2bf6d935
ML
18350 expand_fix (xi, xa, 0);
18351 expand_float (xa, xi, 0);
18352
18353 /* res = copysign (xa, operand1) */
97d3ddcf 18354 ix86_sse_copysign_to_positive (res, xa, res, mask);
2bf6d935
ML
18355
18356 emit_label (label);
18357 LABEL_NUSES (label) = 1;
18358
18359 emit_move_insn (operand0, res);
18360}
18361
36d387f2
UB
18362/* Expand SSE sequence for computing round from OPERAND1 storing
18363 into OPERAND0 without relying on DImode truncation via cvttsd2siq
18364 that is only available on 64bit targets. */
18365void
18366ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
18367{
18368 /* C code for the stuff we expand below.
d2754fbb
UB
18369 double xa = fabs (x), xa2, x2;
18370 if (!isless (xa, TWO52))
18371 return x;
36d387f2
UB
18372 Using the absolute value and copying back sign makes
18373 -0.0 -> -0.0 correct.
d2754fbb 18374 xa2 = xa + TWO52 - TWO52;
36d387f2
UB
18375 Compensate.
18376 dxa = xa2 - xa;
d2754fbb
UB
18377 if (dxa <= -0.5)
18378 xa2 += 1;
18379 else if (dxa > 0.5)
18380 xa2 -= 1;
18381 x2 = copysign (xa2, x);
18382 return x2;
36d387f2
UB
18383 */
18384 machine_mode mode = GET_MODE (operand0);
18385 rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
18386 rtx_code_label *label;
18387
18388 TWO52 = ix86_gen_TWO52 (mode);
18389
18390 /* Temporary for holding the result, initialized to the input
18391 operand to ease control flow. */
d2754fbb 18392 res = copy_to_reg (operand1);
36d387f2
UB
18393
18394 /* xa = abs (operand1) */
18395 xa = ix86_expand_sse_fabs (res, &mask);
18396
18397 /* if (!isless (xa, TWO52)) goto label; */
18398 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18399
18400 /* xa2 = xa + TWO52 - TWO52; */
18401 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
18402 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
18403
18404 /* dxa = xa2 - xa; */
18405 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
18406
18407 /* generate 0.5, 1.0 and -0.5 */
18408 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
18409 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
18410 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
18411 0, OPTAB_DIRECT);
18412
18413 /* Compensate. */
18414 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
18415 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
18416 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
18417 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
18418 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
18419 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
18420 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
18421 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
18422
18423 /* res = copysign (xa2, operand1) */
97d3ddcf 18424 ix86_sse_copysign_to_positive (res, xa2, res, mask);
36d387f2
UB
18425
18426 emit_label (label);
18427 LABEL_NUSES (label) = 1;
18428
18429 emit_move_insn (operand0, res);
18430}
18431
2bf6d935
ML
18432/* Expand SSE sequence for computing round
18433 from OP1 storing into OP0 using sse4 round insn. */
18434void
18435ix86_expand_round_sse4 (rtx op0, rtx op1)
18436{
18437 machine_mode mode = GET_MODE (op0);
18438 rtx e1, e2, res, half;
18439 const struct real_format *fmt;
18440 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
18441 rtx (*gen_copysign) (rtx, rtx, rtx);
18442 rtx (*gen_round) (rtx, rtx, rtx);
18443
18444 switch (mode)
18445 {
18446 case E_SFmode:
18447 gen_copysign = gen_copysignsf3;
18448 gen_round = gen_sse4_1_roundsf2;
18449 break;
18450 case E_DFmode:
18451 gen_copysign = gen_copysigndf3;
18452 gen_round = gen_sse4_1_rounddf2;
18453 break;
18454 default:
18455 gcc_unreachable ();
18456 }
18457
18458 /* round (a) = trunc (a + copysign (0.5, a)) */
18459
18460 /* load nextafter (0.5, 0.0) */
18461 fmt = REAL_MODE_FORMAT (mode);
18462 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
18463 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
18464 half = const_double_from_real_value (pred_half, mode);
18465
18466 /* e1 = copysign (0.5, op1) */
18467 e1 = gen_reg_rtx (mode);
18468 emit_insn (gen_copysign (e1, half, op1));
18469
18470 /* e2 = op1 + e1 */
18471 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
18472
18473 /* res = trunc (e2) */
18474 res = gen_reg_rtx (mode);
18475 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
18476
18477 emit_move_insn (op0, res);
18478}
18479
18480/* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
18481 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
18482 insn every time. */
18483
18484static GTY(()) rtx_insn *vselect_insn;
18485
18486/* Initialize vselect_insn. */
18487
18488static void
18489init_vselect_insn (void)
18490{
18491 unsigned i;
18492 rtx x;
18493
18494 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
18495 for (i = 0; i < MAX_VECT_LEN; ++i)
18496 XVECEXP (x, 0, i) = const0_rtx;
18497 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
18498 const0_rtx), x);
18499 x = gen_rtx_SET (const0_rtx, x);
18500 start_sequence ();
18501 vselect_insn = emit_insn (x);
18502 end_sequence ();
18503}
18504
18505/* Construct (set target (vec_select op0 (parallel perm))) and
18506 return true if that's a valid instruction in the active ISA. */
18507
18508static bool
18509expand_vselect (rtx target, rtx op0, const unsigned char *perm,
18510 unsigned nelt, bool testing_p)
18511{
18512 unsigned int i;
18513 rtx x, save_vconcat;
18514 int icode;
18515
18516 if (vselect_insn == NULL_RTX)
18517 init_vselect_insn ();
18518
18519 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
18520 PUT_NUM_ELEM (XVEC (x, 0), nelt);
18521 for (i = 0; i < nelt; ++i)
18522 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
18523 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
18524 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
18525 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
18526 SET_DEST (PATTERN (vselect_insn)) = target;
18527 icode = recog_memoized (vselect_insn);
18528
18529 if (icode >= 0 && !testing_p)
18530 emit_insn (copy_rtx (PATTERN (vselect_insn)));
18531
18532 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
18533 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
18534 INSN_CODE (vselect_insn) = -1;
18535
18536 return icode >= 0;
18537}
18538
18539/* Similar, but generate a vec_concat from op0 and op1 as well. */
18540
18541static bool
18542expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
18543 const unsigned char *perm, unsigned nelt,
18544 bool testing_p)
18545{
18546 machine_mode v2mode;
18547 rtx x;
18548 bool ok;
18549
18550 if (vselect_insn == NULL_RTX)
18551 init_vselect_insn ();
18552
18553 if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
18554 return false;
18555 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
18556 PUT_MODE (x, v2mode);
18557 XEXP (x, 0) = op0;
18558 XEXP (x, 1) = op1;
18559 ok = expand_vselect (target, x, perm, nelt, testing_p);
18560 XEXP (x, 0) = const0_rtx;
18561 XEXP (x, 1) = const0_rtx;
18562 return ok;
18563}
18564
4bf4c103 18565/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
2bf6d935
ML
18566 using movss or movsd. */
18567static bool
18568expand_vec_perm_movs (struct expand_vec_perm_d *d)
18569{
18570 machine_mode vmode = d->vmode;
18571 unsigned i, nelt = d->nelt;
18572 rtx x;
18573
18574 if (d->one_operand_p)
18575 return false;
18576
18577 if (!(TARGET_SSE && vmode == V4SFmode)
240198fe 18578 && !(TARGET_MMX_WITH_SSE && vmode == V2SFmode)
2bf6d935
ML
18579 && !(TARGET_SSE2 && vmode == V2DFmode))
18580 return false;
18581
18582 /* Only the first element is changed. */
18583 if (d->perm[0] != nelt && d->perm[0] != 0)
18584 return false;
18585 for (i = 1; i < nelt; ++i)
18586 if (d->perm[i] != i + nelt - d->perm[0])
18587 return false;
18588
18589 if (d->testing_p)
18590 return true;
18591
18592 if (d->perm[0] == nelt)
18593 x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
18594 else
18595 x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));
18596
18597 emit_insn (gen_rtx_SET (d->target, x));
18598
18599 return true;
18600}
18601
4bf4c103 18602/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
2bf6d935
ML
18603 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
18604
18605static bool
18606expand_vec_perm_blend (struct expand_vec_perm_d *d)
18607{
18608 machine_mode mmode, vmode = d->vmode;
fa2987ed
JJ
18609 unsigned i, nelt = d->nelt;
18610 unsigned HOST_WIDE_INT mask;
2bf6d935
ML
18611 rtx target, op0, op1, maskop, x;
18612 rtx rperm[32], vperm;
18613
18614 if (d->one_operand_p)
18615 return false;
18616 if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
18617 && (TARGET_AVX512BW
18618 || GET_MODE_UNIT_SIZE (vmode) >= 4))
18619 ;
18620 else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
18621 ;
18622 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
18623 ;
a325bdd1 18624 else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 16
be8749f9
UB
18625 || GET_MODE_SIZE (vmode) == 8
18626 || GET_MODE_SIZE (vmode) == 4))
2bf6d935
ML
18627 ;
18628 else
18629 return false;
18630
18631 /* This is a blend, not a permute. Elements must stay in their
18632 respective lanes. */
18633 for (i = 0; i < nelt; ++i)
18634 {
18635 unsigned e = d->perm[i];
18636 if (!(e == i || e == i + nelt))
18637 return false;
18638 }
18639
18640 if (d->testing_p)
18641 return true;
18642
18643 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
18644 decision should be extracted elsewhere, so that we only try that
18645 sequence once all budget==3 options have been tried. */
18646 target = d->target;
18647 op0 = d->op0;
18648 op1 = d->op1;
18649 mask = 0;
18650
18651 switch (vmode)
18652 {
18653 case E_V8DFmode:
18654 case E_V16SFmode:
18655 case E_V4DFmode:
18656 case E_V8SFmode:
18657 case E_V2DFmode:
18658 case E_V4SFmode:
a325bdd1 18659 case E_V4HImode:
2bf6d935
ML
18660 case E_V8HImode:
18661 case E_V8SImode:
18662 case E_V32HImode:
18663 case E_V64QImode:
18664 case E_V16SImode:
18665 case E_V8DImode:
18666 for (i = 0; i < nelt; ++i)
fa2987ed 18667 mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
2bf6d935
ML
18668 break;
18669
18670 case E_V2DImode:
18671 for (i = 0; i < 2; ++i)
18672 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
18673 vmode = V8HImode;
18674 goto do_subreg;
18675
a325bdd1
PB
18676 case E_V2SImode:
18677 for (i = 0; i < 2; ++i)
18678 mask |= (d->perm[i] >= 2 ? 3 : 0) << (i * 2);
18679 vmode = V4HImode;
18680 goto do_subreg;
18681
2bf6d935
ML
18682 case E_V4SImode:
18683 for (i = 0; i < 4; ++i)
18684 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
18685 vmode = V8HImode;
18686 goto do_subreg;
18687
18688 case E_V16QImode:
18689 /* See if bytes move in pairs so we can use pblendw with
18690 an immediate argument, rather than pblendvb with a vector
18691 argument. */
18692 for (i = 0; i < 16; i += 2)
18693 if (d->perm[i] + 1 != d->perm[i + 1])
18694 {
18695 use_pblendvb:
18696 for (i = 0; i < nelt; ++i)
18697 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
18698
18699 finish_pblendvb:
18700 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
18701 vperm = force_reg (vmode, vperm);
18702
be8749f9 18703 if (GET_MODE_SIZE (vmode) == 4)
820ac79e 18704 emit_insn (gen_mmx_pblendvb_v4qi (target, op0, op1, vperm));
be8749f9 18705 else if (GET_MODE_SIZE (vmode) == 8)
820ac79e 18706 emit_insn (gen_mmx_pblendvb_v8qi (target, op0, op1, vperm));
a325bdd1 18707 else if (GET_MODE_SIZE (vmode) == 16)
2bf6d935
ML
18708 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
18709 else
18710 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
18711 if (target != d->target)
18712 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
18713 return true;
18714 }
18715
18716 for (i = 0; i < 8; ++i)
18717 mask |= (d->perm[i * 2] >= 16) << i;
18718 vmode = V8HImode;
18719 /* FALLTHRU */
18720
18721 do_subreg:
18722 target = gen_reg_rtx (vmode);
18723 op0 = gen_lowpart (vmode, op0);
18724 op1 = gen_lowpart (vmode, op1);
18725 break;
18726
a325bdd1
PB
18727 case E_V8QImode:
18728 for (i = 0; i < 8; i += 2)
18729 if (d->perm[i] + 1 != d->perm[i + 1])
18730 goto use_pblendvb;
18731
18732 for (i = 0; i < 4; ++i)
18733 mask |= (d->perm[i * 2] >= 8) << i;
18734 vmode = V4HImode;
18735 goto do_subreg;
18736
be8749f9
UB
18737 case E_V4QImode:
18738 for (i = 0; i < 4; i += 2)
18739 if (d->perm[i] + 1 != d->perm[i + 1])
18740 goto use_pblendvb;
18741
18742 for (i = 0; i < 2; ++i)
18743 mask |= (d->perm[i * 2] >= 4) << i;
18744 vmode = V2HImode;
18745 goto do_subreg;
18746
2bf6d935
ML
18747 case E_V32QImode:
18748 /* See if bytes move in pairs. If not, vpblendvb must be used. */
18749 for (i = 0; i < 32; i += 2)
18750 if (d->perm[i] + 1 != d->perm[i + 1])
18751 goto use_pblendvb;
18752 /* See if bytes move in quadruplets. If yes, vpblendd
18753 with immediate can be used. */
18754 for (i = 0; i < 32; i += 4)
18755 if (d->perm[i] + 2 != d->perm[i + 2])
18756 break;
18757 if (i < 32)
18758 {
18759 /* See if bytes move the same in both lanes. If yes,
18760 vpblendw with immediate can be used. */
18761 for (i = 0; i < 16; i += 2)
18762 if (d->perm[i] + 16 != d->perm[i + 16])
18763 goto use_pblendvb;
18764
18765 /* Use vpblendw. */
18766 for (i = 0; i < 16; ++i)
18767 mask |= (d->perm[i * 2] >= 32) << i;
18768 vmode = V16HImode;
18769 goto do_subreg;
18770 }
18771
18772 /* Use vpblendd. */
18773 for (i = 0; i < 8; ++i)
18774 mask |= (d->perm[i * 4] >= 32) << i;
18775 vmode = V8SImode;
18776 goto do_subreg;
18777
18778 case E_V16HImode:
18779 /* See if words move in pairs. If yes, vpblendd can be used. */
18780 for (i = 0; i < 16; i += 2)
18781 if (d->perm[i] + 1 != d->perm[i + 1])
18782 break;
18783 if (i < 16)
18784 {
18785 /* See if words move the same in both lanes. If not,
18786 vpblendvb must be used. */
18787 for (i = 0; i < 8; i++)
18788 if (d->perm[i] + 8 != d->perm[i + 8])
18789 {
18790 /* Use vpblendvb. */
18791 for (i = 0; i < 32; ++i)
18792 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
18793
18794 vmode = V32QImode;
18795 nelt = 32;
18796 target = gen_reg_rtx (vmode);
18797 op0 = gen_lowpart (vmode, op0);
18798 op1 = gen_lowpart (vmode, op1);
18799 goto finish_pblendvb;
18800 }
18801
18802 /* Use vpblendw. */
18803 for (i = 0; i < 16; ++i)
18804 mask |= (d->perm[i] >= 16) << i;
18805 break;
18806 }
18807
18808 /* Use vpblendd. */
18809 for (i = 0; i < 8; ++i)
18810 mask |= (d->perm[i * 2] >= 16) << i;
18811 vmode = V8SImode;
18812 goto do_subreg;
18813
18814 case E_V4DImode:
18815 /* Use vpblendd. */
18816 for (i = 0; i < 4; ++i)
18817 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
18818 vmode = V8SImode;
18819 goto do_subreg;
18820
18821 default:
18822 gcc_unreachable ();
18823 }
18824
18825 switch (vmode)
18826 {
18827 case E_V8DFmode:
18828 case E_V8DImode:
18829 mmode = QImode;
18830 break;
18831 case E_V16SFmode:
18832 case E_V16SImode:
18833 mmode = HImode;
18834 break;
18835 case E_V32HImode:
18836 mmode = SImode;
18837 break;
18838 case E_V64QImode:
18839 mmode = DImode;
18840 break;
18841 default:
18842 mmode = VOIDmode;
18843 }
18844
18845 if (mmode != VOIDmode)
18846 maskop = force_reg (mmode, gen_int_mode (mask, mmode));
18847 else
18848 maskop = GEN_INT (mask);
18849
18850 /* This matches five different patterns with the different modes. */
18851 x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
18852 x = gen_rtx_SET (target, x);
18853 emit_insn (x);
18854 if (target != d->target)
18855 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
18856
18857 return true;
18858}
18859
4bf4c103 18860/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
2bf6d935
ML
18861 in terms of the variable form of vpermilps.
18862
18863 Note that we will have already failed the immediate input vpermilps,
18864 which requires that the high and low part shuffle be identical; the
18865 variable form doesn't require that. */
18866
18867static bool
18868expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
18869{
18870 rtx rperm[8], vperm;
18871 unsigned i;
18872
18873 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
18874 return false;
18875
18876 /* We can only permute within the 128-bit lane. */
18877 for (i = 0; i < 8; ++i)
18878 {
18879 unsigned e = d->perm[i];
18880 if (i < 4 ? e >= 4 : e < 4)
18881 return false;
18882 }
18883
18884 if (d->testing_p)
18885 return true;
18886
18887 for (i = 0; i < 8; ++i)
18888 {
18889 unsigned e = d->perm[i];
18890
18891 /* Within each 128-bit lane, the elements of op0 are numbered
18892 from 0 and the elements of op1 are numbered from 4. */
18893 if (e >= 8 + 4)
18894 e -= 8;
18895 else if (e >= 4)
18896 e -= 4;
18897
18898 rperm[i] = GEN_INT (e);
18899 }
18900
18901 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
18902 vperm = force_reg (V8SImode, vperm);
18903 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
18904
18905 return true;
18906}
18907
1fa991d1
UB
18908/* For V*[QHS]Imode permutations, check if the same permutation
18909 can't be performed in a 2x, 4x or 8x wider inner mode. */
18910
18911static bool
18912canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
18913 struct expand_vec_perm_d *nd)
18914{
18915 int i;
18916 machine_mode mode = VOIDmode;
18917
18918 switch (d->vmode)
18919 {
18920 case E_V8QImode: mode = V4HImode; break;
18921 case E_V16QImode: mode = V8HImode; break;
18922 case E_V32QImode: mode = V16HImode; break;
18923 case E_V64QImode: mode = V32HImode; break;
18924 case E_V4HImode: mode = V2SImode; break;
18925 case E_V8HImode: mode = V4SImode; break;
18926 case E_V16HImode: mode = V8SImode; break;
18927 case E_V32HImode: mode = V16SImode; break;
18928 case E_V4SImode: mode = V2DImode; break;
18929 case E_V8SImode: mode = V4DImode; break;
18930 case E_V16SImode: mode = V8DImode; break;
18931 default: return false;
18932 }
18933 for (i = 0; i < d->nelt; i += 2)
18934 if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
18935 return false;
18936 nd->vmode = mode;
18937 nd->nelt = d->nelt / 2;
18938 for (i = 0; i < nd->nelt; i++)
18939 nd->perm[i] = d->perm[2 * i] / 2;
18940 if (GET_MODE_INNER (mode) != DImode)
18941 canonicalize_vector_int_perm (nd, nd);
18942 if (nd != d)
18943 {
18944 nd->one_operand_p = d->one_operand_p;
18945 nd->testing_p = d->testing_p;
18946 if (d->op0 == d->op1)
18947 nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
18948 else
18949 {
18950 nd->op0 = gen_lowpart (nd->vmode, d->op0);
18951 nd->op1 = gen_lowpart (nd->vmode, d->op1);
18952 }
18953 if (d->testing_p)
18954 nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
18955 else
18956 nd->target = gen_reg_rtx (nd->vmode);
18957 }
18958 return true;
18959}
18960
2bf6d935
ML
18961/* Return true if permutation D can be performed as VMODE permutation
18962 instead. */
18963
18964static bool
18965valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
18966{
18967 unsigned int i, j, chunk;
18968
18969 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
18970 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
18971 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
18972 return false;
18973
18974 if (GET_MODE_NUNITS (vmode) >= d->nelt)
18975 return true;
18976
18977 chunk = d->nelt / GET_MODE_NUNITS (vmode);
18978 for (i = 0; i < d->nelt; i += chunk)
18979 if (d->perm[i] & (chunk - 1))
18980 return false;
18981 else
18982 for (j = 1; j < chunk; ++j)
18983 if (d->perm[i] + j != d->perm[i + j])
18984 return false;
18985
18986 return true;
18987}
18988
4bf4c103 18989/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
2bf6d935
ML
18990 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
18991
18992static bool
18993expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
18994{
18995 unsigned i, nelt, eltsz, mask;
18996 unsigned char perm[64];
877c9e33 18997 machine_mode vmode;
1fa991d1 18998 struct expand_vec_perm_d nd;
2bf6d935
ML
18999 rtx rperm[64], vperm, target, op0, op1;
19000
19001 nelt = d->nelt;
19002
19003 if (!d->one_operand_p)
be8749f9
UB
19004 switch (GET_MODE_SIZE (d->vmode))
19005 {
19006 case 4:
19007 if (!TARGET_XOP)
19008 return false;
19009 vmode = V4QImode;
19010 break;
37e93925 19011
be8749f9
UB
19012 case 8:
19013 if (!TARGET_XOP)
19014 return false;
19015 vmode = V8QImode;
19016 break;
2bf6d935 19017
be8749f9
UB
19018 case 16:
19019 if (!TARGET_XOP)
2bf6d935 19020 return false;
877c9e33 19021 vmode = V16QImode;
be8749f9
UB
19022 break;
19023
19024 case 32:
19025 if (!TARGET_AVX2)
19026 return false;
19027
19028 if (valid_perm_using_mode_p (V2TImode, d))
19029 {
19030 if (d->testing_p)
19031 return true;
19032
19033 /* Use vperm2i128 insn. The pattern uses
19034 V4DImode instead of V2TImode. */
19035 target = d->target;
19036 if (d->vmode != V4DImode)
19037 target = gen_reg_rtx (V4DImode);
19038 op0 = gen_lowpart (V4DImode, d->op0);
19039 op1 = gen_lowpart (V4DImode, d->op1);
19040 rperm[0]
19041 = GEN_INT ((d->perm[0] / (nelt / 2))
19042 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
19043 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
19044 if (target != d->target)
19045 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
19046 return true;
19047 }
19048 /* FALLTHRU */
19049
19050 default:
37e93925 19051 return false;
be8749f9 19052 }
2bf6d935 19053 else
be8749f9
UB
19054 switch (GET_MODE_SIZE (d->vmode))
19055 {
19056 case 4:
19057 if (!TARGET_SSSE3)
19058 return false;
19059 vmode = V4QImode;
19060 break;
2bf6d935 19061
be8749f9
UB
19062 case 8:
19063 if (!TARGET_SSSE3)
19064 return false;
19065 vmode = V8QImode;
19066 break;
2bf6d935 19067
be8749f9
UB
19068 case 16:
19069 if (!TARGET_SSSE3)
19070 return false;
877c9e33 19071 vmode = V16QImode;
be8749f9
UB
19072 break;
19073
19074 case 32:
19075 if (!TARGET_AVX2)
19076 return false;
19077
19078 /* V4DImode should be already handled through
19079 expand_vselect by vpermq instruction. */
19080 gcc_assert (d->vmode != V4DImode);
19081
19082 vmode = V32QImode;
19083 if (d->vmode == V8SImode
19084 || d->vmode == V16HImode
19085 || d->vmode == V32QImode)
19086 {
19087 /* First see if vpermq can be used for
19088 V8SImode/V16HImode/V32QImode. */
19089 if (valid_perm_using_mode_p (V4DImode, d))
19090 {
19091 for (i = 0; i < 4; i++)
19092 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
19093 if (d->testing_p)
19094 return true;
19095 target = gen_reg_rtx (V4DImode);
19096 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
19097 perm, 4, false))
19098 {
19099 emit_move_insn (d->target,
19100 gen_lowpart (d->vmode, target));
2bf6d935 19101 return true;
be8749f9
UB
19102 }
19103 return false;
19104 }
2bf6d935 19105
be8749f9
UB
19106 /* Next see if vpermd can be used. */
19107 if (valid_perm_using_mode_p (V8SImode, d))
19108 vmode = V8SImode;
19109 }
19110 /* Or if vpermps can be used. */
19111 else if (d->vmode == V8SFmode)
19112 vmode = V8SImode;
2bf6d935 19113
be8749f9
UB
19114 if (vmode == V32QImode)
19115 {
19116 /* vpshufb only works intra lanes, it is not
19117 possible to shuffle bytes in between the lanes. */
19118 for (i = 0; i < nelt; ++i)
19119 if ((d->perm[i] ^ i) & (nelt / 2))
19120 return false;
19121 }
19122 break;
2bf6d935 19123
be8749f9
UB
19124 case 64:
19125 if (!TARGET_AVX512BW)
19126 return false;
2bf6d935 19127
be8749f9
UB
19128 /* If vpermq didn't work, vpshufb won't work either. */
19129 if (d->vmode == V8DFmode || d->vmode == V8DImode)
19130 return false;
19131
19132 vmode = V64QImode;
19133 if (d->vmode == V16SImode
19134 || d->vmode == V32HImode
19135 || d->vmode == V64QImode)
19136 {
19137 /* First see if vpermq can be used for
19138 V16SImode/V32HImode/V64QImode. */
19139 if (valid_perm_using_mode_p (V8DImode, d))
19140 {
19141 for (i = 0; i < 8; i++)
19142 perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
19143 if (d->testing_p)
19144 return true;
19145 target = gen_reg_rtx (V8DImode);
19146 if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
19147 perm, 8, false))
19148 {
19149 emit_move_insn (d->target,
19150 gen_lowpart (d->vmode, target));
2bf6d935 19151 return true;
be8749f9
UB
19152 }
19153 return false;
19154 }
2bf6d935 19155
be8749f9
UB
19156 /* Next see if vpermd can be used. */
19157 if (valid_perm_using_mode_p (V16SImode, d))
19158 vmode = V16SImode;
19159 }
19160 /* Or if vpermps can be used. */
19161 else if (d->vmode == V16SFmode)
19162 vmode = V16SImode;
877c9e33 19163
be8749f9
UB
19164 if (vmode == V64QImode)
19165 {
19166 /* vpshufb only works intra lanes, it is not
19167 possible to shuffle bytes in between the lanes. */
19168 for (i = 0; i < nelt; ++i)
19169 if ((d->perm[i] ^ i) & (3 * nelt / 4))
19170 return false;
19171 }
19172 break;
19173
19174 default:
2bf6d935 19175 return false;
be8749f9 19176 }
2bf6d935
ML
19177
19178 if (d->testing_p)
19179 return true;
19180
681143b9
UB
19181 /* Try to avoid variable permutation instruction. */
19182 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
19183 {
19184 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
19185 return true;
19186 }
19187
2bf6d935
ML
19188 if (vmode == V8SImode)
19189 for (i = 0; i < 8; ++i)
19190 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
19191 else if (vmode == V16SImode)
19192 for (i = 0; i < 16; ++i)
19193 rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
19194 else
19195 {
19196 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
19197 if (!d->one_operand_p)
19198 mask = 2 * nelt - 1;
2bf6d935
ML
19199 else if (vmode == V64QImode)
19200 mask = nelt / 4 - 1;
a325bdd1 19201 else if (vmode == V32QImode)
2bf6d935 19202 mask = nelt / 2 - 1;
a325bdd1
PB
19203 else
19204 mask = nelt - 1;
2bf6d935
ML
19205
19206 for (i = 0; i < nelt; ++i)
19207 {
19208 unsigned j, e = d->perm[i] & mask;
19209 for (j = 0; j < eltsz; ++j)
19210 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
19211 }
19212 }
19213
a325bdd1
PB
19214 machine_mode vpmode = vmode;
19215
877c9e33
UB
19216 nelt = GET_MODE_SIZE (vmode);
19217
19218 /* Emulate narrow modes with V16QI instructions. */
19219 if (nelt < 16)
a325bdd1 19220 {
dd835ec2
UB
19221 rtx m128 = GEN_INT (-128);
19222
37e93925 19223 /* Remap elements from the second operand, as we have to
be8749f9 19224 account for inactive top elements from the first operand. */
37e93925 19225 if (!d->one_operand_p)
be8749f9 19226 {
be8749f9
UB
19227 for (i = 0; i < nelt; ++i)
19228 {
877c9e33
UB
19229 unsigned ival = UINTVAL (rperm[i]);
19230 if (ival >= nelt)
19231 rperm[i] = GEN_INT (ival + 16 - nelt);
be8749f9
UB
19232 }
19233 }
37e93925 19234
877c9e33 19235 /* Fill inactive elements in the top positions with zeros. */
a325bdd1 19236 for (i = nelt; i < 16; ++i)
dd835ec2 19237 rperm[i] = m128;
37e93925 19238
a325bdd1
PB
19239 vpmode = V16QImode;
19240 }
19241
19242 vperm = gen_rtx_CONST_VECTOR (vpmode,
19243 gen_rtvec_v (GET_MODE_NUNITS (vpmode), rperm));
19244 vperm = force_reg (vpmode, vperm);
2bf6d935 19245
37e93925
UB
19246 if (vmode == d->vmode)
19247 target = d->target;
19248 else
2bf6d935 19249 target = gen_reg_rtx (vmode);
37e93925 19250
2bf6d935 19251 op0 = gen_lowpart (vmode, d->op0);
37e93925 19252
2bf6d935
ML
19253 if (d->one_operand_p)
19254 {
37e93925
UB
19255 rtx (*gen) (rtx, rtx, rtx);
19256
be8749f9
UB
19257 if (vmode == V4QImode)
19258 gen = gen_mmx_pshufbv4qi3;
19259 else if (vmode == V8QImode)
37e93925 19260 gen = gen_mmx_pshufbv8qi3;
a325bdd1 19261 else if (vmode == V16QImode)
37e93925 19262 gen = gen_ssse3_pshufbv16qi3;
2bf6d935 19263 else if (vmode == V32QImode)
37e93925 19264 gen = gen_avx2_pshufbv32qi3;
2bf6d935 19265 else if (vmode == V64QImode)
37e93925 19266 gen = gen_avx512bw_pshufbv64qi3;
2bf6d935 19267 else if (vmode == V8SFmode)
37e93925 19268 gen = gen_avx2_permvarv8sf;
2bf6d935 19269 else if (vmode == V8SImode)
37e93925 19270 gen = gen_avx2_permvarv8si;
2bf6d935 19271 else if (vmode == V16SFmode)
37e93925 19272 gen = gen_avx512f_permvarv16sf;
2bf6d935 19273 else if (vmode == V16SImode)
37e93925 19274 gen = gen_avx512f_permvarv16si;
2bf6d935
ML
19275 else
19276 gcc_unreachable ();
37e93925
UB
19277
19278 emit_insn (gen (target, op0, vperm));
2bf6d935
ML
19279 }
19280 else
19281 {
37e93925
UB
19282 rtx (*gen) (rtx, rtx, rtx, rtx);
19283
2bf6d935 19284 op1 = gen_lowpart (vmode, d->op1);
37e93925 19285
be8749f9
UB
19286 if (vmode == V4QImode)
19287 gen = gen_mmx_ppermv32;
19288 else if (vmode == V8QImode)
37e93925
UB
19289 gen = gen_mmx_ppermv64;
19290 else if (vmode == V16QImode)
19291 gen = gen_xop_pperm;
19292 else
19293 gcc_unreachable ();
19294
19295 emit_insn (gen (target, op0, op1, vperm));
2bf6d935 19296 }
37e93925 19297
2bf6d935
ML
19298 if (target != d->target)
19299 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
19300
19301 return true;
19302}
19303
2bf6d935
ML
19304/* Try to expand one-operand permutation with constant mask. */
19305
19306static bool
19307ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
19308{
19309 machine_mode mode = GET_MODE (d->op0);
19310 machine_mode maskmode = mode;
faf2b6bc 19311 unsigned inner_size = GET_MODE_SIZE (GET_MODE_INNER (mode));
2bf6d935
ML
19312 rtx (*gen) (rtx, rtx, rtx) = NULL;
19313 rtx target, op0, mask;
19314 rtx vec[64];
19315
19316 if (!rtx_equal_p (d->op0, d->op1))
19317 return false;
19318
19319 if (!TARGET_AVX512F)
19320 return false;
19321
faf2b6bc 19322 /* Accept VNxHImode and VNxQImode now. */
19323 if (!TARGET_AVX512VL && GET_MODE_SIZE (mode) < 64)
19324 return false;
19325
19326 /* vpermw. */
19327 if (!TARGET_AVX512BW && inner_size == 2)
19328 return false;
19329
19330 /* vpermb. */
19331 if (!TARGET_AVX512VBMI && inner_size == 1)
19332 return false;
19333
2bf6d935
ML
19334 switch (mode)
19335 {
19336 case E_V16SImode:
19337 gen = gen_avx512f_permvarv16si;
19338 break;
19339 case E_V16SFmode:
19340 gen = gen_avx512f_permvarv16sf;
19341 maskmode = V16SImode;
19342 break;
19343 case E_V8DImode:
19344 gen = gen_avx512f_permvarv8di;
19345 break;
19346 case E_V8DFmode:
19347 gen = gen_avx512f_permvarv8df;
19348 maskmode = V8DImode;
19349 break;
faf2b6bc 19350 case E_V32HImode:
19351 gen = gen_avx512bw_permvarv32hi;
19352 break;
19353 case E_V16HImode:
19354 gen = gen_avx512vl_permvarv16hi;
19355 break;
19356 case E_V8HImode:
19357 gen = gen_avx512vl_permvarv8hi;
19358 break;
19359 case E_V64QImode:
19360 gen = gen_avx512bw_permvarv64qi;
19361 break;
19362 case E_V32QImode:
19363 gen = gen_avx512vl_permvarv32qi;
19364 break;
19365 case E_V16QImode:
19366 gen = gen_avx512vl_permvarv16qi;
19367 break;
19368
2bf6d935
ML
19369 default:
19370 return false;
19371 }
19372
04b4f315
JJ
19373 if (d->testing_p)
19374 return true;
19375
2bf6d935
ML
19376 target = d->target;
19377 op0 = d->op0;
19378 for (int i = 0; i < d->nelt; ++i)
19379 vec[i] = GEN_INT (d->perm[i]);
19380 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
19381 emit_insn (gen (target, op0, force_reg (maskmode, mask)));
19382 return true;
19383}
19384
19385static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
19386
4bf4c103 19387/* A subroutine of ix86_expand_vec_perm_const_1. Try to instantiate D
2bf6d935
ML
19388 in a single instruction. */
19389
19390static bool
19391expand_vec_perm_1 (struct expand_vec_perm_d *d)
19392{
19393 unsigned i, nelt = d->nelt;
19394 struct expand_vec_perm_d nd;
19395
19396 /* Check plain VEC_SELECT first, because AVX has instructions that could
19397 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
19398 input where SEL+CONCAT may not. */
19399 if (d->one_operand_p)
19400 {
19401 int mask = nelt - 1;
19402 bool identity_perm = true;
19403 bool broadcast_perm = true;
19404
19405 for (i = 0; i < nelt; i++)
19406 {
19407 nd.perm[i] = d->perm[i] & mask;
19408 if (nd.perm[i] != i)
19409 identity_perm = false;
19410 if (nd.perm[i])
19411 broadcast_perm = false;
19412 }
19413
19414 if (identity_perm)
19415 {
19416 if (!d->testing_p)
19417 emit_move_insn (d->target, d->op0);
19418 return true;
19419 }
19420 else if (broadcast_perm && TARGET_AVX2)
19421 {
19422 /* Use vpbroadcast{b,w,d}. */
19423 rtx (*gen) (rtx, rtx) = NULL;
19424 switch (d->vmode)
19425 {
19426 case E_V64QImode:
19427 if (TARGET_AVX512BW)
19428 gen = gen_avx512bw_vec_dupv64qi_1;
19429 break;
19430 case E_V32QImode:
19431 gen = gen_avx2_pbroadcastv32qi_1;
19432 break;
19433 case E_V32HImode:
19434 if (TARGET_AVX512BW)
19435 gen = gen_avx512bw_vec_dupv32hi_1;
19436 break;
19437 case E_V16HImode:
19438 gen = gen_avx2_pbroadcastv16hi_1;
19439 break;
19440 case E_V16SImode:
19441 if (TARGET_AVX512F)
19442 gen = gen_avx512f_vec_dupv16si_1;
19443 break;
19444 case E_V8SImode:
19445 gen = gen_avx2_pbroadcastv8si_1;
19446 break;
19447 case E_V16QImode:
19448 gen = gen_avx2_pbroadcastv16qi;
19449 break;
19450 case E_V8HImode:
19451 gen = gen_avx2_pbroadcastv8hi;
19452 break;
19453 case E_V16SFmode:
19454 if (TARGET_AVX512F)
19455 gen = gen_avx512f_vec_dupv16sf_1;
19456 break;
19457 case E_V8SFmode:
19458 gen = gen_avx2_vec_dupv8sf_1;
19459 break;
19460 case E_V8DFmode:
19461 if (TARGET_AVX512F)
19462 gen = gen_avx512f_vec_dupv8df_1;
19463 break;
19464 case E_V8DImode:
19465 if (TARGET_AVX512F)
19466 gen = gen_avx512f_vec_dupv8di_1;
19467 break;
19468 /* For other modes prefer other shuffles this function creates. */
19469 default: break;
19470 }
19471 if (gen != NULL)
19472 {
19473 if (!d->testing_p)
19474 emit_insn (gen (d->target, d->op0));
19475 return true;
19476 }
19477 }
19478
19479 if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
19480 return true;
19481
19482 /* There are plenty of patterns in sse.md that are written for
19483 SEL+CONCAT and are not replicated for a single op. Perhaps
19484 that should be changed, to avoid the nastiness here. */
19485
19486 /* Recognize interleave style patterns, which means incrementing
19487 every other permutation operand. */
19488 for (i = 0; i < nelt; i += 2)
19489 {
19490 nd.perm[i] = d->perm[i] & mask;
19491 nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
19492 }
19493 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
19494 d->testing_p))
19495 return true;
19496
19497 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
19498 if (nelt >= 4)
19499 {
19500 for (i = 0; i < nelt; i += 4)
19501 {
19502 nd.perm[i + 0] = d->perm[i + 0] & mask;
19503 nd.perm[i + 1] = d->perm[i + 1] & mask;
19504 nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
19505 nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
19506 }
19507
19508 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
19509 d->testing_p))
19510 return true;
19511 }
19512 }
19513
19514 /* Try movss/movsd instructions. */
19515 if (expand_vec_perm_movs (d))
19516 return true;
19517
19518 /* Finally, try the fully general two operand permute. */
19519 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
19520 d->testing_p))
19521 return true;
19522
19523 /* Recognize interleave style patterns with reversed operands. */
19524 if (!d->one_operand_p)
19525 {
19526 for (i = 0; i < nelt; ++i)
19527 {
19528 unsigned e = d->perm[i];
19529 if (e >= nelt)
19530 e -= nelt;
19531 else
19532 e += nelt;
19533 nd.perm[i] = e;
19534 }
19535
19536 if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
19537 d->testing_p))
19538 return true;
19539 }
19540
19541 /* Try the SSE4.1 blend variable merge instructions. */
19542 if (expand_vec_perm_blend (d))
19543 return true;
19544
19545 /* Try one of the AVX vpermil variable permutations. */
19546 if (expand_vec_perm_vpermil (d))
19547 return true;
19548
19549 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
19550 vpshufb, vpermd, vpermps or vpermq variable permutation. */
19551 if (expand_vec_perm_pshufb (d))
19552 return true;
19553
19554 /* Try the AVX2 vpalignr instruction. */
19555 if (expand_vec_perm_palignr (d, true))
19556 return true;
19557
faf2b6bc 19558 /* Try the AVX512F vperm{w,b,s,d} instructions */
2bf6d935
ML
19559 if (ix86_expand_vec_one_operand_perm_avx512 (d))
19560 return true;
19561
19562 /* Try the AVX512F vpermt2/vpermi2 instructions. */
19563 if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
19564 return true;
19565
19566 /* See if we can get the same permutation in different vector integer
19567 mode. */
19568 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
19569 {
19570 if (!d->testing_p)
19571 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
19572 return true;
19573 }
19574 return false;
19575}
19576
4bf4c103 19577/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
2bf6d935
ML
19578 in terms of a pair of pshuflw + pshufhw instructions. */
19579
19580static bool
19581expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
19582{
19583 unsigned char perm2[MAX_VECT_LEN];
19584 unsigned i;
19585 bool ok;
19586
19587 if (d->vmode != V8HImode || !d->one_operand_p)
19588 return false;
19589
19590 /* The two permutations only operate in 64-bit lanes. */
19591 for (i = 0; i < 4; ++i)
19592 if (d->perm[i] >= 4)
19593 return false;
19594 for (i = 4; i < 8; ++i)
19595 if (d->perm[i] < 4)
19596 return false;
19597
19598 if (d->testing_p)
19599 return true;
19600
19601 /* Emit the pshuflw. */
19602 memcpy (perm2, d->perm, 4);
19603 for (i = 4; i < 8; ++i)
19604 perm2[i] = i;
19605 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
19606 gcc_assert (ok);
19607
19608 /* Emit the pshufhw. */
19609 memcpy (perm2 + 4, d->perm + 4, 4);
19610 for (i = 0; i < 4; ++i)
19611 perm2[i] = i;
19612 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
19613 gcc_assert (ok);
19614
19615 return true;
19616}
19617
4bf4c103 19618/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
2bf6d935
ML
19619 the permutation using the SSSE3 palignr instruction. This succeeds
19620 when all of the elements in PERM fit within one vector and we merely
19621 need to shift them down so that a single vector permutation has a
19622 chance to succeed. If SINGLE_INSN_ONLY_P, succeed if only
19623 the vpalignr instruction itself can perform the requested permutation. */
19624
19625static bool
19626expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
19627{
19628 unsigned i, nelt = d->nelt;
19629 unsigned min, max, minswap, maxswap;
19630 bool in_order, ok, swap = false;
19631 rtx shift, target;
19632 struct expand_vec_perm_d dcopy;
19633
19634 /* Even with AVX, palignr only operates on 128-bit vectors,
19635 in AVX2 palignr operates on both 128-bit lanes. */
19636 if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
19637 && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
19638 return false;
19639
19640 min = 2 * nelt;
19641 max = 0;
19642 minswap = 2 * nelt;
19643 maxswap = 0;
19644 for (i = 0; i < nelt; ++i)
19645 {
19646 unsigned e = d->perm[i];
19647 unsigned eswap = d->perm[i] ^ nelt;
19648 if (GET_MODE_SIZE (d->vmode) == 32)
19649 {
19650 e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
19651 eswap = e ^ (nelt / 2);
19652 }
19653 if (e < min)
19654 min = e;
19655 if (e > max)
19656 max = e;
19657 if (eswap < minswap)
19658 minswap = eswap;
19659 if (eswap > maxswap)
19660 maxswap = eswap;
19661 }
19662 if (min == 0
19663 || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
19664 {
19665 if (d->one_operand_p
19666 || minswap == 0
19667 || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
19668 ? nelt / 2 : nelt))
19669 return false;
19670 swap = true;
19671 min = minswap;
19672 max = maxswap;
19673 }
19674
19675 /* Given that we have SSSE3, we know we'll be able to implement the
19676 single operand permutation after the palignr with pshufb for
19677 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
19678 first. */
19679 if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
19680 return true;
19681
19682 dcopy = *d;
19683 if (swap)
19684 {
19685 dcopy.op0 = d->op1;
19686 dcopy.op1 = d->op0;
19687 for (i = 0; i < nelt; ++i)
19688 dcopy.perm[i] ^= nelt;
19689 }
19690
19691 in_order = true;
19692 for (i = 0; i < nelt; ++i)
19693 {
19694 unsigned e = dcopy.perm[i];
19695 if (GET_MODE_SIZE (d->vmode) == 32
19696 && e >= nelt
19697 && (e & (nelt / 2 - 1)) < min)
19698 e = e - min - (nelt / 2);
19699 else
19700 e = e - min;
19701 if (e != i)
19702 in_order = false;
19703 dcopy.perm[i] = e;
19704 }
19705 dcopy.one_operand_p = true;
19706
19707 if (single_insn_only_p && !in_order)
19708 return false;
19709
19710 /* For AVX2, test whether we can permute the result in one instruction. */
19711 if (d->testing_p)
19712 {
19713 if (in_order)
19714 return true;
19715 dcopy.op1 = dcopy.op0;
19716 return expand_vec_perm_1 (&dcopy);
19717 }
19718
19719 shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
19720 if (GET_MODE_SIZE (d->vmode) == 16)
19721 {
02e2e15e
RS
19722 target = gen_reg_rtx (V1TImode);
19723 emit_insn (gen_ssse3_palignrv1ti (target,
19724 gen_lowpart (V1TImode, dcopy.op1),
19725 gen_lowpart (V1TImode, dcopy.op0),
19726 shift));
2bf6d935
ML
19727 }
19728 else
19729 {
19730 target = gen_reg_rtx (V2TImode);
19731 emit_insn (gen_avx2_palignrv2ti (target,
19732 gen_lowpart (V2TImode, dcopy.op1),
19733 gen_lowpart (V2TImode, dcopy.op0),
19734 shift));
19735 }
19736
19737 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
19738
19739 /* Test for the degenerate case where the alignment by itself
19740 produces the desired permutation. */
19741 if (in_order)
19742 {
19743 emit_move_insn (d->target, dcopy.op0);
19744 return true;
19745 }
19746
19747 ok = expand_vec_perm_1 (&dcopy);
19748 gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
19749
19750 return ok;
19751}
19752
19753/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
19754 the permutation using the SSE4_1 pblendv instruction. Potentially
19755 reduces permutation from 2 pshufb and or to 1 pshufb and pblendv. */
19756
19757static bool
19758expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
19759{
19760 unsigned i, which, nelt = d->nelt;
19761 struct expand_vec_perm_d dcopy, dcopy1;
19762 machine_mode vmode = d->vmode;
19763 bool ok;
19764
19765 /* Use the same checks as in expand_vec_perm_blend. */
19766 if (d->one_operand_p)
19767 return false;
19768 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
19769 ;
19770 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
19771 ;
be8749f9
UB
19772 else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 4
19773 || GET_MODE_SIZE (vmode) == 8
a325bdd1 19774 || GET_MODE_SIZE (vmode) == 16))
2bf6d935
ML
19775 ;
19776 else
19777 return false;
19778
19779 /* Figure out where permutation elements stay not in their
19780 respective lanes. */
19781 for (i = 0, which = 0; i < nelt; ++i)
19782 {
19783 unsigned e = d->perm[i];
19784 if (e != i)
19785 which |= (e < nelt ? 1 : 2);
19786 }
19787 /* We can pblend the part where elements stay not in their
19788 respective lanes only when these elements are all in one
19789 half of a permutation.
19790 {0 1 8 3 4 5 9 7} is ok as 8, 9 are at not at their respective
19791 lanes, but both 8 and 9 >= 8
19792 {0 1 8 3 4 5 2 7} is not ok as 2 and 8 are not at their
19793 respective lanes and 8 >= 8, but 2 not. */
19794 if (which != 1 && which != 2)
19795 return false;
19796 if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
19797 return true;
19798
19799 /* First we apply one operand permutation to the part where
19800 elements stay not in their respective lanes. */
19801 dcopy = *d;
19802 if (which == 2)
19803 dcopy.op0 = dcopy.op1 = d->op1;
19804 else
19805 dcopy.op0 = dcopy.op1 = d->op0;
19806 if (!d->testing_p)
19807 dcopy.target = gen_reg_rtx (vmode);
19808 dcopy.one_operand_p = true;
19809
19810 for (i = 0; i < nelt; ++i)
19811 dcopy.perm[i] = d->perm[i] & (nelt - 1);
19812
19813 ok = expand_vec_perm_1 (&dcopy);
19814 if (GET_MODE_SIZE (vmode) != 16 && !ok)
19815 return false;
19816 else
19817 gcc_assert (ok);
19818 if (d->testing_p)
19819 return true;
19820
19821 /* Next we put permuted elements into their positions. */
19822 dcopy1 = *d;
19823 if (which == 2)
19824 dcopy1.op1 = dcopy.target;
19825 else
19826 dcopy1.op0 = dcopy.target;
19827
19828 for (i = 0; i < nelt; ++i)
19829 dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
19830
19831 ok = expand_vec_perm_blend (&dcopy1);
19832 gcc_assert (ok);
19833
19834 return true;
19835}
19836
19837static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
19838
4bf4c103 19839/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
2bf6d935
ML
19840 a two vector permutation into a single vector permutation by using
19841 an interleave operation to merge the vectors. */
19842
19843static bool
19844expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
19845{
19846 struct expand_vec_perm_d dremap, dfinal;
19847 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
19848 unsigned HOST_WIDE_INT contents;
19849 unsigned char remap[2 * MAX_VECT_LEN];
19850 rtx_insn *seq;
19851 bool ok, same_halves = false;
19852
be8749f9
UB
19853 if (GET_MODE_SIZE (d->vmode) == 4
19854 || GET_MODE_SIZE (d->vmode) == 8
a325bdd1 19855 || GET_MODE_SIZE (d->vmode) == 16)
2bf6d935
ML
19856 {
19857 if (d->one_operand_p)
19858 return false;
19859 }
19860 else if (GET_MODE_SIZE (d->vmode) == 32)
19861 {
19862 if (!TARGET_AVX)
19863 return false;
19864 /* For 32-byte modes allow even d->one_operand_p.
19865 The lack of cross-lane shuffling in some instructions
19866 might prevent a single insn shuffle. */
19867 dfinal = *d;
19868 dfinal.testing_p = true;
19869 /* If expand_vec_perm_interleave3 can expand this into
19870 a 3 insn sequence, give up and let it be expanded as
19871 3 insn sequence. While that is one insn longer,
19872 it doesn't need a memory operand and in the common
19873 case that both interleave low and high permutations
19874 with the same operands are adjacent needs 4 insns
19875 for both after CSE. */
19876 if (expand_vec_perm_interleave3 (&dfinal))
19877 return false;
19878 }
19879 else
19880 return false;
19881
19882 /* Examine from whence the elements come. */
19883 contents = 0;
19884 for (i = 0; i < nelt; ++i)
19885 contents |= HOST_WIDE_INT_1U << d->perm[i];
19886
19887 memset (remap, 0xff, sizeof (remap));
19888 dremap = *d;
19889
be8749f9
UB
19890 if (GET_MODE_SIZE (d->vmode) == 4
19891 || GET_MODE_SIZE (d->vmode) == 8)
a325bdd1
PB
19892 {
19893 unsigned HOST_WIDE_INT h1, h2, h3, h4;
19894
19895 /* Split the two input vectors into 4 halves. */
19896 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
19897 h2 = h1 << nelt2;
19898 h3 = h2 << nelt2;
19899 h4 = h3 << nelt2;
19900
19901 /* If the elements from the low halves use interleave low,
19902 and similarly for interleave high. */
19903 if ((contents & (h1 | h3)) == contents)
19904 {
19905 /* punpckl* */
19906 for (i = 0; i < nelt2; ++i)
19907 {
19908 remap[i] = i * 2;
19909 remap[i + nelt] = i * 2 + 1;
19910 dremap.perm[i * 2] = i;
19911 dremap.perm[i * 2 + 1] = i + nelt;
19912 }
19913 }
19914 else if ((contents & (h2 | h4)) == contents)
19915 {
19916 /* punpckh* */
19917 for (i = 0; i < nelt2; ++i)
19918 {
19919 remap[i + nelt2] = i * 2;
19920 remap[i + nelt + nelt2] = i * 2 + 1;
19921 dremap.perm[i * 2] = i + nelt2;
19922 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
19923 }
19924 }
19925 else
19926 return false;
19927 }
19928 else if (GET_MODE_SIZE (d->vmode) == 16)
2bf6d935
ML
19929 {
19930 unsigned HOST_WIDE_INT h1, h2, h3, h4;
19931
19932 /* Split the two input vectors into 4 halves. */
19933 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
19934 h2 = h1 << nelt2;
19935 h3 = h2 << nelt2;
19936 h4 = h3 << nelt2;
19937
19938 /* If the elements from the low halves use interleave low, and similarly
19939 for interleave high. If the elements are from mis-matched halves, we
19940 can use shufps for V4SF/V4SI or do a DImode shuffle. */
19941 if ((contents & (h1 | h3)) == contents)
19942 {
19943 /* punpckl* */
19944 for (i = 0; i < nelt2; ++i)
19945 {
19946 remap[i] = i * 2;
19947 remap[i + nelt] = i * 2 + 1;
19948 dremap.perm[i * 2] = i;
19949 dremap.perm[i * 2 + 1] = i + nelt;
19950 }
19951 if (!TARGET_SSE2 && d->vmode == V4SImode)
19952 dremap.vmode = V4SFmode;
19953 }
19954 else if ((contents & (h2 | h4)) == contents)
19955 {
19956 /* punpckh* */
19957 for (i = 0; i < nelt2; ++i)
19958 {
19959 remap[i + nelt2] = i * 2;
19960 remap[i + nelt + nelt2] = i * 2 + 1;
19961 dremap.perm[i * 2] = i + nelt2;
19962 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
19963 }
19964 if (!TARGET_SSE2 && d->vmode == V4SImode)
19965 dremap.vmode = V4SFmode;
19966 }
19967 else if ((contents & (h1 | h4)) == contents)
19968 {
19969 /* shufps */
19970 for (i = 0; i < nelt2; ++i)
19971 {
19972 remap[i] = i;
19973 remap[i + nelt + nelt2] = i + nelt2;
19974 dremap.perm[i] = i;
19975 dremap.perm[i + nelt2] = i + nelt + nelt2;
19976 }
19977 if (nelt != 4)
19978 {
19979 /* shufpd */
19980 dremap.vmode = V2DImode;
19981 dremap.nelt = 2;
19982 dremap.perm[0] = 0;
19983 dremap.perm[1] = 3;
19984 }
19985 }
19986 else if ((contents & (h2 | h3)) == contents)
19987 {
19988 /* shufps */
19989 for (i = 0; i < nelt2; ++i)
19990 {
19991 remap[i + nelt2] = i;
19992 remap[i + nelt] = i + nelt2;
19993 dremap.perm[i] = i + nelt2;
19994 dremap.perm[i + nelt2] = i + nelt;
19995 }
19996 if (nelt != 4)
19997 {
19998 /* shufpd */
19999 dremap.vmode = V2DImode;
20000 dremap.nelt = 2;
20001 dremap.perm[0] = 1;
20002 dremap.perm[1] = 2;
20003 }
20004 }
20005 else
20006 return false;
20007 }
20008 else
20009 {
20010 unsigned int nelt4 = nelt / 4, nzcnt = 0;
20011 unsigned HOST_WIDE_INT q[8];
20012 unsigned int nonzero_halves[4];
20013
20014 /* Split the two input vectors into 8 quarters. */
20015 q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
20016 for (i = 1; i < 8; ++i)
20017 q[i] = q[0] << (nelt4 * i);
20018 for (i = 0; i < 4; ++i)
20019 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
20020 {
20021 nonzero_halves[nzcnt] = i;
20022 ++nzcnt;
20023 }
20024
20025 if (nzcnt == 1)
20026 {
20027 gcc_assert (d->one_operand_p);
20028 nonzero_halves[1] = nonzero_halves[0];
20029 same_halves = true;
20030 }
20031 else if (d->one_operand_p)
20032 {
20033 gcc_assert (nonzero_halves[0] == 0);
20034 gcc_assert (nonzero_halves[1] == 1);
20035 }
20036
20037 if (nzcnt <= 2)
20038 {
20039 if (d->perm[0] / nelt2 == nonzero_halves[1])
20040 {
20041 /* Attempt to increase the likelihood that dfinal
20042 shuffle will be intra-lane. */
20043 std::swap (nonzero_halves[0], nonzero_halves[1]);
20044 }
20045
20046 /* vperm2f128 or vperm2i128. */
20047 for (i = 0; i < nelt2; ++i)
20048 {
20049 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
20050 remap[i + nonzero_halves[0] * nelt2] = i;
20051 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
20052 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
20053 }
20054
20055 if (d->vmode != V8SFmode
20056 && d->vmode != V4DFmode
20057 && d->vmode != V8SImode)
20058 {
20059 dremap.vmode = V8SImode;
20060 dremap.nelt = 8;
20061 for (i = 0; i < 4; ++i)
20062 {
20063 dremap.perm[i] = i + nonzero_halves[0] * 4;
20064 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
20065 }
20066 }
20067 }
20068 else if (d->one_operand_p)
20069 return false;
20070 else if (TARGET_AVX2
20071 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
20072 {
20073 /* vpunpckl* */
20074 for (i = 0; i < nelt4; ++i)
20075 {
20076 remap[i] = i * 2;
20077 remap[i + nelt] = i * 2 + 1;
20078 remap[i + nelt2] = i * 2 + nelt2;
20079 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
20080 dremap.perm[i * 2] = i;
20081 dremap.perm[i * 2 + 1] = i + nelt;
20082 dremap.perm[i * 2 + nelt2] = i + nelt2;
20083 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
20084 }
20085 }
20086 else if (TARGET_AVX2
20087 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
20088 {
20089 /* vpunpckh* */
20090 for (i = 0; i < nelt4; ++i)
20091 {
20092 remap[i + nelt4] = i * 2;
20093 remap[i + nelt + nelt4] = i * 2 + 1;
20094 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
20095 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
20096 dremap.perm[i * 2] = i + nelt4;
20097 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
20098 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
20099 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
20100 }
20101 }
20102 else
20103 return false;
20104 }
20105
20106 /* Use the remapping array set up above to move the elements from their
20107 swizzled locations into their final destinations. */
20108 dfinal = *d;
20109 for (i = 0; i < nelt; ++i)
20110 {
20111 unsigned e = remap[d->perm[i]];
20112 gcc_assert (e < nelt);
20113 /* If same_halves is true, both halves of the remapped vector are the
20114 same. Avoid cross-lane accesses if possible. */
20115 if (same_halves && i >= nelt2)
20116 {
20117 gcc_assert (e < nelt2);
20118 dfinal.perm[i] = e + nelt2;
20119 }
20120 else
20121 dfinal.perm[i] = e;
20122 }
20123 if (!d->testing_p)
20124 {
20125 dremap.target = gen_reg_rtx (dremap.vmode);
20126 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
20127 }
20128 dfinal.op1 = dfinal.op0;
20129 dfinal.one_operand_p = true;
20130
20131 /* Test if the final remap can be done with a single insn. For V4SFmode or
20132 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
20133 start_sequence ();
20134 ok = expand_vec_perm_1 (&dfinal);
20135 seq = get_insns ();
20136 end_sequence ();
20137
20138 if (!ok)
20139 return false;
20140
20141 if (d->testing_p)
20142 return true;
20143
20144 if (dremap.vmode != dfinal.vmode)
20145 {
20146 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
20147 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
20148 }
20149
20150 ok = expand_vec_perm_1 (&dremap);
20151 gcc_assert (ok);
20152
20153 emit_insn (seq);
20154 return true;
20155}
20156
4bf4c103 20157/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
2bf6d935
ML
20158 a single vector cross-lane permutation into vpermq followed
20159 by any of the single insn permutations. */
20160
20161static bool
20162expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
20163{
20164 struct expand_vec_perm_d dremap, dfinal;
20165 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
20166 unsigned contents[2];
20167 bool ok;
20168
20169 if (!(TARGET_AVX2
20170 && (d->vmode == V32QImode || d->vmode == V16HImode)
20171 && d->one_operand_p))
20172 return false;
20173
20174 contents[0] = 0;
20175 contents[1] = 0;
20176 for (i = 0; i < nelt2; ++i)
20177 {
20178 contents[0] |= 1u << (d->perm[i] / nelt4);
20179 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
20180 }
20181
20182 for (i = 0; i < 2; ++i)
20183 {
20184 unsigned int cnt = 0;
20185 for (j = 0; j < 4; ++j)
20186 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
20187 return false;
20188 }
20189
20190 if (d->testing_p)
20191 return true;
20192
20193 dremap = *d;
20194 dremap.vmode = V4DImode;
20195 dremap.nelt = 4;
20196 dremap.target = gen_reg_rtx (V4DImode);
20197 dremap.op0 = gen_lowpart (V4DImode, d->op0);
20198 dremap.op1 = dremap.op0;
20199 dremap.one_operand_p = true;
20200 for (i = 0; i < 2; ++i)
20201 {
20202 unsigned int cnt = 0;
20203 for (j = 0; j < 4; ++j)
20204 if ((contents[i] & (1u << j)) != 0)
20205 dremap.perm[2 * i + cnt++] = j;
20206 for (; cnt < 2; ++cnt)
20207 dremap.perm[2 * i + cnt] = 0;
20208 }
20209
20210 dfinal = *d;
20211 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
20212 dfinal.op1 = dfinal.op0;
20213 dfinal.one_operand_p = true;
20214 for (i = 0, j = 0; i < nelt; ++i)
20215 {
20216 if (i == nelt2)
20217 j = 2;
20218 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
20219 if ((d->perm[i] / nelt4) == dremap.perm[j])
20220 ;
20221 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
20222 dfinal.perm[i] |= nelt4;
20223 else
20224 gcc_unreachable ();
20225 }
20226
20227 ok = expand_vec_perm_1 (&dremap);
20228 gcc_assert (ok);
20229
20230 ok = expand_vec_perm_1 (&dfinal);
20231 gcc_assert (ok);
20232
20233 return true;
20234}
20235
20236static bool canonicalize_perm (struct expand_vec_perm_d *d);
20237
4bf4c103 20238/* A subroutine of ix86_expand_vec_perm_const_1. Try to expand
2bf6d935
ML
20239 a vector permutation using two instructions, vperm2f128 resp.
20240 vperm2i128 followed by any single in-lane permutation. */
20241
20242static bool
20243expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
20244{
20245 struct expand_vec_perm_d dfirst, dsecond;
20246 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
20247 bool ok;
20248
20249 if (!TARGET_AVX
20250 || GET_MODE_SIZE (d->vmode) != 32
20251 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
20252 return false;
20253
20254 dsecond = *d;
20255 dsecond.one_operand_p = false;
20256 dsecond.testing_p = true;
20257
20258 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
20259 immediate. For perm < 16 the second permutation uses
20260 d->op0 as first operand, for perm >= 16 it uses d->op1
20261 as first operand. The second operand is the result of
20262 vperm2[fi]128. */
20263 for (perm = 0; perm < 32; perm++)
20264 {
20265 /* Ignore permutations which do not move anything cross-lane. */
20266 if (perm < 16)
20267 {
20268 /* The second shuffle for e.g. V4DFmode has
20269 0123 and ABCD operands.
20270 Ignore AB23, as 23 is already in the second lane
20271 of the first operand. */
20272 if ((perm & 0xc) == (1 << 2)) continue;
20273 /* And 01CD, as 01 is in the first lane of the first
20274 operand. */
20275 if ((perm & 3) == 0) continue;
20276 /* And 4567, as then the vperm2[fi]128 doesn't change
20277 anything on the original 4567 second operand. */
20278 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
20279 }
20280 else
20281 {
20282 /* The second shuffle for e.g. V4DFmode has
20283 4567 and ABCD operands.
20284 Ignore AB67, as 67 is already in the second lane
20285 of the first operand. */
20286 if ((perm & 0xc) == (3 << 2)) continue;
20287 /* And 45CD, as 45 is in the first lane of the first
20288 operand. */
20289 if ((perm & 3) == 2) continue;
20290 /* And 0123, as then the vperm2[fi]128 doesn't change
20291 anything on the original 0123 first operand. */
20292 if ((perm & 0xf) == (1 << 2)) continue;
20293 }
20294
20295 for (i = 0; i < nelt; i++)
20296 {
20297 j = d->perm[i] / nelt2;
20298 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
20299 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
20300 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
20301 dsecond.perm[i] = d->perm[i] & (nelt - 1);
20302 else
20303 break;
20304 }
20305
20306 if (i == nelt)
20307 {
20308 start_sequence ();
20309 ok = expand_vec_perm_1 (&dsecond);
20310 end_sequence ();
20311 }
20312 else
20313 ok = false;
20314
20315 if (ok)
20316 {
20317 if (d->testing_p)
20318 return true;
20319
20320 /* Found a usable second shuffle. dfirst will be
20321 vperm2f128 on d->op0 and d->op1. */
20322 dsecond.testing_p = false;
20323 dfirst = *d;
20324 dfirst.target = gen_reg_rtx (d->vmode);
20325 for (i = 0; i < nelt; i++)
20326 dfirst.perm[i] = (i & (nelt2 - 1))
20327 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
20328
20329 canonicalize_perm (&dfirst);
20330 ok = expand_vec_perm_1 (&dfirst);
20331 gcc_assert (ok);
20332
20333 /* And dsecond is some single insn shuffle, taking
20334 d->op0 and result of vperm2f128 (if perm < 16) or
20335 d->op1 and result of vperm2f128 (otherwise). */
20336 if (perm >= 16)
20337 dsecond.op0 = dsecond.op1;
20338 dsecond.op1 = dfirst.target;
20339
20340 ok = expand_vec_perm_1 (&dsecond);
20341 gcc_assert (ok);
20342
20343 return true;
20344 }
20345
20346 /* For one operand, the only useful vperm2f128 permutation is 0x01
20347 aka lanes swap. */
20348 if (d->one_operand_p)
20349 return false;
20350 }
20351
20352 return false;
20353}
20354
4bf4c103 20355/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
2bf6d935
ML
20356 a two vector permutation using 2 intra-lane interleave insns
20357 and cross-lane shuffle for 32-byte vectors. */
20358
20359static bool
20360expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
20361{
20362 unsigned i, nelt;
20363 rtx (*gen) (rtx, rtx, rtx);
20364
20365 if (d->one_operand_p)
20366 return false;
20367 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
20368 ;
20369 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
20370 ;
20371 else
20372 return false;
20373
20374 nelt = d->nelt;
20375 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
20376 return false;
20377 for (i = 0; i < nelt; i += 2)
20378 if (d->perm[i] != d->perm[0] + i / 2
20379 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
20380 return false;
20381
20382 if (d->testing_p)
20383 return true;
20384
20385 switch (d->vmode)
20386 {
20387 case E_V32QImode:
20388 if (d->perm[0])
20389 gen = gen_vec_interleave_highv32qi;
20390 else
20391 gen = gen_vec_interleave_lowv32qi;
20392 break;
20393 case E_V16HImode:
20394 if (d->perm[0])
20395 gen = gen_vec_interleave_highv16hi;
20396 else
20397 gen = gen_vec_interleave_lowv16hi;
20398 break;
20399 case E_V8SImode:
20400 if (d->perm[0])
20401 gen = gen_vec_interleave_highv8si;
20402 else
20403 gen = gen_vec_interleave_lowv8si;
20404 break;
20405 case E_V4DImode:
20406 if (d->perm[0])
20407 gen = gen_vec_interleave_highv4di;
20408 else
20409 gen = gen_vec_interleave_lowv4di;
20410 break;
20411 case E_V8SFmode:
20412 if (d->perm[0])
20413 gen = gen_vec_interleave_highv8sf;
20414 else
20415 gen = gen_vec_interleave_lowv8sf;
20416 break;
20417 case E_V4DFmode:
20418 if (d->perm[0])
20419 gen = gen_vec_interleave_highv4df;
20420 else
20421 gen = gen_vec_interleave_lowv4df;
20422 break;
20423 default:
20424 gcc_unreachable ();
20425 }
20426
20427 emit_insn (gen (d->target, d->op0, d->op1));
20428 return true;
20429}
20430
4bf4c103 20431/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
2bf6d935
ML
20432 a single vector permutation using a single intra-lane vector
20433 permutation, vperm2f128 swapping the lanes and vblend* insn blending
20434 the non-swapped and swapped vectors together. */
20435
20436static bool
20437expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
20438{
20439 struct expand_vec_perm_d dfirst, dsecond;
20440 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
20441 rtx_insn *seq;
20442 bool ok;
20443 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
20444
20445 if (!TARGET_AVX
20446 || TARGET_AVX2
20447 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
20448 || !d->one_operand_p)
20449 return false;
20450
20451 dfirst = *d;
20452 for (i = 0; i < nelt; i++)
20453 dfirst.perm[i] = 0xff;
20454 for (i = 0, msk = 0; i < nelt; i++)
20455 {
20456 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
20457 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
20458 return false;
20459 dfirst.perm[j] = d->perm[i];
20460 if (j != i)
20461 msk |= (1 << i);
20462 }
20463 for (i = 0; i < nelt; i++)
20464 if (dfirst.perm[i] == 0xff)
20465 dfirst.perm[i] = i;
20466
20467 if (!d->testing_p)
20468 dfirst.target = gen_reg_rtx (dfirst.vmode);
20469
20470 start_sequence ();
20471 ok = expand_vec_perm_1 (&dfirst);
20472 seq = get_insns ();
20473 end_sequence ();
20474
20475 if (!ok)
20476 return false;
20477
20478 if (d->testing_p)
20479 return true;
20480
20481 emit_insn (seq);
20482
20483 dsecond = *d;
20484 dsecond.op0 = dfirst.target;
20485 dsecond.op1 = dfirst.target;
20486 dsecond.one_operand_p = true;
20487 dsecond.target = gen_reg_rtx (dsecond.vmode);
20488 for (i = 0; i < nelt; i++)
20489 dsecond.perm[i] = i ^ nelt2;
20490
20491 ok = expand_vec_perm_1 (&dsecond);
20492 gcc_assert (ok);
20493
20494 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
20495 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
20496 return true;
20497}
20498
829c4bea
JJ
20499/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
20500 a two vector permutation using two single vector permutations and
20501 {,v}{,p}unpckl{ps,pd,bw,wd,dq}. If two_insn, succeed only if one
20502 of dfirst or dsecond is identity permutation. */
20503
20504static bool
20505expand_vec_perm_2perm_interleave (struct expand_vec_perm_d *d, bool two_insn)
20506{
20507 unsigned i, nelt = d->nelt, nelt2 = nelt / 2, lane = nelt;
20508 struct expand_vec_perm_d dfirst, dsecond, dfinal;
20509 bool ident1 = true, ident2 = true;
20510
20511 if (d->one_operand_p)
20512 return false;
20513
20514 if (GET_MODE_SIZE (d->vmode) == 16)
20515 {
20516 if (!TARGET_SSE)
20517 return false;
20518 if (d->vmode != V4SFmode && d->vmode != V2DFmode && !TARGET_SSE2)
20519 return false;
20520 }
20521 else if (GET_MODE_SIZE (d->vmode) == 32)
20522 {
20523 if (!TARGET_AVX)
20524 return false;
20525 if (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2)
20526 return false;
20527 lane = nelt2;
20528 }
20529 else
20530 return false;
20531
20532 for (i = 1; i < nelt; i++)
20533 if ((d->perm[i] >= nelt) != ((d->perm[0] >= nelt) ^ (i & 1)))
20534 return false;
20535
20536 dfirst = *d;
20537 dsecond = *d;
20538 dfinal = *d;
20539 dfirst.op1 = dfirst.op0;
20540 dfirst.one_operand_p = true;
20541 dsecond.op0 = dsecond.op1;
20542 dsecond.one_operand_p = true;
20543
20544 for (i = 0; i < nelt; i++)
20545 if (d->perm[i] >= nelt)
20546 {
20547 dsecond.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i] - nelt;
20548 if (d->perm[i] - nelt != i / 2 + (i >= lane ? lane / 2 : 0))
20549 ident2 = false;
20550 dsecond.perm[i / 2 + (i >= lane ? lane : lane / 2)]
20551 = d->perm[i] - nelt;
20552 }
20553 else
20554 {
20555 dfirst.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i];
20556 if (d->perm[i] != i / 2 + (i >= lane ? lane / 2 : 0))
20557 ident1 = false;
20558 dfirst.perm[i / 2 + (i >= lane ? lane : lane / 2)] = d->perm[i];
20559 }
20560
20561 if (two_insn && !ident1 && !ident2)
20562 return false;
20563
20564 if (!d->testing_p)
20565 {
20566 if (!ident1)
20567 dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
20568 if (!ident2)
20569 dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
20570 if (d->perm[0] >= nelt)
20571 std::swap (dfinal.op0, dfinal.op1);
20572 }
20573
20574 bool ok;
20575 rtx_insn *seq1 = NULL, *seq2 = NULL;
20576
20577 if (!ident1)
20578 {
20579 start_sequence ();
20580 ok = expand_vec_perm_1 (&dfirst);
20581 seq1 = get_insns ();
20582 end_sequence ();
20583
20584 if (!ok)
20585 return false;
20586 }
20587
20588 if (!ident2)
20589 {
20590 start_sequence ();
20591 ok = expand_vec_perm_1 (&dsecond);
20592 seq2 = get_insns ();
20593 end_sequence ();
20594
20595 if (!ok)
20596 return false;
20597 }
20598
20599 if (d->testing_p)
20600 return true;
20601
20602 for (i = 0; i < nelt; i++)
20603 {
20604 dfinal.perm[i] = i / 2;
20605 if (i >= lane)
20606 dfinal.perm[i] += lane / 2;
20607 if ((i & 1) != 0)
20608 dfinal.perm[i] += nelt;
20609 }
20610 emit_insn (seq1);
20611 emit_insn (seq2);
20612 ok = expand_vselect_vconcat (dfinal.target, dfinal.op0, dfinal.op1,
20613 dfinal.perm, dfinal.nelt, false);
20614 gcc_assert (ok);
20615 return true;
20616}
20617
20618/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
20619 the permutation using two single vector permutations and the SSE4_1 pblendv
20620 instruction. If two_insn, succeed only if one of dfirst or dsecond is
20621 identity permutation. */
20622
20623static bool
20624expand_vec_perm_2perm_pblendv (struct expand_vec_perm_d *d, bool two_insn)
20625{
20626 unsigned i, nelt = d->nelt;
20627 struct expand_vec_perm_d dfirst, dsecond, dfinal;
20628 machine_mode vmode = d->vmode;
20629 bool ident1 = true, ident2 = true;
20630
20631 /* Use the same checks as in expand_vec_perm_blend. */
20632 if (d->one_operand_p)
20633 return false;
20634 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
20635 ;
20636 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
20637 ;
dd835ec2 20638 else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 16
be8749f9
UB
20639 || GET_MODE_SIZE (vmode) == 8
20640 || GET_MODE_SIZE (vmode) == 4))
829c4bea
JJ
20641 ;
20642 else
20643 return false;
20644
20645 dfirst = *d;
20646 dsecond = *d;
20647 dfinal = *d;
20648 dfirst.op1 = dfirst.op0;
20649 dfirst.one_operand_p = true;
20650 dsecond.op0 = dsecond.op1;
20651 dsecond.one_operand_p = true;
20652
20653 for (i = 0; i < nelt; ++i)
20654 if (d->perm[i] >= nelt)
20655 {
20656 dfirst.perm[i] = 0xff;
20657 dsecond.perm[i] = d->perm[i] - nelt;
20658 if (d->perm[i] != i + nelt)
20659 ident2 = false;
20660 }
20661 else
20662 {
20663 dsecond.perm[i] = 0xff;
20664 dfirst.perm[i] = d->perm[i];
20665 if (d->perm[i] != i)
20666 ident1 = false;
20667 }
20668
20669 if (two_insn && !ident1 && !ident2)
20670 return false;
20671
20672 /* For now. Ideally treat 0xff as a wildcard. */
20673 for (i = 0; i < nelt; ++i)
20674 if (dfirst.perm[i] == 0xff)
20675 {
20676 if (GET_MODE_SIZE (vmode) == 32
20677 && dfirst.perm[i ^ (nelt / 2)] != 0xff)
20678 dfirst.perm[i] = dfirst.perm[i ^ (nelt / 2)] ^ (nelt / 2);
20679 else
20680 dfirst.perm[i] = i;
20681 }
20682 else
20683 {
20684 if (GET_MODE_SIZE (vmode) == 32
20685 && dsecond.perm[i ^ (nelt / 2)] != 0xff)
20686 dsecond.perm[i] = dsecond.perm[i ^ (nelt / 2)] ^ (nelt / 2);
20687 else
20688 dsecond.perm[i] = i;
20689 }
20690
20691 if (!d->testing_p)
20692 {
20693 if (!ident1)
20694 dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
20695 if (!ident2)
20696 dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
20697 }
20698
20699 bool ok;
20700 rtx_insn *seq1 = NULL, *seq2 = NULL;
20701
20702 if (!ident1)
20703 {
20704 start_sequence ();
20705 ok = expand_vec_perm_1 (&dfirst);
20706 seq1 = get_insns ();
20707 end_sequence ();
20708
20709 if (!ok)
20710 return false;
20711 }
20712
20713 if (!ident2)
20714 {
20715 start_sequence ();
20716 ok = expand_vec_perm_1 (&dsecond);
20717 seq2 = get_insns ();
20718 end_sequence ();
20719
20720 if (!ok)
20721 return false;
20722 }
20723
20724 if (d->testing_p)
20725 return true;
20726
20727 for (i = 0; i < nelt; ++i)
20728 dfinal.perm[i] = (d->perm[i] >= nelt ? i + nelt : i);
20729
20730 emit_insn (seq1);
20731 emit_insn (seq2);
20732 ok = expand_vec_perm_blend (&dfinal);
20733 gcc_assert (ok);
20734 return true;
20735}
20736
4bf4c103 20737/* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF
2bf6d935
ML
20738 permutation using two vperm2f128, followed by a vshufpd insn blending
20739 the two vectors together. */
20740
20741static bool
20742expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
20743{
20744 struct expand_vec_perm_d dfirst, dsecond, dthird;
20745 bool ok;
20746
20747 if (!TARGET_AVX || (d->vmode != V4DFmode))
20748 return false;
20749
20750 if (d->testing_p)
20751 return true;
20752
20753 dfirst = *d;
20754 dsecond = *d;
20755 dthird = *d;
20756
20757 dfirst.perm[0] = (d->perm[0] & ~1);
20758 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
20759 dfirst.perm[2] = (d->perm[2] & ~1);
20760 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
20761 dsecond.perm[0] = (d->perm[1] & ~1);
20762 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
20763 dsecond.perm[2] = (d->perm[3] & ~1);
20764 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
20765 dthird.perm[0] = (d->perm[0] % 2);
20766 dthird.perm[1] = (d->perm[1] % 2) + 4;
20767 dthird.perm[2] = (d->perm[2] % 2) + 2;
20768 dthird.perm[3] = (d->perm[3] % 2) + 6;
20769
20770 dfirst.target = gen_reg_rtx (dfirst.vmode);
20771 dsecond.target = gen_reg_rtx (dsecond.vmode);
20772 dthird.op0 = dfirst.target;
20773 dthird.op1 = dsecond.target;
20774 dthird.one_operand_p = false;
20775
20776 canonicalize_perm (&dfirst);
20777 canonicalize_perm (&dsecond);
20778
20779 ok = expand_vec_perm_1 (&dfirst)
20780 && expand_vec_perm_1 (&dsecond)
20781 && expand_vec_perm_1 (&dthird);
20782
20783 gcc_assert (ok);
20784
20785 return true;
20786}
20787
4bf4c103
JJ
20788static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *);
20789
20790/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
20791 a two vector permutation using two intra-lane vector
20792 permutations, vperm2f128 swapping the lanes and vblend* insn blending
20793 the non-swapped and swapped vectors together. */
20794
20795static bool
20796expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d *d)
20797{
20798 struct expand_vec_perm_d dfirst, dsecond, dthird;
20799 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2, which1 = 0, which2 = 0;
20800 rtx_insn *seq1, *seq2;
20801 bool ok;
20802 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
20803
20804 if (!TARGET_AVX
20805 || TARGET_AVX2
20806 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
20807 || d->one_operand_p)
20808 return false;
20809
20810 dfirst = *d;
20811 dsecond = *d;
20812 for (i = 0; i < nelt; i++)
20813 {
20814 dfirst.perm[i] = 0xff;
20815 dsecond.perm[i] = 0xff;
20816 }
20817 for (i = 0, msk = 0; i < nelt; i++)
20818 {
20819 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
20820 if (j == i)
20821 {
20822 dfirst.perm[j] = d->perm[i];
20823 which1 |= (d->perm[i] < nelt ? 1 : 2);
20824 }
20825 else
20826 {
20827 dsecond.perm[j] = d->perm[i];
20828 which2 |= (d->perm[i] < nelt ? 1 : 2);
20829 msk |= (1U << i);
20830 }
20831 }
20832 if (msk == 0 || msk == (1U << nelt) - 1)
20833 return false;
20834
20835 if (!d->testing_p)
20836 {
20837 dfirst.target = gen_reg_rtx (dfirst.vmode);
20838 dsecond.target = gen_reg_rtx (dsecond.vmode);
20839 }
20840
20841 for (i = 0; i < nelt; i++)
20842 {
20843 if (dfirst.perm[i] == 0xff)
20844 dfirst.perm[i] = (which1 == 2 ? i + nelt : i);
20845 if (dsecond.perm[i] == 0xff)
20846 dsecond.perm[i] = (which2 == 2 ? i + nelt : i);
20847 }
20848 canonicalize_perm (&dfirst);
20849 start_sequence ();
20850 ok = ix86_expand_vec_perm_const_1 (&dfirst);
20851 seq1 = get_insns ();
20852 end_sequence ();
20853
20854 if (!ok)
20855 return false;
20856
20857 canonicalize_perm (&dsecond);
20858 start_sequence ();
20859 ok = ix86_expand_vec_perm_const_1 (&dsecond);
20860 seq2 = get_insns ();
20861 end_sequence ();
20862
20863 if (!ok)
20864 return false;
20865
20866 if (d->testing_p)
20867 return true;
20868
20869 emit_insn (seq1);
20870 emit_insn (seq2);
20871
20872 dthird = *d;
20873 dthird.op0 = dsecond.target;
20874 dthird.op1 = dsecond.target;
20875 dthird.one_operand_p = true;
20876 dthird.target = gen_reg_rtx (dthird.vmode);
20877 for (i = 0; i < nelt; i++)
20878 dthird.perm[i] = i ^ nelt2;
20879
20880 ok = expand_vec_perm_1 (&dthird);
20881 gcc_assert (ok);
20882
20883 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
20884 emit_insn (blend (d->target, dfirst.target, dthird.target, GEN_INT (msk)));
20885 return true;
20886}
20887
2bf6d935
ML
20888/* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
20889 permutation with two pshufb insns and an ior. We should have already
20890 failed all two instruction sequences. */
20891
20892static bool
20893expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
20894{
20895 rtx rperm[2][16], vperm, l, h, op, m128;
20896 unsigned int i, nelt, eltsz;
dd835ec2
UB
20897 machine_mode mode;
20898 rtx (*gen) (rtx, rtx, rtx);
2bf6d935 20899
dd835ec2 20900 if (!TARGET_SSSE3 || (GET_MODE_SIZE (d->vmode) != 16
be8749f9
UB
20901 && GET_MODE_SIZE (d->vmode) != 8
20902 && GET_MODE_SIZE (d->vmode) != 4))
2bf6d935
ML
20903 return false;
20904 gcc_assert (!d->one_operand_p);
20905
20906 if (d->testing_p)
20907 return true;
20908
dd835ec2
UB
20909 switch (GET_MODE_SIZE (d->vmode))
20910 {
be8749f9
UB
20911 case 4:
20912 mode = V4QImode;
20913 gen = gen_mmx_pshufbv4qi3;
20914 break;
dd835ec2
UB
20915 case 8:
20916 mode = V8QImode;
20917 gen = gen_mmx_pshufbv8qi3;
20918 break;
20919 case 16:
20920 mode = V16QImode;
20921 gen = gen_ssse3_pshufbv16qi3;
20922 break;
20923 default:
20924 gcc_unreachable ();
20925 }
20926
2bf6d935
ML
20927 nelt = d->nelt;
20928 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
20929
20930 /* Generate two permutation masks. If the required element is within
20931 the given vector it is shuffled into the proper lane. If the required
20932 element is in the other vector, force a zero into the lane by setting
20933 bit 7 in the permutation mask. */
20934 m128 = GEN_INT (-128);
20935 for (i = 0; i < nelt; ++i)
20936 {
dd835ec2 20937 unsigned j, k, e = d->perm[i];
2bf6d935
ML
20938 unsigned which = (e >= nelt);
20939 if (e >= nelt)
20940 e -= nelt;
20941
20942 for (j = 0; j < eltsz; ++j)
20943 {
20944 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
20945 rperm[1-which][i*eltsz + j] = m128;
20946 }
dd835ec2
UB
20947
20948 for (k = i*eltsz + j; k < 16; ++k)
20949 rperm[0][k] = rperm[1][k] = m128;
2bf6d935
ML
20950 }
20951
20952 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
20953 vperm = force_reg (V16QImode, vperm);
20954
dd835ec2
UB
20955 l = gen_reg_rtx (mode);
20956 op = gen_lowpart (mode, d->op0);
20957 emit_insn (gen (l, op, vperm));
2bf6d935
ML
20958
20959 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
20960 vperm = force_reg (V16QImode, vperm);
20961
dd835ec2
UB
20962 h = gen_reg_rtx (mode);
20963 op = gen_lowpart (mode, d->op1);
20964 emit_insn (gen (h, op, vperm));
2bf6d935
ML
20965
20966 op = d->target;
dd835ec2
UB
20967 if (d->vmode != mode)
20968 op = gen_reg_rtx (mode);
b5193e35 20969 ix86_emit_vec_binop (IOR, mode, op, l, h);
2bf6d935
ML
20970 if (op != d->target)
20971 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
20972
20973 return true;
20974}
20975
20976/* Implement arbitrary permutation of one V32QImode and V16QImode operand
20977 with two vpshufb insns, vpermq and vpor. We should have already failed
20978 all two or three instruction sequences. */
20979
20980static bool
20981expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
20982{
20983 rtx rperm[2][32], vperm, l, h, hp, op, m128;
20984 unsigned int i, nelt, eltsz;
20985
20986 if (!TARGET_AVX2
20987 || !d->one_operand_p
20988 || (d->vmode != V32QImode && d->vmode != V16HImode))
20989 return false;
20990
20991 if (d->testing_p)
20992 return true;
20993
20994 nelt = d->nelt;
20995 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
20996
20997 /* Generate two permutation masks. If the required element is within
20998 the same lane, it is shuffled in. If the required element from the
20999 other lane, force a zero by setting bit 7 in the permutation mask.
21000 In the other mask the mask has non-negative elements if element
21001 is requested from the other lane, but also moved to the other lane,
21002 so that the result of vpshufb can have the two V2TImode halves
21003 swapped. */
21004 m128 = GEN_INT (-128);
21005 for (i = 0; i < nelt; ++i)
21006 {
21007 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
21008 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
21009
21010 for (j = 0; j < eltsz; ++j)
21011 {
21012 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
21013 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
21014 }
21015 }
21016
21017 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
21018 vperm = force_reg (V32QImode, vperm);
21019
21020 h = gen_reg_rtx (V32QImode);
21021 op = gen_lowpart (V32QImode, d->op0);
21022 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
21023
21024 /* Swap the 128-byte lanes of h into hp. */
21025 hp = gen_reg_rtx (V4DImode);
21026 op = gen_lowpart (V4DImode, h);
21027 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
21028 const1_rtx));
21029
21030 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
21031 vperm = force_reg (V32QImode, vperm);
21032
21033 l = gen_reg_rtx (V32QImode);
21034 op = gen_lowpart (V32QImode, d->op0);
21035 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
21036
21037 op = d->target;
21038 if (d->vmode != V32QImode)
21039 op = gen_reg_rtx (V32QImode);
21040 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
21041 if (op != d->target)
21042 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
21043
21044 return true;
21045}
21046
21047/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
21048 and extract-odd permutations of two V32QImode and V16QImode operand
21049 with two vpshufb insns, vpor and vpermq. We should have already
21050 failed all two or three instruction sequences. */
21051
21052static bool
21053expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
21054{
21055 rtx rperm[2][32], vperm, l, h, ior, op, m128;
21056 unsigned int i, nelt, eltsz;
21057
21058 if (!TARGET_AVX2
21059 || d->one_operand_p
21060 || (d->vmode != V32QImode && d->vmode != V16HImode))
21061 return false;
21062
21063 for (i = 0; i < d->nelt; ++i)
21064 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
21065 return false;
21066
21067 if (d->testing_p)
21068 return true;
21069
21070 nelt = d->nelt;
21071 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
21072
21073 /* Generate two permutation masks. In the first permutation mask
21074 the first quarter will contain indexes for the first half
21075 of the op0, the second quarter will contain bit 7 set, third quarter
21076 will contain indexes for the second half of the op0 and the
21077 last quarter bit 7 set. In the second permutation mask
21078 the first quarter will contain bit 7 set, the second quarter
21079 indexes for the first half of the op1, the third quarter bit 7 set
21080 and last quarter indexes for the second half of the op1.
21081 I.e. the first mask e.g. for V32QImode extract even will be:
21082 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
21083 (all values masked with 0xf except for -128) and second mask
21084 for extract even will be
21085 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
21086 m128 = GEN_INT (-128);
21087 for (i = 0; i < nelt; ++i)
21088 {
21089 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
21090 unsigned which = d->perm[i] >= nelt;
21091 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
21092
21093 for (j = 0; j < eltsz; ++j)
21094 {
21095 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
21096 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
21097 }
21098 }
21099
21100 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
21101 vperm = force_reg (V32QImode, vperm);
21102
21103 l = gen_reg_rtx (V32QImode);
21104 op = gen_lowpart (V32QImode, d->op0);
21105 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
21106
21107 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
21108 vperm = force_reg (V32QImode, vperm);
21109
21110 h = gen_reg_rtx (V32QImode);
21111 op = gen_lowpart (V32QImode, d->op1);
21112 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
21113
21114 ior = gen_reg_rtx (V32QImode);
21115 emit_insn (gen_iorv32qi3 (ior, l, h));
21116
21117 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
21118 op = gen_reg_rtx (V4DImode);
21119 ior = gen_lowpart (V4DImode, ior);
21120 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
21121 const1_rtx, GEN_INT (3)));
21122 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
21123
21124 return true;
21125}
21126
fcda0efc 21127/* Implement permutation with pslldq + psrldq + por when pshufb is not
21128 available. */
21129static bool
21130expand_vec_perm_pslldq_psrldq_por (struct expand_vec_perm_d *d, bool pandn)
21131{
21132 unsigned i, nelt = d->nelt;
21133 unsigned start1, end1 = -1;
21134 machine_mode vmode = d->vmode, imode;
21135 int start2 = -1;
21136 bool clear_op0, clear_op1;
21137 unsigned inner_size;
21138 rtx op0, op1, dop1;
21139 rtx (*gen_vec_shr) (rtx, rtx, rtx);
21140 rtx (*gen_vec_shl) (rtx, rtx, rtx);
21141
21142 /* pshufd can be used for V4SI/V2DI under TARGET_SSE2. */
21143 if (!TARGET_SSE2 || (vmode != E_V16QImode && vmode != E_V8HImode))
21144 return false;
21145
21146 start1 = d->perm[0];
21147 for (i = 1; i < nelt; i++)
21148 {
69c4b5c5 21149 if (d->perm[i] != d->perm[i-1] + 1
21150 || d->perm[i] == nelt)
fcda0efc 21151 {
21152 if (start2 == -1)
21153 {
21154 start2 = d->perm[i];
21155 end1 = d->perm[i-1];
21156 }
21157 else
21158 return false;
21159 }
fcda0efc 21160 }
21161
21162 clear_op0 = end1 != nelt - 1;
21163 clear_op1 = start2 % nelt != 0;
21164 /* pandn/pand is needed to clear upper/lower bits of op0/op1. */
21165 if (!pandn && (clear_op0 || clear_op1))
21166 return false;
21167
21168 if (d->testing_p)
21169 return true;
21170
21171 gen_vec_shr = vmode == E_V16QImode ? gen_vec_shr_v16qi : gen_vec_shr_v8hi;
21172 gen_vec_shl = vmode == E_V16QImode ? gen_vec_shl_v16qi : gen_vec_shl_v8hi;
21173 imode = GET_MODE_INNER (vmode);
21174 inner_size = GET_MODE_BITSIZE (imode);
21175 op0 = gen_reg_rtx (vmode);
21176 op1 = gen_reg_rtx (vmode);
21177
21178 if (start1)
21179 emit_insn (gen_vec_shr (op0, d->op0, GEN_INT (start1 * inner_size)));
21180 else
21181 emit_move_insn (op0, d->op0);
21182
21183 dop1 = d->op1;
21184 if (d->one_operand_p)
21185 dop1 = d->op0;
21186
21187 int shl_offset = end1 - start1 + 1 - start2 % nelt;
21188 if (shl_offset)
21189 emit_insn (gen_vec_shl (op1, dop1, GEN_INT (shl_offset * inner_size)));
21190 else
21191 emit_move_insn (op1, dop1);
21192
21193 /* Clear lower/upper bits for op0/op1. */
21194 if (clear_op0 || clear_op1)
21195 {
21196 rtx vec[16];
21197 rtx const_vec;
21198 rtx clear;
21199 for (i = 0; i != nelt; i++)
21200 {
21201 if (i < (end1 - start1 + 1))
21202 vec[i] = gen_int_mode ((HOST_WIDE_INT_1U << inner_size) - 1, imode);
21203 else
21204 vec[i] = CONST0_RTX (imode);
21205 }
21206 const_vec = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, vec));
21207 const_vec = validize_mem (force_const_mem (vmode, const_vec));
21208 clear = force_reg (vmode, const_vec);
21209
21210 if (clear_op0)
21211 emit_move_insn (op0, gen_rtx_AND (vmode, op0, clear));
21212 if (clear_op1)
21213 emit_move_insn (op1, gen_rtx_AND (vmode,
21214 gen_rtx_NOT (vmode, clear),
21215 op1));
21216 }
21217
21218 emit_move_insn (d->target, gen_rtx_IOR (vmode, op0, op1));
21219 return true;
21220}
21221
2bf6d935 21222/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
a325bdd1
PB
21223 and extract-odd permutations of two V8QI, V8HI, V16QI, V16HI or V32QI
21224 operands with two "and" and "pack" or two "shift" and "pack" insns.
21225 We should have already failed all two instruction sequences. */
2bf6d935
ML
21226
21227static bool
21228expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
21229{
21230 rtx op, dop0, dop1, t;
21231 unsigned i, odd, c, s, nelt = d->nelt;
21232 bool end_perm = false;
21233 machine_mode half_mode;
21234 rtx (*gen_and) (rtx, rtx, rtx);
21235 rtx (*gen_pack) (rtx, rtx, rtx);
21236 rtx (*gen_shift) (rtx, rtx, rtx);
21237
21238 if (d->one_operand_p)
21239 return false;
21240
21241 switch (d->vmode)
21242 {
dd835ec2
UB
21243 case E_V4HImode:
21244 /* Required for "pack". */
21245 if (!TARGET_SSE4_1)
21246 return false;
21247 c = 0xffff;
21248 s = 16;
21249 half_mode = V2SImode;
21250 gen_and = gen_andv2si3;
21251 gen_pack = gen_mmx_packusdw;
21252 gen_shift = gen_lshrv2si3;
21253 break;
2bf6d935
ML
21254 case E_V8HImode:
21255 /* Required for "pack". */
21256 if (!TARGET_SSE4_1)
21257 return false;
21258 c = 0xffff;
21259 s = 16;
21260 half_mode = V4SImode;
21261 gen_and = gen_andv4si3;
21262 gen_pack = gen_sse4_1_packusdw;
21263 gen_shift = gen_lshrv4si3;
21264 break;
a325bdd1
PB
21265 case E_V8QImode:
21266 /* No check as all instructions are SSE2. */
21267 c = 0xff;
21268 s = 8;
21269 half_mode = V4HImode;
21270 gen_and = gen_andv4hi3;
21271 gen_pack = gen_mmx_packuswb;
21272 gen_shift = gen_lshrv4hi3;
21273 break;
2bf6d935
ML
21274 case E_V16QImode:
21275 /* No check as all instructions are SSE2. */
21276 c = 0xff;
21277 s = 8;
21278 half_mode = V8HImode;
21279 gen_and = gen_andv8hi3;
21280 gen_pack = gen_sse2_packuswb;
21281 gen_shift = gen_lshrv8hi3;
21282 break;
21283 case E_V16HImode:
21284 if (!TARGET_AVX2)
21285 return false;
21286 c = 0xffff;
21287 s = 16;
21288 half_mode = V8SImode;
21289 gen_and = gen_andv8si3;
21290 gen_pack = gen_avx2_packusdw;
21291 gen_shift = gen_lshrv8si3;
21292 end_perm = true;
21293 break;
21294 case E_V32QImode:
21295 if (!TARGET_AVX2)
21296 return false;
21297 c = 0xff;
21298 s = 8;
21299 half_mode = V16HImode;
21300 gen_and = gen_andv16hi3;
21301 gen_pack = gen_avx2_packuswb;
21302 gen_shift = gen_lshrv16hi3;
21303 end_perm = true;
21304 break;
21305 default:
dd835ec2 21306 /* Only V4HI, V8QI, V8HI, V16QI, V16HI and V32QI modes
a325bdd1 21307 are more profitable than general shuffles. */
2bf6d935
ML
21308 return false;
21309 }
21310
21311 /* Check that permutation is even or odd. */
21312 odd = d->perm[0];
21313 if (odd > 1)
21314 return false;
21315
21316 for (i = 1; i < nelt; ++i)
21317 if (d->perm[i] != 2 * i + odd)
21318 return false;
21319
21320 if (d->testing_p)
21321 return true;
21322
21323 dop0 = gen_reg_rtx (half_mode);
21324 dop1 = gen_reg_rtx (half_mode);
21325 if (odd == 0)
21326 {
21327 t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
21328 t = force_reg (half_mode, t);
21329 emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
21330 emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
21331 }
21332 else
21333 {
21334 emit_insn (gen_shift (dop0,
21335 gen_lowpart (half_mode, d->op0),
21336 GEN_INT (s)));
21337 emit_insn (gen_shift (dop1,
21338 gen_lowpart (half_mode, d->op1),
21339 GEN_INT (s)));
21340 }
21341 /* In AVX2 for 256 bit case we need to permute pack result. */
21342 if (TARGET_AVX2 && end_perm)
21343 {
21344 op = gen_reg_rtx (d->vmode);
21345 t = gen_reg_rtx (V4DImode);
21346 emit_insn (gen_pack (op, dop0, dop1));
21347 emit_insn (gen_avx2_permv4di_1 (t,
21348 gen_lowpart (V4DImode, op),
21349 const0_rtx,
21350 const2_rtx,
21351 const1_rtx,
21352 GEN_INT (3)));
21353 emit_move_insn (d->target, gen_lowpart (d->vmode, t));
21354 }
21355 else
21356 emit_insn (gen_pack (d->target, dop0, dop1));
21357
21358 return true;
21359}
21360
21361/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
21362 and extract-odd permutations of two V64QI operands
21363 with two "shifts", two "truncs" and one "concat" insns for "odd"
21364 and two "truncs" and one concat insn for "even."
21365 Have already failed all two instruction sequences. */
21366
21367static bool
21368expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
21369{
21370 rtx t1, t2, t3, t4;
21371 unsigned i, odd, nelt = d->nelt;
21372
21373 if (!TARGET_AVX512BW
21374 || d->one_operand_p
21375 || d->vmode != V64QImode)
21376 return false;
21377
21378 /* Check that permutation is even or odd. */
21379 odd = d->perm[0];
21380 if (odd > 1)
21381 return false;
21382
21383 for (i = 1; i < nelt; ++i)
21384 if (d->perm[i] != 2 * i + odd)
21385 return false;
21386
21387 if (d->testing_p)
21388 return true;
21389
21390
21391 if (odd)
21392 {
21393 t1 = gen_reg_rtx (V32HImode);
21394 t2 = gen_reg_rtx (V32HImode);
21395 emit_insn (gen_lshrv32hi3 (t1,
21396 gen_lowpart (V32HImode, d->op0),
21397 GEN_INT (8)));
21398 emit_insn (gen_lshrv32hi3 (t2,
21399 gen_lowpart (V32HImode, d->op1),
21400 GEN_INT (8)));
21401 }
21402 else
21403 {
21404 t1 = gen_lowpart (V32HImode, d->op0);
21405 t2 = gen_lowpart (V32HImode, d->op1);
21406 }
21407
21408 t3 = gen_reg_rtx (V32QImode);
21409 t4 = gen_reg_rtx (V32QImode);
21410 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
21411 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
21412 emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
21413
21414 return true;
21415}
21416
4bf4c103 21417/* A subroutine of ix86_expand_vec_perm_const_1. Implement extract-even
2bf6d935
ML
21418 and extract-odd permutations. */
21419
21420static bool
21421expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
21422{
21423 rtx t1, t2, t3, t4, t5;
21424
21425 switch (d->vmode)
21426 {
21427 case E_V4DFmode:
21428 if (d->testing_p)
21429 break;
21430 t1 = gen_reg_rtx (V4DFmode);
21431 t2 = gen_reg_rtx (V4DFmode);
21432
21433 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
21434 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
21435 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
21436
21437 /* Now an unpck[lh]pd will produce the result required. */
21438 if (odd)
21439 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
21440 else
21441 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
21442 emit_insn (t3);
21443 break;
21444
21445 case E_V8SFmode:
21446 {
21447 int mask = odd ? 0xdd : 0x88;
21448
21449 if (d->testing_p)
21450 break;
21451 t1 = gen_reg_rtx (V8SFmode);
21452 t2 = gen_reg_rtx (V8SFmode);
21453 t3 = gen_reg_rtx (V8SFmode);
21454
21455 /* Shuffle within the 128-bit lanes to produce:
21456 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
21457 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
21458 GEN_INT (mask)));
21459
21460 /* Shuffle the lanes around to produce:
21461 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
21462 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
21463 GEN_INT (0x3)));
21464
21465 /* Shuffle within the 128-bit lanes to produce:
21466 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
21467 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
21468
21469 /* Shuffle within the 128-bit lanes to produce:
21470 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
21471 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
21472
21473 /* Shuffle the lanes around to produce:
21474 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
21475 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
21476 GEN_INT (0x20)));
21477 }
21478 break;
21479
21480 case E_V2DFmode:
21481 case E_V4SFmode:
21482 case E_V2DImode:
9b8579a6 21483 case E_V2SImode:
2bf6d935 21484 case E_V4SImode:
8d7dae0e 21485 case E_V2HImode:
2bf6d935
ML
21486 /* These are always directly implementable by expand_vec_perm_1. */
21487 gcc_unreachable ();
21488
240198fe
UB
21489 case E_V2SFmode:
21490 gcc_assert (TARGET_MMX_WITH_SSE);
21491 /* We have no suitable instructions. */
21492 if (d->testing_p)
21493 return false;
21494 break;
21495
be8749f9
UB
21496 case E_V4QImode:
21497 if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
21498 return expand_vec_perm_pshufb2 (d);
21499 else
21500 {
21501 if (d->testing_p)
21502 break;
21503 /* We need 2*log2(N)-1 operations to achieve odd/even
21504 with interleave. */
21505 t1 = gen_reg_rtx (V4QImode);
21506 emit_insn (gen_mmx_punpckhbw_low (t1, d->op0, d->op1));
21507 emit_insn (gen_mmx_punpcklbw_low (d->target, d->op0, d->op1));
21508 if (odd)
21509 t2 = gen_mmx_punpckhbw_low (d->target, d->target, t1);
21510 else
21511 t2 = gen_mmx_punpcklbw_low (d->target, d->target, t1);
21512 emit_insn (t2);
21513 }
21514 break;
21515
9b8579a6 21516 case E_V4HImode:
dd835ec2
UB
21517 if (TARGET_SSE4_1)
21518 return expand_vec_perm_even_odd_pack (d);
21519 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
21520 return expand_vec_perm_pshufb2 (d);
9b8579a6 21521 else
dd835ec2
UB
21522 {
21523 if (d->testing_p)
21524 break;
21525 /* We need 2*log2(N)-1 operations to achieve odd/even
21526 with interleave. */
21527 t1 = gen_reg_rtx (V4HImode);
21528 emit_insn (gen_mmx_punpckhwd (t1, d->op0, d->op1));
21529 emit_insn (gen_mmx_punpcklwd (d->target, d->op0, d->op1));
21530 if (odd)
21531 t2 = gen_mmx_punpckhwd (d->target, d->target, t1);
21532 else
21533 t2 = gen_mmx_punpcklwd (d->target, d->target, t1);
21534 emit_insn (t2);
21535 }
9b8579a6
UB
21536 break;
21537
2bf6d935
ML
21538 case E_V8HImode:
21539 if (TARGET_SSE4_1)
21540 return expand_vec_perm_even_odd_pack (d);
21541 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
21542 return expand_vec_perm_pshufb2 (d);
21543 else
21544 {
21545 if (d->testing_p)
21546 break;
21547 /* We need 2*log2(N)-1 operations to achieve odd/even
21548 with interleave. */
21549 t1 = gen_reg_rtx (V8HImode);
21550 t2 = gen_reg_rtx (V8HImode);
21551 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
21552 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
21553 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
21554 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
21555 if (odd)
21556 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
21557 else
21558 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
21559 emit_insn (t3);
21560 }
21561 break;
21562
a325bdd1 21563 case E_V8QImode:
2bf6d935
ML
21564 case E_V16QImode:
21565 return expand_vec_perm_even_odd_pack (d);
21566
21567 case E_V16HImode:
21568 case E_V32QImode:
21569 return expand_vec_perm_even_odd_pack (d);
21570
21571 case E_V64QImode:
21572 return expand_vec_perm_even_odd_trunc (d);
21573
21574 case E_V4DImode:
21575 if (!TARGET_AVX2)
21576 {
21577 struct expand_vec_perm_d d_copy = *d;
21578 d_copy.vmode = V4DFmode;
21579 if (d->testing_p)
21580 d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
21581 else
21582 d_copy.target = gen_reg_rtx (V4DFmode);
21583 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
21584 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
21585 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
21586 {
21587 if (!d->testing_p)
21588 emit_move_insn (d->target,
21589 gen_lowpart (V4DImode, d_copy.target));
21590 return true;
21591 }
21592 return false;
21593 }
21594
21595 if (d->testing_p)
21596 break;
21597
21598 t1 = gen_reg_rtx (V4DImode);
21599 t2 = gen_reg_rtx (V4DImode);
21600
21601 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
21602 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
21603 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
21604
21605 /* Now an vpunpck[lh]qdq will produce the result required. */
21606 if (odd)
21607 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
21608 else
21609 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
21610 emit_insn (t3);
21611 break;
21612
21613 case E_V8SImode:
21614 if (!TARGET_AVX2)
21615 {
21616 struct expand_vec_perm_d d_copy = *d;
21617 d_copy.vmode = V8SFmode;
21618 if (d->testing_p)
21619 d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
21620 else
21621 d_copy.target = gen_reg_rtx (V8SFmode);
21622 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
21623 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
21624 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
21625 {
21626 if (!d->testing_p)
21627 emit_move_insn (d->target,
21628 gen_lowpart (V8SImode, d_copy.target));
21629 return true;
21630 }
21631 return false;
21632 }
21633
21634 if (d->testing_p)
21635 break;
21636
21637 t1 = gen_reg_rtx (V8SImode);
21638 t2 = gen_reg_rtx (V8SImode);
21639 t3 = gen_reg_rtx (V4DImode);
21640 t4 = gen_reg_rtx (V4DImode);
21641 t5 = gen_reg_rtx (V4DImode);
21642
21643 /* Shuffle the lanes around into
21644 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
21645 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
21646 gen_lowpart (V4DImode, d->op1),
21647 GEN_INT (0x20)));
21648 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
21649 gen_lowpart (V4DImode, d->op1),
21650 GEN_INT (0x31)));
21651
21652 /* Swap the 2nd and 3rd position in each lane into
21653 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
21654 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
21655 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
21656 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
21657 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
21658
21659 /* Now an vpunpck[lh]qdq will produce
21660 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
21661 if (odd)
21662 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
21663 gen_lowpart (V4DImode, t2));
21664 else
21665 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
21666 gen_lowpart (V4DImode, t2));
21667 emit_insn (t3);
21668 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
21669 break;
21670
21671 default:
21672 gcc_unreachable ();
21673 }
21674
21675 return true;
21676}
21677
4bf4c103 21678/* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
2bf6d935
ML
21679 extract-even and extract-odd permutations. */
21680
21681static bool
21682expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
21683{
21684 unsigned i, odd, nelt = d->nelt;
21685
21686 odd = d->perm[0];
21687 if (odd != 0 && odd != 1)
21688 return false;
21689
21690 for (i = 1; i < nelt; ++i)
21691 if (d->perm[i] != 2 * i + odd)
21692 return false;
21693
50b58779
JJ
21694 if (d->vmode == E_V32HImode
21695 && d->testing_p
21696 && !TARGET_AVX512BW)
21697 return false;
21698
2bf6d935
ML
21699 return expand_vec_perm_even_odd_1 (d, odd);
21700}
21701
4bf4c103 21702/* A subroutine of ix86_expand_vec_perm_const_1. Implement broadcast
2bf6d935
ML
21703 permutations. We assume that expand_vec_perm_1 has already failed. */
21704
21705static bool
21706expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
21707{
21708 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
21709 machine_mode vmode = d->vmode;
be8749f9 21710 rtx (*gen) (rtx, rtx, rtx);
2bf6d935
ML
21711 unsigned char perm2[4];
21712 rtx op0 = d->op0, dest;
21713 bool ok;
21714
21715 switch (vmode)
21716 {
21717 case E_V4DFmode:
21718 case E_V8SFmode:
21719 /* These are special-cased in sse.md so that we can optionally
21720 use the vbroadcast instruction. They expand to two insns
21721 if the input happens to be in a register. */
21722 gcc_unreachable ();
21723
21724 case E_V2DFmode:
240198fe 21725 case E_V2SFmode:
2bf6d935 21726 case E_V4SFmode:
240198fe 21727 case E_V2DImode:
9b8579a6 21728 case E_V2SImode:
2bf6d935 21729 case E_V4SImode:
8d7dae0e
UB
21730 case E_V2HImode:
21731 case E_V4HImode:
2bf6d935
ML
21732 /* These are always implementable using standard shuffle patterns. */
21733 gcc_unreachable ();
21734
be8749f9
UB
21735 case E_V4QImode:
21736 /* This can be implemented via interleave and pshuflw. */
21737 if (d->testing_p)
21738 return true;
21739
21740 if (elt >= nelt2)
21741 {
21742 gen = gen_mmx_punpckhbw_low;
21743 elt -= nelt2;
21744 }
21745 else
21746 gen = gen_mmx_punpcklbw_low;
21747
21748 dest = gen_reg_rtx (vmode);
21749 emit_insn (gen (dest, op0, op0));
21750 vmode = get_mode_wider_vector (vmode);
21751 op0 = gen_lowpart (vmode, dest);
21752
21753 memset (perm2, elt, 2);
21754 dest = gen_reg_rtx (vmode);
21755 ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
21756 gcc_assert (ok);
21757
21758 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
21759 return true;
21760
a325bdd1 21761 case E_V8QImode:
be8749f9 21762 /* This can be implemented via interleave. We save one insn by
a325bdd1
PB
21763 stopping once we have promoted to V2SImode and then use pshufd. */
21764 if (d->testing_p)
21765 return true;
21766 do
21767 {
a325bdd1
PB
21768 if (elt >= nelt2)
21769 {
21770 gen = vmode == V8QImode ? gen_mmx_punpckhbw
21771 : gen_mmx_punpckhwd;
21772 elt -= nelt2;
21773 }
be8749f9
UB
21774 else
21775 gen = vmode == V8QImode ? gen_mmx_punpcklbw
21776 : gen_mmx_punpcklwd;
a325bdd1
PB
21777 nelt2 /= 2;
21778
21779 dest = gen_reg_rtx (vmode);
21780 emit_insn (gen (dest, op0, op0));
21781 vmode = get_mode_wider_vector (vmode);
21782 op0 = gen_lowpart (vmode, dest);
21783 }
21784 while (vmode != V2SImode);
21785
21786 memset (perm2, elt, 2);
be8749f9 21787 dest = gen_reg_rtx (vmode);
a325bdd1
PB
21788 ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
21789 gcc_assert (ok);
be8749f9
UB
21790
21791 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
a325bdd1
PB
21792 return true;
21793
2bf6d935
ML
21794 case E_V8HImode:
21795 case E_V16QImode:
21796 /* These can be implemented via interleave. We save one insn by
21797 stopping once we have promoted to V4SImode and then use pshufd. */
21798 if (d->testing_p)
21799 return true;
21800 do
21801 {
2bf6d935
ML
21802 if (elt >= nelt2)
21803 {
21804 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
21805 : gen_vec_interleave_highv8hi;
21806 elt -= nelt2;
21807 }
be8749f9
UB
21808 else
21809 gen = vmode == V16QImode ? gen_vec_interleave_lowv16qi
21810 : gen_vec_interleave_lowv8hi;
2bf6d935
ML
21811 nelt2 /= 2;
21812
21813 dest = gen_reg_rtx (vmode);
21814 emit_insn (gen (dest, op0, op0));
21815 vmode = get_mode_wider_vector (vmode);
21816 op0 = gen_lowpart (vmode, dest);
21817 }
21818 while (vmode != V4SImode);
21819
21820 memset (perm2, elt, 4);
be8749f9 21821 dest = gen_reg_rtx (vmode);
2bf6d935
ML
21822 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
21823 gcc_assert (ok);
be8749f9
UB
21824
21825 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
2bf6d935
ML
21826 return true;
21827
7a54d3de
UB
21828 case E_V8HFmode:
21829 /* This can be implemented via interleave and pshufd. */
21830 if (d->testing_p)
21831 return true;
21832
21833 if (elt >= nelt2)
21834 {
21835 gen = gen_vec_interleave_highv8hf;
21836 elt -= nelt2;
21837 }
21838 else
21839 gen = gen_vec_interleave_lowv8hf;
21840 nelt2 /= 2;
21841
21842 dest = gen_reg_rtx (vmode);
21843 emit_insn (gen (dest, op0, op0));
21844
21845 vmode = V4SImode;
21846 op0 = gen_lowpart (vmode, dest);
21847
21848 memset (perm2, elt, 4);
21849 dest = gen_reg_rtx (vmode);
21850 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
21851 gcc_assert (ok);
21852
21853 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
21854 return true;
21855
2bf6d935
ML
21856 case E_V32QImode:
21857 case E_V16HImode:
21858 case E_V8SImode:
21859 case E_V4DImode:
21860 /* For AVX2 broadcasts of the first element vpbroadcast* or
21861 vpermq should be used by expand_vec_perm_1. */
21862 gcc_assert (!TARGET_AVX2 || d->perm[0]);
21863 return false;
21864
240f0780
JJ
21865 case E_V64QImode:
21866 gcc_assert (!TARGET_AVX512BW || d->perm[0]);
21867 return false;
21868
04b4f315
JJ
21869 case E_V32HImode:
21870 gcc_assert (!TARGET_AVX512BW);
21871 return false;
21872
2bf6d935
ML
21873 default:
21874 gcc_unreachable ();
21875 }
21876}
21877
4bf4c103 21878/* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
2bf6d935
ML
21879 broadcast permutations. */
21880
21881static bool
21882expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
21883{
21884 unsigned i, elt, nelt = d->nelt;
21885
21886 if (!d->one_operand_p)
21887 return false;
21888
21889 elt = d->perm[0];
21890 for (i = 1; i < nelt; ++i)
21891 if (d->perm[i] != elt)
21892 return false;
21893
21894 return expand_vec_perm_broadcast_1 (d);
21895}
21896
21897/* Implement arbitrary permutations of two V64QImode operands
21898 with 2 vperm[it]2w, 2 vpshufb and one vpor instruction. */
21899static bool
21900expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
21901{
21902 if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
21903 return false;
21904
21905 if (d->testing_p)
21906 return true;
21907
21908 struct expand_vec_perm_d ds[2];
21909 rtx rperm[128], vperm, target0, target1;
21910 unsigned int i, nelt;
21911 machine_mode vmode;
21912
21913 nelt = d->nelt;
21914 vmode = V64QImode;
21915
21916 for (i = 0; i < 2; i++)
21917 {
21918 ds[i] = *d;
21919 ds[i].vmode = V32HImode;
21920 ds[i].nelt = 32;
21921 ds[i].target = gen_reg_rtx (V32HImode);
21922 ds[i].op0 = gen_lowpart (V32HImode, d->op0);
21923 ds[i].op1 = gen_lowpart (V32HImode, d->op1);
21924 }
21925
21926 /* Prepare permutations such that the first one takes care of
21927 putting the even bytes into the right positions or one higher
21928 positions (ds[0]) and the second one takes care of
21929 putting the odd bytes into the right positions or one below
21930 (ds[1]). */
21931
21932 for (i = 0; i < nelt; i++)
21933 {
21934 ds[i & 1].perm[i / 2] = d->perm[i] / 2;
21935 if (i & 1)
21936 {
21937 rperm[i] = constm1_rtx;
21938 rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
21939 }
21940 else
21941 {
21942 rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
21943 rperm[i + 64] = constm1_rtx;
21944 }
21945 }
21946
21947 bool ok = expand_vec_perm_1 (&ds[0]);
21948 gcc_assert (ok);
21949 ds[0].target = gen_lowpart (V64QImode, ds[0].target);
21950
21951 ok = expand_vec_perm_1 (&ds[1]);
21952 gcc_assert (ok);
21953 ds[1].target = gen_lowpart (V64QImode, ds[1].target);
21954
21955 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
21956 vperm = force_reg (vmode, vperm);
21957 target0 = gen_reg_rtx (V64QImode);
21958 emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
21959
21960 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
21961 vperm = force_reg (vmode, vperm);
21962 target1 = gen_reg_rtx (V64QImode);
21963 emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
21964
21965 emit_insn (gen_iorv64qi3 (d->target, target0, target1));
21966 return true;
21967}
21968
21969/* Implement arbitrary permutation of two V32QImode and V16QImode operands
21970 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
21971 all the shorter instruction sequences. */
21972
21973static bool
21974expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
21975{
21976 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
21977 unsigned int i, nelt, eltsz;
21978 bool used[4];
21979
21980 if (!TARGET_AVX2
21981 || d->one_operand_p
21982 || (d->vmode != V32QImode && d->vmode != V16HImode))
21983 return false;
21984
21985 if (d->testing_p)
21986 return true;
21987
21988 nelt = d->nelt;
21989 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
21990
21991 /* Generate 4 permutation masks. If the required element is within
21992 the same lane, it is shuffled in. If the required element from the
21993 other lane, force a zero by setting bit 7 in the permutation mask.
21994 In the other mask the mask has non-negative elements if element
21995 is requested from the other lane, but also moved to the other lane,
21996 so that the result of vpshufb can have the two V2TImode halves
21997 swapped. */
21998 m128 = GEN_INT (-128);
21999 for (i = 0; i < 32; ++i)
22000 {
22001 rperm[0][i] = m128;
22002 rperm[1][i] = m128;
22003 rperm[2][i] = m128;
22004 rperm[3][i] = m128;
22005 }
22006 used[0] = false;
22007 used[1] = false;
22008 used[2] = false;
22009 used[3] = false;
22010 for (i = 0; i < nelt; ++i)
22011 {
22012 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
22013 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
22014 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
22015
22016 for (j = 0; j < eltsz; ++j)
22017 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
22018 used[which] = true;
22019 }
22020
22021 for (i = 0; i < 2; ++i)
22022 {
22023 if (!used[2 * i + 1])
22024 {
22025 h[i] = NULL_RTX;
22026 continue;
22027 }
22028 vperm = gen_rtx_CONST_VECTOR (V32QImode,
22029 gen_rtvec_v (32, rperm[2 * i + 1]));
22030 vperm = force_reg (V32QImode, vperm);
22031 h[i] = gen_reg_rtx (V32QImode);
22032 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
22033 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
22034 }
22035
22036 /* Swap the 128-byte lanes of h[X]. */
22037 for (i = 0; i < 2; ++i)
22038 {
22039 if (h[i] == NULL_RTX)
22040 continue;
22041 op = gen_reg_rtx (V4DImode);
22042 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
22043 const2_rtx, GEN_INT (3), const0_rtx,
22044 const1_rtx));
22045 h[i] = gen_lowpart (V32QImode, op);
22046 }
22047
22048 for (i = 0; i < 2; ++i)
22049 {
22050 if (!used[2 * i])
22051 {
22052 l[i] = NULL_RTX;
22053 continue;
22054 }
22055 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
22056 vperm = force_reg (V32QImode, vperm);
22057 l[i] = gen_reg_rtx (V32QImode);
22058 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
22059 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
22060 }
22061
22062 for (i = 0; i < 2; ++i)
22063 {
22064 if (h[i] && l[i])
22065 {
22066 op = gen_reg_rtx (V32QImode);
22067 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
22068 l[i] = op;
22069 }
22070 else if (h[i])
22071 l[i] = h[i];
22072 }
22073
22074 gcc_assert (l[0] && l[1]);
22075 op = d->target;
22076 if (d->vmode != V32QImode)
22077 op = gen_reg_rtx (V32QImode);
22078 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
22079 if (op != d->target)
22080 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
22081 return true;
22082}
22083
22084/* The guts of ix86_vectorize_vec_perm_const. With all of the interface bits
22085 taken care of, perform the expansion in D and return true on success. */
22086
22087static bool
22088ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
22089{
22090 /* Try a single instruction expansion. */
22091 if (expand_vec_perm_1 (d))
22092 return true;
22093
22094 /* Try sequences of two instructions. */
22095
22096 if (expand_vec_perm_pshuflw_pshufhw (d))
22097 return true;
22098
22099 if (expand_vec_perm_palignr (d, false))
22100 return true;
22101
22102 if (expand_vec_perm_interleave2 (d))
22103 return true;
22104
22105 if (expand_vec_perm_broadcast (d))
22106 return true;
22107
22108 if (expand_vec_perm_vpermq_perm_1 (d))
22109 return true;
22110
22111 if (expand_vec_perm_vperm2f128 (d))
22112 return true;
22113
22114 if (expand_vec_perm_pblendv (d))
22115 return true;
22116
829c4bea
JJ
22117 if (expand_vec_perm_2perm_interleave (d, true))
22118 return true;
22119
22120 if (expand_vec_perm_2perm_pblendv (d, true))
22121 return true;
22122
2bf6d935
ML
22123 /* Try sequences of three instructions. */
22124
22125 if (expand_vec_perm_even_odd_pack (d))
22126 return true;
22127
22128 if (expand_vec_perm_2vperm2f128_vshuf (d))
22129 return true;
22130
22131 if (expand_vec_perm_pshufb2 (d))
22132 return true;
22133
fcda0efc 22134 if (expand_vec_perm_pslldq_psrldq_por (d, false))
22135 return true;
22136
2bf6d935
ML
22137 if (expand_vec_perm_interleave3 (d))
22138 return true;
22139
22140 if (expand_vec_perm_vperm2f128_vblend (d))
22141 return true;
22142
829c4bea
JJ
22143 if (expand_vec_perm_2perm_interleave (d, false))
22144 return true;
22145
22146 if (expand_vec_perm_2perm_pblendv (d, false))
22147 return true;
22148
2bf6d935
ML
22149 /* Try sequences of four instructions. */
22150
22151 if (expand_vec_perm_even_odd_trunc (d))
22152 return true;
22153 if (expand_vec_perm_vpshufb2_vpermq (d))
22154 return true;
22155
22156 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
22157 return true;
22158
22159 if (expand_vec_perm_vpermt2_vpshub2 (d))
22160 return true;
22161
22162 /* ??? Look for narrow permutations whose element orderings would
22163 allow the promotion to a wider mode. */
22164
22165 /* ??? Look for sequences of interleave or a wider permute that place
22166 the data into the correct lanes for a half-vector shuffle like
22167 pshuf[lh]w or vpermilps. */
22168
22169 /* ??? Look for sequences of interleave that produce the desired results.
22170 The combinatorics of punpck[lh] get pretty ugly... */
22171
22172 if (expand_vec_perm_even_odd (d))
22173 return true;
22174
fcda0efc 22175 /* Generate four or five instructions. */
22176 if (expand_vec_perm_pslldq_psrldq_por (d, true))
22177 return true;
22178
2bf6d935
ML
22179 /* Even longer sequences. */
22180 if (expand_vec_perm_vpshufb4_vpermq2 (d))
22181 return true;
22182
22183 /* See if we can get the same permutation in different vector integer
22184 mode. */
22185 struct expand_vec_perm_d nd;
22186 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
22187 {
22188 if (!d->testing_p)
22189 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
22190 return true;
22191 }
22192
4bf4c103
JJ
22193 /* Even longer, including recursion to ix86_expand_vec_perm_const_1. */
22194 if (expand_vec_perm2_vperm2f128_vblend (d))
22195 return true;
22196
2bf6d935
ML
22197 return false;
22198}
22199
22200/* If a permutation only uses one operand, make it clear. Returns true
22201 if the permutation references both operands. */
22202
22203static bool
22204canonicalize_perm (struct expand_vec_perm_d *d)
22205{
22206 int i, which, nelt = d->nelt;
22207
22208 for (i = which = 0; i < nelt; ++i)
4bf4c103 22209 which |= (d->perm[i] < nelt ? 1 : 2);
2bf6d935
ML
22210
22211 d->one_operand_p = true;
22212 switch (which)
22213 {
22214 default:
22215 gcc_unreachable();
22216
22217 case 3:
22218 if (!rtx_equal_p (d->op0, d->op1))
22219 {
22220 d->one_operand_p = false;
22221 break;
22222 }
22223 /* The elements of PERM do not suggest that only the first operand
22224 is used, but both operands are identical. Allow easier matching
22225 of the permutation by folding the permutation into the single
22226 input vector. */
22227 /* FALLTHRU */
22228
22229 case 2:
22230 for (i = 0; i < nelt; ++i)
22231 d->perm[i] &= nelt - 1;
22232 d->op0 = d->op1;
22233 break;
22234
22235 case 1:
22236 d->op1 = d->op0;
22237 break;
22238 }
22239
22240 return (which == 3);
22241}
22242
22243/* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
22244
22245bool
ae8decf1
PK
22246ix86_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
22247 rtx target, rtx op0, rtx op1,
22248 const vec_perm_indices &sel)
2bf6d935 22249{
ae8decf1
PK
22250 if (vmode != op_mode)
22251 return false;
22252
2bf6d935
ML
22253 struct expand_vec_perm_d d;
22254 unsigned char perm[MAX_VECT_LEN];
22255 unsigned int i, nelt, which;
22256 bool two_args;
22257
be072bfa
HW
22258 /* For HF mode vector, convert it to HI using subreg. */
22259 if (GET_MODE_INNER (vmode) == HFmode)
22260 {
22261 machine_mode orig_mode = vmode;
22262 vmode = mode_for_vector (HImode,
22263 GET_MODE_NUNITS (vmode)).require ();
22264 if (target)
22265 target = lowpart_subreg (vmode, target, orig_mode);
22266 if (op0)
22267 op0 = lowpart_subreg (vmode, op0, orig_mode);
22268 if (op1)
22269 op1 = lowpart_subreg (vmode, op1, orig_mode);
22270 }
22271
2bf6d935
ML
22272 d.target = target;
22273 d.op0 = op0;
22274 d.op1 = op1;
22275
22276 d.vmode = vmode;
22277 gcc_assert (VECTOR_MODE_P (d.vmode));
22278 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
22279 d.testing_p = !target;
22280
22281 gcc_assert (sel.length () == nelt);
22282 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
22283
22284 /* Given sufficient ISA support we can just return true here
22285 for selected vector modes. */
22286 switch (d.vmode)
22287 {
22288 case E_V16SFmode:
22289 case E_V16SImode:
22290 case E_V8DImode:
22291 case E_V8DFmode:
22292 if (!TARGET_AVX512F)
22293 return false;
22294 /* All implementable with a single vperm[it]2 insn. */
22295 if (d.testing_p)
22296 return true;
22297 break;
22298 case E_V32HImode:
50b58779 22299 if (!TARGET_AVX512F)
2bf6d935 22300 return false;
50b58779 22301 if (d.testing_p && TARGET_AVX512BW)
2bf6d935
ML
22302 /* All implementable with a single vperm[it]2 insn. */
22303 return true;
22304 break;
22305 case E_V64QImode:
50b58779 22306 if (!TARGET_AVX512F)
2bf6d935 22307 return false;
50b58779 22308 if (d.testing_p && TARGET_AVX512BW)
2bf6d935
ML
22309 /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */
22310 return true;
22311 break;
22312 case E_V8SImode:
22313 case E_V8SFmode:
22314 case E_V4DFmode:
22315 case E_V4DImode:
22316 if (!TARGET_AVX)
22317 return false;
22318 if (d.testing_p && TARGET_AVX512VL)
22319 /* All implementable with a single vperm[it]2 insn. */
22320 return true;
22321 break;
22322 case E_V16HImode:
22323 if (!TARGET_SSE2)
22324 return false;
22325 if (d.testing_p && TARGET_AVX2)
22326 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
22327 return true;
22328 break;
22329 case E_V32QImode:
22330 if (!TARGET_SSE2)
22331 return false;
22332 if (d.testing_p && TARGET_AVX2)
22333 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
22334 return true;
22335 break;
22336 case E_V8HImode:
22337 case E_V16QImode:
22338 if (!TARGET_SSE2)
22339 return false;
22340 /* Fall through. */
22341 case E_V4SImode:
22342 case E_V4SFmode:
22343 if (!TARGET_SSE)
22344 return false;
22345 /* All implementable with a single vpperm insn. */
22346 if (d.testing_p && TARGET_XOP)
22347 return true;
22348 /* All implementable with 2 pshufb + 1 ior. */
22349 if (d.testing_p && TARGET_SSSE3)
22350 return true;
22351 break;
240198fe 22352 case E_V2SFmode:
9b8579a6
UB
22353 case E_V2SImode:
22354 case E_V4HImode:
a325bdd1 22355 case E_V8QImode:
9b8579a6
UB
22356 if (!TARGET_MMX_WITH_SSE)
22357 return false;
22358 break;
8d7dae0e 22359 case E_V2HImode:
4986946f
UB
22360 if (!TARGET_SSE2)
22361 return false;
22362 /* All implementable with *punpckwd. */
22363 if (d.testing_p)
22364 return true;
22365 break;
be8749f9
UB
22366 case E_V4QImode:
22367 if (!TARGET_SSE2)
22368 return false;
22369 break;
2bf6d935
ML
22370 case E_V2DImode:
22371 case E_V2DFmode:
22372 if (!TARGET_SSE)
22373 return false;
22374 /* All implementable with shufpd or unpck[lh]pd. */
22375 if (d.testing_p)
22376 return true;
22377 break;
22378 default:
22379 return false;
22380 }
22381
22382 for (i = which = 0; i < nelt; ++i)
22383 {
22384 unsigned char e = sel[i];
22385 gcc_assert (e < 2 * nelt);
22386 d.perm[i] = e;
22387 perm[i] = e;
22388 which |= (e < nelt ? 1 : 2);
22389 }
22390
22391 if (d.testing_p)
22392 {
22393 /* For all elements from second vector, fold the elements to first. */
22394 if (which == 2)
22395 for (i = 0; i < nelt; ++i)
22396 d.perm[i] -= nelt;
22397
22398 /* Check whether the mask can be applied to the vector type. */
22399 d.one_operand_p = (which != 3);
22400
8d7dae0e 22401 /* Implementable with shufps, pshufd or pshuflw. */
9b8579a6 22402 if (d.one_operand_p
240198fe 22403 && (d.vmode == V4SFmode || d.vmode == V2SFmode
8d7dae0e
UB
22404 || d.vmode == V4SImode || d.vmode == V2SImode
22405 || d.vmode == V4HImode || d.vmode == V2HImode))
2bf6d935
ML
22406 return true;
22407
22408 /* Otherwise we have to go through the motions and see if we can
22409 figure out how to generate the requested permutation. */
22410 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
22411 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
22412 if (!d.one_operand_p)
22413 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
22414
22415 start_sequence ();
22416 bool ret = ix86_expand_vec_perm_const_1 (&d);
22417 end_sequence ();
22418
22419 return ret;
22420 }
22421
22422 two_args = canonicalize_perm (&d);
22423
b1d1e2b5
JJ
22424 /* If one of the operands is a zero vector, try to match pmovzx. */
22425 if (two_args && (d.op0 == CONST0_RTX (vmode) || d.op1 == CONST0_RTX (vmode)))
22426 {
22427 struct expand_vec_perm_d dzero = d;
22428 if (d.op0 == CONST0_RTX (vmode))
22429 {
22430 d.op1 = dzero.op1 = force_reg (vmode, d.op1);
22431 std::swap (dzero.op0, dzero.op1);
22432 for (i = 0; i < nelt; ++i)
22433 dzero.perm[i] ^= nelt;
22434 }
22435 else
22436 d.op0 = dzero.op0 = force_reg (vmode, d.op0);
22437
22438 if (expand_vselect_vconcat (dzero.target, dzero.op0, dzero.op1,
22439 dzero.perm, nelt, dzero.testing_p))
22440 return true;
22441 }
22442
22443 /* Force operands into registers. */
22444 rtx nop0 = force_reg (vmode, d.op0);
22445 if (d.op0 == d.op1)
22446 d.op1 = nop0;
22447 d.op0 = nop0;
22448 d.op1 = force_reg (vmode, d.op1);
22449
2bf6d935
ML
22450 if (ix86_expand_vec_perm_const_1 (&d))
22451 return true;
22452
22453 /* If the selector says both arguments are needed, but the operands are the
22454 same, the above tried to expand with one_operand_p and flattened selector.
22455 If that didn't work, retry without one_operand_p; we succeeded with that
22456 during testing. */
22457 if (two_args && d.one_operand_p)
22458 {
22459 d.one_operand_p = false;
22460 memcpy (d.perm, perm, sizeof (perm));
22461 return ix86_expand_vec_perm_const_1 (&d);
22462 }
22463
22464 return false;
22465}
22466
22467void
22468ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
22469{
22470 struct expand_vec_perm_d d;
22471 unsigned i, nelt;
22472
22473 d.target = targ;
22474 d.op0 = op0;
22475 d.op1 = op1;
22476 d.vmode = GET_MODE (targ);
22477 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
22478 d.one_operand_p = false;
22479 d.testing_p = false;
22480
22481 for (i = 0; i < nelt; ++i)
22482 d.perm[i] = i * 2 + odd;
22483
22484 /* We'll either be able to implement the permutation directly... */
22485 if (expand_vec_perm_1 (&d))
22486 return;
22487
22488 /* ... or we use the special-case patterns. */
22489 expand_vec_perm_even_odd_1 (&d, odd);
22490}
22491
22492static void
22493ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
22494{
22495 struct expand_vec_perm_d d;
22496 unsigned i, nelt, base;
22497 bool ok;
22498
22499 d.target = targ;
22500 d.op0 = op0;
22501 d.op1 = op1;
22502 d.vmode = GET_MODE (targ);
22503 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
22504 d.one_operand_p = false;
22505 d.testing_p = false;
22506
22507 base = high_p ? nelt / 2 : 0;
22508 for (i = 0; i < nelt / 2; ++i)
22509 {
22510 d.perm[i * 2] = i + base;
22511 d.perm[i * 2 + 1] = i + base + nelt;
22512 }
22513
22514 /* Note that for AVX this isn't one instruction. */
22515 ok = ix86_expand_vec_perm_const_1 (&d);
22516 gcc_assert (ok);
22517}
22518
3bd86940 22519/* This function is similar as ix86_expand_vecop_qihi,
22520 but optimized under AVX512BW by using vpmovwb.
22521 For example, optimize vector MUL generation like
54cdb2f5 22522
22523 vpmovzxbw ymm2, xmm0
22524 vpmovzxbw ymm3, xmm1
22525 vpmullw ymm4, ymm2, ymm3
22526 vpmovwb xmm0, ymm4
22527
22528 it would take less instructions than ix86_expand_vecop_qihi.
22529 Return true if success. */
22530
3bd86940 22531static bool
22532ix86_expand_vecop_qihi2 (enum rtx_code code, rtx dest, rtx op1, rtx op2)
54cdb2f5 22533{
22534 machine_mode himode, qimode = GET_MODE (dest);
22535 rtx hop1, hop2, hdest;
22536 rtx (*gen_extend)(rtx, rtx);
22537 rtx (*gen_truncate)(rtx, rtx);
3bd86940 22538 bool uns_p = (code == ASHIFTRT) ? false : true;
54cdb2f5 22539
22540 /* There's no V64HImode multiplication instruction. */
22541 if (qimode == E_V64QImode)
22542 return false;
22543
22544 /* vpmovwb only available under AVX512BW. */
22545 if (!TARGET_AVX512BW)
22546 return false;
22547 if ((qimode == V8QImode || qimode == V16QImode)
22548 && !TARGET_AVX512VL)
22549 return false;
22550 /* Not generate zmm instruction when prefer 128/256 bit vector width. */
22551 if (qimode == V32QImode
22552 && (TARGET_PREFER_AVX128 || TARGET_PREFER_AVX256))
22553 return false;
22554
22555 switch (qimode)
22556 {
22557 case E_V8QImode:
22558 himode = V8HImode;
3bd86940 22559 gen_extend = uns_p ? gen_zero_extendv8qiv8hi2 : gen_extendv8qiv8hi2;
54cdb2f5 22560 gen_truncate = gen_truncv8hiv8qi2;
22561 break;
22562 case E_V16QImode:
22563 himode = V16HImode;
3bd86940 22564 gen_extend = uns_p ? gen_zero_extendv16qiv16hi2 : gen_extendv16qiv16hi2;
54cdb2f5 22565 gen_truncate = gen_truncv16hiv16qi2;
22566 break;
22567 case E_V32QImode:
22568 himode = V32HImode;
3bd86940 22569 gen_extend = uns_p ? gen_zero_extendv32qiv32hi2 : gen_extendv32qiv32hi2;
54cdb2f5 22570 gen_truncate = gen_truncv32hiv32qi2;
22571 break;
22572 default:
22573 gcc_unreachable ();
22574 }
22575
22576 hop1 = gen_reg_rtx (himode);
22577 hop2 = gen_reg_rtx (himode);
22578 hdest = gen_reg_rtx (himode);
22579 emit_insn (gen_extend (hop1, op1));
22580 emit_insn (gen_extend (hop2, op2));
3bd86940 22581 emit_insn (gen_rtx_SET (hdest, simplify_gen_binary (code, himode,
54cdb2f5 22582 hop1, hop2)));
22583 emit_insn (gen_truncate (dest, hdest));
22584 return true;
22585}
2bf6d935 22586
c7199fb6 22587/* Expand a vector operation shift by constant for a V*QImode in terms of the
22588 same operation on V*HImode. Return true if success. */
3bd86940 22589static bool
22590ix86_expand_vec_shift_qihi_constant (enum rtx_code code,
22591 rtx dest, rtx op1, rtx op2)
c7199fb6 22592{
22593 machine_mode qimode, himode;
c44c2a3b 22594 HOST_WIDE_INT and_constant, xor_constant;
c7199fb6 22595 HOST_WIDE_INT shift_amount;
22596 rtx vec_const_and, vec_const_xor;
22597 rtx tmp, op1_subreg;
22598 rtx (*gen_shift) (rtx, rtx, rtx);
22599 rtx (*gen_and) (rtx, rtx, rtx);
22600 rtx (*gen_xor) (rtx, rtx, rtx);
22601 rtx (*gen_sub) (rtx, rtx, rtx);
22602
22603 /* Only optimize shift by constant. */
22604 if (!CONST_INT_P (op2))
22605 return false;
22606
22607 qimode = GET_MODE (dest);
22608 shift_amount = INTVAL (op2);
22609 /* Do nothing when shift amount greater equal 8. */
22610 if (shift_amount > 7)
22611 return false;
22612
22613 gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
22614 /* Record sign bit. */
22615 xor_constant = 1 << (8 - shift_amount - 1);
22616
22617 /* Zero upper/lower bits shift from left/right element. */
22618 and_constant
22619 = (code == ASHIFT ? 256 - (1 << shift_amount)
22620 : (1 << (8 - shift_amount)) - 1);
22621
22622 switch (qimode)
22623 {
22624 case V16QImode:
22625 himode = V8HImode;
22626 gen_shift =
22627 ((code == ASHIFT)
22628 ? gen_ashlv8hi3
22629 : (code == ASHIFTRT) ? gen_ashrv8hi3 : gen_lshrv8hi3);
22630 gen_and = gen_andv16qi3;
22631 gen_xor = gen_xorv16qi3;
22632 gen_sub = gen_subv16qi3;
22633 break;
22634 case V32QImode:
22635 himode = V16HImode;
22636 gen_shift =
22637 ((code == ASHIFT)
22638 ? gen_ashlv16hi3
22639 : (code == ASHIFTRT) ? gen_ashrv16hi3 : gen_lshrv16hi3);
22640 gen_and = gen_andv32qi3;
22641 gen_xor = gen_xorv32qi3;
22642 gen_sub = gen_subv32qi3;
22643 break;
22644 case V64QImode:
22645 himode = V32HImode;
22646 gen_shift =
22647 ((code == ASHIFT)
22648 ? gen_ashlv32hi3
22649 : (code == ASHIFTRT) ? gen_ashrv32hi3 : gen_lshrv32hi3);
22650 gen_and = gen_andv64qi3;
22651 gen_xor = gen_xorv64qi3;
22652 gen_sub = gen_subv64qi3;
22653 break;
22654 default:
22655 gcc_unreachable ();
22656 }
22657
22658 tmp = gen_reg_rtx (himode);
22659 vec_const_and = gen_reg_rtx (qimode);
22660 op1_subreg = lowpart_subreg (himode, op1, qimode);
22661
22662 /* For ASHIFT and LSHIFTRT, perform operation like
22663 vpsllw/vpsrlw $shift_amount, %op1, %dest.
22664 vpand %vec_const_and, %dest. */
22665 emit_insn (gen_shift (tmp, op1_subreg, op2));
22666 emit_move_insn (dest, simplify_gen_subreg (qimode, tmp, himode, 0));
22667 emit_move_insn (vec_const_and,
22668 ix86_build_const_vector (qimode, true,
c44c2a3b 22669 gen_int_mode (and_constant, QImode)));
c7199fb6 22670 emit_insn (gen_and (dest, dest, vec_const_and));
22671
22672 /* For ASHIFTRT, perform extra operation like
22673 vpxor %vec_const_xor, %dest, %dest
22674 vpsubb %vec_const_xor, %dest, %dest */
22675 if (code == ASHIFTRT)
22676 {
22677 vec_const_xor = gen_reg_rtx (qimode);
22678 emit_move_insn (vec_const_xor,
22679 ix86_build_const_vector (qimode, true,
c44c2a3b 22680 gen_int_mode (xor_constant, QImode)));
c7199fb6 22681 emit_insn (gen_xor (dest, dest, vec_const_xor));
22682 emit_insn (gen_sub (dest, dest, vec_const_xor));
22683 }
22684 return true;
22685}
22686
2bf6d935
ML
22687/* Expand a vector operation CODE for a V*QImode in terms of the
22688 same operation on V*HImode. */
22689
22690void
22691ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
22692{
22693 machine_mode qimode = GET_MODE (dest);
22694 machine_mode himode;
22695 rtx (*gen_il) (rtx, rtx, rtx);
22696 rtx (*gen_ih) (rtx, rtx, rtx);
22697 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
22698 struct expand_vec_perm_d d;
22699 bool ok, full_interleave;
22700 bool uns_p = false;
22701 int i;
22702
3bd86940 22703 if (CONST_INT_P (op2)
22704 && (code == ASHIFT || code == LSHIFTRT || code == ASHIFTRT)
22705 && ix86_expand_vec_shift_qihi_constant (code, dest, op1, op2))
22706 return;
22707
22708 if (TARGET_AVX512BW
22709 && VECTOR_MODE_P (GET_MODE (op2))
22710 && ix86_expand_vecop_qihi2 (code, dest, op1, op2))
22711 return;
22712
2bf6d935
ML
22713 switch (qimode)
22714 {
22715 case E_V16QImode:
22716 himode = V8HImode;
22717 gen_il = gen_vec_interleave_lowv16qi;
22718 gen_ih = gen_vec_interleave_highv16qi;
22719 break;
22720 case E_V32QImode:
22721 himode = V16HImode;
22722 gen_il = gen_avx2_interleave_lowv32qi;
22723 gen_ih = gen_avx2_interleave_highv32qi;
22724 break;
22725 case E_V64QImode:
22726 himode = V32HImode;
22727 gen_il = gen_avx512bw_interleave_lowv64qi;
22728 gen_ih = gen_avx512bw_interleave_highv64qi;
22729 break;
22730 default:
22731 gcc_unreachable ();
22732 }
22733
2bf6d935
ML
22734 switch (code)
22735 {
22736 case MULT:
22737 /* Unpack data such that we've got a source byte in each low byte of
22738 each word. We don't care what goes into the high byte of each word.
22739 Rather than trying to get zero in there, most convenient is to let
22740 it be a copy of the low byte. */
22741 op2_l = gen_reg_rtx (qimode);
22742 op2_h = gen_reg_rtx (qimode);
22743 emit_insn (gen_il (op2_l, op2, op2));
22744 emit_insn (gen_ih (op2_h, op2, op2));
22745
22746 op1_l = gen_reg_rtx (qimode);
22747 op1_h = gen_reg_rtx (qimode);
22748 emit_insn (gen_il (op1_l, op1, op1));
22749 emit_insn (gen_ih (op1_h, op1, op1));
22750 full_interleave = qimode == V16QImode;
22751 break;
22752
22753 case ASHIFT:
22754 case LSHIFTRT:
22755 uns_p = true;
22756 /* FALLTHRU */
22757 case ASHIFTRT:
22758 op1_l = gen_reg_rtx (himode);
22759 op1_h = gen_reg_rtx (himode);
22760 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
22761 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
3bd86940 22762 /* vashr/vlshr/vashl */
22763 if (GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT)
22764 {
22765 rtx tmp = force_reg (qimode, op2);
22766 op2_l = gen_reg_rtx (himode);
22767 op2_h = gen_reg_rtx (himode);
22768 ix86_expand_sse_unpack (op2_l, tmp, uns_p, false);
22769 ix86_expand_sse_unpack (op2_h, tmp, uns_p, true);
22770 }
22771 else
22772 op2_l = op2_h = op2;
22773
2bf6d935
ML
22774 full_interleave = true;
22775 break;
22776 default:
22777 gcc_unreachable ();
22778 }
22779
3bd86940 22780 /* Perform vashr/vlshr/vashl. */
22781 if (code != MULT
22782 && GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT)
22783 {
22784 res_l = gen_reg_rtx (himode);
22785 res_h = gen_reg_rtx (himode);
22786 emit_insn (gen_rtx_SET (res_l,
22787 simplify_gen_binary (code, himode,
22788 op1_l, op2_l)));
22789 emit_insn (gen_rtx_SET (res_h,
22790 simplify_gen_binary (code, himode,
22791 op1_h, op2_h)));
22792 }
22793 /* Performance mult/ashr/lshr/ashl. */
22794 else
22795 {
22796 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
22797 1, OPTAB_DIRECT);
22798 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
22799 1, OPTAB_DIRECT);
22800 }
22801
2bf6d935
ML
22802 gcc_assert (res_l && res_h);
22803
22804 /* Merge the data back into the right place. */
22805 d.target = dest;
22806 d.op0 = gen_lowpart (qimode, res_l);
22807 d.op1 = gen_lowpart (qimode, res_h);
22808 d.vmode = qimode;
22809 d.nelt = GET_MODE_NUNITS (qimode);
22810 d.one_operand_p = false;
22811 d.testing_p = false;
22812
22813 if (full_interleave)
22814 {
22815 /* For SSE2, we used an full interleave, so the desired
22816 results are in the even elements. */
22817 for (i = 0; i < d.nelt; ++i)
22818 d.perm[i] = i * 2;
22819 }
22820 else
22821 {
22822 /* For AVX, the interleave used above was not cross-lane. So the
22823 extraction is evens but with the second and third quarter swapped.
22824 Happily, that is even one insn shorter than even extraction.
22825 For AVX512BW we have 4 lanes. We extract evens from within a lane,
22826 always first from the first and then from the second source operand,
22827 the index bits above the low 4 bits remains the same.
22828 Thus, for d.nelt == 32 we want permutation
22829 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
22830 and for d.nelt == 64 we want permutation
22831 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
22832 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
22833 for (i = 0; i < d.nelt; ++i)
22834 d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
22835 }
22836
22837 ok = ix86_expand_vec_perm_const_1 (&d);
22838 gcc_assert (ok);
22839
22840 set_unique_reg_note (get_last_insn (), REG_EQUAL,
22841 gen_rtx_fmt_ee (code, qimode, op1, op2));
22842}
22843
22844/* Helper function of ix86_expand_mul_widen_evenodd. Return true
22845 if op is CONST_VECTOR with all odd elements equal to their
22846 preceding element. */
22847
22848static bool
22849const_vector_equal_evenodd_p (rtx op)
22850{
22851 machine_mode mode = GET_MODE (op);
22852 int i, nunits = GET_MODE_NUNITS (mode);
22853 if (GET_CODE (op) != CONST_VECTOR
22854 || nunits != CONST_VECTOR_NUNITS (op))
22855 return false;
22856 for (i = 0; i < nunits; i += 2)
22857 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
22858 return false;
22859 return true;
22860}
22861
22862void
22863ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
22864 bool uns_p, bool odd_p)
22865{
22866 machine_mode mode = GET_MODE (op1);
22867 machine_mode wmode = GET_MODE (dest);
22868 rtx x;
22869 rtx orig_op1 = op1, orig_op2 = op2;
22870
22871 if (!nonimmediate_operand (op1, mode))
22872 op1 = force_reg (mode, op1);
22873 if (!nonimmediate_operand (op2, mode))
22874 op2 = force_reg (mode, op2);
22875
22876 /* We only play even/odd games with vectors of SImode. */
22877 gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
22878
22879 /* If we're looking for the odd results, shift those members down to
22880 the even slots. For some cpus this is faster than a PSHUFD. */
22881 if (odd_p)
22882 {
22883 /* For XOP use vpmacsdqh, but only for smult, as it is only
22884 signed. */
22885 if (TARGET_XOP && mode == V4SImode && !uns_p)
22886 {
22887 x = force_reg (wmode, CONST0_RTX (wmode));
22888 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
22889 return;
22890 }
22891
22892 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
22893 if (!const_vector_equal_evenodd_p (orig_op1))
22894 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
22895 x, NULL, 1, OPTAB_DIRECT);
22896 if (!const_vector_equal_evenodd_p (orig_op2))
22897 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
22898 x, NULL, 1, OPTAB_DIRECT);
22899 op1 = gen_lowpart (mode, op1);
22900 op2 = gen_lowpart (mode, op2);
22901 }
22902
22903 if (mode == V16SImode)
22904 {
22905 if (uns_p)
22906 x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
22907 else
22908 x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
22909 }
22910 else if (mode == V8SImode)
22911 {
22912 if (uns_p)
22913 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
22914 else
22915 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
22916 }
22917 else if (uns_p)
22918 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
22919 else if (TARGET_SSE4_1)
22920 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
22921 else
22922 {
22923 rtx s1, s2, t0, t1, t2;
22924
22925 /* The easiest way to implement this without PMULDQ is to go through
22926 the motions as if we are performing a full 64-bit multiply. With
22927 the exception that we need to do less shuffling of the elements. */
22928
22929 /* Compute the sign-extension, aka highparts, of the two operands. */
22930 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
22931 op1, pc_rtx, pc_rtx);
22932 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
22933 op2, pc_rtx, pc_rtx);
22934
22935 /* Multiply LO(A) * HI(B), and vice-versa. */
22936 t1 = gen_reg_rtx (wmode);
22937 t2 = gen_reg_rtx (wmode);
22938 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
22939 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
22940
22941 /* Multiply LO(A) * LO(B). */
22942 t0 = gen_reg_rtx (wmode);
22943 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
22944
22945 /* Combine and shift the highparts into place. */
22946 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
22947 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
22948 1, OPTAB_DIRECT);
22949
22950 /* Combine high and low parts. */
22951 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
22952 return;
22953 }
22954 emit_insn (x);
22955}
22956
22957void
22958ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
22959 bool uns_p, bool high_p)
22960{
22961 machine_mode wmode = GET_MODE (dest);
22962 machine_mode mode = GET_MODE (op1);
22963 rtx t1, t2, t3, t4, mask;
22964
22965 switch (mode)
22966 {
22967 case E_V4SImode:
22968 t1 = gen_reg_rtx (mode);
22969 t2 = gen_reg_rtx (mode);
22970 if (TARGET_XOP && !uns_p)
22971 {
22972 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
22973 shuffle the elements once so that all elements are in the right
22974 place for immediate use: { A C B D }. */
22975 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
22976 const1_rtx, GEN_INT (3)));
22977 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
22978 const1_rtx, GEN_INT (3)));
22979 }
22980 else
22981 {
22982 /* Put the elements into place for the multiply. */
22983 ix86_expand_vec_interleave (t1, op1, op1, high_p);
22984 ix86_expand_vec_interleave (t2, op2, op2, high_p);
22985 high_p = false;
22986 }
22987 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
22988 break;
22989
22990 case E_V8SImode:
22991 /* Shuffle the elements between the lanes. After this we
22992 have { A B E F | C D G H } for each operand. */
22993 t1 = gen_reg_rtx (V4DImode);
22994 t2 = gen_reg_rtx (V4DImode);
22995 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
22996 const0_rtx, const2_rtx,
22997 const1_rtx, GEN_INT (3)));
22998 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
22999 const0_rtx, const2_rtx,
23000 const1_rtx, GEN_INT (3)));
23001
23002 /* Shuffle the elements within the lanes. After this we
23003 have { A A B B | C C D D } or { E E F F | G G H H }. */
23004 t3 = gen_reg_rtx (V8SImode);
23005 t4 = gen_reg_rtx (V8SImode);
23006 mask = GEN_INT (high_p
23007 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
23008 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
23009 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
23010 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
23011
23012 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
23013 break;
23014
23015 case E_V8HImode:
23016 case E_V16HImode:
23017 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
23018 uns_p, OPTAB_DIRECT);
23019 t2 = expand_binop (mode,
23020 uns_p ? umul_highpart_optab : smul_highpart_optab,
23021 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
23022 gcc_assert (t1 && t2);
23023
23024 t3 = gen_reg_rtx (mode);
23025 ix86_expand_vec_interleave (t3, t1, t2, high_p);
23026 emit_move_insn (dest, gen_lowpart (wmode, t3));
23027 break;
23028
23029 case E_V16QImode:
23030 case E_V32QImode:
23031 case E_V32HImode:
23032 case E_V16SImode:
23033 case E_V64QImode:
23034 t1 = gen_reg_rtx (wmode);
23035 t2 = gen_reg_rtx (wmode);
23036 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
23037 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
23038
23039 emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
23040 break;
23041
23042 default:
23043 gcc_unreachable ();
23044 }
23045}
23046
23047void
23048ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
23049{
23050 rtx res_1, res_2, res_3, res_4;
23051
23052 res_1 = gen_reg_rtx (V4SImode);
23053 res_2 = gen_reg_rtx (V4SImode);
23054 res_3 = gen_reg_rtx (V2DImode);
23055 res_4 = gen_reg_rtx (V2DImode);
23056 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
23057 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
23058
23059 /* Move the results in element 2 down to element 1; we don't care
23060 what goes in elements 2 and 3. Then we can merge the parts
23061 back together with an interleave.
23062
23063 Note that two other sequences were tried:
23064 (1) Use interleaves at the start instead of psrldq, which allows
23065 us to use a single shufps to merge things back at the end.
23066 (2) Use shufps here to combine the two vectors, then pshufd to
23067 put the elements in the correct order.
23068 In both cases the cost of the reformatting stall was too high
23069 and the overall sequence slower. */
23070
23071 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
23072 const0_rtx, const2_rtx,
23073 const0_rtx, const0_rtx));
23074 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
23075 const0_rtx, const2_rtx,
23076 const0_rtx, const0_rtx));
23077 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
23078
23079 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
23080}
23081
23082void
23083ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
23084{
23085 machine_mode mode = GET_MODE (op0);
23086 rtx t1, t2, t3, t4, t5, t6;
23087
23088 if (TARGET_AVX512DQ && mode == V8DImode)
23089 emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
23090 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
23091 emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
23092 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
23093 emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
23094 else if (TARGET_XOP && mode == V2DImode)
23095 {
23096 /* op1: A,B,C,D, op2: E,F,G,H */
23097 op1 = gen_lowpart (V4SImode, op1);
23098 op2 = gen_lowpart (V4SImode, op2);
23099
23100 t1 = gen_reg_rtx (V4SImode);
23101 t2 = gen_reg_rtx (V4SImode);
23102 t3 = gen_reg_rtx (V2DImode);
23103 t4 = gen_reg_rtx (V2DImode);
23104
23105 /* t1: B,A,D,C */
23106 emit_insn (gen_sse2_pshufd_1 (t1, op1,
23107 GEN_INT (1),
23108 GEN_INT (0),
23109 GEN_INT (3),
23110 GEN_INT (2)));
23111
23112 /* t2: (B*E),(A*F),(D*G),(C*H) */
23113 emit_insn (gen_mulv4si3 (t2, t1, op2));
23114
23115 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
23116 emit_insn (gen_xop_phadddq (t3, t2));
23117
23118 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
23119 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
23120
23121 /* Multiply lower parts and add all */
23122 t5 = gen_reg_rtx (V2DImode);
23123 emit_insn (gen_vec_widen_umult_even_v4si (t5,
23124 gen_lowpart (V4SImode, op1),
23125 gen_lowpart (V4SImode, op2)));
8ba6ea87 23126 force_expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
2bf6d935
ML
23127 }
23128 else
23129 {
23130 machine_mode nmode;
23131 rtx (*umul) (rtx, rtx, rtx);
23132
23133 if (mode == V2DImode)
23134 {
23135 umul = gen_vec_widen_umult_even_v4si;
23136 nmode = V4SImode;
23137 }
23138 else if (mode == V4DImode)
23139 {
23140 umul = gen_vec_widen_umult_even_v8si;
23141 nmode = V8SImode;
23142 }
23143 else if (mode == V8DImode)
23144 {
23145 umul = gen_vec_widen_umult_even_v16si;
23146 nmode = V16SImode;
23147 }
23148 else
23149 gcc_unreachable ();
23150
23151
23152 /* Multiply low parts. */
23153 t1 = gen_reg_rtx (mode);
23154 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
23155
23156 /* Shift input vectors right 32 bits so we can multiply high parts. */
23157 t6 = GEN_INT (32);
23158 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
23159 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
23160
23161 /* Multiply high parts by low parts. */
23162 t4 = gen_reg_rtx (mode);
23163 t5 = gen_reg_rtx (mode);
23164 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
23165 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
23166
23167 /* Combine and shift the highparts back. */
23168 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
23169 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
23170
23171 /* Combine high and low parts. */
23172 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
23173 }
23174
23175 set_unique_reg_note (get_last_insn (), REG_EQUAL,
23176 gen_rtx_MULT (mode, op1, op2));
23177}
23178
23179/* Return 1 if control tansfer instruction INSN
23180 should be encoded with notrack prefix. */
23181
23182bool
e8b0314a 23183ix86_notrack_prefixed_insn_p (rtx_insn *insn)
2bf6d935
ML
23184{
23185 if (!insn || !((flag_cf_protection & CF_BRANCH)))
23186 return false;
23187
23188 if (CALL_P (insn))
23189 {
23190 rtx call = get_call_rtx_from (insn);
23191 gcc_assert (call != NULL_RTX);
23192 rtx addr = XEXP (call, 0);
23193
23194 /* Do not emit 'notrack' if it's not an indirect call. */
23195 if (MEM_P (addr)
23196 && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
23197 return false;
23198 else
23199 return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
23200 }
23201
23202 if (JUMP_P (insn) && !flag_cet_switch)
23203 {
23204 rtx target = JUMP_LABEL (insn);
23205 if (target == NULL_RTX || ANY_RETURN_P (target))
23206 return false;
23207
23208 /* Check the jump is a switch table. */
23209 rtx_insn *label = as_a<rtx_insn *> (target);
23210 rtx_insn *table = next_insn (label);
23211 if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
23212 return false;
23213 else
23214 return true;
23215 }
23216 return false;
23217}
23218
23219/* Calculate integer abs() using only SSE2 instructions. */
23220
23221void
23222ix86_expand_sse2_abs (rtx target, rtx input)
23223{
23224 machine_mode mode = GET_MODE (target);
23225 rtx tmp0, tmp1, x;
23226
23227 switch (mode)
23228 {
23229 case E_V2DImode:
23230 case E_V4DImode:
23231 /* For 64-bit signed integer X, with SSE4.2 use
23232 pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
23233 Otherwise handle it similarly to V4SImode, except use 64 as W instead of
23234 32 and use logical instead of arithmetic right shift (which is
23235 unimplemented) and subtract. */
23236 if (TARGET_SSE4_2)
23237 {
23238 tmp0 = gen_reg_rtx (mode);
23239 tmp1 = gen_reg_rtx (mode);
23240 emit_move_insn (tmp1, CONST0_RTX (mode));
23241 if (mode == E_V2DImode)
23242 emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input));
23243 else
23244 emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input));
23245 }
23246 else
23247 {
23248 tmp0 = expand_simple_binop (mode, LSHIFTRT, input,
23249 GEN_INT (GET_MODE_UNIT_BITSIZE (mode)
23250 - 1), NULL, 0, OPTAB_DIRECT);
23251 tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false);
23252 }
23253
23254 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
23255 NULL, 0, OPTAB_DIRECT);
23256 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
23257 target, 0, OPTAB_DIRECT);
23258 break;
23259
23260 case E_V4SImode:
23261 /* For 32-bit signed integer X, the best way to calculate the absolute
23262 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
23263 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
23264 GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
23265 NULL, 0, OPTAB_DIRECT);
23266 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
23267 NULL, 0, OPTAB_DIRECT);
23268 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
23269 target, 0, OPTAB_DIRECT);
23270 break;
23271
23272 case E_V8HImode:
23273 /* For 16-bit signed integer X, the best way to calculate the absolute
23274 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
23275 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
23276
23277 x = expand_simple_binop (mode, SMAX, tmp0, input,
23278 target, 0, OPTAB_DIRECT);
23279 break;
23280
23281 case E_V16QImode:
23282 /* For 8-bit signed integer X, the best way to calculate the absolute
23283 value of X is min ((unsigned char) X, (unsigned char) (-X)),
23284 as SSE2 provides the PMINUB insn. */
23285 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
23286
23287 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
23288 target, 0, OPTAB_DIRECT);
23289 break;
23290
23291 default:
23292 gcc_unreachable ();
23293 }
23294
23295 if (x != target)
23296 emit_move_insn (target, x);
23297}
23298
23299/* Expand an extract from a vector register through pextr insn.
23300 Return true if successful. */
23301
23302bool
23303ix86_expand_pextr (rtx *operands)
23304{
23305 rtx dst = operands[0];
23306 rtx src = operands[1];
23307
23308 unsigned int size = INTVAL (operands[2]);
23309 unsigned int pos = INTVAL (operands[3]);
23310
23311 if (SUBREG_P (dst))
23312 {
23313 /* Reject non-lowpart subregs. */
23314 if (SUBREG_BYTE (dst) > 0)
23315 return false;
23316 dst = SUBREG_REG (dst);
23317 }
23318
23319 if (SUBREG_P (src))
23320 {
23321 pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
23322 src = SUBREG_REG (src);
23323 }
23324
23325 switch (GET_MODE (src))
23326 {
23327 case E_V16QImode:
23328 case E_V8HImode:
23329 case E_V4SImode:
23330 case E_V2DImode:
23331 case E_V1TImode:
2bf6d935
ML
23332 {
23333 machine_mode srcmode, dstmode;
23334 rtx d, pat;
23335
23336 if (!int_mode_for_size (size, 0).exists (&dstmode))
23337 return false;
23338
23339 switch (dstmode)
23340 {
23341 case E_QImode:
23342 if (!TARGET_SSE4_1)
23343 return false;
23344 srcmode = V16QImode;
23345 break;
23346
23347 case E_HImode:
23348 if (!TARGET_SSE2)
23349 return false;
23350 srcmode = V8HImode;
23351 break;
23352
23353 case E_SImode:
23354 if (!TARGET_SSE4_1)
23355 return false;
23356 srcmode = V4SImode;
23357 break;
23358
23359 case E_DImode:
23360 gcc_assert (TARGET_64BIT);
23361 if (!TARGET_SSE4_1)
23362 return false;
23363 srcmode = V2DImode;
23364 break;
23365
23366 default:
23367 return false;
23368 }
23369
23370 /* Reject extractions from misaligned positions. */
23371 if (pos & (size-1))
23372 return false;
23373
23374 if (GET_MODE (dst) == dstmode)
23375 d = dst;
23376 else
23377 d = gen_reg_rtx (dstmode);
23378
23379 /* Construct insn pattern. */
23380 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
23381 pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
23382
23383 /* Let the rtl optimizers know about the zero extension performed. */
23384 if (dstmode == QImode || dstmode == HImode)
23385 {
23386 pat = gen_rtx_ZERO_EXTEND (SImode, pat);
23387 d = gen_lowpart (SImode, d);
23388 }
23389
23390 emit_insn (gen_rtx_SET (d, pat));
23391
23392 if (d != dst)
23393 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
23394 return true;
23395 }
23396
23397 default:
23398 return false;
23399 }
23400}
23401
23402/* Expand an insert into a vector register through pinsr insn.
23403 Return true if successful. */
23404
23405bool
23406ix86_expand_pinsr (rtx *operands)
23407{
23408 rtx dst = operands[0];
23409 rtx src = operands[3];
23410
23411 unsigned int size = INTVAL (operands[1]);
23412 unsigned int pos = INTVAL (operands[2]);
23413
23414 if (SUBREG_P (dst))
23415 {
23416 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
23417 dst = SUBREG_REG (dst);
23418 }
23419
23420 switch (GET_MODE (dst))
23421 {
23422 case E_V16QImode:
23423 case E_V8HImode:
23424 case E_V4SImode:
23425 case E_V2DImode:
23426 case E_V1TImode:
2bf6d935
ML
23427 {
23428 machine_mode srcmode, dstmode;
23429 rtx (*pinsr)(rtx, rtx, rtx, rtx);
23430 rtx d;
23431
23432 if (!int_mode_for_size (size, 0).exists (&srcmode))
23433 return false;
23434
23435 switch (srcmode)
23436 {
23437 case E_QImode:
23438 if (!TARGET_SSE4_1)
23439 return false;
23440 dstmode = V16QImode;
23441 pinsr = gen_sse4_1_pinsrb;
23442 break;
23443
23444 case E_HImode:
23445 if (!TARGET_SSE2)
23446 return false;
23447 dstmode = V8HImode;
23448 pinsr = gen_sse2_pinsrw;
23449 break;
23450
23451 case E_SImode:
23452 if (!TARGET_SSE4_1)
23453 return false;
23454 dstmode = V4SImode;
23455 pinsr = gen_sse4_1_pinsrd;
23456 break;
23457
23458 case E_DImode:
23459 gcc_assert (TARGET_64BIT);
23460 if (!TARGET_SSE4_1)
23461 return false;
23462 dstmode = V2DImode;
23463 pinsr = gen_sse4_1_pinsrq;
23464 break;
23465
23466 default:
23467 return false;
23468 }
23469
23470 /* Reject insertions to misaligned positions. */
23471 if (pos & (size-1))
23472 return false;
23473
23474 if (SUBREG_P (src))
23475 {
23476 unsigned int srcpos = SUBREG_BYTE (src);
23477
23478 if (srcpos > 0)
23479 {
23480 rtx extr_ops[4];
23481
23482 extr_ops[0] = gen_reg_rtx (srcmode);
23483 extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
23484 extr_ops[2] = GEN_INT (size);
23485 extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
23486
23487 if (!ix86_expand_pextr (extr_ops))
23488 return false;
23489
23490 src = extr_ops[0];
23491 }
23492 else
23493 src = gen_lowpart (srcmode, SUBREG_REG (src));
23494 }
23495
23496 if (GET_MODE (dst) == dstmode)
23497 d = dst;
23498 else
23499 d = gen_reg_rtx (dstmode);
23500
23501 emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
23502 gen_lowpart (srcmode, src),
23503 GEN_INT (1 << (pos / size))));
23504 if (d != dst)
23505 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
23506 return true;
23507 }
23508
23509 default:
23510 return false;
23511 }
23512}
23513
23514/* All CPUs prefer to avoid cross-lane operations so perform reductions
23515 upper against lower halves up to SSE reg size. */
23516
23517machine_mode
23518ix86_split_reduction (machine_mode mode)
23519{
23520 /* Reduce lowpart against highpart until we reach SSE reg width to
23521 avoid cross-lane operations. */
23522 switch (mode)
23523 {
23524 case E_V8DImode:
23525 case E_V4DImode:
23526 return V2DImode;
23527 case E_V16SImode:
23528 case E_V8SImode:
23529 return V4SImode;
23530 case E_V32HImode:
23531 case E_V16HImode:
23532 return V8HImode;
23533 case E_V64QImode:
23534 case E_V32QImode:
23535 return V16QImode;
23536 case E_V16SFmode:
23537 case E_V8SFmode:
23538 return V4SFmode;
23539 case E_V8DFmode:
23540 case E_V4DFmode:
23541 return V2DFmode;
23542 default:
23543 return mode;
23544 }
23545}
23546
23547/* Generate call to __divmoddi4. */
23548
23549void
23550ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
23551 rtx op0, rtx op1,
23552 rtx *quot_p, rtx *rem_p)
23553{
23554 rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
23555
23556 rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
23557 mode, op0, mode, op1, mode,
23558 XEXP (rem, 0), Pmode);
23559 *quot_p = quot;
23560 *rem_p = rem;
23561}
23562
152f243f
JJ
23563void
23564ix86_expand_atomic_fetch_op_loop (rtx target, rtx mem, rtx val,
23565 enum rtx_code code, bool after,
23566 bool doubleword)
4d281ff7 23567{
0435b978 23568 rtx old_reg, new_reg, old_mem, success;
4d281ff7 23569 machine_mode mode = GET_MODE (target);
0435b978 23570 rtx_code_label *loop_label = NULL;
4d281ff7
HW
23571
23572 old_reg = gen_reg_rtx (mode);
23573 new_reg = old_reg;
4d281ff7 23574 old_mem = copy_to_reg (mem);
0435b978 23575 loop_label = gen_label_rtx ();
4d281ff7
HW
23576 emit_label (loop_label);
23577 emit_move_insn (old_reg, old_mem);
23578
23579 /* return value for atomic_fetch_op. */
23580 if (!after)
23581 emit_move_insn (target, old_reg);
23582
23583 if (code == NOT)
23584 {
23585 new_reg = expand_simple_binop (mode, AND, new_reg, val, NULL_RTX,
23586 true, OPTAB_LIB_WIDEN);
23587 new_reg = expand_simple_unop (mode, code, new_reg, NULL_RTX, true);
23588 }
23589 else
23590 new_reg = expand_simple_binop (mode, code, new_reg, val, NULL_RTX,
23591 true, OPTAB_LIB_WIDEN);
23592
23593 /* return value for atomic_op_fetch. */
23594 if (after)
23595 emit_move_insn (target, new_reg);
23596
0435b978
HW
23597 success = NULL_RTX;
23598
23599 ix86_expand_cmpxchg_loop (&success, old_mem, mem, old_reg, new_reg,
23600 gen_int_mode (MEMMODEL_SYNC_SEQ_CST,
23601 SImode),
23602 doubleword, loop_label);
23603}
23604
23605/* Relax cmpxchg instruction, param loop_label indicates whether
23606 the instruction should be relaxed with a pause loop. If not,
23607 it will be relaxed to an atomic load + compare, and skip
23608 cmpxchg instruction if mem != exp_input. */
23609
152f243f
JJ
23610void
23611ix86_expand_cmpxchg_loop (rtx *ptarget_bool, rtx target_val,
23612 rtx mem, rtx exp_input, rtx new_input,
23613 rtx mem_model, bool doubleword,
23614 rtx_code_label *loop_label)
0435b978
HW
23615{
23616 rtx_code_label *cmp_label = NULL;
23617 rtx_code_label *done_label = NULL;
23618 rtx target_bool = NULL_RTX, new_mem = NULL_RTX;
23619 rtx (*gen) (rtx, rtx, rtx, rtx, rtx) = NULL;
23620 rtx (*gendw) (rtx, rtx, rtx, rtx, rtx, rtx) = NULL;
23621 machine_mode mode = GET_MODE (target_val), hmode = mode;
23622
23623 if (*ptarget_bool == NULL)
23624 target_bool = gen_reg_rtx (QImode);
23625 else
23626 target_bool = *ptarget_bool;
23627
23628 cmp_label = gen_label_rtx ();
23629 done_label = gen_label_rtx ();
23630
23631 new_mem = gen_reg_rtx (mode);
23632 /* Load memory first. */
23633 expand_atomic_load (new_mem, mem, MEMMODEL_SEQ_CST);
23634
23635 switch (mode)
23636 {
9d1796d8 23637 case E_TImode:
0435b978
HW
23638 gendw = gen_atomic_compare_and_swapti_doubleword;
23639 hmode = DImode;
23640 break;
9d1796d8 23641 case E_DImode:
0435b978
HW
23642 if (doubleword)
23643 {
23644 gendw = gen_atomic_compare_and_swapdi_doubleword;
23645 hmode = SImode;
23646 }
23647 else
23648 gen = gen_atomic_compare_and_swapdi_1;
23649 break;
9d1796d8
RS
23650 case E_SImode:
23651 gen = gen_atomic_compare_and_swapsi_1;
23652 break;
23653 case E_HImode:
23654 gen = gen_atomic_compare_and_swaphi_1;
23655 break;
23656 case E_QImode:
23657 gen = gen_atomic_compare_and_swapqi_1;
23658 break;
0435b978
HW
23659 default:
23660 gcc_unreachable ();
23661 }
4d281ff7 23662
0435b978 23663 /* Compare mem value with expected value. */
4d281ff7
HW
23664 if (doubleword)
23665 {
0435b978
HW
23666 rtx low_new_mem = gen_lowpart (hmode, new_mem);
23667 rtx low_exp_input = gen_lowpart (hmode, exp_input);
23668 rtx high_new_mem = gen_highpart (hmode, new_mem);
23669 rtx high_exp_input = gen_highpart (hmode, exp_input);
23670 emit_cmp_and_jump_insns (low_new_mem, low_exp_input, NE, NULL_RTX,
23671 hmode, 1, cmp_label,
4d281ff7 23672 profile_probability::guessed_never ());
0435b978
HW
23673 emit_cmp_and_jump_insns (high_new_mem, high_exp_input, NE, NULL_RTX,
23674 hmode, 1, cmp_label,
4d281ff7
HW
23675 profile_probability::guessed_never ());
23676 }
23677 else
0435b978
HW
23678 emit_cmp_and_jump_insns (new_mem, exp_input, NE, NULL_RTX,
23679 GET_MODE (exp_input), 1, cmp_label,
4d281ff7
HW
23680 profile_probability::guessed_never ());
23681
0435b978
HW
23682 /* Directly emits cmpxchg here. */
23683 if (doubleword)
23684 emit_insn (gendw (target_val, mem, exp_input,
23685 gen_lowpart (hmode, new_input),
23686 gen_highpart (hmode, new_input),
23687 mem_model));
23688 else
23689 emit_insn (gen (target_val, mem, exp_input, new_input, mem_model));
23690
23691 if (!loop_label)
23692 {
23693 emit_jump_insn (gen_jump (done_label));
23694 emit_barrier ();
23695 emit_label (cmp_label);
23696 emit_move_insn (target_val, new_mem);
23697 emit_label (done_label);
23698 ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
23699 const0_rtx);
23700 }
23701 else
23702 {
23703 ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
23704 const0_rtx);
23705 emit_cmp_and_jump_insns (target_bool, const0_rtx, EQ, const0_rtx,
23706 GET_MODE (target_bool), 1, loop_label,
23707 profile_probability::guessed_never ());
23708 emit_jump_insn (gen_jump (done_label));
23709 emit_barrier ();
23710
23711 /* If mem is not expected, pause and loop back. */
23712 emit_label (cmp_label);
522f25e9 23713 emit_move_insn (target_val, new_mem);
0435b978
HW
23714 emit_insn (gen_pause ());
23715 emit_jump_insn (gen_jump (loop_label));
23716 emit_barrier ();
23717 emit_label (done_label);
23718 }
23719
23720 *ptarget_bool = target_bool;
4d281ff7
HW
23721}
23722
2bf6d935 23723#include "gt-i386-expand.h"