gcc/config/i386/i386-expand.cc
1/* Copyright (C) 1988-2023 Free Software Foundation, Inc.
2
3This file is part of GCC.
4
5GCC is free software; you can redistribute it and/or modify
6it under the terms of the GNU General Public License as published by
7the Free Software Foundation; either version 3, or (at your option)
8any later version.
9
10GCC is distributed in the hope that it will be useful,
11but WITHOUT ANY WARRANTY; without even the implied warranty of
12MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13GNU General Public License for more details.
14
15You should have received a copy of the GNU General Public License
16along with GCC; see the file COPYING3. If not see
17<http://www.gnu.org/licenses/>. */
18
19#define IN_TARGET_CODE 1
20
21#include "config.h"
22#include "system.h"
23#include "coretypes.h"
24#include "backend.h"
25#include "rtl.h"
26#include "tree.h"
27#include "memmodel.h"
28#include "gimple.h"
29#include "cfghooks.h"
30#include "cfgloop.h"
31#include "df.h"
32#include "tm_p.h"
33#include "stringpool.h"
34#include "expmed.h"
35#include "optabs.h"
36#include "regs.h"
37#include "emit-rtl.h"
38#include "recog.h"
39#include "cgraph.h"
40#include "diagnostic.h"
41#include "cfgbuild.h"
42#include "alias.h"
43#include "fold-const.h"
44#include "attribs.h"
45#include "calls.h"
46#include "stor-layout.h"
47#include "varasm.h"
48#include "output.h"
49#include "insn-attr.h"
50#include "flags.h"
51#include "except.h"
52#include "explow.h"
53#include "expr.h"
54#include "cfgrtl.h"
55#include "common/common-target.h"
56#include "langhooks.h"
57#include "reload.h"
58#include "gimplify.h"
59#include "dwarf2.h"
60#include "tm-constrs.h"
61#include "cselib.h"
62#include "sched-int.h"
63#include "opts.h"
64#include "tree-pass.h"
65#include "context.h"
66#include "pass_manager.h"
67#include "target-globals.h"
68#include "gimple-iterator.h"
69#include "shrink-wrap.h"
70#include "builtins.h"
71#include "rtl-iter.h"
72#include "tree-iterator.h"
73#include "dbgcnt.h"
74#include "case-cfn-macros.h"
75#include "dojump.h"
76#include "fold-const-call.h"
77#include "tree-vrp.h"
78#include "tree-ssanames.h"
79#include "selftest.h"
80#include "selftest-rtl.h"
81#include "print-rtl.h"
82#include "intl.h"
83#include "ifcvt.h"
84#include "symbol-summary.h"
85#include "ipa-prop.h"
86#include "ipa-fnsummary.h"
87#include "wide-int-bitmask.h"
88#include "tree-vector-builder.h"
89#include "debug.h"
90#include "dwarf2out.h"
91#include "i386-options.h"
92#include "i386-builtins.h"
93#include "i386-expand.h"
94#include "asan.h"
95
96/* Split one or more double-mode RTL references into pairs of half-mode
97 references. The RTL can be REG, offsettable MEM, integer constant, or
98 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
99 split and "num" is its length. lo_half and hi_half are output arrays
100 that parallel "operands". */
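/* For example, splitting a TImode REG yields two DImode subregs, while an
   offsettable TImode MEM is split into MEMs at byte offsets 0 and 8.  */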
101
102void
103split_double_mode (machine_mode mode, rtx operands[],
104 int num, rtx lo_half[], rtx hi_half[])
105{
106 machine_mode half_mode;
107 unsigned int byte;
108 rtx mem_op = NULL_RTX;
109 int mem_num = 0;
110
111 switch (mode)
112 {
113 case E_TImode:
114 half_mode = DImode;
115 break;
116 case E_DImode:
117 half_mode = SImode;
118 break;
119 case E_P2HImode:
120 half_mode = HImode;
121 break;
122 case E_P2QImode:
123 half_mode = QImode;
124 break;
125 default:
126 gcc_unreachable ();
127 }
128
129 byte = GET_MODE_SIZE (half_mode);
130
131 while (num--)
132 {
133 rtx op = operands[num];
134
135 /* simplify_subreg refuses to split volatile memory addresses,
136 but we still have to handle them. */
137 if (MEM_P (op))
138 {
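   /* Reuse the halves already computed for an identical MEM operand,
      so a repeated memory operand is only adjusted once.  */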
139 if (mem_op && rtx_equal_p (op, mem_op))
140 {
141 lo_half[num] = lo_half[mem_num];
142 hi_half[num] = hi_half[mem_num];
143 }
144 else
145 {
146 mem_op = op;
147 mem_num = num;
148 lo_half[num] = adjust_address (op, half_mode, 0);
149 hi_half[num] = adjust_address (op, half_mode, byte);
150 }
151 }
152 else
153 {
154 lo_half[num] = simplify_gen_subreg (half_mode, op,
155 GET_MODE (op) == VOIDmode
156 ? mode : GET_MODE (op), 0);
157
158 rtx tmp = simplify_gen_subreg (half_mode, op,
159 GET_MODE (op) == VOIDmode
160 ? mode : GET_MODE (op), byte);
161 /* simplify_gen_subreg will return NULL RTX for the
162 high half of the paradoxical subreg. */
163 hi_half[num] = tmp ? tmp : gen_reg_rtx (half_mode);
164 }
165 }
166}
167
168/* Emit the double word assignment DST = { LO, HI }. */
169
170void
171split_double_concat (machine_mode mode, rtx dst, rtx lo, rtx hi)
172{
173 rtx dlo, dhi;
174 int deleted_move_count = 0;
175 split_double_mode (mode, &dst, 1, &dlo, &dhi);
176 /* Constraints ensure that if both lo and hi are MEMs, then
177 dst has early-clobber and thus addresses of MEMs don't use
178 dlo/dhi registers.  Otherwise, if at least one of lo and hi is a MEM,
179 dlo/dhi are registers. */
180 if (MEM_P (lo)
181 && rtx_equal_p (dlo, hi)
182 && reg_overlap_mentioned_p (dhi, lo))
183 {
184 /* If dlo is same as hi and lo's address uses dhi register,
185 code below would first emit_move_insn (dhi, hi)
186 and then emit_move_insn (dlo, lo). But the former
187 would invalidate lo's address. Load into dhi first,
188 then swap. */
189 emit_move_insn (dhi, lo);
190 lo = dhi;
191 }
192 else if (MEM_P (hi)
193 && !MEM_P (lo)
194 && !rtx_equal_p (dlo, lo)
195 && reg_overlap_mentioned_p (dlo, hi))
196 {
197 /* In this case, code below would first emit_move_insn (dlo, lo)
198 and then emit_move_insn (dhi, hi). But the former would
199 invalidate hi's address. */
200 if (rtx_equal_p (dhi, lo))
201 {
202 /* We can't load into dhi first, so load into dlo
203 first and we'll swap. */
204 emit_move_insn (dlo, hi);
205 hi = dlo;
206 }
207 else
208 {
209 /* Load into dhi first. */
210 emit_move_insn (dhi, hi);
211 hi = dhi;
212 }
213 }
214 if (!rtx_equal_p (dlo, hi))
215 {
216 if (!rtx_equal_p (dlo, lo))
217 emit_move_insn (dlo, lo);
218 else
219 deleted_move_count++;
220 if (!rtx_equal_p (dhi, hi))
221 emit_move_insn (dhi, hi);
222 else
223 deleted_move_count++;
224 }
225 else if (!rtx_equal_p (lo, dhi))
226 {
227 if (!rtx_equal_p (dhi, hi))
228 emit_move_insn (dhi, hi);
229 else
230 deleted_move_count++;
231 if (!rtx_equal_p (dlo, lo))
232 emit_move_insn (dlo, lo);
233 else
234 deleted_move_count++;
235 }
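  /* At this point DLO is the same rtx as HI and DHI the same as LO, so the
     assignment reduces to swapping the two halves in place.  */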
236 else if (mode == TImode)
237 emit_insn (gen_swapdi (dlo, dhi));
238 else
239 emit_insn (gen_swapsi (dlo, dhi));
240
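  /* If both halves were already in place, no insn was emitted above; emit a
     note so the splitter does not produce an empty sequence.  */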
241 if (deleted_move_count == 2)
242 emit_note (NOTE_INSN_DELETED);
243}
244
245
246/* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
247 for the target. */
248
249void
250ix86_expand_clear (rtx dest)
251{
252 rtx tmp;
253
254 /* We play register width games, which are only valid after reload. */
255 gcc_assert (reload_completed);
256
257 /* Avoid HImode and its attendant prefix byte. */
258 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
259 dest = gen_rtx_REG (SImode, REGNO (dest));
260 tmp = gen_rtx_SET (dest, const0_rtx);
261
262 if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
263 {
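      /* The xor pattern clobbers the flags, so wrap the set in a PARALLEL
	 with a FLAGS_REG clobber.  */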
264 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
265 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
266 }
267
268 emit_insn (tmp);
269}
270
271/* Return true if V can be broadcasted from an integer of WIDTH bits
272 which is returned in VAL_BROADCAST. Otherwise, return false. */
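/* For example, V = 0x4242424242424242 succeeds for WIDTH 8 with
   VAL_BROADCAST 0x42, while V = 0x0000000100000001 fails for WIDTH 8 and 16
   but succeeds for WIDTH 32.  */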
273
274static bool
275ix86_broadcast (HOST_WIDE_INT v, unsigned int width,
276 HOST_WIDE_INT &val_broadcast)
277{
278 wide_int val = wi::uhwi (v, HOST_BITS_PER_WIDE_INT);
279 val_broadcast = wi::extract_uhwi (val, 0, width);
280 for (unsigned int i = width; i < HOST_BITS_PER_WIDE_INT; i += width)
281 {
282 HOST_WIDE_INT each = wi::extract_uhwi (val, i, width);
283 if (val_broadcast != each)
284 return false;
285 }
286 val_broadcast = sext_hwi (val_broadcast, width);
287 return true;
288}
289
290/* Convert the CONST_WIDE_INT operand OP to broadcast in MODE. */
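/* For example (assuming AVX2), an OImode constant whose four 64-bit elements
   are all 0x0101010101010101 is materialized as a V32QImode broadcast of 1
   and returned as an OImode lowpart subreg of that vector.  */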
291
292static rtx
293ix86_convert_const_wide_int_to_broadcast (machine_mode mode, rtx op)
294{
295 /* Don't use integer vector broadcast if we can't move from GPR to SSE
296 register directly. */
297 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
298 return nullptr;
299
300 /* Convert CONST_WIDE_INT to a non-standard SSE constant integer
301 broadcast only if vector broadcast is available. */
302 if (!TARGET_AVX
303 || !CONST_WIDE_INT_P (op)
304 || standard_sse_constant_p (op, mode)
305 || (CONST_WIDE_INT_NUNITS (op) * HOST_BITS_PER_WIDE_INT
306 != GET_MODE_BITSIZE (mode)))
307 return nullptr;
308
309 HOST_WIDE_INT val = CONST_WIDE_INT_ELT (op, 0);
310 HOST_WIDE_INT val_broadcast;
311 scalar_int_mode broadcast_mode;
312 if (TARGET_AVX2
313 && ix86_broadcast (val, GET_MODE_BITSIZE (QImode),
314 val_broadcast))
315 broadcast_mode = QImode;
316 else if (TARGET_AVX2
317 && ix86_broadcast (val, GET_MODE_BITSIZE (HImode),
318 val_broadcast))
319 broadcast_mode = HImode;
320 else if (ix86_broadcast (val, GET_MODE_BITSIZE (SImode),
321 val_broadcast))
322 broadcast_mode = SImode;
323 else if (TARGET_64BIT
324 && ix86_broadcast (val, GET_MODE_BITSIZE (DImode),
325 val_broadcast))
326 broadcast_mode = DImode;
327 else
328 return nullptr;
329
330 /* Check if OP can be broadcasted from VAL. */
331 for (int i = 1; i < CONST_WIDE_INT_NUNITS (op); i++)
332 if (val != CONST_WIDE_INT_ELT (op, i))
333 return nullptr;
334
335 unsigned int nunits = (GET_MODE_SIZE (mode)
336 / GET_MODE_SIZE (broadcast_mode));
337 machine_mode vector_mode;
338 if (!mode_for_vector (broadcast_mode, nunits).exists (&vector_mode))
339 gcc_unreachable ();
340 rtx target = gen_reg_rtx (vector_mode);
341 bool ok = ix86_expand_vector_init_duplicate (false, vector_mode,
342 target,
343 GEN_INT (val_broadcast));
344 gcc_assert (ok);
345 target = lowpart_subreg (mode, target, vector_mode);
346 return target;
347}
348
349void
350ix86_expand_move (machine_mode mode, rtx operands[])
351{
352 rtx op0, op1;
353 rtx tmp, addend = NULL_RTX;
354 enum tls_model model;
355
356 op0 = operands[0];
357 op1 = operands[1];
358
359 /* Avoid complex sets of likely spilled hard registers before reload. */
360 if (!ix86_hardreg_mov_ok (op0, op1))
361 {
362 tmp = gen_reg_rtx (mode);
363 operands[0] = tmp;
364 ix86_expand_move (mode, operands);
365 operands[0] = op0;
366 operands[1] = tmp;
367 op1 = tmp;
368 }
369
370 switch (GET_CODE (op1))
371 {
372 case CONST:
373 tmp = XEXP (op1, 0);
374
375 if (GET_CODE (tmp) != PLUS
376 || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
377 break;
378
379 op1 = XEXP (tmp, 0);
380 addend = XEXP (tmp, 1);
381 /* FALLTHRU */
382
383 case SYMBOL_REF:
384 model = SYMBOL_REF_TLS_MODEL (op1);
385
386 if (model)
387 op1 = legitimize_tls_address (op1, model, true);
388 else if (ix86_force_load_from_GOT_p (op1))
389 {
390 /* Load the external function address via GOT slot to avoid PLT. */
391 op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
392 (TARGET_64BIT
393 ? UNSPEC_GOTPCREL
394 : UNSPEC_GOT));
395 op1 = gen_rtx_CONST (Pmode, op1);
396 op1 = gen_const_mem (Pmode, op1);
397 set_mem_alias_set (op1, ix86_GOT_alias_set ());
398 }
399 else
400 {
401 tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
402 if (tmp)
403 {
404 op1 = tmp;
405 if (!addend)
406 break;
407 }
408 else
409 {
410 op1 = operands[1];
411 break;
412 }
413 }
414
415 if (addend)
416 {
417 op1 = force_operand (op1, NULL_RTX);
418 op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
419 op0, 1, OPTAB_DIRECT);
420 }
421 else
422 op1 = force_operand (op1, op0);
423
424 if (op1 == op0)
425 return;
426
427 op1 = convert_to_mode (mode, op1, 1);
428
429 default:
430 break;
431
432 case SUBREG:
433 /* Transform TImode paradoxical SUBREG into zero_extendditi2. */
434 if (TARGET_64BIT
435 && mode == TImode
436 && SUBREG_P (op1)
437 && GET_MODE (SUBREG_REG (op1)) == DImode
438 && SUBREG_BYTE (op1) == 0)
439 op1 = gen_rtx_ZERO_EXTEND (TImode, SUBREG_REG (op1));
440 break;
441 }
442
443 if ((flag_pic || MACHOPIC_INDIRECT)
444 && symbolic_operand (op1, mode))
445 {
446 if (TARGET_MACHO && !TARGET_64BIT)
447 {
448#if TARGET_MACHO
449 /* dynamic-no-pic */
450 if (MACHOPIC_INDIRECT)
451 {
452 rtx temp = (op0 && REG_P (op0) && mode == Pmode)
453 ? op0 : gen_reg_rtx (Pmode);
454 op1 = machopic_indirect_data_reference (op1, temp);
455 if (MACHOPIC_PURE)
456 op1 = machopic_legitimize_pic_address (op1, mode,
457 temp == op1 ? 0 : temp);
458 }
459 if (op0 != op1 && GET_CODE (op0) != MEM)
460 {
461 rtx insn = gen_rtx_SET (op0, op1);
462 emit_insn (insn);
463 return;
464 }
465 if (GET_CODE (op0) == MEM)
466 op1 = force_reg (Pmode, op1);
467 else
468 {
469 rtx temp = op0;
470 if (GET_CODE (temp) != REG)
471 temp = gen_reg_rtx (Pmode);
472 temp = legitimize_pic_address (op1, temp);
473 if (temp == op0)
474 return;
475 op1 = temp;
476 }
477 /* dynamic-no-pic */
478#endif
479 }
480 else
481 {
482 if (MEM_P (op0))
483 op1 = force_reg (mode, op1);
484 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
485 {
486 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
487 op1 = legitimize_pic_address (op1, reg);
488 if (op0 == op1)
489 return;
490 op1 = convert_to_mode (mode, op1, 1);
491 }
492 }
493 }
494 else
495 {
496 if (MEM_P (op0)
497 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
498 || !push_operand (op0, mode))
499 && MEM_P (op1))
500 op1 = force_reg (mode, op1);
501
502 if (push_operand (op0, mode)
503 && ! general_no_elim_operand (op1, mode))
504 op1 = copy_to_mode_reg (mode, op1);
505
506 /* Force large constants in 64bit compilation into register
507 to get them CSEed. */
508 if (can_create_pseudo_p ()
509 && (mode == DImode) && TARGET_64BIT
510 && immediate_operand (op1, mode)
511 && !x86_64_zext_immediate_operand (op1, VOIDmode)
512 && !register_operand (op0, mode)
513 && optimize)
514 op1 = copy_to_mode_reg (mode, op1);
515
516 if (can_create_pseudo_p ())
517 {
518 if (CONST_DOUBLE_P (op1))
519 {
520 /* If we are loading a floating point constant to a
521 register, force the value to memory now, since we'll
522 get better code out the back end. */
523
524 op1 = validize_mem (force_const_mem (mode, op1));
525 if (!register_operand (op0, mode))
526 {
527 rtx temp = gen_reg_rtx (mode);
528 emit_insn (gen_rtx_SET (temp, op1));
529 emit_move_insn (op0, temp);
530 return;
531 }
532 }
533 else if (CONST_WIDE_INT_P (op1)
534 && GET_MODE_SIZE (mode) >= 16)
535 {
536 rtx tmp = ix86_convert_const_wide_int_to_broadcast
537 (GET_MODE (op0), op1);
538 if (tmp != nullptr)
539 op1 = tmp;
540 }
541 }
542 }
543
544 /* Special case inserting 64-bit values into a TImode register. */
545 if (TARGET_64BIT
546 /* Disable for -O0 (see PR110587) unless naked (PR110533). */
547 && (optimize || ix86_function_naked (current_function_decl))
548 && (mode == DImode || mode == DFmode)
549 && SUBREG_P (op0)
550 && GET_MODE (SUBREG_REG (op0)) == TImode
551 && REG_P (SUBREG_REG (op0))
552 && REG_P (op1))
553 {
554 /* Use *insvti_lowpart_1 to set lowpart. */
555 if (SUBREG_BYTE (op0) == 0)
556 {
557 wide_int mask = wi::mask (64, true, 128);
558 rtx tmp = immed_wide_int_const (mask, TImode);
559 op0 = SUBREG_REG (op0);
560 tmp = gen_rtx_AND (TImode, copy_rtx (op0), tmp);
561 if (mode == DFmode)
562 op1 = gen_lowpart (DImode, op1);
563 op1 = gen_rtx_ZERO_EXTEND (TImode, op1);
564 op1 = gen_rtx_IOR (TImode, tmp, op1);
565 }
566 /* Use *insvti_highpart_1 to set highpart. */
567 else if (SUBREG_BYTE (op0) == 8)
568 {
569 wide_int mask = wi::mask (64, false, 128);
570 rtx tmp = immed_wide_int_const (mask, TImode);
571 op0 = SUBREG_REG (op0);
572 tmp = gen_rtx_AND (TImode, copy_rtx (op0), tmp);
573 if (mode == DFmode)
574 op1 = gen_lowpart (DImode, op1);
575 op1 = gen_rtx_ZERO_EXTEND (TImode, op1);
576 op1 = gen_rtx_ASHIFT (TImode, op1, GEN_INT (64));
577 op1 = gen_rtx_IOR (TImode, tmp, op1);
578 }
579 }
580
581 emit_insn (gen_rtx_SET (op0, op1));
582}
583
584/* OP is a memref of a CONST_VECTOR.  Return the scalar constant mem
585 if the CONST_VECTOR is a vec_duplicate, else return NULL. */
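/* For example, given a constant-pool V4SImode vector { 5, 5, 5, 5 } (and a
   target that passes the checks below), this returns the CONST_INT 5; for
   { 1, 2, 3, 4 } it returns NULL.  */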
586static rtx
587ix86_broadcast_from_constant (machine_mode mode, rtx op)
588{
589 int nunits = GET_MODE_NUNITS (mode);
590 if (nunits < 2)
591 return nullptr;
592
593 /* Don't use integer vector broadcast if we can't move from GPR to SSE
594 register directly. */
595 if (!TARGET_INTER_UNIT_MOVES_TO_VEC
596 && INTEGRAL_MODE_P (mode))
597 return nullptr;
598
599 /* Convert CONST_VECTOR to a non-standard SSE constant integer
600 broadcast only if vector broadcast is available. */
601 if (!(TARGET_AVX2
602 || (TARGET_AVX
603 && (GET_MODE_INNER (mode) == SImode
604 || GET_MODE_INNER (mode) == DImode))
605 || FLOAT_MODE_P (mode))
606 || standard_sse_constant_p (op, mode))
607 return nullptr;
608
609 /* Don't broadcast from a 64-bit integer constant in 32-bit mode.
610 We can still put 64-bit integer constant in memory when
611 avx512 embed broadcast is available. */
612 if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT
613 && (!TARGET_AVX512F
614 || (GET_MODE_SIZE (mode) < 64 && !TARGET_AVX512VL)))
615 return nullptr;
616
617 if (GET_MODE_INNER (mode) == TImode)
618 return nullptr;
619
620 rtx constant = get_pool_constant (XEXP (op, 0));
621 if (GET_CODE (constant) != CONST_VECTOR)
622 return nullptr;
623
624 /* There could be some rtx like
625 (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
626 but with "*.LC1" refer to V2DI constant vector. */
627 if (GET_MODE (constant) != mode)
628 {
629 constant = simplify_subreg (mode, constant, GET_MODE (constant),
630 0);
631 if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR)
632 return nullptr;
633 }
634
635 rtx first = XVECEXP (constant, 0, 0);
636
637 for (int i = 1; i < nunits; ++i)
638 {
639 rtx tmp = XVECEXP (constant, 0, i);
640 /* Vector duplicate value. */
641 if (!rtx_equal_p (tmp, first))
642 return nullptr;
643 }
644
645 return first;
646}
647
648void
649ix86_expand_vector_move (machine_mode mode, rtx operands[])
650{
651 rtx op0 = operands[0], op1 = operands[1];
652 /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
653 psABI since the biggest alignment is 4 byte for IA MCU psABI. */
654 unsigned int align = (TARGET_IAMCU
655 ? GET_MODE_BITSIZE (mode)
656 : GET_MODE_ALIGNMENT (mode));
657
658 if (push_operand (op0, VOIDmode))
659 op0 = emit_move_resolve_push (mode, op0);
660
661 /* Force constants other than zero into memory. We do not know how
662 the instructions used to build constants modify the upper 64 bits
663 of the register; once we have that information we may be able
664 to handle some of them more efficiently. */
665 if (can_create_pseudo_p ()
666 && (CONSTANT_P (op1)
667 || (SUBREG_P (op1)
668 && CONSTANT_P (SUBREG_REG (op1))))
669 && ((register_operand (op0, mode)
670 && !standard_sse_constant_p (op1, mode))
671 /* ix86_expand_vector_move_misalign() does not like constants. */
672 || (SSE_REG_MODE_P (mode)
673 && MEM_P (op0)
674 && MEM_ALIGN (op0) < align)))
675 {
676 if (SUBREG_P (op1))
677 {
678 machine_mode imode = GET_MODE (SUBREG_REG (op1));
679 rtx r = force_const_mem (imode, SUBREG_REG (op1));
680 if (r)
681 r = validize_mem (r);
682 else
683 r = force_reg (imode, SUBREG_REG (op1));
684 op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
685 }
686 else
687 {
688 machine_mode mode = GET_MODE (op0);
689 rtx tmp = ix86_convert_const_wide_int_to_broadcast
690 (mode, op1);
691 if (tmp == nullptr)
692 op1 = validize_mem (force_const_mem (mode, op1));
693 else
694 op1 = tmp;
695 }
696 }
697
698 if (can_create_pseudo_p ()
699 && GET_MODE_SIZE (mode) >= 16
700 && VECTOR_MODE_P (mode)
701 && (MEM_P (op1)
702 && SYMBOL_REF_P (XEXP (op1, 0))
703 && CONSTANT_POOL_ADDRESS_P (XEXP (op1, 0))))
704 {
705 rtx first = ix86_broadcast_from_constant (mode, op1);
706 if (first != nullptr)
707 {
708 /* Broadcast to XMM/YMM/ZMM register from an integer
709 constant or scalar mem. */
710 op1 = gen_reg_rtx (mode);
711 if (FLOAT_MODE_P (mode)
712 || (!TARGET_64BIT && GET_MODE_INNER (mode) == DImode))
713 first = force_const_mem (GET_MODE_INNER (mode), first);
714 bool ok = ix86_expand_vector_init_duplicate (false, mode,
715 op1, first);
716 gcc_assert (ok);
717 emit_move_insn (op0, op1);
718 return;
719 }
720 }
721
722 /* We need to check memory alignment for SSE mode since attribute
723 can make operands unaligned. */
724 if (can_create_pseudo_p ()
725 && SSE_REG_MODE_P (mode)
726 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
727 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
728 {
729 rtx tmp[2];
730
731 /* ix86_expand_vector_move_misalign() does not like both
732 arguments in memory. */
733 if (!register_operand (op0, mode)
734 && !register_operand (op1, mode))
735 {
736 rtx scratch = gen_reg_rtx (mode);
737 emit_move_insn (scratch, op1);
738 op1 = scratch;
739 }
740
741 tmp[0] = op0; tmp[1] = op1;
742 ix86_expand_vector_move_misalign (mode, tmp);
743 return;
744 }
745
746 /* Special case TImode to 128-bit vector conversions via V2DI. */
747 if (VECTOR_MODE_P (mode)
748 && GET_MODE_SIZE (mode) == 16
749 && SUBREG_P (op1)
750 && GET_MODE (SUBREG_REG (op1)) == TImode
751 && TARGET_64BIT && TARGET_SSE
752 && can_create_pseudo_p ())
753 {
754 rtx tmp = gen_reg_rtx (V2DImode);
755 rtx lo = gen_reg_rtx (DImode);
756 rtx hi = gen_reg_rtx (DImode);
757 emit_move_insn (lo, gen_lowpart (DImode, SUBREG_REG (op1)));
758 emit_move_insn (hi, gen_highpart (DImode, SUBREG_REG (op1)));
759 emit_insn (gen_vec_concatv2di (tmp, lo, hi));
760 emit_move_insn (op0, gen_lowpart (mode, tmp));
761 return;
762 }
763
764 /* If operand0 is a hard register, make operand1 a pseudo. */
765 if (can_create_pseudo_p ()
766 && !ix86_hardreg_mov_ok (op0, op1))
767 {
768 rtx tmp = gen_reg_rtx (GET_MODE (op0));
769 emit_move_insn (tmp, op1);
770 emit_move_insn (op0, tmp);
771 return;
772 }
773
774 /* Make operand1 a register if it isn't already. */
775 if (can_create_pseudo_p ()
776 && !register_operand (op0, mode)
777 && !register_operand (op1, mode))
778 {
779 rtx tmp = gen_reg_rtx (GET_MODE (op0));
780 emit_move_insn (tmp, op1);
781 emit_move_insn (op0, tmp);
782 return;
783 }
784
785 emit_insn (gen_rtx_SET (op0, op1));
786}
787
788/* Split 32-byte AVX unaligned load and store if needed. */
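/* An unaligned 32-byte load is rebuilt from two 16-byte loads combined with
   a VEC_CONCAT; an unaligned 32-byte store is written as two vextractf128
   halves.  */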
789
790static void
791ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
792{
793 rtx m;
794 rtx (*extract) (rtx, rtx, rtx);
795 machine_mode mode;
796
797 if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
798 || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
799 {
800 emit_insn (gen_rtx_SET (op0, op1));
801 return;
802 }
803
804 rtx orig_op0 = NULL_RTX;
805 mode = GET_MODE (op0);
806 switch (GET_MODE_CLASS (mode))
807 {
808 case MODE_VECTOR_INT:
809 case MODE_INT:
810 if (mode != V32QImode)
811 {
812 if (!MEM_P (op0))
813 {
814 orig_op0 = op0;
815 op0 = gen_reg_rtx (V32QImode);
816 }
817 else
818 op0 = gen_lowpart (V32QImode, op0);
819 op1 = gen_lowpart (V32QImode, op1);
820 mode = V32QImode;
821 }
822 break;
823 case MODE_VECTOR_FLOAT:
824 break;
825 default:
826 gcc_unreachable ();
827 }
828
829 switch (mode)
830 {
831 default:
832 gcc_unreachable ();
833 case E_V32QImode:
834 extract = gen_avx_vextractf128v32qi;
835 mode = V16QImode;
836 break;
837 case E_V16BFmode:
838 extract = gen_avx_vextractf128v16bf;
839 mode = V8BFmode;
840 break;
841 case E_V16HFmode:
842 extract = gen_avx_vextractf128v16hf;
843 mode = V8HFmode;
844 break;
845 case E_V8SFmode:
846 extract = gen_avx_vextractf128v8sf;
847 mode = V4SFmode;
848 break;
849 case E_V4DFmode:
850 extract = gen_avx_vextractf128v4df;
851 mode = V2DFmode;
852 break;
853 }
854
855 if (MEM_P (op1))
856 {
857 rtx r = gen_reg_rtx (mode);
858 m = adjust_address (op1, mode, 0);
859 emit_move_insn (r, m);
860 m = adjust_address (op1, mode, 16);
861 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
862 emit_move_insn (op0, r);
863 }
864 else if (MEM_P (op0))
865 {
866 m = adjust_address (op0, mode, 0);
867 emit_insn (extract (m, op1, const0_rtx));
868 m = adjust_address (op0, mode, 16);
869 emit_insn (extract (m, copy_rtx (op1), const1_rtx));
870 }
871 else
872 gcc_unreachable ();
873
874 if (orig_op0)
875 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
876}
877
878/* Implement the movmisalign patterns for SSE. Non-SSE modes go
879 straight to ix86_expand_vector_move. */
880/* Code generation for scalar reg-reg moves of single and double precision data:
881 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
882 movaps reg, reg
883 else
884 movss reg, reg
885 if (x86_sse_partial_reg_dependency == true)
886 movapd reg, reg
887 else
888 movsd reg, reg
889
890 Code generation for scalar loads of double precision data:
891 if (x86_sse_split_regs == true)
892 movlpd mem, reg (gas syntax)
893 else
894 movsd mem, reg
895
896 Code generation for unaligned packed loads of single precision data
897 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
898 if (x86_sse_unaligned_move_optimal)
899 movups mem, reg
900
901 if (x86_sse_partial_reg_dependency == true)
902 {
903 xorps reg, reg
904 movlps mem, reg
905 movhps mem+8, reg
906 }
907 else
908 {
909 movlps mem, reg
910 movhps mem+8, reg
911 }
912
913 Code generation for unaligned packed loads of double precision data
914 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
915 if (x86_sse_unaligned_move_optimal)
916 movupd mem, reg
917
918 if (x86_sse_split_regs == true)
919 {
920 movlpd mem, reg
921 movhpd mem+8, reg
922 }
923 else
924 {
925 movsd mem, reg
926 movhpd mem+8, reg
927 }
928 */
929
930void
931ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
932{
933 rtx op0, op1, m;
934
935 op0 = operands[0];
936 op1 = operands[1];
937
938 /* Use unaligned load/store for AVX512 or when optimizing for size. */
939 if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
940 {
941 emit_insn (gen_rtx_SET (op0, op1));
942 return;
943 }
944
945 if (TARGET_AVX)
946 {
947 if (GET_MODE_SIZE (mode) == 32)
948 ix86_avx256_split_vector_move_misalign (op0, op1);
949 else
950 /* Always use 128-bit mov<mode>_internal pattern for AVX. */
951 emit_insn (gen_rtx_SET (op0, op1));
952 return;
953 }
954
955 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
956 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
957 {
958 emit_insn (gen_rtx_SET (op0, op1));
959 return;
960 }
961
962 /* ??? If we have typed data, then it would appear that using
963 movdqu is the only way to get unaligned data loaded with
964 integer type. */
965 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
966 {
967 emit_insn (gen_rtx_SET (op0, op1));
968 return;
969 }
970
971 if (MEM_P (op1))
972 {
973 if (TARGET_SSE2 && mode == V2DFmode)
974 {
975 rtx zero;
976
977 /* When SSE registers are split into halves, we can avoid
978 writing to the top half twice. */
979 if (TARGET_SSE_SPLIT_REGS)
980 {
981 emit_clobber (op0);
982 zero = op0;
983 }
984 else
985 {
986 /* ??? Not sure about the best option for the Intel chips.
987 The following would seem to satisfy; the register is
988 entirely cleared, breaking the dependency chain. We
989 then store to the upper half, with a dependency depth
990 of one. A rumor has it that Intel recommends two movsd
991 followed by an unpacklpd, but this is unconfirmed. And
992 given that the dependency depth of the unpacklpd would
993 still be one, I'm not sure why this would be better. */
994 zero = CONST0_RTX (V2DFmode);
995 }
996
997 m = adjust_address (op1, DFmode, 0);
998 emit_insn (gen_sse2_loadlpd (op0, zero, m));
999 m = adjust_address (op1, DFmode, 8);
1000 emit_insn (gen_sse2_loadhpd (op0, op0, m));
1001 }
1002 else
1003 {
1004 rtx t;
1005
1006 if (mode != V4SFmode)
1007 t = gen_reg_rtx (V4SFmode);
1008 else
1009 t = op0;
1010
1011 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
1012 emit_move_insn (t, CONST0_RTX (V4SFmode));
1013 else
1014 emit_clobber (t);
1015
1016 m = adjust_address (op1, V2SFmode, 0);
1017 emit_insn (gen_sse_loadlps (t, t, m));
1018 m = adjust_address (op1, V2SFmode, 8);
1019 emit_insn (gen_sse_loadhps (t, t, m));
1020 if (mode != V4SFmode)
1021 emit_move_insn (op0, gen_lowpart (mode, t));
1022 }
1023 }
1024 else if (MEM_P (op0))
1025 {
1026 if (TARGET_SSE2 && mode == V2DFmode)
1027 {
1028 m = adjust_address (op0, DFmode, 0);
1029 emit_insn (gen_sse2_storelpd (m, op1));
1030 m = adjust_address (op0, DFmode, 8);
1031 emit_insn (gen_sse2_storehpd (m, op1));
1032 }
1033 else
1034 {
1035 if (mode != V4SFmode)
1036 op1 = gen_lowpart (V4SFmode, op1);
1037
1038 m = adjust_address (op0, V2SFmode, 0);
1039 emit_insn (gen_sse_storelps (m, op1));
1040 m = adjust_address (op0, V2SFmode, 8);
1041 emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
1042 }
1043 }
1044 else
1045 gcc_unreachable ();
1046}
1047
1048/* Move bits 64:95 to bits 32:63. */
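/* Implemented as a V4SImode vec_select that copies element 2 into element 1;
   the upper two elements are don't-care for the 64-bit MMX result.  */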
1049
1050void
1051ix86_move_vector_high_sse_to_mmx (rtx op)
1052{
1053 rtx mask = gen_rtx_PARALLEL (VOIDmode,
1054 gen_rtvec (4, GEN_INT (0), GEN_INT (2),
1055 GEN_INT (0), GEN_INT (0)));
1056 rtx dest = lowpart_subreg (V4SImode, op, GET_MODE (op));
1057 op = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
1058 rtx insn = gen_rtx_SET (dest, op);
1059 emit_insn (insn);
1060}
1061
1062/* Split MMX pack with signed/unsigned saturation with SSE/SSE2. */
1063
1064void
1065ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
1066{
1067 rtx op0 = operands[0];
1068 rtx op1 = operands[1];
1069 rtx op2 = operands[2];
1070 rtx src;
1071
1072 machine_mode dmode = GET_MODE (op0);
1073 machine_mode smode = GET_MODE (op1);
1074 machine_mode inner_dmode = GET_MODE_INNER (dmode);
1075 machine_mode inner_smode = GET_MODE_INNER (smode);
1076
1077 /* Get the corresponding SSE mode for destination. */
1078 int nunits = 16 / GET_MODE_SIZE (inner_dmode);
1079 machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode),
1080 nunits).require ();
1081 machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode),
1082 nunits / 2).require ();
1083
1084 /* Get the corresponding SSE mode for source. */
1085 nunits = 16 / GET_MODE_SIZE (inner_smode);
1086 machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode),
1087 nunits).require ();
1088
1089 /* Generate SSE pack with signed/unsigned saturation. */
1090 rtx dest = lowpart_subreg (sse_dmode, op0, GET_MODE (op0));
1091 op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
1092 op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));
1093
1094 /* packusdw/packuswb do unsigned saturation of a signed source,
1095 which is different from the generic us_truncate RTX. */
1096 if (code == US_TRUNCATE)
1097 src = gen_rtx_UNSPEC (sse_dmode,
1098 gen_rtvec (2, op1, op2),
1099 UNSPEC_US_TRUNCATE);
1100 else
1101 {
1102 op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
1103 op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
1104 src = gen_rtx_VEC_CONCAT (sse_dmode, op1, op2);
1105 }
1106
1107 emit_move_insn (dest, src);
1108
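  /* The 128-bit pack leaves the two valid 32-bit halves in bits 0-31 and
     64-95 of the SSE register, so move the high half down to form the
     64-bit MMX result.  */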
1109 ix86_move_vector_high_sse_to_mmx (op0);
1110}
1111
1112/* Split MMX punpcklXX/punpckhXX with SSE punpcklXX. */
1113
1114void
1115ix86_split_mmx_punpck (rtx operands[], bool high_p)
1116{
1117 rtx op0 = operands[0];
1118 rtx op1 = operands[1];
1119 rtx op2 = operands[2];
1120 machine_mode mode = GET_MODE (op0);
1121 rtx mask;
1122 /* The corresponding SSE mode. */
1123 machine_mode sse_mode, double_sse_mode;
1124
1125 switch (mode)
1126 {
1127 case E_V8QImode:
1128 case E_V4QImode:
1129 case E_V2QImode:
1130 sse_mode = V16QImode;
1131 double_sse_mode = V32QImode;
1132 mask = gen_rtx_PARALLEL (VOIDmode,
1133 gen_rtvec (16,
1134 GEN_INT (0), GEN_INT (16),
1135 GEN_INT (1), GEN_INT (17),
1136 GEN_INT (2), GEN_INT (18),
1137 GEN_INT (3), GEN_INT (19),
1138 GEN_INT (4), GEN_INT (20),
1139 GEN_INT (5), GEN_INT (21),
1140 GEN_INT (6), GEN_INT (22),
1141 GEN_INT (7), GEN_INT (23)));
1142 break;
1143
1144 case E_V4HImode:
1145 case E_V2HImode:
1146 sse_mode = V8HImode;
1147 double_sse_mode = V16HImode;
1148 mask = gen_rtx_PARALLEL (VOIDmode,
1149 gen_rtvec (8,
1150 GEN_INT (0), GEN_INT (8),
1151 GEN_INT (1), GEN_INT (9),
1152 GEN_INT (2), GEN_INT (10),
1153 GEN_INT (3), GEN_INT (11)));
1154 break;
1155
1156 case E_V2SImode:
1157 sse_mode = V4SImode;
1158 double_sse_mode = V8SImode;
1159 mask = gen_rtx_PARALLEL (VOIDmode,
1160 gen_rtvec (4,
1161 GEN_INT (0), GEN_INT (4),
1162 GEN_INT (1), GEN_INT (5)));
1163 break;
1164
1165 case E_V2SFmode:
1166 sse_mode = V4SFmode;
1167 double_sse_mode = V8SFmode;
1168 mask = gen_rtx_PARALLEL (VOIDmode,
1169 gen_rtvec (4,
1170 GEN_INT (0), GEN_INT (4),
1171 GEN_INT (1), GEN_INT (5)));
1172 break;
1173
1174 default:
1175 gcc_unreachable ();
1176 }
1177
1178 /* Generate SSE punpcklXX. */
1179 rtx dest = lowpart_subreg (sse_mode, op0, GET_MODE (op0));
1180 op1 = lowpart_subreg (sse_mode, op1, GET_MODE (op1));
1181 op2 = lowpart_subreg (sse_mode, op2, GET_MODE (op2));
1182
1183 op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2);
1184 op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask);
1185 rtx insn = gen_rtx_SET (dest, op2);
1186 emit_insn (insn);
1187
1188 /* Move high bits to low bits. */
1189 if (high_p)
1190 {
1191 if (sse_mode == V4SFmode)
1192 {
1193 mask = gen_rtx_PARALLEL (VOIDmode,
1194 gen_rtvec (4, GEN_INT (2), GEN_INT (3),
1195 GEN_INT (4), GEN_INT (5)));
1196 op2 = gen_rtx_VEC_CONCAT (V8SFmode, dest, dest);
1197 op1 = gen_rtx_VEC_SELECT (V4SFmode, op2, mask);
1198 }
1199 else
1200 {
1201 int sz = GET_MODE_SIZE (mode);
1202
1203 if (sz == 4)
1204 mask = gen_rtx_PARALLEL (VOIDmode,
1205 gen_rtvec (4, GEN_INT (1), GEN_INT (0),
1206 GEN_INT (0), GEN_INT (1)));
1207 else if (sz == 8)
1208 mask = gen_rtx_PARALLEL (VOIDmode,
1209 gen_rtvec (4, GEN_INT (2), GEN_INT (3),
1210 GEN_INT (0), GEN_INT (1)));
1211 else
1212 gcc_unreachable ();
1213
1214 dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
1215 op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
1216 }
1217
1218 insn = gen_rtx_SET (dest, op1);
1219 emit_insn (insn);
1220 }
1221}
1222
1223/* Helper function of ix86_fixup_binary_operands to canonicalize
1224 operand order. Returns true if the operands should be swapped. */
1225
1226static bool
1227ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
1228 rtx operands[])
1229{
1230 rtx dst = operands[0];
1231 rtx src1 = operands[1];
1232 rtx src2 = operands[2];
1233
1234 /* If the operation is not commutative, we can't do anything. */
1235 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
1236 && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
1237 return false;
1238
1239 /* Highest priority is that src1 should match dst. */
1240 if (rtx_equal_p (dst, src1))
1241 return false;
1242 if (rtx_equal_p (dst, src2))
1243 return true;
1244
1245 /* Next highest priority is that immediate constants come second. */
1246 if (immediate_operand (src2, mode))
1247 return false;
1248 if (immediate_operand (src1, mode))
1249 return true;
1250
1251 /* Lowest priority is that memory references should come second. */
1252 if (MEM_P (src2))
1253 return false;
1254 if (MEM_P (src1))
1255 return true;
1256
1257 return false;
1258}
1259
1260
1261/* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
1262 destination to use for the operation. If different from the true
1263 destination in operands[0], a copy operation will be required. */
1264
1265rtx
1266ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
1267 rtx operands[])
1268{
1269 rtx dst = operands[0];
1270 rtx src1 = operands[1];
1271 rtx src2 = operands[2];
1272
1273 /* Canonicalize operand order. */
1274 if (ix86_swap_binary_operands_p (code, mode, operands))
1275 {
1276 /* It is invalid to swap operands of different modes. */
1277 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
1278
1279 std::swap (src1, src2);
1280 }
1281
1282 /* Both source operands cannot be in memory. */
1283 if (MEM_P (src1) && MEM_P (src2))
1284 {
1285 /* Optimization: Only read from memory once. */
1286 if (rtx_equal_p (src1, src2))
1287 {
1288 src2 = force_reg (mode, src2);
1289 src1 = src2;
1290 }
1291 else if (rtx_equal_p (dst, src1))
1292 src2 = force_reg (mode, src2);
1293 else
1294 src1 = force_reg (mode, src1);
1295 }
1296
1297 /* If the destination is memory, and we do not have matching source
1298 operands, do things in registers. */
1299 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
1300 dst = gen_reg_rtx (mode);
1301
1302 /* Source 1 cannot be a constant. */
1303 if (CONSTANT_P (src1))
1304 src1 = force_reg (mode, src1);
1305
1306 /* Source 1 cannot be a non-matching memory. */
1307 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
1308 src1 = force_reg (mode, src1);
1309
1310 /* Improve address combine. */
1311 if (code == PLUS
1312 && GET_MODE_CLASS (mode) == MODE_INT
1313 && MEM_P (src2))
1314 src2 = force_reg (mode, src2);
1315
1316 operands[1] = src1;
1317 operands[2] = src2;
1318 return dst;
1319}
1320
1321/* Similarly, but assume that the destination has already been
1322 set up properly. */
1323
1324void
1325ix86_fixup_binary_operands_no_copy (enum rtx_code code,
1326 machine_mode mode, rtx operands[])
1327{
1328 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
1329 gcc_assert (dst == operands[0]);
1330}
1331
1332/* Attempt to expand a binary operator. Make the expansion closer to the
1333 actual machine, than just general_operand, which will allow 3 separate
1334 memory references (one output, two input) in a single insn. */
1335
1336void
1337ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
1338 rtx operands[])
1339{
1340 rtx src1, src2, dst, op, clob;
1341
1342 dst = ix86_fixup_binary_operands (code, mode, operands);
1343 src1 = operands[1];
1344 src2 = operands[2];
1345
1346 /* Emit the instruction. */
1347
1348 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
1349
1350 if (reload_completed
1351 && code == PLUS
1352 && !rtx_equal_p (dst, src1))
1353 {
1354 /* This is going to be an LEA; avoid splitting it later. */
1355 emit_insn (op);
1356 }
1357 else
1358 {
1359 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1360 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1361 }
1362
1363 /* Fix up the destination if needed. */
1364 if (dst != operands[0])
1365 emit_move_insn (operands[0], dst);
1366}
1367
1368/* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
1369 the given OPERANDS. */
1370
1371void
1372ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
1373 rtx operands[])
1374{
1375 rtx op1 = NULL_RTX, op2 = NULL_RTX;
1376 if (SUBREG_P (operands[1]))
1377 {
1378 op1 = operands[1];
1379 op2 = operands[2];
1380 }
1381 else if (SUBREG_P (operands[2]))
1382 {
1383 op1 = operands[2];
1384 op2 = operands[1];
1385 }
1386 /* Optimize (__m128i) d | (__m128i) e and similar code
1387 when d and e are float vectors into float vector logical
1388 insn. In C/C++ without using intrinsics there is no other way
1389 to express vector logical operation on float vectors than
1390 to cast them temporarily to integer vectors. */
1391 if (op1
1392 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
1393 && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
1394 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
1395 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
1396 && SUBREG_BYTE (op1) == 0
1397 && (GET_CODE (op2) == CONST_VECTOR
1398 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
1399 && SUBREG_BYTE (op2) == 0))
1400 && can_create_pseudo_p ())
1401 {
1402 rtx dst;
1403 switch (GET_MODE (SUBREG_REG (op1)))
1404 {
1405 case E_V4SFmode:
1406 case E_V8SFmode:
1407 case E_V16SFmode:
1408 case E_V2DFmode:
1409 case E_V4DFmode:
1410 case E_V8DFmode:
1411 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
1412 if (GET_CODE (op2) == CONST_VECTOR)
1413 {
1414 op2 = gen_lowpart (GET_MODE (dst), op2);
1415 op2 = force_reg (GET_MODE (dst), op2);
1416 }
1417 else
1418 {
1419 op1 = operands[1];
1420 op2 = SUBREG_REG (operands[2]);
1421 if (!vector_operand (op2, GET_MODE (dst)))
1422 op2 = force_reg (GET_MODE (dst), op2);
1423 }
1424 op1 = SUBREG_REG (op1);
1425 if (!vector_operand (op1, GET_MODE (dst)))
1426 op1 = force_reg (GET_MODE (dst), op1);
1427 emit_insn (gen_rtx_SET (dst,
1428 gen_rtx_fmt_ee (code, GET_MODE (dst),
1429 op1, op2)));
1430 emit_move_insn (operands[0], gen_lowpart (mode, dst));
1431 return;
1432 default:
1433 break;
1434 }
1435 }
1436 if (!vector_operand (operands[1], mode))
1437 operands[1] = force_reg (mode, operands[1]);
1438 if (!vector_operand (operands[2], mode))
1439 operands[2] = force_reg (mode, operands[2]);
1440 ix86_fixup_binary_operands_no_copy (code, mode, operands);
1441 emit_insn (gen_rtx_SET (operands[0],
1442 gen_rtx_fmt_ee (code, mode, operands[1],
1443 operands[2])));
1444}
1445
1446/* Return TRUE or FALSE depending on whether the binary operator meets the
1447 appropriate constraints. */
1448
1449bool
1450ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
1451 rtx operands[3])
1452{
1453 rtx dst = operands[0];
1454 rtx src1 = operands[1];
1455 rtx src2 = operands[2];
1456
1457 /* Both source operands cannot be in memory. */
1458 if ((MEM_P (src1) || bcst_mem_operand (src1, mode))
1459 && (MEM_P (src2) || bcst_mem_operand (src2, mode)))
1460 return false;
1461
1462 /* Canonicalize operand order for commutative operators. */
1463 if (ix86_swap_binary_operands_p (code, mode, operands))
1464 std::swap (src1, src2);
1465
1466 /* If the destination is memory, we must have a matching source operand. */
1467 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
1468 return false;
1469
1470 /* Source 1 cannot be a constant. */
1471 if (CONSTANT_P (src1))
1472 return false;
1473
1474 /* Source 1 cannot be a non-matching memory. */
1475 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
1476 /* Support "andhi/andsi/anddi" as a zero-extending move. */
1477 return (code == AND
1478 && (mode == HImode
1479 || mode == SImode
1480 || (TARGET_64BIT && mode == DImode))
1481 && satisfies_constraint_L (src2));
1482
1483 return true;
1484}
1485
1486/* Attempt to expand a unary operator. Make the expansion closer to the
1487 actual machine, than just general_operand, which will allow 2 separate
1488 memory references (one output, one input) in a single insn. */
1489
1490void
1491ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
1492 rtx operands[])
1493{
1494 bool matching_memory = false;
1495 rtx src, dst, op, clob;
1496
1497 dst = operands[0];
1498 src = operands[1];
1499
1500 /* If the destination is memory, and we do not have matching source
1501 operands, do things in registers. */
1502 if (MEM_P (dst))
1503 {
1504 if (rtx_equal_p (dst, src))
1505 matching_memory = true;
1506 else
1507 dst = gen_reg_rtx (mode);
1508 }
1509
1510 /* When source operand is memory, destination must match. */
1511 if (MEM_P (src) && !matching_memory)
1512 src = force_reg (mode, src);
1513
1514 /* Emit the instruction. */
1515
1516 op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
1517
1518 if (code == NOT)
1519 emit_insn (op);
1520 else
1521 {
1522 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1523 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1524 }
1525
1526 /* Fix up the destination if needed. */
1527 if (dst != operands[0])
1528 emit_move_insn (operands[0], dst);
1529}
1530
1531/* Predict just emitted jump instruction to be taken with probability PROB. */
1532
1533static void
1534predict_jump (int prob)
1535{
1536 rtx_insn *insn = get_last_insn ();
1537 gcc_assert (JUMP_P (insn));
1538 add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
1539}
1540
1541/* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
1542 divisor are within the range [0-255]. */
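/* The emitted sequence ORs the dividend and divisor together and tests the
   result against ~0xff: if both fit in 8 bits, a single HImode/QImode divide
   is used and the quotient and remainder are taken from AL and AH; otherwise
   the full-width divide is executed.  */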
1543
1544void
1545ix86_split_idivmod (machine_mode mode, rtx operands[],
1546 bool unsigned_p)
1547{
1548 rtx_code_label *end_label, *qimode_label;
1549 rtx div, mod;
1550 rtx_insn *insn;
1551 rtx scratch, tmp0, tmp1, tmp2;
1552 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
1553
1554 operands[2] = force_reg (mode, operands[2]);
1555 operands[3] = force_reg (mode, operands[3]);
1556
1557 switch (mode)
1558 {
1559 case E_SImode:
1560 if (GET_MODE (operands[0]) == SImode)
1561 {
1562 if (GET_MODE (operands[1]) == SImode)
1563 gen_divmod4_1 = unsigned_p ? gen_udivmodsi4_1 : gen_divmodsi4_1;
1564 else
1565 gen_divmod4_1
1566 = unsigned_p ? gen_udivmodsi4_zext_2 : gen_divmodsi4_zext_2;
1567 }
1568 else
1569 gen_divmod4_1
1570 = unsigned_p ? gen_udivmodsi4_zext_1 : gen_divmodsi4_zext_1;
1571 break;
1572
1573 case E_DImode:
1574 gen_divmod4_1 = unsigned_p ? gen_udivmoddi4_1 : gen_divmoddi4_1;
1575 break;
1576
1577 default:
1578 gcc_unreachable ();
1579 }
1580
1581 end_label = gen_label_rtx ();
1582 qimode_label = gen_label_rtx ();
1583
1584 scratch = gen_reg_rtx (mode);
1585
1586 /* Use 8-bit unsigned divmod if dividend and divisor are within
1587 the range [0-255]. */
1588 emit_move_insn (scratch, operands[2]);
1589 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
1590 scratch, 1, OPTAB_DIRECT);
1591 emit_insn (gen_test_ccno_1 (mode, scratch, GEN_INT (-0x100)));
1592 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
1593 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
1594 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
1595 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
1596 pc_rtx);
1597 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
1598 predict_jump (REG_BR_PROB_BASE * 50 / 100);
1599 JUMP_LABEL (insn) = qimode_label;
1600
1601 /* Generate the original signed/unsigned divmod. */
1602 emit_insn (gen_divmod4_1 (operands[0], operands[1],
1603 operands[2], operands[3]));
1604
1605 /* Branch to the end. */
1606 emit_jump_insn (gen_jump (end_label));
1607 emit_barrier ();
1608
1609 /* Generate 8bit unsigned divide. */
1610 emit_label (qimode_label);
1611 /* Don't use operands[0] for result of 8bit divide since not all
1612 registers support QImode ZERO_EXTRACT. */
1613 tmp0 = lowpart_subreg (HImode, scratch, mode);
1614 tmp1 = lowpart_subreg (HImode, operands[2], mode);
1615 tmp2 = lowpart_subreg (QImode, operands[3], mode);
1616 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
1617
1618 if (unsigned_p)
1619 {
1620 div = gen_rtx_UDIV (mode, operands[2], operands[3]);
1621 mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
1622 }
1623 else
1624 {
1625 div = gen_rtx_DIV (mode, operands[2], operands[3]);
1626 mod = gen_rtx_MOD (mode, operands[2], operands[3]);
1627 }
1628 if (mode == SImode)
1629 {
1630 if (GET_MODE (operands[0]) != SImode)
1631 div = gen_rtx_ZERO_EXTEND (DImode, div);
1632 if (GET_MODE (operands[1]) != SImode)
1633 mod = gen_rtx_ZERO_EXTEND (DImode, mod);
1634 }
1635
1636 /* Extract remainder from AH. */
1637 scratch = gen_lowpart (GET_MODE (operands[1]), scratch);
1638 tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]), scratch,
1639 GEN_INT (8), GEN_INT (8));
1640 insn = emit_move_insn (operands[1], tmp1);
1641 set_unique_reg_note (insn, REG_EQUAL, mod);
1642
1643 /* Zero extend quotient from AL. */
1644 tmp1 = gen_lowpart (QImode, tmp0);
1645 insn = emit_insn (gen_extend_insn
1646 (operands[0], tmp1,
1647 GET_MODE (operands[0]), QImode, 1));
1648 set_unique_reg_note (insn, REG_EQUAL, div);
1649
1650 emit_label (end_label);
1651}
1652
1653/* Emit x86 binary operand CODE in mode MODE, where the first operand
1654 matches destination. RTX includes clobber of FLAGS_REG. */
1655
1656void
1657ix86_emit_binop (enum rtx_code code, machine_mode mode,
1658 rtx dst, rtx src)
1659{
1660 rtx op, clob;
1661
1662 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
1663 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1664
1665 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1666}
1667
1668/* Return true if the definition of regno1 is nearer to INSN than the
 definition of regno2. */
1669
1670static bool
1671find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
1672{
1673 rtx_insn *prev = insn;
1674 rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
1675
1676 if (insn == start)
1677 return false;
1678 while (prev && prev != start)
1679 {
1680 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
1681 {
1682 prev = PREV_INSN (prev);
1683 continue;
1684 }
1685 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
1686 return true;
1687 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
1688 return false;
1689 prev = PREV_INSN (prev);
1690 }
1691
1692 /* None of the regs is defined in the bb. */
1693 return false;
1694}
1695
1696/* INSN_UID of the last insn emitted by zero store peephole2s. */
1697int ix86_last_zero_store_uid;
1698
1699/* Split lea instructions into a sequence of instructions
1700 which are executed on ALU to avoid AGU stalls.
1701 It is assumed that the flags register may be clobbered
1702 at the lea position. */
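/* For example, with scale 2 and distinct registers, lea 0x4(%rbx,%rcx,2),
   %rax becomes: rax = rcx; rax *= 2 (emitted as MULT, later turned into a
   shift); rax += rbx; rax += 4.  */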
1703
1704void
1705ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
1706{
1707 unsigned int regno0, regno1, regno2;
1708 struct ix86_address parts;
1709 rtx target, tmp;
1710 int ok, adds;
1711
1712 ok = ix86_decompose_address (operands[1], &parts);
1713 gcc_assert (ok);
1714
1715 target = gen_lowpart (mode, operands[0]);
1716
1717 regno0 = true_regnum (target);
1718 regno1 = INVALID_REGNUM;
1719 regno2 = INVALID_REGNUM;
1720
1721 if (parts.base)
1722 {
1723 parts.base = gen_lowpart (mode, parts.base);
1724 regno1 = true_regnum (parts.base);
1725 }
1726
1727 if (parts.index)
1728 {
1729 parts.index = gen_lowpart (mode, parts.index);
1730 regno2 = true_regnum (parts.index);
1731 }
1732
1733 if (parts.disp)
1734 parts.disp = gen_lowpart (mode, parts.disp);
1735
1736 if (parts.scale > 1)
1737 {
1738 /* Case r1 = r1 + ... */
1739 if (regno1 == regno0)
1740 {
1741 /* If we have a case r1 = r1 + C * r2 then we
1742 should use multiplication which is very
1743 expensive. Assume cost model is wrong if we
1744 have such case here. */
1745 gcc_assert (regno2 != regno0);
1746
1747 for (adds = parts.scale; adds > 0; adds--)
1748 ix86_emit_binop (PLUS, mode, target, parts.index);
1749 }
1750 else
1751 {
1752 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
1753 if (regno0 != regno2)
1754 emit_insn (gen_rtx_SET (target, parts.index));
1755
1756 /* Use shift for scaling, but emit it as MULT instead
1757 to avoid it being immediately peephole2 optimized back
1758 into lea. */
1759 ix86_emit_binop (MULT, mode, target, GEN_INT (parts.scale));
1760
1761 if (parts.base)
1762 ix86_emit_binop (PLUS, mode, target, parts.base);
1763
1764 if (parts.disp && parts.disp != const0_rtx)
1765 ix86_emit_binop (PLUS, mode, target, parts.disp);
1766 }
1767 }
1768 else if (!parts.base && !parts.index)
1769 {
1770 gcc_assert(parts.disp);
1771 emit_insn (gen_rtx_SET (target, parts.disp));
1772 }
1773 else
1774 {
1775 if (!parts.base)
1776 {
1777 if (regno0 != regno2)
1778 emit_insn (gen_rtx_SET (target, parts.index));
1779 }
1780 else if (!parts.index)
1781 {
1782 if (regno0 != regno1)
1783 emit_insn (gen_rtx_SET (target, parts.base));
1784 }
1785 else
1786 {
1787 if (regno0 == regno1)
1788 tmp = parts.index;
1789 else if (regno0 == regno2)
1790 tmp = parts.base;
1791 else
1792 {
1793 rtx tmp1;
1794
1795 /* Find better operand for SET instruction, depending
1796 on which definition is farther from the insn. */
1797 if (find_nearest_reg_def (insn, regno1, regno2))
1798 tmp = parts.index, tmp1 = parts.base;
1799 else
1800 tmp = parts.base, tmp1 = parts.index;
1801
1802 emit_insn (gen_rtx_SET (target, tmp));
1803
1804 if (parts.disp && parts.disp != const0_rtx)
1805 ix86_emit_binop (PLUS, mode, target, parts.disp);
1806
1807 ix86_emit_binop (PLUS, mode, target, tmp1);
1808 return;
1809 }
1810
1811 ix86_emit_binop (PLUS, mode, target, tmp);
1812 }
1813
1814 if (parts.disp && parts.disp != const0_rtx)
1815 ix86_emit_binop (PLUS, mode, target, parts.disp);
1816 }
1817}
1818
1819/* Post-reload splitter for converting an SF or DFmode value in an
1820 SSE register into an unsigned SImode. */
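/* Inputs of 2^31 or more are reduced by 2^31 before the signed conversion,
   and the top bit is then restored in the integer result via the shifted
   compare mask and the final XOR.  */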
1821
1822void
1823ix86_split_convert_uns_si_sse (rtx operands[])
1824{
1825 machine_mode vecmode;
1826 rtx value, large, zero_or_two31, input, two31, x;
1827
1828 large = operands[1];
1829 zero_or_two31 = operands[2];
1830 input = operands[3];
1831 two31 = operands[4];
1832 vecmode = GET_MODE (large);
1833 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
1834
1835 /* Load up the value into the low element. We must ensure that the other
1836 elements are valid floats -- zero is the easiest such value. */
1837 if (MEM_P (input))
1838 {
1839 if (vecmode == V4SFmode)
1840 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
1841 else
1842 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
1843 }
1844 else
1845 {
1846 input = gen_rtx_REG (vecmode, REGNO (input));
1847 emit_move_insn (value, CONST0_RTX (vecmode));
1848 if (vecmode == V4SFmode)
1849 emit_insn (gen_sse_movss_v4sf (value, value, input));
1850 else
1851 emit_insn (gen_sse2_movsd_v2df (value, value, input));
1852 }
1853
1854 emit_move_insn (large, two31);
1855 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
1856
1857 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
1858 emit_insn (gen_rtx_SET (large, x));
1859
1860 x = gen_rtx_AND (vecmode, zero_or_two31, large);
1861 emit_insn (gen_rtx_SET (zero_or_two31, x));
1862
1863 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
1864 emit_insn (gen_rtx_SET (value, x));
1865
1866 large = gen_rtx_REG (V4SImode, REGNO (large));
1867 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
1868
1869 x = gen_rtx_REG (V4SImode, REGNO (value));
1870 if (vecmode == V4SFmode)
1871 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
1872 else
1873 emit_insn (gen_sse2_cvttpd2dq (x, value));
1874 value = x;
1875
1876 emit_insn (gen_xorv4si3 (value, value, large));
1877}
1878
1879static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
1880 machine_mode mode, rtx target,
1881 rtx var, int one_var);
1882
1883/* Convert an unsigned DImode value into a DFmode, using only SSE.
1884 Expects the 64-bit DImode to be supplied in a pair of integral
1885 registers. Requires SSE2; will use SSE3 if available. For x86_32,
1886 -mfpmath=sse, !optimize_size only. */
1887
1888void
1889ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
1890{
1891 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
1892 rtx int_xmm, fp_xmm;
1893 rtx biases, exponents;
1894 rtx x;
1895
1896 int_xmm = gen_reg_rtx (V4SImode);
1897 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
1898 emit_insn (gen_movdi_to_sse (int_xmm, input));
1899 else if (TARGET_SSE_SPLIT_REGS)
1900 {
1901 emit_clobber (int_xmm);
1902 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
1903 }
1904 else
1905 {
1906 x = gen_reg_rtx (V2DImode);
1907 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
1908 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
1909 }
1910
1911 x = gen_rtx_CONST_VECTOR (V4SImode,
1912 gen_rtvec (4, GEN_INT (0x43300000UL),
1913 GEN_INT (0x45300000UL),
1914 const0_rtx, const0_rtx));
1915 exponents = validize_mem (force_const_mem (V4SImode, x));
1916
1917 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
1918 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
1919
1920 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
1921 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
1922 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
1923 (0x1.0p84 + double(fp_value_hi_xmm)).
1924 Note these exponents differ by 32. */
1925
1926 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
1927
1928 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
1929 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
1930 real_ldexp (&bias_lo_rvt, &dconst1, 52);
1931 real_ldexp (&bias_hi_rvt, &dconst1, 84);
1932 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
1933 x = const_double_from_real_value (bias_hi_rvt, DFmode);
1934 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
1935 biases = validize_mem (force_const_mem (V2DFmode, biases));
1936 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
1937
1938 /* Add the upper and lower DFmode values together. */
1939 if (TARGET_SSE3)
1940 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
1941 else
1942 {
1943 x = copy_to_mode_reg (V2DFmode, fp_xmm);
1944 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
1945 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
1946 }
1947
1948 ix86_expand_vector_extract (false, target, fp_xmm, 0);
1949}
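/* [Editor's illustrative sketch -- not part of i386-expand.cc.]  The
   exponent-word trick described above, written as scalar C with a
   hypothetical helper name.  Gluing the 32-bit halves under the exponent
   words 0x43300000 and 0x45300000 produces 2**52 + lo and 2**84 + hi*2**32;
   subtracting the biases and summing reconstructs the unsigned value with
   a single final rounding, just like the vector sequence.  */
static double
sketch_uns_di_to_df (unsigned long long u)
{
  unsigned long long lo_bits = 0x4330000000000000ULL | (u & 0xffffffffULL);
  unsigned long long hi_bits = 0x4530000000000000ULL | (u >> 32);
  double d_lo, d_hi;
  __builtin_memcpy (&d_lo, &lo_bits, sizeof (d_lo));
  __builtin_memcpy (&d_hi, &hi_bits, sizeof (d_hi));
  /* d_lo == 2**52 + lo, d_hi == 2**84 + hi * 2**32.  */
  return (d_lo - 0x1.0p52) + (d_hi - 0x1.0p84);
}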
1950
1951/* Not used, but eases macroization of patterns. */
1952void
1953ix86_expand_convert_uns_sixf_sse (rtx, rtx)
1954{
1955 gcc_unreachable ();
1956}
1957
0cda606d
UB
1958static rtx ix86_expand_sse_fabs (rtx op0, rtx *smask);
1959
2bf6d935
ML
1960/* Convert an unsigned SImode value into a DFmode. Only currently used
1961 for SSE, but applicable anywhere. */
1962
1963void
1964ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
1965{
1966 REAL_VALUE_TYPE TWO31r;
1967 rtx x, fp;
1968
1969 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
1970 NULL, 1, OPTAB_DIRECT);
1971
1972 fp = gen_reg_rtx (DFmode);
1973 emit_insn (gen_floatsidf2 (fp, x));
1974
1975 real_ldexp (&TWO31r, &dconst1, 31);
1976 x = const_double_from_real_value (TWO31r, DFmode);
1977
1978 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
0cda606d
UB
1979
1980 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
1981 if (HONOR_SIGNED_ZEROS (DFmode) && flag_rounding_math)
1982 x = ix86_expand_sse_fabs (x, NULL);
1983
2bf6d935
ML
1984 if (x != target)
1985 emit_move_insn (target, x);
1986}
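/* [Editor's illustrative sketch -- not part of i386-expand.cc.]  Scalar
   model of the conversion above, assuming the usual two's-complement wrap
   on the unsigned-to-signed conversion (hypothetical helper name): shift
   the value into the signed range, convert, then add 2**31 back.  Exact,
   since every 32-bit integer fits in a double.  */
static double
sketch_uns_si_to_df (unsigned int u)
{
  int s = (int) (u + 0x80000000u);	/* u - 2**31 modulo 2**32 */
  return (double) s + 2147483648.0;
}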
1987
1988/* Convert a signed DImode value into a DFmode. Only used for SSE in
1989 32-bit mode; otherwise we have a direct convert instruction. */
1990
1991void
1992ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
1993{
1994 REAL_VALUE_TYPE TWO32r;
1995 rtx fp_lo, fp_hi, x;
1996
1997 fp_lo = gen_reg_rtx (DFmode);
1998 fp_hi = gen_reg_rtx (DFmode);
1999
2000 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
2001
2002 real_ldexp (&TWO32r, &dconst1, 32);
2003 x = const_double_from_real_value (TWO32r, DFmode);
2004 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
2005
2006 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
2007
2008 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
2009 0, OPTAB_DIRECT);
2010 if (x != target)
2011 emit_move_insn (target, x);
2012}
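/* [Editor's illustrative sketch -- not part of i386-expand.cc.]  Scalar
   model of the signed 64-bit conversion above (hypothetical helper name):
   the high half scaled by 2**32 is exact, and adding the unsigned low half
   performs the single rounding step.  */
static double
sketch_sign_di_to_df (long long v)
{
  int hi = (int) (v >> 32);
  unsigned int lo = (unsigned int) v;
  return (double) hi * 4294967296.0 + (double) lo;
}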
2013
2014/* Convert an unsigned SImode value into a SFmode, using only SSE.
2015 For x86_32, -mfpmath=sse, !optimize_size only. */
2016void
2017ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
2018{
2019 REAL_VALUE_TYPE ONE16r;
2020 rtx fp_hi, fp_lo, int_hi, int_lo, x;
2021
2022 real_ldexp (&ONE16r, &dconst1, 16);
2023 x = const_double_from_real_value (ONE16r, SFmode);
2024 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
2025 NULL, 0, OPTAB_DIRECT);
2026 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
2027 NULL, 0, OPTAB_DIRECT);
2028 fp_hi = gen_reg_rtx (SFmode);
2029 fp_lo = gen_reg_rtx (SFmode);
2030 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
2031 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
ad9fcb96
L
2032 if (TARGET_FMA)
2033 {
2034 x = validize_mem (force_const_mem (SFmode, x));
2035 fp_hi = gen_rtx_FMA (SFmode, fp_hi, x, fp_lo);
2036 emit_move_insn (target, fp_hi);
2037 }
2038 else
2039 {
2040 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
2041 0, OPTAB_DIRECT);
2042 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
2043 0, OPTAB_DIRECT);
2044 if (!rtx_equal_p (target, fp_hi))
2045 emit_move_insn (target, fp_hi);
2046 }
2bf6d935
ML
2047}
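/* [Editor's illustrative sketch -- not part of i386-expand.cc.]  Scalar
   model of the conversion above (hypothetical helper name).  Both 16-bit
   halves convert exactly and hi * 65536.0f is exact as well, so the final
   addition -- or the FMA in the TARGET_FMA path -- is the only rounding.  */
static float
sketch_uns_si_to_sf (unsigned int u)
{
  float lo = (float) (u & 0xffff);
  float hi = (float) (u >> 16);
  return hi * 65536.0f + lo;
}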
2048
2049/* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
2050 a vector of unsigned ints VAL to vector of floats TARGET. */
2051
2052void
2053ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
2054{
2055 rtx tmp[8];
2056 REAL_VALUE_TYPE TWO16r;
2057 machine_mode intmode = GET_MODE (val);
2058 machine_mode fltmode = GET_MODE (target);
2059 rtx (*cvt) (rtx, rtx);
2060
2061 if (intmode == V4SImode)
2062 cvt = gen_floatv4siv4sf2;
2063 else
2064 cvt = gen_floatv8siv8sf2;
2065 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
2066 tmp[0] = force_reg (intmode, tmp[0]);
2067 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
2068 OPTAB_DIRECT);
2069 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
2070 NULL_RTX, 1, OPTAB_DIRECT);
2071 tmp[3] = gen_reg_rtx (fltmode);
2072 emit_insn (cvt (tmp[3], tmp[1]));
2073 tmp[4] = gen_reg_rtx (fltmode);
2074 emit_insn (cvt (tmp[4], tmp[2]));
2075 real_ldexp (&TWO16r, &dconst1, 16);
2076 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
2077 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
ad9fcb96
L
2078 if (TARGET_FMA)
2079 {
2080 tmp[6] = gen_rtx_FMA (fltmode, tmp[4], tmp[5], tmp[3]);
2081 emit_move_insn (target, tmp[6]);
2082 }
2083 else
2084 {
2085 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5],
2086 NULL_RTX, 1, OPTAB_DIRECT);
2087 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6],
2088 target, 1, OPTAB_DIRECT);
2089 if (tmp[7] != target)
2090 emit_move_insn (target, tmp[7]);
2091 }
2bf6d935
ML
2092}
2093
2094/* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
fe42e7fe 2095 pattern can be used on it instead of fixuns_trunc*.
2bf6d935
ML
2096 This is done by doing just signed conversion if < 0x1p31, and otherwise by
2097 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
2098
2099rtx
2100ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
2101{
2102 REAL_VALUE_TYPE TWO31r;
2103 rtx two31r, tmp[4];
2104 machine_mode mode = GET_MODE (val);
2105 machine_mode scalarmode = GET_MODE_INNER (mode);
2106 machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
2107 rtx (*cmp) (rtx, rtx, rtx, rtx);
2108 int i;
2109
2110 for (i = 0; i < 3; i++)
2111 tmp[i] = gen_reg_rtx (mode);
2112 real_ldexp (&TWO31r, &dconst1, 31);
2113 two31r = const_double_from_real_value (TWO31r, scalarmode);
2114 two31r = ix86_build_const_vector (mode, 1, two31r);
2115 two31r = force_reg (mode, two31r);
2116 switch (mode)
2117 {
2118 case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
2119 case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
2120 case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
2121 case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
2122 default: gcc_unreachable ();
2123 }
2124 tmp[3] = gen_rtx_LE (mode, two31r, val);
2125 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
2126 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
2127 0, OPTAB_DIRECT);
2128 if (intmode == V4SImode || TARGET_AVX2)
2129 *xorp = expand_simple_binop (intmode, ASHIFT,
2130 gen_lowpart (intmode, tmp[0]),
2131 GEN_INT (31), NULL_RTX, 0,
2132 OPTAB_DIRECT);
2133 else
2134 {
6a556ba4 2135 rtx two31 = gen_int_mode (HOST_WIDE_INT_1U << 31, SImode);
2bf6d935
ML
2136 two31 = ix86_build_const_vector (intmode, 1, two31);
2137 *xorp = expand_simple_binop (intmode, AND,
2138 gen_lowpart (intmode, tmp[0]),
2139 two31, NULL_RTX, 0,
2140 OPTAB_DIRECT);
2141 }
2142 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
2143 0, OPTAB_DIRECT);
2144}
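/* [Editor's illustrative sketch -- not part of i386-expand.cc.]  Per-lane
   model of the adjustment above (hypothetical helper name): the caller
   then applies a signed truncation and XORs the returned mask back in.  */
static float
sketch_adjust_ufix_to_sfix (float val, unsigned int *xorp)
{
  bool large = val >= 2147483648.0f;	  /* 0x1.0p31 */
  *xorp = large ? 0x80000000u : 0u;	  /* sign bit to XOR in afterwards */
  return large ? val - 2147483648.0f : val;
}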
2145
2146/* Generate code for floating point ABS or NEG. */
2147
2148void
2149ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
2150 rtx operands[])
2151{
f359611b 2152 rtx set, dst, src;
2bf6d935
ML
2153 bool use_sse = false;
2154 bool vector_mode = VECTOR_MODE_P (mode);
2155 machine_mode vmode = mode;
f359611b 2156 rtvec par;
2bf6d935 2157
75a97b59
L
2158 if (vector_mode || mode == TFmode || mode == HFmode)
2159 {
2160 use_sse = true;
2161 if (mode == HFmode)
2162 vmode = V8HFmode;
2163 }
2bf6d935
ML
2164 else if (TARGET_SSE_MATH)
2165 {
2166 use_sse = SSE_FLOAT_MODE_P (mode);
2167 if (mode == SFmode)
2168 vmode = V4SFmode;
2169 else if (mode == DFmode)
2170 vmode = V2DFmode;
2171 }
2172
2bf6d935
ML
2173 dst = operands[0];
2174 src = operands[1];
2175
2176 set = gen_rtx_fmt_e (code, mode, src);
2177 set = gen_rtx_SET (dst, set);
2178
f359611b 2179 if (use_sse)
2bf6d935 2180 {
f359611b 2181 rtx mask, use, clob;
2bf6d935 2182
f359611b
UB
2183 /* NEG and ABS performed with SSE use bitwise mask operations.
2184 Create the appropriate mask now. */
2185 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
2bf6d935 2186 use = gen_rtx_USE (VOIDmode, mask);
94f687bd 2187 if (vector_mode || mode == TFmode)
2bf6d935
ML
2188 par = gen_rtvec (2, set, use);
2189 else
2190 {
2191 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
2192 par = gen_rtvec (3, set, use, clob);
2193 }
2bf6d935
ML
2194 }
2195 else
f359611b
UB
2196 {
2197 rtx clob;
2198
 2199 /* Changing the sign of FP values can also be done using the integer unit. */
2200 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
2201 par = gen_rtvec (2, set, clob);
2202 }
2203
2204 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
2205}
2206
2207/* Deconstruct a floating point ABS or NEG operation
2208 with integer registers into integer operations. */
2209
2210void
2211ix86_split_fp_absneg_operator (enum rtx_code code, machine_mode mode,
2212 rtx operands[])
2213{
2214 enum rtx_code absneg_op;
2215 rtx dst, set;
2216
2217 gcc_assert (operands_match_p (operands[0], operands[1]));
2218
2219 switch (mode)
2220 {
2221 case E_SFmode:
2222 dst = gen_lowpart (SImode, operands[0]);
2223
2224 if (code == ABS)
2225 {
2226 set = gen_int_mode (0x7fffffff, SImode);
2227 absneg_op = AND;
2228 }
2229 else
2230 {
2231 set = gen_int_mode (0x80000000, SImode);
2232 absneg_op = XOR;
2233 }
2234 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
2235 break;
2236
2237 case E_DFmode:
2238 if (TARGET_64BIT)
2239 {
2240 dst = gen_lowpart (DImode, operands[0]);
2241 dst = gen_rtx_ZERO_EXTRACT (DImode, dst, const1_rtx, GEN_INT (63));
2242
2243 if (code == ABS)
2244 set = const0_rtx;
2245 else
2246 set = gen_rtx_NOT (DImode, dst);
2247 }
2248 else
2249 {
2250 dst = gen_highpart (SImode, operands[0]);
2251
2252 if (code == ABS)
2253 {
2254 set = gen_int_mode (0x7fffffff, SImode);
2255 absneg_op = AND;
2256 }
2257 else
2258 {
2259 set = gen_int_mode (0x80000000, SImode);
2260 absneg_op = XOR;
2261 }
2262 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
2263 }
2264 break;
2265
2266 case E_XFmode:
2267 dst = gen_rtx_REG (SImode,
2268 REGNO (operands[0]) + (TARGET_64BIT ? 1 : 2));
2269 if (code == ABS)
2270 {
2271 set = GEN_INT (0x7fff);
2272 absneg_op = AND;
2273 }
2274 else
2275 {
2276 set = GEN_INT (0x8000);
2277 absneg_op = XOR;
2278 }
2279 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
2280 break;
2281
2282 default:
2283 gcc_unreachable ();
2284 }
2285
2286 set = gen_rtx_SET (dst, set);
2287
2288 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
2289 rtvec par = gen_rtvec (2, set, clob);
2290
2291 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
2bf6d935
ML
2292}
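/* [Editor's illustrative sketch -- not part of i386-expand.cc.]  The
   SFmode case above in plain C (hypothetical helper name): on the integer
   side, ABS clears the sign bit and NEG flips it.  */
static float
sketch_int_fp_absneg (float x, bool is_abs)
{
  unsigned int bits;
  __builtin_memcpy (&bits, &x, sizeof (bits));
  bits = is_abs ? (bits & 0x7fffffffu) : (bits ^ 0x80000000u);
  __builtin_memcpy (&x, &bits, sizeof (bits));
  return x;
}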
2293
2294/* Expand a copysign operation. Special case operand 0 being a constant. */
2295
2296void
2297ix86_expand_copysign (rtx operands[])
2298{
2299 machine_mode mode, vmode;
7e691189 2300 rtx dest, vdest, op0, op1, mask, op2, op3;
2bf6d935 2301
60efb1fe 2302 mode = GET_MODE (operands[0]);
2bf6d935 2303
75a97b59
L
2304 if (mode == HFmode)
2305 vmode = V8HFmode;
2306 else if (mode == SFmode)
2bf6d935
ML
2307 vmode = V4SFmode;
2308 else if (mode == DFmode)
2309 vmode = V2DFmode;
987a3082 2310 else if (mode == TFmode)
2bf6d935 2311 vmode = mode;
987a3082
UB
2312 else
2313 gcc_unreachable ();
2314
60efb1fe 2315 if (rtx_equal_p (operands[1], operands[2]))
2bf6d935 2316 {
60efb1fe 2317 emit_move_insn (operands[0], operands[1]);
2bf6d935
ML
2318 return;
2319 }
2320
7e691189
JJ
2321 dest = operands[0];
2322 vdest = lowpart_subreg (vmode, dest, mode);
2323 if (vdest == NULL_RTX)
2324 vdest = gen_reg_rtx (vmode);
2325 else
2326 dest = NULL_RTX;
2327 op1 = lowpart_subreg (vmode, force_reg (mode, operands[2]), mode);
864c6471 2328 mask = ix86_build_signbit_mask (vmode, TARGET_AVX512F && mode != HFmode, 0);
2bf6d935 2329
60efb1fe 2330 if (CONST_DOUBLE_P (operands[1]))
2bf6d935 2331 {
60efb1fe 2332 op0 = simplify_unary_operation (ABS, mode, operands[1], mode);
2333 /* Optimize for 0, simplify b = copy_signf (0.0f, a) to b = mask & a. */
2334 if (op0 == CONST0_RTX (mode))
2bf6d935 2335 {
7e691189
JJ
2336 emit_move_insn (vdest, gen_rtx_AND (vmode, mask, op1));
2337 if (dest)
2338 emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
60efb1fe 2339 return;
2bf6d935 2340 }
2bf6d935 2341
60efb1fe 2342 if (GET_MODE_SIZE (mode) < 16)
2343 op0 = ix86_build_const_vector (vmode, false, op0);
2344 op0 = force_reg (vmode, op0);
2bf6d935 2345 }
60efb1fe 2346 else
7e691189 2347 op0 = lowpart_subreg (vmode, force_reg (mode, operands[1]), mode);
60efb1fe 2348
2349 op2 = gen_reg_rtx (vmode);
2350 op3 = gen_reg_rtx (vmode);
2351 emit_move_insn (op2, gen_rtx_AND (vmode,
2352 gen_rtx_NOT (vmode, mask),
2353 op0));
2354 emit_move_insn (op3, gen_rtx_AND (vmode, mask, op1));
7e691189
JJ
2355 emit_move_insn (vdest, gen_rtx_IOR (vmode, op2, op3));
2356 if (dest)
2357 emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
2bf6d935
ML
2358}
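/* [Editor's illustrative sketch -- not part of i386-expand.cc.]  The mask
   sequence emitted above, as scalar C with a hypothetical helper name:
   clear the sign of the magnitude operand, isolate the sign of the sign
   operand, then OR the two together.  */
static float
sketch_copysign_sf (float mag, float sgn)
{
  const unsigned int mask = 0x80000000u;
  unsigned int m, s;
  __builtin_memcpy (&m, &mag, sizeof (m));
  __builtin_memcpy (&s, &sgn, sizeof (s));
  m = (m & ~mask) | (s & mask);
  __builtin_memcpy (&mag, &m, sizeof (m));
  return mag;
}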
2359
2360/* Expand an xorsign operation. */
2361
2362void
2363ix86_expand_xorsign (rtx operands[])
2364{
2bf6d935 2365 machine_mode mode, vmode;
7e691189 2366 rtx dest, vdest, op0, op1, mask, x, temp;
2bf6d935
ML
2367
2368 dest = operands[0];
2369 op0 = operands[1];
2370 op1 = operands[2];
2371
2372 mode = GET_MODE (dest);
2373
75a97b59
L
2374 if (mode == HFmode)
2375 vmode = V8HFmode;
2376 else if (mode == SFmode)
987a3082 2377 vmode = V4SFmode;
2bf6d935 2378 else if (mode == DFmode)
987a3082 2379 vmode = V2DFmode;
2bf6d935
ML
2380 else
2381 gcc_unreachable ();
2382
7485a525 2383 temp = gen_reg_rtx (vmode);
2bf6d935
ML
2384 mask = ix86_build_signbit_mask (vmode, 0, 0);
2385
7e691189 2386 op1 = lowpart_subreg (vmode, force_reg (mode, op1), mode);
7485a525
JJ
2387 x = gen_rtx_AND (vmode, op1, mask);
2388 emit_insn (gen_rtx_SET (temp, x));
2bf6d935 2389
7e691189 2390 op0 = lowpart_subreg (vmode, force_reg (mode, op0), mode);
7485a525 2391 x = gen_rtx_XOR (vmode, temp, op0);
652bef70 2392
7e691189
JJ
2393 vdest = lowpart_subreg (vmode, dest, mode);
2394 if (vdest == NULL_RTX)
2395 vdest = gen_reg_rtx (vmode);
2396 else
2397 dest = NULL_RTX;
2398 emit_insn (gen_rtx_SET (vdest, x));
2399
2400 if (dest)
2401 emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
2bf6d935
ML
2402}
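/* [Editor's illustrative sketch -- not part of i386-expand.cc.]  xorsign
   as scalar C (hypothetical helper name): the sign bit of Y is XORed into
   X, flipping X's sign exactly when Y is negative.  */
static float
sketch_xorsign_sf (float x, float y)
{
  unsigned int xb, yb;
  __builtin_memcpy (&xb, &x, sizeof (xb));
  __builtin_memcpy (&yb, &y, sizeof (yb));
  xb ^= yb & 0x80000000u;
  __builtin_memcpy (&x, &xb, sizeof (xb));
  return x;
}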
2403
2404static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1);
2405
2406void
2407ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
2408{
2409 machine_mode mode = GET_MODE (op0);
2410 rtx tmp;
2411
 2412 /* Handle the special case of a vector comparison with a boolean result;
 2413 transform it using the ptest instruction. */
850a13d7 2414 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
4afbebcd 2415 || (mode == TImode && !TARGET_64BIT)
850a13d7 2416 || mode == OImode)
2bf6d935
ML
2417 {
2418 rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
2419 machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
2420
2421 gcc_assert (code == EQ || code == NE);
850a13d7 2422
4afbebcd 2423 if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
850a13d7 2424 {
2425 op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
2426 op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
2427 mode = p_mode;
2428 }
2bf6d935
ML
 2429 /* Generate XOR since we can't check that one operand is a zero vector. */
2430 tmp = gen_reg_rtx (mode);
2431 emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
2432 tmp = gen_lowpart (p_mode, tmp);
3635e8c6
RS
2433 emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
2434 gen_rtx_UNSPEC (CCZmode,
2bf6d935
ML
2435 gen_rtvec (2, tmp, tmp),
2436 UNSPEC_PTEST)));
2437 tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
2438 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2439 gen_rtx_LABEL_REF (VOIDmode, label),
2440 pc_rtx);
2441 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2442 return;
2443 }
2444
2445 switch (mode)
2446 {
a6841211 2447 case E_HFmode:
2bf6d935
ML
2448 case E_SFmode:
2449 case E_DFmode:
2450 case E_XFmode:
2451 case E_QImode:
2452 case E_HImode:
2453 case E_SImode:
2454 simple:
2455 tmp = ix86_expand_compare (code, op0, op1);
2456 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2457 gen_rtx_LABEL_REF (VOIDmode, label),
2458 pc_rtx);
2459 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2460 return;
2461
2462 case E_DImode:
2463 if (TARGET_64BIT)
2464 goto simple;
2bf6d935
ML
2465 /* FALLTHRU */
2466 case E_TImode:
43201f2c
RS
2467 /* DI and TI mode equality/inequality comparisons may be performed
2468 on SSE registers. Avoid splitting them, except when optimizing
2469 for size. */
2470 if ((code == EQ || code == NE)
2471 && !optimize_insn_for_size_p ())
2472 goto simple;
2473
2bf6d935
ML
2474 /* Expand DImode branch into multiple compare+branch. */
2475 {
2476 rtx lo[2], hi[2];
2477 rtx_code_label *label2;
2478 enum rtx_code code1, code2, code3;
2479 machine_mode submode;
2480
2481 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
2482 {
2483 std::swap (op0, op1);
2484 code = swap_condition (code);
2485 }
2486
2487 split_double_mode (mode, &op0, 1, lo+0, hi+0);
2488 split_double_mode (mode, &op1, 1, lo+1, hi+1);
2489
2490 submode = mode == DImode ? SImode : DImode;
2491
43201f2c 2492 /* If we are doing less-than or greater-or-equal-than,
2bf6d935
ML
2493 op1 is a constant and the low word is zero, then we can just
2494 examine the high word. Similarly for low word -1 and
2495 less-or-equal-than or greater-than. */
2496
2497 if (CONST_INT_P (hi[1]))
2498 switch (code)
2499 {
2500 case LT: case LTU: case GE: case GEU:
2501 if (lo[1] == const0_rtx)
2502 {
2503 ix86_expand_branch (code, hi[0], hi[1], label);
2504 return;
2505 }
2506 break;
2507 case LE: case LEU: case GT: case GTU:
2508 if (lo[1] == constm1_rtx)
2509 {
2510 ix86_expand_branch (code, hi[0], hi[1], label);
2511 return;
2512 }
2513 break;
2514 default:
2515 break;
2516 }
2517
2518 /* Emulate comparisons that do not depend on Zero flag with
2519 double-word subtraction. Note that only Overflow, Sign
2520 and Carry flags are valid, so swap arguments and condition
2521 of comparisons that would otherwise test Zero flag. */
2522
2523 switch (code)
2524 {
2525 case LE: case LEU: case GT: case GTU:
2526 std::swap (lo[0], lo[1]);
2527 std::swap (hi[0], hi[1]);
2528 code = swap_condition (code);
2529 /* FALLTHRU */
2530
2531 case LT: case LTU: case GE: case GEU:
2532 {
2bf6d935 2533 bool uns = (code == LTU || code == GEU);
987a3082
UB
2534 rtx (*sbb_insn) (machine_mode, rtx, rtx, rtx)
2535 = uns ? gen_sub3_carry_ccc : gen_sub3_carry_ccgz;
2bf6d935
ML
2536
2537 if (!nonimmediate_operand (lo[0], submode))
2538 lo[0] = force_reg (submode, lo[0]);
2539 if (!x86_64_general_operand (lo[1], submode))
2540 lo[1] = force_reg (submode, lo[1]);
2541
2542 if (!register_operand (hi[0], submode))
2543 hi[0] = force_reg (submode, hi[0]);
2544 if ((uns && !nonimmediate_operand (hi[1], submode))
2545 || (!uns && !x86_64_general_operand (hi[1], submode)))
2546 hi[1] = force_reg (submode, hi[1]);
2547
987a3082 2548 emit_insn (gen_cmp_1 (submode, lo[0], lo[1]));
2bf6d935 2549
987a3082
UB
2550 tmp = gen_rtx_SCRATCH (submode);
2551 emit_insn (sbb_insn (submode, tmp, hi[0], hi[1]));
2bf6d935 2552
987a3082 2553 tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
2bf6d935
ML
2554 ix86_expand_branch (code, tmp, const0_rtx, label);
2555 return;
2556 }
2557
2558 default:
2559 break;
2560 }
2561
2562 /* Otherwise, we need two or three jumps. */
2563
2564 label2 = gen_label_rtx ();
2565
2566 code1 = code;
2567 code2 = swap_condition (code);
2568 code3 = unsigned_condition (code);
2569
2570 switch (code)
2571 {
2572 case LT: case GT: case LTU: case GTU:
2573 break;
2574
2575 case LE: code1 = LT; code2 = GT; break;
2576 case GE: code1 = GT; code2 = LT; break;
2577 case LEU: code1 = LTU; code2 = GTU; break;
2578 case GEU: code1 = GTU; code2 = LTU; break;
2579
2580 case EQ: code1 = UNKNOWN; code2 = NE; break;
2581 case NE: code2 = UNKNOWN; break;
2582
2583 default:
2584 gcc_unreachable ();
2585 }
2586
2587 /*
2588 * a < b =>
2589 * if (hi(a) < hi(b)) goto true;
2590 * if (hi(a) > hi(b)) goto false;
2591 * if (lo(a) < lo(b)) goto true;
2592 * false:
2593 */
2594
2595 if (code1 != UNKNOWN)
2596 ix86_expand_branch (code1, hi[0], hi[1], label);
2597 if (code2 != UNKNOWN)
2598 ix86_expand_branch (code2, hi[0], hi[1], label2);
2599
2600 ix86_expand_branch (code3, lo[0], lo[1], label);
2601
2602 if (code2 != UNKNOWN)
2603 emit_label (label2);
2604 return;
2605 }
2606
2607 default:
2608 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
2609 goto simple;
2610 }
2611}
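/* [Editor's illustrative sketch -- not part of i386-expand.cc.]  The
   three-jump scheme used above for a signed double-word "a < b" when the
   sbb-based path does not apply (hypothetical helper name): the high
   halves decide unless they are equal, in which case the low halves are
   compared unsigned.  */
static bool
sketch_double_word_lt (int hi_a, unsigned int lo_a,
		       int hi_b, unsigned int lo_b)
{
  if (hi_a < hi_b)
    return true;
  if (hi_a > hi_b)
    return false;
  return lo_a < lo_b;
}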
2612
2613/* Figure out whether to use unordered fp comparisons. */
2614
2615static bool
2616ix86_unordered_fp_compare (enum rtx_code code)
2617{
2618 if (!TARGET_IEEE_FP)
2619 return false;
2620
2621 switch (code)
2622 {
2bf6d935
ML
2623 case LT:
2624 case LE:
d6038777
UB
2625 case GT:
2626 case GE:
2627 case LTGT:
2bf6d935
ML
2628 return false;
2629
2630 case EQ:
2631 case NE:
2632
2bf6d935
ML
2633 case UNORDERED:
2634 case ORDERED:
2635 case UNLT:
2636 case UNLE:
2637 case UNGT:
2638 case UNGE:
2639 case UNEQ:
2640 return true;
2641
2642 default:
2643 gcc_unreachable ();
2644 }
2645}
2646
 2651 2647 /* Return a comparison we can do that is equivalent to
 2652 2648 swap_condition (code), apart possibly from orderedness.
2649 But, never change orderedness if TARGET_IEEE_FP, returning
2650 UNKNOWN in that case if necessary. */
2651
2652static enum rtx_code
2653ix86_fp_swap_condition (enum rtx_code code)
2654{
2655 switch (code)
2656 {
2657 case GT: /* GTU - CF=0 & ZF=0 */
2658 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
2659 case GE: /* GEU - CF=0 */
2660 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
2661 case UNLT: /* LTU - CF=1 */
2662 return TARGET_IEEE_FP ? UNKNOWN : GT;
2663 case UNLE: /* LEU - CF=1 | ZF=1 */
2664 return TARGET_IEEE_FP ? UNKNOWN : GE;
2665 default:
2666 return swap_condition (code);
2667 }
2668}
2669
2670/* Return cost of comparison CODE using the best strategy for performance.
 2675 2671 All of the following functions use the number of instructions as the cost metric.
 2676 2672 In the future this should be tweaked to compute bytes for optimize_size and
 2677 2673 take into account the performance of various instructions on various CPUs. */
2674
2675static int
2676ix86_fp_comparison_cost (enum rtx_code code)
2677{
2678 int arith_cost;
2679
2680 /* The cost of code using bit-twiddling on %ah. */
2681 switch (code)
2682 {
2683 case UNLE:
2684 case UNLT:
2685 case LTGT:
2686 case GT:
2687 case GE:
2688 case UNORDERED:
2689 case ORDERED:
2690 case UNEQ:
2691 arith_cost = 4;
2692 break;
2693 case LT:
2694 case NE:
2695 case EQ:
2696 case UNGE:
2697 arith_cost = TARGET_IEEE_FP ? 5 : 4;
2698 break;
2699 case LE:
2700 case UNGT:
2701 arith_cost = TARGET_IEEE_FP ? 6 : 4;
2702 break;
2703 default:
2704 gcc_unreachable ();
2705 }
2706
2707 switch (ix86_fp_comparison_strategy (code))
2708 {
2709 case IX86_FPCMP_COMI:
2710 return arith_cost > 4 ? 3 : 2;
2711 case IX86_FPCMP_SAHF:
2712 return arith_cost > 4 ? 4 : 3;
2713 default:
2714 return arith_cost;
2715 }
2716}
2717
2718/* Swap, force into registers, or otherwise massage the two operands
2719 to a fp comparison. The operands are updated in place; the new
2720 comparison code is returned. */
2721
2722static enum rtx_code
2723ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
2724{
2725 bool unordered_compare = ix86_unordered_fp_compare (code);
2726 rtx op0 = *pop0, op1 = *pop1;
2727 machine_mode op_mode = GET_MODE (op0);
a6841211 2728 bool is_sse = SSE_FLOAT_MODE_SSEMATH_OR_HF_P (op_mode);
2bf6d935 2729
5792208f
JJ
2730 if (op_mode == BFmode)
2731 {
2732 rtx op = gen_lowpart (HImode, op0);
2733 if (CONST_INT_P (op))
2734 op = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
2735 op0, BFmode);
2736 else
2737 {
2738 rtx t1 = gen_reg_rtx (SImode);
2739 emit_insn (gen_zero_extendhisi2 (t1, op));
2740 emit_insn (gen_ashlsi3 (t1, t1, GEN_INT (16)));
2741 op = gen_lowpart (SFmode, t1);
2742 }
2743 *pop0 = op;
2744 op = gen_lowpart (HImode, op1);
2745 if (CONST_INT_P (op))
2746 op = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
2747 op1, BFmode);
2748 else
2749 {
2750 rtx t1 = gen_reg_rtx (SImode);
2751 emit_insn (gen_zero_extendhisi2 (t1, op));
2752 emit_insn (gen_ashlsi3 (t1, t1, GEN_INT (16)));
2753 op = gen_lowpart (SFmode, t1);
2754 }
2755 *pop1 = op;
2756 return ix86_prepare_fp_compare_args (code, pop0, pop1);
2757 }
2758
2bf6d935
ML
2759 /* All of the unordered compare instructions only work on registers.
2760 The same is true of the fcomi compare instructions. The XFmode
2761 compare instructions require registers except when comparing
2762 against zero or when converting operand 1 from fixed point to
2763 floating point. */
2764
2765 if (!is_sse
2766 && (unordered_compare
2767 || (op_mode == XFmode
2768 && ! (standard_80387_constant_p (op0) == 1
2769 || standard_80387_constant_p (op1) == 1)
2770 && GET_CODE (op1) != FLOAT)
2771 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
2772 {
2773 op0 = force_reg (op_mode, op0);
2774 op1 = force_reg (op_mode, op1);
2775 }
2776 else
2777 {
2778 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
2779 things around if they appear profitable, otherwise force op0
2780 into a register. */
2781
2782 if (standard_80387_constant_p (op0) == 0
2783 || (MEM_P (op0)
2784 && ! (standard_80387_constant_p (op1) == 0
2785 || MEM_P (op1))))
2786 {
2787 enum rtx_code new_code = ix86_fp_swap_condition (code);
2788 if (new_code != UNKNOWN)
2789 {
2790 std::swap (op0, op1);
2791 code = new_code;
2792 }
2793 }
2794
2795 if (!REG_P (op0))
2796 op0 = force_reg (op_mode, op0);
2797
2798 if (CONSTANT_P (op1))
2799 {
2800 int tmp = standard_80387_constant_p (op1);
2801 if (tmp == 0)
2802 op1 = validize_mem (force_const_mem (op_mode, op1));
2803 else if (tmp == 1)
2804 {
2805 if (TARGET_CMOVE)
2806 op1 = force_reg (op_mode, op1);
2807 }
2808 else
2809 op1 = force_reg (op_mode, op1);
2810 }
2811 }
2812
2813 /* Try to rearrange the comparison to make it cheaper. */
2814 if (ix86_fp_comparison_cost (code)
2815 > ix86_fp_comparison_cost (swap_condition (code))
2816 && (REG_P (op1) || can_create_pseudo_p ()))
2817 {
2818 std::swap (op0, op1);
2819 code = swap_condition (code);
2820 if (!REG_P (op0))
2821 op0 = force_reg (op_mode, op0);
2822 }
2823
2824 *pop0 = op0;
2825 *pop1 = op1;
2826 return code;
2827}
2828
2829/* Generate insn patterns to do a floating point compare of OPERANDS. */
2830
2831static rtx
2832ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)
2833{
2834 bool unordered_compare = ix86_unordered_fp_compare (code);
2835 machine_mode cmp_mode;
2836 rtx tmp, scratch;
2837
2838 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
2839
2840 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
2841 if (unordered_compare)
2842 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
2843
2844 /* Do fcomi/sahf based test when profitable. */
2845 switch (ix86_fp_comparison_strategy (code))
2846 {
2847 case IX86_FPCMP_COMI:
2848 cmp_mode = CCFPmode;
2849 emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
2850 break;
2851
2852 case IX86_FPCMP_SAHF:
2853 cmp_mode = CCFPmode;
2854 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2855 scratch = gen_reg_rtx (HImode);
2856 emit_insn (gen_rtx_SET (scratch, tmp));
2857 emit_insn (gen_x86_sahf_1 (scratch));
2858 break;
2859
2860 case IX86_FPCMP_ARITH:
2861 cmp_mode = CCNOmode;
2862 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2863 scratch = gen_reg_rtx (HImode);
2864 emit_insn (gen_rtx_SET (scratch, tmp));
2865
2866 /* In the unordered case, we have to check C2 for NaN's, which
2867 doesn't happen to work out to anything nice combination-wise.
2868 So do some bit twiddling on the value we've got in AH to come
2869 up with an appropriate set of condition codes. */
2870
2871 switch (code)
2872 {
2873 case GT:
2874 case UNGT:
2875 if (code == GT || !TARGET_IEEE_FP)
2876 {
2877 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2878 code = EQ;
2879 }
2880 else
2881 {
2882 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2883 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2884 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
2885 cmp_mode = CCmode;
2886 code = GEU;
2887 }
2888 break;
2889 case LT:
2890 case UNLT:
2891 if (code == LT && TARGET_IEEE_FP)
2892 {
2893 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2894 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
2895 cmp_mode = CCmode;
2896 code = EQ;
2897 }
2898 else
2899 {
2900 emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
2901 code = NE;
2902 }
2903 break;
2904 case GE:
2905 case UNGE:
2906 if (code == GE || !TARGET_IEEE_FP)
2907 {
2908 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
2909 code = EQ;
2910 }
2911 else
2912 {
2913 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2914 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
2915 code = NE;
2916 }
2917 break;
2918 case LE:
2919 case UNLE:
2920 if (code == LE && TARGET_IEEE_FP)
2921 {
2922 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2923 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2924 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2925 cmp_mode = CCmode;
2926 code = LTU;
2927 }
2928 else
2929 {
2930 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2931 code = NE;
2932 }
2933 break;
2934 case EQ:
2935 case UNEQ:
2936 if (code == EQ && TARGET_IEEE_FP)
2937 {
2938 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2939 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2940 cmp_mode = CCmode;
2941 code = EQ;
2942 }
2943 else
2944 {
2945 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2946 code = NE;
2947 }
2948 break;
2949 case NE:
2950 case LTGT:
2951 if (code == NE && TARGET_IEEE_FP)
2952 {
2953 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2954 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
2955 GEN_INT (0x40)));
2956 code = NE;
2957 }
2958 else
2959 {
2960 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2961 code = EQ;
2962 }
2963 break;
2964
2965 case UNORDERED:
2966 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2967 code = NE;
2968 break;
2969 case ORDERED:
2970 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2971 code = EQ;
2972 break;
2973
2974 default:
2975 gcc_unreachable ();
2976 }
2977 break;
2978
2979 default:
2980 gcc_unreachable();
2981 }
2982
2983 /* Return the test that should be put into the flags user, i.e.
2984 the bcc, scc, or cmov instruction. */
2985 return gen_rtx_fmt_ee (code, VOIDmode,
2986 gen_rtx_REG (cmp_mode, FLAGS_REG),
2987 const0_rtx);
2988}
2989
2990/* Generate insn patterns to do an integer compare of OPERANDS. */
2991
2992static rtx
2993ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
2994{
2995 machine_mode cmpmode;
2996 rtx tmp, flags;
2997
86403f4e
UB
2998 /* Swap operands to emit carry flag comparison. */
2999 if ((code == GTU || code == LEU)
3000 && nonimmediate_operand (op1, VOIDmode))
3001 {
3002 std::swap (op0, op1);
3003 code = swap_condition (code);
3004 }
3005
2bf6d935
ML
3006 cmpmode = SELECT_CC_MODE (code, op0, op1);
3007 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
3008
46ade8c9
RS
3009 /* Attempt to use PTEST, if available, when testing vector modes for
3010 equality/inequality against zero. */
3011 if (op1 == const0_rtx
3012 && SUBREG_P (op0)
3013 && cmpmode == CCZmode
3014 && SUBREG_BYTE (op0) == 0
3015 && REG_P (SUBREG_REG (op0))
3016 && VECTOR_MODE_P (GET_MODE (SUBREG_REG (op0)))
3017 && TARGET_SSE4_1
3018 && GET_MODE (op0) == TImode
3019 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op0))) == 16)
3020 {
3021 tmp = SUBREG_REG (op0);
3022 tmp = gen_rtx_UNSPEC (CCZmode, gen_rtvec (2, tmp, tmp), UNSPEC_PTEST);
3023 }
3024 else
3025 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
3026
2bf6d935
ML
3027 /* This is very simple, but making the interface the same as in the
3028 FP case makes the rest of the code easier. */
2bf6d935
ML
3029 emit_insn (gen_rtx_SET (flags, tmp));
3030
3031 /* Return the test that should be put into the flags user, i.e.
3032 the bcc, scc, or cmov instruction. */
3033 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
3034}
3035
3036static rtx
3037ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
3038{
3039 rtx ret;
3040
3041 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
3042 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
3043
3044 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
3045 {
3046 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
3047 ret = ix86_expand_fp_compare (code, op0, op1);
3048 }
3049 else
3050 ret = ix86_expand_int_compare (code, op0, op1);
3051
3052 return ret;
3053}
3054
3055void
3056ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
3057{
3058 rtx ret;
3059
3060 gcc_assert (GET_MODE (dest) == QImode);
3061
3062 ret = ix86_expand_compare (code, op0, op1);
3063 PUT_MODE (ret, QImode);
3064 emit_insn (gen_rtx_SET (dest, ret));
3065}
3066
463d9108
JJ
3067/* Expand floating point op0 <=> op1, i.e.
3068 dest = op0 == op1 ? 0 : op0 < op1 ? -1 : op0 > op1 ? 1 : 2. */
3069
3070void
3071ix86_expand_fp_spaceship (rtx dest, rtx op0, rtx op1)
3072{
3073 gcc_checking_assert (ix86_fp_comparison_strategy (GT) != IX86_FPCMP_ARITH);
3074 rtx gt = ix86_expand_fp_compare (GT, op0, op1);
3075 rtx l0 = gen_label_rtx ();
3076 rtx l1 = gen_label_rtx ();
3077 rtx l2 = TARGET_IEEE_FP ? gen_label_rtx () : NULL_RTX;
3078 rtx lend = gen_label_rtx ();
3079 rtx tmp;
3080 rtx_insn *jmp;
3081 if (l2)
3082 {
3083 rtx un = gen_rtx_fmt_ee (UNORDERED, VOIDmode,
3084 gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx);
3085 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, un,
3086 gen_rtx_LABEL_REF (VOIDmode, l2), pc_rtx);
3087 jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
 3088 add_reg_br_prob_note (jmp, profile_probability::very_unlikely ());
3089 }
3090 rtx eq = gen_rtx_fmt_ee (UNEQ, VOIDmode,
3091 gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx);
3092 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, eq,
3093 gen_rtx_LABEL_REF (VOIDmode, l0), pc_rtx);
3094 jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
3095 add_reg_br_prob_note (jmp, profile_probability::unlikely ());
3096 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, gt,
3097 gen_rtx_LABEL_REF (VOIDmode, l1), pc_rtx);
3098 jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
3099 add_reg_br_prob_note (jmp, profile_probability::even ());
3100 emit_move_insn (dest, constm1_rtx);
3101 emit_jump (lend);
3102 emit_label (l0);
3103 emit_move_insn (dest, const0_rtx);
3104 emit_jump (lend);
3105 emit_label (l1);
3106 emit_move_insn (dest, const1_rtx);
3107 emit_jump (lend);
3108 if (l2)
3109 {
3110 emit_label (l2);
3111 emit_move_insn (dest, const2_rtx);
3112 }
3113 emit_label (lend);
3114}
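/* [Editor's illustrative sketch -- not part of i386-expand.cc.]  The value
   the branchy expansion above materializes for op0 <=> op1 (hypothetical
   helper name); 2 encodes the unordered case kept under TARGET_IEEE_FP.  */
static int
sketch_fp_spaceship (double a, double b)
{
  if (a != a || b != b)		/* unordered: at least one NaN */
    return 2;
  if (a == b)
    return 0;
  return a > b ? 1 : -1;
}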
3115
2bf6d935
ML
3116/* Expand comparison setting or clearing carry flag. Return true when
3117 successful and set pop for the operation. */
3118static bool
3119ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
3120{
3121 machine_mode mode
3122 = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
3123
3124 /* Do not handle double-mode compares that go through special path. */
3125 if (mode == (TARGET_64BIT ? TImode : DImode))
3126 return false;
3127
3128 if (SCALAR_FLOAT_MODE_P (mode))
3129 {
3130 rtx compare_op;
3131 rtx_insn *compare_seq;
3132
3133 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
3134
 3135 /* Shortcut: the following common codes never translate
3136 into carry flag compares. */
3137 if (code == EQ || code == NE || code == UNEQ || code == LTGT
3138 || code == ORDERED || code == UNORDERED)
3139 return false;
3140
3141 /* These comparisons require zero flag; swap operands so they won't. */
3142 if ((code == GT || code == UNLE || code == LE || code == UNGT)
3143 && !TARGET_IEEE_FP)
3144 {
3145 std::swap (op0, op1);
3146 code = swap_condition (code);
3147 }
3148
3149 /* Try to expand the comparison and verify that we end up with
 3150 a carry-flag-based comparison. This fails to be true only when
 3151 we decide to expand the comparison using arithmetic, which is not
 3152 a common scenario. */
3153 start_sequence ();
3154 compare_op = ix86_expand_fp_compare (code, op0, op1);
3155 compare_seq = get_insns ();
3156 end_sequence ();
3157
3158 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
3159 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
3160 else
3161 code = GET_CODE (compare_op);
3162
3163 if (code != LTU && code != GEU)
3164 return false;
3165
3166 emit_insn (compare_seq);
3167 *pop = compare_op;
3168 return true;
3169 }
3170
3171 if (!INTEGRAL_MODE_P (mode))
3172 return false;
3173
3174 switch (code)
3175 {
3176 case LTU:
3177 case GEU:
3178 break;
3179
3180 /* Convert a==0 into (unsigned)a<1. */
3181 case EQ:
3182 case NE:
3183 if (op1 != const0_rtx)
3184 return false;
3185 op1 = const1_rtx;
3186 code = (code == EQ ? LTU : GEU);
3187 break;
3188
3189 /* Convert a>b into b<a or a>=b-1. */
3190 case GTU:
3191 case LEU:
3192 if (CONST_INT_P (op1))
3193 {
3194 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
3195 /* Bail out on overflow. We still can swap operands but that
3196 would force loading of the constant into register. */
3197 if (op1 == const0_rtx
3198 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
3199 return false;
3200 code = (code == GTU ? GEU : LTU);
3201 }
3202 else
3203 {
3204 std::swap (op0, op1);
3205 code = (code == GTU ? LTU : GEU);
3206 }
3207 break;
3208
3209 /* Convert a>=0 into (unsigned)a<0x80000000. */
3210 case LT:
3211 case GE:
3212 if (mode == DImode || op1 != const0_rtx)
3213 return false;
3214 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
3215 code = (code == LT ? GEU : LTU);
3216 break;
3217 case LE:
3218 case GT:
3219 if (mode == DImode || op1 != constm1_rtx)
3220 return false;
3221 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
3222 code = (code == LE ? GEU : LTU);
3223 break;
3224
3225 default:
3226 return false;
3227 }
 3228 /* Swapping operands may cause a constant to appear as the first operand. */
3229 if (!nonimmediate_operand (op0, VOIDmode))
3230 {
3231 if (!can_create_pseudo_p ())
3232 return false;
3233 op0 = force_reg (mode, op0);
3234 }
3235 *pop = ix86_expand_compare (code, op0, op1);
3236 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
3237 return true;
3238}
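/* [Editor's illustrative sketch -- not part of i386-expand.cc.]  The
   integer rewrites above turn several predicates into LTU/GEU so that a
   single carry-consuming instruction can be used.  This hypothetical
   checker returns true for every A, assuming the usual two's-complement
   behaviour of the unsigned-to-signed cast.  */
static bool
sketch_carry_flag_rewrites_hold (unsigned int a)
{
  return ((a == 0) == (a < 1u))			   /* a == 0  ->  a < 1 */
	 && ((a > 42u) == (a >= 43u))		   /* a > C   ->  a >= C+1 */
	 && (((int) a >= 0) == (a < 0x80000000u)); /* a >= 0  ->  a < 2**31 */
}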
3239
 3240/* Expand conditional increment or decrement using adc/sbb instructions.
3241 The default case using setcc followed by the conditional move can be
3242 done by generic code. */
3243bool
3244ix86_expand_int_addcc (rtx operands[])
3245{
3246 enum rtx_code code = GET_CODE (operands[1]);
3247 rtx flags;
987a3082 3248 rtx (*insn) (machine_mode, rtx, rtx, rtx, rtx, rtx);
2bf6d935
ML
3249 rtx compare_op;
3250 rtx val = const0_rtx;
3251 bool fpcmp = false;
3252 machine_mode mode;
3253 rtx op0 = XEXP (operands[1], 0);
3254 rtx op1 = XEXP (operands[1], 1);
3255
3256 if (operands[3] != const1_rtx
3257 && operands[3] != constm1_rtx)
3258 return false;
3259 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
3260 return false;
3261 code = GET_CODE (compare_op);
3262
3263 flags = XEXP (compare_op, 0);
3264
3265 if (GET_MODE (flags) == CCFPmode)
3266 {
3267 fpcmp = true;
3268 code = ix86_fp_compare_code_to_integer (code);
3269 }
3270
3271 if (code != LTU)
3272 {
3273 val = constm1_rtx;
3274 if (fpcmp)
3275 PUT_CODE (compare_op,
3276 reverse_condition_maybe_unordered
3277 (GET_CODE (compare_op)));
3278 else
3279 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
3280 }
3281
3282 mode = GET_MODE (operands[0]);
3283
3284 /* Construct either adc or sbb insn. */
3285 if ((code == LTU) == (operands[3] == constm1_rtx))
987a3082 3286 insn = gen_sub3_carry;
2bf6d935 3287 else
987a3082
UB
3288 insn = gen_add3_carry;
3289
3290 emit_insn (insn (mode, operands[0], operands[2], val, flags, compare_op));
2bf6d935
ML
3291
3292 return true;
3293}
3294
3295bool
3296ix86_expand_int_movcc (rtx operands[])
3297{
3298 enum rtx_code code = GET_CODE (operands[1]), compare_code;
3299 rtx_insn *compare_seq;
3300 rtx compare_op;
3301 machine_mode mode = GET_MODE (operands[0]);
3302 bool sign_bit_compare_p = false;
f1652e33 3303 bool negate_cc_compare_p = false;
2bf6d935
ML
3304 rtx op0 = XEXP (operands[1], 0);
3305 rtx op1 = XEXP (operands[1], 1);
1ceddd74
JJ
3306 rtx op2 = operands[2];
3307 rtx op3 = operands[3];
2bf6d935
ML
3308
3309 if (GET_MODE (op0) == TImode
3310 || (GET_MODE (op0) == DImode
3311 && !TARGET_64BIT))
3312 return false;
3313
5792208f
JJ
3314 if (GET_MODE (op0) == BFmode
3315 && !ix86_fp_comparison_operator (operands[1], VOIDmode))
3316 return false;
3317
2bf6d935
ML
3318 start_sequence ();
3319 compare_op = ix86_expand_compare (code, op0, op1);
3320 compare_seq = get_insns ();
3321 end_sequence ();
3322
3323 compare_code = GET_CODE (compare_op);
3324
3325 if ((op1 == const0_rtx && (code == GE || code == LT))
3326 || (op1 == constm1_rtx && (code == GT || code == LE)))
3327 sign_bit_compare_p = true;
3328
1ceddd74
JJ
3329 /* op0 == op1 ? op0 : op3 is equivalent to op0 == op1 ? op1 : op3,
3330 but if op1 is a constant, the latter form allows more optimizations,
3331 either through the last 2 ops being constant handling, or the one
 3332 constant and one variable cases. On the other hand, for cmov the
3333 former might be better as we don't need to load the constant into
3334 another register. */
3335 if (code == EQ && CONST_INT_P (op1) && rtx_equal_p (op0, op2))
3336 op2 = op1;
3337 /* Similarly for op0 != op1 ? op2 : op0 and op0 != op1 ? op2 : op1. */
3338 else if (code == NE && CONST_INT_P (op1) && rtx_equal_p (op0, op3))
3339 op3 = op1;
3340
2bf6d935
ML
3341 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
3342 HImode insns, we'd be swallowed in word prefix ops. */
3343
3344 if ((mode != HImode || TARGET_FAST_PREFIX)
3345 && (mode != (TARGET_64BIT ? TImode : DImode))
1ceddd74
JJ
3346 && CONST_INT_P (op2)
3347 && CONST_INT_P (op3))
2bf6d935
ML
3348 {
3349 rtx out = operands[0];
1ceddd74
JJ
3350 HOST_WIDE_INT ct = INTVAL (op2);
3351 HOST_WIDE_INT cf = INTVAL (op3);
2bf6d935
ML
3352 HOST_WIDE_INT diff;
3353
f1652e33
RS
3354 if ((mode == SImode
3355 || (TARGET_64BIT && mode == DImode))
3356 && (GET_MODE (op0) == SImode
3357 || (TARGET_64BIT && GET_MODE (op0) == DImode)))
3358 {
3359 /* Special case x != 0 ? -1 : y. */
3360 if (code == NE && op1 == const0_rtx && ct == -1)
3361 {
3362 negate_cc_compare_p = true;
3363 std::swap (ct, cf);
3364 code = EQ;
3365 }
3366 else if (code == EQ && op1 == const0_rtx && cf == -1)
3367 negate_cc_compare_p = true;
3368 }
3369
2bf6d935
ML
3370 diff = ct - cf;
3371 /* Sign bit compares are better done using shifts than we do by using
3372 sbb. */
3373 if (sign_bit_compare_p
f1652e33 3374 || negate_cc_compare_p
2bf6d935
ML
3375 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
3376 {
9e6ac747
RS
3377 /* Detect overlap between destination and compare sources. */
3378 rtx tmp = out;
2bf6d935 3379
f1652e33
RS
3380 if (negate_cc_compare_p)
3381 {
3382 if (GET_MODE (op0) == DImode)
3383 emit_insn (gen_x86_negdi_ccc (gen_reg_rtx (DImode), op0));
3384 else
3385 emit_insn (gen_x86_negsi_ccc (gen_reg_rtx (SImode),
3386 gen_lowpart (SImode, op0)));
3387
9e6ac747 3388 tmp = gen_reg_rtx (mode);
f1652e33
RS
3389 if (mode == DImode)
3390 emit_insn (gen_x86_movdicc_0_m1_neg (tmp));
3391 else
3392 emit_insn (gen_x86_movsicc_0_m1_neg (gen_lowpart (SImode,
3393 tmp)));
3394 }
3395 else if (!sign_bit_compare_p)
2bf6d935
ML
3396 {
3397 rtx flags;
3398 bool fpcmp = false;
3399
3400 compare_code = GET_CODE (compare_op);
3401
3402 flags = XEXP (compare_op, 0);
3403
3404 if (GET_MODE (flags) == CCFPmode)
3405 {
3406 fpcmp = true;
3407 compare_code
3408 = ix86_fp_compare_code_to_integer (compare_code);
3409 }
3410
3411 /* To simplify rest of code, restrict to the GEU case. */
3412 if (compare_code == LTU)
3413 {
3414 std::swap (ct, cf);
3415 compare_code = reverse_condition (compare_code);
3416 code = reverse_condition (code);
3417 }
3418 else
3419 {
3420 if (fpcmp)
3421 PUT_CODE (compare_op,
3422 reverse_condition_maybe_unordered
3423 (GET_CODE (compare_op)));
3424 else
3425 PUT_CODE (compare_op,
3426 reverse_condition (GET_CODE (compare_op)));
3427 }
3428 diff = ct - cf;
3429
9e6ac747
RS
3430 if (reg_overlap_mentioned_p (out, compare_op))
3431 tmp = gen_reg_rtx (mode);
3432
2bf6d935
ML
3433 if (mode == DImode)
3434 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
3435 else
3436 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
3437 flags, compare_op));
3438 }
3439 else
3440 {
3441 if (code == GT || code == GE)
3442 code = reverse_condition (code);
3443 else
3444 {
3445 std::swap (ct, cf);
3446 diff = ct - cf;
3447 }
3448 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
3449 }
3450
3451 if (diff == 1)
3452 {
3453 /*
3454 * cmpl op0,op1
3455 * sbbl dest,dest
3456 * [addl dest, ct]
3457 *
3458 * Size 5 - 8.
3459 */
3460 if (ct)
9e6ac747
RS
3461 tmp = expand_simple_binop (mode, PLUS,
3462 tmp, GEN_INT (ct),
3463 copy_rtx (tmp), 1, OPTAB_DIRECT);
2bf6d935
ML
3464 }
3465 else if (cf == -1)
3466 {
3467 /*
3468 * cmpl op0,op1
3469 * sbbl dest,dest
3470 * orl $ct, dest
3471 *
3472 * Size 8.
3473 */
9e6ac747
RS
3474 tmp = expand_simple_binop (mode, IOR,
3475 tmp, GEN_INT (ct),
3476 copy_rtx (tmp), 1, OPTAB_DIRECT);
2bf6d935
ML
3477 }
3478 else if (diff == -1 && ct)
3479 {
3480 /*
3481 * cmpl op0,op1
3482 * sbbl dest,dest
3483 * notl dest
3484 * [addl dest, cf]
3485 *
3486 * Size 8 - 11.
3487 */
9e6ac747 3488 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
2bf6d935 3489 if (cf)
9e6ac747
RS
3490 tmp = expand_simple_binop (mode, PLUS,
3491 copy_rtx (tmp), GEN_INT (cf),
3492 copy_rtx (tmp), 1, OPTAB_DIRECT);
2bf6d935
ML
3493 }
3494 else
3495 {
3496 /*
3497 * cmpl op0,op1
3498 * sbbl dest,dest
3499 * [notl dest]
3500 * andl cf - ct, dest
3501 * [addl dest, ct]
3502 *
3503 * Size 8 - 11.
3504 */
3505
3506 if (cf == 0)
3507 {
3508 cf = ct;
3509 ct = 0;
9e6ac747 3510 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
2bf6d935
ML
3511 }
3512
9e6ac747
RS
3513 tmp = expand_simple_binop (mode, AND,
3514 copy_rtx (tmp),
2bf6d935 3515 gen_int_mode (cf - ct, mode),
9e6ac747 3516 copy_rtx (tmp), 1, OPTAB_DIRECT);
2bf6d935 3517 if (ct)
9e6ac747
RS
3518 tmp = expand_simple_binop (mode, PLUS,
3519 copy_rtx (tmp), GEN_INT (ct),
3520 copy_rtx (tmp), 1, OPTAB_DIRECT);
2bf6d935
ML
3521 }
3522
9e6ac747
RS
3523 if (!rtx_equal_p (tmp, out))
3524 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
3525
2bf6d935
ML
3526 return true;
3527 }
3528
3529 if (diff < 0)
3530 {
3531 machine_mode cmp_mode = GET_MODE (op0);
3532 enum rtx_code new_code;
3533
3534 if (SCALAR_FLOAT_MODE_P (cmp_mode))
3535 {
3536 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3537
8f17461b
UB
3538 /* We may be reversing a non-trapping
3539 comparison to a trapping comparison. */
3540 if (HONOR_NANS (cmp_mode) && flag_trapping_math
3541 && code != EQ && code != NE
3542 && code != ORDERED && code != UNORDERED)
3543 new_code = UNKNOWN;
3544 else
3545 new_code = reverse_condition_maybe_unordered (code);
2bf6d935
ML
3546 }
3547 else
3548 new_code = ix86_reverse_condition (code, cmp_mode);
3549 if (new_code != UNKNOWN)
3550 {
3551 std::swap (ct, cf);
3552 diff = -diff;
3553 code = new_code;
3554 }
3555 }
3556
3557 compare_code = UNKNOWN;
3558 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
3559 && CONST_INT_P (op1))
3560 {
3561 if (op1 == const0_rtx
3562 && (code == LT || code == GE))
3563 compare_code = code;
3564 else if (op1 == constm1_rtx)
3565 {
3566 if (code == LE)
3567 compare_code = LT;
3568 else if (code == GT)
3569 compare_code = GE;
3570 }
3571 }
3572
3573 /* Optimize dest = (op0 < 0) ? -1 : cf. */
3574 if (compare_code != UNKNOWN
3575 && GET_MODE (op0) == GET_MODE (out)
3576 && (cf == -1 || ct == -1))
3577 {
3578 /* If lea code below could be used, only optimize
3579 if it results in a 2 insn sequence. */
3580
3581 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
3582 || diff == 3 || diff == 5 || diff == 9)
3583 || (compare_code == LT && ct == -1)
3584 || (compare_code == GE && cf == -1))
3585 {
3586 /*
3587 * notl op1 (if necessary)
3588 * sarl $31, op1
3589 * orl cf, op1
3590 */
3591 if (ct != -1)
3592 {
3593 cf = ct;
3594 ct = -1;
3595 code = reverse_condition (code);
3596 }
3597
3598 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3599
3600 out = expand_simple_binop (mode, IOR,
3601 out, GEN_INT (cf),
3602 out, 1, OPTAB_DIRECT);
3603 if (out != operands[0])
3604 emit_move_insn (operands[0], out);
3605
3606 return true;
3607 }
3608 }
3609
3610
3611 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
3612 || diff == 3 || diff == 5 || diff == 9)
3613 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
3614 && (mode != DImode
3615 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
3616 {
3617 /*
3618 * xorl dest,dest
3619 * cmpl op1,op2
3620 * setcc dest
3621 * lea cf(dest*(ct-cf)),dest
3622 *
3623 * Size 14.
3624 *
3625 * This also catches the degenerate setcc-only case.
3626 */
3627
3628 rtx tmp;
3629 int nops;
3630
3631 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3632
3633 nops = 0;
3634 /* On x86_64 the lea instruction operates on Pmode, so we need
 3635 to get the arithmetic done in the proper mode to match. */
3636 if (diff == 1)
3637 tmp = copy_rtx (out);
3638 else
3639 {
3640 rtx out1;
3641 out1 = copy_rtx (out);
3642 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
3643 nops++;
3644 if (diff & 1)
3645 {
3646 tmp = gen_rtx_PLUS (mode, tmp, out1);
3647 nops++;
3648 }
3649 }
3650 if (cf != 0)
3651 {
c3185b64 3652 tmp = plus_constant (mode, tmp, cf);
2bf6d935
ML
3653 nops++;
3654 }
3655 if (!rtx_equal_p (tmp, out))
3656 {
3657 if (nops == 1)
3658 out = force_operand (tmp, copy_rtx (out));
3659 else
3660 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
3661 }
3662 if (!rtx_equal_p (out, operands[0]))
3663 emit_move_insn (operands[0], copy_rtx (out));
3664
3665 return true;
3666 }
3667
3668 /*
3669 * General case: Jumpful:
3670 * xorl dest,dest cmpl op1, op2
3671 * cmpl op1, op2 movl ct, dest
3672 * setcc dest jcc 1f
3673 * decl dest movl cf, dest
3674 * andl (cf-ct),dest 1:
3675 * addl ct,dest
3676 *
3677 * Size 20. Size 14.
3678 *
3679 * This is reasonably steep, but branch mispredict costs are
3680 * high on modern cpus, so consider failing only if optimizing
3681 * for space.
3682 */
3683
3684 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3685 && BRANCH_COST (optimize_insn_for_speed_p (),
3686 false) >= 2)
3687 {
3688 if (cf == 0)
3689 {
3690 machine_mode cmp_mode = GET_MODE (op0);
3691 enum rtx_code new_code;
3692
3693 if (SCALAR_FLOAT_MODE_P (cmp_mode))
3694 {
3695 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3696
8f17461b
UB
3697 /* We may be reversing a non-trapping
3698 comparison to a trapping comparison. */
3699 if (HONOR_NANS (cmp_mode) && flag_trapping_math
3700 && code != EQ && code != NE
3701 && code != ORDERED && code != UNORDERED)
3702 new_code = UNKNOWN;
3703 else
3704 new_code = reverse_condition_maybe_unordered (code);
3705
2bf6d935
ML
3706 }
3707 else
3708 {
3709 new_code = ix86_reverse_condition (code, cmp_mode);
3710 if (compare_code != UNKNOWN && new_code != UNKNOWN)
3711 compare_code = reverse_condition (compare_code);
3712 }
3713
3714 if (new_code != UNKNOWN)
3715 {
3716 cf = ct;
3717 ct = 0;
3718 code = new_code;
3719 }
3720 }
3721
3722 if (compare_code != UNKNOWN)
3723 {
3724 /* notl op1 (if needed)
3725 sarl $31, op1
3726 andl (cf-ct), op1
3727 addl ct, op1
3728
3729 For x < 0 (resp. x <= -1) there will be no notl,
3730 so if possible swap the constants to get rid of the
3731 complement.
3732 True/false will be -1/0 while code below (store flag
3733 followed by decrement) is 0/-1, so the constants need
3734 to be exchanged once more. */
3735
3736 if (compare_code == GE || !cf)
3737 {
3738 code = reverse_condition (code);
3739 compare_code = LT;
3740 }
3741 else
3742 std::swap (ct, cf);
3743
3744 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3745 }
3746 else
3747 {
3748 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3749
3750 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
3751 constm1_rtx,
3752 copy_rtx (out), 1, OPTAB_DIRECT);
3753 }
3754
3755 out = expand_simple_binop (mode, AND, copy_rtx (out),
3756 gen_int_mode (cf - ct, mode),
3757 copy_rtx (out), 1, OPTAB_DIRECT);
3758 if (ct)
3759 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
3760 copy_rtx (out), 1, OPTAB_DIRECT);
3761 if (!rtx_equal_p (out, operands[0]))
3762 emit_move_insn (operands[0], copy_rtx (out));
3763
3764 return true;
3765 }
3766 }
3767
3768 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3769 {
3770 /* Try a few things more with specific constants and a variable. */
3771
3772 optab op;
3773 rtx var, orig_out, out, tmp;
3774
3775 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
3776 return false;
3777
1ceddd74
JJ
3778 operands[2] = op2;
3779 operands[3] = op3;
3780
2bf6d935
ML
3781 /* If one of the two operands is an interesting constant, load a
3782 constant with the above and mask it in with a logical operation. */
3783
3784 if (CONST_INT_P (operands[2]))
3785 {
3786 var = operands[3];
3787 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
3788 operands[3] = constm1_rtx, op = and_optab;
3789 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
3790 operands[3] = const0_rtx, op = ior_optab;
3791 else
3792 return false;
3793 }
3794 else if (CONST_INT_P (operands[3]))
3795 {
3796 var = operands[2];
3797 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
e4ced0b6
RS
3798 {
3799 /* For smin (x, 0), expand as "x < 0 ? x : 0" instead of
3800 "x <= 0 ? x : 0" to enable sign_bit_compare_p. */
3801 if (code == LE && op1 == const0_rtx && rtx_equal_p (op0, var))
3802 operands[1] = simplify_gen_relational (LT, VOIDmode,
3803 GET_MODE (op0),
3804 op0, const0_rtx);
3805
3806 operands[2] = constm1_rtx;
3807 op = and_optab;
3808 }
2bf6d935
ML
3809 else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
3810 operands[2] = const0_rtx, op = ior_optab;
3811 else
3812 return false;
3813 }
3814 else
3815 return false;
3816
3817 orig_out = operands[0];
3818 tmp = gen_reg_rtx (mode);
3819 operands[0] = tmp;
3820
3821 /* Recurse to get the constant loaded. */
3822 if (!ix86_expand_int_movcc (operands))
3823 return false;
3824
3825 /* Mask in the interesting variable. */
3826 out = expand_binop (mode, op, var, tmp, orig_out, 0,
3827 OPTAB_WIDEN);
3828 if (!rtx_equal_p (out, orig_out))
3829 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
3830
3831 return true;
3832 }
3833
3834 /*
3835 * For comparison with above,
3836 *
3837 * movl cf,dest
3838 * movl ct,tmp
3839 * cmpl op1,op2
3840 * cmovcc tmp,dest
3841 *
3842 * Size 15.
3843 */
3844
3845 if (! nonimmediate_operand (operands[2], mode))
3846 operands[2] = force_reg (mode, operands[2]);
3847 if (! nonimmediate_operand (operands[3], mode))
3848 operands[3] = force_reg (mode, operands[3]);
3849
3850 if (! register_operand (operands[2], VOIDmode)
3851 && (mode == QImode
3852 || ! register_operand (operands[3], VOIDmode)))
3853 operands[2] = force_reg (mode, operands[2]);
3854
3855 if (mode == QImode
3856 && ! register_operand (operands[3], VOIDmode))
3857 operands[3] = force_reg (mode, operands[3]);
3858
3859 emit_insn (compare_seq);
3860 emit_insn (gen_rtx_SET (operands[0],
3861 gen_rtx_IF_THEN_ELSE (mode,
3862 compare_op, operands[2],
3863 operands[3])));
3864 return true;
3865}
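/* [Editor's illustrative sketch -- not part of i386-expand.cc.]  The
   branchless sbb/and/add pattern used above, as scalar C (hypothetical
   helper name, ignoring overflow of CT - CF): the compare sets the carry,
   sbb materializes 0 or -1, and masking the difference of the two
   constants selects between them.  */
static int
sketch_branchless_select (bool cond, int ct, int cf)
{
  int mask = -(int) cond;	     /* all-ones when COND holds */
  return (mask & (ct - cf)) + cf;    /* cond ? ct : cf */
}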
3866
3867/* Detect conditional moves that exactly match min/max operational
3868 semantics. Note that this is IEEE safe, as long as we don't
3869 interchange the operands.
3870
3871 Returns FALSE if this conditional move doesn't match a MIN/MAX,
3872 and TRUE if the operation is successful and instructions are emitted. */
3873
3874static bool
3875ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
3876 rtx cmp_op1, rtx if_true, rtx if_false)
3877{
3878 machine_mode mode;
3879 bool is_min;
3880 rtx tmp;
3881
3882 if (code == LT)
3883 ;
3884 else if (code == UNGE)
3885 std::swap (if_true, if_false);
3886 else
3887 return false;
3888
3889 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
3890 is_min = true;
3891 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
3892 is_min = false;
3893 else
3894 return false;
3895
3896 mode = GET_MODE (dest);
3897
3898 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
3899 but MODE may be a vector mode and thus not appropriate. */
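  /* SSE min/max instructions are not symmetric in their operands: e.g.
     minss returns the second operand when the operands are unordered or
     are zeros of opposite sign, so the operand order chosen above must be
     preserved; the UNSPEC below prevents the operands from being
     interchanged.  */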
3900 if (!flag_finite_math_only || flag_signed_zeros)
3901 {
3902 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
3903 rtvec v;
3904
3905 if_true = force_reg (mode, if_true);
3906 v = gen_rtvec (2, if_true, if_false);
3907 tmp = gen_rtx_UNSPEC (mode, v, u);
3908 }
3909 else
3910 {
3911 code = is_min ? SMIN : SMAX;
3912 if (MEM_P (if_true) && MEM_P (if_false))
3913 if_true = force_reg (mode, if_true);
3914 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
3915 }
3916
3917 emit_insn (gen_rtx_SET (dest, tmp));
3918 return true;
3919}
3920
3921/* Return true if MODE is valid for a vector compare to a mask register;
3922 the same holds for a conditional vector move with a mask register. */
3923static bool
3924ix86_valid_mask_cmp_mode (machine_mode mode)
3925{
3926 /* XOP has its own vector conditional movement. */
a8654147 3927 if (TARGET_XOP && !TARGET_AVX512F)
3928 return false;
3929
3930 /* HFmode only supports vcmpsh, whose destination is a mask register. */
3931 if (TARGET_AVX512FP16 && mode == HFmode)
3932 return true;
3933
3934 /* AVX512F is needed for mask operation. */
3935 if (!(TARGET_AVX512F && VECTOR_MODE_P (mode)))
3936 return false;
3937
3938 /* AVX512BW is needed for vector QI/HImode,
3939 AVX512VL is needed for 128/256-bit vector. */
3940 machine_mode inner_mode = GET_MODE_INNER (mode);
3941 int vector_size = GET_MODE_SIZE (mode);
3942 if ((inner_mode == QImode || inner_mode == HImode) && !TARGET_AVX512BW)
3943 return false;
3944
3945 return vector_size == 64 || TARGET_AVX512VL;
3946}
3947
8d0737d8 3948/* Return true if integer mask comparison should be used. */
3949static bool
3950ix86_use_mask_cmp_p (machine_mode mode, machine_mode cmp_mode,
3951 rtx op_true, rtx op_false)
3952{
3953 int vector_size = GET_MODE_SIZE (mode);
3954
0d788c35 3955 if (cmp_mode == HFmode)
3956 return true;
3957 else if (vector_size < 16)
3958 return false;
3959 else if (vector_size == 64)
8d0737d8 3960 return true;
3961 else if (GET_MODE_INNER (cmp_mode) == HFmode)
3962 return true;
8d0737d8 3963
3964 /* When op_true is NULL, op_false must be NULL, or vice versa. */
3965 gcc_assert (!op_true == !op_false);
3966
3967 /* When op_true/op_false is NULL or cmp_mode is not valid mask cmp mode,
3968 vector dest is required. */
3969 if (!op_true || !ix86_valid_mask_cmp_mode (cmp_mode))
3970 return false;
3971
3972 /* Exclude those that could be optimized in ix86_expand_sse_movcc. */
3973 if (op_false == CONST0_RTX (mode)
3974 || op_true == CONST0_RTX (mode)
3975 || (INTEGRAL_MODE_P (mode)
3976 && (op_true == CONSTM1_RTX (mode)
3977 || op_false == CONSTM1_RTX (mode))))
3978 return false;
3979
3980 return true;
3981}
3982
3983/* Expand an SSE comparison. Return the register with the result. */
3984
3985static rtx
3986ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
3987 rtx op_true, rtx op_false)
3988{
3989 machine_mode mode = GET_MODE (dest);
3990 machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
3991
3992 /* In the general case the result of the comparison can differ from the operands' type. */
3993 machine_mode cmp_mode;
3994
3995 /* In AVX512F the result of comparison is an integer mask. */
3996 bool maskcmp = false;
3997 rtx x;
3998
8d0737d8 3999 if (ix86_use_mask_cmp_p (mode, cmp_ops_mode, op_true, op_false))
4000 {
4001 unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
2bf6d935 4002 maskcmp = true;
8b905e9b 4003 cmp_mode = nbits > 8 ? int_mode_for_size (nbits, 0).require () : E_QImode;
4004 }
4005 else
4006 cmp_mode = cmp_ops_mode;
4007
4008 cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
4009
a86b3453 4010 bool (*op1_predicate)(rtx, machine_mode)
4011 = VECTOR_MODE_P (cmp_ops_mode) ? vector_operand : nonimmediate_operand;
4012
4013 if (!op1_predicate (cmp_op1, cmp_ops_mode))
4014 cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
4015
4016 if (optimize
4017 || (maskcmp && cmp_mode != mode)
4018 || (op_true && reg_overlap_mentioned_p (dest, op_true))
4019 || (op_false && reg_overlap_mentioned_p (dest, op_false)))
4020 dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
4021
99e4891e 4022 if (maskcmp)
4023 {
4024 bool ok = ix86_expand_mask_vec_cmp (dest, code, cmp_op0, cmp_op1);
4025 gcc_assert (ok);
4026 return dest;
4027 }
4028
4029 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
4030
8d0737d8 4031 if (cmp_mode != mode)
4032 {
4033 x = force_reg (cmp_ops_mode, x);
4034 convert_move (dest, x, false);
4035 }
4036 else
4037 emit_insn (gen_rtx_SET (dest, x));
4038
4039 return dest;
4040}
4041
4042/* Emit x86 binary operand CODE in mode MODE for SSE vector
4043 instructions that can be performed using GP registers. */
4044
4045static void
4046ix86_emit_vec_binop (enum rtx_code code, machine_mode mode,
4047 rtx dst, rtx src1, rtx src2)
4048{
4049 rtx tmp;
4050
4051 tmp = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
4052
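  /* Assumed rationale: the narrow integer vector patterns may also be
     carried out in general-purpose registers, where the arithmetic
     clobbers the flags, so a FLAGS_REG clobber is attached to keep the
     emitted pattern recognizable.  */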
4053 if (GET_MODE_SIZE (mode) <= GET_MODE_SIZE (SImode)
4054 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
4055 {
4056 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
4057 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
4058 }
4059
4060 emit_insn (tmp);
4061}
4062
4063/* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
4064 operations. This is used for both scalar and vector conditional moves. */
4065
4066void
4067ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
4068{
4069 machine_mode mode = GET_MODE (dest);
4070 machine_mode cmpmode = GET_MODE (cmp);
f4a2cecd 4071 rtx x;
2bf6d935 4072
9b5d50b7 4073 /* Simplify trivial VEC_COND_EXPR to avoid ICE in pr97506. */
4074 if (rtx_equal_p (op_true, op_false))
4075 {
4076 emit_move_insn (dest, op_true);
4077 return;
4078 }
4079
4080 /* If we have an integer mask and FP value then we need
4081 to cast mask to FP mode. */
4082 if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
4083 {
4084 cmp = force_reg (cmpmode, cmp);
4085 cmp = gen_rtx_SUBREG (mode, cmp, 0);
4086 }
4087
8d0737d8 4088 /* In AVX512F the result of comparison is an integer mask. */
4089 if (mode != cmpmode
4090 && GET_MODE_CLASS (cmpmode) == MODE_INT)
2bf6d935 4091 {
8d0737d8 4092 gcc_assert (ix86_valid_mask_cmp_mode (mode));
0d788c35 4093 /* Using scalar/vector move with mask register. */
4094 cmp = force_reg (cmpmode, cmp);
4095 /* Optimize for mask zero. */
4096 op_true = (op_true != CONST0_RTX (mode)
4097 ? force_reg (mode, op_true) : op_true);
4098 op_false = (op_false != CONST0_RTX (mode)
4099 ? force_reg (mode, op_false) : op_false);
4100 if (op_true == CONST0_RTX (mode))
2bf6d935 4101 {
ee78c20e 4102 if (cmpmode == E_DImode && !TARGET_64BIT)
4103 {
4104 x = gen_reg_rtx (cmpmode);
4105 emit_insn (gen_knotdi (x, cmp));
4106 }
ee78c20e 4107 else
4108 x = expand_simple_unop (cmpmode, NOT, cmp, NULL, 1);
4109 cmp = x;
4110 /* Swap op_true and op_false. */
4111 std::swap (op_true, op_false);
2bf6d935 4112 }
8b905e9b 4113
0d788c35 4114 if (mode == HFmode)
4115 emit_insn (gen_movhf_mask (dest, op_true, op_false, cmp));
4116 else
4117 emit_insn (gen_rtx_SET (dest,
4118 gen_rtx_VEC_MERGE (mode,
4119 op_true, op_false, cmp)));
8b905e9b 4120 return;
2bf6d935 4121 }
4122
4123 if (vector_all_ones_operand (op_true, mode)
4124 && op_false == CONST0_RTX (mode))
2bf6d935 4125 {
f4a2cecd 4126 emit_move_insn (dest, cmp);
4127 return;
4128 }
4129 else if (op_false == CONST0_RTX (mode))
4130 {
4131 x = expand_simple_binop (mode, AND, cmp, op_true,
4132 dest, 1, OPTAB_DIRECT);
4133 if (x != dest)
4134 emit_move_insn (dest, x);
4135 return;
4136 }
4137 else if (op_true == CONST0_RTX (mode))
4138 {
4139 op_false = force_reg (mode, op_false);
4140 x = gen_rtx_NOT (mode, cmp);
b5193e35 4141 ix86_emit_vec_binop (AND, mode, dest, x, op_false);
4142 return;
4143 }
f4a2cecd 4144 else if (vector_all_ones_operand (op_true, mode))
2bf6d935 4145 {
4146 x = expand_simple_binop (mode, IOR, cmp, op_false,
4147 dest, 1, OPTAB_DIRECT);
4148 if (x != dest)
4149 emit_move_insn (dest, x);
4150 return;
4151 }
4152
4153 if (TARGET_XOP)
4154 {
4155 op_true = force_reg (mode, op_true);
4156
4157 if (GET_MODE_SIZE (mode) < 16
4158 || !nonimmediate_operand (op_false, mode))
4159 op_false = force_reg (mode, op_false);
4160
4161 emit_insn (gen_rtx_SET (dest,
4162 gen_rtx_IF_THEN_ELSE (mode, cmp,
4163 op_true, op_false)));
4164 return;
4165 }
4166
4167 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
f4a2cecd 4168 machine_mode blend_mode = mode;
2bf6d935 4169
4170 if (GET_MODE_SIZE (mode) < 16
4171 || !vector_operand (op_true, mode))
4172 op_true = force_reg (mode, op_true);
4173
4174 op_false = force_reg (mode, op_false);
4175
4176 switch (mode)
4177 {
4178 case E_V2SFmode:
4179 if (TARGET_SSE4_1)
f4a2cecd 4180 gen = gen_mmx_blendvps;
b1f7fd8a 4181 break;
4182 case E_V4SFmode:
4183 if (TARGET_SSE4_1)
4184 gen = gen_sse4_1_blendvps;
4185 break;
4186 case E_V2DFmode:
4187 if (TARGET_SSE4_1)
4188 gen = gen_sse4_1_blendvpd;
4189 break;
4190 case E_SFmode:
4191 if (TARGET_SSE4_1)
f4a2cecd 4192 gen = gen_sse4_1_blendvss;
4193 break;
4194 case E_DFmode:
4195 if (TARGET_SSE4_1)
f4a2cecd 4196 gen = gen_sse4_1_blendvsd;
2bf6d935 4197 break;
4198 case E_V8QImode:
4199 case E_V4HImode:
4200 case E_V2SImode:
4201 if (TARGET_SSE4_1)
4202 {
820ac79e 4203 gen = gen_mmx_pblendvb_v8qi;
f4a2cecd 4204 blend_mode = V8QImode;
4205 }
4206 break;
4207 case E_V4QImode:
4208 case E_V2HImode:
4209 if (TARGET_SSE4_1)
4210 {
820ac79e 4211 gen = gen_mmx_pblendvb_v4qi;
f4a2cecd 4212 blend_mode = V4QImode;
4213 }
4214 break;
4215 case E_V2QImode:
4216 if (TARGET_SSE4_1)
f4a2cecd 4217 gen = gen_mmx_pblendvb_v2qi;
820ac79e 4218 break;
4219 case E_V16QImode:
4220 case E_V8HImode:
9e2a82e1 4221 case E_V8HFmode:
6910cad5 4222 case E_V8BFmode:
4223 case E_V4SImode:
4224 case E_V2DImode:
793f847b 4225 case E_V1TImode:
4226 if (TARGET_SSE4_1)
4227 {
4228 gen = gen_sse4_1_pblendvb;
f4a2cecd 4229 blend_mode = V16QImode;
4230 }
4231 break;
4232 case E_V8SFmode:
4233 if (TARGET_AVX)
4234 gen = gen_avx_blendvps256;
4235 break;
4236 case E_V4DFmode:
4237 if (TARGET_AVX)
4238 gen = gen_avx_blendvpd256;
4239 break;
4240 case E_V32QImode:
4241 case E_V16HImode:
9e2a82e1 4242 case E_V16HFmode:
6910cad5 4243 case E_V16BFmode:
4244 case E_V8SImode:
4245 case E_V4DImode:
4246 if (TARGET_AVX2)
4247 {
4248 gen = gen_avx2_pblendvb;
f4a2cecd 4249 blend_mode = V32QImode;
4250 }
4251 break;
4252
4253 case E_V64QImode:
4254 gen = gen_avx512bw_blendmv64qi;
4255 break;
4256 case E_V32HImode:
4257 gen = gen_avx512bw_blendmv32hi;
4258 break;
9e2a82e1 4259 case E_V32HFmode:
4260 gen = gen_avx512bw_blendmv32hf;
4261 break;
6910cad5 4262 case E_V32BFmode:
4263 gen = gen_avx512bw_blendmv32bf;
4264 break;
4265 case E_V16SImode:
4266 gen = gen_avx512f_blendmv16si;
4267 break;
4268 case E_V8DImode:
4269 gen = gen_avx512f_blendmv8di;
4270 break;
4271 case E_V8DFmode:
4272 gen = gen_avx512f_blendmv8df;
4273 break;
4274 case E_V16SFmode:
4275 gen = gen_avx512f_blendmv16sf;
4276 break;
4277
4278 default:
4279 break;
4280 }
4281
4282 if (gen != NULL)
4283 {
4284 if (blend_mode == mode)
4285 x = dest;
4286 else
4287 {
4288 x = gen_reg_rtx (blend_mode);
4289 op_false = gen_lowpart (blend_mode, op_false);
4290 op_true = gen_lowpart (blend_mode, op_true);
4291 cmp = gen_lowpart (blend_mode, cmp);
4292 }
4293
4294 emit_insn (gen (x, op_false, op_true, cmp));
4295
4296 if (x != dest)
4297 emit_move_insn (dest, gen_lowpart (mode, x));
4298 }
4299 else
4300 {
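      /* No blend instruction is available; fall back to
         dest = (cmp & op_true) | (~cmp & op_false).  */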
f4a2cecd 4301 rtx t2, t3;
2bf6d935 4302
4303 t2 = expand_simple_binop (mode, AND, op_true, cmp,
4304 NULL, 1, OPTAB_DIRECT);
2bf6d935 4305
f4a2cecd 4306 t3 = gen_reg_rtx (mode);
2bf6d935 4307 x = gen_rtx_NOT (mode, cmp);
b5193e35 4308 ix86_emit_vec_binop (AND, mode, t3, x, op_false);
2bf6d935 4309
4310 x = expand_simple_binop (mode, IOR, t3, t2,
4311 dest, 1, OPTAB_DIRECT);
4312 if (x != dest)
4313 emit_move_insn (dest, x);
4314 }
4315}
4316
4317/* Swap, force into registers, or otherwise massage the two operands
4318 to an sse comparison with a mask result. Thus we differ a bit from
4319 ix86_prepare_fp_compare_args which expects to produce a flags result.
4320
4321 The DEST operand exists to help determine whether to commute commutative
4322 operators. The POP0/POP1 operands are updated in place. The new
4323 comparison code is returned, or UNKNOWN if not implementable. */
4324
4325static enum rtx_code
4326ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
4327 rtx *pop0, rtx *pop1)
4328{
4329 switch (code)
4330 {
4331 case LTGT:
4332 case UNEQ:
4333 /* AVX supports all the needed comparisons. */
4334 if (TARGET_AVX)
4335 break;
4336 /* We have no LTGT as an operator. We could implement it with
4337 NE & ORDERED, but this requires an extra temporary. It's
4338 not clear that it's worth it. */
4339 return UNKNOWN;
4340
4341 case LT:
4342 case LE:
4343 case UNGT:
4344 case UNGE:
4345 /* These are supported directly. */
4346 break;
4347
4348 case EQ:
4349 case NE:
4350 case UNORDERED:
4351 case ORDERED:
4352 /* AVX has 3 operand comparisons, no need to swap anything. */
4353 if (TARGET_AVX)
4354 break;
4355 /* For commutative operators, try to canonicalize the destination
4356 operand to be first in the comparison - this helps reload to
4357 avoid extra moves. */
4358 if (!dest || !rtx_equal_p (dest, *pop1))
4359 break;
4360 /* FALLTHRU */
4361
4362 case GE:
4363 case GT:
4364 case UNLE:
4365 case UNLT:
4366 /* These are not supported directly before AVX, and furthermore
4367 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
4368 comparison operands to transform into something that is
4369 supported. */
4370 std::swap (*pop0, *pop1);
4371 code = swap_condition (code);
4372 break;
4373
4374 default:
4375 gcc_unreachable ();
4376 }
4377
4378 return code;
4379}
4380
4381/* Expand a floating-point conditional move. Return true if successful. */
4382
4383bool
4384ix86_expand_fp_movcc (rtx operands[])
4385{
4386 machine_mode mode = GET_MODE (operands[0]);
4387 enum rtx_code code = GET_CODE (operands[1]);
4388 rtx tmp, compare_op;
4389 rtx op0 = XEXP (operands[1], 0);
4390 rtx op1 = XEXP (operands[1], 1);
4391
4392 if (GET_MODE (op0) == BFmode
4393 && !ix86_fp_comparison_operator (operands[1], VOIDmode))
4394 return false;
4395
a6841211 4396 if (SSE_FLOAT_MODE_SSEMATH_OR_HF_P (mode))
4397 {
4398 machine_mode cmode;
4399
4400 /* Since we've no cmove for sse registers, don't force bad register
4401 allocation just to gain access to it. Deny movcc when the
4402 comparison mode doesn't match the move mode. */
4403 cmode = GET_MODE (op0);
4404 if (cmode == VOIDmode)
4405 cmode = GET_MODE (op1);
4406 if (cmode != mode)
4407 return false;
4408
4409 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
4410 if (code == UNKNOWN)
4411 return false;
4412
4413 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
4414 operands[2], operands[3]))
4415 return true;
4416
4417 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
4418 operands[2], operands[3]);
4419 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
4420 return true;
4421 }
4422
4423 if (GET_MODE (op0) == TImode
4424 || (GET_MODE (op0) == DImode
4425 && !TARGET_64BIT))
4426 return false;
4427
4428 /* The floating point conditional move instructions don't directly
4429 support conditions resulting from a signed integer comparison. */
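  /* In that case the condition is first materialized with setcc into a
     QImode register and the conditional move then tests that register
     against zero, which is a condition fcmov does understand.  */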
4430
4431 compare_op = ix86_expand_compare (code, op0, op1);
4432 if (!fcmov_comparison_operator (compare_op, VOIDmode))
4433 {
4434 tmp = gen_reg_rtx (QImode);
4435 ix86_expand_setcc (tmp, code, op0, op1);
4436
4437 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
4438 }
4439
4440 emit_insn (gen_rtx_SET (operands[0],
4441 gen_rtx_IF_THEN_ELSE (mode, compare_op,
4442 operands[2], operands[3])));
4443
4444 return true;
4445}
4446
4447/* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
4448
4449static int
4450ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
4451{
4452 switch (code)
4453 {
4454 case EQ:
4455 return 0;
4456 case LT:
4457 case LTU:
4458 return 1;
4459 case LE:
4460 case LEU:
4461 return 2;
4462 case NE:
4463 return 4;
4464 case GE:
4465 case GEU:
4466 return 5;
4467 case GT:
4468 case GTU:
4469 return 6;
4470 default:
4471 gcc_unreachable ();
4472 }
4473}
4474
4475/* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
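/* The values returned below are the vcmpps/vcmppd predicate immediates,
   e.g. 0x00 is EQ_OQ, 0x01 LT_OS, 0x03 UNORD_Q, 0x04 NEQ_UQ and 0x18 EQ_US
   (an informal cross-reference, not an exhaustive listing).  */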
4476
4477static int
4478ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
4479{
4480 switch (code)
4481 {
4482 case EQ:
4483 return 0x00;
4484 case NE:
4485 return 0x04;
4486 case GT:
4487 return 0x0e;
4488 case LE:
4489 return 0x02;
4490 case GE:
4491 return 0x0d;
4492 case LT:
4493 return 0x01;
4494 case UNLE:
4495 return 0x0a;
4496 case UNLT:
4497 return 0x09;
4498 case UNGE:
4499 return 0x05;
4500 case UNGT:
4501 return 0x06;
4502 case UNEQ:
4503 return 0x18;
4504 case LTGT:
4505 return 0x0c;
4506 case ORDERED:
4507 return 0x07;
4508 case UNORDERED:
4509 return 0x03;
4510 default:
4511 gcc_unreachable ();
4512 }
4513}
4514
4515/* Return immediate value to be used in UNSPEC_PCMP
4516 for comparison CODE in MODE. */
4517
4518static int
4519ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
4520{
4521 if (FLOAT_MODE_P (mode))
4522 return ix86_fp_cmp_code_to_pcmp_immediate (code);
4523 return ix86_int_cmp_code_to_pcmp_immediate (code);
4524}
4525
4526/* Expand AVX-512 vector comparison. */
4527
4528bool
99e4891e 4529ix86_expand_mask_vec_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1)
2bf6d935 4530{
99e4891e 4531 machine_mode mask_mode = GET_MODE (dest);
4532 machine_mode cmp_mode = GET_MODE (cmp_op0);
4533 rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
4534 int unspec_code;
4535 rtx unspec;
4536
4537 switch (code)
4538 {
4539 case LEU:
4540 case GTU:
4541 case GEU:
4542 case LTU:
4543 unspec_code = UNSPEC_UNSIGNED_PCMP;
4544 break;
4545
4546 default:
4547 unspec_code = UNSPEC_PCMP;
4548 }
4549
99e4891e 4550 unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, cmp_op0, cmp_op1, imm),
2bf6d935 4551 unspec_code);
99e4891e 4552 emit_insn (gen_rtx_SET (dest, unspec));
4553
4554 return true;
4555}
4556
4557/* Expand fp vector comparison. */
4558
4559bool
4560ix86_expand_fp_vec_cmp (rtx operands[])
4561{
4562 enum rtx_code code = GET_CODE (operands[1]);
4563 rtx cmp;
4564
4565 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
4566 &operands[2], &operands[3]);
4567 if (code == UNKNOWN)
4568 {
4569 rtx temp;
4570 switch (GET_CODE (operands[1]))
4571 {
4572 case LTGT:
4573 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
4574 operands[3], NULL, NULL);
4575 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
4576 operands[3], NULL, NULL);
4577 code = AND;
4578 break;
4579 case UNEQ:
4580 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
4581 operands[3], NULL, NULL);
4582 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
4583 operands[3], NULL, NULL);
4584 code = IOR;
4585 break;
4586 default:
4587 gcc_unreachable ();
4588 }
4589 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
4590 OPTAB_DIRECT);
4591 }
4592 else
4593 cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
8d0737d8 4594 NULL, NULL);
4595
4596 if (operands[0] != cmp)
4597 emit_move_insn (operands[0], cmp);
4598
4599 return true;
4600}
4601
4602static rtx
4603ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
4604 rtx op_true, rtx op_false, bool *negate)
4605{
4606 machine_mode data_mode = GET_MODE (dest);
4607 machine_mode mode = GET_MODE (cop0);
4608 rtx x;
4609
4610 *negate = false;
4611
4612 /* XOP supports all of the comparisons on all 128-bit vector int types. */
4613 if (TARGET_XOP
4614 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT
4615 && GET_MODE_SIZE (mode) <= 16)
2bf6d935 4616 ;
4617 /* AVX512F supports all of the comparisons
4618 on all 128/256/512-bit vector int types. */
8d0737d8 4619 else if (ix86_use_mask_cmp_p (data_mode, mode, op_true, op_false))
8b905e9b 4620 ;
4621 else
4622 {
4623 /* Canonicalize the comparison to EQ, GT, GTU. */
4624 switch (code)
4625 {
4626 case EQ:
4627 case GT:
4628 case GTU:
4629 break;
4630
4631 case LE:
4632 case LEU:
4633 /* x <= cst can be handled as x < cst + 1 unless there is
4634 wrap around in cst + 1. */
4635 if (GET_CODE (cop1) == CONST_VECTOR
4636 && GET_MODE_INNER (mode) != TImode)
4637 {
4638 unsigned int n_elts = GET_MODE_NUNITS (mode), i;
4639 machine_mode eltmode = GET_MODE_INNER (mode);
4640 for (i = 0; i < n_elts; ++i)
4641 {
4642 rtx elt = CONST_VECTOR_ELT (cop1, i);
4643 if (!CONST_INT_P (elt))
4644 break;
4645 if (code == GE)
4646 {
4647 /* For LE punt if some element is signed maximum. */
4648 if ((INTVAL (elt) & (GET_MODE_MASK (eltmode) >> 1))
4649 == (GET_MODE_MASK (eltmode) >> 1))
4650 break;
4651 }
4652 /* For LEU punt if some element is unsigned maximum. */
4653 else if (elt == constm1_rtx)
4654 break;
4655 }
4656 if (i == n_elts)
4657 {
4658 rtvec v = rtvec_alloc (n_elts);
4659 for (i = 0; i < n_elts; ++i)
4660 RTVEC_ELT (v, i)
4661 = gen_int_mode (INTVAL (CONST_VECTOR_ELT (cop1, i)) + 1,
4662 eltmode);
4663 cop1 = gen_rtx_CONST_VECTOR (mode, v);
4664 std::swap (cop0, cop1);
4665 code = code == LE ? GT : GTU;
4666 break;
4667 }
4668 }
4669 /* FALLTHRU */
4670 case NE:
4671 code = reverse_condition (code);
4672 *negate = true;
4673 break;
4674
4675 case GE:
4676 case GEU:
4677 /* x >= cst can be handled as x > cst - 1 unless there is
4678 wrap around in cst - 1. */
4679 if (GET_CODE (cop1) == CONST_VECTOR
4680 && GET_MODE_INNER (mode) != TImode)
4681 {
4682 unsigned int n_elts = GET_MODE_NUNITS (mode), i;
4683 machine_mode eltmode = GET_MODE_INNER (mode);
4684 for (i = 0; i < n_elts; ++i)
4685 {
4686 rtx elt = CONST_VECTOR_ELT (cop1, i);
4687 if (!CONST_INT_P (elt))
4688 break;
4689 if (code == GE)
4690 {
4691 /* For GE punt if some element is signed minimum. */
4692 if (INTVAL (elt) < 0
4693 && ((INTVAL (elt) & (GET_MODE_MASK (eltmode) >> 1))
4694 == 0))
4695 break;
4696 }
4697 /* For GEU punt if some element is zero. */
4698 else if (elt == const0_rtx)
4699 break;
4700 }
4701 if (i == n_elts)
4702 {
4703 rtvec v = rtvec_alloc (n_elts);
4704 for (i = 0; i < n_elts; ++i)
4705 RTVEC_ELT (v, i)
4706 = gen_int_mode (INTVAL (CONST_VECTOR_ELT (cop1, i)) - 1,
4707 eltmode);
4708 cop1 = gen_rtx_CONST_VECTOR (mode, v);
4709 code = code == GE ? GT : GTU;
4710 break;
4711 }
4712 }
4713 code = reverse_condition (code);
4714 *negate = true;
4715 /* FALLTHRU */
4716
4717 case LT:
4718 case LTU:
4719 std::swap (cop0, cop1);
4720 code = swap_condition (code);
4721 break;
4722
4723 default:
4724 gcc_unreachable ();
4725 }
4726
4727 /* Only SSE4.1/SSE4.2 supports V2DImode. */
4728 if (mode == V2DImode)
4729 {
4730 switch (code)
4731 {
4732 case EQ:
4733 /* SSE4.1 supports EQ. */
4734 if (!TARGET_SSE4_1)
4735 return NULL;
4736 break;
4737
4738 case GT:
4739 case GTU:
4740 /* SSE4.2 supports GT/GTU. */
4741 if (!TARGET_SSE4_2)
4742 return NULL;
4743 break;
4744
4745 default:
4746 gcc_unreachable ();
4747 }
4748 }
4749
4750 if (GET_CODE (cop0) == CONST_VECTOR)
4751 cop0 = force_reg (mode, cop0);
4752 else if (GET_CODE (cop1) == CONST_VECTOR)
4753 cop1 = force_reg (mode, cop1);
4754
4755 rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode);
4756 rtx opfalse = op_false ? op_false : CONST0_RTX (data_mode);
4757 if (*negate)
4758 std::swap (optrue, opfalse);
4759
4760 /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when
4761 not using integer masks into min (x, y) == x ? -1 : 0 (i.e.
4762 min (x, y) == x). While we add one instruction (the minimum),
4763 we remove the need for two instructions in the negation, as the
4764 result is done this way.
4765 When using masks, do it for SI/DImode element types, as it is shorter
4766 than the two subtractions. */
4767 if ((code != EQ
4768 && GET_MODE_SIZE (mode) != 64
4769 && vector_all_ones_operand (opfalse, data_mode)
4770 && optrue == CONST0_RTX (data_mode))
4771 || (code == GTU
4772 && GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4
4773 /* Don't do it if not using integer masks and we'd end up with
4774 the right values in the registers though. */
4775 && (GET_MODE_SIZE (mode) == 64
4776 || !vector_all_ones_operand (optrue, data_mode)
4777 || opfalse != CONST0_RTX (data_mode))))
4778 {
4779 rtx (*gen) (rtx, rtx, rtx) = NULL;
4780
4781 switch (mode)
4782 {
4783 case E_V16SImode:
4784 gen = (code == GTU) ? gen_uminv16si3 : gen_sminv16si3;
4785 break;
4786 case E_V8DImode:
4787 gen = (code == GTU) ? gen_uminv8di3 : gen_sminv8di3;
4788 cop0 = force_reg (mode, cop0);
4789 cop1 = force_reg (mode, cop1);
4790 break;
4791 case E_V32QImode:
4792 if (TARGET_AVX2)
4793 gen = (code == GTU) ? gen_uminv32qi3 : gen_sminv32qi3;
4794 break;
4795 case E_V16HImode:
4796 if (TARGET_AVX2)
4797 gen = (code == GTU) ? gen_uminv16hi3 : gen_sminv16hi3;
4798 break;
4799 case E_V8SImode:
4800 if (TARGET_AVX2)
4801 gen = (code == GTU) ? gen_uminv8si3 : gen_sminv8si3;
4802 break;
4803 case E_V4DImode:
4804 if (TARGET_AVX512VL)
4805 {
4806 gen = (code == GTU) ? gen_uminv4di3 : gen_sminv4di3;
4807 cop0 = force_reg (mode, cop0);
4808 cop1 = force_reg (mode, cop1);
4809 }
4810 break;
4811 case E_V16QImode:
4812 if (code == GTU && TARGET_SSE2)
4813 gen = gen_uminv16qi3;
4814 else if (code == GT && TARGET_SSE4_1)
4815 gen = gen_sminv16qi3;
4816 break;
4817 case E_V8QImode:
4818 if (code == GTU && TARGET_SSE2)
4819 gen = gen_uminv8qi3;
4820 else if (code == GT && TARGET_SSE4_1)
4821 gen = gen_sminv8qi3;
4822 break;
4823 case E_V4QImode:
4824 if (code == GTU && TARGET_SSE2)
4825 gen = gen_uminv4qi3;
4826 else if (code == GT && TARGET_SSE4_1)
4827 gen = gen_sminv4qi3;
4828 break;
4829 case E_V2QImode:
4830 if (code == GTU && TARGET_SSE2)
4831 gen = gen_uminv2qi3;
4832 else if (code == GT && TARGET_SSE4_1)
4833 gen = gen_sminv2qi3;
4834 break;
4835 case E_V8HImode:
4836 if (code == GTU && TARGET_SSE4_1)
4837 gen = gen_uminv8hi3;
4838 else if (code == GT && TARGET_SSE2)
4839 gen = gen_sminv8hi3;
4840 break;
4841 case E_V4HImode:
4842 if (code == GTU && TARGET_SSE4_1)
4843 gen = gen_uminv4hi3;
4844 else if (code == GT && TARGET_SSE2)
4845 gen = gen_sminv4hi3;
4846 break;
4847 case E_V2HImode:
4848 if (code == GTU && TARGET_SSE4_1)
4849 gen = gen_uminv2hi3;
4850 else if (code == GT && TARGET_SSE2)
4851 gen = gen_sminv2hi3;
4852 break;
4853 case E_V4SImode:
4854 if (TARGET_SSE4_1)
4855 gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3;
4856 break;
4857 case E_V2SImode:
4858 if (TARGET_SSE4_1)
4859 gen = (code == GTU) ? gen_uminv2si3 : gen_sminv2si3;
4860 break;
4861 case E_V2DImode:
4862 if (TARGET_AVX512VL)
4863 {
4864 gen = (code == GTU) ? gen_uminv2di3 : gen_sminv2di3;
4865 cop0 = force_reg (mode, cop0);
4866 cop1 = force_reg (mode, cop1);
4867 }
4868 break;
4869 default:
4870 break;
4871 }
4872
4873 if (gen)
4874 {
4875 rtx tem = gen_reg_rtx (mode);
4876 if (!vector_operand (cop0, mode))
4877 cop0 = force_reg (mode, cop0);
4878 if (!vector_operand (cop1, mode))
4879 cop1 = force_reg (mode, cop1);
4880 *negate = !*negate;
4881 emit_insn (gen (tem, cop0, cop1));
4882 cop1 = tem;
4883 code = EQ;
4884 }
4885 }
4886
4887 /* Unsigned parallel compare is not supported by the hardware.
4888 Play some tricks to turn this into a signed comparison
4889 against 0. */
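      /* For 4- and 8-byte elements both operands have their sign bit
         flipped (by subtracting the most negative value) and a signed GT
         is used; for narrower elements x >u y is computed as
         (x -us y) != 0 using unsigned saturating subtraction.  */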
4890 if (code == GTU)
4891 {
4892 cop0 = force_reg (mode, cop0);
4893
4894 switch (mode)
4895 {
4896 case E_V16SImode:
4897 case E_V8DImode:
4898 case E_V8SImode:
4899 case E_V4DImode:
4900 case E_V4SImode:
f3661f2d 4901 case E_V2SImode:
4902 case E_V2DImode:
4903 {
4904 rtx t1, t2, mask;
83bc5e44 4905
4906 /* Subtract (-(INT MAX) - 1) from both operands to make
4907 them signed. */
4908 mask = ix86_build_signbit_mask (mode, true, false);
4909 t1 = gen_reg_rtx (mode);
83bc5e44 4910 emit_insn (gen_sub3_insn (t1, cop0, mask));
4911
4912 t2 = gen_reg_rtx (mode);
83bc5e44 4913 emit_insn (gen_sub3_insn (t2, cop1, mask));
4914
4915 cop0 = t1;
4916 cop1 = t2;
4917 code = GT;
4918 }
4919 break;
4920
4921 case E_V64QImode:
4922 case E_V32HImode:
4923 case E_V32QImode:
4924 case E_V16HImode:
4925 case E_V16QImode:
f3661f2d 4926 case E_V8QImode:
2df9d3c5 4927 case E_V4QImode:
04a74555 4928 case E_V2QImode:
2bf6d935 4929 case E_V8HImode:
f3661f2d 4930 case E_V4HImode:
2df9d3c5 4931 case E_V2HImode:
4932 /* Perform a parallel unsigned saturating subtraction. */
4933 x = gen_reg_rtx (mode);
4934 emit_insn (gen_rtx_SET
4935 (x, gen_rtx_US_MINUS (mode, cop0, cop1)));
4936 cop0 = x;
4937 cop1 = CONST0_RTX (mode);
4938 code = EQ;
4939 *negate = !*negate;
4940 break;
4941
4942 default:
4943 gcc_unreachable ();
4944 }
4945 }
4946 }
4947
4948 if (*negate)
4949 std::swap (op_true, op_false);
4950
4951 if (GET_CODE (cop1) == CONST_VECTOR)
4952 cop1 = force_reg (mode, cop1);
4953
4954 /* Allow the comparison to be done in one mode, but the movcc to
4955 happen in another mode. */
4956 if (data_mode == mode)
fa271afb 4957 x = ix86_expand_sse_cmp (dest, code, cop0, cop1, op_true, op_false);
4958 else
4959 {
4960 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
4961 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
4962 op_true, op_false);
4963 if (GET_MODE (x) == mode)
4964 x = gen_lowpart (data_mode, x);
4965 }
4966
4967 return x;
4968}
4969
4970/* Expand integer vector comparison. */
4971
4972bool
4973ix86_expand_int_vec_cmp (rtx operands[])
4974{
4975 rtx_code code = GET_CODE (operands[1]);
4976 bool negate = false;
4977 rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
4978 operands[3], NULL, NULL, &negate);
4979
4980 if (!cmp)
4981 return false;
4982
4983 if (negate)
4984 cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
4985 CONST0_RTX (GET_MODE (cmp)),
4986 NULL, NULL, &negate);
4987
4988 gcc_assert (!negate);
4989
4990 if (operands[0] != cmp)
4991 emit_move_insn (operands[0], cmp);
4992
4993 return true;
4994}
4995
4996/* Expand a floating-point vector conditional move; a vcond operation
4997 rather than a movcc operation. */
4998
4999bool
5000ix86_expand_fp_vcond (rtx operands[])
5001{
5002 enum rtx_code code = GET_CODE (operands[3]);
5003 rtx cmp;
5004
5005 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
5006 &operands[4], &operands[5]);
5007 if (code == UNKNOWN)
5008 {
5009 rtx temp;
5010 switch (GET_CODE (operands[3]))
5011 {
5012 case LTGT:
5013 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
5014 operands[5], operands[0], operands[0]);
5015 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
5016 operands[5], operands[1], operands[2]);
5017 code = AND;
5018 break;
5019 case UNEQ:
5020 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
5021 operands[5], operands[0], operands[0]);
5022 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
5023 operands[5], operands[1], operands[2]);
5024 code = IOR;
5025 break;
5026 default:
5027 gcc_unreachable ();
5028 }
5029 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
5030 OPTAB_DIRECT);
5031 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
5032 return true;
5033 }
5034
5035 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
5036 operands[5], operands[1], operands[2]))
5037 return true;
5038
5039 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
5040 operands[1], operands[2]);
5041 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
5042 return true;
5043}
5044
5045/* Expand a signed/unsigned integral vector conditional move. */
5046
5047bool
5048ix86_expand_int_vcond (rtx operands[])
5049{
5050 machine_mode data_mode = GET_MODE (operands[0]);
5051 machine_mode mode = GET_MODE (operands[4]);
5052 enum rtx_code code = GET_CODE (operands[3]);
5053 bool negate = false;
5054 rtx x, cop0, cop1;
5055
5056 cop0 = operands[4];
5057 cop1 = operands[5];
5058
5059 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
5060 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
5061 if ((code == LT || code == GE)
5062 && data_mode == mode
5063 && cop1 == CONST0_RTX (mode)
5064 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
5065 && GET_MODE_UNIT_SIZE (data_mode) > 1
5066 && GET_MODE_UNIT_SIZE (data_mode) <= 8
5067 && (GET_MODE_SIZE (data_mode) == 16
5068 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
5069 {
5070 rtx negop = operands[2 - (code == LT)];
5071 int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
5072 if (negop == CONST1_RTX (data_mode))
5073 {
5074 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
5075 operands[0], 1, OPTAB_DIRECT);
5076 if (res != operands[0])
5077 emit_move_insn (operands[0], res);
5078 return true;
5079 }
5080 else if (GET_MODE_INNER (data_mode) != DImode
5081 && vector_all_ones_operand (negop, data_mode))
5082 {
5083 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
5084 operands[0], 0, OPTAB_DIRECT);
5085 if (res != operands[0])
5086 emit_move_insn (operands[0], res);
5087 return true;
5088 }
5089 }
5090
5091 if (!nonimmediate_operand (cop1, mode))
5092 cop1 = force_reg (mode, cop1);
5093 if (!general_operand (operands[1], data_mode))
5094 operands[1] = force_reg (data_mode, operands[1]);
5095 if (!general_operand (operands[2], data_mode))
5096 operands[2] = force_reg (data_mode, operands[2]);
5097
5098 x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
5099 operands[1], operands[2], &negate);
5100
5101 if (!x)
5102 return false;
5103
5104 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
5105 operands[2-negate]);
5106 return true;
5107}
5108
5109static bool
5110ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
5111 struct expand_vec_perm_d *d)
5112{
5113 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
5114 expander, so args are either in d, or in op0, op1 etc. */
5115 machine_mode mode = GET_MODE (d ? d->op0 : op0);
5116 machine_mode maskmode = mode;
5117 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
5118
5119 switch (mode)
5120 {
faf2b6bc 5121 case E_V16QImode:
5122 if (TARGET_AVX512VL && TARGET_AVX512VBMI)
5123 gen = gen_avx512vl_vpermt2varv16qi3;
5124 break;
5125 case E_V32QImode:
5126 if (TARGET_AVX512VL && TARGET_AVX512VBMI)
5127 gen = gen_avx512vl_vpermt2varv32qi3;
5128 break;
5129 case E_V64QImode:
5130 if (TARGET_AVX512VBMI)
5131 gen = gen_avx512bw_vpermt2varv64qi3;
5132 break;
5133 case E_V8HImode:
5134 if (TARGET_AVX512VL && TARGET_AVX512BW)
5135 gen = gen_avx512vl_vpermt2varv8hi3;
5136 break;
5137 case E_V16HImode:
5138 if (TARGET_AVX512VL && TARGET_AVX512BW)
5139 gen = gen_avx512vl_vpermt2varv16hi3;
5140 break;
5141 case E_V32HImode:
5142 if (TARGET_AVX512BW)
5143 gen = gen_avx512bw_vpermt2varv32hi3;
5144 break;
5145 case E_V4SImode:
5146 if (TARGET_AVX512VL)
5147 gen = gen_avx512vl_vpermt2varv4si3;
5148 break;
5149 case E_V8SImode:
5150 if (TARGET_AVX512VL)
5151 gen = gen_avx512vl_vpermt2varv8si3;
5152 break;
5153 case E_V16SImode:
5154 if (TARGET_AVX512F)
5155 gen = gen_avx512f_vpermt2varv16si3;
5156 break;
5157 case E_V4SFmode:
5158 if (TARGET_AVX512VL)
5159 {
5160 gen = gen_avx512vl_vpermt2varv4sf3;
5161 maskmode = V4SImode;
5162 }
5163 break;
5164 case E_V8SFmode:
5165 if (TARGET_AVX512VL)
5166 {
5167 gen = gen_avx512vl_vpermt2varv8sf3;
5168 maskmode = V8SImode;
5169 }
5170 break;
5171 case E_V16SFmode:
5172 if (TARGET_AVX512F)
5173 {
5174 gen = gen_avx512f_vpermt2varv16sf3;
5175 maskmode = V16SImode;
5176 }
5177 break;
5178 case E_V2DImode:
5179 if (TARGET_AVX512VL)
5180 gen = gen_avx512vl_vpermt2varv2di3;
5181 break;
5182 case E_V4DImode:
5183 if (TARGET_AVX512VL)
5184 gen = gen_avx512vl_vpermt2varv4di3;
5185 break;
5186 case E_V8DImode:
5187 if (TARGET_AVX512F)
5188 gen = gen_avx512f_vpermt2varv8di3;
5189 break;
5190 case E_V2DFmode:
5191 if (TARGET_AVX512VL)
5192 {
5193 gen = gen_avx512vl_vpermt2varv2df3;
5194 maskmode = V2DImode;
5195 }
5196 break;
5197 case E_V4DFmode:
5198 if (TARGET_AVX512VL)
5199 {
5200 gen = gen_avx512vl_vpermt2varv4df3;
5201 maskmode = V4DImode;
5202 }
5203 break;
5204 case E_V8DFmode:
5205 if (TARGET_AVX512F)
5206 {
5207 gen = gen_avx512f_vpermt2varv8df3;
5208 maskmode = V8DImode;
5209 }
5210 break;
5211 default:
5212 break;
5213 }
5214
5215 if (gen == NULL)
5216 return false;
5217
5218 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
5219 expander, so args are either in d, or in op0, op1 etc. */
5220 if (d)
5221 {
5222 rtx vec[64];
5223 target = d->target;
5224 op0 = d->op0;
5225 op1 = d->op1;
5226 for (int i = 0; i < d->nelt; ++i)
5227 vec[i] = GEN_INT (d->perm[i]);
5228 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
5229 }
5230
5231 emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
5232 return true;
5233}
5234
5235/* Expand a variable vector permutation. */
5236
5237void
5238ix86_expand_vec_perm (rtx operands[])
5239{
5240 rtx target = operands[0];
5241 rtx op0 = operands[1];
5242 rtx op1 = operands[2];
5243 rtx mask = operands[3];
5244 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
5245 machine_mode mode = GET_MODE (op0);
5246 machine_mode maskmode = GET_MODE (mask);
5247 int w, e, i;
5248 bool one_operand_shuffle = rtx_equal_p (op0, op1);
5249
5250 /* Number of elements in the vector. */
5251 w = GET_MODE_NUNITS (mode);
5252 e = GET_MODE_UNIT_SIZE (mode);
5253 gcc_assert (w <= 64);
5254
5255 /* For HF mode vector, convert it to HI using subreg. */
5256 if (GET_MODE_INNER (mode) == HFmode)
5257 {
5258 machine_mode orig_mode = mode;
5259 mode = mode_for_vector (HImode, w).require ();
5260 target = lowpart_subreg (mode, target, orig_mode);
5261 op0 = lowpart_subreg (mode, op0, orig_mode);
5262 op1 = lowpart_subreg (mode, op1, orig_mode);
5263 }
5264
5265 if (TARGET_AVX512F && one_operand_shuffle)
5266 {
5267 rtx (*gen) (rtx, rtx, rtx) = NULL;
5268 switch (mode)
5269 {
5270 case E_V16SImode:
5271 gen =gen_avx512f_permvarv16si;
5272 break;
5273 case E_V16SFmode:
5274 gen = gen_avx512f_permvarv16sf;
5275 break;
5276 case E_V8DImode:
5277 gen = gen_avx512f_permvarv8di;
5278 break;
5279 case E_V8DFmode:
5280 gen = gen_avx512f_permvarv8df;
5281 break;
5282 default:
5283 break;
5284 }
5285 if (gen != NULL)
5286 {
5287 emit_insn (gen (target, op0, mask));
5288 return;
5289 }
5290 }
5291
5292 if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
5293 return;
5294
5295 if (TARGET_AVX2)
5296 {
5297 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
5298 {
5299 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
5300 a constant shuffle operand. With a tiny bit of effort we can
5301 use VPERMD instead. A re-interpretation stall for V4DFmode is
5302 unfortunate but there's no avoiding it.
5303 Similarly for V16HImode we don't have instructions for variable
5304 shuffling, while for V32QImode we can, after preparing suitable
5305 masks, use vpshufb; vpshufb; vpermq; vpor. */
5306
5307 if (mode == V16HImode)
5308 {
5309 maskmode = mode = V32QImode;
5310 w = 32;
5311 e = 1;
5312 }
5313 else
5314 {
5315 maskmode = mode = V8SImode;
5316 w = 8;
5317 e = 4;
5318 }
5319 t1 = gen_reg_rtx (maskmode);
5320
5321 /* Replicate the low bits of the V4DImode mask into V8SImode:
5322 mask = { A B C D }
5323 t1 = { A A B B C C D D }. */
5324 for (i = 0; i < w / 2; ++i)
5325 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
5326 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
5327 vt = force_reg (maskmode, vt);
5328 mask = gen_lowpart (maskmode, mask);
5329 if (maskmode == V8SImode)
5330 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
5331 else
5332 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
5333
5334 /* Multiply the shuffle indices by two. */
5335 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
5336 OPTAB_DIRECT);
5337
5338 /* Add one to the odd shuffle indices:
5339 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
5340 for (i = 0; i < w / 2; ++i)
5341 {
5342 vec[i * 2] = const0_rtx;
5343 vec[i * 2 + 1] = const1_rtx;
5344 }
5345 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
5346 vt = validize_mem (force_const_mem (maskmode, vt));
5347 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
5348 OPTAB_DIRECT);
5349
5350 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
5351 operands[3] = mask = t1;
5352 target = gen_reg_rtx (mode);
5353 op0 = gen_lowpart (mode, op0);
5354 op1 = gen_lowpart (mode, op1);
5355 }
5356
5357 switch (mode)
5358 {
5359 case E_V8SImode:
5360 /* The VPERMD and VPERMPS instructions already properly ignore
5361 the high bits of the shuffle elements. No need for us to
5362 perform an AND ourselves. */
5363 if (one_operand_shuffle)
5364 {
5365 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
5366 if (target != operands[0])
5367 emit_move_insn (operands[0],
5368 gen_lowpart (GET_MODE (operands[0]), target));
5369 }
5370 else
5371 {
5372 t1 = gen_reg_rtx (V8SImode);
5373 t2 = gen_reg_rtx (V8SImode);
5374 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
5375 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
5376 goto merge_two;
5377 }
5378 return;
5379
5380 case E_V8SFmode:
5381 mask = gen_lowpart (V8SImode, mask);
5382 if (one_operand_shuffle)
5383 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
5384 else
5385 {
5386 t1 = gen_reg_rtx (V8SFmode);
5387 t2 = gen_reg_rtx (V8SFmode);
5388 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
5389 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
5390 goto merge_two;
5391 }
5392 return;
5393
5394 case E_V4SImode:
5395 /* By combining the two 128-bit input vectors into one 256-bit
5396 input vector, we can use VPERMD and VPERMPS for the full
5397 two-operand shuffle. */
5398 t1 = gen_reg_rtx (V8SImode);
5399 t2 = gen_reg_rtx (V8SImode);
5400 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
5401 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
5402 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
5403 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
5404 return;
5405
5406 case E_V4SFmode:
5407 t1 = gen_reg_rtx (V8SFmode);
5408 t2 = gen_reg_rtx (V8SImode);
5409 mask = gen_lowpart (V4SImode, mask);
5410 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
5411 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
5412 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
5413 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
5414 return;
5415
5416 case E_V32QImode:
5417 t1 = gen_reg_rtx (V32QImode);
5418 t2 = gen_reg_rtx (V32QImode);
5419 t3 = gen_reg_rtx (V32QImode);
5420 vt2 = GEN_INT (-128);
5421 vt = gen_const_vec_duplicate (V32QImode, vt2);
5422 vt = force_reg (V32QImode, vt);
5423 for (i = 0; i < 32; i++)
5424 vec[i] = i < 16 ? vt2 : const0_rtx;
5425 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
5426 vt2 = force_reg (V32QImode, vt2);
5427 /* From mask create two adjusted masks, which contain the same
5428 bits as mask in the low 7 bits of each vector element.
5429 The first mask will have the most significant bit clear
5430 if it requests element from the same 128-bit lane
5431 and MSB set if it requests element from the other 128-bit lane.
5432 The second mask will have the opposite values of the MSB,
5433 and additionally will have its 128-bit lanes swapped.
5434 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
5435 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
5436 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
5437 stands for other 12 bytes. */
5438 /* The bit whether element is from the same lane or the other
5439 lane is bit 4, so shift it up by 3 to the MSB position. */
5440 t5 = gen_reg_rtx (V4DImode);
5441 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
5442 GEN_INT (3)));
5443 /* Clear MSB bits from the mask just in case it had them set. */
5444 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
5445 /* After this t1 will have MSB set for elements from other lane. */
5446 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
5447 /* Clear bits other than MSB. */
5448 emit_insn (gen_andv32qi3 (t1, t1, vt));
5449 /* Or in the lower bits from mask into t3. */
5450 emit_insn (gen_iorv32qi3 (t3, t1, t2));
5451 /* And invert MSB bits in t1, so MSB is set for elements from the same
5452 lane. */
5453 emit_insn (gen_xorv32qi3 (t1, t1, vt));
5454 /* Swap 128-bit lanes in t3. */
5455 t6 = gen_reg_rtx (V4DImode);
5456 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
5457 const2_rtx, GEN_INT (3),
5458 const0_rtx, const1_rtx));
5459 /* And or in the lower bits from mask into t1. */
5460 emit_insn (gen_iorv32qi3 (t1, t1, t2));
5461 if (one_operand_shuffle)
5462 {
5463 /* Each of these shuffles will put 0s in places where
5464 element from the other 128-bit lane is needed, otherwise
5465 will shuffle in the requested value. */
5466 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
5467 gen_lowpart (V32QImode, t6)));
5468 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
5469 /* For t3 the 128-bit lanes are swapped again. */
5470 t7 = gen_reg_rtx (V4DImode);
5471 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
5472 const2_rtx, GEN_INT (3),
5473 const0_rtx, const1_rtx));
5474 /* And oring both together leads to the result. */
5475 emit_insn (gen_iorv32qi3 (target, t1,
5476 gen_lowpart (V32QImode, t7)));
5477 if (target != operands[0])
5478 emit_move_insn (operands[0],
5479 gen_lowpart (GET_MODE (operands[0]), target));
5480 return;
5481 }
5482
5483 t4 = gen_reg_rtx (V32QImode);
5484 /* Similar to the one_operand_shuffle code above,
5485 just repeated twice, once for each operand. The merge_two:
5486 code below will merge the two results together. */
5487 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
5488 gen_lowpart (V32QImode, t6)));
5489 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
5490 gen_lowpart (V32QImode, t6)));
5491 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
5492 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
5493 t7 = gen_reg_rtx (V4DImode);
5494 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
5495 const2_rtx, GEN_INT (3),
5496 const0_rtx, const1_rtx));
5497 t8 = gen_reg_rtx (V4DImode);
5498 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
5499 const2_rtx, GEN_INT (3),
5500 const0_rtx, const1_rtx));
5501 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
5502 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
5503 t1 = t4;
5504 t2 = t3;
5505 goto merge_two;
5506
5507 default:
5508 gcc_assert (GET_MODE_SIZE (mode) <= 16);
5509 break;
5510 }
5511 }
5512
5513 if (TARGET_XOP)
5514 {
5515 /* The XOP VPPERM insn supports three inputs. By ignoring the
5516 one_operand_shuffle special case, we avoid creating another
5517 set of constant vectors in memory. */
5518 one_operand_shuffle = false;
5519
5520 /* mask = mask & {2*w-1, ...} */
5521 vt = GEN_INT (2*w - 1);
5522 }
5523 else
5524 {
5525 /* mask = mask & {w-1, ...} */
5526 vt = GEN_INT (w - 1);
5527 }
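  /* E.g. for a V4SI shuffle (w == 4) each index is reduced modulo 8 when
     VPPERM can select from both inputs, and modulo 4 otherwise, in which
     case the two separately shuffled results are merged afterwards.  */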
5528
5529 vt = gen_const_vec_duplicate (maskmode, vt);
5530 mask = expand_simple_binop (maskmode, AND, mask, vt,
5531 NULL_RTX, 0, OPTAB_DIRECT);
5532
5533 /* For non-QImode operations, convert the word permutation control
5534 into a byte permutation control. */
5535 if (mode != V16QImode)
5536 {
5537 mask = expand_simple_binop (maskmode, ASHIFT, mask,
5538 GEN_INT (exact_log2 (e)),
5539 NULL_RTX, 0, OPTAB_DIRECT);
5540
5541 /* Convert mask to vector of chars. */
5542 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
5543
5544 /* Replicate each of the input bytes into byte positions:
5545 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
5546 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
5547 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
5548 for (i = 0; i < 16; ++i)
5549 vec[i] = GEN_INT (i/e * e);
5550 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
5551 vt = validize_mem (force_const_mem (V16QImode, vt));
5552 if (TARGET_XOP)
5553 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
5554 else
5555 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
5556
5557 /* Convert it into the byte positions by doing
5558 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
5559 for (i = 0; i < 16; ++i)
5560 vec[i] = GEN_INT (i % e);
5561 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
5562 vt = validize_mem (force_const_mem (V16QImode, vt));
5563 emit_insn (gen_addv16qi3 (mask, mask, vt));
5564 }
5565
5566 /* The actual shuffle operations all operate on V16QImode. */
5567 op0 = gen_lowpart (V16QImode, op0);
5568 op1 = gen_lowpart (V16QImode, op1);
5569
5570 if (TARGET_XOP)
5571 {
5572 if (GET_MODE (target) != V16QImode)
5573 target = gen_reg_rtx (V16QImode);
5574 emit_insn (gen_xop_pperm (target, op0, op1, mask));
5575 if (target != operands[0])
5576 emit_move_insn (operands[0],
5577 gen_lowpart (GET_MODE (operands[0]), target));
5578 }
5579 else if (one_operand_shuffle)
5580 {
5581 if (GET_MODE (target) != V16QImode)
5582 target = gen_reg_rtx (V16QImode);
5583 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
5584 if (target != operands[0])
5585 emit_move_insn (operands[0],
5586 gen_lowpart (GET_MODE (operands[0]), target));
5587 }
5588 else
5589 {
5590 rtx xops[6];
5591 bool ok;
5592
5593 /* Shuffle the two input vectors independently. */
5594 t1 = gen_reg_rtx (V16QImode);
5595 t2 = gen_reg_rtx (V16QImode);
5596 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
5597 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
5598
5599 merge_two:
5600 /* Then merge them together. The key is whether any given control
5601 element contained a bit set that indicates the second word. */
5602 mask = operands[3];
5603 vt = GEN_INT (w);
5604 if (maskmode == V2DImode && !TARGET_SSE4_1)
5605 {
5606 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
5607 more shuffle to convert the V2DI input mask into a V4SI
5608 input mask. At that point the masking done in
5609 ix86_expand_int_vcond will work as desired. */
5610 rtx t3 = gen_reg_rtx (V4SImode);
5611 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
5612 const0_rtx, const0_rtx,
5613 const2_rtx, const2_rtx));
5614 mask = t3;
5615 maskmode = V4SImode;
5616 e = w = 4;
5617 }
5618
5619 vt = gen_const_vec_duplicate (maskmode, vt);
5620 vt = force_reg (maskmode, vt);
5621 mask = expand_simple_binop (maskmode, AND, mask, vt,
5622 NULL_RTX, 0, OPTAB_DIRECT);
5623
5624 if (GET_MODE (target) != mode)
5625 target = gen_reg_rtx (mode);
5626 xops[0] = target;
5627 xops[1] = gen_lowpart (mode, t2);
5628 xops[2] = gen_lowpart (mode, t1);
5629 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
5630 xops[4] = mask;
5631 xops[5] = vt;
5632 ok = ix86_expand_int_vcond (xops);
5633 gcc_assert (ok);
5634 if (target != operands[0])
5635 emit_move_insn (operands[0],
5636 gen_lowpart (GET_MODE (operands[0]), target));
5637 }
5638}
5639
5640/* Extend SRC into the next wider integer vector type. UNSIGNED_P is
5641 true if we should do zero extension, else sign extension. */
5642
5643void
5644ix86_expand_sse_extend (rtx dest, rtx src, bool unsigned_p)
5645{
5646 machine_mode imode = GET_MODE (src);
5647 rtx ops[3];
5648
5649 switch (imode)
5650 {
5651 case E_V8QImode:
5652 case E_V4QImode:
5653 case E_V2QImode:
5654 case E_V4HImode:
5655 case E_V2HImode:
5656 case E_V2SImode:
5657 break;
5658 default:
5659 gcc_unreachable ();
5660 }
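  /* The widening is done as an interleave: SRC is unpacked with either a
     zero vector (zero extension) or with a copy of its own sign bits
     taken from the 0 > SRC comparison below (sign extension), and the
     low half of the interleave is the widened result.  */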
5661
5662 ops[0] = gen_reg_rtx (imode);
5663
5664 ops[1] = force_reg (imode, src);
5665
5666 if (unsigned_p)
5667 ops[2] = force_reg (imode, CONST0_RTX (imode));
5668 else
5669 ops[2] = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
791952ef 5670 ops[1], pc_rtx, pc_rtx);
5671
5672 ix86_split_mmx_punpck (ops, false);
5673 emit_move_insn (dest, lowpart_subreg (GET_MODE (dest), ops[0], imode));
5674}
5675
5676/* Unpack SRC into the next wider integer vector type. UNSIGNED_P is
5677 true if we should do zero extension, else sign extension. HIGH_P is
5678 true if we want the N/2 high elements, else the low elements. */
5679
5680void
5681ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
5682{
5683 machine_mode imode = GET_MODE (src);
5684 rtx tmp;
5685
5686 if (TARGET_SSE4_1)
5687 {
5688 rtx (*unpack)(rtx, rtx);
5689 rtx (*extract)(rtx, rtx) = NULL;
5690 machine_mode halfmode = BLKmode;
5691
5692 switch (imode)
5693 {
5694 case E_V64QImode:
5695 if (unsigned_p)
5696 unpack = gen_avx512bw_zero_extendv32qiv32hi2;
5697 else
5698 unpack = gen_avx512bw_sign_extendv32qiv32hi2;
5699 halfmode = V32QImode;
5700 extract
5701 = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
5702 break;
5703 case E_V32QImode:
5704 if (unsigned_p)
5705 unpack = gen_avx2_zero_extendv16qiv16hi2;
5706 else
5707 unpack = gen_avx2_sign_extendv16qiv16hi2;
5708 halfmode = V16QImode;
5709 extract
5710 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
5711 break;
5712 case E_V32HImode:
5713 if (unsigned_p)
5714 unpack = gen_avx512f_zero_extendv16hiv16si2;
5715 else
5716 unpack = gen_avx512f_sign_extendv16hiv16si2;
5717 halfmode = V16HImode;
5718 extract
5719 = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
5720 break;
5721 case E_V16HImode:
5722 if (unsigned_p)
5723 unpack = gen_avx2_zero_extendv8hiv8si2;
5724 else
5725 unpack = gen_avx2_sign_extendv8hiv8si2;
5726 halfmode = V8HImode;
5727 extract
5728 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
5729 break;
5730 case E_V16SImode:
5731 if (unsigned_p)
5732 unpack = gen_avx512f_zero_extendv8siv8di2;
5733 else
5734 unpack = gen_avx512f_sign_extendv8siv8di2;
5735 halfmode = V8SImode;
5736 extract
5737 = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
5738 break;
5739 case E_V8SImode:
5740 if (unsigned_p)
5741 unpack = gen_avx2_zero_extendv4siv4di2;
5742 else
5743 unpack = gen_avx2_sign_extendv4siv4di2;
5744 halfmode = V4SImode;
5745 extract
5746 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
5747 break;
5748 case E_V16QImode:
5749 if (unsigned_p)
5750 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
5751 else
5752 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
5753 break;
5754 case E_V8HImode:
5755 if (unsigned_p)
5756 unpack = gen_sse4_1_zero_extendv4hiv4si2;
5757 else
5758 unpack = gen_sse4_1_sign_extendv4hiv4si2;
5759 break;
5760 case E_V4SImode:
5761 if (unsigned_p)
5762 unpack = gen_sse4_1_zero_extendv2siv2di2;
5763 else
5764 unpack = gen_sse4_1_sign_extendv2siv2di2;
5765 break;
5766 case E_V8QImode:
5767 if (unsigned_p)
5768 unpack = gen_sse4_1_zero_extendv4qiv4hi2;
5769 else
5770 unpack = gen_sse4_1_sign_extendv4qiv4hi2;
5771 break;
5772 case E_V4HImode:
5773 if (unsigned_p)
5774 unpack = gen_sse4_1_zero_extendv2hiv2si2;
5775 else
5776 unpack = gen_sse4_1_sign_extendv2hiv2si2;
5777 break;
5778 case E_V4QImode:
5779 if (unsigned_p)
5780 unpack = gen_sse4_1_zero_extendv2qiv2hi2;
5781 else
5782 unpack = gen_sse4_1_sign_extendv2qiv2hi2;
5783 break;
5784 default:
5785 gcc_unreachable ();
5786 }
5787
5788 if (GET_MODE_SIZE (imode) >= 32)
5789 {
5790 tmp = gen_reg_rtx (halfmode);
5791 emit_insn (extract (tmp, src));
5792 }
5793 else if (high_p)
5794 {
836328b2
UB
5795 switch (GET_MODE_SIZE (imode))
5796 {
5797 case 16:
5798 /* Shift higher 8 bytes to lower 8 bytes. */
5799 tmp = gen_reg_rtx (V1TImode);
5800 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
5801 GEN_INT (64)));
5802 break;
5803 case 8:
5804 /* Shift higher 4 bytes to lower 4 bytes. */
5805 tmp = gen_reg_rtx (V1DImode);
5806 emit_insn (gen_mmx_lshrv1di3 (tmp, gen_lowpart (V1DImode, src),
5807 GEN_INT (32)));
5808 break;
663a014e
UB
5809 case 4:
5810 /* Shift higher 2 bytes to lower 2 bytes. */
5811 tmp = gen_reg_rtx (V1SImode);
5812 emit_insn (gen_mmx_lshrv1si3 (tmp, gen_lowpart (V1SImode, src),
5813 GEN_INT (16)));
5814 break;
836328b2
UB
5815 default:
5816 gcc_unreachable ();
5817 }
5818
2bf6d935
ML
5819 tmp = gen_lowpart (imode, tmp);
5820 }
5821 else
5822 tmp = src;
5823
5824 emit_insn (unpack (dest, tmp));
5825 }
5826 else
5827 {
5828 rtx (*unpack)(rtx, rtx, rtx);
5829
5830 switch (imode)
5831 {
5832 case E_V16QImode:
5833 if (high_p)
5834 unpack = gen_vec_interleave_highv16qi;
5835 else
5836 unpack = gen_vec_interleave_lowv16qi;
5837 break;
5838 case E_V8HImode:
5839 if (high_p)
5840 unpack = gen_vec_interleave_highv8hi;
5841 else
5842 unpack = gen_vec_interleave_lowv8hi;
5843 break;
5844 case E_V4SImode:
5845 if (high_p)
5846 unpack = gen_vec_interleave_highv4si;
5847 else
5848 unpack = gen_vec_interleave_lowv4si;
5849 break;
836328b2
UB
5850 case E_V8QImode:
5851 if (high_p)
5852 unpack = gen_mmx_punpckhbw;
5853 else
5854 unpack = gen_mmx_punpcklbw;
5855 break;
5856 case E_V4HImode:
5857 if (high_p)
5858 unpack = gen_mmx_punpckhwd;
5859 else
5860 unpack = gen_mmx_punpcklwd;
5861 break;
663a014e
UB
5862 case E_V4QImode:
5863 if (high_p)
5864 unpack = gen_mmx_punpckhbw_low;
5865 else
5866 unpack = gen_mmx_punpcklbw_low;
5867 break;
2bf6d935
ML
5868 default:
5869 gcc_unreachable ();
5870 }
5871
5872 if (unsigned_p)
5873 tmp = force_reg (imode, CONST0_RTX (imode));
5874 else
5875 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
5876 src, pc_rtx, pc_rtx);
5877
5878 rtx tmp2 = gen_reg_rtx (imode);
5879 emit_insn (unpack (tmp2, src, tmp));
5880 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
5881 }
5882}
5883
faf2b6bc 5884/* Return true if MEM is a constant pool reference that contains a const_vector
5885 permutation index; if so, store the index in PERM. */
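/* For example, a MEM referring to the pool entry
   (const_vector:V4SI [(const_int 0) (const_int 2) (const_int 4) (const_int 6)])
   yields PERM = {0, 2, 4, 6}.  */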
5886bool
5887ix86_extract_perm_from_pool_constant (int* perm, rtx mem)
5888{
5889 machine_mode mode = GET_MODE (mem);
5890 int nelt = GET_MODE_NUNITS (mode);
5891
5892 if (!INTEGRAL_MODE_P (mode))
5893 return false;
5894
5895 /* Needs to be constant pool. */
5896 if (!(MEM_P (mem))
5897 || !SYMBOL_REF_P (XEXP (mem, 0))
5898 || !CONSTANT_POOL_ADDRESS_P (XEXP (mem, 0)))
5899 return false;
5900
5901 rtx constant = get_pool_constant (XEXP (mem, 0));
5902
5903 if (GET_CODE (constant) != CONST_VECTOR)
5904 return false;
5905
5906 /* There could be some rtx like
5907 (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
5908 but with "*.LC1" referring to a V2DI constant vector. */
5909 if (GET_MODE (constant) != mode)
5910 {
5911 constant = simplify_subreg (mode, constant, GET_MODE (constant), 0);
5912
5913 if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR)
5914 return false;
5915 }
5916
5917 for (int i = 0; i != nelt; i++)
5918 perm[i] = UINTVAL (XVECEXP (constant, 0, i));
5919
5920 return true;
5921}
5922
2bf6d935
ML
5923/* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
5924 but works for floating point parameters and non-offsettable memories.
5925 For pushes, it returns just stack offsets; the values will be saved
5926 in the right order. At most four parts are generated. */
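/* For example, on ia32 a DFmode operand is split into two SImode parts,
   XFmode into three and TFmode into four; on x86-64 XFmode and TFmode are
   split into two parts, the upper one being SImode for XFmode and DImode
   for TFmode.  */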
5927
5928static int
5929ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
5930{
5931 int size;
5932
5933 if (!TARGET_64BIT)
5934 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
5935 else
5936 size = (GET_MODE_SIZE (mode) + 4) / 8;
5937
5938 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
5939 gcc_assert (size >= 2 && size <= 4);
5940
5941 /* Optimize constant pool references to immediates. This is used by fp
5942 moves, which force all constants to memory to allow combining. */
5943 if (MEM_P (operand) && MEM_READONLY_P (operand))
5944 operand = avoid_constant_pool_reference (operand);
5945
5946 if (MEM_P (operand) && !offsettable_memref_p (operand))
5947 {
5948 /* The only non-offsettable memories we handle are pushes. */
5949 int ok = push_operand (operand, VOIDmode);
5950
5951 gcc_assert (ok);
5952
5953 operand = copy_rtx (operand);
5954 PUT_MODE (operand, word_mode);
5955 parts[0] = parts[1] = parts[2] = parts[3] = operand;
5956 return size;
5957 }
5958
5959 if (GET_CODE (operand) == CONST_VECTOR)
5960 {
5961 scalar_int_mode imode = int_mode_for_mode (mode).require ();
5962 /* Caution: if we looked through a constant pool memory above,
5963 the operand may actually have a different mode now. That's
5964 ok, since we want to pun this all the way back to an integer. */
5965 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
5966 gcc_assert (operand != NULL);
5967 mode = imode;
5968 }
5969
5970 if (!TARGET_64BIT)
5971 {
5972 if (mode == DImode)
5973 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
5974 else
5975 {
5976 int i;
5977
5978 if (REG_P (operand))
5979 {
5980 gcc_assert (reload_completed);
5981 for (i = 0; i < size; i++)
5982 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
5983 }
5984 else if (offsettable_memref_p (operand))
5985 {
5986 operand = adjust_address (operand, SImode, 0);
5987 parts[0] = operand;
5988 for (i = 1; i < size; i++)
5989 parts[i] = adjust_address (operand, SImode, 4 * i);
5990 }
5991 else if (CONST_DOUBLE_P (operand))
5992 {
5993 const REAL_VALUE_TYPE *r;
5994 long l[4];
5995
5996 r = CONST_DOUBLE_REAL_VALUE (operand);
5997 switch (mode)
5998 {
5999 case E_TFmode:
6000 real_to_target (l, r, mode);
6001 parts[3] = gen_int_mode (l[3], SImode);
6002 parts[2] = gen_int_mode (l[2], SImode);
6003 break;
6004 case E_XFmode:
6005 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
6006 long double may not be 80-bit. */
6007 real_to_target (l, r, mode);
6008 parts[2] = gen_int_mode (l[2], SImode);
6009 break;
6010 case E_DFmode:
6011 REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
6012 break;
6013 default:
6014 gcc_unreachable ();
6015 }
6016 parts[1] = gen_int_mode (l[1], SImode);
6017 parts[0] = gen_int_mode (l[0], SImode);
6018 }
6019 else
6020 gcc_unreachable ();
6021 }
6022 }
6023 else
6024 {
6025 if (mode == TImode)
6026 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
6027 if (mode == XFmode || mode == TFmode)
6028 {
6029 machine_mode upper_mode = mode==XFmode ? SImode : DImode;
6030 if (REG_P (operand))
6031 {
6032 gcc_assert (reload_completed);
6033 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
6034 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
6035 }
6036 else if (offsettable_memref_p (operand))
6037 {
6038 operand = adjust_address (operand, DImode, 0);
6039 parts[0] = operand;
6040 parts[1] = adjust_address (operand, upper_mode, 8);
6041 }
6042 else if (CONST_DOUBLE_P (operand))
6043 {
6044 long l[4];
6045
6046 real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
6047
6048 /* real_to_target puts 32-bit pieces in each long. */
6049 parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
6050 | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
6051 << 32), DImode);
6052
6053 if (upper_mode == SImode)
6054 parts[1] = gen_int_mode (l[2], SImode);
6055 else
6056 parts[1]
6057 = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
6058 | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
6059 << 32), DImode);
6060 }
6061 else
6062 gcc_unreachable ();
6063 }
6064 }
6065
6066 return size;
6067}
6068
6069/* Emit insns to perform a move or push of DI, DF, XF, and TF values.
6070 All required insns are emitted by this function itself. Entries 2-5
6071 of the OPERANDS array are used as scratch for the destination parts
6072 and entries 6-9 for the source parts. */
6073
6074void
6075ix86_split_long_move (rtx operands[])
6076{
6077 rtx part[2][4];
6078 int nparts, i, j;
6079 int push = 0;
6080 int collisions = 0;
6081 machine_mode mode = GET_MODE (operands[0]);
6082 bool collisionparts[4];
6083
6084 /* The DFmode expanders may ask us to move a double.
6085 For a 64-bit target this is a single move. By hiding this fact
6086 here we simplify the i386.md splitters. */
6087 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
6088 {
6089 /* Optimize constant pool references to immediates. This is used by
6090 fp moves, which force all constants to memory to allow combining. */
6091
6092 if (MEM_P (operands[1])
6093 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
6094 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
6095 operands[1] = get_pool_constant (XEXP (operands[1], 0));
6096 if (push_operand (operands[0], VOIDmode))
6097 {
6098 operands[0] = copy_rtx (operands[0]);
6099 PUT_MODE (operands[0], word_mode);
6100 }
6101 else
6102 operands[0] = gen_lowpart (DImode, operands[0]);
6103 operands[1] = gen_lowpart (DImode, operands[1]);
6104 emit_move_insn (operands[0], operands[1]);
6105 return;
6106 }
6107
6108 /* The only non-offsettable memory we handle is push. */
6109 if (push_operand (operands[0], VOIDmode))
6110 push = 1;
6111 else
6112 gcc_assert (!MEM_P (operands[0])
6113 || offsettable_memref_p (operands[0]));
6114
6115 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
6116 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
6117
6118 /* When emitting a push, take care of source operands on the stack. */
6119 if (push && MEM_P (operands[1])
6120 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
6121 {
6122 rtx src_base = XEXP (part[1][nparts - 1], 0);
6123
6124 /* Compensate for the stack decrement by 4. */
6125 if (!TARGET_64BIT && nparts == 3
6126 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
6127 src_base = plus_constant (Pmode, src_base, 4);
6128
6129 /* src_base refers to the stack pointer and is
6130 automatically decremented by each emitted push. */
6131 for (i = 0; i < nparts; i++)
6132 part[1][i] = change_address (part[1][i],
6133 GET_MODE (part[1][i]), src_base);
6134 }
6135
6136 /* We need to do the copy in the right order in case an address register
6137 of the source overlaps the destination. */
6138 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
6139 {
6140 rtx tmp;
6141
6142 for (i = 0; i < nparts; i++)
6143 {
6144 collisionparts[i]
6145 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
6146 if (collisionparts[i])
6147 collisions++;
6148 }
6149
6150 /* Collision in the middle part can be handled by reordering. */
6151 if (collisions == 1 && nparts == 3 && collisionparts [1])
6152 {
6153 std::swap (part[0][1], part[0][2]);
6154 std::swap (part[1][1], part[1][2]);
6155 }
6156 else if (collisions == 1
6157 && nparts == 4
6158 && (collisionparts [1] || collisionparts [2]))
6159 {
6160 if (collisionparts [1])
6161 {
6162 std::swap (part[0][1], part[0][2]);
6163 std::swap (part[1][1], part[1][2]);
6164 }
6165 else
6166 {
6167 std::swap (part[0][2], part[0][3]);
6168 std::swap (part[1][2], part[1][3]);
6169 }
6170 }
6171
6172 /* If there are more collisions, we can't handle it by reordering.
6173 Do an lea to the last part and use only one colliding move. */
6174 else if (collisions > 1)
6175 {
6176 rtx base, addr;
6177
6178 collisions = 1;
6179
6180 base = part[0][nparts - 1];
6181
6182 /* Handle the case when the last part isn't valid for lea.
6183 Happens in 64-bit mode storing the 12-byte XFmode. */
6184 if (GET_MODE (base) != Pmode)
6185 base = gen_rtx_REG (Pmode, REGNO (base));
6186
6187 addr = XEXP (part[1][0], 0);
6188 if (TARGET_TLS_DIRECT_SEG_REFS)
6189 {
6190 struct ix86_address parts;
6191 int ok = ix86_decompose_address (addr, &parts);
6192 gcc_assert (ok);
6193 /* It is not valid to use %gs: or %fs: in lea. */
6194 gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
6195 }
6196 emit_insn (gen_rtx_SET (base, addr));
6197 part[1][0] = replace_equiv_address (part[1][0], base);
6198 for (i = 1; i < nparts; i++)
6199 {
6200 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
6201 part[1][i] = replace_equiv_address (part[1][i], tmp);
6202 }
6203 }
6204 }
6205
6206 if (push)
6207 {
6208 if (!TARGET_64BIT)
6209 {
6210 if (nparts == 3)
6211 {
6212 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
d9330fb5 6213 emit_insn (gen_add2_insn (stack_pointer_rtx, GEN_INT (-4)));
2bf6d935
ML
6214 emit_move_insn (part[0][2], part[1][2]);
6215 }
6216 else if (nparts == 4)
6217 {
6218 emit_move_insn (part[0][3], part[1][3]);
6219 emit_move_insn (part[0][2], part[1][2]);
6220 }
6221 }
6222 else
6223 {
6224 /* In 64-bit mode we don't have a 32-bit push available. In case this is
6225 a register, that is OK - we just use the larger counterpart. We also
6226 retype memories - these come from the attempt to avoid a REX prefix on
6227 moving the second half of a TFmode value. */
6228 if (GET_MODE (part[1][1]) == SImode)
6229 {
6230 switch (GET_CODE (part[1][1]))
6231 {
6232 case MEM:
6233 part[1][1] = adjust_address (part[1][1], DImode, 0);
6234 break;
6235
6236 case REG:
6237 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
6238 break;
6239
6240 default:
6241 gcc_unreachable ();
6242 }
6243
6244 if (GET_MODE (part[1][0]) == SImode)
6245 part[1][0] = part[1][1];
6246 }
6247 }
6248 emit_move_insn (part[0][1], part[1][1]);
6249 emit_move_insn (part[0][0], part[1][0]);
6250 return;
6251 }
6252
6253 /* Choose correct order to not overwrite the source before it is copied. */
6254 if ((REG_P (part[0][0])
6255 && REG_P (part[1][1])
6256 && (REGNO (part[0][0]) == REGNO (part[1][1])
6257 || (nparts == 3
6258 && REGNO (part[0][0]) == REGNO (part[1][2]))
6259 || (nparts == 4
6260 && REGNO (part[0][0]) == REGNO (part[1][3]))))
6261 || (collisions > 0
6262 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
6263 {
6264 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
6265 {
6266 operands[2 + i] = part[0][j];
6267 operands[6 + i] = part[1][j];
6268 }
6269 }
6270 else
6271 {
6272 for (i = 0; i < nparts; i++)
6273 {
6274 operands[2 + i] = part[0][i];
6275 operands[6 + i] = part[1][i];
6276 }
6277 }
6278
6279 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
6280 if (optimize_insn_for_size_p ())
6281 {
6282 for (j = 0; j < nparts - 1; j++)
6283 if (CONST_INT_P (operands[6 + j])
6284 && operands[6 + j] != const0_rtx
6285 && REG_P (operands[2 + j]))
6286 for (i = j; i < nparts - 1; i++)
6287 if (CONST_INT_P (operands[7 + i])
6288 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
6289 operands[7 + i] = operands[2 + j];
6290 }
6291
6292 for (i = 0; i < nparts; i++)
6293 emit_move_insn (operands[2 + i], operands[6 + i]);
6294
6295 return;
6296}
6297
6298/* Helper function of ix86_split_ashl used to generate an SImode/DImode
6299 left shift by a constant, either using a single shift or
6300 a sequence of add instructions. */
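/* For example, a left shift by 2 is emitted as two "add reg,reg" insns when
   twice the add cost does not exceed the constant-shift cost and we are not
   optimizing for size; otherwise a single shl with an immediate count is
   used.  */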
6301
6302static void
6303ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
6304{
2bf6d935
ML
6305 if (count == 1
6306 || (count * ix86_cost->add <= ix86_cost->shift_const
6307 && !optimize_insn_for_size_p ()))
6308 {
2bf6d935 6309 while (count-- > 0)
83bc5e44 6310 emit_insn (gen_add2_insn (operand, operand));
2bf6d935
ML
6311 }
6312 else
6313 {
83bc5e44
UB
6314 rtx (*insn)(rtx, rtx, rtx);
6315
2bf6d935
ML
6316 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
6317 emit_insn (insn (operand, operand, GEN_INT (count)));
6318 }
6319}
6320
6321void
6322ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
6323{
6324 rtx (*gen_ashl3)(rtx, rtx, rtx);
6325 rtx (*gen_shld)(rtx, rtx, rtx);
6326 int half_width = GET_MODE_BITSIZE (mode) >> 1;
987a3082 6327 machine_mode half_mode;
2bf6d935
ML
6328
6329 rtx low[2], high[2];
6330 int count;
6331
6332 if (CONST_INT_P (operands[2]))
6333 {
6334 split_double_mode (mode, operands, 2, low, high);
6335 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
6336
6337 if (count >= half_width)
6338 {
6339 emit_move_insn (high[0], low[1]);
38b649ec 6340 ix86_expand_clear (low[0]);
2bf6d935
ML
6341
6342 if (count > half_width)
6343 ix86_expand_ashl_const (high[0], count - half_width, mode);
6344 }
6345 else
6346 {
6347 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
6348
6349 if (!rtx_equal_p (operands[0], operands[1]))
6350 emit_move_insn (operands[0], operands[1]);
6351
6352 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
6353 ix86_expand_ashl_const (low[0], count, mode);
6354 }
6355 return;
6356 }
6357
6358 split_double_mode (mode, operands, 1, low, high);
987a3082 6359 half_mode = mode == DImode ? SImode : DImode;
2bf6d935
ML
6360
6361 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
6362
6363 if (operands[1] == const1_rtx)
6364 {
6365 /* Assuming we've chosen QImode-capable registers, 1 << N
6366 can be done with two 32/64-bit shifts, no branches, no cmoves. */
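/* A sketch of the sequence used on this path for DImode on ia32, assuming
   the shift count lives in %ecx:
       xor   low, low
       xor   high, high
       test  $32, %cl
       sete  lowb          ; low  = (count & 32) == 0
       setne highb         ; high = (count & 32) != 0
       shl   %cl, low      ; the hardware masks the count to 5 bits
       shl   %cl, high
   so the single set bit ends up in the correct half.  */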
6367 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
6368 {
6369 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
6370
6371 ix86_expand_clear (low[0]);
6372 ix86_expand_clear (high[0]);
6373 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
6374
6375 d = gen_lowpart (QImode, low[0]);
6376 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
6377 s = gen_rtx_EQ (QImode, flags, const0_rtx);
6378 emit_insn (gen_rtx_SET (d, s));
6379
6380 d = gen_lowpart (QImode, high[0]);
6381 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
6382 s = gen_rtx_NE (QImode, flags, const0_rtx);
6383 emit_insn (gen_rtx_SET (d, s));
6384 }
6385
6386 /* Otherwise, we can get the same results by manually performing
6387 a bit extract operation on bit 5/6, and then performing the two
6388 shifts. The two methods of getting 0/1 into low/high are exactly
6389 the same size. Avoiding the shift in the bit extract case helps
6390 pentium4 a bit; no one else seems to care much either way. */
6391 else
6392 {
2bf6d935
ML
6393 rtx (*gen_lshr3)(rtx, rtx, rtx);
6394 rtx (*gen_and3)(rtx, rtx, rtx);
6395 rtx (*gen_xor3)(rtx, rtx, rtx);
6396 HOST_WIDE_INT bits;
6397 rtx x;
6398
6399 if (mode == DImode)
6400 {
2bf6d935
ML
6401 gen_lshr3 = gen_lshrsi3;
6402 gen_and3 = gen_andsi3;
6403 gen_xor3 = gen_xorsi3;
6404 bits = 5;
6405 }
6406 else
6407 {
2bf6d935
ML
6408 gen_lshr3 = gen_lshrdi3;
6409 gen_and3 = gen_anddi3;
6410 gen_xor3 = gen_xordi3;
6411 bits = 6;
6412 }
6413
6414 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
6415 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
6416 else
6417 x = gen_lowpart (half_mode, operands[2]);
6418 emit_insn (gen_rtx_SET (high[0], x));
6419
6420 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
6421 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
6422 emit_move_insn (low[0], high[0]);
6423 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
6424 }
6425
6426 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
6427 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
6428 return;
6429 }
6430
6431 if (operands[1] == constm1_rtx)
6432 {
6433 /* For -1 << N, we can avoid the shld instruction, because we
6434 know that we're shifting 0...31/63 ones into a -1. */
6435 emit_move_insn (low[0], constm1_rtx);
6436 if (optimize_insn_for_size_p ())
6437 emit_move_insn (high[0], low[0]);
6438 else
6439 emit_move_insn (high[0], constm1_rtx);
6440 }
6441 else
6442 {
6443 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
6444
6445 if (!rtx_equal_p (operands[0], operands[1]))
6446 emit_move_insn (operands[0], operands[1]);
6447
6448 split_double_mode (mode, operands, 1, low, high);
6449 emit_insn (gen_shld (high[0], low[0], operands[2]));
6450 }
6451
6452 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
6453
6454 if (TARGET_CMOVE && scratch)
6455 {
2bf6d935 6456 ix86_expand_clear (scratch);
987a3082
UB
6457 emit_insn (gen_x86_shift_adj_1
6458 (half_mode, high[0], low[0], operands[2], scratch));
2bf6d935
ML
6459 }
6460 else
987a3082 6461 emit_insn (gen_x86_shift_adj_2 (half_mode, high[0], low[0], operands[2]));
2bf6d935
ML
6462}
6463
6464void
6465ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
6466{
6467 rtx (*gen_ashr3)(rtx, rtx, rtx)
6468 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
6469 rtx (*gen_shrd)(rtx, rtx, rtx);
6470 int half_width = GET_MODE_BITSIZE (mode) >> 1;
6471
6472 rtx low[2], high[2];
6473 int count;
6474
6475 if (CONST_INT_P (operands[2]))
6476 {
6477 split_double_mode (mode, operands, 2, low, high);
6478 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
6479
6480 if (count == GET_MODE_BITSIZE (mode) - 1)
6481 {
6482 emit_move_insn (high[0], high[1]);
6483 emit_insn (gen_ashr3 (high[0], high[0],
6484 GEN_INT (half_width - 1)));
6485 emit_move_insn (low[0], high[0]);
6486
6487 }
6488 else if (count >= half_width)
6489 {
6490 emit_move_insn (low[0], high[1]);
6491 emit_move_insn (high[0], low[0]);
6492 emit_insn (gen_ashr3 (high[0], high[0],
6493 GEN_INT (half_width - 1)));
6494
6495 if (count > half_width)
6496 emit_insn (gen_ashr3 (low[0], low[0],
6497 GEN_INT (count - half_width)));
6498 }
6499 else
6500 {
6501 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6502
6503 if (!rtx_equal_p (operands[0], operands[1]))
6504 emit_move_insn (operands[0], operands[1]);
6505
6506 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
6507 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
6508 }
6509 }
6510 else
6511 {
987a3082
UB
6512 machine_mode half_mode;
6513
2bf6d935
ML
6514 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6515
6516 if (!rtx_equal_p (operands[0], operands[1]))
6517 emit_move_insn (operands[0], operands[1]);
6518
6519 split_double_mode (mode, operands, 1, low, high);
987a3082 6520 half_mode = mode == DImode ? SImode : DImode;
2bf6d935
ML
6521
6522 emit_insn (gen_shrd (low[0], high[0], operands[2]));
6523 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
6524
6525 if (TARGET_CMOVE && scratch)
6526 {
2bf6d935
ML
6527 emit_move_insn (scratch, high[0]);
6528 emit_insn (gen_ashr3 (scratch, scratch,
6529 GEN_INT (half_width - 1)));
987a3082
UB
6530 emit_insn (gen_x86_shift_adj_1
6531 (half_mode, low[0], high[0], operands[2], scratch));
2bf6d935
ML
6532 }
6533 else
987a3082
UB
6534 emit_insn (gen_x86_shift_adj_3
6535 (half_mode, low[0], high[0], operands[2]));
2bf6d935
ML
6536 }
6537}
6538
6539void
6540ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
6541{
6542 rtx (*gen_lshr3)(rtx, rtx, rtx)
6543 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
6544 rtx (*gen_shrd)(rtx, rtx, rtx);
6545 int half_width = GET_MODE_BITSIZE (mode) >> 1;
6546
6547 rtx low[2], high[2];
6548 int count;
6549
6550 if (CONST_INT_P (operands[2]))
6551 {
6552 split_double_mode (mode, operands, 2, low, high);
6553 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
6554
6555 if (count >= half_width)
6556 {
6557 emit_move_insn (low[0], high[1]);
6558 ix86_expand_clear (high[0]);
6559
6560 if (count > half_width)
6561 emit_insn (gen_lshr3 (low[0], low[0],
6562 GEN_INT (count - half_width)));
6563 }
6564 else
6565 {
6566 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6567
6568 if (!rtx_equal_p (operands[0], operands[1]))
6569 emit_move_insn (operands[0], operands[1]);
6570
6571 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
6572 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
6573 }
6574 }
6575 else
6576 {
987a3082
UB
6577 machine_mode half_mode;
6578
2bf6d935
ML
6579 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6580
6581 if (!rtx_equal_p (operands[0], operands[1]))
6582 emit_move_insn (operands[0], operands[1]);
6583
6584 split_double_mode (mode, operands, 1, low, high);
987a3082 6585 half_mode = mode == DImode ? SImode : DImode;
2bf6d935
ML
6586
6587 emit_insn (gen_shrd (low[0], high[0], operands[2]));
6588 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
6589
6590 if (TARGET_CMOVE && scratch)
6591 {
2bf6d935 6592 ix86_expand_clear (scratch);
987a3082
UB
6593 emit_insn (gen_x86_shift_adj_1
6594 (half_mode, low[0], high[0], operands[2], scratch));
2bf6d935
ML
6595 }
6596 else
987a3082
UB
6597 emit_insn (gen_x86_shift_adj_2
6598 (half_mode, low[0], high[0], operands[2]));
2bf6d935
ML
6599 }
6600}
6601
1188cf5f
RS
6602/* Expand move of V1TI mode register X to a new TI mode register. */
6603static rtx
6604ix86_expand_v1ti_to_ti (rtx x)
6605{
6606 rtx result = gen_reg_rtx (TImode);
a5d269f0
RS
6607 if (TARGET_SSE2)
6608 {
51e9e8a2 6609 rtx temp = force_reg (V2DImode, gen_lowpart (V2DImode, x));
a5d269f0
RS
6610 rtx lo = gen_lowpart (DImode, result);
6611 emit_insn (gen_vec_extractv2didi (lo, temp, const0_rtx));
6612 rtx hi = gen_highpart (DImode, result);
6613 emit_insn (gen_vec_extractv2didi (hi, temp, const1_rtx));
6614 }
6615 else
6616 emit_move_insn (result, gen_lowpart (TImode, x));
1188cf5f
RS
6617 return result;
6618}
6619
6620/* Expand move of TI mode register X to a new V1TI mode register. */
6621static rtx
6622ix86_expand_ti_to_v1ti (rtx x)
6623{
1188cf5f
RS
6624 if (TARGET_SSE2)
6625 {
6626 rtx lo = gen_lowpart (DImode, x);
6627 rtx hi = gen_highpart (DImode, x);
6628 rtx tmp = gen_reg_rtx (V2DImode);
6629 emit_insn (gen_vec_concatv2di (tmp, lo, hi));
51e9e8a2 6630 return force_reg (V1TImode, gen_lowpart (V1TImode, tmp));
1188cf5f 6631 }
51e9e8a2
RS
6632
6633 return force_reg (V1TImode, gen_lowpart (V1TImode, x));
1188cf5f
RS
6634}
6635
6b8b2557 6636/* Expand V1TI mode shift (of rtx_code CODE) by constant. */
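/* For example, a constant logical shift by 72 (a multiple of 8) is a single
   pslldq/psrldq byte shift.  A shift by a count that is not a multiple of 8
   is composed from a 64-bit byte shift of the whole register plus per-lane
   V2DI psllq/psrlq shifts whose results are combined with por, as below.  */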
1188cf5f
RS
6637void
6638ix86_expand_v1ti_shift (enum rtx_code code, rtx operands[])
6b8b2557 6639{
6b8b2557
RS
6640 rtx op1 = force_reg (V1TImode, operands[1]);
6641
1188cf5f
RS
6642 if (!CONST_INT_P (operands[2]))
6643 {
6644 rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
6645 rtx tmp2 = gen_reg_rtx (TImode);
6646 rtx (*shift) (rtx, rtx, rtx)
6647 = (code == ASHIFT) ? gen_ashlti3 : gen_lshrti3;
6648 emit_insn (shift (tmp2, tmp1, operands[2]));
6649 rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
6650 emit_move_insn (operands[0], tmp3);
6651 return;
6652 }
6653
6654 HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
6655
6b8b2557
RS
6656 if (bits == 0)
6657 {
6658 emit_move_insn (operands[0], op1);
6659 return;
6660 }
6661
6662 if ((bits & 7) == 0)
6663 {
6664 rtx tmp = gen_reg_rtx (V1TImode);
6665 if (code == ASHIFT)
1188cf5f 6666 emit_insn (gen_sse2_ashlv1ti3 (tmp, op1, GEN_INT (bits)));
6b8b2557
RS
6667 else
6668 emit_insn (gen_sse2_lshrv1ti3 (tmp, op1, GEN_INT (bits)));
6669 emit_move_insn (operands[0], tmp);
6670 return;
6671 }
6672
6673 rtx tmp1 = gen_reg_rtx (V1TImode);
6674 if (code == ASHIFT)
6675 emit_insn (gen_sse2_ashlv1ti3 (tmp1, op1, GEN_INT (64)));
6676 else
6677 emit_insn (gen_sse2_lshrv1ti3 (tmp1, op1, GEN_INT (64)));
6678
6679 /* tmp2 is operands[1] shifted by 64, in V2DImode. */
51e9e8a2 6680 rtx tmp2 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
6b8b2557
RS
6681
6682 /* tmp3 will be the V2DImode result. */
6683 rtx tmp3 = gen_reg_rtx (V2DImode);
6684
6685 if (bits > 64)
6686 {
6687 if (code == ASHIFT)
6688 emit_insn (gen_ashlv2di3 (tmp3, tmp2, GEN_INT (bits - 64)));
6689 else
6690 emit_insn (gen_lshrv2di3 (tmp3, tmp2, GEN_INT (bits - 64)));
6691 }
6692 else
6693 {
6694 /* tmp4 is operands[1], in V2DImode. */
51e9e8a2 6695 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
6b8b2557
RS
6696
6697 rtx tmp5 = gen_reg_rtx (V2DImode);
6698 if (code == ASHIFT)
6699 emit_insn (gen_ashlv2di3 (tmp5, tmp4, GEN_INT (bits)));
6700 else
6701 emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
6702
6703 rtx tmp6 = gen_reg_rtx (V2DImode);
6704 if (code == ASHIFT)
6705 emit_insn (gen_lshrv2di3 (tmp6, tmp2, GEN_INT (64 - bits)));
6706 else
6707 emit_insn (gen_ashlv2di3 (tmp6, tmp2, GEN_INT (64 - bits)));
6708
6709 emit_insn (gen_iorv2di3 (tmp3, tmp5, tmp6));
6710 }
6711
6712 /* Convert the result back to V1TImode and store in operands[0]. */
51e9e8a2 6713 rtx tmp7 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
6b8b2557
RS
6714 emit_move_insn (operands[0], tmp7);
6715}
6716
6717/* Expand V1TI mode rotate (of rtx_code CODE) by constant. */
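/* For example, a rotate by 32, 64 or 96 bits is a single pshufd reordering
   the 32-bit words (immediates 0x93, 0x4e and 0x39 respectively); a rotate
   by another multiple of 8 uses two byte shifts combined with por; all other
   counts shuffle the words into place and merge two V4SI shifts.  */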
1188cf5f
RS
6718void
6719ix86_expand_v1ti_rotate (enum rtx_code code, rtx operands[])
6b8b2557 6720{
6b8b2557
RS
6721 rtx op1 = force_reg (V1TImode, operands[1]);
6722
1188cf5f
RS
6723 if (!CONST_INT_P (operands[2]))
6724 {
6725 rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
6726 rtx tmp2 = gen_reg_rtx (TImode);
6727 rtx (*rotate) (rtx, rtx, rtx)
6728 = (code == ROTATE) ? gen_rotlti3 : gen_rotrti3;
6729 emit_insn (rotate (tmp2, tmp1, operands[2]));
6730 rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
6731 emit_move_insn (operands[0], tmp3);
6732 return;
6733 }
6734
6735 HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
6736
6b8b2557
RS
6737 if (bits == 0)
6738 {
6739 emit_move_insn (operands[0], op1);
6740 return;
6741 }
6742
6743 if (code == ROTATERT)
6744 bits = 128 - bits;
6745
6746 if ((bits & 31) == 0)
6747 {
6b8b2557 6748 rtx tmp2 = gen_reg_rtx (V4SImode);
51e9e8a2 6749 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6b8b2557
RS
6750 if (bits == 32)
6751 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x93)));
6752 else if (bits == 64)
6753 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x4e)));
6754 else
6755 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x39)));
51e9e8a2 6756 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp2));
6b8b2557
RS
6757 return;
6758 }
6759
6760 if ((bits & 7) == 0)
6761 {
6762 rtx tmp1 = gen_reg_rtx (V1TImode);
6763 rtx tmp2 = gen_reg_rtx (V1TImode);
6764 rtx tmp3 = gen_reg_rtx (V1TImode);
6765
6766 emit_insn (gen_sse2_ashlv1ti3 (tmp1, op1, GEN_INT (bits)));
6767 emit_insn (gen_sse2_lshrv1ti3 (tmp2, op1, GEN_INT (128 - bits)));
6768 emit_insn (gen_iorv1ti3 (tmp3, tmp1, tmp2));
6769 emit_move_insn (operands[0], tmp3);
6770 return;
6771 }
6772
51e9e8a2 6773 rtx op1_v4si = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6b8b2557
RS
6774
6775 rtx lobits;
6776 rtx hibits;
6777
6778 switch (bits >> 5)
6779 {
6780 case 0:
6781 lobits = op1_v4si;
6782 hibits = gen_reg_rtx (V4SImode);
6783 emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x93)));
6784 break;
6785
6786 case 1:
6787 lobits = gen_reg_rtx (V4SImode);
6788 hibits = gen_reg_rtx (V4SImode);
6789 emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x93)));
6790 emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x4e)));
6791 break;
6792
6793 case 2:
6794 lobits = gen_reg_rtx (V4SImode);
6795 hibits = gen_reg_rtx (V4SImode);
6796 emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x4e)));
6797 emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x39)));
6798 break;
6799
6800 default:
6801 lobits = gen_reg_rtx (V4SImode);
6802 emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x39)));
6803 hibits = op1_v4si;
6804 break;
6805 }
6806
6807 rtx tmp1 = gen_reg_rtx (V4SImode);
6808 rtx tmp2 = gen_reg_rtx (V4SImode);
6809 rtx tmp3 = gen_reg_rtx (V4SImode);
6b8b2557
RS
6810
6811 emit_insn (gen_ashlv4si3 (tmp1, lobits, GEN_INT (bits & 31)));
6812 emit_insn (gen_lshrv4si3 (tmp2, hibits, GEN_INT (32 - (bits & 31))));
6813 emit_insn (gen_iorv4si3 (tmp3, tmp1, tmp2));
51e9e8a2
RS
6814
6815 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp3));
6b8b2557
RS
6816}
6817
1188cf5f
RS
6818/* Expand V1TI mode ashiftrt by constant. */
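/* For example, an arithmetic shift right by 127 only has to broadcast the
   sign bit: pshufd 0xff replicates the top word and psrad 31 turns it into
   the sign mask.  Smaller counts blend or merge that sign mask with a
   logically shifted copy of the value, as in the cases below.  */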
6819void
6820ix86_expand_v1ti_ashiftrt (rtx operands[])
6821{
6822 rtx op1 = force_reg (V1TImode, operands[1]);
6823
6824 if (!CONST_INT_P (operands[2]))
6825 {
6826 rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
6827 rtx tmp2 = gen_reg_rtx (TImode);
6828 emit_insn (gen_ashrti3 (tmp2, tmp1, operands[2]));
6829 rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
6830 emit_move_insn (operands[0], tmp3);
6831 return;
6832 }
6833
6834 HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
6835
6836 if (bits == 0)
6837 {
6838 emit_move_insn (operands[0], op1);
6839 return;
6840 }
6841
6842 if (bits == 127)
6843 {
6844 /* Two operations. */
51e9e8a2 6845 rtx tmp1 = force_reg(V4SImode, gen_lowpart (V4SImode, op1));
1188cf5f 6846 rtx tmp2 = gen_reg_rtx (V4SImode);
1188cf5f
RS
6847 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6848
6849 rtx tmp3 = gen_reg_rtx (V4SImode);
6850 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6851
51e9e8a2 6852 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp3));
1188cf5f
RS
6853 return;
6854 }
6855
6856 if (bits == 64)
6857 {
6858 /* Three operations. */
51e9e8a2 6859 rtx tmp1 = force_reg(V4SImode, gen_lowpart (V4SImode, op1));
1188cf5f 6860 rtx tmp2 = gen_reg_rtx (V4SImode);
1188cf5f
RS
6861 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6862
6863 rtx tmp3 = gen_reg_rtx (V4SImode);
6864 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6865
51e9e8a2
RS
6866 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
6867 rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
1188cf5f 6868 rtx tmp6 = gen_reg_rtx (V2DImode);
1188cf5f
RS
6869 emit_insn (gen_vec_interleave_highv2di (tmp6, tmp4, tmp5));
6870
51e9e8a2 6871 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
1188cf5f
RS
6872 return;
6873 }
6874
6875 if (bits == 96)
6876 {
6877 /* Three operations. */
51e9e8a2 6878 rtx tmp1 = force_reg(V4SImode, gen_lowpart (V4SImode, op1));
1188cf5f 6879 rtx tmp2 = gen_reg_rtx (V4SImode);
1188cf5f
RS
6880 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (31)));
6881
51e9e8a2
RS
6882 rtx tmp3 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
6883 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp2));
1188cf5f 6884 rtx tmp5 = gen_reg_rtx (V2DImode);
1188cf5f
RS
6885 emit_insn (gen_vec_interleave_highv2di (tmp5, tmp3, tmp4));
6886
51e9e8a2 6887 rtx tmp6 = force_reg(V4SImode, gen_lowpart (V4SImode, tmp5));
1188cf5f 6888 rtx tmp7 = gen_reg_rtx (V4SImode);
1188cf5f
RS
6889 emit_insn (gen_sse2_pshufd (tmp7, tmp6, GEN_INT (0xfd)));
6890
51e9e8a2
RS
6891 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp7));
6892 return;
6893 }
6894
6895 if (bits >= 111)
6896 {
6897 /* Three operations. */
6898 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6899 rtx tmp2 = gen_reg_rtx (V4SImode);
6900 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits - 96)));
6901
6902 rtx tmp3 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
6903 rtx tmp4 = gen_reg_rtx (V8HImode);
6904 emit_insn (gen_sse2_pshufhw (tmp4, tmp3, GEN_INT (0xfe)));
6905
6906 rtx tmp5 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp4));
6907 rtx tmp6 = gen_reg_rtx (V4SImode);
6908 emit_insn (gen_sse2_pshufd (tmp6, tmp5, GEN_INT (0xfe)));
6909
6910 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
1188cf5f
RS
6911 return;
6912 }
6913
6914 if (TARGET_AVX2 || TARGET_SSE4_1)
6915 {
6916 /* Three operations. */
6917 if (bits == 32)
6918 {
51e9e8a2 6919 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
1188cf5f 6920 rtx tmp2 = gen_reg_rtx (V4SImode);
1188cf5f
RS
6921 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (31)));
6922
6923 rtx tmp3 = gen_reg_rtx (V1TImode);
6924 emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (32)));
6925
6926 if (TARGET_AVX2)
6927 {
51e9e8a2 6928 rtx tmp4 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp3));
1188cf5f 6929 rtx tmp5 = gen_reg_rtx (V4SImode);
1188cf5f
RS
6930 emit_insn (gen_avx2_pblenddv4si (tmp5, tmp2, tmp4,
6931 GEN_INT (7)));
6932
51e9e8a2 6933 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp5));
1188cf5f
RS
6934 }
6935 else
6936 {
51e9e8a2
RS
6937 rtx tmp4 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
6938 rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
1188cf5f 6939 rtx tmp6 = gen_reg_rtx (V8HImode);
1188cf5f
RS
6940 emit_insn (gen_sse4_1_pblendw (tmp6, tmp4, tmp5,
6941 GEN_INT (0x3f)));
6942
51e9e8a2 6943 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
1188cf5f
RS
6944 }
6945 return;
6946 }
6947
6948 /* Three operations. */
6949 if (bits == 8 || bits == 16 || bits == 24)
6950 {
51e9e8a2 6951 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
1188cf5f 6952 rtx tmp2 = gen_reg_rtx (V4SImode);
1188cf5f
RS
6953 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
6954
6955 rtx tmp3 = gen_reg_rtx (V1TImode);
6956 emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (bits)));
6957
6958 if (TARGET_AVX2)
6959 {
51e9e8a2 6960 rtx tmp4 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp3));
1188cf5f 6961 rtx tmp5 = gen_reg_rtx (V4SImode);
1188cf5f
RS
6962 emit_insn (gen_avx2_pblenddv4si (tmp5, tmp2, tmp4,
6963 GEN_INT (7)));
6964
51e9e8a2 6965 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp5));
1188cf5f
RS
6966 }
6967 else
6968 {
51e9e8a2
RS
6969 rtx tmp4 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
6970 rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
1188cf5f 6971 rtx tmp6 = gen_reg_rtx (V8HImode);
1188cf5f
RS
6972 emit_insn (gen_sse4_1_pblendw (tmp6, tmp4, tmp5,
6973 GEN_INT (0x3f)));
6974
51e9e8a2 6975 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
1188cf5f
RS
6976 }
6977 return;
6978 }
6979 }
6980
6981 if (bits > 96)
6982 {
6983 /* Four operations. */
51e9e8a2 6984 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
1188cf5f 6985 rtx tmp2 = gen_reg_rtx (V4SImode);
1188cf5f
RS
6986 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits - 96)));
6987
6988 rtx tmp3 = gen_reg_rtx (V4SImode);
6989 emit_insn (gen_ashrv4si3 (tmp3, tmp1, GEN_INT (31)));
6990
51e9e8a2
RS
6991 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp2));
6992 rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
1188cf5f 6993 rtx tmp6 = gen_reg_rtx (V2DImode);
1188cf5f
RS
6994 emit_insn (gen_vec_interleave_highv2di (tmp6, tmp4, tmp5));
6995
51e9e8a2 6996 rtx tmp7 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp6));
1188cf5f 6997 rtx tmp8 = gen_reg_rtx (V4SImode);
1188cf5f
RS
6998 emit_insn (gen_sse2_pshufd (tmp8, tmp7, GEN_INT (0xfd)));
6999
51e9e8a2 7000 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp8));
1188cf5f
RS
7001 return;
7002 }
7003
7004 if (TARGET_SSE4_1 && (bits == 48 || bits == 80))
7005 {
7006 /* Four operations. */
51e9e8a2 7007 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
1188cf5f 7008 rtx tmp2 = gen_reg_rtx (V4SImode);
1188cf5f
RS
7009 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
7010
7011 rtx tmp3 = gen_reg_rtx (V4SImode);
7012 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
7013
7014 rtx tmp4 = gen_reg_rtx (V1TImode);
7015 emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (bits)));
7016
51e9e8a2
RS
7017 rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
7018 rtx tmp6 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp4));
1188cf5f 7019 rtx tmp7 = gen_reg_rtx (V8HImode);
1188cf5f
RS
7020 emit_insn (gen_sse4_1_pblendw (tmp7, tmp5, tmp6,
7021 GEN_INT (bits == 48 ? 0x1f : 0x07)));
7022
51e9e8a2 7023 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp7));
1188cf5f
RS
7024 return;
7025 }
7026
7027 if ((bits & 7) == 0)
7028 {
7029 /* Five operations. */
51e9e8a2 7030 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
1188cf5f 7031 rtx tmp2 = gen_reg_rtx (V4SImode);
1188cf5f
RS
7032 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
7033
7034 rtx tmp3 = gen_reg_rtx (V4SImode);
7035 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
7036
7037 rtx tmp4 = gen_reg_rtx (V1TImode);
7038 emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (bits)));
7039
51e9e8a2 7040 rtx tmp5 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
1188cf5f 7041 rtx tmp6 = gen_reg_rtx (V1TImode);
1188cf5f
RS
7042 emit_insn (gen_sse2_ashlv1ti3 (tmp6, tmp5, GEN_INT (128 - bits)));
7043
51e9e8a2
RS
7044 rtx tmp7 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
7045 rtx tmp8 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp6));
1188cf5f 7046 rtx tmp9 = gen_reg_rtx (V2DImode);
1188cf5f
RS
7047 emit_insn (gen_iorv2di3 (tmp9, tmp7, tmp8));
7048
51e9e8a2 7049 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp9));
1188cf5f
RS
7050 return;
7051 }
7052
7053 if (TARGET_AVX2 && bits < 32)
7054 {
7055 /* Six operations. */
51e9e8a2 7056 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
1188cf5f 7057 rtx tmp2 = gen_reg_rtx (V4SImode);
1188cf5f
RS
7058 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
7059
7060 rtx tmp3 = gen_reg_rtx (V1TImode);
7061 emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (64)));
7062
51e9e8a2 7063 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
1188cf5f 7064 rtx tmp5 = gen_reg_rtx (V2DImode);
1188cf5f
RS
7065 emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
7066
51e9e8a2 7067 rtx tmp6 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
1188cf5f 7068 rtx tmp7 = gen_reg_rtx (V2DImode);
1188cf5f
RS
7069 emit_insn (gen_ashlv2di3 (tmp7, tmp6, GEN_INT (64 - bits)));
7070
7071 rtx tmp8 = gen_reg_rtx (V2DImode);
7072 emit_insn (gen_iorv2di3 (tmp8, tmp5, tmp7));
7073
51e9e8a2 7074 rtx tmp9 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp8));
1188cf5f 7075 rtx tmp10 = gen_reg_rtx (V4SImode);
1188cf5f
RS
7076 emit_insn (gen_avx2_pblenddv4si (tmp10, tmp2, tmp9, GEN_INT (7)));
7077
51e9e8a2 7078 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp10));
1188cf5f
RS
7079 return;
7080 }
7081
7082 if (TARGET_SSE4_1 && bits < 15)
7083 {
7084 /* Six operations. */
51e9e8a2 7085 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
1188cf5f 7086 rtx tmp2 = gen_reg_rtx (V4SImode);
1188cf5f
RS
7087 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
7088
7089 rtx tmp3 = gen_reg_rtx (V1TImode);
7090 emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (64)));
7091
51e9e8a2 7092 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
1188cf5f 7093 rtx tmp5 = gen_reg_rtx (V2DImode);
1188cf5f
RS
7094 emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
7095
51e9e8a2 7096 rtx tmp6 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
1188cf5f 7097 rtx tmp7 = gen_reg_rtx (V2DImode);
1188cf5f
RS
7098 emit_insn (gen_ashlv2di3 (tmp7, tmp6, GEN_INT (64 - bits)));
7099
7100 rtx tmp8 = gen_reg_rtx (V2DImode);
7101 emit_insn (gen_iorv2di3 (tmp8, tmp5, tmp7));
7102
51e9e8a2
RS
7103 rtx tmp9 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
7104 rtx tmp10 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp8));
1188cf5f 7105 rtx tmp11 = gen_reg_rtx (V8HImode);
1188cf5f
RS
7106 emit_insn (gen_sse4_1_pblendw (tmp11, tmp9, tmp10, GEN_INT (0x3f)));
7107
51e9e8a2 7108 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp11));
1188cf5f
RS
7109 return;
7110 }
7111
7112 if (bits == 1)
7113 {
7114 /* Eight operations. */
7115 rtx tmp1 = gen_reg_rtx (V1TImode);
7116 emit_insn (gen_sse2_lshrv1ti3 (tmp1, op1, GEN_INT (64)));
7117
51e9e8a2 7118 rtx tmp2 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
1188cf5f 7119 rtx tmp3 = gen_reg_rtx (V2DImode);
1188cf5f
RS
7120 emit_insn (gen_lshrv2di3 (tmp3, tmp2, GEN_INT (1)));
7121
51e9e8a2 7122 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
1188cf5f 7123 rtx tmp5 = gen_reg_rtx (V2DImode);
1188cf5f
RS
7124 emit_insn (gen_ashlv2di3 (tmp5, tmp4, GEN_INT (63)));
7125
7126 rtx tmp6 = gen_reg_rtx (V2DImode);
7127 emit_insn (gen_iorv2di3 (tmp6, tmp3, tmp5));
7128
7129 rtx tmp7 = gen_reg_rtx (V2DImode);
7130 emit_insn (gen_lshrv2di3 (tmp7, tmp2, GEN_INT (63)));
7131
51e9e8a2 7132 rtx tmp8 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp7));
1188cf5f 7133 rtx tmp9 = gen_reg_rtx (V4SImode);
1188cf5f
RS
7134 emit_insn (gen_sse2_pshufd (tmp9, tmp8, GEN_INT (0xbf)));
7135
51e9e8a2 7136 rtx tmp10 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp9));
1188cf5f 7137 rtx tmp11 = gen_reg_rtx (V2DImode);
1188cf5f
RS
7138 emit_insn (gen_ashlv2di3 (tmp11, tmp10, GEN_INT (31)));
7139
7140 rtx tmp12 = gen_reg_rtx (V2DImode);
7141 emit_insn (gen_iorv2di3 (tmp12, tmp6, tmp11));
7142
51e9e8a2 7143 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp12));
1188cf5f
RS
7144 return;
7145 }
7146
7147 if (bits > 64)
7148 {
7149 /* Eight operations. */
51e9e8a2 7150 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
1188cf5f 7151 rtx tmp2 = gen_reg_rtx (V4SImode);
1188cf5f
RS
7152 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
7153
7154 rtx tmp3 = gen_reg_rtx (V4SImode);
7155 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
7156
7157 rtx tmp4 = gen_reg_rtx (V1TImode);
7158 emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (64)));
7159
51e9e8a2 7160 rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
1188cf5f 7161 rtx tmp6 = gen_reg_rtx (V2DImode);
1188cf5f
RS
7162 emit_insn (gen_lshrv2di3 (tmp6, tmp5, GEN_INT (bits - 64)));
7163
51e9e8a2 7164 rtx tmp7 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
1188cf5f 7165 rtx tmp8 = gen_reg_rtx (V1TImode);
1188cf5f
RS
7166 emit_insn (gen_sse2_ashlv1ti3 (tmp8, tmp7, GEN_INT (64)));
7167
51e9e8a2 7168 rtx tmp9 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
1188cf5f 7169 rtx tmp10 = gen_reg_rtx (V2DImode);
1188cf5f
RS
7170 emit_insn (gen_ashlv2di3 (tmp10, tmp9, GEN_INT (128 - bits)));
7171
51e9e8a2 7172 rtx tmp11 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp8));
1188cf5f 7173 rtx tmp12 = gen_reg_rtx (V2DImode);
1188cf5f
RS
7174 emit_insn (gen_iorv2di3 (tmp12, tmp10, tmp11));
7175
7176 rtx tmp13 = gen_reg_rtx (V2DImode);
7177 emit_insn (gen_iorv2di3 (tmp13, tmp6, tmp12));
7178
51e9e8a2 7179 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp13));
1188cf5f
RS
7180 }
7181 else
7182 {
7183 /* Nine operations. */
51e9e8a2 7184 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
1188cf5f 7185 rtx tmp2 = gen_reg_rtx (V4SImode);
1188cf5f
RS
7186 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
7187
7188 rtx tmp3 = gen_reg_rtx (V4SImode);
7189 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
7190
7191 rtx tmp4 = gen_reg_rtx (V1TImode);
7192 emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (64)));
7193
51e9e8a2 7194 rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
1188cf5f 7195 rtx tmp6 = gen_reg_rtx (V2DImode);
1188cf5f
RS
7196 emit_insn (gen_lshrv2di3 (tmp6, tmp5, GEN_INT (bits)));
7197
51e9e8a2 7198 rtx tmp7 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
1188cf5f 7199 rtx tmp8 = gen_reg_rtx (V2DImode);
1188cf5f
RS
7200 emit_insn (gen_ashlv2di3 (tmp8, tmp7, GEN_INT (64 - bits)));
7201
7202 rtx tmp9 = gen_reg_rtx (V2DImode);
7203 emit_insn (gen_iorv2di3 (tmp9, tmp6, tmp8));
7204
51e9e8a2 7205 rtx tmp10 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
1188cf5f 7206 rtx tmp11 = gen_reg_rtx (V1TImode);
1188cf5f
RS
7207 emit_insn (gen_sse2_ashlv1ti3 (tmp11, tmp10, GEN_INT (64)));
7208
51e9e8a2 7209 rtx tmp12 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp11));
1188cf5f 7210 rtx tmp13 = gen_reg_rtx (V2DImode);
1188cf5f
RS
7211 emit_insn (gen_ashlv2di3 (tmp13, tmp12, GEN_INT (64 - bits)));
7212
7213 rtx tmp14 = gen_reg_rtx (V2DImode);
7214 emit_insn (gen_iorv2di3 (tmp14, tmp9, tmp13));
7215
51e9e8a2 7216 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp14));
1188cf5f
RS
7217 }
7218}
7219
3c135697
JJ
7220/* Replace all occurrences of REG FROM with REG TO in X, including
7221 occurrences with different modes. */
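/* For example, with FROM = (reg:DI 100) and TO = (reg:DI 200), an embedded
   (reg:SI 100) inside X is rewritten to (reg:SI 200) as well.  */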
7222
7223rtx
7224ix86_replace_reg_with_reg (rtx x, rtx from, rtx to)
7225{
7226 gcc_checking_assert (REG_P (from)
7227 && REG_P (to)
7228 && GET_MODE (from) == GET_MODE (to));
7229 if (!reg_overlap_mentioned_p (from, x))
7230 return x;
7231 rtx ret = copy_rtx (x);
7232 subrtx_ptr_iterator::array_type array;
7233 FOR_EACH_SUBRTX_PTR (iter, array, &ret, NONCONST)
7234 {
7235 rtx *loc = *iter;
7236 x = *loc;
7237 if (REG_P (x) && REGNO (x) == REGNO (from))
7238 {
7239 if (x == from)
7240 *loc = to;
7241 else
7242 {
7243 gcc_checking_assert (REG_NREGS (x) == 1);
7244 *loc = gen_rtx_REG (GET_MODE (x), REGNO (to));
7245 }
7246 }
7247 }
7248 return ret;
7249}
7250
2bf6d935
ML
7251/* Return mode for the memcpy/memset loop counter. Prefer SImode over
7252 DImode for constant loop counts. */
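/* For example, a constant count of 0x1000 yields SImode even on 64-bit
   targets, while 0x100000000 needs DImode; a non-constant count without a
   mode falls back to Pmode.  */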
7253
7254static machine_mode
7255counter_mode (rtx count_exp)
7256{
7257 if (GET_MODE (count_exp) != VOIDmode)
7258 return GET_MODE (count_exp);
7259 if (!CONST_INT_P (count_exp))
7260 return Pmode;
7261 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
7262 return DImode;
7263 return SImode;
7264}
7265
7266/* When ISSETMEM is FALSE, output a simple loop to copy the memory pointed to
7267 by SRCPTR to DESTPTR in chunks of MODE unrolled UNROLL times; the overall
7268 size is COUNT, specified in bytes. When ISSETMEM is TRUE, output the
7269 equivalent loop to fill the memory with VALUE (expected to be in MODE).
7270
7271 The size is rounded down to a whole number of chunks moved at once.
7272 SRCMEM and DESTMEM provide MEM rtxes to feed proper aliasing info. */
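/* For example, with MODE = DImode and UNROLL = 4 the loop body copies (or
   stores) 32 bytes per iteration and COUNT is first rounded down to a
   multiple of 32; any remaining tail bytes are left for the epilogue code
   emitted elsewhere.  */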
7273
7274
7275static void
76715c32 7276expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
2bf6d935
ML
7277 rtx destptr, rtx srcptr, rtx value,
7278 rtx count, machine_mode mode, int unroll,
7279 int expected_size, bool issetmem)
7280{
7281 rtx_code_label *out_label, *top_label;
7282 rtx iter, tmp;
7283 machine_mode iter_mode = counter_mode (count);
7284 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
7285 rtx piece_size = GEN_INT (piece_size_n);
7286 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
7287 rtx size;
7288 int i;
7289
7290 top_label = gen_label_rtx ();
7291 out_label = gen_label_rtx ();
7292 iter = gen_reg_rtx (iter_mode);
7293
7294 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
7295 NULL, 1, OPTAB_DIRECT);
7296 /* Those two should combine. */
7297 if (piece_size == const1_rtx)
7298 {
7299 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
7300 true, out_label);
7301 predict_jump (REG_BR_PROB_BASE * 10 / 100);
7302 }
7303 emit_move_insn (iter, const0_rtx);
7304
7305 emit_label (top_label);
7306
7307 tmp = convert_modes (Pmode, iter_mode, iter, true);
7308
7309 /* This assert could be relaxed - in that case we would need to compute
7310 the largest power of two dividing PIECE_SIZE_N and pass it to
7311 offset_address. */
7312 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
7313 destmem = offset_address (destmem, tmp, piece_size_n);
7314 destmem = adjust_address (destmem, mode, 0);
7315
7316 if (!issetmem)
7317 {
7318 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
7319 srcmem = adjust_address (srcmem, mode, 0);
7320
7321 /* When unrolling for chips that reorder memory reads and writes,
7322 we can save registers by using a single temporary.
7323 Also, using 4 temporaries is overkill in 32-bit mode. */
7324 if (!TARGET_64BIT && 0)
7325 {
7326 for (i = 0; i < unroll; i++)
7327 {
7328 if (i)
7329 {
7330 destmem = adjust_address (copy_rtx (destmem), mode,
7331 GET_MODE_SIZE (mode));
7332 srcmem = adjust_address (copy_rtx (srcmem), mode,
7333 GET_MODE_SIZE (mode));
7334 }
7335 emit_move_insn (destmem, srcmem);
7336 }
7337 }
7338 else
7339 {
7340 rtx tmpreg[4];
7341 gcc_assert (unroll <= 4);
7342 for (i = 0; i < unroll; i++)
7343 {
7344 tmpreg[i] = gen_reg_rtx (mode);
7345 if (i)
7346 srcmem = adjust_address (copy_rtx (srcmem), mode,
7347 GET_MODE_SIZE (mode));
7348 emit_move_insn (tmpreg[i], srcmem);
7349 }
7350 for (i = 0; i < unroll; i++)
7351 {
7352 if (i)
7353 destmem = adjust_address (copy_rtx (destmem), mode,
7354 GET_MODE_SIZE (mode));
7355 emit_move_insn (destmem, tmpreg[i]);
7356 }
7357 }
7358 }
7359 else
7360 for (i = 0; i < unroll; i++)
7361 {
7362 if (i)
7363 destmem = adjust_address (copy_rtx (destmem), mode,
7364 GET_MODE_SIZE (mode));
7365 emit_move_insn (destmem, value);
7366 }
7367
7368 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
7369 true, OPTAB_LIB_WIDEN);
7370 if (tmp != iter)
7371 emit_move_insn (iter, tmp);
7372
7373 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
7374 true, top_label);
7375 if (expected_size != -1)
7376 {
7377 expected_size /= GET_MODE_SIZE (mode) * unroll;
7378 if (expected_size == 0)
7379 predict_jump (0);
7380 else if (expected_size > REG_BR_PROB_BASE)
7381 predict_jump (REG_BR_PROB_BASE - 1);
7382 else
7383 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2)
7384 / expected_size);
7385 }
7386 else
7387 predict_jump (REG_BR_PROB_BASE * 80 / 100);
7388 iter = ix86_zero_extend_to_Pmode (iter);
7389 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
7390 true, OPTAB_LIB_WIDEN);
7391 if (tmp != destptr)
7392 emit_move_insn (destptr, tmp);
7393 if (!issetmem)
7394 {
7395 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
7396 true, OPTAB_LIB_WIDEN);
7397 if (tmp != srcptr)
7398 emit_move_insn (srcptr, tmp);
7399 }
7400 emit_label (out_label);
7401}
7402
7403/* Divide COUNTREG by SCALE. */
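/* For a constant COUNTREG the division is folded at expand time; otherwise
   SCALE must be a power of two and the division becomes a logical right
   shift by exact_log2 (SCALE).  */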
7404static rtx
7405scale_counter (rtx countreg, int scale)
7406{
7407 rtx sc;
7408
7409 if (scale == 1)
7410 return countreg;
7411 if (CONST_INT_P (countreg))
7412 return GEN_INT (INTVAL (countreg) / scale);
7413 gcc_assert (REG_P (countreg));
7414
7415 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
7416 GEN_INT (exact_log2 (scale)),
7417 NULL, 1, OPTAB_DIRECT);
7418 return sc;
7419}
7420
7421/* Output a "rep; mov" or "rep; stos" instruction depending on the ISSETMEM argument.
7422 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
7423 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
7424 For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
7425 ORIG_VALUE is the original value passed to memset to fill the memory with.
7426 Other arguments have the same meaning as for the previous function. */
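/* For example, a QImode copy with a known count that is a multiple of 4 is
   promoted to SImode below, i.e. "rep movsd" is emitted instead of
   "rep movsb", unless the target prefers movsb/stosb for known sizes.  */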
7427
7428static void
76715c32 7429expand_set_or_cpymem_via_rep (rtx destmem, rtx srcmem,
2bf6d935
ML
7430 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
7431 rtx count,
7432 machine_mode mode, bool issetmem)
7433{
7434 rtx destexp;
7435 rtx srcexp;
7436 rtx countreg;
7437 HOST_WIDE_INT rounded_count;
7438
7439 /* If possible, it is shorter to use rep movs.
7440 TODO: Maybe it is better to move this logic to decide_alg. */
7441 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
bf24f4ec 7442 && !TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
2bf6d935
ML
7443 && (!issetmem || orig_value == const0_rtx))
7444 mode = SImode;
7445
7446 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
7447 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
7448
7449 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
7450 GET_MODE_SIZE (mode)));
7451 if (mode != QImode)
7452 {
7453 destexp = gen_rtx_ASHIFT (Pmode, countreg,
7454 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
7455 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
7456 }
7457 else
7458 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
7459 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
7460 {
7461 rounded_count
7462 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
7463 destmem = shallow_copy_rtx (destmem);
7464 set_mem_size (destmem, rounded_count);
7465 }
7466 else if (MEM_SIZE_KNOWN_P (destmem))
7467 clear_mem_size (destmem);
7468
7469 if (issetmem)
7470 {
7471 value = force_reg (mode, gen_lowpart (mode, value));
7472 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
7473 }
7474 else
7475 {
7476 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
7477 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
7478 if (mode != QImode)
7479 {
7480 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
7481 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
7482 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
7483 }
7484 else
7485 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
7486 if (CONST_INT_P (count))
7487 {
7488 rounded_count
7489 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
7490 srcmem = shallow_copy_rtx (srcmem);
7491 set_mem_size (srcmem, rounded_count);
7492 }
7493 else
7494 {
7495 if (MEM_SIZE_KNOWN_P (srcmem))
7496 clear_mem_size (srcmem);
7497 }
7498 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
7499 destexp, srcexp));
7500 }
7501}
7502
7503/* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
7504 DESTMEM.
7505 SRCMEM is passed by pointer so that it can be updated on return.
7506 The return value is the updated DESTMEM. */
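/* For example, SIZE_TO_MOVE = 16 normally becomes a single 16-byte move
   through a vector temporary register when such a move pattern is
   available; otherwise the piece size is halved until a supported move
   mode is found.  */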
7507static rtx
7508emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
7509 HOST_WIDE_INT size_to_move)
7510{
c3185b64 7511 rtx dst = destmem, src = *srcmem, tempreg;
2bf6d935
ML
7512 enum insn_code code;
7513 machine_mode move_mode;
7514 int piece_size, i;
7515
7516 /* Find the widest mode in which we could perform moves.
7517 Start with the largest power of 2 not larger than SIZE_TO_MOVE and halve
7518 it until a move of that size is supported. */
7519 piece_size = 1 << floor_log2 (size_to_move);
7520 while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
7521 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
7522 {
7523 gcc_assert (piece_size > 1);
7524 piece_size >>= 1;
7525 }
7526
7527 /* Find the corresponding vector mode with the same size as MOVE_MODE.
7528 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
7529 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
7530 {
7531 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
7532 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
7533 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
7534 {
7535 move_mode = word_mode;
7536 piece_size = GET_MODE_SIZE (move_mode);
7537 code = optab_handler (mov_optab, move_mode);
7538 }
7539 }
7540 gcc_assert (code != CODE_FOR_nothing);
7541
7542 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
7543 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
7544
7545 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
7546 gcc_assert (size_to_move % piece_size == 0);
c3185b64 7547
7548 for (i = 0; i < size_to_move; i += piece_size)
7549 {
7550 /* We move from memory to memory, so we'll need to do it via
7551 a temporary register. */
7552 tempreg = gen_reg_rtx (move_mode);
7553 emit_insn (GEN_FCN (code) (tempreg, src));
7554 emit_insn (GEN_FCN (code) (dst, tempreg));
7555
7556 emit_move_insn (destptr,
c3185b64 7557 plus_constant (Pmode, copy_rtx (destptr), piece_size));
2bf6d935 7558 emit_move_insn (srcptr,
c3185b64 7559 plus_constant (Pmode, copy_rtx (srcptr), piece_size));
7560
7561 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
7562 piece_size);
7563 src = adjust_automodify_address_nv (src, move_mode, srcptr,
7564 piece_size);
7565 }
7566
7567 /* Update DST and SRC rtx. */
7568 *srcmem = src;
7569 return dst;
7570}
7571
7572/* Helper function for the string operations below.  Test VARIABLE for the
7573 VALUE bits and jump to the returned label when they are all zero. */
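/* Typical use: a caller that wants to copy 4 bytes only when bit 2 of COUNT
   is set writes
       label = ix86_expand_aligntest (count, 4, true);
       ... emit the 4-byte move ...
       emit_label (label);
   so the returned label is the target of the "bits are clear" branch.  */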
7574
7575static rtx_code_label *
7576ix86_expand_aligntest (rtx variable, int value, bool epilogue)
7577{
7578 rtx_code_label *label = gen_label_rtx ();
7579 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
7580 if (GET_MODE (variable) == DImode)
7581 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
7582 else
7583 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
7584 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
7585 1, label);
7586 if (epilogue)
7587 predict_jump (REG_BR_PROB_BASE * 50 / 100);
7588 else
7589 predict_jump (REG_BR_PROB_BASE * 90 / 100);
7590 return label;
7591}
7592
7593
7594/* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
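/* Illustrative example: for a constant COUNT with COUNT % MAX_SIZE == 7 and
   MAX_SIZE == 8, the constant path below emits a 4-byte, a 2-byte and a
   1-byte move, walking the set bits of the remainder from high to low.  */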
7595
7596static void
76715c32 7597expand_cpymem_epilogue (rtx destmem, rtx srcmem,
7598 rtx destptr, rtx srcptr, rtx count, int max_size)
7599{
7600 rtx src, dest;
7601 if (CONST_INT_P (count))
7602 {
7603 HOST_WIDE_INT countval = INTVAL (count);
7604 HOST_WIDE_INT epilogue_size = countval % max_size;
7605 int i;
7606
7607 /* For now MAX_SIZE should be a power of 2. This assert could be
7608 relaxed, but it'll require a bit more complicated epilogue
7609 expanding. */
7610 gcc_assert ((max_size & (max_size - 1)) == 0);
7611 for (i = max_size; i >= 1; i >>= 1)
7612 {
7613 if (epilogue_size & i)
7614 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
7615 }
7616 return;
7617 }
7618 if (max_size > 8)
7619 {
7620 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
7621 count, 1, OPTAB_DIRECT);
76715c32 7622 expand_set_or_cpymem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
7623 count, QImode, 1, 4, false);
7624 return;
7625 }
7626
7627 /* When single stringop instructions are available, we can cheaply advance
7628 the dest and src pointers.  Otherwise we save code size by maintaining an
7629 offset (zero is readily available from the preceding rep operation) and
7630 using x86 addressing modes. */
7631 if (TARGET_SINGLE_STRINGOP)
7632 {
7633 if (max_size > 4)
7634 {
7635 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
7636 src = change_address (srcmem, SImode, srcptr);
7637 dest = change_address (destmem, SImode, destptr);
7638 emit_insn (gen_strmov (destptr, dest, srcptr, src));
7639 emit_label (label);
7640 LABEL_NUSES (label) = 1;
7641 }
7642 if (max_size > 2)
7643 {
7644 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
7645 src = change_address (srcmem, HImode, srcptr);
7646 dest = change_address (destmem, HImode, destptr);
7647 emit_insn (gen_strmov (destptr, dest, srcptr, src));
7648 emit_label (label);
7649 LABEL_NUSES (label) = 1;
7650 }
7651 if (max_size > 1)
7652 {
7653 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
7654 src = change_address (srcmem, QImode, srcptr);
7655 dest = change_address (destmem, QImode, destptr);
7656 emit_insn (gen_strmov (destptr, dest, srcptr, src));
7657 emit_label (label);
7658 LABEL_NUSES (label) = 1;
7659 }
7660 }
7661 else
7662 {
7663 rtx offset = force_reg (Pmode, const0_rtx);
7664 rtx tmp;
7665
7666 if (max_size > 4)
7667 {
7668 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
7669 src = change_address (srcmem, SImode, srcptr);
7670 dest = change_address (destmem, SImode, destptr);
7671 emit_move_insn (dest, src);
7672 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
7673 true, OPTAB_LIB_WIDEN);
7674 if (tmp != offset)
7675 emit_move_insn (offset, tmp);
7676 emit_label (label);
7677 LABEL_NUSES (label) = 1;
7678 }
7679 if (max_size > 2)
7680 {
7681 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
7682 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
7683 src = change_address (srcmem, HImode, tmp);
7684 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
7685 dest = change_address (destmem, HImode, tmp);
7686 emit_move_insn (dest, src);
7687 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
7688 true, OPTAB_LIB_WIDEN);
7689 if (tmp != offset)
7690 emit_move_insn (offset, tmp);
7691 emit_label (label);
7692 LABEL_NUSES (label) = 1;
7693 }
7694 if (max_size > 1)
7695 {
7696 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
7697 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
7698 src = change_address (srcmem, QImode, tmp);
7699 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
7700 dest = change_address (destmem, QImode, tmp);
7701 emit_move_insn (dest, src);
7702 emit_label (label);
7703 LABEL_NUSES (label) = 1;
7704 }
7705 }
7706}
7707
7708/* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
7709 with value PROMOTED_VAL.
7710 Unlike emit_memmov there is no source operand to update.
7711 Return value is the updated DESTMEM. */
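/* Illustrative example: with an SImode PROMOTED_VAL and SIZE_TO_MOVE == 2 the
   value is first narrowed to HImode; with a 16-byte vector PROMOTED_VAL and
   SIZE_TO_MOVE == 16 a single vector store plus an explicit DESTPTR increment
   is emitted instead of the strset pattern.  */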
7712static rtx
7713emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
7714 HOST_WIDE_INT size_to_move)
7715{
c3185b64 7716 rtx dst = destmem;
7717 enum insn_code code;
7718 machine_mode move_mode;
7719 int piece_size, i;
7720
7721 /* Use the mode of PROMOTED_VAL for the stores, narrowing it when
7722 SIZE_TO_MOVE is smaller than that mode (QImode is used when
7723 PROMOTED_VAL has no mode). */
7724 move_mode = GET_MODE (promoted_val);
7725 if (move_mode == VOIDmode)
7726 move_mode = QImode;
7727 if (size_to_move < GET_MODE_SIZE (move_mode))
7728 {
7729 unsigned int move_bits = size_to_move * BITS_PER_UNIT;
7730 move_mode = int_mode_for_size (move_bits, 0).require ();
7731 promoted_val = gen_lowpart (move_mode, promoted_val);
7732 }
7733 piece_size = GET_MODE_SIZE (move_mode);
7734 code = optab_handler (mov_optab, move_mode);
7735 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
7736
7737 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
7738
7739 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
7740 gcc_assert (size_to_move % piece_size == 0);
c3185b64 7741
7742 for (i = 0; i < size_to_move; i += piece_size)
7743 {
7744 if (piece_size <= GET_MODE_SIZE (word_mode))
7745 {
7746 emit_insn (gen_strset (destptr, dst, promoted_val));
7747 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
7748 piece_size);
7749 continue;
7750 }
7751
7752 emit_insn (GEN_FCN (code) (dst, promoted_val));
7753
7754 emit_move_insn (destptr,
c3185b64 7755 plus_constant (Pmode, copy_rtx (destptr), piece_size));
7756
7757 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
7758 piece_size);
7759 }
7760
7761 /* Update DST rtx. */
7762 return dst;
7763}
7764/* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
7765static void
7766expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
7767 rtx count, int max_size)
7768{
7769 count = expand_simple_binop (counter_mode (count), AND, count,
7770 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
76715c32 7771 expand_set_or_cpymem_via_loop (destmem, NULL, destptr, NULL,
7772 gen_lowpart (QImode, value), count, QImode,
7773 1, max_size / 2, true);
7774}
7775
7776/* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
7777static void
7778expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
7779 rtx count, int max_size)
7780{
7781 rtx dest;
7782
7783 if (CONST_INT_P (count))
7784 {
7785 HOST_WIDE_INT countval = INTVAL (count);
7786 HOST_WIDE_INT epilogue_size = countval % max_size;
7787 int i;
7788
7789 /* For now MAX_SIZE should be a power of 2. This assert could be
7790 relaxed, but it'll require a bit more complicated epilogue
7791 expanding. */
7792 gcc_assert ((max_size & (max_size - 1)) == 0);
7793 for (i = max_size; i >= 1; i >>= 1)
7794 {
7795 if (epilogue_size & i)
7796 {
7797 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
7798 destmem = emit_memset (destmem, destptr, vec_value, i);
7799 else
7800 destmem = emit_memset (destmem, destptr, value, i);
7801 }
7802 }
7803 return;
7804 }
7805 if (max_size > 32)
7806 {
7807 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
7808 return;
7809 }
7810 if (max_size > 16)
7811 {
7812 rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
7813 if (TARGET_64BIT)
7814 {
7815 dest = change_address (destmem, DImode, destptr);
7816 emit_insn (gen_strset (destptr, dest, value));
7817 dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
7818 emit_insn (gen_strset (destptr, dest, value));
7819 }
7820 else
7821 {
7822 dest = change_address (destmem, SImode, destptr);
7823 emit_insn (gen_strset (destptr, dest, value));
7824 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
7825 emit_insn (gen_strset (destptr, dest, value));
7826 dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
7827 emit_insn (gen_strset (destptr, dest, value));
7828 dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
7829 emit_insn (gen_strset (destptr, dest, value));
7830 }
7831 emit_label (label);
7832 LABEL_NUSES (label) = 1;
7833 }
7834 if (max_size > 8)
7835 {
7836 rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
7837 if (TARGET_64BIT)
7838 {
7839 dest = change_address (destmem, DImode, destptr);
7840 emit_insn (gen_strset (destptr, dest, value));
7841 }
7842 else
7843 {
7844 dest = change_address (destmem, SImode, destptr);
7845 emit_insn (gen_strset (destptr, dest, value));
7846 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
7847 emit_insn (gen_strset (destptr, dest, value));
7848 }
7849 emit_label (label);
7850 LABEL_NUSES (label) = 1;
7851 }
7852 if (max_size > 4)
7853 {
7854 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
7855 dest = change_address (destmem, SImode, destptr);
7856 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
7857 emit_label (label);
7858 LABEL_NUSES (label) = 1;
7859 }
7860 if (max_size > 2)
7861 {
7862 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
7863 dest = change_address (destmem, HImode, destptr);
7864 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
7865 emit_label (label);
7866 LABEL_NUSES (label) = 1;
7867 }
7868 if (max_size > 1)
7869 {
7870 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
7871 dest = change_address (destmem, QImode, destptr);
7872 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
7873 emit_label (label);
7874 LABEL_NUSES (label) = 1;
7875 }
7876}
7877
7878/* Decrease COUNTREG by VALUE. */
7879static void
7880ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
7881{
83bc5e44 7882 emit_insn (gen_add2_insn (countreg, GEN_INT (-value)));
7883}
7884
7885/* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
7886 DESTMEM to align it to DESIRED_ALIGNMENT. Original alignment is ALIGN.
7887 Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
7888 ignored.
7889 Return value is updated DESTMEM. */
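/* Illustrative example: with ALIGN == 1 and DESIRED_ALIGNMENT == 8 the loop
   below emits three conditional copies/stores of 1, 2 and 4 bytes, each
   guarded by an ix86_expand_aligntest on DESTPTR, leaving DESTPTR 8-byte
   aligned and COUNT reduced accordingly.  */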
7890
7891static rtx
76715c32 7892expand_set_or_cpymem_prologue (rtx destmem, rtx srcmem,
7893 rtx destptr, rtx srcptr, rtx value,
7894 rtx vec_value, rtx count, int align,
7895 int desired_alignment, bool issetmem)
7896{
7897 int i;
7898 for (i = 1; i < desired_alignment; i <<= 1)
7899 {
7900 if (align <= i)
7901 {
7902 rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
7903 if (issetmem)
7904 {
7905 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
7906 destmem = emit_memset (destmem, destptr, vec_value, i);
7907 else
7908 destmem = emit_memset (destmem, destptr, value, i);
7909 }
7910 else
7911 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
7912 ix86_adjust_counter (count, i);
7913 emit_label (label);
7914 LABEL_NUSES (label) = 1;
7915 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
7916 }
7917 }
7918 return destmem;
7919}
7920
7921/* Test if COUNT & SIZE is nonzero and, if so, expand a cpymem
7922 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
7923 and jump to DONE_LABEL. */
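/* Illustrative example: with SIZE == 4 this handles COUNT values 4..7 by
   copying (or storing) the first 4 bytes and the last 4 bytes of the block;
   the two ranges may overlap, which is harmless here.  */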
7924static void
76715c32 7925expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem,
7926 rtx destptr, rtx srcptr,
7927 rtx value, rtx vec_value,
7928 rtx count, int size,
7929 rtx done_label, bool issetmem)
7930{
7931 rtx_code_label *label = ix86_expand_aligntest (count, size, false);
7932 machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
7933 rtx modesize;
7934 int n;
7935
7936 /* If we do not have a vector value to store, we must reduce the mode size. */
7937 if (issetmem)
7938 {
7939 if (!vec_value)
7940 {
7941 if (GET_MODE (value) == VOIDmode && size > 8)
7942 mode = Pmode;
7943 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
7944 mode = GET_MODE (value);
7945 }
7946 else
7947 mode = GET_MODE (vec_value), value = vec_value;
7948 }
7949 else
7950 {
7951 /* Choose appropriate vector mode. */
7952 if (size >= 32)
7953 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
7954 else if (size >= 16)
7955 mode = TARGET_SSE ? V16QImode : DImode;
7956 srcmem = change_address (srcmem, mode, srcptr);
7957 }
7958 destmem = change_address (destmem, mode, destptr);
7959 modesize = GEN_INT (GET_MODE_SIZE (mode));
7960 gcc_assert (GET_MODE_SIZE (mode) <= size);
7961 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
7962 {
7963 if (issetmem)
7964 emit_move_insn (destmem, gen_lowpart (mode, value));
7965 else
7966 {
7967 emit_move_insn (destmem, srcmem);
7968 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
7969 }
7970 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
7971 }
7972
7973 destmem = offset_address (destmem, count, 1);
7974 destmem = offset_address (destmem, GEN_INT (-2 * size),
7975 GET_MODE_SIZE (mode));
7976 if (!issetmem)
7977 {
7978 srcmem = offset_address (srcmem, count, 1);
7979 srcmem = offset_address (srcmem, GEN_INT (-2 * size),
7980 GET_MODE_SIZE (mode));
7981 }
7982 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
7983 {
7984 if (issetmem)
7985 emit_move_insn (destmem, gen_lowpart (mode, value));
7986 else
7987 {
7988 emit_move_insn (destmem, srcmem);
7989 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
7990 }
7991 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
7992 }
7993 emit_jump_insn (gen_jump (done_label));
7994 emit_barrier ();
7995
7996 emit_label (label);
7997 LABEL_NUSES (label) = 1;
7998}
7999
8000/* Handle a small memcpy (up to SIZE, which is supposed to be a small power of 2)
8001 and get ready for the main memcpy loop by copying the initial DESIRED_ALIGN-ALIGN
8002 bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT in a way we can
8003 proceed with a loop copying SIZE bytes at once. Do moves in MODE.
8004 DONE_LABEL is a label after the whole copying sequence. The label is created
8005 on demand if *DONE_LABEL is NULL.
8006 MIN_SIZE is the minimal size of the block copied. This value gets adjusted
8007 for new bounds after the initial copies.
8008
8009 DESTMEM/SRCMEM are memory expressions pointing to the copied block,
8010 DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicates whether
8011 we will dispatch to a library call for large blocks.
8012
8013 In pseudocode we do:
8014
8015 if (COUNT < SIZE)
8016 {
8017 Assume that SIZE is 4. Bigger sizes are handled analogously
8018 if (COUNT & 4)
8019 {
8020 copy 4 bytes from SRCPTR to DESTPTR
8021 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
8022 goto done_label
8023 }
8024 if (!COUNT)
8025 goto done_label;
8026 copy 1 byte from SRCPTR to DESTPTR
8027 if (COUNT & 2)
8028 {
8029 copy 2 bytes from SRCPTR to DESTPTR
8030 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
8031 }
8032 }
8033 else
8034 {
8035 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
8036 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
8037
8038 OLD_DESTPTR = DESTPTR;
8039 Align DESTPTR up to DESIRED_ALIGN
8040 SRCPTR += DESTPTR - OLD_DESTPTR
8041 COUNT -= DESTPTR - OLD_DESTPTR
8042 if (DYNAMIC_CHECK)
8043 Round COUNT down to multiple of SIZE
8044 << optional caller supplied zero size guard is here >>
8045 << optional caller supplied dynamic check is here >>
8046 << caller supplied main copy loop is here >>
8047 }
8048 done_label:
8049 */
8050static void
76715c32 8051expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
8052 rtx *destptr, rtx *srcptr,
8053 machine_mode mode,
8054 rtx value, rtx vec_value,
8055 rtx *count,
8056 rtx_code_label **done_label,
8057 int size,
8058 int desired_align,
8059 int align,
8060 unsigned HOST_WIDE_INT *min_size,
8061 bool dynamic_check,
8062 bool issetmem)
8063{
8064 rtx_code_label *loop_label = NULL, *label;
8065 int n;
8066 rtx modesize;
8067 int prolog_size = 0;
8068 rtx mode_value;
8069
8070 /* Choose the proper value to copy. */
8071 if (issetmem && VECTOR_MODE_P (mode))
8072 mode_value = vec_value;
8073 else
8074 mode_value = value;
8075 gcc_assert (GET_MODE_SIZE (mode) <= size);
8076
8077 /* See if block is big or small, handle small blocks. */
8078 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
8079 {
8080 int size2 = size;
8081 loop_label = gen_label_rtx ();
8082
8083 if (!*done_label)
8084 *done_label = gen_label_rtx ();
8085
8086 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
8087 1, loop_label);
8088 size2 >>= 1;
8089
8090 /* Handle sizes > 3. */
8091 for (;size2 > 2; size2 >>= 1)
76715c32 8092 expand_small_cpymem_or_setmem (destmem, srcmem,
8093 *destptr, *srcptr,
8094 value, vec_value,
8095 *count,
8096 size2, *done_label, issetmem);
8097 /* Nothing to copy? Jump to DONE_LABEL if so */
8098 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
8099 1, *done_label);
8100
8101 /* Do a byte copy. */
8102 destmem = change_address (destmem, QImode, *destptr);
8103 if (issetmem)
8104 emit_move_insn (destmem, gen_lowpart (QImode, value));
8105 else
8106 {
8107 srcmem = change_address (srcmem, QImode, *srcptr);
8108 emit_move_insn (destmem, srcmem);
8109 }
8110
8111 /* Handle sizes 2 and 3. */
8112 label = ix86_expand_aligntest (*count, 2, false);
8113 destmem = change_address (destmem, HImode, *destptr);
8114 destmem = offset_address (destmem, *count, 1);
8115 destmem = offset_address (destmem, GEN_INT (-2), 2);
8116 if (issetmem)
8117 emit_move_insn (destmem, gen_lowpart (HImode, value));
8118 else
8119 {
8120 srcmem = change_address (srcmem, HImode, *srcptr);
8121 srcmem = offset_address (srcmem, *count, 1);
8122 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
8123 emit_move_insn (destmem, srcmem);
8124 }
8125
8126 emit_label (label);
8127 LABEL_NUSES (label) = 1;
8128 emit_jump_insn (gen_jump (*done_label));
8129 emit_barrier ();
8130 }
8131 else
8132 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
8133 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
8134
8135 /* Start memcpy for COUNT >= SIZE. */
8136 if (loop_label)
8137 {
8138 emit_label (loop_label);
8139 LABEL_NUSES (loop_label) = 1;
8140 }
8141
8142 /* Copy first desired_align bytes. */
8143 if (!issetmem)
8144 srcmem = change_address (srcmem, mode, *srcptr);
8145 destmem = change_address (destmem, mode, *destptr);
8146 modesize = GEN_INT (GET_MODE_SIZE (mode));
8147 for (n = 0; prolog_size < desired_align - align; n++)
8148 {
8149 if (issetmem)
8150 emit_move_insn (destmem, mode_value);
8151 else
8152 {
8153 emit_move_insn (destmem, srcmem);
8154 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
8155 }
8156 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
8157 prolog_size += GET_MODE_SIZE (mode);
8158 }
8159
8160
8161 /* Copy last SIZE bytes. */
8162 destmem = offset_address (destmem, *count, 1);
8163 destmem = offset_address (destmem,
8164 GEN_INT (-size - prolog_size),
8165 1);
8166 if (issetmem)
8167 emit_move_insn (destmem, mode_value);
8168 else
8169 {
8170 srcmem = offset_address (srcmem, *count, 1);
8171 srcmem = offset_address (srcmem,
8172 GEN_INT (-size - prolog_size),
8173 1);
8174 emit_move_insn (destmem, srcmem);
8175 }
8176 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
8177 {
8178 destmem = offset_address (destmem, modesize, 1);
8179 if (issetmem)
8180 emit_move_insn (destmem, mode_value);
8181 else
8182 {
8183 srcmem = offset_address (srcmem, modesize, 1);
8184 emit_move_insn (destmem, srcmem);
8185 }
8186 }
8187
8188 /* Align destination. */
8189 if (desired_align > 1 && desired_align > align)
8190 {
8191 rtx saveddest = *destptr;
8192
8193 gcc_assert (desired_align <= size);
8194 /* Align destptr up, place it to new register. */
8195 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
8196 GEN_INT (prolog_size),
8197 NULL_RTX, 1, OPTAB_DIRECT);
8198 if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
8199 REG_POINTER (*destptr) = 1;
8200 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
8201 GEN_INT (-desired_align),
8202 *destptr, 1, OPTAB_DIRECT);
8203 /* See how many bytes we skipped. */
8204 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
8205 *destptr,
8206 saveddest, 1, OPTAB_DIRECT);
8207 /* Adjust srcptr and count. */
8208 if (!issetmem)
8209 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
8210 saveddest, *srcptr, 1, OPTAB_DIRECT);
8211 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
8212 saveddest, *count, 1, OPTAB_DIRECT);
8213 /* We copied at most size + prolog_size. */
8214 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
8215 *min_size
8216 = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
8217 else
8218 *min_size = 0;
8219
8220 /* Our loops always round down the block size, but for dispatch to
8221 library we need precise value. */
8222 if (dynamic_check)
8223 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
8224 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
8225 }
8226 else
8227 {
8228 gcc_assert (prolog_size == 0);
8229 /* Decrease count, so we won't end up copying last word twice. */
8230 if (!CONST_INT_P (*count))
8231 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
8232 constm1_rtx, *count, 1, OPTAB_DIRECT);
8233 else
8234 *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
8235 (unsigned HOST_WIDE_INT)size));
8236 if (*min_size)
8237 *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
8238 }
8239}
8240
8241
8242/* This function is like the previous one, except here we know how many bytes
8243 need to be copied. That allows us to update alignment not only of DST, which
8244 is returned, but also of SRC, which is passed as a pointer for that
8245 reason. */
8246static rtx
76715c32 8247expand_set_or_cpymem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
8248 rtx srcreg, rtx value, rtx vec_value,
8249 int desired_align, int align_bytes,
8250 bool issetmem)
8251{
8252 rtx src = NULL;
8253 rtx orig_dst = dst;
8254 rtx orig_src = NULL;
8255 int piece_size = 1;
8256 int copied_bytes = 0;
8257
8258 if (!issetmem)
8259 {
8260 gcc_assert (srcp != NULL);
8261 src = *srcp;
8262 orig_src = src;
8263 }
8264
8265 for (piece_size = 1;
8266 piece_size <= desired_align && copied_bytes < align_bytes;
8267 piece_size <<= 1)
8268 {
8269 if (align_bytes & piece_size)
8270 {
8271 if (issetmem)
8272 {
8273 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
8274 dst = emit_memset (dst, destreg, vec_value, piece_size);
8275 else
8276 dst = emit_memset (dst, destreg, value, piece_size);
8277 }
8278 else
8279 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
8280 copied_bytes += piece_size;
8281 }
8282 }
8283 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
8284 set_mem_align (dst, desired_align * BITS_PER_UNIT);
8285 if (MEM_SIZE_KNOWN_P (orig_dst))
8286 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
8287
8288 if (!issetmem)
8289 {
8290 int src_align_bytes = get_mem_align_offset (src, desired_align
8291 * BITS_PER_UNIT);
8292 if (src_align_bytes >= 0)
8293 src_align_bytes = desired_align - src_align_bytes;
8294 if (src_align_bytes >= 0)
8295 {
8296 unsigned int src_align;
8297 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
8298 {
8299 if ((src_align_bytes & (src_align - 1))
8300 == (align_bytes & (src_align - 1)))
8301 break;
8302 }
8303 if (src_align > (unsigned int) desired_align)
8304 src_align = desired_align;
8305 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
8306 set_mem_align (src, src_align * BITS_PER_UNIT);
8307 }
8308 if (MEM_SIZE_KNOWN_P (orig_src))
8309 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
8310 *srcp = src;
8311 }
8312
8313 return dst;
8314}
8315
8316/* Return true if ALG can be used in current context.
8317 Assume we expand memset if MEMSET is true. */
8318static bool
8319alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
8320{
8321 if (alg == no_stringop)
8322 return false;
8323 /* It is not possible to use a library call if we have non-default
8324 address space. We can do better than the generic byte-at-a-time
8325 loop, used as a fallback. */
8326 if (alg == libcall && have_as)
8327 return false;
8328 if (alg == vector_loop)
8329 return TARGET_SSE || TARGET_AVX;
8330 /* Algorithms using the rep prefix want at least edi and ecx;
8331 additionally, memset wants eax and memcpy wants esi. Don't
8332 consider such algorithms if the user has appropriated those
8333 registers for their own purposes, or if we have a non-default
8334 address space, since some string insns cannot override the segment. */
8335 if (alg == rep_prefix_1_byte
8336 || alg == rep_prefix_4_byte
8337 || alg == rep_prefix_8_byte)
8338 {
8339 if (have_as)
8340 return false;
8341 if (fixed_regs[CX_REG]
8342 || fixed_regs[DI_REG]
8343 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
8344 return false;
8345 }
8346 return true;
8347}
8348
8349/* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
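/* Summary of the parameters, inferred from the uses below: MIN_SIZE/MAX_SIZE
   bound a possibly unknown block size, ZERO_MEMSET says the memset value is
   known to be zero, HAVE_AS flags a non-default address space, *DYNAMIC_CHECK
   is set to a runtime size threshold above which a library call is emitted,
   and *NOALIGN suppresses the alignment prologue.  */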
8350static enum stringop_alg
8351decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
8352 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
8353 bool memset, bool zero_memset, bool have_as,
8354 int *dynamic_check, bool *noalign, bool recur)
8355{
8356 const struct stringop_algs *algs;
8357 bool optimize_for_speed;
8358 int max = 0;
8359 const struct processor_costs *cost;
8360 int i;
8361 bool any_alg_usable_p = false;
8362
8363 *noalign = false;
8364 *dynamic_check = -1;
8365
8366 /* Even if the string operation call is cold, we still might spend a lot
8367 of time processing large blocks. */
8368 if (optimize_function_for_size_p (cfun)
8369 || (optimize_insn_for_size_p ()
8370 && (max_size < 256
8371 || (expected_size != -1 && expected_size < 256))))
8372 optimize_for_speed = false;
8373 else
8374 optimize_for_speed = true;
8375
8376 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
8377 if (memset)
8378 algs = &cost->memset[TARGET_64BIT != 0];
8379 else
8380 algs = &cost->memcpy[TARGET_64BIT != 0];
8381
8382 /* See maximal size for user defined algorithm. */
8383 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
8384 {
8385 enum stringop_alg candidate = algs->size[i].alg;
8386 bool usable = alg_usable_p (candidate, memset, have_as);
8387 any_alg_usable_p |= usable;
8388
8389 if (candidate != libcall && candidate && usable)
8390 max = algs->size[i].max;
8391 }
8392
8393 /* If expected size is not known but max size is small enough
8394 so inline version is a win, set expected size into
8395 the range. */
8396 if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
8397 && expected_size == -1)
8398 expected_size = min_size / 2 + max_size / 2;
8399
8400 /* If user specified the algorithm, honor it if possible. */
8401 if (ix86_stringop_alg != no_stringop
8402 && alg_usable_p (ix86_stringop_alg, memset, have_as))
8403 return ix86_stringop_alg;
8404 /* rep; movq or rep; movl is the smallest variant. */
8405 else if (!optimize_for_speed)
8406 {
8407 *noalign = true;
8408 if (!count || (count & 3) || (memset && !zero_memset))
8409 return alg_usable_p (rep_prefix_1_byte, memset, have_as)
8410 ? rep_prefix_1_byte : loop_1_byte;
8411 else
8412 return alg_usable_p (rep_prefix_4_byte, memset, have_as)
8413 ? rep_prefix_4_byte : loop;
8414 }
8415 /* Very tiny blocks are best handled via the loop; REP is expensive to
8416 set up. */
8417 else if (expected_size != -1 && expected_size < 4)
8418 return loop_1_byte;
8419 else if (expected_size != -1)
8420 {
8421 enum stringop_alg alg = libcall;
8422 bool alg_noalign = false;
8423 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
8424 {
8425 /* We get here if the algorithms that were not libcall-based
8426 were rep-prefix based and we are unable to use rep prefixes
8427 based on global register usage. Break out of the loop and
8428 use the heuristic below. */
8429 if (algs->size[i].max == 0)
8430 break;
8431 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
8432 {
8433 enum stringop_alg candidate = algs->size[i].alg;
8434
8435 if (candidate != libcall
8436 && alg_usable_p (candidate, memset, have_as))
8437 {
8438 alg = candidate;
8439 alg_noalign = algs->size[i].noalign;
8440 }
8441 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
8442 last non-libcall inline algorithm. */
8443 if (TARGET_INLINE_ALL_STRINGOPS)
8444 {
8445 /* When the current size is best to be copied by a libcall,
8446 but we are still forced to inline, run the heuristic below
8447 that will pick code for medium sized blocks. */
8448 if (alg != libcall)
8449 {
8450 *noalign = alg_noalign;
8451 return alg;
8452 }
8453 else if (!any_alg_usable_p)
8454 break;
8455 }
8456 else if (alg_usable_p (candidate, memset, have_as)
8457 && !(TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
8458 && candidate == rep_prefix_1_byte
8459 /* NB: If min_size != max_size, size is
8460 unknown. */
8461 && min_size != max_size))
8462 {
8463 *noalign = algs->size[i].noalign;
8464 return candidate;
8465 }
8466 }
8467 }
8468 }
8469 /* When asked to inline the call anyway, try to pick a meaningful choice.
8470 We look for the maximal size of block that is faster to copy by hand and
8471 take blocks of at most that size, guessing that the average size will
8472 be roughly half of the maximum.
8473
8474 If this turns out to be bad, we might simply specify the preferred
8475 choice in ix86_costs. */
8476 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
8477 && (algs->unknown_size == libcall
8478 || !alg_usable_p (algs->unknown_size, memset, have_as)))
8479 {
8480 enum stringop_alg alg;
8481 HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
8482
8483 /* If there aren't any usable algorithms or if recursing already,
8484 then recursing on smaller sizes or same size isn't going to
8485 find anything. Just return the simple byte-at-a-time copy loop. */
8486 if (!any_alg_usable_p || recur)
8487 {
8488 /* Pick something reasonable. */
8489 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
8490 *dynamic_check = 128;
8491 return loop_1_byte;
8492 }
8493 alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
8494 zero_memset, have_as, dynamic_check, noalign, true);
8495 gcc_assert (*dynamic_check == -1);
8496 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
8497 *dynamic_check = max;
8498 else
8499 gcc_assert (alg != libcall);
8500 return alg;
8501 }
8502
8503 /* Try to use some reasonable fallback algorithm. Note that for
8504 non-default address spaces we default to a loop instead of
8505 a libcall. */
2bf6d935 8506 return (alg_usable_p (algs->unknown_size, memset, have_as)
c6bff80d 8507 ? algs->unknown_size : have_as ? loop : libcall);
8508}
8509
8510/* Decide on alignment. We know that the operand is already aligned to ALIGN
8511 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
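/* E.g. the vector_loop algorithm with a 16-byte MOVE_MODE asks for 16-byte
   destination alignment, while optimizing for size keeps just the original
   alignment (illustrative of the logic below).  */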
8512static int
8513decide_alignment (int align,
8514 enum stringop_alg alg,
8515 int expected_size,
8516 machine_mode move_mode)
8517{
8518 int desired_align = 0;
8519
8520 gcc_assert (alg != no_stringop);
8521
8522 if (alg == libcall)
8523 return 0;
8524 if (move_mode == VOIDmode)
8525 return 0;
8526
8527 desired_align = GET_MODE_SIZE (move_mode);
8528 /* PentiumPro has special logic triggering for 8 byte aligned blocks,
8529 copying a whole cache line at once. */
f23881fc 8530 if (TARGET_CPU_P (PENTIUMPRO)
8531 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
8532 desired_align = 8;
8533
8534 if (optimize_size)
8535 desired_align = 1;
8536 if (desired_align < align)
8537 desired_align = align;
8538 if (expected_size != -1 && expected_size < 4)
8539 desired_align = align;
8540
8541 return desired_align;
8542}
8543
8544
8545/* Helper function for memset. For a QImode value 0xXY produce
8546 0xXYXYXYXY of the width specified by MODE. This is essentially
8547 a * 0x01010101, but we can do slightly better than
8548 synth_mult by unwinding the sequence by hand on CPUs with
8549 slow multiply. */
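/* For instance, VAL == 0x5A with MODE == SImode yields a register holding
   0x5A5A5A5A, and with MODE == DImode 0x5A5A5A5A5A5A5A5A.  */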
8550static rtx
8551promote_duplicated_reg (machine_mode mode, rtx val)
8552{
8553 machine_mode valmode = GET_MODE (val);
8554 rtx tmp;
8555 int nops = mode == DImode ? 3 : 2;
8556
8557 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
8558 if (val == const0_rtx)
8559 return copy_to_mode_reg (mode, CONST0_RTX (mode));
8560 if (CONST_INT_P (val))
8561 {
8562 HOST_WIDE_INT v = INTVAL (val) & 255;
8563
8564 v |= v << 8;
8565 v |= v << 16;
8566 if (mode == DImode)
8567 v |= (v << 16) << 16;
8568 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
8569 }
8570
8571 if (valmode == VOIDmode)
8572 valmode = QImode;
8573 if (valmode != QImode)
8574 val = gen_lowpart (QImode, val);
8575 if (mode == QImode)
8576 return val;
8577 if (!TARGET_PARTIAL_REG_STALL)
8578 nops--;
8579 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
8580 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
8581 <= (ix86_cost->shift_const + ix86_cost->add) * nops
8582 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
8583 {
8584 rtx reg = convert_modes (mode, QImode, val, true);
8585 tmp = promote_duplicated_reg (mode, const1_rtx);
8586 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
8587 OPTAB_DIRECT);
8588 }
8589 else
8590 {
8591 rtx reg = convert_modes (mode, QImode, val, true);
8592
8593 if (!TARGET_PARTIAL_REG_STALL)
e9539592 8594 emit_insn (gen_insv_1 (mode, reg, reg));
8595 else
8596 {
8597 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
8598 NULL, 1, OPTAB_DIRECT);
8599 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1,
8600 OPTAB_DIRECT);
8601 }
8602 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
8603 NULL, 1, OPTAB_DIRECT);
8604 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
8605 if (mode == SImode)
8606 return reg;
8607 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
8608 NULL, 1, OPTAB_DIRECT);
8609 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
8610 return reg;
8611 }
8612}
8613
8614/* Duplicate value VAL using promote_duplicated_reg into the maximal size that
8615 will be needed by the main loop copying SIZE_NEEDED chunks and by the
8616 prologue getting alignment from ALIGN to DESIRED_ALIGN. */
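/* E.g. on a 64-bit target with SIZE_NEEDED == 16 the value is broadcast into
   a DImode register (the widest GPR chunk used here), while SIZE_NEEDED == 2
   with no extra alignment work only needs an HImode broadcast.  */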
8617static rtx
8618promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
8619 int align)
8620{
8621 rtx promoted_val;
8622
8623 if (TARGET_64BIT
8624 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
8625 promoted_val = promote_duplicated_reg (DImode, val);
8626 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
8627 promoted_val = promote_duplicated_reg (SImode, val);
8628 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
8629 promoted_val = promote_duplicated_reg (HImode, val);
8630 else
8631 promoted_val = val;
8632
8633 return promoted_val;
8634}
8635
8636/* Copy the address to a Pmode register. This is used for x32 to
8637 truncate DImode TLS address to a SImode register. */
8638
8639static rtx
8640ix86_copy_addr_to_reg (rtx addr)
8641{
8642 rtx reg;
8643 if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
8644 {
8645 reg = copy_addr_to_reg (addr);
8646 REG_POINTER (reg) = 1;
8647 return reg;
8648 }
8649 else
8650 {
8651 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
8652 reg = copy_to_mode_reg (DImode, addr);
8653 REG_POINTER (reg) = 1;
8654 return gen_rtx_SUBREG (SImode, reg, 0);
8655 }
8656}
8657
8658/* Expand string move (memcpy) or store (memset) operation. Use i386 string
8659 operations when profitable. The code depends upon architecture, block size
8660 and alignment, but always has one of the following overall structures:
8661
8662 Aligned move sequence:
8663
8664 1) Prologue guard: Conditional that jumps up to epilogues for small
8665 blocks that can be handled by epilogue alone. This is faster
8666 but also needed for correctness, since the prologue assumes the block
8667 is larger than the desired alignment.
8668
8669 Optional dynamic check for size and libcall for large
8670 blocks is emitted here too, with -minline-stringops-dynamically.
8671
8672 2) Prologue: copy first few bytes in order to get destination
8673 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
8674 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
8675 copied. We emit either a jump tree on power of two sized
8676 blocks, or a byte loop.
8677
8678 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
8679 with specified algorithm.
8680
8681 4) Epilogue: code copying tail of the block that is too small to be
8682 handled by main body (or up to size guarded by prologue guard).
8683
8684 Misaligned move sequence
8685
8686 1) misaligned move prologue/epilogue containing:
8687 a) Prologue handling small memory blocks and jumping to done_label
8688 (skipped if blocks are known to be large enough)
8689 b) Single move copying first DESIRED_ALIGN-ALIGN bytes if alignment is
8690 needed by single possibly misaligned move
8691 (skipped if alignment is not needed)
8692 c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
8693
8694 2) Zero size guard dispatching to done_label, if needed
8695
8696 3) Dispatch to a library call, if needed.
8697
8698 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
8699 with the specified algorithm. */
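/* The function returns true when the operation was expanded inline and false
   when the caller should fall back to a library call, e.g. when decide_alg
   picks the libcall strategy or the constant block size exceeds 2^30 bytes.  */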
8700bool
76715c32 8701ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
8702 rtx align_exp, rtx expected_align_exp,
8703 rtx expected_size_exp, rtx min_size_exp,
8704 rtx max_size_exp, rtx probable_max_size_exp,
8705 bool issetmem)
8706{
8707 rtx destreg;
8708 rtx srcreg = NULL;
8709 rtx_code_label *label = NULL;
8710 rtx tmp;
8711 rtx_code_label *jump_around_label = NULL;
8712 HOST_WIDE_INT align = 1;
8713 unsigned HOST_WIDE_INT count = 0;
8714 HOST_WIDE_INT expected_size = -1;
8715 int size_needed = 0, epilogue_size_needed;
8716 int desired_align = 0, align_bytes = 0;
8717 enum stringop_alg alg;
8718 rtx promoted_val = NULL;
8719 rtx vec_promoted_val = NULL;
8720 bool force_loopy_epilogue = false;
8721 int dynamic_check;
8722 bool need_zero_guard = false;
8723 bool noalign;
8724 machine_mode move_mode = VOIDmode;
8725 machine_mode wider_mode;
8726 int unroll_factor = 1;
8727 /* TODO: Once value ranges are available, fill in proper data. */
8728 unsigned HOST_WIDE_INT min_size = 0;
8729 unsigned HOST_WIDE_INT max_size = -1;
8730 unsigned HOST_WIDE_INT probable_max_size = -1;
8731 bool misaligned_prologue_used = false;
8732 bool have_as;
8733
8734 if (CONST_INT_P (align_exp))
8735 align = INTVAL (align_exp);
8736 /* i386 can do misaligned access at a reasonably increased cost. */
8737 if (CONST_INT_P (expected_align_exp)
8738 && INTVAL (expected_align_exp) > align)
8739 align = INTVAL (expected_align_exp);
8740 /* ALIGN is the minimum of destination and source alignment, but we care here
8741 just about destination alignment. */
8742 else if (!issetmem
8743 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
8744 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
8745
8746 if (CONST_INT_P (count_exp))
8747 {
8748 min_size = max_size = probable_max_size = count = expected_size
8749 = INTVAL (count_exp);
8750 /* When COUNT is 0, there is nothing to do. */
8751 if (!count)
8752 return true;
8753 }
8754 else
8755 {
8756 if (min_size_exp)
8757 min_size = INTVAL (min_size_exp);
8758 if (max_size_exp)
8759 max_size = INTVAL (max_size_exp);
8760 if (probable_max_size_exp)
8761 probable_max_size = INTVAL (probable_max_size_exp);
8762 if (CONST_INT_P (expected_size_exp))
8763 expected_size = INTVAL (expected_size_exp);
8764 }
8765
8766 /* Make sure we don't need to care about overflow later on. */
8767 if (count > (HOST_WIDE_INT_1U << 30))
8768 return false;
8769
8770 have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
8771 if (!issetmem)
8772 have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
8773
8774 /* Step 0: Decide on preferred algorithm, desired alignment and
8775 size of chunks to be copied by main loop. */
8776 alg = decide_alg (count, expected_size, min_size, probable_max_size,
8777 issetmem,
8778 issetmem && val_exp == const0_rtx, have_as,
8779 &dynamic_check, &noalign, false);
8780
8781 if (dump_file)
8782 fprintf (dump_file, "Selected stringop expansion strategy: %s\n",
8783 stringop_alg_names[alg]);
8784
8785 if (alg == libcall)
8786 return false;
8787 gcc_assert (alg != no_stringop);
8788
8789 /* For now the vector version of memset is generated only for memory zeroing,
8790 as creating the promoted vector value is very cheap in this case. */
8791 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
8792 alg = unrolled_loop;
8793
8794 if (!count)
8795 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
8796 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
8797 if (!issetmem)
8798 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
8799
8800 unroll_factor = 1;
8801 move_mode = word_mode;
8802 switch (alg)
8803 {
8804 case libcall:
8805 case no_stringop:
8806 case last_alg:
8807 gcc_unreachable ();
8808 case loop_1_byte:
8809 need_zero_guard = true;
8810 move_mode = QImode;
8811 break;
8812 case loop:
8813 need_zero_guard = true;
8814 break;
8815 case unrolled_loop:
8816 need_zero_guard = true;
8817 unroll_factor = (TARGET_64BIT ? 4 : 2);
8818 break;
8819 case vector_loop:
8820 need_zero_guard = true;
8821 unroll_factor = 4;
8822 /* Find the widest supported mode. */
8823 move_mode = word_mode;
8824 while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode)
8825 && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
8826 move_mode = wider_mode;
8827
586bbef1 8828 if (TARGET_AVX256_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 128)
2bf6d935 8829 move_mode = TImode;
8830 if (TARGET_AVX512_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 256)
8831 move_mode = OImode;
8832
8833 /* Find the corresponding vector mode with the same size as MOVE_MODE.
8834 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
8835 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
8836 {
8837 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
8838 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
8839 || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
8840 move_mode = word_mode;
8841 }
8842 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
8843 break;
8844 case rep_prefix_8_byte:
8845 move_mode = DImode;
8846 break;
8847 case rep_prefix_4_byte:
8848 move_mode = SImode;
8849 break;
8850 case rep_prefix_1_byte:
8851 move_mode = QImode;
8852 break;
8853 }
8854 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
8855 epilogue_size_needed = size_needed;
8856
8857 /* If we are going to emit any library calls conditionally, make sure any
8858 pending stack adjustments happen before the first conditional branch;
8859 otherwise they will be emitted before the library call only and won't
8860 happen on the other branches. */
8861 if (dynamic_check != -1)
8862 do_pending_stack_adjust ();
8863
8864 desired_align = decide_alignment (align, alg, expected_size, move_mode);
8865 if (!TARGET_ALIGN_STRINGOPS || noalign)
8866 align = desired_align;
8867
8868 /* Step 1: Prologue guard. */
8869
8870 /* Alignment code needs count to be in register. */
8871 if (CONST_INT_P (count_exp) && desired_align > align)
8872 {
8873 if (INTVAL (count_exp) > desired_align
8874 && INTVAL (count_exp) > size_needed)
8875 {
8876 align_bytes
8877 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
8878 if (align_bytes <= 0)
8879 align_bytes = 0;
8880 else
8881 align_bytes = desired_align - align_bytes;
8882 }
8883 if (align_bytes == 0)
8884 count_exp = force_reg (counter_mode (count_exp), count_exp);
8885 }
8886 gcc_assert (desired_align >= 1 && align >= 1);
8887
8888 /* Misaligned move sequences handle both prologue and epilogue at once.
8889 Default code generation results in smaller code for large alignments
8890 and also avoids redundant work when sizes are known precisely. */
8891 misaligned_prologue_used
8892 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
8893 && MAX (desired_align, epilogue_size_needed) <= 32
8894 && desired_align <= epilogue_size_needed
8895 && ((desired_align > align && !align_bytes)
8896 || (!count && epilogue_size_needed > 1)));
8897
8898 /* Do the cheap promotion to allow better CSE across the
8899 main loop and epilogue (i.e. one load of the big constant in
8900 front of all code).
8901 For now the misaligned move sequences do not have a fast path
8902 without broadcasting. */
8903 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
8904 {
8905 if (alg == vector_loop)
8906 {
8907 gcc_assert (val_exp == const0_rtx);
8908 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
8909 promoted_val = promote_duplicated_reg_to_size (val_exp,
8910 GET_MODE_SIZE (word_mode),
8911 desired_align, align);
8912 }
8913 else
8914 {
8915 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
8916 desired_align, align);
8917 }
8918 }
8919 /* Misaligned move sequences handle both prologues and epilogues at once.
8920 Default code generation results in smaller code for large alignments and
8921 also avoids redundant work when sizes are known precisely. */
8922 if (misaligned_prologue_used)
8923 {
8924 /* The misaligned move prologue handles small blocks by itself. */
76715c32 8925 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves
8926 (dst, src, &destreg, &srcreg,
8927 move_mode, promoted_val, vec_promoted_val,
8928 &count_exp,
8929 &jump_around_label,
8930 desired_align < align
8931 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
8932 desired_align, align, &min_size, dynamic_check, issetmem);
8933 if (!issetmem)
8934 src = change_address (src, BLKmode, srcreg);
8935 dst = change_address (dst, BLKmode, destreg);
8936 set_mem_align (dst, desired_align * BITS_PER_UNIT);
8937 epilogue_size_needed = 0;
8938 if (need_zero_guard
8939 && min_size < (unsigned HOST_WIDE_INT) size_needed)
8940 {
8941 /* It is possible that we copied enough so the main loop will not
8942 execute. */
8943 gcc_assert (size_needed > 1);
8944 if (jump_around_label == NULL_RTX)
8945 jump_around_label = gen_label_rtx ();
8946 emit_cmp_and_jump_insns (count_exp,
8947 GEN_INT (size_needed),
8948 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
8949 if (expected_size == -1
8950 || expected_size < (desired_align - align) / 2 + size_needed)
8951 predict_jump (REG_BR_PROB_BASE * 20 / 100);
8952 else
8953 predict_jump (REG_BR_PROB_BASE * 60 / 100);
8954 }
8955 }
8956 /* Ensure that alignment prologue won't copy past end of block. */
8957 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
8958 {
8959 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
8960 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
8961 Make sure it is a power of 2. */
8962 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
8963
8964 /* To improve performance of small blocks, we jump around the VAL
8965 promoting code. This means that if the promoted VAL is not constant,
8966 we might not use it in the epilogue and have to use the byte
8967 loop variant. */
8968 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
8969 force_loopy_epilogue = true;
8970 if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
8971 || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
8972 {
8973 /* If main algorithm works on QImode, no epilogue is needed.
8974 For small sizes just don't align anything. */
8975 if (size_needed == 1)
8976 desired_align = align;
8977 else
8978 goto epilogue;
8979 }
8980 else if (!count
8981 && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
8982 {
8983 label = gen_label_rtx ();
8984 emit_cmp_and_jump_insns (count_exp,
8985 GEN_INT (epilogue_size_needed),
8986 LTU, 0, counter_mode (count_exp), 1, label);
8987 if (expected_size == -1 || expected_size < epilogue_size_needed)
8988 predict_jump (REG_BR_PROB_BASE * 60 / 100);
8989 else
8990 predict_jump (REG_BR_PROB_BASE * 20 / 100);
8991 }
8992 }
8993
8994 /* Emit code to decide at runtime whether a library call or inline code
8995 should be used. */
8996 if (dynamic_check != -1)
8997 {
8998 if (!issetmem && CONST_INT_P (count_exp))
8999 {
9000 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
9001 {
9002 emit_block_copy_via_libcall (dst, src, count_exp);
9003 count_exp = const0_rtx;
9004 goto epilogue;
9005 }
9006 }
9007 else
9008 {
9009 rtx_code_label *hot_label = gen_label_rtx ();
9010 if (jump_around_label == NULL_RTX)
9011 jump_around_label = gen_label_rtx ();
9012 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
9013 LEU, 0, counter_mode (count_exp),
9014 1, hot_label);
9015 predict_jump (REG_BR_PROB_BASE * 90 / 100);
9016 if (issetmem)
9017 set_storage_via_libcall (dst, count_exp, val_exp);
9018 else
9019 emit_block_copy_via_libcall (dst, src, count_exp);
9020 emit_jump (jump_around_label);
9021 emit_label (hot_label);
9022 }
9023 }
9024
9025 /* Step 2: Alignment prologue. */
9026 /* Do the expensive promotion once we branched off the small blocks. */
9027 if (issetmem && !promoted_val)
9028 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
9029 desired_align, align);
9030
9031 if (desired_align > align && !misaligned_prologue_used)
9032 {
9033 if (align_bytes == 0)
9034 {
9035 /* Except for the first move in the prologue, we no longer know
9036 the constant offset in the aliasing info. It doesn't seem worth
9037 the pain to maintain it for the first move, so throw away
9038 the info early. */
9039 dst = change_address (dst, BLKmode, destreg);
9040 if (!issetmem)
9041 src = change_address (src, BLKmode, srcreg);
76715c32 9042 dst = expand_set_or_cpymem_prologue (dst, src, destreg, srcreg,
9043 promoted_val, vec_promoted_val,
9044 count_exp, align, desired_align,
9045 issetmem);
9046 /* At most desired_align - align bytes are copied. */
9047 if (min_size < (unsigned)(desired_align - align))
9048 min_size = 0;
9049 else
9050 min_size -= desired_align - align;
9051 }
9052 else
9053 {
9054 /* If we know how many bytes need to be stored before dst is
9055 sufficiently aligned, maintain aliasing info accurately. */
76715c32 9056 dst = expand_set_or_cpymem_constant_prologue (dst, &src, destreg,
9057 srcreg,
9058 promoted_val,
9059 vec_promoted_val,
9060 desired_align,
9061 align_bytes,
9062 issetmem);
9063
9064 count_exp = plus_constant (counter_mode (count_exp),
9065 count_exp, -align_bytes);
9066 count -= align_bytes;
9067 min_size -= align_bytes;
9068 max_size -= align_bytes;
9069 }
9070 if (need_zero_guard
9071 && min_size < (unsigned HOST_WIDE_INT) size_needed
9072 && (count < (unsigned HOST_WIDE_INT) size_needed
9073 || (align_bytes == 0
9074 && count < ((unsigned HOST_WIDE_INT) size_needed
9075 + desired_align - align))))
9076 {
9077 /* It is possible that we copied enough so the main loop will not
9078 execute. */
9079 gcc_assert (size_needed > 1);
9080 if (label == NULL_RTX)
9081 label = gen_label_rtx ();
9082 emit_cmp_and_jump_insns (count_exp,
9083 GEN_INT (size_needed),
9084 LTU, 0, counter_mode (count_exp), 1, label);
9085 if (expected_size == -1
9086 || expected_size < (desired_align - align) / 2 + size_needed)
9087 predict_jump (REG_BR_PROB_BASE * 20 / 100);
9088 else
9089 predict_jump (REG_BR_PROB_BASE * 60 / 100);
9090 }
9091 }
9092 if (label && size_needed == 1)
9093 {
9094 emit_label (label);
9095 LABEL_NUSES (label) = 1;
9096 label = NULL;
9097 epilogue_size_needed = 1;
9098 if (issetmem)
9099 promoted_val = val_exp;
9100 }
9101 else if (label == NULL_RTX && !misaligned_prologue_used)
9102 epilogue_size_needed = size_needed;
9103
9104 /* Step 3: Main loop. */
9105
9106 switch (alg)
9107 {
9108 case libcall:
9109 case no_stringop:
9110 case last_alg:
9111 gcc_unreachable ();
9112 case loop_1_byte:
9113 case loop:
9114 case unrolled_loop:
76715c32 9115 expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg, promoted_val,
9116 count_exp, move_mode, unroll_factor,
9117 expected_size, issetmem);
9118 break;
9119 case vector_loop:
76715c32 9120 expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg,
9121 vec_promoted_val, count_exp, move_mode,
9122 unroll_factor, expected_size, issetmem);
9123 break;
9124 case rep_prefix_8_byte:
9125 case rep_prefix_4_byte:
9126 case rep_prefix_1_byte:
76715c32 9127 expand_set_or_cpymem_via_rep (dst, src, destreg, srcreg, promoted_val,
9128 val_exp, count_exp, move_mode, issetmem);
9129 break;
9130 }
9131 /* Properly adjust the offset of the src and dest memory for aliasing. */
9132 if (CONST_INT_P (count_exp))
9133 {
9134 if (!issetmem)
9135 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
9136 (count / size_needed) * size_needed);
9137 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
9138 (count / size_needed) * size_needed);
9139 }
9140 else
9141 {
9142 if (!issetmem)
9143 src = change_address (src, BLKmode, srcreg);
9144 dst = change_address (dst, BLKmode, destreg);
9145 }
9146
9147 /* Step 4: Epilogue to copy the remaining bytes. */
9148 epilogue:
9149 if (label)
9150 {
9151 /* When the main loop is done, COUNT_EXP might hold original count,
9152 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
9153 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
9154 bytes. Compensate if needed. */
9155
9156 if (size_needed < epilogue_size_needed)
9157 {
9158 tmp = expand_simple_binop (counter_mode (count_exp), AND, count_exp,
9159 GEN_INT (size_needed - 1), count_exp, 1,
9160 OPTAB_DIRECT);
9161 if (tmp != count_exp)
9162 emit_move_insn (count_exp, tmp);
9163 }
9164 emit_label (label);
9165 LABEL_NUSES (label) = 1;
9166 }
9167
9168 if (count_exp != const0_rtx && epilogue_size_needed > 1)
9169 {
9170 if (force_loopy_epilogue)
9171 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
9172 epilogue_size_needed);
9173 else
9174 {
9175 if (issetmem)
9176 expand_setmem_epilogue (dst, destreg, promoted_val,
9177 vec_promoted_val, count_exp,
9178 epilogue_size_needed);
9179 else
76715c32 9180 expand_cpymem_epilogue (dst, src, destreg, srcreg, count_exp,
9181 epilogue_size_needed);
9182 }
9183 }
9184 if (jump_around_label)
9185 emit_label (jump_around_label);
9186 return true;
9187}
9188
9189/* Expand cmpstrn or memcmp. */
9190
9191bool
9192ix86_expand_cmpstrn_or_cmpmem (rtx result, rtx src1, rtx src2,
9193 rtx length, rtx align, bool is_cmpstrn)
9194{
9195 /* Expand strncmp and memcmp only with -minline-all-stringops since
9196 "repz cmpsb" can be much slower than strncmp and memcmp functions
9197 implemented with vector instructions, see
9198
9199 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43052
9200 */
9201 if (!TARGET_INLINE_ALL_STRINGOPS)
9202 return false;
9203
9204 /* Can't use this if the user has appropriated ecx, esi or edi. */
9205 if (fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG])
9206 return false;
9207
9208 if (is_cmpstrn)
9209 {
 9210      /* For strncmp, length is the maximum length, which can be larger
 9211	 than the actual string lengths.  We can expand the cmpstrn pattern
 9212	 to "repz cmpsb" only if one of the strings is a constant, so that
 9213	 expand_builtin_strncmp() can rewrite the length argument to be the
 9214	 minimum of the constant string length and the actual length
 9215	 argument.  Otherwise, "repz cmpsb" may run past the terminating NUL byte.  */
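      /* For example (a hypothetical call, purely to illustrate):
	   strncmp (buf, "abc", 100)
	 is expanded with the length clamped to the constant string "abc",
	 so the "repz cmpsb" loop cannot run past its terminating NUL.  */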
9216 tree t1 = MEM_EXPR (src1);
9217 tree t2 = MEM_EXPR (src2);
9218 if (!((t1 && TREE_CODE (t1) == MEM_REF
9219 && TREE_CODE (TREE_OPERAND (t1, 0)) == ADDR_EXPR
9220 && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t1, 0), 0))
9221 == STRING_CST))
9222 || (t2 && TREE_CODE (t2) == MEM_REF
9223 && TREE_CODE (TREE_OPERAND (t2, 0)) == ADDR_EXPR
9224 && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t2, 0), 0))
9225 == STRING_CST))))
9226 return false;
9227 }
9228
9229 rtx addr1 = copy_addr_to_reg (XEXP (src1, 0));
9230 rtx addr2 = copy_addr_to_reg (XEXP (src2, 0));
9231 if (addr1 != XEXP (src1, 0))
9232 src1 = replace_equiv_address_nv (src1, addr1);
9233 if (addr2 != XEXP (src2, 0))
9234 src2 = replace_equiv_address_nv (src2, addr2);
9235
 9236   /* NB: Make a copy of the data length so that the cmpstrnqi patterns
 9237	 do not modify the original length.  */
9238 length = ix86_zero_extend_to_Pmode (length);
9239 rtx lengthreg = gen_reg_rtx (Pmode);
9240 emit_move_insn (lengthreg, length);
9241
9242 /* If we are testing strict equality, we can use known alignment to
9243 good advantage. This may be possible with combine, particularly
9244 once cc0 is dead. */
9245 if (CONST_INT_P (length))
9246 {
9247 if (length == const0_rtx)
9248 {
9249 emit_move_insn (result, const0_rtx);
9250 return true;
9251 }
9252 emit_insn (gen_cmpstrnqi_nz_1 (addr1, addr2, lengthreg, align,
9253 src1, src2));
9254 }
9255 else
9256 {
9257 emit_insn (gen_cmp_1 (Pmode, lengthreg, lengthreg));
9258 emit_insn (gen_cmpstrnqi_1 (addr1, addr2, lengthreg, align,
9259 src1, src2));
9260 }
9261
9262 rtx out = gen_lowpart (QImode, result);
9263 emit_insn (gen_cmpintqi (out));
9264 emit_move_insn (result, gen_rtx_SIGN_EXTEND (SImode, out));
9265
9266 return true;
9267}
9268
 9269/* Expand the appropriate insns for doing strlen if not just doing
 9270   repnz; scasb
 9271
 9272   out = result, initialized with the start address
 9273   align_rtx = alignment of the address.
 9274   scratch = scratch register, initialized with the start address when
 9275	not aligned, otherwise undefined
 9276
 9277   This is just the body.  It needs the initializations mentioned above and
 9278   some address computation at the end.  These things are done in i386.md.  */
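/* Roughly: a byte-by-byte prologue runs until OUT is 4-byte aligned, then a
   loop loads 4 bytes at a time and applies the
   (x - 0x01010101) & ~x & 0x80808080 zero-byte test, and finally a mostly
   branchless fixup steps OUT back to the exact position of the NUL.  */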
9279
9280static void
9281ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
9282{
9283 int align;
9284 rtx tmp;
9285 rtx_code_label *align_2_label = NULL;
9286 rtx_code_label *align_3_label = NULL;
9287 rtx_code_label *align_4_label = gen_label_rtx ();
9288 rtx_code_label *end_0_label = gen_label_rtx ();
9289 rtx mem;
9290 rtx tmpreg = gen_reg_rtx (SImode);
9291 rtx scratch = gen_reg_rtx (SImode);
9292 rtx cmp;
9293
9294 align = 0;
9295 if (CONST_INT_P (align_rtx))
9296 align = INTVAL (align_rtx);
9297
9298 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
9299
9300 /* Is there a known alignment and is it less than 4? */
9301 if (align < 4)
9302 {
9303 rtx scratch1 = gen_reg_rtx (Pmode);
9304 emit_move_insn (scratch1, out);
9305 /* Is there a known alignment and is it not 2? */
9306 if (align != 2)
9307 {
9308 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
9309 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
9310
9311 /* Leave just the 3 lower bits. */
9312 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
9313 NULL_RTX, 0, OPTAB_WIDEN);
9314
9315 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
9316 Pmode, 1, align_4_label);
9317 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
9318 Pmode, 1, align_2_label);
9319 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
9320 Pmode, 1, align_3_label);
9321 }
9322 else
9323 {
 9324	  /* Since the alignment is 2, we have to check 2 or 0 bytes;
 9325	     check whether the pointer is already aligned to a 4-byte boundary.  */
9326
9327 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
9328 NULL_RTX, 0, OPTAB_WIDEN);
9329
9330 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
9331 Pmode, 1, align_4_label);
9332 }
9333
9334 mem = change_address (src, QImode, out);
9335
9336 /* Now compare the bytes. */
9337
 9338      /* Compare the first n unaligned bytes one byte at a time.  */
9339 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
9340 QImode, 1, end_0_label);
9341
9342 /* Increment the address. */
d9330fb5 9343 emit_insn (gen_add2_insn (out, const1_rtx));
9344
9345 /* Not needed with an alignment of 2 */
9346 if (align != 2)
9347 {
9348 emit_label (align_2_label);
9349
9350 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
9351 end_0_label);
9352
d9330fb5 9353 emit_insn (gen_add2_insn (out, const1_rtx));
9354
9355 emit_label (align_3_label);
9356 }
9357
9358 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
9359 end_0_label);
9360
d9330fb5 9361 emit_insn (gen_add2_insn (out, const1_rtx));
9362 }
9363
 9364   /* Generate a loop that checks 4 bytes at a time.  It is not worth
 9365      aligning this loop: doing so only bloats the code and does not
 9366      speed it up.  */
9367 emit_label (align_4_label);
9368
9369 mem = change_address (src, SImode, out);
9370 emit_move_insn (scratch, mem);
d9330fb5 9371 emit_insn (gen_add2_insn (out, GEN_INT (4)));
9372
 9373   /* This formula yields a nonzero result iff one of the bytes is zero.
 9374      This saves three branches inside the loop and many cycles.  */
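  /* A sketch of the per-byte check (the same reasoning applies to each of
     the four byte positions independently):
	 byte 0x00: (0x00 - 1) & ~0x00 & 0x80 = 0xff & 0xff & 0x80 = 0x80
	 byte 0x01: (0x01 - 1) & ~0x01 & 0x80 = 0x00 & 0xfe & 0x80 = 0x00
	 byte 0x80: (0x80 - 1) & ~0x80 & 0x80 = 0x7f & 0x7f & 0x80 = 0x00
     A borrow from a lower byte can only happen after a zero byte has
     already been found, so the first zero byte is never missed.  */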
9375
9376 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
9377 emit_insn (gen_one_cmplsi2 (scratch, scratch));
9378 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
9379 emit_insn (gen_andsi3 (tmpreg, tmpreg,
9380 gen_int_mode (0x80808080, SImode)));
9381 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
9382 align_4_label);
9383
9384 if (TARGET_CMOVE)
9385 {
9386 rtx reg = gen_reg_rtx (SImode);
9387 rtx reg2 = gen_reg_rtx (Pmode);
9388 emit_move_insn (reg, tmpreg);
9389 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
9390
9391 /* If zero is not in the first two bytes, move two bytes forward. */
9392 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
9393 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
9394 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
9395 emit_insn (gen_rtx_SET (tmpreg,
9396 gen_rtx_IF_THEN_ELSE (SImode, tmp,
9397 reg,
9398 tmpreg)));
9399 /* Emit lea manually to avoid clobbering of flags. */
c3185b64 9400 emit_insn (gen_rtx_SET (reg2, plus_constant (Pmode, out, 2)));
9401
9402 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
9403 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
9404 emit_insn (gen_rtx_SET (out,
9405 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
9406 reg2,
9407 out)));
9408 }
9409 else
9410 {
9411 rtx_code_label *end_2_label = gen_label_rtx ();
9412 /* Is zero in the first two bytes? */
9413
9414 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
9415 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
9416 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
9417 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
9418 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
9419 pc_rtx);
9420 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
9421 JUMP_LABEL (tmp) = end_2_label;
9422
9423 /* Not in the first two. Move two bytes forward. */
9424 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
d9330fb5 9425 emit_insn (gen_add2_insn (out, const2_rtx));
9426
9427 emit_label (end_2_label);
9428
9429 }
9430
9431 /* Avoid branch in fixing the byte. */
9432 tmpreg = gen_lowpart (QImode, tmpreg);
9433 emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
9434 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
9435 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
d9330fb5 9436 emit_insn (gen_sub3_carry (Pmode, out, out, GEN_INT (3), tmp, cmp));
9437
9438 emit_label (end_0_label);
9439}
9440
9441/* Expand strlen. */
9442
9443bool
9444ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
9445{
9446if (TARGET_UNROLL_STRLEN
9447 && TARGET_INLINE_ALL_STRINGOPS
9448 && eoschar == const0_rtx
9449 && optimize > 1)
9450 {
 9451      /* The generic strlen expansion is long.  Avoid expanding it
 9452	 unless TARGET_INLINE_ALL_STRINGOPS.  */
 9453      rtx addr = force_reg (Pmode, XEXP (src, 0));
 9454      /* It seems that the optimizer does not combine a call like
 9455	 foo (strlen (bar), strlen (bar));
 9456	 when the move and the subtraction are done here.  It does compute
 9457	 the length just once when these instructions are emitted inside
 9458	 output_strlen_unroll ().  But since &bar[strlen (bar)] is often used,
 9459	 and this uses one fewer register for the lifetime of
 9460	 output_strlen_unroll (), doing it here is better.  */
9461
9462 emit_move_insn (out, addr);
9463
9464 ix86_expand_strlensi_unroll_1 (out, src, align);
9465
9466 /* strlensi_unroll_1 returns the address of the zero at the end of
9467 the string, like memchr(), so compute the length by subtracting
9468 the start address. */
d9330fb5 9469 emit_insn (gen_sub2_insn (out, addr));
9470 return true;
9471 }
9472 else
9473 return false;
9474}
9475
 9476/* For a given symbol (function), construct code to compute the address of
 9477   its PLT entry in the large x86-64 PIC model.  */
9478
9479static rtx
9480construct_plt_address (rtx symbol)
9481{
9482 rtx tmp, unspec;
9483
9484 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
9485 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
9486 gcc_assert (Pmode == DImode);
9487
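  /* Roughly: tmp = SYMBOL@PLTOFF; tmp += PIC base register, yielding the
     runtime address of SYMBOL's PLT entry.  */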
9488 tmp = gen_reg_rtx (Pmode);
9489 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
9490
9491 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
d9330fb5 9492 emit_insn (gen_add2_insn (tmp, pic_offset_table_rtx));
9493 return tmp;
9494}
9495
9496/* Additional registers that are clobbered by SYSV calls. */
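/* These registers are call-clobbered in the SysV ABI but call-saved in the
   MS ABI, so an MS-ABI caller must assume a SysV callee clobbers them.  */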
9497
9498static int const x86_64_ms_sysv_extra_clobbered_registers
9499 [NUM_X86_64_MS_CLOBBERED_REGS] =
9500{
9501 SI_REG, DI_REG,
9502 XMM6_REG, XMM7_REG,
9503 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
9504 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
9505};
9506
9507rtx_insn *
9508ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
9509 rtx callarg2,
9510 rtx pop, bool sibcall)
9511{
9512 rtx vec[3];
9513 rtx use = NULL, call;
9514 unsigned int vec_len = 0;
9515 tree fndecl;
9516
9517 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
9518 {
9519 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
9520 if (fndecl
9521 && (lookup_attribute ("interrupt",
9522 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
a9c697b8 9523 error ("interrupt service routine cannot be called directly");
9524 }
9525 else
9526 fndecl = NULL_TREE;
9527
9528 if (pop == const0_rtx)
9529 pop = NULL;
9530 gcc_assert (!TARGET_64BIT || !pop);
9531
41bd1b19 9532 rtx addr = XEXP (fnaddr, 0);
9533 if (TARGET_MACHO && !TARGET_64BIT)
9534 {
9535#if TARGET_MACHO
9536 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
9537 fnaddr = machopic_indirect_call_target (fnaddr);
9538#endif
9539 }
9540 else
9541 {
9542 /* Static functions and indirect calls don't need the pic register. Also,
9543 check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
9544 it an indirect call. */
9545 if (flag_pic
9546 && GET_CODE (addr) == SYMBOL_REF
f7854b90 9547 && ix86_call_use_plt_p (addr))
9548 {
9549 if (flag_plt
9550 && (SYMBOL_REF_DECL (addr) == NULL_TREE
9551 || !lookup_attribute ("noplt",
9552 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
9553 {
9554 if (!TARGET_64BIT
9555 || (ix86_cmodel == CM_LARGE_PIC
9556 && DEFAULT_ABI != MS_ABI))
9557 {
9558 use_reg (&use, gen_rtx_REG (Pmode,
9559 REAL_PIC_OFFSET_TABLE_REGNUM));
9560 if (ix86_use_pseudo_pic_reg ())
9561 emit_move_insn (gen_rtx_REG (Pmode,
9562 REAL_PIC_OFFSET_TABLE_REGNUM),
9563 pic_offset_table_rtx);
9564 }
9565 }
9566 else if (!TARGET_PECOFF && !TARGET_MACHO)
9567 {
9568 if (TARGET_64BIT
9569 && ix86_cmodel == CM_LARGE_PIC
9570 && DEFAULT_ABI != MS_ABI)
9571 {
9572 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
9573 UNSPEC_GOT);
9574 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
9575 fnaddr = force_reg (Pmode, fnaddr);
9576 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, fnaddr);
9577 }
9578 else if (TARGET_64BIT)
9579 {
9580 fnaddr = gen_rtx_UNSPEC (Pmode,
9581 gen_rtvec (1, addr),
9582 UNSPEC_GOTPCREL);
9583 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
9584 }
9585 else
9586 {
9587 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
9588 UNSPEC_GOT);
9589 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
9590 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
9591 fnaddr);
9592 }
9593 fnaddr = gen_const_mem (Pmode, fnaddr);
9594 /* Pmode may not be the same as word_mode for x32, which
9595 doesn't support indirect branch via 32-bit memory slot.
9596 Since x32 GOT slot is 64 bit with zero upper 32 bits,
9597 indirect branch via x32 GOT slot is OK. */
9598 if (GET_MODE (fnaddr) != word_mode)
9599 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
9600 fnaddr = gen_rtx_MEM (QImode, fnaddr);
9601 }
9602 }
9603 }
9604
9605 /* Skip setting up RAX register for -mskip-rax-setup when there are no
9606 parameters passed in vector registers. */
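  /* In the 64-bit SysV ABI, AL tells a varargs callee how many vector
     registers hold arguments; the count in CALLARG2 is moved into AL below.  */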
9607 if (TARGET_64BIT
9608 && (INTVAL (callarg2) > 0
9609 || (INTVAL (callarg2) == 0
9610 && (TARGET_SSE || !flag_skip_rax_setup))))
9611 {
9612 rtx al = gen_rtx_REG (QImode, AX_REG);
9613 emit_move_insn (al, callarg2);
9614 use_reg (&use, al);
9615 }
9616
9617 if (ix86_cmodel == CM_LARGE_PIC
9618 && !TARGET_PECOFF
9619 && MEM_P (fnaddr)
9620 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
9621 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
9622 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
9623 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
9624 branch via x32 GOT slot is OK. */
9625 else if (!(TARGET_X32
9626 && MEM_P (fnaddr)
9627 && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
9628 && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
9629 && (sibcall
9630 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
9631 : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
9632 {
9633 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
9634 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
9635 }
9636
bb576017 9637   /* PR100665: Hwasan may tag a code pointer, which is not supported by
 9638	 LAM; mask off code pointers here.
 9639	 TODO: also need to handle indirect jumps.  */
9640 if (ix86_memtag_can_tag_addresses () && !fndecl
9641 && sanitize_flags_p (SANITIZE_HWADDRESS))
9642 {
9643 rtx untagged_addr = ix86_memtag_untagged_pointer (XEXP (fnaddr, 0),
9644 NULL_RTX);
9645 fnaddr = gen_rtx_MEM (QImode, untagged_addr);
9646 }
9647
9648 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
9649
9650 if (retval)
9651 call = gen_rtx_SET (retval, call);
9652 vec[vec_len++] = call;
9653
9654 if (pop)
9655 {
9656 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
9657 pop = gen_rtx_SET (stack_pointer_rtx, pop);
9658 vec[vec_len++] = pop;
9659 }
9660
9661 if (cfun->machine->no_caller_saved_registers
9662 && (!fndecl
9663 || (!TREE_THIS_VOLATILE (fndecl)
9664 && !lookup_attribute ("no_caller_saved_registers",
9665 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
9666 {
9667 static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
9668 bool is_64bit_ms_abi = (TARGET_64BIT
9669 && ix86_function_abi (fndecl) == MS_ABI);
9670 char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
9671
 9672      /* If there are no caller-saved registers, add clobbers for all
 9673	 registers that are clobbered by a call which returns.  */
9674 for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
9675 if (!fixed_regs[i]
9676 && (ix86_call_used_regs[i] == 1
9677 || (ix86_call_used_regs[i] & c_mask))
9678 && !STACK_REGNO_P (i)
9679 && !MMX_REGNO_P (i))
9680 clobber_reg (&use,
9681 gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
9682 }
9683 else if (TARGET_64BIT_MS_ABI
9684 && (!callarg2 || INTVAL (callarg2) != -2))
9685 {
9686 unsigned i;
9687
9688 for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
9689 {
9690 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
9691 machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
9692
9693 clobber_reg (&use, gen_rtx_REG (mode, regno));
9694 }
9695
9696 /* Set here, but it may get cleared later. */
9697 if (TARGET_CALL_MS2SYSV_XLOGUES)
9698 {
9699 if (!TARGET_SSE)
9700 ;
9701
9702 /* Don't break hot-patched functions. */
9703 else if (ix86_function_ms_hook_prologue (current_function_decl))
9704 ;
9705
9706 /* TODO: Cases not yet examined. */
9707 else if (flag_split_stack)
9708 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
9709
9710 else
9711 {
9712 gcc_assert (!reload_completed);
9713 cfun->machine->call_ms2sysv = true;
9714 }
9715 }
9716 }
9717
9718 if (TARGET_MACHO && TARGET_64BIT && !sibcall
9719 && ((GET_CODE (addr) == SYMBOL_REF && !SYMBOL_REF_LOCAL_P (addr))
9720 || !fndecl || TREE_PUBLIC (fndecl)))
9721 {
9722 /* We allow public functions defined in a TU to bind locally for PIC
9723 code (the default) on 64bit Mach-O.
9724 If such functions are not inlined, we cannot tell at compile-time if
9725 they will be called via the lazy symbol resolver (this can depend on
9726 options given at link-time). Therefore, we must assume that the lazy
9727 resolver could be used which clobbers R11 and R10. */
9728 clobber_reg (&use, gen_rtx_REG (DImode, R11_REG));
9729 clobber_reg (&use, gen_rtx_REG (DImode, R10_REG));
9730 }
9731
9732 if (vec_len > 1)
9733 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
9734 rtx_insn *call_insn = emit_call_insn (call);
9735 if (use)
9736 CALL_INSN_FUNCTION_USAGE (call_insn) = use;
9737
9738 return call_insn;
9739}
9740
 9741/* Split a simple return that pops POPC bytes from the stack into an
 9742   indirect branch plus a stack adjustment.  */
9743
9744void
9745ix86_split_simple_return_pop_internal (rtx popc)
9746{
9747 struct machine_function *m = cfun->machine;
9748 rtx ecx = gen_rtx_REG (SImode, CX_REG);
9749 rtx_insn *insn;
9750
9751 /* There is no "pascal" calling convention in any 64bit ABI. */
9752 gcc_assert (!TARGET_64BIT);
9753
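  /* The sequence emitted below is, roughly:
	popl  %ecx		return address -> %ecx
	addl  $POPC, %esp	drop the bytes the callee must pop
	jmp   *%ecx		return  */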
9754 insn = emit_insn (gen_pop (ecx));
9755 m->fs.cfa_offset -= UNITS_PER_WORD;
9756 m->fs.sp_offset -= UNITS_PER_WORD;
9757
9758 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
9759 x = gen_rtx_SET (stack_pointer_rtx, x);
9760 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
9761 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
9762 RTX_FRAME_RELATED_P (insn) = 1;
9763
9764 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc);
9765 x = gen_rtx_SET (stack_pointer_rtx, x);
9766 insn = emit_insn (x);
9767 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
9768 RTX_FRAME_RELATED_P (insn) = 1;
9769
9770 /* Now return address is in ECX. */
9771 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
9772}
9773
9774/* Errors in the source file can cause expand_expr to return const0_rtx
9775 where we expect a vector. To avoid crashing, use one of the vector
9776 clear instructions. */
9777
9778static rtx
9779safe_vector_operand (rtx x, machine_mode mode)
9780{
9781 if (x == const0_rtx)
9782 x = CONST0_RTX (mode);
9783 return x;
9784}
9785
9786/* Subroutine of ix86_expand_builtin to take care of binop insns. */
9787
9788static rtx
9789ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
9790{
9791 rtx pat;
9792 tree arg0 = CALL_EXPR_ARG (exp, 0);
9793 tree arg1 = CALL_EXPR_ARG (exp, 1);
9794 rtx op0 = expand_normal (arg0);
9795 rtx op1 = expand_normal (arg1);
9796 machine_mode tmode = insn_data[icode].operand[0].mode;
9797 machine_mode mode0 = insn_data[icode].operand[1].mode;
9798 machine_mode mode1 = insn_data[icode].operand[2].mode;
9799
9800 if (VECTOR_MODE_P (mode0))
9801 op0 = safe_vector_operand (op0, mode0);
9802 if (VECTOR_MODE_P (mode1))
9803 op1 = safe_vector_operand (op1, mode1);
9804
9805 if (optimize || !target
9806 || GET_MODE (target) != tmode
9807 || !insn_data[icode].operand[0].predicate (target, tmode))
9808 target = gen_reg_rtx (tmode);
9809
9810 if (GET_MODE (op1) == SImode && mode1 == TImode)
9811 {
9812 rtx x = gen_reg_rtx (V4SImode);
9813 emit_insn (gen_sse2_loadd (x, op1));
9814 op1 = gen_lowpart (TImode, x);
9815 }
9816
9817 if (!insn_data[icode].operand[1].predicate (op0, mode0))
9818 op0 = copy_to_mode_reg (mode0, op0);
9819 if (!insn_data[icode].operand[2].predicate (op1, mode1))
9820 op1 = copy_to_mode_reg (mode1, op1);
9821
9822 pat = GEN_FCN (icode) (target, op0, op1);
9823 if (! pat)
9824 return 0;
9825
9826 emit_insn (pat);
9827
9828 return target;
9829}
9830
9831/* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
9832
9833static rtx
9834ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
9835 enum ix86_builtin_func_type m_type,
9836 enum rtx_code sub_code)
9837{
9838 rtx pat;
715a8bc8 9839 unsigned int i, nargs;
9840 bool comparison_p = false;
9841 bool tf_p = false;
9842 bool last_arg_constant = false;
9843 int num_memory = 0;
715a8bc8 9844 rtx xops[4];
9845
9846 machine_mode tmode = insn_data[icode].operand[0].mode;
9847
9848 switch (m_type)
9849 {
9850 case MULTI_ARG_4_DF2_DI_I:
9851 case MULTI_ARG_4_DF2_DI_I1:
9852 case MULTI_ARG_4_SF2_SI_I:
9853 case MULTI_ARG_4_SF2_SI_I1:
9854 nargs = 4;
9855 last_arg_constant = true;
9856 break;
9857
9858 case MULTI_ARG_3_SF:
9859 case MULTI_ARG_3_DF:
9860 case MULTI_ARG_3_SF2:
9861 case MULTI_ARG_3_DF2:
9862 case MULTI_ARG_3_DI:
9863 case MULTI_ARG_3_SI:
9864 case MULTI_ARG_3_SI_DI:
9865 case MULTI_ARG_3_HI:
9866 case MULTI_ARG_3_HI_SI:
9867 case MULTI_ARG_3_QI:
9868 case MULTI_ARG_3_DI2:
9869 case MULTI_ARG_3_SI2:
9870 case MULTI_ARG_3_HI2:
9871 case MULTI_ARG_3_QI2:
9872 nargs = 3;
9873 break;
9874
9875 case MULTI_ARG_2_SF:
9876 case MULTI_ARG_2_DF:
9877 case MULTI_ARG_2_DI:
9878 case MULTI_ARG_2_SI:
9879 case MULTI_ARG_2_HI:
9880 case MULTI_ARG_2_QI:
9881 nargs = 2;
9882 break;
9883
9884 case MULTI_ARG_2_DI_IMM:
9885 case MULTI_ARG_2_SI_IMM:
9886 case MULTI_ARG_2_HI_IMM:
9887 case MULTI_ARG_2_QI_IMM:
9888 nargs = 2;
9889 last_arg_constant = true;
9890 break;
9891
9892 case MULTI_ARG_1_SF:
9893 case MULTI_ARG_1_DF:
9894 case MULTI_ARG_1_SF2:
9895 case MULTI_ARG_1_DF2:
9896 case MULTI_ARG_1_DI:
9897 case MULTI_ARG_1_SI:
9898 case MULTI_ARG_1_HI:
9899 case MULTI_ARG_1_QI:
9900 case MULTI_ARG_1_SI_DI:
9901 case MULTI_ARG_1_HI_DI:
9902 case MULTI_ARG_1_HI_SI:
9903 case MULTI_ARG_1_QI_DI:
9904 case MULTI_ARG_1_QI_SI:
9905 case MULTI_ARG_1_QI_HI:
9906 nargs = 1;
9907 break;
9908
9909 case MULTI_ARG_2_DI_CMP:
9910 case MULTI_ARG_2_SI_CMP:
9911 case MULTI_ARG_2_HI_CMP:
9912 case MULTI_ARG_2_QI_CMP:
9913 nargs = 2;
9914 comparison_p = true;
9915 break;
9916
9917 case MULTI_ARG_2_SF_TF:
9918 case MULTI_ARG_2_DF_TF:
9919 case MULTI_ARG_2_DI_TF:
9920 case MULTI_ARG_2_SI_TF:
9921 case MULTI_ARG_2_HI_TF:
9922 case MULTI_ARG_2_QI_TF:
9923 nargs = 2;
9924 tf_p = true;
9925 break;
9926
9927 default:
9928 gcc_unreachable ();
9929 }
9930
9931 if (optimize || !target
9932 || GET_MODE (target) != tmode
9933 || !insn_data[icode].operand[0].predicate (target, tmode))
9934 target = gen_reg_rtx (tmode);
9935 else if (memory_operand (target, tmode))
9936 num_memory++;
9937
715a8bc8 9938 gcc_assert (nargs <= ARRAY_SIZE (xops));
9939
9940 for (i = 0; i < nargs; i++)
9941 {
9942 tree arg = CALL_EXPR_ARG (exp, i);
9943 rtx op = expand_normal (arg);
9944 int adjust = (comparison_p) ? 1 : 0;
9945 machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
9946
9947 if (last_arg_constant && i == nargs - 1)
9948 {
9949 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
9950 {
9951 enum insn_code new_icode = icode;
9952 switch (icode)
9953 {
9954 case CODE_FOR_xop_vpermil2v2df3:
9955 case CODE_FOR_xop_vpermil2v4sf3:
9956 case CODE_FOR_xop_vpermil2v4df3:
9957 case CODE_FOR_xop_vpermil2v8sf3:
9958 error ("the last argument must be a 2-bit immediate");
9959 return gen_reg_rtx (tmode);
9960 case CODE_FOR_xop_rotlv2di3:
9961 new_icode = CODE_FOR_rotlv2di3;
9962 goto xop_rotl;
9963 case CODE_FOR_xop_rotlv4si3:
9964 new_icode = CODE_FOR_rotlv4si3;
9965 goto xop_rotl;
9966 case CODE_FOR_xop_rotlv8hi3:
9967 new_icode = CODE_FOR_rotlv8hi3;
9968 goto xop_rotl;
9969 case CODE_FOR_xop_rotlv16qi3:
9970 new_icode = CODE_FOR_rotlv16qi3;
9971 xop_rotl:
9972 if (CONST_INT_P (op))
9973 {
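		  /* Constant rotate counts wrap modulo the element width, e.g. a
		     count of 70 on 64-bit elements becomes 70 & 63 == 6.  */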
9974 int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
9975 op = GEN_INT (INTVAL (op) & mask);
9976 gcc_checking_assert
9977 (insn_data[icode].operand[i + 1].predicate (op, mode));
9978 }
9979 else
9980 {
9981 gcc_checking_assert
9982 (nargs == 2
9983 && insn_data[new_icode].operand[0].mode == tmode
9984 && insn_data[new_icode].operand[1].mode == tmode
9985 && insn_data[new_icode].operand[2].mode == mode
9986 && insn_data[new_icode].operand[0].predicate
9987 == insn_data[icode].operand[0].predicate
9988 && insn_data[new_icode].operand[1].predicate
9989 == insn_data[icode].operand[1].predicate);
9990 icode = new_icode;
9991 goto non_constant;
9992 }
9993 break;
9994 default:
9995 gcc_unreachable ();
9996 }
9997 }
9998 }
9999 else
10000 {
10001 non_constant:
10002 if (VECTOR_MODE_P (mode))
10003 op = safe_vector_operand (op, mode);
10004
10005 /* If we aren't optimizing, only allow one memory operand to be
10006 generated. */
10007 if (memory_operand (op, mode))
10008 num_memory++;
10009
10010 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
10011
10012 if (optimize
10013 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
10014 || num_memory > 1)
10015 op = force_reg (mode, op);
10016 }
10017
715a8bc8 10018 xops[i] = op;
10019 }
10020
10021 switch (nargs)
10022 {
10023 case 1:
715a8bc8 10024 pat = GEN_FCN (icode) (target, xops[0]);
10025 break;
10026
10027 case 2:
10028 if (tf_p)
715a8bc8 10029 pat = GEN_FCN (icode) (target, xops[0], xops[1],
10030 GEN_INT ((int)sub_code));
10031 else if (! comparison_p)
715a8bc8 10032 pat = GEN_FCN (icode) (target, xops[0], xops[1]);
10033 else
10034 {
10035 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
715a8bc8 10036 xops[0], xops[1]);
2bf6d935 10037
715a8bc8 10038 pat = GEN_FCN (icode) (target, cmp_op, xops[0], xops[1]);
10039 }
10040 break;
10041
10042 case 3:
715a8bc8 10043 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
10044 break;
10045
10046 case 4:
715a8bc8 10047 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2], xops[3]);
10048 break;
10049
10050 default:
10051 gcc_unreachable ();
10052 }
10053
10054 if (! pat)
10055 return 0;
10056
10057 emit_insn (pat);
10058 return target;
10059}
10060
10061/* Subroutine of ix86_expand_args_builtin to take care of scalar unop
10062 insns with vec_merge. */
10063
10064static rtx
10065ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
10066 rtx target)
10067{
10068 rtx pat;
10069 tree arg0 = CALL_EXPR_ARG (exp, 0);
10070 rtx op1, op0 = expand_normal (arg0);
10071 machine_mode tmode = insn_data[icode].operand[0].mode;
10072 machine_mode mode0 = insn_data[icode].operand[1].mode;
10073
10074 if (optimize || !target
10075 || GET_MODE (target) != tmode
10076 || !insn_data[icode].operand[0].predicate (target, tmode))
10077 target = gen_reg_rtx (tmode);
10078
10079 if (VECTOR_MODE_P (mode0))
10080 op0 = safe_vector_operand (op0, mode0);
10081
10082 if ((optimize && !register_operand (op0, mode0))
10083 || !insn_data[icode].operand[1].predicate (op0, mode0))
10084 op0 = copy_to_mode_reg (mode0, op0);
10085
10086 op1 = op0;
10087 if (!insn_data[icode].operand[2].predicate (op1, mode0))
10088 op1 = copy_to_mode_reg (mode0, op1);
10089
10090 pat = GEN_FCN (icode) (target, op0, op1);
10091 if (! pat)
10092 return 0;
10093 emit_insn (pat);
10094 return target;
10095}
10096
10097/* Subroutine of ix86_expand_builtin to take care of comparison insns. */
10098
10099static rtx
10100ix86_expand_sse_compare (const struct builtin_description *d,
10101 tree exp, rtx target, bool swap)
10102{
10103 rtx pat;
10104 tree arg0 = CALL_EXPR_ARG (exp, 0);
10105 tree arg1 = CALL_EXPR_ARG (exp, 1);
10106 rtx op0 = expand_normal (arg0);
10107 rtx op1 = expand_normal (arg1);
10108 rtx op2;
10109 machine_mode tmode = insn_data[d->icode].operand[0].mode;
10110 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
10111 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
10112 enum rtx_code comparison = d->comparison;
10113
10114 if (VECTOR_MODE_P (mode0))
10115 op0 = safe_vector_operand (op0, mode0);
10116 if (VECTOR_MODE_P (mode1))
10117 op1 = safe_vector_operand (op1, mode1);
10118
10119 /* Swap operands if we have a comparison that isn't available in
10120 hardware. */
10121 if (swap)
10122 std::swap (op0, op1);
10123
10124 if (optimize || !target
10125 || GET_MODE (target) != tmode
10126 || !insn_data[d->icode].operand[0].predicate (target, tmode))
10127 target = gen_reg_rtx (tmode);
10128
10129 if ((optimize && !register_operand (op0, mode0))
10130 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
10131 op0 = copy_to_mode_reg (mode0, op0);
10132 if ((optimize && !register_operand (op1, mode1))
10133 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
10134 op1 = copy_to_mode_reg (mode1, op1);
10135
10136 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
10137 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
10138 if (! pat)
10139 return 0;
10140 emit_insn (pat);
10141 return target;
10142}
10143
ae69e6f6 10144/* Subroutine of ix86_sse_comi and ix86_sse_comi_round to take care of
10145 * ordered EQ or unordered NE, generate PF jump. */
10146
10147static rtx
10148ix86_ssecom_setcc (const enum rtx_code comparison,
10149 bool check_unordered, machine_mode mode,
10150 rtx set_dst, rtx target)
10151{
10152
10153 rtx_code_label *label = NULL;
10154
 10155   /* NB: For ordered EQ or unordered NE, checking ZF alone isn't
 10156      sufficient with NaN operands.  */
10157 if (check_unordered)
10158 {
10159 gcc_assert (comparison == EQ || comparison == NE);
10160
10161 rtx flag = gen_rtx_REG (CCFPmode, FLAGS_REG);
10162 label = gen_label_rtx ();
10163 rtx tmp = gen_rtx_fmt_ee (UNORDERED, VOIDmode, flag, const0_rtx);
10164 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
10165 gen_rtx_LABEL_REF (VOIDmode, label),
10166 pc_rtx);
10167 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
10168 }
10169
 10170   /* NB: The insn sets CCFPmode; check a different CCmode which is a
 10171      subset of CCFPmode.  */
10172 if (GET_MODE (set_dst) != mode)
10173 {
10174 gcc_assert (mode == CCAmode || mode == CCCmode
10175 || mode == CCOmode || mode == CCPmode
10176 || mode == CCSmode || mode == CCZmode);
10177 set_dst = gen_rtx_REG (mode, FLAGS_REG);
10178 }
10179
10180 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10181 gen_rtx_fmt_ee (comparison, QImode,
10182 set_dst,
10183 const0_rtx)));
10184
10185 if (label)
10186 emit_label (label);
10187
10188 return SUBREG_REG (target);
10189}
10190
10191/* Subroutine of ix86_expand_builtin to take care of comi insns. */
10192
10193static rtx
10194ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
10195 rtx target)
10196{
ae69e6f6 10197 rtx pat, set_dst;
10198 tree arg0 = CALL_EXPR_ARG (exp, 0);
10199 tree arg1 = CALL_EXPR_ARG (exp, 1);
10200 rtx op0 = expand_normal (arg0);
10201 rtx op1 = expand_normal (arg1);
ae69e6f6 10202 enum insn_code icode = d->icode;
10203 const struct insn_data_d *insn_p = &insn_data[icode];
10204 machine_mode mode0 = insn_p->operand[0].mode;
10205 machine_mode mode1 = insn_p->operand[1].mode;
10206
10207 if (VECTOR_MODE_P (mode0))
10208 op0 = safe_vector_operand (op0, mode0);
10209 if (VECTOR_MODE_P (mode1))
10210 op1 = safe_vector_operand (op1, mode1);
10211
ae69e6f6 10212 enum rtx_code comparison = d->comparison;
10213 rtx const_val = const0_rtx;
10214
10215 bool check_unordered = false;
10216 machine_mode mode = CCFPmode;
10217 switch (comparison)
10218 {
10219 case LE: /* -> GE */
10220 case LT: /* -> GT */
10221 std::swap (op0, op1);
10222 comparison = swap_condition (comparison);
10223 /* FALLTHRU */
10224 case GT:
10225 case GE:
10226 break;
10227 case EQ:
10228 check_unordered = true;
10229 mode = CCZmode;
10230 break;
10231 case NE:
10232 check_unordered = true;
10233 mode = CCZmode;
10234 const_val = const1_rtx;
10235 break;
10236 default:
10237 gcc_unreachable ();
10238 }
10239
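  /* At this point only GT, GE, EQ and NE remain; LT/LE were handled by
     swapping the operands.  EQ and NE additionally need the unordered (PF)
     check in ix86_ssecom_setcc, since comis/ucomis also set ZF for unordered
     operands; CONST_VAL preloads the result used in that case.  */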
2bf6d935 10240 target = gen_reg_rtx (SImode);
ae69e6f6 10241 emit_move_insn (target, const_val);
10242 target = gen_rtx_SUBREG (QImode, target, 0);
10243
10244 if ((optimize && !register_operand (op0, mode0))
ae69e6f6 10245 || !insn_p->operand[0].predicate (op0, mode0))
10246 op0 = copy_to_mode_reg (mode0, op0);
10247 if ((optimize && !register_operand (op1, mode1))
ae69e6f6 10248 || !insn_p->operand[1].predicate (op1, mode1))
10249 op1 = copy_to_mode_reg (mode1, op1);
10250
ae69e6f6 10251 pat = GEN_FCN (icode) (op0, op1);
10252 if (! pat)
10253 return 0;
2bf6d935 10254
ae69e6f6 10255 set_dst = SET_DEST (pat);
10256 emit_insn (pat);
10257 return ix86_ssecom_setcc (comparison, check_unordered, mode,
10258 set_dst, target);
10259}
10260
10261/* Subroutines of ix86_expand_args_builtin to take care of round insns. */
10262
10263static rtx
10264ix86_expand_sse_round (const struct builtin_description *d, tree exp,
10265 rtx target)
10266{
10267 rtx pat;
10268 tree arg0 = CALL_EXPR_ARG (exp, 0);
10269 rtx op1, op0 = expand_normal (arg0);
10270 machine_mode tmode = insn_data[d->icode].operand[0].mode;
10271 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
10272
10273 if (optimize || target == 0
10274 || GET_MODE (target) != tmode
10275 || !insn_data[d->icode].operand[0].predicate (target, tmode))
10276 target = gen_reg_rtx (tmode);
10277
10278 if (VECTOR_MODE_P (mode0))
10279 op0 = safe_vector_operand (op0, mode0);
10280
10281 if ((optimize && !register_operand (op0, mode0))
10282 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
10283 op0 = copy_to_mode_reg (mode0, op0);
10284
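  /* For the ROUND builtins, d->comparison is not an rtx_code; it carries the
     rounding-mode immediate that becomes the last operand.  */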
10285 op1 = GEN_INT (d->comparison);
10286
10287 pat = GEN_FCN (d->icode) (target, op0, op1);
10288 if (! pat)
10289 return 0;
10290 emit_insn (pat);
10291 return target;
10292}
10293
10294static rtx
10295ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
10296 tree exp, rtx target)
10297{
10298 rtx pat;
10299 tree arg0 = CALL_EXPR_ARG (exp, 0);
10300 tree arg1 = CALL_EXPR_ARG (exp, 1);
10301 rtx op0 = expand_normal (arg0);
10302 rtx op1 = expand_normal (arg1);
10303 rtx op2;
10304 machine_mode tmode = insn_data[d->icode].operand[0].mode;
10305 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
10306 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
10307
10308 if (optimize || target == 0
10309 || GET_MODE (target) != tmode
10310 || !insn_data[d->icode].operand[0].predicate (target, tmode))
10311 target = gen_reg_rtx (tmode);
10312
10313 op0 = safe_vector_operand (op0, mode0);
10314 op1 = safe_vector_operand (op1, mode1);
10315
10316 if ((optimize && !register_operand (op0, mode0))
10317 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
10318 op0 = copy_to_mode_reg (mode0, op0);
10319 if ((optimize && !register_operand (op1, mode1))
10320 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
10321 op1 = copy_to_mode_reg (mode1, op1);
10322
10323 op2 = GEN_INT (d->comparison);
10324
10325 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
10326 if (! pat)
10327 return 0;
10328 emit_insn (pat);
10329 return target;
10330}
10331
10332/* Subroutine of ix86_expand_builtin to take care of ptest insns. */
10333
10334static rtx
10335ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
10336 rtx target)
10337{
10338 rtx pat;
10339 tree arg0 = CALL_EXPR_ARG (exp, 0);
10340 tree arg1 = CALL_EXPR_ARG (exp, 1);
10341 rtx op0 = expand_normal (arg0);
10342 rtx op1 = expand_normal (arg1);
10343 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
10344 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
10345 enum rtx_code comparison = d->comparison;
10346
10347 /* ptest reg, reg sets the carry flag. */
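  /* E.g. _mm_testc_si128 (x, x) is always 1, since ~x & x == 0.  */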
10348 if (comparison == LTU
10349 && (d->code == IX86_BUILTIN_PTESTC
10350 || d->code == IX86_BUILTIN_PTESTC256)
10351 && rtx_equal_p (op0, op1))
10352 {
10353 if (!target)
10354 target = gen_reg_rtx (SImode);
10355 emit_move_insn (target, const1_rtx);
10356 return target;
10357 }
10358
10359 if (VECTOR_MODE_P (mode0))
10360 op0 = safe_vector_operand (op0, mode0);
10361 if (VECTOR_MODE_P (mode1))
10362 op1 = safe_vector_operand (op1, mode1);
10363
10364 target = gen_reg_rtx (SImode);
10365 emit_move_insn (target, const0_rtx);
10366 target = gen_rtx_SUBREG (QImode, target, 0);
10367
10368 if ((optimize && !register_operand (op0, mode0))
10369 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
10370 op0 = copy_to_mode_reg (mode0, op0);
10371 if ((optimize && !register_operand (op1, mode1))
10372 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
10373 op1 = copy_to_mode_reg (mode1, op1);
10374
10375 pat = GEN_FCN (d->icode) (op0, op1);
10376 if (! pat)
10377 return 0;
10378 emit_insn (pat);
10379 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10380 gen_rtx_fmt_ee (comparison, QImode,
10381 SET_DEST (pat),
10382 const0_rtx)));
10383
10384 return SUBREG_REG (target);
10385}
10386
10387/* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
10388
10389static rtx
10390ix86_expand_sse_pcmpestr (const struct builtin_description *d,
10391 tree exp, rtx target)
10392{
10393 rtx pat;
10394 tree arg0 = CALL_EXPR_ARG (exp, 0);
10395 tree arg1 = CALL_EXPR_ARG (exp, 1);
10396 tree arg2 = CALL_EXPR_ARG (exp, 2);
10397 tree arg3 = CALL_EXPR_ARG (exp, 3);
10398 tree arg4 = CALL_EXPR_ARG (exp, 4);
10399 rtx scratch0, scratch1;
10400 rtx op0 = expand_normal (arg0);
10401 rtx op1 = expand_normal (arg1);
10402 rtx op2 = expand_normal (arg2);
10403 rtx op3 = expand_normal (arg3);
10404 rtx op4 = expand_normal (arg4);
10405 machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
10406
10407 tmode0 = insn_data[d->icode].operand[0].mode;
10408 tmode1 = insn_data[d->icode].operand[1].mode;
10409 modev2 = insn_data[d->icode].operand[2].mode;
10410 modei3 = insn_data[d->icode].operand[3].mode;
10411 modev4 = insn_data[d->icode].operand[4].mode;
10412 modei5 = insn_data[d->icode].operand[5].mode;
10413 modeimm = insn_data[d->icode].operand[6].mode;
10414
10415 if (VECTOR_MODE_P (modev2))
10416 op0 = safe_vector_operand (op0, modev2);
10417 if (VECTOR_MODE_P (modev4))
10418 op2 = safe_vector_operand (op2, modev4);
10419
10420 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
10421 op0 = copy_to_mode_reg (modev2, op0);
10422 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
10423 op1 = copy_to_mode_reg (modei3, op1);
10424 if ((optimize && !register_operand (op2, modev4))
10425 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
10426 op2 = copy_to_mode_reg (modev4, op2);
10427 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
10428 op3 = copy_to_mode_reg (modei5, op3);
10429
10430 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
10431 {
10432 error ("the fifth argument must be an 8-bit immediate");
10433 return const0_rtx;
10434 }
10435
10436 if (d->code == IX86_BUILTIN_PCMPESTRI128)
10437 {
10438 if (optimize || !target
10439 || GET_MODE (target) != tmode0
10440 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
10441 target = gen_reg_rtx (tmode0);
10442
10443 scratch1 = gen_reg_rtx (tmode1);
10444
10445 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
10446 }
10447 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
10448 {
10449 if (optimize || !target
10450 || GET_MODE (target) != tmode1
10451 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
10452 target = gen_reg_rtx (tmode1);
10453
10454 scratch0 = gen_reg_rtx (tmode0);
10455
10456 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
10457 }
10458 else
10459 {
10460 gcc_assert (d->flag);
10461
10462 scratch0 = gen_reg_rtx (tmode0);
10463 scratch1 = gen_reg_rtx (tmode1);
10464
10465 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
10466 }
10467
10468 if (! pat)
10469 return 0;
10470
10471 emit_insn (pat);
10472
10473 if (d->flag)
10474 {
10475 target = gen_reg_rtx (SImode);
10476 emit_move_insn (target, const0_rtx);
10477 target = gen_rtx_SUBREG (QImode, target, 0);
10478
10479 emit_insn
10480 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10481 gen_rtx_fmt_ee (EQ, QImode,
10482 gen_rtx_REG ((machine_mode) d->flag,
10483 FLAGS_REG),
10484 const0_rtx)));
10485 return SUBREG_REG (target);
10486 }
10487 else
10488 return target;
10489}
10490
10491
10492/* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
10493
10494static rtx
10495ix86_expand_sse_pcmpistr (const struct builtin_description *d,
10496 tree exp, rtx target)
10497{
10498 rtx pat;
10499 tree arg0 = CALL_EXPR_ARG (exp, 0);
10500 tree arg1 = CALL_EXPR_ARG (exp, 1);
10501 tree arg2 = CALL_EXPR_ARG (exp, 2);
10502 rtx scratch0, scratch1;
10503 rtx op0 = expand_normal (arg0);
10504 rtx op1 = expand_normal (arg1);
10505 rtx op2 = expand_normal (arg2);
10506 machine_mode tmode0, tmode1, modev2, modev3, modeimm;
10507
10508 tmode0 = insn_data[d->icode].operand[0].mode;
10509 tmode1 = insn_data[d->icode].operand[1].mode;
10510 modev2 = insn_data[d->icode].operand[2].mode;
10511 modev3 = insn_data[d->icode].operand[3].mode;
10512 modeimm = insn_data[d->icode].operand[4].mode;
10513
10514 if (VECTOR_MODE_P (modev2))
10515 op0 = safe_vector_operand (op0, modev2);
10516 if (VECTOR_MODE_P (modev3))
10517 op1 = safe_vector_operand (op1, modev3);
10518
10519 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
10520 op0 = copy_to_mode_reg (modev2, op0);
10521 if ((optimize && !register_operand (op1, modev3))
10522 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
10523 op1 = copy_to_mode_reg (modev3, op1);
10524
10525 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
10526 {
10527 error ("the third argument must be an 8-bit immediate");
10528 return const0_rtx;
10529 }
10530
10531 if (d->code == IX86_BUILTIN_PCMPISTRI128)
10532 {
10533 if (optimize || !target
10534 || GET_MODE (target) != tmode0
10535 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
10536 target = gen_reg_rtx (tmode0);
10537
10538 scratch1 = gen_reg_rtx (tmode1);
10539
10540 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
10541 }
10542 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
10543 {
10544 if (optimize || !target
10545 || GET_MODE (target) != tmode1
10546 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
10547 target = gen_reg_rtx (tmode1);
10548
10549 scratch0 = gen_reg_rtx (tmode0);
10550
10551 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
10552 }
10553 else
10554 {
10555 gcc_assert (d->flag);
10556
10557 scratch0 = gen_reg_rtx (tmode0);
10558 scratch1 = gen_reg_rtx (tmode1);
10559
10560 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
10561 }
10562
10563 if (! pat)
10564 return 0;
10565
10566 emit_insn (pat);
10567
10568 if (d->flag)
10569 {
10570 target = gen_reg_rtx (SImode);
10571 emit_move_insn (target, const0_rtx);
10572 target = gen_rtx_SUBREG (QImode, target, 0);
10573
10574 emit_insn
10575 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10576 gen_rtx_fmt_ee (EQ, QImode,
10577 gen_rtx_REG ((machine_mode) d->flag,
10578 FLAGS_REG),
10579 const0_rtx)));
10580 return SUBREG_REG (target);
10581 }
10582 else
10583 return target;
10584}
10585
10586/* Fixup modeless constants to fit required mode. */
10587
10588static rtx
10589fixup_modeless_constant (rtx x, machine_mode mode)
10590{
10591 if (GET_MODE (x) == VOIDmode)
10592 x = convert_to_mode (mode, x, 1);
10593 return x;
10594}
10595
10596/* Subroutine of ix86_expand_builtin to take care of insns with
10597 variable number of operands. */
10598
10599static rtx
10600ix86_expand_args_builtin (const struct builtin_description *d,
10601 tree exp, rtx target)
10602{
10603 rtx pat, real_target;
10604 unsigned int i, nargs;
10605 unsigned int nargs_constant = 0;
10606 unsigned int mask_pos = 0;
10607 int num_memory = 0;
715a8bc8 10608 rtx xops[6];
10609 bool second_arg_count = false;
10610 enum insn_code icode = d->icode;
10611 const struct insn_data_d *insn_p = &insn_data[icode];
10612 machine_mode tmode = insn_p->operand[0].mode;
10613 machine_mode rmode = VOIDmode;
10614 bool swap = false;
10615 enum rtx_code comparison = d->comparison;
10616
10617 switch ((enum ix86_builtin_func_type) d->flag)
10618 {
10619 case V2DF_FTYPE_V2DF_ROUND:
10620 case V4DF_FTYPE_V4DF_ROUND:
10621 case V8DF_FTYPE_V8DF_ROUND:
10622 case V4SF_FTYPE_V4SF_ROUND:
10623 case V8SF_FTYPE_V8SF_ROUND:
10624 case V16SF_FTYPE_V16SF_ROUND:
84bcefd5 10625 case V8HF_FTYPE_V8HF_ROUND:
10626 case V16HF_FTYPE_V16HF_ROUND:
10627 case V32HF_FTYPE_V32HF_ROUND:
10628 case V4SI_FTYPE_V4SF_ROUND:
10629 case V8SI_FTYPE_V8SF_ROUND:
10630 case V16SI_FTYPE_V16SF_ROUND:
10631 return ix86_expand_sse_round (d, exp, target);
10632 case V4SI_FTYPE_V2DF_V2DF_ROUND:
10633 case V8SI_FTYPE_V4DF_V4DF_ROUND:
10634 case V16SI_FTYPE_V8DF_V8DF_ROUND:
10635 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
10636 case INT_FTYPE_V8SF_V8SF_PTEST:
10637 case INT_FTYPE_V4DI_V4DI_PTEST:
10638 case INT_FTYPE_V4DF_V4DF_PTEST:
10639 case INT_FTYPE_V4SF_V4SF_PTEST:
10640 case INT_FTYPE_V2DI_V2DI_PTEST:
10641 case INT_FTYPE_V2DF_V2DF_PTEST:
10642 return ix86_expand_sse_ptest (d, exp, target);
10643 case FLOAT128_FTYPE_FLOAT128:
10644 case FLOAT_FTYPE_FLOAT:
a1ecc560 10645 case FLOAT_FTYPE_BFLOAT16:
10646 case INT_FTYPE_INT:
10647 case UINT_FTYPE_UINT:
10648 case UINT16_FTYPE_UINT16:
10649 case UINT64_FTYPE_INT:
10650 case UINT64_FTYPE_UINT64:
10651 case INT64_FTYPE_INT64:
10652 case INT64_FTYPE_V4SF:
10653 case INT64_FTYPE_V2DF:
10654 case INT_FTYPE_V16QI:
10655 case INT_FTYPE_V8QI:
10656 case INT_FTYPE_V8SF:
10657 case INT_FTYPE_V4DF:
10658 case INT_FTYPE_V4SF:
10659 case INT_FTYPE_V2DF:
10660 case INT_FTYPE_V32QI:
10661 case V16QI_FTYPE_V16QI:
10662 case V8SI_FTYPE_V8SF:
10663 case V8SI_FTYPE_V4SI:
10664 case V8HI_FTYPE_V8HI:
10665 case V8HI_FTYPE_V16QI:
10666 case V8QI_FTYPE_V8QI:
10667 case V8SF_FTYPE_V8SF:
10668 case V8SF_FTYPE_V8SI:
10669 case V8SF_FTYPE_V4SF:
10670 case V8SF_FTYPE_V8HI:
10671 case V4SI_FTYPE_V4SI:
10672 case V4SI_FTYPE_V16QI:
10673 case V4SI_FTYPE_V4SF:
10674 case V4SI_FTYPE_V8SI:
10675 case V4SI_FTYPE_V8HI:
10676 case V4SI_FTYPE_V4DF:
10677 case V4SI_FTYPE_V2DF:
10678 case V4HI_FTYPE_V4HI:
10679 case V4DF_FTYPE_V4DF:
10680 case V4DF_FTYPE_V4SI:
10681 case V4DF_FTYPE_V4SF:
10682 case V4DF_FTYPE_V2DF:
10683 case V4SF_FTYPE_V4SF:
10684 case V4SF_FTYPE_V4SI:
10685 case V4SF_FTYPE_V8SF:
10686 case V4SF_FTYPE_V4DF:
10687 case V4SF_FTYPE_V8HI:
10688 case V4SF_FTYPE_V2DF:
10689 case V2DI_FTYPE_V2DI:
10690 case V2DI_FTYPE_V16QI:
10691 case V2DI_FTYPE_V8HI:
10692 case V2DI_FTYPE_V4SI:
10693 case V2DF_FTYPE_V2DF:
10694 case V2DF_FTYPE_V4SI:
10695 case V2DF_FTYPE_V4DF:
10696 case V2DF_FTYPE_V4SF:
10697 case V2DF_FTYPE_V2SI:
10698 case V2SI_FTYPE_V2SI:
10699 case V2SI_FTYPE_V4SF:
10700 case V2SI_FTYPE_V2SF:
10701 case V2SI_FTYPE_V2DF:
10702 case V2SF_FTYPE_V2SF:
10703 case V2SF_FTYPE_V2SI:
10704 case V32QI_FTYPE_V32QI:
10705 case V32QI_FTYPE_V16QI:
10706 case V16HI_FTYPE_V16HI:
10707 case V16HI_FTYPE_V8HI:
10708 case V8SI_FTYPE_V8SI:
10709 case V16HI_FTYPE_V16QI:
10710 case V8SI_FTYPE_V16QI:
10711 case V4DI_FTYPE_V16QI:
10712 case V8SI_FTYPE_V8HI:
10713 case V4DI_FTYPE_V8HI:
10714 case V4DI_FTYPE_V4SI:
10715 case V4DI_FTYPE_V2DI:
10716 case UQI_FTYPE_UQI:
10717 case UHI_FTYPE_UHI:
10718 case USI_FTYPE_USI:
10719 case USI_FTYPE_UQI:
10720 case USI_FTYPE_UHI:
10721 case UDI_FTYPE_UDI:
10722 case UHI_FTYPE_V16QI:
10723 case USI_FTYPE_V32QI:
10724 case UDI_FTYPE_V64QI:
10725 case V16QI_FTYPE_UHI:
10726 case V32QI_FTYPE_USI:
10727 case V64QI_FTYPE_UDI:
10728 case V8HI_FTYPE_UQI:
10729 case V16HI_FTYPE_UHI:
10730 case V32HI_FTYPE_USI:
10731 case V4SI_FTYPE_UQI:
10732 case V8SI_FTYPE_UQI:
10733 case V4SI_FTYPE_UHI:
10734 case V8SI_FTYPE_UHI:
10735 case UQI_FTYPE_V8HI:
10736 case UHI_FTYPE_V16HI:
10737 case USI_FTYPE_V32HI:
10738 case UQI_FTYPE_V4SI:
10739 case UQI_FTYPE_V8SI:
10740 case UHI_FTYPE_V16SI:
10741 case UQI_FTYPE_V2DI:
10742 case UQI_FTYPE_V4DI:
10743 case UQI_FTYPE_V8DI:
10744 case V16SI_FTYPE_UHI:
10745 case V2DI_FTYPE_UQI:
10746 case V4DI_FTYPE_UQI:
10747 case V16SI_FTYPE_INT:
10748 case V16SF_FTYPE_V8SF:
10749 case V16SI_FTYPE_V8SI:
10750 case V16SF_FTYPE_V4SF:
10751 case V16SI_FTYPE_V4SI:
10752 case V16SI_FTYPE_V16SF:
10753 case V16SI_FTYPE_V16SI:
10754 case V64QI_FTYPE_V64QI:
10755 case V32HI_FTYPE_V32HI:
10756 case V16SF_FTYPE_V16SF:
10757 case V8DI_FTYPE_UQI:
10758 case V8DI_FTYPE_V8DI:
10759 case V8DF_FTYPE_V4DF:
10760 case V8DF_FTYPE_V2DF:
10761 case V8DF_FTYPE_V8DF:
10762 case V4DI_FTYPE_V4DI:
87235f1e 10763 case V16BF_FTYPE_V16SF:
10764 case V8BF_FTYPE_V8SF:
10765 case V8BF_FTYPE_V4SF:
10766 nargs = 1;
10767 break;
10768 case V4SF_FTYPE_V4SF_VEC_MERGE:
10769 case V2DF_FTYPE_V2DF_VEC_MERGE:
10770 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
10771 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
10772 case V16QI_FTYPE_V16QI_V16QI:
10773 case V16QI_FTYPE_V8HI_V8HI:
b96cb2ca 10774 case V16HF_FTYPE_V16HF_V16HF:
10775 case V16SF_FTYPE_V16SF_V16SF:
10776 case V8QI_FTYPE_V8QI_V8QI:
10777 case V8QI_FTYPE_V4HI_V4HI:
10778 case V8HI_FTYPE_V8HI_V8HI:
10779 case V8HI_FTYPE_V16QI_V16QI:
10780 case V8HI_FTYPE_V4SI_V4SI:
b96cb2ca 10781 case V8HF_FTYPE_V8HF_V8HF:
2bf6d935
ML
10782 case V8SF_FTYPE_V8SF_V8SF:
10783 case V8SF_FTYPE_V8SF_V8SI:
10784 case V8DF_FTYPE_V8DF_V8DF:
10785 case V4SI_FTYPE_V4SI_V4SI:
10786 case V4SI_FTYPE_V8HI_V8HI:
10787 case V4SI_FTYPE_V2DF_V2DF:
10788 case V4HI_FTYPE_V4HI_V4HI:
10789 case V4HI_FTYPE_V8QI_V8QI:
10790 case V4HI_FTYPE_V2SI_V2SI:
10791 case V4DF_FTYPE_V4DF_V4DF:
10792 case V4DF_FTYPE_V4DF_V4DI:
10793 case V4SF_FTYPE_V4SF_V4SF:
10794 case V4SF_FTYPE_V4SF_V4SI:
10795 case V4SF_FTYPE_V4SF_V2SI:
10796 case V4SF_FTYPE_V4SF_V2DF:
10797 case V4SF_FTYPE_V4SF_UINT:
10798 case V4SF_FTYPE_V4SF_DI:
10799 case V4SF_FTYPE_V4SF_SI:
86446132 10800 case V4DI_FTYPE_V4DI_V2DI:
10801 case V2DI_FTYPE_V2DI_V2DI:
10802 case V2DI_FTYPE_V16QI_V16QI:
10803 case V2DI_FTYPE_V4SI_V4SI:
10804 case V2DI_FTYPE_V2DI_V16QI:
10805 case V2SI_FTYPE_V2SI_V2SI:
10806 case V2SI_FTYPE_V4HI_V4HI:
10807 case V2SI_FTYPE_V2SF_V2SF:
10808 case V2DF_FTYPE_V2DF_V2DF:
10809 case V2DF_FTYPE_V2DF_V4SF:
10810 case V2DF_FTYPE_V2DF_V2DI:
10811 case V2DF_FTYPE_V2DF_DI:
10812 case V2DF_FTYPE_V2DF_SI:
10813 case V2DF_FTYPE_V2DF_UINT:
10814 case V2SF_FTYPE_V2SF_V2SF:
10815 case V1DI_FTYPE_V1DI_V1DI:
10816 case V1DI_FTYPE_V8QI_V8QI:
10817 case V1DI_FTYPE_V2SI_V2SI:
10818 case V32QI_FTYPE_V16HI_V16HI:
10819 case V16HI_FTYPE_V8SI_V8SI:
10820 case V64QI_FTYPE_V64QI_V64QI:
10821 case V32QI_FTYPE_V32QI_V32QI:
10822 case V16HI_FTYPE_V32QI_V32QI:
10823 case V16HI_FTYPE_V16HI_V16HI:
10824 case V8SI_FTYPE_V4DF_V4DF:
10825 case V8SI_FTYPE_V8SI_V8SI:
10826 case V8SI_FTYPE_V16HI_V16HI:
10827 case V4DI_FTYPE_V4DI_V4DI:
10828 case V4DI_FTYPE_V8SI_V8SI:
6bb0776e 10829 case V4DI_FTYPE_V32QI_V32QI:
10830 case V8DI_FTYPE_V64QI_V64QI:
10831 if (comparison == UNKNOWN)
10832 return ix86_expand_binop_builtin (icode, exp, target);
10833 nargs = 2;
10834 break;
10835 case V4SF_FTYPE_V4SF_V4SF_SWAP:
10836 case V2DF_FTYPE_V2DF_V2DF_SWAP:
10837 gcc_assert (comparison != UNKNOWN);
10838 nargs = 2;
10839 swap = true;
10840 break;
10841 case V16HI_FTYPE_V16HI_V8HI_COUNT:
10842 case V16HI_FTYPE_V16HI_SI_COUNT:
10843 case V8SI_FTYPE_V8SI_V4SI_COUNT:
10844 case V8SI_FTYPE_V8SI_SI_COUNT:
10845 case V4DI_FTYPE_V4DI_V2DI_COUNT:
10846 case V4DI_FTYPE_V4DI_INT_COUNT:
10847 case V8HI_FTYPE_V8HI_V8HI_COUNT:
10848 case V8HI_FTYPE_V8HI_SI_COUNT:
10849 case V4SI_FTYPE_V4SI_V4SI_COUNT:
10850 case V4SI_FTYPE_V4SI_SI_COUNT:
10851 case V4HI_FTYPE_V4HI_V4HI_COUNT:
10852 case V4HI_FTYPE_V4HI_SI_COUNT:
10853 case V2DI_FTYPE_V2DI_V2DI_COUNT:
10854 case V2DI_FTYPE_V2DI_SI_COUNT:
10855 case V2SI_FTYPE_V2SI_V2SI_COUNT:
10856 case V2SI_FTYPE_V2SI_SI_COUNT:
10857 case V1DI_FTYPE_V1DI_V1DI_COUNT:
10858 case V1DI_FTYPE_V1DI_SI_COUNT:
10859 nargs = 2;
10860 second_arg_count = true;
10861 break;
10862 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
10863 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
10864 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
10865 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
10866 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
10867 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
10868 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
10869 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
10870 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
10871 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
10872 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
10873 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
10874 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
10875 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
10876 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
10877 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
10878 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
10879 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
10880 nargs = 4;
10881 second_arg_count = true;
10882 break;
10883 case UINT64_FTYPE_UINT64_UINT64:
10884 case UINT_FTYPE_UINT_UINT:
10885 case UINT_FTYPE_UINT_USHORT:
10886 case UINT_FTYPE_UINT_UCHAR:
10887 case UINT16_FTYPE_UINT16_INT:
10888 case UINT8_FTYPE_UINT8_INT:
10889 case UQI_FTYPE_UQI_UQI:
10890 case UHI_FTYPE_UHI_UHI:
10891 case USI_FTYPE_USI_USI:
10892 case UDI_FTYPE_UDI_UDI:
10893 case V16SI_FTYPE_V8DF_V8DF:
87235f1e 10894 case V32BF_FTYPE_V16SF_V16SF:
10895 case V16BF_FTYPE_V8SF_V8SF:
10896 case V8BF_FTYPE_V4SF_V4SF:
10897 case V16BF_FTYPE_V16SF_UHI:
10898 case V8BF_FTYPE_V8SF_UQI:
10899 case V8BF_FTYPE_V4SF_UQI:
10900 nargs = 2;
10901 break;
10902 case V2DI_FTYPE_V2DI_INT_CONVERT:
10903 nargs = 2;
10904 rmode = V1TImode;
10905 nargs_constant = 1;
10906 break;
10907 case V4DI_FTYPE_V4DI_INT_CONVERT:
10908 nargs = 2;
10909 rmode = V2TImode;
10910 nargs_constant = 1;
10911 break;
10912 case V8DI_FTYPE_V8DI_INT_CONVERT:
10913 nargs = 2;
10914 rmode = V4TImode;
10915 nargs_constant = 1;
10916 break;
10917 case V8HI_FTYPE_V8HI_INT:
10918 case V8HI_FTYPE_V8SF_INT:
10919 case V16HI_FTYPE_V16SF_INT:
10920 case V8HI_FTYPE_V4SF_INT:
10921 case V8SF_FTYPE_V8SF_INT:
10922 case V4SF_FTYPE_V16SF_INT:
10923 case V16SF_FTYPE_V16SF_INT:
10924 case V4SI_FTYPE_V4SI_INT:
10925 case V4SI_FTYPE_V8SI_INT:
10926 case V4HI_FTYPE_V4HI_INT:
10927 case V4DF_FTYPE_V4DF_INT:
10928 case V4DF_FTYPE_V8DF_INT:
10929 case V4SF_FTYPE_V4SF_INT:
10930 case V4SF_FTYPE_V8SF_INT:
10931 case V2DI_FTYPE_V2DI_INT:
10932 case V2DF_FTYPE_V2DF_INT:
10933 case V2DF_FTYPE_V4DF_INT:
10934 case V16HI_FTYPE_V16HI_INT:
10935 case V8SI_FTYPE_V8SI_INT:
10936 case V16SI_FTYPE_V16SI_INT:
10937 case V4SI_FTYPE_V16SI_INT:
10938 case V4DI_FTYPE_V4DI_INT:
10939 case V2DI_FTYPE_V4DI_INT:
10940 case V4DI_FTYPE_V8DI_INT:
10941 case UQI_FTYPE_UQI_UQI_CONST:
10942 case UHI_FTYPE_UHI_UQI:
10943 case USI_FTYPE_USI_UQI:
10944 case UDI_FTYPE_UDI_UQI:
10945 nargs = 2;
10946 nargs_constant = 1;
10947 break;
10948 case V16QI_FTYPE_V16QI_V16QI_V16QI:
10949 case V8SF_FTYPE_V8SF_V8SF_V8SF:
10950 case V4DF_FTYPE_V4DF_V4DF_V4DF:
10951 case V4SF_FTYPE_V4SF_V4SF_V4SF:
10952 case V2DF_FTYPE_V2DF_V2DF_V2DF:
10953 case V32QI_FTYPE_V32QI_V32QI_V32QI:
10954 case UHI_FTYPE_V16SI_V16SI_UHI:
10955 case UQI_FTYPE_V8DI_V8DI_UQI:
10956 case V16HI_FTYPE_V16SI_V16HI_UHI:
10957 case V16QI_FTYPE_V16SI_V16QI_UHI:
10958 case V16QI_FTYPE_V8DI_V16QI_UQI:
4204740f 10959 case V32HF_FTYPE_V32HF_V32HF_USI:
2bf6d935
ML
10960 case V16SF_FTYPE_V16SF_V16SF_UHI:
10961 case V16SF_FTYPE_V4SF_V16SF_UHI:
10962 case V16SI_FTYPE_SI_V16SI_UHI:
10963 case V16SI_FTYPE_V16HI_V16SI_UHI:
10964 case V16SI_FTYPE_V16QI_V16SI_UHI:
10965 case V8SF_FTYPE_V4SF_V8SF_UQI:
10966 case V4DF_FTYPE_V2DF_V4DF_UQI:
10967 case V8SI_FTYPE_V4SI_V8SI_UQI:
10968 case V8SI_FTYPE_SI_V8SI_UQI:
10969 case V4SI_FTYPE_V4SI_V4SI_UQI:
10970 case V4SI_FTYPE_SI_V4SI_UQI:
10971 case V4DI_FTYPE_V2DI_V4DI_UQI:
10972 case V4DI_FTYPE_DI_V4DI_UQI:
10973 case V2DI_FTYPE_V2DI_V2DI_UQI:
10974 case V2DI_FTYPE_DI_V2DI_UQI:
10975 case V64QI_FTYPE_V64QI_V64QI_UDI:
10976 case V64QI_FTYPE_V16QI_V64QI_UDI:
10977 case V64QI_FTYPE_QI_V64QI_UDI:
10978 case V32QI_FTYPE_V32QI_V32QI_USI:
10979 case V32QI_FTYPE_V16QI_V32QI_USI:
10980 case V32QI_FTYPE_QI_V32QI_USI:
10981 case V16QI_FTYPE_V16QI_V16QI_UHI:
10982 case V16QI_FTYPE_QI_V16QI_UHI:
10983 case V32HI_FTYPE_V8HI_V32HI_USI:
10984 case V32HI_FTYPE_HI_V32HI_USI:
10985 case V16HI_FTYPE_V8HI_V16HI_UHI:
10986 case V16HI_FTYPE_HI_V16HI_UHI:
10987 case V8HI_FTYPE_V8HI_V8HI_UQI:
10988 case V8HI_FTYPE_HI_V8HI_UQI:
4204740f 10989 case V16HF_FTYPE_V16HF_V16HF_UHI:
2bf6d935
ML
10990 case V8SF_FTYPE_V8HI_V8SF_UQI:
10991 case V4SF_FTYPE_V8HI_V4SF_UQI:
bd610db0 10992 case V8SI_FTYPE_V8HF_V8SI_UQI:
5a744e50 10993 case V8SF_FTYPE_V8HF_V8SF_UQI:
2bf6d935
ML
10994 case V8SI_FTYPE_V8SF_V8SI_UQI:
10995 case V4SI_FTYPE_V4SF_V4SI_UQI:
bd610db0 10996 case V4SI_FTYPE_V8HF_V4SI_UQI:
5a744e50 10997 case V4SF_FTYPE_V8HF_V4SF_UQI:
bd610db0 10998 case V4DI_FTYPE_V8HF_V4DI_UQI:
2bf6d935 10999 case V4DI_FTYPE_V4SF_V4DI_UQI:
bd610db0 11000 case V2DI_FTYPE_V8HF_V2DI_UQI:
2bf6d935 11001 case V2DI_FTYPE_V4SF_V2DI_UQI:
4204740f 11002 case V8HF_FTYPE_V8HF_V8HF_UQI:
081070bc 11003 case V8HF_FTYPE_V8HF_V8HF_V8HF:
be0e4c32 11004 case V8HF_FTYPE_V8HI_V8HF_UQI:
11005 case V8HF_FTYPE_V8SI_V8HF_UQI:
5a744e50 11006 case V8HF_FTYPE_V8SF_V8HF_UQI:
be0e4c32 11007 case V8HF_FTYPE_V4SI_V8HF_UQI:
5a744e50 11008 case V8HF_FTYPE_V4SF_V8HF_UQI:
be0e4c32 11009 case V8HF_FTYPE_V4DI_V8HF_UQI:
5a744e50 11010 case V8HF_FTYPE_V4DF_V8HF_UQI:
be0e4c32 11011 case V8HF_FTYPE_V2DI_V8HF_UQI:
5a744e50 11012 case V8HF_FTYPE_V2DF_V8HF_UQI:
2bf6d935
ML
11013 case V4SF_FTYPE_V4DI_V4SF_UQI:
11014 case V4SF_FTYPE_V2DI_V4SF_UQI:
11015 case V4DF_FTYPE_V4DI_V4DF_UQI:
5a744e50 11016 case V4DF_FTYPE_V8HF_V4DF_UQI:
11017 case V2DF_FTYPE_V8HF_V2DF_UQI:
2bf6d935
ML
11018 case V2DF_FTYPE_V2DI_V2DF_UQI:
11019 case V16QI_FTYPE_V8HI_V16QI_UQI:
11020 case V16QI_FTYPE_V16HI_V16QI_UHI:
11021 case V16QI_FTYPE_V4SI_V16QI_UQI:
11022 case V16QI_FTYPE_V8SI_V16QI_UQI:
bd610db0 11023 case V8HI_FTYPE_V8HF_V8HI_UQI:
2bf6d935
ML
11024 case V8HI_FTYPE_V4SI_V8HI_UQI:
11025 case V8HI_FTYPE_V8SI_V8HI_UQI:
11026 case V16QI_FTYPE_V2DI_V16QI_UQI:
11027 case V16QI_FTYPE_V4DI_V16QI_UQI:
11028 case V8HI_FTYPE_V2DI_V8HI_UQI:
11029 case V8HI_FTYPE_V4DI_V8HI_UQI:
11030 case V4SI_FTYPE_V2DI_V4SI_UQI:
11031 case V4SI_FTYPE_V4DI_V4SI_UQI:
11032 case V32QI_FTYPE_V32HI_V32QI_USI:
11033 case UHI_FTYPE_V16QI_V16QI_UHI:
11034 case USI_FTYPE_V32QI_V32QI_USI:
11035 case UDI_FTYPE_V64QI_V64QI_UDI:
11036 case UQI_FTYPE_V8HI_V8HI_UQI:
11037 case UHI_FTYPE_V16HI_V16HI_UHI:
11038 case USI_FTYPE_V32HI_V32HI_USI:
11039 case UQI_FTYPE_V4SI_V4SI_UQI:
11040 case UQI_FTYPE_V8SI_V8SI_UQI:
11041 case UQI_FTYPE_V2DI_V2DI_UQI:
11042 case UQI_FTYPE_V4DI_V4DI_UQI:
11043 case V4SF_FTYPE_V2DF_V4SF_UQI:
11044 case V4SF_FTYPE_V4DF_V4SF_UQI:
11045 case V16SI_FTYPE_V16SI_V16SI_UHI:
11046 case V16SI_FTYPE_V4SI_V16SI_UHI:
11047 case V2DI_FTYPE_V4SI_V2DI_UQI:
11048 case V2DI_FTYPE_V8HI_V2DI_UQI:
11049 case V2DI_FTYPE_V16QI_V2DI_UQI:
11050 case V4DI_FTYPE_V4DI_V4DI_UQI:
11051 case V4DI_FTYPE_V4SI_V4DI_UQI:
11052 case V4DI_FTYPE_V8HI_V4DI_UQI:
11053 case V4DI_FTYPE_V16QI_V4DI_UQI:
11054 case V4DI_FTYPE_V4DF_V4DI_UQI:
11055 case V2DI_FTYPE_V2DF_V2DI_UQI:
11056 case V4SI_FTYPE_V4DF_V4SI_UQI:
11057 case V4SI_FTYPE_V2DF_V4SI_UQI:
11058 case V4SI_FTYPE_V8HI_V4SI_UQI:
11059 case V4SI_FTYPE_V16QI_V4SI_UQI:
11060 case V4DI_FTYPE_V4DI_V4DI_V4DI:
11061 case V8DF_FTYPE_V2DF_V8DF_UQI:
11062 case V8DF_FTYPE_V4DF_V8DF_UQI:
11063 case V8DF_FTYPE_V8DF_V8DF_UQI:
11064 case V8SF_FTYPE_V8SF_V8SF_UQI:
11065 case V8SF_FTYPE_V8SI_V8SF_UQI:
11066 case V4DF_FTYPE_V4DF_V4DF_UQI:
11067 case V4SF_FTYPE_V4SF_V4SF_UQI:
11068 case V2DF_FTYPE_V2DF_V2DF_UQI:
11069 case V2DF_FTYPE_V4SF_V2DF_UQI:
11070 case V2DF_FTYPE_V4SI_V2DF_UQI:
11071 case V4SF_FTYPE_V4SI_V4SF_UQI:
11072 case V4DF_FTYPE_V4SF_V4DF_UQI:
11073 case V4DF_FTYPE_V4SI_V4DF_UQI:
11074 case V8SI_FTYPE_V8SI_V8SI_UQI:
11075 case V8SI_FTYPE_V8HI_V8SI_UQI:
11076 case V8SI_FTYPE_V16QI_V8SI_UQI:
11077 case V8DF_FTYPE_V8SI_V8DF_UQI:
11078 case V8DI_FTYPE_DI_V8DI_UQI:
11079 case V16SF_FTYPE_V8SF_V16SF_UHI:
11080 case V16SI_FTYPE_V8SI_V16SI_UHI:
be0e4c32 11081 case V16HF_FTYPE_V16HI_V16HF_UHI:
081070bc 11082 case V16HF_FTYPE_V16HF_V16HF_V16HF:
bd610db0 11083 case V16HI_FTYPE_V16HF_V16HI_UHI:
2bf6d935
ML
11084 case V16HI_FTYPE_V16HI_V16HI_UHI:
11085 case V8HI_FTYPE_V16QI_V8HI_UQI:
11086 case V16HI_FTYPE_V16QI_V16HI_UHI:
11087 case V32HI_FTYPE_V32HI_V32HI_USI:
11088 case V32HI_FTYPE_V32QI_V32HI_USI:
11089 case V8DI_FTYPE_V16QI_V8DI_UQI:
11090 case V8DI_FTYPE_V2DI_V8DI_UQI:
11091 case V8DI_FTYPE_V4DI_V8DI_UQI:
11092 case V8DI_FTYPE_V8DI_V8DI_UQI:
11093 case V8DI_FTYPE_V8HI_V8DI_UQI:
11094 case V8DI_FTYPE_V8SI_V8DI_UQI:
11095 case V8HI_FTYPE_V8DI_V8HI_UQI:
11096 case V8SI_FTYPE_V8DI_V8SI_UQI:
11097 case V4SI_FTYPE_V4SI_V4SI_V4SI:
86446132 11098 case V4DI_FTYPE_V4DI_V4DI_V2DI:
2bf6d935
ML
11099 case V16SI_FTYPE_V16SI_V16SI_V16SI:
11100 case V8DI_FTYPE_V8DI_V8DI_V8DI:
11101 case V32HI_FTYPE_V32HI_V32HI_V32HI:
11102 case V2DI_FTYPE_V2DI_V2DI_V2DI:
11103 case V16HI_FTYPE_V16HI_V16HI_V16HI:
11104 case V8SI_FTYPE_V8SI_V8SI_V8SI:
11105 case V8HI_FTYPE_V8HI_V8HI_V8HI:
87235f1e 11106 case V32BF_FTYPE_V16SF_V16SF_USI:
11107 case V16BF_FTYPE_V8SF_V8SF_UHI:
11108 case V8BF_FTYPE_V4SF_V4SF_UQI:
11109 case V16BF_FTYPE_V16SF_V16BF_UHI:
11110 case V8BF_FTYPE_V8SF_V8BF_UQI:
11111 case V8BF_FTYPE_V4SF_V8BF_UQI:
11112 case V16SF_FTYPE_V16SF_V32BF_V32BF:
11113 case V8SF_FTYPE_V8SF_V16BF_V16BF:
11114 case V4SF_FTYPE_V4SF_V8BF_V8BF:
2bf6d935
ML
11115 nargs = 3;
11116 break;
11117 case V32QI_FTYPE_V32QI_V32QI_INT:
11118 case V16HI_FTYPE_V16HI_V16HI_INT:
11119 case V16QI_FTYPE_V16QI_V16QI_INT:
11120 case V4DI_FTYPE_V4DI_V4DI_INT:
11121 case V8HI_FTYPE_V8HI_V8HI_INT:
11122 case V8SI_FTYPE_V8SI_V8SI_INT:
11123 case V8SI_FTYPE_V8SI_V4SI_INT:
11124 case V8SF_FTYPE_V8SF_V8SF_INT:
11125 case V8SF_FTYPE_V8SF_V4SF_INT:
11126 case V4SI_FTYPE_V4SI_V4SI_INT:
11127 case V4DF_FTYPE_V4DF_V4DF_INT:
11128 case V16SF_FTYPE_V16SF_V16SF_INT:
11129 case V16SF_FTYPE_V16SF_V4SF_INT:
11130 case V16SI_FTYPE_V16SI_V4SI_INT:
11131 case V4DF_FTYPE_V4DF_V2DF_INT:
11132 case V4SF_FTYPE_V4SF_V4SF_INT:
11133 case V2DI_FTYPE_V2DI_V2DI_INT:
11134 case V4DI_FTYPE_V4DI_V2DI_INT:
11135 case V2DF_FTYPE_V2DF_V2DF_INT:
11136 case UQI_FTYPE_V8DI_V8UDI_INT:
11137 case UQI_FTYPE_V8DF_V8DF_INT:
11138 case UQI_FTYPE_V2DF_V2DF_INT:
11139 case UQI_FTYPE_V4SF_V4SF_INT:
11140 case UHI_FTYPE_V16SI_V16SI_INT:
11141 case UHI_FTYPE_V16SF_V16SF_INT:
11142 case V64QI_FTYPE_V64QI_V64QI_INT:
11143 case V32HI_FTYPE_V32HI_V32HI_INT:
11144 case V16SI_FTYPE_V16SI_V16SI_INT:
11145 case V8DI_FTYPE_V8DI_V8DI_INT:
11146 nargs = 3;
11147 nargs_constant = 1;
11148 break;
11149 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
11150 nargs = 3;
11151 rmode = V4DImode;
11152 nargs_constant = 1;
11153 break;
11154 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
11155 nargs = 3;
11156 rmode = V2DImode;
11157 nargs_constant = 1;
11158 break;
11159 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
11160 nargs = 3;
11161 rmode = DImode;
11162 nargs_constant = 1;
11163 break;
11164 case V2DI_FTYPE_V2DI_UINT_UINT:
11165 nargs = 3;
11166 nargs_constant = 2;
11167 break;
11168 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
11169 nargs = 3;
11170 rmode = V8DImode;
11171 nargs_constant = 1;
11172 break;
11173 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
11174 nargs = 5;
11175 rmode = V8DImode;
11176 mask_pos = 2;
11177 nargs_constant = 1;
11178 break;
11179 case QI_FTYPE_V8DF_INT_UQI:
11180 case QI_FTYPE_V4DF_INT_UQI:
11181 case QI_FTYPE_V2DF_INT_UQI:
11182 case HI_FTYPE_V16SF_INT_UHI:
11183 case QI_FTYPE_V8SF_INT_UQI:
11184 case QI_FTYPE_V4SF_INT_UQI:
8486e9f2 11185 case QI_FTYPE_V8HF_INT_UQI:
11186 case HI_FTYPE_V16HF_INT_UHI:
11187 case SI_FTYPE_V32HF_INT_USI:
2bf6d935
ML
11188 case V4SI_FTYPE_V4SI_V4SI_UHI:
11189 case V8SI_FTYPE_V8SI_V8SI_UHI:
11190 nargs = 3;
11191 mask_pos = 1;
11192 nargs_constant = 1;
11193 break;
11194 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
11195 nargs = 5;
11196 rmode = V4DImode;
11197 mask_pos = 2;
11198 nargs_constant = 1;
11199 break;
11200 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
11201 nargs = 5;
11202 rmode = V2DImode;
11203 mask_pos = 2;
11204 nargs_constant = 1;
11205 break;
11206 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
11207 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
11208 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
11209 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
11210 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
11211 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
11212 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
11213 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
11214 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
11215 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
11216 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
11217 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
11218 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
11219 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
11220 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
11221 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
bd7a34ef 11222 case V32HF_FTYPE_V32HF_V32HF_V32HF_USI:
2bf6d935
ML
11223 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
11224 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
11225 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
11226 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
11227 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
11228 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
11229 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
11230 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
11231 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
11232 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
11233 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
11234 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
11235 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
11236 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
11237 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
11238 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
11239 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
081070bc 11240 case V16HF_FTYPE_V16HF_V16HF_V16HF_UQI:
bd7a34ef 11241 case V16HF_FTYPE_V16HF_V16HF_V16HF_UHI:
2bf6d935
ML
11242 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
11243 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
11244 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
11245 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
11246 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
11247 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
11248 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
bd7a34ef 11249 case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI:
2bf6d935
ML
11250 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
11251 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
11252 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
11253 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
11254 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
11255 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
11256 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
11257 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
11258 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
11259 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
11260 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
87235f1e 11261 case V32BF_FTYPE_V16SF_V16SF_V32BF_USI:
11262 case V16BF_FTYPE_V8SF_V8SF_V16BF_UHI:
11263 case V8BF_FTYPE_V4SF_V4SF_V8BF_UQI:
2bf6d935
ML
11264 nargs = 4;
11265 break;
11266 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
11267 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
11268 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
11269 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
11270 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
8643bcba 11271 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT:
2bf6d935
ML
11272 nargs = 4;
11273 nargs_constant = 1;
11274 break;
11275 case UQI_FTYPE_V4DI_V4DI_INT_UQI:
11276 case UQI_FTYPE_V8SI_V8SI_INT_UQI:
11277 case QI_FTYPE_V4DF_V4DF_INT_UQI:
11278 case QI_FTYPE_V8SF_V8SF_INT_UQI:
0f200733 11279 case UHI_FTYPE_V16HF_V16HF_INT_UHI:
2bf6d935
ML
11280 case UQI_FTYPE_V2DI_V2DI_INT_UQI:
11281 case UQI_FTYPE_V4SI_V4SI_INT_UQI:
11282 case UQI_FTYPE_V2DF_V2DF_INT_UQI:
11283 case UQI_FTYPE_V4SF_V4SF_INT_UQI:
0f200733 11284 case UQI_FTYPE_V8HF_V8HF_INT_UQI:
2bf6d935
ML
11285 case UDI_FTYPE_V64QI_V64QI_INT_UDI:
11286 case USI_FTYPE_V32QI_V32QI_INT_USI:
11287 case UHI_FTYPE_V16QI_V16QI_INT_UHI:
11288 case USI_FTYPE_V32HI_V32HI_INT_USI:
0f200733 11289 case USI_FTYPE_V32HF_V32HF_INT_USI:
2bf6d935
ML
11290 case UHI_FTYPE_V16HI_V16HI_INT_UHI:
11291 case UQI_FTYPE_V8HI_V8HI_INT_UQI:
2bf6d935
ML
11292 nargs = 4;
11293 mask_pos = 1;
11294 nargs_constant = 1;
11295 break;
11296 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
11297 nargs = 4;
11298 nargs_constant = 2;
11299 break;
11300 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
11301 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
87235f1e 11302 case V16SF_FTYPE_V16SF_V32BF_V32BF_UHI:
11303 case V8SF_FTYPE_V8SF_V16BF_V16BF_UQI:
11304 case V4SF_FTYPE_V4SF_V8BF_V8BF_UQI:
2bf6d935
ML
11305 nargs = 4;
11306 break;
11307 case UQI_FTYPE_V8DI_V8DI_INT_UQI:
11308 case UHI_FTYPE_V16SI_V16SI_INT_UHI:
11309 mask_pos = 1;
11310 nargs = 4;
11311 nargs_constant = 1;
11312 break;
11313 case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
11314 case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
11315 case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
11316 case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
11317 case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
11318 case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
11319 case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
11320 case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
11321 case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
11322 case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
11323 case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
11324 case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
11325 case V32HI_FTYPE_V32HI_INT_V32HI_USI:
11326 case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
11327 case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
11328 case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
11329 case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
11330 case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
11331 case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
11332 case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
11333 case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
11334 case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
11335 case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
11336 case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
11337 case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
8bed7617 11338 case V16HF_FTYPE_V16HF_INT_V16HF_UHI:
11339 case V8HF_FTYPE_V8HF_INT_V8HF_UQI:
2bf6d935
ML
11340 case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
11341 case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
11342 case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
11343 case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
11344 case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
11345 nargs = 4;
11346 mask_pos = 2;
11347 nargs_constant = 1;
11348 break;
11349 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
11350 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
11351 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
11352 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
11353 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
11354 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
11355 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
11356 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
11357 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
11358 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
11359 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
11360 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
11361 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
11362 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
11363 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
11364 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
11365 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
11366 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
11367 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
11368 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
11369 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
11370 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
11371 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
11372 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
11373 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
11374 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
11375 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
11376 nargs = 5;
11377 mask_pos = 2;
11378 nargs_constant = 1;
11379 break;
11380 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
11381 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
11382 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
11383 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
11384 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
11385 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
11386 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
11387 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
11388 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
11389 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
11390 nargs = 5;
11391 mask_pos = 1;
11392 nargs_constant = 1;
11393 break;
11394 case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
11395 case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
11396 case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
11397 case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT:
11398 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT:
11399 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT:
11400 case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT:
11401 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT:
11402 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT:
11403 case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT:
11404 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT:
11405 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT:
11406 nargs = 5;
11407 mask_pos = 1;
11408 nargs_constant = 2;
11409 break;
11410
11411 default:
11412 gcc_unreachable ();
11413 }
11414
715a8bc8 11415 gcc_assert (nargs <= ARRAY_SIZE (xops));
2bf6d935
ML
11416
11417 if (comparison != UNKNOWN)
11418 {
11419 gcc_assert (nargs == 2);
11420 return ix86_expand_sse_compare (d, exp, target, swap);
11421 }
11422
11423 if (rmode == VOIDmode || rmode == tmode)
11424 {
11425 if (optimize
11426 || target == 0
11427 || GET_MODE (target) != tmode
11428 || !insn_p->operand[0].predicate (target, tmode))
11429 target = gen_reg_rtx (tmode);
11430 else if (memory_operand (target, tmode))
11431 num_memory++;
11432 real_target = target;
11433 }
11434 else
11435 {
11436 real_target = gen_reg_rtx (tmode);
11437 target = lowpart_subreg (rmode, real_target, tmode);
11438 }
11439
11440 for (i = 0; i < nargs; i++)
11441 {
11442 tree arg = CALL_EXPR_ARG (exp, i);
11443 rtx op = expand_normal (arg);
11444 machine_mode mode = insn_p->operand[i + 1].mode;
11445 bool match = insn_p->operand[i + 1].predicate (op, mode);
11446
11447 if (second_arg_count && i == 1)
11448 {
11449 /* SIMD shift insns take either an 8-bit immediate or a
11450 register as the count, but the builtin functions take an
11451 int as the count. If the count doesn't match, put it in a
11452 register. The instructions use a 64-bit count; if op is
11453 only 32-bit, zero-extend it, since negative shift counts
11454 are undefined behavior and zero-extension is more
11455 efficient. */
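/* A hedged illustration (the intrinsic named here is only an example,
   not a claim about exactly which builtins reach this path): for

     __m128i r = _mm_slli_epi16 (v, c);

   where c is a run-time int rather than a literal, the count arrives
   as a 32-bit value; the code below zero-extends it to the insn's
   wider count mode and, if the predicate still rejects it, copies it
   into a register. */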
11456 if (!match)
11457 {
11458 if (SCALAR_INT_MODE_P (GET_MODE (op)))
11459 op = convert_modes (mode, GET_MODE (op), op, 1);
11460 else
11461 op = lowpart_subreg (mode, op, GET_MODE (op));
11462 if (!insn_p->operand[i + 1].predicate (op, mode))
11463 op = copy_to_reg (op);
11464 }
11465 }
11466 else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
11467 (!mask_pos && (nargs - i) <= nargs_constant))
11468 {
11469 if (!match)
11470 switch (icode)
11471 {
11472 case CODE_FOR_avx_vinsertf128v4di:
11473 case CODE_FOR_avx_vextractf128v4di:
11474 error ("the last argument must be a 1-bit immediate");
11475 return const0_rtx;
11476
11477 case CODE_FOR_avx512f_cmpv8di3_mask:
11478 case CODE_FOR_avx512f_cmpv16si3_mask:
11479 case CODE_FOR_avx512f_ucmpv8di3_mask:
11480 case CODE_FOR_avx512f_ucmpv16si3_mask:
11481 case CODE_FOR_avx512vl_cmpv4di3_mask:
11482 case CODE_FOR_avx512vl_cmpv8si3_mask:
11483 case CODE_FOR_avx512vl_ucmpv4di3_mask:
11484 case CODE_FOR_avx512vl_ucmpv8si3_mask:
11485 case CODE_FOR_avx512vl_cmpv2di3_mask:
11486 case CODE_FOR_avx512vl_cmpv4si3_mask:
11487 case CODE_FOR_avx512vl_ucmpv2di3_mask:
11488 case CODE_FOR_avx512vl_ucmpv4si3_mask:
11489 error ("the last argument must be a 3-bit immediate");
11490 return const0_rtx;
11491
11492 case CODE_FOR_sse4_1_roundsd:
11493 case CODE_FOR_sse4_1_roundss:
11494
11495 case CODE_FOR_sse4_1_roundpd:
11496 case CODE_FOR_sse4_1_roundps:
11497 case CODE_FOR_avx_roundpd256:
11498 case CODE_FOR_avx_roundps256:
11499
11500 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
11501 case CODE_FOR_sse4_1_roundps_sfix:
11502 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
11503 case CODE_FOR_avx_roundps_sfix256:
11504
11505 case CODE_FOR_sse4_1_blendps:
11506 case CODE_FOR_avx_blendpd256:
11507 case CODE_FOR_avx_vpermilv4df:
11508 case CODE_FOR_avx_vpermilv4df_mask:
11509 case CODE_FOR_avx512f_getmantv8df_mask:
11510 case CODE_FOR_avx512f_getmantv16sf_mask:
8486e9f2 11511 case CODE_FOR_avx512vl_getmantv16hf_mask:
2bf6d935
ML
11512 case CODE_FOR_avx512vl_getmantv8sf_mask:
11513 case CODE_FOR_avx512vl_getmantv4df_mask:
8486e9f2 11514 case CODE_FOR_avx512fp16_getmantv8hf_mask:
2bf6d935
ML
11515 case CODE_FOR_avx512vl_getmantv4sf_mask:
11516 case CODE_FOR_avx512vl_getmantv2df_mask:
11517 case CODE_FOR_avx512dq_rangepv8df_mask_round:
11518 case CODE_FOR_avx512dq_rangepv16sf_mask_round:
11519 case CODE_FOR_avx512dq_rangepv4df_mask:
11520 case CODE_FOR_avx512dq_rangepv8sf_mask:
11521 case CODE_FOR_avx512dq_rangepv2df_mask:
11522 case CODE_FOR_avx512dq_rangepv4sf_mask:
11523 case CODE_FOR_avx_shufpd256_mask:
11524 error ("the last argument must be a 4-bit immediate");
11525 return const0_rtx;
11526
11527 case CODE_FOR_sha1rnds4:
11528 case CODE_FOR_sse4_1_blendpd:
11529 case CODE_FOR_avx_vpermilv2df:
11530 case CODE_FOR_avx_vpermilv2df_mask:
11531 case CODE_FOR_xop_vpermil2v2df3:
11532 case CODE_FOR_xop_vpermil2v4sf3:
11533 case CODE_FOR_xop_vpermil2v4df3:
11534 case CODE_FOR_xop_vpermil2v8sf3:
11535 case CODE_FOR_avx512f_vinsertf32x4_mask:
11536 case CODE_FOR_avx512f_vinserti32x4_mask:
11537 case CODE_FOR_avx512f_vextractf32x4_mask:
11538 case CODE_FOR_avx512f_vextracti32x4_mask:
11539 case CODE_FOR_sse2_shufpd:
11540 case CODE_FOR_sse2_shufpd_mask:
11541 case CODE_FOR_avx512dq_shuf_f64x2_mask:
11542 case CODE_FOR_avx512dq_shuf_i64x2_mask:
11543 case CODE_FOR_avx512vl_shuf_i32x4_mask:
11544 case CODE_FOR_avx512vl_shuf_f32x4_mask:
11545 error ("the last argument must be a 2-bit immediate");
11546 return const0_rtx;
11547
11548 case CODE_FOR_avx_vextractf128v4df:
11549 case CODE_FOR_avx_vextractf128v8sf:
11550 case CODE_FOR_avx_vextractf128v8si:
11551 case CODE_FOR_avx_vinsertf128v4df:
11552 case CODE_FOR_avx_vinsertf128v8sf:
11553 case CODE_FOR_avx_vinsertf128v8si:
11554 case CODE_FOR_avx512f_vinsertf64x4_mask:
11555 case CODE_FOR_avx512f_vinserti64x4_mask:
11556 case CODE_FOR_avx512f_vextractf64x4_mask:
11557 case CODE_FOR_avx512f_vextracti64x4_mask:
11558 case CODE_FOR_avx512dq_vinsertf32x8_mask:
11559 case CODE_FOR_avx512dq_vinserti32x8_mask:
11560 case CODE_FOR_avx512vl_vinsertv4df:
11561 case CODE_FOR_avx512vl_vinsertv4di:
11562 case CODE_FOR_avx512vl_vinsertv8sf:
11563 case CODE_FOR_avx512vl_vinsertv8si:
11564 error ("the last argument must be a 1-bit immediate");
11565 return const0_rtx;
11566
11567 case CODE_FOR_avx_vmcmpv2df3:
11568 case CODE_FOR_avx_vmcmpv4sf3:
11569 case CODE_FOR_avx_cmpv2df3:
11570 case CODE_FOR_avx_cmpv4sf3:
11571 case CODE_FOR_avx_cmpv4df3:
11572 case CODE_FOR_avx_cmpv8sf3:
11573 case CODE_FOR_avx512f_cmpv8df3_mask:
11574 case CODE_FOR_avx512f_cmpv16sf3_mask:
11575 case CODE_FOR_avx512f_vmcmpv2df3_mask:
11576 case CODE_FOR_avx512f_vmcmpv4sf3_mask:
0f200733 11577 case CODE_FOR_avx512bw_cmpv32hf3_mask:
11578 case CODE_FOR_avx512vl_cmpv16hf3_mask:
11579 case CODE_FOR_avx512fp16_cmpv8hf3_mask:
2bf6d935
ML
11580 error ("the last argument must be a 5-bit immediate");
11581 return const0_rtx;
11582
11583 default:
11584 switch (nargs_constant)
11585 {
11586 case 2:
11587 if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
11588 (!mask_pos && (nargs - i) == nargs_constant))
11589 {
11590 error ("the next to last argument must be an 8-bit immediate");
11591 break;
11592 }
11593 /* FALLTHRU */
11594 case 1:
11595 error ("the last argument must be an 8-bit immediate");
11596 break;
11597 default:
11598 gcc_unreachable ();
11599 }
11600 return const0_rtx;
11601 }
11602 }
11603 else
11604 {
11605 if (VECTOR_MODE_P (mode))
11606 op = safe_vector_operand (op, mode);
11607
11608 /* If we aren't optimizing, only allow one memory operand to
11609 be generated. */
11610 if (memory_operand (op, mode))
11611 num_memory++;
11612
11613 op = fixup_modeless_constant (op, mode);
11614
11615 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
11616 {
11617 if (optimize || !match || num_memory > 1)
11618 op = copy_to_mode_reg (mode, op);
11619 }
11620 else
11621 {
11622 op = copy_to_reg (op);
11623 op = lowpart_subreg (mode, op, GET_MODE (op));
11624 }
11625 }
11626
715a8bc8 11627 xops[i] = op;
2bf6d935
ML
11628 }
11629
11630 switch (nargs)
11631 {
11632 case 1:
715a8bc8 11633 pat = GEN_FCN (icode) (real_target, xops[0]);
2bf6d935
ML
11634 break;
11635 case 2:
715a8bc8 11636 pat = GEN_FCN (icode) (real_target, xops[0], xops[1]);
2bf6d935
ML
11637 break;
11638 case 3:
715a8bc8 11639 pat = GEN_FCN (icode) (real_target, xops[0], xops[1], xops[2]);
2bf6d935
ML
11640 break;
11641 case 4:
715a8bc8
UB
11642 pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
11643 xops[2], xops[3]);
2bf6d935
ML
11644 break;
11645 case 5:
715a8bc8
UB
11646 pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
11647 xops[2], xops[3], xops[4]);
2bf6d935
ML
11648 break;
11649 case 6:
715a8bc8
UB
11650 pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
11651 xops[2], xops[3], xops[4], xops[5]);
2bf6d935
ML
11652 break;
11653 default:
11654 gcc_unreachable ();
11655 }
11656
11657 if (! pat)
11658 return 0;
11659
11660 emit_insn (pat);
11661 return target;
11662}
11663
11664/* Transform a pattern of the following layout:
11665 (set A
11666 (unspec [B C]
11667 UNSPEC_EMBEDDED_ROUNDING))
11668 into:
11669 (set A B) */
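/* Schematically (a hedged sketch; the mode, operation and operands are
   illustrative only, not taken from any particular pattern):

     (set (reg:V2DF x)
          (unspec:V2DF [(plus:V2DF (reg:V2DF a) (reg:V2DF b))
                        (const_int R)]
                       UNSPEC_EMBEDDED_ROUNDING))

   becomes

     (set (reg:V2DF x) (plus:V2DF (reg:V2DF a) (reg:V2DF b)))  */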
11670
11671static rtx
11672ix86_erase_embedded_rounding (rtx pat)
11673{
11674 if (GET_CODE (pat) == INSN)
11675 pat = PATTERN (pat);
11676
11677 gcc_assert (GET_CODE (pat) == SET);
11678 rtx src = SET_SRC (pat);
11679 gcc_assert (XVECLEN (src, 0) == 2);
11680 rtx p0 = XVECEXP (src, 0, 0);
11681 gcc_assert (GET_CODE (src) == UNSPEC
11682 && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
11683 rtx res = gen_rtx_SET (SET_DEST (pat), p0);
11684 return res;
11685}
11686
11687/* Subroutine of ix86_expand_round_builtin to take care of comi insns
11688 with rounding. */
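/* For context, a hedged example (the intrinsic spelling is an
   assumption based on avx512fintrin.h rather than anything stated
   here):

     int k = _mm_comi_round_sd (a, b, _CMP_GT_OS, _MM_FROUND_NO_EXC);

   arrives with the comparison predicate as the third argument and the
   rounding/SAE control as the fourth. */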
11689static rtx
11690ix86_expand_sse_comi_round (const struct builtin_description *d,
11691 tree exp, rtx target)
11692{
11693 rtx pat, set_dst;
11694 tree arg0 = CALL_EXPR_ARG (exp, 0);
11695 tree arg1 = CALL_EXPR_ARG (exp, 1);
11696 tree arg2 = CALL_EXPR_ARG (exp, 2);
11697 tree arg3 = CALL_EXPR_ARG (exp, 3);
11698 rtx op0 = expand_normal (arg0);
11699 rtx op1 = expand_normal (arg1);
11700 rtx op2 = expand_normal (arg2);
11701 rtx op3 = expand_normal (arg3);
11702 enum insn_code icode = d->icode;
11703 const struct insn_data_d *insn_p = &insn_data[icode];
11704 machine_mode mode0 = insn_p->operand[0].mode;
11705 machine_mode mode1 = insn_p->operand[1].mode;
2bf6d935
ML
11706
11707 /* See avxintrin.h for values. */
467e9f38 11708 static const enum rtx_code comparisons[32] =
2bf6d935 11709 {
467e9f38
L
11710 EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
11711 UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED,
11712 EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
11713 UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED
2bf6d935 11714 };
467e9f38
L
11715 static const bool ordereds[32] =
11716 {
11717 true, true, true, false, false, false, false, true,
11718 false, false, false, true, true, true, true, false,
11719 true, true, true, false, false, false, false, true,
11720 false, false, false, true, true, true, true, false
11721 };
11722 static const bool non_signalings[32] =
2bf6d935
ML
11723 {
11724 true, false, false, true, true, false, false, true,
11725 true, false, false, true, true, false, false, true,
11726 false, true, true, false, false, true, true, false,
11727 false, true, true, false, false, true, true, false
11728 };
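/* A worked reading of the three tables (predicate values follow
   avxintrin.h): _CMP_EQ_OQ (0) gives EQ, ordered, non-signaling, so
   the UCOMI form is chosen below; _CMP_LT_OS (1) gives LT, ordered,
   signaling, so the COMI form is chosen. */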
11729
11730 if (!CONST_INT_P (op2))
11731 {
11732 error ("the third argument must be a comparison constant");
11733 return const0_rtx;
11734 }
11735 if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
11736 {
11737 error ("incorrect comparison mode");
11738 return const0_rtx;
11739 }
11740
11741 if (!insn_p->operand[2].predicate (op3, SImode))
11742 {
11743 error ("incorrect rounding operand");
11744 return const0_rtx;
11745 }
11746
2bf6d935
ML
11747 if (VECTOR_MODE_P (mode0))
11748 op0 = safe_vector_operand (op0, mode0);
11749 if (VECTOR_MODE_P (mode1))
11750 op1 = safe_vector_operand (op1, mode1);
11751
467e9f38
L
11752 enum rtx_code comparison = comparisons[INTVAL (op2)];
11753 bool ordered = ordereds[INTVAL (op2)];
11754 bool non_signaling = non_signalings[INTVAL (op2)];
11755 rtx const_val = const0_rtx;
11756
11757 bool check_unordered = false;
11758 machine_mode mode = CCFPmode;
11759 switch (comparison)
11760 {
11761 case ORDERED:
11762 if (!ordered)
11763 {
11764 /* NB: Use CCSmode/NE for _CMP_TRUE_UQ/_CMP_TRUE_US. */
11765 if (!non_signaling)
11766 ordered = true;
11767 mode = CCSmode;
11768 }
11769 else
11770 {
11771 /* NB: Use CCPmode/NE for _CMP_ORD_Q/_CMP_ORD_S. */
11772 if (non_signaling)
11773 ordered = false;
11774 mode = CCPmode;
11775 }
11776 comparison = NE;
11777 break;
11778 case UNORDERED:
11779 if (ordered)
11780 {
11781 /* NB: Use CCSmode/EQ for _CMP_FALSE_OQ/_CMP_FALSE_OS. */
11782 if (non_signaling)
11783 ordered = false;
11784 mode = CCSmode;
11785 }
11786 else
11787 {
11788 /* NB: Use CCPmode/NE for _CMP_UNORD_Q/_CMP_UNORD_S. */
11789 if (!non_signaling)
11790 ordered = true;
11791 mode = CCPmode;
11792 }
11793 comparison = EQ;
11794 break;
11795
11796 case LE: /* -> GE */
11797 case LT: /* -> GT */
11798 case UNGE: /* -> UNLE */
11799 case UNGT: /* -> UNLT */
11800 std::swap (op0, op1);
11801 comparison = swap_condition (comparison);
11802 /* FALLTHRU */
11803 case GT:
11804 case GE:
11805 case UNEQ:
11806 case UNLT:
11807 case UNLE:
11808 case LTGT:
11809 /* These are supported by CCFPmode. NB: Use ordered/signaling
11810 COMI or unordered/non-signaling UCOMI. Both set ZF, PF, CF
11811 with NAN operands. */
11812 if (ordered == non_signaling)
11813 ordered = !ordered;
11814 break;
11815 case EQ:
11816 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
11817 _CMP_EQ_OQ/_CMP_EQ_OS. */
11818 check_unordered = true;
11819 mode = CCZmode;
11820 break;
11821 case NE:
11822 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
11823 _CMP_NEQ_UQ/_CMP_NEQ_US. */
11824 gcc_assert (!ordered);
11825 check_unordered = true;
11826 mode = CCZmode;
11827 const_val = const1_rtx;
11828 break;
11829 default:
11830 gcc_unreachable ();
11831 }
11832
2bf6d935 11833 target = gen_reg_rtx (SImode);
467e9f38 11834 emit_move_insn (target, const_val);
2bf6d935
ML
11835 target = gen_rtx_SUBREG (QImode, target, 0);
11836
11837 if ((optimize && !register_operand (op0, mode0))
11838 || !insn_p->operand[0].predicate (op0, mode0))
11839 op0 = copy_to_mode_reg (mode0, op0);
11840 if ((optimize && !register_operand (op1, mode1))
11841 || !insn_p->operand[1].predicate (op1, mode1))
11842 op1 = copy_to_mode_reg (mode1, op1);
11843
467e9f38
L
11844 /*
11845 1. COMI: ordered and signaling.
11846 2. UCOMI: unordered and non-signaling.
11847 */
11848 if (non_signaling)
11849 icode = (icode == CODE_FOR_sse_comi_round
11850 ? CODE_FOR_sse_ucomi_round
11851 : CODE_FOR_sse2_ucomi_round);
2bf6d935
ML
11852
11853 pat = GEN_FCN (icode) (op0, op1, op3);
11854 if (! pat)
11855 return 0;
11856
11857 /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
11858 if (INTVAL (op3) == NO_ROUND)
11859 {
11860 pat = ix86_erase_embedded_rounding (pat);
11861 if (! pat)
11862 return 0;
11863
11864 set_dst = SET_DEST (pat);
11865 }
11866 else
11867 {
11868 gcc_assert (GET_CODE (pat) == SET);
11869 set_dst = SET_DEST (pat);
11870 }
11871
11872 emit_insn (pat);
467e9f38 11873
ae69e6f6 11874 return ix86_ssecom_setcc (comparison, check_unordered, mode,
11875 set_dst, target);
2bf6d935
ML
11876}
11877
11878static rtx
11879ix86_expand_round_builtin (const struct builtin_description *d,
11880 tree exp, rtx target)
11881{
11882 rtx pat;
11883 unsigned int i, nargs;
715a8bc8 11884 rtx xops[6];
2bf6d935
ML
11885 enum insn_code icode = d->icode;
11886 const struct insn_data_d *insn_p = &insn_data[icode];
11887 machine_mode tmode = insn_p->operand[0].mode;
11888 unsigned int nargs_constant = 0;
11889 unsigned int redundant_embed_rnd = 0;
11890
11891 switch ((enum ix86_builtin_func_type) d->flag)
11892 {
11893 case UINT64_FTYPE_V2DF_INT:
11894 case UINT64_FTYPE_V4SF_INT:
3069a2e5 11895 case UINT64_FTYPE_V8HF_INT:
2bf6d935
ML
11896 case UINT_FTYPE_V2DF_INT:
11897 case UINT_FTYPE_V4SF_INT:
3069a2e5 11898 case UINT_FTYPE_V8HF_INT:
2bf6d935
ML
11899 case INT64_FTYPE_V2DF_INT:
11900 case INT64_FTYPE_V4SF_INT:
3069a2e5 11901 case INT64_FTYPE_V8HF_INT:
2bf6d935
ML
11902 case INT_FTYPE_V2DF_INT:
11903 case INT_FTYPE_V4SF_INT:
3069a2e5 11904 case INT_FTYPE_V8HF_INT:
2bf6d935
ML
11905 nargs = 2;
11906 break;
bd7a34ef 11907 case V32HF_FTYPE_V32HF_V32HF_INT:
71838266 11908 case V8HF_FTYPE_V8HF_V8HF_INT:
3069a2e5 11909 case V8HF_FTYPE_V8HF_INT_INT:
11910 case V8HF_FTYPE_V8HF_UINT_INT:
11911 case V8HF_FTYPE_V8HF_INT64_INT:
11912 case V8HF_FTYPE_V8HF_UINT64_INT:
2bf6d935
ML
11913 case V4SF_FTYPE_V4SF_UINT_INT:
11914 case V4SF_FTYPE_V4SF_UINT64_INT:
11915 case V2DF_FTYPE_V2DF_UINT64_INT:
11916 case V4SF_FTYPE_V4SF_INT_INT:
11917 case V4SF_FTYPE_V4SF_INT64_INT:
11918 case V2DF_FTYPE_V2DF_INT64_INT:
11919 case V4SF_FTYPE_V4SF_V4SF_INT:
11920 case V2DF_FTYPE_V2DF_V2DF_INT:
11921 case V4SF_FTYPE_V4SF_V2DF_INT:
11922 case V2DF_FTYPE_V2DF_V4SF_INT:
11923 nargs = 3;
11924 break;
11925 case V8SF_FTYPE_V8DF_V8SF_QI_INT:
11926 case V8DF_FTYPE_V8DF_V8DF_QI_INT:
bd610db0 11927 case V32HI_FTYPE_V32HF_V32HI_USI_INT:
2bf6d935 11928 case V8SI_FTYPE_V8DF_V8SI_QI_INT:
bd610db0 11929 case V8DI_FTYPE_V8HF_V8DI_UQI_INT:
2bf6d935
ML
11930 case V8DI_FTYPE_V8DF_V8DI_QI_INT:
11931 case V8SF_FTYPE_V8DI_V8SF_QI_INT:
11932 case V8DF_FTYPE_V8DI_V8DF_QI_INT:
5a744e50 11933 case V8DF_FTYPE_V8HF_V8DF_UQI_INT:
11934 case V16SF_FTYPE_V16HF_V16SF_UHI_INT:
be0e4c32 11935 case V32HF_FTYPE_V32HI_V32HF_USI_INT:
4204740f 11936 case V32HF_FTYPE_V32HF_V32HF_USI_INT:
081070bc 11937 case V32HF_FTYPE_V32HF_V32HF_V32HF_INT:
2bf6d935
ML
11938 case V16SF_FTYPE_V16SF_V16SF_HI_INT:
11939 case V8DI_FTYPE_V8SF_V8DI_QI_INT:
11940 case V16SF_FTYPE_V16SI_V16SF_HI_INT:
11941 case V16SI_FTYPE_V16SF_V16SI_HI_INT:
bd610db0 11942 case V16SI_FTYPE_V16HF_V16SI_UHI_INT:
be0e4c32 11943 case V16HF_FTYPE_V16SI_V16HF_UHI_INT:
2bf6d935
ML
11944 case V8DF_FTYPE_V8SF_V8DF_QI_INT:
11945 case V16SF_FTYPE_V16HI_V16SF_HI_INT:
11946 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
11947 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
be0e4c32 11948 case V8HF_FTYPE_V8DI_V8HF_UQI_INT:
5a744e50 11949 case V8HF_FTYPE_V8DF_V8HF_UQI_INT:
11950 case V16HF_FTYPE_V16SF_V16HF_UHI_INT:
3c9de0a9 11951 case V8HF_FTYPE_V8HF_V8HF_V8HF_INT:
2bf6d935
ML
11952 nargs = 4;
11953 break;
11954 case V4SF_FTYPE_V4SF_V4SF_INT_INT:
11955 case V2DF_FTYPE_V2DF_V2DF_INT_INT:
11956 nargs_constant = 2;
11957 nargs = 4;
11958 break;
11959 case INT_FTYPE_V4SF_V4SF_INT_INT:
11960 case INT_FTYPE_V2DF_V2DF_INT_INT:
11961 return ix86_expand_sse_comi_round (d, exp, target);
11962 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
11963 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
11964 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
90429b96 11965 case V4SF_FTYPE_V8HF_V4SF_V4SF_UQI_INT:
2bf6d935 11966 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
081070bc 11967 case V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT:
bd7a34ef 11968 case V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT:
90429b96 11969 case V2DF_FTYPE_V8HF_V2DF_V2DF_UQI_INT:
2bf6d935
ML
11970 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
11971 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
93103603 11972 case V2DF_FTYPE_V2DF_V4SF_V2DF_UQI_INT:
2bf6d935
ML
11973 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
11974 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
93103603 11975 case V4SF_FTYPE_V4SF_V2DF_V4SF_UQI_INT:
71838266 11976 case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT:
90429b96 11977 case V8HF_FTYPE_V2DF_V8HF_V8HF_UQI_INT:
11978 case V8HF_FTYPE_V4SF_V8HF_V8HF_UQI_INT:
2bf6d935
ML
11979 nargs = 5;
11980 break;
8bed7617 11981 case V32HF_FTYPE_V32HF_INT_V32HF_USI_INT:
2bf6d935
ML
11982 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
11983 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
93103603
SP
11984 case V8DF_FTYPE_V8DF_INT_V8DF_UQI_INT:
11985 case V16SF_FTYPE_V16SF_INT_V16SF_UHI_INT:
2bf6d935
ML
11986 nargs_constant = 4;
11987 nargs = 5;
11988 break;
11989 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
11990 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
11991 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
11992 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
0f200733 11993 case USI_FTYPE_V32HF_V32HF_INT_USI_INT:
11994 case UQI_FTYPE_V8HF_V8HF_INT_UQI_INT:
2bf6d935
ML
11995 nargs_constant = 3;
11996 nargs = 5;
11997 break;
11998 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
11999 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
12000 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
12001 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
12002 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
12003 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
8bed7617 12004 case V8HF_FTYPE_V8HF_V8HF_INT_V8HF_UQI_INT:
2bf6d935
ML
12005 nargs = 6;
12006 nargs_constant = 4;
12007 break;
12008 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
12009 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
12010 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
12011 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
12012 nargs = 6;
12013 nargs_constant = 3;
12014 break;
12015 default:
12016 gcc_unreachable ();
12017 }
715a8bc8 12018 gcc_assert (nargs <= ARRAY_SIZE (xops));
2bf6d935
ML
12019
12020 if (optimize
12021 || target == 0
12022 || GET_MODE (target) != tmode
12023 || !insn_p->operand[0].predicate (target, tmode))
12024 target = gen_reg_rtx (tmode);
12025
12026 for (i = 0; i < nargs; i++)
12027 {
12028 tree arg = CALL_EXPR_ARG (exp, i);
12029 rtx op = expand_normal (arg);
12030 machine_mode mode = insn_p->operand[i + 1].mode;
12031 bool match = insn_p->operand[i + 1].predicate (op, mode);
12032
12033 if (i == nargs - nargs_constant)
12034 {
12035 if (!match)
12036 {
12037 switch (icode)
12038 {
12039 case CODE_FOR_avx512f_getmantv8df_mask_round:
12040 case CODE_FOR_avx512f_getmantv16sf_mask_round:
8486e9f2 12041 case CODE_FOR_avx512bw_getmantv32hf_mask_round:
2bf6d935
ML
12042 case CODE_FOR_avx512f_vgetmantv2df_round:
12043 case CODE_FOR_avx512f_vgetmantv2df_mask_round:
12044 case CODE_FOR_avx512f_vgetmantv4sf_round:
12045 case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
8486e9f2 12046 case CODE_FOR_avx512f_vgetmantv8hf_mask_round:
2bf6d935
ML
12047 error ("the immediate argument must be a 4-bit immediate");
12048 return const0_rtx;
12049 case CODE_FOR_avx512f_cmpv8df3_mask_round:
12050 case CODE_FOR_avx512f_cmpv16sf3_mask_round:
12051 case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
12052 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
0f200733 12053 case CODE_FOR_avx512f_vmcmpv8hf3_mask_round:
12054 case CODE_FOR_avx512bw_cmpv32hf3_mask_round:
2bf6d935
ML
12055 error ("the immediate argument must be a 5-bit immediate");
12056 return const0_rtx;
12057 default:
12058 error ("the immediate argument must be an 8-bit immediate");
12059 return const0_rtx;
12060 }
12061 }
12062 }
11963 else if (i == nargs - 1)
12064 {
12065 if (!insn_p->operand[nargs].predicate (op, SImode))
12066 {
12067 error ("incorrect rounding operand");
12068 return const0_rtx;
12069 }
12070
12071 /* If there is no rounding, use the normal version of the pattern. */
12072 if (INTVAL (op) == NO_ROUND)
2f9529fc
HW
12073 {
12074 /* Skip erasing the embedded rounding for the expanders below,
12075 which generate multiple insns. In ix86_erase_embedded_rounding
12076 the pattern would be transformed into a single set, and emit_insn
12077 appends that set instead of inserting it into the chain, so the
12078 insns emitted inside the define_expand would be ignored. */
12079 switch (icode)
12080 {
12081 case CODE_FOR_avx512bw_fmaddc_v32hf_mask1_round:
12082 case CODE_FOR_avx512bw_fcmaddc_v32hf_mask1_round:
12083 case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask1_round:
12084 case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask1_round:
12085 case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask3_round:
12086 case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask3_round:
12087 redundant_embed_rnd = 0;
12088 break;
12089 default:
12090 redundant_embed_rnd = 1;
12091 break;
12092 }
12093 }
2bf6d935
ML
12094 }
12095 else
12096 {
12097 if (VECTOR_MODE_P (mode))
12098 op = safe_vector_operand (op, mode);
12099
12100 op = fixup_modeless_constant (op, mode);
12101
12102 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
12103 {
12104 if (optimize || !match)
12105 op = copy_to_mode_reg (mode, op);
12106 }
12107 else
12108 {
12109 op = copy_to_reg (op);
12110 op = lowpart_subreg (mode, op, GET_MODE (op));
12111 }
12112 }
12113
715a8bc8 12114 xops[i] = op;
2bf6d935
ML
12115 }
12116
12117 switch (nargs)
12118 {
12119 case 1:
715a8bc8 12120 pat = GEN_FCN (icode) (target, xops[0]);
2bf6d935
ML
12121 break;
12122 case 2:
715a8bc8 12123 pat = GEN_FCN (icode) (target, xops[0], xops[1]);
2bf6d935
ML
12124 break;
12125 case 3:
715a8bc8 12126 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
2bf6d935
ML
12127 break;
12128 case 4:
715a8bc8
UB
12129 pat = GEN_FCN (icode) (target, xops[0], xops[1],
12130 xops[2], xops[3]);
2bf6d935
ML
12131 break;
12132 case 5:
715a8bc8
UB
12133 pat = GEN_FCN (icode) (target, xops[0], xops[1],
12134 xops[2], xops[3], xops[4]);
2bf6d935
ML
12135 break;
12136 case 6:
715a8bc8
UB
12137 pat = GEN_FCN (icode) (target, xops[0], xops[1],
12138 xops[2], xops[3], xops[4], xops[5]);
2bf6d935
ML
12139 break;
12140 default:
12141 gcc_unreachable ();
12142 }
12143
12144 if (!pat)
12145 return 0;
12146
12147 if (redundant_embed_rnd)
12148 pat = ix86_erase_embedded_rounding (pat);
12149
12150 emit_insn (pat);
12151 return target;
12152}
12153
12154/* Subroutine of ix86_expand_builtin to take care of special insns
12155 with variable number of operands. */
12156
12157static rtx
12158ix86_expand_special_args_builtin (const struct builtin_description *d,
12159 tree exp, rtx target)
12160{
12161 tree arg;
12162 rtx pat, op;
12163 unsigned int i, nargs, arg_adjust, memory;
152834fe 12164 unsigned int constant = 100;
2bf6d935 12165 bool aligned_mem = false;
152834fe 12166 rtx xops[4];
2bf6d935 12167 enum insn_code icode = d->icode;
2bf6d935
ML
12168 const struct insn_data_d *insn_p = &insn_data[icode];
12169 machine_mode tmode = insn_p->operand[0].mode;
12170 enum { load, store } klass;
12171
12172 switch ((enum ix86_builtin_func_type) d->flag)
12173 {
12174 case VOID_FTYPE_VOID:
12175 emit_insn (GEN_FCN (icode) (target));
12176 return 0;
12177 case VOID_FTYPE_UINT64:
12178 case VOID_FTYPE_UNSIGNED:
12179 nargs = 0;
12180 klass = store;
12181 memory = 0;
12182 break;
12183
12184 case INT_FTYPE_VOID:
12185 case USHORT_FTYPE_VOID:
12186 case UINT64_FTYPE_VOID:
12187 case UINT_FTYPE_VOID:
299a53d7 12188 case UINT8_FTYPE_VOID:
2bf6d935
ML
12189 case UNSIGNED_FTYPE_VOID:
12190 nargs = 0;
12191 klass = load;
12192 memory = 0;
12193 break;
12194 case UINT64_FTYPE_PUNSIGNED:
12195 case V2DI_FTYPE_PV2DI:
12196 case V4DI_FTYPE_PV4DI:
12197 case V32QI_FTYPE_PCCHAR:
12198 case V16QI_FTYPE_PCCHAR:
12199 case V8SF_FTYPE_PCV4SF:
12200 case V8SF_FTYPE_PCFLOAT:
12201 case V4SF_FTYPE_PCFLOAT:
58685b93 12202 case V4SF_FTYPE_PCFLOAT16:
12203 case V4SF_FTYPE_PCBFLOAT16:
12204 case V4SF_FTYPE_PCV8BF:
12205 case V4SF_FTYPE_PCV8HF:
12206 case V8SF_FTYPE_PCFLOAT16:
12207 case V8SF_FTYPE_PCBFLOAT16:
12208 case V8SF_FTYPE_PCV16HF:
12209 case V8SF_FTYPE_PCV16BF:
2bf6d935
ML
12210 case V4DF_FTYPE_PCV2DF:
12211 case V4DF_FTYPE_PCDOUBLE:
12212 case V2DF_FTYPE_PCDOUBLE:
12213 case VOID_FTYPE_PVOID:
12214 case V8DI_FTYPE_PV8DI:
12215 nargs = 1;
12216 klass = load;
12217 memory = 0;
12218 switch (icode)
12219 {
12220 case CODE_FOR_sse4_1_movntdqa:
12221 case CODE_FOR_avx2_movntdqa:
12222 case CODE_FOR_avx512f_movntdqa:
12223 aligned_mem = true;
12224 break;
12225 default:
12226 break;
12227 }
12228 break;
12229 case VOID_FTYPE_PV2SF_V4SF:
12230 case VOID_FTYPE_PV8DI_V8DI:
12231 case VOID_FTYPE_PV4DI_V4DI:
12232 case VOID_FTYPE_PV2DI_V2DI:
12233 case VOID_FTYPE_PCHAR_V32QI:
12234 case VOID_FTYPE_PCHAR_V16QI:
12235 case VOID_FTYPE_PFLOAT_V16SF:
12236 case VOID_FTYPE_PFLOAT_V8SF:
12237 case VOID_FTYPE_PFLOAT_V4SF:
12238 case VOID_FTYPE_PDOUBLE_V8DF:
12239 case VOID_FTYPE_PDOUBLE_V4DF:
12240 case VOID_FTYPE_PDOUBLE_V2DF:
12241 case VOID_FTYPE_PLONGLONG_LONGLONG:
12242 case VOID_FTYPE_PULONGLONG_ULONGLONG:
12243 case VOID_FTYPE_PUNSIGNED_UNSIGNED:
12244 case VOID_FTYPE_PINT_INT:
12245 nargs = 1;
12246 klass = store;
12247 /* Reserve memory operand for target. */
715a8bc8 12248 memory = ARRAY_SIZE (xops);
2bf6d935
ML
12249 switch (icode)
12250 {
12251 /* These builtins and instructions require the memory
12252 to be properly aligned. */
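/* For instance (an illustrative mapping, not checked for every entry
   below): CODE_FOR_sse2_movntv2di backs _mm_stream_si128, whose
   destination must be 16-byte aligned, hence the MEM_ALIGN fixup
   performed later for aligned_mem builtins. */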
12253 case CODE_FOR_avx_movntv4di:
12254 case CODE_FOR_sse2_movntv2di:
12255 case CODE_FOR_avx_movntv8sf:
12256 case CODE_FOR_sse_movntv4sf:
12257 case CODE_FOR_sse4a_vmmovntv4sf:
12258 case CODE_FOR_avx_movntv4df:
12259 case CODE_FOR_sse2_movntv2df:
12260 case CODE_FOR_sse4a_vmmovntv2df:
12261 case CODE_FOR_sse2_movntidi:
12262 case CODE_FOR_sse_movntq:
12263 case CODE_FOR_sse2_movntisi:
12264 case CODE_FOR_avx512f_movntv16sf:
12265 case CODE_FOR_avx512f_movntv8df:
12266 case CODE_FOR_avx512f_movntv8di:
12267 aligned_mem = true;
12268 break;
12269 default:
12270 break;
12271 }
12272 break;
12273 case VOID_FTYPE_PVOID_PCVOID:
12274 nargs = 1;
12275 klass = store;
12276 memory = 0;
12277
12278 break;
12279 case V4SF_FTYPE_V4SF_PCV2SF:
12280 case V2DF_FTYPE_V2DF_PCDOUBLE:
12281 nargs = 2;
12282 klass = load;
12283 memory = 1;
12284 break;
12285 case V8SF_FTYPE_PCV8SF_V8SI:
12286 case V4DF_FTYPE_PCV4DF_V4DI:
12287 case V4SF_FTYPE_PCV4SF_V4SI:
12288 case V2DF_FTYPE_PCV2DF_V2DI:
12289 case V8SI_FTYPE_PCV8SI_V8SI:
12290 case V4DI_FTYPE_PCV4DI_V4DI:
12291 case V4SI_FTYPE_PCV4SI_V4SI:
12292 case V2DI_FTYPE_PCV2DI_V2DI:
12293 case VOID_FTYPE_INT_INT64:
12294 nargs = 2;
12295 klass = load;
12296 memory = 0;
12297 break;
12298 case VOID_FTYPE_PV8DF_V8DF_UQI:
12299 case VOID_FTYPE_PV4DF_V4DF_UQI:
12300 case VOID_FTYPE_PV2DF_V2DF_UQI:
12301 case VOID_FTYPE_PV16SF_V16SF_UHI:
12302 case VOID_FTYPE_PV8SF_V8SF_UQI:
12303 case VOID_FTYPE_PV4SF_V4SF_UQI:
12304 case VOID_FTYPE_PV8DI_V8DI_UQI:
12305 case VOID_FTYPE_PV4DI_V4DI_UQI:
12306 case VOID_FTYPE_PV2DI_V2DI_UQI:
12307 case VOID_FTYPE_PV16SI_V16SI_UHI:
12308 case VOID_FTYPE_PV8SI_V8SI_UQI:
12309 case VOID_FTYPE_PV4SI_V4SI_UQI:
12310 case VOID_FTYPE_PV64QI_V64QI_UDI:
12311 case VOID_FTYPE_PV32HI_V32HI_USI:
12312 case VOID_FTYPE_PV32QI_V32QI_USI:
12313 case VOID_FTYPE_PV16QI_V16QI_UHI:
12314 case VOID_FTYPE_PV16HI_V16HI_UHI:
12315 case VOID_FTYPE_PV8HI_V8HI_UQI:
12316 switch (icode)
12317 {
12318 /* These builtins and instructions require the memory
12319 to be properly aligned. */
12320 case CODE_FOR_avx512f_storev16sf_mask:
12321 case CODE_FOR_avx512f_storev16si_mask:
12322 case CODE_FOR_avx512f_storev8df_mask:
12323 case CODE_FOR_avx512f_storev8di_mask:
12324 case CODE_FOR_avx512vl_storev8sf_mask:
12325 case CODE_FOR_avx512vl_storev8si_mask:
12326 case CODE_FOR_avx512vl_storev4df_mask:
12327 case CODE_FOR_avx512vl_storev4di_mask:
12328 case CODE_FOR_avx512vl_storev4sf_mask:
12329 case CODE_FOR_avx512vl_storev4si_mask:
12330 case CODE_FOR_avx512vl_storev2df_mask:
12331 case CODE_FOR_avx512vl_storev2di_mask:
12332 aligned_mem = true;
12333 break;
12334 default:
12335 break;
12336 }
12337 /* FALLTHRU */
12338 case VOID_FTYPE_PV8SF_V8SI_V8SF:
12339 case VOID_FTYPE_PV4DF_V4DI_V4DF:
12340 case VOID_FTYPE_PV4SF_V4SI_V4SF:
12341 case VOID_FTYPE_PV2DF_V2DI_V2DF:
12342 case VOID_FTYPE_PV8SI_V8SI_V8SI:
12343 case VOID_FTYPE_PV4DI_V4DI_V4DI:
12344 case VOID_FTYPE_PV4SI_V4SI_V4SI:
12345 case VOID_FTYPE_PV2DI_V2DI_V2DI:
12346 case VOID_FTYPE_PV8SI_V8DI_UQI:
12347 case VOID_FTYPE_PV8HI_V8DI_UQI:
12348 case VOID_FTYPE_PV16HI_V16SI_UHI:
4a948703 12349 case VOID_FTYPE_PUDI_V8DI_UQI:
2bf6d935
ML
12350 case VOID_FTYPE_PV16QI_V16SI_UHI:
12351 case VOID_FTYPE_PV4SI_V4DI_UQI:
4a948703 12352 case VOID_FTYPE_PUDI_V2DI_UQI:
12353 case VOID_FTYPE_PUDI_V4DI_UQI:
12354 case VOID_FTYPE_PUSI_V2DI_UQI:
2bf6d935 12355 case VOID_FTYPE_PV8HI_V8SI_UQI:
4a948703 12356 case VOID_FTYPE_PUDI_V4SI_UQI:
12357 case VOID_FTYPE_PUSI_V4DI_UQI:
12358 case VOID_FTYPE_PUHI_V2DI_UQI:
12359 case VOID_FTYPE_PUDI_V8SI_UQI:
12360 case VOID_FTYPE_PUSI_V4SI_UQI:
2bf6d935
ML
12361 case VOID_FTYPE_PCHAR_V64QI_UDI:
12362 case VOID_FTYPE_PCHAR_V32QI_USI:
12363 case VOID_FTYPE_PCHAR_V16QI_UHI:
12364 case VOID_FTYPE_PSHORT_V32HI_USI:
12365 case VOID_FTYPE_PSHORT_V16HI_UHI:
12366 case VOID_FTYPE_PSHORT_V8HI_UQI:
12367 case VOID_FTYPE_PINT_V16SI_UHI:
12368 case VOID_FTYPE_PINT_V8SI_UQI:
12369 case VOID_FTYPE_PINT_V4SI_UQI:
12370 case VOID_FTYPE_PINT64_V8DI_UQI:
12371 case VOID_FTYPE_PINT64_V4DI_UQI:
12372 case VOID_FTYPE_PINT64_V2DI_UQI:
12373 case VOID_FTYPE_PDOUBLE_V8DF_UQI:
12374 case VOID_FTYPE_PDOUBLE_V4DF_UQI:
12375 case VOID_FTYPE_PDOUBLE_V2DF_UQI:
12376 case VOID_FTYPE_PFLOAT_V16SF_UHI:
12377 case VOID_FTYPE_PFLOAT_V8SF_UQI:
12378 case VOID_FTYPE_PFLOAT_V4SF_UQI:
c4d423c7 12379 case VOID_FTYPE_PCFLOAT16_V8HF_UQI:
2bf6d935
ML
12380 case VOID_FTYPE_PV32QI_V32HI_USI:
12381 case VOID_FTYPE_PV16QI_V16HI_UHI:
4a948703 12382 case VOID_FTYPE_PUDI_V8HI_UQI:
2bf6d935
ML
12383 nargs = 2;
12384 klass = store;
12385 /* Reserve memory operand for target. */
715a8bc8 12386 memory = ARRAY_SIZE (xops);
2bf6d935
ML
12387 break;
12388 case V4SF_FTYPE_PCV4SF_V4SF_UQI:
12389 case V8SF_FTYPE_PCV8SF_V8SF_UQI:
12390 case V16SF_FTYPE_PCV16SF_V16SF_UHI:
12391 case V4SI_FTYPE_PCV4SI_V4SI_UQI:
12392 case V8SI_FTYPE_PCV8SI_V8SI_UQI:
12393 case V16SI_FTYPE_PCV16SI_V16SI_UHI:
12394 case V2DF_FTYPE_PCV2DF_V2DF_UQI:
12395 case V4DF_FTYPE_PCV4DF_V4DF_UQI:
12396 case V8DF_FTYPE_PCV8DF_V8DF_UQI:
12397 case V2DI_FTYPE_PCV2DI_V2DI_UQI:
12398 case V4DI_FTYPE_PCV4DI_V4DI_UQI:
12399 case V8DI_FTYPE_PCV8DI_V8DI_UQI:
12400 case V64QI_FTYPE_PCV64QI_V64QI_UDI:
12401 case V32HI_FTYPE_PCV32HI_V32HI_USI:
12402 case V32QI_FTYPE_PCV32QI_V32QI_USI:
12403 case V16QI_FTYPE_PCV16QI_V16QI_UHI:
12404 case V16HI_FTYPE_PCV16HI_V16HI_UHI:
12405 case V8HI_FTYPE_PCV8HI_V8HI_UQI:
12406 switch (icode)
12407 {
12408 /* These builtins and instructions require the memory
12409 to be properly aligned. */
12410 case CODE_FOR_avx512f_loadv16sf_mask:
12411 case CODE_FOR_avx512f_loadv16si_mask:
12412 case CODE_FOR_avx512f_loadv8df_mask:
12413 case CODE_FOR_avx512f_loadv8di_mask:
12414 case CODE_FOR_avx512vl_loadv8sf_mask:
12415 case CODE_FOR_avx512vl_loadv8si_mask:
12416 case CODE_FOR_avx512vl_loadv4df_mask:
12417 case CODE_FOR_avx512vl_loadv4di_mask:
12418 case CODE_FOR_avx512vl_loadv4sf_mask:
12419 case CODE_FOR_avx512vl_loadv4si_mask:
12420 case CODE_FOR_avx512vl_loadv2df_mask:
12421 case CODE_FOR_avx512vl_loadv2di_mask:
12422 case CODE_FOR_avx512bw_loadv64qi_mask:
12423 case CODE_FOR_avx512vl_loadv32qi_mask:
12424 case CODE_FOR_avx512vl_loadv16qi_mask:
12425 case CODE_FOR_avx512bw_loadv32hi_mask:
12426 case CODE_FOR_avx512vl_loadv16hi_mask:
12427 case CODE_FOR_avx512vl_loadv8hi_mask:
12428 aligned_mem = true;
12429 break;
12430 default:
12431 break;
12432 }
12433 /* FALLTHRU */
12434 case V64QI_FTYPE_PCCHAR_V64QI_UDI:
12435 case V32QI_FTYPE_PCCHAR_V32QI_USI:
12436 case V16QI_FTYPE_PCCHAR_V16QI_UHI:
12437 case V32HI_FTYPE_PCSHORT_V32HI_USI:
12438 case V16HI_FTYPE_PCSHORT_V16HI_UHI:
12439 case V8HI_FTYPE_PCSHORT_V8HI_UQI:
12440 case V16SI_FTYPE_PCINT_V16SI_UHI:
12441 case V8SI_FTYPE_PCINT_V8SI_UQI:
12442 case V4SI_FTYPE_PCINT_V4SI_UQI:
12443 case V8DI_FTYPE_PCINT64_V8DI_UQI:
12444 case V4DI_FTYPE_PCINT64_V4DI_UQI:
12445 case V2DI_FTYPE_PCINT64_V2DI_UQI:
12446 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
12447 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
12448 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
12449 case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
12450 case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
12451 case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
c4d423c7 12452 case V8HF_FTYPE_PCFLOAT16_V8HF_UQI:
2bf6d935
ML
12453 nargs = 3;
12454 klass = load;
12455 memory = 0;
12456 break;
152834fe
HJ
12457 case INT_FTYPE_PINT_INT_INT_INT:
12458 case LONGLONG_FTYPE_PLONGLONG_LONGLONG_LONGLONG_INT:
12459 nargs = 4;
12460 klass = load;
12461 memory = 0;
12462 constant = 3;
12463 break;
2bf6d935
ML
12464 default:
12465 gcc_unreachable ();
12466 }
12467
715a8bc8 12468 gcc_assert (nargs <= ARRAY_SIZE (xops));
2bf6d935
ML
12469
12470 if (klass == store)
12471 {
12472 arg = CALL_EXPR_ARG (exp, 0);
12473 op = expand_normal (arg);
12474 gcc_assert (target == 0);
12475 if (memory)
12476 {
12477 op = ix86_zero_extend_to_Pmode (op);
12478 target = gen_rtx_MEM (tmode, op);
12479 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
12480 on it. Try to improve it using get_pointer_alignment,
12481 and if the special builtin is one that requires strict
12482 mode alignment, also from its GET_MODE_ALIGNMENT.
12483 Failure to do so could lead to ix86_legitimate_combined_insn
12484 rejecting all changes to such insns. */
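/* A hedged worked example: for a nontemporal store through a plain
   "double *", get_pointer_alignment may report only 64 bits, while an
   aligned_mem builtin operating in V4DFmode wants
   GET_MODE_ALIGNMENT (V4DFmode), normally 256 bits on this target, so
   the larger value wins below. */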
12485 unsigned int align = get_pointer_alignment (arg);
12486 if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
12487 align = GET_MODE_ALIGNMENT (tmode);
12488 if (MEM_ALIGN (target) < align)
12489 set_mem_align (target, align);
12490 }
12491 else
12492 target = force_reg (tmode, op);
12493 arg_adjust = 1;
12494 }
12495 else
12496 {
12497 arg_adjust = 0;
12498 if (optimize
12499 || target == 0
12500 || !register_operand (target, tmode)
12501 || GET_MODE (target) != tmode)
12502 target = gen_reg_rtx (tmode);
12503 }
12504
12505 for (i = 0; i < nargs; i++)
12506 {
12507 machine_mode mode = insn_p->operand[i + 1].mode;
2bf6d935
ML
12508
12509 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
12510 op = expand_normal (arg);
2bf6d935 12511
776a37f6 12512 if (i == memory)
2bf6d935 12513 {
776a37f6 12514 /* This must be the memory operand. */
12515 op = ix86_zero_extend_to_Pmode (op);
12516 op = gen_rtx_MEM (mode, op);
12517 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
12518 on it. Try to improve it using get_pointer_alignment,
12519 and if the special builtin is one that requires strict
12520 mode alignment, also from its GET_MODE_ALIGNMENT.
12521 Failure to do so could lead to ix86_legitimate_combined_insn
12522 rejecting all changes to such insns. */
12523 unsigned int align = get_pointer_alignment (arg);
12524 if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
12525 align = GET_MODE_ALIGNMENT (mode);
12526 if (MEM_ALIGN (op) < align)
12527 set_mem_align (op, align);
2bf6d935 12528 }
152834fe
HJ
12529 else if (i == constant)
12530 {
12531 /* This must be the constant. */
12532 if (!insn_p->operand[nargs].predicate (op, SImode))
12533 {
12534 error ("the fourth argument must be one of enum %qs", "_CMPCCX_ENUM");
12535 return const0_rtx;
12536 }
12537 }
2bf6d935
ML
12538 else
12539 {
776a37f6 12540 /* This must be a register. */
12541 if (VECTOR_MODE_P (mode))
12542 op = safe_vector_operand (op, mode);
2bf6d935 12543
776a37f6 12544 op = fixup_modeless_constant (op, mode);
2bf6d935 12545
b6efffa5 12546 /* NB: a 3-operand load implies a mask load or v{p}expand*,
35c4c67e 12547 and that mask operand should be at the end.
12548 Keep an all-ones mask, which will be simplified by the expander. */
12549 if (nargs == 3 && i == 2 && klass == load
b6efffa5 12550 && constm1_operand (op, mode)
12551 && insn_p->operand[i].predicate (op, mode))
35c4c67e 12552 ;
12553 else if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
776a37f6 12554 op = copy_to_mode_reg (mode, op);
12555 else
12556 {
12557 op = copy_to_reg (op);
12558 op = lowpart_subreg (mode, op, GET_MODE (op));
2bf6d935
ML
12559 }
12560 }
12561
715a8bc8 12562 xops[i] = op;
2bf6d935
ML
12563 }
12564
12565 switch (nargs)
12566 {
12567 case 0:
12568 pat = GEN_FCN (icode) (target);
12569 break;
12570 case 1:
715a8bc8 12571 pat = GEN_FCN (icode) (target, xops[0]);
2bf6d935
ML
12572 break;
12573 case 2:
715a8bc8 12574 pat = GEN_FCN (icode) (target, xops[0], xops[1]);
2bf6d935
ML
12575 break;
12576 case 3:
715a8bc8 12577 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
2bf6d935 12578 break;
152834fe
HJ
12579 case 4:
12580 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2], xops[3]);
12581 break;
12582 default:
12583 gcc_unreachable ();
12584 }
12585
12586 if (! pat)
12587 return 0;
715a8bc8 12588
12589 emit_insn (pat);
12590 return klass == store ? 0 : target;
12591}
12592
12593/* Return the integer constant in ARG. Constrain it to be in the range
12594 of the subparts of VEC_TYPE; issue an error if not. */
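/* For example, for a V4SI vector type the valid selectors are 0..3, so a
   call like __builtin_ia32_vec_ext_v4si (x, 4) is diagnosed here and
   element 0 is used instead.  (Illustrative example; the actual callers are
   the vec_ext/vec_set expanders below.)  */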
12595
12596static int
12597get_element_number (tree vec_type, tree arg)
12598{
12599 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
12600
12601 if (!tree_fits_uhwi_p (arg)
12602 || (elt = tree_to_uhwi (arg), elt > max))
12603 {
12604 error ("selector must be an integer constant in the range "
12605 "[0, %wi]", max);
12606 return 0;
12607 }
12608
12609 return elt;
12610}
12611
12612/* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
12613 ix86_expand_vector_init. We DO have language-level syntax for this, in
12614 the form of (type){ init-list }. Except that since we can't place emms
12615 instructions from inside the compiler, we can't allow the use of MMX
12616 registers unless the user explicitly asks for it. So we do *not* define
12617 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
12618 we have builtins invoked by mmintrin.h that give us license to emit
12619 these sorts of instructions. */
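/* For example, _mm_set_pi32 in mmintrin.h is built on
   __builtin_ia32_vec_init_v2si, which ends up in this helper rather than in
   a generic vec_init pattern.  */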
12620
12621static rtx
12622ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
12623{
12624 machine_mode tmode = TYPE_MODE (type);
12625 machine_mode inner_mode = GET_MODE_INNER (tmode);
12626 int i, n_elt = GET_MODE_NUNITS (tmode);
12627 rtvec v = rtvec_alloc (n_elt);
12628
12629 gcc_assert (VECTOR_MODE_P (tmode));
12630 gcc_assert (call_expr_nargs (exp) == n_elt);
12631
12632 for (i = 0; i < n_elt; ++i)
12633 {
12634 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
12635 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
12636 }
12637
12638 if (!target || !register_operand (target, tmode))
12639 target = gen_reg_rtx (tmode);
12640
12641 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
12642 return target;
12643}
12644
12645/* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
12646 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
12647 had a language-level syntax for referencing vector elements. */
12648
12649static rtx
12650ix86_expand_vec_ext_builtin (tree exp, rtx target)
12651{
12652 machine_mode tmode, mode0;
12653 tree arg0, arg1;
12654 int elt;
12655 rtx op0;
12656
12657 arg0 = CALL_EXPR_ARG (exp, 0);
12658 arg1 = CALL_EXPR_ARG (exp, 1);
12659
12660 op0 = expand_normal (arg0);
12661 elt = get_element_number (TREE_TYPE (arg0), arg1);
12662
12663 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
12664 mode0 = TYPE_MODE (TREE_TYPE (arg0));
12665 gcc_assert (VECTOR_MODE_P (mode0));
12666
12667 op0 = force_reg (mode0, op0);
12668
12669 if (optimize || !target || !register_operand (target, tmode))
12670 target = gen_reg_rtx (tmode);
12671
12672 ix86_expand_vector_extract (true, target, op0, elt);
12673
12674 return target;
12675}
12676
12677/* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
12678 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
12679 a language-level syntax for referencing vector elements. */
12680
12681static rtx
12682ix86_expand_vec_set_builtin (tree exp)
12683{
12684 machine_mode tmode, mode1;
12685 tree arg0, arg1, arg2;
12686 int elt;
12687 rtx op0, op1, target;
12688
12689 arg0 = CALL_EXPR_ARG (exp, 0);
12690 arg1 = CALL_EXPR_ARG (exp, 1);
12691 arg2 = CALL_EXPR_ARG (exp, 2);
12692
12693 tmode = TYPE_MODE (TREE_TYPE (arg0));
12694 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
12695 gcc_assert (VECTOR_MODE_P (tmode));
12696
12697 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
12698 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
12699 elt = get_element_number (TREE_TYPE (arg0), arg2);
12700
cda29c54 12701 if (GET_MODE (op1) != mode1)
12702 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
12703
12704 op0 = force_reg (tmode, op0);
12705 op1 = force_reg (mode1, op1);
12706
12707 /* OP0 is the source of these builtin functions and shouldn't be
12708 modified. Create a copy, use it and return it as target. */
12709 target = gen_reg_rtx (tmode);
12710 emit_move_insn (target, op0);
12711 ix86_expand_vector_set (true, target, op1, elt);
12712
12713 return target;
12714}
12715
823b3b79 12716/* Return true if the necessary isa options for this builtin exist,
12717 else false.
12718 fcode = DECL_MD_FUNCTION_CODE (fndecl); */
12719bool
12720ix86_check_builtin_isa_match (unsigned int fcode,
12721 HOST_WIDE_INT* pbisa,
12722 HOST_WIDE_INT* pbisa2)
2bf6d935 12723{
12724 HOST_WIDE_INT isa = ix86_isa_flags;
12725 HOST_WIDE_INT isa2 = ix86_isa_flags2;
12726 HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
12727 HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
5ebdbdb9 12728 HOST_WIDE_INT tmp_isa = isa, tmp_isa2 = isa2;
12729 /* The general case is we require all the ISAs specified in bisa{,2}
12730 to be enabled.
12731 The exceptions are:
12732 OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
12733 OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
12734 OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
ca813880 12735 (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL) or
12736 OPTION_MASK_ISA2_AVXVNNI
5ebdbdb9 12737 (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL) or
825d0041 12738 OPTION_MASK_ISA2_AVXIFMA
5ebdbdb9 12739 (OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA2_AVX512BF16) or
58685b93 12740 OPTION_MASK_ISA2_AVXNECONVERT
d9f9e53e 12741 OPTION_MASK_ISA_AES or (OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA2_VAES)
12742 where for each such pair it is sufficient if either of the ISAs is
12743 enabled (and, if it is ORed with other options, those others as well).
12744 OPTION_MASK_ISA_MMX in bisa is satisfied also if TARGET_MMX_WITH_SSE. */
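/* For example, the VNNI builtins carry both
   OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL (in bisa) and
   OPTION_MASK_ISA2_AVXVNNI (in bisa2); enabling either side, e.g. just
   -mavxvnni, is enough, because SHARE_BUILTIN below widens the effective
   isa/isa2 masks so the final subset check still succeeds.  */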
db3f0d21 12745
12746#define SHARE_BUILTIN(A1, A2, B1, B2) \
12747 if ((((bisa & (A1)) == (A1) && (bisa2 & (A2)) == (A2)) \
12748 && ((bisa & (B1)) == (B1) && (bisa2 & (B2)) == (B2))) \
12749 && (((isa & (A1)) == (A1) && (isa2 & (A2)) == (A2)) \
12750 || ((isa & (B1)) == (B1) && (isa2 & (B2)) == (B2)))) \
12751 { \
12752 tmp_isa |= (A1) | (B1); \
12753 tmp_isa2 |= (A2) | (B2); \
12754 }
12755
12756 SHARE_BUILTIN (OPTION_MASK_ISA_SSE, 0, OPTION_MASK_ISA_3DNOW_A, 0);
12757 SHARE_BUILTIN (OPTION_MASK_ISA_SSE4_2, 0, OPTION_MASK_ISA_CRC32, 0);
12758 SHARE_BUILTIN (OPTION_MASK_ISA_FMA, 0, OPTION_MASK_ISA_FMA4, 0);
12759 SHARE_BUILTIN (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, 0,
12760 OPTION_MASK_ISA2_AVXVNNI);
12761 SHARE_BUILTIN (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL, 0, 0,
12762 OPTION_MASK_ISA2_AVXIFMA);
12763 SHARE_BUILTIN (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512BF16, 0,
12764 OPTION_MASK_ISA2_AVXNECONVERT);
12765 SHARE_BUILTIN (OPTION_MASK_ISA_AES, 0, OPTION_MASK_ISA_AVX512VL,
12766 OPTION_MASK_ISA2_VAES);
12767 isa = tmp_isa;
12768 isa2 = tmp_isa2;
58685b93 12769
12770 if ((bisa & OPTION_MASK_ISA_MMX) && !TARGET_MMX && TARGET_MMX_WITH_SSE
12771 /* __builtin_ia32_maskmovq requires MMX registers. */
6058b874 12772 && fcode != IX86_BUILTIN_MASKMOVQ)
12773 {
12774 bisa &= ~OPTION_MASK_ISA_MMX;
12775 bisa |= OPTION_MASK_ISA_SSE2;
ecfdb16c 12776 }
6058b874 12777
823b3b79 12778 if (pbisa)
12779 *pbisa = bisa;
12780 if (pbisa2)
12781 *pbisa2 = bisa2;
12782
12783 return (bisa & isa) == bisa && (bisa2 & isa2) == bisa2;
12784}
12785
12786/* Emit instructions to set the carry flag from ARG. */
12787
12788void
12789ix86_expand_carry (rtx arg)
12790{
12791 if (!CONST_INT_P (arg) || arg == const0_rtx)
12792 {
12793 arg = convert_to_mode (QImode, arg, 1);
12794 arg = copy_to_mode_reg (QImode, arg);
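/* Adding -1 to ARG carries out exactly when ARG is nonzero, so this
   single add sets CF to the truth value of ARG.  */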
12795 emit_insn (gen_addqi3_cconly_overflow (arg, constm1_rtx));
12796 }
12797 else
12798 emit_insn (gen_x86_stc ());
12799}
12800
823b3b79 12801/* Expand an expression EXP that calls a built-in function,
12802 with result going to TARGET if that's convenient
12803 (and in mode MODE if that's convenient).
12804 SUBTARGET may be used as the target for computing one of EXP's operands.
12805 IGNORE is nonzero if the value is to be ignored. */
12806
12807rtx
12808ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
12809 machine_mode mode, int ignore)
12810{
12811 size_t i;
12812 enum insn_code icode, icode2;
12813 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
12814 tree arg0, arg1, arg2, arg3, arg4;
12815 rtx op0, op1, op2, op3, op4, pat, pat2, insn;
12816 machine_mode mode0, mode1, mode2, mode3, mode4;
12817 unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
12818 HOST_WIDE_INT bisa, bisa2;
12819
12820 /* For CPU builtins that can be folded, fold first and expand the fold. */
12821 switch (fcode)
12822 {
12823 case IX86_BUILTIN_CPU_INIT:
12824 {
12825 /* Make it call __cpu_indicator_init in libgcc. */
12826 tree call_expr, fndecl, type;
12827 type = build_function_type_list (integer_type_node, NULL_TREE);
12828 fndecl = build_fn_decl ("__cpu_indicator_init", type);
12829 call_expr = build_call_expr (fndecl, 0);
12830 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
12831 }
12832 case IX86_BUILTIN_CPU_IS:
12833 case IX86_BUILTIN_CPU_SUPPORTS:
12834 {
12835 tree arg0 = CALL_EXPR_ARG (exp, 0);
12836 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
12837 gcc_assert (fold_expr != NULL_TREE);
12838 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
12839 }
12840 }
12841
12842 if (!ix86_check_builtin_isa_match (fcode, &bisa, &bisa2))
12843 {
12844 bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT;
12845 if (TARGET_ABI_X32)
12846 bisa |= OPTION_MASK_ABI_X32;
12847 else
12848 bisa |= OPTION_MASK_ABI_64;
12849 char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
12850 (enum fpmath_unit) 0,
12851 (enum prefer_vector_width) 0,
654cd743 12852 PVW_NONE, PVW_NONE,
46e6341f 12853 false, add_abi_p);
12854 if (!opts)
12855 error ("%qE needs unknown isa option", fndecl);
12856 else
12857 {
12858 gcc_assert (opts != NULL);
12859 error ("%qE needs isa option %s", fndecl, opts);
12860 free (opts);
12861 }
12862 return expand_call (exp, target, ignore);
12863 }
12864
12865 switch (fcode)
12866 {
12867 case IX86_BUILTIN_MASKMOVQ:
12868 case IX86_BUILTIN_MASKMOVDQU:
12869 icode = (fcode == IX86_BUILTIN_MASKMOVQ
12870 ? CODE_FOR_mmx_maskmovq
12871 : CODE_FOR_sse2_maskmovdqu);
12872 /* Note the arg order is different from the operand order. */
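/* The builtins take (data, mask, address); the last argument becomes
   operand 0 (the destination memory), the first operand 1 (the data) and
   the second operand 2 (the mask).  */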
12873 arg1 = CALL_EXPR_ARG (exp, 0);
12874 arg2 = CALL_EXPR_ARG (exp, 1);
12875 arg0 = CALL_EXPR_ARG (exp, 2);
12876 op0 = expand_normal (arg0);
12877 op1 = expand_normal (arg1);
12878 op2 = expand_normal (arg2);
12879 mode0 = insn_data[icode].operand[0].mode;
12880 mode1 = insn_data[icode].operand[1].mode;
12881 mode2 = insn_data[icode].operand[2].mode;
12882
12883 op0 = ix86_zero_extend_to_Pmode (op0);
12884 op0 = gen_rtx_MEM (mode1, op0);
12885
12886 if (!insn_data[icode].operand[0].predicate (op0, mode0))
12887 op0 = copy_to_mode_reg (mode0, op0);
12888 if (!insn_data[icode].operand[1].predicate (op1, mode1))
12889 op1 = copy_to_mode_reg (mode1, op1);
12890 if (!insn_data[icode].operand[2].predicate (op2, mode2))
12891 op2 = copy_to_mode_reg (mode2, op2);
12892 pat = GEN_FCN (icode) (op0, op1, op2);
12893 if (! pat)
12894 return 0;
12895 emit_insn (pat);
12896 return 0;
12897
12898 case IX86_BUILTIN_LDMXCSR:
12899 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
12900 target = assign_386_stack_local (SImode, SLOT_TEMP);
12901 emit_move_insn (target, op0);
12902 emit_insn (gen_sse_ldmxcsr (target));
12903 return 0;
12904
12905 case IX86_BUILTIN_STMXCSR:
12906 target = assign_386_stack_local (SImode, SLOT_TEMP);
12907 emit_insn (gen_sse_stmxcsr (target));
12908 return copy_to_mode_reg (SImode, target);
12909
12910 case IX86_BUILTIN_CLFLUSH:
12911 arg0 = CALL_EXPR_ARG (exp, 0);
12912 op0 = expand_normal (arg0);
12913 icode = CODE_FOR_sse2_clflush;
12914 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12915 op0 = ix86_zero_extend_to_Pmode (op0);
12916
12917 emit_insn (gen_sse2_clflush (op0));
12918 return 0;
12919
12920 case IX86_BUILTIN_CLWB:
12921 arg0 = CALL_EXPR_ARG (exp, 0);
12922 op0 = expand_normal (arg0);
12923 icode = CODE_FOR_clwb;
12924 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12925 op0 = ix86_zero_extend_to_Pmode (op0);
12926
12927 emit_insn (gen_clwb (op0));
12928 return 0;
12929
12930 case IX86_BUILTIN_CLFLUSHOPT:
12931 arg0 = CALL_EXPR_ARG (exp, 0);
12932 op0 = expand_normal (arg0);
12933 icode = CODE_FOR_clflushopt;
12934 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12935 op0 = ix86_zero_extend_to_Pmode (op0);
12936
12937 emit_insn (gen_clflushopt (op0));
12938 return 0;
12939
12940 case IX86_BUILTIN_MONITOR:
12941 case IX86_BUILTIN_MONITORX:
12942 arg0 = CALL_EXPR_ARG (exp, 0);
12943 arg1 = CALL_EXPR_ARG (exp, 1);
12944 arg2 = CALL_EXPR_ARG (exp, 2);
12945 op0 = expand_normal (arg0);
12946 op1 = expand_normal (arg1);
12947 op2 = expand_normal (arg2);
12948 if (!REG_P (op0))
12949 op0 = ix86_zero_extend_to_Pmode (op0);
12950 if (!REG_P (op1))
12951 op1 = copy_to_mode_reg (SImode, op1);
12952 if (!REG_P (op2))
12953 op2 = copy_to_mode_reg (SImode, op2);
12954
12955 emit_insn (fcode == IX86_BUILTIN_MONITOR
12956 ? gen_sse3_monitor (Pmode, op0, op1, op2)
12957 : gen_monitorx (Pmode, op0, op1, op2));
12958 return 0;
12959
12960 case IX86_BUILTIN_MWAIT:
12961 arg0 = CALL_EXPR_ARG (exp, 0);
12962 arg1 = CALL_EXPR_ARG (exp, 1);
12963 op0 = expand_normal (arg0);
12964 op1 = expand_normal (arg1);
12965 if (!REG_P (op0))
12966 op0 = copy_to_mode_reg (SImode, op0);
12967 if (!REG_P (op1))
12968 op1 = copy_to_mode_reg (SImode, op1);
12969 emit_insn (gen_sse3_mwait (op0, op1));
12970 return 0;
12971
12972 case IX86_BUILTIN_MWAITX:
12973 arg0 = CALL_EXPR_ARG (exp, 0);
12974 arg1 = CALL_EXPR_ARG (exp, 1);
12975 arg2 = CALL_EXPR_ARG (exp, 2);
12976 op0 = expand_normal (arg0);
12977 op1 = expand_normal (arg1);
12978 op2 = expand_normal (arg2);
12979 if (!REG_P (op0))
12980 op0 = copy_to_mode_reg (SImode, op0);
12981 if (!REG_P (op1))
12982 op1 = copy_to_mode_reg (SImode, op1);
12983 if (!REG_P (op2))
12984 op2 = copy_to_mode_reg (SImode, op2);
12985 emit_insn (gen_mwaitx (op0, op1, op2));
12986 return 0;
12987
12988 case IX86_BUILTIN_UMONITOR:
12989 arg0 = CALL_EXPR_ARG (exp, 0);
12990 op0 = expand_normal (arg0);
12991
12992 op0 = ix86_zero_extend_to_Pmode (op0);
987a3082 12993 emit_insn (gen_umonitor (Pmode, op0));
12994 return 0;
12995
12996 case IX86_BUILTIN_UMWAIT:
12997 case IX86_BUILTIN_TPAUSE:
12998 arg0 = CALL_EXPR_ARG (exp, 0);
12999 arg1 = CALL_EXPR_ARG (exp, 1);
13000 op0 = expand_normal (arg0);
13001 op1 = expand_normal (arg1);
13002
13003 if (!REG_P (op0))
13004 op0 = copy_to_mode_reg (SImode, op0);
13005
13006 op1 = force_reg (DImode, op1);
13007
13008 if (TARGET_64BIT)
13009 {
13010 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
13011 NULL, 1, OPTAB_DIRECT);
13012 switch (fcode)
13013 {
13014 case IX86_BUILTIN_UMWAIT:
13015 icode = CODE_FOR_umwait_rex64;
13016 break;
13017 case IX86_BUILTIN_TPAUSE:
13018 icode = CODE_FOR_tpause_rex64;
13019 break;
13020 default:
13021 gcc_unreachable ();
13022 }
13023
13024 op2 = gen_lowpart (SImode, op2);
13025 op1 = gen_lowpart (SImode, op1);
13026 pat = GEN_FCN (icode) (op0, op1, op2);
13027 }
13028 else
13029 {
13030 switch (fcode)
13031 {
13032 case IX86_BUILTIN_UMWAIT:
13033 icode = CODE_FOR_umwait;
13034 break;
13035 case IX86_BUILTIN_TPAUSE:
13036 icode = CODE_FOR_tpause;
13037 break;
13038 default:
13039 gcc_unreachable ();
13040 }
13041 pat = GEN_FCN (icode) (op0, op1);
13042 }
13043
13044 if (!pat)
13045 return 0;
13046
13047 emit_insn (pat);
13048
13049 if (target == 0
13050 || !register_operand (target, QImode))
13051 target = gen_reg_rtx (QImode);
13052
13053 pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
13054 const0_rtx);
13055 emit_insn (gen_rtx_SET (target, pat));
13056
13057 return target;
13058
299a53d7 13059 case IX86_BUILTIN_TESTUI:
13060 emit_insn (gen_testui ());
13061
13062 if (target == 0
13063 || !register_operand (target, QImode))
13064 target = gen_reg_rtx (QImode);
13065
13066 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
13067 const0_rtx);
13068 emit_insn (gen_rtx_SET (target, pat));
13069
13070 return target;
13071
13072 case IX86_BUILTIN_CLZERO:
13073 arg0 = CALL_EXPR_ARG (exp, 0);
13074 op0 = expand_normal (arg0);
13075 if (!REG_P (op0))
13076 op0 = ix86_zero_extend_to_Pmode (op0);
a963ca40 13077 emit_insn (gen_clzero (Pmode, op0));
13078 return 0;
13079
13080 case IX86_BUILTIN_CLDEMOTE:
13081 arg0 = CALL_EXPR_ARG (exp, 0);
13082 op0 = expand_normal (arg0);
13083 icode = CODE_FOR_cldemote;
13084 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
13085 op0 = ix86_zero_extend_to_Pmode (op0);
13086
13087 emit_insn (gen_cldemote (op0));
13088 return 0;
13089
632a2f50 13090 case IX86_BUILTIN_LOADIWKEY:
13091 {
13092 arg0 = CALL_EXPR_ARG (exp, 0);
13093 arg1 = CALL_EXPR_ARG (exp, 1);
13094 arg2 = CALL_EXPR_ARG (exp, 2);
13095 arg3 = CALL_EXPR_ARG (exp, 3);
13096
13097 op0 = expand_normal (arg0);
13098 op1 = expand_normal (arg1);
13099 op2 = expand_normal (arg2);
13100 op3 = expand_normal (arg3);
13101
13102 if (!REG_P (op0))
13103 op0 = copy_to_mode_reg (V2DImode, op0);
13104 if (!REG_P (op1))
13105 op1 = copy_to_mode_reg (V2DImode, op1);
13106 if (!REG_P (op2))
13107 op2 = copy_to_mode_reg (V2DImode, op2);
13108 if (!REG_P (op3))
13109 op3 = copy_to_mode_reg (SImode, op3);
13110
13111 emit_insn (gen_loadiwkey (op0, op1, op2, op3));
13112
13113 return 0;
13114 }
13115
13116 case IX86_BUILTIN_AESDEC128KLU8:
13117 icode = CODE_FOR_aesdec128klu8;
13118 goto aesdecenc_expand;
13119
13120 case IX86_BUILTIN_AESDEC256KLU8:
13121 icode = CODE_FOR_aesdec256klu8;
13122 goto aesdecenc_expand;
13123
13124 case IX86_BUILTIN_AESENC128KLU8:
13125 icode = CODE_FOR_aesenc128klu8;
13126 goto aesdecenc_expand;
13127
13128 case IX86_BUILTIN_AESENC256KLU8:
13129 icode = CODE_FOR_aesenc256klu8;
13130
13131 aesdecenc_expand:
13132
13133 arg0 = CALL_EXPR_ARG (exp, 0); // __m128i *odata
13134 arg1 = CALL_EXPR_ARG (exp, 1); // __m128i idata
13135 arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
13136
13137 op0 = expand_normal (arg0);
13138 op1 = expand_normal (arg1);
13139 op2 = expand_normal (arg2);
13140
13141 if (!address_operand (op0, V2DImode))
13142 {
13143 op0 = convert_memory_address (Pmode, op0);
13144 op0 = copy_addr_to_reg (op0);
13145 }
13146 op0 = gen_rtx_MEM (V2DImode, op0);
13147
13148 if (!REG_P (op1))
13149 op1 = copy_to_mode_reg (V2DImode, op1);
13150
13151 if (!address_operand (op2, VOIDmode))
13152 {
13153 op2 = convert_memory_address (Pmode, op2);
13154 op2 = copy_addr_to_reg (op2);
13155 }
13156 op2 = gen_rtx_MEM (BLKmode, op2);
13157
13158 emit_insn (GEN_FCN (icode) (op1, op1, op2));
13159
13160 if (target == 0)
13161 target = gen_reg_rtx (QImode);
13162
13163 /* NB: For the aesenc/aesdec Key Locker insns, ZF will be set when a
13164 runtime error occurs. The output should then be cleared for safety. */
13165 rtx_code_label *ok_label;
13166 rtx tmp;
13167
13168 tmp = gen_rtx_REG (CCZmode, FLAGS_REG);
13169 pat = gen_rtx_EQ (QImode, tmp, const0_rtx);
13170 ok_label = gen_label_rtx ();
13171 emit_cmp_and_jump_insns (tmp, const0_rtx, NE, 0, GET_MODE (tmp),
13172 true, ok_label);
13173 /* The runtime error seldom occurs, so predict the OK path as hot
13174 to optimize it as the fallthrough block. */
13175 predict_jump (REG_BR_PROB_BASE * 90 / 100);
13176
13177 emit_insn (gen_rtx_SET (op1, const0_rtx));
632a2f50 13178
13179 emit_label (ok_label);
13180 emit_insn (gen_rtx_SET (target, pat));
632a2f50 13181 emit_insn (gen_rtx_SET (op0, op1));
13182
13183 return target;
13184
13185 case IX86_BUILTIN_AESDECWIDE128KLU8:
13186 icode = CODE_FOR_aesdecwide128klu8;
13187 goto wideaesdecenc_expand;
13188
13189 case IX86_BUILTIN_AESDECWIDE256KLU8:
13190 icode = CODE_FOR_aesdecwide256klu8;
13191 goto wideaesdecenc_expand;
13192
13193 case IX86_BUILTIN_AESENCWIDE128KLU8:
13194 icode = CODE_FOR_aesencwide128klu8;
13195 goto wideaesdecenc_expand;
13196
13197 case IX86_BUILTIN_AESENCWIDE256KLU8:
13198 icode = CODE_FOR_aesencwide256klu8;
13199
13200 wideaesdecenc_expand:
13201
13202 rtx xmm_regs[8];
13203 rtx op;
13204
13205 arg0 = CALL_EXPR_ARG (exp, 0); // __m128i * odata
13206 arg1 = CALL_EXPR_ARG (exp, 1); // const __m128i * idata
13207 arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
13208
13209 op0 = expand_normal (arg0);
13210 op1 = expand_normal (arg1);
13211 op2 = expand_normal (arg2);
13212
13213 if (!address_operand (op2, VOIDmode))
13214 {
13215 op2 = convert_memory_address (Pmode, op2);
13216 op2 = copy_addr_to_reg (op2);
13217 }
13218 op2 = gen_rtx_MEM (BLKmode, op2);
13219
13220 for (i = 0; i < 8; i++)
13221 {
13222 xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
13223
13224 op = gen_rtx_MEM (V2DImode,
13225 plus_constant (Pmode, op1, (i * 16)));
13226
13227 emit_move_insn (xmm_regs[i], op);
13228 }
13229
13230 emit_insn (GEN_FCN (icode) (op2));
13231
13232 if (target == 0)
13233 target = gen_reg_rtx (QImode);
13234
13235 tmp = gen_rtx_REG (CCZmode, FLAGS_REG);
13236 pat = gen_rtx_EQ (QImode, tmp, const0_rtx);
13237 ok_label = gen_label_rtx ();
13238 emit_cmp_and_jump_insns (tmp, const0_rtx, NE, 0, GET_MODE (tmp),
13239 true, ok_label);
13240 predict_jump (REG_BR_PROB_BASE * 90 / 100);
13241
13242 for (i = 0; i < 8; i++)
13243 emit_insn (gen_rtx_SET (xmm_regs[i], const0_rtx));
13244
13245 emit_label (ok_label);
632a2f50 13246 emit_insn (gen_rtx_SET (target, pat));
13247
13248 for (i = 0; i < 8; i++)
13249 {
13250 op = gen_rtx_MEM (V2DImode,
13251 plus_constant (Pmode, op0, (i * 16)));
13252 emit_move_insn (op, xmm_regs[i]);
13253 }
13254
13255 return target;
13256
13257 case IX86_BUILTIN_ENCODEKEY128U32:
13258 {
13259 rtx op, xmm_regs[7];
13260
13261 arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
13262 arg1 = CALL_EXPR_ARG (exp, 1); // __m128i key
13263 arg2 = CALL_EXPR_ARG (exp, 2); // void *h
13264
13265 op0 = expand_normal (arg0);
13266 op1 = expand_normal (arg1);
13267 op2 = expand_normal (arg2);
13268
13269 if (!REG_P (op0))
13270 op0 = copy_to_mode_reg (SImode, op0);
13271
13272 op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
13273 emit_move_insn (op, op1);
13274
13275 for (i = 0; i < 3; i++)
13276 xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
13277
13278 if (target == 0)
13279 target = gen_reg_rtx (SImode);
13280
13281 emit_insn (gen_encodekey128u32 (target, op0));
13282
13283 for (i = 0; i < 3; i++)
13284 {
13285 op = gen_rtx_MEM (V2DImode,
13286 plus_constant (Pmode, op2, (i * 16)));
13287 emit_move_insn (op, xmm_regs[i]);
13288 }
13289
13290 return target;
13291 }
13292 case IX86_BUILTIN_ENCODEKEY256U32:
13293 {
13294 rtx op, xmm_regs[7];
13295
13296 arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
13297 arg1 = CALL_EXPR_ARG (exp, 1); // __m128i keylow
13298 arg2 = CALL_EXPR_ARG (exp, 2); // __m128i keyhi
13299 arg3 = CALL_EXPR_ARG (exp, 3); // void *h
13300
13301 op0 = expand_normal (arg0);
13302 op1 = expand_normal (arg1);
13303 op2 = expand_normal (arg2);
13304 op3 = expand_normal (arg3);
13305
13306 if (!REG_P (op0))
13307 op0 = copy_to_mode_reg (SImode, op0);
13308
13309 /* Force the use of xmm0 and xmm1 for keylow and keyhi. */
13310 op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
13311 emit_move_insn (op, op1);
13312 op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (1));
13313 emit_move_insn (op, op2);
13314
13315 for (i = 0; i < 4; i++)
13316 xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
13317
13318 if (target == 0)
13319 target = gen_reg_rtx (SImode);
13320
13321 emit_insn (gen_encodekey256u32 (target, op0));
13322
13323 for (i = 0; i < 4; i++)
13324 {
13325 op = gen_rtx_MEM (V2DImode,
13326 plus_constant (Pmode, op3, (i * 16)));
13327 emit_move_insn (op, xmm_regs[i]);
13328 }
13329
13330 return target;
13331 }
13332
13333 case IX86_BUILTIN_PREFETCH:
13334 {
13335 arg0 = CALL_EXPR_ARG (exp, 0); // const void *
13336 arg1 = CALL_EXPR_ARG (exp, 1); // const int
13337 arg2 = CALL_EXPR_ARG (exp, 2); // const int
13338 arg3 = CALL_EXPR_ARG (exp, 3); // const int
13339
13340 op0 = expand_normal (arg0);
13341 op1 = expand_normal (arg1);
13342 op2 = expand_normal (arg2);
13343 op3 = expand_normal (arg3);
13344
13345 if (!CONST_INT_P (op1) || !CONST_INT_P (op2) || !CONST_INT_P (op3))
13346 {
13347 error ("second, third and fourth argument must be a const");
13348 return const0_rtx;
13349 }
13350
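/* A fourth argument of 1 selects the instruction-prefetch path
   (gen_prefetchi); any other value falls through to the ordinary data
   prefetch handling below.  */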
13351 if (INTVAL (op3) == 1)
13352 {
13353 if (INTVAL (op2) < 2 || INTVAL (op2) > 3)
13354 {
13355 error ("invalid third argument");
13356 return const0_rtx;
13357 }
13358
21de01f5 13359 if (TARGET_64BIT && TARGET_PREFETCHI
13360 && local_func_symbolic_operand (op0, GET_MODE (op0)))
13361 emit_insn (gen_prefetchi (op0, op2));
13362 else
13363 {
13364 warning (0, "instruction prefetch applies when in 64-bit mode"
13365 " with RIP-relative addressing and"
13366 " option %<-mprefetchi%>;"
13367 " they stay NOPs otherwise");
13368 emit_insn (gen_nop ());
13369 }
13370 }
13371 else
13372 {
13373 if (!address_operand (op0, VOIDmode))
13374 {
13375 op0 = convert_memory_address (Pmode, op0);
13376 op0 = copy_addr_to_reg (op0);
13377 }
21de01f5 13378
13379 if (INTVAL (op2) < 0 || INTVAL (op2) > 3)
13380 {
13381 warning (0, "invalid third argument to %<__builtin_ia32_prefetch%>; using zero");
13382 op2 = const0_rtx;
13383 }
13384
13385 if (TARGET_3DNOW || TARGET_PREFETCH_SSE
13386 || TARGET_PRFCHW || TARGET_PREFETCHWT1)
13387 emit_insn (gen_prefetch (op0, op1, op2));
13388 else if (!MEM_P (op0) && side_effects_p (op0))
13389 /* Don't do anything with direct references to volatile memory,
13390 but generate code to handle other side effects. */
13391 emit_insn (op0);
13392 }
13393
13394 return 0;
13395 }
13396
13397 case IX86_BUILTIN_PREFETCHI:
13398 {
13399 arg0 = CALL_EXPR_ARG (exp, 0); // const void *
13400 arg1 = CALL_EXPR_ARG (exp, 1); // const int
13401
13402 op0 = expand_normal (arg0);
13403 op1 = expand_normal (arg1);
13404
13405 if (!CONST_INT_P (op1))
13406 {
13407 error ("second argument must be a const");
13408 return const0_rtx;
13409 }
13410
13411 /* GOT/PLT_PIC should not be used for instruction prefetch;
13412 it must be a real instruction address. */
13413 if (TARGET_64BIT
13414 && local_func_symbolic_operand (op0, GET_MODE (op0)))
13415 emit_insn (gen_prefetchi (op0, op1));
13416 else
13417 {
13418 /* Ignore the hint. */
13419 warning (0, "instruction prefetch applies when in 64-bit mode"
13420 " with RIP-relative addressing and"
13421 " option %<-mprefetchi%>;"
13422 " they stay NOPs otherwise");
13423 emit_insn (gen_nop ());
13424 }
13425
13426 return 0;
13427 }
13428
13429 case IX86_BUILTIN_VEC_INIT_V2SI:
13430 case IX86_BUILTIN_VEC_INIT_V4HI:
13431 case IX86_BUILTIN_VEC_INIT_V8QI:
13432 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
13433
13434 case IX86_BUILTIN_VEC_EXT_V2DF:
13435 case IX86_BUILTIN_VEC_EXT_V2DI:
13436 case IX86_BUILTIN_VEC_EXT_V4SF:
13437 case IX86_BUILTIN_VEC_EXT_V4SI:
13438 case IX86_BUILTIN_VEC_EXT_V8HI:
13439 case IX86_BUILTIN_VEC_EXT_V2SI:
13440 case IX86_BUILTIN_VEC_EXT_V4HI:
13441 case IX86_BUILTIN_VEC_EXT_V16QI:
13442 return ix86_expand_vec_ext_builtin (exp, target);
13443
13444 case IX86_BUILTIN_VEC_SET_V2DI:
13445 case IX86_BUILTIN_VEC_SET_V4SF:
13446 case IX86_BUILTIN_VEC_SET_V4SI:
13447 case IX86_BUILTIN_VEC_SET_V8HI:
13448 case IX86_BUILTIN_VEC_SET_V4HI:
13449 case IX86_BUILTIN_VEC_SET_V16QI:
13450 return ix86_expand_vec_set_builtin (exp);
13451
13452 case IX86_BUILTIN_NANQ:
13453 case IX86_BUILTIN_NANSQ:
13454 return expand_call (exp, target, ignore);
13455
13456 case IX86_BUILTIN_RDPID:
13457
13458 op0 = gen_reg_rtx (word_mode);
13459
13460 if (TARGET_64BIT)
13461 {
13462 insn = gen_rdpid_rex64 (op0);
13463 op0 = convert_to_mode (SImode, op0, 1);
13464 }
13465 else
13466 insn = gen_rdpid (op0);
13467
13468 emit_insn (insn);
13469
13470 if (target == 0
13471 || !register_operand (target, SImode))
13472 target = gen_reg_rtx (SImode);
13473
13474 emit_move_insn (target, op0);
13475 return target;
13476
13477 case IX86_BUILTIN_2INTERSECTD512:
13478 case IX86_BUILTIN_2INTERSECTQ512:
13479 case IX86_BUILTIN_2INTERSECTD256:
13480 case IX86_BUILTIN_2INTERSECTQ256:
13481 case IX86_BUILTIN_2INTERSECTD128:
13482 case IX86_BUILTIN_2INTERSECTQ128:
13483 arg0 = CALL_EXPR_ARG (exp, 0);
13484 arg1 = CALL_EXPR_ARG (exp, 1);
13485 arg2 = CALL_EXPR_ARG (exp, 2);
13486 arg3 = CALL_EXPR_ARG (exp, 3);
13487 op0 = expand_normal (arg0);
13488 op1 = expand_normal (arg1);
13489 op2 = expand_normal (arg2);
13490 op3 = expand_normal (arg3);
13491
13492 if (!address_operand (op0, VOIDmode))
13493 {
13494 op0 = convert_memory_address (Pmode, op0);
13495 op0 = copy_addr_to_reg (op0);
13496 }
13497 if (!address_operand (op1, VOIDmode))
13498 {
13499 op1 = convert_memory_address (Pmode, op1);
13500 op1 = copy_addr_to_reg (op1);
13501 }
13502
13503 switch (fcode)
13504 {
13505 case IX86_BUILTIN_2INTERSECTD512:
13506 mode4 = P2HImode;
13507 icode = CODE_FOR_avx512vp2intersect_2intersectv16si;
13508 break;
13509 case IX86_BUILTIN_2INTERSECTQ512:
13510 mode4 = P2QImode;
13511 icode = CODE_FOR_avx512vp2intersect_2intersectv8di;
13512 break;
13513 case IX86_BUILTIN_2INTERSECTD256:
13514 mode4 = P2QImode;
13515 icode = CODE_FOR_avx512vp2intersect_2intersectv8si;
13516 break;
13517 case IX86_BUILTIN_2INTERSECTQ256:
13518 mode4 = P2QImode;
13519 icode = CODE_FOR_avx512vp2intersect_2intersectv4di;
13520 break;
13521 case IX86_BUILTIN_2INTERSECTD128:
13522 mode4 = P2QImode;
13523 icode = CODE_FOR_avx512vp2intersect_2intersectv4si;
13524 break;
13525 case IX86_BUILTIN_2INTERSECTQ128:
13526 mode4 = P2QImode;
13527 icode = CODE_FOR_avx512vp2intersect_2intersectv2di;
13528 break;
13529 default:
13530 gcc_unreachable ();
13531 }
13532
13533 mode2 = insn_data[icode].operand[1].mode;
13534 mode3 = insn_data[icode].operand[2].mode;
13535 if (!insn_data[icode].operand[1].predicate (op2, mode2))
13536 op2 = copy_to_mode_reg (mode2, op2);
13537 if (!insn_data[icode].operand[2].predicate (op3, mode3))
13538 op3 = copy_to_mode_reg (mode3, op3);
13539
13540 op4 = gen_reg_rtx (mode4);
13541 emit_insn (GEN_FCN (icode) (op4, op2, op3));
13542 mode0 = mode4 == P2HImode ? HImode : QImode;
13543 emit_move_insn (gen_rtx_MEM (mode0, op0),
13544 gen_lowpart (mode0, op4));
13545 emit_move_insn (gen_rtx_MEM (mode0, op1),
13546 gen_highpart (mode0, op4));
13547
13548 return 0;
13549
13550 case IX86_BUILTIN_RDPMC:
13551 case IX86_BUILTIN_RDTSC:
13552 case IX86_BUILTIN_RDTSCP:
13553 case IX86_BUILTIN_XGETBV:
13554
13555 op0 = gen_reg_rtx (DImode);
13556 op1 = gen_reg_rtx (DImode);
13557
13558 if (fcode == IX86_BUILTIN_RDPMC)
13559 {
13560 arg0 = CALL_EXPR_ARG (exp, 0);
13561 op2 = expand_normal (arg0);
13562 if (!register_operand (op2, SImode))
13563 op2 = copy_to_mode_reg (SImode, op2);
13564
13565 insn = (TARGET_64BIT
13566 ? gen_rdpmc_rex64 (op0, op1, op2)
13567 : gen_rdpmc (op0, op2));
13568 emit_insn (insn);
13569 }
13570 else if (fcode == IX86_BUILTIN_XGETBV)
13571 {
13572 arg0 = CALL_EXPR_ARG (exp, 0);
13573 op2 = expand_normal (arg0);
13574 if (!register_operand (op2, SImode))
13575 op2 = copy_to_mode_reg (SImode, op2);
13576
13577 insn = (TARGET_64BIT
13578 ? gen_xgetbv_rex64 (op0, op1, op2)
13579 : gen_xgetbv (op0, op2));
13580 emit_insn (insn);
13581 }
13582 else if (fcode == IX86_BUILTIN_RDTSC)
13583 {
13584 insn = (TARGET_64BIT
13585 ? gen_rdtsc_rex64 (op0, op1)
13586 : gen_rdtsc (op0));
13587 emit_insn (insn);
13588 }
13589 else
13590 {
13591 op2 = gen_reg_rtx (SImode);
13592
13593 insn = (TARGET_64BIT
13594 ? gen_rdtscp_rex64 (op0, op1, op2)
13595 : gen_rdtscp (op0, op2));
13596 emit_insn (insn);
13597
13598 arg0 = CALL_EXPR_ARG (exp, 0);
13599 op4 = expand_normal (arg0);
13600 if (!address_operand (op4, VOIDmode))
13601 {
13602 op4 = convert_memory_address (Pmode, op4);
13603 op4 = copy_addr_to_reg (op4);
13604 }
13605 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
13606 }
13607
13608 if (target == 0
13609 || !register_operand (target, DImode))
13610 target = gen_reg_rtx (DImode);
13611
13612 if (TARGET_64BIT)
13613 {
13614 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
13615 op1, 1, OPTAB_DIRECT);
13616 op0 = expand_simple_binop (DImode, IOR, op0, op1,
13617 op0, 1, OPTAB_DIRECT);
13618 }
13619
13620 emit_move_insn (target, op0);
13621 return target;
13622
13623 case IX86_BUILTIN_ENQCMD:
13624 case IX86_BUILTIN_ENQCMDS:
13625 case IX86_BUILTIN_MOVDIR64B:
13626
13627 arg0 = CALL_EXPR_ARG (exp, 0);
13628 arg1 = CALL_EXPR_ARG (exp, 1);
13629 op0 = expand_normal (arg0);
13630 op1 = expand_normal (arg1);
13631
13632 op0 = ix86_zero_extend_to_Pmode (op0);
13633 if (!address_operand (op1, VOIDmode))
13634 {
13635 op1 = convert_memory_address (Pmode, op1);
13636 op1 = copy_addr_to_reg (op1);
13637 }
13638 op1 = gen_rtx_MEM (XImode, op1);
13639
13640 if (fcode == IX86_BUILTIN_MOVDIR64B)
13641 {
13642 emit_insn (gen_movdir64b (Pmode, op0, op1));
13643 return 0;
13644 }
13645 else
13646 {
13647 if (target == 0
13648 || !register_operand (target, SImode))
13649 target = gen_reg_rtx (SImode);
6a10feda 13650
13651 emit_move_insn (target, const0_rtx);
13652 target = gen_rtx_SUBREG (QImode, target, 0);
13653
13654 int unspecv = (fcode == IX86_BUILTIN_ENQCMD
13655 ? UNSPECV_ENQCMD
13656 : UNSPECV_ENQCMDS);
13657 icode = code_for_enqcmd (unspecv, Pmode);
13658 emit_insn (GEN_FCN (icode) (op0, op1));
6a10feda 13659
13660 emit_insn
13661 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
13662 gen_rtx_fmt_ee (EQ, QImode,
13663 gen_rtx_REG (CCZmode, FLAGS_REG),
13664 const0_rtx)));
13665 return SUBREG_REG (target);
13666 }
13667
13668 case IX86_BUILTIN_FXSAVE:
13669 case IX86_BUILTIN_FXRSTOR:
13670 case IX86_BUILTIN_FXSAVE64:
13671 case IX86_BUILTIN_FXRSTOR64:
13672 case IX86_BUILTIN_FNSTENV:
13673 case IX86_BUILTIN_FLDENV:
13674 mode0 = BLKmode;
13675 switch (fcode)
13676 {
13677 case IX86_BUILTIN_FXSAVE:
13678 icode = CODE_FOR_fxsave;
13679 break;
13680 case IX86_BUILTIN_FXRSTOR:
13681 icode = CODE_FOR_fxrstor;
13682 break;
13683 case IX86_BUILTIN_FXSAVE64:
13684 icode = CODE_FOR_fxsave64;
13685 break;
13686 case IX86_BUILTIN_FXRSTOR64:
13687 icode = CODE_FOR_fxrstor64;
13688 break;
13689 case IX86_BUILTIN_FNSTENV:
13690 icode = CODE_FOR_fnstenv;
13691 break;
13692 case IX86_BUILTIN_FLDENV:
13693 icode = CODE_FOR_fldenv;
13694 break;
13695 default:
13696 gcc_unreachable ();
13697 }
13698
13699 arg0 = CALL_EXPR_ARG (exp, 0);
13700 op0 = expand_normal (arg0);
13701
13702 if (!address_operand (op0, VOIDmode))
13703 {
13704 op0 = convert_memory_address (Pmode, op0);
13705 op0 = copy_addr_to_reg (op0);
13706 }
13707 op0 = gen_rtx_MEM (mode0, op0);
13708
13709 pat = GEN_FCN (icode) (op0);
13710 if (pat)
13711 emit_insn (pat);
13712 return 0;
13713
13714 case IX86_BUILTIN_XSETBV:
13715 arg0 = CALL_EXPR_ARG (exp, 0);
13716 arg1 = CALL_EXPR_ARG (exp, 1);
13717 op0 = expand_normal (arg0);
13718 op1 = expand_normal (arg1);
13719
13720 if (!REG_P (op0))
13721 op0 = copy_to_mode_reg (SImode, op0);
13722
13723 op1 = force_reg (DImode, op1);
13724
13725 if (TARGET_64BIT)
13726 {
13727 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
13728 NULL, 1, OPTAB_DIRECT);
13729
13730 icode = CODE_FOR_xsetbv_rex64;
13731
13732 op2 = gen_lowpart (SImode, op2);
13733 op1 = gen_lowpart (SImode, op1);
13734 pat = GEN_FCN (icode) (op0, op1, op2);
13735 }
13736 else
13737 {
13738 icode = CODE_FOR_xsetbv;
13739
13740 pat = GEN_FCN (icode) (op0, op1);
13741 }
13742 if (pat)
13743 emit_insn (pat);
13744 return 0;
13745
13746 case IX86_BUILTIN_XSAVE:
13747 case IX86_BUILTIN_XRSTOR:
13748 case IX86_BUILTIN_XSAVE64:
13749 case IX86_BUILTIN_XRSTOR64:
13750 case IX86_BUILTIN_XSAVEOPT:
13751 case IX86_BUILTIN_XSAVEOPT64:
13752 case IX86_BUILTIN_XSAVES:
13753 case IX86_BUILTIN_XRSTORS:
13754 case IX86_BUILTIN_XSAVES64:
13755 case IX86_BUILTIN_XRSTORS64:
13756 case IX86_BUILTIN_XSAVEC:
13757 case IX86_BUILTIN_XSAVEC64:
13758 arg0 = CALL_EXPR_ARG (exp, 0);
13759 arg1 = CALL_EXPR_ARG (exp, 1);
13760 op0 = expand_normal (arg0);
13761 op1 = expand_normal (arg1);
13762
13763 if (!address_operand (op0, VOIDmode))
13764 {
13765 op0 = convert_memory_address (Pmode, op0);
13766 op0 = copy_addr_to_reg (op0);
13767 }
13768 op0 = gen_rtx_MEM (BLKmode, op0);
13769
13770 op1 = force_reg (DImode, op1);
13771
13772 if (TARGET_64BIT)
13773 {
13774 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
13775 NULL, 1, OPTAB_DIRECT);
13776 switch (fcode)
13777 {
13778 case IX86_BUILTIN_XSAVE:
13779 icode = CODE_FOR_xsave_rex64;
13780 break;
13781 case IX86_BUILTIN_XRSTOR:
13782 icode = CODE_FOR_xrstor_rex64;
13783 break;
13784 case IX86_BUILTIN_XSAVE64:
13785 icode = CODE_FOR_xsave64;
13786 break;
13787 case IX86_BUILTIN_XRSTOR64:
13788 icode = CODE_FOR_xrstor64;
13789 break;
13790 case IX86_BUILTIN_XSAVEOPT:
13791 icode = CODE_FOR_xsaveopt_rex64;
13792 break;
13793 case IX86_BUILTIN_XSAVEOPT64:
13794 icode = CODE_FOR_xsaveopt64;
13795 break;
13796 case IX86_BUILTIN_XSAVES:
13797 icode = CODE_FOR_xsaves_rex64;
13798 break;
13799 case IX86_BUILTIN_XRSTORS:
13800 icode = CODE_FOR_xrstors_rex64;
13801 break;
13802 case IX86_BUILTIN_XSAVES64:
13803 icode = CODE_FOR_xsaves64;
13804 break;
13805 case IX86_BUILTIN_XRSTORS64:
13806 icode = CODE_FOR_xrstors64;
13807 break;
13808 case IX86_BUILTIN_XSAVEC:
13809 icode = CODE_FOR_xsavec_rex64;
13810 break;
13811 case IX86_BUILTIN_XSAVEC64:
13812 icode = CODE_FOR_xsavec64;
13813 break;
13814 default:
13815 gcc_unreachable ();
13816 }
13817
13818 op2 = gen_lowpart (SImode, op2);
13819 op1 = gen_lowpart (SImode, op1);
13820 pat = GEN_FCN (icode) (op0, op1, op2);
13821 }
13822 else
13823 {
13824 switch (fcode)
13825 {
13826 case IX86_BUILTIN_XSAVE:
13827 icode = CODE_FOR_xsave;
13828 break;
13829 case IX86_BUILTIN_XRSTOR:
13830 icode = CODE_FOR_xrstor;
13831 break;
13832 case IX86_BUILTIN_XSAVEOPT:
13833 icode = CODE_FOR_xsaveopt;
13834 break;
13835 case IX86_BUILTIN_XSAVES:
13836 icode = CODE_FOR_xsaves;
13837 break;
13838 case IX86_BUILTIN_XRSTORS:
13839 icode = CODE_FOR_xrstors;
13840 break;
13841 case IX86_BUILTIN_XSAVEC:
13842 icode = CODE_FOR_xsavec;
13843 break;
13844 default:
13845 gcc_unreachable ();
13846 }
13847 pat = GEN_FCN (icode) (op0, op1);
13848 }
13849
13850 if (pat)
13851 emit_insn (pat);
13852 return 0;
13853
13854 case IX86_BUILTIN_LLWPCB:
13855 arg0 = CALL_EXPR_ARG (exp, 0);
13856 op0 = expand_normal (arg0);
13857
13858 if (!register_operand (op0, Pmode))
2bf6d935 13859 op0 = ix86_zero_extend_to_Pmode (op0);
2398c206 13860 emit_insn (gen_lwp_llwpcb (Pmode, op0));
13861 return 0;
13862
13863 case IX86_BUILTIN_SLWPCB:
2bf6d935 13864 if (!target
2398c206 13865 || !register_operand (target, Pmode))
2bf6d935 13866 target = gen_reg_rtx (Pmode);
2398c206 13867 emit_insn (gen_lwp_slwpcb (Pmode, target));
13868 return target;
13869
13870 case IX86_BUILTIN_LWPVAL32:
13871 case IX86_BUILTIN_LWPVAL64:
13872 case IX86_BUILTIN_LWPINS32:
13873 case IX86_BUILTIN_LWPINS64:
13874 mode = ((fcode == IX86_BUILTIN_LWPVAL32
13875 || fcode == IX86_BUILTIN_LWPINS32)
13876 ? SImode : DImode);
13877
13878 if (fcode == IX86_BUILTIN_LWPVAL32
13879 || fcode == IX86_BUILTIN_LWPVAL64)
13880 icode = code_for_lwp_lwpval (mode);
13881 else
13882 icode = code_for_lwp_lwpins (mode);
13883
13884 arg0 = CALL_EXPR_ARG (exp, 0);
13885 arg1 = CALL_EXPR_ARG (exp, 1);
13886 arg2 = CALL_EXPR_ARG (exp, 2);
13887 op0 = expand_normal (arg0);
13888 op1 = expand_normal (arg1);
13889 op2 = expand_normal (arg2);
13890 mode0 = insn_data[icode].operand[0].mode;
13891
13892 if (!insn_data[icode].operand[0].predicate (op0, mode0))
13893 op0 = copy_to_mode_reg (mode0, op0);
13894 if (!insn_data[icode].operand[1].predicate (op1, SImode))
13895 op1 = copy_to_mode_reg (SImode, op1);
13896
13897 if (!CONST_INT_P (op2))
13898 {
13899 error ("the last argument must be a 32-bit immediate");
13900 return const0_rtx;
13901 }
13902
13903 emit_insn (GEN_FCN (icode) (op0, op1, op2));
13904
13905 if (fcode == IX86_BUILTIN_LWPINS32
13906 || fcode == IX86_BUILTIN_LWPINS64)
13907 {
13908 if (target == 0
13909 || !nonimmediate_operand (target, QImode))
13910 target = gen_reg_rtx (QImode);
13911
13912 pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
13913 const0_rtx);
13914 emit_insn (gen_rtx_SET (target, pat));
13915
13916 return target;
13917 }
13918 else
13919 return 0;
13920
13921 case IX86_BUILTIN_BEXTRI32:
13922 case IX86_BUILTIN_BEXTRI64:
13923 mode = (fcode == IX86_BUILTIN_BEXTRI32 ? SImode : DImode);
13924
13925 arg0 = CALL_EXPR_ARG (exp, 0);
13926 arg1 = CALL_EXPR_ARG (exp, 1);
13927 op0 = expand_normal (arg0);
13928 op1 = expand_normal (arg1);
9e026191 13929
2bf6d935 13930 if (!CONST_INT_P (op1))
13931 {
13932 error ("last argument must be an immediate");
13933 return const0_rtx;
13934 }
2bf6d935 13935 else
13936 {
13937 unsigned char lsb_index = UINTVAL (op1);
13938 unsigned char length = UINTVAL (op1) >> 8;
13939
13940 unsigned char bitsize = GET_MODE_BITSIZE (mode);
13941
13942 icode = code_for_tbm_bextri (mode);
13943
13944 mode1 = insn_data[icode].operand[1].mode;
13945 if (!insn_data[icode].operand[1].predicate (op0, mode1))
13946 op0 = copy_to_mode_reg (mode1, op0);
13947
13948 mode0 = insn_data[icode].operand[0].mode;
13949 if (target == 0
13950 || !register_operand (target, mode0))
13951 target = gen_reg_rtx (mode0);
13952
13953 if (length == 0 || lsb_index >= bitsize)
13954 {
13955 emit_move_insn (target, const0_rtx);
13956 return target;
13957 }
13958
13959 if (length + lsb_index > bitsize)
13960 length = bitsize - lsb_index;
13961
13962 op1 = GEN_INT (length);
13963 op2 = GEN_INT (lsb_index);
13964
13965 emit_insn (GEN_FCN (icode) (target, op0, op1, op2));
13966 return target;
13967 }
13968
13969 case IX86_BUILTIN_RDRAND16_STEP:
9e026191 13970 mode = HImode;
13971 goto rdrand_step;
13972
13973 case IX86_BUILTIN_RDRAND32_STEP:
9e026191 13974 mode = SImode;
13975 goto rdrand_step;
13976
13977 case IX86_BUILTIN_RDRAND64_STEP:
9e026191 13978 mode = DImode;
13979
13980rdrand_step:
13981 arg0 = CALL_EXPR_ARG (exp, 0);
13982 op1 = expand_normal (arg0);
13983 if (!address_operand (op1, VOIDmode))
13984 {
13985 op1 = convert_memory_address (Pmode, op1);
13986 op1 = copy_addr_to_reg (op1);
13987 }
13988
13989 op0 = gen_reg_rtx (mode);
13990 emit_insn (gen_rdrand (mode, op0));
2bf6d935 13991
9e026191 13992 emit_move_insn (gen_rtx_MEM (mode, op1), op0);
2bf6d935 13993
9e026191 13994 op1 = force_reg (SImode, const1_rtx);
13995
13996 /* Emit SImode conditional move. */
9e026191 13997 if (mode == HImode)
13998 {
13999 if (TARGET_ZERO_EXTEND_WITH_AND
14000 && optimize_function_for_speed_p (cfun))
14001 {
14002 op2 = force_reg (SImode, const0_rtx);
14003
14004 emit_insn (gen_movstricthi
14005 (gen_lowpart (HImode, op2), op0));
14006 }
14007 else
14008 {
14009 op2 = gen_reg_rtx (SImode);
14010
14011 emit_insn (gen_zero_extendhisi2 (op2, op0));
14012 }
14013 }
9e026191 14014 else if (mode == SImode)
14015 op2 = op0;
14016 else
14017 op2 = gen_rtx_SUBREG (SImode, op0, 0);
14018
14019 if (target == 0
14020 || !register_operand (target, SImode))
14021 target = gen_reg_rtx (SImode);
14022
14023 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
14024 const0_rtx);
14025 emit_insn (gen_rtx_SET (target,
14026 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
14027 return target;
14028
14029 case IX86_BUILTIN_RDSEED16_STEP:
9e026191 14030 mode = HImode;
14031 goto rdseed_step;
14032
14033 case IX86_BUILTIN_RDSEED32_STEP:
9e026191 14034 mode = SImode;
14035 goto rdseed_step;
14036
14037 case IX86_BUILTIN_RDSEED64_STEP:
9e026191 14038 mode = DImode;
14039
14040rdseed_step:
14041 arg0 = CALL_EXPR_ARG (exp, 0);
14042 op1 = expand_normal (arg0);
14043 if (!address_operand (op1, VOIDmode))
14044 {
14045 op1 = convert_memory_address (Pmode, op1);
14046 op1 = copy_addr_to_reg (op1);
14047 }
14048
14049 op0 = gen_reg_rtx (mode);
14050 emit_insn (gen_rdseed (mode, op0));
2bf6d935 14051
9e026191 14052 emit_move_insn (gen_rtx_MEM (mode, op1), op0);
14053
14054 op2 = gen_reg_rtx (QImode);
14055
14056 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
14057 const0_rtx);
14058 emit_insn (gen_rtx_SET (op2, pat));
14059
14060 if (target == 0
14061 || !register_operand (target, SImode))
14062 target = gen_reg_rtx (SImode);
14063
14064 emit_insn (gen_zero_extendqisi2 (target, op2));
14065 return target;
14066
14067 case IX86_BUILTIN_SBB32:
14068 icode = CODE_FOR_subborrowsi;
14069 icode2 = CODE_FOR_subborrowsi_0;
14070 mode0 = SImode;
14071 mode1 = DImode;
14072 mode2 = CCmode;
14073 goto handlecarry;
14074
14075 case IX86_BUILTIN_SBB64:
14076 icode = CODE_FOR_subborrowdi;
14077 icode2 = CODE_FOR_subborrowdi_0;
14078 mode0 = DImode;
14079 mode1 = TImode;
14080 mode2 = CCmode;
14081 goto handlecarry;
14082
14083 case IX86_BUILTIN_ADDCARRYX32:
14084 icode = CODE_FOR_addcarrysi;
14085 icode2 = CODE_FOR_addcarrysi_0;
14086 mode0 = SImode;
14087 mode1 = DImode;
14088 mode2 = CCCmode;
14089 goto handlecarry;
14090
14091 case IX86_BUILTIN_ADDCARRYX64:
14092 icode = CODE_FOR_addcarrydi;
14093 icode2 = CODE_FOR_addcarrydi_0;
14094 mode0 = DImode;
14095 mode1 = TImode;
14096 mode2 = CCCmode;
14097
14098 handlecarry:
14099 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
14100 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
14101 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
14102 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
14103
14104 op1 = expand_normal (arg0);
14105
14106 op2 = expand_normal (arg1);
14107 if (!register_operand (op2, mode0))
14108 op2 = copy_to_mode_reg (mode0, op2);
14109
14110 op3 = expand_normal (arg2);
14111 if (!register_operand (op3, mode0))
14112 op3 = copy_to_mode_reg (mode0, op3);
14113
14114 op4 = expand_normal (arg3);
14115 if (!address_operand (op4, VOIDmode))
14116 {
14117 op4 = convert_memory_address (Pmode, op4);
14118 op4 = copy_addr_to_reg (op4);
14119 }
14120
14121 op0 = gen_reg_rtx (mode0);
eba3565c 14122 if (op1 == const0_rtx)
14123 {
14124 /* If arg0 is 0, optimize right away into an add or sub
14125 instruction that sets the CCCmode flags. */
14126 op1 = gen_rtx_REG (mode2, FLAGS_REG);
14127 emit_insn (GEN_FCN (icode2) (op0, op2, op3));
14128 }
14129 else
14130 {
14131 /* Generate CF from input operand. */
af29d0d6 14132 ix86_expand_carry (op1);
14133
14134 /* Generate instruction that consumes CF. */
14135 op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
14136 pat = gen_rtx_LTU (mode1, op1, const0_rtx);
14137 pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
14138 emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
14139 }
14140
14141 /* Return current CF value. */
14142 if (target == 0)
14143 target = gen_reg_rtx (QImode);
14144
14145 pat = gen_rtx_LTU (QImode, op1, const0_rtx);
14146 emit_insn (gen_rtx_SET (target, pat));
14147
14148 /* Store the result. */
14149 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
14150
14151 return target;
14152
14153 case IX86_BUILTIN_READ_FLAGS:
14154 if (ignore)
14155 return const0_rtx;
14156
14157 emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
14158
14159 if (optimize
14160 || target == NULL_RTX
14161 || !nonimmediate_operand (target, word_mode)
14162 || GET_MODE (target) != word_mode)
14163 target = gen_reg_rtx (word_mode);
14164
14165 emit_insn (gen_pop (target));
14166 return target;
14167
14168 case IX86_BUILTIN_WRITE_FLAGS:
14169
14170 arg0 = CALL_EXPR_ARG (exp, 0);
14171 op0 = expand_normal (arg0);
14172 if (!general_no_elim_operand (op0, word_mode))
14173 op0 = copy_to_mode_reg (word_mode, op0);
14174
14175 emit_insn (gen_push (op0));
14176 emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
14177 return 0;
14178
14179 case IX86_BUILTIN_KTESTC8:
14180 icode = CODE_FOR_ktestqi;
14181 mode3 = CCCmode;
14182 goto kortest;
14183
14184 case IX86_BUILTIN_KTESTZ8:
14185 icode = CODE_FOR_ktestqi;
14186 mode3 = CCZmode;
14187 goto kortest;
14188
14189 case IX86_BUILTIN_KTESTC16:
14190 icode = CODE_FOR_ktesthi;
14191 mode3 = CCCmode;
14192 goto kortest;
14193
14194 case IX86_BUILTIN_KTESTZ16:
14195 icode = CODE_FOR_ktesthi;
14196 mode3 = CCZmode;
14197 goto kortest;
14198
14199 case IX86_BUILTIN_KTESTC32:
14200 icode = CODE_FOR_ktestsi;
14201 mode3 = CCCmode;
14202 goto kortest;
14203
14204 case IX86_BUILTIN_KTESTZ32:
14205 icode = CODE_FOR_ktestsi;
14206 mode3 = CCZmode;
14207 goto kortest;
14208
14209 case IX86_BUILTIN_KTESTC64:
14210 icode = CODE_FOR_ktestdi;
14211 mode3 = CCCmode;
14212 goto kortest;
14213
14214 case IX86_BUILTIN_KTESTZ64:
14215 icode = CODE_FOR_ktestdi;
14216 mode3 = CCZmode;
14217 goto kortest;
14218
14219 case IX86_BUILTIN_KORTESTC8:
14220 icode = CODE_FOR_kortestqi;
14221 mode3 = CCCmode;
14222 goto kortest;
14223
14224 case IX86_BUILTIN_KORTESTZ8:
14225 icode = CODE_FOR_kortestqi;
14226 mode3 = CCZmode;
14227 goto kortest;
14228
14229 case IX86_BUILTIN_KORTESTC16:
14230 icode = CODE_FOR_kortesthi;
14231 mode3 = CCCmode;
14232 goto kortest;
14233
14234 case IX86_BUILTIN_KORTESTZ16:
14235 icode = CODE_FOR_kortesthi;
14236 mode3 = CCZmode;
14237 goto kortest;
14238
14239 case IX86_BUILTIN_KORTESTC32:
14240 icode = CODE_FOR_kortestsi;
14241 mode3 = CCCmode;
14242 goto kortest;
14243
14244 case IX86_BUILTIN_KORTESTZ32:
14245 icode = CODE_FOR_kortestsi;
14246 mode3 = CCZmode;
14247 goto kortest;
14248
14249 case IX86_BUILTIN_KORTESTC64:
14250 icode = CODE_FOR_kortestdi;
14251 mode3 = CCCmode;
14252 goto kortest;
14253
14254 case IX86_BUILTIN_KORTESTZ64:
14255 icode = CODE_FOR_kortestdi;
14256 mode3 = CCZmode;
14257
14258 kortest:
14259 arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
14260 arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
14261 op0 = expand_normal (arg0);
14262 op1 = expand_normal (arg1);
14263
14264 mode0 = insn_data[icode].operand[0].mode;
14265 mode1 = insn_data[icode].operand[1].mode;
14266
14267 if (GET_MODE (op0) != VOIDmode)
14268 op0 = force_reg (GET_MODE (op0), op0);
14269
14270 op0 = gen_lowpart (mode0, op0);
14271
14272 if (!insn_data[icode].operand[0].predicate (op0, mode0))
14273 op0 = copy_to_mode_reg (mode0, op0);
14274
14275 if (GET_MODE (op1) != VOIDmode)
14276 op1 = force_reg (GET_MODE (op1), op1);
14277
14278 op1 = gen_lowpart (mode1, op1);
14279
14280 if (!insn_data[icode].operand[1].predicate (op1, mode1))
14281 op1 = copy_to_mode_reg (mode1, op1);
14282
14283 target = gen_reg_rtx (QImode);
14284
14285 /* Emit kortest. */
14286 emit_insn (GEN_FCN (icode) (op0, op1));
14287 /* And use setcc to return result from flags. */
14288 ix86_expand_setcc (target, EQ,
14289 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
14290 return target;
14291
14292 case IX86_BUILTIN_GATHERSIV2DF:
14293 icode = CODE_FOR_avx2_gathersiv2df;
14294 goto gather_gen;
14295 case IX86_BUILTIN_GATHERSIV4DF:
14296 icode = CODE_FOR_avx2_gathersiv4df;
14297 goto gather_gen;
14298 case IX86_BUILTIN_GATHERDIV2DF:
14299 icode = CODE_FOR_avx2_gatherdiv2df;
14300 goto gather_gen;
14301 case IX86_BUILTIN_GATHERDIV4DF:
14302 icode = CODE_FOR_avx2_gatherdiv4df;
14303 goto gather_gen;
14304 case IX86_BUILTIN_GATHERSIV4SF:
14305 icode = CODE_FOR_avx2_gathersiv4sf;
14306 goto gather_gen;
14307 case IX86_BUILTIN_GATHERSIV8SF:
14308 icode = CODE_FOR_avx2_gathersiv8sf;
14309 goto gather_gen;
14310 case IX86_BUILTIN_GATHERDIV4SF:
14311 icode = CODE_FOR_avx2_gatherdiv4sf;
14312 goto gather_gen;
14313 case IX86_BUILTIN_GATHERDIV8SF:
14314 icode = CODE_FOR_avx2_gatherdiv8sf;
14315 goto gather_gen;
14316 case IX86_BUILTIN_GATHERSIV2DI:
14317 icode = CODE_FOR_avx2_gathersiv2di;
14318 goto gather_gen;
14319 case IX86_BUILTIN_GATHERSIV4DI:
14320 icode = CODE_FOR_avx2_gathersiv4di;
14321 goto gather_gen;
14322 case IX86_BUILTIN_GATHERDIV2DI:
14323 icode = CODE_FOR_avx2_gatherdiv2di;
14324 goto gather_gen;
14325 case IX86_BUILTIN_GATHERDIV4DI:
14326 icode = CODE_FOR_avx2_gatherdiv4di;
14327 goto gather_gen;
14328 case IX86_BUILTIN_GATHERSIV4SI:
14329 icode = CODE_FOR_avx2_gathersiv4si;
14330 goto gather_gen;
14331 case IX86_BUILTIN_GATHERSIV8SI:
14332 icode = CODE_FOR_avx2_gathersiv8si;
14333 goto gather_gen;
14334 case IX86_BUILTIN_GATHERDIV4SI:
14335 icode = CODE_FOR_avx2_gatherdiv4si;
14336 goto gather_gen;
14337 case IX86_BUILTIN_GATHERDIV8SI:
14338 icode = CODE_FOR_avx2_gatherdiv8si;
14339 goto gather_gen;
14340 case IX86_BUILTIN_GATHERALTSIV4DF:
14341 icode = CODE_FOR_avx2_gathersiv4df;
14342 goto gather_gen;
14343 case IX86_BUILTIN_GATHERALTDIV8SF:
14344 icode = CODE_FOR_avx2_gatherdiv8sf;
14345 goto gather_gen;
14346 case IX86_BUILTIN_GATHERALTSIV4DI:
14347 icode = CODE_FOR_avx2_gathersiv4di;
14348 goto gather_gen;
14349 case IX86_BUILTIN_GATHERALTDIV8SI:
14350 icode = CODE_FOR_avx2_gatherdiv8si;
14351 goto gather_gen;
14352 case IX86_BUILTIN_GATHER3SIV16SF:
14353 icode = CODE_FOR_avx512f_gathersiv16sf;
14354 goto gather_gen;
14355 case IX86_BUILTIN_GATHER3SIV8DF:
14356 icode = CODE_FOR_avx512f_gathersiv8df;
14357 goto gather_gen;
14358 case IX86_BUILTIN_GATHER3DIV16SF:
14359 icode = CODE_FOR_avx512f_gatherdiv16sf;
14360 goto gather_gen;
14361 case IX86_BUILTIN_GATHER3DIV8DF:
14362 icode = CODE_FOR_avx512f_gatherdiv8df;
14363 goto gather_gen;
14364 case IX86_BUILTIN_GATHER3SIV16SI:
14365 icode = CODE_FOR_avx512f_gathersiv16si;
14366 goto gather_gen;
14367 case IX86_BUILTIN_GATHER3SIV8DI:
14368 icode = CODE_FOR_avx512f_gathersiv8di;
14369 goto gather_gen;
14370 case IX86_BUILTIN_GATHER3DIV16SI:
14371 icode = CODE_FOR_avx512f_gatherdiv16si;
14372 goto gather_gen;
14373 case IX86_BUILTIN_GATHER3DIV8DI:
14374 icode = CODE_FOR_avx512f_gatherdiv8di;
14375 goto gather_gen;
14376 case IX86_BUILTIN_GATHER3ALTSIV8DF:
14377 icode = CODE_FOR_avx512f_gathersiv8df;
14378 goto gather_gen;
14379 case IX86_BUILTIN_GATHER3ALTDIV16SF:
14380 icode = CODE_FOR_avx512f_gatherdiv16sf;
14381 goto gather_gen;
14382 case IX86_BUILTIN_GATHER3ALTSIV8DI:
14383 icode = CODE_FOR_avx512f_gathersiv8di;
14384 goto gather_gen;
14385 case IX86_BUILTIN_GATHER3ALTDIV16SI:
14386 icode = CODE_FOR_avx512f_gatherdiv16si;
14387 goto gather_gen;
14388 case IX86_BUILTIN_GATHER3SIV2DF:
14389 icode = CODE_FOR_avx512vl_gathersiv2df;
14390 goto gather_gen;
14391 case IX86_BUILTIN_GATHER3SIV4DF:
14392 icode = CODE_FOR_avx512vl_gathersiv4df;
14393 goto gather_gen;
14394 case IX86_BUILTIN_GATHER3DIV2DF:
14395 icode = CODE_FOR_avx512vl_gatherdiv2df;
14396 goto gather_gen;
14397 case IX86_BUILTIN_GATHER3DIV4DF:
14398 icode = CODE_FOR_avx512vl_gatherdiv4df;
14399 goto gather_gen;
14400 case IX86_BUILTIN_GATHER3SIV4SF:
14401 icode = CODE_FOR_avx512vl_gathersiv4sf;
14402 goto gather_gen;
14403 case IX86_BUILTIN_GATHER3SIV8SF:
14404 icode = CODE_FOR_avx512vl_gathersiv8sf;
14405 goto gather_gen;
14406 case IX86_BUILTIN_GATHER3DIV4SF:
14407 icode = CODE_FOR_avx512vl_gatherdiv4sf;
14408 goto gather_gen;
14409 case IX86_BUILTIN_GATHER3DIV8SF:
14410 icode = CODE_FOR_avx512vl_gatherdiv8sf;
14411 goto gather_gen;
14412 case IX86_BUILTIN_GATHER3SIV2DI:
14413 icode = CODE_FOR_avx512vl_gathersiv2di;
14414 goto gather_gen;
14415 case IX86_BUILTIN_GATHER3SIV4DI:
14416 icode = CODE_FOR_avx512vl_gathersiv4di;
14417 goto gather_gen;
14418 case IX86_BUILTIN_GATHER3DIV2DI:
14419 icode = CODE_FOR_avx512vl_gatherdiv2di;
14420 goto gather_gen;
14421 case IX86_BUILTIN_GATHER3DIV4DI:
14422 icode = CODE_FOR_avx512vl_gatherdiv4di;
14423 goto gather_gen;
14424 case IX86_BUILTIN_GATHER3SIV4SI:
14425 icode = CODE_FOR_avx512vl_gathersiv4si;
14426 goto gather_gen;
14427 case IX86_BUILTIN_GATHER3SIV8SI:
14428 icode = CODE_FOR_avx512vl_gathersiv8si;
14429 goto gather_gen;
14430 case IX86_BUILTIN_GATHER3DIV4SI:
14431 icode = CODE_FOR_avx512vl_gatherdiv4si;
14432 goto gather_gen;
14433 case IX86_BUILTIN_GATHER3DIV8SI:
14434 icode = CODE_FOR_avx512vl_gatherdiv8si;
14435 goto gather_gen;
14436 case IX86_BUILTIN_GATHER3ALTSIV4DF:
14437 icode = CODE_FOR_avx512vl_gathersiv4df;
14438 goto gather_gen;
14439 case IX86_BUILTIN_GATHER3ALTDIV8SF:
14440 icode = CODE_FOR_avx512vl_gatherdiv8sf;
14441 goto gather_gen;
14442 case IX86_BUILTIN_GATHER3ALTSIV4DI:
14443 icode = CODE_FOR_avx512vl_gathersiv4di;
14444 goto gather_gen;
14445 case IX86_BUILTIN_GATHER3ALTDIV8SI:
14446 icode = CODE_FOR_avx512vl_gatherdiv8si;
14447 goto gather_gen;
14448 case IX86_BUILTIN_SCATTERSIV16SF:
14449 icode = CODE_FOR_avx512f_scattersiv16sf;
14450 goto scatter_gen;
14451 case IX86_BUILTIN_SCATTERSIV8DF:
14452 icode = CODE_FOR_avx512f_scattersiv8df;
14453 goto scatter_gen;
14454 case IX86_BUILTIN_SCATTERDIV16SF:
14455 icode = CODE_FOR_avx512f_scatterdiv16sf;
14456 goto scatter_gen;
14457 case IX86_BUILTIN_SCATTERDIV8DF:
14458 icode = CODE_FOR_avx512f_scatterdiv8df;
14459 goto scatter_gen;
14460 case IX86_BUILTIN_SCATTERSIV16SI:
14461 icode = CODE_FOR_avx512f_scattersiv16si;
14462 goto scatter_gen;
14463 case IX86_BUILTIN_SCATTERSIV8DI:
14464 icode = CODE_FOR_avx512f_scattersiv8di;
14465 goto scatter_gen;
14466 case IX86_BUILTIN_SCATTERDIV16SI:
14467 icode = CODE_FOR_avx512f_scatterdiv16si;
14468 goto scatter_gen;
14469 case IX86_BUILTIN_SCATTERDIV8DI:
14470 icode = CODE_FOR_avx512f_scatterdiv8di;
14471 goto scatter_gen;
14472 case IX86_BUILTIN_SCATTERSIV8SF:
14473 icode = CODE_FOR_avx512vl_scattersiv8sf;
14474 goto scatter_gen;
14475 case IX86_BUILTIN_SCATTERSIV4SF:
14476 icode = CODE_FOR_avx512vl_scattersiv4sf;
14477 goto scatter_gen;
14478 case IX86_BUILTIN_SCATTERSIV4DF:
14479 icode = CODE_FOR_avx512vl_scattersiv4df;
14480 goto scatter_gen;
14481 case IX86_BUILTIN_SCATTERSIV2DF:
14482 icode = CODE_FOR_avx512vl_scattersiv2df;
14483 goto scatter_gen;
14484 case IX86_BUILTIN_SCATTERDIV8SF:
14485 icode = CODE_FOR_avx512vl_scatterdiv8sf;
14486 goto scatter_gen;
14487 case IX86_BUILTIN_SCATTERDIV4SF:
14488 icode = CODE_FOR_avx512vl_scatterdiv4sf;
14489 goto scatter_gen;
14490 case IX86_BUILTIN_SCATTERDIV4DF:
14491 icode = CODE_FOR_avx512vl_scatterdiv4df;
14492 goto scatter_gen;
14493 case IX86_BUILTIN_SCATTERDIV2DF:
14494 icode = CODE_FOR_avx512vl_scatterdiv2df;
14495 goto scatter_gen;
14496 case IX86_BUILTIN_SCATTERSIV8SI:
14497 icode = CODE_FOR_avx512vl_scattersiv8si;
14498 goto scatter_gen;
14499 case IX86_BUILTIN_SCATTERSIV4SI:
14500 icode = CODE_FOR_avx512vl_scattersiv4si;
14501 goto scatter_gen;
14502 case IX86_BUILTIN_SCATTERSIV4DI:
14503 icode = CODE_FOR_avx512vl_scattersiv4di;
14504 goto scatter_gen;
14505 case IX86_BUILTIN_SCATTERSIV2DI:
14506 icode = CODE_FOR_avx512vl_scattersiv2di;
14507 goto scatter_gen;
14508 case IX86_BUILTIN_SCATTERDIV8SI:
14509 icode = CODE_FOR_avx512vl_scatterdiv8si;
14510 goto scatter_gen;
14511 case IX86_BUILTIN_SCATTERDIV4SI:
14512 icode = CODE_FOR_avx512vl_scatterdiv4si;
14513 goto scatter_gen;
14514 case IX86_BUILTIN_SCATTERDIV4DI:
14515 icode = CODE_FOR_avx512vl_scatterdiv4di;
14516 goto scatter_gen;
14517 case IX86_BUILTIN_SCATTERDIV2DI:
14518 icode = CODE_FOR_avx512vl_scatterdiv2di;
14519 goto scatter_gen;
14520 case IX86_BUILTIN_GATHERPFDPD:
14521 icode = CODE_FOR_avx512pf_gatherpfv8sidf;
14522 goto vec_prefetch_gen;
14523 case IX86_BUILTIN_SCATTERALTSIV8DF:
14524 icode = CODE_FOR_avx512f_scattersiv8df;
14525 goto scatter_gen;
14526 case IX86_BUILTIN_SCATTERALTDIV16SF:
14527 icode = CODE_FOR_avx512f_scatterdiv16sf;
14528 goto scatter_gen;
14529 case IX86_BUILTIN_SCATTERALTSIV8DI:
14530 icode = CODE_FOR_avx512f_scattersiv8di;
14531 goto scatter_gen;
14532 case IX86_BUILTIN_SCATTERALTDIV16SI:
14533 icode = CODE_FOR_avx512f_scatterdiv16si;
14534 goto scatter_gen;
14535 case IX86_BUILTIN_SCATTERALTSIV4DF:
14536 icode = CODE_FOR_avx512vl_scattersiv4df;
14537 goto scatter_gen;
14538 case IX86_BUILTIN_SCATTERALTDIV8SF:
14539 icode = CODE_FOR_avx512vl_scatterdiv8sf;
14540 goto scatter_gen;
14541 case IX86_BUILTIN_SCATTERALTSIV4DI:
14542 icode = CODE_FOR_avx512vl_scattersiv4di;
14543 goto scatter_gen;
14544 case IX86_BUILTIN_SCATTERALTDIV8SI:
14545 icode = CODE_FOR_avx512vl_scatterdiv8si;
14546 goto scatter_gen;
14547 case IX86_BUILTIN_SCATTERALTSIV2DF:
14548 icode = CODE_FOR_avx512vl_scattersiv2df;
14549 goto scatter_gen;
14550 case IX86_BUILTIN_SCATTERALTDIV4SF:
14551 icode = CODE_FOR_avx512vl_scatterdiv4sf;
14552 goto scatter_gen;
14553 case IX86_BUILTIN_SCATTERALTSIV2DI:
14554 icode = CODE_FOR_avx512vl_scattersiv2di;
14555 goto scatter_gen;
14556 case IX86_BUILTIN_SCATTERALTDIV4SI:
14557 icode = CODE_FOR_avx512vl_scatterdiv4si;
14558 goto scatter_gen;
14559 case IX86_BUILTIN_GATHERPFDPS:
14560 icode = CODE_FOR_avx512pf_gatherpfv16sisf;
14561 goto vec_prefetch_gen;
14562 case IX86_BUILTIN_GATHERPFQPD:
14563 icode = CODE_FOR_avx512pf_gatherpfv8didf;
14564 goto vec_prefetch_gen;
14565 case IX86_BUILTIN_GATHERPFQPS:
14566 icode = CODE_FOR_avx512pf_gatherpfv8disf;
14567 goto vec_prefetch_gen;
14568 case IX86_BUILTIN_SCATTERPFDPD:
14569 icode = CODE_FOR_avx512pf_scatterpfv8sidf;
14570 goto vec_prefetch_gen;
14571 case IX86_BUILTIN_SCATTERPFDPS:
14572 icode = CODE_FOR_avx512pf_scatterpfv16sisf;
14573 goto vec_prefetch_gen;
14574 case IX86_BUILTIN_SCATTERPFQPD:
14575 icode = CODE_FOR_avx512pf_scatterpfv8didf;
14576 goto vec_prefetch_gen;
14577 case IX86_BUILTIN_SCATTERPFQPS:
14578 icode = CODE_FOR_avx512pf_scatterpfv8disf;
14579 goto vec_prefetch_gen;
14580
14581 gather_gen:
14582 rtx half;
14583 rtx (*gen) (rtx, rtx);
14584
14585 arg0 = CALL_EXPR_ARG (exp, 0);
14586 arg1 = CALL_EXPR_ARG (exp, 1);
14587 arg2 = CALL_EXPR_ARG (exp, 2);
14588 arg3 = CALL_EXPR_ARG (exp, 3);
14589 arg4 = CALL_EXPR_ARG (exp, 4);
14590 op0 = expand_normal (arg0);
14591 op1 = expand_normal (arg1);
14592 op2 = expand_normal (arg2);
14593 op3 = expand_normal (arg3);
14594 op4 = expand_normal (arg4);
14595 /* Note the arg order is different from the operand order. */
14596 mode0 = insn_data[icode].operand[1].mode;
14597 mode2 = insn_data[icode].operand[3].mode;
14598 mode3 = insn_data[icode].operand[4].mode;
14599 mode4 = insn_data[icode].operand[5].mode;
14600
14601 if (target == NULL_RTX
14602 || GET_MODE (target) != insn_data[icode].operand[0].mode
14603 || !insn_data[icode].operand[0].predicate (target,
14604 GET_MODE (target)))
14605 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
14606 else
14607 subtarget = target;
14608
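      /* For the "ALT" gather variants the index vector and the
	 source/destination vector have different element counts, so the
	 switch below extracts the low half of whichever operand is wider
	 (and, for the 16-element DIV forms, narrows the mask to match).  */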
14609 switch (fcode)
14610 {
14611 case IX86_BUILTIN_GATHER3ALTSIV8DF:
14612 case IX86_BUILTIN_GATHER3ALTSIV8DI:
14613 half = gen_reg_rtx (V8SImode);
14614 if (!nonimmediate_operand (op2, V16SImode))
14615 op2 = copy_to_mode_reg (V16SImode, op2);
14616 emit_insn (gen_vec_extract_lo_v16si (half, op2));
14617 op2 = half;
14618 break;
14619 case IX86_BUILTIN_GATHER3ALTSIV4DF:
14620 case IX86_BUILTIN_GATHER3ALTSIV4DI:
14621 case IX86_BUILTIN_GATHERALTSIV4DF:
14622 case IX86_BUILTIN_GATHERALTSIV4DI:
14623 half = gen_reg_rtx (V4SImode);
14624 if (!nonimmediate_operand (op2, V8SImode))
14625 op2 = copy_to_mode_reg (V8SImode, op2);
14626 emit_insn (gen_vec_extract_lo_v8si (half, op2));
14627 op2 = half;
14628 break;
14629 case IX86_BUILTIN_GATHER3ALTDIV16SF:
14630 case IX86_BUILTIN_GATHER3ALTDIV16SI:
14631 half = gen_reg_rtx (mode0);
14632 if (mode0 == V8SFmode)
14633 gen = gen_vec_extract_lo_v16sf;
14634 else
14635 gen = gen_vec_extract_lo_v16si;
14636 if (!nonimmediate_operand (op0, GET_MODE (op0)))
14637 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
14638 emit_insn (gen (half, op0));
14639 op0 = half;
14640 op3 = lowpart_subreg (QImode, op3, HImode);
14641 break;
14642 case IX86_BUILTIN_GATHER3ALTDIV8SF:
14643 case IX86_BUILTIN_GATHER3ALTDIV8SI:
14644 case IX86_BUILTIN_GATHERALTDIV8SF:
14645 case IX86_BUILTIN_GATHERALTDIV8SI:
14646 half = gen_reg_rtx (mode0);
14647 if (mode0 == V4SFmode)
14648 gen = gen_vec_extract_lo_v8sf;
14649 else
14650 gen = gen_vec_extract_lo_v8si;
14651 if (!nonimmediate_operand (op0, GET_MODE (op0)))
14652 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
14653 emit_insn (gen (half, op0));
14654 op0 = half;
14655 if (VECTOR_MODE_P (GET_MODE (op3)))
14656 {
14657 half = gen_reg_rtx (mode0);
14658 if (!nonimmediate_operand (op3, GET_MODE (op3)))
14659 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
14660 emit_insn (gen (half, op3));
14661 op3 = half;
14662 }
14663 break;
14664 default:
14665 break;
14666 }
14667
14668 /* Force the memory operand to use only a base register here.  We
14669 don't want to do this for the memory operands of other builtin
14670 functions. */
14671 op1 = ix86_zero_extend_to_Pmode (op1);
14672
14673 if (!insn_data[icode].operand[1].predicate (op0, mode0))
14674 op0 = copy_to_mode_reg (mode0, op0);
14675 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
14676 op1 = copy_to_mode_reg (Pmode, op1);
14677 if (!insn_data[icode].operand[3].predicate (op2, mode2))
14678 op2 = copy_to_mode_reg (mode2, op2);
14679
14680 op3 = fixup_modeless_constant (op3, mode3);
14681
14682 if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
14683 {
14684 if (!insn_data[icode].operand[4].predicate (op3, mode3))
14685 op3 = copy_to_mode_reg (mode3, op3);
14686 }
14687 else
14688 {
14689 op3 = copy_to_reg (op3);
14690 op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
14691 }
14692 if (!insn_data[icode].operand[5].predicate (op4, mode4))
14693 {
14694 error ("the last argument must be scale 1, 2, 4, 8");
14695 return const0_rtx;
14696 }
14697
14698 /* Optimize. If mask is known to have all high bits set,
14699 replace op0 with pc_rtx to signal that the instruction
14700 overwrites the whole destination and doesn't use its
14701 previous contents. */
14702 if (optimize)
14703 {
14704 if (TREE_CODE (arg3) == INTEGER_CST)
14705 {
14706 if (integer_all_onesp (arg3))
14707 op0 = pc_rtx;
14708 }
14709 else if (TREE_CODE (arg3) == VECTOR_CST)
14710 {
14711 unsigned int negative = 0;
14712 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
14713 {
14714 tree cst = VECTOR_CST_ELT (arg3, i);
14715 if (TREE_CODE (cst) == INTEGER_CST
14716 && tree_int_cst_sign_bit (cst))
14717 negative++;
14718 else if (TREE_CODE (cst) == REAL_CST
14719 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
14720 negative++;
14721 }
14722 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
14723 op0 = pc_rtx;
14724 }
14725 else if (TREE_CODE (arg3) == SSA_NAME
14726 && VECTOR_TYPE_P (TREE_TYPE (arg3)))
14727 {
14728 /* Recognize also when mask is like:
14729 __v2df src = _mm_setzero_pd ();
14730 __v2df mask = _mm_cmpeq_pd (src, src);
14731 or
14732 __v8sf src = _mm256_setzero_ps ();
14733 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
14734 as that is a cheaper way to load all ones into
14735 a register than having to load a constant from
14736 memory. */
14737 gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
14738 if (is_gimple_call (def_stmt))
14739 {
14740 tree fndecl = gimple_call_fndecl (def_stmt);
14741 if (fndecl
14742 && fndecl_built_in_p (fndecl, BUILT_IN_MD))
14743 switch (DECL_MD_FUNCTION_CODE (fndecl))
14744 {
14745 case IX86_BUILTIN_CMPPD:
14746 case IX86_BUILTIN_CMPPS:
14747 case IX86_BUILTIN_CMPPD256:
14748 case IX86_BUILTIN_CMPPS256:
14749 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
14750 break;
14751 /* FALLTHRU */
14752 case IX86_BUILTIN_CMPEQPD:
14753 case IX86_BUILTIN_CMPEQPS:
14754 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
14755 && initializer_zerop (gimple_call_arg (def_stmt,
14756 1)))
14757 op0 = pc_rtx;
14758 break;
14759 default:
14760 break;
14761 }
14762 }
14763 }
14764 }
14765
14766 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
14767 if (! pat)
14768 return const0_rtx;
14769 emit_insn (pat);
14770
14771 switch (fcode)
14772 {
14773 case IX86_BUILTIN_GATHER3DIV16SF:
14774 if (target == NULL_RTX)
14775 target = gen_reg_rtx (V8SFmode);
14776 emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
14777 break;
14778 case IX86_BUILTIN_GATHER3DIV16SI:
14779 if (target == NULL_RTX)
14780 target = gen_reg_rtx (V8SImode);
14781 emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
14782 break;
14783 case IX86_BUILTIN_GATHER3DIV8SF:
14784 case IX86_BUILTIN_GATHERDIV8SF:
14785 if (target == NULL_RTX)
14786 target = gen_reg_rtx (V4SFmode);
14787 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
14788 break;
14789 case IX86_BUILTIN_GATHER3DIV8SI:
14790 case IX86_BUILTIN_GATHERDIV8SI:
14791 if (target == NULL_RTX)
14792 target = gen_reg_rtx (V4SImode);
14793 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
14794 break;
14795 default:
14796 target = subtarget;
14797 break;
14798 }
14799 return target;
14800
14801 scatter_gen:
14802 arg0 = CALL_EXPR_ARG (exp, 0);
14803 arg1 = CALL_EXPR_ARG (exp, 1);
14804 arg2 = CALL_EXPR_ARG (exp, 2);
14805 arg3 = CALL_EXPR_ARG (exp, 3);
14806 arg4 = CALL_EXPR_ARG (exp, 4);
14807 op0 = expand_normal (arg0);
14808 op1 = expand_normal (arg1);
14809 op2 = expand_normal (arg2);
14810 op3 = expand_normal (arg3);
14811 op4 = expand_normal (arg4);
14812 mode1 = insn_data[icode].operand[1].mode;
14813 mode2 = insn_data[icode].operand[2].mode;
14814 mode3 = insn_data[icode].operand[3].mode;
14815 mode4 = insn_data[icode].operand[4].mode;
14816
14817 /* A scatter instruction stores operand op3 to memory with
14818 indices from op2 and scale from op4 under writemask op1.
14819 If index operand op2 has more elements than source operand
14820 op3, only its low half needs to be used, and vice versa. */
14821 switch (fcode)
14822 {
14823 case IX86_BUILTIN_SCATTERALTSIV8DF:
14824 case IX86_BUILTIN_SCATTERALTSIV8DI:
14825 half = gen_reg_rtx (V8SImode);
14826 if (!nonimmediate_operand (op2, V16SImode))
14827 op2 = copy_to_mode_reg (V16SImode, op2);
14828 emit_insn (gen_vec_extract_lo_v16si (half, op2));
14829 op2 = half;
14830 break;
14831 case IX86_BUILTIN_SCATTERALTDIV16SF:
14832 case IX86_BUILTIN_SCATTERALTDIV16SI:
14833 half = gen_reg_rtx (mode3);
14834 if (mode3 == V8SFmode)
14835 gen = gen_vec_extract_lo_v16sf;
14836 else
14837 gen = gen_vec_extract_lo_v16si;
14838 if (!nonimmediate_operand (op3, GET_MODE (op3)))
14839 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
14840 emit_insn (gen (half, op3));
14841 op3 = half;
14842 break;
14843 case IX86_BUILTIN_SCATTERALTSIV4DF:
14844 case IX86_BUILTIN_SCATTERALTSIV4DI:
14845 half = gen_reg_rtx (V4SImode);
14846 if (!nonimmediate_operand (op2, V8SImode))
14847 op2 = copy_to_mode_reg (V8SImode, op2);
14848 emit_insn (gen_vec_extract_lo_v8si (half, op2));
14849 op2 = half;
14850 break;
14851 case IX86_BUILTIN_SCATTERALTDIV8SF:
14852 case IX86_BUILTIN_SCATTERALTDIV8SI:
14853 half = gen_reg_rtx (mode3);
14854 if (mode3 == V4SFmode)
14855 gen = gen_vec_extract_lo_v8sf;
14856 else
14857 gen = gen_vec_extract_lo_v8si;
14858 if (!nonimmediate_operand (op3, GET_MODE (op3)))
14859 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
14860 emit_insn (gen (half, op3));
14861 op3 = half;
14862 break;
14863 case IX86_BUILTIN_SCATTERALTSIV2DF:
14864 case IX86_BUILTIN_SCATTERALTSIV2DI:
14865 if (!nonimmediate_operand (op2, V4SImode))
14866 op2 = copy_to_mode_reg (V4SImode, op2);
14867 break;
14868 case IX86_BUILTIN_SCATTERALTDIV4SF:
14869 case IX86_BUILTIN_SCATTERALTDIV4SI:
14870 if (!nonimmediate_operand (op3, GET_MODE (op3)))
14871 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
14872 break;
14873 default:
14874 break;
14875 }
14876
14877 /* Force the memory operand to use only a base register here.  We
14878 don't want to do this for the memory operands of other builtin
14879 functions. */
14880 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
14881
14882 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
14883 op0 = copy_to_mode_reg (Pmode, op0);
14884
14885 op1 = fixup_modeless_constant (op1, mode1);
14886
14887 if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
14888 {
14889 if (!insn_data[icode].operand[1].predicate (op1, mode1))
14890 op1 = copy_to_mode_reg (mode1, op1);
14891 }
14892 else
14893 {
14894 op1 = copy_to_reg (op1);
14895 op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
14896 }
14897
14898 if (!insn_data[icode].operand[2].predicate (op2, mode2))
14899 op2 = copy_to_mode_reg (mode2, op2);
14900
14901 if (!insn_data[icode].operand[3].predicate (op3, mode3))
14902 op3 = copy_to_mode_reg (mode3, op3);
14903
14904 if (!insn_data[icode].operand[4].predicate (op4, mode4))
14905 {
14906 error ("the last argument must be scale 1, 2, 4, 8");
14907 return const0_rtx;
14908 }
14909
14910 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
14911 if (! pat)
14912 return const0_rtx;
14913
14914 emit_insn (pat);
14915 return 0;
14916
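  /* Gather/scatter prefetch builtins: operand 0 is the mask, operand 1
     the index vector, operand 2 the base address (forced into a Pmode
     register below), operand 3 the scale and operand 4 the locality
     hint.  */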
14917 vec_prefetch_gen:
14918 arg0 = CALL_EXPR_ARG (exp, 0);
14919 arg1 = CALL_EXPR_ARG (exp, 1);
14920 arg2 = CALL_EXPR_ARG (exp, 2);
14921 arg3 = CALL_EXPR_ARG (exp, 3);
14922 arg4 = CALL_EXPR_ARG (exp, 4);
14923 op0 = expand_normal (arg0);
14924 op1 = expand_normal (arg1);
14925 op2 = expand_normal (arg2);
14926 op3 = expand_normal (arg3);
14927 op4 = expand_normal (arg4);
14928 mode0 = insn_data[icode].operand[0].mode;
14929 mode1 = insn_data[icode].operand[1].mode;
14930 mode3 = insn_data[icode].operand[3].mode;
14931 mode4 = insn_data[icode].operand[4].mode;
14932
14933 op0 = fixup_modeless_constant (op0, mode0);
14934
14935 if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
14936 {
14937 if (!insn_data[icode].operand[0].predicate (op0, mode0))
14938 op0 = copy_to_mode_reg (mode0, op0);
14939 }
14940 else
14941 {
14942 op0 = copy_to_reg (op0);
14943 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
14944 }
14945
14946 if (!insn_data[icode].operand[1].predicate (op1, mode1))
14947 op1 = copy_to_mode_reg (mode1, op1);
14948
14949 /* Force the memory operand to use only a base register here.  We
14950 don't want to do this for the memory operands of other builtin
14951 functions. */
14952 op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
14953
14954 if (!insn_data[icode].operand[2].predicate (op2, Pmode))
14955 op2 = copy_to_mode_reg (Pmode, op2);
14956
14957 if (!insn_data[icode].operand[3].predicate (op3, mode3))
14958 {
14959 error ("the fourth argument must be scale 1, 2, 4, 8");
14960 return const0_rtx;
14961 }
14962
14963 if (!insn_data[icode].operand[4].predicate (op4, mode4))
14964 {
14965 error ("incorrect hint operand");
14966 return const0_rtx;
14967 }
14968
14969 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
14970 if (! pat)
14971 return const0_rtx;
14972
14973 emit_insn (pat);
14974
14975 return 0;
14976
14977 case IX86_BUILTIN_XABORT:
14978 icode = CODE_FOR_xabort;
14979 arg0 = CALL_EXPR_ARG (exp, 0);
14980 op0 = expand_normal (arg0);
14981 mode0 = insn_data[icode].operand[0].mode;
14982 if (!insn_data[icode].operand[0].predicate (op0, mode0))
14983 {
14984 error ("the argument to %<xabort%> intrinsic must "
14985 "be an 8-bit immediate");
14986 return const0_rtx;
14987 }
14988 emit_insn (gen_xabort (op0));
14989 return 0;
14990
14991 case IX86_BUILTIN_RDSSPD:
14992 case IX86_BUILTIN_RDSSPQ:
14993 mode = (fcode == IX86_BUILTIN_RDSSPD ? SImode : DImode);
14994
14995 if (target == 0
14996 || !register_operand (target, mode))
14997 target = gen_reg_rtx (mode);
14998
14999 op0 = force_reg (mode, const0_rtx);
15000
15001 emit_insn (gen_rdssp (mode, target, op0));
15002 return target;
15003
15004 case IX86_BUILTIN_INCSSPD:
15005 case IX86_BUILTIN_INCSSPQ:
15006 mode = (fcode == IX86_BUILTIN_INCSSPD ? SImode : DImode);
15007
15008 arg0 = CALL_EXPR_ARG (exp, 0);
15009 op0 = expand_normal (arg0);
15010
15011 op0 = force_reg (mode, op0);
15012
15013 emit_insn (gen_incssp (mode, op0));
15014 return 0;
15015
15016 case IX86_BUILTIN_HRESET:
15017 icode = CODE_FOR_hreset;
15018 arg0 = CALL_EXPR_ARG (exp, 0);
15019 op0 = expand_normal (arg0);
15020 op0 = force_reg (SImode, op0);
15021 emit_insn (gen_hreset (op0));
15022 return 0;
15023
15024 case IX86_BUILTIN_RSTORSSP:
15025 case IX86_BUILTIN_CLRSSBSY:
15026 arg0 = CALL_EXPR_ARG (exp, 0);
15027 op0 = expand_normal (arg0);
15028 icode = (fcode == IX86_BUILTIN_RSTORSSP
15029 ? CODE_FOR_rstorssp
15030 : CODE_FOR_clrssbsy);
15031
15032 if (!address_operand (op0, VOIDmode))
15033 {
15034 op0 = convert_memory_address (Pmode, op0);
15035 op0 = copy_addr_to_reg (op0);
15036 }
15037 emit_insn (GEN_FCN (icode) (gen_rtx_MEM (DImode, op0)));
15038 return 0;
15039
15040 case IX86_BUILTIN_WRSSD:
15041 case IX86_BUILTIN_WRSSQ:
15042 case IX86_BUILTIN_WRUSSD:
15043 case IX86_BUILTIN_WRUSSQ:
15044 mode = ((fcode == IX86_BUILTIN_WRSSD
15045 || fcode == IX86_BUILTIN_WRUSSD)
15046 ? SImode : DImode);
15047
15048 arg0 = CALL_EXPR_ARG (exp, 0);
15049 op0 = expand_normal (arg0);
15050 arg1 = CALL_EXPR_ARG (exp, 1);
15051 op1 = expand_normal (arg1);
15052
15053 op0 = force_reg (mode, op0);
15054
15055 if (!address_operand (op1, VOIDmode))
15056 {
15057 op1 = convert_memory_address (Pmode, op1);
15058 op1 = copy_addr_to_reg (op1);
15059 }
15060 op1 = gen_rtx_MEM (mode, op1);
15061
15062 icode = ((fcode == IX86_BUILTIN_WRSSD
15063 || fcode == IX86_BUILTIN_WRSSQ)
15064 ? code_for_wrss (mode)
15065 : code_for_wruss (mode));
15066 emit_insn (GEN_FCN (icode) (op0, op1));
15067
15068 return 0;
15069
15070 default:
15071 break;
15072 }
15073
15074 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
15075 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
15076 {
15077 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
15078 return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
15079 target);
15080 }
15081
15082 if (fcode >= IX86_BUILTIN__BDESC_PURE_ARGS_FIRST
15083 && fcode <= IX86_BUILTIN__BDESC_PURE_ARGS_LAST)
15084 {
15085 i = fcode - IX86_BUILTIN__BDESC_PURE_ARGS_FIRST;
15086 return ix86_expand_special_args_builtin (bdesc_pure_args + i, exp,
15087 target);
15088 }
15089
15090 if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
15091 && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
15092 {
15093 i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
15094 rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
15095 rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
15096 rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
15097 int masked = 1;
15098 machine_mode mode, wide_mode, nar_mode;
15099
15100 nar_mode = V4SFmode;
15101 mode = V16SFmode;
15102 wide_mode = V64SFmode;
15103 fcn_mask = gen_avx5124fmaddps_4fmaddps_mask;
15104 fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
15105
15106 switch (fcode)
15107 {
15108 case IX86_BUILTIN_4FMAPS:
15109 fcn = gen_avx5124fmaddps_4fmaddps;
15110 masked = 0;
15111 goto v4fma_expand;
15112
15113 case IX86_BUILTIN_4DPWSSD:
15114 nar_mode = V4SImode;
15115 mode = V16SImode;
15116 wide_mode = V64SImode;
15117 fcn = gen_avx5124vnniw_vp4dpwssd;
15118 masked = 0;
15119 goto v4fma_expand;
15120
15121 case IX86_BUILTIN_4DPWSSDS:
15122 nar_mode = V4SImode;
15123 mode = V16SImode;
15124 wide_mode = V64SImode;
15125 fcn = gen_avx5124vnniw_vp4dpwssds;
15126 masked = 0;
15127 goto v4fma_expand;
15128
15129 case IX86_BUILTIN_4FNMAPS:
15130 fcn = gen_avx5124fmaddps_4fnmaddps;
15131 masked = 0;
15132 goto v4fma_expand;
15133
15134 case IX86_BUILTIN_4FNMAPS_MASK:
15135 fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask;
15136 fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
15137 goto v4fma_expand;
15138
15139 case IX86_BUILTIN_4DPWSSD_MASK:
15140 nar_mode = V4SImode;
15141 mode = V16SImode;
15142 wide_mode = V64SImode;
15143 fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask;
15144 fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
15145 goto v4fma_expand;
15146
15147 case IX86_BUILTIN_4DPWSSDS_MASK:
15148 nar_mode = V4SImode;
15149 mode = V16SImode;
15150 wide_mode = V64SImode;
15151 fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask;
15152 fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
15153 goto v4fma_expand;
15154
15155 case IX86_BUILTIN_4FMAPS_MASK:
15156 {
15157 tree args[4];
15158 rtx ops[4];
15159 rtx wide_reg;
15160 rtx accum;
15161 rtx addr;
15162 rtx mem;
15163
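	  /* The AVX512 4FMAPS/4VNNIW insns operate on a block of four
	     source vector registers; pack the four 512-bit inputs into a
	     single wide (V64SF/V64SImode) pseudo via SUBREGs so the insn
	     pattern can treat them as one register group.  */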
15164v4fma_expand:
15165 wide_reg = gen_reg_rtx (wide_mode);
15166 for (i = 0; i < 4; i++)
15167 {
15168 args[i] = CALL_EXPR_ARG (exp, i);
15169 ops[i] = expand_normal (args[i]);
15170
15171 emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
15172 ops[i]);
15173 }
15174
15175 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
15176 accum = force_reg (mode, accum);
15177
15178 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
15179 addr = force_reg (Pmode, addr);
15180
15181 mem = gen_rtx_MEM (nar_mode, addr);
15182
15183 target = gen_reg_rtx (mode);
15184
15185 emit_move_insn (target, accum);
15186
15187 if (! masked)
15188 emit_insn (fcn (target, accum, wide_reg, mem));
15189 else
15190 {
15191 rtx merge, mask;
15192 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
15193
15194 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
15195
15196 if (CONST_INT_P (mask))
15197 mask = fixup_modeless_constant (mask, HImode);
15198
15199 mask = force_reg (HImode, mask);
15200
15201 if (GET_MODE (mask) != HImode)
15202 mask = gen_rtx_SUBREG (HImode, mask, 0);
15203
15204 /* If merge is 0 then we're about to emit z-masked variant. */
15205 if (const0_operand (merge, mode))
15206 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
15207 /* If merge is the same as accum then emit merge-masked variant. */
15208 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
15209 {
15210 merge = force_reg (mode, merge);
15211 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
15212 }
15213 /* Merge with something unknown might happen if we z-mask w/ -O0. */
15214 else
15215 {
15216 target = gen_reg_rtx (mode);
15217 emit_move_insn (target, merge);
15218 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
15219 }
15220 }
15221 return target;
15222 }
15223
15224 case IX86_BUILTIN_4FNMASS:
15225 fcn = gen_avx5124fmaddps_4fnmaddss;
15226 masked = 0;
15227 goto s4fma_expand;
15228
15229 case IX86_BUILTIN_4FMASS:
15230 fcn = gen_avx5124fmaddps_4fmaddss;
15231 masked = 0;
15232 goto s4fma_expand;
15233
15234 case IX86_BUILTIN_4FNMASS_MASK:
15235 fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
15236 fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
15237 goto s4fma_expand;
15238
15239 case IX86_BUILTIN_4FMASS_MASK:
15240 {
15241 tree args[4];
15242 rtx ops[4];
15243 rtx wide_reg;
15244 rtx accum;
15245 rtx addr;
15246 rtx mem;
15247
15248 fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
15249 fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
15250
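	  /* Scalar (SS) forms of the 4FMA insns: only the low SFmode
	     element of each 128-bit source matters, so copy each low
	     element and pack the four values into one V64SFmode pseudo,
	     mirroring the vector case above.  */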
15251s4fma_expand:
15252 mode = V4SFmode;
15253 wide_reg = gen_reg_rtx (V64SFmode);
15254 for (i = 0; i < 4; i++)
15255 {
15256 rtx tmp;
15257 args[i] = CALL_EXPR_ARG (exp, i);
15258 ops[i] = expand_normal (args[i]);
15259
15260 tmp = gen_reg_rtx (SFmode);
15261 emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
15262
15263 emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
15264 gen_rtx_SUBREG (V16SFmode, tmp, 0));
15265 }
15266
15267 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
15268 accum = force_reg (V4SFmode, accum);
15269
15270 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
15271 addr = force_reg (Pmode, addr);
15272
15273 mem = gen_rtx_MEM (V4SFmode, addr);
15274
15275 target = gen_reg_rtx (V4SFmode);
15276
15277 emit_move_insn (target, accum);
15278
15279 if (! masked)
15280 emit_insn (fcn (target, accum, wide_reg, mem));
15281 else
15282 {
15283 rtx merge, mask;
15284 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
15285
15286 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
15287
15288 if (CONST_INT_P (mask))
15289 mask = fixup_modeless_constant (mask, QImode);
15290
15291 mask = force_reg (QImode, mask);
15292
15293 if (GET_MODE (mask) != QImode)
15294 mask = gen_rtx_SUBREG (QImode, mask, 0);
15295
15296 /* If merge is 0 then we're about to emit z-masked variant. */
15297 if (const0_operand (merge, mode))
15298 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
15299 /* If merge is the same as accum then emit merge-masked
15300 variant. */
15301 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
15302 {
15303 merge = force_reg (mode, merge);
15304 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
15305 }
15306 /* Merge with something unknown might happen if we z-mask
15307 w/ -O0. */
15308 else
15309 {
15310 target = gen_reg_rtx (mode);
15311 emit_move_insn (target, merge);
15312 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
15313 }
15314 }
15315 return target;
15316 }
15317 case IX86_BUILTIN_RDPID:
15318 return ix86_expand_special_args_builtin (bdesc_args + i, exp,
15319 target);
15320 case IX86_BUILTIN_FABSQ:
15321 case IX86_BUILTIN_COPYSIGNQ:
15322 if (!TARGET_SSE)
15323 /* Emit a normal call if SSE isn't available. */
15324 return expand_call (exp, target, ignore);
15325 /* FALLTHRU */
15326 default:
15327 return ix86_expand_args_builtin (bdesc_args + i, exp, target);
15328 }
15329 }
15330
15331 if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
15332 && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
15333 {
15334 i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
15335 return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
15336 }
15337
15338 if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
15339 && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
15340 {
15341 i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
15342 return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
15343 }
15344
15345 if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
15346 && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
15347 {
15348 i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
15349 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
15350 }
15351
15352 if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
15353 && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
15354 {
15355 i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
15356 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
15357 }
15358
15359 if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
15360 && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
15361 {
15362 i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
15363 const struct builtin_description *d = bdesc_multi_arg + i;
15364 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
15365 (enum ix86_builtin_func_type)
15366 d->flag, d->comparison);
15367 }
15368
15369 if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
15370 && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
15371 {
15372 i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
15373 return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
15374 target);
15375 }
15376
15377 gcc_unreachable ();
15378}
15379
15380/* A subroutine of ix86_expand_vector_init_duplicate. Tries to
15381 fill target with val via vec_duplicate. */
15382
15383static bool
15384ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
15385{
15386 bool ok;
15387 rtx_insn *insn;
15388 rtx dup;
15389 /* Save/restore recog_data in case this is called from splitters
15390 or other routines where recog_data needs to stay valid across
15391 force_reg. See PR106577. */
15392 recog_data_d recog_data_save = recog_data;
15393
15394 /* First attempt to recognize VAL as-is. */
15395 dup = gen_vec_duplicate (mode, val);
15396 insn = emit_insn (gen_rtx_SET (target, dup));
15397 if (recog_memoized (insn) < 0)
15398 {
15399 rtx_insn *seq;
15400 machine_mode innermode = GET_MODE_INNER (mode);
15401 rtx reg;
15402
15403 /* If that fails, force VAL into a register. */
15404
15405 start_sequence ();
15406 reg = force_reg (innermode, val);
15407 if (GET_MODE (reg) != innermode)
15408 reg = gen_lowpart (innermode, reg);
15409 SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
15410 seq = get_insns ();
15411 end_sequence ();
15412 if (seq)
15413 emit_insn_before (seq, insn);
15414
15415 ok = recog_memoized (insn) >= 0;
15416 gcc_assert (ok);
15417 }
15418 recog_data = recog_data_save;
15419 return true;
15420}
15421
15422/* Get a vector mode of the same size as the original but with elements
15423 twice as wide. This is only guaranteed to apply to integral vectors. */
15424
15425static machine_mode
15426get_mode_wider_vector (machine_mode o)
15427{
15428 /* ??? Rely on the ordering that genmodes.cc gives to vectors. */
15429 machine_mode n = GET_MODE_NEXT_MODE (o).require ();
15430 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
15431 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
15432 return n;
15433}
15434
15435static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
15436static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
15437
15438/* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
15439 with all elements equal to VAR. Return true if successful. */
15440
15441 bool
15442ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
15443 rtx target, rtx val)
15444{
15445 bool ok;
15446
15447 switch (mode)
15448 {
15449 case E_V2SImode:
15450 case E_V2SFmode:
15451 if (!mmx_ok)
15452 return false;
15453 /* FALLTHRU */
15454
15455 case E_V4DFmode:
15456 case E_V4DImode:
15457 case E_V8SFmode:
15458 case E_V8SImode:
15459 case E_V2DFmode:
15460 case E_V2DImode:
15461 case E_V4SFmode:
15462 case E_V4SImode:
15463 case E_V16SImode:
15464 case E_V8DImode:
15465 case E_V16SFmode:
15466 case E_V8DFmode:
15467 return ix86_vector_duplicate_value (mode, target, val);
15468
15469 case E_V4HImode:
15470 if (!mmx_ok)
15471 return false;
15472 if (TARGET_SSE || TARGET_3DNOW_A)
15473 {
15474 rtx x;
15475
15476 val = gen_lowpart (SImode, val);
15477 x = gen_rtx_TRUNCATE (HImode, val);
15478 x = gen_rtx_VEC_DUPLICATE (mode, x);
15479 emit_insn (gen_rtx_SET (target, x));
15480 return true;
15481 }
15482 goto widen;
15483
15484 case E_V2HImode:
15485 if (TARGET_SSE2)
15486 {
15487 rtx x;
15488
15489 val = gen_lowpart (SImode, val);
15490 x = gen_rtx_TRUNCATE (HImode, val);
15491 x = gen_rtx_VEC_DUPLICATE (mode, x);
15492 emit_insn (gen_rtx_SET (target, x));
15493 return true;
15494 }
15495 return false;
15496
15497 case E_V8QImode:
15498 case E_V4QImode:
15499 if (!mmx_ok)
15500 return false;
15501 goto widen;
15502
15503 case E_V8HImode:
15504 case E_V8HFmode:
15505 case E_V8BFmode:
15506 if (TARGET_AVX2)
15507 return ix86_vector_duplicate_value (mode, target, val);
15508
15509 if (TARGET_SSE2)
15510 {
15511 struct expand_vec_perm_d dperm;
15512 rtx tmp1, tmp2;
15513
15514 permute:
15515 memset (&dperm, 0, sizeof (dperm));
15516 dperm.target = target;
15517 dperm.vmode = mode;
15518 dperm.nelt = GET_MODE_NUNITS (mode);
15519 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
15520 dperm.one_operand_p = true;
15521
15522 if (mode == V8HFmode || mode == V8BFmode)
15523 {
15524 tmp1 = force_reg (GET_MODE_INNER (mode), val);
15525 tmp2 = gen_reg_rtx (mode);
15526 emit_insn (gen_vec_set_0 (mode, tmp2, CONST0_RTX (mode), tmp1));
15527 tmp1 = gen_lowpart (mode, tmp2);
15528 }
15529 else
15530 {
15531 /* Extend to SImode using a paradoxical SUBREG. */
15532 tmp1 = gen_reg_rtx (SImode);
15533 emit_move_insn (tmp1, gen_lowpart (SImode, val));
15534
15535 /* Insert the SImode value as
15536 low element of a V4SImode vector. */
15537 tmp2 = gen_reg_rtx (V4SImode);
15538 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
15539 tmp1 = gen_lowpart (mode, tmp2);
15540 }
2bf6d935 15541
7a54d3de 15542 emit_move_insn (dperm.op0, tmp1);
2bf6d935
ML
15543 ok = (expand_vec_perm_1 (&dperm)
15544 || expand_vec_perm_broadcast_1 (&dperm));
15545 gcc_assert (ok);
15546 return ok;
15547 }
15548 goto widen;
15549
15550 case E_V16QImode:
15551 if (TARGET_AVX2)
15552 return ix86_vector_duplicate_value (mode, target, val);
15553
15554 if (TARGET_SSE2)
15555 goto permute;
15556 goto widen;
15557
15558 widen:
15559 /* Replicate the value once into the next wider mode and recurse. */
15560 {
15561 machine_mode smode, wsmode, wvmode;
15562 rtx x;
15563
15564 smode = GET_MODE_INNER (mode);
15565 wvmode = get_mode_wider_vector (mode);
15566 wsmode = GET_MODE_INNER (wvmode);
15567
15568 val = convert_modes (wsmode, smode, val, true);
15569
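	/* Duplicate the narrow value into both halves of the wider scalar,
	   either by inserting it directly into the high part or by
	   shifting left by the element width and ORing the copies.  */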
15570 if (smode == QImode && !TARGET_PARTIAL_REG_STALL)
15571 emit_insn (gen_insv_1 (wsmode, val, val));
15572 else
15573 {
15574 x = expand_simple_binop (wsmode, ASHIFT, val,
15575 GEN_INT (GET_MODE_BITSIZE (smode)),
15576 NULL_RTX, 1, OPTAB_LIB_WIDEN);
15577 val = expand_simple_binop (wsmode, IOR, val, x, x, 1,
15578 OPTAB_LIB_WIDEN);
15579 }
15580
15581 x = gen_reg_rtx (wvmode);
15582 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
15583 gcc_assert (ok);
15584 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
15585 return ok;
15586 }
15587
15588 case E_V16HImode:
15589 case E_V16HFmode:
15590 case E_V16BFmode:
15591 case E_V32QImode:
15592 if (TARGET_AVX2)
15593 return ix86_vector_duplicate_value (mode, target, val);
15594 else
15595 {
15596 machine_mode hvmode;
15597 switch (mode)
15598 {
15599 case V16HImode:
15600 hvmode = V8HImode;
15601 break;
15602 case V16HFmode:
15603 hvmode = V8HFmode;
15604 break;
15605 case V16BFmode:
15606 hvmode = V8BFmode;
15607 break;
15608 case V32QImode:
15609 hvmode = V16QImode;
15610 break;
15611 default:
15612 gcc_unreachable ();
15613 }
15614 rtx x = gen_reg_rtx (hvmode);
15615
15616 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
15617 gcc_assert (ok);
15618
15619 x = gen_rtx_VEC_CONCAT (mode, x, x);
15620 emit_insn (gen_rtx_SET (target, x));
15621 }
15622 return true;
15623
15624 case E_V32HImode:
15625 case E_V32HFmode:
15626 case E_V32BFmode:
15627 case E_V64QImode:
15628 if (TARGET_AVX512BW)
15629 return ix86_vector_duplicate_value (mode, target, val);
15630 else
15631 {
15632 machine_mode hvmode;
15633 switch (mode)
15634 {
15635 case V32HImode:
15636 hvmode = V16HImode;
15637 break;
15638 case V32HFmode:
15639 hvmode = V16HFmode;
15640 break;
15641 case V32BFmode:
15642 hvmode = V16BFmode;
15643 break;
15644 case V64QImode:
15645 hvmode = V32QImode;
15646 break;
15647 default:
15648 gcc_unreachable ();
15649 }
15650 rtx x = gen_reg_rtx (hvmode);
15651
15652 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
15653 gcc_assert (ok);
15654
15655 x = gen_rtx_VEC_CONCAT (mode, x, x);
15656 emit_insn (gen_rtx_SET (target, x));
15657 }
15658 return true;
15659
15660 default:
15661 return false;
15662 }
15663}
15664
15665/* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
15666 whose ONE_VAR element is VAR, and other elements are zero. Return true
15667 if successful. */
15668
15669static bool
15670ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
15671 rtx target, rtx var, int one_var)
15672{
15673 machine_mode vsimode;
15674 rtx new_target;
15675 rtx x, tmp;
15676 bool use_vector_set = false;
15677 rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;
15678
15679 switch (mode)
15680 {
15681 case E_V2DImode:
15682 /* For SSE4.1, we normally use vector set. But if the second
15683 element is zero and inter-unit moves are OK, we use movq
15684 instead. */
15685 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
15686 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
15687 && one_var == 0));
15688 break;
15689 case E_V16QImode:
15690 case E_V4SImode:
15691 case E_V4SFmode:
15692 use_vector_set = TARGET_SSE4_1;
15693 break;
15694 case E_V8HImode:
15695 use_vector_set = TARGET_SSE2;
15696 gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0
15697 ? gen_vec_setv8hi_0 : NULL;
15698 break;
15699 case E_V8QImode:
15700 use_vector_set = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
15701 break;
15702 case E_V4HImode:
15703 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
15704 break;
15705 case E_V4QImode:
15706 use_vector_set = TARGET_SSE4_1;
15707 break;
15708 case E_V32QImode:
15709 use_vector_set = TARGET_AVX;
15710 break;
15711 case E_V16HImode:
15712 use_vector_set = TARGET_AVX;
15713 gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0
15714 ? gen_vec_setv16hi_0 : NULL;
15715 break;
15716 case E_V8SImode:
15717 use_vector_set = TARGET_AVX;
15718 gen_vec_set_0 = gen_vec_setv8si_0;
15719 break;
15720 case E_V8SFmode:
15721 use_vector_set = TARGET_AVX;
15722 gen_vec_set_0 = gen_vec_setv8sf_0;
15723 break;
15724 case E_V4DFmode:
15725 use_vector_set = TARGET_AVX;
15726 gen_vec_set_0 = gen_vec_setv4df_0;
15727 break;
15728 case E_V4DImode:
15729 /* Use ix86_expand_vector_set in 64bit mode only. */
15730 use_vector_set = TARGET_AVX && TARGET_64BIT;
15731 gen_vec_set_0 = gen_vec_setv4di_0;
15732 break;
15733 case E_V16SImode:
15734 use_vector_set = TARGET_AVX512F && one_var == 0;
15735 gen_vec_set_0 = gen_vec_setv16si_0;
15736 break;
15737 case E_V16SFmode:
15738 use_vector_set = TARGET_AVX512F && one_var == 0;
15739 gen_vec_set_0 = gen_vec_setv16sf_0;
15740 break;
15741 case E_V8DFmode:
15742 use_vector_set = TARGET_AVX512F && one_var == 0;
15743 gen_vec_set_0 = gen_vec_setv8df_0;
15744 break;
15745 case E_V8DImode:
15746 /* Use ix86_expand_vector_set in 64bit mode only. */
15747 use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
15748 gen_vec_set_0 = gen_vec_setv8di_0;
15749 break;
15750 case E_V8HFmode:
15751 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15752 gen_vec_set_0 = gen_vec_setv8hf_0;
15753 break;
15754 case E_V16HFmode:
15755 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15756 gen_vec_set_0 = gen_vec_setv16hf_0;
15757 break;
15758 case E_V32HFmode:
15759 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15760 gen_vec_set_0 = gen_vec_setv32hf_0;
15761 break;
15762 case E_V8BFmode:
15763 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15764 gen_vec_set_0 = gen_vec_setv8bf_0;
15765 break;
15766 case E_V16BFmode:
15767 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15768 gen_vec_set_0 = gen_vec_setv16bf_0;
15769 break;
15770 case E_V32BFmode:
15771 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15772 gen_vec_set_0 = gen_vec_setv32bf_0;
15773 break;
15774 case E_V32HImode:
15775 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15776 gen_vec_set_0 = gen_vec_setv32hi_0;
15777 default:
15778 break;
15779 }
15780
15781 if (use_vector_set)
15782 {
15783 if (gen_vec_set_0 && one_var == 0)
15784 {
15785 var = force_reg (GET_MODE_INNER (mode), var);
15786 emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
15787 return true;
15788 }
15789 emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
15790 var = force_reg (GET_MODE_INNER (mode), var);
15791 ix86_expand_vector_set (mmx_ok, target, var, one_var);
15792 return true;
15793 }
15794
15795 switch (mode)
15796 {
15797 case E_V2SFmode:
15798 case E_V2SImode:
15799 if (!mmx_ok)
15800 return false;
15801 /* FALLTHRU */
15802
15803 case E_V2DFmode:
15804 case E_V2DImode:
15805 if (one_var != 0)
15806 return false;
15807 var = force_reg (GET_MODE_INNER (mode), var);
15808 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
15809 emit_insn (gen_rtx_SET (target, x));
15810 return true;
15811
15812 case E_V4SFmode:
15813 case E_V4SImode:
15814 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
15815 new_target = gen_reg_rtx (mode);
15816 else
15817 new_target = target;
15818 var = force_reg (GET_MODE_INNER (mode), var);
15819 x = gen_rtx_VEC_DUPLICATE (mode, var);
15820 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
15821 emit_insn (gen_rtx_SET (new_target, x));
15822 if (one_var != 0)
15823 {
15824 /* We need to shuffle the value to the correct position, so
15825 create a new pseudo to store the intermediate result. */
15826
15827 /* With SSE2, we can use the integer shuffle insns. */
15828 if (mode != V4SFmode && TARGET_SSE2)
15829 {
15830 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
15831 const1_rtx,
15832 GEN_INT (one_var == 1 ? 0 : 1),
15833 GEN_INT (one_var == 2 ? 0 : 1),
15834 GEN_INT (one_var == 3 ? 0 : 1)));
15835 if (target != new_target)
15836 emit_move_insn (target, new_target);
15837 return true;
15838 }
15839
15840 /* Otherwise convert the intermediate result to V4SFmode and
15841 use the SSE1 shuffle instructions. */
15842 if (mode != V4SFmode)
15843 {
15844 tmp = gen_reg_rtx (V4SFmode);
15845 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
15846 }
15847 else
15848 tmp = new_target;
15849
15850 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
15851 const1_rtx,
15852 GEN_INT (one_var == 1 ? 0 : 1),
15853 GEN_INT (one_var == 2 ? 0+4 : 1+4),
15854 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
15855
15856 if (mode != V4SFmode)
15857 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
15858 else if (tmp != target)
15859 emit_move_insn (target, tmp);
15860 }
15861 else if (target != new_target)
15862 emit_move_insn (target, new_target);
15863 return true;
15864
15865 case E_V8HImode:
15866 case E_V16QImode:
15867 vsimode = V4SImode;
15868 goto widen;
15869 case E_V4HImode:
15870 case E_V8QImode:
15871 if (!mmx_ok)
15872 return false;
15873 vsimode = V2SImode;
15874 goto widen;
15875 widen:
15876 if (one_var != 0)
15877 return false;
15878
15879 /* Zero extend the variable element to SImode and recurse. */
15880 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
15881
15882 x = gen_reg_rtx (vsimode);
15883 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
15884 var, one_var))
15885 gcc_unreachable ();
15886
15887 emit_move_insn (target, gen_lowpart (mode, x));
15888 return true;
15889
15890 default:
15891 return false;
15892 }
15893}
15894
15895/* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
15896 consisting of the values in VALS. It is known that all elements
15897 except ONE_VAR are constants. Return true if successful. */
15898
15899static bool
15900ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
15901 rtx target, rtx vals, int one_var)
15902{
15903 rtx var = XVECEXP (vals, 0, one_var);
15904 machine_mode wmode;
15905 rtx const_vec, x;
15906
15907 const_vec = copy_rtx (vals);
15908 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
15909 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
15910
15911 switch (mode)
15912 {
15913 case E_V2DFmode:
15914 case E_V2DImode:
15915 case E_V2SFmode:
15916 case E_V2SImode:
15917 /* For the two element vectors, it's just as easy to use
15918 the general case. */
15919 return false;
15920
15921 case E_V4DImode:
15922 /* Use ix86_expand_vector_set in 64bit mode only. */
15923 if (!TARGET_64BIT)
15924 return false;
15925 /* FALLTHRU */
9e2a82e1 15926 case E_V8HFmode:
15927 case E_V16HFmode:
15928 case E_V8BFmode:
15929 case E_V16BFmode:
15930 case E_V4DFmode:
15931 case E_V8SFmode:
15932 case E_V8SImode:
15933 case E_V16HImode:
15934 case E_V32QImode:
15935 case E_V4SFmode:
15936 case E_V4SImode:
15937 case E_V8HImode:
15938 case E_V4HImode:
15939 break;
15940
15941 case E_V16QImode:
15942 if (TARGET_SSE4_1)
15943 break;
15944 wmode = V8HImode;
15945 goto widen;
15946 case E_V8QImode:
15947 if (TARGET_MMX_WITH_SSE && TARGET_SSE4_1)
15948 break;
15949 wmode = V4HImode;
15950 goto widen;
15951 case E_V4QImode:
15952 if (TARGET_SSE4_1)
15953 break;
15954 wmode = V2HImode;
15955 widen:
15956 /* There's no way to set one QImode entry easily. Combine
15957 the variable value with its adjacent constant value, and
15958 promote to an HImode set. */
15959 x = XVECEXP (vals, 0, one_var ^ 1);
15960 if (one_var & 1)
15961 {
15962 var = convert_modes (HImode, QImode, var, true);
15963 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
15964 NULL_RTX, 1, OPTAB_LIB_WIDEN);
15965 x = GEN_INT (INTVAL (x) & 0xff);
15966 }
15967 else
15968 {
15969 var = convert_modes (HImode, QImode, var, true);
15970 x = gen_int_mode (UINTVAL (x) << 8, HImode);
15971 }
15972 if (x != const0_rtx)
15973 var = expand_simple_binop (HImode, IOR, var, x, var,
15974 1, OPTAB_LIB_WIDEN);
15975
15976 x = gen_reg_rtx (wmode);
15977 emit_move_insn (x, gen_lowpart (wmode, const_vec));
15978 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
15979
15980 emit_move_insn (target, gen_lowpart (mode, x));
15981 return true;
15982
15983 default:
15984 return false;
15985 }
15986
15987 emit_move_insn (target, const_vec);
15988 ix86_expand_vector_set (mmx_ok, target, var, one_var);
15989 return true;
15990}
15991
15992/* A subroutine of ix86_expand_vector_init_general. Use vector
15993 concatenate to handle the most general case: all values variable,
15994 and none identical. */
15995
15996static void
15997ix86_expand_vector_init_concat (machine_mode mode,
15998 rtx target, rtx *ops, int n)
15999{
16000 machine_mode half_mode = VOIDmode;
16001 rtx half[2];
16002 rtvec v;
16003 int i, j;
16004
16005 switch (n)
16006 {
16007 case 2:
16008 switch (mode)
16009 {
16010 case E_V32HFmode:
16011 half_mode = V16HFmode;
16012 break;
16013 case E_V32BFmode:
16014 half_mode = V16BFmode;
16015 break;
16016 case E_V16SImode:
16017 half_mode = V8SImode;
16018 break;
16019 case E_V16SFmode:
16020 half_mode = V8SFmode;
16021 break;
16022 case E_V8DImode:
16023 half_mode = V4DImode;
16024 break;
16025 case E_V8DFmode:
16026 half_mode = V4DFmode;
16027 break;
16028 case E_V16HFmode:
16029 half_mode = V8HFmode;
16030 break;
16031 case E_V16BFmode:
16032 half_mode = V8BFmode;
16033 break;
16034 case E_V8SImode:
16035 half_mode = V4SImode;
16036 break;
16037 case E_V8SFmode:
16038 half_mode = V4SFmode;
16039 break;
16040 case E_V4DImode:
16041 half_mode = V2DImode;
16042 break;
16043 case E_V4DFmode:
16044 half_mode = V2DFmode;
16045 break;
16046 case E_V4SImode:
16047 half_mode = V2SImode;
16048 break;
16049 case E_V4SFmode:
16050 half_mode = V2SFmode;
16051 break;
16052 case E_V2DImode:
16053 half_mode = DImode;
16054 break;
16055 case E_V2SImode:
16056 half_mode = SImode;
16057 break;
16058 case E_V2DFmode:
16059 half_mode = DFmode;
16060 break;
16061 case E_V2SFmode:
16062 half_mode = SFmode;
16063 break;
16064 default:
16065 gcc_unreachable ();
16066 }
16067
16068 if (!register_operand (ops[1], half_mode))
16069 ops[1] = force_reg (half_mode, ops[1]);
16070 if (!register_operand (ops[0], half_mode))
16071 ops[0] = force_reg (half_mode, ops[0]);
16072 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
16073 ops[1])));
16074 break;
16075
16076 case 4:
16077 switch (mode)
16078 {
16079 case E_V4DImode:
16080 half_mode = V2DImode;
16081 break;
16082 case E_V4DFmode:
16083 half_mode = V2DFmode;
16084 break;
16085 case E_V4SImode:
16086 half_mode = V2SImode;
16087 break;
16088 case E_V4SFmode:
16089 half_mode = V2SFmode;
16090 break;
16091 default:
16092 gcc_unreachable ();
16093 }
16094 goto half;
16095
16096 case 8:
16097 switch (mode)
16098 {
16099 case E_V8DImode:
16100 half_mode = V4DImode;
16101 break;
16102 case E_V8DFmode:
16103 half_mode = V4DFmode;
16104 break;
16105 case E_V8SImode:
16106 half_mode = V4SImode;
16107 break;
16108 case E_V8SFmode:
16109 half_mode = V4SFmode;
16110 break;
16111 default:
16112 gcc_unreachable ();
16113 }
16114 goto half;
16115
16116 case 16:
16117 switch (mode)
16118 {
16119 case E_V16SImode:
16120 half_mode = V8SImode;
16121 break;
16122 case E_V16SFmode:
16123 half_mode = V8SFmode;
16124 break;
16125 default:
16126 gcc_unreachable ();
16127 }
16128 goto half;
16129
16130half:
16131 /* FIXME: We process inputs backward to help RA. PR 36222. */
16132 i = n - 1;
16133 for (j = 1; j != -1; j--)
16134 {
16135 half[j] = gen_reg_rtx (half_mode);
16136 switch (n >> 1)
16137 {
16138 case 2:
16139 v = gen_rtvec (2, ops[i-1], ops[i]);
16140 i -= 2;
16141 break;
16142 case 4:
16143 v = gen_rtvec (4, ops[i-3], ops[i-2], ops[i-1], ops[i]);
16144 i -= 4;
16145 break;
16146 case 8:
16147 v = gen_rtvec (8, ops[i-7], ops[i-6], ops[i-5], ops[i-4],
16148 ops[i-3], ops[i-2], ops[i-1], ops[i]);
16149 i -= 8;
16150 break;
16151 default:
16152 gcc_unreachable ();
16153 }
16154 ix86_expand_vector_init (false, half[j],
16155 gen_rtx_PARALLEL (half_mode, v));
16156 }
16157
16158 ix86_expand_vector_init_concat (mode, target, half, 2);
16159 break;
16160
16161 default:
16162 gcc_unreachable ();
16163 }
16164}
16165
16166/* A subroutine of ix86_expand_vector_init_general. Use vector
16167 interleave to handle the most general case: all values variable,
16168 and none identical. */
16169
16170static void
16171ix86_expand_vector_init_interleave (machine_mode mode,
16172 rtx target, rtx *ops, int n)
16173{
16174 machine_mode first_imode, second_imode, third_imode, inner_mode;
16175 int i, j;
16176 rtx op, op0, op1;
16177 rtx (*gen_load_even) (rtx, rtx, rtx);
16178 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
16179 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
16180
16181 switch (mode)
16182 {
16183 case E_V8HFmode:
16184 gen_load_even = gen_vec_interleave_lowv8hf;
16185 gen_interleave_first_low = gen_vec_interleave_lowv4si;
16186 gen_interleave_second_low = gen_vec_interleave_lowv2di;
16187 inner_mode = HFmode;
16188 first_imode = V4SImode;
16189 second_imode = V2DImode;
16190 third_imode = VOIDmode;
16191 break;
16192 case E_V8BFmode:
16193 gen_load_even = gen_vec_interleave_lowv8bf;
16194 gen_interleave_first_low = gen_vec_interleave_lowv4si;
16195 gen_interleave_second_low = gen_vec_interleave_lowv2di;
16196 inner_mode = BFmode;
16197 first_imode = V4SImode;
16198 second_imode = V2DImode;
16199 third_imode = VOIDmode;
16200 break;
2bf6d935
ML
16201 case E_V8HImode:
16202 gen_load_even = gen_vec_setv8hi;
16203 gen_interleave_first_low = gen_vec_interleave_lowv4si;
16204 gen_interleave_second_low = gen_vec_interleave_lowv2di;
16205 inner_mode = HImode;
16206 first_imode = V4SImode;
16207 second_imode = V2DImode;
16208 third_imode = VOIDmode;
16209 break;
16210 case E_V16QImode:
16211 gen_load_even = gen_vec_setv16qi;
16212 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
16213 gen_interleave_second_low = gen_vec_interleave_lowv4si;
16214 inner_mode = QImode;
16215 first_imode = V8HImode;
16216 second_imode = V4SImode;
16217 third_imode = V2DImode;
16218 break;
16219 default:
16220 gcc_unreachable ();
16221 }
16222
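  /* First load each pair of input elements into the low part of a
     full-width vector, one vector per pair; the loops below then
     interleave progressively wider chunks until the whole vector is
     assembled.  */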
16223 for (i = 0; i < n; i++)
16224 {
16225 op = ops [i + i];
16226 if (inner_mode == HFmode || inner_mode == BFmode)
16227 {
16228 rtx even, odd;
16229 /* Use vpunpcklwd to pack 2 HFmode or BFmode elements. */
16230 machine_mode vec_mode =
16231 (inner_mode == HFmode) ? V8HFmode : V8BFmode;
16232 op0 = gen_reg_rtx (vec_mode);
16233 even = lowpart_subreg (vec_mode,
16234 force_reg (inner_mode, op), inner_mode);
16235 odd = lowpart_subreg (vec_mode,
16236 force_reg (inner_mode, ops[i + i + 1]),
16237 inner_mode);
16238 emit_insn (gen_load_even (op0, even, odd));
16239 }
16240 else
16241 {
16242 /* Extend the odd element to SImode using a paradoxical SUBREG. */
16243 op0 = gen_reg_rtx (SImode);
16244 emit_move_insn (op0, gen_lowpart (SImode, op));
16245
16246 /* Insert the SImode value as low element of a V4SImode vector. */
16247 op1 = gen_reg_rtx (V4SImode);
16248 op0 = gen_rtx_VEC_MERGE (V4SImode,
16249 gen_rtx_VEC_DUPLICATE (V4SImode,
16250 op0),
16251 CONST0_RTX (V4SImode),
16252 const1_rtx);
16253 emit_insn (gen_rtx_SET (op1, op0));
16254
16255 /* Cast the V4SImode vector back to a vector in the original mode. */
16256 op0 = gen_reg_rtx (mode);
16257 emit_move_insn (op0, gen_lowpart (mode, op1));
16258
16259 /* Load even elements into the second position. */
16260 emit_insn (gen_load_even (op0,
16261 force_reg (inner_mode,
16262 ops[i + i + 1]),
16263 const1_rtx));
16264 }
16265
16266 /* Cast vector to FIRST_IMODE vector. */
16267 ops[i] = gen_reg_rtx (first_imode);
16268 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
16269 }
16270
16271 /* Interleave low FIRST_IMODE vectors. */
16272 for (i = j = 0; i < n; i += 2, j++)
16273 {
16274 op0 = gen_reg_rtx (first_imode);
16275 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
16276
16277 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
16278 ops[j] = gen_reg_rtx (second_imode);
16279 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
16280 }
16281
16282 /* Interleave low SECOND_IMODE vectors. */
16283 switch (second_imode)
16284 {
16285 case E_V4SImode:
16286 for (i = j = 0; i < n / 2; i += 2, j++)
16287 {
16288 op0 = gen_reg_rtx (second_imode);
16289 emit_insn (gen_interleave_second_low (op0, ops[i],
16290 ops[i + 1]));
16291
16292 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
16293 vector. */
16294 ops[j] = gen_reg_rtx (third_imode);
16295 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
16296 }
16297 second_imode = V2DImode;
16298 gen_interleave_second_low = gen_vec_interleave_lowv2di;
16299 /* FALLTHRU */
16300
16301 case E_V2DImode:
16302 op0 = gen_reg_rtx (second_imode);
16303 emit_insn (gen_interleave_second_low (op0, ops[0],
16304 ops[1]));
16305
16306      /* Cast the SECOND_IMODE vector back to a vector in original
16307 mode. */
16308 emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
16309 break;
16310
16311 default:
16312 gcc_unreachable ();
16313 }
16314}
16315
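/* Illustrative sketch (not part of the original source): a rough scalar
   model of the interleave strategy above for the V16QImode case, assuming
   sixteen byte inputs.  Pairs of narrow elements are packed into
   progressively wider integer lanes, mirroring the vec_set /
   vec_interleave_low sequence emitted by the function.

     void
     model_init_interleave_v16qi (unsigned char out[16],
				  const unsigned char in[16])
     {
       unsigned short w16[8];
       unsigned int w32[4];
       unsigned long long w64[2];
       int i;

       for (i = 0; i < 8; i++)	// QI pairs -> HI lanes
	 w16[i] = in[2 * i] | ((unsigned short) in[2 * i + 1] << 8);
       for (i = 0; i < 4; i++)	// HI pairs -> SI lanes
	 w32[i] = w16[2 * i] | ((unsigned int) w16[2 * i + 1] << 16);
       for (i = 0; i < 2; i++)	// SI pairs -> DI lanes
	 w64[i] = w32[2 * i] | ((unsigned long long) w32[2 * i + 1] << 32);
       __builtin_memcpy (out, w64, 16);	// concatenate the two DI lanes
     }
*/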
16316/* A subroutine of ix86_expand_vector_init. Handle the most general case:
16317 all values variable, and none identical. */
16318
16319static void
16320ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
16321 rtx target, rtx vals)
16322{
16323 rtx ops[64], op0, op1, op2, op3, op4, op5;
16324 machine_mode half_mode = VOIDmode;
16325 machine_mode quarter_mode = VOIDmode;
16326 int n, i;
16327
16328 switch (mode)
16329 {
16330 case E_V2SFmode:
16331 case E_V2SImode:
16332 if (!mmx_ok && !TARGET_SSE)
16333 break;
16334 /* FALLTHRU */
16335
16336 case E_V16SImode:
16337 case E_V16SFmode:
16338 case E_V8DFmode:
16339 case E_V8DImode:
16340 case E_V8SFmode:
16341 case E_V8SImode:
16342 case E_V4DFmode:
16343 case E_V4DImode:
16344 case E_V4SFmode:
16345 case E_V4SImode:
16346 case E_V2DFmode:
16347 case E_V2DImode:
16348 n = GET_MODE_NUNITS (mode);
16349 for (i = 0; i < n; i++)
16350 ops[i] = XVECEXP (vals, 0, i);
16351 ix86_expand_vector_init_concat (mode, target, ops, n);
16352 return;
16353
16354 case E_V2TImode:
16355 for (i = 0; i < 2; i++)
16356 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
16357 op0 = gen_reg_rtx (V4DImode);
16358 ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
16359 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
16360 return;
16361
16362 case E_V4TImode:
16363 for (i = 0; i < 4; i++)
16364 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
16365 ops[4] = gen_reg_rtx (V4DImode);
16366 ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
16367 ops[5] = gen_reg_rtx (V4DImode);
16368 ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
16369 op0 = gen_reg_rtx (V8DImode);
16370 ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
16371 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
16372 return;
16373
16374 case E_V32QImode:
16375 half_mode = V16QImode;
16376 goto half;
16377
16378 case E_V16HImode:
16379 half_mode = V8HImode;
16380 goto half;
16381
9e2a82e1 16382 case E_V16HFmode:
16383 half_mode = V8HFmode;
16384 goto half;
16385
6910cad5 16386 case E_V16BFmode:
16387 half_mode = V8BFmode;
16388 goto half;
16389
2bf6d935
ML
16390half:
16391 n = GET_MODE_NUNITS (mode);
16392 for (i = 0; i < n; i++)
16393 ops[i] = XVECEXP (vals, 0, i);
16394 op0 = gen_reg_rtx (half_mode);
16395 op1 = gen_reg_rtx (half_mode);
16396 ix86_expand_vector_init_interleave (half_mode, op0, ops,
16397 n >> 2);
16398 ix86_expand_vector_init_interleave (half_mode, op1,
16399 &ops [n >> 1], n >> 2);
16400 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
16401 return;
16402
16403 case E_V64QImode:
16404 quarter_mode = V16QImode;
16405 half_mode = V32QImode;
16406 goto quarter;
16407
16408 case E_V32HImode:
16409 quarter_mode = V8HImode;
16410 half_mode = V16HImode;
16411 goto quarter;
16412
9e2a82e1 16413 case E_V32HFmode:
16414 quarter_mode = V8HFmode;
16415 half_mode = V16HFmode;
16416 goto quarter;
16417
6910cad5 16418 case E_V32BFmode:
16419 quarter_mode = V8BFmode;
16420 half_mode = V16BFmode;
16421 goto quarter;
16422
2bf6d935
ML
16423quarter:
16424 n = GET_MODE_NUNITS (mode);
16425 for (i = 0; i < n; i++)
16426 ops[i] = XVECEXP (vals, 0, i);
16427 op0 = gen_reg_rtx (quarter_mode);
16428 op1 = gen_reg_rtx (quarter_mode);
16429 op2 = gen_reg_rtx (quarter_mode);
16430 op3 = gen_reg_rtx (quarter_mode);
16431 op4 = gen_reg_rtx (half_mode);
16432 op5 = gen_reg_rtx (half_mode);
16433 ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
16434 n >> 3);
16435 ix86_expand_vector_init_interleave (quarter_mode, op1,
16436 &ops [n >> 2], n >> 3);
16437 ix86_expand_vector_init_interleave (quarter_mode, op2,
16438 &ops [n >> 1], n >> 3);
16439 ix86_expand_vector_init_interleave (quarter_mode, op3,
16440 &ops [(n >> 1) | (n >> 2)], n >> 3);
16441 emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
16442 emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
16443 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
16444 return;
16445
16446 case E_V16QImode:
16447 if (!TARGET_SSE4_1)
16448 break;
16449 /* FALLTHRU */
16450
16451 case E_V8HImode:
16452 if (!TARGET_SSE2)
16453 break;
16454
16455 /* Don't use ix86_expand_vector_init_interleave if we can't
16456 move from GPR to SSE register directly. */
16457 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
16458 break;
9e2a82e1 16459 /* FALLTHRU */
16460
16461 case E_V8HFmode:
6910cad5 16462 case E_V8BFmode:
2bf6d935
ML
16463
16464 n = GET_MODE_NUNITS (mode);
16465 for (i = 0; i < n; i++)
16466 ops[i] = XVECEXP (vals, 0, i);
16467 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
16468 return;
16469
16470 case E_V4HImode:
16471 case E_V8QImode:
8d7dae0e
UB
16472
16473 case E_V2HImode:
64735dc9 16474 case E_V4QImode:
2bf6d935
ML
16475 break;
16476
16477 default:
16478 gcc_unreachable ();
16479 }
16480
16481 {
16482 int i, j, n_elts, n_words, n_elt_per_word;
8d7dae0e 16483 machine_mode tmp_mode, inner_mode;
2bf6d935
ML
16484 rtx words[4], shift;
16485
8d7dae0e
UB
16486 tmp_mode = (GET_MODE_SIZE (mode) < UNITS_PER_WORD) ? SImode : word_mode;
16487
2bf6d935
ML
16488 inner_mode = GET_MODE_INNER (mode);
16489 n_elts = GET_MODE_NUNITS (mode);
8d7dae0e 16490 n_words = GET_MODE_SIZE (mode) / GET_MODE_SIZE (tmp_mode);
2bf6d935
ML
16491 n_elt_per_word = n_elts / n_words;
16492 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
16493
16494 for (i = 0; i < n_words; ++i)
16495 {
16496 rtx word = NULL_RTX;
16497
16498 for (j = 0; j < n_elt_per_word; ++j)
16499 {
16500 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
8d7dae0e 16501 elt = convert_modes (tmp_mode, inner_mode, elt, true);
2bf6d935
ML
16502
16503 if (j == 0)
16504 word = elt;
16505 else
16506 {
8d7dae0e 16507 word = expand_simple_binop (tmp_mode, ASHIFT, word, shift,
e1a74058 16508 NULL_RTX, 1, OPTAB_LIB_WIDEN);
8d7dae0e 16509 word = expand_simple_binop (tmp_mode, IOR, word, elt,
e1a74058 16510 NULL_RTX, 1, OPTAB_LIB_WIDEN);
2bf6d935
ML
16511 }
16512 }
16513
16514 words[i] = word;
16515 }
16516
16517 if (n_words == 1)
16518 emit_move_insn (target, gen_lowpart (mode, words[0]));
16519 else if (n_words == 2)
16520 {
affee7dc
RB
16521 gcc_assert (tmp_mode == DImode || tmp_mode == SImode);
16522 machine_mode concat_mode = tmp_mode == DImode ? V2DImode : V2SImode;
16523 rtx tmp = gen_reg_rtx (concat_mode);
16524 vals = gen_rtx_PARALLEL (concat_mode, gen_rtvec_v (2, words));
2b2bf793 16525 ix86_expand_vector_init_general (mmx_ok, concat_mode, tmp, vals);
affee7dc 16526 emit_move_insn (target, gen_lowpart (mode, tmp));
2bf6d935
ML
16527 }
16528 else if (n_words == 4)
16529 {
16530 rtx tmp = gen_reg_rtx (V4SImode);
8d7dae0e 16531 gcc_assert (tmp_mode == SImode);
2bf6d935
ML
16532 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
16533 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
16534 emit_move_insn (target, gen_lowpart (mode, tmp));
16535 }
16536 else
16537 gcc_unreachable ();
16538 }
16539}
16540
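/* Illustrative sketch (not part of the original source): the packing that
   the word-building fallback above performs for a single word, assuming
   four QImode elements per SImode-sized word.  The loop visits the
   highest-numbered element of the word first and then shifts/IORs the
   remaining elements in, so element 0 ends up in the low-order bits.

     unsigned int
     model_pack_word (const unsigned char elt[4])
     {
       unsigned int word = elt[3];	// j == 0: highest-numbered element
       int j;
       for (j = 2; j >= 0; j--)		// remaining elements, high to low
	 word = (word << 8) | elt[j];	// ASHIFT then IOR, as above
       return word;			// elt[0] lands in bits 0..7
     }
*/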
16541/* Initialize vector TARGET via VALS. Suppress the use of MMX
16542 instructions unless MMX_OK is true. */
16543
16544void
16545ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
16546{
16547 machine_mode mode = GET_MODE (target);
16548 machine_mode inner_mode = GET_MODE_INNER (mode);
16549 int n_elts = GET_MODE_NUNITS (mode);
16550 int n_var = 0, one_var = -1;
16551 bool all_same = true, all_const_zero = true;
16552 int i;
16553 rtx x;
16554
16555  /* Handle initialization from vector elts (rather than scalars) first.  */
16556 if (n_elts != XVECLEN (vals, 0))
16557 {
16558 rtx subtarget = target;
16559 x = XVECEXP (vals, 0, 0);
16560 gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
16561 if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
16562 {
16563 rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
b7dd2e4e
JJ
16564 if (inner_mode == QImode
16565 || inner_mode == HImode
575191b9 16566 || inner_mode == TImode
6910cad5 16567 || inner_mode == HFmode
16568 || inner_mode == BFmode)
2bf6d935
ML
16569 {
16570 unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
b7dd2e4e
JJ
16571 scalar_mode elt_mode = inner_mode == TImode ? DImode : SImode;
16572 n_bits /= GET_MODE_SIZE (elt_mode);
16573 mode = mode_for_vector (elt_mode, n_bits).require ();
16574 inner_mode = mode_for_vector (elt_mode, n_bits / 2).require ();
2bf6d935
ML
16575 ops[0] = gen_lowpart (inner_mode, ops[0]);
16576 ops[1] = gen_lowpart (inner_mode, ops[1]);
16577 subtarget = gen_reg_rtx (mode);
16578 }
16579 ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
16580 if (subtarget != target)
16581 emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
16582 return;
16583 }
16584 gcc_unreachable ();
16585 }
16586
16587 for (i = 0; i < n_elts; ++i)
16588 {
16589 x = XVECEXP (vals, 0, i);
16590 if (!(CONST_SCALAR_INT_P (x)
16591 || CONST_DOUBLE_P (x)
16592 || CONST_FIXED_P (x)))
16593 n_var++, one_var = i;
16594 else if (x != CONST0_RTX (inner_mode))
16595 all_const_zero = false;
16596 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
16597 all_same = false;
16598 }
16599
16600 /* Constants are best loaded from the constant pool. */
16601 if (n_var == 0)
16602 {
16603 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
16604 return;
16605 }
16606
16607 /* If all values are identical, broadcast the value. */
16608 if (all_same
16609 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
16610 XVECEXP (vals, 0, 0)))
16611 return;
16612
16613 /* Values where only one field is non-constant are best loaded from
16614 the pool and overwritten via move later. */
16615 if (n_var == 1)
16616 {
16617 if (all_const_zero
16618 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
16619 XVECEXP (vals, 0, one_var),
16620 one_var))
16621 return;
16622
16623 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
16624 return;
16625 }
16626
16627 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
16628}
16629
287cc750 16630/* Implemented as
16631 V setg (V v, int idx, T val)
16632 {
16633 V idxv = (V){idx, idx, idx, idx, idx, idx, idx, idx};
16634 V valv = (V){val, val, val, val, val, val, val, val};
16635 V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == idxv);
16636 v = (v & ~mask) | (valv & mask);
16637 return v;
16638 }. */
16639void
16640ix86_expand_vector_set_var (rtx target, rtx val, rtx idx)
16641{
16642 rtx vec[64];
16643 machine_mode mode = GET_MODE (target);
16644 machine_mode cmp_mode = mode;
16645 int n_elts = GET_MODE_NUNITS (mode);
16646   rtx valv, idxv, constv, idx_tmp;
16647 bool ok = false;
16648
16649  /* 512-bit vector byte/word broadcast and comparison are only available
16650     under TARGET_AVX512BW; without TARGET_AVX512BW, break the 512-bit
16651     vector into two 256-bit vectors.  */
6910cad5 16652 if ((mode == V32HImode || mode == V32HFmode || mode == V32BFmode
16653 || mode == V64QImode)
7a54d3de 16654 && !TARGET_AVX512BW)
287cc750 16655 {
16656 gcc_assert (TARGET_AVX512F);
16657 rtx vhi, vlo, idx_hi;
16658 machine_mode half_mode;
16659 rtx (*extract_hi)(rtx, rtx);
16660 rtx (*extract_lo)(rtx, rtx);
16661
16662 if (mode == V32HImode)
16663 {
16664 half_mode = V16HImode;
16665 extract_hi = gen_vec_extract_hi_v32hi;
16666 extract_lo = gen_vec_extract_lo_v32hi;
16667 }
7a54d3de
UB
16668 else if (mode == V32HFmode)
16669 {
16670 half_mode = V16HFmode;
16671 extract_hi = gen_vec_extract_hi_v32hf;
16672 extract_lo = gen_vec_extract_lo_v32hf;
16673 }
6910cad5 16674 else if (mode == V32BFmode)
16675 {
16676 half_mode = V16BFmode;
16677 extract_hi = gen_vec_extract_hi_v32bf;
16678 extract_lo = gen_vec_extract_lo_v32bf;
16679 }
287cc750 16680 else
16681 {
16682 half_mode = V32QImode;
16683 extract_hi = gen_vec_extract_hi_v64qi;
16684 extract_lo = gen_vec_extract_lo_v64qi;
16685 }
16686
16687 vhi = gen_reg_rtx (half_mode);
16688 vlo = gen_reg_rtx (half_mode);
16689 idx_hi = gen_reg_rtx (GET_MODE (idx));
16690 emit_insn (extract_hi (vhi, target));
16691 emit_insn (extract_lo (vlo, target));
16692 vec[0] = idx_hi;
16693 vec[1] = idx;
16694 vec[2] = GEN_INT (n_elts/2);
16695 ix86_expand_binary_operator (MINUS, GET_MODE (idx), vec);
16696 ix86_expand_vector_set_var (vhi, val, idx_hi);
16697 ix86_expand_vector_set_var (vlo, val, idx);
16698 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, vlo, vhi)));
16699 return;
16700 }
16701
16702 if (FLOAT_MODE_P (GET_MODE_INNER (mode)))
16703 {
16704 switch (mode)
16705 {
16706 case E_V2DFmode:
16707 cmp_mode = V2DImode;
16708 break;
16709 case E_V4DFmode:
16710 cmp_mode = V4DImode;
16711 break;
16712 case E_V8DFmode:
16713 cmp_mode = V8DImode;
16714 break;
20a2c8ac
UB
16715 case E_V2SFmode:
16716 cmp_mode = V2SImode;
16717 break;
287cc750 16718 case E_V4SFmode:
16719 cmp_mode = V4SImode;
16720 break;
16721 case E_V8SFmode:
16722 cmp_mode = V8SImode;
16723 break;
16724 case E_V16SFmode:
16725 cmp_mode = V16SImode;
16726 break;
9e2a82e1 16727 case E_V8HFmode:
16728 cmp_mode = V8HImode;
16729 break;
16730 case E_V16HFmode:
16731 cmp_mode = V16HImode;
16732 break;
16733 case E_V32HFmode:
16734 cmp_mode = V32HImode;
16735 break;
6910cad5 16736 case E_V8BFmode:
16737 cmp_mode = V8HImode;
16738 break;
16739 case E_V16BFmode:
16740 cmp_mode = V16HImode;
16741 break;
16742 case E_V32BFmode:
16743 cmp_mode = V32HImode;
16744 break;
287cc750 16745 default:
16746 gcc_unreachable ();
16747 }
16748 }
16749
16750 for (int i = 0; i != n_elts; i++)
16751 vec[i] = GEN_INT (i);
16752 constv = gen_rtx_CONST_VECTOR (cmp_mode, gen_rtvec_v (n_elts, vec));
16753 valv = gen_reg_rtx (mode);
16754 idxv = gen_reg_rtx (cmp_mode);
16755 idx_tmp = convert_to_mode (GET_MODE_INNER (cmp_mode), idx, 1);
16756
20a2c8ac
UB
16757 ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE,
16758 mode, valv, val);
287cc750 16759 gcc_assert (ok);
20a2c8ac
UB
16760 ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE,
16761 cmp_mode, idxv, idx_tmp);
287cc750 16762 gcc_assert (ok);
16763 vec[0] = target;
16764 vec[1] = valv;
16765 vec[2] = target;
16766 vec[3] = gen_rtx_EQ (mode, idxv, constv);
16767 vec[4] = idxv;
16768 vec[5] = constv;
16769 ok = ix86_expand_int_vcond (vec);
16770 gcc_assert (ok);
16771}
16772
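/* Illustrative sketch (not part of the original source): a scalar model of
   the AVX512BW-less split above, assuming a 32-element HImode vector.  Both
   halves are processed unconditionally; an out-of-range index simply fails
   the compare in that half, so only the addressed lane changes.

     static void
     model_set_var_16 (unsigned short v[16], unsigned short val, int idx)
     {
       int i;
       for (i = 0; i < 16; i++)		// mask = ({0, ..., 15} == idxv)
	 if (i == idx)
	   v[i] = val;			// v = (v & ~mask) | (valv & mask)
     }

     void
     model_set_var_32 (unsigned short v[32], unsigned short val, int idx)
     {
       model_set_var_16 (v, val, idx);		// low half keeps idx
       model_set_var_16 (v + 16, val, idx - 16);	// high half uses idx - n/2
     }
*/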
2bf6d935
ML
16773void
16774ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
16775{
16776 machine_mode mode = GET_MODE (target);
16777 machine_mode inner_mode = GET_MODE_INNER (mode);
16778 machine_mode half_mode;
16779 bool use_vec_merge = false;
7fc4d600 16780 bool blendm_const = false;
2bf6d935 16781 rtx tmp;
6910cad5 16782 static rtx (*gen_extract[8][2]) (rtx, rtx)
2bf6d935
ML
16783 = {
16784 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
16785 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
16786 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
16787 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
16788 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
9e2a82e1 16789 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df },
6910cad5 16790 { gen_vec_extract_lo_v16hf, gen_vec_extract_hi_v16hf },
16791 { gen_vec_extract_lo_v16bf, gen_vec_extract_hi_v16bf }
2bf6d935 16792 };
6910cad5 16793 static rtx (*gen_insert[8][2]) (rtx, rtx, rtx)
2bf6d935
ML
16794 = {
16795 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
16796 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
16797 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
16798 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
16799 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
9e2a82e1 16800 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df },
16801 { gen_vec_set_lo_v16hf, gen_vec_set_hi_v16hf },
6910cad5 16802 { gen_vec_set_lo_v16bf, gen_vec_set_hi_v16bf },
2bf6d935
ML
16803 };
16804 int i, j, n;
16805 machine_mode mmode = VOIDmode;
16806 rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
16807
16808 switch (mode)
16809 {
2bf6d935 16810 case E_V2SImode:
f15c7bd1
UB
16811 use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
16812 if (use_vec_merge)
16813 break;
16814 /* FALLTHRU */
16815
16816 case E_V2SFmode:
2bf6d935
ML
16817 if (mmx_ok)
16818 {
16819 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
16820 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
16821 if (elt == 0)
16822 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
16823 else
16824 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
16825 emit_insn (gen_rtx_SET (target, tmp));
16826 return;
16827 }
16828 break;
16829
16830 case E_V2DImode:
16831 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
16832 if (use_vec_merge)
16833 break;
16834
16835 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
16836 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
16837 if (elt == 0)
16838 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
16839 else
16840 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
16841 emit_insn (gen_rtx_SET (target, tmp));
16842 return;
16843
16844 case E_V2DFmode:
ac173024
L
16845 /* NB: For ELT == 0, use standard scalar operation patterns which
16846 preserve the rest of the vector for combiner:
16847
16848 (vec_merge:V2DF
16849 (vec_duplicate:V2DF (reg:DF))
16850 (reg:V2DF)
16851 (const_int 1))
16852 */
16853 if (elt == 0)
16854 goto do_vec_merge;
16855
2bf6d935
ML
16856 {
16857 rtx op0, op1;
16858
16859 /* For the two element vectors, we implement a VEC_CONCAT with
16860 the extraction of the other element. */
16861
16862 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
16863 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
16864
16865 if (elt == 0)
16866 op0 = val, op1 = tmp;
16867 else
16868 op0 = tmp, op1 = val;
16869
16870 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
16871 emit_insn (gen_rtx_SET (target, tmp));
16872 }
16873 return;
16874
16875 case E_V4SFmode:
16876 use_vec_merge = TARGET_SSE4_1;
16877 if (use_vec_merge)
16878 break;
16879
16880 switch (elt)
16881 {
16882 case 0:
16883 use_vec_merge = true;
16884 break;
16885
16886 case 1:
16887 /* tmp = target = A B C D */
16888 tmp = copy_to_reg (target);
16889 /* target = A A B B */
16890 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
16891 /* target = X A B B */
16892 ix86_expand_vector_set (false, target, val, 0);
16893 /* target = A X C D */
16894 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
16895 const1_rtx, const0_rtx,
16896 GEN_INT (2+4), GEN_INT (3+4)));
16897 return;
16898
16899 case 2:
16900 /* tmp = target = A B C D */
16901 tmp = copy_to_reg (target);
16902 /* tmp = X B C D */
16903 ix86_expand_vector_set (false, tmp, val, 0);
16904 /* target = A B X D */
16905 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
16906 const0_rtx, const1_rtx,
16907 GEN_INT (0+4), GEN_INT (3+4)));
16908 return;
16909
16910 case 3:
16911 /* tmp = target = A B C D */
16912 tmp = copy_to_reg (target);
16913 /* tmp = X B C D */
16914 ix86_expand_vector_set (false, tmp, val, 0);
16915 /* target = A B X D */
16916 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
16917 const0_rtx, const1_rtx,
16918 GEN_INT (2+4), GEN_INT (0+4)));
16919 return;
16920
16921 default:
16922 gcc_unreachable ();
16923 }
16924 break;
16925
16926 case E_V4SImode:
16927 use_vec_merge = TARGET_SSE4_1;
16928 if (use_vec_merge)
16929 break;
16930
16931 /* Element 0 handled by vec_merge below. */
16932 if (elt == 0)
16933 {
16934 use_vec_merge = true;
16935 break;
16936 }
16937
16938 if (TARGET_SSE2)
16939 {
16940 /* With SSE2, use integer shuffles to swap element 0 and ELT,
16941 store into element 0, then shuffle them back. */
16942
16943 rtx order[4];
16944
16945 order[0] = GEN_INT (elt);
16946 order[1] = const1_rtx;
16947 order[2] = const2_rtx;
16948 order[3] = GEN_INT (3);
16949 order[elt] = const0_rtx;
16950
16951 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
16952 order[1], order[2], order[3]));
16953
16954 ix86_expand_vector_set (false, target, val, 0);
16955
16956 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
16957 order[1], order[2], order[3]));
16958 }
16959 else
16960 {
16961 /* For SSE1, we have to reuse the V4SF code. */
16962 rtx t = gen_reg_rtx (V4SFmode);
16963 emit_move_insn (t, gen_lowpart (V4SFmode, target));
16964 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
16965 emit_move_insn (target, gen_lowpart (mode, t));
16966 }
16967 return;
16968
16969 case E_V8HImode:
7eb961d8 16970 case E_V8HFmode:
6910cad5 16971 case E_V8BFmode:
5883e567 16972 case E_V2HImode:
2bf6d935
ML
16973 use_vec_merge = TARGET_SSE2;
16974 break;
16975 case E_V4HImode:
16976 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
16977 break;
16978
16979 case E_V16QImode:
5883e567 16980 case E_V4QImode:
2bf6d935
ML
16981 use_vec_merge = TARGET_SSE4_1;
16982 break;
16983
16984 case E_V8QImode:
f15c7bd1 16985 use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
2bf6d935
ML
16986 break;
16987
16988 case E_V32QImode:
16989 half_mode = V16QImode;
16990 j = 0;
16991 n = 16;
16992 goto half;
16993
9e2a82e1 16994 case E_V16HFmode:
6910cad5 16995 case E_V16BFmode:
1f759dbd 16996 /* For ELT == 0, vec_setv8hf_0 can save 1 vpbroadcastw. */
16997 if (TARGET_AVX2 && elt != 0)
7fc4d600 16998 {
16999 mmode = SImode;
6910cad5 17000 gen_blendm = ((mode == E_V16HFmode) ? gen_avx2_pblendph_1
17001 : gen_avx2_pblendbf_1);
7fc4d600 17002 blendm_const = true;
17003 break;
17004 }
17005 else
17006 {
6910cad5 17007 half_mode = ((mode == E_V16HFmode) ? V8HFmode : V8BFmode);
17008 j = ((mode == E_V16HFmode) ? 6 : 7);
7fc4d600 17009 n = 8;
17010 goto half;
17011 }
9e2a82e1 17012
2bf6d935
ML
17013 case E_V16HImode:
17014 half_mode = V8HImode;
17015 j = 1;
17016 n = 8;
17017 goto half;
17018
17019 case E_V8SImode:
17020 half_mode = V4SImode;
17021 j = 2;
17022 n = 4;
17023 goto half;
17024
17025 case E_V4DImode:
17026 half_mode = V2DImode;
17027 j = 3;
17028 n = 2;
17029 goto half;
17030
17031 case E_V8SFmode:
17032 half_mode = V4SFmode;
17033 j = 4;
17034 n = 4;
17035 goto half;
17036
17037 case E_V4DFmode:
17038 half_mode = V2DFmode;
17039 j = 5;
17040 n = 2;
17041 goto half;
17042
17043half:
17044 /* Compute offset. */
17045 i = elt / n;
17046 elt %= n;
17047
17048 gcc_assert (i <= 1);
17049
17050 /* Extract the half. */
17051 tmp = gen_reg_rtx (half_mode);
17052 emit_insn (gen_extract[j][i] (tmp, target));
17053
17054 /* Put val in tmp at elt. */
17055 ix86_expand_vector_set (false, tmp, val, elt);
17056
17057 /* Put it back. */
17058 emit_insn (gen_insert[j][i] (target, target, tmp));
17059 return;
17060
17061 case E_V8DFmode:
17062 if (TARGET_AVX512F)
17063 {
17064 mmode = QImode;
17065 gen_blendm = gen_avx512f_blendmv8df;
17066 }
17067 break;
17068
17069 case E_V8DImode:
17070 if (TARGET_AVX512F)
17071 {
17072 mmode = QImode;
17073 gen_blendm = gen_avx512f_blendmv8di;
17074 }
17075 break;
17076
17077 case E_V16SFmode:
17078 if (TARGET_AVX512F)
17079 {
17080 mmode = HImode;
17081 gen_blendm = gen_avx512f_blendmv16sf;
17082 }
17083 break;
17084
17085 case E_V16SImode:
17086 if (TARGET_AVX512F)
17087 {
17088 mmode = HImode;
17089 gen_blendm = gen_avx512f_blendmv16si;
17090 }
17091 break;
17092
9e2a82e1 17093 case E_V32HFmode:
17094 if (TARGET_AVX512BW)
17095 {
17096 mmode = SImode;
17097 gen_blendm = gen_avx512bw_blendmv32hf;
17098 }
17099 break;
6910cad5 17100 case E_V32BFmode:
17101 if (TARGET_AVX512BW)
17102 {
17103 mmode = SImode;
17104 gen_blendm = gen_avx512bw_blendmv32bf;
17105 }
17106 break;
2bf6d935
ML
17107 case E_V32HImode:
17108 if (TARGET_AVX512BW)
17109 {
17110 mmode = SImode;
17111 gen_blendm = gen_avx512bw_blendmv32hi;
17112 }
17113 else if (TARGET_AVX512F)
17114 {
17115 half_mode = E_V8HImode;
17116 n = 8;
17117 goto quarter;
17118 }
17119 break;
17120
17121 case E_V64QImode:
17122 if (TARGET_AVX512BW)
17123 {
17124 mmode = DImode;
17125 gen_blendm = gen_avx512bw_blendmv64qi;
17126 }
17127 else if (TARGET_AVX512F)
17128 {
17129 half_mode = E_V16QImode;
17130 n = 16;
17131 goto quarter;
17132 }
17133 break;
17134
17135quarter:
17136 /* Compute offset. */
17137 i = elt / n;
17138 elt %= n;
17139
17140 gcc_assert (i <= 3);
17141
17142 {
17143 /* Extract the quarter. */
17144 tmp = gen_reg_rtx (V4SImode);
17145 rtx tmp2 = gen_lowpart (V16SImode, target);
17146 rtx mask = gen_reg_rtx (QImode);
17147
17148 emit_move_insn (mask, constm1_rtx);
17149 emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i),
17150 tmp, mask));
17151
17152 tmp2 = gen_reg_rtx (half_mode);
17153 emit_move_insn (tmp2, gen_lowpart (half_mode, tmp));
17154 tmp = tmp2;
17155
17156 /* Put val in tmp at elt. */
17157 ix86_expand_vector_set (false, tmp, val, elt);
17158
17159 /* Put it back. */
17160 tmp2 = gen_reg_rtx (V16SImode);
17161 rtx tmp3 = gen_lowpart (V16SImode, target);
17162 mask = gen_reg_rtx (HImode);
17163 emit_move_insn (mask, constm1_rtx);
17164 tmp = gen_lowpart (V4SImode, tmp);
17165 emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i),
17166 tmp3, mask));
17167 emit_move_insn (target, gen_lowpart (mode, tmp2));
17168 }
17169 return;
17170
17171 default:
17172 break;
17173 }
17174
17175 if (mmode != VOIDmode)
17176 {
17177 tmp = gen_reg_rtx (mode);
17178 emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
7fc4d600 17179 rtx merge_mask = gen_int_mode (HOST_WIDE_INT_1U << elt, mmode);
2bf6d935
ML
17180 /* The avx512*_blendm<mode> expanders have different operand order
17181 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
17182 elements where the mask is set and second input operand otherwise,
17183 in {sse,avx}*_*blend* the first input operand is used for elements
17184 where the mask is clear and second input operand otherwise. */
7fc4d600 17185 if (!blendm_const)
17186 merge_mask = force_reg (mmode, merge_mask);
17187 emit_insn (gen_blendm (target, target, tmp, merge_mask));
2bf6d935
ML
17188 }
17189 else if (use_vec_merge)
17190 {
ac173024 17191do_vec_merge:
2bf6d935
ML
17192 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
17193 tmp = gen_rtx_VEC_MERGE (mode, tmp, target,
17194 GEN_INT (HOST_WIDE_INT_1U << elt));
17195 emit_insn (gen_rtx_SET (target, tmp));
17196 }
17197 else
17198 {
17199 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
17200
17201 emit_move_insn (mem, target);
17202
17203 tmp = adjust_address (mem, inner_mode, elt * GET_MODE_SIZE (inner_mode));
17204 emit_move_insn (tmp, val);
17205
17206 emit_move_insn (target, mem);
17207 }
17208}
17209
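/* Illustrative sketch (not part of the original source): a scalar model of
   the "half" strategy used above for 256-bit modes, assuming a V8SImode
   vector.  The containing 128-bit half is extracted, the element is set
   within it, and the half is inserted back.

     void
     model_set_v8si (int v[8], int val, int elt)
     {
       int half[4];
       int hi = elt / 4;				// which 128-bit half
       __builtin_memcpy (half, v + 4 * hi, sizeof half);	// extract
       half[elt % 4] = val;				// set within the half
       __builtin_memcpy (v + 4 * hi, half, sizeof half);	// insert back
     }
*/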
17210void
17211ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
17212{
17213 machine_mode mode = GET_MODE (vec);
17214 machine_mode inner_mode = GET_MODE_INNER (mode);
17215 bool use_vec_extr = false;
17216 rtx tmp;
17217
17218 switch (mode)
17219 {
17220 case E_V2SImode:
5fbc8ab4
UB
17221 use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
17222 if (use_vec_extr)
17223 break;
17224 /* FALLTHRU */
17225
2bf6d935
ML
17226 case E_V2SFmode:
17227 if (!mmx_ok)
17228 break;
17229 /* FALLTHRU */
17230
17231 case E_V2DFmode:
17232 case E_V2DImode:
17233 case E_V2TImode:
17234 case E_V4TImode:
17235 use_vec_extr = true;
17236 break;
17237
17238 case E_V4SFmode:
17239 use_vec_extr = TARGET_SSE4_1;
17240 if (use_vec_extr)
17241 break;
17242
17243 switch (elt)
17244 {
17245 case 0:
17246 tmp = vec;
17247 break;
17248
17249 case 1:
17250 case 3:
17251 tmp = gen_reg_rtx (mode);
17252 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
17253 GEN_INT (elt), GEN_INT (elt),
17254 GEN_INT (elt+4), GEN_INT (elt+4)));
17255 break;
17256
17257 case 2:
17258 tmp = gen_reg_rtx (mode);
17259 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
17260 break;
17261
17262 default:
17263 gcc_unreachable ();
17264 }
17265 vec = tmp;
17266 use_vec_extr = true;
17267 elt = 0;
17268 break;
17269
17270 case E_V4SImode:
17271 use_vec_extr = TARGET_SSE4_1;
17272 if (use_vec_extr)
17273 break;
17274
17275 if (TARGET_SSE2)
17276 {
17277 switch (elt)
17278 {
17279 case 0:
17280 tmp = vec;
17281 break;
17282
17283 case 1:
17284 case 3:
17285 tmp = gen_reg_rtx (mode);
17286 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
17287 GEN_INT (elt), GEN_INT (elt),
17288 GEN_INT (elt), GEN_INT (elt)));
17289 break;
17290
17291 case 2:
17292 tmp = gen_reg_rtx (mode);
17293 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
17294 break;
17295
17296 default:
17297 gcc_unreachable ();
17298 }
17299 vec = tmp;
17300 use_vec_extr = true;
17301 elt = 0;
17302 }
17303 else
17304 {
17305 /* For SSE1, we have to reuse the V4SF code. */
17306 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
17307 gen_lowpart (V4SFmode, vec), elt);
17308 return;
17309 }
17310 break;
17311
17312 case E_V8HImode:
7a54d3de 17313 case E_V8HFmode:
6910cad5 17314 case E_V8BFmode:
5883e567 17315 case E_V2HImode:
2bf6d935
ML
17316 use_vec_extr = TARGET_SSE2;
17317 break;
17318 case E_V4HImode:
17319 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
17320 break;
17321
17322 case E_V16QImode:
17323 use_vec_extr = TARGET_SSE4_1;
f66e6e2b
JJ
17324 if (!use_vec_extr
17325 && TARGET_SSE2
17326 && elt == 0
17327 && (optimize_insn_for_size_p () || TARGET_INTER_UNIT_MOVES_FROM_VEC))
17328 {
17329 tmp = gen_reg_rtx (SImode);
17330 ix86_expand_vector_extract (false, tmp, gen_lowpart (V4SImode, vec),
17331 0);
17332 emit_insn (gen_rtx_SET (target, gen_lowpart (QImode, tmp)));
17333 return;
17334 }
2bf6d935 17335 break;
5883e567
UB
17336 case E_V4QImode:
17337 use_vec_extr = TARGET_SSE4_1;
17338 break;
2bf6d935
ML
17339
17340 case E_V8SFmode:
17341 if (TARGET_AVX)
17342 {
17343 tmp = gen_reg_rtx (V4SFmode);
17344 if (elt < 4)
17345 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
17346 else
17347 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
17348 ix86_expand_vector_extract (false, target, tmp, elt & 3);
17349 return;
17350 }
17351 break;
17352
17353 case E_V4DFmode:
17354 if (TARGET_AVX)
17355 {
17356 tmp = gen_reg_rtx (V2DFmode);
17357 if (elt < 2)
17358 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
17359 else
17360 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
17361 ix86_expand_vector_extract (false, target, tmp, elt & 1);
17362 return;
17363 }
17364 break;
17365
17366 case E_V32QImode:
17367 if (TARGET_AVX)
17368 {
17369 tmp = gen_reg_rtx (V16QImode);
17370 if (elt < 16)
17371 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
17372 else
17373 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
17374 ix86_expand_vector_extract (false, target, tmp, elt & 15);
17375 return;
17376 }
17377 break;
17378
17379 case E_V16HImode:
17380 if (TARGET_AVX)
17381 {
17382 tmp = gen_reg_rtx (V8HImode);
17383 if (elt < 8)
17384 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
17385 else
17386 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
17387 ix86_expand_vector_extract (false, target, tmp, elt & 7);
17388 return;
17389 }
17390 break;
17391
17392 case E_V8SImode:
17393 if (TARGET_AVX)
17394 {
17395 tmp = gen_reg_rtx (V4SImode);
17396 if (elt < 4)
17397 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
17398 else
17399 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
17400 ix86_expand_vector_extract (false, target, tmp, elt & 3);
17401 return;
17402 }
17403 break;
17404
17405 case E_V4DImode:
17406 if (TARGET_AVX)
17407 {
17408 tmp = gen_reg_rtx (V2DImode);
17409 if (elt < 2)
17410 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
17411 else
17412 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
17413 ix86_expand_vector_extract (false, target, tmp, elt & 1);
17414 return;
17415 }
17416 break;
17417
17418 case E_V32HImode:
17419 if (TARGET_AVX512BW)
17420 {
17421 tmp = gen_reg_rtx (V16HImode);
17422 if (elt < 16)
17423 emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
17424 else
17425 emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
17426 ix86_expand_vector_extract (false, target, tmp, elt & 15);
17427 return;
17428 }
17429 break;
17430
17431 case E_V64QImode:
17432 if (TARGET_AVX512BW)
17433 {
17434 tmp = gen_reg_rtx (V32QImode);
17435 if (elt < 32)
17436 emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
17437 else
17438 emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
17439 ix86_expand_vector_extract (false, target, tmp, elt & 31);
17440 return;
17441 }
17442 break;
17443
17444 case E_V16SFmode:
17445 tmp = gen_reg_rtx (V8SFmode);
17446 if (elt < 8)
17447 emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
17448 else
17449 emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
17450 ix86_expand_vector_extract (false, target, tmp, elt & 7);
17451 return;
17452
17453 case E_V8DFmode:
17454 tmp = gen_reg_rtx (V4DFmode);
17455 if (elt < 4)
17456 emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
17457 else
17458 emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
17459 ix86_expand_vector_extract (false, target, tmp, elt & 3);
17460 return;
17461
17462 case E_V16SImode:
17463 tmp = gen_reg_rtx (V8SImode);
17464 if (elt < 8)
17465 emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
17466 else
17467 emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
17468 ix86_expand_vector_extract (false, target, tmp, elt & 7);
17469 return;
17470
17471 case E_V8DImode:
17472 tmp = gen_reg_rtx (V4DImode);
17473 if (elt < 4)
17474 emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
17475 else
17476 emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
17477 ix86_expand_vector_extract (false, target, tmp, elt & 3);
17478 return;
17479
9e2a82e1 17480 case E_V32HFmode:
6910cad5 17481 case E_V32BFmode:
7a54d3de
UB
17482 if (TARGET_AVX512BW)
17483 {
6910cad5 17484 tmp = (mode == E_V32HFmode
17485 ? gen_reg_rtx (V16HFmode)
17486 : gen_reg_rtx (V16BFmode));
7a54d3de 17487 if (elt < 16)
96799fa4 17488 emit_insn (gen_vec_extract_lo (mode, tmp, vec));
7a54d3de 17489 else
96799fa4 17490 emit_insn (gen_vec_extract_hi (mode, tmp, vec));
7a54d3de
UB
17491 ix86_expand_vector_extract (false, target, tmp, elt & 15);
17492 return;
17493 }
17494 break;
9e2a82e1 17495
17496 case E_V16HFmode:
6910cad5 17497 case E_V16BFmode:
7a54d3de
UB
17498 if (TARGET_AVX)
17499 {
6910cad5 17500 tmp = (mode == E_V16HFmode
17501 ? gen_reg_rtx (V8HFmode)
17502 : gen_reg_rtx (V8BFmode));
7a54d3de 17503 if (elt < 8)
96799fa4 17504 emit_insn (gen_vec_extract_lo (mode, tmp, vec));
7a54d3de 17505 else
96799fa4 17506 emit_insn (gen_vec_extract_hi (mode, tmp, vec));
7a54d3de
UB
17507 ix86_expand_vector_extract (false, target, tmp, elt & 7);
17508 return;
17509 }
9e2a82e1 17510 break;
17511
2bf6d935 17512 case E_V8QImode:
5fbc8ab4 17513 use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
2bf6d935 17514 /* ??? Could extract the appropriate HImode element and shift. */
5fbc8ab4
UB
17515 break;
17516
2bf6d935
ML
17517 default:
17518 break;
17519 }
17520
17521 if (use_vec_extr)
17522 {
17523 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
17524 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
17525
17526 /* Let the rtl optimizers know about the zero extension performed. */
17527 if (inner_mode == QImode || inner_mode == HImode)
17528 {
97c32001 17529 rtx reg = gen_reg_rtx (SImode);
2bf6d935 17530 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
97c32001
RS
17531 emit_move_insn (reg, tmp);
17532 tmp = gen_lowpart (inner_mode, reg);
17533 SUBREG_PROMOTED_VAR_P (tmp) = 1;
17534 SUBREG_PROMOTED_SET (tmp, 1);
2bf6d935
ML
17535 }
17536
97c32001 17537 emit_move_insn (target, tmp);
2bf6d935
ML
17538 }
17539 else
17540 {
17541 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
17542
17543 emit_move_insn (mem, vec);
17544
17545 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
17546 emit_move_insn (target, tmp);
17547 }
17548}
17549
17550/* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
17551 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
17552 The upper bits of DEST are undefined, though they shouldn't cause
17553 exceptions (some bits from src or all zeros are ok). */
17554
17555static void
17556emit_reduc_half (rtx dest, rtx src, int i)
17557{
17558 rtx tem, d = dest;
17559 switch (GET_MODE (src))
17560 {
17561 case E_V4SFmode:
17562 if (i == 128)
17563 tem = gen_sse_movhlps (dest, src, src);
17564 else
17565 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
17566 GEN_INT (1 + 4), GEN_INT (1 + 4));
17567 break;
17568 case E_V2DFmode:
17569 tem = gen_vec_interleave_highv2df (dest, src, src);
17570 break;
73c535a0 17571 case E_V4QImode:
17572 d = gen_reg_rtx (V1SImode);
17573 tem = gen_mmx_lshrv1si3 (d, gen_lowpart (V1SImode, src),
17574 GEN_INT (i / 2));
17575 break;
77ca2cfc 17576 case E_V4HImode:
17577 d = gen_reg_rtx (V1DImode);
17578 tem = gen_mmx_lshrv1di3 (d, gen_lowpart (V1DImode, src),
17579 GEN_INT (i / 2));
17580 break;
2bf6d935
ML
17581 case E_V16QImode:
17582 case E_V8HImode:
3540429b 17583 case E_V8HFmode:
2bf6d935
ML
17584 case E_V4SImode:
17585 case E_V2DImode:
17586 d = gen_reg_rtx (V1TImode);
17587 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
17588 GEN_INT (i / 2));
17589 break;
17590 case E_V8SFmode:
17591 if (i == 256)
17592 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
17593 else
17594 tem = gen_avx_shufps256 (dest, src, src,
17595 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
17596 break;
17597 case E_V4DFmode:
17598 if (i == 256)
17599 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
17600 else
17601 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
17602 break;
17603 case E_V32QImode:
17604 case E_V16HImode:
3540429b 17605 case E_V16HFmode:
2bf6d935
ML
17606 case E_V8SImode:
17607 case E_V4DImode:
17608 if (i == 256)
17609 {
17610 if (GET_MODE (dest) != V4DImode)
17611 d = gen_reg_rtx (V4DImode);
17612 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
17613 gen_lowpart (V4DImode, src),
17614 const1_rtx);
17615 }
17616 else
17617 {
17618 d = gen_reg_rtx (V2TImode);
17619 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
17620 GEN_INT (i / 2));
17621 }
17622 break;
17623 case E_V64QImode:
17624 case E_V32HImode:
3540429b 17625 case E_V32HFmode:
bee27152
JJ
17626 if (i < 64)
17627 {
17628 d = gen_reg_rtx (V4TImode);
17629 tem = gen_avx512bw_lshrv4ti3 (d, gen_lowpart (V4TImode, src),
17630 GEN_INT (i / 2));
17631 break;
17632 }
17633 /* FALLTHRU */
2bf6d935
ML
17634 case E_V16SImode:
17635 case E_V16SFmode:
17636 case E_V8DImode:
17637 case E_V8DFmode:
17638 if (i > 128)
17639 tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
bee27152
JJ
17640 gen_lowpart (V16SImode, src),
17641 gen_lowpart (V16SImode, src),
17642 GEN_INT (0x4 + (i == 512 ? 4 : 0)),
17643 GEN_INT (0x5 + (i == 512 ? 4 : 0)),
17644 GEN_INT (0x6 + (i == 512 ? 4 : 0)),
17645 GEN_INT (0x7 + (i == 512 ? 4 : 0)),
17646 GEN_INT (0xC), GEN_INT (0xD),
17647 GEN_INT (0xE), GEN_INT (0xF),
17648 GEN_INT (0x10), GEN_INT (0x11),
17649 GEN_INT (0x12), GEN_INT (0x13),
17650 GEN_INT (0x14), GEN_INT (0x15),
17651 GEN_INT (0x16), GEN_INT (0x17));
2bf6d935
ML
17652 else
17653 tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
bee27152
JJ
17654 gen_lowpart (V16SImode, src),
17655 GEN_INT (i == 128 ? 0x2 : 0x1),
17656 GEN_INT (0x3),
17657 GEN_INT (0x3),
17658 GEN_INT (0x3),
17659 GEN_INT (i == 128 ? 0x6 : 0x5),
17660 GEN_INT (0x7),
17661 GEN_INT (0x7),
17662 GEN_INT (0x7),
17663 GEN_INT (i == 128 ? 0xA : 0x9),
17664 GEN_INT (0xB),
17665 GEN_INT (0xB),
17666 GEN_INT (0xB),
17667 GEN_INT (i == 128 ? 0xE : 0xD),
17668 GEN_INT (0xF),
17669 GEN_INT (0xF),
17670 GEN_INT (0xF));
2bf6d935
ML
17671 break;
17672 default:
17673 gcc_unreachable ();
17674 }
17675 emit_insn (tem);
17676 if (d != dest)
17677 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
17678}
17679
17680/* Expand a vector reduction. FN is the binary pattern to reduce;
17681 DEST is the destination; IN is the input vector. */
17682
17683void
17684ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
17685{
17686 rtx half, dst, vec = in;
17687 machine_mode mode = GET_MODE (in);
17688 int i;
17689
17690 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
17691 if (TARGET_SSE4_1
17692 && mode == V8HImode
17693 && fn == gen_uminv8hi3)
17694 {
17695 emit_insn (gen_sse4_1_phminposuw (dest, in));
17696 return;
17697 }
17698
17699 for (i = GET_MODE_BITSIZE (mode);
17700 i > GET_MODE_UNIT_BITSIZE (mode);
17701 i >>= 1)
17702 {
17703 half = gen_reg_rtx (mode);
17704 emit_reduc_half (half, vec, i);
17705 if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
17706 dst = dest;
17707 else
17708 dst = gen_reg_rtx (mode);
17709 emit_insn (fn (dst, half, vec));
17710 vec = dst;
17711 }
17712}
17713
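/* Illustrative sketch (not part of the original source): a scalar model of
   the reduction loop above, assuming a V8SImode input and a MAX pattern.
   Each step shifts the upper half of the active elements down (the job of
   emit_reduc_half) and combines it with the lower half, so element 0 of
   the final vector holds the reduction result.

     int
     model_reduc_max_v8si (const int in[8])
     {
       int vec[8], half[8], i, n;
       __builtin_memcpy (vec, in, sizeof vec);
       for (n = 8; n > 1; n /= 2)
	 {
	   for (i = 0; i < n / 2; i++)	// emit_reduc_half
	     half[i] = vec[i + n / 2];
	   for (i = 0; i < n / 2; i++)	// fn (dst, half, vec)
	     vec[i] = half[i] > vec[i] ? half[i] : vec[i];
	 }
       return vec[0];
     }
*/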
17714/* Output code to perform a conditional jump to LABEL, if the C2 flag in
17715   the FP status register is set.  */
17716
17717void
17718ix86_emit_fp_unordered_jump (rtx label)
17719{
17720 rtx reg = gen_reg_rtx (HImode);
17721 rtx_insn *insn;
17722 rtx temp;
17723
17724 emit_insn (gen_x86_fnstsw_1 (reg));
17725
17726 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
17727 {
17728 emit_insn (gen_x86_sahf_1 (reg));
17729
17730 temp = gen_rtx_REG (CCmode, FLAGS_REG);
17731 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
17732 }
17733 else
17734 {
17735 emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
17736
17737 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
17738 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
17739 }
17740
17741 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
17742 gen_rtx_LABEL_REF (VOIDmode, label),
17743 pc_rtx);
17744 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
17745 predict_jump (REG_BR_PROB_BASE * 10 / 100);
17746 JUMP_LABEL (insn) = label;
17747}
17748
17749/* Output code to perform a sinh XFmode calculation.  */
17750
152f243f
JJ
17751void
17752ix86_emit_i387_sinh (rtx op0, rtx op1)
2bf6d935
ML
17753{
17754 rtx e1 = gen_reg_rtx (XFmode);
17755 rtx e2 = gen_reg_rtx (XFmode);
17756 rtx scratch = gen_reg_rtx (HImode);
17757 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
17758 rtx half = const_double_from_real_value (dconsthalf, XFmode);
17759 rtx cst1, tmp;
17760 rtx_code_label *jump_label = gen_label_rtx ();
17761 rtx_insn *insn;
17762
17763 /* scratch = fxam (op1) */
17764 emit_insn (gen_fxamxf2_i387 (scratch, op1));
17765
17766 /* e1 = expm1 (|op1|) */
17767 emit_insn (gen_absxf2 (e2, op1));
17768 emit_insn (gen_expm1xf2 (e1, e2));
17769
17770 /* e2 = e1 / (e1 + 1.0) + e1 */
17771 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17772 emit_insn (gen_addxf3 (e2, e1, cst1));
17773 emit_insn (gen_divxf3 (e2, e1, e2));
17774 emit_insn (gen_addxf3 (e2, e2, e1));
17775
17776 /* flags = signbit (op1) */
17777 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
17778
17779 /* if (flags) then e2 = -e2 */
17780 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
17781 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
17782 gen_rtx_LABEL_REF (VOIDmode, jump_label),
17783 pc_rtx);
17784 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17785 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17786 JUMP_LABEL (insn) = jump_label;
17787
17788 emit_insn (gen_negxf2 (e2, e2));
17789
17790 emit_label (jump_label);
17791 LABEL_NUSES (jump_label) = 1;
17792
17793 /* op0 = 0.5 * e2 */
17794 half = force_reg (XFmode, half);
17795 emit_insn (gen_mulxf3 (op0, e2, half));
17796}
17797
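/* Illustrative sketch (not part of the original source): the identity the
   sinh expansion above relies on, as plain C.

     sinh(x) = sign(x) * 0.5 * (e / (e + 1) + e),   e = expm1 (|x|)

     #include <math.h>
     long double
     model_sinh (long double x)
     {
       long double e = expm1l (fabsl (x));
       long double r = 0.5L * (e / (e + 1.0L) + e);
       return signbit (x) ? -r : r;
     }
*/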
17798/* Output code to perform a cosh XFmode calculation.  */
17799
152f243f
JJ
17800void
17801ix86_emit_i387_cosh (rtx op0, rtx op1)
2bf6d935
ML
17802{
17803 rtx e1 = gen_reg_rtx (XFmode);
17804 rtx e2 = gen_reg_rtx (XFmode);
17805 rtx half = const_double_from_real_value (dconsthalf, XFmode);
17806 rtx cst1;
17807
17808 /* e1 = exp (op1) */
17809 emit_insn (gen_expxf2 (e1, op1));
17810
17811 /* e2 = e1 + 1.0 / e1 */
17812 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17813 emit_insn (gen_divxf3 (e2, cst1, e1));
17814 emit_insn (gen_addxf3 (e2, e1, e2));
17815
17816 /* op0 = 0.5 * e2 */
17817 half = force_reg (XFmode, half);
17818 emit_insn (gen_mulxf3 (op0, e2, half));
17819}
17820
17821/* Output code to perform a tanh XFmode calculation.  */
17822
152f243f
JJ
17823void
17824ix86_emit_i387_tanh (rtx op0, rtx op1)
2bf6d935
ML
17825{
17826 rtx e1 = gen_reg_rtx (XFmode);
17827 rtx e2 = gen_reg_rtx (XFmode);
17828 rtx scratch = gen_reg_rtx (HImode);
17829 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
17830 rtx cst2, tmp;
17831 rtx_code_label *jump_label = gen_label_rtx ();
17832 rtx_insn *insn;
17833
17834 /* scratch = fxam (op1) */
17835 emit_insn (gen_fxamxf2_i387 (scratch, op1));
17836
17837 /* e1 = expm1 (-|2 * op1|) */
17838 emit_insn (gen_addxf3 (e2, op1, op1));
17839 emit_insn (gen_absxf2 (e2, e2));
17840 emit_insn (gen_negxf2 (e2, e2));
17841 emit_insn (gen_expm1xf2 (e1, e2));
17842
17843 /* e2 = e1 / (e1 + 2.0) */
17844 cst2 = force_reg (XFmode, CONST2_RTX (XFmode));
17845 emit_insn (gen_addxf3 (e2, e1, cst2));
17846 emit_insn (gen_divxf3 (e2, e1, e2));
17847
17848 /* flags = signbit (op1) */
17849 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
17850
17851 /* if (!flags) then e2 = -e2 */
17852 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
17853 gen_rtx_NE (VOIDmode, flags, const0_rtx),
17854 gen_rtx_LABEL_REF (VOIDmode, jump_label),
17855 pc_rtx);
17856 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17857 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17858 JUMP_LABEL (insn) = jump_label;
17859
17860 emit_insn (gen_negxf2 (e2, e2));
17861
17862 emit_label (jump_label);
17863 LABEL_NUSES (jump_label) = 1;
17864
17865 emit_move_insn (op0, e2);
17866}
17867
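/* Illustrative sketch (not part of the original source): the identity the
   tanh expansion above relies on, as plain C.

     tanh(x) = -sign(x) * e / (e + 2),   e = expm1 (-|2 * x|)

     #include <math.h>
     long double
     model_tanh (long double x)
     {
       long double e = expm1l (-fabsl (2.0L * x));
       long double r = e / (e + 2.0L);
       return signbit (x) ? r : -r;	// negate for non-negative x
     }
*/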
17868/* Output code to perform an asinh XFmode calculation. */
17869
152f243f
JJ
17870void
17871ix86_emit_i387_asinh (rtx op0, rtx op1)
2bf6d935
ML
17872{
17873 rtx e1 = gen_reg_rtx (XFmode);
17874 rtx e2 = gen_reg_rtx (XFmode);
17875 rtx scratch = gen_reg_rtx (HImode);
17876 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
17877 rtx cst1, tmp;
17878 rtx_code_label *jump_label = gen_label_rtx ();
17879 rtx_insn *insn;
17880
17881 /* e2 = sqrt (op1^2 + 1.0) + 1.0 */
17882 emit_insn (gen_mulxf3 (e1, op1, op1));
17883 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17884 emit_insn (gen_addxf3 (e2, e1, cst1));
17885 emit_insn (gen_sqrtxf2 (e2, e2));
17886 emit_insn (gen_addxf3 (e2, e2, cst1));
17887
17888 /* e1 = e1 / e2 */
17889 emit_insn (gen_divxf3 (e1, e1, e2));
17890
17891 /* scratch = fxam (op1) */
17892 emit_insn (gen_fxamxf2_i387 (scratch, op1));
17893
17894 /* e1 = e1 + |op1| */
17895 emit_insn (gen_absxf2 (e2, op1));
17896 emit_insn (gen_addxf3 (e1, e1, e2));
17897
17898 /* e2 = log1p (e1) */
17899 ix86_emit_i387_log1p (e2, e1);
17900
17901 /* flags = signbit (op1) */
17902 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
17903
17904 /* if (flags) then e2 = -e2 */
17905 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
17906 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
17907 gen_rtx_LABEL_REF (VOIDmode, jump_label),
17908 pc_rtx);
17909 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17910 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17911 JUMP_LABEL (insn) = jump_label;
17912
17913 emit_insn (gen_negxf2 (e2, e2));
17914
17915 emit_label (jump_label);
17916 LABEL_NUSES (jump_label) = 1;
17917
17918 emit_move_insn (op0, e2);
17919}
17920
17921/* Output code to perform an acosh XFmode calculation. */
17922
152f243f
JJ
17923void
17924ix86_emit_i387_acosh (rtx op0, rtx op1)
2bf6d935
ML
17925{
17926 rtx e1 = gen_reg_rtx (XFmode);
17927 rtx e2 = gen_reg_rtx (XFmode);
17928 rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17929
17930 /* e2 = sqrt (op1 + 1.0) */
17931 emit_insn (gen_addxf3 (e2, op1, cst1));
17932 emit_insn (gen_sqrtxf2 (e2, e2));
17933
17934 /* e1 = sqrt (op1 - 1.0) */
17935 emit_insn (gen_subxf3 (e1, op1, cst1));
17936 emit_insn (gen_sqrtxf2 (e1, e1));
17937
17938 /* e1 = e1 * e2 */
17939 emit_insn (gen_mulxf3 (e1, e1, e2));
17940
17941 /* e1 = e1 + op1 */
17942 emit_insn (gen_addxf3 (e1, e1, op1));
17943
17944 /* op0 = log (e1) */
17945 emit_insn (gen_logxf2 (op0, e1));
17946}
17947
17948/* Output code to perform an atanh XFmode calculation. */
17949
152f243f
JJ
17950void
17951ix86_emit_i387_atanh (rtx op0, rtx op1)
2bf6d935
ML
17952{
17953 rtx e1 = gen_reg_rtx (XFmode);
17954 rtx e2 = gen_reg_rtx (XFmode);
17955 rtx scratch = gen_reg_rtx (HImode);
17956 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
17957 rtx half = const_double_from_real_value (dconsthalf, XFmode);
17958 rtx cst1, tmp;
17959 rtx_code_label *jump_label = gen_label_rtx ();
17960 rtx_insn *insn;
17961
17962 /* scratch = fxam (op1) */
17963 emit_insn (gen_fxamxf2_i387 (scratch, op1));
17964
17965 /* e2 = |op1| */
17966 emit_insn (gen_absxf2 (e2, op1));
17967
17968 /* e1 = -(e2 + e2) / (e2 + 1.0) */
17969 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17970 emit_insn (gen_addxf3 (e1, e2, cst1));
17971 emit_insn (gen_addxf3 (e2, e2, e2));
17972 emit_insn (gen_negxf2 (e2, e2));
17973 emit_insn (gen_divxf3 (e1, e2, e1));
17974
17975 /* e2 = log1p (e1) */
17976 ix86_emit_i387_log1p (e2, e1);
17977
17978 /* flags = signbit (op1) */
17979 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
17980
17981 /* if (!flags) then e2 = -e2 */
17982 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
17983 gen_rtx_NE (VOIDmode, flags, const0_rtx),
17984 gen_rtx_LABEL_REF (VOIDmode, jump_label),
17985 pc_rtx);
17986 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17987 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17988 JUMP_LABEL (insn) = jump_label;
17989
17990 emit_insn (gen_negxf2 (e2, e2));
17991
17992 emit_label (jump_label);
17993 LABEL_NUSES (jump_label) = 1;
17994
17995 /* op0 = 0.5 * e2 */
17996 half = force_reg (XFmode, half);
17997 emit_insn (gen_mulxf3 (op0, e2, half));
17998}
17999
18000/* Output code to perform a log1p XFmode calculation. */
18001
152f243f
JJ
18002void
18003ix86_emit_i387_log1p (rtx op0, rtx op1)
2bf6d935
ML
18004{
18005 rtx_code_label *label1 = gen_label_rtx ();
18006 rtx_code_label *label2 = gen_label_rtx ();
18007
18008 rtx tmp = gen_reg_rtx (XFmode);
18009 rtx res = gen_reg_rtx (XFmode);
18010 rtx cst, cstln2, cst1;
18011 rtx_insn *insn;
18012
d481d137
JJ
18013 /* The emit_jump call emits pending stack adjust, make sure it is emitted
18014 before the conditional jump, otherwise the stack adjustment will be
18015 only conditional. */
18016 do_pending_stack_adjust ();
18017
2bf6d935
ML
18018 cst = const_double_from_real_value
18019 (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode);
18020 cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */
18021
18022 emit_insn (gen_absxf2 (tmp, op1));
18023
18024 cst = force_reg (XFmode, cst);
18025 ix86_expand_branch (GE, tmp, cst, label1);
18026 predict_jump (REG_BR_PROB_BASE * 10 / 100);
18027 insn = get_last_insn ();
18028 JUMP_LABEL (insn) = label1;
18029
18030 emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2));
18031 emit_jump (label2);
18032
18033 emit_label (label1);
18034 LABEL_NUSES (label1) = 1;
18035
18036 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
18037 emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1)));
18038 emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2));
18039
18040 emit_label (label2);
18041 LABEL_NUSES (label2) = 1;
18042
18043 emit_move_insn (op0, res);
18044}
18045
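/* Note (not part of the original source): the cutoff constant used above,
   0.29289321881345247..., appears to be 1 - sqrt(2)/2, which matches the
   accepted argument magnitude of the i387 fyl2xp1 instruction; above it,
   the expansion falls back to fyl2x on 1 + op1.

     1 - sqrt(2)/2 = 1 - 0.70710678118654752... = 0.29289321881345247...
*/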
18046/* Emit code for round calculation. */
152f243f
JJ
18047void
18048ix86_emit_i387_round (rtx op0, rtx op1)
2bf6d935
ML
18049{
18050 machine_mode inmode = GET_MODE (op1);
18051 machine_mode outmode = GET_MODE (op0);
18052 rtx e1 = gen_reg_rtx (XFmode);
18053 rtx e2 = gen_reg_rtx (XFmode);
18054 rtx scratch = gen_reg_rtx (HImode);
18055 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
18056 rtx half = const_double_from_real_value (dconsthalf, XFmode);
18057 rtx res = gen_reg_rtx (outmode);
18058 rtx_code_label *jump_label = gen_label_rtx ();
18059 rtx (*floor_insn) (rtx, rtx);
18060 rtx (*neg_insn) (rtx, rtx);
18061 rtx_insn *insn;
18062 rtx tmp;
18063
18064 switch (inmode)
18065 {
18066 case E_SFmode:
18067 case E_DFmode:
18068 tmp = gen_reg_rtx (XFmode);
18069
18070 emit_insn (gen_rtx_SET (tmp, gen_rtx_FLOAT_EXTEND (XFmode, op1)));
18071 op1 = tmp;
18072 break;
18073 case E_XFmode:
18074 break;
18075 default:
18076 gcc_unreachable ();
18077 }
18078
18079 switch (outmode)
18080 {
18081 case E_SFmode:
18082 floor_insn = gen_frndintxf2_floor;
18083 neg_insn = gen_negsf2;
18084 break;
18085 case E_DFmode:
18086 floor_insn = gen_frndintxf2_floor;
18087 neg_insn = gen_negdf2;
18088 break;
18089 case E_XFmode:
18090 floor_insn = gen_frndintxf2_floor;
18091 neg_insn = gen_negxf2;
18092 break;
18093 case E_HImode:
18094 floor_insn = gen_lfloorxfhi2;
18095 neg_insn = gen_neghi2;
18096 break;
18097 case E_SImode:
18098 floor_insn = gen_lfloorxfsi2;
18099 neg_insn = gen_negsi2;
18100 break;
18101 case E_DImode:
18102 floor_insn = gen_lfloorxfdi2;
18103 neg_insn = gen_negdi2;
18104 break;
18105 default:
18106 gcc_unreachable ();
18107 }
18108
18109 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
18110
18111 /* scratch = fxam(op1) */
18112 emit_insn (gen_fxamxf2_i387 (scratch, op1));
18113
18114 /* e1 = fabs(op1) */
18115 emit_insn (gen_absxf2 (e1, op1));
18116
18117 /* e2 = e1 + 0.5 */
18118 half = force_reg (XFmode, half);
18119 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (XFmode, e1, half)));
18120
18121 /* res = floor(e2) */
18122 switch (outmode)
18123 {
18124 case E_SFmode:
18125 case E_DFmode:
18126 {
18127 tmp = gen_reg_rtx (XFmode);
18128
18129 emit_insn (floor_insn (tmp, e2));
18130 emit_insn (gen_rtx_SET (res,
18131 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp),
18132 UNSPEC_TRUNC_NOOP)));
18133 }
18134 break;
18135 default:
18136 emit_insn (floor_insn (res, e2));
18137 }
18138
18139 /* flags = signbit(a) */
18140 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
18141
18142 /* if (flags) then res = -res */
18143 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
18144 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
18145 gen_rtx_LABEL_REF (VOIDmode, jump_label),
18146 pc_rtx);
18147 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
18148 predict_jump (REG_BR_PROB_BASE * 50 / 100);
18149 JUMP_LABEL (insn) = jump_label;
18150
18151 emit_insn (neg_insn (res, res));
18152
18153 emit_label (jump_label);
18154 LABEL_NUSES (jump_label) = 1;
18155
18156 emit_move_insn (op0, res);
18157}
18158
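/* Illustrative sketch (not part of the original source): the rounding rule
   the expansion above implements, ties away from zero, as plain C.

     #include <math.h>
     double
     model_round (double a)
     {
       double r = floor (fabs (a) + 0.5);	// res = floor (fabs (a) + 0.5)
       return signbit (a) ? -r : r;		// if (flags) then res = -res
     }
*/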
18159/* Output code to perform a Newton-Raphson approximation of a single precision
18160 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
18161
152f243f
JJ
18162void
18163ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
2bf6d935
ML
18164{
18165 rtx x0, x1, e0, e1;
18166
18167 x0 = gen_reg_rtx (mode);
18168 e0 = gen_reg_rtx (mode);
18169 e1 = gen_reg_rtx (mode);
18170 x1 = gen_reg_rtx (mode);
18171
18172 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
18173
18174 b = force_reg (mode, b);
18175
18176 /* x0 = rcp(b) estimate */
18177 if (mode == V16SFmode || mode == V8DFmode)
18178 {
18179 if (TARGET_AVX512ER)
18180 {
18181 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
18182 UNSPEC_RCP28)));
18183 /* res = a * x0 */
18184 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
18185 return;
18186 }
18187 else
18188 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
18189 UNSPEC_RCP14)));
18190 }
18191 else
18192 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
18193 UNSPEC_RCP)));
18194
18195 /* e0 = x0 * b */
18196 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
18197
18198 /* e0 = x0 * e0 */
18199 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
18200
18201 /* e1 = x0 + x0 */
18202 emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
18203
18204 /* x1 = e1 - e0 */
18205 emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
18206
18207 /* res = a * x1 */
18208 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
18209}
18210
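/* Illustrative sketch (not part of the original source): one Newton-Raphson
   refinement step on a hardware reciprocal estimate, as plain C.  The
   estimate is passed in here because rcpss/rcpps has no portable scalar
   equivalent.

     a / b ~= a * (2 * x0 - b * x0 * x0),   x0 ~= 1 / b

     float
     model_swdiv (float a, float b, float x0)
     {
       // x0 is the hardware rcp (b) estimate, passed in for portability
       float e0 = x0 * b;
       float x1 = (x0 + x0) - x0 * e0;	// x1 = e1 - e0
       return a * x1;			// res = a * x1
     }
*/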
18211/* Output code to perform a Newton-Raphson approximation of a
18212 single precision floating point [reciprocal] square root. */
18213
152f243f
JJ
18214void
18215ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
2bf6d935
ML
18216{
18217 rtx x0, e0, e1, e2, e3, mthree, mhalf;
18218 REAL_VALUE_TYPE r;
18219 int unspec;
18220
18221 x0 = gen_reg_rtx (mode);
18222 e0 = gen_reg_rtx (mode);
18223 e1 = gen_reg_rtx (mode);
18224 e2 = gen_reg_rtx (mode);
18225 e3 = gen_reg_rtx (mode);
18226
18227 if (TARGET_AVX512ER && mode == V16SFmode)
18228 {
18229 if (recip)
18230 /* res = rsqrt28(a) estimate */
18231 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
18232 UNSPEC_RSQRT28)));
18233 else
18234 {
18235 /* x0 = rsqrt28(a) estimate */
18236 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
18237 UNSPEC_RSQRT28)));
18238 /* res = rcp28(x0) estimate */
18239 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
18240 UNSPEC_RCP28)));
18241 }
18242 return;
18243 }
18244
18245 real_from_integer (&r, VOIDmode, -3, SIGNED);
18246 mthree = const_double_from_real_value (r, SFmode);
18247
18248 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
18249 mhalf = const_double_from_real_value (r, SFmode);
18250 unspec = UNSPEC_RSQRT;
18251
18252 if (VECTOR_MODE_P (mode))
18253 {
18254 mthree = ix86_build_const_vector (mode, true, mthree);
18255 mhalf = ix86_build_const_vector (mode, true, mhalf);
18256 /* There is no 512-bit rsqrt. There is however rsqrt14. */
18257 if (GET_MODE_SIZE (mode) == 64)
18258 unspec = UNSPEC_RSQRT14;
18259 }
18260
18261 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
18262 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
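  /* Both formulas are the standard Newton-Raphson refinement for
     y = 1/sqrt(a), x1 = 0.5 * x0 * (3 - a * x0 * x0), with the factors
     negated so the -3.0 and -0.5 constants above can be used directly.
     A scalar sketch, with rsqrt_estimate () as a hypothetical stand-in
     for the hardware rsqrtss/rsqrt14 estimate:

       float swrsqrt (float a)
       {
         float x0 = rsqrt_estimate (a);               // rough 1/sqrt(a)
         return -0.5f * x0 * (a * x0 * x0 - 3.0f);    // one refinement step
       }

     sqrt(a) is then obtained as a * swrsqrt(a), which is what the extra
     factor of a (e0 below) implements.  */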
18263
18264 a = force_reg (mode, a);
18265
18266 /* x0 = rsqrt(a) estimate */
18267 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
18268 unspec)));
18269
18270 /* If a == 0.0, zero out the infinite rsqrt estimate to prevent NaN for sqrt(0.0). */
18271 if (!recip)
18272 {
18273 rtx zero = force_reg (mode, CONST0_RTX(mode));
18274 rtx mask;
18275
18276 /* Handle masked compare. */
18277 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
18278 {
18279 mask = gen_reg_rtx (HImode);
18280 /* Imm value 0x4 corresponds to not-equal comparison. */
18281 emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
18282 emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
18283 }
18284 else
18285 {
18286 mask = gen_reg_rtx (mode);
18287 emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
18288 emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
18289 }
18290 }
18291
fab263ab
L
18292 mthree = force_reg (mode, mthree);
18293
2bf6d935
ML
18294 /* e0 = x0 * a */
18295 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
2bf6d935 18296
a6645a82
L
18297 unsigned vector_size = GET_MODE_SIZE (mode);
18298 if (TARGET_FMA
18299 || (TARGET_AVX512F && vector_size == 64)
18300 || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
fab263ab
L
18301 emit_insn (gen_rtx_SET (e2,
18302 gen_rtx_FMA (mode, e0, x0, mthree)));
18303 else
18304 {
18305 /* e1 = e0 * x0 */
18306 emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
18307
18308 /* e2 = e1 - 3. */
18309 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
18310 }
2bf6d935
ML
18311
18312 mhalf = force_reg (mode, mhalf);
18313 if (recip)
18314 /* e3 = -.5 * x0 */
18315 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
18316 else
18317 /* e3 = -.5 * e0 */
18318 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
18319 /* ret = e2 * e3 */
18320 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
18321}
18322
18323/* Expand fabs (OP0) and return a new rtx that holds the result. The
18324 mask for masking out the sign-bit is stored in *SMASK, if that is
18325 non-null. */
18326
18327static rtx
18328ix86_expand_sse_fabs (rtx op0, rtx *smask)
18329{
18330 machine_mode vmode, mode = GET_MODE (op0);
18331 rtx xa, mask;
18332
18333 xa = gen_reg_rtx (mode);
18334 if (mode == SFmode)
18335 vmode = V4SFmode;
18336 else if (mode == DFmode)
18337 vmode = V2DFmode;
18338 else
18339 vmode = mode;
18340 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
18341 if (!VECTOR_MODE_P (mode))
18342 {
18343 /* We need to generate a scalar mode mask in this case. */
18344 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
18345 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
18346 mask = gen_reg_rtx (mode);
18347 emit_insn (gen_rtx_SET (mask, tmp));
18348 }
18349 emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
18350
18351 if (smask)
18352 *smask = mask;
18353
18354 return xa;
18355}
18356
18357/* Expands a comparison of OP0 with OP1 using comparison code CODE,
18358 swapping the operands if SWAP_OPERANDS is true. The expanded
18359 code is a forward jump to a newly created label in case the
18360 comparison is true. The generated label rtx is returned. */
18361static rtx_code_label *
18362ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
18363 bool swap_operands)
18364{
18365 bool unordered_compare = ix86_unordered_fp_compare (code);
18366 rtx_code_label *label;
18367 rtx tmp, reg;
18368
18369 if (swap_operands)
18370 std::swap (op0, op1);
18371
18372 label = gen_label_rtx ();
18373 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
18374 if (unordered_compare)
18375 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
18376 reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
18377 emit_insn (gen_rtx_SET (reg, tmp));
18378 tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
18379 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
18380 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
18381 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
18382 JUMP_LABEL (tmp) = label;
18383
18384 return label;
18385}
18386
18387/* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
18388 using comparison code CODE. Operands are swapped for the comparison if
18389 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
18390static rtx
18391ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
18392 bool swap_operands)
18393{
18394 rtx (*insn)(rtx, rtx, rtx, rtx);
18395 machine_mode mode = GET_MODE (op0);
18396 rtx mask = gen_reg_rtx (mode);
18397
18398 if (swap_operands)
18399 std::swap (op0, op1);
18400
18401 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
18402
18403 emit_insn (insn (mask, op0, op1,
18404 gen_rtx_fmt_ee (code, mode, op0, op1)));
18405 return mask;
18406}
18407
18408/* Expand copysign from SIGN to the positive value ABS_VALUE
18409 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
18410 the sign-bit. */
18411
18412static void
18413ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
18414{
18415 machine_mode mode = GET_MODE (sign);
18416 rtx sgn = gen_reg_rtx (mode);
18417 if (mask == NULL_RTX)
18418 {
18419 machine_mode vmode;
18420
18421 if (mode == SFmode)
18422 vmode = V4SFmode;
18423 else if (mode == DFmode)
18424 vmode = V2DFmode;
18425 else
18426 vmode = mode;
18427
18428 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
18429 if (!VECTOR_MODE_P (mode))
18430 {
18431 /* We need to generate a scalar mode mask in this case. */
18432 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
18433 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
18434 mask = gen_reg_rtx (mode);
18435 emit_insn (gen_rtx_SET (mask, tmp));
18436 }
18437 }
18438 else
18439 mask = gen_rtx_NOT (mode, mask);
18440 emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
18441 emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
18442}
18443
18444/* Expand SSE sequence for computing lround from OP1 storing
18445 into OP0. */
18446
18447void
18448ix86_expand_lround (rtx op0, rtx op1)
18449{
18450 /* C code for the stuff we're doing below:
d2754fbb
UB
18451 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
18452 return (long)tmp;
2bf6d935
ML
18453 */
18454 machine_mode mode = GET_MODE (op1);
18455 const struct real_format *fmt;
18456 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
18457 rtx adj;
18458
18459 /* load nextafter (0.5, 0.0) */
18460 fmt = REAL_MODE_FORMAT (mode);
18461 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
18462 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
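  /* pred_half is 0.5 - 2**(-p-1), i.e. nextafter (0.5, 0.0), the largest
     representable value strictly below 0.5.  Using it instead of plain 0.5
     keeps values just below one half from rounding up through the addition;
     e.g. in DFmode (illustrative values):

       0.49999999999999994 + 0.5        ==  1.0  (the sum rounds up, wrong)
       0.49999999999999994 + pred_half  ==  0.9999999999999999  ->  0

     so the subsequent conversion truncates to the correct result.  */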
18463
18464 /* adj = copysign (0.5, op1) */
18465 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
18466 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
18467
18468 /* adj = op1 + adj */
18469 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
18470
18471 /* op0 = (imode)adj */
18472 expand_fix (op0, adj, 0);
18473}
18474
18475/* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
18476 into OPERAND0. */
18477
18478void
18479ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
18480{
18481 /* C code for the stuff we're doing below (for do_floor):
18482 xi = (long)op1;
d2754fbb
UB
18483 xi -= (double)xi > op1 ? 1 : 0;
18484 return xi;
2bf6d935
ML
18485 */
18486 machine_mode fmode = GET_MODE (op1);
18487 machine_mode imode = GET_MODE (op0);
18488 rtx ireg, freg, tmp;
18489 rtx_code_label *label;
18490
18491 /* reg = (long)op1 */
18492 ireg = gen_reg_rtx (imode);
18493 expand_fix (ireg, op1, 0);
18494
18495 /* freg = (double)reg */
18496 freg = gen_reg_rtx (fmode);
18497 expand_float (freg, ireg, 0);
18498
18499 /* ireg = (freg > op1) ? ireg - 1 : ireg */
18500 label = ix86_expand_sse_compare_and_jump (UNLE,
18501 freg, op1, !do_floor);
18502 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
18503 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
18504 emit_move_insn (ireg, tmp);
18505
18506 emit_label (label);
18507 LABEL_NUSES (label) = 1;
18508
18509 emit_move_insn (op0, ireg);
18510}
18511
18512/* Generate and return a rtx of mode MODE for 2**n where n is the number
18513 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
18514
18515static rtx
18516ix86_gen_TWO52 (machine_mode mode)
18517{
d2754fbb 18518 const struct real_format *fmt;
2bf6d935
ML
18519 REAL_VALUE_TYPE TWO52r;
18520 rtx TWO52;
18521
d2754fbb
UB
18522 fmt = REAL_MODE_FORMAT (mode);
18523 real_2expN (&TWO52r, fmt->p - 1, mode);
2bf6d935
ML
18524 TWO52 = const_double_from_real_value (TWO52r, mode);
18525 TWO52 = force_reg (mode, TWO52);
18526
18527 return TWO52;
18528}
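/* The callers below use TWO52 for the usual round-to-integer trick: once
   |x| < 2**52 (DFmode; 2**23 for SFmode), the sum x + 2**52 lies in a binade
   whose unit in the last place is 1.0, so the addition itself rounds away
   the fraction bits and subtracting 2**52 again leaves x rounded to an
   integer in the current rounding mode.  A scalar sketch of the idea,
   assuming the compiler does not fold the arithmetic away:

     double rint_via_two52 (double x)              // requires 0 <= x < 2**52
     {
       const double two52 = 4503599627370496.0;    // 2**52
       return (x + two52) - two52;
     }
*/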
18529
18530/* Expand rint rounding OPERAND1 and storing the result in OPERAND0. */
18531
18532void
18533ix86_expand_rint (rtx operand0, rtx operand1)
18534{
18535 /* C code for the stuff we're doing below:
18536 xa = fabs (operand1);
d2754fbb 18537 if (!isless (xa, 2**52))
2bf6d935 18538 return operand1;
d2754fbb
UB
18539 two52 = 2**52;
18540 if (flag_rounding_math)
2bf6d935
ML
18541 {
18542 two52 = copysign (two52, operand1);
18543 xa = operand1;
18544 }
d2754fbb
UB
18545 xa = xa + two52 - two52;
18546 return copysign (xa, operand1);
2bf6d935
ML
18547 */
18548 machine_mode mode = GET_MODE (operand0);
81615bb0 18549 rtx res, xa, TWO52, mask;
2bf6d935
ML
18550 rtx_code_label *label;
18551
d2754fbb
UB
18552 TWO52 = ix86_gen_TWO52 (mode);
18553
18554 /* Temporary for holding the result, initialized to the input
18555 operand to ease control flow. */
18556 res = copy_to_reg (operand1);
2bf6d935
ML
18557
18558 /* xa = abs (operand1) */
18559 xa = ix86_expand_sse_fabs (res, &mask);
18560
18561 /* if (!isless (xa, TWO52)) goto label; */
2bf6d935
ML
18562 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18563
2bf6d935
ML
18564 if (flag_rounding_math)
18565 {
81615bb0 18566 ix86_sse_copysign_to_positive (TWO52, TWO52, res, mask);
2bf6d935
ML
18567 xa = res;
18568 }
18569
81615bb0
UB
18570 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
18571 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
18572
18573 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
18574 if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
18575 xa = ix86_expand_sse_fabs (xa, NULL);
2bf6d935
ML
18576
18577 ix86_sse_copysign_to_positive (res, xa, res, mask);
18578
18579 emit_label (label);
18580 LABEL_NUSES (label) = 1;
18581
18582 emit_move_insn (operand0, res);
18583}
18584
36d387f2
UB
18585/* Expand SSE2 sequence for computing floor or ceil
18586 from OPERAND1 storing into OPERAND0. */
2bf6d935
ML
18587void
18588ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
18589{
18590 /* C code for the stuff we expand below.
18591 double xa = fabs (x), x2;
d2754fbb
UB
18592 if (!isless (xa, TWO52))
18593 return x;
2bf6d935 18594 x2 = (double)(long)x;
337ed0eb 18595
2bf6d935
ML
18596 Compensate. Floor:
18597 if (x2 > x)
18598 x2 -= 1;
18599 Compensate. Ceil:
18600 if (x2 < x)
18601 x2 += 1;
337ed0eb 18602
2bf6d935
ML
18603 if (HONOR_SIGNED_ZEROS (mode))
18604 return copysign (x2, x);
18605 return x2;
18606 */
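  /* Worked example (illustrative): for x = -2.7 with do_floor, (long)x
     truncates toward zero giving x2 = -2.0; since -2.0 > -2.7 the
     compensation subtracts 1, yielding floor(-2.7) = -3.  For ceil of
     x = 2.3, x2 = 2.0 < 2.3, so the compensation adds 1, giving 3.  */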
18607 machine_mode mode = GET_MODE (operand0);
18608 rtx xa, xi, TWO52, tmp, one, res, mask;
18609 rtx_code_label *label;
18610
18611 TWO52 = ix86_gen_TWO52 (mode);
18612
18613 /* Temporary for holding the result, initialized to the input
18614 operand to ease control flow. */
d2754fbb 18615 res = copy_to_reg (operand1);
2bf6d935
ML
18616
18617 /* xa = abs (operand1) */
18618 xa = ix86_expand_sse_fabs (res, &mask);
18619
18620 /* if (!isless (xa, TWO52)) goto label; */
18621 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18622
18623 /* xa = (double)(long)x */
d2754fbb 18624 xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
2bf6d935
ML
18625 expand_fix (xi, res, 0);
18626 expand_float (xa, xi, 0);
18627
18628 /* generate 1.0 */
18629 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
18630
18631 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
18632 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
18633 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
18634 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
18635 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
2bf6d935 18636 if (HONOR_SIGNED_ZEROS (mode))
337ed0eb
UB
18637 {
18638 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
18639 if (do_floor && flag_rounding_math)
18640 tmp = ix86_expand_sse_fabs (tmp, NULL);
18641
18642 ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
18643 }
18644 emit_move_insn (res, tmp);
2bf6d935
ML
18645
18646 emit_label (label);
18647 LABEL_NUSES (label) = 1;
18648
18649 emit_move_insn (operand0, res);
18650}
18651
36d387f2
UB
18652/* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
18653 into OPERAND0 without relying on DImode truncation via cvttsd2siq
18654 that is only available on 64bit targets. */
2bf6d935 18655void
36d387f2 18656ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
2bf6d935
ML
18657{
18658 /* C code for the stuff we expand below.
d2754fbb
UB
18659 double xa = fabs (x), x2;
18660 if (!isless (xa, TWO52))
18661 return x;
18662 xa = xa + TWO52 - TWO52;
18663 x2 = copysign (xa, x);
337ed0eb 18664
36d387f2 18665 Compensate. Floor:
d2754fbb
UB
18666 if (x2 > x)
18667 x2 -= 1;
36d387f2 18668 Compensate. Ceil:
d2754fbb
UB
18669 if (x2 < x)
18670 x2 += 1;
337ed0eb 18671
36d387f2
UB
18672 if (HONOR_SIGNED_ZEROS (mode))
18673 x2 = copysign (x2, x);
18674 return x2;
2bf6d935
ML
18675 */
18676 machine_mode mode = GET_MODE (operand0);
36d387f2 18677 rtx xa, TWO52, tmp, one, res, mask;
2bf6d935
ML
18678 rtx_code_label *label;
18679
18680 TWO52 = ix86_gen_TWO52 (mode);
18681
18682 /* Temporary for holding the result, initialized to the input
18683 operand to ease control flow. */
d2754fbb 18684 res = copy_to_reg (operand1);
2bf6d935
ML
18685
18686 /* xa = abs (operand1) */
18687 xa = ix86_expand_sse_fabs (res, &mask);
18688
18689 /* if (!isless (xa, TWO52)) goto label; */
18690 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18691
36d387f2
UB
18692 /* xa = xa + TWO52 - TWO52; */
18693 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
18694 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
2bf6d935 18695
36d387f2
UB
18696 /* xa = copysign (xa, operand1) */
18697 ix86_sse_copysign_to_positive (xa, xa, res, mask);
2bf6d935 18698
36d387f2
UB
18699 /* generate 1.0 */
18700 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
2bf6d935 18701
36d387f2
UB
18702 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
18703 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
18704 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
18705 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
18706 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
337ed0eb
UB
18707 if (HONOR_SIGNED_ZEROS (mode))
18708 {
18709 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
18710 if (do_floor && flag_rounding_math)
18711 tmp = ix86_expand_sse_fabs (tmp, NULL);
18712
18713 ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
18714 }
36d387f2 18715 emit_move_insn (res, tmp);
2bf6d935
ML
18716
18717 emit_label (label);
18718 LABEL_NUSES (label) = 1;
18719
18720 emit_move_insn (operand0, res);
18721}
18722
36d387f2
UB
18723/* Expand SSE sequence for computing trunc
18724 from OPERAND1 storing into OPERAND0. */
2bf6d935
ML
18725void
18726ix86_expand_trunc (rtx operand0, rtx operand1)
18727{
18728 /* C code for SSE variant we expand below.
d2754fbb
UB
18729 double xa = fabs (x), x2;
18730 if (!isless (xa, TWO52))
18731 return x;
18732 x2 = (double)(long)x;
2bf6d935
ML
18733 if (HONOR_SIGNED_ZEROS (mode))
18734 return copysign (x2, x);
18735 return x2;
18736 */
18737 machine_mode mode = GET_MODE (operand0);
18738 rtx xa, xi, TWO52, res, mask;
18739 rtx_code_label *label;
18740
18741 TWO52 = ix86_gen_TWO52 (mode);
18742
18743 /* Temporary for holding the result, initialized to the input
18744 operand to ease control flow. */
d2754fbb 18745 res = copy_to_reg (operand1);
2bf6d935
ML
18746
18747 /* xa = abs (operand1) */
18748 xa = ix86_expand_sse_fabs (res, &mask);
18749
18750 /* if (!isless (xa, TWO52)) goto label; */
18751 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18752
97d3ddcf 18753 /* xa = (double)(long)x */
d2754fbb 18754 xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
2bf6d935 18755 expand_fix (xi, res, 0);
97d3ddcf 18756 expand_float (xa, xi, 0);
2bf6d935
ML
18757
18758 if (HONOR_SIGNED_ZEROS (mode))
97d3ddcf
UB
18759 ix86_sse_copysign_to_positive (xa, xa, res, mask);
18760
18761 emit_move_insn (res, xa);
2bf6d935
ML
18762
18763 emit_label (label);
18764 LABEL_NUSES (label) = 1;
18765
18766 emit_move_insn (operand0, res);
18767}
18768
18769/* Expand SSE sequence for computing trunc from OPERAND1 storing
36d387f2
UB
18770 into OPERAND0 without relying on DImode truncation via cvttsd2siq
18771 that is only available on 64bit targets. */
2bf6d935
ML
18772void
18773ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
18774{
18775 machine_mode mode = GET_MODE (operand0);
c142ae5e 18776 rtx xa, xa2, TWO52, tmp, one, res, mask;
2bf6d935
ML
18777 rtx_code_label *label;
18778
18779 /* C code for SSE variant we expand below.
d2754fbb
UB
18780 double xa = fabs (x), x2;
18781 if (!isless (xa, TWO52))
18782 return x;
18783 xa2 = xa + TWO52 - TWO52;
2bf6d935 18784 Compensate:
d2754fbb
UB
18785 if (xa2 > xa)
18786 xa2 -= 1.0;
18787 x2 = copysign (xa2, x);
18788 return x2;
2bf6d935
ML
18789 */
18790
18791 TWO52 = ix86_gen_TWO52 (mode);
18792
18793 /* Temporary for holding the result, initialized to the input
18794 operand to ease control flow. */
d2754fbb 18795 res = copy_to_reg (operand1);
2bf6d935
ML
18796
18797 /* xa = abs (operand1) */
c142ae5e 18798 xa = ix86_expand_sse_fabs (res, &mask);
2bf6d935
ML
18799
18800 /* if (!isless (xa, TWO52)) goto label; */
18801 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18802
c142ae5e
UB
18803 /* xa2 = xa + TWO52 - TWO52; */
18804 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
18805 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
2bf6d935
ML
18806
18807 /* generate 1.0 */
18808 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
18809
c142ae5e
UB
18810 /* Compensate: xa2 = xa2 - (xa2 > xa ? 1 : 0) */
18811 tmp = ix86_expand_sse_compare_mask (UNGT, xa2, xa, false);
18812 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
2bf6d935 18813 tmp = expand_simple_binop (mode, MINUS,
c142ae5e
UB
18814 xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
18815 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
81615bb0 18816 if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
c142ae5e 18817 tmp = ix86_expand_sse_fabs (tmp, NULL);
2bf6d935 18818
c142ae5e
UB
18819 /* res = copysign (xa2, operand1) */
18820 ix86_sse_copysign_to_positive (res, tmp, res, mask);
2bf6d935
ML
18821
18822 emit_label (label);
18823 LABEL_NUSES (label) = 1;
18824
18825 emit_move_insn (operand0, res);
18826}
18827
36d387f2
UB
18828/* Expand SSE sequence for computing round
18829 from OPERAND1 storing into OPERAND0. */
2bf6d935
ML
18830void
18831ix86_expand_round (rtx operand0, rtx operand1)
18832{
18833 /* C code for the stuff we're doing below:
d2754fbb
UB
18834 double xa = fabs (x);
18835 if (!isless (xa, TWO52))
18836 return x;
18837 xa = (double)(long)(xa + nextafter (0.5, 0.0));
18838 return copysign (xa, x);
2bf6d935
ML
18839 */
18840 machine_mode mode = GET_MODE (operand0);
18841 rtx res, TWO52, xa, xi, half, mask;
18842 rtx_code_label *label;
18843 const struct real_format *fmt;
18844 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
18845
18846 /* Temporary for holding the result, initialized to the input
18847 operand to ease control flow. */
d2754fbb 18848 res = copy_to_reg (operand1);
2bf6d935
ML
18849
18850 TWO52 = ix86_gen_TWO52 (mode);
18851 xa = ix86_expand_sse_fabs (res, &mask);
18852 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18853
18854 /* load nextafter (0.5, 0.0) */
18855 fmt = REAL_MODE_FORMAT (mode);
18856 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
18857 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
18858
18859 /* xa = xa + 0.5 */
18860 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
18861 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
18862
18863 /* xa = (double)(int64_t)xa */
d2754fbb 18864 xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
2bf6d935
ML
18865 expand_fix (xi, xa, 0);
18866 expand_float (xa, xi, 0);
18867
18868 /* res = copysign (xa, operand1) */
97d3ddcf 18869 ix86_sse_copysign_to_positive (res, xa, res, mask);
2bf6d935
ML
18870
18871 emit_label (label);
18872 LABEL_NUSES (label) = 1;
18873
18874 emit_move_insn (operand0, res);
18875}
18876
36d387f2
UB
18877/* Expand SSE sequence for computing round from OPERAND1 storing
18878 into OPERAND0 without relying on DImode truncation via cvttsd2siq
18879 that is only available on 64bit targets. */
18880void
18881ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
18882{
18883 /* C code for the stuff we expand below.
d2754fbb
UB
18884 double xa = fabs (x), xa2, x2;
18885 if (!isless (xa, TWO52))
18886 return x;
36d387f2
UB
18887 Using the absolute value and copying back sign makes
18888 -0.0 -> -0.0 correct.
d2754fbb 18889 xa2 = xa + TWO52 - TWO52;
36d387f2
UB
18890 Compensate.
18891 dxa = xa2 - xa;
d2754fbb
UB
18892 if (dxa <= -0.5)
18893 xa2 += 1;
18894 else if (dxa > 0.5)
18895 xa2 -= 1;
18896 x2 = copysign (xa2, x);
18897 return x2;
36d387f2
UB
18898 */
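  /* Worked example (illustrative, DFmode): for x = 2.5 the sum 2.5 + 2**52
     rounds to the even neighbour, so xa2 = 2.0 and dxa = -0.5; the
     "dxa <= -0.5" branch then bumps xa2 to 3.0.  For x = 3.5, xa2 = 4.0 and
     dxa = 0.5, neither branch fires, and the result stays 4.0.  Both match
     round()'s halfway-away-from-zero semantics.  */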
18899 machine_mode mode = GET_MODE (operand0);
18900 rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
18901 rtx_code_label *label;
18902
18903 TWO52 = ix86_gen_TWO52 (mode);
18904
18905 /* Temporary for holding the result, initialized to the input
18906 operand to ease control flow. */
d2754fbb 18907 res = copy_to_reg (operand1);
36d387f2
UB
18908
18909 /* xa = abs (operand1) */
18910 xa = ix86_expand_sse_fabs (res, &mask);
18911
18912 /* if (!isless (xa, TWO52)) goto label; */
18913 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18914
18915 /* xa2 = xa + TWO52 - TWO52; */
18916 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
18917 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
18918
18919 /* dxa = xa2 - xa; */
18920 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
18921
18922 /* generate 0.5, 1.0 and -0.5 */
18923 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
18924 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
18925 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
18926 0, OPTAB_DIRECT);
18927
18928 /* Compensate. */
18929 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
18930 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
18931 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
18932 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
18933 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
18934 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
18935 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
18936 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
18937
18938 /* res = copysign (xa2, operand1) */
97d3ddcf 18939 ix86_sse_copysign_to_positive (res, xa2, res, mask);
36d387f2
UB
18940
18941 emit_label (label);
18942 LABEL_NUSES (label) = 1;
18943
18944 emit_move_insn (operand0, res);
18945}
18946
2bf6d935
ML
18947/* Expand SSE sequence for computing round
18948 from OP1 storing into OP0 using sse4 round insn. */
18949void
18950ix86_expand_round_sse4 (rtx op0, rtx op1)
18951{
18952 machine_mode mode = GET_MODE (op0);
18953 rtx e1, e2, res, half;
18954 const struct real_format *fmt;
18955 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
18956 rtx (*gen_copysign) (rtx, rtx, rtx);
18957 rtx (*gen_round) (rtx, rtx, rtx);
18958
18959 switch (mode)
18960 {
18961 case E_SFmode:
18962 gen_copysign = gen_copysignsf3;
18963 gen_round = gen_sse4_1_roundsf2;
18964 break;
18965 case E_DFmode:
18966 gen_copysign = gen_copysigndf3;
18967 gen_round = gen_sse4_1_rounddf2;
18968 break;
18969 default:
18970 gcc_unreachable ();
18971 }
18972
18973 /* round (a) = trunc (a + copysign (0.5, a)) */
18974
18975 /* load nextafter (0.5, 0.0) */
18976 fmt = REAL_MODE_FORMAT (mode);
18977 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
18978 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
18979 half = const_double_from_real_value (pred_half, mode);
18980
18981 /* e1 = copysign (0.5, op1) */
18982 e1 = gen_reg_rtx (mode);
18983 emit_insn (gen_copysign (e1, half, op1));
18984
18985 /* e2 = op1 + e1 */
18986 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
18987
18988 /* res = trunc (e2) */
18989 res = gen_reg_rtx (mode);
18990 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
18991
18992 emit_move_insn (op0, res);
18993}
18994
18995/* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
18996 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
18997 insn every time. */
18998
18999static GTY(()) rtx_insn *vselect_insn;
19000
19001/* Initialize vselect_insn. */
19002
19003static void
19004init_vselect_insn (void)
19005{
19006 unsigned i;
19007 rtx x;
19008
19009 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
19010 for (i = 0; i < MAX_VECT_LEN; ++i)
19011 XVECEXP (x, 0, i) = const0_rtx;
19012 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
19013 const0_rtx), x);
19014 x = gen_rtx_SET (const0_rtx, x);
19015 start_sequence ();
19016 vselect_insn = emit_insn (x);
19017 end_sequence ();
19018}
19019
19020/* Construct (set target (vec_select op0 (parallel perm))) and
19021 return true if that's a valid instruction in the active ISA. */
19022
19023static bool
19024expand_vselect (rtx target, rtx op0, const unsigned char *perm,
19025 unsigned nelt, bool testing_p)
19026{
19027 unsigned int i;
19028 rtx x, save_vconcat;
19029 int icode;
19030
19031 if (vselect_insn == NULL_RTX)
19032 init_vselect_insn ();
19033
19034 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
19035 PUT_NUM_ELEM (XVEC (x, 0), nelt);
19036 for (i = 0; i < nelt; ++i)
19037 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
19038 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
19039 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
19040 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
19041 SET_DEST (PATTERN (vselect_insn)) = target;
19042 icode = recog_memoized (vselect_insn);
19043
19044 if (icode >= 0 && !testing_p)
19045 emit_insn (copy_rtx (PATTERN (vselect_insn)));
19046
19047 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
19048 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
19049 INSN_CODE (vselect_insn) = -1;
19050
19051 return icode >= 0;
19052}
19053
19054/* Similar, but generate a vec_concat from op0 and op1 as well. */
19055
19056static bool
19057expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
19058 const unsigned char *perm, unsigned nelt,
19059 bool testing_p)
19060{
19061 machine_mode v2mode;
19062 rtx x;
19063 bool ok;
19064
19065 if (vselect_insn == NULL_RTX)
19066 init_vselect_insn ();
19067
19068 if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
19069 return false;
19070 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
19071 PUT_MODE (x, v2mode);
19072 XEXP (x, 0) = op0;
19073 XEXP (x, 1) = op1;
19074 ok = expand_vselect (target, x, perm, nelt, testing_p);
19075 XEXP (x, 0) = const0_rtx;
19076 XEXP (x, 1) = const0_rtx;
19077 return ok;
19078}
19079
4bf4c103 19080/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
2bf6d935
ML
19081 using movss or movsd. */
19082static bool
19083expand_vec_perm_movs (struct expand_vec_perm_d *d)
19084{
19085 machine_mode vmode = d->vmode;
19086 unsigned i, nelt = d->nelt;
19087 rtx x;
19088
19089 if (d->one_operand_p)
19090 return false;
19091
4bcc3b5c
UB
19092 if (!(TARGET_SSE && (vmode == V4SFmode || vmode == V4SImode))
19093 && !(TARGET_MMX_WITH_SSE && (vmode == V2SFmode || vmode == V2SImode))
19094 && !(TARGET_SSE2 && (vmode == V2DFmode || vmode == V2DImode)))
2bf6d935
ML
19095 return false;
19096
19097 /* Only the first element is changed. */
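  /* E.g. for V4SFmode the permutation { 4, 1, 2, 3 } takes element 0 from
     op1 and the remaining elements from op0, which is exactly what movss
     implements (illustrative example).  */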
19098 if (d->perm[0] != nelt && d->perm[0] != 0)
19099 return false;
19100 for (i = 1; i < nelt; ++i)
19101 if (d->perm[i] != i + nelt - d->perm[0])
19102 return false;
19103
19104 if (d->testing_p)
19105 return true;
19106
19107 if (d->perm[0] == nelt)
19108 x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
19109 else
19110 x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));
19111
19112 emit_insn (gen_rtx_SET (d->target, x));
19113
19114 return true;
19115}
19116
95b99e47
UB
19117/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
19118 using insertps. */
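/* As an illustration, the V4SFmode permutation { 0, 1, 6, 3 } keeps
   elements 0, 1 and 3 from op0 and takes result element 2 from op1's
   element 2; insertps encodes that as source slot 2, destination slot 2
   (illustrative example, one of the shapes the loops below recognize).  */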
19119static bool
19120expand_vec_perm_insertps (struct expand_vec_perm_d *d)
19121{
19122 machine_mode vmode = d->vmode;
19123 unsigned i, cnt_s, nelt = d->nelt;
19124 int cnt_d = -1;
19125 rtx src, dst;
19126
19127 if (d->one_operand_p)
19128 return false;
19129
19130 if (!(TARGET_SSE4_1
19131 && (vmode == V4SFmode || vmode == V4SImode
19132 || (TARGET_MMX_WITH_SSE
19133 && (vmode == V2SFmode || vmode == V2SImode)))))
19134 return false;
19135
19136 for (i = 0; i < nelt; ++i)
19137 {
19138 if (d->perm[i] == i)
19139 continue;
19140 if (cnt_d != -1)
19141 {
19142 cnt_d = -1;
19143 break;
19144 }
19145 cnt_d = i;
19146 }
19147
19148 if (cnt_d == -1)
19149 {
19150 for (i = 0; i < nelt; ++i)
19151 {
19152 if (d->perm[i] == i + nelt)
19153 continue;
19154 if (cnt_d != -1)
19155 return false;
19156 cnt_d = i;
19157 }
19158
19159 if (cnt_d == -1)
19160 return false;
19161 }
19162
19163 if (d->testing_p)
19164 return true;
19165
19166 gcc_assert (cnt_d != -1);
19167
19168 cnt_s = d->perm[cnt_d];
19169 if (cnt_s < nelt)
19170 {
19171 src = d->op0;
19172 dst = d->op1;
19173 }
19174 else
19175 {
19176 cnt_s -= nelt;
19177 src = d->op1;
19178 dst = d->op0;
19179 }
19180 gcc_assert (cnt_s < nelt);
19181
19182 rtx x = gen_sse4_1_insertps (vmode, d->target, dst, src,
19183 GEN_INT (cnt_s << 6 | cnt_d << 4));
19184 emit_insn (x);
19185
19186 return true;
19187}
19188
4bf4c103 19189/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
2bf6d935
ML
19190 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
19191
19192static bool
19193expand_vec_perm_blend (struct expand_vec_perm_d *d)
19194{
19195 machine_mode mmode, vmode = d->vmode;
fa2987ed
JJ
19196 unsigned i, nelt = d->nelt;
19197 unsigned HOST_WIDE_INT mask;
2bf6d935
ML
19198 rtx target, op0, op1, maskop, x;
19199 rtx rperm[32], vperm;
19200
19201 if (d->one_operand_p)
19202 return false;
19203 if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
19204 && (TARGET_AVX512BW
19205 || GET_MODE_UNIT_SIZE (vmode) >= 4))
19206 ;
19207 else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
19208 ;
19209 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
19210 ;
57052c6e
UB
19211 else if (TARGET_SSE4_1
19212 && (GET_MODE_SIZE (vmode) == 16
19213 || (TARGET_MMX_WITH_SSE && GET_MODE_SIZE (vmode) == 8)
19214 || GET_MODE_SIZE (vmode) == 4))
2bf6d935
ML
19215 ;
19216 else
19217 return false;
19218
19219 /* This is a blend, not a permute. Elements must stay in their
19220 respective lanes. */
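  /* E.g. for V4SFmode, { 0, 5, 2, 7 } is a blend (elements 1 and 3 come
     from op1, giving mask 0b1010), while { 1, 5, 2, 7 } is not, because
     element 0 would have to move (illustrative example).  */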
19221 for (i = 0; i < nelt; ++i)
19222 {
19223 unsigned e = d->perm[i];
19224 if (!(e == i || e == i + nelt))
19225 return false;
19226 }
19227
19228 if (d->testing_p)
19229 return true;
19230
19231 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
19232 decision should be extracted elsewhere, so that we only try that
19233 sequence once all budget==3 options have been tried. */
19234 target = d->target;
19235 op0 = d->op0;
19236 op1 = d->op1;
19237 mask = 0;
19238
19239 switch (vmode)
19240 {
19241 case E_V8DFmode:
19242 case E_V16SFmode:
19243 case E_V4DFmode:
19244 case E_V8SFmode:
19245 case E_V2DFmode:
19246 case E_V4SFmode:
57052c6e
UB
19247 case E_V2SFmode:
19248 case E_V2HImode:
a325bdd1 19249 case E_V4HImode:
2bf6d935
ML
19250 case E_V8HImode:
19251 case E_V8SImode:
19252 case E_V32HImode:
19253 case E_V64QImode:
19254 case E_V16SImode:
19255 case E_V8DImode:
19256 for (i = 0; i < nelt; ++i)
fa2987ed 19257 mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
2bf6d935
ML
19258 break;
19259
19260 case E_V2DImode:
19261 for (i = 0; i < 2; ++i)
19262 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
19263 vmode = V8HImode;
19264 goto do_subreg;
19265
a325bdd1
PB
19266 case E_V2SImode:
19267 for (i = 0; i < 2; ++i)
19268 mask |= (d->perm[i] >= 2 ? 3 : 0) << (i * 2);
19269 vmode = V4HImode;
19270 goto do_subreg;
19271
2bf6d935 19272 case E_V4SImode:
3588c8cb 19273 if (TARGET_AVX2)
19274 {
19275 /* Use vpblendd instead of vpblendw. */
19276 for (i = 0; i < nelt; ++i)
19277 mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
19278 break;
19279 }
19280 else
19281 {
19282 for (i = 0; i < 4; ++i)
19283 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
19284 vmode = V8HImode;
19285 goto do_subreg;
19286 }
2bf6d935
ML
19287
19288 case E_V16QImode:
19289 /* See if bytes move in pairs so we can use pblendw with
19290 an immediate argument, rather than pblendvb with a vector
19291 argument. */
19292 for (i = 0; i < 16; i += 2)
19293 if (d->perm[i] + 1 != d->perm[i + 1])
19294 {
19295 use_pblendvb:
19296 for (i = 0; i < nelt; ++i)
19297 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
19298
19299 finish_pblendvb:
19300 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
19301 vperm = force_reg (vmode, vperm);
19302
be8749f9 19303 if (GET_MODE_SIZE (vmode) == 4)
820ac79e 19304 emit_insn (gen_mmx_pblendvb_v4qi (target, op0, op1, vperm));
be8749f9 19305 else if (GET_MODE_SIZE (vmode) == 8)
820ac79e 19306 emit_insn (gen_mmx_pblendvb_v8qi (target, op0, op1, vperm));
a325bdd1 19307 else if (GET_MODE_SIZE (vmode) == 16)
2bf6d935
ML
19308 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
19309 else
19310 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
19311 if (target != d->target)
19312 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
19313 return true;
19314 }
19315
19316 for (i = 0; i < 8; ++i)
19317 mask |= (d->perm[i * 2] >= 16) << i;
19318 vmode = V8HImode;
19319 /* FALLTHRU */
19320
19321 do_subreg:
19322 target = gen_reg_rtx (vmode);
19323 op0 = gen_lowpart (vmode, op0);
19324 op1 = gen_lowpart (vmode, op1);
19325 break;
19326
a325bdd1
PB
19327 case E_V8QImode:
19328 for (i = 0; i < 8; i += 2)
19329 if (d->perm[i] + 1 != d->perm[i + 1])
19330 goto use_pblendvb;
19331
19332 for (i = 0; i < 4; ++i)
19333 mask |= (d->perm[i * 2] >= 8) << i;
19334 vmode = V4HImode;
19335 goto do_subreg;
19336
be8749f9
UB
19337 case E_V4QImode:
19338 for (i = 0; i < 4; i += 2)
19339 if (d->perm[i] + 1 != d->perm[i + 1])
19340 goto use_pblendvb;
19341
19342 for (i = 0; i < 2; ++i)
19343 mask |= (d->perm[i * 2] >= 4) << i;
19344 vmode = V2HImode;
19345 goto do_subreg;
19346
2bf6d935
ML
19347 case E_V32QImode:
19348 /* See if bytes move in pairs. If not, vpblendvb must be used. */
19349 for (i = 0; i < 32; i += 2)
19350 if (d->perm[i] + 1 != d->perm[i + 1])
19351 goto use_pblendvb;
19352 /* See if bytes move in quadruplets. If yes, vpblendd
19353 with immediate can be used. */
19354 for (i = 0; i < 32; i += 4)
19355 if (d->perm[i] + 2 != d->perm[i + 2])
19356 break;
19357 if (i < 32)
19358 {
19359 /* See if bytes move the same in both lanes. If yes,
19360 vpblendw with immediate can be used. */
19361 for (i = 0; i < 16; i += 2)
19362 if (d->perm[i] + 16 != d->perm[i + 16])
19363 goto use_pblendvb;
19364
19365 /* Use vpblendw. */
19366 for (i = 0; i < 16; ++i)
19367 mask |= (d->perm[i * 2] >= 32) << i;
19368 vmode = V16HImode;
19369 goto do_subreg;
19370 }
19371
19372 /* Use vpblendd. */
19373 for (i = 0; i < 8; ++i)
19374 mask |= (d->perm[i * 4] >= 32) << i;
19375 vmode = V8SImode;
19376 goto do_subreg;
19377
19378 case E_V16HImode:
19379 /* See if words move in pairs. If yes, vpblendd can be used. */
19380 for (i = 0; i < 16; i += 2)
19381 if (d->perm[i] + 1 != d->perm[i + 1])
19382 break;
19383 if (i < 16)
19384 {
19385 /* See if words move the same in both lanes. If not,
19386 vpblendvb must be used. */
19387 for (i = 0; i < 8; i++)
19388 if (d->perm[i] + 8 != d->perm[i + 8])
19389 {
19390 /* Use vpblendvb. */
19391 for (i = 0; i < 32; ++i)
19392 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
19393
19394 vmode = V32QImode;
19395 nelt = 32;
19396 target = gen_reg_rtx (vmode);
19397 op0 = gen_lowpart (vmode, op0);
19398 op1 = gen_lowpart (vmode, op1);
19399 goto finish_pblendvb;
19400 }
19401
19402 /* Use vpblendw. */
19403 for (i = 0; i < 16; ++i)
19404 mask |= (d->perm[i] >= 16) << i;
19405 break;
19406 }
19407
19408 /* Use vpblendd. */
19409 for (i = 0; i < 8; ++i)
19410 mask |= (d->perm[i * 2] >= 16) << i;
19411 vmode = V8SImode;
19412 goto do_subreg;
19413
19414 case E_V4DImode:
19415 /* Use vpblendd. */
19416 for (i = 0; i < 4; ++i)
19417 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
19418 vmode = V8SImode;
19419 goto do_subreg;
19420
19421 default:
19422 gcc_unreachable ();
19423 }
19424
19425 switch (vmode)
19426 {
19427 case E_V8DFmode:
19428 case E_V8DImode:
19429 mmode = QImode;
19430 break;
19431 case E_V16SFmode:
19432 case E_V16SImode:
19433 mmode = HImode;
19434 break;
19435 case E_V32HImode:
19436 mmode = SImode;
19437 break;
19438 case E_V64QImode:
19439 mmode = DImode;
19440 break;
19441 default:
19442 mmode = VOIDmode;
19443 }
19444
33066c90 19445 /* Canonicalize vec_merge. */
19446 if (swap_commutative_operands_p (op1, op0)
19447      /* If the two operands have the same precedence, the first
19448	 bit of the mask selects the first operand. */
19449 || (!swap_commutative_operands_p (op0, op1)
19450 && !(mask & 1)))
19451 {
19452 unsigned n_elts = GET_MODE_NUNITS (vmode);
19453 std::swap (op0, op1);
19454 unsigned HOST_WIDE_INT mask_all = HOST_WIDE_INT_1U;
19455 if (n_elts == HOST_BITS_PER_WIDE_INT)
19456 mask_all = -1;
19457 else
19458 mask_all = (HOST_WIDE_INT_1U << n_elts) - 1;
19459 mask = ~mask & mask_all;
19460 }
19461
2bf6d935
ML
19462 if (mmode != VOIDmode)
19463 maskop = force_reg (mmode, gen_int_mode (mask, mmode));
19464 else
19465 maskop = GEN_INT (mask);
19466
19467 /* This matches five different patterns with the different modes. */
19468 x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
19469 x = gen_rtx_SET (target, x);
19470 emit_insn (x);
19471 if (target != d->target)
19472 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
19473
19474 return true;
19475}
19476
4bf4c103 19477/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
2bf6d935
ML
19478 in terms of the variable form of vpermilps.
19479
19480 Note that we will have already failed the immediate input vpermilps,
19481 which requires that the high and low part shuffle be identical; the
19482 variable form doesn't require that. */
19483
19484static bool
19485expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
19486{
19487 rtx rperm[8], vperm;
19488 unsigned i;
19489
19490 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
19491 return false;
19492
19493 /* We can only permute within the 128-bit lane. */
19494 for (i = 0; i < 8; ++i)
19495 {
19496 unsigned e = d->perm[i];
19497 if (i < 4 ? e >= 4 : e < 4)
19498 return false;
19499 }
19500
19501 if (d->testing_p)
19502 return true;
19503
19504 for (i = 0; i < 8; ++i)
19505 {
19506 unsigned e = d->perm[i];
19507
19508 /* Within each 128-bit lane, the elements of op0 are numbered
19509 from 0 and the elements of op1 are numbered from 4. */
19510 if (e >= 8 + 4)
19511 e -= 8;
19512 else if (e >= 4)
19513 e -= 4;
19514
19515 rperm[i] = GEN_INT (e);
19516 }
19517
19518 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
19519 vperm = force_reg (V8SImode, vperm);
19520 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
19521
19522 return true;
19523}
19524
1fa991d1
UB
19525/* For V*[QHS]Imode permutations, check whether the same permutation
19526   can be performed in a 2x, 4x or 8x wider inner mode; if so, set up ND for it. */
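/* E.g. the V16QImode permutation { 2,3, 0,1, 6,7, 4,5, 10,11, 8,9, 14,15,
   12,13 } moves whole byte pairs and therefore canonicalizes to the
   V8HImode permutation { 1, 0, 3, 2, 5, 4, 7, 6 } (illustrative example).  */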
19527
19528static bool
19529canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
19530 struct expand_vec_perm_d *nd)
19531{
19532 int i;
19533 machine_mode mode = VOIDmode;
19534
19535 switch (d->vmode)
19536 {
19537 case E_V8QImode: mode = V4HImode; break;
19538 case E_V16QImode: mode = V8HImode; break;
19539 case E_V32QImode: mode = V16HImode; break;
19540 case E_V64QImode: mode = V32HImode; break;
19541 case E_V4HImode: mode = V2SImode; break;
19542 case E_V8HImode: mode = V4SImode; break;
19543 case E_V16HImode: mode = V8SImode; break;
19544 case E_V32HImode: mode = V16SImode; break;
19545 case E_V4SImode: mode = V2DImode; break;
19546 case E_V8SImode: mode = V4DImode; break;
19547 case E_V16SImode: mode = V8DImode; break;
19548 default: return false;
19549 }
19550 for (i = 0; i < d->nelt; i += 2)
19551 if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
19552 return false;
19553 nd->vmode = mode;
19554 nd->nelt = d->nelt / 2;
19555 for (i = 0; i < nd->nelt; i++)
19556 nd->perm[i] = d->perm[2 * i] / 2;
19557 if (GET_MODE_INNER (mode) != DImode)
19558 canonicalize_vector_int_perm (nd, nd);
19559 if (nd != d)
19560 {
19561 nd->one_operand_p = d->one_operand_p;
19562 nd->testing_p = d->testing_p;
19563 if (d->op0 == d->op1)
19564 nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
19565 else
19566 {
19567 nd->op0 = gen_lowpart (nd->vmode, d->op0);
19568 nd->op1 = gen_lowpart (nd->vmode, d->op1);
19569 }
19570 if (d->testing_p)
19571 nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
19572 else
19573 nd->target = gen_reg_rtx (nd->vmode);
19574 }
19575 return true;
19576}
19577
2bf6d935
ML
19578/* Return true if permutation D can be performed as VMODE permutation
19579 instead. */
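/* E.g. a V16QImode permutation that moves whole 4-byte chunks, such as
   { 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 }, is also valid as the
   V4SImode permutation { 1, 0, 3, 2 } (illustrative example).  */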
19580
19581static bool
19582valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
19583{
19584 unsigned int i, j, chunk;
19585
19586 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
19587 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
19588 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
19589 return false;
19590
19591 if (GET_MODE_NUNITS (vmode) >= d->nelt)
19592 return true;
19593
19594 chunk = d->nelt / GET_MODE_NUNITS (vmode);
19595 for (i = 0; i < d->nelt; i += chunk)
19596 if (d->perm[i] & (chunk - 1))
19597 return false;
19598 else
19599 for (j = 1; j < chunk; ++j)
19600 if (d->perm[i] + j != d->perm[i + j])
19601 return false;
19602
19603 return true;
19604}
19605
4bf4c103 19606/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
2bf6d935
ML
19607 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
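/* For instance, a one-operand V8HImode permutation { 3, 2, 1, 0, 7, 6, 5, 4 }
   would be realized by the code below as a V16QImode pshufb whose control
   vector lists each selected element's bytes in order: { 6,7, 4,5, 2,3, 0,1,
   14,15, 12,13, 10,11, 8,9 } (illustrative example).  */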
19608
19609static bool
19610expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
19611{
19612 unsigned i, nelt, eltsz, mask;
19613 unsigned char perm[64];
877c9e33 19614 machine_mode vmode;
1fa991d1 19615 struct expand_vec_perm_d nd;
2bf6d935
ML
19616 rtx rperm[64], vperm, target, op0, op1;
19617
19618 nelt = d->nelt;
19619
19620 if (!d->one_operand_p)
be8749f9
UB
19621 switch (GET_MODE_SIZE (d->vmode))
19622 {
19623 case 4:
19624 if (!TARGET_XOP)
19625 return false;
19626 vmode = V4QImode;
19627 break;
37e93925 19628
be8749f9
UB
19629 case 8:
19630 if (!TARGET_XOP)
19631 return false;
19632 vmode = V8QImode;
19633 break;
2bf6d935 19634
be8749f9
UB
19635 case 16:
19636 if (!TARGET_XOP)
2bf6d935 19637 return false;
877c9e33 19638 vmode = V16QImode;
be8749f9
UB
19639 break;
19640
19641 case 32:
19642 if (!TARGET_AVX2)
19643 return false;
19644
19645 if (valid_perm_using_mode_p (V2TImode, d))
19646 {
19647 if (d->testing_p)
19648 return true;
19649
19650 /* Use vperm2i128 insn. The pattern uses
19651 V4DImode instead of V2TImode. */
19652 target = d->target;
19653 if (d->vmode != V4DImode)
19654 target = gen_reg_rtx (V4DImode);
19655 op0 = gen_lowpart (V4DImode, d->op0);
19656 op1 = gen_lowpart (V4DImode, d->op1);
19657 rperm[0]
19658 = GEN_INT ((d->perm[0] / (nelt / 2))
19659 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
19660 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
19661 if (target != d->target)
19662 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
19663 return true;
19664 }
19665 /* FALLTHRU */
19666
19667 default:
37e93925 19668 return false;
be8749f9 19669 }
2bf6d935 19670 else
be8749f9
UB
19671 switch (GET_MODE_SIZE (d->vmode))
19672 {
19673 case 4:
19674 if (!TARGET_SSSE3)
19675 return false;
19676 vmode = V4QImode;
19677 break;
2bf6d935 19678
be8749f9
UB
19679 case 8:
19680 if (!TARGET_SSSE3)
19681 return false;
19682 vmode = V8QImode;
19683 break;
2bf6d935 19684
be8749f9
UB
19685 case 16:
19686 if (!TARGET_SSSE3)
19687 return false;
877c9e33 19688 vmode = V16QImode;
be8749f9
UB
19689 break;
19690
19691 case 32:
19692 if (!TARGET_AVX2)
19693 return false;
19694
19695 /* V4DImode should be already handled through
19696 expand_vselect by vpermq instruction. */
19697 gcc_assert (d->vmode != V4DImode);
19698
19699 vmode = V32QImode;
19700 if (d->vmode == V8SImode
19701 || d->vmode == V16HImode
19702 || d->vmode == V32QImode)
19703 {
19704 /* First see if vpermq can be used for
19705 V8SImode/V16HImode/V32QImode. */
19706 if (valid_perm_using_mode_p (V4DImode, d))
19707 {
19708 for (i = 0; i < 4; i++)
19709 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
19710 if (d->testing_p)
19711 return true;
19712 target = gen_reg_rtx (V4DImode);
19713 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
19714 perm, 4, false))
19715 {
19716 emit_move_insn (d->target,
19717 gen_lowpart (d->vmode, target));
2bf6d935 19718 return true;
be8749f9
UB
19719 }
19720 return false;
19721 }
2bf6d935 19722
be8749f9
UB
19723 /* Next see if vpermd can be used. */
19724 if (valid_perm_using_mode_p (V8SImode, d))
19725 vmode = V8SImode;
19726 }
19727 /* Or if vpermps can be used. */
19728 else if (d->vmode == V8SFmode)
19729 vmode = V8SImode;
2bf6d935 19730
be8749f9
UB
19731 if (vmode == V32QImode)
19732 {
19733	    /* vpshufb only works intra lanes; it is not
19734	       possible to shuffle bytes in between the lanes. */
19735 for (i = 0; i < nelt; ++i)
19736 if ((d->perm[i] ^ i) & (nelt / 2))
19737 return false;
19738 }
19739 break;
2bf6d935 19740
be8749f9
UB
19741 case 64:
19742 if (!TARGET_AVX512BW)
19743 return false;
2bf6d935 19744
be8749f9
UB
19745 /* If vpermq didn't work, vpshufb won't work either. */
19746 if (d->vmode == V8DFmode || d->vmode == V8DImode)
19747 return false;
19748
19749 vmode = V64QImode;
19750 if (d->vmode == V16SImode
19751 || d->vmode == V32HImode
19752 || d->vmode == V64QImode)
19753 {
19754 /* First see if vpermq can be used for
19755 V16SImode/V32HImode/V64QImode. */
19756 if (valid_perm_using_mode_p (V8DImode, d))
19757 {
19758 for (i = 0; i < 8; i++)
19759 perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
19760 if (d->testing_p)
19761 return true;
19762 target = gen_reg_rtx (V8DImode);
19763 if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
19764 perm, 8, false))
19765 {
19766 emit_move_insn (d->target,
19767 gen_lowpart (d->vmode, target));
2bf6d935 19768 return true;
be8749f9
UB
19769 }
19770 return false;
19771 }
2bf6d935 19772
be8749f9
UB
19773 /* Next see if vpermd can be used. */
19774 if (valid_perm_using_mode_p (V16SImode, d))
19775 vmode = V16SImode;
19776 }
19777 /* Or if vpermps can be used. */
19778 else if (d->vmode == V16SFmode)
19779 vmode = V16SImode;
877c9e33 19780
be8749f9
UB
19781 if (vmode == V64QImode)
19782 {
19783 /* vpshufb only works intra lanes, it is not
19784 possible to shuffle bytes in between the lanes. */
19785 for (i = 0; i < nelt; ++i)
19786 if ((d->perm[i] ^ i) & (3 * nelt / 4))
19787 return false;
19788 }
19789 break;
19790
19791 default:
2bf6d935 19792 return false;
be8749f9 19793 }
2bf6d935
ML
19794
19795 if (d->testing_p)
19796 return true;
19797
681143b9
UB
19798 /* Try to avoid variable permutation instruction. */
19799 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
19800 {
19801 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
19802 return true;
19803 }
19804
2bf6d935
ML
19805 if (vmode == V8SImode)
19806 for (i = 0; i < 8; ++i)
19807 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
19808 else if (vmode == V16SImode)
19809 for (i = 0; i < 16; ++i)
19810 rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
19811 else
19812 {
19813 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
19814 if (!d->one_operand_p)
19815 mask = 2 * nelt - 1;
2bf6d935
ML
19816 else if (vmode == V64QImode)
19817 mask = nelt / 4 - 1;
a325bdd1 19818 else if (vmode == V32QImode)
2bf6d935 19819 mask = nelt / 2 - 1;
a325bdd1
PB
19820 else
19821 mask = nelt - 1;
2bf6d935
ML
19822
19823 for (i = 0; i < nelt; ++i)
19824 {
19825 unsigned j, e = d->perm[i] & mask;
19826 for (j = 0; j < eltsz; ++j)
19827 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
19828 }
19829 }
19830
a325bdd1
PB
19831 machine_mode vpmode = vmode;
19832
877c9e33
UB
19833 nelt = GET_MODE_SIZE (vmode);
19834
19835 /* Emulate narrow modes with V16QI instructions. */
19836 if (nelt < 16)
a325bdd1 19837 {
dd835ec2
UB
19838 rtx m128 = GEN_INT (-128);
19839
37e93925 19840 /* Remap elements from the second operand, as we have to
be8749f9 19841 account for inactive top elements from the first operand. */
37e93925 19842 if (!d->one_operand_p)
be8749f9 19843 {
be8749f9
UB
19844 for (i = 0; i < nelt; ++i)
19845 {
877c9e33
UB
19846 unsigned ival = UINTVAL (rperm[i]);
19847 if (ival >= nelt)
19848 rperm[i] = GEN_INT (ival + 16 - nelt);
be8749f9
UB
19849 }
19850 }
37e93925 19851
877c9e33 19852 /* Fill inactive elements in the top positions with zeros. */
a325bdd1 19853 for (i = nelt; i < 16; ++i)
dd835ec2 19854 rperm[i] = m128;
37e93925 19855
a325bdd1
PB
19856 vpmode = V16QImode;
19857 }
19858
19859 vperm = gen_rtx_CONST_VECTOR (vpmode,
19860 gen_rtvec_v (GET_MODE_NUNITS (vpmode), rperm));
19861 vperm = force_reg (vpmode, vperm);
2bf6d935 19862
37e93925
UB
19863 if (vmode == d->vmode)
19864 target = d->target;
19865 else
2bf6d935 19866 target = gen_reg_rtx (vmode);
37e93925 19867
2bf6d935 19868 op0 = gen_lowpart (vmode, d->op0);
37e93925 19869
2bf6d935
ML
19870 if (d->one_operand_p)
19871 {
37e93925
UB
19872 rtx (*gen) (rtx, rtx, rtx);
19873
be8749f9
UB
19874 if (vmode == V4QImode)
19875 gen = gen_mmx_pshufbv4qi3;
19876 else if (vmode == V8QImode)
37e93925 19877 gen = gen_mmx_pshufbv8qi3;
a325bdd1 19878 else if (vmode == V16QImode)
37e93925 19879 gen = gen_ssse3_pshufbv16qi3;
2bf6d935 19880 else if (vmode == V32QImode)
37e93925 19881 gen = gen_avx2_pshufbv32qi3;
2bf6d935 19882 else if (vmode == V64QImode)
37e93925 19883 gen = gen_avx512bw_pshufbv64qi3;
2bf6d935 19884 else if (vmode == V8SFmode)
37e93925 19885 gen = gen_avx2_permvarv8sf;
2bf6d935 19886 else if (vmode == V8SImode)
37e93925 19887 gen = gen_avx2_permvarv8si;
2bf6d935 19888 else if (vmode == V16SFmode)
37e93925 19889 gen = gen_avx512f_permvarv16sf;
2bf6d935 19890 else if (vmode == V16SImode)
37e93925 19891 gen = gen_avx512f_permvarv16si;
2bf6d935
ML
19892 else
19893 gcc_unreachable ();
37e93925
UB
19894
19895 emit_insn (gen (target, op0, vperm));
2bf6d935
ML
19896 }
19897 else
19898 {
37e93925
UB
19899 rtx (*gen) (rtx, rtx, rtx, rtx);
19900
2bf6d935 19901 op1 = gen_lowpart (vmode, d->op1);
37e93925 19902
be8749f9
UB
19903 if (vmode == V4QImode)
19904 gen = gen_mmx_ppermv32;
19905 else if (vmode == V8QImode)
37e93925
UB
19906 gen = gen_mmx_ppermv64;
19907 else if (vmode == V16QImode)
19908 gen = gen_xop_pperm;
19909 else
19910 gcc_unreachable ();
19911
19912 emit_insn (gen (target, op0, op1, vperm));
2bf6d935 19913 }
37e93925 19914
2bf6d935
ML
19915 if (target != d->target)
19916 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
19917
19918 return true;
19919}
19920
2bf6d935
ML
19921/* Try to expand one-operand permutation with constant mask. */
19922
19923static bool
19924ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
19925{
19926 machine_mode mode = GET_MODE (d->op0);
19927 machine_mode maskmode = mode;
faf2b6bc 19928 unsigned inner_size = GET_MODE_SIZE (GET_MODE_INNER (mode));
2bf6d935
ML
19929 rtx (*gen) (rtx, rtx, rtx) = NULL;
19930 rtx target, op0, mask;
19931 rtx vec[64];
19932
19933 if (!rtx_equal_p (d->op0, d->op1))
19934 return false;
19935
19936 if (!TARGET_AVX512F)
19937 return false;
19938
faf2b6bc 19939 /* Accept VNxHImode and VNxQImode now. */
19940 if (!TARGET_AVX512VL && GET_MODE_SIZE (mode) < 64)
19941 return false;
19942
19943 /* vpermw. */
19944 if (!TARGET_AVX512BW && inner_size == 2)
19945 return false;
19946
19947 /* vpermb. */
19948 if (!TARGET_AVX512VBMI && inner_size == 1)
19949 return false;
19950
2bf6d935
ML
19951 switch (mode)
19952 {
19953 case E_V16SImode:
19954 gen = gen_avx512f_permvarv16si;
19955 break;
19956 case E_V16SFmode:
19957 gen = gen_avx512f_permvarv16sf;
19958 maskmode = V16SImode;
19959 break;
19960 case E_V8DImode:
19961 gen = gen_avx512f_permvarv8di;
19962 break;
19963 case E_V8DFmode:
19964 gen = gen_avx512f_permvarv8df;
19965 maskmode = V8DImode;
19966 break;
faf2b6bc 19967 case E_V32HImode:
19968 gen = gen_avx512bw_permvarv32hi;
19969 break;
19970 case E_V16HImode:
19971 gen = gen_avx512vl_permvarv16hi;
19972 break;
19973 case E_V8HImode:
19974 gen = gen_avx512vl_permvarv8hi;
19975 break;
19976 case E_V64QImode:
19977 gen = gen_avx512bw_permvarv64qi;
19978 break;
19979 case E_V32QImode:
19980 gen = gen_avx512vl_permvarv32qi;
19981 break;
19982 case E_V16QImode:
19983 gen = gen_avx512vl_permvarv16qi;
19984 break;
19985
2bf6d935
ML
19986 default:
19987 return false;
19988 }
19989
04b4f315
JJ
19990 if (d->testing_p)
19991 return true;
19992
2bf6d935
ML
19993 target = d->target;
19994 op0 = d->op0;
19995 for (int i = 0; i < d->nelt; ++i)
19996 vec[i] = GEN_INT (d->perm[i]);
19997 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
19998 emit_insn (gen (target, op0, force_reg (maskmode, mask)));
19999 return true;
20000}
20001
20002static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
20003
4bf4c103 20004/* A subroutine of ix86_expand_vec_perm_const_1. Try to instantiate D
2bf6d935
ML
20005 in a single instruction. */
20006
20007static bool
20008expand_vec_perm_1 (struct expand_vec_perm_d *d)
20009{
20010 unsigned i, nelt = d->nelt;
20011 struct expand_vec_perm_d nd;
20012
20013 /* Check plain VEC_SELECT first, because AVX has instructions that could
20014 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
20015 input where SEL+CONCAT may not. */
20016 if (d->one_operand_p)
20017 {
20018 int mask = nelt - 1;
20019 bool identity_perm = true;
20020 bool broadcast_perm = true;
20021
20022 for (i = 0; i < nelt; i++)
20023 {
20024 nd.perm[i] = d->perm[i] & mask;
20025 if (nd.perm[i] != i)
20026 identity_perm = false;
20027 if (nd.perm[i])
20028 broadcast_perm = false;
20029 }
20030
20031 if (identity_perm)
20032 {
20033 if (!d->testing_p)
20034 emit_move_insn (d->target, d->op0);
20035 return true;
20036 }
20037 else if (broadcast_perm && TARGET_AVX2)
20038 {
20039 /* Use vpbroadcast{b,w,d}. */
20040 rtx (*gen) (rtx, rtx) = NULL;
20041 switch (d->vmode)
20042 {
20043 case E_V64QImode:
20044 if (TARGET_AVX512BW)
20045 gen = gen_avx512bw_vec_dupv64qi_1;
20046 break;
20047 case E_V32QImode:
20048 gen = gen_avx2_pbroadcastv32qi_1;
20049 break;
20050 case E_V32HImode:
20051 if (TARGET_AVX512BW)
20052 gen = gen_avx512bw_vec_dupv32hi_1;
20053 break;
20054 case E_V16HImode:
20055 gen = gen_avx2_pbroadcastv16hi_1;
20056 break;
20057 case E_V16SImode:
20058 if (TARGET_AVX512F)
20059 gen = gen_avx512f_vec_dupv16si_1;
20060 break;
20061 case E_V8SImode:
20062 gen = gen_avx2_pbroadcastv8si_1;
20063 break;
20064 case E_V16QImode:
20065 gen = gen_avx2_pbroadcastv16qi;
20066 break;
20067 case E_V8HImode:
20068 gen = gen_avx2_pbroadcastv8hi;
20069 break;
20070 case E_V16SFmode:
20071 if (TARGET_AVX512F)
20072 gen = gen_avx512f_vec_dupv16sf_1;
20073 break;
20074 case E_V8SFmode:
20075 gen = gen_avx2_vec_dupv8sf_1;
20076 break;
20077 case E_V8DFmode:
20078 if (TARGET_AVX512F)
20079 gen = gen_avx512f_vec_dupv8df_1;
20080 break;
20081 case E_V8DImode:
20082 if (TARGET_AVX512F)
20083 gen = gen_avx512f_vec_dupv8di_1;
20084 break;
20085 /* For other modes prefer other shuffles this function creates. */
20086 default: break;
20087 }
20088 if (gen != NULL)
20089 {
20090 if (!d->testing_p)
20091 emit_insn (gen (d->target, d->op0));
20092 return true;
20093 }
20094 }
20095
20096 if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
20097 return true;
20098
20099 /* There are plenty of patterns in sse.md that are written for
20100 SEL+CONCAT and are not replicated for a single op. Perhaps
20101 that should be changed, to avoid the nastiness here. */
20102
20103 /* Recognize interleave style patterns, which means incrementing
20104 every other permutation operand. */
20105 for (i = 0; i < nelt; i += 2)
20106 {
20107 nd.perm[i] = d->perm[i] & mask;
20108 nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
20109 }
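      /* E.g. a V4SImode duplication perm = {0,0,1,1} becomes the selector
	 {0,4,1,5} on (op0, op0), i.e. a single punpckldq.  */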
20110 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
20111 d->testing_p))
20112 return true;
20113
20114 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
20115 if (nelt >= 4)
20116 {
20117 for (i = 0; i < nelt; i += 4)
20118 {
20119 nd.perm[i + 0] = d->perm[i + 0] & mask;
20120 nd.perm[i + 1] = d->perm[i + 1] & mask;
20121 nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
20122 nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
20123 }
20124
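	  /* E.g. a V4SFmode rotation perm = {2,3,0,1} becomes the selector
	     {2,3,4,5} on (op0, op0), which matches shufps with
	     immediate 0x4e.  */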
20125 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
20126 d->testing_p))
20127 return true;
20128 }
20129 }
20130
20131 /* Try the SSE4.1 blend variable merge instructions. */
20132 if (expand_vec_perm_blend (d))
20133 return true;
20134
20135 /* Try movss/movsd instructions. */
20136 if (expand_vec_perm_movs (d))
20137 return true;
20138
20139 /* Try the SSE4.1 insertps instruction. */
20140 if (expand_vec_perm_insertps (d))
20141 return true;
20142
57052c6e 20143 /* Try the fully general two operand permute. */
20144 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
20145 d->testing_p))
20146 return true;
20147
20148 /* Recognize interleave style patterns with reversed operands. */
20149 if (!d->one_operand_p)
20150 {
20151 for (i = 0; i < nelt; ++i)
20152 {
20153 unsigned e = d->perm[i];
20154 if (e >= nelt)
20155 e -= nelt;
20156 else
20157 e += nelt;
20158 nd.perm[i] = e;
20159 }
20160
20161 if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
20162 d->testing_p))
20163 return true;
20164 }
20165
20166 /* Try one of the AVX vpermil variable permutations. */
20167 if (expand_vec_perm_vpermil (d))
20168 return true;
20169
20170 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
20171 vpshufb, vpermd, vpermps or vpermq variable permutation. */
20172 if (expand_vec_perm_pshufb (d))
20173 return true;
20174
20175 /* Try the AVX2 vpalignr instruction. */
20176 if (expand_vec_perm_palignr (d, true))
20177 return true;
20178
faf2b6bc 20179 /* Try the AVX512F vperm{w,b,s,d} instructions.  */
20180 if (ix86_expand_vec_one_operand_perm_avx512 (d))
20181 return true;
20182
20183 /* Try the AVX512F vpermt2/vpermi2 instructions. */
20184 if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
20185 return true;
20186
20187 /* See if we can get the same permutation in different vector integer
20188 mode. */
20189 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
20190 {
20191 if (!d->testing_p)
20192 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
20193 return true;
20194 }
20195 return false;
20196}
20197
1442e203 20198/* Canonicalize vec_perm index to make the first index
 20199 always come from the first vector.  */
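/* E.g. for V4SImode, perm = {5,6,0,1} becomes {1,2,4,5} with op0 and op1
   swapped, so the first index then selects from the first vector.  */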
20200static void
20201ix86_vec_perm_index_canon (struct expand_vec_perm_d *d)
20202{
20203 unsigned nelt = d->nelt;
20204 if (d->perm[0] < nelt)
20205 return;
20206
20207 for (unsigned i = 0; i != nelt; i++)
20208 d->perm[i] = (d->perm[i] + nelt) % (2 * nelt);
20209
20210 std::swap (d->op0, d->op1);
20211 return;
20212}
20213
3db8e9c2 20214/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
 20215 in terms of a pair of shufps + shufps/pshufd instructions.  */
20216static bool
20217expand_vec_perm_shufps_shufps (struct expand_vec_perm_d *d)
20218{
20219 unsigned char perm1[4];
20220 machine_mode vmode = d->vmode;
20221 bool ok;
20222 unsigned i, j, k, count = 0;
20223
20224 if (d->one_operand_p
20225 || (vmode != V4SImode && vmode != V4SFmode))
20226 return false;
20227
20228 if (d->testing_p)
20229 return true;
20230
1442e203 20231 ix86_vec_perm_index_canon (d);
3db8e9c2 20232 for (i = 0; i < 4; ++i)
20233 count += d->perm[i] > 3 ? 1 : 0;
20234
20235 gcc_assert (count & 3);
20236
20237 rtx tmp = gen_reg_rtx (vmode);
20238 /* 2 from op0 and 2 from op1. */
20239 if (count == 2)
20240 {
20241 unsigned char perm2[4];
20242 for (i = 0, j = 0, k = 2; i < 4; ++i)
20243 if (d->perm[i] & 4)
20244 {
20245 perm1[k++] = d->perm[i];
20246 perm2[i] = k - 1;
20247 }
20248 else
20249 {
20250 perm1[j++] = d->perm[i];
20251 perm2[i] = j - 1;
20252 }
20253
20254 /* shufps. */
20255 ok = expand_vselect_vconcat (tmp, d->op0, d->op1,
20256 perm1, d->nelt, false);
20257 gcc_assert (ok);
20258 if (vmode == V4SImode && TARGET_SSE2)
20259 /* pshufd. */
20260 ok = expand_vselect (d->target, tmp,
20261 perm2, d->nelt, false);
20262 else
20263 {
20264 /* shufps. */
20265 perm2[2] += 4;
20266 perm2[3] += 4;
20267 ok = expand_vselect_vconcat (d->target, tmp, tmp,
20268 perm2, d->nelt, false);
20269 }
20270 gcc_assert (ok);
20271 }
20272 /* 3 from one op and 1 from another. */
20273 else
20274 {
20275 unsigned pair_idx = 8, lone_idx = 8, shift;
20276
20277 /* Find the lone index. */
20278 for (i = 0; i < 4; ++i)
20279 if ((d->perm[i] > 3 && count == 1)
20280 || (d->perm[i] < 4 && count == 3))
20281 lone_idx = i;
20282
 20283 /* When lone_idx is not 0, it must come from the second op (count == 1).  */
20284 gcc_assert (count == (lone_idx ? 1 : 3));
20285
20286 /* Find the pair index that sits in the same half as the lone index. */
20287 shift = lone_idx & 2;
20288 pair_idx = 1 - lone_idx + 2 * shift;
20289
 20290 /* First permute the lone index and pair index into the same vector as
20291 [ lone, lone, pair, pair ]. */
20292 perm1[1] = perm1[0]
20293 = (count == 3) ? d->perm[lone_idx] : d->perm[lone_idx] - 4;
20294 perm1[3] = perm1[2]
20295 = (count == 3) ? d->perm[pair_idx] : d->perm[pair_idx] + 4;
20296
 20297 /* Always put the vector that contains the lone index first.  */
20298 if (count == 1)
20299 std::swap (d->op0, d->op1);
20300
20301 /* shufps. */
20302 ok = expand_vselect_vconcat (tmp, d->op0, d->op1,
20303 perm1, d->nelt, false);
20304 gcc_assert (ok);
20305
20306 /* Refine lone and pair index to original order. */
20307 perm1[shift] = lone_idx << 1;
20308 perm1[shift + 1] = pair_idx << 1;
20309
20310 /* Select the remaining 2 elements in another vector. */
20311 for (i = 2 - shift; i < 4 - shift; ++i)
20312 perm1[i] = lone_idx == 1 ? d->perm[i] + 4 : d->perm[i];
20313
20314 /* Adjust to original selector. */
20315 if (lone_idx > 1)
20316 std::swap (tmp, d->op1);
20317
20318 /* shufps. */
20319 ok = expand_vselect_vconcat (d->target, tmp, d->op1,
20320 perm1, d->nelt, false);
20321
20322 gcc_assert (ok);
20323 }
20324
20325 return true;
20326}
20327
4bf4c103 20328/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
20329 in terms of a pair of pshuflw + pshufhw instructions. */
20330
20331static bool
20332expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
20333{
20334 unsigned char perm2[MAX_VECT_LEN];
20335 unsigned i;
20336 bool ok;
20337
20338 if (d->vmode != V8HImode || !d->one_operand_p)
20339 return false;
20340
20341 /* The two permutations only operate in 64-bit lanes. */
20342 for (i = 0; i < 4; ++i)
20343 if (d->perm[i] >= 4)
20344 return false;
20345 for (i = 4; i < 8; ++i)
20346 if (d->perm[i] < 4)
20347 return false;
20348
20349 if (d->testing_p)
20350 return true;
20351
20352 /* Emit the pshuflw. */
20353 memcpy (perm2, d->perm, 4);
20354 for (i = 4; i < 8; ++i)
20355 perm2[i] = i;
20356 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
20357 gcc_assert (ok);
20358
20359 /* Emit the pshufhw. */
20360 memcpy (perm2 + 4, d->perm + 4, 4);
20361 for (i = 0; i < 4; ++i)
20362 perm2[i] = i;
20363 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
20364 gcc_assert (ok);
20365
20366 return true;
20367}
20368
4bf4c103 20369/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
20370 the permutation using the SSSE3 palignr instruction. This succeeds
20371 when all of the elements in PERM fit within one vector and we merely
20372 need to shift them down so that a single vector permutation has a
20373 chance to succeed. If SINGLE_INSN_ONLY_P, succeed if only
20374 the vpalignr instruction itself can perform the requested permutation. */
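/* E.g. for V16QImode, perm = {3,4,...,18} takes the last 13 bytes of op0
   followed by the first 3 bytes of op1; a single palignr by 3 bytes already
   produces that order, so the follow-up permutation is the identity.  */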
20375
20376static bool
20377expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
20378{
20379 unsigned i, nelt = d->nelt;
20380 unsigned min, max, minswap, maxswap;
20381 bool in_order, ok, swap = false;
20382 rtx shift, target;
20383 struct expand_vec_perm_d dcopy;
20384
 20385 /* Even with AVX, palignr only operates on 128-bit vectors;
 20386 with AVX2 it operates on both 128-bit lanes at once.  */
20387 if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
20388 && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
20389 return false;
20390
20391 min = 2 * nelt;
20392 max = 0;
20393 minswap = 2 * nelt;
20394 maxswap = 0;
20395 for (i = 0; i < nelt; ++i)
20396 {
20397 unsigned e = d->perm[i];
20398 unsigned eswap = d->perm[i] ^ nelt;
20399 if (GET_MODE_SIZE (d->vmode) == 32)
20400 {
20401 e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
20402 eswap = e ^ (nelt / 2);
20403 }
20404 if (e < min)
20405 min = e;
20406 if (e > max)
20407 max = e;
20408 if (eswap < minswap)
20409 minswap = eswap;
20410 if (eswap > maxswap)
20411 maxswap = eswap;
20412 }
20413 if (min == 0
20414 || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
20415 {
20416 if (d->one_operand_p
20417 || minswap == 0
20418 || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
20419 ? nelt / 2 : nelt))
20420 return false;
20421 swap = true;
20422 min = minswap;
20423 max = maxswap;
20424 }
20425
20426 /* Given that we have SSSE3, we know we'll be able to implement the
20427 single operand permutation after the palignr with pshufb for
20428 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
20429 first. */
20430 if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
20431 return true;
20432
20433 dcopy = *d;
20434 if (swap)
20435 {
20436 dcopy.op0 = d->op1;
20437 dcopy.op1 = d->op0;
20438 for (i = 0; i < nelt; ++i)
20439 dcopy.perm[i] ^= nelt;
20440 }
20441
20442 in_order = true;
20443 for (i = 0; i < nelt; ++i)
20444 {
20445 unsigned e = dcopy.perm[i];
20446 if (GET_MODE_SIZE (d->vmode) == 32
20447 && e >= nelt
20448 && (e & (nelt / 2 - 1)) < min)
20449 e = e - min - (nelt / 2);
20450 else
20451 e = e - min;
20452 if (e != i)
20453 in_order = false;
20454 dcopy.perm[i] = e;
20455 }
20456 dcopy.one_operand_p = true;
20457
20458 if (single_insn_only_p && !in_order)
20459 return false;
20460
20461 /* For AVX2, test whether we can permute the result in one instruction. */
20462 if (d->testing_p)
20463 {
20464 if (in_order)
20465 return true;
20466 dcopy.op1 = dcopy.op0;
20467 return expand_vec_perm_1 (&dcopy);
20468 }
20469
20470 shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
20471 if (GET_MODE_SIZE (d->vmode) == 16)
20472 {
20473 target = gen_reg_rtx (V1TImode);
20474 emit_insn (gen_ssse3_palignrv1ti (target,
20475 gen_lowpart (V1TImode, dcopy.op1),
20476 gen_lowpart (V1TImode, dcopy.op0),
20477 shift));
20478 }
20479 else
20480 {
20481 target = gen_reg_rtx (V2TImode);
20482 emit_insn (gen_avx2_palignrv2ti (target,
20483 gen_lowpart (V2TImode, dcopy.op1),
20484 gen_lowpart (V2TImode, dcopy.op0),
20485 shift));
20486 }
20487
20488 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
20489
20490 /* Test for the degenerate case where the alignment by itself
20491 produces the desired permutation. */
20492 if (in_order)
20493 {
20494 emit_move_insn (d->target, dcopy.op0);
20495 return true;
20496 }
20497
20498 ok = expand_vec_perm_1 (&dcopy);
20499 gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
20500
20501 return ok;
20502}
20503
20504/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
20505 the permutation using the SSE4_1 pblendv instruction. Potentially
20506 reduces permutation from 2 pshufb and or to 1 pshufb and pblendv. */
20507
20508static bool
20509expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
20510{
20511 unsigned i, which, nelt = d->nelt;
20512 struct expand_vec_perm_d dcopy, dcopy1;
20513 machine_mode vmode = d->vmode;
20514 bool ok;
20515
20516 /* Use the same checks as in expand_vec_perm_blend. */
20517 if (d->one_operand_p)
20518 return false;
20519 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
20520 ;
20521 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
20522 ;
20523 else if (TARGET_SSE4_1
20524 && (GET_MODE_SIZE (vmode) == 16
20525 || (TARGET_MMX_WITH_SSE && GET_MODE_SIZE (vmode) == 8)
20526 || GET_MODE_SIZE (vmode) == 4))
20527 ;
20528 else
20529 return false;
20530
 20531 /* Figure out which permutation elements do not stay in their
 20532 respective lanes.  */
20533 for (i = 0, which = 0; i < nelt; ++i)
20534 {
20535 unsigned e = d->perm[i];
20536 if (e != i)
20537 which |= (e < nelt ? 1 : 2);
20538 }
 20539 /* We can pblend the part where elements do not stay in their
 20540 respective lanes only when these elements all come from one
 20541 half of the permutation.
 20542 {0 1 8 3 4 5 9 7} is ok, as 8 and 9 are not in their respective
 20543 lanes, but both 8 and 9 are >= 8.
 20544 {0 1 8 3 4 5 2 7} is not ok, as 2 and 8 are not in their
 20545 respective lanes and 8 is >= 8 but 2 is not.  */
20546 if (which != 1 && which != 2)
20547 return false;
20548 if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
20549 return true;
20550
20551 /* First we apply one operand permutation to the part where
20552 elements stay not in their respective lanes. */
20553 dcopy = *d;
20554 if (which == 2)
20555 dcopy.op0 = dcopy.op1 = d->op1;
20556 else
20557 dcopy.op0 = dcopy.op1 = d->op0;
20558 if (!d->testing_p)
20559 dcopy.target = gen_reg_rtx (vmode);
20560 dcopy.one_operand_p = true;
20561
20562 for (i = 0; i < nelt; ++i)
20563 dcopy.perm[i] = d->perm[i] & (nelt - 1);
20564
20565 ok = expand_vec_perm_1 (&dcopy);
20566 if (GET_MODE_SIZE (vmode) != 16 && !ok)
20567 return false;
20568 else
20569 gcc_assert (ok);
20570 if (d->testing_p)
20571 return true;
20572
20573 /* Next we put permuted elements into their positions. */
20574 dcopy1 = *d;
20575 if (which == 2)
20576 dcopy1.op1 = dcopy.target;
20577 else
20578 dcopy1.op0 = dcopy.target;
20579
20580 for (i = 0; i < nelt; ++i)
20581 dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
20582
20583 ok = expand_vec_perm_blend (&dcopy1);
20584 gcc_assert (ok);
20585
20586 return true;
20587}
20588
20589static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
20590
4bf4c103 20591/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
20592 a two vector permutation into a single vector permutation by using
20593 an interleave operation to merge the vectors. */
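/* E.g. for V4SImode, perm = {0,5,1,4} uses only the low halves of both
   inputs; punpckldq first gives {a0,b0,a1,b1} and a single pshufd
   {0,3,2,1} of that result then yields {a0,b1,a1,b0}.  */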
20594
20595static bool
20596expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
20597{
20598 struct expand_vec_perm_d dremap, dfinal;
20599 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
20600 unsigned HOST_WIDE_INT contents;
20601 unsigned char remap[2 * MAX_VECT_LEN];
20602 rtx_insn *seq;
20603 bool ok, same_halves = false;
20604
20605 if (GET_MODE_SIZE (d->vmode) == 4
20606 || GET_MODE_SIZE (d->vmode) == 8
a325bdd1 20607 || GET_MODE_SIZE (d->vmode) == 16)
20608 {
20609 if (d->one_operand_p)
20610 return false;
20611 }
20612 else if (GET_MODE_SIZE (d->vmode) == 32)
20613 {
20614 if (!TARGET_AVX)
20615 return false;
20616 /* For 32-byte modes allow even d->one_operand_p.
20617 The lack of cross-lane shuffling in some instructions
20618 might prevent a single insn shuffle. */
20619 dfinal = *d;
20620 dfinal.testing_p = true;
20621 /* If expand_vec_perm_interleave3 can expand this into
20622 a 3 insn sequence, give up and let it be expanded as
20623 3 insn sequence. While that is one insn longer,
20624 it doesn't need a memory operand and in the common
20625 case that both interleave low and high permutations
20626 with the same operands are adjacent needs 4 insns
20627 for both after CSE. */
20628 if (expand_vec_perm_interleave3 (&dfinal))
20629 return false;
20630 }
20631 else
20632 return false;
20633
20634 /* Examine from whence the elements come. */
20635 contents = 0;
20636 for (i = 0; i < nelt; ++i)
20637 contents |= HOST_WIDE_INT_1U << d->perm[i];
20638
20639 memset (remap, 0xff, sizeof (remap));
20640 dremap = *d;
20641
20642 if (GET_MODE_SIZE (d->vmode) == 4
20643 || GET_MODE_SIZE (d->vmode) == 8)
20644 {
20645 unsigned HOST_WIDE_INT h1, h2, h3, h4;
20646
20647 /* Split the two input vectors into 4 halves. */
20648 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
20649 h2 = h1 << nelt2;
20650 h3 = h2 << nelt2;
20651 h4 = h3 << nelt2;
20652
 20653 /* If the elements come only from the low halves, use interleave
 20654 low, and similarly interleave high for the high halves.  */
20655 if ((contents & (h1 | h3)) == contents)
20656 {
20657 /* punpckl* */
20658 for (i = 0; i < nelt2; ++i)
20659 {
20660 remap[i] = i * 2;
20661 remap[i + nelt] = i * 2 + 1;
20662 dremap.perm[i * 2] = i;
20663 dremap.perm[i * 2 + 1] = i + nelt;
20664 }
20665 }
20666 else if ((contents & (h2 | h4)) == contents)
20667 {
20668 /* punpckh* */
20669 for (i = 0; i < nelt2; ++i)
20670 {
20671 remap[i + nelt2] = i * 2;
20672 remap[i + nelt + nelt2] = i * 2 + 1;
20673 dremap.perm[i * 2] = i + nelt2;
20674 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
20675 }
20676 }
20677 else
20678 return false;
20679 }
20680 else if (GET_MODE_SIZE (d->vmode) == 16)
20681 {
20682 unsigned HOST_WIDE_INT h1, h2, h3, h4;
20683
20684 /* Split the two input vectors into 4 halves. */
20685 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
20686 h2 = h1 << nelt2;
20687 h3 = h2 << nelt2;
20688 h4 = h3 << nelt2;
20689
20690 /* If the elements from the low halves use interleave low, and similarly
20691 for interleave high. If the elements are from mis-matched halves, we
20692 can use shufps for V4SF/V4SI or do a DImode shuffle. */
20693 if ((contents & (h1 | h3)) == contents)
20694 {
20695 /* punpckl* */
20696 for (i = 0; i < nelt2; ++i)
20697 {
20698 remap[i] = i * 2;
20699 remap[i + nelt] = i * 2 + 1;
20700 dremap.perm[i * 2] = i;
20701 dremap.perm[i * 2 + 1] = i + nelt;
20702 }
20703 if (!TARGET_SSE2 && d->vmode == V4SImode)
20704 dremap.vmode = V4SFmode;
20705 }
20706 else if ((contents & (h2 | h4)) == contents)
20707 {
20708 /* punpckh* */
20709 for (i = 0; i < nelt2; ++i)
20710 {
20711 remap[i + nelt2] = i * 2;
20712 remap[i + nelt + nelt2] = i * 2 + 1;
20713 dremap.perm[i * 2] = i + nelt2;
20714 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
20715 }
20716 if (!TARGET_SSE2 && d->vmode == V4SImode)
20717 dremap.vmode = V4SFmode;
20718 }
20719 else if ((contents & (h1 | h4)) == contents)
20720 {
20721 /* shufps */
20722 for (i = 0; i < nelt2; ++i)
20723 {
20724 remap[i] = i;
20725 remap[i + nelt + nelt2] = i + nelt2;
20726 dremap.perm[i] = i;
20727 dremap.perm[i + nelt2] = i + nelt + nelt2;
20728 }
20729 if (nelt != 4)
20730 {
20731 /* shufpd */
20732 dremap.vmode = V2DImode;
20733 dremap.nelt = 2;
20734 dremap.perm[0] = 0;
20735 dremap.perm[1] = 3;
20736 }
20737 }
20738 else if ((contents & (h2 | h3)) == contents)
20739 {
20740 /* shufps */
20741 for (i = 0; i < nelt2; ++i)
20742 {
20743 remap[i + nelt2] = i;
20744 remap[i + nelt] = i + nelt2;
20745 dremap.perm[i] = i + nelt2;
20746 dremap.perm[i + nelt2] = i + nelt;
20747 }
20748 if (nelt != 4)
20749 {
20750 /* shufpd */
20751 dremap.vmode = V2DImode;
20752 dremap.nelt = 2;
20753 dremap.perm[0] = 1;
20754 dremap.perm[1] = 2;
20755 }
20756 }
20757 else
20758 return false;
20759 }
20760 else
20761 {
20762 unsigned int nelt4 = nelt / 4, nzcnt = 0;
20763 unsigned HOST_WIDE_INT q[8];
20764 unsigned int nonzero_halves[4];
20765
20766 /* Split the two input vectors into 8 quarters. */
20767 q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
20768 for (i = 1; i < 8; ++i)
20769 q[i] = q[0] << (nelt4 * i);
20770 for (i = 0; i < 4; ++i)
20771 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
20772 {
20773 nonzero_halves[nzcnt] = i;
20774 ++nzcnt;
20775 }
20776
20777 if (nzcnt == 1)
20778 {
20779 gcc_assert (d->one_operand_p);
20780 nonzero_halves[1] = nonzero_halves[0];
20781 same_halves = true;
20782 }
20783 else if (d->one_operand_p)
20784 {
20785 gcc_assert (nonzero_halves[0] == 0);
20786 gcc_assert (nonzero_halves[1] == 1);
20787 }
20788
20789 if (nzcnt <= 2)
20790 {
20791 if (d->perm[0] / nelt2 == nonzero_halves[1])
20792 {
20793 /* Attempt to increase the likelihood that dfinal
20794 shuffle will be intra-lane. */
20795 std::swap (nonzero_halves[0], nonzero_halves[1]);
20796 }
20797
20798 /* vperm2f128 or vperm2i128. */
20799 for (i = 0; i < nelt2; ++i)
20800 {
20801 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
20802 remap[i + nonzero_halves[0] * nelt2] = i;
20803 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
20804 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
20805 }
20806
20807 if (d->vmode != V8SFmode
20808 && d->vmode != V4DFmode
20809 && d->vmode != V8SImode)
20810 {
20811 dremap.vmode = V8SImode;
20812 dremap.nelt = 8;
20813 for (i = 0; i < 4; ++i)
20814 {
20815 dremap.perm[i] = i + nonzero_halves[0] * 4;
20816 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
20817 }
20818 }
20819 }
20820 else if (d->one_operand_p)
20821 return false;
20822 else if (TARGET_AVX2
20823 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
20824 {
20825 /* vpunpckl* */
20826 for (i = 0; i < nelt4; ++i)
20827 {
20828 remap[i] = i * 2;
20829 remap[i + nelt] = i * 2 + 1;
20830 remap[i + nelt2] = i * 2 + nelt2;
20831 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
20832 dremap.perm[i * 2] = i;
20833 dremap.perm[i * 2 + 1] = i + nelt;
20834 dremap.perm[i * 2 + nelt2] = i + nelt2;
20835 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
20836 }
20837 }
20838 else if (TARGET_AVX2
20839 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
20840 {
20841 /* vpunpckh* */
20842 for (i = 0; i < nelt4; ++i)
20843 {
20844 remap[i + nelt4] = i * 2;
20845 remap[i + nelt + nelt4] = i * 2 + 1;
20846 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
20847 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
20848 dremap.perm[i * 2] = i + nelt4;
20849 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
20850 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
20851 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
20852 }
20853 }
20854 else
20855 return false;
20856 }
20857
20858 /* Use the remapping array set up above to move the elements from their
20859 swizzled locations into their final destinations. */
20860 dfinal = *d;
20861 for (i = 0; i < nelt; ++i)
20862 {
20863 unsigned e = remap[d->perm[i]];
20864 gcc_assert (e < nelt);
20865 /* If same_halves is true, both halves of the remapped vector are the
20866 same. Avoid cross-lane accesses if possible. */
20867 if (same_halves && i >= nelt2)
20868 {
20869 gcc_assert (e < nelt2);
20870 dfinal.perm[i] = e + nelt2;
20871 }
20872 else
20873 dfinal.perm[i] = e;
20874 }
20875 if (!d->testing_p)
20876 {
20877 dremap.target = gen_reg_rtx (dremap.vmode);
20878 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
20879 }
20880 dfinal.op1 = dfinal.op0;
20881 dfinal.one_operand_p = true;
20882
20883 /* Test if the final remap can be done with a single insn. For V4SFmode or
20884 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
20885 start_sequence ();
20886 ok = expand_vec_perm_1 (&dfinal);
20887 seq = get_insns ();
20888 end_sequence ();
20889
20890 if (!ok)
20891 return false;
20892
20893 if (d->testing_p)
20894 return true;
20895
20896 if (dremap.vmode != dfinal.vmode)
20897 {
20898 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
20899 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
20900 }
20901
20902 ok = expand_vec_perm_1 (&dremap);
20903 gcc_assert (ok);
20904
20905 emit_insn (seq);
20906 return true;
20907}
20908
4bf4c103 20909/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
20910 a single vector cross-lane permutation into vpermq followed
20911 by any of the single insn permutations. */
20912
20913static bool
20914expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
20915{
20916 struct expand_vec_perm_d dremap, dfinal;
20917 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
20918 unsigned contents[2];
20919 bool ok;
20920
20921 if (!(TARGET_AVX2
20922 && (d->vmode == V32QImode || d->vmode == V16HImode)
20923 && d->one_operand_p))
20924 return false;
20925
20926 contents[0] = 0;
20927 contents[1] = 0;
20928 for (i = 0; i < nelt2; ++i)
20929 {
20930 contents[0] |= 1u << (d->perm[i] / nelt4);
20931 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
20932 }
20933
20934 for (i = 0; i < 2; ++i)
20935 {
20936 unsigned int cnt = 0;
20937 for (j = 0; j < 4; ++j)
20938 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
20939 return false;
20940 }
20941
20942 if (d->testing_p)
20943 return true;
20944
20945 dremap = *d;
20946 dremap.vmode = V4DImode;
20947 dremap.nelt = 4;
20948 dremap.target = gen_reg_rtx (V4DImode);
20949 dremap.op0 = gen_lowpart (V4DImode, d->op0);
20950 dremap.op1 = dremap.op0;
20951 dremap.one_operand_p = true;
20952 for (i = 0; i < 2; ++i)
20953 {
20954 unsigned int cnt = 0;
20955 for (j = 0; j < 4; ++j)
20956 if ((contents[i] & (1u << j)) != 0)
20957 dremap.perm[2 * i + cnt++] = j;
20958 for (; cnt < 2; ++cnt)
20959 dremap.perm[2 * i + cnt] = 0;
20960 }
20961
20962 dfinal = *d;
20963 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
20964 dfinal.op1 = dfinal.op0;
20965 dfinal.one_operand_p = true;
20966 for (i = 0, j = 0; i < nelt; ++i)
20967 {
20968 if (i == nelt2)
20969 j = 2;
20970 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
20971 if ((d->perm[i] / nelt4) == dremap.perm[j])
20972 ;
20973 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
20974 dfinal.perm[i] |= nelt4;
20975 else
20976 gcc_unreachable ();
20977 }
20978
20979 ok = expand_vec_perm_1 (&dremap);
20980 gcc_assert (ok);
20981
20982 ok = expand_vec_perm_1 (&dfinal);
20983 gcc_assert (ok);
20984
20985 return true;
20986}
20987
20988static bool canonicalize_perm (struct expand_vec_perm_d *d);
20989
4bf4c103 20990/* A subroutine of ix86_expand_vec_perm_const_1. Try to expand
20991 a vector permutation using two instructions, vperm2f128 resp.
20992 vperm2i128 followed by any single in-lane permutation. */
20993
20994static bool
20995expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
20996{
20997 struct expand_vec_perm_d dfirst, dsecond;
20998 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
20999 bool ok;
21000
21001 if (!TARGET_AVX
21002 || GET_MODE_SIZE (d->vmode) != 32
21003 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
21004 return false;
21005
21006 dsecond = *d;
21007 dsecond.one_operand_p = false;
21008 dsecond.testing_p = true;
21009
21010 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
21011 immediate. For perm < 16 the second permutation uses
21012 d->op0 as first operand, for perm >= 16 it uses d->op1
21013 as first operand. The second operand is the result of
21014 vperm2[fi]128. */
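  /* E.g. perm = 6 maps to immediate ((6 << 2) | 6) & 0x33 = 0x12, i.e. the
     vperm2[fi]128 puts op1's low lane in the low half and op0's high lane
     in the high half of the intermediate result.  */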
21015 for (perm = 0; perm < 32; perm++)
21016 {
21017 /* Ignore permutations which do not move anything cross-lane. */
21018 if (perm < 16)
21019 {
21020 /* The second shuffle for e.g. V4DFmode has
21021 0123 and ABCD operands.
21022 Ignore AB23, as 23 is already in the second lane
21023 of the first operand. */
21024 if ((perm & 0xc) == (1 << 2)) continue;
21025 /* And 01CD, as 01 is in the first lane of the first
21026 operand. */
21027 if ((perm & 3) == 0) continue;
21028 /* And 4567, as then the vperm2[fi]128 doesn't change
21029 anything on the original 4567 second operand. */
21030 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
21031 }
21032 else
21033 {
21034 /* The second shuffle for e.g. V4DFmode has
21035 4567 and ABCD operands.
21036 Ignore AB67, as 67 is already in the second lane
21037 of the first operand. */
21038 if ((perm & 0xc) == (3 << 2)) continue;
21039 /* And 45CD, as 45 is in the first lane of the first
21040 operand. */
21041 if ((perm & 3) == 2) continue;
21042 /* And 0123, as then the vperm2[fi]128 doesn't change
21043 anything on the original 0123 first operand. */
21044 if ((perm & 0xf) == (1 << 2)) continue;
21045 }
21046
21047 for (i = 0; i < nelt; i++)
21048 {
21049 j = d->perm[i] / nelt2;
21050 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
21051 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
21052 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
21053 dsecond.perm[i] = d->perm[i] & (nelt - 1);
21054 else
21055 break;
21056 }
21057
21058 if (i == nelt)
21059 {
21060 start_sequence ();
21061 ok = expand_vec_perm_1 (&dsecond);
21062 end_sequence ();
21063 }
21064 else
21065 ok = false;
21066
21067 if (ok)
21068 {
21069 if (d->testing_p)
21070 return true;
21071
21072 /* Found a usable second shuffle. dfirst will be
21073 vperm2f128 on d->op0 and d->op1. */
21074 dsecond.testing_p = false;
21075 dfirst = *d;
21076 dfirst.target = gen_reg_rtx (d->vmode);
21077 for (i = 0; i < nelt; i++)
21078 dfirst.perm[i] = (i & (nelt2 - 1))
21079 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
21080
21081 canonicalize_perm (&dfirst);
21082 ok = expand_vec_perm_1 (&dfirst);
21083 gcc_assert (ok);
21084
21085 /* And dsecond is some single insn shuffle, taking
21086 d->op0 and result of vperm2f128 (if perm < 16) or
21087 d->op1 and result of vperm2f128 (otherwise). */
21088 if (perm >= 16)
21089 dsecond.op0 = dsecond.op1;
21090 dsecond.op1 = dfirst.target;
21091
21092 ok = expand_vec_perm_1 (&dsecond);
21093 gcc_assert (ok);
21094
21095 return true;
21096 }
21097
21098 /* For one operand, the only useful vperm2f128 permutation is 0x01
21099 aka lanes swap. */
21100 if (d->one_operand_p)
21101 return false;
21102 }
21103
21104 return false;
21105}
21106
4bf4c103 21107/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
21108 a two vector permutation using 2 intra-lane interleave insns
21109 and cross-lane shuffle for 32-byte vectors. */
21110
21111static bool
21112expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
21113{
21114 unsigned i, nelt;
21115 rtx (*gen) (rtx, rtx, rtx);
21116
21117 if (d->one_operand_p)
21118 return false;
21119 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
21120 ;
21121 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
21122 ;
21123 else
21124 return false;
21125
21126 nelt = d->nelt;
21127 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
21128 return false;
21129 for (i = 0; i < nelt; i += 2)
21130 if (d->perm[i] != d->perm[0] + i / 2
21131 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
21132 return false;
21133
21134 if (d->testing_p)
21135 return true;
21136
21137 switch (d->vmode)
21138 {
21139 case E_V32QImode:
21140 if (d->perm[0])
21141 gen = gen_vec_interleave_highv32qi;
21142 else
21143 gen = gen_vec_interleave_lowv32qi;
21144 break;
21145 case E_V16HImode:
21146 if (d->perm[0])
21147 gen = gen_vec_interleave_highv16hi;
21148 else
21149 gen = gen_vec_interleave_lowv16hi;
21150 break;
21151 case E_V8SImode:
21152 if (d->perm[0])
21153 gen = gen_vec_interleave_highv8si;
21154 else
21155 gen = gen_vec_interleave_lowv8si;
21156 break;
21157 case E_V4DImode:
21158 if (d->perm[0])
21159 gen = gen_vec_interleave_highv4di;
21160 else
21161 gen = gen_vec_interleave_lowv4di;
21162 break;
21163 case E_V8SFmode:
21164 if (d->perm[0])
21165 gen = gen_vec_interleave_highv8sf;
21166 else
21167 gen = gen_vec_interleave_lowv8sf;
21168 break;
21169 case E_V4DFmode:
21170 if (d->perm[0])
21171 gen = gen_vec_interleave_highv4df;
21172 else
21173 gen = gen_vec_interleave_lowv4df;
21174 break;
21175 default:
21176 gcc_unreachable ();
21177 }
21178
21179 emit_insn (gen (d->target, d->op0, d->op1));
21180 return true;
21181}
21182
4bf4c103 21183/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
21184 a single vector permutation using a single intra-lane vector
21185 permutation, vperm2f128 swapping the lanes and vblend* insn blending
21186 the non-swapped and swapped vectors together. */
21187
21188static bool
21189expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
21190{
21191 struct expand_vec_perm_d dfirst, dsecond;
21192 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
21193 rtx_insn *seq;
21194 bool ok;
21195 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
21196
21197 if (!TARGET_AVX
21198 || TARGET_AVX2
21199 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
21200 || !d->one_operand_p)
21201 return false;
21202
21203 dfirst = *d;
21204 for (i = 0; i < nelt; i++)
21205 dfirst.perm[i] = 0xff;
21206 for (i = 0, msk = 0; i < nelt; i++)
21207 {
21208 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
21209 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
21210 return false;
21211 dfirst.perm[j] = d->perm[i];
21212 if (j != i)
21213 msk |= (1 << i);
21214 }
21215 for (i = 0; i < nelt; i++)
21216 if (dfirst.perm[i] == 0xff)
21217 dfirst.perm[i] = i;
21218
21219 if (!d->testing_p)
21220 dfirst.target = gen_reg_rtx (dfirst.vmode);
21221
21222 start_sequence ();
21223 ok = expand_vec_perm_1 (&dfirst);
21224 seq = get_insns ();
21225 end_sequence ();
21226
21227 if (!ok)
21228 return false;
21229
21230 if (d->testing_p)
21231 return true;
21232
21233 emit_insn (seq);
21234
21235 dsecond = *d;
21236 dsecond.op0 = dfirst.target;
21237 dsecond.op1 = dfirst.target;
21238 dsecond.one_operand_p = true;
21239 dsecond.target = gen_reg_rtx (dsecond.vmode);
21240 for (i = 0; i < nelt; i++)
21241 dsecond.perm[i] = i ^ nelt2;
21242
21243 ok = expand_vec_perm_1 (&dsecond);
21244 gcc_assert (ok);
21245
21246 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
21247 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
21248 return true;
21249}
21250
21251/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
21252 a two vector permutation using two single vector permutations and
21253 {,v}{,p}unpckl{ps,pd,bw,wd,dq}. If two_insn, succeed only if one
 21254 of dfirst or dsecond is an identity permutation.  */
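/* E.g. for V4SFmode, perm = {1,7,0,5} becomes the one-operand shuffles
   {1,0,1,0} on op0 and {3,1,3,1} on op1, whose results are then merged
   with unpcklps (selector {0,4,1,5}).  */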
21255
21256static bool
21257expand_vec_perm_2perm_interleave (struct expand_vec_perm_d *d, bool two_insn)
21258{
21259 unsigned i, nelt = d->nelt, nelt2 = nelt / 2, lane = nelt;
21260 struct expand_vec_perm_d dfirst, dsecond, dfinal;
21261 bool ident1 = true, ident2 = true;
21262
21263 if (d->one_operand_p)
21264 return false;
21265
21266 if (GET_MODE_SIZE (d->vmode) == 16)
21267 {
21268 if (!TARGET_SSE)
21269 return false;
21270 if (d->vmode != V4SFmode && d->vmode != V2DFmode && !TARGET_SSE2)
21271 return false;
21272 }
21273 else if (GET_MODE_SIZE (d->vmode) == 32)
21274 {
21275 if (!TARGET_AVX)
21276 return false;
21277 if (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2)
21278 return false;
21279 lane = nelt2;
21280 }
21281 else
21282 return false;
21283
21284 for (i = 1; i < nelt; i++)
21285 if ((d->perm[i] >= nelt) != ((d->perm[0] >= nelt) ^ (i & 1)))
21286 return false;
21287
21288 dfirst = *d;
21289 dsecond = *d;
21290 dfinal = *d;
21291 dfirst.op1 = dfirst.op0;
21292 dfirst.one_operand_p = true;
21293 dsecond.op0 = dsecond.op1;
21294 dsecond.one_operand_p = true;
21295
21296 for (i = 0; i < nelt; i++)
21297 if (d->perm[i] >= nelt)
21298 {
21299 dsecond.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i] - nelt;
21300 if (d->perm[i] - nelt != i / 2 + (i >= lane ? lane / 2 : 0))
21301 ident2 = false;
21302 dsecond.perm[i / 2 + (i >= lane ? lane : lane / 2)]
21303 = d->perm[i] - nelt;
21304 }
21305 else
21306 {
21307 dfirst.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i];
21308 if (d->perm[i] != i / 2 + (i >= lane ? lane / 2 : 0))
21309 ident1 = false;
21310 dfirst.perm[i / 2 + (i >= lane ? lane : lane / 2)] = d->perm[i];
21311 }
21312
21313 if (two_insn && !ident1 && !ident2)
21314 return false;
21315
21316 if (!d->testing_p)
21317 {
21318 if (!ident1)
21319 dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
21320 if (!ident2)
21321 dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
21322 if (d->perm[0] >= nelt)
21323 std::swap (dfinal.op0, dfinal.op1);
21324 }
21325
21326 bool ok;
21327 rtx_insn *seq1 = NULL, *seq2 = NULL;
21328
21329 if (!ident1)
21330 {
21331 start_sequence ();
21332 ok = expand_vec_perm_1 (&dfirst);
21333 seq1 = get_insns ();
21334 end_sequence ();
21335
21336 if (!ok)
21337 return false;
21338 }
21339
21340 if (!ident2)
21341 {
21342 start_sequence ();
21343 ok = expand_vec_perm_1 (&dsecond);
21344 seq2 = get_insns ();
21345 end_sequence ();
21346
21347 if (!ok)
21348 return false;
21349 }
21350
21351 if (d->testing_p)
21352 return true;
21353
21354 for (i = 0; i < nelt; i++)
21355 {
21356 dfinal.perm[i] = i / 2;
21357 if (i >= lane)
21358 dfinal.perm[i] += lane / 2;
21359 if ((i & 1) != 0)
21360 dfinal.perm[i] += nelt;
21361 }
21362 emit_insn (seq1);
21363 emit_insn (seq2);
21364 ok = expand_vselect_vconcat (dfinal.target, dfinal.op0, dfinal.op1,
21365 dfinal.perm, dfinal.nelt, false);
21366 gcc_assert (ok);
21367 return true;
21368}
21369
21370/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
21371 the permutation using two single vector permutations and the SSE4_1 pblendv
21372 instruction. If two_insn, succeed only if one of dfirst or dsecond is
 21373 an identity permutation.  */
21374
21375static bool
21376expand_vec_perm_2perm_pblendv (struct expand_vec_perm_d *d, bool two_insn)
21377{
21378 unsigned i, nelt = d->nelt;
21379 struct expand_vec_perm_d dfirst, dsecond, dfinal;
21380 machine_mode vmode = d->vmode;
21381 bool ident1 = true, ident2 = true;
21382
21383 /* Use the same checks as in expand_vec_perm_blend. */
21384 if (d->one_operand_p)
21385 return false;
21386 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
21387 ;
21388 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
21389 ;
21390 else if (TARGET_SSE4_1
21391 && (GET_MODE_SIZE (vmode) == 16
21392 || (TARGET_MMX_WITH_SSE && GET_MODE_SIZE (vmode) == 8)
21393 || GET_MODE_SIZE (vmode) == 4))
21394 ;
21395 else
21396 return false;
21397
21398 dfirst = *d;
21399 dsecond = *d;
21400 dfinal = *d;
21401 dfirst.op1 = dfirst.op0;
21402 dfirst.one_operand_p = true;
21403 dsecond.op0 = dsecond.op1;
21404 dsecond.one_operand_p = true;
21405
21406 for (i = 0; i < nelt; ++i)
21407 if (d->perm[i] >= nelt)
21408 {
21409 dfirst.perm[i] = 0xff;
21410 dsecond.perm[i] = d->perm[i] - nelt;
21411 if (d->perm[i] != i + nelt)
21412 ident2 = false;
21413 }
21414 else
21415 {
21416 dsecond.perm[i] = 0xff;
21417 dfirst.perm[i] = d->perm[i];
21418 if (d->perm[i] != i)
21419 ident1 = false;
21420 }
21421
21422 if (two_insn && !ident1 && !ident2)
21423 return false;
21424
21425 /* For now. Ideally treat 0xff as a wildcard. */
21426 for (i = 0; i < nelt; ++i)
21427 if (dfirst.perm[i] == 0xff)
21428 {
21429 if (GET_MODE_SIZE (vmode) == 32
21430 && dfirst.perm[i ^ (nelt / 2)] != 0xff)
21431 dfirst.perm[i] = dfirst.perm[i ^ (nelt / 2)] ^ (nelt / 2);
21432 else
21433 dfirst.perm[i] = i;
21434 }
21435 else
21436 {
21437 if (GET_MODE_SIZE (vmode) == 32
21438 && dsecond.perm[i ^ (nelt / 2)] != 0xff)
21439 dsecond.perm[i] = dsecond.perm[i ^ (nelt / 2)] ^ (nelt / 2);
21440 else
21441 dsecond.perm[i] = i;
21442 }
21443
21444 if (!d->testing_p)
21445 {
21446 if (!ident1)
21447 dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
21448 if (!ident2)
21449 dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
21450 }
21451
21452 bool ok;
21453 rtx_insn *seq1 = NULL, *seq2 = NULL;
21454
21455 if (!ident1)
21456 {
21457 start_sequence ();
21458 ok = expand_vec_perm_1 (&dfirst);
21459 seq1 = get_insns ();
21460 end_sequence ();
21461
21462 if (!ok)
21463 return false;
21464 }
21465
21466 if (!ident2)
21467 {
21468 start_sequence ();
21469 ok = expand_vec_perm_1 (&dsecond);
21470 seq2 = get_insns ();
21471 end_sequence ();
21472
21473 if (!ok)
21474 return false;
21475 }
21476
21477 if (d->testing_p)
21478 return true;
21479
21480 for (i = 0; i < nelt; ++i)
21481 dfinal.perm[i] = (d->perm[i] >= nelt ? i + nelt : i);
21482
21483 emit_insn (seq1);
21484 emit_insn (seq2);
21485 ok = expand_vec_perm_blend (&dfinal);
21486 gcc_assert (ok);
21487 return true;
21488}
21489
4bf4c103 21490/* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF
21491 permutation using two vperm2f128, followed by a vshufpd insn blending
21492 the two vectors together. */
21493
21494static bool
21495expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
21496{
21497 struct expand_vec_perm_d dfirst, dsecond, dthird;
21498 bool ok;
21499
21500 if (!TARGET_AVX || (d->vmode != V4DFmode))
21501 return false;
21502
21503 if (d->testing_p)
21504 return true;
21505
21506 dfirst = *d;
21507 dsecond = *d;
21508 dthird = *d;
21509
21510 dfirst.perm[0] = (d->perm[0] & ~1);
21511 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
21512 dfirst.perm[2] = (d->perm[2] & ~1);
21513 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
21514 dsecond.perm[0] = (d->perm[1] & ~1);
21515 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
21516 dsecond.perm[2] = (d->perm[3] & ~1);
21517 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
21518 dthird.perm[0] = (d->perm[0] % 2);
21519 dthird.perm[1] = (d->perm[1] % 2) + 4;
21520 dthird.perm[2] = (d->perm[2] % 2) + 2;
21521 dthird.perm[3] = (d->perm[3] % 2) + 6;
21522
21523 dfirst.target = gen_reg_rtx (dfirst.vmode);
21524 dsecond.target = gen_reg_rtx (dsecond.vmode);
21525 dthird.op0 = dfirst.target;
21526 dthird.op1 = dsecond.target;
21527 dthird.one_operand_p = false;
21528
21529 canonicalize_perm (&dfirst);
21530 canonicalize_perm (&dsecond);
21531
21532 ok = expand_vec_perm_1 (&dfirst)
21533 && expand_vec_perm_1 (&dsecond)
21534 && expand_vec_perm_1 (&dthird);
21535
21536 gcc_assert (ok);
21537
21538 return true;
21539}
21540
21541static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *);
21542
21543/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
21544 a two vector permutation using two intra-lane vector
21545 permutations, vperm2f128 swapping the lanes and vblend* insn blending
21546 the non-swapped and swapped vectors together. */
21547
21548static bool
21549expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d *d)
21550{
21551 struct expand_vec_perm_d dfirst, dsecond, dthird;
21552 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2, which1 = 0, which2 = 0;
21553 rtx_insn *seq1, *seq2;
21554 bool ok;
21555 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
21556
21557 if (!TARGET_AVX
21558 || TARGET_AVX2
21559 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
21560 || d->one_operand_p)
21561 return false;
21562
21563 dfirst = *d;
21564 dsecond = *d;
21565 for (i = 0; i < nelt; i++)
21566 {
21567 dfirst.perm[i] = 0xff;
21568 dsecond.perm[i] = 0xff;
21569 }
21570 for (i = 0, msk = 0; i < nelt; i++)
21571 {
21572 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
21573 if (j == i)
21574 {
21575 dfirst.perm[j] = d->perm[i];
21576 which1 |= (d->perm[i] < nelt ? 1 : 2);
21577 }
21578 else
21579 {
21580 dsecond.perm[j] = d->perm[i];
21581 which2 |= (d->perm[i] < nelt ? 1 : 2);
21582 msk |= (1U << i);
21583 }
21584 }
21585 if (msk == 0 || msk == (1U << nelt) - 1)
21586 return false;
21587
21588 if (!d->testing_p)
21589 {
21590 dfirst.target = gen_reg_rtx (dfirst.vmode);
21591 dsecond.target = gen_reg_rtx (dsecond.vmode);
21592 }
21593
21594 for (i = 0; i < nelt; i++)
21595 {
21596 if (dfirst.perm[i] == 0xff)
21597 dfirst.perm[i] = (which1 == 2 ? i + nelt : i);
21598 if (dsecond.perm[i] == 0xff)
21599 dsecond.perm[i] = (which2 == 2 ? i + nelt : i);
21600 }
21601 canonicalize_perm (&dfirst);
21602 start_sequence ();
21603 ok = ix86_expand_vec_perm_const_1 (&dfirst);
21604 seq1 = get_insns ();
21605 end_sequence ();
21606
21607 if (!ok)
21608 return false;
21609
21610 canonicalize_perm (&dsecond);
21611 start_sequence ();
21612 ok = ix86_expand_vec_perm_const_1 (&dsecond);
21613 seq2 = get_insns ();
21614 end_sequence ();
21615
21616 if (!ok)
21617 return false;
21618
21619 if (d->testing_p)
21620 return true;
21621
21622 emit_insn (seq1);
21623 emit_insn (seq2);
21624
21625 dthird = *d;
21626 dthird.op0 = dsecond.target;
21627 dthird.op1 = dsecond.target;
21628 dthird.one_operand_p = true;
21629 dthird.target = gen_reg_rtx (dthird.vmode);
21630 for (i = 0; i < nelt; i++)
21631 dthird.perm[i] = i ^ nelt2;
21632
21633 ok = expand_vec_perm_1 (&dthird);
21634 gcc_assert (ok);
21635
21636 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
21637 emit_insn (blend (d->target, dfirst.target, dthird.target, GEN_INT (msk)));
21638 return true;
21639}
21640
21641/* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
21642 permutation with two pshufb insns and an ior. We should have already
21643 failed all two instruction sequences. */
21644
21645static bool
21646expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
21647{
21648 rtx rperm[2][16], vperm, l, h, op, m128;
21649 unsigned int i, nelt, eltsz;
21650 machine_mode mode;
21651 rtx (*gen) (rtx, rtx, rtx);
2bf6d935 21652
dd835ec2 21653 if (!TARGET_SSSE3 || (GET_MODE_SIZE (d->vmode) != 16
21654 && GET_MODE_SIZE (d->vmode) != 8
21655 && GET_MODE_SIZE (d->vmode) != 4))
21656 return false;
21657 gcc_assert (!d->one_operand_p);
21658
21659 if (d->testing_p)
21660 return true;
21661
21662 switch (GET_MODE_SIZE (d->vmode))
21663 {
21664 case 4:
21665 mode = V4QImode;
21666 gen = gen_mmx_pshufbv4qi3;
21667 break;
21668 case 8:
21669 mode = V8QImode;
21670 gen = gen_mmx_pshufbv8qi3;
21671 break;
21672 case 16:
21673 mode = V16QImode;
21674 gen = gen_ssse3_pshufbv16qi3;
21675 break;
21676 default:
21677 gcc_unreachable ();
21678 }
21679
21680 nelt = d->nelt;
21681 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
21682
21683 /* Generate two permutation masks. If the required element is within
21684 the given vector it is shuffled into the proper lane. If the required
21685 element is in the other vector, force a zero into the lane by setting
21686 bit 7 in the permutation mask. */
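  /* E.g. an interleave perm = {0,16,1,17,...} on V16QImode yields the masks
     {0,-128,1,-128,...} for op0 and {-128,0,-128,1,...} for op1; the final
     ior below merges the two partial results.  */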
21687 m128 = GEN_INT (-128);
21688 for (i = 0; i < nelt; ++i)
21689 {
dd835ec2 21690 unsigned j, k, e = d->perm[i];
21691 unsigned which = (e >= nelt);
21692 if (e >= nelt)
21693 e -= nelt;
21694
21695 for (j = 0; j < eltsz; ++j)
21696 {
21697 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
21698 rperm[1-which][i*eltsz + j] = m128;
21699 }
21700
21701 for (k = i*eltsz + j; k < 16; ++k)
21702 rperm[0][k] = rperm[1][k] = m128;
21703 }
21704
21705 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
21706 vperm = force_reg (V16QImode, vperm);
21707
21708 l = gen_reg_rtx (mode);
21709 op = gen_lowpart (mode, d->op0);
21710 emit_insn (gen (l, op, vperm));
21711
21712 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
21713 vperm = force_reg (V16QImode, vperm);
21714
21715 h = gen_reg_rtx (mode);
21716 op = gen_lowpart (mode, d->op1);
21717 emit_insn (gen (h, op, vperm));
21718
21719 op = d->target;
21720 if (d->vmode != mode)
21721 op = gen_reg_rtx (mode);
b5193e35 21722 ix86_emit_vec_binop (IOR, mode, op, l, h);
21723 if (op != d->target)
21724 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
21725
21726 return true;
21727}
21728
21729/* Implement arbitrary permutation of one V32QImode and V16QImode operand
21730 with two vpshufb insns, vpermq and vpor. We should have already failed
21731 all two or three instruction sequences. */
21732
21733static bool
21734expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
21735{
21736 rtx rperm[2][32], vperm, l, h, hp, op, m128;
21737 unsigned int i, nelt, eltsz;
21738
21739 if (!TARGET_AVX2
21740 || !d->one_operand_p
21741 || (d->vmode != V32QImode && d->vmode != V16HImode))
21742 return false;
21743
21744 if (d->testing_p)
21745 return true;
21746
21747 nelt = d->nelt;
21748 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
21749
21750 /* Generate two permutation masks. If the required element is within
 21751 the same lane, it is shuffled in.  If the required element is from the
 21752 other lane, force a zero by setting bit 7 in the permutation mask.
 21753 The other mask has non-negative elements if the element
 21754 is requested from the other lane, but is also moved to the other lane,
21755 so that the result of vpshufb can have the two V2TImode halves
21756 swapped. */
21757 m128 = GEN_INT (-128);
21758 for (i = 0; i < nelt; ++i)
21759 {
21760 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
21761 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
21762
21763 for (j = 0; j < eltsz; ++j)
21764 {
21765 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
21766 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
21767 }
21768 }
21769
21770 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
21771 vperm = force_reg (V32QImode, vperm);
21772
21773 h = gen_reg_rtx (V32QImode);
21774 op = gen_lowpart (V32QImode, d->op0);
21775 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
21776
 21777 /* Swap the 128-bit lanes of h into hp.  */
21778 hp = gen_reg_rtx (V4DImode);
21779 op = gen_lowpart (V4DImode, h);
21780 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
21781 const1_rtx));
21782
21783 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
21784 vperm = force_reg (V32QImode, vperm);
21785
21786 l = gen_reg_rtx (V32QImode);
21787 op = gen_lowpart (V32QImode, d->op0);
21788 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
21789
21790 op = d->target;
21791 if (d->vmode != V32QImode)
21792 op = gen_reg_rtx (V32QImode);
21793 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
21794 if (op != d->target)
21795 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
21796
21797 return true;
21798}
21799
21800/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
21801 and extract-odd permutations of two V32QImode and V16QImode operand
21802 with two vpshufb insns, vpor and vpermq. We should have already
21803 failed all two or three instruction sequences. */
21804
21805static bool
21806expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
21807{
21808 rtx rperm[2][32], vperm, l, h, ior, op, m128;
21809 unsigned int i, nelt, eltsz;
21810
21811 if (!TARGET_AVX2
21812 || d->one_operand_p
21813 || (d->vmode != V32QImode && d->vmode != V16HImode))
21814 return false;
21815
21816 for (i = 0; i < d->nelt; ++i)
21817 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
21818 return false;
21819
21820 if (d->testing_p)
21821 return true;
21822
21823 nelt = d->nelt;
21824 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
21825
21826 /* Generate two permutation masks. In the first permutation mask
21827 the first quarter will contain indexes for the first half
21828 of the op0, the second quarter will contain bit 7 set, third quarter
21829 will contain indexes for the second half of the op0 and the
21830 last quarter bit 7 set. In the second permutation mask
21831 the first quarter will contain bit 7 set, the second quarter
21832 indexes for the first half of the op1, the third quarter bit 7 set
21833 and last quarter indexes for the second half of the op1.
21834 I.e. the first mask e.g. for V32QImode extract even will be:
21835 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
21836 (all values masked with 0xf except for -128) and second mask
21837 for extract even will be
21838 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
21839 m128 = GEN_INT (-128);
21840 for (i = 0; i < nelt; ++i)
21841 {
21842 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
21843 unsigned which = d->perm[i] >= nelt;
21844 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
21845
21846 for (j = 0; j < eltsz; ++j)
21847 {
21848 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
21849 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
21850 }
21851 }
21852
21853 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
21854 vperm = force_reg (V32QImode, vperm);
21855
21856 l = gen_reg_rtx (V32QImode);
21857 op = gen_lowpart (V32QImode, d->op0);
21858 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
21859
21860 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
21861 vperm = force_reg (V32QImode, vperm);
21862
21863 h = gen_reg_rtx (V32QImode);
21864 op = gen_lowpart (V32QImode, d->op1);
21865 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
21866
21867 ior = gen_reg_rtx (V32QImode);
21868 emit_insn (gen_iorv32qi3 (ior, l, h));
21869
21870 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
21871 op = gen_reg_rtx (V4DImode);
21872 ior = gen_lowpart (V4DImode, ior);
21873 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
21874 const1_rtx, GEN_INT (3)));
21875 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
21876
21877 return true;
21878}
21879
fcda0efc 21880/* Implement permutation with pslldq + psrldq + por when pshufb is not
21881 available. */
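/* E.g. for V16QImode, perm = {2,3,...,15,16,17} becomes psrldq $2 on op0,
   pslldq $14 on op1 and a por, with pandn/pand only needed when stray
   elements must be cleared first.  */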
21882static bool
21883expand_vec_perm_pslldq_psrldq_por (struct expand_vec_perm_d *d, bool pandn)
21884{
21885 unsigned i, nelt = d->nelt;
21886 unsigned start1, end1 = -1;
21887 machine_mode vmode = d->vmode, imode;
21888 int start2 = -1;
21889 bool clear_op0, clear_op1;
21890 unsigned inner_size;
21891 rtx op0, op1, dop1;
21892 rtx (*gen_vec_shr) (rtx, rtx, rtx);
21893 rtx (*gen_vec_shl) (rtx, rtx, rtx);
21894
21895 /* pshufd can be used for V4SI/V2DI under TARGET_SSE2. */
21896 if (!TARGET_SSE2 || (vmode != E_V16QImode && vmode != E_V8HImode))
21897 return false;
21898
21899 start1 = d->perm[0];
21900 for (i = 1; i < nelt; i++)
21901 {
69c4b5c5 21902 if (d->perm[i] != d->perm[i-1] + 1
21903 || d->perm[i] == nelt)
fcda0efc 21904 {
21905 if (start2 == -1)
21906 {
21907 start2 = d->perm[i];
21908 end1 = d->perm[i-1];
21909 }
21910 else
21911 return false;
21912 }
fcda0efc 21913 }
21914
21915 clear_op0 = end1 != nelt - 1;
21916 clear_op1 = start2 % nelt != 0;
21917 /* pandn/pand is needed to clear upper/lower bits of op0/op1. */
21918 if (!pandn && (clear_op0 || clear_op1))
21919 return false;
21920
21921 if (d->testing_p)
21922 return true;
21923
21924 gen_vec_shr = vmode == E_V16QImode ? gen_vec_shr_v16qi : gen_vec_shr_v8hi;
21925 gen_vec_shl = vmode == E_V16QImode ? gen_vec_shl_v16qi : gen_vec_shl_v8hi;
21926 imode = GET_MODE_INNER (vmode);
21927 inner_size = GET_MODE_BITSIZE (imode);
21928 op0 = gen_reg_rtx (vmode);
21929 op1 = gen_reg_rtx (vmode);
21930
21931 if (start1)
21932 emit_insn (gen_vec_shr (op0, d->op0, GEN_INT (start1 * inner_size)));
21933 else
21934 emit_move_insn (op0, d->op0);
21935
21936 dop1 = d->op1;
21937 if (d->one_operand_p)
21938 dop1 = d->op0;
21939
21940 int shl_offset = end1 - start1 + 1 - start2 % nelt;
21941 if (shl_offset)
21942 emit_insn (gen_vec_shl (op1, dop1, GEN_INT (shl_offset * inner_size)));
21943 else
21944 emit_move_insn (op1, dop1);
21945
21946 /* Clear lower/upper bits for op0/op1. */
21947 if (clear_op0 || clear_op1)
21948 {
21949 rtx vec[16];
21950 rtx const_vec;
21951 rtx clear;
21952 for (i = 0; i != nelt; i++)
21953 {
21954 if (i < (end1 - start1 + 1))
21955 vec[i] = gen_int_mode ((HOST_WIDE_INT_1U << inner_size) - 1, imode);
21956 else
21957 vec[i] = CONST0_RTX (imode);
21958 }
21959 const_vec = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, vec));
21960 const_vec = validize_mem (force_const_mem (vmode, const_vec));
21961 clear = force_reg (vmode, const_vec);
21962
21963 if (clear_op0)
21964 emit_move_insn (op0, gen_rtx_AND (vmode, op0, clear));
21965 if (clear_op1)
21966 emit_move_insn (op1, gen_rtx_AND (vmode,
21967 gen_rtx_NOT (vmode, clear),
21968 op1));
21969 }
21970
21971 emit_move_insn (d->target, gen_rtx_IOR (vmode, op0, op1));
21972 return true;
21973}
21974
2bf6d935 21975/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
a325bdd1
PB
 21976   and extract-odd permutations of two V4HI, V8QI, V8HI, V16QI, V16HI or V32QI
21977 operands with two "and" and "pack" or two "shift" and "pack" insns.
21978 We should have already failed all two instruction sequences. */
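/* For instance, extract-even of two V8HImode operands is expanded as two
   pand insns with the V4SImode constant { 0xffff, 0xffff, 0xffff, 0xffff }
   followed by one packusdw, while extract-odd uses two psrld $16 insns in
   place of the pand.  */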
2bf6d935
ML
21979
21980static bool
21981expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
21982{
21983 rtx op, dop0, dop1, t;
21984 unsigned i, odd, c, s, nelt = d->nelt;
21985 bool end_perm = false;
21986 machine_mode half_mode;
21987 rtx (*gen_and) (rtx, rtx, rtx);
21988 rtx (*gen_pack) (rtx, rtx, rtx);
21989 rtx (*gen_shift) (rtx, rtx, rtx);
21990
21991 if (d->one_operand_p)
21992 return false;
21993
21994 switch (d->vmode)
21995 {
dd835ec2
UB
21996 case E_V4HImode:
21997 /* Required for "pack". */
21998 if (!TARGET_SSE4_1)
21999 return false;
22000 c = 0xffff;
22001 s = 16;
22002 half_mode = V2SImode;
22003 gen_and = gen_andv2si3;
22004 gen_pack = gen_mmx_packusdw;
22005 gen_shift = gen_lshrv2si3;
22006 break;
2bf6d935
ML
22007 case E_V8HImode:
22008 /* Required for "pack". */
22009 if (!TARGET_SSE4_1)
22010 return false;
22011 c = 0xffff;
22012 s = 16;
22013 half_mode = V4SImode;
22014 gen_and = gen_andv4si3;
22015 gen_pack = gen_sse4_1_packusdw;
22016 gen_shift = gen_lshrv4si3;
22017 break;
a325bdd1
PB
22018 case E_V8QImode:
22019 /* No check as all instructions are SSE2. */
22020 c = 0xff;
22021 s = 8;
22022 half_mode = V4HImode;
22023 gen_and = gen_andv4hi3;
22024 gen_pack = gen_mmx_packuswb;
22025 gen_shift = gen_lshrv4hi3;
22026 break;
2bf6d935
ML
22027 case E_V16QImode:
22028 /* No check as all instructions are SSE2. */
22029 c = 0xff;
22030 s = 8;
22031 half_mode = V8HImode;
22032 gen_and = gen_andv8hi3;
22033 gen_pack = gen_sse2_packuswb;
22034 gen_shift = gen_lshrv8hi3;
22035 break;
22036 case E_V16HImode:
22037 if (!TARGET_AVX2)
22038 return false;
22039 c = 0xffff;
22040 s = 16;
22041 half_mode = V8SImode;
22042 gen_and = gen_andv8si3;
22043 gen_pack = gen_avx2_packusdw;
22044 gen_shift = gen_lshrv8si3;
22045 end_perm = true;
22046 break;
22047 case E_V32QImode:
22048 if (!TARGET_AVX2)
22049 return false;
22050 c = 0xff;
22051 s = 8;
22052 half_mode = V16HImode;
22053 gen_and = gen_andv16hi3;
22054 gen_pack = gen_avx2_packuswb;
22055 gen_shift = gen_lshrv16hi3;
22056 end_perm = true;
22057 break;
22058 default:
dd835ec2 22059 /* Only V4HI, V8QI, V8HI, V16QI, V16HI and V32QI modes
a325bdd1 22060 are more profitable than general shuffles. */
2bf6d935
ML
22061 return false;
22062 }
22063
22064 /* Check that permutation is even or odd. */
22065 odd = d->perm[0];
22066 if (odd > 1)
22067 return false;
22068
22069 for (i = 1; i < nelt; ++i)
22070 if (d->perm[i] != 2 * i + odd)
22071 return false;
22072
22073 if (d->testing_p)
22074 return true;
22075
22076 dop0 = gen_reg_rtx (half_mode);
22077 dop1 = gen_reg_rtx (half_mode);
22078 if (odd == 0)
22079 {
22080 t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
22081 t = force_reg (half_mode, t);
22082 emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
22083 emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
22084 }
22085 else
22086 {
22087 emit_insn (gen_shift (dop0,
22088 gen_lowpart (half_mode, d->op0),
22089 GEN_INT (s)));
22090 emit_insn (gen_shift (dop1,
22091 gen_lowpart (half_mode, d->op1),
22092 GEN_INT (s)));
22093 }
 22094   /* In the AVX2 256-bit case we need to permute the pack result.  */
22095 if (TARGET_AVX2 && end_perm)
22096 {
22097 op = gen_reg_rtx (d->vmode);
22098 t = gen_reg_rtx (V4DImode);
22099 emit_insn (gen_pack (op, dop0, dop1));
22100 emit_insn (gen_avx2_permv4di_1 (t,
22101 gen_lowpart (V4DImode, op),
22102 const0_rtx,
22103 const2_rtx,
22104 const1_rtx,
22105 GEN_INT (3)));
22106 emit_move_insn (d->target, gen_lowpart (d->vmode, t));
22107 }
22108 else
22109 emit_insn (gen_pack (d->target, dop0, dop1));
22110
22111 return true;
22112}
22113
22114/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
22115 and extract-odd permutations of two V64QI operands
22116 with two "shifts", two "truncs" and one "concat" insns for "odd"
22117 and two "truncs" and one concat insn for "even."
 22118   We should have already failed all two-instruction sequences.  */
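/* E.g. the "odd" case becomes vpsrlw $8 on both operands viewed as
   V32HImode, two vpmovwb truncations and one vinserti64x4, while the
   "even" case needs only the truncations and the concat.  */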
22119
22120static bool
22121expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
22122{
22123 rtx t1, t2, t3, t4;
22124 unsigned i, odd, nelt = d->nelt;
22125
22126 if (!TARGET_AVX512BW
22127 || d->one_operand_p
22128 || d->vmode != V64QImode)
22129 return false;
22130
22131 /* Check that permutation is even or odd. */
22132 odd = d->perm[0];
22133 if (odd > 1)
22134 return false;
22135
22136 for (i = 1; i < nelt; ++i)
22137 if (d->perm[i] != 2 * i + odd)
22138 return false;
22139
22140 if (d->testing_p)
22141 return true;
22142
22143
22144 if (odd)
22145 {
22146 t1 = gen_reg_rtx (V32HImode);
22147 t2 = gen_reg_rtx (V32HImode);
22148 emit_insn (gen_lshrv32hi3 (t1,
22149 gen_lowpart (V32HImode, d->op0),
22150 GEN_INT (8)));
22151 emit_insn (gen_lshrv32hi3 (t2,
22152 gen_lowpart (V32HImode, d->op1),
22153 GEN_INT (8)));
22154 }
22155 else
22156 {
22157 t1 = gen_lowpart (V32HImode, d->op0);
22158 t2 = gen_lowpart (V32HImode, d->op1);
22159 }
22160
22161 t3 = gen_reg_rtx (V32QImode);
22162 t4 = gen_reg_rtx (V32QImode);
22163 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
22164 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
22165 emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
22166
22167 return true;
22168}
22169
4bf4c103 22170/* A subroutine of ix86_expand_vec_perm_const_1. Implement extract-even
2bf6d935
ML
22171 and extract-odd permutations. */
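/* An extract-even permutation selects elements { 0, 2, 4, ... } from the
   concatenation of the two operands and extract-odd selects
   { 1, 3, 5, ... }; ODD selects between the two.  */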
22172
22173static bool
22174expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
22175{
22176 rtx t1, t2, t3, t4, t5;
22177
22178 switch (d->vmode)
22179 {
22180 case E_V4DFmode:
22181 if (d->testing_p)
22182 break;
22183 t1 = gen_reg_rtx (V4DFmode);
22184 t2 = gen_reg_rtx (V4DFmode);
22185
22186 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
22187 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
22188 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
22189
22190 /* Now an unpck[lh]pd will produce the result required. */
22191 if (odd)
22192 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
22193 else
22194 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
22195 emit_insn (t3);
22196 break;
22197
22198 case E_V8SFmode:
22199 {
22200 int mask = odd ? 0xdd : 0x88;
22201
22202 if (d->testing_p)
22203 break;
22204 t1 = gen_reg_rtx (V8SFmode);
22205 t2 = gen_reg_rtx (V8SFmode);
22206 t3 = gen_reg_rtx (V8SFmode);
22207
22208 /* Shuffle within the 128-bit lanes to produce:
22209 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
22210 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
22211 GEN_INT (mask)));
22212
22213 /* Shuffle the lanes around to produce:
22214 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
22215 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
22216 GEN_INT (0x3)));
22217
22218 /* Shuffle within the 128-bit lanes to produce:
22219 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
22220 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
22221
22222 /* Shuffle within the 128-bit lanes to produce:
22223 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
22224 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
22225
22226 /* Shuffle the lanes around to produce:
22227 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
22228 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
22229 GEN_INT (0x20)));
22230 }
22231 break;
22232
22233 case E_V2DFmode:
22234 case E_V4SFmode:
22235 case E_V2DImode:
9b8579a6 22236 case E_V2SImode:
2bf6d935 22237 case E_V4SImode:
8d7dae0e 22238 case E_V2HImode:
2bf6d935
ML
22239 /* These are always directly implementable by expand_vec_perm_1. */
22240 gcc_unreachable ();
22241
240198fe
UB
22242 case E_V2SFmode:
22243 gcc_assert (TARGET_MMX_WITH_SSE);
22244 /* We have no suitable instructions. */
22245 if (d->testing_p)
22246 return false;
22247 break;
22248
be8749f9
UB
22249 case E_V4QImode:
22250 if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
22251 return expand_vec_perm_pshufb2 (d);
22252 else
22253 {
22254 if (d->testing_p)
22255 break;
22256 /* We need 2*log2(N)-1 operations to achieve odd/even
22257 with interleave. */
22258 t1 = gen_reg_rtx (V4QImode);
22259 emit_insn (gen_mmx_punpckhbw_low (t1, d->op0, d->op1));
22260 emit_insn (gen_mmx_punpcklbw_low (d->target, d->op0, d->op1));
22261 if (odd)
22262 t2 = gen_mmx_punpckhbw_low (d->target, d->target, t1);
22263 else
22264 t2 = gen_mmx_punpcklbw_low (d->target, d->target, t1);
22265 emit_insn (t2);
22266 }
22267 break;
22268
9b8579a6 22269 case E_V4HImode:
dd835ec2
UB
22270 if (TARGET_SSE4_1)
22271 return expand_vec_perm_even_odd_pack (d);
22272 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
22273 return expand_vec_perm_pshufb2 (d);
9b8579a6 22274 else
dd835ec2
UB
22275 {
22276 if (d->testing_p)
22277 break;
22278 /* We need 2*log2(N)-1 operations to achieve odd/even
22279 with interleave. */
22280 t1 = gen_reg_rtx (V4HImode);
22281 emit_insn (gen_mmx_punpckhwd (t1, d->op0, d->op1));
22282 emit_insn (gen_mmx_punpcklwd (d->target, d->op0, d->op1));
22283 if (odd)
22284 t2 = gen_mmx_punpckhwd (d->target, d->target, t1);
22285 else
22286 t2 = gen_mmx_punpcklwd (d->target, d->target, t1);
22287 emit_insn (t2);
22288 }
9b8579a6
UB
22289 break;
22290
2bf6d935
ML
22291 case E_V8HImode:
22292 if (TARGET_SSE4_1)
22293 return expand_vec_perm_even_odd_pack (d);
22294 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
22295 return expand_vec_perm_pshufb2 (d);
22296 else
22297 {
22298 if (d->testing_p)
22299 break;
22300 /* We need 2*log2(N)-1 operations to achieve odd/even
22301 with interleave. */
22302 t1 = gen_reg_rtx (V8HImode);
22303 t2 = gen_reg_rtx (V8HImode);
22304 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
22305 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
22306 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
22307 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
22308 if (odd)
22309 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
22310 else
22311 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
22312 emit_insn (t3);
22313 }
22314 break;
22315
a325bdd1 22316 case E_V8QImode:
2bf6d935
ML
22317 case E_V16QImode:
22318 return expand_vec_perm_even_odd_pack (d);
22319
22320 case E_V16HImode:
22321 case E_V32QImode:
22322 return expand_vec_perm_even_odd_pack (d);
22323
22324 case E_V64QImode:
22325 return expand_vec_perm_even_odd_trunc (d);
22326
22327 case E_V4DImode:
22328 if (!TARGET_AVX2)
22329 {
22330 struct expand_vec_perm_d d_copy = *d;
22331 d_copy.vmode = V4DFmode;
22332 if (d->testing_p)
22333 d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
22334 else
22335 d_copy.target = gen_reg_rtx (V4DFmode);
22336 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
22337 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
22338 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
22339 {
22340 if (!d->testing_p)
22341 emit_move_insn (d->target,
22342 gen_lowpart (V4DImode, d_copy.target));
22343 return true;
22344 }
22345 return false;
22346 }
22347
22348 if (d->testing_p)
22349 break;
22350
22351 t1 = gen_reg_rtx (V4DImode);
22352 t2 = gen_reg_rtx (V4DImode);
22353
22354 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
22355 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
22356 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
22357
 22358      /* Now a vpunpck[lh]qdq will produce the result required.  */
22359 if (odd)
22360 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
22361 else
22362 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
22363 emit_insn (t3);
22364 break;
22365
22366 case E_V8SImode:
22367 if (!TARGET_AVX2)
22368 {
22369 struct expand_vec_perm_d d_copy = *d;
22370 d_copy.vmode = V8SFmode;
22371 if (d->testing_p)
22372 d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
22373 else
22374 d_copy.target = gen_reg_rtx (V8SFmode);
22375 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
22376 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
22377 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
22378 {
22379 if (!d->testing_p)
22380 emit_move_insn (d->target,
22381 gen_lowpart (V8SImode, d_copy.target));
22382 return true;
22383 }
22384 return false;
22385 }
22386
22387 if (d->testing_p)
22388 break;
22389
22390 t1 = gen_reg_rtx (V8SImode);
22391 t2 = gen_reg_rtx (V8SImode);
22392 t3 = gen_reg_rtx (V4DImode);
22393 t4 = gen_reg_rtx (V4DImode);
22394 t5 = gen_reg_rtx (V4DImode);
22395
22396 /* Shuffle the lanes around into
22397 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
22398 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
22399 gen_lowpart (V4DImode, d->op1),
22400 GEN_INT (0x20)));
22401 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
22402 gen_lowpart (V4DImode, d->op1),
22403 GEN_INT (0x31)));
22404
22405 /* Swap the 2nd and 3rd position in each lane into
22406 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
22407 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
22408 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
22409 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
22410 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
22411
 22412      /* Now a vpunpck[lh]qdq will produce
22413 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
22414 if (odd)
22415 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
22416 gen_lowpart (V4DImode, t2));
22417 else
22418 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
22419 gen_lowpart (V4DImode, t2));
22420 emit_insn (t3);
22421 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
22422 break;
22423
22424 default:
22425 gcc_unreachable ();
22426 }
22427
22428 return true;
22429}
22430
4bf4c103 22431/* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
2bf6d935
ML
22432 extract-even and extract-odd permutations. */
22433
22434static bool
22435expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
22436{
22437 unsigned i, odd, nelt = d->nelt;
22438
22439 odd = d->perm[0];
22440 if (odd != 0 && odd != 1)
22441 return false;
22442
22443 for (i = 1; i < nelt; ++i)
22444 if (d->perm[i] != 2 * i + odd)
22445 return false;
22446
50b58779
JJ
22447 if (d->vmode == E_V32HImode
22448 && d->testing_p
22449 && !TARGET_AVX512BW)
22450 return false;
22451
2bf6d935
ML
22452 return expand_vec_perm_even_odd_1 (d, odd);
22453}
22454
4bf4c103 22455/* A subroutine of ix86_expand_vec_perm_const_1. Implement broadcast
2bf6d935
ML
22456 permutations. We assume that expand_vec_perm_1 has already failed. */
22457
22458static bool
22459expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
22460{
22461 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
22462 machine_mode vmode = d->vmode;
be8749f9 22463 rtx (*gen) (rtx, rtx, rtx);
2bf6d935
ML
22464 unsigned char perm2[4];
22465 rtx op0 = d->op0, dest;
22466 bool ok;
22467
22468 switch (vmode)
22469 {
22470 case E_V4DFmode:
22471 case E_V8SFmode:
22472 /* These are special-cased in sse.md so that we can optionally
22473 use the vbroadcast instruction. They expand to two insns
22474 if the input happens to be in a register. */
22475 gcc_unreachable ();
22476
22477 case E_V2DFmode:
240198fe 22478 case E_V2SFmode:
2bf6d935 22479 case E_V4SFmode:
240198fe 22480 case E_V2DImode:
9b8579a6 22481 case E_V2SImode:
2bf6d935 22482 case E_V4SImode:
8d7dae0e
UB
22483 case E_V2HImode:
22484 case E_V4HImode:
2bf6d935
ML
22485 /* These are always implementable using standard shuffle patterns. */
22486 gcc_unreachable ();
22487
be8749f9
UB
22488 case E_V4QImode:
22489 /* This can be implemented via interleave and pshuflw. */
22490 if (d->testing_p)
22491 return true;
22492
22493 if (elt >= nelt2)
22494 {
22495 gen = gen_mmx_punpckhbw_low;
22496 elt -= nelt2;
22497 }
22498 else
22499 gen = gen_mmx_punpcklbw_low;
22500
22501 dest = gen_reg_rtx (vmode);
22502 emit_insn (gen (dest, op0, op0));
22503 vmode = get_mode_wider_vector (vmode);
22504 op0 = gen_lowpart (vmode, dest);
22505
22506 memset (perm2, elt, 2);
22507 dest = gen_reg_rtx (vmode);
22508 ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
22509 gcc_assert (ok);
22510
22511 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
22512 return true;
22513
a325bdd1 22514 case E_V8QImode:
be8749f9 22515 /* This can be implemented via interleave. We save one insn by
a325bdd1
PB
22516 stopping once we have promoted to V2SImode and then use pshufd. */
22517 if (d->testing_p)
22518 return true;
22519 do
22520 {
a325bdd1
PB
22521 if (elt >= nelt2)
22522 {
22523 gen = vmode == V8QImode ? gen_mmx_punpckhbw
22524 : gen_mmx_punpckhwd;
22525 elt -= nelt2;
22526 }
be8749f9
UB
22527 else
22528 gen = vmode == V8QImode ? gen_mmx_punpcklbw
22529 : gen_mmx_punpcklwd;
a325bdd1
PB
22530 nelt2 /= 2;
22531
22532 dest = gen_reg_rtx (vmode);
22533 emit_insn (gen (dest, op0, op0));
22534 vmode = get_mode_wider_vector (vmode);
22535 op0 = gen_lowpart (vmode, dest);
22536 }
22537 while (vmode != V2SImode);
22538
22539 memset (perm2, elt, 2);
be8749f9 22540 dest = gen_reg_rtx (vmode);
a325bdd1
PB
22541 ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
22542 gcc_assert (ok);
be8749f9
UB
22543
22544 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
a325bdd1
PB
22545 return true;
22546
2bf6d935
ML
22547 case E_V8HImode:
22548 case E_V16QImode:
22549 /* These can be implemented via interleave. We save one insn by
22550 stopping once we have promoted to V4SImode and then use pshufd. */
22551 if (d->testing_p)
22552 return true;
22553 do
22554 {
2bf6d935
ML
22555 if (elt >= nelt2)
22556 {
22557 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
22558 : gen_vec_interleave_highv8hi;
22559 elt -= nelt2;
22560 }
be8749f9
UB
22561 else
22562 gen = vmode == V16QImode ? gen_vec_interleave_lowv16qi
22563 : gen_vec_interleave_lowv8hi;
2bf6d935
ML
22564 nelt2 /= 2;
22565
22566 dest = gen_reg_rtx (vmode);
22567 emit_insn (gen (dest, op0, op0));
22568 vmode = get_mode_wider_vector (vmode);
22569 op0 = gen_lowpart (vmode, dest);
22570 }
22571 while (vmode != V4SImode);
22572
22573 memset (perm2, elt, 4);
be8749f9 22574 dest = gen_reg_rtx (vmode);
2bf6d935
ML
22575 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
22576 gcc_assert (ok);
be8749f9
UB
22577
22578 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
2bf6d935
ML
22579 return true;
22580
7a54d3de 22581 case E_V8HFmode:
092763fd 22582 case E_V8BFmode:
7a54d3de
UB
22583 /* This can be implemented via interleave and pshufd. */
22584 if (d->testing_p)
22585 return true;
22586
96799fa4 22587 rtx (*gen_interleave) (machine_mode, int, rtx, rtx, rtx);
7a54d3de
UB
22588 if (elt >= nelt2)
22589 {
96799fa4 22590 gen_interleave = gen_vec_interleave_high;
7a54d3de
UB
22591 elt -= nelt2;
22592 }
22593 else
96799fa4 22594 gen_interleave = gen_vec_interleave_low;
7a54d3de
UB
22595 nelt2 /= 2;
22596
22597 dest = gen_reg_rtx (vmode);
96799fa4 22598 emit_insn (gen_interleave (vmode, 1, dest, op0, op0));
7a54d3de
UB
22599
22600 vmode = V4SImode;
22601 op0 = gen_lowpart (vmode, dest);
22602
22603 memset (perm2, elt, 4);
22604 dest = gen_reg_rtx (vmode);
22605 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
22606 gcc_assert (ok);
22607
22608 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
22609 return true;
22610
2bf6d935
ML
22611 case E_V32QImode:
22612 case E_V16HImode:
22613 case E_V8SImode:
22614 case E_V4DImode:
22615 /* For AVX2 broadcasts of the first element vpbroadcast* or
22616 vpermq should be used by expand_vec_perm_1. */
22617 gcc_assert (!TARGET_AVX2 || d->perm[0]);
22618 return false;
22619
240f0780
JJ
22620 case E_V64QImode:
22621 gcc_assert (!TARGET_AVX512BW || d->perm[0]);
22622 return false;
22623
04b4f315
JJ
22624 case E_V32HImode:
22625 gcc_assert (!TARGET_AVX512BW);
22626 return false;
22627
2bf6d935
ML
22628 default:
22629 gcc_unreachable ();
22630 }
22631}
22632
4bf4c103 22633/* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
2bf6d935
ML
22634 broadcast permutations. */
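/* A broadcast permutation replicates a single element of the first
   operand, e.g. { 5, 5, ..., 5 } for V16QImode duplicates element 5 into
   every position of the target.  */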
22635
22636static bool
22637expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
22638{
22639 unsigned i, elt, nelt = d->nelt;
22640
22641 if (!d->one_operand_p)
22642 return false;
22643
22644 elt = d->perm[0];
22645 for (i = 1; i < nelt; ++i)
22646 if (d->perm[i] != elt)
22647 return false;
22648
22649 return expand_vec_perm_broadcast_1 (d);
22650}
22651
22652/* Implement arbitrary permutations of two V64QImode operands
22653 with 2 vperm[it]2w, 2 vpshufb and one vpor instruction. */
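/* The word-sized vpermt2w permutations built below place every selected
   byte either in its final position or in the other byte of the same
   word; the two vpshufb masks then pick out the even-numbered and
   odd-numbered destination bytes respectively, and vpor merges them.  */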
22654static bool
22655expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
22656{
22657 if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
22658 return false;
22659
22660 if (d->testing_p)
22661 return true;
22662
22663 struct expand_vec_perm_d ds[2];
22664 rtx rperm[128], vperm, target0, target1;
22665 unsigned int i, nelt;
22666 machine_mode vmode;
22667
22668 nelt = d->nelt;
22669 vmode = V64QImode;
22670
22671 for (i = 0; i < 2; i++)
22672 {
22673 ds[i] = *d;
22674 ds[i].vmode = V32HImode;
22675 ds[i].nelt = 32;
22676 ds[i].target = gen_reg_rtx (V32HImode);
22677 ds[i].op0 = gen_lowpart (V32HImode, d->op0);
22678 ds[i].op1 = gen_lowpart (V32HImode, d->op1);
22679 }
22680
22681 /* Prepare permutations such that the first one takes care of
 22682      putting the even bytes into the right positions or one position
 22683      higher (ds[0]) and the second one takes care of
 22684      putting the odd bytes into the right positions or one position
 22685      lower (ds[1]).  */
22686
22687 for (i = 0; i < nelt; i++)
22688 {
22689 ds[i & 1].perm[i / 2] = d->perm[i] / 2;
22690 if (i & 1)
22691 {
22692 rperm[i] = constm1_rtx;
22693 rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
22694 }
22695 else
22696 {
22697 rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
22698 rperm[i + 64] = constm1_rtx;
22699 }
22700 }
22701
22702 bool ok = expand_vec_perm_1 (&ds[0]);
22703 gcc_assert (ok);
22704 ds[0].target = gen_lowpart (V64QImode, ds[0].target);
22705
22706 ok = expand_vec_perm_1 (&ds[1]);
22707 gcc_assert (ok);
22708 ds[1].target = gen_lowpart (V64QImode, ds[1].target);
22709
22710 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
22711 vperm = force_reg (vmode, vperm);
22712 target0 = gen_reg_rtx (V64QImode);
22713 emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
22714
22715 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
22716 vperm = force_reg (vmode, vperm);
22717 target1 = gen_reg_rtx (V64QImode);
22718 emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
22719
22720 emit_insn (gen_iorv64qi3 (d->target, target0, target1));
22721 return true;
22722}
22723
22724/* Implement arbitrary permutation of two V32QImode and V16QImode operands
22725 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
22726 all the shorter instruction sequences. */
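/* Roughly: for each source operand, one vpshufb collects the bytes that
   stay within their 128-bit lane and another collects the bytes that
   must cross lanes; the cross-lane results are then fixed up with vpermq
   and everything is merged with vpor.  Masks that turn out to be unused
   are skipped.  */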
22727
22728static bool
22729expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
22730{
22731 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
22732 unsigned int i, nelt, eltsz;
22733 bool used[4];
22734
22735 if (!TARGET_AVX2
22736 || d->one_operand_p
22737 || (d->vmode != V32QImode && d->vmode != V16HImode))
22738 return false;
22739
22740 if (d->testing_p)
22741 return true;
22742
22743 nelt = d->nelt;
22744 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
22745
22746 /* Generate 4 permutation masks. If the required element is within
 22747      the same lane, it is shuffled in.  If the required element comes from
 22748      the other lane, force a zero by setting bit 7 in the permutation mask.
 22749      In the other mask the elements are non-negative if the element
 22750      is requested from the other lane, but they are also moved to the other
 22751      lane, so that the result of vpshufb can have the two V2TImode halves
22752 swapped. */
22753 m128 = GEN_INT (-128);
22754 for (i = 0; i < 32; ++i)
22755 {
22756 rperm[0][i] = m128;
22757 rperm[1][i] = m128;
22758 rperm[2][i] = m128;
22759 rperm[3][i] = m128;
22760 }
22761 used[0] = false;
22762 used[1] = false;
22763 used[2] = false;
22764 used[3] = false;
22765 for (i = 0; i < nelt; ++i)
22766 {
22767 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
22768 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
22769 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
22770
22771 for (j = 0; j < eltsz; ++j)
22772 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
22773 used[which] = true;
22774 }
22775
22776 for (i = 0; i < 2; ++i)
22777 {
22778 if (!used[2 * i + 1])
22779 {
22780 h[i] = NULL_RTX;
22781 continue;
22782 }
22783 vperm = gen_rtx_CONST_VECTOR (V32QImode,
22784 gen_rtvec_v (32, rperm[2 * i + 1]));
22785 vperm = force_reg (V32QImode, vperm);
22786 h[i] = gen_reg_rtx (V32QImode);
22787 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
22788 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
22789 }
22790
 22791   /* Swap the 128-bit lanes of h[X].  */
22792 for (i = 0; i < 2; ++i)
22793 {
22794 if (h[i] == NULL_RTX)
22795 continue;
22796 op = gen_reg_rtx (V4DImode);
22797 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
22798 const2_rtx, GEN_INT (3), const0_rtx,
22799 const1_rtx));
22800 h[i] = gen_lowpart (V32QImode, op);
22801 }
22802
22803 for (i = 0; i < 2; ++i)
22804 {
22805 if (!used[2 * i])
22806 {
22807 l[i] = NULL_RTX;
22808 continue;
22809 }
22810 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
22811 vperm = force_reg (V32QImode, vperm);
22812 l[i] = gen_reg_rtx (V32QImode);
22813 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
22814 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
22815 }
22816
22817 for (i = 0; i < 2; ++i)
22818 {
22819 if (h[i] && l[i])
22820 {
22821 op = gen_reg_rtx (V32QImode);
22822 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
22823 l[i] = op;
22824 }
22825 else if (h[i])
22826 l[i] = h[i];
22827 }
22828
22829 gcc_assert (l[0] && l[1]);
22830 op = d->target;
22831 if (d->vmode != V32QImode)
22832 op = gen_reg_rtx (V32QImode);
22833 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
22834 if (op != d->target)
22835 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
22836 return true;
22837}
22838
22839/* The guts of ix86_vectorize_vec_perm_const. With all of the interface bits
22840 taken care of, perform the expansion in D and return true on success. */
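/* The strategies below are tried roughly in order of increasing
   instruction count: single-insn expansions first, then two-, three- and
   four-insn sequences, and finally the long multi-insn fallbacks.  */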
22841
22842static bool
22843ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
22844{
22845 /* Try a single instruction expansion. */
22846 if (expand_vec_perm_1 (d))
22847 return true;
22848
22849 /* Try sequences of two instructions. */
22850
22851 if (expand_vec_perm_pshuflw_pshufhw (d))
22852 return true;
22853
22854 if (expand_vec_perm_palignr (d, false))
22855 return true;
22856
22857 if (expand_vec_perm_interleave2 (d))
22858 return true;
22859
22860 if (expand_vec_perm_broadcast (d))
22861 return true;
22862
22863 if (expand_vec_perm_vpermq_perm_1 (d))
22864 return true;
22865
22866 if (expand_vec_perm_vperm2f128 (d))
22867 return true;
22868
22869 if (expand_vec_perm_pblendv (d))
22870 return true;
22871
829c4bea
JJ
22872 if (expand_vec_perm_2perm_interleave (d, true))
22873 return true;
22874
22875 if (expand_vec_perm_2perm_pblendv (d, true))
22876 return true;
22877
3db8e9c2 22878 if (expand_vec_perm_shufps_shufps (d))
22879 return true;
22880
2bf6d935
ML
22881 /* Try sequences of three instructions. */
22882
22883 if (expand_vec_perm_even_odd_pack (d))
22884 return true;
22885
22886 if (expand_vec_perm_2vperm2f128_vshuf (d))
22887 return true;
22888
22889 if (expand_vec_perm_pshufb2 (d))
22890 return true;
22891
fcda0efc 22892 if (expand_vec_perm_pslldq_psrldq_por (d, false))
22893 return true;
22894
2bf6d935
ML
22895 if (expand_vec_perm_interleave3 (d))
22896 return true;
22897
22898 if (expand_vec_perm_vperm2f128_vblend (d))
22899 return true;
22900
829c4bea
JJ
22901 if (expand_vec_perm_2perm_interleave (d, false))
22902 return true;
22903
22904 if (expand_vec_perm_2perm_pblendv (d, false))
22905 return true;
22906
2bf6d935
ML
22907 /* Try sequences of four instructions. */
22908
22909 if (expand_vec_perm_even_odd_trunc (d))
22910 return true;
22911 if (expand_vec_perm_vpshufb2_vpermq (d))
22912 return true;
22913
22914 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
22915 return true;
22916
22917 if (expand_vec_perm_vpermt2_vpshub2 (d))
22918 return true;
22919
22920 /* ??? Look for narrow permutations whose element orderings would
22921 allow the promotion to a wider mode. */
22922
22923 /* ??? Look for sequences of interleave or a wider permute that place
22924 the data into the correct lanes for a half-vector shuffle like
22925 pshuf[lh]w or vpermilps. */
22926
22927 /* ??? Look for sequences of interleave that produce the desired results.
22928 The combinatorics of punpck[lh] get pretty ugly... */
22929
22930 if (expand_vec_perm_even_odd (d))
22931 return true;
22932
fcda0efc 22933 /* Generate four or five instructions. */
22934 if (expand_vec_perm_pslldq_psrldq_por (d, true))
22935 return true;
22936
2bf6d935
ML
22937 /* Even longer sequences. */
22938 if (expand_vec_perm_vpshufb4_vpermq2 (d))
22939 return true;
22940
22941 /* See if we can get the same permutation in different vector integer
22942 mode. */
22943 struct expand_vec_perm_d nd;
22944 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
22945 {
22946 if (!d->testing_p)
22947 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
22948 return true;
22949 }
22950
4bf4c103
JJ
22951 /* Even longer, including recursion to ix86_expand_vec_perm_const_1. */
22952 if (expand_vec_perm2_vperm2f128_vblend (d))
22953 return true;
22954
2bf6d935
ML
22955 return false;
22956}
22957
22958/* If a permutation only uses one operand, make it clear. Returns true
22959 if the permutation references both operands. */
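/* For example, with nelt == 4 the two-operand selector { 4, 5, 6, 7 }
   only references the second operand; it is rewritten as { 0, 1, 2, 3 }
   with op0 replaced by op1, and false is returned.  */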
22960
22961static bool
22962canonicalize_perm (struct expand_vec_perm_d *d)
22963{
22964 int i, which, nelt = d->nelt;
22965
22966 for (i = which = 0; i < nelt; ++i)
4bf4c103 22967 which |= (d->perm[i] < nelt ? 1 : 2);
2bf6d935
ML
22968
22969 d->one_operand_p = true;
22970 switch (which)
22971 {
22972 default:
22973 gcc_unreachable();
22974
22975 case 3:
22976 if (!rtx_equal_p (d->op0, d->op1))
22977 {
22978 d->one_operand_p = false;
22979 break;
22980 }
22981 /* The elements of PERM do not suggest that only the first operand
22982 is used, but both operands are identical. Allow easier matching
22983 of the permutation by folding the permutation into the single
22984 input vector. */
22985 /* FALLTHRU */
22986
22987 case 2:
22988 for (i = 0; i < nelt; ++i)
22989 d->perm[i] &= nelt - 1;
22990 d->op0 = d->op1;
22991 break;
22992
22993 case 1:
22994 d->op1 = d->op0;
22995 break;
22996 }
22997
22998 return (which == 3);
22999}
23000
23001/* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
23002
23003bool
ae8decf1
PK
23004ix86_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
23005 rtx target, rtx op0, rtx op1,
23006 const vec_perm_indices &sel)
2bf6d935 23007{
ae8decf1
PK
23008 if (vmode != op_mode)
23009 return false;
23010
2bf6d935
ML
23011 struct expand_vec_perm_d d;
23012 unsigned char perm[MAX_VECT_LEN];
23013 unsigned int i, nelt, which;
23014 bool two_args;
23015
be072bfa
HW
 23016   /* For an HF mode vector, convert it to HI mode using a subreg.  */
23017 if (GET_MODE_INNER (vmode) == HFmode)
23018 {
23019 machine_mode orig_mode = vmode;
23020 vmode = mode_for_vector (HImode,
23021 GET_MODE_NUNITS (vmode)).require ();
23022 if (target)
23023 target = lowpart_subreg (vmode, target, orig_mode);
23024 if (op0)
23025 op0 = lowpart_subreg (vmode, op0, orig_mode);
23026 if (op1)
23027 op1 = lowpart_subreg (vmode, op1, orig_mode);
23028 }
23029
2bf6d935
ML
23030 d.target = target;
23031 d.op0 = op0;
23032 d.op1 = op1;
23033
23034 d.vmode = vmode;
23035 gcc_assert (VECTOR_MODE_P (d.vmode));
23036 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
23037 d.testing_p = !target;
23038
23039 gcc_assert (sel.length () == nelt);
23040 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
23041
23042 /* Given sufficient ISA support we can just return true here
23043 for selected vector modes. */
23044 switch (d.vmode)
23045 {
23046 case E_V16SFmode:
23047 case E_V16SImode:
23048 case E_V8DImode:
23049 case E_V8DFmode:
23050 if (!TARGET_AVX512F)
23051 return false;
23052 /* All implementable with a single vperm[it]2 insn. */
23053 if (d.testing_p)
23054 return true;
23055 break;
23056 case E_V32HImode:
50b58779 23057 if (!TARGET_AVX512F)
2bf6d935 23058 return false;
50b58779 23059 if (d.testing_p && TARGET_AVX512BW)
2bf6d935
ML
23060 /* All implementable with a single vperm[it]2 insn. */
23061 return true;
23062 break;
23063 case E_V64QImode:
50b58779 23064 if (!TARGET_AVX512F)
2bf6d935 23065 return false;
50b58779 23066 if (d.testing_p && TARGET_AVX512BW)
2bf6d935
ML
23067 /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */
23068 return true;
23069 break;
23070 case E_V8SImode:
23071 case E_V8SFmode:
23072 case E_V4DFmode:
23073 case E_V4DImode:
23074 if (!TARGET_AVX)
23075 return false;
23076 if (d.testing_p && TARGET_AVX512VL)
23077 /* All implementable with a single vperm[it]2 insn. */
23078 return true;
23079 break;
23080 case E_V16HImode:
23081 if (!TARGET_SSE2)
23082 return false;
23083 if (d.testing_p && TARGET_AVX2)
23084 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
23085 return true;
23086 break;
23087 case E_V32QImode:
23088 if (!TARGET_SSE2)
23089 return false;
23090 if (d.testing_p && TARGET_AVX2)
23091 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
23092 return true;
23093 break;
23094 case E_V8HImode:
23095 case E_V16QImode:
23096 if (!TARGET_SSE2)
23097 return false;
23098 /* Fall through. */
23099 case E_V4SImode:
23100 case E_V4SFmode:
23101 if (!TARGET_SSE)
23102 return false;
23103 /* All implementable with a single vpperm insn. */
23104 if (d.testing_p && TARGET_XOP)
23105 return true;
23106 /* All implementable with 2 pshufb + 1 ior. */
23107 if (d.testing_p && TARGET_SSSE3)
23108 return true;
23109 break;
240198fe 23110 case E_V2SFmode:
9b8579a6
UB
23111 case E_V2SImode:
23112 case E_V4HImode:
a325bdd1 23113 case E_V8QImode:
9b8579a6
UB
23114 if (!TARGET_MMX_WITH_SSE)
23115 return false;
23116 break;
8d7dae0e 23117 case E_V2HImode:
4986946f
UB
23118 if (!TARGET_SSE2)
23119 return false;
23120 /* All implementable with *punpckwd. */
23121 if (d.testing_p)
23122 return true;
23123 break;
be8749f9
UB
23124 case E_V4QImode:
23125 if (!TARGET_SSE2)
23126 return false;
23127 break;
2bf6d935
ML
23128 case E_V2DImode:
23129 case E_V2DFmode:
23130 if (!TARGET_SSE)
23131 return false;
23132 /* All implementable with shufpd or unpck[lh]pd. */
23133 if (d.testing_p)
23134 return true;
23135 break;
23136 default:
23137 return false;
23138 }
23139
23140 for (i = which = 0; i < nelt; ++i)
23141 {
23142 unsigned char e = sel[i];
23143 gcc_assert (e < 2 * nelt);
23144 d.perm[i] = e;
23145 perm[i] = e;
23146 which |= (e < nelt ? 1 : 2);
23147 }
23148
23149 if (d.testing_p)
23150 {
 23151       /* If all elements are from the second vector, fold them to the first.  */
23152 if (which == 2)
23153 for (i = 0; i < nelt; ++i)
23154 d.perm[i] -= nelt;
23155
23156 /* Check whether the mask can be applied to the vector type. */
23157 d.one_operand_p = (which != 3);
23158
8d7dae0e 23159 /* Implementable with shufps, pshufd or pshuflw. */
9b8579a6 23160 if (d.one_operand_p
240198fe 23161 && (d.vmode == V4SFmode || d.vmode == V2SFmode
8d7dae0e
UB
23162 || d.vmode == V4SImode || d.vmode == V2SImode
23163 || d.vmode == V4HImode || d.vmode == V2HImode))
2bf6d935
ML
23164 return true;
23165
23166 /* Otherwise we have to go through the motions and see if we can
23167 figure out how to generate the requested permutation. */
23168 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
23169 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
23170 if (!d.one_operand_p)
23171 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
23172
23173 start_sequence ();
23174 bool ret = ix86_expand_vec_perm_const_1 (&d);
23175 end_sequence ();
23176
23177 return ret;
23178 }
23179
23180 two_args = canonicalize_perm (&d);
23181
b1d1e2b5
JJ
23182 /* If one of the operands is a zero vector, try to match pmovzx. */
23183 if (two_args && (d.op0 == CONST0_RTX (vmode) || d.op1 == CONST0_RTX (vmode)))
23184 {
23185 struct expand_vec_perm_d dzero = d;
23186 if (d.op0 == CONST0_RTX (vmode))
23187 {
23188 d.op1 = dzero.op1 = force_reg (vmode, d.op1);
23189 std::swap (dzero.op0, dzero.op1);
23190 for (i = 0; i < nelt; ++i)
23191 dzero.perm[i] ^= nelt;
23192 }
23193 else
23194 d.op0 = dzero.op0 = force_reg (vmode, d.op0);
23195
23196 if (expand_vselect_vconcat (dzero.target, dzero.op0, dzero.op1,
23197 dzero.perm, nelt, dzero.testing_p))
23198 return true;
23199 }
23200
23201 /* Force operands into registers. */
23202 rtx nop0 = force_reg (vmode, d.op0);
23203 if (d.op0 == d.op1)
23204 d.op1 = nop0;
23205 d.op0 = nop0;
23206 d.op1 = force_reg (vmode, d.op1);
23207
2bf6d935
ML
23208 if (ix86_expand_vec_perm_const_1 (&d))
23209 return true;
23210
23211 /* If the selector says both arguments are needed, but the operands are the
23212 same, the above tried to expand with one_operand_p and flattened selector.
23213 If that didn't work, retry without one_operand_p; we succeeded with that
23214 during testing. */
23215 if (two_args && d.one_operand_p)
23216 {
23217 d.one_operand_p = false;
23218 memcpy (d.perm, perm, sizeof (perm));
23219 return ix86_expand_vec_perm_const_1 (&d);
23220 }
23221
23222 return false;
23223}
23224
23225void
23226ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
23227{
23228 struct expand_vec_perm_d d;
23229 unsigned i, nelt;
23230
23231 d.target = targ;
23232 d.op0 = op0;
23233 d.op1 = op1;
23234 d.vmode = GET_MODE (targ);
23235 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
23236 d.one_operand_p = false;
23237 d.testing_p = false;
23238
23239 for (i = 0; i < nelt; ++i)
23240 d.perm[i] = i * 2 + odd;
23241
23242 /* We'll either be able to implement the permutation directly... */
23243 if (expand_vec_perm_1 (&d))
23244 return;
23245
23246 /* ... or we use the special-case patterns. */
23247 expand_vec_perm_even_odd_1 (&d, odd);
23248}
23249
23250static void
23251ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
23252{
23253 struct expand_vec_perm_d d;
23254 unsigned i, nelt, base;
23255 bool ok;
23256
23257 d.target = targ;
23258 d.op0 = op0;
23259 d.op1 = op1;
23260 d.vmode = GET_MODE (targ);
23261 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
23262 d.one_operand_p = false;
23263 d.testing_p = false;
23264
23265 base = high_p ? nelt / 2 : 0;
23266 for (i = 0; i < nelt / 2; ++i)
23267 {
23268 d.perm[i * 2] = i + base;
23269 d.perm[i * 2 + 1] = i + base + nelt;
23270 }
23271
23272 /* Note that for AVX this isn't one instruction. */
23273 ok = ix86_expand_vec_perm_const_1 (&d);
23274 gcc_assert (ok);
23275}
23276
c7199fb6 23277/* Expand a vector shift by a constant for V*QImode in terms of the
 23278   same operation on V*HImode.  Return true on success.  */
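/* E.g. an arithmetic right shift of a V16QImode vector by 2 becomes
   psraw $2 on the V8HImode view, pand with a vector of 0x3f bytes, then
   pxor and psubb with a vector of 0x20 bytes to restore the sign of each
   byte.  */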
3bd86940 23279static bool
23280ix86_expand_vec_shift_qihi_constant (enum rtx_code code,
23281 rtx dest, rtx op1, rtx op2)
c7199fb6 23282{
23283 machine_mode qimode, himode;
c44c2a3b 23284 HOST_WIDE_INT and_constant, xor_constant;
c7199fb6 23285 HOST_WIDE_INT shift_amount;
23286 rtx vec_const_and, vec_const_xor;
23287 rtx tmp, op1_subreg;
23288 rtx (*gen_shift) (rtx, rtx, rtx);
23289 rtx (*gen_and) (rtx, rtx, rtx);
23290 rtx (*gen_xor) (rtx, rtx, rtx);
23291 rtx (*gen_sub) (rtx, rtx, rtx);
23292
23293 /* Only optimize shift by constant. */
23294 if (!CONST_INT_P (op2))
23295 return false;
23296
23297 qimode = GET_MODE (dest);
23298 shift_amount = INTVAL (op2);
 23299   /* Do nothing when the shift amount is greater than or equal to 8.  */
23300 if (shift_amount > 7)
23301 return false;
23302
23303 gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
23304 /* Record sign bit. */
23305 xor_constant = 1 << (8 - shift_amount - 1);
23306
 23307   /* Zero the upper/lower bits shifted in from the adjacent byte element.  */
23308 and_constant
23309 = (code == ASHIFT ? 256 - (1 << shift_amount)
23310 : (1 << (8 - shift_amount)) - 1);
23311
23312 switch (qimode)
23313 {
23314 case V16QImode:
23315 himode = V8HImode;
23316 gen_shift =
23317 ((code == ASHIFT)
23318 ? gen_ashlv8hi3
23319 : (code == ASHIFTRT) ? gen_ashrv8hi3 : gen_lshrv8hi3);
23320 gen_and = gen_andv16qi3;
23321 gen_xor = gen_xorv16qi3;
23322 gen_sub = gen_subv16qi3;
23323 break;
23324 case V32QImode:
23325 himode = V16HImode;
23326 gen_shift =
23327 ((code == ASHIFT)
23328 ? gen_ashlv16hi3
23329 : (code == ASHIFTRT) ? gen_ashrv16hi3 : gen_lshrv16hi3);
23330 gen_and = gen_andv32qi3;
23331 gen_xor = gen_xorv32qi3;
23332 gen_sub = gen_subv32qi3;
23333 break;
23334 case V64QImode:
23335 himode = V32HImode;
23336 gen_shift =
23337 ((code == ASHIFT)
23338 ? gen_ashlv32hi3
23339 : (code == ASHIFTRT) ? gen_ashrv32hi3 : gen_lshrv32hi3);
23340 gen_and = gen_andv64qi3;
23341 gen_xor = gen_xorv64qi3;
23342 gen_sub = gen_subv64qi3;
23343 break;
23344 default:
23345 gcc_unreachable ();
23346 }
23347
23348 tmp = gen_reg_rtx (himode);
23349 vec_const_and = gen_reg_rtx (qimode);
23350 op1_subreg = lowpart_subreg (himode, op1, qimode);
23351
23352 /* For ASHIFT and LSHIFTRT, perform operation like
23353 vpsllw/vpsrlw $shift_amount, %op1, %dest.
23354 vpand %vec_const_and, %dest. */
23355 emit_insn (gen_shift (tmp, op1_subreg, op2));
23356 emit_move_insn (dest, simplify_gen_subreg (qimode, tmp, himode, 0));
23357 emit_move_insn (vec_const_and,
23358 ix86_build_const_vector (qimode, true,
c44c2a3b 23359 gen_int_mode (and_constant, QImode)));
c7199fb6 23360 emit_insn (gen_and (dest, dest, vec_const_and));
23361
23362 /* For ASHIFTRT, perform extra operation like
23363 vpxor %vec_const_xor, %dest, %dest
23364 vpsubb %vec_const_xor, %dest, %dest */
23365 if (code == ASHIFTRT)
23366 {
23367 vec_const_xor = gen_reg_rtx (qimode);
23368 emit_move_insn (vec_const_xor,
23369 ix86_build_const_vector (qimode, true,
c44c2a3b 23370 gen_int_mode (xor_constant, QImode)));
c7199fb6 23371 emit_insn (gen_xor (dest, dest, vec_const_xor));
23372 emit_insn (gen_sub (dest, dest, vec_const_xor));
23373 }
23374 return true;
23375}
23376
fe7b9c2e
UB
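/* Expand a vector operation CODE for the partial V4QImode and V8QImode
   vectors in terms of the same operation on V8HImode, going through a
   full V16QImode register.  */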
23377void
23378ix86_expand_vecop_qihi_partial (enum rtx_code code, rtx dest, rtx op1, rtx op2)
23379{
23380 machine_mode qimode = GET_MODE (dest);
52ff3f7b 23381 rtx qop1, qop2, hop1, hop2, qdest, hdest;
fe7b9c2e 23382 bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
52ff3f7b 23383 bool uns_p = code != ASHIFTRT;
fe7b9c2e
UB
23384
23385 switch (qimode)
23386 {
23387 case E_V4QImode:
23388 case E_V8QImode:
23389 break;
23390 default:
23391 gcc_unreachable ();
23392 }
23393
23394 qop1 = lowpart_subreg (V16QImode, force_reg (qimode, op1), qimode);
23395
23396 if (op2vec)
23397 qop2 = lowpart_subreg (V16QImode, force_reg (qimode, op2), qimode);
23398 else
23399 qop2 = op2;
23400
c53f5100
UB
23401 qdest = gen_reg_rtx (V16QImode);
23402
23403 if (CONST_INT_P (op2)
23404 && (code == ASHIFT || code == LSHIFTRT || code == ASHIFTRT)
23405 && ix86_expand_vec_shift_qihi_constant (code, qdest, qop1, qop2))
23406 {
23407 emit_move_insn (dest, gen_lowpart (qimode, qdest));
23408 return;
23409 }
23410
fe7b9c2e
UB
23411 switch (code)
23412 {
23413 case MULT:
23414 gcc_assert (op2vec);
52ff3f7b
UB
23415 if (!TARGET_SSE4_1)
23416 {
23417 /* Unpack data such that we've got a source byte in each low byte
23418 of each word. We don't care what goes into the high byte of
23419 each word. Rather than trying to get zero in there, most
23420 convenient is to let it be a copy of the low byte. */
23421 hop1 = copy_to_reg (qop1);
23422 hop2 = copy_to_reg (qop2);
23423 emit_insn (gen_vec_interleave_lowv16qi (hop1, hop1, hop1));
23424 emit_insn (gen_vec_interleave_lowv16qi (hop2, hop2, hop2));
23425 break;
23426 }
fe7b9c2e
UB
23427 /* FALLTHRU */
23428 case ASHIFT:
52ff3f7b 23429 case ASHIFTRT:
fe7b9c2e
UB
23430 case LSHIFTRT:
23431 hop1 = gen_reg_rtx (V8HImode);
23432 ix86_expand_sse_unpack (hop1, qop1, uns_p, false);
52ff3f7b 23433 /* mult/vashr/vlshr/vashl */
fe7b9c2e
UB
23434 if (op2vec)
23435 {
23436 hop2 = gen_reg_rtx (V8HImode);
23437 ix86_expand_sse_unpack (hop2, qop2, uns_p, false);
23438 }
23439 else
23440 hop2 = qop2;
23441
23442 break;
23443 default:
23444 gcc_unreachable ();
23445 }
23446
23447 if (code != MULT && op2vec)
23448 {
23449 /* Expand vashr/vlshr/vashl. */
52ff3f7b
UB
23450 hdest = gen_reg_rtx (V8HImode);
23451 emit_insn (gen_rtx_SET (hdest,
fe7b9c2e
UB
23452 simplify_gen_binary (code, V8HImode,
23453 hop1, hop2)));
23454 }
23455 else
23456 /* Expand mult/ashr/lshr/ashl. */
52ff3f7b 23457 hdest = expand_simple_binop (V8HImode, code, hop1, hop2,
fe7b9c2e
UB
23458 NULL_RTX, 1, OPTAB_DIRECT);
23459
23460 if (TARGET_AVX512BW && TARGET_AVX512VL)
23461 {
23462 if (qimode == V8QImode)
23463 qdest = dest;
23464 else
23465 qdest = gen_reg_rtx (V8QImode);
23466
52ff3f7b 23467 emit_insn (gen_truncv8hiv8qi2 (qdest, hdest));
fe7b9c2e
UB
23468 }
23469 else
23470 {
23471 struct expand_vec_perm_d d;
52ff3f7b 23472 rtx qres = gen_lowpart (V16QImode, hdest);
fe7b9c2e
UB
23473 bool ok;
23474 int i;
23475
fe7b9c2e
UB
23476 /* Merge the data back into the right place. */
23477 d.target = qdest;
52ff3f7b 23478 d.op0 = d.op1 = qres;
fe7b9c2e
UB
23479 d.vmode = V16QImode;
23480 d.nelt = 16;
23481 d.one_operand_p = false;
23482 d.testing_p = false;
23483
23484 for (i = 0; i < d.nelt; ++i)
23485 d.perm[i] = i * 2;
23486
23487 ok = ix86_expand_vec_perm_const_1 (&d);
23488 gcc_assert (ok);
23489 }
23490
23491 if (qdest != dest)
23492 emit_move_insn (dest, gen_lowpart (qimode, qdest));
23493}
23494
52ff3f7b
UB
 23495/* Emit the instruction in a 2x wider mode.  For example, optimize
23496 vector MUL generation like
23497
23498 vpmovzxbw ymm2, xmm0
23499 vpmovzxbw ymm3, xmm1
23500 vpmullw ymm4, ymm2, ymm3
23501 vpmovwb xmm0, ymm4
23502
 23503   which takes fewer instructions than ix86_expand_vecop_qihi.
 23504   Return true on success.  */
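/* When a vpmovwb-style truncation is not available (gen_truncate stays
   NULL below), the narrowing back to QImode is instead done with an
   even-element constant permutation of the wide result.  */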
23505
23506static bool
23507ix86_expand_vecop_qihi2 (enum rtx_code code, rtx dest, rtx op1, rtx op2)
23508{
23509 machine_mode himode, qimode = GET_MODE (dest);
23510 machine_mode wqimode;
23511 rtx qop1, qop2, hop1, hop2, hdest;
23512 rtx (*gen_truncate)(rtx, rtx) = NULL;
23513 bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
23514 bool uns_p = code != ASHIFTRT;
23515
23516 if ((qimode == V16QImode && !TARGET_AVX2)
23517 || (qimode == V32QImode && !TARGET_AVX512BW)
23518 /* There are no V64HImode instructions. */
23519 || qimode == V64QImode)
23520 return false;
23521
 23522   /* Do not generate ymm/zmm instructions when
 23523      the target prefers a 128/256-bit vector width.  */
23524 if ((qimode == V16QImode && TARGET_PREFER_AVX128)
23525 || (qimode == V32QImode && TARGET_PREFER_AVX256))
23526 return false;
23527
23528 switch (qimode)
23529 {
23530 case E_V16QImode:
23531 himode = V16HImode;
3c1e2b76 23532 if (TARGET_AVX512VL && TARGET_AVX512BW)
52ff3f7b
UB
23533 gen_truncate = gen_truncv16hiv16qi2;
23534 break;
23535 case E_V32QImode:
23536 himode = V32HImode;
23537 gen_truncate = gen_truncv32hiv32qi2;
23538 break;
23539 default:
23540 gcc_unreachable ();
23541 }
23542
23543 wqimode = GET_MODE_2XWIDER_MODE (qimode).require ();
23544 qop1 = lowpart_subreg (wqimode, force_reg (qimode, op1), qimode);
23545
23546 if (op2vec)
23547 qop2 = lowpart_subreg (wqimode, force_reg (qimode, op2), qimode);
23548 else
23549 qop2 = op2;
23550
23551 hop1 = gen_reg_rtx (himode);
23552 ix86_expand_sse_unpack (hop1, qop1, uns_p, false);
23553
23554 if (op2vec)
23555 {
23556 hop2 = gen_reg_rtx (himode);
23557 ix86_expand_sse_unpack (hop2, qop2, uns_p, false);
23558 }
23559 else
23560 hop2 = qop2;
23561
2720bbd5
UB
23562 if (code != MULT && op2vec)
23563 {
23564 /* Expand vashr/vlshr/vashl. */
23565 hdest = gen_reg_rtx (himode);
23566 emit_insn (gen_rtx_SET (hdest,
23567 simplify_gen_binary (code, himode,
23568 hop1, hop2)));
23569 }
23570 else
23571 /* Expand mult/ashr/lshr/ashl. */
23572 hdest = expand_simple_binop (himode, code, hop1, hop2,
23573 NULL_RTX, 1, OPTAB_DIRECT);
52ff3f7b
UB
23574
23575 if (gen_truncate)
23576 emit_insn (gen_truncate (dest, hdest));
23577 else
23578 {
23579 struct expand_vec_perm_d d;
23580 rtx wqdest = gen_reg_rtx (wqimode);
23581 rtx wqres = gen_lowpart (wqimode, hdest);
23582 bool ok;
23583 int i;
23584
23585 /* Merge the data back into the right place. */
23586 d.target = wqdest;
23587 d.op0 = d.op1 = wqres;
23588 d.vmode = wqimode;
23589 d.nelt = GET_MODE_NUNITS (wqimode);
23590 d.one_operand_p = false;
23591 d.testing_p = false;
23592
23593 for (i = 0; i < d.nelt; ++i)
23594 d.perm[i] = i * 2;
23595
23596 ok = ix86_expand_vec_perm_const_1 (&d);
23597 gcc_assert (ok);
23598
23599 emit_move_insn (dest, gen_lowpart (qimode, wqdest));
23600 }
23601
23602 return true;
23603}
23604
2bf6d935
ML
23605/* Expand a vector operation CODE for a V*QImode in terms of the
23606 same operation on V*HImode. */
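/* E.g. a V16QImode multiplication is expanded as punpcklbw/punpckhbw on
   each operand, two pmullw on the V8HImode halves, and a final constant
   permutation that gathers the even bytes of the two products into the
   destination.  */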
23607
23608void
23609ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
23610{
23611 machine_mode qimode = GET_MODE (dest);
23612 machine_mode himode;
23613 rtx (*gen_il) (rtx, rtx, rtx);
23614 rtx (*gen_ih) (rtx, rtx, rtx);
23615 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
fe7b9c2e 23616 bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
2bf6d935 23617 struct expand_vec_perm_d d;
00fffa91 23618 bool full_interleave = true;
52ff3f7b 23619 bool uns_p = code != ASHIFTRT;
00fffa91 23620 bool ok;
2bf6d935
ML
23621 int i;
23622
3bd86940 23623 if (CONST_INT_P (op2)
23624 && (code == ASHIFT || code == LSHIFTRT || code == ASHIFTRT)
23625 && ix86_expand_vec_shift_qihi_constant (code, dest, op1, op2))
23626 return;
23627
0368fc54 23628 if (ix86_expand_vecop_qihi2 (code, dest, op1, op2))
3bd86940 23629 return;
23630
2bf6d935
ML
23631 switch (qimode)
23632 {
23633 case E_V16QImode:
23634 himode = V8HImode;
2bf6d935
ML
23635 break;
23636 case E_V32QImode:
23637 himode = V16HImode;
2bf6d935
ML
23638 break;
23639 case E_V64QImode:
23640 himode = V32HImode;
2bf6d935
ML
23641 break;
23642 default:
23643 gcc_unreachable ();
23644 }
23645
2bf6d935
ML
23646 switch (code)
23647 {
23648 case MULT:
fe7b9c2e 23649 gcc_assert (op2vec);
2bf6d935
ML
23650 /* Unpack data such that we've got a source byte in each low byte of
23651 each word. We don't care what goes into the high byte of each word.
23652 Rather than trying to get zero in there, most convenient is to let
23653 it be a copy of the low byte. */
00fffa91
UB
23654 switch (qimode)
23655 {
23656 case E_V16QImode:
23657 gen_il = gen_vec_interleave_lowv16qi;
23658 gen_ih = gen_vec_interleave_highv16qi;
23659 break;
23660 case E_V32QImode:
23661 gen_il = gen_avx2_interleave_lowv32qi;
23662 gen_ih = gen_avx2_interleave_highv32qi;
23663 full_interleave = false;
23664 break;
23665 case E_V64QImode:
23666 gen_il = gen_avx512bw_interleave_lowv64qi;
23667 gen_ih = gen_avx512bw_interleave_highv64qi;
23668 full_interleave = false;
23669 break;
23670 default:
23671 gcc_unreachable ();
23672 }
23673
2bf6d935
ML
23674 op2_l = gen_reg_rtx (qimode);
23675 op2_h = gen_reg_rtx (qimode);
23676 emit_insn (gen_il (op2_l, op2, op2));
23677 emit_insn (gen_ih (op2_h, op2, op2));
23678
23679 op1_l = gen_reg_rtx (qimode);
23680 op1_h = gen_reg_rtx (qimode);
23681 emit_insn (gen_il (op1_l, op1, op1));
23682 emit_insn (gen_ih (op1_h, op1, op1));
2bf6d935
ML
23683 break;
23684
23685 case ASHIFT:
52ff3f7b 23686 case ASHIFTRT:
2bf6d935 23687 case LSHIFTRT:
2bf6d935
ML
23688 op1_l = gen_reg_rtx (himode);
23689 op1_h = gen_reg_rtx (himode);
23690 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
23691 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
3bd86940 23692 /* vashr/vlshr/vashl */
fe7b9c2e 23693 if (op2vec)
3bd86940 23694 {
23695 rtx tmp = force_reg (qimode, op2);
23696 op2_l = gen_reg_rtx (himode);
23697 op2_h = gen_reg_rtx (himode);
23698 ix86_expand_sse_unpack (op2_l, tmp, uns_p, false);
23699 ix86_expand_sse_unpack (op2_h, tmp, uns_p, true);
23700 }
23701 else
23702 op2_l = op2_h = op2;
23703
2bf6d935
ML
23704 break;
23705 default:
23706 gcc_unreachable ();
23707 }
23708
fe7b9c2e 23709 if (code != MULT && op2vec)
3bd86940 23710 {
00fffa91 23711 /* Expand vashr/vlshr/vashl. */
3bd86940 23712 res_l = gen_reg_rtx (himode);
23713 res_h = gen_reg_rtx (himode);
23714 emit_insn (gen_rtx_SET (res_l,
23715 simplify_gen_binary (code, himode,
23716 op1_l, op2_l)));
23717 emit_insn (gen_rtx_SET (res_h,
23718 simplify_gen_binary (code, himode,
23719 op1_h, op2_h)));
23720 }
3bd86940 23721 else
23722 {
00fffa91 23723 /* Expand mult/ashr/lshr/ashl. */
3bd86940 23724 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
23725 1, OPTAB_DIRECT);
23726 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
23727 1, OPTAB_DIRECT);
23728 }
23729
2bf6d935
ML
23730 gcc_assert (res_l && res_h);
23731
23732 /* Merge the data back into the right place. */
23733 d.target = dest;
23734 d.op0 = gen_lowpart (qimode, res_l);
23735 d.op1 = gen_lowpart (qimode, res_h);
23736 d.vmode = qimode;
23737 d.nelt = GET_MODE_NUNITS (qimode);
23738 d.one_operand_p = false;
23739 d.testing_p = false;
23740
23741 if (full_interleave)
23742 {
00fffa91 23743      /* We used the full interleave; the desired
2bf6d935
ML
23744 results are in the even elements. */
23745 for (i = 0; i < d.nelt; ++i)
23746 d.perm[i] = i * 2;
23747 }
23748 else
23749 {
23750 /* For AVX, the interleave used above was not cross-lane. So the
 23751	 extraction is evens, but with the second and third quarters swapped.
23752 Happily, that is even one insn shorter than even extraction.
23753 For AVX512BW we have 4 lanes. We extract evens from within a lane,
23754 always first from the first and then from the second source operand,
 23755	 the index bits above the low 4 bits remain the same.
23756 Thus, for d.nelt == 32 we want permutation
23757 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
23758 and for d.nelt == 64 we want permutation
23759 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
23760 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
23761 for (i = 0; i < d.nelt; ++i)
23762 d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
23763 }
23764
23765 ok = ix86_expand_vec_perm_const_1 (&d);
23766 gcc_assert (ok);
2bf6d935
ML
23767}
23768
23769/* Helper function of ix86_expand_mul_widen_evenodd. Return true
23770 if op is CONST_VECTOR with all odd elements equal to their
23771 preceding element. */
23772
23773static bool
23774const_vector_equal_evenodd_p (rtx op)
23775{
23776 machine_mode mode = GET_MODE (op);
23777 int i, nunits = GET_MODE_NUNITS (mode);
23778 if (GET_CODE (op) != CONST_VECTOR
23779 || nunits != CONST_VECTOR_NUNITS (op))
23780 return false;
23781 for (i = 0; i < nunits; i += 2)
23782 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
23783 return false;
23784 return true;
23785}
23786
23787void
23788ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
23789 bool uns_p, bool odd_p)
23790{
23791 machine_mode mode = GET_MODE (op1);
23792 machine_mode wmode = GET_MODE (dest);
23793 rtx x;
23794 rtx orig_op1 = op1, orig_op2 = op2;
23795
23796 if (!nonimmediate_operand (op1, mode))
23797 op1 = force_reg (mode, op1);
23798 if (!nonimmediate_operand (op2, mode))
23799 op2 = force_reg (mode, op2);
23800
23801 /* We only play even/odd games with vectors of SImode. */
23802 gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
23803
23804 /* If we're looking for the odd results, shift those members down to
23805 the even slots. For some cpus this is faster than a PSHUFD. */
23806 if (odd_p)
23807 {
23808 /* For XOP use vpmacsdqh, but only for smult, as it is only
23809 signed. */
23810 if (TARGET_XOP && mode == V4SImode && !uns_p)
23811 {
23812 x = force_reg (wmode, CONST0_RTX (wmode));
23813 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
23814 return;
23815 }
23816
23817 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
23818 if (!const_vector_equal_evenodd_p (orig_op1))
23819 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
23820 x, NULL, 1, OPTAB_DIRECT);
23821 if (!const_vector_equal_evenodd_p (orig_op2))
23822 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
23823 x, NULL, 1, OPTAB_DIRECT);
23824 op1 = gen_lowpart (mode, op1);
23825 op2 = gen_lowpart (mode, op2);
23826 }
23827
23828 if (mode == V16SImode)
23829 {
23830 if (uns_p)
23831 x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
23832 else
23833 x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
23834 }
23835 else if (mode == V8SImode)
23836 {
23837 if (uns_p)
23838 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
23839 else
23840 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
23841 }
23842 else if (uns_p)
23843 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
23844 else if (TARGET_SSE4_1)
23845 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
23846 else
23847 {
23848 rtx s1, s2, t0, t1, t2;
23849
 23850	 /* The easiest way to implement this without PMULDQ is to go through
 23851	 the motions as if we are performing a full 64-bit multiply, except
 23852	 that we need to do less shuffling of the elements. */
23853
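      /* Sketch of the arithmetic used below (added commentary, not from the
	 original source): sign_extend(a) * sign_extend(b) modulo 2^64 equals
	 u(a)*u(b) + ((hi(a)*u(b) + hi(b)*u(a)) << 32), where u(x) is the
	 zero-extension of x and hi(x) is the highpart of its 64-bit sign
	 extension (0 or all-ones).  Because the cross terms are shifted left
	 by 32, only their low 32 bits matter, so multiplying the compare
	 masks with the unsigned even-element widening multiply below gives
	 the right result.  */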
23854 /* Compute the sign-extension, aka highparts, of the two operands. */
23855 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
23856 op1, pc_rtx, pc_rtx);
23857 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
23858 op2, pc_rtx, pc_rtx);
23859
23860 /* Multiply LO(A) * HI(B), and vice-versa. */
23861 t1 = gen_reg_rtx (wmode);
23862 t2 = gen_reg_rtx (wmode);
23863 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
23864 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
23865
23866 /* Multiply LO(A) * LO(B). */
23867 t0 = gen_reg_rtx (wmode);
23868 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
23869
23870 /* Combine and shift the highparts into place. */
23871 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
23872 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
23873 1, OPTAB_DIRECT);
23874
23875 /* Combine high and low parts. */
23876 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
23877 return;
23878 }
23879 emit_insn (x);
23880}
23881
23882void
23883ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
23884 bool uns_p, bool high_p)
23885{
23886 machine_mode wmode = GET_MODE (dest);
23887 machine_mode mode = GET_MODE (op1);
23888 rtx t1, t2, t3, t4, mask;
23889
23890 switch (mode)
23891 {
23892 case E_V4SImode:
23893 t1 = gen_reg_rtx (mode);
23894 t2 = gen_reg_rtx (mode);
23895 if (TARGET_XOP && !uns_p)
23896 {
23897 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
23898 shuffle the elements once so that all elements are in the right
23899 place for immediate use: { A C B D }. */
23900 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
23901 const1_rtx, GEN_INT (3)));
23902 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
23903 const1_rtx, GEN_INT (3)));
23904 }
23905 else
23906 {
23907 /* Put the elements into place for the multiply. */
23908 ix86_expand_vec_interleave (t1, op1, op1, high_p);
23909 ix86_expand_vec_interleave (t2, op2, op2, high_p);
23910 high_p = false;
23911 }
23912 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
23913 break;
23914
23915 case E_V8SImode:
23916 /* Shuffle the elements between the lanes. After this we
23917 have { A B E F | C D G H } for each operand. */
23918 t1 = gen_reg_rtx (V4DImode);
23919 t2 = gen_reg_rtx (V4DImode);
23920 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
23921 const0_rtx, const2_rtx,
23922 const1_rtx, GEN_INT (3)));
23923 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
23924 const0_rtx, const2_rtx,
23925 const1_rtx, GEN_INT (3)));
23926
23927 /* Shuffle the elements within the lanes. After this we
23928 have { A A B B | C C D D } or { E E F F | G G H H }. */
23929 t3 = gen_reg_rtx (V8SImode);
23930 t4 = gen_reg_rtx (V8SImode);
23931 mask = GEN_INT (high_p
23932 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
23933 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
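      /* Added note: these controls are 0xfa (select elements 2,2,3,3) for
	 the high halves and 0x50 (select elements 0,0,1,1) for the low
	 halves, applied within each 128-bit lane.  */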
23934 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
23935 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
23936
23937 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
23938 break;
23939
23940 case E_V8HImode:
23941 case E_V16HImode:
23942 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
23943 uns_p, OPTAB_DIRECT);
23944 t2 = expand_binop (mode,
23945 uns_p ? umul_highpart_optab : smul_highpart_optab,
23946 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
23947 gcc_assert (t1 && t2);
23948
23949 t3 = gen_reg_rtx (mode);
23950 ix86_expand_vec_interleave (t3, t1, t2, high_p);
23951 emit_move_insn (dest, gen_lowpart (wmode, t3));
23952 break;
23953
23954 case E_V16QImode:
23955 case E_V32QImode:
23956 case E_V32HImode:
23957 case E_V16SImode:
23958 case E_V64QImode:
23959 t1 = gen_reg_rtx (wmode);
23960 t2 = gen_reg_rtx (wmode);
23961 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
23962 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
23963
23964 emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
23965 break;
23966
23967 default:
23968 gcc_unreachable ();
23969 }
23970}
23971
23972void
23973ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
23974{
23975 rtx res_1, res_2, res_3, res_4;
23976
23977 res_1 = gen_reg_rtx (V4SImode);
23978 res_2 = gen_reg_rtx (V4SImode);
23979 res_3 = gen_reg_rtx (V2DImode);
23980 res_4 = gen_reg_rtx (V2DImode);
23981 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
23982 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
23983
23984 /* Move the results in element 2 down to element 1; we don't care
23985 what goes in elements 2 and 3. Then we can merge the parts
23986 back together with an interleave.
23987
23988 Note that two other sequences were tried:
23989 (1) Use interleaves at the start instead of psrldq, which allows
23990 us to use a single shufps to merge things back at the end.
23991 (2) Use shufps here to combine the two vectors, then pshufd to
23992 put the elements in the correct order.
23993 In both cases the cost of the reformatting stall was too high
23994 and the overall sequence slower. */
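  /* Added illustration, not from the original source: viewed as V4SImode,
     res_3 holds { lo(a0*b0), hi(a0*b0), lo(a2*b2), hi(a2*b2) } and res_4
     the same for the odd elements.  The pshufd order (0,2,0,0) moves the
     two low halves into elements 0 and 1, and the final interleave-low
     then yields { a0*b0, a1*b1, a2*b2, a3*b3 } truncated to 32 bits.  */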
23995
23996 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
23997 const0_rtx, const2_rtx,
23998 const0_rtx, const0_rtx));
23999 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
24000 const0_rtx, const2_rtx,
24001 const0_rtx, const0_rtx));
24002 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
24003
24004 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
24005}
24006
24007void
24008ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
24009{
24010 machine_mode mode = GET_MODE (op0);
24011 rtx t1, t2, t3, t4, t5, t6;
24012
24013 if (TARGET_AVX512DQ && mode == V8DImode)
24014 emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
1ce82f56 24015 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
2bf6d935 24016 emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
1ce82f56 24017 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
2bf6d935
ML
24018 emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
24019 else if (TARGET_XOP && mode == V2DImode)
24020 {
24021 /* op1: A,B,C,D, op2: E,F,G,H */
24022 op1 = gen_lowpart (V4SImode, op1);
24023 op2 = gen_lowpart (V4SImode, op2);
24024
24025 t1 = gen_reg_rtx (V4SImode);
24026 t2 = gen_reg_rtx (V4SImode);
24027 t3 = gen_reg_rtx (V2DImode);
24028 t4 = gen_reg_rtx (V2DImode);
24029
24030 /* t1: B,A,D,C */
24031 emit_insn (gen_sse2_pshufd_1 (t1, op1,
24032 GEN_INT (1),
24033 GEN_INT (0),
24034 GEN_INT (3),
24035 GEN_INT (2)));
24036
24037 /* t2: (B*E),(A*F),(D*G),(C*H) */
24038 emit_insn (gen_mulv4si3 (t2, t1, op2));
24039
24040 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
24041 emit_insn (gen_xop_phadddq (t3, t2));
24042
24043 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
24044 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
24045
 24046	 /* Multiply lower parts and add them all. */
24047 t5 = gen_reg_rtx (V2DImode);
24048 emit_insn (gen_vec_widen_umult_even_v4si (t5,
24049 gen_lowpart (V4SImode, op1),
24050 gen_lowpart (V4SImode, op2)));
8ba6ea87 24051 force_expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
2bf6d935
ML
24052 }
24053 else
24054 {
24055 machine_mode nmode;
24056 rtx (*umul) (rtx, rtx, rtx);
24057
24058 if (mode == V2DImode)
24059 {
24060 umul = gen_vec_widen_umult_even_v4si;
24061 nmode = V4SImode;
24062 }
24063 else if (mode == V4DImode)
24064 {
24065 umul = gen_vec_widen_umult_even_v8si;
24066 nmode = V8SImode;
24067 }
24068 else if (mode == V8DImode)
24069 {
24070 umul = gen_vec_widen_umult_even_v16si;
24071 nmode = V16SImode;
24072 }
24073 else
24074 gcc_unreachable ();
24075
24076
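      /* Added commentary, not from the original source: with each 64-bit
	 lane written as hi*2^32 + lo, the product modulo 2^64 is
	 lo1*lo2 + ((hi1*lo2 + hi2*lo1) << 32); the steps below compute
	 exactly these three 32x32->64 partial products with the
	 even-element widening multiply.  */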
24077 /* Multiply low parts. */
24078 t1 = gen_reg_rtx (mode);
24079 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
24080
24081 /* Shift input vectors right 32 bits so we can multiply high parts. */
24082 t6 = GEN_INT (32);
24083 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
24084 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
24085
24086 /* Multiply high parts by low parts. */
24087 t4 = gen_reg_rtx (mode);
24088 t5 = gen_reg_rtx (mode);
24089 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
24090 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
24091
24092 /* Combine and shift the highparts back. */
24093 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
24094 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
24095
24096 /* Combine high and low parts. */
24097 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
24098 }
24099
24100 set_unique_reg_note (get_last_insn (), REG_EQUAL,
24101 gen_rtx_MULT (mode, op1, op2));
24102}
24103
24104/* Return true if control transfer instruction INSN
24105 should be encoded with the notrack prefix. */
24106
24107bool
e8b0314a 24108ix86_notrack_prefixed_insn_p (rtx_insn *insn)
2bf6d935
ML
24109{
24110 if (!insn || !((flag_cf_protection & CF_BRANCH)))
24111 return false;
24112
24113 if (CALL_P (insn))
24114 {
24115 rtx call = get_call_rtx_from (insn);
24116 gcc_assert (call != NULL_RTX);
24117 rtx addr = XEXP (call, 0);
24118
24119 /* Do not emit 'notrack' if it's not an indirect call. */
24120 if (MEM_P (addr)
24121 && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
24122 return false;
24123 else
24124 return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
24125 }
24126
24127 if (JUMP_P (insn) && !flag_cet_switch)
24128 {
24129 rtx target = JUMP_LABEL (insn);
24130 if (target == NULL_RTX || ANY_RETURN_P (target))
24131 return false;
24132
 24133	 /* Check whether the jump is a switch table jump. */
24134 rtx_insn *label = as_a<rtx_insn *> (target);
24135 rtx_insn *table = next_insn (label);
24136 if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
24137 return false;
24138 else
24139 return true;
24140 }
24141 return false;
24142}
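/* Added illustration, not from the original source: under
   -fcf-protection=branch, an indirect call through a function pointer whose
   type carries __attribute__((nocf_check)) gets a REG_CALL_NOCF_CHECK note
   and is therefore emitted with the notrack prefix, while a direct call to
   a SYMBOL_REF never is.  */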
24143
24144/* Calculate integer abs() using only SSE2 instructions. */
24145
24146void
24147ix86_expand_sse2_abs (rtx target, rtx input)
24148{
24149 machine_mode mode = GET_MODE (target);
24150 rtx tmp0, tmp1, x;
24151
24152 switch (mode)
24153 {
24154 case E_V2DImode:
24155 case E_V4DImode:
24156 /* For 64-bit signed integer X, with SSE4.2 use
24157 pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
24158 Otherwise handle it similarly to V4SImode, except use 64 as W instead of
 24159	 32, use a logical right shift (the arithmetic one is unimplemented for
 24160	 this width) and negate the shifted value to form the sign mask. */
24161 if (TARGET_SSE4_2)
24162 {
24163 tmp0 = gen_reg_rtx (mode);
24164 tmp1 = gen_reg_rtx (mode);
24165 emit_move_insn (tmp1, CONST0_RTX (mode));
24166 if (mode == E_V2DImode)
24167 emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input));
24168 else
24169 emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input));
24170 }
24171 else
24172 {
24173 tmp0 = expand_simple_binop (mode, LSHIFTRT, input,
24174 GEN_INT (GET_MODE_UNIT_BITSIZE (mode)
24175 - 1), NULL, 0, OPTAB_DIRECT);
24176 tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false);
24177 }
24178
24179 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
24180 NULL, 0, OPTAB_DIRECT);
24181 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
24182 target, 0, OPTAB_DIRECT);
24183 break;
24184
24185 case E_V4SImode:
24186 /* For 32-bit signed integer X, the best way to calculate the absolute
24187 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
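	 Worked example (added): for X = -5 and W = 32, X >> 31 = -1,
	 (-1 ^ -5) = 4 and 4 - (-1) = 5.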
24188 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
24189 GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
24190 NULL, 0, OPTAB_DIRECT);
24191 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
24192 NULL, 0, OPTAB_DIRECT);
24193 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
24194 target, 0, OPTAB_DIRECT);
24195 break;
24196
24197 case E_V8HImode:
24198 /* For 16-bit signed integer X, the best way to calculate the absolute
24199 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
24200 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
24201
24202 x = expand_simple_binop (mode, SMAX, tmp0, input,
24203 target, 0, OPTAB_DIRECT);
24204 break;
24205
24206 case E_V16QImode:
24207 /* For 8-bit signed integer X, the best way to calculate the absolute
24208 value of X is min ((unsigned char) X, (unsigned char) (-X)),
24209 as SSE2 provides the PMINUB insn. */
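	 Worked example (added): for X = -7, (unsigned char) X = 249 and
	 (unsigned char) -X = 7, so the unsigned minimum is 7; for X = -128
	 both operands are 128, so the result wraps to -128, as expected for
	 two's-complement abs.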
24210 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
24211
24212 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
24213 target, 0, OPTAB_DIRECT);
24214 break;
24215
24216 default:
24217 gcc_unreachable ();
24218 }
24219
24220 if (x != target)
24221 emit_move_insn (target, x);
24222}
24223
24224/* Expand an extract from a vector register through pextr insn.
24225 Return true if successful. */
24226
24227bool
24228ix86_expand_pextr (rtx *operands)
24229{
24230 rtx dst = operands[0];
24231 rtx src = operands[1];
24232
24233 unsigned int size = INTVAL (operands[2]);
24234 unsigned int pos = INTVAL (operands[3]);
24235
24236 if (SUBREG_P (dst))
24237 {
24238 /* Reject non-lowpart subregs. */
24239 if (SUBREG_BYTE (dst) > 0)
24240 return false;
24241 dst = SUBREG_REG (dst);
24242 }
24243
24244 if (SUBREG_P (src))
24245 {
24246 pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
24247 src = SUBREG_REG (src);
24248 }
24249
24250 switch (GET_MODE (src))
24251 {
24252 case E_V16QImode:
24253 case E_V8HImode:
24254 case E_V4SImode:
24255 case E_V2DImode:
24256 case E_V1TImode:
2bf6d935
ML
24257 {
24258 machine_mode srcmode, dstmode;
24259 rtx d, pat;
24260
24261 if (!int_mode_for_size (size, 0).exists (&dstmode))
24262 return false;
24263
24264 switch (dstmode)
24265 {
24266 case E_QImode:
24267 if (!TARGET_SSE4_1)
24268 return false;
24269 srcmode = V16QImode;
24270 break;
24271
24272 case E_HImode:
24273 if (!TARGET_SSE2)
24274 return false;
24275 srcmode = V8HImode;
24276 break;
24277
24278 case E_SImode:
24279 if (!TARGET_SSE4_1)
24280 return false;
24281 srcmode = V4SImode;
24282 break;
24283
24284 case E_DImode:
24285 gcc_assert (TARGET_64BIT);
24286 if (!TARGET_SSE4_1)
24287 return false;
24288 srcmode = V2DImode;
24289 break;
24290
24291 default:
24292 return false;
24293 }
24294
24295 /* Reject extractions from misaligned positions. */
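	/* For example (added): size 16, pos 40 is rejected here, while
	   size 16, pos 32 selects element pos / size == 2 of the V8HImode
	   view of SRC.  */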
24296 if (pos & (size-1))
24297 return false;
24298
24299 if (GET_MODE (dst) == dstmode)
24300 d = dst;
24301 else
24302 d = gen_reg_rtx (dstmode);
24303
24304 /* Construct insn pattern. */
24305 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
24306 pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
24307
24308 /* Let the rtl optimizers know about the zero extension performed. */
24309 if (dstmode == QImode || dstmode == HImode)
24310 {
24311 pat = gen_rtx_ZERO_EXTEND (SImode, pat);
24312 d = gen_lowpart (SImode, d);
24313 }
24314
24315 emit_insn (gen_rtx_SET (d, pat));
24316
24317 if (d != dst)
24318 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
24319 return true;
24320 }
24321
24322 default:
24323 return false;
24324 }
24325}
24326
24327/* Expand an insert into a vector register through pinsr insn.
24328 Return true if successful. */
24329
24330bool
24331ix86_expand_pinsr (rtx *operands)
24332{
24333 rtx dst = operands[0];
24334 rtx src = operands[3];
24335
24336 unsigned int size = INTVAL (operands[1]);
24337 unsigned int pos = INTVAL (operands[2]);
24338
24339 if (SUBREG_P (dst))
24340 {
24341 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
24342 dst = SUBREG_REG (dst);
24343 }
24344
24345 switch (GET_MODE (dst))
24346 {
24347 case E_V16QImode:
24348 case E_V8HImode:
24349 case E_V4SImode:
24350 case E_V2DImode:
24351 case E_V1TImode:
2bf6d935
ML
24352 {
24353 machine_mode srcmode, dstmode;
24354 rtx (*pinsr)(rtx, rtx, rtx, rtx);
24355 rtx d;
24356
24357 if (!int_mode_for_size (size, 0).exists (&srcmode))
24358 return false;
24359
24360 switch (srcmode)
24361 {
24362 case E_QImode:
24363 if (!TARGET_SSE4_1)
24364 return false;
24365 dstmode = V16QImode;
24366 pinsr = gen_sse4_1_pinsrb;
24367 break;
24368
24369 case E_HImode:
24370 if (!TARGET_SSE2)
24371 return false;
24372 dstmode = V8HImode;
24373 pinsr = gen_sse2_pinsrw;
24374 break;
24375
24376 case E_SImode:
24377 if (!TARGET_SSE4_1)
24378 return false;
24379 dstmode = V4SImode;
24380 pinsr = gen_sse4_1_pinsrd;
24381 break;
24382
24383 case E_DImode:
24384 gcc_assert (TARGET_64BIT);
24385 if (!TARGET_SSE4_1)
24386 return false;
24387 dstmode = V2DImode;
24388 pinsr = gen_sse4_1_pinsrq;
24389 break;
24390
24391 default:
24392 return false;
24393 }
24394
24395 /* Reject insertions to misaligned positions. */
24396 if (pos & (size-1))
24397 return false;
24398
24399 if (SUBREG_P (src))
24400 {
24401 unsigned int srcpos = SUBREG_BYTE (src);
24402
24403 if (srcpos > 0)
24404 {
24405 rtx extr_ops[4];
24406
24407 extr_ops[0] = gen_reg_rtx (srcmode);
24408 extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
24409 extr_ops[2] = GEN_INT (size);
24410 extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
24411
24412 if (!ix86_expand_pextr (extr_ops))
24413 return false;
24414
24415 src = extr_ops[0];
24416 }
24417 else
24418 src = gen_lowpart (srcmode, SUBREG_REG (src));
24419 }
24420
24421 if (GET_MODE (dst) == dstmode)
24422 d = dst;
24423 else
24424 d = gen_reg_rtx (dstmode);
24425
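	/* Added note: the last operand is a one-hot element selector, e.g.
	   inserting a 16-bit value at bit position 48 of a V8HImode
	   destination gives pos / size == 3 and immediate (1 << 3) == 8,
	   i.e. only element 3 is written.  */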
24426 emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
24427 gen_lowpart (srcmode, src),
24428 GEN_INT (1 << (pos / size))));
24429 if (d != dst)
24430 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
24431 return true;
24432 }
24433
24434 default:
24435 return false;
24436 }
24437}
24438
24439/* All CPUs prefer to avoid cross-lane operations, so perform reductions
24440 of the upper against the lower halves until we reach SSE reg size. */
24441
24442machine_mode
24443ix86_split_reduction (machine_mode mode)
24444{
24445 /* Reduce lowpart against highpart until we reach SSE reg width to
24446 avoid cross-lane operations. */
24447 switch (mode)
24448 {
24449 case E_V8DImode:
24450 case E_V4DImode:
24451 return V2DImode;
24452 case E_V16SImode:
24453 case E_V8SImode:
24454 return V4SImode;
24455 case E_V32HImode:
24456 case E_V16HImode:
24457 return V8HImode;
24458 case E_V64QImode:
24459 case E_V32QImode:
24460 return V16QImode;
24461 case E_V16SFmode:
24462 case E_V8SFmode:
24463 return V4SFmode;
24464 case E_V8DFmode:
24465 case E_V4DFmode:
24466 return V2DFmode;
24467 default:
24468 return mode;
24469 }
24470}
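/* Added illustration, not from the original source: a V8SImode reduction is
   asked to split to V4SImode, i.e. the upper and lower 128-bit halves are
   combined element-wise first, after which the remaining reduction steps
   stay within a single SSE register.  */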
24471
24472/* Generate call to __divmoddi4. */
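/* Added note: the libgcc helper is declared (in libgcc2.h) as
   DWtype __divmoddi4 (DWtype, DWtype, DWtype *); the quotient comes back as
   the return value and the remainder is stored through the pointer, which
   is why the address of a stack slot is passed below.  */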
24473
24474void
24475ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
24476 rtx op0, rtx op1,
24477 rtx *quot_p, rtx *rem_p)
24478{
24479 rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
24480
24481 rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
24482 mode, op0, mode, op1, mode,
24483 XEXP (rem, 0), Pmode);
24484 *quot_p = quot;
24485 *rem_p = rem;
24486}
24487
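/* Expand a compare-and-swap loop for an atomic fetch-and-<op> style
   operation on MEM.  Added sketch of the equivalent logic (commentary only,
   not part of the original source):

     old = *mem;
     do
       new = (code == NOT) ? ~(old & val) : old <code> val;
     while (!compare_exchange (mem, &old, new));
     target = after ? new : old;
*/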
152f243f
JJ
24488void
24489ix86_expand_atomic_fetch_op_loop (rtx target, rtx mem, rtx val,
24490 enum rtx_code code, bool after,
24491 bool doubleword)
4d281ff7 24492{
0435b978 24493 rtx old_reg, new_reg, old_mem, success;
4d281ff7 24494 machine_mode mode = GET_MODE (target);
0435b978 24495 rtx_code_label *loop_label = NULL;
4d281ff7
HW
24496
24497 old_reg = gen_reg_rtx (mode);
24498 new_reg = old_reg;
4d281ff7 24499 old_mem = copy_to_reg (mem);
0435b978 24500 loop_label = gen_label_rtx ();
4d281ff7
HW
24501 emit_label (loop_label);
24502 emit_move_insn (old_reg, old_mem);
24503
 24504	 /* Return value for __atomic_fetch_<op>: the value before the operation. */
24505 if (!after)
24506 emit_move_insn (target, old_reg);
24507
24508 if (code == NOT)
24509 {
24510 new_reg = expand_simple_binop (mode, AND, new_reg, val, NULL_RTX,
24511 true, OPTAB_LIB_WIDEN);
24512 new_reg = expand_simple_unop (mode, code, new_reg, NULL_RTX, true);
24513 }
24514 else
24515 new_reg = expand_simple_binop (mode, code, new_reg, val, NULL_RTX,
24516 true, OPTAB_LIB_WIDEN);
24517
 24518	 /* Return value for __atomic_<op>_fetch: the value after the operation. */
24519 if (after)
24520 emit_move_insn (target, new_reg);
24521
0435b978
HW
24522 success = NULL_RTX;
24523
24524 ix86_expand_cmpxchg_loop (&success, old_mem, mem, old_reg, new_reg,
24525 gen_int_mode (MEMMODEL_SYNC_SEQ_CST,
24526 SImode),
24527 doubleword, loop_label);
24528}
24529
24530/* Relax the cmpxchg instruction.  The parameter LOOP_LABEL indicates
24531 whether the instruction should be relaxed with a pause loop.  If not,
24532 it is relaxed to an atomic load plus compare, and the cmpxchg
24533 instruction is skipped when mem != exp_input. */
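/* Added sketch (commentary only, not from the original source): the
   expansion first performs an atomic load of MEM and jumps around the
   cmpxchg when the loaded value already differs from EXP_INPUT; in that
   case TARGET_VAL is set from the load and, when LOOP_LABEL is given, a
   PAUSE is emitted before looping back.  */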
24534
152f243f
JJ
24535void
24536ix86_expand_cmpxchg_loop (rtx *ptarget_bool, rtx target_val,
24537 rtx mem, rtx exp_input, rtx new_input,
24538 rtx mem_model, bool doubleword,
24539 rtx_code_label *loop_label)
0435b978
HW
24540{
24541 rtx_code_label *cmp_label = NULL;
24542 rtx_code_label *done_label = NULL;
24543 rtx target_bool = NULL_RTX, new_mem = NULL_RTX;
24544 rtx (*gen) (rtx, rtx, rtx, rtx, rtx) = NULL;
24545 rtx (*gendw) (rtx, rtx, rtx, rtx, rtx, rtx) = NULL;
24546 machine_mode mode = GET_MODE (target_val), hmode = mode;
24547
24548 if (*ptarget_bool == NULL)
24549 target_bool = gen_reg_rtx (QImode);
24550 else
24551 target_bool = *ptarget_bool;
24552
24553 cmp_label = gen_label_rtx ();
24554 done_label = gen_label_rtx ();
24555
24556 new_mem = gen_reg_rtx (mode);
24557 /* Load memory first. */
24558 expand_atomic_load (new_mem, mem, MEMMODEL_SEQ_CST);
24559
24560 switch (mode)
24561 {
9d1796d8 24562 case E_TImode:
0435b978
HW
24563 gendw = gen_atomic_compare_and_swapti_doubleword;
24564 hmode = DImode;
24565 break;
9d1796d8 24566 case E_DImode:
0435b978
HW
24567 if (doubleword)
24568 {
24569 gendw = gen_atomic_compare_and_swapdi_doubleword;
24570 hmode = SImode;
24571 }
24572 else
24573 gen = gen_atomic_compare_and_swapdi_1;
24574 break;
9d1796d8
RS
24575 case E_SImode:
24576 gen = gen_atomic_compare_and_swapsi_1;
24577 break;
24578 case E_HImode:
24579 gen = gen_atomic_compare_and_swaphi_1;
24580 break;
24581 case E_QImode:
24582 gen = gen_atomic_compare_and_swapqi_1;
24583 break;
0435b978
HW
24584 default:
24585 gcc_unreachable ();
24586 }
4d281ff7 24587
0435b978 24588 /* Compare mem value with expected value. */
4d281ff7
HW
24589 if (doubleword)
24590 {
0435b978
HW
24591 rtx low_new_mem = gen_lowpart (hmode, new_mem);
24592 rtx low_exp_input = gen_lowpart (hmode, exp_input);
24593 rtx high_new_mem = gen_highpart (hmode, new_mem);
24594 rtx high_exp_input = gen_highpart (hmode, exp_input);
24595 emit_cmp_and_jump_insns (low_new_mem, low_exp_input, NE, NULL_RTX,
24596 hmode, 1, cmp_label,
4d281ff7 24597 profile_probability::guessed_never ());
0435b978
HW
24598 emit_cmp_and_jump_insns (high_new_mem, high_exp_input, NE, NULL_RTX,
24599 hmode, 1, cmp_label,
4d281ff7
HW
24600 profile_probability::guessed_never ());
24601 }
24602 else
0435b978
HW
24603 emit_cmp_and_jump_insns (new_mem, exp_input, NE, NULL_RTX,
24604 GET_MODE (exp_input), 1, cmp_label,
4d281ff7
HW
24605 profile_probability::guessed_never ());
24606
0435b978
HW
 24607	 /* Directly emit the cmpxchg here. */
24608 if (doubleword)
24609 emit_insn (gendw (target_val, mem, exp_input,
24610 gen_lowpart (hmode, new_input),
24611 gen_highpart (hmode, new_input),
24612 mem_model));
24613 else
24614 emit_insn (gen (target_val, mem, exp_input, new_input, mem_model));
24615
24616 if (!loop_label)
24617 {
24618 emit_jump_insn (gen_jump (done_label));
24619 emit_barrier ();
24620 emit_label (cmp_label);
24621 emit_move_insn (target_val, new_mem);
24622 emit_label (done_label);
24623 ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
24624 const0_rtx);
24625 }
24626 else
24627 {
24628 ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
24629 const0_rtx);
24630 emit_cmp_and_jump_insns (target_bool, const0_rtx, EQ, const0_rtx,
24631 GET_MODE (target_bool), 1, loop_label,
24632 profile_probability::guessed_never ());
24633 emit_jump_insn (gen_jump (done_label));
24634 emit_barrier ();
24635
 24636	 /* If the loaded value is not the expected one, pause and loop back. */
24637 emit_label (cmp_label);
522f25e9 24638 emit_move_insn (target_val, new_mem);
0435b978
HW
24639 emit_insn (gen_pause ());
24640 emit_jump_insn (gen_jump (loop_label));
24641 emit_barrier ();
24642 emit_label (done_label);
24643 }
24644
24645 *ptarget_bool = target_bool;
4d281ff7
HW
24646}
24647
b1115dbf
JJ
24648/* Convert a BFmode VAL to SFmode without signaling sNaNs.
24649 This is done by returning SF SUBREG of ((HI SUBREG) (VAL)) << 16. */
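   Worked example (added): the BFmode bit pattern 0x3f80 (1.0bf16) shifted
   left by 16 yields the SFmode bit pattern 0x3f800000, i.e. 1.0f; since
   BFmode is just the upper half of SFmode, no rounding is involved.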
24650
24651rtx
24652ix86_expand_fast_convert_bf_to_sf (rtx val)
24653{
24654 rtx op = gen_lowpart (HImode, val), ret;
24655 if (CONST_INT_P (op))
24656 {
24657 ret = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
24658 val, BFmode);
24659 if (ret)
24660 return ret;
24661 /* FLOAT_EXTEND simplification will fail if VAL is a sNaN. */
24662 ret = gen_reg_rtx (SImode);
24663 emit_move_insn (ret, GEN_INT (INTVAL (op) & 0xffff));
e55251f3 24664 emit_insn (gen_ashlsi3 (ret, ret, GEN_INT (16)));
24665 return gen_lowpart (SFmode, ret);
b1115dbf 24666 }
e55251f3 24667
24668 ret = gen_reg_rtx (SFmode);
24669 emit_insn (gen_extendbfsf2_1 (ret, force_reg (BFmode, val)));
24670 return ret;
b1115dbf
JJ
24671}
24672
2bf6d935 24673#include "gt-i386-expand.h"