1/* Copyright (C) 1988-2022 Free Software Foundation, Inc.
2
3This file is part of GCC.
4
5GCC is free software; you can redistribute it and/or modify
6it under the terms of the GNU General Public License as published by
7the Free Software Foundation; either version 3, or (at your option)
8any later version.
9
10GCC is distributed in the hope that it will be useful,
11but WITHOUT ANY WARRANTY; without even the implied warranty of
12MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13GNU General Public License for more details.
14
15You should have received a copy of the GNU General Public License
16along with GCC; see the file COPYING3. If not see
17<http://www.gnu.org/licenses/>. */
18
19#define IN_TARGET_CODE 1
20
21#include "config.h"
22#include "system.h"
23#include "coretypes.h"
24#include "backend.h"
25#include "rtl.h"
26#include "tree.h"
27#include "memmodel.h"
28#include "gimple.h"
29#include "cfghooks.h"
30#include "cfgloop.h"
31#include "df.h"
32#include "tm_p.h"
33#include "stringpool.h"
34#include "expmed.h"
35#include "optabs.h"
36#include "regs.h"
37#include "emit-rtl.h"
38#include "recog.h"
39#include "cgraph.h"
40#include "diagnostic.h"
41#include "cfgbuild.h"
42#include "alias.h"
43#include "fold-const.h"
44#include "attribs.h"
45#include "calls.h"
46#include "stor-layout.h"
47#include "varasm.h"
48#include "output.h"
49#include "insn-attr.h"
50#include "flags.h"
51#include "except.h"
52#include "explow.h"
53#include "expr.h"
54#include "cfgrtl.h"
55#include "common/common-target.h"
56#include "langhooks.h"
57#include "reload.h"
58#include "gimplify.h"
59#include "dwarf2.h"
60#include "tm-constrs.h"
61#include "cselib.h"
62#include "sched-int.h"
63#include "opts.h"
64#include "tree-pass.h"
65#include "context.h"
66#include "pass_manager.h"
67#include "target-globals.h"
68#include "gimple-iterator.h"
69#include "tree-vectorizer.h"
70#include "shrink-wrap.h"
71#include "builtins.h"
72#include "rtl-iter.h"
73#include "tree-iterator.h"
74#include "dbgcnt.h"
75#include "case-cfn-macros.h"
76#include "dojump.h"
77#include "fold-const-call.h"
78#include "tree-vrp.h"
79#include "tree-ssanames.h"
80#include "selftest.h"
81#include "selftest-rtl.h"
82#include "print-rtl.h"
83#include "intl.h"
84#include "ifcvt.h"
85#include "symbol-summary.h"
86#include "ipa-prop.h"
87#include "ipa-fnsummary.h"
88#include "wide-int-bitmask.h"
89#include "tree-vector-builder.h"
90#include "debug.h"
91#include "dwarf2out.h"
92#include "i386-options.h"
93#include "i386-builtins.h"
94#include "i386-expand.h"
95#include "asan.h"
96
97/* Split one or more double-mode RTL references into pairs of half-mode
98 references. The RTL can be REG, offsettable MEM, integer constant, or
99 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
100 split and "num" is its length. lo_half and hi_half are output arrays
101 that parallel "operands". */
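/* For example, with MODE == TImode each operand is split into DImode
   halves at byte offsets 0 and 8; with MODE == DImode the halves are
   SImode at offsets 0 and 4.  */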
102
103void
104split_double_mode (machine_mode mode, rtx operands[],
105 int num, rtx lo_half[], rtx hi_half[])
106{
107 machine_mode half_mode;
108 unsigned int byte;
109 rtx mem_op = NULL_RTX;
110 int mem_num = 0;
111
112 switch (mode)
113 {
114 case E_TImode:
115 half_mode = DImode;
116 break;
117 case E_DImode:
118 half_mode = SImode;
119 break;
120 case E_P2HImode:
121 half_mode = HImode;
122 break;
123 case E_P2QImode:
124 half_mode = QImode;
125 break;
126 default:
127 gcc_unreachable ();
128 }
129
130 byte = GET_MODE_SIZE (half_mode);
131
132 while (num--)
133 {
134 rtx op = operands[num];
135
136 /* simplify_subreg refuses to split volatile memory addresses,
137 but we still have to handle them. */
138 if (MEM_P (op))
139 {
140 if (mem_op && rtx_equal_p (op, mem_op))
141 {
142 lo_half[num] = lo_half[mem_num];
143 hi_half[num] = hi_half[mem_num];
144 }
145 else
146 {
147 mem_op = op;
148 mem_num = num;
149 lo_half[num] = adjust_address (op, half_mode, 0);
150 hi_half[num] = adjust_address (op, half_mode, byte);
151 }
152 }
153 else
154 {
155 lo_half[num] = simplify_gen_subreg (half_mode, op,
156 GET_MODE (op) == VOIDmode
157 ? mode : GET_MODE (op), 0);
158
159 rtx tmp = simplify_gen_subreg (half_mode, op,
160 GET_MODE (op) == VOIDmode
161 ? mode : GET_MODE (op), byte);
162 /* simplify_gen_subreg will return NULL RTX for the
163 high half of the paradoxical subreg. */
164 hi_half[num] = tmp ? tmp : gen_reg_rtx (half_mode);
165 }
166 }
167}
168
169/* Emit the double word assignment DST = { LO, HI }. */
170
171void
172split_double_concat (machine_mode mode, rtx dst, rtx lo, rtx hi)
173{
174 rtx dlo, dhi;
175 int deleted_move_count = 0;
176 split_double_mode (mode, &dst, 1, &dlo, &dhi);
177 /* Constraints ensure that if both lo and hi are MEMs, then
178 dst has early-clobber and thus addresses of MEMs don't use
179 dlo/dhi registers. Otherwise if at least one of lo and hi is a MEM,
180 dlo/dhi are registers. */
181 if (MEM_P (lo)
182 && rtx_equal_p (dlo, hi)
183 && reg_overlap_mentioned_p (dhi, lo))
184 {
185 /* If dlo is same as hi and lo's address uses dhi register,
186 code below would first emit_move_insn (dhi, hi)
187 and then emit_move_insn (dlo, lo). But the former
188 would invalidate lo's address. Load into dhi first,
189 then swap. */
190 emit_move_insn (dhi, lo);
191 lo = dhi;
192 }
193 else if (MEM_P (hi)
194 && !MEM_P (lo)
195 && !rtx_equal_p (dlo, lo)
196 && reg_overlap_mentioned_p (dlo, hi))
197 {
198 /* In this case, code below would first emit_move_insn (dlo, lo)
199 and then emit_move_insn (dhi, hi). But the former would
200 invalidate hi's address. Load into dhi first. */
201 emit_move_insn (dhi, hi);
202 hi = dhi;
203 }
204 if (!rtx_equal_p (dlo, hi))
205 {
206 if (!rtx_equal_p (dlo, lo))
207 emit_move_insn (dlo, lo);
208 else
209 deleted_move_count++;
210 if (!rtx_equal_p (dhi, hi))
211 emit_move_insn (dhi, hi);
212 else
213 deleted_move_count++;
214 }
215 else if (!rtx_equal_p (lo, dhi))
216 {
217 if (!rtx_equal_p (dhi, hi))
218 emit_move_insn (dhi, hi);
219 else
220 deleted_move_count++;
221 if (!rtx_equal_p (dlo, lo))
222 emit_move_insn (dlo, lo);
223 else
224 deleted_move_count++;
225 }
226 else if (mode == TImode)
227 emit_insn (gen_swapdi (dlo, dhi));
228 else
229 emit_insn (gen_swapsi (dlo, dhi));
230
231 if (deleted_move_count == 2)
232 emit_note (NOTE_INSN_DELETED);
233}
234
235
236/* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
237 for the target. */
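/* For example, clearing %eax is normally emitted as "xor %eax, %eax"
   together with a FLAGS_REG clobber; the "mov $0, %eax" form is used
   only when TARGET_USE_MOV0 is set and we are not optimizing for
   size.  */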
238
239void
240ix86_expand_clear (rtx dest)
241{
242 rtx tmp;
243
244 /* We play register width games, which are only valid after reload. */
245 gcc_assert (reload_completed);
246
247 /* Avoid HImode and its attendant prefix byte. */
248 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
249 dest = gen_rtx_REG (SImode, REGNO (dest));
250 tmp = gen_rtx_SET (dest, const0_rtx);
251
252 if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
253 {
254 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
255 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
256 }
257
258 emit_insn (tmp);
259}
260
261/* Return true if V can be broadcast from an integer of WIDTH bits
262 which is returned in VAL_BROADCAST. Otherwise, return false. */
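/* For example, V == 0x1212121212121212 can be broadcast from the 8-bit
   value 0x12, and V == 0x1234567812345678 from the 32-bit value
   0x12345678; VAL_BROADCAST is returned sign-extended from WIDTH
   bits.  */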
263
264static bool
265ix86_broadcast (HOST_WIDE_INT v, unsigned int width,
266 HOST_WIDE_INT &val_broadcast)
267{
268 wide_int val = wi::uhwi (v, HOST_BITS_PER_WIDE_INT);
269 val_broadcast = wi::extract_uhwi (val, 0, width);
270 for (unsigned int i = width; i < HOST_BITS_PER_WIDE_INT; i += width)
271 {
272 HOST_WIDE_INT each = wi::extract_uhwi (val, i, width);
273 if (val_broadcast != each)
274 return false;
275 }
276 val_broadcast = sext_hwi (val_broadcast, width);
277 return true;
278}
279
280/* Convert the CONST_WIDE_INT operand OP to broadcast in MODE. */
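/* For example, a CONST_WIDE_INT whose 64-bit elements are all
   0x0101010101010101 can (with AVX2) be materialized as a QImode
   broadcast of 0x01 rather than loaded from the constant pool; the
   broadcast width tried grows from QImode up to DImode below.  */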
281
282static rtx
283ix86_convert_const_wide_int_to_broadcast (machine_mode mode, rtx op)
284{
285 /* Don't use integer vector broadcast if we can't move from GPR to SSE
286 register directly. */
287 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
288 return nullptr;
289
290 /* Convert CONST_WIDE_INT to a non-standard SSE constant integer
291 broadcast only if vector broadcast is available. */
292 if (!TARGET_AVX
293 || !CONST_WIDE_INT_P (op)
294 || standard_sse_constant_p (op, mode))
295 return nullptr;
296
297 HOST_WIDE_INT val = CONST_WIDE_INT_ELT (op, 0);
298 HOST_WIDE_INT val_broadcast;
299 scalar_int_mode broadcast_mode;
300 if (TARGET_AVX2
301 && ix86_broadcast (val, GET_MODE_BITSIZE (QImode),
302 val_broadcast))
303 broadcast_mode = QImode;
304 else if (TARGET_AVX2
305 && ix86_broadcast (val, GET_MODE_BITSIZE (HImode),
306 val_broadcast))
307 broadcast_mode = HImode;
308 else if (ix86_broadcast (val, GET_MODE_BITSIZE (SImode),
309 val_broadcast))
310 broadcast_mode = SImode;
311 else if (TARGET_64BIT
312 && ix86_broadcast (val, GET_MODE_BITSIZE (DImode),
313 val_broadcast))
314 broadcast_mode = DImode;
315 else
316 return nullptr;
317
318 /* Check if OP can be broadcast from VAL. */
319 for (int i = 1; i < CONST_WIDE_INT_NUNITS (op); i++)
320 if (val != CONST_WIDE_INT_ELT (op, i))
321 return nullptr;
322
323 unsigned int nunits = (GET_MODE_SIZE (mode)
324 / GET_MODE_SIZE (broadcast_mode));
325 machine_mode vector_mode;
326 if (!mode_for_vector (broadcast_mode, nunits).exists (&vector_mode))
327 gcc_unreachable ();
328 rtx target = ix86_gen_scratch_sse_rtx (vector_mode);
329 bool ok = ix86_expand_vector_init_duplicate (false, vector_mode,
330 target,
331 GEN_INT (val_broadcast));
332 gcc_assert (ok);
333 target = lowpart_subreg (mode, target, vector_mode);
334 return target;
335}
336
337void
338ix86_expand_move (machine_mode mode, rtx operands[])
339{
340 rtx op0, op1;
341 rtx tmp, addend = NULL_RTX;
342 enum tls_model model;
343
344 op0 = operands[0];
345 op1 = operands[1];
346
347 /* Avoid complex sets of likely spilled hard registers before reload. */
348 if (!ix86_hardreg_mov_ok (op0, op1))
349 {
350 tmp = gen_reg_rtx (mode);
351 operands[0] = tmp;
352 ix86_expand_move (mode, operands);
353 operands[0] = op0;
354 operands[1] = tmp;
355 op1 = tmp;
356 }
357
358 switch (GET_CODE (op1))
359 {
360 case CONST:
361 tmp = XEXP (op1, 0);
362
363 if (GET_CODE (tmp) != PLUS
364 || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
365 break;
366
367 op1 = XEXP (tmp, 0);
368 addend = XEXP (tmp, 1);
369 /* FALLTHRU */
370
371 case SYMBOL_REF:
372 model = SYMBOL_REF_TLS_MODEL (op1);
373
374 if (model)
375 op1 = legitimize_tls_address (op1, model, true);
376 else if (ix86_force_load_from_GOT_p (op1))
377 {
378 /* Load the external function address via GOT slot to avoid PLT. */
379 op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
380 (TARGET_64BIT
381 ? UNSPEC_GOTPCREL
382 : UNSPEC_GOT));
383 op1 = gen_rtx_CONST (Pmode, op1);
384 op1 = gen_const_mem (Pmode, op1);
385 set_mem_alias_set (op1, ix86_GOT_alias_set ());
386 }
387 else
388 {
389 tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
390 if (tmp)
391 {
392 op1 = tmp;
393 if (!addend)
394 break;
395 }
396 else
397 {
398 op1 = operands[1];
399 break;
400 }
401 }
402
403 if (addend)
404 {
405 op1 = force_operand (op1, NULL_RTX);
406 op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
407 op0, 1, OPTAB_DIRECT);
408 }
409 else
410 op1 = force_operand (op1, op0);
411
412 if (op1 == op0)
413 return;
414
415 op1 = convert_to_mode (mode, op1, 1);
416
417 default:
418 break;
419 }
420
421 if ((flag_pic || MACHOPIC_INDIRECT)
422 && symbolic_operand (op1, mode))
423 {
424 if (TARGET_MACHO && !TARGET_64BIT)
425 {
426#if TARGET_MACHO
427 /* dynamic-no-pic */
428 if (MACHOPIC_INDIRECT)
429 {
430 rtx temp = (op0 && REG_P (op0) && mode == Pmode)
431 ? op0 : gen_reg_rtx (Pmode);
432 op1 = machopic_indirect_data_reference (op1, temp);
433 if (MACHOPIC_PURE)
434 op1 = machopic_legitimize_pic_address (op1, mode,
435 temp == op1 ? 0 : temp);
436 }
437 if (op0 != op1 && GET_CODE (op0) != MEM)
438 {
439 rtx insn = gen_rtx_SET (op0, op1);
440 emit_insn (insn);
441 return;
442 }
443 if (GET_CODE (op0) == MEM)
444 op1 = force_reg (Pmode, op1);
445 else
446 {
447 rtx temp = op0;
448 if (GET_CODE (temp) != REG)
449 temp = gen_reg_rtx (Pmode);
450 temp = legitimize_pic_address (op1, temp);
451 if (temp == op0)
452 return;
453 op1 = temp;
454 }
455 /* dynamic-no-pic */
456#endif
457 }
458 else
459 {
460 if (MEM_P (op0))
461 op1 = force_reg (mode, op1);
462 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
463 {
464 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
465 op1 = legitimize_pic_address (op1, reg);
466 if (op0 == op1)
467 return;
468 op1 = convert_to_mode (mode, op1, 1);
469 }
470 }
471 }
472 else
473 {
474 if (MEM_P (op0)
475 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
476 || !push_operand (op0, mode))
477 && MEM_P (op1))
478 op1 = force_reg (mode, op1);
479
480 if (push_operand (op0, mode)
481 && ! general_no_elim_operand (op1, mode))
482 op1 = copy_to_mode_reg (mode, op1);
483
484 /* Force large constants in 64-bit compilation into a register
485 to get them CSEed. */
486 if (can_create_pseudo_p ()
487 && (mode == DImode) && TARGET_64BIT
488 && immediate_operand (op1, mode)
489 && !x86_64_zext_immediate_operand (op1, VOIDmode)
490 && !register_operand (op0, mode)
491 && optimize)
492 op1 = copy_to_mode_reg (mode, op1);
493
494 if (can_create_pseudo_p ())
495 {
496 if (CONST_DOUBLE_P (op1))
497 {
498 /* If we are loading a floating point constant to a
499 register, force the value to memory now, since we'll
500 get better code out the back end. */
501
502 op1 = validize_mem (force_const_mem (mode, op1));
503 if (!register_operand (op0, mode))
504 {
505 rtx temp = gen_reg_rtx (mode);
506 emit_insn (gen_rtx_SET (temp, op1));
507 emit_move_insn (op0, temp);
508 return;
509 }
510 }
511 else if (GET_MODE_SIZE (mode) >= 16)
512 {
513 rtx tmp = ix86_convert_const_wide_int_to_broadcast
514 (GET_MODE (op0), op1);
515 if (tmp != nullptr)
516 op1 = tmp;
517 }
518 }
519 }
520
521 emit_insn (gen_rtx_SET (op0, op1));
522}
523
524/* OP is a memref of a CONST_VECTOR; return the scalar constant mem
525 if the CONST_VECTOR is a vec_duplicate, else return NULL. */
526static rtx
527ix86_broadcast_from_constant (machine_mode mode, rtx op)
528{
529 int nunits = GET_MODE_NUNITS (mode);
530 if (nunits < 2)
531 return nullptr;
532
533 /* Don't use integer vector broadcast if we can't move from GPR to SSE
534 register directly. */
535 if (!TARGET_INTER_UNIT_MOVES_TO_VEC
536 && INTEGRAL_MODE_P (mode))
537 return nullptr;
538
539 /* Convert CONST_VECTOR to a non-standard SSE constant integer
540 broadcast only if vector broadcast is available. */
541 if (!(TARGET_AVX2
542 || (TARGET_AVX
543 && (GET_MODE_INNER (mode) == SImode
544 || GET_MODE_INNER (mode) == DImode))
545 || FLOAT_MODE_P (mode))
546 || standard_sse_constant_p (op, mode))
547 return nullptr;
548
549 /* Don't broadcast from a 64-bit integer constant in 32-bit mode.
550 We can still put a 64-bit integer constant in memory when
551 AVX512 embedded broadcast is available. */
552 if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT
553 && (!TARGET_AVX512F
554 || (GET_MODE_SIZE (mode) < 64 && !TARGET_AVX512VL)))
555 return nullptr;
556
557 if (GET_MODE_INNER (mode) == TImode)
558 return nullptr;
559
560 rtx constant = get_pool_constant (XEXP (op, 0));
561 if (GET_CODE (constant) != CONST_VECTOR)
562 return nullptr;
563
564 /* There could be some rtx like
565 (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
566 but with "*.LC1" referring to a V2DI constant vector. */
567 if (GET_MODE (constant) != mode)
568 {
569 constant = simplify_subreg (mode, constant, GET_MODE (constant),
570 0);
571 if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR)
572 return nullptr;
573 }
574
575 rtx first = XVECEXP (constant, 0, 0);
576
577 for (int i = 1; i < nunits; ++i)
578 {
579 rtx tmp = XVECEXP (constant, 0, i);
580 /* Vector duplicate value. */
581 if (!rtx_equal_p (tmp, first))
582 return nullptr;
583 }
584
585 return first;
586}
587
588void
589ix86_expand_vector_move (machine_mode mode, rtx operands[])
590{
591 rtx op0 = operands[0], op1 = operands[1];
592 /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
593 psABI since the biggest alignment is 4 byte for IA MCU psABI. */
594 unsigned int align = (TARGET_IAMCU
595 ? GET_MODE_BITSIZE (mode)
596 : GET_MODE_ALIGNMENT (mode));
597
598 if (push_operand (op0, VOIDmode))
599 op0 = emit_move_resolve_push (mode, op0);
600
601 /* Force constants other than zero into memory. We do not know how
602 the instructions used to build constants modify the upper 64 bits
603 of the register, once we have that information we may be able
604 to handle some of them more efficiently. */
605 if (can_create_pseudo_p ()
606 && (CONSTANT_P (op1)
607 || (SUBREG_P (op1)
608 && CONSTANT_P (SUBREG_REG (op1))))
609 && ((register_operand (op0, mode)
610 && !standard_sse_constant_p (op1, mode))
611 /* ix86_expand_vector_move_misalign() does not like constants. */
612 || (SSE_REG_MODE_P (mode)
613 && MEM_P (op0)
614 && MEM_ALIGN (op0) < align)))
615 {
616 if (SUBREG_P (op1))
617 {
618 machine_mode imode = GET_MODE (SUBREG_REG (op1));
619 rtx r = force_const_mem (imode, SUBREG_REG (op1));
620 if (r)
621 r = validize_mem (r);
622 else
623 r = force_reg (imode, SUBREG_REG (op1));
624 op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
625 }
626 else
627 {
628 machine_mode mode = GET_MODE (op0);
629 rtx tmp = ix86_convert_const_wide_int_to_broadcast
630 (mode, op1);
631 if (tmp == nullptr)
632 op1 = validize_mem (force_const_mem (mode, op1));
633 else
634 op1 = tmp;
635 }
636 }
637
638 if (can_create_pseudo_p ()
639 && GET_MODE_SIZE (mode) >= 16
640 && VECTOR_MODE_P (mode)
641 && (MEM_P (op1)
642 && SYMBOL_REF_P (XEXP (op1, 0))
643 && CONSTANT_POOL_ADDRESS_P (XEXP (op1, 0))))
644 {
645 rtx first = ix86_broadcast_from_constant (mode, op1);
646 if (first != nullptr)
647 {
648 /* Broadcast to XMM/YMM/ZMM register from an integer
649 constant or scalar mem. */
650 op1 = gen_reg_rtx (mode);
651 if (FLOAT_MODE_P (mode)
652 || (!TARGET_64BIT && GET_MODE_INNER (mode) == DImode))
653 first = force_const_mem (GET_MODE_INNER (mode), first);
654 bool ok = ix86_expand_vector_init_duplicate (false, mode,
655 op1, first);
656 gcc_assert (ok);
657 emit_move_insn (op0, op1);
658 return;
659 }
660 }
661
662 /* We need to check memory alignment for SSE mode since attributes
663 can make operands unaligned. */
664 if (can_create_pseudo_p ()
665 && SSE_REG_MODE_P (mode)
666 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
667 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
668 {
669 rtx tmp[2];
670
671 /* ix86_expand_vector_move_misalign() does not like both
672 arguments in memory. */
673 if (!register_operand (op0, mode)
674 && !register_operand (op1, mode))
675 {
676 rtx scratch = ix86_gen_scratch_sse_rtx (mode);
677 emit_move_insn (scratch, op1);
678 op1 = scratch;
679 }
680
681 tmp[0] = op0; tmp[1] = op1;
682 ix86_expand_vector_move_misalign (mode, tmp);
683 return;
684 }
685
686 /* Special case TImode to V1TImode conversions, via V2DI. */
687 if (mode == V1TImode
688 && SUBREG_P (op1)
689 && GET_MODE (SUBREG_REG (op1)) == TImode
690 && TARGET_64BIT && TARGET_SSE
691 && can_create_pseudo_p ())
692 {
693 rtx tmp = gen_reg_rtx (V2DImode);
694 rtx lo = gen_reg_rtx (DImode);
695 rtx hi = gen_reg_rtx (DImode);
696 emit_move_insn (lo, gen_lowpart (DImode, SUBREG_REG (op1)));
697 emit_move_insn (hi, gen_highpart (DImode, SUBREG_REG (op1)));
698 emit_insn (gen_vec_concatv2di (tmp, lo, hi));
699 emit_move_insn (op0, gen_lowpart (V1TImode, tmp));
700 return;
701 }
702
703 /* If operand0 is a hard register, make operand1 a pseudo. */
704 if (can_create_pseudo_p ()
705 && !ix86_hardreg_mov_ok (op0, op1))
706 {
707 rtx tmp = gen_reg_rtx (GET_MODE (op0));
708 emit_move_insn (tmp, op1);
709 emit_move_insn (op0, tmp);
710 return;
711 }
712
713 /* Make operand1 a register if it isn't already. */
714 if (can_create_pseudo_p ()
715 && !register_operand (op0, mode)
716 && !register_operand (op1, mode))
717 {
718 rtx tmp = ix86_gen_scratch_sse_rtx (GET_MODE (op0));
719 emit_move_insn (tmp, op1);
720 emit_move_insn (op0, tmp);
721 return;
722 }
723
724 emit_insn (gen_rtx_SET (op0, op1));
725}
726
727/* Split 32-byte AVX unaligned load and store if needed. */
728
729static void
730ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
731{
732 rtx m;
733 rtx (*extract) (rtx, rtx, rtx);
734 machine_mode mode;
735
736 if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
737 || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
738 {
739 emit_insn (gen_rtx_SET (op0, op1));
740 return;
741 }
742
743 rtx orig_op0 = NULL_RTX;
744 mode = GET_MODE (op0);
745 switch (GET_MODE_CLASS (mode))
746 {
747 case MODE_VECTOR_INT:
748 case MODE_INT:
749 if (mode != V32QImode)
750 {
751 if (!MEM_P (op0))
752 {
753 orig_op0 = op0;
754 op0 = gen_reg_rtx (V32QImode);
755 }
756 else
757 op0 = gen_lowpart (V32QImode, op0);
758 op1 = gen_lowpart (V32QImode, op1);
759 mode = V32QImode;
760 }
761 break;
762 case MODE_VECTOR_FLOAT:
763 break;
764 default:
765 gcc_unreachable ();
766 }
767
768 switch (mode)
769 {
770 default:
771 gcc_unreachable ();
772 case E_V32QImode:
773 extract = gen_avx_vextractf128v32qi;
774 mode = V16QImode;
775 break;
776 case E_V16BFmode:
777 extract = gen_avx_vextractf128v16bf;
778 mode = V8BFmode;
779 break;
780 case E_V16HFmode:
781 extract = gen_avx_vextractf128v16hf;
782 mode = V8HFmode;
783 break;
784 case E_V8SFmode:
785 extract = gen_avx_vextractf128v8sf;
786 mode = V4SFmode;
787 break;
788 case E_V4DFmode:
789 extract = gen_avx_vextractf128v4df;
790 mode = V2DFmode;
791 break;
792 }
793
794 if (MEM_P (op1))
795 {
796 rtx r = gen_reg_rtx (mode);
797 m = adjust_address (op1, mode, 0);
798 emit_move_insn (r, m);
799 m = adjust_address (op1, mode, 16);
800 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
801 emit_move_insn (op0, r);
802 }
803 else if (MEM_P (op0))
804 {
805 m = adjust_address (op0, mode, 0);
806 emit_insn (extract (m, op1, const0_rtx));
807 m = adjust_address (op0, mode, 16);
808 emit_insn (extract (m, copy_rtx (op1), const1_rtx));
809 }
810 else
811 gcc_unreachable ();
812
813 if (orig_op0)
814 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
815}
816
817/* Implement the movmisalign patterns for SSE. Non-SSE modes go
818 straight to ix86_expand_vector_move. */
819/* Code generation for scalar reg-reg moves of single and double precision data:
820 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
821 movaps reg, reg
822 else
823 movss reg, reg
824 if (x86_sse_partial_reg_dependency == true)
825 movapd reg, reg
826 else
827 movsd reg, reg
828
829 Code generation for scalar loads of double precision data:
830 if (x86_sse_split_regs == true)
831 movlpd mem, reg (gas syntax)
832 else
833 movsd mem, reg
834
835 Code generation for unaligned packed loads of single precision data
836 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
837 if (x86_sse_unaligned_move_optimal)
838 movups mem, reg
839
840 if (x86_sse_partial_reg_dependency == true)
841 {
842 xorps reg, reg
843 movlps mem, reg
844 movhps mem+8, reg
845 }
846 else
847 {
848 movlps mem, reg
849 movhps mem+8, reg
850 }
851
852 Code generation for unaligned packed loads of double precision data
853 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
854 if (x86_sse_unaligned_move_optimal)
855 movupd mem, reg
856
857 if (x86_sse_split_regs == true)
858 {
859 movlpd mem, reg
860 movhpd mem+8, reg
861 }
862 else
863 {
864 movsd mem, reg
865 movhpd mem+8, reg
866 }
867 */
868
869void
870ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
871{
872 rtx op0, op1, m;
873
874 op0 = operands[0];
875 op1 = operands[1];
876
877 /* Use unaligned load/store for AVX512 or when optimizing for size. */
878 if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
879 {
880 emit_insn (gen_rtx_SET (op0, op1));
881 return;
882 }
883
884 if (TARGET_AVX)
885 {
886 if (GET_MODE_SIZE (mode) == 32)
887 ix86_avx256_split_vector_move_misalign (op0, op1);
888 else
889 /* Always use 128-bit mov<mode>_internal pattern for AVX. */
890 emit_insn (gen_rtx_SET (op0, op1));
891 return;
892 }
893
894 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
895 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
896 {
897 emit_insn (gen_rtx_SET (op0, op1));
898 return;
899 }
900
901 /* ??? If we have typed data, then it would appear that using
902 movdqu is the only way to get unaligned data loaded with
903 integer type. */
904 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
905 {
906 emit_insn (gen_rtx_SET (op0, op1));
907 return;
908 }
909
910 if (MEM_P (op1))
911 {
912 if (TARGET_SSE2 && mode == V2DFmode)
913 {
914 rtx zero;
915
916 /* When SSE registers are split into halves, we can avoid
917 writing to the top half twice. */
918 if (TARGET_SSE_SPLIT_REGS)
919 {
920 emit_clobber (op0);
921 zero = op0;
922 }
923 else
924 {
925 /* ??? Not sure about the best option for the Intel chips.
926 The following would seem to satisfy; the register is
927 entirely cleared, breaking the dependency chain. We
928 then store to the upper half, with a dependency depth
929 of one. A rumor has it that Intel recommends two movsd
930 followed by an unpacklpd, but this is unconfirmed. And
931 given that the dependency depth of the unpacklpd would
932 still be one, I'm not sure why this would be better. */
933 zero = CONST0_RTX (V2DFmode);
934 }
935
936 m = adjust_address (op1, DFmode, 0);
937 emit_insn (gen_sse2_loadlpd (op0, zero, m));
938 m = adjust_address (op1, DFmode, 8);
939 emit_insn (gen_sse2_loadhpd (op0, op0, m));
940 }
941 else
942 {
943 rtx t;
944
945 if (mode != V4SFmode)
946 t = gen_reg_rtx (V4SFmode);
947 else
948 t = op0;
949
950 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
951 emit_move_insn (t, CONST0_RTX (V4SFmode));
952 else
953 emit_clobber (t);
954
955 m = adjust_address (op1, V2SFmode, 0);
956 emit_insn (gen_sse_loadlps (t, t, m));
957 m = adjust_address (op1, V2SFmode, 8);
958 emit_insn (gen_sse_loadhps (t, t, m));
959 if (mode != V4SFmode)
960 emit_move_insn (op0, gen_lowpart (mode, t));
961 }
962 }
963 else if (MEM_P (op0))
964 {
965 if (TARGET_SSE2 && mode == V2DFmode)
966 {
967 m = adjust_address (op0, DFmode, 0);
968 emit_insn (gen_sse2_storelpd (m, op1));
969 m = adjust_address (op0, DFmode, 8);
970 emit_insn (gen_sse2_storehpd (m, op1));
971 }
972 else
973 {
974 if (mode != V4SFmode)
975 op1 = gen_lowpart (V4SFmode, op1);
976
977 m = adjust_address (op0, V2SFmode, 0);
978 emit_insn (gen_sse_storelps (m, op1));
979 m = adjust_address (op0, V2SFmode, 8);
980 emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
981 }
982 }
983 else
984 gcc_unreachable ();
985}
986
987/* Move bits 64:95 to bits 32:63. */
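/* Concretely, this emits a V4SI vec_select with the permutation
   { 0, 2, 0, 0 }, so element 2 (bits 64:95) is copied into element 1
   (bits 32:63) while element 0 stays in place.  */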
988
989void
990ix86_move_vector_high_sse_to_mmx (rtx op)
991{
992 rtx mask = gen_rtx_PARALLEL (VOIDmode,
993 gen_rtvec (4, GEN_INT (0), GEN_INT (2),
994 GEN_INT (0), GEN_INT (0)));
995 rtx dest = lowpart_subreg (V4SImode, op, GET_MODE (op));
996 op = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
997 rtx insn = gen_rtx_SET (dest, op);
998 emit_insn (insn);
999}
1000
1001/* Split MMX pack with signed/unsigned saturation with SSE/SSE2. */
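/* The 64-bit MMX operands are viewed as the low halves of 128-bit SSE
   registers and packed with the corresponding SSE instruction; the
   second operand's packed result then sits in bits 64:95 and is moved
   down to bits 32:63 by ix86_move_vector_high_sse_to_mmx.  */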
1002
1003void
1004ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
1005{
1006 rtx op0 = operands[0];
1007 rtx op1 = operands[1];
1008 rtx op2 = operands[2];
1009
1010 machine_mode dmode = GET_MODE (op0);
1011 machine_mode smode = GET_MODE (op1);
1012 machine_mode inner_dmode = GET_MODE_INNER (dmode);
1013 machine_mode inner_smode = GET_MODE_INNER (smode);
1014
1015 /* Get the corresponding SSE mode for destination. */
1016 int nunits = 16 / GET_MODE_SIZE (inner_dmode);
1017 machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode),
1018 nunits).require ();
1019 machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode),
1020 nunits / 2).require ();
1021
1022 /* Get the corresponding SSE mode for source. */
1023 nunits = 16 / GET_MODE_SIZE (inner_smode);
1024 machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode),
1025 nunits).require ();
1026
1027 /* Generate SSE pack with signed/unsigned saturation. */
1028 rtx dest = lowpart_subreg (sse_dmode, op0, GET_MODE (op0));
1029 op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
1030 op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));
1031
1032 op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
1033 op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
1034 rtx insn = gen_rtx_SET (dest, gen_rtx_VEC_CONCAT (sse_dmode,
1035 op1, op2));
1036 emit_insn (insn);
1037
1038 ix86_move_vector_high_sse_to_mmx (op0);
1039}
1040
1041/* Split MMX punpcklXX/punpckhXX with SSE punpcklXX. */
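/* As with ix86_split_mmx_pack, the narrow operands are widened to
   their SSE counterparts and interleaved with a single SSE punpckl
   pattern; for the "high" variants the wanted bits end up above the
   low half of the 128-bit result and are shuffled back down.  */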
1042
1043void
1044ix86_split_mmx_punpck (rtx operands[], bool high_p)
1045{
1046 rtx op0 = operands[0];
1047 rtx op1 = operands[1];
1048 rtx op2 = operands[2];
1049 machine_mode mode = GET_MODE (op0);
1050 rtx mask;
1051 /* The corresponding SSE mode. */
1052 machine_mode sse_mode, double_sse_mode;
1053
1054 switch (mode)
1055 {
1056 case E_V4QImode:
1057 case E_V8QImode:
1058 sse_mode = V16QImode;
1059 double_sse_mode = V32QImode;
1060 mask = gen_rtx_PARALLEL (VOIDmode,
1061 gen_rtvec (16,
1062 GEN_INT (0), GEN_INT (16),
1063 GEN_INT (1), GEN_INT (17),
1064 GEN_INT (2), GEN_INT (18),
1065 GEN_INT (3), GEN_INT (19),
1066 GEN_INT (4), GEN_INT (20),
1067 GEN_INT (5), GEN_INT (21),
1068 GEN_INT (6), GEN_INT (22),
1069 GEN_INT (7), GEN_INT (23)));
1070 break;
1071
1072 case E_V4HImode:
1073 case E_V2HImode:
1074 sse_mode = V8HImode;
1075 double_sse_mode = V16HImode;
1076 mask = gen_rtx_PARALLEL (VOIDmode,
1077 gen_rtvec (8,
1078 GEN_INT (0), GEN_INT (8),
1079 GEN_INT (1), GEN_INT (9),
1080 GEN_INT (2), GEN_INT (10),
1081 GEN_INT (3), GEN_INT (11)));
1082 break;
1083
1084 case E_V2SImode:
1085 sse_mode = V4SImode;
1086 double_sse_mode = V8SImode;
1087 mask = gen_rtx_PARALLEL (VOIDmode,
1088 gen_rtvec (4,
1089 GEN_INT (0), GEN_INT (4),
1090 GEN_INT (1), GEN_INT (5)));
1091 break;
1092
1093 case E_V2SFmode:
1094 sse_mode = V4SFmode;
1095 double_sse_mode = V8SFmode;
1096 mask = gen_rtx_PARALLEL (VOIDmode,
1097 gen_rtvec (4,
1098 GEN_INT (0), GEN_INT (4),
1099 GEN_INT (1), GEN_INT (5)));
1100 break;
1101
1102 default:
1103 gcc_unreachable ();
1104 }
1105
1106 /* Generate SSE punpcklXX. */
1107 rtx dest = lowpart_subreg (sse_mode, op0, GET_MODE (op0));
1108 op1 = lowpart_subreg (sse_mode, op1, GET_MODE (op1));
1109 op2 = lowpart_subreg (sse_mode, op2, GET_MODE (op2));
1110
1111 op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2);
1112 op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask);
1113 rtx insn = gen_rtx_SET (dest, op2);
1114 emit_insn (insn);
1115
1116 /* Move high bits to low bits. */
1117 if (high_p)
1118 {
1119 if (sse_mode == V4SFmode)
1120 {
1121 mask = gen_rtx_PARALLEL (VOIDmode,
1122 gen_rtvec (4, GEN_INT (2), GEN_INT (3),
1123 GEN_INT (4), GEN_INT (5)));
1124 op2 = gen_rtx_VEC_CONCAT (V8SFmode, dest, dest);
1125 op1 = gen_rtx_VEC_SELECT (V4SFmode, op2, mask);
1126 }
1127 else
1128 {
1129 int sz = GET_MODE_SIZE (mode);
1130
1131 if (sz == 4)
1132 mask = gen_rtx_PARALLEL (VOIDmode,
1133 gen_rtvec (4, GEN_INT (1), GEN_INT (0),
1134 GEN_INT (0), GEN_INT (1)));
1135 else if (sz == 8)
1136 mask = gen_rtx_PARALLEL (VOIDmode,
1137 gen_rtvec (4, GEN_INT (2), GEN_INT (3),
1138 GEN_INT (0), GEN_INT (1)));
1139 else
1140 gcc_unreachable ();
1141
1142 dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
1143 op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
1144 }
1145
1146 insn = gen_rtx_SET (dest, op1);
1147 emit_insn (insn);
1148 }
1149}
1150
1151/* Helper function of ix86_fixup_binary_operands to canonicalize
1152 operand order. Returns true if the operands should be swapped. */
1153
1154static bool
1155ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
1156 rtx operands[])
1157{
1158 rtx dst = operands[0];
1159 rtx src1 = operands[1];
1160 rtx src2 = operands[2];
1161
1162 /* If the operation is not commutative, we can't do anything. */
1163 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
1164 && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
1165 return false;
1166
1167 /* Highest priority is that src1 should match dst. */
1168 if (rtx_equal_p (dst, src1))
1169 return false;
1170 if (rtx_equal_p (dst, src2))
1171 return true;
1172
1173 /* Next highest priority is that immediate constants come second. */
1174 if (immediate_operand (src2, mode))
1175 return false;
1176 if (immediate_operand (src1, mode))
1177 return true;
1178
1179 /* Lowest priority is that memory references should come second. */
1180 if (MEM_P (src2))
1181 return false;
1182 if (MEM_P (src1))
1183 return true;
1184
1185 return false;
1186}
1187
1188
1189/* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
1190 destination to use for the operation. If different from the true
1191 destination in operands[0], a copy operation will be required. */
1192
1193rtx
1194ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
1195 rtx operands[])
1196{
1197 rtx dst = operands[0];
1198 rtx src1 = operands[1];
1199 rtx src2 = operands[2];
1200
1201 /* Canonicalize operand order. */
1202 if (ix86_swap_binary_operands_p (code, mode, operands))
1203 {
1204 /* It is invalid to swap operands of different modes. */
1205 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
1206
1207 std::swap (src1, src2);
1208 }
1209
1210 /* Both source operands cannot be in memory. */
1211 if (MEM_P (src1) && MEM_P (src2))
1212 {
1213 /* Optimization: Only read from memory once. */
1214 if (rtx_equal_p (src1, src2))
1215 {
1216 src2 = force_reg (mode, src2);
1217 src1 = src2;
1218 }
1219 else if (rtx_equal_p (dst, src1))
1220 src2 = force_reg (mode, src2);
1221 else
1222 src1 = force_reg (mode, src1);
1223 }
1224
1225 /* If the destination is memory, and we do not have matching source
1226 operands, do things in registers. */
1227 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
1228 dst = gen_reg_rtx (mode);
1229
1230 /* Source 1 cannot be a constant. */
1231 if (CONSTANT_P (src1))
1232 src1 = force_reg (mode, src1);
1233
1234 /* Source 1 cannot be a non-matching memory. */
1235 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
1236 src1 = force_reg (mode, src1);
1237
1238 /* Improve address combine. */
1239 if (code == PLUS
1240 && GET_MODE_CLASS (mode) == MODE_INT
1241 && MEM_P (src2))
1242 src2 = force_reg (mode, src2);
1243
1244 operands[1] = src1;
1245 operands[2] = src2;
1246 return dst;
1247}
1248
1249/* Similarly, but assume that the destination has already been
1250 set up properly. */
1251
1252void
1253ix86_fixup_binary_operands_no_copy (enum rtx_code code,
1254 machine_mode mode, rtx operands[])
1255{
1256 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
1257 gcc_assert (dst == operands[0]);
1258}
1259
1260/* Attempt to expand a binary operator. Make the expansion closer to the
1261 actual machine, than just general_operand, which will allow 3 separate
1262 memory references (one output, two input) in a single insn. */
1263
1264void
1265ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
1266 rtx operands[])
1267{
1268 rtx src1, src2, dst, op, clob;
1269
1270 dst = ix86_fixup_binary_operands (code, mode, operands);
1271 src1 = operands[1];
1272 src2 = operands[2];
1273
1274 /* Emit the instruction. */
1275
1276 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
1277
1278 if (reload_completed
1279 && code == PLUS
1280 && !rtx_equal_p (dst, src1))
1281 {
1282 /* This is going to be an LEA; avoid splitting it later. */
1283 emit_insn (op);
1284 }
1285 else
1286 {
1287 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1288 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1289 }
1290
1291 /* Fix up the destination if needed. */
1292 if (dst != operands[0])
1293 emit_move_insn (operands[0], dst);
1294}
1295
1296/* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
1297 the given OPERANDS. */
1298
1299void
1300ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
1301 rtx operands[])
1302{
1303 rtx op1 = NULL_RTX, op2 = NULL_RTX;
1304 if (SUBREG_P (operands[1]))
1305 {
1306 op1 = operands[1];
1307 op2 = operands[2];
1308 }
1309 else if (SUBREG_P (operands[2]))
1310 {
1311 op1 = operands[2];
1312 op2 = operands[1];
1313 }
1314 /* Optimize (__m128i) d | (__m128i) e and similar code
1315 when d and e are float vectors into float vector logical
1316 insn. In C/C++ without using intrinsics there is no other way
1317 to express vector logical operation on float vectors than
1318 to cast them temporarily to integer vectors. */
1319 if (op1
1320 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
1321 && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
1322 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
1323 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
1324 && SUBREG_BYTE (op1) == 0
1325 && (GET_CODE (op2) == CONST_VECTOR
1326 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
1327 && SUBREG_BYTE (op2) == 0))
1328 && can_create_pseudo_p ())
1329 {
1330 rtx dst;
1331 switch (GET_MODE (SUBREG_REG (op1)))
1332 {
1333 case E_V4SFmode:
1334 case E_V8SFmode:
1335 case E_V16SFmode:
1336 case E_V2DFmode:
1337 case E_V4DFmode:
1338 case E_V8DFmode:
1339 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
1340 if (GET_CODE (op2) == CONST_VECTOR)
1341 {
1342 op2 = gen_lowpart (GET_MODE (dst), op2);
1343 op2 = force_reg (GET_MODE (dst), op2);
1344 }
1345 else
1346 {
1347 op1 = operands[1];
1348 op2 = SUBREG_REG (operands[2]);
1349 if (!vector_operand (op2, GET_MODE (dst)))
1350 op2 = force_reg (GET_MODE (dst), op2);
1351 }
1352 op1 = SUBREG_REG (op1);
1353 if (!vector_operand (op1, GET_MODE (dst)))
1354 op1 = force_reg (GET_MODE (dst), op1);
1355 emit_insn (gen_rtx_SET (dst,
1356 gen_rtx_fmt_ee (code, GET_MODE (dst),
1357 op1, op2)));
1358 emit_move_insn (operands[0], gen_lowpart (mode, dst));
1359 return;
1360 default:
1361 break;
1362 }
1363 }
1364 if (!vector_operand (operands[1], mode))
1365 operands[1] = force_reg (mode, operands[1]);
1366 if (!vector_operand (operands[2], mode))
1367 operands[2] = force_reg (mode, operands[2]);
1368 ix86_fixup_binary_operands_no_copy (code, mode, operands);
1369 emit_insn (gen_rtx_SET (operands[0],
1370 gen_rtx_fmt_ee (code, mode, operands[1],
1371 operands[2])));
1372}
1373
1374/* Return TRUE or FALSE depending on whether the binary operator meets the
1375 appropriate constraints. */
1376
1377bool
1378ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
1379 rtx operands[3])
1380{
1381 rtx dst = operands[0];
1382 rtx src1 = operands[1];
1383 rtx src2 = operands[2];
1384
1385 /* Both source operands cannot be in memory. */
1386 if ((MEM_P (src1) || bcst_mem_operand (src1, mode))
1387 && (MEM_P (src2) || bcst_mem_operand (src2, mode)))
1388 return false;
1389
1390 /* Canonicalize operand order for commutative operators. */
1391 if (ix86_swap_binary_operands_p (code, mode, operands))
1392 std::swap (src1, src2);
1393
1394 /* If the destination is memory, we must have a matching source operand. */
1395 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
1396 return false;
1397
1398 /* Source 1 cannot be a constant. */
1399 if (CONSTANT_P (src1))
1400 return false;
1401
1402 /* Source 1 cannot be a non-matching memory. */
1403 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
1404 /* Support "andhi/andsi/anddi" as a zero-extending move. */
1405 return (code == AND
1406 && (mode == HImode
1407 || mode == SImode
1408 || (TARGET_64BIT && mode == DImode))
1409 && satisfies_constraint_L (src2));
1410
1411 return true;
1412}
1413
1414/* Attempt to expand a unary operator. Make the expansion closer to the
1415 actual machine, than just general_operand, which will allow 2 separate
1416 memory references (one output, one input) in a single insn. */
1417
1418void
1419ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
1420 rtx operands[])
1421{
1422 bool matching_memory = false;
1423 rtx src, dst, op, clob;
1424
1425 dst = operands[0];
1426 src = operands[1];
1427
1428 /* If the destination is memory, and we do not have matching source
1429 operands, do things in registers. */
1430 if (MEM_P (dst))
1431 {
1432 if (rtx_equal_p (dst, src))
1433 matching_memory = true;
1434 else
1435 dst = gen_reg_rtx (mode);
1436 }
1437
1438 /* When source operand is memory, destination must match. */
1439 if (MEM_P (src) && !matching_memory)
1440 src = force_reg (mode, src);
1441
1442 /* Emit the instruction. */
1443
1444 op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
1445
1446 if (code == NOT)
1447 emit_insn (op);
1448 else
1449 {
1450 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1451 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1452 }
1453
1454 /* Fix up the destination if needed. */
1455 if (dst != operands[0])
1456 emit_move_insn (operands[0], dst);
1457}
1458
1459/* Predict just emitted jump instruction to be taken with probability PROB. */
1460
1461static void
1462predict_jump (int prob)
1463{
1464 rtx_insn *insn = get_last_insn ();
1465 gcc_assert (JUMP_P (insn));
1466 add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
1467}
1468
1469/* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
1470 divisor are within the range [0-255]. */
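/* A rough sketch of the emitted code (illustrative only, not the
   literal RTL):

       scratch = op2 | op3
       test $-0x100, scratch     ; do both values fit in 8 bits?
       je .Lqimode
       <full-width [u]div/[u]mod>
       jmp .Lend
     .Lqimode:
       divb                      ; 8-bit unsigned divide of AX
       <zero-extend AL into the quotient, AH into the remainder>
     .Lend:                                                        */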
1471
1472void
1473ix86_split_idivmod (machine_mode mode, rtx operands[],
1474 bool unsigned_p)
1475{
1476 rtx_code_label *end_label, *qimode_label;
1477 rtx div, mod;
1478 rtx_insn *insn;
1479 rtx scratch, tmp0, tmp1, tmp2;
1480 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
1481
1482 operands[2] = force_reg (mode, operands[2]);
1483 operands[3] = force_reg (mode, operands[3]);
1484
1485 switch (mode)
1486 {
1487 case E_SImode:
1488 if (GET_MODE (operands[0]) == SImode)
1489 {
1490 if (GET_MODE (operands[1]) == SImode)
1491 gen_divmod4_1 = unsigned_p ? gen_udivmodsi4_1 : gen_divmodsi4_1;
1492 else
1493 gen_divmod4_1
1494 = unsigned_p ? gen_udivmodsi4_zext_2 : gen_divmodsi4_zext_2;
1495 }
1496 else
1497 gen_divmod4_1
1498 = unsigned_p ? gen_udivmodsi4_zext_1 : gen_divmodsi4_zext_1;
1499 break;
1500
1501 case E_DImode:
1502 gen_divmod4_1 = unsigned_p ? gen_udivmoddi4_1 : gen_divmoddi4_1;
1503 break;
1504
1505 default:
1506 gcc_unreachable ();
1507 }
1508
1509 end_label = gen_label_rtx ();
1510 qimode_label = gen_label_rtx ();
1511
1512 scratch = gen_reg_rtx (mode);
1513
1514 /* Use 8bit unsigned divmod if dividend and divisor are within
1515 the range [0-255]. */
1516 emit_move_insn (scratch, operands[2]);
1517 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
1518 scratch, 1, OPTAB_DIRECT);
1519 emit_insn (gen_test_ccno_1 (mode, scratch, GEN_INT (-0x100)));
1520 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
1521 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
1522 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
1523 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
1524 pc_rtx);
1525 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
1526 predict_jump (REG_BR_PROB_BASE * 50 / 100);
1527 JUMP_LABEL (insn) = qimode_label;
1528
1529 /* Generate original signed/unsigned divmod. */
1530 emit_insn (gen_divmod4_1 (operands[0], operands[1],
1531 operands[2], operands[3]));
1532
1533 /* Branch to the end. */
1534 emit_jump_insn (gen_jump (end_label));
1535 emit_barrier ();
1536
1537 /* Generate 8bit unsigned divide. */
1538 emit_label (qimode_label);
1539 /* Don't use operands[0] for result of 8bit divide since not all
1540 registers support QImode ZERO_EXTRACT. */
1541 tmp0 = lowpart_subreg (HImode, scratch, mode);
1542 tmp1 = lowpart_subreg (HImode, operands[2], mode);
1543 tmp2 = lowpart_subreg (QImode, operands[3], mode);
1544 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
1545
1546 if (unsigned_p)
1547 {
1548 div = gen_rtx_UDIV (mode, operands[2], operands[3]);
1549 mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
1550 }
1551 else
1552 {
1553 div = gen_rtx_DIV (mode, operands[2], operands[3]);
1554 mod = gen_rtx_MOD (mode, operands[2], operands[3]);
1555 }
1556 if (mode == SImode)
1557 {
1558 if (GET_MODE (operands[0]) != SImode)
1559 div = gen_rtx_ZERO_EXTEND (DImode, div);
1560 if (GET_MODE (operands[1]) != SImode)
1561 mod = gen_rtx_ZERO_EXTEND (DImode, mod);
1562 }
1563
1564 /* Extract remainder from AH. */
1565 scratch = gen_lowpart (GET_MODE (operands[1]), scratch);
1566 tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]), scratch,
1567 GEN_INT (8), GEN_INT (8));
1568 insn = emit_move_insn (operands[1], tmp1);
1569 set_unique_reg_note (insn, REG_EQUAL, mod);
1570
1571 /* Zero extend quotient from AL. */
1572 tmp1 = gen_lowpart (QImode, tmp0);
1573 insn = emit_insn (gen_extend_insn
1574 (operands[0], tmp1,
1575 GET_MODE (operands[0]), QImode, 1));
1576 set_unique_reg_note (insn, REG_EQUAL, div);
1577
1578 emit_label (end_label);
1579}
1580
1581/* Emit x86 binary operand CODE in mode MODE, where the first operand
1582 matches destination. RTX includes clobber of FLAGS_REG. */
1583
1584void
1585ix86_emit_binop (enum rtx_code code, machine_mode mode,
1586 rtx dst, rtx src)
1587{
1588 rtx op, clob;
1589
1590 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
1591 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1592
1593 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1594}
1595
1596/* Return true if regno1 def is nearest to the insn. */
1597
1598static bool
1599find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
1600{
1601 rtx_insn *prev = insn;
1602 rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
1603
1604 if (insn == start)
1605 return false;
1606 while (prev && prev != start)
1607 {
1608 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
1609 {
1610 prev = PREV_INSN (prev);
1611 continue;
1612 }
1613 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
1614 return true;
1615 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
1616 return false;
1617 prev = PREV_INSN (prev);
1618 }
1619
1620 /* None of the regs is defined in the bb. */
1621 return false;
1622}
1623
1624/* INSN_UID of the last insn emitted by zero store peephole2s. */
1625int ix86_last_zero_store_uid;
1626
1627/* Split lea instructions into a sequence of instructions
1628 which are executed on ALU to avoid AGU stalls.
1629 It is assumed that it is allowed to clobber flags register
1630 at lea position. */
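/* For example (illustrative), "lea 4(%rbx,%rcx,8), %rax" may become

       mov %rcx, %rax
       sal $3, %rax      ; emitted as MULT, split again later
       add %rbx, %rax
       add $4, %rax

   while a scaled add into the same register, r1 = r1 + 2*r2, is
   expanded as two plain additions of r2.  */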
1631
1632void
1633ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
1634{
1635 unsigned int regno0, regno1, regno2;
1636 struct ix86_address parts;
1637 rtx target, tmp;
1638 int ok, adds;
1639
1640 ok = ix86_decompose_address (operands[1], &parts);
1641 gcc_assert (ok);
1642
1643 target = gen_lowpart (mode, operands[0]);
1644
1645 regno0 = true_regnum (target);
1646 regno1 = INVALID_REGNUM;
1647 regno2 = INVALID_REGNUM;
1648
1649 if (parts.base)
1650 {
1651 parts.base = gen_lowpart (mode, parts.base);
1652 regno1 = true_regnum (parts.base);
1653 }
1654
1655 if (parts.index)
1656 {
1657 parts.index = gen_lowpart (mode, parts.index);
1658 regno2 = true_regnum (parts.index);
1659 }
1660
1661 if (parts.disp)
1662 parts.disp = gen_lowpart (mode, parts.disp);
1663
1664 if (parts.scale > 1)
1665 {
1666 /* Case r1 = r1 + ... */
1667 if (regno1 == regno0)
1668 {
1669 /* If we have a case r1 = r1 + C * r2 then we
1670 would have to use multiplication, which is very
1671 expensive. Assume the cost model is wrong if we
1672 see such a case here. */
1673 gcc_assert (regno2 != regno0);
1674
1675 for (adds = parts.scale; adds > 0; adds--)
1676 ix86_emit_binop (PLUS, mode, target, parts.index);
1677 }
1678 else
1679 {
1680 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
1681 if (regno0 != regno2)
1682 emit_insn (gen_rtx_SET (target, parts.index));
1683
1684 /* Use shift for scaling, but emit it as MULT instead
1685 to avoid it being immediately peephole2 optimized back
1686 into lea. */
1687 ix86_emit_binop (MULT, mode, target, GEN_INT (parts.scale));
1688
1689 if (parts.base)
1690 ix86_emit_binop (PLUS, mode, target, parts.base);
1691
1692 if (parts.disp && parts.disp != const0_rtx)
1693 ix86_emit_binop (PLUS, mode, target, parts.disp);
1694 }
1695 }
1696 else if (!parts.base && !parts.index)
1697 {
1698 gcc_assert(parts.disp);
1699 emit_insn (gen_rtx_SET (target, parts.disp));
1700 }
1701 else
1702 {
1703 if (!parts.base)
1704 {
1705 if (regno0 != regno2)
1706 emit_insn (gen_rtx_SET (target, parts.index));
1707 }
1708 else if (!parts.index)
1709 {
1710 if (regno0 != regno1)
1711 emit_insn (gen_rtx_SET (target, parts.base));
1712 }
1713 else
1714 {
1715 if (regno0 == regno1)
1716 tmp = parts.index;
1717 else if (regno0 == regno2)
1718 tmp = parts.base;
1719 else
1720 {
1721 rtx tmp1;
1722
1723 /* Find better operand for SET instruction, depending
1724 on which definition is farther from the insn. */
1725 if (find_nearest_reg_def (insn, regno1, regno2))
1726 tmp = parts.index, tmp1 = parts.base;
1727 else
1728 tmp = parts.base, tmp1 = parts.index;
1729
1730 emit_insn (gen_rtx_SET (target, tmp));
1731
1732 if (parts.disp && parts.disp != const0_rtx)
1733 ix86_emit_binop (PLUS, mode, target, parts.disp);
1734
1735 ix86_emit_binop (PLUS, mode, target, tmp1);
1736 return;
1737 }
1738
1739 ix86_emit_binop (PLUS, mode, target, tmp);
1740 }
1741
1742 if (parts.disp && parts.disp != const0_rtx)
1743 ix86_emit_binop (PLUS, mode, target, parts.disp);
1744 }
1745}
1746
1747/* Post-reload splitter for converting an SF or DFmode value in an
1748 SSE register into an unsigned SImode. */
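/* The unsigned conversion is built on the signed cvttps2dq/cvttpd2dq:
   lanes that are >= 2**31 are detected with a LE compare, have 2**31
   subtracted before the signed conversion, and get the sign bit
   xor-ed back in afterwards.  */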
1749
1750void
1751ix86_split_convert_uns_si_sse (rtx operands[])
1752{
1753 machine_mode vecmode;
1754 rtx value, large, zero_or_two31, input, two31, x;
1755
1756 large = operands[1];
1757 zero_or_two31 = operands[2];
1758 input = operands[3];
1759 two31 = operands[4];
1760 vecmode = GET_MODE (large);
1761 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
1762
1763 /* Load up the value into the low element. We must ensure that the other
1764 elements are valid floats -- zero is the easiest such value. */
1765 if (MEM_P (input))
1766 {
1767 if (vecmode == V4SFmode)
1768 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
1769 else
1770 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
1771 }
1772 else
1773 {
1774 input = gen_rtx_REG (vecmode, REGNO (input));
1775 emit_move_insn (value, CONST0_RTX (vecmode));
1776 if (vecmode == V4SFmode)
1777 emit_insn (gen_sse_movss_v4sf (value, value, input));
1778 else
1779 emit_insn (gen_sse2_movsd_v2df (value, value, input));
1780 }
1781
1782 emit_move_insn (large, two31);
1783 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
1784
1785 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
1786 emit_insn (gen_rtx_SET (large, x));
1787
1788 x = gen_rtx_AND (vecmode, zero_or_two31, large);
1789 emit_insn (gen_rtx_SET (zero_or_two31, x));
1790
1791 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
1792 emit_insn (gen_rtx_SET (value, x));
1793
1794 large = gen_rtx_REG (V4SImode, REGNO (large));
1795 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
1796
1797 x = gen_rtx_REG (V4SImode, REGNO (value));
1798 if (vecmode == V4SFmode)
1799 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
1800 else
1801 emit_insn (gen_sse2_cvttpd2dq (x, value));
1802 value = x;
1803
1804 emit_insn (gen_xorv4si3 (value, value, large));
1805}
1806
1807static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
1808 machine_mode mode, rtx target,
1809 rtx var, int one_var);
1810
1811/* Convert an unsigned DImode value into a DFmode, using only SSE.
1812 Expects the 64-bit DImode to be supplied in a pair of integral
1813 registers. Requires SSE2; will use SSE3 if available. For x86_32,
1814 -mfpmath=sse, !optimize_size only. */
1815
1816void
1817ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
1818{
1819 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
1820 rtx int_xmm, fp_xmm;
1821 rtx biases, exponents;
1822 rtx x;
1823
1824 int_xmm = gen_reg_rtx (V4SImode);
1825 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
1826 emit_insn (gen_movdi_to_sse (int_xmm, input));
1827 else if (TARGET_SSE_SPLIT_REGS)
1828 {
1829 emit_clobber (int_xmm);
1830 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
1831 }
1832 else
1833 {
1834 x = gen_reg_rtx (V2DImode);
1835 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
1836 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
1837 }
1838
1839 x = gen_rtx_CONST_VECTOR (V4SImode,
1840 gen_rtvec (4, GEN_INT (0x43300000UL),
1841 GEN_INT (0x45300000UL),
1842 const0_rtx, const0_rtx));
1843 exponents = validize_mem (force_const_mem (V4SImode, x));
1844
1845 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
1846 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
1847
1848 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
1849 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
1850 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
1851 (0x1.0p84 + double(fp_value_hi_xmm)).
1852 Note these exponents differ by 32. */
1853
1854 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
1855
1856 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
1857 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
1858 real_ldexp (&bias_lo_rvt, &dconst1, 52);
1859 real_ldexp (&bias_hi_rvt, &dconst1, 84);
1860 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
1861 x = const_double_from_real_value (bias_hi_rvt, DFmode);
1862 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
1863 biases = validize_mem (force_const_mem (V2DFmode, biases));
1864 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
1865
1866 /* Add the upper and lower DFmode values together. */
1867 if (TARGET_SSE3)
1868 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
1869 else
1870 {
1871 x = copy_to_mode_reg (V2DFmode, fp_xmm);
1872 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
1873 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
1874 }
1875
1876 ix86_expand_vector_extract (false, target, fp_xmm, 0);
1877}
1878
1879/* Not used, but eases macroization of patterns. */
1880void
1881ix86_expand_convert_uns_sixf_sse (rtx, rtx)
1882{
1883 gcc_unreachable ();
1884}
1885
1886static rtx ix86_expand_sse_fabs (rtx op0, rtx *smask);
1887
1888/* Convert an unsigned SImode value into a DFmode. Only currently used
1889 for SSE, but applicable anywhere. */
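/* The trick: adding INT_MIN reinterprets the unsigned input as a
   signed value biased by -2**31; after the signed SImode->DFmode
   conversion the bias is added back as the exact constant 2**31.0.  */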
1890
1891void
1892ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
1893{
1894 REAL_VALUE_TYPE TWO31r;
1895 rtx x, fp;
1896
1897 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
1898 NULL, 1, OPTAB_DIRECT);
1899
1900 fp = gen_reg_rtx (DFmode);
1901 emit_insn (gen_floatsidf2 (fp, x));
1902
1903 real_ldexp (&TWO31r, &dconst1, 31);
1904 x = const_double_from_real_value (TWO31r, DFmode);
1905
1906 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
1907
1908 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
1909 if (HONOR_SIGNED_ZEROS (DFmode) && flag_rounding_math)
1910 x = ix86_expand_sse_fabs (x, NULL);
1911
1912 if (x != target)
1913 emit_move_insn (target, x);
1914}
1915
1916/* Convert a signed DImode value into a DFmode. Only used for SSE in
1917 32-bit mode; otherwise we have a direct convert instruction. */
1918
1919void
1920ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
1921{
1922 REAL_VALUE_TYPE TWO32r;
1923 rtx fp_lo, fp_hi, x;
1924
1925 fp_lo = gen_reg_rtx (DFmode);
1926 fp_hi = gen_reg_rtx (DFmode);
1927
1928 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
1929
1930 real_ldexp (&TWO32r, &dconst1, 32);
1931 x = const_double_from_real_value (TWO32r, DFmode);
1932 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
1933
1934 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
1935
1936 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
1937 0, OPTAB_DIRECT);
1938 if (x != target)
1939 emit_move_insn (target, x);
1940}
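
/* Illustrative sketch, not part of GCC: the split used above, in scalar C.
   The signed high half is converted and scaled by 2^32, the unsigned low
   half is converted separately, and the two parts are added; both partial
   results are exact, so only the final addition rounds.  Helper name made
   up; the right shift of a negative value assumes an arithmetic shift.  */
#include <stdint.h>

static double
int64_to_double_via_halves (int64_t v)
{
  int32_t hi = (int32_t) (v >> 32);   /* signed high half */
  uint32_t lo = (uint32_t) v;         /* unsigned low half */
  return (double) hi * 0x1.0p32 + (double) lo;
}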
1941
1942/* Convert an unsigned SImode value into a SFmode, using only SSE.
1943 For x86_32, -mfpmath=sse, !optimize_size only. */
1944void
1945ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
1946{
1947 REAL_VALUE_TYPE ONE16r;
1948 rtx fp_hi, fp_lo, int_hi, int_lo, x;
1949
1950 real_ldexp (&ONE16r, &dconst1, 16);
1951 x = const_double_from_real_value (ONE16r, SFmode);
1952 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
1953 NULL, 0, OPTAB_DIRECT);
1954 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
1955 NULL, 0, OPTAB_DIRECT);
1956 fp_hi = gen_reg_rtx (SFmode);
1957 fp_lo = gen_reg_rtx (SFmode);
1958 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
1959 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
ad9fcb96
L
1960 if (TARGET_FMA)
1961 {
1962 x = validize_mem (force_const_mem (SFmode, x));
1963 fp_hi = gen_rtx_FMA (SFmode, fp_hi, x, fp_lo);
1964 emit_move_insn (target, fp_hi);
1965 }
1966 else
1967 {
1968 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
1969 0, OPTAB_DIRECT);
1970 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
1971 0, OPTAB_DIRECT);
1972 if (!rtx_equal_p (target, fp_hi))
1973 emit_move_insn (target, fp_hi);
1974 }
2bf6d935
ML
1975}
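
/* Illustrative sketch, not part of GCC: the 16/16 split used above, as
   scalar C.  Both halves convert to float exactly, and hi * 0x1.0p16f is
   exact as well, so the final addition (or FMA) is the only rounding step.
   Helper name made up for the example.  */
#include <stdint.h>

static float
uns32_to_float_split (uint32_t u)
{
  float lo = (float) (u & 0xffff);
  float hi = (float) (u >> 16);
  return hi * 0x1.0p16f + lo;
}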
1976
1977/* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
1978 a vector of unsigned ints VAL to vector of floats TARGET. */
1979
1980void
1981ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
1982{
1983 rtx tmp[8];
1984 REAL_VALUE_TYPE TWO16r;
1985 machine_mode intmode = GET_MODE (val);
1986 machine_mode fltmode = GET_MODE (target);
1987 rtx (*cvt) (rtx, rtx);
1988
1989 if (intmode == V4SImode)
1990 cvt = gen_floatv4siv4sf2;
1991 else
1992 cvt = gen_floatv8siv8sf2;
1993 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
1994 tmp[0] = force_reg (intmode, tmp[0]);
1995 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
1996 OPTAB_DIRECT);
1997 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
1998 NULL_RTX, 1, OPTAB_DIRECT);
1999 tmp[3] = gen_reg_rtx (fltmode);
2000 emit_insn (cvt (tmp[3], tmp[1]));
2001 tmp[4] = gen_reg_rtx (fltmode);
2002 emit_insn (cvt (tmp[4], tmp[2]));
2003 real_ldexp (&TWO16r, &dconst1, 16);
2004 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
2005 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
ad9fcb96
L
2006 if (TARGET_FMA)
2007 {
2008 tmp[6] = gen_rtx_FMA (fltmode, tmp[4], tmp[5], tmp[3]);
2009 emit_move_insn (target, tmp[6]);
2010 }
2011 else
2012 {
2013 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5],
2014 NULL_RTX, 1, OPTAB_DIRECT);
2015 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6],
2016 target, 1, OPTAB_DIRECT);
2017 if (tmp[7] != target)
2018 emit_move_insn (target, tmp[7]);
2019 }
2bf6d935
ML
2020}
2021
2022/* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
2023 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
2024 This is done by using just the signed conversion if the value is < 0x1p31,
2025 and otherwise by subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
2026
2027rtx
2028ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
2029{
2030 REAL_VALUE_TYPE TWO31r;
2031 rtx two31r, tmp[4];
2032 machine_mode mode = GET_MODE (val);
2033 machine_mode scalarmode = GET_MODE_INNER (mode);
2034 machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
2035 rtx (*cmp) (rtx, rtx, rtx, rtx);
2036 int i;
2037
2038 for (i = 0; i < 3; i++)
2039 tmp[i] = gen_reg_rtx (mode);
2040 real_ldexp (&TWO31r, &dconst1, 31);
2041 two31r = const_double_from_real_value (TWO31r, scalarmode);
2042 two31r = ix86_build_const_vector (mode, 1, two31r);
2043 two31r = force_reg (mode, two31r);
2044 switch (mode)
2045 {
2046 case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
2047 case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
2048 case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
2049 case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
2050 default: gcc_unreachable ();
2051 }
2052 tmp[3] = gen_rtx_LE (mode, two31r, val);
2053 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
2054 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
2055 0, OPTAB_DIRECT);
2056 if (intmode == V4SImode || TARGET_AVX2)
2057 *xorp = expand_simple_binop (intmode, ASHIFT,
2058 gen_lowpart (intmode, tmp[0]),
2059 GEN_INT (31), NULL_RTX, 0,
2060 OPTAB_DIRECT);
2061 else
2062 {
6a556ba4 2063 rtx two31 = gen_int_mode (HOST_WIDE_INT_1U << 31, SImode);
2bf6d935
ML
2064 two31 = ix86_build_const_vector (intmode, 1, two31);
2065 *xorp = expand_simple_binop (intmode, AND,
2066 gen_lowpart (intmode, tmp[0]),
2067 two31, NULL_RTX, 0,
2068 OPTAB_DIRECT);
2069 }
2070 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
2071 0, OPTAB_DIRECT);
2072}
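
/* Illustrative sketch, not part of GCC: a scalar version of the adjustment
   above for double -> uint32_t, assuming the input is already in
   [0, 0x1.0p32).  The vector code does both arms branchlessly with a mask
   compare; the helper name is made up for the example.  */
#include <stdint.h>

static uint32_t
double_to_uns32_via_signed (double d)
{
  if (d < 0x1.0p31)
    return (uint32_t) (int32_t) d;   /* plain signed truncation */

  /* Subtract 2^31 so the signed truncation is in range, then xor the
     missing 0x80000000 back in (the *XORP value above).  */
  return ((uint32_t) (int32_t) (d - 0x1.0p31)) ^ 0x80000000u;
}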
2073
2074/* Generate code for floating point ABS or NEG. */
2075
2076void
2077ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
2078 rtx operands[])
2079{
f359611b 2080 rtx set, dst, src;
2bf6d935
ML
2081 bool use_sse = false;
2082 bool vector_mode = VECTOR_MODE_P (mode);
2083 machine_mode vmode = mode;
f359611b 2084 rtvec par;
2bf6d935 2085
75a97b59
L
2086 if (vector_mode || mode == TFmode || mode == HFmode)
2087 {
2088 use_sse = true;
2089 if (mode == HFmode)
2090 vmode = V8HFmode;
2091 }
2bf6d935
ML
2092 else if (TARGET_SSE_MATH)
2093 {
2094 use_sse = SSE_FLOAT_MODE_P (mode);
2095 if (mode == SFmode)
2096 vmode = V4SFmode;
2097 else if (mode == DFmode)
2098 vmode = V2DFmode;
2099 }
2100
2bf6d935
ML
2101 dst = operands[0];
2102 src = operands[1];
2103
2104 set = gen_rtx_fmt_e (code, mode, src);
2105 set = gen_rtx_SET (dst, set);
2106
f359611b 2107 if (use_sse)
2bf6d935 2108 {
f359611b 2109 rtx mask, use, clob;
2bf6d935 2110
f359611b
UB
2111 /* NEG and ABS performed with SSE use bitwise mask operations.
2112 Create the appropriate mask now. */
2113 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
2bf6d935 2114 use = gen_rtx_USE (VOIDmode, mask);
94f687bd 2115 if (vector_mode || mode == TFmode)
2bf6d935
ML
2116 par = gen_rtvec (2, set, use);
2117 else
2118 {
2119 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
2120 par = gen_rtvec (3, set, use, clob);
2121 }
2bf6d935
ML
2122 }
2123 else
f359611b
UB
2124 {
2125 rtx clob;
2126
2127 /* The sign of an FP value can also be changed using the integer unit. */
2128 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
2129 par = gen_rtvec (2, set, clob);
2130 }
2131
2132 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
2133}
2134
2135/* Deconstruct a floating point ABS or NEG operation
2136 with integer registers into integer operations. */
2137
2138void
2139ix86_split_fp_absneg_operator (enum rtx_code code, machine_mode mode,
2140 rtx operands[])
2141{
2142 enum rtx_code absneg_op;
2143 rtx dst, set;
2144
2145 gcc_assert (operands_match_p (operands[0], operands[1]));
2146
2147 switch (mode)
2148 {
2149 case E_SFmode:
2150 dst = gen_lowpart (SImode, operands[0]);
2151
2152 if (code == ABS)
2153 {
2154 set = gen_int_mode (0x7fffffff, SImode);
2155 absneg_op = AND;
2156 }
2157 else
2158 {
2159 set = gen_int_mode (0x80000000, SImode);
2160 absneg_op = XOR;
2161 }
2162 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
2163 break;
2164
2165 case E_DFmode:
2166 if (TARGET_64BIT)
2167 {
2168 dst = gen_lowpart (DImode, operands[0]);
2169 dst = gen_rtx_ZERO_EXTRACT (DImode, dst, const1_rtx, GEN_INT (63));
2170
2171 if (code == ABS)
2172 set = const0_rtx;
2173 else
2174 set = gen_rtx_NOT (DImode, dst);
2175 }
2176 else
2177 {
2178 dst = gen_highpart (SImode, operands[0]);
2179
2180 if (code == ABS)
2181 {
2182 set = gen_int_mode (0x7fffffff, SImode);
2183 absneg_op = AND;
2184 }
2185 else
2186 {
2187 set = gen_int_mode (0x80000000, SImode);
2188 absneg_op = XOR;
2189 }
2190 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
2191 }
2192 break;
2193
2194 case E_XFmode:
2195 dst = gen_rtx_REG (SImode,
2196 REGNO (operands[0]) + (TARGET_64BIT ? 1 : 2));
2197 if (code == ABS)
2198 {
2199 set = GEN_INT (0x7fff);
2200 absneg_op = AND;
2201 }
2202 else
2203 {
2204 set = GEN_INT (0x8000);
2205 absneg_op = XOR;
2206 }
2207 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
2208 break;
2209
2210 default:
2211 gcc_unreachable ();
2212 }
2213
2214 set = gen_rtx_SET (dst, set);
2215
2216 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
2217 rtvec par = gen_rtvec (2, set, clob);
2218
2219 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
2bf6d935
ML
2220}
2221
2222/* Expand a copysign operation. Special case operand 0 being a constant. */
2223
2224void
2225ix86_expand_copysign (rtx operands[])
2226{
2227 machine_mode mode, vmode;
7e691189 2228 rtx dest, vdest, op0, op1, mask, op2, op3;
2bf6d935 2229
60efb1fe 2230 mode = GET_MODE (operands[0]);
2bf6d935 2231
75a97b59
L
2232 if (mode == HFmode)
2233 vmode = V8HFmode;
2234 else if (mode == SFmode)
2bf6d935
ML
2235 vmode = V4SFmode;
2236 else if (mode == DFmode)
2237 vmode = V2DFmode;
987a3082 2238 else if (mode == TFmode)
2bf6d935 2239 vmode = mode;
987a3082
UB
2240 else
2241 gcc_unreachable ();
2242
60efb1fe 2243 if (rtx_equal_p (operands[1], operands[2]))
2bf6d935 2244 {
60efb1fe 2245 emit_move_insn (operands[0], operands[1]);
2bf6d935
ML
2246 return;
2247 }
2248
7e691189
JJ
2249 dest = operands[0];
2250 vdest = lowpart_subreg (vmode, dest, mode);
2251 if (vdest == NULL_RTX)
2252 vdest = gen_reg_rtx (vmode);
2253 else
2254 dest = NULL_RTX;
2255 op1 = lowpart_subreg (vmode, force_reg (mode, operands[2]), mode);
60efb1fe 2256 mask = ix86_build_signbit_mask (vmode, 0, 0);
2bf6d935 2257
60efb1fe 2258 if (CONST_DOUBLE_P (operands[1]))
2bf6d935 2259 {
60efb1fe 2260 op0 = simplify_unary_operation (ABS, mode, operands[1], mode);
2261 /* Optimize for 0: simplify b = copysignf (0.0f, a) to b = mask & a. */
2262 if (op0 == CONST0_RTX (mode))
2bf6d935 2263 {
7e691189
JJ
2264 emit_move_insn (vdest, gen_rtx_AND (vmode, mask, op1));
2265 if (dest)
2266 emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
60efb1fe 2267 return;
2bf6d935 2268 }
2bf6d935 2269
60efb1fe 2270 if (GET_MODE_SIZE (mode) < 16)
2271 op0 = ix86_build_const_vector (vmode, false, op0);
2272 op0 = force_reg (vmode, op0);
2bf6d935 2273 }
60efb1fe 2274 else
7e691189 2275 op0 = lowpart_subreg (vmode, force_reg (mode, operands[1]), mode);
60efb1fe 2276
2277 op2 = gen_reg_rtx (vmode);
2278 op3 = gen_reg_rtx (vmode);
2279 emit_move_insn (op2, gen_rtx_AND (vmode,
2280 gen_rtx_NOT (vmode, mask),
2281 op0));
2282 emit_move_insn (op3, gen_rtx_AND (vmode, mask, op1));
7e691189
JJ
2283 emit_move_insn (vdest, gen_rtx_IOR (vmode, op2, op3));
2284 if (dest)
2285 emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
2bf6d935
ML
2286}
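
/* Illustrative sketch, not part of GCC: the and/andnot/or mask sequence
   used above, on scalar float bit patterns.  Helper name made up; assumes
   a 32-bit IEEE-754 float.  */
#include <stdint.h>
#include <string.h>

static float
copysignf_bits (float mag, float sgn)
{
  uint32_t m, s, r;
  const uint32_t sign_mask = 0x80000000u;

  memcpy (&m, &mag, sizeof m);
  memcpy (&s, &sgn, sizeof s);
  r = (m & ~sign_mask) | (s & sign_mask);   /* magnitude of MAG, sign of SGN */
  memcpy (&mag, &r, sizeof mag);
  return mag;
}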
2287
2288/* Expand an xorsign operation. */
2289
2290void
2291ix86_expand_xorsign (rtx operands[])
2292{
2bf6d935 2293 machine_mode mode, vmode;
7e691189 2294 rtx dest, vdest, op0, op1, mask, x, temp;
2bf6d935
ML
2295
2296 dest = operands[0];
2297 op0 = operands[1];
2298 op1 = operands[2];
2299
2300 mode = GET_MODE (dest);
2301
75a97b59
L
2302 if (mode == HFmode)
2303 vmode = V8HFmode;
2304 else if (mode == SFmode)
987a3082 2305 vmode = V4SFmode;
2bf6d935 2306 else if (mode == DFmode)
987a3082 2307 vmode = V2DFmode;
2bf6d935
ML
2308 else
2309 gcc_unreachable ();
2310
7485a525 2311 temp = gen_reg_rtx (vmode);
2bf6d935
ML
2312 mask = ix86_build_signbit_mask (vmode, 0, 0);
2313
7e691189 2314 op1 = lowpart_subreg (vmode, force_reg (mode, op1), mode);
7485a525
JJ
2315 x = gen_rtx_AND (vmode, op1, mask);
2316 emit_insn (gen_rtx_SET (temp, x));
2bf6d935 2317
7e691189 2318 op0 = lowpart_subreg (vmode, force_reg (mode, op0), mode);
7485a525 2319 x = gen_rtx_XOR (vmode, temp, op0);
652bef70 2320
7e691189
JJ
2321 vdest = lowpart_subreg (vmode, dest, mode);
2322 if (vdest == NULL_RTX)
2323 vdest = gen_reg_rtx (vmode);
2324 else
2325 dest = NULL_RTX;
2326 emit_insn (gen_rtx_SET (vdest, x));
2327
2328 if (dest)
2329 emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
2bf6d935
ML
2330}
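
/* Illustrative sketch, not part of GCC: the and-then-xor sequence used
   above, on scalar float bit patterns -- flip the sign of X whenever Y is
   negative, i.e. X * copysignf (1.0f, Y) for finite values.  Helper name
   made up for the example.  */
#include <stdint.h>
#include <string.h>

static float
xorsignf_bits (float x, float y)
{
  uint32_t xb, yb;
  memcpy (&xb, &x, sizeof xb);
  memcpy (&yb, &y, sizeof yb);
  xb ^= yb & 0x80000000u;   /* AND with the sign mask, XOR into X */
  memcpy (&x, &xb, sizeof x);
  return x;
}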
2331
2332static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1);
2333
2334void
2335ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
2336{
2337 machine_mode mode = GET_MODE (op0);
2338 rtx tmp;
2339
2340 /* Handle the special case of a vector comparison with a boolean result;
2341 transform it using the ptest instruction. */
850a13d7 2342 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
2343 || mode == OImode)
2bf6d935
ML
2344 {
2345 rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
2346 machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
2347
2348 gcc_assert (code == EQ || code == NE);
850a13d7 2349
2350 if (mode == OImode)
2351 {
2352 op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
2353 op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
2354 mode = p_mode;
2355 }
2bf6d935
ML
2356 /* Generate XOR since we can't check that one operand is a zero vector. */
2357 tmp = gen_reg_rtx (mode);
2358 emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
2359 tmp = gen_lowpart (p_mode, tmp);
2360 emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
2361 gen_rtx_UNSPEC (CCmode,
2362 gen_rtvec (2, tmp, tmp),
2363 UNSPEC_PTEST)));
2364 tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
2365 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2366 gen_rtx_LABEL_REF (VOIDmode, label),
2367 pc_rtx);
2368 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2369 return;
2370 }
2371
2372 switch (mode)
2373 {
a6841211 2374 case E_HFmode:
2bf6d935
ML
2375 case E_SFmode:
2376 case E_DFmode:
2377 case E_XFmode:
2378 case E_QImode:
2379 case E_HImode:
2380 case E_SImode:
2381 simple:
2382 tmp = ix86_expand_compare (code, op0, op1);
2383 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2384 gen_rtx_LABEL_REF (VOIDmode, label),
2385 pc_rtx);
2386 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2387 return;
2388
2389 case E_DImode:
2390 if (TARGET_64BIT)
2391 goto simple;
2bf6d935
ML
2392 /* FALLTHRU */
2393 case E_TImode:
43201f2c
RS
2394 /* DI and TI mode equality/inequality comparisons may be performed
2395 on SSE registers. Avoid splitting them, except when optimizing
2396 for size. */
2397 if ((code == EQ || code == NE)
2398 && !optimize_insn_for_size_p ())
2399 goto simple;
2400
2bf6d935
ML
2401 /* Expand DImode branch into multiple compare+branch. */
2402 {
2403 rtx lo[2], hi[2];
2404 rtx_code_label *label2;
2405 enum rtx_code code1, code2, code3;
2406 machine_mode submode;
2407
2408 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
2409 {
2410 std::swap (op0, op1);
2411 code = swap_condition (code);
2412 }
2413
2414 split_double_mode (mode, &op0, 1, lo+0, hi+0);
2415 split_double_mode (mode, &op1, 1, lo+1, hi+1);
2416
2417 submode = mode == DImode ? SImode : DImode;
2418
43201f2c 2419 /* If we are doing less-than or greater-or-equal-than,
2bf6d935
ML
2420 op1 is a constant and the low word is zero, then we can just
2421 examine the high word. Similarly for low word -1 and
2422 less-or-equal-than or greater-than. */
2423
2424 if (CONST_INT_P (hi[1]))
2425 switch (code)
2426 {
2427 case LT: case LTU: case GE: case GEU:
2428 if (lo[1] == const0_rtx)
2429 {
2430 ix86_expand_branch (code, hi[0], hi[1], label);
2431 return;
2432 }
2433 break;
2434 case LE: case LEU: case GT: case GTU:
2435 if (lo[1] == constm1_rtx)
2436 {
2437 ix86_expand_branch (code, hi[0], hi[1], label);
2438 return;
2439 }
2440 break;
2441 default:
2442 break;
2443 }
2444
2445 /* Emulate comparisons that do not depend on Zero flag with
2446 double-word subtraction. Note that only Overflow, Sign
2447 and Carry flags are valid, so swap arguments and condition
2448 of comparisons that would otherwise test Zero flag. */
2449
2450 switch (code)
2451 {
2452 case LE: case LEU: case GT: case GTU:
2453 std::swap (lo[0], lo[1]);
2454 std::swap (hi[0], hi[1]);
2455 code = swap_condition (code);
2456 /* FALLTHRU */
2457
2458 case LT: case LTU: case GE: case GEU:
2459 {
2bf6d935 2460 bool uns = (code == LTU || code == GEU);
987a3082
UB
2461 rtx (*sbb_insn) (machine_mode, rtx, rtx, rtx)
2462 = uns ? gen_sub3_carry_ccc : gen_sub3_carry_ccgz;
2bf6d935
ML
2463
2464 if (!nonimmediate_operand (lo[0], submode))
2465 lo[0] = force_reg (submode, lo[0]);
2466 if (!x86_64_general_operand (lo[1], submode))
2467 lo[1] = force_reg (submode, lo[1]);
2468
2469 if (!register_operand (hi[0], submode))
2470 hi[0] = force_reg (submode, hi[0]);
2471 if ((uns && !nonimmediate_operand (hi[1], submode))
2472 || (!uns && !x86_64_general_operand (hi[1], submode)))
2473 hi[1] = force_reg (submode, hi[1]);
2474
987a3082 2475 emit_insn (gen_cmp_1 (submode, lo[0], lo[1]));
2bf6d935 2476
987a3082
UB
2477 tmp = gen_rtx_SCRATCH (submode);
2478 emit_insn (sbb_insn (submode, tmp, hi[0], hi[1]));
2bf6d935 2479
987a3082 2480 tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
2bf6d935
ML
2481 ix86_expand_branch (code, tmp, const0_rtx, label);
2482 return;
2483 }
2484
2485 default:
2486 break;
2487 }
2488
2489 /* Otherwise, we need two or three jumps. */
2490
2491 label2 = gen_label_rtx ();
2492
2493 code1 = code;
2494 code2 = swap_condition (code);
2495 code3 = unsigned_condition (code);
2496
2497 switch (code)
2498 {
2499 case LT: case GT: case LTU: case GTU:
2500 break;
2501
2502 case LE: code1 = LT; code2 = GT; break;
2503 case GE: code1 = GT; code2 = LT; break;
2504 case LEU: code1 = LTU; code2 = GTU; break;
2505 case GEU: code1 = GTU; code2 = LTU; break;
2506
2507 case EQ: code1 = UNKNOWN; code2 = NE; break;
2508 case NE: code2 = UNKNOWN; break;
2509
2510 default:
2511 gcc_unreachable ();
2512 }
2513
2514 /*
2515 * a < b =>
2516 * if (hi(a) < hi(b)) goto true;
2517 * if (hi(a) > hi(b)) goto false;
2518 * if (lo(a) < lo(b)) goto true;
2519 * false:
2520 */
2521
2522 if (code1 != UNKNOWN)
2523 ix86_expand_branch (code1, hi[0], hi[1], label);
2524 if (code2 != UNKNOWN)
2525 ix86_expand_branch (code2, hi[0], hi[1], label2);
2526
2527 ix86_expand_branch (code3, lo[0], lo[1], label);
2528
2529 if (code2 != UNKNOWN)
2530 emit_label (label2);
2531 return;
2532 }
2533
2534 default:
2535 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
2536 goto simple;
2537 }
2538}
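
/* Illustrative sketch, not part of GCC: the two decompositions used above
   for a double-word compare, written for 32-bit halves in plain C.  The
   helper names are made up for the example.  */
#include <stdint.h>
#include <stdbool.h>

/* "cmp lo; sbb hi" form: the borrow out of the double-word subtraction is
   exactly the unsigned less-than result.  */
static bool
ltu_via_sbb (uint32_t lo0, uint32_t hi0, uint32_t lo1, uint32_t hi1)
{
  bool borrow = lo0 < lo1;                           /* CF from comparing the low words */
  return (uint64_t) hi0 < (uint64_t) hi1 + borrow;   /* does hi0 - hi1 - CF borrow? */
}

/* Two-or-three-jump form: signed compare on the high words, unsigned
   compare on the low words.  */
static bool
lt_via_jumps (uint32_t lo0, int32_t hi0, uint32_t lo1, int32_t hi1)
{
  if (hi0 < hi1)
    return true;
  if (hi0 > hi1)
    return false;
  return lo0 < lo1;
}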
2539
2540/* Figure out whether to use unordered fp comparisons. */
2541
2542static bool
2543ix86_unordered_fp_compare (enum rtx_code code)
2544{
2545 if (!TARGET_IEEE_FP)
2546 return false;
2547
2548 switch (code)
2549 {
2bf6d935
ML
2550 case LT:
2551 case LE:
d6038777
UB
2552 case GT:
2553 case GE:
2554 case LTGT:
2bf6d935
ML
2555 return false;
2556
2557 case EQ:
2558 case NE:
2559
2bf6d935
ML
2560 case UNORDERED:
2561 case ORDERED:
2562 case UNLT:
2563 case UNLE:
2564 case UNGT:
2565 case UNGE:
2566 case UNEQ:
2567 return true;
2568
2569 default:
2570 gcc_unreachable ();
2571 }
2572}
2573
2574/* Return a comparison we can do that is equivalent to
2575 swap_condition (code), apart possibly from orderedness.
2576 Never change orderedness if TARGET_IEEE_FP, returning
2577 UNKNOWN in that case if necessary. */
2578
2579static enum rtx_code
2580ix86_fp_swap_condition (enum rtx_code code)
2581{
2582 switch (code)
2583 {
2584 case GT: /* GTU - CF=0 & ZF=0 */
2585 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
2586 case GE: /* GEU - CF=0 */
2587 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
2588 case UNLT: /* LTU - CF=1 */
2589 return TARGET_IEEE_FP ? UNKNOWN : GT;
2590 case UNLE: /* LEU - CF=1 | ZF=1 */
2591 return TARGET_IEEE_FP ? UNKNOWN : GE;
2592 default:
2593 return swap_condition (code);
2594 }
2595}
2596
2597/* Return cost of comparison CODE using the best strategy for performance.
2598 All following functions use the number of instructions as a cost metric.
2599 In the future this should be tweaked to compute bytes for optimize_size and
2600 take into account the performance of various instructions on various CPUs. */
2601
2602static int
2603ix86_fp_comparison_cost (enum rtx_code code)
2604{
2605 int arith_cost;
2606
2607 /* The cost of code using bit-twiddling on %ah. */
2608 switch (code)
2609 {
2610 case UNLE:
2611 case UNLT:
2612 case LTGT:
2613 case GT:
2614 case GE:
2615 case UNORDERED:
2616 case ORDERED:
2617 case UNEQ:
2618 arith_cost = 4;
2619 break;
2620 case LT:
2621 case NE:
2622 case EQ:
2623 case UNGE:
2624 arith_cost = TARGET_IEEE_FP ? 5 : 4;
2625 break;
2626 case LE:
2627 case UNGT:
2628 arith_cost = TARGET_IEEE_FP ? 6 : 4;
2629 break;
2630 default:
2631 gcc_unreachable ();
2632 }
2633
2634 switch (ix86_fp_comparison_strategy (code))
2635 {
2636 case IX86_FPCMP_COMI:
2637 return arith_cost > 4 ? 3 : 2;
2638 case IX86_FPCMP_SAHF:
2639 return arith_cost > 4 ? 4 : 3;
2640 default:
2641 return arith_cost;
2642 }
2643}
2644
2645/* Swap, force into registers, or otherwise massage the two operands
2646 to a fp comparison. The operands are updated in place; the new
2647 comparison code is returned. */
2648
2649static enum rtx_code
2650ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
2651{
2652 bool unordered_compare = ix86_unordered_fp_compare (code);
2653 rtx op0 = *pop0, op1 = *pop1;
2654 machine_mode op_mode = GET_MODE (op0);
a6841211 2655 bool is_sse = SSE_FLOAT_MODE_SSEMATH_OR_HF_P (op_mode);
2bf6d935 2656
5792208f
JJ
2657 if (op_mode == BFmode)
2658 {
2659 rtx op = gen_lowpart (HImode, op0);
2660 if (CONST_INT_P (op))
2661 op = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
2662 op0, BFmode);
2663 else
2664 {
2665 rtx t1 = gen_reg_rtx (SImode);
2666 emit_insn (gen_zero_extendhisi2 (t1, op));
2667 emit_insn (gen_ashlsi3 (t1, t1, GEN_INT (16)));
2668 op = gen_lowpart (SFmode, t1);
2669 }
2670 *pop0 = op;
2671 op = gen_lowpart (HImode, op1);
2672 if (CONST_INT_P (op))
2673 op = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
2674 op1, BFmode);
2675 else
2676 {
2677 rtx t1 = gen_reg_rtx (SImode);
2678 emit_insn (gen_zero_extendhisi2 (t1, op));
2679 emit_insn (gen_ashlsi3 (t1, t1, GEN_INT (16)));
2680 op = gen_lowpart (SFmode, t1);
2681 }
2682 *pop1 = op;
2683 return ix86_prepare_fp_compare_args (code, pop0, pop1);
2684 }
2685
2bf6d935
ML
2686 /* All of the unordered compare instructions only work on registers.
2687 The same is true of the fcomi compare instructions. The XFmode
2688 compare instructions require registers except when comparing
2689 against zero or when converting operand 1 from fixed point to
2690 floating point. */
2691
2692 if (!is_sse
2693 && (unordered_compare
2694 || (op_mode == XFmode
2695 && ! (standard_80387_constant_p (op0) == 1
2696 || standard_80387_constant_p (op1) == 1)
2697 && GET_CODE (op1) != FLOAT)
2698 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
2699 {
2700 op0 = force_reg (op_mode, op0);
2701 op1 = force_reg (op_mode, op1);
2702 }
2703 else
2704 {
2705 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
2706 things around if they appear profitable, otherwise force op0
2707 into a register. */
2708
2709 if (standard_80387_constant_p (op0) == 0
2710 || (MEM_P (op0)
2711 && ! (standard_80387_constant_p (op1) == 0
2712 || MEM_P (op1))))
2713 {
2714 enum rtx_code new_code = ix86_fp_swap_condition (code);
2715 if (new_code != UNKNOWN)
2716 {
2717 std::swap (op0, op1);
2718 code = new_code;
2719 }
2720 }
2721
2722 if (!REG_P (op0))
2723 op0 = force_reg (op_mode, op0);
2724
2725 if (CONSTANT_P (op1))
2726 {
2727 int tmp = standard_80387_constant_p (op1);
2728 if (tmp == 0)
2729 op1 = validize_mem (force_const_mem (op_mode, op1));
2730 else if (tmp == 1)
2731 {
2732 if (TARGET_CMOVE)
2733 op1 = force_reg (op_mode, op1);
2734 }
2735 else
2736 op1 = force_reg (op_mode, op1);
2737 }
2738 }
2739
2740 /* Try to rearrange the comparison to make it cheaper. */
2741 if (ix86_fp_comparison_cost (code)
2742 > ix86_fp_comparison_cost (swap_condition (code))
2743 && (REG_P (op1) || can_create_pseudo_p ()))
2744 {
2745 std::swap (op0, op1);
2746 code = swap_condition (code);
2747 if (!REG_P (op0))
2748 op0 = force_reg (op_mode, op0);
2749 }
2750
2751 *pop0 = op0;
2752 *pop1 = op1;
2753 return code;
2754}
2755
2756/* Generate insn patterns to do a floating point compare of OPERANDS. */
2757
2758static rtx
2759ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)
2760{
2761 bool unordered_compare = ix86_unordered_fp_compare (code);
2762 machine_mode cmp_mode;
2763 rtx tmp, scratch;
2764
2765 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
2766
2767 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
2768 if (unordered_compare)
2769 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
2770
2771 /* Do fcomi/sahf based test when profitable. */
2772 switch (ix86_fp_comparison_strategy (code))
2773 {
2774 case IX86_FPCMP_COMI:
2775 cmp_mode = CCFPmode;
2776 emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
2777 break;
2778
2779 case IX86_FPCMP_SAHF:
2780 cmp_mode = CCFPmode;
2781 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2782 scratch = gen_reg_rtx (HImode);
2783 emit_insn (gen_rtx_SET (scratch, tmp));
2784 emit_insn (gen_x86_sahf_1 (scratch));
2785 break;
2786
2787 case IX86_FPCMP_ARITH:
2788 cmp_mode = CCNOmode;
2789 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2790 scratch = gen_reg_rtx (HImode);
2791 emit_insn (gen_rtx_SET (scratch, tmp));
2792
2793 /* In the unordered case, we have to check C2 for NaN's, which
2794 doesn't happen to work out to anything nice combination-wise.
2795 So do some bit twiddling on the value we've got in AH to come
2796 up with an appropriate set of condition codes. */
2797
2798 switch (code)
2799 {
2800 case GT:
2801 case UNGT:
2802 if (code == GT || !TARGET_IEEE_FP)
2803 {
2804 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2805 code = EQ;
2806 }
2807 else
2808 {
2809 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2810 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2811 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
2812 cmp_mode = CCmode;
2813 code = GEU;
2814 }
2815 break;
2816 case LT:
2817 case UNLT:
2818 if (code == LT && TARGET_IEEE_FP)
2819 {
2820 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2821 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
2822 cmp_mode = CCmode;
2823 code = EQ;
2824 }
2825 else
2826 {
2827 emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
2828 code = NE;
2829 }
2830 break;
2831 case GE:
2832 case UNGE:
2833 if (code == GE || !TARGET_IEEE_FP)
2834 {
2835 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
2836 code = EQ;
2837 }
2838 else
2839 {
2840 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2841 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
2842 code = NE;
2843 }
2844 break;
2845 case LE:
2846 case UNLE:
2847 if (code == LE && TARGET_IEEE_FP)
2848 {
2849 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2850 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2851 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2852 cmp_mode = CCmode;
2853 code = LTU;
2854 }
2855 else
2856 {
2857 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2858 code = NE;
2859 }
2860 break;
2861 case EQ:
2862 case UNEQ:
2863 if (code == EQ && TARGET_IEEE_FP)
2864 {
2865 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2866 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2867 cmp_mode = CCmode;
2868 code = EQ;
2869 }
2870 else
2871 {
2872 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2873 code = NE;
2874 }
2875 break;
2876 case NE:
2877 case LTGT:
2878 if (code == NE && TARGET_IEEE_FP)
2879 {
2880 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2881 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
2882 GEN_INT (0x40)));
2883 code = NE;
2884 }
2885 else
2886 {
2887 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2888 code = EQ;
2889 }
2890 break;
2891
2892 case UNORDERED:
2893 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2894 code = NE;
2895 break;
2896 case ORDERED:
2897 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2898 code = EQ;
2899 break;
2900
2901 default:
2902 gcc_unreachable ();
2903 }
2904 break;
2905
2906 default:
2907 gcc_unreachable();
2908 }
2909
2910 /* Return the test that should be put into the flags user, i.e.
2911 the bcc, scc, or cmov instruction. */
2912 return gen_rtx_fmt_ee (code, VOIDmode,
2913 gen_rtx_REG (cmp_mode, FLAGS_REG),
2914 const0_rtx);
2915}
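
/* Illustrative sketch, not part of GCC: what the 0x45/0x40/0x04/0x01 masks
   above test.  After the status word is stored, AH holds C0 (0x01),
   C2 (0x04) and C3 (0x40); fcom-style compares set none of them for ">",
   C0 for "<", C3 for "==", and all three for an unordered result.  The
   helper names are made up for the example.  */
#include <stdbool.h>

#define FPU_C0 0x01
#define FPU_C2 0x04
#define FPU_C3 0x40

static bool fp_gt_from_ah (unsigned ah)    { return (ah & (FPU_C0 | FPU_C2 | FPU_C3)) == 0; }
static bool fp_lt_from_ah (unsigned ah)    { return (ah & (FPU_C0 | FPU_C2 | FPU_C3)) == FPU_C0; }
static bool fp_eq_from_ah (unsigned ah)    { return (ah & (FPU_C0 | FPU_C2 | FPU_C3)) == FPU_C3; }
static bool fp_unord_from_ah (unsigned ah) { return (ah & FPU_C2) != 0; }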
2916
2917/* Generate insn patterns to do an integer compare of OPERANDS. */
2918
2919static rtx
2920ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
2921{
2922 machine_mode cmpmode;
2923 rtx tmp, flags;
2924
86403f4e
UB
2925 /* Swap operands to emit carry flag comparison. */
2926 if ((code == GTU || code == LEU)
2927 && nonimmediate_operand (op1, VOIDmode))
2928 {
2929 std::swap (op0, op1);
2930 code = swap_condition (code);
2931 }
2932
2bf6d935
ML
2933 cmpmode = SELECT_CC_MODE (code, op0, op1);
2934 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
2935
2936 /* This is very simple, but making the interface the same as in the
2937 FP case makes the rest of the code easier. */
2938 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
2939 emit_insn (gen_rtx_SET (flags, tmp));
2940
2941 /* Return the test that should be put into the flags user, i.e.
2942 the bcc, scc, or cmov instruction. */
2943 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
2944}
2945
2946static rtx
2947ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
2948{
2949 rtx ret;
2950
2951 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
2952 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
2953
2954 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
2955 {
2956 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
2957 ret = ix86_expand_fp_compare (code, op0, op1);
2958 }
2959 else
2960 ret = ix86_expand_int_compare (code, op0, op1);
2961
2962 return ret;
2963}
2964
2965void
2966ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
2967{
2968 rtx ret;
2969
2970 gcc_assert (GET_MODE (dest) == QImode);
2971
2972 ret = ix86_expand_compare (code, op0, op1);
2973 PUT_MODE (ret, QImode);
2974 emit_insn (gen_rtx_SET (dest, ret));
2975}
2976
463d9108
JJ
2977/* Expand floating point op0 <=> op1, i.e.
2978 dest = op0 == op1 ? 0 : op0 < op1 ? -1 : op0 > op1 ? 1 : 2. */
2979
2980void
2981ix86_expand_fp_spaceship (rtx dest, rtx op0, rtx op1)
2982{
2983 gcc_checking_assert (ix86_fp_comparison_strategy (GT) != IX86_FPCMP_ARITH);
2984 rtx gt = ix86_expand_fp_compare (GT, op0, op1);
2985 rtx l0 = gen_label_rtx ();
2986 rtx l1 = gen_label_rtx ();
2987 rtx l2 = TARGET_IEEE_FP ? gen_label_rtx () : NULL_RTX;
2988 rtx lend = gen_label_rtx ();
2989 rtx tmp;
2990 rtx_insn *jmp;
2991 if (l2)
2992 {
2993 rtx un = gen_rtx_fmt_ee (UNORDERED, VOIDmode,
2994 gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx);
2995 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, un,
2996 gen_rtx_LABEL_REF (VOIDmode, l2), pc_rtx);
2997 jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2998 add_reg_br_prob_note (jmp, profile_probability::very_unlikely ());
2999 }
3000 rtx eq = gen_rtx_fmt_ee (UNEQ, VOIDmode,
3001 gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx);
3002 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, eq,
3003 gen_rtx_LABEL_REF (VOIDmode, l0), pc_rtx);
3004 jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
3005 add_reg_br_prob_note (jmp, profile_probability::unlikely ());
3006 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, gt,
3007 gen_rtx_LABEL_REF (VOIDmode, l1), pc_rtx);
3008 jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
3009 add_reg_br_prob_note (jmp, profile_probability::even ());
3010 emit_move_insn (dest, constm1_rtx);
3011 emit_jump (lend);
3012 emit_label (l0);
3013 emit_move_insn (dest, const0_rtx);
3014 emit_jump (lend);
3015 emit_label (l1);
3016 emit_move_insn (dest, const1_rtx);
3017 emit_jump (lend);
3018 if (l2)
3019 {
3020 emit_label (l2);
3021 emit_move_insn (dest, const2_rtx);
3022 }
3023 emit_label (lend);
3024}
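
/* Illustrative sketch, not part of GCC: the value mapping that the
   expansion above implements for a floating-point <=> (the l2 label is
   only emitted when the unordered case must yield 2).  Helper name made
   up for the example.  */
static int
fp_spaceship_value (double a, double b)
{
  if (a == b)
    return 0;
  if (a < b)
    return -1;
  if (a > b)
    return 1;
  return 2;   /* unordered: at least one NaN */
}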
3025
2bf6d935
ML
3026/* Expand a comparison setting or clearing the carry flag. Return true when
3027 successful and set *POP to the resulting comparison. */
3028static bool
3029ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
3030{
3031 machine_mode mode
3032 = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
3033
3034 /* Do not handle double-mode compares that go through the special path. */
3035 if (mode == (TARGET_64BIT ? TImode : DImode))
3036 return false;
3037
3038 if (SCALAR_FLOAT_MODE_P (mode))
3039 {
3040 rtx compare_op;
3041 rtx_insn *compare_seq;
3042
3043 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
3044
3045 /* Shortcut: the following common codes never translate
3046 into carry-flag compares. */
3047 if (code == EQ || code == NE || code == UNEQ || code == LTGT
3048 || code == ORDERED || code == UNORDERED)
3049 return false;
3050
3051 /* These comparisons require the zero flag; swap the operands so they do not. */
3052 if ((code == GT || code == UNLE || code == LE || code == UNGT)
3053 && !TARGET_IEEE_FP)
3054 {
3055 std::swap (op0, op1);
3056 code = swap_condition (code);
3057 }
3058
3059 /* Try to expand the comparison and verify that we end up with
3060 a carry-flag-based comparison. This fails only when we decide
3061 to expand the comparison using arithmetic, which is not a
3062 common scenario. */
3063 start_sequence ();
3064 compare_op = ix86_expand_fp_compare (code, op0, op1);
3065 compare_seq = get_insns ();
3066 end_sequence ();
3067
3068 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
3069 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
3070 else
3071 code = GET_CODE (compare_op);
3072
3073 if (code != LTU && code != GEU)
3074 return false;
3075
3076 emit_insn (compare_seq);
3077 *pop = compare_op;
3078 return true;
3079 }
3080
3081 if (!INTEGRAL_MODE_P (mode))
3082 return false;
3083
3084 switch (code)
3085 {
3086 case LTU:
3087 case GEU:
3088 break;
3089
3090 /* Convert a==0 into (unsigned)a<1. */
3091 case EQ:
3092 case NE:
3093 if (op1 != const0_rtx)
3094 return false;
3095 op1 = const1_rtx;
3096 code = (code == EQ ? LTU : GEU);
3097 break;
3098
3099 /* Convert a>b into b<a or a>=b+1. */
3100 case GTU:
3101 case LEU:
3102 if (CONST_INT_P (op1))
3103 {
3104 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
3105 /* Bail out on overflow. We still can swap operands but that
3106 would force loading of the constant into register. */
3107 if (op1 == const0_rtx
3108 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
3109 return false;
3110 code = (code == GTU ? GEU : LTU);
3111 }
3112 else
3113 {
3114 std::swap (op0, op1);
3115 code = (code == GTU ? LTU : GEU);
3116 }
3117 break;
3118
3119 /* Convert a>=0 into (unsigned)a<0x80000000. */
3120 case LT:
3121 case GE:
3122 if (mode == DImode || op1 != const0_rtx)
3123 return false;
3124 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
3125 code = (code == LT ? GEU : LTU);
3126 break;
3127 case LE:
3128 case GT:
3129 if (mode == DImode || op1 != constm1_rtx)
3130 return false;
3131 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
3132 code = (code == LE ? GEU : LTU);
3133 break;
3134
3135 default:
3136 return false;
3137 }
3138 /* Swapping operands may cause constant to appear as first operand. */
3139 if (!nonimmediate_operand (op0, VOIDmode))
3140 {
3141 if (!can_create_pseudo_p ())
3142 return false;
3143 op0 = force_reg (mode, op0);
3144 }
3145 *pop = ix86_expand_compare (code, op0, op1);
3146 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
3147 return true;
3148}
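
/* Illustrative sketch, not part of GCC: the rewrites above as plain C
   identities on 32-bit operands; each right-hand side needs only the
   carry flag (LTU/GEU).  Helper names made up for the example.  */
#include <stdint.h>
#include <stdbool.h>

static bool eq0_as_ltu (uint32_t a)  { return a < 1u; }                       /* a == 0 */
static bool ge0_as_ltu (int32_t a)   { return (uint32_t) a < 0x80000000u; }   /* a >= 0 */
static bool gtu_as_geu (uint32_t a, uint32_t b) { return a >= b + 1u; }       /* a > b, for b != UINT32_MAX */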
3149
3150/* Expand conditional increment or decrement using adc/sbb instructions.
3151 The default case using setcc followed by the conditional move can be
3152 done by generic code. */
3153bool
3154ix86_expand_int_addcc (rtx operands[])
3155{
3156 enum rtx_code code = GET_CODE (operands[1]);
3157 rtx flags;
987a3082 3158 rtx (*insn) (machine_mode, rtx, rtx, rtx, rtx, rtx);
2bf6d935
ML
3159 rtx compare_op;
3160 rtx val = const0_rtx;
3161 bool fpcmp = false;
3162 machine_mode mode;
3163 rtx op0 = XEXP (operands[1], 0);
3164 rtx op1 = XEXP (operands[1], 1);
3165
3166 if (operands[3] != const1_rtx
3167 && operands[3] != constm1_rtx)
3168 return false;
3169 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
3170 return false;
3171 code = GET_CODE (compare_op);
3172
3173 flags = XEXP (compare_op, 0);
3174
3175 if (GET_MODE (flags) == CCFPmode)
3176 {
3177 fpcmp = true;
3178 code = ix86_fp_compare_code_to_integer (code);
3179 }
3180
3181 if (code != LTU)
3182 {
3183 val = constm1_rtx;
3184 if (fpcmp)
3185 PUT_CODE (compare_op,
3186 reverse_condition_maybe_unordered
3187 (GET_CODE (compare_op)));
3188 else
3189 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
3190 }
3191
3192 mode = GET_MODE (operands[0]);
3193
3194 /* Construct either adc or sbb insn. */
3195 if ((code == LTU) == (operands[3] == constm1_rtx))
987a3082 3196 insn = gen_sub3_carry;
2bf6d935 3197 else
987a3082
UB
3198 insn = gen_add3_carry;
3199
3200 emit_insn (insn (mode, operands[0], operands[2], val, flags, compare_op));
2bf6d935
ML
3201
3202 return true;
3203}
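
/* Illustrative sketch, not part of GCC: the effect of the adc/sbb form.
   Once the comparison result lives in the carry flag, a conditional
   increment or decrement is a single adc/sbb against zero.  Helper names
   made up for the example.  */
static unsigned
cond_inc (unsigned x, unsigned a, unsigned b)
{
  return x + (a < b);   /* "cmp a, b; adc x, 0" */
}

static unsigned
cond_dec (unsigned x, unsigned a, unsigned b)
{
  return x - (a < b);   /* "cmp a, b; sbb x, 0" */
}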
3204
3205bool
3206ix86_expand_int_movcc (rtx operands[])
3207{
3208 enum rtx_code code = GET_CODE (operands[1]), compare_code;
3209 rtx_insn *compare_seq;
3210 rtx compare_op;
3211 machine_mode mode = GET_MODE (operands[0]);
3212 bool sign_bit_compare_p = false;
f1652e33 3213 bool negate_cc_compare_p = false;
2bf6d935
ML
3214 rtx op0 = XEXP (operands[1], 0);
3215 rtx op1 = XEXP (operands[1], 1);
1ceddd74
JJ
3216 rtx op2 = operands[2];
3217 rtx op3 = operands[3];
2bf6d935
ML
3218
3219 if (GET_MODE (op0) == TImode
3220 || (GET_MODE (op0) == DImode
3221 && !TARGET_64BIT))
3222 return false;
3223
5792208f
JJ
3224 if (GET_MODE (op0) == BFmode
3225 && !ix86_fp_comparison_operator (operands[1], VOIDmode))
3226 return false;
3227
2bf6d935
ML
3228 start_sequence ();
3229 compare_op = ix86_expand_compare (code, op0, op1);
3230 compare_seq = get_insns ();
3231 end_sequence ();
3232
3233 compare_code = GET_CODE (compare_op);
3234
3235 if ((op1 == const0_rtx && (code == GE || code == LT))
3236 || (op1 == constm1_rtx && (code == GT || code == LE)))
3237 sign_bit_compare_p = true;
3238
1ceddd74
JJ
3239 /* op0 == op1 ? op0 : op3 is equivalent to op0 == op1 ? op1 : op3,
3240 but if op1 is a constant, the latter form allows more optimizations,
3241 either through the last 2 ops being constant handling, or the one
3242 constant and one variable cases. On the other side, for cmov the
3243 former might be better as we don't need to load the constant into
3244 another register. */
3245 if (code == EQ && CONST_INT_P (op1) && rtx_equal_p (op0, op2))
3246 op2 = op1;
3247 /* Similarly for op0 != op1 ? op2 : op0 and op0 != op1 ? op2 : op1. */
3248 else if (code == NE && CONST_INT_P (op1) && rtx_equal_p (op0, op3))
3249 op3 = op1;
3250
2bf6d935
ML
3251 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
3252 HImode insns, we'd be swallowed in word prefix ops. */
3253
3254 if ((mode != HImode || TARGET_FAST_PREFIX)
3255 && (mode != (TARGET_64BIT ? TImode : DImode))
1ceddd74
JJ
3256 && CONST_INT_P (op2)
3257 && CONST_INT_P (op3))
2bf6d935
ML
3258 {
3259 rtx out = operands[0];
1ceddd74
JJ
3260 HOST_WIDE_INT ct = INTVAL (op2);
3261 HOST_WIDE_INT cf = INTVAL (op3);
2bf6d935
ML
3262 HOST_WIDE_INT diff;
3263
f1652e33
RS
3264 if ((mode == SImode
3265 || (TARGET_64BIT && mode == DImode))
3266 && (GET_MODE (op0) == SImode
3267 || (TARGET_64BIT && GET_MODE (op0) == DImode)))
3268 {
3269 /* Special case x != 0 ? -1 : y. */
3270 if (code == NE && op1 == const0_rtx && ct == -1)
3271 {
3272 negate_cc_compare_p = true;
3273 std::swap (ct, cf);
3274 code = EQ;
3275 }
3276 else if (code == EQ && op1 == const0_rtx && cf == -1)
3277 negate_cc_compare_p = true;
3278 }
3279
2bf6d935
ML
3280 diff = ct - cf;
3281 /* Sign bit compares are better done using shifts than we do by using
3282 sbb. */
3283 if (sign_bit_compare_p
f1652e33 3284 || negate_cc_compare_p
2bf6d935
ML
3285 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
3286 {
d0558f42
RS
3287 /* Place comparison result in its own pseudo. */
3288 rtx tmp = gen_reg_rtx (mode);
2bf6d935 3289
f1652e33
RS
3290 if (negate_cc_compare_p)
3291 {
3292 if (GET_MODE (op0) == DImode)
3293 emit_insn (gen_x86_negdi_ccc (gen_reg_rtx (DImode), op0));
3294 else
3295 emit_insn (gen_x86_negsi_ccc (gen_reg_rtx (SImode),
3296 gen_lowpart (SImode, op0)));
3297
f1652e33
RS
3298 if (mode == DImode)
3299 emit_insn (gen_x86_movdicc_0_m1_neg (tmp));
3300 else
3301 emit_insn (gen_x86_movsicc_0_m1_neg (gen_lowpart (SImode,
3302 tmp)));
3303 }
3304 else if (!sign_bit_compare_p)
2bf6d935
ML
3305 {
3306 rtx flags;
3307 bool fpcmp = false;
3308
3309 compare_code = GET_CODE (compare_op);
3310
3311 flags = XEXP (compare_op, 0);
3312
3313 if (GET_MODE (flags) == CCFPmode)
3314 {
3315 fpcmp = true;
3316 compare_code
3317 = ix86_fp_compare_code_to_integer (compare_code);
3318 }
3319
3320 /* To simplify the rest of the code, restrict to the GEU case. */
3321 if (compare_code == LTU)
3322 {
3323 std::swap (ct, cf);
3324 compare_code = reverse_condition (compare_code);
3325 code = reverse_condition (code);
3326 }
3327 else
3328 {
3329 if (fpcmp)
3330 PUT_CODE (compare_op,
3331 reverse_condition_maybe_unordered
3332 (GET_CODE (compare_op)));
3333 else
3334 PUT_CODE (compare_op,
3335 reverse_condition (GET_CODE (compare_op)));
3336 }
3337 diff = ct - cf;
3338
2bf6d935
ML
3339 if (mode == DImode)
3340 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
3341 else
3342 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
3343 flags, compare_op));
3344 }
3345 else
3346 {
3347 if (code == GT || code == GE)
3348 code = reverse_condition (code);
3349 else
3350 {
3351 std::swap (ct, cf);
3352 diff = ct - cf;
3353 }
3354 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
3355 }
3356
d0558f42
RS
3357 /* Add a REG_EQUAL note to allow condition to be shared. */
3358 rtx note = gen_rtx_fmt_ee (code, mode, op0, op1);
3359 set_unique_reg_note (get_last_insn (), REG_EQUAL,
3360 gen_rtx_NEG (mode, note));
3361
2bf6d935
ML
3362 if (diff == 1)
3363 {
3364 /*
3365 * cmpl op0,op1
3366 * sbbl dest,dest
3367 * [addl dest, ct]
3368 *
3369 * Size 5 - 8.
3370 */
3371 if (ct)
d0558f42
RS
3372 tmp = expand_simple_binop (mode, PLUS, tmp, GEN_INT (ct),
3373 NULL_RTX, 1, OPTAB_DIRECT);
2bf6d935
ML
3374 }
3375 else if (cf == -1)
3376 {
3377 /*
3378 * cmpl op0,op1
3379 * sbbl dest,dest
3380 * orl $ct, dest
3381 *
3382 * Size 8.
3383 */
d0558f42
RS
3384 tmp = expand_simple_binop (mode, IOR, tmp, GEN_INT (ct),
3385 NULL_RTX, 1, OPTAB_DIRECT);
2bf6d935
ML
3386 }
3387 else if (diff == -1 && ct)
3388 {
3389 /*
3390 * cmpl op0,op1
3391 * sbbl dest,dest
3392 * notl dest
3393 * [addl dest, cf]
3394 *
3395 * Size 8 - 11.
3396 */
d0558f42 3397 tmp = expand_simple_unop (mode, NOT, tmp, NULL_RTX, 1);
2bf6d935 3398 if (cf)
d0558f42
RS
3399 tmp = expand_simple_binop (mode, PLUS, tmp, GEN_INT (cf),
3400 NULL_RTX, 1, OPTAB_DIRECT);
2bf6d935
ML
3401 }
3402 else
3403 {
3404 /*
3405 * cmpl op0,op1
3406 * sbbl dest,dest
3407 * [notl dest]
3408 * andl cf - ct, dest
3409 * [addl dest, ct]
3410 *
3411 * Size 8 - 11.
3412 */
3413
3414 if (cf == 0)
3415 {
3416 cf = ct;
3417 ct = 0;
d0558f42 3418 tmp = expand_simple_unop (mode, NOT, tmp, NULL_RTX, 1);
2bf6d935
ML
3419 }
3420
d0558f42 3421 tmp = expand_simple_binop (mode, AND, tmp,
2bf6d935 3422 gen_int_mode (cf - ct, mode),
d0558f42 3423 NULL_RTX, 1, OPTAB_DIRECT);
2bf6d935 3424 if (ct)
d0558f42
RS
3425 tmp = expand_simple_binop (mode, PLUS, tmp, GEN_INT (ct),
3426 NULL_RTX, 1, OPTAB_DIRECT);
2bf6d935
ML
3427 }
3428
d0558f42 3429 emit_move_insn (out, tmp);
2bf6d935
ML
3430 return true;
3431 }
3432
3433 if (diff < 0)
3434 {
3435 machine_mode cmp_mode = GET_MODE (op0);
3436 enum rtx_code new_code;
3437
3438 if (SCALAR_FLOAT_MODE_P (cmp_mode))
3439 {
3440 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3441
8f17461b
UB
3442 /* We may be reversing a non-trapping
3443 comparison to a trapping comparison. */
3444 if (HONOR_NANS (cmp_mode) && flag_trapping_math
3445 && code != EQ && code != NE
3446 && code != ORDERED && code != UNORDERED)
3447 new_code = UNKNOWN;
3448 else
3449 new_code = reverse_condition_maybe_unordered (code);
2bf6d935
ML
3450 }
3451 else
3452 new_code = ix86_reverse_condition (code, cmp_mode);
3453 if (new_code != UNKNOWN)
3454 {
3455 std::swap (ct, cf);
3456 diff = -diff;
3457 code = new_code;
3458 }
3459 }
3460
3461 compare_code = UNKNOWN;
3462 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
3463 && CONST_INT_P (op1))
3464 {
3465 if (op1 == const0_rtx
3466 && (code == LT || code == GE))
3467 compare_code = code;
3468 else if (op1 == constm1_rtx)
3469 {
3470 if (code == LE)
3471 compare_code = LT;
3472 else if (code == GT)
3473 compare_code = GE;
3474 }
3475 }
3476
3477 /* Optimize dest = (op0 < 0) ? -1 : cf. */
3478 if (compare_code != UNKNOWN
3479 && GET_MODE (op0) == GET_MODE (out)
3480 && (cf == -1 || ct == -1))
3481 {
3482 /* If lea code below could be used, only optimize
3483 if it results in a 2 insn sequence. */
3484
3485 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
3486 || diff == 3 || diff == 5 || diff == 9)
3487 || (compare_code == LT && ct == -1)
3488 || (compare_code == GE && cf == -1))
3489 {
3490 /*
3491 * notl op1 (if necessary)
3492 * sarl $31, op1
3493 * orl cf, op1
3494 */
3495 if (ct != -1)
3496 {
3497 cf = ct;
3498 ct = -1;
3499 code = reverse_condition (code);
3500 }
3501
3502 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3503
3504 out = expand_simple_binop (mode, IOR,
3505 out, GEN_INT (cf),
3506 out, 1, OPTAB_DIRECT);
3507 if (out != operands[0])
3508 emit_move_insn (operands[0], out);
3509
3510 return true;
3511 }
3512 }
3513
3514
3515 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
3516 || diff == 3 || diff == 5 || diff == 9)
3517 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
3518 && (mode != DImode
3519 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
3520 {
3521 /*
3522 * xorl dest,dest
3523 * cmpl op1,op2
3524 * setcc dest
3525 * lea cf(dest*(ct-cf)),dest
3526 *
3527 * Size 14.
3528 *
3529 * This also catches the degenerate setcc-only case.
3530 */
3531
3532 rtx tmp;
3533 int nops;
3534
3535 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3536
3537 nops = 0;
3538 /* On x86_64 the lea instruction operates on Pmode, so we need
3539 to get the arithmetic done in the proper mode to match. */
3540 if (diff == 1)
3541 tmp = copy_rtx (out);
3542 else
3543 {
3544 rtx out1;
3545 out1 = copy_rtx (out);
3546 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
3547 nops++;
3548 if (diff & 1)
3549 {
3550 tmp = gen_rtx_PLUS (mode, tmp, out1);
3551 nops++;
3552 }
3553 }
3554 if (cf != 0)
3555 {
c3185b64 3556 tmp = plus_constant (mode, tmp, cf);
2bf6d935
ML
3557 nops++;
3558 }
3559 if (!rtx_equal_p (tmp, out))
3560 {
3561 if (nops == 1)
3562 out = force_operand (tmp, copy_rtx (out));
3563 else
3564 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
3565 }
3566 if (!rtx_equal_p (out, operands[0]))
3567 emit_move_insn (operands[0], copy_rtx (out));
3568
3569 return true;
3570 }
3571
3572 /*
3573 * General case: Jumpful:
3574 * xorl dest,dest cmpl op1, op2
3575 * cmpl op1, op2 movl ct, dest
3576 * setcc dest jcc 1f
3577 * decl dest movl cf, dest
3578 * andl (cf-ct),dest 1:
3579 * addl ct,dest
3580 *
3581 * Size 20. Size 14.
3582 *
3583 * This is reasonably steep, but branch mispredict costs are
3584 * high on modern cpus, so consider failing only if optimizing
3585 * for space.
3586 */
3587
3588 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3589 && BRANCH_COST (optimize_insn_for_speed_p (),
3590 false) >= 2)
3591 {
3592 if (cf == 0)
3593 {
3594 machine_mode cmp_mode = GET_MODE (op0);
3595 enum rtx_code new_code;
3596
3597 if (SCALAR_FLOAT_MODE_P (cmp_mode))
3598 {
3599 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3600
8f17461b
UB
3601 /* We may be reversing a non-trapping
3602 comparison to a trapping comparison. */
3603 if (HONOR_NANS (cmp_mode) && flag_trapping_math
3604 && code != EQ && code != NE
3605 && code != ORDERED && code != UNORDERED)
3606 new_code = UNKNOWN;
3607 else
3608 new_code = reverse_condition_maybe_unordered (code);
3609
2bf6d935
ML
3610 }
3611 else
3612 {
3613 new_code = ix86_reverse_condition (code, cmp_mode);
3614 if (compare_code != UNKNOWN && new_code != UNKNOWN)
3615 compare_code = reverse_condition (compare_code);
3616 }
3617
3618 if (new_code != UNKNOWN)
3619 {
3620 cf = ct;
3621 ct = 0;
3622 code = new_code;
3623 }
3624 }
3625
3626 if (compare_code != UNKNOWN)
3627 {
3628 /* notl op1 (if needed)
3629 sarl $31, op1
3630 andl (cf-ct), op1
3631 addl ct, op1
3632
3633 For x < 0 (resp. x <= -1) there will be no notl,
3634 so if possible swap the constants to get rid of the
3635 complement.
3636 True/false will be -1/0 while code below (store flag
3637 followed by decrement) is 0/-1, so the constants need
3638 to be exchanged once more. */
3639
3640 if (compare_code == GE || !cf)
3641 {
3642 code = reverse_condition (code);
3643 compare_code = LT;
3644 }
3645 else
3646 std::swap (ct, cf);
3647
3648 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3649 }
3650 else
3651 {
3652 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3653
3654 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
3655 constm1_rtx,
3656 copy_rtx (out), 1, OPTAB_DIRECT);
3657 }
3658
3659 out = expand_simple_binop (mode, AND, copy_rtx (out),
3660 gen_int_mode (cf - ct, mode),
3661 copy_rtx (out), 1, OPTAB_DIRECT);
3662 if (ct)
3663 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
3664 copy_rtx (out), 1, OPTAB_DIRECT);
3665 if (!rtx_equal_p (out, operands[0]))
3666 emit_move_insn (operands[0], copy_rtx (out));
3667
3668 return true;
3669 }
3670 }
3671
3672 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3673 {
3674 /* Try a few things more with specific constants and a variable. */
3675
3676 optab op;
3677 rtx var, orig_out, out, tmp;
3678
3679 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
3680 return false;
3681
1ceddd74
JJ
3682 operands[2] = op2;
3683 operands[3] = op3;
3684
2bf6d935
ML
3685 /* If one of the two operands is an interesting constant, load a
3686 constant with the above and mask it in with a logical operation. */
3687
3688 if (CONST_INT_P (operands[2]))
3689 {
3690 var = operands[3];
3691 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
3692 operands[3] = constm1_rtx, op = and_optab;
3693 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
3694 operands[3] = const0_rtx, op = ior_optab;
3695 else
3696 return false;
3697 }
3698 else if (CONST_INT_P (operands[3]))
3699 {
3700 var = operands[2];
3701 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
e4ced0b6
RS
3702 {
3703 /* For smin (x, 0), expand as "x < 0 ? x : 0" instead of
3704 "x <= 0 ? x : 0" to enable sign_bit_compare_p. */
3705 if (code == LE && op1 == const0_rtx && rtx_equal_p (op0, var))
3706 operands[1] = simplify_gen_relational (LT, VOIDmode,
3707 GET_MODE (op0),
3708 op0, const0_rtx);
3709
3710 operands[2] = constm1_rtx;
3711 op = and_optab;
3712 }
2bf6d935
ML
3713 else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
3714 operands[2] = const0_rtx, op = ior_optab;
3715 else
3716 return false;
3717 }
3718 else
3719 return false;
3720
3721 orig_out = operands[0];
3722 tmp = gen_reg_rtx (mode);
3723 operands[0] = tmp;
3724
3725 /* Recurse to get the constant loaded. */
3726 if (!ix86_expand_int_movcc (operands))
3727 return false;
3728
3729 /* Mask in the interesting variable. */
3730 out = expand_binop (mode, op, var, tmp, orig_out, 0,
3731 OPTAB_WIDEN);
3732 if (!rtx_equal_p (out, orig_out))
3733 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
3734
3735 return true;
3736 }
3737
3738 /*
3739 * For comparison with above,
3740 *
3741 * movl cf,dest
3742 * movl ct,tmp
3743 * cmpl op1,op2
3744 * cmovcc tmp,dest
3745 *
3746 * Size 15.
3747 */
3748
3749 if (! nonimmediate_operand (operands[2], mode))
3750 operands[2] = force_reg (mode, operands[2]);
3751 if (! nonimmediate_operand (operands[3], mode))
3752 operands[3] = force_reg (mode, operands[3]);
3753
3754 if (! register_operand (operands[2], VOIDmode)
3755 && (mode == QImode
3756 || ! register_operand (operands[3], VOIDmode)))
3757 operands[2] = force_reg (mode, operands[2]);
3758
3759 if (mode == QImode
3760 && ! register_operand (operands[3], VOIDmode))
3761 operands[3] = force_reg (mode, operands[3]);
3762
3763 emit_insn (compare_seq);
3764 emit_insn (gen_rtx_SET (operands[0],
3765 gen_rtx_IF_THEN_ELSE (mode,
3766 compare_op, operands[2],
3767 operands[3])));
3768 return true;
3769}
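
/* Illustrative sketch, not part of GCC: the branchless "setcc; decl; andl;
   addl" selection from the general-case comment above, in plain C.  The
   helper name is made up; unsigned arithmetic is used so the subtraction
   may wrap freely.  */
static int
cond_select_ct_cf (int cond /* 0 or 1 */, int ct, int cf)
{
  unsigned mask = (unsigned) cond - 1u;   /* 0 when COND, all-ones when !COND */
  unsigned r = (((unsigned) cf - (unsigned) ct) & mask) + (unsigned) ct;
  return (int) r;                         /* COND ? ct : cf */
}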
3770
3771/* Detect conditional moves that exactly match min/max operational
3772 semantics. Note that this is IEEE safe, as long as we don't
3773 interchange the operands.
3774
3775 Returns FALSE if this conditional move doesn't match a MIN/MAX,
3776 and TRUE if the operation is successful and instructions are emitted. */
3777
3778static bool
3779ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
3780 rtx cmp_op1, rtx if_true, rtx if_false)
3781{
3782 machine_mode mode;
3783 bool is_min;
3784 rtx tmp;
3785
3786 if (code == LT)
3787 ;
3788 else if (code == UNGE)
3789 std::swap (if_true, if_false);
3790 else
3791 return false;
3792
3793 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
3794 is_min = true;
3795 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
3796 is_min = false;
3797 else
3798 return false;
3799
3800 mode = GET_MODE (dest);
3801
3802 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
3803 but MODE may be a vector mode and thus not appropriate. */
3804 if (!flag_finite_math_only || flag_signed_zeros)
3805 {
3806 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
3807 rtvec v;
3808
3809 if_true = force_reg (mode, if_true);
3810 v = gen_rtvec (2, if_true, if_false);
3811 tmp = gen_rtx_UNSPEC (mode, v, u);
3812 }
3813 else
3814 {
3815 code = is_min ? SMIN : SMAX;
3816 if (MEM_P (if_true) && MEM_P (if_false))
3817 if_true = force_reg (mode, if_true);
3818 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
3819 }
3820
3821 emit_insn (gen_rtx_SET (dest, tmp));
3822 return true;
3823}
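
/* Illustrative sketch, not part of GCC: why operand order matters above.
   SSE min/max behave like "a < b ? a : b", which returns the second
   operand when the compare is unordered (a NaN is involved) or when
   comparing zeros of opposite sign, so the operands must not be swapped
   when NaNs or signed zeros are honored.  */
static double
sse_like_min (double a, double b)
{
  return a < b ? a : b;   /* NaN in either operand selects B */
}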
3824
8b905e9b
HL
3825/* Return true if MODE is valid for a vector comparison into a mask register;
3826 the same holds for a conditional vector move with a mask register. */
3827static bool
3828ix86_valid_mask_cmp_mode (machine_mode mode)
3829{
3830 /* XOP has its own vector conditional move instructions. */
a8654147 3831 if (TARGET_XOP && !TARGET_AVX512F)
8b905e9b
HL
3832 return false;
3833
0d788c35 3834 /* HFmode only supports vcmpsh whose dest is mask register. */
3835 if (TARGET_AVX512FP16 && mode == HFmode)
3836 return true;
3837
8b905e9b
HL
3838 /* AVX512F is needed for mask operation. */
3839 if (!(TARGET_AVX512F && VECTOR_MODE_P (mode)))
3840 return false;
3841
3842 /* AVX512BW is needed for vector QI/HImode,
3843 AVX512VL is needed for 128/256-bit vector. */
3844 machine_mode inner_mode = GET_MODE_INNER (mode);
3845 int vector_size = GET_MODE_SIZE (mode);
3846 if ((inner_mode == QImode || inner_mode == HImode) && !TARGET_AVX512BW)
3847 return false;
3848
3849 return vector_size == 64 || TARGET_AVX512VL;
3850}
3851
8d0737d8 3852/* Return true if integer mask comparison should be used. */
3853static bool
3854ix86_use_mask_cmp_p (machine_mode mode, machine_mode cmp_mode,
3855 rtx op_true, rtx op_false)
3856{
92f372f0
UB
3857 int vector_size = GET_MODE_SIZE (mode);
3858
0d788c35 3859 if (cmp_mode == HFmode)
3860 return true;
3861 else if (vector_size < 16)
92f372f0
UB
3862 return false;
3863 else if (vector_size == 64)
8d0737d8 3864 return true;
9ce50028
HW
3865 else if (GET_MODE_INNER (cmp_mode) == HFmode)
3866 return true;
8d0737d8 3867
3868 /* When op_true is NULL, op_false must be NULL, or vice versa. */
3869 gcc_assert (!op_true == !op_false);
3870
3871 /* When op_true/op_false is NULL or cmp_mode is not a valid mask cmp mode,
3872 a vector dest is required. */
3873 if (!op_true || !ix86_valid_mask_cmp_mode (cmp_mode))
3874 return false;
3875
3876 /* Exclude those that could be optimized in ix86_expand_sse_movcc. */
3877 if (op_false == CONST0_RTX (mode)
3878 || op_true == CONST0_RTX (mode)
3879 || (INTEGRAL_MODE_P (mode)
3880 && (op_true == CONSTM1_RTX (mode)
3881 || op_false == CONSTM1_RTX (mode))))
3882 return false;
3883
3884 return true;
3885}
3886
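/* Editorial sketch, not part of the GCC sources: how the two predicates
   above combine.  A 64-byte comparison always takes the mask path, while
   a 32-byte one such as V8SImode additionally needs AVX512F+AVX512VL and
   true/false arms that are not the constant 0/-1 special cases handled
   directly by ix86_expand_sse_movcc.  Guarded out; illustrative only.  */
#if 0
static bool
example_uses_kmask_p (rtx op_true, rtx op_false)
{
  /* 32-byte V8SImode comparison feeding a V8SImode conditional move.  */
  return ix86_use_mask_cmp_p (V8SImode, V8SImode, op_true, op_false);
}
#endif
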
2bf6d935
ML
3887/* Expand an SSE comparison. Return the register with the result. */
3888
3889static rtx
3890ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
3891 rtx op_true, rtx op_false)
3892{
3893 machine_mode mode = GET_MODE (dest);
3894 machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
3895
3896 /* In the general case the result of the comparison can differ from the operands' mode. */
3897 machine_mode cmp_mode;
3898
3899 /* In AVX512F the result of comparison is an integer mask. */
3900 bool maskcmp = false;
3901 rtx x;
3902
8d0737d8 3903 if (ix86_use_mask_cmp_p (mode, cmp_ops_mode, op_true, op_false))
2bf6d935
ML
3904 {
3905 unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
2bf6d935 3906 maskcmp = true;
8b905e9b 3907 cmp_mode = nbits > 8 ? int_mode_for_size (nbits, 0).require () : E_QImode;
2bf6d935
ML
3908 }
3909 else
3910 cmp_mode = cmp_ops_mode;
3911
3912 cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
3913
a86b3453 3914 bool (*op1_predicate)(rtx, machine_mode)
2bf6d935
ML
3915 = VECTOR_MODE_P (cmp_ops_mode) ? vector_operand : nonimmediate_operand;
3916
3917 if (!op1_predicate (cmp_op1, cmp_ops_mode))
3918 cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
3919
3920 if (optimize
3921 || (maskcmp && cmp_mode != mode)
3922 || (op_true && reg_overlap_mentioned_p (dest, op_true))
3923 || (op_false && reg_overlap_mentioned_p (dest, op_false)))
3924 dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
3925
99e4891e 3926 if (maskcmp)
3927 {
3928 bool ok = ix86_expand_mask_vec_cmp (dest, code, cmp_op0, cmp_op1);
3929 gcc_assert (ok);
3930 return dest;
3931 }
3932
2bf6d935
ML
3933 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
3934
8d0737d8 3935 if (cmp_mode != mode)
2bf6d935
ML
3936 {
3937 x = force_reg (cmp_ops_mode, x);
3938 convert_move (dest, x, false);
3939 }
3940 else
3941 emit_insn (gen_rtx_SET (dest, x));
3942
3943 return dest;
3944}
3945
b5193e35
UB
3946/* Emit the x86 binary operation CODE in mode MODE for SSE vector
3947 instructions that can also be performed using GP registers. */
3948
3949static void
3950ix86_emit_vec_binop (enum rtx_code code, machine_mode mode,
3951 rtx dst, rtx src1, rtx src2)
3952{
3953 rtx tmp;
3954
3955 tmp = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
3956
3957 if (GET_MODE_SIZE (mode) <= GET_MODE_SIZE (SImode)
3958 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
3959 {
3960 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
3961 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
3962 }
3963
3964 emit_insn (tmp);
3965}
3966
2bf6d935
ML
3967/* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
3968 operations. This is used for both scalar and vector conditional moves. */
3969
3970void
3971ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
3972{
3973 machine_mode mode = GET_MODE (dest);
3974 machine_mode cmpmode = GET_MODE (cmp);
f4a2cecd 3975 rtx x;
2bf6d935 3976
9b5d50b7 3977 /* Simplify trivial VEC_COND_EXPR to avoid ICE in pr97506. */
3978 if (rtx_equal_p (op_true, op_false))
3979 {
3980 emit_move_insn (dest, op_true);
3981 return;
3982 }
3983
2bf6d935
ML
3984 /* If we have an integer mask and FP value then we need
3985 to cast mask to FP mode. */
3986 if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
3987 {
3988 cmp = force_reg (cmpmode, cmp);
3989 cmp = gen_rtx_SUBREG (mode, cmp, 0);
3990 }
3991
8d0737d8 3992 /* In AVX512F the result of comparison is an integer mask. */
3993 if (mode != cmpmode
3994 && GET_MODE_CLASS (cmpmode) == MODE_INT)
2bf6d935 3995 {
8d0737d8 3996 gcc_assert (ix86_valid_mask_cmp_mode (mode));
0d788c35 3997 /* Using scalar/vector move with mask register. */
8b905e9b
HL
3998 cmp = force_reg (cmpmode, cmp);
3999 /* Optimize for mask zero. */
4000 op_true = (op_true != CONST0_RTX (mode)
4001 ? force_reg (mode, op_true) : op_true);
4002 op_false = (op_false != CONST0_RTX (mode)
4003 ? force_reg (mode, op_false) : op_false);
4004 if (op_true == CONST0_RTX (mode))
2bf6d935 4005 {
ee78c20e 4006 if (cmpmode == E_DImode && !TARGET_64BIT)
f4a2cecd
UB
4007 {
4008 x = gen_reg_rtx (cmpmode);
4009 emit_insn (gen_knotdi (x, cmp));
4010 }
ee78c20e 4011 else
f4a2cecd
UB
4012 x = expand_simple_unop (cmpmode, NOT, cmp, NULL, 1);
4013 cmp = x;
8b905e9b
HL
4014 /* Swap op_true and op_false. */
4015 std::swap (op_true, op_false);
2bf6d935 4016 }
8b905e9b 4017
0d788c35 4018 if (mode == HFmode)
4019 emit_insn (gen_movhf_mask (dest, op_true, op_false, cmp));
4020 else
f4a2cecd
UB
4021 emit_insn (gen_rtx_SET (dest,
4022 gen_rtx_VEC_MERGE (mode,
4023 op_true, op_false, cmp)));
8b905e9b 4024 return;
2bf6d935 4025 }
f4a2cecd
UB
4026
4027 if (vector_all_ones_operand (op_true, mode)
4028 && op_false == CONST0_RTX (mode))
2bf6d935 4029 {
f4a2cecd 4030 emit_move_insn (dest, cmp);
2bf6d935
ML
4031 return;
4032 }
4033 else if (op_false == CONST0_RTX (mode))
4034 {
f4a2cecd
UB
4035 x = expand_simple_binop (mode, AND, cmp, op_true,
4036 dest, 1, OPTAB_DIRECT);
4037 if (x != dest)
4038 emit_move_insn (dest, x);
2bf6d935
ML
4039 return;
4040 }
4041 else if (op_true == CONST0_RTX (mode))
4042 {
4043 op_false = force_reg (mode, op_false);
4044 x = gen_rtx_NOT (mode, cmp);
b5193e35 4045 ix86_emit_vec_binop (AND, mode, dest, x, op_false);
2bf6d935
ML
4046 return;
4047 }
f4a2cecd 4048 else if (vector_all_ones_operand (op_true, mode))
2bf6d935 4049 {
f4a2cecd
UB
4050 x = expand_simple_binop (mode, IOR, cmp, op_false,
4051 dest, 1, OPTAB_DIRECT);
4052 if (x != dest)
4053 emit_move_insn (dest, x);
2bf6d935
ML
4054 return;
4055 }
f4a2cecd
UB
4056
4057 if (TARGET_XOP)
2bf6d935
ML
4058 {
4059 op_true = force_reg (mode, op_true);
4060
f1693741
UB
4061 if (GET_MODE_SIZE (mode) < 16
4062 || !nonimmediate_operand (op_false, mode))
2bf6d935
ML
4063 op_false = force_reg (mode, op_false);
4064
f4a2cecd
UB
4065 emit_insn (gen_rtx_SET (dest,
4066 gen_rtx_IF_THEN_ELSE (mode, cmp,
4067 op_true, op_false)));
2bf6d935
ML
4068 return;
4069 }
4070
4071 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
f4a2cecd 4072 machine_mode blend_mode = mode;
2bf6d935 4073
f4a2cecd
UB
4074 if (GET_MODE_SIZE (mode) < 16
4075 || !vector_operand (op_true, mode))
2bf6d935
ML
4076 op_true = force_reg (mode, op_true);
4077
4078 op_false = force_reg (mode, op_false);
4079
4080 switch (mode)
4081 {
b1f7fd8a
UB
4082 case E_V2SFmode:
4083 if (TARGET_SSE4_1)
f4a2cecd 4084 gen = gen_mmx_blendvps;
b1f7fd8a 4085 break;
2bf6d935
ML
4086 case E_V4SFmode:
4087 if (TARGET_SSE4_1)
4088 gen = gen_sse4_1_blendvps;
4089 break;
4090 case E_V2DFmode:
4091 if (TARGET_SSE4_1)
4092 gen = gen_sse4_1_blendvpd;
4093 break;
4094 case E_SFmode:
4095 if (TARGET_SSE4_1)
f4a2cecd 4096 gen = gen_sse4_1_blendvss;
2bf6d935
ML
4097 break;
4098 case E_DFmode:
4099 if (TARGET_SSE4_1)
f4a2cecd 4100 gen = gen_sse4_1_blendvsd;
2bf6d935 4101 break;
5795ec0e
UB
4102 case E_V8QImode:
4103 case E_V4HImode:
4104 case E_V2SImode:
4105 if (TARGET_SSE4_1)
4106 {
820ac79e 4107 gen = gen_mmx_pblendvb_v8qi;
f4a2cecd 4108 blend_mode = V8QImode;
5795ec0e
UB
4109 }
4110 break;
2df9d3c5
UB
4111 case E_V4QImode:
4112 case E_V2HImode:
4113 if (TARGET_SSE4_1)
4114 {
820ac79e 4115 gen = gen_mmx_pblendvb_v4qi;
f4a2cecd 4116 blend_mode = V4QImode;
2df9d3c5
UB
4117 }
4118 break;
820ac79e
UB
4119 case E_V2QImode:
4120 if (TARGET_SSE4_1)
f4a2cecd 4121 gen = gen_mmx_pblendvb_v2qi;
820ac79e 4122 break;
2bf6d935
ML
4123 case E_V16QImode:
4124 case E_V8HImode:
9e2a82e1 4125 case E_V8HFmode:
6910cad5 4126 case E_V8BFmode:
2bf6d935
ML
4127 case E_V4SImode:
4128 case E_V2DImode:
793f847b 4129 case E_V1TImode:
2bf6d935
ML
4130 if (TARGET_SSE4_1)
4131 {
4132 gen = gen_sse4_1_pblendvb;
f4a2cecd 4133 blend_mode = V16QImode;
2bf6d935
ML
4134 }
4135 break;
4136 case E_V8SFmode:
4137 if (TARGET_AVX)
4138 gen = gen_avx_blendvps256;
4139 break;
4140 case E_V4DFmode:
4141 if (TARGET_AVX)
4142 gen = gen_avx_blendvpd256;
4143 break;
4144 case E_V32QImode:
4145 case E_V16HImode:
9e2a82e1 4146 case E_V16HFmode:
6910cad5 4147 case E_V16BFmode:
2bf6d935
ML
4148 case E_V8SImode:
4149 case E_V4DImode:
4150 if (TARGET_AVX2)
4151 {
4152 gen = gen_avx2_pblendvb;
f4a2cecd 4153 blend_mode = V32QImode;
2bf6d935
ML
4154 }
4155 break;
4156
4157 case E_V64QImode:
4158 gen = gen_avx512bw_blendmv64qi;
4159 break;
4160 case E_V32HImode:
4161 gen = gen_avx512bw_blendmv32hi;
4162 break;
9e2a82e1 4163 case E_V32HFmode:
4164 gen = gen_avx512bw_blendmv32hf;
4165 break;
6910cad5 4166 case E_V32BFmode:
4167 gen = gen_avx512bw_blendmv32bf;
4168 break;
2bf6d935
ML
4169 case E_V16SImode:
4170 gen = gen_avx512f_blendmv16si;
4171 break;
4172 case E_V8DImode:
4173 gen = gen_avx512f_blendmv8di;
4174 break;
4175 case E_V8DFmode:
4176 gen = gen_avx512f_blendmv8df;
4177 break;
4178 case E_V16SFmode:
4179 gen = gen_avx512f_blendmv16sf;
4180 break;
4181
4182 default:
4183 break;
4184 }
4185
4186 if (gen != NULL)
4187 {
f4a2cecd
UB
4188 if (blend_mode == mode)
4189 x = dest;
4190 else
4191 {
4192 x = gen_reg_rtx (blend_mode);
4193 op_false = gen_lowpart (blend_mode, op_false);
4194 op_true = gen_lowpart (blend_mode, op_true);
4195 cmp = gen_lowpart (blend_mode, cmp);
4196 }
4197
4198 emit_insn (gen (x, op_false, op_true, cmp));
4199
4200 if (x != dest)
4201 emit_move_insn (dest, gen_lowpart (mode, x));
2bf6d935
ML
4202 }
4203 else
4204 {
f4a2cecd 4205 rtx t2, t3;
2bf6d935 4206
f4a2cecd
UB
4207 t2 = expand_simple_binop (mode, AND, op_true, cmp,
4208 NULL, 1, OPTAB_DIRECT);
2bf6d935 4209
f4a2cecd 4210 t3 = gen_reg_rtx (mode);
2bf6d935 4211 x = gen_rtx_NOT (mode, cmp);
b5193e35 4212 ix86_emit_vec_binop (AND, mode, t3, x, op_false);
2bf6d935 4213
f4a2cecd
UB
4214 x = expand_simple_binop (mode, IOR, t3, t2,
4215 dest, 1, OPTAB_DIRECT);
4216 if (x != dest)
4217 emit_move_insn (dest, x);
2bf6d935
ML
4218 }
4219}
4220
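/* Editorial sketch, not part of the GCC sources: the no-blend fallback at
   the end of ix86_expand_sse_movcc computes
     dest = (op_true & cmp) | (op_false & ~cmp).
   The hand-expanded equivalent below uses the same helpers; MODE is an
   integral vector mode and CMP/T/F pseudos of that mode.  Guarded out;
   illustrative only.  */
#if 0
static void
example_bitwise_select (machine_mode mode, rtx dest, rtx cmp, rtx t, rtx f)
{
  rtx hi = expand_simple_binop (mode, AND, t, cmp, NULL, 1, OPTAB_DIRECT);
  rtx lo = gen_reg_rtx (mode);
  ix86_emit_vec_binop (AND, mode, lo, gen_rtx_NOT (mode, cmp), f);
  rtx x = expand_simple_binop (mode, IOR, hi, lo, dest, 1, OPTAB_DIRECT);
  if (x != dest)
    emit_move_insn (dest, x);
}
#endif
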
4221/* Swap, force into registers, or otherwise massage the two operands
4222 to an sse comparison with a mask result. Thus we differ a bit from
4223 ix86_prepare_fp_compare_args which expects to produce a flags result.
4224
4225 The DEST operand exists to help determine whether to commute commutative
4226 operators. The POP0/POP1 operands are updated in place. The new
4227 comparison code is returned, or UNKNOWN if not implementable. */
4228
4229static enum rtx_code
4230ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
4231 rtx *pop0, rtx *pop1)
4232{
4233 switch (code)
4234 {
4235 case LTGT:
4236 case UNEQ:
4237 /* AVX supports all the needed comparisons. */
4238 if (TARGET_AVX)
4239 break;
4240 /* We have no LTGT as an operator. We could implement it with
4241 NE & ORDERED, but this requires an extra temporary. It's
4242 not clear that it's worth it. */
4243 return UNKNOWN;
4244
4245 case LT:
4246 case LE:
4247 case UNGT:
4248 case UNGE:
4249 /* These are supported directly. */
4250 break;
4251
4252 case EQ:
4253 case NE:
4254 case UNORDERED:
4255 case ORDERED:
4256 /* AVX has 3 operand comparisons, no need to swap anything. */
4257 if (TARGET_AVX)
4258 break;
4259 /* For commutative operators, try to canonicalize the destination
4260 operand to be first in the comparison - this helps reload to
4261 avoid extra moves. */
4262 if (!dest || !rtx_equal_p (dest, *pop1))
4263 break;
4264 /* FALLTHRU */
4265
4266 case GE:
4267 case GT:
4268 case UNLE:
4269 case UNLT:
4270 /* These are not supported directly before AVX, and furthermore
4271 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
4272 comparison operands to transform into something that is
4273 supported. */
4274 std::swap (*pop0, *pop1);
4275 code = swap_condition (code);
4276 break;
4277
4278 default:
4279 gcc_unreachable ();
4280 }
4281
4282 return code;
4283}
4284
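/* Editorial sketch, not part of the GCC sources: a GT comparison is
   rewritten by the routine above as LT with the operands swapped, since
   cmpltps exists at every SSE level while a direct cmpgtps does not.
   DEST/OP0/OP1 are placeholder pseudos.  Guarded out; illustrative only.  */
#if 0
static void
example_canonicalize_gt (rtx dest, rtx op0, rtx op1)
{
  enum rtx_code code
    = ix86_prepare_sse_fp_compare_args (dest, GT, &op0, &op1);
  /* CODE is now LT and op0/op1 have been swapped.  */
  gcc_assert (code == LT);
}
#endif
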
4285/* Expand a floating-point conditional move. Return true if successful. */
4286
4287bool
4288ix86_expand_fp_movcc (rtx operands[])
4289{
4290 machine_mode mode = GET_MODE (operands[0]);
4291 enum rtx_code code = GET_CODE (operands[1]);
4292 rtx tmp, compare_op;
4293 rtx op0 = XEXP (operands[1], 0);
4294 rtx op1 = XEXP (operands[1], 1);
4295
5792208f
JJ
4296 if (GET_MODE (op0) == BFmode
4297 && !ix86_fp_comparison_operator (operands[1], VOIDmode))
4298 return false;
4299
a6841211 4300 if (SSE_FLOAT_MODE_SSEMATH_OR_HF_P (mode))
2bf6d935
ML
4301 {
4302 machine_mode cmode;
4303
4304 /* Since we have no cmove for SSE registers, don't force bad register
4305 allocation just to gain access to it. Deny movcc when the
4306 comparison mode doesn't match the move mode. */
4307 cmode = GET_MODE (op0);
4308 if (cmode == VOIDmode)
4309 cmode = GET_MODE (op1);
4310 if (cmode != mode)
4311 return false;
4312
4313 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
4314 if (code == UNKNOWN)
4315 return false;
4316
4317 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
4318 operands[2], operands[3]))
4319 return true;
4320
4321 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
4322 operands[2], operands[3]);
4323 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
4324 return true;
4325 }
4326
4327 if (GET_MODE (op0) == TImode
4328 || (GET_MODE (op0) == DImode
4329 && !TARGET_64BIT))
4330 return false;
4331
4332 /* The floating point conditional move instructions don't directly
4333 support conditions resulting from a signed integer comparison. */
4334
4335 compare_op = ix86_expand_compare (code, op0, op1);
4336 if (!fcmov_comparison_operator (compare_op, VOIDmode))
4337 {
4338 tmp = gen_reg_rtx (QImode);
4339 ix86_expand_setcc (tmp, code, op0, op1);
4340
4341 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
4342 }
4343
4344 emit_insn (gen_rtx_SET (operands[0],
4345 gen_rtx_IF_THEN_ELSE (mode, compare_op,
4346 operands[2], operands[3])));
4347
4348 return true;
4349}
4350
4351/* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
4352
4353static int
4354ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
4355{
4356 switch (code)
4357 {
4358 case EQ:
4359 return 0;
4360 case LT:
4361 case LTU:
4362 return 1;
4363 case LE:
4364 case LEU:
4365 return 2;
4366 case NE:
4367 return 4;
4368 case GE:
4369 case GEU:
4370 return 5;
4371 case GT:
4372 case GTU:
4373 return 6;
4374 default:
4375 gcc_unreachable ();
4376 }
4377}
4378
4379/* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
4380
4381static int
4382ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
4383{
4384 switch (code)
4385 {
4386 case EQ:
4387 return 0x00;
4388 case NE:
4389 return 0x04;
4390 case GT:
4391 return 0x0e;
4392 case LE:
4393 return 0x02;
4394 case GE:
4395 return 0x0d;
4396 case LT:
4397 return 0x01;
4398 case UNLE:
4399 return 0x0a;
4400 case UNLT:
4401 return 0x09;
4402 case UNGE:
4403 return 0x05;
4404 case UNGT:
4405 return 0x06;
4406 case UNEQ:
4407 return 0x18;
4408 case LTGT:
4409 return 0x0c;
4410 case ORDERED:
4411 return 0x07;
4412 case UNORDERED:
4413 return 0x03;
4414 default:
4415 gcc_unreachable ();
4416 }
4417}
4418
4419/* Return immediate value to be used in UNSPEC_PCMP
4420 for comparison CODE in MODE. */
4421
4422static int
4423ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
4424{
4425 if (FLOAT_MODE_P (mode))
4426 return ix86_fp_cmp_code_to_pcmp_immediate (code);
4427 return ix86_int_cmp_code_to_pcmp_immediate (code);
4428}
4429
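/* Editorial sketch, not part of the GCC sources: the values above are the
   vcmpps/vpcmp predicate immediates used in UNSPEC_PCMP.  For example, an
   ordered less-than on a float vector encodes as 0x01 and an unsigned
   greater-than on an integer vector as 6.  Guarded out; illustrative
   only.  */
#if 0
static void
example_pcmp_immediates (void)
{
  gcc_assert (ix86_cmp_code_to_pcmp_immediate (LT, V16SFmode) == 0x01);
  gcc_assert (ix86_cmp_code_to_pcmp_immediate (GTU, V16SImode) == 6);
}
#endif
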
4430/* Expand AVX-512 vector comparison. */
4431
4432bool
99e4891e 4433ix86_expand_mask_vec_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1)
2bf6d935 4434{
99e4891e 4435 machine_mode mask_mode = GET_MODE (dest);
4436 machine_mode cmp_mode = GET_MODE (cmp_op0);
2bf6d935
ML
4437 rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
4438 int unspec_code;
4439 rtx unspec;
4440
4441 switch (code)
4442 {
4443 case LEU:
4444 case GTU:
4445 case GEU:
4446 case LTU:
4447 unspec_code = UNSPEC_UNSIGNED_PCMP;
4448 break;
4449
4450 default:
4451 unspec_code = UNSPEC_PCMP;
4452 }
4453
99e4891e 4454 unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, cmp_op0, cmp_op1, imm),
2bf6d935 4455 unspec_code);
99e4891e 4456 emit_insn (gen_rtx_SET (dest, unspec));
2bf6d935
ML
4457
4458 return true;
4459}
4460
4461/* Expand fp vector comparison. */
4462
4463bool
4464ix86_expand_fp_vec_cmp (rtx operands[])
4465{
4466 enum rtx_code code = GET_CODE (operands[1]);
4467 rtx cmp;
4468
4469 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
4470 &operands[2], &operands[3]);
4471 if (code == UNKNOWN)
4472 {
4473 rtx temp;
4474 switch (GET_CODE (operands[1]))
4475 {
4476 case LTGT:
4477 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
4478 operands[3], NULL, NULL);
4479 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
4480 operands[3], NULL, NULL);
4481 code = AND;
4482 break;
4483 case UNEQ:
4484 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
4485 operands[3], NULL, NULL);
4486 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
4487 operands[3], NULL, NULL);
4488 code = IOR;
4489 break;
4490 default:
4491 gcc_unreachable ();
4492 }
4493 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
4494 OPTAB_DIRECT);
4495 }
4496 else
4497 cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
8d0737d8 4498 NULL, NULL);
2bf6d935
ML
4499
4500 if (operands[0] != cmp)
4501 emit_move_insn (operands[0], cmp);
4502
4503 return true;
4504}
4505
4506static rtx
4507ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
4508 rtx op_true, rtx op_false, bool *negate)
4509{
4510 machine_mode data_mode = GET_MODE (dest);
4511 machine_mode mode = GET_MODE (cop0);
4512 rtx x;
4513
4514 *negate = false;
4515
4516 /* XOP supports all of the comparisons on all 128-bit vector int types. */
4517 if (TARGET_XOP
6c67afaf
UB
4518 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT
4519 && GET_MODE_SIZE (mode) <= 16)
2bf6d935 4520 ;
8b905e9b
HL
4521 /* AVX512F supports all of the comparisons
4522 on all 128/256/512-bit vector int types. */
8d0737d8 4523 else if (ix86_use_mask_cmp_p (data_mode, mode, op_true, op_false))
8b905e9b 4524 ;
2bf6d935
ML
4525 else
4526 {
4527 /* Canonicalize the comparison to EQ, GT, GTU. */
4528 switch (code)
4529 {
4530 case EQ:
4531 case GT:
4532 case GTU:
4533 break;
4534
2bf6d935
ML
4535 case LE:
4536 case LEU:
fa271afb
JJ
4537 /* x <= cst can be handled as x < cst + 1 unless there is
4538 wrap around in cst + 1. */
4539 if (GET_CODE (cop1) == CONST_VECTOR
4540 && GET_MODE_INNER (mode) != TImode)
4541 {
4542 unsigned int n_elts = GET_MODE_NUNITS (mode), i;
4543 machine_mode eltmode = GET_MODE_INNER (mode);
4544 for (i = 0; i < n_elts; ++i)
4545 {
4546 rtx elt = CONST_VECTOR_ELT (cop1, i);
4547 if (!CONST_INT_P (elt))
4548 break;
4549 if (code == LE)
4550 {
4551 /* For LE punt if some element is signed maximum. */
4552 if ((INTVAL (elt) & (GET_MODE_MASK (eltmode) >> 1))
4553 == (GET_MODE_MASK (eltmode) >> 1))
4554 break;
4555 }
4556 /* For LEU punt if some element is unsigned maximum. */
4557 else if (elt == constm1_rtx)
4558 break;
4559 }
4560 if (i == n_elts)
4561 {
4562 rtvec v = rtvec_alloc (n_elts);
4563 for (i = 0; i < n_elts; ++i)
4564 RTVEC_ELT (v, i)
49de156a
JJ
4565 = gen_int_mode (INTVAL (CONST_VECTOR_ELT (cop1, i)) + 1,
4566 eltmode);
fa271afb
JJ
4567 cop1 = gen_rtx_CONST_VECTOR (mode, v);
4568 std::swap (cop0, cop1);
4569 code = code == LE ? GT : GTU;
4570 break;
4571 }
4572 }
4573 /* FALLTHRU */
4574 case NE:
2bf6d935
ML
4575 code = reverse_condition (code);
4576 *negate = true;
4577 break;
4578
4579 case GE:
4580 case GEU:
fa271afb
JJ
4581 /* x >= cst can be handled as x > cst - 1 unless there is
4582 wrap around in cst - 1. */
4583 if (GET_CODE (cop1) == CONST_VECTOR
4584 && GET_MODE_INNER (mode) != TImode)
4585 {
4586 unsigned int n_elts = GET_MODE_NUNITS (mode), i;
4587 machine_mode eltmode = GET_MODE_INNER (mode);
4588 for (i = 0; i < n_elts; ++i)
4589 {
4590 rtx elt = CONST_VECTOR_ELT (cop1, i);
4591 if (!CONST_INT_P (elt))
4592 break;
4593 if (code == GE)
4594 {
4595 /* For GE punt if some element is signed minimum. */
4596 if (INTVAL (elt) < 0
4597 && ((INTVAL (elt) & (GET_MODE_MASK (eltmode) >> 1))
4598 == 0))
4599 break;
4600 }
4601 /* For GEU punt if some element is zero. */
4602 else if (elt == const0_rtx)
4603 break;
4604 }
4605 if (i == n_elts)
4606 {
4607 rtvec v = rtvec_alloc (n_elts);
4608 for (i = 0; i < n_elts; ++i)
4609 RTVEC_ELT (v, i)
49de156a
JJ
4610 = gen_int_mode (INTVAL (CONST_VECTOR_ELT (cop1, i)) - 1,
4611 eltmode);
fa271afb
JJ
4612 cop1 = gen_rtx_CONST_VECTOR (mode, v);
4613 code = code == GE ? GT : GTU;
4614 break;
4615 }
4616 }
2bf6d935
ML
4617 code = reverse_condition (code);
4618 *negate = true;
4619 /* FALLTHRU */
4620
4621 case LT:
4622 case LTU:
4623 std::swap (cop0, cop1);
4624 code = swap_condition (code);
4625 break;
4626
4627 default:
4628 gcc_unreachable ();
4629 }
4630
4631 /* Only SSE4.1/SSE4.2 supports V2DImode. */
4632 if (mode == V2DImode)
4633 {
4634 switch (code)
4635 {
4636 case EQ:
4637 /* SSE4.1 supports EQ. */
4638 if (!TARGET_SSE4_1)
4639 return NULL;
4640 break;
4641
4642 case GT:
4643 case GTU:
4644 /* SSE4.2 supports GT/GTU. */
4645 if (!TARGET_SSE4_2)
4646 return NULL;
4647 break;
4648
4649 default:
4650 gcc_unreachable ();
4651 }
4652 }
4653
fa271afb
JJ
4654 if (GET_CODE (cop0) == CONST_VECTOR)
4655 cop0 = force_reg (mode, cop0);
4656 else if (GET_CODE (cop1) == CONST_VECTOR)
4657 cop1 = force_reg (mode, cop1);
4658
2bf6d935
ML
4659 rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode);
4660 rtx opfalse = op_false ? op_false : CONST0_RTX (data_mode);
4661 if (*negate)
4662 std::swap (optrue, opfalse);
4663
4664 /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when
4665 not using integer masks into min (x, y) == x ? -1 : 0 (i.e.
4666 min (x, y) == x). While we add one instruction (the minimum),
4667 we remove the two instructions otherwise needed for the negation,
4668 since the result already has the desired form.
4669 When using masks, do it for SI/DImode element types, as it is shorter
4670 than the two subtractions. */
4671 if ((code != EQ
4672 && GET_MODE_SIZE (mode) != 64
4673 && vector_all_ones_operand (opfalse, data_mode)
4674 && optrue == CONST0_RTX (data_mode))
4675 || (code == GTU
4676 && GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4
4677 /* Don't do it if not using integer masks and we'd end up with
4678 the right values in the registers though. */
4679 && (GET_MODE_SIZE (mode) == 64
4680 || !vector_all_ones_operand (optrue, data_mode)
4681 || opfalse != CONST0_RTX (data_mode))))
4682 {
4683 rtx (*gen) (rtx, rtx, rtx) = NULL;
4684
4685 switch (mode)
4686 {
4687 case E_V16SImode:
4688 gen = (code == GTU) ? gen_uminv16si3 : gen_sminv16si3;
4689 break;
4690 case E_V8DImode:
4691 gen = (code == GTU) ? gen_uminv8di3 : gen_sminv8di3;
4692 cop0 = force_reg (mode, cop0);
4693 cop1 = force_reg (mode, cop1);
4694 break;
4695 case E_V32QImode:
4696 if (TARGET_AVX2)
4697 gen = (code == GTU) ? gen_uminv32qi3 : gen_sminv32qi3;
4698 break;
4699 case E_V16HImode:
4700 if (TARGET_AVX2)
4701 gen = (code == GTU) ? gen_uminv16hi3 : gen_sminv16hi3;
4702 break;
4703 case E_V8SImode:
4704 if (TARGET_AVX2)
4705 gen = (code == GTU) ? gen_uminv8si3 : gen_sminv8si3;
4706 break;
4707 case E_V4DImode:
4708 if (TARGET_AVX512VL)
4709 {
4710 gen = (code == GTU) ? gen_uminv4di3 : gen_sminv4di3;
4711 cop0 = force_reg (mode, cop0);
4712 cop1 = force_reg (mode, cop1);
4713 }
4714 break;
4715 case E_V16QImode:
4716 if (code == GTU && TARGET_SSE2)
4717 gen = gen_uminv16qi3;
4718 else if (code == GT && TARGET_SSE4_1)
4719 gen = gen_sminv16qi3;
4720 break;
f3661f2d
UB
4721 case E_V8QImode:
4722 if (code == GTU && TARGET_SSE2)
4723 gen = gen_uminv8qi3;
4724 else if (code == GT && TARGET_SSE4_1)
4725 gen = gen_sminv8qi3;
4726 break;
2df9d3c5
UB
4727 case E_V4QImode:
4728 if (code == GTU && TARGET_SSE2)
4729 gen = gen_uminv4qi3;
4730 else if (code == GT && TARGET_SSE4_1)
4731 gen = gen_sminv4qi3;
4732 break;
04a74555
UB
4733 case E_V2QImode:
4734 if (code == GTU && TARGET_SSE2)
4735 gen = gen_uminv2qi3;
4736 else if (code == GT && TARGET_SSE4_1)
4737 gen = gen_sminv2qi3;
4738 break;
2bf6d935
ML
4739 case E_V8HImode:
4740 if (code == GTU && TARGET_SSE4_1)
4741 gen = gen_uminv8hi3;
4742 else if (code == GT && TARGET_SSE2)
4743 gen = gen_sminv8hi3;
4744 break;
f3661f2d
UB
4745 case E_V4HImode:
4746 if (code == GTU && TARGET_SSE4_1)
4747 gen = gen_uminv4hi3;
4748 else if (code == GT && TARGET_SSE2)
4749 gen = gen_sminv4hi3;
4750 break;
2df9d3c5
UB
4751 case E_V2HImode:
4752 if (code == GTU && TARGET_SSE4_1)
4753 gen = gen_uminv2hi3;
4754 else if (code == GT && TARGET_SSE2)
4755 gen = gen_sminv2hi3;
4756 break;
2bf6d935
ML
4757 case E_V4SImode:
4758 if (TARGET_SSE4_1)
4759 gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3;
4760 break;
f3661f2d
UB
4761 case E_V2SImode:
4762 if (TARGET_SSE4_1)
4763 gen = (code == GTU) ? gen_uminv2si3 : gen_sminv2si3;
4764 break;
2bf6d935
ML
4765 case E_V2DImode:
4766 if (TARGET_AVX512VL)
4767 {
4768 gen = (code == GTU) ? gen_uminv2di3 : gen_sminv2di3;
4769 cop0 = force_reg (mode, cop0);
4770 cop1 = force_reg (mode, cop1);
4771 }
4772 break;
4773 default:
4774 break;
4775 }
4776
4777 if (gen)
4778 {
4779 rtx tem = gen_reg_rtx (mode);
4780 if (!vector_operand (cop0, mode))
4781 cop0 = force_reg (mode, cop0);
4782 if (!vector_operand (cop1, mode))
4783 cop1 = force_reg (mode, cop1);
4784 *negate = !*negate;
4785 emit_insn (gen (tem, cop0, cop1));
4786 cop1 = tem;
4787 code = EQ;
4788 }
4789 }
4790
4791 /* Unsigned parallel compare is not supported by the hardware.
4792 Play some tricks to turn this into a signed comparison
4793 against 0. */
4794 if (code == GTU)
4795 {
4796 cop0 = force_reg (mode, cop0);
4797
4798 switch (mode)
4799 {
4800 case E_V16SImode:
4801 case E_V8DImode:
4802 case E_V8SImode:
4803 case E_V4DImode:
4804 case E_V4SImode:
f3661f2d 4805 case E_V2SImode:
2bf6d935
ML
4806 case E_V2DImode:
4807 {
4808 rtx t1, t2, mask;
83bc5e44 4809
2bf6d935
ML
4810 /* Subtract (-(INT MAX) - 1) from both operands to make
4811 them signed. */
4812 mask = ix86_build_signbit_mask (mode, true, false);
4813 t1 = gen_reg_rtx (mode);
83bc5e44 4814 emit_insn (gen_sub3_insn (t1, cop0, mask));
2bf6d935
ML
4815
4816 t2 = gen_reg_rtx (mode);
83bc5e44 4817 emit_insn (gen_sub3_insn (t2, cop1, mask));
2bf6d935
ML
4818
4819 cop0 = t1;
4820 cop1 = t2;
4821 code = GT;
4822 }
4823 break;
4824
4825 case E_V64QImode:
4826 case E_V32HImode:
4827 case E_V32QImode:
4828 case E_V16HImode:
4829 case E_V16QImode:
f3661f2d 4830 case E_V8QImode:
2df9d3c5 4831 case E_V4QImode:
04a74555 4832 case E_V2QImode:
2bf6d935 4833 case E_V8HImode:
f3661f2d 4834 case E_V4HImode:
2df9d3c5 4835 case E_V2HImode:
2bf6d935
ML
4836 /* Perform a parallel unsigned saturating subtraction. */
4837 x = gen_reg_rtx (mode);
83bc5e44
UB
4838 emit_insn (gen_rtx_SET
4839 (x, gen_rtx_US_MINUS (mode, cop0, cop1)));
2bf6d935
ML
4840 cop0 = x;
4841 cop1 = CONST0_RTX (mode);
4842 code = EQ;
4843 *negate = !*negate;
4844 break;
4845
4846 default:
4847 gcc_unreachable ();
4848 }
4849 }
4850 }
4851
4852 if (*negate)
4853 std::swap (op_true, op_false);
4854
fa271afb
JJ
4855 if (GET_CODE (cop1) == CONST_VECTOR)
4856 cop1 = force_reg (mode, cop1);
4857
2bf6d935
ML
4858 /* Allow the comparison to be done in one mode, but the movcc to
4859 happen in another mode. */
4860 if (data_mode == mode)
fa271afb 4861 x = ix86_expand_sse_cmp (dest, code, cop0, cop1, op_true, op_false);
2bf6d935
ML
4862 else
4863 {
4864 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
4865 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
4866 op_true, op_false);
4867 if (GET_MODE (x) == mode)
4868 x = gen_lowpart (data_mode, x);
4869 }
4870
4871 return x;
4872}
4873
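/* Editorial sketch, not part of the GCC sources: a V4SImode unsigned
   greater-than on pre-AVX512 hardware.  The routine above biases both
   operands by the sign-bit mask and emits a signed compare (pcmpgtd);
   NEGATE tells the caller whether the result must still be inverted, as
   ix86_expand_int_vec_cmp does.  Guarded out; illustrative only.  */
#if 0
static rtx
example_v4si_gtu (rtx dest, rtx x, rtx y)
{
  bool negate = false;
  rtx res = ix86_expand_int_sse_cmp (dest, GTU, x, y,
				     NULL_RTX, NULL_RTX, &negate);
  gcc_assert (res != NULL_RTX && !negate);
  return res;
}
#endif
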
4874/* Expand integer vector comparison. */
4875
4876bool
4877ix86_expand_int_vec_cmp (rtx operands[])
4878{
4879 rtx_code code = GET_CODE (operands[1]);
4880 bool negate = false;
4881 rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
4882 operands[3], NULL, NULL, &negate);
4883
4884 if (!cmp)
4885 return false;
4886
4887 if (negate)
4888 cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
4889 CONST0_RTX (GET_MODE (cmp)),
4890 NULL, NULL, &negate);
4891
4892 gcc_assert (!negate);
4893
4894 if (operands[0] != cmp)
4895 emit_move_insn (operands[0], cmp);
4896
4897 return true;
4898}
4899
4900/* Expand a floating-point vector conditional move; a vcond operation
4901 rather than a movcc operation. */
4902
4903bool
4904ix86_expand_fp_vcond (rtx operands[])
4905{
4906 enum rtx_code code = GET_CODE (operands[3]);
4907 rtx cmp;
4908
4909 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
4910 &operands[4], &operands[5]);
4911 if (code == UNKNOWN)
4912 {
4913 rtx temp;
4914 switch (GET_CODE (operands[3]))
4915 {
4916 case LTGT:
4917 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
4918 operands[5], operands[0], operands[0]);
4919 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
4920 operands[5], operands[1], operands[2]);
4921 code = AND;
4922 break;
4923 case UNEQ:
4924 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
4925 operands[5], operands[0], operands[0]);
4926 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
4927 operands[5], operands[1], operands[2]);
4928 code = IOR;
4929 break;
4930 default:
4931 gcc_unreachable ();
4932 }
4933 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
4934 OPTAB_DIRECT);
4935 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
4936 return true;
4937 }
4938
4939 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
4940 operands[5], operands[1], operands[2]))
4941 return true;
4942
4943 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
4944 operands[1], operands[2]);
4945 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
4946 return true;
4947}
4948
4949/* Expand a signed/unsigned integral vector conditional move. */
4950
4951bool
4952ix86_expand_int_vcond (rtx operands[])
4953{
4954 machine_mode data_mode = GET_MODE (operands[0]);
4955 machine_mode mode = GET_MODE (operands[4]);
4956 enum rtx_code code = GET_CODE (operands[3]);
4957 bool negate = false;
4958 rtx x, cop0, cop1;
4959
4960 cop0 = operands[4];
4961 cop1 = operands[5];
4962
4963 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
4964 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
4965 if ((code == LT || code == GE)
4966 && data_mode == mode
4967 && cop1 == CONST0_RTX (mode)
4968 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
4969 && GET_MODE_UNIT_SIZE (data_mode) > 1
4970 && GET_MODE_UNIT_SIZE (data_mode) <= 8
4971 && (GET_MODE_SIZE (data_mode) == 16
4972 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
4973 {
4974 rtx negop = operands[2 - (code == LT)];
4975 int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
4976 if (negop == CONST1_RTX (data_mode))
4977 {
4978 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
4979 operands[0], 1, OPTAB_DIRECT);
4980 if (res != operands[0])
4981 emit_move_insn (operands[0], res);
4982 return true;
4983 }
4984 else if (GET_MODE_INNER (data_mode) != DImode
4985 && vector_all_ones_operand (negop, data_mode))
4986 {
4987 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
4988 operands[0], 0, OPTAB_DIRECT);
4989 if (res != operands[0])
4990 emit_move_insn (operands[0], res);
4991 return true;
4992 }
4993 }
4994
4995 if (!nonimmediate_operand (cop1, mode))
4996 cop1 = force_reg (mode, cop1);
4997 if (!general_operand (operands[1], data_mode))
4998 operands[1] = force_reg (data_mode, operands[1]);
4999 if (!general_operand (operands[2], data_mode))
5000 operands[2] = force_reg (data_mode, operands[2]);
5001
5002 x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
5003 operands[1], operands[2], &negate);
5004
5005 if (!x)
5006 return false;
5007
5008 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
5009 operands[2-negate]);
5010 return true;
5011}
5012
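/* Editorial sketch, not part of the GCC sources: the shift special case at
   the top of ix86_expand_int_vcond.  For V4SImode, "x < 0 ? -1 : 0" needs
   no comparison at all; it is an arithmetic shift right by 31, while
   "x < 0 ? 1 : 0" is a logical shift right by 31.  Guarded out;
   illustrative only.  */
#if 0
static void
example_v4si_sign_mask (rtx dest, rtx x)
{
  /* Equivalent of the negop == CONSTM1 path: dest = x >>a 31.  */
  rtx res = expand_simple_binop (V4SImode, ASHIFTRT, x, GEN_INT (31),
				 dest, 0, OPTAB_DIRECT);
  if (res != dest)
    emit_move_insn (dest, res);
}
#endif
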
5013static bool
5014ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
5015 struct expand_vec_perm_d *d)
5016{
5017 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
5018 expander, so args are either in d, or in op0, op1 etc. */
5019 machine_mode mode = GET_MODE (d ? d->op0 : op0);
5020 machine_mode maskmode = mode;
5021 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
5022
5023 switch (mode)
5024 {
faf2b6bc 5025 case E_V16QImode:
5026 if (TARGET_AVX512VL && TARGET_AVX512VBMI)
5027 gen = gen_avx512vl_vpermt2varv16qi3;
5028 break;
5029 case E_V32QImode:
5030 if (TARGET_AVX512VL && TARGET_AVX512VBMI)
5031 gen = gen_avx512vl_vpermt2varv32qi3;
5032 break;
5033 case E_V64QImode:
5034 if (TARGET_AVX512VBMI)
5035 gen = gen_avx512bw_vpermt2varv64qi3;
5036 break;
2bf6d935
ML
5037 case E_V8HImode:
5038 if (TARGET_AVX512VL && TARGET_AVX512BW)
5039 gen = gen_avx512vl_vpermt2varv8hi3;
5040 break;
5041 case E_V16HImode:
5042 if (TARGET_AVX512VL && TARGET_AVX512BW)
5043 gen = gen_avx512vl_vpermt2varv16hi3;
5044 break;
2bf6d935
ML
5045 case E_V32HImode:
5046 if (TARGET_AVX512BW)
5047 gen = gen_avx512bw_vpermt2varv32hi3;
5048 break;
5049 case E_V4SImode:
5050 if (TARGET_AVX512VL)
5051 gen = gen_avx512vl_vpermt2varv4si3;
5052 break;
5053 case E_V8SImode:
5054 if (TARGET_AVX512VL)
5055 gen = gen_avx512vl_vpermt2varv8si3;
5056 break;
5057 case E_V16SImode:
5058 if (TARGET_AVX512F)
5059 gen = gen_avx512f_vpermt2varv16si3;
5060 break;
5061 case E_V4SFmode:
5062 if (TARGET_AVX512VL)
5063 {
5064 gen = gen_avx512vl_vpermt2varv4sf3;
5065 maskmode = V4SImode;
5066 }
5067 break;
5068 case E_V8SFmode:
5069 if (TARGET_AVX512VL)
5070 {
5071 gen = gen_avx512vl_vpermt2varv8sf3;
5072 maskmode = V8SImode;
5073 }
5074 break;
5075 case E_V16SFmode:
5076 if (TARGET_AVX512F)
5077 {
5078 gen = gen_avx512f_vpermt2varv16sf3;
5079 maskmode = V16SImode;
5080 }
5081 break;
5082 case E_V2DImode:
5083 if (TARGET_AVX512VL)
5084 gen = gen_avx512vl_vpermt2varv2di3;
5085 break;
5086 case E_V4DImode:
5087 if (TARGET_AVX512VL)
5088 gen = gen_avx512vl_vpermt2varv4di3;
5089 break;
5090 case E_V8DImode:
5091 if (TARGET_AVX512F)
5092 gen = gen_avx512f_vpermt2varv8di3;
5093 break;
5094 case E_V2DFmode:
5095 if (TARGET_AVX512VL)
5096 {
5097 gen = gen_avx512vl_vpermt2varv2df3;
5098 maskmode = V2DImode;
5099 }
5100 break;
5101 case E_V4DFmode:
5102 if (TARGET_AVX512VL)
5103 {
5104 gen = gen_avx512vl_vpermt2varv4df3;
5105 maskmode = V4DImode;
5106 }
5107 break;
5108 case E_V8DFmode:
5109 if (TARGET_AVX512F)
5110 {
5111 gen = gen_avx512f_vpermt2varv8df3;
5112 maskmode = V8DImode;
5113 }
5114 break;
5115 default:
5116 break;
5117 }
5118
5119 if (gen == NULL)
5120 return false;
5121
5122 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
5123 expander, so args are either in d, or in op0, op1 etc. */
5124 if (d)
5125 {
5126 rtx vec[64];
5127 target = d->target;
5128 op0 = d->op0;
5129 op1 = d->op1;
5130 for (int i = 0; i < d->nelt; ++i)
5131 vec[i] = GEN_INT (d->perm[i]);
5132 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
5133 }
5134
5135 emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
5136 return true;
5137}
5138
5139/* Expand a variable vector permutation. */
5140
5141void
5142ix86_expand_vec_perm (rtx operands[])
5143{
5144 rtx target = operands[0];
5145 rtx op0 = operands[1];
5146 rtx op1 = operands[2];
5147 rtx mask = operands[3];
5148 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
5149 machine_mode mode = GET_MODE (op0);
5150 machine_mode maskmode = GET_MODE (mask);
5151 int w, e, i;
5152 bool one_operand_shuffle = rtx_equal_p (op0, op1);
5153
5154 /* Number of elements in the vector. */
5155 w = GET_MODE_NUNITS (mode);
5156 e = GET_MODE_UNIT_SIZE (mode);
5157 gcc_assert (w <= 64);
5158
be072bfa
HW
5159 /* For an HFmode vector, convert it to HImode using a subreg. */
5160 if (GET_MODE_INNER (mode) == HFmode)
5161 {
5162 machine_mode orig_mode = mode;
5163 mode = mode_for_vector (HImode, w).require ();
5164 target = lowpart_subreg (mode, target, orig_mode);
5165 op0 = lowpart_subreg (mode, op0, orig_mode);
5166 op1 = lowpart_subreg (mode, op1, orig_mode);
5167 }
5168
2bf6d935
ML
5169 if (TARGET_AVX512F && one_operand_shuffle)
5170 {
5171 rtx (*gen) (rtx, rtx, rtx) = NULL;
5172 switch (mode)
5173 {
5174 case E_V16SImode:
5175 gen =gen_avx512f_permvarv16si;
5176 break;
5177 case E_V16SFmode:
5178 gen = gen_avx512f_permvarv16sf;
5179 break;
5180 case E_V8DImode:
5181 gen = gen_avx512f_permvarv8di;
5182 break;
5183 case E_V8DFmode:
5184 gen = gen_avx512f_permvarv8df;
5185 break;
5186 default:
5187 break;
5188 }
5189 if (gen != NULL)
5190 {
5191 emit_insn (gen (target, op0, mask));
5192 return;
5193 }
5194 }
5195
5196 if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
5197 return;
5198
5199 if (TARGET_AVX2)
5200 {
5201 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
5202 {
5203 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
5204 a constant shuffle operand. With a tiny bit of effort we can
5205 use VPERMD instead. A re-interpretation stall for V4DFmode is
5206 unfortunate but there's no avoiding it.
5207 Similarly for V16HImode we don't have instructions for variable
5208 shuffling, while for V32QImode, after preparing suitable masks,
5209 we can use vpshufb; vpshufb; vpermq; vpor. */
5210
5211 if (mode == V16HImode)
5212 {
5213 maskmode = mode = V32QImode;
5214 w = 32;
5215 e = 1;
5216 }
5217 else
5218 {
5219 maskmode = mode = V8SImode;
5220 w = 8;
5221 e = 4;
5222 }
5223 t1 = gen_reg_rtx (maskmode);
5224
5225 /* Replicate the low bits of the V4DImode mask into V8SImode:
5226 mask = { A B C D }
5227 t1 = { A A B B C C D D }. */
5228 for (i = 0; i < w / 2; ++i)
5229 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
5230 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
5231 vt = force_reg (maskmode, vt);
5232 mask = gen_lowpart (maskmode, mask);
5233 if (maskmode == V8SImode)
5234 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
5235 else
5236 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
5237
5238 /* Multiply the shuffle indices by two. */
5239 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
5240 OPTAB_DIRECT);
5241
5242 /* Add one to the odd shuffle indices:
5243 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
5244 for (i = 0; i < w / 2; ++i)
5245 {
5246 vec[i * 2] = const0_rtx;
5247 vec[i * 2 + 1] = const1_rtx;
5248 }
5249 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
5250 vt = validize_mem (force_const_mem (maskmode, vt));
5251 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
5252 OPTAB_DIRECT);
5253
5254 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
5255 operands[3] = mask = t1;
5256 target = gen_reg_rtx (mode);
5257 op0 = gen_lowpart (mode, op0);
5258 op1 = gen_lowpart (mode, op1);
5259 }
5260
5261 switch (mode)
5262 {
5263 case E_V8SImode:
5264 /* The VPERMD and VPERMPS instructions already properly ignore
5265 the high bits of the shuffle elements. No need for us to
5266 perform an AND ourselves. */
5267 if (one_operand_shuffle)
5268 {
5269 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
5270 if (target != operands[0])
5271 emit_move_insn (operands[0],
5272 gen_lowpart (GET_MODE (operands[0]), target));
5273 }
5274 else
5275 {
5276 t1 = gen_reg_rtx (V8SImode);
5277 t2 = gen_reg_rtx (V8SImode);
5278 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
5279 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
5280 goto merge_two;
5281 }
5282 return;
5283
5284 case E_V8SFmode:
5285 mask = gen_lowpart (V8SImode, mask);
5286 if (one_operand_shuffle)
5287 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
5288 else
5289 {
5290 t1 = gen_reg_rtx (V8SFmode);
5291 t2 = gen_reg_rtx (V8SFmode);
5292 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
5293 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
5294 goto merge_two;
5295 }
5296 return;
5297
5298 case E_V4SImode:
5299 /* By combining the two 128-bit input vectors into one 256-bit
5300 input vector, we can use VPERMD and VPERMPS for the full
5301 two-operand shuffle. */
5302 t1 = gen_reg_rtx (V8SImode);
5303 t2 = gen_reg_rtx (V8SImode);
5304 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
5305 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
5306 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
5307 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
5308 return;
5309
5310 case E_V4SFmode:
5311 t1 = gen_reg_rtx (V8SFmode);
5312 t2 = gen_reg_rtx (V8SImode);
5313 mask = gen_lowpart (V4SImode, mask);
5314 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
5315 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
5316 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
5317 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
5318 return;
5319
5320 case E_V32QImode:
5321 t1 = gen_reg_rtx (V32QImode);
5322 t2 = gen_reg_rtx (V32QImode);
5323 t3 = gen_reg_rtx (V32QImode);
5324 vt2 = GEN_INT (-128);
5325 vt = gen_const_vec_duplicate (V32QImode, vt2);
5326 vt = force_reg (V32QImode, vt);
5327 for (i = 0; i < 32; i++)
5328 vec[i] = i < 16 ? vt2 : const0_rtx;
5329 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
5330 vt2 = force_reg (V32QImode, vt2);
5331 /* From mask create two adjusted masks, which contain the same
5332 bits as mask in the low 7 bits of each vector element.
5333 The first mask will have the most significant bit clear
5334 if it requests element from the same 128-bit lane
5335 and MSB set if it requests element from the other 128-bit lane.
5336 The second mask will have the opposite values of the MSB,
5337 and additionally will have its 128-bit lanes swapped.
5338 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
5339 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
5340 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
5341 stands for other 12 bytes. */
5342 /* The bit whether element is from the same lane or the other
5343 lane is bit 4, so shift it up by 3 to the MSB position. */
5344 t5 = gen_reg_rtx (V4DImode);
5345 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
5346 GEN_INT (3)));
5347 /* Clear MSB bits from the mask just in case it had them set. */
5348 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
5349 /* After this t1 will have MSB set for elements from other lane. */
5350 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
5351 /* Clear bits other than MSB. */
5352 emit_insn (gen_andv32qi3 (t1, t1, vt));
5353 /* Or in the lower bits from mask into t3. */
5354 emit_insn (gen_iorv32qi3 (t3, t1, t2));
5355 /* And invert MSB bits in t1, so MSB is set for elements from the same
5356 lane. */
5357 emit_insn (gen_xorv32qi3 (t1, t1, vt));
5358 /* Swap 128-bit lanes in t3. */
5359 t6 = gen_reg_rtx (V4DImode);
5360 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
5361 const2_rtx, GEN_INT (3),
5362 const0_rtx, const1_rtx));
5363 /* And or in the lower bits from mask into t1. */
5364 emit_insn (gen_iorv32qi3 (t1, t1, t2));
5365 if (one_operand_shuffle)
5366 {
5367 /* Each of these shuffles will put 0s in places where
5368 element from the other 128-bit lane is needed, otherwise
5369 will shuffle in the requested value. */
5370 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
5371 gen_lowpart (V32QImode, t6)));
5372 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
5373 /* For t3 the 128-bit lanes are swapped again. */
5374 t7 = gen_reg_rtx (V4DImode);
5375 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
5376 const2_rtx, GEN_INT (3),
5377 const0_rtx, const1_rtx));
5378 /* And oring both together leads to the result. */
5379 emit_insn (gen_iorv32qi3 (target, t1,
5380 gen_lowpart (V32QImode, t7)));
5381 if (target != operands[0])
5382 emit_move_insn (operands[0],
5383 gen_lowpart (GET_MODE (operands[0]), target));
5384 return;
5385 }
5386
5387 t4 = gen_reg_rtx (V32QImode);
5388 /* Similarly to the above one_operand_shuffle code,
5389 just repeated twice, once for each operand. The merge_two:
5390 code below will merge the two results together. */
5391 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
5392 gen_lowpart (V32QImode, t6)));
5393 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
5394 gen_lowpart (V32QImode, t6)));
5395 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
5396 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
5397 t7 = gen_reg_rtx (V4DImode);
5398 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
5399 const2_rtx, GEN_INT (3),
5400 const0_rtx, const1_rtx));
5401 t8 = gen_reg_rtx (V4DImode);
5402 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
5403 const2_rtx, GEN_INT (3),
5404 const0_rtx, const1_rtx));
5405 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
5406 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
5407 t1 = t4;
5408 t2 = t3;
5409 goto merge_two;
5410
5411 default:
5412 gcc_assert (GET_MODE_SIZE (mode) <= 16);
5413 break;
5414 }
5415 }
5416
5417 if (TARGET_XOP)
5418 {
5419 /* The XOP VPPERM insn supports three inputs. By ignoring the
5420 one_operand_shuffle special case, we avoid creating another
5421 set of constant vectors in memory. */
5422 one_operand_shuffle = false;
5423
5424 /* mask = mask & {2*w-1, ...} */
5425 vt = GEN_INT (2*w - 1);
5426 }
5427 else
5428 {
5429 /* mask = mask & {w-1, ...} */
5430 vt = GEN_INT (w - 1);
5431 }
5432
5433 vt = gen_const_vec_duplicate (maskmode, vt);
5434 mask = expand_simple_binop (maskmode, AND, mask, vt,
5435 NULL_RTX, 0, OPTAB_DIRECT);
5436
5437 /* For non-QImode operations, convert the word permutation control
5438 into a byte permutation control. */
5439 if (mode != V16QImode)
5440 {
5441 mask = expand_simple_binop (maskmode, ASHIFT, mask,
5442 GEN_INT (exact_log2 (e)),
5443 NULL_RTX, 0, OPTAB_DIRECT);
5444
5445 /* Convert mask to vector of chars. */
5446 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
5447
5448 /* Replicate each of the input bytes into byte positions:
5449 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
5450 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
5451 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
5452 for (i = 0; i < 16; ++i)
5453 vec[i] = GEN_INT (i/e * e);
5454 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
5455 vt = validize_mem (force_const_mem (V16QImode, vt));
5456 if (TARGET_XOP)
5457 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
5458 else
5459 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
5460
5461 /* Convert it into the byte positions by doing
5462 mask = mask + {0,1,..,e-1, 0,1,..,e-1, ...} */
5463 for (i = 0; i < 16; ++i)
5464 vec[i] = GEN_INT (i % e);
5465 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
5466 vt = validize_mem (force_const_mem (V16QImode, vt));
5467 emit_insn (gen_addv16qi3 (mask, mask, vt));
5468 }
5469
5470 /* The actual shuffle operations all operate on V16QImode. */
5471 op0 = gen_lowpart (V16QImode, op0);
5472 op1 = gen_lowpart (V16QImode, op1);
5473
5474 if (TARGET_XOP)
5475 {
5476 if (GET_MODE (target) != V16QImode)
5477 target = gen_reg_rtx (V16QImode);
5478 emit_insn (gen_xop_pperm (target, op0, op1, mask));
5479 if (target != operands[0])
5480 emit_move_insn (operands[0],
5481 gen_lowpart (GET_MODE (operands[0]), target));
5482 }
5483 else if (one_operand_shuffle)
5484 {
5485 if (GET_MODE (target) != V16QImode)
5486 target = gen_reg_rtx (V16QImode);
5487 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
5488 if (target != operands[0])
5489 emit_move_insn (operands[0],
5490 gen_lowpart (GET_MODE (operands[0]), target));
5491 }
5492 else
5493 {
5494 rtx xops[6];
5495 bool ok;
5496
5497 /* Shuffle the two input vectors independently. */
5498 t1 = gen_reg_rtx (V16QImode);
5499 t2 = gen_reg_rtx (V16QImode);
5500 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
5501 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
5502
5503 merge_two:
5504 /* Then merge them together. The key is whether any given control
5505 element contained a bit set that indicates the second word. */
5506 mask = operands[3];
5507 vt = GEN_INT (w);
5508 if (maskmode == V2DImode && !TARGET_SSE4_1)
5509 {
5510 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
5511 more shuffle to convert the V2DI input mask into a V4SI
5512 input mask. At that point the masking that expand_int_vcond
5513 performs will work as desired. */
5514 rtx t3 = gen_reg_rtx (V4SImode);
5515 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
5516 const0_rtx, const0_rtx,
5517 const2_rtx, const2_rtx));
5518 mask = t3;
5519 maskmode = V4SImode;
5520 e = w = 4;
5521 }
5522
5523 vt = gen_const_vec_duplicate (maskmode, vt);
5524 vt = force_reg (maskmode, vt);
5525 mask = expand_simple_binop (maskmode, AND, mask, vt,
5526 NULL_RTX, 0, OPTAB_DIRECT);
5527
5528 if (GET_MODE (target) != mode)
5529 target = gen_reg_rtx (mode);
5530 xops[0] = target;
5531 xops[1] = gen_lowpart (mode, t2);
5532 xops[2] = gen_lowpart (mode, t1);
5533 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
5534 xops[4] = mask;
5535 xops[5] = vt;
5536 ok = ix86_expand_int_vcond (xops);
5537 gcc_assert (ok);
5538 if (target != operands[0])
5539 emit_move_insn (operands[0],
5540 gen_lowpart (GET_MODE (operands[0]), target));
5541 }
5542}
5543
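/* Editorial sketch, not part of the GCC sources: what the SSSE3 fallback
   above does for a V4SImode permutation with selector {2, 0, 3, 1}.  After
   masking with w-1 = 3, the word indices are shifted left by log2(e) = 2
   and expanded to the byte controls
     {8,9,10,11, 0,1,2,3, 12,13,14,15, 4,5,6,7}
   which feed pshufb (or vpperm on XOP).  Guarded out; illustrative only.  */
#if 0
static void
example_var_perm_v4si (rtx dest, rtx op0, rtx op1, rtx sel)
{
  rtx ops[4] = { dest, op0, op1, sel };
  ix86_expand_vec_perm (ops);
}
#endif
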
5544/* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is
5545 true if we should do zero extension, else sign extension. HIGH_P is
5546 true if we want the N/2 high elements, else the low elements. */
5547
5548void
5549ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
5550{
5551 machine_mode imode = GET_MODE (src);
5552 rtx tmp;
5553
5554 if (TARGET_SSE4_1)
5555 {
5556 rtx (*unpack)(rtx, rtx);
5557 rtx (*extract)(rtx, rtx) = NULL;
5558 machine_mode halfmode = BLKmode;
5559
5560 switch (imode)
5561 {
5562 case E_V64QImode:
5563 if (unsigned_p)
5564 unpack = gen_avx512bw_zero_extendv32qiv32hi2;
5565 else
5566 unpack = gen_avx512bw_sign_extendv32qiv32hi2;
5567 halfmode = V32QImode;
5568 extract
5569 = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
5570 break;
5571 case E_V32QImode:
5572 if (unsigned_p)
5573 unpack = gen_avx2_zero_extendv16qiv16hi2;
5574 else
5575 unpack = gen_avx2_sign_extendv16qiv16hi2;
5576 halfmode = V16QImode;
5577 extract
5578 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
5579 break;
5580 case E_V32HImode:
5581 if (unsigned_p)
5582 unpack = gen_avx512f_zero_extendv16hiv16si2;
5583 else
5584 unpack = gen_avx512f_sign_extendv16hiv16si2;
5585 halfmode = V16HImode;
5586 extract
5587 = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
5588 break;
5589 case E_V16HImode:
5590 if (unsigned_p)
5591 unpack = gen_avx2_zero_extendv8hiv8si2;
5592 else
5593 unpack = gen_avx2_sign_extendv8hiv8si2;
5594 halfmode = V8HImode;
5595 extract
5596 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
5597 break;
5598 case E_V16SImode:
5599 if (unsigned_p)
5600 unpack = gen_avx512f_zero_extendv8siv8di2;
5601 else
5602 unpack = gen_avx512f_sign_extendv8siv8di2;
5603 halfmode = V8SImode;
5604 extract
5605 = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
5606 break;
5607 case E_V8SImode:
5608 if (unsigned_p)
5609 unpack = gen_avx2_zero_extendv4siv4di2;
5610 else
5611 unpack = gen_avx2_sign_extendv4siv4di2;
5612 halfmode = V4SImode;
5613 extract
5614 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
5615 break;
5616 case E_V16QImode:
5617 if (unsigned_p)
5618 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
5619 else
5620 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
5621 break;
5622 case E_V8HImode:
5623 if (unsigned_p)
5624 unpack = gen_sse4_1_zero_extendv4hiv4si2;
5625 else
5626 unpack = gen_sse4_1_sign_extendv4hiv4si2;
5627 break;
5628 case E_V4SImode:
5629 if (unsigned_p)
5630 unpack = gen_sse4_1_zero_extendv2siv2di2;
5631 else
5632 unpack = gen_sse4_1_sign_extendv2siv2di2;
5633 break;
836328b2
UB
5634 case E_V8QImode:
5635 if (unsigned_p)
5636 unpack = gen_sse4_1_zero_extendv4qiv4hi2;
5637 else
5638 unpack = gen_sse4_1_sign_extendv4qiv4hi2;
5639 break;
5640 case E_V4HImode:
5641 if (unsigned_p)
5642 unpack = gen_sse4_1_zero_extendv2hiv2si2;
5643 else
5644 unpack = gen_sse4_1_sign_extendv2hiv2si2;
5645 break;
663a014e
UB
5646 case E_V4QImode:
5647 if (unsigned_p)
5648 unpack = gen_sse4_1_zero_extendv2qiv2hi2;
5649 else
5650 unpack = gen_sse4_1_sign_extendv2qiv2hi2;
5651 break;
2bf6d935
ML
5652 default:
5653 gcc_unreachable ();
5654 }
5655
5656 if (GET_MODE_SIZE (imode) >= 32)
5657 {
5658 tmp = gen_reg_rtx (halfmode);
5659 emit_insn (extract (tmp, src));
5660 }
5661 else if (high_p)
5662 {
836328b2
UB
5663 switch (GET_MODE_SIZE (imode))
5664 {
5665 case 16:
5666 /* Shift higher 8 bytes to lower 8 bytes. */
5667 tmp = gen_reg_rtx (V1TImode);
5668 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
5669 GEN_INT (64)));
5670 break;
5671 case 8:
5672 /* Shift higher 4 bytes to lower 4 bytes. */
5673 tmp = gen_reg_rtx (V1DImode);
5674 emit_insn (gen_mmx_lshrv1di3 (tmp, gen_lowpart (V1DImode, src),
5675 GEN_INT (32)));
5676 break;
663a014e
UB
5677 case 4:
5678 /* Shift higher 2 bytes to lower 2 bytes. */
5679 tmp = gen_reg_rtx (V1SImode);
5680 emit_insn (gen_mmx_lshrv1si3 (tmp, gen_lowpart (V1SImode, src),
5681 GEN_INT (16)));
5682 break;
836328b2
UB
5683 default:
5684 gcc_unreachable ();
5685 }
5686
2bf6d935
ML
5687 tmp = gen_lowpart (imode, tmp);
5688 }
5689 else
5690 tmp = src;
5691
5692 emit_insn (unpack (dest, tmp));
5693 }
5694 else
5695 {
5696 rtx (*unpack)(rtx, rtx, rtx);
5697
5698 switch (imode)
5699 {
5700 case E_V16QImode:
5701 if (high_p)
5702 unpack = gen_vec_interleave_highv16qi;
5703 else
5704 unpack = gen_vec_interleave_lowv16qi;
5705 break;
5706 case E_V8HImode:
5707 if (high_p)
5708 unpack = gen_vec_interleave_highv8hi;
5709 else
5710 unpack = gen_vec_interleave_lowv8hi;
5711 break;
5712 case E_V4SImode:
5713 if (high_p)
5714 unpack = gen_vec_interleave_highv4si;
5715 else
5716 unpack = gen_vec_interleave_lowv4si;
5717 break;
836328b2
UB
5718 case E_V8QImode:
5719 if (high_p)
5720 unpack = gen_mmx_punpckhbw;
5721 else
5722 unpack = gen_mmx_punpcklbw;
5723 break;
5724 case E_V4HImode:
5725 if (high_p)
5726 unpack = gen_mmx_punpckhwd;
5727 else
5728 unpack = gen_mmx_punpcklwd;
5729 break;
663a014e
UB
5730 case E_V4QImode:
5731 if (high_p)
5732 unpack = gen_mmx_punpckhbw_low;
5733 else
5734 unpack = gen_mmx_punpcklbw_low;
5735 break;
2bf6d935
ML
5736 default:
5737 gcc_unreachable ();
5738 }
5739
5740 if (unsigned_p)
5741 tmp = force_reg (imode, CONST0_RTX (imode));
5742 else
5743 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
5744 src, pc_rtx, pc_rtx);
5745
5746 rtx tmp2 = gen_reg_rtx (imode);
5747 emit_insn (unpack (tmp2, src, tmp));
5748 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
5749 }
5750}
5751
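/* Editorial sketch, not part of the GCC sources: sign-extending the low
   half of a V16QImode register into V8HImode.  With SSE4.1 the routine
   above emits a single pmovsxbw; without it, a sign mask is computed with
   pcmpgtb and interleaved in as the high bytes.  Guarded out; illustrative
   only.  */
#if 0
static void
example_unpack_lo_signed (rtx dest_v8hi, rtx src_v16qi)
{
  ix86_expand_sse_unpack (dest_v8hi, src_v16qi,
			  /*unsigned_p=*/false, /*high_p=*/false);
}
#endif
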
faf2b6bc 5752/* Return true if MEM is a constant-pool reference whose constant is a
5753 const_vector permutation index; if so, store the index in PERM. */
5754bool
5755ix86_extract_perm_from_pool_constant (int* perm, rtx mem)
5756{
5757 machine_mode mode = GET_MODE (mem);
5758 int nelt = GET_MODE_NUNITS (mode);
5759
5760 if (!INTEGRAL_MODE_P (mode))
5761 return false;
5762
5763 /* Needs to be constant pool. */
5764 if (!(MEM_P (mem))
5765 || !SYMBOL_REF_P (XEXP (mem, 0))
5766 || !CONSTANT_POOL_ADDRESS_P (XEXP (mem, 0)))
5767 return false;
5768
5769 rtx constant = get_pool_constant (XEXP (mem, 0));
5770
5771 if (GET_CODE (constant) != CONST_VECTOR)
5772 return false;
5773
5774 /* There could be some rtx like
5775 (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
5776 but with "*.LC1" referring to a V2DI constant vector. */
5777 if (GET_MODE (constant) != mode)
5778 {
5779 constant = simplify_subreg (mode, constant, GET_MODE (constant), 0);
5780
5781 if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR)
5782 return false;
5783 }
5784
5785 for (int i = 0; i != nelt; i++)
5786 perm[i] = UINTVAL (XVECEXP (constant, 0, i));
5787
5788 return true;
5789}
5790
2bf6d935
ML
5791/* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
5792 but works for floating-point parameters and non-offsettable memories.
5793 For pushes, it returns just stack offsets; the values will be saved
5794 in the right order. At most four parts are generated. */
5795
5796static int
5797ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
5798{
5799 int size;
5800
5801 if (!TARGET_64BIT)
5802 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
5803 else
5804 size = (GET_MODE_SIZE (mode) + 4) / 8;
5805
5806 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
5807 gcc_assert (size >= 2 && size <= 4);
5808
5809 /* Optimize constant pool reference to immediates. This is used by fp
5810 moves, which force all constants to memory to allow combining. */
5811 if (MEM_P (operand) && MEM_READONLY_P (operand))
5812 operand = avoid_constant_pool_reference (operand);
5813
5814 if (MEM_P (operand) && !offsettable_memref_p (operand))
5815 {
 5816 /* The only non-offsettable memories we handle are pushes. */
5817 int ok = push_operand (operand, VOIDmode);
5818
5819 gcc_assert (ok);
5820
5821 operand = copy_rtx (operand);
5822 PUT_MODE (operand, word_mode);
5823 parts[0] = parts[1] = parts[2] = parts[3] = operand;
5824 return size;
5825 }
5826
5827 if (GET_CODE (operand) == CONST_VECTOR)
5828 {
5829 scalar_int_mode imode = int_mode_for_mode (mode).require ();
5830 /* Caution: if we looked through a constant pool memory above,
5831 the operand may actually have a different mode now. That's
5832 ok, since we want to pun this all the way back to an integer. */
5833 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
5834 gcc_assert (operand != NULL);
5835 mode = imode;
5836 }
5837
5838 if (!TARGET_64BIT)
5839 {
5840 if (mode == DImode)
5841 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
5842 else
5843 {
5844 int i;
5845
5846 if (REG_P (operand))
5847 {
5848 gcc_assert (reload_completed);
5849 for (i = 0; i < size; i++)
5850 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
5851 }
5852 else if (offsettable_memref_p (operand))
5853 {
5854 operand = adjust_address (operand, SImode, 0);
5855 parts[0] = operand;
5856 for (i = 1; i < size; i++)
5857 parts[i] = adjust_address (operand, SImode, 4 * i);
5858 }
5859 else if (CONST_DOUBLE_P (operand))
5860 {
5861 const REAL_VALUE_TYPE *r;
5862 long l[4];
5863
5864 r = CONST_DOUBLE_REAL_VALUE (operand);
5865 switch (mode)
5866 {
5867 case E_TFmode:
5868 real_to_target (l, r, mode);
5869 parts[3] = gen_int_mode (l[3], SImode);
5870 parts[2] = gen_int_mode (l[2], SImode);
5871 break;
5872 case E_XFmode:
5873 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
5874 long double may not be 80-bit. */
5875 real_to_target (l, r, mode);
5876 parts[2] = gen_int_mode (l[2], SImode);
5877 break;
5878 case E_DFmode:
5879 REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
5880 break;
5881 default:
5882 gcc_unreachable ();
5883 }
5884 parts[1] = gen_int_mode (l[1], SImode);
5885 parts[0] = gen_int_mode (l[0], SImode);
5886 }
5887 else
5888 gcc_unreachable ();
5889 }
5890 }
5891 else
5892 {
5893 if (mode == TImode)
5894 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
5895 if (mode == XFmode || mode == TFmode)
5896 {
5897 machine_mode upper_mode = mode==XFmode ? SImode : DImode;
5898 if (REG_P (operand))
5899 {
5900 gcc_assert (reload_completed);
5901 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
5902 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
5903 }
5904 else if (offsettable_memref_p (operand))
5905 {
5906 operand = adjust_address (operand, DImode, 0);
5907 parts[0] = operand;
5908 parts[1] = adjust_address (operand, upper_mode, 8);
5909 }
5910 else if (CONST_DOUBLE_P (operand))
5911 {
5912 long l[4];
5913
5914 real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
5915
5916 /* real_to_target puts 32-bit pieces in each long. */
5917 parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
5918 | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
5919 << 32), DImode);
5920
5921 if (upper_mode == SImode)
5922 parts[1] = gen_int_mode (l[2], SImode);
5923 else
5924 parts[1]
5925 = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
5926 | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
5927 << 32), DImode);
5928 }
5929 else
5930 gcc_unreachable ();
5931 }
5932 }
5933
5934 return size;
5935}
5936
 5937/* Emit insns to perform a move or push of DI, DF, XF, and TF values.
 5938 All required insns are emitted here.  The split parts are placed into
 5939 operands 2-5 (destination) and 6-9 (source), in the order in which
 5940 the moves should be emitted. */
5941
5942void
5943ix86_split_long_move (rtx operands[])
5944{
5945 rtx part[2][4];
5946 int nparts, i, j;
5947 int push = 0;
5948 int collisions = 0;
5949 machine_mode mode = GET_MODE (operands[0]);
5950 bool collisionparts[4];
5951
 5952 /* The DFmode expanders may ask us to move a double.
 5953 For a 64-bit target this is a single move.  By hiding that fact
 5954 here we simplify the i386.md splitters. */
5955 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
5956 {
 5957 /* Optimize constant pool references to immediates.  This is used by
 5958 fp moves, which force all constants to memory to allow combining. */
5959
5960 if (MEM_P (operands[1])
5961 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
5962 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
5963 operands[1] = get_pool_constant (XEXP (operands[1], 0));
5964 if (push_operand (operands[0], VOIDmode))
5965 {
5966 operands[0] = copy_rtx (operands[0]);
5967 PUT_MODE (operands[0], word_mode);
5968 }
5969 else
5970 operands[0] = gen_lowpart (DImode, operands[0]);
5971 operands[1] = gen_lowpart (DImode, operands[1]);
5972 emit_move_insn (operands[0], operands[1]);
5973 return;
5974 }
5975
5976 /* The only non-offsettable memory we handle is push. */
5977 if (push_operand (operands[0], VOIDmode))
5978 push = 1;
5979 else
5980 gcc_assert (!MEM_P (operands[0])
5981 || offsettable_memref_p (operands[0]));
5982
5983 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
5984 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
5985
 5986 /* When emitting a push, be careful with source operands on the stack. */
5987 if (push && MEM_P (operands[1])
5988 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
5989 {
5990 rtx src_base = XEXP (part[1][nparts - 1], 0);
5991
5992 /* Compensate for the stack decrement by 4. */
5993 if (!TARGET_64BIT && nparts == 3
5994 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
5995 src_base = plus_constant (Pmode, src_base, 4);
5996
5997 /* src_base refers to the stack pointer and is
5998 automatically decreased by emitted push. */
5999 for (i = 0; i < nparts; i++)
6000 part[1][i] = change_address (part[1][i],
6001 GET_MODE (part[1][i]), src_base);
6002 }
6003
 6004 /* We need to do the copy in the right order in case an address register
 6005 of the source overlaps the destination. */
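  /* For example, if the source is a memory reference whose address register
     is also one of the destination halves, the half that overwrites that
     register must be moved last (or the address must be rematerialized with
     an lea, as below), otherwise the address of the remaining halves would
     be clobbered too early.  */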
6006 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
6007 {
6008 rtx tmp;
6009
6010 for (i = 0; i < nparts; i++)
6011 {
6012 collisionparts[i]
6013 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
6014 if (collisionparts[i])
6015 collisions++;
6016 }
6017
6018 /* Collision in the middle part can be handled by reordering. */
6019 if (collisions == 1 && nparts == 3 && collisionparts [1])
6020 {
6021 std::swap (part[0][1], part[0][2]);
6022 std::swap (part[1][1], part[1][2]);
6023 }
6024 else if (collisions == 1
6025 && nparts == 4
6026 && (collisionparts [1] || collisionparts [2]))
6027 {
6028 if (collisionparts [1])
6029 {
6030 std::swap (part[0][1], part[0][2]);
6031 std::swap (part[1][1], part[1][2]);
6032 }
6033 else
6034 {
6035 std::swap (part[0][2], part[0][3]);
6036 std::swap (part[1][2], part[1][3]);
6037 }
6038 }
6039
6040 /* If there are more collisions, we can't handle it by reordering.
6041 Do an lea to the last part and use only one colliding move. */
6042 else if (collisions > 1)
6043 {
6044 rtx base, addr;
6045
6046 collisions = 1;
6047
6048 base = part[0][nparts - 1];
6049
6050 /* Handle the case when the last part isn't valid for lea.
6051 Happens in 64-bit mode storing the 12-byte XFmode. */
6052 if (GET_MODE (base) != Pmode)
6053 base = gen_rtx_REG (Pmode, REGNO (base));
6054
6055 addr = XEXP (part[1][0], 0);
6056 if (TARGET_TLS_DIRECT_SEG_REFS)
6057 {
6058 struct ix86_address parts;
6059 int ok = ix86_decompose_address (addr, &parts);
6060 gcc_assert (ok);
6061 /* It is not valid to use %gs: or %fs: in lea. */
6062 gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
6063 }
6064 emit_insn (gen_rtx_SET (base, addr));
6065 part[1][0] = replace_equiv_address (part[1][0], base);
6066 for (i = 1; i < nparts; i++)
6067 {
6068 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
6069 part[1][i] = replace_equiv_address (part[1][i], tmp);
6070 }
6071 }
6072 }
6073
6074 if (push)
6075 {
6076 if (!TARGET_64BIT)
6077 {
6078 if (nparts == 3)
6079 {
6080 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
d9330fb5 6081 emit_insn (gen_add2_insn (stack_pointer_rtx, GEN_INT (-4)));
2bf6d935
ML
6082 emit_move_insn (part[0][2], part[1][2]);
6083 }
6084 else if (nparts == 4)
6085 {
6086 emit_move_insn (part[0][3], part[1][3]);
6087 emit_move_insn (part[0][2], part[1][2]);
6088 }
6089 }
6090 else
6091 {
 6092 /* In 64-bit mode we don't have a 32-bit push available.  If this is
 6093 a register, that is OK - we just use the larger counterpart.  We also
 6094 retype the memory - this comes from the attempt to avoid a REX prefix
 6095 on moving the second half of a TFmode value. */
6096 if (GET_MODE (part[1][1]) == SImode)
6097 {
6098 switch (GET_CODE (part[1][1]))
6099 {
6100 case MEM:
6101 part[1][1] = adjust_address (part[1][1], DImode, 0);
6102 break;
6103
6104 case REG:
6105 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
6106 break;
6107
6108 default:
6109 gcc_unreachable ();
6110 }
6111
6112 if (GET_MODE (part[1][0]) == SImode)
6113 part[1][0] = part[1][1];
6114 }
6115 }
6116 emit_move_insn (part[0][1], part[1][1]);
6117 emit_move_insn (part[0][0], part[1][0]);
6118 return;
6119 }
6120
6121 /* Choose correct order to not overwrite the source before it is copied. */
6122 if ((REG_P (part[0][0])
6123 && REG_P (part[1][1])
6124 && (REGNO (part[0][0]) == REGNO (part[1][1])
6125 || (nparts == 3
6126 && REGNO (part[0][0]) == REGNO (part[1][2]))
6127 || (nparts == 4
6128 && REGNO (part[0][0]) == REGNO (part[1][3]))))
6129 || (collisions > 0
6130 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
6131 {
6132 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
6133 {
6134 operands[2 + i] = part[0][j];
6135 operands[6 + i] = part[1][j];
6136 }
6137 }
6138 else
6139 {
6140 for (i = 0; i < nparts; i++)
6141 {
6142 operands[2 + i] = part[0][i];
6143 operands[6 + i] = part[1][i];
6144 }
6145 }
6146
6147 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
6148 if (optimize_insn_for_size_p ())
6149 {
6150 for (j = 0; j < nparts - 1; j++)
6151 if (CONST_INT_P (operands[6 + j])
6152 && operands[6 + j] != const0_rtx
6153 && REG_P (operands[2 + j]))
6154 for (i = j; i < nparts - 1; i++)
6155 if (CONST_INT_P (operands[7 + i])
6156 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
6157 operands[7 + i] = operands[2 + j];
6158 }
6159
6160 for (i = 0; i < nparts; i++)
6161 emit_move_insn (operands[2 + i], operands[6 + i]);
6162
6163 return;
6164}
6165
6166/* Helper function of ix86_split_ashl used to generate an SImode/DImode
6167 left shift by a constant, either using a single shift or
6168 a sequence of add instructions. */
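/* For example, a left shift by 2 becomes two "add reg,reg" instructions
   when the cost model says two additions are no more expensive than one
   shift by a constant (and we are not optimizing for size).  */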
6169
6170static void
6171ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
6172{
2bf6d935
ML
6173 if (count == 1
6174 || (count * ix86_cost->add <= ix86_cost->shift_const
6175 && !optimize_insn_for_size_p ()))
6176 {
2bf6d935 6177 while (count-- > 0)
83bc5e44 6178 emit_insn (gen_add2_insn (operand, operand));
2bf6d935
ML
6179 }
6180 else
6181 {
83bc5e44
UB
6182 rtx (*insn)(rtx, rtx, rtx);
6183
2bf6d935
ML
6184 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
6185 emit_insn (insn (operand, operand, GEN_INT (count)));
6186 }
6187}
6188
6189void
6190ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
6191{
6192 rtx (*gen_ashl3)(rtx, rtx, rtx);
6193 rtx (*gen_shld)(rtx, rtx, rtx);
6194 int half_width = GET_MODE_BITSIZE (mode) >> 1;
987a3082 6195 machine_mode half_mode;
2bf6d935
ML
6196
6197 rtx low[2], high[2];
6198 int count;
6199
6200 if (CONST_INT_P (operands[2]))
6201 {
6202 split_double_mode (mode, operands, 2, low, high);
6203 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
6204
6205 if (count >= half_width)
6206 {
6207 emit_move_insn (high[0], low[1]);
38b649ec 6208 ix86_expand_clear (low[0]);
2bf6d935
ML
6209
6210 if (count > half_width)
6211 ix86_expand_ashl_const (high[0], count - half_width, mode);
6212 }
6213 else
6214 {
6215 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
6216
6217 if (!rtx_equal_p (operands[0], operands[1]))
6218 emit_move_insn (operands[0], operands[1]);
6219
6220 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
6221 ix86_expand_ashl_const (low[0], count, mode);
6222 }
6223 return;
6224 }
6225
6226 split_double_mode (mode, operands, 1, low, high);
987a3082 6227 half_mode = mode == DImode ? SImode : DImode;
2bf6d935
ML
6228
6229 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
6230
6231 if (operands[1] == const1_rtx)
6232 {
 6233 /* Assuming we've chosen QImode-capable registers, then 1 << N
 6234 can be done with two 32/64-bit shifts, no branches, no cmoves. */
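      /* Concretely: both halves are cleared, the conditional byte sets below
	 put a 1 into exactly one of them depending on whether the shift count
	 has its half-width bit set, and the final pair of shifts of the two
	 halves (whose count the hardware masks modulo the half width) moves
	 that 1 into position.  */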
6235 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
6236 {
6237 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
6238
6239 ix86_expand_clear (low[0]);
6240 ix86_expand_clear (high[0]);
6241 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
6242
6243 d = gen_lowpart (QImode, low[0]);
6244 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
6245 s = gen_rtx_EQ (QImode, flags, const0_rtx);
6246 emit_insn (gen_rtx_SET (d, s));
6247
6248 d = gen_lowpart (QImode, high[0]);
6249 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
6250 s = gen_rtx_NE (QImode, flags, const0_rtx);
6251 emit_insn (gen_rtx_SET (d, s));
6252 }
6253
6254 /* Otherwise, we can get the same results by manually performing
6255 a bit extract operation on bit 5/6, and then performing the two
6256 shifts. The two methods of getting 0/1 into low/high are exactly
6257 the same size. Avoiding the shift in the bit extract case helps
6258 pentium4 a bit; no one else seems to care much either way. */
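	 /* Concretely, for the DImode (32-bit) case: high = (count >> 5) & 1,
	    low = high ^ 1, and then both halves are shifted left by the count
	    (which the hardware masks modulo 32), leaving 1 << count spread
	    across the register pair.  */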
6259 else
6260 {
2bf6d935
ML
6261 rtx (*gen_lshr3)(rtx, rtx, rtx);
6262 rtx (*gen_and3)(rtx, rtx, rtx);
6263 rtx (*gen_xor3)(rtx, rtx, rtx);
6264 HOST_WIDE_INT bits;
6265 rtx x;
6266
6267 if (mode == DImode)
6268 {
2bf6d935
ML
6269 gen_lshr3 = gen_lshrsi3;
6270 gen_and3 = gen_andsi3;
6271 gen_xor3 = gen_xorsi3;
6272 bits = 5;
6273 }
6274 else
6275 {
2bf6d935
ML
6276 gen_lshr3 = gen_lshrdi3;
6277 gen_and3 = gen_anddi3;
6278 gen_xor3 = gen_xordi3;
6279 bits = 6;
6280 }
6281
6282 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
6283 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
6284 else
6285 x = gen_lowpart (half_mode, operands[2]);
6286 emit_insn (gen_rtx_SET (high[0], x));
6287
6288 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
6289 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
6290 emit_move_insn (low[0], high[0]);
6291 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
6292 }
6293
6294 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
6295 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
6296 return;
6297 }
6298
6299 if (operands[1] == constm1_rtx)
6300 {
6301 /* For -1 << N, we can avoid the shld instruction, because we
6302 know that we're shifting 0...31/63 ones into a -1. */
6303 emit_move_insn (low[0], constm1_rtx);
6304 if (optimize_insn_for_size_p ())
6305 emit_move_insn (high[0], low[0]);
6306 else
6307 emit_move_insn (high[0], constm1_rtx);
6308 }
6309 else
6310 {
6311 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
6312
6313 if (!rtx_equal_p (operands[0], operands[1]))
6314 emit_move_insn (operands[0], operands[1]);
6315
6316 split_double_mode (mode, operands, 1, low, high);
6317 emit_insn (gen_shld (high[0], low[0], operands[2]));
6318 }
6319
6320 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
6321
6322 if (TARGET_CMOVE && scratch)
6323 {
2bf6d935 6324 ix86_expand_clear (scratch);
987a3082
UB
6325 emit_insn (gen_x86_shift_adj_1
6326 (half_mode, high[0], low[0], operands[2], scratch));
2bf6d935
ML
6327 }
6328 else
987a3082 6329 emit_insn (gen_x86_shift_adj_2 (half_mode, high[0], low[0], operands[2]));
2bf6d935
ML
6330}
6331
6332void
6333ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
6334{
6335 rtx (*gen_ashr3)(rtx, rtx, rtx)
6336 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
6337 rtx (*gen_shrd)(rtx, rtx, rtx);
6338 int half_width = GET_MODE_BITSIZE (mode) >> 1;
6339
6340 rtx low[2], high[2];
6341 int count;
6342
6343 if (CONST_INT_P (operands[2]))
6344 {
6345 split_double_mode (mode, operands, 2, low, high);
6346 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
6347
6348 if (count == GET_MODE_BITSIZE (mode) - 1)
6349 {
6350 emit_move_insn (high[0], high[1]);
6351 emit_insn (gen_ashr3 (high[0], high[0],
6352 GEN_INT (half_width - 1)));
6353 emit_move_insn (low[0], high[0]);
6354
6355 }
6356 else if (count >= half_width)
6357 {
6358 emit_move_insn (low[0], high[1]);
6359 emit_move_insn (high[0], low[0]);
6360 emit_insn (gen_ashr3 (high[0], high[0],
6361 GEN_INT (half_width - 1)));
6362
6363 if (count > half_width)
6364 emit_insn (gen_ashr3 (low[0], low[0],
6365 GEN_INT (count - half_width)));
6366 }
6367 else
6368 {
6369 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6370
6371 if (!rtx_equal_p (operands[0], operands[1]))
6372 emit_move_insn (operands[0], operands[1]);
6373
6374 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
6375 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
6376 }
6377 }
6378 else
6379 {
987a3082
UB
6380 machine_mode half_mode;
6381
2bf6d935
ML
6382 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6383
6384 if (!rtx_equal_p (operands[0], operands[1]))
6385 emit_move_insn (operands[0], operands[1]);
6386
6387 split_double_mode (mode, operands, 1, low, high);
987a3082 6388 half_mode = mode == DImode ? SImode : DImode;
2bf6d935
ML
6389
6390 emit_insn (gen_shrd (low[0], high[0], operands[2]));
6391 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
6392
6393 if (TARGET_CMOVE && scratch)
6394 {
2bf6d935
ML
6395 emit_move_insn (scratch, high[0]);
6396 emit_insn (gen_ashr3 (scratch, scratch,
6397 GEN_INT (half_width - 1)));
987a3082
UB
6398 emit_insn (gen_x86_shift_adj_1
6399 (half_mode, low[0], high[0], operands[2], scratch));
2bf6d935
ML
6400 }
6401 else
987a3082
UB
6402 emit_insn (gen_x86_shift_adj_3
6403 (half_mode, low[0], high[0], operands[2]));
2bf6d935
ML
6404 }
6405}
6406
6407void
6408ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
6409{
6410 rtx (*gen_lshr3)(rtx, rtx, rtx)
6411 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
6412 rtx (*gen_shrd)(rtx, rtx, rtx);
6413 int half_width = GET_MODE_BITSIZE (mode) >> 1;
6414
6415 rtx low[2], high[2];
6416 int count;
6417
6418 if (CONST_INT_P (operands[2]))
6419 {
6420 split_double_mode (mode, operands, 2, low, high);
6421 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
6422
6423 if (count >= half_width)
6424 {
6425 emit_move_insn (low[0], high[1]);
6426 ix86_expand_clear (high[0]);
6427
6428 if (count > half_width)
6429 emit_insn (gen_lshr3 (low[0], low[0],
6430 GEN_INT (count - half_width)));
6431 }
6432 else
6433 {
6434 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6435
6436 if (!rtx_equal_p (operands[0], operands[1]))
6437 emit_move_insn (operands[0], operands[1]);
6438
6439 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
6440 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
6441 }
6442 }
6443 else
6444 {
987a3082
UB
6445 machine_mode half_mode;
6446
2bf6d935
ML
6447 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6448
6449 if (!rtx_equal_p (operands[0], operands[1]))
6450 emit_move_insn (operands[0], operands[1]);
6451
6452 split_double_mode (mode, operands, 1, low, high);
987a3082 6453 half_mode = mode == DImode ? SImode : DImode;
2bf6d935
ML
6454
6455 emit_insn (gen_shrd (low[0], high[0], operands[2]));
6456 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
6457
6458 if (TARGET_CMOVE && scratch)
6459 {
2bf6d935 6460 ix86_expand_clear (scratch);
987a3082
UB
6461 emit_insn (gen_x86_shift_adj_1
6462 (half_mode, low[0], high[0], operands[2], scratch));
2bf6d935
ML
6463 }
6464 else
987a3082
UB
6465 emit_insn (gen_x86_shift_adj_2
6466 (half_mode, low[0], high[0], operands[2]));
2bf6d935
ML
6467 }
6468}
6469
1188cf5f
RS
6470/* Expand move of V1TI mode register X to a new TI mode register. */
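/* With SSE2 the two 64-bit halves are extracted straight out of the vector
   register into the TImode register pair; without SSE2 the value is simply
   reinterpreted via a lowpart move.  */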
6471static rtx
6472ix86_expand_v1ti_to_ti (rtx x)
6473{
6474 rtx result = gen_reg_rtx (TImode);
a5d269f0
RS
6475 if (TARGET_SSE2)
6476 {
51e9e8a2 6477 rtx temp = force_reg (V2DImode, gen_lowpart (V2DImode, x));
a5d269f0
RS
6478 rtx lo = gen_lowpart (DImode, result);
6479 emit_insn (gen_vec_extractv2didi (lo, temp, const0_rtx));
6480 rtx hi = gen_highpart (DImode, result);
6481 emit_insn (gen_vec_extractv2didi (hi, temp, const1_rtx));
6482 }
6483 else
6484 emit_move_insn (result, gen_lowpart (TImode, x));
1188cf5f
RS
6485 return result;
6486}
6487
6488/* Expand move of TI mode register X to a new V1TI mode register. */
6489static rtx
6490ix86_expand_ti_to_v1ti (rtx x)
6491{
1188cf5f
RS
6492 if (TARGET_SSE2)
6493 {
6494 rtx lo = gen_lowpart (DImode, x);
6495 rtx hi = gen_highpart (DImode, x);
6496 rtx tmp = gen_reg_rtx (V2DImode);
6497 emit_insn (gen_vec_concatv2di (tmp, lo, hi));
51e9e8a2 6498 return force_reg (V1TImode, gen_lowpart (V1TImode, tmp));
1188cf5f 6499 }
51e9e8a2
RS
6500
6501 return force_reg (V1TImode, gen_lowpart (V1TImode, x));
1188cf5f
RS
6502}
6503
6b8b2557 6504/* Expand V1TI mode shift (of rtx_code CODE) by constant. */
1188cf5f
RS
6505void
6506ix86_expand_v1ti_shift (enum rtx_code code, rtx operands[])
6b8b2557 6507{
6b8b2557
RS
6508 rtx op1 = force_reg (V1TImode, operands[1]);
6509
1188cf5f
RS
6510 if (!CONST_INT_P (operands[2]))
6511 {
6512 rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
6513 rtx tmp2 = gen_reg_rtx (TImode);
6514 rtx (*shift) (rtx, rtx, rtx)
6515 = (code == ASHIFT) ? gen_ashlti3 : gen_lshrti3;
6516 emit_insn (shift (tmp2, tmp1, operands[2]));
6517 rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
6518 emit_move_insn (operands[0], tmp3);
6519 return;
6520 }
6521
6522 HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
6523
6b8b2557
RS
6524 if (bits == 0)
6525 {
6526 emit_move_insn (operands[0], op1);
6527 return;
6528 }
6529
6530 if ((bits & 7) == 0)
6531 {
6532 rtx tmp = gen_reg_rtx (V1TImode);
6533 if (code == ASHIFT)
1188cf5f 6534 emit_insn (gen_sse2_ashlv1ti3 (tmp, op1, GEN_INT (bits)));
6b8b2557
RS
6535 else
6536 emit_insn (gen_sse2_lshrv1ti3 (tmp, op1, GEN_INT (bits)));
6537 emit_move_insn (operands[0], tmp);
6538 return;
6539 }
6540
6541 rtx tmp1 = gen_reg_rtx (V1TImode);
6542 if (code == ASHIFT)
6543 emit_insn (gen_sse2_ashlv1ti3 (tmp1, op1, GEN_INT (64)));
6544 else
6545 emit_insn (gen_sse2_lshrv1ti3 (tmp1, op1, GEN_INT (64)));
6546
6547 /* tmp2 is operands[1] shifted by 64, in V2DImode. */
51e9e8a2 6548 rtx tmp2 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
6b8b2557
RS
6549
6550 /* tmp3 will be the V2DImode result. */
6551 rtx tmp3 = gen_reg_rtx (V2DImode);
6552
6553 if (bits > 64)
6554 {
6555 if (code == ASHIFT)
6556 emit_insn (gen_ashlv2di3 (tmp3, tmp2, GEN_INT (bits - 64)));
6557 else
6558 emit_insn (gen_lshrv2di3 (tmp3, tmp2, GEN_INT (bits - 64)));
6559 }
6560 else
6561 {
6562 /* tmp4 is operands[1], in V2DImode. */
51e9e8a2 6563 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
6b8b2557
RS
6564
6565 rtx tmp5 = gen_reg_rtx (V2DImode);
6566 if (code == ASHIFT)
6567 emit_insn (gen_ashlv2di3 (tmp5, tmp4, GEN_INT (bits)));
6568 else
6569 emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
6570
6571 rtx tmp6 = gen_reg_rtx (V2DImode);
6572 if (code == ASHIFT)
6573 emit_insn (gen_lshrv2di3 (tmp6, tmp2, GEN_INT (64 - bits)));
6574 else
6575 emit_insn (gen_ashlv2di3 (tmp6, tmp2, GEN_INT (64 - bits)));
6576
6577 emit_insn (gen_iorv2di3 (tmp3, tmp5, tmp6));
6578 }
6579
6580 /* Convert the result back to V1TImode and store in operands[0]. */
51e9e8a2 6581 rtx tmp7 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
6b8b2557
RS
6582 emit_move_insn (operands[0], tmp7);
6583}
6584
6585/* Expand V1TI mode rotate (of rtx_code CODE) by constant. */
1188cf5f
RS
6586void
6587ix86_expand_v1ti_rotate (enum rtx_code code, rtx operands[])
6b8b2557 6588{
6b8b2557
RS
6589 rtx op1 = force_reg (V1TImode, operands[1]);
6590
1188cf5f
RS
6591 if (!CONST_INT_P (operands[2]))
6592 {
6593 rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
6594 rtx tmp2 = gen_reg_rtx (TImode);
6595 rtx (*rotate) (rtx, rtx, rtx)
6596 = (code == ROTATE) ? gen_rotlti3 : gen_rotrti3;
6597 emit_insn (rotate (tmp2, tmp1, operands[2]));
6598 rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
6599 emit_move_insn (operands[0], tmp3);
6600 return;
6601 }
6602
6603 HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
6604
6b8b2557
RS
6605 if (bits == 0)
6606 {
6607 emit_move_insn (operands[0], op1);
6608 return;
6609 }
6610
6611 if (code == ROTATERT)
6612 bits = 128 - bits;
6613
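  /* Rotates by a multiple of 32 bits reduce to a single pshufd: immediate
     0x93 rotates the four dwords so the 128-bit value rotates left by 32,
     0x4e swaps the 64-bit halves (rotate by 64), and 0x39 gives the rotate
     by 96.  */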
6614 if ((bits & 31) == 0)
6615 {
6b8b2557 6616 rtx tmp2 = gen_reg_rtx (V4SImode);
51e9e8a2 6617 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6b8b2557
RS
6618 if (bits == 32)
6619 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x93)));
6620 else if (bits == 64)
6621 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x4e)));
6622 else
6623 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x39)));
51e9e8a2 6624 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp2));
6b8b2557
RS
6625 return;
6626 }
6627
6628 if ((bits & 7) == 0)
6629 {
6630 rtx tmp1 = gen_reg_rtx (V1TImode);
6631 rtx tmp2 = gen_reg_rtx (V1TImode);
6632 rtx tmp3 = gen_reg_rtx (V1TImode);
6633
6634 emit_insn (gen_sse2_ashlv1ti3 (tmp1, op1, GEN_INT (bits)));
6635 emit_insn (gen_sse2_lshrv1ti3 (tmp2, op1, GEN_INT (128 - bits)));
6636 emit_insn (gen_iorv1ti3 (tmp3, tmp1, tmp2));
6637 emit_move_insn (operands[0], tmp3);
6638 return;
6639 }
6640
51e9e8a2 6641 rtx op1_v4si = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6b8b2557
RS
6642
6643 rtx lobits;
6644 rtx hibits;
6645
6646 switch (bits >> 5)
6647 {
6648 case 0:
6649 lobits = op1_v4si;
6650 hibits = gen_reg_rtx (V4SImode);
6651 emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x93)));
6652 break;
6653
6654 case 1:
6655 lobits = gen_reg_rtx (V4SImode);
6656 hibits = gen_reg_rtx (V4SImode);
6657 emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x93)));
6658 emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x4e)));
6659 break;
6660
6661 case 2:
6662 lobits = gen_reg_rtx (V4SImode);
6663 hibits = gen_reg_rtx (V4SImode);
6664 emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x4e)));
6665 emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x39)));
6666 break;
6667
6668 default:
6669 lobits = gen_reg_rtx (V4SImode);
6670 emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x39)));
6671 hibits = op1_v4si;
6672 break;
6673 }
6674
6675 rtx tmp1 = gen_reg_rtx (V4SImode);
6676 rtx tmp2 = gen_reg_rtx (V4SImode);
6677 rtx tmp3 = gen_reg_rtx (V4SImode);
6b8b2557
RS
6678
6679 emit_insn (gen_ashlv4si3 (tmp1, lobits, GEN_INT (bits & 31)));
6680 emit_insn (gen_lshrv4si3 (tmp2, hibits, GEN_INT (32 - (bits & 31))));
6681 emit_insn (gen_iorv4si3 (tmp3, tmp1, tmp2));
51e9e8a2
RS
6682
6683 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp3));
6b8b2557
RS
6684}
6685
1188cf5f
RS
6686/* Expand V1TI mode ashiftrt by constant. */
6687void
6688ix86_expand_v1ti_ashiftrt (rtx operands[])
6689{
6690 rtx op1 = force_reg (V1TImode, operands[1]);
6691
6692 if (!CONST_INT_P (operands[2]))
6693 {
6694 rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
6695 rtx tmp2 = gen_reg_rtx (TImode);
6696 emit_insn (gen_ashrti3 (tmp2, tmp1, operands[2]));
6697 rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
6698 emit_move_insn (operands[0], tmp3);
6699 return;
6700 }
6701
6702 HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
6703
6704 if (bits == 0)
6705 {
6706 emit_move_insn (operands[0], op1);
6707 return;
6708 }
6709
6710 if (bits == 127)
6711 {
6712 /* Two operations. */
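      /* Only the sign bit survives an arithmetic shift by 127: pshufd 0xff
	 broadcasts the top dword to all four lanes and psrad by 31 then
	 turns every lane into 0 or -1.  */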
51e9e8a2 6713 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
1188cf5f 6714 rtx tmp2 = gen_reg_rtx (V4SImode);
1188cf5f
RS
6715 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6716
6717 rtx tmp3 = gen_reg_rtx (V4SImode);
6718 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6719
51e9e8a2 6720 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp3));
1188cf5f
RS
6721 return;
6722 }
6723
6724 if (bits == 64)
6725 {
6726 /* Three operations. */
51e9e8a2 6727 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
1188cf5f 6728 rtx tmp2 = gen_reg_rtx (V4SImode);
1188cf5f
RS
6729 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6730
6731 rtx tmp3 = gen_reg_rtx (V4SImode);
6732 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6733
51e9e8a2
RS
6734 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
6735 rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
1188cf5f 6736 rtx tmp6 = gen_reg_rtx (V2DImode);
1188cf5f
RS
6737 emit_insn (gen_vec_interleave_highv2di (tmp6, tmp4, tmp5));
6738
51e9e8a2 6739 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
1188cf5f
RS
6740 return;
6741 }
6742
6743 if (bits == 96)
6744 {
6745 /* Three operations. */
51e9e8a2 6746 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
1188cf5f 6747 rtx tmp2 = gen_reg_rtx (V4SImode);
1188cf5f
RS
6748 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (31)));
6749
51e9e8a2
RS
6750 rtx tmp3 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
6751 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp2));
1188cf5f 6752 rtx tmp5 = gen_reg_rtx (V2DImode);
1188cf5f
RS
6753 emit_insn (gen_vec_interleave_highv2di (tmp5, tmp3, tmp4));
6754
51e9e8a2 6755 rtx tmp6 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp5));
1188cf5f 6756 rtx tmp7 = gen_reg_rtx (V4SImode);
1188cf5f
RS
6757 emit_insn (gen_sse2_pshufd (tmp7, tmp6, GEN_INT (0xfd)));
6758
51e9e8a2
RS
6759 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp7));
6760 return;
6761 }
6762
6763 if (bits >= 111)
6764 {
6765 /* Three operations. */
6766 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6767 rtx tmp2 = gen_reg_rtx (V4SImode);
6768 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits - 96)));
6769
6770 rtx tmp3 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
6771 rtx tmp4 = gen_reg_rtx (V8HImode);
6772 emit_insn (gen_sse2_pshufhw (tmp4, tmp3, GEN_INT (0xfe)));
6773
6774 rtx tmp5 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp4));
6775 rtx tmp6 = gen_reg_rtx (V4SImode);
6776 emit_insn (gen_sse2_pshufd (tmp6, tmp5, GEN_INT (0xfe)));
6777
6778 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
1188cf5f
RS
6779 return;
6780 }
6781
6782 if (TARGET_AVX2 || TARGET_SSE4_1)
6783 {
6784 /* Three operations. */
6785 if (bits == 32)
6786 {
51e9e8a2 6787 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
1188cf5f 6788 rtx tmp2 = gen_reg_rtx (V4SImode);
1188cf5f
RS
6789 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (31)));
6790
6791 rtx tmp3 = gen_reg_rtx (V1TImode);
6792 emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (32)));
6793
6794 if (TARGET_AVX2)
6795 {
51e9e8a2 6796 rtx tmp4 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp3));
1188cf5f 6797 rtx tmp5 = gen_reg_rtx (V4SImode);
1188cf5f
RS
6798 emit_insn (gen_avx2_pblenddv4si (tmp5, tmp2, tmp4,
6799 GEN_INT (7)));
6800
51e9e8a2 6801 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp5));
1188cf5f
RS
6802 }
6803 else
6804 {
51e9e8a2
RS
6805 rtx tmp4 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
6806 rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
1188cf5f 6807 rtx tmp6 = gen_reg_rtx (V8HImode);
1188cf5f
RS
6808 emit_insn (gen_sse4_1_pblendw (tmp6, tmp4, tmp5,
6809 GEN_INT (0x3f)));
6810
51e9e8a2 6811 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
1188cf5f
RS
6812 }
6813 return;
6814 }
6815
6816 /* Three operations. */
6817 if (bits == 8 || bits == 16 || bits == 24)
6818 {
51e9e8a2 6819 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
1188cf5f 6820 rtx tmp2 = gen_reg_rtx (V4SImode);
1188cf5f
RS
6821 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
6822
6823 rtx tmp3 = gen_reg_rtx (V1TImode);
6824 emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (bits)));
6825
6826 if (TARGET_AVX2)
6827 {
51e9e8a2 6828 rtx tmp4 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp3));
1188cf5f 6829 rtx tmp5 = gen_reg_rtx (V4SImode);
1188cf5f
RS
6830 emit_insn (gen_avx2_pblenddv4si (tmp5, tmp2, tmp4,
6831 GEN_INT (7)));
6832
51e9e8a2 6833 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp5));
1188cf5f
RS
6834 }
6835 else
6836 {
51e9e8a2
RS
6837 rtx tmp4 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
6838 rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
1188cf5f 6839 rtx tmp6 = gen_reg_rtx (V8HImode);
1188cf5f
RS
6840 emit_insn (gen_sse4_1_pblendw (tmp6, tmp4, tmp5,
6841 GEN_INT (0x3f)));
6842
51e9e8a2 6843 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
1188cf5f
RS
6844 }
6845 return;
6846 }
6847 }
6848
6849 if (bits > 96)
6850 {
6851 /* Four operations. */
51e9e8a2 6852 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
1188cf5f 6853 rtx tmp2 = gen_reg_rtx (V4SImode);
1188cf5f
RS
6854 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits - 96)));
6855
6856 rtx tmp3 = gen_reg_rtx (V4SImode);
6857 emit_insn (gen_ashrv4si3 (tmp3, tmp1, GEN_INT (31)));
6858
51e9e8a2
RS
6859 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp2));
6860 rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
1188cf5f 6861 rtx tmp6 = gen_reg_rtx (V2DImode);
1188cf5f
RS
6862 emit_insn (gen_vec_interleave_highv2di (tmp6, tmp4, tmp5));
6863
51e9e8a2 6864 rtx tmp7 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp6));
1188cf5f 6865 rtx tmp8 = gen_reg_rtx (V4SImode);
1188cf5f
RS
6866 emit_insn (gen_sse2_pshufd (tmp8, tmp7, GEN_INT (0xfd)));
6867
51e9e8a2 6868 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp8));
1188cf5f
RS
6869 return;
6870 }
6871
6872 if (TARGET_SSE4_1 && (bits == 48 || bits == 80))
6873 {
6874 /* Four operations. */
51e9e8a2 6875 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
1188cf5f 6876 rtx tmp2 = gen_reg_rtx (V4SImode);
1188cf5f
RS
6877 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6878
6879 rtx tmp3 = gen_reg_rtx (V4SImode);
6880 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6881
6882 rtx tmp4 = gen_reg_rtx (V1TImode);
6883 emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (bits)));
6884
51e9e8a2
RS
6885 rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
6886 rtx tmp6 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp4));
1188cf5f 6887 rtx tmp7 = gen_reg_rtx (V8HImode);
1188cf5f
RS
6888 emit_insn (gen_sse4_1_pblendw (tmp7, tmp5, tmp6,
6889 GEN_INT (bits == 48 ? 0x1f : 0x07)));
6890
51e9e8a2 6891 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp7));
1188cf5f
RS
6892 return;
6893 }
6894
6895 if ((bits & 7) == 0)
6896 {
6897 /* Five operations. */
51e9e8a2 6898 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
1188cf5f 6899 rtx tmp2 = gen_reg_rtx (V4SImode);
1188cf5f
RS
6900 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6901
6902 rtx tmp3 = gen_reg_rtx (V4SImode);
6903 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6904
6905 rtx tmp4 = gen_reg_rtx (V1TImode);
6906 emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (bits)));
6907
51e9e8a2 6908 rtx tmp5 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
1188cf5f 6909 rtx tmp6 = gen_reg_rtx (V1TImode);
1188cf5f
RS
6910 emit_insn (gen_sse2_ashlv1ti3 (tmp6, tmp5, GEN_INT (128 - bits)));
6911
51e9e8a2
RS
6912 rtx tmp7 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
6913 rtx tmp8 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp6));
1188cf5f 6914 rtx tmp9 = gen_reg_rtx (V2DImode);
1188cf5f
RS
6915 emit_insn (gen_iorv2di3 (tmp9, tmp7, tmp8));
6916
51e9e8a2 6917 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp9));
1188cf5f
RS
6918 return;
6919 }
6920
6921 if (TARGET_AVX2 && bits < 32)
6922 {
6923 /* Six operations. */
51e9e8a2 6924 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
1188cf5f 6925 rtx tmp2 = gen_reg_rtx (V4SImode);
1188cf5f
RS
6926 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
6927
6928 rtx tmp3 = gen_reg_rtx (V1TImode);
6929 emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (64)));
6930
51e9e8a2 6931 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
1188cf5f 6932 rtx tmp5 = gen_reg_rtx (V2DImode);
1188cf5f
RS
6933 emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
6934
51e9e8a2 6935 rtx tmp6 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
1188cf5f 6936 rtx tmp7 = gen_reg_rtx (V2DImode);
1188cf5f
RS
6937 emit_insn (gen_ashlv2di3 (tmp7, tmp6, GEN_INT (64 - bits)));
6938
6939 rtx tmp8 = gen_reg_rtx (V2DImode);
6940 emit_insn (gen_iorv2di3 (tmp8, tmp5, tmp7));
6941
51e9e8a2 6942 rtx tmp9 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp8));
1188cf5f 6943 rtx tmp10 = gen_reg_rtx (V4SImode);
1188cf5f
RS
6944 emit_insn (gen_avx2_pblenddv4si (tmp10, tmp2, tmp9, GEN_INT (7)));
6945
51e9e8a2 6946 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp10));
1188cf5f
RS
6947 return;
6948 }
6949
6950 if (TARGET_SSE4_1 && bits < 15)
6951 {
6952 /* Six operations. */
51e9e8a2 6953 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
1188cf5f 6954 rtx tmp2 = gen_reg_rtx (V4SImode);
1188cf5f
RS
6955 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
6956
6957 rtx tmp3 = gen_reg_rtx (V1TImode);
6958 emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (64)));
6959
51e9e8a2 6960 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
1188cf5f 6961 rtx tmp5 = gen_reg_rtx (V2DImode);
1188cf5f
RS
6962 emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
6963
51e9e8a2 6964 rtx tmp6 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
1188cf5f 6965 rtx tmp7 = gen_reg_rtx (V2DImode);
1188cf5f
RS
6966 emit_insn (gen_ashlv2di3 (tmp7, tmp6, GEN_INT (64 - bits)));
6967
6968 rtx tmp8 = gen_reg_rtx (V2DImode);
6969 emit_insn (gen_iorv2di3 (tmp8, tmp5, tmp7));
6970
51e9e8a2
RS
6971 rtx tmp9 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
6972 rtx tmp10 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp8));
1188cf5f 6973 rtx tmp11 = gen_reg_rtx (V8HImode);
1188cf5f
RS
6974 emit_insn (gen_sse4_1_pblendw (tmp11, tmp9, tmp10, GEN_INT (0x3f)));
6975
51e9e8a2 6976 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp11));
1188cf5f
RS
6977 return;
6978 }
6979
6980 if (bits == 1)
6981 {
6982 /* Eight operations. */
6983 rtx tmp1 = gen_reg_rtx (V1TImode);
6984 emit_insn (gen_sse2_lshrv1ti3 (tmp1, op1, GEN_INT (64)));
6985
51e9e8a2 6986 rtx tmp2 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
1188cf5f 6987 rtx tmp3 = gen_reg_rtx (V2DImode);
1188cf5f
RS
6988 emit_insn (gen_lshrv2di3 (tmp3, tmp2, GEN_INT (1)));
6989
51e9e8a2 6990 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
1188cf5f 6991 rtx tmp5 = gen_reg_rtx (V2DImode);
1188cf5f
RS
6992 emit_insn (gen_ashlv2di3 (tmp5, tmp4, GEN_INT (63)));
6993
6994 rtx tmp6 = gen_reg_rtx (V2DImode);
6995 emit_insn (gen_iorv2di3 (tmp6, tmp3, tmp5));
6996
6997 rtx tmp7 = gen_reg_rtx (V2DImode);
6998 emit_insn (gen_lshrv2di3 (tmp7, tmp2, GEN_INT (63)));
6999
51e9e8a2 7000 rtx tmp8 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp7));
1188cf5f 7001 rtx tmp9 = gen_reg_rtx (V4SImode);
1188cf5f
RS
7002 emit_insn (gen_sse2_pshufd (tmp9, tmp8, GEN_INT (0xbf)));
7003
51e9e8a2 7004 rtx tmp10 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp9));
1188cf5f 7005 rtx tmp11 = gen_reg_rtx (V2DImode);
1188cf5f
RS
7006 emit_insn (gen_ashlv2di3 (tmp11, tmp10, GEN_INT (31)));
7007
7008 rtx tmp12 = gen_reg_rtx (V2DImode);
7009 emit_insn (gen_iorv2di3 (tmp12, tmp6, tmp11));
7010
51e9e8a2 7011 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp12));
1188cf5f
RS
7012 return;
7013 }
7014
7015 if (bits > 64)
7016 {
7017 /* Eight operations. */
51e9e8a2 7018 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
1188cf5f 7019 rtx tmp2 = gen_reg_rtx (V4SImode);
1188cf5f
RS
7020 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
7021
7022 rtx tmp3 = gen_reg_rtx (V4SImode);
7023 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
7024
7025 rtx tmp4 = gen_reg_rtx (V1TImode);
7026 emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (64)));
7027
51e9e8a2 7028 rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
1188cf5f 7029 rtx tmp6 = gen_reg_rtx (V2DImode);
1188cf5f
RS
7030 emit_insn (gen_lshrv2di3 (tmp6, tmp5, GEN_INT (bits - 64)));
7031
51e9e8a2 7032 rtx tmp7 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
1188cf5f 7033 rtx tmp8 = gen_reg_rtx (V1TImode);
1188cf5f
RS
7034 emit_insn (gen_sse2_ashlv1ti3 (tmp8, tmp7, GEN_INT (64)));
7035
51e9e8a2 7036 rtx tmp9 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
1188cf5f 7037 rtx tmp10 = gen_reg_rtx (V2DImode);
1188cf5f
RS
7038 emit_insn (gen_ashlv2di3 (tmp10, tmp9, GEN_INT (128 - bits)));
7039
51e9e8a2 7040 rtx tmp11 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp8));
1188cf5f 7041 rtx tmp12 = gen_reg_rtx (V2DImode);
1188cf5f
RS
7042 emit_insn (gen_iorv2di3 (tmp12, tmp10, tmp11));
7043
7044 rtx tmp13 = gen_reg_rtx (V2DImode);
7045 emit_insn (gen_iorv2di3 (tmp13, tmp6, tmp12));
7046
51e9e8a2 7047 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp13));
1188cf5f
RS
7048 }
7049 else
7050 {
7051 /* Nine operations. */
51e9e8a2 7052 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
1188cf5f 7053 rtx tmp2 = gen_reg_rtx (V4SImode);
1188cf5f
RS
7054 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
7055
7056 rtx tmp3 = gen_reg_rtx (V4SImode);
7057 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
7058
7059 rtx tmp4 = gen_reg_rtx (V1TImode);
7060 emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (64)));
7061
51e9e8a2 7062 rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
1188cf5f 7063 rtx tmp6 = gen_reg_rtx (V2DImode);
1188cf5f
RS
7064 emit_insn (gen_lshrv2di3 (tmp6, tmp5, GEN_INT (bits)));
7065
51e9e8a2 7066 rtx tmp7 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
1188cf5f 7067 rtx tmp8 = gen_reg_rtx (V2DImode);
1188cf5f
RS
7068 emit_insn (gen_ashlv2di3 (tmp8, tmp7, GEN_INT (64 - bits)));
7069
7070 rtx tmp9 = gen_reg_rtx (V2DImode);
7071 emit_insn (gen_iorv2di3 (tmp9, tmp6, tmp8));
7072
51e9e8a2 7073 rtx tmp10 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
1188cf5f 7074 rtx tmp11 = gen_reg_rtx (V1TImode);
1188cf5f
RS
7075 emit_insn (gen_sse2_ashlv1ti3 (tmp11, tmp10, GEN_INT (64)));
7076
51e9e8a2 7077 rtx tmp12 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp11));
1188cf5f 7078 rtx tmp13 = gen_reg_rtx (V2DImode);
1188cf5f
RS
7079 emit_insn (gen_ashlv2di3 (tmp13, tmp12, GEN_INT (64 - bits)));
7080
7081 rtx tmp14 = gen_reg_rtx (V2DImode);
7082 emit_insn (gen_iorv2di3 (tmp14, tmp9, tmp13));
7083
51e9e8a2 7084 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp14));
1188cf5f
RS
7085 }
7086}
7087
2bf6d935
ML
7088/* Return mode for the memcpy/memset loop counter. Prefer SImode over
7089 DImode for constant loop counts. */
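/* A 32-bit counter keeps the loop arithmetic free of REX.W prefixes on
   64-bit targets, which is presumably why SImode is preferred whenever the
   constant count fits in 32 bits.  */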
7090
7091static machine_mode
7092counter_mode (rtx count_exp)
7093{
7094 if (GET_MODE (count_exp) != VOIDmode)
7095 return GET_MODE (count_exp);
7096 if (!CONST_INT_P (count_exp))
7097 return Pmode;
7098 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
7099 return DImode;
7100 return SImode;
7101}
7102
 7103/* When ISSETMEM is FALSE, output a simple loop to move memory pointed to by
 7104 SRCPTR to DESTPTR via chunks of MODE, unrolled UNROLL times; the overall
 7105 size is COUNT, specified in bytes.  When ISSETMEM is TRUE, output the
 7106 equivalent loop to set memory to VALUE (assumed to be in MODE).
 7107
 7108 The size is rounded down to a whole number of chunks moved at once.
 7109 SRCMEM and DESTMEM provide MEM rtxes to feed proper aliasing info. */
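/* Roughly, the copy flavour of the emitted loop has this shape (a sketch,
   not the exact RTL):

	size = COUNT & -(MODE_SIZE * UNROLL);
	iter = 0;
     top:
	copy UNROLL chunks of MODE from SRCMEM + iter to DESTMEM + iter;
	iter += MODE_SIZE * UNROLL;
	if (iter < size) goto top;
	DESTPTR += iter;  SRCPTR += iter;  */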
7110
7111
7112static void
76715c32 7113expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
2bf6d935
ML
7114 rtx destptr, rtx srcptr, rtx value,
7115 rtx count, machine_mode mode, int unroll,
7116 int expected_size, bool issetmem)
7117{
7118 rtx_code_label *out_label, *top_label;
7119 rtx iter, tmp;
7120 machine_mode iter_mode = counter_mode (count);
7121 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
7122 rtx piece_size = GEN_INT (piece_size_n);
7123 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
7124 rtx size;
7125 int i;
7126
7127 top_label = gen_label_rtx ();
7128 out_label = gen_label_rtx ();
7129 iter = gen_reg_rtx (iter_mode);
7130
7131 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
7132 NULL, 1, OPTAB_DIRECT);
7133 /* Those two should combine. */
7134 if (piece_size == const1_rtx)
7135 {
7136 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
7137 true, out_label);
7138 predict_jump (REG_BR_PROB_BASE * 10 / 100);
7139 }
7140 emit_move_insn (iter, const0_rtx);
7141
7142 emit_label (top_label);
7143
7144 tmp = convert_modes (Pmode, iter_mode, iter, true);
7145
 7146 /* This assert could be relaxed - in that case we'd need to compute
 7147 the smallest power of two that contains PIECE_SIZE_N and pass it to
 7148 offset_address. */
7149 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
7150 destmem = offset_address (destmem, tmp, piece_size_n);
7151 destmem = adjust_address (destmem, mode, 0);
7152
7153 if (!issetmem)
7154 {
7155 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
7156 srcmem = adjust_address (srcmem, mode, 0);
7157
 7158 /* When unrolling for chips that reorder memory reads and writes,
 7159 we can save registers by using a single temporary.
 7160 Also, using 4 temporaries is overkill in 32-bit mode. */
7161 if (!TARGET_64BIT && 0)
7162 {
7163 for (i = 0; i < unroll; i++)
7164 {
7165 if (i)
7166 {
7167 destmem = adjust_address (copy_rtx (destmem), mode,
7168 GET_MODE_SIZE (mode));
7169 srcmem = adjust_address (copy_rtx (srcmem), mode,
7170 GET_MODE_SIZE (mode));
7171 }
7172 emit_move_insn (destmem, srcmem);
7173 }
7174 }
7175 else
7176 {
7177 rtx tmpreg[4];
7178 gcc_assert (unroll <= 4);
7179 for (i = 0; i < unroll; i++)
7180 {
7181 tmpreg[i] = gen_reg_rtx (mode);
7182 if (i)
7183 srcmem = adjust_address (copy_rtx (srcmem), mode,
7184 GET_MODE_SIZE (mode));
7185 emit_move_insn (tmpreg[i], srcmem);
7186 }
7187 for (i = 0; i < unroll; i++)
7188 {
7189 if (i)
7190 destmem = adjust_address (copy_rtx (destmem), mode,
7191 GET_MODE_SIZE (mode));
7192 emit_move_insn (destmem, tmpreg[i]);
7193 }
7194 }
7195 }
7196 else
7197 for (i = 0; i < unroll; i++)
7198 {
7199 if (i)
7200 destmem = adjust_address (copy_rtx (destmem), mode,
7201 GET_MODE_SIZE (mode));
7202 emit_move_insn (destmem, value);
7203 }
7204
7205 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
7206 true, OPTAB_LIB_WIDEN);
7207 if (tmp != iter)
7208 emit_move_insn (iter, tmp);
7209
7210 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
7211 true, top_label);
7212 if (expected_size != -1)
7213 {
7214 expected_size /= GET_MODE_SIZE (mode) * unroll;
7215 if (expected_size == 0)
7216 predict_jump (0);
7217 else if (expected_size > REG_BR_PROB_BASE)
7218 predict_jump (REG_BR_PROB_BASE - 1);
7219 else
7220 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2)
7221 / expected_size);
7222 }
7223 else
7224 predict_jump (REG_BR_PROB_BASE * 80 / 100);
7225 iter = ix86_zero_extend_to_Pmode (iter);
7226 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
7227 true, OPTAB_LIB_WIDEN);
7228 if (tmp != destptr)
7229 emit_move_insn (destptr, tmp);
7230 if (!issetmem)
7231 {
7232 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
7233 true, OPTAB_LIB_WIDEN);
7234 if (tmp != srcptr)
7235 emit_move_insn (srcptr, tmp);
7236 }
7237 emit_label (out_label);
7238}
7239
7240/* Divide COUNTREG by SCALE. */
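/* SCALE is expected to be a power of two here: constant counts are divided
   directly, while a non-constant COUNTREG is divided by shifting it right
   by exact_log2 (SCALE).  */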
7241static rtx
7242scale_counter (rtx countreg, int scale)
7243{
7244 rtx sc;
7245
7246 if (scale == 1)
7247 return countreg;
7248 if (CONST_INT_P (countreg))
7249 return GEN_INT (INTVAL (countreg) / scale);
7250 gcc_assert (REG_P (countreg));
7251
7252 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
7253 GEN_INT (exact_log2 (scale)),
7254 NULL, 1, OPTAB_DIRECT);
7255 return sc;
7256}
7257
7258/* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
7259 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
7260 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
7261 For setmem case, VALUE is a promoted to a wider size ORIG_VALUE.
7262 ORIG_VALUE is the original value passed to memset to fill the memory with.
7263 Other arguments have same meaning as for previous function. */
7264
7265static void
76715c32 7266expand_set_or_cpymem_via_rep (rtx destmem, rtx srcmem,
2bf6d935
ML
7267 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
7268 rtx count,
7269 machine_mode mode, bool issetmem)
7270{
7271 rtx destexp;
7272 rtx srcexp;
7273 rtx countreg;
7274 HOST_WIDE_INT rounded_count;
7275
7276 /* If possible, it is shorter to use rep movs.
7277 TODO: Maybe it is better to move this logic to decide_alg. */
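  /* A known byte count that is a multiple of 4 can be issued as
     rep movsd/stosd instead of the byte variant, moving four bytes
     per iteration.  */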
7278 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
bf24f4ec 7279 && !TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
2bf6d935
ML
7280 && (!issetmem || orig_value == const0_rtx))
7281 mode = SImode;
7282
7283 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
7284 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
7285
7286 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
7287 GET_MODE_SIZE (mode)));
7288 if (mode != QImode)
7289 {
7290 destexp = gen_rtx_ASHIFT (Pmode, countreg,
7291 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
7292 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
7293 }
7294 else
7295 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
7296 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
7297 {
7298 rounded_count
7299 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
7300 destmem = shallow_copy_rtx (destmem);
7301 set_mem_size (destmem, rounded_count);
7302 }
7303 else if (MEM_SIZE_KNOWN_P (destmem))
7304 clear_mem_size (destmem);
7305
7306 if (issetmem)
7307 {
7308 value = force_reg (mode, gen_lowpart (mode, value));
7309 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
7310 }
7311 else
7312 {
7313 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
7314 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
7315 if (mode != QImode)
7316 {
7317 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
7318 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
7319 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
7320 }
7321 else
7322 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
7323 if (CONST_INT_P (count))
7324 {
7325 rounded_count
7326 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
7327 srcmem = shallow_copy_rtx (srcmem);
7328 set_mem_size (srcmem, rounded_count);
7329 }
7330 else
7331 {
7332 if (MEM_SIZE_KNOWN_P (srcmem))
7333 clear_mem_size (srcmem);
7334 }
7335 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
7336 destexp, srcexp));
7337 }
7338}
7339
 7340/* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
 7341 DESTMEM.
 7342 SRCMEM is passed by pointer so that it can be updated on return.
 7343 The return value is the updated DST. */
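/* The widest integer (or equally sized vector) mode with a usable move
   pattern is chosen for the pieces, and SIZE_TO_MOVE is expected to be a
   multiple of the chosen piece size (see the assert below).  */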
7344static rtx
7345emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
7346 HOST_WIDE_INT size_to_move)
7347{
c3185b64 7348 rtx dst = destmem, src = *srcmem, tempreg;
2bf6d935
ML
7349 enum insn_code code;
7350 machine_mode move_mode;
7351 int piece_size, i;
7352
 7353 /* Find the widest mode in which we could perform moves.
 7354 Start with the biggest power of 2 not exceeding SIZE_TO_MOVE and halve
 7355 it until a move of that size is supported. */
7356 piece_size = 1 << floor_log2 (size_to_move);
7357 while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
7358 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
7359 {
7360 gcc_assert (piece_size > 1);
7361 piece_size >>= 1;
7362 }
7363
7364 /* Find the corresponding vector mode with the same size as MOVE_MODE.
7365 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
7366 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
7367 {
7368 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
7369 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
7370 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
7371 {
7372 move_mode = word_mode;
7373 piece_size = GET_MODE_SIZE (move_mode);
7374 code = optab_handler (mov_optab, move_mode);
7375 }
7376 }
7377 gcc_assert (code != CODE_FOR_nothing);
7378
7379 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
7380 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
7381
 7382 /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
7383 gcc_assert (size_to_move % piece_size == 0);
c3185b64 7384
2bf6d935
ML
7385 for (i = 0; i < size_to_move; i += piece_size)
7386 {
7387 /* We move from memory to memory, so we'll need to do it via
7388 a temporary register. */
7389 tempreg = gen_reg_rtx (move_mode);
7390 emit_insn (GEN_FCN (code) (tempreg, src));
7391 emit_insn (GEN_FCN (code) (dst, tempreg));
7392
7393 emit_move_insn (destptr,
c3185b64 7394 plus_constant (Pmode, copy_rtx (destptr), piece_size));
2bf6d935 7395 emit_move_insn (srcptr,
c3185b64 7396 plus_constant (Pmode, copy_rtx (srcptr), piece_size));
2bf6d935
ML
7397
7398 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
7399 piece_size);
7400 src = adjust_automodify_address_nv (src, move_mode, srcptr,
7401 piece_size);
7402 }
7403
7404 /* Update DST and SRC rtx. */
7405 *srcmem = src;
7406 return dst;
7407}
7408
 7409/* Helper function for the string operations below.  Test VARIABLE for
 7410 whether it is aligned to VALUE bytes.  If so, jump to the label. */
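/* The emitted test jumps to the returned label when (VARIABLE & VALUE) is
   zero, so callers place the conditional copy/set of VALUE bytes between
   this call and the label.  */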
7411
7412static rtx_code_label *
7413ix86_expand_aligntest (rtx variable, int value, bool epilogue)
7414{
7415 rtx_code_label *label = gen_label_rtx ();
7416 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
7417 if (GET_MODE (variable) == DImode)
7418 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
7419 else
7420 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
7421 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
7422 1, label);
7423 if (epilogue)
7424 predict_jump (REG_BR_PROB_BASE * 50 / 100);
7425 else
7426 predict_jump (REG_BR_PROB_BASE * 90 / 100);
7427 return label;
7428}
7429
7430
7431/* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
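/* For a constant COUNT this emits one move per set bit of the remainder;
   e.g. a 7-byte tail with MAX_SIZE 8 becomes a 4-byte, a 2-byte and a
   1-byte move.  */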
7432
7433static void
76715c32 7434expand_cpymem_epilogue (rtx destmem, rtx srcmem,
2bf6d935
ML
7435 rtx destptr, rtx srcptr, rtx count, int max_size)
7436{
7437 rtx src, dest;
7438 if (CONST_INT_P (count))
7439 {
7440 HOST_WIDE_INT countval = INTVAL (count);
7441 HOST_WIDE_INT epilogue_size = countval % max_size;
7442 int i;
7443
 7444 /* For now MAX_SIZE should be a power of 2.  This assert could be
 7445 relaxed, but it would require a somewhat more complicated epilogue
 7446 expansion. */
7447 gcc_assert ((max_size & (max_size - 1)) == 0);
7448 for (i = max_size; i >= 1; i >>= 1)
7449 {
7450 if (epilogue_size & i)
7451 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
7452 }
7453 return;
7454 }
7455 if (max_size > 8)
7456 {
7457 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
7458 count, 1, OPTAB_DIRECT);
76715c32 7459 expand_set_or_cpymem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
2bf6d935
ML
7460 count, QImode, 1, 4, false);
7461 return;
7462 }
7463
 7464 /* When single string operations are available, we can cheaply increase
 7465 the dest and src pointers.  Otherwise we save code size by maintaining
 7466 an offset (zero is readily available from the preceding rep operation)
 7467 and using x86 addressing modes. */
7468 if (TARGET_SINGLE_STRINGOP)
7469 {
7470 if (max_size > 4)
7471 {
7472 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
7473 src = change_address (srcmem, SImode, srcptr);
7474 dest = change_address (destmem, SImode, destptr);
7475 emit_insn (gen_strmov (destptr, dest, srcptr, src));
7476 emit_label (label);
7477 LABEL_NUSES (label) = 1;
7478 }
7479 if (max_size > 2)
7480 {
7481 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
7482 src = change_address (srcmem, HImode, srcptr);
7483 dest = change_address (destmem, HImode, destptr);
7484 emit_insn (gen_strmov (destptr, dest, srcptr, src));
7485 emit_label (label);
7486 LABEL_NUSES (label) = 1;
7487 }
7488 if (max_size > 1)
7489 {
7490 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
7491 src = change_address (srcmem, QImode, srcptr);
7492 dest = change_address (destmem, QImode, destptr);
7493 emit_insn (gen_strmov (destptr, dest, srcptr, src));
7494 emit_label (label);
7495 LABEL_NUSES (label) = 1;
7496 }
7497 }
7498 else
7499 {
7500 rtx offset = force_reg (Pmode, const0_rtx);
7501 rtx tmp;
7502
7503 if (max_size > 4)
7504 {
7505 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
7506 src = change_address (srcmem, SImode, srcptr);
7507 dest = change_address (destmem, SImode, destptr);
7508 emit_move_insn (dest, src);
7509 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
7510 true, OPTAB_LIB_WIDEN);
7511 if (tmp != offset)
7512 emit_move_insn (offset, tmp);
7513 emit_label (label);
7514 LABEL_NUSES (label) = 1;
7515 }
7516 if (max_size > 2)
7517 {
7518 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
7519 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
7520 src = change_address (srcmem, HImode, tmp);
7521 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
7522 dest = change_address (destmem, HImode, tmp);
7523 emit_move_insn (dest, src);
7524 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
7525 true, OPTAB_LIB_WIDEN);
7526 if (tmp != offset)
7527 emit_move_insn (offset, tmp);
7528 emit_label (label);
7529 LABEL_NUSES (label) = 1;
7530 }
7531 if (max_size > 1)
7532 {
7533 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
7534 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
7535 src = change_address (srcmem, QImode, tmp);
7536 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
7537 dest = change_address (destmem, QImode, tmp);
7538 emit_move_insn (dest, src);
7539 emit_label (label);
7540 LABEL_NUSES (label) = 1;
7541 }
7542 }
7543}
7544
7545/* This function emits stores to fill SIZE_TO_MOVE bytes starting from DESTMEM
7546 with the value PROMOTED_VAL.
7547 DESTPTR is advanced as the stores are emitted.
7548 Return value is the updated DST. */
7549static rtx
7550emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
7551 HOST_WIDE_INT size_to_move)
7552{
7553 rtx dst = destmem;
7554 enum insn_code code;
7555 machine_mode move_mode;
7556 int piece_size, i;
7557
7558 /* Find the widest mode in which we could perform moves.
7559 Start with the biggest power of 2 less than SIZE_TO_MOVE and halve
7560 it until a move of that size is supported. */
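 /* For example, a sketch of the narrowing above: if PROMOTED_VAL lives in
    DImode but SIZE_TO_MOVE is 4, MOVE_MODE becomes SImode and the low 32
    bits of the promoted value are used via gen_lowpart.  */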
7561 move_mode = GET_MODE (promoted_val);
7562 if (move_mode == VOIDmode)
7563 move_mode = QImode;
7564 if (size_to_move < GET_MODE_SIZE (move_mode))
7565 {
7566 unsigned int move_bits = size_to_move * BITS_PER_UNIT;
7567 move_mode = int_mode_for_size (move_bits, 0).require ();
7568 promoted_val = gen_lowpart (move_mode, promoted_val);
7569 }
7570 piece_size = GET_MODE_SIZE (move_mode);
7571 code = optab_handler (mov_optab, move_mode);
7572 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
7573
7574 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
7575
7576 /* Emit moves. We'll need SIZE_TO_MOVE / PIECE_SIZE moves. */
7577 gcc_assert (size_to_move % piece_size == 0);
7578
7579 for (i = 0; i < size_to_move; i += piece_size)
7580 {
7581 if (piece_size <= GET_MODE_SIZE (word_mode))
7582 {
7583 emit_insn (gen_strset (destptr, dst, promoted_val));
7584 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
7585 piece_size);
7586 continue;
7587 }
7588
7589 emit_insn (GEN_FCN (code) (dst, promoted_val));
7590
7591 emit_move_insn (destptr,
7592 plus_constant (Pmode, copy_rtx (destptr), piece_size));
7593
7594 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
7595 piece_size);
7596 }
7597
7598 /* Update DST rtx. */
7599 return dst;
7600}
7601/* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
7602static void
7603expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
7604 rtx count, int max_size)
7605{
7606 count = expand_simple_binop (counter_mode (count), AND, count,
7607 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
7608 expand_set_or_cpymem_via_loop (destmem, NULL, destptr, NULL,
7609 gen_lowpart (QImode, value), count, QImode,
7610 1, max_size / 2, true);
7611}
7612
7613/* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
7614static void
7615expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
7616 rtx count, int max_size)
7617{
7618 rtx dest;
7619
7620 if (CONST_INT_P (count))
7621 {
7622 HOST_WIDE_INT countval = INTVAL (count);
7623 HOST_WIDE_INT epilogue_size = countval % max_size;
7624 int i;
7625
7626 /* For now MAX_SIZE should be a power of 2. This assert could be
7627 relaxed, but it would require a bit more complicated epilogue
7628 expansion. */
7629 gcc_assert ((max_size & (max_size - 1)) == 0);
7630 for (i = max_size; i >= 1; i >>= 1)
7631 {
7632 if (epilogue_size & i)
7633 {
7634 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
7635 destmem = emit_memset (destmem, destptr, vec_value, i);
7636 else
7637 destmem = emit_memset (destmem, destptr, value, i);
7638 }
7639 }
7640 return;
7641 }
7642 if (max_size > 32)
7643 {
7644 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
7645 return;
7646 }
7647 if (max_size > 16)
7648 {
7649 rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
7650 if (TARGET_64BIT)
7651 {
7652 dest = change_address (destmem, DImode, destptr);
7653 emit_insn (gen_strset (destptr, dest, value));
7654 dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
7655 emit_insn (gen_strset (destptr, dest, value));
7656 }
7657 else
7658 {
7659 dest = change_address (destmem, SImode, destptr);
7660 emit_insn (gen_strset (destptr, dest, value));
7661 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
7662 emit_insn (gen_strset (destptr, dest, value));
7663 dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
7664 emit_insn (gen_strset (destptr, dest, value));
7665 dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
7666 emit_insn (gen_strset (destptr, dest, value));
7667 }
7668 emit_label (label);
7669 LABEL_NUSES (label) = 1;
7670 }
7671 if (max_size > 8)
7672 {
7673 rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
7674 if (TARGET_64BIT)
7675 {
7676 dest = change_address (destmem, DImode, destptr);
7677 emit_insn (gen_strset (destptr, dest, value));
7678 }
7679 else
7680 {
7681 dest = change_address (destmem, SImode, destptr);
7682 emit_insn (gen_strset (destptr, dest, value));
7683 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
7684 emit_insn (gen_strset (destptr, dest, value));
7685 }
7686 emit_label (label);
7687 LABEL_NUSES (label) = 1;
7688 }
7689 if (max_size > 4)
7690 {
7691 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
7692 dest = change_address (destmem, SImode, destptr);
7693 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
7694 emit_label (label);
7695 LABEL_NUSES (label) = 1;
7696 }
7697 if (max_size > 2)
7698 {
7699 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
7700 dest = change_address (destmem, HImode, destptr);
7701 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
7702 emit_label (label);
7703 LABEL_NUSES (label) = 1;
7704 }
7705 if (max_size > 1)
7706 {
7707 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
7708 dest = change_address (destmem, QImode, destptr);
7709 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
7710 emit_label (label);
7711 LABEL_NUSES (label) = 1;
7712 }
7713}
7714
7715/* Decrease COUNTREG by VALUE. */
7716static void
7717ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
7718{
7719 emit_insn (gen_add2_insn (countreg, GEN_INT (-value)));
7720}
7721
7722/* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
7723 DESTMEM to align it to DESIRED_ALIGNMENT. Original alignment is ALIGN.
7724 Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
7725 ignored.
7726 Return value is updated DESTMEM. */
7727
7728static rtx
7729expand_set_or_cpymem_prologue (rtx destmem, rtx srcmem,
7730 rtx destptr, rtx srcptr, rtx value,
7731 rtx vec_value, rtx count, int align,
7732 int desired_alignment, bool issetmem)
7733{
7734 int i;
7735 for (i = 1; i < desired_alignment; i <<= 1)
7736 {
7737 if (align <= i)
7738 {
7739 rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
7740 if (issetmem)
7741 {
7742 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
7743 destmem = emit_memset (destmem, destptr, vec_value, i);
7744 else
7745 destmem = emit_memset (destmem, destptr, value, i);
7746 }
7747 else
7748 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
7749 ix86_adjust_counter (count, i);
7750 emit_label (label);
7751 LABEL_NUSES (label) = 1;
7752 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
7753 }
7754 }
7755 return destmem;
7756}
7757
7758/* Test if COUNT & SIZE is nonzero and if so, expand a cpymem
7759 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
7760 and jump to DONE_LABEL. */
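/* A worked example, assuming a copy with SIZE == 8 (so MODE is DImode):
   the code below runs only when COUNT & 8 is nonzero, and the caller
   arranges that COUNT < 16, so COUNT is in 8..15.  Copying the 8 bytes at
   the start and the 8 bytes ending at DESTPTR + COUNT therefore covers the
   whole block; the two moves may overlap, which is harmless because the
   source and destination blocks themselves do not overlap.  */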
7761static void
7762expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem,
7763 rtx destptr, rtx srcptr,
7764 rtx value, rtx vec_value,
7765 rtx count, int size,
7766 rtx done_label, bool issetmem)
7767{
7768 rtx_code_label *label = ix86_expand_aligntest (count, size, false);
7769 machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
7770 rtx modesize;
7771 int n;
7772
7773 /* If we do not have vector value to copy, we must reduce size. */
7774 if (issetmem)
7775 {
7776 if (!vec_value)
7777 {
7778 if (GET_MODE (value) == VOIDmode && size > 8)
7779 mode = Pmode;
7780 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
7781 mode = GET_MODE (value);
7782 }
7783 else
7784 mode = GET_MODE (vec_value), value = vec_value;
7785 }
7786 else
7787 {
7788 /* Choose appropriate vector mode. */
7789 if (size >= 32)
7790 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
7791 else if (size >= 16)
7792 mode = TARGET_SSE ? V16QImode : DImode;
7793 srcmem = change_address (srcmem, mode, srcptr);
7794 }
7795 destmem = change_address (destmem, mode, destptr);
7796 modesize = GEN_INT (GET_MODE_SIZE (mode));
7797 gcc_assert (GET_MODE_SIZE (mode) <= size);
7798 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
7799 {
7800 if (issetmem)
7801 emit_move_insn (destmem, gen_lowpart (mode, value));
7802 else
7803 {
7804 emit_move_insn (destmem, srcmem);
7805 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
7806 }
7807 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
7808 }
7809
7810 destmem = offset_address (destmem, count, 1);
7811 destmem = offset_address (destmem, GEN_INT (-2 * size),
7812 GET_MODE_SIZE (mode));
7813 if (!issetmem)
7814 {
7815 srcmem = offset_address (srcmem, count, 1);
7816 srcmem = offset_address (srcmem, GEN_INT (-2 * size),
7817 GET_MODE_SIZE (mode));
7818 }
7819 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
7820 {
7821 if (issetmem)
7822 emit_move_insn (destmem, gen_lowpart (mode, value));
7823 else
7824 {
7825 emit_move_insn (destmem, srcmem);
7826 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
7827 }
7828 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
7829 }
7830 emit_jump_insn (gen_jump (done_label));
7831 emit_barrier ();
7832
7833 emit_label (label);
7834 LABEL_NUSES (label) = 1;
7835}
7836
7837/* Handle small memcpy (up to SIZE, which is supposed to be a small power of 2)
7838 and get ready for the main memcpy loop by copying the initial DESIRED_ALIGN-ALIGN
7839 bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT so that we can
7840 proceed with a loop copying SIZE bytes at once. Do moves in MODE.
7841 DONE_LABEL is a label after the whole copying sequence. The label is created
7842 on demand if *DONE_LABEL is NULL.
7843 MIN_SIZE is minimal size of block copied. This value gets adjusted for new
7844 bounds after the initial copies.
7845
7846 DESTMEM/SRCMEM are memory expressions pointing to the copied block,
7847 DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicates whether
7848 we will dispatch to a library call for large blocks.
7849
7850 In pseudocode we do:
7851
7852 if (COUNT < SIZE)
7853 {
7854 Assume that SIZE is 4. Bigger sizes are handled analogously
7855 if (COUNT & 4)
7856 {
7857 copy 4 bytes from SRCPTR to DESTPTR
7858 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
7859 goto done_label
7860 }
7861 if (!COUNT)
7862 goto done_label;
7863 copy 1 byte from SRCPTR to DESTPTR
7864 if (COUNT & 2)
7865 {
7866 copy 2 bytes from SRCPTR to DESTPTR
7867 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
7868 }
7869 }
7870 else
7871 {
7872 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
7873 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
7874
7875 OLD_DESPTR = DESTPTR;
7876 Align DESTPTR up to DESIRED_ALIGN
7877 SRCPTR += DESTPTR - OLD_DESTPTR
7878 COUNT -= DEST_PTR - OLD_DESTPTR
7879 if (DYNAMIC_CHECK)
7880 Round COUNT down to multiple of SIZE
7881 << optional caller supplied zero size guard is here >>
7882 << optional caller supplied dynamic check is here >>
7883 << caller supplied main copy loop is here >>
7884 }
7885 done_label:
7886 */
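/* A concrete, illustrative instance of the pseudocode above: for a copy with
   SIZE == 16, DESIRED_ALIGN == 16, ALIGN == 1 and a runtime COUNT known to be
   at least 16, the code copies the first 16 bytes and the 16 bytes ending at
   DESTPTR + COUNT with possibly misaligned moves, rounds DESTPTR up to a
   16-byte boundary, adjusts SRCPTR and COUNT by the same amount, and leaves
   the rest to the caller-supplied main loop; the tail copy already acts as
   the epilogue.  */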
7887static void
7888expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
7889 rtx *destptr, rtx *srcptr,
7890 machine_mode mode,
7891 rtx value, rtx vec_value,
7892 rtx *count,
7893 rtx_code_label **done_label,
7894 int size,
7895 int desired_align,
7896 int align,
7897 unsigned HOST_WIDE_INT *min_size,
7898 bool dynamic_check,
7899 bool issetmem)
7900{
7901 rtx_code_label *loop_label = NULL, *label;
7902 int n;
7903 rtx modesize;
7904 int prolog_size = 0;
7905 rtx mode_value;
7906
7907 /* Choose the proper value to copy. */
7908 if (issetmem && VECTOR_MODE_P (mode))
7909 mode_value = vec_value;
7910 else
7911 mode_value = value;
7912 gcc_assert (GET_MODE_SIZE (mode) <= size);
7913
7914 /* See if block is big or small, handle small blocks. */
7915 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
7916 {
7917 int size2 = size;
7918 loop_label = gen_label_rtx ();
7919
7920 if (!*done_label)
7921 *done_label = gen_label_rtx ();
7922
7923 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
7924 1, loop_label);
7925 size2 >>= 1;
7926
7927 /* Handle sizes > 3. */
7928 for (;size2 > 2; size2 >>= 1)
7929 expand_small_cpymem_or_setmem (destmem, srcmem,
7930 *destptr, *srcptr,
7931 value, vec_value,
7932 *count,
7933 size2, *done_label, issetmem);
7934 /* Nothing to copy? Jump to DONE_LABEL if so */
7935 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
7936 1, *done_label);
7937
7938 /* Do a byte copy. */
7939 destmem = change_address (destmem, QImode, *destptr);
7940 if (issetmem)
7941 emit_move_insn (destmem, gen_lowpart (QImode, value));
7942 else
7943 {
7944 srcmem = change_address (srcmem, QImode, *srcptr);
7945 emit_move_insn (destmem, srcmem);
7946 }
7947
7948 /* Handle sizes 2 and 3. */
7949 label = ix86_expand_aligntest (*count, 2, false);
7950 destmem = change_address (destmem, HImode, *destptr);
7951 destmem = offset_address (destmem, *count, 1);
7952 destmem = offset_address (destmem, GEN_INT (-2), 2);
7953 if (issetmem)
7954 emit_move_insn (destmem, gen_lowpart (HImode, value));
7955 else
7956 {
7957 srcmem = change_address (srcmem, HImode, *srcptr);
7958 srcmem = offset_address (srcmem, *count, 1);
7959 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
7960 emit_move_insn (destmem, srcmem);
7961 }
7962
7963 emit_label (label);
7964 LABEL_NUSES (label) = 1;
7965 emit_jump_insn (gen_jump (*done_label));
7966 emit_barrier ();
7967 }
7968 else
7969 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
7970 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
7971
7972 /* Start memcpy for COUNT >= SIZE. */
7973 if (loop_label)
7974 {
7975 emit_label (loop_label);
7976 LABEL_NUSES (loop_label) = 1;
7977 }
7978
7979 /* Copy first desired_align bytes. */
7980 if (!issetmem)
7981 srcmem = change_address (srcmem, mode, *srcptr);
7982 destmem = change_address (destmem, mode, *destptr);
7983 modesize = GEN_INT (GET_MODE_SIZE (mode));
7984 for (n = 0; prolog_size < desired_align - align; n++)
7985 {
7986 if (issetmem)
7987 emit_move_insn (destmem, mode_value);
7988 else
7989 {
7990 emit_move_insn (destmem, srcmem);
7991 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
7992 }
7993 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
7994 prolog_size += GET_MODE_SIZE (mode);
7995 }
7996
7997
7998 /* Copy last SIZE bytes. */
7999 destmem = offset_address (destmem, *count, 1);
8000 destmem = offset_address (destmem,
8001 GEN_INT (-size - prolog_size),
8002 1);
8003 if (issetmem)
8004 emit_move_insn (destmem, mode_value);
8005 else
8006 {
8007 srcmem = offset_address (srcmem, *count, 1);
8008 srcmem = offset_address (srcmem,
8009 GEN_INT (-size - prolog_size),
8010 1);
8011 emit_move_insn (destmem, srcmem);
8012 }
8013 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
8014 {
8015 destmem = offset_address (destmem, modesize, 1);
8016 if (issetmem)
8017 emit_move_insn (destmem, mode_value);
8018 else
8019 {
8020 srcmem = offset_address (srcmem, modesize, 1);
8021 emit_move_insn (destmem, srcmem);
8022 }
8023 }
8024
8025 /* Align destination. */
8026 if (desired_align > 1 && desired_align > align)
8027 {
8028 rtx saveddest = *destptr;
8029
8030 gcc_assert (desired_align <= size);
8031 /* Align destptr up, place it to new register. */
8032 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
8033 GEN_INT (prolog_size),
8034 NULL_RTX, 1, OPTAB_DIRECT);
8035 if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
8036 REG_POINTER (*destptr) = 1;
8037 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
8038 GEN_INT (-desired_align),
8039 *destptr, 1, OPTAB_DIRECT);
8040 /* See how many bytes we skipped. */
8041 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
8042 *destptr,
8043 saveddest, 1, OPTAB_DIRECT);
8044 /* Adjust srcptr and count. */
8045 if (!issetmem)
8046 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
8047 saveddest, *srcptr, 1, OPTAB_DIRECT);
8048 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
8049 saveddest, *count, 1, OPTAB_DIRECT);
8050 /* We copied at most size + prolog_size. */
8051 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
8052 *min_size
8053 = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
8054 else
8055 *min_size = 0;
8056
8057 /* Our loops always round down the block size, but for dispatch to
8058 library we need precise value. */
8059 if (dynamic_check)
8060 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
8061 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
8062 }
8063 else
8064 {
8065 gcc_assert (prolog_size == 0);
8066 /* Decrease count, so we won't end up copying last word twice. */
8067 if (!CONST_INT_P (*count))
8068 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
8069 constm1_rtx, *count, 1, OPTAB_DIRECT);
8070 else
8071 *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
8072 (unsigned HOST_WIDE_INT)size));
8073 if (*min_size)
8074 *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
8075 }
8076}
8077
8078
8079/* This function is like the previous one, except here we know how many bytes
8080 need to be copied. That allows us to update alignment not only of DST, which
8081 is returned, but also of SRC, which is passed as a pointer for that
8082 reason. */
8083static rtx
8084expand_set_or_cpymem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
8085 rtx srcreg, rtx value, rtx vec_value,
8086 int desired_align, int align_bytes,
8087 bool issetmem)
8088{
8089 rtx src = NULL;
8090 rtx orig_dst = dst;
8091 rtx orig_src = NULL;
8092 int piece_size = 1;
8093 int copied_bytes = 0;
8094
8095 if (!issetmem)
8096 {
8097 gcc_assert (srcp != NULL);
8098 src = *srcp;
8099 orig_src = src;
8100 }
8101
8102 for (piece_size = 1;
8103 piece_size <= desired_align && copied_bytes < align_bytes;
8104 piece_size <<= 1)
8105 {
8106 if (align_bytes & piece_size)
8107 {
8108 if (issetmem)
8109 {
8110 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
8111 dst = emit_memset (dst, destreg, vec_value, piece_size);
8112 else
8113 dst = emit_memset (dst, destreg, value, piece_size);
8114 }
8115 else
8116 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
8117 copied_bytes += piece_size;
8118 }
8119 }
8120 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
8121 set_mem_align (dst, desired_align * BITS_PER_UNIT);
8122 if (MEM_SIZE_KNOWN_P (orig_dst))
8123 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
8124
8125 if (!issetmem)
8126 {
8127 int src_align_bytes = get_mem_align_offset (src, desired_align
8128 * BITS_PER_UNIT);
8129 if (src_align_bytes >= 0)
8130 src_align_bytes = desired_align - src_align_bytes;
8131 if (src_align_bytes >= 0)
8132 {
8133 unsigned int src_align;
8134 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
8135 {
8136 if ((src_align_bytes & (src_align - 1))
8137 == (align_bytes & (src_align - 1)))
8138 break;
8139 }
8140 if (src_align > (unsigned int) desired_align)
8141 src_align = desired_align;
8142 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
8143 set_mem_align (src, src_align * BITS_PER_UNIT);
8144 }
8145 if (MEM_SIZE_KNOWN_P (orig_src))
8146 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
8147 *srcp = src;
8148 }
8149
8150 return dst;
8151}
8152
8153/* Return true if ALG can be used in current context.
8154 Assume we expand memset if MEMSET is true. */
8155static bool
8156alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
8157{
8158 if (alg == no_stringop)
8159 return false;
8160 if (alg == vector_loop)
8161 return TARGET_SSE || TARGET_AVX;
8162 /* Algorithms using the rep prefix want at least edi and ecx;
8163 additionally, memset wants eax and memcpy wants esi. Don't
8164 consider such algorithms if the user has appropriated those
8165 registers for their own purposes, or if we have a non-default
8166 address space, since some string insns cannot override the segment. */
8167 if (alg == rep_prefix_1_byte
8168 || alg == rep_prefix_4_byte
8169 || alg == rep_prefix_8_byte)
8170 {
8171 if (have_as)
8172 return false;
8173 if (fixed_regs[CX_REG]
8174 || fixed_regs[DI_REG]
8175 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
8176 return false;
8177 }
8178 return true;
8179}
8180
8181/* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
8182static enum stringop_alg
8183decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
8184 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
8185 bool memset, bool zero_memset, bool have_as,
8186 int *dynamic_check, bool *noalign, bool recur)
8187{
8188 const struct stringop_algs *algs;
8189 bool optimize_for_speed;
8190 int max = 0;
8191 const struct processor_costs *cost;
8192 int i;
8193 bool any_alg_usable_p = false;
8194
8195 *noalign = false;
8196 *dynamic_check = -1;
8197
8198 /* Even if the string operation call is cold, we still might spend a lot
8199 of time processing large blocks. */
8200 if (optimize_function_for_size_p (cfun)
8201 || (optimize_insn_for_size_p ()
8202 && (max_size < 256
8203 || (expected_size != -1 && expected_size < 256))))
8204 optimize_for_speed = false;
8205 else
8206 optimize_for_speed = true;
8207
8208 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
8209 if (memset)
8210 algs = &cost->memset[TARGET_64BIT != 0];
8211 else
8212 algs = &cost->memcpy[TARGET_64BIT != 0];
8213
8214 /* Determine the maximal size for which a user-defined algorithm is usable. */
8215 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
8216 {
8217 enum stringop_alg candidate = algs->size[i].alg;
8218 bool usable = alg_usable_p (candidate, memset, have_as);
8219 any_alg_usable_p |= usable;
8220
8221 if (candidate != libcall && candidate && usable)
8222 max = algs->size[i].max;
8223 }
8224
8225 /* If the expected size is not known but the max size is small enough
8226 that the inline version is a win, set the expected size within
8227 the range. */
8228 if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
8229 && expected_size == -1)
8230 expected_size = min_size / 2 + max_size / 2;
8231
8232 /* If user specified the algorithm, honor it if possible. */
8233 if (ix86_stringop_alg != no_stringop
8234 && alg_usable_p (ix86_stringop_alg, memset, have_as))
8235 return ix86_stringop_alg;
8236 /* rep; movq or rep; movl is the smallest variant. */
8237 else if (!optimize_for_speed)
8238 {
8239 *noalign = true;
8240 if (!count || (count & 3) || (memset && !zero_memset))
8241 return alg_usable_p (rep_prefix_1_byte, memset, have_as)
8242 ? rep_prefix_1_byte : loop_1_byte;
8243 else
8244 return alg_usable_p (rep_prefix_4_byte, memset, have_as)
8245 ? rep_prefix_4_byte : loop;
8246 }
8247 /* Very tiny blocks are best handled via the loop; REP is expensive to
8248 set up. */
8249 else if (expected_size != -1 && expected_size < 4)
8250 return loop_1_byte;
8251 else if (expected_size != -1)
8252 {
8253 enum stringop_alg alg = libcall;
8254 bool alg_noalign = false;
8255 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
8256 {
8257 /* We get here if the algorithms that were not libcall-based
8258 were rep-prefix based and we are unable to use rep prefixes
8259 based on global register usage. Break out of the loop and
8260 use the heuristic below. */
8261 if (algs->size[i].max == 0)
8262 break;
8263 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
8264 {
8265 enum stringop_alg candidate = algs->size[i].alg;
8266
8267 if (candidate != libcall
8268 && alg_usable_p (candidate, memset, have_as))
8269 {
8270 alg = candidate;
8271 alg_noalign = algs->size[i].noalign;
8272 }
8273 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
8274 last non-libcall inline algorithm. */
8275 if (TARGET_INLINE_ALL_STRINGOPS)
8276 {
8277 /* When the current size is best to be copied by a libcall,
8278 but we are still forced to inline, run the heuristic below
8279 that will pick code for medium sized blocks. */
8280 if (alg != libcall)
8281 {
8282 *noalign = alg_noalign;
8283 return alg;
8284 }
8285 else if (!any_alg_usable_p)
8286 break;
8287 }
8288 else if (alg_usable_p (candidate, memset, have_as)
8289 && !(TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
8290 && candidate == rep_prefix_1_byte
8291 /* NB: If min_size != max_size, size is
8292 unknown. */
8293 && min_size != max_size))
8294 {
8295 *noalign = algs->size[i].noalign;
8296 return candidate;
8297 }
8298 }
8299 }
8300 }
8301 /* When asked to inline the call anyway, try to pick a meaningful choice.
8302 We look for the maximal size of block that is faster to copy by hand and
8303 take blocks of at most that size, guessing that the average size will
8304 be roughly half of the block.
8305
8306 If this turns out to be bad, we might simply specify the preferred
8307 choice in ix86_costs. */
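 /* For instance, an illustrative reading of the fallback below: if the
    cost tables cap inline expansion at max == 4096 and
    -minline-stringops-dynamically is in effect, the decision is redone as
    if the expected size were 2048, and *DYNAMIC_CHECK is set to 4096 so
    that the caller emits a runtime size check dispatching larger blocks
    to the library call.  */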
8308 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
8309 && (algs->unknown_size == libcall
8310 || !alg_usable_p (algs->unknown_size, memset, have_as)))
8311 {
8312 enum stringop_alg alg;
8313 HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
8314
8315 /* If there aren't any usable algorithms or if recursing already,
8316 then recursing on smaller sizes or same size isn't going to
8317 find anything. Just return the simple byte-at-a-time copy loop. */
8318 if (!any_alg_usable_p || recur)
8319 {
8320 /* Pick something reasonable. */
8321 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
8322 *dynamic_check = 128;
8323 return loop_1_byte;
8324 }
8325 alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
8326 zero_memset, have_as, dynamic_check, noalign, true);
8327 gcc_assert (*dynamic_check == -1);
8328 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
8329 *dynamic_check = max;
8330 else
8331 gcc_assert (alg != libcall);
8332 return alg;
8333 }
8334 return (alg_usable_p (algs->unknown_size, memset, have_as)
8335 ? algs->unknown_size : libcall);
8336}
8337
8338/* Decide on alignment. We know that the operand is already aligned to ALIGN
8339 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
8340static int
8341decide_alignment (int align,
8342 enum stringop_alg alg,
8343 int expected_size,
8344 machine_mode move_mode)
8345{
8346 int desired_align = 0;
8347
8348 gcc_assert (alg != no_stringop);
8349
8350 if (alg == libcall)
8351 return 0;
8352 if (move_mode == VOIDmode)
8353 return 0;
8354
8355 desired_align = GET_MODE_SIZE (move_mode);
8356 /* PentiumPro has special logic triggering for 8-byte-aligned blocks,
8357 copying a whole cache line at once. */
8358 if (TARGET_CPU_P (PENTIUMPRO)
8359 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
8360 desired_align = 8;
8361
8362 if (optimize_size)
8363 desired_align = 1;
8364 if (desired_align < align)
8365 desired_align = align;
8366 if (expected_size != -1 && expected_size < 4)
8367 desired_align = align;
8368
8369 return desired_align;
8370}
8371
8372
8373/* Helper function for memset. For the QImode value 0xXY produce
8374 0xXYXYXYXY of the width specified by MODE. This is essentially
8375 a multiplication by 0x01010101, but we can do slightly better than
8376 synth_mult by unwinding the sequence by hand on CPUs with
8377 slow multiply. */
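/* Worked example of the constant case handled first below: for VAL == 0xAB,
     v  = 0xAB;
     v |= v << 8;    now 0xABAB
     v |= v << 16;   now 0xABABABAB
   and for DImode additionally
     v |= v << 32;   now 0xABABABABABABABAB  */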
8378static rtx
8379promote_duplicated_reg (machine_mode mode, rtx val)
8380{
8381 machine_mode valmode = GET_MODE (val);
8382 rtx tmp;
8383 int nops = mode == DImode ? 3 : 2;
8384
8385 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
8386 if (val == const0_rtx)
8387 return copy_to_mode_reg (mode, CONST0_RTX (mode));
8388 if (CONST_INT_P (val))
8389 {
8390 HOST_WIDE_INT v = INTVAL (val) & 255;
8391
8392 v |= v << 8;
8393 v |= v << 16;
8394 if (mode == DImode)
8395 v |= (v << 16) << 16;
8396 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
8397 }
8398
8399 if (valmode == VOIDmode)
8400 valmode = QImode;
8401 if (valmode != QImode)
8402 val = gen_lowpart (QImode, val);
8403 if (mode == QImode)
8404 return val;
8405 if (!TARGET_PARTIAL_REG_STALL)
8406 nops--;
8407 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
8408 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
8409 <= (ix86_cost->shift_const + ix86_cost->add) * nops
8410 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
8411 {
8412 rtx reg = convert_modes (mode, QImode, val, true);
8413 tmp = promote_duplicated_reg (mode, const1_rtx);
8414 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
8415 OPTAB_DIRECT);
8416 }
8417 else
8418 {
8419 rtx reg = convert_modes (mode, QImode, val, true);
8420
8421 if (!TARGET_PARTIAL_REG_STALL)
8422 emit_insn (gen_insv_1 (mode, reg, reg));
8423 else
8424 {
8425 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
8426 NULL, 1, OPTAB_DIRECT);
8427 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1,
8428 OPTAB_DIRECT);
8429 }
8430 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
8431 NULL, 1, OPTAB_DIRECT);
8432 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
8433 if (mode == SImode)
8434 return reg;
8435 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
8436 NULL, 1, OPTAB_DIRECT);
8437 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
8438 return reg;
8439 }
8440}
8441
8442/* Duplicate value VAL using promote_duplicated_reg into the maximal size that
8443 will be needed by the main loop copying SIZE_NEEDED chunks and by the prologue
8444 raising alignment from ALIGN to DESIRED_ALIGN. */
8445static rtx
8446promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
8447 int align)
8448{
8449 rtx promoted_val;
8450
8451 if (TARGET_64BIT
8452 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
8453 promoted_val = promote_duplicated_reg (DImode, val);
8454 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
8455 promoted_val = promote_duplicated_reg (SImode, val);
8456 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
8457 promoted_val = promote_duplicated_reg (HImode, val);
8458 else
8459 promoted_val = val;
8460
8461 return promoted_val;
8462}
8463
8464/* Copy the address to a Pmode register. This is used for x32 to
8465 truncate DImode TLS address to a SImode register. */
8466
8467static rtx
8468ix86_copy_addr_to_reg (rtx addr)
8469{
8470 rtx reg;
8471 if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
8472 {
8473 reg = copy_addr_to_reg (addr);
8474 REG_POINTER (reg) = 1;
8475 return reg;
8476 }
8477 else
8478 {
8479 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
8480 reg = copy_to_mode_reg (DImode, addr);
8481 REG_POINTER (reg) = 1;
8482 return gen_rtx_SUBREG (SImode, reg, 0);
8483 }
8484}
8485
8486/* Expand string move (memcpy) or store (memset) operation. Use i386 string
8487 operations when profitable. The code depends upon architecture, block size
8488 and alignment, but always has one of the following overall structures:
8489
8490 Aligned move sequence:
8491
8492 1) Prologue guard: Conditional that jumps ahead to the epilogue for small
8493 blocks that can be handled by the epilogue alone. This is faster
8494 but also needed for correctness, since the prologue assumes the block
8495 is larger than the desired alignment.
8496
8497 Optional dynamic check for size and libcall for large
8498 blocks is emitted here too, with -minline-stringops-dynamically.
8499
8500 2) Prologue: copy first few bytes in order to get destination
8501 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
8502 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
8503 copied. We emit either a jump tree on power of two sized
8504 blocks, or a byte loop.
8505
8506 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
8507 with specified algorithm.
8508
8509 4) Epilogue: code copying tail of the block that is too small to be
8510 handled by main body (or up to size guarded by prologue guard).
8511
8512 Misaligned move sequence
8513
8514 1) Misaligned move prologue/epilogue containing:
8515 a) Prologue handling small memory blocks and jumping to done_label
8516 (skipped if blocks are known to be large enough)
8517 b) Single move copying the first DESIRED_ALIGN-ALIGN bytes, if alignment
8518 is needed, by a single possibly misaligned move
8519 (skipped if alignment is not needed)
8520 c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
8521
8522 2) Zero size guard dispatching to done_label, if needed
8523
8524 3) Dispatch to a library call, if needed.
8525
8526 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
8527 with specified algorithm. */
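/* As a rough example (not exhaustive): a memset of a block of unknown size
   expanded with rep_prefix_8_byte on x86-64 typically becomes a prologue
   guard for tiny blocks, a few stores to reach 8-byte alignment of the
   destination, a "rep stosq" main body storing the promoted 64-bit value,
   and an epilogue jump tree storing the remaining 0..7 bytes.  */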
8528bool
8529ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
8530 rtx align_exp, rtx expected_align_exp,
8531 rtx expected_size_exp, rtx min_size_exp,
8532 rtx max_size_exp, rtx probable_max_size_exp,
8533 bool issetmem)
8534{
8535 rtx destreg;
8536 rtx srcreg = NULL;
8537 rtx_code_label *label = NULL;
8538 rtx tmp;
8539 rtx_code_label *jump_around_label = NULL;
8540 HOST_WIDE_INT align = 1;
8541 unsigned HOST_WIDE_INT count = 0;
8542 HOST_WIDE_INT expected_size = -1;
8543 int size_needed = 0, epilogue_size_needed;
8544 int desired_align = 0, align_bytes = 0;
8545 enum stringop_alg alg;
8546 rtx promoted_val = NULL;
8547 rtx vec_promoted_val = NULL;
8548 bool force_loopy_epilogue = false;
8549 int dynamic_check;
8550 bool need_zero_guard = false;
8551 bool noalign;
8552 machine_mode move_mode = VOIDmode;
8553 machine_mode wider_mode;
8554 int unroll_factor = 1;
8555 /* TODO: Once value ranges are available, fill in proper data. */
8556 unsigned HOST_WIDE_INT min_size = 0;
8557 unsigned HOST_WIDE_INT max_size = -1;
8558 unsigned HOST_WIDE_INT probable_max_size = -1;
8559 bool misaligned_prologue_used = false;
8560 bool have_as;
8561
8562 if (CONST_INT_P (align_exp))
8563 align = INTVAL (align_exp);
8564 /* i386 can do misaligned access at a reasonably increased cost. */
8565 if (CONST_INT_P (expected_align_exp)
8566 && INTVAL (expected_align_exp) > align)
8567 align = INTVAL (expected_align_exp);
8568 /* ALIGN is the minimum of destination and source alignment, but we care here
8569 just about destination alignment. */
8570 else if (!issetmem
8571 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
8572 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
8573
8574 if (CONST_INT_P (count_exp))
8575 {
8576 min_size = max_size = probable_max_size = count = expected_size
8577 = INTVAL (count_exp);
8578 /* When COUNT is 0, there is nothing to do. */
8579 if (!count)
8580 return true;
8581 }
8582 else
8583 {
8584 if (min_size_exp)
8585 min_size = INTVAL (min_size_exp);
8586 if (max_size_exp)
8587 max_size = INTVAL (max_size_exp);
8588 if (probable_max_size_exp)
8589 probable_max_size = INTVAL (probable_max_size_exp);
8590 if (CONST_INT_P (expected_size_exp))
8591 expected_size = INTVAL (expected_size_exp);
8592 }
8593
8594 /* Make sure we don't need to care about overflow later on. */
8595 if (count > (HOST_WIDE_INT_1U << 30))
8596 return false;
8597
8598 have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
8599 if (!issetmem)
8600 have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
8601
8602 /* Step 0: Decide on preferred algorithm, desired alignment and
8603 size of chunks to be copied by main loop. */
8604 alg = decide_alg (count, expected_size, min_size, probable_max_size,
8605 issetmem,
8606 issetmem && val_exp == const0_rtx, have_as,
8607 &dynamic_check, &noalign, false);
8608
8609 if (dump_file)
8610 fprintf (dump_file, "Selected stringop expansion strategy: %s\n",
8611 stringop_alg_names[alg]);
8612
8613 if (alg == libcall)
8614 return false;
8615 gcc_assert (alg != no_stringop);
8616
8617 /* For now the vector version of memset is generated only for memory zeroing,
8618 as creating the promoted vector value is very cheap in this case. */
8619 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
8620 alg = unrolled_loop;
8621
8622 if (!count)
8623 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
8624 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
8625 if (!issetmem)
8626 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
8627
8628 unroll_factor = 1;
8629 move_mode = word_mode;
8630 switch (alg)
8631 {
8632 case libcall:
8633 case no_stringop:
8634 case last_alg:
8635 gcc_unreachable ();
8636 case loop_1_byte:
8637 need_zero_guard = true;
8638 move_mode = QImode;
8639 break;
8640 case loop:
8641 need_zero_guard = true;
8642 break;
8643 case unrolled_loop:
8644 need_zero_guard = true;
8645 unroll_factor = (TARGET_64BIT ? 4 : 2);
8646 break;
8647 case vector_loop:
8648 need_zero_guard = true;
8649 unroll_factor = 4;
8650 /* Find the widest supported mode. */
8651 move_mode = word_mode;
8652 while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode)
8653 && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
8654 move_mode = wider_mode;
8655
8656 if (TARGET_AVX256_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 128)
8657 move_mode = TImode;
8658 if (TARGET_AVX512_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 256)
8659 move_mode = OImode;
8660
8661 /* Find the corresponding vector mode with the same size as MOVE_MODE.
8662 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
8663 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
8664 {
8665 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
8666 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
8667 || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
8668 move_mode = word_mode;
8669 }
8670 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
8671 break;
8672 case rep_prefix_8_byte:
8673 move_mode = DImode;
8674 break;
8675 case rep_prefix_4_byte:
8676 move_mode = SImode;
8677 break;
8678 case rep_prefix_1_byte:
8679 move_mode = QImode;
8680 break;
8681 }
8682 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
8683 epilogue_size_needed = size_needed;
8684
8685 /* If we are going to make any library calls conditionally, make sure any
8686 pending stack adjustment happens before the first conditional branch;
8687 otherwise it will be emitted only before the library call and won't
8688 happen on the other branches. */
8689 if (dynamic_check != -1)
8690 do_pending_stack_adjust ();
8691
8692 desired_align = decide_alignment (align, alg, expected_size, move_mode);
8693 if (!TARGET_ALIGN_STRINGOPS || noalign)
8694 align = desired_align;
8695
8696 /* Step 1: Prologue guard. */
8697
8698 /* Alignment code needs count to be in register. */
8699 if (CONST_INT_P (count_exp) && desired_align > align)
8700 {
8701 if (INTVAL (count_exp) > desired_align
8702 && INTVAL (count_exp) > size_needed)
8703 {
8704 align_bytes
8705 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
8706 if (align_bytes <= 0)
8707 align_bytes = 0;
8708 else
8709 align_bytes = desired_align - align_bytes;
8710 }
8711 if (align_bytes == 0)
8712 count_exp = force_reg (counter_mode (count_exp), count_exp);
8713 }
8714 gcc_assert (desired_align >= 1 && align >= 1);
8715
8716 /* Misaligned move sequences handle both prologue and epilogue at once.
8717 Default code generation results in smaller code for large alignments
8718 and also avoids redundant work when sizes are known precisely. */
8719 misaligned_prologue_used
8720 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
8721 && MAX (desired_align, epilogue_size_needed) <= 32
8722 && desired_align <= epilogue_size_needed
8723 && ((desired_align > align && !align_bytes)
8724 || (!count && epilogue_size_needed > 1)));
8725
8726 /* Do the cheap promotion to allow better CSE across the
8727 main loop and epilogue (i.e. one load of the big constant in
8728 front of all the code).
8729 For now the misaligned move sequences do not have a fast path
8730 without broadcasting. */
8731 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
8732 {
8733 if (alg == vector_loop)
8734 {
8735 gcc_assert (val_exp == const0_rtx);
8736 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
8737 promoted_val = promote_duplicated_reg_to_size (val_exp,
8738 GET_MODE_SIZE (word_mode),
8739 desired_align, align);
8740 }
8741 else
8742 {
8743 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
8744 desired_align, align);
8745 }
8746 }
8747 /* Misaligned move sequences handle both prologues and epilogues at once.
8748 Default code generation results in smaller code for large alignments and
8749 also avoids redundant work when sizes are known precisely. */
8750 if (misaligned_prologue_used)
8751 {
8752 /* The misaligned move prologue handles small blocks by itself. */
8753 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves
8754 (dst, src, &destreg, &srcreg,
8755 move_mode, promoted_val, vec_promoted_val,
8756 &count_exp,
8757 &jump_around_label,
8758 desired_align < align
8759 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
8760 desired_align, align, &min_size, dynamic_check, issetmem);
8761 if (!issetmem)
8762 src = change_address (src, BLKmode, srcreg);
8763 dst = change_address (dst, BLKmode, destreg);
8764 set_mem_align (dst, desired_align * BITS_PER_UNIT);
8765 epilogue_size_needed = 0;
8766 if (need_zero_guard
8767 && min_size < (unsigned HOST_WIDE_INT) size_needed)
8768 {
8769 /* It is possible that we copied enough so the main loop will not
8770 execute. */
8771 gcc_assert (size_needed > 1);
8772 if (jump_around_label == NULL_RTX)
8773 jump_around_label = gen_label_rtx ();
8774 emit_cmp_and_jump_insns (count_exp,
8775 GEN_INT (size_needed),
8776 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
8777 if (expected_size == -1
8778 || expected_size < (desired_align - align) / 2 + size_needed)
8779 predict_jump (REG_BR_PROB_BASE * 20 / 100);
8780 else
8781 predict_jump (REG_BR_PROB_BASE * 60 / 100);
8782 }
8783 }
8784 /* Ensure that alignment prologue won't copy past end of block. */
8785 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
8786 {
8787 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
8788 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
8789 Make sure it is power of 2. */
8790 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
8791
8792 /* To improve performance on small blocks, we jump around the VAL
8793 promoting code. This means that if the promoted VAL is not constant,
8794 we might not use it in the epilogue and have to use the byte
8795 loop variant. */
8796 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
8797 force_loopy_epilogue = true;
8798 if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
8799 || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
8800 {
8801 /* If main algorithm works on QImode, no epilogue is needed.
8802 For small sizes just don't align anything. */
8803 if (size_needed == 1)
8804 desired_align = align;
8805 else
8806 goto epilogue;
8807 }
8808 else if (!count
8809 && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
8810 {
8811 label = gen_label_rtx ();
8812 emit_cmp_and_jump_insns (count_exp,
8813 GEN_INT (epilogue_size_needed),
8814 LTU, 0, counter_mode (count_exp), 1, label);
8815 if (expected_size == -1 || expected_size < epilogue_size_needed)
8816 predict_jump (REG_BR_PROB_BASE * 60 / 100);
8817 else
8818 predict_jump (REG_BR_PROB_BASE * 20 / 100);
8819 }
8820 }
8821
8822 /* Emit code to decide at runtime whether a library call or the inline code
8823 should be used. */
8824 if (dynamic_check != -1)
8825 {
8826 if (!issetmem && CONST_INT_P (count_exp))
8827 {
8828 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
8829 {
8830 emit_block_copy_via_libcall (dst, src, count_exp);
8831 count_exp = const0_rtx;
8832 goto epilogue;
8833 }
8834 }
8835 else
8836 {
8837 rtx_code_label *hot_label = gen_label_rtx ();
8838 if (jump_around_label == NULL_RTX)
8839 jump_around_label = gen_label_rtx ();
8840 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
8841 LEU, 0, counter_mode (count_exp),
8842 1, hot_label);
8843 predict_jump (REG_BR_PROB_BASE * 90 / 100);
8844 if (issetmem)
8845 set_storage_via_libcall (dst, count_exp, val_exp);
8846 else
8847 emit_block_copy_via_libcall (dst, src, count_exp);
8848 emit_jump (jump_around_label);
8849 emit_label (hot_label);
8850 }
8851 }
8852
8853 /* Step 2: Alignment prologue. */
8854 /* Do the expensive promotion once we branched off the small blocks. */
8855 if (issetmem && !promoted_val)
8856 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
8857 desired_align, align);
8858
8859 if (desired_align > align && !misaligned_prologue_used)
8860 {
8861 if (align_bytes == 0)
8862 {
8863 /* Except for the first move in the prologue, we no longer know
8864 the constant offset in the aliasing info. It doesn't seem worth
8865 the pain to maintain it for the first move, so throw away
8866 the info early. */
8867 dst = change_address (dst, BLKmode, destreg);
8868 if (!issetmem)
8869 src = change_address (src, BLKmode, srcreg);
8870 dst = expand_set_or_cpymem_prologue (dst, src, destreg, srcreg,
8871 promoted_val, vec_promoted_val,
8872 count_exp, align, desired_align,
8873 issetmem);
8874 /* At most desired_align - align bytes are copied. */
8875 if (min_size < (unsigned)(desired_align - align))
8876 min_size = 0;
8877 else
8878 min_size -= desired_align - align;
8879 }
8880 else
8881 {
8882 /* If we know how many bytes need to be stored before dst is
8883 sufficiently aligned, maintain aliasing info accurately. */
8884 dst = expand_set_or_cpymem_constant_prologue (dst, &src, destreg,
8885 srcreg,
8886 promoted_val,
8887 vec_promoted_val,
8888 desired_align,
8889 align_bytes,
8890 issetmem);
8891
8892 count_exp = plus_constant (counter_mode (count_exp),
8893 count_exp, -align_bytes);
8894 count -= align_bytes;
8895 min_size -= align_bytes;
8896 max_size -= align_bytes;
8897 }
8898 if (need_zero_guard
8899 && min_size < (unsigned HOST_WIDE_INT) size_needed
8900 && (count < (unsigned HOST_WIDE_INT) size_needed
8901 || (align_bytes == 0
8902 && count < ((unsigned HOST_WIDE_INT) size_needed
8903 + desired_align - align))))
8904 {
8905 /* It is possible that we copied enough so the main loop will not
8906 execute. */
8907 gcc_assert (size_needed > 1);
8908 if (label == NULL_RTX)
8909 label = gen_label_rtx ();
8910 emit_cmp_and_jump_insns (count_exp,
8911 GEN_INT (size_needed),
8912 LTU, 0, counter_mode (count_exp), 1, label);
8913 if (expected_size == -1
8914 || expected_size < (desired_align - align) / 2 + size_needed)
8915 predict_jump (REG_BR_PROB_BASE * 20 / 100);
8916 else
8917 predict_jump (REG_BR_PROB_BASE * 60 / 100);
8918 }
8919 }
8920 if (label && size_needed == 1)
8921 {
8922 emit_label (label);
8923 LABEL_NUSES (label) = 1;
8924 label = NULL;
8925 epilogue_size_needed = 1;
8926 if (issetmem)
8927 promoted_val = val_exp;
8928 }
8929 else if (label == NULL_RTX && !misaligned_prologue_used)
8930 epilogue_size_needed = size_needed;
8931
8932 /* Step 3: Main loop. */
8933
8934 switch (alg)
8935 {
8936 case libcall:
8937 case no_stringop:
8938 case last_alg:
8939 gcc_unreachable ();
8940 case loop_1_byte:
8941 case loop:
8942 case unrolled_loop:
8943 expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg, promoted_val,
8944 count_exp, move_mode, unroll_factor,
8945 expected_size, issetmem);
8946 break;
8947 case vector_loop:
8948 expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg,
8949 vec_promoted_val, count_exp, move_mode,
8950 unroll_factor, expected_size, issetmem);
8951 break;
8952 case rep_prefix_8_byte:
8953 case rep_prefix_4_byte:
8954 case rep_prefix_1_byte:
8955 expand_set_or_cpymem_via_rep (dst, src, destreg, srcreg, promoted_val,
8956 val_exp, count_exp, move_mode, issetmem);
8957 break;
8958 }
8959 /* Adjust properly the offset of src and dest memory for aliasing. */
8960 if (CONST_INT_P (count_exp))
8961 {
8962 if (!issetmem)
8963 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
8964 (count / size_needed) * size_needed);
8965 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
8966 (count / size_needed) * size_needed);
8967 }
8968 else
8969 {
8970 if (!issetmem)
8971 src = change_address (src, BLKmode, srcreg);
8972 dst = change_address (dst, BLKmode, destreg);
8973 }
8974
8975 /* Step 4: Epilogue to copy the remaining bytes. */
8976 epilogue:
8977 if (label)
8978 {
8979 /* When the main loop is done, COUNT_EXP might hold original count,
8980 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
8981 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
8982 bytes. Compensate if needed. */
8983
8984 if (size_needed < epilogue_size_needed)
8985 {
8986 tmp = expand_simple_binop (counter_mode (count_exp), AND, count_exp,
8987 GEN_INT (size_needed - 1), count_exp, 1,
8988 OPTAB_DIRECT);
8989 if (tmp != count_exp)
8990 emit_move_insn (count_exp, tmp);
8991 }
8992 emit_label (label);
8993 LABEL_NUSES (label) = 1;
8994 }
8995
8996 if (count_exp != const0_rtx && epilogue_size_needed > 1)
8997 {
8998 if (force_loopy_epilogue)
8999 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
9000 epilogue_size_needed);
9001 else
9002 {
9003 if (issetmem)
9004 expand_setmem_epilogue (dst, destreg, promoted_val,
9005 vec_promoted_val, count_exp,
9006 epilogue_size_needed);
9007 else
9008 expand_cpymem_epilogue (dst, src, destreg, srcreg, count_exp,
9009 epilogue_size_needed);
9010 }
9011 }
9012 if (jump_around_label)
9013 emit_label (jump_around_label);
9014 return true;
9015}
9016
9017/* Expand cmpstrn or memcmp. */
9018
9019bool
9020ix86_expand_cmpstrn_or_cmpmem (rtx result, rtx src1, rtx src2,
9021 rtx length, rtx align, bool is_cmpstrn)
9022{
9023 /* Expand strncmp and memcmp only with -minline-all-stringops since
9024 "repz cmpsb" can be much slower than strncmp and memcmp functions
9025 implemented with vector instructions, see
9026
9027 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43052
9028 */
9029 if (!TARGET_INLINE_ALL_STRINGOPS)
9030 return false;
9031
9032 /* Can't use this if the user has appropriated ecx, esi or edi. */
9033 if (fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG])
9034 return false;
9035
9036 if (is_cmpstrn)
9037 {
9038 /* For strncmp, length is the maximum length, which can be larger
9039 than actual string lengths. We can expand the cmpstrn pattern
9040 to "repz cmpsb" only if one of the strings is a constant so
9041 that expand_builtin_strncmp() can write the length argument to
9042 be the minimum of the const string length and the actual length
9043 argument. Otherwise, "repz cmpsb" may scan past the terminating 0 byte. */
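 /* E.g., for a hypothetical call strncmp (s, "ab", 100) with s == "ab",
    the comparison must stop at the terminating NUL, but "repz cmpsb"
    with a count of 100 would keep comparing the bytes that follow;
    clamping the length to the constant string's length (including the
    NUL) avoids that.  */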
9044 tree t1 = MEM_EXPR (src1);
9045 tree t2 = MEM_EXPR (src2);
9046 if (!((t1 && TREE_CODE (t1) == MEM_REF
9047 && TREE_CODE (TREE_OPERAND (t1, 0)) == ADDR_EXPR
9048 && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t1, 0), 0))
9049 == STRING_CST))
9050 || (t2 && TREE_CODE (t2) == MEM_REF
9051 && TREE_CODE (TREE_OPERAND (t2, 0)) == ADDR_EXPR
9052 && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t2, 0), 0))
9053 == STRING_CST))))
9054 return false;
9055 }
9056
9057 rtx addr1 = copy_addr_to_reg (XEXP (src1, 0));
9058 rtx addr2 = copy_addr_to_reg (XEXP (src2, 0));
9059 if (addr1 != XEXP (src1, 0))
9060 src1 = replace_equiv_address_nv (src1, addr1);
9061 if (addr2 != XEXP (src2, 0))
9062 src2 = replace_equiv_address_nv (src2, addr2);
9063
9064 /* NB: Make a copy of the data length to avoid changing the original
9065 data length by cmpstrnqi patterns. */
9066 length = ix86_zero_extend_to_Pmode (length);
9067 rtx lengthreg = gen_reg_rtx (Pmode);
9068 emit_move_insn (lengthreg, length);
9069
9070 /* If we are testing strict equality, we can use known alignment to
9071 good advantage. This may be possible with combine, particularly
9072 once cc0 is dead. */
9073 if (CONST_INT_P (length))
9074 {
9075 if (length == const0_rtx)
9076 {
9077 emit_move_insn (result, const0_rtx);
9078 return true;
9079 }
9080 emit_insn (gen_cmpstrnqi_nz_1 (addr1, addr2, lengthreg, align,
9081 src1, src2));
9082 }
9083 else
9084 {
9085 emit_insn (gen_cmp_1 (Pmode, lengthreg, lengthreg));
9086 emit_insn (gen_cmpstrnqi_1 (addr1, addr2, lengthreg, align,
9087 src1, src2));
9088 }
9089
9090 rtx out = gen_lowpart (QImode, result);
9091 emit_insn (gen_cmpintqi (out));
9092 emit_move_insn (result, gen_rtx_SIGN_EXTEND (SImode, out));
9093
9094 return true;
9095}
9096
9097/* Expand the appropriate insns for doing strlen if not just doing
9098 repnz; scasb
9099
9100 out = result, initialized with the start address
9101 align_rtx = alignment of the address.
9102 scratch = scratch register, initialized with the start address when
9103 not aligned, otherwise undefined
9104
9105 This is just the body. It needs the initializations mentioned above and
9106 some address computing at the end. These things are done in i386.md. */
9107
9108static void
9109ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
9110{
9111 int align;
9112 rtx tmp;
9113 rtx_code_label *align_2_label = NULL;
9114 rtx_code_label *align_3_label = NULL;
9115 rtx_code_label *align_4_label = gen_label_rtx ();
9116 rtx_code_label *end_0_label = gen_label_rtx ();
9117 rtx mem;
9118 rtx tmpreg = gen_reg_rtx (SImode);
9119 rtx scratch = gen_reg_rtx (SImode);
9120 rtx cmp;
9121
9122 align = 0;
9123 if (CONST_INT_P (align_rtx))
9124 align = INTVAL (align_rtx);
9125
9126 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
9127
9128 /* Is there a known alignment and is it less than 4? */
9129 if (align < 4)
9130 {
9131 rtx scratch1 = gen_reg_rtx (Pmode);
9132 emit_move_insn (scratch1, out);
9133 /* Is there a known alignment and is it not 2? */
9134 if (align != 2)
9135 {
9136 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
9137 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
9138
9139 /* Leave just the 3 lower bits. */
9140 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
9141 NULL_RTX, 0, OPTAB_WIDEN);
9142
9143 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
9144 Pmode, 1, align_4_label);
9145 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
9146 Pmode, 1, align_2_label);
9147 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
9148 Pmode, 1, align_3_label);
9149 }
9150 else
9151 {
9152 /* Since the alignment is 2, we have to check 2 or 0 bytes;
9153 check whether it is aligned to a 4-byte boundary. */
9154
9155 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
9156 NULL_RTX, 0, OPTAB_WIDEN);
9157
9158 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
9159 Pmode, 1, align_4_label);
9160 }
9161
9162 mem = change_address (src, QImode, out);
9163
9164 /* Now compare the bytes. */
9165
9166 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
9167 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
9168 QImode, 1, end_0_label);
9169
9170 /* Increment the address. */
d9330fb5 9171 emit_insn (gen_add2_insn (out, const1_rtx));
9172
9173 /* Not needed with an alignment of 2 */
9174 if (align != 2)
9175 {
9176 emit_label (align_2_label);
9177
9178 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
9179 end_0_label);
9180
d9330fb5 9181 emit_insn (gen_add2_insn (out, const1_rtx));
9182
9183 emit_label (align_3_label);
9184 }
9185
9186 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
9187 end_0_label);
9188
d9330fb5 9189 emit_insn (gen_add2_insn (out, const1_rtx));
9190 }
9191
 9192  /* Generate a loop to check 4 bytes at a time.  It is not a good idea
 9193     to align this loop: it only makes the program larger and does not
 9194     make it any faster.  */
9195 emit_label (align_4_label);
9196
9197 mem = change_address (src, SImode, out);
9198 emit_move_insn (scratch, mem);
d9330fb5 9199 emit_insn (gen_add2_insn (out, GEN_INT (4)));
9200
 9201  /* This formula yields a nonzero result iff one of the bytes is zero.
 9202     This saves three branches inside the loop and many cycles.  */
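  /* Illustrative example (values not taken from the source): for
     scratch == 0x11002233, which contains a zero byte,
	(0x11002233 - 0x01010101) & ~0x11002233 & 0x80808080 == 0x00800000,
     i.e. nonzero, whereas a value with no zero byte, e.g. 0x11223344,
     yields 0.  */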
9203
9204 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
9205 emit_insn (gen_one_cmplsi2 (scratch, scratch));
9206 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
9207 emit_insn (gen_andsi3 (tmpreg, tmpreg,
9208 gen_int_mode (0x80808080, SImode)));
9209 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
9210 align_4_label);
9211
9212 if (TARGET_CMOVE)
9213 {
9214 rtx reg = gen_reg_rtx (SImode);
9215 rtx reg2 = gen_reg_rtx (Pmode);
9216 emit_move_insn (reg, tmpreg);
9217 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
9218
9219 /* If zero is not in the first two bytes, move two bytes forward. */
9220 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
9221 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
9222 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
9223 emit_insn (gen_rtx_SET (tmpreg,
9224 gen_rtx_IF_THEN_ELSE (SImode, tmp,
9225 reg,
9226 tmpreg)));
9227 /* Emit lea manually to avoid clobbering of flags. */
c3185b64 9228 emit_insn (gen_rtx_SET (reg2, plus_constant (Pmode, out, 2)));
9229
9230 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
9231 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
9232 emit_insn (gen_rtx_SET (out,
9233 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
9234 reg2,
9235 out)));
9236 }
9237 else
9238 {
9239 rtx_code_label *end_2_label = gen_label_rtx ();
9240 /* Is zero in the first two bytes? */
9241
9242 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
9243 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
9244 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
9245 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
9246 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
9247 pc_rtx);
9248 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
9249 JUMP_LABEL (tmp) = end_2_label;
9250
9251 /* Not in the first two. Move two bytes forward. */
9252 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
d9330fb5 9253 emit_insn (gen_add2_insn (out, const2_rtx));
9254
9255 emit_label (end_2_label);
9256
9257 }
9258
9259 /* Avoid branch in fixing the byte. */
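  /* At this point OUT points 4 (or, after the two-byte adjustment above,
     6) bytes past the start of the word holding the terminating zero, and
     bit 7 of the low byte of TMPREG is set iff that zero is the earlier
     byte of the remaining pair.  Doubling TMPREG moves this bit into the
     carry flag, and the carry-aware subtraction below backs OUT up by 3
     or 4 bytes so that it lands exactly on the zero byte, without a
     branch.  */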
9260 tmpreg = gen_lowpart (QImode, tmpreg);
9261 emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
9262 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
9263 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
d9330fb5 9264 emit_insn (gen_sub3_carry (Pmode, out, out, GEN_INT (3), tmp, cmp));
9265
9266 emit_label (end_0_label);
9267}
9268
9269/* Expand strlen. */
9270
9271bool
9272ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
9273{
 9274  if (TARGET_UNROLL_STRLEN
9275 && TARGET_INLINE_ALL_STRINGOPS
9276 && eoschar == const0_rtx
9277 && optimize > 1)
9278 {
 9279      /* The generic case of the strlen expander is long.  Avoid its
 9280	 expansion unless TARGET_INLINE_ALL_STRINGOPS.  */
 9281      rtx addr = force_reg (Pmode, XEXP (src, 0));
 9282      /* It seems that some optimizers do not combine a call like
 9283	     foo(strlen(bar), strlen(bar));
 9284	 when the move and the subtraction are done here.  They do calculate
 9285	 the length just once when these instructions are emitted inside of
 9286	 output_strlen_unroll().  But since &bar[strlen(bar)] is often used,
 9287	 and this way one fewer register is live for the lifetime of
 9288	 output_strlen_unroll(), this is better.  */
9289
9290 emit_move_insn (out, addr);
9291
9292 ix86_expand_strlensi_unroll_1 (out, src, align);
9293
9294 /* strlensi_unroll_1 returns the address of the zero at the end of
9295 the string, like memchr(), so compute the length by subtracting
9296 the start address. */
d9330fb5 9297 emit_insn (gen_sub2_insn (out, addr));
9298 return true;
9299 }
9300 else
9301 return false;
9302}
9303
 9304/* For a given symbol (function), construct code to compute the address
 9305   of its PLT entry in the large x86-64 PIC model.  */
9306
9307static rtx
9308construct_plt_address (rtx symbol)
9309{
9310 rtx tmp, unspec;
9311
9312 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
9313 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
9314 gcc_assert (Pmode == DImode);
9315
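  /* SYMBOL@PLTOFF is the offset of SYMBOL's PLT entry from the GOT base,
     which the PIC register holds in the large PIC model; adding the two
     yields the absolute address of the PLT entry.  */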
9316 tmp = gen_reg_rtx (Pmode);
9317 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
9318
9319 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
d9330fb5 9320 emit_insn (gen_add2_insn (tmp, pic_offset_table_rtx));
9321 return tmp;
9322}
9323
 9324/* Additional registers that are clobbered by SysV calls but preserved by MS ABI calls.  */
9325
9326static int const x86_64_ms_sysv_extra_clobbered_registers
9327 [NUM_X86_64_MS_CLOBBERED_REGS] =
9328{
9329 SI_REG, DI_REG,
9330 XMM6_REG, XMM7_REG,
9331 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
9332 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
9333};
9334
9335rtx_insn *
9336ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
9337 rtx callarg2,
9338 rtx pop, bool sibcall)
9339{
9340 rtx vec[3];
9341 rtx use = NULL, call;
9342 unsigned int vec_len = 0;
9343 tree fndecl;
9344
9345 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
9346 {
9347 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
9348 if (fndecl
9349 && (lookup_attribute ("interrupt",
9350 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
a9c697b8 9351 error ("interrupt service routine cannot be called directly");
9352 }
9353 else
9354 fndecl = NULL_TREE;
9355
9356 if (pop == const0_rtx)
9357 pop = NULL;
9358 gcc_assert (!TARGET_64BIT || !pop);
9359
41bd1b19 9360 rtx addr = XEXP (fnaddr, 0);
9361 if (TARGET_MACHO && !TARGET_64BIT)
9362 {
9363#if TARGET_MACHO
9364 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
9365 fnaddr = machopic_indirect_call_target (fnaddr);
9366#endif
9367 }
9368 else
9369 {
 9370      /* Static functions and indirect calls don't need the PIC register.
 9371	 Also check whether the PLT was explicitly avoided via -fno-plt or the
 9372	 "noplt" attribute, which makes this an indirect call.  */
9373 if (flag_pic
9374 && GET_CODE (addr) == SYMBOL_REF
f7854b90 9375 && ix86_call_use_plt_p (addr))
9376 {
9377 if (flag_plt
9378 && (SYMBOL_REF_DECL (addr) == NULL_TREE
9379 || !lookup_attribute ("noplt",
9380 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
9381 {
9382 if (!TARGET_64BIT
9383 || (ix86_cmodel == CM_LARGE_PIC
9384 && DEFAULT_ABI != MS_ABI))
9385 {
9386 use_reg (&use, gen_rtx_REG (Pmode,
9387 REAL_PIC_OFFSET_TABLE_REGNUM));
9388 if (ix86_use_pseudo_pic_reg ())
9389 emit_move_insn (gen_rtx_REG (Pmode,
9390 REAL_PIC_OFFSET_TABLE_REGNUM),
9391 pic_offset_table_rtx);
9392 }
9393 }
9394 else if (!TARGET_PECOFF && !TARGET_MACHO)
9395 {
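	  /* No PLT: load the target address from the GOT and make the call
	     indirect.  The form of the GOT reference depends on the code
	     model, as the cases below show.  */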
9396 if (TARGET_64BIT
9397 && ix86_cmodel == CM_LARGE_PIC
9398 && DEFAULT_ABI != MS_ABI)
9399 {
9400 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
9401 UNSPEC_GOT);
9402 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
9403 fnaddr = force_reg (Pmode, fnaddr);
9404 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, fnaddr);
9405 }
9406 else if (TARGET_64BIT)
9407 {
9408 fnaddr = gen_rtx_UNSPEC (Pmode,
9409 gen_rtvec (1, addr),
9410 UNSPEC_GOTPCREL);
9411 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
9412 }
9413 else
9414 {
9415 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
9416 UNSPEC_GOT);
9417 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
9418 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
9419 fnaddr);
9420 }
9421 fnaddr = gen_const_mem (Pmode, fnaddr);
9422 /* Pmode may not be the same as word_mode for x32, which
9423 doesn't support indirect branch via 32-bit memory slot.
9424 Since x32 GOT slot is 64 bit with zero upper 32 bits,
9425 indirect branch via x32 GOT slot is OK. */
9426 if (GET_MODE (fnaddr) != word_mode)
9427 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
9428 fnaddr = gen_rtx_MEM (QImode, fnaddr);
9429 }
9430 }
9431 }
9432
9433 /* Skip setting up RAX register for -mskip-rax-setup when there are no
9434 parameters passed in vector registers. */
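  /* In the 64-bit SysV ABI, AL carries (an upper bound on) the number of
     SSE registers used to pass arguments to a variadic function; that is
     the value CALLARG2 is expected to hold here.  */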
9435 if (TARGET_64BIT
9436 && (INTVAL (callarg2) > 0
9437 || (INTVAL (callarg2) == 0
9438 && (TARGET_SSE || !flag_skip_rax_setup))))
9439 {
9440 rtx al = gen_rtx_REG (QImode, AX_REG);
9441 emit_move_insn (al, callarg2);
9442 use_reg (&use, al);
9443 }
9444
9445 if (ix86_cmodel == CM_LARGE_PIC
9446 && !TARGET_PECOFF
9447 && MEM_P (fnaddr)
9448 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
9449 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
9450 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
9451 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
9452 branch via x32 GOT slot is OK. */
9453 else if (!(TARGET_X32
9454 && MEM_P (fnaddr)
9455 && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
9456 && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
9457 && (sibcall
9458 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
9459 : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
9460 {
9461 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
9462 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
9463 }
9464
bb576017 9465  /* PR100665: Hwasan may tag a code pointer, which is not supported by
 9466     LAM; mask off code pointers here.
 9467     TODO: also need to handle indirect jumps.  */
9468 if (ix86_memtag_can_tag_addresses () && !fndecl
9469 && sanitize_flags_p (SANITIZE_HWADDRESS))
9470 {
9471 rtx untagged_addr = ix86_memtag_untagged_pointer (XEXP (fnaddr, 0),
9472 NULL_RTX);
9473 fnaddr = gen_rtx_MEM (QImode, untagged_addr);
9474 }
9475
9476 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
9477
9478 if (retval)
9479 call = gen_rtx_SET (retval, call);
9480 vec[vec_len++] = call;
9481
9482 if (pop)
9483 {
9484 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
9485 pop = gen_rtx_SET (stack_pointer_rtx, pop);
9486 vec[vec_len++] = pop;
9487 }
9488
9489 if (cfun->machine->no_caller_saved_registers
9490 && (!fndecl
9491 || (!TREE_THIS_VOLATILE (fndecl)
9492 && !lookup_attribute ("no_caller_saved_registers",
9493 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
9494 {
9495 static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
9496 bool is_64bit_ms_abi = (TARGET_64BIT
9497 && ix86_function_abi (fndecl) == MS_ABI);
9498 char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
9499
9500 /* If there are no caller-saved registers, add all registers
9501 that are clobbered by the call which returns. */
9502 for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
9503 if (!fixed_regs[i]
9504 && (ix86_call_used_regs[i] == 1
9505 || (ix86_call_used_regs[i] & c_mask))
9506 && !STACK_REGNO_P (i)
9507 && !MMX_REGNO_P (i))
9508 clobber_reg (&use,
9509 gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
9510 }
9511 else if (TARGET_64BIT_MS_ABI
9512 && (!callarg2 || INTVAL (callarg2) != -2))
9513 {
9514 unsigned i;
9515
9516 for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
9517 {
9518 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
9519 machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
9520
9521 clobber_reg (&use, gen_rtx_REG (mode, regno));
9522 }
9523
9524 /* Set here, but it may get cleared later. */
9525 if (TARGET_CALL_MS2SYSV_XLOGUES)
9526 {
9527 if (!TARGET_SSE)
9528 ;
9529
9530 /* Don't break hot-patched functions. */
9531 else if (ix86_function_ms_hook_prologue (current_function_decl))
9532 ;
9533
9534 /* TODO: Cases not yet examined. */
9535 else if (flag_split_stack)
9536 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
9537
9538 else
9539 {
9540 gcc_assert (!reload_completed);
9541 cfun->machine->call_ms2sysv = true;
9542 }
9543 }
9544 }
9545
9546 if (TARGET_MACHO && TARGET_64BIT && !sibcall
9547 && ((GET_CODE (addr) == SYMBOL_REF && !SYMBOL_REF_LOCAL_P (addr))
9548 || !fndecl || TREE_PUBLIC (fndecl)))
9549 {
9550 /* We allow public functions defined in a TU to bind locally for PIC
9551 code (the default) on 64bit Mach-O.
9552 If such functions are not inlined, we cannot tell at compile-time if
9553 they will be called via the lazy symbol resolver (this can depend on
9554 options given at link-time). Therefore, we must assume that the lazy
9555 resolver could be used which clobbers R11 and R10. */
9556 clobber_reg (&use, gen_rtx_REG (DImode, R11_REG));
9557 clobber_reg (&use, gen_rtx_REG (DImode, R10_REG));
9558 }
9559
9560 if (vec_len > 1)
9561 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
9562 rtx_insn *call_insn = emit_call_insn (call);
9563 if (use)
9564 CALL_INSN_FUNCTION_USAGE (call_insn) = use;
9565
9566 return call_insn;
9567}
9568
 9569/* Split a simple return that pops POPC bytes from the stack into an
 9570   indirect branch with a stack adjustment.  */
9571
9572void
9573ix86_split_simple_return_pop_internal (rtx popc)
9574{
9575 struct machine_function *m = cfun->machine;
9576 rtx ecx = gen_rtx_REG (SImode, CX_REG);
9577 rtx_insn *insn;
9578
9579 /* There is no "pascal" calling convention in any 64bit ABI. */
9580 gcc_assert (!TARGET_64BIT);
9581
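  /* The emitted sequence is, roughly:
	popl	%ecx
	addl	$POPC, %esp
	jmp	*%ecx  */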
9582 insn = emit_insn (gen_pop (ecx));
9583 m->fs.cfa_offset -= UNITS_PER_WORD;
9584 m->fs.sp_offset -= UNITS_PER_WORD;
9585
9586 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
9587 x = gen_rtx_SET (stack_pointer_rtx, x);
9588 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
9589 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
9590 RTX_FRAME_RELATED_P (insn) = 1;
9591
9592 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc);
9593 x = gen_rtx_SET (stack_pointer_rtx, x);
9594 insn = emit_insn (x);
9595 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
9596 RTX_FRAME_RELATED_P (insn) = 1;
9597
9598 /* Now return address is in ECX. */
9599 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
9600}
9601
9602/* Errors in the source file can cause expand_expr to return const0_rtx
9603 where we expect a vector. To avoid crashing, use one of the vector
9604 clear instructions. */
9605
9606static rtx
9607safe_vector_operand (rtx x, machine_mode mode)
9608{
9609 if (x == const0_rtx)
9610 x = CONST0_RTX (mode);
9611 return x;
9612}
9613
9614/* Subroutine of ix86_expand_builtin to take care of binop insns. */
9615
9616static rtx
9617ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
9618{
9619 rtx pat;
9620 tree arg0 = CALL_EXPR_ARG (exp, 0);
9621 tree arg1 = CALL_EXPR_ARG (exp, 1);
9622 rtx op0 = expand_normal (arg0);
9623 rtx op1 = expand_normal (arg1);
9624 machine_mode tmode = insn_data[icode].operand[0].mode;
9625 machine_mode mode0 = insn_data[icode].operand[1].mode;
9626 machine_mode mode1 = insn_data[icode].operand[2].mode;
9627
9628 if (VECTOR_MODE_P (mode0))
9629 op0 = safe_vector_operand (op0, mode0);
9630 if (VECTOR_MODE_P (mode1))
9631 op1 = safe_vector_operand (op1, mode1);
9632
9633 if (optimize || !target
9634 || GET_MODE (target) != tmode
9635 || !insn_data[icode].operand[0].predicate (target, tmode))
9636 target = gen_reg_rtx (tmode);
9637
9638 if (GET_MODE (op1) == SImode && mode1 == TImode)
9639 {
9640 rtx x = gen_reg_rtx (V4SImode);
9641 emit_insn (gen_sse2_loadd (x, op1));
9642 op1 = gen_lowpart (TImode, x);
9643 }
9644
9645 if (!insn_data[icode].operand[1].predicate (op0, mode0))
9646 op0 = copy_to_mode_reg (mode0, op0);
9647 if (!insn_data[icode].operand[2].predicate (op1, mode1))
9648 op1 = copy_to_mode_reg (mode1, op1);
9649
9650 pat = GEN_FCN (icode) (target, op0, op1);
9651 if (! pat)
9652 return 0;
9653
9654 emit_insn (pat);
9655
9656 return target;
9657}
9658
9659/* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
9660
9661static rtx
9662ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
9663 enum ix86_builtin_func_type m_type,
9664 enum rtx_code sub_code)
9665{
9666 rtx pat;
715a8bc8 9667 unsigned int i, nargs;
9668 bool comparison_p = false;
9669 bool tf_p = false;
9670 bool last_arg_constant = false;
9671 int num_memory = 0;
715a8bc8 9672 rtx xops[4];
9673
9674 machine_mode tmode = insn_data[icode].operand[0].mode;
9675
9676 switch (m_type)
9677 {
9678 case MULTI_ARG_4_DF2_DI_I:
9679 case MULTI_ARG_4_DF2_DI_I1:
9680 case MULTI_ARG_4_SF2_SI_I:
9681 case MULTI_ARG_4_SF2_SI_I1:
9682 nargs = 4;
9683 last_arg_constant = true;
9684 break;
9685
9686 case MULTI_ARG_3_SF:
9687 case MULTI_ARG_3_DF:
9688 case MULTI_ARG_3_SF2:
9689 case MULTI_ARG_3_DF2:
9690 case MULTI_ARG_3_DI:
9691 case MULTI_ARG_3_SI:
9692 case MULTI_ARG_3_SI_DI:
9693 case MULTI_ARG_3_HI:
9694 case MULTI_ARG_3_HI_SI:
9695 case MULTI_ARG_3_QI:
9696 case MULTI_ARG_3_DI2:
9697 case MULTI_ARG_3_SI2:
9698 case MULTI_ARG_3_HI2:
9699 case MULTI_ARG_3_QI2:
9700 nargs = 3;
9701 break;
9702
9703 case MULTI_ARG_2_SF:
9704 case MULTI_ARG_2_DF:
9705 case MULTI_ARG_2_DI:
9706 case MULTI_ARG_2_SI:
9707 case MULTI_ARG_2_HI:
9708 case MULTI_ARG_2_QI:
9709 nargs = 2;
9710 break;
9711
9712 case MULTI_ARG_2_DI_IMM:
9713 case MULTI_ARG_2_SI_IMM:
9714 case MULTI_ARG_2_HI_IMM:
9715 case MULTI_ARG_2_QI_IMM:
9716 nargs = 2;
9717 last_arg_constant = true;
9718 break;
9719
9720 case MULTI_ARG_1_SF:
9721 case MULTI_ARG_1_DF:
9722 case MULTI_ARG_1_SF2:
9723 case MULTI_ARG_1_DF2:
9724 case MULTI_ARG_1_DI:
9725 case MULTI_ARG_1_SI:
9726 case MULTI_ARG_1_HI:
9727 case MULTI_ARG_1_QI:
9728 case MULTI_ARG_1_SI_DI:
9729 case MULTI_ARG_1_HI_DI:
9730 case MULTI_ARG_1_HI_SI:
9731 case MULTI_ARG_1_QI_DI:
9732 case MULTI_ARG_1_QI_SI:
9733 case MULTI_ARG_1_QI_HI:
9734 nargs = 1;
9735 break;
9736
9737 case MULTI_ARG_2_DI_CMP:
9738 case MULTI_ARG_2_SI_CMP:
9739 case MULTI_ARG_2_HI_CMP:
9740 case MULTI_ARG_2_QI_CMP:
9741 nargs = 2;
9742 comparison_p = true;
9743 break;
9744
9745 case MULTI_ARG_2_SF_TF:
9746 case MULTI_ARG_2_DF_TF:
9747 case MULTI_ARG_2_DI_TF:
9748 case MULTI_ARG_2_SI_TF:
9749 case MULTI_ARG_2_HI_TF:
9750 case MULTI_ARG_2_QI_TF:
9751 nargs = 2;
9752 tf_p = true;
9753 break;
9754
9755 default:
9756 gcc_unreachable ();
9757 }
9758
9759 if (optimize || !target
9760 || GET_MODE (target) != tmode
9761 || !insn_data[icode].operand[0].predicate (target, tmode))
9762 target = gen_reg_rtx (tmode);
9763 else if (memory_operand (target, tmode))
9764 num_memory++;
9765
715a8bc8 9766 gcc_assert (nargs <= ARRAY_SIZE (xops));
9767
9768 for (i = 0; i < nargs; i++)
9769 {
9770 tree arg = CALL_EXPR_ARG (exp, i);
9771 rtx op = expand_normal (arg);
9772 int adjust = (comparison_p) ? 1 : 0;
9773 machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
9774
9775 if (last_arg_constant && i == nargs - 1)
9776 {
9777 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
9778 {
9779 enum insn_code new_icode = icode;
9780 switch (icode)
9781 {
9782 case CODE_FOR_xop_vpermil2v2df3:
9783 case CODE_FOR_xop_vpermil2v4sf3:
9784 case CODE_FOR_xop_vpermil2v4df3:
9785 case CODE_FOR_xop_vpermil2v8sf3:
9786 error ("the last argument must be a 2-bit immediate");
9787 return gen_reg_rtx (tmode);
9788 case CODE_FOR_xop_rotlv2di3:
9789 new_icode = CODE_FOR_rotlv2di3;
9790 goto xop_rotl;
9791 case CODE_FOR_xop_rotlv4si3:
9792 new_icode = CODE_FOR_rotlv4si3;
9793 goto xop_rotl;
9794 case CODE_FOR_xop_rotlv8hi3:
9795 new_icode = CODE_FOR_rotlv8hi3;
9796 goto xop_rotl;
9797 case CODE_FOR_xop_rotlv16qi3:
9798 new_icode = CODE_FOR_rotlv16qi3;
9799 xop_rotl:
9800 if (CONST_INT_P (op))
9801 {
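		      /* Reduce a constant rotate count modulo the element
			 width so that it satisfies the operand
			 predicate.  */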
9802 int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
9803 op = GEN_INT (INTVAL (op) & mask);
9804 gcc_checking_assert
9805 (insn_data[icode].operand[i + 1].predicate (op, mode));
9806 }
9807 else
9808 {
9809 gcc_checking_assert
9810 (nargs == 2
9811 && insn_data[new_icode].operand[0].mode == tmode
9812 && insn_data[new_icode].operand[1].mode == tmode
9813 && insn_data[new_icode].operand[2].mode == mode
9814 && insn_data[new_icode].operand[0].predicate
9815 == insn_data[icode].operand[0].predicate
9816 && insn_data[new_icode].operand[1].predicate
9817 == insn_data[icode].operand[1].predicate);
9818 icode = new_icode;
9819 goto non_constant;
9820 }
9821 break;
9822 default:
9823 gcc_unreachable ();
9824 }
9825 }
9826 }
9827 else
9828 {
9829 non_constant:
9830 if (VECTOR_MODE_P (mode))
9831 op = safe_vector_operand (op, mode);
9832
9833 /* If we aren't optimizing, only allow one memory operand to be
9834 generated. */
9835 if (memory_operand (op, mode))
9836 num_memory++;
9837
9838 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
9839
9840 if (optimize
9841 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
9842 || num_memory > 1)
9843 op = force_reg (mode, op);
9844 }
9845
715a8bc8 9846 xops[i] = op;
9847 }
9848
9849 switch (nargs)
9850 {
9851 case 1:
715a8bc8 9852 pat = GEN_FCN (icode) (target, xops[0]);
9853 break;
9854
9855 case 2:
9856 if (tf_p)
715a8bc8 9857 pat = GEN_FCN (icode) (target, xops[0], xops[1],
9858 GEN_INT ((int)sub_code));
9859 else if (! comparison_p)
715a8bc8 9860 pat = GEN_FCN (icode) (target, xops[0], xops[1]);
9861 else
9862 {
9863 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
715a8bc8 9864 xops[0], xops[1]);
2bf6d935 9865
715a8bc8 9866 pat = GEN_FCN (icode) (target, cmp_op, xops[0], xops[1]);
9867 }
9868 break;
9869
9870 case 3:
715a8bc8 9871 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
9872 break;
9873
9874 case 4:
715a8bc8 9875 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2], xops[3]);
9876 break;
9877
9878 default:
9879 gcc_unreachable ();
9880 }
9881
9882 if (! pat)
9883 return 0;
9884
9885 emit_insn (pat);
9886 return target;
9887}
9888
9889/* Subroutine of ix86_expand_args_builtin to take care of scalar unop
9890 insns with vec_merge. */
9891
9892static rtx
9893ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
9894 rtx target)
9895{
9896 rtx pat;
9897 tree arg0 = CALL_EXPR_ARG (exp, 0);
9898 rtx op1, op0 = expand_normal (arg0);
9899 machine_mode tmode = insn_data[icode].operand[0].mode;
9900 machine_mode mode0 = insn_data[icode].operand[1].mode;
9901
9902 if (optimize || !target
9903 || GET_MODE (target) != tmode
9904 || !insn_data[icode].operand[0].predicate (target, tmode))
9905 target = gen_reg_rtx (tmode);
9906
9907 if (VECTOR_MODE_P (mode0))
9908 op0 = safe_vector_operand (op0, mode0);
9909
9910 if ((optimize && !register_operand (op0, mode0))
9911 || !insn_data[icode].operand[1].predicate (op0, mode0))
9912 op0 = copy_to_mode_reg (mode0, op0);
9913
9914 op1 = op0;
9915 if (!insn_data[icode].operand[2].predicate (op1, mode0))
9916 op1 = copy_to_mode_reg (mode0, op1);
9917
9918 pat = GEN_FCN (icode) (target, op0, op1);
9919 if (! pat)
9920 return 0;
9921 emit_insn (pat);
9922 return target;
9923}
9924
9925/* Subroutine of ix86_expand_builtin to take care of comparison insns. */
9926
9927static rtx
9928ix86_expand_sse_compare (const struct builtin_description *d,
9929 tree exp, rtx target, bool swap)
9930{
9931 rtx pat;
9932 tree arg0 = CALL_EXPR_ARG (exp, 0);
9933 tree arg1 = CALL_EXPR_ARG (exp, 1);
9934 rtx op0 = expand_normal (arg0);
9935 rtx op1 = expand_normal (arg1);
9936 rtx op2;
9937 machine_mode tmode = insn_data[d->icode].operand[0].mode;
9938 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
9939 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
9940 enum rtx_code comparison = d->comparison;
9941
9942 if (VECTOR_MODE_P (mode0))
9943 op0 = safe_vector_operand (op0, mode0);
9944 if (VECTOR_MODE_P (mode1))
9945 op1 = safe_vector_operand (op1, mode1);
9946
9947 /* Swap operands if we have a comparison that isn't available in
9948 hardware. */
9949 if (swap)
9950 std::swap (op0, op1);
9951
9952 if (optimize || !target
9953 || GET_MODE (target) != tmode
9954 || !insn_data[d->icode].operand[0].predicate (target, tmode))
9955 target = gen_reg_rtx (tmode);
9956
9957 if ((optimize && !register_operand (op0, mode0))
9958 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
9959 op0 = copy_to_mode_reg (mode0, op0);
9960 if ((optimize && !register_operand (op1, mode1))
9961 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
9962 op1 = copy_to_mode_reg (mode1, op1);
9963
9964 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
9965 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
9966 if (! pat)
9967 return 0;
9968 emit_insn (pat);
9969 return target;
9970}
9971
ae69e6f6 9972/* Subroutine of ix86_expand_sse_comi and ix86_expand_sse_comi_round; for
 9973   an ordered EQ or unordered NE comparison, generate a PF jump.  */
9974
9975static rtx
9976ix86_ssecom_setcc (const enum rtx_code comparison,
9977 bool check_unordered, machine_mode mode,
9978 rtx set_dst, rtx target)
9979{
9980
9981 rtx_code_label *label = NULL;
9982
9983 /* NB: For ordered EQ or unordered NE, check ZF alone isn't sufficient
9984 with NAN operands. */
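  /* (comi/ucomi set ZF, PF and CF all to 1 when the operands are
     unordered, so ZF alone cannot tell "equal" apart from "unordered";
     PF is the unordered indicator tested here.)  */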
9985 if (check_unordered)
9986 {
9987 gcc_assert (comparison == EQ || comparison == NE);
9988
9989 rtx flag = gen_rtx_REG (CCFPmode, FLAGS_REG);
9990 label = gen_label_rtx ();
9991 rtx tmp = gen_rtx_fmt_ee (UNORDERED, VOIDmode, flag, const0_rtx);
9992 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
9993 gen_rtx_LABEL_REF (VOIDmode, label),
9994 pc_rtx);
9995 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
9996 }
9997
 9998  /* NB: Set CCFPmode and check a different CCmode which is a subset
 9999     of CCFPmode.  */
10000 if (GET_MODE (set_dst) != mode)
10001 {
10002 gcc_assert (mode == CCAmode || mode == CCCmode
10003 || mode == CCOmode || mode == CCPmode
10004 || mode == CCSmode || mode == CCZmode);
10005 set_dst = gen_rtx_REG (mode, FLAGS_REG);
10006 }
10007
10008 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10009 gen_rtx_fmt_ee (comparison, QImode,
10010 set_dst,
10011 const0_rtx)));
10012
10013 if (label)
10014 emit_label (label);
10015
10016 return SUBREG_REG (target);
10017}
10018
10019/* Subroutine of ix86_expand_builtin to take care of comi insns. */
10020
10021static rtx
10022ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
10023 rtx target)
10024{
ae69e6f6 10025 rtx pat, set_dst;
10026 tree arg0 = CALL_EXPR_ARG (exp, 0);
10027 tree arg1 = CALL_EXPR_ARG (exp, 1);
10028 rtx op0 = expand_normal (arg0);
10029 rtx op1 = expand_normal (arg1);
ae69e6f6 10030 enum insn_code icode = d->icode;
10031 const struct insn_data_d *insn_p = &insn_data[icode];
10032 machine_mode mode0 = insn_p->operand[0].mode;
10033 machine_mode mode1 = insn_p->operand[1].mode;
10034
10035 if (VECTOR_MODE_P (mode0))
10036 op0 = safe_vector_operand (op0, mode0);
10037 if (VECTOR_MODE_P (mode1))
10038 op1 = safe_vector_operand (op1, mode1);
10039
ae69e6f6 10040 enum rtx_code comparison = d->comparison;
10041 rtx const_val = const0_rtx;
10042
10043 bool check_unordered = false;
10044 machine_mode mode = CCFPmode;
10045 switch (comparison)
10046 {
10047 case LE: /* -> GE */
10048 case LT: /* -> GT */
10049 std::swap (op0, op1);
10050 comparison = swap_condition (comparison);
10051 /* FALLTHRU */
10052 case GT:
10053 case GE:
10054 break;
10055 case EQ:
10056 check_unordered = true;
10057 mode = CCZmode;
10058 break;
10059 case NE:
10060 check_unordered = true;
10061 mode = CCZmode;
10062 const_val = const1_rtx;
10063 break;
10064 default:
10065 gcc_unreachable ();
10066 }
10067
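  /* TARGET is preloaded with CONST_VAL, the value the builtin must return
     when the operands compare unordered; ix86_ssecom_setcc skips the
     flag-based set in that case and leaves this value in place.  */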
2bf6d935 10068 target = gen_reg_rtx (SImode);
ae69e6f6 10069 emit_move_insn (target, const_val);
10070 target = gen_rtx_SUBREG (QImode, target, 0);
10071
10072 if ((optimize && !register_operand (op0, mode0))
ae69e6f6 10073 || !insn_p->operand[0].predicate (op0, mode0))
10074 op0 = copy_to_mode_reg (mode0, op0);
10075 if ((optimize && !register_operand (op1, mode1))
ae69e6f6 10076 || !insn_p->operand[1].predicate (op1, mode1))
10077 op1 = copy_to_mode_reg (mode1, op1);
10078
ae69e6f6 10079 pat = GEN_FCN (icode) (op0, op1);
10080 if (! pat)
10081 return 0;
2bf6d935 10082
ae69e6f6 10083 set_dst = SET_DEST (pat);
10084 emit_insn (pat);
10085 return ix86_ssecom_setcc (comparison, check_unordered, mode,
10086 set_dst, target);
10087}
10088
10089/* Subroutines of ix86_expand_args_builtin to take care of round insns. */
10090
10091static rtx
10092ix86_expand_sse_round (const struct builtin_description *d, tree exp,
10093 rtx target)
10094{
10095 rtx pat;
10096 tree arg0 = CALL_EXPR_ARG (exp, 0);
10097 rtx op1, op0 = expand_normal (arg0);
10098 machine_mode tmode = insn_data[d->icode].operand[0].mode;
10099 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
10100
10101 if (optimize || target == 0
10102 || GET_MODE (target) != tmode
10103 || !insn_data[d->icode].operand[0].predicate (target, tmode))
10104 target = gen_reg_rtx (tmode);
10105
10106 if (VECTOR_MODE_P (mode0))
10107 op0 = safe_vector_operand (op0, mode0);
10108
10109 if ((optimize && !register_operand (op0, mode0))
10110 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
10111 op0 = copy_to_mode_reg (mode0, op0);
10112
10113 op1 = GEN_INT (d->comparison);
10114
10115 pat = GEN_FCN (d->icode) (target, op0, op1);
10116 if (! pat)
10117 return 0;
10118 emit_insn (pat);
10119 return target;
10120}
10121
10122static rtx
10123ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
10124 tree exp, rtx target)
10125{
10126 rtx pat;
10127 tree arg0 = CALL_EXPR_ARG (exp, 0);
10128 tree arg1 = CALL_EXPR_ARG (exp, 1);
10129 rtx op0 = expand_normal (arg0);
10130 rtx op1 = expand_normal (arg1);
10131 rtx op2;
10132 machine_mode tmode = insn_data[d->icode].operand[0].mode;
10133 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
10134 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
10135
10136 if (optimize || target == 0
10137 || GET_MODE (target) != tmode
10138 || !insn_data[d->icode].operand[0].predicate (target, tmode))
10139 target = gen_reg_rtx (tmode);
10140
10141 op0 = safe_vector_operand (op0, mode0);
10142 op1 = safe_vector_operand (op1, mode1);
10143
10144 if ((optimize && !register_operand (op0, mode0))
10145 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
10146 op0 = copy_to_mode_reg (mode0, op0);
10147 if ((optimize && !register_operand (op1, mode1))
10148 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
10149 op1 = copy_to_mode_reg (mode1, op1);
10150
10151 op2 = GEN_INT (d->comparison);
10152
10153 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
10154 if (! pat)
10155 return 0;
10156 emit_insn (pat);
10157 return target;
10158}
10159
10160/* Subroutine of ix86_expand_builtin to take care of ptest insns. */
10161
10162static rtx
10163ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
10164 rtx target)
10165{
10166 rtx pat;
10167 tree arg0 = CALL_EXPR_ARG (exp, 0);
10168 tree arg1 = CALL_EXPR_ARG (exp, 1);
10169 rtx op0 = expand_normal (arg0);
10170 rtx op1 = expand_normal (arg1);
10171 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
10172 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
10173 enum rtx_code comparison = d->comparison;
10174
10175 if (VECTOR_MODE_P (mode0))
10176 op0 = safe_vector_operand (op0, mode0);
10177 if (VECTOR_MODE_P (mode1))
10178 op1 = safe_vector_operand (op1, mode1);
10179
10180 target = gen_reg_rtx (SImode);
10181 emit_move_insn (target, const0_rtx);
10182 target = gen_rtx_SUBREG (QImode, target, 0);
10183
10184 if ((optimize && !register_operand (op0, mode0))
10185 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
10186 op0 = copy_to_mode_reg (mode0, op0);
10187 if ((optimize && !register_operand (op1, mode1))
10188 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
10189 op1 = copy_to_mode_reg (mode1, op1);
10190
10191 pat = GEN_FCN (d->icode) (op0, op1);
10192 if (! pat)
10193 return 0;
10194 emit_insn (pat);
10195 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10196 gen_rtx_fmt_ee (comparison, QImode,
10197 SET_DEST (pat),
10198 const0_rtx)));
10199
10200 return SUBREG_REG (target);
10201}
10202
10203/* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
10204
10205static rtx
10206ix86_expand_sse_pcmpestr (const struct builtin_description *d,
10207 tree exp, rtx target)
10208{
10209 rtx pat;
10210 tree arg0 = CALL_EXPR_ARG (exp, 0);
10211 tree arg1 = CALL_EXPR_ARG (exp, 1);
10212 tree arg2 = CALL_EXPR_ARG (exp, 2);
10213 tree arg3 = CALL_EXPR_ARG (exp, 3);
10214 tree arg4 = CALL_EXPR_ARG (exp, 4);
10215 rtx scratch0, scratch1;
10216 rtx op0 = expand_normal (arg0);
10217 rtx op1 = expand_normal (arg1);
10218 rtx op2 = expand_normal (arg2);
10219 rtx op3 = expand_normal (arg3);
10220 rtx op4 = expand_normal (arg4);
10221 machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
10222
10223 tmode0 = insn_data[d->icode].operand[0].mode;
10224 tmode1 = insn_data[d->icode].operand[1].mode;
10225 modev2 = insn_data[d->icode].operand[2].mode;
10226 modei3 = insn_data[d->icode].operand[3].mode;
10227 modev4 = insn_data[d->icode].operand[4].mode;
10228 modei5 = insn_data[d->icode].operand[5].mode;
10229 modeimm = insn_data[d->icode].operand[6].mode;
10230
10231 if (VECTOR_MODE_P (modev2))
10232 op0 = safe_vector_operand (op0, modev2);
10233 if (VECTOR_MODE_P (modev4))
10234 op2 = safe_vector_operand (op2, modev4);
10235
10236 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
10237 op0 = copy_to_mode_reg (modev2, op0);
10238 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
10239 op1 = copy_to_mode_reg (modei3, op1);
10240 if ((optimize && !register_operand (op2, modev4))
10241 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
10242 op2 = copy_to_mode_reg (modev4, op2);
10243 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
10244 op3 = copy_to_mode_reg (modei5, op3);
10245
10246 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
10247 {
10248 error ("the fifth argument must be an 8-bit immediate");
10249 return const0_rtx;
10250 }
10251
10252 if (d->code == IX86_BUILTIN_PCMPESTRI128)
10253 {
10254 if (optimize || !target
10255 || GET_MODE (target) != tmode0
10256 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
10257 target = gen_reg_rtx (tmode0);
10258
10259 scratch1 = gen_reg_rtx (tmode1);
10260
10261 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
10262 }
10263 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
10264 {
10265 if (optimize || !target
10266 || GET_MODE (target) != tmode1
10267 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
10268 target = gen_reg_rtx (tmode1);
10269
10270 scratch0 = gen_reg_rtx (tmode0);
10271
10272 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
10273 }
10274 else
10275 {
10276 gcc_assert (d->flag);
10277
10278 scratch0 = gen_reg_rtx (tmode0);
10279 scratch1 = gen_reg_rtx (tmode1);
10280
10281 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
10282 }
10283
10284 if (! pat)
10285 return 0;
10286
10287 emit_insn (pat);
10288
10289 if (d->flag)
10290 {
10291 target = gen_reg_rtx (SImode);
10292 emit_move_insn (target, const0_rtx);
10293 target = gen_rtx_SUBREG (QImode, target, 0);
10294
10295 emit_insn
10296 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10297 gen_rtx_fmt_ee (EQ, QImode,
10298 gen_rtx_REG ((machine_mode) d->flag,
10299 FLAGS_REG),
10300 const0_rtx)));
10301 return SUBREG_REG (target);
10302 }
10303 else
10304 return target;
10305}
10306
10307
10308/* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
10309
10310static rtx
10311ix86_expand_sse_pcmpistr (const struct builtin_description *d,
10312 tree exp, rtx target)
10313{
10314 rtx pat;
10315 tree arg0 = CALL_EXPR_ARG (exp, 0);
10316 tree arg1 = CALL_EXPR_ARG (exp, 1);
10317 tree arg2 = CALL_EXPR_ARG (exp, 2);
10318 rtx scratch0, scratch1;
10319 rtx op0 = expand_normal (arg0);
10320 rtx op1 = expand_normal (arg1);
10321 rtx op2 = expand_normal (arg2);
10322 machine_mode tmode0, tmode1, modev2, modev3, modeimm;
10323
10324 tmode0 = insn_data[d->icode].operand[0].mode;
10325 tmode1 = insn_data[d->icode].operand[1].mode;
10326 modev2 = insn_data[d->icode].operand[2].mode;
10327 modev3 = insn_data[d->icode].operand[3].mode;
10328 modeimm = insn_data[d->icode].operand[4].mode;
10329
10330 if (VECTOR_MODE_P (modev2))
10331 op0 = safe_vector_operand (op0, modev2);
10332 if (VECTOR_MODE_P (modev3))
10333 op1 = safe_vector_operand (op1, modev3);
10334
10335 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
10336 op0 = copy_to_mode_reg (modev2, op0);
10337 if ((optimize && !register_operand (op1, modev3))
10338 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
10339 op1 = copy_to_mode_reg (modev3, op1);
10340
10341 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
10342 {
10343 error ("the third argument must be an 8-bit immediate");
10344 return const0_rtx;
10345 }
10346
10347 if (d->code == IX86_BUILTIN_PCMPISTRI128)
10348 {
10349 if (optimize || !target
10350 || GET_MODE (target) != tmode0
10351 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
10352 target = gen_reg_rtx (tmode0);
10353
10354 scratch1 = gen_reg_rtx (tmode1);
10355
10356 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
10357 }
10358 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
10359 {
10360 if (optimize || !target
10361 || GET_MODE (target) != tmode1
10362 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
10363 target = gen_reg_rtx (tmode1);
10364
10365 scratch0 = gen_reg_rtx (tmode0);
10366
10367 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
10368 }
10369 else
10370 {
10371 gcc_assert (d->flag);
10372
10373 scratch0 = gen_reg_rtx (tmode0);
10374 scratch1 = gen_reg_rtx (tmode1);
10375
10376 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
10377 }
10378
10379 if (! pat)
10380 return 0;
10381
10382 emit_insn (pat);
10383
10384 if (d->flag)
10385 {
10386 target = gen_reg_rtx (SImode);
10387 emit_move_insn (target, const0_rtx);
10388 target = gen_rtx_SUBREG (QImode, target, 0);
10389
10390 emit_insn
10391 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10392 gen_rtx_fmt_ee (EQ, QImode,
10393 gen_rtx_REG ((machine_mode) d->flag,
10394 FLAGS_REG),
10395 const0_rtx)));
10396 return SUBREG_REG (target);
10397 }
10398 else
10399 return target;
10400}
10401
 10402/* Fix up modeless constants to fit the required mode.  */
10403
10404static rtx
10405fixup_modeless_constant (rtx x, machine_mode mode)
10406{
10407 if (GET_MODE (x) == VOIDmode)
10408 x = convert_to_mode (mode, x, 1);
10409 return x;
10410}
10411
10412/* Subroutine of ix86_expand_builtin to take care of insns with
10413 variable number of operands. */
10414
10415static rtx
10416ix86_expand_args_builtin (const struct builtin_description *d,
10417 tree exp, rtx target)
10418{
10419 rtx pat, real_target;
10420 unsigned int i, nargs;
10421 unsigned int nargs_constant = 0;
10422 unsigned int mask_pos = 0;
10423 int num_memory = 0;
715a8bc8 10424 rtx xops[6];
10425 bool second_arg_count = false;
10426 enum insn_code icode = d->icode;
10427 const struct insn_data_d *insn_p = &insn_data[icode];
10428 machine_mode tmode = insn_p->operand[0].mode;
10429 machine_mode rmode = VOIDmode;
10430 bool swap = false;
10431 enum rtx_code comparison = d->comparison;
10432
10433 switch ((enum ix86_builtin_func_type) d->flag)
10434 {
10435 case V2DF_FTYPE_V2DF_ROUND:
10436 case V4DF_FTYPE_V4DF_ROUND:
10437 case V8DF_FTYPE_V8DF_ROUND:
10438 case V4SF_FTYPE_V4SF_ROUND:
10439 case V8SF_FTYPE_V8SF_ROUND:
10440 case V16SF_FTYPE_V16SF_ROUND:
84bcefd5 10441 case V8HF_FTYPE_V8HF_ROUND:
10442 case V16HF_FTYPE_V16HF_ROUND:
10443 case V32HF_FTYPE_V32HF_ROUND:
10444 case V4SI_FTYPE_V4SF_ROUND:
10445 case V8SI_FTYPE_V8SF_ROUND:
10446 case V16SI_FTYPE_V16SF_ROUND:
10447 return ix86_expand_sse_round (d, exp, target);
10448 case V4SI_FTYPE_V2DF_V2DF_ROUND:
10449 case V8SI_FTYPE_V4DF_V4DF_ROUND:
10450 case V16SI_FTYPE_V8DF_V8DF_ROUND:
10451 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
10452 case INT_FTYPE_V8SF_V8SF_PTEST:
10453 case INT_FTYPE_V4DI_V4DI_PTEST:
10454 case INT_FTYPE_V4DF_V4DF_PTEST:
10455 case INT_FTYPE_V4SF_V4SF_PTEST:
10456 case INT_FTYPE_V2DI_V2DI_PTEST:
10457 case INT_FTYPE_V2DF_V2DF_PTEST:
10458 return ix86_expand_sse_ptest (d, exp, target);
10459 case FLOAT128_FTYPE_FLOAT128:
10460 case FLOAT_FTYPE_FLOAT:
a1ecc560 10461 case FLOAT_FTYPE_BFLOAT16:
10462 case INT_FTYPE_INT:
10463 case UINT_FTYPE_UINT:
10464 case UINT16_FTYPE_UINT16:
10465 case UINT64_FTYPE_INT:
10466 case UINT64_FTYPE_UINT64:
10467 case INT64_FTYPE_INT64:
10468 case INT64_FTYPE_V4SF:
10469 case INT64_FTYPE_V2DF:
10470 case INT_FTYPE_V16QI:
10471 case INT_FTYPE_V8QI:
10472 case INT_FTYPE_V8SF:
10473 case INT_FTYPE_V4DF:
10474 case INT_FTYPE_V4SF:
10475 case INT_FTYPE_V2DF:
10476 case INT_FTYPE_V32QI:
10477 case V16QI_FTYPE_V16QI:
10478 case V8SI_FTYPE_V8SF:
10479 case V8SI_FTYPE_V4SI:
10480 case V8HI_FTYPE_V8HI:
10481 case V8HI_FTYPE_V16QI:
10482 case V8QI_FTYPE_V8QI:
10483 case V8SF_FTYPE_V8SF:
10484 case V8SF_FTYPE_V8SI:
10485 case V8SF_FTYPE_V4SF:
10486 case V8SF_FTYPE_V8HI:
10487 case V4SI_FTYPE_V4SI:
10488 case V4SI_FTYPE_V16QI:
10489 case V4SI_FTYPE_V4SF:
10490 case V4SI_FTYPE_V8SI:
10491 case V4SI_FTYPE_V8HI:
10492 case V4SI_FTYPE_V4DF:
10493 case V4SI_FTYPE_V2DF:
10494 case V4HI_FTYPE_V4HI:
10495 case V4DF_FTYPE_V4DF:
10496 case V4DF_FTYPE_V4SI:
10497 case V4DF_FTYPE_V4SF:
10498 case V4DF_FTYPE_V2DF:
10499 case V4SF_FTYPE_V4SF:
10500 case V4SF_FTYPE_V4SI:
10501 case V4SF_FTYPE_V8SF:
10502 case V4SF_FTYPE_V4DF:
10503 case V4SF_FTYPE_V8HI:
10504 case V4SF_FTYPE_V2DF:
10505 case V2DI_FTYPE_V2DI:
10506 case V2DI_FTYPE_V16QI:
10507 case V2DI_FTYPE_V8HI:
10508 case V2DI_FTYPE_V4SI:
10509 case V2DF_FTYPE_V2DF:
10510 case V2DF_FTYPE_V4SI:
10511 case V2DF_FTYPE_V4DF:
10512 case V2DF_FTYPE_V4SF:
10513 case V2DF_FTYPE_V2SI:
10514 case V2SI_FTYPE_V2SI:
10515 case V2SI_FTYPE_V4SF:
10516 case V2SI_FTYPE_V2SF:
10517 case V2SI_FTYPE_V2DF:
10518 case V2SF_FTYPE_V2SF:
10519 case V2SF_FTYPE_V2SI:
10520 case V32QI_FTYPE_V32QI:
10521 case V32QI_FTYPE_V16QI:
10522 case V16HI_FTYPE_V16HI:
10523 case V16HI_FTYPE_V8HI:
10524 case V8SI_FTYPE_V8SI:
10525 case V16HI_FTYPE_V16QI:
10526 case V8SI_FTYPE_V16QI:
10527 case V4DI_FTYPE_V16QI:
10528 case V8SI_FTYPE_V8HI:
10529 case V4DI_FTYPE_V8HI:
10530 case V4DI_FTYPE_V4SI:
10531 case V4DI_FTYPE_V2DI:
10532 case UQI_FTYPE_UQI:
10533 case UHI_FTYPE_UHI:
10534 case USI_FTYPE_USI:
10535 case USI_FTYPE_UQI:
10536 case USI_FTYPE_UHI:
10537 case UDI_FTYPE_UDI:
10538 case UHI_FTYPE_V16QI:
10539 case USI_FTYPE_V32QI:
10540 case UDI_FTYPE_V64QI:
10541 case V16QI_FTYPE_UHI:
10542 case V32QI_FTYPE_USI:
10543 case V64QI_FTYPE_UDI:
10544 case V8HI_FTYPE_UQI:
10545 case V16HI_FTYPE_UHI:
10546 case V32HI_FTYPE_USI:
10547 case V4SI_FTYPE_UQI:
10548 case V8SI_FTYPE_UQI:
10549 case V4SI_FTYPE_UHI:
10550 case V8SI_FTYPE_UHI:
10551 case UQI_FTYPE_V8HI:
10552 case UHI_FTYPE_V16HI:
10553 case USI_FTYPE_V32HI:
10554 case UQI_FTYPE_V4SI:
10555 case UQI_FTYPE_V8SI:
10556 case UHI_FTYPE_V16SI:
10557 case UQI_FTYPE_V2DI:
10558 case UQI_FTYPE_V4DI:
10559 case UQI_FTYPE_V8DI:
10560 case V16SI_FTYPE_UHI:
10561 case V2DI_FTYPE_UQI:
10562 case V4DI_FTYPE_UQI:
10563 case V16SI_FTYPE_INT:
10564 case V16SF_FTYPE_V8SF:
10565 case V16SI_FTYPE_V8SI:
10566 case V16SF_FTYPE_V4SF:
10567 case V16SI_FTYPE_V4SI:
10568 case V16SI_FTYPE_V16SF:
10569 case V16SI_FTYPE_V16SI:
10570 case V64QI_FTYPE_V64QI:
10571 case V32HI_FTYPE_V32HI:
10572 case V16SF_FTYPE_V16SF:
10573 case V8DI_FTYPE_UQI:
10574 case V8DI_FTYPE_V8DI:
10575 case V8DF_FTYPE_V4DF:
10576 case V8DF_FTYPE_V2DF:
10577 case V8DF_FTYPE_V8DF:
10578 case V4DI_FTYPE_V4DI:
87235f1e 10579 case V16BF_FTYPE_V16SF:
10580 case V8BF_FTYPE_V8SF:
10581 case V8BF_FTYPE_V4SF:
10582 nargs = 1;
10583 break;
10584 case V4SF_FTYPE_V4SF_VEC_MERGE:
10585 case V2DF_FTYPE_V2DF_VEC_MERGE:
10586 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
10587 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
10588 case V16QI_FTYPE_V16QI_V16QI:
10589 case V16QI_FTYPE_V8HI_V8HI:
b96cb2ca 10590 case V16HF_FTYPE_V16HF_V16HF:
10591 case V16SF_FTYPE_V16SF_V16SF:
10592 case V8QI_FTYPE_V8QI_V8QI:
10593 case V8QI_FTYPE_V4HI_V4HI:
10594 case V8HI_FTYPE_V8HI_V8HI:
10595 case V8HI_FTYPE_V16QI_V16QI:
10596 case V8HI_FTYPE_V4SI_V4SI:
b96cb2ca 10597 case V8HF_FTYPE_V8HF_V8HF:
10598 case V8SF_FTYPE_V8SF_V8SF:
10599 case V8SF_FTYPE_V8SF_V8SI:
10600 case V8DF_FTYPE_V8DF_V8DF:
10601 case V4SI_FTYPE_V4SI_V4SI:
10602 case V4SI_FTYPE_V8HI_V8HI:
10603 case V4SI_FTYPE_V2DF_V2DF:
10604 case V4HI_FTYPE_V4HI_V4HI:
10605 case V4HI_FTYPE_V8QI_V8QI:
10606 case V4HI_FTYPE_V2SI_V2SI:
10607 case V4DF_FTYPE_V4DF_V4DF:
10608 case V4DF_FTYPE_V4DF_V4DI:
10609 case V4SF_FTYPE_V4SF_V4SF:
10610 case V4SF_FTYPE_V4SF_V4SI:
10611 case V4SF_FTYPE_V4SF_V2SI:
10612 case V4SF_FTYPE_V4SF_V2DF:
10613 case V4SF_FTYPE_V4SF_UINT:
10614 case V4SF_FTYPE_V4SF_DI:
10615 case V4SF_FTYPE_V4SF_SI:
10616 case V2DI_FTYPE_V2DI_V2DI:
10617 case V2DI_FTYPE_V16QI_V16QI:
10618 case V2DI_FTYPE_V4SI_V4SI:
10619 case V2DI_FTYPE_V2DI_V16QI:
10620 case V2SI_FTYPE_V2SI_V2SI:
10621 case V2SI_FTYPE_V4HI_V4HI:
10622 case V2SI_FTYPE_V2SF_V2SF:
10623 case V2DF_FTYPE_V2DF_V2DF:
10624 case V2DF_FTYPE_V2DF_V4SF:
10625 case V2DF_FTYPE_V2DF_V2DI:
10626 case V2DF_FTYPE_V2DF_DI:
10627 case V2DF_FTYPE_V2DF_SI:
10628 case V2DF_FTYPE_V2DF_UINT:
10629 case V2SF_FTYPE_V2SF_V2SF:
10630 case V1DI_FTYPE_V1DI_V1DI:
10631 case V1DI_FTYPE_V8QI_V8QI:
10632 case V1DI_FTYPE_V2SI_V2SI:
10633 case V32QI_FTYPE_V16HI_V16HI:
10634 case V16HI_FTYPE_V8SI_V8SI:
10635 case V64QI_FTYPE_V64QI_V64QI:
10636 case V32QI_FTYPE_V32QI_V32QI:
10637 case V16HI_FTYPE_V32QI_V32QI:
10638 case V16HI_FTYPE_V16HI_V16HI:
10639 case V8SI_FTYPE_V4DF_V4DF:
10640 case V8SI_FTYPE_V8SI_V8SI:
10641 case V8SI_FTYPE_V16HI_V16HI:
10642 case V4DI_FTYPE_V4DI_V4DI:
10643 case V4DI_FTYPE_V8SI_V8SI:
6bb0776e 10644 case V4DI_FTYPE_V32QI_V32QI:
10645 case V8DI_FTYPE_V64QI_V64QI:
10646 if (comparison == UNKNOWN)
10647 return ix86_expand_binop_builtin (icode, exp, target);
10648 nargs = 2;
10649 break;
10650 case V4SF_FTYPE_V4SF_V4SF_SWAP:
10651 case V2DF_FTYPE_V2DF_V2DF_SWAP:
10652 gcc_assert (comparison != UNKNOWN);
10653 nargs = 2;
10654 swap = true;
10655 break;
10656 case V16HI_FTYPE_V16HI_V8HI_COUNT:
10657 case V16HI_FTYPE_V16HI_SI_COUNT:
10658 case V8SI_FTYPE_V8SI_V4SI_COUNT:
10659 case V8SI_FTYPE_V8SI_SI_COUNT:
10660 case V4DI_FTYPE_V4DI_V2DI_COUNT:
10661 case V4DI_FTYPE_V4DI_INT_COUNT:
10662 case V8HI_FTYPE_V8HI_V8HI_COUNT:
10663 case V8HI_FTYPE_V8HI_SI_COUNT:
10664 case V4SI_FTYPE_V4SI_V4SI_COUNT:
10665 case V4SI_FTYPE_V4SI_SI_COUNT:
10666 case V4HI_FTYPE_V4HI_V4HI_COUNT:
10667 case V4HI_FTYPE_V4HI_SI_COUNT:
10668 case V2DI_FTYPE_V2DI_V2DI_COUNT:
10669 case V2DI_FTYPE_V2DI_SI_COUNT:
10670 case V2SI_FTYPE_V2SI_V2SI_COUNT:
10671 case V2SI_FTYPE_V2SI_SI_COUNT:
10672 case V1DI_FTYPE_V1DI_V1DI_COUNT:
10673 case V1DI_FTYPE_V1DI_SI_COUNT:
10674 nargs = 2;
10675 second_arg_count = true;
10676 break;
10677 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
10678 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
10679 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
10680 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
10681 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
10682 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
10683 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
10684 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
10685 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
10686 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
10687 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
10688 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
10689 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
10690 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
10691 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
10692 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
10693 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
10694 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
10695 nargs = 4;
10696 second_arg_count = true;
10697 break;
10698 case UINT64_FTYPE_UINT64_UINT64:
10699 case UINT_FTYPE_UINT_UINT:
10700 case UINT_FTYPE_UINT_USHORT:
10701 case UINT_FTYPE_UINT_UCHAR:
10702 case UINT16_FTYPE_UINT16_INT:
10703 case UINT8_FTYPE_UINT8_INT:
10704 case UQI_FTYPE_UQI_UQI:
10705 case UHI_FTYPE_UHI_UHI:
10706 case USI_FTYPE_USI_USI:
10707 case UDI_FTYPE_UDI_UDI:
10708 case V16SI_FTYPE_V8DF_V8DF:
87235f1e 10709 case V32BF_FTYPE_V16SF_V16SF:
10710 case V16BF_FTYPE_V8SF_V8SF:
10711 case V8BF_FTYPE_V4SF_V4SF:
10712 case V16BF_FTYPE_V16SF_UHI:
10713 case V8BF_FTYPE_V8SF_UQI:
10714 case V8BF_FTYPE_V4SF_UQI:
10715 nargs = 2;
10716 break;
10717 case V2DI_FTYPE_V2DI_INT_CONVERT:
10718 nargs = 2;
10719 rmode = V1TImode;
10720 nargs_constant = 1;
10721 break;
10722 case V4DI_FTYPE_V4DI_INT_CONVERT:
10723 nargs = 2;
10724 rmode = V2TImode;
10725 nargs_constant = 1;
10726 break;
10727 case V8DI_FTYPE_V8DI_INT_CONVERT:
10728 nargs = 2;
10729 rmode = V4TImode;
10730 nargs_constant = 1;
10731 break;
10732 case V8HI_FTYPE_V8HI_INT:
10733 case V8HI_FTYPE_V8SF_INT:
10734 case V16HI_FTYPE_V16SF_INT:
10735 case V8HI_FTYPE_V4SF_INT:
10736 case V8SF_FTYPE_V8SF_INT:
10737 case V4SF_FTYPE_V16SF_INT:
10738 case V16SF_FTYPE_V16SF_INT:
10739 case V4SI_FTYPE_V4SI_INT:
10740 case V4SI_FTYPE_V8SI_INT:
10741 case V4HI_FTYPE_V4HI_INT:
10742 case V4DF_FTYPE_V4DF_INT:
10743 case V4DF_FTYPE_V8DF_INT:
10744 case V4SF_FTYPE_V4SF_INT:
10745 case V4SF_FTYPE_V8SF_INT:
10746 case V2DI_FTYPE_V2DI_INT:
10747 case V2DF_FTYPE_V2DF_INT:
10748 case V2DF_FTYPE_V4DF_INT:
10749 case V16HI_FTYPE_V16HI_INT:
10750 case V8SI_FTYPE_V8SI_INT:
10751 case V16SI_FTYPE_V16SI_INT:
10752 case V4SI_FTYPE_V16SI_INT:
10753 case V4DI_FTYPE_V4DI_INT:
10754 case V2DI_FTYPE_V4DI_INT:
10755 case V4DI_FTYPE_V8DI_INT:
10756 case UQI_FTYPE_UQI_UQI_CONST:
10757 case UHI_FTYPE_UHI_UQI:
10758 case USI_FTYPE_USI_UQI:
10759 case UDI_FTYPE_UDI_UQI:
10760 nargs = 2;
10761 nargs_constant = 1;
10762 break;
10763 case V16QI_FTYPE_V16QI_V16QI_V16QI:
10764 case V8SF_FTYPE_V8SF_V8SF_V8SF:
10765 case V4DF_FTYPE_V4DF_V4DF_V4DF:
10766 case V4SF_FTYPE_V4SF_V4SF_V4SF:
10767 case V2DF_FTYPE_V2DF_V2DF_V2DF:
10768 case V32QI_FTYPE_V32QI_V32QI_V32QI:
10769 case UHI_FTYPE_V16SI_V16SI_UHI:
10770 case UQI_FTYPE_V8DI_V8DI_UQI:
10771 case V16HI_FTYPE_V16SI_V16HI_UHI:
10772 case V16QI_FTYPE_V16SI_V16QI_UHI:
10773 case V16QI_FTYPE_V8DI_V16QI_UQI:
4204740f 10774 case V32HF_FTYPE_V32HF_V32HF_USI:
10775 case V16SF_FTYPE_V16SF_V16SF_UHI:
10776 case V16SF_FTYPE_V4SF_V16SF_UHI:
10777 case V16SI_FTYPE_SI_V16SI_UHI:
10778 case V16SI_FTYPE_V16HI_V16SI_UHI:
10779 case V16SI_FTYPE_V16QI_V16SI_UHI:
10780 case V8SF_FTYPE_V4SF_V8SF_UQI:
10781 case V4DF_FTYPE_V2DF_V4DF_UQI:
10782 case V8SI_FTYPE_V4SI_V8SI_UQI:
10783 case V8SI_FTYPE_SI_V8SI_UQI:
10784 case V4SI_FTYPE_V4SI_V4SI_UQI:
10785 case V4SI_FTYPE_SI_V4SI_UQI:
10786 case V4DI_FTYPE_V2DI_V4DI_UQI:
10787 case V4DI_FTYPE_DI_V4DI_UQI:
10788 case V2DI_FTYPE_V2DI_V2DI_UQI:
10789 case V2DI_FTYPE_DI_V2DI_UQI:
10790 case V64QI_FTYPE_V64QI_V64QI_UDI:
10791 case V64QI_FTYPE_V16QI_V64QI_UDI:
10792 case V64QI_FTYPE_QI_V64QI_UDI:
10793 case V32QI_FTYPE_V32QI_V32QI_USI:
10794 case V32QI_FTYPE_V16QI_V32QI_USI:
10795 case V32QI_FTYPE_QI_V32QI_USI:
10796 case V16QI_FTYPE_V16QI_V16QI_UHI:
10797 case V16QI_FTYPE_QI_V16QI_UHI:
10798 case V32HI_FTYPE_V8HI_V32HI_USI:
10799 case V32HI_FTYPE_HI_V32HI_USI:
10800 case V16HI_FTYPE_V8HI_V16HI_UHI:
10801 case V16HI_FTYPE_HI_V16HI_UHI:
10802 case V8HI_FTYPE_V8HI_V8HI_UQI:
10803 case V8HI_FTYPE_HI_V8HI_UQI:
4204740f 10804 case V16HF_FTYPE_V16HF_V16HF_UHI:
10805 case V8SF_FTYPE_V8HI_V8SF_UQI:
10806 case V4SF_FTYPE_V8HI_V4SF_UQI:
bd610db0 10807 case V8SI_FTYPE_V8HF_V8SI_UQI:
5a744e50 10808 case V8SF_FTYPE_V8HF_V8SF_UQI:
10809 case V8SI_FTYPE_V8SF_V8SI_UQI:
10810 case V4SI_FTYPE_V4SF_V4SI_UQI:
bd610db0 10811 case V4SI_FTYPE_V8HF_V4SI_UQI:
5a744e50 10812 case V4SF_FTYPE_V8HF_V4SF_UQI:
bd610db0 10813 case V4DI_FTYPE_V8HF_V4DI_UQI:
2bf6d935 10814 case V4DI_FTYPE_V4SF_V4DI_UQI:
bd610db0 10815 case V2DI_FTYPE_V8HF_V2DI_UQI:
2bf6d935 10816 case V2DI_FTYPE_V4SF_V2DI_UQI:
4204740f 10817 case V8HF_FTYPE_V8HF_V8HF_UQI:
081070bc 10818 case V8HF_FTYPE_V8HF_V8HF_V8HF:
be0e4c32 10819 case V8HF_FTYPE_V8HI_V8HF_UQI:
10820 case V8HF_FTYPE_V8SI_V8HF_UQI:
5a744e50 10821 case V8HF_FTYPE_V8SF_V8HF_UQI:
be0e4c32 10822 case V8HF_FTYPE_V4SI_V8HF_UQI:
5a744e50 10823 case V8HF_FTYPE_V4SF_V8HF_UQI:
be0e4c32 10824 case V8HF_FTYPE_V4DI_V8HF_UQI:
5a744e50 10825 case V8HF_FTYPE_V4DF_V8HF_UQI:
be0e4c32 10826 case V8HF_FTYPE_V2DI_V8HF_UQI:
5a744e50 10827 case V8HF_FTYPE_V2DF_V8HF_UQI:
10828 case V4SF_FTYPE_V4DI_V4SF_UQI:
10829 case V4SF_FTYPE_V2DI_V4SF_UQI:
10830 case V4DF_FTYPE_V4DI_V4DF_UQI:
5a744e50 10831 case V4DF_FTYPE_V8HF_V4DF_UQI:
10832 case V2DF_FTYPE_V8HF_V2DF_UQI:
10833 case V2DF_FTYPE_V2DI_V2DF_UQI:
10834 case V16QI_FTYPE_V8HI_V16QI_UQI:
10835 case V16QI_FTYPE_V16HI_V16QI_UHI:
10836 case V16QI_FTYPE_V4SI_V16QI_UQI:
10837 case V16QI_FTYPE_V8SI_V16QI_UQI:
bd610db0 10838 case V8HI_FTYPE_V8HF_V8HI_UQI:
10839 case V8HI_FTYPE_V4SI_V8HI_UQI:
10840 case V8HI_FTYPE_V8SI_V8HI_UQI:
10841 case V16QI_FTYPE_V2DI_V16QI_UQI:
10842 case V16QI_FTYPE_V4DI_V16QI_UQI:
10843 case V8HI_FTYPE_V2DI_V8HI_UQI:
10844 case V8HI_FTYPE_V4DI_V8HI_UQI:
10845 case V4SI_FTYPE_V2DI_V4SI_UQI:
10846 case V4SI_FTYPE_V4DI_V4SI_UQI:
10847 case V32QI_FTYPE_V32HI_V32QI_USI:
10848 case UHI_FTYPE_V16QI_V16QI_UHI:
10849 case USI_FTYPE_V32QI_V32QI_USI:
10850 case UDI_FTYPE_V64QI_V64QI_UDI:
10851 case UQI_FTYPE_V8HI_V8HI_UQI:
10852 case UHI_FTYPE_V16HI_V16HI_UHI:
10853 case USI_FTYPE_V32HI_V32HI_USI:
10854 case UQI_FTYPE_V4SI_V4SI_UQI:
10855 case UQI_FTYPE_V8SI_V8SI_UQI:
10856 case UQI_FTYPE_V2DI_V2DI_UQI:
10857 case UQI_FTYPE_V4DI_V4DI_UQI:
10858 case V4SF_FTYPE_V2DF_V4SF_UQI:
10859 case V4SF_FTYPE_V4DF_V4SF_UQI:
10860 case V16SI_FTYPE_V16SI_V16SI_UHI:
10861 case V16SI_FTYPE_V4SI_V16SI_UHI:
10862 case V2DI_FTYPE_V4SI_V2DI_UQI:
10863 case V2DI_FTYPE_V8HI_V2DI_UQI:
10864 case V2DI_FTYPE_V16QI_V2DI_UQI:
10865 case V4DI_FTYPE_V4DI_V4DI_UQI:
10866 case V4DI_FTYPE_V4SI_V4DI_UQI:
10867 case V4DI_FTYPE_V8HI_V4DI_UQI:
10868 case V4DI_FTYPE_V16QI_V4DI_UQI:
10869 case V4DI_FTYPE_V4DF_V4DI_UQI:
10870 case V2DI_FTYPE_V2DF_V2DI_UQI:
10871 case V4SI_FTYPE_V4DF_V4SI_UQI:
10872 case V4SI_FTYPE_V2DF_V4SI_UQI:
10873 case V4SI_FTYPE_V8HI_V4SI_UQI:
10874 case V4SI_FTYPE_V16QI_V4SI_UQI:
10875 case V4DI_FTYPE_V4DI_V4DI_V4DI:
10876 case V8DF_FTYPE_V2DF_V8DF_UQI:
10877 case V8DF_FTYPE_V4DF_V8DF_UQI:
10878 case V8DF_FTYPE_V8DF_V8DF_UQI:
10879 case V8SF_FTYPE_V8SF_V8SF_UQI:
10880 case V8SF_FTYPE_V8SI_V8SF_UQI:
10881 case V4DF_FTYPE_V4DF_V4DF_UQI:
10882 case V4SF_FTYPE_V4SF_V4SF_UQI:
10883 case V2DF_FTYPE_V2DF_V2DF_UQI:
10884 case V2DF_FTYPE_V4SF_V2DF_UQI:
10885 case V2DF_FTYPE_V4SI_V2DF_UQI:
10886 case V4SF_FTYPE_V4SI_V4SF_UQI:
10887 case V4DF_FTYPE_V4SF_V4DF_UQI:
10888 case V4DF_FTYPE_V4SI_V4DF_UQI:
10889 case V8SI_FTYPE_V8SI_V8SI_UQI:
10890 case V8SI_FTYPE_V8HI_V8SI_UQI:
10891 case V8SI_FTYPE_V16QI_V8SI_UQI:
10892 case V8DF_FTYPE_V8SI_V8DF_UQI:
10893 case V8DI_FTYPE_DI_V8DI_UQI:
10894 case V16SF_FTYPE_V8SF_V16SF_UHI:
10895 case V16SI_FTYPE_V8SI_V16SI_UHI:
be0e4c32 10896 case V16HF_FTYPE_V16HI_V16HF_UHI:
081070bc 10897 case V16HF_FTYPE_V16HF_V16HF_V16HF:
bd610db0 10898 case V16HI_FTYPE_V16HF_V16HI_UHI:
10899 case V16HI_FTYPE_V16HI_V16HI_UHI:
10900 case V8HI_FTYPE_V16QI_V8HI_UQI:
10901 case V16HI_FTYPE_V16QI_V16HI_UHI:
10902 case V32HI_FTYPE_V32HI_V32HI_USI:
10903 case V32HI_FTYPE_V32QI_V32HI_USI:
10904 case V8DI_FTYPE_V16QI_V8DI_UQI:
10905 case V8DI_FTYPE_V2DI_V8DI_UQI:
10906 case V8DI_FTYPE_V4DI_V8DI_UQI:
10907 case V8DI_FTYPE_V8DI_V8DI_UQI:
10908 case V8DI_FTYPE_V8HI_V8DI_UQI:
10909 case V8DI_FTYPE_V8SI_V8DI_UQI:
10910 case V8HI_FTYPE_V8DI_V8HI_UQI:
10911 case V8SI_FTYPE_V8DI_V8SI_UQI:
10912 case V4SI_FTYPE_V4SI_V4SI_V4SI:
10913 case V16SI_FTYPE_V16SI_V16SI_V16SI:
10914 case V8DI_FTYPE_V8DI_V8DI_V8DI:
10915 case V32HI_FTYPE_V32HI_V32HI_V32HI:
10916 case V2DI_FTYPE_V2DI_V2DI_V2DI:
10917 case V16HI_FTYPE_V16HI_V16HI_V16HI:
10918 case V8SI_FTYPE_V8SI_V8SI_V8SI:
10919 case V8HI_FTYPE_V8HI_V8HI_V8HI:
87235f1e 10920 case V32BF_FTYPE_V16SF_V16SF_USI:
10921 case V16BF_FTYPE_V8SF_V8SF_UHI:
10922 case V8BF_FTYPE_V4SF_V4SF_UQI:
10923 case V16BF_FTYPE_V16SF_V16BF_UHI:
10924 case V8BF_FTYPE_V8SF_V8BF_UQI:
10925 case V8BF_FTYPE_V4SF_V8BF_UQI:
10926 case V16SF_FTYPE_V16SF_V32BF_V32BF:
10927 case V8SF_FTYPE_V8SF_V16BF_V16BF:
10928 case V4SF_FTYPE_V4SF_V8BF_V8BF:
2bf6d935
ML
10929 nargs = 3;
10930 break;
10931 case V32QI_FTYPE_V32QI_V32QI_INT:
10932 case V16HI_FTYPE_V16HI_V16HI_INT:
10933 case V16QI_FTYPE_V16QI_V16QI_INT:
10934 case V4DI_FTYPE_V4DI_V4DI_INT:
10935 case V8HI_FTYPE_V8HI_V8HI_INT:
10936 case V8SI_FTYPE_V8SI_V8SI_INT:
10937 case V8SI_FTYPE_V8SI_V4SI_INT:
10938 case V8SF_FTYPE_V8SF_V8SF_INT:
10939 case V8SF_FTYPE_V8SF_V4SF_INT:
10940 case V4SI_FTYPE_V4SI_V4SI_INT:
10941 case V4DF_FTYPE_V4DF_V4DF_INT:
10942 case V16SF_FTYPE_V16SF_V16SF_INT:
10943 case V16SF_FTYPE_V16SF_V4SF_INT:
10944 case V16SI_FTYPE_V16SI_V4SI_INT:
10945 case V4DF_FTYPE_V4DF_V2DF_INT:
10946 case V4SF_FTYPE_V4SF_V4SF_INT:
10947 case V2DI_FTYPE_V2DI_V2DI_INT:
10948 case V4DI_FTYPE_V4DI_V2DI_INT:
10949 case V2DF_FTYPE_V2DF_V2DF_INT:
10950 case UQI_FTYPE_V8DI_V8UDI_INT:
10951 case UQI_FTYPE_V8DF_V8DF_INT:
10952 case UQI_FTYPE_V2DF_V2DF_INT:
10953 case UQI_FTYPE_V4SF_V4SF_INT:
10954 case UHI_FTYPE_V16SI_V16SI_INT:
10955 case UHI_FTYPE_V16SF_V16SF_INT:
10956 case V64QI_FTYPE_V64QI_V64QI_INT:
10957 case V32HI_FTYPE_V32HI_V32HI_INT:
10958 case V16SI_FTYPE_V16SI_V16SI_INT:
10959 case V8DI_FTYPE_V8DI_V8DI_INT:
10960 nargs = 3;
10961 nargs_constant = 1;
10962 break;
10963 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
10964 nargs = 3;
10965 rmode = V4DImode;
10966 nargs_constant = 1;
10967 break;
10968 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
10969 nargs = 3;
10970 rmode = V2DImode;
10971 nargs_constant = 1;
10972 break;
10973 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
10974 nargs = 3;
10975 rmode = DImode;
10976 nargs_constant = 1;
10977 break;
10978 case V2DI_FTYPE_V2DI_UINT_UINT:
10979 nargs = 3;
10980 nargs_constant = 2;
10981 break;
10982 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
10983 nargs = 3;
10984 rmode = V8DImode;
10985 nargs_constant = 1;
10986 break;
10987 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
10988 nargs = 5;
10989 rmode = V8DImode;
10990 mask_pos = 2;
10991 nargs_constant = 1;
10992 break;
10993 case QI_FTYPE_V8DF_INT_UQI:
10994 case QI_FTYPE_V4DF_INT_UQI:
10995 case QI_FTYPE_V2DF_INT_UQI:
10996 case HI_FTYPE_V16SF_INT_UHI:
10997 case QI_FTYPE_V8SF_INT_UQI:
10998 case QI_FTYPE_V4SF_INT_UQI:
8486e9f2 10999 case QI_FTYPE_V8HF_INT_UQI:
11000 case HI_FTYPE_V16HF_INT_UHI:
11001 case SI_FTYPE_V32HF_INT_USI:
2bf6d935
ML
11002 case V4SI_FTYPE_V4SI_V4SI_UHI:
11003 case V8SI_FTYPE_V8SI_V8SI_UHI:
11004 nargs = 3;
11005 mask_pos = 1;
11006 nargs_constant = 1;
11007 break;
11008 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
11009 nargs = 5;
11010 rmode = V4DImode;
11011 mask_pos = 2;
11012 nargs_constant = 1;
11013 break;
11014 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
11015 nargs = 5;
11016 rmode = V2DImode;
11017 mask_pos = 2;
11018 nargs_constant = 1;
11019 break;
11020 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
11021 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
11022 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
11023 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
11024 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
11025 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
11026 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
11027 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
11028 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
11029 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
11030 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
11031 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
11032 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
11033 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
11034 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
11035 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
bd7a34ef 11036 case V32HF_FTYPE_V32HF_V32HF_V32HF_USI:
2bf6d935
ML
11037 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
11038 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
11039 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
11040 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
11041 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
11042 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
11043 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
11044 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
11045 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
11046 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
11047 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
11048 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
11049 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
11050 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
11051 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
11052 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
11053 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
081070bc 11054 case V16HF_FTYPE_V16HF_V16HF_V16HF_UQI:
bd7a34ef 11055 case V16HF_FTYPE_V16HF_V16HF_V16HF_UHI:
2bf6d935
ML
11056 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
11057 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
11058 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
11059 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
11060 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
11061 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
11062 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
bd7a34ef 11063 case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI:
2bf6d935
ML
11064 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
11065 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
11066 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
11067 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
11068 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
11069 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
11070 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
11071 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
11072 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
11073 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
11074 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
87235f1e 11075 case V32BF_FTYPE_V16SF_V16SF_V32BF_USI:
11076 case V16BF_FTYPE_V8SF_V8SF_V16BF_UHI:
11077 case V8BF_FTYPE_V4SF_V4SF_V8BF_UQI:
2bf6d935
ML
11078 nargs = 4;
11079 break;
11080 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
11081 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
11082 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
11083 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
11084 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
11085 nargs = 4;
11086 nargs_constant = 1;
11087 break;
11088 case UQI_FTYPE_V4DI_V4DI_INT_UQI:
11089 case UQI_FTYPE_V8SI_V8SI_INT_UQI:
11090 case QI_FTYPE_V4DF_V4DF_INT_UQI:
11091 case QI_FTYPE_V8SF_V8SF_INT_UQI:
0f200733 11092 case UHI_FTYPE_V16HF_V16HF_INT_UHI:
2bf6d935
ML
11093 case UQI_FTYPE_V2DI_V2DI_INT_UQI:
11094 case UQI_FTYPE_V4SI_V4SI_INT_UQI:
11095 case UQI_FTYPE_V2DF_V2DF_INT_UQI:
11096 case UQI_FTYPE_V4SF_V4SF_INT_UQI:
0f200733 11097 case UQI_FTYPE_V8HF_V8HF_INT_UQI:
2bf6d935
ML
11098 case UDI_FTYPE_V64QI_V64QI_INT_UDI:
11099 case USI_FTYPE_V32QI_V32QI_INT_USI:
11100 case UHI_FTYPE_V16QI_V16QI_INT_UHI:
11101 case USI_FTYPE_V32HI_V32HI_INT_USI:
0f200733 11102 case USI_FTYPE_V32HF_V32HF_INT_USI:
2bf6d935
ML
11103 case UHI_FTYPE_V16HI_V16HI_INT_UHI:
11104 case UQI_FTYPE_V8HI_V8HI_INT_UQI:
2bf6d935
ML
11105 nargs = 4;
11106 mask_pos = 1;
11107 nargs_constant = 1;
11108 break;
11109 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
11110 nargs = 4;
11111 nargs_constant = 2;
11112 break;
11113 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
11114 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
87235f1e 11115 case V16SF_FTYPE_V16SF_V32BF_V32BF_UHI:
11116 case V8SF_FTYPE_V8SF_V16BF_V16BF_UQI:
11117 case V4SF_FTYPE_V4SF_V8BF_V8BF_UQI:
2bf6d935
ML
11118 nargs = 4;
11119 break;
11120 case UQI_FTYPE_V8DI_V8DI_INT_UQI:
11121 case UHI_FTYPE_V16SI_V16SI_INT_UHI:
11122 mask_pos = 1;
11123 nargs = 4;
11124 nargs_constant = 1;
11125 break;
11126 case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
11127 case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
11128 case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
11129 case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
11130 case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
11131 case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
11132 case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
11133 case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
11134 case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
11135 case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
11136 case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
11137 case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
11138 case V32HI_FTYPE_V32HI_INT_V32HI_USI:
11139 case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
11140 case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
11141 case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
11142 case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
11143 case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
11144 case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
11145 case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
11146 case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
11147 case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
11148 case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
11149 case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
11150 case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
8bed7617 11151 case V16HF_FTYPE_V16HF_INT_V16HF_UHI:
11152 case V8HF_FTYPE_V8HF_INT_V8HF_UQI:
2bf6d935
ML
11153 case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
11154 case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
11155 case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
11156 case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
11157 case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
11158 nargs = 4;
11159 mask_pos = 2;
11160 nargs_constant = 1;
11161 break;
11162 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
11163 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
11164 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
11165 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
11166 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
11167 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
11168 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
11169 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
11170 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
11171 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
11172 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
11173 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
11174 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
11175 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
11176 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
11177 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
11178 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
11179 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
11180 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
11181 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
11182 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
11183 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
11184 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
11185 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
11186 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
11187 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
11188 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
11189 nargs = 5;
11190 mask_pos = 2;
11191 nargs_constant = 1;
11192 break;
11193 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
11194 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
11195 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
11196 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
11197 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
11198 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
11199 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
11200 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
11201 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
11202 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
11203 nargs = 5;
11204 mask_pos = 1;
11205 nargs_constant = 1;
11206 break;
11207 case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
11208 case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
11209 case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
11210 case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT:
11211 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT:
11212 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT:
11213 case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT:
11214 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT:
11215 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT:
11216 case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT:
11217 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT:
11218 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT:
11219 nargs = 5;
11220 mask_pos = 1;
11221 nargs_constant = 2;
11222 break;
11223
11224 default:
11225 gcc_unreachable ();
11226 }
11227
715a8bc8 11228 gcc_assert (nargs <= ARRAY_SIZE (xops));
2bf6d935
ML
11229
11230 if (comparison != UNKNOWN)
11231 {
11232 gcc_assert (nargs == 2);
11233 return ix86_expand_sse_compare (d, exp, target, swap);
11234 }
11235
11236 if (rmode == VOIDmode || rmode == tmode)
11237 {
11238 if (optimize
11239 || target == 0
11240 || GET_MODE (target) != tmode
11241 || !insn_p->operand[0].predicate (target, tmode))
11242 target = gen_reg_rtx (tmode);
11243 else if (memory_operand (target, tmode))
11244 num_memory++;
11245 real_target = target;
11246 }
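 /* The builtin's declared result mode RMODE differs from the insn's
 TMODE: expand into a fresh TMODE register and hand back an RMODE
 view of it through a lowpart subreg. */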
11247 else
11248 {
11249 real_target = gen_reg_rtx (tmode);
11250 target = lowpart_subreg (rmode, real_target, tmode);
11251 }
11252
11253 for (i = 0; i < nargs; i++)
11254 {
11255 tree arg = CALL_EXPR_ARG (exp, i);
11256 rtx op = expand_normal (arg);
11257 machine_mode mode = insn_p->operand[i + 1].mode;
11258 bool match = insn_p->operand[i + 1].predicate (op, mode);
11259
11260 if (second_arg_count && i == 1)
11261 {
 11262 /* SIMD shift insns take either an 8-bit immediate or a
 11263 register as the count. But builtin functions take int as
 11264 the count. If the count doesn't match, put it in a register.
 11265 The instructions use a 64-bit count; if op is just
 11266 32-bit, zero-extend it, as negative shift counts
 11267 are undefined behavior and zero-extension is more
 11268 efficient. */
11269 if (!match)
11270 {
11271 if (SCALAR_INT_MODE_P (GET_MODE (op)))
11272 op = convert_modes (mode, GET_MODE (op), op, 1);
11273 else
11274 op = lowpart_subreg (mode, op, GET_MODE (op));
11275 if (!insn_p->operand[i + 1].predicate (op, mode))
11276 op = copy_to_reg (op);
11277 }
11278 }
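 /* Positions holding required immediates: with MASK_POS set the
 immediate sits MASK_POS operands before the end (the trailing mask
 operands are handled as ordinary register operands below);
 otherwise the last NARGS_CONSTANT operands are immediates. If the
 operand does not satisfy its predicate, diagnose the immediate
 width expected by this icode. */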
11279 else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
11280 (!mask_pos && (nargs - i) <= nargs_constant))
11281 {
11282 if (!match)
11283 switch (icode)
11284 {
11285 case CODE_FOR_avx_vinsertf128v4di:
11286 case CODE_FOR_avx_vextractf128v4di:
 11287 error ("the last argument must be a 1-bit immediate");
11288 return const0_rtx;
11289
11290 case CODE_FOR_avx512f_cmpv8di3_mask:
11291 case CODE_FOR_avx512f_cmpv16si3_mask:
11292 case CODE_FOR_avx512f_ucmpv8di3_mask:
11293 case CODE_FOR_avx512f_ucmpv16si3_mask:
11294 case CODE_FOR_avx512vl_cmpv4di3_mask:
11295 case CODE_FOR_avx512vl_cmpv8si3_mask:
11296 case CODE_FOR_avx512vl_ucmpv4di3_mask:
11297 case CODE_FOR_avx512vl_ucmpv8si3_mask:
11298 case CODE_FOR_avx512vl_cmpv2di3_mask:
11299 case CODE_FOR_avx512vl_cmpv4si3_mask:
11300 case CODE_FOR_avx512vl_ucmpv2di3_mask:
11301 case CODE_FOR_avx512vl_ucmpv4si3_mask:
11302 error ("the last argument must be a 3-bit immediate");
11303 return const0_rtx;
11304
11305 case CODE_FOR_sse4_1_roundsd:
11306 case CODE_FOR_sse4_1_roundss:
11307
11308 case CODE_FOR_sse4_1_roundpd:
11309 case CODE_FOR_sse4_1_roundps:
11310 case CODE_FOR_avx_roundpd256:
11311 case CODE_FOR_avx_roundps256:
11312
11313 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
11314 case CODE_FOR_sse4_1_roundps_sfix:
11315 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
11316 case CODE_FOR_avx_roundps_sfix256:
11317
11318 case CODE_FOR_sse4_1_blendps:
11319 case CODE_FOR_avx_blendpd256:
11320 case CODE_FOR_avx_vpermilv4df:
11321 case CODE_FOR_avx_vpermilv4df_mask:
11322 case CODE_FOR_avx512f_getmantv8df_mask:
11323 case CODE_FOR_avx512f_getmantv16sf_mask:
8486e9f2 11324 case CODE_FOR_avx512vl_getmantv16hf_mask:
2bf6d935
ML
11325 case CODE_FOR_avx512vl_getmantv8sf_mask:
11326 case CODE_FOR_avx512vl_getmantv4df_mask:
8486e9f2 11327 case CODE_FOR_avx512fp16_getmantv8hf_mask:
2bf6d935
ML
11328 case CODE_FOR_avx512vl_getmantv4sf_mask:
11329 case CODE_FOR_avx512vl_getmantv2df_mask:
11330 case CODE_FOR_avx512dq_rangepv8df_mask_round:
11331 case CODE_FOR_avx512dq_rangepv16sf_mask_round:
11332 case CODE_FOR_avx512dq_rangepv4df_mask:
11333 case CODE_FOR_avx512dq_rangepv8sf_mask:
11334 case CODE_FOR_avx512dq_rangepv2df_mask:
11335 case CODE_FOR_avx512dq_rangepv4sf_mask:
11336 case CODE_FOR_avx_shufpd256_mask:
11337 error ("the last argument must be a 4-bit immediate");
11338 return const0_rtx;
11339
11340 case CODE_FOR_sha1rnds4:
11341 case CODE_FOR_sse4_1_blendpd:
11342 case CODE_FOR_avx_vpermilv2df:
11343 case CODE_FOR_avx_vpermilv2df_mask:
11344 case CODE_FOR_xop_vpermil2v2df3:
11345 case CODE_FOR_xop_vpermil2v4sf3:
11346 case CODE_FOR_xop_vpermil2v4df3:
11347 case CODE_FOR_xop_vpermil2v8sf3:
11348 case CODE_FOR_avx512f_vinsertf32x4_mask:
11349 case CODE_FOR_avx512f_vinserti32x4_mask:
11350 case CODE_FOR_avx512f_vextractf32x4_mask:
11351 case CODE_FOR_avx512f_vextracti32x4_mask:
11352 case CODE_FOR_sse2_shufpd:
11353 case CODE_FOR_sse2_shufpd_mask:
11354 case CODE_FOR_avx512dq_shuf_f64x2_mask:
11355 case CODE_FOR_avx512dq_shuf_i64x2_mask:
11356 case CODE_FOR_avx512vl_shuf_i32x4_mask:
11357 case CODE_FOR_avx512vl_shuf_f32x4_mask:
11358 error ("the last argument must be a 2-bit immediate");
11359 return const0_rtx;
11360
11361 case CODE_FOR_avx_vextractf128v4df:
11362 case CODE_FOR_avx_vextractf128v8sf:
11363 case CODE_FOR_avx_vextractf128v8si:
11364 case CODE_FOR_avx_vinsertf128v4df:
11365 case CODE_FOR_avx_vinsertf128v8sf:
11366 case CODE_FOR_avx_vinsertf128v8si:
11367 case CODE_FOR_avx512f_vinsertf64x4_mask:
11368 case CODE_FOR_avx512f_vinserti64x4_mask:
11369 case CODE_FOR_avx512f_vextractf64x4_mask:
11370 case CODE_FOR_avx512f_vextracti64x4_mask:
11371 case CODE_FOR_avx512dq_vinsertf32x8_mask:
11372 case CODE_FOR_avx512dq_vinserti32x8_mask:
11373 case CODE_FOR_avx512vl_vinsertv4df:
11374 case CODE_FOR_avx512vl_vinsertv4di:
11375 case CODE_FOR_avx512vl_vinsertv8sf:
11376 case CODE_FOR_avx512vl_vinsertv8si:
11377 error ("the last argument must be a 1-bit immediate");
11378 return const0_rtx;
11379
11380 case CODE_FOR_avx_vmcmpv2df3:
11381 case CODE_FOR_avx_vmcmpv4sf3:
11382 case CODE_FOR_avx_cmpv2df3:
11383 case CODE_FOR_avx_cmpv4sf3:
11384 case CODE_FOR_avx_cmpv4df3:
11385 case CODE_FOR_avx_cmpv8sf3:
11386 case CODE_FOR_avx512f_cmpv8df3_mask:
11387 case CODE_FOR_avx512f_cmpv16sf3_mask:
11388 case CODE_FOR_avx512f_vmcmpv2df3_mask:
11389 case CODE_FOR_avx512f_vmcmpv4sf3_mask:
0f200733 11390 case CODE_FOR_avx512bw_cmpv32hf3_mask:
11391 case CODE_FOR_avx512vl_cmpv16hf3_mask:
11392 case CODE_FOR_avx512fp16_cmpv8hf3_mask:
2bf6d935
ML
11393 error ("the last argument must be a 5-bit immediate");
11394 return const0_rtx;
11395
11396 default:
11397 switch (nargs_constant)
11398 {
11399 case 2:
11400 if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
11401 (!mask_pos && (nargs - i) == nargs_constant))
11402 {
11403 error ("the next to last argument must be an 8-bit immediate");
11404 break;
11405 }
11406 /* FALLTHRU */
11407 case 1:
11408 error ("the last argument must be an 8-bit immediate");
11409 break;
11410 default:
11411 gcc_unreachable ();
11412 }
11413 return const0_rtx;
11414 }
11415 }
11416 else
11417 {
11418 if (VECTOR_MODE_P (mode))
11419 op = safe_vector_operand (op, mode);
11420
11421 /* If we aren't optimizing, only allow one memory operand to
11422 be generated. */
11423 if (memory_operand (op, mode))
11424 num_memory++;
11425
11426 op = fixup_modeless_constant (op, mode);
11427
11428 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
11429 {
11430 if (optimize || !match || num_memory > 1)
11431 op = copy_to_mode_reg (mode, op);
11432 }
11433 else
11434 {
11435 op = copy_to_reg (op);
11436 op = lowpart_subreg (mode, op, GET_MODE (op));
11437 }
11438 }
11439
715a8bc8 11440 xops[i] = op;
2bf6d935
ML
11441 }
11442
11443 switch (nargs)
11444 {
11445 case 1:
715a8bc8 11446 pat = GEN_FCN (icode) (real_target, xops[0]);
2bf6d935
ML
11447 break;
11448 case 2:
715a8bc8 11449 pat = GEN_FCN (icode) (real_target, xops[0], xops[1]);
2bf6d935
ML
11450 break;
11451 case 3:
715a8bc8 11452 pat = GEN_FCN (icode) (real_target, xops[0], xops[1], xops[2]);
2bf6d935
ML
11453 break;
11454 case 4:
715a8bc8
UB
11455 pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
11456 xops[2], xops[3]);
2bf6d935
ML
11457 break;
11458 case 5:
715a8bc8
UB
11459 pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
11460 xops[2], xops[3], xops[4]);
2bf6d935
ML
11461 break;
11462 case 6:
715a8bc8
UB
11463 pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
11464 xops[2], xops[3], xops[4], xops[5]);
2bf6d935
ML
11465 break;
11466 default:
11467 gcc_unreachable ();
11468 }
11469
11470 if (! pat)
11471 return 0;
11472
11473 emit_insn (pat);
11474 return target;
11475}
11476
11477/* Transform a pattern of the following layout:
 11478 (set A
 11479 (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
 11480 into:
 11481 (set A B)
 11482 */
11483
11484static rtx
11485ix86_erase_embedded_rounding (rtx pat)
11486{
11487 if (GET_CODE (pat) == INSN)
11488 pat = PATTERN (pat);
11489
11490 gcc_assert (GET_CODE (pat) == SET);
11491 rtx src = SET_SRC (pat);
11492 gcc_assert (XVECLEN (src, 0) == 2);
11493 rtx p0 = XVECEXP (src, 0, 0);
11494 gcc_assert (GET_CODE (src) == UNSPEC
11495 && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
11496 rtx res = gen_rtx_SET (SET_DEST (pat), p0);
11497 return res;
11498}
11499
11500/* Subroutine of ix86_expand_round_builtin to take care of comi insns
11501 with rounding. */
11502static rtx
11503ix86_expand_sse_comi_round (const struct builtin_description *d,
11504 tree exp, rtx target)
11505{
11506 rtx pat, set_dst;
11507 tree arg0 = CALL_EXPR_ARG (exp, 0);
11508 tree arg1 = CALL_EXPR_ARG (exp, 1);
11509 tree arg2 = CALL_EXPR_ARG (exp, 2);
11510 tree arg3 = CALL_EXPR_ARG (exp, 3);
11511 rtx op0 = expand_normal (arg0);
11512 rtx op1 = expand_normal (arg1);
11513 rtx op2 = expand_normal (arg2);
11514 rtx op3 = expand_normal (arg3);
11515 enum insn_code icode = d->icode;
11516 const struct insn_data_d *insn_p = &insn_data[icode];
11517 machine_mode mode0 = insn_p->operand[0].mode;
11518 machine_mode mode1 = insn_p->operand[1].mode;
2bf6d935
ML
11519
11520 /* See avxintrin.h for values. */
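 /* The table index is the 5-bit _CMP_* predicate value, 0x00
 (_CMP_EQ_OQ) through 0x1f (_CMP_TRUE_US); entries 16..31 repeat
 the rtx codes of 0..15 and differ only in their ordered/signaling
 behavior, captured in the two tables that follow. */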
467e9f38 11521 static const enum rtx_code comparisons[32] =
2bf6d935 11522 {
467e9f38
L
11523 EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
11524 UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED,
11525 EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
11526 UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED
2bf6d935 11527 };
467e9f38
L
11528 static const bool ordereds[32] =
11529 {
11530 true, true, true, false, false, false, false, true,
11531 false, false, false, true, true, true, true, false,
11532 true, true, true, false, false, false, false, true,
11533 false, false, false, true, true, true, true, false
11534 };
11535 static const bool non_signalings[32] =
2bf6d935
ML
11536 {
11537 true, false, false, true, true, false, false, true,
11538 true, false, false, true, true, false, false, true,
11539 false, true, true, false, false, true, true, false,
11540 false, true, true, false, false, true, true, false
11541 };
11542
11543 if (!CONST_INT_P (op2))
11544 {
 11545 error ("the third argument must be a comparison constant");
11546 return const0_rtx;
11547 }
11548 if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
11549 {
11550 error ("incorrect comparison mode");
11551 return const0_rtx;
11552 }
11553
11554 if (!insn_p->operand[2].predicate (op3, SImode))
11555 {
11556 error ("incorrect rounding operand");
11557 return const0_rtx;
11558 }
11559
2bf6d935
ML
11560 if (VECTOR_MODE_P (mode0))
11561 op0 = safe_vector_operand (op0, mode0);
11562 if (VECTOR_MODE_P (mode1))
11563 op1 = safe_vector_operand (op1, mode1);
11564
467e9f38
L
11565 enum rtx_code comparison = comparisons[INTVAL (op2)];
11566 bool ordered = ordereds[INTVAL (op2)];
11567 bool non_signaling = non_signalings[INTVAL (op2)];
11568 rtx const_val = const0_rtx;
11569
11570 bool check_unordered = false;
11571 machine_mode mode = CCFPmode;
11572 switch (comparison)
11573 {
11574 case ORDERED:
11575 if (!ordered)
11576 {
11577 /* NB: Use CCSmode/NE for _CMP_TRUE_UQ/_CMP_TRUE_US. */
11578 if (!non_signaling)
11579 ordered = true;
11580 mode = CCSmode;
11581 }
11582 else
11583 {
11584 /* NB: Use CCPmode/NE for _CMP_ORD_Q/_CMP_ORD_S. */
11585 if (non_signaling)
11586 ordered = false;
11587 mode = CCPmode;
11588 }
11589 comparison = NE;
11590 break;
11591 case UNORDERED:
11592 if (ordered)
11593 {
11594 /* NB: Use CCSmode/EQ for _CMP_FALSE_OQ/_CMP_FALSE_OS. */
11595 if (non_signaling)
11596 ordered = false;
11597 mode = CCSmode;
11598 }
11599 else
11600 {
11601 /* NB: Use CCPmode/NE for _CMP_UNORD_Q/_CMP_UNORD_S. */
11602 if (!non_signaling)
11603 ordered = true;
11604 mode = CCPmode;
11605 }
11606 comparison = EQ;
11607 break;
11608
11609 case LE: /* -> GE */
11610 case LT: /* -> GT */
11611 case UNGE: /* -> UNLE */
11612 case UNGT: /* -> UNLT */
11613 std::swap (op0, op1);
11614 comparison = swap_condition (comparison);
11615 /* FALLTHRU */
11616 case GT:
11617 case GE:
11618 case UNEQ:
11619 case UNLT:
11620 case UNLE:
11621 case LTGT:
11622 /* These are supported by CCFPmode. NB: Use ordered/signaling
11623 COMI or unordered/non-signaling UCOMI. Both set ZF, PF, CF
11624 with NAN operands. */
11625 if (ordered == non_signaling)
11626 ordered = !ordered;
11627 break;
11628 case EQ:
11629 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
11630 _CMP_EQ_OQ/_CMP_EQ_OS. */
11631 check_unordered = true;
11632 mode = CCZmode;
11633 break;
11634 case NE:
11635 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
11636 _CMP_NEQ_UQ/_CMP_NEQ_US. */
11637 gcc_assert (!ordered);
11638 check_unordered = true;
11639 mode = CCZmode;
11640 const_val = const1_rtx;
11641 break;
11642 default:
11643 gcc_unreachable ();
11644 }
11645
2bf6d935 11646 target = gen_reg_rtx (SImode);
467e9f38 11647 emit_move_insn (target, const_val);
2bf6d935
ML
11648 target = gen_rtx_SUBREG (QImode, target, 0);
11649
11650 if ((optimize && !register_operand (op0, mode0))
11651 || !insn_p->operand[0].predicate (op0, mode0))
11652 op0 = copy_to_mode_reg (mode0, op0);
11653 if ((optimize && !register_operand (op1, mode1))
11654 || !insn_p->operand[1].predicate (op1, mode1))
11655 op1 = copy_to_mode_reg (mode1, op1);
11656
467e9f38
L
11657 /*
11658 1. COMI: ordered and signaling.
11659 2. UCOMI: unordered and non-signaling.
11660 */
11661 if (non_signaling)
11662 icode = (icode == CODE_FOR_sse_comi_round
11663 ? CODE_FOR_sse_ucomi_round
11664 : CODE_FOR_sse2_ucomi_round);
2bf6d935
ML
11665
11666 pat = GEN_FCN (icode) (op0, op1, op3);
11667 if (! pat)
11668 return 0;
11669
11670 /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
11671 if (INTVAL (op3) == NO_ROUND)
11672 {
11673 pat = ix86_erase_embedded_rounding (pat);
11674 if (! pat)
11675 return 0;
11676
11677 set_dst = SET_DEST (pat);
11678 }
11679 else
11680 {
11681 gcc_assert (GET_CODE (pat) == SET);
11682 set_dst = SET_DEST (pat);
11683 }
11684
11685 emit_insn (pat);
467e9f38 11686
ae69e6f6 11687 return ix86_ssecom_setcc (comparison, check_unordered, mode,
11688 set_dst, target);
2bf6d935
ML
11689}
11690
11691static rtx
11692ix86_expand_round_builtin (const struct builtin_description *d,
11693 tree exp, rtx target)
11694{
11695 rtx pat;
11696 unsigned int i, nargs;
715a8bc8 11697 rtx xops[6];
2bf6d935
ML
11698 enum insn_code icode = d->icode;
11699 const struct insn_data_d *insn_p = &insn_data[icode];
11700 machine_mode tmode = insn_p->operand[0].mode;
11701 unsigned int nargs_constant = 0;
11702 unsigned int redundant_embed_rnd = 0;
11703
11704 switch ((enum ix86_builtin_func_type) d->flag)
11705 {
11706 case UINT64_FTYPE_V2DF_INT:
11707 case UINT64_FTYPE_V4SF_INT:
3069a2e5 11708 case UINT64_FTYPE_V8HF_INT:
2bf6d935
ML
11709 case UINT_FTYPE_V2DF_INT:
11710 case UINT_FTYPE_V4SF_INT:
3069a2e5 11711 case UINT_FTYPE_V8HF_INT:
2bf6d935
ML
11712 case INT64_FTYPE_V2DF_INT:
11713 case INT64_FTYPE_V4SF_INT:
3069a2e5 11714 case INT64_FTYPE_V8HF_INT:
2bf6d935
ML
11715 case INT_FTYPE_V2DF_INT:
11716 case INT_FTYPE_V4SF_INT:
3069a2e5 11717 case INT_FTYPE_V8HF_INT:
2bf6d935
ML
11718 nargs = 2;
11719 break;
bd7a34ef 11720 case V32HF_FTYPE_V32HF_V32HF_INT:
71838266 11721 case V8HF_FTYPE_V8HF_V8HF_INT:
3069a2e5 11722 case V8HF_FTYPE_V8HF_INT_INT:
11723 case V8HF_FTYPE_V8HF_UINT_INT:
11724 case V8HF_FTYPE_V8HF_INT64_INT:
11725 case V8HF_FTYPE_V8HF_UINT64_INT:
2bf6d935
ML
11726 case V4SF_FTYPE_V4SF_UINT_INT:
11727 case V4SF_FTYPE_V4SF_UINT64_INT:
11728 case V2DF_FTYPE_V2DF_UINT64_INT:
11729 case V4SF_FTYPE_V4SF_INT_INT:
11730 case V4SF_FTYPE_V4SF_INT64_INT:
11731 case V2DF_FTYPE_V2DF_INT64_INT:
11732 case V4SF_FTYPE_V4SF_V4SF_INT:
11733 case V2DF_FTYPE_V2DF_V2DF_INT:
11734 case V4SF_FTYPE_V4SF_V2DF_INT:
11735 case V2DF_FTYPE_V2DF_V4SF_INT:
11736 nargs = 3;
11737 break;
11738 case V8SF_FTYPE_V8DF_V8SF_QI_INT:
11739 case V8DF_FTYPE_V8DF_V8DF_QI_INT:
bd610db0 11740 case V32HI_FTYPE_V32HF_V32HI_USI_INT:
2bf6d935 11741 case V8SI_FTYPE_V8DF_V8SI_QI_INT:
bd610db0 11742 case V8DI_FTYPE_V8HF_V8DI_UQI_INT:
2bf6d935
ML
11743 case V8DI_FTYPE_V8DF_V8DI_QI_INT:
11744 case V8SF_FTYPE_V8DI_V8SF_QI_INT:
11745 case V8DF_FTYPE_V8DI_V8DF_QI_INT:
5a744e50 11746 case V8DF_FTYPE_V8HF_V8DF_UQI_INT:
11747 case V16SF_FTYPE_V16HF_V16SF_UHI_INT:
be0e4c32 11748 case V32HF_FTYPE_V32HI_V32HF_USI_INT:
4204740f 11749 case V32HF_FTYPE_V32HF_V32HF_USI_INT:
081070bc 11750 case V32HF_FTYPE_V32HF_V32HF_V32HF_INT:
2bf6d935
ML
11751 case V16SF_FTYPE_V16SF_V16SF_HI_INT:
11752 case V8DI_FTYPE_V8SF_V8DI_QI_INT:
11753 case V16SF_FTYPE_V16SI_V16SF_HI_INT:
11754 case V16SI_FTYPE_V16SF_V16SI_HI_INT:
bd610db0 11755 case V16SI_FTYPE_V16HF_V16SI_UHI_INT:
be0e4c32 11756 case V16HF_FTYPE_V16SI_V16HF_UHI_INT:
2bf6d935
ML
11757 case V8DF_FTYPE_V8SF_V8DF_QI_INT:
11758 case V16SF_FTYPE_V16HI_V16SF_HI_INT:
11759 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
11760 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
be0e4c32 11761 case V8HF_FTYPE_V8DI_V8HF_UQI_INT:
5a744e50 11762 case V8HF_FTYPE_V8DF_V8HF_UQI_INT:
11763 case V16HF_FTYPE_V16SF_V16HF_UHI_INT:
3c9de0a9 11764 case V8HF_FTYPE_V8HF_V8HF_V8HF_INT:
2bf6d935
ML
11765 nargs = 4;
11766 break;
11767 case V4SF_FTYPE_V4SF_V4SF_INT_INT:
11768 case V2DF_FTYPE_V2DF_V2DF_INT_INT:
11769 nargs_constant = 2;
11770 nargs = 4;
11771 break;
11772 case INT_FTYPE_V4SF_V4SF_INT_INT:
11773 case INT_FTYPE_V2DF_V2DF_INT_INT:
11774 return ix86_expand_sse_comi_round (d, exp, target);
11775 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
11776 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
11777 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
90429b96 11778 case V4SF_FTYPE_V8HF_V4SF_V4SF_UQI_INT:
2bf6d935 11779 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
081070bc 11780 case V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT:
bd7a34ef 11781 case V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT:
90429b96 11782 case V2DF_FTYPE_V8HF_V2DF_V2DF_UQI_INT:
2bf6d935
ML
11783 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
11784 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
93103603 11785 case V2DF_FTYPE_V2DF_V4SF_V2DF_UQI_INT:
2bf6d935
ML
11786 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
11787 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
93103603 11788 case V4SF_FTYPE_V4SF_V2DF_V4SF_UQI_INT:
71838266 11789 case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT:
90429b96 11790 case V8HF_FTYPE_V2DF_V8HF_V8HF_UQI_INT:
11791 case V8HF_FTYPE_V4SF_V8HF_V8HF_UQI_INT:
2bf6d935
ML
11792 nargs = 5;
11793 break;
8bed7617 11794 case V32HF_FTYPE_V32HF_INT_V32HF_USI_INT:
2bf6d935
ML
11795 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
11796 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
93103603
SP
11797 case V8DF_FTYPE_V8DF_INT_V8DF_UQI_INT:
11798 case V16SF_FTYPE_V16SF_INT_V16SF_UHI_INT:
2bf6d935
ML
11799 nargs_constant = 4;
11800 nargs = 5;
11801 break;
11802 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
11803 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
11804 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
11805 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
0f200733 11806 case USI_FTYPE_V32HF_V32HF_INT_USI_INT:
11807 case UQI_FTYPE_V8HF_V8HF_INT_UQI_INT:
2bf6d935
ML
11808 nargs_constant = 3;
11809 nargs = 5;
11810 break;
11811 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
11812 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
11813 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
11814 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
11815 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
11816 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
8bed7617 11817 case V8HF_FTYPE_V8HF_V8HF_INT_V8HF_UQI_INT:
2bf6d935
ML
11818 nargs = 6;
11819 nargs_constant = 4;
11820 break;
11821 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
11822 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
11823 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
11824 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
11825 nargs = 6;
11826 nargs_constant = 3;
11827 break;
11828 default:
11829 gcc_unreachable ();
11830 }
715a8bc8 11831 gcc_assert (nargs <= ARRAY_SIZE (xops));
2bf6d935
ML
11832
11833 if (optimize
11834 || target == 0
11835 || GET_MODE (target) != tmode
11836 || !insn_p->operand[0].predicate (target, tmode))
11837 target = gen_reg_rtx (tmode);
11838
11839 for (i = 0; i < nargs; i++)
11840 {
11841 tree arg = CALL_EXPR_ARG (exp, i);
11842 rtx op = expand_normal (arg);
11843 machine_mode mode = insn_p->operand[i + 1].mode;
11844 bool match = insn_p->operand[i + 1].predicate (op, mode);
11845
11846 if (i == nargs - nargs_constant)
11847 {
11848 if (!match)
11849 {
11850 switch (icode)
11851 {
11852 case CODE_FOR_avx512f_getmantv8df_mask_round:
11853 case CODE_FOR_avx512f_getmantv16sf_mask_round:
8486e9f2 11854 case CODE_FOR_avx512bw_getmantv32hf_mask_round:
2bf6d935
ML
11855 case CODE_FOR_avx512f_vgetmantv2df_round:
11856 case CODE_FOR_avx512f_vgetmantv2df_mask_round:
11857 case CODE_FOR_avx512f_vgetmantv4sf_round:
11858 case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
8486e9f2 11859 case CODE_FOR_avx512f_vgetmantv8hf_mask_round:
2bf6d935
ML
11860 error ("the immediate argument must be a 4-bit immediate");
11861 return const0_rtx;
11862 case CODE_FOR_avx512f_cmpv8df3_mask_round:
11863 case CODE_FOR_avx512f_cmpv16sf3_mask_round:
11864 case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
11865 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
0f200733 11866 case CODE_FOR_avx512f_vmcmpv8hf3_mask_round:
11867 case CODE_FOR_avx512bw_cmpv32hf3_mask_round:
2bf6d935
ML
11868 error ("the immediate argument must be a 5-bit immediate");
11869 return const0_rtx;
11870 default:
11871 error ("the immediate argument must be an 8-bit immediate");
11872 return const0_rtx;
11873 }
11874 }
11875 }
11876 else if (i == nargs-1)
11877 {
11878 if (!insn_p->operand[nargs].predicate (op, SImode))
11879 {
11880 error ("incorrect rounding operand");
11881 return const0_rtx;
11882 }
11883
 11884 /* If there is no rounding, use the normal version of the pattern. */
11885 if (INTVAL (op) == NO_ROUND)
2f9529fc
HW
11886 {
 11887 /* Skip erasing embedded rounding for the expanders below, which
 11888 generate multiple insns. In ix86_erase_embedded_rounding
 11889 the pattern would be transformed to a single set, and emit_insn
 11890 appends that set instead of inserting it into the chain, so the
 11891 insns emitted inside the define_expand would be ignored. */
11892 switch (icode)
11893 {
11894 case CODE_FOR_avx512bw_fmaddc_v32hf_mask1_round:
11895 case CODE_FOR_avx512bw_fcmaddc_v32hf_mask1_round:
11896 case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask1_round:
11897 case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask1_round:
11898 case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask3_round:
11899 case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask3_round:
11900 redundant_embed_rnd = 0;
11901 break;
11902 default:
11903 redundant_embed_rnd = 1;
11904 break;
11905 }
11906 }
2bf6d935
ML
11907 }
11908 else
11909 {
11910 if (VECTOR_MODE_P (mode))
11911 op = safe_vector_operand (op, mode);
11912
11913 op = fixup_modeless_constant (op, mode);
11914
11915 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
11916 {
11917 if (optimize || !match)
11918 op = copy_to_mode_reg (mode, op);
11919 }
11920 else
11921 {
11922 op = copy_to_reg (op);
11923 op = lowpart_subreg (mode, op, GET_MODE (op));
11924 }
11925 }
11926
715a8bc8 11927 xops[i] = op;
2bf6d935
ML
11928 }
11929
11930 switch (nargs)
11931 {
11932 case 1:
715a8bc8 11933 pat = GEN_FCN (icode) (target, xops[0]);
2bf6d935
ML
11934 break;
11935 case 2:
715a8bc8 11936 pat = GEN_FCN (icode) (target, xops[0], xops[1]);
2bf6d935
ML
11937 break;
11938 case 3:
715a8bc8 11939 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
2bf6d935
ML
11940 break;
11941 case 4:
715a8bc8
UB
11942 pat = GEN_FCN (icode) (target, xops[0], xops[1],
11943 xops[2], xops[3]);
2bf6d935
ML
11944 break;
11945 case 5:
715a8bc8
UB
11946 pat = GEN_FCN (icode) (target, xops[0], xops[1],
11947 xops[2], xops[3], xops[4]);
2bf6d935
ML
11948 break;
11949 case 6:
715a8bc8
UB
11950 pat = GEN_FCN (icode) (target, xops[0], xops[1],
11951 xops[2], xops[3], xops[4], xops[5]);
2bf6d935
ML
11952 break;
11953 default:
11954 gcc_unreachable ();
11955 }
11956
11957 if (!pat)
11958 return 0;
11959
11960 if (redundant_embed_rnd)
11961 pat = ix86_erase_embedded_rounding (pat);
11962
11963 emit_insn (pat);
11964 return target;
11965}
11966
11967/* Subroutine of ix86_expand_builtin to take care of special insns
11968 with variable number of operands. */
11969
11970static rtx
11971ix86_expand_special_args_builtin (const struct builtin_description *d,
11972 tree exp, rtx target)
11973{
11974 tree arg;
11975 rtx pat, op;
11976 unsigned int i, nargs, arg_adjust, memory;
152834fe 11977 unsigned int constant = 100;
2bf6d935 11978 bool aligned_mem = false;
152834fe 11979 rtx xops[4];
2bf6d935 11980 enum insn_code icode = d->icode;
2bf6d935
ML
11981 const struct insn_data_d *insn_p = &insn_data[icode];
11982 machine_mode tmode = insn_p->operand[0].mode;
11983 enum { load, store } klass;
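 /* MEMORY is the index within XOPS of the memory operand; when the
 memory operand is instead the store target, it is set to
 ARRAY_SIZE (xops) so that no XOPS entry matches. CONSTANT is the
 index of a required immediate argument (100 means there is none),
 and ALIGNED_MEM marks builtins whose memory operand must be
 naturally aligned. */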
11984
11985 switch ((enum ix86_builtin_func_type) d->flag)
11986 {
11987 case VOID_FTYPE_VOID:
11988 emit_insn (GEN_FCN (icode) (target));
11989 return 0;
11990 case VOID_FTYPE_UINT64:
11991 case VOID_FTYPE_UNSIGNED:
11992 nargs = 0;
11993 klass = store;
11994 memory = 0;
11995 break;
11996
11997 case INT_FTYPE_VOID:
11998 case USHORT_FTYPE_VOID:
11999 case UINT64_FTYPE_VOID:
12000 case UINT_FTYPE_VOID:
299a53d7 12001 case UINT8_FTYPE_VOID:
2bf6d935
ML
12002 case UNSIGNED_FTYPE_VOID:
12003 nargs = 0;
12004 klass = load;
12005 memory = 0;
12006 break;
12007 case UINT64_FTYPE_PUNSIGNED:
12008 case V2DI_FTYPE_PV2DI:
12009 case V4DI_FTYPE_PV4DI:
12010 case V32QI_FTYPE_PCCHAR:
12011 case V16QI_FTYPE_PCCHAR:
12012 case V8SF_FTYPE_PCV4SF:
12013 case V8SF_FTYPE_PCFLOAT:
12014 case V4SF_FTYPE_PCFLOAT:
58685b93 12015 case V4SF_FTYPE_PCFLOAT16:
12016 case V4SF_FTYPE_PCBFLOAT16:
12017 case V4SF_FTYPE_PCV8BF:
12018 case V4SF_FTYPE_PCV8HF:
12019 case V8SF_FTYPE_PCFLOAT16:
12020 case V8SF_FTYPE_PCBFLOAT16:
12021 case V8SF_FTYPE_PCV16HF:
12022 case V8SF_FTYPE_PCV16BF:
2bf6d935
ML
12023 case V4DF_FTYPE_PCV2DF:
12024 case V4DF_FTYPE_PCDOUBLE:
12025 case V2DF_FTYPE_PCDOUBLE:
12026 case VOID_FTYPE_PVOID:
12027 case V8DI_FTYPE_PV8DI:
12028 nargs = 1;
12029 klass = load;
12030 memory = 0;
12031 switch (icode)
12032 {
12033 case CODE_FOR_sse4_1_movntdqa:
12034 case CODE_FOR_avx2_movntdqa:
12035 case CODE_FOR_avx512f_movntdqa:
12036 aligned_mem = true;
12037 break;
12038 default:
12039 break;
12040 }
12041 break;
12042 case VOID_FTYPE_PV2SF_V4SF:
12043 case VOID_FTYPE_PV8DI_V8DI:
12044 case VOID_FTYPE_PV4DI_V4DI:
12045 case VOID_FTYPE_PV2DI_V2DI:
12046 case VOID_FTYPE_PCHAR_V32QI:
12047 case VOID_FTYPE_PCHAR_V16QI:
12048 case VOID_FTYPE_PFLOAT_V16SF:
12049 case VOID_FTYPE_PFLOAT_V8SF:
12050 case VOID_FTYPE_PFLOAT_V4SF:
12051 case VOID_FTYPE_PDOUBLE_V8DF:
12052 case VOID_FTYPE_PDOUBLE_V4DF:
12053 case VOID_FTYPE_PDOUBLE_V2DF:
12054 case VOID_FTYPE_PLONGLONG_LONGLONG:
12055 case VOID_FTYPE_PULONGLONG_ULONGLONG:
12056 case VOID_FTYPE_PUNSIGNED_UNSIGNED:
12057 case VOID_FTYPE_PINT_INT:
12058 nargs = 1;
12059 klass = store;
12060 /* Reserve memory operand for target. */
715a8bc8 12061 memory = ARRAY_SIZE (xops);
2bf6d935
ML
12062 switch (icode)
12063 {
12064 /* These builtins and instructions require the memory
12065 to be properly aligned. */
12066 case CODE_FOR_avx_movntv4di:
12067 case CODE_FOR_sse2_movntv2di:
12068 case CODE_FOR_avx_movntv8sf:
12069 case CODE_FOR_sse_movntv4sf:
12070 case CODE_FOR_sse4a_vmmovntv4sf:
12071 case CODE_FOR_avx_movntv4df:
12072 case CODE_FOR_sse2_movntv2df:
12073 case CODE_FOR_sse4a_vmmovntv2df:
12074 case CODE_FOR_sse2_movntidi:
12075 case CODE_FOR_sse_movntq:
12076 case CODE_FOR_sse2_movntisi:
12077 case CODE_FOR_avx512f_movntv16sf:
12078 case CODE_FOR_avx512f_movntv8df:
12079 case CODE_FOR_avx512f_movntv8di:
12080 aligned_mem = true;
12081 break;
12082 default:
12083 break;
12084 }
12085 break;
12086 case VOID_FTYPE_PVOID_PCVOID:
12087 nargs = 1;
12088 klass = store;
12089 memory = 0;
12090
12091 break;
12092 case V4SF_FTYPE_V4SF_PCV2SF:
12093 case V2DF_FTYPE_V2DF_PCDOUBLE:
12094 nargs = 2;
12095 klass = load;
12096 memory = 1;
12097 break;
12098 case V8SF_FTYPE_PCV8SF_V8SI:
12099 case V4DF_FTYPE_PCV4DF_V4DI:
12100 case V4SF_FTYPE_PCV4SF_V4SI:
12101 case V2DF_FTYPE_PCV2DF_V2DI:
12102 case V8SI_FTYPE_PCV8SI_V8SI:
12103 case V4DI_FTYPE_PCV4DI_V4DI:
12104 case V4SI_FTYPE_PCV4SI_V4SI:
12105 case V2DI_FTYPE_PCV2DI_V2DI:
12106 case VOID_FTYPE_INT_INT64:
12107 nargs = 2;
12108 klass = load;
12109 memory = 0;
12110 break;
12111 case VOID_FTYPE_PV8DF_V8DF_UQI:
12112 case VOID_FTYPE_PV4DF_V4DF_UQI:
12113 case VOID_FTYPE_PV2DF_V2DF_UQI:
12114 case VOID_FTYPE_PV16SF_V16SF_UHI:
12115 case VOID_FTYPE_PV8SF_V8SF_UQI:
12116 case VOID_FTYPE_PV4SF_V4SF_UQI:
12117 case VOID_FTYPE_PV8DI_V8DI_UQI:
12118 case VOID_FTYPE_PV4DI_V4DI_UQI:
12119 case VOID_FTYPE_PV2DI_V2DI_UQI:
12120 case VOID_FTYPE_PV16SI_V16SI_UHI:
12121 case VOID_FTYPE_PV8SI_V8SI_UQI:
12122 case VOID_FTYPE_PV4SI_V4SI_UQI:
12123 case VOID_FTYPE_PV64QI_V64QI_UDI:
12124 case VOID_FTYPE_PV32HI_V32HI_USI:
12125 case VOID_FTYPE_PV32QI_V32QI_USI:
12126 case VOID_FTYPE_PV16QI_V16QI_UHI:
12127 case VOID_FTYPE_PV16HI_V16HI_UHI:
12128 case VOID_FTYPE_PV8HI_V8HI_UQI:
12129 switch (icode)
12130 {
12131 /* These builtins and instructions require the memory
12132 to be properly aligned. */
12133 case CODE_FOR_avx512f_storev16sf_mask:
12134 case CODE_FOR_avx512f_storev16si_mask:
12135 case CODE_FOR_avx512f_storev8df_mask:
12136 case CODE_FOR_avx512f_storev8di_mask:
12137 case CODE_FOR_avx512vl_storev8sf_mask:
12138 case CODE_FOR_avx512vl_storev8si_mask:
12139 case CODE_FOR_avx512vl_storev4df_mask:
12140 case CODE_FOR_avx512vl_storev4di_mask:
12141 case CODE_FOR_avx512vl_storev4sf_mask:
12142 case CODE_FOR_avx512vl_storev4si_mask:
12143 case CODE_FOR_avx512vl_storev2df_mask:
12144 case CODE_FOR_avx512vl_storev2di_mask:
12145 aligned_mem = true;
12146 break;
12147 default:
12148 break;
12149 }
12150 /* FALLTHRU */
12151 case VOID_FTYPE_PV8SF_V8SI_V8SF:
12152 case VOID_FTYPE_PV4DF_V4DI_V4DF:
12153 case VOID_FTYPE_PV4SF_V4SI_V4SF:
12154 case VOID_FTYPE_PV2DF_V2DI_V2DF:
12155 case VOID_FTYPE_PV8SI_V8SI_V8SI:
12156 case VOID_FTYPE_PV4DI_V4DI_V4DI:
12157 case VOID_FTYPE_PV4SI_V4SI_V4SI:
12158 case VOID_FTYPE_PV2DI_V2DI_V2DI:
12159 case VOID_FTYPE_PV8SI_V8DI_UQI:
12160 case VOID_FTYPE_PV8HI_V8DI_UQI:
12161 case VOID_FTYPE_PV16HI_V16SI_UHI:
4a948703 12162 case VOID_FTYPE_PUDI_V8DI_UQI:
2bf6d935
ML
12163 case VOID_FTYPE_PV16QI_V16SI_UHI:
12164 case VOID_FTYPE_PV4SI_V4DI_UQI:
4a948703 12165 case VOID_FTYPE_PUDI_V2DI_UQI:
12166 case VOID_FTYPE_PUDI_V4DI_UQI:
12167 case VOID_FTYPE_PUSI_V2DI_UQI:
2bf6d935 12168 case VOID_FTYPE_PV8HI_V8SI_UQI:
4a948703 12169 case VOID_FTYPE_PUDI_V4SI_UQI:
12170 case VOID_FTYPE_PUSI_V4DI_UQI:
12171 case VOID_FTYPE_PUHI_V2DI_UQI:
12172 case VOID_FTYPE_PUDI_V8SI_UQI:
12173 case VOID_FTYPE_PUSI_V4SI_UQI:
2bf6d935
ML
12174 case VOID_FTYPE_PCHAR_V64QI_UDI:
12175 case VOID_FTYPE_PCHAR_V32QI_USI:
12176 case VOID_FTYPE_PCHAR_V16QI_UHI:
12177 case VOID_FTYPE_PSHORT_V32HI_USI:
12178 case VOID_FTYPE_PSHORT_V16HI_UHI:
12179 case VOID_FTYPE_PSHORT_V8HI_UQI:
12180 case VOID_FTYPE_PINT_V16SI_UHI:
12181 case VOID_FTYPE_PINT_V8SI_UQI:
12182 case VOID_FTYPE_PINT_V4SI_UQI:
12183 case VOID_FTYPE_PINT64_V8DI_UQI:
12184 case VOID_FTYPE_PINT64_V4DI_UQI:
12185 case VOID_FTYPE_PINT64_V2DI_UQI:
12186 case VOID_FTYPE_PDOUBLE_V8DF_UQI:
12187 case VOID_FTYPE_PDOUBLE_V4DF_UQI:
12188 case VOID_FTYPE_PDOUBLE_V2DF_UQI:
12189 case VOID_FTYPE_PFLOAT_V16SF_UHI:
12190 case VOID_FTYPE_PFLOAT_V8SF_UQI:
12191 case VOID_FTYPE_PFLOAT_V4SF_UQI:
c4d423c7 12192 case VOID_FTYPE_PCFLOAT16_V8HF_UQI:
2bf6d935
ML
12193 case VOID_FTYPE_PV32QI_V32HI_USI:
12194 case VOID_FTYPE_PV16QI_V16HI_UHI:
4a948703 12195 case VOID_FTYPE_PUDI_V8HI_UQI:
2bf6d935
ML
12196 nargs = 2;
12197 klass = store;
12198 /* Reserve memory operand for target. */
715a8bc8 12199 memory = ARRAY_SIZE (xops);
2bf6d935
ML
12200 break;
12201 case V4SF_FTYPE_PCV4SF_V4SF_UQI:
12202 case V8SF_FTYPE_PCV8SF_V8SF_UQI:
12203 case V16SF_FTYPE_PCV16SF_V16SF_UHI:
12204 case V4SI_FTYPE_PCV4SI_V4SI_UQI:
12205 case V8SI_FTYPE_PCV8SI_V8SI_UQI:
12206 case V16SI_FTYPE_PCV16SI_V16SI_UHI:
12207 case V2DF_FTYPE_PCV2DF_V2DF_UQI:
12208 case V4DF_FTYPE_PCV4DF_V4DF_UQI:
12209 case V8DF_FTYPE_PCV8DF_V8DF_UQI:
12210 case V2DI_FTYPE_PCV2DI_V2DI_UQI:
12211 case V4DI_FTYPE_PCV4DI_V4DI_UQI:
12212 case V8DI_FTYPE_PCV8DI_V8DI_UQI:
12213 case V64QI_FTYPE_PCV64QI_V64QI_UDI:
12214 case V32HI_FTYPE_PCV32HI_V32HI_USI:
12215 case V32QI_FTYPE_PCV32QI_V32QI_USI:
12216 case V16QI_FTYPE_PCV16QI_V16QI_UHI:
12217 case V16HI_FTYPE_PCV16HI_V16HI_UHI:
12218 case V8HI_FTYPE_PCV8HI_V8HI_UQI:
12219 switch (icode)
12220 {
12221 /* These builtins and instructions require the memory
12222 to be properly aligned. */
12223 case CODE_FOR_avx512f_loadv16sf_mask:
12224 case CODE_FOR_avx512f_loadv16si_mask:
12225 case CODE_FOR_avx512f_loadv8df_mask:
12226 case CODE_FOR_avx512f_loadv8di_mask:
12227 case CODE_FOR_avx512vl_loadv8sf_mask:
12228 case CODE_FOR_avx512vl_loadv8si_mask:
12229 case CODE_FOR_avx512vl_loadv4df_mask:
12230 case CODE_FOR_avx512vl_loadv4di_mask:
12231 case CODE_FOR_avx512vl_loadv4sf_mask:
12232 case CODE_FOR_avx512vl_loadv4si_mask:
12233 case CODE_FOR_avx512vl_loadv2df_mask:
12234 case CODE_FOR_avx512vl_loadv2di_mask:
12235 case CODE_FOR_avx512bw_loadv64qi_mask:
12236 case CODE_FOR_avx512vl_loadv32qi_mask:
12237 case CODE_FOR_avx512vl_loadv16qi_mask:
12238 case CODE_FOR_avx512bw_loadv32hi_mask:
12239 case CODE_FOR_avx512vl_loadv16hi_mask:
12240 case CODE_FOR_avx512vl_loadv8hi_mask:
12241 aligned_mem = true;
12242 break;
12243 default:
12244 break;
12245 }
12246 /* FALLTHRU */
12247 case V64QI_FTYPE_PCCHAR_V64QI_UDI:
12248 case V32QI_FTYPE_PCCHAR_V32QI_USI:
12249 case V16QI_FTYPE_PCCHAR_V16QI_UHI:
12250 case V32HI_FTYPE_PCSHORT_V32HI_USI:
12251 case V16HI_FTYPE_PCSHORT_V16HI_UHI:
12252 case V8HI_FTYPE_PCSHORT_V8HI_UQI:
12253 case V16SI_FTYPE_PCINT_V16SI_UHI:
12254 case V8SI_FTYPE_PCINT_V8SI_UQI:
12255 case V4SI_FTYPE_PCINT_V4SI_UQI:
12256 case V8DI_FTYPE_PCINT64_V8DI_UQI:
12257 case V4DI_FTYPE_PCINT64_V4DI_UQI:
12258 case V2DI_FTYPE_PCINT64_V2DI_UQI:
12259 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
12260 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
12261 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
12262 case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
12263 case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
12264 case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
c4d423c7 12265 case V8HF_FTYPE_PCFLOAT16_V8HF_UQI:
2bf6d935
ML
12266 nargs = 3;
12267 klass = load;
12268 memory = 0;
12269 break;
152834fe
HJ
12270 case INT_FTYPE_PINT_INT_INT_INT:
12271 case LONGLONG_FTYPE_PLONGLONG_LONGLONG_LONGLONG_INT:
12272 nargs = 4;
12273 klass = load;
12274 memory = 0;
12275 constant = 3;
12276 break;
2bf6d935
ML
12277 default:
12278 gcc_unreachable ();
12279 }
12280
715a8bc8 12281 gcc_assert (nargs <= ARRAY_SIZE (xops));
2bf6d935
ML
12282
12283 if (klass == store)
12284 {
12285 arg = CALL_EXPR_ARG (exp, 0);
12286 op = expand_normal (arg);
12287 gcc_assert (target == 0);
12288 if (memory)
12289 {
12290 op = ix86_zero_extend_to_Pmode (op);
12291 target = gen_rtx_MEM (tmode, op);
12292 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
12293 on it. Try to improve it using get_pointer_alignment,
12294 and if the special builtin is one that requires strict
 12295 mode alignment, also from its GET_MODE_ALIGNMENT.
12296 Failure to do so could lead to ix86_legitimate_combined_insn
12297 rejecting all changes to such insns. */
12298 unsigned int align = get_pointer_alignment (arg);
12299 if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
12300 align = GET_MODE_ALIGNMENT (tmode);
12301 if (MEM_ALIGN (target) < align)
12302 set_mem_align (target, align);
12303 }
12304 else
12305 target = force_reg (tmode, op);
12306 arg_adjust = 1;
12307 }
12308 else
12309 {
12310 arg_adjust = 0;
12311 if (optimize
12312 || target == 0
12313 || !register_operand (target, tmode)
12314 || GET_MODE (target) != tmode)
12315 target = gen_reg_rtx (tmode);
12316 }
12317
12318 for (i = 0; i < nargs; i++)
12319 {
12320 machine_mode mode = insn_p->operand[i + 1].mode;
2bf6d935
ML
12321
12322 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
12323 op = expand_normal (arg);
2bf6d935 12324
776a37f6 12325 if (i == memory)
2bf6d935 12326 {
776a37f6 12327 /* This must be the memory operand. */
12328 op = ix86_zero_extend_to_Pmode (op);
12329 op = gen_rtx_MEM (mode, op);
12330 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
12331 on it. Try to improve it using get_pointer_alignment,
12332 and if the special builtin is one that requires strict
 12333 mode alignment, also from its GET_MODE_ALIGNMENT.
12334 Failure to do so could lead to ix86_legitimate_combined_insn
12335 rejecting all changes to such insns. */
12336 unsigned int align = get_pointer_alignment (arg);
12337 if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
12338 align = GET_MODE_ALIGNMENT (mode);
12339 if (MEM_ALIGN (op) < align)
12340 set_mem_align (op, align);
2bf6d935 12341 }
152834fe
HJ
12342 else if (i == constant)
12343 {
12344 /* This must be the constant. */
 12345 if (!insn_p->operand[nargs].predicate (op, SImode))
12346 {
12347 error ("the fourth argument must be one of enum %qs", "_CMPCCX_ENUM");
12348 return const0_rtx;
12349 }
12350 }
2bf6d935
ML
12351 else
12352 {
776a37f6 12353 /* This must be register. */
12354 if (VECTOR_MODE_P (mode))
12355 op = safe_vector_operand (op, mode);
2bf6d935 12356
776a37f6 12357 op = fixup_modeless_constant (op, mode);
2bf6d935 12358
b6efffa5 12359 /* NB: a 3-operand load implies a mask load or v{p}expand*,
35c4c67e 12360 and the mask operand should be at the end.
 12361 Keep an all-ones mask, which would be simplified by the expander. */
12362 if (nargs == 3 && i == 2 && klass == load
b6efffa5 12363 && constm1_operand (op, mode)
12364 && insn_p->operand[i].predicate (op, mode))
35c4c67e 12365 ;
12366 else if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
776a37f6 12367 op = copy_to_mode_reg (mode, op);
12368 else
12369 {
12370 op = copy_to_reg (op);
12371 op = lowpart_subreg (mode, op, GET_MODE (op));
2bf6d935
ML
12372 }
12373 }
12374
715a8bc8 12375 xops[i] = op;
2bf6d935
ML
12376 }
12377
12378 switch (nargs)
12379 {
12380 case 0:
12381 pat = GEN_FCN (icode) (target);
12382 break;
12383 case 1:
715a8bc8 12384 pat = GEN_FCN (icode) (target, xops[0]);
2bf6d935
ML
12385 break;
12386 case 2:
715a8bc8 12387 pat = GEN_FCN (icode) (target, xops[0], xops[1]);
2bf6d935
ML
12388 break;
12389 case 3:
715a8bc8 12390 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
2bf6d935 12391 break;
152834fe
HJ
12392 case 4:
12393 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2], xops[3]);
12394 break;
2bf6d935
ML
12395 default:
12396 gcc_unreachable ();
12397 }
12398
12399 if (! pat)
12400 return 0;
715a8bc8 12401
2bf6d935
ML
12402 emit_insn (pat);
12403 return klass == store ? 0 : target;
12404}
12405
12406/* Return the integer constant in ARG. Constrain it to be in the range
12407 of the subparts of VEC_TYPE; issue an error if not. */
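 /* For example, a V4SF vector type has four subparts, so the selector
 must be a constant in the range [0, 3]. */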
12408
12409static int
12410get_element_number (tree vec_type, tree arg)
12411{
12412 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
12413
12414 if (!tree_fits_uhwi_p (arg)
12415 || (elt = tree_to_uhwi (arg), elt > max))
12416 {
a9c697b8
MS
12417 error ("selector must be an integer constant in the range "
12418 "[0, %wi]", max);
2bf6d935
ML
12419 return 0;
12420 }
12421
12422 return elt;
12423}
12424
12425/* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
12426 ix86_expand_vector_init. We DO have language-level syntax for this, in
12427 the form of (type){ init-list }. Except that since we can't place emms
12428 instructions from inside the compiler, we can't allow the use of MMX
12429 registers unless the user explicitly asks for it. So we do *not* define
12430 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
 12431 we have builtins invoked by mmintrin.h that give us license to emit
12432 these sorts of instructions. */
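 /* For instance, the MMX set intrinsics in mmintrin.h (such as
 _mm_set_pi32) funnel through vec_init builtins of this kind. */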
12433
12434static rtx
12435ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
12436{
12437 machine_mode tmode = TYPE_MODE (type);
12438 machine_mode inner_mode = GET_MODE_INNER (tmode);
12439 int i, n_elt = GET_MODE_NUNITS (tmode);
12440 rtvec v = rtvec_alloc (n_elt);
12441
12442 gcc_assert (VECTOR_MODE_P (tmode));
12443 gcc_assert (call_expr_nargs (exp) == n_elt);
12444
12445 for (i = 0; i < n_elt; ++i)
12446 {
12447 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
12448 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
12449 }
12450
12451 if (!target || !register_operand (target, tmode))
12452 target = gen_reg_rtx (tmode);
12453
12454 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
12455 return target;
12456}
12457
12458/* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
12459 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
12460 had a language-level syntax for referencing vector elements. */
12461
12462static rtx
12463ix86_expand_vec_ext_builtin (tree exp, rtx target)
12464{
12465 machine_mode tmode, mode0;
12466 tree arg0, arg1;
12467 int elt;
12468 rtx op0;
12469
12470 arg0 = CALL_EXPR_ARG (exp, 0);
12471 arg1 = CALL_EXPR_ARG (exp, 1);
12472
12473 op0 = expand_normal (arg0);
12474 elt = get_element_number (TREE_TYPE (arg0), arg1);
12475
12476 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
12477 mode0 = TYPE_MODE (TREE_TYPE (arg0));
12478 gcc_assert (VECTOR_MODE_P (mode0));
12479
12480 op0 = force_reg (mode0, op0);
12481
12482 if (optimize || !target || !register_operand (target, tmode))
12483 target = gen_reg_rtx (tmode);
12484
12485 ix86_expand_vector_extract (true, target, op0, elt);
12486
12487 return target;
12488}
12489
12490/* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
12491 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
12492 a language-level syntax for referencing vector elements. */
12493
12494static rtx
12495ix86_expand_vec_set_builtin (tree exp)
12496{
12497 machine_mode tmode, mode1;
12498 tree arg0, arg1, arg2;
12499 int elt;
12500 rtx op0, op1, target;
12501
12502 arg0 = CALL_EXPR_ARG (exp, 0);
12503 arg1 = CALL_EXPR_ARG (exp, 1);
12504 arg2 = CALL_EXPR_ARG (exp, 2);
12505
12506 tmode = TYPE_MODE (TREE_TYPE (arg0));
12507 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
12508 gcc_assert (VECTOR_MODE_P (tmode));
12509
12510 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
12511 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
12512 elt = get_element_number (TREE_TYPE (arg0), arg2);
12513
cda29c54 12514 if (GET_MODE (op1) != mode1)
2bf6d935
ML
12515 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
12516
12517 op0 = force_reg (tmode, op0);
12518 op1 = force_reg (mode1, op1);
12519
12520 /* OP0 is the source of these builtin functions and shouldn't be
12521 modified. Create a copy, use it and return it as target. */
12522 target = gen_reg_rtx (tmode);
12523 emit_move_insn (target, op0);
12524 ix86_expand_vector_set (true, target, op1, elt);
12525
12526 return target;
12527}
12528
823b3b79 12529/* Return true if the necessary ISA options for this builtin are
 12530 enabled, else false.
12531 fcode = DECL_MD_FUNCTION_CODE (fndecl); */
12532bool
12533ix86_check_builtin_isa_match (unsigned int fcode,
12534 HOST_WIDE_INT* pbisa,
12535 HOST_WIDE_INT* pbisa2)
2bf6d935 12536{
2bf6d935
ML
12537 HOST_WIDE_INT isa = ix86_isa_flags;
12538 HOST_WIDE_INT isa2 = ix86_isa_flags2;
12539 HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
12540 HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
12541 /* The general case is we require all the ISAs specified in bisa{,2}
12542 to be enabled.
12543 The exceptions are:
12544 OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
12545 OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
12546 OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
ca813880 12547 (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL) or
12548 OPTION_MASK_ISA2_AVXVNNI
825d0041
HW
 12549 (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL) or
12550 OPTION_MASK_ISA2_AVXIFMA
58685b93 12551 (OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA2_AVX512BF16) or
12552 OPTION_MASK_ISA2_AVXNECONVERT
a13d6ec8
JJ
12553 where for each such pair it is sufficient if either of the ISAs is
 12554 enabled; any other options ORed with the pair must still be enabled.
12555 OPTION_MASK_ISA_MMX in bisa is satisfied also if TARGET_MMX_WITH_SSE. */
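 /* For example, a builtin flagged with
 OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL can also be
 expanded when only AVXVNNI is enabled. */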
2bf6d935
ML
12556 if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
12557 == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
12558 && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0)
12559 isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A);
db3f0d21 12560
2bf6d935
ML
12561 if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
12562 == (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
12563 && (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0)
12564 isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32);
db3f0d21 12565
2bf6d935
ML
12566 if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
12567 == (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
12568 && (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0)
12569 isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4);
6058b874 12570
ca813880 12571 if ((((bisa & (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
12572 == (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
12573 || (bisa2 & OPTION_MASK_ISA2_AVXVNNI) != 0)
12574 && (((isa & (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
12575 == (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
12576 || (isa2 & OPTION_MASK_ISA2_AVXVNNI) != 0))
12577 {
12578 isa |= OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL;
12579 isa2 |= OPTION_MASK_ISA2_AVXVNNI;
12580 }
12581
825d0041
HW
12582 if ((((bisa & (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL))
12583 == (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL))
12584 || (bisa2 & OPTION_MASK_ISA2_AVXIFMA) != 0)
12585 && (((isa & (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL))
12586 == (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL))
12587 || (isa2 & OPTION_MASK_ISA2_AVXIFMA) != 0))
12588 {
12589 isa |= OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL;
12590 isa2 |= OPTION_MASK_ISA2_AVXIFMA;
12591 }
12592
58685b93 12593 if ((((bisa & OPTION_MASK_ISA_AVX512VL) != 0
12594 && (bisa2 & OPTION_MASK_ISA2_AVX512BF16) != 0)
12595 && (bisa2 & OPTION_MASK_ISA2_AVXNECONVERT) != 0)
12596 && (((isa & OPTION_MASK_ISA_AVX512VL) != 0
12597 && (isa2 & OPTION_MASK_ISA2_AVX512BF16) != 0)
12598 || (isa2 & OPTION_MASK_ISA2_AVXNECONVERT) != 0))
12599 {
12600 isa |= OPTION_MASK_ISA_AVX512VL;
12601 isa2 |= OPTION_MASK_ISA2_AVXNECONVERT | OPTION_MASK_ISA2_AVX512BF16;
12602 }
12603
db3f0d21
UB
12604 if ((bisa & OPTION_MASK_ISA_MMX) && !TARGET_MMX && TARGET_MMX_WITH_SSE
12605 /* __builtin_ia32_maskmovq requires MMX registers. */
6058b874 12606 && fcode != IX86_BUILTIN_MASKMOVQ)
a13d6ec8
JJ
12607 {
12608 bisa &= ~OPTION_MASK_ISA_MMX;
12609 bisa |= OPTION_MASK_ISA_SSE2;
ecfdb16c 12610 }
6058b874 12611
823b3b79 12612 if (pbisa)
12613 *pbisa = bisa;
12614 if (pbisa2)
12615 *pbisa2 = bisa2;
12616
12617 return (bisa & isa) == bisa && (bisa2 & isa2) == bisa2;
12618}
12619
12620/* Expand an expression EXP that calls a built-in function,
12621 with result going to TARGET if that's convenient
12622 (and in mode MODE if that's convenient).
12623 SUBTARGET may be used as the target for computing one of EXP's operands.
12624 IGNORE is nonzero if the value is to be ignored. */
12625
12626rtx
12627ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
12628 machine_mode mode, int ignore)
12629{
12630 size_t i;
12631 enum insn_code icode, icode2;
12632 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
12633 tree arg0, arg1, arg2, arg3, arg4;
12634 rtx op0, op1, op2, op3, op4, pat, pat2, insn;
12635 machine_mode mode0, mode1, mode2, mode3, mode4;
12636 unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
12637 HOST_WIDE_INT bisa, bisa2;
12638
12639 /* For CPU builtins that can be folded, fold first and expand the fold. */
12640 switch (fcode)
12641 {
12642 case IX86_BUILTIN_CPU_INIT:
12643 {
12644 /* Make it call __cpu_indicator_init in libgcc. */
12645 tree call_expr, fndecl, type;
12646 type = build_function_type_list (integer_type_node, NULL_TREE);
12647 fndecl = build_fn_decl ("__cpu_indicator_init", type);
12648 call_expr = build_call_expr (fndecl, 0);
12649 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
12650 }
12651 case IX86_BUILTIN_CPU_IS:
12652 case IX86_BUILTIN_CPU_SUPPORTS:
12653 {
12654 tree arg0 = CALL_EXPR_ARG (exp, 0);
12655 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
12656 gcc_assert (fold_expr != NULL_TREE);
12657 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
12658 }
12659 }
12660
12661 if (!ix86_check_builtin_isa_match (fcode, &bisa, &bisa2))
2bf6d935
ML
12662 {
12663 bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT;
12664 if (TARGET_ABI_X32)
12665 bisa |= OPTION_MASK_ABI_X32;
12666 else
12667 bisa |= OPTION_MASK_ABI_64;
12668 char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
46e6341f
JJ
12669 (enum fpmath_unit) 0,
12670 (enum prefer_vector_width) 0,
654cd743 12671 PVW_NONE, PVW_NONE,
46e6341f 12672 false, add_abi_p);
2bf6d935
ML
12673 if (!opts)
12674 error ("%qE needs unknown isa option", fndecl);
12675 else
12676 {
12677 gcc_assert (opts != NULL);
12678 error ("%qE needs isa option %s", fndecl, opts);
12679 free (opts);
12680 }
12681 return expand_call (exp, target, ignore);
12682 }
12683
12684 switch (fcode)
12685 {
12686 case IX86_BUILTIN_MASKMOVQ:
12687 case IX86_BUILTIN_MASKMOVDQU:
12688 icode = (fcode == IX86_BUILTIN_MASKMOVQ
12689 ? CODE_FOR_mmx_maskmovq
12690 : CODE_FOR_sse2_maskmovdqu);
12691 /* Note the arg order is different from the operand order. */
12692 arg1 = CALL_EXPR_ARG (exp, 0);
12693 arg2 = CALL_EXPR_ARG (exp, 1);
12694 arg0 = CALL_EXPR_ARG (exp, 2);
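 /* Concretely (assuming the usual intrinsic spelling): for
    __builtin_ia32_maskmovq (data, mask, p), arg 2 (the pointer) becomes
    operand 0 (the destination MEM built below), arg 0 becomes operand 1
    (the data) and arg 1 becomes operand 2 (the mask).  */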
12695 op0 = expand_normal (arg0);
12696 op1 = expand_normal (arg1);
12697 op2 = expand_normal (arg2);
12698 mode0 = insn_data[icode].operand[0].mode;
12699 mode1 = insn_data[icode].operand[1].mode;
12700 mode2 = insn_data[icode].operand[2].mode;
12701
12702 op0 = ix86_zero_extend_to_Pmode (op0);
12703 op0 = gen_rtx_MEM (mode1, op0);
12704
12705 if (!insn_data[icode].operand[0].predicate (op0, mode0))
12706 op0 = copy_to_mode_reg (mode0, op0);
12707 if (!insn_data[icode].operand[1].predicate (op1, mode1))
12708 op1 = copy_to_mode_reg (mode1, op1);
12709 if (!insn_data[icode].operand[2].predicate (op2, mode2))
12710 op2 = copy_to_mode_reg (mode2, op2);
12711 pat = GEN_FCN (icode) (op0, op1, op2);
12712 if (! pat)
12713 return 0;
12714 emit_insn (pat);
12715 return 0;
12716
12717 case IX86_BUILTIN_LDMXCSR:
12718 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
12719 target = assign_386_stack_local (SImode, SLOT_TEMP);
12720 emit_move_insn (target, op0);
12721 emit_insn (gen_sse_ldmxcsr (target));
12722 return 0;
12723
12724 case IX86_BUILTIN_STMXCSR:
12725 target = assign_386_stack_local (SImode, SLOT_TEMP);
12726 emit_insn (gen_sse_stmxcsr (target));
12727 return copy_to_mode_reg (SImode, target);
12728
12729 case IX86_BUILTIN_CLFLUSH:
12730 arg0 = CALL_EXPR_ARG (exp, 0);
12731 op0 = expand_normal (arg0);
12732 icode = CODE_FOR_sse2_clflush;
12733 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12734 op0 = ix86_zero_extend_to_Pmode (op0);
12735
12736 emit_insn (gen_sse2_clflush (op0));
12737 return 0;
12738
12739 case IX86_BUILTIN_CLWB:
12740 arg0 = CALL_EXPR_ARG (exp, 0);
12741 op0 = expand_normal (arg0);
12742 icode = CODE_FOR_clwb;
12743 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12744 op0 = ix86_zero_extend_to_Pmode (op0);
12745
12746 emit_insn (gen_clwb (op0));
12747 return 0;
12748
12749 case IX86_BUILTIN_CLFLUSHOPT:
12750 arg0 = CALL_EXPR_ARG (exp, 0);
12751 op0 = expand_normal (arg0);
12752 icode = CODE_FOR_clflushopt;
12753 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12754 op0 = ix86_zero_extend_to_Pmode (op0);
12755
12756 emit_insn (gen_clflushopt (op0));
12757 return 0;
12758
12759 case IX86_BUILTIN_MONITOR:
12760 case IX86_BUILTIN_MONITORX:
12761 arg0 = CALL_EXPR_ARG (exp, 0);
12762 arg1 = CALL_EXPR_ARG (exp, 1);
12763 arg2 = CALL_EXPR_ARG (exp, 2);
12764 op0 = expand_normal (arg0);
12765 op1 = expand_normal (arg1);
12766 op2 = expand_normal (arg2);
12767 if (!REG_P (op0))
12768 op0 = ix86_zero_extend_to_Pmode (op0);
12769 if (!REG_P (op1))
12770 op1 = copy_to_mode_reg (SImode, op1);
12771 if (!REG_P (op2))
12772 op2 = copy_to_mode_reg (SImode, op2);
12773
12774 emit_insn (fcode == IX86_BUILTIN_MONITOR
a963ca40
UB
12775 ? gen_sse3_monitor (Pmode, op0, op1, op2)
12776 : gen_monitorx (Pmode, op0, op1, op2));
2bf6d935
ML
12777 return 0;
12778
12779 case IX86_BUILTIN_MWAIT:
12780 arg0 = CALL_EXPR_ARG (exp, 0);
12781 arg1 = CALL_EXPR_ARG (exp, 1);
12782 op0 = expand_normal (arg0);
12783 op1 = expand_normal (arg1);
12784 if (!REG_P (op0))
12785 op0 = copy_to_mode_reg (SImode, op0);
12786 if (!REG_P (op1))
12787 op1 = copy_to_mode_reg (SImode, op1);
12788 emit_insn (gen_sse3_mwait (op0, op1));
12789 return 0;
12790
12791 case IX86_BUILTIN_MWAITX:
12792 arg0 = CALL_EXPR_ARG (exp, 0);
12793 arg1 = CALL_EXPR_ARG (exp, 1);
12794 arg2 = CALL_EXPR_ARG (exp, 2);
12795 op0 = expand_normal (arg0);
12796 op1 = expand_normal (arg1);
12797 op2 = expand_normal (arg2);
12798 if (!REG_P (op0))
12799 op0 = copy_to_mode_reg (SImode, op0);
12800 if (!REG_P (op1))
12801 op1 = copy_to_mode_reg (SImode, op1);
12802 if (!REG_P (op2))
12803 op2 = copy_to_mode_reg (SImode, op2);
12804 emit_insn (gen_mwaitx (op0, op1, op2));
12805 return 0;
12806
12807 case IX86_BUILTIN_UMONITOR:
12808 arg0 = CALL_EXPR_ARG (exp, 0);
12809 op0 = expand_normal (arg0);
12810
12811 op0 = ix86_zero_extend_to_Pmode (op0);
987a3082 12812 emit_insn (gen_umonitor (Pmode, op0));
2bf6d935
ML
12813 return 0;
12814
12815 case IX86_BUILTIN_UMWAIT:
12816 case IX86_BUILTIN_TPAUSE:
12817 arg0 = CALL_EXPR_ARG (exp, 0);
12818 arg1 = CALL_EXPR_ARG (exp, 1);
12819 op0 = expand_normal (arg0);
12820 op1 = expand_normal (arg1);
12821
12822 if (!REG_P (op0))
12823 op0 = copy_to_mode_reg (SImode, op0);
12824
12825 op1 = force_reg (DImode, op1);
12826
12827 if (TARGET_64BIT)
12828 {
12829 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
12830 NULL, 1, OPTAB_DIRECT);
12831 switch (fcode)
12832 {
12833 case IX86_BUILTIN_UMWAIT:
12834 icode = CODE_FOR_umwait_rex64;
12835 break;
12836 case IX86_BUILTIN_TPAUSE:
12837 icode = CODE_FOR_tpause_rex64;
12838 break;
12839 default:
12840 gcc_unreachable ();
12841 }
12842
12843 op2 = gen_lowpart (SImode, op2);
12844 op1 = gen_lowpart (SImode, op1);
12845 pat = GEN_FCN (icode) (op0, op1, op2);
12846 }
12847 else
12848 {
12849 switch (fcode)
12850 {
12851 case IX86_BUILTIN_UMWAIT:
12852 icode = CODE_FOR_umwait;
12853 break;
12854 case IX86_BUILTIN_TPAUSE:
12855 icode = CODE_FOR_tpause;
12856 break;
12857 default:
12858 gcc_unreachable ();
12859 }
12860 pat = GEN_FCN (icode) (op0, op1);
12861 }
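 /* In the TARGET_64BIT arm above, the DImode deadline in op1 was split into
    SImode low/high halves (op1/op2) to feed the two 32-bit operands of the
    umwait/tpause rex64 patterns; the 0/1 status below is read back from the
    carry flag.  */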
12862
12863 if (!pat)
12864 return 0;
12865
12866 emit_insn (pat);
12867
12868 if (target == 0
12869 || !register_operand (target, QImode))
12870 target = gen_reg_rtx (QImode);
12871
12872 pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
12873 const0_rtx);
12874 emit_insn (gen_rtx_SET (target, pat));
12875
12876 return target;
12877
299a53d7 12878 case IX86_BUILTIN_TESTUI:
12879 emit_insn (gen_testui ());
12880
12881 if (target == 0
12882 || !register_operand (target, QImode))
12883 target = gen_reg_rtx (QImode);
12884
12885 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
12886 const0_rtx);
12887 emit_insn (gen_rtx_SET (target, pat));
12888
12889 return target;
12890
2bf6d935
ML
12891 case IX86_BUILTIN_CLZERO:
12892 arg0 = CALL_EXPR_ARG (exp, 0);
12893 op0 = expand_normal (arg0);
12894 if (!REG_P (op0))
12895 op0 = ix86_zero_extend_to_Pmode (op0);
a963ca40 12896 emit_insn (gen_clzero (Pmode, op0));
2bf6d935
ML
12897 return 0;
12898
12899 case IX86_BUILTIN_CLDEMOTE:
12900 arg0 = CALL_EXPR_ARG (exp, 0);
12901 op0 = expand_normal (arg0);
12902 icode = CODE_FOR_cldemote;
12903 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12904 op0 = ix86_zero_extend_to_Pmode (op0);
12905
12906 emit_insn (gen_cldemote (op0));
12907 return 0;
12908
632a2f50 12909 case IX86_BUILTIN_LOADIWKEY:
12910 {
12911 arg0 = CALL_EXPR_ARG (exp, 0);
12912 arg1 = CALL_EXPR_ARG (exp, 1);
12913 arg2 = CALL_EXPR_ARG (exp, 2);
12914 arg3 = CALL_EXPR_ARG (exp, 3);
12915
12916 op0 = expand_normal (arg0);
12917 op1 = expand_normal (arg1);
12918 op2 = expand_normal (arg2);
12919 op3 = expand_normal (arg3);
12920
12921 if (!REG_P (op0))
12922 op0 = copy_to_mode_reg (V2DImode, op0);
12923 if (!REG_P (op1))
12924 op1 = copy_to_mode_reg (V2DImode, op1);
12925 if (!REG_P (op2))
12926 op2 = copy_to_mode_reg (V2DImode, op2);
12927 if (!REG_P (op3))
12928 op3 = copy_to_mode_reg (SImode, op3);
12929
12930 emit_insn (gen_loadiwkey (op0, op1, op2, op3));
12931
12932 return 0;
12933 }
12934
12935 case IX86_BUILTIN_AESDEC128KLU8:
12936 icode = CODE_FOR_aesdec128klu8;
12937 goto aesdecenc_expand;
12938
12939 case IX86_BUILTIN_AESDEC256KLU8:
12940 icode = CODE_FOR_aesdec256klu8;
12941 goto aesdecenc_expand;
12942
12943 case IX86_BUILTIN_AESENC128KLU8:
12944 icode = CODE_FOR_aesenc128klu8;
12945 goto aesdecenc_expand;
12946
12947 case IX86_BUILTIN_AESENC256KLU8:
12948 icode = CODE_FOR_aesenc256klu8;
12949
12950 aesdecenc_expand:
12951
12952 arg0 = CALL_EXPR_ARG (exp, 0); // __m128i *odata
12953 arg1 = CALL_EXPR_ARG (exp, 1); // __m128i idata
12954 arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
12955
12956 op0 = expand_normal (arg0);
12957 op1 = expand_normal (arg1);
12958 op2 = expand_normal (arg2);
12959
12960 if (!address_operand (op0, V2DImode))
12961 {
12962 op0 = convert_memory_address (Pmode, op0);
12963 op0 = copy_addr_to_reg (op0);
12964 }
12965 op0 = gen_rtx_MEM (V2DImode, op0);
12966
12967 if (!REG_P (op1))
12968 op1 = copy_to_mode_reg (V2DImode, op1);
12969
12970 if (!address_operand (op2, VOIDmode))
12971 {
12972 op2 = convert_memory_address (Pmode, op2);
12973 op2 = copy_addr_to_reg (op2);
12974 }
12975 op2 = gen_rtx_MEM (BLKmode, op2);
12976
12977 emit_insn (GEN_FCN (icode) (op1, op1, op2));
12978
12979 if (target == 0)
12980 target = gen_reg_rtx (QImode);
12981
1aeefa57
HW
12982 /* NB: For the aesenc/aesdec Key Locker insns, ZF is set when a runtime
12983 error occurs, so the output should be cleared for safety. */
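 /* Rough shape of the sequence emitted below (descriptive sketch only; the
    aesenc/aesdec pattern emitted above is what sets ZF):

	jne	ok_label	; ZF clear: operation succeeded
	op1 = 0			; ZF set: clear the payload
     ok_label:
	target = setcc of the EQ test on ZF	; status byte for the caller
	*op0 = op1		; store the (possibly cleared) data  */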
12984 rtx_code_label *ok_label;
12985 rtx tmp;
12986
12987 tmp = gen_rtx_REG (CCZmode, FLAGS_REG);
12988 pat = gen_rtx_EQ (QImode, tmp, const0_rtx);
12989 ok_label = gen_label_rtx ();
12990 emit_cmp_and_jump_insns (tmp, const0_rtx, NE, 0, GET_MODE (tmp),
12991 true, ok_label);
12992 /* The runtime error seldom occurs, so predict the OK path as hot
12993 and lay it out as the fallthrough block. */
12994 predict_jump (REG_BR_PROB_BASE * 90 / 100);
12995
12996 emit_insn (gen_rtx_SET (op1, const0_rtx));
632a2f50 12997
1aeefa57
HW
12998 emit_label (ok_label);
12999 emit_insn (gen_rtx_SET (target, pat));
632a2f50 13000 emit_insn (gen_rtx_SET (op0, op1));
13001
13002 return target;
13003
13004 case IX86_BUILTIN_AESDECWIDE128KLU8:
13005 icode = CODE_FOR_aesdecwide128klu8;
13006 goto wideaesdecenc_expand;
13007
13008 case IX86_BUILTIN_AESDECWIDE256KLU8:
13009 icode = CODE_FOR_aesdecwide256klu8;
13010 goto wideaesdecenc_expand;
13011
13012 case IX86_BUILTIN_AESENCWIDE128KLU8:
13013 icode = CODE_FOR_aesencwide128klu8;
13014 goto wideaesdecenc_expand;
13015
13016 case IX86_BUILTIN_AESENCWIDE256KLU8:
13017 icode = CODE_FOR_aesencwide256klu8;
13018
13019 wideaesdecenc_expand:
13020
13021 rtx xmm_regs[8];
13022 rtx op;
13023
13024 arg0 = CALL_EXPR_ARG (exp, 0); // __m128i * odata
13025 arg1 = CALL_EXPR_ARG (exp, 1); // const __m128i * idata
13026 arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
13027
13028 op0 = expand_normal (arg0);
13029 op1 = expand_normal (arg1);
13030 op2 = expand_normal (arg2);
13031
13032 if (!address_operand (op2, VOIDmode))
13033 {
13034 op2 = convert_memory_address (Pmode, op2);
13035 op2 = copy_addr_to_reg (op2);
13036 }
13037 op2 = gen_rtx_MEM (BLKmode, op2);
13038
13039 for (i = 0; i < 8; i++)
13040 {
13041 xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
13042
13043 op = gen_rtx_MEM (V2DImode,
13044 plus_constant (Pmode, op1, (i * 16)));
13045
13046 emit_move_insn (xmm_regs[i], op);
13047 }
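 /* The wide forms operate on the eight fixed registers xmm0..xmm7, so the
    blocks of idata were moved into those hard registers above and are
    written back from them after the ok_label below.  */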
13048
13049 emit_insn (GEN_FCN (icode) (op2));
13050
13051 if (target == 0)
13052 target = gen_reg_rtx (QImode);
13053
1aeefa57
HW
13054 tmp = gen_rtx_REG (CCZmode, FLAGS_REG);
13055 pat = gen_rtx_EQ (QImode, tmp, const0_rtx);
13056 ok_label = gen_label_rtx ();
13057 emit_cmp_and_jump_insns (tmp, const0_rtx, NE, 0, GET_MODE (tmp),
13058 true, ok_label);
13059 predict_jump (REG_BR_PROB_BASE * 90 / 100);
13060
13061 for (i = 0; i < 8; i++)
13062 emit_insn (gen_rtx_SET (xmm_regs[i], const0_rtx));
13063
13064 emit_label (ok_label);
632a2f50 13065 emit_insn (gen_rtx_SET (target, pat));
13066
13067 for (i = 0; i < 8; i++)
13068 {
13069 op = gen_rtx_MEM (V2DImode,
13070 plus_constant (Pmode, op0, (i * 16)));
13071 emit_move_insn (op, xmm_regs[i]);
13072 }
13073
13074 return target;
13075
13076 case IX86_BUILTIN_ENCODEKEY128U32:
13077 {
13078 rtx op, xmm_regs[7];
13079
13080 arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
13081 arg1 = CALL_EXPR_ARG (exp, 1); // __m128i key
13082 arg2 = CALL_EXPR_ARG (exp, 2); // void *h
13083
13084 op0 = expand_normal (arg0);
13085 op1 = expand_normal (arg1);
13086 op2 = expand_normal (arg2);
13087
13088 if (!REG_P (op0))
13089 op0 = copy_to_mode_reg (SImode, op0);
13090
13091 op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
13092 emit_move_insn (op, op1);
13093
13094 for (i = 0; i < 3; i++)
13095 xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
13096
13097 if (target == 0)
13098 target = gen_reg_rtx (SImode);
13099
13100 emit_insn (gen_encodekey128u32 (target, op0));
13101
13102 for (i = 0; i < 3; i++)
13103 {
13104 op = gen_rtx_MEM (V2DImode,
13105 plus_constant (Pmode, op2, (i * 16)));
13106 emit_move_insn (op, xmm_regs[i]);
13107 }
13108
13109 return target;
13110 }
13111 case IX86_BUILTIN_ENCODEKEY256U32:
13112 {
13113 rtx op, xmm_regs[7];
13114
13115 arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
13116 arg1 = CALL_EXPR_ARG (exp, 1); // __m128i keylow
13117 arg2 = CALL_EXPR_ARG (exp, 2); // __m128i keyhi
13118 arg3 = CALL_EXPR_ARG (exp, 3); // void *h
13119
13120 op0 = expand_normal (arg0);
13121 op1 = expand_normal (arg1);
13122 op2 = expand_normal (arg2);
13123 op3 = expand_normal (arg3);
13124
13125 if (!REG_P (op0))
13126 op0 = copy_to_mode_reg (SImode, op0);
13127
13128 /* Force the use of xmm0 and xmm1 for keylow and keyhi. */
13129 op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
13130 emit_move_insn (op, op1);
13131 op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (1));
13132 emit_move_insn (op, op2);
13133
13134 for (i = 0; i < 4; i++)
13135 xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
13136
13137 if (target == 0)
13138 target = gen_reg_rtx (SImode);
13139
13140 emit_insn (gen_encodekey256u32 (target, op0));
13141
13142 for (i = 0; i < 4; i++)
13143 {
13144 op = gen_rtx_MEM (V2DImode,
13145 plus_constant (Pmode, op3, (i * 16)));
13146 emit_move_insn (op, xmm_regs[i]);
13147 }
13148
13149 return target;
13150 }
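 /* Note the handle sizes implied by the store loops above: encodekey128
    writes 3 * 16 = 48 bytes and encodekey256 writes 4 * 16 = 64 bytes
    through the handle pointer.  */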
13151
b384d9a0
HJ
13152 case IX86_BUILTIN_PREFETCH:
13153 {
13154 arg0 = CALL_EXPR_ARG (exp, 0); // const void *
13155 arg1 = CALL_EXPR_ARG (exp, 1); // const int
13156 arg2 = CALL_EXPR_ARG (exp, 2); // const int
13157 arg3 = CALL_EXPR_ARG (exp, 3); // const int
13158
13159 op0 = expand_normal (arg0);
13160 op1 = expand_normal (arg1);
13161 op2 = expand_normal (arg2);
13162 op3 = expand_normal (arg3);
13163
13164 if (!CONST_INT_P (op1) || !CONST_INT_P (op2) || !CONST_INT_P (op3))
13165 {
13166 error ("second, third and fourth argument must be a const");
13167 return const0_rtx;
13168 }
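 /* The fourth argument selects the flavour: 1 requests an instruction
    prefetch (PREFETCHI, when the target supports it), while any other value
    falls through to the ordinary data-prefetch expansion below.  */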
13169
13170 if (INTVAL (op3) == 1)
13171 {
21de01f5 13172 if (TARGET_64BIT && TARGET_PREFETCHI
b384d9a0
HJ
13173 && local_func_symbolic_operand (op0, GET_MODE (op0)))
13174 emit_insn (gen_prefetchi (op0, op2));
13175 else
13176 {
13177 warning (0, "instruction prefetch applies when in 64-bit mode"
13178 " with RIP-relative addressing and"
13179 " option %<-mprefetchi%>;"
13180 " they stay NOPs otherwise");
13181 emit_insn (gen_nop ());
13182 }
13183 }
13184 else
13185 {
13186 if (!address_operand (op0, VOIDmode))
13187 {
13188 op0 = convert_memory_address (Pmode, op0);
13189 op0 = copy_addr_to_reg (op0);
13190 }
21de01f5
HJ
13191
13192 if (TARGET_3DNOW || TARGET_PREFETCH_SSE
13193 || TARGET_PRFCHW || TARGET_PREFETCHWT1)
13194 emit_insn (gen_prefetch (op0, op1, op2));
13195 else if (!MEM_P (op0) && side_effects_p (op0))
13196 /* Don't do anything with direct references to volatile memory,
13197 but generate code to handle other side effects. */
13198 emit_insn (op0);
b384d9a0
HJ
13199 }
13200
13201 return 0;
13202 }
13203
13204 case IX86_BUILTIN_PREFETCHI:
13205 {
13206 arg0 = CALL_EXPR_ARG (exp, 0); // const void *
13207 arg1 = CALL_EXPR_ARG (exp, 1); // const int
13208
13209 op0 = expand_normal (arg0);
13210 op1 = expand_normal (arg1);
13211
13212 if (!CONST_INT_P (op1))
13213 {
13214 error ("second argument must be a const");
13215 return const0_rtx;
13216 }
13217
13218 /* GOT/PLT_PIC should not be available for instruction prefetch.
13219 It must be a real instruction address. */
13220 if (TARGET_64BIT
13221 && local_func_symbolic_operand (op0, GET_MODE (op0)))
13222 emit_insn (gen_prefetchi (op0, op1));
13223 else
13224 {
13225 /* Ignore the hint. */
13226 warning (0, "instruction prefetch applies when in 64-bit mode"
13227 " with RIP-relative addressing and"
13228 " option %<-mprefetchi%>;"
13229 " they stay NOPs otherwise");
13230 emit_insn (gen_nop ());
13231 }
13232
13233 return 0;
13234 }
13235
2bf6d935
ML
13236 case IX86_BUILTIN_VEC_INIT_V2SI:
13237 case IX86_BUILTIN_VEC_INIT_V4HI:
13238 case IX86_BUILTIN_VEC_INIT_V8QI:
13239 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
13240
13241 case IX86_BUILTIN_VEC_EXT_V2DF:
13242 case IX86_BUILTIN_VEC_EXT_V2DI:
13243 case IX86_BUILTIN_VEC_EXT_V4SF:
13244 case IX86_BUILTIN_VEC_EXT_V4SI:
13245 case IX86_BUILTIN_VEC_EXT_V8HI:
13246 case IX86_BUILTIN_VEC_EXT_V2SI:
13247 case IX86_BUILTIN_VEC_EXT_V4HI:
13248 case IX86_BUILTIN_VEC_EXT_V16QI:
13249 return ix86_expand_vec_ext_builtin (exp, target);
13250
13251 case IX86_BUILTIN_VEC_SET_V2DI:
13252 case IX86_BUILTIN_VEC_SET_V4SF:
13253 case IX86_BUILTIN_VEC_SET_V4SI:
13254 case IX86_BUILTIN_VEC_SET_V8HI:
13255 case IX86_BUILTIN_VEC_SET_V4HI:
13256 case IX86_BUILTIN_VEC_SET_V16QI:
13257 return ix86_expand_vec_set_builtin (exp);
13258
13259 case IX86_BUILTIN_NANQ:
13260 case IX86_BUILTIN_NANSQ:
13261 return expand_call (exp, target, ignore);
13262
13263 case IX86_BUILTIN_RDPID:
13264
13265 op0 = gen_reg_rtx (word_mode);
13266
13267 if (TARGET_64BIT)
13268 {
13269 insn = gen_rdpid_rex64 (op0);
13270 op0 = convert_to_mode (SImode, op0, 1);
13271 }
13272 else
13273 insn = gen_rdpid (op0);
13274
13275 emit_insn (insn);
13276
13277 if (target == 0
13278 || !register_operand (target, SImode))
13279 target = gen_reg_rtx (SImode);
13280
13281 emit_move_insn (target, op0);
13282 return target;
13283
e21b52af
HL
13284 case IX86_BUILTIN_2INTERSECTD512:
13285 case IX86_BUILTIN_2INTERSECTQ512:
13286 case IX86_BUILTIN_2INTERSECTD256:
13287 case IX86_BUILTIN_2INTERSECTQ256:
13288 case IX86_BUILTIN_2INTERSECTD128:
13289 case IX86_BUILTIN_2INTERSECTQ128:
13290 arg0 = CALL_EXPR_ARG (exp, 0);
13291 arg1 = CALL_EXPR_ARG (exp, 1);
13292 arg2 = CALL_EXPR_ARG (exp, 2);
13293 arg3 = CALL_EXPR_ARG (exp, 3);
13294 op0 = expand_normal (arg0);
13295 op1 = expand_normal (arg1);
13296 op2 = expand_normal (arg2);
13297 op3 = expand_normal (arg3);
13298
13299 if (!address_operand (op0, VOIDmode))
13300 {
13301 op0 = convert_memory_address (Pmode, op0);
13302 op0 = copy_addr_to_reg (op0);
13303 }
13304 if (!address_operand (op1, VOIDmode))
13305 {
13306 op1 = convert_memory_address (Pmode, op1);
13307 op1 = copy_addr_to_reg (op1);
13308 }
13309
13310 switch (fcode)
13311 {
13312 case IX86_BUILTIN_2INTERSECTD512:
13313 mode4 = P2HImode;
13314 icode = CODE_FOR_avx512vp2intersect_2intersectv16si;
13315 break;
13316 case IX86_BUILTIN_2INTERSECTQ512:
13317 mode4 = P2QImode;
13318 icode = CODE_FOR_avx512vp2intersect_2intersectv8di;
13319 break;
13320 case IX86_BUILTIN_2INTERSECTD256:
13321 mode4 = P2QImode;
13322 icode = CODE_FOR_avx512vp2intersect_2intersectv8si;
13323 break;
13324 case IX86_BUILTIN_2INTERSECTQ256:
13325 mode4 = P2QImode;
13326 icode = CODE_FOR_avx512vp2intersect_2intersectv4di;
13327 break;
13328 case IX86_BUILTIN_2INTERSECTD128:
13329 mode4 = P2QImode;
13330 icode = CODE_FOR_avx512vp2intersect_2intersectv4si;
13331 break;
13332 case IX86_BUILTIN_2INTERSECTQ128:
13333 mode4 = P2QImode;
13334 icode = CODE_FOR_avx512vp2intersect_2intersectv2di;
13335 break;
13336 default:
13337 gcc_unreachable ();
13338 }
13339
13340 mode2 = insn_data[icode].operand[1].mode;
13341 mode3 = insn_data[icode].operand[2].mode;
13342 if (!insn_data[icode].operand[1].predicate (op2, mode2))
13343 op2 = copy_to_mode_reg (mode2, op2);
13344 if (!insn_data[icode].operand[2].predicate (op3, mode3))
13345 op3 = copy_to_mode_reg (mode3, op3);
13346
13347 op4 = gen_reg_rtx (mode4);
13348 emit_insn (GEN_FCN (icode) (op4, op2, op3));
13349 mode0 = mode4 == P2HImode ? HImode : QImode;
13350 emit_move_insn (gen_rtx_MEM (mode0, op0),
13351 gen_lowpart (mode0, op4));
13352 emit_move_insn (gen_rtx_MEM (mode0, op1),
13353 gen_highpart (mode0, op4));
13354
13355 return 0;
13356
2bf6d935
ML
13357 case IX86_BUILTIN_RDPMC:
13358 case IX86_BUILTIN_RDTSC:
13359 case IX86_BUILTIN_RDTSCP:
13360 case IX86_BUILTIN_XGETBV:
13361
13362 op0 = gen_reg_rtx (DImode);
13363 op1 = gen_reg_rtx (DImode);
13364
13365 if (fcode == IX86_BUILTIN_RDPMC)
13366 {
13367 arg0 = CALL_EXPR_ARG (exp, 0);
13368 op2 = expand_normal (arg0);
13369 if (!register_operand (op2, SImode))
13370 op2 = copy_to_mode_reg (SImode, op2);
13371
13372 insn = (TARGET_64BIT
13373 ? gen_rdpmc_rex64 (op0, op1, op2)
13374 : gen_rdpmc (op0, op2));
13375 emit_insn (insn);
13376 }
13377 else if (fcode == IX86_BUILTIN_XGETBV)
13378 {
13379 arg0 = CALL_EXPR_ARG (exp, 0);
13380 op2 = expand_normal (arg0);
13381 if (!register_operand (op2, SImode))
13382 op2 = copy_to_mode_reg (SImode, op2);
13383
13384 insn = (TARGET_64BIT
13385 ? gen_xgetbv_rex64 (op0, op1, op2)
13386 : gen_xgetbv (op0, op2));
13387 emit_insn (insn);
13388 }
13389 else if (fcode == IX86_BUILTIN_RDTSC)
13390 {
13391 insn = (TARGET_64BIT
13392 ? gen_rdtsc_rex64 (op0, op1)
13393 : gen_rdtsc (op0));
13394 emit_insn (insn);
13395 }
13396 else
13397 {
13398 op2 = gen_reg_rtx (SImode);
13399
13400 insn = (TARGET_64BIT
13401 ? gen_rdtscp_rex64 (op0, op1, op2)
13402 : gen_rdtscp (op0, op2));
13403 emit_insn (insn);
13404
13405 arg0 = CALL_EXPR_ARG (exp, 0);
13406 op4 = expand_normal (arg0);
13407 if (!address_operand (op4, VOIDmode))
13408 {
13409 op4 = convert_memory_address (Pmode, op4);
13410 op4 = copy_addr_to_reg (op4);
13411 }
13412 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
13413 }
13414
13415 if (target == 0
13416 || !register_operand (target, DImode))
13417 target = gen_reg_rtx (DImode);
13418
13419 if (TARGET_64BIT)
13420 {
13421 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
13422 op1, 1, OPTAB_DIRECT);
13423 op0 = expand_simple_binop (DImode, IOR, op0, op1,
13424 op0, 1, OPTAB_DIRECT);
13425 }
13426
13427 emit_move_insn (target, op0);
13428 return target;
13429
6a10feda
XG
13430 case IX86_BUILTIN_ENQCMD:
13431 case IX86_BUILTIN_ENQCMDS:
2bf6d935
ML
13432 case IX86_BUILTIN_MOVDIR64B:
13433
13434 arg0 = CALL_EXPR_ARG (exp, 0);
13435 arg1 = CALL_EXPR_ARG (exp, 1);
13436 op0 = expand_normal (arg0);
13437 op1 = expand_normal (arg1);
13438
13439 op0 = ix86_zero_extend_to_Pmode (op0);
13440 if (!address_operand (op1, VOIDmode))
13441 {
13442 op1 = convert_memory_address (Pmode, op1);
13443 op1 = copy_addr_to_reg (op1);
13444 }
13445 op1 = gen_rtx_MEM (XImode, op1);
13446
6a10feda
XG
13447 if (fcode == IX86_BUILTIN_MOVDIR64B)
13448 {
13449 emit_insn (gen_movdir64b (Pmode, op0, op1));
13450 return 0;
13451 }
13452 else
13453 {
44320665
UB
13454 if (target == 0
13455 || !register_operand (target, SImode))
13456 target = gen_reg_rtx (SImode);
6a10feda 13457
6a10feda
XG
13458 emit_move_insn (target, const0_rtx);
13459 target = gen_rtx_SUBREG (QImode, target, 0);
13460
44320665
UB
13461 int unspecv = (fcode == IX86_BUILTIN_ENQCMD
13462 ? UNSPECV_ENQCMD
13463 : UNSPECV_ENQCMDS);
13464 icode = code_for_enqcmd (unspecv, Pmode);
13465 emit_insn (GEN_FCN (icode) (op0, op1));
6a10feda 13466
44320665
UB
13467 emit_insn
13468 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
13469 gen_rtx_fmt_ee (EQ, QImode,
13470 gen_rtx_REG (CCZmode, FLAGS_REG),
13471 const0_rtx)));
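 /* The enqcmd/enqcmds status is reported in ZF; it is read back with a
    setcc into the low byte of the SImode pseudo (whose upper bits were
    zeroed above) via STRICT_LOW_PART, and the SImode SUBREG_REG is what the
    builtin returns.  */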
6a10feda
XG
13472 return SUBREG_REG (target);
13473 }
2bf6d935
ML
13474
13475 case IX86_BUILTIN_FXSAVE:
13476 case IX86_BUILTIN_FXRSTOR:
13477 case IX86_BUILTIN_FXSAVE64:
13478 case IX86_BUILTIN_FXRSTOR64:
13479 case IX86_BUILTIN_FNSTENV:
13480 case IX86_BUILTIN_FLDENV:
13481 mode0 = BLKmode;
13482 switch (fcode)
13483 {
13484 case IX86_BUILTIN_FXSAVE:
13485 icode = CODE_FOR_fxsave;
13486 break;
13487 case IX86_BUILTIN_FXRSTOR:
13488 icode = CODE_FOR_fxrstor;
13489 break;
13490 case IX86_BUILTIN_FXSAVE64:
13491 icode = CODE_FOR_fxsave64;
13492 break;
13493 case IX86_BUILTIN_FXRSTOR64:
13494 icode = CODE_FOR_fxrstor64;
13495 break;
13496 case IX86_BUILTIN_FNSTENV:
13497 icode = CODE_FOR_fnstenv;
13498 break;
13499 case IX86_BUILTIN_FLDENV:
13500 icode = CODE_FOR_fldenv;
13501 break;
13502 default:
13503 gcc_unreachable ();
13504 }
13505
13506 arg0 = CALL_EXPR_ARG (exp, 0);
13507 op0 = expand_normal (arg0);
13508
13509 if (!address_operand (op0, VOIDmode))
13510 {
13511 op0 = convert_memory_address (Pmode, op0);
13512 op0 = copy_addr_to_reg (op0);
13513 }
13514 op0 = gen_rtx_MEM (mode0, op0);
13515
13516 pat = GEN_FCN (icode) (op0);
13517 if (pat)
13518 emit_insn (pat);
13519 return 0;
13520
13521 case IX86_BUILTIN_XSETBV:
13522 arg0 = CALL_EXPR_ARG (exp, 0);
13523 arg1 = CALL_EXPR_ARG (exp, 1);
13524 op0 = expand_normal (arg0);
13525 op1 = expand_normal (arg1);
13526
13527 if (!REG_P (op0))
13528 op0 = copy_to_mode_reg (SImode, op0);
13529
13530 op1 = force_reg (DImode, op1);
13531
13532 if (TARGET_64BIT)
13533 {
13534 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
13535 NULL, 1, OPTAB_DIRECT);
13536
13537 icode = CODE_FOR_xsetbv_rex64;
13538
13539 op2 = gen_lowpart (SImode, op2);
13540 op1 = gen_lowpart (SImode, op1);
13541 pat = GEN_FCN (icode) (op0, op1, op2);
13542 }
13543 else
13544 {
13545 icode = CODE_FOR_xsetbv;
13546
13547 pat = GEN_FCN (icode) (op0, op1);
13548 }
13549 if (pat)
13550 emit_insn (pat);
13551 return 0;
13552
13553 case IX86_BUILTIN_XSAVE:
13554 case IX86_BUILTIN_XRSTOR:
13555 case IX86_BUILTIN_XSAVE64:
13556 case IX86_BUILTIN_XRSTOR64:
13557 case IX86_BUILTIN_XSAVEOPT:
13558 case IX86_BUILTIN_XSAVEOPT64:
13559 case IX86_BUILTIN_XSAVES:
13560 case IX86_BUILTIN_XRSTORS:
13561 case IX86_BUILTIN_XSAVES64:
13562 case IX86_BUILTIN_XRSTORS64:
13563 case IX86_BUILTIN_XSAVEC:
13564 case IX86_BUILTIN_XSAVEC64:
13565 arg0 = CALL_EXPR_ARG (exp, 0);
13566 arg1 = CALL_EXPR_ARG (exp, 1);
13567 op0 = expand_normal (arg0);
13568 op1 = expand_normal (arg1);
13569
13570 if (!address_operand (op0, VOIDmode))
13571 {
13572 op0 = convert_memory_address (Pmode, op0);
13573 op0 = copy_addr_to_reg (op0);
13574 }
13575 op0 = gen_rtx_MEM (BLKmode, op0);
13576
13577 op1 = force_reg (DImode, op1);
13578
13579 if (TARGET_64BIT)
13580 {
13581 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
13582 NULL, 1, OPTAB_DIRECT);
13583 switch (fcode)
13584 {
13585 case IX86_BUILTIN_XSAVE:
13586 icode = CODE_FOR_xsave_rex64;
13587 break;
13588 case IX86_BUILTIN_XRSTOR:
13589 icode = CODE_FOR_xrstor_rex64;
13590 break;
13591 case IX86_BUILTIN_XSAVE64:
13592 icode = CODE_FOR_xsave64;
13593 break;
13594 case IX86_BUILTIN_XRSTOR64:
13595 icode = CODE_FOR_xrstor64;
13596 break;
13597 case IX86_BUILTIN_XSAVEOPT:
13598 icode = CODE_FOR_xsaveopt_rex64;
13599 break;
13600 case IX86_BUILTIN_XSAVEOPT64:
13601 icode = CODE_FOR_xsaveopt64;
13602 break;
13603 case IX86_BUILTIN_XSAVES:
13604 icode = CODE_FOR_xsaves_rex64;
13605 break;
13606 case IX86_BUILTIN_XRSTORS:
13607 icode = CODE_FOR_xrstors_rex64;
13608 break;
13609 case IX86_BUILTIN_XSAVES64:
13610 icode = CODE_FOR_xsaves64;
13611 break;
13612 case IX86_BUILTIN_XRSTORS64:
13613 icode = CODE_FOR_xrstors64;
13614 break;
13615 case IX86_BUILTIN_XSAVEC:
13616 icode = CODE_FOR_xsavec_rex64;
13617 break;
13618 case IX86_BUILTIN_XSAVEC64:
13619 icode = CODE_FOR_xsavec64;
13620 break;
13621 default:
13622 gcc_unreachable ();
13623 }
13624
13625 op2 = gen_lowpart (SImode, op2);
13626 op1 = gen_lowpart (SImode, op1);
13627 pat = GEN_FCN (icode) (op0, op1, op2);
13628 }
13629 else
13630 {
13631 switch (fcode)
13632 {
13633 case IX86_BUILTIN_XSAVE:
13634 icode = CODE_FOR_xsave;
13635 break;
13636 case IX86_BUILTIN_XRSTOR:
13637 icode = CODE_FOR_xrstor;
13638 break;
13639 case IX86_BUILTIN_XSAVEOPT:
13640 icode = CODE_FOR_xsaveopt;
13641 break;
13642 case IX86_BUILTIN_XSAVES:
13643 icode = CODE_FOR_xsaves;
13644 break;
13645 case IX86_BUILTIN_XRSTORS:
13646 icode = CODE_FOR_xrstors;
13647 break;
13648 case IX86_BUILTIN_XSAVEC:
13649 icode = CODE_FOR_xsavec;
13650 break;
13651 default:
13652 gcc_unreachable ();
13653 }
13654 pat = GEN_FCN (icode) (op0, op1);
13655 }
13656
13657 if (pat)
13658 emit_insn (pat);
13659 return 0;
13660
13661 case IX86_BUILTIN_LLWPCB:
13662 arg0 = CALL_EXPR_ARG (exp, 0);
13663 op0 = expand_normal (arg0);
2398c206
UB
13664
13665 if (!register_operand (op0, Pmode))
2bf6d935 13666 op0 = ix86_zero_extend_to_Pmode (op0);
2398c206 13667 emit_insn (gen_lwp_llwpcb (Pmode, op0));
2bf6d935
ML
13668 return 0;
13669
13670 case IX86_BUILTIN_SLWPCB:
2bf6d935 13671 if (!target
2398c206 13672 || !register_operand (target, Pmode))
2bf6d935 13673 target = gen_reg_rtx (Pmode);
2398c206 13674 emit_insn (gen_lwp_slwpcb (Pmode, target));
2bf6d935
ML
13675 return target;
13676
2398c206
UB
13677 case IX86_BUILTIN_LWPVAL32:
13678 case IX86_BUILTIN_LWPVAL64:
13679 case IX86_BUILTIN_LWPINS32:
13680 case IX86_BUILTIN_LWPINS64:
13681 mode = ((fcode == IX86_BUILTIN_LWPVAL32
13682 || fcode == IX86_BUILTIN_LWPINS32)
13683 ? SImode : DImode);
13684
13685 if (fcode == IX86_BUILTIN_LWPVAL32
13686 || fcode == IX86_BUILTIN_LWPVAL64)
13687 icode = code_for_lwp_lwpval (mode);
13688 else
13689 icode = code_for_lwp_lwpins (mode);
13690
13691 arg0 = CALL_EXPR_ARG (exp, 0);
13692 arg1 = CALL_EXPR_ARG (exp, 1);
13693 arg2 = CALL_EXPR_ARG (exp, 2);
13694 op0 = expand_normal (arg0);
13695 op1 = expand_normal (arg1);
13696 op2 = expand_normal (arg2);
13697 mode0 = insn_data[icode].operand[0].mode;
13698
13699 if (!insn_data[icode].operand[0].predicate (op0, mode0))
13700 op0 = copy_to_mode_reg (mode0, op0);
13701 if (!insn_data[icode].operand[1].predicate (op1, SImode))
13702 op1 = copy_to_mode_reg (SImode, op1);
13703
13704 if (!CONST_INT_P (op2))
13705 {
13706 error ("the last argument must be a 32-bit immediate");
13707 return const0_rtx;
13708 }
13709
13710 emit_insn (GEN_FCN (icode) (op0, op1, op2));
13711
13712 if (fcode == IX86_BUILTIN_LWPINS32
13713 || fcode == IX86_BUILTIN_LWPINS64)
13714 {
13715 if (target == 0
13716 || !nonimmediate_operand (target, QImode))
13717 target = gen_reg_rtx (QImode);
13718
13719 pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
13720 const0_rtx);
13721 emit_insn (gen_rtx_SET (target, pat));
13722
13723 return target;
13724 }
13725 else
13726 return 0;
13727
2bf6d935
ML
13728 case IX86_BUILTIN_BEXTRI32:
13729 case IX86_BUILTIN_BEXTRI64:
9e026191
UB
13730 mode = (fcode == IX86_BUILTIN_BEXTRI32 ? SImode : DImode);
13731
2bf6d935
ML
13732 arg0 = CALL_EXPR_ARG (exp, 0);
13733 arg1 = CALL_EXPR_ARG (exp, 1);
13734 op0 = expand_normal (arg0);
13735 op1 = expand_normal (arg1);
9e026191 13736
2bf6d935 13737 if (!CONST_INT_P (op1))
9e026191
UB
13738 {
13739 error ("last argument must be an immediate");
13740 return const0_rtx;
13741 }
2bf6d935 13742 else
9e026191
UB
13743 {
13744 unsigned char lsb_index = UINTVAL (op1);
13745 unsigned char length = UINTVAL (op1) >> 8;
13746
13747 unsigned char bitsize = GET_MODE_BITSIZE (mode);
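 /* The BEXTRI immediate packs the start bit in its low byte and the field
    length in the next byte; e.g. an immediate of 0x0408 asks for 4 bits
    starting at bit 8.  The checks below fold degenerate cases (zero length
    or a start past the operand size) to zero and clamp the length to the
    operand size.  */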
13748
13749 icode = code_for_tbm_bextri (mode);
2bf6d935
ML
13750
13751 mode1 = insn_data[icode].operand[1].mode;
13752 if (!insn_data[icode].operand[1].predicate (op0, mode1))
13753 op0 = copy_to_mode_reg (mode1, op0);
13754
13755 mode0 = insn_data[icode].operand[0].mode;
13756 if (target == 0
13757 || !register_operand (target, mode0))
13758 target = gen_reg_rtx (mode0);
13759
9e026191
UB
13760 if (length == 0 || lsb_index >= bitsize)
13761 {
13762 emit_move_insn (target, const0_rtx);
13763 return target;
13764 }
13765
13766 if (length + lsb_index > bitsize)
13767 length = bitsize - lsb_index;
13768
13769 op1 = GEN_INT (length);
13770 op2 = GEN_INT (lsb_index);
13771
13772 emit_insn (GEN_FCN (icode) (target, op0, op1, op2));
13773 return target;
13774 }
2bf6d935
ML
13775
13776 case IX86_BUILTIN_RDRAND16_STEP:
9e026191 13777 mode = HImode;
2bf6d935
ML
13778 goto rdrand_step;
13779
13780 case IX86_BUILTIN_RDRAND32_STEP:
9e026191 13781 mode = SImode;
2bf6d935
ML
13782 goto rdrand_step;
13783
13784 case IX86_BUILTIN_RDRAND64_STEP:
9e026191 13785 mode = DImode;
2bf6d935
ML
13786
13787rdrand_step:
13788 arg0 = CALL_EXPR_ARG (exp, 0);
13789 op1 = expand_normal (arg0);
13790 if (!address_operand (op1, VOIDmode))
13791 {
13792 op1 = convert_memory_address (Pmode, op1);
13793 op1 = copy_addr_to_reg (op1);
13794 }
13795
9e026191
UB
13796 op0 = gen_reg_rtx (mode);
13797 emit_insn (gen_rdrand (mode, op0));
2bf6d935 13798
9e026191 13799 emit_move_insn (gen_rtx_MEM (mode, op1), op0);
2bf6d935 13800
9e026191 13801 op1 = force_reg (SImode, const1_rtx);
2bf6d935
ML
13802
13803 /* Emit SImode conditional move. */
9e026191 13804 if (mode == HImode)
2bf6d935
ML
13805 {
13806 if (TARGET_ZERO_EXTEND_WITH_AND
13807 && optimize_function_for_speed_p (cfun))
13808 {
13809 op2 = force_reg (SImode, const0_rtx);
13810
13811 emit_insn (gen_movstricthi
13812 (gen_lowpart (HImode, op2), op0));
13813 }
13814 else
13815 {
13816 op2 = gen_reg_rtx (SImode);
13817
13818 emit_insn (gen_zero_extendhisi2 (op2, op0));
13819 }
13820 }
9e026191 13821 else if (mode == SImode)
2bf6d935
ML
13822 op2 = op0;
13823 else
13824 op2 = gen_rtx_SUBREG (SImode, op0, 0);
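 /* RDRAND zeroes its destination when no random number is available (CF
    clear), so the conditional move below can return the value itself
    (already zero) on failure and the constant 1 in op1 on success, giving
    the usual 0/1 status without a separate setcc.  */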
13825
13826 if (target == 0
13827 || !register_operand (target, SImode))
13828 target = gen_reg_rtx (SImode);
13829
13830 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
13831 const0_rtx);
13832 emit_insn (gen_rtx_SET (target,
13833 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
13834 return target;
13835
13836 case IX86_BUILTIN_RDSEED16_STEP:
9e026191 13837 mode = HImode;
2bf6d935
ML
13838 goto rdseed_step;
13839
13840 case IX86_BUILTIN_RDSEED32_STEP:
9e026191 13841 mode = SImode;
2bf6d935
ML
13842 goto rdseed_step;
13843
13844 case IX86_BUILTIN_RDSEED64_STEP:
9e026191 13845 mode = DImode;
2bf6d935
ML
13846
13847rdseed_step:
13848 arg0 = CALL_EXPR_ARG (exp, 0);
13849 op1 = expand_normal (arg0);
13850 if (!address_operand (op1, VOIDmode))
13851 {
13852 op1 = convert_memory_address (Pmode, op1);
13853 op1 = copy_addr_to_reg (op1);
13854 }
13855
9e026191
UB
13856 op0 = gen_reg_rtx (mode);
13857 emit_insn (gen_rdseed (mode, op0));
2bf6d935 13858
9e026191 13859 emit_move_insn (gen_rtx_MEM (mode, op1), op0);
2bf6d935
ML
13860
13861 op2 = gen_reg_rtx (QImode);
13862
13863 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
13864 const0_rtx);
13865 emit_insn (gen_rtx_SET (op2, pat));
13866
13867 if (target == 0
13868 || !register_operand (target, SImode))
13869 target = gen_reg_rtx (SImode);
13870
13871 emit_insn (gen_zero_extendqisi2 (target, op2));
13872 return target;
13873
13874 case IX86_BUILTIN_SBB32:
13875 icode = CODE_FOR_subborrowsi;
13876 icode2 = CODE_FOR_subborrowsi_0;
13877 mode0 = SImode;
13878 mode1 = DImode;
13879 mode2 = CCmode;
13880 goto handlecarry;
13881
13882 case IX86_BUILTIN_SBB64:
13883 icode = CODE_FOR_subborrowdi;
13884 icode2 = CODE_FOR_subborrowdi_0;
13885 mode0 = DImode;
13886 mode1 = TImode;
13887 mode2 = CCmode;
13888 goto handlecarry;
13889
13890 case IX86_BUILTIN_ADDCARRYX32:
13891 icode = CODE_FOR_addcarrysi;
13892 icode2 = CODE_FOR_addcarrysi_0;
13893 mode0 = SImode;
13894 mode1 = DImode;
13895 mode2 = CCCmode;
13896 goto handlecarry;
13897
13898 case IX86_BUILTIN_ADDCARRYX64:
13899 icode = CODE_FOR_addcarrydi;
13900 icode2 = CODE_FOR_addcarrydi_0;
13901 mode0 = DImode;
13902 mode1 = TImode;
13903 mode2 = CCCmode;
13904
13905 handlecarry:
13906 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
13907 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
13908 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
13909 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
13910
13911 op1 = expand_normal (arg0);
13912 if (!integer_zerop (arg0))
13913 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
13914
13915 op2 = expand_normal (arg1);
13916 if (!register_operand (op2, mode0))
13917 op2 = copy_to_mode_reg (mode0, op2);
13918
13919 op3 = expand_normal (arg2);
13920 if (!register_operand (op3, mode0))
13921 op3 = copy_to_mode_reg (mode0, op3);
13922
13923 op4 = expand_normal (arg3);
13924 if (!address_operand (op4, VOIDmode))
13925 {
13926 op4 = convert_memory_address (Pmode, op4);
13927 op4 = copy_addr_to_reg (op4);
13928 }
13929
13930 op0 = gen_reg_rtx (mode0);
13931 if (integer_zerop (arg0))
13932 {
13933 /* If arg0 is 0, optimize right away into add or sub
13934 instruction that sets CCCmode flags. */
13935 op1 = gen_rtx_REG (mode2, FLAGS_REG);
13936 emit_insn (GEN_FCN (icode2) (op0, op2, op3));
13937 }
13938 else
13939 {
13940 /* Generate CF from input operand. */
13941 emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
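 /* Adding 0xff (constm1_rtx in QImode) to the carry-in byte produces a
    carry exactly when that byte is nonzero, i.e. it recreates
    CF = (c_in != 0) for the carry-consuming pattern emitted next.  */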
13942
13943 /* Generate instruction that consumes CF. */
13944 op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
13945 pat = gen_rtx_LTU (mode1, op1, const0_rtx);
13946 pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
13947 emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
13948 }
13949
13950 /* Return current CF value. */
13951 if (target == 0)
13952 target = gen_reg_rtx (QImode);
13953
13954 pat = gen_rtx_LTU (QImode, op1, const0_rtx);
13955 emit_insn (gen_rtx_SET (target, pat));
13956
13957 /* Store the result. */
13958 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
13959
13960 return target;
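 /* User-level sketch (assuming the usual adxintrin.h wrapper): for
	unsigned char c = _addcarryx_u32 (c_in, a, b, &sum);
    the expansion above stores the sum through the pointer and returns the
    resulting carry flag as the 0/1 value of c.  */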
13961
13962 case IX86_BUILTIN_READ_FLAGS:
b60bc913
JJ
13963 if (ignore)
13964 return const0_rtx;
13965
2bf6d935
ML
13966 emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
13967
13968 if (optimize
13969 || target == NULL_RTX
13970 || !nonimmediate_operand (target, word_mode)
13971 || GET_MODE (target) != word_mode)
13972 target = gen_reg_rtx (word_mode);
13973
13974 emit_insn (gen_pop (target));
13975 return target;
13976
13977 case IX86_BUILTIN_WRITE_FLAGS:
13978
13979 arg0 = CALL_EXPR_ARG (exp, 0);
13980 op0 = expand_normal (arg0);
13981 if (!general_no_elim_operand (op0, word_mode))
13982 op0 = copy_to_mode_reg (word_mode, op0);
13983
13984 emit_insn (gen_push (op0));
13985 emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
13986 return 0;
13987
13988 case IX86_BUILTIN_KTESTC8:
13989 icode = CODE_FOR_ktestqi;
13990 mode3 = CCCmode;
13991 goto kortest;
13992
13993 case IX86_BUILTIN_KTESTZ8:
13994 icode = CODE_FOR_ktestqi;
13995 mode3 = CCZmode;
13996 goto kortest;
13997
13998 case IX86_BUILTIN_KTESTC16:
13999 icode = CODE_FOR_ktesthi;
14000 mode3 = CCCmode;
14001 goto kortest;
14002
14003 case IX86_BUILTIN_KTESTZ16:
14004 icode = CODE_FOR_ktesthi;
14005 mode3 = CCZmode;
14006 goto kortest;
14007
14008 case IX86_BUILTIN_KTESTC32:
14009 icode = CODE_FOR_ktestsi;
14010 mode3 = CCCmode;
14011 goto kortest;
14012
14013 case IX86_BUILTIN_KTESTZ32:
14014 icode = CODE_FOR_ktestsi;
14015 mode3 = CCZmode;
14016 goto kortest;
14017
14018 case IX86_BUILTIN_KTESTC64:
14019 icode = CODE_FOR_ktestdi;
14020 mode3 = CCCmode;
14021 goto kortest;
14022
14023 case IX86_BUILTIN_KTESTZ64:
14024 icode = CODE_FOR_ktestdi;
14025 mode3 = CCZmode;
14026 goto kortest;
14027
14028 case IX86_BUILTIN_KORTESTC8:
14029 icode = CODE_FOR_kortestqi;
14030 mode3 = CCCmode;
14031 goto kortest;
14032
14033 case IX86_BUILTIN_KORTESTZ8:
14034 icode = CODE_FOR_kortestqi;
14035 mode3 = CCZmode;
14036 goto kortest;
14037
14038 case IX86_BUILTIN_KORTESTC16:
14039 icode = CODE_FOR_kortesthi;
14040 mode3 = CCCmode;
14041 goto kortest;
14042
14043 case IX86_BUILTIN_KORTESTZ16:
14044 icode = CODE_FOR_kortesthi;
14045 mode3 = CCZmode;
14046 goto kortest;
14047
14048 case IX86_BUILTIN_KORTESTC32:
14049 icode = CODE_FOR_kortestsi;
14050 mode3 = CCCmode;
14051 goto kortest;
14052
14053 case IX86_BUILTIN_KORTESTZ32:
14054 icode = CODE_FOR_kortestsi;
14055 mode3 = CCZmode;
14056 goto kortest;
14057
14058 case IX86_BUILTIN_KORTESTC64:
14059 icode = CODE_FOR_kortestdi;
14060 mode3 = CCCmode;
14061 goto kortest;
14062
14063 case IX86_BUILTIN_KORTESTZ64:
14064 icode = CODE_FOR_kortestdi;
14065 mode3 = CCZmode;
14066
14067 kortest:
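 /* The C/Z suffix of the builtin chose mode3 above: CCCmode reads the carry
    flag and CCZmode the zero flag, and that is the flag the setcc at the
    end of this block returns as a 0/1 value.  */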
14068 arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
14069 arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
14070 op0 = expand_normal (arg0);
14071 op1 = expand_normal (arg1);
14072
14073 mode0 = insn_data[icode].operand[0].mode;
14074 mode1 = insn_data[icode].operand[1].mode;
14075
14076 if (GET_MODE (op0) != VOIDmode)
14077 op0 = force_reg (GET_MODE (op0), op0);
14078
14079 op0 = gen_lowpart (mode0, op0);
14080
14081 if (!insn_data[icode].operand[0].predicate (op0, mode0))
14082 op0 = copy_to_mode_reg (mode0, op0);
14083
14084 if (GET_MODE (op1) != VOIDmode)
14085 op1 = force_reg (GET_MODE (op1), op1);
14086
14087 op1 = gen_lowpart (mode1, op1);
14088
14089 if (!insn_data[icode].operand[1].predicate (op1, mode1))
14090 op1 = copy_to_mode_reg (mode1, op1);
14091
14092 target = gen_reg_rtx (QImode);
14093
14094 /* Emit kortest. */
14095 emit_insn (GEN_FCN (icode) (op0, op1));
14096 /* And use setcc to return result from flags. */
14097 ix86_expand_setcc (target, EQ,
14098 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
14099 return target;
14100
14101 case IX86_BUILTIN_GATHERSIV2DF:
14102 icode = CODE_FOR_avx2_gathersiv2df;
14103 goto gather_gen;
14104 case IX86_BUILTIN_GATHERSIV4DF:
14105 icode = CODE_FOR_avx2_gathersiv4df;
14106 goto gather_gen;
14107 case IX86_BUILTIN_GATHERDIV2DF:
14108 icode = CODE_FOR_avx2_gatherdiv2df;
14109 goto gather_gen;
14110 case IX86_BUILTIN_GATHERDIV4DF:
14111 icode = CODE_FOR_avx2_gatherdiv4df;
14112 goto gather_gen;
14113 case IX86_BUILTIN_GATHERSIV4SF:
14114 icode = CODE_FOR_avx2_gathersiv4sf;
14115 goto gather_gen;
14116 case IX86_BUILTIN_GATHERSIV8SF:
14117 icode = CODE_FOR_avx2_gathersiv8sf;
14118 goto gather_gen;
14119 case IX86_BUILTIN_GATHERDIV4SF:
14120 icode = CODE_FOR_avx2_gatherdiv4sf;
14121 goto gather_gen;
14122 case IX86_BUILTIN_GATHERDIV8SF:
14123 icode = CODE_FOR_avx2_gatherdiv8sf;
14124 goto gather_gen;
14125 case IX86_BUILTIN_GATHERSIV2DI:
14126 icode = CODE_FOR_avx2_gathersiv2di;
14127 goto gather_gen;
14128 case IX86_BUILTIN_GATHERSIV4DI:
14129 icode = CODE_FOR_avx2_gathersiv4di;
14130 goto gather_gen;
14131 case IX86_BUILTIN_GATHERDIV2DI:
14132 icode = CODE_FOR_avx2_gatherdiv2di;
14133 goto gather_gen;
14134 case IX86_BUILTIN_GATHERDIV4DI:
14135 icode = CODE_FOR_avx2_gatherdiv4di;
14136 goto gather_gen;
14137 case IX86_BUILTIN_GATHERSIV4SI:
14138 icode = CODE_FOR_avx2_gathersiv4si;
14139 goto gather_gen;
14140 case IX86_BUILTIN_GATHERSIV8SI:
14141 icode = CODE_FOR_avx2_gathersiv8si;
14142 goto gather_gen;
14143 case IX86_BUILTIN_GATHERDIV4SI:
14144 icode = CODE_FOR_avx2_gatherdiv4si;
14145 goto gather_gen;
14146 case IX86_BUILTIN_GATHERDIV8SI:
14147 icode = CODE_FOR_avx2_gatherdiv8si;
14148 goto gather_gen;
14149 case IX86_BUILTIN_GATHERALTSIV4DF:
14150 icode = CODE_FOR_avx2_gathersiv4df;
14151 goto gather_gen;
14152 case IX86_BUILTIN_GATHERALTDIV8SF:
14153 icode = CODE_FOR_avx2_gatherdiv8sf;
14154 goto gather_gen;
14155 case IX86_BUILTIN_GATHERALTSIV4DI:
14156 icode = CODE_FOR_avx2_gathersiv4di;
14157 goto gather_gen;
14158 case IX86_BUILTIN_GATHERALTDIV8SI:
14159 icode = CODE_FOR_avx2_gatherdiv8si;
14160 goto gather_gen;
14161 case IX86_BUILTIN_GATHER3SIV16SF:
14162 icode = CODE_FOR_avx512f_gathersiv16sf;
14163 goto gather_gen;
14164 case IX86_BUILTIN_GATHER3SIV8DF:
14165 icode = CODE_FOR_avx512f_gathersiv8df;
14166 goto gather_gen;
14167 case IX86_BUILTIN_GATHER3DIV16SF:
14168 icode = CODE_FOR_avx512f_gatherdiv16sf;
14169 goto gather_gen;
14170 case IX86_BUILTIN_GATHER3DIV8DF:
14171 icode = CODE_FOR_avx512f_gatherdiv8df;
14172 goto gather_gen;
14173 case IX86_BUILTIN_GATHER3SIV16SI:
14174 icode = CODE_FOR_avx512f_gathersiv16si;
14175 goto gather_gen;
14176 case IX86_BUILTIN_GATHER3SIV8DI:
14177 icode = CODE_FOR_avx512f_gathersiv8di;
14178 goto gather_gen;
14179 case IX86_BUILTIN_GATHER3DIV16SI:
14180 icode = CODE_FOR_avx512f_gatherdiv16si;
14181 goto gather_gen;
14182 case IX86_BUILTIN_GATHER3DIV8DI:
14183 icode = CODE_FOR_avx512f_gatherdiv8di;
14184 goto gather_gen;
14185 case IX86_BUILTIN_GATHER3ALTSIV8DF:
14186 icode = CODE_FOR_avx512f_gathersiv8df;
14187 goto gather_gen;
14188 case IX86_BUILTIN_GATHER3ALTDIV16SF:
14189 icode = CODE_FOR_avx512f_gatherdiv16sf;
14190 goto gather_gen;
14191 case IX86_BUILTIN_GATHER3ALTSIV8DI:
14192 icode = CODE_FOR_avx512f_gathersiv8di;
14193 goto gather_gen;
14194 case IX86_BUILTIN_GATHER3ALTDIV16SI:
14195 icode = CODE_FOR_avx512f_gatherdiv16si;
14196 goto gather_gen;
14197 case IX86_BUILTIN_GATHER3SIV2DF:
14198 icode = CODE_FOR_avx512vl_gathersiv2df;
14199 goto gather_gen;
14200 case IX86_BUILTIN_GATHER3SIV4DF:
14201 icode = CODE_FOR_avx512vl_gathersiv4df;
14202 goto gather_gen;
14203 case IX86_BUILTIN_GATHER3DIV2DF:
14204 icode = CODE_FOR_avx512vl_gatherdiv2df;
14205 goto gather_gen;
14206 case IX86_BUILTIN_GATHER3DIV4DF:
14207 icode = CODE_FOR_avx512vl_gatherdiv4df;
14208 goto gather_gen;
14209 case IX86_BUILTIN_GATHER3SIV4SF:
14210 icode = CODE_FOR_avx512vl_gathersiv4sf;
14211 goto gather_gen;
14212 case IX86_BUILTIN_GATHER3SIV8SF:
14213 icode = CODE_FOR_avx512vl_gathersiv8sf;
14214 goto gather_gen;
14215 case IX86_BUILTIN_GATHER3DIV4SF:
14216 icode = CODE_FOR_avx512vl_gatherdiv4sf;
14217 goto gather_gen;
14218 case IX86_BUILTIN_GATHER3DIV8SF:
14219 icode = CODE_FOR_avx512vl_gatherdiv8sf;
14220 goto gather_gen;
14221 case IX86_BUILTIN_GATHER3SIV2DI:
14222 icode = CODE_FOR_avx512vl_gathersiv2di;
14223 goto gather_gen;
14224 case IX86_BUILTIN_GATHER3SIV4DI:
14225 icode = CODE_FOR_avx512vl_gathersiv4di;
14226 goto gather_gen;
14227 case IX86_BUILTIN_GATHER3DIV2DI:
14228 icode = CODE_FOR_avx512vl_gatherdiv2di;
14229 goto gather_gen;
14230 case IX86_BUILTIN_GATHER3DIV4DI:
14231 icode = CODE_FOR_avx512vl_gatherdiv4di;
14232 goto gather_gen;
14233 case IX86_BUILTIN_GATHER3SIV4SI:
14234 icode = CODE_FOR_avx512vl_gathersiv4si;
14235 goto gather_gen;
14236 case IX86_BUILTIN_GATHER3SIV8SI:
14237 icode = CODE_FOR_avx512vl_gathersiv8si;
14238 goto gather_gen;
14239 case IX86_BUILTIN_GATHER3DIV4SI:
14240 icode = CODE_FOR_avx512vl_gatherdiv4si;
14241 goto gather_gen;
14242 case IX86_BUILTIN_GATHER3DIV8SI:
14243 icode = CODE_FOR_avx512vl_gatherdiv8si;
14244 goto gather_gen;
14245 case IX86_BUILTIN_GATHER3ALTSIV4DF:
14246 icode = CODE_FOR_avx512vl_gathersiv4df;
14247 goto gather_gen;
14248 case IX86_BUILTIN_GATHER3ALTDIV8SF:
14249 icode = CODE_FOR_avx512vl_gatherdiv8sf;
14250 goto gather_gen;
14251 case IX86_BUILTIN_GATHER3ALTSIV4DI:
14252 icode = CODE_FOR_avx512vl_gathersiv4di;
14253 goto gather_gen;
14254 case IX86_BUILTIN_GATHER3ALTDIV8SI:
14255 icode = CODE_FOR_avx512vl_gatherdiv8si;
14256 goto gather_gen;
14257 case IX86_BUILTIN_SCATTERSIV16SF:
14258 icode = CODE_FOR_avx512f_scattersiv16sf;
14259 goto scatter_gen;
14260 case IX86_BUILTIN_SCATTERSIV8DF:
14261 icode = CODE_FOR_avx512f_scattersiv8df;
14262 goto scatter_gen;
14263 case IX86_BUILTIN_SCATTERDIV16SF:
14264 icode = CODE_FOR_avx512f_scatterdiv16sf;
14265 goto scatter_gen;
14266 case IX86_BUILTIN_SCATTERDIV8DF:
14267 icode = CODE_FOR_avx512f_scatterdiv8df;
14268 goto scatter_gen;
14269 case IX86_BUILTIN_SCATTERSIV16SI:
14270 icode = CODE_FOR_avx512f_scattersiv16si;
14271 goto scatter_gen;
14272 case IX86_BUILTIN_SCATTERSIV8DI:
14273 icode = CODE_FOR_avx512f_scattersiv8di;
14274 goto scatter_gen;
14275 case IX86_BUILTIN_SCATTERDIV16SI:
14276 icode = CODE_FOR_avx512f_scatterdiv16si;
14277 goto scatter_gen;
14278 case IX86_BUILTIN_SCATTERDIV8DI:
14279 icode = CODE_FOR_avx512f_scatterdiv8di;
14280 goto scatter_gen;
14281 case IX86_BUILTIN_SCATTERSIV8SF:
14282 icode = CODE_FOR_avx512vl_scattersiv8sf;
14283 goto scatter_gen;
14284 case IX86_BUILTIN_SCATTERSIV4SF:
14285 icode = CODE_FOR_avx512vl_scattersiv4sf;
14286 goto scatter_gen;
14287 case IX86_BUILTIN_SCATTERSIV4DF:
14288 icode = CODE_FOR_avx512vl_scattersiv4df;
14289 goto scatter_gen;
14290 case IX86_BUILTIN_SCATTERSIV2DF:
14291 icode = CODE_FOR_avx512vl_scattersiv2df;
14292 goto scatter_gen;
14293 case IX86_BUILTIN_SCATTERDIV8SF:
14294 icode = CODE_FOR_avx512vl_scatterdiv8sf;
14295 goto scatter_gen;
14296 case IX86_BUILTIN_SCATTERDIV4SF:
14297 icode = CODE_FOR_avx512vl_scatterdiv4sf;
14298 goto scatter_gen;
14299 case IX86_BUILTIN_SCATTERDIV4DF:
14300 icode = CODE_FOR_avx512vl_scatterdiv4df;
14301 goto scatter_gen;
14302 case IX86_BUILTIN_SCATTERDIV2DF:
14303 icode = CODE_FOR_avx512vl_scatterdiv2df;
14304 goto scatter_gen;
14305 case IX86_BUILTIN_SCATTERSIV8SI:
14306 icode = CODE_FOR_avx512vl_scattersiv8si;
14307 goto scatter_gen;
14308 case IX86_BUILTIN_SCATTERSIV4SI:
14309 icode = CODE_FOR_avx512vl_scattersiv4si;
14310 goto scatter_gen;
14311 case IX86_BUILTIN_SCATTERSIV4DI:
14312 icode = CODE_FOR_avx512vl_scattersiv4di;
14313 goto scatter_gen;
14314 case IX86_BUILTIN_SCATTERSIV2DI:
14315 icode = CODE_FOR_avx512vl_scattersiv2di;
14316 goto scatter_gen;
14317 case IX86_BUILTIN_SCATTERDIV8SI:
14318 icode = CODE_FOR_avx512vl_scatterdiv8si;
14319 goto scatter_gen;
14320 case IX86_BUILTIN_SCATTERDIV4SI:
14321 icode = CODE_FOR_avx512vl_scatterdiv4si;
14322 goto scatter_gen;
14323 case IX86_BUILTIN_SCATTERDIV4DI:
14324 icode = CODE_FOR_avx512vl_scatterdiv4di;
14325 goto scatter_gen;
14326 case IX86_BUILTIN_SCATTERDIV2DI:
14327 icode = CODE_FOR_avx512vl_scatterdiv2di;
14328 goto scatter_gen;
14329 case IX86_BUILTIN_GATHERPFDPD:
14330 icode = CODE_FOR_avx512pf_gatherpfv8sidf;
14331 goto vec_prefetch_gen;
14332 case IX86_BUILTIN_SCATTERALTSIV8DF:
14333 icode = CODE_FOR_avx512f_scattersiv8df;
14334 goto scatter_gen;
14335 case IX86_BUILTIN_SCATTERALTDIV16SF:
14336 icode = CODE_FOR_avx512f_scatterdiv16sf;
14337 goto scatter_gen;
14338 case IX86_BUILTIN_SCATTERALTSIV8DI:
14339 icode = CODE_FOR_avx512f_scattersiv8di;
14340 goto scatter_gen;
14341 case IX86_BUILTIN_SCATTERALTDIV16SI:
14342 icode = CODE_FOR_avx512f_scatterdiv16si;
14343 goto scatter_gen;
14344 case IX86_BUILTIN_SCATTERALTSIV4DF:
14345 icode = CODE_FOR_avx512vl_scattersiv4df;
14346 goto scatter_gen;
14347 case IX86_BUILTIN_SCATTERALTDIV8SF:
14348 icode = CODE_FOR_avx512vl_scatterdiv8sf;
14349 goto scatter_gen;
14350 case IX86_BUILTIN_SCATTERALTSIV4DI:
14351 icode = CODE_FOR_avx512vl_scattersiv4di;
14352 goto scatter_gen;
14353 case IX86_BUILTIN_SCATTERALTDIV8SI:
14354 icode = CODE_FOR_avx512vl_scatterdiv8si;
14355 goto scatter_gen;
14356 case IX86_BUILTIN_SCATTERALTSIV2DF:
14357 icode = CODE_FOR_avx512vl_scattersiv2df;
14358 goto scatter_gen;
14359 case IX86_BUILTIN_SCATTERALTDIV4SF:
14360 icode = CODE_FOR_avx512vl_scatterdiv4sf;
14361 goto scatter_gen;
14362 case IX86_BUILTIN_SCATTERALTSIV2DI:
14363 icode = CODE_FOR_avx512vl_scattersiv2di;
14364 goto scatter_gen;
14365 case IX86_BUILTIN_SCATTERALTDIV4SI:
14366 icode = CODE_FOR_avx512vl_scatterdiv4si;
14367 goto scatter_gen;
14368 case IX86_BUILTIN_GATHERPFDPS:
14369 icode = CODE_FOR_avx512pf_gatherpfv16sisf;
14370 goto vec_prefetch_gen;
14371 case IX86_BUILTIN_GATHERPFQPD:
14372 icode = CODE_FOR_avx512pf_gatherpfv8didf;
14373 goto vec_prefetch_gen;
14374 case IX86_BUILTIN_GATHERPFQPS:
14375 icode = CODE_FOR_avx512pf_gatherpfv8disf;
14376 goto vec_prefetch_gen;
14377 case IX86_BUILTIN_SCATTERPFDPD:
14378 icode = CODE_FOR_avx512pf_scatterpfv8sidf;
14379 goto vec_prefetch_gen;
14380 case IX86_BUILTIN_SCATTERPFDPS:
14381 icode = CODE_FOR_avx512pf_scatterpfv16sisf;
14382 goto vec_prefetch_gen;
14383 case IX86_BUILTIN_SCATTERPFQPD:
14384 icode = CODE_FOR_avx512pf_scatterpfv8didf;
14385 goto vec_prefetch_gen;
14386 case IX86_BUILTIN_SCATTERPFQPS:
14387 icode = CODE_FOR_avx512pf_scatterpfv8disf;
14388 goto vec_prefetch_gen;
14389
14390 gather_gen:
14391 rtx half;
14392 rtx (*gen) (rtx, rtx);
14393
14394 arg0 = CALL_EXPR_ARG (exp, 0);
14395 arg1 = CALL_EXPR_ARG (exp, 1);
14396 arg2 = CALL_EXPR_ARG (exp, 2);
14397 arg3 = CALL_EXPR_ARG (exp, 3);
14398 arg4 = CALL_EXPR_ARG (exp, 4);
14399 op0 = expand_normal (arg0);
14400 op1 = expand_normal (arg1);
14401 op2 = expand_normal (arg2);
14402 op3 = expand_normal (arg3);
14403 op4 = expand_normal (arg4);
14404 /* Note the arg order is different from the operand order. */
14405 mode0 = insn_data[icode].operand[1].mode;
14406 mode2 = insn_data[icode].operand[3].mode;
14407 mode3 = insn_data[icode].operand[4].mode;
14408 mode4 = insn_data[icode].operand[5].mode;
14409
14410 if (target == NULL_RTX
14411 || GET_MODE (target) != insn_data[icode].operand[0].mode
14412 || !insn_data[icode].operand[0].predicate (target,
14413 GET_MODE (target)))
14414 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
14415 else
14416 subtarget = target;
14417
14418 switch (fcode)
14419 {
14420 case IX86_BUILTIN_GATHER3ALTSIV8DF:
14421 case IX86_BUILTIN_GATHER3ALTSIV8DI:
14422 half = gen_reg_rtx (V8SImode);
14423 if (!nonimmediate_operand (op2, V16SImode))
14424 op2 = copy_to_mode_reg (V16SImode, op2);
14425 emit_insn (gen_vec_extract_lo_v16si (half, op2));
14426 op2 = half;
14427 break;
14428 case IX86_BUILTIN_GATHER3ALTSIV4DF:
14429 case IX86_BUILTIN_GATHER3ALTSIV4DI:
14430 case IX86_BUILTIN_GATHERALTSIV4DF:
14431 case IX86_BUILTIN_GATHERALTSIV4DI:
14432 half = gen_reg_rtx (V4SImode);
14433 if (!nonimmediate_operand (op2, V8SImode))
14434 op2 = copy_to_mode_reg (V8SImode, op2);
14435 emit_insn (gen_vec_extract_lo_v8si (half, op2));
14436 op2 = half;
14437 break;
14438 case IX86_BUILTIN_GATHER3ALTDIV16SF:
14439 case IX86_BUILTIN_GATHER3ALTDIV16SI:
14440 half = gen_reg_rtx (mode0);
14441 if (mode0 == V8SFmode)
14442 gen = gen_vec_extract_lo_v16sf;
14443 else
14444 gen = gen_vec_extract_lo_v16si;
14445 if (!nonimmediate_operand (op0, GET_MODE (op0)))
14446 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
14447 emit_insn (gen (half, op0));
14448 op0 = half;
14449 op3 = lowpart_subreg (QImode, op3, HImode);
14450 break;
14451 case IX86_BUILTIN_GATHER3ALTDIV8SF:
14452 case IX86_BUILTIN_GATHER3ALTDIV8SI:
14453 case IX86_BUILTIN_GATHERALTDIV8SF:
14454 case IX86_BUILTIN_GATHERALTDIV8SI:
14455 half = gen_reg_rtx (mode0);
14456 if (mode0 == V4SFmode)
14457 gen = gen_vec_extract_lo_v8sf;
14458 else
14459 gen = gen_vec_extract_lo_v8si;
14460 if (!nonimmediate_operand (op0, GET_MODE (op0)))
14461 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
14462 emit_insn (gen (half, op0));
14463 op0 = half;
14464 if (VECTOR_MODE_P (GET_MODE (op3)))
14465 {
14466 half = gen_reg_rtx (mode0);
14467 if (!nonimmediate_operand (op3, GET_MODE (op3)))
14468 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
14469 emit_insn (gen (half, op3));
14470 op3 = half;
14471 }
14472 break;
14473 default:
14474 break;
14475 }
14476
14477 /* Force the memory operand to use only a base register here.  We
14478 don't want to do this for the memory operands of other builtin
14479 functions. */
14480 op1 = ix86_zero_extend_to_Pmode (op1);
14481
14482 if (!insn_data[icode].operand[1].predicate (op0, mode0))
14483 op0 = copy_to_mode_reg (mode0, op0);
14484 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
14485 op1 = copy_to_mode_reg (Pmode, op1);
14486 if (!insn_data[icode].operand[3].predicate (op2, mode2))
14487 op2 = copy_to_mode_reg (mode2, op2);
14488
14489 op3 = fixup_modeless_constant (op3, mode3);
14490
14491 if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
14492 {
14493 if (!insn_data[icode].operand[4].predicate (op3, mode3))
14494 op3 = copy_to_mode_reg (mode3, op3);
14495 }
14496 else
14497 {
14498 op3 = copy_to_reg (op3);
14499 op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
14500 }
14501 if (!insn_data[icode].operand[5].predicate (op4, mode4))
14502 {
14503 error ("the last argument must be scale 1, 2, 4, 8");
14504 return const0_rtx;
14505 }
14506
14507 /* Optimize. If mask is known to have all high bits set,
14508 replace op0 with pc_rtx to signal that the instruction
14509 overwrites the whole destination and doesn't use its
14510 previous contents. */
14511 if (optimize)
14512 {
14513 if (TREE_CODE (arg3) == INTEGER_CST)
14514 {
14515 if (integer_all_onesp (arg3))
14516 op0 = pc_rtx;
14517 }
14518 else if (TREE_CODE (arg3) == VECTOR_CST)
14519 {
14520 unsigned int negative = 0;
14521 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
14522 {
14523 tree cst = VECTOR_CST_ELT (arg3, i);
14524 if (TREE_CODE (cst) == INTEGER_CST
14525 && tree_int_cst_sign_bit (cst))
14526 negative++;
14527 else if (TREE_CODE (cst) == REAL_CST
14528 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
14529 negative++;
14530 }
14531 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
14532 op0 = pc_rtx;
14533 }
14534 else if (TREE_CODE (arg3) == SSA_NAME
14535 && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
14536 {
14537 /* Recognize also when mask is like:
14538 __v2df src = _mm_setzero_pd ();
14539 __v2df mask = _mm_cmpeq_pd (src, src);
14540 or
14541 __v8sf src = _mm256_setzero_ps ();
14542 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
14543 as that is a cheaper way to load all ones into
14544 a register than having to load a constant from
14545 memory. */
14546 gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
14547 if (is_gimple_call (def_stmt))
14548 {
14549 tree fndecl = gimple_call_fndecl (def_stmt);
14550 if (fndecl
14551 && fndecl_built_in_p (fndecl, BUILT_IN_MD))
4d732405 14552 switch (DECL_MD_FUNCTION_CODE (fndecl))
2bf6d935
ML
14553 {
14554 case IX86_BUILTIN_CMPPD:
14555 case IX86_BUILTIN_CMPPS:
14556 case IX86_BUILTIN_CMPPD256:
14557 case IX86_BUILTIN_CMPPS256:
14558 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
14559 break;
14560 /* FALLTHRU */
14561 case IX86_BUILTIN_CMPEQPD:
14562 case IX86_BUILTIN_CMPEQPS:
14563 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
14564 && initializer_zerop (gimple_call_arg (def_stmt,
14565 1)))
14566 op0 = pc_rtx;
14567 break;
14568 default:
14569 break;
14570 }
14571 }
14572 }
14573 }
14574
14575 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
14576 if (! pat)
14577 return const0_rtx;
14578 emit_insn (pat);
14579
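	  /* For the DIV16/DIV8 gathers the pattern's destination is twice as
	     wide as the value the builtin returns, so only the low half of
	     SUBTARGET is copied into TARGET; otherwise SUBTARGET already is
	     the result.  */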
14580 switch (fcode)
14581 {
14582 case IX86_BUILTIN_GATHER3DIV16SF:
14583 if (target == NULL_RTX)
14584 target = gen_reg_rtx (V8SFmode);
14585 emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
14586 break;
14587 case IX86_BUILTIN_GATHER3DIV16SI:
14588 if (target == NULL_RTX)
14589 target = gen_reg_rtx (V8SImode);
14590 emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
14591 break;
14592 case IX86_BUILTIN_GATHER3DIV8SF:
14593 case IX86_BUILTIN_GATHERDIV8SF:
14594 if (target == NULL_RTX)
14595 target = gen_reg_rtx (V4SFmode);
14596 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
14597 break;
14598 case IX86_BUILTIN_GATHER3DIV8SI:
14599 case IX86_BUILTIN_GATHERDIV8SI:
14600 if (target == NULL_RTX)
14601 target = gen_reg_rtx (V4SImode);
14602 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
14603 break;
14604 default:
14605 target = subtarget;
14606 break;
14607 }
14608 return target;
14609
14610 scatter_gen:
14611 arg0 = CALL_EXPR_ARG (exp, 0);
14612 arg1 = CALL_EXPR_ARG (exp, 1);
14613 arg2 = CALL_EXPR_ARG (exp, 2);
14614 arg3 = CALL_EXPR_ARG (exp, 3);
14615 arg4 = CALL_EXPR_ARG (exp, 4);
14616 op0 = expand_normal (arg0);
14617 op1 = expand_normal (arg1);
14618 op2 = expand_normal (arg2);
14619 op3 = expand_normal (arg3);
14620 op4 = expand_normal (arg4);
14621 mode1 = insn_data[icode].operand[1].mode;
14622 mode2 = insn_data[icode].operand[2].mode;
14623 mode3 = insn_data[icode].operand[3].mode;
14624 mode4 = insn_data[icode].operand[4].mode;
14625
14626 /* Scatter instruction stores operand op3 to memory with
14627 indices from op2 and scale from op4 under writemask op1.
14628 If index operand op2 has more elements than source operand
14629 op3, only its low half is used, and vice versa. */
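	  /* For instance, for IX86_BUILTIN_SCATTERALTSIV8DF the V16SImode
	     index operand is wider than the V8DFmode source, so only its low
	     V8SImode half is extracted below.  */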
14630 switch (fcode)
14631 {
14632 case IX86_BUILTIN_SCATTERALTSIV8DF:
14633 case IX86_BUILTIN_SCATTERALTSIV8DI:
14634 half = gen_reg_rtx (V8SImode);
14635 if (!nonimmediate_operand (op2, V16SImode))
14636 op2 = copy_to_mode_reg (V16SImode, op2);
14637 emit_insn (gen_vec_extract_lo_v16si (half, op2));
14638 op2 = half;
14639 break;
14640 case IX86_BUILTIN_SCATTERALTDIV16SF:
14641 case IX86_BUILTIN_SCATTERALTDIV16SI:
14642 half = gen_reg_rtx (mode3);
14643 if (mode3 == V8SFmode)
14644 gen = gen_vec_extract_lo_v16sf;
14645 else
14646 gen = gen_vec_extract_lo_v16si;
14647 if (!nonimmediate_operand (op3, GET_MODE (op3)))
14648 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
14649 emit_insn (gen (half, op3));
14650 op3 = half;
14651 break;
14652 case IX86_BUILTIN_SCATTERALTSIV4DF:
14653 case IX86_BUILTIN_SCATTERALTSIV4DI:
14654 half = gen_reg_rtx (V4SImode);
14655 if (!nonimmediate_operand (op2, V8SImode))
14656 op2 = copy_to_mode_reg (V8SImode, op2);
14657 emit_insn (gen_vec_extract_lo_v8si (half, op2));
14658 op2 = half;
14659 break;
14660 case IX86_BUILTIN_SCATTERALTDIV8SF:
14661 case IX86_BUILTIN_SCATTERALTDIV8SI:
14662 half = gen_reg_rtx (mode3);
14663 if (mode3 == V4SFmode)
14664 gen = gen_vec_extract_lo_v8sf;
14665 else
14666 gen = gen_vec_extract_lo_v8si;
14667 if (!nonimmediate_operand (op3, GET_MODE (op3)))
14668 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
14669 emit_insn (gen (half, op3));
14670 op3 = half;
14671 break;
14672 case IX86_BUILTIN_SCATTERALTSIV2DF:
14673 case IX86_BUILTIN_SCATTERALTSIV2DI:
14674 if (!nonimmediate_operand (op2, V4SImode))
14675 op2 = copy_to_mode_reg (V4SImode, op2);
14676 break;
14677 case IX86_BUILTIN_SCATTERALTDIV4SF:
14678 case IX86_BUILTIN_SCATTERALTDIV4SI:
14679 if (!nonimmediate_operand (op3, GET_MODE (op3)))
14680 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
14681 break;
14682 default:
14683 break;
14684 }
14685
14686 /* Force the memory operand to use only a base register here.  We
14687 don't want to do this for the memory operands of other builtin
14688 functions. */
14689 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
14690
14691 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
14692 op0 = copy_to_mode_reg (Pmode, op0);
14693
14694 op1 = fixup_modeless_constant (op1, mode1);
14695
14696 if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
14697 {
14698 if (!insn_data[icode].operand[1].predicate (op1, mode1))
14699 op1 = copy_to_mode_reg (mode1, op1);
14700 }
14701 else
14702 {
14703 op1 = copy_to_reg (op1);
14704 op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
14705 }
14706
14707 if (!insn_data[icode].operand[2].predicate (op2, mode2))
14708 op2 = copy_to_mode_reg (mode2, op2);
14709
14710 if (!insn_data[icode].operand[3].predicate (op3, mode3))
14711 op3 = copy_to_mode_reg (mode3, op3);
14712
14713 if (!insn_data[icode].operand[4].predicate (op4, mode4))
14714 {
14715 error ("the last argument must be scale 1, 2, 4, 8");
14716 return const0_rtx;
14717 }
14718
14719 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
14720 if (! pat)
14721 return const0_rtx;
14722
14723 emit_insn (pat);
14724 return 0;
14725
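	  /* Expansion of the gather/scatter prefetch builtins: op0 is the
	     mask, op1 the index vector, op2 the base address, op3 the scale
	     and op4 the locality hint (see the operand checks below).  */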
14726 vec_prefetch_gen:
14727 arg0 = CALL_EXPR_ARG (exp, 0);
14728 arg1 = CALL_EXPR_ARG (exp, 1);
14729 arg2 = CALL_EXPR_ARG (exp, 2);
14730 arg3 = CALL_EXPR_ARG (exp, 3);
14731 arg4 = CALL_EXPR_ARG (exp, 4);
14732 op0 = expand_normal (arg0);
14733 op1 = expand_normal (arg1);
14734 op2 = expand_normal (arg2);
14735 op3 = expand_normal (arg3);
14736 op4 = expand_normal (arg4);
14737 mode0 = insn_data[icode].operand[0].mode;
14738 mode1 = insn_data[icode].operand[1].mode;
14739 mode3 = insn_data[icode].operand[3].mode;
14740 mode4 = insn_data[icode].operand[4].mode;
14741
14742 op0 = fixup_modeless_constant (op0, mode0);
14743
14744 if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
14745 {
14746 if (!insn_data[icode].operand[0].predicate (op0, mode0))
14747 op0 = copy_to_mode_reg (mode0, op0);
14748 }
14749 else
14750 {
14751 op0 = copy_to_reg (op0);
14752 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
14753 }
14754
14755 if (!insn_data[icode].operand[1].predicate (op1, mode1))
14756 op1 = copy_to_mode_reg (mode1, op1);
14757
14758 /* Force the memory operand to use only a base register here.  We
14759 don't want to do this for the memory operands of other builtin
14760 functions. */
14761 op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
14762
14763 if (!insn_data[icode].operand[2].predicate (op2, Pmode))
14764 op2 = copy_to_mode_reg (Pmode, op2);
14765
14766 if (!insn_data[icode].operand[3].predicate (op3, mode3))
14767 {
14768 error ("the fourth argument must be scale 1, 2, 4, 8");
14769 return const0_rtx;
14770 }
14771
14772 if (!insn_data[icode].operand[4].predicate (op4, mode4))
14773 {
14774 error ("incorrect hint operand");
14775 return const0_rtx;
14776 }
14777
14778 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
14779 if (! pat)
14780 return const0_rtx;
14781
14782 emit_insn (pat);
14783
14784 return 0;
14785
14786 case IX86_BUILTIN_XABORT:
14787 icode = CODE_FOR_xabort;
14788 arg0 = CALL_EXPR_ARG (exp, 0);
14789 op0 = expand_normal (arg0);
14790 mode0 = insn_data[icode].operand[0].mode;
14791 if (!insn_data[icode].operand[0].predicate (op0, mode0))
14792 {
14793 error ("the argument to %<xabort%> intrinsic must "
14794 "be an 8-bit immediate");
14795 return const0_rtx;
14796 }
14797 emit_insn (gen_xabort (op0));
14798 return 0;
14799
b5034abb
UB
14800 case IX86_BUILTIN_RDSSPD:
14801 case IX86_BUILTIN_RDSSPQ:
14802 mode = (fcode == IX86_BUILTIN_RDSSPD ? SImode : DImode);
14803
14804 if (target == 0
14805 || !register_operand (target, mode))
14806 target = gen_reg_rtx (mode);
14807
14808 op0 = force_reg (mode, const0_rtx);
14809
14810 emit_insn (gen_rdssp (mode, target, op0));
14811 return target;
14812
14813 case IX86_BUILTIN_INCSSPD:
14814 case IX86_BUILTIN_INCSSPQ:
14815 mode = (fcode == IX86_BUILTIN_INCSSPD ? SImode : DImode);
14816
14817 arg0 = CALL_EXPR_ARG (exp, 0);
14818 op0 = expand_normal (arg0);
14819
14820 op0 = force_reg (mode, op0);
14821
14822 emit_insn (gen_incssp (mode, op0));
14823 return 0;
14824
83927c63
HW
14825 case IX86_BUILTIN_HRESET:
14826 icode = CODE_FOR_hreset;
14827 arg0 = CALL_EXPR_ARG (exp, 0);
14828 op0 = expand_normal (arg0);
14829 op0 = force_reg (SImode, op0);
14830 emit_insn (gen_hreset (op0));
14831 return 0;
14832
2bf6d935
ML
14833 case IX86_BUILTIN_RSTORSSP:
14834 case IX86_BUILTIN_CLRSSBSY:
14835 arg0 = CALL_EXPR_ARG (exp, 0);
14836 op0 = expand_normal (arg0);
14837 icode = (fcode == IX86_BUILTIN_RSTORSSP
b5034abb
UB
14838 ? CODE_FOR_rstorssp
14839 : CODE_FOR_clrssbsy);
14840
2bf6d935
ML
14841 if (!address_operand (op0, VOIDmode))
14842 {
b5034abb
UB
14843 op0 = convert_memory_address (Pmode, op0);
14844 op0 = copy_addr_to_reg (op0);
2bf6d935 14845 }
b5034abb 14846 emit_insn (GEN_FCN (icode) (gen_rtx_MEM (DImode, op0)));
2bf6d935
ML
14847 return 0;
14848
14849 case IX86_BUILTIN_WRSSD:
14850 case IX86_BUILTIN_WRSSQ:
14851 case IX86_BUILTIN_WRUSSD:
14852 case IX86_BUILTIN_WRUSSQ:
b5034abb
UB
14853 mode = ((fcode == IX86_BUILTIN_WRSSD
14854 || fcode == IX86_BUILTIN_WRUSSD)
14855 ? SImode : DImode);
14856
2bf6d935
ML
14857 arg0 = CALL_EXPR_ARG (exp, 0);
14858 op0 = expand_normal (arg0);
14859 arg1 = CALL_EXPR_ARG (exp, 1);
14860 op1 = expand_normal (arg1);
b5034abb 14861
2bf6d935 14862 op0 = force_reg (mode, op0);
b5034abb 14863
2bf6d935
ML
14864 if (!address_operand (op1, VOIDmode))
14865 {
b5034abb
UB
14866 op1 = convert_memory_address (Pmode, op1);
14867 op1 = copy_addr_to_reg (op1);
2bf6d935 14868 }
b5034abb
UB
14869 op1 = gen_rtx_MEM (mode, op1);
14870
44320665
UB
14871 icode = ((fcode == IX86_BUILTIN_WRSSD
14872 || fcode == IX86_BUILTIN_WRSSQ)
14873 ? code_for_wrss (mode)
14874 : code_for_wruss (mode));
14875 emit_insn (GEN_FCN (icode) (op0, op1));
14876
2bf6d935
ML
14877 return 0;
14878
14879 default:
14880 break;
14881 }
14882
14883 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
14884 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
14885 {
14886 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
14887 return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
14888 target);
14889 }
14890
fd5d5794
UB
14891 if (fcode >= IX86_BUILTIN__BDESC_PURE_ARGS_FIRST
14892 && fcode <= IX86_BUILTIN__BDESC_PURE_ARGS_LAST)
14893 {
14894 i = fcode - IX86_BUILTIN__BDESC_PURE_ARGS_FIRST;
14895 return ix86_expand_special_args_builtin (bdesc_pure_args + i, exp,
14896 target);
14897 }
14898
2bf6d935
ML
14899 if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
14900 && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
14901 {
14902 i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
14903 rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
14904 rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
14905 rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
14906 int masked = 1;
14907 machine_mode mode, wide_mode, nar_mode;
14908
14909 nar_mode = V4SFmode;
14910 mode = V16SFmode;
14911 wide_mode = V64SFmode;
14912 fcn_mask = gen_avx5124fmaddps_4fmaddps_mask;
14913 fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
14914
14915 switch (fcode)
14916 {
14917 case IX86_BUILTIN_4FMAPS:
14918 fcn = gen_avx5124fmaddps_4fmaddps;
14919 masked = 0;
14920 goto v4fma_expand;
14921
14922 case IX86_BUILTIN_4DPWSSD:
14923 nar_mode = V4SImode;
14924 mode = V16SImode;
14925 wide_mode = V64SImode;
14926 fcn = gen_avx5124vnniw_vp4dpwssd;
14927 masked = 0;
14928 goto v4fma_expand;
14929
14930 case IX86_BUILTIN_4DPWSSDS:
14931 nar_mode = V4SImode;
14932 mode = V16SImode;
14933 wide_mode = V64SImode;
14934 fcn = gen_avx5124vnniw_vp4dpwssds;
14935 masked = 0;
14936 goto v4fma_expand;
14937
14938 case IX86_BUILTIN_4FNMAPS:
14939 fcn = gen_avx5124fmaddps_4fnmaddps;
14940 masked = 0;
14941 goto v4fma_expand;
14942
14943 case IX86_BUILTIN_4FNMAPS_MASK:
14944 fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask;
14945 fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
14946 goto v4fma_expand;
14947
14948 case IX86_BUILTIN_4DPWSSD_MASK:
14949 nar_mode = V4SImode;
14950 mode = V16SImode;
14951 wide_mode = V64SImode;
14952 fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask;
14953 fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
14954 goto v4fma_expand;
14955
14956 case IX86_BUILTIN_4DPWSSDS_MASK:
14957 nar_mode = V4SImode;
14958 mode = V16SImode;
14959 wide_mode = V64SImode;
14960 fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask;
14961 fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
14962 goto v4fma_expand;
14963
14964 case IX86_BUILTIN_4FMAPS_MASK:
14965 {
14966 tree args[4];
14967 rtx ops[4];
14968 rtx wide_reg;
14969 rtx accum;
14970 rtx addr;
14971 rtx mem;
14972
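	/* Common expansion for the AVX512_4FMAPS/4VNNIW builtins: the four
	   source vectors are packed into one wide register, the accumulator
	   is copied into the target and the narrow memory operand feeds the
	   instruction; the masked forms additionally merge or zero the
	   result via MERGE and MASK below.  */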
14973v4fma_expand:
14974 wide_reg = gen_reg_rtx (wide_mode);
14975 for (i = 0; i < 4; i++)
14976 {
14977 args[i] = CALL_EXPR_ARG (exp, i);
14978 ops[i] = expand_normal (args[i]);
14979
14980 emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
14981 ops[i]);
14982 }
14983
14984 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
14985 accum = force_reg (mode, accum);
14986
14987 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
14988 addr = force_reg (Pmode, addr);
14989
14990 mem = gen_rtx_MEM (nar_mode, addr);
14991
14992 target = gen_reg_rtx (mode);
14993
14994 emit_move_insn (target, accum);
14995
14996 if (! masked)
14997 emit_insn (fcn (target, accum, wide_reg, mem));
14998 else
14999 {
15000 rtx merge, mask;
15001 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
15002
15003 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
15004
15005 if (CONST_INT_P (mask))
15006 mask = fixup_modeless_constant (mask, HImode);
15007
15008 mask = force_reg (HImode, mask);
15009
15010 if (GET_MODE (mask) != HImode)
15011 mask = gen_rtx_SUBREG (HImode, mask, 0);
15012
15013 /* If merge is 0 then we're about to emit z-masked variant. */
15014 if (const0_operand (merge, mode))
15015 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
15016 /* If merge is the same as accum then emit merge-masked variant. */
15017 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
15018 {
15019 merge = force_reg (mode, merge);
15020 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
15021 }
15022 /* Merge with something unknown might happen if we z-mask w/ -O0. */
15023 else
15024 {
15025 target = gen_reg_rtx (mode);
15026 emit_move_insn (target, merge);
15027 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
15028 }
15029 }
15030 return target;
15031 }
15032
15033 case IX86_BUILTIN_4FNMASS:
15034 fcn = gen_avx5124fmaddps_4fnmaddss;
15035 masked = 0;
15036 goto s4fma_expand;
15037
15038 case IX86_BUILTIN_4FMASS:
15039 fcn = gen_avx5124fmaddps_4fmaddss;
15040 masked = 0;
15041 goto s4fma_expand;
15042
15043 case IX86_BUILTIN_4FNMASS_MASK:
15044 fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
15045 fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
15046 goto s4fma_expand;
15047
15048 case IX86_BUILTIN_4FMASS_MASK:
15049 {
15050 tree args[4];
15051 rtx ops[4];
15052 rtx wide_reg;
15053 rtx accum;
15054 rtx addr;
15055 rtx mem;
15056
15057 fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
15058 fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
15059
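	/* Scalar counterpart of v4fma_expand above: only the low SFmode
	   element of each of the four sources is packed into the wide
	   V64SFmode register, and the memory operand is a single V4SFmode
	   value.  */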
15060s4fma_expand:
15061 mode = V4SFmode;
15062 wide_reg = gen_reg_rtx (V64SFmode);
15063 for (i = 0; i < 4; i++)
15064 {
15065 rtx tmp;
15066 args[i] = CALL_EXPR_ARG (exp, i);
15067 ops[i] = expand_normal (args[i]);
15068
15069 tmp = gen_reg_rtx (SFmode);
15070 emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
15071
15072 emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
15073 gen_rtx_SUBREG (V16SFmode, tmp, 0));
15074 }
15075
15076 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
15077 accum = force_reg (V4SFmode, accum);
15078
15079 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
15080 addr = force_reg (Pmode, addr);
15081
15082 mem = gen_rtx_MEM (V4SFmode, addr);
15083
15084 target = gen_reg_rtx (V4SFmode);
15085
15086 emit_move_insn (target, accum);
15087
15088 if (! masked)
15089 emit_insn (fcn (target, accum, wide_reg, mem));
15090 else
15091 {
15092 rtx merge, mask;
15093 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
15094
15095 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
15096
15097 if (CONST_INT_P (mask))
15098 mask = fixup_modeless_constant (mask, QImode);
15099
15100 mask = force_reg (QImode, mask);
15101
15102 if (GET_MODE (mask) != QImode)
15103 mask = gen_rtx_SUBREG (QImode, mask, 0);
15104
15105 /* If merge is 0 then we're about to emit z-masked variant. */
15106 if (const0_operand (merge, mode))
15107 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
15108 /* If merge is the same as accum then emit merge-masked
15109 variant. */
15110 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
15111 {
15112 merge = force_reg (mode, merge);
15113 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
15114 }
15115 /* Merge with something unknown might happen if we z-mask
15116 w/ -O0. */
15117 else
15118 {
15119 target = gen_reg_rtx (mode);
15120 emit_move_insn (target, merge);
15121 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
15122 }
15123 }
15124 return target;
15125 }
15126 case IX86_BUILTIN_RDPID:
15127 return ix86_expand_special_args_builtin (bdesc_args + i, exp,
15128 target);
15129 case IX86_BUILTIN_FABSQ:
15130 case IX86_BUILTIN_COPYSIGNQ:
15131 if (!TARGET_SSE)
15132 /* Emit a normal call if SSE isn't available. */
15133 return expand_call (exp, target, ignore);
15134 /* FALLTHRU */
15135 default:
15136 return ix86_expand_args_builtin (bdesc_args + i, exp, target);
15137 }
15138 }
15139
15140 if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
15141 && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
15142 {
15143 i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
15144 return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
15145 }
15146
15147 if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
15148 && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
15149 {
15150 i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
15151 return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
15152 }
15153
15154 if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
15155 && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
15156 {
15157 i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
15158 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
15159 }
15160
15161 if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
15162 && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
15163 {
15164 i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
15165 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
15166 }
15167
15168 if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
15169 && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
15170 {
15171 i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
15172 const struct builtin_description *d = bdesc_multi_arg + i;
15173 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
15174 (enum ix86_builtin_func_type)
15175 d->flag, d->comparison);
15176 }
15177
15178 if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
15179 && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
15180 {
15181 i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
15182 return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
15183 target);
15184 }
15185
2bf6d935
ML
15186 gcc_unreachable ();
15187}
15188
15189/* A subroutine of ix86_expand_vector_init_duplicate. Tries to
15190 fill target with val via vec_duplicate. */
15191
15192static bool
15193ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
15194{
15195 bool ok;
15196 rtx_insn *insn;
15197 rtx dup;
b3237a2c
JJ
15198 /* Save/restore recog_data in case this is called from splitters
15199 or other routines where recog_data needs to stay valid across
15200 force_reg. See PR106577. */
15201 recog_data_d recog_data_save = recog_data;
2bf6d935
ML
15202
15203 /* First attempt to recognize VAL as-is. */
15204 dup = gen_vec_duplicate (mode, val);
15205 insn = emit_insn (gen_rtx_SET (target, dup));
15206 if (recog_memoized (insn) < 0)
15207 {
15208 rtx_insn *seq;
15209 machine_mode innermode = GET_MODE_INNER (mode);
15210 rtx reg;
15211
15212 /* If that fails, force VAL into a register. */
15213
15214 start_sequence ();
15215 reg = force_reg (innermode, val);
15216 if (GET_MODE (reg) != innermode)
15217 reg = gen_lowpart (innermode, reg);
15218 SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
15219 seq = get_insns ();
15220 end_sequence ();
15221 if (seq)
15222 emit_insn_before (seq, insn);
15223
15224 ok = recog_memoized (insn) >= 0;
15225 gcc_assert (ok);
15226 }
b3237a2c 15227 recog_data = recog_data_save;
2bf6d935
ML
15228 return true;
15229}
15230
15231/* Get a vector mode of the same size as the original but with elements
15232 twice as wide. This is only guaranteed to apply to integral vectors. */
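/* For instance, V16QImode is expected to yield V8HImode here; the asserts
   below enforce the equal-size, half-the-element-count relationship.  */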
15233
15234static machine_mode
15235get_mode_wider_vector (machine_mode o)
15236{
e53b6e56 15237 /* ??? Rely on the ordering that genmodes.cc gives to vectors. */
4b796619 15238 machine_mode n = GET_MODE_NEXT_MODE (o).require ();
2bf6d935
ML
15239 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
15240 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
15241 return n;
15242}
15243
15244static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
15245static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
15246
15247/* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
15248 with all elements equal to VAR. Return true if successful. */
15249
51c30227 15250bool
2bf6d935
ML
15251ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
15252 rtx target, rtx val)
15253{
15254 bool ok;
15255
15256 switch (mode)
15257 {
15258 case E_V2SImode:
15259 case E_V2SFmode:
15260 if (!mmx_ok)
15261 return false;
15262 /* FALLTHRU */
15263
15264 case E_V4DFmode:
15265 case E_V4DImode:
15266 case E_V8SFmode:
15267 case E_V8SImode:
15268 case E_V2DFmode:
15269 case E_V2DImode:
15270 case E_V4SFmode:
15271 case E_V4SImode:
15272 case E_V16SImode:
15273 case E_V8DImode:
15274 case E_V16SFmode:
15275 case E_V8DFmode:
15276 return ix86_vector_duplicate_value (mode, target, val);
15277
15278 case E_V4HImode:
15279 if (!mmx_ok)
15280 return false;
15281 if (TARGET_SSE || TARGET_3DNOW_A)
15282 {
15283 rtx x;
15284
15285 val = gen_lowpart (SImode, val);
15286 x = gen_rtx_TRUNCATE (HImode, val);
15287 x = gen_rtx_VEC_DUPLICATE (mode, x);
15288 emit_insn (gen_rtx_SET (target, x));
15289 return true;
15290 }
15291 goto widen;
15292
8d7dae0e
UB
15293 case E_V2HImode:
15294 if (TARGET_SSE2)
15295 {
15296 rtx x;
15297
15298 val = gen_lowpart (SImode, val);
15299 x = gen_rtx_TRUNCATE (HImode, val);
15300 x = gen_rtx_VEC_DUPLICATE (mode, x);
15301 emit_insn (gen_rtx_SET (target, x));
15302 return true;
15303 }
15304 return false;
15305
2bf6d935 15306 case E_V8QImode:
64735dc9 15307 case E_V4QImode:
2bf6d935
ML
15308 if (!mmx_ok)
15309 return false;
15310 goto widen;
15311
15312 case E_V8HImode:
7a54d3de 15313 case E_V8HFmode:
6910cad5 15314 case E_V8BFmode:
2bf6d935
ML
15315 if (TARGET_AVX2)
15316 return ix86_vector_duplicate_value (mode, target, val);
15317
15318 if (TARGET_SSE2)
15319 {
15320 struct expand_vec_perm_d dperm;
15321 rtx tmp1, tmp2;
15322
15323 permute:
15324 memset (&dperm, 0, sizeof (dperm));
15325 dperm.target = target;
15326 dperm.vmode = mode;
15327 dperm.nelt = GET_MODE_NUNITS (mode);
15328 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
15329 dperm.one_operand_p = true;
15330
092763fd 15331 if (mode == V8HFmode || mode == V8BFmode)
e2385690 15332 {
092763fd 15333 tmp1 = force_reg (GET_MODE_INNER (mode), val);
e2385690 15334 tmp2 = gen_reg_rtx (mode);
092763fd 15335 emit_insn (maybe_gen_vec_set_0 (mode, tmp2,
15336 CONST0_RTX (mode), tmp1));
e2385690
HW
15337 tmp1 = gen_lowpart (mode, tmp2);
15338 }
7a54d3de
UB
15339 else
15340 {
15341 /* Extend to SImode using a paradoxical SUBREG. */
15342 tmp1 = gen_reg_rtx (SImode);
15343 emit_move_insn (tmp1, gen_lowpart (SImode, val));
15344
15345 /* Insert the SImode value as
15346 low element of a V4SImode vector. */
15347 tmp2 = gen_reg_rtx (V4SImode);
15348 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
15349 tmp1 = gen_lowpart (mode, tmp2);
15350 }
2bf6d935 15351
7a54d3de 15352 emit_move_insn (dperm.op0, tmp1);
2bf6d935
ML
15353 ok = (expand_vec_perm_1 (&dperm)
15354 || expand_vec_perm_broadcast_1 (&dperm));
15355 gcc_assert (ok);
15356 return ok;
15357 }
15358 goto widen;
15359
15360 case E_V16QImode:
15361 if (TARGET_AVX2)
15362 return ix86_vector_duplicate_value (mode, target, val);
15363
15364 if (TARGET_SSE2)
15365 goto permute;
15366 goto widen;
15367
15368 widen:
15369 /* Replicate the value once into the next wider mode and recurse. */
15370 {
15371 machine_mode smode, wsmode, wvmode;
15372 rtx x;
15373
15374 smode = GET_MODE_INNER (mode);
15375 wvmode = get_mode_wider_vector (mode);
15376 wsmode = GET_MODE_INNER (wvmode);
15377
15378 val = convert_modes (wsmode, smode, val, true);
20a2c8ac
UB
15379
15380 if (smode == QImode && !TARGET_PARTIAL_REG_STALL)
15381 emit_insn (gen_insv_1 (wsmode, val, val));
15382 else
15383 {
15384 x = expand_simple_binop (wsmode, ASHIFT, val,
15385 GEN_INT (GET_MODE_BITSIZE (smode)),
15386 NULL_RTX, 1, OPTAB_LIB_WIDEN);
15387 val = expand_simple_binop (wsmode, IOR, val, x, x, 1,
15388 OPTAB_LIB_WIDEN);
15389 }
2bf6d935
ML
15390
15391 x = gen_reg_rtx (wvmode);
15392 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
15393 gcc_assert (ok);
15394 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
15395 return ok;
15396 }
15397
15398 case E_V16HImode:
7a54d3de 15399 case E_V16HFmode:
6910cad5 15400 case E_V16BFmode:
2bf6d935
ML
15401 case E_V32QImode:
15402 if (TARGET_AVX2)
15403 return ix86_vector_duplicate_value (mode, target, val);
15404 else
15405 {
78260b9a 15406 machine_mode hvmode;
15407 switch (mode)
15408 {
15409 case V16HImode:
15410 hvmode = V8HImode;
15411 break;
15412 case V16HFmode:
15413 hvmode = V8HFmode;
15414 break;
15415 case V16BFmode:
15416 hvmode = V8BFmode;
15417 break;
15418 case V32QImode:
15419 hvmode = V16QImode;
15420 break;
15421 default:
15422 gcc_unreachable ();
15423 }
2bf6d935
ML
15424 rtx x = gen_reg_rtx (hvmode);
15425
15426 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
15427 gcc_assert (ok);
15428
15429 x = gen_rtx_VEC_CONCAT (mode, x, x);
15430 emit_insn (gen_rtx_SET (target, x));
15431 }
15432 return true;
15433
2bf6d935 15434 case E_V32HImode:
7a54d3de 15435 case E_V32HFmode:
6910cad5 15436 case E_V32BFmode:
7a54d3de 15437 case E_V64QImode:
2bf6d935
ML
15438 if (TARGET_AVX512BW)
15439 return ix86_vector_duplicate_value (mode, target, val);
15440 else
15441 {
78260b9a 15442 machine_mode hvmode;
15443 switch (mode)
15444 {
15445 case V32HImode:
15446 hvmode = V16HImode;
15447 break;
15448 case V32HFmode:
15449 hvmode = V16HFmode;
15450 break;
15451 case V32BFmode:
15452 hvmode = V16BFmode;
15453 break;
15454 case V64QImode:
15455 hvmode = V32QImode;
15456 break;
15457 default:
15458 gcc_unreachable ();
15459 }
2bf6d935
ML
15460 rtx x = gen_reg_rtx (hvmode);
15461
15462 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
15463 gcc_assert (ok);
15464
15465 x = gen_rtx_VEC_CONCAT (mode, x, x);
15466 emit_insn (gen_rtx_SET (target, x));
15467 }
15468 return true;
15469
15470 default:
15471 return false;
15472 }
15473}
15474
15475/* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
15476 whose ONE_VAR element is VAR, and other elements are zero. Return true
15477 if successful. */
15478
15479static bool
15480ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
15481 rtx target, rtx var, int one_var)
15482{
15483 machine_mode vsimode;
15484 rtx new_target;
15485 rtx x, tmp;
15486 bool use_vector_set = false;
15487 rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;
15488
15489 switch (mode)
15490 {
15491 case E_V2DImode:
15492 /* For SSE4.1, we normally use vector set. But if the second
15493 element is zero and inter-unit moves are OK, we use movq
15494 instead. */
15495 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
15496 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
15497 && one_var == 0));
15498 break;
15499 case E_V16QImode:
15500 case E_V4SImode:
15501 case E_V4SFmode:
15502 use_vector_set = TARGET_SSE4_1;
15503 break;
15504 case E_V8HImode:
15505 use_vector_set = TARGET_SSE2;
c4d423c7 15506 gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0
15507 ? gen_vec_setv8hi_0 : NULL;
2bf6d935 15508 break;
8a0eb0cd
UB
15509 case E_V8QImode:
15510 use_vector_set = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
15511 break;
2bf6d935
ML
15512 case E_V4HImode:
15513 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
15514 break;
64735dc9
UB
15515 case E_V4QImode:
15516 use_vector_set = TARGET_SSE4_1;
15517 break;
2bf6d935 15518 case E_V32QImode:
c4d423c7 15519 use_vector_set = TARGET_AVX;
15520 break;
2bf6d935
ML
15521 case E_V16HImode:
15522 use_vector_set = TARGET_AVX;
c4d423c7 15523 gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0
15524 ? gen_vec_setv16hi_0 : NULL;
2bf6d935
ML
15525 break;
15526 case E_V8SImode:
15527 use_vector_set = TARGET_AVX;
15528 gen_vec_set_0 = gen_vec_setv8si_0;
15529 break;
15530 case E_V8SFmode:
15531 use_vector_set = TARGET_AVX;
15532 gen_vec_set_0 = gen_vec_setv8sf_0;
15533 break;
15534 case E_V4DFmode:
15535 use_vector_set = TARGET_AVX;
15536 gen_vec_set_0 = gen_vec_setv4df_0;
15537 break;
15538 case E_V4DImode:
15539 /* Use ix86_expand_vector_set in 64bit mode only. */
15540 use_vector_set = TARGET_AVX && TARGET_64BIT;
15541 gen_vec_set_0 = gen_vec_setv4di_0;
15542 break;
15543 case E_V16SImode:
15544 use_vector_set = TARGET_AVX512F && one_var == 0;
15545 gen_vec_set_0 = gen_vec_setv16si_0;
15546 break;
15547 case E_V16SFmode:
15548 use_vector_set = TARGET_AVX512F && one_var == 0;
15549 gen_vec_set_0 = gen_vec_setv16sf_0;
15550 break;
15551 case E_V8DFmode:
15552 use_vector_set = TARGET_AVX512F && one_var == 0;
15553 gen_vec_set_0 = gen_vec_setv8df_0;
15554 break;
15555 case E_V8DImode:
15556 /* Use ix86_expand_vector_set in 64bit mode only. */
15557 use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
15558 gen_vec_set_0 = gen_vec_setv8di_0;
15559 break;
9e2a82e1 15560 case E_V8HFmode:
15561 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15562 gen_vec_set_0 = gen_vec_setv8hf_0;
15563 break;
15564 case E_V16HFmode:
15565 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15566 gen_vec_set_0 = gen_vec_setv16hf_0;
15567 break;
15568 case E_V32HFmode:
15569 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15570 gen_vec_set_0 = gen_vec_setv32hf_0;
15571 break;
6910cad5 15572 case E_V8BFmode:
15573 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15574 gen_vec_set_0 = gen_vec_setv8bf_0;
15575 break;
15576 case E_V16BFmode:
15577 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15578 gen_vec_set_0 = gen_vec_setv16bf_0;
15579 break;
15580 case E_V32BFmode:
15581 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15582 gen_vec_set_0 = gen_vec_setv32bf_0;
15583 break;
c4d423c7 15584 case E_V32HImode:
15585 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15586 gen_vec_set_0 = gen_vec_setv32hi_0;
2bf6d935
ML
15587 default:
15588 break;
15589 }
15590
15591 if (use_vector_set)
15592 {
15593 if (gen_vec_set_0 && one_var == 0)
15594 {
15595 var = force_reg (GET_MODE_INNER (mode), var);
15596 emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
15597 return true;
15598 }
15599 emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
15600 var = force_reg (GET_MODE_INNER (mode), var);
15601 ix86_expand_vector_set (mmx_ok, target, var, one_var);
15602 return true;
15603 }
15604
15605 switch (mode)
15606 {
15607 case E_V2SFmode:
15608 case E_V2SImode:
15609 if (!mmx_ok)
15610 return false;
15611 /* FALLTHRU */
15612
15613 case E_V2DFmode:
15614 case E_V2DImode:
15615 if (one_var != 0)
15616 return false;
15617 var = force_reg (GET_MODE_INNER (mode), var);
15618 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
15619 emit_insn (gen_rtx_SET (target, x));
15620 return true;
15621
15622 case E_V4SFmode:
15623 case E_V4SImode:
15624 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
15625 new_target = gen_reg_rtx (mode);
15626 else
15627 new_target = target;
15628 var = force_reg (GET_MODE_INNER (mode), var);
15629 x = gen_rtx_VEC_DUPLICATE (mode, var);
15630 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
15631 emit_insn (gen_rtx_SET (new_target, x));
15632 if (one_var != 0)
15633 {
15634 /* We need to shuffle the value to the correct position, so
15635 create a new pseudo to store the intermediate result. */
15636
15637 /* With SSE2, we can use the integer shuffle insns. */
15638 if (mode != V4SFmode && TARGET_SSE2)
15639 {
15640 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
15641 const1_rtx,
15642 GEN_INT (one_var == 1 ? 0 : 1),
15643 GEN_INT (one_var == 2 ? 0 : 1),
15644 GEN_INT (one_var == 3 ? 0 : 1)));
15645 if (target != new_target)
15646 emit_move_insn (target, new_target);
15647 return true;
15648 }
15649
15650 /* Otherwise convert the intermediate result to V4SFmode and
15651 use the SSE1 shuffle instructions. */
15652 if (mode != V4SFmode)
15653 {
15654 tmp = gen_reg_rtx (V4SFmode);
15655 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
15656 }
15657 else
15658 tmp = new_target;
15659
15660 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
15661 const1_rtx,
15662 GEN_INT (one_var == 1 ? 0 : 1),
15663 GEN_INT (one_var == 2 ? 0+4 : 1+4),
15664 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
15665
15666 if (mode != V4SFmode)
15667 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
15668 else if (tmp != target)
15669 emit_move_insn (target, tmp);
15670 }
15671 else if (target != new_target)
15672 emit_move_insn (target, new_target);
15673 return true;
15674
15675 case E_V8HImode:
15676 case E_V16QImode:
15677 vsimode = V4SImode;
15678 goto widen;
15679 case E_V4HImode:
15680 case E_V8QImode:
15681 if (!mmx_ok)
15682 return false;
15683 vsimode = V2SImode;
15684 goto widen;
15685 widen:
15686 if (one_var != 0)
15687 return false;
15688
15689 /* Zero extend the variable element to SImode and recurse. */
15690 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
15691
15692 x = gen_reg_rtx (vsimode);
15693 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
15694 var, one_var))
15695 gcc_unreachable ();
15696
15697 emit_move_insn (target, gen_lowpart (mode, x));
15698 return true;
15699
15700 default:
15701 return false;
15702 }
15703}
15704
15705/* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
15706 consisting of the values in VALS. It is known that all elements
15707 except ONE_VAR are constants. Return true if successful. */
15708
15709static bool
15710ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
15711 rtx target, rtx vals, int one_var)
15712{
15713 rtx var = XVECEXP (vals, 0, one_var);
15714 machine_mode wmode;
15715 rtx const_vec, x;
15716
15717 const_vec = copy_rtx (vals);
15718 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
15719 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
15720
15721 switch (mode)
15722 {
15723 case E_V2DFmode:
15724 case E_V2DImode:
15725 case E_V2SFmode:
15726 case E_V2SImode:
15727 /* For the two element vectors, it's just as easy to use
15728 the general case. */
15729 return false;
15730
15731 case E_V4DImode:
15732 /* Use ix86_expand_vector_set in 64bit mode only. */
15733 if (!TARGET_64BIT)
15734 return false;
15735 /* FALLTHRU */
9e2a82e1 15736 case E_V8HFmode:
15737 case E_V16HFmode:
6910cad5 15738 case E_V8BFmode:
15739 case E_V16BFmode:
2bf6d935
ML
15740 case E_V4DFmode:
15741 case E_V8SFmode:
15742 case E_V8SImode:
15743 case E_V16HImode:
15744 case E_V32QImode:
15745 case E_V4SFmode:
15746 case E_V4SImode:
15747 case E_V8HImode:
15748 case E_V4HImode:
15749 break;
15750
15751 case E_V16QImode:
15752 if (TARGET_SSE4_1)
15753 break;
15754 wmode = V8HImode;
15755 goto widen;
15756 case E_V8QImode:
8a0eb0cd
UB
15757 if (TARGET_MMX_WITH_SSE && TARGET_SSE4_1)
15758 break;
2bf6d935
ML
15759 wmode = V4HImode;
15760 goto widen;
64735dc9
UB
15761 case E_V4QImode:
15762 if (TARGET_SSE4_1)
15763 break;
15764 wmode = V2HImode;
2bf6d935
ML
15765 widen:
15766 /* There's no way to set one QImode entry easily. Combine
15767 the variable value with its adjacent constant value, and
15768 promote to an HImode set. */
15769 x = XVECEXP (vals, 0, one_var ^ 1);
15770 if (one_var & 1)
15771 {
15772 var = convert_modes (HImode, QImode, var, true);
15773 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
15774 NULL_RTX, 1, OPTAB_LIB_WIDEN);
15775 x = GEN_INT (INTVAL (x) & 0xff);
15776 }
15777 else
15778 {
15779 var = convert_modes (HImode, QImode, var, true);
15780 x = gen_int_mode (UINTVAL (x) << 8, HImode);
15781 }
15782 if (x != const0_rtx)
15783 var = expand_simple_binop (HImode, IOR, var, x, var,
15784 1, OPTAB_LIB_WIDEN);
15785
15786 x = gen_reg_rtx (wmode);
15787 emit_move_insn (x, gen_lowpart (wmode, const_vec));
15788 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
15789
15790 emit_move_insn (target, gen_lowpart (mode, x));
15791 return true;
15792
15793 default:
15794 return false;
15795 }
15796
15797 emit_move_insn (target, const_vec);
15798 ix86_expand_vector_set (mmx_ok, target, var, one_var);
15799 return true;
15800}
15801
15802/* A subroutine of ix86_expand_vector_init_general. Use vector
15803 concatenate to handle the most general case: all values variable,
15804 and none identical. */
15805
15806static void
15807ix86_expand_vector_init_concat (machine_mode mode,
15808 rtx target, rtx *ops, int n)
15809{
1aeecaf5
HL
15810 machine_mode half_mode = VOIDmode;
15811 rtx half[2];
2bf6d935
ML
15812 rtvec v;
15813 int i, j;
15814
15815 switch (n)
15816 {
15817 case 2:
15818 switch (mode)
15819 {
9e2a82e1 15820 case E_V32HFmode:
15821 half_mode = V16HFmode;
15822 break;
6910cad5 15823 case E_V32BFmode:
15824 half_mode = V16BFmode;
15825 break;
2bf6d935 15826 case E_V16SImode:
1aeecaf5 15827 half_mode = V8SImode;
2bf6d935
ML
15828 break;
15829 case E_V16SFmode:
1aeecaf5 15830 half_mode = V8SFmode;
2bf6d935
ML
15831 break;
15832 case E_V8DImode:
1aeecaf5 15833 half_mode = V4DImode;
2bf6d935
ML
15834 break;
15835 case E_V8DFmode:
1aeecaf5 15836 half_mode = V4DFmode;
2bf6d935 15837 break;
9e2a82e1 15838 case E_V16HFmode:
15839 half_mode = V8HFmode;
15840 break;
6910cad5 15841 case E_V16BFmode:
15842 half_mode = V8BFmode;
15843 break;
2bf6d935 15844 case E_V8SImode:
1aeecaf5 15845 half_mode = V4SImode;
2bf6d935
ML
15846 break;
15847 case E_V8SFmode:
1aeecaf5 15848 half_mode = V4SFmode;
2bf6d935
ML
15849 break;
15850 case E_V4DImode:
1aeecaf5 15851 half_mode = V2DImode;
2bf6d935
ML
15852 break;
15853 case E_V4DFmode:
1aeecaf5 15854 half_mode = V2DFmode;
2bf6d935
ML
15855 break;
15856 case E_V4SImode:
1aeecaf5 15857 half_mode = V2SImode;
2bf6d935
ML
15858 break;
15859 case E_V4SFmode:
1aeecaf5 15860 half_mode = V2SFmode;
2bf6d935
ML
15861 break;
15862 case E_V2DImode:
1aeecaf5 15863 half_mode = DImode;
2bf6d935
ML
15864 break;
15865 case E_V2SImode:
1aeecaf5 15866 half_mode = SImode;
2bf6d935
ML
15867 break;
15868 case E_V2DFmode:
1aeecaf5 15869 half_mode = DFmode;
2bf6d935
ML
15870 break;
15871 case E_V2SFmode:
1aeecaf5 15872 half_mode = SFmode;
2bf6d935
ML
15873 break;
15874 default:
15875 gcc_unreachable ();
15876 }
15877
1aeecaf5
HL
15878 if (!register_operand (ops[1], half_mode))
15879 ops[1] = force_reg (half_mode, ops[1]);
15880 if (!register_operand (ops[0], half_mode))
15881 ops[0] = force_reg (half_mode, ops[0]);
2bf6d935
ML
15882 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
15883 ops[1])));
15884 break;
15885
15886 case 4:
15887 switch (mode)
15888 {
15889 case E_V4DImode:
1aeecaf5 15890 half_mode = V2DImode;
2bf6d935
ML
15891 break;
15892 case E_V4DFmode:
1aeecaf5 15893 half_mode = V2DFmode;
2bf6d935
ML
15894 break;
15895 case E_V4SImode:
1aeecaf5 15896 half_mode = V2SImode;
2bf6d935
ML
15897 break;
15898 case E_V4SFmode:
1aeecaf5 15899 half_mode = V2SFmode;
2bf6d935
ML
15900 break;
15901 default:
15902 gcc_unreachable ();
15903 }
15904 goto half;
15905
15906 case 8:
15907 switch (mode)
15908 {
15909 case E_V8DImode:
1aeecaf5 15910 half_mode = V4DImode;
2bf6d935
ML
15911 break;
15912 case E_V8DFmode:
1aeecaf5 15913 half_mode = V4DFmode;
2bf6d935
ML
15914 break;
15915 case E_V8SImode:
1aeecaf5 15916 half_mode = V4SImode;
2bf6d935
ML
15917 break;
15918 case E_V8SFmode:
1aeecaf5 15919 half_mode = V4SFmode;
2bf6d935
ML
15920 break;
15921 default:
15922 gcc_unreachable ();
15923 }
15924 goto half;
15925
15926 case 16:
15927 switch (mode)
15928 {
15929 case E_V16SImode:
1aeecaf5 15930 half_mode = V8SImode;
2bf6d935
ML
15931 break;
15932 case E_V16SFmode:
1aeecaf5 15933 half_mode = V8SFmode;
2bf6d935
ML
15934 break;
15935 default:
15936 gcc_unreachable ();
15937 }
15938 goto half;
15939
15940half:
15941 /* FIXME: We process inputs backward to help RA. PR 36222. */
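      /* Build each half from n/2 of the inputs, then recurse to concatenate
	 the two halves.  */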
15942 i = n - 1;
1aeecaf5 15943 for (j = 1; j != -1; j--)
2bf6d935 15944 {
1aeecaf5
HL
15945 half[j] = gen_reg_rtx (half_mode);
15946 switch (n >> 1)
2bf6d935 15947 {
1aeecaf5
HL
15948 case 2:
15949 v = gen_rtvec (2, ops[i-1], ops[i]);
15950 i -= 2;
15951 break;
15952 case 4:
15953 v = gen_rtvec (4, ops[i-3], ops[i-2], ops[i-1], ops[i]);
15954 i -= 4;
15955 break;
15956 case 8:
15957 v = gen_rtvec (8, ops[i-7], ops[i-6], ops[i-5], ops[i-4],
15958 ops[i-3], ops[i-2], ops[i-1], ops[i]);
15959 i -= 8;
15960 break;
15961 default:
15962 gcc_unreachable ();
2bf6d935 15963 }
1aeecaf5
HL
15964 ix86_expand_vector_init (false, half[j],
15965 gen_rtx_PARALLEL (half_mode, v));
2bf6d935 15966 }
1aeecaf5
HL
15967
15968 ix86_expand_vector_init_concat (mode, target, half, 2);
2bf6d935
ML
15969 break;
15970
15971 default:
15972 gcc_unreachable ();
15973 }
15974}
15975
15976/* A subroutine of ix86_expand_vector_init_general. Use vector
15977 interleave to handle the most general case: all values variable,
15978 and none identical. */
15979
15980static void
15981ix86_expand_vector_init_interleave (machine_mode mode,
15982 rtx target, rtx *ops, int n)
15983{
15984 machine_mode first_imode, second_imode, third_imode, inner_mode;
15985 int i, j;
9e2a82e1 15986 rtx op, op0, op1;
2bf6d935
ML
15987 rtx (*gen_load_even) (rtx, rtx, rtx);
15988 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
15989 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
15990
15991 switch (mode)
15992 {
9e2a82e1 15993 case E_V8HFmode:
7fc4d600 15994 gen_load_even = gen_vec_interleave_lowv8hf;
9e2a82e1 15995 gen_interleave_first_low = gen_vec_interleave_lowv4si;
15996 gen_interleave_second_low = gen_vec_interleave_lowv2di;
15997 inner_mode = HFmode;
15998 first_imode = V4SImode;
15999 second_imode = V2DImode;
16000 third_imode = VOIDmode;
16001 break;
6910cad5 16002 case E_V8BFmode:
16003 gen_load_even = gen_vec_interleave_lowv8bf;
16004 gen_interleave_first_low = gen_vec_interleave_lowv4si;
16005 gen_interleave_second_low = gen_vec_interleave_lowv2di;
16006 inner_mode = BFmode;
16007 first_imode = V4SImode;
16008 second_imode = V2DImode;
16009 third_imode = VOIDmode;
16010 break;
2bf6d935
ML
16011 case E_V8HImode:
16012 gen_load_even = gen_vec_setv8hi;
16013 gen_interleave_first_low = gen_vec_interleave_lowv4si;
16014 gen_interleave_second_low = gen_vec_interleave_lowv2di;
16015 inner_mode = HImode;
16016 first_imode = V4SImode;
16017 second_imode = V2DImode;
16018 third_imode = VOIDmode;
16019 break;
16020 case E_V16QImode:
16021 gen_load_even = gen_vec_setv16qi;
16022 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
16023 gen_interleave_second_low = gen_vec_interleave_lowv4si;
16024 inner_mode = QImode;
16025 first_imode = V8HImode;
16026 second_imode = V4SImode;
16027 third_imode = V2DImode;
16028 break;
16029 default:
16030 gcc_unreachable ();
16031 }
16032
16033 for (i = 0; i < n; i++)
16034 {
9e2a82e1 16035 op = ops [i + i];
6910cad5 16036 if (inner_mode == HFmode || inner_mode == BFmode)
9e2a82e1 16037 {
7fc4d600 16038 rtx even, odd;
6910cad5 16039 /* Use vpunpcklwd to pack two HFmode or BFmode values. */
16040 machine_mode vec_mode =
16041 (inner_mode == HFmode) ? V8HFmode : V8BFmode;
16042 op0 = gen_reg_rtx (vec_mode);
16043 even = lowpart_subreg (vec_mode,
16044 force_reg (inner_mode, op), inner_mode);
16045 odd = lowpart_subreg (vec_mode,
16046 force_reg (inner_mode, ops[i + i + 1]),
16047 inner_mode);
7fc4d600 16048 emit_insn (gen_load_even (op0, even, odd));
9e2a82e1 16049 }
7fc4d600 16050 else
16051 {
16052 /* Extend the odd element to SImode using a paradoxical SUBREG. */
16053 op0 = gen_reg_rtx (SImode);
16054 emit_move_insn (op0, gen_lowpart (SImode, op));
9e2a82e1 16055
7fc4d600 16056 /* Insert the SImode value as low element of V4SImode vector. */
16057 op1 = gen_reg_rtx (V4SImode);
16058 op0 = gen_rtx_VEC_MERGE (V4SImode,
16059 gen_rtx_VEC_DUPLICATE (V4SImode,
16060 op0),
16061 CONST0_RTX (V4SImode),
16062 const1_rtx);
16063 emit_insn (gen_rtx_SET (op1, op0));
2bf6d935 16064
7fc4d600 16065 /* Cast the V4SImode vector back to a vector in the original mode. */
16066 op0 = gen_reg_rtx (mode);
16067 emit_move_insn (op0, gen_lowpart (mode, op1));
2bf6d935 16068
7fc4d600 16069 /* Load even elements into the second position. */
16070 emit_insn (gen_load_even (op0,
16071 force_reg (inner_mode,
16072 ops[i + i + 1]),
16073 const1_rtx));
16074 }
2bf6d935
ML
16075
16076 /* Cast vector to FIRST_IMODE vector. */
16077 ops[i] = gen_reg_rtx (first_imode);
16078 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
16079 }
16080
16081 /* Interleave low FIRST_IMODE vectors. */
16082 for (i = j = 0; i < n; i += 2, j++)
16083 {
16084 op0 = gen_reg_rtx (first_imode);
16085 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
16086
16087 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
16088 ops[j] = gen_reg_rtx (second_imode);
16089 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
16090 }
16091
16092 /* Interleave low SECOND_IMODE vectors. */
16093 switch (second_imode)
16094 {
16095 case E_V4SImode:
16096 for (i = j = 0; i < n / 2; i += 2, j++)
16097 {
16098 op0 = gen_reg_rtx (second_imode);
16099 emit_insn (gen_interleave_second_low (op0, ops[i],
16100 ops[i + 1]));
16101
16102 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
16103 vector. */
16104 ops[j] = gen_reg_rtx (third_imode);
16105 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
16106 }
16107 second_imode = V2DImode;
16108 gen_interleave_second_low = gen_vec_interleave_lowv2di;
16109 /* FALLTHRU */
16110
16111 case E_V2DImode:
16112 op0 = gen_reg_rtx (second_imode);
16113 emit_insn (gen_interleave_second_low (op0, ops[0],
16114 ops[1]));
16115
16116 /* Cast the SECOND_IMODE vector back to a vector in the original
16117 mode. */
16118 emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
16119 break;
16120
16121 default:
16122 gcc_unreachable ();
16123 }
16124}
16125
16126/* A subroutine of ix86_expand_vector_init. Handle the most general case:
16127 all values variable, and none identical. */
16128
16129static void
16130ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
16131 rtx target, rtx vals)
16132{
16133 rtx ops[64], op0, op1, op2, op3, op4, op5;
16134 machine_mode half_mode = VOIDmode;
16135 machine_mode quarter_mode = VOIDmode;
16136 int n, i;
16137
16138 switch (mode)
16139 {
16140 case E_V2SFmode:
16141 case E_V2SImode:
16142 if (!mmx_ok && !TARGET_SSE)
16143 break;
16144 /* FALLTHRU */
16145
16146 case E_V16SImode:
16147 case E_V16SFmode:
16148 case E_V8DFmode:
16149 case E_V8DImode:
16150 case E_V8SFmode:
16151 case E_V8SImode:
16152 case E_V4DFmode:
16153 case E_V4DImode:
16154 case E_V4SFmode:
16155 case E_V4SImode:
16156 case E_V2DFmode:
16157 case E_V2DImode:
16158 n = GET_MODE_NUNITS (mode);
16159 for (i = 0; i < n; i++)
16160 ops[i] = XVECEXP (vals, 0, i);
16161 ix86_expand_vector_init_concat (mode, target, ops, n);
16162 return;
16163
16164 case E_V2TImode:
16165 for (i = 0; i < 2; i++)
16166 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
16167 op0 = gen_reg_rtx (V4DImode);
16168 ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
16169 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
16170 return;
16171
16172 case E_V4TImode:
16173 for (i = 0; i < 4; i++)
16174 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
16175 ops[4] = gen_reg_rtx (V4DImode);
16176 ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
16177 ops[5] = gen_reg_rtx (V4DImode);
16178 ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
16179 op0 = gen_reg_rtx (V8DImode);
16180 ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
16181 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
16182 return;
16183
16184 case E_V32QImode:
16185 half_mode = V16QImode;
16186 goto half;
16187
16188 case E_V16HImode:
16189 half_mode = V8HImode;
16190 goto half;
16191
9e2a82e1 16192 case E_V16HFmode:
16193 half_mode = V8HFmode;
16194 goto half;
16195
6910cad5 16196 case E_V16BFmode:
16197 half_mode = V8BFmode;
16198 goto half;
16199
2bf6d935
ML
16200half:
16201 n = GET_MODE_NUNITS (mode);
16202 for (i = 0; i < n; i++)
16203 ops[i] = XVECEXP (vals, 0, i);
16204 op0 = gen_reg_rtx (half_mode);
16205 op1 = gen_reg_rtx (half_mode);
16206 ix86_expand_vector_init_interleave (half_mode, op0, ops,
16207 n >> 2);
16208 ix86_expand_vector_init_interleave (half_mode, op1,
16209 &ops [n >> 1], n >> 2);
16210 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
16211 return;
16212
16213 case E_V64QImode:
16214 quarter_mode = V16QImode;
16215 half_mode = V32QImode;
16216 goto quarter;
16217
16218 case E_V32HImode:
16219 quarter_mode = V8HImode;
16220 half_mode = V16HImode;
16221 goto quarter;
16222
9e2a82e1 16223 case E_V32HFmode:
16224 quarter_mode = V8HFmode;
16225 half_mode = V16HFmode;
16226 goto quarter;
16227
6910cad5 16228 case E_V32BFmode:
16229 quarter_mode = V8BFmode;
16230 half_mode = V16BFmode;
16231 goto quarter;
16232
2bf6d935
ML
16233quarter:
16234 n = GET_MODE_NUNITS (mode);
16235 for (i = 0; i < n; i++)
16236 ops[i] = XVECEXP (vals, 0, i);
16237 op0 = gen_reg_rtx (quarter_mode);
16238 op1 = gen_reg_rtx (quarter_mode);
16239 op2 = gen_reg_rtx (quarter_mode);
16240 op3 = gen_reg_rtx (quarter_mode);
16241 op4 = gen_reg_rtx (half_mode);
16242 op5 = gen_reg_rtx (half_mode);
16243 ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
16244 n >> 3);
16245 ix86_expand_vector_init_interleave (quarter_mode, op1,
16246 &ops [n >> 2], n >> 3);
16247 ix86_expand_vector_init_interleave (quarter_mode, op2,
16248 &ops [n >> 1], n >> 3);
16249 ix86_expand_vector_init_interleave (quarter_mode, op3,
16250 &ops [(n >> 1) | (n >> 2)], n >> 3);
16251 emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
16252 emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
16253 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
16254 return;
16255
16256 case E_V16QImode:
16257 if (!TARGET_SSE4_1)
16258 break;
16259 /* FALLTHRU */
16260
16261 case E_V8HImode:
16262 if (!TARGET_SSE2)
16263 break;
16264
16265 /* Don't use ix86_expand_vector_init_interleave if we can't
16266 move from GPR to SSE register directly. */
16267 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
16268 break;
9e2a82e1 16269 /* FALLTHRU */
16270
16271 case E_V8HFmode:
6910cad5 16272 case E_V8BFmode:
2bf6d935
ML
16273
16274 n = GET_MODE_NUNITS (mode);
16275 for (i = 0; i < n; i++)
16276 ops[i] = XVECEXP (vals, 0, i);
16277 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
16278 return;
16279
16280 case E_V4HImode:
16281 case E_V8QImode:
8d7dae0e
UB
16282
16283 case E_V2HImode:
64735dc9 16284 case E_V4QImode:
2bf6d935
ML
16285 break;
16286
16287 default:
16288 gcc_unreachable ();
16289 }
16290
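  /* Fallback for the remaining small vector modes: pack the elements into
     SImode/word_mode chunks with shifts and IORs, then reassemble the
     vector from those words.  */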
16291 {
16292 int i, j, n_elts, n_words, n_elt_per_word;
8d7dae0e 16293 machine_mode tmp_mode, inner_mode;
2bf6d935
ML
16294 rtx words[4], shift;
16295
8d7dae0e
UB
16296 tmp_mode = (GET_MODE_SIZE (mode) < UNITS_PER_WORD) ? SImode : word_mode;
16297
2bf6d935
ML
16298 inner_mode = GET_MODE_INNER (mode);
16299 n_elts = GET_MODE_NUNITS (mode);
8d7dae0e 16300 n_words = GET_MODE_SIZE (mode) / GET_MODE_SIZE (tmp_mode);
2bf6d935
ML
16301 n_elt_per_word = n_elts / n_words;
16302 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
16303
16304 for (i = 0; i < n_words; ++i)
16305 {
16306 rtx word = NULL_RTX;
16307
16308 for (j = 0; j < n_elt_per_word; ++j)
16309 {
16310 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
8d7dae0e 16311 elt = convert_modes (tmp_mode, inner_mode, elt, true);
2bf6d935
ML
16312
16313 if (j == 0)
16314 word = elt;
16315 else
16316 {
8d7dae0e 16317 word = expand_simple_binop (tmp_mode, ASHIFT, word, shift,
e1a74058 16318 NULL_RTX, 1, OPTAB_LIB_WIDEN);
8d7dae0e 16319 word = expand_simple_binop (tmp_mode, IOR, word, elt,
e1a74058 16320 NULL_RTX, 1, OPTAB_LIB_WIDEN);
2bf6d935
ML
16321 }
16322 }
16323
16324 words[i] = word;
16325 }
16326
16327 if (n_words == 1)
16328 emit_move_insn (target, gen_lowpart (mode, words[0]));
16329 else if (n_words == 2)
16330 {
16331 rtx tmp = gen_reg_rtx (mode);
16332 emit_clobber (tmp);
8d7dae0e
UB
16333 emit_move_insn (gen_lowpart (tmp_mode, tmp), words[0]);
16334 emit_move_insn (gen_highpart (tmp_mode, tmp), words[1]);
2bf6d935
ML
16335 emit_move_insn (target, tmp);
16336 }
16337 else if (n_words == 4)
16338 {
16339 rtx tmp = gen_reg_rtx (V4SImode);
8d7dae0e 16340 gcc_assert (tmp_mode == SImode);
2bf6d935
ML
16341 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
16342 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
16343 emit_move_insn (target, gen_lowpart (mode, tmp));
16344 }
16345 else
16346 gcc_unreachable ();
16347 }
16348}
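
/* Illustrative sketch, not part of GCC: a scalar model of how the
   generic fallback above packs vector elements into word-sized
   integers before building the vector, assuming little-endian lane
   order and 8-bit elements in a 32-bit word.  The helper name is
   hypothetical.  */
#if 0
static unsigned int
model_pack_word (const unsigned char *elts, int n_elt_per_word, int word_idx)
{
  unsigned int word = 0;
  /* Walk the group from its last element down to its first, shifting
     left each time, so the group's first element lands in the low
     bits, mirroring the ASHIFT/IOR sequence emitted above.  */
  for (int j = 0; j < n_elt_per_word; j++)
    word = (word << 8) | elts[(word_idx + 1) * n_elt_per_word - j - 1];
  return word;
}
#endif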
16349
16350/* Initialize vector TARGET via VALS. Suppress the use of MMX
16351 instructions unless MMX_OK is true. */
16352
16353void
16354ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
16355{
16356 machine_mode mode = GET_MODE (target);
16357 machine_mode inner_mode = GET_MODE_INNER (mode);
16358 int n_elts = GET_MODE_NUNITS (mode);
16359 int n_var = 0, one_var = -1;
16360 bool all_same = true, all_const_zero = true;
16361 int i;
16362 rtx x;
16363
 16364 /* First handle initialization from vector elements (sub-vectors). */
16365 if (n_elts != XVECLEN (vals, 0))
16366 {
16367 rtx subtarget = target;
16368 x = XVECEXP (vals, 0, 0);
16369 gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
16370 if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
16371 {
16372 rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
b7dd2e4e
JJ
16373 if (inner_mode == QImode
16374 || inner_mode == HImode
575191b9 16375 || inner_mode == TImode
6910cad5 16376 || inner_mode == HFmode
16377 || inner_mode == BFmode)
2bf6d935
ML
16378 {
16379 unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
b7dd2e4e
JJ
16380 scalar_mode elt_mode = inner_mode == TImode ? DImode : SImode;
16381 n_bits /= GET_MODE_SIZE (elt_mode);
16382 mode = mode_for_vector (elt_mode, n_bits).require ();
16383 inner_mode = mode_for_vector (elt_mode, n_bits / 2).require ();
2bf6d935
ML
16384 ops[0] = gen_lowpart (inner_mode, ops[0]);
16385 ops[1] = gen_lowpart (inner_mode, ops[1]);
16386 subtarget = gen_reg_rtx (mode);
16387 }
16388 ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
16389 if (subtarget != target)
16390 emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
16391 return;
16392 }
16393 gcc_unreachable ();
16394 }
16395
16396 for (i = 0; i < n_elts; ++i)
16397 {
16398 x = XVECEXP (vals, 0, i);
16399 if (!(CONST_SCALAR_INT_P (x)
16400 || CONST_DOUBLE_P (x)
16401 || CONST_FIXED_P (x)))
16402 n_var++, one_var = i;
16403 else if (x != CONST0_RTX (inner_mode))
16404 all_const_zero = false;
16405 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
16406 all_same = false;
16407 }
16408
16409 /* Constants are best loaded from the constant pool. */
16410 if (n_var == 0)
16411 {
16412 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
16413 return;
16414 }
16415
16416 /* If all values are identical, broadcast the value. */
16417 if (all_same
16418 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
16419 XVECEXP (vals, 0, 0)))
16420 return;
16421
16422 /* Values where only one field is non-constant are best loaded from
16423 the pool and overwritten via move later. */
16424 if (n_var == 1)
16425 {
16426 if (all_const_zero
16427 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
16428 XVECEXP (vals, 0, one_var),
16429 one_var))
16430 return;
16431
16432 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
16433 return;
16434 }
16435
16436 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
16437}
16438
287cc750 16439/* Implemented as
16440 V setg (V v, int idx, T val)
16441 {
16442 V idxv = (V){idx, idx, idx, idx, idx, idx, idx, idx};
16443 V valv = (V){val, val, val, val, val, val, val, val};
16444 V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == idxv);
16445 v = (v & ~mask) | (valv & mask);
16446 return v;
16447 }. */
16448void
16449ix86_expand_vector_set_var (rtx target, rtx val, rtx idx)
16450{
16451 rtx vec[64];
16452 machine_mode mode = GET_MODE (target);
16453 machine_mode cmp_mode = mode;
16454 int n_elts = GET_MODE_NUNITS (mode);
 16455 rtx valv, idxv, constv, idx_tmp;
16456 bool ok = false;
16457
 16458 /* 512-bit vector byte/word broadcast and comparison are only available
 16459 under TARGET_AVX512BW; break the 512-bit vector into two 256-bit vectors
 16460 when TARGET_AVX512BW is not available. */
6910cad5 16461 if ((mode == V32HImode || mode == V32HFmode || mode == V32BFmode
16462 || mode == V64QImode)
7a54d3de 16463 && !TARGET_AVX512BW)
287cc750 16464 {
16465 gcc_assert (TARGET_AVX512F);
16466 rtx vhi, vlo, idx_hi;
16467 machine_mode half_mode;
16468 rtx (*extract_hi)(rtx, rtx);
16469 rtx (*extract_lo)(rtx, rtx);
16470
16471 if (mode == V32HImode)
16472 {
16473 half_mode = V16HImode;
16474 extract_hi = gen_vec_extract_hi_v32hi;
16475 extract_lo = gen_vec_extract_lo_v32hi;
16476 }
7a54d3de
UB
16477 else if (mode == V32HFmode)
16478 {
16479 half_mode = V16HFmode;
16480 extract_hi = gen_vec_extract_hi_v32hf;
16481 extract_lo = gen_vec_extract_lo_v32hf;
16482 }
6910cad5 16483 else if (mode == V32BFmode)
16484 {
16485 half_mode = V16BFmode;
16486 extract_hi = gen_vec_extract_hi_v32bf;
16487 extract_lo = gen_vec_extract_lo_v32bf;
16488 }
287cc750 16489 else
16490 {
16491 half_mode = V32QImode;
16492 extract_hi = gen_vec_extract_hi_v64qi;
16493 extract_lo = gen_vec_extract_lo_v64qi;
16494 }
16495
16496 vhi = gen_reg_rtx (half_mode);
16497 vlo = gen_reg_rtx (half_mode);
16498 idx_hi = gen_reg_rtx (GET_MODE (idx));
16499 emit_insn (extract_hi (vhi, target));
16500 emit_insn (extract_lo (vlo, target));
16501 vec[0] = idx_hi;
16502 vec[1] = idx;
16503 vec[2] = GEN_INT (n_elts/2);
16504 ix86_expand_binary_operator (MINUS, GET_MODE (idx), vec);
16505 ix86_expand_vector_set_var (vhi, val, idx_hi);
16506 ix86_expand_vector_set_var (vlo, val, idx);
16507 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, vlo, vhi)));
16508 return;
16509 }
16510
16511 if (FLOAT_MODE_P (GET_MODE_INNER (mode)))
16512 {
16513 switch (mode)
16514 {
16515 case E_V2DFmode:
16516 cmp_mode = V2DImode;
16517 break;
16518 case E_V4DFmode:
16519 cmp_mode = V4DImode;
16520 break;
16521 case E_V8DFmode:
16522 cmp_mode = V8DImode;
16523 break;
20a2c8ac
UB
16524 case E_V2SFmode:
16525 cmp_mode = V2SImode;
16526 break;
287cc750 16527 case E_V4SFmode:
16528 cmp_mode = V4SImode;
16529 break;
16530 case E_V8SFmode:
16531 cmp_mode = V8SImode;
16532 break;
16533 case E_V16SFmode:
16534 cmp_mode = V16SImode;
16535 break;
9e2a82e1 16536 case E_V8HFmode:
16537 cmp_mode = V8HImode;
16538 break;
16539 case E_V16HFmode:
16540 cmp_mode = V16HImode;
16541 break;
16542 case E_V32HFmode:
16543 cmp_mode = V32HImode;
16544 break;
6910cad5 16545 case E_V8BFmode:
16546 cmp_mode = V8HImode;
16547 break;
16548 case E_V16BFmode:
16549 cmp_mode = V16HImode;
16550 break;
16551 case E_V32BFmode:
16552 cmp_mode = V32HImode;
16553 break;
287cc750 16554 default:
16555 gcc_unreachable ();
16556 }
16557 }
16558
16559 for (int i = 0; i != n_elts; i++)
16560 vec[i] = GEN_INT (i);
16561 constv = gen_rtx_CONST_VECTOR (cmp_mode, gen_rtvec_v (n_elts, vec));
16562 valv = gen_reg_rtx (mode);
16563 idxv = gen_reg_rtx (cmp_mode);
16564 idx_tmp = convert_to_mode (GET_MODE_INNER (cmp_mode), idx, 1);
16565
20a2c8ac
UB
16566 ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE,
16567 mode, valv, val);
287cc750 16568 gcc_assert (ok);
20a2c8ac
UB
16569 ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE,
16570 cmp_mode, idxv, idx_tmp);
287cc750 16571 gcc_assert (ok);
16572 vec[0] = target;
16573 vec[1] = valv;
16574 vec[2] = target;
16575 vec[3] = gen_rtx_EQ (mode, idxv, constv);
16576 vec[4] = idxv;
16577 vec[5] = constv;
16578 ok = ix86_expand_int_vcond (vec);
16579 gcc_assert (ok);
16580}
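
/* Illustrative sketch, not part of GCC: a scalar model of the
   AVX512F-only split above.  Both 256-bit halves run the same
   variable-index set; only the half whose lane range contains IDX is
   actually modified, because the compare mask is all-zero in the other
   half.  The helper name is hypothetical and assumes 0 <= idx < n.  */
#if 0
static void
model_set_var_split (int *v, int n, int idx, int val)
{
  for (int i = 0; i < n / 2; i++)     /* low half compares against idx */
    if (i == idx)
      v[i] = val;
  for (int i = 0; i < n / 2; i++)     /* high half compares against idx - n/2 */
    if (i == idx - n / 2)
      v[n / 2 + i] = val;
}
#endif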
16581
2bf6d935
ML
16582void
16583ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
16584{
16585 machine_mode mode = GET_MODE (target);
16586 machine_mode inner_mode = GET_MODE_INNER (mode);
16587 machine_mode half_mode;
16588 bool use_vec_merge = false;
7fc4d600 16589 bool blendm_const = false;
2bf6d935 16590 rtx tmp;
6910cad5 16591 static rtx (*gen_extract[8][2]) (rtx, rtx)
2bf6d935
ML
16592 = {
16593 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
16594 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
16595 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
16596 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
16597 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
9e2a82e1 16598 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df },
6910cad5 16599 { gen_vec_extract_lo_v16hf, gen_vec_extract_hi_v16hf },
16600 { gen_vec_extract_lo_v16bf, gen_vec_extract_hi_v16bf }
2bf6d935 16601 };
6910cad5 16602 static rtx (*gen_insert[8][2]) (rtx, rtx, rtx)
2bf6d935
ML
16603 = {
16604 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
16605 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
16606 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
16607 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
16608 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
9e2a82e1 16609 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df },
16610 { gen_vec_set_lo_v16hf, gen_vec_set_hi_v16hf },
6910cad5 16611 { gen_vec_set_lo_v16bf, gen_vec_set_hi_v16bf },
2bf6d935
ML
16612 };
16613 int i, j, n;
16614 machine_mode mmode = VOIDmode;
16615 rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
16616
16617 switch (mode)
16618 {
2bf6d935 16619 case E_V2SImode:
f15c7bd1
UB
16620 use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
16621 if (use_vec_merge)
16622 break;
16623 /* FALLTHRU */
16624
16625 case E_V2SFmode:
2bf6d935
ML
16626 if (mmx_ok)
16627 {
16628 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
16629 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
16630 if (elt == 0)
16631 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
16632 else
16633 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
16634 emit_insn (gen_rtx_SET (target, tmp));
16635 return;
16636 }
16637 break;
16638
16639 case E_V2DImode:
16640 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
16641 if (use_vec_merge)
16642 break;
16643
16644 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
16645 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
16646 if (elt == 0)
16647 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
16648 else
16649 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
16650 emit_insn (gen_rtx_SET (target, tmp));
16651 return;
16652
16653 case E_V2DFmode:
ac173024
L
16654 /* NB: For ELT == 0, use standard scalar operation patterns which
16655 preserve the rest of the vector for combiner:
16656
16657 (vec_merge:V2DF
16658 (vec_duplicate:V2DF (reg:DF))
16659 (reg:V2DF)
16660 (const_int 1))
16661 */
16662 if (elt == 0)
16663 goto do_vec_merge;
16664
2bf6d935
ML
16665 {
16666 rtx op0, op1;
16667
16668 /* For the two element vectors, we implement a VEC_CONCAT with
16669 the extraction of the other element. */
16670
16671 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
16672 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
16673
16674 if (elt == 0)
16675 op0 = val, op1 = tmp;
16676 else
16677 op0 = tmp, op1 = val;
16678
16679 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
16680 emit_insn (gen_rtx_SET (target, tmp));
16681 }
16682 return;
16683
16684 case E_V4SFmode:
16685 use_vec_merge = TARGET_SSE4_1;
16686 if (use_vec_merge)
16687 break;
16688
16689 switch (elt)
16690 {
16691 case 0:
16692 use_vec_merge = true;
16693 break;
16694
16695 case 1:
16696 /* tmp = target = A B C D */
16697 tmp = copy_to_reg (target);
16698 /* target = A A B B */
16699 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
16700 /* target = X A B B */
16701 ix86_expand_vector_set (false, target, val, 0);
16702 /* target = A X C D */
16703 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
16704 const1_rtx, const0_rtx,
16705 GEN_INT (2+4), GEN_INT (3+4)));
16706 return;
16707
16708 case 2:
16709 /* tmp = target = A B C D */
16710 tmp = copy_to_reg (target);
16711 /* tmp = X B C D */
16712 ix86_expand_vector_set (false, tmp, val, 0);
16713 /* target = A B X D */
16714 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
16715 const0_rtx, const1_rtx,
16716 GEN_INT (0+4), GEN_INT (3+4)));
16717 return;
16718
16719 case 3:
16720 /* tmp = target = A B C D */
16721 tmp = copy_to_reg (target);
16722 /* tmp = X B C D */
16723 ix86_expand_vector_set (false, tmp, val, 0);
 16724 /* target = A B C X */
16725 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
16726 const0_rtx, const1_rtx,
16727 GEN_INT (2+4), GEN_INT (0+4)));
16728 return;
16729
16730 default:
16731 gcc_unreachable ();
16732 }
16733 break;
16734
16735 case E_V4SImode:
16736 use_vec_merge = TARGET_SSE4_1;
16737 if (use_vec_merge)
16738 break;
16739
16740 /* Element 0 handled by vec_merge below. */
16741 if (elt == 0)
16742 {
16743 use_vec_merge = true;
16744 break;
16745 }
16746
16747 if (TARGET_SSE2)
16748 {
16749 /* With SSE2, use integer shuffles to swap element 0 and ELT,
16750 store into element 0, then shuffle them back. */
16751
16752 rtx order[4];
16753
16754 order[0] = GEN_INT (elt);
16755 order[1] = const1_rtx;
16756 order[2] = const2_rtx;
16757 order[3] = GEN_INT (3);
16758 order[elt] = const0_rtx;
16759
16760 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
16761 order[1], order[2], order[3]));
16762
16763 ix86_expand_vector_set (false, target, val, 0);
16764
16765 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
16766 order[1], order[2], order[3]));
16767 }
16768 else
16769 {
16770 /* For SSE1, we have to reuse the V4SF code. */
16771 rtx t = gen_reg_rtx (V4SFmode);
16772 emit_move_insn (t, gen_lowpart (V4SFmode, target));
16773 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
16774 emit_move_insn (target, gen_lowpart (mode, t));
16775 }
16776 return;
16777
16778 case E_V8HImode:
7eb961d8 16779 case E_V8HFmode:
6910cad5 16780 case E_V8BFmode:
5883e567 16781 case E_V2HImode:
2bf6d935
ML
16782 use_vec_merge = TARGET_SSE2;
16783 break;
16784 case E_V4HImode:
16785 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
16786 break;
16787
16788 case E_V16QImode:
5883e567 16789 case E_V4QImode:
2bf6d935
ML
16790 use_vec_merge = TARGET_SSE4_1;
16791 break;
16792
16793 case E_V8QImode:
f15c7bd1 16794 use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
2bf6d935
ML
16795 break;
16796
16797 case E_V32QImode:
16798 half_mode = V16QImode;
16799 j = 0;
16800 n = 16;
16801 goto half;
16802
9e2a82e1 16803 case E_V16HFmode:
6910cad5 16804 case E_V16BFmode:
1f759dbd 16805 /* For ELT == 0, vec_setv8hf_0 can save 1 vpbroadcastw. */
16806 if (TARGET_AVX2 && elt != 0)
7fc4d600 16807 {
16808 mmode = SImode;
6910cad5 16809 gen_blendm = ((mode == E_V16HFmode) ? gen_avx2_pblendph_1
16810 : gen_avx2_pblendbf_1);
7fc4d600 16811 blendm_const = true;
16812 break;
16813 }
16814 else
16815 {
6910cad5 16816 half_mode = ((mode == E_V16HFmode) ? V8HFmode : V8BFmode);
16817 j = ((mode == E_V16HFmode) ? 6 : 7);
7fc4d600 16818 n = 8;
16819 goto half;
16820 }
9e2a82e1 16821
2bf6d935
ML
16822 case E_V16HImode:
16823 half_mode = V8HImode;
16824 j = 1;
16825 n = 8;
16826 goto half;
16827
16828 case E_V8SImode:
16829 half_mode = V4SImode;
16830 j = 2;
16831 n = 4;
16832 goto half;
16833
16834 case E_V4DImode:
16835 half_mode = V2DImode;
16836 j = 3;
16837 n = 2;
16838 goto half;
16839
16840 case E_V8SFmode:
16841 half_mode = V4SFmode;
16842 j = 4;
16843 n = 4;
16844 goto half;
16845
16846 case E_V4DFmode:
16847 half_mode = V2DFmode;
16848 j = 5;
16849 n = 2;
16850 goto half;
16851
16852half:
16853 /* Compute offset. */
16854 i = elt / n;
16855 elt %= n;
16856
16857 gcc_assert (i <= 1);
16858
16859 /* Extract the half. */
16860 tmp = gen_reg_rtx (half_mode);
16861 emit_insn (gen_extract[j][i] (tmp, target));
16862
16863 /* Put val in tmp at elt. */
16864 ix86_expand_vector_set (false, tmp, val, elt);
16865
16866 /* Put it back. */
16867 emit_insn (gen_insert[j][i] (target, target, tmp));
16868 return;
16869
16870 case E_V8DFmode:
16871 if (TARGET_AVX512F)
16872 {
16873 mmode = QImode;
16874 gen_blendm = gen_avx512f_blendmv8df;
16875 }
16876 break;
16877
16878 case E_V8DImode:
16879 if (TARGET_AVX512F)
16880 {
16881 mmode = QImode;
16882 gen_blendm = gen_avx512f_blendmv8di;
16883 }
16884 break;
16885
16886 case E_V16SFmode:
16887 if (TARGET_AVX512F)
16888 {
16889 mmode = HImode;
16890 gen_blendm = gen_avx512f_blendmv16sf;
16891 }
16892 break;
16893
16894 case E_V16SImode:
16895 if (TARGET_AVX512F)
16896 {
16897 mmode = HImode;
16898 gen_blendm = gen_avx512f_blendmv16si;
16899 }
16900 break;
16901
9e2a82e1 16902 case E_V32HFmode:
16903 if (TARGET_AVX512BW)
16904 {
16905 mmode = SImode;
16906 gen_blendm = gen_avx512bw_blendmv32hf;
16907 }
16908 break;
6910cad5 16909 case E_V32BFmode:
16910 if (TARGET_AVX512BW)
16911 {
16912 mmode = SImode;
16913 gen_blendm = gen_avx512bw_blendmv32bf;
16914 }
16915 break;
2bf6d935
ML
16916 case E_V32HImode:
16917 if (TARGET_AVX512BW)
16918 {
16919 mmode = SImode;
16920 gen_blendm = gen_avx512bw_blendmv32hi;
16921 }
16922 else if (TARGET_AVX512F)
16923 {
16924 half_mode = E_V8HImode;
16925 n = 8;
16926 goto quarter;
16927 }
16928 break;
16929
16930 case E_V64QImode:
16931 if (TARGET_AVX512BW)
16932 {
16933 mmode = DImode;
16934 gen_blendm = gen_avx512bw_blendmv64qi;
16935 }
16936 else if (TARGET_AVX512F)
16937 {
16938 half_mode = E_V16QImode;
16939 n = 16;
16940 goto quarter;
16941 }
16942 break;
16943
16944quarter:
16945 /* Compute offset. */
16946 i = elt / n;
16947 elt %= n;
16948
16949 gcc_assert (i <= 3);
16950
16951 {
16952 /* Extract the quarter. */
16953 tmp = gen_reg_rtx (V4SImode);
16954 rtx tmp2 = gen_lowpart (V16SImode, target);
16955 rtx mask = gen_reg_rtx (QImode);
16956
16957 emit_move_insn (mask, constm1_rtx);
16958 emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i),
16959 tmp, mask));
16960
16961 tmp2 = gen_reg_rtx (half_mode);
16962 emit_move_insn (tmp2, gen_lowpart (half_mode, tmp));
16963 tmp = tmp2;
16964
16965 /* Put val in tmp at elt. */
16966 ix86_expand_vector_set (false, tmp, val, elt);
16967
16968 /* Put it back. */
16969 tmp2 = gen_reg_rtx (V16SImode);
16970 rtx tmp3 = gen_lowpart (V16SImode, target);
16971 mask = gen_reg_rtx (HImode);
16972 emit_move_insn (mask, constm1_rtx);
16973 tmp = gen_lowpart (V4SImode, tmp);
16974 emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i),
16975 tmp3, mask));
16976 emit_move_insn (target, gen_lowpart (mode, tmp2));
16977 }
16978 return;
16979
16980 default:
16981 break;
16982 }
16983
16984 if (mmode != VOIDmode)
16985 {
16986 tmp = gen_reg_rtx (mode);
16987 emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
7fc4d600 16988 rtx merge_mask = gen_int_mode (HOST_WIDE_INT_1U << elt, mmode);
2bf6d935
ML
16989 /* The avx512*_blendm<mode> expanders have different operand order
16990 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
16991 elements where the mask is set and second input operand otherwise,
16992 in {sse,avx}*_*blend* the first input operand is used for elements
16993 where the mask is clear and second input operand otherwise. */
7fc4d600 16994 if (!blendm_const)
16995 merge_mask = force_reg (mmode, merge_mask);
16996 emit_insn (gen_blendm (target, target, tmp, merge_mask));
2bf6d935
ML
16997 }
16998 else if (use_vec_merge)
16999 {
ac173024 17000do_vec_merge:
2bf6d935
ML
17001 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
17002 tmp = gen_rtx_VEC_MERGE (mode, tmp, target,
17003 GEN_INT (HOST_WIDE_INT_1U << elt));
17004 emit_insn (gen_rtx_SET (target, tmp));
17005 }
17006 else
17007 {
17008 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
17009
17010 emit_move_insn (mem, target);
17011
17012 tmp = adjust_address (mem, inner_mode, elt * GET_MODE_SIZE (inner_mode));
17013 emit_move_insn (tmp, val);
17014
17015 emit_move_insn (target, mem);
17016 }
17017}
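
/* Illustrative sketch, not part of GCC: a scalar model of the "half"
   path above - pick the half containing ELT, extract it, set the
   element there, and insert the half back.  The helper name is
   hypothetical; N is the number of elements per half.  */
#if 0
static void
model_set_via_half (int *vec, int n, int elt, int val)
{
  int half = elt / n;           /* which half of the vector */
  int sub = elt % n;            /* position within that half */
  int tmp[32];                  /* large enough for the sketch */
  for (int k = 0; k < n; k++)   /* gen_extract[j][i] */
    tmp[k] = vec[half * n + k];
  tmp[sub] = val;               /* recursive ix86_expand_vector_set */
  for (int k = 0; k < n; k++)   /* gen_insert[j][i] */
    vec[half * n + k] = tmp[k];
}
#endif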
17018
17019void
17020ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
17021{
17022 machine_mode mode = GET_MODE (vec);
17023 machine_mode inner_mode = GET_MODE_INNER (mode);
17024 bool use_vec_extr = false;
17025 rtx tmp;
17026
17027 switch (mode)
17028 {
17029 case E_V2SImode:
5fbc8ab4
UB
17030 use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
17031 if (use_vec_extr)
17032 break;
17033 /* FALLTHRU */
17034
2bf6d935
ML
17035 case E_V2SFmode:
17036 if (!mmx_ok)
17037 break;
17038 /* FALLTHRU */
17039
17040 case E_V2DFmode:
17041 case E_V2DImode:
17042 case E_V2TImode:
17043 case E_V4TImode:
17044 use_vec_extr = true;
17045 break;
17046
17047 case E_V4SFmode:
17048 use_vec_extr = TARGET_SSE4_1;
17049 if (use_vec_extr)
17050 break;
17051
17052 switch (elt)
17053 {
17054 case 0:
17055 tmp = vec;
17056 break;
17057
17058 case 1:
17059 case 3:
17060 tmp = gen_reg_rtx (mode);
17061 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
17062 GEN_INT (elt), GEN_INT (elt),
17063 GEN_INT (elt+4), GEN_INT (elt+4)));
17064 break;
17065
17066 case 2:
17067 tmp = gen_reg_rtx (mode);
17068 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
17069 break;
17070
17071 default:
17072 gcc_unreachable ();
17073 }
17074 vec = tmp;
17075 use_vec_extr = true;
17076 elt = 0;
17077 break;
17078
17079 case E_V4SImode:
17080 use_vec_extr = TARGET_SSE4_1;
17081 if (use_vec_extr)
17082 break;
17083
17084 if (TARGET_SSE2)
17085 {
17086 switch (elt)
17087 {
17088 case 0:
17089 tmp = vec;
17090 break;
17091
17092 case 1:
17093 case 3:
17094 tmp = gen_reg_rtx (mode);
17095 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
17096 GEN_INT (elt), GEN_INT (elt),
17097 GEN_INT (elt), GEN_INT (elt)));
17098 break;
17099
17100 case 2:
17101 tmp = gen_reg_rtx (mode);
17102 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
17103 break;
17104
17105 default:
17106 gcc_unreachable ();
17107 }
17108 vec = tmp;
17109 use_vec_extr = true;
17110 elt = 0;
17111 }
17112 else
17113 {
17114 /* For SSE1, we have to reuse the V4SF code. */
17115 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
17116 gen_lowpart (V4SFmode, vec), elt);
17117 return;
17118 }
17119 break;
17120
17121 case E_V8HImode:
7a54d3de 17122 case E_V8HFmode:
6910cad5 17123 case E_V8BFmode:
5883e567 17124 case E_V2HImode:
2bf6d935
ML
17125 use_vec_extr = TARGET_SSE2;
17126 break;
17127 case E_V4HImode:
17128 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
17129 break;
17130
17131 case E_V16QImode:
17132 use_vec_extr = TARGET_SSE4_1;
f66e6e2b
JJ
17133 if (!use_vec_extr
17134 && TARGET_SSE2
17135 && elt == 0
17136 && (optimize_insn_for_size_p () || TARGET_INTER_UNIT_MOVES_FROM_VEC))
17137 {
17138 tmp = gen_reg_rtx (SImode);
17139 ix86_expand_vector_extract (false, tmp, gen_lowpart (V4SImode, vec),
17140 0);
17141 emit_insn (gen_rtx_SET (target, gen_lowpart (QImode, tmp)));
17142 return;
17143 }
2bf6d935 17144 break;
5883e567
UB
17145 case E_V4QImode:
17146 use_vec_extr = TARGET_SSE4_1;
17147 break;
2bf6d935
ML
17148
17149 case E_V8SFmode:
17150 if (TARGET_AVX)
17151 {
17152 tmp = gen_reg_rtx (V4SFmode);
17153 if (elt < 4)
17154 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
17155 else
17156 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
17157 ix86_expand_vector_extract (false, target, tmp, elt & 3);
17158 return;
17159 }
17160 break;
17161
17162 case E_V4DFmode:
17163 if (TARGET_AVX)
17164 {
17165 tmp = gen_reg_rtx (V2DFmode);
17166 if (elt < 2)
17167 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
17168 else
17169 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
17170 ix86_expand_vector_extract (false, target, tmp, elt & 1);
17171 return;
17172 }
17173 break;
17174
17175 case E_V32QImode:
17176 if (TARGET_AVX)
17177 {
17178 tmp = gen_reg_rtx (V16QImode);
17179 if (elt < 16)
17180 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
17181 else
17182 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
17183 ix86_expand_vector_extract (false, target, tmp, elt & 15);
17184 return;
17185 }
17186 break;
17187
17188 case E_V16HImode:
17189 if (TARGET_AVX)
17190 {
17191 tmp = gen_reg_rtx (V8HImode);
17192 if (elt < 8)
17193 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
17194 else
17195 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
17196 ix86_expand_vector_extract (false, target, tmp, elt & 7);
17197 return;
17198 }
17199 break;
17200
17201 case E_V8SImode:
17202 if (TARGET_AVX)
17203 {
17204 tmp = gen_reg_rtx (V4SImode);
17205 if (elt < 4)
17206 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
17207 else
17208 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
17209 ix86_expand_vector_extract (false, target, tmp, elt & 3);
17210 return;
17211 }
17212 break;
17213
17214 case E_V4DImode:
17215 if (TARGET_AVX)
17216 {
17217 tmp = gen_reg_rtx (V2DImode);
17218 if (elt < 2)
17219 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
17220 else
17221 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
17222 ix86_expand_vector_extract (false, target, tmp, elt & 1);
17223 return;
17224 }
17225 break;
17226
17227 case E_V32HImode:
17228 if (TARGET_AVX512BW)
17229 {
17230 tmp = gen_reg_rtx (V16HImode);
17231 if (elt < 16)
17232 emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
17233 else
17234 emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
17235 ix86_expand_vector_extract (false, target, tmp, elt & 15);
17236 return;
17237 }
17238 break;
17239
17240 case E_V64QImode:
17241 if (TARGET_AVX512BW)
17242 {
17243 tmp = gen_reg_rtx (V32QImode);
17244 if (elt < 32)
17245 emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
17246 else
17247 emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
17248 ix86_expand_vector_extract (false, target, tmp, elt & 31);
17249 return;
17250 }
17251 break;
17252
17253 case E_V16SFmode:
17254 tmp = gen_reg_rtx (V8SFmode);
17255 if (elt < 8)
17256 emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
17257 else
17258 emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
17259 ix86_expand_vector_extract (false, target, tmp, elt & 7);
17260 return;
17261
17262 case E_V8DFmode:
17263 tmp = gen_reg_rtx (V4DFmode);
17264 if (elt < 4)
17265 emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
17266 else
17267 emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
17268 ix86_expand_vector_extract (false, target, tmp, elt & 3);
17269 return;
17270
17271 case E_V16SImode:
17272 tmp = gen_reg_rtx (V8SImode);
17273 if (elt < 8)
17274 emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
17275 else
17276 emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
17277 ix86_expand_vector_extract (false, target, tmp, elt & 7);
17278 return;
17279
17280 case E_V8DImode:
17281 tmp = gen_reg_rtx (V4DImode);
17282 if (elt < 4)
17283 emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
17284 else
17285 emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
17286 ix86_expand_vector_extract (false, target, tmp, elt & 3);
17287 return;
17288
9e2a82e1 17289 case E_V32HFmode:
6910cad5 17290 case E_V32BFmode:
7a54d3de
UB
17291 if (TARGET_AVX512BW)
17292 {
6910cad5 17293 tmp = (mode == E_V32HFmode
17294 ? gen_reg_rtx (V16HFmode)
17295 : gen_reg_rtx (V16BFmode));
7a54d3de 17296 if (elt < 16)
6910cad5 17297 emit_insn (maybe_gen_vec_extract_lo (mode, tmp, vec));
7a54d3de 17298 else
6910cad5 17299 emit_insn (maybe_gen_vec_extract_hi (mode, tmp, vec));
7a54d3de
UB
17300 ix86_expand_vector_extract (false, target, tmp, elt & 15);
17301 return;
17302 }
17303 break;
9e2a82e1 17304
17305 case E_V16HFmode:
6910cad5 17306 case E_V16BFmode:
7a54d3de
UB
17307 if (TARGET_AVX)
17308 {
6910cad5 17309 tmp = (mode == E_V16HFmode
17310 ? gen_reg_rtx (V8HFmode)
17311 : gen_reg_rtx (V8BFmode));
7a54d3de 17312 if (elt < 8)
6910cad5 17313 emit_insn (maybe_gen_vec_extract_lo (mode, tmp, vec));
7a54d3de 17314 else
6910cad5 17315 emit_insn (maybe_gen_vec_extract_hi (mode, tmp, vec));
7a54d3de
UB
17316 ix86_expand_vector_extract (false, target, tmp, elt & 7);
17317 return;
17318 }
9e2a82e1 17319 break;
17320
2bf6d935 17321 case E_V8QImode:
5fbc8ab4 17322 use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
2bf6d935 17323 /* ??? Could extract the appropriate HImode element and shift. */
5fbc8ab4
UB
17324 break;
17325
2bf6d935
ML
17326 default:
17327 break;
17328 }
17329
17330 if (use_vec_extr)
17331 {
17332 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
17333 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
17334
17335 /* Let the rtl optimizers know about the zero extension performed. */
17336 if (inner_mode == QImode || inner_mode == HImode)
17337 {
97c32001 17338 rtx reg = gen_reg_rtx (SImode);
2bf6d935 17339 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
97c32001
RS
17340 emit_move_insn (reg, tmp);
17341 tmp = gen_lowpart (inner_mode, reg);
17342 SUBREG_PROMOTED_VAR_P (tmp) = 1;
17343 SUBREG_PROMOTED_SET (tmp, 1);
2bf6d935
ML
17344 }
17345
97c32001 17346 emit_move_insn (target, tmp);
2bf6d935
ML
17347 }
17348 else
17349 {
17350 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
17351
17352 emit_move_insn (mem, vec);
17353
17354 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
17355 emit_move_insn (target, tmp);
17356 }
17357}
17358
17359/* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
17360 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
17361 The upper bits of DEST are undefined, though they shouldn't cause
17362 exceptions (some bits from src or all zeros are ok). */
17363
17364static void
17365emit_reduc_half (rtx dest, rtx src, int i)
17366{
17367 rtx tem, d = dest;
17368 switch (GET_MODE (src))
17369 {
17370 case E_V4SFmode:
17371 if (i == 128)
17372 tem = gen_sse_movhlps (dest, src, src);
17373 else
17374 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
17375 GEN_INT (1 + 4), GEN_INT (1 + 4));
17376 break;
17377 case E_V2DFmode:
17378 tem = gen_vec_interleave_highv2df (dest, src, src);
17379 break;
73c535a0 17380 case E_V4QImode:
17381 d = gen_reg_rtx (V1SImode);
17382 tem = gen_mmx_lshrv1si3 (d, gen_lowpart (V1SImode, src),
17383 GEN_INT (i / 2));
17384 break;
77ca2cfc 17385 case E_V4HImode:
17386 d = gen_reg_rtx (V1DImode);
17387 tem = gen_mmx_lshrv1di3 (d, gen_lowpart (V1DImode, src),
17388 GEN_INT (i / 2));
17389 break;
2bf6d935
ML
17390 case E_V16QImode:
17391 case E_V8HImode:
3540429b 17392 case E_V8HFmode:
2bf6d935
ML
17393 case E_V4SImode:
17394 case E_V2DImode:
17395 d = gen_reg_rtx (V1TImode);
17396 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
17397 GEN_INT (i / 2));
17398 break;
17399 case E_V8SFmode:
17400 if (i == 256)
17401 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
17402 else
17403 tem = gen_avx_shufps256 (dest, src, src,
17404 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
17405 break;
17406 case E_V4DFmode:
17407 if (i == 256)
17408 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
17409 else
17410 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
17411 break;
17412 case E_V32QImode:
17413 case E_V16HImode:
3540429b 17414 case E_V16HFmode:
2bf6d935
ML
17415 case E_V8SImode:
17416 case E_V4DImode:
17417 if (i == 256)
17418 {
17419 if (GET_MODE (dest) != V4DImode)
17420 d = gen_reg_rtx (V4DImode);
17421 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
17422 gen_lowpart (V4DImode, src),
17423 const1_rtx);
17424 }
17425 else
17426 {
17427 d = gen_reg_rtx (V2TImode);
17428 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
17429 GEN_INT (i / 2));
17430 }
17431 break;
17432 case E_V64QImode:
17433 case E_V32HImode:
3540429b 17434 case E_V32HFmode:
bee27152
JJ
17435 if (i < 64)
17436 {
17437 d = gen_reg_rtx (V4TImode);
17438 tem = gen_avx512bw_lshrv4ti3 (d, gen_lowpart (V4TImode, src),
17439 GEN_INT (i / 2));
17440 break;
17441 }
17442 /* FALLTHRU */
2bf6d935
ML
17443 case E_V16SImode:
17444 case E_V16SFmode:
17445 case E_V8DImode:
17446 case E_V8DFmode:
17447 if (i > 128)
17448 tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
bee27152
JJ
17449 gen_lowpart (V16SImode, src),
17450 gen_lowpart (V16SImode, src),
17451 GEN_INT (0x4 + (i == 512 ? 4 : 0)),
17452 GEN_INT (0x5 + (i == 512 ? 4 : 0)),
17453 GEN_INT (0x6 + (i == 512 ? 4 : 0)),
17454 GEN_INT (0x7 + (i == 512 ? 4 : 0)),
17455 GEN_INT (0xC), GEN_INT (0xD),
17456 GEN_INT (0xE), GEN_INT (0xF),
17457 GEN_INT (0x10), GEN_INT (0x11),
17458 GEN_INT (0x12), GEN_INT (0x13),
17459 GEN_INT (0x14), GEN_INT (0x15),
17460 GEN_INT (0x16), GEN_INT (0x17));
2bf6d935
ML
17461 else
17462 tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
bee27152
JJ
17463 gen_lowpart (V16SImode, src),
17464 GEN_INT (i == 128 ? 0x2 : 0x1),
17465 GEN_INT (0x3),
17466 GEN_INT (0x3),
17467 GEN_INT (0x3),
17468 GEN_INT (i == 128 ? 0x6 : 0x5),
17469 GEN_INT (0x7),
17470 GEN_INT (0x7),
17471 GEN_INT (0x7),
17472 GEN_INT (i == 128 ? 0xA : 0x9),
17473 GEN_INT (0xB),
17474 GEN_INT (0xB),
17475 GEN_INT (0xB),
17476 GEN_INT (i == 128 ? 0xE : 0xD),
17477 GEN_INT (0xF),
17478 GEN_INT (0xF),
17479 GEN_INT (0xF));
2bf6d935
ML
17480 break;
17481 default:
17482 gcc_unreachable ();
17483 }
17484 emit_insn (tem);
17485 if (d != dest)
17486 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
17487}
17488
17489/* Expand a vector reduction. FN is the binary pattern to reduce;
17490 DEST is the destination; IN is the input vector. */
17491
17492void
17493ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
17494{
17495 rtx half, dst, vec = in;
17496 machine_mode mode = GET_MODE (in);
17497 int i;
17498
17499 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
17500 if (TARGET_SSE4_1
17501 && mode == V8HImode
17502 && fn == gen_uminv8hi3)
17503 {
17504 emit_insn (gen_sse4_1_phminposuw (dest, in));
17505 return;
17506 }
17507
17508 for (i = GET_MODE_BITSIZE (mode);
17509 i > GET_MODE_UNIT_BITSIZE (mode);
17510 i >>= 1)
17511 {
17512 half = gen_reg_rtx (mode);
17513 emit_reduc_half (half, vec, i);
17514 if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
17515 dst = dest;
17516 else
17517 dst = gen_reg_rtx (mode);
17518 emit_insn (fn (dst, half, vec));
17519 vec = dst;
17520 }
17521}
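
/* Illustrative sketch, not part of GCC: a scalar model of the halving
   reduction above, with addition standing in for FN.  In the real
   expansion every lane of DEST ends up holding the result; here only
   lane 0 does.  The helper name is hypothetical.  */
#if 0
static int
model_reduc_plus (int *vec, int n_elts)
{
  for (int half = n_elts / 2; half >= 1; half /= 2)
    for (int i = 0; i < half; i++)
      vec[i] = vec[i] + vec[i + half];  /* fn (dst, half, vec) */
  return vec[0];
}
#endif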
17522
17523/* Output code to perform a conditional jump to LABEL, if C2 flag in
17524 FP status register is set. */
17525
17526void
17527ix86_emit_fp_unordered_jump (rtx label)
17528{
17529 rtx reg = gen_reg_rtx (HImode);
17530 rtx_insn *insn;
17531 rtx temp;
17532
17533 emit_insn (gen_x86_fnstsw_1 (reg));
17534
17535 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
17536 {
17537 emit_insn (gen_x86_sahf_1 (reg));
17538
17539 temp = gen_rtx_REG (CCmode, FLAGS_REG);
17540 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
17541 }
17542 else
17543 {
17544 emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
17545
17546 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
17547 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
17548 }
17549
17550 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
17551 gen_rtx_LABEL_REF (VOIDmode, label),
17552 pc_rtx);
17553 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
17554 predict_jump (REG_BR_PROB_BASE * 10 / 100);
17555 JUMP_LABEL (insn) = label;
17556}
17557
 17558/* Output code to perform a sinh XFmode calculation. */
17559
152f243f
JJ
17560void
17561ix86_emit_i387_sinh (rtx op0, rtx op1)
2bf6d935
ML
17562{
17563 rtx e1 = gen_reg_rtx (XFmode);
17564 rtx e2 = gen_reg_rtx (XFmode);
17565 rtx scratch = gen_reg_rtx (HImode);
17566 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
17567 rtx half = const_double_from_real_value (dconsthalf, XFmode);
17568 rtx cst1, tmp;
17569 rtx_code_label *jump_label = gen_label_rtx ();
17570 rtx_insn *insn;
17571
17572 /* scratch = fxam (op1) */
17573 emit_insn (gen_fxamxf2_i387 (scratch, op1));
17574
17575 /* e1 = expm1 (|op1|) */
17576 emit_insn (gen_absxf2 (e2, op1));
17577 emit_insn (gen_expm1xf2 (e1, e2));
17578
17579 /* e2 = e1 / (e1 + 1.0) + e1 */
17580 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17581 emit_insn (gen_addxf3 (e2, e1, cst1));
17582 emit_insn (gen_divxf3 (e2, e1, e2));
17583 emit_insn (gen_addxf3 (e2, e2, e1));
17584
17585 /* flags = signbit (op1) */
17586 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
17587
17588 /* if (flags) then e2 = -e2 */
17589 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
17590 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
17591 gen_rtx_LABEL_REF (VOIDmode, jump_label),
17592 pc_rtx);
17593 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17594 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17595 JUMP_LABEL (insn) = jump_label;
17596
17597 emit_insn (gen_negxf2 (e2, e2));
17598
17599 emit_label (jump_label);
17600 LABEL_NUSES (jump_label) = 1;
17601
17602 /* op0 = 0.5 * e2 */
17603 half = force_reg (XFmode, half);
17604 emit_insn (gen_mulxf3 (op0, e2, half));
17605}
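
/* Illustrative sketch, not part of GCC: the identity used above,
   sinh(x) = sign(x) * 0.5 * (e/(e + 1) + e) with e = expm1(|x|),
   since e/(e + 1) + e = (1 - exp(-|x|)) + (exp(|x|) - 1).  The
   helper name is hypothetical.  */
#if 0
#include <math.h>
static long double
model_sinh (long double x)
{
  long double e = expm1l (fabsl (x));
  long double r = 0.5L * (e / (e + 1.0L) + e);
  return signbit (x) ? -r : r;
}
#endif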
17606
 17607/* Output code to perform a cosh XFmode calculation. */
17608
152f243f
JJ
17609void
17610ix86_emit_i387_cosh (rtx op0, rtx op1)
2bf6d935
ML
17611{
17612 rtx e1 = gen_reg_rtx (XFmode);
17613 rtx e2 = gen_reg_rtx (XFmode);
17614 rtx half = const_double_from_real_value (dconsthalf, XFmode);
17615 rtx cst1;
17616
17617 /* e1 = exp (op1) */
17618 emit_insn (gen_expxf2 (e1, op1));
17619
17620 /* e2 = e1 + 1.0 / e1 */
17621 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17622 emit_insn (gen_divxf3 (e2, cst1, e1));
17623 emit_insn (gen_addxf3 (e2, e1, e2));
17624
17625 /* op0 = 0.5 * e2 */
17626 half = force_reg (XFmode, half);
17627 emit_insn (gen_mulxf3 (op0, e2, half));
17628}
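
/* Illustrative sketch, not part of GCC: cosh(x) = 0.5 * (e + 1/e) with
   e = exp(x); cosh is even, so unlike sinh no sign fixup is emitted.
   The helper name is hypothetical.  */
#if 0
#include <math.h>
static long double
model_cosh (long double x)
{
  long double e = expl (x);
  return 0.5L * (e + 1.0L / e);
}
#endif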
17629
 17630/* Output code to perform a tanh XFmode calculation. */
17631
152f243f
JJ
17632void
17633ix86_emit_i387_tanh (rtx op0, rtx op1)
2bf6d935
ML
17634{
17635 rtx e1 = gen_reg_rtx (XFmode);
17636 rtx e2 = gen_reg_rtx (XFmode);
17637 rtx scratch = gen_reg_rtx (HImode);
17638 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
17639 rtx cst2, tmp;
17640 rtx_code_label *jump_label = gen_label_rtx ();
17641 rtx_insn *insn;
17642
17643 /* scratch = fxam (op1) */
17644 emit_insn (gen_fxamxf2_i387 (scratch, op1));
17645
17646 /* e1 = expm1 (-|2 * op1|) */
17647 emit_insn (gen_addxf3 (e2, op1, op1));
17648 emit_insn (gen_absxf2 (e2, e2));
17649 emit_insn (gen_negxf2 (e2, e2));
17650 emit_insn (gen_expm1xf2 (e1, e2));
17651
17652 /* e2 = e1 / (e1 + 2.0) */
17653 cst2 = force_reg (XFmode, CONST2_RTX (XFmode));
17654 emit_insn (gen_addxf3 (e2, e1, cst2));
17655 emit_insn (gen_divxf3 (e2, e1, e2));
17656
17657 /* flags = signbit (op1) */
17658 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
17659
17660 /* if (!flags) then e2 = -e2 */
17661 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
17662 gen_rtx_NE (VOIDmode, flags, const0_rtx),
17663 gen_rtx_LABEL_REF (VOIDmode, jump_label),
17664 pc_rtx);
17665 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17666 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17667 JUMP_LABEL (insn) = jump_label;
17668
17669 emit_insn (gen_negxf2 (e2, e2));
17670
17671 emit_label (jump_label);
17672 LABEL_NUSES (jump_label) = 1;
17673
17674 emit_move_insn (op0, e2);
17675}
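
/* Illustrative sketch, not part of GCC: with e = expm1(-|2x|),
   e/(e + 2) = (exp(-2|x|) - 1)/(exp(-2|x|) + 1) = -tanh(|x|), so only
   the sign has to be restored.  The helper name is hypothetical.  */
#if 0
#include <math.h>
static long double
model_tanh (long double x)
{
  long double e = expm1l (-fabsl (2.0L * x));
  long double r = -(e / (e + 2.0L));    /* tanh (|x|) */
  return signbit (x) ? -r : r;
}
#endif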
17676
17677/* Output code to perform an asinh XFmode calculation. */
17678
152f243f
JJ
17679void
17680ix86_emit_i387_asinh (rtx op0, rtx op1)
2bf6d935
ML
17681{
17682 rtx e1 = gen_reg_rtx (XFmode);
17683 rtx e2 = gen_reg_rtx (XFmode);
17684 rtx scratch = gen_reg_rtx (HImode);
17685 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
17686 rtx cst1, tmp;
17687 rtx_code_label *jump_label = gen_label_rtx ();
17688 rtx_insn *insn;
17689
17690 /* e2 = sqrt (op1^2 + 1.0) + 1.0 */
17691 emit_insn (gen_mulxf3 (e1, op1, op1));
17692 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17693 emit_insn (gen_addxf3 (e2, e1, cst1));
17694 emit_insn (gen_sqrtxf2 (e2, e2));
17695 emit_insn (gen_addxf3 (e2, e2, cst1));
17696
17697 /* e1 = e1 / e2 */
17698 emit_insn (gen_divxf3 (e1, e1, e2));
17699
17700 /* scratch = fxam (op1) */
17701 emit_insn (gen_fxamxf2_i387 (scratch, op1));
17702
17703 /* e1 = e1 + |op1| */
17704 emit_insn (gen_absxf2 (e2, op1));
17705 emit_insn (gen_addxf3 (e1, e1, e2));
17706
17707 /* e2 = log1p (e1) */
17708 ix86_emit_i387_log1p (e2, e1);
17709
17710 /* flags = signbit (op1) */
17711 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
17712
17713 /* if (flags) then e2 = -e2 */
17714 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
17715 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
17716 gen_rtx_LABEL_REF (VOIDmode, jump_label),
17717 pc_rtx);
17718 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17719 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17720 JUMP_LABEL (insn) = jump_label;
17721
17722 emit_insn (gen_negxf2 (e2, e2));
17723
17724 emit_label (jump_label);
17725 LABEL_NUSES (jump_label) = 1;
17726
17727 emit_move_insn (op0, e2);
17728}
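
/* Illustrative sketch, not part of GCC: x*x / (sqrt(x*x + 1) + 1)
   equals sqrt(x*x + 1) - 1, so adding |x| and taking log1p yields
   log (|x| + sqrt(x*x + 1)) = asinh(|x|) without cancellation for
   small |x|.  The helper name is hypothetical.  */
#if 0
#include <math.h>
static long double
model_asinh (long double x)
{
  long double x2 = x * x;
  long double t = x2 / (sqrtl (x2 + 1.0L) + 1.0L) + fabsl (x);
  long double r = log1pl (t);
  return signbit (x) ? -r : r;
}
#endif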
17729
17730/* Output code to perform an acosh XFmode calculation. */
17731
152f243f
JJ
17732void
17733ix86_emit_i387_acosh (rtx op0, rtx op1)
2bf6d935
ML
17734{
17735 rtx e1 = gen_reg_rtx (XFmode);
17736 rtx e2 = gen_reg_rtx (XFmode);
17737 rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17738
17739 /* e2 = sqrt (op1 + 1.0) */
17740 emit_insn (gen_addxf3 (e2, op1, cst1));
17741 emit_insn (gen_sqrtxf2 (e2, e2));
17742
17743 /* e1 = sqrt (op1 - 1.0) */
17744 emit_insn (gen_subxf3 (e1, op1, cst1));
17745 emit_insn (gen_sqrtxf2 (e1, e1));
17746
17747 /* e1 = e1 * e2 */
17748 emit_insn (gen_mulxf3 (e1, e1, e2));
17749
17750 /* e1 = e1 + op1 */
17751 emit_insn (gen_addxf3 (e1, e1, op1));
17752
17753 /* op0 = log (e1) */
17754 emit_insn (gen_logxf2 (op0, e1));
17755}
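
/* Illustrative sketch, not part of GCC: for x >= 1,
   acosh(x) = log (x + sqrt(x - 1) * sqrt(x + 1)) = log (x + sqrt(x*x - 1)).
   The helper name is hypothetical.  */
#if 0
#include <math.h>
static long double
model_acosh (long double x)
{
  return logl (x + sqrtl (x - 1.0L) * sqrtl (x + 1.0L));
}
#endif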
17756
17757/* Output code to perform an atanh XFmode calculation. */
17758
152f243f
JJ
17759void
17760ix86_emit_i387_atanh (rtx op0, rtx op1)
2bf6d935
ML
17761{
17762 rtx e1 = gen_reg_rtx (XFmode);
17763 rtx e2 = gen_reg_rtx (XFmode);
17764 rtx scratch = gen_reg_rtx (HImode);
17765 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
17766 rtx half = const_double_from_real_value (dconsthalf, XFmode);
17767 rtx cst1, tmp;
17768 rtx_code_label *jump_label = gen_label_rtx ();
17769 rtx_insn *insn;
17770
17771 /* scratch = fxam (op1) */
17772 emit_insn (gen_fxamxf2_i387 (scratch, op1));
17773
17774 /* e2 = |op1| */
17775 emit_insn (gen_absxf2 (e2, op1));
17776
17777 /* e1 = -(e2 + e2) / (e2 + 1.0) */
17778 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17779 emit_insn (gen_addxf3 (e1, e2, cst1));
17780 emit_insn (gen_addxf3 (e2, e2, e2));
17781 emit_insn (gen_negxf2 (e2, e2));
17782 emit_insn (gen_divxf3 (e1, e2, e1));
17783
17784 /* e2 = log1p (e1) */
17785 ix86_emit_i387_log1p (e2, e1);
17786
17787 /* flags = signbit (op1) */
17788 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
17789
17790 /* if (!flags) then e2 = -e2 */
17791 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
17792 gen_rtx_NE (VOIDmode, flags, const0_rtx),
17793 gen_rtx_LABEL_REF (VOIDmode, jump_label),
17794 pc_rtx);
17795 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17796 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17797 JUMP_LABEL (insn) = jump_label;
17798
17799 emit_insn (gen_negxf2 (e2, e2));
17800
17801 emit_label (jump_label);
17802 LABEL_NUSES (jump_label) = 1;
17803
17804 /* op0 = 0.5 * e2 */
17805 half = force_reg (XFmode, half);
17806 emit_insn (gen_mulxf3 (op0, e2, half));
17807}
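
/* Illustrative sketch, not part of GCC: with a = |x|,
   log1p (-2a/(a + 1)) = log ((1 - a)/(1 + a)) = -2 * atanh(a), so
   halving and restoring the sign gives atanh(x).  The helper name is
   hypothetical.  */
#if 0
#include <math.h>
static long double
model_atanh (long double x)
{
  long double a = fabsl (x);
  long double r = -0.5L * log1pl (-(a + a) / (a + 1.0L));
  return signbit (x) ? -r : r;
}
#endif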
17808
17809/* Output code to perform a log1p XFmode calculation. */
17810
152f243f
JJ
17811void
17812ix86_emit_i387_log1p (rtx op0, rtx op1)
2bf6d935
ML
17813{
17814 rtx_code_label *label1 = gen_label_rtx ();
17815 rtx_code_label *label2 = gen_label_rtx ();
17816
17817 rtx tmp = gen_reg_rtx (XFmode);
17818 rtx res = gen_reg_rtx (XFmode);
17819 rtx cst, cstln2, cst1;
17820 rtx_insn *insn;
17821
d481d137
JJ
17822 /* The emit_jump call emits pending stack adjust, make sure it is emitted
17823 before the conditional jump, otherwise the stack adjustment will be
17824 only conditional. */
17825 do_pending_stack_adjust ();
17826
2bf6d935
ML
17827 cst = const_double_from_real_value
17828 (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode);
17829 cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */
17830
17831 emit_insn (gen_absxf2 (tmp, op1));
17832
17833 cst = force_reg (XFmode, cst);
17834 ix86_expand_branch (GE, tmp, cst, label1);
17835 predict_jump (REG_BR_PROB_BASE * 10 / 100);
17836 insn = get_last_insn ();
17837 JUMP_LABEL (insn) = label1;
17838
17839 emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2));
17840 emit_jump (label2);
17841
17842 emit_label (label1);
17843 LABEL_NUSES (label1) = 1;
17844
17845 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17846 emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1)));
17847 emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2));
17848
17849 emit_label (label2);
17850 LABEL_NUSES (label2) = 1;
17851
17852 emit_move_insn (op0, res);
17853}
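
/* Illustrative sketch, not part of GCC: the cutoff constant above is
   approximately 1 - sqrt(2)/2, the documented argument bound of
   fyl2xp1; inside it the expansion uses the more accurate
   log-of-one-plus-x instruction, outside it falls back to log (1 + x).
   The helper name is hypothetical.  */
#if 0
#include <math.h>
static long double
model_log1p (long double x)
{
  const long double cut = 0.29289321881345247561810596348408353L;
  if (fabsl (x) < cut)
    return log1pl (x);          /* fyl2xp1-based path */
  return logl (x + 1.0L);       /* fyl2x-based path */
}
#endif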
17854
17855/* Emit code for round calculation. */
152f243f
JJ
17856void
17857ix86_emit_i387_round (rtx op0, rtx op1)
2bf6d935
ML
17858{
17859 machine_mode inmode = GET_MODE (op1);
17860 machine_mode outmode = GET_MODE (op0);
17861 rtx e1 = gen_reg_rtx (XFmode);
17862 rtx e2 = gen_reg_rtx (XFmode);
17863 rtx scratch = gen_reg_rtx (HImode);
17864 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
17865 rtx half = const_double_from_real_value (dconsthalf, XFmode);
17866 rtx res = gen_reg_rtx (outmode);
17867 rtx_code_label *jump_label = gen_label_rtx ();
17868 rtx (*floor_insn) (rtx, rtx);
17869 rtx (*neg_insn) (rtx, rtx);
17870 rtx_insn *insn;
17871 rtx tmp;
17872
17873 switch (inmode)
17874 {
17875 case E_SFmode:
17876 case E_DFmode:
17877 tmp = gen_reg_rtx (XFmode);
17878
17879 emit_insn (gen_rtx_SET (tmp, gen_rtx_FLOAT_EXTEND (XFmode, op1)));
17880 op1 = tmp;
17881 break;
17882 case E_XFmode:
17883 break;
17884 default:
17885 gcc_unreachable ();
17886 }
17887
17888 switch (outmode)
17889 {
17890 case E_SFmode:
17891 floor_insn = gen_frndintxf2_floor;
17892 neg_insn = gen_negsf2;
17893 break;
17894 case E_DFmode:
17895 floor_insn = gen_frndintxf2_floor;
17896 neg_insn = gen_negdf2;
17897 break;
17898 case E_XFmode:
17899 floor_insn = gen_frndintxf2_floor;
17900 neg_insn = gen_negxf2;
17901 break;
17902 case E_HImode:
17903 floor_insn = gen_lfloorxfhi2;
17904 neg_insn = gen_neghi2;
17905 break;
17906 case E_SImode:
17907 floor_insn = gen_lfloorxfsi2;
17908 neg_insn = gen_negsi2;
17909 break;
17910 case E_DImode:
17911 floor_insn = gen_lfloorxfdi2;
17912 neg_insn = gen_negdi2;
17913 break;
17914 default:
17915 gcc_unreachable ();
17916 }
17917
17918 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
17919
17920 /* scratch = fxam(op1) */
17921 emit_insn (gen_fxamxf2_i387 (scratch, op1));
17922
17923 /* e1 = fabs(op1) */
17924 emit_insn (gen_absxf2 (e1, op1));
17925
17926 /* e2 = e1 + 0.5 */
17927 half = force_reg (XFmode, half);
17928 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (XFmode, e1, half)));
17929
17930 /* res = floor(e2) */
17931 switch (outmode)
17932 {
17933 case E_SFmode:
17934 case E_DFmode:
17935 {
17936 tmp = gen_reg_rtx (XFmode);
17937
17938 emit_insn (floor_insn (tmp, e2));
17939 emit_insn (gen_rtx_SET (res,
17940 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp),
17941 UNSPEC_TRUNC_NOOP)));
17942 }
17943 break;
17944 default:
17945 emit_insn (floor_insn (res, e2));
17946 }
17947
17948 /* flags = signbit(a) */
17949 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
17950
17951 /* if (flags) then res = -res */
17952 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
17953 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
17954 gen_rtx_LABEL_REF (VOIDmode, jump_label),
17955 pc_rtx);
17956 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17957 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17958 JUMP_LABEL (insn) = jump_label;
17959
17960 emit_insn (neg_insn (res, res));
17961
17962 emit_label (jump_label);
17963 LABEL_NUSES (jump_label) = 1;
17964
17965 emit_move_insn (op0, res);
17966}
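
/* Illustrative sketch, not part of GCC: the expansion above implements
   rounding half away from zero, round(a) = sgn(a) * floor(|a| + 0.5),
   with the final truncating or float-extending conversion depending on
   OUTMODE.  The helper name is hypothetical.  */
#if 0
#include <math.h>
static long double
model_round (long double a)
{
  long double r = floorl (fabsl (a) + 0.5L);
  return signbit (a) ? -r : r;
}
#endif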
17967
 17968/* Output code to perform a Newton-Raphson approximation of a single precision
17969 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
17970
152f243f
JJ
17971void
17972ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
2bf6d935
ML
17973{
17974 rtx x0, x1, e0, e1;
17975
17976 x0 = gen_reg_rtx (mode);
17977 e0 = gen_reg_rtx (mode);
17978 e1 = gen_reg_rtx (mode);
17979 x1 = gen_reg_rtx (mode);
17980
17981 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
17982
17983 b = force_reg (mode, b);
17984
17985 /* x0 = rcp(b) estimate */
17986 if (mode == V16SFmode || mode == V8DFmode)
17987 {
17988 if (TARGET_AVX512ER)
17989 {
17990 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
17991 UNSPEC_RCP28)));
17992 /* res = a * x0 */
17993 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
17994 return;
17995 }
17996 else
17997 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
17998 UNSPEC_RCP14)));
17999 }
18000 else
18001 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
18002 UNSPEC_RCP)));
18003
18004 /* e0 = x0 * b */
18005 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
18006
18007 /* e0 = x0 * e0 */
18008 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
18009
18010 /* e1 = x0 + x0 */
18011 emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
18012
18013 /* x1 = e1 - e0 */
18014 emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
18015
18016 /* res = a * x1 */
18017 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
18018}
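
/* Illustrative sketch, not part of GCC: one Newton-Raphson step for
   the reciprocal, x1 = x0 * (2 - b*x0) written as (x0 + x0) - b*x0*x0,
   starting from the hardware rcp/rcp14/rcp28 estimate x0 ~ 1/b.  The
   helper name is hypothetical.  */
#if 0
static float
model_swdiv (float a, float b, float x0)
{
  float e0 = x0 * b;            /* b * x0 */
  e0 = x0 * e0;                 /* b * x0 * x0 */
  float x1 = (x0 + x0) - e0;    /* refined reciprocal of b */
  return a * x1;
}
#endif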
18019
 18020/* Output code to perform a Newton-Raphson approximation of a
18021 single precision floating point [reciprocal] square root. */
18022
152f243f
JJ
18023void
18024ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
2bf6d935
ML
18025{
18026 rtx x0, e0, e1, e2, e3, mthree, mhalf;
18027 REAL_VALUE_TYPE r;
18028 int unspec;
18029
18030 x0 = gen_reg_rtx (mode);
18031 e0 = gen_reg_rtx (mode);
18032 e1 = gen_reg_rtx (mode);
18033 e2 = gen_reg_rtx (mode);
18034 e3 = gen_reg_rtx (mode);
18035
18036 if (TARGET_AVX512ER && mode == V16SFmode)
18037 {
18038 if (recip)
18039 /* res = rsqrt28(a) estimate */
18040 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
18041 UNSPEC_RSQRT28)));
18042 else
18043 {
18044 /* x0 = rsqrt28(a) estimate */
18045 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
18046 UNSPEC_RSQRT28)));
18047 /* res = rcp28(x0) estimate */
18048 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
18049 UNSPEC_RCP28)));
18050 }
18051 return;
18052 }
18053
18054 real_from_integer (&r, VOIDmode, -3, SIGNED);
18055 mthree = const_double_from_real_value (r, SFmode);
18056
18057 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
18058 mhalf = const_double_from_real_value (r, SFmode);
18059 unspec = UNSPEC_RSQRT;
18060
18061 if (VECTOR_MODE_P (mode))
18062 {
18063 mthree = ix86_build_const_vector (mode, true, mthree);
18064 mhalf = ix86_build_const_vector (mode, true, mhalf);
18065 /* There is no 512-bit rsqrt. There is however rsqrt14. */
18066 if (GET_MODE_SIZE (mode) == 64)
18067 unspec = UNSPEC_RSQRT14;
18068 }
18069
18070 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
18071 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
18072
18073 a = force_reg (mode, a);
18074
18075 /* x0 = rsqrt(a) estimate */
18076 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
18077 unspec)));
18078
 18079 /* If a == 0.0, filter out the infinite rsqrt estimate to prevent NaN for sqrt(0.0). */
18080 if (!recip)
18081 {
18082 rtx zero = force_reg (mode, CONST0_RTX(mode));
18083 rtx mask;
18084
18085 /* Handle masked compare. */
18086 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
18087 {
18088 mask = gen_reg_rtx (HImode);
18089 /* Imm value 0x4 corresponds to not-equal comparison. */
18090 emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
18091 emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
18092 }
18093 else
18094 {
18095 mask = gen_reg_rtx (mode);
18096 emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
18097 emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
18098 }
18099 }
18100
fab263ab
L
18101 mthree = force_reg (mode, mthree);
18102
2bf6d935
ML
18103 /* e0 = x0 * a */
18104 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
2bf6d935 18105
a6645a82
L
18106 unsigned vector_size = GET_MODE_SIZE (mode);
18107 if (TARGET_FMA
18108 || (TARGET_AVX512F && vector_size == 64)
18109 || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
fab263ab
L
18110 emit_insn (gen_rtx_SET (e2,
18111 gen_rtx_FMA (mode, e0, x0, mthree)));
18112 else
18113 {
18114 /* e1 = e0 * x0 */
18115 emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
18116
18117 /* e2 = e1 - 3. */
18118 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
18119 }
2bf6d935
ML
18120
18121 mhalf = force_reg (mode, mhalf);
18122 if (recip)
18123 /* e3 = -.5 * x0 */
18124 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
18125 else
18126 /* e3 = -.5 * e0 */
18127 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
18128 /* ret = e2 * e3 */
18129 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
18130}
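
/* Illustrative sketch, not part of GCC: the Newton-Raphson step used
   above, rsqrt(a) ~ -0.5 * x0 * (a*x0*x0 - 3) and
   sqrt(a) ~ -0.5 * (a*x0) * (a*x0*x0 - 3), where x0 is the hardware
   rsqrt estimate (zeroed when a == 0 so sqrt(0.0) does not become
   0 * inf = NaN).  The helper name is hypothetical.  */
#if 0
static float
model_swsqrt (float a, float x0, int recip)
{
  if (!recip && a == 0.0f)
    x0 = 0.0f;
  float e0 = x0 * a;
  float e2 = e0 * x0 - 3.0f;            /* a*x0*x0 - 3 */
  float e3 = -0.5f * (recip ? x0 : e0);
  return e2 * e3;
}
#endif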
18131
18132/* Expand fabs (OP0) and return a new rtx that holds the result. The
18133 mask for masking out the sign-bit is stored in *SMASK, if that is
18134 non-null. */
18135
18136static rtx
18137ix86_expand_sse_fabs (rtx op0, rtx *smask)
18138{
18139 machine_mode vmode, mode = GET_MODE (op0);
18140 rtx xa, mask;
18141
18142 xa = gen_reg_rtx (mode);
18143 if (mode == SFmode)
18144 vmode = V4SFmode;
18145 else if (mode == DFmode)
18146 vmode = V2DFmode;
18147 else
18148 vmode = mode;
18149 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
18150 if (!VECTOR_MODE_P (mode))
18151 {
18152 /* We need to generate a scalar mode mask in this case. */
18153 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
18154 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
18155 mask = gen_reg_rtx (mode);
18156 emit_insn (gen_rtx_SET (mask, tmp));
18157 }
18158 emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
18159
18160 if (smask)
18161 *smask = mask;
18162
18163 return xa;
18164}
18165
18166/* Expands a comparison of OP0 with OP1 using comparison code CODE,
18167 swapping the operands if SWAP_OPERANDS is true. The expanded
18168 code is a forward jump to a newly created label in case the
18169 comparison is true. The generated label rtx is returned. */
18170static rtx_code_label *
18171ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
18172 bool swap_operands)
18173{
18174 bool unordered_compare = ix86_unordered_fp_compare (code);
18175 rtx_code_label *label;
18176 rtx tmp, reg;
18177
18178 if (swap_operands)
18179 std::swap (op0, op1);
18180
18181 label = gen_label_rtx ();
18182 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
18183 if (unordered_compare)
18184 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
18185 reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
18186 emit_insn (gen_rtx_SET (reg, tmp));
18187 tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
18188 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
18189 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
18190 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
18191 JUMP_LABEL (tmp) = label;
18192
18193 return label;
18194}
18195
18196/* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
18197 using comparison code CODE. Operands are swapped for the comparison if
18198 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
18199static rtx
18200ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
18201 bool swap_operands)
18202{
18203 rtx (*insn)(rtx, rtx, rtx, rtx);
18204 machine_mode mode = GET_MODE (op0);
18205 rtx mask = gen_reg_rtx (mode);
18206
18207 if (swap_operands)
18208 std::swap (op0, op1);
18209
18210 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
18211
18212 emit_insn (insn (mask, op0, op1,
18213 gen_rtx_fmt_ee (code, mode, op0, op1)));
18214 return mask;
18215}
18216
18217/* Expand copysign from SIGN to the positive value ABS_VALUE
18218 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
18219 the sign-bit. */
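/* Illustrative sketch (an addition, not the original comment): with S
   denoting the sign-bit mask, the sequence computes
     RESULT = ABS_VALUE | (SIGN & S)          when MASK is null, or
     RESULT = ABS_VALUE | (SIGN & ~MASK)      when MASK is given,
   i.e. it transfers the sign bit of SIGN onto the non-negative
   ABS_VALUE.  */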
18220
18221static void
18222ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
18223{
18224 machine_mode mode = GET_MODE (sign);
18225 rtx sgn = gen_reg_rtx (mode);
18226 if (mask == NULL_RTX)
18227 {
18228 machine_mode vmode;
18229
18230 if (mode == SFmode)
18231 vmode = V4SFmode;
18232 else if (mode == DFmode)
18233 vmode = V2DFmode;
18234 else
18235 vmode = mode;
18236
18237 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
18238 if (!VECTOR_MODE_P (mode))
18239 {
18240 /* We need to generate a scalar mode mask in this case. */
18241 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
18242 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
18243 mask = gen_reg_rtx (mode);
18244 emit_insn (gen_rtx_SET (mask, tmp));
18245 }
18246 }
18247 else
18248 mask = gen_rtx_NOT (mode, mask);
18249 emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
18250 emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
18251}
18252
18253/* Expand SSE sequence for computing lround from OP1 storing
18254 into OP0. */
18255
18256void
18257ix86_expand_lround (rtx op0, rtx op1)
18258{
18259 /* C code for the stuff we're doing below:
d2754fbb
UB
18260 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
18261 return (long)tmp;
2bf6d935
ML
18262 */
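  /* Illustrative note (an addition): nextafter (0.5, 0.0) rather than 0.5
     is used so that an input just below 0.5 is not nudged up to 1.0 by
     the addition, while a true halfway case such as 2.5 still reaches
     3.0 before the truncation.  */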
18263 machine_mode mode = GET_MODE (op1);
18264 const struct real_format *fmt;
18265 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
18266 rtx adj;
18267
18268 /* load nextafter (0.5, 0.0) */
18269 fmt = REAL_MODE_FORMAT (mode);
18270 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
18271 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
18272
18273 /* adj = copysign (0.5, op1) */
18274 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
18275 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
18276
18277 /* adj = op1 + adj */
18278 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
18279
18280 /* op0 = (imode)adj */
18281 expand_fix (op0, adj, 0);
18282}
18283
18284/* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1
18285 storing into OPERAND0. */
18286
18287void
18288ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
18289{
18290 /* C code for the stuff we're doing below (for do_floor):
18291 xi = (long)op1;
d2754fbb
UB
18292 xi -= (double)xi > op1 ? 1 : 0;
18293 return xi;
2bf6d935
ML
18294 */
18295 machine_mode fmode = GET_MODE (op1);
18296 machine_mode imode = GET_MODE (op0);
18297 rtx ireg, freg, tmp;
18298 rtx_code_label *label;
18299
18300 /* reg = (long)op1 */
18301 ireg = gen_reg_rtx (imode);
18302 expand_fix (ireg, op1, 0);
18303
18304 /* freg = (double)reg */
18305 freg = gen_reg_rtx (fmode);
18306 expand_float (freg, ireg, 0);
18307
18308 /* ireg = (freg > op1) ? ireg - 1 : ireg */
18309 label = ix86_expand_sse_compare_and_jump (UNLE,
18310 freg, op1, !do_floor);
18311 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
18312 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
18313 emit_move_insn (ireg, tmp);
18314
18315 emit_label (label);
18316 LABEL_NUSES (label) = 1;
18317
18318 emit_move_insn (op0, ireg);
18319}
18320
18321/* Generate and return a rtx of mode MODE for 2**n where n is the number
18322 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
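/* Illustrative note (an addition): for DFmode this is 2**52 and for
   SFmode 2**23, the magnitudes at and above which every representable
   value is already an integer, so x + TWO52 - TWO52 rounds any |x| below
   TWO52 to an integer in the current rounding mode.  */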
18323
18324static rtx
18325ix86_gen_TWO52 (machine_mode mode)
18326{
d2754fbb 18327 const struct real_format *fmt;
2bf6d935
ML
18328 REAL_VALUE_TYPE TWO52r;
18329 rtx TWO52;
18330
d2754fbb
UB
18331 fmt = REAL_MODE_FORMAT (mode);
18332 real_2expN (&TWO52r, fmt->p - 1, mode);
2bf6d935
ML
18333 TWO52 = const_double_from_real_value (TWO52r, mode);
18334 TWO52 = force_reg (mode, TWO52);
18335
18336 return TWO52;
18337}
18338
18339/* Expand rint rounding OPERAND1 and storing the result in OPERAND0. */
18340
18341void
18342ix86_expand_rint (rtx operand0, rtx operand1)
18343{
18344 /* C code for the stuff we're doing below:
18345 xa = fabs (operand1);
d2754fbb 18346 if (!isless (xa, 2**52))
2bf6d935 18347 return operand1;
d2754fbb
UB
18348 two52 = 2**52;
18349 if (flag_rounding_math)
2bf6d935
ML
18350 {
18351 two52 = copysign (two52, operand1);
18352 xa = operand1;
18353 }
d2754fbb
UB
18354 xa = xa + two52 - two52;
18355 return copysign (xa, operand1);
2bf6d935
ML
18356 */
18357 machine_mode mode = GET_MODE (operand0);
81615bb0 18358 rtx res, xa, TWO52, mask;
2bf6d935
ML
18359 rtx_code_label *label;
18360
d2754fbb
UB
18361 TWO52 = ix86_gen_TWO52 (mode);
18362
18363 /* Temporary for holding the result, initialized to the input
18364 operand to ease control flow. */
18365 res = copy_to_reg (operand1);
2bf6d935
ML
18366
18367 /* xa = abs (operand1) */
18368 xa = ix86_expand_sse_fabs (res, &mask);
18369
18370 /* if (!isless (xa, TWO52)) goto label; */
2bf6d935
ML
18371 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18372
2bf6d935
ML
18373 if (flag_rounding_math)
18374 {
81615bb0 18375 ix86_sse_copysign_to_positive (TWO52, TWO52, res, mask);
2bf6d935
ML
18376 xa = res;
18377 }
18378
81615bb0
UB
18379 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
18380 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
18381
18382 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
18383 if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
18384 xa = ix86_expand_sse_fabs (xa, NULL);
2bf6d935
ML
18385
18386 ix86_sse_copysign_to_positive (res, xa, res, mask);
18387
18388 emit_label (label);
18389 LABEL_NUSES (label) = 1;
18390
18391 emit_move_insn (operand0, res);
18392}
18393
36d387f2
UB
18394/* Expand SSE2 sequence for computing floor or ceil
18395 from OPERAND1 storing into OPERAND0. */
2bf6d935
ML
18396void
18397ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
18398{
18399 /* C code for the stuff we expand below.
18400 double xa = fabs (x), x2;
d2754fbb
UB
18401 if (!isless (xa, TWO52))
18402 return x;
2bf6d935 18403 x2 = (double)(long)x;
337ed0eb 18404
2bf6d935
ML
18405 Compensate. Floor:
18406 if (x2 > x)
18407 x2 -= 1;
18408 Compensate. Ceil:
18409 if (x2 < x)
18410 x2 += 1;
337ed0eb 18411
2bf6d935
ML
18412 if (HONOR_SIGNED_ZEROS (mode))
18413 return copysign (x2, x);
18414 return x2;
18415 */
18416 machine_mode mode = GET_MODE (operand0);
18417 rtx xa, xi, TWO52, tmp, one, res, mask;
18418 rtx_code_label *label;
18419
18420 TWO52 = ix86_gen_TWO52 (mode);
18421
18422 /* Temporary for holding the result, initialized to the input
18423 operand to ease control flow. */
d2754fbb 18424 res = copy_to_reg (operand1);
2bf6d935
ML
18425
18426 /* xa = abs (operand1) */
18427 xa = ix86_expand_sse_fabs (res, &mask);
18428
18429 /* if (!isless (xa, TWO52)) goto label; */
18430 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18431
18432 /* xa = (double)(long)x */
d2754fbb 18433 xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
2bf6d935
ML
18434 expand_fix (xi, res, 0);
18435 expand_float (xa, xi, 0);
18436
18437 /* generate 1.0 */
18438 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
18439
18440 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
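  /* Illustrative note (an addition): the compare mask below is all-ones
     per element when the condition holds, so ANDing it with the constant
     1.0 yields exactly 1.0 or 0.0, which is then subtracted (floor) or
     added (ceil).  */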
18441 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
18442 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
18443 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
18444 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
2bf6d935 18445 if (HONOR_SIGNED_ZEROS (mode))
337ed0eb
UB
18446 {
18447 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
18448 if (do_floor && flag_rounding_math)
18449 tmp = ix86_expand_sse_fabs (tmp, NULL);
18450
18451 ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
18452 }
18453 emit_move_insn (res, tmp);
2bf6d935
ML
18454
18455 emit_label (label);
18456 LABEL_NUSES (label) = 1;
18457
18458 emit_move_insn (operand0, res);
18459}
18460
36d387f2
UB
18461/* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
18462 into OPERAND0 without relying on DImode truncation via cvttsd2siq
18463 that is only available on 64bit targets. */
2bf6d935 18464void
36d387f2 18465ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
2bf6d935
ML
18466{
18467 /* C code for the stuff we expand below.
d2754fbb
UB
18468 double xa = fabs (x), x2;
18469 if (!isless (xa, TWO52))
18470 return x;
18471 xa = xa + TWO52 - TWO52;
18472 x2 = copysign (xa, x);
337ed0eb 18473
36d387f2 18474 Compensate. Floor:
d2754fbb
UB
18475 if (x2 > x)
18476 x2 -= 1;
36d387f2 18477 Compensate. Ceil:
d2754fbb
UB
18478 if (x2 < x)
18479 x2 += 1;
337ed0eb 18480
36d387f2
UB
18481 if (HONOR_SIGNED_ZEROS (mode))
18482 x2 = copysign (x2, x);
18483 return x2;
2bf6d935
ML
18484 */
18485 machine_mode mode = GET_MODE (operand0);
36d387f2 18486 rtx xa, TWO52, tmp, one, res, mask;
2bf6d935
ML
18487 rtx_code_label *label;
18488
18489 TWO52 = ix86_gen_TWO52 (mode);
18490
18491 /* Temporary for holding the result, initialized to the input
18492 operand to ease control flow. */
d2754fbb 18493 res = copy_to_reg (operand1);
2bf6d935
ML
18494
18495 /* xa = abs (operand1) */
18496 xa = ix86_expand_sse_fabs (res, &mask);
18497
18498 /* if (!isless (xa, TWO52)) goto label; */
18499 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18500
36d387f2
UB
18501 /* xa = xa + TWO52 - TWO52; */
18502 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
18503 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
2bf6d935 18504
36d387f2
UB
18505 /* xa = copysign (xa, operand1) */
18506 ix86_sse_copysign_to_positive (xa, xa, res, mask);
2bf6d935 18507
36d387f2
UB
18508 /* generate 1.0 */
18509 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
2bf6d935 18510
36d387f2
UB
18511 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
18512 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
18513 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
18514 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
18515 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
337ed0eb
UB
18516 if (HONOR_SIGNED_ZEROS (mode))
18517 {
18518 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
18519 if (do_floor && flag_rounding_math)
18520 tmp = ix86_expand_sse_fabs (tmp, NULL);
18521
18522 ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
18523 }
36d387f2 18524 emit_move_insn (res, tmp);
2bf6d935
ML
18525
18526 emit_label (label);
18527 LABEL_NUSES (label) = 1;
18528
18529 emit_move_insn (operand0, res);
18530}
18531
36d387f2
UB
18532/* Expand SSE sequence for computing trunc
18533 from OPERAND1 storing into OPERAND0. */
2bf6d935
ML
18534void
18535ix86_expand_trunc (rtx operand0, rtx operand1)
18536{
18537 /* C code for SSE variant we expand below.
d2754fbb
UB
18538 double xa = fabs (x), x2;
18539 if (!isless (xa, TWO52))
18540 return x;
18541 x2 = (double)(long)x;
2bf6d935
ML
18542 if (HONOR_SIGNED_ZEROS (mode))
18543 return copysign (x2, x);
18544 return x2;
18545 */
18546 machine_mode mode = GET_MODE (operand0);
18547 rtx xa, xi, TWO52, res, mask;
18548 rtx_code_label *label;
18549
18550 TWO52 = ix86_gen_TWO52 (mode);
18551
18552 /* Temporary for holding the result, initialized to the input
18553 operand to ease control flow. */
d2754fbb 18554 res = copy_to_reg (operand1);
2bf6d935
ML
18555
18556 /* xa = abs (operand1) */
18557 xa = ix86_expand_sse_fabs (res, &mask);
18558
18559 /* if (!isless (xa, TWO52)) goto label; */
18560 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18561
97d3ddcf 18562 /* xa = (double)(long)x */
d2754fbb 18563 xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
2bf6d935 18564 expand_fix (xi, res, 0);
97d3ddcf 18565 expand_float (xa, xi, 0);
2bf6d935
ML
18566
18567 if (HONOR_SIGNED_ZEROS (mode))
97d3ddcf
UB
18568 ix86_sse_copysign_to_positive (xa, xa, res, mask);
18569
18570 emit_move_insn (res, xa);
2bf6d935
ML
18571
18572 emit_label (label);
18573 LABEL_NUSES (label) = 1;
18574
18575 emit_move_insn (operand0, res);
18576}
18577
18578/* Expand SSE sequence for computing trunc from OPERAND1 storing
36d387f2
UB
18579 into OPERAND0 without relying on DImode truncation via cvttsd2siq
18580 that is only available on 64bit targets. */
2bf6d935
ML
18581void
18582ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
18583{
18584 machine_mode mode = GET_MODE (operand0);
c142ae5e 18585 rtx xa, xa2, TWO52, tmp, one, res, mask;
2bf6d935
ML
18586 rtx_code_label *label;
18587
18588 /* C code for SSE variant we expand below.
d2754fbb
UB
18589 double xa = fabs (x), x2;
18590 if (!isless (xa, TWO52))
18591 return x;
18592 xa2 = xa + TWO52 - TWO52;
2bf6d935 18593 Compensate:
d2754fbb
UB
18594 if (xa2 > xa)
18595 xa2 -= 1.0;
18596 x2 = copysign (xa2, x);
18597 return x2;
2bf6d935
ML
18598 */
18599
18600 TWO52 = ix86_gen_TWO52 (mode);
18601
18602 /* Temporary for holding the result, initialized to the input
18603 operand to ease control flow. */
d2754fbb 18604 res = copy_to_reg (operand1);
2bf6d935
ML
18605
18606 /* xa = abs (operand1) */
c142ae5e 18607 xa = ix86_expand_sse_fabs (res, &mask);
2bf6d935
ML
18608
18609 /* if (!isless (xa, TWO52)) goto label; */
18610 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18611
c142ae5e
UB
18612 /* xa2 = xa + TWO52 - TWO52; */
18613 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
18614 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
2bf6d935
ML
18615
18616 /* generate 1.0 */
18617 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
18618
c142ae5e
UB
18619 /* Compensate: xa2 = xa2 - (xa2 > xa ? 1 : 0) */
18620 tmp = ix86_expand_sse_compare_mask (UNGT, xa2, xa, false);
18621 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
2bf6d935 18622 tmp = expand_simple_binop (mode, MINUS,
c142ae5e
UB
18623 xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
18624 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
81615bb0 18625 if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
c142ae5e 18626 tmp = ix86_expand_sse_fabs (tmp, NULL);
2bf6d935 18627
c142ae5e
UB
18628 /* res = copysign (xa2, operand1) */
18629 ix86_sse_copysign_to_positive (res, tmp, res, mask);
2bf6d935
ML
18630
18631 emit_label (label);
18632 LABEL_NUSES (label) = 1;
18633
18634 emit_move_insn (operand0, res);
18635}
18636
36d387f2
UB
18637/* Expand SSE sequence for computing round
18638 from OPERAND1 storing into OPERAND0. */
2bf6d935
ML
18639void
18640ix86_expand_round (rtx operand0, rtx operand1)
18641{
18642 /* C code for the stuff we're doing below:
d2754fbb
UB
18643 double xa = fabs (x);
18644 if (!isless (xa, TWO52))
18645 return x;
18646 xa = (double)(long)(xa + nextafter (0.5, 0.0));
18647 return copysign (xa, x);
2bf6d935
ML
18648 */
18649 machine_mode mode = GET_MODE (operand0);
18650 rtx res, TWO52, xa, xi, half, mask;
18651 rtx_code_label *label;
18652 const struct real_format *fmt;
18653 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
18654
18655 /* Temporary for holding the result, initialized to the input
18656 operand to ease control flow. */
d2754fbb 18657 res = copy_to_reg (operand1);
2bf6d935
ML
18658
18659 TWO52 = ix86_gen_TWO52 (mode);
18660 xa = ix86_expand_sse_fabs (res, &mask);
18661 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18662
18663 /* load nextafter (0.5, 0.0) */
18664 fmt = REAL_MODE_FORMAT (mode);
18665 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
18666 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
18667
18668 /* xa = xa + 0.5 */
18669 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
18670 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
18671
18672 /* xa = (double)(int64_t)xa */
d2754fbb 18673 xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
2bf6d935
ML
18674 expand_fix (xi, xa, 0);
18675 expand_float (xa, xi, 0);
18676
18677 /* res = copysign (xa, operand1) */
97d3ddcf 18678 ix86_sse_copysign_to_positive (res, xa, res, mask);
2bf6d935
ML
18679
18680 emit_label (label);
18681 LABEL_NUSES (label) = 1;
18682
18683 emit_move_insn (operand0, res);
18684}
18685
36d387f2
UB
18686/* Expand SSE sequence for computing round from OPERAND1 storing
18687 into OPERAND0 without relying on DImode truncation via cvttsd2siq
18688 that is only available on 64bit targets. */
18689void
18690ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
18691{
18692 /* C code for the stuff we expand below.
d2754fbb
UB
18693 double xa = fabs (x), xa2, x2;
18694 if (!isless (xa, TWO52))
18695 return x;
36d387f2
UB
18696 Using the absolute value and copying back sign makes
18697 -0.0 -> -0.0 correct.
d2754fbb 18698 xa2 = xa + TWO52 - TWO52;
36d387f2
UB
18699 Compensate.
18700 dxa = xa2 - xa;
d2754fbb
UB
18701 if (dxa <= -0.5)
18702 xa2 += 1;
18703 else if (dxa > 0.5)
18704 xa2 -= 1;
18705 x2 = copysign (xa2, x);
18706 return x2;
36d387f2
UB
18707 */
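  /* Illustrative note (an addition): dxa is the signed error of the
     TWO52 rounding trick, in [-0.5, 0.5] under round-to-nearest.  A
     halfway case that was rounded down (dxa == -0.5) is bumped back up,
     so halfway values round away from zero as round () requires.  */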
18708 machine_mode mode = GET_MODE (operand0);
18709 rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
18710 rtx_code_label *label;
18711
18712 TWO52 = ix86_gen_TWO52 (mode);
18713
18714 /* Temporary for holding the result, initialized to the input
18715 operand to ease control flow. */
d2754fbb 18716 res = copy_to_reg (operand1);
36d387f2
UB
18717
18718 /* xa = abs (operand1) */
18719 xa = ix86_expand_sse_fabs (res, &mask);
18720
18721 /* if (!isless (xa, TWO52)) goto label; */
18722 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18723
18724 /* xa2 = xa + TWO52 - TWO52; */
18725 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
18726 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
18727
18728 /* dxa = xa2 - xa; */
18729 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
18730
18731 /* generate 0.5, 1.0 and -0.5 */
18732 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
18733 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
18734 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
18735 0, OPTAB_DIRECT);
18736
18737 /* Compensate. */
18738 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
18739 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
18740 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
18741 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
18742 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
18743 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
18744 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
18745 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
18746
18747 /* res = copysign (xa2, operand1) */
97d3ddcf 18748 ix86_sse_copysign_to_positive (res, xa2, res, mask);
36d387f2
UB
18749
18750 emit_label (label);
18751 LABEL_NUSES (label) = 1;
18752
18753 emit_move_insn (operand0, res);
18754}
18755
2bf6d935
ML
18756/* Expand SSE sequence for computing round
18757 from OP1 storing into OP0 using sse4 round insn. */
18758void
18759ix86_expand_round_sse4 (rtx op0, rtx op1)
18760{
18761 machine_mode mode = GET_MODE (op0);
18762 rtx e1, e2, res, half;
18763 const struct real_format *fmt;
18764 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
18765 rtx (*gen_copysign) (rtx, rtx, rtx);
18766 rtx (*gen_round) (rtx, rtx, rtx);
18767
18768 switch (mode)
18769 {
18770 case E_SFmode:
18771 gen_copysign = gen_copysignsf3;
18772 gen_round = gen_sse4_1_roundsf2;
18773 break;
18774 case E_DFmode:
18775 gen_copysign = gen_copysigndf3;
18776 gen_round = gen_sse4_1_rounddf2;
18777 break;
18778 default:
18779 gcc_unreachable ();
18780 }
18781
18782 /* round (a) = trunc (a + copysign (0.5, a)) */
18783
18784 /* load nextafter (0.5, 0.0) */
18785 fmt = REAL_MODE_FORMAT (mode);
18786 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
18787 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
18788 half = const_double_from_real_value (pred_half, mode);
18789
18790 /* e1 = copysign (0.5, op1) */
18791 e1 = gen_reg_rtx (mode);
18792 emit_insn (gen_copysign (e1, half, op1));
18793
18794 /* e2 = op1 + e1 */
18795 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
18796
18797 /* res = trunc (e2) */
18798 res = gen_reg_rtx (mode);
18799 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
18800
18801 emit_move_insn (op0, res);
18802}
18803
18804/* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
18805 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
18806 insn every time. */
18807
18808static GTY(()) rtx_insn *vselect_insn;
18809
18810/* Initialize vselect_insn. */
18811
18812static void
18813init_vselect_insn (void)
18814{
18815 unsigned i;
18816 rtx x;
18817
18818 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
18819 for (i = 0; i < MAX_VECT_LEN; ++i)
18820 XVECEXP (x, 0, i) = const0_rtx;
18821 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
18822 const0_rtx), x);
18823 x = gen_rtx_SET (const0_rtx, x);
18824 start_sequence ();
18825 vselect_insn = emit_insn (x);
18826 end_sequence ();
18827}
18828
18829/* Construct (set target (vec_select op0 (parallel perm))) and
18830 return true if that's a valid instruction in the active ISA. */
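/* For example (an illustrative addition): with a V4SFmode TARGET, OP0
   and PERM = { 2, 3, 0, 1 } this forms
     (set target (vec_select:V4SF op0 (parallel [2 3 0 1])))
   and succeeds when recog finds a matching single-insn pattern such as
   shufps.  */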
18831
18832static bool
18833expand_vselect (rtx target, rtx op0, const unsigned char *perm,
18834 unsigned nelt, bool testing_p)
18835{
18836 unsigned int i;
18837 rtx x, save_vconcat;
18838 int icode;
18839
18840 if (vselect_insn == NULL_RTX)
18841 init_vselect_insn ();
18842
18843 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
18844 PUT_NUM_ELEM (XVEC (x, 0), nelt);
18845 for (i = 0; i < nelt; ++i)
18846 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
18847 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
18848 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
18849 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
18850 SET_DEST (PATTERN (vselect_insn)) = target;
18851 icode = recog_memoized (vselect_insn);
18852
18853 if (icode >= 0 && !testing_p)
18854 emit_insn (copy_rtx (PATTERN (vselect_insn)));
18855
18856 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
18857 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
18858 INSN_CODE (vselect_insn) = -1;
18859
18860 return icode >= 0;
18861}
18862
18863/* Similar, but generate a vec_concat from op0 and op1 as well. */
18864
18865static bool
18866expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
18867 const unsigned char *perm, unsigned nelt,
18868 bool testing_p)
18869{
18870 machine_mode v2mode;
18871 rtx x;
18872 bool ok;
18873
18874 if (vselect_insn == NULL_RTX)
18875 init_vselect_insn ();
18876
18877 if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
18878 return false;
18879 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
18880 PUT_MODE (x, v2mode);
18881 XEXP (x, 0) = op0;
18882 XEXP (x, 1) = op1;
18883 ok = expand_vselect (target, x, perm, nelt, testing_p);
18884 XEXP (x, 0) = const0_rtx;
18885 XEXP (x, 1) = const0_rtx;
18886 return ok;
18887}
18888
4bf4c103 18889/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
2bf6d935
ML
18890 using movss or movsd. */
18891static bool
18892expand_vec_perm_movs (struct expand_vec_perm_d *d)
18893{
18894 machine_mode vmode = d->vmode;
18895 unsigned i, nelt = d->nelt;
18896 rtx x;
18897
18898 if (d->one_operand_p)
18899 return false;
18900
18901 if (!(TARGET_SSE && vmode == V4SFmode)
febb58d2 18902 && !(TARGET_SSE && vmode == V4SImode)
240198fe 18903 && !(TARGET_MMX_WITH_SSE && vmode == V2SFmode)
febb58d2
RS
18904 && !(TARGET_SSE2 && vmode == V2DFmode)
18905 && !(TARGET_SSE2 && vmode == V2DImode))
2bf6d935
ML
18906 return false;
18907
18908 /* Only the first element is changed. */
18909 if (d->perm[0] != nelt && d->perm[0] != 0)
18910 return false;
18911 for (i = 1; i < nelt; ++i)
18912 if (d->perm[i] != i + nelt - d->perm[0])
18913 return false;
18914
18915 if (d->testing_p)
18916 return true;
18917
18918 if (d->perm[0] == nelt)
18919 x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
18920 else
18921 x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));
18922
18923 emit_insn (gen_rtx_SET (d->target, x));
18924
18925 return true;
18926}
18927
4bf4c103 18928/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
2bf6d935
ML
18929 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
18930
18931static bool
18932expand_vec_perm_blend (struct expand_vec_perm_d *d)
18933{
18934 machine_mode mmode, vmode = d->vmode;
fa2987ed
JJ
18935 unsigned i, nelt = d->nelt;
18936 unsigned HOST_WIDE_INT mask;
2bf6d935
ML
18937 rtx target, op0, op1, maskop, x;
18938 rtx rperm[32], vperm;
18939
18940 if (d->one_operand_p)
18941 return false;
18942 if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
18943 && (TARGET_AVX512BW
18944 || GET_MODE_UNIT_SIZE (vmode) >= 4))
18945 ;
18946 else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
18947 ;
18948 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
18949 ;
a325bdd1 18950 else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 16
be8749f9
UB
18951 || GET_MODE_SIZE (vmode) == 8
18952 || GET_MODE_SIZE (vmode) == 4))
2bf6d935
ML
18953 ;
18954 else
18955 return false;
18956
18957 /* This is a blend, not a permute. Elements must stay in their
18958 respective lanes. */
18959 for (i = 0; i < nelt; ++i)
18960 {
18961 unsigned e = d->perm[i];
18962 if (!(e == i || e == i + nelt))
18963 return false;
18964 }
18965
18966 if (d->testing_p)
18967 return true;
18968
18969 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
18970 decision should be extracted elsewhere, so that we only try that
18971 sequence once all budget==3 options have been tried. */
18972 target = d->target;
18973 op0 = d->op0;
18974 op1 = d->op1;
18975 mask = 0;
18976
18977 switch (vmode)
18978 {
18979 case E_V8DFmode:
18980 case E_V16SFmode:
18981 case E_V4DFmode:
18982 case E_V8SFmode:
18983 case E_V2DFmode:
18984 case E_V4SFmode:
a325bdd1 18985 case E_V4HImode:
2bf6d935
ML
18986 case E_V8HImode:
18987 case E_V8SImode:
18988 case E_V32HImode:
18989 case E_V64QImode:
18990 case E_V16SImode:
18991 case E_V8DImode:
18992 for (i = 0; i < nelt; ++i)
fa2987ed 18993 mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
2bf6d935
ML
18994 break;
18995
18996 case E_V2DImode:
18997 for (i = 0; i < 2; ++i)
18998 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
18999 vmode = V8HImode;
19000 goto do_subreg;
19001
a325bdd1
PB
19002 case E_V2SImode:
19003 for (i = 0; i < 2; ++i)
19004 mask |= (d->perm[i] >= 2 ? 3 : 0) << (i * 2);
19005 vmode = V4HImode;
19006 goto do_subreg;
19007
2bf6d935
ML
19008 case E_V4SImode:
19009 for (i = 0; i < 4; ++i)
19010 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
19011 vmode = V8HImode;
19012 goto do_subreg;
19013
19014 case E_V16QImode:
19015 /* See if bytes move in pairs so we can use pblendw with
19016 an immediate argument, rather than pblendvb with a vector
19017 argument. */
19018 for (i = 0; i < 16; i += 2)
19019 if (d->perm[i] + 1 != d->perm[i + 1])
19020 {
19021 use_pblendvb:
19022 for (i = 0; i < nelt; ++i)
19023 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
19024
19025 finish_pblendvb:
19026 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
19027 vperm = force_reg (vmode, vperm);
19028
be8749f9 19029 if (GET_MODE_SIZE (vmode) == 4)
820ac79e 19030 emit_insn (gen_mmx_pblendvb_v4qi (target, op0, op1, vperm));
be8749f9 19031 else if (GET_MODE_SIZE (vmode) == 8)
820ac79e 19032 emit_insn (gen_mmx_pblendvb_v8qi (target, op0, op1, vperm));
a325bdd1 19033 else if (GET_MODE_SIZE (vmode) == 16)
2bf6d935
ML
19034 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
19035 else
19036 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
19037 if (target != d->target)
19038 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
19039 return true;
19040 }
19041
19042 for (i = 0; i < 8; ++i)
19043 mask |= (d->perm[i * 2] >= 16) << i;
19044 vmode = V8HImode;
19045 /* FALLTHRU */
19046
19047 do_subreg:
19048 target = gen_reg_rtx (vmode);
19049 op0 = gen_lowpart (vmode, op0);
19050 op1 = gen_lowpart (vmode, op1);
19051 break;
19052
a325bdd1
PB
19053 case E_V8QImode:
19054 for (i = 0; i < 8; i += 2)
19055 if (d->perm[i] + 1 != d->perm[i + 1])
19056 goto use_pblendvb;
19057
19058 for (i = 0; i < 4; ++i)
19059 mask |= (d->perm[i * 2] >= 8) << i;
19060 vmode = V4HImode;
19061 goto do_subreg;
19062
be8749f9
UB
19063 case E_V4QImode:
19064 for (i = 0; i < 4; i += 2)
19065 if (d->perm[i] + 1 != d->perm[i + 1])
19066 goto use_pblendvb;
19067
19068 for (i = 0; i < 2; ++i)
19069 mask |= (d->perm[i * 2] >= 4) << i;
19070 vmode = V2HImode;
19071 goto do_subreg;
19072
2bf6d935
ML
19073 case E_V32QImode:
19074 /* See if bytes move in pairs. If not, vpblendvb must be used. */
19075 for (i = 0; i < 32; i += 2)
19076 if (d->perm[i] + 1 != d->perm[i + 1])
19077 goto use_pblendvb;
19078 /* See if bytes move in quadruplets. If yes, vpblendd
19079 with immediate can be used. */
19080 for (i = 0; i < 32; i += 4)
19081 if (d->perm[i] + 2 != d->perm[i + 2])
19082 break;
19083 if (i < 32)
19084 {
19085 /* See if bytes move the same in both lanes. If yes,
19086 vpblendw with immediate can be used. */
19087 for (i = 0; i < 16; i += 2)
19088 if (d->perm[i] + 16 != d->perm[i + 16])
19089 goto use_pblendvb;
19090
19091 /* Use vpblendw. */
19092 for (i = 0; i < 16; ++i)
19093 mask |= (d->perm[i * 2] >= 32) << i;
19094 vmode = V16HImode;
19095 goto do_subreg;
19096 }
19097
19098 /* Use vpblendd. */
19099 for (i = 0; i < 8; ++i)
19100 mask |= (d->perm[i * 4] >= 32) << i;
19101 vmode = V8SImode;
19102 goto do_subreg;
19103
19104 case E_V16HImode:
19105 /* See if words move in pairs. If yes, vpblendd can be used. */
19106 for (i = 0; i < 16; i += 2)
19107 if (d->perm[i] + 1 != d->perm[i + 1])
19108 break;
19109 if (i < 16)
19110 {
19111 /* See if words move the same in both lanes. If not,
19112 vpblendvb must be used. */
19113 for (i = 0; i < 8; i++)
19114 if (d->perm[i] + 8 != d->perm[i + 8])
19115 {
19116 /* Use vpblendvb. */
19117 for (i = 0; i < 32; ++i)
19118 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
19119
19120 vmode = V32QImode;
19121 nelt = 32;
19122 target = gen_reg_rtx (vmode);
19123 op0 = gen_lowpart (vmode, op0);
19124 op1 = gen_lowpart (vmode, op1);
19125 goto finish_pblendvb;
19126 }
19127
19128 /* Use vpblendw. */
19129 for (i = 0; i < 16; ++i)
19130 mask |= (d->perm[i] >= 16) << i;
19131 break;
19132 }
19133
19134 /* Use vpblendd. */
19135 for (i = 0; i < 8; ++i)
19136 mask |= (d->perm[i * 2] >= 16) << i;
19137 vmode = V8SImode;
19138 goto do_subreg;
19139
19140 case E_V4DImode:
19141 /* Use vpblendd. */
19142 for (i = 0; i < 4; ++i)
19143 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
19144 vmode = V8SImode;
19145 goto do_subreg;
19146
19147 default:
19148 gcc_unreachable ();
19149 }
19150
19151 switch (vmode)
19152 {
19153 case E_V8DFmode:
19154 case E_V8DImode:
19155 mmode = QImode;
19156 break;
19157 case E_V16SFmode:
19158 case E_V16SImode:
19159 mmode = HImode;
19160 break;
19161 case E_V32HImode:
19162 mmode = SImode;
19163 break;
19164 case E_V64QImode:
19165 mmode = DImode;
19166 break;
19167 default:
19168 mmode = VOIDmode;
19169 }
19170
19171 if (mmode != VOIDmode)
19172 maskop = force_reg (mmode, gen_int_mode (mask, mmode));
19173 else
19174 maskop = GEN_INT (mask);
19175
19176 /* This matches five different patterns with the different modes. */
19177 x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
19178 x = gen_rtx_SET (target, x);
19179 emit_insn (x);
19180 if (target != d->target)
19181 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
19182
19183 return true;
19184}
19185
4bf4c103 19186/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
2bf6d935
ML
19187 in terms of the variable form of vpermilps.
19188
19189 Note that we will have already failed the immediate input vpermilps,
19190 which requires that the high and low part shuffle be identical; the
19191 variable form doesn't require that. */
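/* For example (an illustrative addition): the one-operand V8SFmode
   permutation { 1, 0, 2, 3, 7, 6, 5, 4 } stays within each 128-bit lane
   but uses a different pattern per lane, so it is emitted as a variable
   vpermilps with the index vector { 1, 0, 2, 3, 3, 2, 1, 0 }.  */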
19192
19193static bool
19194expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
19195{
19196 rtx rperm[8], vperm;
19197 unsigned i;
19198
19199 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
19200 return false;
19201
19202 /* We can only permute within the 128-bit lane. */
19203 for (i = 0; i < 8; ++i)
19204 {
19205 unsigned e = d->perm[i];
19206 if (i < 4 ? e >= 4 : e < 4)
19207 return false;
19208 }
19209
19210 if (d->testing_p)
19211 return true;
19212
19213 for (i = 0; i < 8; ++i)
19214 {
19215 unsigned e = d->perm[i];
19216
19217 /* Within each 128-bit lane, the elements of op0 are numbered
19218 from 0 and the elements of op1 are numbered from 4. */
19219 if (e >= 8 + 4)
19220 e -= 8;
19221 else if (e >= 4)
19222 e -= 4;
19223
19224 rperm[i] = GEN_INT (e);
19225 }
19226
19227 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
19228 vperm = force_reg (V8SImode, vperm);
19229 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
19230
19231 return true;
19232}
19233
1fa991d1
UB
19234/* For V*[QHS]Imode permutations, check whether the same permutation
19235 can be performed in a 2x, 4x or 8x wider inner mode. */
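/* For example (an illustrative addition): the V8HImode permutation
   { 0, 1, 6, 7, 4, 5, 2, 3 } moves elements in even/odd pairs and is
   rewritten as the V4SImode permutation { 0, 3, 2, 1 }, which a single
   pshufd can handle.  */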
19236
19237static bool
19238canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
19239 struct expand_vec_perm_d *nd)
19240{
19241 int i;
19242 machine_mode mode = VOIDmode;
19243
19244 switch (d->vmode)
19245 {
19246 case E_V8QImode: mode = V4HImode; break;
19247 case E_V16QImode: mode = V8HImode; break;
19248 case E_V32QImode: mode = V16HImode; break;
19249 case E_V64QImode: mode = V32HImode; break;
19250 case E_V4HImode: mode = V2SImode; break;
19251 case E_V8HImode: mode = V4SImode; break;
19252 case E_V16HImode: mode = V8SImode; break;
19253 case E_V32HImode: mode = V16SImode; break;
19254 case E_V4SImode: mode = V2DImode; break;
19255 case E_V8SImode: mode = V4DImode; break;
19256 case E_V16SImode: mode = V8DImode; break;
19257 default: return false;
19258 }
19259 for (i = 0; i < d->nelt; i += 2)
19260 if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
19261 return false;
19262 nd->vmode = mode;
19263 nd->nelt = d->nelt / 2;
19264 for (i = 0; i < nd->nelt; i++)
19265 nd->perm[i] = d->perm[2 * i] / 2;
19266 if (GET_MODE_INNER (mode) != DImode)
19267 canonicalize_vector_int_perm (nd, nd);
19268 if (nd != d)
19269 {
19270 nd->one_operand_p = d->one_operand_p;
19271 nd->testing_p = d->testing_p;
19272 if (d->op0 == d->op1)
19273 nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
19274 else
19275 {
19276 nd->op0 = gen_lowpart (nd->vmode, d->op0);
19277 nd->op1 = gen_lowpart (nd->vmode, d->op1);
19278 }
19279 if (d->testing_p)
19280 nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
19281 else
19282 nd->target = gen_reg_rtx (nd->vmode);
19283 }
19284 return true;
19285}
19286
2bf6d935
ML
19287/* Return true if permutation D can be performed as VMODE permutation
19288 instead. */
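/* For example (an illustrative addition): the V16QImode permutation
   { 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 } moves whole
   8-byte chunks, so it is equally valid as the V2DImode permutation
   { 1, 0 }.  */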
19289
19290static bool
19291valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
19292{
19293 unsigned int i, j, chunk;
19294
19295 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
19296 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
19297 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
19298 return false;
19299
19300 if (GET_MODE_NUNITS (vmode) >= d->nelt)
19301 return true;
19302
19303 chunk = d->nelt / GET_MODE_NUNITS (vmode);
19304 for (i = 0; i < d->nelt; i += chunk)
19305 if (d->perm[i] & (chunk - 1))
19306 return false;
19307 else
19308 for (j = 1; j < chunk; ++j)
19309 if (d->perm[i] + j != d->perm[i + j])
19310 return false;
19311
19312 return true;
19313}
19314
4bf4c103 19315/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
2bf6d935
ML
19316 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
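/* Worked example (an illustrative addition): a one-operand V4SImode
   permutation { 3, 1, 2, 0 } can be lowered to a V16QImode pshufb whose
   control vector expands each dword index into four byte indices:
   { 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3 }.  */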
19317
19318static bool
19319expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
19320{
19321 unsigned i, nelt, eltsz, mask;
19322 unsigned char perm[64];
877c9e33 19323 machine_mode vmode;
1fa991d1 19324 struct expand_vec_perm_d nd;
2bf6d935
ML
19325 rtx rperm[64], vperm, target, op0, op1;
19326
19327 nelt = d->nelt;
19328
19329 if (!d->one_operand_p)
be8749f9
UB
19330 switch (GET_MODE_SIZE (d->vmode))
19331 {
19332 case 4:
19333 if (!TARGET_XOP)
19334 return false;
19335 vmode = V4QImode;
19336 break;
37e93925 19337
be8749f9
UB
19338 case 8:
19339 if (!TARGET_XOP)
19340 return false;
19341 vmode = V8QImode;
19342 break;
2bf6d935 19343
be8749f9
UB
19344 case 16:
19345 if (!TARGET_XOP)
2bf6d935 19346 return false;
877c9e33 19347 vmode = V16QImode;
be8749f9
UB
19348 break;
19349
19350 case 32:
19351 if (!TARGET_AVX2)
19352 return false;
19353
19354 if (valid_perm_using_mode_p (V2TImode, d))
19355 {
19356 if (d->testing_p)
19357 return true;
19358
19359 /* Use vperm2i128 insn. The pattern uses
19360 V4DImode instead of V2TImode. */
19361 target = d->target;
19362 if (d->vmode != V4DImode)
19363 target = gen_reg_rtx (V4DImode);
19364 op0 = gen_lowpart (V4DImode, d->op0);
19365 op1 = gen_lowpart (V4DImode, d->op1);
19366 rperm[0]
19367 = GEN_INT ((d->perm[0] / (nelt / 2))
19368 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
19369 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
19370 if (target != d->target)
19371 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
19372 return true;
19373 }
19374 /* FALLTHRU */
19375
19376 default:
37e93925 19377 return false;
be8749f9 19378 }
2bf6d935 19379 else
be8749f9
UB
19380 switch (GET_MODE_SIZE (d->vmode))
19381 {
19382 case 4:
19383 if (!TARGET_SSSE3)
19384 return false;
19385 vmode = V4QImode;
19386 break;
2bf6d935 19387
be8749f9
UB
19388 case 8:
19389 if (!TARGET_SSSE3)
19390 return false;
19391 vmode = V8QImode;
19392 break;
2bf6d935 19393
be8749f9
UB
19394 case 16:
19395 if (!TARGET_SSSE3)
19396 return false;
877c9e33 19397 vmode = V16QImode;
be8749f9
UB
19398 break;
19399
19400 case 32:
19401 if (!TARGET_AVX2)
19402 return false;
19403
19404 /* V4DImode should be already handled through
19405 expand_vselect by vpermq instruction. */
19406 gcc_assert (d->vmode != V4DImode);
19407
19408 vmode = V32QImode;
19409 if (d->vmode == V8SImode
19410 || d->vmode == V16HImode
19411 || d->vmode == V32QImode)
19412 {
19413 /* First see if vpermq can be used for
19414 V8SImode/V16HImode/V32QImode. */
19415 if (valid_perm_using_mode_p (V4DImode, d))
19416 {
19417 for (i = 0; i < 4; i++)
19418 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
19419 if (d->testing_p)
19420 return true;
19421 target = gen_reg_rtx (V4DImode);
19422 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
19423 perm, 4, false))
19424 {
19425 emit_move_insn (d->target,
19426 gen_lowpart (d->vmode, target));
2bf6d935 19427 return true;
be8749f9
UB
19428 }
19429 return false;
19430 }
2bf6d935 19431
be8749f9
UB
19432 /* Next see if vpermd can be used. */
19433 if (valid_perm_using_mode_p (V8SImode, d))
19434 vmode = V8SImode;
19435 }
19436 /* Or if vpermps can be used. */
19437 else if (d->vmode == V8SFmode)
19438 vmode = V8SImode;
2bf6d935 19439
be8749f9
UB
19440 if (vmode == V32QImode)
19441 {
19442 /* vpshufb only shuffles within each 128-bit lane; it is not
19443 possible to move bytes in between the lanes. */
19444 for (i = 0; i < nelt; ++i)
19445 if ((d->perm[i] ^ i) & (nelt / 2))
19446 return false;
19447 }
19448 break;
2bf6d935 19449
be8749f9
UB
19450 case 64:
19451 if (!TARGET_AVX512BW)
19452 return false;
2bf6d935 19453
be8749f9
UB
19454 /* If vpermq didn't work, vpshufb won't work either. */
19455 if (d->vmode == V8DFmode || d->vmode == V8DImode)
19456 return false;
19457
19458 vmode = V64QImode;
19459 if (d->vmode == V16SImode
19460 || d->vmode == V32HImode
19461 || d->vmode == V64QImode)
19462 {
19463 /* First see if vpermq can be used for
19464 V16SImode/V32HImode/V64QImode. */
19465 if (valid_perm_using_mode_p (V8DImode, d))
19466 {
19467 for (i = 0; i < 8; i++)
19468 perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
19469 if (d->testing_p)
19470 return true;
19471 target = gen_reg_rtx (V8DImode);
19472 if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
19473 perm, 8, false))
19474 {
19475 emit_move_insn (d->target,
19476 gen_lowpart (d->vmode, target));
2bf6d935 19477 return true;
be8749f9
UB
19478 }
19479 return false;
19480 }
2bf6d935 19481
be8749f9
UB
19482 /* Next see if vpermd can be used. */
19483 if (valid_perm_using_mode_p (V16SImode, d))
19484 vmode = V16SImode;
19485 }
19486 /* Or if vpermps can be used. */
19487 else if (d->vmode == V16SFmode)
19488 vmode = V16SImode;
877c9e33 19489
be8749f9
UB
19490 if (vmode == V64QImode)
19491 {
19492 /* vpshufb only shuffles within each 128-bit lane; it is not
19493 possible to move bytes in between the lanes. */
19494 for (i = 0; i < nelt; ++i)
19495 if ((d->perm[i] ^ i) & (3 * nelt / 4))
19496 return false;
19497 }
19498 break;
19499
19500 default:
2bf6d935 19501 return false;
be8749f9 19502 }
2bf6d935
ML
19503
19504 if (d->testing_p)
19505 return true;
19506
681143b9
UB
19507 /* Try to avoid variable permutation instruction. */
19508 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
19509 {
19510 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
19511 return true;
19512 }
19513
2bf6d935
ML
19514 if (vmode == V8SImode)
19515 for (i = 0; i < 8; ++i)
19516 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
19517 else if (vmode == V16SImode)
19518 for (i = 0; i < 16; ++i)
19519 rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
19520 else
19521 {
19522 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
19523 if (!d->one_operand_p)
19524 mask = 2 * nelt - 1;
2bf6d935
ML
19525 else if (vmode == V64QImode)
19526 mask = nelt / 4 - 1;
a325bdd1 19527 else if (vmode == V32QImode)
2bf6d935 19528 mask = nelt / 2 - 1;
a325bdd1
PB
19529 else
19530 mask = nelt - 1;
2bf6d935
ML
19531
19532 for (i = 0; i < nelt; ++i)
19533 {
19534 unsigned j, e = d->perm[i] & mask;
19535 for (j = 0; j < eltsz; ++j)
19536 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
19537 }
19538 }
19539
a325bdd1
PB
19540 machine_mode vpmode = vmode;
19541
877c9e33
UB
19542 nelt = GET_MODE_SIZE (vmode);
19543
19544 /* Emulate narrow modes with V16QI instructions. */
19545 if (nelt < 16)
a325bdd1 19546 {
dd835ec2
UB
19547 rtx m128 = GEN_INT (-128);
19548
37e93925 19549 /* Remap elements from the second operand, as we have to
be8749f9 19550 account for inactive top elements from the first operand. */
37e93925 19551 if (!d->one_operand_p)
be8749f9 19552 {
be8749f9
UB
19553 for (i = 0; i < nelt; ++i)
19554 {
877c9e33
UB
19555 unsigned ival = UINTVAL (rperm[i]);
19556 if (ival >= nelt)
19557 rperm[i] = GEN_INT (ival + 16 - nelt);
be8749f9
UB
19558 }
19559 }
37e93925 19560
877c9e33 19561 /* Fill inactive elements in the top positions with zeros. */
a325bdd1 19562 for (i = nelt; i < 16; ++i)
dd835ec2 19563 rperm[i] = m128;
37e93925 19564
a325bdd1
PB
19565 vpmode = V16QImode;
19566 }
19567
19568 vperm = gen_rtx_CONST_VECTOR (vpmode,
19569 gen_rtvec_v (GET_MODE_NUNITS (vpmode), rperm));
19570 vperm = force_reg (vpmode, vperm);
2bf6d935 19571
37e93925
UB
19572 if (vmode == d->vmode)
19573 target = d->target;
19574 else
2bf6d935 19575 target = gen_reg_rtx (vmode);
37e93925 19576
2bf6d935 19577 op0 = gen_lowpart (vmode, d->op0);
37e93925 19578
2bf6d935
ML
19579 if (d->one_operand_p)
19580 {
37e93925
UB
19581 rtx (*gen) (rtx, rtx, rtx);
19582
be8749f9
UB
19583 if (vmode == V4QImode)
19584 gen = gen_mmx_pshufbv4qi3;
19585 else if (vmode == V8QImode)
37e93925 19586 gen = gen_mmx_pshufbv8qi3;
a325bdd1 19587 else if (vmode == V16QImode)
37e93925 19588 gen = gen_ssse3_pshufbv16qi3;
2bf6d935 19589 else if (vmode == V32QImode)
37e93925 19590 gen = gen_avx2_pshufbv32qi3;
2bf6d935 19591 else if (vmode == V64QImode)
37e93925 19592 gen = gen_avx512bw_pshufbv64qi3;
2bf6d935 19593 else if (vmode == V8SFmode)
37e93925 19594 gen = gen_avx2_permvarv8sf;
2bf6d935 19595 else if (vmode == V8SImode)
37e93925 19596 gen = gen_avx2_permvarv8si;
2bf6d935 19597 else if (vmode == V16SFmode)
37e93925 19598 gen = gen_avx512f_permvarv16sf;
2bf6d935 19599 else if (vmode == V16SImode)
37e93925 19600 gen = gen_avx512f_permvarv16si;
2bf6d935
ML
19601 else
19602 gcc_unreachable ();
37e93925
UB
19603
19604 emit_insn (gen (target, op0, vperm));
2bf6d935
ML
19605 }
19606 else
19607 {
37e93925
UB
19608 rtx (*gen) (rtx, rtx, rtx, rtx);
19609
2bf6d935 19610 op1 = gen_lowpart (vmode, d->op1);
37e93925 19611
be8749f9
UB
19612 if (vmode == V4QImode)
19613 gen = gen_mmx_ppermv32;
19614 else if (vmode == V8QImode)
37e93925
UB
19615 gen = gen_mmx_ppermv64;
19616 else if (vmode == V16QImode)
19617 gen = gen_xop_pperm;
19618 else
19619 gcc_unreachable ();
19620
19621 emit_insn (gen (target, op0, op1, vperm));
2bf6d935 19622 }
37e93925 19623
2bf6d935
ML
19624 if (target != d->target)
19625 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
19626
19627 return true;
19628}
19629
2bf6d935
ML
19630/* Try to expand one-operand permutation with constant mask. */
19631
19632static bool
19633ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
19634{
19635 machine_mode mode = GET_MODE (d->op0);
19636 machine_mode maskmode = mode;
faf2b6bc 19637 unsigned inner_size = GET_MODE_SIZE (GET_MODE_INNER (mode));
2bf6d935
ML
19638 rtx (*gen) (rtx, rtx, rtx) = NULL;
19639 rtx target, op0, mask;
19640 rtx vec[64];
19641
19642 if (!rtx_equal_p (d->op0, d->op1))
19643 return false;
19644
19645 if (!TARGET_AVX512F)
19646 return false;
19647
faf2b6bc 19648 /* Accept VNxHImode and VNxQImode now. */
19649 if (!TARGET_AVX512VL && GET_MODE_SIZE (mode) < 64)
19650 return false;
19651
19652 /* vpermw. */
19653 if (!TARGET_AVX512BW && inner_size == 2)
19654 return false;
19655
19656 /* vpermb. */
19657 if (!TARGET_AVX512VBMI && inner_size == 1)
19658 return false;
19659
2bf6d935
ML
19660 switch (mode)
19661 {
19662 case E_V16SImode:
19663 gen = gen_avx512f_permvarv16si;
19664 break;
19665 case E_V16SFmode:
19666 gen = gen_avx512f_permvarv16sf;
19667 maskmode = V16SImode;
19668 break;
19669 case E_V8DImode:
19670 gen = gen_avx512f_permvarv8di;
19671 break;
19672 case E_V8DFmode:
19673 gen = gen_avx512f_permvarv8df;
19674 maskmode = V8DImode;
19675 break;
faf2b6bc 19676 case E_V32HImode:
19677 gen = gen_avx512bw_permvarv32hi;
19678 break;
19679 case E_V16HImode:
19680 gen = gen_avx512vl_permvarv16hi;
19681 break;
19682 case E_V8HImode:
19683 gen = gen_avx512vl_permvarv8hi;
19684 break;
19685 case E_V64QImode:
19686 gen = gen_avx512bw_permvarv64qi;
19687 break;
19688 case E_V32QImode:
19689 gen = gen_avx512vl_permvarv32qi;
19690 break;
19691 case E_V16QImode:
19692 gen = gen_avx512vl_permvarv16qi;
19693 break;
19694
2bf6d935
ML
19695 default:
19696 return false;
19697 }
19698
04b4f315
JJ
19699 if (d->testing_p)
19700 return true;
19701
2bf6d935
ML
19702 target = d->target;
19703 op0 = d->op0;
19704 for (int i = 0; i < d->nelt; ++i)
19705 vec[i] = GEN_INT (d->perm[i]);
19706 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
19707 emit_insn (gen (target, op0, force_reg (maskmode, mask)));
19708 return true;
19709}
19710
19711static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
19712
4bf4c103 19713/* A subroutine of ix86_expand_vec_perm_const_1. Try to instantiate D
2bf6d935
ML
19714 in a single instruction. */
19715
19716static bool
19717expand_vec_perm_1 (struct expand_vec_perm_d *d)
19718{
19719 unsigned i, nelt = d->nelt;
19720 struct expand_vec_perm_d nd;
19721
19722 /* Check plain VEC_SELECT first, because AVX has instructions that could
19723 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
19724 input where SEL+CONCAT may not. */
19725 if (d->one_operand_p)
19726 {
19727 int mask = nelt - 1;
19728 bool identity_perm = true;
19729 bool broadcast_perm = true;
19730
19731 for (i = 0; i < nelt; i++)
19732 {
19733 nd.perm[i] = d->perm[i] & mask;
19734 if (nd.perm[i] != i)
19735 identity_perm = false;
19736 if (nd.perm[i])
19737 broadcast_perm = false;
19738 }
19739
19740 if (identity_perm)
19741 {
19742 if (!d->testing_p)
19743 emit_move_insn (d->target, d->op0);
19744 return true;
19745 }
19746 else if (broadcast_perm && TARGET_AVX2)
19747 {
19748 /* Use vpbroadcast{b,w,d}. */
19749 rtx (*gen) (rtx, rtx) = NULL;
19750 switch (d->vmode)
19751 {
19752 case E_V64QImode:
19753 if (TARGET_AVX512BW)
19754 gen = gen_avx512bw_vec_dupv64qi_1;
19755 break;
19756 case E_V32QImode:
19757 gen = gen_avx2_pbroadcastv32qi_1;
19758 break;
19759 case E_V32HImode:
19760 if (TARGET_AVX512BW)
19761 gen = gen_avx512bw_vec_dupv32hi_1;
19762 break;
19763 case E_V16HImode:
19764 gen = gen_avx2_pbroadcastv16hi_1;
19765 break;
19766 case E_V16SImode:
19767 if (TARGET_AVX512F)
19768 gen = gen_avx512f_vec_dupv16si_1;
19769 break;
19770 case E_V8SImode:
19771 gen = gen_avx2_pbroadcastv8si_1;
19772 break;
19773 case E_V16QImode:
19774 gen = gen_avx2_pbroadcastv16qi;
19775 break;
19776 case E_V8HImode:
19777 gen = gen_avx2_pbroadcastv8hi;
19778 break;
19779 case E_V16SFmode:
19780 if (TARGET_AVX512F)
19781 gen = gen_avx512f_vec_dupv16sf_1;
19782 break;
19783 case E_V8SFmode:
19784 gen = gen_avx2_vec_dupv8sf_1;
19785 break;
19786 case E_V8DFmode:
19787 if (TARGET_AVX512F)
19788 gen = gen_avx512f_vec_dupv8df_1;
19789 break;
19790 case E_V8DImode:
19791 if (TARGET_AVX512F)
19792 gen = gen_avx512f_vec_dupv8di_1;
19793 break;
19794 /* For other modes prefer other shuffles this function creates. */
19795 default: break;
19796 }
19797 if (gen != NULL)
19798 {
19799 if (!d->testing_p)
19800 emit_insn (gen (d->target, d->op0));
19801 return true;
19802 }
19803 }
19804
19805 if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
19806 return true;
19807
19808 /* There are plenty of patterns in sse.md that are written for
19809 SEL+CONCAT and are not replicated for a single op. Perhaps
19810 that should be changed, to avoid the nastiness here. */
19811
19812 /* Recognize interleave style patterns, which means incrementing
19813 every other permutation operand. */
19814 for (i = 0; i < nelt; i += 2)
19815 {
19816 nd.perm[i] = d->perm[i] & mask;
19817 nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
19818 }
19819 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
19820 d->testing_p))
19821 return true;
19822
19823 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
19824 if (nelt >= 4)
19825 {
19826 for (i = 0; i < nelt; i += 4)
19827 {
19828 nd.perm[i + 0] = d->perm[i + 0] & mask;
19829 nd.perm[i + 1] = d->perm[i + 1] & mask;
19830 nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
19831 nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
19832 }
19833
19834 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
19835 d->testing_p))
19836 return true;
19837 }
19838 }
19839
19840 /* Try movss/movsd instructions. */
19841 if (expand_vec_perm_movs (d))
19842 return true;
19843
19844 /* Finally, try the fully general two operand permute. */
19845 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
19846 d->testing_p))
19847 return true;
19848
19849 /* Recognize interleave style patterns with reversed operands. */
19850 if (!d->one_operand_p)
19851 {
19852 for (i = 0; i < nelt; ++i)
19853 {
19854 unsigned e = d->perm[i];
19855 if (e >= nelt)
19856 e -= nelt;
19857 else
19858 e += nelt;
19859 nd.perm[i] = e;
19860 }
19861
19862 if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
19863 d->testing_p))
19864 return true;
19865 }
19866
19867 /* Try the SSE4.1 blend variable merge instructions. */
19868 if (expand_vec_perm_blend (d))
19869 return true;
19870
19871 /* Try one of the AVX vpermil variable permutations. */
19872 if (expand_vec_perm_vpermil (d))
19873 return true;
19874
19875 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
19876 vpshufb, vpermd, vpermps or vpermq variable permutation. */
19877 if (expand_vec_perm_pshufb (d))
19878 return true;
19879
19880 /* Try the AVX2 vpalignr instruction. */
19881 if (expand_vec_perm_palignr (d, true))
19882 return true;
19883
faf2b6bc 19884 /* Try the AVX512F vperm{w,b,s,d} instructions. */
2bf6d935
ML
19885 if (ix86_expand_vec_one_operand_perm_avx512 (d))
19886 return true;
19887
19888 /* Try the AVX512F vpermt2/vpermi2 instructions. */
19889 if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
19890 return true;
19891
19892 /* See if we can get the same permutation in different vector integer
19893 mode. */
19894 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
19895 {
19896 if (!d->testing_p)
19897 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
19898 return true;
19899 }
19900 return false;
19901}
19902
1442e203 19903/* Canonicalize vec_perm index to make the first index
19904 always comes from the first vector. */
19905static void
19906ix86_vec_perm_index_canon (struct expand_vec_perm_d *d)
19907{
19908 unsigned nelt = d->nelt;
19909 if (d->perm[0] < nelt)
19910 return;
19911
19912 for (unsigned i = 0; i != nelt; i++)
19913 d->perm[i] = (d->perm[i] + nelt) % (2 * nelt);
19914
19915 std::swap (d->op0, d->op1);
19916 return;
19917}
19918
3db8e9c2 19919/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
19920 in terms of a pair of shufps + shufps/pshufd instructions. */
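/* Worked example (an illustrative addition): for V4SImode with
   perm = { 0, 4, 1, 5 } the code builds perm1 = { 0, 1, 4, 5 } (a
   shufps of op0/op1 into TMP) and perm2 = { 0, 2, 1, 3 } (a pshufd of
   TMP), which together select { op0[0], op1[0], op0[1], op1[1] }.  */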
19921static bool
19922expand_vec_perm_shufps_shufps (struct expand_vec_perm_d *d)
19923{
19924 unsigned char perm1[4];
19925 machine_mode vmode = d->vmode;
19926 bool ok;
19927 unsigned i, j, k, count = 0;
19928
19929 if (d->one_operand_p
19930 || (vmode != V4SImode && vmode != V4SFmode))
19931 return false;
19932
19933 if (d->testing_p)
19934 return true;
19935
1442e203 19936 ix86_vec_perm_index_canon (d);
3db8e9c2 19937 for (i = 0; i < 4; ++i)
19938 count += d->perm[i] > 3 ? 1 : 0;
19939
19940 gcc_assert (count & 3);
19941
19942 rtx tmp = gen_reg_rtx (vmode);
19943 /* 2 from op0 and 2 from op1. */
19944 if (count == 2)
19945 {
19946 unsigned char perm2[4];
19947 for (i = 0, j = 0, k = 2; i < 4; ++i)
19948 if (d->perm[i] & 4)
19949 {
19950 perm1[k++] = d->perm[i];
19951 perm2[i] = k - 1;
19952 }
19953 else
19954 {
19955 perm1[j++] = d->perm[i];
19956 perm2[i] = j - 1;
19957 }
19958
19959 /* shufps. */
19960 ok = expand_vselect_vconcat (tmp, d->op0, d->op1,
19961 perm1, d->nelt, false);
19962 gcc_assert (ok);
19963 if (vmode == V4SImode && TARGET_SSE2)
19964 /* pshufd. */
19965 ok = expand_vselect (d->target, tmp,
19966 perm2, d->nelt, false);
19967 else
19968 {
19969 /* shufps. */
19970 perm2[2] += 4;
19971 perm2[3] += 4;
19972 ok = expand_vselect_vconcat (d->target, tmp, tmp,
19973 perm2, d->nelt, false);
19974 }
19975 gcc_assert (ok);
19976 }
19977 /* 3 from one op and 1 from another. */
19978 else
19979 {
19980 unsigned pair_idx = 8, lone_idx = 8, shift;
19981
19982 /* Find the lone index. */
19983 for (i = 0; i < 4; ++i)
19984 if ((d->perm[i] > 3 && count == 1)
19985 || (d->perm[i] < 4 && count == 3))
19986 lone_idx = i;
19987
19988 /* When lone_idx is not 0, the lone element must come from the second op (count == 1). */
19989 gcc_assert (count == (lone_idx ? 1 : 3));
19990
19991 /* Find the pair index that sits in the same half as the lone index. */
19992 shift = lone_idx & 2;
19993 pair_idx = 1 - lone_idx + 2 * shift;
19994
19995 /* First permute the lone index and the pair index into the same vector as
19996 [ lone, lone, pair, pair ]. */
19997 perm1[1] = perm1[0]
19998 = (count == 3) ? d->perm[lone_idx] : d->perm[lone_idx] - 4;
19999 perm1[3] = perm1[2]
20000 = (count == 3) ? d->perm[pair_idx] : d->perm[pair_idx] + 4;
20001
20002 /* Always put the vector that contains the lone index first. */
20003 if (count == 1)
20004 std::swap (d->op0, d->op1);
20005
20006 /* shufps. */
20007 ok = expand_vselect_vconcat (tmp, d->op0, d->op1,
20008 perm1, d->nelt, false);
20009 gcc_assert (ok);
20010
20011 /* Refine lone and pair index to original order. */
20012 perm1[shift] = lone_idx << 1;
20013 perm1[shift + 1] = pair_idx << 1;
20014
20015 /* Select the remaining 2 elements in another vector. */
20016 for (i = 2 - shift; i < 4 - shift; ++i)
20017 perm1[i] = lone_idx == 1 ? d->perm[i] + 4 : d->perm[i];
20018
20019 /* Adjust to original selector. */
20020 if (lone_idx > 1)
20021 std::swap (tmp, d->op1);
20022
20023 /* shufps. */
20024 ok = expand_vselect_vconcat (d->target, tmp, d->op1,
20025 perm1, d->nelt, false);
20026
20027 gcc_assert (ok);
20028 }
20029
20030 return true;
20031}
20032
4bf4c103 20033/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
2bf6d935
ML
20034 in terms of a pair of pshuflw + pshufhw instructions. */
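/* For example (an illustrative addition): the one-operand V8HImode
   permutation { 3, 2, 1, 0, 7, 6, 5, 4 } is handled by a pshuflw that
   reverses the low four words followed by a pshufhw that reverses the
   high four words.  */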
20035
20036static bool
20037expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
20038{
20039 unsigned char perm2[MAX_VECT_LEN];
20040 unsigned i;
20041 bool ok;
20042
20043 if (d->vmode != V8HImode || !d->one_operand_p)
20044 return false;
20045
20046 /* The two permutations only operate in 64-bit lanes. */
20047 for (i = 0; i < 4; ++i)
20048 if (d->perm[i] >= 4)
20049 return false;
20050 for (i = 4; i < 8; ++i)
20051 if (d->perm[i] < 4)
20052 return false;
20053
20054 if (d->testing_p)
20055 return true;
20056
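  /* Illustrative example: for d->perm = { 2, 0, 3, 1, 5, 7, 4, 6 } the
     pshuflw below uses selector { 2, 0, 3, 1, 4, 5, 6, 7 } (only the low
     quadword is shuffled) and the pshufhw then uses
     { 0, 1, 2, 3, 5, 7, 4, 6 } to shuffle the high quadword.  */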
20057 /* Emit the pshuflw. */
20058 memcpy (perm2, d->perm, 4);
20059 for (i = 4; i < 8; ++i)
20060 perm2[i] = i;
20061 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
20062 gcc_assert (ok);
20063
20064 /* Emit the pshufhw. */
20065 memcpy (perm2 + 4, d->perm + 4, 4);
20066 for (i = 0; i < 4; ++i)
20067 perm2[i] = i;
20068 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
20069 gcc_assert (ok);
20070
20071 return true;
20072}
20073
4bf4c103 20074/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
20075 the permutation using the SSSE3 palignr instruction. This succeeds
20076 when all of the elements in PERM fit within one vector and we merely
20077 need to shift them down so that a single vector permutation has a
20078     chance to succeed.  If SINGLE_INSN_ONLY_P, succeed only if
20079     the vpalignr instruction itself can perform the requested permutation.  */
20080
20081static bool
20082expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
20083{
20084 unsigned i, nelt = d->nelt;
20085 unsigned min, max, minswap, maxswap;
20086 bool in_order, ok, swap = false;
20087 rtx shift, target;
20088 struct expand_vec_perm_d dcopy;
20089
20090  /* Even with AVX, palignr only operates on 128-bit vectors;
20091     with AVX2, vpalignr operates within each of the two 128-bit lanes.  */
20092 if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
20093 && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
20094 return false;
20095
20096 min = 2 * nelt;
20097 max = 0;
20098 minswap = 2 * nelt;
20099 maxswap = 0;
20100 for (i = 0; i < nelt; ++i)
20101 {
20102 unsigned e = d->perm[i];
20103 unsigned eswap = d->perm[i] ^ nelt;
20104 if (GET_MODE_SIZE (d->vmode) == 32)
20105 {
20106 e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
20107 eswap = e ^ (nelt / 2);
20108 }
20109 if (e < min)
20110 min = e;
20111 if (e > max)
20112 max = e;
20113 if (eswap < minswap)
20114 minswap = eswap;
20115 if (eswap > maxswap)
20116 maxswap = eswap;
20117 }
20118 if (min == 0
20119 || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
20120 {
20121 if (d->one_operand_p
20122 || minswap == 0
20123 || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
20124 ? nelt / 2 : nelt))
20125 return false;
20126 swap = true;
20127 min = minswap;
20128 max = maxswap;
20129 }
20130
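  /* Illustrative example: for V16QImode and d->perm = { 3, 4, ..., 18 }
     we get min == 3 and max == 18, so a single palignr by 3 bytes already
     yields the requested elements in order (the in_order case below).  */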
20131 /* Given that we have SSSE3, we know we'll be able to implement the
20132 single operand permutation after the palignr with pshufb for
20133 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
20134 first. */
20135 if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
20136 return true;
20137
20138 dcopy = *d;
20139 if (swap)
20140 {
20141 dcopy.op0 = d->op1;
20142 dcopy.op1 = d->op0;
20143 for (i = 0; i < nelt; ++i)
20144 dcopy.perm[i] ^= nelt;
20145 }
20146
20147 in_order = true;
20148 for (i = 0; i < nelt; ++i)
20149 {
20150 unsigned e = dcopy.perm[i];
20151 if (GET_MODE_SIZE (d->vmode) == 32
20152 && e >= nelt
20153 && (e & (nelt / 2 - 1)) < min)
20154 e = e - min - (nelt / 2);
20155 else
20156 e = e - min;
20157 if (e != i)
20158 in_order = false;
20159 dcopy.perm[i] = e;
20160 }
20161 dcopy.one_operand_p = true;
20162
20163 if (single_insn_only_p && !in_order)
20164 return false;
20165
20166 /* For AVX2, test whether we can permute the result in one instruction. */
20167 if (d->testing_p)
20168 {
20169 if (in_order)
20170 return true;
20171 dcopy.op1 = dcopy.op0;
20172 return expand_vec_perm_1 (&dcopy);
20173 }
20174
20175 shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
20176 if (GET_MODE_SIZE (d->vmode) == 16)
20177 {
20178 target = gen_reg_rtx (V1TImode);
20179 emit_insn (gen_ssse3_palignrv1ti (target,
20180 gen_lowpart (V1TImode, dcopy.op1),
20181 gen_lowpart (V1TImode, dcopy.op0),
20182 shift));
20183 }
20184 else
20185 {
20186 target = gen_reg_rtx (V2TImode);
20187 emit_insn (gen_avx2_palignrv2ti (target,
20188 gen_lowpart (V2TImode, dcopy.op1),
20189 gen_lowpart (V2TImode, dcopy.op0),
20190 shift));
20191 }
20192
20193 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
20194
20195 /* Test for the degenerate case where the alignment by itself
20196 produces the desired permutation. */
20197 if (in_order)
20198 {
20199 emit_move_insn (d->target, dcopy.op0);
20200 return true;
20201 }
20202
20203 ok = expand_vec_perm_1 (&dcopy);
20204 gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
20205
20206 return ok;
20207}
20208
20209/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
20210   the permutation using the SSE4_1 pblendv instruction.  Potentially
20211   reduces a permutation from two pshufb and an ior to one pshufb and a pblendv.  */
20212
20213static bool
20214expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
20215{
20216 unsigned i, which, nelt = d->nelt;
20217 struct expand_vec_perm_d dcopy, dcopy1;
20218 machine_mode vmode = d->vmode;
20219 bool ok;
20220
20221 /* Use the same checks as in expand_vec_perm_blend. */
20222 if (d->one_operand_p)
20223 return false;
20224 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
20225 ;
20226 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
20227 ;
20228 else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 4
20229 || GET_MODE_SIZE (vmode) == 8
a325bdd1 20230 || GET_MODE_SIZE (vmode) == 16))
20231 ;
20232 else
20233 return false;
20234
20235  /* Figure out which permutation elements do not stay in their
20236     respective lanes.  */
20237 for (i = 0, which = 0; i < nelt; ++i)
20238 {
20239 unsigned e = d->perm[i];
20240 if (e != i)
20241 which |= (e < nelt ? 1 : 2);
20242 }
20243  /* We can pblend the part where elements do not stay in their
20244     respective lanes only when these elements all come from one
20245     half of the permutation.
20246     {0 1 8 3 4 5 9 7} is ok: 8 and 9 are not in their respective
20247     lanes, but both 8 and 9 are >= 8.
20248     {0 1 8 3 4 5 2 7} is not ok: 2 and 8 are not in their
20249     respective lanes, and 8 is >= 8 but 2 is not.  */
20250 if (which != 1 && which != 2)
20251 return false;
20252 if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
20253 return true;
20254
20255 /* First we apply one operand permutation to the part where
20256 elements stay not in their respective lanes. */
20257 dcopy = *d;
20258 if (which == 2)
20259 dcopy.op0 = dcopy.op1 = d->op1;
20260 else
20261 dcopy.op0 = dcopy.op1 = d->op0;
20262 if (!d->testing_p)
20263 dcopy.target = gen_reg_rtx (vmode);
20264 dcopy.one_operand_p = true;
20265
20266 for (i = 0; i < nelt; ++i)
20267 dcopy.perm[i] = d->perm[i] & (nelt - 1);
20268
20269 ok = expand_vec_perm_1 (&dcopy);
20270 if (GET_MODE_SIZE (vmode) != 16 && !ok)
20271 return false;
20272 else
20273 gcc_assert (ok);
20274 if (d->testing_p)
20275 return true;
20276
20277 /* Next we put permuted elements into their positions. */
20278 dcopy1 = *d;
20279 if (which == 2)
20280 dcopy1.op1 = dcopy.target;
20281 else
20282 dcopy1.op0 = dcopy.target;
20283
20284 for (i = 0; i < nelt; ++i)
20285 dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
20286
20287 ok = expand_vec_perm_blend (&dcopy1);
20288 gcc_assert (ok);
20289
20290 return true;
20291}
20292
20293static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
20294
4bf4c103 20295/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
20296 a two vector permutation into a single vector permutation by using
20297 an interleave operation to merge the vectors. */
20298
20299static bool
20300expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
20301{
20302 struct expand_vec_perm_d dremap, dfinal;
20303 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
20304 unsigned HOST_WIDE_INT contents;
20305 unsigned char remap[2 * MAX_VECT_LEN];
20306 rtx_insn *seq;
20307 bool ok, same_halves = false;
20308
20309 if (GET_MODE_SIZE (d->vmode) == 4
20310 || GET_MODE_SIZE (d->vmode) == 8
a325bdd1 20311 || GET_MODE_SIZE (d->vmode) == 16)
20312 {
20313 if (d->one_operand_p)
20314 return false;
20315 }
20316 else if (GET_MODE_SIZE (d->vmode) == 32)
20317 {
20318 if (!TARGET_AVX)
20319 return false;
20320 /* For 32-byte modes allow even d->one_operand_p.
20321 The lack of cross-lane shuffling in some instructions
20322 might prevent a single insn shuffle. */
20323 dfinal = *d;
20324 dfinal.testing_p = true;
20325 /* If expand_vec_perm_interleave3 can expand this into
20326 a 3 insn sequence, give up and let it be expanded as
20327 3 insn sequence. While that is one insn longer,
20328 it doesn't need a memory operand and in the common
20329 case that both interleave low and high permutations
20330 with the same operands are adjacent needs 4 insns
20331 for both after CSE. */
20332 if (expand_vec_perm_interleave3 (&dfinal))
20333 return false;
20334 }
20335 else
20336 return false;
20337
20338 /* Examine from whence the elements come. */
20339 contents = 0;
20340 for (i = 0; i < nelt; ++i)
20341 contents |= HOST_WIDE_INT_1U << d->perm[i];
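  /* Illustrative example: for V8HImode and d->perm = { 0, 8, 1, 9, 2, 10, 3, 11 }
     CONTENTS has bits 0-3 and 8-11 set, which matches the interleave-low
     pattern below, and the final remapped shuffle becomes the identity.  */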
20342
20343 memset (remap, 0xff, sizeof (remap));
20344 dremap = *d;
20345
20346 if (GET_MODE_SIZE (d->vmode) == 4
20347 || GET_MODE_SIZE (d->vmode) == 8)
20348 {
20349 unsigned HOST_WIDE_INT h1, h2, h3, h4;
20350
20351 /* Split the two input vectors into 4 halves. */
20352 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
20353 h2 = h1 << nelt2;
20354 h3 = h2 << nelt2;
20355 h4 = h3 << nelt2;
20356
20357      /* If all elements come from the low halves, use interleave low;
20358	 similarly, use interleave high for the high halves.  */
20359 if ((contents & (h1 | h3)) == contents)
20360 {
20361 /* punpckl* */
20362 for (i = 0; i < nelt2; ++i)
20363 {
20364 remap[i] = i * 2;
20365 remap[i + nelt] = i * 2 + 1;
20366 dremap.perm[i * 2] = i;
20367 dremap.perm[i * 2 + 1] = i + nelt;
20368 }
20369 }
20370 else if ((contents & (h2 | h4)) == contents)
20371 {
20372 /* punpckh* */
20373 for (i = 0; i < nelt2; ++i)
20374 {
20375 remap[i + nelt2] = i * 2;
20376 remap[i + nelt + nelt2] = i * 2 + 1;
20377 dremap.perm[i * 2] = i + nelt2;
20378 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
20379 }
20380 }
20381 else
20382 return false;
20383 }
20384 else if (GET_MODE_SIZE (d->vmode) == 16)
20385 {
20386 unsigned HOST_WIDE_INT h1, h2, h3, h4;
20387
20388 /* Split the two input vectors into 4 halves. */
20389 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
20390 h2 = h1 << nelt2;
20391 h3 = h2 << nelt2;
20392 h4 = h3 << nelt2;
20393
20394      /* If all elements come from the low halves, use interleave low; similarly,
20395	 use interleave high for the high halves.  If the elements come from
20396	 mismatched halves, we can use shufps for V4SF/V4SI or do a DImode shuffle.  */
20397 if ((contents & (h1 | h3)) == contents)
20398 {
20399 /* punpckl* */
20400 for (i = 0; i < nelt2; ++i)
20401 {
20402 remap[i] = i * 2;
20403 remap[i + nelt] = i * 2 + 1;
20404 dremap.perm[i * 2] = i;
20405 dremap.perm[i * 2 + 1] = i + nelt;
20406 }
20407 if (!TARGET_SSE2 && d->vmode == V4SImode)
20408 dremap.vmode = V4SFmode;
20409 }
20410 else if ((contents & (h2 | h4)) == contents)
20411 {
20412 /* punpckh* */
20413 for (i = 0; i < nelt2; ++i)
20414 {
20415 remap[i + nelt2] = i * 2;
20416 remap[i + nelt + nelt2] = i * 2 + 1;
20417 dremap.perm[i * 2] = i + nelt2;
20418 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
20419 }
20420 if (!TARGET_SSE2 && d->vmode == V4SImode)
20421 dremap.vmode = V4SFmode;
20422 }
20423 else if ((contents & (h1 | h4)) == contents)
20424 {
20425 /* shufps */
20426 for (i = 0; i < nelt2; ++i)
20427 {
20428 remap[i] = i;
20429 remap[i + nelt + nelt2] = i + nelt2;
20430 dremap.perm[i] = i;
20431 dremap.perm[i + nelt2] = i + nelt + nelt2;
20432 }
20433 if (nelt != 4)
20434 {
20435 /* shufpd */
20436 dremap.vmode = V2DImode;
20437 dremap.nelt = 2;
20438 dremap.perm[0] = 0;
20439 dremap.perm[1] = 3;
20440 }
20441 }
20442 else if ((contents & (h2 | h3)) == contents)
20443 {
20444 /* shufps */
20445 for (i = 0; i < nelt2; ++i)
20446 {
20447 remap[i + nelt2] = i;
20448 remap[i + nelt] = i + nelt2;
20449 dremap.perm[i] = i + nelt2;
20450 dremap.perm[i + nelt2] = i + nelt;
20451 }
20452 if (nelt != 4)
20453 {
20454 /* shufpd */
20455 dremap.vmode = V2DImode;
20456 dremap.nelt = 2;
20457 dremap.perm[0] = 1;
20458 dremap.perm[1] = 2;
20459 }
20460 }
20461 else
20462 return false;
20463 }
20464 else
20465 {
20466 unsigned int nelt4 = nelt / 4, nzcnt = 0;
20467 unsigned HOST_WIDE_INT q[8];
20468 unsigned int nonzero_halves[4];
20469
20470 /* Split the two input vectors into 8 quarters. */
20471 q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
20472 for (i = 1; i < 8; ++i)
20473 q[i] = q[0] << (nelt4 * i);
20474 for (i = 0; i < 4; ++i)
20475 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
20476 {
20477 nonzero_halves[nzcnt] = i;
20478 ++nzcnt;
20479 }
20480
20481 if (nzcnt == 1)
20482 {
20483 gcc_assert (d->one_operand_p);
20484 nonzero_halves[1] = nonzero_halves[0];
20485 same_halves = true;
20486 }
20487 else if (d->one_operand_p)
20488 {
20489 gcc_assert (nonzero_halves[0] == 0);
20490 gcc_assert (nonzero_halves[1] == 1);
20491 }
20492
20493 if (nzcnt <= 2)
20494 {
20495 if (d->perm[0] / nelt2 == nonzero_halves[1])
20496 {
20497 /* Attempt to increase the likelihood that dfinal
20498 shuffle will be intra-lane. */
20499 std::swap (nonzero_halves[0], nonzero_halves[1]);
20500 }
20501
20502 /* vperm2f128 or vperm2i128. */
20503 for (i = 0; i < nelt2; ++i)
20504 {
20505 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
20506 remap[i + nonzero_halves[0] * nelt2] = i;
20507 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
20508 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
20509 }
20510
20511 if (d->vmode != V8SFmode
20512 && d->vmode != V4DFmode
20513 && d->vmode != V8SImode)
20514 {
20515 dremap.vmode = V8SImode;
20516 dremap.nelt = 8;
20517 for (i = 0; i < 4; ++i)
20518 {
20519 dremap.perm[i] = i + nonzero_halves[0] * 4;
20520 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
20521 }
20522 }
20523 }
20524 else if (d->one_operand_p)
20525 return false;
20526 else if (TARGET_AVX2
20527 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
20528 {
20529 /* vpunpckl* */
20530 for (i = 0; i < nelt4; ++i)
20531 {
20532 remap[i] = i * 2;
20533 remap[i + nelt] = i * 2 + 1;
20534 remap[i + nelt2] = i * 2 + nelt2;
20535 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
20536 dremap.perm[i * 2] = i;
20537 dremap.perm[i * 2 + 1] = i + nelt;
20538 dremap.perm[i * 2 + nelt2] = i + nelt2;
20539 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
20540 }
20541 }
20542 else if (TARGET_AVX2
20543 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
20544 {
20545 /* vpunpckh* */
20546 for (i = 0; i < nelt4; ++i)
20547 {
20548 remap[i + nelt4] = i * 2;
20549 remap[i + nelt + nelt4] = i * 2 + 1;
20550 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
20551 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
20552 dremap.perm[i * 2] = i + nelt4;
20553 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
20554 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
20555 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
20556 }
20557 }
20558 else
20559 return false;
20560 }
20561
20562 /* Use the remapping array set up above to move the elements from their
20563 swizzled locations into their final destinations. */
20564 dfinal = *d;
20565 for (i = 0; i < nelt; ++i)
20566 {
20567 unsigned e = remap[d->perm[i]];
20568 gcc_assert (e < nelt);
20569 /* If same_halves is true, both halves of the remapped vector are the
20570 same. Avoid cross-lane accesses if possible. */
20571 if (same_halves && i >= nelt2)
20572 {
20573 gcc_assert (e < nelt2);
20574 dfinal.perm[i] = e + nelt2;
20575 }
20576 else
20577 dfinal.perm[i] = e;
20578 }
20579 if (!d->testing_p)
20580 {
20581 dremap.target = gen_reg_rtx (dremap.vmode);
20582 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
20583 }
20584 dfinal.op1 = dfinal.op0;
20585 dfinal.one_operand_p = true;
20586
20587 /* Test if the final remap can be done with a single insn. For V4SFmode or
20588 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
20589 start_sequence ();
20590 ok = expand_vec_perm_1 (&dfinal);
20591 seq = get_insns ();
20592 end_sequence ();
20593
20594 if (!ok)
20595 return false;
20596
20597 if (d->testing_p)
20598 return true;
20599
20600 if (dremap.vmode != dfinal.vmode)
20601 {
20602 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
20603 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
20604 }
20605
20606 ok = expand_vec_perm_1 (&dremap);
20607 gcc_assert (ok);
20608
20609 emit_insn (seq);
20610 return true;
20611}
20612
4bf4c103 20613/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
20614 a single vector cross-lane permutation into vpermq followed
20615 by any of the single insn permutations. */
20616
20617static bool
20618expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
20619{
20620 struct expand_vec_perm_d dremap, dfinal;
20621 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
20622 unsigned contents[2];
20623 bool ok;
20624
20625 if (!(TARGET_AVX2
20626 && (d->vmode == V32QImode || d->vmode == V16HImode)
20627 && d->one_operand_p))
20628 return false;
20629
20630 contents[0] = 0;
20631 contents[1] = 0;
20632 for (i = 0; i < nelt2; ++i)
20633 {
20634 contents[0] |= 1u << (d->perm[i] / nelt4);
20635 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
20636 }
20637
20638 for (i = 0; i < 2; ++i)
20639 {
20640 unsigned int cnt = 0;
20641 for (j = 0; j < 4; ++j)
20642 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
20643 return false;
20644 }
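  /* Each half of the result now draws from at most two of the four 64-bit
     quarters of the operand; e.g. if the low half only uses quarters 0 and 2
     and the high half only quarters 1 and 3, the vpermq below gathers those
     quarters so that a single in-lane shuffle can finish the job
     (illustrative example).  */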
20645
20646 if (d->testing_p)
20647 return true;
20648
20649 dremap = *d;
20650 dremap.vmode = V4DImode;
20651 dremap.nelt = 4;
20652 dremap.target = gen_reg_rtx (V4DImode);
20653 dremap.op0 = gen_lowpart (V4DImode, d->op0);
20654 dremap.op1 = dremap.op0;
20655 dremap.one_operand_p = true;
20656 for (i = 0; i < 2; ++i)
20657 {
20658 unsigned int cnt = 0;
20659 for (j = 0; j < 4; ++j)
20660 if ((contents[i] & (1u << j)) != 0)
20661 dremap.perm[2 * i + cnt++] = j;
20662 for (; cnt < 2; ++cnt)
20663 dremap.perm[2 * i + cnt] = 0;
20664 }
20665
20666 dfinal = *d;
20667 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
20668 dfinal.op1 = dfinal.op0;
20669 dfinal.one_operand_p = true;
20670 for (i = 0, j = 0; i < nelt; ++i)
20671 {
20672 if (i == nelt2)
20673 j = 2;
20674 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
20675 if ((d->perm[i] / nelt4) == dremap.perm[j])
20676 ;
20677 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
20678 dfinal.perm[i] |= nelt4;
20679 else
20680 gcc_unreachable ();
20681 }
20682
20683 ok = expand_vec_perm_1 (&dremap);
20684 gcc_assert (ok);
20685
20686 ok = expand_vec_perm_1 (&dfinal);
20687 gcc_assert (ok);
20688
20689 return true;
20690}
20691
20692static bool canonicalize_perm (struct expand_vec_perm_d *d);
20693
4bf4c103 20694/* A subroutine of ix86_expand_vec_perm_const_1. Try to expand
20695   a vector permutation using two instructions: vperm2f128 or
20696   vperm2i128, followed by any single in-lane permutation.  */
20697
20698static bool
20699expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
20700{
20701 struct expand_vec_perm_d dfirst, dsecond;
20702 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
20703 bool ok;
20704
20705 if (!TARGET_AVX
20706 || GET_MODE_SIZE (d->vmode) != 32
20707 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
20708 return false;
20709
20710 dsecond = *d;
20711 dsecond.one_operand_p = false;
20712 dsecond.testing_p = true;
20713
20714 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
20715 immediate. For perm < 16 the second permutation uses
20716 d->op0 as first operand, for perm >= 16 it uses d->op1
20717 as first operand. The second operand is the result of
20718 vperm2[fi]128. */
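  /* Reading of PERM (illustrative): bits 0-1 select the 128-bit lane for the
     low half of dfirst and bits 2-3 the lane for its high half, where lanes
     0-1 are the halves of d->op0 and lanes 2-3 the halves of d->op1.  E.g.
     for V4DFmode, perm == 9 makes dfirst select { 2, 3, 4, 5 } of the
     concatenated operands.  */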
20719 for (perm = 0; perm < 32; perm++)
20720 {
20721 /* Ignore permutations which do not move anything cross-lane. */
20722 if (perm < 16)
20723 {
20724 /* The second shuffle for e.g. V4DFmode has
20725 0123 and ABCD operands.
20726 Ignore AB23, as 23 is already in the second lane
20727 of the first operand. */
20728 if ((perm & 0xc) == (1 << 2)) continue;
20729 /* And 01CD, as 01 is in the first lane of the first
20730 operand. */
20731 if ((perm & 3) == 0) continue;
20732 /* And 4567, as then the vperm2[fi]128 doesn't change
20733 anything on the original 4567 second operand. */
20734 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
20735 }
20736 else
20737 {
20738 /* The second shuffle for e.g. V4DFmode has
20739 4567 and ABCD operands.
20740 Ignore AB67, as 67 is already in the second lane
20741 of the first operand. */
20742 if ((perm & 0xc) == (3 << 2)) continue;
20743 /* And 45CD, as 45 is in the first lane of the first
20744 operand. */
20745 if ((perm & 3) == 2) continue;
20746 /* And 0123, as then the vperm2[fi]128 doesn't change
20747 anything on the original 0123 first operand. */
20748 if ((perm & 0xf) == (1 << 2)) continue;
20749 }
20750
20751 for (i = 0; i < nelt; i++)
20752 {
20753 j = d->perm[i] / nelt2;
20754 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
20755 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
20756 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
20757 dsecond.perm[i] = d->perm[i] & (nelt - 1);
20758 else
20759 break;
20760 }
20761
20762 if (i == nelt)
20763 {
20764 start_sequence ();
20765 ok = expand_vec_perm_1 (&dsecond);
20766 end_sequence ();
20767 }
20768 else
20769 ok = false;
20770
20771 if (ok)
20772 {
20773 if (d->testing_p)
20774 return true;
20775
20776 /* Found a usable second shuffle. dfirst will be
20777 vperm2f128 on d->op0 and d->op1. */
20778 dsecond.testing_p = false;
20779 dfirst = *d;
20780 dfirst.target = gen_reg_rtx (d->vmode);
20781 for (i = 0; i < nelt; i++)
20782 dfirst.perm[i] = (i & (nelt2 - 1))
20783 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
20784
20785 canonicalize_perm (&dfirst);
20786 ok = expand_vec_perm_1 (&dfirst);
20787 gcc_assert (ok);
20788
20789 /* And dsecond is some single insn shuffle, taking
20790 d->op0 and result of vperm2f128 (if perm < 16) or
20791 d->op1 and result of vperm2f128 (otherwise). */
20792 if (perm >= 16)
20793 dsecond.op0 = dsecond.op1;
20794 dsecond.op1 = dfirst.target;
20795
20796 ok = expand_vec_perm_1 (&dsecond);
20797 gcc_assert (ok);
20798
20799 return true;
20800 }
20801
20802 /* For one operand, the only useful vperm2f128 permutation is 0x01
20803 aka lanes swap. */
20804 if (d->one_operand_p)
20805 return false;
20806 }
20807
20808 return false;
20809}
20810
4bf4c103 20811/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
20812 a two vector permutation using 2 intra-lane interleave insns
20813 and cross-lane shuffle for 32-byte vectors. */
20814
20815static bool
20816expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
20817{
20818 unsigned i, nelt;
20819 rtx (*gen) (rtx, rtx, rtx);
20820
20821 if (d->one_operand_p)
20822 return false;
20823 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
20824 ;
20825 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
20826 ;
20827 else
20828 return false;
20829
20830 nelt = d->nelt;
20831 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
20832 return false;
20833 for (i = 0; i < nelt; i += 2)
20834 if (d->perm[i] != d->perm[0] + i / 2
20835 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
20836 return false;
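  /* Only full interleaves of the two operands reach this point, e.g. for
     V8SImode a permutation like { 0, 8, 1, 9, 2, 10, 3, 11 } (low halves)
     or { 4, 12, 5, 13, 6, 14, 7, 15 } (high halves).  */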
20837
20838 if (d->testing_p)
20839 return true;
20840
20841 switch (d->vmode)
20842 {
20843 case E_V32QImode:
20844 if (d->perm[0])
20845 gen = gen_vec_interleave_highv32qi;
20846 else
20847 gen = gen_vec_interleave_lowv32qi;
20848 break;
20849 case E_V16HImode:
20850 if (d->perm[0])
20851 gen = gen_vec_interleave_highv16hi;
20852 else
20853 gen = gen_vec_interleave_lowv16hi;
20854 break;
20855 case E_V8SImode:
20856 if (d->perm[0])
20857 gen = gen_vec_interleave_highv8si;
20858 else
20859 gen = gen_vec_interleave_lowv8si;
20860 break;
20861 case E_V4DImode:
20862 if (d->perm[0])
20863 gen = gen_vec_interleave_highv4di;
20864 else
20865 gen = gen_vec_interleave_lowv4di;
20866 break;
20867 case E_V8SFmode:
20868 if (d->perm[0])
20869 gen = gen_vec_interleave_highv8sf;
20870 else
20871 gen = gen_vec_interleave_lowv8sf;
20872 break;
20873 case E_V4DFmode:
20874 if (d->perm[0])
20875 gen = gen_vec_interleave_highv4df;
20876 else
20877 gen = gen_vec_interleave_lowv4df;
20878 break;
20879 default:
20880 gcc_unreachable ();
20881 }
20882
20883 emit_insn (gen (d->target, d->op0, d->op1));
20884 return true;
20885}
20886
4bf4c103 20887/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
20888 a single vector permutation using a single intra-lane vector
20889 permutation, vperm2f128 swapping the lanes and vblend* insn blending
20890 the non-swapped and swapped vectors together. */
20891
20892static bool
20893expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
20894{
20895 struct expand_vec_perm_d dfirst, dsecond;
20896 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
20897 rtx_insn *seq;
20898 bool ok;
20899 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
20900
20901 if (!TARGET_AVX
20902 || TARGET_AVX2
20903 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
20904 || !d->one_operand_p)
20905 return false;
20906
20907 dfirst = *d;
20908 for (i = 0; i < nelt; i++)
20909 dfirst.perm[i] = 0xff;
20910 for (i = 0, msk = 0; i < nelt; i++)
20911 {
20912 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
20913 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
20914 return false;
20915 dfirst.perm[j] = d->perm[i];
20916 if (j != i)
20917 msk |= (1 << i);
20918 }
20919 for (i = 0; i < nelt; i++)
20920 if (dfirst.perm[i] == 0xff)
20921 dfirst.perm[i] = i;
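  /* Illustrative example: for V4DFmode and d->perm = { 0, 3, 2, 1 },
     dfirst stays the identity, msk becomes 0xa, and the lane-swapped
     copy computed below supplies elements 3 and 1 through the final
     blend.  */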
20922
20923 if (!d->testing_p)
20924 dfirst.target = gen_reg_rtx (dfirst.vmode);
20925
20926 start_sequence ();
20927 ok = expand_vec_perm_1 (&dfirst);
20928 seq = get_insns ();
20929 end_sequence ();
20930
20931 if (!ok)
20932 return false;
20933
20934 if (d->testing_p)
20935 return true;
20936
20937 emit_insn (seq);
20938
20939 dsecond = *d;
20940 dsecond.op0 = dfirst.target;
20941 dsecond.op1 = dfirst.target;
20942 dsecond.one_operand_p = true;
20943 dsecond.target = gen_reg_rtx (dsecond.vmode);
20944 for (i = 0; i < nelt; i++)
20945 dsecond.perm[i] = i ^ nelt2;
20946
20947 ok = expand_vec_perm_1 (&dsecond);
20948 gcc_assert (ok);
20949
20950 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
20951 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
20952 return true;
20953}
20954
20955/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
20956 a two vector permutation using two single vector permutations and
20957 {,v}{,p}unpckl{ps,pd,bw,wd,dq}. If two_insn, succeed only if one
20958 of dfirst or dsecond is identity permutation. */
20959
20960static bool
20961expand_vec_perm_2perm_interleave (struct expand_vec_perm_d *d, bool two_insn)
20962{
20963 unsigned i, nelt = d->nelt, nelt2 = nelt / 2, lane = nelt;
20964 struct expand_vec_perm_d dfirst, dsecond, dfinal;
20965 bool ident1 = true, ident2 = true;
20966
20967 if (d->one_operand_p)
20968 return false;
20969
20970 if (GET_MODE_SIZE (d->vmode) == 16)
20971 {
20972 if (!TARGET_SSE)
20973 return false;
20974 if (d->vmode != V4SFmode && d->vmode != V2DFmode && !TARGET_SSE2)
20975 return false;
20976 }
20977 else if (GET_MODE_SIZE (d->vmode) == 32)
20978 {
20979 if (!TARGET_AVX)
20980 return false;
20981 if (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2)
20982 return false;
20983 lane = nelt2;
20984 }
20985 else
20986 return false;
20987
20988 for (i = 1; i < nelt; i++)
20989 if ((d->perm[i] >= nelt) != ((d->perm[0] >= nelt) ^ (i & 1)))
20990 return false;
20991
20992 dfirst = *d;
20993 dsecond = *d;
20994 dfinal = *d;
20995 dfirst.op1 = dfirst.op0;
20996 dfirst.one_operand_p = true;
20997 dsecond.op0 = dsecond.op1;
20998 dsecond.one_operand_p = true;
20999
21000 for (i = 0; i < nelt; i++)
21001 if (d->perm[i] >= nelt)
21002 {
21003 dsecond.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i] - nelt;
21004 if (d->perm[i] - nelt != i / 2 + (i >= lane ? lane / 2 : 0))
21005 ident2 = false;
21006 dsecond.perm[i / 2 + (i >= lane ? lane : lane / 2)]
21007 = d->perm[i] - nelt;
21008 }
21009 else
21010 {
21011 dfirst.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i];
21012 if (d->perm[i] != i / 2 + (i >= lane ? lane / 2 : 0))
21013 ident1 = false;
21014 dfirst.perm[i / 2 + (i >= lane ? lane : lane / 2)] = d->perm[i];
21015 }
21016
21017 if (two_insn && !ident1 && !ident2)
21018 return false;
21019
21020 if (!d->testing_p)
21021 {
21022 if (!ident1)
21023 dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
21024 if (!ident2)
21025 dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
21026 if (d->perm[0] >= nelt)
21027 std::swap (dfinal.op0, dfinal.op1);
21028 }
21029
21030 bool ok;
21031 rtx_insn *seq1 = NULL, *seq2 = NULL;
21032
21033 if (!ident1)
21034 {
21035 start_sequence ();
21036 ok = expand_vec_perm_1 (&dfirst);
21037 seq1 = get_insns ();
21038 end_sequence ();
21039
21040 if (!ok)
21041 return false;
21042 }
21043
21044 if (!ident2)
21045 {
21046 start_sequence ();
21047 ok = expand_vec_perm_1 (&dsecond);
21048 seq2 = get_insns ();
21049 end_sequence ();
21050
21051 if (!ok)
21052 return false;
21053 }
21054
21055 if (d->testing_p)
21056 return true;
21057
21058 for (i = 0; i < nelt; i++)
21059 {
21060 dfinal.perm[i] = i / 2;
21061 if (i >= lane)
21062 dfinal.perm[i] += lane / 2;
21063 if ((i & 1) != 0)
21064 dfinal.perm[i] += nelt;
21065 }
21066 emit_insn (seq1);
21067 emit_insn (seq2);
21068 ok = expand_vselect_vconcat (dfinal.target, dfinal.op0, dfinal.op1,
21069 dfinal.perm, dfinal.nelt, false);
21070 gcc_assert (ok);
21071 return true;
21072}
21073
21074/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
21075 the permutation using two single vector permutations and the SSE4_1 pblendv
21076 instruction. If two_insn, succeed only if one of dfirst or dsecond is
21077 identity permutation. */
21078
21079static bool
21080expand_vec_perm_2perm_pblendv (struct expand_vec_perm_d *d, bool two_insn)
21081{
21082 unsigned i, nelt = d->nelt;
21083 struct expand_vec_perm_d dfirst, dsecond, dfinal;
21084 machine_mode vmode = d->vmode;
21085 bool ident1 = true, ident2 = true;
21086
21087 /* Use the same checks as in expand_vec_perm_blend. */
21088 if (d->one_operand_p)
21089 return false;
21090 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
21091 ;
21092 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
21093 ;
dd835ec2 21094 else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 16
21095 || GET_MODE_SIZE (vmode) == 8
21096 || GET_MODE_SIZE (vmode) == 4))
21097 ;
21098 else
21099 return false;
21100
21101 dfirst = *d;
21102 dsecond = *d;
21103 dfinal = *d;
21104 dfirst.op1 = dfirst.op0;
21105 dfirst.one_operand_p = true;
21106 dsecond.op0 = dsecond.op1;
21107 dsecond.one_operand_p = true;
21108
21109 for (i = 0; i < nelt; ++i)
21110 if (d->perm[i] >= nelt)
21111 {
21112 dfirst.perm[i] = 0xff;
21113 dsecond.perm[i] = d->perm[i] - nelt;
21114 if (d->perm[i] != i + nelt)
21115 ident2 = false;
21116 }
21117 else
21118 {
21119 dsecond.perm[i] = 0xff;
21120 dfirst.perm[i] = d->perm[i];
21121 if (d->perm[i] != i)
21122 ident1 = false;
21123 }
21124
21125 if (two_insn && !ident1 && !ident2)
21126 return false;
21127
21128 /* For now. Ideally treat 0xff as a wildcard. */
21129 for (i = 0; i < nelt; ++i)
21130 if (dfirst.perm[i] == 0xff)
21131 {
21132 if (GET_MODE_SIZE (vmode) == 32
21133 && dfirst.perm[i ^ (nelt / 2)] != 0xff)
21134 dfirst.perm[i] = dfirst.perm[i ^ (nelt / 2)] ^ (nelt / 2);
21135 else
21136 dfirst.perm[i] = i;
21137 }
21138 else
21139 {
21140 if (GET_MODE_SIZE (vmode) == 32
21141 && dsecond.perm[i ^ (nelt / 2)] != 0xff)
21142 dsecond.perm[i] = dsecond.perm[i ^ (nelt / 2)] ^ (nelt / 2);
21143 else
21144 dsecond.perm[i] = i;
21145 }
21146
21147 if (!d->testing_p)
21148 {
21149 if (!ident1)
21150 dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
21151 if (!ident2)
21152 dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
21153 }
21154
21155 bool ok;
21156 rtx_insn *seq1 = NULL, *seq2 = NULL;
21157
21158 if (!ident1)
21159 {
21160 start_sequence ();
21161 ok = expand_vec_perm_1 (&dfirst);
21162 seq1 = get_insns ();
21163 end_sequence ();
21164
21165 if (!ok)
21166 return false;
21167 }
21168
21169 if (!ident2)
21170 {
21171 start_sequence ();
21172 ok = expand_vec_perm_1 (&dsecond);
21173 seq2 = get_insns ();
21174 end_sequence ();
21175
21176 if (!ok)
21177 return false;
21178 }
21179
21180 if (d->testing_p)
21181 return true;
21182
21183 for (i = 0; i < nelt; ++i)
21184 dfinal.perm[i] = (d->perm[i] >= nelt ? i + nelt : i);
21185
21186 emit_insn (seq1);
21187 emit_insn (seq2);
21188 ok = expand_vec_perm_blend (&dfinal);
21189 gcc_assert (ok);
21190 return true;
21191}
21192
4bf4c103 21193/* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF
21194 permutation using two vperm2f128, followed by a vshufpd insn blending
21195 the two vectors together. */
21196
21197static bool
21198expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
21199{
21200 struct expand_vec_perm_d dfirst, dsecond, dthird;
21201 bool ok;
21202
21203 if (!TARGET_AVX || (d->vmode != V4DFmode))
21204 return false;
21205
21206 if (d->testing_p)
21207 return true;
21208
21209 dfirst = *d;
21210 dsecond = *d;
21211 dthird = *d;
21212
21213 dfirst.perm[0] = (d->perm[0] & ~1);
21214 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
21215 dfirst.perm[2] = (d->perm[2] & ~1);
21216 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
21217 dsecond.perm[0] = (d->perm[1] & ~1);
21218 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
21219 dsecond.perm[2] = (d->perm[3] & ~1);
21220 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
21221 dthird.perm[0] = (d->perm[0] % 2);
21222 dthird.perm[1] = (d->perm[1] % 2) + 4;
21223 dthird.perm[2] = (d->perm[2] % 2) + 2;
21224 dthird.perm[3] = (d->perm[3] % 2) + 6;
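  /* Illustrative example: for d->perm = { 1, 4, 3, 6 } this yields
     dfirst = { 0, 1, 2, 3 }, dsecond = { 4, 5, 6, 7 } and
     dthird = { 1, 4, 3, 6 }, so the final vshufpd just picks the requested
     element from each pair.  */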
21225
21226 dfirst.target = gen_reg_rtx (dfirst.vmode);
21227 dsecond.target = gen_reg_rtx (dsecond.vmode);
21228 dthird.op0 = dfirst.target;
21229 dthird.op1 = dsecond.target;
21230 dthird.one_operand_p = false;
21231
21232 canonicalize_perm (&dfirst);
21233 canonicalize_perm (&dsecond);
21234
21235 ok = expand_vec_perm_1 (&dfirst)
21236 && expand_vec_perm_1 (&dsecond)
21237 && expand_vec_perm_1 (&dthird);
21238
21239 gcc_assert (ok);
21240
21241 return true;
21242}
21243
21244static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *);
21245
21246/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
21247 a two vector permutation using two intra-lane vector
21248 permutations, vperm2f128 swapping the lanes and vblend* insn blending
21249 the non-swapped and swapped vectors together. */
21250
21251static bool
21252expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d *d)
21253{
21254 struct expand_vec_perm_d dfirst, dsecond, dthird;
21255 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2, which1 = 0, which2 = 0;
21256 rtx_insn *seq1, *seq2;
21257 bool ok;
21258 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
21259
21260 if (!TARGET_AVX
21261 || TARGET_AVX2
21262 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
21263 || d->one_operand_p)
21264 return false;
21265
21266 dfirst = *d;
21267 dsecond = *d;
21268 for (i = 0; i < nelt; i++)
21269 {
21270 dfirst.perm[i] = 0xff;
21271 dsecond.perm[i] = 0xff;
21272 }
21273 for (i = 0, msk = 0; i < nelt; i++)
21274 {
21275 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
21276 if (j == i)
21277 {
21278 dfirst.perm[j] = d->perm[i];
21279 which1 |= (d->perm[i] < nelt ? 1 : 2);
21280 }
21281 else
21282 {
21283 dsecond.perm[j] = d->perm[i];
21284 which2 |= (d->perm[i] < nelt ? 1 : 2);
21285 msk |= (1U << i);
21286 }
21287 }
21288 if (msk == 0 || msk == (1U << nelt) - 1)
21289 return false;
21290
21291 if (!d->testing_p)
21292 {
21293 dfirst.target = gen_reg_rtx (dfirst.vmode);
21294 dsecond.target = gen_reg_rtx (dsecond.vmode);
21295 }
21296
21297 for (i = 0; i < nelt; i++)
21298 {
21299 if (dfirst.perm[i] == 0xff)
21300 dfirst.perm[i] = (which1 == 2 ? i + nelt : i);
21301 if (dsecond.perm[i] == 0xff)
21302 dsecond.perm[i] = (which2 == 2 ? i + nelt : i);
21303 }
21304 canonicalize_perm (&dfirst);
21305 start_sequence ();
21306 ok = ix86_expand_vec_perm_const_1 (&dfirst);
21307 seq1 = get_insns ();
21308 end_sequence ();
21309
21310 if (!ok)
21311 return false;
21312
21313 canonicalize_perm (&dsecond);
21314 start_sequence ();
21315 ok = ix86_expand_vec_perm_const_1 (&dsecond);
21316 seq2 = get_insns ();
21317 end_sequence ();
21318
21319 if (!ok)
21320 return false;
21321
21322 if (d->testing_p)
21323 return true;
21324
21325 emit_insn (seq1);
21326 emit_insn (seq2);
21327
21328 dthird = *d;
21329 dthird.op0 = dsecond.target;
21330 dthird.op1 = dsecond.target;
21331 dthird.one_operand_p = true;
21332 dthird.target = gen_reg_rtx (dthird.vmode);
21333 for (i = 0; i < nelt; i++)
21334 dthird.perm[i] = i ^ nelt2;
21335
21336 ok = expand_vec_perm_1 (&dthird);
21337 gcc_assert (ok);
21338
21339 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
21340 emit_insn (blend (d->target, dfirst.target, dthird.target, GEN_INT (msk)));
21341 return true;
21342}
21343
21344/* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
21345 permutation with two pshufb insns and an ior. We should have already
21346 failed all two instruction sequences. */
21347
21348static bool
21349expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
21350{
21351 rtx rperm[2][16], vperm, l, h, op, m128;
21352 unsigned int i, nelt, eltsz;
21353 machine_mode mode;
21354 rtx (*gen) (rtx, rtx, rtx);
2bf6d935 21355
dd835ec2 21356 if (!TARGET_SSSE3 || (GET_MODE_SIZE (d->vmode) != 16
21357 && GET_MODE_SIZE (d->vmode) != 8
21358 && GET_MODE_SIZE (d->vmode) != 4))
21359 return false;
21360 gcc_assert (!d->one_operand_p);
21361
21362 if (d->testing_p)
21363 return true;
21364
21365 switch (GET_MODE_SIZE (d->vmode))
21366 {
21367 case 4:
21368 mode = V4QImode;
21369 gen = gen_mmx_pshufbv4qi3;
21370 break;
21371 case 8:
21372 mode = V8QImode;
21373 gen = gen_mmx_pshufbv8qi3;
21374 break;
21375 case 16:
21376 mode = V16QImode;
21377 gen = gen_ssse3_pshufbv16qi3;
21378 break;
21379 default:
21380 gcc_unreachable ();
21381 }
21382
21383 nelt = d->nelt;
21384 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
21385
21386 /* Generate two permutation masks. If the required element is within
21387 the given vector it is shuffled into the proper lane. If the required
21388 element is in the other vector, force a zero into the lane by setting
21389 bit 7 in the permutation mask. */
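  /* Illustrative example: for V16QImode, if d->perm[0] == 20 then byte 4 of
     d->op1 is wanted, so the mask used for op1 gets 4 in slot 0 while the
     mask used for op0 gets -128 there; the ior of the two pshufb results
     below merges the contributions of both operands.  */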
21390 m128 = GEN_INT (-128);
21391 for (i = 0; i < nelt; ++i)
21392 {
dd835ec2 21393 unsigned j, k, e = d->perm[i];
21394 unsigned which = (e >= nelt);
21395 if (e >= nelt)
21396 e -= nelt;
21397
21398 for (j = 0; j < eltsz; ++j)
21399 {
21400 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
21401 rperm[1-which][i*eltsz + j] = m128;
21402 }
21403
21404 for (k = i*eltsz + j; k < 16; ++k)
21405 rperm[0][k] = rperm[1][k] = m128;
21406 }
21407
21408 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
21409 vperm = force_reg (V16QImode, vperm);
21410
21411 l = gen_reg_rtx (mode);
21412 op = gen_lowpart (mode, d->op0);
21413 emit_insn (gen (l, op, vperm));
21414
21415 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
21416 vperm = force_reg (V16QImode, vperm);
21417
21418 h = gen_reg_rtx (mode);
21419 op = gen_lowpart (mode, d->op1);
21420 emit_insn (gen (h, op, vperm));
21421
21422 op = d->target;
21423 if (d->vmode != mode)
21424 op = gen_reg_rtx (mode);
b5193e35 21425 ix86_emit_vec_binop (IOR, mode, op, l, h);
21426 if (op != d->target)
21427 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
21428
21429 return true;
21430}
21431
21432/* Implement an arbitrary permutation of one V32QImode or V16HImode operand
21433 with two vpshufb insns, vpermq and vpor. We should have already failed
21434 all two or three instruction sequences. */
21435
21436static bool
21437expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
21438{
21439 rtx rperm[2][32], vperm, l, h, hp, op, m128;
21440 unsigned int i, nelt, eltsz;
21441
21442 if (!TARGET_AVX2
21443 || !d->one_operand_p
21444 || (d->vmode != V32QImode && d->vmode != V16HImode))
21445 return false;
21446
21447 if (d->testing_p)
21448 return true;
21449
21450 nelt = d->nelt;
21451 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
21452
21453  /* Generate two permutation masks.  If the required element is within
21454     the same lane, it is shuffled in.  If the required element is from the
21455     other lane, force a zero by setting bit 7 in the permutation mask.
21456     In the other mask an element is non-negative only if it is requested
21457     from the other lane, and it is also moved to the other lane, so that
21458     the result of that vpshufb merely needs its two V2TImode halves
21459     swapped.  */
21460 m128 = GEN_INT (-128);
21461 for (i = 0; i < nelt; ++i)
21462 {
21463 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
21464 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
21465
21466 for (j = 0; j < eltsz; ++j)
21467 {
21468 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
21469 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
21470 }
21471 }
21472
21473 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
21474 vperm = force_reg (V32QImode, vperm);
21475
21476 h = gen_reg_rtx (V32QImode);
21477 op = gen_lowpart (V32QImode, d->op0);
21478 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
21479
21480  /* Swap the 128-bit lanes of h into hp.  */
21481 hp = gen_reg_rtx (V4DImode);
21482 op = gen_lowpart (V4DImode, h);
21483 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
21484 const1_rtx));
21485
21486 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
21487 vperm = force_reg (V32QImode, vperm);
21488
21489 l = gen_reg_rtx (V32QImode);
21490 op = gen_lowpart (V32QImode, d->op0);
21491 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
21492
21493 op = d->target;
21494 if (d->vmode != V32QImode)
21495 op = gen_reg_rtx (V32QImode);
21496 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
21497 if (op != d->target)
21498 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
21499
21500 return true;
21501}
21502
21503/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
21504   and extract-odd permutations of two V32QImode or V16HImode operands
21505 with two vpshufb insns, vpor and vpermq. We should have already
21506 failed all two or three instruction sequences. */
21507
21508static bool
21509expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
21510{
21511 rtx rperm[2][32], vperm, l, h, ior, op, m128;
21512 unsigned int i, nelt, eltsz;
21513
21514 if (!TARGET_AVX2
21515 || d->one_operand_p
21516 || (d->vmode != V32QImode && d->vmode != V16HImode))
21517 return false;
21518
21519 for (i = 0; i < d->nelt; ++i)
21520 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
21521 return false;
21522
21523 if (d->testing_p)
21524 return true;
21525
21526 nelt = d->nelt;
21527 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
21528
21529 /* Generate two permutation masks. In the first permutation mask
21530 the first quarter will contain indexes for the first half
21531 of the op0, the second quarter will contain bit 7 set, third quarter
21532 will contain indexes for the second half of the op0 and the
21533 last quarter bit 7 set. In the second permutation mask
21534 the first quarter will contain bit 7 set, the second quarter
21535 indexes for the first half of the op1, the third quarter bit 7 set
21536 and last quarter indexes for the second half of the op1.
21537 I.e. the first mask e.g. for V32QImode extract even will be:
21538 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
21539 (all values masked with 0xf except for -128) and second mask
21540 for extract even will be
21541 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
21542 m128 = GEN_INT (-128);
21543 for (i = 0; i < nelt; ++i)
21544 {
21545 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
21546 unsigned which = d->perm[i] >= nelt;
21547 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
21548
21549 for (j = 0; j < eltsz; ++j)
21550 {
21551 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
21552 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
21553 }
21554 }
21555
21556 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
21557 vperm = force_reg (V32QImode, vperm);
21558
21559 l = gen_reg_rtx (V32QImode);
21560 op = gen_lowpart (V32QImode, d->op0);
21561 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
21562
21563 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
21564 vperm = force_reg (V32QImode, vperm);
21565
21566 h = gen_reg_rtx (V32QImode);
21567 op = gen_lowpart (V32QImode, d->op1);
21568 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
21569
21570 ior = gen_reg_rtx (V32QImode);
21571 emit_insn (gen_iorv32qi3 (ior, l, h));
21572
21573 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
21574 op = gen_reg_rtx (V4DImode);
21575 ior = gen_lowpart (V4DImode, ior);
21576 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
21577 const1_rtx, GEN_INT (3)));
21578 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
21579
21580 return true;
21581}
21582
fcda0efc 21583/* Implement permutation with pslldq + psrldq + por when pshufb is not
21584 available. */
21585static bool
21586expand_vec_perm_pslldq_psrldq_por (struct expand_vec_perm_d *d, bool pandn)
21587{
21588 unsigned i, nelt = d->nelt;
21589 unsigned start1, end1 = -1;
21590 machine_mode vmode = d->vmode, imode;
21591 int start2 = -1;
21592 bool clear_op0, clear_op1;
21593 unsigned inner_size;
21594 rtx op0, op1, dop1;
21595 rtx (*gen_vec_shr) (rtx, rtx, rtx);
21596 rtx (*gen_vec_shl) (rtx, rtx, rtx);
21597
21598 /* pshufd can be used for V4SI/V2DI under TARGET_SSE2. */
21599 if (!TARGET_SSE2 || (vmode != E_V16QImode && vmode != E_V8HImode))
21600 return false;
21601
21602 start1 = d->perm[0];
21603 for (i = 1; i < nelt; i++)
21604 {
69c4b5c5 21605 if (d->perm[i] != d->perm[i-1] + 1
21606 || d->perm[i] == nelt)
fcda0efc 21607 {
21608 if (start2 == -1)
21609 {
21610 start2 = d->perm[i];
21611 end1 = d->perm[i-1];
21612 }
21613 else
21614 return false;
21615 }
fcda0efc 21616 }
21617
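  /* Illustrative example: for V16QImode and
     d->perm = { 2, 3, ..., 15, 16, 17 } the loop above finds start1 == 2,
     end1 == 15 and start2 == 16, so d->op0 is shifted down by 2 elements,
     d->op1 is shifted up by 14 elements, and the final ior combines them
     with no extra masking needed.  */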
21618 clear_op0 = end1 != nelt - 1;
21619 clear_op1 = start2 % nelt != 0;
21620 /* pandn/pand is needed to clear upper/lower bits of op0/op1. */
21621 if (!pandn && (clear_op0 || clear_op1))
21622 return false;
21623
21624 if (d->testing_p)
21625 return true;
21626
21627 gen_vec_shr = vmode == E_V16QImode ? gen_vec_shr_v16qi : gen_vec_shr_v8hi;
21628 gen_vec_shl = vmode == E_V16QImode ? gen_vec_shl_v16qi : gen_vec_shl_v8hi;
21629 imode = GET_MODE_INNER (vmode);
21630 inner_size = GET_MODE_BITSIZE (imode);
21631 op0 = gen_reg_rtx (vmode);
21632 op1 = gen_reg_rtx (vmode);
21633
21634 if (start1)
21635 emit_insn (gen_vec_shr (op0, d->op0, GEN_INT (start1 * inner_size)));
21636 else
21637 emit_move_insn (op0, d->op0);
21638
21639 dop1 = d->op1;
21640 if (d->one_operand_p)
21641 dop1 = d->op0;
21642
21643 int shl_offset = end1 - start1 + 1 - start2 % nelt;
21644 if (shl_offset)
21645 emit_insn (gen_vec_shl (op1, dop1, GEN_INT (shl_offset * inner_size)));
21646 else
21647 emit_move_insn (op1, dop1);
21648
21649 /* Clear lower/upper bits for op0/op1. */
21650 if (clear_op0 || clear_op1)
21651 {
21652 rtx vec[16];
21653 rtx const_vec;
21654 rtx clear;
21655 for (i = 0; i != nelt; i++)
21656 {
21657 if (i < (end1 - start1 + 1))
21658 vec[i] = gen_int_mode ((HOST_WIDE_INT_1U << inner_size) - 1, imode);
21659 else
21660 vec[i] = CONST0_RTX (imode);
21661 }
21662 const_vec = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, vec));
21663 const_vec = validize_mem (force_const_mem (vmode, const_vec));
21664 clear = force_reg (vmode, const_vec);
21665
21666 if (clear_op0)
21667 emit_move_insn (op0, gen_rtx_AND (vmode, op0, clear));
21668 if (clear_op1)
21669 emit_move_insn (op1, gen_rtx_AND (vmode,
21670 gen_rtx_NOT (vmode, clear),
21671 op1));
21672 }
21673
21674 emit_move_insn (d->target, gen_rtx_IOR (vmode, op0, op1));
21675 return true;
21676}
21677
2bf6d935 21678/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
21679   and extract-odd permutations of two V4HI, V8QI, V8HI, V16QI, V16HI or V32QI
21680 operands with two "and" and "pack" or two "shift" and "pack" insns.
21681 We should have already failed all two instruction sequences. */
21682
21683static bool
21684expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
21685{
21686 rtx op, dop0, dop1, t;
21687 unsigned i, odd, c, s, nelt = d->nelt;
21688 bool end_perm = false;
21689 machine_mode half_mode;
21690 rtx (*gen_and) (rtx, rtx, rtx);
21691 rtx (*gen_pack) (rtx, rtx, rtx);
21692 rtx (*gen_shift) (rtx, rtx, rtx);
21693
21694 if (d->one_operand_p)
21695 return false;
21696
21697 switch (d->vmode)
21698 {
21699 case E_V4HImode:
21700 /* Required for "pack". */
21701 if (!TARGET_SSE4_1)
21702 return false;
21703 c = 0xffff;
21704 s = 16;
21705 half_mode = V2SImode;
21706 gen_and = gen_andv2si3;
21707 gen_pack = gen_mmx_packusdw;
21708 gen_shift = gen_lshrv2si3;
21709 break;
21710 case E_V8HImode:
21711 /* Required for "pack". */
21712 if (!TARGET_SSE4_1)
21713 return false;
21714 c = 0xffff;
21715 s = 16;
21716 half_mode = V4SImode;
21717 gen_and = gen_andv4si3;
21718 gen_pack = gen_sse4_1_packusdw;
21719 gen_shift = gen_lshrv4si3;
21720 break;
21721 case E_V8QImode:
21722 /* No check as all instructions are SSE2. */
21723 c = 0xff;
21724 s = 8;
21725 half_mode = V4HImode;
21726 gen_and = gen_andv4hi3;
21727 gen_pack = gen_mmx_packuswb;
21728 gen_shift = gen_lshrv4hi3;
21729 break;
21730 case E_V16QImode:
21731 /* No check as all instructions are SSE2. */
21732 c = 0xff;
21733 s = 8;
21734 half_mode = V8HImode;
21735 gen_and = gen_andv8hi3;
21736 gen_pack = gen_sse2_packuswb;
21737 gen_shift = gen_lshrv8hi3;
21738 break;
21739 case E_V16HImode:
21740 if (!TARGET_AVX2)
21741 return false;
21742 c = 0xffff;
21743 s = 16;
21744 half_mode = V8SImode;
21745 gen_and = gen_andv8si3;
21746 gen_pack = gen_avx2_packusdw;
21747 gen_shift = gen_lshrv8si3;
21748 end_perm = true;
21749 break;
21750 case E_V32QImode:
21751 if (!TARGET_AVX2)
21752 return false;
21753 c = 0xff;
21754 s = 8;
21755 half_mode = V16HImode;
21756 gen_and = gen_andv16hi3;
21757 gen_pack = gen_avx2_packuswb;
21758 gen_shift = gen_lshrv16hi3;
21759 end_perm = true;
21760 break;
21761 default:
dd835ec2 21762 /* Only V4HI, V8QI, V8HI, V16QI, V16HI and V32QI modes
a325bdd1 21763 are more profitable than general shuffles. */
21764 return false;
21765 }
21766
21767 /* Check that permutation is even or odd. */
21768 odd = d->perm[0];
21769 if (odd > 1)
21770 return false;
21771
21772 for (i = 1; i < nelt; ++i)
21773 if (d->perm[i] != 2 * i + odd)
21774 return false;
21775
21776 if (d->testing_p)
21777 return true;
21778
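  /* Illustrative example for V16QImode: extracting the even bytes masks
     every 16-bit word of both operands with 0x00ff and packs the results
     with packuswb; extracting the odd bytes instead shifts each word right
     by 8 before the pack.  */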
21779 dop0 = gen_reg_rtx (half_mode);
21780 dop1 = gen_reg_rtx (half_mode);
21781 if (odd == 0)
21782 {
21783 t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
21784 t = force_reg (half_mode, t);
21785 emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
21786 emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
21787 }
21788 else
21789 {
21790 emit_insn (gen_shift (dop0,
21791 gen_lowpart (half_mode, d->op0),
21792 GEN_INT (s)));
21793 emit_insn (gen_shift (dop1,
21794 gen_lowpart (half_mode, d->op1),
21795 GEN_INT (s)));
21796 }
21797 /* In AVX2 for 256 bit case we need to permute pack result. */
21798 if (TARGET_AVX2 && end_perm)
21799 {
21800 op = gen_reg_rtx (d->vmode);
21801 t = gen_reg_rtx (V4DImode);
21802 emit_insn (gen_pack (op, dop0, dop1));
21803 emit_insn (gen_avx2_permv4di_1 (t,
21804 gen_lowpart (V4DImode, op),
21805 const0_rtx,
21806 const2_rtx,
21807 const1_rtx,
21808 GEN_INT (3)));
21809 emit_move_insn (d->target, gen_lowpart (d->vmode, t));
21810 }
21811 else
21812 emit_insn (gen_pack (d->target, dop0, dop1));
21813
21814 return true;
21815}
21816
21817/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
21818 and extract-odd permutations of two V64QI operands
21819 with two "shifts", two "truncs" and one "concat" insns for "odd"
21820 and two "truncs" and one concat insn for "even."
21821 Have already failed all two instruction sequences. */
21822
21823static bool
21824expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
21825{
21826 rtx t1, t2, t3, t4;
21827 unsigned i, odd, nelt = d->nelt;
21828
21829 if (!TARGET_AVX512BW
21830 || d->one_operand_p
21831 || d->vmode != V64QImode)
21832 return false;
21833
21834 /* Check that permutation is even or odd. */
21835 odd = d->perm[0];
21836 if (odd > 1)
21837 return false;
21838
21839 for (i = 1; i < nelt; ++i)
21840 if (d->perm[i] != 2 * i + odd)
21841 return false;
21842
21843 if (d->testing_p)
21844 return true;
21845
21846
21847 if (odd)
21848 {
21849 t1 = gen_reg_rtx (V32HImode);
21850 t2 = gen_reg_rtx (V32HImode);
21851 emit_insn (gen_lshrv32hi3 (t1,
21852 gen_lowpart (V32HImode, d->op0),
21853 GEN_INT (8)));
21854 emit_insn (gen_lshrv32hi3 (t2,
21855 gen_lowpart (V32HImode, d->op1),
21856 GEN_INT (8)));
21857 }
21858 else
21859 {
21860 t1 = gen_lowpart (V32HImode, d->op0);
21861 t2 = gen_lowpart (V32HImode, d->op1);
21862 }
21863
21864 t3 = gen_reg_rtx (V32QImode);
21865 t4 = gen_reg_rtx (V32QImode);
21866 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
21867 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
21868 emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
21869
21870 return true;
21871}
21872
4bf4c103 21873/* A subroutine of ix86_expand_vec_perm_const_1. Implement extract-even
21874 and extract-odd permutations. */
21875
21876static bool
21877expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
21878{
21879 rtx t1, t2, t3, t4, t5;
21880
21881 switch (d->vmode)
21882 {
21883 case E_V4DFmode:
21884 if (d->testing_p)
21885 break;
21886 t1 = gen_reg_rtx (V4DFmode);
21887 t2 = gen_reg_rtx (V4DFmode);
21888
21889 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
21890 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
21891 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
21892
21893 /* Now an unpck[lh]pd will produce the result required. */
21894 if (odd)
21895 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
21896 else
21897 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
21898 emit_insn (t3);
21899 break;
21900
21901 case E_V8SFmode:
21902 {
21903 int mask = odd ? 0xdd : 0x88;
21904
21905 if (d->testing_p)
21906 break;
21907 t1 = gen_reg_rtx (V8SFmode);
21908 t2 = gen_reg_rtx (V8SFmode);
21909 t3 = gen_reg_rtx (V8SFmode);
21910
21911 /* Shuffle within the 128-bit lanes to produce:
21912 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
21913 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
21914 GEN_INT (mask)));
21915
21916 /* Shuffle the lanes around to produce:
21917 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
21918 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
21919 GEN_INT (0x3)));
21920
21921 /* Shuffle within the 128-bit lanes to produce:
21922 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
21923 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
21924
21925 /* Shuffle within the 128-bit lanes to produce:
21926 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
21927 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
21928
21929 /* Shuffle the lanes around to produce:
21930 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
21931 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
21932 GEN_INT (0x20)));
21933 }
21934 break;
21935
21936 case E_V2DFmode:
21937 case E_V4SFmode:
21938 case E_V2DImode:
9b8579a6 21939 case E_V2SImode:
2bf6d935 21940 case E_V4SImode:
8d7dae0e 21941 case E_V2HImode:
21942 /* These are always directly implementable by expand_vec_perm_1. */
21943 gcc_unreachable ();
21944
21945 case E_V2SFmode:
21946 gcc_assert (TARGET_MMX_WITH_SSE);
21947 /* We have no suitable instructions. */
21948 if (d->testing_p)
21949 return false;
21950 break;
21951
be8749f9
UB
21952 case E_V4QImode:
21953 if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
21954 return expand_vec_perm_pshufb2 (d);
21955 else
21956 {
21957 if (d->testing_p)
21958 break;
21959 /* We need 2*log2(N)-1 operations to achieve odd/even
21960 with interleave. */
21961 t1 = gen_reg_rtx (V4QImode);
21962 emit_insn (gen_mmx_punpckhbw_low (t1, d->op0, d->op1));
21963 emit_insn (gen_mmx_punpcklbw_low (d->target, d->op0, d->op1));
21964 if (odd)
21965 t2 = gen_mmx_punpckhbw_low (d->target, d->target, t1);
21966 else
21967 t2 = gen_mmx_punpcklbw_low (d->target, d->target, t1);
21968 emit_insn (t2);
21969 }
21970 break;
21971
9b8579a6 21972 case E_V4HImode:
dd835ec2
UB
21973 if (TARGET_SSE4_1)
21974 return expand_vec_perm_even_odd_pack (d);
21975 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
21976 return expand_vec_perm_pshufb2 (d);
9b8579a6 21977 else
dd835ec2
UB
21978 {
21979 if (d->testing_p)
21980 break;
21981 /* We need 2*log2(N)-1 operations to achieve odd/even
21982 with interleave. */
21983 t1 = gen_reg_rtx (V4HImode);
21984 emit_insn (gen_mmx_punpckhwd (t1, d->op0, d->op1));
21985 emit_insn (gen_mmx_punpcklwd (d->target, d->op0, d->op1));
21986 if (odd)
21987 t2 = gen_mmx_punpckhwd (d->target, d->target, t1);
21988 else
21989 t2 = gen_mmx_punpcklwd (d->target, d->target, t1);
21990 emit_insn (t2);
21991 }
9b8579a6
UB
21992 break;
21993
2bf6d935
ML
21994 case E_V8HImode:
21995 if (TARGET_SSE4_1)
21996 return expand_vec_perm_even_odd_pack (d);
21997 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
21998 return expand_vec_perm_pshufb2 (d);
21999 else
22000 {
22001 if (d->testing_p)
22002 break;
22003 /* We need 2*log2(N)-1 operations to achieve odd/even
22004 with interleave. */
22005 t1 = gen_reg_rtx (V8HImode);
22006 t2 = gen_reg_rtx (V8HImode);
22007 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
22008 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
22009 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
22010 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
22011 if (odd)
22012 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
22013 else
22014 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
22015 emit_insn (t3);
22016 }
22017 break;
22018
a325bdd1 22019 case E_V8QImode:
2bf6d935
ML
22020 case E_V16QImode:
22021 return expand_vec_perm_even_odd_pack (d);
22022
22023 case E_V16HImode:
22024 case E_V32QImode:
22025 return expand_vec_perm_even_odd_pack (d);
22026
22027 case E_V64QImode:
22028 return expand_vec_perm_even_odd_trunc (d);
22029
22030 case E_V4DImode:
22031 if (!TARGET_AVX2)
22032 {
22033 struct expand_vec_perm_d d_copy = *d;
22034 d_copy.vmode = V4DFmode;
22035 if (d->testing_p)
22036 d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
22037 else
22038 d_copy.target = gen_reg_rtx (V4DFmode);
22039 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
22040 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
22041 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
22042 {
22043 if (!d->testing_p)
22044 emit_move_insn (d->target,
22045 gen_lowpart (V4DImode, d_copy.target));
22046 return true;
22047 }
22048 return false;
22049 }
22050
22051 if (d->testing_p)
22052 break;
22053
22054 t1 = gen_reg_rtx (V4DImode);
22055 t2 = gen_reg_rtx (V4DImode);
22056
22057 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
22058 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
22059 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
22060
22061   /* Now a vpunpck[lh]qdq will produce the result required.  */
22062 if (odd)
22063 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
22064 else
22065 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
22066 emit_insn (t3);
22067 break;
22068
22069 case E_V8SImode:
22070 if (!TARGET_AVX2)
22071 {
22072 struct expand_vec_perm_d d_copy = *d;
22073 d_copy.vmode = V8SFmode;
22074 if (d->testing_p)
22075 d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
22076 else
22077 d_copy.target = gen_reg_rtx (V8SFmode);
22078 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
22079 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
22080 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
22081 {
22082 if (!d->testing_p)
22083 emit_move_insn (d->target,
22084 gen_lowpart (V8SImode, d_copy.target));
22085 return true;
22086 }
22087 return false;
22088 }
22089
22090 if (d->testing_p)
22091 break;
22092
22093 t1 = gen_reg_rtx (V8SImode);
22094 t2 = gen_reg_rtx (V8SImode);
22095 t3 = gen_reg_rtx (V4DImode);
22096 t4 = gen_reg_rtx (V4DImode);
22097 t5 = gen_reg_rtx (V4DImode);
22098
22099 /* Shuffle the lanes around into
22100 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
22101 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
22102 gen_lowpart (V4DImode, d->op1),
22103 GEN_INT (0x20)));
22104 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
22105 gen_lowpart (V4DImode, d->op1),
22106 GEN_INT (0x31)));
22107
22108 /* Swap the 2nd and 3rd position in each lane into
22109 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
22110 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
22111 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
22112 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
22113 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
22114
22115   /* Now a vpunpck[lh]qdq will produce
22116 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
22117 if (odd)
22118 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
22119 gen_lowpart (V4DImode, t2));
22120 else
22121 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
22122 gen_lowpart (V4DImode, t2));
22123 emit_insn (t3);
22124 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
22125 break;
22126
22127 default:
22128 gcc_unreachable ();
22129 }
22130
22131 return true;
22132}
22133
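As a worked example of the immediates in the V8SFmode case above: a shufps control byte is four 2-bit fields, so 0x88 decodes to selectors { 0, 2, 0, 2 } and 0xdd to { 1, 3, 1, 3 }, i.e. per 128-bit lane the result is { a0, a2, b0, b2 } resp. { a1, a3, b1, b3 }, which is exactly the even resp. odd elements.  A tiny sketch of the decoding (the helper name is made up for illustration):

/* Decode a shufps/pshufd immediate into its four 2-bit element selectors.  */
static void
decode_shuf_imm (unsigned char imm, unsigned sel[4])
{
  for (int i = 0; i < 4; i++)
    sel[i] = (imm >> (2 * i)) & 3;
  /* decode_shuf_imm (0x88, sel) yields { 0, 2, 0, 2 };
     decode_shuf_imm (0xdd, sel) yields { 1, 3, 1, 3 }.  */
}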
4bf4c103 22134/* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
2bf6d935
ML
22135 extract-even and extract-odd permutations. */
22136
22137static bool
22138expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
22139{
22140 unsigned i, odd, nelt = d->nelt;
22141
22142 odd = d->perm[0];
22143 if (odd != 0 && odd != 1)
22144 return false;
22145
22146 for (i = 1; i < nelt; ++i)
22147 if (d->perm[i] != 2 * i + odd)
22148 return false;
22149
50b58779
JJ
22150 if (d->vmode == E_V32HImode
22151 && d->testing_p
22152 && !TARGET_AVX512BW)
22153 return false;
22154
2bf6d935
ML
22155 return expand_vec_perm_even_odd_1 (d, odd);
22156}
22157
4bf4c103 22158/* A subroutine of ix86_expand_vec_perm_const_1. Implement broadcast
2bf6d935
ML
22159 permutations. We assume that expand_vec_perm_1 has already failed. */
22160
22161static bool
22162expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
22163{
22164 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
22165 machine_mode vmode = d->vmode;
be8749f9 22166 rtx (*gen) (rtx, rtx, rtx);
2bf6d935
ML
22167 unsigned char perm2[4];
22168 rtx op0 = d->op0, dest;
22169 bool ok;
22170
22171 switch (vmode)
22172 {
22173 case E_V4DFmode:
22174 case E_V8SFmode:
22175 /* These are special-cased in sse.md so that we can optionally
22176 use the vbroadcast instruction. They expand to two insns
22177 if the input happens to be in a register. */
22178 gcc_unreachable ();
22179
22180 case E_V2DFmode:
240198fe 22181 case E_V2SFmode:
2bf6d935 22182 case E_V4SFmode:
240198fe 22183 case E_V2DImode:
9b8579a6 22184 case E_V2SImode:
2bf6d935 22185 case E_V4SImode:
8d7dae0e
UB
22186 case E_V2HImode:
22187 case E_V4HImode:
2bf6d935
ML
22188 /* These are always implementable using standard shuffle patterns. */
22189 gcc_unreachable ();
22190
be8749f9
UB
22191 case E_V4QImode:
22192 /* This can be implemented via interleave and pshuflw. */
22193 if (d->testing_p)
22194 return true;
22195
22196 if (elt >= nelt2)
22197 {
22198 gen = gen_mmx_punpckhbw_low;
22199 elt -= nelt2;
22200 }
22201 else
22202 gen = gen_mmx_punpcklbw_low;
22203
22204 dest = gen_reg_rtx (vmode);
22205 emit_insn (gen (dest, op0, op0));
22206 vmode = get_mode_wider_vector (vmode);
22207 op0 = gen_lowpart (vmode, dest);
22208
22209 memset (perm2, elt, 2);
22210 dest = gen_reg_rtx (vmode);
22211 ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
22212 gcc_assert (ok);
22213
22214 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
22215 return true;
22216
a325bdd1 22217 case E_V8QImode:
be8749f9 22218 /* This can be implemented via interleave. We save one insn by
a325bdd1
PB
22219 stopping once we have promoted to V2SImode and then use pshufd. */
22220 if (d->testing_p)
22221 return true;
22222 do
22223 {
a325bdd1
PB
22224 if (elt >= nelt2)
22225 {
22226 gen = vmode == V8QImode ? gen_mmx_punpckhbw
22227 : gen_mmx_punpckhwd;
22228 elt -= nelt2;
22229 }
be8749f9
UB
22230 else
22231 gen = vmode == V8QImode ? gen_mmx_punpcklbw
22232 : gen_mmx_punpcklwd;
a325bdd1
PB
22233 nelt2 /= 2;
22234
22235 dest = gen_reg_rtx (vmode);
22236 emit_insn (gen (dest, op0, op0));
22237 vmode = get_mode_wider_vector (vmode);
22238 op0 = gen_lowpart (vmode, dest);
22239 }
22240 while (vmode != V2SImode);
22241
22242 memset (perm2, elt, 2);
be8749f9 22243 dest = gen_reg_rtx (vmode);
a325bdd1
PB
22244 ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
22245 gcc_assert (ok);
be8749f9
UB
22246
22247 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
a325bdd1
PB
22248 return true;
22249
2bf6d935
ML
22250 case E_V8HImode:
22251 case E_V16QImode:
22252 /* These can be implemented via interleave. We save one insn by
22253 stopping once we have promoted to V4SImode and then use pshufd. */
22254 if (d->testing_p)
22255 return true;
22256 do
22257 {
2bf6d935
ML
22258 if (elt >= nelt2)
22259 {
22260 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
22261 : gen_vec_interleave_highv8hi;
22262 elt -= nelt2;
22263 }
be8749f9
UB
22264 else
22265 gen = vmode == V16QImode ? gen_vec_interleave_lowv16qi
22266 : gen_vec_interleave_lowv8hi;
2bf6d935
ML
22267 nelt2 /= 2;
22268
22269 dest = gen_reg_rtx (vmode);
22270 emit_insn (gen (dest, op0, op0));
22271 vmode = get_mode_wider_vector (vmode);
22272 op0 = gen_lowpart (vmode, dest);
22273 }
22274 while (vmode != V4SImode);
22275
22276 memset (perm2, elt, 4);
be8749f9 22277 dest = gen_reg_rtx (vmode);
2bf6d935
ML
22278 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
22279 gcc_assert (ok);
be8749f9
UB
22280
22281 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
2bf6d935
ML
22282 return true;
22283
7a54d3de 22284 case E_V8HFmode:
092763fd 22285 case E_V8BFmode:
7a54d3de
UB
22286 /* This can be implemented via interleave and pshufd. */
22287 if (d->testing_p)
22288 return true;
22289
092763fd 22290 rtx (*maybe_gen) (machine_mode, int, rtx, rtx, rtx);
7a54d3de
UB
22291 if (elt >= nelt2)
22292 {
092763fd 22293 maybe_gen = maybe_gen_vec_interleave_high;
7a54d3de
UB
22294 elt -= nelt2;
22295 }
22296 else
092763fd 22297 maybe_gen = maybe_gen_vec_interleave_low;
7a54d3de
UB
22298 nelt2 /= 2;
22299
22300 dest = gen_reg_rtx (vmode);
092763fd 22301 emit_insn (maybe_gen (vmode, 1, dest, op0, op0));
7a54d3de
UB
22302
22303 vmode = V4SImode;
22304 op0 = gen_lowpart (vmode, dest);
22305
22306 memset (perm2, elt, 4);
22307 dest = gen_reg_rtx (vmode);
22308 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
22309 gcc_assert (ok);
22310
22311 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
22312 return true;
22313
2bf6d935
ML
22314 case E_V32QImode:
22315 case E_V16HImode:
22316 case E_V8SImode:
22317 case E_V4DImode:
22318 /* For AVX2 broadcasts of the first element vpbroadcast* or
22319 vpermq should be used by expand_vec_perm_1. */
22320 gcc_assert (!TARGET_AVX2 || d->perm[0]);
22321 return false;
22322
240f0780
JJ
22323 case E_V64QImode:
22324 gcc_assert (!TARGET_AVX512BW || d->perm[0]);
22325 return false;
22326
04b4f315
JJ
22327 case E_V32HImode:
22328 gcc_assert (!TARGET_AVX512BW);
22329 return false;
22330
2bf6d935
ML
22331 default:
22332 gcc_unreachable ();
22333 }
22334}
22335
4bf4c103 22336/* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
2bf6d935
ML
22337 broadcast permutations. */
22338
22339static bool
22340expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
22341{
22342 unsigned i, elt, nelt = d->nelt;
22343
22344 if (!d->one_operand_p)
22345 return false;
22346
22347 elt = d->perm[0];
22348 for (i = 1; i < nelt; ++i)
22349 if (d->perm[i] != elt)
22350 return false;
22351
22352 return expand_vec_perm_broadcast_1 (d);
22353}
22354
22355/* Implement arbitrary permutations of two V64QImode operands
22356 with 2 vperm[it]2w, 2 vpshufb and one vpor instruction. */
22357static bool
22358expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
22359{
22360 if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
22361 return false;
22362
22363 if (d->testing_p)
22364 return true;
22365
22366 struct expand_vec_perm_d ds[2];
22367 rtx rperm[128], vperm, target0, target1;
22368 unsigned int i, nelt;
22369 machine_mode vmode;
22370
22371 nelt = d->nelt;
22372 vmode = V64QImode;
22373
22374 for (i = 0; i < 2; i++)
22375 {
22376 ds[i] = *d;
22377 ds[i].vmode = V32HImode;
22378 ds[i].nelt = 32;
22379 ds[i].target = gen_reg_rtx (V32HImode);
22380 ds[i].op0 = gen_lowpart (V32HImode, d->op0);
22381 ds[i].op1 = gen_lowpart (V32HImode, d->op1);
22382 }
22383
22384   /* Prepare permutations such that the first one takes care of
22385      putting the even bytes into the right positions or one position
22386      higher (ds[0]) and the second one takes care of putting the
22387      odd bytes into the right positions or one position lower
22388      (ds[1]).  */
22389
22390 for (i = 0; i < nelt; i++)
22391 {
22392 ds[i & 1].perm[i / 2] = d->perm[i] / 2;
22393 if (i & 1)
22394 {
22395 rperm[i] = constm1_rtx;
22396 rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
22397 }
22398 else
22399 {
22400 rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
22401 rperm[i + 64] = constm1_rtx;
22402 }
22403 }
22404
22405 bool ok = expand_vec_perm_1 (&ds[0]);
22406 gcc_assert (ok);
22407 ds[0].target = gen_lowpart (V64QImode, ds[0].target);
22408
22409 ok = expand_vec_perm_1 (&ds[1]);
22410 gcc_assert (ok);
22411 ds[1].target = gen_lowpart (V64QImode, ds[1].target);
22412
22413 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
22414 vperm = force_reg (vmode, vperm);
22415 target0 = gen_reg_rtx (V64QImode);
22416 emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
22417
22418 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
22419 vperm = force_reg (vmode, vperm);
22420 target1 = gen_reg_rtx (V64QImode);
22421 emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
22422
22423 emit_insn (gen_iorv64qi3 (d->target, target0, target1));
22424 return true;
22425}
22426
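The index arithmetic above may be easier to follow one element at a time: destination byte I must come from source byte PERM[I], so the word permutation for the half selected by I & 1 fetches source word PERM[I] / 2 into word position I / 2, and the following vpshufb keeps byte (I & 14) + (PERM[I] & 1) of that half, while the matching entry for the other half is -1 so the final vpor merges exactly one contribution per byte.  A small sketch of that mapping with one worked value (hypothetical helper, not GCC code):

/* Mask entries produced for destination byte I of a 64-byte permutation.  */
static void
vpermt2w_pshufb_entries (const unsigned char *perm, unsigned i,
                         unsigned *word_slot, unsigned *word_src,
                         unsigned *byte_idx)
{
  *word_slot = i / 2;                      /* slot written in ds[i & 1].perm */
  *word_src = perm[i] / 2;                 /* word fetched by vperm[it]2w    */
  *byte_idx = (i & 14) + (perm[i] & 1);    /* byte kept by vpshufb           */
  /* Example: i == 5, perm[5] == 19 gives ds[1].perm[2] = 9 and a vpshufb
     index of 5, so the high byte of source word 9 ends up in byte 5.  */
}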
22427/* Implement arbitrary permutation of two V32QImode and V16QImode operands
22428 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
22429 all the shorter instruction sequences. */
22430
22431static bool
22432expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
22433{
22434 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
22435 unsigned int i, nelt, eltsz;
22436 bool used[4];
22437
22438 if (!TARGET_AVX2
22439 || d->one_operand_p
22440 || (d->vmode != V32QImode && d->vmode != V16HImode))
22441 return false;
22442
22443 if (d->testing_p)
22444 return true;
22445
22446 nelt = d->nelt;
22447 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
22448
22449   /* Generate 4 permutation masks.  If the required element is within
22450      the same lane, it is shuffled in.  If the required element comes
22451      from the other lane, force a zero by setting bit 7 in the permutation
22452      mask.  The other mask has non-negative elements where an element is
22453      requested from the other lane, but that element is also moved to the
22454      other lane, so that the result of vpshufb can have the two V2TImode
22455      halves swapped.  */
22456 m128 = GEN_INT (-128);
22457 for (i = 0; i < 32; ++i)
22458 {
22459 rperm[0][i] = m128;
22460 rperm[1][i] = m128;
22461 rperm[2][i] = m128;
22462 rperm[3][i] = m128;
22463 }
22464 used[0] = false;
22465 used[1] = false;
22466 used[2] = false;
22467 used[3] = false;
22468 for (i = 0; i < nelt; ++i)
22469 {
22470 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
22471 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
22472 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
22473
22474 for (j = 0; j < eltsz; ++j)
22475 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
22476 used[which] = true;
22477 }
22478
22479 for (i = 0; i < 2; ++i)
22480 {
22481 if (!used[2 * i + 1])
22482 {
22483 h[i] = NULL_RTX;
22484 continue;
22485 }
22486 vperm = gen_rtx_CONST_VECTOR (V32QImode,
22487 gen_rtvec_v (32, rperm[2 * i + 1]));
22488 vperm = force_reg (V32QImode, vperm);
22489 h[i] = gen_reg_rtx (V32QImode);
22490 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
22491 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
22492 }
22493
22494   /* Swap the 128-bit lanes of h[X].  */
22495 for (i = 0; i < 2; ++i)
22496 {
22497 if (h[i] == NULL_RTX)
22498 continue;
22499 op = gen_reg_rtx (V4DImode);
22500 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
22501 const2_rtx, GEN_INT (3), const0_rtx,
22502 const1_rtx));
22503 h[i] = gen_lowpart (V32QImode, op);
22504 }
22505
22506 for (i = 0; i < 2; ++i)
22507 {
22508 if (!used[2 * i])
22509 {
22510 l[i] = NULL_RTX;
22511 continue;
22512 }
22513 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
22514 vperm = force_reg (V32QImode, vperm);
22515 l[i] = gen_reg_rtx (V32QImode);
22516 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
22517 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
22518 }
22519
22520 for (i = 0; i < 2; ++i)
22521 {
22522 if (h[i] && l[i])
22523 {
22524 op = gen_reg_rtx (V32QImode);
22525 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
22526 l[i] = op;
22527 }
22528 else if (h[i])
22529 l[i] = h[i];
22530 }
22531
22532 gcc_assert (l[0] && l[1]);
22533 op = d->target;
22534 if (d->vmode != V32QImode)
22535 op = gen_reg_rtx (V32QImode);
22536 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
22537 if (op != d->target)
22538 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
22539 return true;
22540}
22541
22542/* The guts of ix86_vectorize_vec_perm_const. With all of the interface bits
22543 taken care of, perform the expansion in D and return true on success. */
22544
22545static bool
22546ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
22547{
22548 /* Try a single instruction expansion. */
22549 if (expand_vec_perm_1 (d))
22550 return true;
22551
22552 /* Try sequences of two instructions. */
22553
22554 if (expand_vec_perm_pshuflw_pshufhw (d))
22555 return true;
22556
22557 if (expand_vec_perm_palignr (d, false))
22558 return true;
22559
22560 if (expand_vec_perm_interleave2 (d))
22561 return true;
22562
22563 if (expand_vec_perm_broadcast (d))
22564 return true;
22565
22566 if (expand_vec_perm_vpermq_perm_1 (d))
22567 return true;
22568
22569 if (expand_vec_perm_vperm2f128 (d))
22570 return true;
22571
22572 if (expand_vec_perm_pblendv (d))
22573 return true;
22574
829c4bea
JJ
22575 if (expand_vec_perm_2perm_interleave (d, true))
22576 return true;
22577
22578 if (expand_vec_perm_2perm_pblendv (d, true))
22579 return true;
22580
3db8e9c2 22581 if (expand_vec_perm_shufps_shufps (d))
22582 return true;
22583
2bf6d935
ML
22584 /* Try sequences of three instructions. */
22585
22586 if (expand_vec_perm_even_odd_pack (d))
22587 return true;
22588
22589 if (expand_vec_perm_2vperm2f128_vshuf (d))
22590 return true;
22591
22592 if (expand_vec_perm_pshufb2 (d))
22593 return true;
22594
fcda0efc 22595 if (expand_vec_perm_pslldq_psrldq_por (d, false))
22596 return true;
22597
2bf6d935
ML
22598 if (expand_vec_perm_interleave3 (d))
22599 return true;
22600
22601 if (expand_vec_perm_vperm2f128_vblend (d))
22602 return true;
22603
829c4bea
JJ
22604 if (expand_vec_perm_2perm_interleave (d, false))
22605 return true;
22606
22607 if (expand_vec_perm_2perm_pblendv (d, false))
22608 return true;
22609
2bf6d935
ML
22610 /* Try sequences of four instructions. */
22611
22612 if (expand_vec_perm_even_odd_trunc (d))
22613 return true;
22614 if (expand_vec_perm_vpshufb2_vpermq (d))
22615 return true;
22616
22617 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
22618 return true;
22619
22620 if (expand_vec_perm_vpermt2_vpshub2 (d))
22621 return true;
22622
22623 /* ??? Look for narrow permutations whose element orderings would
22624 allow the promotion to a wider mode. */
22625
22626 /* ??? Look for sequences of interleave or a wider permute that place
22627 the data into the correct lanes for a half-vector shuffle like
22628 pshuf[lh]w or vpermilps. */
22629
22630 /* ??? Look for sequences of interleave that produce the desired results.
22631 The combinatorics of punpck[lh] get pretty ugly... */
22632
22633 if (expand_vec_perm_even_odd (d))
22634 return true;
22635
fcda0efc 22636 /* Generate four or five instructions. */
22637 if (expand_vec_perm_pslldq_psrldq_por (d, true))
22638 return true;
22639
2bf6d935
ML
22640 /* Even longer sequences. */
22641 if (expand_vec_perm_vpshufb4_vpermq2 (d))
22642 return true;
22643
22644 /* See if we can get the same permutation in different vector integer
22645 mode. */
22646 struct expand_vec_perm_d nd;
22647 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
22648 {
22649 if (!d->testing_p)
22650 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
22651 return true;
22652 }
22653
4bf4c103
JJ
22654 /* Even longer, including recursion to ix86_expand_vec_perm_const_1. */
22655 if (expand_vec_perm2_vperm2f128_vblend (d))
22656 return true;
22657
2bf6d935
ML
22658 return false;
22659}
22660
22661/* If a permutation only uses one operand, make it clear. Returns true
22662 if the permutation references both operands. */
22663
22664static bool
22665canonicalize_perm (struct expand_vec_perm_d *d)
22666{
22667 int i, which, nelt = d->nelt;
22668
22669 for (i = which = 0; i < nelt; ++i)
4bf4c103 22670 which |= (d->perm[i] < nelt ? 1 : 2);
2bf6d935
ML
22671
22672 d->one_operand_p = true;
22673 switch (which)
22674 {
22675 default:
22676 gcc_unreachable();
22677
22678 case 3:
22679 if (!rtx_equal_p (d->op0, d->op1))
22680 {
22681 d->one_operand_p = false;
22682 break;
22683 }
22684 /* The elements of PERM do not suggest that only the first operand
22685 is used, but both operands are identical. Allow easier matching
22686 of the permutation by folding the permutation into the single
22687 input vector. */
22688 /* FALLTHRU */
22689
22690 case 2:
22691 for (i = 0; i < nelt; ++i)
22692 d->perm[i] &= nelt - 1;
22693 d->op0 = d->op1;
22694 break;
22695
22696 case 1:
22697 d->op1 = d->op0;
22698 break;
22699 }
22700
22701 return (which == 3);
22702}
22703
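A short worked example of the folding above: with nelt == 4, a selector { 4, 6, 5, 7 } references only the second operand (which == 2), so the indices are masked down to { 0, 2, 1, 3 } and op0 is replaced by op1, and the function reports false because only one input is really needed.  A scalar sketch of the same bookkeeping (hedged illustration; the helper name is invented):

/* Reference model of canonicalize_perm's index folding.  Returns nonzero
   iff the permutation really needs both operands.  */
static int
fold_one_operand_perm (unsigned char *perm, unsigned nelt)
{
  unsigned i, which = 0;
  for (i = 0; i < nelt; i++)
    which |= perm[i] < nelt ? 1 : 2;
  if (which == 2)                 /* only the second operand is used */
    for (i = 0; i < nelt; i++)
      perm[i] &= nelt - 1;        /* e.g. { 4, 6, 5, 7 } -> { 0, 2, 1, 3 } */
  return which == 3;
}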
22704/* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
22705
22706bool
ae8decf1
PK
22707ix86_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
22708 rtx target, rtx op0, rtx op1,
22709 const vec_perm_indices &sel)
2bf6d935 22710{
ae8decf1
PK
22711 if (vmode != op_mode)
22712 return false;
22713
2bf6d935
ML
22714 struct expand_vec_perm_d d;
22715 unsigned char perm[MAX_VECT_LEN];
22716 unsigned int i, nelt, which;
22717 bool two_args;
22718
be072bfa
HW
22719 /* For HF mode vector, convert it to HI using subreg. */
22720 if (GET_MODE_INNER (vmode) == HFmode)
22721 {
22722 machine_mode orig_mode = vmode;
22723 vmode = mode_for_vector (HImode,
22724 GET_MODE_NUNITS (vmode)).require ();
22725 if (target)
22726 target = lowpart_subreg (vmode, target, orig_mode);
22727 if (op0)
22728 op0 = lowpart_subreg (vmode, op0, orig_mode);
22729 if (op1)
22730 op1 = lowpart_subreg (vmode, op1, orig_mode);
22731 }
22732
2bf6d935
ML
22733 d.target = target;
22734 d.op0 = op0;
22735 d.op1 = op1;
22736
22737 d.vmode = vmode;
22738 gcc_assert (VECTOR_MODE_P (d.vmode));
22739 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
22740 d.testing_p = !target;
22741
22742 gcc_assert (sel.length () == nelt);
22743 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
22744
22745 /* Given sufficient ISA support we can just return true here
22746 for selected vector modes. */
22747 switch (d.vmode)
22748 {
22749 case E_V16SFmode:
22750 case E_V16SImode:
22751 case E_V8DImode:
22752 case E_V8DFmode:
22753 if (!TARGET_AVX512F)
22754 return false;
22755 /* All implementable with a single vperm[it]2 insn. */
22756 if (d.testing_p)
22757 return true;
22758 break;
22759 case E_V32HImode:
50b58779 22760 if (!TARGET_AVX512F)
2bf6d935 22761 return false;
50b58779 22762 if (d.testing_p && TARGET_AVX512BW)
2bf6d935
ML
22763 /* All implementable with a single vperm[it]2 insn. */
22764 return true;
22765 break;
22766 case E_V64QImode:
50b58779 22767 if (!TARGET_AVX512F)
2bf6d935 22768 return false;
50b58779 22769 if (d.testing_p && TARGET_AVX512BW)
2bf6d935
ML
22770 /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */
22771 return true;
22772 break;
22773 case E_V8SImode:
22774 case E_V8SFmode:
22775 case E_V4DFmode:
22776 case E_V4DImode:
22777 if (!TARGET_AVX)
22778 return false;
22779 if (d.testing_p && TARGET_AVX512VL)
22780 /* All implementable with a single vperm[it]2 insn. */
22781 return true;
22782 break;
22783 case E_V16HImode:
22784 if (!TARGET_SSE2)
22785 return false;
22786 if (d.testing_p && TARGET_AVX2)
22787 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
22788 return true;
22789 break;
22790 case E_V32QImode:
22791 if (!TARGET_SSE2)
22792 return false;
22793 if (d.testing_p && TARGET_AVX2)
22794 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
22795 return true;
22796 break;
22797 case E_V8HImode:
22798 case E_V16QImode:
22799 if (!TARGET_SSE2)
22800 return false;
22801 /* Fall through. */
22802 case E_V4SImode:
22803 case E_V4SFmode:
22804 if (!TARGET_SSE)
22805 return false;
22806 /* All implementable with a single vpperm insn. */
22807 if (d.testing_p && TARGET_XOP)
22808 return true;
22809 /* All implementable with 2 pshufb + 1 ior. */
22810 if (d.testing_p && TARGET_SSSE3)
22811 return true;
22812 break;
240198fe 22813 case E_V2SFmode:
9b8579a6
UB
22814 case E_V2SImode:
22815 case E_V4HImode:
a325bdd1 22816 case E_V8QImode:
9b8579a6
UB
22817 if (!TARGET_MMX_WITH_SSE)
22818 return false;
22819 break;
8d7dae0e 22820 case E_V2HImode:
4986946f
UB
22821 if (!TARGET_SSE2)
22822 return false;
22823 /* All implementable with *punpckwd. */
22824 if (d.testing_p)
22825 return true;
22826 break;
be8749f9
UB
22827 case E_V4QImode:
22828 if (!TARGET_SSE2)
22829 return false;
22830 break;
2bf6d935
ML
22831 case E_V2DImode:
22832 case E_V2DFmode:
22833 if (!TARGET_SSE)
22834 return false;
22835 /* All implementable with shufpd or unpck[lh]pd. */
22836 if (d.testing_p)
22837 return true;
22838 break;
22839 default:
22840 return false;
22841 }
22842
22843 for (i = which = 0; i < nelt; ++i)
22844 {
22845 unsigned char e = sel[i];
22846 gcc_assert (e < 2 * nelt);
22847 d.perm[i] = e;
22848 perm[i] = e;
22849 which |= (e < nelt ? 1 : 2);
22850 }
22851
22852 if (d.testing_p)
22853 {
22854       /* If all elements are from the second vector, fold them onto the first.  */
22855 if (which == 2)
22856 for (i = 0; i < nelt; ++i)
22857 d.perm[i] -= nelt;
22858
22859 /* Check whether the mask can be applied to the vector type. */
22860 d.one_operand_p = (which != 3);
22861
8d7dae0e 22862 /* Implementable with shufps, pshufd or pshuflw. */
9b8579a6 22863 if (d.one_operand_p
240198fe 22864 && (d.vmode == V4SFmode || d.vmode == V2SFmode
8d7dae0e
UB
22865 || d.vmode == V4SImode || d.vmode == V2SImode
22866 || d.vmode == V4HImode || d.vmode == V2HImode))
2bf6d935
ML
22867 return true;
22868
22869 /* Otherwise we have to go through the motions and see if we can
22870 figure out how to generate the requested permutation. */
22871 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
22872 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
22873 if (!d.one_operand_p)
22874 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
22875
22876 start_sequence ();
22877 bool ret = ix86_expand_vec_perm_const_1 (&d);
22878 end_sequence ();
22879
22880 return ret;
22881 }
22882
22883 two_args = canonicalize_perm (&d);
22884
b1d1e2b5
JJ
22885 /* If one of the operands is a zero vector, try to match pmovzx. */
22886 if (two_args && (d.op0 == CONST0_RTX (vmode) || d.op1 == CONST0_RTX (vmode)))
22887 {
22888 struct expand_vec_perm_d dzero = d;
22889 if (d.op0 == CONST0_RTX (vmode))
22890 {
22891 d.op1 = dzero.op1 = force_reg (vmode, d.op1);
22892 std::swap (dzero.op0, dzero.op1);
22893 for (i = 0; i < nelt; ++i)
22894 dzero.perm[i] ^= nelt;
22895 }
22896 else
22897 d.op0 = dzero.op0 = force_reg (vmode, d.op0);
22898
22899 if (expand_vselect_vconcat (dzero.target, dzero.op0, dzero.op1,
22900 dzero.perm, nelt, dzero.testing_p))
22901 return true;
22902 }
22903
22904 /* Force operands into registers. */
22905 rtx nop0 = force_reg (vmode, d.op0);
22906 if (d.op0 == d.op1)
22907 d.op1 = nop0;
22908 d.op0 = nop0;
22909 d.op1 = force_reg (vmode, d.op1);
22910
2bf6d935
ML
22911 if (ix86_expand_vec_perm_const_1 (&d))
22912 return true;
22913
22914   /* If the selector says both arguments are needed, but the operands are the
22915      same, the above tried to expand with one_operand_p and a flattened selector.
22916      If that didn't work, retry without one_operand_p; we succeeded with that
22917      during testing.  */
22918 if (two_args && d.one_operand_p)
22919 {
22920 d.one_operand_p = false;
22921 memcpy (d.perm, perm, sizeof (perm));
22922 return ix86_expand_vec_perm_const_1 (&d);
22923 }
22924
22925 return false;
22926}
22927
22928void
22929ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
22930{
22931 struct expand_vec_perm_d d;
22932 unsigned i, nelt;
22933
22934 d.target = targ;
22935 d.op0 = op0;
22936 d.op1 = op1;
22937 d.vmode = GET_MODE (targ);
22938 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
22939 d.one_operand_p = false;
22940 d.testing_p = false;
22941
22942 for (i = 0; i < nelt; ++i)
22943 d.perm[i] = i * 2 + odd;
22944
22945 /* We'll either be able to implement the permutation directly... */
22946 if (expand_vec_perm_1 (&d))
22947 return;
22948
22949 /* ... or we use the special-case patterns. */
22950 expand_vec_perm_even_odd_1 (&d, odd);
22951}
22952
22953static void
22954ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
22955{
22956 struct expand_vec_perm_d d;
22957 unsigned i, nelt, base;
22958 bool ok;
22959
22960 d.target = targ;
22961 d.op0 = op0;
22962 d.op1 = op1;
22963 d.vmode = GET_MODE (targ);
22964 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
22965 d.one_operand_p = false;
22966 d.testing_p = false;
22967
22968 base = high_p ? nelt / 2 : 0;
22969 for (i = 0; i < nelt / 2; ++i)
22970 {
22971 d.perm[i * 2] = i + base;
22972 d.perm[i * 2 + 1] = i + base + nelt;
22973 }
22974
22975 /* Note that for AVX this isn't one instruction. */
22976 ok = ix86_expand_vec_perm_const_1 (&d);
22977 gcc_assert (ok);
22978}
22979
3bd86940 22980/* This function is similar to ix86_expand_vecop_qihi,
22981 but optimized under AVX512BW by using vpmovwb.
22982   For example, it optimizes vector MUL generation like
54cdb2f5 22983
22984 vpmovzxbw ymm2, xmm0
22985 vpmovzxbw ymm3, xmm1
22986 vpmullw ymm4, ymm2, ymm3
22987 vpmovwb xmm0, ymm4
22988
22989   it would take fewer instructions than ix86_expand_vecop_qihi.
22990   Return true on success.  */
22991
3bd86940 22992static bool
22993ix86_expand_vecop_qihi2 (enum rtx_code code, rtx dest, rtx op1, rtx op2)
54cdb2f5 22994{
22995 machine_mode himode, qimode = GET_MODE (dest);
22996 rtx hop1, hop2, hdest;
22997 rtx (*gen_extend)(rtx, rtx);
22998 rtx (*gen_truncate)(rtx, rtx);
3bd86940 22999 bool uns_p = (code == ASHIFTRT) ? false : true;
54cdb2f5 23000
23001 /* There's no V64HImode multiplication instruction. */
23002 if (qimode == E_V64QImode)
23003 return false;
23004
23005 /* vpmovwb only available under AVX512BW. */
23006 if (!TARGET_AVX512BW)
23007 return false;
23008 if ((qimode == V8QImode || qimode == V16QImode)
23009 && !TARGET_AVX512VL)
23010 return false;
23011   /* Do not generate zmm instructions when 128/256-bit vector width is preferred.  */
23012 if (qimode == V32QImode
23013 && (TARGET_PREFER_AVX128 || TARGET_PREFER_AVX256))
23014 return false;
23015
23016 switch (qimode)
23017 {
23018 case E_V8QImode:
23019 himode = V8HImode;
3bd86940 23020 gen_extend = uns_p ? gen_zero_extendv8qiv8hi2 : gen_extendv8qiv8hi2;
54cdb2f5 23021 gen_truncate = gen_truncv8hiv8qi2;
23022 break;
23023 case E_V16QImode:
23024 himode = V16HImode;
3bd86940 23025 gen_extend = uns_p ? gen_zero_extendv16qiv16hi2 : gen_extendv16qiv16hi2;
54cdb2f5 23026 gen_truncate = gen_truncv16hiv16qi2;
23027 break;
23028 case E_V32QImode:
23029 himode = V32HImode;
3bd86940 23030 gen_extend = uns_p ? gen_zero_extendv32qiv32hi2 : gen_extendv32qiv32hi2;
54cdb2f5 23031 gen_truncate = gen_truncv32hiv32qi2;
23032 break;
23033 default:
23034 gcc_unreachable ();
23035 }
23036
23037 hop1 = gen_reg_rtx (himode);
23038 hop2 = gen_reg_rtx (himode);
23039 hdest = gen_reg_rtx (himode);
23040 emit_insn (gen_extend (hop1, op1));
23041 emit_insn (gen_extend (hop2, op2));
3bd86940 23042 emit_insn (gen_rtx_SET (hdest, simplify_gen_binary (code, himode,
54cdb2f5 23043 hop1, hop2)));
23044 emit_insn (gen_truncate (dest, hdest));
23045 return true;
23046}
2bf6d935 23047
c7199fb6 23048/* Expand a vector shift by a constant for a V*QImode in terms of the
23049   same operation on V*HImode.  Return true on success.  */
3bd86940 23050static bool
23051ix86_expand_vec_shift_qihi_constant (enum rtx_code code,
23052 rtx dest, rtx op1, rtx op2)
c7199fb6 23053{
23054 machine_mode qimode, himode;
c44c2a3b 23055 HOST_WIDE_INT and_constant, xor_constant;
c7199fb6 23056 HOST_WIDE_INT shift_amount;
23057 rtx vec_const_and, vec_const_xor;
23058 rtx tmp, op1_subreg;
23059 rtx (*gen_shift) (rtx, rtx, rtx);
23060 rtx (*gen_and) (rtx, rtx, rtx);
23061 rtx (*gen_xor) (rtx, rtx, rtx);
23062 rtx (*gen_sub) (rtx, rtx, rtx);
23063
23064 /* Only optimize shift by constant. */
23065 if (!CONST_INT_P (op2))
23066 return false;
23067
23068 qimode = GET_MODE (dest);
23069 shift_amount = INTVAL (op2);
23070   /* Do nothing when the shift amount is greater than or equal to 8.  */
23071 if (shift_amount > 7)
23072 return false;
23073
23074 gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
23075 /* Record sign bit. */
23076 xor_constant = 1 << (8 - shift_amount - 1);
23077
23078   /* Zero the upper/lower bits that are shifted in from the adjacent byte of the word.  */
23079 and_constant
23080 = (code == ASHIFT ? 256 - (1 << shift_amount)
23081 : (1 << (8 - shift_amount)) - 1);
23082
23083 switch (qimode)
23084 {
23085 case V16QImode:
23086 himode = V8HImode;
23087 gen_shift =
23088 ((code == ASHIFT)
23089 ? gen_ashlv8hi3
23090 : (code == ASHIFTRT) ? gen_ashrv8hi3 : gen_lshrv8hi3);
23091 gen_and = gen_andv16qi3;
23092 gen_xor = gen_xorv16qi3;
23093 gen_sub = gen_subv16qi3;
23094 break;
23095 case V32QImode:
23096 himode = V16HImode;
23097 gen_shift =
23098 ((code == ASHIFT)
23099 ? gen_ashlv16hi3
23100 : (code == ASHIFTRT) ? gen_ashrv16hi3 : gen_lshrv16hi3);
23101 gen_and = gen_andv32qi3;
23102 gen_xor = gen_xorv32qi3;
23103 gen_sub = gen_subv32qi3;
23104 break;
23105 case V64QImode:
23106 himode = V32HImode;
23107 gen_shift =
23108 ((code == ASHIFT)
23109 ? gen_ashlv32hi3
23110 : (code == ASHIFTRT) ? gen_ashrv32hi3 : gen_lshrv32hi3);
23111 gen_and = gen_andv64qi3;
23112 gen_xor = gen_xorv64qi3;
23113 gen_sub = gen_subv64qi3;
23114 break;
23115 default:
23116 gcc_unreachable ();
23117 }
23118
23119 tmp = gen_reg_rtx (himode);
23120 vec_const_and = gen_reg_rtx (qimode);
23121 op1_subreg = lowpart_subreg (himode, op1, qimode);
23122
23123 /* For ASHIFT and LSHIFTRT, perform operation like
23124 vpsllw/vpsrlw $shift_amount, %op1, %dest.
23125 vpand %vec_const_and, %dest. */
23126 emit_insn (gen_shift (tmp, op1_subreg, op2));
23127 emit_move_insn (dest, simplify_gen_subreg (qimode, tmp, himode, 0));
23128 emit_move_insn (vec_const_and,
23129 ix86_build_const_vector (qimode, true,
c44c2a3b 23130 gen_int_mode (and_constant, QImode)));
c7199fb6 23131 emit_insn (gen_and (dest, dest, vec_const_and));
23132
23133 /* For ASHIFTRT, perform extra operation like
23134 vpxor %vec_const_xor, %dest, %dest
23135 vpsubb %vec_const_xor, %dest, %dest */
23136 if (code == ASHIFTRT)
23137 {
23138 vec_const_xor = gen_reg_rtx (qimode);
23139 emit_move_insn (vec_const_xor,
23140 ix86_build_const_vector (qimode, true,
c44c2a3b 23141 gen_int_mode (xor_constant, QImode)));
c7199fb6 23142 emit_insn (gen_xor (dest, dest, vec_const_xor));
23143 emit_insn (gen_sub (dest, dest, vec_const_xor));
23144 }
23145 return true;
23146}
23147
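The ASHIFTRT path above relies on a standard sign-extension identity: after the word shift and the AND mask, each byte holds the logical right shift of the original byte, and XOR-ing with, then subtracting, xor_constant (the sign bit moved down by the shift amount) restores the sign bits.  A minimal scalar sketch of that identity for one byte (the helper name is hypothetical):

#include <stdint.h>

/* Scalar model of the per-byte arithmetic right shift built above,
   for 0 < S < 8.  */
static int8_t
byte_ashiftrt (int8_t x, unsigned s)
{
  uint8_t logical = (uint8_t) x >> s;            /* vpsrlw + vpand     */
  uint8_t sign = (uint8_t) (0x80 >> s);          /* xor_constant       */
  return (int8_t) ((logical ^ sign) - sign);     /* vpxor, then vpsubb */
}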
2bf6d935
ML
23148/* Expand a vector operation CODE for a V*QImode in terms of the
23149 same operation on V*HImode. */
23150
23151void
23152ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
23153{
23154 machine_mode qimode = GET_MODE (dest);
23155 machine_mode himode;
23156 rtx (*gen_il) (rtx, rtx, rtx);
23157 rtx (*gen_ih) (rtx, rtx, rtx);
23158 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
23159 struct expand_vec_perm_d d;
23160 bool ok, full_interleave;
23161 bool uns_p = false;
23162 int i;
23163
3bd86940 23164 if (CONST_INT_P (op2)
23165 && (code == ASHIFT || code == LSHIFTRT || code == ASHIFTRT)
23166 && ix86_expand_vec_shift_qihi_constant (code, dest, op1, op2))
23167 return;
23168
23169 if (TARGET_AVX512BW
23170 && VECTOR_MODE_P (GET_MODE (op2))
23171 && ix86_expand_vecop_qihi2 (code, dest, op1, op2))
23172 return;
23173
2bf6d935
ML
23174 switch (qimode)
23175 {
23176 case E_V16QImode:
23177 himode = V8HImode;
23178 gen_il = gen_vec_interleave_lowv16qi;
23179 gen_ih = gen_vec_interleave_highv16qi;
23180 break;
23181 case E_V32QImode:
23182 himode = V16HImode;
23183 gen_il = gen_avx2_interleave_lowv32qi;
23184 gen_ih = gen_avx2_interleave_highv32qi;
23185 break;
23186 case E_V64QImode:
23187 himode = V32HImode;
23188 gen_il = gen_avx512bw_interleave_lowv64qi;
23189 gen_ih = gen_avx512bw_interleave_highv64qi;
23190 break;
23191 default:
23192 gcc_unreachable ();
23193 }
23194
2bf6d935
ML
23195 switch (code)
23196 {
23197 case MULT:
23198 /* Unpack data such that we've got a source byte in each low byte of
23199 each word. We don't care what goes into the high byte of each word.
23200 Rather than trying to get zero in there, most convenient is to let
23201 it be a copy of the low byte. */
23202 op2_l = gen_reg_rtx (qimode);
23203 op2_h = gen_reg_rtx (qimode);
23204 emit_insn (gen_il (op2_l, op2, op2));
23205 emit_insn (gen_ih (op2_h, op2, op2));
23206
23207 op1_l = gen_reg_rtx (qimode);
23208 op1_h = gen_reg_rtx (qimode);
23209 emit_insn (gen_il (op1_l, op1, op1));
23210 emit_insn (gen_ih (op1_h, op1, op1));
23211 full_interleave = qimode == V16QImode;
23212 break;
23213
23214 case ASHIFT:
23215 case LSHIFTRT:
23216 uns_p = true;
23217 /* FALLTHRU */
23218 case ASHIFTRT:
23219 op1_l = gen_reg_rtx (himode);
23220 op1_h = gen_reg_rtx (himode);
23221 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
23222 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
3bd86940 23223 /* vashr/vlshr/vashl */
23224 if (GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT)
23225 {
23226 rtx tmp = force_reg (qimode, op2);
23227 op2_l = gen_reg_rtx (himode);
23228 op2_h = gen_reg_rtx (himode);
23229 ix86_expand_sse_unpack (op2_l, tmp, uns_p, false);
23230 ix86_expand_sse_unpack (op2_h, tmp, uns_p, true);
23231 }
23232 else
23233 op2_l = op2_h = op2;
23234
2bf6d935
ML
23235 full_interleave = true;
23236 break;
23237 default:
23238 gcc_unreachable ();
23239 }
23240
3bd86940 23241 /* Perform vashr/vlshr/vashl. */
23242 if (code != MULT
23243 && GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT)
23244 {
23245 res_l = gen_reg_rtx (himode);
23246 res_h = gen_reg_rtx (himode);
23247 emit_insn (gen_rtx_SET (res_l,
23248 simplify_gen_binary (code, himode,
23249 op1_l, op2_l)));
23250 emit_insn (gen_rtx_SET (res_h,
23251 simplify_gen_binary (code, himode,
23252 op1_h, op2_h)));
23253 }
23254   /* Perform mult/ashr/lshr/ashl.  */
23255 else
23256 {
23257 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
23258 1, OPTAB_DIRECT);
23259 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
23260 1, OPTAB_DIRECT);
23261 }
23262
2bf6d935
ML
23263 gcc_assert (res_l && res_h);
23264
23265 /* Merge the data back into the right place. */
23266 d.target = dest;
23267 d.op0 = gen_lowpart (qimode, res_l);
23268 d.op1 = gen_lowpart (qimode, res_h);
23269 d.vmode = qimode;
23270 d.nelt = GET_MODE_NUNITS (qimode);
23271 d.one_operand_p = false;
23272 d.testing_p = false;
23273
23274 if (full_interleave)
23275 {
23276       /* For SSE2, we used a full interleave, so the desired
23277 results are in the even elements. */
23278 for (i = 0; i < d.nelt; ++i)
23279 d.perm[i] = i * 2;
23280 }
23281 else
23282 {
23283 /* For AVX, the interleave used above was not cross-lane. So the
23284 extraction is evens but with the second and third quarter swapped.
23285 Happily, that is even one insn shorter than even extraction.
23286 For AVX512BW we have 4 lanes. We extract evens from within a lane,
23287 always first from the first and then from the second source operand,
23288       the index bits above the low 4 bits remain the same.
23289 Thus, for d.nelt == 32 we want permutation
23290 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
23291 and for d.nelt == 64 we want permutation
23292 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
23293 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
23294 for (i = 0; i < d.nelt; ++i)
23295 d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
23296 }
23297
23298 ok = ix86_expand_vec_perm_const_1 (&d);
23299 gcc_assert (ok);
23300
23301 set_unique_reg_note (get_last_insn (), REG_EQUAL,
23302 gen_rtx_fmt_ee (code, qimode, op1, op2));
23303}
23304
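For the MULT case the scheme above is exact because byte multiplication only needs the low eight bits of each 16-bit product: (a * b) mod 256 depends only on a mod 256 and b mod 256, so whatever the interleave left in the high byte of each word cannot affect the extracted even bytes.  A one-line scalar check of that fact (illustrative only; the helper name is invented):

#include <stdint.h>

/* The low byte of the widened word product equals the byte product.  */
static uint8_t
byte_mul (uint8_t a, uint8_t b)
{
  uint16_t wide = (uint16_t) a * b;   /* word multiply; only its low byte matters */
  return (uint8_t) wide;              /* even-byte extraction back to QImode      */
}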
23305/* Helper function of ix86_expand_mul_widen_evenodd. Return true
23306 if op is CONST_VECTOR with all odd elements equal to their
23307 preceding element. */
23308
23309static bool
23310const_vector_equal_evenodd_p (rtx op)
23311{
23312 machine_mode mode = GET_MODE (op);
23313 int i, nunits = GET_MODE_NUNITS (mode);
23314 if (GET_CODE (op) != CONST_VECTOR
23315 || nunits != CONST_VECTOR_NUNITS (op))
23316 return false;
23317 for (i = 0; i < nunits; i += 2)
23318 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
23319 return false;
23320 return true;
23321}
23322
23323void
23324ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
23325 bool uns_p, bool odd_p)
23326{
23327 machine_mode mode = GET_MODE (op1);
23328 machine_mode wmode = GET_MODE (dest);
23329 rtx x;
23330 rtx orig_op1 = op1, orig_op2 = op2;
23331
23332 if (!nonimmediate_operand (op1, mode))
23333 op1 = force_reg (mode, op1);
23334 if (!nonimmediate_operand (op2, mode))
23335 op2 = force_reg (mode, op2);
23336
23337 /* We only play even/odd games with vectors of SImode. */
23338 gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
23339
23340 /* If we're looking for the odd results, shift those members down to
23341 the even slots. For some cpus this is faster than a PSHUFD. */
23342 if (odd_p)
23343 {
23344 /* For XOP use vpmacsdqh, but only for smult, as it is only
23345 signed. */
23346 if (TARGET_XOP && mode == V4SImode && !uns_p)
23347 {
23348 x = force_reg (wmode, CONST0_RTX (wmode));
23349 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
23350 return;
23351 }
23352
23353 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
23354 if (!const_vector_equal_evenodd_p (orig_op1))
23355 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
23356 x, NULL, 1, OPTAB_DIRECT);
23357 if (!const_vector_equal_evenodd_p (orig_op2))
23358 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
23359 x, NULL, 1, OPTAB_DIRECT);
23360 op1 = gen_lowpart (mode, op1);
23361 op2 = gen_lowpart (mode, op2);
23362 }
23363
23364 if (mode == V16SImode)
23365 {
23366 if (uns_p)
23367 x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
23368 else
23369 x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
23370 }
23371 else if (mode == V8SImode)
23372 {
23373 if (uns_p)
23374 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
23375 else
23376 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
23377 }
23378 else if (uns_p)
23379 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
23380 else if (TARGET_SSE4_1)
23381 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
23382 else
23383 {
23384 rtx s1, s2, t0, t1, t2;
23385
23386 /* The easiest way to implement this without PMULDQ is to go through
23387 the motions as if we are performing a full 64-bit multiply. With
23388 the exception that we need to do less shuffling of the elements. */
23389
23390 /* Compute the sign-extension, aka highparts, of the two operands. */
23391 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
23392 op1, pc_rtx, pc_rtx);
23393 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
23394 op2, pc_rtx, pc_rtx);
23395
23396 /* Multiply LO(A) * HI(B), and vice-versa. */
23397 t1 = gen_reg_rtx (wmode);
23398 t2 = gen_reg_rtx (wmode);
23399 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
23400 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
23401
23402 /* Multiply LO(A) * LO(B). */
23403 t0 = gen_reg_rtx (wmode);
23404 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
23405
23406 /* Combine and shift the highparts into place. */
23407 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
23408 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
23409 1, OPTAB_DIRECT);
23410
23411 /* Combine high and low parts. */
23412 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
23413 return;
23414 }
23415 emit_insn (x);
23416}
23417
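The PMULDQ-less fallback above uses the usual signed/unsigned product identity: viewing a signed 32-bit value A as unsigned UA, A = UA - 2^32 * [A < 0], so modulo 2^64 the signed product is UA*UB minus (UB*[A<0] + UA*[B<0]) << 32; the pcmpgt masks s1/s2 supply exactly those correction terms (adding the all-ones-masked products, as the sequence does, is the same thing modulo 2^64).  A scalar sketch of the identity (assumed helper name, not GCC code):

#include <stdint.h>

/* Scalar model of the signed widening 32x32->64 multiply built from
   unsigned widening multiplies plus sign corrections.  */
static int64_t
smul32_widen (int32_t a, int32_t b)
{
  uint64_t lo = (uint64_t) (uint32_t) a * (uint32_t) b;   /* t0             */
  uint64_t ca = a < 0 ? (uint32_t) b : 0;                 /* s1 correction  */
  uint64_t cb = b < 0 ? (uint32_t) a : 0;                 /* s2 correction  */
  return (int64_t) (lo - ((ca + cb) << 32));              /* combine, shift */
}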
23418void
23419ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
23420 bool uns_p, bool high_p)
23421{
23422 machine_mode wmode = GET_MODE (dest);
23423 machine_mode mode = GET_MODE (op1);
23424 rtx t1, t2, t3, t4, mask;
23425
23426 switch (mode)
23427 {
23428 case E_V4SImode:
23429 t1 = gen_reg_rtx (mode);
23430 t2 = gen_reg_rtx (mode);
23431 if (TARGET_XOP && !uns_p)
23432 {
23433 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
23434 shuffle the elements once so that all elements are in the right
23435 place for immediate use: { A C B D }. */
23436 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
23437 const1_rtx, GEN_INT (3)));
23438 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
23439 const1_rtx, GEN_INT (3)));
23440 }
23441 else
23442 {
23443 /* Put the elements into place for the multiply. */
23444 ix86_expand_vec_interleave (t1, op1, op1, high_p);
23445 ix86_expand_vec_interleave (t2, op2, op2, high_p);
23446 high_p = false;
23447 }
23448 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
23449 break;
23450
23451 case E_V8SImode:
23452 /* Shuffle the elements between the lanes. After this we
23453 have { A B E F | C D G H } for each operand. */
23454 t1 = gen_reg_rtx (V4DImode);
23455 t2 = gen_reg_rtx (V4DImode);
23456 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
23457 const0_rtx, const2_rtx,
23458 const1_rtx, GEN_INT (3)));
23459 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
23460 const0_rtx, const2_rtx,
23461 const1_rtx, GEN_INT (3)));
23462
23463 /* Shuffle the elements within the lanes. After this we
23464 have { A A B B | C C D D } or { E E F F | G G H H }. */
23465 t3 = gen_reg_rtx (V8SImode);
23466 t4 = gen_reg_rtx (V8SImode);
23467 mask = GEN_INT (high_p
23468 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
23469 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
23470 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
23471 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
23472
23473 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
23474 break;
23475
23476 case E_V8HImode:
23477 case E_V16HImode:
23478 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
23479 uns_p, OPTAB_DIRECT);
23480 t2 = expand_binop (mode,
23481 uns_p ? umul_highpart_optab : smul_highpart_optab,
23482 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
23483 gcc_assert (t1 && t2);
23484
23485 t3 = gen_reg_rtx (mode);
23486 ix86_expand_vec_interleave (t3, t1, t2, high_p);
23487 emit_move_insn (dest, gen_lowpart (wmode, t3));
23488 break;
23489
23490 case E_V16QImode:
23491 case E_V32QImode:
23492 case E_V32HImode:
23493 case E_V16SImode:
23494 case E_V64QImode:
23495 t1 = gen_reg_rtx (wmode);
23496 t2 = gen_reg_rtx (wmode);
23497 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
23498 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
23499
23500 emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
23501 break;
23502
23503 default:
23504 gcc_unreachable ();
23505 }
23506}
23507
23508void
23509ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
23510{
23511 rtx res_1, res_2, res_3, res_4;
23512
23513 res_1 = gen_reg_rtx (V4SImode);
23514 res_2 = gen_reg_rtx (V4SImode);
23515 res_3 = gen_reg_rtx (V2DImode);
23516 res_4 = gen_reg_rtx (V2DImode);
23517 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
23518 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
23519
23520 /* Move the results in element 2 down to element 1; we don't care
23521 what goes in elements 2 and 3. Then we can merge the parts
23522 back together with an interleave.
23523
23524 Note that two other sequences were tried:
23525 (1) Use interleaves at the start instead of psrldq, which allows
23526 us to use a single shufps to merge things back at the end.
23527 (2) Use shufps here to combine the two vectors, then pshufd to
23528 put the elements in the correct order.
23529 In both cases the cost of the reformatting stall was too high
23530 and the overall sequence slower. */
23531
23532 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
23533 const0_rtx, const2_rtx,
23534 const0_rtx, const0_rtx));
23535 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
23536 const0_rtx, const2_rtx,
23537 const0_rtx, const0_rtx));
23538 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
23539
23540 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
23541}
23542
23543void
23544ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
23545{
23546 machine_mode mode = GET_MODE (op0);
23547 rtx t1, t2, t3, t4, t5, t6;
23548
23549 if (TARGET_AVX512DQ && mode == V8DImode)
23550 emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
23551 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
23552 emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
23553 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
23554 emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
23555 else if (TARGET_XOP && mode == V2DImode)
23556 {
23557 /* op1: A,B,C,D, op2: E,F,G,H */
23558 op1 = gen_lowpart (V4SImode, op1);
23559 op2 = gen_lowpart (V4SImode, op2);
23560
23561 t1 = gen_reg_rtx (V4SImode);
23562 t2 = gen_reg_rtx (V4SImode);
23563 t3 = gen_reg_rtx (V2DImode);
23564 t4 = gen_reg_rtx (V2DImode);
23565
23566 /* t1: B,A,D,C */
23567 emit_insn (gen_sse2_pshufd_1 (t1, op1,
23568 GEN_INT (1),
23569 GEN_INT (0),
23570 GEN_INT (3),
23571 GEN_INT (2)));
23572
23573 /* t2: (B*E),(A*F),(D*G),(C*H) */
23574 emit_insn (gen_mulv4si3 (t2, t1, op2));
23575
23576 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
23577 emit_insn (gen_xop_phadddq (t3, t2));
23578
23579 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
23580 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
23581
23582       /* Multiply the lower parts and add everything together.  */
23583 t5 = gen_reg_rtx (V2DImode);
23584 emit_insn (gen_vec_widen_umult_even_v4si (t5,
23585 gen_lowpart (V4SImode, op1),
23586 gen_lowpart (V4SImode, op2)));
8ba6ea87 23587 force_expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
2bf6d935
ML
23588 }
23589 else
23590 {
23591 machine_mode nmode;
23592 rtx (*umul) (rtx, rtx, rtx);
23593
23594 if (mode == V2DImode)
23595 {
23596 umul = gen_vec_widen_umult_even_v4si;
23597 nmode = V4SImode;
23598 }
23599 else if (mode == V4DImode)
23600 {
23601 umul = gen_vec_widen_umult_even_v8si;
23602 nmode = V8SImode;
23603 }
23604 else if (mode == V8DImode)
23605 {
23606 umul = gen_vec_widen_umult_even_v16si;
23607 nmode = V16SImode;
23608 }
23609 else
23610 gcc_unreachable ();
23611
23612
23613 /* Multiply low parts. */
23614 t1 = gen_reg_rtx (mode);
23615 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
23616
23617 /* Shift input vectors right 32 bits so we can multiply high parts. */
23618 t6 = GEN_INT (32);
23619 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
23620 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
23621
23622 /* Multiply high parts by low parts. */
23623 t4 = gen_reg_rtx (mode);
23624 t5 = gen_reg_rtx (mode);
23625 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
23626 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
23627
23628 /* Combine and shift the highparts back. */
23629 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
23630 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
23631
23632 /* Combine high and low parts. */
23633 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
23634 }
23635
23636 set_unique_reg_note (get_last_insn (), REG_EQUAL,
23637 gen_rtx_MULT (mode, op1, op2));
23638}
23639
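The generic path above is the textbook split of a 64x64 multiply into 32x32->64 pieces: writing each operand as HI * 2^32 + LO, the product modulo 2^64 is LO1*LO2 + ((HI1*LO2 + HI2*LO1) << 32), which is exactly the three widening multiplies plus the add and shift emitted above.  A scalar sketch for a single element (hypothetical helper, illustration only):

#include <stdint.h>

/* Scalar model of the three-multiply 64-bit product used above.  */
static uint64_t
mul64_from_32 (uint64_t x, uint64_t y)
{
  uint64_t lo_lo = (x & 0xffffffff) * (y & 0xffffffff);   /* t1          */
  uint64_t hi_lo = (x >> 32) * (y & 0xffffffff);          /* t4          */
  uint64_t lo_hi = (x & 0xffffffff) * (y >> 32);          /* t5          */
  return lo_lo + ((hi_lo + lo_hi) << 32);                 /* t1 + t4<<32 */
}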
23640/* Return 1 if control transfer instruction INSN
23641   should be encoded with the notrack prefix.  */
23642
23643bool
e8b0314a 23644ix86_notrack_prefixed_insn_p (rtx_insn *insn)
2bf6d935
ML
23645{
23646 if (!insn || !((flag_cf_protection & CF_BRANCH)))
23647 return false;
23648
23649 if (CALL_P (insn))
23650 {
23651 rtx call = get_call_rtx_from (insn);
23652 gcc_assert (call != NULL_RTX);
23653 rtx addr = XEXP (call, 0);
23654
23655 /* Do not emit 'notrack' if it's not an indirect call. */
23656 if (MEM_P (addr)
23657 && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
23658 return false;
23659 else
23660 return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
23661 }
23662
23663 if (JUMP_P (insn) && !flag_cet_switch)
23664 {
23665 rtx target = JUMP_LABEL (insn);
23666 if (target == NULL_RTX || ANY_RETURN_P (target))
23667 return false;
23668
23669       /* Check whether the jump is a switch table jump.  */
23670 rtx_insn *label = as_a<rtx_insn *> (target);
23671 rtx_insn *table = next_insn (label);
23672 if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
23673 return false;
23674 else
23675 return true;
23676 }
23677 return false;
23678}
23679
23680/* Calculate integer abs() using only SSE2 instructions. */
23681
23682void
23683ix86_expand_sse2_abs (rtx target, rtx input)
23684{
23685 machine_mode mode = GET_MODE (target);
23686 rtx tmp0, tmp1, x;
23687
23688 switch (mode)
23689 {
23690 case E_V2DImode:
23691 case E_V4DImode:
23692 /* For 64-bit signed integer X, with SSE4.2 use
23693 pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
23694 Otherwise handle it similarly to V4SImode, except use 64 as W instead of
23695 32 and use logical instead of arithmetic right shift (which is
23696 unimplemented) and subtract. */
23697 if (TARGET_SSE4_2)
23698 {
23699 tmp0 = gen_reg_rtx (mode);
23700 tmp1 = gen_reg_rtx (mode);
23701 emit_move_insn (tmp1, CONST0_RTX (mode));
23702 if (mode == E_V2DImode)
23703 emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input));
23704 else
23705 emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input));
23706 }
23707 else
23708 {
23709 tmp0 = expand_simple_binop (mode, LSHIFTRT, input,
23710 GEN_INT (GET_MODE_UNIT_BITSIZE (mode)
23711 - 1), NULL, 0, OPTAB_DIRECT);
23712 tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false);
23713 }
23714
23715 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
23716 NULL, 0, OPTAB_DIRECT);
23717 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
23718 target, 0, OPTAB_DIRECT);
23719 break;
23720
23721 case E_V4SImode:
23722 /* For 32-bit signed integer X, the best way to calculate the absolute
23723 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
23724 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
23725 GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
23726 NULL, 0, OPTAB_DIRECT);
23727 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
23728 NULL, 0, OPTAB_DIRECT);
23729 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
23730 target, 0, OPTAB_DIRECT);
23731 break;
23732
23733 case E_V8HImode:
23734 /* For 16-bit signed integer X, the best way to calculate the absolute
23735 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
23736 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
23737
23738 x = expand_simple_binop (mode, SMAX, tmp0, input,
23739 target, 0, OPTAB_DIRECT);
23740 break;
23741
23742 case E_V16QImode:
23743 /* For 8-bit signed integer X, the best way to calculate the absolute
23744 value of X is min ((unsigned char) X, (unsigned char) (-X)),
23745 as SSE2 provides the PMINUB insn. */
23746 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
23747
23748 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
23749 target, 0, OPTAB_DIRECT);
23750 break;
23751
23752 default:
23753 gcc_unreachable ();
23754 }
23755
23756 if (x != target)
23757 emit_move_insn (target, x);
23758}
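
/* A minimal scalar sketch of the V4SImode bit trick used above, assuming
   two's-complement integers and an arithmetic right shift of negative
   values (the vector code applies the same idea lane-wise):

     #include <stdint.h>

     static inline int32_t
     scalar_abs (int32_t x)
     {
       int32_t mask = x >> 31;      // 0 if x >= 0, -1 if x < 0
       return (x ^ mask) - mask;    // x if mask == 0, -x if mask == -1
     }

   The V8HImode and V16QImode cases rely instead on max (x, -x) and
   min ((uint8_t) x, (uint8_t) -x), which PMAXSW and PMINUB provide.  */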
23759
23760/* Expand an extract from a vector register through pextr insn.
23761 Return true if successful. */
23762
23763bool
23764ix86_expand_pextr (rtx *operands)
23765{
23766 rtx dst = operands[0];
23767 rtx src = operands[1];
23768
23769 unsigned int size = INTVAL (operands[2]);
23770 unsigned int pos = INTVAL (operands[3]);
23771
23772 if (SUBREG_P (dst))
23773 {
23774 /* Reject non-lowpart subregs. */
23775 if (SUBREG_BYTE (dst) > 0)
23776 return false;
23777 dst = SUBREG_REG (dst);
23778 }
23779
23780 if (SUBREG_P (src))
23781 {
23782 pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
23783 src = SUBREG_REG (src);
23784 }
23785
23786 switch (GET_MODE (src))
23787 {
23788 case E_V16QImode:
23789 case E_V8HImode:
23790 case E_V4SImode:
23791 case E_V2DImode:
23792 case E_V1TImode:
23793 {
23794 machine_mode srcmode, dstmode;
23795 rtx d, pat;
23796
23797 if (!int_mode_for_size (size, 0).exists (&dstmode))
23798 return false;
23799
23800 switch (dstmode)
23801 {
23802 case E_QImode:
23803 if (!TARGET_SSE4_1)
23804 return false;
23805 srcmode = V16QImode;
23806 break;
23807
23808 case E_HImode:
23809 if (!TARGET_SSE2)
23810 return false;
23811 srcmode = V8HImode;
23812 break;
23813
23814 case E_SImode:
23815 if (!TARGET_SSE4_1)
23816 return false;
23817 srcmode = V4SImode;
23818 break;
23819
23820 case E_DImode:
23821 gcc_assert (TARGET_64BIT);
23822 if (!TARGET_SSE4_1)
23823 return false;
23824 srcmode = V2DImode;
23825 break;
23826
23827 default:
23828 return false;
23829 }
23830
23831 /* Reject extractions from misaligned positions. */
23832 if (pos & (size-1))
23833 return false;
23834
23835 if (GET_MODE (dst) == dstmode)
23836 d = dst;
23837 else
23838 d = gen_reg_rtx (dstmode);
23839
23840 /* Construct insn pattern. */
23841 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
23842 pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
23843
23844 /* Let the rtl optimizers know about the zero extension performed. */
23845 if (dstmode == QImode || dstmode == HImode)
23846 {
23847 pat = gen_rtx_ZERO_EXTEND (SImode, pat);
23848 d = gen_lowpart (SImode, d);
23849 }
23850
23851 emit_insn (gen_rtx_SET (d, pat));
23852
23853 if (d != dst)
23854 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
23855 return true;
23856 }
23857
23858 default:
23859 return false;
23860 }
23861}
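
/* At the intrinsics level the pattern constructed above corresponds to the
   PEXTR* family; a hedged sketch (SSE2 for the 16-bit form, SSE4.1 for the
   32-bit one):

     #include <immintrin.h>
     #include <stdint.h>

     uint16_t lane3_u16 (__m128i v) { return _mm_extract_epi16 (v, 3); }
     uint32_t lane1_u32 (__m128i v) { return _mm_extract_epi32 (v, 1); }

   which matches the VEC_SELECT (plus ZERO_EXTEND for the sub-word modes)
   emitted here.  */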
23862
23863/* Expand an insert into a vector register through pinsr insn.
23864 Return true if successful. */
23865
23866bool
23867ix86_expand_pinsr (rtx *operands)
23868{
23869 rtx dst = operands[0];
23870 rtx src = operands[3];
23871
23872 unsigned int size = INTVAL (operands[1]);
23873 unsigned int pos = INTVAL (operands[2]);
23874
23875 if (SUBREG_P (dst))
23876 {
23877 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
23878 dst = SUBREG_REG (dst);
23879 }
23880
23881 switch (GET_MODE (dst))
23882 {
23883 case E_V16QImode:
23884 case E_V8HImode:
23885 case E_V4SImode:
23886 case E_V2DImode:
23887 case E_V1TImode:
23888 {
23889 machine_mode srcmode, dstmode;
23890 rtx (*pinsr)(rtx, rtx, rtx, rtx);
23891 rtx d;
23892
23893 if (!int_mode_for_size (size, 0).exists (&srcmode))
23894 return false;
23895
23896 switch (srcmode)
23897 {
23898 case E_QImode:
23899 if (!TARGET_SSE4_1)
23900 return false;
23901 dstmode = V16QImode;
23902 pinsr = gen_sse4_1_pinsrb;
23903 break;
23904
23905 case E_HImode:
23906 if (!TARGET_SSE2)
23907 return false;
23908 dstmode = V8HImode;
23909 pinsr = gen_sse2_pinsrw;
23910 break;
23911
23912 case E_SImode:
23913 if (!TARGET_SSE4_1)
23914 return false;
23915 dstmode = V4SImode;
23916 pinsr = gen_sse4_1_pinsrd;
23917 break;
23918
23919 case E_DImode:
23920 gcc_assert (TARGET_64BIT);
23921 if (!TARGET_SSE4_1)
23922 return false;
23923 dstmode = V2DImode;
23924 pinsr = gen_sse4_1_pinsrq;
23925 break;
23926
23927 default:
23928 return false;
23929 }
23930
23931 /* Reject insertions to misaligned positions. */
23932 if (pos & (size-1))
23933 return false;
23934
23935 if (SUBREG_P (src))
23936 {
23937 unsigned int srcpos = SUBREG_BYTE (src);
23938
23939 if (srcpos > 0)
23940 {
23941 rtx extr_ops[4];
23942
23943 extr_ops[0] = gen_reg_rtx (srcmode);
23944 extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
23945 extr_ops[2] = GEN_INT (size);
23946 extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
23947
23948 if (!ix86_expand_pextr (extr_ops))
23949 return false;
23950
23951 src = extr_ops[0];
23952 }
23953 else
23954 src = gen_lowpart (srcmode, SUBREG_REG (src));
23955 }
23956
23957 if (GET_MODE (dst) == dstmode)
23958 d = dst;
23959 else
23960 d = gen_reg_rtx (dstmode);
23961
23962 emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
23963 gen_lowpart (srcmode, src),
23964 GEN_INT (1 << (pos / size))));
23965 if (d != dst)
23966 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
23967 return true;
23968 }
23969
23970 default:
23971 return false;
23972 }
23973}
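
/* The insert direction is likewise exposed by the PINSR* intrinsics; a
   hedged sketch (SSE4.1 assumed for the 32-bit form):

     #include <immintrin.h>
     #include <stdint.h>

     __m128i set_lane2_u32 (__m128i v, uint32_t x)
     {
       return _mm_insert_epi32 (v, (int) x, 2);
     }

   The intrinsic takes the lane index directly, whereas the pinsr patterns
   above expect it encoded as the one-hot mask (1 << (pos / size)).  */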
23974
23975/* All CPUs prefer to avoid cross-lane operations, so perform reductions
23976 of the upper halves against the lower halves down to SSE register size. */
23977
23978machine_mode
23979ix86_split_reduction (machine_mode mode)
23980{
23981 /* Reduce lowpart against highpart until we reach SSE reg width to
23982 avoid cross-lane operations. */
23983 switch (mode)
23984 {
23985 case E_V8DImode:
23986 case E_V4DImode:
23987 return V2DImode;
23988 case E_V16SImode:
23989 case E_V8SImode:
23990 return V4SImode;
23991 case E_V32HImode:
23992 case E_V16HImode:
23993 return V8HImode;
23994 case E_V64QImode:
23995 case E_V32QImode:
23996 return V16QImode;
23997 case E_V16SFmode:
23998 case E_V8SFmode:
23999 return V4SFmode;
24000 case E_V8DFmode:
24001 case E_V4DFmode:
24002 return V2DFmode;
24003 default:
24004 return mode;
24005 }
24006}
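
/* A hedged intrinsics sketch of how the mode returned here is used: an
   8-lane V8SImode sum is first folded into the suggested V4SImode half,
   then finished within a single SSE register (AVX2 assumed for the
   128-bit extract):

     #include <immintrin.h>

     int sum_epi32 (__m256i v)
     {
       __m128i lo = _mm256_castsi256_si128 (v);
       __m128i hi = _mm256_extracti128_si256 (v, 1);
       __m128i s = _mm_add_epi32 (lo, hi);                  // 8 -> 4 lanes
       s = _mm_add_epi32 (s, _mm_shuffle_epi32 (s, 0x4e));  // 4 -> 2 lanes
       s = _mm_add_epi32 (s, _mm_shuffle_epi32 (s, 0xb1));  // 2 -> 1 lane
       return _mm_cvtsi128_si32 (s);
     }

   The vectorizer emits the equivalent RTL itself; the sketch only shows
   the shape of the split.  */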
24007
24008/* Generate call to __divmoddi4. */
24009
24010void
24011ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
24012 rtx op0, rtx op1,
24013 rtx *quot_p, rtx *rem_p)
24014{
24015 rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
24016
24017 rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
24018 mode, op0, mode, op1, mode,
24019 XEXP (rem, 0), Pmode);
24020 *quot_p = quot;
24021 *rem_p = rem;
24022}
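
/* The libcall emitted above follows the libgcc __divmoddi4 convention: the
   quotient is returned by value and the remainder is stored through the
   extra pointer argument, roughly (a hedged C-level sketch):

     long long __divmoddi4 (long long a, long long b, long long *rem);

     long long r, q = __divmoddi4 (a, b, &r);

   which is why a stack slot (SLOT_TEMP) is passed for the remainder.  */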
24023
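/* Expand an atomic fetch-and-op (or op-and-fetch, if AFTER) loop for
   operation CODE on memory location MEM with value VAL, using a
   compare-and-swap loop; DOUBLEWORD selects the double-word cmpxchg.
   The old (or new) value is stored into TARGET.  */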
24024void
24025ix86_expand_atomic_fetch_op_loop (rtx target, rtx mem, rtx val,
24026 enum rtx_code code, bool after,
24027 bool doubleword)
24028{
24029 rtx old_reg, new_reg, old_mem, success;
24030 machine_mode mode = GET_MODE (target);
24031 rtx_code_label *loop_label = NULL;
24032
24033 old_reg = gen_reg_rtx (mode);
24034 new_reg = old_reg;
24035 old_mem = copy_to_reg (mem);
24036 loop_label = gen_label_rtx ();
24037 emit_label (loop_label);
24038 emit_move_insn (old_reg, old_mem);
24039
24040 /* Return value for atomic_fetch_op: the old value. */
24041 if (!after)
24042 emit_move_insn (target, old_reg);
24043
24044 if (code == NOT)
24045 {
24046 new_reg = expand_simple_binop (mode, AND, new_reg, val, NULL_RTX,
24047 true, OPTAB_LIB_WIDEN);
24048 new_reg = expand_simple_unop (mode, code, new_reg, NULL_RTX, true);
24049 }
24050 else
24051 new_reg = expand_simple_binop (mode, code, new_reg, val, NULL_RTX,
24052 true, OPTAB_LIB_WIDEN);
24053
24054 /* Return value for atomic_op_fetch: the new value. */
24055 if (after)
24056 emit_move_insn (target, new_reg);
24057
24058 success = NULL_RTX;
24059
24060 ix86_expand_cmpxchg_loop (&success, old_mem, mem, old_reg, new_reg,
24061 gen_int_mode (MEMMODEL_SYNC_SEQ_CST,
24062 SImode),
24063 doubleword, loop_label);
24064}
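
/* Semantically the generated code is the usual compare-and-swap retry loop;
   a hedged C sketch using the GCC __atomic builtins (the real expansion
   also adds the pause-based relaxation in ix86_expand_cmpxchg_loop):

     long
     fetch_op_loop (long *mem, long val)
     {
       long old = __atomic_load_n (mem, __ATOMIC_SEQ_CST);
       long desired;
       do
         desired = old + val;   // or AND/OR/XOR/NOT-of-AND, depending on CODE
       while (!__atomic_compare_exchange_n (mem, &old, desired, false,
                                            __ATOMIC_SEQ_CST,
                                            __ATOMIC_SEQ_CST));
       return old;              // AFTER returns desired instead
     }
  */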
24065
24066/* Relax the cmpxchg instruction.  The parameter LOOP_LABEL indicates
24067 whether the instruction should be relaxed with a pause loop.  If not,
24068 it is relaxed to an atomic load + compare, and the cmpxchg instruction
24069 is skipped when MEM != EXP_INPUT. */
24070
24071void
24072ix86_expand_cmpxchg_loop (rtx *ptarget_bool, rtx target_val,
24073 rtx mem, rtx exp_input, rtx new_input,
24074 rtx mem_model, bool doubleword,
24075 rtx_code_label *loop_label)
24076{
24077 rtx_code_label *cmp_label = NULL;
24078 rtx_code_label *done_label = NULL;
24079 rtx target_bool = NULL_RTX, new_mem = NULL_RTX;
24080 rtx (*gen) (rtx, rtx, rtx, rtx, rtx) = NULL;
24081 rtx (*gendw) (rtx, rtx, rtx, rtx, rtx, rtx) = NULL;
24082 machine_mode mode = GET_MODE (target_val), hmode = mode;
24083
24084 if (*ptarget_bool == NULL)
24085 target_bool = gen_reg_rtx (QImode);
24086 else
24087 target_bool = *ptarget_bool;
24088
24089 cmp_label = gen_label_rtx ();
24090 done_label = gen_label_rtx ();
24091
24092 new_mem = gen_reg_rtx (mode);
24093 /* Load memory first. */
24094 expand_atomic_load (new_mem, mem, MEMMODEL_SEQ_CST);
24095
24096 switch (mode)
24097 {
24098 case E_TImode:
24099 gendw = gen_atomic_compare_and_swapti_doubleword;
24100 hmode = DImode;
24101 break;
24102 case E_DImode:
24103 if (doubleword)
24104 {
24105 gendw = gen_atomic_compare_and_swapdi_doubleword;
24106 hmode = SImode;
24107 }
24108 else
24109 gen = gen_atomic_compare_and_swapdi_1;
24110 break;
24111 case E_SImode:
24112 gen = gen_atomic_compare_and_swapsi_1;
24113 break;
24114 case E_HImode:
24115 gen = gen_atomic_compare_and_swaphi_1;
24116 break;
24117 case E_QImode:
24118 gen = gen_atomic_compare_and_swapqi_1;
24119 break;
24120 default:
24121 gcc_unreachable ();
24122 }
24123
24124 /* Compare mem value with expected value. */
24125 if (doubleword)
24126 {
24127 rtx low_new_mem = gen_lowpart (hmode, new_mem);
24128 rtx low_exp_input = gen_lowpart (hmode, exp_input);
24129 rtx high_new_mem = gen_highpart (hmode, new_mem);
24130 rtx high_exp_input = gen_highpart (hmode, exp_input);
24131 emit_cmp_and_jump_insns (low_new_mem, low_exp_input, NE, NULL_RTX,
24132 hmode, 1, cmp_label,
24133 profile_probability::guessed_never ());
24134 emit_cmp_and_jump_insns (high_new_mem, high_exp_input, NE, NULL_RTX,
24135 hmode, 1, cmp_label,
24136 profile_probability::guessed_never ());
24137 }
24138 else
24139 emit_cmp_and_jump_insns (new_mem, exp_input, NE, NULL_RTX,
24140 GET_MODE (exp_input), 1, cmp_label,
24141 profile_probability::guessed_never ());
24142
24143 /* Emit the cmpxchg instruction directly here. */
24144 if (doubleword)
24145 emit_insn (gendw (target_val, mem, exp_input,
24146 gen_lowpart (hmode, new_input),
24147 gen_highpart (hmode, new_input),
24148 mem_model));
24149 else
24150 emit_insn (gen (target_val, mem, exp_input, new_input, mem_model));
24151
24152 if (!loop_label)
24153 {
24154 emit_jump_insn (gen_jump (done_label));
24155 emit_barrier ();
24156 emit_label (cmp_label);
24157 emit_move_insn (target_val, new_mem);
24158 emit_label (done_label);
24159 ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
24160 const0_rtx);
24161 }
24162 else
24163 {
24164 ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
24165 const0_rtx);
24166 emit_cmp_and_jump_insns (target_bool, const0_rtx, EQ, const0_rtx,
24167 GET_MODE (target_bool), 1, loop_label,
24168 profile_probability::guessed_never ());
24169 emit_jump_insn (gen_jump (done_label));
24170 emit_barrier ();
24171
24172 /* If mem is not expected, pause and loop back. */
24173 emit_label (cmp_label);
24174 emit_move_insn (target_val, new_mem);
24175 emit_insn (gen_pause ());
24176 emit_jump_insn (gen_jump (loop_label));
24177 emit_barrier ();
24178 emit_label (done_label);
24179 }
24180
24181 *ptarget_bool = target_bool;
24182}
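
/* The relaxation above is the classic "test before test-and-set" idea:
   read the location with a plain atomic load first, and only issue the
   always-writing lock cmpxchg when the comparison can succeed, spinning
   with PAUSE otherwise.  A hedged scalar sketch of the non-looping form:

     bool
     relaxed_cas (long *mem, long *expected, long desired)
     {
       long cur = __atomic_load_n (mem, __ATOMIC_SEQ_CST);
       if (cur != *expected)
         {
           *expected = cur;     // skip the cmpxchg entirely
           return false;
         }
       return __atomic_compare_exchange_n (mem, expected, desired, false,
                                           __ATOMIC_SEQ_CST,
                                           __ATOMIC_SEQ_CST);
     }
  */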
24183
24184/* Convert a BFmode VAL to SFmode without signaling sNaNs.
24185 This is done by returning SF SUBREG of ((HI SUBREG) (VAL)) << 16. */
24186
24187rtx
24188ix86_expand_fast_convert_bf_to_sf (rtx val)
24189{
24190 rtx op = gen_lowpart (HImode, val), ret;
24191 if (CONST_INT_P (op))
24192 {
24193 ret = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
24194 val, BFmode);
24195 if (ret)
24196 return ret;
24197 /* FLOAT_EXTEND simplification will fail if VAL is a sNaN. */
24198 ret = gen_reg_rtx (SImode);
24199 emit_move_insn (ret, GEN_INT (INTVAL (op) & 0xffff));
24200 emit_insn (gen_ashlsi3 (ret, ret, GEN_INT (16)));
24201 return gen_lowpart (SFmode, ret);
24202 }
24203
24204 ret = gen_reg_rtx (SFmode);
24205 emit_insn (gen_extendbfsf2_1 (ret, force_reg (BFmode, val)));
24206 return ret;
24207}
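
/* A hedged scalar sketch of the same conversion: a BFmode value occupies
   the upper 16 bits of an IEEE binary32, so widening is a 16-bit shift
   followed by a bit copy, with no floating-point operation that could
   signal on an sNaN:

     #include <stdint.h>
     #include <string.h>

     static inline float
     bf16_bits_to_float (uint16_t bits)
     {
       uint32_t u = (uint32_t) bits << 16;
       float f;
       memcpy (&f, &u, sizeof f);   // bit copy, not a value conversion
       return f;
     }
  */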
24208
24209#include "gt-i386-expand.h"