gcc/config/i386/i386-expand.cc
1 /* Copyright (C) 1988-2023 Free Software Foundation, Inc.
2
3 This file is part of GCC.
4
5 GCC is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
9
10 GCC is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with GCC; see the file COPYING3. If not see
17 <http://www.gnu.org/licenses/>. */
18
19 #define IN_TARGET_CODE 1
20
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "backend.h"
25 #include "rtl.h"
26 #include "tree.h"
27 #include "memmodel.h"
28 #include "gimple.h"
29 #include "cfghooks.h"
30 #include "cfgloop.h"
31 #include "df.h"
32 #include "tm_p.h"
33 #include "stringpool.h"
34 #include "expmed.h"
35 #include "optabs.h"
36 #include "regs.h"
37 #include "emit-rtl.h"
38 #include "recog.h"
39 #include "cgraph.h"
40 #include "diagnostic.h"
41 #include "cfgbuild.h"
42 #include "alias.h"
43 #include "fold-const.h"
44 #include "attribs.h"
45 #include "calls.h"
46 #include "stor-layout.h"
47 #include "varasm.h"
48 #include "output.h"
49 #include "insn-attr.h"
50 #include "flags.h"
51 #include "except.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "cfgrtl.h"
55 #include "common/common-target.h"
56 #include "langhooks.h"
57 #include "reload.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "tm-constrs.h"
61 #include "cselib.h"
62 #include "sched-int.h"
63 #include "opts.h"
64 #include "tree-pass.h"
65 #include "context.h"
66 #include "pass_manager.h"
67 #include "target-globals.h"
68 #include "gimple-iterator.h"
69 #include "tree-vectorizer.h"
70 #include "shrink-wrap.h"
71 #include "builtins.h"
72 #include "rtl-iter.h"
73 #include "tree-iterator.h"
74 #include "dbgcnt.h"
75 #include "case-cfn-macros.h"
76 #include "dojump.h"
77 #include "fold-const-call.h"
78 #include "tree-vrp.h"
79 #include "tree-ssanames.h"
80 #include "selftest.h"
81 #include "selftest-rtl.h"
82 #include "print-rtl.h"
83 #include "intl.h"
84 #include "ifcvt.h"
85 #include "symbol-summary.h"
86 #include "ipa-prop.h"
87 #include "ipa-fnsummary.h"
88 #include "wide-int-bitmask.h"
89 #include "tree-vector-builder.h"
90 #include "debug.h"
91 #include "dwarf2out.h"
92 #include "i386-options.h"
93 #include "i386-builtins.h"
94 #include "i386-expand.h"
95 #include "asan.h"
96
97 /* Split one or more double-mode RTL references into pairs of half-mode
98 references. The RTL can be REG, offsettable MEM, integer constant, or
99 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
100 split and "num" is its length. lo_half and hi_half are output arrays
101 that parallel "operands". */
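/* For example, a TImode REG operand is split into two DImode subregs at
   byte offsets 0 and 8, while a TImode MEM operand is split with
   adjust_address into two DImode memory references.  (Illustrative.)  */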
102
103 void
104 split_double_mode (machine_mode mode, rtx operands[],
105 int num, rtx lo_half[], rtx hi_half[])
106 {
107 machine_mode half_mode;
108 unsigned int byte;
109 rtx mem_op = NULL_RTX;
110 int mem_num = 0;
111
112 switch (mode)
113 {
114 case E_TImode:
115 half_mode = DImode;
116 break;
117 case E_DImode:
118 half_mode = SImode;
119 break;
120 case E_P2HImode:
121 half_mode = HImode;
122 break;
123 case E_P2QImode:
124 half_mode = QImode;
125 break;
126 default:
127 gcc_unreachable ();
128 }
129
130 byte = GET_MODE_SIZE (half_mode);
131
132 while (num--)
133 {
134 rtx op = operands[num];
135
136 /* simplify_subreg refuses to split volatile memory addresses,
137 but we still have to handle them. */
138 if (MEM_P (op))
139 {
140 if (mem_op && rtx_equal_p (op, mem_op))
141 {
142 lo_half[num] = lo_half[mem_num];
143 hi_half[num] = hi_half[mem_num];
144 }
145 else
146 {
147 mem_op = op;
148 mem_num = num;
149 lo_half[num] = adjust_address (op, half_mode, 0);
150 hi_half[num] = adjust_address (op, half_mode, byte);
151 }
152 }
153 else
154 {
155 lo_half[num] = simplify_gen_subreg (half_mode, op,
156 GET_MODE (op) == VOIDmode
157 ? mode : GET_MODE (op), 0);
158
159 rtx tmp = simplify_gen_subreg (half_mode, op,
160 GET_MODE (op) == VOIDmode
161 ? mode : GET_MODE (op), byte);
162 /* simplify_gen_subreg will return NULL RTX for the
163 high half of the paradoxical subreg. */
164 hi_half[num] = tmp ? tmp : gen_reg_rtx (half_mode);
165 }
166 }
167 }
168
169 /* Emit the double word assignment DST = { LO, HI }. */
170
171 void
172 split_double_concat (machine_mode mode, rtx dst, rtx lo, rtx hi)
173 {
174 rtx dlo, dhi;
175 int deleted_move_count = 0;
176 split_double_mode (mode, &dst, 1, &dlo, &dhi);
177 /* Constraints ensure that if both lo and hi are MEMs, then
178 dst has early-clobber and thus addresses of MEMs don't use
179 dlo/dhi registers. Otherwise if at least one of lo and hi is a MEM,
180 dlo/dhi are registers. */
181 if (MEM_P (lo)
182 && rtx_equal_p (dlo, hi)
183 && reg_overlap_mentioned_p (dhi, lo))
184 {
185 /* If dlo is same as hi and lo's address uses dhi register,
186 code below would first emit_move_insn (dhi, hi)
187 and then emit_move_insn (dlo, lo). But the former
188 would invalidate lo's address. Load into dhi first,
189 then swap. */
190 emit_move_insn (dhi, lo);
191 lo = dhi;
192 }
193 else if (MEM_P (hi)
194 && !MEM_P (lo)
195 && !rtx_equal_p (dlo, lo)
196 && reg_overlap_mentioned_p (dlo, hi))
197 {
198 /* In this case, code below would first emit_move_insn (dlo, lo)
199 and then emit_move_insn (dhi, hi). But the former would
200 invalidate hi's address. */
201 if (rtx_equal_p (dhi, lo))
202 {
203 /* We can't load into dhi first, so load into dlo
204 first and we'll swap. */
205 emit_move_insn (dlo, hi);
206 hi = dlo;
207 }
208 else
209 {
210 /* Load into dhi first. */
211 emit_move_insn (dhi, hi);
212 hi = dhi;
213 }
214 }
215 if (!rtx_equal_p (dlo, hi))
216 {
217 if (!rtx_equal_p (dlo, lo))
218 emit_move_insn (dlo, lo);
219 else
220 deleted_move_count++;
221 if (!rtx_equal_p (dhi, hi))
222 emit_move_insn (dhi, hi);
223 else
224 deleted_move_count++;
225 }
226 else if (!rtx_equal_p (lo, dhi))
227 {
228 if (!rtx_equal_p (dhi, hi))
229 emit_move_insn (dhi, hi);
230 else
231 deleted_move_count++;
232 if (!rtx_equal_p (dlo, lo))
233 emit_move_insn (dlo, lo);
234 else
235 deleted_move_count++;
236 }
237 else if (mode == TImode)
238 emit_insn (gen_swapdi (dlo, dhi));
239 else
240 emit_insn (gen_swapsi (dlo, dhi));
241
242 if (deleted_move_count == 2)
243 emit_note (NOTE_INSN_DELETED);
244 }
245
246
247 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
248 for the target. */
249
250 void
251 ix86_expand_clear (rtx dest)
252 {
253 rtx tmp;
254
255 /* We play register width games, which are only valid after reload. */
256 gcc_assert (reload_completed);
257
258 /* Avoid HImode and its attendant prefix byte. */
259 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
260 dest = gen_rtx_REG (SImode, REGNO (dest));
261 tmp = gen_rtx_SET (dest, const0_rtx);
262
263 if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
264 {
265 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
266 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
267 }
268
269 emit_insn (tmp);
270 }
271
272 /* Return true if V can be broadcast from an integer of WIDTH bits,
273 which is returned in VAL_BROADCAST. Otherwise, return false. */
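/* Illustrative example: with WIDTH == 16, V == 0x1234123412341234 yields
   VAL_BROADCAST == 0x1234, while V == 0x1234123412340000 is rejected
   because its 16-bit chunks differ.  */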
274
275 static bool
276 ix86_broadcast (HOST_WIDE_INT v, unsigned int width,
277 HOST_WIDE_INT &val_broadcast)
278 {
279 wide_int val = wi::uhwi (v, HOST_BITS_PER_WIDE_INT);
280 val_broadcast = wi::extract_uhwi (val, 0, width);
281 for (unsigned int i = width; i < HOST_BITS_PER_WIDE_INT; i += width)
282 {
283 HOST_WIDE_INT each = wi::extract_uhwi (val, i, width);
284 if (val_broadcast != each)
285 return false;
286 }
287 val_broadcast = sext_hwi (val_broadcast, width);
288 return true;
289 }
290
291 /* Convert the CONST_WIDE_INT operand OP to broadcast in MODE. */
292
293 static rtx
294 ix86_convert_const_wide_int_to_broadcast (machine_mode mode, rtx op)
295 {
296 /* Don't use integer vector broadcast if we can't move from GPR to SSE
297 register directly. */
298 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
299 return nullptr;
300
301 /* Convert CONST_WIDE_INT to a non-standard SSE constant integer
302 broadcast only if vector broadcast is available. */
303 if (!TARGET_AVX
304 || !CONST_WIDE_INT_P (op)
305 || standard_sse_constant_p (op, mode)
306 || (CONST_WIDE_INT_NUNITS (op) * HOST_BITS_PER_WIDE_INT
307 != GET_MODE_BITSIZE (mode)))
308 return nullptr;
309
310 HOST_WIDE_INT val = CONST_WIDE_INT_ELT (op, 0);
311 HOST_WIDE_INT val_broadcast;
312 scalar_int_mode broadcast_mode;
313 if (TARGET_AVX2
314 && ix86_broadcast (val, GET_MODE_BITSIZE (QImode),
315 val_broadcast))
316 broadcast_mode = QImode;
317 else if (TARGET_AVX2
318 && ix86_broadcast (val, GET_MODE_BITSIZE (HImode),
319 val_broadcast))
320 broadcast_mode = HImode;
321 else if (ix86_broadcast (val, GET_MODE_BITSIZE (SImode),
322 val_broadcast))
323 broadcast_mode = SImode;
324 else if (TARGET_64BIT
325 && ix86_broadcast (val, GET_MODE_BITSIZE (DImode),
326 val_broadcast))
327 broadcast_mode = DImode;
328 else
329 return nullptr;
330
331 /* Check if OP can be broadcast from VAL. */
332 for (int i = 1; i < CONST_WIDE_INT_NUNITS (op); i++)
333 if (val != CONST_WIDE_INT_ELT (op, i))
334 return nullptr;
335
336 unsigned int nunits = (GET_MODE_SIZE (mode)
337 / GET_MODE_SIZE (broadcast_mode));
338 machine_mode vector_mode;
339 if (!mode_for_vector (broadcast_mode, nunits).exists (&vector_mode))
340 gcc_unreachable ();
341 rtx target = gen_reg_rtx (vector_mode);
342 bool ok = ix86_expand_vector_init_duplicate (false, vector_mode,
343 target,
344 GEN_INT (val_broadcast));
345 gcc_assert (ok);
346 target = lowpart_subreg (mode, target, vector_mode);
347 return target;
348 }
349
350 void
351 ix86_expand_move (machine_mode mode, rtx operands[])
352 {
353 rtx op0, op1;
354 rtx tmp, addend = NULL_RTX;
355 enum tls_model model;
356
357 op0 = operands[0];
358 op1 = operands[1];
359
360 /* Avoid complex sets of likely spilled hard registers before reload. */
361 if (!ix86_hardreg_mov_ok (op0, op1))
362 {
363 tmp = gen_reg_rtx (mode);
364 operands[0] = tmp;
365 ix86_expand_move (mode, operands);
366 operands[0] = op0;
367 operands[1] = tmp;
368 op1 = tmp;
369 }
370
371 switch (GET_CODE (op1))
372 {
373 case CONST:
374 tmp = XEXP (op1, 0);
375
376 if (GET_CODE (tmp) != PLUS
377 || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
378 break;
379
380 op1 = XEXP (tmp, 0);
381 addend = XEXP (tmp, 1);
382 /* FALLTHRU */
383
384 case SYMBOL_REF:
385 model = SYMBOL_REF_TLS_MODEL (op1);
386
387 if (model)
388 op1 = legitimize_tls_address (op1, model, true);
389 else if (ix86_force_load_from_GOT_p (op1))
390 {
391 /* Load the external function address via GOT slot to avoid PLT. */
392 op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
393 (TARGET_64BIT
394 ? UNSPEC_GOTPCREL
395 : UNSPEC_GOT));
396 op1 = gen_rtx_CONST (Pmode, op1);
397 op1 = gen_const_mem (Pmode, op1);
398 set_mem_alias_set (op1, ix86_GOT_alias_set ());
399 }
400 else
401 {
402 tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
403 if (tmp)
404 {
405 op1 = tmp;
406 if (!addend)
407 break;
408 }
409 else
410 {
411 op1 = operands[1];
412 break;
413 }
414 }
415
416 if (addend)
417 {
418 op1 = force_operand (op1, NULL_RTX);
419 op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
420 op0, 1, OPTAB_DIRECT);
421 }
422 else
423 op1 = force_operand (op1, op0);
424
425 if (op1 == op0)
426 return;
427
428 op1 = convert_to_mode (mode, op1, 1);
429
430 default:
431 break;
432 }
433
434 if ((flag_pic || MACHOPIC_INDIRECT)
435 && symbolic_operand (op1, mode))
436 {
437 if (TARGET_MACHO && !TARGET_64BIT)
438 {
439 #if TARGET_MACHO
440 /* dynamic-no-pic */
441 if (MACHOPIC_INDIRECT)
442 {
443 rtx temp = (op0 && REG_P (op0) && mode == Pmode)
444 ? op0 : gen_reg_rtx (Pmode);
445 op1 = machopic_indirect_data_reference (op1, temp);
446 if (MACHOPIC_PURE)
447 op1 = machopic_legitimize_pic_address (op1, mode,
448 temp == op1 ? 0 : temp);
449 }
450 if (op0 != op1 && GET_CODE (op0) != MEM)
451 {
452 rtx insn = gen_rtx_SET (op0, op1);
453 emit_insn (insn);
454 return;
455 }
456 if (GET_CODE (op0) == MEM)
457 op1 = force_reg (Pmode, op1);
458 else
459 {
460 rtx temp = op0;
461 if (GET_CODE (temp) != REG)
462 temp = gen_reg_rtx (Pmode);
463 temp = legitimize_pic_address (op1, temp);
464 if (temp == op0)
465 return;
466 op1 = temp;
467 }
468 /* dynamic-no-pic */
469 #endif
470 }
471 else
472 {
473 if (MEM_P (op0))
474 op1 = force_reg (mode, op1);
475 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
476 {
477 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
478 op1 = legitimize_pic_address (op1, reg);
479 if (op0 == op1)
480 return;
481 op1 = convert_to_mode (mode, op1, 1);
482 }
483 }
484 }
485 else
486 {
487 if (MEM_P (op0)
488 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
489 || !push_operand (op0, mode))
490 && MEM_P (op1))
491 op1 = force_reg (mode, op1);
492
493 if (push_operand (op0, mode)
494 && ! general_no_elim_operand (op1, mode))
495 op1 = copy_to_mode_reg (mode, op1);
496
497 /* Force large constants in 64bit compilation into register
498 to get them CSEed. */
499 if (can_create_pseudo_p ()
500 && (mode == DImode) && TARGET_64BIT
501 && immediate_operand (op1, mode)
502 && !x86_64_zext_immediate_operand (op1, VOIDmode)
503 && !register_operand (op0, mode)
504 && optimize)
505 op1 = copy_to_mode_reg (mode, op1);
506
507 if (can_create_pseudo_p ())
508 {
509 if (CONST_DOUBLE_P (op1))
510 {
511 /* If we are loading a floating point constant to a
512 register, force the value to memory now, since we'll
513 get better code out the back end. */
514
515 op1 = validize_mem (force_const_mem (mode, op1));
516 if (!register_operand (op0, mode))
517 {
518 rtx temp = gen_reg_rtx (mode);
519 emit_insn (gen_rtx_SET (temp, op1));
520 emit_move_insn (op0, temp);
521 return;
522 }
523 }
524 else if (CONST_WIDE_INT_P (op1)
525 && GET_MODE_SIZE (mode) >= 16)
526 {
527 rtx tmp = ix86_convert_const_wide_int_to_broadcast
528 (GET_MODE (op0), op1);
529 if (tmp != nullptr)
530 op1 = tmp;
531 }
532 }
533 }
534
535 emit_insn (gen_rtx_SET (op0, op1));
536 }
537
538 /* OP is a memref of a CONST_VECTOR; return the scalar constant
539 if the CONST_VECTOR is a vec_duplicate, else return NULL. */
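/* E.g. a constant-pool reference to the V4SImode vector { 7, 7, 7, 7 }
   returns (const_int 7); a vector whose elements differ returns NULL.
   (Illustrative.)  */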
540 static rtx
541 ix86_broadcast_from_constant (machine_mode mode, rtx op)
542 {
543 int nunits = GET_MODE_NUNITS (mode);
544 if (nunits < 2)
545 return nullptr;
546
547 /* Don't use integer vector broadcast if we can't move from GPR to SSE
548 register directly. */
549 if (!TARGET_INTER_UNIT_MOVES_TO_VEC
550 && INTEGRAL_MODE_P (mode))
551 return nullptr;
552
553 /* Convert CONST_VECTOR to a non-standard SSE constant integer
554 broadcast only if vector broadcast is available. */
555 if (!(TARGET_AVX2
556 || (TARGET_AVX
557 && (GET_MODE_INNER (mode) == SImode
558 || GET_MODE_INNER (mode) == DImode))
559 || FLOAT_MODE_P (mode))
560 || standard_sse_constant_p (op, mode))
561 return nullptr;
562
563 /* Don't broadcast from a 64-bit integer constant in 32-bit mode.
564 We can still put a 64-bit integer constant in memory when
565 AVX512 embedded broadcast is available. */
566 if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT
567 && (!TARGET_AVX512F
568 || (GET_MODE_SIZE (mode) < 64 && !TARGET_AVX512VL)))
569 return nullptr;
570
571 if (GET_MODE_INNER (mode) == TImode)
572 return nullptr;
573
574 rtx constant = get_pool_constant (XEXP (op, 0));
575 if (GET_CODE (constant) != CONST_VECTOR)
576 return nullptr;
577
578 /* There could be some rtx like
579 (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
580 but with "*.LC1" refer to V2DI constant vector. */
581 if (GET_MODE (constant) != mode)
582 {
583 constant = simplify_subreg (mode, constant, GET_MODE (constant),
584 0);
585 if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR)
586 return nullptr;
587 }
588
589 rtx first = XVECEXP (constant, 0, 0);
590
591 for (int i = 1; i < nunits; ++i)
592 {
593 rtx tmp = XVECEXP (constant, 0, i);
594 /* Vector duplicate value. */
595 if (!rtx_equal_p (tmp, first))
596 return nullptr;
597 }
598
599 return first;
600 }
601
602 void
603 ix86_expand_vector_move (machine_mode mode, rtx operands[])
604 {
605 rtx op0 = operands[0], op1 = operands[1];
606 /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
607 psABI since the biggest alignment is 4 bytes for the IA MCU psABI. */
608 unsigned int align = (TARGET_IAMCU
609 ? GET_MODE_BITSIZE (mode)
610 : GET_MODE_ALIGNMENT (mode));
611
612 if (push_operand (op0, VOIDmode))
613 op0 = emit_move_resolve_push (mode, op0);
614
615 /* Force constants other than zero into memory. We do not know how
616 the instructions used to build constants modify the upper 64 bits
617 of the register; once we have that information we may be able
618 to handle some of them more efficiently. */
619 if (can_create_pseudo_p ()
620 && (CONSTANT_P (op1)
621 || (SUBREG_P (op1)
622 && CONSTANT_P (SUBREG_REG (op1))))
623 && ((register_operand (op0, mode)
624 && !standard_sse_constant_p (op1, mode))
625 /* ix86_expand_vector_move_misalign() does not like constants. */
626 || (SSE_REG_MODE_P (mode)
627 && MEM_P (op0)
628 && MEM_ALIGN (op0) < align)))
629 {
630 if (SUBREG_P (op1))
631 {
632 machine_mode imode = GET_MODE (SUBREG_REG (op1));
633 rtx r = force_const_mem (imode, SUBREG_REG (op1));
634 if (r)
635 r = validize_mem (r);
636 else
637 r = force_reg (imode, SUBREG_REG (op1));
638 op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
639 }
640 else
641 {
642 machine_mode mode = GET_MODE (op0);
643 rtx tmp = ix86_convert_const_wide_int_to_broadcast
644 (mode, op1);
645 if (tmp == nullptr)
646 op1 = validize_mem (force_const_mem (mode, op1));
647 else
648 op1 = tmp;
649 }
650 }
651
652 if (can_create_pseudo_p ()
653 && GET_MODE_SIZE (mode) >= 16
654 && VECTOR_MODE_P (mode)
655 && (MEM_P (op1)
656 && SYMBOL_REF_P (XEXP (op1, 0))
657 && CONSTANT_POOL_ADDRESS_P (XEXP (op1, 0))))
658 {
659 rtx first = ix86_broadcast_from_constant (mode, op1);
660 if (first != nullptr)
661 {
662 /* Broadcast to XMM/YMM/ZMM register from an integer
663 constant or scalar mem. */
664 op1 = gen_reg_rtx (mode);
665 if (FLOAT_MODE_P (mode)
666 || (!TARGET_64BIT && GET_MODE_INNER (mode) == DImode))
667 first = force_const_mem (GET_MODE_INNER (mode), first);
668 bool ok = ix86_expand_vector_init_duplicate (false, mode,
669 op1, first);
670 gcc_assert (ok);
671 emit_move_insn (op0, op1);
672 return;
673 }
674 }
675
676 /* We need to check memory alignment for SSE mode since attributes
677 can make operands unaligned. */
678 if (can_create_pseudo_p ()
679 && SSE_REG_MODE_P (mode)
680 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
681 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
682 {
683 rtx tmp[2];
684
685 /* ix86_expand_vector_move_misalign() does not like both
686 arguments in memory. */
687 if (!register_operand (op0, mode)
688 && !register_operand (op1, mode))
689 {
690 rtx scratch = gen_reg_rtx (mode);
691 emit_move_insn (scratch, op1);
692 op1 = scratch;
693 }
694
695 tmp[0] = op0; tmp[1] = op1;
696 ix86_expand_vector_move_misalign (mode, tmp);
697 return;
698 }
699
700 /* Special case TImode to 128-bit vector conversions via V2DI. */
701 if (VECTOR_MODE_P (mode)
702 && GET_MODE_SIZE (mode) == 16
703 && SUBREG_P (op1)
704 && GET_MODE (SUBREG_REG (op1)) == TImode
705 && TARGET_64BIT && TARGET_SSE
706 && can_create_pseudo_p ())
707 {
708 rtx tmp = gen_reg_rtx (V2DImode);
709 rtx lo = gen_reg_rtx (DImode);
710 rtx hi = gen_reg_rtx (DImode);
711 emit_move_insn (lo, gen_lowpart (DImode, SUBREG_REG (op1)));
712 emit_move_insn (hi, gen_highpart (DImode, SUBREG_REG (op1)));
713 emit_insn (gen_vec_concatv2di (tmp, lo, hi));
714 emit_move_insn (op0, gen_lowpart (mode, tmp));
715 return;
716 }
717
718 /* If operand0 is a hard register, make operand1 a pseudo. */
719 if (can_create_pseudo_p ()
720 && !ix86_hardreg_mov_ok (op0, op1))
721 {
722 rtx tmp = gen_reg_rtx (GET_MODE (op0));
723 emit_move_insn (tmp, op1);
724 emit_move_insn (op0, tmp);
725 return;
726 }
727
728 /* Make operand1 a register if it isn't already. */
729 if (can_create_pseudo_p ()
730 && !register_operand (op0, mode)
731 && !register_operand (op1, mode))
732 {
733 rtx tmp = gen_reg_rtx (GET_MODE (op0));
734 emit_move_insn (tmp, op1);
735 emit_move_insn (op0, tmp);
736 return;
737 }
738
739 emit_insn (gen_rtx_SET (op0, op1));
740 }
741
742 /* Split 32-byte AVX unaligned load and store if needed. */
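/* Illustrative shape of the split: an unaligned 32-byte load becomes a
   16-byte load of the low half plus a vec_concat with the high half;
   an unaligned 32-byte store becomes two vextractf128 stores.  */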
743
744 static void
745 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
746 {
747 rtx m;
748 rtx (*extract) (rtx, rtx, rtx);
749 machine_mode mode;
750
751 if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
752 || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
753 {
754 emit_insn (gen_rtx_SET (op0, op1));
755 return;
756 }
757
758 rtx orig_op0 = NULL_RTX;
759 mode = GET_MODE (op0);
760 switch (GET_MODE_CLASS (mode))
761 {
762 case MODE_VECTOR_INT:
763 case MODE_INT:
764 if (mode != V32QImode)
765 {
766 if (!MEM_P (op0))
767 {
768 orig_op0 = op0;
769 op0 = gen_reg_rtx (V32QImode);
770 }
771 else
772 op0 = gen_lowpart (V32QImode, op0);
773 op1 = gen_lowpart (V32QImode, op1);
774 mode = V32QImode;
775 }
776 break;
777 case MODE_VECTOR_FLOAT:
778 break;
779 default:
780 gcc_unreachable ();
781 }
782
783 switch (mode)
784 {
785 default:
786 gcc_unreachable ();
787 case E_V32QImode:
788 extract = gen_avx_vextractf128v32qi;
789 mode = V16QImode;
790 break;
791 case E_V16BFmode:
792 extract = gen_avx_vextractf128v16bf;
793 mode = V8BFmode;
794 break;
795 case E_V16HFmode:
796 extract = gen_avx_vextractf128v16hf;
797 mode = V8HFmode;
798 break;
799 case E_V8SFmode:
800 extract = gen_avx_vextractf128v8sf;
801 mode = V4SFmode;
802 break;
803 case E_V4DFmode:
804 extract = gen_avx_vextractf128v4df;
805 mode = V2DFmode;
806 break;
807 }
808
809 if (MEM_P (op1))
810 {
811 rtx r = gen_reg_rtx (mode);
812 m = adjust_address (op1, mode, 0);
813 emit_move_insn (r, m);
814 m = adjust_address (op1, mode, 16);
815 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
816 emit_move_insn (op0, r);
817 }
818 else if (MEM_P (op0))
819 {
820 m = adjust_address (op0, mode, 0);
821 emit_insn (extract (m, op1, const0_rtx));
822 m = adjust_address (op0, mode, 16);
823 emit_insn (extract (m, copy_rtx (op1), const1_rtx));
824 }
825 else
826 gcc_unreachable ();
827
828 if (orig_op0)
829 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
830 }
831
832 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
833 straight to ix86_expand_vector_move. */
834 /* Code generation for scalar reg-reg moves of single and double precision data:
835 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
836 movaps reg, reg
837 else
838 movss reg, reg
839 if (x86_sse_partial_reg_dependency == true)
840 movapd reg, reg
841 else
842 movsd reg, reg
843
844 Code generation for scalar loads of double precision data:
845 if (x86_sse_split_regs == true)
846 movlpd mem, reg (gas syntax)
847 else
848 movsd mem, reg
849
850 Code generation for unaligned packed loads of single precision data
851 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
852 if (x86_sse_unaligned_move_optimal)
853 movups mem, reg
854
855 if (x86_sse_partial_reg_dependency == true)
856 {
857 xorps reg, reg
858 movlps mem, reg
859 movhps mem+8, reg
860 }
861 else
862 {
863 movlps mem, reg
864 movhps mem+8, reg
865 }
866
867 Code generation for unaligned packed loads of double precision data
868 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
869 if (x86_sse_unaligned_move_optimal)
870 movupd mem, reg
871
872 if (x86_sse_split_regs == true)
873 {
874 movlpd mem, reg
875 movhpd mem+8, reg
876 }
877 else
878 {
879 movsd mem, reg
880 movhpd mem+8, reg
881 }
882 */
883
884 void
885 ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
886 {
887 rtx op0, op1, m;
888
889 op0 = operands[0];
890 op1 = operands[1];
891
892 /* Use unaligned load/store for AVX512 or when optimizing for size. */
893 if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
894 {
895 emit_insn (gen_rtx_SET (op0, op1));
896 return;
897 }
898
899 if (TARGET_AVX)
900 {
901 if (GET_MODE_SIZE (mode) == 32)
902 ix86_avx256_split_vector_move_misalign (op0, op1);
903 else
904 /* Always use 128-bit mov<mode>_internal pattern for AVX. */
905 emit_insn (gen_rtx_SET (op0, op1));
906 return;
907 }
908
909 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
910 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
911 {
912 emit_insn (gen_rtx_SET (op0, op1));
913 return;
914 }
915
916 /* ??? If we have typed data, then it would appear that using
917 movdqu is the only way to get unaligned data loaded with
918 integer type. */
919 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
920 {
921 emit_insn (gen_rtx_SET (op0, op1));
922 return;
923 }
924
925 if (MEM_P (op1))
926 {
927 if (TARGET_SSE2 && mode == V2DFmode)
928 {
929 rtx zero;
930
931 /* When SSE registers are split into halves, we can avoid
932 writing to the top half twice. */
933 if (TARGET_SSE_SPLIT_REGS)
934 {
935 emit_clobber (op0);
936 zero = op0;
937 }
938 else
939 {
940 /* ??? Not sure about the best option for the Intel chips.
941 The following would seem to satisfy; the register is
942 entirely cleared, breaking the dependency chain. We
943 then store to the upper half, with a dependency depth
944 of one. A rumor has it that Intel recommends two movsd
945 followed by an unpacklpd, but this is unconfirmed. And
946 given that the dependency depth of the unpacklpd would
947 still be one, I'm not sure why this would be better. */
948 zero = CONST0_RTX (V2DFmode);
949 }
950
951 m = adjust_address (op1, DFmode, 0);
952 emit_insn (gen_sse2_loadlpd (op0, zero, m));
953 m = adjust_address (op1, DFmode, 8);
954 emit_insn (gen_sse2_loadhpd (op0, op0, m));
955 }
956 else
957 {
958 rtx t;
959
960 if (mode != V4SFmode)
961 t = gen_reg_rtx (V4SFmode);
962 else
963 t = op0;
964
965 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
966 emit_move_insn (t, CONST0_RTX (V4SFmode));
967 else
968 emit_clobber (t);
969
970 m = adjust_address (op1, V2SFmode, 0);
971 emit_insn (gen_sse_loadlps (t, t, m));
972 m = adjust_address (op1, V2SFmode, 8);
973 emit_insn (gen_sse_loadhps (t, t, m));
974 if (mode != V4SFmode)
975 emit_move_insn (op0, gen_lowpart (mode, t));
976 }
977 }
978 else if (MEM_P (op0))
979 {
980 if (TARGET_SSE2 && mode == V2DFmode)
981 {
982 m = adjust_address (op0, DFmode, 0);
983 emit_insn (gen_sse2_storelpd (m, op1));
984 m = adjust_address (op0, DFmode, 8);
985 emit_insn (gen_sse2_storehpd (m, op1));
986 }
987 else
988 {
989 if (mode != V4SFmode)
990 op1 = gen_lowpart (V4SFmode, op1);
991
992 m = adjust_address (op0, V2SFmode, 0);
993 emit_insn (gen_sse_storelps (m, op1));
994 m = adjust_address (op0, V2SFmode, 8);
995 emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
996 }
997 }
998 else
999 gcc_unreachable ();
1000 }
1001
1002 /* Move bits 64:95 to bits 32:63. */
1003
1004 void
1005 ix86_move_vector_high_sse_to_mmx (rtx op)
1006 {
1007 rtx mask = gen_rtx_PARALLEL (VOIDmode,
1008 gen_rtvec (4, GEN_INT (0), GEN_INT (2),
1009 GEN_INT (0), GEN_INT (0)));
1010 rtx dest = lowpart_subreg (V4SImode, op, GET_MODE (op));
1011 op = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
1012 rtx insn = gen_rtx_SET (dest, op);
1013 emit_insn (insn);
1014 }
1015
1016 /* Split MMX pack with signed/unsigned saturation with SSE/SSE2. */
1017
1018 void
1019 ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
1020 {
1021 rtx op0 = operands[0];
1022 rtx op1 = operands[1];
1023 rtx op2 = operands[2];
1024 rtx src;
1025
1026 machine_mode dmode = GET_MODE (op0);
1027 machine_mode smode = GET_MODE (op1);
1028 machine_mode inner_dmode = GET_MODE_INNER (dmode);
1029 machine_mode inner_smode = GET_MODE_INNER (smode);
1030
1031 /* Get the corresponding SSE mode for destination. */
1032 int nunits = 16 / GET_MODE_SIZE (inner_dmode);
1033 machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode),
1034 nunits).require ();
1035 machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode),
1036 nunits / 2).require ();
1037
1038 /* Get the corresponding SSE mode for source. */
1039 nunits = 16 / GET_MODE_SIZE (inner_smode);
1040 machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode),
1041 nunits).require ();
1042
1043 /* Generate SSE pack with signed/unsigned saturation. */
1044 rtx dest = lowpart_subreg (sse_dmode, op0, GET_MODE (op0));
1045 op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
1046 op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));
1047
1048 /* packusdw/packuswb do unsigned saturation of a signed source
1049 which is different from generic us_truncate RTX. */
1050 if (code == US_TRUNCATE)
1051 src = gen_rtx_UNSPEC (sse_dmode,
1052 gen_rtvec (2, op1, op2),
1053 UNSPEC_US_TRUNCATE);
1054 else
1055 {
1056 op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
1057 op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
1058 src = gen_rtx_VEC_CONCAT (sse_dmode, op1, op2);
1059 }
1060
1061 emit_move_insn (dest, src);
1062
1063 ix86_move_vector_high_sse_to_mmx (op0);
1064 }
1065
1066 /* Split MMX punpcklXX/punpckhXX with SSE punpcklXX. */
1067
1068 void
1069 ix86_split_mmx_punpck (rtx operands[], bool high_p)
1070 {
1071 rtx op0 = operands[0];
1072 rtx op1 = operands[1];
1073 rtx op2 = operands[2];
1074 machine_mode mode = GET_MODE (op0);
1075 rtx mask;
1076 /* The corresponding SSE mode. */
1077 machine_mode sse_mode, double_sse_mode;
1078
1079 switch (mode)
1080 {
1081 case E_V4QImode:
1082 case E_V8QImode:
1083 sse_mode = V16QImode;
1084 double_sse_mode = V32QImode;
1085 mask = gen_rtx_PARALLEL (VOIDmode,
1086 gen_rtvec (16,
1087 GEN_INT (0), GEN_INT (16),
1088 GEN_INT (1), GEN_INT (17),
1089 GEN_INT (2), GEN_INT (18),
1090 GEN_INT (3), GEN_INT (19),
1091 GEN_INT (4), GEN_INT (20),
1092 GEN_INT (5), GEN_INT (21),
1093 GEN_INT (6), GEN_INT (22),
1094 GEN_INT (7), GEN_INT (23)));
1095 break;
1096
1097 case E_V4HImode:
1098 case E_V2HImode:
1099 sse_mode = V8HImode;
1100 double_sse_mode = V16HImode;
1101 mask = gen_rtx_PARALLEL (VOIDmode,
1102 gen_rtvec (8,
1103 GEN_INT (0), GEN_INT (8),
1104 GEN_INT (1), GEN_INT (9),
1105 GEN_INT (2), GEN_INT (10),
1106 GEN_INT (3), GEN_INT (11)));
1107 break;
1108
1109 case E_V2SImode:
1110 sse_mode = V4SImode;
1111 double_sse_mode = V8SImode;
1112 mask = gen_rtx_PARALLEL (VOIDmode,
1113 gen_rtvec (4,
1114 GEN_INT (0), GEN_INT (4),
1115 GEN_INT (1), GEN_INT (5)));
1116 break;
1117
1118 case E_V2SFmode:
1119 sse_mode = V4SFmode;
1120 double_sse_mode = V8SFmode;
1121 mask = gen_rtx_PARALLEL (VOIDmode,
1122 gen_rtvec (4,
1123 GEN_INT (0), GEN_INT (4),
1124 GEN_INT (1), GEN_INT (5)));
1125 break;
1126
1127 default:
1128 gcc_unreachable ();
1129 }
1130
1131 /* Generate SSE punpcklXX. */
1132 rtx dest = lowpart_subreg (sse_mode, op0, GET_MODE (op0));
1133 op1 = lowpart_subreg (sse_mode, op1, GET_MODE (op1));
1134 op2 = lowpart_subreg (sse_mode, op2, GET_MODE (op2));
1135
1136 op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2);
1137 op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask);
1138 rtx insn = gen_rtx_SET (dest, op2);
1139 emit_insn (insn);
1140
1141 /* Move high bits to low bits. */
1142 if (high_p)
1143 {
1144 if (sse_mode == V4SFmode)
1145 {
1146 mask = gen_rtx_PARALLEL (VOIDmode,
1147 gen_rtvec (4, GEN_INT (2), GEN_INT (3),
1148 GEN_INT (4), GEN_INT (5)));
1149 op2 = gen_rtx_VEC_CONCAT (V8SFmode, dest, dest);
1150 op1 = gen_rtx_VEC_SELECT (V4SFmode, op2, mask);
1151 }
1152 else
1153 {
1154 int sz = GET_MODE_SIZE (mode);
1155
1156 if (sz == 4)
1157 mask = gen_rtx_PARALLEL (VOIDmode,
1158 gen_rtvec (4, GEN_INT (1), GEN_INT (0),
1159 GEN_INT (0), GEN_INT (1)));
1160 else if (sz == 8)
1161 mask = gen_rtx_PARALLEL (VOIDmode,
1162 gen_rtvec (4, GEN_INT (2), GEN_INT (3),
1163 GEN_INT (0), GEN_INT (1)));
1164 else
1165 gcc_unreachable ();
1166
1167 dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
1168 op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
1169 }
1170
1171 insn = gen_rtx_SET (dest, op1);
1172 emit_insn (insn);
1173 }
1174 }
1175
1176 /* Helper function of ix86_fixup_binary_operands to canonicalize
1177 operand order. Returns true if the operands should be swapped. */
1178
1179 static bool
1180 ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
1181 rtx operands[])
1182 {
1183 rtx dst = operands[0];
1184 rtx src1 = operands[1];
1185 rtx src2 = operands[2];
1186
1187 /* If the operation is not commutative, we can't do anything. */
1188 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
1189 && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
1190 return false;
1191
1192 /* Highest priority is that src1 should match dst. */
1193 if (rtx_equal_p (dst, src1))
1194 return false;
1195 if (rtx_equal_p (dst, src2))
1196 return true;
1197
1198 /* Next highest priority is that immediate constants come second. */
1199 if (immediate_operand (src2, mode))
1200 return false;
1201 if (immediate_operand (src1, mode))
1202 return true;
1203
1204 /* Lowest priority is that memory references should come second. */
1205 if (MEM_P (src2))
1206 return false;
1207 if (MEM_P (src1))
1208 return true;
1209
1210 return false;
1211 }
1212
1213
1214 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
1215 destination to use for the operation. If different from the true
1216 destination in operands[0], a copy operation will be required. */
1217
1218 rtx
1219 ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
1220 rtx operands[])
1221 {
1222 rtx dst = operands[0];
1223 rtx src1 = operands[1];
1224 rtx src2 = operands[2];
1225
1226 /* Canonicalize operand order. */
1227 if (ix86_swap_binary_operands_p (code, mode, operands))
1228 {
1229 /* It is invalid to swap operands of different modes. */
1230 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
1231
1232 std::swap (src1, src2);
1233 }
1234
1235 /* Both source operands cannot be in memory. */
1236 if (MEM_P (src1) && MEM_P (src2))
1237 {
1238 /* Optimization: Only read from memory once. */
1239 if (rtx_equal_p (src1, src2))
1240 {
1241 src2 = force_reg (mode, src2);
1242 src1 = src2;
1243 }
1244 else if (rtx_equal_p (dst, src1))
1245 src2 = force_reg (mode, src2);
1246 else
1247 src1 = force_reg (mode, src1);
1248 }
1249
1250 /* If the destination is memory, and we do not have matching source
1251 operands, do things in registers. */
1252 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
1253 dst = gen_reg_rtx (mode);
1254
1255 /* Source 1 cannot be a constant. */
1256 if (CONSTANT_P (src1))
1257 src1 = force_reg (mode, src1);
1258
1259 /* Source 1 cannot be a non-matching memory. */
1260 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
1261 src1 = force_reg (mode, src1);
1262
1263 /* Improve address combine. */
1264 if (code == PLUS
1265 && GET_MODE_CLASS (mode) == MODE_INT
1266 && MEM_P (src2))
1267 src2 = force_reg (mode, src2);
1268
1269 operands[1] = src1;
1270 operands[2] = src2;
1271 return dst;
1272 }
1273
1274 /* Similarly, but assume that the destination has already been
1275 set up properly. */
1276
1277 void
1278 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
1279 machine_mode mode, rtx operands[])
1280 {
1281 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
1282 gcc_assert (dst == operands[0]);
1283 }
1284
1285 /* Attempt to expand a binary operator. Make the expansion closer to the
1286 actual machine than just general_operand, which would allow 3 separate
1287 memory references (one output, two input) in a single insn. */
1288
1289 void
1290 ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
1291 rtx operands[])
1292 {
1293 rtx src1, src2, dst, op, clob;
1294
1295 dst = ix86_fixup_binary_operands (code, mode, operands);
1296 src1 = operands[1];
1297 src2 = operands[2];
1298
1299 /* Emit the instruction. */
1300
1301 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
1302
1303 if (reload_completed
1304 && code == PLUS
1305 && !rtx_equal_p (dst, src1))
1306 {
1307 /* This is going to be an LEA; avoid splitting it later. */
1308 emit_insn (op);
1309 }
1310 else
1311 {
1312 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1313 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1314 }
1315
1316 /* Fix up the destination if needed. */
1317 if (dst != operands[0])
1318 emit_move_insn (operands[0], dst);
1319 }
1320
1321 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
1322 the given OPERANDS. */
1323
1324 void
1325 ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
1326 rtx operands[])
1327 {
1328 rtx op1 = NULL_RTX, op2 = NULL_RTX;
1329 if (SUBREG_P (operands[1]))
1330 {
1331 op1 = operands[1];
1332 op2 = operands[2];
1333 }
1334 else if (SUBREG_P (operands[2]))
1335 {
1336 op1 = operands[2];
1337 op2 = operands[1];
1338 }
1339 /* Optimize (__m128i) d | (__m128i) e and similar code
1340 when d and e are float vectors into a float vector logical
1341 insn. In C/C++ without using intrinsics there is no other way
1342 to express a vector logical operation on float vectors than
1343 to cast them temporarily to integer vectors. */
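/* E.g. for V4SF operands this emits andps/orps/xorps rather than the
   integer pand/por/pxor.  (Illustrative.)  */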
1344 if (op1
1345 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
1346 && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
1347 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
1348 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
1349 && SUBREG_BYTE (op1) == 0
1350 && (GET_CODE (op2) == CONST_VECTOR
1351 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
1352 && SUBREG_BYTE (op2) == 0))
1353 && can_create_pseudo_p ())
1354 {
1355 rtx dst;
1356 switch (GET_MODE (SUBREG_REG (op1)))
1357 {
1358 case E_V4SFmode:
1359 case E_V8SFmode:
1360 case E_V16SFmode:
1361 case E_V2DFmode:
1362 case E_V4DFmode:
1363 case E_V8DFmode:
1364 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
1365 if (GET_CODE (op2) == CONST_VECTOR)
1366 {
1367 op2 = gen_lowpart (GET_MODE (dst), op2);
1368 op2 = force_reg (GET_MODE (dst), op2);
1369 }
1370 else
1371 {
1372 op1 = operands[1];
1373 op2 = SUBREG_REG (operands[2]);
1374 if (!vector_operand (op2, GET_MODE (dst)))
1375 op2 = force_reg (GET_MODE (dst), op2);
1376 }
1377 op1 = SUBREG_REG (op1);
1378 if (!vector_operand (op1, GET_MODE (dst)))
1379 op1 = force_reg (GET_MODE (dst), op1);
1380 emit_insn (gen_rtx_SET (dst,
1381 gen_rtx_fmt_ee (code, GET_MODE (dst),
1382 op1, op2)));
1383 emit_move_insn (operands[0], gen_lowpart (mode, dst));
1384 return;
1385 default:
1386 break;
1387 }
1388 }
1389 if (!vector_operand (operands[1], mode))
1390 operands[1] = force_reg (mode, operands[1]);
1391 if (!vector_operand (operands[2], mode))
1392 operands[2] = force_reg (mode, operands[2]);
1393 ix86_fixup_binary_operands_no_copy (code, mode, operands);
1394 emit_insn (gen_rtx_SET (operands[0],
1395 gen_rtx_fmt_ee (code, mode, operands[1],
1396 operands[2])));
1397 }
1398
1399 /* Return TRUE or FALSE depending on whether the binary operator meets the
1400 appropriate constraints. */
1401
1402 bool
1403 ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
1404 rtx operands[3])
1405 {
1406 rtx dst = operands[0];
1407 rtx src1 = operands[1];
1408 rtx src2 = operands[2];
1409
1410 /* Both source operands cannot be in memory. */
1411 if ((MEM_P (src1) || bcst_mem_operand (src1, mode))
1412 && (MEM_P (src2) || bcst_mem_operand (src2, mode)))
1413 return false;
1414
1415 /* Canonicalize operand order for commutative operators. */
1416 if (ix86_swap_binary_operands_p (code, mode, operands))
1417 std::swap (src1, src2);
1418
1419 /* If the destination is memory, we must have a matching source operand. */
1420 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
1421 return false;
1422
1423 /* Source 1 cannot be a constant. */
1424 if (CONSTANT_P (src1))
1425 return false;
1426
1427 /* Source 1 cannot be a non-matching memory. */
1428 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
1429 /* Support "andhi/andsi/anddi" as a zero-extending move. */
1430 return (code == AND
1431 && (mode == HImode
1432 || mode == SImode
1433 || (TARGET_64BIT && mode == DImode))
1434 && satisfies_constraint_L (src2));
1435
1436 return true;
1437 }
1438
1439 /* Attempt to expand a unary operator. Make the expansion closer to the
1440 actual machine than just general_operand, which would allow 2 separate
1441 memory references (one output, one input) in a single insn. */
1442
1443 void
1444 ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
1445 rtx operands[])
1446 {
1447 bool matching_memory = false;
1448 rtx src, dst, op, clob;
1449
1450 dst = operands[0];
1451 src = operands[1];
1452
1453 /* If the destination is memory, and we do not have matching source
1454 operands, do things in registers. */
1455 if (MEM_P (dst))
1456 {
1457 if (rtx_equal_p (dst, src))
1458 matching_memory = true;
1459 else
1460 dst = gen_reg_rtx (mode);
1461 }
1462
1463 /* When source operand is memory, destination must match. */
1464 if (MEM_P (src) && !matching_memory)
1465 src = force_reg (mode, src);
1466
1467 /* Emit the instruction. */
1468
1469 op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
1470
1471 if (code == NOT)
1472 emit_insn (op);
1473 else
1474 {
1475 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1476 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1477 }
1478
1479 /* Fix up the destination if needed. */
1480 if (dst != operands[0])
1481 emit_move_insn (operands[0], dst);
1482 }
1483
1484 /* Predict just emitted jump instruction to be taken with probability PROB. */
1485
1486 static void
1487 predict_jump (int prob)
1488 {
1489 rtx_insn *insn = get_last_insn ();
1490 gcc_assert (JUMP_P (insn));
1491 add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
1492 }
1493
1494 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
1495 divisor are within the range [0-255]. */
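/* Roughly, the emitted sequence is (illustrative sketch):

     scratch = op2 | op3;
     if ((scratch & ~0xff) == 0)
       do the 8-bit unsigned divide, quotient from AL, remainder from AH;
     else
       do the original full-width signed/unsigned divide;  */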
1496
1497 void
1498 ix86_split_idivmod (machine_mode mode, rtx operands[],
1499 bool unsigned_p)
1500 {
1501 rtx_code_label *end_label, *qimode_label;
1502 rtx div, mod;
1503 rtx_insn *insn;
1504 rtx scratch, tmp0, tmp1, tmp2;
1505 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
1506
1507 operands[2] = force_reg (mode, operands[2]);
1508 operands[3] = force_reg (mode, operands[3]);
1509
1510 switch (mode)
1511 {
1512 case E_SImode:
1513 if (GET_MODE (operands[0]) == SImode)
1514 {
1515 if (GET_MODE (operands[1]) == SImode)
1516 gen_divmod4_1 = unsigned_p ? gen_udivmodsi4_1 : gen_divmodsi4_1;
1517 else
1518 gen_divmod4_1
1519 = unsigned_p ? gen_udivmodsi4_zext_2 : gen_divmodsi4_zext_2;
1520 }
1521 else
1522 gen_divmod4_1
1523 = unsigned_p ? gen_udivmodsi4_zext_1 : gen_divmodsi4_zext_1;
1524 break;
1525
1526 case E_DImode:
1527 gen_divmod4_1 = unsigned_p ? gen_udivmoddi4_1 : gen_divmoddi4_1;
1528 break;
1529
1530 default:
1531 gcc_unreachable ();
1532 }
1533
1534 end_label = gen_label_rtx ();
1535 qimode_label = gen_label_rtx ();
1536
1537 scratch = gen_reg_rtx (mode);
1538
1539 /* Use 8bit unsigned divmod if dividend and divisor are within
1540 the range [0-255]. */
1541 emit_move_insn (scratch, operands[2]);
1542 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
1543 scratch, 1, OPTAB_DIRECT);
1544 emit_insn (gen_test_ccno_1 (mode, scratch, GEN_INT (-0x100)));
1545 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
1546 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
1547 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
1548 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
1549 pc_rtx);
1550 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
1551 predict_jump (REG_BR_PROB_BASE * 50 / 100);
1552 JUMP_LABEL (insn) = qimode_label;
1553
1554 /* Generate the original signed/unsigned divmod. */
1555 emit_insn (gen_divmod4_1 (operands[0], operands[1],
1556 operands[2], operands[3]));
1557
1558 /* Branch to the end. */
1559 emit_jump_insn (gen_jump (end_label));
1560 emit_barrier ();
1561
1562 /* Generate 8bit unsigned divide. */
1563 emit_label (qimode_label);
1564 /* Don't use operands[0] for result of 8bit divide since not all
1565 registers support QImode ZERO_EXTRACT. */
1566 tmp0 = lowpart_subreg (HImode, scratch, mode);
1567 tmp1 = lowpart_subreg (HImode, operands[2], mode);
1568 tmp2 = lowpart_subreg (QImode, operands[3], mode);
1569 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
1570
1571 if (unsigned_p)
1572 {
1573 div = gen_rtx_UDIV (mode, operands[2], operands[3]);
1574 mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
1575 }
1576 else
1577 {
1578 div = gen_rtx_DIV (mode, operands[2], operands[3]);
1579 mod = gen_rtx_MOD (mode, operands[2], operands[3]);
1580 }
1581 if (mode == SImode)
1582 {
1583 if (GET_MODE (operands[0]) != SImode)
1584 div = gen_rtx_ZERO_EXTEND (DImode, div);
1585 if (GET_MODE (operands[1]) != SImode)
1586 mod = gen_rtx_ZERO_EXTEND (DImode, mod);
1587 }
1588
1589 /* Extract remainder from AH. */
1590 scratch = gen_lowpart (GET_MODE (operands[1]), scratch);
1591 tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]), scratch,
1592 GEN_INT (8), GEN_INT (8));
1593 insn = emit_move_insn (operands[1], tmp1);
1594 set_unique_reg_note (insn, REG_EQUAL, mod);
1595
1596 /* Zero extend quotient from AL. */
1597 tmp1 = gen_lowpart (QImode, tmp0);
1598 insn = emit_insn (gen_extend_insn
1599 (operands[0], tmp1,
1600 GET_MODE (operands[0]), QImode, 1));
1601 set_unique_reg_note (insn, REG_EQUAL, div);
1602
1603 emit_label (end_label);
1604 }
1605
1606 /* Emit x86 binary operand CODE in mode MODE, where the first operand
1607 matches destination. RTX includes clobber of FLAGS_REG. */
1608
1609 void
1610 ix86_emit_binop (enum rtx_code code, machine_mode mode,
1611 rtx dst, rtx src)
1612 {
1613 rtx op, clob;
1614
1615 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
1616 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1617
1618 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1619 }
1620
1621 /* Return true if regno1 def is nearest to the insn. */
1622
1623 static bool
1624 find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
1625 {
1626 rtx_insn *prev = insn;
1627 rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
1628
1629 if (insn == start)
1630 return false;
1631 while (prev && prev != start)
1632 {
1633 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
1634 {
1635 prev = PREV_INSN (prev);
1636 continue;
1637 }
1638 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
1639 return true;
1640 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
1641 return false;
1642 prev = PREV_INSN (prev);
1643 }
1644
1645 /* None of the regs is defined in the bb. */
1646 return false;
1647 }
1648
1649 /* INSN_UID of the last insn emitted by zero store peephole2s. */
1650 int ix86_last_zero_store_uid;
1651
1652 /* Split lea instructions into a sequence of instructions
1653 which are executed on ALU to avoid AGU stalls.
1654 It is assumed that it is allowed to clobber flags register
1655 at lea position. */
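/* Illustrative example: for rax = rbx + 2*rcx + 4 this emits, at the RTL
   level, rax = rcx; rax *= 2 (emitted as MULT, later a shift);
   rax += rbx; rax += 4.  */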
1656
1657 void
1658 ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
1659 {
1660 unsigned int regno0, regno1, regno2;
1661 struct ix86_address parts;
1662 rtx target, tmp;
1663 int ok, adds;
1664
1665 ok = ix86_decompose_address (operands[1], &parts);
1666 gcc_assert (ok);
1667
1668 target = gen_lowpart (mode, operands[0]);
1669
1670 regno0 = true_regnum (target);
1671 regno1 = INVALID_REGNUM;
1672 regno2 = INVALID_REGNUM;
1673
1674 if (parts.base)
1675 {
1676 parts.base = gen_lowpart (mode, parts.base);
1677 regno1 = true_regnum (parts.base);
1678 }
1679
1680 if (parts.index)
1681 {
1682 parts.index = gen_lowpart (mode, parts.index);
1683 regno2 = true_regnum (parts.index);
1684 }
1685
1686 if (parts.disp)
1687 parts.disp = gen_lowpart (mode, parts.disp);
1688
1689 if (parts.scale > 1)
1690 {
1691 /* Case r1 = r1 + ... */
1692 if (regno1 == regno0)
1693 {
1694 /* If we have the case r1 = r1 + C * r2 then we
1695 would have to use multiplication, which is very
1696 expensive. Assume the cost model is wrong if we
1697 see such a case here. */
1698 gcc_assert (regno2 != regno0);
1699
1700 for (adds = parts.scale; adds > 0; adds--)
1701 ix86_emit_binop (PLUS, mode, target, parts.index);
1702 }
1703 else
1704 {
1705 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
1706 if (regno0 != regno2)
1707 emit_insn (gen_rtx_SET (target, parts.index));
1708
1709 /* Use shift for scaling, but emit it as MULT instead
1710 to avoid it being immediately peephole2 optimized back
1711 into lea. */
1712 ix86_emit_binop (MULT, mode, target, GEN_INT (parts.scale));
1713
1714 if (parts.base)
1715 ix86_emit_binop (PLUS, mode, target, parts.base);
1716
1717 if (parts.disp && parts.disp != const0_rtx)
1718 ix86_emit_binop (PLUS, mode, target, parts.disp);
1719 }
1720 }
1721 else if (!parts.base && !parts.index)
1722 {
1723 gcc_assert (parts.disp);
1724 emit_insn (gen_rtx_SET (target, parts.disp));
1725 }
1726 else
1727 {
1728 if (!parts.base)
1729 {
1730 if (regno0 != regno2)
1731 emit_insn (gen_rtx_SET (target, parts.index));
1732 }
1733 else if (!parts.index)
1734 {
1735 if (regno0 != regno1)
1736 emit_insn (gen_rtx_SET (target, parts.base));
1737 }
1738 else
1739 {
1740 if (regno0 == regno1)
1741 tmp = parts.index;
1742 else if (regno0 == regno2)
1743 tmp = parts.base;
1744 else
1745 {
1746 rtx tmp1;
1747
1748 /* Find better operand for SET instruction, depending
1749 on which definition is farther from the insn. */
1750 if (find_nearest_reg_def (insn, regno1, regno2))
1751 tmp = parts.index, tmp1 = parts.base;
1752 else
1753 tmp = parts.base, tmp1 = parts.index;
1754
1755 emit_insn (gen_rtx_SET (target, tmp));
1756
1757 if (parts.disp && parts.disp != const0_rtx)
1758 ix86_emit_binop (PLUS, mode, target, parts.disp);
1759
1760 ix86_emit_binop (PLUS, mode, target, tmp1);
1761 return;
1762 }
1763
1764 ix86_emit_binop (PLUS, mode, target, tmp);
1765 }
1766
1767 if (parts.disp && parts.disp != const0_rtx)
1768 ix86_emit_binop (PLUS, mode, target, parts.disp);
1769 }
1770 }
1771
1772 /* Post-reload splitter for converting an SF or DFmode value in an
1773 SSE register into an unsigned SImode. */
1774
1775 void
1776 ix86_split_convert_uns_si_sse (rtx operands[])
1777 {
1778 machine_mode vecmode;
1779 rtx value, large, zero_or_two31, input, two31, x;
1780
1781 large = operands[1];
1782 zero_or_two31 = operands[2];
1783 input = operands[3];
1784 two31 = operands[4];
1785 vecmode = GET_MODE (large);
1786 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
1787
1788 /* Load up the value into the low element. We must ensure that the other
1789 elements are valid floats -- zero is the easiest such value. */
1790 if (MEM_P (input))
1791 {
1792 if (vecmode == V4SFmode)
1793 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
1794 else
1795 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
1796 }
1797 else
1798 {
1799 input = gen_rtx_REG (vecmode, REGNO (input));
1800 emit_move_insn (value, CONST0_RTX (vecmode));
1801 if (vecmode == V4SFmode)
1802 emit_insn (gen_sse_movss_v4sf (value, value, input));
1803 else
1804 emit_insn (gen_sse2_movsd_v2df (value, value, input));
1805 }
1806
1807 emit_move_insn (large, two31);
1808 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
1809
1810 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
1811 emit_insn (gen_rtx_SET (large, x));
1812
1813 x = gen_rtx_AND (vecmode, zero_or_two31, large);
1814 emit_insn (gen_rtx_SET (zero_or_two31, x));
1815
1816 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
1817 emit_insn (gen_rtx_SET (value, x));
1818
1819 large = gen_rtx_REG (V4SImode, REGNO (large));
1820 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
1821
1822 x = gen_rtx_REG (V4SImode, REGNO (value));
1823 if (vecmode == V4SFmode)
1824 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
1825 else
1826 emit_insn (gen_sse2_cvttpd2dq (x, value));
1827 value = x;
1828
1829 emit_insn (gen_xorv4si3 (value, value, large));
1830 }
1831
1832 static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
1833 machine_mode mode, rtx target,
1834 rtx var, int one_var);
1835
1836 /* Convert an unsigned DImode value into a DFmode, using only SSE.
1837 Expects the 64-bit DImode to be supplied in a pair of integral
1838 registers. Requires SSE2; will use SSE3 if available. For x86_32,
1839 -mfpmath=sse, !optimize_size only. */
1840
1841 void
1842 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
1843 {
1844 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
1845 rtx int_xmm, fp_xmm;
1846 rtx biases, exponents;
1847 rtx x;
1848
1849 int_xmm = gen_reg_rtx (V4SImode);
1850 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
1851 emit_insn (gen_movdi_to_sse (int_xmm, input));
1852 else if (TARGET_SSE_SPLIT_REGS)
1853 {
1854 emit_clobber (int_xmm);
1855 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
1856 }
1857 else
1858 {
1859 x = gen_reg_rtx (V2DImode);
1860 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
1861 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
1862 }
1863
1864 x = gen_rtx_CONST_VECTOR (V4SImode,
1865 gen_rtvec (4, GEN_INT (0x43300000UL),
1866 GEN_INT (0x45300000UL),
1867 const0_rtx, const0_rtx));
1868 exponents = validize_mem (force_const_mem (V4SImode, x));
1869
1870 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
1871 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
1872
1873 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
1874 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
1875 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
1876 (0x1.0p84 + double(fp_value_hi_xmm)).
1877 Note these exponents differ by 32. */
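/* Concretely, for input x = hi * 2**32 + lo, subtracting the biases below
   leaves double (lo) and double (hi) * 0x1p32 in the two lanes, and their
   sum is x, with at most one rounding in the final addition.  */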
1878
1879 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
1880
1881 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
1882 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
1883 real_ldexp (&bias_lo_rvt, &dconst1, 52);
1884 real_ldexp (&bias_hi_rvt, &dconst1, 84);
1885 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
1886 x = const_double_from_real_value (bias_hi_rvt, DFmode);
1887 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
1888 biases = validize_mem (force_const_mem (V2DFmode, biases));
1889 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
1890
1891 /* Add the upper and lower DFmode values together. */
1892 if (TARGET_SSE3)
1893 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
1894 else
1895 {
1896 x = copy_to_mode_reg (V2DFmode, fp_xmm);
1897 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
1898 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
1899 }
1900
1901 ix86_expand_vector_extract (false, target, fp_xmm, 0);
1902 }
1903
1904 /* Not used, but eases macroization of patterns. */
1905 void
1906 ix86_expand_convert_uns_sixf_sse (rtx, rtx)
1907 {
1908 gcc_unreachable ();
1909 }
1910
1911 static rtx ix86_expand_sse_fabs (rtx op0, rtx *smask);
1912
1913 /* Convert an unsigned SImode value into a DFmode. Only currently used
1914 for SSE, but applicable anywhere. */
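/* Sketch of the trick (illustrative): adding INT_MIN maps the unsigned
   input U to the signed value U - 2**31; converting that to DFmode and
   adding 2**31 back as a double recovers U exactly, e.g. 0xffffffff
   becomes 2147483647.0 + 0x1p31 == 4294967295.0.  */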
1915
1916 void
1917 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
1918 {
1919 REAL_VALUE_TYPE TWO31r;
1920 rtx x, fp;
1921
1922 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
1923 NULL, 1, OPTAB_DIRECT);
1924
1925 fp = gen_reg_rtx (DFmode);
1926 emit_insn (gen_floatsidf2 (fp, x));
1927
1928 real_ldexp (&TWO31r, &dconst1, 31);
1929 x = const_double_from_real_value (TWO31r, DFmode);
1930
1931 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
1932
1933 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
1934 if (HONOR_SIGNED_ZEROS (DFmode) && flag_rounding_math)
1935 x = ix86_expand_sse_fabs (x, NULL);
1936
1937 if (x != target)
1938 emit_move_insn (target, x);
1939 }
1940
1941 /* Convert a signed DImode value into a DFmode. Only used for SSE in
1942 32-bit mode; otherwise we have a direct convert instruction. */
1943
1944 void
1945 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
1946 {
1947 REAL_VALUE_TYPE TWO32r;
1948 rtx fp_lo, fp_hi, x;
1949
1950 fp_lo = gen_reg_rtx (DFmode);
1951 fp_hi = gen_reg_rtx (DFmode);
1952
1953 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
1954
1955 real_ldexp (&TWO32r, &dconst1, 32);
1956 x = const_double_from_real_value (TWO32r, DFmode);
1957 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
1958
1959 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
1960
1961 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
1962 0, OPTAB_DIRECT);
1963 if (x != target)
1964 emit_move_insn (target, x);
1965 }
1966
1967 /* Convert an unsigned SImode value into a SFmode, using only SSE.
1968 For x86_32, -mfpmath=sse, !optimize_size only. */
1969 void
1970 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
1971 {
1972 REAL_VALUE_TYPE ONE16r;
1973 rtx fp_hi, fp_lo, int_hi, int_lo, x;
1974
1975 real_ldexp (&ONE16r, &dconst1, 16);
1976 x = const_double_from_real_value (ONE16r, SFmode);
1977 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
1978 NULL, 0, OPTAB_DIRECT);
1979 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
1980 NULL, 0, OPTAB_DIRECT);
1981 fp_hi = gen_reg_rtx (SFmode);
1982 fp_lo = gen_reg_rtx (SFmode);
1983 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
1984 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
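/* Sketch of the idea (illustrative): the result is fp_hi * 0x1.0p16 + fp_lo.
Splitting at 16 bits keeps both halves exactly representable in SFmode
(24-bit significand), so the only rounding happens in the final
multiply-add. E.g. 0xdeadbeef becomes 57005.0 * 65536.0 + 48879.0. */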
1985 if (TARGET_FMA)
1986 {
1987 x = validize_mem (force_const_mem (SFmode, x));
1988 fp_hi = gen_rtx_FMA (SFmode, fp_hi, x, fp_lo);
1989 emit_move_insn (target, fp_hi);
1990 }
1991 else
1992 {
1993 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
1994 0, OPTAB_DIRECT);
1995 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
1996 0, OPTAB_DIRECT);
1997 if (!rtx_equal_p (target, fp_hi))
1998 emit_move_insn (target, fp_hi);
1999 }
2000 }
2001
2002 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
2003 a vector of unsigned ints VAL to vector of floats TARGET. */
2004
2005 void
2006 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
2007 {
2008 rtx tmp[8];
2009 REAL_VALUE_TYPE TWO16r;
2010 machine_mode intmode = GET_MODE (val);
2011 machine_mode fltmode = GET_MODE (target);
2012 rtx (*cvt) (rtx, rtx);
2013
2014 if (intmode == V4SImode)
2015 cvt = gen_floatv4siv4sf2;
2016 else
2017 cvt = gen_floatv8siv8sf2;
2018 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
2019 tmp[0] = force_reg (intmode, tmp[0]);
2020 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
2021 OPTAB_DIRECT);
2022 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
2023 NULL_RTX, 1, OPTAB_DIRECT);
2024 tmp[3] = gen_reg_rtx (fltmode);
2025 emit_insn (cvt (tmp[3], tmp[1]));
2026 tmp[4] = gen_reg_rtx (fltmode);
2027 emit_insn (cvt (tmp[4], tmp[2]));
2028 real_ldexp (&TWO16r, &dconst1, 16);
2029 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
2030 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
2031 if (TARGET_FMA)
2032 {
2033 tmp[6] = gen_rtx_FMA (fltmode, tmp[4], tmp[5], tmp[3]);
2034 emit_move_insn (target, tmp[6]);
2035 }
2036 else
2037 {
2038 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5],
2039 NULL_RTX, 1, OPTAB_DIRECT);
2040 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6],
2041 target, 1, OPTAB_DIRECT);
2042 if (tmp[7] != target)
2043 emit_move_insn (target, tmp[7]);
2044 }
2045 }
2046
2047 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
2048 pattern can be used on it instead of fixuns_trunc*.
2049 This is done by using just a signed conversion if the value is < 0x1p31,
2050 and otherwise by subtracting 0x1p31 first and XORing in 0x80000000 from *XORP afterwards. */
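/* Worked example (illustrative): for a lane holding 3000000000.0 (>= 0x1p31)
the compare below yields an all-ones mask, so 0x1.0p31 is subtracted to
give 852516352.0; the signed fix_trunc then produces 852516352 and XORing
with 0x80000000 restores 3000000000. Lanes below 0x1p31 get a zero mask
and are left untouched. */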
2051
2052 rtx
2053 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
2054 {
2055 REAL_VALUE_TYPE TWO31r;
2056 rtx two31r, tmp[4];
2057 machine_mode mode = GET_MODE (val);
2058 machine_mode scalarmode = GET_MODE_INNER (mode);
2059 machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
2060 rtx (*cmp) (rtx, rtx, rtx, rtx);
2061 int i;
2062
2063 for (i = 0; i < 3; i++)
2064 tmp[i] = gen_reg_rtx (mode);
2065 real_ldexp (&TWO31r, &dconst1, 31);
2066 two31r = const_double_from_real_value (TWO31r, scalarmode);
2067 two31r = ix86_build_const_vector (mode, 1, two31r);
2068 two31r = force_reg (mode, two31r);
2069 switch (mode)
2070 {
2071 case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
2072 case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
2073 case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
2074 case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
2075 default: gcc_unreachable ();
2076 }
2077 tmp[3] = gen_rtx_LE (mode, two31r, val);
2078 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
2079 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
2080 0, OPTAB_DIRECT);
2081 if (intmode == V4SImode || TARGET_AVX2)
2082 *xorp = expand_simple_binop (intmode, ASHIFT,
2083 gen_lowpart (intmode, tmp[0]),
2084 GEN_INT (31), NULL_RTX, 0,
2085 OPTAB_DIRECT);
2086 else
2087 {
2088 rtx two31 = gen_int_mode (HOST_WIDE_INT_1U << 31, SImode);
2089 two31 = ix86_build_const_vector (intmode, 1, two31);
2090 *xorp = expand_simple_binop (intmode, AND,
2091 gen_lowpart (intmode, tmp[0]),
2092 two31, NULL_RTX, 0,
2093 OPTAB_DIRECT);
2094 }
2095 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
2096 0, OPTAB_DIRECT);
2097 }
2098
2099 /* Generate code for floating point ABS or NEG. */
2100
2101 void
2102 ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
2103 rtx operands[])
2104 {
2105 rtx set, dst, src;
2106 bool use_sse = false;
2107 bool vector_mode = VECTOR_MODE_P (mode);
2108 machine_mode vmode = mode;
2109 rtvec par;
2110
2111 if (vector_mode || mode == TFmode || mode == HFmode)
2112 {
2113 use_sse = true;
2114 if (mode == HFmode)
2115 vmode = V8HFmode;
2116 }
2117 else if (TARGET_SSE_MATH)
2118 {
2119 use_sse = SSE_FLOAT_MODE_P (mode);
2120 if (mode == SFmode)
2121 vmode = V4SFmode;
2122 else if (mode == DFmode)
2123 vmode = V2DFmode;
2124 }
2125
2126 dst = operands[0];
2127 src = operands[1];
2128
2129 set = gen_rtx_fmt_e (code, mode, src);
2130 set = gen_rtx_SET (dst, set);
2131
2132 if (use_sse)
2133 {
2134 rtx mask, use, clob;
2135
2136 /* NEG and ABS performed with SSE use bitwise mask operations.
2137 Create the appropriate mask now. */
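/* For example, for DFmode ABS the mask built here is the all-but-sign-bit
constant 0x7fffffffffffffff broadcast to V2DFmode, so the operation
reduces to a bitwise AND that clears bit 63; for NEG the mask is the
sign bit 0x8000000000000000 itself and the operation is a bitwise XOR
that flips it. */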
2138 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
2139 use = gen_rtx_USE (VOIDmode, mask);
2140 if (vector_mode || mode == TFmode)
2141 par = gen_rtvec (2, set, use);
2142 else
2143 {
2144 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
2145 par = gen_rtvec (3, set, use, clob);
2146 }
2147 }
2148 else
2149 {
2150 rtx clob;
2151
2152 /* Changing the sign of FP values can also be done using the integer unit. */
2153 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
2154 par = gen_rtvec (2, set, clob);
2155 }
2156
2157 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
2158 }
2159
2160 /* Deconstruct a floating point ABS or NEG operation
2161 with integer registers into integer operations. */
2162
2163 void
2164 ix86_split_fp_absneg_operator (enum rtx_code code, machine_mode mode,
2165 rtx operands[])
2166 {
2167 enum rtx_code absneg_op;
2168 rtx dst, set;
2169
2170 gcc_assert (operands_match_p (operands[0], operands[1]));
2171
2172 switch (mode)
2173 {
2174 case E_SFmode:
2175 dst = gen_lowpart (SImode, operands[0]);
2176
2177 if (code == ABS)
2178 {
2179 set = gen_int_mode (0x7fffffff, SImode);
2180 absneg_op = AND;
2181 }
2182 else
2183 {
2184 set = gen_int_mode (0x80000000, SImode);
2185 absneg_op = XOR;
2186 }
2187 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
2188 break;
2189
2190 case E_DFmode:
2191 if (TARGET_64BIT)
2192 {
2193 dst = gen_lowpart (DImode, operands[0]);
2194 dst = gen_rtx_ZERO_EXTRACT (DImode, dst, const1_rtx, GEN_INT (63));
2195
2196 if (code == ABS)
2197 set = const0_rtx;
2198 else
2199 set = gen_rtx_NOT (DImode, dst);
2200 }
2201 else
2202 {
2203 dst = gen_highpart (SImode, operands[0]);
2204
2205 if (code == ABS)
2206 {
2207 set = gen_int_mode (0x7fffffff, SImode);
2208 absneg_op = AND;
2209 }
2210 else
2211 {
2212 set = gen_int_mode (0x80000000, SImode);
2213 absneg_op = XOR;
2214 }
2215 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
2216 }
2217 break;
2218
2219 case E_XFmode:
2220 dst = gen_rtx_REG (SImode,
2221 REGNO (operands[0]) + (TARGET_64BIT ? 1 : 2));
2222 if (code == ABS)
2223 {
2224 set = GEN_INT (0x7fff);
2225 absneg_op = AND;
2226 }
2227 else
2228 {
2229 set = GEN_INT (0x8000);
2230 absneg_op = XOR;
2231 }
2232 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
2233 break;
2234
2235 default:
2236 gcc_unreachable ();
2237 }
2238
2239 set = gen_rtx_SET (dst, set);
2240
2241 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
2242 rtvec par = gen_rtvec (2, set, clob);
2243
2244 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
2245 }
2246
2247 /* Expand a copysign operation. Special case operand 1 being a constant. */
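/* Bitwise sketch of the operation (illustrative): with MASK holding only the
sign bit, copysign (x, y) is (x & ~MASK) | (y & MASK). E.g. for DFmode,
copysign (2.0, -3.0): (0x4000000000000000 & 0x7fffffffffffffff)
| (0xc008000000000000 & 0x8000000000000000) == 0xc000000000000000, i.e. -2.0. */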
2248
2249 void
2250 ix86_expand_copysign (rtx operands[])
2251 {
2252 machine_mode mode, vmode;
2253 rtx dest, vdest, op0, op1, mask, op2, op3;
2254
2255 mode = GET_MODE (operands[0]);
2256
2257 if (mode == HFmode)
2258 vmode = V8HFmode;
2259 else if (mode == SFmode)
2260 vmode = V4SFmode;
2261 else if (mode == DFmode)
2262 vmode = V2DFmode;
2263 else if (mode == TFmode)
2264 vmode = mode;
2265 else
2266 gcc_unreachable ();
2267
2268 if (rtx_equal_p (operands[1], operands[2]))
2269 {
2270 emit_move_insn (operands[0], operands[1]);
2271 return;
2272 }
2273
2274 dest = operands[0];
2275 vdest = lowpart_subreg (vmode, dest, mode);
2276 if (vdest == NULL_RTX)
2277 vdest = gen_reg_rtx (vmode);
2278 else
2279 dest = NULL_RTX;
2280 op1 = lowpart_subreg (vmode, force_reg (mode, operands[2]), mode);
2281 mask = ix86_build_signbit_mask (vmode, TARGET_AVX512F && mode != HFmode, 0);
2282
2283 if (CONST_DOUBLE_P (operands[1]))
2284 {
2285 op0 = simplify_unary_operation (ABS, mode, operands[1], mode);
2286 /* Optimize for 0; simplify b = copysignf (0.0f, a) to b = mask & a. */
2287 if (op0 == CONST0_RTX (mode))
2288 {
2289 emit_move_insn (vdest, gen_rtx_AND (vmode, mask, op1));
2290 if (dest)
2291 emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
2292 return;
2293 }
2294
2295 if (GET_MODE_SIZE (mode) < 16)
2296 op0 = ix86_build_const_vector (vmode, false, op0);
2297 op0 = force_reg (vmode, op0);
2298 }
2299 else
2300 op0 = lowpart_subreg (vmode, force_reg (mode, operands[1]), mode);
2301
2302 op2 = gen_reg_rtx (vmode);
2303 op3 = gen_reg_rtx (vmode);
2304 emit_move_insn (op2, gen_rtx_AND (vmode,
2305 gen_rtx_NOT (vmode, mask),
2306 op0));
2307 emit_move_insn (op3, gen_rtx_AND (vmode, mask, op1));
2308 emit_move_insn (vdest, gen_rtx_IOR (vmode, op2, op3));
2309 if (dest)
2310 emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
2311 }
2312
2313 /* Expand an xorsign operation. */
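/* Illustrative sketch: xorsign (x, y) computes x ^ (y & signmask), i.e. it
flips the sign of x exactly when y is negative. E.g. xorsign (2.0, -3.0)
is -2.0 and xorsign (-2.0, -3.0) is 2.0; unlike copysign it multiplies the
signs rather than copying one. */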
2314
2315 void
2316 ix86_expand_xorsign (rtx operands[])
2317 {
2318 machine_mode mode, vmode;
2319 rtx dest, vdest, op0, op1, mask, x, temp;
2320
2321 dest = operands[0];
2322 op0 = operands[1];
2323 op1 = operands[2];
2324
2325 mode = GET_MODE (dest);
2326
2327 if (mode == HFmode)
2328 vmode = V8HFmode;
2329 else if (mode == SFmode)
2330 vmode = V4SFmode;
2331 else if (mode == DFmode)
2332 vmode = V2DFmode;
2333 else
2334 gcc_unreachable ();
2335
2336 temp = gen_reg_rtx (vmode);
2337 mask = ix86_build_signbit_mask (vmode, 0, 0);
2338
2339 op1 = lowpart_subreg (vmode, force_reg (mode, op1), mode);
2340 x = gen_rtx_AND (vmode, op1, mask);
2341 emit_insn (gen_rtx_SET (temp, x));
2342
2343 op0 = lowpart_subreg (vmode, force_reg (mode, op0), mode);
2344 x = gen_rtx_XOR (vmode, temp, op0);
2345
2346 vdest = lowpart_subreg (vmode, dest, mode);
2347 if (vdest == NULL_RTX)
2348 vdest = gen_reg_rtx (vmode);
2349 else
2350 dest = NULL_RTX;
2351 emit_insn (gen_rtx_SET (vdest, x));
2352
2353 if (dest)
2354 emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
2355 }
2356
2357 static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1);
2358
2359 void
2360 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
2361 {
2362 machine_mode mode = GET_MODE (op0);
2363 rtx tmp;
2364
2365 /* Handle the special case of a vector comparison with a boolean result;
2366 transform it using the ptest instruction. */
2367 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
2368 || mode == OImode)
2369 {
2370 rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
2371 machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
2372
2373 gcc_assert (code == EQ || code == NE);
2374
2375 if (mode == OImode)
2376 {
2377 op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
2378 op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
2379 mode = p_mode;
2380 }
2381 /* Generate XOR since we can't check that one operand is zero vector. */
2382 tmp = gen_reg_rtx (mode);
2383 emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
2384 tmp = gen_lowpart (p_mode, tmp);
2385 emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
2386 gen_rtx_UNSPEC (CCZmode,
2387 gen_rtvec (2, tmp, tmp),
2388 UNSPEC_PTEST)));
2389 tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
2390 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2391 gen_rtx_LABEL_REF (VOIDmode, label),
2392 pc_rtx);
2393 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2394 return;
2395 }
2396
2397 switch (mode)
2398 {
2399 case E_HFmode:
2400 case E_SFmode:
2401 case E_DFmode:
2402 case E_XFmode:
2403 case E_QImode:
2404 case E_HImode:
2405 case E_SImode:
2406 simple:
2407 tmp = ix86_expand_compare (code, op0, op1);
2408 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2409 gen_rtx_LABEL_REF (VOIDmode, label),
2410 pc_rtx);
2411 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2412 return;
2413
2414 case E_DImode:
2415 if (TARGET_64BIT)
2416 goto simple;
2417 /* FALLTHRU */
2418 case E_TImode:
2419 /* DI and TI mode equality/inequality comparisons may be performed
2420 on SSE registers. Avoid splitting them, except when optimizing
2421 for size. */
2422 if ((code == EQ || code == NE)
2423 && !optimize_insn_for_size_p ())
2424 goto simple;
2425
2426 /* Expand DImode branch into multiple compare+branch. */
2427 {
2428 rtx lo[2], hi[2];
2429 rtx_code_label *label2;
2430 enum rtx_code code1, code2, code3;
2431 machine_mode submode;
2432
2433 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
2434 {
2435 std::swap (op0, op1);
2436 code = swap_condition (code);
2437 }
2438
2439 split_double_mode (mode, &op0, 1, lo+0, hi+0);
2440 split_double_mode (mode, &op1, 1, lo+1, hi+1);
2441
2442 submode = mode == DImode ? SImode : DImode;
2443
2444 /* If we are doing a less-than or greater-than-or-equal comparison,
2445 op1 is a constant and its low word is zero, then we can just
2446 examine the high word. Similarly for a low word of -1 and
2447 a less-than-or-equal or greater-than comparison. */
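/* Worked example (illustrative): for a DImode unsigned a < 0x500000000 on
ia32 the low word of the constant is zero, so the comparison holds iff
hi(a) < 5, and a single SImode compare-and-branch on the high word is
emitted. */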
2448
2449 if (CONST_INT_P (hi[1]))
2450 switch (code)
2451 {
2452 case LT: case LTU: case GE: case GEU:
2453 if (lo[1] == const0_rtx)
2454 {
2455 ix86_expand_branch (code, hi[0], hi[1], label);
2456 return;
2457 }
2458 break;
2459 case LE: case LEU: case GT: case GTU:
2460 if (lo[1] == constm1_rtx)
2461 {
2462 ix86_expand_branch (code, hi[0], hi[1], label);
2463 return;
2464 }
2465 break;
2466 default:
2467 break;
2468 }
2469
2470 /* Emulate comparisons that do not depend on Zero flag with
2471 double-word subtraction. Note that only Overflow, Sign
2472 and Carry flags are valid, so swap arguments and condition
2473 of comparisons that would otherwise test Zero flag. */
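/* In other words (illustrative sketch): compute lo(a) - lo(b) just for the
carry, then hi(a) - hi(b) - CF into a scratch, and branch on the resulting
Sign/Overflow (signed) or Carry (unsigned) flags. The Zero flag of that
high-word subtraction does not reflect the full double-word result, which
is why LE/LEU/GT/GTU were turned into GE/GEU/LT/LTU by the swap above. */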
2474
2475 switch (code)
2476 {
2477 case LE: case LEU: case GT: case GTU:
2478 std::swap (lo[0], lo[1]);
2479 std::swap (hi[0], hi[1]);
2480 code = swap_condition (code);
2481 /* FALLTHRU */
2482
2483 case LT: case LTU: case GE: case GEU:
2484 {
2485 bool uns = (code == LTU || code == GEU);
2486 rtx (*sbb_insn) (machine_mode, rtx, rtx, rtx)
2487 = uns ? gen_sub3_carry_ccc : gen_sub3_carry_ccgz;
2488
2489 if (!nonimmediate_operand (lo[0], submode))
2490 lo[0] = force_reg (submode, lo[0]);
2491 if (!x86_64_general_operand (lo[1], submode))
2492 lo[1] = force_reg (submode, lo[1]);
2493
2494 if (!register_operand (hi[0], submode))
2495 hi[0] = force_reg (submode, hi[0]);
2496 if ((uns && !nonimmediate_operand (hi[1], submode))
2497 || (!uns && !x86_64_general_operand (hi[1], submode)))
2498 hi[1] = force_reg (submode, hi[1]);
2499
2500 emit_insn (gen_cmp_1 (submode, lo[0], lo[1]));
2501
2502 tmp = gen_rtx_SCRATCH (submode);
2503 emit_insn (sbb_insn (submode, tmp, hi[0], hi[1]));
2504
2505 tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
2506 ix86_expand_branch (code, tmp, const0_rtx, label);
2507 return;
2508 }
2509
2510 default:
2511 break;
2512 }
2513
2514 /* Otherwise, we need two or three jumps. */
2515
2516 label2 = gen_label_rtx ();
2517
2518 code1 = code;
2519 code2 = swap_condition (code);
2520 code3 = unsigned_condition (code);
2521
2522 switch (code)
2523 {
2524 case LT: case GT: case LTU: case GTU:
2525 break;
2526
2527 case LE: code1 = LT; code2 = GT; break;
2528 case GE: code1 = GT; code2 = LT; break;
2529 case LEU: code1 = LTU; code2 = GTU; break;
2530 case GEU: code1 = GTU; code2 = LTU; break;
2531
2532 case EQ: code1 = UNKNOWN; code2 = NE; break;
2533 case NE: code2 = UNKNOWN; break;
2534
2535 default:
2536 gcc_unreachable ();
2537 }
2538
2539 /*
2540 * a < b =>
2541 * if (hi(a) < hi(b)) goto true;
2542 * if (hi(a) > hi(b)) goto false;
2543 * if (lo(a) < lo(b)) goto true;
2544 * false:
2545 */
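/* Given the EQ/NE shortcut above and the sbb path that returns for all the
ordered inequalities, only EQ and NE normally reach this point (when
optimizing for size). E.g. a DImode a == b on ia32 becomes roughly:
if (hi(a) != hi(b)) goto label2; if (lo(a) == lo(b)) goto label; label2:. */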
2546
2547 if (code1 != UNKNOWN)
2548 ix86_expand_branch (code1, hi[0], hi[1], label);
2549 if (code2 != UNKNOWN)
2550 ix86_expand_branch (code2, hi[0], hi[1], label2);
2551
2552 ix86_expand_branch (code3, lo[0], lo[1], label);
2553
2554 if (code2 != UNKNOWN)
2555 emit_label (label2);
2556 return;
2557 }
2558
2559 default:
2560 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
2561 goto simple;
2562 }
2563 }
2564
2565 /* Figure out whether to use unordered fp comparisons. */
2566
2567 static bool
2568 ix86_unordered_fp_compare (enum rtx_code code)
2569 {
2570 if (!TARGET_IEEE_FP)
2571 return false;
2572
2573 switch (code)
2574 {
2575 case LT:
2576 case LE:
2577 case GT:
2578 case GE:
2579 case LTGT:
2580 return false;
2581
2582 case EQ:
2583 case NE:
2584
2585 case UNORDERED:
2586 case ORDERED:
2587 case UNLT:
2588 case UNLE:
2589 case UNGT:
2590 case UNGE:
2591 case UNEQ:
2592 return true;
2593
2594 default:
2595 gcc_unreachable ();
2596 }
2597 }
2598
2599 /* Return a comparison we can do that is equivalent to
2600 swap_condition (code), except possibly for orderedness.
2601 But never change orderedness if TARGET_IEEE_FP, returning
2602 UNKNOWN in that case if necessary. */
2603
2604 static enum rtx_code
2605 ix86_fp_swap_condition (enum rtx_code code)
2606 {
2607 switch (code)
2608 {
2609 case GT: /* GTU - CF=0 & ZF=0 */
2610 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
2611 case GE: /* GEU - CF=0 */
2612 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
2613 case UNLT: /* LTU - CF=1 */
2614 return TARGET_IEEE_FP ? UNKNOWN : GT;
2615 case UNLE: /* LEU - CF=1 | ZF=1 */
2616 return TARGET_IEEE_FP ? UNKNOWN : GE;
2617 default:
2618 return swap_condition (code);
2619 }
2620 }
2621
2622 /* Return the cost of comparison CODE using the best strategy for performance.
2623 All of the following functions use the number of instructions as the cost metric.
2624 In the future this should be tweaked to compute bytes for optimize_size and
2625 take into account the performance of various instructions on various CPUs. */
2626
2627 static int
2628 ix86_fp_comparison_cost (enum rtx_code code)
2629 {
2630 int arith_cost;
2631
2632 /* The cost of code using bit-twiddling on %ah. */
2633 switch (code)
2634 {
2635 case UNLE:
2636 case UNLT:
2637 case LTGT:
2638 case GT:
2639 case GE:
2640 case UNORDERED:
2641 case ORDERED:
2642 case UNEQ:
2643 arith_cost = 4;
2644 break;
2645 case LT:
2646 case NE:
2647 case EQ:
2648 case UNGE:
2649 arith_cost = TARGET_IEEE_FP ? 5 : 4;
2650 break;
2651 case LE:
2652 case UNGT:
2653 arith_cost = TARGET_IEEE_FP ? 6 : 4;
2654 break;
2655 default:
2656 gcc_unreachable ();
2657 }
2658
2659 switch (ix86_fp_comparison_strategy (code))
2660 {
2661 case IX86_FPCMP_COMI:
2662 return arith_cost > 4 ? 3 : 2;
2663 case IX86_FPCMP_SAHF:
2664 return arith_cost > 4 ? 4 : 3;
2665 default:
2666 return arith_cost;
2667 }
2668 }
2669
2670 /* Swap, force into registers, or otherwise massage the two operands
2671 to a fp comparison. The operands are updated in place; the new
2672 comparison code is returned. */
2673
2674 static enum rtx_code
2675 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
2676 {
2677 bool unordered_compare = ix86_unordered_fp_compare (code);
2678 rtx op0 = *pop0, op1 = *pop1;
2679 machine_mode op_mode = GET_MODE (op0);
2680 bool is_sse = SSE_FLOAT_MODE_SSEMATH_OR_HF_P (op_mode);
2681
2682 if (op_mode == BFmode)
2683 {
2684 rtx op = gen_lowpart (HImode, op0);
2685 if (CONST_INT_P (op))
2686 op = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
2687 op0, BFmode);
2688 else
2689 {
2690 rtx t1 = gen_reg_rtx (SImode);
2691 emit_insn (gen_zero_extendhisi2 (t1, op));
2692 emit_insn (gen_ashlsi3 (t1, t1, GEN_INT (16)));
2693 op = gen_lowpart (SFmode, t1);
2694 }
2695 *pop0 = op;
2696 op = gen_lowpart (HImode, op1);
2697 if (CONST_INT_P (op))
2698 op = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
2699 op1, BFmode);
2700 else
2701 {
2702 rtx t1 = gen_reg_rtx (SImode);
2703 emit_insn (gen_zero_extendhisi2 (t1, op));
2704 emit_insn (gen_ashlsi3 (t1, t1, GEN_INT (16)));
2705 op = gen_lowpart (SFmode, t1);
2706 }
2707 *pop1 = op;
2708 return ix86_prepare_fp_compare_args (code, pop0, pop1);
2709 }
2710
2711 /* All of the unordered compare instructions only work on registers.
2712 The same is true of the fcomi compare instructions. The XFmode
2713 compare instructions require registers except when comparing
2714 against zero or when converting operand 1 from fixed point to
2715 floating point. */
2716
2717 if (!is_sse
2718 && (unordered_compare
2719 || (op_mode == XFmode
2720 && ! (standard_80387_constant_p (op0) == 1
2721 || standard_80387_constant_p (op1) == 1)
2722 && GET_CODE (op1) != FLOAT)
2723 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
2724 {
2725 op0 = force_reg (op_mode, op0);
2726 op1 = force_reg (op_mode, op1);
2727 }
2728 else
2729 {
2730 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
2731 things around if they appear profitable, otherwise force op0
2732 into a register. */
2733
2734 if (standard_80387_constant_p (op0) == 0
2735 || (MEM_P (op0)
2736 && ! (standard_80387_constant_p (op1) == 0
2737 || MEM_P (op1))))
2738 {
2739 enum rtx_code new_code = ix86_fp_swap_condition (code);
2740 if (new_code != UNKNOWN)
2741 {
2742 std::swap (op0, op1);
2743 code = new_code;
2744 }
2745 }
2746
2747 if (!REG_P (op0))
2748 op0 = force_reg (op_mode, op0);
2749
2750 if (CONSTANT_P (op1))
2751 {
2752 int tmp = standard_80387_constant_p (op1);
2753 if (tmp == 0)
2754 op1 = validize_mem (force_const_mem (op_mode, op1));
2755 else if (tmp == 1)
2756 {
2757 if (TARGET_CMOVE)
2758 op1 = force_reg (op_mode, op1);
2759 }
2760 else
2761 op1 = force_reg (op_mode, op1);
2762 }
2763 }
2764
2765 /* Try to rearrange the comparison to make it cheaper. */
2766 if (ix86_fp_comparison_cost (code)
2767 > ix86_fp_comparison_cost (swap_condition (code))
2768 && (REG_P (op1) || can_create_pseudo_p ()))
2769 {
2770 std::swap (op0, op1);
2771 code = swap_condition (code);
2772 if (!REG_P (op0))
2773 op0 = force_reg (op_mode, op0);
2774 }
2775
2776 *pop0 = op0;
2777 *pop1 = op1;
2778 return code;
2779 }
2780
2781 /* Generate insn patterns to do a floating point compare of OPERANDS. */
2782
2783 static rtx
2784 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)
2785 {
2786 bool unordered_compare = ix86_unordered_fp_compare (code);
2787 machine_mode cmp_mode;
2788 rtx tmp, scratch;
2789
2790 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
2791
2792 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
2793 if (unordered_compare)
2794 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
2795
2796 /* Do fcomi/sahf based test when profitable. */
2797 switch (ix86_fp_comparison_strategy (code))
2798 {
2799 case IX86_FPCMP_COMI:
2800 cmp_mode = CCFPmode;
2801 emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
2802 break;
2803
2804 case IX86_FPCMP_SAHF:
2805 cmp_mode = CCFPmode;
2806 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2807 scratch = gen_reg_rtx (HImode);
2808 emit_insn (gen_rtx_SET (scratch, tmp));
2809 emit_insn (gen_x86_sahf_1 (scratch));
2810 break;
2811
2812 case IX86_FPCMP_ARITH:
2813 cmp_mode = CCNOmode;
2814 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2815 scratch = gen_reg_rtx (HImode);
2816 emit_insn (gen_rtx_SET (scratch, tmp));
2817
2818 /* In the unordered case, we have to check C2 for NaN's, which
2819 doesn't happen to work out to anything nice combination-wise.
2820 So do some bit twiddling on the value we've got in AH to come
2821 up with an appropriate set of condition codes. */
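/* For reference (assuming the usual 8087 status-word layout): after fnstsw,
AH holds C0 in bit 0 (0x01), C2 in bit 2 (0x04) and C3 in bit 6 (0x40),
so 0x45 masks all three. A compare leaves all three clear for "greater",
sets C0 for "less", C3 for "equal", and C0, C2 and C3 for "unordered". */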
2822
2823 switch (code)
2824 {
2825 case GT:
2826 case UNGT:
2827 if (code == GT || !TARGET_IEEE_FP)
2828 {
2829 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2830 code = EQ;
2831 }
2832 else
2833 {
2834 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2835 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2836 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
2837 cmp_mode = CCmode;
2838 code = GEU;
2839 }
2840 break;
2841 case LT:
2842 case UNLT:
2843 if (code == LT && TARGET_IEEE_FP)
2844 {
2845 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2846 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
2847 cmp_mode = CCmode;
2848 code = EQ;
2849 }
2850 else
2851 {
2852 emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
2853 code = NE;
2854 }
2855 break;
2856 case GE:
2857 case UNGE:
2858 if (code == GE || !TARGET_IEEE_FP)
2859 {
2860 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
2861 code = EQ;
2862 }
2863 else
2864 {
2865 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2866 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
2867 code = NE;
2868 }
2869 break;
2870 case LE:
2871 case UNLE:
2872 if (code == LE && TARGET_IEEE_FP)
2873 {
2874 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2875 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2876 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2877 cmp_mode = CCmode;
2878 code = LTU;
2879 }
2880 else
2881 {
2882 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2883 code = NE;
2884 }
2885 break;
2886 case EQ:
2887 case UNEQ:
2888 if (code == EQ && TARGET_IEEE_FP)
2889 {
2890 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2891 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2892 cmp_mode = CCmode;
2893 code = EQ;
2894 }
2895 else
2896 {
2897 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2898 code = NE;
2899 }
2900 break;
2901 case NE:
2902 case LTGT:
2903 if (code == NE && TARGET_IEEE_FP)
2904 {
2905 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2906 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
2907 GEN_INT (0x40)));
2908 code = NE;
2909 }
2910 else
2911 {
2912 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2913 code = EQ;
2914 }
2915 break;
2916
2917 case UNORDERED:
2918 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2919 code = NE;
2920 break;
2921 case ORDERED:
2922 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2923 code = EQ;
2924 break;
2925
2926 default:
2927 gcc_unreachable ();
2928 }
2929 break;
2930
2931 default:
2932 gcc_unreachable ();
2933 }
2934
2935 /* Return the test that should be put into the flags user, i.e.
2936 the bcc, scc, or cmov instruction. */
2937 return gen_rtx_fmt_ee (code, VOIDmode,
2938 gen_rtx_REG (cmp_mode, FLAGS_REG),
2939 const0_rtx);
2940 }
2941
2942 /* Generate insn patterns to do an integer compare of OPERANDS. */
2943
2944 static rtx
2945 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
2946 {
2947 machine_mode cmpmode;
2948 rtx tmp, flags;
2949
2950 /* Swap operands to emit carry flag comparison. */
2951 if ((code == GTU || code == LEU)
2952 && nonimmediate_operand (op1, VOIDmode))
2953 {
2954 std::swap (op0, op1);
2955 code = swap_condition (code);
2956 }
2957
2958 cmpmode = SELECT_CC_MODE (code, op0, op1);
2959 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
2960
2961 /* This is very simple, but making the interface the same as in the
2962 FP case makes the rest of the code easier. */
2963 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
2964 emit_insn (gen_rtx_SET (flags, tmp));
2965
2966 /* Return the test that should be put into the flags user, i.e.
2967 the bcc, scc, or cmov instruction. */
2968 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
2969 }
2970
2971 static rtx
2972 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
2973 {
2974 rtx ret;
2975
2976 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
2977 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
2978
2979 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
2980 {
2981 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
2982 ret = ix86_expand_fp_compare (code, op0, op1);
2983 }
2984 else
2985 ret = ix86_expand_int_compare (code, op0, op1);
2986
2987 return ret;
2988 }
2989
2990 void
2991 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
2992 {
2993 rtx ret;
2994
2995 gcc_assert (GET_MODE (dest) == QImode);
2996
2997 ret = ix86_expand_compare (code, op0, op1);
2998 PUT_MODE (ret, QImode);
2999 emit_insn (gen_rtx_SET (dest, ret));
3000 }
3001
3002 /* Expand floating point op0 <=> op1, i.e.
3003 dest = op0 == op1 ? 0 : op0 < op1 ? -1 : op0 > op1 ? 1 : 2. */
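/* Rough shape of the emitted sequence (illustrative): a single GT compare
followed by conditional jumps - to l2 (dest = 2) if unordered (IEEE only),
to l0 (dest = 0) if equal, to l1 (dest = 1) if greater - with the
fall-through storing -1. All four outcomes reuse the one comparison. */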
3004
3005 void
3006 ix86_expand_fp_spaceship (rtx dest, rtx op0, rtx op1)
3007 {
3008 gcc_checking_assert (ix86_fp_comparison_strategy (GT) != IX86_FPCMP_ARITH);
3009 rtx gt = ix86_expand_fp_compare (GT, op0, op1);
3010 rtx l0 = gen_label_rtx ();
3011 rtx l1 = gen_label_rtx ();
3012 rtx l2 = TARGET_IEEE_FP ? gen_label_rtx () : NULL_RTX;
3013 rtx lend = gen_label_rtx ();
3014 rtx tmp;
3015 rtx_insn *jmp;
3016 if (l2)
3017 {
3018 rtx un = gen_rtx_fmt_ee (UNORDERED, VOIDmode,
3019 gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx);
3020 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, un,
3021 gen_rtx_LABEL_REF (VOIDmode, l2), pc_rtx);
3022 jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
3023 add_reg_br_prob_note (jmp, profile_probability::very_unlikely ());
3024 }
3025 rtx eq = gen_rtx_fmt_ee (UNEQ, VOIDmode,
3026 gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx);
3027 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, eq,
3028 gen_rtx_LABEL_REF (VOIDmode, l0), pc_rtx);
3029 jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
3030 add_reg_br_prob_note (jmp, profile_probability::unlikely ());
3031 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, gt,
3032 gen_rtx_LABEL_REF (VOIDmode, l1), pc_rtx);
3033 jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
3034 add_reg_br_prob_note (jmp, profile_probability::even ());
3035 emit_move_insn (dest, constm1_rtx);
3036 emit_jump (lend);
3037 emit_label (l0);
3038 emit_move_insn (dest, const0_rtx);
3039 emit_jump (lend);
3040 emit_label (l1);
3041 emit_move_insn (dest, const1_rtx);
3042 emit_jump (lend);
3043 if (l2)
3044 {
3045 emit_label (l2);
3046 emit_move_insn (dest, const2_rtx);
3047 }
3048 emit_label (lend);
3049 }
3050
3051 /* Expand comparison setting or clearing carry flag. Return true when
3052 successful and set pop for the operation. */
3053 static bool
3054 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
3055 {
3056 machine_mode mode
3057 = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
3058
3059 /* Do not handle double-mode compares that go through special path. */
3060 if (mode == (TARGET_64BIT ? TImode : DImode))
3061 return false;
3062
3063 if (SCALAR_FLOAT_MODE_P (mode))
3064 {
3065 rtx compare_op;
3066 rtx_insn *compare_seq;
3067
3068 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
3069
3070 /* Shortcut: following common codes never translate
3071 into carry flag compares. */
3072 if (code == EQ || code == NE || code == UNEQ || code == LTGT
3073 || code == ORDERED || code == UNORDERED)
3074 return false;
3075
3076 /* These comparisons require zero flag; swap operands so they won't. */
3077 if ((code == GT || code == UNLE || code == LE || code == UNGT)
3078 && !TARGET_IEEE_FP)
3079 {
3080 std::swap (op0, op1);
3081 code = swap_condition (code);
3082 }
3083
3084 /* Try to expand the comparison and verify that we end up with
3085 a carry-flag-based comparison. This fails to be true only when
3086 we decide to expand the comparison using arithmetic, which is not
3087 a common scenario. */
3088 start_sequence ();
3089 compare_op = ix86_expand_fp_compare (code, op0, op1);
3090 compare_seq = get_insns ();
3091 end_sequence ();
3092
3093 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
3094 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
3095 else
3096 code = GET_CODE (compare_op);
3097
3098 if (code != LTU && code != GEU)
3099 return false;
3100
3101 emit_insn (compare_seq);
3102 *pop = compare_op;
3103 return true;
3104 }
3105
3106 if (!INTEGRAL_MODE_P (mode))
3107 return false;
3108
3109 switch (code)
3110 {
3111 case LTU:
3112 case GEU:
3113 break;
3114
3115 /* Convert a==0 into (unsigned)a<1. */
3116 case EQ:
3117 case NE:
3118 if (op1 != const0_rtx)
3119 return false;
3120 op1 = const1_rtx;
3121 code = (code == EQ ? LTU : GEU);
3122 break;
3123
3124 /* Convert a>b into b<a or a>=b+1. */
3125 case GTU:
3126 case LEU:
3127 if (CONST_INT_P (op1))
3128 {
3129 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
3130 /* Bail out on overflow. We still can swap operands but that
3131 would force loading of the constant into register. */
3132 if (op1 == const0_rtx
3133 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
3134 return false;
3135 code = (code == GTU ? GEU : LTU);
3136 }
3137 else
3138 {
3139 std::swap (op0, op1);
3140 code = (code == GTU ? LTU : GEU);
3141 }
3142 break;
3143
3144 /* Convert a>=0 into (unsigned)a<0x80000000. */
3145 case LT:
3146 case GE:
3147 if (mode == DImode || op1 != const0_rtx)
3148 return false;
3149 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
3150 code = (code == LT ? GEU : LTU);
3151 break;
3152 case LE:
3153 case GT:
3154 if (mode == DImode || op1 != constm1_rtx)
3155 return false;
3156 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
3157 code = (code == LE ? GEU : LTU);
3158 break;
3159
3160 default:
3161 return false;
3162 }
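/* For example, a != 0 has been rewritten above as (unsigned) a >= 1, so a
plain cmp against 1 leaves the answer in the carry flag (GEU <=> CF clear),
ready to be consumed by the adc/sbb users of this function. */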
3163 /* Swapping operands may cause constant to appear as first operand. */
3164 if (!nonimmediate_operand (op0, VOIDmode))
3165 {
3166 if (!can_create_pseudo_p ())
3167 return false;
3168 op0 = force_reg (mode, op0);
3169 }
3170 *pop = ix86_expand_compare (code, op0, op1);
3171 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
3172 return true;
3173 }
3174
3175 /* Expand conditional increment or decrement using adc/sbb instructions.
3176 The default case using setcc followed by the conditional move can be
3177 done by generic code. */
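/* Illustrative sketch: for dest = x - ((unsigned) a < b ? 1 : 0) the compare
is first massaged so that its truth value lands in the carry flag, and the
whole update then becomes a single "sbb $0"-style subtraction into dest;
the mirrored case dest = x + (a < b ? 1 : 0) becomes an "adc $0". */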
3178 bool
3179 ix86_expand_int_addcc (rtx operands[])
3180 {
3181 enum rtx_code code = GET_CODE (operands[1]);
3182 rtx flags;
3183 rtx (*insn) (machine_mode, rtx, rtx, rtx, rtx, rtx);
3184 rtx compare_op;
3185 rtx val = const0_rtx;
3186 bool fpcmp = false;
3187 machine_mode mode;
3188 rtx op0 = XEXP (operands[1], 0);
3189 rtx op1 = XEXP (operands[1], 1);
3190
3191 if (operands[3] != const1_rtx
3192 && operands[3] != constm1_rtx)
3193 return false;
3194 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
3195 return false;
3196 code = GET_CODE (compare_op);
3197
3198 flags = XEXP (compare_op, 0);
3199
3200 if (GET_MODE (flags) == CCFPmode)
3201 {
3202 fpcmp = true;
3203 code = ix86_fp_compare_code_to_integer (code);
3204 }
3205
3206 if (code != LTU)
3207 {
3208 val = constm1_rtx;
3209 if (fpcmp)
3210 PUT_CODE (compare_op,
3211 reverse_condition_maybe_unordered
3212 (GET_CODE (compare_op)));
3213 else
3214 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
3215 }
3216
3217 mode = GET_MODE (operands[0]);
3218
3219 /* Construct either adc or sbb insn. */
3220 if ((code == LTU) == (operands[3] == constm1_rtx))
3221 insn = gen_sub3_carry;
3222 else
3223 insn = gen_add3_carry;
3224
3225 emit_insn (insn (mode, operands[0], operands[2], val, flags, compare_op));
3226
3227 return true;
3228 }
3229
3230 bool
3231 ix86_expand_int_movcc (rtx operands[])
3232 {
3233 enum rtx_code code = GET_CODE (operands[1]), compare_code;
3234 rtx_insn *compare_seq;
3235 rtx compare_op;
3236 machine_mode mode = GET_MODE (operands[0]);
3237 bool sign_bit_compare_p = false;
3238 bool negate_cc_compare_p = false;
3239 rtx op0 = XEXP (operands[1], 0);
3240 rtx op1 = XEXP (operands[1], 1);
3241 rtx op2 = operands[2];
3242 rtx op3 = operands[3];
3243
3244 if (GET_MODE (op0) == TImode
3245 || (GET_MODE (op0) == DImode
3246 && !TARGET_64BIT))
3247 return false;
3248
3249 if (GET_MODE (op0) == BFmode
3250 && !ix86_fp_comparison_operator (operands[1], VOIDmode))
3251 return false;
3252
3253 start_sequence ();
3254 compare_op = ix86_expand_compare (code, op0, op1);
3255 compare_seq = get_insns ();
3256 end_sequence ();
3257
3258 compare_code = GET_CODE (compare_op);
3259
3260 if ((op1 == const0_rtx && (code == GE || code == LT))
3261 || (op1 == constm1_rtx && (code == GT || code == LE)))
3262 sign_bit_compare_p = true;
3263
3264 /* op0 == op1 ? op0 : op3 is equivalent to op0 == op1 ? op1 : op3,
3265 but if op1 is a constant, the latter form allows more optimizations,
3266 either because the last two operands are then both constants, or
3267 because it creates the one-constant, one-variable case. On the
3268 other hand, for cmov the former might be better as we don't need
3269 to load the constant into another register. */
3270 if (code == EQ && CONST_INT_P (op1) && rtx_equal_p (op0, op2))
3271 op2 = op1;
3272 /* Similarly for op0 != op1 ? op2 : op0 and op0 != op1 ? op2 : op1. */
3273 else if (code == NE && CONST_INT_P (op1) && rtx_equal_p (op0, op3))
3274 op3 = op1;
3275
3276 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
3277 HImode insns, we'd be swallowed in word prefix ops. */
3278
3279 if ((mode != HImode || TARGET_FAST_PREFIX)
3280 && (mode != (TARGET_64BIT ? TImode : DImode))
3281 && CONST_INT_P (op2)
3282 && CONST_INT_P (op3))
3283 {
3284 rtx out = operands[0];
3285 HOST_WIDE_INT ct = INTVAL (op2);
3286 HOST_WIDE_INT cf = INTVAL (op3);
3287 HOST_WIDE_INT diff;
3288
3289 if ((mode == SImode
3290 || (TARGET_64BIT && mode == DImode))
3291 && (GET_MODE (op0) == SImode
3292 || (TARGET_64BIT && GET_MODE (op0) == DImode)))
3293 {
3294 /* Special case x != 0 ? -1 : y. */
3295 if (code == NE && op1 == const0_rtx && ct == -1)
3296 {
3297 negate_cc_compare_p = true;
3298 std::swap (ct, cf);
3299 code = EQ;
3300 }
3301 else if (code == EQ && op1 == const0_rtx && cf == -1)
3302 negate_cc_compare_p = true;
3303 }
3304
3305 diff = ct - cf;
3306 /* Sign bit compares are better done using shifts than by using
3307 sbb. */
3308 if (sign_bit_compare_p
3309 || negate_cc_compare_p
3310 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
3311 {
3312 /* Detect overlap between destination and compare sources. */
3313 rtx tmp = out;
3314
3315 if (negate_cc_compare_p)
3316 {
3317 if (GET_MODE (op0) == DImode)
3318 emit_insn (gen_x86_negdi_ccc (gen_reg_rtx (DImode), op0));
3319 else
3320 emit_insn (gen_x86_negsi_ccc (gen_reg_rtx (SImode),
3321 gen_lowpart (SImode, op0)));
3322
3323 tmp = gen_reg_rtx (mode);
3324 if (mode == DImode)
3325 emit_insn (gen_x86_movdicc_0_m1_neg (tmp));
3326 else
3327 emit_insn (gen_x86_movsicc_0_m1_neg (gen_lowpart (SImode,
3328 tmp)));
3329 }
3330 else if (!sign_bit_compare_p)
3331 {
3332 rtx flags;
3333 bool fpcmp = false;
3334
3335 compare_code = GET_CODE (compare_op);
3336
3337 flags = XEXP (compare_op, 0);
3338
3339 if (GET_MODE (flags) == CCFPmode)
3340 {
3341 fpcmp = true;
3342 compare_code
3343 = ix86_fp_compare_code_to_integer (compare_code);
3344 }
3345
3346 /* To simplify rest of code, restrict to the GEU case. */
3347 if (compare_code == LTU)
3348 {
3349 std::swap (ct, cf);
3350 compare_code = reverse_condition (compare_code);
3351 code = reverse_condition (code);
3352 }
3353 else
3354 {
3355 if (fpcmp)
3356 PUT_CODE (compare_op,
3357 reverse_condition_maybe_unordered
3358 (GET_CODE (compare_op)));
3359 else
3360 PUT_CODE (compare_op,
3361 reverse_condition (GET_CODE (compare_op)));
3362 }
3363 diff = ct - cf;
3364
3365 if (reg_overlap_mentioned_p (out, compare_op))
3366 tmp = gen_reg_rtx (mode);
3367
3368 if (mode == DImode)
3369 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
3370 else
3371 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
3372 flags, compare_op));
3373 }
3374 else
3375 {
3376 if (code == GT || code == GE)
3377 code = reverse_condition (code);
3378 else
3379 {
3380 std::swap (ct, cf);
3381 diff = ct - cf;
3382 }
3383 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
3384 }
3385
3386 if (diff == 1)
3387 {
3388 /*
3389 * cmpl op0,op1
3390 * sbbl dest,dest
3391 * [addl dest, ct]
3392 *
3393 * Size 5 - 8.
3394 */
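/* Concretely (illustrative): dest = (unsigned a < b) ? 1 : 2 reaches this
point with ct == 2 after the swap above; the sbb materializes
(a < b) ? -1 : 0 and the final add of 2 produces 1 or 2 without a branch. */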
3395 if (ct)
3396 tmp = expand_simple_binop (mode, PLUS,
3397 tmp, GEN_INT (ct),
3398 copy_rtx (tmp), 1, OPTAB_DIRECT);
3399 }
3400 else if (cf == -1)
3401 {
3402 /*
3403 * cmpl op0,op1
3404 * sbbl dest,dest
3405 * orl $ct, dest
3406 *
3407 * Size 8.
3408 */
3409 tmp = expand_simple_binop (mode, IOR,
3410 tmp, GEN_INT (ct),
3411 copy_rtx (tmp), 1, OPTAB_DIRECT);
3412 }
3413 else if (diff == -1 && ct)
3414 {
3415 /*
3416 * cmpl op0,op1
3417 * sbbl dest,dest
3418 * notl dest
3419 * [addl dest, cf]
3420 *
3421 * Size 8 - 11.
3422 */
3423 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3424 if (cf)
3425 tmp = expand_simple_binop (mode, PLUS,
3426 copy_rtx (tmp), GEN_INT (cf),
3427 copy_rtx (tmp), 1, OPTAB_DIRECT);
3428 }
3429 else
3430 {
3431 /*
3432 * cmpl op0,op1
3433 * sbbl dest,dest
3434 * [notl dest]
3435 * andl cf - ct, dest
3436 * [addl dest, ct]
3437 *
3438 * Size 8 - 11.
3439 */
3440
3441 if (cf == 0)
3442 {
3443 cf = ct;
3444 ct = 0;
3445 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3446 }
3447
3448 tmp = expand_simple_binop (mode, AND,
3449 copy_rtx (tmp),
3450 gen_int_mode (cf - ct, mode),
3451 copy_rtx (tmp), 1, OPTAB_DIRECT);
3452 if (ct)
3453 tmp = expand_simple_binop (mode, PLUS,
3454 copy_rtx (tmp), GEN_INT (ct),
3455 copy_rtx (tmp), 1, OPTAB_DIRECT);
3456 }
3457
3458 if (!rtx_equal_p (tmp, out))
3459 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
3460
3461 return true;
3462 }
3463
3464 if (diff < 0)
3465 {
3466 machine_mode cmp_mode = GET_MODE (op0);
3467 enum rtx_code new_code;
3468
3469 if (SCALAR_FLOAT_MODE_P (cmp_mode))
3470 {
3471 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3472
3473 /* We may be reversing a non-trapping
3474 comparison to a trapping comparison. */
3475 if (HONOR_NANS (cmp_mode) && flag_trapping_math
3476 && code != EQ && code != NE
3477 && code != ORDERED && code != UNORDERED)
3478 new_code = UNKNOWN;
3479 else
3480 new_code = reverse_condition_maybe_unordered (code);
3481 }
3482 else
3483 new_code = ix86_reverse_condition (code, cmp_mode);
3484 if (new_code != UNKNOWN)
3485 {
3486 std::swap (ct, cf);
3487 diff = -diff;
3488 code = new_code;
3489 }
3490 }
3491
3492 compare_code = UNKNOWN;
3493 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
3494 && CONST_INT_P (op1))
3495 {
3496 if (op1 == const0_rtx
3497 && (code == LT || code == GE))
3498 compare_code = code;
3499 else if (op1 == constm1_rtx)
3500 {
3501 if (code == LE)
3502 compare_code = LT;
3503 else if (code == GT)
3504 compare_code = GE;
3505 }
3506 }
3507
3508 /* Optimize dest = (op0 < 0) ? -1 : cf. */
3509 if (compare_code != UNKNOWN
3510 && GET_MODE (op0) == GET_MODE (out)
3511 && (cf == -1 || ct == -1))
3512 {
3513 /* If lea code below could be used, only optimize
3514 if it results in a 2 insn sequence. */
3515
3516 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
3517 || diff == 3 || diff == 5 || diff == 9)
3518 || (compare_code == LT && ct == -1)
3519 || (compare_code == GE && cf == -1))
3520 {
3521 /*
3522 * notl op1 (if necessary)
3523 * sarl $31, op1
3524 * orl cf, op1
3525 */
3526 if (ct != -1)
3527 {
3528 cf = ct;
3529 ct = -1;
3530 code = reverse_condition (code);
3531 }
3532
3533 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3534
3535 out = expand_simple_binop (mode, IOR,
3536 out, GEN_INT (cf),
3537 out, 1, OPTAB_DIRECT);
3538 if (out != operands[0])
3539 emit_move_insn (operands[0], out);
3540
3541 return true;
3542 }
3543 }
3544
3545
3546 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
3547 || diff == 3 || diff == 5 || diff == 9)
3548 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
3549 && (mode != DImode
3550 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
3551 {
3552 /*
3553 * xorl dest,dest
3554 * cmpl op1,op2
3555 * setcc dest
3556 * lea cf(dest*(ct-cf)),dest
3557 *
3558 * Size 14.
3559 *
3560 * This also catches the degenerate setcc-only case.
3561 */
3562
3563 rtx tmp;
3564 int nops;
3565
3566 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3567
3568 nops = 0;
3569 /* On x86_64 the lea instruction operates on Pmode, so we need
3570 to get the arithmetic done in the proper mode to match. */
3571 if (diff == 1)
3572 tmp = copy_rtx (out);
3573 else
3574 {
3575 rtx out1;
3576 out1 = copy_rtx (out);
3577 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
3578 nops++;
3579 if (diff & 1)
3580 {
3581 tmp = gen_rtx_PLUS (mode, tmp, out1);
3582 nops++;
3583 }
3584 }
3585 if (cf != 0)
3586 {
3587 tmp = plus_constant (mode, tmp, cf);
3588 nops++;
3589 }
3590 if (!rtx_equal_p (tmp, out))
3591 {
3592 if (nops == 1)
3593 out = force_operand (tmp, copy_rtx (out));
3594 else
3595 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
3596 }
3597 if (!rtx_equal_p (out, operands[0]))
3598 emit_move_insn (operands[0], copy_rtx (out));
3599
3600 return true;
3601 }
3602
3603 /*
3604 * General case: Jumpful:
3605 * xorl dest,dest cmpl op1, op2
3606 * cmpl op1, op2 movl ct, dest
3607 * setcc dest jcc 1f
3608 * decl dest movl cf, dest
3609 * andl (cf-ct),dest 1:
3610 * addl ct,dest
3611 *
3612 * Size 20. Size 14.
3613 *
3614 * This is reasonably steep, but branch mispredict costs are
3615 * high on modern cpus, so consider failing only if optimizing
3616 * for space.
3617 */
3618
3619 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3620 && BRANCH_COST (optimize_insn_for_speed_p (),
3621 false) >= 2)
3622 {
3623 if (cf == 0)
3624 {
3625 machine_mode cmp_mode = GET_MODE (op0);
3626 enum rtx_code new_code;
3627
3628 if (SCALAR_FLOAT_MODE_P (cmp_mode))
3629 {
3630 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3631
3632 /* We may be reversing a non-trapping
3633 comparison to a trapping comparison. */
3634 if (HONOR_NANS (cmp_mode) && flag_trapping_math
3635 && code != EQ && code != NE
3636 && code != ORDERED && code != UNORDERED)
3637 new_code = UNKNOWN;
3638 else
3639 new_code = reverse_condition_maybe_unordered (code);
3640
3641 }
3642 else
3643 {
3644 new_code = ix86_reverse_condition (code, cmp_mode);
3645 if (compare_code != UNKNOWN && new_code != UNKNOWN)
3646 compare_code = reverse_condition (compare_code);
3647 }
3648
3649 if (new_code != UNKNOWN)
3650 {
3651 cf = ct;
3652 ct = 0;
3653 code = new_code;
3654 }
3655 }
3656
3657 if (compare_code != UNKNOWN)
3658 {
3659 /* notl op1 (if needed)
3660 sarl $31, op1
3661 andl (cf-ct), op1
3662 addl ct, op1
3663
3664 For x < 0 (resp. x <= -1) there will be no notl,
3665 so if possible swap the constants to get rid of the
3666 complement.
3667 True/false will be -1/0 while code below (store flag
3668 followed by decrement) is 0/-1, so the constants need
3669 to be exchanged once more. */
3670
3671 if (compare_code == GE || !cf)
3672 {
3673 code = reverse_condition (code);
3674 compare_code = LT;
3675 }
3676 else
3677 std::swap (ct, cf);
3678
3679 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3680 }
3681 else
3682 {
3683 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3684
3685 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
3686 constm1_rtx,
3687 copy_rtx (out), 1, OPTAB_DIRECT);
3688 }
3689
3690 out = expand_simple_binop (mode, AND, copy_rtx (out),
3691 gen_int_mode (cf - ct, mode),
3692 copy_rtx (out), 1, OPTAB_DIRECT);
3693 if (ct)
3694 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
3695 copy_rtx (out), 1, OPTAB_DIRECT);
3696 if (!rtx_equal_p (out, operands[0]))
3697 emit_move_insn (operands[0], copy_rtx (out));
3698
3699 return true;
3700 }
3701 }
3702
3703 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3704 {
3705 /* Try a few things more with specific constants and a variable. */
3706
3707 optab op;
3708 rtx var, orig_out, out, tmp;
3709
3710 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
3711 return false;
3712
3713 operands[2] = op2;
3714 operands[3] = op3;
3715
3716 /* If one of the two operands is an interesting constant, load a
3717 constant with the above and mask it in with a logical operation. */
3718
3719 if (CONST_INT_P (operands[2]))
3720 {
3721 var = operands[3];
3722 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
3723 operands[3] = constm1_rtx, op = and_optab;
3724 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
3725 operands[3] = const0_rtx, op = ior_optab;
3726 else
3727 return false;
3728 }
3729 else if (CONST_INT_P (operands[3]))
3730 {
3731 var = operands[2];
3732 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
3733 {
3734 /* For smin (x, 0), expand as "x < 0 ? x : 0" instead of
3735 "x <= 0 ? x : 0" to enable sign_bit_compare_p. */
3736 if (code == LE && op1 == const0_rtx && rtx_equal_p (op0, var))
3737 operands[1] = simplify_gen_relational (LT, VOIDmode,
3738 GET_MODE (op0),
3739 op0, const0_rtx);
3740
3741 operands[2] = constm1_rtx;
3742 op = and_optab;
3743 }
3744 else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
3745 operands[2] = const0_rtx, op = ior_optab;
3746 else
3747 return false;
3748 }
3749 else
3750 return false;
3751
3752 orig_out = operands[0];
3753 tmp = gen_reg_rtx (mode);
3754 operands[0] = tmp;
3755
3756 /* Recurse to get the constant loaded. */
3757 if (!ix86_expand_int_movcc (operands))
3758 return false;
3759
3760 /* Mask in the interesting variable. */
3761 out = expand_binop (mode, op, var, tmp, orig_out, 0,
3762 OPTAB_WIDEN);
3763 if (!rtx_equal_p (out, orig_out))
3764 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
3765
3766 return true;
3767 }
3768
3769 /*
3770 * For comparison with above,
3771 *
3772 * movl cf,dest
3773 * movl ct,tmp
3774 * cmpl op1,op2
3775 * cmovcc tmp,dest
3776 *
3777 * Size 15.
3778 */
3779
3780 if (! nonimmediate_operand (operands[2], mode))
3781 operands[2] = force_reg (mode, operands[2]);
3782 if (! nonimmediate_operand (operands[3], mode))
3783 operands[3] = force_reg (mode, operands[3]);
3784
3785 if (! register_operand (operands[2], VOIDmode)
3786 && (mode == QImode
3787 || ! register_operand (operands[3], VOIDmode)))
3788 operands[2] = force_reg (mode, operands[2]);
3789
3790 if (mode == QImode
3791 && ! register_operand (operands[3], VOIDmode))
3792 operands[3] = force_reg (mode, operands[3]);
3793
3794 emit_insn (compare_seq);
3795 emit_insn (gen_rtx_SET (operands[0],
3796 gen_rtx_IF_THEN_ELSE (mode,
3797 compare_op, operands[2],
3798 operands[3])));
3799 return true;
3800 }
3801
3802 /* Detect conditional moves that exactly match min/max operational
3803 semantics. Note that this is IEEE safe, as long as we don't
3804 interchange the operands.
3805
3806 Returns FALSE if this conditional move doesn't match a MIN/MAX,
3807 and TRUE if the operation is successful and instructions are emitted. */
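/* As an illustration of why operand order matters: SSE min/max return the
second operand when the comparison is unordered (and for +/-0 ties), so
a < b ? a : b maps onto MIN (a, b) exactly, NaNs included, whereas
swapping the arms would change the NaN and signed-zero behaviour. */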
3808
3809 static bool
3810 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
3811 rtx cmp_op1, rtx if_true, rtx if_false)
3812 {
3813 machine_mode mode;
3814 bool is_min;
3815 rtx tmp;
3816
3817 if (code == LT)
3818 ;
3819 else if (code == UNGE)
3820 std::swap (if_true, if_false);
3821 else
3822 return false;
3823
3824 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
3825 is_min = true;
3826 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
3827 is_min = false;
3828 else
3829 return false;
3830
3831 mode = GET_MODE (dest);
3832
3833 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
3834 but MODE may be a vector mode and thus not appropriate. */
3835 if (!flag_finite_math_only || flag_signed_zeros)
3836 {
3837 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
3838 rtvec v;
3839
3840 if_true = force_reg (mode, if_true);
3841 v = gen_rtvec (2, if_true, if_false);
3842 tmp = gen_rtx_UNSPEC (mode, v, u);
3843 }
3844 else
3845 {
3846 code = is_min ? SMIN : SMAX;
3847 if (MEM_P (if_true) && MEM_P (if_false))
3848 if_true = force_reg (mode, if_true);
3849 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
3850 }
3851
3852 emit_insn (gen_rtx_SET (dest, tmp));
3853 return true;
3854 }
3855
3856 /* Return true if MODE is valid for vector compare to mask register;
3857 the same holds for a conditional vector move with a mask register. */
3858 static bool
3859 ix86_valid_mask_cmp_mode (machine_mode mode)
3860 {
3861 /* XOP has its own vector conditional movement. */
3862 if (TARGET_XOP && !TARGET_AVX512F)
3863 return false;
3864
3865 /* HFmode only supports vcmpsh whose dest is mask register. */
3866 if (TARGET_AVX512FP16 && mode == HFmode)
3867 return true;
3868
3869 /* AVX512F is needed for mask operation. */
3870 if (!(TARGET_AVX512F && VECTOR_MODE_P (mode)))
3871 return false;
3872
3873 /* AVX512BW is needed for vector QI/HImode,
3874 AVX512VL is needed for 128/256-bit vector. */
3875 machine_mode inner_mode = GET_MODE_INNER (mode);
3876 int vector_size = GET_MODE_SIZE (mode);
3877 if ((inner_mode == QImode || inner_mode == HImode) && !TARGET_AVX512BW)
3878 return false;
3879
3880 return vector_size == 64 || TARGET_AVX512VL;
3881 }
3882
3883 /* Return true if integer mask comparison should be used. */
3884 static bool
3885 ix86_use_mask_cmp_p (machine_mode mode, machine_mode cmp_mode,
3886 rtx op_true, rtx op_false)
3887 {
3888 int vector_size = GET_MODE_SIZE (mode);
3889
3890 if (cmp_mode == HFmode)
3891 return true;
3892 else if (vector_size < 16)
3893 return false;
3894 else if (vector_size == 64)
3895 return true;
3896 else if (GET_MODE_INNER (cmp_mode) == HFmode)
3897 return true;
3898
3899 /* When op_true is NULL, op_false must be NULL, or vice versa. */
3900 gcc_assert (!op_true == !op_false);
3901
3902 /* When op_true/op_false is NULL or cmp_mode is not valid mask cmp mode,
3903 vector dest is required. */
3904 if (!op_true || !ix86_valid_mask_cmp_mode (cmp_mode))
3905 return false;
3906
3907 /* Exclude those that could be optimized in ix86_expand_sse_movcc. */
3908 if (op_false == CONST0_RTX (mode)
3909 || op_true == CONST0_RTX (mode)
3910 || (INTEGRAL_MODE_P (mode)
3911 && (op_true == CONSTM1_RTX (mode)
3912 || op_false == CONSTM1_RTX (mode))))
3913 return false;
3914
3915 return true;
3916 }
3917
3918 /* Expand an SSE comparison. Return the register with the result. */
3919
3920 static rtx
3921 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
3922 rtx op_true, rtx op_false)
3923 {
3924 machine_mode mode = GET_MODE (dest);
3925 machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
3926
3927 /* In the general case the mode of the comparison result can differ from that of the operands. */
3928 machine_mode cmp_mode;
3929
3930 /* In AVX512F the result of comparison is an integer mask. */
3931 bool maskcmp = false;
3932 rtx x;
3933
3934 if (ix86_use_mask_cmp_p (mode, cmp_ops_mode, op_true, op_false))
3935 {
3936 unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
3937 maskcmp = true;
3938 cmp_mode = nbits > 8 ? int_mode_for_size (nbits, 0).require () : E_QImode;
3939 }
3940 else
3941 cmp_mode = cmp_ops_mode;
3942
3943 cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
3944
3945 bool (*op1_predicate)(rtx, machine_mode)
3946 = VECTOR_MODE_P (cmp_ops_mode) ? vector_operand : nonimmediate_operand;
3947
3948 if (!op1_predicate (cmp_op1, cmp_ops_mode))
3949 cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
3950
3951 if (optimize
3952 || (maskcmp && cmp_mode != mode)
3953 || (op_true && reg_overlap_mentioned_p (dest, op_true))
3954 || (op_false && reg_overlap_mentioned_p (dest, op_false)))
3955 dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
3956
3957 if (maskcmp)
3958 {
3959 bool ok = ix86_expand_mask_vec_cmp (dest, code, cmp_op0, cmp_op1);
3960 gcc_assert (ok);
3961 return dest;
3962 }
3963
3964 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
3965
3966 if (cmp_mode != mode)
3967 {
3968 x = force_reg (cmp_ops_mode, x);
3969 convert_move (dest, x, false);
3970 }
3971 else
3972 emit_insn (gen_rtx_SET (dest, x));
3973
3974 return dest;
3975 }
3976
3977 /* Emit x86 binary operation CODE in mode MODE for SSE vector
3978 instructions that can be performed using GP registers. */
3979
3980 static void
3981 ix86_emit_vec_binop (enum rtx_code code, machine_mode mode,
3982 rtx dst, rtx src1, rtx src2)
3983 {
3984 rtx tmp;
3985
3986 tmp = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
3987
3988 if (GET_MODE_SIZE (mode) <= GET_MODE_SIZE (SImode)
3989 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
3990 {
3991 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
3992 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
3993 }
3994
3995 emit_insn (tmp);
3996 }
3997
3998 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
3999 operations. This is used for both scalar and vector conditional moves. */
4000
4001 void
4002 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
4003 {
4004 machine_mode mode = GET_MODE (dest);
4005 machine_mode cmpmode = GET_MODE (cmp);
4006 rtx x;
4007
4008 /* Simplify trivial VEC_COND_EXPR to avoid ICE in pr97506. */
4009 if (rtx_equal_p (op_true, op_false))
4010 {
4011 emit_move_insn (dest, op_true);
4012 return;
4013 }
4014
4015 /* If we have an integer mask and FP value then we need
4016 to cast mask to FP mode. */
4017 if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
4018 {
4019 cmp = force_reg (cmpmode, cmp);
4020 cmp = gen_rtx_SUBREG (mode, cmp, 0);
4021 }
4022
4023 /* In AVX512F the result of comparison is an integer mask. */
4024 if (mode != cmpmode
4025 && GET_MODE_CLASS (cmpmode) == MODE_INT)
4026 {
4027 gcc_assert (ix86_valid_mask_cmp_mode (mode));
4028 /* Using scalar/vector move with mask register. */
4029 cmp = force_reg (cmpmode, cmp);
4030 /* Optimize for mask zero. */
4031 op_true = (op_true != CONST0_RTX (mode)
4032 ? force_reg (mode, op_true) : op_true);
4033 op_false = (op_false != CONST0_RTX (mode)
4034 ? force_reg (mode, op_false) : op_false);
4035 if (op_true == CONST0_RTX (mode))
4036 {
4037 if (cmpmode == E_DImode && !TARGET_64BIT)
4038 {
4039 x = gen_reg_rtx (cmpmode);
4040 emit_insn (gen_knotdi (x, cmp));
4041 }
4042 else
4043 x = expand_simple_unop (cmpmode, NOT, cmp, NULL, 1);
4044 cmp = x;
4045 /* Reverse op_true and op_false. */
4046 std::swap (op_true, op_false);
4047 }
4048
4049 if (mode == HFmode)
4050 emit_insn (gen_movhf_mask (dest, op_true, op_false, cmp));
4051 else
4052 emit_insn (gen_rtx_SET (dest,
4053 gen_rtx_VEC_MERGE (mode,
4054 op_true, op_false, cmp)));
4055 return;
4056 }
4057
4058 if (vector_all_ones_operand (op_true, mode)
4059 && op_false == CONST0_RTX (mode))
4060 {
4061 emit_move_insn (dest, cmp);
4062 return;
4063 }
4064 else if (op_false == CONST0_RTX (mode))
4065 {
4066 x = expand_simple_binop (mode, AND, cmp, op_true,
4067 dest, 1, OPTAB_DIRECT);
4068 if (x != dest)
4069 emit_move_insn (dest, x);
4070 return;
4071 }
4072 else if (op_true == CONST0_RTX (mode))
4073 {
4074 op_false = force_reg (mode, op_false);
4075 x = gen_rtx_NOT (mode, cmp);
4076 ix86_emit_vec_binop (AND, mode, dest, x, op_false);
4077 return;
4078 }
4079 else if (vector_all_ones_operand (op_true, mode))
4080 {
4081 x = expand_simple_binop (mode, IOR, cmp, op_false,
4082 dest, 1, OPTAB_DIRECT);
4083 if (x != dest)
4084 emit_move_insn (dest, x);
4085 return;
4086 }
4087
4088 if (TARGET_XOP)
4089 {
4090 op_true = force_reg (mode, op_true);
4091
4092 if (GET_MODE_SIZE (mode) < 16
4093 || !nonimmediate_operand (op_false, mode))
4094 op_false = force_reg (mode, op_false);
4095
4096 emit_insn (gen_rtx_SET (dest,
4097 gen_rtx_IF_THEN_ELSE (mode, cmp,
4098 op_true, op_false)));
4099 return;
4100 }
4101
4102 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
4103 machine_mode blend_mode = mode;
4104
4105 if (GET_MODE_SIZE (mode) < 16
4106 || !vector_operand (op_true, mode))
4107 op_true = force_reg (mode, op_true);
4108
4109 op_false = force_reg (mode, op_false);
4110
4111 switch (mode)
4112 {
4113 case E_V2SFmode:
4114 if (TARGET_SSE4_1)
4115 gen = gen_mmx_blendvps;
4116 break;
4117 case E_V4SFmode:
4118 if (TARGET_SSE4_1)
4119 gen = gen_sse4_1_blendvps;
4120 break;
4121 case E_V2DFmode:
4122 if (TARGET_SSE4_1)
4123 gen = gen_sse4_1_blendvpd;
4124 break;
4125 case E_SFmode:
4126 if (TARGET_SSE4_1)
4127 gen = gen_sse4_1_blendvss;
4128 break;
4129 case E_DFmode:
4130 if (TARGET_SSE4_1)
4131 gen = gen_sse4_1_blendvsd;
4132 break;
4133 case E_V8QImode:
4134 case E_V4HImode:
4135 case E_V2SImode:
4136 if (TARGET_SSE4_1)
4137 {
4138 gen = gen_mmx_pblendvb_v8qi;
4139 blend_mode = V8QImode;
4140 }
4141 break;
4142 case E_V4QImode:
4143 case E_V2HImode:
4144 if (TARGET_SSE4_1)
4145 {
4146 gen = gen_mmx_pblendvb_v4qi;
4147 blend_mode = V4QImode;
4148 }
4149 break;
4150 case E_V2QImode:
4151 if (TARGET_SSE4_1)
4152 gen = gen_mmx_pblendvb_v2qi;
4153 break;
4154 case E_V16QImode:
4155 case E_V8HImode:
4156 case E_V8HFmode:
4157 case E_V8BFmode:
4158 case E_V4SImode:
4159 case E_V2DImode:
4160 case E_V1TImode:
4161 if (TARGET_SSE4_1)
4162 {
4163 gen = gen_sse4_1_pblendvb;
4164 blend_mode = V16QImode;
4165 }
4166 break;
4167 case E_V8SFmode:
4168 if (TARGET_AVX)
4169 gen = gen_avx_blendvps256;
4170 break;
4171 case E_V4DFmode:
4172 if (TARGET_AVX)
4173 gen = gen_avx_blendvpd256;
4174 break;
4175 case E_V32QImode:
4176 case E_V16HImode:
4177 case E_V16HFmode:
4178 case E_V16BFmode:
4179 case E_V8SImode:
4180 case E_V4DImode:
4181 if (TARGET_AVX2)
4182 {
4183 gen = gen_avx2_pblendvb;
4184 blend_mode = V32QImode;
4185 }
4186 break;
4187
4188 case E_V64QImode:
4189 gen = gen_avx512bw_blendmv64qi;
4190 break;
4191 case E_V32HImode:
4192 gen = gen_avx512bw_blendmv32hi;
4193 break;
4194 case E_V32HFmode:
4195 gen = gen_avx512bw_blendmv32hf;
4196 break;
4197 case E_V32BFmode:
4198 gen = gen_avx512bw_blendmv32bf;
4199 break;
4200 case E_V16SImode:
4201 gen = gen_avx512f_blendmv16si;
4202 break;
4203 case E_V8DImode:
4204 gen = gen_avx512f_blendmv8di;
4205 break;
4206 case E_V8DFmode:
4207 gen = gen_avx512f_blendmv8df;
4208 break;
4209 case E_V16SFmode:
4210 gen = gen_avx512f_blendmv16sf;
4211 break;
4212
4213 default:
4214 break;
4215 }
4216
4217 if (gen != NULL)
4218 {
4219 if (blend_mode == mode)
4220 x = dest;
4221 else
4222 {
4223 x = gen_reg_rtx (blend_mode);
4224 op_false = gen_lowpart (blend_mode, op_false);
4225 op_true = gen_lowpart (blend_mode, op_true);
4226 cmp = gen_lowpart (blend_mode, cmp);
4227 }
4228
4229 emit_insn (gen (x, op_false, op_true, cmp));
4230
4231 if (x != dest)
4232 emit_move_insn (dest, gen_lowpart (mode, x));
4233 }
4234 else
4235 {
4236 rtx t2, t3;
4237
4238 t2 = expand_simple_binop (mode, AND, op_true, cmp,
4239 NULL, 1, OPTAB_DIRECT);
4240
4241 t3 = gen_reg_rtx (mode);
4242 x = gen_rtx_NOT (mode, cmp);
4243 ix86_emit_vec_binop (AND, mode, t3, x, op_false);
4244
4245 x = expand_simple_binop (mode, IOR, t3, t2,
4246 dest, 1, OPTAB_DIRECT);
4247 if (x != dest)
4248 emit_move_insn (dest, x);
4249 }
4250 }
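/* Illustrative sketch (not part of the compiler): when no blend instruction
   is available, the fallback above computes the classic mask-select idiom

     __m128i select128 (__m128i cmp, __m128i op_true, __m128i op_false)
     {
       __m128i t = _mm_and_si128 (cmp, op_true);	// cmp & op_true
       __m128i f = _mm_andnot_si128 (cmp, op_false);	// ~cmp & op_false
       return _mm_or_si128 (t, f);
     }

   while with SSE4.1/AVX/AVX-512 the same selection becomes a single
   pblendvb/blendvps/vblendm* on the comparison result.  */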
4251
4252 /* Swap, force into registers, or otherwise massage the two operands
4253 to an sse comparison with a mask result. Thus we differ a bit from
4254 ix86_prepare_fp_compare_args which expects to produce a flags result.
4255
4256 The DEST operand exists to help determine whether to commute commutative
4257 operators. The POP0/POP1 operands are updated in place. The new
4258 comparison code is returned, or UNKNOWN if not implementable. */
4259
4260 static enum rtx_code
4261 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
4262 rtx *pop0, rtx *pop1)
4263 {
4264 switch (code)
4265 {
4266 case LTGT:
4267 case UNEQ:
4268 /* AVX supports all the needed comparisons. */
4269 if (TARGET_AVX)
4270 break;
4271 /* We have no LTGT as an operator. We could implement it with
4272 NE & ORDERED, but this requires an extra temporary. It's
4273 not clear that it's worth it. */
4274 return UNKNOWN;
4275
4276 case LT:
4277 case LE:
4278 case UNGT:
4279 case UNGE:
4280 /* These are supported directly. */
4281 break;
4282
4283 case EQ:
4284 case NE:
4285 case UNORDERED:
4286 case ORDERED:
4287 /* AVX has 3 operand comparisons, no need to swap anything. */
4288 if (TARGET_AVX)
4289 break;
4290 /* For commutative operators, try to canonicalize the destination
4291 operand to be first in the comparison - this helps reload to
4292 avoid extra moves. */
4293 if (!dest || !rtx_equal_p (dest, *pop1))
4294 break;
4295 /* FALLTHRU */
4296
4297 case GE:
4298 case GT:
4299 case UNLE:
4300 case UNLT:
4301 /* These are not supported directly before AVX, and furthermore
4302 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
4303 comparison operands to transform into something that is
4304 supported. */
4305 std::swap (*pop0, *pop1);
4306 code = swap_condition (code);
4307 break;
4308
4309 default:
4310 gcc_unreachable ();
4311 }
4312
4313 return code;
4314 }
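/* Example of the canonicalization above: before AVX there is no "compare
   greater" encoding for cmpps/cmpss, so a GT comparison is rewritten by
   swapping the operands into an LT comparison, roughly

     a > b   -->   b < a		(cmpltps b, a)

   which ix86_expand_sse_cmp can then emit directly.  */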
4315
4316 /* Expand a floating-point conditional move. Return true if successful. */
4317
4318 bool
4319 ix86_expand_fp_movcc (rtx operands[])
4320 {
4321 machine_mode mode = GET_MODE (operands[0]);
4322 enum rtx_code code = GET_CODE (operands[1]);
4323 rtx tmp, compare_op;
4324 rtx op0 = XEXP (operands[1], 0);
4325 rtx op1 = XEXP (operands[1], 1);
4326
4327 if (GET_MODE (op0) == BFmode
4328 && !ix86_fp_comparison_operator (operands[1], VOIDmode))
4329 return false;
4330
4331 if (SSE_FLOAT_MODE_SSEMATH_OR_HF_P (mode))
4332 {
4333 machine_mode cmode;
4334
4335 /* Since we've no cmove for sse registers, don't force bad register
4336 allocation just to gain access to it. Deny movcc when the
4337 comparison mode doesn't match the move mode. */
4338 cmode = GET_MODE (op0);
4339 if (cmode == VOIDmode)
4340 cmode = GET_MODE (op1);
4341 if (cmode != mode)
4342 return false;
4343
4344 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
4345 if (code == UNKNOWN)
4346 return false;
4347
4348 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
4349 operands[2], operands[3]))
4350 return true;
4351
4352 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
4353 operands[2], operands[3]);
4354 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
4355 return true;
4356 }
4357
4358 if (GET_MODE (op0) == TImode
4359 || (GET_MODE (op0) == DImode
4360 && !TARGET_64BIT))
4361 return false;
4362
4363 /* The floating point conditional move instructions don't directly
4364 support conditions resulting from a signed integer comparison. */
4365
4366 compare_op = ix86_expand_compare (code, op0, op1);
4367 if (!fcmov_comparison_operator (compare_op, VOIDmode))
4368 {
4369 tmp = gen_reg_rtx (QImode);
4370 ix86_expand_setcc (tmp, code, op0, op1);
4371
4372 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
4373 }
4374
4375 emit_insn (gen_rtx_SET (operands[0],
4376 gen_rtx_IF_THEN_ELSE (mode, compare_op,
4377 operands[2], operands[3])));
4378
4379 return true;
4380 }
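/* Illustrative sketch: with SSE math a scalar selection such as

     double sel (double a, double b, double x, double y)
     {
       return a < b ? x : y;
     }

   goes through the SSE branch above (a cmpltsd mask followed by a logical
   or blend select), while x87 operands use the fcmov path on the integer
   flags instead.  */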
4381
4382 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
4383
4384 static int
4385 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
4386 {
4387 switch (code)
4388 {
4389 case EQ:
4390 return 0;
4391 case LT:
4392 case LTU:
4393 return 1;
4394 case LE:
4395 case LEU:
4396 return 2;
4397 case NE:
4398 return 4;
4399 case GE:
4400 case GEU:
4401 return 5;
4402 case GT:
4403 case GTU:
4404 return 6;
4405 default:
4406 gcc_unreachable ();
4407 }
4408 }
4409
4410 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
4411
4412 static int
4413 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
4414 {
4415 switch (code)
4416 {
4417 case EQ:
4418 return 0x00;
4419 case NE:
4420 return 0x04;
4421 case GT:
4422 return 0x0e;
4423 case LE:
4424 return 0x02;
4425 case GE:
4426 return 0x0d;
4427 case LT:
4428 return 0x01;
4429 case UNLE:
4430 return 0x0a;
4431 case UNLT:
4432 return 0x09;
4433 case UNGE:
4434 return 0x05;
4435 case UNGT:
4436 return 0x06;
4437 case UNEQ:
4438 return 0x18;
4439 case LTGT:
4440 return 0x0c;
4441 case ORDERED:
4442 return 0x07;
4443 case UNORDERED:
4444 return 0x03;
4445 default:
4446 gcc_unreachable ();
4447 }
4448 }
4449
4450 /* Return immediate value to be used in UNSPEC_PCMP
4451 for comparison CODE in MODE. */
4452
4453 static int
4454 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
4455 {
4456 if (FLOAT_MODE_P (mode))
4457 return ix86_fp_cmp_code_to_pcmp_immediate (code);
4458 return ix86_int_cmp_code_to_pcmp_immediate (code);
4459 }
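/* Illustrative sketch: the values above are the predicate immediates of the
   EVEX compare instructions.  At the intrinsics level, for example,

     __mmask16 k = _mm512_cmp_epi32_mask (a, b, _MM_CMPINT_LT);	// imm 1

   corresponds to the UNSPEC_PCMP with immediate 1 that
   ix86_expand_mask_vec_cmp emits below for a signed LT comparison.  */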
4460
4461 /* Expand AVX-512 vector comparison. */
4462
4463 bool
4464 ix86_expand_mask_vec_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1)
4465 {
4466 machine_mode mask_mode = GET_MODE (dest);
4467 machine_mode cmp_mode = GET_MODE (cmp_op0);
4468 rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
4469 int unspec_code;
4470 rtx unspec;
4471
4472 switch (code)
4473 {
4474 case LEU:
4475 case GTU:
4476 case GEU:
4477 case LTU:
4478 unspec_code = UNSPEC_UNSIGNED_PCMP;
4479 break;
4480
4481 default:
4482 unspec_code = UNSPEC_PCMP;
4483 }
4484
4485 unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, cmp_op0, cmp_op1, imm),
4486 unspec_code);
4487 emit_insn (gen_rtx_SET (dest, unspec));
4488
4489 return true;
4490 }
4491
4492 /* Expand fp vector comparison. */
4493
4494 bool
4495 ix86_expand_fp_vec_cmp (rtx operands[])
4496 {
4497 enum rtx_code code = GET_CODE (operands[1]);
4498 rtx cmp;
4499
4500 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
4501 &operands[2], &operands[3]);
4502 if (code == UNKNOWN)
4503 {
4504 rtx temp;
4505 switch (GET_CODE (operands[1]))
4506 {
4507 case LTGT:
4508 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
4509 operands[3], NULL, NULL);
4510 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
4511 operands[3], NULL, NULL);
4512 code = AND;
4513 break;
4514 case UNEQ:
4515 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
4516 operands[3], NULL, NULL);
4517 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
4518 operands[3], NULL, NULL);
4519 code = IOR;
4520 break;
4521 default:
4522 gcc_unreachable ();
4523 }
4524 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
4525 OPTAB_DIRECT);
4526 }
4527 else
4528 cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
4529 NULL, NULL);
4530
4531 if (operands[0] != cmp)
4532 emit_move_insn (operands[0], cmp);
4533
4534 return true;
4535 }
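/* Illustrative sketch: without AVX the LTGT case above is decomposed into
   two compares combined with AND (and UNEQ analogously with IOR), roughly

     __m128 ltgt (__m128 a, __m128 b)
     {
       return _mm_and_ps (_mm_cmpord_ps (a, b), _mm_cmpneq_ps (a, b));
     }
*/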
4536
4537 static rtx
4538 ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
4539 rtx op_true, rtx op_false, bool *negate)
4540 {
4541 machine_mode data_mode = GET_MODE (dest);
4542 machine_mode mode = GET_MODE (cop0);
4543 rtx x;
4544
4545 *negate = false;
4546
4547 /* XOP supports all of the comparisons on all 128-bit vector int types. */
4548 if (TARGET_XOP
4549 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT
4550 && GET_MODE_SIZE (mode) <= 16)
4551 ;
4552 /* AVX512F supports all of the comparisons
4553 on all 128/256/512-bit vector int types. */
4554 else if (ix86_use_mask_cmp_p (data_mode, mode, op_true, op_false))
4555 ;
4556 else
4557 {
4558 /* Canonicalize the comparison to EQ, GT, GTU. */
4559 switch (code)
4560 {
4561 case EQ:
4562 case GT:
4563 case GTU:
4564 break;
4565
4566 case LE:
4567 case LEU:
4568 /* x <= cst can be handled as x < cst + 1 unless there is
4569 wrap around in cst + 1. */
4570 if (GET_CODE (cop1) == CONST_VECTOR
4571 && GET_MODE_INNER (mode) != TImode)
4572 {
4573 unsigned int n_elts = GET_MODE_NUNITS (mode), i;
4574 machine_mode eltmode = GET_MODE_INNER (mode);
4575 for (i = 0; i < n_elts; ++i)
4576 {
4577 rtx elt = CONST_VECTOR_ELT (cop1, i);
4578 if (!CONST_INT_P (elt))
4579 break;
4580 if (code == LE)
4581 {
4582 /* For LE punt if some element is signed maximum. */
4583 if ((INTVAL (elt) & (GET_MODE_MASK (eltmode) >> 1))
4584 == (GET_MODE_MASK (eltmode) >> 1))
4585 break;
4586 }
4587 /* For LEU punt if some element is unsigned maximum. */
4588 else if (elt == constm1_rtx)
4589 break;
4590 }
4591 if (i == n_elts)
4592 {
4593 rtvec v = rtvec_alloc (n_elts);
4594 for (i = 0; i < n_elts; ++i)
4595 RTVEC_ELT (v, i)
4596 = gen_int_mode (INTVAL (CONST_VECTOR_ELT (cop1, i)) + 1,
4597 eltmode);
4598 cop1 = gen_rtx_CONST_VECTOR (mode, v);
4599 std::swap (cop0, cop1);
4600 code = code == LE ? GT : GTU;
4601 break;
4602 }
4603 }
4604 /* FALLTHRU */
4605 case NE:
4606 code = reverse_condition (code);
4607 *negate = true;
4608 break;
4609
4610 case GE:
4611 case GEU:
4612 /* x >= cst can be handled as x > cst - 1 unless there is
4613 wrap around in cst - 1. */
4614 if (GET_CODE (cop1) == CONST_VECTOR
4615 && GET_MODE_INNER (mode) != TImode)
4616 {
4617 unsigned int n_elts = GET_MODE_NUNITS (mode), i;
4618 machine_mode eltmode = GET_MODE_INNER (mode);
4619 for (i = 0; i < n_elts; ++i)
4620 {
4621 rtx elt = CONST_VECTOR_ELT (cop1, i);
4622 if (!CONST_INT_P (elt))
4623 break;
4624 if (code == GE)
4625 {
4626 /* For GE punt if some element is signed minimum. */
4627 if (INTVAL (elt) < 0
4628 && ((INTVAL (elt) & (GET_MODE_MASK (eltmode) >> 1))
4629 == 0))
4630 break;
4631 }
4632 /* For GEU punt if some element is zero. */
4633 else if (elt == const0_rtx)
4634 break;
4635 }
4636 if (i == n_elts)
4637 {
4638 rtvec v = rtvec_alloc (n_elts);
4639 for (i = 0; i < n_elts; ++i)
4640 RTVEC_ELT (v, i)
4641 = gen_int_mode (INTVAL (CONST_VECTOR_ELT (cop1, i)) - 1,
4642 eltmode);
4643 cop1 = gen_rtx_CONST_VECTOR (mode, v);
4644 code = code == GE ? GT : GTU;
4645 break;
4646 }
4647 }
4648 code = reverse_condition (code);
4649 *negate = true;
4650 /* FALLTHRU */
4651
4652 case LT:
4653 case LTU:
4654 std::swap (cop0, cop1);
4655 code = swap_condition (code);
4656 break;
4657
4658 default:
4659 gcc_unreachable ();
4660 }
4661
4662 /* Only SSE4.1/SSE4.2 supports V2DImode. */
4663 if (mode == V2DImode)
4664 {
4665 switch (code)
4666 {
4667 case EQ:
4668 /* SSE4.1 supports EQ. */
4669 if (!TARGET_SSE4_1)
4670 return NULL;
4671 break;
4672
4673 case GT:
4674 case GTU:
4675 /* SSE4.2 supports GT/GTU. */
4676 if (!TARGET_SSE4_2)
4677 return NULL;
4678 break;
4679
4680 default:
4681 gcc_unreachable ();
4682 }
4683 }
4684
4685 if (GET_CODE (cop0) == CONST_VECTOR)
4686 cop0 = force_reg (mode, cop0);
4687 else if (GET_CODE (cop1) == CONST_VECTOR)
4688 cop1 = force_reg (mode, cop1);
4689
4690 rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode);
4691 rtx opfalse = op_false ? op_false : CONST0_RTX (data_mode);
4692 if (*negate)
4693 std::swap (optrue, opfalse);
4694
4695 /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when
4696 not using integer masks into min (x, y) == x ? -1 : 0 (i.e.
4697 min (x, y) == x). While we add one instruction (the minimum),
4698 we remove the need for two instructions in the negation, as the
4699 result is done this way.
4700 When using masks, do it for SI/DImode element types, as it is shorter
4701 than the two subtractions. */
4702 if ((code != EQ
4703 && GET_MODE_SIZE (mode) != 64
4704 && vector_all_ones_operand (opfalse, data_mode)
4705 && optrue == CONST0_RTX (data_mode))
4706 || (code == GTU
4707 && GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4
4708 /* Don't do it if not using integer masks and we'd end up with
4709 the right values in the registers though. */
4710 && (GET_MODE_SIZE (mode) == 64
4711 || !vector_all_ones_operand (optrue, data_mode)
4712 || opfalse != CONST0_RTX (data_mode))))
4713 {
4714 rtx (*gen) (rtx, rtx, rtx) = NULL;
4715
4716 switch (mode)
4717 {
4718 case E_V16SImode:
4719 gen = (code == GTU) ? gen_uminv16si3 : gen_sminv16si3;
4720 break;
4721 case E_V8DImode:
4722 gen = (code == GTU) ? gen_uminv8di3 : gen_sminv8di3;
4723 cop0 = force_reg (mode, cop0);
4724 cop1 = force_reg (mode, cop1);
4725 break;
4726 case E_V32QImode:
4727 if (TARGET_AVX2)
4728 gen = (code == GTU) ? gen_uminv32qi3 : gen_sminv32qi3;
4729 break;
4730 case E_V16HImode:
4731 if (TARGET_AVX2)
4732 gen = (code == GTU) ? gen_uminv16hi3 : gen_sminv16hi3;
4733 break;
4734 case E_V8SImode:
4735 if (TARGET_AVX2)
4736 gen = (code == GTU) ? gen_uminv8si3 : gen_sminv8si3;
4737 break;
4738 case E_V4DImode:
4739 if (TARGET_AVX512VL)
4740 {
4741 gen = (code == GTU) ? gen_uminv4di3 : gen_sminv4di3;
4742 cop0 = force_reg (mode, cop0);
4743 cop1 = force_reg (mode, cop1);
4744 }
4745 break;
4746 case E_V16QImode:
4747 if (code == GTU && TARGET_SSE2)
4748 gen = gen_uminv16qi3;
4749 else if (code == GT && TARGET_SSE4_1)
4750 gen = gen_sminv16qi3;
4751 break;
4752 case E_V8QImode:
4753 if (code == GTU && TARGET_SSE2)
4754 gen = gen_uminv8qi3;
4755 else if (code == GT && TARGET_SSE4_1)
4756 gen = gen_sminv8qi3;
4757 break;
4758 case E_V4QImode:
4759 if (code == GTU && TARGET_SSE2)
4760 gen = gen_uminv4qi3;
4761 else if (code == GT && TARGET_SSE4_1)
4762 gen = gen_sminv4qi3;
4763 break;
4764 case E_V2QImode:
4765 if (code == GTU && TARGET_SSE2)
4766 gen = gen_uminv2qi3;
4767 else if (code == GT && TARGET_SSE4_1)
4768 gen = gen_sminv2qi3;
4769 break;
4770 case E_V8HImode:
4771 if (code == GTU && TARGET_SSE4_1)
4772 gen = gen_uminv8hi3;
4773 else if (code == GT && TARGET_SSE2)
4774 gen = gen_sminv8hi3;
4775 break;
4776 case E_V4HImode:
4777 if (code == GTU && TARGET_SSE4_1)
4778 gen = gen_uminv4hi3;
4779 else if (code == GT && TARGET_SSE2)
4780 gen = gen_sminv4hi3;
4781 break;
4782 case E_V2HImode:
4783 if (code == GTU && TARGET_SSE4_1)
4784 gen = gen_uminv2hi3;
4785 else if (code == GT && TARGET_SSE2)
4786 gen = gen_sminv2hi3;
4787 break;
4788 case E_V4SImode:
4789 if (TARGET_SSE4_1)
4790 gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3;
4791 break;
4792 case E_V2SImode:
4793 if (TARGET_SSE4_1)
4794 gen = (code == GTU) ? gen_uminv2si3 : gen_sminv2si3;
4795 break;
4796 case E_V2DImode:
4797 if (TARGET_AVX512VL)
4798 {
4799 gen = (code == GTU) ? gen_uminv2di3 : gen_sminv2di3;
4800 cop0 = force_reg (mode, cop0);
4801 cop1 = force_reg (mode, cop1);
4802 }
4803 break;
4804 default:
4805 break;
4806 }
4807
4808 if (gen)
4809 {
4810 rtx tem = gen_reg_rtx (mode);
4811 if (!vector_operand (cop0, mode))
4812 cop0 = force_reg (mode, cop0);
4813 if (!vector_operand (cop1, mode))
4814 cop1 = force_reg (mode, cop1);
4815 *negate = !*negate;
4816 emit_insn (gen (tem, cop0, cop1));
4817 cop1 = tem;
4818 code = EQ;
4819 }
4820 }
4821
4822 /* Unsigned parallel compare is not supported by the hardware.
4823 Play some tricks to turn this into a signed comparison
4824 against 0. */
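      /* Illustrative sketch: a GTU b on V4SImode becomes a signed compare
	 after biasing both operands by INT_MIN, e.g.

	   __m128i gtu32 (__m128i a, __m128i b)
	   {
	     __m128i bias = _mm_set1_epi32 (INT_MIN);
	     return _mm_cmpgt_epi32 (_mm_sub_epi32 (a, bias),
				     _mm_sub_epi32 (b, bias));
	   }

	 (subtracting INT_MIN and XORing with it are equivalent here).  For
	 the narrow element modes below the same effect is obtained with a
	 saturating subtract: a GTU b iff (a -us b) != 0.  */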
4825 if (code == GTU)
4826 {
4827 cop0 = force_reg (mode, cop0);
4828
4829 switch (mode)
4830 {
4831 case E_V16SImode:
4832 case E_V8DImode:
4833 case E_V8SImode:
4834 case E_V4DImode:
4835 case E_V4SImode:
4836 case E_V2SImode:
4837 case E_V2DImode:
4838 {
4839 rtx t1, t2, mask;
4840
4841 /* Subtract (-(INT MAX) - 1) from both operands to make
4842 them signed. */
4843 mask = ix86_build_signbit_mask (mode, true, false);
4844 t1 = gen_reg_rtx (mode);
4845 emit_insn (gen_sub3_insn (t1, cop0, mask));
4846
4847 t2 = gen_reg_rtx (mode);
4848 emit_insn (gen_sub3_insn (t2, cop1, mask));
4849
4850 cop0 = t1;
4851 cop1 = t2;
4852 code = GT;
4853 }
4854 break;
4855
4856 case E_V64QImode:
4857 case E_V32HImode:
4858 case E_V32QImode:
4859 case E_V16HImode:
4860 case E_V16QImode:
4861 case E_V8QImode:
4862 case E_V4QImode:
4863 case E_V2QImode:
4864 case E_V8HImode:
4865 case E_V4HImode:
4866 case E_V2HImode:
4867 /* Perform a parallel unsigned saturating subtraction. */
4868 x = gen_reg_rtx (mode);
4869 emit_insn (gen_rtx_SET
4870 (x, gen_rtx_US_MINUS (mode, cop0, cop1)));
4871 cop0 = x;
4872 cop1 = CONST0_RTX (mode);
4873 code = EQ;
4874 *negate = !*negate;
4875 break;
4876
4877 default:
4878 gcc_unreachable ();
4879 }
4880 }
4881 }
4882
4883 if (*negate)
4884 std::swap (op_true, op_false);
4885
4886 if (GET_CODE (cop1) == CONST_VECTOR)
4887 cop1 = force_reg (mode, cop1);
4888
4889 /* Allow the comparison to be done in one mode, but the movcc to
4890 happen in another mode. */
4891 if (data_mode == mode)
4892 x = ix86_expand_sse_cmp (dest, code, cop0, cop1, op_true, op_false);
4893 else
4894 {
4895 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
4896 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
4897 op_true, op_false);
4898 if (GET_MODE (x) == mode)
4899 x = gen_lowpart (data_mode, x);
4900 }
4901
4902 return x;
4903 }
4904
4905 /* Expand integer vector comparison. */
4906
4907 bool
4908 ix86_expand_int_vec_cmp (rtx operands[])
4909 {
4910 rtx_code code = GET_CODE (operands[1]);
4911 bool negate = false;
4912 rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
4913 operands[3], NULL, NULL, &negate);
4914
4915 if (!cmp)
4916 return false;
4917
4918 if (negate)
4919 cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
4920 CONST0_RTX (GET_MODE (cmp)),
4921 NULL, NULL, &negate);
4922
4923 gcc_assert (!negate);
4924
4925 if (operands[0] != cmp)
4926 emit_move_insn (operands[0], cmp);
4927
4928 return true;
4929 }
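/* Illustrative sketch: integer vector NE has no direct pre-AVX-512
   instruction, so the negate step above turns x != y into (x == y) == 0,
   roughly

     __m128i ne32 (__m128i x, __m128i y)
     {
       __m128i eq = _mm_cmpeq_epi32 (x, y);
       return _mm_cmpeq_epi32 (eq, _mm_setzero_si128 ());
     }
*/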
4930
4931 /* Expand a floating-point vector conditional move; a vcond operation
4932 rather than a movcc operation. */
4933
4934 bool
4935 ix86_expand_fp_vcond (rtx operands[])
4936 {
4937 enum rtx_code code = GET_CODE (operands[3]);
4938 rtx cmp;
4939
4940 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
4941 &operands[4], &operands[5]);
4942 if (code == UNKNOWN)
4943 {
4944 rtx temp;
4945 switch (GET_CODE (operands[3]))
4946 {
4947 case LTGT:
4948 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
4949 operands[5], operands[0], operands[0]);
4950 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
4951 operands[5], operands[1], operands[2]);
4952 code = AND;
4953 break;
4954 case UNEQ:
4955 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
4956 operands[5], operands[0], operands[0]);
4957 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
4958 operands[5], operands[1], operands[2]);
4959 code = IOR;
4960 break;
4961 default:
4962 gcc_unreachable ();
4963 }
4964 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
4965 OPTAB_DIRECT);
4966 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
4967 return true;
4968 }
4969
4970 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
4971 operands[5], operands[1], operands[2]))
4972 return true;
4973
4974 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
4975 operands[1], operands[2]);
4976 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
4977 return true;
4978 }
4979
4980 /* Expand a signed/unsigned integral vector conditional move. */
4981
4982 bool
4983 ix86_expand_int_vcond (rtx operands[])
4984 {
4985 machine_mode data_mode = GET_MODE (operands[0]);
4986 machine_mode mode = GET_MODE (operands[4]);
4987 enum rtx_code code = GET_CODE (operands[3]);
4988 bool negate = false;
4989 rtx x, cop0, cop1;
4990
4991 cop0 = operands[4];
4992 cop1 = operands[5];
4993
4994 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
4995 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
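  /* Illustrative sketch for V4SImode:
       x < 0 ? -1 : 0   is   _mm_srai_epi32 (x, 31)	(arithmetic shift)
       x < 0 ?  1 : 0   is   _mm_srli_epi32 (x, 31)	(logical shift)  */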
4996 if ((code == LT || code == GE)
4997 && data_mode == mode
4998 && cop1 == CONST0_RTX (mode)
4999 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
5000 && GET_MODE_UNIT_SIZE (data_mode) > 1
5001 && GET_MODE_UNIT_SIZE (data_mode) <= 8
5002 && (GET_MODE_SIZE (data_mode) == 16
5003 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
5004 {
5005 rtx negop = operands[2 - (code == LT)];
5006 int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
5007 if (negop == CONST1_RTX (data_mode))
5008 {
5009 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
5010 operands[0], 1, OPTAB_DIRECT);
5011 if (res != operands[0])
5012 emit_move_insn (operands[0], res);
5013 return true;
5014 }
5015 else if (GET_MODE_INNER (data_mode) != DImode
5016 && vector_all_ones_operand (negop, data_mode))
5017 {
5018 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
5019 operands[0], 0, OPTAB_DIRECT);
5020 if (res != operands[0])
5021 emit_move_insn (operands[0], res);
5022 return true;
5023 }
5024 }
5025
5026 if (!nonimmediate_operand (cop1, mode))
5027 cop1 = force_reg (mode, cop1);
5028 if (!general_operand (operands[1], data_mode))
5029 operands[1] = force_reg (data_mode, operands[1]);
5030 if (!general_operand (operands[2], data_mode))
5031 operands[2] = force_reg (data_mode, operands[2]);
5032
5033 x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
5034 operands[1], operands[2], &negate);
5035
5036 if (!x)
5037 return false;
5038
5039 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
5040 operands[2-negate]);
5041 return true;
5042 }
5043
5044 static bool
5045 ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
5046 struct expand_vec_perm_d *d)
5047 {
5048 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
5049 expander, so args are either in d, or in op0, op1 etc. */
5050 machine_mode mode = GET_MODE (d ? d->op0 : op0);
5051 machine_mode maskmode = mode;
5052 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
5053
5054 switch (mode)
5055 {
5056 case E_V16QImode:
5057 if (TARGET_AVX512VL && TARGET_AVX512VBMI)
5058 gen = gen_avx512vl_vpermt2varv16qi3;
5059 break;
5060 case E_V32QImode:
5061 if (TARGET_AVX512VL && TARGET_AVX512VBMI)
5062 gen = gen_avx512vl_vpermt2varv32qi3;
5063 break;
5064 case E_V64QImode:
5065 if (TARGET_AVX512VBMI)
5066 gen = gen_avx512bw_vpermt2varv64qi3;
5067 break;
5068 case E_V8HImode:
5069 if (TARGET_AVX512VL && TARGET_AVX512BW)
5070 gen = gen_avx512vl_vpermt2varv8hi3;
5071 break;
5072 case E_V16HImode:
5073 if (TARGET_AVX512VL && TARGET_AVX512BW)
5074 gen = gen_avx512vl_vpermt2varv16hi3;
5075 break;
5076 case E_V32HImode:
5077 if (TARGET_AVX512BW)
5078 gen = gen_avx512bw_vpermt2varv32hi3;
5079 break;
5080 case E_V4SImode:
5081 if (TARGET_AVX512VL)
5082 gen = gen_avx512vl_vpermt2varv4si3;
5083 break;
5084 case E_V8SImode:
5085 if (TARGET_AVX512VL)
5086 gen = gen_avx512vl_vpermt2varv8si3;
5087 break;
5088 case E_V16SImode:
5089 if (TARGET_AVX512F)
5090 gen = gen_avx512f_vpermt2varv16si3;
5091 break;
5092 case E_V4SFmode:
5093 if (TARGET_AVX512VL)
5094 {
5095 gen = gen_avx512vl_vpermt2varv4sf3;
5096 maskmode = V4SImode;
5097 }
5098 break;
5099 case E_V8SFmode:
5100 if (TARGET_AVX512VL)
5101 {
5102 gen = gen_avx512vl_vpermt2varv8sf3;
5103 maskmode = V8SImode;
5104 }
5105 break;
5106 case E_V16SFmode:
5107 if (TARGET_AVX512F)
5108 {
5109 gen = gen_avx512f_vpermt2varv16sf3;
5110 maskmode = V16SImode;
5111 }
5112 break;
5113 case E_V2DImode:
5114 if (TARGET_AVX512VL)
5115 gen = gen_avx512vl_vpermt2varv2di3;
5116 break;
5117 case E_V4DImode:
5118 if (TARGET_AVX512VL)
5119 gen = gen_avx512vl_vpermt2varv4di3;
5120 break;
5121 case E_V8DImode:
5122 if (TARGET_AVX512F)
5123 gen = gen_avx512f_vpermt2varv8di3;
5124 break;
5125 case E_V2DFmode:
5126 if (TARGET_AVX512VL)
5127 {
5128 gen = gen_avx512vl_vpermt2varv2df3;
5129 maskmode = V2DImode;
5130 }
5131 break;
5132 case E_V4DFmode:
5133 if (TARGET_AVX512VL)
5134 {
5135 gen = gen_avx512vl_vpermt2varv4df3;
5136 maskmode = V4DImode;
5137 }
5138 break;
5139 case E_V8DFmode:
5140 if (TARGET_AVX512F)
5141 {
5142 gen = gen_avx512f_vpermt2varv8df3;
5143 maskmode = V8DImode;
5144 }
5145 break;
5146 default:
5147 break;
5148 }
5149
5150 if (gen == NULL)
5151 return false;
5152
5153 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
5154 expander, so args are either in d, or in op0, op1 etc. */
5155 if (d)
5156 {
5157 rtx vec[64];
5158 target = d->target;
5159 op0 = d->op0;
5160 op1 = d->op1;
5161 for (int i = 0; i < d->nelt; ++i)
5162 vec[i] = GEN_INT (d->perm[i]);
5163 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
5164 }
5165
5166 emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
5167 return true;
5168 }
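/* Illustrative sketch: the vpermt2* patterns selected above implement a
   two-source permute driven by an index vector, e.g.

     __m512i perm2 (__m512i idx, __m512i a, __m512i b)
     {
       return _mm512_permutex2var_epi32 (a, idx, b);
     }

   where, for 16-element vectors, index bits 0-3 select the element and
   bit 4 selects between the two sources.  */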
5169
5170 /* Expand a variable vector permutation. */
5171
5172 void
5173 ix86_expand_vec_perm (rtx operands[])
5174 {
5175 rtx target = operands[0];
5176 rtx op0 = operands[1];
5177 rtx op1 = operands[2];
5178 rtx mask = operands[3];
5179 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
5180 machine_mode mode = GET_MODE (op0);
5181 machine_mode maskmode = GET_MODE (mask);
5182 int w, e, i;
5183 bool one_operand_shuffle = rtx_equal_p (op0, op1);
5184
5185 /* Number of elements in the vector. */
5186 w = GET_MODE_NUNITS (mode);
5187 e = GET_MODE_UNIT_SIZE (mode);
5188 gcc_assert (w <= 64);
5189
5190 /* For HF mode vector, convert it to HI using subreg. */
5191 if (GET_MODE_INNER (mode) == HFmode)
5192 {
5193 machine_mode orig_mode = mode;
5194 mode = mode_for_vector (HImode, w).require ();
5195 target = lowpart_subreg (mode, target, orig_mode);
5196 op0 = lowpart_subreg (mode, op0, orig_mode);
5197 op1 = lowpart_subreg (mode, op1, orig_mode);
5198 }
5199
5200 if (TARGET_AVX512F && one_operand_shuffle)
5201 {
5202 rtx (*gen) (rtx, rtx, rtx) = NULL;
5203 switch (mode)
5204 {
5205 case E_V16SImode:
5206 gen = gen_avx512f_permvarv16si;
5207 break;
5208 case E_V16SFmode:
5209 gen = gen_avx512f_permvarv16sf;
5210 break;
5211 case E_V8DImode:
5212 gen = gen_avx512f_permvarv8di;
5213 break;
5214 case E_V8DFmode:
5215 gen = gen_avx512f_permvarv8df;
5216 break;
5217 default:
5218 break;
5219 }
5220 if (gen != NULL)
5221 {
5222 emit_insn (gen (target, op0, mask));
5223 return;
5224 }
5225 }
5226
5227 if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
5228 return;
5229
5230 if (TARGET_AVX2)
5231 {
5232 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
5233 {
5234 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
5235 a constant shuffle operand. With a tiny bit of effort we can
5236 use VPERMD instead. A re-interpretation stall for V4DFmode is
5237 unfortunate but there's no avoiding it.
5238 Similarly for V16HImode we don't have instructions for variable
5239 shuffling, while for V32QImode we can, after preparing suitable
5240 masks, use vpshufb; vpshufb; vpermq; vpor. */
5241
5242 if (mode == V16HImode)
5243 {
5244 maskmode = mode = V32QImode;
5245 w = 32;
5246 e = 1;
5247 }
5248 else
5249 {
5250 maskmode = mode = V8SImode;
5251 w = 8;
5252 e = 4;
5253 }
5254 t1 = gen_reg_rtx (maskmode);
5255
5256 /* Replicate the low bits of the V4DImode mask into V8SImode:
5257 mask = { A B C D }
5258 t1 = { A A B B C C D D }. */
5259 for (i = 0; i < w / 2; ++i)
5260 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
5261 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
5262 vt = force_reg (maskmode, vt);
5263 mask = gen_lowpart (maskmode, mask);
5264 if (maskmode == V8SImode)
5265 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
5266 else
5267 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
5268
5269 /* Multiply the shuffle indices by two. */
5270 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
5271 OPTAB_DIRECT);
5272
5273 /* Add one to the odd shuffle indices:
5274 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
5275 for (i = 0; i < w / 2; ++i)
5276 {
5277 vec[i * 2] = const0_rtx;
5278 vec[i * 2 + 1] = const1_rtx;
5279 }
5280 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
5281 vt = validize_mem (force_const_mem (maskmode, vt));
5282 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
5283 OPTAB_DIRECT);
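	  /* Worked example: a V4DImode mask { 1 3 0 2 } becomes the
	     V8SImode mask { 2 3 6 7 0 1 4 5 }, which selects the same
	     64-bit elements as pairs of 32-bit elements via vpermd.  */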
5284
5285 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
5286 operands[3] = mask = t1;
5287 target = gen_reg_rtx (mode);
5288 op0 = gen_lowpart (mode, op0);
5289 op1 = gen_lowpart (mode, op1);
5290 }
5291
5292 switch (mode)
5293 {
5294 case E_V8SImode:
5295 /* The VPERMD and VPERMPS instructions already properly ignore
5296 the high bits of the shuffle elements. No need for us to
5297 perform an AND ourselves. */
5298 if (one_operand_shuffle)
5299 {
5300 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
5301 if (target != operands[0])
5302 emit_move_insn (operands[0],
5303 gen_lowpart (GET_MODE (operands[0]), target));
5304 }
5305 else
5306 {
5307 t1 = gen_reg_rtx (V8SImode);
5308 t2 = gen_reg_rtx (V8SImode);
5309 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
5310 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
5311 goto merge_two;
5312 }
5313 return;
5314
5315 case E_V8SFmode:
5316 mask = gen_lowpart (V8SImode, mask);
5317 if (one_operand_shuffle)
5318 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
5319 else
5320 {
5321 t1 = gen_reg_rtx (V8SFmode);
5322 t2 = gen_reg_rtx (V8SFmode);
5323 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
5324 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
5325 goto merge_two;
5326 }
5327 return;
5328
5329 case E_V4SImode:
5330 /* By combining the two 128-bit input vectors into one 256-bit
5331 input vector, we can use VPERMD and VPERMPS for the full
5332 two-operand shuffle. */
5333 t1 = gen_reg_rtx (V8SImode);
5334 t2 = gen_reg_rtx (V8SImode);
5335 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
5336 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
5337 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
5338 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
5339 return;
5340
5341 case E_V4SFmode:
5342 t1 = gen_reg_rtx (V8SFmode);
5343 t2 = gen_reg_rtx (V8SImode);
5344 mask = gen_lowpart (V4SImode, mask);
5345 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
5346 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
5347 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
5348 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
5349 return;
5350
5351 case E_V32QImode:
5352 t1 = gen_reg_rtx (V32QImode);
5353 t2 = gen_reg_rtx (V32QImode);
5354 t3 = gen_reg_rtx (V32QImode);
5355 vt2 = GEN_INT (-128);
5356 vt = gen_const_vec_duplicate (V32QImode, vt2);
5357 vt = force_reg (V32QImode, vt);
5358 for (i = 0; i < 32; i++)
5359 vec[i] = i < 16 ? vt2 : const0_rtx;
5360 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
5361 vt2 = force_reg (V32QImode, vt2);
5362 /* From mask create two adjusted masks, which contain the same
5363 bits as mask in the low 7 bits of each vector element.
5364 The first mask will have the most significant bit clear
5365 if it requests element from the same 128-bit lane
5366 and MSB set if it requests element from the other 128-bit lane.
5367 The second mask will have the opposite values of the MSB,
5368 and additionally will have its 128-bit lanes swapped.
5369 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
5370 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
5371 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
5372 stands for other 12 bytes. */
5373 /* The bit that tells whether an element comes from the same lane or
5374 from the other lane is bit 4, so shift it up by 3 to the MSB position. */
5375 t5 = gen_reg_rtx (V4DImode);
5376 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
5377 GEN_INT (3)));
5378 /* Clear MSB bits from the mask just in case it had them set. */
5379 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
5380 /* After this t1 will have MSB set for elements from other lane. */
5381 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
5382 /* Clear bits other than MSB. */
5383 emit_insn (gen_andv32qi3 (t1, t1, vt));
5384 /* Or in the lower bits from mask into t3. */
5385 emit_insn (gen_iorv32qi3 (t3, t1, t2));
5386 /* And invert MSB bits in t1, so MSB is set for elements from the same
5387 lane. */
5388 emit_insn (gen_xorv32qi3 (t1, t1, vt));
5389 /* Swap 128-bit lanes in t3. */
5390 t6 = gen_reg_rtx (V4DImode);
5391 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
5392 const2_rtx, GEN_INT (3),
5393 const0_rtx, const1_rtx));
5394 /* And or in the lower bits from mask into t1. */
5395 emit_insn (gen_iorv32qi3 (t1, t1, t2));
5396 if (one_operand_shuffle)
5397 {
5398 /* Each of these shuffles will put 0s in places where
5399 element from the other 128-bit lane is needed, otherwise
5400 will shuffle in the requested value. */
5401 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
5402 gen_lowpart (V32QImode, t6)));
5403 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
5404 /* For t3 the 128-bit lanes are swapped again. */
5405 t7 = gen_reg_rtx (V4DImode);
5406 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
5407 const2_rtx, GEN_INT (3),
5408 const0_rtx, const1_rtx));
5409 /* And oring both together leads to the result. */
5410 emit_insn (gen_iorv32qi3 (target, t1,
5411 gen_lowpart (V32QImode, t7)));
5412 if (target != operands[0])
5413 emit_move_insn (operands[0],
5414 gen_lowpart (GET_MODE (operands[0]), target));
5415 return;
5416 }
5417
5418 t4 = gen_reg_rtx (V32QImode);
5419 /* Similar to the one_operand_shuffle code above,
5420 just repeated twice, once for each operand. The merge_two:
5421 code below will merge the two results together. */
5422 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
5423 gen_lowpart (V32QImode, t6)));
5424 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
5425 gen_lowpart (V32QImode, t6)));
5426 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
5427 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
5428 t7 = gen_reg_rtx (V4DImode);
5429 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
5430 const2_rtx, GEN_INT (3),
5431 const0_rtx, const1_rtx));
5432 t8 = gen_reg_rtx (V4DImode);
5433 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
5434 const2_rtx, GEN_INT (3),
5435 const0_rtx, const1_rtx));
5436 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
5437 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
5438 t1 = t4;
5439 t2 = t3;
5440 goto merge_two;
5441
5442 default:
5443 gcc_assert (GET_MODE_SIZE (mode) <= 16);
5444 break;
5445 }
5446 }
5447
5448 if (TARGET_XOP)
5449 {
5450 /* The XOP VPPERM insn supports three inputs. By ignoring the
5451 one_operand_shuffle special case, we avoid creating another
5452 set of constant vectors in memory. */
5453 one_operand_shuffle = false;
5454
5455 /* mask = mask & {2*w-1, ...} */
5456 vt = GEN_INT (2*w - 1);
5457 }
5458 else
5459 {
5460 /* mask = mask & {w-1, ...} */
5461 vt = GEN_INT (w - 1);
5462 }
5463
5464 vt = gen_const_vec_duplicate (maskmode, vt);
5465 mask = expand_simple_binop (maskmode, AND, mask, vt,
5466 NULL_RTX, 0, OPTAB_DIRECT);
5467
5468 /* For non-QImode operations, convert the word permutation control
5469 into a byte permutation control. */
5470 if (mode != V16QImode)
5471 {
5472 mask = expand_simple_binop (maskmode, ASHIFT, mask,
5473 GEN_INT (exact_log2 (e)),
5474 NULL_RTX, 0, OPTAB_DIRECT);
5475
5476 /* Convert mask to vector of chars. */
5477 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
5478
5479 /* Replicate each of the input bytes into byte positions:
5480 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
5481 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
5482 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
5483 for (i = 0; i < 16; ++i)
5484 vec[i] = GEN_INT (i/e * e);
5485 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
5486 vt = validize_mem (force_const_mem (V16QImode, vt));
5487 if (TARGET_XOP)
5488 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
5489 else
5490 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
5491
5492 /* Convert it into the byte positions by doing
5493 mask = mask + {0,1,...,e-1, 0,1,...,e-1, ...} */
5494 for (i = 0; i < 16; ++i)
5495 vec[i] = GEN_INT (i % e);
5496 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
5497 vt = validize_mem (force_const_mem (V16QImode, vt));
5498 emit_insn (gen_addv16qi3 (mask, mask, vt));
5499 }
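  /* Worked example: for a V8HImode shuffle (e == 2) a mask element with
     value 3 becomes the byte pair { 6, 7 } in the pshufb control, i.e.
     3 shifted left by 1 and replicated to both byte positions, plus
     { 0, 1 }.  */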
5500
5501 /* The actual shuffle operations all operate on V16QImode. */
5502 op0 = gen_lowpart (V16QImode, op0);
5503 op1 = gen_lowpart (V16QImode, op1);
5504
5505 if (TARGET_XOP)
5506 {
5507 if (GET_MODE (target) != V16QImode)
5508 target = gen_reg_rtx (V16QImode);
5509 emit_insn (gen_xop_pperm (target, op0, op1, mask));
5510 if (target != operands[0])
5511 emit_move_insn (operands[0],
5512 gen_lowpart (GET_MODE (operands[0]), target));
5513 }
5514 else if (one_operand_shuffle)
5515 {
5516 if (GET_MODE (target) != V16QImode)
5517 target = gen_reg_rtx (V16QImode);
5518 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
5519 if (target != operands[0])
5520 emit_move_insn (operands[0],
5521 gen_lowpart (GET_MODE (operands[0]), target));
5522 }
5523 else
5524 {
5525 rtx xops[6];
5526 bool ok;
5527
5528 /* Shuffle the two input vectors independently. */
5529 t1 = gen_reg_rtx (V16QImode);
5530 t2 = gen_reg_rtx (V16QImode);
5531 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
5532 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
5533
5534 merge_two:
5535 /* Then merge them together. The key is whether any given control
5536 element contained a bit set that indicates the second word. */
5537 mask = operands[3];
5538 vt = GEN_INT (w);
5539 if (maskmode == V2DImode && !TARGET_SSE4_1)
5540 {
5541 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
5542 more shuffle to convert the V2DI input mask into a V4SI
5543 input mask, at which point the masking done by
5544 ix86_expand_int_vcond will work as desired. */
5545 rtx t3 = gen_reg_rtx (V4SImode);
5546 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
5547 const0_rtx, const0_rtx,
5548 const2_rtx, const2_rtx));
5549 mask = t3;
5550 maskmode = V4SImode;
5551 e = w = 4;
5552 }
5553
5554 vt = gen_const_vec_duplicate (maskmode, vt);
5555 vt = force_reg (maskmode, vt);
5556 mask = expand_simple_binop (maskmode, AND, mask, vt,
5557 NULL_RTX, 0, OPTAB_DIRECT);
5558
5559 if (GET_MODE (target) != mode)
5560 target = gen_reg_rtx (mode);
5561 xops[0] = target;
5562 xops[1] = gen_lowpart (mode, t2);
5563 xops[2] = gen_lowpart (mode, t1);
5564 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
5565 xops[4] = mask;
5566 xops[5] = vt;
5567 ok = ix86_expand_int_vcond (xops);
5568 gcc_assert (ok);
5569 if (target != operands[0])
5570 emit_move_insn (operands[0],
5571 gen_lowpart (GET_MODE (operands[0]), target));
5572 }
5573 }
5574
5575 /* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is
5576 true if we should do zero extension, else sign extension. HIGH_P is
5577 true if we want the N/2 high elements, else the low elements. */
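/* Illustrative sketch: for V16QImode with SSE4.1 the low half is widened
   with pmovzxbw/pmovsxbw, e.g.

     __m128i low_u16 (__m128i x) { return _mm_cvtepu8_epi16 (x); }

   and for the high half the source is first shifted right by 8 bytes.
   Without SSE4.1 the interleave path below pairs each element with zero
   (zero extension) or with its sign mask computed by pcmpgtb (sign
   extension).  */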
5578
5579 void
5580 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
5581 {
5582 machine_mode imode = GET_MODE (src);
5583 rtx tmp;
5584
5585 if (TARGET_SSE4_1)
5586 {
5587 rtx (*unpack)(rtx, rtx);
5588 rtx (*extract)(rtx, rtx) = NULL;
5589 machine_mode halfmode = BLKmode;
5590
5591 switch (imode)
5592 {
5593 case E_V64QImode:
5594 if (unsigned_p)
5595 unpack = gen_avx512bw_zero_extendv32qiv32hi2;
5596 else
5597 unpack = gen_avx512bw_sign_extendv32qiv32hi2;
5598 halfmode = V32QImode;
5599 extract
5600 = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
5601 break;
5602 case E_V32QImode:
5603 if (unsigned_p)
5604 unpack = gen_avx2_zero_extendv16qiv16hi2;
5605 else
5606 unpack = gen_avx2_sign_extendv16qiv16hi2;
5607 halfmode = V16QImode;
5608 extract
5609 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
5610 break;
5611 case E_V32HImode:
5612 if (unsigned_p)
5613 unpack = gen_avx512f_zero_extendv16hiv16si2;
5614 else
5615 unpack = gen_avx512f_sign_extendv16hiv16si2;
5616 halfmode = V16HImode;
5617 extract
5618 = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
5619 break;
5620 case E_V16HImode:
5621 if (unsigned_p)
5622 unpack = gen_avx2_zero_extendv8hiv8si2;
5623 else
5624 unpack = gen_avx2_sign_extendv8hiv8si2;
5625 halfmode = V8HImode;
5626 extract
5627 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
5628 break;
5629 case E_V16SImode:
5630 if (unsigned_p)
5631 unpack = gen_avx512f_zero_extendv8siv8di2;
5632 else
5633 unpack = gen_avx512f_sign_extendv8siv8di2;
5634 halfmode = V8SImode;
5635 extract
5636 = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
5637 break;
5638 case E_V8SImode:
5639 if (unsigned_p)
5640 unpack = gen_avx2_zero_extendv4siv4di2;
5641 else
5642 unpack = gen_avx2_sign_extendv4siv4di2;
5643 halfmode = V4SImode;
5644 extract
5645 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
5646 break;
5647 case E_V16QImode:
5648 if (unsigned_p)
5649 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
5650 else
5651 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
5652 break;
5653 case E_V8HImode:
5654 if (unsigned_p)
5655 unpack = gen_sse4_1_zero_extendv4hiv4si2;
5656 else
5657 unpack = gen_sse4_1_sign_extendv4hiv4si2;
5658 break;
5659 case E_V4SImode:
5660 if (unsigned_p)
5661 unpack = gen_sse4_1_zero_extendv2siv2di2;
5662 else
5663 unpack = gen_sse4_1_sign_extendv2siv2di2;
5664 break;
5665 case E_V8QImode:
5666 if (unsigned_p)
5667 unpack = gen_sse4_1_zero_extendv4qiv4hi2;
5668 else
5669 unpack = gen_sse4_1_sign_extendv4qiv4hi2;
5670 break;
5671 case E_V4HImode:
5672 if (unsigned_p)
5673 unpack = gen_sse4_1_zero_extendv2hiv2si2;
5674 else
5675 unpack = gen_sse4_1_sign_extendv2hiv2si2;
5676 break;
5677 case E_V4QImode:
5678 if (unsigned_p)
5679 unpack = gen_sse4_1_zero_extendv2qiv2hi2;
5680 else
5681 unpack = gen_sse4_1_sign_extendv2qiv2hi2;
5682 break;
5683 default:
5684 gcc_unreachable ();
5685 }
5686
5687 if (GET_MODE_SIZE (imode) >= 32)
5688 {
5689 tmp = gen_reg_rtx (halfmode);
5690 emit_insn (extract (tmp, src));
5691 }
5692 else if (high_p)
5693 {
5694 switch (GET_MODE_SIZE (imode))
5695 {
5696 case 16:
5697 /* Shift higher 8 bytes to lower 8 bytes. */
5698 tmp = gen_reg_rtx (V1TImode);
5699 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
5700 GEN_INT (64)));
5701 break;
5702 case 8:
5703 /* Shift higher 4 bytes to lower 4 bytes. */
5704 tmp = gen_reg_rtx (V1DImode);
5705 emit_insn (gen_mmx_lshrv1di3 (tmp, gen_lowpart (V1DImode, src),
5706 GEN_INT (32)));
5707 break;
5708 case 4:
5709 /* Shift higher 2 bytes to lower 2 bytes. */
5710 tmp = gen_reg_rtx (V1SImode);
5711 emit_insn (gen_mmx_lshrv1si3 (tmp, gen_lowpart (V1SImode, src),
5712 GEN_INT (16)));
5713 break;
5714 default:
5715 gcc_unreachable ();
5716 }
5717
5718 tmp = gen_lowpart (imode, tmp);
5719 }
5720 else
5721 tmp = src;
5722
5723 emit_insn (unpack (dest, tmp));
5724 }
5725 else
5726 {
5727 rtx (*unpack)(rtx, rtx, rtx);
5728
5729 switch (imode)
5730 {
5731 case E_V16QImode:
5732 if (high_p)
5733 unpack = gen_vec_interleave_highv16qi;
5734 else
5735 unpack = gen_vec_interleave_lowv16qi;
5736 break;
5737 case E_V8HImode:
5738 if (high_p)
5739 unpack = gen_vec_interleave_highv8hi;
5740 else
5741 unpack = gen_vec_interleave_lowv8hi;
5742 break;
5743 case E_V4SImode:
5744 if (high_p)
5745 unpack = gen_vec_interleave_highv4si;
5746 else
5747 unpack = gen_vec_interleave_lowv4si;
5748 break;
5749 case E_V8QImode:
5750 if (high_p)
5751 unpack = gen_mmx_punpckhbw;
5752 else
5753 unpack = gen_mmx_punpcklbw;
5754 break;
5755 case E_V4HImode:
5756 if (high_p)
5757 unpack = gen_mmx_punpckhwd;
5758 else
5759 unpack = gen_mmx_punpcklwd;
5760 break;
5761 case E_V4QImode:
5762 if (high_p)
5763 unpack = gen_mmx_punpckhbw_low;
5764 else
5765 unpack = gen_mmx_punpcklbw_low;
5766 break;
5767 default:
5768 gcc_unreachable ();
5769 }
5770
5771 if (unsigned_p)
5772 tmp = force_reg (imode, CONST0_RTX (imode));
5773 else
5774 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
5775 src, pc_rtx, pc_rtx);
5776
5777 rtx tmp2 = gen_reg_rtx (imode);
5778 emit_insn (unpack (tmp2, src, tmp));
5779 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
5780 }
5781 }
5782
5783 /* Return true if MEM is a constant-pool reference containing a CONST_VECTOR
5784 permutation index; if so, assign the index to PERM. */
5785 bool
5786 ix86_extract_perm_from_pool_constant (int* perm, rtx mem)
5787 {
5788 machine_mode mode = GET_MODE (mem);
5789 int nelt = GET_MODE_NUNITS (mode);
5790
5791 if (!INTEGRAL_MODE_P (mode))
5792 return false;
5793
5794 /* Needs to be constant pool. */
5795 if (!(MEM_P (mem))
5796 || !SYMBOL_REF_P (XEXP (mem, 0))
5797 || !CONSTANT_POOL_ADDRESS_P (XEXP (mem, 0)))
5798 return false;
5799
5800 rtx constant = get_pool_constant (XEXP (mem, 0));
5801
5802 if (GET_CODE (constant) != CONST_VECTOR)
5803 return false;
5804
5805 /* There could be some rtx like
5806 (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
5807 but with "*.LC1" referring to a V2DI constant vector. */
5808 if (GET_MODE (constant) != mode)
5809 {
5810 constant = simplify_subreg (mode, constant, GET_MODE (constant), 0);
5811
5812 if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR)
5813 return false;
5814 }
5815
5816 for (int i = 0; i != nelt; i++)
5817 perm[i] = UINTVAL (XVECEXP (constant, 0, i));
5818
5819 return true;
5820 }
5821
5822 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
5823 but works for floating-point parameters and non-offsettable memories.
5824 For pushes, it returns just stack offsets; the values will be saved
5825 in the right order. At most four parts are generated. */
5826
5827 static int
5828 ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
5829 {
5830 int size;
5831
5832 if (!TARGET_64BIT)
5833 size = mode == XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
5834 else
5835 size = (GET_MODE_SIZE (mode) + 4) / 8;
5836
5837 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
5838 gcc_assert (size >= 2 && size <= 4);
5839
5840 /* Optimize constant pool reference to immediates. This is used by fp
5841 moves, that force all constants to memory to allow combining. */
5842 if (MEM_P (operand) && MEM_READONLY_P (operand))
5843 operand = avoid_constant_pool_reference (operand);
5844
5845 if (MEM_P (operand) && !offsettable_memref_p (operand))
5846 {
5847 /* The only non-offsettable memories we handle are pushes. */
5848 int ok = push_operand (operand, VOIDmode);
5849
5850 gcc_assert (ok);
5851
5852 operand = copy_rtx (operand);
5853 PUT_MODE (operand, word_mode);
5854 parts[0] = parts[1] = parts[2] = parts[3] = operand;
5855 return size;
5856 }
5857
5858 if (GET_CODE (operand) == CONST_VECTOR)
5859 {
5860 scalar_int_mode imode = int_mode_for_mode (mode).require ();
5861 /* Caution: if we looked through a constant pool memory above,
5862 the operand may actually have a different mode now. That's
5863 ok, since we want to pun this all the way back to an integer. */
5864 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
5865 gcc_assert (operand != NULL);
5866 mode = imode;
5867 }
5868
5869 if (!TARGET_64BIT)
5870 {
5871 if (mode == DImode)
5872 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
5873 else
5874 {
5875 int i;
5876
5877 if (REG_P (operand))
5878 {
5879 gcc_assert (reload_completed);
5880 for (i = 0; i < size; i++)
5881 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
5882 }
5883 else if (offsettable_memref_p (operand))
5884 {
5885 operand = adjust_address (operand, SImode, 0);
5886 parts[0] = operand;
5887 for (i = 1; i < size; i++)
5888 parts[i] = adjust_address (operand, SImode, 4 * i);
5889 }
5890 else if (CONST_DOUBLE_P (operand))
5891 {
5892 const REAL_VALUE_TYPE *r;
5893 long l[4];
5894
5895 r = CONST_DOUBLE_REAL_VALUE (operand);
5896 switch (mode)
5897 {
5898 case E_TFmode:
5899 real_to_target (l, r, mode);
5900 parts[3] = gen_int_mode (l[3], SImode);
5901 parts[2] = gen_int_mode (l[2], SImode);
5902 break;
5903 case E_XFmode:
5904 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
5905 long double may not be 80-bit. */
5906 real_to_target (l, r, mode);
5907 parts[2] = gen_int_mode (l[2], SImode);
5908 break;
5909 case E_DFmode:
5910 REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
5911 break;
5912 default:
5913 gcc_unreachable ();
5914 }
5915 parts[1] = gen_int_mode (l[1], SImode);
5916 parts[0] = gen_int_mode (l[0], SImode);
5917 }
5918 else
5919 gcc_unreachable ();
5920 }
5921 }
5922 else
5923 {
5924 if (mode == TImode)
5925 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
5926 if (mode == XFmode || mode == TFmode)
5927 {
5928 machine_mode upper_mode = mode == XFmode ? SImode : DImode;
5929 if (REG_P (operand))
5930 {
5931 gcc_assert (reload_completed);
5932 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
5933 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
5934 }
5935 else if (offsettable_memref_p (operand))
5936 {
5937 operand = adjust_address (operand, DImode, 0);
5938 parts[0] = operand;
5939 parts[1] = adjust_address (operand, upper_mode, 8);
5940 }
5941 else if (CONST_DOUBLE_P (operand))
5942 {
5943 long l[4];
5944
5945 real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
5946
5947 /* real_to_target puts 32-bit pieces in each long. */
5948 parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
5949 | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
5950 << 32), DImode);
5951
5952 if (upper_mode == SImode)
5953 parts[1] = gen_int_mode (l[2], SImode);
5954 else
5955 parts[1]
5956 = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
5957 | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
5958 << 32), DImode);
5959 }
5960 else
5961 gcc_unreachable ();
5962 }
5963 }
5964
5965 return size;
5966 }
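/* Worked example: on a 32-bit target an XFmode operand is returned as
   three SImode parts and a DFmode operand as two, while on a 64-bit
   target XFmode and TFmode are each split into two parts.  */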
5967
5968 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
5969 Operand 0 is the destination and operand 1 the source; the value is
5970 split into half-mode (or smaller) parts, which are then moved or
5971 pushed in the correct order. */
5972
5973 void
5974 ix86_split_long_move (rtx operands[])
5975 {
5976 rtx part[2][4];
5977 int nparts, i, j;
5978 int push = 0;
5979 int collisions = 0;
5980 machine_mode mode = GET_MODE (operands[0]);
5981 bool collisionparts[4];
5982
5983 /* The DFmode expanders may ask us to move a double.
5984 For a 64-bit target this is a single move. By hiding this fact
5985 here we simplify the i386.md splitters. */
5986 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
5987 {
5988 /* Optimize constant pool reference to immediates. This is used by
5989 fp moves, that force all constants to memory to allow combining. */
5990
5991 if (MEM_P (operands[1])
5992 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
5993 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
5994 operands[1] = get_pool_constant (XEXP (operands[1], 0));
5995 if (push_operand (operands[0], VOIDmode))
5996 {
5997 operands[0] = copy_rtx (operands[0]);
5998 PUT_MODE (operands[0], word_mode);
5999 }
6000 else
6001 operands[0] = gen_lowpart (DImode, operands[0]);
6002 operands[1] = gen_lowpart (DImode, operands[1]);
6003 emit_move_insn (operands[0], operands[1]);
6004 return;
6005 }
6006
6007 /* The only non-offsettable memory we handle is push. */
6008 if (push_operand (operands[0], VOIDmode))
6009 push = 1;
6010 else
6011 gcc_assert (!MEM_P (operands[0])
6012 || offsettable_memref_p (operands[0]));
6013
6014 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
6015 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
6016
6017 /* When emitting a push, take care with source operands on the stack. */
6018 if (push && MEM_P (operands[1])
6019 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
6020 {
6021 rtx src_base = XEXP (part[1][nparts - 1], 0);
6022
6023 /* Compensate for the stack decrement by 4. */
6024 if (!TARGET_64BIT && nparts == 3
6025 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
6026 src_base = plus_constant (Pmode, src_base, 4);
6027
6028 /* src_base refers to the stack pointer and is
6029 automatically decreased by emitted push. */
6030 for (i = 0; i < nparts; i++)
6031 part[1][i] = change_address (part[1][i],
6032 GET_MODE (part[1][i]), src_base);
6033 }
6034
6035 /* We need to do the copy in the right order in case an address register
6036 of the source overlaps the destination. */
6037 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
6038 {
6039 rtx tmp;
6040
6041 for (i = 0; i < nparts; i++)
6042 {
6043 collisionparts[i]
6044 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
6045 if (collisionparts[i])
6046 collisions++;
6047 }
6048
6049 /* Collision in the middle part can be handled by reordering. */
6050 if (collisions == 1 && nparts == 3 && collisionparts [1])
6051 {
6052 std::swap (part[0][1], part[0][2]);
6053 std::swap (part[1][1], part[1][2]);
6054 }
6055 else if (collisions == 1
6056 && nparts == 4
6057 && (collisionparts [1] || collisionparts [2]))
6058 {
6059 if (collisionparts [1])
6060 {
6061 std::swap (part[0][1], part[0][2]);
6062 std::swap (part[1][1], part[1][2]);
6063 }
6064 else
6065 {
6066 std::swap (part[0][2], part[0][3]);
6067 std::swap (part[1][2], part[1][3]);
6068 }
6069 }
6070
6071 /* If there are more collisions, we can't handle it by reordering.
6072 Do an lea to the last part and use only one colliding move. */
6073 else if (collisions > 1)
6074 {
6075 rtx base, addr;
6076
6077 collisions = 1;
6078
6079 base = part[0][nparts - 1];
6080
6081 /* Handle the case when the last part isn't valid for lea.
6082 Happens in 64-bit mode storing the 12-byte XFmode. */
6083 if (GET_MODE (base) != Pmode)
6084 base = gen_rtx_REG (Pmode, REGNO (base));
6085
6086 addr = XEXP (part[1][0], 0);
6087 if (TARGET_TLS_DIRECT_SEG_REFS)
6088 {
6089 struct ix86_address parts;
6090 int ok = ix86_decompose_address (addr, &parts);
6091 gcc_assert (ok);
6092 /* It is not valid to use %gs: or %fs: in lea. */
6093 gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
6094 }
6095 emit_insn (gen_rtx_SET (base, addr));
6096 part[1][0] = replace_equiv_address (part[1][0], base);
6097 for (i = 1; i < nparts; i++)
6098 {
6099 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
6100 part[1][i] = replace_equiv_address (part[1][i], tmp);
6101 }
6102 }
6103 }
6104
6105 if (push)
6106 {
6107 if (!TARGET_64BIT)
6108 {
6109 if (nparts == 3)
6110 {
6111 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
6112 emit_insn (gen_add2_insn (stack_pointer_rtx, GEN_INT (-4)));
6113 emit_move_insn (part[0][2], part[1][2]);
6114 }
6115 else if (nparts == 4)
6116 {
6117 emit_move_insn (part[0][3], part[1][3]);
6118 emit_move_insn (part[0][2], part[1][2]);
6119 }
6120 }
6121 else
6122 {
6123 /* In 64-bit mode we don't have a 32-bit push available. If this is
6124 a register, that is OK - we just use the larger counterpart. We also
6125 retype memory - these come from an attempt to avoid the REX prefix
6126 on moving the second half of a TFmode value. */
6127 if (GET_MODE (part[1][1]) == SImode)
6128 {
6129 switch (GET_CODE (part[1][1]))
6130 {
6131 case MEM:
6132 part[1][1] = adjust_address (part[1][1], DImode, 0);
6133 break;
6134
6135 case REG:
6136 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
6137 break;
6138
6139 default:
6140 gcc_unreachable ();
6141 }
6142
6143 if (GET_MODE (part[1][0]) == SImode)
6144 part[1][0] = part[1][1];
6145 }
6146 }
6147 emit_move_insn (part[0][1], part[1][1]);
6148 emit_move_insn (part[0][0], part[1][0]);
6149 return;
6150 }
6151
6152 /* Choose correct order to not overwrite the source before it is copied. */
6153 if ((REG_P (part[0][0])
6154 && REG_P (part[1][1])
6155 && (REGNO (part[0][0]) == REGNO (part[1][1])
6156 || (nparts == 3
6157 && REGNO (part[0][0]) == REGNO (part[1][2]))
6158 || (nparts == 4
6159 && REGNO (part[0][0]) == REGNO (part[1][3]))))
6160 || (collisions > 0
6161 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
6162 {
6163 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
6164 {
6165 operands[2 + i] = part[0][j];
6166 operands[6 + i] = part[1][j];
6167 }
6168 }
6169 else
6170 {
6171 for (i = 0; i < nparts; i++)
6172 {
6173 operands[2 + i] = part[0][i];
6174 operands[6 + i] = part[1][i];
6175 }
6176 }
6177
6178 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
6179 if (optimize_insn_for_size_p ())
6180 {
6181 for (j = 0; j < nparts - 1; j++)
6182 if (CONST_INT_P (operands[6 + j])
6183 && operands[6 + j] != const0_rtx
6184 && REG_P (operands[2 + j]))
6185 for (i = j; i < nparts - 1; i++)
6186 if (CONST_INT_P (operands[7 + i])
6187 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
6188 operands[7 + i] = operands[2 + j];
6189 }
6190
6191 for (i = 0; i < nparts; i++)
6192 emit_move_insn (operands[2 + i], operands[6 + i]);
6193
6194 return;
6195 }
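
/* A small illustration of the ordering above (names are only an
   example): if the low destination register is also the base register
   of the source memory, emitting the low part first,

     dst_low  = mem[base];
     dst_high = mem[base + word];

   would clobber the base address before the high part is read, so the
   parts are filled into OPERANDS in reverse and the high half is moved
   first.  */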
6196
6197 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
6198 left shift by a constant, either using a single shift or
6199 a sequence of add instructions. */
6200
6201 static void
6202 ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
6203 {
6204 if (count == 1
6205 || (count * ix86_cost->add <= ix86_cost->shift_const
6206 && !optimize_insn_for_size_p ()))
6207 {
6208 while (count-- > 0)
6209 emit_insn (gen_add2_insn (operand, operand));
6210 }
6211 else
6212 {
6213 rtx (*insn)(rtx, rtx, rtx);
6214
6215 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
6216 emit_insn (insn (operand, operand, GEN_INT (count)));
6217 }
6218 }
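
/* For example, when COUNT is 1, or when COUNT add insns are no more
   expensive than one constant shift (and we are not optimizing for
   size), a shift of one half by 2 is expanded as

     x += x;
     x += x;

   which is equivalent to x <<= 2; otherwise a single shift insn is
   emitted.  Here x stands for OPERAND, shifted in place (a sketch;
   the name is illustrative).  */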
6219
6220 void
6221 ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
6222 {
6223 rtx (*gen_ashl3)(rtx, rtx, rtx);
6224 rtx (*gen_shld)(rtx, rtx, rtx);
6225 int half_width = GET_MODE_BITSIZE (mode) >> 1;
6226 machine_mode half_mode;
6227
6228 rtx low[2], high[2];
6229 int count;
6230
6231 if (CONST_INT_P (operands[2]))
6232 {
6233 split_double_mode (mode, operands, 2, low, high);
6234 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
6235
6236 if (count >= half_width)
6237 {
6238 emit_move_insn (high[0], low[1]);
6239 ix86_expand_clear (low[0]);
6240
6241 if (count > half_width)
6242 ix86_expand_ashl_const (high[0], count - half_width, mode);
6243 }
6244 else
6245 {
6246 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
6247
6248 if (!rtx_equal_p (operands[0], operands[1]))
6249 emit_move_insn (operands[0], operands[1]);
6250
6251 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
6252 ix86_expand_ashl_const (low[0], count, mode);
6253 }
6254 return;
6255 }
6256
6257 split_double_mode (mode, operands, 1, low, high);
6258 half_mode = mode == DImode ? SImode : DImode;
6259
6260 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
6261
6262 if (operands[1] == const1_rtx)
6263 {
6264 /* Assuming we've chosen QImode-capable registers, 1 << N
6265 can be done with two 32/64-bit shifts, no branches, no cmoves. */
6266 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
6267 {
6268 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
6269
6270 ix86_expand_clear (low[0]);
6271 ix86_expand_clear (high[0]);
6272 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
6273
6274 d = gen_lowpart (QImode, low[0]);
6275 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
6276 s = gen_rtx_EQ (QImode, flags, const0_rtx);
6277 emit_insn (gen_rtx_SET (d, s));
6278
6279 d = gen_lowpart (QImode, high[0]);
6280 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
6281 s = gen_rtx_NE (QImode, flags, const0_rtx);
6282 emit_insn (gen_rtx_SET (d, s));
6283 }
6284
6285 /* Otherwise, we can get the same results by manually performing
6286 a bit extract operation on bit 5/6, and then performing the two
6287 shifts. The two methods of getting 0/1 into low/high are exactly
6288 the same size. Avoiding the shift in the bit extract case helps
6289 pentium4 a bit; no one else seems to care much either way. */
6290 else
6291 {
6292 rtx (*gen_lshr3)(rtx, rtx, rtx);
6293 rtx (*gen_and3)(rtx, rtx, rtx);
6294 rtx (*gen_xor3)(rtx, rtx, rtx);
6295 HOST_WIDE_INT bits;
6296 rtx x;
6297
6298 if (mode == DImode)
6299 {
6300 gen_lshr3 = gen_lshrsi3;
6301 gen_and3 = gen_andsi3;
6302 gen_xor3 = gen_xorsi3;
6303 bits = 5;
6304 }
6305 else
6306 {
6307 gen_lshr3 = gen_lshrdi3;
6308 gen_and3 = gen_anddi3;
6309 gen_xor3 = gen_xordi3;
6310 bits = 6;
6311 }
6312
6313 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
6314 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
6315 else
6316 x = gen_lowpart (half_mode, operands[2]);
6317 emit_insn (gen_rtx_SET (high[0], x));
6318
6319 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
6320 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
6321 emit_move_insn (low[0], high[0]);
6322 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
6323 }
6324
6325 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
6326 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
6327 return;
6328 }
6329
6330 if (operands[1] == constm1_rtx)
6331 {
6332 /* For -1 << N, we can avoid the shld instruction, because we
6333 know that we're shifting 0...31/63 ones into a -1. */
6334 emit_move_insn (low[0], constm1_rtx);
6335 if (optimize_insn_for_size_p ())
6336 emit_move_insn (high[0], low[0]);
6337 else
6338 emit_move_insn (high[0], constm1_rtx);
6339 }
6340 else
6341 {
6342 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
6343
6344 if (!rtx_equal_p (operands[0], operands[1]))
6345 emit_move_insn (operands[0], operands[1]);
6346
6347 split_double_mode (mode, operands, 1, low, high);
6348 emit_insn (gen_shld (high[0], low[0], operands[2]));
6349 }
6350
6351 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
6352
6353 if (TARGET_CMOVE && scratch)
6354 {
6355 ix86_expand_clear (scratch);
6356 emit_insn (gen_x86_shift_adj_1
6357 (half_mode, high[0], low[0], operands[2], scratch));
6358 }
6359 else
6360 emit_insn (gen_x86_shift_adj_2 (half_mode, high[0], low[0], operands[2]));
6361 }
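
/* A compact C model of the constant-count path above, written for a
   DImode shift split into 32-bit halves (the routine handles TImode
   split into 64-bit halves the same way; names are illustrative):

     void dw_shl (unsigned int *lo, unsigned int *hi, unsigned int n)
     {
       if (n >= 32)
         {
           *hi = *lo << (n - 32);
           *lo = 0;
         }
       else if (n > 0)
         {
           *hi = (*hi << n) | (*lo >> (32 - n));
           *lo <<= n;
         }
     }

   The first branch corresponds to the move/clear sequence, the second
   to the shld/shl pair.  The variable-count path emits the same
   shld/shl pair and then corrects the COUNT >= 32 case with the
   x86_shift_adj_1 pattern when a scratch register and cmov are
   available, or x86_shift_adj_2 otherwise.  */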
6362
6363 void
6364 ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
6365 {
6366 rtx (*gen_ashr3)(rtx, rtx, rtx)
6367 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
6368 rtx (*gen_shrd)(rtx, rtx, rtx);
6369 int half_width = GET_MODE_BITSIZE (mode) >> 1;
6370
6371 rtx low[2], high[2];
6372 int count;
6373
6374 if (CONST_INT_P (operands[2]))
6375 {
6376 split_double_mode (mode, operands, 2, low, high);
6377 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
6378
6379 if (count == GET_MODE_BITSIZE (mode) - 1)
6380 {
6381 emit_move_insn (high[0], high[1]);
6382 emit_insn (gen_ashr3 (high[0], high[0],
6383 GEN_INT (half_width - 1)));
6384 emit_move_insn (low[0], high[0]);
6385
6386 }
6387 else if (count >= half_width)
6388 {
6389 emit_move_insn (low[0], high[1]);
6390 emit_move_insn (high[0], low[0]);
6391 emit_insn (gen_ashr3 (high[0], high[0],
6392 GEN_INT (half_width - 1)));
6393
6394 if (count > half_width)
6395 emit_insn (gen_ashr3 (low[0], low[0],
6396 GEN_INT (count - half_width)));
6397 }
6398 else
6399 {
6400 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6401
6402 if (!rtx_equal_p (operands[0], operands[1]))
6403 emit_move_insn (operands[0], operands[1]);
6404
6405 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
6406 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
6407 }
6408 }
6409 else
6410 {
6411 machine_mode half_mode;
6412
6413 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6414
6415 if (!rtx_equal_p (operands[0], operands[1]))
6416 emit_move_insn (operands[0], operands[1]);
6417
6418 split_double_mode (mode, operands, 1, low, high);
6419 half_mode = mode == DImode ? SImode : DImode;
6420
6421 emit_insn (gen_shrd (low[0], high[0], operands[2]));
6422 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
6423
6424 if (TARGET_CMOVE && scratch)
6425 {
6426 emit_move_insn (scratch, high[0]);
6427 emit_insn (gen_ashr3 (scratch, scratch,
6428 GEN_INT (half_width - 1)));
6429 emit_insn (gen_x86_shift_adj_1
6430 (half_mode, low[0], high[0], operands[2], scratch));
6431 }
6432 else
6433 emit_insn (gen_x86_shift_adj_3
6434 (half_mode, low[0], high[0], operands[2]));
6435 }
6436 }
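
/* The matching C model for the constant-count arithmetic right shift,
   again written for 32-bit halves (illustrative names; it relies on >>
   of a negative int being arithmetic, as it is on x86):

     void dw_sar (unsigned int *lo, int *hi, unsigned int n)
     {
       if (n == 63)
         *lo = *hi = *hi >> 31;
       else if (n >= 32)
         {
           *lo = *hi >> (n - 32);
           *hi >>= 31;
         }
       else if (n > 0)
         {
           *lo = (*lo >> n) | ((unsigned int) *hi << (32 - n));
           *hi >>= n;
         }
     }

   The last branch is the shrd/sar pair; the variable-count path uses
   the same pair followed by the x86_shift_adj_1 or x86_shift_adj_3
   fixup for counts of 32 or more.  */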
6437
6438 void
6439 ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
6440 {
6441 rtx (*gen_lshr3)(rtx, rtx, rtx)
6442 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
6443 rtx (*gen_shrd)(rtx, rtx, rtx);
6444 int half_width = GET_MODE_BITSIZE (mode) >> 1;
6445
6446 rtx low[2], high[2];
6447 int count;
6448
6449 if (CONST_INT_P (operands[2]))
6450 {
6451 split_double_mode (mode, operands, 2, low, high);
6452 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
6453
6454 if (count >= half_width)
6455 {
6456 emit_move_insn (low[0], high[1]);
6457 ix86_expand_clear (high[0]);
6458
6459 if (count > half_width)
6460 emit_insn (gen_lshr3 (low[0], low[0],
6461 GEN_INT (count - half_width)));
6462 }
6463 else
6464 {
6465 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6466
6467 if (!rtx_equal_p (operands[0], operands[1]))
6468 emit_move_insn (operands[0], operands[1]);
6469
6470 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
6471 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
6472 }
6473 }
6474 else
6475 {
6476 machine_mode half_mode;
6477
6478 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6479
6480 if (!rtx_equal_p (operands[0], operands[1]))
6481 emit_move_insn (operands[0], operands[1]);
6482
6483 split_double_mode (mode, operands, 1, low, high);
6484 half_mode = mode == DImode ? SImode : DImode;
6485
6486 emit_insn (gen_shrd (low[0], high[0], operands[2]));
6487 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
6488
6489 if (TARGET_CMOVE && scratch)
6490 {
6491 ix86_expand_clear (scratch);
6492 emit_insn (gen_x86_shift_adj_1
6493 (half_mode, low[0], high[0], operands[2], scratch));
6494 }
6495 else
6496 emit_insn (gen_x86_shift_adj_2
6497 (half_mode, low[0], high[0], operands[2]));
6498 }
6499 }
6500
6501 /* Expand move of V1TI mode register X to a new TI mode register. */
6502 static rtx
6503 ix86_expand_v1ti_to_ti (rtx x)
6504 {
6505 rtx result = gen_reg_rtx (TImode);
6506 if (TARGET_SSE2)
6507 {
6508 rtx temp = force_reg (V2DImode, gen_lowpart (V2DImode, x));
6509 rtx lo = gen_lowpart (DImode, result);
6510 emit_insn (gen_vec_extractv2didi (lo, temp, const0_rtx));
6511 rtx hi = gen_highpart (DImode, result);
6512 emit_insn (gen_vec_extractv2didi (hi, temp, const1_rtx));
6513 }
6514 else
6515 emit_move_insn (result, gen_lowpart (TImode, x));
6516 return result;
6517 }
6518
6519 /* Expand move of TI mode register X to a new V1TI mode register. */
6520 static rtx
6521 ix86_expand_ti_to_v1ti (rtx x)
6522 {
6523 if (TARGET_SSE2)
6524 {
6525 rtx lo = gen_lowpart (DImode, x);
6526 rtx hi = gen_highpart (DImode, x);
6527 rtx tmp = gen_reg_rtx (V2DImode);
6528 emit_insn (gen_vec_concatv2di (tmp, lo, hi));
6529 return force_reg (V1TImode, gen_lowpart (V1TImode, tmp));
6530 }
6531
6532 return force_reg (V1TImode, gen_lowpart (V1TImode, x));
6533 }
6534
6535 /* Expand V1TI mode shift (of rtx_code CODE) by constant. */
6536 void
6537 ix86_expand_v1ti_shift (enum rtx_code code, rtx operands[])
6538 {
6539 rtx op1 = force_reg (V1TImode, operands[1]);
6540
6541 if (!CONST_INT_P (operands[2]))
6542 {
6543 rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
6544 rtx tmp2 = gen_reg_rtx (TImode);
6545 rtx (*shift) (rtx, rtx, rtx)
6546 = (code == ASHIFT) ? gen_ashlti3 : gen_lshrti3;
6547 emit_insn (shift (tmp2, tmp1, operands[2]));
6548 rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
6549 emit_move_insn (operands[0], tmp3);
6550 return;
6551 }
6552
6553 HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
6554
6555 if (bits == 0)
6556 {
6557 emit_move_insn (operands[0], op1);
6558 return;
6559 }
6560
6561 if ((bits & 7) == 0)
6562 {
6563 rtx tmp = gen_reg_rtx (V1TImode);
6564 if (code == ASHIFT)
6565 emit_insn (gen_sse2_ashlv1ti3 (tmp, op1, GEN_INT (bits)));
6566 else
6567 emit_insn (gen_sse2_lshrv1ti3 (tmp, op1, GEN_INT (bits)));
6568 emit_move_insn (operands[0], tmp);
6569 return;
6570 }
6571
6572 rtx tmp1 = gen_reg_rtx (V1TImode);
6573 if (code == ASHIFT)
6574 emit_insn (gen_sse2_ashlv1ti3 (tmp1, op1, GEN_INT (64)));
6575 else
6576 emit_insn (gen_sse2_lshrv1ti3 (tmp1, op1, GEN_INT (64)));
6577
6578 /* tmp2 is operands[1] shifted by 64, in V2DImode. */
6579 rtx tmp2 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
6580
6581 /* tmp3 will be the V2DImode result. */
6582 rtx tmp3 = gen_reg_rtx (V2DImode);
6583
6584 if (bits > 64)
6585 {
6586 if (code == ASHIFT)
6587 emit_insn (gen_ashlv2di3 (tmp3, tmp2, GEN_INT (bits - 64)));
6588 else
6589 emit_insn (gen_lshrv2di3 (tmp3, tmp2, GEN_INT (bits - 64)));
6590 }
6591 else
6592 {
6593 /* tmp4 is operands[1], in V2DImode. */
6594 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
6595
6596 rtx tmp5 = gen_reg_rtx (V2DImode);
6597 if (code == ASHIFT)
6598 emit_insn (gen_ashlv2di3 (tmp5, tmp4, GEN_INT (bits)));
6599 else
6600 emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
6601
6602 rtx tmp6 = gen_reg_rtx (V2DImode);
6603 if (code == ASHIFT)
6604 emit_insn (gen_lshrv2di3 (tmp6, tmp2, GEN_INT (64 - bits)));
6605 else
6606 emit_insn (gen_ashlv2di3 (tmp6, tmp2, GEN_INT (64 - bits)));
6607
6608 emit_insn (gen_iorv2di3 (tmp3, tmp5, tmp6));
6609 }
6610
6611 /* Convert the result back to V1TImode and store in operands[0]. */
6612 rtx tmp7 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
6613 emit_move_insn (operands[0], tmp7);
6614 }
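
/* The tail of this function (BITS not a multiple of 8) builds on the
   identity for a 128-bit value held as two 64-bit lanes, for
   0 < n < 64 (a sketch; lo/hi are illustrative names):

     r_lo = lo << n;
     r_hi = (hi << n) | (lo >> (64 - n));

   The cross-lane term lo >> (64 - n) is obtained without a
   lane-crossing shift: the whole vector is first byte-shifted left by
   64 bits, which places lo in the high lane and zero in the low lane,
   and a per-lane logical right shift by 64 - n then yields exactly
   that term.  For BITS > 64 only the byte-shifted value needs a
   per-lane shift by BITS - 64.  Logical right shifts use the mirror
   image of the same construction.  */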
6615
6616 /* Expand V1TI mode rotate (of rtx_code CODE) by constant. */
6617 void
6618 ix86_expand_v1ti_rotate (enum rtx_code code, rtx operands[])
6619 {
6620 rtx op1 = force_reg (V1TImode, operands[1]);
6621
6622 if (!CONST_INT_P (operands[2]))
6623 {
6624 rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
6625 rtx tmp2 = gen_reg_rtx (TImode);
6626 rtx (*rotate) (rtx, rtx, rtx)
6627 = (code == ROTATE) ? gen_rotlti3 : gen_rotrti3;
6628 emit_insn (rotate (tmp2, tmp1, operands[2]));
6629 rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
6630 emit_move_insn (operands[0], tmp3);
6631 return;
6632 }
6633
6634 HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
6635
6636 if (bits == 0)
6637 {
6638 emit_move_insn (operands[0], op1);
6639 return;
6640 }
6641
6642 if (code == ROTATERT)
6643 bits = 128 - bits;
6644
6645 if ((bits & 31) == 0)
6646 {
6647 rtx tmp2 = gen_reg_rtx (V4SImode);
6648 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6649 if (bits == 32)
6650 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x93)));
6651 else if (bits == 64)
6652 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x4e)));
6653 else
6654 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x39)));
6655 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp2));
6656 return;
6657 }
6658
6659 if ((bits & 7) == 0)
6660 {
6661 rtx tmp1 = gen_reg_rtx (V1TImode);
6662 rtx tmp2 = gen_reg_rtx (V1TImode);
6663 rtx tmp3 = gen_reg_rtx (V1TImode);
6664
6665 emit_insn (gen_sse2_ashlv1ti3 (tmp1, op1, GEN_INT (bits)));
6666 emit_insn (gen_sse2_lshrv1ti3 (tmp2, op1, GEN_INT (128 - bits)));
6667 emit_insn (gen_iorv1ti3 (tmp3, tmp1, tmp2));
6668 emit_move_insn (operands[0], tmp3);
6669 return;
6670 }
6671
6672 rtx op1_v4si = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6673
6674 rtx lobits;
6675 rtx hibits;
6676
6677 switch (bits >> 5)
6678 {
6679 case 0:
6680 lobits = op1_v4si;
6681 hibits = gen_reg_rtx (V4SImode);
6682 emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x93)));
6683 break;
6684
6685 case 1:
6686 lobits = gen_reg_rtx (V4SImode);
6687 hibits = gen_reg_rtx (V4SImode);
6688 emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x93)));
6689 emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x4e)));
6690 break;
6691
6692 case 2:
6693 lobits = gen_reg_rtx (V4SImode);
6694 hibits = gen_reg_rtx (V4SImode);
6695 emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x4e)));
6696 emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x39)));
6697 break;
6698
6699 default:
6700 lobits = gen_reg_rtx (V4SImode);
6701 emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x39)));
6702 hibits = op1_v4si;
6703 break;
6704 }
6705
6706 rtx tmp1 = gen_reg_rtx (V4SImode);
6707 rtx tmp2 = gen_reg_rtx (V4SImode);
6708 rtx tmp3 = gen_reg_rtx (V4SImode);
6709
6710 emit_insn (gen_ashlv4si3 (tmp1, lobits, GEN_INT (bits & 31)));
6711 emit_insn (gen_lshrv4si3 (tmp2, hibits, GEN_INT (32 - (bits & 31))));
6712 emit_insn (gen_iorv4si3 (tmp3, tmp1, tmp2));
6713
6714 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp3));
6715 }
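
/* The tail of this function is the usual word-granular rotate
   decomposition: with the (left-)rotate count written as 32*q + r,
   0 < r < 32, LOBITS is the input rotated by q whole 32-bit words and
   HIBITS by q + 1 words (both via pshufd), and each 32-bit word of the
   result is then

     result = (lobits << r) | (hibits >> (32 - r));

   so every word picks up the bits rotated in from its lower
   neighbour.  */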
6716
6717 /* Expand V1TI mode ashiftrt by constant. */
6718 void
6719 ix86_expand_v1ti_ashiftrt (rtx operands[])
6720 {
6721 rtx op1 = force_reg (V1TImode, operands[1]);
6722
6723 if (!CONST_INT_P (operands[2]))
6724 {
6725 rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
6726 rtx tmp2 = gen_reg_rtx (TImode);
6727 emit_insn (gen_ashrti3 (tmp2, tmp1, operands[2]));
6728 rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
6729 emit_move_insn (operands[0], tmp3);
6730 return;
6731 }
6732
6733 HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
6734
6735 if (bits == 0)
6736 {
6737 emit_move_insn (operands[0], op1);
6738 return;
6739 }
6740
6741 if (bits == 127)
6742 {
6743 /* Two operations. */
6744 rtx tmp1 = force_reg(V4SImode, gen_lowpart (V4SImode, op1));
6745 rtx tmp2 = gen_reg_rtx (V4SImode);
6746 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6747
6748 rtx tmp3 = gen_reg_rtx (V4SImode);
6749 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6750
6751 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp3));
6752 return;
6753 }
6754
6755 if (bits == 64)
6756 {
6757 /* Three operations. */
6758 rtx tmp1 = force_reg(V4SImode, gen_lowpart (V4SImode, op1));
6759 rtx tmp2 = gen_reg_rtx (V4SImode);
6760 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6761
6762 rtx tmp3 = gen_reg_rtx (V4SImode);
6763 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6764
6765 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
6766 rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
6767 rtx tmp6 = gen_reg_rtx (V2DImode);
6768 emit_insn (gen_vec_interleave_highv2di (tmp6, tmp4, tmp5));
6769
6770 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
6771 return;
6772 }
6773
6774 if (bits == 96)
6775 {
6776 /* Three operations. */
6777 rtx tmp1 = force_reg(V4SImode, gen_lowpart (V4SImode, op1));
6778 rtx tmp2 = gen_reg_rtx (V4SImode);
6779 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (31)));
6780
6781 rtx tmp3 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
6782 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp2));
6783 rtx tmp5 = gen_reg_rtx (V2DImode);
6784 emit_insn (gen_vec_interleave_highv2di (tmp5, tmp3, tmp4));
6785
6786 rtx tmp6 = force_reg(V4SImode, gen_lowpart (V4SImode, tmp5));
6787 rtx tmp7 = gen_reg_rtx (V4SImode);
6788 emit_insn (gen_sse2_pshufd (tmp7, tmp6, GEN_INT (0xfd)));
6789
6790 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp7));
6791 return;
6792 }
6793
6794 if (bits >= 111)
6795 {
6796 /* Three operations. */
6797 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6798 rtx tmp2 = gen_reg_rtx (V4SImode);
6799 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits - 96)));
6800
6801 rtx tmp3 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
6802 rtx tmp4 = gen_reg_rtx (V8HImode);
6803 emit_insn (gen_sse2_pshufhw (tmp4, tmp3, GEN_INT (0xfe)));
6804
6805 rtx tmp5 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp4));
6806 rtx tmp6 = gen_reg_rtx (V4SImode);
6807 emit_insn (gen_sse2_pshufd (tmp6, tmp5, GEN_INT (0xfe)));
6808
6809 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
6810 return;
6811 }
6812
6813 if (TARGET_AVX2 || TARGET_SSE4_1)
6814 {
6815 /* Three operations. */
6816 if (bits == 32)
6817 {
6818 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6819 rtx tmp2 = gen_reg_rtx (V4SImode);
6820 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (31)));
6821
6822 rtx tmp3 = gen_reg_rtx (V1TImode);
6823 emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (32)));
6824
6825 if (TARGET_AVX2)
6826 {
6827 rtx tmp4 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp3));
6828 rtx tmp5 = gen_reg_rtx (V4SImode);
6829 emit_insn (gen_avx2_pblenddv4si (tmp5, tmp2, tmp4,
6830 GEN_INT (7)));
6831
6832 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp5));
6833 }
6834 else
6835 {
6836 rtx tmp4 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
6837 rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
6838 rtx tmp6 = gen_reg_rtx (V8HImode);
6839 emit_insn (gen_sse4_1_pblendw (tmp6, tmp4, tmp5,
6840 GEN_INT (0x3f)));
6841
6842 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
6843 }
6844 return;
6845 }
6846
6847 /* Three operations. */
6848 if (bits == 8 || bits == 16 || bits == 24)
6849 {
6850 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6851 rtx tmp2 = gen_reg_rtx (V4SImode);
6852 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
6853
6854 rtx tmp3 = gen_reg_rtx (V1TImode);
6855 emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (bits)));
6856
6857 if (TARGET_AVX2)
6858 {
6859 rtx tmp4 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp3));
6860 rtx tmp5 = gen_reg_rtx (V4SImode);
6861 emit_insn (gen_avx2_pblenddv4si (tmp5, tmp2, tmp4,
6862 GEN_INT (7)));
6863
6864 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp5));
6865 }
6866 else
6867 {
6868 rtx tmp4 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
6869 rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
6870 rtx tmp6 = gen_reg_rtx (V8HImode);
6871 emit_insn (gen_sse4_1_pblendw (tmp6, tmp4, tmp5,
6872 GEN_INT (0x3f)));
6873
6874 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
6875 }
6876 return;
6877 }
6878 }
6879
6880 if (bits > 96)
6881 {
6882 /* Four operations. */
6883 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6884 rtx tmp2 = gen_reg_rtx (V4SImode);
6885 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits - 96)));
6886
6887 rtx tmp3 = gen_reg_rtx (V4SImode);
6888 emit_insn (gen_ashrv4si3 (tmp3, tmp1, GEN_INT (31)));
6889
6890 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp2));
6891 rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
6892 rtx tmp6 = gen_reg_rtx (V2DImode);
6893 emit_insn (gen_vec_interleave_highv2di (tmp6, tmp4, tmp5));
6894
6895 rtx tmp7 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp6));
6896 rtx tmp8 = gen_reg_rtx (V4SImode);
6897 emit_insn (gen_sse2_pshufd (tmp8, tmp7, GEN_INT (0xfd)));
6898
6899 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp8));
6900 return;
6901 }
6902
6903 if (TARGET_SSE4_1 && (bits == 48 || bits == 80))
6904 {
6905 /* Four operations. */
6906 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6907 rtx tmp2 = gen_reg_rtx (V4SImode);
6908 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6909
6910 rtx tmp3 = gen_reg_rtx (V4SImode);
6911 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6912
6913 rtx tmp4 = gen_reg_rtx (V1TImode);
6914 emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (bits)));
6915
6916 rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
6917 rtx tmp6 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp4));
6918 rtx tmp7 = gen_reg_rtx (V8HImode);
6919 emit_insn (gen_sse4_1_pblendw (tmp7, tmp5, tmp6,
6920 GEN_INT (bits == 48 ? 0x1f : 0x07)));
6921
6922 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp7));
6923 return;
6924 }
6925
6926 if ((bits & 7) == 0)
6927 {
6928 /* Five operations. */
6929 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6930 rtx tmp2 = gen_reg_rtx (V4SImode);
6931 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6932
6933 rtx tmp3 = gen_reg_rtx (V4SImode);
6934 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6935
6936 rtx tmp4 = gen_reg_rtx (V1TImode);
6937 emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (bits)));
6938
6939 rtx tmp5 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
6940 rtx tmp6 = gen_reg_rtx (V1TImode);
6941 emit_insn (gen_sse2_ashlv1ti3 (tmp6, tmp5, GEN_INT (128 - bits)));
6942
6943 rtx tmp7 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
6944 rtx tmp8 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp6));
6945 rtx tmp9 = gen_reg_rtx (V2DImode);
6946 emit_insn (gen_iorv2di3 (tmp9, tmp7, tmp8));
6947
6948 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp9));
6949 return;
6950 }
6951
6952 if (TARGET_AVX2 && bits < 32)
6953 {
6954 /* Six operations. */
6955 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6956 rtx tmp2 = gen_reg_rtx (V4SImode);
6957 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
6958
6959 rtx tmp3 = gen_reg_rtx (V1TImode);
6960 emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (64)));
6961
6962 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
6963 rtx tmp5 = gen_reg_rtx (V2DImode);
6964 emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
6965
6966 rtx tmp6 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
6967 rtx tmp7 = gen_reg_rtx (V2DImode);
6968 emit_insn (gen_ashlv2di3 (tmp7, tmp6, GEN_INT (64 - bits)));
6969
6970 rtx tmp8 = gen_reg_rtx (V2DImode);
6971 emit_insn (gen_iorv2di3 (tmp8, tmp5, tmp7));
6972
6973 rtx tmp9 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp8));
6974 rtx tmp10 = gen_reg_rtx (V4SImode);
6975 emit_insn (gen_avx2_pblenddv4si (tmp10, tmp2, tmp9, GEN_INT (7)));
6976
6977 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp10));
6978 return;
6979 }
6980
6981 if (TARGET_SSE4_1 && bits < 15)
6982 {
6983 /* Six operations. */
6984 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6985 rtx tmp2 = gen_reg_rtx (V4SImode);
6986 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
6987
6988 rtx tmp3 = gen_reg_rtx (V1TImode);
6989 emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (64)));
6990
6991 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
6992 rtx tmp5 = gen_reg_rtx (V2DImode);
6993 emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
6994
6995 rtx tmp6 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
6996 rtx tmp7 = gen_reg_rtx (V2DImode);
6997 emit_insn (gen_ashlv2di3 (tmp7, tmp6, GEN_INT (64 - bits)));
6998
6999 rtx tmp8 = gen_reg_rtx (V2DImode);
7000 emit_insn (gen_iorv2di3 (tmp8, tmp5, tmp7));
7001
7002 rtx tmp9 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
7003 rtx tmp10 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp8));
7004 rtx tmp11 = gen_reg_rtx (V8HImode);
7005 emit_insn (gen_sse4_1_pblendw (tmp11, tmp9, tmp10, GEN_INT (0x3f)));
7006
7007 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp11));
7008 return;
7009 }
7010
7011 if (bits == 1)
7012 {
7013 /* Eight operations. */
7014 rtx tmp1 = gen_reg_rtx (V1TImode);
7015 emit_insn (gen_sse2_lshrv1ti3 (tmp1, op1, GEN_INT (64)));
7016
7017 rtx tmp2 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
7018 rtx tmp3 = gen_reg_rtx (V2DImode);
7019 emit_insn (gen_lshrv2di3 (tmp3, tmp2, GEN_INT (1)));
7020
7021 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
7022 rtx tmp5 = gen_reg_rtx (V2DImode);
7023 emit_insn (gen_ashlv2di3 (tmp5, tmp4, GEN_INT (63)));
7024
7025 rtx tmp6 = gen_reg_rtx (V2DImode);
7026 emit_insn (gen_iorv2di3 (tmp6, tmp3, tmp5));
7027
7028 rtx tmp7 = gen_reg_rtx (V2DImode);
7029 emit_insn (gen_lshrv2di3 (tmp7, tmp2, GEN_INT (63)));
7030
7031 rtx tmp8 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp7));
7032 rtx tmp9 = gen_reg_rtx (V4SImode);
7033 emit_insn (gen_sse2_pshufd (tmp9, tmp8, GEN_INT (0xbf)));
7034
7035 rtx tmp10 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp9));
7036 rtx tmp11 = gen_reg_rtx (V2DImode);
7037 emit_insn (gen_ashlv2di3 (tmp11, tmp10, GEN_INT (31)));
7038
7039 rtx tmp12 = gen_reg_rtx (V2DImode);
7040 emit_insn (gen_iorv2di3 (tmp12, tmp6, tmp11));
7041
7042 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp12));
7043 return;
7044 }
7045
7046 if (bits > 64)
7047 {
7048 /* Eight operations. */
7049 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
7050 rtx tmp2 = gen_reg_rtx (V4SImode);
7051 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
7052
7053 rtx tmp3 = gen_reg_rtx (V4SImode);
7054 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
7055
7056 rtx tmp4 = gen_reg_rtx (V1TImode);
7057 emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (64)));
7058
7059 rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
7060 rtx tmp6 = gen_reg_rtx (V2DImode);
7061 emit_insn (gen_lshrv2di3 (tmp6, tmp5, GEN_INT (bits - 64)));
7062
7063 rtx tmp7 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
7064 rtx tmp8 = gen_reg_rtx (V1TImode);
7065 emit_insn (gen_sse2_ashlv1ti3 (tmp8, tmp7, GEN_INT (64)));
7066
7067 rtx tmp9 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
7068 rtx tmp10 = gen_reg_rtx (V2DImode);
7069 emit_insn (gen_ashlv2di3 (tmp10, tmp9, GEN_INT (128 - bits)));
7070
7071 rtx tmp11 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp8));
7072 rtx tmp12 = gen_reg_rtx (V2DImode);
7073 emit_insn (gen_iorv2di3 (tmp12, tmp10, tmp11));
7074
7075 rtx tmp13 = gen_reg_rtx (V2DImode);
7076 emit_insn (gen_iorv2di3 (tmp13, tmp6, tmp12));
7077
7078 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp13));
7079 }
7080 else
7081 {
7082 /* Nine operations. */
7083 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
7084 rtx tmp2 = gen_reg_rtx (V4SImode);
7085 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
7086
7087 rtx tmp3 = gen_reg_rtx (V4SImode);
7088 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
7089
7090 rtx tmp4 = gen_reg_rtx (V1TImode);
7091 emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (64)));
7092
7093 rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
7094 rtx tmp6 = gen_reg_rtx (V2DImode);
7095 emit_insn (gen_lshrv2di3 (tmp6, tmp5, GEN_INT (bits)));
7096
7097 rtx tmp7 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
7098 rtx tmp8 = gen_reg_rtx (V2DImode);
7099 emit_insn (gen_ashlv2di3 (tmp8, tmp7, GEN_INT (64 - bits)));
7100
7101 rtx tmp9 = gen_reg_rtx (V2DImode);
7102 emit_insn (gen_iorv2di3 (tmp9, tmp6, tmp8));
7103
7104 rtx tmp10 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
7105 rtx tmp11 = gen_reg_rtx (V1TImode);
7106 emit_insn (gen_sse2_ashlv1ti3 (tmp11, tmp10, GEN_INT (64)));
7107
7108 rtx tmp12 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp11));
7109 rtx tmp13 = gen_reg_rtx (V2DImode);
7110 emit_insn (gen_ashlv2di3 (tmp13, tmp12, GEN_INT (64 - bits)));
7111
7112 rtx tmp14 = gen_reg_rtx (V2DImode);
7113 emit_insn (gen_iorv2di3 (tmp14, tmp9, tmp13));
7114
7115 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp14));
7116 }
7117 }
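
/* All of the special cases above compute the same thing: for a 128-bit
   X and 0 < N < 128,

     X >>a N  ==  (X >>l N) | (sign ? ~0 << (128 - N) : 0)

   where >>a and >>l are the arithmetic and logical shifts and "sign"
   is the top bit of X.  The sign mask is produced by broadcasting the
   top 32-bit word with pshufd and doing psrad by 31; the branches
   differ only in how cheaply that mask and the logical shift can be
   combined for the particular N and ISA level (pblendd/pblendw,
   punpckhqdq, or shift/or sequences).  */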
7118
7119 /* Replace all occurrences of REG FROM with REG TO in X, including
7120 occurrences with different modes. */
7121
7122 rtx
7123 ix86_replace_reg_with_reg (rtx x, rtx from, rtx to)
7124 {
7125 gcc_checking_assert (REG_P (from)
7126 && REG_P (to)
7127 && GET_MODE (from) == GET_MODE (to));
7128 if (!reg_overlap_mentioned_p (from, x))
7129 return x;
7130 rtx ret = copy_rtx (x);
7131 subrtx_ptr_iterator::array_type array;
7132 FOR_EACH_SUBRTX_PTR (iter, array, &ret, NONCONST)
7133 {
7134 rtx *loc = *iter;
7135 x = *loc;
7136 if (REG_P (x) && REGNO (x) == REGNO (from))
7137 {
7138 if (x == from)
7139 *loc = to;
7140 else
7141 {
7142 gcc_checking_assert (REG_NREGS (x) == 1);
7143 *loc = gen_rtx_REG (GET_MODE (x), REGNO (to));
7144 }
7145 }
7146 }
7147 return ret;
7148 }
7149
7150 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
7151 DImode for constant loop counts. */
7152
7153 static machine_mode
7154 counter_mode (rtx count_exp)
7155 {
7156 if (GET_MODE (count_exp) != VOIDmode)
7157 return GET_MODE (count_exp);
7158 if (!CONST_INT_P (count_exp))
7159 return Pmode;
7160 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
7161 return DImode;
7162 return SImode;
7163 }
7164
7165 /* When ISSETMEM is FALSE, output a simple loop moving the memory pointed to
7166 by SRCPTR to DESTPTR in chunks of MODE, unrolled UNROLL times; the overall
7167 size is COUNT, specified in bytes. When ISSETMEM is TRUE, output the
7168 equivalent loop setting memory to VALUE (which is expected to be in MODE).
7169 
7170 The size is rounded down to a whole number of chunks moved at once.
7171 SRCMEM and DESTMEM provide the MEM rtxes to feed proper aliasing info. */
7172
7173
7174 static void
7175 expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
7176 rtx destptr, rtx srcptr, rtx value,
7177 rtx count, machine_mode mode, int unroll,
7178 int expected_size, bool issetmem)
7179 {
7180 rtx_code_label *out_label, *top_label;
7181 rtx iter, tmp;
7182 machine_mode iter_mode = counter_mode (count);
7183 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
7184 rtx piece_size = GEN_INT (piece_size_n);
7185 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
7186 rtx size;
7187 int i;
7188
7189 top_label = gen_label_rtx ();
7190 out_label = gen_label_rtx ();
7191 iter = gen_reg_rtx (iter_mode);
7192
7193 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
7194 NULL, 1, OPTAB_DIRECT);
7195 /* Those two should combine. */
7196 if (piece_size == const1_rtx)
7197 {
7198 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
7199 true, out_label);
7200 predict_jump (REG_BR_PROB_BASE * 10 / 100);
7201 }
7202 emit_move_insn (iter, const0_rtx);
7203
7204 emit_label (top_label);
7205
7206 tmp = convert_modes (Pmode, iter_mode, iter, true);
7207
7208 /* This assert could be relaxed - in that case we'd need to compute
7209 the smallest power of two containing PIECE_SIZE_N and pass it to
7210 offset_address. */
7211 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
7212 destmem = offset_address (destmem, tmp, piece_size_n);
7213 destmem = adjust_address (destmem, mode, 0);
7214
7215 if (!issetmem)
7216 {
7217 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
7218 srcmem = adjust_address (srcmem, mode, 0);
7219
7220 /* When unrolling for chips that reorder memory reads and writes,
7221 we can save registers by using a single temporary.
7222 Also, using 4 temporaries is overkill in 32-bit mode. */
7223 if (!TARGET_64BIT && 0)
7224 {
7225 for (i = 0; i < unroll; i++)
7226 {
7227 if (i)
7228 {
7229 destmem = adjust_address (copy_rtx (destmem), mode,
7230 GET_MODE_SIZE (mode));
7231 srcmem = adjust_address (copy_rtx (srcmem), mode,
7232 GET_MODE_SIZE (mode));
7233 }
7234 emit_move_insn (destmem, srcmem);
7235 }
7236 }
7237 else
7238 {
7239 rtx tmpreg[4];
7240 gcc_assert (unroll <= 4);
7241 for (i = 0; i < unroll; i++)
7242 {
7243 tmpreg[i] = gen_reg_rtx (mode);
7244 if (i)
7245 srcmem = adjust_address (copy_rtx (srcmem), mode,
7246 GET_MODE_SIZE (mode));
7247 emit_move_insn (tmpreg[i], srcmem);
7248 }
7249 for (i = 0; i < unroll; i++)
7250 {
7251 if (i)
7252 destmem = adjust_address (copy_rtx (destmem), mode,
7253 GET_MODE_SIZE (mode));
7254 emit_move_insn (destmem, tmpreg[i]);
7255 }
7256 }
7257 }
7258 else
7259 for (i = 0; i < unroll; i++)
7260 {
7261 if (i)
7262 destmem = adjust_address (copy_rtx (destmem), mode,
7263 GET_MODE_SIZE (mode));
7264 emit_move_insn (destmem, value);
7265 }
7266
7267 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
7268 true, OPTAB_LIB_WIDEN);
7269 if (tmp != iter)
7270 emit_move_insn (iter, tmp);
7271
7272 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
7273 true, top_label);
7274 if (expected_size != -1)
7275 {
7276 expected_size /= GET_MODE_SIZE (mode) * unroll;
7277 if (expected_size == 0)
7278 predict_jump (0);
7279 else if (expected_size > REG_BR_PROB_BASE)
7280 predict_jump (REG_BR_PROB_BASE - 1);
7281 else
7282 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2)
7283 / expected_size);
7284 }
7285 else
7286 predict_jump (REG_BR_PROB_BASE * 80 / 100);
7287 iter = ix86_zero_extend_to_Pmode (iter);
7288 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
7289 true, OPTAB_LIB_WIDEN);
7290 if (tmp != destptr)
7291 emit_move_insn (destptr, tmp);
7292 if (!issetmem)
7293 {
7294 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
7295 true, OPTAB_LIB_WIDEN);
7296 if (tmp != srcptr)
7297 emit_move_insn (srcptr, tmp);
7298 }
7299 emit_label (out_label);
7300 }
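
/* In C terms the emitted code is roughly the following sketch, where
   PIECE stands for GET_MODE_SIZE (MODE) * UNROLL:

     size = count & ~(PIECE - 1);
     iter = 0;
     do
       {
         ... copy (or store VALUE into) PIECE bytes at dest + iter ...;
         iter += PIECE;
       }
     while (iter < size);
     dest += iter;

   plus "src += iter" in the copy case.  For byte-sized pieces an
   initial size == 0 test skips the loop entirely; any tail smaller
   than PIECE is left to the caller's epilogue code.  */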
7301
7302 /* Divide COUNTREG by SCALE. */
7303 static rtx
7304 scale_counter (rtx countreg, int scale)
7305 {
7306 rtx sc;
7307
7308 if (scale == 1)
7309 return countreg;
7310 if (CONST_INT_P (countreg))
7311 return GEN_INT (INTVAL (countreg) / scale);
7312 gcc_assert (REG_P (countreg));
7313
7314 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
7315 GEN_INT (exact_log2 (scale)),
7316 NULL, 1, OPTAB_DIRECT);
7317 return sc;
7318 }
7319
7320 /* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
7321 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
7322 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
7323 For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
7324 ORIG_VALUE is the original value passed to memset to fill the memory with.
7325 Other arguments have the same meaning as for the previous function. */
7326
7327 static void
7328 expand_set_or_cpymem_via_rep (rtx destmem, rtx srcmem,
7329 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
7330 rtx count,
7331 machine_mode mode, bool issetmem)
7332 {
7333 rtx destexp;
7334 rtx srcexp;
7335 rtx countreg;
7336 HOST_WIDE_INT rounded_count;
7337
7338 /* If possible, it is shorter to use rep movs.
7339 TODO: Maybe it is better to move this logic to decide_alg. */
7340 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
7341 && !TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
7342 && (!issetmem || orig_value == const0_rtx))
7343 mode = SImode;
7344
7345 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
7346 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
7347
7348 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
7349 GET_MODE_SIZE (mode)));
7350 if (mode != QImode)
7351 {
7352 destexp = gen_rtx_ASHIFT (Pmode, countreg,
7353 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
7354 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
7355 }
7356 else
7357 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
7358 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
7359 {
7360 rounded_count
7361 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
7362 destmem = shallow_copy_rtx (destmem);
7363 set_mem_size (destmem, rounded_count);
7364 }
7365 else if (MEM_SIZE_KNOWN_P (destmem))
7366 clear_mem_size (destmem);
7367
7368 if (issetmem)
7369 {
7370 value = force_reg (mode, gen_lowpart (mode, value));
7371 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
7372 }
7373 else
7374 {
7375 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
7376 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
7377 if (mode != QImode)
7378 {
7379 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
7380 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
7381 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
7382 }
7383 else
7384 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
7385 if (CONST_INT_P (count))
7386 {
7387 rounded_count
7388 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
7389 srcmem = shallow_copy_rtx (srcmem);
7390 set_mem_size (srcmem, rounded_count);
7391 }
7392 else
7393 {
7394 if (MEM_SIZE_KNOWN_P (srcmem))
7395 clear_mem_size (srcmem);
7396 }
7397 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
7398 destexp, srcexp));
7399 }
7400 }
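
/* A worked example: for a memset of 32 zero bytes the mode is widened
   to SImode, so

     countreg = 32 / 4 = 8
     destexp  = destptr + (countreg << 2) = destptr + 32

   and a single SImode "rep stos" is emitted.  DESTEXP (and SRCEXP for
   copies) express the pointer values left behind by the instruction,
   which is what the rep_stos/rep_mov patterns expect.  */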
7401
7402 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
7403 DESTMEM.
7404 SRCMEM is passed by pointer so that it can be updated on return.
7405 The return value is the updated DESTMEM. */
7406 static rtx
7407 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
7408 HOST_WIDE_INT size_to_move)
7409 {
7410 rtx dst = destmem, src = *srcmem, tempreg;
7411 enum insn_code code;
7412 machine_mode move_mode;
7413 int piece_size, i;
7414
7415 /* Find the widest mode in which we could perform moves.
7416 Start with the biggest power of 2 not exceeding SIZE_TO_MOVE and halve
7417 it until a move of that size is supported. */
7418 piece_size = 1 << floor_log2 (size_to_move);
7419 while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
7420 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
7421 {
7422 gcc_assert (piece_size > 1);
7423 piece_size >>= 1;
7424 }
7425
7426 /* Find the corresponding vector mode with the same size as MOVE_MODE.
7427 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
7428 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
7429 {
7430 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
7431 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
7432 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
7433 {
7434 move_mode = word_mode;
7435 piece_size = GET_MODE_SIZE (move_mode);
7436 code = optab_handler (mov_optab, move_mode);
7437 }
7438 }
7439 gcc_assert (code != CODE_FOR_nothing);
7440
7441 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
7442 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
7443
7444 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
7445 gcc_assert (size_to_move % piece_size == 0);
7446
7447 for (i = 0; i < size_to_move; i += piece_size)
7448 {
7449 /* We move from memory to memory, so we'll need to do it via
7450 a temporary register. */
7451 tempreg = gen_reg_rtx (move_mode);
7452 emit_insn (GEN_FCN (code) (tempreg, src));
7453 emit_insn (GEN_FCN (code) (dst, tempreg));
7454
7455 emit_move_insn (destptr,
7456 plus_constant (Pmode, copy_rtx (destptr), piece_size));
7457 emit_move_insn (srcptr,
7458 plus_constant (Pmode, copy_rtx (srcptr), piece_size));
7459
7460 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
7461 piece_size);
7462 src = adjust_automodify_address_nv (src, move_mode, srcptr,
7463 piece_size);
7464 }
7465
7466 /* Update DST and SRC rtx. */
7467 *srcmem = src;
7468 return dst;
7469 }
7470
7471 /* Helper function for the string operations below. Test whether VARIABLE
7472 is aligned to VALUE bytes. If so, jump to the label. */
7473
7474 static rtx_code_label *
7475 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
7476 {
7477 rtx_code_label *label = gen_label_rtx ();
7478 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
7479 if (GET_MODE (variable) == DImode)
7480 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
7481 else
7482 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
7483 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
7484 1, label);
7485 if (epilogue)
7486 predict_jump (REG_BR_PROB_BASE * 50 / 100);
7487 else
7488 predict_jump (REG_BR_PROB_BASE * 90 / 100);
7489 return label;
7490 }
7491
7492
7493 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
7494
7495 static void
7496 expand_cpymem_epilogue (rtx destmem, rtx srcmem,
7497 rtx destptr, rtx srcptr, rtx count, int max_size)
7498 {
7499 rtx src, dest;
7500 if (CONST_INT_P (count))
7501 {
7502 HOST_WIDE_INT countval = INTVAL (count);
7503 HOST_WIDE_INT epilogue_size = countval % max_size;
7504 int i;
7505
7506 /* For now MAX_SIZE should be a power of 2. This assert could be
7507 relaxed, but it'll require a bit more complicated epilogue
7508 expanding. */
7509 gcc_assert ((max_size & (max_size - 1)) == 0);
7510 for (i = max_size; i >= 1; i >>= 1)
7511 {
7512 if (epilogue_size & i)
7513 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
7514 }
7515 return;
7516 }
7517 if (max_size > 8)
7518 {
7519 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
7520 count, 1, OPTAB_DIRECT);
7521 expand_set_or_cpymem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
7522 count, QImode, 1, 4, false);
7523 return;
7524 }
7525
7526 /* When single-instruction stringops are available, we can cheaply advance
7527 the dest and src pointers. Otherwise we save code size by maintaining
7528 an offset (zero is readily available from the preceding rep operation)
7529 and using x86 addressing modes. */
7530 if (TARGET_SINGLE_STRINGOP)
7531 {
7532 if (max_size > 4)
7533 {
7534 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
7535 src = change_address (srcmem, SImode, srcptr);
7536 dest = change_address (destmem, SImode, destptr);
7537 emit_insn (gen_strmov (destptr, dest, srcptr, src));
7538 emit_label (label);
7539 LABEL_NUSES (label) = 1;
7540 }
7541 if (max_size > 2)
7542 {
7543 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
7544 src = change_address (srcmem, HImode, srcptr);
7545 dest = change_address (destmem, HImode, destptr);
7546 emit_insn (gen_strmov (destptr, dest, srcptr, src));
7547 emit_label (label);
7548 LABEL_NUSES (label) = 1;
7549 }
7550 if (max_size > 1)
7551 {
7552 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
7553 src = change_address (srcmem, QImode, srcptr);
7554 dest = change_address (destmem, QImode, destptr);
7555 emit_insn (gen_strmov (destptr, dest, srcptr, src));
7556 emit_label (label);
7557 LABEL_NUSES (label) = 1;
7558 }
7559 }
7560 else
7561 {
7562 rtx offset = force_reg (Pmode, const0_rtx);
7563 rtx tmp;
7564
7565 if (max_size > 4)
7566 {
7567 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
7568 src = change_address (srcmem, SImode, srcptr);
7569 dest = change_address (destmem, SImode, destptr);
7570 emit_move_insn (dest, src);
7571 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
7572 true, OPTAB_LIB_WIDEN);
7573 if (tmp != offset)
7574 emit_move_insn (offset, tmp);
7575 emit_label (label);
7576 LABEL_NUSES (label) = 1;
7577 }
7578 if (max_size > 2)
7579 {
7580 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
7581 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
7582 src = change_address (srcmem, HImode, tmp);
7583 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
7584 dest = change_address (destmem, HImode, tmp);
7585 emit_move_insn (dest, src);
7586 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
7587 true, OPTAB_LIB_WIDEN);
7588 if (tmp != offset)
7589 emit_move_insn (offset, tmp);
7590 emit_label (label);
7591 LABEL_NUSES (label) = 1;
7592 }
7593 if (max_size > 1)
7594 {
7595 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
7596 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
7597 src = change_address (srcmem, QImode, tmp);
7598 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
7599 dest = change_address (destmem, QImode, tmp);
7600 emit_move_insn (dest, src);
7601 emit_label (label);
7602 LABEL_NUSES (label) = 1;
7603 }
7604 }
7605 }
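
/* A worked example of the constant-count case: with COUNT == 23 and
   MAX_SIZE == 16 the epilogue size is 23 % 16 == 7, so the bit loop
   emits one 4-byte, one 2-byte and one 1-byte move.  With a variable
   COUNT and MAX_SIZE of at most 8, the same powers of two are instead
   tested at run time via ix86_expand_aligntest, skipping each move
   whose bit is clear; a larger MAX_SIZE falls back to a byte copy
   loop.  */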
7606
7607 /* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
7608 with the value PROMOTED_VAL.
7609 DESTPTR is advanced past the bytes stored.
7610 The return value is the updated DESTMEM. */
7611 static rtx
7612 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
7613 HOST_WIDE_INT size_to_move)
7614 {
7615 rtx dst = destmem;
7616 enum insn_code code;
7617 machine_mode move_mode;
7618 int piece_size, i;
7619
7620 /* Choose the mode in which to perform the stores. Normally this is the
7621 mode of PROMOTED_VAL; for a smaller SIZE_TO_MOVE the value is truncated
7622 to the widest integer mode that fits. */
7623 move_mode = GET_MODE (promoted_val);
7624 if (move_mode == VOIDmode)
7625 move_mode = QImode;
7626 if (size_to_move < GET_MODE_SIZE (move_mode))
7627 {
7628 unsigned int move_bits = size_to_move * BITS_PER_UNIT;
7629 move_mode = int_mode_for_size (move_bits, 0).require ();
7630 promoted_val = gen_lowpart (move_mode, promoted_val);
7631 }
7632 piece_size = GET_MODE_SIZE (move_mode);
7633 code = optab_handler (mov_optab, move_mode);
7634 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
7635
7636 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
7637
7638 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
7639 gcc_assert (size_to_move % piece_size == 0);
7640
7641 for (i = 0; i < size_to_move; i += piece_size)
7642 {
7643 if (piece_size <= GET_MODE_SIZE (word_mode))
7644 {
7645 emit_insn (gen_strset (destptr, dst, promoted_val));
7646 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
7647 piece_size);
7648 continue;
7649 }
7650
7651 emit_insn (GEN_FCN (code) (dst, promoted_val));
7652
7653 emit_move_insn (destptr,
7654 plus_constant (Pmode, copy_rtx (destptr), piece_size));
7655
7656 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
7657 piece_size);
7658 }
7659
7660 /* Update DST rtx. */
7661 return dst;
7662 }
7663 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
7664 static void
7665 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
7666 rtx count, int max_size)
7667 {
7668 count = expand_simple_binop (counter_mode (count), AND, count,
7669 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
7670 expand_set_or_cpymem_via_loop (destmem, NULL, destptr, NULL,
7671 gen_lowpart (QImode, value), count, QImode,
7672 1, max_size / 2, true);
7673 }
7674
7675 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
7676 static void
7677 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
7678 rtx count, int max_size)
7679 {
7680 rtx dest;
7681
7682 if (CONST_INT_P (count))
7683 {
7684 HOST_WIDE_INT countval = INTVAL (count);
7685 HOST_WIDE_INT epilogue_size = countval % max_size;
7686 int i;
7687
7688 /* For now MAX_SIZE should be a power of 2. This assert could be
7689 relaxed, but it'll require a bit more complicated epilogue
7690 expanding. */
7691 gcc_assert ((max_size & (max_size - 1)) == 0);
7692 for (i = max_size; i >= 1; i >>= 1)
7693 {
7694 if (epilogue_size & i)
7695 {
7696 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
7697 destmem = emit_memset (destmem, destptr, vec_value, i);
7698 else
7699 destmem = emit_memset (destmem, destptr, value, i);
7700 }
7701 }
7702 return;
7703 }
7704 if (max_size > 32)
7705 {
7706 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
7707 return;
7708 }
7709 if (max_size > 16)
7710 {
7711 rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
7712 if (TARGET_64BIT)
7713 {
7714 dest = change_address (destmem, DImode, destptr);
7715 emit_insn (gen_strset (destptr, dest, value));
7716 dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
7717 emit_insn (gen_strset (destptr, dest, value));
7718 }
7719 else
7720 {
7721 dest = change_address (destmem, SImode, destptr);
7722 emit_insn (gen_strset (destptr, dest, value));
7723 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
7724 emit_insn (gen_strset (destptr, dest, value));
7725 dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
7726 emit_insn (gen_strset (destptr, dest, value));
7727 dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
7728 emit_insn (gen_strset (destptr, dest, value));
7729 }
7730 emit_label (label);
7731 LABEL_NUSES (label) = 1;
7732 }
7733 if (max_size > 8)
7734 {
7735 rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
7736 if (TARGET_64BIT)
7737 {
7738 dest = change_address (destmem, DImode, destptr);
7739 emit_insn (gen_strset (destptr, dest, value));
7740 }
7741 else
7742 {
7743 dest = change_address (destmem, SImode, destptr);
7744 emit_insn (gen_strset (destptr, dest, value));
7745 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
7746 emit_insn (gen_strset (destptr, dest, value));
7747 }
7748 emit_label (label);
7749 LABEL_NUSES (label) = 1;
7750 }
7751 if (max_size > 4)
7752 {
7753 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
7754 dest = change_address (destmem, SImode, destptr);
7755 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
7756 emit_label (label);
7757 LABEL_NUSES (label) = 1;
7758 }
7759 if (max_size > 2)
7760 {
7761 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
7762 dest = change_address (destmem, HImode, destptr);
7763 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
7764 emit_label (label);
7765 LABEL_NUSES (label) = 1;
7766 }
7767 if (max_size > 1)
7768 {
7769 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
7770 dest = change_address (destmem, QImode, destptr);
7771 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
7772 emit_label (label);
7773 LABEL_NUSES (label) = 1;
7774 }
7775 }
7776
7777 /* Adjust COUNTER by the VALUE. */
7778 static void
7779 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
7780 {
7781 emit_insn (gen_add2_insn (countreg, GEN_INT (-value)));
7782 }
7783
7784 /* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
7785 DESTMEM to align it to DESIRED_ALIGNMENT. Original alignment is ALIGN.
7786 Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
7787 ignored.
7788 Return value is updated DESTMEM. */
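/* For illustration, with ALIGN == 1 and DESIRED_ALIGNMENT == 8 the loop
   below emits three conditional blocks, each guarded by a test of the
   corresponding low bit of DESTPTR: a 1-byte, then a 2-byte, then a
   4-byte set (or move), after which DESTPTR is 8-byte aligned and COUNT
   has been adjusted accordingly.  */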
7789
7790 static rtx
7791 expand_set_or_cpymem_prologue (rtx destmem, rtx srcmem,
7792 rtx destptr, rtx srcptr, rtx value,
7793 rtx vec_value, rtx count, int align,
7794 int desired_alignment, bool issetmem)
7795 {
7796 int i;
7797 for (i = 1; i < desired_alignment; i <<= 1)
7798 {
7799 if (align <= i)
7800 {
7801 rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
7802 if (issetmem)
7803 {
7804 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
7805 destmem = emit_memset (destmem, destptr, vec_value, i);
7806 else
7807 destmem = emit_memset (destmem, destptr, value, i);
7808 }
7809 else
7810 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
7811 ix86_adjust_counter (count, i);
7812 emit_label (label);
7813 LABEL_NUSES (label) = 1;
7814 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
7815 }
7816 }
7817 return destmem;
7818 }
7819
7820 /* Test if COUNT & SIZE is nonzero and if so, expand a cpymem
7821 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
7822 and jump to DONE_LABEL. */
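/* Illustrative example (assuming SIZE == 4 and a 6-byte block): the code
   below emits one 4-byte move (or store, for memset) for bytes 0..3 and a
   second, overlapping 4-byte move for bytes 2..5, so any length from 4 to
   7 is handled by two moves of the chosen mode and no loop.  */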
7823 static void
7824 expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem,
7825 rtx destptr, rtx srcptr,
7826 rtx value, rtx vec_value,
7827 rtx count, int size,
7828 rtx done_label, bool issetmem)
7829 {
7830 rtx_code_label *label = ix86_expand_aligntest (count, size, false);
7831 machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
7832 rtx modesize;
7833 int n;
7834
7835 /* If we do not have vector value to copy, we must reduce size. */
7836 if (issetmem)
7837 {
7838 if (!vec_value)
7839 {
7840 if (GET_MODE (value) == VOIDmode && size > 8)
7841 mode = Pmode;
7842 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
7843 mode = GET_MODE (value);
7844 }
7845 else
7846 mode = GET_MODE (vec_value), value = vec_value;
7847 }
7848 else
7849 {
7850 /* Choose appropriate vector mode. */
7851 if (size >= 32)
7852 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
7853 else if (size >= 16)
7854 mode = TARGET_SSE ? V16QImode : DImode;
7855 srcmem = change_address (srcmem, mode, srcptr);
7856 }
7857 destmem = change_address (destmem, mode, destptr);
7858 modesize = GEN_INT (GET_MODE_SIZE (mode));
7859 gcc_assert (GET_MODE_SIZE (mode) <= size);
7860 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
7861 {
7862 if (issetmem)
7863 emit_move_insn (destmem, gen_lowpart (mode, value));
7864 else
7865 {
7866 emit_move_insn (destmem, srcmem);
7867 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
7868 }
7869 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
7870 }
7871
7872 destmem = offset_address (destmem, count, 1);
7873 destmem = offset_address (destmem, GEN_INT (-2 * size),
7874 GET_MODE_SIZE (mode));
7875 if (!issetmem)
7876 {
7877 srcmem = offset_address (srcmem, count, 1);
7878 srcmem = offset_address (srcmem, GEN_INT (-2 * size),
7879 GET_MODE_SIZE (mode));
7880 }
7881 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
7882 {
7883 if (issetmem)
7884 emit_move_insn (destmem, gen_lowpart (mode, value));
7885 else
7886 {
7887 emit_move_insn (destmem, srcmem);
7888 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
7889 }
7890 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
7891 }
7892 emit_jump_insn (gen_jump (done_label));
7893 emit_barrier ();
7894
7895 emit_label (label);
7896 LABEL_NUSES (label) = 1;
7897 }
7898
7899 /* Handle a small memcpy (up to SIZE, which is supposed to be a small power of 2)
7900 and get ready for the main memcpy loop by copying the initial DESIRED_ALIGN-ALIGN
7901 bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT in a way that we can
7902 proceed with a loop copying SIZE bytes at once. Do moves in MODE.
7903 DONE_LABEL is a label after the whole copying sequence. The label is created
7904 on demand if *DONE_LABEL is NULL.
7905 MIN_SIZE is the minimal size of the block copied. This value gets adjusted for new
7906 bounds after the initial copies.
7907 
7908 DESTMEM/SRCMEM are memory expressions pointing to the copied block,
7909 DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicates whether
7910 we will dispatch to a library call for large blocks.
7911
7912 In pseudocode we do:
7913
7914 if (COUNT < SIZE)
7915 {
7916 Assume that SIZE is 4. Bigger sizes are handled analogously
7917 if (COUNT & 4)
7918 {
7919 copy 4 bytes from SRCPTR to DESTPTR
7920 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
7921 goto done_label
7922 }
7923 if (!COUNT)
7924 goto done_label;
7925 copy 1 byte from SRCPTR to DESTPTR
7926 if (COUNT & 2)
7927 {
7928 copy 2 bytes from SRCPTR to DESTPTR
7929 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
7930 }
7931 }
7932 else
7933 {
7934 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
7935 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
7936
7937 OLD_DESPTR = DESTPTR;
7938 Align DESTPTR up to DESIRED_ALIGN
7939 SRCPTR += DESTPTR - OLD_DESTPTR
7940 COUNT -= DEST_PTR - OLD_DESTPTR
7941 if (DYNAMIC_CHECK)
7942 Round COUNT down to multiple of SIZE
7943 << optional caller supplied zero size guard is here >>
7944 << optional caller supplied dynamic check is here >>
7945 << caller supplied main copy loop is here >>
7946 }
7947 done_label:
7948 */
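/* An illustrative walk-through of the small-block path above, assuming
   SIZE == 4 and a runtime COUNT of 3: COUNT & 4 is zero and COUNT is
   nonzero, so one byte is copied; COUNT & 2 then copies bytes 0..1 and
   the overlapping bytes 1..2, covering the whole block without a loop or
   an exact length computation.  */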
7949 static void
7950 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
7951 rtx *destptr, rtx *srcptr,
7952 machine_mode mode,
7953 rtx value, rtx vec_value,
7954 rtx *count,
7955 rtx_code_label **done_label,
7956 int size,
7957 int desired_align,
7958 int align,
7959 unsigned HOST_WIDE_INT *min_size,
7960 bool dynamic_check,
7961 bool issetmem)
7962 {
7963 rtx_code_label *loop_label = NULL, *label;
7964 int n;
7965 rtx modesize;
7966 int prolog_size = 0;
7967 rtx mode_value;
7968
7969 /* Choose the proper value to copy. */
7970 if (issetmem && VECTOR_MODE_P (mode))
7971 mode_value = vec_value;
7972 else
7973 mode_value = value;
7974 gcc_assert (GET_MODE_SIZE (mode) <= size);
7975
7976 /* See if block is big or small, handle small blocks. */
7977 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
7978 {
7979 int size2 = size;
7980 loop_label = gen_label_rtx ();
7981
7982 if (!*done_label)
7983 *done_label = gen_label_rtx ();
7984
7985 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
7986 1, loop_label);
7987 size2 >>= 1;
7988
7989 /* Handle sizes > 3. */
7990 for (;size2 > 2; size2 >>= 1)
7991 expand_small_cpymem_or_setmem (destmem, srcmem,
7992 *destptr, *srcptr,
7993 value, vec_value,
7994 *count,
7995 size2, *done_label, issetmem);
7996 /* Nothing to copy? Jump to DONE_LABEL if so */
7997 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
7998 1, *done_label);
7999
8000 /* Do a byte copy. */
8001 destmem = change_address (destmem, QImode, *destptr);
8002 if (issetmem)
8003 emit_move_insn (destmem, gen_lowpart (QImode, value));
8004 else
8005 {
8006 srcmem = change_address (srcmem, QImode, *srcptr);
8007 emit_move_insn (destmem, srcmem);
8008 }
8009
8010 /* Handle sizes 2 and 3. */
8011 label = ix86_expand_aligntest (*count, 2, false);
8012 destmem = change_address (destmem, HImode, *destptr);
8013 destmem = offset_address (destmem, *count, 1);
8014 destmem = offset_address (destmem, GEN_INT (-2), 2);
8015 if (issetmem)
8016 emit_move_insn (destmem, gen_lowpart (HImode, value));
8017 else
8018 {
8019 srcmem = change_address (srcmem, HImode, *srcptr);
8020 srcmem = offset_address (srcmem, *count, 1);
8021 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
8022 emit_move_insn (destmem, srcmem);
8023 }
8024
8025 emit_label (label);
8026 LABEL_NUSES (label) = 1;
8027 emit_jump_insn (gen_jump (*done_label));
8028 emit_barrier ();
8029 }
8030 else
8031 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
8032 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
8033
8034 /* Start memcpy for COUNT >= SIZE. */
8035 if (loop_label)
8036 {
8037 emit_label (loop_label);
8038 LABEL_NUSES (loop_label) = 1;
8039 }
8040
8041 /* Copy first desired_align bytes. */
8042 if (!issetmem)
8043 srcmem = change_address (srcmem, mode, *srcptr);
8044 destmem = change_address (destmem, mode, *destptr);
8045 modesize = GEN_INT (GET_MODE_SIZE (mode));
8046 for (n = 0; prolog_size < desired_align - align; n++)
8047 {
8048 if (issetmem)
8049 emit_move_insn (destmem, mode_value);
8050 else
8051 {
8052 emit_move_insn (destmem, srcmem);
8053 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
8054 }
8055 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
8056 prolog_size += GET_MODE_SIZE (mode);
8057 }
8058
8059
8060 /* Copy last SIZE bytes. */
8061 destmem = offset_address (destmem, *count, 1);
8062 destmem = offset_address (destmem,
8063 GEN_INT (-size - prolog_size),
8064 1);
8065 if (issetmem)
8066 emit_move_insn (destmem, mode_value);
8067 else
8068 {
8069 srcmem = offset_address (srcmem, *count, 1);
8070 srcmem = offset_address (srcmem,
8071 GEN_INT (-size - prolog_size),
8072 1);
8073 emit_move_insn (destmem, srcmem);
8074 }
8075 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
8076 {
8077 destmem = offset_address (destmem, modesize, 1);
8078 if (issetmem)
8079 emit_move_insn (destmem, mode_value);
8080 else
8081 {
8082 srcmem = offset_address (srcmem, modesize, 1);
8083 emit_move_insn (destmem, srcmem);
8084 }
8085 }
8086
8087 /* Align destination. */
8088 if (desired_align > 1 && desired_align > align)
8089 {
8090 rtx saveddest = *destptr;
8091
8092 gcc_assert (desired_align <= size);
8093 /* Align destptr up, place it to new register. */
8094 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
8095 GEN_INT (prolog_size),
8096 NULL_RTX, 1, OPTAB_DIRECT);
8097 if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
8098 REG_POINTER (*destptr) = 1;
8099 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
8100 GEN_INT (-desired_align),
8101 *destptr, 1, OPTAB_DIRECT);
8102 /* See how many bytes we skipped. */
8103 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
8104 *destptr,
8105 saveddest, 1, OPTAB_DIRECT);
8106 /* Adjust srcptr and count. */
8107 if (!issetmem)
8108 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
8109 saveddest, *srcptr, 1, OPTAB_DIRECT);
8110 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
8111 saveddest, *count, 1, OPTAB_DIRECT);
8112 /* We copied at most size + prolog_size. */
8113 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
8114 *min_size
8115 = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
8116 else
8117 *min_size = 0;
8118
8119 /* Our loops always round down the block size, but for dispatch to
8120 library we need precise value. */
8121 if (dynamic_check)
8122 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
8123 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
8124 }
8125 else
8126 {
8127 gcc_assert (prolog_size == 0);
8128 /* Decrease count, so we won't end up copying last word twice. */
8129 if (!CONST_INT_P (*count))
8130 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
8131 constm1_rtx, *count, 1, OPTAB_DIRECT);
8132 else
8133 *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
8134 (unsigned HOST_WIDE_INT)size));
8135 if (*min_size)
8136 *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
8137 }
8138 }
8139
8140
8141 /* This function is like the previous one, except here we know how many bytes
8142 need to be copied. That allows us to update alignment not only of DST, which
8143 is returned, but also of SRC, which is passed as a pointer for that
8144 reason. */
8145 static rtx
8146 expand_set_or_cpymem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
8147 rtx srcreg, rtx value, rtx vec_value,
8148 int desired_align, int align_bytes,
8149 bool issetmem)
8150 {
8151 rtx src = NULL;
8152 rtx orig_dst = dst;
8153 rtx orig_src = NULL;
8154 int piece_size = 1;
8155 int copied_bytes = 0;
8156
8157 if (!issetmem)
8158 {
8159 gcc_assert (srcp != NULL);
8160 src = *srcp;
8161 orig_src = src;
8162 }
8163
8164 for (piece_size = 1;
8165 piece_size <= desired_align && copied_bytes < align_bytes;
8166 piece_size <<= 1)
8167 {
8168 if (align_bytes & piece_size)
8169 {
8170 if (issetmem)
8171 {
8172 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
8173 dst = emit_memset (dst, destreg, vec_value, piece_size);
8174 else
8175 dst = emit_memset (dst, destreg, value, piece_size);
8176 }
8177 else
8178 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
8179 copied_bytes += piece_size;
8180 }
8181 }
8182 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
8183 set_mem_align (dst, desired_align * BITS_PER_UNIT);
8184 if (MEM_SIZE_KNOWN_P (orig_dst))
8185 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
8186
8187 if (!issetmem)
8188 {
8189 int src_align_bytes = get_mem_align_offset (src, desired_align
8190 * BITS_PER_UNIT);
8191 if (src_align_bytes >= 0)
8192 src_align_bytes = desired_align - src_align_bytes;
8193 if (src_align_bytes >= 0)
8194 {
8195 unsigned int src_align;
8196 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
8197 {
8198 if ((src_align_bytes & (src_align - 1))
8199 == (align_bytes & (src_align - 1)))
8200 break;
8201 }
8202 if (src_align > (unsigned int) desired_align)
8203 src_align = desired_align;
8204 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
8205 set_mem_align (src, src_align * BITS_PER_UNIT);
8206 }
8207 if (MEM_SIZE_KNOWN_P (orig_src))
8208 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
8209 *srcp = src;
8210 }
8211
8212 return dst;
8213 }
8214
8215 /* Return true if ALG can be used in current context.
8216 Assume we expand memset if MEMSET is true. */
8217 static bool
8218 alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
8219 {
8220 if (alg == no_stringop)
8221 return false;
8222 if (alg == vector_loop)
8223 return TARGET_SSE || TARGET_AVX;
8224 /* Algorithms using the rep prefix want at least edi and ecx;
8225 additionally, memset wants eax and memcpy wants esi. Don't
8226 consider such algorithms if the user has appropriated those
8227 registers for their own purposes, or if we have a non-default
8228 address space, since some string insns cannot override the segment. */
8229 if (alg == rep_prefix_1_byte
8230 || alg == rep_prefix_4_byte
8231 || alg == rep_prefix_8_byte)
8232 {
8233 if (have_as)
8234 return false;
8235 if (fixed_regs[CX_REG]
8236 || fixed_regs[DI_REG]
8237 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
8238 return false;
8239 }
8240 return true;
8241 }
8242
8243 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
8244 static enum stringop_alg
8245 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
8246 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
8247 bool memset, bool zero_memset, bool have_as,
8248 int *dynamic_check, bool *noalign, bool recur)
8249 {
8250 const struct stringop_algs *algs;
8251 bool optimize_for_speed;
8252 int max = 0;
8253 const struct processor_costs *cost;
8254 int i;
8255 bool any_alg_usable_p = false;
8256
8257 *noalign = false;
8258 *dynamic_check = -1;
8259
8260 /* Even if the string operation call is cold, we still might spend a lot
8261 of time processing large blocks. */
8262 if (optimize_function_for_size_p (cfun)
8263 || (optimize_insn_for_size_p ()
8264 && (max_size < 256
8265 || (expected_size != -1 && expected_size < 256))))
8266 optimize_for_speed = false;
8267 else
8268 optimize_for_speed = true;
8269
8270 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
8271 if (memset)
8272 algs = &cost->memset[TARGET_64BIT != 0];
8273 else
8274 algs = &cost->memcpy[TARGET_64BIT != 0];
8275
8276 /* Find the maximal size for the user-defined algorithm. */
8277 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
8278 {
8279 enum stringop_alg candidate = algs->size[i].alg;
8280 bool usable = alg_usable_p (candidate, memset, have_as);
8281 any_alg_usable_p |= usable;
8282
8283 if (candidate != libcall && candidate && usable)
8284 max = algs->size[i].max;
8285 }
8286
8287 /* If expected size is not known but max size is small enough
8288 so inline version is a win, set expected size into
8289 the range. */
8290 if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
8291 && expected_size == -1)
8292 expected_size = min_size / 2 + max_size / 2;
8293
8294 /* If user specified the algorithm, honor it if possible. */
8295 if (ix86_stringop_alg != no_stringop
8296 && alg_usable_p (ix86_stringop_alg, memset, have_as))
8297 return ix86_stringop_alg;
8298 /* rep; movq or rep; movl is the smallest variant. */
8299 else if (!optimize_for_speed)
8300 {
8301 *noalign = true;
8302 if (!count || (count & 3) || (memset && !zero_memset))
8303 return alg_usable_p (rep_prefix_1_byte, memset, have_as)
8304 ? rep_prefix_1_byte : loop_1_byte;
8305 else
8306 return alg_usable_p (rep_prefix_4_byte, memset, have_as)
8307 ? rep_prefix_4_byte : loop;
8308 }
8309 /* Very tiny blocks are best handled via the loop; REP is expensive to
8310 set up. */
8311 else if (expected_size != -1 && expected_size < 4)
8312 return loop_1_byte;
8313 else if (expected_size != -1)
8314 {
8315 enum stringop_alg alg = libcall;
8316 bool alg_noalign = false;
8317 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
8318 {
8319 /* We get here if the algorithms that were not libcall-based
8320 were rep-prefix based and we are unable to use rep prefixes
8321 based on global register usage. Break out of the loop and
8322 use the heuristic below. */
8323 if (algs->size[i].max == 0)
8324 break;
8325 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
8326 {
8327 enum stringop_alg candidate = algs->size[i].alg;
8328
8329 if (candidate != libcall
8330 && alg_usable_p (candidate, memset, have_as))
8331 {
8332 alg = candidate;
8333 alg_noalign = algs->size[i].noalign;
8334 }
8335 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
8336 last non-libcall inline algorithm. */
8337 if (TARGET_INLINE_ALL_STRINGOPS)
8338 {
8339 /* When the current size is best to be copied by a libcall,
8340 but we are still forced to inline, run the heuristic below
8341 that will pick code for medium sized blocks. */
8342 if (alg != libcall)
8343 {
8344 *noalign = alg_noalign;
8345 return alg;
8346 }
8347 else if (!any_alg_usable_p)
8348 break;
8349 }
8350 else if (alg_usable_p (candidate, memset, have_as)
8351 && !(TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
8352 && candidate == rep_prefix_1_byte
8353 /* NB: If min_size != max_size, size is
8354 unknown. */
8355 && min_size != max_size))
8356 {
8357 *noalign = algs->size[i].noalign;
8358 return candidate;
8359 }
8360 }
8361 }
8362 }
8363 /* When asked to inline the call anyway, try to pick a meaningful choice.
8364 We look for the maximal size of block that is faster to copy by hand and
8365 take blocks of at most that size, guessing that the average size will
8366 be roughly half of the block.
8367
8368 If this turns out to be bad, we might simply specify the preferred
8369 choice in ix86_costs. */
8370 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
8371 && (algs->unknown_size == libcall
8372 || !alg_usable_p (algs->unknown_size, memset, have_as)))
8373 {
8374 enum stringop_alg alg;
8375 HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
8376
8377 /* If there aren't any usable algorithms or if recursing already,
8378 then recursing on smaller sizes or same size isn't going to
8379 find anything. Just return the simple byte-at-a-time copy loop. */
8380 if (!any_alg_usable_p || recur)
8381 {
8382 /* Pick something reasonable. */
8383 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
8384 *dynamic_check = 128;
8385 return loop_1_byte;
8386 }
8387 alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
8388 zero_memset, have_as, dynamic_check, noalign, true);
8389 gcc_assert (*dynamic_check == -1);
8390 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
8391 *dynamic_check = max;
8392 else
8393 gcc_assert (alg != libcall);
8394 return alg;
8395 }
8396 return (alg_usable_p (algs->unknown_size, memset, have_as)
8397 ? algs->unknown_size : libcall);
8398 }
8399
8400 /* Decide on alignment. We know that the operand is already aligned to ALIGN
8401 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
8402 static int
8403 decide_alignment (int align,
8404 enum stringop_alg alg,
8405 int expected_size,
8406 machine_mode move_mode)
8407 {
8408 int desired_align = 0;
8409
8410 gcc_assert (alg != no_stringop);
8411
8412 if (alg == libcall)
8413 return 0;
8414 if (move_mode == VOIDmode)
8415 return 0;
8416
8417 desired_align = GET_MODE_SIZE (move_mode);
8418 /* PentiumPro has special logic triggering for 8 byte aligned blocks,
8419 copying a whole cacheline at once. */
8420 if (TARGET_CPU_P (PENTIUMPRO)
8421 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
8422 desired_align = 8;
8423
8424 if (optimize_size)
8425 desired_align = 1;
8426 if (desired_align < align)
8427 desired_align = align;
8428 if (expected_size != -1 && expected_size < 4)
8429 desired_align = align;
8430
8431 return desired_align;
8432 }
8433
8434
8435 /* Helper function for memset. For QImode value 0xXY produce
8436 0xXYXYXYXY of the width specified by MODE. This is essentially
8437 a * 0x01010101, but we can do slightly better than
8438 synth_mult by unwinding the sequence by hand on CPUs with
8439 slow multiply. */
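/* For illustration, promoting the constant 0x41 to DImode takes the
   CONST_INT_P path below:
     v = 0x41;  v |= v << 8;  v |= v << 16;  v |= (v << 16) << 16;
   giving 0x4141414141414141 at compile time, with no run-time multiply
   or shift insns.  */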
8440 static rtx
8441 promote_duplicated_reg (machine_mode mode, rtx val)
8442 {
8443 machine_mode valmode = GET_MODE (val);
8444 rtx tmp;
8445 int nops = mode == DImode ? 3 : 2;
8446
8447 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
8448 if (val == const0_rtx)
8449 return copy_to_mode_reg (mode, CONST0_RTX (mode));
8450 if (CONST_INT_P (val))
8451 {
8452 HOST_WIDE_INT v = INTVAL (val) & 255;
8453
8454 v |= v << 8;
8455 v |= v << 16;
8456 if (mode == DImode)
8457 v |= (v << 16) << 16;
8458 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
8459 }
8460
8461 if (valmode == VOIDmode)
8462 valmode = QImode;
8463 if (valmode != QImode)
8464 val = gen_lowpart (QImode, val);
8465 if (mode == QImode)
8466 return val;
8467 if (!TARGET_PARTIAL_REG_STALL)
8468 nops--;
8469 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
8470 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
8471 <= (ix86_cost->shift_const + ix86_cost->add) * nops
8472 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
8473 {
8474 rtx reg = convert_modes (mode, QImode, val, true);
8475 tmp = promote_duplicated_reg (mode, const1_rtx);
8476 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
8477 OPTAB_DIRECT);
8478 }
8479 else
8480 {
8481 rtx reg = convert_modes (mode, QImode, val, true);
8482
8483 if (!TARGET_PARTIAL_REG_STALL)
8484 emit_insn (gen_insv_1 (mode, reg, reg));
8485 else
8486 {
8487 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
8488 NULL, 1, OPTAB_DIRECT);
8489 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1,
8490 OPTAB_DIRECT);
8491 }
8492 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
8493 NULL, 1, OPTAB_DIRECT);
8494 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
8495 if (mode == SImode)
8496 return reg;
8497 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
8498 NULL, 1, OPTAB_DIRECT);
8499 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
8500 return reg;
8501 }
8502 }
8503
8504 /* Duplicate value VAL using promote_duplicated_reg into maximal size that will
8505 be needed by main loop copying SIZE_NEEDED chunks and prologue getting
8506 alignment from ALIGN to DESIRED_ALIGN. */
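/* E.g. (illustration only) a 64-bit expansion whose main loop stores
   8-byte chunks (SIZE_NEEDED == 8) promotes VAL to DImode below, while a
   pure QImode loop that needs no extra alignment keeps VAL unchanged.  */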
8507 static rtx
8508 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
8509 int align)
8510 {
8511 rtx promoted_val;
8512
8513 if (TARGET_64BIT
8514 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
8515 promoted_val = promote_duplicated_reg (DImode, val);
8516 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
8517 promoted_val = promote_duplicated_reg (SImode, val);
8518 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
8519 promoted_val = promote_duplicated_reg (HImode, val);
8520 else
8521 promoted_val = val;
8522
8523 return promoted_val;
8524 }
8525
8526 /* Copy the address to a Pmode register. This is used for x32 to
8527 truncate DImode TLS address to a SImode register. */
8528
8529 static rtx
8530 ix86_copy_addr_to_reg (rtx addr)
8531 {
8532 rtx reg;
8533 if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
8534 {
8535 reg = copy_addr_to_reg (addr);
8536 REG_POINTER (reg) = 1;
8537 return reg;
8538 }
8539 else
8540 {
8541 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
8542 reg = copy_to_mode_reg (DImode, addr);
8543 REG_POINTER (reg) = 1;
8544 return gen_rtx_SUBREG (SImode, reg, 0);
8545 }
8546 }
8547
8548 /* Expand string move (memcpy) or store (memset) operation. Use i386 string
8549 operations when profitable. The code depends upon architecture, block size
8550 and alignment, but always has one of the following overall structures:
8551
8552 Aligned move sequence:
8553
8554 1) Prologue guard: Conditional that jumps up to epilogues for small
8555 blocks that can be handled by epilogue alone. This is faster
8556 but also needed for correctness, since the prologue assumes the block
8557 is larger than the desired alignment.
8558
8559 Optional dynamic check for size and libcall for large
8560 blocks is emitted here too, with -minline-stringops-dynamically.
8561
8562 2) Prologue: copy first few bytes in order to get destination
8563 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
8564 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
8565 copied. We emit either a jump tree on power of two sized
8566 blocks, or a byte loop.
8567
8568 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
8569 with specified algorithm.
8570
8571 4) Epilogue: code copying tail of the block that is too small to be
8572 handled by main body (or up to size guarded by prologue guard).
8573
8574 Misaligned move sequence
8575
8576 1) misaligned move prologue/epilogue containing:
8577 a) Prologue handling small memory blocks and jumping to done_label
8578 (skipped if blocks are known to be large enough)
8579 b) Single move copying the first DESIRED_ALIGN-ALIGN bytes if alignment is
8580 needed by single possibly misaligned move
8581 (skipped if alignment is not needed)
8582 c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
8583
8584 2) Zero size guard dispatching to done_label, if needed
8585
8586 3) dispatch to library call, if needed,
8587
8588 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
8589 with specified algorithm. */
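/* A rough illustrative sketch (not the emitted RTL) of the aligned
   sequence above, for a memset expanded with SIZE_NEEDED == 16:

     if (count < 16) goto epilogue;          (1: prologue guard)
     store head bytes until DST is aligned
       and adjust count;                     (2: alignment prologue)
     while (count >= 16) store 16 bytes;     (3: main body)
   epilogue:
     store count & 15 remaining bytes;       (4: epilogue)  */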
8590 bool
8591 ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
8592 rtx align_exp, rtx expected_align_exp,
8593 rtx expected_size_exp, rtx min_size_exp,
8594 rtx max_size_exp, rtx probable_max_size_exp,
8595 bool issetmem)
8596 {
8597 rtx destreg;
8598 rtx srcreg = NULL;
8599 rtx_code_label *label = NULL;
8600 rtx tmp;
8601 rtx_code_label *jump_around_label = NULL;
8602 HOST_WIDE_INT align = 1;
8603 unsigned HOST_WIDE_INT count = 0;
8604 HOST_WIDE_INT expected_size = -1;
8605 int size_needed = 0, epilogue_size_needed;
8606 int desired_align = 0, align_bytes = 0;
8607 enum stringop_alg alg;
8608 rtx promoted_val = NULL;
8609 rtx vec_promoted_val = NULL;
8610 bool force_loopy_epilogue = false;
8611 int dynamic_check;
8612 bool need_zero_guard = false;
8613 bool noalign;
8614 machine_mode move_mode = VOIDmode;
8615 machine_mode wider_mode;
8616 int unroll_factor = 1;
8617 /* TODO: Once value ranges are available, fill in proper data. */
8618 unsigned HOST_WIDE_INT min_size = 0;
8619 unsigned HOST_WIDE_INT max_size = -1;
8620 unsigned HOST_WIDE_INT probable_max_size = -1;
8621 bool misaligned_prologue_used = false;
8622 bool have_as;
8623
8624 if (CONST_INT_P (align_exp))
8625 align = INTVAL (align_exp);
8626 /* i386 can do misaligned access at a reasonably increased cost. */
8627 if (CONST_INT_P (expected_align_exp)
8628 && INTVAL (expected_align_exp) > align)
8629 align = INTVAL (expected_align_exp);
8630 /* ALIGN is the minimum of destination and source alignment, but we care here
8631 just about destination alignment. */
8632 else if (!issetmem
8633 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
8634 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
8635
8636 if (CONST_INT_P (count_exp))
8637 {
8638 min_size = max_size = probable_max_size = count = expected_size
8639 = INTVAL (count_exp);
8640 /* When COUNT is 0, there is nothing to do. */
8641 if (!count)
8642 return true;
8643 }
8644 else
8645 {
8646 if (min_size_exp)
8647 min_size = INTVAL (min_size_exp);
8648 if (max_size_exp)
8649 max_size = INTVAL (max_size_exp);
8650 if (probable_max_size_exp)
8651 probable_max_size = INTVAL (probable_max_size_exp);
8652 if (CONST_INT_P (expected_size_exp))
8653 expected_size = INTVAL (expected_size_exp);
8654 }
8655
8656 /* Make sure we don't need to care about overflow later on. */
8657 if (count > (HOST_WIDE_INT_1U << 30))
8658 return false;
8659
8660 have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
8661 if (!issetmem)
8662 have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
8663
8664 /* Step 0: Decide on preferred algorithm, desired alignment and
8665 size of chunks to be copied by main loop. */
8666 alg = decide_alg (count, expected_size, min_size, probable_max_size,
8667 issetmem,
8668 issetmem && val_exp == const0_rtx, have_as,
8669 &dynamic_check, &noalign, false);
8670
8671 if (dump_file)
8672 fprintf (dump_file, "Selected stringop expansion strategy: %s\n",
8673 stringop_alg_names[alg]);
8674
8675 if (alg == libcall)
8676 return false;
8677 gcc_assert (alg != no_stringop);
8678
8679 /* For now the vector version of memset is generated only for memory zeroing,
8680 as creating the promoted vector value is very cheap in this case. */
8681 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
8682 alg = unrolled_loop;
8683
8684 if (!count)
8685 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
8686 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
8687 if (!issetmem)
8688 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
8689
8690 unroll_factor = 1;
8691 move_mode = word_mode;
8692 switch (alg)
8693 {
8694 case libcall:
8695 case no_stringop:
8696 case last_alg:
8697 gcc_unreachable ();
8698 case loop_1_byte:
8699 need_zero_guard = true;
8700 move_mode = QImode;
8701 break;
8702 case loop:
8703 need_zero_guard = true;
8704 break;
8705 case unrolled_loop:
8706 need_zero_guard = true;
8707 unroll_factor = (TARGET_64BIT ? 4 : 2);
8708 break;
8709 case vector_loop:
8710 need_zero_guard = true;
8711 unroll_factor = 4;
8712 /* Find the widest supported mode. */
8713 move_mode = word_mode;
8714 while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode)
8715 && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
8716 move_mode = wider_mode;
8717
8718 if (TARGET_AVX256_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 128)
8719 move_mode = TImode;
8720 if (TARGET_AVX512_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 256)
8721 move_mode = OImode;
8722
8723 /* Find the corresponding vector mode with the same size as MOVE_MODE.
8724 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
8725 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
8726 {
8727 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
8728 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
8729 || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
8730 move_mode = word_mode;
8731 }
8732 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
8733 break;
8734 case rep_prefix_8_byte:
8735 move_mode = DImode;
8736 break;
8737 case rep_prefix_4_byte:
8738 move_mode = SImode;
8739 break;
8740 case rep_prefix_1_byte:
8741 move_mode = QImode;
8742 break;
8743 }
8744 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
8745 epilogue_size_needed = size_needed;
8746
8747 /* If we are going to emit any library calls conditionally, make sure any
8748 pending stack adjustments happen before the first conditional branch;
8749 otherwise they will be emitted only before the library call and won't
8750 happen on the other branches. */
8751 if (dynamic_check != -1)
8752 do_pending_stack_adjust ();
8753
8754 desired_align = decide_alignment (align, alg, expected_size, move_mode);
8755 if (!TARGET_ALIGN_STRINGOPS || noalign)
8756 align = desired_align;
8757
8758 /* Step 1: Prologue guard. */
8759
8760 /* Alignment code needs count to be in register. */
8761 if (CONST_INT_P (count_exp) && desired_align > align)
8762 {
8763 if (INTVAL (count_exp) > desired_align
8764 && INTVAL (count_exp) > size_needed)
8765 {
8766 align_bytes
8767 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
8768 if (align_bytes <= 0)
8769 align_bytes = 0;
8770 else
8771 align_bytes = desired_align - align_bytes;
8772 }
8773 if (align_bytes == 0)
8774 count_exp = force_reg (counter_mode (count_exp), count_exp);
8775 }
8776 gcc_assert (desired_align >= 1 && align >= 1);
8777
8778 /* Misaligned move sequences handle both prologue and epilogue at once.
8779 Default code generation results in smaller code for large alignments
8780 and also avoids redundant work when sizes are known precisely. */
8781 misaligned_prologue_used
8782 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
8783 && MAX (desired_align, epilogue_size_needed) <= 32
8784 && desired_align <= epilogue_size_needed
8785 && ((desired_align > align && !align_bytes)
8786 || (!count && epilogue_size_needed > 1)));
8787
8788 /* Do the cheap promotion to allow better CSE across the
8789 main loop and epilogue (i.e. one load of the big constant in
8790 front of all the code).
8791 For now the misaligned move sequences do not have a fast path
8792 without broadcasting. */
8793 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
8794 {
8795 if (alg == vector_loop)
8796 {
8797 gcc_assert (val_exp == const0_rtx);
8798 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
8799 promoted_val = promote_duplicated_reg_to_size (val_exp,
8800 GET_MODE_SIZE (word_mode),
8801 desired_align, align);
8802 }
8803 else
8804 {
8805 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
8806 desired_align, align);
8807 }
8808 }
8809 /* Misaligned move sequences handle both prologues and epilogues at once.
8810 Default code generation results in smaller code for large alignments and
8811 also avoids redundant work when sizes are known precisely. */
8812 if (misaligned_prologue_used)
8813 {
8814 /* The misaligned move prologue handles small blocks by itself. */
8815 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves
8816 (dst, src, &destreg, &srcreg,
8817 move_mode, promoted_val, vec_promoted_val,
8818 &count_exp,
8819 &jump_around_label,
8820 desired_align < align
8821 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
8822 desired_align, align, &min_size, dynamic_check, issetmem);
8823 if (!issetmem)
8824 src = change_address (src, BLKmode, srcreg);
8825 dst = change_address (dst, BLKmode, destreg);
8826 set_mem_align (dst, desired_align * BITS_PER_UNIT);
8827 epilogue_size_needed = 0;
8828 if (need_zero_guard
8829 && min_size < (unsigned HOST_WIDE_INT) size_needed)
8830 {
8831 /* It is possible that we copied enough so the main loop will not
8832 execute. */
8833 gcc_assert (size_needed > 1);
8834 if (jump_around_label == NULL_RTX)
8835 jump_around_label = gen_label_rtx ();
8836 emit_cmp_and_jump_insns (count_exp,
8837 GEN_INT (size_needed),
8838 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
8839 if (expected_size == -1
8840 || expected_size < (desired_align - align) / 2 + size_needed)
8841 predict_jump (REG_BR_PROB_BASE * 20 / 100);
8842 else
8843 predict_jump (REG_BR_PROB_BASE * 60 / 100);
8844 }
8845 }
8846 /* Ensure that alignment prologue won't copy past end of block. */
8847 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
8848 {
8849 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
8850 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
8851 Make sure it is power of 2. */
8852 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
8853
8854 /* To improve performance of small blocks, we jump around the VAL
8855 promotion. This means that if the promoted VAL is not constant,
8856 we might not use it in the epilogue and have to use the byte
8857 loop variant. */
8858 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
8859 force_loopy_epilogue = true;
8860 if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
8861 || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
8862 {
8863 /* If main algorithm works on QImode, no epilogue is needed.
8864 For small sizes just don't align anything. */
8865 if (size_needed == 1)
8866 desired_align = align;
8867 else
8868 goto epilogue;
8869 }
8870 else if (!count
8871 && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
8872 {
8873 label = gen_label_rtx ();
8874 emit_cmp_and_jump_insns (count_exp,
8875 GEN_INT (epilogue_size_needed),
8876 LTU, 0, counter_mode (count_exp), 1, label);
8877 if (expected_size == -1 || expected_size < epilogue_size_needed)
8878 predict_jump (REG_BR_PROB_BASE * 60 / 100);
8879 else
8880 predict_jump (REG_BR_PROB_BASE * 20 / 100);
8881 }
8882 }
8883
8884 /* Emit code to decide at runtime whether a library call or inline code
8885 should be used. */
8886 if (dynamic_check != -1)
8887 {
8888 if (!issetmem && CONST_INT_P (count_exp))
8889 {
8890 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
8891 {
8892 emit_block_copy_via_libcall (dst, src, count_exp);
8893 count_exp = const0_rtx;
8894 goto epilogue;
8895 }
8896 }
8897 else
8898 {
8899 rtx_code_label *hot_label = gen_label_rtx ();
8900 if (jump_around_label == NULL_RTX)
8901 jump_around_label = gen_label_rtx ();
8902 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
8903 LEU, 0, counter_mode (count_exp),
8904 1, hot_label);
8905 predict_jump (REG_BR_PROB_BASE * 90 / 100);
8906 if (issetmem)
8907 set_storage_via_libcall (dst, count_exp, val_exp);
8908 else
8909 emit_block_copy_via_libcall (dst, src, count_exp);
8910 emit_jump (jump_around_label);
8911 emit_label (hot_label);
8912 }
8913 }
8914
8915 /* Step 2: Alignment prologue. */
8916 /* Do the expensive promotion once we branched off the small blocks. */
8917 if (issetmem && !promoted_val)
8918 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
8919 desired_align, align);
8920
8921 if (desired_align > align && !misaligned_prologue_used)
8922 {
8923 if (align_bytes == 0)
8924 {
8925 /* Except for the first move in the prologue, we no longer know
8926 the constant offset in the aliasing info. It doesn't seem worth
8927 the pain to maintain it for the first move, so throw away
8928 the info early. */
8929 dst = change_address (dst, BLKmode, destreg);
8930 if (!issetmem)
8931 src = change_address (src, BLKmode, srcreg);
8932 dst = expand_set_or_cpymem_prologue (dst, src, destreg, srcreg,
8933 promoted_val, vec_promoted_val,
8934 count_exp, align, desired_align,
8935 issetmem);
8936 /* At most desired_align - align bytes are copied. */
8937 if (min_size < (unsigned)(desired_align - align))
8938 min_size = 0;
8939 else
8940 min_size -= desired_align - align;
8941 }
8942 else
8943 {
8944 /* If we know how many bytes need to be stored before dst is
8945 sufficiently aligned, maintain aliasing info accurately. */
8946 dst = expand_set_or_cpymem_constant_prologue (dst, &src, destreg,
8947 srcreg,
8948 promoted_val,
8949 vec_promoted_val,
8950 desired_align,
8951 align_bytes,
8952 issetmem);
8953
8954 count_exp = plus_constant (counter_mode (count_exp),
8955 count_exp, -align_bytes);
8956 count -= align_bytes;
8957 min_size -= align_bytes;
8958 max_size -= align_bytes;
8959 }
8960 if (need_zero_guard
8961 && min_size < (unsigned HOST_WIDE_INT) size_needed
8962 && (count < (unsigned HOST_WIDE_INT) size_needed
8963 || (align_bytes == 0
8964 && count < ((unsigned HOST_WIDE_INT) size_needed
8965 + desired_align - align))))
8966 {
8967 /* It is possible that we copied enough so the main loop will not
8968 execute. */
8969 gcc_assert (size_needed > 1);
8970 if (label == NULL_RTX)
8971 label = gen_label_rtx ();
8972 emit_cmp_and_jump_insns (count_exp,
8973 GEN_INT (size_needed),
8974 LTU, 0, counter_mode (count_exp), 1, label);
8975 if (expected_size == -1
8976 || expected_size < (desired_align - align) / 2 + size_needed)
8977 predict_jump (REG_BR_PROB_BASE * 20 / 100);
8978 else
8979 predict_jump (REG_BR_PROB_BASE * 60 / 100);
8980 }
8981 }
8982 if (label && size_needed == 1)
8983 {
8984 emit_label (label);
8985 LABEL_NUSES (label) = 1;
8986 label = NULL;
8987 epilogue_size_needed = 1;
8988 if (issetmem)
8989 promoted_val = val_exp;
8990 }
8991 else if (label == NULL_RTX && !misaligned_prologue_used)
8992 epilogue_size_needed = size_needed;
8993
8994 /* Step 3: Main loop. */
8995
8996 switch (alg)
8997 {
8998 case libcall:
8999 case no_stringop:
9000 case last_alg:
9001 gcc_unreachable ();
9002 case loop_1_byte:
9003 case loop:
9004 case unrolled_loop:
9005 expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg, promoted_val,
9006 count_exp, move_mode, unroll_factor,
9007 expected_size, issetmem);
9008 break;
9009 case vector_loop:
9010 expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg,
9011 vec_promoted_val, count_exp, move_mode,
9012 unroll_factor, expected_size, issetmem);
9013 break;
9014 case rep_prefix_8_byte:
9015 case rep_prefix_4_byte:
9016 case rep_prefix_1_byte:
9017 expand_set_or_cpymem_via_rep (dst, src, destreg, srcreg, promoted_val,
9018 val_exp, count_exp, move_mode, issetmem);
9019 break;
9020 }
9021 /* Properly adjust the offsets of src and dest memory for aliasing. */
9022 if (CONST_INT_P (count_exp))
9023 {
9024 if (!issetmem)
9025 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
9026 (count / size_needed) * size_needed);
9027 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
9028 (count / size_needed) * size_needed);
9029 }
9030 else
9031 {
9032 if (!issetmem)
9033 src = change_address (src, BLKmode, srcreg);
9034 dst = change_address (dst, BLKmode, destreg);
9035 }
9036
9037 /* Step 4: Epilogue to copy the remaining bytes. */
9038 epilogue:
9039 if (label)
9040 {
9041 /* When the main loop is done, COUNT_EXP might hold original count,
9042 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
9043 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
9044 bytes. Compensate if needed. */
9045
9046 if (size_needed < epilogue_size_needed)
9047 {
9048 tmp = expand_simple_binop (counter_mode (count_exp), AND, count_exp,
9049 GEN_INT (size_needed - 1), count_exp, 1,
9050 OPTAB_DIRECT);
9051 if (tmp != count_exp)
9052 emit_move_insn (count_exp, tmp);
9053 }
9054 emit_label (label);
9055 LABEL_NUSES (label) = 1;
9056 }
9057
9058 if (count_exp != const0_rtx && epilogue_size_needed > 1)
9059 {
9060 if (force_loopy_epilogue)
9061 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
9062 epilogue_size_needed);
9063 else
9064 {
9065 if (issetmem)
9066 expand_setmem_epilogue (dst, destreg, promoted_val,
9067 vec_promoted_val, count_exp,
9068 epilogue_size_needed);
9069 else
9070 expand_cpymem_epilogue (dst, src, destreg, srcreg, count_exp,
9071 epilogue_size_needed);
9072 }
9073 }
9074 if (jump_around_label)
9075 emit_label (jump_around_label);
9076 return true;
9077 }
9078
9079 /* Expand cmpstrn or memcmp. */
9080
9081 bool
9082 ix86_expand_cmpstrn_or_cmpmem (rtx result, rtx src1, rtx src2,
9083 rtx length, rtx align, bool is_cmpstrn)
9084 {
9085 /* Expand strncmp and memcmp only with -minline-all-stringops since
9086 "repz cmpsb" can be much slower than strncmp and memcmp functions
9087 implemented with vector instructions, see
9088
9089 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43052
9090 */
9091 if (!TARGET_INLINE_ALL_STRINGOPS)
9092 return false;
9093
9094 /* Can't use this if the user has appropriated ecx, esi or edi. */
9095 if (fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG])
9096 return false;
9097
9098 if (is_cmpstrn)
9099 {
9100 /* For strncmp, length is the maximum length, which can be larger
9101 than actual string lengths. We can expand the cmpstrn pattern
9102 to "repz cmpsb" only if one of the strings is a constant so
9103 that expand_builtin_strncmp() can write the length argument to
9104 be the minimum of the const string length and the actual length
9105 argument. Otherwise, "repz cmpsb" may go past the terminating zero byte. */
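/* For example (illustration only), strncmp (s, "abc", n) can be expanded
   here because expand_builtin_strncmp () can clamp the length to the
   constant string's length, whereas strncmp (s, t, n) with two
   non-constant strings cannot and is rejected below.  */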
9106 tree t1 = MEM_EXPR (src1);
9107 tree t2 = MEM_EXPR (src2);
9108 if (!((t1 && TREE_CODE (t1) == MEM_REF
9109 && TREE_CODE (TREE_OPERAND (t1, 0)) == ADDR_EXPR
9110 && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t1, 0), 0))
9111 == STRING_CST))
9112 || (t2 && TREE_CODE (t2) == MEM_REF
9113 && TREE_CODE (TREE_OPERAND (t2, 0)) == ADDR_EXPR
9114 && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t2, 0), 0))
9115 == STRING_CST))))
9116 return false;
9117 }
9118
9119 rtx addr1 = copy_addr_to_reg (XEXP (src1, 0));
9120 rtx addr2 = copy_addr_to_reg (XEXP (src2, 0));
9121 if (addr1 != XEXP (src1, 0))
9122 src1 = replace_equiv_address_nv (src1, addr1);
9123 if (addr2 != XEXP (src2, 0))
9124 src2 = replace_equiv_address_nv (src2, addr2);
9125
9126 /* NB: Make a copy of the data length to avoid changing the original
9127 data length by cmpstrnqi patterns. */
9128 length = ix86_zero_extend_to_Pmode (length);
9129 rtx lengthreg = gen_reg_rtx (Pmode);
9130 emit_move_insn (lengthreg, length);
9131
9132 /* If we are testing strict equality, we can use known alignment to
9133 good advantage. This may be possible with combine, particularly
9134 once cc0 is dead. */
9135 if (CONST_INT_P (length))
9136 {
9137 if (length == const0_rtx)
9138 {
9139 emit_move_insn (result, const0_rtx);
9140 return true;
9141 }
9142 emit_insn (gen_cmpstrnqi_nz_1 (addr1, addr2, lengthreg, align,
9143 src1, src2));
9144 }
9145 else
9146 {
9147 emit_insn (gen_cmp_1 (Pmode, lengthreg, lengthreg));
9148 emit_insn (gen_cmpstrnqi_1 (addr1, addr2, lengthreg, align,
9149 src1, src2));
9150 }
9151
9152 rtx out = gen_lowpart (QImode, result);
9153 emit_insn (gen_cmpintqi (out));
9154 emit_move_insn (result, gen_rtx_SIGN_EXTEND (SImode, out));
9155
9156 return true;
9157 }
9158
9159 /* Expand the appropriate insns for doing strlen if not just doing
9160 repnz; scasb
9161
9162 out = result, initialized with the start address
9163 align_rtx = alignment of the address.
9164 scratch = scratch register, initialized with the start address when
9165 not aligned, otherwise undefined
9166
9167 This is just the body. It needs the initializations mentioned above and
9168 some address computing at the end. These things are done in i386.md. */
9169
9170 static void
9171 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
9172 {
9173 int align;
9174 rtx tmp;
9175 rtx_code_label *align_2_label = NULL;
9176 rtx_code_label *align_3_label = NULL;
9177 rtx_code_label *align_4_label = gen_label_rtx ();
9178 rtx_code_label *end_0_label = gen_label_rtx ();
9179 rtx mem;
9180 rtx tmpreg = gen_reg_rtx (SImode);
9181 rtx scratch = gen_reg_rtx (SImode);
9182 rtx cmp;
9183
9184 align = 0;
9185 if (CONST_INT_P (align_rtx))
9186 align = INTVAL (align_rtx);
9187
9188 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
9189
9190 /* Is there a known alignment and is it less than 4? */
9191 if (align < 4)
9192 {
9193 rtx scratch1 = gen_reg_rtx (Pmode);
9194 emit_move_insn (scratch1, out);
9195 /* Is there a known alignment and is it not 2? */
9196 if (align != 2)
9197 {
9198 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
9199 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
9200
9201 /* Leave just the 3 lower bits. */
9202 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
9203 NULL_RTX, 0, OPTAB_WIDEN);
9204
9205 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
9206 Pmode, 1, align_4_label);
9207 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
9208 Pmode, 1, align_2_label);
9209 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
9210 Pmode, 1, align_3_label);
9211 }
9212 else
9213 {
9214 /* Since the alignment is 2, we have to check 2 or 0 bytes;
9215 check whether it is aligned to 4 bytes. */
9216
9217 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
9218 NULL_RTX, 0, OPTAB_WIDEN);
9219
9220 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
9221 Pmode, 1, align_4_label);
9222 }
9223
9224 mem = change_address (src, QImode, out);
9225
9226 /* Now compare the bytes. */
9227
9228 /* Compare the first n unaligned bytes on a byte-per-byte basis. */
9229 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
9230 QImode, 1, end_0_label);
9231
9232 /* Increment the address. */
9233 emit_insn (gen_add2_insn (out, const1_rtx));
9234
9235 /* Not needed with an alignment of 2 */
9236 if (align != 2)
9237 {
9238 emit_label (align_2_label);
9239
9240 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
9241 end_0_label);
9242
9243 emit_insn (gen_add2_insn (out, const1_rtx));
9244
9245 emit_label (align_3_label);
9246 }
9247
9248 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
9249 end_0_label);
9250
9251 emit_insn (gen_add2_insn (out, const1_rtx));
9252 }
9253
9254 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
9255 align this loop; it only makes the program larger and does not help
9256 to speed it up. */
9257 emit_label (align_4_label);
9258
9259 mem = change_address (src, SImode, out);
9260 emit_move_insn (scratch, mem);
9261 emit_insn (gen_add2_insn (out, GEN_INT (4)));
9262
9263 /* This formula yields a nonzero result iff one of the bytes is zero.
9264 This saves three branches inside the loop and many cycles. */
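/* An illustrative C sketch of the test built below (not the emitted RTL),
   assuming 32-bit wraparound arithmetic:
     has_zero = (x - 0x01010101) & ~x & 0x80808080;
   the 0x80 bit of a byte survives all three operations only when the
   corresponding byte of X is zero.  */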
9265
9266 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
9267 emit_insn (gen_one_cmplsi2 (scratch, scratch));
9268 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
9269 emit_insn (gen_andsi3 (tmpreg, tmpreg,
9270 gen_int_mode (0x80808080, SImode)));
9271 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
9272 align_4_label);
9273
9274 if (TARGET_CMOVE)
9275 {
9276 rtx reg = gen_reg_rtx (SImode);
9277 rtx reg2 = gen_reg_rtx (Pmode);
9278 emit_move_insn (reg, tmpreg);
9279 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
9280
9281 /* If zero is not in the first two bytes, move two bytes forward. */
9282 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
9283 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
9284 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
9285 emit_insn (gen_rtx_SET (tmpreg,
9286 gen_rtx_IF_THEN_ELSE (SImode, tmp,
9287 reg,
9288 tmpreg)));
9289 /* Emit lea manually to avoid clobbering of flags. */
9290 emit_insn (gen_rtx_SET (reg2, plus_constant (Pmode, out, 2)));
9291
9292 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
9293 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
9294 emit_insn (gen_rtx_SET (out,
9295 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
9296 reg2,
9297 out)));
9298 }
9299 else
9300 {
9301 rtx_code_label *end_2_label = gen_label_rtx ();
9302 /* Is zero in the first two bytes? */
9303
9304 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
9305 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
9306 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
9307 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
9308 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
9309 pc_rtx);
9310 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
9311 JUMP_LABEL (tmp) = end_2_label;
9312
9313 /* Not in the first two. Move two bytes forward. */
9314 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
9315 emit_insn (gen_add2_insn (out, const2_rtx));
9316
9317 emit_label (end_2_label);
9318
9319 }
9320
9321 /* Avoid branch in fixing the byte. */
9322 tmpreg = gen_lowpart (QImode, tmpreg);
9323 emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
9324 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
9325 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
9326 emit_insn (gen_sub3_carry (Pmode, out, out, GEN_INT (3), tmp, cmp));
9327
9328 emit_label (end_0_label);
9329 }
9330
9331 /* Expand strlen. */
9332
9333 bool
9334 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
9335 {
9336 if (TARGET_UNROLL_STRLEN
9337 && TARGET_INLINE_ALL_STRINGOPS
9338 && eoschar == const0_rtx
9339 && optimize > 1)
9340 {
9341 /* The generic case of the strlen expander is long. Avoid
9342 expanding it unless TARGET_INLINE_ALL_STRINGOPS. */
9343 rtx addr = force_reg (Pmode, XEXP (src, 0));
9344 /* Well, it seems that some optimizer does not combine a call like
9345 foo(strlen(bar), strlen(bar));
9346 when the move and the subtraction are done here. It does calculate
9347 the length just once when these instructions are done inside of
9348 output_strlen_unroll(). But I think that since &bar[strlen(bar)] is
9349 often used and I use one fewer register for the lifetime of
9350 output_strlen_unroll(), this is better. */
9351
9352 emit_move_insn (out, addr);
9353
9354 ix86_expand_strlensi_unroll_1 (out, src, align);
9355
9356 /* strlensi_unroll_1 returns the address of the zero at the end of
9357 the string, like memchr(), so compute the length by subtracting
9358 the start address. */
9359 emit_insn (gen_sub2_insn (out, addr));
9360 return true;
9361 }
9362 else
9363 return false;
9364 }
9365
9366 /* For a given symbol (function) construct code to compute the address of its
9367 PLT entry in the large x86-64 PIC model. */
9368
9369 static rtx
9370 construct_plt_address (rtx symbol)
9371 {
9372 rtx tmp, unspec;
9373
9374 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
9375 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
9376 gcc_assert (Pmode == DImode);
9377
9378 tmp = gen_reg_rtx (Pmode);
9379 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
9380
9381 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
9382 emit_insn (gen_add2_insn (tmp, pic_offset_table_rtx));
9383 return tmp;
9384 }
9385
9386 /* Additional registers that are clobbered by SYSV calls. */
9387
9388 static int const x86_64_ms_sysv_extra_clobbered_registers
9389 [NUM_X86_64_MS_CLOBBERED_REGS] =
9390 {
9391 SI_REG, DI_REG,
9392 XMM6_REG, XMM7_REG,
9393 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
9394 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
9395 };
9396
9397 rtx_insn *
9398 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
9399 rtx callarg2,
9400 rtx pop, bool sibcall)
9401 {
9402 rtx vec[3];
9403 rtx use = NULL, call;
9404 unsigned int vec_len = 0;
9405 tree fndecl;
9406
9407 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
9408 {
9409 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
9410 if (fndecl
9411 && (lookup_attribute ("interrupt",
9412 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
9413 error ("interrupt service routine cannot be called directly");
9414 }
9415 else
9416 fndecl = NULL_TREE;
9417
9418 if (pop == const0_rtx)
9419 pop = NULL;
9420 gcc_assert (!TARGET_64BIT || !pop);
9421
9422 rtx addr = XEXP (fnaddr, 0);
9423 if (TARGET_MACHO && !TARGET_64BIT)
9424 {
9425 #if TARGET_MACHO
9426 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
9427 fnaddr = machopic_indirect_call_target (fnaddr);
9428 #endif
9429 }
9430 else
9431 {
9432       /* Static functions and indirect calls don't need the PIC register.
9433 	 Also, check if the PLT was explicitly avoided via no-plt or the
9434 	 "noplt" attribute, making it an indirect call.  */
9435 if (flag_pic
9436 && GET_CODE (addr) == SYMBOL_REF
9437 && ix86_call_use_plt_p (addr))
9438 {
9439 if (flag_plt
9440 && (SYMBOL_REF_DECL (addr) == NULL_TREE
9441 || !lookup_attribute ("noplt",
9442 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
9443 {
9444 if (!TARGET_64BIT
9445 || (ix86_cmodel == CM_LARGE_PIC
9446 && DEFAULT_ABI != MS_ABI))
9447 {
9448 use_reg (&use, gen_rtx_REG (Pmode,
9449 REAL_PIC_OFFSET_TABLE_REGNUM));
9450 if (ix86_use_pseudo_pic_reg ())
9451 emit_move_insn (gen_rtx_REG (Pmode,
9452 REAL_PIC_OFFSET_TABLE_REGNUM),
9453 pic_offset_table_rtx);
9454 }
9455 }
9456 else if (!TARGET_PECOFF && !TARGET_MACHO)
9457 {
9458 if (TARGET_64BIT
9459 && ix86_cmodel == CM_LARGE_PIC
9460 && DEFAULT_ABI != MS_ABI)
9461 {
9462 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
9463 UNSPEC_GOT);
9464 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
9465 fnaddr = force_reg (Pmode, fnaddr);
9466 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, fnaddr);
9467 }
9468 else if (TARGET_64BIT)
9469 {
9470 fnaddr = gen_rtx_UNSPEC (Pmode,
9471 gen_rtvec (1, addr),
9472 UNSPEC_GOTPCREL);
9473 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
9474 }
9475 else
9476 {
9477 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
9478 UNSPEC_GOT);
9479 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
9480 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
9481 fnaddr);
9482 }
9483 fnaddr = gen_const_mem (Pmode, fnaddr);
9484 /* Pmode may not be the same as word_mode for x32, which
9485 doesn't support indirect branch via 32-bit memory slot.
9486 Since x32 GOT slot is 64 bit with zero upper 32 bits,
9487 indirect branch via x32 GOT slot is OK. */
9488 if (GET_MODE (fnaddr) != word_mode)
9489 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
9490 fnaddr = gen_rtx_MEM (QImode, fnaddr);
9491 }
9492 }
9493 }
9494
9495 /* Skip setting up RAX register for -mskip-rax-setup when there are no
9496 parameters passed in vector registers. */
9497 if (TARGET_64BIT
9498 && (INTVAL (callarg2) > 0
9499 || (INTVAL (callarg2) == 0
9500 && (TARGET_SSE || !flag_skip_rax_setup))))
9501 {
9502 rtx al = gen_rtx_REG (QImode, AX_REG);
9503 emit_move_insn (al, callarg2);
9504 use_reg (&use, al);
9505 }
9506
9507 if (ix86_cmodel == CM_LARGE_PIC
9508 && !TARGET_PECOFF
9509 && MEM_P (fnaddr)
9510 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
9511 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
9512 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
9513 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
9514 branch via x32 GOT slot is OK. */
9515 else if (!(TARGET_X32
9516 && MEM_P (fnaddr)
9517 && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
9518 && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
9519 && (sibcall
9520 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
9521 : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
9522 {
9523 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
9524 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
9525 }
9526
9527   /* PR100665: HWASAN may tag a code pointer, which is not supported by
9528      LAM, so mask off code pointers here.
9529      TODO: also need to handle indirect jumps.  */
9530 if (ix86_memtag_can_tag_addresses () && !fndecl
9531 && sanitize_flags_p (SANITIZE_HWADDRESS))
9532 {
9533 rtx untagged_addr = ix86_memtag_untagged_pointer (XEXP (fnaddr, 0),
9534 NULL_RTX);
9535 fnaddr = gen_rtx_MEM (QImode, untagged_addr);
9536 }
9537
9538 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
9539
9540 if (retval)
9541 call = gen_rtx_SET (retval, call);
9542 vec[vec_len++] = call;
9543
9544 if (pop)
9545 {
9546 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
9547 pop = gen_rtx_SET (stack_pointer_rtx, pop);
9548 vec[vec_len++] = pop;
9549 }
9550
9551 if (cfun->machine->no_caller_saved_registers
9552 && (!fndecl
9553 || (!TREE_THIS_VOLATILE (fndecl)
9554 && !lookup_attribute ("no_caller_saved_registers",
9555 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
9556 {
9557 static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
9558 bool is_64bit_ms_abi = (TARGET_64BIT
9559 && ix86_function_abi (fndecl) == MS_ABI);
9560 char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
9561
9562 /* If there are no caller-saved registers, add all registers
9563 that are clobbered by the call which returns. */
9564 for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
9565 if (!fixed_regs[i]
9566 && (ix86_call_used_regs[i] == 1
9567 || (ix86_call_used_regs[i] & c_mask))
9568 && !STACK_REGNO_P (i)
9569 && !MMX_REGNO_P (i))
9570 clobber_reg (&use,
9571 gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
9572 }
9573 else if (TARGET_64BIT_MS_ABI
9574 && (!callarg2 || INTVAL (callarg2) != -2))
9575 {
9576 unsigned i;
9577
9578 for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
9579 {
9580 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
9581 machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
9582
9583 clobber_reg (&use, gen_rtx_REG (mode, regno));
9584 }
9585
9586 /* Set here, but it may get cleared later. */
9587 if (TARGET_CALL_MS2SYSV_XLOGUES)
9588 {
9589 if (!TARGET_SSE)
9590 ;
9591
9592 /* Don't break hot-patched functions. */
9593 else if (ix86_function_ms_hook_prologue (current_function_decl))
9594 ;
9595
9596 /* TODO: Cases not yet examined. */
9597 else if (flag_split_stack)
9598 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
9599
9600 else
9601 {
9602 gcc_assert (!reload_completed);
9603 cfun->machine->call_ms2sysv = true;
9604 }
9605 }
9606 }
9607
9608 if (TARGET_MACHO && TARGET_64BIT && !sibcall
9609 && ((GET_CODE (addr) == SYMBOL_REF && !SYMBOL_REF_LOCAL_P (addr))
9610 || !fndecl || TREE_PUBLIC (fndecl)))
9611 {
9612 /* We allow public functions defined in a TU to bind locally for PIC
9613 code (the default) on 64bit Mach-O.
9614 If such functions are not inlined, we cannot tell at compile-time if
9615 they will be called via the lazy symbol resolver (this can depend on
9616 options given at link-time). Therefore, we must assume that the lazy
9617 	 resolver could be used, which clobbers R11 and R10.  */
9618 clobber_reg (&use, gen_rtx_REG (DImode, R11_REG));
9619 clobber_reg (&use, gen_rtx_REG (DImode, R10_REG));
9620 }
9621
9622 if (vec_len > 1)
9623 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
9624 rtx_insn *call_insn = emit_call_insn (call);
9625 if (use)
9626 CALL_INSN_FUNCTION_USAGE (call_insn) = use;
9627
9628 return call_insn;
9629 }
9630
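/* A minimal sketch of the "interrupt" check near the top of
   ix86_expand_call (illustration only; the handler prototype is
   simplified here):

       void isr (void *frame) __attribute__ ((interrupt));

       void caller (void *frame)
       {
         isr (frame);   // error: interrupt service routine
       }                // cannot be called directly  */
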
9631 /* Split a simple return that pops POPC bytes from the stack into an
9632    indirect branch with a stack adjustment.  */
9633
9634 void
9635 ix86_split_simple_return_pop_internal (rtx popc)
9636 {
9637 struct machine_function *m = cfun->machine;
9638 rtx ecx = gen_rtx_REG (SImode, CX_REG);
9639 rtx_insn *insn;
9640
9641 /* There is no "pascal" calling convention in any 64bit ABI. */
9642 gcc_assert (!TARGET_64BIT);
9643
9644 insn = emit_insn (gen_pop (ecx));
9645 m->fs.cfa_offset -= UNITS_PER_WORD;
9646 m->fs.sp_offset -= UNITS_PER_WORD;
9647
9648 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
9649 x = gen_rtx_SET (stack_pointer_rtx, x);
9650 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
9651 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
9652 RTX_FRAME_RELATED_P (insn) = 1;
9653
9654 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc);
9655 x = gen_rtx_SET (stack_pointer_rtx, x);
9656 insn = emit_insn (x);
9657 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
9658 RTX_FRAME_RELATED_P (insn) = 1;
9659
9660 /* Now return address is in ECX. */
9661 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
9662 }
9663
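/* Illustrative sketch (an assumption about when this splitter fires): a
   32-bit callee-pop function like the one below normally returns with
   "ret $8"; when that ret cannot be used, e.g. under a return-thunk
   mitigation such as -mfunction-return=thunk-inline with -m32, it is
   split as above into popping the return address into %ecx, adjusting
   %esp, and branching indirectly through %ecx:

       int __attribute__ ((stdcall)) callee (int a, int b)
       {
         return a + b;
       }  */
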
9664 /* Errors in the source file can cause expand_expr to return const0_rtx
9665 where we expect a vector. To avoid crashing, use one of the vector
9666 clear instructions. */
9667
9668 static rtx
9669 safe_vector_operand (rtx x, machine_mode mode)
9670 {
9671 if (x == const0_rtx)
9672 x = CONST0_RTX (mode);
9673 return x;
9674 }
9675
9676 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
9677
9678 static rtx
9679 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
9680 {
9681 rtx pat;
9682 tree arg0 = CALL_EXPR_ARG (exp, 0);
9683 tree arg1 = CALL_EXPR_ARG (exp, 1);
9684 rtx op0 = expand_normal (arg0);
9685 rtx op1 = expand_normal (arg1);
9686 machine_mode tmode = insn_data[icode].operand[0].mode;
9687 machine_mode mode0 = insn_data[icode].operand[1].mode;
9688 machine_mode mode1 = insn_data[icode].operand[2].mode;
9689
9690 if (VECTOR_MODE_P (mode0))
9691 op0 = safe_vector_operand (op0, mode0);
9692 if (VECTOR_MODE_P (mode1))
9693 op1 = safe_vector_operand (op1, mode1);
9694
9695 if (optimize || !target
9696 || GET_MODE (target) != tmode
9697 || !insn_data[icode].operand[0].predicate (target, tmode))
9698 target = gen_reg_rtx (tmode);
9699
9700 if (GET_MODE (op1) == SImode && mode1 == TImode)
9701 {
9702 rtx x = gen_reg_rtx (V4SImode);
9703 emit_insn (gen_sse2_loadd (x, op1));
9704 op1 = gen_lowpart (TImode, x);
9705 }
9706
9707 if (!insn_data[icode].operand[1].predicate (op0, mode0))
9708 op0 = copy_to_mode_reg (mode0, op0);
9709 if (!insn_data[icode].operand[2].predicate (op1, mode1))
9710 op1 = copy_to_mode_reg (mode1, op1);
9711
9712 pat = GEN_FCN (icode) (target, op0, op1);
9713 if (! pat)
9714 return 0;
9715
9716 emit_insn (pat);
9717
9718 return target;
9719 }
9720
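/* Illustrative sketch (an assumption about which intrinsic maps here): a
   plain two-operand builtin such as __builtin_ia32_psadbw128, used by
   _mm_sad_epu8, is expanded through ix86_expand_binop_builtin:

       #include <emmintrin.h>

       __m128i sad (__m128i a, __m128i b)
       {
         return _mm_sad_epu8 (a, b);
       }  */
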
9721 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
9722
9723 static rtx
9724 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
9725 enum ix86_builtin_func_type m_type,
9726 enum rtx_code sub_code)
9727 {
9728 rtx pat;
9729 unsigned int i, nargs;
9730 bool comparison_p = false;
9731 bool tf_p = false;
9732 bool last_arg_constant = false;
9733 int num_memory = 0;
9734 rtx xops[4];
9735
9736 machine_mode tmode = insn_data[icode].operand[0].mode;
9737
9738 switch (m_type)
9739 {
9740 case MULTI_ARG_4_DF2_DI_I:
9741 case MULTI_ARG_4_DF2_DI_I1:
9742 case MULTI_ARG_4_SF2_SI_I:
9743 case MULTI_ARG_4_SF2_SI_I1:
9744 nargs = 4;
9745 last_arg_constant = true;
9746 break;
9747
9748 case MULTI_ARG_3_SF:
9749 case MULTI_ARG_3_DF:
9750 case MULTI_ARG_3_SF2:
9751 case MULTI_ARG_3_DF2:
9752 case MULTI_ARG_3_DI:
9753 case MULTI_ARG_3_SI:
9754 case MULTI_ARG_3_SI_DI:
9755 case MULTI_ARG_3_HI:
9756 case MULTI_ARG_3_HI_SI:
9757 case MULTI_ARG_3_QI:
9758 case MULTI_ARG_3_DI2:
9759 case MULTI_ARG_3_SI2:
9760 case MULTI_ARG_3_HI2:
9761 case MULTI_ARG_3_QI2:
9762 nargs = 3;
9763 break;
9764
9765 case MULTI_ARG_2_SF:
9766 case MULTI_ARG_2_DF:
9767 case MULTI_ARG_2_DI:
9768 case MULTI_ARG_2_SI:
9769 case MULTI_ARG_2_HI:
9770 case MULTI_ARG_2_QI:
9771 nargs = 2;
9772 break;
9773
9774 case MULTI_ARG_2_DI_IMM:
9775 case MULTI_ARG_2_SI_IMM:
9776 case MULTI_ARG_2_HI_IMM:
9777 case MULTI_ARG_2_QI_IMM:
9778 nargs = 2;
9779 last_arg_constant = true;
9780 break;
9781
9782 case MULTI_ARG_1_SF:
9783 case MULTI_ARG_1_DF:
9784 case MULTI_ARG_1_SF2:
9785 case MULTI_ARG_1_DF2:
9786 case MULTI_ARG_1_DI:
9787 case MULTI_ARG_1_SI:
9788 case MULTI_ARG_1_HI:
9789 case MULTI_ARG_1_QI:
9790 case MULTI_ARG_1_SI_DI:
9791 case MULTI_ARG_1_HI_DI:
9792 case MULTI_ARG_1_HI_SI:
9793 case MULTI_ARG_1_QI_DI:
9794 case MULTI_ARG_1_QI_SI:
9795 case MULTI_ARG_1_QI_HI:
9796 nargs = 1;
9797 break;
9798
9799 case MULTI_ARG_2_DI_CMP:
9800 case MULTI_ARG_2_SI_CMP:
9801 case MULTI_ARG_2_HI_CMP:
9802 case MULTI_ARG_2_QI_CMP:
9803 nargs = 2;
9804 comparison_p = true;
9805 break;
9806
9807 case MULTI_ARG_2_SF_TF:
9808 case MULTI_ARG_2_DF_TF:
9809 case MULTI_ARG_2_DI_TF:
9810 case MULTI_ARG_2_SI_TF:
9811 case MULTI_ARG_2_HI_TF:
9812 case MULTI_ARG_2_QI_TF:
9813 nargs = 2;
9814 tf_p = true;
9815 break;
9816
9817 default:
9818 gcc_unreachable ();
9819 }
9820
9821 if (optimize || !target
9822 || GET_MODE (target) != tmode
9823 || !insn_data[icode].operand[0].predicate (target, tmode))
9824 target = gen_reg_rtx (tmode);
9825 else if (memory_operand (target, tmode))
9826 num_memory++;
9827
9828 gcc_assert (nargs <= ARRAY_SIZE (xops));
9829
9830 for (i = 0; i < nargs; i++)
9831 {
9832 tree arg = CALL_EXPR_ARG (exp, i);
9833 rtx op = expand_normal (arg);
9834 int adjust = (comparison_p) ? 1 : 0;
9835 machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
9836
9837 if (last_arg_constant && i == nargs - 1)
9838 {
9839 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
9840 {
9841 enum insn_code new_icode = icode;
9842 switch (icode)
9843 {
9844 case CODE_FOR_xop_vpermil2v2df3:
9845 case CODE_FOR_xop_vpermil2v4sf3:
9846 case CODE_FOR_xop_vpermil2v4df3:
9847 case CODE_FOR_xop_vpermil2v8sf3:
9848 error ("the last argument must be a 2-bit immediate");
9849 return gen_reg_rtx (tmode);
9850 case CODE_FOR_xop_rotlv2di3:
9851 new_icode = CODE_FOR_rotlv2di3;
9852 goto xop_rotl;
9853 case CODE_FOR_xop_rotlv4si3:
9854 new_icode = CODE_FOR_rotlv4si3;
9855 goto xop_rotl;
9856 case CODE_FOR_xop_rotlv8hi3:
9857 new_icode = CODE_FOR_rotlv8hi3;
9858 goto xop_rotl;
9859 case CODE_FOR_xop_rotlv16qi3:
9860 new_icode = CODE_FOR_rotlv16qi3;
9861 xop_rotl:
9862 if (CONST_INT_P (op))
9863 {
9864 int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
9865 op = GEN_INT (INTVAL (op) & mask);
9866 gcc_checking_assert
9867 (insn_data[icode].operand[i + 1].predicate (op, mode));
9868 }
9869 else
9870 {
9871 gcc_checking_assert
9872 (nargs == 2
9873 && insn_data[new_icode].operand[0].mode == tmode
9874 && insn_data[new_icode].operand[1].mode == tmode
9875 && insn_data[new_icode].operand[2].mode == mode
9876 && insn_data[new_icode].operand[0].predicate
9877 == insn_data[icode].operand[0].predicate
9878 && insn_data[new_icode].operand[1].predicate
9879 == insn_data[icode].operand[1].predicate);
9880 icode = new_icode;
9881 goto non_constant;
9882 }
9883 break;
9884 default:
9885 gcc_unreachable ();
9886 }
9887 }
9888 }
9889 else
9890 {
9891 non_constant:
9892 if (VECTOR_MODE_P (mode))
9893 op = safe_vector_operand (op, mode);
9894
9895 /* If we aren't optimizing, only allow one memory operand to be
9896 generated. */
9897 if (memory_operand (op, mode))
9898 num_memory++;
9899
9900 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
9901
9902 if (optimize
9903 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
9904 || num_memory > 1)
9905 op = force_reg (mode, op);
9906 }
9907
9908 xops[i] = op;
9909 }
9910
9911 switch (nargs)
9912 {
9913 case 1:
9914 pat = GEN_FCN (icode) (target, xops[0]);
9915 break;
9916
9917 case 2:
9918 if (tf_p)
9919 pat = GEN_FCN (icode) (target, xops[0], xops[1],
9920 GEN_INT ((int)sub_code));
9921 else if (! comparison_p)
9922 pat = GEN_FCN (icode) (target, xops[0], xops[1]);
9923 else
9924 {
9925 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
9926 xops[0], xops[1]);
9927
9928 pat = GEN_FCN (icode) (target, cmp_op, xops[0], xops[1]);
9929 }
9930 break;
9931
9932 case 3:
9933 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
9934 break;
9935
9936 case 4:
9937 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2], xops[3]);
9938 break;
9939
9940 default:
9941 gcc_unreachable ();
9942 }
9943
9944 if (! pat)
9945 return 0;
9946
9947 emit_insn (pat);
9948 return target;
9949 }
9950
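/* Illustrative sketch (an assumption): the XOP rotate-by-immediate path
   above (CODE_FOR_xop_rotlv4si3 etc.) is what _mm_roti_epi32 from
   <xopintrin.h> reaches when built with -mxop; constant rotate counts
   are masked to the element width, and non-constant counts fall back to
   the generic rotate patterns:

       #include <x86intrin.h>

       __m128i rot5 (__m128i x)
       {
         return _mm_roti_epi32 (x, 5);
       }  */
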
9951 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
9952 insns with vec_merge. */
9953
9954 static rtx
9955 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
9956 rtx target)
9957 {
9958 rtx pat;
9959 tree arg0 = CALL_EXPR_ARG (exp, 0);
9960 rtx op1, op0 = expand_normal (arg0);
9961 machine_mode tmode = insn_data[icode].operand[0].mode;
9962 machine_mode mode0 = insn_data[icode].operand[1].mode;
9963
9964 if (optimize || !target
9965 || GET_MODE (target) != tmode
9966 || !insn_data[icode].operand[0].predicate (target, tmode))
9967 target = gen_reg_rtx (tmode);
9968
9969 if (VECTOR_MODE_P (mode0))
9970 op0 = safe_vector_operand (op0, mode0);
9971
9972 if ((optimize && !register_operand (op0, mode0))
9973 || !insn_data[icode].operand[1].predicate (op0, mode0))
9974 op0 = copy_to_mode_reg (mode0, op0);
9975
9976 op1 = op0;
9977 if (!insn_data[icode].operand[2].predicate (op1, mode0))
9978 op1 = copy_to_mode_reg (mode0, op1);
9979
9980 pat = GEN_FCN (icode) (target, op0, op1);
9981 if (! pat)
9982 return 0;
9983 emit_insn (pat);
9984 return target;
9985 }
9986
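/* Illustrative sketch (an assumption about the intrinsic mapping): a
   scalar vec_merge unop builtin such as __builtin_ia32_rcpss, used by
   _mm_rcp_ss, goes through the expander above; the single input serves
   both as the operand and as the merge source for the untouched upper
   elements:

       #include <xmmintrin.h>

       __m128 recip_low (__m128 x)
       {
         return _mm_rcp_ss (x);
       }  */
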
9987 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
9988
9989 static rtx
9990 ix86_expand_sse_compare (const struct builtin_description *d,
9991 tree exp, rtx target, bool swap)
9992 {
9993 rtx pat;
9994 tree arg0 = CALL_EXPR_ARG (exp, 0);
9995 tree arg1 = CALL_EXPR_ARG (exp, 1);
9996 rtx op0 = expand_normal (arg0);
9997 rtx op1 = expand_normal (arg1);
9998 rtx op2;
9999 machine_mode tmode = insn_data[d->icode].operand[0].mode;
10000 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
10001 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
10002 enum rtx_code comparison = d->comparison;
10003
10004 if (VECTOR_MODE_P (mode0))
10005 op0 = safe_vector_operand (op0, mode0);
10006 if (VECTOR_MODE_P (mode1))
10007 op1 = safe_vector_operand (op1, mode1);
10008
10009 /* Swap operands if we have a comparison that isn't available in
10010 hardware. */
10011 if (swap)
10012 std::swap (op0, op1);
10013
10014 if (optimize || !target
10015 || GET_MODE (target) != tmode
10016 || !insn_data[d->icode].operand[0].predicate (target, tmode))
10017 target = gen_reg_rtx (tmode);
10018
10019 if ((optimize && !register_operand (op0, mode0))
10020 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
10021 op0 = copy_to_mode_reg (mode0, op0);
10022 if ((optimize && !register_operand (op1, mode1))
10023 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
10024 op1 = copy_to_mode_reg (mode1, op1);
10025
10026 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
10027 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
10028 if (! pat)
10029 return 0;
10030 emit_insn (pat);
10031 return target;
10032 }
10033
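/* Illustrative sketch (an assumption about the intrinsic mapping): the
   swap path above covers comparisons that have no direct hardware
   encoding, e.g. _mm_cmpgt_ps, which is emitted as CMPLTPS with the
   operands exchanged:

       #include <xmmintrin.h>

       __m128 greater (__m128 a, __m128 b)
       {
         return _mm_cmpgt_ps (a, b);
       }  */
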
10034 /* Subroutine of ix86_expand_sse_comi and ix86_expand_sse_comi_round to
10035    take care of ordered EQ or unordered NE by generating a PF jump.  */
10036
10037 static rtx
10038 ix86_ssecom_setcc (const enum rtx_code comparison,
10039 bool check_unordered, machine_mode mode,
10040 rtx set_dst, rtx target)
10041 {
10042
10043 rtx_code_label *label = NULL;
10044
10045   /* NB: For ordered EQ or unordered NE, checking ZF alone isn't
10046      sufficient with NaN operands.  */
10047 if (check_unordered)
10048 {
10049 gcc_assert (comparison == EQ || comparison == NE);
10050
10051 rtx flag = gen_rtx_REG (CCFPmode, FLAGS_REG);
10052 label = gen_label_rtx ();
10053 rtx tmp = gen_rtx_fmt_ee (UNORDERED, VOIDmode, flag, const0_rtx);
10054 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
10055 gen_rtx_LABEL_REF (VOIDmode, label),
10056 pc_rtx);
10057 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
10058 }
10059
10060   /* NB: Set CCFPmode and check a different CCmode which is a subset
10061      of CCFPmode.  */
10062 if (GET_MODE (set_dst) != mode)
10063 {
10064 gcc_assert (mode == CCAmode || mode == CCCmode
10065 || mode == CCOmode || mode == CCPmode
10066 || mode == CCSmode || mode == CCZmode);
10067 set_dst = gen_rtx_REG (mode, FLAGS_REG);
10068 }
10069
10070 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10071 gen_rtx_fmt_ee (comparison, QImode,
10072 set_dst,
10073 const0_rtx)));
10074
10075 if (label)
10076 emit_label (label);
10077
10078 return SUBREG_REG (target);
10079 }
10080
10081 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
10082
10083 static rtx
10084 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
10085 rtx target)
10086 {
10087 rtx pat, set_dst;
10088 tree arg0 = CALL_EXPR_ARG (exp, 0);
10089 tree arg1 = CALL_EXPR_ARG (exp, 1);
10090 rtx op0 = expand_normal (arg0);
10091 rtx op1 = expand_normal (arg1);
10092 enum insn_code icode = d->icode;
10093 const struct insn_data_d *insn_p = &insn_data[icode];
10094 machine_mode mode0 = insn_p->operand[0].mode;
10095 machine_mode mode1 = insn_p->operand[1].mode;
10096
10097 if (VECTOR_MODE_P (mode0))
10098 op0 = safe_vector_operand (op0, mode0);
10099 if (VECTOR_MODE_P (mode1))
10100 op1 = safe_vector_operand (op1, mode1);
10101
10102 enum rtx_code comparison = d->comparison;
10103 rtx const_val = const0_rtx;
10104
10105 bool check_unordered = false;
10106 machine_mode mode = CCFPmode;
10107 switch (comparison)
10108 {
10109 case LE: /* -> GE */
10110 case LT: /* -> GT */
10111 std::swap (op0, op1);
10112 comparison = swap_condition (comparison);
10113 /* FALLTHRU */
10114 case GT:
10115 case GE:
10116 break;
10117 case EQ:
10118 check_unordered = true;
10119 mode = CCZmode;
10120 break;
10121 case NE:
10122 check_unordered = true;
10123 mode = CCZmode;
10124 const_val = const1_rtx;
10125 break;
10126 default:
10127 gcc_unreachable ();
10128 }
10129
10130 target = gen_reg_rtx (SImode);
10131 emit_move_insn (target, const_val);
10132 target = gen_rtx_SUBREG (QImode, target, 0);
10133
10134 if ((optimize && !register_operand (op0, mode0))
10135 || !insn_p->operand[0].predicate (op0, mode0))
10136 op0 = copy_to_mode_reg (mode0, op0);
10137 if ((optimize && !register_operand (op1, mode1))
10138 || !insn_p->operand[1].predicate (op1, mode1))
10139 op1 = copy_to_mode_reg (mode1, op1);
10140
10141 pat = GEN_FCN (icode) (op0, op1);
10142 if (! pat)
10143 return 0;
10144
10145 set_dst = SET_DEST (pat);
10146 emit_insn (pat);
10147 return ix86_ssecom_setcc (comparison, check_unordered, mode,
10148 set_dst, target);
10149 }
10150
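/* Illustrative sketch (an assumption about the intrinsic mapping): for an
   ordered comparison such as _mm_comieq_ss, the PF check above makes NaN
   inputs yield 0 rather than whatever ZF alone would suggest:

       #include <xmmintrin.h>

       int low_equal (__m128 a, __m128 b)
       {
         return _mm_comieq_ss (a, b);   // 0 if either low element is NaN
       }  */
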
10151 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
10152
10153 static rtx
10154 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
10155 rtx target)
10156 {
10157 rtx pat;
10158 tree arg0 = CALL_EXPR_ARG (exp, 0);
10159 rtx op1, op0 = expand_normal (arg0);
10160 machine_mode tmode = insn_data[d->icode].operand[0].mode;
10161 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
10162
10163 if (optimize || target == 0
10164 || GET_MODE (target) != tmode
10165 || !insn_data[d->icode].operand[0].predicate (target, tmode))
10166 target = gen_reg_rtx (tmode);
10167
10168 if (VECTOR_MODE_P (mode0))
10169 op0 = safe_vector_operand (op0, mode0);
10170
10171 if ((optimize && !register_operand (op0, mode0))
10172 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
10173 op0 = copy_to_mode_reg (mode0, op0);
10174
10175 op1 = GEN_INT (d->comparison);
10176
10177 pat = GEN_FCN (d->icode) (target, op0, op1);
10178 if (! pat)
10179 return 0;
10180 emit_insn (pat);
10181 return target;
10182 }
10183
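/* Illustrative sketch (an assumption about how these builtins are
   reached): ROUND builtins keep their rounding mode in d->comparison
   (e.g. ROUND_FLOOR), which becomes the immediate operand above; a
   builtin such as IX86_BUILTIN_FLOORPS may be reached, for example, when
   a loop like the one below is vectorized with -O2 -ffast-math -msse4.1
   and floorf is mapped to the packed variant:

       #include <math.h>

       void floor_all (float *a, int n)
       {
         for (int i = 0; i < n; i++)
           a[i] = floorf (a[i]);
       }  */
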
10184 static rtx
10185 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
10186 tree exp, rtx target)
10187 {
10188 rtx pat;
10189 tree arg0 = CALL_EXPR_ARG (exp, 0);
10190 tree arg1 = CALL_EXPR_ARG (exp, 1);
10191 rtx op0 = expand_normal (arg0);
10192 rtx op1 = expand_normal (arg1);
10193 rtx op2;
10194 machine_mode tmode = insn_data[d->icode].operand[0].mode;
10195 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
10196 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
10197
10198 if (optimize || target == 0
10199 || GET_MODE (target) != tmode
10200 || !insn_data[d->icode].operand[0].predicate (target, tmode))
10201 target = gen_reg_rtx (tmode);
10202
10203 op0 = safe_vector_operand (op0, mode0);
10204 op1 = safe_vector_operand (op1, mode1);
10205
10206 if ((optimize && !register_operand (op0, mode0))
10207 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
10208 op0 = copy_to_mode_reg (mode0, op0);
10209 if ((optimize && !register_operand (op1, mode1))
10210 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
10211 op1 = copy_to_mode_reg (mode1, op1);
10212
10213 op2 = GEN_INT (d->comparison);
10214
10215 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
10216 if (! pat)
10217 return 0;
10218 emit_insn (pat);
10219 return target;
10220 }
10221
10222 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
10223
10224 static rtx
10225 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
10226 rtx target)
10227 {
10228 rtx pat;
10229 tree arg0 = CALL_EXPR_ARG (exp, 0);
10230 tree arg1 = CALL_EXPR_ARG (exp, 1);
10231 rtx op0 = expand_normal (arg0);
10232 rtx op1 = expand_normal (arg1);
10233 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
10234 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
10235 enum rtx_code comparison = d->comparison;
10236
10237 /* ptest reg, reg sets the carry flag. */
10238 if (comparison == LTU
10239 && (d->code == IX86_BUILTIN_PTESTC
10240 || d->code == IX86_BUILTIN_PTESTC256)
10241 && rtx_equal_p (op0, op1))
10242 {
10243 if (!target)
10244 target = gen_reg_rtx (SImode);
10245 emit_move_insn (target, const1_rtx);
10246 return target;
10247 }
10248
10249 if (VECTOR_MODE_P (mode0))
10250 op0 = safe_vector_operand (op0, mode0);
10251 if (VECTOR_MODE_P (mode1))
10252 op1 = safe_vector_operand (op1, mode1);
10253
10254 target = gen_reg_rtx (SImode);
10255 emit_move_insn (target, const0_rtx);
10256 target = gen_rtx_SUBREG (QImode, target, 0);
10257
10258 if ((optimize && !register_operand (op0, mode0))
10259 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
10260 op0 = copy_to_mode_reg (mode0, op0);
10261 if ((optimize && !register_operand (op1, mode1))
10262 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
10263 op1 = copy_to_mode_reg (mode1, op1);
10264
10265 pat = GEN_FCN (d->icode) (op0, op1);
10266 if (! pat)
10267 return 0;
10268 emit_insn (pat);
10269 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10270 gen_rtx_fmt_ee (comparison, QImode,
10271 SET_DEST (pat),
10272 const0_rtx)));
10273
10274 return SUBREG_REG (target);
10275 }
10276
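/* Illustrative sketch (an assumption about the intrinsic mapping): the
   early return above folds a PTESTC whose operands are identical, since
   "ptest x, x" always sets the carry flag:

       #include <smmintrin.h>

       int always_one (__m128i x)
       {
         return _mm_testc_si128 (x, x);   // folded to the constant 1
       }  */
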
10277 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
10278
10279 static rtx
10280 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
10281 tree exp, rtx target)
10282 {
10283 rtx pat;
10284 tree arg0 = CALL_EXPR_ARG (exp, 0);
10285 tree arg1 = CALL_EXPR_ARG (exp, 1);
10286 tree arg2 = CALL_EXPR_ARG (exp, 2);
10287 tree arg3 = CALL_EXPR_ARG (exp, 3);
10288 tree arg4 = CALL_EXPR_ARG (exp, 4);
10289 rtx scratch0, scratch1;
10290 rtx op0 = expand_normal (arg0);
10291 rtx op1 = expand_normal (arg1);
10292 rtx op2 = expand_normal (arg2);
10293 rtx op3 = expand_normal (arg3);
10294 rtx op4 = expand_normal (arg4);
10295 machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
10296
10297 tmode0 = insn_data[d->icode].operand[0].mode;
10298 tmode1 = insn_data[d->icode].operand[1].mode;
10299 modev2 = insn_data[d->icode].operand[2].mode;
10300 modei3 = insn_data[d->icode].operand[3].mode;
10301 modev4 = insn_data[d->icode].operand[4].mode;
10302 modei5 = insn_data[d->icode].operand[5].mode;
10303 modeimm = insn_data[d->icode].operand[6].mode;
10304
10305 if (VECTOR_MODE_P (modev2))
10306 op0 = safe_vector_operand (op0, modev2);
10307 if (VECTOR_MODE_P (modev4))
10308 op2 = safe_vector_operand (op2, modev4);
10309
10310 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
10311 op0 = copy_to_mode_reg (modev2, op0);
10312 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
10313 op1 = copy_to_mode_reg (modei3, op1);
10314 if ((optimize && !register_operand (op2, modev4))
10315 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
10316 op2 = copy_to_mode_reg (modev4, op2);
10317 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
10318 op3 = copy_to_mode_reg (modei5, op3);
10319
10320 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
10321 {
10322 error ("the fifth argument must be an 8-bit immediate");
10323 return const0_rtx;
10324 }
10325
10326 if (d->code == IX86_BUILTIN_PCMPESTRI128)
10327 {
10328 if (optimize || !target
10329 || GET_MODE (target) != tmode0
10330 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
10331 target = gen_reg_rtx (tmode0);
10332
10333 scratch1 = gen_reg_rtx (tmode1);
10334
10335 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
10336 }
10337 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
10338 {
10339 if (optimize || !target
10340 || GET_MODE (target) != tmode1
10341 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
10342 target = gen_reg_rtx (tmode1);
10343
10344 scratch0 = gen_reg_rtx (tmode0);
10345
10346 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
10347 }
10348 else
10349 {
10350 gcc_assert (d->flag);
10351
10352 scratch0 = gen_reg_rtx (tmode0);
10353 scratch1 = gen_reg_rtx (tmode1);
10354
10355 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
10356 }
10357
10358 if (! pat)
10359 return 0;
10360
10361 emit_insn (pat);
10362
10363 if (d->flag)
10364 {
10365 target = gen_reg_rtx (SImode);
10366 emit_move_insn (target, const0_rtx);
10367 target = gen_rtx_SUBREG (QImode, target, 0);
10368
10369 emit_insn
10370 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10371 gen_rtx_fmt_ee (EQ, QImode,
10372 gen_rtx_REG ((machine_mode) d->flag,
10373 FLAGS_REG),
10374 const0_rtx)));
10375 return SUBREG_REG (target);
10376 }
10377 else
10378 return target;
10379 }
10380
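/* Illustrative sketch (an assumption about the intrinsic mapping): the
   PCMPESTR builtins demand a literal control byte, which is why a
   non-constant fifth argument triggers the diagnostic above:

       #include <nmmintrin.h>

       int first_match (__m128i needle, int nlen, __m128i hay, int hlen)
       {
         return _mm_cmpestri (needle, nlen, hay, hlen,
                              _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY);
       }  */
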
10381
10382 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
10383
10384 static rtx
10385 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
10386 tree exp, rtx target)
10387 {
10388 rtx pat;
10389 tree arg0 = CALL_EXPR_ARG (exp, 0);
10390 tree arg1 = CALL_EXPR_ARG (exp, 1);
10391 tree arg2 = CALL_EXPR_ARG (exp, 2);
10392 rtx scratch0, scratch1;
10393 rtx op0 = expand_normal (arg0);
10394 rtx op1 = expand_normal (arg1);
10395 rtx op2 = expand_normal (arg2);
10396 machine_mode tmode0, tmode1, modev2, modev3, modeimm;
10397
10398 tmode0 = insn_data[d->icode].operand[0].mode;
10399 tmode1 = insn_data[d->icode].operand[1].mode;
10400 modev2 = insn_data[d->icode].operand[2].mode;
10401 modev3 = insn_data[d->icode].operand[3].mode;
10402 modeimm = insn_data[d->icode].operand[4].mode;
10403
10404 if (VECTOR_MODE_P (modev2))
10405 op0 = safe_vector_operand (op0, modev2);
10406 if (VECTOR_MODE_P (modev3))
10407 op1 = safe_vector_operand (op1, modev3);
10408
10409 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
10410 op0 = copy_to_mode_reg (modev2, op0);
10411 if ((optimize && !register_operand (op1, modev3))
10412 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
10413 op1 = copy_to_mode_reg (modev3, op1);
10414
10415 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
10416 {
10417 error ("the third argument must be an 8-bit immediate");
10418 return const0_rtx;
10419 }
10420
10421 if (d->code == IX86_BUILTIN_PCMPISTRI128)
10422 {
10423 if (optimize || !target
10424 || GET_MODE (target) != tmode0
10425 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
10426 target = gen_reg_rtx (tmode0);
10427
10428 scratch1 = gen_reg_rtx (tmode1);
10429
10430 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
10431 }
10432 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
10433 {
10434 if (optimize || !target
10435 || GET_MODE (target) != tmode1
10436 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
10437 target = gen_reg_rtx (tmode1);
10438
10439 scratch0 = gen_reg_rtx (tmode0);
10440
10441 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
10442 }
10443 else
10444 {
10445 gcc_assert (d->flag);
10446
10447 scratch0 = gen_reg_rtx (tmode0);
10448 scratch1 = gen_reg_rtx (tmode1);
10449
10450 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
10451 }
10452
10453 if (! pat)
10454 return 0;
10455
10456 emit_insn (pat);
10457
10458 if (d->flag)
10459 {
10460 target = gen_reg_rtx (SImode);
10461 emit_move_insn (target, const0_rtx);
10462 target = gen_rtx_SUBREG (QImode, target, 0);
10463
10464 emit_insn
10465 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10466 gen_rtx_fmt_ee (EQ, QImode,
10467 gen_rtx_REG ((machine_mode) d->flag,
10468 FLAGS_REG),
10469 const0_rtx)));
10470 return SUBREG_REG (target);
10471 }
10472 else
10473 return target;
10474 }
10475
10476 /* Fix up modeless constants to fit the required mode.  */
10477
10478 static rtx
10479 fixup_modeless_constant (rtx x, machine_mode mode)
10480 {
10481 if (GET_MODE (x) == VOIDmode)
10482 x = convert_to_mode (mode, x, 1);
10483 return x;
10484 }
10485
10486 /* Subroutine of ix86_expand_builtin to take care of insns with
10487    a variable number of operands.  */
10488
10489 static rtx
10490 ix86_expand_args_builtin (const struct builtin_description *d,
10491 tree exp, rtx target)
10492 {
10493 rtx pat, real_target;
10494 unsigned int i, nargs;
10495 unsigned int nargs_constant = 0;
10496 unsigned int mask_pos = 0;
10497 int num_memory = 0;
10498 rtx xops[6];
10499 bool second_arg_count = false;
10500 enum insn_code icode = d->icode;
10501 const struct insn_data_d *insn_p = &insn_data[icode];
10502 machine_mode tmode = insn_p->operand[0].mode;
10503 machine_mode rmode = VOIDmode;
10504 bool swap = false;
10505 enum rtx_code comparison = d->comparison;
10506
10507 switch ((enum ix86_builtin_func_type) d->flag)
10508 {
10509 case V2DF_FTYPE_V2DF_ROUND:
10510 case V4DF_FTYPE_V4DF_ROUND:
10511 case V8DF_FTYPE_V8DF_ROUND:
10512 case V4SF_FTYPE_V4SF_ROUND:
10513 case V8SF_FTYPE_V8SF_ROUND:
10514 case V16SF_FTYPE_V16SF_ROUND:
10515 case V8HF_FTYPE_V8HF_ROUND:
10516 case V16HF_FTYPE_V16HF_ROUND:
10517 case V32HF_FTYPE_V32HF_ROUND:
10518 case V4SI_FTYPE_V4SF_ROUND:
10519 case V8SI_FTYPE_V8SF_ROUND:
10520 case V16SI_FTYPE_V16SF_ROUND:
10521 return ix86_expand_sse_round (d, exp, target);
10522 case V4SI_FTYPE_V2DF_V2DF_ROUND:
10523 case V8SI_FTYPE_V4DF_V4DF_ROUND:
10524 case V16SI_FTYPE_V8DF_V8DF_ROUND:
10525 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
10526 case INT_FTYPE_V8SF_V8SF_PTEST:
10527 case INT_FTYPE_V4DI_V4DI_PTEST:
10528 case INT_FTYPE_V4DF_V4DF_PTEST:
10529 case INT_FTYPE_V4SF_V4SF_PTEST:
10530 case INT_FTYPE_V2DI_V2DI_PTEST:
10531 case INT_FTYPE_V2DF_V2DF_PTEST:
10532 return ix86_expand_sse_ptest (d, exp, target);
10533 case FLOAT128_FTYPE_FLOAT128:
10534 case FLOAT_FTYPE_FLOAT:
10535 case FLOAT_FTYPE_BFLOAT16:
10536 case INT_FTYPE_INT:
10537 case UINT_FTYPE_UINT:
10538 case UINT16_FTYPE_UINT16:
10539 case UINT64_FTYPE_INT:
10540 case UINT64_FTYPE_UINT64:
10541 case INT64_FTYPE_INT64:
10542 case INT64_FTYPE_V4SF:
10543 case INT64_FTYPE_V2DF:
10544 case INT_FTYPE_V16QI:
10545 case INT_FTYPE_V8QI:
10546 case INT_FTYPE_V8SF:
10547 case INT_FTYPE_V4DF:
10548 case INT_FTYPE_V4SF:
10549 case INT_FTYPE_V2DF:
10550 case INT_FTYPE_V32QI:
10551 case V16QI_FTYPE_V16QI:
10552 case V8SI_FTYPE_V8SF:
10553 case V8SI_FTYPE_V4SI:
10554 case V8HI_FTYPE_V8HI:
10555 case V8HI_FTYPE_V16QI:
10556 case V8QI_FTYPE_V8QI:
10557 case V8SF_FTYPE_V8SF:
10558 case V8SF_FTYPE_V8SI:
10559 case V8SF_FTYPE_V4SF:
10560 case V8SF_FTYPE_V8HI:
10561 case V4SI_FTYPE_V4SI:
10562 case V4SI_FTYPE_V16QI:
10563 case V4SI_FTYPE_V4SF:
10564 case V4SI_FTYPE_V8SI:
10565 case V4SI_FTYPE_V8HI:
10566 case V4SI_FTYPE_V4DF:
10567 case V4SI_FTYPE_V2DF:
10568 case V4HI_FTYPE_V4HI:
10569 case V4DF_FTYPE_V4DF:
10570 case V4DF_FTYPE_V4SI:
10571 case V4DF_FTYPE_V4SF:
10572 case V4DF_FTYPE_V2DF:
10573 case V4SF_FTYPE_V4SF:
10574 case V4SF_FTYPE_V4SI:
10575 case V4SF_FTYPE_V8SF:
10576 case V4SF_FTYPE_V4DF:
10577 case V4SF_FTYPE_V8HI:
10578 case V4SF_FTYPE_V2DF:
10579 case V2DI_FTYPE_V2DI:
10580 case V2DI_FTYPE_V16QI:
10581 case V2DI_FTYPE_V8HI:
10582 case V2DI_FTYPE_V4SI:
10583 case V2DF_FTYPE_V2DF:
10584 case V2DF_FTYPE_V4SI:
10585 case V2DF_FTYPE_V4DF:
10586 case V2DF_FTYPE_V4SF:
10587 case V2DF_FTYPE_V2SI:
10588 case V2SI_FTYPE_V2SI:
10589 case V2SI_FTYPE_V4SF:
10590 case V2SI_FTYPE_V2SF:
10591 case V2SI_FTYPE_V2DF:
10592 case V2SF_FTYPE_V2SF:
10593 case V2SF_FTYPE_V2SI:
10594 case V32QI_FTYPE_V32QI:
10595 case V32QI_FTYPE_V16QI:
10596 case V16HI_FTYPE_V16HI:
10597 case V16HI_FTYPE_V8HI:
10598 case V8SI_FTYPE_V8SI:
10599 case V16HI_FTYPE_V16QI:
10600 case V8SI_FTYPE_V16QI:
10601 case V4DI_FTYPE_V16QI:
10602 case V8SI_FTYPE_V8HI:
10603 case V4DI_FTYPE_V8HI:
10604 case V4DI_FTYPE_V4SI:
10605 case V4DI_FTYPE_V2DI:
10606 case UQI_FTYPE_UQI:
10607 case UHI_FTYPE_UHI:
10608 case USI_FTYPE_USI:
10609 case USI_FTYPE_UQI:
10610 case USI_FTYPE_UHI:
10611 case UDI_FTYPE_UDI:
10612 case UHI_FTYPE_V16QI:
10613 case USI_FTYPE_V32QI:
10614 case UDI_FTYPE_V64QI:
10615 case V16QI_FTYPE_UHI:
10616 case V32QI_FTYPE_USI:
10617 case V64QI_FTYPE_UDI:
10618 case V8HI_FTYPE_UQI:
10619 case V16HI_FTYPE_UHI:
10620 case V32HI_FTYPE_USI:
10621 case V4SI_FTYPE_UQI:
10622 case V8SI_FTYPE_UQI:
10623 case V4SI_FTYPE_UHI:
10624 case V8SI_FTYPE_UHI:
10625 case UQI_FTYPE_V8HI:
10626 case UHI_FTYPE_V16HI:
10627 case USI_FTYPE_V32HI:
10628 case UQI_FTYPE_V4SI:
10629 case UQI_FTYPE_V8SI:
10630 case UHI_FTYPE_V16SI:
10631 case UQI_FTYPE_V2DI:
10632 case UQI_FTYPE_V4DI:
10633 case UQI_FTYPE_V8DI:
10634 case V16SI_FTYPE_UHI:
10635 case V2DI_FTYPE_UQI:
10636 case V4DI_FTYPE_UQI:
10637 case V16SI_FTYPE_INT:
10638 case V16SF_FTYPE_V8SF:
10639 case V16SI_FTYPE_V8SI:
10640 case V16SF_FTYPE_V4SF:
10641 case V16SI_FTYPE_V4SI:
10642 case V16SI_FTYPE_V16SF:
10643 case V16SI_FTYPE_V16SI:
10644 case V64QI_FTYPE_V64QI:
10645 case V32HI_FTYPE_V32HI:
10646 case V16SF_FTYPE_V16SF:
10647 case V8DI_FTYPE_UQI:
10648 case V8DI_FTYPE_V8DI:
10649 case V8DF_FTYPE_V4DF:
10650 case V8DF_FTYPE_V2DF:
10651 case V8DF_FTYPE_V8DF:
10652 case V4DI_FTYPE_V4DI:
10653 case V16BF_FTYPE_V16SF:
10654 case V8BF_FTYPE_V8SF:
10655 case V8BF_FTYPE_V4SF:
10656 nargs = 1;
10657 break;
10658 case V4SF_FTYPE_V4SF_VEC_MERGE:
10659 case V2DF_FTYPE_V2DF_VEC_MERGE:
10660 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
10661 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
10662 case V16QI_FTYPE_V16QI_V16QI:
10663 case V16QI_FTYPE_V8HI_V8HI:
10664 case V16HF_FTYPE_V16HF_V16HF:
10665 case V16SF_FTYPE_V16SF_V16SF:
10666 case V8QI_FTYPE_V8QI_V8QI:
10667 case V8QI_FTYPE_V4HI_V4HI:
10668 case V8HI_FTYPE_V8HI_V8HI:
10669 case V8HI_FTYPE_V16QI_V16QI:
10670 case V8HI_FTYPE_V4SI_V4SI:
10671 case V8HF_FTYPE_V8HF_V8HF:
10672 case V8SF_FTYPE_V8SF_V8SF:
10673 case V8SF_FTYPE_V8SF_V8SI:
10674 case V8DF_FTYPE_V8DF_V8DF:
10675 case V4SI_FTYPE_V4SI_V4SI:
10676 case V4SI_FTYPE_V8HI_V8HI:
10677 case V4SI_FTYPE_V2DF_V2DF:
10678 case V4HI_FTYPE_V4HI_V4HI:
10679 case V4HI_FTYPE_V8QI_V8QI:
10680 case V4HI_FTYPE_V2SI_V2SI:
10681 case V4DF_FTYPE_V4DF_V4DF:
10682 case V4DF_FTYPE_V4DF_V4DI:
10683 case V4SF_FTYPE_V4SF_V4SF:
10684 case V4SF_FTYPE_V4SF_V4SI:
10685 case V4SF_FTYPE_V4SF_V2SI:
10686 case V4SF_FTYPE_V4SF_V2DF:
10687 case V4SF_FTYPE_V4SF_UINT:
10688 case V4SF_FTYPE_V4SF_DI:
10689 case V4SF_FTYPE_V4SF_SI:
10690 case V2DI_FTYPE_V2DI_V2DI:
10691 case V2DI_FTYPE_V16QI_V16QI:
10692 case V2DI_FTYPE_V4SI_V4SI:
10693 case V2DI_FTYPE_V2DI_V16QI:
10694 case V2SI_FTYPE_V2SI_V2SI:
10695 case V2SI_FTYPE_V4HI_V4HI:
10696 case V2SI_FTYPE_V2SF_V2SF:
10697 case V2DF_FTYPE_V2DF_V2DF:
10698 case V2DF_FTYPE_V2DF_V4SF:
10699 case V2DF_FTYPE_V2DF_V2DI:
10700 case V2DF_FTYPE_V2DF_DI:
10701 case V2DF_FTYPE_V2DF_SI:
10702 case V2DF_FTYPE_V2DF_UINT:
10703 case V2SF_FTYPE_V2SF_V2SF:
10704 case V1DI_FTYPE_V1DI_V1DI:
10705 case V1DI_FTYPE_V8QI_V8QI:
10706 case V1DI_FTYPE_V2SI_V2SI:
10707 case V32QI_FTYPE_V16HI_V16HI:
10708 case V16HI_FTYPE_V8SI_V8SI:
10709 case V64QI_FTYPE_V64QI_V64QI:
10710 case V32QI_FTYPE_V32QI_V32QI:
10711 case V16HI_FTYPE_V32QI_V32QI:
10712 case V16HI_FTYPE_V16HI_V16HI:
10713 case V8SI_FTYPE_V4DF_V4DF:
10714 case V8SI_FTYPE_V8SI_V8SI:
10715 case V8SI_FTYPE_V16HI_V16HI:
10716 case V4DI_FTYPE_V4DI_V4DI:
10717 case V4DI_FTYPE_V8SI_V8SI:
10718 case V4DI_FTYPE_V32QI_V32QI:
10719 case V8DI_FTYPE_V64QI_V64QI:
10720 if (comparison == UNKNOWN)
10721 return ix86_expand_binop_builtin (icode, exp, target);
10722 nargs = 2;
10723 break;
10724 case V4SF_FTYPE_V4SF_V4SF_SWAP:
10725 case V2DF_FTYPE_V2DF_V2DF_SWAP:
10726 gcc_assert (comparison != UNKNOWN);
10727 nargs = 2;
10728 swap = true;
10729 break;
10730 case V16HI_FTYPE_V16HI_V8HI_COUNT:
10731 case V16HI_FTYPE_V16HI_SI_COUNT:
10732 case V8SI_FTYPE_V8SI_V4SI_COUNT:
10733 case V8SI_FTYPE_V8SI_SI_COUNT:
10734 case V4DI_FTYPE_V4DI_V2DI_COUNT:
10735 case V4DI_FTYPE_V4DI_INT_COUNT:
10736 case V8HI_FTYPE_V8HI_V8HI_COUNT:
10737 case V8HI_FTYPE_V8HI_SI_COUNT:
10738 case V4SI_FTYPE_V4SI_V4SI_COUNT:
10739 case V4SI_FTYPE_V4SI_SI_COUNT:
10740 case V4HI_FTYPE_V4HI_V4HI_COUNT:
10741 case V4HI_FTYPE_V4HI_SI_COUNT:
10742 case V2DI_FTYPE_V2DI_V2DI_COUNT:
10743 case V2DI_FTYPE_V2DI_SI_COUNT:
10744 case V2SI_FTYPE_V2SI_V2SI_COUNT:
10745 case V2SI_FTYPE_V2SI_SI_COUNT:
10746 case V1DI_FTYPE_V1DI_V1DI_COUNT:
10747 case V1DI_FTYPE_V1DI_SI_COUNT:
10748 nargs = 2;
10749 second_arg_count = true;
10750 break;
10751 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
10752 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
10753 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
10754 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
10755 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
10756 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
10757 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
10758 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
10759 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
10760 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
10761 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
10762 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
10763 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
10764 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
10765 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
10766 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
10767 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
10768 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
10769 nargs = 4;
10770 second_arg_count = true;
10771 break;
10772 case UINT64_FTYPE_UINT64_UINT64:
10773 case UINT_FTYPE_UINT_UINT:
10774 case UINT_FTYPE_UINT_USHORT:
10775 case UINT_FTYPE_UINT_UCHAR:
10776 case UINT16_FTYPE_UINT16_INT:
10777 case UINT8_FTYPE_UINT8_INT:
10778 case UQI_FTYPE_UQI_UQI:
10779 case UHI_FTYPE_UHI_UHI:
10780 case USI_FTYPE_USI_USI:
10781 case UDI_FTYPE_UDI_UDI:
10782 case V16SI_FTYPE_V8DF_V8DF:
10783 case V32BF_FTYPE_V16SF_V16SF:
10784 case V16BF_FTYPE_V8SF_V8SF:
10785 case V8BF_FTYPE_V4SF_V4SF:
10786 case V16BF_FTYPE_V16SF_UHI:
10787 case V8BF_FTYPE_V8SF_UQI:
10788 case V8BF_FTYPE_V4SF_UQI:
10789 nargs = 2;
10790 break;
10791 case V2DI_FTYPE_V2DI_INT_CONVERT:
10792 nargs = 2;
10793 rmode = V1TImode;
10794 nargs_constant = 1;
10795 break;
10796 case V4DI_FTYPE_V4DI_INT_CONVERT:
10797 nargs = 2;
10798 rmode = V2TImode;
10799 nargs_constant = 1;
10800 break;
10801 case V8DI_FTYPE_V8DI_INT_CONVERT:
10802 nargs = 2;
10803 rmode = V4TImode;
10804 nargs_constant = 1;
10805 break;
10806 case V8HI_FTYPE_V8HI_INT:
10807 case V8HI_FTYPE_V8SF_INT:
10808 case V16HI_FTYPE_V16SF_INT:
10809 case V8HI_FTYPE_V4SF_INT:
10810 case V8SF_FTYPE_V8SF_INT:
10811 case V4SF_FTYPE_V16SF_INT:
10812 case V16SF_FTYPE_V16SF_INT:
10813 case V4SI_FTYPE_V4SI_INT:
10814 case V4SI_FTYPE_V8SI_INT:
10815 case V4HI_FTYPE_V4HI_INT:
10816 case V4DF_FTYPE_V4DF_INT:
10817 case V4DF_FTYPE_V8DF_INT:
10818 case V4SF_FTYPE_V4SF_INT:
10819 case V4SF_FTYPE_V8SF_INT:
10820 case V2DI_FTYPE_V2DI_INT:
10821 case V2DF_FTYPE_V2DF_INT:
10822 case V2DF_FTYPE_V4DF_INT:
10823 case V16HI_FTYPE_V16HI_INT:
10824 case V8SI_FTYPE_V8SI_INT:
10825 case V16SI_FTYPE_V16SI_INT:
10826 case V4SI_FTYPE_V16SI_INT:
10827 case V4DI_FTYPE_V4DI_INT:
10828 case V2DI_FTYPE_V4DI_INT:
10829 case V4DI_FTYPE_V8DI_INT:
10830 case UQI_FTYPE_UQI_UQI_CONST:
10831 case UHI_FTYPE_UHI_UQI:
10832 case USI_FTYPE_USI_UQI:
10833 case UDI_FTYPE_UDI_UQI:
10834 nargs = 2;
10835 nargs_constant = 1;
10836 break;
10837 case V16QI_FTYPE_V16QI_V16QI_V16QI:
10838 case V8SF_FTYPE_V8SF_V8SF_V8SF:
10839 case V4DF_FTYPE_V4DF_V4DF_V4DF:
10840 case V4SF_FTYPE_V4SF_V4SF_V4SF:
10841 case V2DF_FTYPE_V2DF_V2DF_V2DF:
10842 case V32QI_FTYPE_V32QI_V32QI_V32QI:
10843 case UHI_FTYPE_V16SI_V16SI_UHI:
10844 case UQI_FTYPE_V8DI_V8DI_UQI:
10845 case V16HI_FTYPE_V16SI_V16HI_UHI:
10846 case V16QI_FTYPE_V16SI_V16QI_UHI:
10847 case V16QI_FTYPE_V8DI_V16QI_UQI:
10848 case V32HF_FTYPE_V32HF_V32HF_USI:
10849 case V16SF_FTYPE_V16SF_V16SF_UHI:
10850 case V16SF_FTYPE_V4SF_V16SF_UHI:
10851 case V16SI_FTYPE_SI_V16SI_UHI:
10852 case V16SI_FTYPE_V16HI_V16SI_UHI:
10853 case V16SI_FTYPE_V16QI_V16SI_UHI:
10854 case V8SF_FTYPE_V4SF_V8SF_UQI:
10855 case V4DF_FTYPE_V2DF_V4DF_UQI:
10856 case V8SI_FTYPE_V4SI_V8SI_UQI:
10857 case V8SI_FTYPE_SI_V8SI_UQI:
10858 case V4SI_FTYPE_V4SI_V4SI_UQI:
10859 case V4SI_FTYPE_SI_V4SI_UQI:
10860 case V4DI_FTYPE_V2DI_V4DI_UQI:
10861 case V4DI_FTYPE_DI_V4DI_UQI:
10862 case V2DI_FTYPE_V2DI_V2DI_UQI:
10863 case V2DI_FTYPE_DI_V2DI_UQI:
10864 case V64QI_FTYPE_V64QI_V64QI_UDI:
10865 case V64QI_FTYPE_V16QI_V64QI_UDI:
10866 case V64QI_FTYPE_QI_V64QI_UDI:
10867 case V32QI_FTYPE_V32QI_V32QI_USI:
10868 case V32QI_FTYPE_V16QI_V32QI_USI:
10869 case V32QI_FTYPE_QI_V32QI_USI:
10870 case V16QI_FTYPE_V16QI_V16QI_UHI:
10871 case V16QI_FTYPE_QI_V16QI_UHI:
10872 case V32HI_FTYPE_V8HI_V32HI_USI:
10873 case V32HI_FTYPE_HI_V32HI_USI:
10874 case V16HI_FTYPE_V8HI_V16HI_UHI:
10875 case V16HI_FTYPE_HI_V16HI_UHI:
10876 case V8HI_FTYPE_V8HI_V8HI_UQI:
10877 case V8HI_FTYPE_HI_V8HI_UQI:
10878 case V16HF_FTYPE_V16HF_V16HF_UHI:
10879 case V8SF_FTYPE_V8HI_V8SF_UQI:
10880 case V4SF_FTYPE_V8HI_V4SF_UQI:
10881 case V8SI_FTYPE_V8HF_V8SI_UQI:
10882 case V8SF_FTYPE_V8HF_V8SF_UQI:
10883 case V8SI_FTYPE_V8SF_V8SI_UQI:
10884 case V4SI_FTYPE_V4SF_V4SI_UQI:
10885 case V4SI_FTYPE_V8HF_V4SI_UQI:
10886 case V4SF_FTYPE_V8HF_V4SF_UQI:
10887 case V4DI_FTYPE_V8HF_V4DI_UQI:
10888 case V4DI_FTYPE_V4SF_V4DI_UQI:
10889 case V2DI_FTYPE_V8HF_V2DI_UQI:
10890 case V2DI_FTYPE_V4SF_V2DI_UQI:
10891 case V8HF_FTYPE_V8HF_V8HF_UQI:
10892 case V8HF_FTYPE_V8HF_V8HF_V8HF:
10893 case V8HF_FTYPE_V8HI_V8HF_UQI:
10894 case V8HF_FTYPE_V8SI_V8HF_UQI:
10895 case V8HF_FTYPE_V8SF_V8HF_UQI:
10896 case V8HF_FTYPE_V4SI_V8HF_UQI:
10897 case V8HF_FTYPE_V4SF_V8HF_UQI:
10898 case V8HF_FTYPE_V4DI_V8HF_UQI:
10899 case V8HF_FTYPE_V4DF_V8HF_UQI:
10900 case V8HF_FTYPE_V2DI_V8HF_UQI:
10901 case V8HF_FTYPE_V2DF_V8HF_UQI:
10902 case V4SF_FTYPE_V4DI_V4SF_UQI:
10903 case V4SF_FTYPE_V2DI_V4SF_UQI:
10904 case V4DF_FTYPE_V4DI_V4DF_UQI:
10905 case V4DF_FTYPE_V8HF_V4DF_UQI:
10906 case V2DF_FTYPE_V8HF_V2DF_UQI:
10907 case V2DF_FTYPE_V2DI_V2DF_UQI:
10908 case V16QI_FTYPE_V8HI_V16QI_UQI:
10909 case V16QI_FTYPE_V16HI_V16QI_UHI:
10910 case V16QI_FTYPE_V4SI_V16QI_UQI:
10911 case V16QI_FTYPE_V8SI_V16QI_UQI:
10912 case V8HI_FTYPE_V8HF_V8HI_UQI:
10913 case V8HI_FTYPE_V4SI_V8HI_UQI:
10914 case V8HI_FTYPE_V8SI_V8HI_UQI:
10915 case V16QI_FTYPE_V2DI_V16QI_UQI:
10916 case V16QI_FTYPE_V4DI_V16QI_UQI:
10917 case V8HI_FTYPE_V2DI_V8HI_UQI:
10918 case V8HI_FTYPE_V4DI_V8HI_UQI:
10919 case V4SI_FTYPE_V2DI_V4SI_UQI:
10920 case V4SI_FTYPE_V4DI_V4SI_UQI:
10921 case V32QI_FTYPE_V32HI_V32QI_USI:
10922 case UHI_FTYPE_V16QI_V16QI_UHI:
10923 case USI_FTYPE_V32QI_V32QI_USI:
10924 case UDI_FTYPE_V64QI_V64QI_UDI:
10925 case UQI_FTYPE_V8HI_V8HI_UQI:
10926 case UHI_FTYPE_V16HI_V16HI_UHI:
10927 case USI_FTYPE_V32HI_V32HI_USI:
10928 case UQI_FTYPE_V4SI_V4SI_UQI:
10929 case UQI_FTYPE_V8SI_V8SI_UQI:
10930 case UQI_FTYPE_V2DI_V2DI_UQI:
10931 case UQI_FTYPE_V4DI_V4DI_UQI:
10932 case V4SF_FTYPE_V2DF_V4SF_UQI:
10933 case V4SF_FTYPE_V4DF_V4SF_UQI:
10934 case V16SI_FTYPE_V16SI_V16SI_UHI:
10935 case V16SI_FTYPE_V4SI_V16SI_UHI:
10936 case V2DI_FTYPE_V4SI_V2DI_UQI:
10937 case V2DI_FTYPE_V8HI_V2DI_UQI:
10938 case V2DI_FTYPE_V16QI_V2DI_UQI:
10939 case V4DI_FTYPE_V4DI_V4DI_UQI:
10940 case V4DI_FTYPE_V4SI_V4DI_UQI:
10941 case V4DI_FTYPE_V8HI_V4DI_UQI:
10942 case V4DI_FTYPE_V16QI_V4DI_UQI:
10943 case V4DI_FTYPE_V4DF_V4DI_UQI:
10944 case V2DI_FTYPE_V2DF_V2DI_UQI:
10945 case V4SI_FTYPE_V4DF_V4SI_UQI:
10946 case V4SI_FTYPE_V2DF_V4SI_UQI:
10947 case V4SI_FTYPE_V8HI_V4SI_UQI:
10948 case V4SI_FTYPE_V16QI_V4SI_UQI:
10949 case V4DI_FTYPE_V4DI_V4DI_V4DI:
10950 case V8DF_FTYPE_V2DF_V8DF_UQI:
10951 case V8DF_FTYPE_V4DF_V8DF_UQI:
10952 case V8DF_FTYPE_V8DF_V8DF_UQI:
10953 case V8SF_FTYPE_V8SF_V8SF_UQI:
10954 case V8SF_FTYPE_V8SI_V8SF_UQI:
10955 case V4DF_FTYPE_V4DF_V4DF_UQI:
10956 case V4SF_FTYPE_V4SF_V4SF_UQI:
10957 case V2DF_FTYPE_V2DF_V2DF_UQI:
10958 case V2DF_FTYPE_V4SF_V2DF_UQI:
10959 case V2DF_FTYPE_V4SI_V2DF_UQI:
10960 case V4SF_FTYPE_V4SI_V4SF_UQI:
10961 case V4DF_FTYPE_V4SF_V4DF_UQI:
10962 case V4DF_FTYPE_V4SI_V4DF_UQI:
10963 case V8SI_FTYPE_V8SI_V8SI_UQI:
10964 case V8SI_FTYPE_V8HI_V8SI_UQI:
10965 case V8SI_FTYPE_V16QI_V8SI_UQI:
10966 case V8DF_FTYPE_V8SI_V8DF_UQI:
10967 case V8DI_FTYPE_DI_V8DI_UQI:
10968 case V16SF_FTYPE_V8SF_V16SF_UHI:
10969 case V16SI_FTYPE_V8SI_V16SI_UHI:
10970 case V16HF_FTYPE_V16HI_V16HF_UHI:
10971 case V16HF_FTYPE_V16HF_V16HF_V16HF:
10972 case V16HI_FTYPE_V16HF_V16HI_UHI:
10973 case V16HI_FTYPE_V16HI_V16HI_UHI:
10974 case V8HI_FTYPE_V16QI_V8HI_UQI:
10975 case V16HI_FTYPE_V16QI_V16HI_UHI:
10976 case V32HI_FTYPE_V32HI_V32HI_USI:
10977 case V32HI_FTYPE_V32QI_V32HI_USI:
10978 case V8DI_FTYPE_V16QI_V8DI_UQI:
10979 case V8DI_FTYPE_V2DI_V8DI_UQI:
10980 case V8DI_FTYPE_V4DI_V8DI_UQI:
10981 case V8DI_FTYPE_V8DI_V8DI_UQI:
10982 case V8DI_FTYPE_V8HI_V8DI_UQI:
10983 case V8DI_FTYPE_V8SI_V8DI_UQI:
10984 case V8HI_FTYPE_V8DI_V8HI_UQI:
10985 case V8SI_FTYPE_V8DI_V8SI_UQI:
10986 case V4SI_FTYPE_V4SI_V4SI_V4SI:
10987 case V16SI_FTYPE_V16SI_V16SI_V16SI:
10988 case V8DI_FTYPE_V8DI_V8DI_V8DI:
10989 case V32HI_FTYPE_V32HI_V32HI_V32HI:
10990 case V2DI_FTYPE_V2DI_V2DI_V2DI:
10991 case V16HI_FTYPE_V16HI_V16HI_V16HI:
10992 case V8SI_FTYPE_V8SI_V8SI_V8SI:
10993 case V8HI_FTYPE_V8HI_V8HI_V8HI:
10994 case V32BF_FTYPE_V16SF_V16SF_USI:
10995 case V16BF_FTYPE_V8SF_V8SF_UHI:
10996 case V8BF_FTYPE_V4SF_V4SF_UQI:
10997 case V16BF_FTYPE_V16SF_V16BF_UHI:
10998 case V8BF_FTYPE_V8SF_V8BF_UQI:
10999 case V8BF_FTYPE_V4SF_V8BF_UQI:
11000 case V16SF_FTYPE_V16SF_V32BF_V32BF:
11001 case V8SF_FTYPE_V8SF_V16BF_V16BF:
11002 case V4SF_FTYPE_V4SF_V8BF_V8BF:
11003 nargs = 3;
11004 break;
11005 case V32QI_FTYPE_V32QI_V32QI_INT:
11006 case V16HI_FTYPE_V16HI_V16HI_INT:
11007 case V16QI_FTYPE_V16QI_V16QI_INT:
11008 case V4DI_FTYPE_V4DI_V4DI_INT:
11009 case V8HI_FTYPE_V8HI_V8HI_INT:
11010 case V8SI_FTYPE_V8SI_V8SI_INT:
11011 case V8SI_FTYPE_V8SI_V4SI_INT:
11012 case V8SF_FTYPE_V8SF_V8SF_INT:
11013 case V8SF_FTYPE_V8SF_V4SF_INT:
11014 case V4SI_FTYPE_V4SI_V4SI_INT:
11015 case V4DF_FTYPE_V4DF_V4DF_INT:
11016 case V16SF_FTYPE_V16SF_V16SF_INT:
11017 case V16SF_FTYPE_V16SF_V4SF_INT:
11018 case V16SI_FTYPE_V16SI_V4SI_INT:
11019 case V4DF_FTYPE_V4DF_V2DF_INT:
11020 case V4SF_FTYPE_V4SF_V4SF_INT:
11021 case V2DI_FTYPE_V2DI_V2DI_INT:
11022 case V4DI_FTYPE_V4DI_V2DI_INT:
11023 case V2DF_FTYPE_V2DF_V2DF_INT:
11024 case UQI_FTYPE_V8DI_V8UDI_INT:
11025 case UQI_FTYPE_V8DF_V8DF_INT:
11026 case UQI_FTYPE_V2DF_V2DF_INT:
11027 case UQI_FTYPE_V4SF_V4SF_INT:
11028 case UHI_FTYPE_V16SI_V16SI_INT:
11029 case UHI_FTYPE_V16SF_V16SF_INT:
11030 case V64QI_FTYPE_V64QI_V64QI_INT:
11031 case V32HI_FTYPE_V32HI_V32HI_INT:
11032 case V16SI_FTYPE_V16SI_V16SI_INT:
11033 case V8DI_FTYPE_V8DI_V8DI_INT:
11034 nargs = 3;
11035 nargs_constant = 1;
11036 break;
11037 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
11038 nargs = 3;
11039 rmode = V4DImode;
11040 nargs_constant = 1;
11041 break;
11042 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
11043 nargs = 3;
11044 rmode = V2DImode;
11045 nargs_constant = 1;
11046 break;
11047 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
11048 nargs = 3;
11049 rmode = DImode;
11050 nargs_constant = 1;
11051 break;
11052 case V2DI_FTYPE_V2DI_UINT_UINT:
11053 nargs = 3;
11054 nargs_constant = 2;
11055 break;
11056 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
11057 nargs = 3;
11058 rmode = V8DImode;
11059 nargs_constant = 1;
11060 break;
11061 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
11062 nargs = 5;
11063 rmode = V8DImode;
11064 mask_pos = 2;
11065 nargs_constant = 1;
11066 break;
11067 case QI_FTYPE_V8DF_INT_UQI:
11068 case QI_FTYPE_V4DF_INT_UQI:
11069 case QI_FTYPE_V2DF_INT_UQI:
11070 case HI_FTYPE_V16SF_INT_UHI:
11071 case QI_FTYPE_V8SF_INT_UQI:
11072 case QI_FTYPE_V4SF_INT_UQI:
11073 case QI_FTYPE_V8HF_INT_UQI:
11074 case HI_FTYPE_V16HF_INT_UHI:
11075 case SI_FTYPE_V32HF_INT_USI:
11076 case V4SI_FTYPE_V4SI_V4SI_UHI:
11077 case V8SI_FTYPE_V8SI_V8SI_UHI:
11078 nargs = 3;
11079 mask_pos = 1;
11080 nargs_constant = 1;
11081 break;
11082 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
11083 nargs = 5;
11084 rmode = V4DImode;
11085 mask_pos = 2;
11086 nargs_constant = 1;
11087 break;
11088 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
11089 nargs = 5;
11090 rmode = V2DImode;
11091 mask_pos = 2;
11092 nargs_constant = 1;
11093 break;
11094 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
11095 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
11096 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
11097 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
11098 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
11099 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
11100 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
11101 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
11102 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
11103 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
11104 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
11105 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
11106 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
11107 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
11108 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
11109 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
11110 case V32HF_FTYPE_V32HF_V32HF_V32HF_USI:
11111 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
11112 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
11113 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
11114 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
11115 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
11116 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
11117 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
11118 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
11119 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
11120 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
11121 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
11122 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
11123 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
11124 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
11125 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
11126 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
11127 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
11128 case V16HF_FTYPE_V16HF_V16HF_V16HF_UQI:
11129 case V16HF_FTYPE_V16HF_V16HF_V16HF_UHI:
11130 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
11131 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
11132 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
11133 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
11134 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
11135 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
11136 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
11137 case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI:
11138 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
11139 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
11140 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
11141 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
11142 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
11143 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
11144 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
11145 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
11146 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
11147 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
11148 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
11149 case V32BF_FTYPE_V16SF_V16SF_V32BF_USI:
11150 case V16BF_FTYPE_V8SF_V8SF_V16BF_UHI:
11151 case V8BF_FTYPE_V4SF_V4SF_V8BF_UQI:
11152 nargs = 4;
11153 break;
11154 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
11155 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
11156 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
11157 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
11158 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
11159 nargs = 4;
11160 nargs_constant = 1;
11161 break;
11162 case UQI_FTYPE_V4DI_V4DI_INT_UQI:
11163 case UQI_FTYPE_V8SI_V8SI_INT_UQI:
11164 case QI_FTYPE_V4DF_V4DF_INT_UQI:
11165 case QI_FTYPE_V8SF_V8SF_INT_UQI:
11166 case UHI_FTYPE_V16HF_V16HF_INT_UHI:
11167 case UQI_FTYPE_V2DI_V2DI_INT_UQI:
11168 case UQI_FTYPE_V4SI_V4SI_INT_UQI:
11169 case UQI_FTYPE_V2DF_V2DF_INT_UQI:
11170 case UQI_FTYPE_V4SF_V4SF_INT_UQI:
11171 case UQI_FTYPE_V8HF_V8HF_INT_UQI:
11172 case UDI_FTYPE_V64QI_V64QI_INT_UDI:
11173 case USI_FTYPE_V32QI_V32QI_INT_USI:
11174 case UHI_FTYPE_V16QI_V16QI_INT_UHI:
11175 case USI_FTYPE_V32HI_V32HI_INT_USI:
11176 case USI_FTYPE_V32HF_V32HF_INT_USI:
11177 case UHI_FTYPE_V16HI_V16HI_INT_UHI:
11178 case UQI_FTYPE_V8HI_V8HI_INT_UQI:
11179 nargs = 4;
11180 mask_pos = 1;
11181 nargs_constant = 1;
11182 break;
11183 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
11184 nargs = 4;
11185 nargs_constant = 2;
11186 break;
11187 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
11188 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
11189 case V16SF_FTYPE_V16SF_V32BF_V32BF_UHI:
11190 case V8SF_FTYPE_V8SF_V16BF_V16BF_UQI:
11191 case V4SF_FTYPE_V4SF_V8BF_V8BF_UQI:
11192 nargs = 4;
11193 break;
11194 case UQI_FTYPE_V8DI_V8DI_INT_UQI:
11195 case UHI_FTYPE_V16SI_V16SI_INT_UHI:
11196 mask_pos = 1;
11197 nargs = 4;
11198 nargs_constant = 1;
11199 break;
11200 case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
11201 case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
11202 case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
11203 case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
11204 case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
11205 case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
11206 case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
11207 case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
11208 case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
11209 case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
11210 case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
11211 case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
11212 case V32HI_FTYPE_V32HI_INT_V32HI_USI:
11213 case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
11214 case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
11215 case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
11216 case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
11217 case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
11218 case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
11219 case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
11220 case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
11221 case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
11222 case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
11223 case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
11224 case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
11225 case V16HF_FTYPE_V16HF_INT_V16HF_UHI:
11226 case V8HF_FTYPE_V8HF_INT_V8HF_UQI:
11227 case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
11228 case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
11229 case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
11230 case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
11231 case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
11232 nargs = 4;
11233 mask_pos = 2;
11234 nargs_constant = 1;
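/* For this group (e.g. V8SF_FTYPE_V8SF_INT_V8SF_UQI) the test
   nargs - i - mask_pos == nargs_constant below selects i == 1,
   i.e. the INT argument, as the immediate operand.  */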
11235 break;
11236 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
11237 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
11238 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
11239 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
11240 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
11241 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
11242 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
11243 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
11244 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
11245 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
11246 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
11247 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
11248 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
11249 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
11250 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
11251 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
11252 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
11253 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
11254 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
11255 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
11256 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
11257 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
11258 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
11259 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
11260 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
11261 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
11262 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
11263 nargs = 5;
11264 mask_pos = 2;
11265 nargs_constant = 1;
11266 break;
11267 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
11268 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
11269 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
11270 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
11271 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
11272 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
11273 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
11274 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
11275 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
11276 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
11277 nargs = 5;
11278 mask_pos = 1;
11279 nargs_constant = 1;
11280 break;
11281 case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
11282 case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
11283 case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
11284 case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT:
11285 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT:
11286 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT:
11287 case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT:
11288 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT:
11289 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT:
11290 case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT:
11291 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT:
11292 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT:
11293 nargs = 5;
11294 mask_pos = 1;
11295 nargs_constant = 2;
11296 break;
11297
11298 default:
11299 gcc_unreachable ();
11300 }
11301
11302 gcc_assert (nargs <= ARRAY_SIZE (xops));
11303
11304 if (comparison != UNKNOWN)
11305 {
11306 gcc_assert (nargs == 2);
11307 return ix86_expand_sse_compare (d, exp, target, swap);
11308 }
11309
11310 if (rmode == VOIDmode || rmode == tmode)
11311 {
11312 if (optimize
11313 || target == 0
11314 || GET_MODE (target) != tmode
11315 || !insn_p->operand[0].predicate (target, tmode))
11316 target = gen_reg_rtx (tmode);
11317 else if (memory_operand (target, tmode))
11318 num_memory++;
11319 real_target = target;
11320 }
11321 else
11322 {
11323 real_target = gen_reg_rtx (tmode);
11324 target = lowpart_subreg (rmode, real_target, tmode);
11325 }
11326
11327 for (i = 0; i < nargs; i++)
11328 {
11329 tree arg = CALL_EXPR_ARG (exp, i);
11330 rtx op = expand_normal (arg);
11331 machine_mode mode = insn_p->operand[i + 1].mode;
11332 bool match = insn_p->operand[i + 1].predicate (op, mode);
11333
11334 if (second_arg_count && i == 1)
11335 {
11336 /* SIMD shift insns take either an 8-bit immediate or a
11337 register as the count, but the builtin functions take an
11338 int as the count.  If the count doesn't match, put it in a
11339 register.  The instructions use a 64-bit count; if op is
11340 only 32-bit, zero-extend it, since negative shift counts
11341 are undefined behavior and zero extension is more
11342 efficient. */
11343 if (!match)
11344 {
11345 if (SCALAR_INT_MODE_P (GET_MODE (op)))
11346 op = convert_modes (mode, GET_MODE (op), op, 1);
11347 else
11348 op = lowpart_subreg (mode, op, GET_MODE (op));
11349 if (!insn_p->operand[i + 1].predicate (op, mode))
11350 op = copy_to_reg (op);
11351 }
11352 }
11353 else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
11354 (!mask_pos && (nargs - i) <= nargs_constant))
11355 {
11356 if (!match)
11357 switch (icode)
11358 {
11359 case CODE_FOR_avx_vinsertf128v4di:
11360 case CODE_FOR_avx_vextractf128v4di:
11361 error ("the last argument must be an 1-bit immediate");
11362 return const0_rtx;
11363
11364 case CODE_FOR_avx512f_cmpv8di3_mask:
11365 case CODE_FOR_avx512f_cmpv16si3_mask:
11366 case CODE_FOR_avx512f_ucmpv8di3_mask:
11367 case CODE_FOR_avx512f_ucmpv16si3_mask:
11368 case CODE_FOR_avx512vl_cmpv4di3_mask:
11369 case CODE_FOR_avx512vl_cmpv8si3_mask:
11370 case CODE_FOR_avx512vl_ucmpv4di3_mask:
11371 case CODE_FOR_avx512vl_ucmpv8si3_mask:
11372 case CODE_FOR_avx512vl_cmpv2di3_mask:
11373 case CODE_FOR_avx512vl_cmpv4si3_mask:
11374 case CODE_FOR_avx512vl_ucmpv2di3_mask:
11375 case CODE_FOR_avx512vl_ucmpv4si3_mask:
11376 error ("the last argument must be a 3-bit immediate");
11377 return const0_rtx;
11378
11379 case CODE_FOR_sse4_1_roundsd:
11380 case CODE_FOR_sse4_1_roundss:
11381
11382 case CODE_FOR_sse4_1_roundpd:
11383 case CODE_FOR_sse4_1_roundps:
11384 case CODE_FOR_avx_roundpd256:
11385 case CODE_FOR_avx_roundps256:
11386
11387 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
11388 case CODE_FOR_sse4_1_roundps_sfix:
11389 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
11390 case CODE_FOR_avx_roundps_sfix256:
11391
11392 case CODE_FOR_sse4_1_blendps:
11393 case CODE_FOR_avx_blendpd256:
11394 case CODE_FOR_avx_vpermilv4df:
11395 case CODE_FOR_avx_vpermilv4df_mask:
11396 case CODE_FOR_avx512f_getmantv8df_mask:
11397 case CODE_FOR_avx512f_getmantv16sf_mask:
11398 case CODE_FOR_avx512vl_getmantv16hf_mask:
11399 case CODE_FOR_avx512vl_getmantv8sf_mask:
11400 case CODE_FOR_avx512vl_getmantv4df_mask:
11401 case CODE_FOR_avx512fp16_getmantv8hf_mask:
11402 case CODE_FOR_avx512vl_getmantv4sf_mask:
11403 case CODE_FOR_avx512vl_getmantv2df_mask:
11404 case CODE_FOR_avx512dq_rangepv8df_mask_round:
11405 case CODE_FOR_avx512dq_rangepv16sf_mask_round:
11406 case CODE_FOR_avx512dq_rangepv4df_mask:
11407 case CODE_FOR_avx512dq_rangepv8sf_mask:
11408 case CODE_FOR_avx512dq_rangepv2df_mask:
11409 case CODE_FOR_avx512dq_rangepv4sf_mask:
11410 case CODE_FOR_avx_shufpd256_mask:
11411 error ("the last argument must be a 4-bit immediate");
11412 return const0_rtx;
11413
11414 case CODE_FOR_sha1rnds4:
11415 case CODE_FOR_sse4_1_blendpd:
11416 case CODE_FOR_avx_vpermilv2df:
11417 case CODE_FOR_avx_vpermilv2df_mask:
11418 case CODE_FOR_xop_vpermil2v2df3:
11419 case CODE_FOR_xop_vpermil2v4sf3:
11420 case CODE_FOR_xop_vpermil2v4df3:
11421 case CODE_FOR_xop_vpermil2v8sf3:
11422 case CODE_FOR_avx512f_vinsertf32x4_mask:
11423 case CODE_FOR_avx512f_vinserti32x4_mask:
11424 case CODE_FOR_avx512f_vextractf32x4_mask:
11425 case CODE_FOR_avx512f_vextracti32x4_mask:
11426 case CODE_FOR_sse2_shufpd:
11427 case CODE_FOR_sse2_shufpd_mask:
11428 case CODE_FOR_avx512dq_shuf_f64x2_mask:
11429 case CODE_FOR_avx512dq_shuf_i64x2_mask:
11430 case CODE_FOR_avx512vl_shuf_i32x4_mask:
11431 case CODE_FOR_avx512vl_shuf_f32x4_mask:
11432 error ("the last argument must be a 2-bit immediate");
11433 return const0_rtx;
11434
11435 case CODE_FOR_avx_vextractf128v4df:
11436 case CODE_FOR_avx_vextractf128v8sf:
11437 case CODE_FOR_avx_vextractf128v8si:
11438 case CODE_FOR_avx_vinsertf128v4df:
11439 case CODE_FOR_avx_vinsertf128v8sf:
11440 case CODE_FOR_avx_vinsertf128v8si:
11441 case CODE_FOR_avx512f_vinsertf64x4_mask:
11442 case CODE_FOR_avx512f_vinserti64x4_mask:
11443 case CODE_FOR_avx512f_vextractf64x4_mask:
11444 case CODE_FOR_avx512f_vextracti64x4_mask:
11445 case CODE_FOR_avx512dq_vinsertf32x8_mask:
11446 case CODE_FOR_avx512dq_vinserti32x8_mask:
11447 case CODE_FOR_avx512vl_vinsertv4df:
11448 case CODE_FOR_avx512vl_vinsertv4di:
11449 case CODE_FOR_avx512vl_vinsertv8sf:
11450 case CODE_FOR_avx512vl_vinsertv8si:
11451 error ("the last argument must be a 1-bit immediate");
11452 return const0_rtx;
11453
11454 case CODE_FOR_avx_vmcmpv2df3:
11455 case CODE_FOR_avx_vmcmpv4sf3:
11456 case CODE_FOR_avx_cmpv2df3:
11457 case CODE_FOR_avx_cmpv4sf3:
11458 case CODE_FOR_avx_cmpv4df3:
11459 case CODE_FOR_avx_cmpv8sf3:
11460 case CODE_FOR_avx512f_cmpv8df3_mask:
11461 case CODE_FOR_avx512f_cmpv16sf3_mask:
11462 case CODE_FOR_avx512f_vmcmpv2df3_mask:
11463 case CODE_FOR_avx512f_vmcmpv4sf3_mask:
11464 case CODE_FOR_avx512bw_cmpv32hf3_mask:
11465 case CODE_FOR_avx512vl_cmpv16hf3_mask:
11466 case CODE_FOR_avx512fp16_cmpv8hf3_mask:
11467 error ("the last argument must be a 5-bit immediate");
11468 return const0_rtx;
11469
11470 default:
11471 switch (nargs_constant)
11472 {
11473 case 2:
11474 if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
11475 (!mask_pos && (nargs - i) == nargs_constant))
11476 {
11477 error ("the next to last argument must be an 8-bit immediate");
11478 break;
11479 }
11480 /* FALLTHRU */
11481 case 1:
11482 error ("the last argument must be an 8-bit immediate");
11483 break;
11484 default:
11485 gcc_unreachable ();
11486 }
11487 return const0_rtx;
11488 }
11489 }
11490 else
11491 {
11492 if (VECTOR_MODE_P (mode))
11493 op = safe_vector_operand (op, mode);
11494
11495 /* If we aren't optimizing, only allow one memory operand to
11496 be generated. */
11497 if (memory_operand (op, mode))
11498 num_memory++;
11499
11500 op = fixup_modeless_constant (op, mode);
11501
11502 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
11503 {
11504 if (optimize || !match || num_memory > 1)
11505 op = copy_to_mode_reg (mode, op);
11506 }
11507 else
11508 {
11509 op = copy_to_reg (op);
11510 op = lowpart_subreg (mode, op, GET_MODE (op));
11511 }
11512 }
11513
11514 xops[i] = op;
11515 }
11516
11517 switch (nargs)
11518 {
11519 case 1:
11520 pat = GEN_FCN (icode) (real_target, xops[0]);
11521 break;
11522 case 2:
11523 pat = GEN_FCN (icode) (real_target, xops[0], xops[1]);
11524 break;
11525 case 3:
11526 pat = GEN_FCN (icode) (real_target, xops[0], xops[1], xops[2]);
11527 break;
11528 case 4:
11529 pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
11530 xops[2], xops[3]);
11531 break;
11532 case 5:
11533 pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
11534 xops[2], xops[3], xops[4]);
11535 break;
11536 case 6:
11537 pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
11538 xops[2], xops[3], xops[4], xops[5]);
11539 break;
11540 default:
11541 gcc_unreachable ();
11542 }
11543
11544 if (! pat)
11545 return 0;
11546
11547 emit_insn (pat);
11548 return target;
11549 }
11550
11551 /* Transform a pattern of the following layout:
11552 (set A
11553 (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
11554 into:
11555 (set A B)
11556 i.e. the embedded rounding operand C is dropped. */
11557
11558 static rtx
11559 ix86_erase_embedded_rounding (rtx pat)
11560 {
11561 if (GET_CODE (pat) == INSN)
11562 pat = PATTERN (pat);
11563
11564 gcc_assert (GET_CODE (pat) == SET);
11565 rtx src = SET_SRC (pat);
11566 gcc_assert (XVECLEN (src, 0) == 2);
11567 rtx p0 = XVECEXP (src, 0, 0);
11568 gcc_assert (GET_CODE (src) == UNSPEC
11569 && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
11570 rtx res = gen_rtx_SET (SET_DEST (pat), p0);
11571 return res;
11572 }
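/* Illustrative instance (the inner operation is only an assumed example):
   a pattern of the form
     (set (reg:V16SF 100)
          (unspec:V16SF [(plus:V16SF (reg:V16SF 101) (reg:V16SF 102))
                         (const_int R)]
                        UNSPEC_EMBEDDED_ROUNDING))
   where R is the rounding immediate, is rewritten by the helper above into
     (set (reg:V16SF 100)
          (plus:V16SF (reg:V16SF 101) (reg:V16SF 102)))
   dropping the now-redundant rounding operand.  */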
11573
11574 /* Subroutine of ix86_expand_round_builtin to take care of comi insns
11575 with rounding. */
11576 static rtx
11577 ix86_expand_sse_comi_round (const struct builtin_description *d,
11578 tree exp, rtx target)
11579 {
11580 rtx pat, set_dst;
11581 tree arg0 = CALL_EXPR_ARG (exp, 0);
11582 tree arg1 = CALL_EXPR_ARG (exp, 1);
11583 tree arg2 = CALL_EXPR_ARG (exp, 2);
11584 tree arg3 = CALL_EXPR_ARG (exp, 3);
11585 rtx op0 = expand_normal (arg0);
11586 rtx op1 = expand_normal (arg1);
11587 rtx op2 = expand_normal (arg2);
11588 rtx op3 = expand_normal (arg3);
11589 enum insn_code icode = d->icode;
11590 const struct insn_data_d *insn_p = &insn_data[icode];
11591 machine_mode mode0 = insn_p->operand[0].mode;
11592 machine_mode mode1 = insn_p->operand[1].mode;
11593
11594 /* See avxintrin.h for values. */
11595 static const enum rtx_code comparisons[32] =
11596 {
11597 EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
11598 UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED,
11599 EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
11600 UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED
11601 };
11602 static const bool ordereds[32] =
11603 {
11604 true, true, true, false, false, false, false, true,
11605 false, false, false, true, true, true, true, false,
11606 true, true, true, false, false, false, false, true,
11607 false, false, false, true, true, true, true, false
11608 };
11609 static const bool non_signalings[32] =
11610 {
11611 true, false, false, true, true, false, false, true,
11612 true, false, false, true, true, false, false, true,
11613 false, true, true, false, false, true, true, false,
11614 false, true, true, false, false, true, true, false
11615 };
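/* Reading the three tables together: index 0 (_CMP_EQ_OQ in the intrinsics
   headers) yields EQ, ordered, non-signaling; index 4 (_CMP_NEQ_UQ) yields
   NE, unordered, non-signaling.  Indices 16-31 repeat the first sixteen
   predicates with the quiet/signaling attribute inverted, which is why only
   the non_signalings entries differ between the two halves.  */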
11616
11617 if (!CONST_INT_P (op2))
11618 {
11619 error ("the third argument must be comparison constant");
11620 return const0_rtx;
11621 }
11622 if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
11623 {
11624 error ("incorrect comparison mode");
11625 return const0_rtx;
11626 }
11627
11628 if (!insn_p->operand[2].predicate (op3, SImode))
11629 {
11630 error ("incorrect rounding operand");
11631 return const0_rtx;
11632 }
11633
11634 if (VECTOR_MODE_P (mode0))
11635 op0 = safe_vector_operand (op0, mode0);
11636 if (VECTOR_MODE_P (mode1))
11637 op1 = safe_vector_operand (op1, mode1);
11638
11639 enum rtx_code comparison = comparisons[INTVAL (op2)];
11640 bool ordered = ordereds[INTVAL (op2)];
11641 bool non_signaling = non_signalings[INTVAL (op2)];
11642 rtx const_val = const0_rtx;
11643
11644 bool check_unordered = false;
11645 machine_mode mode = CCFPmode;
11646 switch (comparison)
11647 {
11648 case ORDERED:
11649 if (!ordered)
11650 {
11651 /* NB: Use CCSmode/NE for _CMP_TRUE_UQ/_CMP_TRUE_US. */
11652 if (!non_signaling)
11653 ordered = true;
11654 mode = CCSmode;
11655 }
11656 else
11657 {
11658 /* NB: Use CCPmode/NE for _CMP_ORD_Q/_CMP_ORD_S. */
11659 if (non_signaling)
11660 ordered = false;
11661 mode = CCPmode;
11662 }
11663 comparison = NE;
11664 break;
11665 case UNORDERED:
11666 if (ordered)
11667 {
11668 /* NB: Use CCSmode/EQ for _CMP_FALSE_OQ/_CMP_FALSE_OS. */
11669 if (non_signaling)
11670 ordered = false;
11671 mode = CCSmode;
11672 }
11673 else
11674 {
11675 /* NB: Use CCPmode/NE for _CMP_UNORD_Q/_CMP_UNORD_S. */
11676 if (!non_signaling)
11677 ordered = true;
11678 mode = CCPmode;
11679 }
11680 comparison = EQ;
11681 break;
11682
11683 case LE: /* -> GE */
11684 case LT: /* -> GT */
11685 case UNGE: /* -> UNLE */
11686 case UNGT: /* -> UNLT */
11687 std::swap (op0, op1);
11688 comparison = swap_condition (comparison);
11689 /* FALLTHRU */
11690 case GT:
11691 case GE:
11692 case UNEQ:
11693 case UNLT:
11694 case UNLE:
11695 case LTGT:
11696 /* These are supported by CCFPmode. NB: Use ordered/signaling
11697 COMI or unordered/non-signaling UCOMI. Both set ZF, PF, CF
11698 with NAN operands. */
11699 if (ordered == non_signaling)
11700 ordered = !ordered;
11701 break;
11702 case EQ:
11703 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
11704 _CMP_EQ_OQ/_CMP_EQ_OS. */
11705 check_unordered = true;
11706 mode = CCZmode;
11707 break;
11708 case NE:
11709 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
11710 _CMP_NEQ_UQ/_CMP_NEQ_US. */
11711 gcc_assert (!ordered);
11712 check_unordered = true;
11713 mode = CCZmode;
11714 const_val = const1_rtx;
11715 break;
11716 default:
11717 gcc_unreachable ();
11718 }
11719
11720 target = gen_reg_rtx (SImode);
11721 emit_move_insn (target, const_val);
11722 target = gen_rtx_SUBREG (QImode, target, 0);
11723
11724 if ((optimize && !register_operand (op0, mode0))
11725 || !insn_p->operand[0].predicate (op0, mode0))
11726 op0 = copy_to_mode_reg (mode0, op0);
11727 if ((optimize && !register_operand (op1, mode1))
11728 || !insn_p->operand[1].predicate (op1, mode1))
11729 op1 = copy_to_mode_reg (mode1, op1);
11730
11731 /*
11732 1. COMI: ordered and signaling.
11733 2. UCOMI: unordered and non-signaling.
11734 */
11735 if (non_signaling)
11736 icode = (icode == CODE_FOR_sse_comi_round
11737 ? CODE_FOR_sse_ucomi_round
11738 : CODE_FOR_sse2_ucomi_round);
11739
11740 pat = GEN_FCN (icode) (op0, op1, op3);
11741 if (! pat)
11742 return 0;
11743
11744 /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
11745 if (INTVAL (op3) == NO_ROUND)
11746 {
11747 pat = ix86_erase_embedded_rounding (pat);
11748 if (! pat)
11749 return 0;
11750
11751 set_dst = SET_DEST (pat);
11752 }
11753 else
11754 {
11755 gcc_assert (GET_CODE (pat) == SET);
11756 set_dst = SET_DEST (pat);
11757 }
11758
11759 emit_insn (pat);
11760
11761 return ix86_ssecom_setcc (comparison, check_unordered, mode,
11762 set_dst, target);
11763 }
11764
11765 static rtx
11766 ix86_expand_round_builtin (const struct builtin_description *d,
11767 tree exp, rtx target)
11768 {
11769 rtx pat;
11770 unsigned int i, nargs;
11771 rtx xops[6];
11772 enum insn_code icode = d->icode;
11773 const struct insn_data_d *insn_p = &insn_data[icode];
11774 machine_mode tmode = insn_p->operand[0].mode;
11775 unsigned int nargs_constant = 0;
11776 unsigned int redundant_embed_rnd = 0;
11777
11778 switch ((enum ix86_builtin_func_type) d->flag)
11779 {
11780 case UINT64_FTYPE_V2DF_INT:
11781 case UINT64_FTYPE_V4SF_INT:
11782 case UINT64_FTYPE_V8HF_INT:
11783 case UINT_FTYPE_V2DF_INT:
11784 case UINT_FTYPE_V4SF_INT:
11785 case UINT_FTYPE_V8HF_INT:
11786 case INT64_FTYPE_V2DF_INT:
11787 case INT64_FTYPE_V4SF_INT:
11788 case INT64_FTYPE_V8HF_INT:
11789 case INT_FTYPE_V2DF_INT:
11790 case INT_FTYPE_V4SF_INT:
11791 case INT_FTYPE_V8HF_INT:
11792 nargs = 2;
11793 break;
11794 case V32HF_FTYPE_V32HF_V32HF_INT:
11795 case V8HF_FTYPE_V8HF_V8HF_INT:
11796 case V8HF_FTYPE_V8HF_INT_INT:
11797 case V8HF_FTYPE_V8HF_UINT_INT:
11798 case V8HF_FTYPE_V8HF_INT64_INT:
11799 case V8HF_FTYPE_V8HF_UINT64_INT:
11800 case V4SF_FTYPE_V4SF_UINT_INT:
11801 case V4SF_FTYPE_V4SF_UINT64_INT:
11802 case V2DF_FTYPE_V2DF_UINT64_INT:
11803 case V4SF_FTYPE_V4SF_INT_INT:
11804 case V4SF_FTYPE_V4SF_INT64_INT:
11805 case V2DF_FTYPE_V2DF_INT64_INT:
11806 case V4SF_FTYPE_V4SF_V4SF_INT:
11807 case V2DF_FTYPE_V2DF_V2DF_INT:
11808 case V4SF_FTYPE_V4SF_V2DF_INT:
11809 case V2DF_FTYPE_V2DF_V4SF_INT:
11810 nargs = 3;
11811 break;
11812 case V8SF_FTYPE_V8DF_V8SF_QI_INT:
11813 case V8DF_FTYPE_V8DF_V8DF_QI_INT:
11814 case V32HI_FTYPE_V32HF_V32HI_USI_INT:
11815 case V8SI_FTYPE_V8DF_V8SI_QI_INT:
11816 case V8DI_FTYPE_V8HF_V8DI_UQI_INT:
11817 case V8DI_FTYPE_V8DF_V8DI_QI_INT:
11818 case V8SF_FTYPE_V8DI_V8SF_QI_INT:
11819 case V8DF_FTYPE_V8DI_V8DF_QI_INT:
11820 case V8DF_FTYPE_V8HF_V8DF_UQI_INT:
11821 case V16SF_FTYPE_V16HF_V16SF_UHI_INT:
11822 case V32HF_FTYPE_V32HI_V32HF_USI_INT:
11823 case V32HF_FTYPE_V32HF_V32HF_USI_INT:
11824 case V32HF_FTYPE_V32HF_V32HF_V32HF_INT:
11825 case V16SF_FTYPE_V16SF_V16SF_HI_INT:
11826 case V8DI_FTYPE_V8SF_V8DI_QI_INT:
11827 case V16SF_FTYPE_V16SI_V16SF_HI_INT:
11828 case V16SI_FTYPE_V16SF_V16SI_HI_INT:
11829 case V16SI_FTYPE_V16HF_V16SI_UHI_INT:
11830 case V16HF_FTYPE_V16SI_V16HF_UHI_INT:
11831 case V8DF_FTYPE_V8SF_V8DF_QI_INT:
11832 case V16SF_FTYPE_V16HI_V16SF_HI_INT:
11833 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
11834 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
11835 case V8HF_FTYPE_V8DI_V8HF_UQI_INT:
11836 case V8HF_FTYPE_V8DF_V8HF_UQI_INT:
11837 case V16HF_FTYPE_V16SF_V16HF_UHI_INT:
11838 case V8HF_FTYPE_V8HF_V8HF_V8HF_INT:
11839 nargs = 4;
11840 break;
11841 case V4SF_FTYPE_V4SF_V4SF_INT_INT:
11842 case V2DF_FTYPE_V2DF_V2DF_INT_INT:
11843 nargs_constant = 2;
11844 nargs = 4;
11845 break;
11846 case INT_FTYPE_V4SF_V4SF_INT_INT:
11847 case INT_FTYPE_V2DF_V2DF_INT_INT:
11848 return ix86_expand_sse_comi_round (d, exp, target);
11849 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
11850 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
11851 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
11852 case V4SF_FTYPE_V8HF_V4SF_V4SF_UQI_INT:
11853 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
11854 case V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT:
11855 case V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT:
11856 case V2DF_FTYPE_V8HF_V2DF_V2DF_UQI_INT:
11857 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
11858 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
11859 case V2DF_FTYPE_V2DF_V4SF_V2DF_UQI_INT:
11860 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
11861 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
11862 case V4SF_FTYPE_V4SF_V2DF_V4SF_UQI_INT:
11863 case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT:
11864 case V8HF_FTYPE_V2DF_V8HF_V8HF_UQI_INT:
11865 case V8HF_FTYPE_V4SF_V8HF_V8HF_UQI_INT:
11866 nargs = 5;
11867 break;
11868 case V32HF_FTYPE_V32HF_INT_V32HF_USI_INT:
11869 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
11870 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
11871 case V8DF_FTYPE_V8DF_INT_V8DF_UQI_INT:
11872 case V16SF_FTYPE_V16SF_INT_V16SF_UHI_INT:
11873 nargs_constant = 4;
11874 nargs = 5;
11875 break;
11876 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
11877 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
11878 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
11879 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
11880 case USI_FTYPE_V32HF_V32HF_INT_USI_INT:
11881 case UQI_FTYPE_V8HF_V8HF_INT_UQI_INT:
11882 nargs_constant = 3;
11883 nargs = 5;
11884 break;
11885 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
11886 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
11887 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
11888 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
11889 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
11890 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
11891 case V8HF_FTYPE_V8HF_V8HF_INT_V8HF_UQI_INT:
11892 nargs = 6;
11893 nargs_constant = 4;
11894 break;
11895 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
11896 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
11897 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
11898 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
11899 nargs = 6;
11900 nargs_constant = 3;
11901 break;
11902 default:
11903 gcc_unreachable ();
11904 }
11905 gcc_assert (nargs <= ARRAY_SIZE (xops));
11906
11907 if (optimize
11908 || target == 0
11909 || GET_MODE (target) != tmode
11910 || !insn_p->operand[0].predicate (target, tmode))
11911 target = gen_reg_rtx (tmode);
11912
11913 for (i = 0; i < nargs; i++)
11914 {
11915 tree arg = CALL_EXPR_ARG (exp, i);
11916 rtx op = expand_normal (arg);
11917 machine_mode mode = insn_p->operand[i + 1].mode;
11918 bool match = insn_p->operand[i + 1].predicate (op, mode);
11919
11920 if (i == nargs - nargs_constant)
11921 {
11922 if (!match)
11923 {
11924 switch (icode)
11925 {
11926 case CODE_FOR_avx512f_getmantv8df_mask_round:
11927 case CODE_FOR_avx512f_getmantv16sf_mask_round:
11928 case CODE_FOR_avx512bw_getmantv32hf_mask_round:
11929 case CODE_FOR_avx512f_vgetmantv2df_round:
11930 case CODE_FOR_avx512f_vgetmantv2df_mask_round:
11931 case CODE_FOR_avx512f_vgetmantv4sf_round:
11932 case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
11933 case CODE_FOR_avx512f_vgetmantv8hf_mask_round:
11934 error ("the immediate argument must be a 4-bit immediate");
11935 return const0_rtx;
11936 case CODE_FOR_avx512f_cmpv8df3_mask_round:
11937 case CODE_FOR_avx512f_cmpv16sf3_mask_round:
11938 case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
11939 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
11940 case CODE_FOR_avx512f_vmcmpv8hf3_mask_round:
11941 case CODE_FOR_avx512bw_cmpv32hf3_mask_round:
11942 error ("the immediate argument must be a 5-bit immediate");
11943 return const0_rtx;
11944 default:
11945 error ("the immediate argument must be an 8-bit immediate");
11946 return const0_rtx;
11947 }
11948 }
11949 }
11950 else if (i == nargs-1)
11951 {
11952 if (!insn_p->operand[nargs].predicate (op, SImode))
11953 {
11954 error ("incorrect rounding operand");
11955 return const0_rtx;
11956 }
11957
11958 /* If there is no rounding, use the normal version of the pattern. */
11959 if (INTVAL (op) == NO_ROUND)
11960 {
11961 /* Skip erasing the embedded rounding for the expanders below,
11962 which generate multiple insns.  In ix86_erase_embedded_rounding
11963 the pattern is transformed into a single set, and emit_insn
11964 appends that set instead of inserting it into the chain, so the
11965 insns emitted inside the define_expand would be ignored. */
11966 switch (icode)
11967 {
11968 case CODE_FOR_avx512bw_fmaddc_v32hf_mask1_round:
11969 case CODE_FOR_avx512bw_fcmaddc_v32hf_mask1_round:
11970 case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask1_round:
11971 case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask1_round:
11972 case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask3_round:
11973 case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask3_round:
11974 redundant_embed_rnd = 0;
11975 break;
11976 default:
11977 redundant_embed_rnd = 1;
11978 break;
11979 }
11980 }
11981 }
11982 else
11983 {
11984 if (VECTOR_MODE_P (mode))
11985 op = safe_vector_operand (op, mode);
11986
11987 op = fixup_modeless_constant (op, mode);
11988
11989 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
11990 {
11991 if (optimize || !match)
11992 op = copy_to_mode_reg (mode, op);
11993 }
11994 else
11995 {
11996 op = copy_to_reg (op);
11997 op = lowpart_subreg (mode, op, GET_MODE (op));
11998 }
11999 }
12000
12001 xops[i] = op;
12002 }
12003
12004 switch (nargs)
12005 {
12006 case 1:
12007 pat = GEN_FCN (icode) (target, xops[0]);
12008 break;
12009 case 2:
12010 pat = GEN_FCN (icode) (target, xops[0], xops[1]);
12011 break;
12012 case 3:
12013 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
12014 break;
12015 case 4:
12016 pat = GEN_FCN (icode) (target, xops[0], xops[1],
12017 xops[2], xops[3]);
12018 break;
12019 case 5:
12020 pat = GEN_FCN (icode) (target, xops[0], xops[1],
12021 xops[2], xops[3], xops[4]);
12022 break;
12023 case 6:
12024 pat = GEN_FCN (icode) (target, xops[0], xops[1],
12025 xops[2], xops[3], xops[4], xops[5]);
12026 break;
12027 default:
12028 gcc_unreachable ();
12029 }
12030
12031 if (!pat)
12032 return 0;
12033
12034 if (redundant_embed_rnd)
12035 pat = ix86_erase_embedded_rounding (pat);
12036
12037 emit_insn (pat);
12038 return target;
12039 }
12040
12041 /* Subroutine of ix86_expand_builtin to take care of special insns
12042 with variable number of operands. */
12043
12044 static rtx
12045 ix86_expand_special_args_builtin (const struct builtin_description *d,
12046 tree exp, rtx target)
12047 {
12048 tree arg;
12049 rtx pat, op;
12050 unsigned int i, nargs, arg_adjust, memory;
12051 unsigned int constant = 100;
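/* `constant' is the index of the constant argument, if any; the default of
   100 is simply an out-of-range sentinel meaning "no constant argument".  */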
12052 bool aligned_mem = false;
12053 rtx xops[4];
12054 enum insn_code icode = d->icode;
12055 const struct insn_data_d *insn_p = &insn_data[icode];
12056 machine_mode tmode = insn_p->operand[0].mode;
12057 enum { load, store } klass;
12058
12059 switch ((enum ix86_builtin_func_type) d->flag)
12060 {
12061 case VOID_FTYPE_VOID:
12062 emit_insn (GEN_FCN (icode) (target));
12063 return 0;
12064 case VOID_FTYPE_UINT64:
12065 case VOID_FTYPE_UNSIGNED:
12066 nargs = 0;
12067 klass = store;
12068 memory = 0;
12069 break;
12070
12071 case INT_FTYPE_VOID:
12072 case USHORT_FTYPE_VOID:
12073 case UINT64_FTYPE_VOID:
12074 case UINT_FTYPE_VOID:
12075 case UINT8_FTYPE_VOID:
12076 case UNSIGNED_FTYPE_VOID:
12077 nargs = 0;
12078 klass = load;
12079 memory = 0;
12080 break;
12081 case UINT64_FTYPE_PUNSIGNED:
12082 case V2DI_FTYPE_PV2DI:
12083 case V4DI_FTYPE_PV4DI:
12084 case V32QI_FTYPE_PCCHAR:
12085 case V16QI_FTYPE_PCCHAR:
12086 case V8SF_FTYPE_PCV4SF:
12087 case V8SF_FTYPE_PCFLOAT:
12088 case V4SF_FTYPE_PCFLOAT:
12089 case V4SF_FTYPE_PCFLOAT16:
12090 case V4SF_FTYPE_PCBFLOAT16:
12091 case V4SF_FTYPE_PCV8BF:
12092 case V4SF_FTYPE_PCV8HF:
12093 case V8SF_FTYPE_PCFLOAT16:
12094 case V8SF_FTYPE_PCBFLOAT16:
12095 case V8SF_FTYPE_PCV16HF:
12096 case V8SF_FTYPE_PCV16BF:
12097 case V4DF_FTYPE_PCV2DF:
12098 case V4DF_FTYPE_PCDOUBLE:
12099 case V2DF_FTYPE_PCDOUBLE:
12100 case VOID_FTYPE_PVOID:
12101 case V8DI_FTYPE_PV8DI:
12102 nargs = 1;
12103 klass = load;
12104 memory = 0;
12105 switch (icode)
12106 {
12107 case CODE_FOR_sse4_1_movntdqa:
12108 case CODE_FOR_avx2_movntdqa:
12109 case CODE_FOR_avx512f_movntdqa:
12110 aligned_mem = true;
12111 break;
12112 default:
12113 break;
12114 }
12115 break;
12116 case VOID_FTYPE_PV2SF_V4SF:
12117 case VOID_FTYPE_PV8DI_V8DI:
12118 case VOID_FTYPE_PV4DI_V4DI:
12119 case VOID_FTYPE_PV2DI_V2DI:
12120 case VOID_FTYPE_PCHAR_V32QI:
12121 case VOID_FTYPE_PCHAR_V16QI:
12122 case VOID_FTYPE_PFLOAT_V16SF:
12123 case VOID_FTYPE_PFLOAT_V8SF:
12124 case VOID_FTYPE_PFLOAT_V4SF:
12125 case VOID_FTYPE_PDOUBLE_V8DF:
12126 case VOID_FTYPE_PDOUBLE_V4DF:
12127 case VOID_FTYPE_PDOUBLE_V2DF:
12128 case VOID_FTYPE_PLONGLONG_LONGLONG:
12129 case VOID_FTYPE_PULONGLONG_ULONGLONG:
12130 case VOID_FTYPE_PUNSIGNED_UNSIGNED:
12131 case VOID_FTYPE_PINT_INT:
12132 nargs = 1;
12133 klass = store;
12134 /* Reserve memory operand for target. */
12135 memory = ARRAY_SIZE (xops);
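/* Because i never reaches ARRAY_SIZE (xops), no argument will be treated
   as the memory operand below; the memory operand is the store target
   itself, set up after this switch.  */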
12136 switch (icode)
12137 {
12138 /* These builtins and instructions require the memory
12139 to be properly aligned. */
12140 case CODE_FOR_avx_movntv4di:
12141 case CODE_FOR_sse2_movntv2di:
12142 case CODE_FOR_avx_movntv8sf:
12143 case CODE_FOR_sse_movntv4sf:
12144 case CODE_FOR_sse4a_vmmovntv4sf:
12145 case CODE_FOR_avx_movntv4df:
12146 case CODE_FOR_sse2_movntv2df:
12147 case CODE_FOR_sse4a_vmmovntv2df:
12148 case CODE_FOR_sse2_movntidi:
12149 case CODE_FOR_sse_movntq:
12150 case CODE_FOR_sse2_movntisi:
12151 case CODE_FOR_avx512f_movntv16sf:
12152 case CODE_FOR_avx512f_movntv8df:
12153 case CODE_FOR_avx512f_movntv8di:
12154 aligned_mem = true;
12155 break;
12156 default:
12157 break;
12158 }
12159 break;
12160 case VOID_FTYPE_PVOID_PCVOID:
12161 nargs = 1;
12162 klass = store;
12163 memory = 0;
12164
12165 break;
12166 case V4SF_FTYPE_V4SF_PCV2SF:
12167 case V2DF_FTYPE_V2DF_PCDOUBLE:
12168 nargs = 2;
12169 klass = load;
12170 memory = 1;
12171 break;
12172 case V8SF_FTYPE_PCV8SF_V8SI:
12173 case V4DF_FTYPE_PCV4DF_V4DI:
12174 case V4SF_FTYPE_PCV4SF_V4SI:
12175 case V2DF_FTYPE_PCV2DF_V2DI:
12176 case V8SI_FTYPE_PCV8SI_V8SI:
12177 case V4DI_FTYPE_PCV4DI_V4DI:
12178 case V4SI_FTYPE_PCV4SI_V4SI:
12179 case V2DI_FTYPE_PCV2DI_V2DI:
12180 case VOID_FTYPE_INT_INT64:
12181 nargs = 2;
12182 klass = load;
12183 memory = 0;
12184 break;
12185 case VOID_FTYPE_PV8DF_V8DF_UQI:
12186 case VOID_FTYPE_PV4DF_V4DF_UQI:
12187 case VOID_FTYPE_PV2DF_V2DF_UQI:
12188 case VOID_FTYPE_PV16SF_V16SF_UHI:
12189 case VOID_FTYPE_PV8SF_V8SF_UQI:
12190 case VOID_FTYPE_PV4SF_V4SF_UQI:
12191 case VOID_FTYPE_PV8DI_V8DI_UQI:
12192 case VOID_FTYPE_PV4DI_V4DI_UQI:
12193 case VOID_FTYPE_PV2DI_V2DI_UQI:
12194 case VOID_FTYPE_PV16SI_V16SI_UHI:
12195 case VOID_FTYPE_PV8SI_V8SI_UQI:
12196 case VOID_FTYPE_PV4SI_V4SI_UQI:
12197 case VOID_FTYPE_PV64QI_V64QI_UDI:
12198 case VOID_FTYPE_PV32HI_V32HI_USI:
12199 case VOID_FTYPE_PV32QI_V32QI_USI:
12200 case VOID_FTYPE_PV16QI_V16QI_UHI:
12201 case VOID_FTYPE_PV16HI_V16HI_UHI:
12202 case VOID_FTYPE_PV8HI_V8HI_UQI:
12203 switch (icode)
12204 {
12205 /* These builtins and instructions require the memory
12206 to be properly aligned. */
12207 case CODE_FOR_avx512f_storev16sf_mask:
12208 case CODE_FOR_avx512f_storev16si_mask:
12209 case CODE_FOR_avx512f_storev8df_mask:
12210 case CODE_FOR_avx512f_storev8di_mask:
12211 case CODE_FOR_avx512vl_storev8sf_mask:
12212 case CODE_FOR_avx512vl_storev8si_mask:
12213 case CODE_FOR_avx512vl_storev4df_mask:
12214 case CODE_FOR_avx512vl_storev4di_mask:
12215 case CODE_FOR_avx512vl_storev4sf_mask:
12216 case CODE_FOR_avx512vl_storev4si_mask:
12217 case CODE_FOR_avx512vl_storev2df_mask:
12218 case CODE_FOR_avx512vl_storev2di_mask:
12219 aligned_mem = true;
12220 break;
12221 default:
12222 break;
12223 }
12224 /* FALLTHRU */
12225 case VOID_FTYPE_PV8SF_V8SI_V8SF:
12226 case VOID_FTYPE_PV4DF_V4DI_V4DF:
12227 case VOID_FTYPE_PV4SF_V4SI_V4SF:
12228 case VOID_FTYPE_PV2DF_V2DI_V2DF:
12229 case VOID_FTYPE_PV8SI_V8SI_V8SI:
12230 case VOID_FTYPE_PV4DI_V4DI_V4DI:
12231 case VOID_FTYPE_PV4SI_V4SI_V4SI:
12232 case VOID_FTYPE_PV2DI_V2DI_V2DI:
12233 case VOID_FTYPE_PV8SI_V8DI_UQI:
12234 case VOID_FTYPE_PV8HI_V8DI_UQI:
12235 case VOID_FTYPE_PV16HI_V16SI_UHI:
12236 case VOID_FTYPE_PUDI_V8DI_UQI:
12237 case VOID_FTYPE_PV16QI_V16SI_UHI:
12238 case VOID_FTYPE_PV4SI_V4DI_UQI:
12239 case VOID_FTYPE_PUDI_V2DI_UQI:
12240 case VOID_FTYPE_PUDI_V4DI_UQI:
12241 case VOID_FTYPE_PUSI_V2DI_UQI:
12242 case VOID_FTYPE_PV8HI_V8SI_UQI:
12243 case VOID_FTYPE_PUDI_V4SI_UQI:
12244 case VOID_FTYPE_PUSI_V4DI_UQI:
12245 case VOID_FTYPE_PUHI_V2DI_UQI:
12246 case VOID_FTYPE_PUDI_V8SI_UQI:
12247 case VOID_FTYPE_PUSI_V4SI_UQI:
12248 case VOID_FTYPE_PCHAR_V64QI_UDI:
12249 case VOID_FTYPE_PCHAR_V32QI_USI:
12250 case VOID_FTYPE_PCHAR_V16QI_UHI:
12251 case VOID_FTYPE_PSHORT_V32HI_USI:
12252 case VOID_FTYPE_PSHORT_V16HI_UHI:
12253 case VOID_FTYPE_PSHORT_V8HI_UQI:
12254 case VOID_FTYPE_PINT_V16SI_UHI:
12255 case VOID_FTYPE_PINT_V8SI_UQI:
12256 case VOID_FTYPE_PINT_V4SI_UQI:
12257 case VOID_FTYPE_PINT64_V8DI_UQI:
12258 case VOID_FTYPE_PINT64_V4DI_UQI:
12259 case VOID_FTYPE_PINT64_V2DI_UQI:
12260 case VOID_FTYPE_PDOUBLE_V8DF_UQI:
12261 case VOID_FTYPE_PDOUBLE_V4DF_UQI:
12262 case VOID_FTYPE_PDOUBLE_V2DF_UQI:
12263 case VOID_FTYPE_PFLOAT_V16SF_UHI:
12264 case VOID_FTYPE_PFLOAT_V8SF_UQI:
12265 case VOID_FTYPE_PFLOAT_V4SF_UQI:
12266 case VOID_FTYPE_PCFLOAT16_V8HF_UQI:
12267 case VOID_FTYPE_PV32QI_V32HI_USI:
12268 case VOID_FTYPE_PV16QI_V16HI_UHI:
12269 case VOID_FTYPE_PUDI_V8HI_UQI:
12270 nargs = 2;
12271 klass = store;
12272 /* Reserve memory operand for target. */
12273 memory = ARRAY_SIZE (xops);
12274 break;
12275 case V4SF_FTYPE_PCV4SF_V4SF_UQI:
12276 case V8SF_FTYPE_PCV8SF_V8SF_UQI:
12277 case V16SF_FTYPE_PCV16SF_V16SF_UHI:
12278 case V4SI_FTYPE_PCV4SI_V4SI_UQI:
12279 case V8SI_FTYPE_PCV8SI_V8SI_UQI:
12280 case V16SI_FTYPE_PCV16SI_V16SI_UHI:
12281 case V2DF_FTYPE_PCV2DF_V2DF_UQI:
12282 case V4DF_FTYPE_PCV4DF_V4DF_UQI:
12283 case V8DF_FTYPE_PCV8DF_V8DF_UQI:
12284 case V2DI_FTYPE_PCV2DI_V2DI_UQI:
12285 case V4DI_FTYPE_PCV4DI_V4DI_UQI:
12286 case V8DI_FTYPE_PCV8DI_V8DI_UQI:
12287 case V64QI_FTYPE_PCV64QI_V64QI_UDI:
12288 case V32HI_FTYPE_PCV32HI_V32HI_USI:
12289 case V32QI_FTYPE_PCV32QI_V32QI_USI:
12290 case V16QI_FTYPE_PCV16QI_V16QI_UHI:
12291 case V16HI_FTYPE_PCV16HI_V16HI_UHI:
12292 case V8HI_FTYPE_PCV8HI_V8HI_UQI:
12293 switch (icode)
12294 {
12295 /* These builtins and instructions require the memory
12296 to be properly aligned. */
12297 case CODE_FOR_avx512f_loadv16sf_mask:
12298 case CODE_FOR_avx512f_loadv16si_mask:
12299 case CODE_FOR_avx512f_loadv8df_mask:
12300 case CODE_FOR_avx512f_loadv8di_mask:
12301 case CODE_FOR_avx512vl_loadv8sf_mask:
12302 case CODE_FOR_avx512vl_loadv8si_mask:
12303 case CODE_FOR_avx512vl_loadv4df_mask:
12304 case CODE_FOR_avx512vl_loadv4di_mask:
12305 case CODE_FOR_avx512vl_loadv4sf_mask:
12306 case CODE_FOR_avx512vl_loadv4si_mask:
12307 case CODE_FOR_avx512vl_loadv2df_mask:
12308 case CODE_FOR_avx512vl_loadv2di_mask:
12309 case CODE_FOR_avx512bw_loadv64qi_mask:
12310 case CODE_FOR_avx512vl_loadv32qi_mask:
12311 case CODE_FOR_avx512vl_loadv16qi_mask:
12312 case CODE_FOR_avx512bw_loadv32hi_mask:
12313 case CODE_FOR_avx512vl_loadv16hi_mask:
12314 case CODE_FOR_avx512vl_loadv8hi_mask:
12315 aligned_mem = true;
12316 break;
12317 default:
12318 break;
12319 }
12320 /* FALLTHRU */
12321 case V64QI_FTYPE_PCCHAR_V64QI_UDI:
12322 case V32QI_FTYPE_PCCHAR_V32QI_USI:
12323 case V16QI_FTYPE_PCCHAR_V16QI_UHI:
12324 case V32HI_FTYPE_PCSHORT_V32HI_USI:
12325 case V16HI_FTYPE_PCSHORT_V16HI_UHI:
12326 case V8HI_FTYPE_PCSHORT_V8HI_UQI:
12327 case V16SI_FTYPE_PCINT_V16SI_UHI:
12328 case V8SI_FTYPE_PCINT_V8SI_UQI:
12329 case V4SI_FTYPE_PCINT_V4SI_UQI:
12330 case V8DI_FTYPE_PCINT64_V8DI_UQI:
12331 case V4DI_FTYPE_PCINT64_V4DI_UQI:
12332 case V2DI_FTYPE_PCINT64_V2DI_UQI:
12333 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
12334 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
12335 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
12336 case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
12337 case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
12338 case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
12339 case V8HF_FTYPE_PCFLOAT16_V8HF_UQI:
12340 nargs = 3;
12341 klass = load;
12342 memory = 0;
12343 break;
12344 case INT_FTYPE_PINT_INT_INT_INT:
12345 case LONGLONG_FTYPE_PLONGLONG_LONGLONG_LONGLONG_INT:
12346 nargs = 4;
12347 klass = load;
12348 memory = 0;
12349 constant = 3;
12350 break;
12351 default:
12352 gcc_unreachable ();
12353 }
12354
12355 gcc_assert (nargs <= ARRAY_SIZE (xops));
12356
12357 if (klass == store)
12358 {
12359 arg = CALL_EXPR_ARG (exp, 0);
12360 op = expand_normal (arg);
12361 gcc_assert (target == 0);
12362 if (memory)
12363 {
12364 op = ix86_zero_extend_to_Pmode (op);
12365 target = gen_rtx_MEM (tmode, op);
12366 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
12367 on it. Try to improve it using get_pointer_alignment,
12368 and if the special builtin is one that requires strict
12369 mode alignment, also from its GET_MODE_ALIGNMENT.
12370 Failure to do so could lead to ix86_legitimate_combined_insn
12371 rejecting all changes to such insns. */
12372 unsigned int align = get_pointer_alignment (arg);
12373 if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
12374 align = GET_MODE_ALIGNMENT (tmode);
12375 if (MEM_ALIGN (target) < align)
12376 set_mem_align (target, align);
12377 }
12378 else
12379 target = force_reg (tmode, op);
12380 arg_adjust = 1;
12381 }
12382 else
12383 {
12384 arg_adjust = 0;
12385 if (optimize
12386 || target == 0
12387 || !register_operand (target, tmode)
12388 || GET_MODE (target) != tmode)
12389 target = gen_reg_rtx (tmode);
12390 }
12391
12392 for (i = 0; i < nargs; i++)
12393 {
12394 machine_mode mode = insn_p->operand[i + 1].mode;
12395
12396 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
12397 op = expand_normal (arg);
12398
12399 if (i == memory)
12400 {
12401 /* This must be the memory operand. */
12402 op = ix86_zero_extend_to_Pmode (op);
12403 op = gen_rtx_MEM (mode, op);
12404 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
12405 on it. Try to improve it using get_pointer_alignment,
12406 and if the special builtin is one that requires strict
12407 mode alignment, also from its GET_MODE_ALIGNMENT.
12408 Failure to do so could lead to ix86_legitimate_combined_insn
12409 rejecting all changes to such insns. */
12410 unsigned int align = get_pointer_alignment (arg);
12411 if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
12412 align = GET_MODE_ALIGNMENT (mode);
12413 if (MEM_ALIGN (op) < align)
12414 set_mem_align (op, align);
12415 }
12416 else if (i == constant)
12417 {
12418 /* This must be the constant. */
12419 if (!insn_p->operand[nargs].predicate(op, SImode))
12420 {
12421 error ("the fourth argument must be one of enum %qs", "_CMPCCX_ENUM");
12422 return const0_rtx;
12423 }
12424 }
12425 else
12426 {
12427 /* This must be register. */
12428 if (VECTOR_MODE_P (mode))
12429 op = safe_vector_operand (op, mode);
12430
12431 op = fixup_modeless_constant (op, mode);
12432
12433 /* NB: a 3-operand load implies a mask load or v{p}expand*,
12434 and the mask operand should be at the end.
12435 Keep an all-ones mask, which will be simplified by the expander. */
12436 if (nargs == 3 && i == 2 && klass == load
12437 && constm1_operand (op, mode)
12438 && insn_p->operand[i].predicate (op, mode))
12439 ;
12440 else if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
12441 op = copy_to_mode_reg (mode, op);
12442 else
12443 {
12444 op = copy_to_reg (op);
12445 op = lowpart_subreg (mode, op, GET_MODE (op));
12446 }
12447 }
12448
12449 xops[i] = op;
12450 }
12451
12452 switch (nargs)
12453 {
12454 case 0:
12455 pat = GEN_FCN (icode) (target);
12456 break;
12457 case 1:
12458 pat = GEN_FCN (icode) (target, xops[0]);
12459 break;
12460 case 2:
12461 pat = GEN_FCN (icode) (target, xops[0], xops[1]);
12462 break;
12463 case 3:
12464 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
12465 break;
12466 case 4:
12467 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2], xops[3]);
12468 break;
12469 default:
12470 gcc_unreachable ();
12471 }
12472
12473 if (! pat)
12474 return 0;
12475
12476 emit_insn (pat);
12477 return klass == store ? 0 : target;
12478 }
12479
12480 /* Return the integer constant in ARG. Constrain it to be in the range
12481 of the subparts of VEC_TYPE; issue an error if not. */
12482
12483 static int
12484 get_element_number (tree vec_type, tree arg)
12485 {
12486 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
12487
12488 if (!tree_fits_uhwi_p (arg)
12489 || (elt = tree_to_uhwi (arg), elt > max))
12490 {
12491 error ("selector must be an integer constant in the range "
12492 "[0, %wi]", max);
12493 return 0;
12494 }
12495
12496 return elt;
12497 }
12498
12499 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
12500 ix86_expand_vector_init. We DO have language-level syntax for this, in
12501 the form of (type){ init-list }. Except that since we can't place emms
12502 instructions from inside the compiler, we can't allow the use of MMX
12503 registers unless the user explicitly asks for it. So we do *not* define
12504 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
12505 we have builtins invoked by mmintrin.h that gives us license to emit
12506 these sorts of instructions. */
12507
12508 static rtx
12509 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
12510 {
12511 machine_mode tmode = TYPE_MODE (type);
12512 machine_mode inner_mode = GET_MODE_INNER (tmode);
12513 int i, n_elt = GET_MODE_NUNITS (tmode);
12514 rtvec v = rtvec_alloc (n_elt);
12515
12516 gcc_assert (VECTOR_MODE_P (tmode));
12517 gcc_assert (call_expr_nargs (exp) == n_elt);
12518
12519 for (i = 0; i < n_elt; ++i)
12520 {
12521 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
12522 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
12523 }
12524
12525 if (!target || !register_operand (target, tmode))
12526 target = gen_reg_rtx (tmode);
12527
12528 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
12529 return target;
12530 }
12531
12532 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
12533 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
12534 had a language-level syntax for referencing vector elements. */
12535
12536 static rtx
12537 ix86_expand_vec_ext_builtin (tree exp, rtx target)
12538 {
12539 machine_mode tmode, mode0;
12540 tree arg0, arg1;
12541 int elt;
12542 rtx op0;
12543
12544 arg0 = CALL_EXPR_ARG (exp, 0);
12545 arg1 = CALL_EXPR_ARG (exp, 1);
12546
12547 op0 = expand_normal (arg0);
12548 elt = get_element_number (TREE_TYPE (arg0), arg1);
12549
12550 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
12551 mode0 = TYPE_MODE (TREE_TYPE (arg0));
12552 gcc_assert (VECTOR_MODE_P (mode0));
12553
12554 op0 = force_reg (mode0, op0);
12555
12556 if (optimize || !target || !register_operand (target, tmode))
12557 target = gen_reg_rtx (tmode);
12558
12559 ix86_expand_vector_extract (true, target, op0, elt);
12560
12561 return target;
12562 }
12563
12564 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
12565 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
12566 a language-level syntax for referencing vector elements. */
12567
12568 static rtx
12569 ix86_expand_vec_set_builtin (tree exp)
12570 {
12571 machine_mode tmode, mode1;
12572 tree arg0, arg1, arg2;
12573 int elt;
12574 rtx op0, op1, target;
12575
12576 arg0 = CALL_EXPR_ARG (exp, 0);
12577 arg1 = CALL_EXPR_ARG (exp, 1);
12578 arg2 = CALL_EXPR_ARG (exp, 2);
12579
12580 tmode = TYPE_MODE (TREE_TYPE (arg0));
12581 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
12582 gcc_assert (VECTOR_MODE_P (tmode));
12583
12584 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
12585 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
12586 elt = get_element_number (TREE_TYPE (arg0), arg2);
12587
12588 if (GET_MODE (op1) != mode1)
12589 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
12590
12591 op0 = force_reg (tmode, op0);
12592 op1 = force_reg (mode1, op1);
12593
12594 /* OP0 is the source of these builtin functions and shouldn't be
12595 modified. Create a copy, use it and return it as target. */
12596 target = gen_reg_rtx (tmode);
12597 emit_move_insn (target, op0);
12598 ix86_expand_vector_set (true, target, op1, elt);
12599
12600 return target;
12601 }
12602
12603 /* Return true if the ISA options necessary for this builtin are enabled,
12604 else false.
12605 fcode = DECL_MD_FUNCTION_CODE (fndecl); */
12606 bool
12607 ix86_check_builtin_isa_match (unsigned int fcode,
12608 HOST_WIDE_INT* pbisa,
12609 HOST_WIDE_INT* pbisa2)
12610 {
12611 HOST_WIDE_INT isa = ix86_isa_flags;
12612 HOST_WIDE_INT isa2 = ix86_isa_flags2;
12613 HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
12614 HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
12615 HOST_WIDE_INT tmp_isa = isa, tmp_isa2 = isa2;
12616 /* The general case is we require all the ISAs specified in bisa{,2}
12617 to be enabled.
12618 The exceptions are:
12619 OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
12620 OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
12621 OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
12622 (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL) or
12623 OPTION_MASK_ISA2_AVXVNNI
12624 (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL) or
12625 OPTION_MASK_ISA2_AVXIFMA
12626 (OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA2_AVX512BF16) or
12627 OPTION_MASK_ISA2_AVXNECONVERT
12628 where for each such pair it is sufficient if either of the ISAs is
12629 enabled; any other options ORed in with the pair must be enabled too.
12630 OPTION_MASK_ISA_MMX in bisa is satisfied also if TARGET_MMX_WITH_SSE. */
12631
12632 #define SHARE_BUILTIN(A1, A2, B1, B2) \
12633 if ((((bisa & (A1)) == (A1) && (bisa2 & (A2)) == (A2)) \
12634 && ((bisa & (B1)) == (B1) && (bisa2 & (B2)) == (B2))) \
12635 && (((isa & (A1)) == (A1) && (isa2 & (A2)) == (A2)) \
12636 || ((isa & (B1)) == (B1) && (isa2 & (B2)) == (B2)))) \
12637 { \
12638 tmp_isa |= (A1) | (B1); \
12639 tmp_isa2 |= (A2) | (B2); \
12640 }
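/* For instance, the first use below means a builtin whose mask requires
   both SSE and 3DNOW_A is treated as available when either of the two is
   enabled; any other ISA bits in the builtin's mask must still all be
   satisfied by the final check in this function.  */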
12641
12642 SHARE_BUILTIN (OPTION_MASK_ISA_SSE, 0, OPTION_MASK_ISA_3DNOW_A, 0);
12643 SHARE_BUILTIN (OPTION_MASK_ISA_SSE4_2, 0, OPTION_MASK_ISA_CRC32, 0);
12644 SHARE_BUILTIN (OPTION_MASK_ISA_FMA, 0, OPTION_MASK_ISA_FMA4, 0);
12645 SHARE_BUILTIN (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, 0,
12646 OPTION_MASK_ISA2_AVXVNNI);
12647 SHARE_BUILTIN (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL, 0, 0,
12648 OPTION_MASK_ISA2_AVXIFMA);
12649 SHARE_BUILTIN (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512BF16, 0,
12650 OPTION_MASK_ISA2_AVXNECONVERT);
12651 SHARE_BUILTIN (OPTION_MASK_ISA_AES, 0, 0, OPTION_MASK_ISA2_VAES);
12652 isa = tmp_isa;
12653 isa2 = tmp_isa2;
12654
12655 if ((bisa & OPTION_MASK_ISA_MMX) && !TARGET_MMX && TARGET_MMX_WITH_SSE
12656 /* __builtin_ia32_maskmovq requires MMX registers. */
12657 && fcode != IX86_BUILTIN_MASKMOVQ)
12658 {
12659 bisa &= ~OPTION_MASK_ISA_MMX;
12660 bisa |= OPTION_MASK_ISA_SSE2;
12661 }
12662
12663 if (pbisa)
12664 *pbisa = bisa;
12665 if (pbisa2)
12666 *pbisa2 = bisa2;
12667
12668 return (bisa & isa) == bisa && (bisa2 & isa2) == bisa2;
12669 }
12670
12671 /* Emit instructions to set the carry flag from ARG. */
12672
12673 void
12674 ix86_expand_carry (rtx arg)
12675 {
12676 if (!CONST_INT_P (arg) || arg == const0_rtx)
12677 {
12678 arg = convert_to_mode (QImode, arg, 1);
12679 arg = copy_to_mode_reg (QImode, arg);
12680 emit_insn (gen_addqi3_cconly_overflow (arg, constm1_rtx));
12681 }
12682 else
12683 emit_insn (gen_x86_stc ());
12684 }
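/* Sketch of the two shapes produced above (illustrative only): a
   non-constant (or zero) ARG emits an "add $-1" on a QImode copy of ARG,
   whose carry-out is set exactly when ARG is nonzero, while a nonzero
   constant ARG emits a plain "stc".  */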
12685
12686 /* Expand an expression EXP that calls a built-in function,
12687 with result going to TARGET if that's convenient
12688 (and in mode MODE if that's convenient).
12689 SUBTARGET may be used as the target for computing one of EXP's operands.
12690 IGNORE is nonzero if the value is to be ignored. */
12691
12692 rtx
12693 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
12694 machine_mode mode, int ignore)
12695 {
12696 size_t i;
12697 enum insn_code icode, icode2;
12698 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
12699 tree arg0, arg1, arg2, arg3, arg4;
12700 rtx op0, op1, op2, op3, op4, pat, pat2, insn;
12701 machine_mode mode0, mode1, mode2, mode3, mode4;
12702 unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
12703 HOST_WIDE_INT bisa, bisa2;
12704
12705 /* For CPU builtins that can be folded, fold first and expand the fold. */
12706 switch (fcode)
12707 {
12708 case IX86_BUILTIN_CPU_INIT:
12709 {
12710 /* Make it call __cpu_indicator_init in libgcc. */
12711 tree call_expr, fndecl, type;
12712 type = build_function_type_list (integer_type_node, NULL_TREE);
12713 fndecl = build_fn_decl ("__cpu_indicator_init", type);
12714 call_expr = build_call_expr (fndecl, 0);
12715 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
12716 }
12717 case IX86_BUILTIN_CPU_IS:
12718 case IX86_BUILTIN_CPU_SUPPORTS:
12719 {
12720 tree arg0 = CALL_EXPR_ARG (exp, 0);
12721 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
12722 gcc_assert (fold_expr != NULL_TREE);
12723 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
12724 }
12725 }
12726
12727 if (!ix86_check_builtin_isa_match (fcode, &bisa, &bisa2))
12728 {
12729 bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT;
12730 if (TARGET_ABI_X32)
12731 bisa |= OPTION_MASK_ABI_X32;
12732 else
12733 bisa |= OPTION_MASK_ABI_64;
12734 char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
12735 (enum fpmath_unit) 0,
12736 (enum prefer_vector_width) 0,
12737 PVW_NONE, PVW_NONE,
12738 false, add_abi_p);
12739 if (!opts)
12740 error ("%qE needs unknown isa option", fndecl);
12741 else
12742 {
12743 gcc_assert (opts != NULL);
12744 error ("%qE needs isa option %s", fndecl, opts);
12745 free (opts);
12746 }
12747 return expand_call (exp, target, ignore);
12748 }
12749
12750 switch (fcode)
12751 {
12752 case IX86_BUILTIN_MASKMOVQ:
12753 case IX86_BUILTIN_MASKMOVDQU:
12754 icode = (fcode == IX86_BUILTIN_MASKMOVQ
12755 ? CODE_FOR_mmx_maskmovq
12756 : CODE_FOR_sse2_maskmovdqu);
12757 /* Note the arg order is different from the operand order. */
12758 arg1 = CALL_EXPR_ARG (exp, 0);
12759 arg2 = CALL_EXPR_ARG (exp, 1);
12760 arg0 = CALL_EXPR_ARG (exp, 2);
12761 op0 = expand_normal (arg0);
12762 op1 = expand_normal (arg1);
12763 op2 = expand_normal (arg2);
12764 mode0 = insn_data[icode].operand[0].mode;
12765 mode1 = insn_data[icode].operand[1].mode;
12766 mode2 = insn_data[icode].operand[2].mode;
12767
12768 op0 = ix86_zero_extend_to_Pmode (op0);
12769 op0 = gen_rtx_MEM (mode1, op0);
12770
12771 if (!insn_data[icode].operand[0].predicate (op0, mode0))
12772 op0 = copy_to_mode_reg (mode0, op0);
12773 if (!insn_data[icode].operand[1].predicate (op1, mode1))
12774 op1 = copy_to_mode_reg (mode1, op1);
12775 if (!insn_data[icode].operand[2].predicate (op2, mode2))
12776 op2 = copy_to_mode_reg (mode2, op2);
12777 pat = GEN_FCN (icode) (op0, op1, op2);
12778 if (! pat)
12779 return 0;
12780 emit_insn (pat);
12781 return 0;
12782
12783 case IX86_BUILTIN_LDMXCSR:
12784 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
12785 target = assign_386_stack_local (SImode, SLOT_TEMP);
12786 emit_move_insn (target, op0);
12787 emit_insn (gen_sse_ldmxcsr (target));
12788 return 0;
12789
12790 case IX86_BUILTIN_STMXCSR:
12791 target = assign_386_stack_local (SImode, SLOT_TEMP);
12792 emit_insn (gen_sse_stmxcsr (target));
12793 return copy_to_mode_reg (SImode, target);
12794
12795 case IX86_BUILTIN_CLFLUSH:
12796 arg0 = CALL_EXPR_ARG (exp, 0);
12797 op0 = expand_normal (arg0);
12798 icode = CODE_FOR_sse2_clflush;
12799 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12800 op0 = ix86_zero_extend_to_Pmode (op0);
12801
12802 emit_insn (gen_sse2_clflush (op0));
12803 return 0;
12804
12805 case IX86_BUILTIN_CLWB:
12806 arg0 = CALL_EXPR_ARG (exp, 0);
12807 op0 = expand_normal (arg0);
12808 icode = CODE_FOR_clwb;
12809 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12810 op0 = ix86_zero_extend_to_Pmode (op0);
12811
12812 emit_insn (gen_clwb (op0));
12813 return 0;
12814
12815 case IX86_BUILTIN_CLFLUSHOPT:
12816 arg0 = CALL_EXPR_ARG (exp, 0);
12817 op0 = expand_normal (arg0);
12818 icode = CODE_FOR_clflushopt;
12819 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12820 op0 = ix86_zero_extend_to_Pmode (op0);
12821
12822 emit_insn (gen_clflushopt (op0));
12823 return 0;
12824
12825 case IX86_BUILTIN_MONITOR:
12826 case IX86_BUILTIN_MONITORX:
12827 arg0 = CALL_EXPR_ARG (exp, 0);
12828 arg1 = CALL_EXPR_ARG (exp, 1);
12829 arg2 = CALL_EXPR_ARG (exp, 2);
12830 op0 = expand_normal (arg0);
12831 op1 = expand_normal (arg1);
12832 op2 = expand_normal (arg2);
12833 if (!REG_P (op0))
12834 op0 = ix86_zero_extend_to_Pmode (op0);
12835 if (!REG_P (op1))
12836 op1 = copy_to_mode_reg (SImode, op1);
12837 if (!REG_P (op2))
12838 op2 = copy_to_mode_reg (SImode, op2);
12839
12840 emit_insn (fcode == IX86_BUILTIN_MONITOR
12841 ? gen_sse3_monitor (Pmode, op0, op1, op2)
12842 : gen_monitorx (Pmode, op0, op1, op2));
12843 return 0;
12844
12845 case IX86_BUILTIN_MWAIT:
12846 arg0 = CALL_EXPR_ARG (exp, 0);
12847 arg1 = CALL_EXPR_ARG (exp, 1);
12848 op0 = expand_normal (arg0);
12849 op1 = expand_normal (arg1);
12850 if (!REG_P (op0))
12851 op0 = copy_to_mode_reg (SImode, op0);
12852 if (!REG_P (op1))
12853 op1 = copy_to_mode_reg (SImode, op1);
12854 emit_insn (gen_sse3_mwait (op0, op1));
12855 return 0;
12856
12857 case IX86_BUILTIN_MWAITX:
12858 arg0 = CALL_EXPR_ARG (exp, 0);
12859 arg1 = CALL_EXPR_ARG (exp, 1);
12860 arg2 = CALL_EXPR_ARG (exp, 2);
12861 op0 = expand_normal (arg0);
12862 op1 = expand_normal (arg1);
12863 op2 = expand_normal (arg2);
12864 if (!REG_P (op0))
12865 op0 = copy_to_mode_reg (SImode, op0);
12866 if (!REG_P (op1))
12867 op1 = copy_to_mode_reg (SImode, op1);
12868 if (!REG_P (op2))
12869 op2 = copy_to_mode_reg (SImode, op2);
12870 emit_insn (gen_mwaitx (op0, op1, op2));
12871 return 0;
12872
12873 case IX86_BUILTIN_UMONITOR:
12874 arg0 = CALL_EXPR_ARG (exp, 0);
12875 op0 = expand_normal (arg0);
12876
12877 op0 = ix86_zero_extend_to_Pmode (op0);
12878 emit_insn (gen_umonitor (Pmode, op0));
12879 return 0;
12880
12881 case IX86_BUILTIN_UMWAIT:
12882 case IX86_BUILTIN_TPAUSE:
12883 arg0 = CALL_EXPR_ARG (exp, 0);
12884 arg1 = CALL_EXPR_ARG (exp, 1);
12885 op0 = expand_normal (arg0);
12886 op1 = expand_normal (arg1);
12887
12888 if (!REG_P (op0))
12889 op0 = copy_to_mode_reg (SImode, op0);
12890
12891 op1 = force_reg (DImode, op1);
12892
12893 if (TARGET_64BIT)
12894 {
12895 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
12896 NULL, 1, OPTAB_DIRECT);
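/* op2 now holds the upper 32 bits of the 64-bit argument; both
   halves are passed to the rex64 pattern as SImode values below.  */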
12897 switch (fcode)
12898 {
12899 case IX86_BUILTIN_UMWAIT:
12900 icode = CODE_FOR_umwait_rex64;
12901 break;
12902 case IX86_BUILTIN_TPAUSE:
12903 icode = CODE_FOR_tpause_rex64;
12904 break;
12905 default:
12906 gcc_unreachable ();
12907 }
12908
12909 op2 = gen_lowpart (SImode, op2);
12910 op1 = gen_lowpart (SImode, op1);
12911 pat = GEN_FCN (icode) (op0, op1, op2);
12912 }
12913 else
12914 {
12915 switch (fcode)
12916 {
12917 case IX86_BUILTIN_UMWAIT:
12918 icode = CODE_FOR_umwait;
12919 break;
12920 case IX86_BUILTIN_TPAUSE:
12921 icode = CODE_FOR_tpause;
12922 break;
12923 default:
12924 gcc_unreachable ();
12925 }
12926 pat = GEN_FCN (icode) (op0, op1);
12927 }
12928
12929 if (!pat)
12930 return 0;
12931
12932 emit_insn (pat);
12933
12934 if (target == 0
12935 || !register_operand (target, QImode))
12936 target = gen_reg_rtx (QImode);
12937
12938 pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
12939 const0_rtx);
12940 emit_insn (gen_rtx_SET (target, pat));
12941
12942 return target;
12943
12944 case IX86_BUILTIN_TESTUI:
12945 emit_insn (gen_testui ());
12946
12947 if (target == 0
12948 || !register_operand (target, QImode))
12949 target = gen_reg_rtx (QImode);
12950
12951 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
12952 const0_rtx);
12953 emit_insn (gen_rtx_SET (target, pat));
12954
12955 return target;
12956
12957 case IX86_BUILTIN_CLZERO:
12958 arg0 = CALL_EXPR_ARG (exp, 0);
12959 op0 = expand_normal (arg0);
12960 if (!REG_P (op0))
12961 op0 = ix86_zero_extend_to_Pmode (op0);
12962 emit_insn (gen_clzero (Pmode, op0));
12963 return 0;
12964
12965 case IX86_BUILTIN_CLDEMOTE:
12966 arg0 = CALL_EXPR_ARG (exp, 0);
12967 op0 = expand_normal (arg0);
12968 icode = CODE_FOR_cldemote;
12969 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12970 op0 = ix86_zero_extend_to_Pmode (op0);
12971
12972 emit_insn (gen_cldemote (op0));
12973 return 0;
12974
12975 case IX86_BUILTIN_LOADIWKEY:
12976 {
12977 arg0 = CALL_EXPR_ARG (exp, 0);
12978 arg1 = CALL_EXPR_ARG (exp, 1);
12979 arg2 = CALL_EXPR_ARG (exp, 2);
12980 arg3 = CALL_EXPR_ARG (exp, 3);
12981
12982 op0 = expand_normal (arg0);
12983 op1 = expand_normal (arg1);
12984 op2 = expand_normal (arg2);
12985 op3 = expand_normal (arg3);
12986
12987 if (!REG_P (op0))
12988 op0 = copy_to_mode_reg (V2DImode, op0);
12989 if (!REG_P (op1))
12990 op1 = copy_to_mode_reg (V2DImode, op1);
12991 if (!REG_P (op2))
12992 op2 = copy_to_mode_reg (V2DImode, op2);
12993 if (!REG_P (op3))
12994 op3 = copy_to_mode_reg (SImode, op3);
12995
12996 emit_insn (gen_loadiwkey (op0, op1, op2, op3));
12997
12998 return 0;
12999 }
13000
13001 case IX86_BUILTIN_AESDEC128KLU8:
13002 icode = CODE_FOR_aesdec128klu8;
13003 goto aesdecenc_expand;
13004
13005 case IX86_BUILTIN_AESDEC256KLU8:
13006 icode = CODE_FOR_aesdec256klu8;
13007 goto aesdecenc_expand;
13008
13009 case IX86_BUILTIN_AESENC128KLU8:
13010 icode = CODE_FOR_aesenc128klu8;
13011 goto aesdecenc_expand;
13012
13013 case IX86_BUILTIN_AESENC256KLU8:
13014 icode = CODE_FOR_aesenc256klu8;
13015
13016 aesdecenc_expand:
13017
13018 arg0 = CALL_EXPR_ARG (exp, 0); // __m128i *odata
13019 arg1 = CALL_EXPR_ARG (exp, 1); // __m128i idata
13020 arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
13021
13022 op0 = expand_normal (arg0);
13023 op1 = expand_normal (arg1);
13024 op2 = expand_normal (arg2);
13025
13026 if (!address_operand (op0, V2DImode))
13027 {
13028 op0 = convert_memory_address (Pmode, op0);
13029 op0 = copy_addr_to_reg (op0);
13030 }
13031 op0 = gen_rtx_MEM (V2DImode, op0);
13032
13033 if (!REG_P (op1))
13034 op1 = copy_to_mode_reg (V2DImode, op1);
13035
13036 if (!address_operand (op2, VOIDmode))
13037 {
13038 op2 = convert_memory_address (Pmode, op2);
13039 op2 = copy_addr_to_reg (op2);
13040 }
13041 op2 = gen_rtx_MEM (BLKmode, op2);
13042
13043 emit_insn (GEN_FCN (icode) (op1, op1, op2));
13044
13045 if (target == 0)
13046 target = gen_reg_rtx (QImode);
13047
13048 /* NB: For the aesenc/aesdec keylocker insns, ZF is set when a runtime
13049 error occurs, in which case the output should be cleared for safety. */
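      /* Rough sketch of the sequence emitted below: run the aes{enc,dec}*klu8
	 insn on op1 and the key handle; if no runtime error is flagged in ZF,
	 branch over the clearing step, otherwise zero op1 so no partial result
	 escapes; finally derive the return value from the flags and store op1
	 to *odata.  */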
13050 rtx_code_label *ok_label;
13051 rtx tmp;
13052
13053 tmp = gen_rtx_REG (CCZmode, FLAGS_REG);
13054 pat = gen_rtx_EQ (QImode, tmp, const0_rtx);
13055 ok_label = gen_label_rtx ();
13056 emit_cmp_and_jump_insns (tmp, const0_rtx, NE, 0, GET_MODE (tmp),
13057 true, ok_label);
13058 /* The runtime error seldom occurs, so predict the OK path as hot
13059 and lay it out as the fallthrough block. */
13060 predict_jump (REG_BR_PROB_BASE * 90 / 100);
13061
13062 emit_insn (gen_rtx_SET (op1, const0_rtx));
13063
13064 emit_label (ok_label);
13065 emit_insn (gen_rtx_SET (target, pat));
13066 emit_insn (gen_rtx_SET (op0, op1));
13067
13068 return target;
13069
13070 case IX86_BUILTIN_AESDECWIDE128KLU8:
13071 icode = CODE_FOR_aesdecwide128klu8;
13072 goto wideaesdecenc_expand;
13073
13074 case IX86_BUILTIN_AESDECWIDE256KLU8:
13075 icode = CODE_FOR_aesdecwide256klu8;
13076 goto wideaesdecenc_expand;
13077
13078 case IX86_BUILTIN_AESENCWIDE128KLU8:
13079 icode = CODE_FOR_aesencwide128klu8;
13080 goto wideaesdecenc_expand;
13081
13082 case IX86_BUILTIN_AESENCWIDE256KLU8:
13083 icode = CODE_FOR_aesencwide256klu8;
13084
13085 wideaesdecenc_expand:
13086
13087 rtx xmm_regs[8];
13088 rtx op;
13089
13090 arg0 = CALL_EXPR_ARG (exp, 0); // __m128i * odata
13091 arg1 = CALL_EXPR_ARG (exp, 1); // const __m128i * idata
13092 arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
13093
13094 op0 = expand_normal (arg0);
13095 op1 = expand_normal (arg1);
13096 op2 = expand_normal (arg2);
13097
13098 if (!address_operand (op2, VOIDmode))
13099 {
13100 op2 = convert_memory_address (Pmode, op2);
13101 op2 = copy_addr_to_reg (op2);
13102 }
13103 op2 = gen_rtx_MEM (BLKmode, op2);
13104
13105 for (i = 0; i < 8; i++)
13106 {
13107 xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
13108
13109 op = gen_rtx_MEM (V2DImode,
13110 plus_constant (Pmode, op1, (i * 16)));
13111
13112 emit_move_insn (xmm_regs[i], op);
13113 }
13114
13115 emit_insn (GEN_FCN (icode) (op2));
13116
13117 if (target == 0)
13118 target = gen_reg_rtx (QImode);
13119
13120 tmp = gen_rtx_REG (CCZmode, FLAGS_REG);
13121 pat = gen_rtx_EQ (QImode, tmp, const0_rtx);
13122 ok_label = gen_label_rtx ();
13123 emit_cmp_and_jump_insns (tmp, const0_rtx, NE, 0, GET_MODE (tmp),
13124 true, ok_label);
13125 predict_jump (REG_BR_PROB_BASE * 90 / 100);
13126
13127 for (i = 0; i < 8; i++)
13128 emit_insn (gen_rtx_SET (xmm_regs[i], const0_rtx));
13129
13130 emit_label (ok_label);
13131 emit_insn (gen_rtx_SET (target, pat));
13132
13133 for (i = 0; i < 8; i++)
13134 {
13135 op = gen_rtx_MEM (V2DImode,
13136 plus_constant (Pmode, op0, (i * 16)));
13137 emit_move_insn (op, xmm_regs[i]);
13138 }
13139
13140 return target;
13141
13142 case IX86_BUILTIN_ENCODEKEY128U32:
13143 {
13144 rtx op, xmm_regs[7];
13145
13146 arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
13147 arg1 = CALL_EXPR_ARG (exp, 1); // __m128i key
13148 arg2 = CALL_EXPR_ARG (exp, 2); // void *h
13149
13150 op0 = expand_normal (arg0);
13151 op1 = expand_normal (arg1);
13152 op2 = expand_normal (arg2);
13153
13154 if (!REG_P (op0))
13155 op0 = copy_to_mode_reg (SImode, op0);
13156
13157 op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
13158 emit_move_insn (op, op1);
13159
13160 for (i = 0; i < 3; i++)
13161 xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
13162
13163 if (target == 0)
13164 target = gen_reg_rtx (SImode);
13165
13166 emit_insn (gen_encodekey128u32 (target, op0));
13167
13168 for (i = 0; i < 3; i++)
13169 {
13170 op = gen_rtx_MEM (V2DImode,
13171 plus_constant (Pmode, op2, (i * 16)));
13172 emit_move_insn (op, xmm_regs[i]);
13173 }
13174
13175 return target;
13176 }
13177 case IX86_BUILTIN_ENCODEKEY256U32:
13178 {
13179 rtx op, xmm_regs[7];
13180
13181 arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
13182 arg1 = CALL_EXPR_ARG (exp, 1); // __m128i keylow
13183 arg2 = CALL_EXPR_ARG (exp, 2); // __m128i keyhi
13184 arg3 = CALL_EXPR_ARG (exp, 3); // void *h
13185
13186 op0 = expand_normal (arg0);
13187 op1 = expand_normal (arg1);
13188 op2 = expand_normal (arg2);
13189 op3 = expand_normal (arg3);
13190
13191 if (!REG_P (op0))
13192 op0 = copy_to_mode_reg (SImode, op0);
13193
13194 /* Force the use of xmm0 and xmm1 for keylow and keyhi.  */
13195 op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
13196 emit_move_insn (op, op1);
13197 op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (1));
13198 emit_move_insn (op, op2);
13199
13200 for (i = 0; i < 4; i++)
13201 xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
13202
13203 if (target == 0)
13204 target = gen_reg_rtx (SImode);
13205
13206 emit_insn (gen_encodekey256u32 (target, op0));
13207
13208 for (i = 0; i < 4; i++)
13209 {
13210 op = gen_rtx_MEM (V2DImode,
13211 plus_constant (Pmode, op3, (i * 16)));
13212 emit_move_insn (op, xmm_regs[i]);
13213 }
13214
13215 return target;
13216 }
13217
13218 case IX86_BUILTIN_PREFETCH:
13219 {
13220 arg0 = CALL_EXPR_ARG (exp, 0); // const void *
13221 arg1 = CALL_EXPR_ARG (exp, 1); // const int
13222 arg2 = CALL_EXPR_ARG (exp, 2); // const int
13223 arg3 = CALL_EXPR_ARG (exp, 3); // const int
13224
13225 op0 = expand_normal (arg0);
13226 op1 = expand_normal (arg1);
13227 op2 = expand_normal (arg2);
13228 op3 = expand_normal (arg3);
13229
13230 if (!CONST_INT_P (op1) || !CONST_INT_P (op2) || !CONST_INT_P (op3))
13231 {
13232 error ("second, third and fourth argument must be a const");
13233 return const0_rtx;
13234 }
13235
13236 if (INTVAL (op3) == 1)
13237 {
13238 if (INTVAL (op2) < 2 || INTVAL (op2) > 3)
13239 {
13240 error ("invalid third argument");
13241 return const0_rtx;
13242 }
13243
13244 if (TARGET_64BIT && TARGET_PREFETCHI
13245 && local_func_symbolic_operand (op0, GET_MODE (op0)))
13246 emit_insn (gen_prefetchi (op0, op2));
13247 else
13248 {
13249 warning (0, "instruction prefetch applies when in 64-bit mode"
13250 " with RIP-relative addressing and"
13251 " option %<-mprefetchi%>;"
13252 " it stays a NOP otherwise");
13253 emit_insn (gen_nop ());
13254 }
13255 }
13256 else
13257 {
13258 if (!address_operand (op0, VOIDmode))
13259 {
13260 op0 = convert_memory_address (Pmode, op0);
13261 op0 = copy_addr_to_reg (op0);
13262 }
13263
13264 if (INTVAL (op2) < 0 || INTVAL (op2) > 3)
13265 {
13266 warning (0, "invalid third argument to %<__builtin_ia32_prefetch%>; using zero");
13267 op2 = const0_rtx;
13268 }
13269
13270 if (TARGET_3DNOW || TARGET_PREFETCH_SSE
13271 || TARGET_PRFCHW || TARGET_PREFETCHWT1)
13272 emit_insn (gen_prefetch (op0, op1, op2));
13273 else if (!MEM_P (op0) && side_effects_p (op0))
13274 /* Don't do anything with direct references to volatile memory,
13275 but generate code to handle other side effects. */
13276 emit_insn (op0);
13277 }
13278
13279 return 0;
13280 }
13281
13282 case IX86_BUILTIN_PREFETCHI:
13283 {
13284 arg0 = CALL_EXPR_ARG (exp, 0); // const void *
13285 arg1 = CALL_EXPR_ARG (exp, 1); // const int
13286
13287 op0 = expand_normal (arg0);
13288 op1 = expand_normal (arg1);
13289
13290 if (!CONST_INT_P (op1))
13291 {
13292 error ("second argument must be a const");
13293 return const0_rtx;
13294 }
13295
13296 /* GOT/PLT_PIC should not be available for instruction prefetch.
13297 It must be a real instruction address. */
13298 if (TARGET_64BIT
13299 && local_func_symbolic_operand (op0, GET_MODE (op0)))
13300 emit_insn (gen_prefetchi (op0, op1));
13301 else
13302 {
13303 /* Ignore the hint. */
13304 warning (0, "instruction prefetch applies when in 64-bit mode"
13305 " with RIP-relative addressing and"
13306 " option %<-mprefetchi%>;"
13307 " it stays a NOP otherwise");
13308 emit_insn (gen_nop ());
13309 }
13310
13311 return 0;
13312 }
13313
13314 case IX86_BUILTIN_VEC_INIT_V2SI:
13315 case IX86_BUILTIN_VEC_INIT_V4HI:
13316 case IX86_BUILTIN_VEC_INIT_V8QI:
13317 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
13318
13319 case IX86_BUILTIN_VEC_EXT_V2DF:
13320 case IX86_BUILTIN_VEC_EXT_V2DI:
13321 case IX86_BUILTIN_VEC_EXT_V4SF:
13322 case IX86_BUILTIN_VEC_EXT_V4SI:
13323 case IX86_BUILTIN_VEC_EXT_V8HI:
13324 case IX86_BUILTIN_VEC_EXT_V2SI:
13325 case IX86_BUILTIN_VEC_EXT_V4HI:
13326 case IX86_BUILTIN_VEC_EXT_V16QI:
13327 return ix86_expand_vec_ext_builtin (exp, target);
13328
13329 case IX86_BUILTIN_VEC_SET_V2DI:
13330 case IX86_BUILTIN_VEC_SET_V4SF:
13331 case IX86_BUILTIN_VEC_SET_V4SI:
13332 case IX86_BUILTIN_VEC_SET_V8HI:
13333 case IX86_BUILTIN_VEC_SET_V4HI:
13334 case IX86_BUILTIN_VEC_SET_V16QI:
13335 return ix86_expand_vec_set_builtin (exp);
13336
13337 case IX86_BUILTIN_NANQ:
13338 case IX86_BUILTIN_NANSQ:
13339 return expand_call (exp, target, ignore);
13340
13341 case IX86_BUILTIN_RDPID:
13342
13343 op0 = gen_reg_rtx (word_mode);
13344
13345 if (TARGET_64BIT)
13346 {
13347 insn = gen_rdpid_rex64 (op0);
13348 op0 = convert_to_mode (SImode, op0, 1);
13349 }
13350 else
13351 insn = gen_rdpid (op0);
13352
13353 emit_insn (insn);
13354
13355 if (target == 0
13356 || !register_operand (target, SImode))
13357 target = gen_reg_rtx (SImode);
13358
13359 emit_move_insn (target, op0);
13360 return target;
13361
13362 case IX86_BUILTIN_2INTERSECTD512:
13363 case IX86_BUILTIN_2INTERSECTQ512:
13364 case IX86_BUILTIN_2INTERSECTD256:
13365 case IX86_BUILTIN_2INTERSECTQ256:
13366 case IX86_BUILTIN_2INTERSECTD128:
13367 case IX86_BUILTIN_2INTERSECTQ128:
13368 arg0 = CALL_EXPR_ARG (exp, 0);
13369 arg1 = CALL_EXPR_ARG (exp, 1);
13370 arg2 = CALL_EXPR_ARG (exp, 2);
13371 arg3 = CALL_EXPR_ARG (exp, 3);
13372 op0 = expand_normal (arg0);
13373 op1 = expand_normal (arg1);
13374 op2 = expand_normal (arg2);
13375 op3 = expand_normal (arg3);
13376
13377 if (!address_operand (op0, VOIDmode))
13378 {
13379 op0 = convert_memory_address (Pmode, op0);
13380 op0 = copy_addr_to_reg (op0);
13381 }
13382 if (!address_operand (op1, VOIDmode))
13383 {
13384 op1 = convert_memory_address (Pmode, op1);
13385 op1 = copy_addr_to_reg (op1);
13386 }
13387
13388 switch (fcode)
13389 {
13390 case IX86_BUILTIN_2INTERSECTD512:
13391 mode4 = P2HImode;
13392 icode = CODE_FOR_avx512vp2intersect_2intersectv16si;
13393 break;
13394 case IX86_BUILTIN_2INTERSECTQ512:
13395 mode4 = P2QImode;
13396 icode = CODE_FOR_avx512vp2intersect_2intersectv8di;
13397 break;
13398 case IX86_BUILTIN_2INTERSECTD256:
13399 mode4 = P2QImode;
13400 icode = CODE_FOR_avx512vp2intersect_2intersectv8si;
13401 break;
13402 case IX86_BUILTIN_2INTERSECTQ256:
13403 mode4 = P2QImode;
13404 icode = CODE_FOR_avx512vp2intersect_2intersectv4di;
13405 break;
13406 case IX86_BUILTIN_2INTERSECTD128:
13407 mode4 = P2QImode;
13408 icode = CODE_FOR_avx512vp2intersect_2intersectv4si;
13409 break;
13410 case IX86_BUILTIN_2INTERSECTQ128:
13411 mode4 = P2QImode;
13412 icode = CODE_FOR_avx512vp2intersect_2intersectv2di;
13413 break;
13414 default:
13415 gcc_unreachable ();
13416 }
13417
13418 mode2 = insn_data[icode].operand[1].mode;
13419 mode3 = insn_data[icode].operand[2].mode;
13420 if (!insn_data[icode].operand[1].predicate (op2, mode2))
13421 op2 = copy_to_mode_reg (mode2, op2);
13422 if (!insn_data[icode].operand[2].predicate (op3, mode3))
13423 op3 = copy_to_mode_reg (mode3, op3);
13424
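      /* The vp2intersect patterns yield a pair of mask registers in the
	 P2QI/P2HI pair modes; the low part is stored through the first
	 pointer and the high part through the second, as the two
	 emit_move_insn calls below show.  */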
13425 op4 = gen_reg_rtx (mode4);
13426 emit_insn (GEN_FCN (icode) (op4, op2, op3));
13427 mode0 = mode4 == P2HImode ? HImode : QImode;
13428 emit_move_insn (gen_rtx_MEM (mode0, op0),
13429 gen_lowpart (mode0, op4));
13430 emit_move_insn (gen_rtx_MEM (mode0, op1),
13431 gen_highpart (mode0, op4));
13432
13433 return 0;
13434
13435 case IX86_BUILTIN_RDPMC:
13436 case IX86_BUILTIN_RDTSC:
13437 case IX86_BUILTIN_RDTSCP:
13438 case IX86_BUILTIN_XGETBV:
13439
13440 op0 = gen_reg_rtx (DImode);
13441 op1 = gen_reg_rtx (DImode);
13442
13443 if (fcode == IX86_BUILTIN_RDPMC)
13444 {
13445 arg0 = CALL_EXPR_ARG (exp, 0);
13446 op2 = expand_normal (arg0);
13447 if (!register_operand (op2, SImode))
13448 op2 = copy_to_mode_reg (SImode, op2);
13449
13450 insn = (TARGET_64BIT
13451 ? gen_rdpmc_rex64 (op0, op1, op2)
13452 : gen_rdpmc (op0, op2));
13453 emit_insn (insn);
13454 }
13455 else if (fcode == IX86_BUILTIN_XGETBV)
13456 {
13457 arg0 = CALL_EXPR_ARG (exp, 0);
13458 op2 = expand_normal (arg0);
13459 if (!register_operand (op2, SImode))
13460 op2 = copy_to_mode_reg (SImode, op2);
13461
13462 insn = (TARGET_64BIT
13463 ? gen_xgetbv_rex64 (op0, op1, op2)
13464 : gen_xgetbv (op0, op2));
13465 emit_insn (insn);
13466 }
13467 else if (fcode == IX86_BUILTIN_RDTSC)
13468 {
13469 insn = (TARGET_64BIT
13470 ? gen_rdtsc_rex64 (op0, op1)
13471 : gen_rdtsc (op0));
13472 emit_insn (insn);
13473 }
13474 else
13475 {
13476 op2 = gen_reg_rtx (SImode);
13477
13478 insn = (TARGET_64BIT
13479 ? gen_rdtscp_rex64 (op0, op1, op2)
13480 : gen_rdtscp (op0, op2));
13481 emit_insn (insn);
13482
13483 arg0 = CALL_EXPR_ARG (exp, 0);
13484 op4 = expand_normal (arg0);
13485 if (!address_operand (op4, VOIDmode))
13486 {
13487 op4 = convert_memory_address (Pmode, op4);
13488 op4 = copy_addr_to_reg (op4);
13489 }
13490 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
13491 }
13492
13493 if (target == 0
13494 || !register_operand (target, DImode))
13495 target = gen_reg_rtx (DImode);
13496
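      /* These insns return the 64-bit result in EDX:EAX; on 64-bit targets
	 the high half arrives in op1 and is recombined below as
	 (op1 << 32) | op0.  */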
13497 if (TARGET_64BIT)
13498 {
13499 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
13500 op1, 1, OPTAB_DIRECT);
13501 op0 = expand_simple_binop (DImode, IOR, op0, op1,
13502 op0, 1, OPTAB_DIRECT);
13503 }
13504
13505 emit_move_insn (target, op0);
13506 return target;
13507
13508 case IX86_BUILTIN_ENQCMD:
13509 case IX86_BUILTIN_ENQCMDS:
13510 case IX86_BUILTIN_MOVDIR64B:
13511
13512 arg0 = CALL_EXPR_ARG (exp, 0);
13513 arg1 = CALL_EXPR_ARG (exp, 1);
13514 op0 = expand_normal (arg0);
13515 op1 = expand_normal (arg1);
13516
13517 op0 = ix86_zero_extend_to_Pmode (op0);
13518 if (!address_operand (op1, VOIDmode))
13519 {
13520 op1 = convert_memory_address (Pmode, op1);
13521 op1 = copy_addr_to_reg (op1);
13522 }
13523 op1 = gen_rtx_MEM (XImode, op1);
13524
13525 if (fcode == IX86_BUILTIN_MOVDIR64B)
13526 {
13527 emit_insn (gen_movdir64b (Pmode, op0, op1));
13528 return 0;
13529 }
13530 else
13531 {
13532 if (target == 0
13533 || !register_operand (target, SImode))
13534 target = gen_reg_rtx (SImode);
13535
13536 emit_move_insn (target, const0_rtx);
13537 target = gen_rtx_SUBREG (QImode, target, 0);
13538
13539 int unspecv = (fcode == IX86_BUILTIN_ENQCMD
13540 ? UNSPECV_ENQCMD
13541 : UNSPECV_ENQCMDS);
13542 icode = code_for_enqcmd (unspecv, Pmode);
13543 emit_insn (GEN_FCN (icode) (op0, op1));
13544
13545 emit_insn
13546 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
13547 gen_rtx_fmt_ee (EQ, QImode,
13548 gen_rtx_REG (CCZmode, FLAGS_REG),
13549 const0_rtx)));
13550 return SUBREG_REG (target);
13551 }
13552
13553 case IX86_BUILTIN_FXSAVE:
13554 case IX86_BUILTIN_FXRSTOR:
13555 case IX86_BUILTIN_FXSAVE64:
13556 case IX86_BUILTIN_FXRSTOR64:
13557 case IX86_BUILTIN_FNSTENV:
13558 case IX86_BUILTIN_FLDENV:
13559 mode0 = BLKmode;
13560 switch (fcode)
13561 {
13562 case IX86_BUILTIN_FXSAVE:
13563 icode = CODE_FOR_fxsave;
13564 break;
13565 case IX86_BUILTIN_FXRSTOR:
13566 icode = CODE_FOR_fxrstor;
13567 break;
13568 case IX86_BUILTIN_FXSAVE64:
13569 icode = CODE_FOR_fxsave64;
13570 break;
13571 case IX86_BUILTIN_FXRSTOR64:
13572 icode = CODE_FOR_fxrstor64;
13573 break;
13574 case IX86_BUILTIN_FNSTENV:
13575 icode = CODE_FOR_fnstenv;
13576 break;
13577 case IX86_BUILTIN_FLDENV:
13578 icode = CODE_FOR_fldenv;
13579 break;
13580 default:
13581 gcc_unreachable ();
13582 }
13583
13584 arg0 = CALL_EXPR_ARG (exp, 0);
13585 op0 = expand_normal (arg0);
13586
13587 if (!address_operand (op0, VOIDmode))
13588 {
13589 op0 = convert_memory_address (Pmode, op0);
13590 op0 = copy_addr_to_reg (op0);
13591 }
13592 op0 = gen_rtx_MEM (mode0, op0);
13593
13594 pat = GEN_FCN (icode) (op0);
13595 if (pat)
13596 emit_insn (pat);
13597 return 0;
13598
13599 case IX86_BUILTIN_XSETBV:
13600 arg0 = CALL_EXPR_ARG (exp, 0);
13601 arg1 = CALL_EXPR_ARG (exp, 1);
13602 op0 = expand_normal (arg0);
13603 op1 = expand_normal (arg1);
13604
13605 if (!REG_P (op0))
13606 op0 = copy_to_mode_reg (SImode, op0);
13607
13608 op1 = force_reg (DImode, op1);
13609
13610 if (TARGET_64BIT)
13611 {
13612 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
13613 NULL, 1, OPTAB_DIRECT);
13614
13615 icode = CODE_FOR_xsetbv_rex64;
13616
13617 op2 = gen_lowpart (SImode, op2);
13618 op1 = gen_lowpart (SImode, op1);
13619 pat = GEN_FCN (icode) (op0, op1, op2);
13620 }
13621 else
13622 {
13623 icode = CODE_FOR_xsetbv;
13624
13625 pat = GEN_FCN (icode) (op0, op1);
13626 }
13627 if (pat)
13628 emit_insn (pat);
13629 return 0;
13630
13631 case IX86_BUILTIN_XSAVE:
13632 case IX86_BUILTIN_XRSTOR:
13633 case IX86_BUILTIN_XSAVE64:
13634 case IX86_BUILTIN_XRSTOR64:
13635 case IX86_BUILTIN_XSAVEOPT:
13636 case IX86_BUILTIN_XSAVEOPT64:
13637 case IX86_BUILTIN_XSAVES:
13638 case IX86_BUILTIN_XRSTORS:
13639 case IX86_BUILTIN_XSAVES64:
13640 case IX86_BUILTIN_XRSTORS64:
13641 case IX86_BUILTIN_XSAVEC:
13642 case IX86_BUILTIN_XSAVEC64:
13643 arg0 = CALL_EXPR_ARG (exp, 0);
13644 arg1 = CALL_EXPR_ARG (exp, 1);
13645 op0 = expand_normal (arg0);
13646 op1 = expand_normal (arg1);
13647
13648 if (!address_operand (op0, VOIDmode))
13649 {
13650 op0 = convert_memory_address (Pmode, op0);
13651 op0 = copy_addr_to_reg (op0);
13652 }
13653 op0 = gen_rtx_MEM (BLKmode, op0);
13654
13655 op1 = force_reg (DImode, op1);
13656
13657 if (TARGET_64BIT)
13658 {
13659 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
13660 NULL, 1, OPTAB_DIRECT);
13661 switch (fcode)
13662 {
13663 case IX86_BUILTIN_XSAVE:
13664 icode = CODE_FOR_xsave_rex64;
13665 break;
13666 case IX86_BUILTIN_XRSTOR:
13667 icode = CODE_FOR_xrstor_rex64;
13668 break;
13669 case IX86_BUILTIN_XSAVE64:
13670 icode = CODE_FOR_xsave64;
13671 break;
13672 case IX86_BUILTIN_XRSTOR64:
13673 icode = CODE_FOR_xrstor64;
13674 break;
13675 case IX86_BUILTIN_XSAVEOPT:
13676 icode = CODE_FOR_xsaveopt_rex64;
13677 break;
13678 case IX86_BUILTIN_XSAVEOPT64:
13679 icode = CODE_FOR_xsaveopt64;
13680 break;
13681 case IX86_BUILTIN_XSAVES:
13682 icode = CODE_FOR_xsaves_rex64;
13683 break;
13684 case IX86_BUILTIN_XRSTORS:
13685 icode = CODE_FOR_xrstors_rex64;
13686 break;
13687 case IX86_BUILTIN_XSAVES64:
13688 icode = CODE_FOR_xsaves64;
13689 break;
13690 case IX86_BUILTIN_XRSTORS64:
13691 icode = CODE_FOR_xrstors64;
13692 break;
13693 case IX86_BUILTIN_XSAVEC:
13694 icode = CODE_FOR_xsavec_rex64;
13695 break;
13696 case IX86_BUILTIN_XSAVEC64:
13697 icode = CODE_FOR_xsavec64;
13698 break;
13699 default:
13700 gcc_unreachable ();
13701 }
13702
13703 op2 = gen_lowpart (SImode, op2);
13704 op1 = gen_lowpart (SImode, op1);
13705 pat = GEN_FCN (icode) (op0, op1, op2);
13706 }
13707 else
13708 {
13709 switch (fcode)
13710 {
13711 case IX86_BUILTIN_XSAVE:
13712 icode = CODE_FOR_xsave;
13713 break;
13714 case IX86_BUILTIN_XRSTOR:
13715 icode = CODE_FOR_xrstor;
13716 break;
13717 case IX86_BUILTIN_XSAVEOPT:
13718 icode = CODE_FOR_xsaveopt;
13719 break;
13720 case IX86_BUILTIN_XSAVES:
13721 icode = CODE_FOR_xsaves;
13722 break;
13723 case IX86_BUILTIN_XRSTORS:
13724 icode = CODE_FOR_xrstors;
13725 break;
13726 case IX86_BUILTIN_XSAVEC:
13727 icode = CODE_FOR_xsavec;
13728 break;
13729 default:
13730 gcc_unreachable ();
13731 }
13732 pat = GEN_FCN (icode) (op0, op1);
13733 }
13734
13735 if (pat)
13736 emit_insn (pat);
13737 return 0;
13738
13739 case IX86_BUILTIN_LLWPCB:
13740 arg0 = CALL_EXPR_ARG (exp, 0);
13741 op0 = expand_normal (arg0);
13742
13743 if (!register_operand (op0, Pmode))
13744 op0 = ix86_zero_extend_to_Pmode (op0);
13745 emit_insn (gen_lwp_llwpcb (Pmode, op0));
13746 return 0;
13747
13748 case IX86_BUILTIN_SLWPCB:
13749 if (!target
13750 || !register_operand (target, Pmode))
13751 target = gen_reg_rtx (Pmode);
13752 emit_insn (gen_lwp_slwpcb (Pmode, target));
13753 return target;
13754
13755 case IX86_BUILTIN_LWPVAL32:
13756 case IX86_BUILTIN_LWPVAL64:
13757 case IX86_BUILTIN_LWPINS32:
13758 case IX86_BUILTIN_LWPINS64:
13759 mode = ((fcode == IX86_BUILTIN_LWPVAL32
13760 || fcode == IX86_BUILTIN_LWPINS32)
13761 ? SImode : DImode);
13762
13763 if (fcode == IX86_BUILTIN_LWPVAL32
13764 || fcode == IX86_BUILTIN_LWPVAL64)
13765 icode = code_for_lwp_lwpval (mode);
13766 else
13767 icode = code_for_lwp_lwpins (mode);
13768
13769 arg0 = CALL_EXPR_ARG (exp, 0);
13770 arg1 = CALL_EXPR_ARG (exp, 1);
13771 arg2 = CALL_EXPR_ARG (exp, 2);
13772 op0 = expand_normal (arg0);
13773 op1 = expand_normal (arg1);
13774 op2 = expand_normal (arg2);
13775 mode0 = insn_data[icode].operand[0].mode;
13776
13777 if (!insn_data[icode].operand[0].predicate (op0, mode0))
13778 op0 = copy_to_mode_reg (mode0, op0);
13779 if (!insn_data[icode].operand[1].predicate (op1, SImode))
13780 op1 = copy_to_mode_reg (SImode, op1);
13781
13782 if (!CONST_INT_P (op2))
13783 {
13784 error ("the last argument must be a 32-bit immediate");
13785 return const0_rtx;
13786 }
13787
13788 emit_insn (GEN_FCN (icode) (op0, op1, op2));
13789
13790 if (fcode == IX86_BUILTIN_LWPINS32
13791 || fcode == IX86_BUILTIN_LWPINS64)
13792 {
13793 if (target == 0
13794 || !nonimmediate_operand (target, QImode))
13795 target = gen_reg_rtx (QImode);
13796
13797 pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
13798 const0_rtx);
13799 emit_insn (gen_rtx_SET (target, pat));
13800
13801 return target;
13802 }
13803 else
13804 return 0;
13805
13806 case IX86_BUILTIN_BEXTRI32:
13807 case IX86_BUILTIN_BEXTRI64:
13808 mode = (fcode == IX86_BUILTIN_BEXTRI32 ? SImode : DImode);
13809
13810 arg0 = CALL_EXPR_ARG (exp, 0);
13811 arg1 = CALL_EXPR_ARG (exp, 1);
13812 op0 = expand_normal (arg0);
13813 op1 = expand_normal (arg1);
13814
13815 if (!CONST_INT_P (op1))
13816 {
13817 error ("last argument must be an immediate");
13818 return const0_rtx;
13819 }
13820 else
13821 {
13822 unsigned char lsb_index = UINTVAL (op1);
13823 unsigned char length = UINTVAL (op1) >> 8;
13824
13825 unsigned char bitsize = GET_MODE_BITSIZE (mode);
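	  /* The BEXTRI control word packs the start bit in bits [7:0] and the
	     field length in bits [15:8]; for example, a control value of
	     0x0804 selects 8 bits starting at bit 4.  */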
13826
13827 icode = code_for_tbm_bextri (mode);
13828
13829 mode1 = insn_data[icode].operand[1].mode;
13830 if (!insn_data[icode].operand[1].predicate (op0, mode1))
13831 op0 = copy_to_mode_reg (mode1, op0);
13832
13833 mode0 = insn_data[icode].operand[0].mode;
13834 if (target == 0
13835 || !register_operand (target, mode0))
13836 target = gen_reg_rtx (mode0);
13837
13838 if (length == 0 || lsb_index >= bitsize)
13839 {
13840 emit_move_insn (target, const0_rtx);
13841 return target;
13842 }
13843
13844 if (length + lsb_index > bitsize)
13845 length = bitsize - lsb_index;
13846
13847 op1 = GEN_INT (length);
13848 op2 = GEN_INT (lsb_index);
13849
13850 emit_insn (GEN_FCN (icode) (target, op0, op1, op2));
13851 return target;
13852 }
13853
13854 case IX86_BUILTIN_RDRAND16_STEP:
13855 mode = HImode;
13856 goto rdrand_step;
13857
13858 case IX86_BUILTIN_RDRAND32_STEP:
13859 mode = SImode;
13860 goto rdrand_step;
13861
13862 case IX86_BUILTIN_RDRAND64_STEP:
13863 mode = DImode;
13864
13865 rdrand_step:
13866 arg0 = CALL_EXPR_ARG (exp, 0);
13867 op1 = expand_normal (arg0);
13868 if (!address_operand (op1, VOIDmode))
13869 {
13870 op1 = convert_memory_address (Pmode, op1);
13871 op1 = copy_addr_to_reg (op1);
13872 }
13873
13874 op0 = gen_reg_rtx (mode);
13875 emit_insn (gen_rdrand (mode, op0));
13876
13877 emit_move_insn (gen_rtx_MEM (mode, op1), op0);
13878
13879 op1 = force_reg (SImode, const1_rtx);
13880
13881 /* Emit SImode conditional move. */
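      /* The conditional move below selects the zero-extended result when the
	 carry is clear and the constant 1 when it is set; assuming rdrand
	 leaves zero in the destination on failure (carry clear), this
	 produces the builtin's 0/1 success flag without a separate setcc.  */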
13882 if (mode == HImode)
13883 {
13884 if (TARGET_ZERO_EXTEND_WITH_AND
13885 && optimize_function_for_speed_p (cfun))
13886 {
13887 op2 = force_reg (SImode, const0_rtx);
13888
13889 emit_insn (gen_movstricthi
13890 (gen_lowpart (HImode, op2), op0));
13891 }
13892 else
13893 {
13894 op2 = gen_reg_rtx (SImode);
13895
13896 emit_insn (gen_zero_extendhisi2 (op2, op0));
13897 }
13898 }
13899 else if (mode == SImode)
13900 op2 = op0;
13901 else
13902 op2 = gen_rtx_SUBREG (SImode, op0, 0);
13903
13904 if (target == 0
13905 || !register_operand (target, SImode))
13906 target = gen_reg_rtx (SImode);
13907
13908 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
13909 const0_rtx);
13910 emit_insn (gen_rtx_SET (target,
13911 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
13912 return target;
13913
13914 case IX86_BUILTIN_RDSEED16_STEP:
13915 mode = HImode;
13916 goto rdseed_step;
13917
13918 case IX86_BUILTIN_RDSEED32_STEP:
13919 mode = SImode;
13920 goto rdseed_step;
13921
13922 case IX86_BUILTIN_RDSEED64_STEP:
13923 mode = DImode;
13924
13925 rdseed_step:
13926 arg0 = CALL_EXPR_ARG (exp, 0);
13927 op1 = expand_normal (arg0);
13928 if (!address_operand (op1, VOIDmode))
13929 {
13930 op1 = convert_memory_address (Pmode, op1);
13931 op1 = copy_addr_to_reg (op1);
13932 }
13933
13934 op0 = gen_reg_rtx (mode);
13935 emit_insn (gen_rdseed (mode, op0));
13936
13937 emit_move_insn (gen_rtx_MEM (mode, op1), op0);
13938
13939 op2 = gen_reg_rtx (QImode);
13940
13941 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
13942 const0_rtx);
13943 emit_insn (gen_rtx_SET (op2, pat));
13944
13945 if (target == 0
13946 || !register_operand (target, SImode))
13947 target = gen_reg_rtx (SImode);
13948
13949 emit_insn (gen_zero_extendqisi2 (target, op2));
13950 return target;
13951
13952 case IX86_BUILTIN_SBB32:
13953 icode = CODE_FOR_subborrowsi;
13954 icode2 = CODE_FOR_subborrowsi_0;
13955 mode0 = SImode;
13956 mode1 = DImode;
13957 mode2 = CCmode;
13958 goto handlecarry;
13959
13960 case IX86_BUILTIN_SBB64:
13961 icode = CODE_FOR_subborrowdi;
13962 icode2 = CODE_FOR_subborrowdi_0;
13963 mode0 = DImode;
13964 mode1 = TImode;
13965 mode2 = CCmode;
13966 goto handlecarry;
13967
13968 case IX86_BUILTIN_ADDCARRYX32:
13969 icode = CODE_FOR_addcarrysi;
13970 icode2 = CODE_FOR_addcarrysi_0;
13971 mode0 = SImode;
13972 mode1 = DImode;
13973 mode2 = CCCmode;
13974 goto handlecarry;
13975
13976 case IX86_BUILTIN_ADDCARRYX64:
13977 icode = CODE_FOR_addcarrydi;
13978 icode2 = CODE_FOR_addcarrydi_0;
13979 mode0 = DImode;
13980 mode1 = TImode;
13981 mode2 = CCCmode;
13982
13983 handlecarry:
13984 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
13985 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
13986 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
13987 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
13988
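      /* These builtins have roughly the following C-level shape
	 (illustrative, following the argument comments above):
	   unsigned char __builtin_ia32_addcarryx_u32 (unsigned char c_in,
						       unsigned int src1,
						       unsigned int src2,
						       unsigned int *sum_out);
	 with the carry-out returned and the sum stored through sum_out.  */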
13989 op1 = expand_normal (arg0);
13990
13991 op2 = expand_normal (arg1);
13992 if (!register_operand (op2, mode0))
13993 op2 = copy_to_mode_reg (mode0, op2);
13994
13995 op3 = expand_normal (arg2);
13996 if (!register_operand (op3, mode0))
13997 op3 = copy_to_mode_reg (mode0, op3);
13998
13999 op4 = expand_normal (arg3);
14000 if (!address_operand (op4, VOIDmode))
14001 {
14002 op4 = convert_memory_address (Pmode, op4);
14003 op4 = copy_addr_to_reg (op4);
14004 }
14005
14006 op0 = gen_reg_rtx (mode0);
14007 if (op1 == const0_rtx)
14008 {
14009 /* If arg0 is 0, optimize right away into an add or sub
14010 instruction that sets the CCCmode flags. */
14011 op1 = gen_rtx_REG (mode2, FLAGS_REG);
14012 emit_insn (GEN_FCN (icode2) (op0, op2, op3));
14013 }
14014 else
14015 {
14016 /* Generate CF from input operand. */
14017 ix86_expand_carry (op1);
14018
14019 /* Generate instruction that consumes CF. */
14020 op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
14021 pat = gen_rtx_LTU (mode1, op1, const0_rtx);
14022 pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
14023 emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
14024 }
14025
14026 /* Return current CF value. */
14027 if (target == 0)
14028 target = gen_reg_rtx (QImode);
14029
14030 pat = gen_rtx_LTU (QImode, op1, const0_rtx);
14031 emit_insn (gen_rtx_SET (target, pat));
14032
14033 /* Store the result. */
14034 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
14035
14036 return target;
14037
14038 case IX86_BUILTIN_READ_FLAGS:
14039 if (ignore)
14040 return const0_rtx;
14041
14042 emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
14043
14044 if (optimize
14045 || target == NULL_RTX
14046 || !nonimmediate_operand (target, word_mode)
14047 || GET_MODE (target) != word_mode)
14048 target = gen_reg_rtx (word_mode);
14049
14050 emit_insn (gen_pop (target));
14051 return target;
14052
14053 case IX86_BUILTIN_WRITE_FLAGS:
14054
14055 arg0 = CALL_EXPR_ARG (exp, 0);
14056 op0 = expand_normal (arg0);
14057 if (!general_no_elim_operand (op0, word_mode))
14058 op0 = copy_to_mode_reg (word_mode, op0);
14059
14060 emit_insn (gen_push (op0));
14061 emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
14062 return 0;
14063
14064 case IX86_BUILTIN_KTESTC8:
14065 icode = CODE_FOR_ktestqi;
14066 mode3 = CCCmode;
14067 goto kortest;
14068
14069 case IX86_BUILTIN_KTESTZ8:
14070 icode = CODE_FOR_ktestqi;
14071 mode3 = CCZmode;
14072 goto kortest;
14073
14074 case IX86_BUILTIN_KTESTC16:
14075 icode = CODE_FOR_ktesthi;
14076 mode3 = CCCmode;
14077 goto kortest;
14078
14079 case IX86_BUILTIN_KTESTZ16:
14080 icode = CODE_FOR_ktesthi;
14081 mode3 = CCZmode;
14082 goto kortest;
14083
14084 case IX86_BUILTIN_KTESTC32:
14085 icode = CODE_FOR_ktestsi;
14086 mode3 = CCCmode;
14087 goto kortest;
14088
14089 case IX86_BUILTIN_KTESTZ32:
14090 icode = CODE_FOR_ktestsi;
14091 mode3 = CCZmode;
14092 goto kortest;
14093
14094 case IX86_BUILTIN_KTESTC64:
14095 icode = CODE_FOR_ktestdi;
14096 mode3 = CCCmode;
14097 goto kortest;
14098
14099 case IX86_BUILTIN_KTESTZ64:
14100 icode = CODE_FOR_ktestdi;
14101 mode3 = CCZmode;
14102 goto kortest;
14103
14104 case IX86_BUILTIN_KORTESTC8:
14105 icode = CODE_FOR_kortestqi;
14106 mode3 = CCCmode;
14107 goto kortest;
14108
14109 case IX86_BUILTIN_KORTESTZ8:
14110 icode = CODE_FOR_kortestqi;
14111 mode3 = CCZmode;
14112 goto kortest;
14113
14114 case IX86_BUILTIN_KORTESTC16:
14115 icode = CODE_FOR_kortesthi;
14116 mode3 = CCCmode;
14117 goto kortest;
14118
14119 case IX86_BUILTIN_KORTESTZ16:
14120 icode = CODE_FOR_kortesthi;
14121 mode3 = CCZmode;
14122 goto kortest;
14123
14124 case IX86_BUILTIN_KORTESTC32:
14125 icode = CODE_FOR_kortestsi;
14126 mode3 = CCCmode;
14127 goto kortest;
14128
14129 case IX86_BUILTIN_KORTESTZ32:
14130 icode = CODE_FOR_kortestsi;
14131 mode3 = CCZmode;
14132 goto kortest;
14133
14134 case IX86_BUILTIN_KORTESTC64:
14135 icode = CODE_FOR_kortestdi;
14136 mode3 = CCCmode;
14137 goto kortest;
14138
14139 case IX86_BUILTIN_KORTESTZ64:
14140 icode = CODE_FOR_kortestdi;
14141 mode3 = CCZmode;
14142
14143 kortest:
14144 arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
14145 arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
14146 op0 = expand_normal (arg0);
14147 op1 = expand_normal (arg1);
14148
14149 mode0 = insn_data[icode].operand[0].mode;
14150 mode1 = insn_data[icode].operand[1].mode;
14151
14152 if (GET_MODE (op0) != VOIDmode)
14153 op0 = force_reg (GET_MODE (op0), op0);
14154
14155 op0 = gen_lowpart (mode0, op0);
14156
14157 if (!insn_data[icode].operand[0].predicate (op0, mode0))
14158 op0 = copy_to_mode_reg (mode0, op0);
14159
14160 if (GET_MODE (op1) != VOIDmode)
14161 op1 = force_reg (GET_MODE (op1), op1);
14162
14163 op1 = gen_lowpart (mode1, op1);
14164
14165 if (!insn_data[icode].operand[1].predicate (op1, mode1))
14166 op1 = copy_to_mode_reg (mode1, op1);
14167
14168 target = gen_reg_rtx (QImode);
14169
14170 /* Emit kortest. */
14171 emit_insn (GEN_FCN (icode) (op0, op1));
14172 /* And use setcc to return the result from the flags. */
14173 ix86_expand_setcc (target, EQ,
14174 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
14175 return target;
14176
14177 case IX86_BUILTIN_GATHERSIV2DF:
14178 icode = CODE_FOR_avx2_gathersiv2df;
14179 goto gather_gen;
14180 case IX86_BUILTIN_GATHERSIV4DF:
14181 icode = CODE_FOR_avx2_gathersiv4df;
14182 goto gather_gen;
14183 case IX86_BUILTIN_GATHERDIV2DF:
14184 icode = CODE_FOR_avx2_gatherdiv2df;
14185 goto gather_gen;
14186 case IX86_BUILTIN_GATHERDIV4DF:
14187 icode = CODE_FOR_avx2_gatherdiv4df;
14188 goto gather_gen;
14189 case IX86_BUILTIN_GATHERSIV4SF:
14190 icode = CODE_FOR_avx2_gathersiv4sf;
14191 goto gather_gen;
14192 case IX86_BUILTIN_GATHERSIV8SF:
14193 icode = CODE_FOR_avx2_gathersiv8sf;
14194 goto gather_gen;
14195 case IX86_BUILTIN_GATHERDIV4SF:
14196 icode = CODE_FOR_avx2_gatherdiv4sf;
14197 goto gather_gen;
14198 case IX86_BUILTIN_GATHERDIV8SF:
14199 icode = CODE_FOR_avx2_gatherdiv8sf;
14200 goto gather_gen;
14201 case IX86_BUILTIN_GATHERSIV2DI:
14202 icode = CODE_FOR_avx2_gathersiv2di;
14203 goto gather_gen;
14204 case IX86_BUILTIN_GATHERSIV4DI:
14205 icode = CODE_FOR_avx2_gathersiv4di;
14206 goto gather_gen;
14207 case IX86_BUILTIN_GATHERDIV2DI:
14208 icode = CODE_FOR_avx2_gatherdiv2di;
14209 goto gather_gen;
14210 case IX86_BUILTIN_GATHERDIV4DI:
14211 icode = CODE_FOR_avx2_gatherdiv4di;
14212 goto gather_gen;
14213 case IX86_BUILTIN_GATHERSIV4SI:
14214 icode = CODE_FOR_avx2_gathersiv4si;
14215 goto gather_gen;
14216 case IX86_BUILTIN_GATHERSIV8SI:
14217 icode = CODE_FOR_avx2_gathersiv8si;
14218 goto gather_gen;
14219 case IX86_BUILTIN_GATHERDIV4SI:
14220 icode = CODE_FOR_avx2_gatherdiv4si;
14221 goto gather_gen;
14222 case IX86_BUILTIN_GATHERDIV8SI:
14223 icode = CODE_FOR_avx2_gatherdiv8si;
14224 goto gather_gen;
14225 case IX86_BUILTIN_GATHERALTSIV4DF:
14226 icode = CODE_FOR_avx2_gathersiv4df;
14227 goto gather_gen;
14228 case IX86_BUILTIN_GATHERALTDIV8SF:
14229 icode = CODE_FOR_avx2_gatherdiv8sf;
14230 goto gather_gen;
14231 case IX86_BUILTIN_GATHERALTSIV4DI:
14232 icode = CODE_FOR_avx2_gathersiv4di;
14233 goto gather_gen;
14234 case IX86_BUILTIN_GATHERALTDIV8SI:
14235 icode = CODE_FOR_avx2_gatherdiv8si;
14236 goto gather_gen;
14237 case IX86_BUILTIN_GATHER3SIV16SF:
14238 icode = CODE_FOR_avx512f_gathersiv16sf;
14239 goto gather_gen;
14240 case IX86_BUILTIN_GATHER3SIV8DF:
14241 icode = CODE_FOR_avx512f_gathersiv8df;
14242 goto gather_gen;
14243 case IX86_BUILTIN_GATHER3DIV16SF:
14244 icode = CODE_FOR_avx512f_gatherdiv16sf;
14245 goto gather_gen;
14246 case IX86_BUILTIN_GATHER3DIV8DF:
14247 icode = CODE_FOR_avx512f_gatherdiv8df;
14248 goto gather_gen;
14249 case IX86_BUILTIN_GATHER3SIV16SI:
14250 icode = CODE_FOR_avx512f_gathersiv16si;
14251 goto gather_gen;
14252 case IX86_BUILTIN_GATHER3SIV8DI:
14253 icode = CODE_FOR_avx512f_gathersiv8di;
14254 goto gather_gen;
14255 case IX86_BUILTIN_GATHER3DIV16SI:
14256 icode = CODE_FOR_avx512f_gatherdiv16si;
14257 goto gather_gen;
14258 case IX86_BUILTIN_GATHER3DIV8DI:
14259 icode = CODE_FOR_avx512f_gatherdiv8di;
14260 goto gather_gen;
14261 case IX86_BUILTIN_GATHER3ALTSIV8DF:
14262 icode = CODE_FOR_avx512f_gathersiv8df;
14263 goto gather_gen;
14264 case IX86_BUILTIN_GATHER3ALTDIV16SF:
14265 icode = CODE_FOR_avx512f_gatherdiv16sf;
14266 goto gather_gen;
14267 case IX86_BUILTIN_GATHER3ALTSIV8DI:
14268 icode = CODE_FOR_avx512f_gathersiv8di;
14269 goto gather_gen;
14270 case IX86_BUILTIN_GATHER3ALTDIV16SI:
14271 icode = CODE_FOR_avx512f_gatherdiv16si;
14272 goto gather_gen;
14273 case IX86_BUILTIN_GATHER3SIV2DF:
14274 icode = CODE_FOR_avx512vl_gathersiv2df;
14275 goto gather_gen;
14276 case IX86_BUILTIN_GATHER3SIV4DF:
14277 icode = CODE_FOR_avx512vl_gathersiv4df;
14278 goto gather_gen;
14279 case IX86_BUILTIN_GATHER3DIV2DF:
14280 icode = CODE_FOR_avx512vl_gatherdiv2df;
14281 goto gather_gen;
14282 case IX86_BUILTIN_GATHER3DIV4DF:
14283 icode = CODE_FOR_avx512vl_gatherdiv4df;
14284 goto gather_gen;
14285 case IX86_BUILTIN_GATHER3SIV4SF:
14286 icode = CODE_FOR_avx512vl_gathersiv4sf;
14287 goto gather_gen;
14288 case IX86_BUILTIN_GATHER3SIV8SF:
14289 icode = CODE_FOR_avx512vl_gathersiv8sf;
14290 goto gather_gen;
14291 case IX86_BUILTIN_GATHER3DIV4SF:
14292 icode = CODE_FOR_avx512vl_gatherdiv4sf;
14293 goto gather_gen;
14294 case IX86_BUILTIN_GATHER3DIV8SF:
14295 icode = CODE_FOR_avx512vl_gatherdiv8sf;
14296 goto gather_gen;
14297 case IX86_BUILTIN_GATHER3SIV2DI:
14298 icode = CODE_FOR_avx512vl_gathersiv2di;
14299 goto gather_gen;
14300 case IX86_BUILTIN_GATHER3SIV4DI:
14301 icode = CODE_FOR_avx512vl_gathersiv4di;
14302 goto gather_gen;
14303 case IX86_BUILTIN_GATHER3DIV2DI:
14304 icode = CODE_FOR_avx512vl_gatherdiv2di;
14305 goto gather_gen;
14306 case IX86_BUILTIN_GATHER3DIV4DI:
14307 icode = CODE_FOR_avx512vl_gatherdiv4di;
14308 goto gather_gen;
14309 case IX86_BUILTIN_GATHER3SIV4SI:
14310 icode = CODE_FOR_avx512vl_gathersiv4si;
14311 goto gather_gen;
14312 case IX86_BUILTIN_GATHER3SIV8SI:
14313 icode = CODE_FOR_avx512vl_gathersiv8si;
14314 goto gather_gen;
14315 case IX86_BUILTIN_GATHER3DIV4SI:
14316 icode = CODE_FOR_avx512vl_gatherdiv4si;
14317 goto gather_gen;
14318 case IX86_BUILTIN_GATHER3DIV8SI:
14319 icode = CODE_FOR_avx512vl_gatherdiv8si;
14320 goto gather_gen;
14321 case IX86_BUILTIN_GATHER3ALTSIV4DF:
14322 icode = CODE_FOR_avx512vl_gathersiv4df;
14323 goto gather_gen;
14324 case IX86_BUILTIN_GATHER3ALTDIV8SF:
14325 icode = CODE_FOR_avx512vl_gatherdiv8sf;
14326 goto gather_gen;
14327 case IX86_BUILTIN_GATHER3ALTSIV4DI:
14328 icode = CODE_FOR_avx512vl_gathersiv4di;
14329 goto gather_gen;
14330 case IX86_BUILTIN_GATHER3ALTDIV8SI:
14331 icode = CODE_FOR_avx512vl_gatherdiv8si;
14332 goto gather_gen;
14333 case IX86_BUILTIN_SCATTERSIV16SF:
14334 icode = CODE_FOR_avx512f_scattersiv16sf;
14335 goto scatter_gen;
14336 case IX86_BUILTIN_SCATTERSIV8DF:
14337 icode = CODE_FOR_avx512f_scattersiv8df;
14338 goto scatter_gen;
14339 case IX86_BUILTIN_SCATTERDIV16SF:
14340 icode = CODE_FOR_avx512f_scatterdiv16sf;
14341 goto scatter_gen;
14342 case IX86_BUILTIN_SCATTERDIV8DF:
14343 icode = CODE_FOR_avx512f_scatterdiv8df;
14344 goto scatter_gen;
14345 case IX86_BUILTIN_SCATTERSIV16SI:
14346 icode = CODE_FOR_avx512f_scattersiv16si;
14347 goto scatter_gen;
14348 case IX86_BUILTIN_SCATTERSIV8DI:
14349 icode = CODE_FOR_avx512f_scattersiv8di;
14350 goto scatter_gen;
14351 case IX86_BUILTIN_SCATTERDIV16SI:
14352 icode = CODE_FOR_avx512f_scatterdiv16si;
14353 goto scatter_gen;
14354 case IX86_BUILTIN_SCATTERDIV8DI:
14355 icode = CODE_FOR_avx512f_scatterdiv8di;
14356 goto scatter_gen;
14357 case IX86_BUILTIN_SCATTERSIV8SF:
14358 icode = CODE_FOR_avx512vl_scattersiv8sf;
14359 goto scatter_gen;
14360 case IX86_BUILTIN_SCATTERSIV4SF:
14361 icode = CODE_FOR_avx512vl_scattersiv4sf;
14362 goto scatter_gen;
14363 case IX86_BUILTIN_SCATTERSIV4DF:
14364 icode = CODE_FOR_avx512vl_scattersiv4df;
14365 goto scatter_gen;
14366 case IX86_BUILTIN_SCATTERSIV2DF:
14367 icode = CODE_FOR_avx512vl_scattersiv2df;
14368 goto scatter_gen;
14369 case IX86_BUILTIN_SCATTERDIV8SF:
14370 icode = CODE_FOR_avx512vl_scatterdiv8sf;
14371 goto scatter_gen;
14372 case IX86_BUILTIN_SCATTERDIV4SF:
14373 icode = CODE_FOR_avx512vl_scatterdiv4sf;
14374 goto scatter_gen;
14375 case IX86_BUILTIN_SCATTERDIV4DF:
14376 icode = CODE_FOR_avx512vl_scatterdiv4df;
14377 goto scatter_gen;
14378 case IX86_BUILTIN_SCATTERDIV2DF:
14379 icode = CODE_FOR_avx512vl_scatterdiv2df;
14380 goto scatter_gen;
14381 case IX86_BUILTIN_SCATTERSIV8SI:
14382 icode = CODE_FOR_avx512vl_scattersiv8si;
14383 goto scatter_gen;
14384 case IX86_BUILTIN_SCATTERSIV4SI:
14385 icode = CODE_FOR_avx512vl_scattersiv4si;
14386 goto scatter_gen;
14387 case IX86_BUILTIN_SCATTERSIV4DI:
14388 icode = CODE_FOR_avx512vl_scattersiv4di;
14389 goto scatter_gen;
14390 case IX86_BUILTIN_SCATTERSIV2DI:
14391 icode = CODE_FOR_avx512vl_scattersiv2di;
14392 goto scatter_gen;
14393 case IX86_BUILTIN_SCATTERDIV8SI:
14394 icode = CODE_FOR_avx512vl_scatterdiv8si;
14395 goto scatter_gen;
14396 case IX86_BUILTIN_SCATTERDIV4SI:
14397 icode = CODE_FOR_avx512vl_scatterdiv4si;
14398 goto scatter_gen;
14399 case IX86_BUILTIN_SCATTERDIV4DI:
14400 icode = CODE_FOR_avx512vl_scatterdiv4di;
14401 goto scatter_gen;
14402 case IX86_BUILTIN_SCATTERDIV2DI:
14403 icode = CODE_FOR_avx512vl_scatterdiv2di;
14404 goto scatter_gen;
14405 case IX86_BUILTIN_GATHERPFDPD:
14406 icode = CODE_FOR_avx512pf_gatherpfv8sidf;
14407 goto vec_prefetch_gen;
14408 case IX86_BUILTIN_SCATTERALTSIV8DF:
14409 icode = CODE_FOR_avx512f_scattersiv8df;
14410 goto scatter_gen;
14411 case IX86_BUILTIN_SCATTERALTDIV16SF:
14412 icode = CODE_FOR_avx512f_scatterdiv16sf;
14413 goto scatter_gen;
14414 case IX86_BUILTIN_SCATTERALTSIV8DI:
14415 icode = CODE_FOR_avx512f_scattersiv8di;
14416 goto scatter_gen;
14417 case IX86_BUILTIN_SCATTERALTDIV16SI:
14418 icode = CODE_FOR_avx512f_scatterdiv16si;
14419 goto scatter_gen;
14420 case IX86_BUILTIN_SCATTERALTSIV4DF:
14421 icode = CODE_FOR_avx512vl_scattersiv4df;
14422 goto scatter_gen;
14423 case IX86_BUILTIN_SCATTERALTDIV8SF:
14424 icode = CODE_FOR_avx512vl_scatterdiv8sf;
14425 goto scatter_gen;
14426 case IX86_BUILTIN_SCATTERALTSIV4DI:
14427 icode = CODE_FOR_avx512vl_scattersiv4di;
14428 goto scatter_gen;
14429 case IX86_BUILTIN_SCATTERALTDIV8SI:
14430 icode = CODE_FOR_avx512vl_scatterdiv8si;
14431 goto scatter_gen;
14432 case IX86_BUILTIN_SCATTERALTSIV2DF:
14433 icode = CODE_FOR_avx512vl_scattersiv2df;
14434 goto scatter_gen;
14435 case IX86_BUILTIN_SCATTERALTDIV4SF:
14436 icode = CODE_FOR_avx512vl_scatterdiv4sf;
14437 goto scatter_gen;
14438 case IX86_BUILTIN_SCATTERALTSIV2DI:
14439 icode = CODE_FOR_avx512vl_scattersiv2di;
14440 goto scatter_gen;
14441 case IX86_BUILTIN_SCATTERALTDIV4SI:
14442 icode = CODE_FOR_avx512vl_scatterdiv4si;
14443 goto scatter_gen;
14444 case IX86_BUILTIN_GATHERPFDPS:
14445 icode = CODE_FOR_avx512pf_gatherpfv16sisf;
14446 goto vec_prefetch_gen;
14447 case IX86_BUILTIN_GATHERPFQPD:
14448 icode = CODE_FOR_avx512pf_gatherpfv8didf;
14449 goto vec_prefetch_gen;
14450 case IX86_BUILTIN_GATHERPFQPS:
14451 icode = CODE_FOR_avx512pf_gatherpfv8disf;
14452 goto vec_prefetch_gen;
14453 case IX86_BUILTIN_SCATTERPFDPD:
14454 icode = CODE_FOR_avx512pf_scatterpfv8sidf;
14455 goto vec_prefetch_gen;
14456 case IX86_BUILTIN_SCATTERPFDPS:
14457 icode = CODE_FOR_avx512pf_scatterpfv16sisf;
14458 goto vec_prefetch_gen;
14459 case IX86_BUILTIN_SCATTERPFQPD:
14460 icode = CODE_FOR_avx512pf_scatterpfv8didf;
14461 goto vec_prefetch_gen;
14462 case IX86_BUILTIN_SCATTERPFQPS:
14463 icode = CODE_FOR_avx512pf_scatterpfv8disf;
14464 goto vec_prefetch_gen;
14465
14466 gather_gen:
14467 rtx half;
14468 rtx (*gen) (rtx, rtx);
14469
14470 arg0 = CALL_EXPR_ARG (exp, 0);
14471 arg1 = CALL_EXPR_ARG (exp, 1);
14472 arg2 = CALL_EXPR_ARG (exp, 2);
14473 arg3 = CALL_EXPR_ARG (exp, 3);
14474 arg4 = CALL_EXPR_ARG (exp, 4);
14475 op0 = expand_normal (arg0);
14476 op1 = expand_normal (arg1);
14477 op2 = expand_normal (arg2);
14478 op3 = expand_normal (arg3);
14479 op4 = expand_normal (arg4);
14480 /* Note the arg order is different from the operand order. */
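      /* For these gather builtins: arg0 is the merge/source vector, arg1 the
	 base address, arg2 the index vector, arg3 the mask and arg4 the
	 scale, feeding insn operands 1 through 5 (operand 0 is the
	 destination).  */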
14481 mode0 = insn_data[icode].operand[1].mode;
14482 mode2 = insn_data[icode].operand[3].mode;
14483 mode3 = insn_data[icode].operand[4].mode;
14484 mode4 = insn_data[icode].operand[5].mode;
14485
14486 if (target == NULL_RTX
14487 || GET_MODE (target) != insn_data[icode].operand[0].mode
14488 || !insn_data[icode].operand[0].predicate (target,
14489 GET_MODE (target)))
14490 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
14491 else
14492 subtarget = target;
14493
14494 switch (fcode)
14495 {
14496 case IX86_BUILTIN_GATHER3ALTSIV8DF:
14497 case IX86_BUILTIN_GATHER3ALTSIV8DI:
14498 half = gen_reg_rtx (V8SImode);
14499 if (!nonimmediate_operand (op2, V16SImode))
14500 op2 = copy_to_mode_reg (V16SImode, op2);
14501 emit_insn (gen_vec_extract_lo_v16si (half, op2));
14502 op2 = half;
14503 break;
14504 case IX86_BUILTIN_GATHER3ALTSIV4DF:
14505 case IX86_BUILTIN_GATHER3ALTSIV4DI:
14506 case IX86_BUILTIN_GATHERALTSIV4DF:
14507 case IX86_BUILTIN_GATHERALTSIV4DI:
14508 half = gen_reg_rtx (V4SImode);
14509 if (!nonimmediate_operand (op2, V8SImode))
14510 op2 = copy_to_mode_reg (V8SImode, op2);
14511 emit_insn (gen_vec_extract_lo_v8si (half, op2));
14512 op2 = half;
14513 break;
14514 case IX86_BUILTIN_GATHER3ALTDIV16SF:
14515 case IX86_BUILTIN_GATHER3ALTDIV16SI:
14516 half = gen_reg_rtx (mode0);
14517 if (mode0 == V8SFmode)
14518 gen = gen_vec_extract_lo_v16sf;
14519 else
14520 gen = gen_vec_extract_lo_v16si;
14521 if (!nonimmediate_operand (op0, GET_MODE (op0)))
14522 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
14523 emit_insn (gen (half, op0));
14524 op0 = half;
14525 op3 = lowpart_subreg (QImode, op3, HImode);
14526 break;
14527 case IX86_BUILTIN_GATHER3ALTDIV8SF:
14528 case IX86_BUILTIN_GATHER3ALTDIV8SI:
14529 case IX86_BUILTIN_GATHERALTDIV8SF:
14530 case IX86_BUILTIN_GATHERALTDIV8SI:
14531 half = gen_reg_rtx (mode0);
14532 if (mode0 == V4SFmode)
14533 gen = gen_vec_extract_lo_v8sf;
14534 else
14535 gen = gen_vec_extract_lo_v8si;
14536 if (!nonimmediate_operand (op0, GET_MODE (op0)))
14537 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
14538 emit_insn (gen (half, op0));
14539 op0 = half;
14540 if (VECTOR_MODE_P (GET_MODE (op3)))
14541 {
14542 half = gen_reg_rtx (mode0);
14543 if (!nonimmediate_operand (op3, GET_MODE (op3)))
14544 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
14545 emit_insn (gen (half, op3));
14546 op3 = half;
14547 }
14548 break;
14549 default:
14550 break;
14551 }
14552
14553 /* Force the memory operand to use only a base register here; we
14554 do not want to do this to the memory operands of other builtin
14555 functions. */
14556 op1 = ix86_zero_extend_to_Pmode (op1);
14557
14558 if (!insn_data[icode].operand[1].predicate (op0, mode0))
14559 op0 = copy_to_mode_reg (mode0, op0);
14560 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
14561 op1 = copy_to_mode_reg (Pmode, op1);
14562 if (!insn_data[icode].operand[3].predicate (op2, mode2))
14563 op2 = copy_to_mode_reg (mode2, op2);
14564
14565 op3 = fixup_modeless_constant (op3, mode3);
14566
14567 if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
14568 {
14569 if (!insn_data[icode].operand[4].predicate (op3, mode3))
14570 op3 = copy_to_mode_reg (mode3, op3);
14571 }
14572 else
14573 {
14574 op3 = copy_to_reg (op3);
14575 op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
14576 }
14577 if (!insn_data[icode].operand[5].predicate (op4, mode4))
14578 {
14579 error ("the last argument must be scale 1, 2, 4, 8");
14580 return const0_rtx;
14581 }
14582
14583 /* Optimize. If mask is known to have all high bits set,
14584 replace op0 with pc_rtx to signal that the instruction
14585 overwrites the whole destination and doesn't use its
14586 previous contents. */
14587 if (optimize)
14588 {
14589 if (TREE_CODE (arg3) == INTEGER_CST)
14590 {
14591 if (integer_all_onesp (arg3))
14592 op0 = pc_rtx;
14593 }
14594 else if (TREE_CODE (arg3) == VECTOR_CST)
14595 {
14596 unsigned int negative = 0;
14597 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
14598 {
14599 tree cst = VECTOR_CST_ELT (arg3, i);
14600 if (TREE_CODE (cst) == INTEGER_CST
14601 && tree_int_cst_sign_bit (cst))
14602 negative++;
14603 else if (TREE_CODE (cst) == REAL_CST
14604 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
14605 negative++;
14606 }
14607 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
14608 op0 = pc_rtx;
14609 }
14610 else if (TREE_CODE (arg3) == SSA_NAME
14611 && VECTOR_TYPE_P (TREE_TYPE (arg3)))
14612 {
14613 /* Recognize also when mask is like:
14614 __v2df src = _mm_setzero_pd ();
14615 __v2df mask = _mm_cmpeq_pd (src, src);
14616 or
14617 __v8sf src = _mm256_setzero_ps ();
14618 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
14619 as that is a cheaper way to load all ones into
14620 a register than having to load a constant from
14621 memory. */
14622 gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
14623 if (is_gimple_call (def_stmt))
14624 {
14625 tree fndecl = gimple_call_fndecl (def_stmt);
14626 if (fndecl
14627 && fndecl_built_in_p (fndecl, BUILT_IN_MD))
14628 switch (DECL_MD_FUNCTION_CODE (fndecl))
14629 {
14630 case IX86_BUILTIN_CMPPD:
14631 case IX86_BUILTIN_CMPPS:
14632 case IX86_BUILTIN_CMPPD256:
14633 case IX86_BUILTIN_CMPPS256:
14634 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
14635 break;
14636 /* FALLTHRU */
14637 case IX86_BUILTIN_CMPEQPD:
14638 case IX86_BUILTIN_CMPEQPS:
14639 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
14640 && initializer_zerop (gimple_call_arg (def_stmt,
14641 1)))
14642 op0 = pc_rtx;
14643 break;
14644 default:
14645 break;
14646 }
14647 }
14648 }
14649 }
14650
14651 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
14652 if (! pat)
14653 return const0_rtx;
14654 emit_insn (pat);
14655
14656 switch (fcode)
14657 {
14658 case IX86_BUILTIN_GATHER3DIV16SF:
14659 if (target == NULL_RTX)
14660 target = gen_reg_rtx (V8SFmode);
14661 emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
14662 break;
14663 case IX86_BUILTIN_GATHER3DIV16SI:
14664 if (target == NULL_RTX)
14665 target = gen_reg_rtx (V8SImode);
14666 emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
14667 break;
14668 case IX86_BUILTIN_GATHER3DIV8SF:
14669 case IX86_BUILTIN_GATHERDIV8SF:
14670 if (target == NULL_RTX)
14671 target = gen_reg_rtx (V4SFmode);
14672 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
14673 break;
14674 case IX86_BUILTIN_GATHER3DIV8SI:
14675 case IX86_BUILTIN_GATHERDIV8SI:
14676 if (target == NULL_RTX)
14677 target = gen_reg_rtx (V4SImode);
14678 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
14679 break;
14680 default:
14681 target = subtarget;
14682 break;
14683 }
14684 return target;
14685
14686 scatter_gen:
14687 arg0 = CALL_EXPR_ARG (exp, 0);
14688 arg1 = CALL_EXPR_ARG (exp, 1);
14689 arg2 = CALL_EXPR_ARG (exp, 2);
14690 arg3 = CALL_EXPR_ARG (exp, 3);
14691 arg4 = CALL_EXPR_ARG (exp, 4);
14692 op0 = expand_normal (arg0);
14693 op1 = expand_normal (arg1);
14694 op2 = expand_normal (arg2);
14695 op3 = expand_normal (arg3);
14696 op4 = expand_normal (arg4);
14697 mode1 = insn_data[icode].operand[1].mode;
14698 mode2 = insn_data[icode].operand[2].mode;
14699 mode3 = insn_data[icode].operand[3].mode;
14700 mode4 = insn_data[icode].operand[4].mode;
14701
14702 /* The scatter instruction stores operand op3 to memory, using
14703 indices from op2 and scale from op4, under writemask op1.
14704 If index operand op2 has more elements than source operand
14705 op3, only its low half is used, and vice versa. */
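      /* For example, IX86_BUILTIN_SCATTERALTSIV8DF pairs a V16SI index with
	 only eight DF elements, so the case below extracts the low V8SI half
	 of the index before emitting the scatter.  */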
14706 switch (fcode)
14707 {
14708 case IX86_BUILTIN_SCATTERALTSIV8DF:
14709 case IX86_BUILTIN_SCATTERALTSIV8DI:
14710 half = gen_reg_rtx (V8SImode);
14711 if (!nonimmediate_operand (op2, V16SImode))
14712 op2 = copy_to_mode_reg (V16SImode, op2);
14713 emit_insn (gen_vec_extract_lo_v16si (half, op2));
14714 op2 = half;
14715 break;
14716 case IX86_BUILTIN_SCATTERALTDIV16SF:
14717 case IX86_BUILTIN_SCATTERALTDIV16SI:
14718 half = gen_reg_rtx (mode3);
14719 if (mode3 == V8SFmode)
14720 gen = gen_vec_extract_lo_v16sf;
14721 else
14722 gen = gen_vec_extract_lo_v16si;
14723 if (!nonimmediate_operand (op3, GET_MODE (op3)))
14724 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
14725 emit_insn (gen (half, op3));
14726 op3 = half;
14727 break;
14728 case IX86_BUILTIN_SCATTERALTSIV4DF:
14729 case IX86_BUILTIN_SCATTERALTSIV4DI:
14730 half = gen_reg_rtx (V4SImode);
14731 if (!nonimmediate_operand (op2, V8SImode))
14732 op2 = copy_to_mode_reg (V8SImode, op2);
14733 emit_insn (gen_vec_extract_lo_v8si (half, op2));
14734 op2 = half;
14735 break;
14736 case IX86_BUILTIN_SCATTERALTDIV8SF:
14737 case IX86_BUILTIN_SCATTERALTDIV8SI:
14738 half = gen_reg_rtx (mode3);
14739 if (mode3 == V4SFmode)
14740 gen = gen_vec_extract_lo_v8sf;
14741 else
14742 gen = gen_vec_extract_lo_v8si;
14743 if (!nonimmediate_operand (op3, GET_MODE (op3)))
14744 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
14745 emit_insn (gen (half, op3));
14746 op3 = half;
14747 break;
14748 case IX86_BUILTIN_SCATTERALTSIV2DF:
14749 case IX86_BUILTIN_SCATTERALTSIV2DI:
14750 if (!nonimmediate_operand (op2, V4SImode))
14751 op2 = copy_to_mode_reg (V4SImode, op2);
14752 break;
14753 case IX86_BUILTIN_SCATTERALTDIV4SF:
14754 case IX86_BUILTIN_SCATTERALTDIV4SI:
14755 if (!nonimmediate_operand (op3, GET_MODE (op3)))
14756 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
14757 break;
14758 default:
14759 break;
14760 }
14761
14762 /* Force the memory operand to use only a base register here; we
14763 do not want to do this to the memory operands of other builtin
14764 functions. */
14765 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
14766
14767 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
14768 op0 = copy_to_mode_reg (Pmode, op0);
14769
14770 op1 = fixup_modeless_constant (op1, mode1);
14771
14772 if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
14773 {
14774 if (!insn_data[icode].operand[1].predicate (op1, mode1))
14775 op1 = copy_to_mode_reg (mode1, op1);
14776 }
14777 else
14778 {
14779 op1 = copy_to_reg (op1);
14780 op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
14781 }
14782
14783 if (!insn_data[icode].operand[2].predicate (op2, mode2))
14784 op2 = copy_to_mode_reg (mode2, op2);
14785
14786 if (!insn_data[icode].operand[3].predicate (op3, mode3))
14787 op3 = copy_to_mode_reg (mode3, op3);
14788
14789 if (!insn_data[icode].operand[4].predicate (op4, mode4))
14790 {
14791 error ("the last argument must be scale 1, 2, 4, 8");
14792 return const0_rtx;
14793 }
14794
14795 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
14796 if (! pat)
14797 return const0_rtx;
14798
14799 emit_insn (pat);
14800 return 0;
14801
14802 vec_prefetch_gen:
14803 arg0 = CALL_EXPR_ARG (exp, 0);
14804 arg1 = CALL_EXPR_ARG (exp, 1);
14805 arg2 = CALL_EXPR_ARG (exp, 2);
14806 arg3 = CALL_EXPR_ARG (exp, 3);
14807 arg4 = CALL_EXPR_ARG (exp, 4);
14808 op0 = expand_normal (arg0);
14809 op1 = expand_normal (arg1);
14810 op2 = expand_normal (arg2);
14811 op3 = expand_normal (arg3);
14812 op4 = expand_normal (arg4);
14813 mode0 = insn_data[icode].operand[0].mode;
14814 mode1 = insn_data[icode].operand[1].mode;
14815 mode3 = insn_data[icode].operand[3].mode;
14816 mode4 = insn_data[icode].operand[4].mode;
14817
14818 op0 = fixup_modeless_constant (op0, mode0);
14819
14820 if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
14821 {
14822 if (!insn_data[icode].operand[0].predicate (op0, mode0))
14823 op0 = copy_to_mode_reg (mode0, op0);
14824 }
14825 else
14826 {
14827 op0 = copy_to_reg (op0);
14828 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
14829 }
14830
14831 if (!insn_data[icode].operand[1].predicate (op1, mode1))
14832 op1 = copy_to_mode_reg (mode1, op1);
14833
14834 /* Force the memory operand to use only a base register here. We
14835 don't want to do this for the memory operands of other builtin
14836 functions. */
14837 op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
14838
14839 if (!insn_data[icode].operand[2].predicate (op2, Pmode))
14840 op2 = copy_to_mode_reg (Pmode, op2);
14841
14842 if (!insn_data[icode].operand[3].predicate (op3, mode3))
14843 {
14844 error ("the forth argument must be scale 1, 2, 4, 8");
14845 return const0_rtx;
14846 }
14847
14848 if (!insn_data[icode].operand[4].predicate (op4, mode4))
14849 {
14850 error ("incorrect hint operand");
14851 return const0_rtx;
14852 }
14853
14854 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
14855 if (! pat)
14856 return const0_rtx;
14857
14858 emit_insn (pat);
14859
14860 return 0;
14861
14862 case IX86_BUILTIN_XABORT:
14863 icode = CODE_FOR_xabort;
14864 arg0 = CALL_EXPR_ARG (exp, 0);
14865 op0 = expand_normal (arg0);
14866 mode0 = insn_data[icode].operand[0].mode;
14867 if (!insn_data[icode].operand[0].predicate (op0, mode0))
14868 {
14869 error ("the argument to %<xabort%> intrinsic must "
14870 "be an 8-bit immediate");
14871 return const0_rtx;
14872 }
14873 emit_insn (gen_xabort (op0));
14874 return 0;
14875
14876 case IX86_BUILTIN_RDSSPD:
14877 case IX86_BUILTIN_RDSSPQ:
14878 mode = (fcode == IX86_BUILTIN_RDSSPD ? SImode : DImode);
14879
14880 if (target == 0
14881 || !register_operand (target, mode))
14882 target = gen_reg_rtx (mode);
14883
14884 op0 = force_reg (mode, const0_rtx);
14885
14886 emit_insn (gen_rdssp (mode, target, op0));
14887 return target;
14888
14889 case IX86_BUILTIN_INCSSPD:
14890 case IX86_BUILTIN_INCSSPQ:
14891 mode = (fcode == IX86_BUILTIN_INCSSPD ? SImode : DImode);
14892
14893 arg0 = CALL_EXPR_ARG (exp, 0);
14894 op0 = expand_normal (arg0);
14895
14896 op0 = force_reg (mode, op0);
14897
14898 emit_insn (gen_incssp (mode, op0));
14899 return 0;
14900
14901 case IX86_BUILTIN_HRESET:
14902 icode = CODE_FOR_hreset;
14903 arg0 = CALL_EXPR_ARG (exp, 0);
14904 op0 = expand_normal (arg0);
14905 op0 = force_reg (SImode, op0);
14906 emit_insn (gen_hreset (op0));
14907 return 0;
14908
14909 case IX86_BUILTIN_RSTORSSP:
14910 case IX86_BUILTIN_CLRSSBSY:
14911 arg0 = CALL_EXPR_ARG (exp, 0);
14912 op0 = expand_normal (arg0);
14913 icode = (fcode == IX86_BUILTIN_RSTORSSP
14914 ? CODE_FOR_rstorssp
14915 : CODE_FOR_clrssbsy);
14916
14917 if (!address_operand (op0, VOIDmode))
14918 {
14919 op0 = convert_memory_address (Pmode, op0);
14920 op0 = copy_addr_to_reg (op0);
14921 }
14922 emit_insn (GEN_FCN (icode) (gen_rtx_MEM (DImode, op0)));
14923 return 0;
14924
14925 case IX86_BUILTIN_WRSSD:
14926 case IX86_BUILTIN_WRSSQ:
14927 case IX86_BUILTIN_WRUSSD:
14928 case IX86_BUILTIN_WRUSSQ:
14929 mode = ((fcode == IX86_BUILTIN_WRSSD
14930 || fcode == IX86_BUILTIN_WRUSSD)
14931 ? SImode : DImode);
14932
14933 arg0 = CALL_EXPR_ARG (exp, 0);
14934 op0 = expand_normal (arg0);
14935 arg1 = CALL_EXPR_ARG (exp, 1);
14936 op1 = expand_normal (arg1);
14937
14938 op0 = force_reg (mode, op0);
14939
14940 if (!address_operand (op1, VOIDmode))
14941 {
14942 op1 = convert_memory_address (Pmode, op1);
14943 op1 = copy_addr_to_reg (op1);
14944 }
14945 op1 = gen_rtx_MEM (mode, op1);
14946
14947 icode = ((fcode == IX86_BUILTIN_WRSSD
14948 || fcode == IX86_BUILTIN_WRSSQ)
14949 ? code_for_wrss (mode)
14950 : code_for_wruss (mode));
14951 emit_insn (GEN_FCN (icode) (op0, op1));
14952
14953 return 0;
14954
14955 default:
14956 break;
14957 }
14958
14959 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
14960 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
14961 {
14962 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
14963 return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
14964 target);
14965 }
14966
14967 if (fcode >= IX86_BUILTIN__BDESC_PURE_ARGS_FIRST
14968 && fcode <= IX86_BUILTIN__BDESC_PURE_ARGS_LAST)
14969 {
14970 i = fcode - IX86_BUILTIN__BDESC_PURE_ARGS_FIRST;
14971 return ix86_expand_special_args_builtin (bdesc_pure_args + i, exp,
14972 target);
14973 }
14974
14975 if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
14976 && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
14977 {
14978 i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
14979 rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
14980 rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
14981 rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
14982 int masked = 1;
14983 machine_mode mode, wide_mode, nar_mode;
14984
14985 nar_mode = V4SFmode;
14986 mode = V16SFmode;
14987 wide_mode = V64SFmode;
14988 fcn_mask = gen_avx5124fmaddps_4fmaddps_mask;
14989 fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
14990
14991 switch (fcode)
14992 {
14993 case IX86_BUILTIN_4FMAPS:
14994 fcn = gen_avx5124fmaddps_4fmaddps;
14995 masked = 0;
14996 goto v4fma_expand;
14997
14998 case IX86_BUILTIN_4DPWSSD:
14999 nar_mode = V4SImode;
15000 mode = V16SImode;
15001 wide_mode = V64SImode;
15002 fcn = gen_avx5124vnniw_vp4dpwssd;
15003 masked = 0;
15004 goto v4fma_expand;
15005
15006 case IX86_BUILTIN_4DPWSSDS:
15007 nar_mode = V4SImode;
15008 mode = V16SImode;
15009 wide_mode = V64SImode;
15010 fcn = gen_avx5124vnniw_vp4dpwssds;
15011 masked = 0;
15012 goto v4fma_expand;
15013
15014 case IX86_BUILTIN_4FNMAPS:
15015 fcn = gen_avx5124fmaddps_4fnmaddps;
15016 masked = 0;
15017 goto v4fma_expand;
15018
15019 case IX86_BUILTIN_4FNMAPS_MASK:
15020 fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask;
15021 fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
15022 goto v4fma_expand;
15023
15024 case IX86_BUILTIN_4DPWSSD_MASK:
15025 nar_mode = V4SImode;
15026 mode = V16SImode;
15027 wide_mode = V64SImode;
15028 fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask;
15029 fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
15030 goto v4fma_expand;
15031
15032 case IX86_BUILTIN_4DPWSSDS_MASK:
15033 nar_mode = V4SImode;
15034 mode = V16SImode;
15035 wide_mode = V64SImode;
15036 fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask;
15037 fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
15038 goto v4fma_expand;
15039
15040 case IX86_BUILTIN_4FMAPS_MASK:
15041 {
15042 tree args[4];
15043 rtx ops[4];
15044 rtx wide_reg;
15045 rtx accum;
15046 rtx addr;
15047 rtx mem;
15048
15049 v4fma_expand:
15050 wide_reg = gen_reg_rtx (wide_mode);
15051 for (i = 0; i < 4; i++)
15052 {
15053 args[i] = CALL_EXPR_ARG (exp, i);
15054 ops[i] = expand_normal (args[i]);
15055
15056 emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
15057 ops[i]);
15058 }
15059
15060 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
15061 accum = force_reg (mode, accum);
15062
15063 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
15064 addr = force_reg (Pmode, addr);
15065
15066 mem = gen_rtx_MEM (nar_mode, addr);
15067
15068 target = gen_reg_rtx (mode);
15069
15070 emit_move_insn (target, accum);
15071
15072 if (! masked)
15073 emit_insn (fcn (target, accum, wide_reg, mem));
15074 else
15075 {
15076 rtx merge, mask;
15077 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
15078
15079 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
15080
15081 if (CONST_INT_P (mask))
15082 mask = fixup_modeless_constant (mask, HImode);
15083
15084 mask = force_reg (HImode, mask);
15085
15086 if (GET_MODE (mask) != HImode)
15087 mask = gen_rtx_SUBREG (HImode, mask, 0);
15088
15089 /* If merge is 0 then we're about to emit z-masked variant. */
15090 if (const0_operand (merge, mode))
15091 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
15092 /* If merge is the same as accum then emit merge-masked variant. */
15093 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
15094 {
15095 merge = force_reg (mode, merge);
15096 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
15097 }
15098 /* Merging with an unknown value can happen if we z-mask with -O0. */
15099 else
15100 {
15101 target = gen_reg_rtx (mode);
15102 emit_move_insn (target, merge);
15103 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
15104 }
15105 }
15106 return target;
15107 }
15108
15109 case IX86_BUILTIN_4FNMASS:
15110 fcn = gen_avx5124fmaddps_4fnmaddss;
15111 masked = 0;
15112 goto s4fma_expand;
15113
15114 case IX86_BUILTIN_4FMASS:
15115 fcn = gen_avx5124fmaddps_4fmaddss;
15116 masked = 0;
15117 goto s4fma_expand;
15118
15119 case IX86_BUILTIN_4FNMASS_MASK:
15120 fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
15121 fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
15122 goto s4fma_expand;
15123
15124 case IX86_BUILTIN_4FMASS_MASK:
15125 {
15126 tree args[4];
15127 rtx ops[4];
15128 rtx wide_reg;
15129 rtx accum;
15130 rtx addr;
15131 rtx mem;
15132
15133 fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
15134 fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
15135
15136 s4fma_expand:
15137 mode = V4SFmode;
15138 wide_reg = gen_reg_rtx (V64SFmode);
15139 for (i = 0; i < 4; i++)
15140 {
15141 rtx tmp;
15142 args[i] = CALL_EXPR_ARG (exp, i);
15143 ops[i] = expand_normal (args[i]);
15144
15145 tmp = gen_reg_rtx (SFmode);
15146 emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
15147
15148 emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
15149 gen_rtx_SUBREG (V16SFmode, tmp, 0));
15150 }
15151
15152 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
15153 accum = force_reg (V4SFmode, accum);
15154
15155 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
15156 addr = force_reg (Pmode, addr);
15157
15158 mem = gen_rtx_MEM (V4SFmode, addr);
15159
15160 target = gen_reg_rtx (V4SFmode);
15161
15162 emit_move_insn (target, accum);
15163
15164 if (! masked)
15165 emit_insn (fcn (target, accum, wide_reg, mem));
15166 else
15167 {
15168 rtx merge, mask;
15169 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
15170
15171 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
15172
15173 if (CONST_INT_P (mask))
15174 mask = fixup_modeless_constant (mask, QImode);
15175
15176 mask = force_reg (QImode, mask);
15177
15178 if (GET_MODE (mask) != QImode)
15179 mask = gen_rtx_SUBREG (QImode, mask, 0);
15180
15181 /* If merge is 0 then we're about to emit z-masked variant. */
15182 if (const0_operand (merge, mode))
15183 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
15184 /* If merge is the same as accum then emit merge-masked
15185 variant. */
15186 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
15187 {
15188 merge = force_reg (mode, merge);
15189 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
15190 }
15191 /* Merging with an unknown value can happen if we z-mask
15192 with -O0. */
15193 else
15194 {
15195 target = gen_reg_rtx (mode);
15196 emit_move_insn (target, merge);
15197 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
15198 }
15199 }
15200 return target;
15201 }
15202 case IX86_BUILTIN_RDPID:
15203 return ix86_expand_special_args_builtin (bdesc_args + i, exp,
15204 target);
15205 case IX86_BUILTIN_FABSQ:
15206 case IX86_BUILTIN_COPYSIGNQ:
15207 if (!TARGET_SSE)
15208 /* Emit a normal call if SSE isn't available. */
15209 return expand_call (exp, target, ignore);
15210 /* FALLTHRU */
15211 default:
15212 return ix86_expand_args_builtin (bdesc_args + i, exp, target);
15213 }
15214 }
15215
15216 if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
15217 && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
15218 {
15219 i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
15220 return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
15221 }
15222
15223 if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
15224 && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
15225 {
15226 i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
15227 return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
15228 }
15229
15230 if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
15231 && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
15232 {
15233 i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
15234 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
15235 }
15236
15237 if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
15238 && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
15239 {
15240 i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
15241 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
15242 }
15243
15244 if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
15245 && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
15246 {
15247 i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
15248 const struct builtin_description *d = bdesc_multi_arg + i;
15249 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
15250 (enum ix86_builtin_func_type)
15251 d->flag, d->comparison);
15252 }
15253
15254 if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
15255 && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
15256 {
15257 i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
15258 return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
15259 target);
15260 }
15261
15262 gcc_unreachable ();
15263 }
15264
15265 /* A subroutine of ix86_expand_vector_init_duplicate. Tries to
15266 fill target with val via vec_duplicate. */
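/* Usage sketch (hypothetical caller, values chosen for illustration):

     rtx t = gen_reg_rtx (V4SImode);
     rtx v = gen_int_mode (42, SImode);
     ix86_vector_duplicate_value (V4SImode, t, v);

   This emits (set t (vec_duplicate:V4SI (const_int 42))); if that
   pattern is not directly recognizable, VAL is first forced into a
   register of the inner mode, as the function body below does.  */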
15267
15268 static bool
15269 ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
15270 {
15271 bool ok;
15272 rtx_insn *insn;
15273 rtx dup;
15274 /* Save/restore recog_data in case this is called from splitters
15275 or other routines where recog_data needs to stay valid across
15276 force_reg. See PR106577. */
15277 recog_data_d recog_data_save = recog_data;
15278
15279 /* First attempt to recognize VAL as-is. */
15280 dup = gen_vec_duplicate (mode, val);
15281 insn = emit_insn (gen_rtx_SET (target, dup));
15282 if (recog_memoized (insn) < 0)
15283 {
15284 rtx_insn *seq;
15285 machine_mode innermode = GET_MODE_INNER (mode);
15286 rtx reg;
15287
15288 /* If that fails, force VAL into a register. */
15289
15290 start_sequence ();
15291 reg = force_reg (innermode, val);
15292 if (GET_MODE (reg) != innermode)
15293 reg = gen_lowpart (innermode, reg);
15294 SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
15295 seq = get_insns ();
15296 end_sequence ();
15297 if (seq)
15298 emit_insn_before (seq, insn);
15299
15300 ok = recog_memoized (insn) >= 0;
15301 gcc_assert (ok);
15302 }
15303 recog_data = recog_data_save;
15304 return true;
15305 }
15306
15307 /* Get a vector mode of the same size as the original but with elements
15308 twice as wide. This is only guaranteed to apply to integral vectors. */
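/* For example, V16QImode (sixteen 8-bit elements) yields V8HImode
   (eight 16-bit elements); both are 16 bytes wide.  */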
15309
15310 static machine_mode
15311 get_mode_wider_vector (machine_mode o)
15312 {
15313 /* ??? Rely on the ordering that genmodes.cc gives to vectors. */
15314 machine_mode n = GET_MODE_NEXT_MODE (o).require ();
15315 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
15316 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
15317 return n;
15318 }
15319
15320 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
15321 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
15322
15323 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
15324 with all elements equal to VAR. Return true if successful. */
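/* Usage sketch (hypothetical caller): to splat X across a V4SFmode
   pseudo one would write roughly

     rtx v = gen_reg_rtx (V4SFmode);
     if (!ix86_expand_vector_init_duplicate (false, V4SFmode, v, x))
       ... fall back to a more general init strategy ...

   The switch below chooses between a native vec_duplicate, a shuffle
   of the low element, or the "widen" fallback, depending on MODE and
   the enabled ISA extensions.  */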
15325
15326 bool
15327 ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
15328 rtx target, rtx val)
15329 {
15330 bool ok;
15331
15332 switch (mode)
15333 {
15334 case E_V2SImode:
15335 case E_V2SFmode:
15336 if (!mmx_ok)
15337 return false;
15338 /* FALLTHRU */
15339
15340 case E_V4DFmode:
15341 case E_V4DImode:
15342 case E_V8SFmode:
15343 case E_V8SImode:
15344 case E_V2DFmode:
15345 case E_V2DImode:
15346 case E_V4SFmode:
15347 case E_V4SImode:
15348 case E_V16SImode:
15349 case E_V8DImode:
15350 case E_V16SFmode:
15351 case E_V8DFmode:
15352 return ix86_vector_duplicate_value (mode, target, val);
15353
15354 case E_V4HImode:
15355 if (!mmx_ok)
15356 return false;
15357 if (TARGET_SSE || TARGET_3DNOW_A)
15358 {
15359 rtx x;
15360
15361 val = gen_lowpart (SImode, val);
15362 x = gen_rtx_TRUNCATE (HImode, val);
15363 x = gen_rtx_VEC_DUPLICATE (mode, x);
15364 emit_insn (gen_rtx_SET (target, x));
15365 return true;
15366 }
15367 goto widen;
15368
15369 case E_V2HImode:
15370 if (TARGET_SSE2)
15371 {
15372 rtx x;
15373
15374 val = gen_lowpart (SImode, val);
15375 x = gen_rtx_TRUNCATE (HImode, val);
15376 x = gen_rtx_VEC_DUPLICATE (mode, x);
15377 emit_insn (gen_rtx_SET (target, x));
15378 return true;
15379 }
15380 return false;
15381
15382 case E_V8QImode:
15383 case E_V4QImode:
15384 if (!mmx_ok)
15385 return false;
15386 goto widen;
15387
15388 case E_V8HImode:
15389 case E_V8HFmode:
15390 case E_V8BFmode:
15391 if (TARGET_AVX2)
15392 return ix86_vector_duplicate_value (mode, target, val);
15393
15394 if (TARGET_SSE2)
15395 {
15396 struct expand_vec_perm_d dperm;
15397 rtx tmp1, tmp2;
15398
15399 permute:
15400 memset (&dperm, 0, sizeof (dperm));
15401 dperm.target = target;
15402 dperm.vmode = mode;
15403 dperm.nelt = GET_MODE_NUNITS (mode);
15404 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
15405 dperm.one_operand_p = true;
15406
15407 if (mode == V8HFmode || mode == V8BFmode)
15408 {
15409 tmp1 = force_reg (GET_MODE_INNER (mode), val);
15410 tmp2 = gen_reg_rtx (mode);
15411 emit_insn (maybe_gen_vec_set_0 (mode, tmp2,
15412 CONST0_RTX (mode), tmp1));
15413 tmp1 = gen_lowpart (mode, tmp2);
15414 }
15415 else
15416 {
15417 /* Extend to SImode using a paradoxical SUBREG. */
15418 tmp1 = gen_reg_rtx (SImode);
15419 emit_move_insn (tmp1, gen_lowpart (SImode, val));
15420
15421 /* Insert the SImode value as
15422 low element of a V4SImode vector. */
15423 tmp2 = gen_reg_rtx (V4SImode);
15424 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
15425 tmp1 = gen_lowpart (mode, tmp2);
15426 }
15427
15428 emit_move_insn (dperm.op0, tmp1);
15429 ok = (expand_vec_perm_1 (&dperm)
15430 || expand_vec_perm_broadcast_1 (&dperm));
15431 gcc_assert (ok);
15432 return ok;
15433 }
15434 goto widen;
15435
15436 case E_V16QImode:
15437 if (TARGET_AVX2)
15438 return ix86_vector_duplicate_value (mode, target, val);
15439
15440 if (TARGET_SSE2)
15441 goto permute;
15442 goto widen;
15443
15444 widen:
15445 /* Replicate the value once into the next wider mode and recurse. */
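/* For example, to splat a QImode value B into V16QImode when no
   broadcast or shuffle path applies, VAL is first widened to the
   HImode word (B << 8) | B, that word is broadcast as V8HImode by
   the recursive call, and the result is reinterpreted as V16QImode.  */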
15446 {
15447 machine_mode smode, wsmode, wvmode;
15448 rtx x;
15449
15450 smode = GET_MODE_INNER (mode);
15451 wvmode = get_mode_wider_vector (mode);
15452 wsmode = GET_MODE_INNER (wvmode);
15453
15454 val = convert_modes (wsmode, smode, val, true);
15455
15456 if (smode == QImode && !TARGET_PARTIAL_REG_STALL)
15457 emit_insn (gen_insv_1 (wsmode, val, val));
15458 else
15459 {
15460 x = expand_simple_binop (wsmode, ASHIFT, val,
15461 GEN_INT (GET_MODE_BITSIZE (smode)),
15462 NULL_RTX, 1, OPTAB_LIB_WIDEN);
15463 val = expand_simple_binop (wsmode, IOR, val, x, x, 1,
15464 OPTAB_LIB_WIDEN);
15465 }
15466
15467 x = gen_reg_rtx (wvmode);
15468 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
15469 gcc_assert (ok);
15470 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
15471 return ok;
15472 }
15473
15474 case E_V16HImode:
15475 case E_V16HFmode:
15476 case E_V16BFmode:
15477 case E_V32QImode:
15478 if (TARGET_AVX2)
15479 return ix86_vector_duplicate_value (mode, target, val);
15480 else
15481 {
15482 machine_mode hvmode;
15483 switch (mode)
15484 {
15485 case V16HImode:
15486 hvmode = V8HImode;
15487 break;
15488 case V16HFmode:
15489 hvmode = V8HFmode;
15490 break;
15491 case V16BFmode:
15492 hvmode = V8BFmode;
15493 break;
15494 case V32QImode:
15495 hvmode = V16QImode;
15496 break;
15497 default:
15498 gcc_unreachable ();
15499 }
15500 rtx x = gen_reg_rtx (hvmode);
15501
15502 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
15503 gcc_assert (ok);
15504
15505 x = gen_rtx_VEC_CONCAT (mode, x, x);
15506 emit_insn (gen_rtx_SET (target, x));
15507 }
15508 return true;
15509
15510 case E_V32HImode:
15511 case E_V32HFmode:
15512 case E_V32BFmode:
15513 case E_V64QImode:
15514 if (TARGET_AVX512BW)
15515 return ix86_vector_duplicate_value (mode, target, val);
15516 else
15517 {
15518 machine_mode hvmode;
15519 switch (mode)
15520 {
15521 case V32HImode:
15522 hvmode = V16HImode;
15523 break;
15524 case V32HFmode:
15525 hvmode = V16HFmode;
15526 break;
15527 case V32BFmode:
15528 hvmode = V16BFmode;
15529 break;
15530 case V64QImode:
15531 hvmode = V32QImode;
15532 break;
15533 default:
15534 gcc_unreachable ();
15535 }
15536 rtx x = gen_reg_rtx (hvmode);
15537
15538 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
15539 gcc_assert (ok);
15540
15541 x = gen_rtx_VEC_CONCAT (mode, x, x);
15542 emit_insn (gen_rtx_SET (target, x));
15543 }
15544 return true;
15545
15546 default:
15547 return false;
15548 }
15549 }
15550
15551 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
15552 whose ONE_VAR element is VAR, and other elements are zero. Return true
15553 if successful. */
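/* Rough illustration: for V4SImode with ONE_VAR == 2, i.e. building
   {0, 0, x, 0}, the SSE2 path below first materializes {x, 0, 0, 0}
   via a vec_merge of a vec_duplicate against zero and then moves X
   into lane 2 with a single pshufd; with SSE4.1 the vector is zeroed
   and X is inserted directly through ix86_expand_vector_set.  */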
15554
15555 static bool
15556 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
15557 rtx target, rtx var, int one_var)
15558 {
15559 machine_mode vsimode;
15560 rtx new_target;
15561 rtx x, tmp;
15562 bool use_vector_set = false;
15563 rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;
15564
15565 switch (mode)
15566 {
15567 case E_V2DImode:
15568 /* For SSE4.1, we normally use vector set. But if the second
15569 element is zero and inter-unit moves are OK, we use movq
15570 instead. */
15571 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
15572 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
15573 && one_var == 0));
15574 break;
15575 case E_V16QImode:
15576 case E_V4SImode:
15577 case E_V4SFmode:
15578 use_vector_set = TARGET_SSE4_1;
15579 break;
15580 case E_V8HImode:
15581 use_vector_set = TARGET_SSE2;
15582 gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0
15583 ? gen_vec_setv8hi_0 : NULL;
15584 break;
15585 case E_V8QImode:
15586 use_vector_set = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
15587 break;
15588 case E_V4HImode:
15589 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
15590 break;
15591 case E_V4QImode:
15592 use_vector_set = TARGET_SSE4_1;
15593 break;
15594 case E_V32QImode:
15595 use_vector_set = TARGET_AVX;
15596 break;
15597 case E_V16HImode:
15598 use_vector_set = TARGET_AVX;
15599 gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0
15600 ? gen_vec_setv16hi_0 : NULL;
15601 break;
15602 case E_V8SImode:
15603 use_vector_set = TARGET_AVX;
15604 gen_vec_set_0 = gen_vec_setv8si_0;
15605 break;
15606 case E_V8SFmode:
15607 use_vector_set = TARGET_AVX;
15608 gen_vec_set_0 = gen_vec_setv8sf_0;
15609 break;
15610 case E_V4DFmode:
15611 use_vector_set = TARGET_AVX;
15612 gen_vec_set_0 = gen_vec_setv4df_0;
15613 break;
15614 case E_V4DImode:
15615 /* Use ix86_expand_vector_set in 64bit mode only. */
15616 use_vector_set = TARGET_AVX && TARGET_64BIT;
15617 gen_vec_set_0 = gen_vec_setv4di_0;
15618 break;
15619 case E_V16SImode:
15620 use_vector_set = TARGET_AVX512F && one_var == 0;
15621 gen_vec_set_0 = gen_vec_setv16si_0;
15622 break;
15623 case E_V16SFmode:
15624 use_vector_set = TARGET_AVX512F && one_var == 0;
15625 gen_vec_set_0 = gen_vec_setv16sf_0;
15626 break;
15627 case E_V8DFmode:
15628 use_vector_set = TARGET_AVX512F && one_var == 0;
15629 gen_vec_set_0 = gen_vec_setv8df_0;
15630 break;
15631 case E_V8DImode:
15632 /* Use ix86_expand_vector_set in 64bit mode only. */
15633 use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
15634 gen_vec_set_0 = gen_vec_setv8di_0;
15635 break;
15636 case E_V8HFmode:
15637 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15638 gen_vec_set_0 = gen_vec_setv8hf_0;
15639 break;
15640 case E_V16HFmode:
15641 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15642 gen_vec_set_0 = gen_vec_setv16hf_0;
15643 break;
15644 case E_V32HFmode:
15645 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15646 gen_vec_set_0 = gen_vec_setv32hf_0;
15647 break;
15648 case E_V8BFmode:
15649 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15650 gen_vec_set_0 = gen_vec_setv8bf_0;
15651 break;
15652 case E_V16BFmode:
15653 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15654 gen_vec_set_0 = gen_vec_setv16bf_0;
15655 break;
15656 case E_V32BFmode:
15657 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15658 gen_vec_set_0 = gen_vec_setv32bf_0;
15659 break;
15660 case E_V32HImode:
15661 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15662 gen_vec_set_0 = gen_vec_setv32hi_0;
15663 default:
15664 break;
15665 }
15666
15667 if (use_vector_set)
15668 {
15669 if (gen_vec_set_0 && one_var == 0)
15670 {
15671 var = force_reg (GET_MODE_INNER (mode), var);
15672 emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
15673 return true;
15674 }
15675 emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
15676 var = force_reg (GET_MODE_INNER (mode), var);
15677 ix86_expand_vector_set (mmx_ok, target, var, one_var);
15678 return true;
15679 }
15680
15681 switch (mode)
15682 {
15683 case E_V2SFmode:
15684 case E_V2SImode:
15685 if (!mmx_ok)
15686 return false;
15687 /* FALLTHRU */
15688
15689 case E_V2DFmode:
15690 case E_V2DImode:
15691 if (one_var != 0)
15692 return false;
15693 var = force_reg (GET_MODE_INNER (mode), var);
15694 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
15695 emit_insn (gen_rtx_SET (target, x));
15696 return true;
15697
15698 case E_V4SFmode:
15699 case E_V4SImode:
15700 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
15701 new_target = gen_reg_rtx (mode);
15702 else
15703 new_target = target;
15704 var = force_reg (GET_MODE_INNER (mode), var);
15705 x = gen_rtx_VEC_DUPLICATE (mode, var);
15706 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
15707 emit_insn (gen_rtx_SET (new_target, x));
15708 if (one_var != 0)
15709 {
15710 /* We need to shuffle the value to the correct position, so
15711 create a new pseudo to store the intermediate result. */
15712
15713 /* With SSE2, we can use the integer shuffle insns. */
15714 if (mode != V4SFmode && TARGET_SSE2)
15715 {
15716 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
15717 const1_rtx,
15718 GEN_INT (one_var == 1 ? 0 : 1),
15719 GEN_INT (one_var == 2 ? 0 : 1),
15720 GEN_INT (one_var == 3 ? 0 : 1)));
15721 if (target != new_target)
15722 emit_move_insn (target, new_target);
15723 return true;
15724 }
15725
15726 /* Otherwise convert the intermediate result to V4SFmode and
15727 use the SSE1 shuffle instructions. */
15728 if (mode != V4SFmode)
15729 {
15730 tmp = gen_reg_rtx (V4SFmode);
15731 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
15732 }
15733 else
15734 tmp = new_target;
15735
15736 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
15737 const1_rtx,
15738 GEN_INT (one_var == 1 ? 0 : 1),
15739 GEN_INT (one_var == 2 ? 0+4 : 1+4),
15740 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
15741
15742 if (mode != V4SFmode)
15743 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
15744 else if (tmp != target)
15745 emit_move_insn (target, tmp);
15746 }
15747 else if (target != new_target)
15748 emit_move_insn (target, new_target);
15749 return true;
15750
15751 case E_V8HImode:
15752 case E_V16QImode:
15753 vsimode = V4SImode;
15754 goto widen;
15755 case E_V4HImode:
15756 case E_V8QImode:
15757 if (!mmx_ok)
15758 return false;
15759 vsimode = V2SImode;
15760 goto widen;
15761 widen:
15762 if (one_var != 0)
15763 return false;
15764
15765 /* Zero extend the variable element to SImode and recurse. */
15766 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
15767
15768 x = gen_reg_rtx (vsimode);
15769 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
15770 var, one_var))
15771 gcc_unreachable ();
15772
15773 emit_move_insn (target, gen_lowpart (mode, x));
15774 return true;
15775
15776 default:
15777 return false;
15778 }
15779 }
15780
15781 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
15782 consisting of the values in VALS. It is known that all elements
15783 except ONE_VAR are constants. Return true if successful. */
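/* For example, initializing a V4SImode vector as {1, x, 3, 4} loads
   the constant vector {1, 0, 3, 4} (ONE_VAR zeroed) and then inserts
   X at index 1 with ix86_expand_vector_set; narrow QImode element
   modes instead merge X with its neighbouring constant and do the
   insertion in the wider HImode vector (see the "widen" label).  */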
15784
15785 static bool
15786 ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
15787 rtx target, rtx vals, int one_var)
15788 {
15789 rtx var = XVECEXP (vals, 0, one_var);
15790 machine_mode wmode;
15791 rtx const_vec, x;
15792
15793 const_vec = copy_rtx (vals);
15794 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
15795 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
15796
15797 switch (mode)
15798 {
15799 case E_V2DFmode:
15800 case E_V2DImode:
15801 case E_V2SFmode:
15802 case E_V2SImode:
15803 /* For the two element vectors, it's just as easy to use
15804 the general case. */
15805 return false;
15806
15807 case E_V4DImode:
15808 /* Use ix86_expand_vector_set in 64bit mode only. */
15809 if (!TARGET_64BIT)
15810 return false;
15811 /* FALLTHRU */
15812 case E_V8HFmode:
15813 case E_V16HFmode:
15814 case E_V8BFmode:
15815 case E_V16BFmode:
15816 case E_V4DFmode:
15817 case E_V8SFmode:
15818 case E_V8SImode:
15819 case E_V16HImode:
15820 case E_V32QImode:
15821 case E_V4SFmode:
15822 case E_V4SImode:
15823 case E_V8HImode:
15824 case E_V4HImode:
15825 break;
15826
15827 case E_V16QImode:
15828 if (TARGET_SSE4_1)
15829 break;
15830 wmode = V8HImode;
15831 goto widen;
15832 case E_V8QImode:
15833 if (TARGET_MMX_WITH_SSE && TARGET_SSE4_1)
15834 break;
15835 wmode = V4HImode;
15836 goto widen;
15837 case E_V4QImode:
15838 if (TARGET_SSE4_1)
15839 break;
15840 wmode = V2HImode;
15841 widen:
15842 /* There's no way to set one QImode entry easily. Combine
15843 the variable value with its adjacent constant value, and
15844 promote to an HImode set. */
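/* E.g. for V16QImode with an odd ONE_VAR, the combined HImode word is
   (var << 8) | (adjacent_const & 0xff); for an even ONE_VAR it is
   var | (adjacent_const << 8).  The insertion then happens at word
   index ONE_VAR >> 1 in the V8HImode view of the vector.  */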
15845 x = XVECEXP (vals, 0, one_var ^ 1);
15846 if (one_var & 1)
15847 {
15848 var = convert_modes (HImode, QImode, var, true);
15849 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
15850 NULL_RTX, 1, OPTAB_LIB_WIDEN);
15851 x = GEN_INT (INTVAL (x) & 0xff);
15852 }
15853 else
15854 {
15855 var = convert_modes (HImode, QImode, var, true);
15856 x = gen_int_mode (UINTVAL (x) << 8, HImode);
15857 }
15858 if (x != const0_rtx)
15859 var = expand_simple_binop (HImode, IOR, var, x, var,
15860 1, OPTAB_LIB_WIDEN);
15861
15862 x = gen_reg_rtx (wmode);
15863 emit_move_insn (x, gen_lowpart (wmode, const_vec));
15864 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
15865
15866 emit_move_insn (target, gen_lowpart (mode, x));
15867 return true;
15868
15869 default:
15870 return false;
15871 }
15872
15873 emit_move_insn (target, const_vec);
15874 ix86_expand_vector_set (mmx_ok, target, var, one_var);
15875 return true;
15876 }
15877
15878 /* A subroutine of ix86_expand_vector_init_general. Use vector
15879 concatenate to handle the most general case: all values variable,
15880 and none identical. */
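/* Sketch of the recursion: a V8SImode init {a, ..., h} is split into
   two V4SImode halves, each half is built by a recursive
   ix86_expand_vector_init call, and the halves are joined with
   (vec_concat:V8SI lo hi).  The N == 4/8/16 cases below route through
   the "half" label to do exactly that splitting.  */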
15881
15882 static void
15883 ix86_expand_vector_init_concat (machine_mode mode,
15884 rtx target, rtx *ops, int n)
15885 {
15886 machine_mode half_mode = VOIDmode;
15887 rtx half[2];
15888 rtvec v;
15889 int i, j;
15890
15891 switch (n)
15892 {
15893 case 2:
15894 switch (mode)
15895 {
15896 case E_V32HFmode:
15897 half_mode = V16HFmode;
15898 break;
15899 case E_V32BFmode:
15900 half_mode = V16BFmode;
15901 break;
15902 case E_V16SImode:
15903 half_mode = V8SImode;
15904 break;
15905 case E_V16SFmode:
15906 half_mode = V8SFmode;
15907 break;
15908 case E_V8DImode:
15909 half_mode = V4DImode;
15910 break;
15911 case E_V8DFmode:
15912 half_mode = V4DFmode;
15913 break;
15914 case E_V16HFmode:
15915 half_mode = V8HFmode;
15916 break;
15917 case E_V16BFmode:
15918 half_mode = V8BFmode;
15919 break;
15920 case E_V8SImode:
15921 half_mode = V4SImode;
15922 break;
15923 case E_V8SFmode:
15924 half_mode = V4SFmode;
15925 break;
15926 case E_V4DImode:
15927 half_mode = V2DImode;
15928 break;
15929 case E_V4DFmode:
15930 half_mode = V2DFmode;
15931 break;
15932 case E_V4SImode:
15933 half_mode = V2SImode;
15934 break;
15935 case E_V4SFmode:
15936 half_mode = V2SFmode;
15937 break;
15938 case E_V2DImode:
15939 half_mode = DImode;
15940 break;
15941 case E_V2SImode:
15942 half_mode = SImode;
15943 break;
15944 case E_V2DFmode:
15945 half_mode = DFmode;
15946 break;
15947 case E_V2SFmode:
15948 half_mode = SFmode;
15949 break;
15950 default:
15951 gcc_unreachable ();
15952 }
15953
15954 if (!register_operand (ops[1], half_mode))
15955 ops[1] = force_reg (half_mode, ops[1]);
15956 if (!register_operand (ops[0], half_mode))
15957 ops[0] = force_reg (half_mode, ops[0]);
15958 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
15959 ops[1])));
15960 break;
15961
15962 case 4:
15963 switch (mode)
15964 {
15965 case E_V4DImode:
15966 half_mode = V2DImode;
15967 break;
15968 case E_V4DFmode:
15969 half_mode = V2DFmode;
15970 break;
15971 case E_V4SImode:
15972 half_mode = V2SImode;
15973 break;
15974 case E_V4SFmode:
15975 half_mode = V2SFmode;
15976 break;
15977 default:
15978 gcc_unreachable ();
15979 }
15980 goto half;
15981
15982 case 8:
15983 switch (mode)
15984 {
15985 case E_V8DImode:
15986 half_mode = V4DImode;
15987 break;
15988 case E_V8DFmode:
15989 half_mode = V4DFmode;
15990 break;
15991 case E_V8SImode:
15992 half_mode = V4SImode;
15993 break;
15994 case E_V8SFmode:
15995 half_mode = V4SFmode;
15996 break;
15997 default:
15998 gcc_unreachable ();
15999 }
16000 goto half;
16001
16002 case 16:
16003 switch (mode)
16004 {
16005 case E_V16SImode:
16006 half_mode = V8SImode;
16007 break;
16008 case E_V16SFmode:
16009 half_mode = V8SFmode;
16010 break;
16011 default:
16012 gcc_unreachable ();
16013 }
16014 goto half;
16015
16016 half:
16017 /* FIXME: We process inputs backward to help RA. PR 36222. */
16018 i = n - 1;
16019 for (j = 1; j != -1; j--)
16020 {
16021 half[j] = gen_reg_rtx (half_mode);
16022 switch (n >> 1)
16023 {
16024 case 2:
16025 v = gen_rtvec (2, ops[i-1], ops[i]);
16026 i -= 2;
16027 break;
16028 case 4:
16029 v = gen_rtvec (4, ops[i-3], ops[i-2], ops[i-1], ops[i]);
16030 i -= 4;
16031 break;
16032 case 8:
16033 v = gen_rtvec (8, ops[i-7], ops[i-6], ops[i-5], ops[i-4],
16034 ops[i-3], ops[i-2], ops[i-1], ops[i]);
16035 i -= 8;
16036 break;
16037 default:
16038 gcc_unreachable ();
16039 }
16040 ix86_expand_vector_init (false, half[j],
16041 gen_rtx_PARALLEL (half_mode, v));
16042 }
16043
16044 ix86_expand_vector_init_concat (mode, target, half, 2);
16045 break;
16046
16047 default:
16048 gcc_unreachable ();
16049 }
16050 }
16051
16052 /* A subroutine of ix86_expand_vector_init_general. Use vector
16053 interleave to handle the most general case: all values variable,
16054 and none identical. */
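/* Sketch for V8HImode: each pair of scalar operands is packed into
   lanes 0 and 1 of one vector, the pair vectors are merged with the
   V4SImode interleave-low (punpckldq), and the results with the
   V2DImode interleave-low (punpcklqdq), yielding the fully populated
   vector.  The HFmode/BFmode variants use vec_interleave_lowv8hf/v8bf
   for the initial packing step instead.  */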
16055
16056 static void
16057 ix86_expand_vector_init_interleave (machine_mode mode,
16058 rtx target, rtx *ops, int n)
16059 {
16060 machine_mode first_imode, second_imode, third_imode, inner_mode;
16061 int i, j;
16062 rtx op, op0, op1;
16063 rtx (*gen_load_even) (rtx, rtx, rtx);
16064 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
16065 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
16066
16067 switch (mode)
16068 {
16069 case E_V8HFmode:
16070 gen_load_even = gen_vec_interleave_lowv8hf;
16071 gen_interleave_first_low = gen_vec_interleave_lowv4si;
16072 gen_interleave_second_low = gen_vec_interleave_lowv2di;
16073 inner_mode = HFmode;
16074 first_imode = V4SImode;
16075 second_imode = V2DImode;
16076 third_imode = VOIDmode;
16077 break;
16078 case E_V8BFmode:
16079 gen_load_even = gen_vec_interleave_lowv8bf;
16080 gen_interleave_first_low = gen_vec_interleave_lowv4si;
16081 gen_interleave_second_low = gen_vec_interleave_lowv2di;
16082 inner_mode = BFmode;
16083 first_imode = V4SImode;
16084 second_imode = V2DImode;
16085 third_imode = VOIDmode;
16086 break;
16087 case E_V8HImode:
16088 gen_load_even = gen_vec_setv8hi;
16089 gen_interleave_first_low = gen_vec_interleave_lowv4si;
16090 gen_interleave_second_low = gen_vec_interleave_lowv2di;
16091 inner_mode = HImode;
16092 first_imode = V4SImode;
16093 second_imode = V2DImode;
16094 third_imode = VOIDmode;
16095 break;
16096 case E_V16QImode:
16097 gen_load_even = gen_vec_setv16qi;
16098 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
16099 gen_interleave_second_low = gen_vec_interleave_lowv4si;
16100 inner_mode = QImode;
16101 first_imode = V8HImode;
16102 second_imode = V4SImode;
16103 third_imode = V2DImode;
16104 break;
16105 default:
16106 gcc_unreachable ();
16107 }
16108
16109 for (i = 0; i < n; i++)
16110 {
16111 op = ops [i + i];
16112 if (inner_mode == HFmode || inner_mode == BFmode)
16113 {
16114 rtx even, odd;
16115 /* Use vpunpcklwd to pack two HFmode or BFmode elements. */
16116 machine_mode vec_mode =
16117 (inner_mode == HFmode) ? V8HFmode : V8BFmode;
16118 op0 = gen_reg_rtx (vec_mode);
16119 even = lowpart_subreg (vec_mode,
16120 force_reg (inner_mode, op), inner_mode);
16121 odd = lowpart_subreg (vec_mode,
16122 force_reg (inner_mode, ops[i + i + 1]),
16123 inner_mode);
16124 emit_insn (gen_load_even (op0, even, odd));
16125 }
16126 else
16127 {
16128 /* Extend the odd element to SImode using a paradoxical SUBREG. */
16129 op0 = gen_reg_rtx (SImode);
16130 emit_move_insn (op0, gen_lowpart (SImode, op));
16131
16132 /* Insert the SImode value as low element of V4SImode vector. */
16133 op1 = gen_reg_rtx (V4SImode);
16134 op0 = gen_rtx_VEC_MERGE (V4SImode,
16135 gen_rtx_VEC_DUPLICATE (V4SImode,
16136 op0),
16137 CONST0_RTX (V4SImode),
16138 const1_rtx);
16139 emit_insn (gen_rtx_SET (op1, op0));
16140
16141 /* Cast the V4SImode vector back to a vector in the original mode. */
16142 op0 = gen_reg_rtx (mode);
16143 emit_move_insn (op0, gen_lowpart (mode, op1));
16144
16145 /* Load even elements into the second position. */
16146 emit_insn (gen_load_even (op0,
16147 force_reg (inner_mode,
16148 ops[i + i + 1]),
16149 const1_rtx));
16150 }
16151
16152 /* Cast vector to FIRST_IMODE vector. */
16153 ops[i] = gen_reg_rtx (first_imode);
16154 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
16155 }
16156
16157 /* Interleave low FIRST_IMODE vectors. */
16158 for (i = j = 0; i < n; i += 2, j++)
16159 {
16160 op0 = gen_reg_rtx (first_imode);
16161 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
16162
16163 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
16164 ops[j] = gen_reg_rtx (second_imode);
16165 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
16166 }
16167
16168 /* Interleave low SECOND_IMODE vectors. */
16169 switch (second_imode)
16170 {
16171 case E_V4SImode:
16172 for (i = j = 0; i < n / 2; i += 2, j++)
16173 {
16174 op0 = gen_reg_rtx (second_imode);
16175 emit_insn (gen_interleave_second_low (op0, ops[i],
16176 ops[i + 1]));
16177
16178 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
16179 vector. */
16180 ops[j] = gen_reg_rtx (third_imode);
16181 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
16182 }
16183 second_imode = V2DImode;
16184 gen_interleave_second_low = gen_vec_interleave_lowv2di;
16185 /* FALLTHRU */
16186
16187 case E_V2DImode:
16188 op0 = gen_reg_rtx (second_imode);
16189 emit_insn (gen_interleave_second_low (op0, ops[0],
16190 ops[1]));
16191
16192 /* Cast the SECOND_IMODE vector back to a vector in the original
16193 mode. */
16194 emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
16195 break;
16196
16197 default:
16198 gcc_unreachable ();
16199 }
16200 }
16201
16202 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
16203 all values variable, and none identical. */
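/* Strategy overview: the wide SSE/AVX modes are delegated to the
   concat and interleave helpers above; the remaining narrow modes
   fall through to the generic code at the bottom of this function,
   which packs the elements into integer words with shifts and IORs
   and then reassembles the vector from those words.  */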
16204
16205 static void
16206 ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
16207 rtx target, rtx vals)
16208 {
16209 rtx ops[64], op0, op1, op2, op3, op4, op5;
16210 machine_mode half_mode = VOIDmode;
16211 machine_mode quarter_mode = VOIDmode;
16212 int n, i;
16213
16214 switch (mode)
16215 {
16216 case E_V2SFmode:
16217 case E_V2SImode:
16218 if (!mmx_ok && !TARGET_SSE)
16219 break;
16220 /* FALLTHRU */
16221
16222 case E_V16SImode:
16223 case E_V16SFmode:
16224 case E_V8DFmode:
16225 case E_V8DImode:
16226 case E_V8SFmode:
16227 case E_V8SImode:
16228 case E_V4DFmode:
16229 case E_V4DImode:
16230 case E_V4SFmode:
16231 case E_V4SImode:
16232 case E_V2DFmode:
16233 case E_V2DImode:
16234 n = GET_MODE_NUNITS (mode);
16235 for (i = 0; i < n; i++)
16236 ops[i] = XVECEXP (vals, 0, i);
16237 ix86_expand_vector_init_concat (mode, target, ops, n);
16238 return;
16239
16240 case E_V2TImode:
16241 for (i = 0; i < 2; i++)
16242 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
16243 op0 = gen_reg_rtx (V4DImode);
16244 ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
16245 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
16246 return;
16247
16248 case E_V4TImode:
16249 for (i = 0; i < 4; i++)
16250 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
16251 ops[4] = gen_reg_rtx (V4DImode);
16252 ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
16253 ops[5] = gen_reg_rtx (V4DImode);
16254 ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
16255 op0 = gen_reg_rtx (V8DImode);
16256 ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
16257 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
16258 return;
16259
16260 case E_V32QImode:
16261 half_mode = V16QImode;
16262 goto half;
16263
16264 case E_V16HImode:
16265 half_mode = V8HImode;
16266 goto half;
16267
16268 case E_V16HFmode:
16269 half_mode = V8HFmode;
16270 goto half;
16271
16272 case E_V16BFmode:
16273 half_mode = V8BFmode;
16274 goto half;
16275
16276 half:
16277 n = GET_MODE_NUNITS (mode);
16278 for (i = 0; i < n; i++)
16279 ops[i] = XVECEXP (vals, 0, i);
16280 op0 = gen_reg_rtx (half_mode);
16281 op1 = gen_reg_rtx (half_mode);
16282 ix86_expand_vector_init_interleave (half_mode, op0, ops,
16283 n >> 2);
16284 ix86_expand_vector_init_interleave (half_mode, op1,
16285 &ops [n >> 1], n >> 2);
16286 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
16287 return;
16288
16289 case E_V64QImode:
16290 quarter_mode = V16QImode;
16291 half_mode = V32QImode;
16292 goto quarter;
16293
16294 case E_V32HImode:
16295 quarter_mode = V8HImode;
16296 half_mode = V16HImode;
16297 goto quarter;
16298
16299 case E_V32HFmode:
16300 quarter_mode = V8HFmode;
16301 half_mode = V16HFmode;
16302 goto quarter;
16303
16304 case E_V32BFmode:
16305 quarter_mode = V8BFmode;
16306 half_mode = V16BFmode;
16307 goto quarter;
16308
16309 quarter:
16310 n = GET_MODE_NUNITS (mode);
16311 for (i = 0; i < n; i++)
16312 ops[i] = XVECEXP (vals, 0, i);
16313 op0 = gen_reg_rtx (quarter_mode);
16314 op1 = gen_reg_rtx (quarter_mode);
16315 op2 = gen_reg_rtx (quarter_mode);
16316 op3 = gen_reg_rtx (quarter_mode);
16317 op4 = gen_reg_rtx (half_mode);
16318 op5 = gen_reg_rtx (half_mode);
16319 ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
16320 n >> 3);
16321 ix86_expand_vector_init_interleave (quarter_mode, op1,
16322 &ops [n >> 2], n >> 3);
16323 ix86_expand_vector_init_interleave (quarter_mode, op2,
16324 &ops [n >> 1], n >> 3);
16325 ix86_expand_vector_init_interleave (quarter_mode, op3,
16326 &ops [(n >> 1) | (n >> 2)], n >> 3);
16327 emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
16328 emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
16329 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
16330 return;
16331
16332 case E_V16QImode:
16333 if (!TARGET_SSE4_1)
16334 break;
16335 /* FALLTHRU */
16336
16337 case E_V8HImode:
16338 if (!TARGET_SSE2)
16339 break;
16340
16341 /* Don't use ix86_expand_vector_init_interleave if we can't
16342 move from GPR to SSE register directly. */
16343 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
16344 break;
16345 /* FALLTHRU */
16346
16347 case E_V8HFmode:
16348 case E_V8BFmode:
16349
16350 n = GET_MODE_NUNITS (mode);
16351 for (i = 0; i < n; i++)
16352 ops[i] = XVECEXP (vals, 0, i);
16353 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
16354 return;
16355
16356 case E_V4HImode:
16357 case E_V8QImode:
16358
16359 case E_V2HImode:
16360 case E_V4QImode:
16361 break;
16362
16363 default:
16364 gcc_unreachable ();
16365 }
16366
16367 {
16368 int i, j, n_elts, n_words, n_elt_per_word;
16369 machine_mode tmp_mode, inner_mode;
16370 rtx words[4], shift;
16371
16372 tmp_mode = (GET_MODE_SIZE (mode) < UNITS_PER_WORD) ? SImode : word_mode;
16373
16374 inner_mode = GET_MODE_INNER (mode);
16375 n_elts = GET_MODE_NUNITS (mode);
16376 n_words = GET_MODE_SIZE (mode) / GET_MODE_SIZE (tmp_mode);
16377 n_elt_per_word = n_elts / n_words;
16378 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
16379
16380 for (i = 0; i < n_words; ++i)
16381 {
16382 rtx word = NULL_RTX;
16383
16384 for (j = 0; j < n_elt_per_word; ++j)
16385 {
16386 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
16387 elt = convert_modes (tmp_mode, inner_mode, elt, true);
16388
16389 if (j == 0)
16390 word = elt;
16391 else
16392 {
16393 word = expand_simple_binop (tmp_mode, ASHIFT, word, shift,
16394 NULL_RTX, 1, OPTAB_LIB_WIDEN);
16395 word = expand_simple_binop (tmp_mode, IOR, word, elt,
16396 NULL_RTX, 1, OPTAB_LIB_WIDEN);
16397 }
16398 }
16399
16400 words[i] = word;
16401 }
16402
16403 if (n_words == 1)
16404 emit_move_insn (target, gen_lowpart (mode, words[0]));
16405 else if (n_words == 2)
16406 {
16407 gcc_assert (tmp_mode == DImode || tmp_mode == SImode);
16408 machine_mode concat_mode = tmp_mode == DImode ? V2DImode : V2SImode;
16409 rtx tmp = gen_reg_rtx (concat_mode);
16410 vals = gen_rtx_PARALLEL (concat_mode, gen_rtvec_v (2, words));
16411 ix86_expand_vector_init_general (mmx_ok, concat_mode, tmp, vals);
16412 emit_move_insn (target, gen_lowpart (mode, tmp));
16413 }
16414 else if (n_words == 4)
16415 {
16416 rtx tmp = gen_reg_rtx (V4SImode);
16417 gcc_assert (tmp_mode == SImode);
16418 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
16419 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
16420 emit_move_insn (target, gen_lowpart (mode, tmp));
16421 }
16422 else
16423 gcc_unreachable ();
16424 }
16425 }
16426
16427 /* Initialize vector TARGET via VALS. Suppress the use of MMX
16428 instructions unless MMX_OK is true. */
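/* Dispatch sketch: an all-constant VALS is loaded straight from the
   constant pool; if every element is identical the value is broadcast
   via ix86_expand_vector_init_duplicate; a single variable element
   goes through the one_nonzero/one_var helpers; everything else ends
   up in ix86_expand_vector_init_general.  */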
16429
16430 void
16431 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
16432 {
16433 machine_mode mode = GET_MODE (target);
16434 machine_mode inner_mode = GET_MODE_INNER (mode);
16435 int n_elts = GET_MODE_NUNITS (mode);
16436 int n_var = 0, one_var = -1;
16437 bool all_same = true, all_const_zero = true;
16438 int i;
16439 rtx x;
16440
16441 /* First handle initialization from vector elements (concatenation
of two half-width sub-vectors). */
16442 if (n_elts != XVECLEN (vals, 0))
16443 {
16444 rtx subtarget = target;
16445 x = XVECEXP (vals, 0, 0);
16446 gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
16447 if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
16448 {
16449 rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
16450 if (inner_mode == QImode
16451 || inner_mode == HImode
16452 || inner_mode == TImode
16453 || inner_mode == HFmode
16454 || inner_mode == BFmode)
16455 {
16456 unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
16457 scalar_mode elt_mode = inner_mode == TImode ? DImode : SImode;
16458 n_bits /= GET_MODE_SIZE (elt_mode);
16459 mode = mode_for_vector (elt_mode, n_bits).require ();
16460 inner_mode = mode_for_vector (elt_mode, n_bits / 2).require ();
16461 ops[0] = gen_lowpart (inner_mode, ops[0]);
16462 ops[1] = gen_lowpart (inner_mode, ops[1]);
16463 subtarget = gen_reg_rtx (mode);
16464 }
16465 ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
16466 if (subtarget != target)
16467 emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
16468 return;
16469 }
16470 gcc_unreachable ();
16471 }
16472
16473 for (i = 0; i < n_elts; ++i)
16474 {
16475 x = XVECEXP (vals, 0, i);
16476 if (!(CONST_SCALAR_INT_P (x)
16477 || CONST_DOUBLE_P (x)
16478 || CONST_FIXED_P (x)))
16479 n_var++, one_var = i;
16480 else if (x != CONST0_RTX (inner_mode))
16481 all_const_zero = false;
16482 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
16483 all_same = false;
16484 }
16485
16486 /* Constants are best loaded from the constant pool. */
16487 if (n_var == 0)
16488 {
16489 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
16490 return;
16491 }
16492
16493 /* If all values are identical, broadcast the value. */
16494 if (all_same
16495 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
16496 XVECEXP (vals, 0, 0)))
16497 return;
16498
16499 /* Values where only one field is non-constant are best loaded from
16500 the pool and overwritten via move later. */
16501 if (n_var == 1)
16502 {
16503 if (all_const_zero
16504 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
16505 XVECEXP (vals, 0, one_var),
16506 one_var))
16507 return;
16508
16509 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
16510 return;
16511 }
16512
16513 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
16514 }
16515
16516 /* Implemented as
16517 V setg (V v, int idx, T val)
16518 {
16519 V idxv = (V){idx, idx, idx, idx, idx, idx, idx, idx};
16520 V valv = (V){val, val, val, val, val, val, val, val};
16521 V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == idxv);
16522 v = (v & ~mask) | (valv & mask);
16523 return v;
16524 }. */
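/* The mask/blend from the pseudo code above is realized with
   ix86_expand_int_vcond on an EQ comparison of IDXV against the
   constant index vector; for float element types the comparison is
   done in the matching integer vector mode (CMP_MODE below).  */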
16525 void
16526 ix86_expand_vector_set_var (rtx target, rtx val, rtx idx)
16527 {
16528 rtx vec[64];
16529 machine_mode mode = GET_MODE (target);
16530 machine_mode cmp_mode = mode;
16531 int n_elts = GET_MODE_NUNITS (mode);
16532 rtx valv,idxv,constv,idx_tmp;
16533 bool ok = false;
16534
16535 /* 512-bit vector byte/word broadcast and comparison are only
16536 available under TARGET_AVX512BW; without it, break the 512-bit
16537 vector into two 256-bit vectors. */
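/* E.g. a V64QImode insertion without TARGET_AVX512BW is split into
   two V32QImode halves; the upper half uses the adjusted index
   IDX - N_ELTS/2 (computed via ix86_expand_binary_operator below),
   and the halves are recombined with a vec_concat.  */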
16538 if ((mode == V32HImode || mode == V32HFmode || mode == V32BFmode
16539 || mode == V64QImode)
16540 && !TARGET_AVX512BW)
16541 {
16542 gcc_assert (TARGET_AVX512F);
16543 rtx vhi, vlo, idx_hi;
16544 machine_mode half_mode;
16545 rtx (*extract_hi)(rtx, rtx);
16546 rtx (*extract_lo)(rtx, rtx);
16547
16548 if (mode == V32HImode)
16549 {
16550 half_mode = V16HImode;
16551 extract_hi = gen_vec_extract_hi_v32hi;
16552 extract_lo = gen_vec_extract_lo_v32hi;
16553 }
16554 else if (mode == V32HFmode)
16555 {
16556 half_mode = V16HFmode;
16557 extract_hi = gen_vec_extract_hi_v32hf;
16558 extract_lo = gen_vec_extract_lo_v32hf;
16559 }
16560 else if (mode == V32BFmode)
16561 {
16562 half_mode = V16BFmode;
16563 extract_hi = gen_vec_extract_hi_v32bf;
16564 extract_lo = gen_vec_extract_lo_v32bf;
16565 }
16566 else
16567 {
16568 half_mode = V32QImode;
16569 extract_hi = gen_vec_extract_hi_v64qi;
16570 extract_lo = gen_vec_extract_lo_v64qi;
16571 }
16572
16573 vhi = gen_reg_rtx (half_mode);
16574 vlo = gen_reg_rtx (half_mode);
16575 idx_hi = gen_reg_rtx (GET_MODE (idx));
16576 emit_insn (extract_hi (vhi, target));
16577 emit_insn (extract_lo (vlo, target));
16578 vec[0] = idx_hi;
16579 vec[1] = idx;
16580 vec[2] = GEN_INT (n_elts/2);
16581 ix86_expand_binary_operator (MINUS, GET_MODE (idx), vec);
16582 ix86_expand_vector_set_var (vhi, val, idx_hi);
16583 ix86_expand_vector_set_var (vlo, val, idx);
16584 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, vlo, vhi)));
16585 return;
16586 }
16587
16588 if (FLOAT_MODE_P (GET_MODE_INNER (mode)))
16589 {
16590 switch (mode)
16591 {
16592 case E_V2DFmode:
16593 cmp_mode = V2DImode;
16594 break;
16595 case E_V4DFmode:
16596 cmp_mode = V4DImode;
16597 break;
16598 case E_V8DFmode:
16599 cmp_mode = V8DImode;
16600 break;
16601 case E_V2SFmode:
16602 cmp_mode = V2SImode;
16603 break;
16604 case E_V4SFmode:
16605 cmp_mode = V4SImode;
16606 break;
16607 case E_V8SFmode:
16608 cmp_mode = V8SImode;
16609 break;
16610 case E_V16SFmode:
16611 cmp_mode = V16SImode;
16612 break;
16613 case E_V8HFmode:
16614 cmp_mode = V8HImode;
16615 break;
16616 case E_V16HFmode:
16617 cmp_mode = V16HImode;
16618 break;
16619 case E_V32HFmode:
16620 cmp_mode = V32HImode;
16621 break;
16622 case E_V8BFmode:
16623 cmp_mode = V8HImode;
16624 break;
16625 case E_V16BFmode:
16626 cmp_mode = V16HImode;
16627 break;
16628 case E_V32BFmode:
16629 cmp_mode = V32HImode;
16630 break;
16631 default:
16632 gcc_unreachable ();
16633 }
16634 }
16635
16636 for (int i = 0; i != n_elts; i++)
16637 vec[i] = GEN_INT (i);
16638 constv = gen_rtx_CONST_VECTOR (cmp_mode, gen_rtvec_v (n_elts, vec));
16639 valv = gen_reg_rtx (mode);
16640 idxv = gen_reg_rtx (cmp_mode);
16641 idx_tmp = convert_to_mode (GET_MODE_INNER (cmp_mode), idx, 1);
16642
16643 ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE,
16644 mode, valv, val);
16645 gcc_assert (ok);
16646 ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE,
16647 cmp_mode, idxv, idx_tmp);
16648 gcc_assert (ok);
16649 vec[0] = target;
16650 vec[1] = valv;
16651 vec[2] = target;
16652 vec[3] = gen_rtx_EQ (mode, idxv, constv);
16653 vec[4] = idxv;
16654 vec[5] = constv;
16655 ok = ix86_expand_int_vcond (vec);
16656 gcc_assert (ok);
16657 }
16658
16659 void
16660 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
16661 {
16662 machine_mode mode = GET_MODE (target);
16663 machine_mode inner_mode = GET_MODE_INNER (mode);
16664 machine_mode half_mode;
16665 bool use_vec_merge = false;
16666 bool blendm_const = false;
16667 rtx tmp;
16668 static rtx (*gen_extract[8][2]) (rtx, rtx)
16669 = {
16670 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
16671 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
16672 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
16673 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
16674 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
16675 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df },
16676 { gen_vec_extract_lo_v16hf, gen_vec_extract_hi_v16hf },
16677 { gen_vec_extract_lo_v16bf, gen_vec_extract_hi_v16bf }
16678 };
16679 static rtx (*gen_insert[8][2]) (rtx, rtx, rtx)
16680 = {
16681 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
16682 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
16683 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
16684 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
16685 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
16686 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df },
16687 { gen_vec_set_lo_v16hf, gen_vec_set_hi_v16hf },
16688 { gen_vec_set_lo_v16bf, gen_vec_set_hi_v16bf },
16689 };
16690 int i, j, n;
16691 machine_mode mmode = VOIDmode;
16692 rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
16693
16694 switch (mode)
16695 {
16696 case E_V2SImode:
16697 use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
16698 if (use_vec_merge)
16699 break;
16700 /* FALLTHRU */
16701
16702 case E_V2SFmode:
16703 if (mmx_ok)
16704 {
16705 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
16706 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
16707 if (elt == 0)
16708 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
16709 else
16710 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
16711 emit_insn (gen_rtx_SET (target, tmp));
16712 return;
16713 }
16714 break;
16715
16716 case E_V2DImode:
16717 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
16718 if (use_vec_merge)
16719 break;
16720
16721 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
16722 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
16723 if (elt == 0)
16724 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
16725 else
16726 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
16727 emit_insn (gen_rtx_SET (target, tmp));
16728 return;
16729
16730 case E_V2DFmode:
16731 /* NB: For ELT == 0, use standard scalar operation patterns which
16732 preserve the rest of the vector for combiner:
16733
16734 (vec_merge:V2DF
16735 (vec_duplicate:V2DF (reg:DF))
16736 (reg:V2DF)
16737 (const_int 1))
16738 */
16739 if (elt == 0)
16740 goto do_vec_merge;
16741
16742 {
16743 rtx op0, op1;
16744
16745 /* For the two element vectors, we implement a VEC_CONCAT with
16746 the extraction of the other element. */
16747
16748 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
16749 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
16750
16751 if (elt == 0)
16752 op0 = val, op1 = tmp;
16753 else
16754 op0 = tmp, op1 = val;
16755
16756 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
16757 emit_insn (gen_rtx_SET (target, tmp));
16758 }
16759 return;
16760
16761 case E_V4SFmode:
16762 use_vec_merge = TARGET_SSE4_1;
16763 if (use_vec_merge)
16764 break;
16765
16766 switch (elt)
16767 {
16768 case 0:
16769 use_vec_merge = true;
16770 break;
16771
16772 case 1:
16773 /* tmp = target = A B C D */
16774 tmp = copy_to_reg (target);
16775 /* target = A A B B */
16776 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
16777 /* target = X A B B */
16778 ix86_expand_vector_set (false, target, val, 0);
16779 /* target = A X C D */
16780 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
16781 const1_rtx, const0_rtx,
16782 GEN_INT (2+4), GEN_INT (3+4)));
16783 return;
16784
16785 case 2:
16786 /* tmp = target = A B C D */
16787 tmp = copy_to_reg (target);
16788 /* tmp = X B C D */
16789 ix86_expand_vector_set (false, tmp, val, 0);
16790 /* target = A B X D */
16791 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
16792 const0_rtx, const1_rtx,
16793 GEN_INT (0+4), GEN_INT (3+4)));
16794 return;
16795
16796 case 3:
16797 /* tmp = target = A B C D */
16798 tmp = copy_to_reg (target);
16799 /* tmp = X B C D */
16800 ix86_expand_vector_set (false, tmp, val, 0);
16801 /* target = A B C X */
16802 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
16803 const0_rtx, const1_rtx,
16804 GEN_INT (2+4), GEN_INT (0+4)));
16805 return;
16806
16807 default:
16808 gcc_unreachable ();
16809 }
16810 break;
16811
16812 case E_V4SImode:
16813 use_vec_merge = TARGET_SSE4_1;
16814 if (use_vec_merge)
16815 break;
16816
16817 /* Element 0 handled by vec_merge below. */
16818 if (elt == 0)
16819 {
16820 use_vec_merge = true;
16821 break;
16822 }
16823
16824 if (TARGET_SSE2)
16825 {
16826 /* With SSE2, use integer shuffles to swap element 0 and ELT,
16827 store into element 0, then shuffle them back. */
16828
16829 rtx order[4];
16830
16831 order[0] = GEN_INT (elt);
16832 order[1] = const1_rtx;
16833 order[2] = const2_rtx;
16834 order[3] = GEN_INT (3);
16835 order[elt] = const0_rtx;
16836
16837 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
16838 order[1], order[2], order[3]));
16839
16840 ix86_expand_vector_set (false, target, val, 0);
16841
16842 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
16843 order[1], order[2], order[3]));
16844 }
16845 else
16846 {
16847 /* For SSE1, we have to reuse the V4SF code. */
16848 rtx t = gen_reg_rtx (V4SFmode);
16849 emit_move_insn (t, gen_lowpart (V4SFmode, target));
16850 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
16851 emit_move_insn (target, gen_lowpart (mode, t));
16852 }
16853 return;
16854
16855 case E_V8HImode:
16856 case E_V8HFmode:
16857 case E_V8BFmode:
16858 case E_V2HImode:
16859 use_vec_merge = TARGET_SSE2;
16860 break;
16861 case E_V4HImode:
16862 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
16863 break;
16864
16865 case E_V16QImode:
16866 case E_V4QImode:
16867 use_vec_merge = TARGET_SSE4_1;
16868 break;
16869
16870 case E_V8QImode:
16871 use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
16872 break;
16873
16874 case E_V32QImode:
16875 half_mode = V16QImode;
16876 j = 0;
16877 n = 16;
16878 goto half;
16879
16880 case E_V16HFmode:
16881 case E_V16BFmode:
16882 /* For ELT == 0, vec_setv8hf_0 can save 1 vpbroadcastw. */
16883 if (TARGET_AVX2 && elt != 0)
16884 {
16885 mmode = SImode;
16886 gen_blendm = ((mode == E_V16HFmode) ? gen_avx2_pblendph_1
16887 : gen_avx2_pblendbf_1);
16888 blendm_const = true;
16889 break;
16890 }
16891 else
16892 {
16893 half_mode = ((mode == E_V16HFmode) ? V8HFmode : V8BFmode);
16894 j = ((mode == E_V16HFmode) ? 6 : 7);
16895 n = 8;
16896 goto half;
16897 }
16898
16899 case E_V16HImode:
16900 half_mode = V8HImode;
16901 j = 1;
16902 n = 8;
16903 goto half;
16904
16905 case E_V8SImode:
16906 half_mode = V4SImode;
16907 j = 2;
16908 n = 4;
16909 goto half;
16910
16911 case E_V4DImode:
16912 half_mode = V2DImode;
16913 j = 3;
16914 n = 2;
16915 goto half;
16916
16917 case E_V8SFmode:
16918 half_mode = V4SFmode;
16919 j = 4;
16920 n = 4;
16921 goto half;
16922
16923 case E_V4DFmode:
16924 half_mode = V2DFmode;
16925 j = 5;
16926 n = 2;
16927 goto half;
16928
16929 half:
16930 /* Compute offset. */
16931 i = elt / n;
16932 elt %= n;
16933
16934 gcc_assert (i <= 1);
16935
16936 /* Extract the half. */
16937 tmp = gen_reg_rtx (half_mode);
16938 emit_insn (gen_extract[j][i] (tmp, target));
16939
16940 /* Put val in tmp at elt. */
16941 ix86_expand_vector_set (false, tmp, val, elt);
16942
16943 /* Put it back. */
16944 emit_insn (gen_insert[j][i] (target, target, tmp));
16945 return;
16946
16947 case E_V8DFmode:
16948 if (TARGET_AVX512F)
16949 {
16950 mmode = QImode;
16951 gen_blendm = gen_avx512f_blendmv8df;
16952 }
16953 break;
16954
16955 case E_V8DImode:
16956 if (TARGET_AVX512F)
16957 {
16958 mmode = QImode;
16959 gen_blendm = gen_avx512f_blendmv8di;
16960 }
16961 break;
16962
16963 case E_V16SFmode:
16964 if (TARGET_AVX512F)
16965 {
16966 mmode = HImode;
16967 gen_blendm = gen_avx512f_blendmv16sf;
16968 }
16969 break;
16970
16971 case E_V16SImode:
16972 if (TARGET_AVX512F)
16973 {
16974 mmode = HImode;
16975 gen_blendm = gen_avx512f_blendmv16si;
16976 }
16977 break;
16978
16979 case E_V32HFmode:
16980 if (TARGET_AVX512BW)
16981 {
16982 mmode = SImode;
16983 gen_blendm = gen_avx512bw_blendmv32hf;
16984 }
16985 break;
16986 case E_V32BFmode:
16987 if (TARGET_AVX512BW)
16988 {
16989 mmode = SImode;
16990 gen_blendm = gen_avx512bw_blendmv32bf;
16991 }
16992 break;
16993 case E_V32HImode:
16994 if (TARGET_AVX512BW)
16995 {
16996 mmode = SImode;
16997 gen_blendm = gen_avx512bw_blendmv32hi;
16998 }
16999 else if (TARGET_AVX512F)
17000 {
17001 half_mode = E_V8HImode;
17002 n = 8;
17003 goto quarter;
17004 }
17005 break;
17006
17007 case E_V64QImode:
17008 if (TARGET_AVX512BW)
17009 {
17010 mmode = DImode;
17011 gen_blendm = gen_avx512bw_blendmv64qi;
17012 }
17013 else if (TARGET_AVX512F)
17014 {
17015 half_mode = E_V16QImode;
17016 n = 16;
17017 goto quarter;
17018 }
17019 break;
17020
17021 quarter:
17022 /* Compute offset. */
17023 i = elt / n;
17024 elt %= n;
17025
17026 gcc_assert (i <= 3);
17027
17028 {
17029 /* Extract the quarter. */
17030 tmp = gen_reg_rtx (V4SImode);
17031 rtx tmp2 = gen_lowpart (V16SImode, target);
17032 rtx mask = gen_reg_rtx (QImode);
17033
17034 emit_move_insn (mask, constm1_rtx);
17035 emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i),
17036 tmp, mask));
17037
17038 tmp2 = gen_reg_rtx (half_mode);
17039 emit_move_insn (tmp2, gen_lowpart (half_mode, tmp));
17040 tmp = tmp2;
17041
17042 /* Put val in tmp at elt. */
17043 ix86_expand_vector_set (false, tmp, val, elt);
17044
17045 /* Put it back. */
17046 tmp2 = gen_reg_rtx (V16SImode);
17047 rtx tmp3 = gen_lowpart (V16SImode, target);
17048 mask = gen_reg_rtx (HImode);
17049 emit_move_insn (mask, constm1_rtx);
17050 tmp = gen_lowpart (V4SImode, tmp);
17051 emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i),
17052 tmp3, mask));
17053 emit_move_insn (target, gen_lowpart (mode, tmp2));
17054 }
17055 return;
17056
17057 default:
17058 break;
17059 }
17060
17061 if (mmode != VOIDmode)
17062 {
17063 tmp = gen_reg_rtx (mode);
17064 emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
17065 rtx merge_mask = gen_int_mode (HOST_WIDE_INT_1U << elt, mmode);
17066 /* The avx512*_blendm<mode> expanders have different operand order
17067 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
17068 elements where the mask is set and second input operand otherwise,
17069 in {sse,avx}*_*blend* the first input operand is used for elements
17070 where the mask is clear and second input operand otherwise. */
17071 if (!blendm_const)
17072 merge_mask = force_reg (mmode, merge_mask);
17073 emit_insn (gen_blendm (target, target, tmp, merge_mask));
17074 }
17075 else if (use_vec_merge)
17076 {
17077 do_vec_merge:
17078 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
17079 tmp = gen_rtx_VEC_MERGE (mode, tmp, target,
17080 GEN_INT (HOST_WIDE_INT_1U << elt));
17081 emit_insn (gen_rtx_SET (target, tmp));
17082 }
17083 else
17084 {
17085 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
17086
17087 emit_move_insn (mem, target);
17088
17089 tmp = adjust_address (mem, inner_mode, elt * GET_MODE_SIZE (inner_mode));
17090 emit_move_insn (tmp, val);
17091
17092 emit_move_insn (target, mem);
17093 }
17094 }
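
/* A plain C sketch (illustration only, not the expander itself) of what the
   vec_merge / blendm path above computes: VAL is broadcast and blended into
   TARGET under the one-bit mask 1 << ELT, so exactly one element changes.
   The ref_set_via_blend name and float element type are assumptions of the
   sketch.

     void ref_set_via_blend (float *dst, int nelts, int elt, float val)
     {
       unsigned mask = 1u << elt;
       for (int i = 0; i < nelts; i++)
         if (mask & (1u << i))
           dst[i] = val;        // element taken from the broadcast operand
       // all other elements keep their old value, as in VEC_MERGE
     }

   The "half:" and "quarter:" paths only narrow the vector first so that the
   same single-element store can be done on a 128-bit piece.  */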
17095
17096 void
17097 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
17098 {
17099 machine_mode mode = GET_MODE (vec);
17100 machine_mode inner_mode = GET_MODE_INNER (mode);
17101 bool use_vec_extr = false;
17102 rtx tmp;
17103
17104 switch (mode)
17105 {
17106 case E_V2SImode:
17107 use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
17108 if (use_vec_extr)
17109 break;
17110 /* FALLTHRU */
17111
17112 case E_V2SFmode:
17113 if (!mmx_ok)
17114 break;
17115 /* FALLTHRU */
17116
17117 case E_V2DFmode:
17118 case E_V2DImode:
17119 case E_V2TImode:
17120 case E_V4TImode:
17121 use_vec_extr = true;
17122 break;
17123
17124 case E_V4SFmode:
17125 use_vec_extr = TARGET_SSE4_1;
17126 if (use_vec_extr)
17127 break;
17128
17129 switch (elt)
17130 {
17131 case 0:
17132 tmp = vec;
17133 break;
17134
17135 case 1:
17136 case 3:
17137 tmp = gen_reg_rtx (mode);
17138 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
17139 GEN_INT (elt), GEN_INT (elt),
17140 GEN_INT (elt+4), GEN_INT (elt+4)));
17141 break;
17142
17143 case 2:
17144 tmp = gen_reg_rtx (mode);
17145 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
17146 break;
17147
17148 default:
17149 gcc_unreachable ();
17150 }
17151 vec = tmp;
17152 use_vec_extr = true;
17153 elt = 0;
17154 break;
17155
17156 case E_V4SImode:
17157 use_vec_extr = TARGET_SSE4_1;
17158 if (use_vec_extr)
17159 break;
17160
17161 if (TARGET_SSE2)
17162 {
17163 switch (elt)
17164 {
17165 case 0:
17166 tmp = vec;
17167 break;
17168
17169 case 1:
17170 case 3:
17171 tmp = gen_reg_rtx (mode);
17172 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
17173 GEN_INT (elt), GEN_INT (elt),
17174 GEN_INT (elt), GEN_INT (elt)));
17175 break;
17176
17177 case 2:
17178 tmp = gen_reg_rtx (mode);
17179 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
17180 break;
17181
17182 default:
17183 gcc_unreachable ();
17184 }
17185 vec = tmp;
17186 use_vec_extr = true;
17187 elt = 0;
17188 }
17189 else
17190 {
17191 /* For SSE1, we have to reuse the V4SF code. */
17192 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
17193 gen_lowpart (V4SFmode, vec), elt);
17194 return;
17195 }
17196 break;
17197
17198 case E_V8HImode:
17199 case E_V8HFmode:
17200 case E_V8BFmode:
17201 case E_V2HImode:
17202 use_vec_extr = TARGET_SSE2;
17203 break;
17204 case E_V4HImode:
17205 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
17206 break;
17207
17208 case E_V16QImode:
17209 use_vec_extr = TARGET_SSE4_1;
17210 if (!use_vec_extr
17211 && TARGET_SSE2
17212 && elt == 0
17213 && (optimize_insn_for_size_p () || TARGET_INTER_UNIT_MOVES_FROM_VEC))
17214 {
17215 tmp = gen_reg_rtx (SImode);
17216 ix86_expand_vector_extract (false, tmp, gen_lowpart (V4SImode, vec),
17217 0);
17218 emit_insn (gen_rtx_SET (target, gen_lowpart (QImode, tmp)));
17219 return;
17220 }
17221 break;
17222 case E_V4QImode:
17223 use_vec_extr = TARGET_SSE4_1;
17224 break;
17225
17226 case E_V8SFmode:
17227 if (TARGET_AVX)
17228 {
17229 tmp = gen_reg_rtx (V4SFmode);
17230 if (elt < 4)
17231 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
17232 else
17233 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
17234 ix86_expand_vector_extract (false, target, tmp, elt & 3);
17235 return;
17236 }
17237 break;
17238
17239 case E_V4DFmode:
17240 if (TARGET_AVX)
17241 {
17242 tmp = gen_reg_rtx (V2DFmode);
17243 if (elt < 2)
17244 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
17245 else
17246 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
17247 ix86_expand_vector_extract (false, target, tmp, elt & 1);
17248 return;
17249 }
17250 break;
17251
17252 case E_V32QImode:
17253 if (TARGET_AVX)
17254 {
17255 tmp = gen_reg_rtx (V16QImode);
17256 if (elt < 16)
17257 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
17258 else
17259 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
17260 ix86_expand_vector_extract (false, target, tmp, elt & 15);
17261 return;
17262 }
17263 break;
17264
17265 case E_V16HImode:
17266 if (TARGET_AVX)
17267 {
17268 tmp = gen_reg_rtx (V8HImode);
17269 if (elt < 8)
17270 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
17271 else
17272 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
17273 ix86_expand_vector_extract (false, target, tmp, elt & 7);
17274 return;
17275 }
17276 break;
17277
17278 case E_V8SImode:
17279 if (TARGET_AVX)
17280 {
17281 tmp = gen_reg_rtx (V4SImode);
17282 if (elt < 4)
17283 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
17284 else
17285 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
17286 ix86_expand_vector_extract (false, target, tmp, elt & 3);
17287 return;
17288 }
17289 break;
17290
17291 case E_V4DImode:
17292 if (TARGET_AVX)
17293 {
17294 tmp = gen_reg_rtx (V2DImode);
17295 if (elt < 2)
17296 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
17297 else
17298 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
17299 ix86_expand_vector_extract (false, target, tmp, elt & 1);
17300 return;
17301 }
17302 break;
17303
17304 case E_V32HImode:
17305 if (TARGET_AVX512BW)
17306 {
17307 tmp = gen_reg_rtx (V16HImode);
17308 if (elt < 16)
17309 emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
17310 else
17311 emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
17312 ix86_expand_vector_extract (false, target, tmp, elt & 15);
17313 return;
17314 }
17315 break;
17316
17317 case E_V64QImode:
17318 if (TARGET_AVX512BW)
17319 {
17320 tmp = gen_reg_rtx (V32QImode);
17321 if (elt < 32)
17322 emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
17323 else
17324 emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
17325 ix86_expand_vector_extract (false, target, tmp, elt & 31);
17326 return;
17327 }
17328 break;
17329
17330 case E_V16SFmode:
17331 tmp = gen_reg_rtx (V8SFmode);
17332 if (elt < 8)
17333 emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
17334 else
17335 emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
17336 ix86_expand_vector_extract (false, target, tmp, elt & 7);
17337 return;
17338
17339 case E_V8DFmode:
17340 tmp = gen_reg_rtx (V4DFmode);
17341 if (elt < 4)
17342 emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
17343 else
17344 emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
17345 ix86_expand_vector_extract (false, target, tmp, elt & 3);
17346 return;
17347
17348 case E_V16SImode:
17349 tmp = gen_reg_rtx (V8SImode);
17350 if (elt < 8)
17351 emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
17352 else
17353 emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
17354 ix86_expand_vector_extract (false, target, tmp, elt & 7);
17355 return;
17356
17357 case E_V8DImode:
17358 tmp = gen_reg_rtx (V4DImode);
17359 if (elt < 4)
17360 emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
17361 else
17362 emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
17363 ix86_expand_vector_extract (false, target, tmp, elt & 3);
17364 return;
17365
17366 case E_V32HFmode:
17367 case E_V32BFmode:
17368 if (TARGET_AVX512BW)
17369 {
17370 tmp = (mode == E_V32HFmode
17371 ? gen_reg_rtx (V16HFmode)
17372 : gen_reg_rtx (V16BFmode));
17373 if (elt < 16)
17374 emit_insn (maybe_gen_vec_extract_lo (mode, tmp, vec));
17375 else
17376 emit_insn (maybe_gen_vec_extract_hi (mode, tmp, vec));
17377 ix86_expand_vector_extract (false, target, tmp, elt & 15);
17378 return;
17379 }
17380 break;
17381
17382 case E_V16HFmode:
17383 case E_V16BFmode:
17384 if (TARGET_AVX)
17385 {
17386 tmp = (mode == E_V16HFmode
17387 ? gen_reg_rtx (V8HFmode)
17388 : gen_reg_rtx (V8BFmode));
17389 if (elt < 8)
17390 emit_insn (maybe_gen_vec_extract_lo (mode, tmp, vec));
17391 else
17392 emit_insn (maybe_gen_vec_extract_hi (mode, tmp, vec));
17393 ix86_expand_vector_extract (false, target, tmp, elt & 7);
17394 return;
17395 }
17396 break;
17397
17398 case E_V8QImode:
17399 use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
17400 /* ??? Could extract the appropriate HImode element and shift. */
17401 break;
17402
17403 default:
17404 break;
17405 }
17406
17407 if (use_vec_extr)
17408 {
17409 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
17410 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
17411
17412 /* Let the rtl optimizers know about the zero extension performed. */
17413 if (inner_mode == QImode || inner_mode == HImode)
17414 {
17415 rtx reg = gen_reg_rtx (SImode);
17416 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
17417 emit_move_insn (reg, tmp);
17418 tmp = gen_lowpart (inner_mode, reg);
17419 SUBREG_PROMOTED_VAR_P (tmp) = 1;
17420 SUBREG_PROMOTED_SET (tmp, 1);
17421 }
17422
17423 emit_move_insn (target, tmp);
17424 }
17425 else
17426 {
17427 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
17428
17429 emit_move_insn (mem, vec);
17430
17431 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
17432 emit_move_insn (target, tmp);
17433 }
17434 }
17435
17436 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
17437 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
17438 The upper bits of DEST are undefined, though they shouldn't cause
17439 exceptions (some bits from src or all zeros are ok). */
17440
17441 static void
17442 emit_reduc_half (rtx dest, rtx src, int i)
17443 {
17444 rtx tem, d = dest;
17445 switch (GET_MODE (src))
17446 {
17447 case E_V4SFmode:
17448 if (i == 128)
17449 tem = gen_sse_movhlps (dest, src, src);
17450 else
17451 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
17452 GEN_INT (1 + 4), GEN_INT (1 + 4));
17453 break;
17454 case E_V2DFmode:
17455 tem = gen_vec_interleave_highv2df (dest, src, src);
17456 break;
17457 case E_V4QImode:
17458 d = gen_reg_rtx (V1SImode);
17459 tem = gen_mmx_lshrv1si3 (d, gen_lowpart (V1SImode, src),
17460 GEN_INT (i / 2));
17461 break;
17462 case E_V4HImode:
17463 d = gen_reg_rtx (V1DImode);
17464 tem = gen_mmx_lshrv1di3 (d, gen_lowpart (V1DImode, src),
17465 GEN_INT (i / 2));
17466 break;
17467 case E_V16QImode:
17468 case E_V8HImode:
17469 case E_V8HFmode:
17470 case E_V4SImode:
17471 case E_V2DImode:
17472 d = gen_reg_rtx (V1TImode);
17473 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
17474 GEN_INT (i / 2));
17475 break;
17476 case E_V8SFmode:
17477 if (i == 256)
17478 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
17479 else
17480 tem = gen_avx_shufps256 (dest, src, src,
17481 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
17482 break;
17483 case E_V4DFmode:
17484 if (i == 256)
17485 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
17486 else
17487 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
17488 break;
17489 case E_V32QImode:
17490 case E_V16HImode:
17491 case E_V16HFmode:
17492 case E_V8SImode:
17493 case E_V4DImode:
17494 if (i == 256)
17495 {
17496 if (GET_MODE (dest) != V4DImode)
17497 d = gen_reg_rtx (V4DImode);
17498 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
17499 gen_lowpart (V4DImode, src),
17500 const1_rtx);
17501 }
17502 else
17503 {
17504 d = gen_reg_rtx (V2TImode);
17505 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
17506 GEN_INT (i / 2));
17507 }
17508 break;
17509 case E_V64QImode:
17510 case E_V32HImode:
17511 case E_V32HFmode:
17512 if (i < 64)
17513 {
17514 d = gen_reg_rtx (V4TImode);
17515 tem = gen_avx512bw_lshrv4ti3 (d, gen_lowpart (V4TImode, src),
17516 GEN_INT (i / 2));
17517 break;
17518 }
17519 /* FALLTHRU */
17520 case E_V16SImode:
17521 case E_V16SFmode:
17522 case E_V8DImode:
17523 case E_V8DFmode:
17524 if (i > 128)
17525 tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
17526 gen_lowpart (V16SImode, src),
17527 gen_lowpart (V16SImode, src),
17528 GEN_INT (0x4 + (i == 512 ? 4 : 0)),
17529 GEN_INT (0x5 + (i == 512 ? 4 : 0)),
17530 GEN_INT (0x6 + (i == 512 ? 4 : 0)),
17531 GEN_INT (0x7 + (i == 512 ? 4 : 0)),
17532 GEN_INT (0xC), GEN_INT (0xD),
17533 GEN_INT (0xE), GEN_INT (0xF),
17534 GEN_INT (0x10), GEN_INT (0x11),
17535 GEN_INT (0x12), GEN_INT (0x13),
17536 GEN_INT (0x14), GEN_INT (0x15),
17537 GEN_INT (0x16), GEN_INT (0x17));
17538 else
17539 tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
17540 gen_lowpart (V16SImode, src),
17541 GEN_INT (i == 128 ? 0x2 : 0x1),
17542 GEN_INT (0x3),
17543 GEN_INT (0x3),
17544 GEN_INT (0x3),
17545 GEN_INT (i == 128 ? 0x6 : 0x5),
17546 GEN_INT (0x7),
17547 GEN_INT (0x7),
17548 GEN_INT (0x7),
17549 GEN_INT (i == 128 ? 0xA : 0x9),
17550 GEN_INT (0xB),
17551 GEN_INT (0xB),
17552 GEN_INT (0xB),
17553 GEN_INT (i == 128 ? 0xE : 0xD),
17554 GEN_INT (0xF),
17555 GEN_INT (0xF),
17556 GEN_INT (0xF));
17557 break;
17558 default:
17559 gcc_unreachable ();
17560 }
17561 emit_insn (tem);
17562 if (d != dest)
17563 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
17564 }
17565
17566 /* Expand a vector reduction. FN is the binary pattern to reduce;
17567 DEST is the destination; IN is the input vector. */
17568
17569 void
17570 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
17571 {
17572 rtx half, dst, vec = in;
17573 machine_mode mode = GET_MODE (in);
17574 int i;
17575
17576 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
17577 if (TARGET_SSE4_1
17578 && mode == V8HImode
17579 && fn == gen_uminv8hi3)
17580 {
17581 emit_insn (gen_sse4_1_phminposuw (dest, in));
17582 return;
17583 }
17584
17585 for (i = GET_MODE_BITSIZE (mode);
17586 i > GET_MODE_UNIT_BITSIZE (mode);
17587 i >>= 1)
17588 {
17589 half = gen_reg_rtx (mode);
17590 emit_reduc_half (half, vec, i);
17591 if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
17592 dst = dest;
17593 else
17594 dst = gen_reg_rtx (mode);
17595 emit_insn (fn (dst, half, vec));
17596 vec = dst;
17597 }
17598 }
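
/* A plain C sketch of the halving reduction above (illustration only), using
   addition for FN; any associative binary pattern works the same way.  Each
   emit_reduc_half step moves the upper half of the live data into the low
   lanes and FN combines the two halves, so after log2(nelts) steps element 0
   holds the full reduction.  The ref_reduc_add name and in-place update are
   assumptions of the sketch.

     float ref_reduc_add (float *v, int nelts)   // nelts is a power of two
     {
       for (int n = nelts / 2; n >= 1; n /= 2)   // mirrors i >>= 1 above
         for (int k = 0; k < n; k++)
           v[k] = v[k] + v[k + n];               // fn (dst, half, vec)
       return v[0];
     }
  */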
17599
17600 /* Output code to perform a conditional jump to LABEL if the C2 flag in
17601 the FP status register is set. */
17602
17603 void
17604 ix86_emit_fp_unordered_jump (rtx label)
17605 {
17606 rtx reg = gen_reg_rtx (HImode);
17607 rtx_insn *insn;
17608 rtx temp;
17609
17610 emit_insn (gen_x86_fnstsw_1 (reg));
17611
17612 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
17613 {
17614 emit_insn (gen_x86_sahf_1 (reg));
17615
17616 temp = gen_rtx_REG (CCmode, FLAGS_REG);
17617 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
17618 }
17619 else
17620 {
17621 emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
17622
17623 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
17624 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
17625 }
17626
17627 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
17628 gen_rtx_LABEL_REF (VOIDmode, label),
17629 pc_rtx);
17630 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
17631 predict_jump (REG_BR_PROB_BASE * 10 / 100);
17632 JUMP_LABEL (insn) = label;
17633 }
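
/* In scalar terms (illustration only): after fnstsw the x87 condition codes
   C0..C3 sit in bits 8, 9, 10 and 14 of the status word, so C2 is bit 0x04
   of the high byte, which is what the testqi_ext path above checks; the sahf
   path instead copies that byte into EFLAGS and uses the ordinary unordered
   condition.  status_word below is a hypothetical variable standing in for
   the value fnstsw stored.

     if ((status_word >> 8) & 0x04)   // C2 set
       goto label;
  */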
17634
17635 /* Output code to perform a sinh XFmode calculation. */
17636
17637 void
17638 ix86_emit_i387_sinh (rtx op0, rtx op1)
17639 {
17640 rtx e1 = gen_reg_rtx (XFmode);
17641 rtx e2 = gen_reg_rtx (XFmode);
17642 rtx scratch = gen_reg_rtx (HImode);
17643 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
17644 rtx half = const_double_from_real_value (dconsthalf, XFmode);
17645 rtx cst1, tmp;
17646 rtx_code_label *jump_label = gen_label_rtx ();
17647 rtx_insn *insn;
17648
17649 /* scratch = fxam (op1) */
17650 emit_insn (gen_fxamxf2_i387 (scratch, op1));
17651
17652 /* e1 = expm1 (|op1|) */
17653 emit_insn (gen_absxf2 (e2, op1));
17654 emit_insn (gen_expm1xf2 (e1, e2));
17655
17656 /* e2 = e1 / (e1 + 1.0) + e1 */
17657 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17658 emit_insn (gen_addxf3 (e2, e1, cst1));
17659 emit_insn (gen_divxf3 (e2, e1, e2));
17660 emit_insn (gen_addxf3 (e2, e2, e1));
17661
17662 /* flags = signbit (op1) */
17663 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
17664
17665 /* if (flags) then e2 = -e2 */
17666 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
17667 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
17668 gen_rtx_LABEL_REF (VOIDmode, jump_label),
17669 pc_rtx);
17670 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17671 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17672 JUMP_LABEL (insn) = jump_label;
17673
17674 emit_insn (gen_negxf2 (e2, e2));
17675
17676 emit_label (jump_label);
17677 LABEL_NUSES (jump_label) = 1;
17678
17679 /* op0 = 0.5 * e2 */
17680 half = force_reg (XFmode, half);
17681 emit_insn (gen_mulxf3 (op0, e2, half));
17682 }
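
/* A plain C sketch of the identity used above (illustration only, assuming
   expm1/fabs/signbit from <math.h>): with e = expm1 (|x|),
   sinh (|x|) = 0.5 * (e + e / (e + 1)), and the fxam sign bit (C1, the 0x02
   test above) restores the sign of the argument.

     double ref_sinh (double x)
     {
       double e = expm1 (fabs (x));            // e1
       double r = 0.5 * (e + e / (e + 1.0));   // 0.5 * e2
       return signbit (x) ? -r : r;
     }
  */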
17683
17684 /* Output code to perform a cosh XFmode calculation. */
17685
17686 void
17687 ix86_emit_i387_cosh (rtx op0, rtx op1)
17688 {
17689 rtx e1 = gen_reg_rtx (XFmode);
17690 rtx e2 = gen_reg_rtx (XFmode);
17691 rtx half = const_double_from_real_value (dconsthalf, XFmode);
17692 rtx cst1;
17693
17694 /* e1 = exp (op1) */
17695 emit_insn (gen_expxf2 (e1, op1));
17696
17697 /* e2 = e1 + 1.0 / e1 */
17698 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17699 emit_insn (gen_divxf3 (e2, cst1, e1));
17700 emit_insn (gen_addxf3 (e2, e1, e2));
17701
17702 /* op0 = 0.5 * e2 */
17703 half = force_reg (XFmode, half);
17704 emit_insn (gen_mulxf3 (op0, e2, half));
17705 }
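
/* A plain C sketch of the sequence above (illustration only, assuming exp
   from <math.h>): cosh (x) = 0.5 * (exp (x) + 1 / exp (x)).

     double ref_cosh (double x)
     {
       double e = exp (x);            // e1
       return 0.5 * (e + 1.0 / e);    // 0.5 * e2
     }
  */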
17706
17707 /* Output code to perform a tanh XFmode calculation. */
17708
17709 void
17710 ix86_emit_i387_tanh (rtx op0, rtx op1)
17711 {
17712 rtx e1 = gen_reg_rtx (XFmode);
17713 rtx e2 = gen_reg_rtx (XFmode);
17714 rtx scratch = gen_reg_rtx (HImode);
17715 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
17716 rtx cst2, tmp;
17717 rtx_code_label *jump_label = gen_label_rtx ();
17718 rtx_insn *insn;
17719
17720 /* scratch = fxam (op1) */
17721 emit_insn (gen_fxamxf2_i387 (scratch, op1));
17722
17723 /* e1 = expm1 (-|2 * op1|) */
17724 emit_insn (gen_addxf3 (e2, op1, op1));
17725 emit_insn (gen_absxf2 (e2, e2));
17726 emit_insn (gen_negxf2 (e2, e2));
17727 emit_insn (gen_expm1xf2 (e1, e2));
17728
17729 /* e2 = e1 / (e1 + 2.0) */
17730 cst2 = force_reg (XFmode, CONST2_RTX (XFmode));
17731 emit_insn (gen_addxf3 (e2, e1, cst2));
17732 emit_insn (gen_divxf3 (e2, e1, e2));
17733
17734 /* flags = signbit (op1) */
17735 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
17736
17737 /* if (!flags) then e2 = -e2 */
17738 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
17739 gen_rtx_NE (VOIDmode, flags, const0_rtx),
17740 gen_rtx_LABEL_REF (VOIDmode, jump_label),
17741 pc_rtx);
17742 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17743 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17744 JUMP_LABEL (insn) = jump_label;
17745
17746 emit_insn (gen_negxf2 (e2, e2));
17747
17748 emit_label (jump_label);
17749 LABEL_NUSES (jump_label) = 1;
17750
17751 emit_move_insn (op0, e2);
17752 }
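
/* A plain C sketch of the identity used above (illustration only, assuming
   expm1/fabs/signbit from <math.h>): with e = expm1 (-2|x|), e / (e + 2)
   equals -tanh (|x|), so the sign test only decides whether to negate.

     double ref_tanh (double x)
     {
       double e = expm1 (-2.0 * fabs (x));   // e1
       double r = e / (e + 2.0);             // e2 == -tanh (|x|)
       return signbit (x) ? r : -r;
     }
  */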
17753
17754 /* Output code to perform an asinh XFmode calculation. */
17755
17756 void
17757 ix86_emit_i387_asinh (rtx op0, rtx op1)
17758 {
17759 rtx e1 = gen_reg_rtx (XFmode);
17760 rtx e2 = gen_reg_rtx (XFmode);
17761 rtx scratch = gen_reg_rtx (HImode);
17762 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
17763 rtx cst1, tmp;
17764 rtx_code_label *jump_label = gen_label_rtx ();
17765 rtx_insn *insn;
17766
17767 /* e2 = sqrt (op1^2 + 1.0) + 1.0 */
17768 emit_insn (gen_mulxf3 (e1, op1, op1));
17769 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17770 emit_insn (gen_addxf3 (e2, e1, cst1));
17771 emit_insn (gen_sqrtxf2 (e2, e2));
17772 emit_insn (gen_addxf3 (e2, e2, cst1));
17773
17774 /* e1 = e1 / e2 */
17775 emit_insn (gen_divxf3 (e1, e1, e2));
17776
17777 /* scratch = fxam (op1) */
17778 emit_insn (gen_fxamxf2_i387 (scratch, op1));
17779
17780 /* e1 = e1 + |op1| */
17781 emit_insn (gen_absxf2 (e2, op1));
17782 emit_insn (gen_addxf3 (e1, e1, e2));
17783
17784 /* e2 = log1p (e1) */
17785 ix86_emit_i387_log1p (e2, e1);
17786
17787 /* flags = signbit (op1) */
17788 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
17789
17790 /* if (flags) then e2 = -e2 */
17791 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
17792 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
17793 gen_rtx_LABEL_REF (VOIDmode, jump_label),
17794 pc_rtx);
17795 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17796 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17797 JUMP_LABEL (insn) = jump_label;
17798
17799 emit_insn (gen_negxf2 (e2, e2));
17800
17801 emit_label (jump_label);
17802 LABEL_NUSES (jump_label) = 1;
17803
17804 emit_move_insn (op0, e2);
17805 }
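
/* A plain C sketch of the identity used above (illustration only, assuming
   sqrt/log1p/fabs/signbit from <math.h>): x*x / (sqrt (x*x + 1) + 1) equals
   sqrt (x*x + 1) - 1 without the cancellation, so
   asinh (|x|) = log1p (|x| + x*x / (sqrt (x*x + 1) + 1)).

     double ref_asinh (double x)
     {
       double t = x * x / (sqrt (x * x + 1.0) + 1.0);   // e1 / e2
       double r = log1p (t + fabs (x));                 // asinh (|x|)
       return signbit (x) ? -r : r;
     }
  */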
17806
17807 /* Output code to perform an acosh XFmode calculation. */
17808
17809 void
17810 ix86_emit_i387_acosh (rtx op0, rtx op1)
17811 {
17812 rtx e1 = gen_reg_rtx (XFmode);
17813 rtx e2 = gen_reg_rtx (XFmode);
17814 rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17815
17816 /* e2 = sqrt (op1 + 1.0) */
17817 emit_insn (gen_addxf3 (e2, op1, cst1));
17818 emit_insn (gen_sqrtxf2 (e2, e2));
17819
17820 /* e1 = sqrt (op1 - 1.0) */
17821 emit_insn (gen_subxf3 (e1, op1, cst1));
17822 emit_insn (gen_sqrtxf2 (e1, e1));
17823
17824 /* e1 = e1 * e2 */
17825 emit_insn (gen_mulxf3 (e1, e1, e2));
17826
17827 /* e1 = e1 + op1 */
17828 emit_insn (gen_addxf3 (e1, e1, op1));
17829
17830 /* op0 = log (e1) */
17831 emit_insn (gen_logxf2 (op0, e1));
17832 }
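
/* A plain C sketch of the sequence above (illustration only, assuming
   sqrt/log from <math.h>): for x >= 1,
   acosh (x) = log (x + sqrt (x - 1) * sqrt (x + 1)).

     double ref_acosh (double x)       // defined for x >= 1.0
     {
       return log (x + sqrt (x - 1.0) * sqrt (x + 1.0));
     }
  */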
17833
17834 /* Output code to perform an atanh XFmode calculation. */
17835
17836 void
17837 ix86_emit_i387_atanh (rtx op0, rtx op1)
17838 {
17839 rtx e1 = gen_reg_rtx (XFmode);
17840 rtx e2 = gen_reg_rtx (XFmode);
17841 rtx scratch = gen_reg_rtx (HImode);
17842 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
17843 rtx half = const_double_from_real_value (dconsthalf, XFmode);
17844 rtx cst1, tmp;
17845 rtx_code_label *jump_label = gen_label_rtx ();
17846 rtx_insn *insn;
17847
17848 /* scratch = fxam (op1) */
17849 emit_insn (gen_fxamxf2_i387 (scratch, op1));
17850
17851 /* e2 = |op1| */
17852 emit_insn (gen_absxf2 (e2, op1));
17853
17854 /* e1 = -(e2 + e2) / (e2 + 1.0) */
17855 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17856 emit_insn (gen_addxf3 (e1, e2, cst1));
17857 emit_insn (gen_addxf3 (e2, e2, e2));
17858 emit_insn (gen_negxf2 (e2, e2));
17859 emit_insn (gen_divxf3 (e1, e2, e1));
17860
17861 /* e2 = log1p (e1) */
17862 ix86_emit_i387_log1p (e2, e1);
17863
17864 /* flags = signbit (op1) */
17865 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
17866
17867 /* if (!flags) then e2 = -e2 */
17868 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
17869 gen_rtx_NE (VOIDmode, flags, const0_rtx),
17870 gen_rtx_LABEL_REF (VOIDmode, jump_label),
17871 pc_rtx);
17872 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17873 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17874 JUMP_LABEL (insn) = jump_label;
17875
17876 emit_insn (gen_negxf2 (e2, e2));
17877
17878 emit_label (jump_label);
17879 LABEL_NUSES (jump_label) = 1;
17880
17881 /* op0 = 0.5 * e2 */
17882 half = force_reg (XFmode, half);
17883 emit_insn (gen_mulxf3 (op0, e2, half));
17884 }
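
/* A plain C sketch of the identity used above (illustration only, assuming
   log1p/fabs/signbit from <math.h>): with a = |x|,
   log1p (-2a / (a + 1)) = log ((1 - a) / (1 + a)) = -2 * atanh (a), so
   halving and fixing the sign gives atanh (x).

     double ref_atanh (double x)
     {
       double a = fabs (x);
       double r = log1p (-2.0 * a / (a + 1.0));   // -2 * atanh (|x|)
       return 0.5 * (signbit (x) ? r : -r);
     }
  */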
17885
17886 /* Output code to perform a log1p XFmode calculation. */
17887
17888 void
17889 ix86_emit_i387_log1p (rtx op0, rtx op1)
17890 {
17891 rtx_code_label *label1 = gen_label_rtx ();
17892 rtx_code_label *label2 = gen_label_rtx ();
17893
17894 rtx tmp = gen_reg_rtx (XFmode);
17895 rtx res = gen_reg_rtx (XFmode);
17896 rtx cst, cstln2, cst1;
17897 rtx_insn *insn;
17898
17899 /* The emit_jump call emits pending stack adjust, make sure it is emitted
17900 before the conditional jump, otherwise the stack adjustment will be
17901 only conditional. */
17902 do_pending_stack_adjust ();
17903
17904 cst = const_double_from_real_value
17905 (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode);
17906 cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */
17907
17908 emit_insn (gen_absxf2 (tmp, op1));
17909
17910 cst = force_reg (XFmode, cst);
17911 ix86_expand_branch (GE, tmp, cst, label1);
17912 predict_jump (REG_BR_PROB_BASE * 10 / 100);
17913 insn = get_last_insn ();
17914 JUMP_LABEL (insn) = label1;
17915
17916 emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2));
17917 emit_jump (label2);
17918
17919 emit_label (label1);
17920 LABEL_NUSES (label1) = 1;
17921
17922 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17923 emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1)));
17924 emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2));
17925
17926 emit_label (label2);
17927 LABEL_NUSES (label2) = 1;
17928
17929 emit_move_insn (op0, res);
17930 }
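
/* A plain C sketch of the split above (illustration only): fyl2xp1 is only
   specified for |x| < 1 - sqrt(2)/2 ~= 0.2928932..., so small arguments use
   it directly while larger ones take the ordinary fyl2x (x + 1) route; both
   results are scaled by ln(2) because the hardware computes base-2
   logarithms.  fyl2xp1() and fyl2x() below are hypothetical stand-ins for
   the i387 instructions, with fyl2xp1 (x, y) = y * log2 (x + 1) and
   fyl2x (x, y) = y * log2 (x); M_LN2 is ln(2) as provided by <math.h> on
   most systems.

     double ref_log1p (double x)
     {
       const double thresh = 0.29289321881345247561810596348408353;
       if (fabs (x) < thresh)
         return fyl2xp1 (x, M_LN2);      // accurate near zero
       return fyl2x (x + 1.0, M_LN2);    // 1 + x loses no accuracy here
     }
  */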
17931
17932 /* Emit code for round calculation. */
17933 void
17934 ix86_emit_i387_round (rtx op0, rtx op1)
17935 {
17936 machine_mode inmode = GET_MODE (op1);
17937 machine_mode outmode = GET_MODE (op0);
17938 rtx e1 = gen_reg_rtx (XFmode);
17939 rtx e2 = gen_reg_rtx (XFmode);
17940 rtx scratch = gen_reg_rtx (HImode);
17941 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
17942 rtx half = const_double_from_real_value (dconsthalf, XFmode);
17943 rtx res = gen_reg_rtx (outmode);
17944 rtx_code_label *jump_label = gen_label_rtx ();
17945 rtx (*floor_insn) (rtx, rtx);
17946 rtx (*neg_insn) (rtx, rtx);
17947 rtx_insn *insn;
17948 rtx tmp;
17949
17950 switch (inmode)
17951 {
17952 case E_SFmode:
17953 case E_DFmode:
17954 tmp = gen_reg_rtx (XFmode);
17955
17956 emit_insn (gen_rtx_SET (tmp, gen_rtx_FLOAT_EXTEND (XFmode, op1)));
17957 op1 = tmp;
17958 break;
17959 case E_XFmode:
17960 break;
17961 default:
17962 gcc_unreachable ();
17963 }
17964
17965 switch (outmode)
17966 {
17967 case E_SFmode:
17968 floor_insn = gen_frndintxf2_floor;
17969 neg_insn = gen_negsf2;
17970 break;
17971 case E_DFmode:
17972 floor_insn = gen_frndintxf2_floor;
17973 neg_insn = gen_negdf2;
17974 break;
17975 case E_XFmode:
17976 floor_insn = gen_frndintxf2_floor;
17977 neg_insn = gen_negxf2;
17978 break;
17979 case E_HImode:
17980 floor_insn = gen_lfloorxfhi2;
17981 neg_insn = gen_neghi2;
17982 break;
17983 case E_SImode:
17984 floor_insn = gen_lfloorxfsi2;
17985 neg_insn = gen_negsi2;
17986 break;
17987 case E_DImode:
17988 floor_insn = gen_lfloorxfdi2;
17989 neg_insn = gen_negdi2;
17990 break;
17991 default:
17992 gcc_unreachable ();
17993 }
17994
17995 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
17996
17997 /* scratch = fxam(op1) */
17998 emit_insn (gen_fxamxf2_i387 (scratch, op1));
17999
18000 /* e1 = fabs(op1) */
18001 emit_insn (gen_absxf2 (e1, op1));
18002
18003 /* e2 = e1 + 0.5 */
18004 half = force_reg (XFmode, half);
18005 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (XFmode, e1, half)));
18006
18007 /* res = floor(e2) */
18008 switch (outmode)
18009 {
18010 case E_SFmode:
18011 case E_DFmode:
18012 {
18013 tmp = gen_reg_rtx (XFmode);
18014
18015 emit_insn (floor_insn (tmp, e2));
18016 emit_insn (gen_rtx_SET (res,
18017 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp),
18018 UNSPEC_TRUNC_NOOP)));
18019 }
18020 break;
18021 default:
18022 emit_insn (floor_insn (res, e2));
18023 }
18024
18025 /* flags = signbit(a) */
18026 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
18027
18028 /* if (flags) then res = -res */
18029 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
18030 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
18031 gen_rtx_LABEL_REF (VOIDmode, jump_label),
18032 pc_rtx);
18033 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
18034 predict_jump (REG_BR_PROB_BASE * 50 / 100);
18035 JUMP_LABEL (insn) = jump_label;
18036
18037 emit_insn (neg_insn (res, res));
18038
18039 emit_label (jump_label);
18040 LABEL_NUSES (jump_label) = 1;
18041
18042 emit_move_insn (op0, res);
18043 }
18044
18045 /* Output code to perform a Newton-Raphson approximation of a single precision
18046 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
18047
18048 void
18049 ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
18050 {
18051 rtx x0, x1, e0, e1;
18052
18053 x0 = gen_reg_rtx (mode);
18054 e0 = gen_reg_rtx (mode);
18055 e1 = gen_reg_rtx (mode);
18056 x1 = gen_reg_rtx (mode);
18057
18058 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
18059
18060 b = force_reg (mode, b);
18061
18062 /* x0 = rcp(b) estimate */
18063 if (mode == V16SFmode || mode == V8DFmode)
18064 {
18065 if (TARGET_AVX512ER)
18066 {
18067 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
18068 UNSPEC_RCP28)));
18069 /* res = a * x0 */
18070 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
18071 return;
18072 }
18073 else
18074 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
18075 UNSPEC_RCP14)));
18076 }
18077 else
18078 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
18079 UNSPEC_RCP)));
18080
18081 /* e0 = x0 * b */
18082 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
18083
18084 /* e0 = x0 * e0 */
18085 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
18086
18087 /* e1 = x0 + x0 */
18088 emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
18089
18090 /* x1 = e1 - e0 */
18091 emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
18092
18093 /* res = a * x1 */
18094 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
18095 }
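
/* A plain C sketch of the Newton-Raphson step above (illustration only):
   starting from the hardware reciprocal estimate x0 ~= 1/b, one iteration
   x1 = 2*x0 - b*x0*x0 = x0 * (2 - b*x0) roughly doubles the number of
   correct bits, and the quotient is formed as a * x1.  The exact 1.0f / b
   below merely stands in for the ~12-bit rcpps estimate.

     float ref_swdiv (float a, float b)
     {
       float x0 = 1.0f / b;           // rcp (b) estimate
       float e0 = x0 * b;             // e0 = x0 * b
       e0 = x0 * e0;                  // e0 = b * x0 * x0
       float x1 = (x0 + x0) - e0;     // x1 = 2*x0 - b*x0*x0
       return a * x1;                 // res = a * x1
     }
  */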
18096
18097 /* Output code to perform a Newton-Raphson approximation of a
18098 single precision floating point [reciprocal] square root. */
18099
18100 void
18101 ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
18102 {
18103 rtx x0, e0, e1, e2, e3, mthree, mhalf;
18104 REAL_VALUE_TYPE r;
18105 int unspec;
18106
18107 x0 = gen_reg_rtx (mode);
18108 e0 = gen_reg_rtx (mode);
18109 e1 = gen_reg_rtx (mode);
18110 e2 = gen_reg_rtx (mode);
18111 e3 = gen_reg_rtx (mode);
18112
18113 if (TARGET_AVX512ER && mode == V16SFmode)
18114 {
18115 if (recip)
18116 /* res = rsqrt28(a) estimate */
18117 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
18118 UNSPEC_RSQRT28)));
18119 else
18120 {
18121 /* x0 = rsqrt28(a) estimate */
18122 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
18123 UNSPEC_RSQRT28)));
18124 /* res = rcp28(x0) estimate */
18125 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
18126 UNSPEC_RCP28)));
18127 }
18128 return;
18129 }
18130
18131 real_from_integer (&r, VOIDmode, -3, SIGNED);
18132 mthree = const_double_from_real_value (r, SFmode);
18133
18134 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
18135 mhalf = const_double_from_real_value (r, SFmode);
18136 unspec = UNSPEC_RSQRT;
18137
18138 if (VECTOR_MODE_P (mode))
18139 {
18140 mthree = ix86_build_const_vector (mode, true, mthree);
18141 mhalf = ix86_build_const_vector (mode, true, mhalf);
18142 /* There is no 512-bit rsqrt. There is however rsqrt14. */
18143 if (GET_MODE_SIZE (mode) == 64)
18144 unspec = UNSPEC_RSQRT14;
18145 }
18146
18147 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
18148 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
18149
18150 a = force_reg (mode, a);
18151
18152 /* x0 = rsqrt(a) estimate */
18153 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
18154 unspec)));
18155
18156 /* If a == 0.0, filter out the infinite rsqrt estimate to prevent NaN for sqrt (0.0). */
18157 if (!recip)
18158 {
18159 rtx zero = force_reg (mode, CONST0_RTX(mode));
18160 rtx mask;
18161
18162 /* Handle masked compare. */
18163 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
18164 {
18165 mask = gen_reg_rtx (HImode);
18166 /* Imm value 0x4 corresponds to not-equal comparison. */
18167 emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
18168 emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
18169 }
18170 else
18171 {
18172 mask = gen_reg_rtx (mode);
18173 emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
18174 emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
18175 }
18176 }
18177
18178 mthree = force_reg (mode, mthree);
18179
18180 /* e0 = x0 * a */
18181 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
18182
18183 unsigned vector_size = GET_MODE_SIZE (mode);
18184 if (TARGET_FMA
18185 || (TARGET_AVX512F && vector_size == 64)
18186 || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
18187 emit_insn (gen_rtx_SET (e2,
18188 gen_rtx_FMA (mode, e0, x0, mthree)));
18189 else
18190 {
18191 /* e1 = e0 * x0 */
18192 emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
18193
18194 /* e2 = e1 - 3. */
18195 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
18196 }
18197
18198 mhalf = force_reg (mode, mhalf);
18199 if (recip)
18200 /* e3 = -.5 * x0 */
18201 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
18202 else
18203 /* e3 = -.5 * e0 */
18204 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
18205 /* ret = e2 * e3 */
18206 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
18207 }
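
/* A plain C sketch of the refinement above (illustration only, assuming
   sqrtf from <math.h> as a stand-in for the rsqrtps estimate): with
   x0 ~= 1/sqrt(a) and e2 = a*x0*x0 - 3, the refined results are
   rsqrt (a) ~= -0.5 * x0 * e2 and sqrt (a) ~= -0.5 * a * x0 * e2; for
   a == 0 the infinite estimate is masked to zero so the sqrt path does not
   produce 0 * inf = NaN.

     float ref_swsqrt (float a, int recip)
     {
       float x0 = 1.0f / sqrtf (a);        // rsqrt (a) estimate
       if (!recip && a == 0.0f)
         x0 = 0.0f;                        // the masked compare above
       float e0 = x0 * a;
       float e2 = e0 * x0 - 3.0f;          // fma (e0, x0, mthree)
       float e3 = -0.5f * (recip ? x0 : e0);
       return e2 * e3;
     }
  */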
18208
18209 /* Expand fabs (OP0) and return a new rtx that holds the result. The
18210 mask for masking out the sign-bit is stored in *SMASK, if that is
18211 non-null. */
18212
18213 static rtx
18214 ix86_expand_sse_fabs (rtx op0, rtx *smask)
18215 {
18216 machine_mode vmode, mode = GET_MODE (op0);
18217 rtx xa, mask;
18218
18219 xa = gen_reg_rtx (mode);
18220 if (mode == SFmode)
18221 vmode = V4SFmode;
18222 else if (mode == DFmode)
18223 vmode = V2DFmode;
18224 else
18225 vmode = mode;
18226 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
18227 if (!VECTOR_MODE_P (mode))
18228 {
18229 /* We need to generate a scalar mode mask in this case. */
18230 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
18231 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
18232 mask = gen_reg_rtx (mode);
18233 emit_insn (gen_rtx_SET (mask, tmp));
18234 }
18235 emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
18236
18237 if (smask)
18238 *smask = mask;
18239
18240 return xa;
18241 }
18242
18243 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
18244 swapping the operands if SWAP_OPERANDS is true. The expanded
18245 code is a forward jump to a newly created label in case the
18246 comparison is true. The generated label rtx is returned. */
18247 static rtx_code_label *
18248 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
18249 bool swap_operands)
18250 {
18251 bool unordered_compare = ix86_unordered_fp_compare (code);
18252 rtx_code_label *label;
18253 rtx tmp, reg;
18254
18255 if (swap_operands)
18256 std::swap (op0, op1);
18257
18258 label = gen_label_rtx ();
18259 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
18260 if (unordered_compare)
18261 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
18262 reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
18263 emit_insn (gen_rtx_SET (reg, tmp));
18264 tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
18265 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
18266 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
18267 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
18268 JUMP_LABEL (tmp) = label;
18269
18270 return label;
18271 }
18272
18273 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
18274 using comparison code CODE. Operands are swapped for the comparison if
18275 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
18276 static rtx
18277 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
18278 bool swap_operands)
18279 {
18280 rtx (*insn)(rtx, rtx, rtx, rtx);
18281 machine_mode mode = GET_MODE (op0);
18282 rtx mask = gen_reg_rtx (mode);
18283
18284 if (swap_operands)
18285 std::swap (op0, op1);
18286
18287 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
18288
18289 emit_insn (insn (mask, op0, op1,
18290 gen_rtx_fmt_ee (code, mode, op0, op1)));
18291 return mask;
18292 }
18293
18294 /* Expand copysign from SIGN to the positive value ABS_VALUE
18295 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
18296 the sign-bit. */
18297
18298 static void
18299 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
18300 {
18301 machine_mode mode = GET_MODE (sign);
18302 rtx sgn = gen_reg_rtx (mode);
18303 if (mask == NULL_RTX)
18304 {
18305 machine_mode vmode;
18306
18307 if (mode == SFmode)
18308 vmode = V4SFmode;
18309 else if (mode == DFmode)
18310 vmode = V2DFmode;
18311 else
18312 vmode = mode;
18313
18314 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
18315 if (!VECTOR_MODE_P (mode))
18316 {
18317 /* We need to generate a scalar mode mask in this case. */
18318 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
18319 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
18320 mask = gen_reg_rtx (mode);
18321 emit_insn (gen_rtx_SET (mask, tmp));
18322 }
18323 }
18324 else
18325 mask = gen_rtx_NOT (mode, mask);
18326 emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
18327 emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
18328 }
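
/* In bit terms (illustration only, assuming IEEE double layout and
   uint64_t/memcpy from <stdint.h>/<string.h>): the helper above computes
   RESULT = ABS_VALUE | (SIGN & SIGNMASK), i.e. it isolates the sign bit of
   SIGN and ORs it onto the already non-negative ABS_VALUE.

     double ref_copysign_to_positive (double abs_value, double sign)
     {
       uint64_t a, s;
       memcpy (&a, &abs_value, sizeof a);
       memcpy (&s, &sign, sizeof s);
       a |= s & 0x8000000000000000ull;     // OR in the sign bit of SIGN
       memcpy (&abs_value, &a, sizeof a);
       return abs_value;
     }
  */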
18329
18330 /* Expand SSE sequence for computing lround from OP1 storing
18331 into OP0. */
18332
18333 void
18334 ix86_expand_lround (rtx op0, rtx op1)
18335 {
18336 /* C code for the stuff we're doing below:
18337 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
18338 return (long)tmp;
18339 */
18340 machine_mode mode = GET_MODE (op1);
18341 const struct real_format *fmt;
18342 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
18343 rtx adj;
18344
18345 /* load nextafter (0.5, 0.0) */
18346 fmt = REAL_MODE_FORMAT (mode);
18347 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
18348 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
18349
18350 /* adj = copysign (0.5, op1) */
18351 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
18352 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
18353
18354 /* adj = op1 + adj */
18355 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
18356
18357 /* op0 = (imode)adj */
18358 expand_fix (op0, adj, 0);
18359 }
18360
18361 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1
18362 storing into OPERAND0. */
18363
18364 void
18365 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
18366 {
18367 /* C code for the stuff we're doing below (for do_floor):
18368 xi = (long)op1;
18369 xi -= (double)xi > op1 ? 1 : 0;
18370 return xi;
18371 */
18372 machine_mode fmode = GET_MODE (op1);
18373 machine_mode imode = GET_MODE (op0);
18374 rtx ireg, freg, tmp;
18375 rtx_code_label *label;
18376
18377 /* reg = (long)op1 */
18378 ireg = gen_reg_rtx (imode);
18379 expand_fix (ireg, op1, 0);
18380
18381 /* freg = (double)reg */
18382 freg = gen_reg_rtx (fmode);
18383 expand_float (freg, ireg, 0);
18384
18385 /* ireg = (freg > op1) ? ireg - 1 : ireg */
18386 label = ix86_expand_sse_compare_and_jump (UNLE,
18387 freg, op1, !do_floor);
18388 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
18389 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
18390 emit_move_insn (ireg, tmp);
18391
18392 emit_label (label);
18393 LABEL_NUSES (label) = 1;
18394
18395 emit_move_insn (op0, ireg);
18396 }
18397
18398 /* Generate and return a rtx of mode MODE for 2**n where n is the number
18399 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
18400
18401 static rtx
18402 ix86_gen_TWO52 (machine_mode mode)
18403 {
18404 const struct real_format *fmt;
18405 REAL_VALUE_TYPE TWO52r;
18406 rtx TWO52;
18407
18408 fmt = REAL_MODE_FORMAT (mode);
18409 real_2expN (&TWO52r, fmt->p - 1, mode);
18410 TWO52 = const_double_from_real_value (TWO52r, mode);
18411 TWO52 = force_reg (mode, TWO52);
18412
18413 return TWO52;
18414 }
18415
18416 /* Expand rint rounding OPERAND1 and storing the result in OPERAND0. */
18417
18418 void
18419 ix86_expand_rint (rtx operand0, rtx operand1)
18420 {
18421 /* C code for the stuff we're doing below:
18422 xa = fabs (operand1);
18423 if (!isless (xa, 2**52))
18424 return operand1;
18425 two52 = 2**52;
18426 if (flag_rounding_math)
18427 {
18428 two52 = copysign (two52, operand1);
18429 xa = operand1;
18430 }
18431 xa = xa + two52 - two52;
18432 return copysign (xa, operand1);
18433 */
18434 machine_mode mode = GET_MODE (operand0);
18435 rtx res, xa, TWO52, mask;
18436 rtx_code_label *label;
18437
18438 TWO52 = ix86_gen_TWO52 (mode);
18439
18440 /* Temporary for holding the result, initialized to the input
18441 operand to ease control flow. */
18442 res = copy_to_reg (operand1);
18443
18444 /* xa = abs (operand1) */
18445 xa = ix86_expand_sse_fabs (res, &mask);
18446
18447 /* if (!isless (xa, TWO52)) goto label; */
18448 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18449
18450 if (flag_rounding_math)
18451 {
18452 ix86_sse_copysign_to_positive (TWO52, TWO52, res, mask);
18453 xa = res;
18454 }
18455
18456 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
18457 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
18458
18459 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
18460 if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
18461 xa = ix86_expand_sse_fabs (xa, NULL);
18462
18463 ix86_sse_copysign_to_positive (res, xa, res, mask);
18464
18465 emit_label (label);
18466 LABEL_NUSES (label) = 1;
18467
18468 emit_move_insn (operand0, res);
18469 }
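
/* A plain C sketch of the 2**52 trick used here and in the helpers below
   (illustration only, assuming fabs/copysign from <math.h> and the default
   round-to-nearest mode): for 0 <= xa < 2**52 the sum xa + 2**52 has no
   fractional bits left, so the addition rounds xa to an integer in the
   current rounding mode and the subtraction recovers that integer; larger
   inputs (and NaNs) are already handled by the comparison and returned
   unchanged.

     double ref_rint (double x)
     {
       const double two52 = 4503599627370496.0;   // 2**52
       double xa = fabs (x);
       if (!(xa < two52))
         return x;                                // already integral, or NaN
       double r = (xa + two52) - two52;           // rounds xa to an integer
       return copysign (r, x);
     }
  */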
18470
18471 /* Expand SSE2 sequence for computing floor or ceil
18472 from OPERAND1 storing into OPERAND0. */
18473 void
18474 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
18475 {
18476 /* C code for the stuff we expand below.
18477 double xa = fabs (x), x2;
18478 if (!isless (xa, TWO52))
18479 return x;
18480 x2 = (double)(long)x;
18481
18482 Compensate. Floor:
18483 if (x2 > x)
18484 x2 -= 1;
18485 Compensate. Ceil:
18486 if (x2 < x)
18487 x2 += 1;
18488
18489 if (HONOR_SIGNED_ZEROS (mode))
18490 return copysign (x2, x);
18491 return x2;
18492 */
18493 machine_mode mode = GET_MODE (operand0);
18494 rtx xa, xi, TWO52, tmp, one, res, mask;
18495 rtx_code_label *label;
18496
18497 TWO52 = ix86_gen_TWO52 (mode);
18498
18499 /* Temporary for holding the result, initialized to the input
18500 operand to ease control flow. */
18501 res = copy_to_reg (operand1);
18502
18503 /* xa = abs (operand1) */
18504 xa = ix86_expand_sse_fabs (res, &mask);
18505
18506 /* if (!isless (xa, TWO52)) goto label; */
18507 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18508
18509 /* xa = (double)(long)x */
18510 xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
18511 expand_fix (xi, res, 0);
18512 expand_float (xa, xi, 0);
18513
18514 /* generate 1.0 */
18515 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
18516
18517 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
18518 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
18519 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
18520 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
18521 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
18522 if (HONOR_SIGNED_ZEROS (mode))
18523 {
18524 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
18525 if (do_floor && flag_rounding_math)
18526 tmp = ix86_expand_sse_fabs (tmp, NULL);
18527
18528 ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
18529 }
18530 emit_move_insn (res, tmp);
18531
18532 emit_label (label);
18533 LABEL_NUSES (label) = 1;
18534
18535 emit_move_insn (operand0, res);
18536 }
18537
18538 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
18539 into OPERAND0 without relying on DImode truncation via cvttsd2siq
18540 that is only available on 64bit targets. */
18541 void
18542 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
18543 {
18544 /* C code for the stuff we expand below.
18545 double xa = fabs (x), x2;
18546 if (!isless (xa, TWO52))
18547 return x;
18548 xa = xa + TWO52 - TWO52;
18549 x2 = copysign (xa, x);
18550
18551 Compensate. Floor:
18552 if (x2 > x)
18553 x2 -= 1;
18554 Compensate. Ceil:
18555 if (x2 < x)
18556 x2 += 1;
18557
18558 if (HONOR_SIGNED_ZEROS (mode))
18559 x2 = copysign (x2, x);
18560 return x2;
18561 */
18562 machine_mode mode = GET_MODE (operand0);
18563 rtx xa, TWO52, tmp, one, res, mask;
18564 rtx_code_label *label;
18565
18566 TWO52 = ix86_gen_TWO52 (mode);
18567
18568 /* Temporary for holding the result, initialized to the input
18569 operand to ease control flow. */
18570 res = copy_to_reg (operand1);
18571
18572 /* xa = abs (operand1) */
18573 xa = ix86_expand_sse_fabs (res, &mask);
18574
18575 /* if (!isless (xa, TWO52)) goto label; */
18576 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18577
18578 /* xa = xa + TWO52 - TWO52; */
18579 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
18580 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
18581
18582 /* xa = copysign (xa, operand1) */
18583 ix86_sse_copysign_to_positive (xa, xa, res, mask);
18584
18585 /* generate 1.0 */
18586 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
18587
18588 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
18589 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
18590 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
18591 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
18592 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
18593 if (HONOR_SIGNED_ZEROS (mode))
18594 {
18595 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
18596 if (do_floor && flag_rounding_math)
18597 tmp = ix86_expand_sse_fabs (tmp, NULL);
18598
18599 ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
18600 }
18601 emit_move_insn (res, tmp);
18602
18603 emit_label (label);
18604 LABEL_NUSES (label) = 1;
18605
18606 emit_move_insn (operand0, res);
18607 }
18608
18609 /* Expand SSE sequence for computing trunc
18610 from OPERAND1 storing into OPERAND0. */
18611 void
18612 ix86_expand_trunc (rtx operand0, rtx operand1)
18613 {
18614 /* C code for SSE variant we expand below.
18615 double xa = fabs (x), x2;
18616 if (!isless (xa, TWO52))
18617 return x;
18618 x2 = (double)(long)x;
18619 if (HONOR_SIGNED_ZEROS (mode))
18620 return copysign (x2, x);
18621 return x2;
18622 */
18623 machine_mode mode = GET_MODE (operand0);
18624 rtx xa, xi, TWO52, res, mask;
18625 rtx_code_label *label;
18626
18627 TWO52 = ix86_gen_TWO52 (mode);
18628
18629 /* Temporary for holding the result, initialized to the input
18630 operand to ease control flow. */
18631 res = copy_to_reg (operand1);
18632
18633 /* xa = abs (operand1) */
18634 xa = ix86_expand_sse_fabs (res, &mask);
18635
18636 /* if (!isless (xa, TWO52)) goto label; */
18637 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18638
18639 /* xa = (double)(long)x */
18640 xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
18641 expand_fix (xi, res, 0);
18642 expand_float (xa, xi, 0);
18643
18644 if (HONOR_SIGNED_ZEROS (mode))
18645 ix86_sse_copysign_to_positive (xa, xa, res, mask);
18646
18647 emit_move_insn (res, xa);
18648
18649 emit_label (label);
18650 LABEL_NUSES (label) = 1;
18651
18652 emit_move_insn (operand0, res);
18653 }
18654
18655 /* Expand SSE sequence for computing trunc from OPERAND1 storing
18656 into OPERAND0 without relying on DImode truncation via cvttsd2siq
18657 that is only available on 64bit targets. */
18658 void
18659 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
18660 {
18661 machine_mode mode = GET_MODE (operand0);
18662 rtx xa, xa2, TWO52, tmp, one, res, mask;
18663 rtx_code_label *label;
18664
18665 /* C code for SSE variant we expand below.
18666 double xa = fabs (x), x2;
18667 if (!isless (xa, TWO52))
18668 return x;
18669 xa2 = xa + TWO52 - TWO52;
18670 Compensate:
18671 if (xa2 > xa)
18672 xa2 -= 1.0;
18673 x2 = copysign (xa2, x);
18674 return x2;
18675 */
18676
18677 TWO52 = ix86_gen_TWO52 (mode);
18678
18679 /* Temporary for holding the result, initialized to the input
18680 operand to ease control flow. */
18681 res = copy_to_reg (operand1);
18682
18683 /* xa = abs (operand1) */
18684 xa = ix86_expand_sse_fabs (res, &mask);
18685
18686 /* if (!isless (xa, TWO52)) goto label; */
18687 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18688
18689 /* xa2 = xa + TWO52 - TWO52; */
18690 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
18691 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
18692
18693 /* generate 1.0 */
18694 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
18695
18696 /* Compensate: xa2 = xa2 - (xa2 > xa ? 1 : 0) */
18697 tmp = ix86_expand_sse_compare_mask (UNGT, xa2, xa, false);
18698 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
18699 tmp = expand_simple_binop (mode, MINUS,
18700 xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
18701 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
18702 if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
18703 tmp = ix86_expand_sse_fabs (tmp, NULL);
18704
18705 /* res = copysign (xa2, operand1) */
18706 ix86_sse_copysign_to_positive (res, tmp, res, mask);
18707
18708 emit_label (label);
18709 LABEL_NUSES (label) = 1;
18710
18711 emit_move_insn (operand0, res);
18712 }
18713
18714 /* Expand SSE sequence for computing round
18715 from OPERAND1 storing into OPERAND0. */
18716 void
18717 ix86_expand_round (rtx operand0, rtx operand1)
18718 {
18719 /* C code for the stuff we're doing below:
18720 double xa = fabs (x);
18721 if (!isless (xa, TWO52))
18722 return x;
18723 xa = (double)(long)(xa + nextafter (0.5, 0.0));
18724 return copysign (xa, x);
18725 */
18726 machine_mode mode = GET_MODE (operand0);
18727 rtx res, TWO52, xa, xi, half, mask;
18728 rtx_code_label *label;
18729 const struct real_format *fmt;
18730 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
18731
18732 /* Temporary for holding the result, initialized to the input
18733 operand to ease control flow. */
18734 res = copy_to_reg (operand1);
18735
18736 TWO52 = ix86_gen_TWO52 (mode);
18737 xa = ix86_expand_sse_fabs (res, &mask);
18738 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18739
18740 /* load nextafter (0.5, 0.0) */
18741 fmt = REAL_MODE_FORMAT (mode);
18742 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
18743 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
18744
18745 /* xa = xa + 0.5 */
18746 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
18747 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
18748
18749 /* xa = (double)(int64_t)xa */
18750 xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
18751 expand_fix (xi, xa, 0);
18752 expand_float (xa, xi, 0);
18753
18754 /* res = copysign (xa, operand1) */
18755 ix86_sse_copysign_to_positive (res, xa, res, mask);
18756
18757 emit_label (label);
18758 LABEL_NUSES (label) = 1;
18759
18760 emit_move_insn (operand0, res);
18761 }
18762
18763 /* Expand SSE sequence for computing round from OPERAND1 storing
18764 into OPERAND0 without relying on DImode truncation via cvttsd2siq
18765 that is only available on 64bit targets. */
18766 void
18767 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
18768 {
18769 /* C code for the stuff we expand below.
18770 double xa = fabs (x), xa2, x2;
18771 if (!isless (xa, TWO52))
18772 return x;
18773 Using the absolute value and copying back sign makes
18774 -0.0 -> -0.0 correct.
18775 xa2 = xa + TWO52 - TWO52;
18776 Compensate.
18777 dxa = xa2 - xa;
18778 if (dxa <= -0.5)
18779 xa2 += 1;
18780 else if (dxa > 0.5)
18781 xa2 -= 1;
18782 x2 = copysign (xa2, x);
18783 return x2;
18784 */
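/* A worked example of the compensation above, assuming the default
   round-to-nearest-even mode (illustrative only):

     x = 2.5:  xa = 2.5
               xa2 = 2.5 + TWO52 - TWO52 = 2.0   (tie rounds to even)
               dxa = xa2 - xa = -0.5
               dxa <= -0.5, so xa2 += 1.0  ->  3.0
               x2 = copysign (3.0, 2.5) = 3.0

   which is the expected round-half-away-from-zero result; the
   dxa > 0.5 branch compensates in the same way when the addition
   rounded the value up by more than half.  */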
18785 machine_mode mode = GET_MODE (operand0);
18786 rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
18787 rtx_code_label *label;
18788
18789 TWO52 = ix86_gen_TWO52 (mode);
18790
18791 /* Temporary for holding the result, initialized to the input
18792 operand to ease control flow. */
18793 res = copy_to_reg (operand1);
18794
18795 /* xa = abs (operand1) */
18796 xa = ix86_expand_sse_fabs (res, &mask);
18797
18798 /* if (!isless (xa, TWO52)) goto label; */
18799 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18800
18801 /* xa2 = xa + TWO52 - TWO52; */
18802 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
18803 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
18804
18805 /* dxa = xa2 - xa; */
18806 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
18807
18808 /* generate 0.5, 1.0 and -0.5 */
18809 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
18810 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
18811 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
18812 0, OPTAB_DIRECT);
18813
18814 /* Compensate. */
18815 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
18816 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
18817 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
18818 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
18819 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
18820 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
18821 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
18822 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
18823
18824 /* res = copysign (xa2, operand1) */
18825 ix86_sse_copysign_to_positive (res, xa2, res, mask);
18826
18827 emit_label (label);
18828 LABEL_NUSES (label) = 1;
18829
18830 emit_move_insn (operand0, res);
18831 }
18832
18833 /* Expand SSE sequence for computing round
18834 from OP1 storing into OP0 using sse4 round insn. */
18835 void
18836 ix86_expand_round_sse4 (rtx op0, rtx op1)
18837 {
18838 machine_mode mode = GET_MODE (op0);
18839 rtx e1, e2, res, half;
18840 const struct real_format *fmt;
18841 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
18842 rtx (*gen_copysign) (rtx, rtx, rtx);
18843 rtx (*gen_round) (rtx, rtx, rtx);
18844
18845 switch (mode)
18846 {
18847 case E_SFmode:
18848 gen_copysign = gen_copysignsf3;
18849 gen_round = gen_sse4_1_roundsf2;
18850 break;
18851 case E_DFmode:
18852 gen_copysign = gen_copysigndf3;
18853 gen_round = gen_sse4_1_rounddf2;
18854 break;
18855 default:
18856 gcc_unreachable ();
18857 }
18858
18859 /* round (a) = trunc (a + copysign (0.5, a)) */
18860
18861 /* load nextafter (0.5, 0.0) */
18862 fmt = REAL_MODE_FORMAT (mode);
18863 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
18864 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
18865 half = const_double_from_real_value (pred_half, mode);
18866
18867 /* e1 = copysign (0.5, op1) */
18868 e1 = gen_reg_rtx (mode);
18869 emit_insn (gen_copysign (e1, half, op1));
18870
18871 /* e2 = op1 + e1 */
18872 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
18873
18874 /* res = trunc (e2) */
18875 res = gen_reg_rtx (mode);
18876 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
18877
18878 emit_move_insn (op0, res);
18879 }
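/* A worked example of the identity used above, assuming IEEE double
   (illustrative only):

     op1 = -2.5
     e1  = copysign (pred_half, -2.5) = -0.49999999999999994
     e2  = -2.5 + e1                  = -3.0   (rounded to nearest)
     res = trunc (e2)                 = -3.0

   i.e. round (-2.5) = -3 with halves rounded away from zero, while a
   value just below 2.5 in magnitude still truncates back to 2.0.  */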
18880
18881 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
18882 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
18883 insn every time. */
18884
18885 static GTY(()) rtx_insn *vselect_insn;
18886
18887 /* Initialize vselect_insn. */
18888
18889 static void
18890 init_vselect_insn (void)
18891 {
18892 unsigned i;
18893 rtx x;
18894
18895 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
18896 for (i = 0; i < MAX_VECT_LEN; ++i)
18897 XVECEXP (x, 0, i) = const0_rtx;
18898 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
18899 const0_rtx), x);
18900 x = gen_rtx_SET (const0_rtx, x);
18901 start_sequence ();
18902 vselect_insn = emit_insn (x);
18903 end_sequence ();
18904 }
18905
18906 /* Construct (set target (vec_select op0 (parallel perm))) and
18907 return true if that's a valid instruction in the active ISA. */
18908
18909 static bool
18910 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
18911 unsigned nelt, bool testing_p)
18912 {
18913 unsigned int i;
18914 rtx x, save_vconcat;
18915 int icode;
18916
18917 if (vselect_insn == NULL_RTX)
18918 init_vselect_insn ();
18919
18920 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
18921 PUT_NUM_ELEM (XVEC (x, 0), nelt);
18922 for (i = 0; i < nelt; ++i)
18923 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
18924 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
18925 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
18926 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
18927 SET_DEST (PATTERN (vselect_insn)) = target;
18928 icode = recog_memoized (vselect_insn);
18929
18930 if (icode >= 0 && !testing_p)
18931 emit_insn (copy_rtx (PATTERN (vselect_insn)));
18932
18933 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
18934 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
18935 INSN_CODE (vselect_insn) = -1;
18936
18937 return icode >= 0;
18938 }
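/* For illustration, a call such as

     unsigned char perm[4] = { 2, 3, 0, 1 };
     expand_vselect (target, op0, perm, 4, false);

   with a V4SFmode TARGET and OP0 rewrites the cached insn into

     (set (reg:V4SF target)
          (vec_select:V4SF (reg:V4SF op0)
                           (parallel [(const_int 2) (const_int 3)
                                      (const_int 0) (const_int 1)])))

   and succeeds only if recog finds an enabled pattern for it.  */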
18939
18940 /* Similar, but generate a vec_concat from op0 and op1 as well. */
18941
18942 static bool
18943 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
18944 const unsigned char *perm, unsigned nelt,
18945 bool testing_p)
18946 {
18947 machine_mode v2mode;
18948 rtx x;
18949 bool ok;
18950
18951 if (vselect_insn == NULL_RTX)
18952 init_vselect_insn ();
18953
18954 if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
18955 return false;
18956 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
18957 PUT_MODE (x, v2mode);
18958 XEXP (x, 0) = op0;
18959 XEXP (x, 1) = op1;
18960 ok = expand_vselect (target, x, perm, nelt, testing_p);
18961 XEXP (x, 0) = const0_rtx;
18962 XEXP (x, 1) = const0_rtx;
18963 return ok;
18964 }
18965
18966 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
18967 using movss or movsd. */
18968 static bool
18969 expand_vec_perm_movs (struct expand_vec_perm_d *d)
18970 {
18971 machine_mode vmode = d->vmode;
18972 unsigned i, nelt = d->nelt;
18973 rtx x;
18974
18975 if (d->one_operand_p)
18976 return false;
18977
18978 if (!(TARGET_SSE && (vmode == V4SFmode || vmode == V4SImode))
18979 && !(TARGET_MMX_WITH_SSE && (vmode == V2SFmode || vmode == V2SImode))
18980 && !(TARGET_SSE2 && (vmode == V2DFmode || vmode == V2DImode)))
18981 return false;
18982
18983 /* Only the first element is changed. */
18984 if (d->perm[0] != nelt && d->perm[0] != 0)
18985 return false;
18986 for (i = 1; i < nelt; ++i)
18987 if (d->perm[i] != i + nelt - d->perm[0])
18988 return false;
18989
18990 if (d->testing_p)
18991 return true;
18992
18993 if (d->perm[0] == nelt)
18994 x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
18995 else
18996 x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));
18997
18998 emit_insn (gen_rtx_SET (d->target, x));
18999
19000 return true;
19001 }
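/* For illustration: with V4SFmode operands and the permutation
   { 4, 1, 2, 3 } only element 0 changes and it comes from op1, so the
   routine emits

     (set (reg:V4SF target)
          (vec_merge:V4SF (reg:V4SF op1) (reg:V4SF op0) (const_int 1)))

   which is the movss-style merge: the low element from op1, the rest
   from op0.  */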
19002
19003 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
19004 using insertps. */
19005 static bool
19006 expand_vec_perm_insertps (struct expand_vec_perm_d *d)
19007 {
19008 machine_mode vmode = d->vmode;
19009 unsigned i, cnt_s, nelt = d->nelt;
19010 int cnt_d = -1;
19011 rtx src, dst;
19012
19013 if (d->one_operand_p)
19014 return false;
19015
19016 if (!(TARGET_SSE4_1
19017 && (vmode == V4SFmode || vmode == V4SImode
19018 || (TARGET_MMX_WITH_SSE
19019 && (vmode == V2SFmode || vmode == V2SImode)))))
19020 return false;
19021
19022 for (i = 0; i < nelt; ++i)
19023 {
19024 if (d->perm[i] == i)
19025 continue;
19026 if (cnt_d != -1)
19027 {
19028 cnt_d = -1;
19029 break;
19030 }
19031 cnt_d = i;
19032 }
19033
19034 if (cnt_d == -1)
19035 {
19036 for (i = 0; i < nelt; ++i)
19037 {
19038 if (d->perm[i] == i + nelt)
19039 continue;
19040 if (cnt_d != -1)
19041 return false;
19042 cnt_d = i;
19043 }
19044
19045 if (cnt_d == -1)
19046 return false;
19047 }
19048
19049 if (d->testing_p)
19050 return true;
19051
19052 gcc_assert (cnt_d != -1);
19053
19054 cnt_s = d->perm[cnt_d];
19055 if (cnt_s < nelt)
19056 {
19057 src = d->op0;
19058 dst = d->op1;
19059 }
19060 else
19061 {
19062 cnt_s -= nelt;
19063 src = d->op1;
19064 dst = d->op0;
19065 }
19066 gcc_assert (cnt_s < nelt);
19067
19068 rtx x = gen_sse4_1_insertps (vmode, d->target, dst, src,
19069 GEN_INT (cnt_s << 6 | cnt_d << 4));
19070 emit_insn (x);
19071
19072 return true;
19073 }
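/* For illustration: with V4SFmode operands and the permutation
   { 0, 1, 6, 3 } only element 2 differs and is taken from element 2 of
   op1, so cnt_d = 2, cnt_s = 2 and the emitted insertps immediate is

     cnt_s << 6 | cnt_d << 4  =  0xa0

   i.e. copy source element 2 into destination position 2.  */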
19074
19075 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
19076 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
19077
19078 static bool
19079 expand_vec_perm_blend (struct expand_vec_perm_d *d)
19080 {
19081 machine_mode mmode, vmode = d->vmode;
19082 unsigned i, nelt = d->nelt;
19083 unsigned HOST_WIDE_INT mask;
19084 rtx target, op0, op1, maskop, x;
19085 rtx rperm[32], vperm;
19086
19087 if (d->one_operand_p)
19088 return false;
19089 if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
19090 && (TARGET_AVX512BW
19091 || GET_MODE_UNIT_SIZE (vmode) >= 4))
19092 ;
19093 else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
19094 ;
19095 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
19096 ;
19097 else if (TARGET_SSE4_1
19098 && (GET_MODE_SIZE (vmode) == 16
19099 || (TARGET_MMX_WITH_SSE && GET_MODE_SIZE (vmode) == 8)
19100 || GET_MODE_SIZE (vmode) == 4))
19101 ;
19102 else
19103 return false;
19104
19105 /* This is a blend, not a permute. Elements must stay in their
19106 respective lanes. */
19107 for (i = 0; i < nelt; ++i)
19108 {
19109 unsigned e = d->perm[i];
19110 if (!(e == i || e == i + nelt))
19111 return false;
19112 }
19113
19114 if (d->testing_p)
19115 return true;
19116
19117 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
19118 decision should be extracted elsewhere, so that we only try that
19119 sequence once all budget==3 options have been tried. */
19120 target = d->target;
19121 op0 = d->op0;
19122 op1 = d->op1;
19123 mask = 0;
19124
19125 switch (vmode)
19126 {
19127 case E_V8DFmode:
19128 case E_V16SFmode:
19129 case E_V4DFmode:
19130 case E_V8SFmode:
19131 case E_V2DFmode:
19132 case E_V4SFmode:
19133 case E_V2SFmode:
19134 case E_V2HImode:
19135 case E_V4HImode:
19136 case E_V8HImode:
19137 case E_V8SImode:
19138 case E_V32HImode:
19139 case E_V64QImode:
19140 case E_V16SImode:
19141 case E_V8DImode:
19142 for (i = 0; i < nelt; ++i)
19143 mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
19144 break;
19145
19146 case E_V2DImode:
19147 for (i = 0; i < 2; ++i)
19148 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
19149 vmode = V8HImode;
19150 goto do_subreg;
19151
19152 case E_V2SImode:
19153 for (i = 0; i < 2; ++i)
19154 mask |= (d->perm[i] >= 2 ? 3 : 0) << (i * 2);
19155 vmode = V4HImode;
19156 goto do_subreg;
19157
19158 case E_V4SImode:
19159 if (TARGET_AVX2)
19160 {
19161 /* Use vpblendd instead of vpblendw. */
19162 for (i = 0; i < nelt; ++i)
19163 mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
19164 break;
19165 }
19166 else
19167 {
19168 for (i = 0; i < 4; ++i)
19169 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
19170 vmode = V8HImode;
19171 goto do_subreg;
19172 }
19173
19174 case E_V16QImode:
19175 /* See if bytes move in pairs so we can use pblendw with
19176 an immediate argument, rather than pblendvb with a vector
19177 argument. */
19178 for (i = 0; i < 16; i += 2)
19179 if (d->perm[i] + 1 != d->perm[i + 1])
19180 {
19181 use_pblendvb:
19182 for (i = 0; i < nelt; ++i)
19183 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
19184
19185 finish_pblendvb:
19186 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
19187 vperm = force_reg (vmode, vperm);
19188
19189 if (GET_MODE_SIZE (vmode) == 4)
19190 emit_insn (gen_mmx_pblendvb_v4qi (target, op0, op1, vperm));
19191 else if (GET_MODE_SIZE (vmode) == 8)
19192 emit_insn (gen_mmx_pblendvb_v8qi (target, op0, op1, vperm));
19193 else if (GET_MODE_SIZE (vmode) == 16)
19194 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
19195 else
19196 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
19197 if (target != d->target)
19198 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
19199 return true;
19200 }
19201
19202 for (i = 0; i < 8; ++i)
19203 mask |= (d->perm[i * 2] >= 16) << i;
19204 vmode = V8HImode;
19205 /* FALLTHRU */
19206
19207 do_subreg:
19208 target = gen_reg_rtx (vmode);
19209 op0 = gen_lowpart (vmode, op0);
19210 op1 = gen_lowpart (vmode, op1);
19211 break;
19212
19213 case E_V8QImode:
19214 for (i = 0; i < 8; i += 2)
19215 if (d->perm[i] + 1 != d->perm[i + 1])
19216 goto use_pblendvb;
19217
19218 for (i = 0; i < 4; ++i)
19219 mask |= (d->perm[i * 2] >= 8) << i;
19220 vmode = V4HImode;
19221 goto do_subreg;
19222
19223 case E_V4QImode:
19224 for (i = 0; i < 4; i += 2)
19225 if (d->perm[i] + 1 != d->perm[i + 1])
19226 goto use_pblendvb;
19227
19228 for (i = 0; i < 2; ++i)
19229 mask |= (d->perm[i * 2] >= 4) << i;
19230 vmode = V2HImode;
19231 goto do_subreg;
19232
19233 case E_V32QImode:
19234 /* See if bytes move in pairs. If not, vpblendvb must be used. */
19235 for (i = 0; i < 32; i += 2)
19236 if (d->perm[i] + 1 != d->perm[i + 1])
19237 goto use_pblendvb;
19238 /* See if bytes move in quadruplets. If yes, vpblendd
19239 with immediate can be used. */
19240 for (i = 0; i < 32; i += 4)
19241 if (d->perm[i] + 2 != d->perm[i + 2])
19242 break;
19243 if (i < 32)
19244 {
19245 /* See if bytes move the same in both lanes. If yes,
19246 vpblendw with immediate can be used. */
19247 for (i = 0; i < 16; i += 2)
19248 if (d->perm[i] + 16 != d->perm[i + 16])
19249 goto use_pblendvb;
19250
19251 /* Use vpblendw. */
19252 for (i = 0; i < 16; ++i)
19253 mask |= (d->perm[i * 2] >= 32) << i;
19254 vmode = V16HImode;
19255 goto do_subreg;
19256 }
19257
19258 /* Use vpblendd. */
19259 for (i = 0; i < 8; ++i)
19260 mask |= (d->perm[i * 4] >= 32) << i;
19261 vmode = V8SImode;
19262 goto do_subreg;
19263
19264 case E_V16HImode:
19265 /* See if words move in pairs. If yes, vpblendd can be used. */
19266 for (i = 0; i < 16; i += 2)
19267 if (d->perm[i] + 1 != d->perm[i + 1])
19268 break;
19269 if (i < 16)
19270 {
19271 /* See if words move the same in both lanes. If not,
19272 vpblendvb must be used. */
19273 for (i = 0; i < 8; i++)
19274 if (d->perm[i] + 8 != d->perm[i + 8])
19275 {
19276 /* Use vpblendvb. */
19277 for (i = 0; i < 32; ++i)
19278 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
19279
19280 vmode = V32QImode;
19281 nelt = 32;
19282 target = gen_reg_rtx (vmode);
19283 op0 = gen_lowpart (vmode, op0);
19284 op1 = gen_lowpart (vmode, op1);
19285 goto finish_pblendvb;
19286 }
19287
19288 /* Use vpblendw. */
19289 for (i = 0; i < 16; ++i)
19290 mask |= (d->perm[i] >= 16) << i;
19291 break;
19292 }
19293
19294 /* Use vpblendd. */
19295 for (i = 0; i < 8; ++i)
19296 mask |= (d->perm[i * 2] >= 16) << i;
19297 vmode = V8SImode;
19298 goto do_subreg;
19299
19300 case E_V4DImode:
19301 /* Use vpblendd. */
19302 for (i = 0; i < 4; ++i)
19303 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
19304 vmode = V8SImode;
19305 goto do_subreg;
19306
19307 default:
19308 gcc_unreachable ();
19309 }
19310
19311 switch (vmode)
19312 {
19313 case E_V8DFmode:
19314 case E_V8DImode:
19315 mmode = QImode;
19316 break;
19317 case E_V16SFmode:
19318 case E_V16SImode:
19319 mmode = HImode;
19320 break;
19321 case E_V32HImode:
19322 mmode = SImode;
19323 break;
19324 case E_V64QImode:
19325 mmode = DImode;
19326 break;
19327 default:
19328 mmode = VOIDmode;
19329 }
19330
19331 if (mmode != VOIDmode)
19332 maskop = force_reg (mmode, gen_int_mode (mask, mmode));
19333 else
19334 maskop = GEN_INT (mask);
19335
19336 /* This matches five different patterns with the different modes. */
19337 x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
19338 x = gen_rtx_SET (target, x);
19339 emit_insn (x);
19340 if (target != d->target)
19341 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
19342
19343 return true;
19344 }
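/* For illustration: with V8HImode operands and the permutation
   { 0, 9, 2, 11, 4, 13, 6, 15 } every element stays in its lane and
   the odd elements come from op1, so the mask built above is

     mask = 0b10101010 = 0xaa

   and the emitted vec_merge corresponds to pblendw with that
   immediate: odd words from op1, even words from op0.  */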
19345
19346 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
19347 in terms of the variable form of vpermilps.
19348
19349 Note that we will have already failed the immediate input vpermilps,
19350 which requires that the high and low part shuffle be identical; the
19351 variable form doesn't require that. */
19352
19353 static bool
19354 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
19355 {
19356 rtx rperm[8], vperm;
19357 unsigned i;
19358
19359 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
19360 return false;
19361
19362 /* We can only permute within the 128-bit lane. */
19363 for (i = 0; i < 8; ++i)
19364 {
19365 unsigned e = d->perm[i];
19366 if (i < 4 ? e >= 4 : e < 4)
19367 return false;
19368 }
19369
19370 if (d->testing_p)
19371 return true;
19372
19373 for (i = 0; i < 8; ++i)
19374 {
19375 unsigned e = d->perm[i];
19376
19377 /* Within each 128-bit lane, the elements of op0 are numbered
19378 from 0 and the elements of op1 are numbered from 4. */
19379 if (e >= 8 + 4)
19380 e -= 8;
19381 else if (e >= 4)
19382 e -= 4;
19383
19384 rperm[i] = GEN_INT (e);
19385 }
19386
19387 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
19388 vperm = force_reg (V8SImode, vperm);
19389 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
19390
19391 return true;
19392 }
19393
19394 /* For V*[QHS]Imode permutations, check whether the same permutation
19395 can be performed in a 2x, 4x or 8x wider inner mode. */
19396
19397 static bool
19398 canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
19399 struct expand_vec_perm_d *nd)
19400 {
19401 int i;
19402 machine_mode mode = VOIDmode;
19403
19404 switch (d->vmode)
19405 {
19406 case E_V8QImode: mode = V4HImode; break;
19407 case E_V16QImode: mode = V8HImode; break;
19408 case E_V32QImode: mode = V16HImode; break;
19409 case E_V64QImode: mode = V32HImode; break;
19410 case E_V4HImode: mode = V2SImode; break;
19411 case E_V8HImode: mode = V4SImode; break;
19412 case E_V16HImode: mode = V8SImode; break;
19413 case E_V32HImode: mode = V16SImode; break;
19414 case E_V4SImode: mode = V2DImode; break;
19415 case E_V8SImode: mode = V4DImode; break;
19416 case E_V16SImode: mode = V8DImode; break;
19417 default: return false;
19418 }
19419 for (i = 0; i < d->nelt; i += 2)
19420 if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
19421 return false;
19422 nd->vmode = mode;
19423 nd->nelt = d->nelt / 2;
19424 for (i = 0; i < nd->nelt; i++)
19425 nd->perm[i] = d->perm[2 * i] / 2;
19426 if (GET_MODE_INNER (mode) != DImode)
19427 canonicalize_vector_int_perm (nd, nd);
19428 if (nd != d)
19429 {
19430 nd->one_operand_p = d->one_operand_p;
19431 nd->testing_p = d->testing_p;
19432 if (d->op0 == d->op1)
19433 nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
19434 else
19435 {
19436 nd->op0 = gen_lowpart (nd->vmode, d->op0);
19437 nd->op1 = gen_lowpart (nd->vmode, d->op1);
19438 }
19439 if (d->testing_p)
19440 nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
19441 else
19442 nd->target = gen_reg_rtx (nd->vmode);
19443 }
19444 return true;
19445 }
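/* For illustration: a V16QImode permutation in which bytes move in
   aligned pairs, e.g.

     { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 }

   is rewritten as the V8HImode permutation { 1, 0, 3, 2, 5, 4, 7, 6 };
   the recursive call then stops because the halfword indices no longer
   move in aligned pairs.  */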
19446
19447 /* Return true if permutation D can be performed as VMODE permutation
19448 instead. */
19449
19450 static bool
19451 valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
19452 {
19453 unsigned int i, j, chunk;
19454
19455 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
19456 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
19457 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
19458 return false;
19459
19460 if (GET_MODE_NUNITS (vmode) >= d->nelt)
19461 return true;
19462
19463 chunk = d->nelt / GET_MODE_NUNITS (vmode);
19464 for (i = 0; i < d->nelt; i += chunk)
19465 if (d->perm[i] & (chunk - 1))
19466 return false;
19467 else
19468 for (j = 1; j < chunk; ++j)
19469 if (d->perm[i] + j != d->perm[i + j])
19470 return false;
19471
19472 return true;
19473 }
19474
19475 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
19476 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
19477
19478 static bool
19479 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
19480 {
19481 unsigned i, nelt, eltsz, mask;
19482 unsigned char perm[64];
19483 machine_mode vmode;
19484 struct expand_vec_perm_d nd;
19485 rtx rperm[64], vperm, target, op0, op1;
19486
19487 nelt = d->nelt;
19488
19489 if (!d->one_operand_p)
19490 switch (GET_MODE_SIZE (d->vmode))
19491 {
19492 case 4:
19493 if (!TARGET_XOP)
19494 return false;
19495 vmode = V4QImode;
19496 break;
19497
19498 case 8:
19499 if (!TARGET_XOP)
19500 return false;
19501 vmode = V8QImode;
19502 break;
19503
19504 case 16:
19505 if (!TARGET_XOP)
19506 return false;
19507 vmode = V16QImode;
19508 break;
19509
19510 case 32:
19511 if (!TARGET_AVX2)
19512 return false;
19513
19514 if (valid_perm_using_mode_p (V2TImode, d))
19515 {
19516 if (d->testing_p)
19517 return true;
19518
19519 /* Use vperm2i128 insn. The pattern uses
19520 V4DImode instead of V2TImode. */
19521 target = d->target;
19522 if (d->vmode != V4DImode)
19523 target = gen_reg_rtx (V4DImode);
19524 op0 = gen_lowpart (V4DImode, d->op0);
19525 op1 = gen_lowpart (V4DImode, d->op1);
19526 rperm[0]
19527 = GEN_INT ((d->perm[0] / (nelt / 2))
19528 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
19529 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
19530 if (target != d->target)
19531 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
19532 return true;
19533 }
19534 /* FALLTHRU */
19535
19536 default:
19537 return false;
19538 }
19539 else
19540 switch (GET_MODE_SIZE (d->vmode))
19541 {
19542 case 4:
19543 if (!TARGET_SSSE3)
19544 return false;
19545 vmode = V4QImode;
19546 break;
19547
19548 case 8:
19549 if (!TARGET_SSSE3)
19550 return false;
19551 vmode = V8QImode;
19552 break;
19553
19554 case 16:
19555 if (!TARGET_SSSE3)
19556 return false;
19557 vmode = V16QImode;
19558 break;
19559
19560 case 32:
19561 if (!TARGET_AVX2)
19562 return false;
19563
19564 /* V4DImode should be already handled through
19565 expand_vselect by vpermq instruction. */
19566 gcc_assert (d->vmode != V4DImode);
19567
19568 vmode = V32QImode;
19569 if (d->vmode == V8SImode
19570 || d->vmode == V16HImode
19571 || d->vmode == V32QImode)
19572 {
19573 /* First see if vpermq can be used for
19574 V8SImode/V16HImode/V32QImode. */
19575 if (valid_perm_using_mode_p (V4DImode, d))
19576 {
19577 for (i = 0; i < 4; i++)
19578 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
19579 if (d->testing_p)
19580 return true;
19581 target = gen_reg_rtx (V4DImode);
19582 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
19583 perm, 4, false))
19584 {
19585 emit_move_insn (d->target,
19586 gen_lowpart (d->vmode, target));
19587 return true;
19588 }
19589 return false;
19590 }
19591
19592 /* Next see if vpermd can be used. */
19593 if (valid_perm_using_mode_p (V8SImode, d))
19594 vmode = V8SImode;
19595 }
19596 /* Or if vpermps can be used. */
19597 else if (d->vmode == V8SFmode)
19598 vmode = V8SImode;
19599
19600 if (vmode == V32QImode)
19601 {
19602 /* vpshufb only works intra-lane; it is not
19603 possible to shuffle bytes between the lanes. */
19604 for (i = 0; i < nelt; ++i)
19605 if ((d->perm[i] ^ i) & (nelt / 2))
19606 return false;
19607 }
19608 break;
19609
19610 case 64:
19611 if (!TARGET_AVX512BW)
19612 return false;
19613
19614 /* If vpermq didn't work, vpshufb won't work either. */
19615 if (d->vmode == V8DFmode || d->vmode == V8DImode)
19616 return false;
19617
19618 vmode = V64QImode;
19619 if (d->vmode == V16SImode
19620 || d->vmode == V32HImode
19621 || d->vmode == V64QImode)
19622 {
19623 /* First see if vpermq can be used for
19624 V16SImode/V32HImode/V64QImode. */
19625 if (valid_perm_using_mode_p (V8DImode, d))
19626 {
19627 for (i = 0; i < 8; i++)
19628 perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
19629 if (d->testing_p)
19630 return true;
19631 target = gen_reg_rtx (V8DImode);
19632 if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
19633 perm, 8, false))
19634 {
19635 emit_move_insn (d->target,
19636 gen_lowpart (d->vmode, target));
19637 return true;
19638 }
19639 return false;
19640 }
19641
19642 /* Next see if vpermd can be used. */
19643 if (valid_perm_using_mode_p (V16SImode, d))
19644 vmode = V16SImode;
19645 }
19646 /* Or if vpermps can be used. */
19647 else if (d->vmode == V16SFmode)
19648 vmode = V16SImode;
19649
19650 if (vmode == V64QImode)
19651 {
19652 /* vpshufb only works intra-lane; it is not
19653 possible to shuffle bytes between the lanes. */
19654 for (i = 0; i < nelt; ++i)
19655 if ((d->perm[i] ^ i) & (3 * nelt / 4))
19656 return false;
19657 }
19658 break;
19659
19660 default:
19661 return false;
19662 }
19663
19664 if (d->testing_p)
19665 return true;
19666
19667 /* Try to avoid variable permutation instruction. */
19668 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
19669 {
19670 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
19671 return true;
19672 }
19673
19674 if (vmode == V8SImode)
19675 for (i = 0; i < 8; ++i)
19676 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
19677 else if (vmode == V16SImode)
19678 for (i = 0; i < 16; ++i)
19679 rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
19680 else
19681 {
19682 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
19683 if (!d->one_operand_p)
19684 mask = 2 * nelt - 1;
19685 else if (vmode == V64QImode)
19686 mask = nelt / 4 - 1;
19687 else if (vmode == V32QImode)
19688 mask = nelt / 2 - 1;
19689 else
19690 mask = nelt - 1;
19691
19692 for (i = 0; i < nelt; ++i)
19693 {
19694 unsigned j, e = d->perm[i] & mask;
19695 for (j = 0; j < eltsz; ++j)
19696 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
19697 }
19698 }
19699
19700 machine_mode vpmode = vmode;
19701
19702 nelt = GET_MODE_SIZE (vmode);
19703
19704 /* Emulate narrow modes with V16QI instructions. */
19705 if (nelt < 16)
19706 {
19707 rtx m128 = GEN_INT (-128);
19708
19709 /* Remap elements from the second operand, as we have to
19710 account for inactive top elements from the first operand. */
19711 if (!d->one_operand_p)
19712 {
19713 for (i = 0; i < nelt; ++i)
19714 {
19715 unsigned ival = UINTVAL (rperm[i]);
19716 if (ival >= nelt)
19717 rperm[i] = GEN_INT (ival + 16 - nelt);
19718 }
19719 }
19720
19721 /* Fill inactive elements in the top positions with zeros. */
19722 for (i = nelt; i < 16; ++i)
19723 rperm[i] = m128;
19724
19725 vpmode = V16QImode;
19726 }
19727
19728 vperm = gen_rtx_CONST_VECTOR (vpmode,
19729 gen_rtvec_v (GET_MODE_NUNITS (vpmode), rperm));
19730 vperm = force_reg (vpmode, vperm);
19731
19732 if (vmode == d->vmode)
19733 target = d->target;
19734 else
19735 target = gen_reg_rtx (vmode);
19736
19737 op0 = gen_lowpart (vmode, d->op0);
19738
19739 if (d->one_operand_p)
19740 {
19741 rtx (*gen) (rtx, rtx, rtx);
19742
19743 if (vmode == V4QImode)
19744 gen = gen_mmx_pshufbv4qi3;
19745 else if (vmode == V8QImode)
19746 gen = gen_mmx_pshufbv8qi3;
19747 else if (vmode == V16QImode)
19748 gen = gen_ssse3_pshufbv16qi3;
19749 else if (vmode == V32QImode)
19750 gen = gen_avx2_pshufbv32qi3;
19751 else if (vmode == V64QImode)
19752 gen = gen_avx512bw_pshufbv64qi3;
19753 else if (vmode == V8SFmode)
19754 gen = gen_avx2_permvarv8sf;
19755 else if (vmode == V8SImode)
19756 gen = gen_avx2_permvarv8si;
19757 else if (vmode == V16SFmode)
19758 gen = gen_avx512f_permvarv16sf;
19759 else if (vmode == V16SImode)
19760 gen = gen_avx512f_permvarv16si;
19761 else
19762 gcc_unreachable ();
19763
19764 emit_insn (gen (target, op0, vperm));
19765 }
19766 else
19767 {
19768 rtx (*gen) (rtx, rtx, rtx, rtx);
19769
19770 op1 = gen_lowpart (vmode, d->op1);
19771
19772 if (vmode == V4QImode)
19773 gen = gen_mmx_ppermv32;
19774 else if (vmode == V8QImode)
19775 gen = gen_mmx_ppermv64;
19776 else if (vmode == V16QImode)
19777 gen = gen_xop_pperm;
19778 else
19779 gcc_unreachable ();
19780
19781 emit_insn (gen (target, op0, op1, vperm));
19782 }
19783
19784 if (target != d->target)
19785 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
19786
19787 return true;
19788 }
19789
19790 /* Try to expand one-operand permutation with constant mask. */
19791
19792 static bool
19793 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
19794 {
19795 machine_mode mode = GET_MODE (d->op0);
19796 machine_mode maskmode = mode;
19797 unsigned inner_size = GET_MODE_SIZE (GET_MODE_INNER (mode));
19798 rtx (*gen) (rtx, rtx, rtx) = NULL;
19799 rtx target, op0, mask;
19800 rtx vec[64];
19801
19802 if (!rtx_equal_p (d->op0, d->op1))
19803 return false;
19804
19805 if (!TARGET_AVX512F)
19806 return false;
19807
19808 /* Accept VNxHImode and VNxQImode now. */
19809 if (!TARGET_AVX512VL && GET_MODE_SIZE (mode) < 64)
19810 return false;
19811
19812 /* vpermw. */
19813 if (!TARGET_AVX512BW && inner_size == 2)
19814 return false;
19815
19816 /* vpermb. */
19817 if (!TARGET_AVX512VBMI && inner_size == 1)
19818 return false;
19819
19820 switch (mode)
19821 {
19822 case E_V16SImode:
19823 gen = gen_avx512f_permvarv16si;
19824 break;
19825 case E_V16SFmode:
19826 gen = gen_avx512f_permvarv16sf;
19827 maskmode = V16SImode;
19828 break;
19829 case E_V8DImode:
19830 gen = gen_avx512f_permvarv8di;
19831 break;
19832 case E_V8DFmode:
19833 gen = gen_avx512f_permvarv8df;
19834 maskmode = V8DImode;
19835 break;
19836 case E_V32HImode:
19837 gen = gen_avx512bw_permvarv32hi;
19838 break;
19839 case E_V16HImode:
19840 gen = gen_avx512vl_permvarv16hi;
19841 break;
19842 case E_V8HImode:
19843 gen = gen_avx512vl_permvarv8hi;
19844 break;
19845 case E_V64QImode:
19846 gen = gen_avx512bw_permvarv64qi;
19847 break;
19848 case E_V32QImode:
19849 gen = gen_avx512vl_permvarv32qi;
19850 break;
19851 case E_V16QImode:
19852 gen = gen_avx512vl_permvarv16qi;
19853 break;
19854
19855 default:
19856 return false;
19857 }
19858
19859 if (d->testing_p)
19860 return true;
19861
19862 target = d->target;
19863 op0 = d->op0;
19864 for (int i = 0; i < d->nelt; ++i)
19865 vec[i] = GEN_INT (d->perm[i]);
19866 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
19867 emit_insn (gen (target, op0, force_reg (maskmode, mask)));
19868 return true;
19869 }
19870
19871 static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
19872
19873 /* A subroutine of ix86_expand_vec_perm_const_1. Try to instantiate D
19874 in a single instruction. */
19875
19876 static bool
19877 expand_vec_perm_1 (struct expand_vec_perm_d *d)
19878 {
19879 unsigned i, nelt = d->nelt;
19880 struct expand_vec_perm_d nd;
19881
19882 /* Check plain VEC_SELECT first, because AVX has instructions that could
19883 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
19884 input where SEL+CONCAT may not. */
19885 if (d->one_operand_p)
19886 {
19887 int mask = nelt - 1;
19888 bool identity_perm = true;
19889 bool broadcast_perm = true;
19890
19891 for (i = 0; i < nelt; i++)
19892 {
19893 nd.perm[i] = d->perm[i] & mask;
19894 if (nd.perm[i] != i)
19895 identity_perm = false;
19896 if (nd.perm[i])
19897 broadcast_perm = false;
19898 }
19899
19900 if (identity_perm)
19901 {
19902 if (!d->testing_p)
19903 emit_move_insn (d->target, d->op0);
19904 return true;
19905 }
19906 else if (broadcast_perm && TARGET_AVX2)
19907 {
19908 /* Use vpbroadcast{b,w,d}. */
19909 rtx (*gen) (rtx, rtx) = NULL;
19910 switch (d->vmode)
19911 {
19912 case E_V64QImode:
19913 if (TARGET_AVX512BW)
19914 gen = gen_avx512bw_vec_dupv64qi_1;
19915 break;
19916 case E_V32QImode:
19917 gen = gen_avx2_pbroadcastv32qi_1;
19918 break;
19919 case E_V32HImode:
19920 if (TARGET_AVX512BW)
19921 gen = gen_avx512bw_vec_dupv32hi_1;
19922 break;
19923 case E_V16HImode:
19924 gen = gen_avx2_pbroadcastv16hi_1;
19925 break;
19926 case E_V16SImode:
19927 if (TARGET_AVX512F)
19928 gen = gen_avx512f_vec_dupv16si_1;
19929 break;
19930 case E_V8SImode:
19931 gen = gen_avx2_pbroadcastv8si_1;
19932 break;
19933 case E_V16QImode:
19934 gen = gen_avx2_pbroadcastv16qi;
19935 break;
19936 case E_V8HImode:
19937 gen = gen_avx2_pbroadcastv8hi;
19938 break;
19939 case E_V16SFmode:
19940 if (TARGET_AVX512F)
19941 gen = gen_avx512f_vec_dupv16sf_1;
19942 break;
19943 case E_V8SFmode:
19944 gen = gen_avx2_vec_dupv8sf_1;
19945 break;
19946 case E_V8DFmode:
19947 if (TARGET_AVX512F)
19948 gen = gen_avx512f_vec_dupv8df_1;
19949 break;
19950 case E_V8DImode:
19951 if (TARGET_AVX512F)
19952 gen = gen_avx512f_vec_dupv8di_1;
19953 break;
19954 /* For other modes prefer other shuffles this function creates. */
19955 default: break;
19956 }
19957 if (gen != NULL)
19958 {
19959 if (!d->testing_p)
19960 emit_insn (gen (d->target, d->op0));
19961 return true;
19962 }
19963 }
19964
19965 if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
19966 return true;
19967
19968 /* There are plenty of patterns in sse.md that are written for
19969 SEL+CONCAT and are not replicated for a single op. Perhaps
19970 that should be changed, to avoid the nastiness here. */
19971
19972 /* Recognize interleave style patterns, which means incrementing
19973 every other permutation operand. */
19974 for (i = 0; i < nelt; i += 2)
19975 {
19976 nd.perm[i] = d->perm[i] & mask;
19977 nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
19978 }
19979 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
19980 d->testing_p))
19981 return true;
19982
19983 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
19984 if (nelt >= 4)
19985 {
19986 for (i = 0; i < nelt; i += 4)
19987 {
19988 nd.perm[i + 0] = d->perm[i + 0] & mask;
19989 nd.perm[i + 1] = d->perm[i + 1] & mask;
19990 nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
19991 nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
19992 }
19993
19994 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
19995 d->testing_p))
19996 return true;
19997 }
19998 }
19999
20000 /* Try the SSE4.1 blend variable merge instructions. */
20001 if (expand_vec_perm_blend (d))
20002 return true;
20003
20004 /* Try movss/movsd instructions. */
20005 if (expand_vec_perm_movs (d))
20006 return true;
20007
20008 /* Try the SSE4.1 insertps instruction. */
20009 if (expand_vec_perm_insertps (d))
20010 return true;
20011
20012 /* Try the fully general two operand permute. */
20013 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
20014 d->testing_p))
20015 return true;
20016
20017 /* Recognize interleave style patterns with reversed operands. */
20018 if (!d->one_operand_p)
20019 {
20020 for (i = 0; i < nelt; ++i)
20021 {
20022 unsigned e = d->perm[i];
20023 if (e >= nelt)
20024 e -= nelt;
20025 else
20026 e += nelt;
20027 nd.perm[i] = e;
20028 }
20029
20030 if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
20031 d->testing_p))
20032 return true;
20033 }
20034
20035 /* Try one of the AVX vpermil variable permutations. */
20036 if (expand_vec_perm_vpermil (d))
20037 return true;
20038
20039 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
20040 vpshufb, vpermd, vpermps or vpermq variable permutation. */
20041 if (expand_vec_perm_pshufb (d))
20042 return true;
20043
20044 /* Try the AVX2 vpalignr instruction. */
20045 if (expand_vec_perm_palignr (d, true))
20046 return true;
20047
20048 /* Try the AVX512F vperm{w,b,s,d} instructions */
20049 if (ix86_expand_vec_one_operand_perm_avx512 (d))
20050 return true;
20051
20052 /* Try the AVX512F vpermt2/vpermi2 instructions. */
20053 if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
20054 return true;
20055
20056 /* See if we can get the same permutation in different vector integer
20057 mode. */
20058 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
20059 {
20060 if (!d->testing_p)
20061 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
20062 return true;
20063 }
20064 return false;
20065 }
20066
20067 /* Canonicalize the vec_perm index so that the first index
20068 always comes from the first vector. */
20069 static void
20070 ix86_vec_perm_index_canon (struct expand_vec_perm_d *d)
20071 {
20072 unsigned nelt = d->nelt;
20073 if (d->perm[0] < nelt)
20074 return;
20075
20076 for (unsigned i = 0; i != nelt; i++)
20077 d->perm[i] = (d->perm[i] + nelt) % (2 * nelt);
20078
20079 std::swap (d->op0, d->op1);
20080 return;
20081 }
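/* For illustration: with nelt == 4 and the permutation { 5, 2, 7, 0 },
   the first index selects from the second vector, so the operands are
   swapped and the indices become

     { (5+4)%8, (2+4)%8, (7+4)%8, (0+4)%8 } = { 1, 6, 3, 4 }

   which describes the same selection with the first index now coming
   from the (new) first vector.  */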
20082
20083 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
20084 in terms of a pair of shufps + shufps/pshufd instructions. */
20085 static bool
20086 expand_vec_perm_shufps_shufps (struct expand_vec_perm_d *d)
20087 {
20088 unsigned char perm1[4];
20089 machine_mode vmode = d->vmode;
20090 bool ok;
20091 unsigned i, j, k, count = 0;
20092
20093 if (d->one_operand_p
20094 || (vmode != V4SImode && vmode != V4SFmode))
20095 return false;
20096
20097 if (d->testing_p)
20098 return true;
20099
20100 ix86_vec_perm_index_canon (d);
20101 for (i = 0; i < 4; ++i)
20102 count += d->perm[i] > 3 ? 1 : 0;
20103
20104 gcc_assert (count & 3);
20105
20106 rtx tmp = gen_reg_rtx (vmode);
20107 /* 2 from op0 and 2 from op1. */
20108 if (count == 2)
20109 {
20110 unsigned char perm2[4];
20111 for (i = 0, j = 0, k = 2; i < 4; ++i)
20112 if (d->perm[i] & 4)
20113 {
20114 perm1[k++] = d->perm[i];
20115 perm2[i] = k - 1;
20116 }
20117 else
20118 {
20119 perm1[j++] = d->perm[i];
20120 perm2[i] = j - 1;
20121 }
20122
20123 /* shufps. */
20124 ok = expand_vselect_vconcat (tmp, d->op0, d->op1,
20125 perm1, d->nelt, false);
20126 gcc_assert (ok);
20127 if (vmode == V4SImode && TARGET_SSE2)
20128 /* pshufd. */
20129 ok = expand_vselect (d->target, tmp,
20130 perm2, d->nelt, false);
20131 else
20132 {
20133 /* shufps. */
20134 perm2[2] += 4;
20135 perm2[3] += 4;
20136 ok = expand_vselect_vconcat (d->target, tmp, tmp,
20137 perm2, d->nelt, false);
20138 }
20139 gcc_assert (ok);
20140 }
20141 /* 3 from one op and 1 from another. */
20142 else
20143 {
20144 unsigned pair_idx = 8, lone_idx = 8, shift;
20145
20146 /* Find the lone index. */
20147 for (i = 0; i < 4; ++i)
20148 if ((d->perm[i] > 3 && count == 1)
20149 || (d->perm[i] < 4 && count == 3))
20150 lone_idx = i;
20151
20152 /* When lone_idx is not 0, it must come from the second op (count == 1). */
20153 gcc_assert (count == (lone_idx ? 1 : 3));
20154
20155 /* Find the pair index that sits in the same half as the lone index. */
20156 shift = lone_idx & 2;
20157 pair_idx = 1 - lone_idx + 2 * shift;
20158
20159 /* First permute the lone index and the pair index into the same vector as
20160 [ lone, lone, pair, pair ]. */
20161 perm1[1] = perm1[0]
20162 = (count == 3) ? d->perm[lone_idx] : d->perm[lone_idx] - 4;
20163 perm1[3] = perm1[2]
20164 = (count == 3) ? d->perm[pair_idx] : d->perm[pair_idx] + 4;
20165
20166 /* Always put the vector that contains the lone index first. */
20167 if (count == 1)
20168 std::swap (d->op0, d->op1);
20169
20170 /* shufps. */
20171 ok = expand_vselect_vconcat (tmp, d->op0, d->op1,
20172 perm1, d->nelt, false);
20173 gcc_assert (ok);
20174
20175 /* Put the lone and pair indices back in their original order. */
20176 perm1[shift] = lone_idx << 1;
20177 perm1[shift + 1] = pair_idx << 1;
20178
20179 /* Select the remaining 2 elements in another vector. */
20180 for (i = 2 - shift; i < 4 - shift; ++i)
20181 perm1[i] = lone_idx == 1 ? d->perm[i] + 4 : d->perm[i];
20182
20183 /* Adjust to original selector. */
20184 if (lone_idx > 1)
20185 std::swap (tmp, d->op1);
20186
20187 /* shufps. */
20188 ok = expand_vselect_vconcat (d->target, tmp, d->op1,
20189 perm1, d->nelt, false);
20190
20191 gcc_assert (ok);
20192 }
20193
20194 return true;
20195 }
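/* For illustration of the count == 2 case: for V4SImode operands and
   the permutation { 2, 4, 1, 7 } the code builds

     perm1 = { 2, 1, 4, 7 }   shufps:  tmp = { op0[2], op0[1], op1[0], op1[3] }
     perm2 = { 0, 2, 1, 3 }   pshufd:  target = { tmp[0], tmp[2], tmp[1], tmp[3] }

   which yields { op0[2], op1[0], op0[1], op1[3] }, the requested
   permutation (assuming TARGET_SSE2 so the second step is pshufd).  */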
20196
20197 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
20198 in terms of a pair of pshuflw + pshufhw instructions. */
20199
20200 static bool
20201 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
20202 {
20203 unsigned char perm2[MAX_VECT_LEN];
20204 unsigned i;
20205 bool ok;
20206
20207 if (d->vmode != V8HImode || !d->one_operand_p)
20208 return false;
20209
20210 /* The two permutations only operate in 64-bit lanes. */
20211 for (i = 0; i < 4; ++i)
20212 if (d->perm[i] >= 4)
20213 return false;
20214 for (i = 4; i < 8; ++i)
20215 if (d->perm[i] < 4)
20216 return false;
20217
20218 if (d->testing_p)
20219 return true;
20220
20221 /* Emit the pshuflw. */
20222 memcpy (perm2, d->perm, 4);
20223 for (i = 4; i < 8; ++i)
20224 perm2[i] = i;
20225 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
20226 gcc_assert (ok);
20227
20228 /* Emit the pshufhw. */
20229 memcpy (perm2 + 4, d->perm + 4, 4);
20230 for (i = 0; i < 4; ++i)
20231 perm2[i] = i;
20232 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
20233 gcc_assert (ok);
20234
20235 return true;
20236 }
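/* For illustration: the one-operand V8HImode permutation
   { 3, 1, 2, 0, 7, 5, 6, 4 } keeps the low and high halves separate,
   so (for instance on plain SSE2, where pshufb is not available) it is
   expanded as two vselects:

     { 3, 1, 2, 0, 4, 5, 6, 7 }   the pshuflw step
     { 0, 1, 2, 3, 7, 5, 6, 4 }   the pshufhw step  */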
20237
20238 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
20239 the permutation using the SSSE3 palignr instruction. This succeeds
20240 when all of the elements in PERM fit within one vector and we merely
20241 need to shift them down so that a single vector permutation has a
20242 chance to succeed. If SINGLE_INSN_ONLY_P, succeed only if
20243 the vpalignr instruction itself can perform the requested permutation. */
20244
20245 static bool
20246 expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
20247 {
20248 unsigned i, nelt = d->nelt;
20249 unsigned min, max, minswap, maxswap;
20250 bool in_order, ok, swap = false;
20251 rtx shift, target;
20252 struct expand_vec_perm_d dcopy;
20253
20254 /* Even with AVX, palignr only operates on 128-bit vectors,
20255 in AVX2 palignr operates on both 128-bit lanes. */
20256 if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
20257 && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
20258 return false;
20259
20260 min = 2 * nelt;
20261 max = 0;
20262 minswap = 2 * nelt;
20263 maxswap = 0;
20264 for (i = 0; i < nelt; ++i)
20265 {
20266 unsigned e = d->perm[i];
20267 unsigned eswap = d->perm[i] ^ nelt;
20268 if (GET_MODE_SIZE (d->vmode) == 32)
20269 {
20270 e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
20271 eswap = e ^ (nelt / 2);
20272 }
20273 if (e < min)
20274 min = e;
20275 if (e > max)
20276 max = e;
20277 if (eswap < minswap)
20278 minswap = eswap;
20279 if (eswap > maxswap)
20280 maxswap = eswap;
20281 }
20282 if (min == 0
20283 || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
20284 {
20285 if (d->one_operand_p
20286 || minswap == 0
20287 || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
20288 ? nelt / 2 : nelt))
20289 return false;
20290 swap = true;
20291 min = minswap;
20292 max = maxswap;
20293 }
20294
20295 /* Given that we have SSSE3, we know we'll be able to implement the
20296 single operand permutation after the palignr with pshufb for
20297 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
20298 first. */
20299 if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
20300 return true;
20301
20302 dcopy = *d;
20303 if (swap)
20304 {
20305 dcopy.op0 = d->op1;
20306 dcopy.op1 = d->op0;
20307 for (i = 0; i < nelt; ++i)
20308 dcopy.perm[i] ^= nelt;
20309 }
20310
20311 in_order = true;
20312 for (i = 0; i < nelt; ++i)
20313 {
20314 unsigned e = dcopy.perm[i];
20315 if (GET_MODE_SIZE (d->vmode) == 32
20316 && e >= nelt
20317 && (e & (nelt / 2 - 1)) < min)
20318 e = e - min - (nelt / 2);
20319 else
20320 e = e - min;
20321 if (e != i)
20322 in_order = false;
20323 dcopy.perm[i] = e;
20324 }
20325 dcopy.one_operand_p = true;
20326
20327 if (single_insn_only_p && !in_order)
20328 return false;
20329
20330 /* For AVX2, test whether we can permute the result in one instruction. */
20331 if (d->testing_p)
20332 {
20333 if (in_order)
20334 return true;
20335 dcopy.op1 = dcopy.op0;
20336 return expand_vec_perm_1 (&dcopy);
20337 }
20338
20339 shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
20340 if (GET_MODE_SIZE (d->vmode) == 16)
20341 {
20342 target = gen_reg_rtx (V1TImode);
20343 emit_insn (gen_ssse3_palignrv1ti (target,
20344 gen_lowpart (V1TImode, dcopy.op1),
20345 gen_lowpart (V1TImode, dcopy.op0),
20346 shift));
20347 }
20348 else
20349 {
20350 target = gen_reg_rtx (V2TImode);
20351 emit_insn (gen_avx2_palignrv2ti (target,
20352 gen_lowpart (V2TImode, dcopy.op1),
20353 gen_lowpart (V2TImode, dcopy.op0),
20354 shift));
20355 }
20356
20357 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
20358
20359 /* Test for the degenerate case where the alignment by itself
20360 produces the desired permutation. */
20361 if (in_order)
20362 {
20363 emit_move_insn (d->target, dcopy.op0);
20364 return true;
20365 }
20366
20367 ok = expand_vec_perm_1 (&dcopy);
20368 gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
20369
20370 return ok;
20371 }
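/* For illustration of the degenerate case: a two-operand V16QImode
   permutation selecting bytes { 5, 6, ..., 20 } of the op0/op1
   concatenation has min == 5 and max - min == 15 < nelt, and after
   subtracting min the remaining permutation is the identity, so a
   single palignr of op1:op0 by 5 bytes performs the whole
   permutation.  */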
20372
20373 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
20374 the permutation using the SSE4_1 pblendv instruction. Potentially
20375 reduces permutation from 2 pshufb and or to 1 pshufb and pblendv. */
20376
20377 static bool
20378 expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
20379 {
20380 unsigned i, which, nelt = d->nelt;
20381 struct expand_vec_perm_d dcopy, dcopy1;
20382 machine_mode vmode = d->vmode;
20383 bool ok;
20384
20385 /* Use the same checks as in expand_vec_perm_blend. */
20386 if (d->one_operand_p)
20387 return false;
20388 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
20389 ;
20390 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
20391 ;
20392 else if (TARGET_SSE4_1
20393 && (GET_MODE_SIZE (vmode) == 16
20394 || (TARGET_MMX_WITH_SSE && GET_MODE_SIZE (vmode) == 8)
20395 || GET_MODE_SIZE (vmode) == 4))
20396 ;
20397 else
20398 return false;
20399
20400 /* Figure out which permutation elements do not stay in their
20401 respective lanes. */
20402 for (i = 0, which = 0; i < nelt; ++i)
20403 {
20404 unsigned e = d->perm[i];
20405 if (e != i)
20406 which |= (e < nelt ? 1 : 2);
20407 }
20408 /* We can pblend the part where elements do not stay in their
20409 respective lanes only when these elements are all in one
20410 half of the permutation.
20411 {0 1 8 3 4 5 9 7} is ok, as 8 and 9 are not in their respective
20412 lanes but both are >= 8;
20413 {0 1 8 3 4 5 2 7} is not ok, as 2 and 8 are not in their
20414 respective lanes and 8 >= 8 but 2 is not. */
20415 if (which != 1 && which != 2)
20416 return false;
20417 if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
20418 return true;
20419
20420 /* First we apply a one-operand permutation to the part whose
20421 elements do not stay in their respective lanes. */
20422 dcopy = *d;
20423 if (which == 2)
20424 dcopy.op0 = dcopy.op1 = d->op1;
20425 else
20426 dcopy.op0 = dcopy.op1 = d->op0;
20427 if (!d->testing_p)
20428 dcopy.target = gen_reg_rtx (vmode);
20429 dcopy.one_operand_p = true;
20430
20431 for (i = 0; i < nelt; ++i)
20432 dcopy.perm[i] = d->perm[i] & (nelt - 1);
20433
20434 ok = expand_vec_perm_1 (&dcopy);
20435 if (GET_MODE_SIZE (vmode) != 16 && !ok)
20436 return false;
20437 else
20438 gcc_assert (ok);
20439 if (d->testing_p)
20440 return true;
20441
20442 /* Next we put permuted elements into their positions. */
20443 dcopy1 = *d;
20444 if (which == 2)
20445 dcopy1.op1 = dcopy.target;
20446 else
20447 dcopy1.op0 = dcopy.target;
20448
20449 for (i = 0; i < nelt; ++i)
20450 dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
20451
20452 ok = expand_vec_perm_blend (&dcopy1);
20453 gcc_assert (ok);
20454
20455 return true;
20456 }
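/* For illustration, the { 0 1 8 3 4 5 9 7 } case from the comment
   above (V8HImode): both out-of-place elements come from op1, so
   which == 2; the first step permutes op1 with { 0, 1, 0, 3, 4, 5, 1, 7 }
   to move op1[0] to position 2 and op1[1] to position 6, and the
   second step blends with { 0, 1, 10, 3, 4, 5, 14, 7 }, taking
   positions 2 and 6 from the permuted op1 and the rest from op0.  */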
20457
20458 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
20459
20460 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
20461 a two vector permutation into a single vector permutation by using
20462 an interleave operation to merge the vectors. */
20463
20464 static bool
20465 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
20466 {
20467 struct expand_vec_perm_d dremap, dfinal;
20468 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
20469 unsigned HOST_WIDE_INT contents;
20470 unsigned char remap[2 * MAX_VECT_LEN];
20471 rtx_insn *seq;
20472 bool ok, same_halves = false;
20473
20474 if (GET_MODE_SIZE (d->vmode) == 4
20475 || GET_MODE_SIZE (d->vmode) == 8
20476 || GET_MODE_SIZE (d->vmode) == 16)
20477 {
20478 if (d->one_operand_p)
20479 return false;
20480 }
20481 else if (GET_MODE_SIZE (d->vmode) == 32)
20482 {
20483 if (!TARGET_AVX)
20484 return false;
20485 /* For 32-byte modes allow even d->one_operand_p.
20486 The lack of cross-lane shuffling in some instructions
20487 might prevent a single insn shuffle. */
20488 dfinal = *d;
20489 dfinal.testing_p = true;
20490 /* If expand_vec_perm_interleave3 can expand this into
20491 a 3 insn sequence, give up and let it be expanded as
20492 3 insn sequence. While that is one insn longer,
20493 it doesn't need a memory operand and in the common
20494 case that both interleave low and high permutations
20495 with the same operands are adjacent needs 4 insns
20496 for both after CSE. */
20497 if (expand_vec_perm_interleave3 (&dfinal))
20498 return false;
20499 }
20500 else
20501 return false;
20502
20503 /* Examine from whence the elements come. */
20504 contents = 0;
20505 for (i = 0; i < nelt; ++i)
20506 contents |= HOST_WIDE_INT_1U << d->perm[i];
20507
20508 memset (remap, 0xff, sizeof (remap));
20509 dremap = *d;
20510
20511 if (GET_MODE_SIZE (d->vmode) == 4
20512 || GET_MODE_SIZE (d->vmode) == 8)
20513 {
20514 unsigned HOST_WIDE_INT h1, h2, h3, h4;
20515
20516 /* Split the two input vectors into 4 halves. */
20517 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
20518 h2 = h1 << nelt2;
20519 h3 = h2 << nelt2;
20520 h4 = h3 << nelt2;
20521
20522 /* If the elements from the low halves use interleave low,
20523 and similarly for interleave high. */
20524 if ((contents & (h1 | h3)) == contents)
20525 {
20526 /* punpckl* */
20527 for (i = 0; i < nelt2; ++i)
20528 {
20529 remap[i] = i * 2;
20530 remap[i + nelt] = i * 2 + 1;
20531 dremap.perm[i * 2] = i;
20532 dremap.perm[i * 2 + 1] = i + nelt;
20533 }
20534 }
20535 else if ((contents & (h2 | h4)) == contents)
20536 {
20537 /* punpckh* */
20538 for (i = 0; i < nelt2; ++i)
20539 {
20540 remap[i + nelt2] = i * 2;
20541 remap[i + nelt + nelt2] = i * 2 + 1;
20542 dremap.perm[i * 2] = i + nelt2;
20543 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
20544 }
20545 }
20546 else
20547 return false;
20548 }
20549 else if (GET_MODE_SIZE (d->vmode) == 16)
20550 {
20551 unsigned HOST_WIDE_INT h1, h2, h3, h4;
20552
20553 /* Split the two input vectors into 4 halves. */
20554 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
20555 h2 = h1 << nelt2;
20556 h3 = h2 << nelt2;
20557 h4 = h3 << nelt2;
20558
20559 /* If the elements from the low halves use interleave low, and similarly
20560 for interleave high. If the elements are from mis-matched halves, we
20561 can use shufps for V4SF/V4SI or do a DImode shuffle. */
20562 if ((contents & (h1 | h3)) == contents)
20563 {
20564 /* punpckl* */
20565 for (i = 0; i < nelt2; ++i)
20566 {
20567 remap[i] = i * 2;
20568 remap[i + nelt] = i * 2 + 1;
20569 dremap.perm[i * 2] = i;
20570 dremap.perm[i * 2 + 1] = i + nelt;
20571 }
20572 if (!TARGET_SSE2 && d->vmode == V4SImode)
20573 dremap.vmode = V4SFmode;
20574 }
20575 else if ((contents & (h2 | h4)) == contents)
20576 {
20577 /* punpckh* */
20578 for (i = 0; i < nelt2; ++i)
20579 {
20580 remap[i + nelt2] = i * 2;
20581 remap[i + nelt + nelt2] = i * 2 + 1;
20582 dremap.perm[i * 2] = i + nelt2;
20583 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
20584 }
20585 if (!TARGET_SSE2 && d->vmode == V4SImode)
20586 dremap.vmode = V4SFmode;
20587 }
20588 else if ((contents & (h1 | h4)) == contents)
20589 {
20590 /* shufps */
20591 for (i = 0; i < nelt2; ++i)
20592 {
20593 remap[i] = i;
20594 remap[i + nelt + nelt2] = i + nelt2;
20595 dremap.perm[i] = i;
20596 dremap.perm[i + nelt2] = i + nelt + nelt2;
20597 }
20598 if (nelt != 4)
20599 {
20600 /* shufpd */
20601 dremap.vmode = V2DImode;
20602 dremap.nelt = 2;
20603 dremap.perm[0] = 0;
20604 dremap.perm[1] = 3;
20605 }
20606 }
20607 else if ((contents & (h2 | h3)) == contents)
20608 {
20609 /* shufps */
20610 for (i = 0; i < nelt2; ++i)
20611 {
20612 remap[i + nelt2] = i;
20613 remap[i + nelt] = i + nelt2;
20614 dremap.perm[i] = i + nelt2;
20615 dremap.perm[i + nelt2] = i + nelt;
20616 }
20617 if (nelt != 4)
20618 {
20619 /* shufpd */
20620 dremap.vmode = V2DImode;
20621 dremap.nelt = 2;
20622 dremap.perm[0] = 1;
20623 dremap.perm[1] = 2;
20624 }
20625 }
20626 else
20627 return false;
20628 }
20629 else
20630 {
20631 unsigned int nelt4 = nelt / 4, nzcnt = 0;
20632 unsigned HOST_WIDE_INT q[8];
20633 unsigned int nonzero_halves[4];
20634
20635 /* Split the two input vectors into 8 quarters. */
20636 q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
20637 for (i = 1; i < 8; ++i)
20638 q[i] = q[0] << (nelt4 * i);
20639 for (i = 0; i < 4; ++i)
20640 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
20641 {
20642 nonzero_halves[nzcnt] = i;
20643 ++nzcnt;
20644 }
20645
20646 if (nzcnt == 1)
20647 {
20648 gcc_assert (d->one_operand_p);
20649 nonzero_halves[1] = nonzero_halves[0];
20650 same_halves = true;
20651 }
20652 else if (d->one_operand_p)
20653 {
20654 gcc_assert (nonzero_halves[0] == 0);
20655 gcc_assert (nonzero_halves[1] == 1);
20656 }
20657
20658 if (nzcnt <= 2)
20659 {
20660 if (d->perm[0] / nelt2 == nonzero_halves[1])
20661 {
20662 /* Attempt to increase the likelihood that dfinal
20663 shuffle will be intra-lane. */
20664 std::swap (nonzero_halves[0], nonzero_halves[1]);
20665 }
20666
20667 /* vperm2f128 or vperm2i128. */
20668 for (i = 0; i < nelt2; ++i)
20669 {
20670 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
20671 remap[i + nonzero_halves[0] * nelt2] = i;
20672 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
20673 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
20674 }
20675
20676 if (d->vmode != V8SFmode
20677 && d->vmode != V4DFmode
20678 && d->vmode != V8SImode)
20679 {
20680 dremap.vmode = V8SImode;
20681 dremap.nelt = 8;
20682 for (i = 0; i < 4; ++i)
20683 {
20684 dremap.perm[i] = i + nonzero_halves[0] * 4;
20685 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
20686 }
20687 }
20688 }
20689 else if (d->one_operand_p)
20690 return false;
20691 else if (TARGET_AVX2
20692 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
20693 {
20694 /* vpunpckl* */
20695 for (i = 0; i < nelt4; ++i)
20696 {
20697 remap[i] = i * 2;
20698 remap[i + nelt] = i * 2 + 1;
20699 remap[i + nelt2] = i * 2 + nelt2;
20700 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
20701 dremap.perm[i * 2] = i;
20702 dremap.perm[i * 2 + 1] = i + nelt;
20703 dremap.perm[i * 2 + nelt2] = i + nelt2;
20704 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
20705 }
20706 }
20707 else if (TARGET_AVX2
20708 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
20709 {
20710 /* vpunpckh* */
20711 for (i = 0; i < nelt4; ++i)
20712 {
20713 remap[i + nelt4] = i * 2;
20714 remap[i + nelt + nelt4] = i * 2 + 1;
20715 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
20716 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
20717 dremap.perm[i * 2] = i + nelt4;
20718 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
20719 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
20720 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
20721 }
20722 }
20723 else
20724 return false;
20725 }
20726
20727 /* Use the remapping array set up above to move the elements from their
20728 swizzled locations into their final destinations. */
20729 dfinal = *d;
20730 for (i = 0; i < nelt; ++i)
20731 {
20732 unsigned e = remap[d->perm[i]];
20733 gcc_assert (e < nelt);
20734 /* If same_halves is true, both halves of the remapped vector are the
20735 same. Avoid cross-lane accesses if possible. */
20736 if (same_halves && i >= nelt2)
20737 {
20738 gcc_assert (e < nelt2);
20739 dfinal.perm[i] = e + nelt2;
20740 }
20741 else
20742 dfinal.perm[i] = e;
20743 }
20744 if (!d->testing_p)
20745 {
20746 dremap.target = gen_reg_rtx (dremap.vmode);
20747 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
20748 }
20749 dfinal.op1 = dfinal.op0;
20750 dfinal.one_operand_p = true;
20751
20752 /* Test if the final remap can be done with a single insn. For V4SFmode or
20753 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
20754 start_sequence ();
20755 ok = expand_vec_perm_1 (&dfinal);
20756 seq = get_insns ();
20757 end_sequence ();
20758
20759 if (!ok)
20760 return false;
20761
20762 if (d->testing_p)
20763 return true;
20764
20765 if (dremap.vmode != dfinal.vmode)
20766 {
20767 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
20768 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
20769 }
20770
20771 ok = expand_vec_perm_1 (&dremap);
20772 gcc_assert (ok);
20773
20774 emit_insn (seq);
20775 return true;
20776 }
20777
20778 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
20779 a single vector cross-lane permutation into vpermq followed
20780 by any of the single insn permutations. */
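/* For example, if the low half of the result only reads from 64-bit
   quarters 0 and 2 of the input and the high half only from quarters
   1 and 3, the vpermq below is given the permutation { 0, 2, 1, 3 },
   after which the remaining shuffle is purely intra-lane.  */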
20781
20782 static bool
20783 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
20784 {
20785 struct expand_vec_perm_d dremap, dfinal;
20786 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
20787 unsigned contents[2];
20788 bool ok;
20789
20790 if (!(TARGET_AVX2
20791 && (d->vmode == V32QImode || d->vmode == V16HImode)
20792 && d->one_operand_p))
20793 return false;
20794
20795 contents[0] = 0;
20796 contents[1] = 0;
20797 for (i = 0; i < nelt2; ++i)
20798 {
20799 contents[0] |= 1u << (d->perm[i] / nelt4);
20800 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
20801 }
20802
20803 for (i = 0; i < 2; ++i)
20804 {
20805 unsigned int cnt = 0;
20806 for (j = 0; j < 4; ++j)
20807 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
20808 return false;
20809 }
20810
20811 if (d->testing_p)
20812 return true;
20813
20814 dremap = *d;
20815 dremap.vmode = V4DImode;
20816 dremap.nelt = 4;
20817 dremap.target = gen_reg_rtx (V4DImode);
20818 dremap.op0 = gen_lowpart (V4DImode, d->op0);
20819 dremap.op1 = dremap.op0;
20820 dremap.one_operand_p = true;
20821 for (i = 0; i < 2; ++i)
20822 {
20823 unsigned int cnt = 0;
20824 for (j = 0; j < 4; ++j)
20825 if ((contents[i] & (1u << j)) != 0)
20826 dremap.perm[2 * i + cnt++] = j;
20827 for (; cnt < 2; ++cnt)
20828 dremap.perm[2 * i + cnt] = 0;
20829 }
20830
20831 dfinal = *d;
20832 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
20833 dfinal.op1 = dfinal.op0;
20834 dfinal.one_operand_p = true;
20835 for (i = 0, j = 0; i < nelt; ++i)
20836 {
20837 if (i == nelt2)
20838 j = 2;
20839 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
20840 if ((d->perm[i] / nelt4) == dremap.perm[j])
20841 ;
20842 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
20843 dfinal.perm[i] |= nelt4;
20844 else
20845 gcc_unreachable ();
20846 }
20847
20848 ok = expand_vec_perm_1 (&dremap);
20849 gcc_assert (ok);
20850
20851 ok = expand_vec_perm_1 (&dfinal);
20852 gcc_assert (ok);
20853
20854 return true;
20855 }
20856
20857 static bool canonicalize_perm (struct expand_vec_perm_d *d);
20858
20859 /* A subroutine of ix86_expand_vec_perm_const_1. Try to expand
20860 a vector permutation using two instructions, vperm2f128 resp.
20861 vperm2i128 followed by any single in-lane permutation. */
20862
20863 static bool
20864 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
20865 {
20866 struct expand_vec_perm_d dfirst, dsecond;
20867 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
20868 bool ok;
20869
20870 if (!TARGET_AVX
20871 || GET_MODE_SIZE (d->vmode) != 32
20872 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
20873 return false;
20874
20875 dsecond = *d;
20876 dsecond.one_operand_p = false;
20877 dsecond.testing_p = true;
20878
20879 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
20880 immediate. For perm < 16 the second permutation uses
20881 d->op0 as first operand, for perm >= 16 it uses d->op1
20882 as first operand. The second operand is the result of
20883 vperm2[fi]128. */
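      /* For example, perm == 9 gives immediate ((9 << 2) | 9) & 0x33 == 0x21,
	 i.e. a vperm2[fi]128 that puts the high lane of d->op0 into the low
	 lane of the result and the low lane of d->op1 into the high lane
	 ({ 2 3 4 5 } in the V4DFmode notation used below).  */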
20884 for (perm = 0; perm < 32; perm++)
20885 {
20886 /* Ignore permutations which do not move anything cross-lane. */
20887 if (perm < 16)
20888 {
20889 /* The second shuffle for e.g. V4DFmode has
20890 0123 and ABCD operands.
20891 Ignore AB23, as 23 is already in the second lane
20892 of the first operand. */
20893 if ((perm & 0xc) == (1 << 2)) continue;
20894 /* And 01CD, as 01 is in the first lane of the first
20895 operand. */
20896 if ((perm & 3) == 0) continue;
20897 /* And 4567, as then the vperm2[fi]128 doesn't change
20898 anything on the original 4567 second operand. */
20899 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
20900 }
20901 else
20902 {
20903 /* The second shuffle for e.g. V4DFmode has
20904 4567 and ABCD operands.
20905 Ignore AB67, as 67 is already in the second lane
20906 of the first operand. */
20907 if ((perm & 0xc) == (3 << 2)) continue;
20908 /* And 45CD, as 45 is in the first lane of the first
20909 operand. */
20910 if ((perm & 3) == 2) continue;
20911 /* And 0123, as then the vperm2[fi]128 doesn't change
20912 anything on the original 0123 first operand. */
20913 if ((perm & 0xf) == (1 << 2)) continue;
20914 }
20915
20916 for (i = 0; i < nelt; i++)
20917 {
20918 j = d->perm[i] / nelt2;
20919 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
20920 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
20921 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
20922 dsecond.perm[i] = d->perm[i] & (nelt - 1);
20923 else
20924 break;
20925 }
20926
20927 if (i == nelt)
20928 {
20929 start_sequence ();
20930 ok = expand_vec_perm_1 (&dsecond);
20931 end_sequence ();
20932 }
20933 else
20934 ok = false;
20935
20936 if (ok)
20937 {
20938 if (d->testing_p)
20939 return true;
20940
20941 /* Found a usable second shuffle. dfirst will be
20942 vperm2f128 on d->op0 and d->op1. */
20943 dsecond.testing_p = false;
20944 dfirst = *d;
20945 dfirst.target = gen_reg_rtx (d->vmode);
20946 for (i = 0; i < nelt; i++)
20947 dfirst.perm[i] = (i & (nelt2 - 1))
20948 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
20949
20950 canonicalize_perm (&dfirst);
20951 ok = expand_vec_perm_1 (&dfirst);
20952 gcc_assert (ok);
20953
20954 /* And dsecond is some single insn shuffle, taking
20955 d->op0 and result of vperm2f128 (if perm < 16) or
20956 d->op1 and result of vperm2f128 (otherwise). */
20957 if (perm >= 16)
20958 dsecond.op0 = dsecond.op1;
20959 dsecond.op1 = dfirst.target;
20960
20961 ok = expand_vec_perm_1 (&dsecond);
20962 gcc_assert (ok);
20963
20964 return true;
20965 }
20966
20967 /* For one operand, the only useful vperm2f128 permutation is 0x01
20968 aka lanes swap. */
20969 if (d->one_operand_p)
20970 return false;
20971 }
20972
20973 return false;
20974 }
20975
20976 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
20977 a two vector permutation using 2 intra-lane interleave insns
20978 and cross-lane shuffle for 32-byte vectors. */
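/* For example, the V8SImode permutation { 0, 8, 1, 9, 2, 10, 3, 11 }
   matches the test below with d->perm[0] == 0 and is emitted via the
   vec_interleave_lowv8si expander.  */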
20979
20980 static bool
20981 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
20982 {
20983 unsigned i, nelt;
20984 rtx (*gen) (rtx, rtx, rtx);
20985
20986 if (d->one_operand_p)
20987 return false;
20988 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
20989 ;
20990 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
20991 ;
20992 else
20993 return false;
20994
20995 nelt = d->nelt;
20996 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
20997 return false;
20998 for (i = 0; i < nelt; i += 2)
20999 if (d->perm[i] != d->perm[0] + i / 2
21000 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
21001 return false;
21002
21003 if (d->testing_p)
21004 return true;
21005
21006 switch (d->vmode)
21007 {
21008 case E_V32QImode:
21009 if (d->perm[0])
21010 gen = gen_vec_interleave_highv32qi;
21011 else
21012 gen = gen_vec_interleave_lowv32qi;
21013 break;
21014 case E_V16HImode:
21015 if (d->perm[0])
21016 gen = gen_vec_interleave_highv16hi;
21017 else
21018 gen = gen_vec_interleave_lowv16hi;
21019 break;
21020 case E_V8SImode:
21021 if (d->perm[0])
21022 gen = gen_vec_interleave_highv8si;
21023 else
21024 gen = gen_vec_interleave_lowv8si;
21025 break;
21026 case E_V4DImode:
21027 if (d->perm[0])
21028 gen = gen_vec_interleave_highv4di;
21029 else
21030 gen = gen_vec_interleave_lowv4di;
21031 break;
21032 case E_V8SFmode:
21033 if (d->perm[0])
21034 gen = gen_vec_interleave_highv8sf;
21035 else
21036 gen = gen_vec_interleave_lowv8sf;
21037 break;
21038 case E_V4DFmode:
21039 if (d->perm[0])
21040 gen = gen_vec_interleave_highv4df;
21041 else
21042 gen = gen_vec_interleave_lowv4df;
21043 break;
21044 default:
21045 gcc_unreachable ();
21046 }
21047
21048 emit_insn (gen (d->target, d->op0, d->op1));
21049 return true;
21050 }
21051
21052 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
21053 a single vector permutation using a single intra-lane vector
21054 permutation, vperm2f128 swapping the lanes and vblend* insn blending
21055 the non-swapped and swapped vectors together. */
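/* For example, the one operand V4DFmode permutation { 0, 3, 2, 1 } becomes
   dfirst == { 0, 1, 2, 3 } (identity), dsecond == the lane swap { 2, 3, 0, 1 }
   of it, and a vblendpd with mask 0b1010 taking elements 1 and 3 from the
   swapped copy.  */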
21056
21057 static bool
21058 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
21059 {
21060 struct expand_vec_perm_d dfirst, dsecond;
21061 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
21062 rtx_insn *seq;
21063 bool ok;
21064 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
21065
21066 if (!TARGET_AVX
21067 || TARGET_AVX2
21068 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
21069 || !d->one_operand_p)
21070 return false;
21071
21072 dfirst = *d;
21073 for (i = 0; i < nelt; i++)
21074 dfirst.perm[i] = 0xff;
21075 for (i = 0, msk = 0; i < nelt; i++)
21076 {
21077 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
21078 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
21079 return false;
21080 dfirst.perm[j] = d->perm[i];
21081 if (j != i)
21082 msk |= (1 << i);
21083 }
21084 for (i = 0; i < nelt; i++)
21085 if (dfirst.perm[i] == 0xff)
21086 dfirst.perm[i] = i;
21087
21088 if (!d->testing_p)
21089 dfirst.target = gen_reg_rtx (dfirst.vmode);
21090
21091 start_sequence ();
21092 ok = expand_vec_perm_1 (&dfirst);
21093 seq = get_insns ();
21094 end_sequence ();
21095
21096 if (!ok)
21097 return false;
21098
21099 if (d->testing_p)
21100 return true;
21101
21102 emit_insn (seq);
21103
21104 dsecond = *d;
21105 dsecond.op0 = dfirst.target;
21106 dsecond.op1 = dfirst.target;
21107 dsecond.one_operand_p = true;
21108 dsecond.target = gen_reg_rtx (dsecond.vmode);
21109 for (i = 0; i < nelt; i++)
21110 dsecond.perm[i] = i ^ nelt2;
21111
21112 ok = expand_vec_perm_1 (&dsecond);
21113 gcc_assert (ok);
21114
21115 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
21116 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
21117 return true;
21118 }
21119
21120 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
21121 a two vector permutation using two single vector permutations and
21122 {,v}{,p}unpckl{ps,pd,bw,wd,dq}. If two_insn, succeed only if one
21123    of dfirst or dsecond is an identity permutation.  */
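/* For example, the V4SImode permutation { 2, 7, 3, 4 } becomes
   dfirst == { 2, 3, 2, 3 } on op0, dsecond == { 3, 0, 3, 0 } on op1 and a
   final punpckldq ({ 0, 4, 1, 5 }) interleaving their low halves.  */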
21124
21125 static bool
21126 expand_vec_perm_2perm_interleave (struct expand_vec_perm_d *d, bool two_insn)
21127 {
21128 unsigned i, nelt = d->nelt, nelt2 = nelt / 2, lane = nelt;
21129 struct expand_vec_perm_d dfirst, dsecond, dfinal;
21130 bool ident1 = true, ident2 = true;
21131
21132 if (d->one_operand_p)
21133 return false;
21134
21135 if (GET_MODE_SIZE (d->vmode) == 16)
21136 {
21137 if (!TARGET_SSE)
21138 return false;
21139 if (d->vmode != V4SFmode && d->vmode != V2DFmode && !TARGET_SSE2)
21140 return false;
21141 }
21142 else if (GET_MODE_SIZE (d->vmode) == 32)
21143 {
21144 if (!TARGET_AVX)
21145 return false;
21146 if (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2)
21147 return false;
21148 lane = nelt2;
21149 }
21150 else
21151 return false;
21152
21153 for (i = 1; i < nelt; i++)
21154 if ((d->perm[i] >= nelt) != ((d->perm[0] >= nelt) ^ (i & 1)))
21155 return false;
21156
21157 dfirst = *d;
21158 dsecond = *d;
21159 dfinal = *d;
21160 dfirst.op1 = dfirst.op0;
21161 dfirst.one_operand_p = true;
21162 dsecond.op0 = dsecond.op1;
21163 dsecond.one_operand_p = true;
21164
21165 for (i = 0; i < nelt; i++)
21166 if (d->perm[i] >= nelt)
21167 {
21168 dsecond.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i] - nelt;
21169 if (d->perm[i] - nelt != i / 2 + (i >= lane ? lane / 2 : 0))
21170 ident2 = false;
21171 dsecond.perm[i / 2 + (i >= lane ? lane : lane / 2)]
21172 = d->perm[i] - nelt;
21173 }
21174 else
21175 {
21176 dfirst.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i];
21177 if (d->perm[i] != i / 2 + (i >= lane ? lane / 2 : 0))
21178 ident1 = false;
21179 dfirst.perm[i / 2 + (i >= lane ? lane : lane / 2)] = d->perm[i];
21180 }
21181
21182 if (two_insn && !ident1 && !ident2)
21183 return false;
21184
21185 if (!d->testing_p)
21186 {
21187 if (!ident1)
21188 dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
21189 if (!ident2)
21190 dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
21191 if (d->perm[0] >= nelt)
21192 std::swap (dfinal.op0, dfinal.op1);
21193 }
21194
21195 bool ok;
21196 rtx_insn *seq1 = NULL, *seq2 = NULL;
21197
21198 if (!ident1)
21199 {
21200 start_sequence ();
21201 ok = expand_vec_perm_1 (&dfirst);
21202 seq1 = get_insns ();
21203 end_sequence ();
21204
21205 if (!ok)
21206 return false;
21207 }
21208
21209 if (!ident2)
21210 {
21211 start_sequence ();
21212 ok = expand_vec_perm_1 (&dsecond);
21213 seq2 = get_insns ();
21214 end_sequence ();
21215
21216 if (!ok)
21217 return false;
21218 }
21219
21220 if (d->testing_p)
21221 return true;
21222
21223 for (i = 0; i < nelt; i++)
21224 {
21225 dfinal.perm[i] = i / 2;
21226 if (i >= lane)
21227 dfinal.perm[i] += lane / 2;
21228 if ((i & 1) != 0)
21229 dfinal.perm[i] += nelt;
21230 }
21231 emit_insn (seq1);
21232 emit_insn (seq2);
21233 ok = expand_vselect_vconcat (dfinal.target, dfinal.op0, dfinal.op1,
21234 dfinal.perm, dfinal.nelt, false);
21235 gcc_assert (ok);
21236 return true;
21237 }
21238
21239 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
21240 the permutation using two single vector permutations and the SSE4_1 pblendv
21241 instruction. If two_insn, succeed only if one of dfirst or dsecond is
21242    an identity permutation.  */
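/* For example, the V4SImode permutation { 1, 5, 3, 7 } becomes
   dfirst == { 1, 1, 3, 3 } on op0, dsecond == the identity on op1, and a
   blend taking the odd elements from op1.  */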
21243
21244 static bool
21245 expand_vec_perm_2perm_pblendv (struct expand_vec_perm_d *d, bool two_insn)
21246 {
21247 unsigned i, nelt = d->nelt;
21248 struct expand_vec_perm_d dfirst, dsecond, dfinal;
21249 machine_mode vmode = d->vmode;
21250 bool ident1 = true, ident2 = true;
21251
21252 /* Use the same checks as in expand_vec_perm_blend. */
21253 if (d->one_operand_p)
21254 return false;
21255 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
21256 ;
21257 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
21258 ;
21259 else if (TARGET_SSE4_1
21260 && (GET_MODE_SIZE (vmode) == 16
21261 || (TARGET_MMX_WITH_SSE && GET_MODE_SIZE (vmode) == 8)
21262 || GET_MODE_SIZE (vmode) == 4))
21263 ;
21264 else
21265 return false;
21266
21267 dfirst = *d;
21268 dsecond = *d;
21269 dfinal = *d;
21270 dfirst.op1 = dfirst.op0;
21271 dfirst.one_operand_p = true;
21272 dsecond.op0 = dsecond.op1;
21273 dsecond.one_operand_p = true;
21274
21275 for (i = 0; i < nelt; ++i)
21276 if (d->perm[i] >= nelt)
21277 {
21278 dfirst.perm[i] = 0xff;
21279 dsecond.perm[i] = d->perm[i] - nelt;
21280 if (d->perm[i] != i + nelt)
21281 ident2 = false;
21282 }
21283 else
21284 {
21285 dsecond.perm[i] = 0xff;
21286 dfirst.perm[i] = d->perm[i];
21287 if (d->perm[i] != i)
21288 ident1 = false;
21289 }
21290
21291 if (two_insn && !ident1 && !ident2)
21292 return false;
21293
21294   /* For now, replace the 0xff wildcards with concrete elements below;
	 ideally expand_vec_perm_1 would treat 0xff as a wildcard.  */
21295 for (i = 0; i < nelt; ++i)
21296 if (dfirst.perm[i] == 0xff)
21297 {
21298 if (GET_MODE_SIZE (vmode) == 32
21299 && dfirst.perm[i ^ (nelt / 2)] != 0xff)
21300 dfirst.perm[i] = dfirst.perm[i ^ (nelt / 2)] ^ (nelt / 2);
21301 else
21302 dfirst.perm[i] = i;
21303 }
21304 else
21305 {
21306 if (GET_MODE_SIZE (vmode) == 32
21307 && dsecond.perm[i ^ (nelt / 2)] != 0xff)
21308 dsecond.perm[i] = dsecond.perm[i ^ (nelt / 2)] ^ (nelt / 2);
21309 else
21310 dsecond.perm[i] = i;
21311 }
21312
21313 if (!d->testing_p)
21314 {
21315 if (!ident1)
21316 dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
21317 if (!ident2)
21318 dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
21319 }
21320
21321 bool ok;
21322 rtx_insn *seq1 = NULL, *seq2 = NULL;
21323
21324 if (!ident1)
21325 {
21326 start_sequence ();
21327 ok = expand_vec_perm_1 (&dfirst);
21328 seq1 = get_insns ();
21329 end_sequence ();
21330
21331 if (!ok)
21332 return false;
21333 }
21334
21335 if (!ident2)
21336 {
21337 start_sequence ();
21338 ok = expand_vec_perm_1 (&dsecond);
21339 seq2 = get_insns ();
21340 end_sequence ();
21341
21342 if (!ok)
21343 return false;
21344 }
21345
21346 if (d->testing_p)
21347 return true;
21348
21349 for (i = 0; i < nelt; ++i)
21350 dfinal.perm[i] = (d->perm[i] >= nelt ? i + nelt : i);
21351
21352 emit_insn (seq1);
21353 emit_insn (seq2);
21354 ok = expand_vec_perm_blend (&dfinal);
21355 gcc_assert (ok);
21356 return true;
21357 }
21358
21359 /* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF
21360 permutation using two vperm2f128, followed by a vshufpd insn blending
21361 the two vectors together. */
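/* For example, { 2, 6, 1, 5 } is expanded as a lane swap of op0, a lane
   swap of op1 and a vshufpd with permutation { 0, 4, 3, 7 } on the two
   swapped vectors.  */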
21362
21363 static bool
21364 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
21365 {
21366 struct expand_vec_perm_d dfirst, dsecond, dthird;
21367 bool ok;
21368
21369 if (!TARGET_AVX || (d->vmode != V4DFmode))
21370 return false;
21371
21372 if (d->testing_p)
21373 return true;
21374
21375 dfirst = *d;
21376 dsecond = *d;
21377 dthird = *d;
21378
21379 dfirst.perm[0] = (d->perm[0] & ~1);
21380 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
21381 dfirst.perm[2] = (d->perm[2] & ~1);
21382 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
21383 dsecond.perm[0] = (d->perm[1] & ~1);
21384 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
21385 dsecond.perm[2] = (d->perm[3] & ~1);
21386 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
21387 dthird.perm[0] = (d->perm[0] % 2);
21388 dthird.perm[1] = (d->perm[1] % 2) + 4;
21389 dthird.perm[2] = (d->perm[2] % 2) + 2;
21390 dthird.perm[3] = (d->perm[3] % 2) + 6;
21391
21392 dfirst.target = gen_reg_rtx (dfirst.vmode);
21393 dsecond.target = gen_reg_rtx (dsecond.vmode);
21394 dthird.op0 = dfirst.target;
21395 dthird.op1 = dsecond.target;
21396 dthird.one_operand_p = false;
21397
21398 canonicalize_perm (&dfirst);
21399 canonicalize_perm (&dsecond);
21400
21401 ok = expand_vec_perm_1 (&dfirst)
21402 && expand_vec_perm_1 (&dsecond)
21403 && expand_vec_perm_1 (&dthird);
21404
21405 gcc_assert (ok);
21406
21407 return true;
21408 }
21409
21410 static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *);
21411
21412 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
21413 a two vector permutation using two intra-lane vector
21414 permutations, vperm2f128 swapping the lanes and vblend* insn blending
21415 the non-swapped and swapped vectors together. */
21416
21417 static bool
21418 expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d *d)
21419 {
21420 struct expand_vec_perm_d dfirst, dsecond, dthird;
21421 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2, which1 = 0, which2 = 0;
21422 rtx_insn *seq1, *seq2;
21423 bool ok;
21424 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
21425
21426 if (!TARGET_AVX
21427 || TARGET_AVX2
21428 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
21429 || d->one_operand_p)
21430 return false;
21431
21432 dfirst = *d;
21433 dsecond = *d;
21434 for (i = 0; i < nelt; i++)
21435 {
21436 dfirst.perm[i] = 0xff;
21437 dsecond.perm[i] = 0xff;
21438 }
21439 for (i = 0, msk = 0; i < nelt; i++)
21440 {
21441 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
21442 if (j == i)
21443 {
21444 dfirst.perm[j] = d->perm[i];
21445 which1 |= (d->perm[i] < nelt ? 1 : 2);
21446 }
21447 else
21448 {
21449 dsecond.perm[j] = d->perm[i];
21450 which2 |= (d->perm[i] < nelt ? 1 : 2);
21451 msk |= (1U << i);
21452 }
21453 }
21454 if (msk == 0 || msk == (1U << nelt) - 1)
21455 return false;
21456
21457 if (!d->testing_p)
21458 {
21459 dfirst.target = gen_reg_rtx (dfirst.vmode);
21460 dsecond.target = gen_reg_rtx (dsecond.vmode);
21461 }
21462
21463 for (i = 0; i < nelt; i++)
21464 {
21465 if (dfirst.perm[i] == 0xff)
21466 dfirst.perm[i] = (which1 == 2 ? i + nelt : i);
21467 if (dsecond.perm[i] == 0xff)
21468 dsecond.perm[i] = (which2 == 2 ? i + nelt : i);
21469 }
21470 canonicalize_perm (&dfirst);
21471 start_sequence ();
21472 ok = ix86_expand_vec_perm_const_1 (&dfirst);
21473 seq1 = get_insns ();
21474 end_sequence ();
21475
21476 if (!ok)
21477 return false;
21478
21479 canonicalize_perm (&dsecond);
21480 start_sequence ();
21481 ok = ix86_expand_vec_perm_const_1 (&dsecond);
21482 seq2 = get_insns ();
21483 end_sequence ();
21484
21485 if (!ok)
21486 return false;
21487
21488 if (d->testing_p)
21489 return true;
21490
21491 emit_insn (seq1);
21492 emit_insn (seq2);
21493
21494 dthird = *d;
21495 dthird.op0 = dsecond.target;
21496 dthird.op1 = dsecond.target;
21497 dthird.one_operand_p = true;
21498 dthird.target = gen_reg_rtx (dthird.vmode);
21499 for (i = 0; i < nelt; i++)
21500 dthird.perm[i] = i ^ nelt2;
21501
21502 ok = expand_vec_perm_1 (&dthird);
21503 gcc_assert (ok);
21504
21505 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
21506 emit_insn (blend (d->target, dfirst.target, dthird.target, GEN_INT (msk)));
21507 return true;
21508 }
21509
21510 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
21511 permutation with two pshufb insns and an ior. We should have already
21512 failed all two instruction sequences. */
21513
21514 static bool
21515 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
21516 {
21517 rtx rperm[2][16], vperm, l, h, op, m128;
21518 unsigned int i, nelt, eltsz;
21519 machine_mode mode;
21520 rtx (*gen) (rtx, rtx, rtx);
21521
21522 if (!TARGET_SSSE3 || (GET_MODE_SIZE (d->vmode) != 16
21523 && GET_MODE_SIZE (d->vmode) != 8
21524 && GET_MODE_SIZE (d->vmode) != 4))
21525 return false;
21526 gcc_assert (!d->one_operand_p);
21527
21528 if (d->testing_p)
21529 return true;
21530
21531 switch (GET_MODE_SIZE (d->vmode))
21532 {
21533 case 4:
21534 mode = V4QImode;
21535 gen = gen_mmx_pshufbv4qi3;
21536 break;
21537 case 8:
21538 mode = V8QImode;
21539 gen = gen_mmx_pshufbv8qi3;
21540 break;
21541 case 16:
21542 mode = V16QImode;
21543 gen = gen_ssse3_pshufbv16qi3;
21544 break;
21545 default:
21546 gcc_unreachable ();
21547 }
21548
21549 nelt = d->nelt;
21550 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
21551
21552 /* Generate two permutation masks. If the required element is within
21553 the given vector it is shuffled into the proper lane. If the required
21554 element is in the other vector, force a zero into the lane by setting
21555 bit 7 in the permutation mask. */
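  /* For example, for the V8HImode extract-even permutation
     { 0, 2, 4, 6, 8, 10, 12, 14 } the mask applied to op0 is
     { 0, 1, 4, 5, 8, 9, 12, 13, -128 x 8 } and the mask applied to op1 is
     { -128 x 8, 0, 1, 4, 5, 8, 9, 12, 13 }.  */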
21556 m128 = GEN_INT (-128);
21557 for (i = 0; i < nelt; ++i)
21558 {
21559 unsigned j, k, e = d->perm[i];
21560 unsigned which = (e >= nelt);
21561 if (e >= nelt)
21562 e -= nelt;
21563
21564 for (j = 0; j < eltsz; ++j)
21565 {
21566 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
21567 rperm[1-which][i*eltsz + j] = m128;
21568 }
21569
21570 for (k = i*eltsz + j; k < 16; ++k)
21571 rperm[0][k] = rperm[1][k] = m128;
21572 }
21573
21574 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
21575 vperm = force_reg (V16QImode, vperm);
21576
21577 l = gen_reg_rtx (mode);
21578 op = gen_lowpart (mode, d->op0);
21579 emit_insn (gen (l, op, vperm));
21580
21581 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
21582 vperm = force_reg (V16QImode, vperm);
21583
21584 h = gen_reg_rtx (mode);
21585 op = gen_lowpart (mode, d->op1);
21586 emit_insn (gen (h, op, vperm));
21587
21588 op = d->target;
21589 if (d->vmode != mode)
21590 op = gen_reg_rtx (mode);
21591 ix86_emit_vec_binop (IOR, mode, op, l, h);
21592 if (op != d->target)
21593 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
21594
21595 return true;
21596 }
21597
21598 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
21599 with two vpshufb insns, vpermq and vpor. We should have already failed
21600 all two or three instruction sequences. */
21601
21602 static bool
21603 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
21604 {
21605 rtx rperm[2][32], vperm, l, h, hp, op, m128;
21606 unsigned int i, nelt, eltsz;
21607
21608 if (!TARGET_AVX2
21609 || !d->one_operand_p
21610 || (d->vmode != V32QImode && d->vmode != V16HImode))
21611 return false;
21612
21613 if (d->testing_p)
21614 return true;
21615
21616 nelt = d->nelt;
21617 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
21618
21619 /* Generate two permutation masks.  If the required element is within
21620    the same lane, it is shuffled in.  If the required element is from the
21621    other lane, force a zero by setting bit 7 in the permutation mask.
21622    The other mask has non-negative elements where the element is
21623    requested from the other lane, but is also moved to the other lane,
21624    so that the result of vpshufb can have the two V2TImode halves
21625    swapped.  */
21626 m128 = GEN_INT (-128);
21627 for (i = 0; i < nelt; ++i)
21628 {
21629 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
21630 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
21631
21632 for (j = 0; j < eltsz; ++j)
21633 {
21634 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
21635 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
21636 }
21637 }
21638
21639 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
21640 vperm = force_reg (V32QImode, vperm);
21641
21642 h = gen_reg_rtx (V32QImode);
21643 op = gen_lowpart (V32QImode, d->op0);
21644 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
21645
21646   /* Swap the 128-bit lanes of h into hp.  */
21647 hp = gen_reg_rtx (V4DImode);
21648 op = gen_lowpart (V4DImode, h);
21649 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
21650 const1_rtx));
21651
21652 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
21653 vperm = force_reg (V32QImode, vperm);
21654
21655 l = gen_reg_rtx (V32QImode);
21656 op = gen_lowpart (V32QImode, d->op0);
21657 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
21658
21659 op = d->target;
21660 if (d->vmode != V32QImode)
21661 op = gen_reg_rtx (V32QImode);
21662 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
21663 if (op != d->target)
21664 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
21665
21666 return true;
21667 }
21668
21669 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
21670    and extract-odd permutations of two V32QImode or V16HImode operands
21671 with two vpshufb insns, vpor and vpermq. We should have already
21672 failed all two or three instruction sequences. */
21673
21674 static bool
21675 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
21676 {
21677 rtx rperm[2][32], vperm, l, h, ior, op, m128;
21678 unsigned int i, nelt, eltsz;
21679
21680 if (!TARGET_AVX2
21681 || d->one_operand_p
21682 || (d->vmode != V32QImode && d->vmode != V16HImode))
21683 return false;
21684
21685 for (i = 0; i < d->nelt; ++i)
21686 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
21687 return false;
21688
21689 if (d->testing_p)
21690 return true;
21691
21692 nelt = d->nelt;
21693 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
21694
21695 /* Generate two permutation masks. In the first permutation mask
21696 the first quarter will contain indexes for the first half
21697 of the op0, the second quarter will contain bit 7 set, third quarter
21698 will contain indexes for the second half of the op0 and the
21699 last quarter bit 7 set. In the second permutation mask
21700 the first quarter will contain bit 7 set, the second quarter
21701 indexes for the first half of the op1, the third quarter bit 7 set
21702 and last quarter indexes for the second half of the op1.
21703 I.e. the first mask e.g. for V32QImode extract even will be:
21704 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
21705 (all values masked with 0xf except for -128) and second mask
21706 for extract even will be
21707 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
21708 m128 = GEN_INT (-128);
21709 for (i = 0; i < nelt; ++i)
21710 {
21711 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
21712 unsigned which = d->perm[i] >= nelt;
21713 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
21714
21715 for (j = 0; j < eltsz; ++j)
21716 {
21717 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
21718 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
21719 }
21720 }
21721
21722 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
21723 vperm = force_reg (V32QImode, vperm);
21724
21725 l = gen_reg_rtx (V32QImode);
21726 op = gen_lowpart (V32QImode, d->op0);
21727 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
21728
21729 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
21730 vperm = force_reg (V32QImode, vperm);
21731
21732 h = gen_reg_rtx (V32QImode);
21733 op = gen_lowpart (V32QImode, d->op1);
21734 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
21735
21736 ior = gen_reg_rtx (V32QImode);
21737 emit_insn (gen_iorv32qi3 (ior, l, h));
21738
21739 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
21740 op = gen_reg_rtx (V4DImode);
21741 ior = gen_lowpart (V4DImode, ior);
21742 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
21743 const1_rtx, GEN_INT (3)));
21744 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
21745
21746 return true;
21747 }
21748
21749 /* Implement permutation with pslldq + psrldq + por when pshufb is not
21750 available. */
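/* For example, the V16QImode permutation { 5, 6, ..., 15, 16, ..., 20 } is
   expanded as psrldq $5 of op0, pslldq $11 of op1 and a por of the two
   shifted vectors; pandn/pand is only needed when a run does not reach the
   end (resp. start) of its operand.  */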
21751 static bool
21752 expand_vec_perm_pslldq_psrldq_por (struct expand_vec_perm_d *d, bool pandn)
21753 {
21754 unsigned i, nelt = d->nelt;
21755 unsigned start1, end1 = -1;
21756 machine_mode vmode = d->vmode, imode;
21757 int start2 = -1;
21758 bool clear_op0, clear_op1;
21759 unsigned inner_size;
21760 rtx op0, op1, dop1;
21761 rtx (*gen_vec_shr) (rtx, rtx, rtx);
21762 rtx (*gen_vec_shl) (rtx, rtx, rtx);
21763
21764   /* pshufd already handles V4SI/V2DI under TARGET_SSE2, so only
	 V16QI/V8HI need this approach.  */
21765 if (!TARGET_SSE2 || (vmode != E_V16QImode && vmode != E_V8HImode))
21766 return false;
21767
21768 start1 = d->perm[0];
21769 for (i = 1; i < nelt; i++)
21770 {
21771 if (d->perm[i] != d->perm[i-1] + 1
21772 || d->perm[i] == nelt)
21773 {
21774 if (start2 == -1)
21775 {
21776 start2 = d->perm[i];
21777 end1 = d->perm[i-1];
21778 }
21779 else
21780 return false;
21781 }
21782 }
21783
21784 clear_op0 = end1 != nelt - 1;
21785 clear_op1 = start2 % nelt != 0;
21786 /* pandn/pand is needed to clear upper/lower bits of op0/op1. */
21787 if (!pandn && (clear_op0 || clear_op1))
21788 return false;
21789
21790 if (d->testing_p)
21791 return true;
21792
21793 gen_vec_shr = vmode == E_V16QImode ? gen_vec_shr_v16qi : gen_vec_shr_v8hi;
21794 gen_vec_shl = vmode == E_V16QImode ? gen_vec_shl_v16qi : gen_vec_shl_v8hi;
21795 imode = GET_MODE_INNER (vmode);
21796 inner_size = GET_MODE_BITSIZE (imode);
21797 op0 = gen_reg_rtx (vmode);
21798 op1 = gen_reg_rtx (vmode);
21799
21800 if (start1)
21801 emit_insn (gen_vec_shr (op0, d->op0, GEN_INT (start1 * inner_size)));
21802 else
21803 emit_move_insn (op0, d->op0);
21804
21805 dop1 = d->op1;
21806 if (d->one_operand_p)
21807 dop1 = d->op0;
21808
21809 int shl_offset = end1 - start1 + 1 - start2 % nelt;
21810 if (shl_offset)
21811 emit_insn (gen_vec_shl (op1, dop1, GEN_INT (shl_offset * inner_size)));
21812 else
21813 emit_move_insn (op1, dop1);
21814
21815   /* Clear the upper elements of op0 and the lower elements of op1.  */
21816 if (clear_op0 || clear_op1)
21817 {
21818 rtx vec[16];
21819 rtx const_vec;
21820 rtx clear;
21821 for (i = 0; i != nelt; i++)
21822 {
21823 if (i < (end1 - start1 + 1))
21824 vec[i] = gen_int_mode ((HOST_WIDE_INT_1U << inner_size) - 1, imode);
21825 else
21826 vec[i] = CONST0_RTX (imode);
21827 }
21828 const_vec = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, vec));
21829 const_vec = validize_mem (force_const_mem (vmode, const_vec));
21830 clear = force_reg (vmode, const_vec);
21831
21832 if (clear_op0)
21833 emit_move_insn (op0, gen_rtx_AND (vmode, op0, clear));
21834 if (clear_op1)
21835 emit_move_insn (op1, gen_rtx_AND (vmode,
21836 gen_rtx_NOT (vmode, clear),
21837 op1));
21838 }
21839
21840 emit_move_insn (d->target, gen_rtx_IOR (vmode, op0, op1));
21841 return true;
21842 }
21843
21844 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
21845    and extract-odd permutations of two V4HI, V8QI, V8HI, V16QI, V16HI or V32QI
21846 operands with two "and" and "pack" or two "shift" and "pack" insns.
21847 We should have already failed all two instruction sequences. */
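/* For example, for V8HImode extract-even both operands are viewed as
   V4SImode, masked with 0xffff and packed with packusdw; for extract-odd
   they are logically shifted right by 16 first so that the odd elements
   end up in the low halves of the dwords.  */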
21848
21849 static bool
21850 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
21851 {
21852 rtx op, dop0, dop1, t;
21853 unsigned i, odd, c, s, nelt = d->nelt;
21854 bool end_perm = false;
21855 machine_mode half_mode;
21856 rtx (*gen_and) (rtx, rtx, rtx);
21857 rtx (*gen_pack) (rtx, rtx, rtx);
21858 rtx (*gen_shift) (rtx, rtx, rtx);
21859
21860 if (d->one_operand_p)
21861 return false;
21862
21863 switch (d->vmode)
21864 {
21865 case E_V4HImode:
21866 /* Required for "pack". */
21867 if (!TARGET_SSE4_1)
21868 return false;
21869 c = 0xffff;
21870 s = 16;
21871 half_mode = V2SImode;
21872 gen_and = gen_andv2si3;
21873 gen_pack = gen_mmx_packusdw;
21874 gen_shift = gen_lshrv2si3;
21875 break;
21876 case E_V8HImode:
21877 /* Required for "pack". */
21878 if (!TARGET_SSE4_1)
21879 return false;
21880 c = 0xffff;
21881 s = 16;
21882 half_mode = V4SImode;
21883 gen_and = gen_andv4si3;
21884 gen_pack = gen_sse4_1_packusdw;
21885 gen_shift = gen_lshrv4si3;
21886 break;
21887 case E_V8QImode:
21888 /* No check as all instructions are SSE2. */
21889 c = 0xff;
21890 s = 8;
21891 half_mode = V4HImode;
21892 gen_and = gen_andv4hi3;
21893 gen_pack = gen_mmx_packuswb;
21894 gen_shift = gen_lshrv4hi3;
21895 break;
21896 case E_V16QImode:
21897 /* No check as all instructions are SSE2. */
21898 c = 0xff;
21899 s = 8;
21900 half_mode = V8HImode;
21901 gen_and = gen_andv8hi3;
21902 gen_pack = gen_sse2_packuswb;
21903 gen_shift = gen_lshrv8hi3;
21904 break;
21905 case E_V16HImode:
21906 if (!TARGET_AVX2)
21907 return false;
21908 c = 0xffff;
21909 s = 16;
21910 half_mode = V8SImode;
21911 gen_and = gen_andv8si3;
21912 gen_pack = gen_avx2_packusdw;
21913 gen_shift = gen_lshrv8si3;
21914 end_perm = true;
21915 break;
21916 case E_V32QImode:
21917 if (!TARGET_AVX2)
21918 return false;
21919 c = 0xff;
21920 s = 8;
21921 half_mode = V16HImode;
21922 gen_and = gen_andv16hi3;
21923 gen_pack = gen_avx2_packuswb;
21924 gen_shift = gen_lshrv16hi3;
21925 end_perm = true;
21926 break;
21927 default:
21928 /* Only V4HI, V8QI, V8HI, V16QI, V16HI and V32QI modes
21929 are more profitable than general shuffles. */
21930 return false;
21931 }
21932
21933 /* Check that permutation is even or odd. */
21934 odd = d->perm[0];
21935 if (odd > 1)
21936 return false;
21937
21938 for (i = 1; i < nelt; ++i)
21939 if (d->perm[i] != 2 * i + odd)
21940 return false;
21941
21942 if (d->testing_p)
21943 return true;
21944
21945 dop0 = gen_reg_rtx (half_mode);
21946 dop1 = gen_reg_rtx (half_mode);
21947 if (odd == 0)
21948 {
21949 t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
21950 t = force_reg (half_mode, t);
21951 emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
21952 emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
21953 }
21954 else
21955 {
21956 emit_insn (gen_shift (dop0,
21957 gen_lowpart (half_mode, d->op0),
21958 GEN_INT (s)));
21959 emit_insn (gen_shift (dop1,
21960 gen_lowpart (half_mode, d->op1),
21961 GEN_INT (s)));
21962 }
21963   /* In the AVX2 256-bit case we need to permute the pack result.  */
21964 if (TARGET_AVX2 && end_perm)
21965 {
21966 op = gen_reg_rtx (d->vmode);
21967 t = gen_reg_rtx (V4DImode);
21968 emit_insn (gen_pack (op, dop0, dop1));
21969 emit_insn (gen_avx2_permv4di_1 (t,
21970 gen_lowpart (V4DImode, op),
21971 const0_rtx,
21972 const2_rtx,
21973 const1_rtx,
21974 GEN_INT (3)));
21975 emit_move_insn (d->target, gen_lowpart (d->vmode, t));
21976 }
21977 else
21978 emit_insn (gen_pack (d->target, dop0, dop1));
21979
21980 return true;
21981 }
21982
21983 /* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
21984    and extract-odd permutations of two V64QImode operands
21985    with two "shifts", two "truncs" and one "concat" insn for "odd"
21986    and two "truncs" and one "concat" insn for "even".
21987    We should have already failed all two instruction sequences.  */
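/* For "odd" each operand is viewed as V32HImode and shifted right by 8 so
   that the odd bytes land in the low byte of each word; both cases then
   truncate the words to V32QImode and concatenate the two halves into the
   V64QImode target.  */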
21988
21989 static bool
21990 expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
21991 {
21992 rtx t1, t2, t3, t4;
21993 unsigned i, odd, nelt = d->nelt;
21994
21995 if (!TARGET_AVX512BW
21996 || d->one_operand_p
21997 || d->vmode != V64QImode)
21998 return false;
21999
22000 /* Check that permutation is even or odd. */
22001 odd = d->perm[0];
22002 if (odd > 1)
22003 return false;
22004
22005 for (i = 1; i < nelt; ++i)
22006 if (d->perm[i] != 2 * i + odd)
22007 return false;
22008
22009 if (d->testing_p)
22010 return true;
22011
22012
22013 if (odd)
22014 {
22015 t1 = gen_reg_rtx (V32HImode);
22016 t2 = gen_reg_rtx (V32HImode);
22017 emit_insn (gen_lshrv32hi3 (t1,
22018 gen_lowpart (V32HImode, d->op0),
22019 GEN_INT (8)));
22020 emit_insn (gen_lshrv32hi3 (t2,
22021 gen_lowpart (V32HImode, d->op1),
22022 GEN_INT (8)));
22023 }
22024 else
22025 {
22026 t1 = gen_lowpart (V32HImode, d->op0);
22027 t2 = gen_lowpart (V32HImode, d->op1);
22028 }
22029
22030 t3 = gen_reg_rtx (V32QImode);
22031 t4 = gen_reg_rtx (V32QImode);
22032 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
22033 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
22034 emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
22035
22036 return true;
22037 }
22038
22039 /* A subroutine of ix86_expand_vec_perm_const_1. Implement extract-even
22040 and extract-odd permutations. */
22041
22042 static bool
22043 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
22044 {
22045 rtx t1, t2, t3, t4, t5;
22046
22047 switch (d->vmode)
22048 {
22049 case E_V4DFmode:
22050 if (d->testing_p)
22051 break;
22052 t1 = gen_reg_rtx (V4DFmode);
22053 t2 = gen_reg_rtx (V4DFmode);
22054
22055 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
22056 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
22057 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
22058
22059 /* Now an unpck[lh]pd will produce the result required. */
22060 if (odd)
22061 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
22062 else
22063 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
22064 emit_insn (t3);
22065 break;
22066
22067 case E_V8SFmode:
22068 {
22069 int mask = odd ? 0xdd : 0x88;
22070
22071 if (d->testing_p)
22072 break;
22073 t1 = gen_reg_rtx (V8SFmode);
22074 t2 = gen_reg_rtx (V8SFmode);
22075 t3 = gen_reg_rtx (V8SFmode);
22076
22077 /* Shuffle within the 128-bit lanes to produce:
22078 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
22079 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
22080 GEN_INT (mask)));
22081
22082 /* Shuffle the lanes around to produce:
22083 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
22084 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
22085 GEN_INT (0x3)));
22086
22087 /* Shuffle within the 128-bit lanes to produce:
22088 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
22089 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
22090
22091 /* Shuffle within the 128-bit lanes to produce:
22092 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
22093 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
22094
22095 /* Shuffle the lanes around to produce:
22096 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
22097 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
22098 GEN_INT (0x20)));
22099 }
22100 break;
22101
22102 case E_V2DFmode:
22103 case E_V4SFmode:
22104 case E_V2DImode:
22105 case E_V2SImode:
22106 case E_V4SImode:
22107 case E_V2HImode:
22108 /* These are always directly implementable by expand_vec_perm_1. */
22109 gcc_unreachable ();
22110
22111 case E_V2SFmode:
22112 gcc_assert (TARGET_MMX_WITH_SSE);
22113 /* We have no suitable instructions. */
22114 if (d->testing_p)
22115 return false;
22116 break;
22117
22118 case E_V4QImode:
22119 if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
22120 return expand_vec_perm_pshufb2 (d);
22121 else
22122 {
22123 if (d->testing_p)
22124 break;
22125 /* We need 2*log2(N)-1 operations to achieve odd/even
22126 with interleave. */
22127 t1 = gen_reg_rtx (V4QImode);
22128 emit_insn (gen_mmx_punpckhbw_low (t1, d->op0, d->op1));
22129 emit_insn (gen_mmx_punpcklbw_low (d->target, d->op0, d->op1));
22130 if (odd)
22131 t2 = gen_mmx_punpckhbw_low (d->target, d->target, t1);
22132 else
22133 t2 = gen_mmx_punpcklbw_low (d->target, d->target, t1);
22134 emit_insn (t2);
22135 }
22136 break;
22137
22138 case E_V4HImode:
22139 if (TARGET_SSE4_1)
22140 return expand_vec_perm_even_odd_pack (d);
22141 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
22142 return expand_vec_perm_pshufb2 (d);
22143 else
22144 {
22145 if (d->testing_p)
22146 break;
22147 /* We need 2*log2(N)-1 operations to achieve odd/even
22148 with interleave. */
22149 t1 = gen_reg_rtx (V4HImode);
22150 emit_insn (gen_mmx_punpckhwd (t1, d->op0, d->op1));
22151 emit_insn (gen_mmx_punpcklwd (d->target, d->op0, d->op1));
22152 if (odd)
22153 t2 = gen_mmx_punpckhwd (d->target, d->target, t1);
22154 else
22155 t2 = gen_mmx_punpcklwd (d->target, d->target, t1);
22156 emit_insn (t2);
22157 }
22158 break;
22159
22160 case E_V8HImode:
22161 if (TARGET_SSE4_1)
22162 return expand_vec_perm_even_odd_pack (d);
22163 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
22164 return expand_vec_perm_pshufb2 (d);
22165 else
22166 {
22167 if (d->testing_p)
22168 break;
22169 /* We need 2*log2(N)-1 operations to achieve odd/even
22170 with interleave. */
22171 t1 = gen_reg_rtx (V8HImode);
22172 t2 = gen_reg_rtx (V8HImode);
22173 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
22174 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
22175 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
22176 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
22177 if (odd)
22178 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
22179 else
22180 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
22181 emit_insn (t3);
22182 }
22183 break;
22184
22185 case E_V8QImode:
22186 case E_V16QImode:
22187 return expand_vec_perm_even_odd_pack (d);
22188
22189 case E_V16HImode:
22190 case E_V32QImode:
22191 return expand_vec_perm_even_odd_pack (d);
22192
22193 case E_V64QImode:
22194 return expand_vec_perm_even_odd_trunc (d);
22195
22196 case E_V4DImode:
22197 if (!TARGET_AVX2)
22198 {
22199 struct expand_vec_perm_d d_copy = *d;
22200 d_copy.vmode = V4DFmode;
22201 if (d->testing_p)
22202 d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
22203 else
22204 d_copy.target = gen_reg_rtx (V4DFmode);
22205 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
22206 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
22207 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
22208 {
22209 if (!d->testing_p)
22210 emit_move_insn (d->target,
22211 gen_lowpart (V4DImode, d_copy.target));
22212 return true;
22213 }
22214 return false;
22215 }
22216
22217 if (d->testing_p)
22218 break;
22219
22220 t1 = gen_reg_rtx (V4DImode);
22221 t2 = gen_reg_rtx (V4DImode);
22222
22223 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
22224 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
22225 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
22226
22227       /* Now a vpunpck[lh]qdq will produce the result required.  */
22228 if (odd)
22229 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
22230 else
22231 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
22232 emit_insn (t3);
22233 break;
22234
22235 case E_V8SImode:
22236 if (!TARGET_AVX2)
22237 {
22238 struct expand_vec_perm_d d_copy = *d;
22239 d_copy.vmode = V8SFmode;
22240 if (d->testing_p)
22241 d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
22242 else
22243 d_copy.target = gen_reg_rtx (V8SFmode);
22244 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
22245 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
22246 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
22247 {
22248 if (!d->testing_p)
22249 emit_move_insn (d->target,
22250 gen_lowpart (V8SImode, d_copy.target));
22251 return true;
22252 }
22253 return false;
22254 }
22255
22256 if (d->testing_p)
22257 break;
22258
22259 t1 = gen_reg_rtx (V8SImode);
22260 t2 = gen_reg_rtx (V8SImode);
22261 t3 = gen_reg_rtx (V4DImode);
22262 t4 = gen_reg_rtx (V4DImode);
22263 t5 = gen_reg_rtx (V4DImode);
22264
22265 /* Shuffle the lanes around into
22266 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
22267 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
22268 gen_lowpart (V4DImode, d->op1),
22269 GEN_INT (0x20)));
22270 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
22271 gen_lowpart (V4DImode, d->op1),
22272 GEN_INT (0x31)));
22273
22274 /* Swap the 2nd and 3rd position in each lane into
22275 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
22276 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
22277 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
22278 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
22279 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
22280
22281       /* Now a vpunpck[lh]qdq will produce
22282 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
22283 if (odd)
22284 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
22285 gen_lowpart (V4DImode, t2));
22286 else
22287 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
22288 gen_lowpart (V4DImode, t2));
22289 emit_insn (t3);
22290 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
22291 break;
22292
22293 default:
22294 gcc_unreachable ();
22295 }
22296
22297 return true;
22298 }
22299
22300 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
22301 extract-even and extract-odd permutations. */
22302
22303 static bool
22304 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
22305 {
22306 unsigned i, odd, nelt = d->nelt;
22307
22308 odd = d->perm[0];
22309 if (odd != 0 && odd != 1)
22310 return false;
22311
22312 for (i = 1; i < nelt; ++i)
22313 if (d->perm[i] != 2 * i + odd)
22314 return false;
22315
22316 if (d->vmode == E_V32HImode
22317 && d->testing_p
22318 && !TARGET_AVX512BW)
22319 return false;
22320
22321 return expand_vec_perm_even_odd_1 (d, odd);
22322 }
22323
22324 /* A subroutine of ix86_expand_vec_perm_const_1. Implement broadcast
22325 permutations. We assume that expand_vec_perm_1 has already failed. */
22326
22327 static bool
22328 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
22329 {
22330 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
22331 machine_mode vmode = d->vmode;
22332 rtx (*gen) (rtx, rtx, rtx);
22333 unsigned char perm2[4];
22334 rtx op0 = d->op0, dest;
22335 bool ok;
22336
22337 switch (vmode)
22338 {
22339 case E_V4DFmode:
22340 case E_V8SFmode:
22341 /* These are special-cased in sse.md so that we can optionally
22342 use the vbroadcast instruction. They expand to two insns
22343 if the input happens to be in a register. */
22344 gcc_unreachable ();
22345
22346 case E_V2DFmode:
22347 case E_V2SFmode:
22348 case E_V4SFmode:
22349 case E_V2DImode:
22350 case E_V2SImode:
22351 case E_V4SImode:
22352 case E_V2HImode:
22353 case E_V4HImode:
22354 /* These are always implementable using standard shuffle patterns. */
22355 gcc_unreachable ();
22356
22357 case E_V4QImode:
22358 /* This can be implemented via interleave and pshuflw. */
22359 if (d->testing_p)
22360 return true;
22361
22362 if (elt >= nelt2)
22363 {
22364 gen = gen_mmx_punpckhbw_low;
22365 elt -= nelt2;
22366 }
22367 else
22368 gen = gen_mmx_punpcklbw_low;
22369
22370 dest = gen_reg_rtx (vmode);
22371 emit_insn (gen (dest, op0, op0));
22372 vmode = get_mode_wider_vector (vmode);
22373 op0 = gen_lowpart (vmode, dest);
22374
22375 memset (perm2, elt, 2);
22376 dest = gen_reg_rtx (vmode);
22377 ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
22378 gcc_assert (ok);
22379
22380 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
22381 return true;
22382
22383 case E_V8QImode:
22384 /* This can be implemented via interleave. We save one insn by
22385 stopping once we have promoted to V2SImode and then use pshufd. */
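      /* For example, broadcasting byte 5: punpckhbw gives
	 { 4, 4, 5, 5, 6, 6, 7, 7 }, punpcklwd on that gives
	 { 4, 4, 4, 4, 5, 5, 5, 5 }, and the final V2SImode shuffle
	 duplicates the second dword, leaving byte 5 in every element.  */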
22386 if (d->testing_p)
22387 return true;
22388 do
22389 {
22390 if (elt >= nelt2)
22391 {
22392 gen = vmode == V8QImode ? gen_mmx_punpckhbw
22393 : gen_mmx_punpckhwd;
22394 elt -= nelt2;
22395 }
22396 else
22397 gen = vmode == V8QImode ? gen_mmx_punpcklbw
22398 : gen_mmx_punpcklwd;
22399 nelt2 /= 2;
22400
22401 dest = gen_reg_rtx (vmode);
22402 emit_insn (gen (dest, op0, op0));
22403 vmode = get_mode_wider_vector (vmode);
22404 op0 = gen_lowpart (vmode, dest);
22405 }
22406 while (vmode != V2SImode);
22407
22408 memset (perm2, elt, 2);
22409 dest = gen_reg_rtx (vmode);
22410 ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
22411 gcc_assert (ok);
22412
22413 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
22414 return true;
22415
22416 case E_V8HImode:
22417 case E_V16QImode:
22418 /* These can be implemented via interleave. We save one insn by
22419 stopping once we have promoted to V4SImode and then use pshufd. */
22420 if (d->testing_p)
22421 return true;
22422 do
22423 {
22424 if (elt >= nelt2)
22425 {
22426 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
22427 : gen_vec_interleave_highv8hi;
22428 elt -= nelt2;
22429 }
22430 else
22431 gen = vmode == V16QImode ? gen_vec_interleave_lowv16qi
22432 : gen_vec_interleave_lowv8hi;
22433 nelt2 /= 2;
22434
22435 dest = gen_reg_rtx (vmode);
22436 emit_insn (gen (dest, op0, op0));
22437 vmode = get_mode_wider_vector (vmode);
22438 op0 = gen_lowpart (vmode, dest);
22439 }
22440 while (vmode != V4SImode);
22441
22442 memset (perm2, elt, 4);
22443 dest = gen_reg_rtx (vmode);
22444 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
22445 gcc_assert (ok);
22446
22447 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
22448 return true;
22449
22450 case E_V8HFmode:
22451 case E_V8BFmode:
22452 /* This can be implemented via interleave and pshufd. */
22453 if (d->testing_p)
22454 return true;
22455
22456 rtx (*maybe_gen) (machine_mode, int, rtx, rtx, rtx);
22457 if (elt >= nelt2)
22458 {
22459 maybe_gen = maybe_gen_vec_interleave_high;
22460 elt -= nelt2;
22461 }
22462 else
22463 maybe_gen = maybe_gen_vec_interleave_low;
22464 nelt2 /= 2;
22465
22466 dest = gen_reg_rtx (vmode);
22467 emit_insn (maybe_gen (vmode, 1, dest, op0, op0));
22468
22469 vmode = V4SImode;
22470 op0 = gen_lowpart (vmode, dest);
22471
22472 memset (perm2, elt, 4);
22473 dest = gen_reg_rtx (vmode);
22474 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
22475 gcc_assert (ok);
22476
22477 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
22478 return true;
22479
22480 case E_V32QImode:
22481 case E_V16HImode:
22482 case E_V8SImode:
22483 case E_V4DImode:
22484 /* For AVX2 broadcasts of the first element vpbroadcast* or
22485 vpermq should be used by expand_vec_perm_1. */
22486 gcc_assert (!TARGET_AVX2 || d->perm[0]);
22487 return false;
22488
22489 case E_V64QImode:
22490 gcc_assert (!TARGET_AVX512BW || d->perm[0]);
22491 return false;
22492
22493 case E_V32HImode:
22494 gcc_assert (!TARGET_AVX512BW);
22495 return false;
22496
22497 default:
22498 gcc_unreachable ();
22499 }
22500 }
22501
22502 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
22503 broadcast permutations. */
22504
22505 static bool
22506 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
22507 {
22508 unsigned i, elt, nelt = d->nelt;
22509
22510 if (!d->one_operand_p)
22511 return false;
22512
22513 elt = d->perm[0];
22514 for (i = 1; i < nelt; ++i)
22515 if (d->perm[i] != elt)
22516 return false;
22517
22518 return expand_vec_perm_broadcast_1 (d);
22519 }
22520
22521 /* Implement arbitrary permutations of two V64QImode operands
22522 with 2 vperm[it]2w, 2 vpshufb and one vpor instruction. */
22523 static bool
22524 expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
22525 {
22526 if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
22527 return false;
22528
22529 if (d->testing_p)
22530 return true;
22531
22532 struct expand_vec_perm_d ds[2];
22533 rtx rperm[128], vperm, target0, target1;
22534 unsigned int i, nelt;
22535 machine_mode vmode;
22536
22537 nelt = d->nelt;
22538 vmode = V64QImode;
22539
22540 for (i = 0; i < 2; i++)
22541 {
22542 ds[i] = *d;
22543 ds[i].vmode = V32HImode;
22544 ds[i].nelt = 32;
22545 ds[i].target = gen_reg_rtx (V32HImode);
22546 ds[i].op0 = gen_lowpart (V32HImode, d->op0);
22547 ds[i].op1 = gen_lowpart (V32HImode, d->op1);
22548 }
22549
22550   /* Prepare permutations such that the first one takes care of
22551      putting the even bytes into the right positions or one position
22552      higher (ds[0]) and the second one takes care of
22553      putting the odd bytes into the right positions or one position
22554      lower (ds[1]).  */
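  /* The pshufb masks then pick the low or high byte of each word put in
     place by ds[0] resp. ds[1].  For example, for d->perm[10] == 33, ds[0]
     moves word 16 of the concatenated operands to word 5, i.e. bytes 10-11,
     and rperm[10] == (10 & 14) + (33 & 1) == 11 selects its high byte, the
     original byte 33.  */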
22555
22556 for (i = 0; i < nelt; i++)
22557 {
22558 ds[i & 1].perm[i / 2] = d->perm[i] / 2;
22559 if (i & 1)
22560 {
22561 rperm[i] = constm1_rtx;
22562 rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
22563 }
22564 else
22565 {
22566 rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
22567 rperm[i + 64] = constm1_rtx;
22568 }
22569 }
22570
22571 bool ok = expand_vec_perm_1 (&ds[0]);
22572 gcc_assert (ok);
22573 ds[0].target = gen_lowpart (V64QImode, ds[0].target);
22574
22575 ok = expand_vec_perm_1 (&ds[1]);
22576 gcc_assert (ok);
22577 ds[1].target = gen_lowpart (V64QImode, ds[1].target);
22578
22579 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
22580 vperm = force_reg (vmode, vperm);
22581 target0 = gen_reg_rtx (V64QImode);
22582 emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
22583
22584 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
22585 vperm = force_reg (vmode, vperm);
22586 target1 = gen_reg_rtx (V64QImode);
22587 emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
22588
22589 emit_insn (gen_iorv64qi3 (d->target, target0, target1));
22590 return true;
22591 }
22592
22593 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
22594 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
22595 all the shorter instruction sequences. */
22596
22597 static bool
22598 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
22599 {
22600 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
22601 unsigned int i, nelt, eltsz;
22602 bool used[4];
22603
22604 if (!TARGET_AVX2
22605 || d->one_operand_p
22606 || (d->vmode != V32QImode && d->vmode != V16HImode))
22607 return false;
22608
22609 if (d->testing_p)
22610 return true;
22611
22612 nelt = d->nelt;
22613 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
22614
22615 /* Generate 4 permutation masks.  If the required element is within
22616    the same lane, it is shuffled in.  If the required element is from the
22617    other lane, force a zero by setting bit 7 in the permutation mask.
22618    The other mask has non-negative elements where the element is
22619    requested from the other lane, but is also moved to the other lane,
22620    so that the result of vpshufb can have the two V2TImode halves
22621    swapped.  */
22622 m128 = GEN_INT (-128);
22623 for (i = 0; i < 32; ++i)
22624 {
22625 rperm[0][i] = m128;
22626 rperm[1][i] = m128;
22627 rperm[2][i] = m128;
22628 rperm[3][i] = m128;
22629 }
22630 used[0] = false;
22631 used[1] = false;
22632 used[2] = false;
22633 used[3] = false;
22634 for (i = 0; i < nelt; ++i)
22635 {
22636 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
22637 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
22638 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
22639
22640 for (j = 0; j < eltsz; ++j)
22641 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
22642 used[which] = true;
22643 }
22644
22645 for (i = 0; i < 2; ++i)
22646 {
22647 if (!used[2 * i + 1])
22648 {
22649 h[i] = NULL_RTX;
22650 continue;
22651 }
22652 vperm = gen_rtx_CONST_VECTOR (V32QImode,
22653 gen_rtvec_v (32, rperm[2 * i + 1]));
22654 vperm = force_reg (V32QImode, vperm);
22655 h[i] = gen_reg_rtx (V32QImode);
22656 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
22657 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
22658 }
22659
22660 /* Swap the 128-bit lanes of h[X]. */
22661 for (i = 0; i < 2; ++i)
22662 {
22663 if (h[i] == NULL_RTX)
22664 continue;
22665 op = gen_reg_rtx (V4DImode);
22666 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
22667 const2_rtx, GEN_INT (3), const0_rtx,
22668 const1_rtx));
22669 h[i] = gen_lowpart (V32QImode, op);
22670 }
22671
22672 for (i = 0; i < 2; ++i)
22673 {
22674 if (!used[2 * i])
22675 {
22676 l[i] = NULL_RTX;
22677 continue;
22678 }
22679 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
22680 vperm = force_reg (V32QImode, vperm);
22681 l[i] = gen_reg_rtx (V32QImode);
22682 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
22683 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
22684 }
22685
22686 for (i = 0; i < 2; ++i)
22687 {
22688 if (h[i] && l[i])
22689 {
22690 op = gen_reg_rtx (V32QImode);
22691 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
22692 l[i] = op;
22693 }
22694 else if (h[i])
22695 l[i] = h[i];
22696 }
22697
22698 gcc_assert (l[0] && l[1]);
22699 op = d->target;
22700 if (d->vmode != V32QImode)
22701 op = gen_reg_rtx (V32QImode);
22702 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
22703 if (op != d->target)
22704 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
22705 return true;
22706 }
22707
22708 /* The guts of ix86_vectorize_vec_perm_const. With all of the interface bits
22709 taken care of, perform the expansion in D and return true on success. */
22710
22711 static bool
22712 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
22713 {
22714 /* Try a single instruction expansion. */
22715 if (expand_vec_perm_1 (d))
22716 return true;
22717
22718 /* Try sequences of two instructions. */
22719
22720 if (expand_vec_perm_pshuflw_pshufhw (d))
22721 return true;
22722
22723 if (expand_vec_perm_palignr (d, false))
22724 return true;
22725
22726 if (expand_vec_perm_interleave2 (d))
22727 return true;
22728
22729 if (expand_vec_perm_broadcast (d))
22730 return true;
22731
22732 if (expand_vec_perm_vpermq_perm_1 (d))
22733 return true;
22734
22735 if (expand_vec_perm_vperm2f128 (d))
22736 return true;
22737
22738 if (expand_vec_perm_pblendv (d))
22739 return true;
22740
22741 if (expand_vec_perm_2perm_interleave (d, true))
22742 return true;
22743
22744 if (expand_vec_perm_2perm_pblendv (d, true))
22745 return true;
22746
22747 if (expand_vec_perm_shufps_shufps (d))
22748 return true;
22749
22750 /* Try sequences of three instructions. */
22751
22752 if (expand_vec_perm_even_odd_pack (d))
22753 return true;
22754
22755 if (expand_vec_perm_2vperm2f128_vshuf (d))
22756 return true;
22757
22758 if (expand_vec_perm_pshufb2 (d))
22759 return true;
22760
22761 if (expand_vec_perm_pslldq_psrldq_por (d, false))
22762 return true;
22763
22764 if (expand_vec_perm_interleave3 (d))
22765 return true;
22766
22767 if (expand_vec_perm_vperm2f128_vblend (d))
22768 return true;
22769
22770 if (expand_vec_perm_2perm_interleave (d, false))
22771 return true;
22772
22773 if (expand_vec_perm_2perm_pblendv (d, false))
22774 return true;
22775
22776 /* Try sequences of four instructions. */
22777
22778 if (expand_vec_perm_even_odd_trunc (d))
22779 return true;
22780 if (expand_vec_perm_vpshufb2_vpermq (d))
22781 return true;
22782
22783 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
22784 return true;
22785
22786 if (expand_vec_perm_vpermt2_vpshub2 (d))
22787 return true;
22788
22789 /* ??? Look for narrow permutations whose element orderings would
22790 allow the promotion to a wider mode. */
22791
22792 /* ??? Look for sequences of interleave or a wider permute that place
22793 the data into the correct lanes for a half-vector shuffle like
22794 pshuf[lh]w or vpermilps. */
22795
22796 /* ??? Look for sequences of interleave that produce the desired results.
22797 The combinatorics of punpck[lh] get pretty ugly... */
22798
22799 if (expand_vec_perm_even_odd (d))
22800 return true;
22801
22802 /* Generate four or five instructions. */
22803 if (expand_vec_perm_pslldq_psrldq_por (d, true))
22804 return true;
22805
22806 /* Even longer sequences. */
22807 if (expand_vec_perm_vpshufb4_vpermq2 (d))
22808 return true;
22809
22810 /* See if we can get the same permutation in different vector integer
22811 mode. */
22812 struct expand_vec_perm_d nd;
22813 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
22814 {
22815 if (!d->testing_p)
22816 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
22817 return true;
22818 }
22819
22820 /* Even longer, including recursion to ix86_expand_vec_perm_const_1. */
22821 if (expand_vec_perm2_vperm2f128_vblend (d))
22822 return true;
22823
22824 return false;
22825 }
22826
22827 /* If a permutation only uses one operand, make it clear. Returns true
22828 if the permutation references both operands. */
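/* Worked example (illustration only): for nelt == 4 the selector
   { 5, 6, 7, 4 } references only the second operand (which == 2), so it
   is folded to { 1, 2, 3, 0 } with op0 = op1 and one_operand_p left true;
   { 0, 5, 2, 7 } references both operands (which == 3) and is kept as-is
   unless op0 and op1 are the same rtx.  */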
22829
22830 static bool
22831 canonicalize_perm (struct expand_vec_perm_d *d)
22832 {
22833 int i, which, nelt = d->nelt;
22834
22835 for (i = which = 0; i < nelt; ++i)
22836 which |= (d->perm[i] < nelt ? 1 : 2);
22837
22838 d->one_operand_p = true;
22839 switch (which)
22840 {
22841 default:
22842 gcc_unreachable();
22843
22844 case 3:
22845 if (!rtx_equal_p (d->op0, d->op1))
22846 {
22847 d->one_operand_p = false;
22848 break;
22849 }
22850 /* The elements of PERM do not suggest that only the first operand
22851 is used, but both operands are identical. Allow easier matching
22852 of the permutation by folding the permutation into the single
22853 input vector. */
22854 /* FALLTHRU */
22855
22856 case 2:
22857 for (i = 0; i < nelt; ++i)
22858 d->perm[i] &= nelt - 1;
22859 d->op0 = d->op1;
22860 break;
22861
22862 case 1:
22863 d->op1 = d->op0;
22864 break;
22865 }
22866
22867 return (which == 3);
22868 }
22869
22870 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
22871
22872 bool
22873 ix86_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
22874 rtx target, rtx op0, rtx op1,
22875 const vec_perm_indices &sel)
22876 {
22877 if (vmode != op_mode)
22878 return false;
22879
22880 struct expand_vec_perm_d d;
22881 unsigned char perm[MAX_VECT_LEN];
22882 unsigned int i, nelt, which;
22883 bool two_args;
22884
22885 /* For HF mode vector, convert it to HI using subreg. */
22886 if (GET_MODE_INNER (vmode) == HFmode)
22887 {
22888 machine_mode orig_mode = vmode;
22889 vmode = mode_for_vector (HImode,
22890 GET_MODE_NUNITS (vmode)).require ();
22891 if (target)
22892 target = lowpart_subreg (vmode, target, orig_mode);
22893 if (op0)
22894 op0 = lowpart_subreg (vmode, op0, orig_mode);
22895 if (op1)
22896 op1 = lowpart_subreg (vmode, op1, orig_mode);
22897 }
22898
22899 d.target = target;
22900 d.op0 = op0;
22901 d.op1 = op1;
22902
22903 d.vmode = vmode;
22904 gcc_assert (VECTOR_MODE_P (d.vmode));
22905 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
22906 d.testing_p = !target;
22907
22908 gcc_assert (sel.length () == nelt);
22909 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
22910
22911 /* Given sufficient ISA support we can just return true here
22912 for selected vector modes. */
22913 switch (d.vmode)
22914 {
22915 case E_V16SFmode:
22916 case E_V16SImode:
22917 case E_V8DImode:
22918 case E_V8DFmode:
22919 if (!TARGET_AVX512F)
22920 return false;
22921 /* All implementable with a single vperm[it]2 insn. */
22922 if (d.testing_p)
22923 return true;
22924 break;
22925 case E_V32HImode:
22926 if (!TARGET_AVX512F)
22927 return false;
22928 if (d.testing_p && TARGET_AVX512BW)
22929 /* All implementable with a single vperm[it]2 insn. */
22930 return true;
22931 break;
22932 case E_V64QImode:
22933 if (!TARGET_AVX512F)
22934 return false;
22935 if (d.testing_p && TARGET_AVX512BW)
22936 /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */
22937 return true;
22938 break;
22939 case E_V8SImode:
22940 case E_V8SFmode:
22941 case E_V4DFmode:
22942 case E_V4DImode:
22943 if (!TARGET_AVX)
22944 return false;
22945 if (d.testing_p && TARGET_AVX512VL)
22946 /* All implementable with a single vperm[it]2 insn. */
22947 return true;
22948 break;
22949 case E_V16HImode:
22950 if (!TARGET_SSE2)
22951 return false;
22952 if (d.testing_p && TARGET_AVX2)
22953 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
22954 return true;
22955 break;
22956 case E_V32QImode:
22957 if (!TARGET_SSE2)
22958 return false;
22959 if (d.testing_p && TARGET_AVX2)
22960 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
22961 return true;
22962 break;
22963 case E_V8HImode:
22964 case E_V16QImode:
22965 if (!TARGET_SSE2)
22966 return false;
22967 /* Fall through. */
22968 case E_V4SImode:
22969 case E_V4SFmode:
22970 if (!TARGET_SSE)
22971 return false;
22972 /* All implementable with a single vpperm insn. */
22973 if (d.testing_p && TARGET_XOP)
22974 return true;
22975 /* All implementable with 2 pshufb + 1 ior. */
22976 if (d.testing_p && TARGET_SSSE3)
22977 return true;
22978 break;
22979 case E_V2SFmode:
22980 case E_V2SImode:
22981 case E_V4HImode:
22982 case E_V8QImode:
22983 if (!TARGET_MMX_WITH_SSE)
22984 return false;
22985 break;
22986 case E_V2HImode:
22987 if (!TARGET_SSE2)
22988 return false;
22989 /* All implementable with *punpckwd. */
22990 if (d.testing_p)
22991 return true;
22992 break;
22993 case E_V4QImode:
22994 if (!TARGET_SSE2)
22995 return false;
22996 break;
22997 case E_V2DImode:
22998 case E_V2DFmode:
22999 if (!TARGET_SSE)
23000 return false;
23001 /* All implementable with shufpd or unpck[lh]pd. */
23002 if (d.testing_p)
23003 return true;
23004 break;
23005 default:
23006 return false;
23007 }
23008
23009 for (i = which = 0; i < nelt; ++i)
23010 {
23011 unsigned char e = sel[i];
23012 gcc_assert (e < 2 * nelt);
23013 d.perm[i] = e;
23014 perm[i] = e;
23015 which |= (e < nelt ? 1 : 2);
23016 }
23017
23018 if (d.testing_p)
23019 {
23020 /* If all elements are from the second vector, fold them into the first. */
23021 if (which == 2)
23022 for (i = 0; i < nelt; ++i)
23023 d.perm[i] -= nelt;
23024
23025 /* Check whether the mask can be applied to the vector type. */
23026 d.one_operand_p = (which != 3);
23027
23028 /* Implementable with shufps, pshufd or pshuflw. */
23029 if (d.one_operand_p
23030 && (d.vmode == V4SFmode || d.vmode == V2SFmode
23031 || d.vmode == V4SImode || d.vmode == V2SImode
23032 || d.vmode == V4HImode || d.vmode == V2HImode))
23033 return true;
23034
23035 /* Otherwise we have to go through the motions and see if we can
23036 figure out how to generate the requested permutation. */
23037 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
23038 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
23039 if (!d.one_operand_p)
23040 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
23041
23042 start_sequence ();
23043 bool ret = ix86_expand_vec_perm_const_1 (&d);
23044 end_sequence ();
23045
23046 return ret;
23047 }
23048
23049 two_args = canonicalize_perm (&d);
23050
23051 /* If one of the operands is a zero vector, try to match pmovzx. */
23052 if (two_args && (d.op0 == CONST0_RTX (vmode) || d.op1 == CONST0_RTX (vmode)))
23053 {
23054 struct expand_vec_perm_d dzero = d;
23055 if (d.op0 == CONST0_RTX (vmode))
23056 {
23057 d.op1 = dzero.op1 = force_reg (vmode, d.op1);
23058 std::swap (dzero.op0, dzero.op1);
23059 for (i = 0; i < nelt; ++i)
23060 dzero.perm[i] ^= nelt;
23061 }
23062 else
23063 d.op0 = dzero.op0 = force_reg (vmode, d.op0);
23064
23065 if (expand_vselect_vconcat (dzero.target, dzero.op0, dzero.op1,
23066 dzero.perm, nelt, dzero.testing_p))
23067 return true;
23068 }
23069
23070 /* Force operands into registers. */
23071 rtx nop0 = force_reg (vmode, d.op0);
23072 if (d.op0 == d.op1)
23073 d.op1 = nop0;
23074 d.op0 = nop0;
23075 d.op1 = force_reg (vmode, d.op1);
23076
23077 if (ix86_expand_vec_perm_const_1 (&d))
23078 return true;
23079
23080 /* If the selector says both arguments are needed, but the operands are the
23081 same, the above tried to expand with one_operand_p and flattened selector.
23082 If that didn't work, retry without one_operand_p; we succeeded with that
23083 during testing. */
23084 if (two_args && d.one_operand_p)
23085 {
23086 d.one_operand_p = false;
23087 memcpy (d.perm, perm, sizeof (perm));
23088 return ix86_expand_vec_perm_const_1 (&d);
23089 }
23090
23091 return false;
23092 }
23093
23094 void
23095 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
23096 {
23097 struct expand_vec_perm_d d;
23098 unsigned i, nelt;
23099
23100 d.target = targ;
23101 d.op0 = op0;
23102 d.op1 = op1;
23103 d.vmode = GET_MODE (targ);
23104 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
23105 d.one_operand_p = false;
23106 d.testing_p = false;
23107
23108 for (i = 0; i < nelt; ++i)
23109 d.perm[i] = i * 2 + odd;
23110
23111 /* We'll either be able to implement the permutation directly... */
23112 if (expand_vec_perm_1 (&d))
23113 return;
23114
23115 /* ... or we use the special-case patterns. */
23116 expand_vec_perm_even_odd_1 (&d, odd);
23117 }
23118
23119 static void
23120 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
23121 {
23122 struct expand_vec_perm_d d;
23123 unsigned i, nelt, base;
23124 bool ok;
23125
23126 d.target = targ;
23127 d.op0 = op0;
23128 d.op1 = op1;
23129 d.vmode = GET_MODE (targ);
23130 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
23131 d.one_operand_p = false;
23132 d.testing_p = false;
23133
23134 base = high_p ? nelt / 2 : 0;
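/* For example (illustration only): with nelt == 4 this builds the
   selector { 0, 4, 1, 5 } for the low interleave and { 2, 6, 3, 7 }
   for the high interleave, i.e. the punpckl/punpckh element order.  */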
23135 for (i = 0; i < nelt / 2; ++i)
23136 {
23137 d.perm[i * 2] = i + base;
23138 d.perm[i * 2 + 1] = i + base + nelt;
23139 }
23140
23141 /* Note that for AVX this isn't one instruction. */
23142 ok = ix86_expand_vec_perm_const_1 (&d);
23143 gcc_assert (ok);
23144 }
23145
23146 /* Expand a vector shift by a constant for V*QImode in terms of the
23147 same operation on V*HImode. Return true on success. */
23148 static bool
23149 ix86_expand_vec_shift_qihi_constant (enum rtx_code code,
23150 rtx dest, rtx op1, rtx op2)
23151 {
23152 machine_mode qimode, himode;
23153 HOST_WIDE_INT and_constant, xor_constant;
23154 HOST_WIDE_INT shift_amount;
23155 rtx vec_const_and, vec_const_xor;
23156 rtx tmp, op1_subreg;
23157 rtx (*gen_shift) (rtx, rtx, rtx);
23158 rtx (*gen_and) (rtx, rtx, rtx);
23159 rtx (*gen_xor) (rtx, rtx, rtx);
23160 rtx (*gen_sub) (rtx, rtx, rtx);
23161
23162 /* Only optimize shift by constant. */
23163 if (!CONST_INT_P (op2))
23164 return false;
23165
23166 qimode = GET_MODE (dest);
23167 shift_amount = INTVAL (op2);
23168 /* Do nothing when the shift amount is greater than or equal to 8. */
23169 if (shift_amount > 7)
23170 return false;
23171
23172 gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
23173 /* Position of the original sign bit after the logical right shift. */
23174 xor_constant = 1 << (8 - shift_amount - 1);
23175
23176 /* Mask off the bits shifted in from the adjacent byte: the high bits for a right shift, the low bits for a left shift. */
23177 and_constant
23178 = (code == ASHIFT ? 256 - (1 << shift_amount)
23179 : (1 << (8 - shift_amount)) - 1);
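/* Worked example (illustration only): for a right shift by 3,
   and_constant = (1 << 5) - 1 = 0x1f, clearing the three high bits that
   leak in from the byte above; for a left shift by 3, and_constant
   = 256 - 8 = 0xf8, clearing the three low bits that leak in from the
   byte below.  For an arithmetic right shift by 3, xor_constant
   = 1 << 4 = 0x10, the position of the sign bit after the shift.  */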
23180
23181 switch (qimode)
23182 {
23183 case V16QImode:
23184 himode = V8HImode;
23185 gen_shift =
23186 ((code == ASHIFT)
23187 ? gen_ashlv8hi3
23188 : (code == ASHIFTRT) ? gen_ashrv8hi3 : gen_lshrv8hi3);
23189 gen_and = gen_andv16qi3;
23190 gen_xor = gen_xorv16qi3;
23191 gen_sub = gen_subv16qi3;
23192 break;
23193 case V32QImode:
23194 himode = V16HImode;
23195 gen_shift =
23196 ((code == ASHIFT)
23197 ? gen_ashlv16hi3
23198 : (code == ASHIFTRT) ? gen_ashrv16hi3 : gen_lshrv16hi3);
23199 gen_and = gen_andv32qi3;
23200 gen_xor = gen_xorv32qi3;
23201 gen_sub = gen_subv32qi3;
23202 break;
23203 case V64QImode:
23204 himode = V32HImode;
23205 gen_shift =
23206 ((code == ASHIFT)
23207 ? gen_ashlv32hi3
23208 : (code == ASHIFTRT) ? gen_ashrv32hi3 : gen_lshrv32hi3);
23209 gen_and = gen_andv64qi3;
23210 gen_xor = gen_xorv64qi3;
23211 gen_sub = gen_subv64qi3;
23212 break;
23213 default:
23214 gcc_unreachable ();
23215 }
23216
23217 tmp = gen_reg_rtx (himode);
23218 vec_const_and = gen_reg_rtx (qimode);
23219 op1_subreg = lowpart_subreg (himode, op1, qimode);
23220
23221 /* For ASHIFT and LSHIFTRT, perform the operation as
23222 vpsllw/vpsrlw $shift_amount, %op1, %dest
23223 vpand %vec_const_and, %dest. */
23224 emit_insn (gen_shift (tmp, op1_subreg, op2));
23225 emit_move_insn (dest, simplify_gen_subreg (qimode, tmp, himode, 0));
23226 emit_move_insn (vec_const_and,
23227 ix86_build_const_vector (qimode, true,
23228 gen_int_mode (and_constant, QImode)));
23229 emit_insn (gen_and (dest, dest, vec_const_and));
23230
23231 /* For ASHIFTRT, perform the extra operations
23232 vpxor %vec_const_xor, %dest, %dest
23233 vpsubb %vec_const_xor, %dest, %dest. */
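/* Explanatory note (added): this is the usual sign-extension identity
   (x ^ m) - m with m = 0x80 >> shift_amount; after the logical shift the
   original sign bit sits at that position, so the xor/sub pair turns the
   logical right shift into an arithmetic one.  */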
23234 if (code == ASHIFTRT)
23235 {
23236 vec_const_xor = gen_reg_rtx (qimode);
23237 emit_move_insn (vec_const_xor,
23238 ix86_build_const_vector (qimode, true,
23239 gen_int_mode (xor_constant, QImode)));
23240 emit_insn (gen_xor (dest, dest, vec_const_xor));
23241 emit_insn (gen_sub (dest, dest, vec_const_xor));
23242 }
23243 return true;
23244 }
23245
23246 void
23247 ix86_expand_vecop_qihi_partial (enum rtx_code code, rtx dest, rtx op1, rtx op2)
23248 {
23249 machine_mode qimode = GET_MODE (dest);
23250 rtx qop1, qop2, hop1, hop2, qdest, hdest;
23251 bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
23252 bool uns_p = code != ASHIFTRT;
23253
23254 switch (qimode)
23255 {
23256 case E_V4QImode:
23257 case E_V8QImode:
23258 break;
23259 default:
23260 gcc_unreachable ();
23261 }
23262
23263 qop1 = lowpart_subreg (V16QImode, force_reg (qimode, op1), qimode);
23264
23265 if (op2vec)
23266 qop2 = lowpart_subreg (V16QImode, force_reg (qimode, op2), qimode);
23267 else
23268 qop2 = op2;
23269
23270 qdest = gen_reg_rtx (V16QImode);
23271
23272 if (CONST_INT_P (op2)
23273 && (code == ASHIFT || code == LSHIFTRT || code == ASHIFTRT)
23274 && ix86_expand_vec_shift_qihi_constant (code, qdest, qop1, qop2))
23275 {
23276 emit_move_insn (dest, gen_lowpart (qimode, qdest));
23277 return;
23278 }
23279
23280 switch (code)
23281 {
23282 case MULT:
23283 gcc_assert (op2vec);
23284 if (!TARGET_SSE4_1)
23285 {
23286 /* Unpack data such that we've got a source byte in each low byte
23287 of each word. We don't care what goes into the high byte of
23288 each word. Rather than trying to get zero in there, most
23289 convenient is to let it be a copy of the low byte. */
23290 hop1 = copy_to_reg (qop1);
23291 hop2 = copy_to_reg (qop2);
23292 emit_insn (gen_vec_interleave_lowv16qi (hop1, hop1, hop1));
23293 emit_insn (gen_vec_interleave_lowv16qi (hop2, hop2, hop2));
23294 break;
23295 }
23296 /* FALLTHRU */
23297 case ASHIFT:
23298 case ASHIFTRT:
23299 case LSHIFTRT:
23300 hop1 = gen_reg_rtx (V8HImode);
23301 ix86_expand_sse_unpack (hop1, qop1, uns_p, false);
23302 /* mult/vashr/vlshr/vashl */
23303 if (op2vec)
23304 {
23305 hop2 = gen_reg_rtx (V8HImode);
23306 ix86_expand_sse_unpack (hop2, qop2, uns_p, false);
23307 }
23308 else
23309 hop2 = qop2;
23310
23311 break;
23312 default:
23313 gcc_unreachable ();
23314 }
23315
23316 if (code != MULT && op2vec)
23317 {
23318 /* Expand vashr/vlshr/vashl. */
23319 hdest = gen_reg_rtx (V8HImode);
23320 emit_insn (gen_rtx_SET (hdest,
23321 simplify_gen_binary (code, V8HImode,
23322 hop1, hop2)));
23323 }
23324 else
23325 /* Expand mult/ashr/lshr/ashl. */
23326 hdest = expand_simple_binop (V8HImode, code, hop1, hop2,
23327 NULL_RTX, 1, OPTAB_DIRECT);
23328
23329 if (TARGET_AVX512BW && TARGET_AVX512VL)
23330 {
23331 if (qimode == V8QImode)
23332 qdest = dest;
23333 else
23334 qdest = gen_reg_rtx (V8QImode);
23335
23336 emit_insn (gen_truncv8hiv8qi2 (qdest, hdest));
23337 }
23338 else
23339 {
23340 struct expand_vec_perm_d d;
23341 rtx qres = gen_lowpart (V16QImode, hdest);
23342 bool ok;
23343 int i;
23344
23345 /* Merge the data back into the right place. */
23346 d.target = qdest;
23347 d.op0 = d.op1 = qres;
23348 d.vmode = V16QImode;
23349 d.nelt = 16;
23350 d.one_operand_p = false;
23351 d.testing_p = false;
23352
23353 for (i = 0; i < d.nelt; ++i)
23354 d.perm[i] = i * 2;
23355
23356 ok = ix86_expand_vec_perm_const_1 (&d);
23357 gcc_assert (ok);
23358 }
23359
23360 if (qdest != dest)
23361 emit_move_insn (dest, gen_lowpart (qimode, qdest));
23362 }
23363
23364 /* Emit the instruction in a 2x wider mode. For example, optimize
23365 vector MUL generation like
23366
23367 vpmovzxbw ymm2, xmm0
23368 vpmovzxbw ymm3, xmm1
23369 vpmullw ymm4, ymm2, ymm3
23370 vpmovwb xmm0, ymm4
23371
23372 which takes fewer instructions than ix86_expand_vecop_qihi.
23373 Return true on success. */
23374
23375 static bool
23376 ix86_expand_vecop_qihi2 (enum rtx_code code, rtx dest, rtx op1, rtx op2)
23377 {
23378 machine_mode himode, qimode = GET_MODE (dest);
23379 machine_mode wqimode;
23380 rtx qop1, qop2, hop1, hop2, hdest;
23381 rtx (*gen_truncate)(rtx, rtx) = NULL;
23382 bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
23383 bool uns_p = code != ASHIFTRT;
23384
23385 if ((qimode == V16QImode && !TARGET_AVX2)
23386 || (qimode == V32QImode && !TARGET_AVX512BW)
23387 /* There are no V64HImode instructions. */
23388 || qimode == V64QImode)
23389 return false;
23390
23391 /* Do not generate ymm/zmm instructions when
23392 the target prefers a 128/256 bit vector width. */
23393 if ((qimode == V16QImode && TARGET_PREFER_AVX128)
23394 || (qimode == V32QImode && TARGET_PREFER_AVX256))
23395 return false;
23396
23397 switch (qimode)
23398 {
23399 case E_V16QImode:
23400 himode = V16HImode;
23401 if (TARGET_AVX512VL && TARGET_AVX512BW)
23402 gen_truncate = gen_truncv16hiv16qi2;
23403 break;
23404 case E_V32QImode:
23405 himode = V32HImode;
23406 gen_truncate = gen_truncv32hiv32qi2;
23407 break;
23408 default:
23409 gcc_unreachable ();
23410 }
23411
23412 wqimode = GET_MODE_2XWIDER_MODE (qimode).require ();
23413 qop1 = lowpart_subreg (wqimode, force_reg (qimode, op1), qimode);
23414
23415 if (op2vec)
23416 qop2 = lowpart_subreg (wqimode, force_reg (qimode, op2), qimode);
23417 else
23418 qop2 = op2;
23419
23420 hop1 = gen_reg_rtx (himode);
23421 ix86_expand_sse_unpack (hop1, qop1, uns_p, false);
23422
23423 if (op2vec)
23424 {
23425 hop2 = gen_reg_rtx (himode);
23426 ix86_expand_sse_unpack (hop2, qop2, uns_p, false);
23427 }
23428 else
23429 hop2 = qop2;
23430
23431 if (code != MULT && op2vec)
23432 {
23433 /* Expand vashr/vlshr/vashl. */
23434 hdest = gen_reg_rtx (himode);
23435 emit_insn (gen_rtx_SET (hdest,
23436 simplify_gen_binary (code, himode,
23437 hop1, hop2)));
23438 }
23439 else
23440 /* Expand mult/ashr/lshr/ashl. */
23441 hdest = expand_simple_binop (himode, code, hop1, hop2,
23442 NULL_RTX, 1, OPTAB_DIRECT);
23443
23444 if (gen_truncate)
23445 emit_insn (gen_truncate (dest, hdest));
23446 else
23447 {
23448 struct expand_vec_perm_d d;
23449 rtx wqdest = gen_reg_rtx (wqimode);
23450 rtx wqres = gen_lowpart (wqimode, hdest);
23451 bool ok;
23452 int i;
23453
23454 /* Merge the data back into the right place. */
23455 d.target = wqdest;
23456 d.op0 = d.op1 = wqres;
23457 d.vmode = wqimode;
23458 d.nelt = GET_MODE_NUNITS (wqimode);
23459 d.one_operand_p = false;
23460 d.testing_p = false;
23461
23462 for (i = 0; i < d.nelt; ++i)
23463 d.perm[i] = i * 2;
23464
23465 ok = ix86_expand_vec_perm_const_1 (&d);
23466 gcc_assert (ok);
23467
23468 emit_move_insn (dest, gen_lowpart (qimode, wqdest));
23469 }
23470
23471 return true;
23472 }
23473
23474 /* Expand a vector operation CODE for a V*QImode in terms of the
23475 same operation on V*HImode. */
23476
23477 void
23478 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
23479 {
23480 machine_mode qimode = GET_MODE (dest);
23481 machine_mode himode;
23482 rtx (*gen_il) (rtx, rtx, rtx);
23483 rtx (*gen_ih) (rtx, rtx, rtx);
23484 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
23485 bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
23486 struct expand_vec_perm_d d;
23487 bool full_interleave = true;
23488 bool uns_p = code != ASHIFTRT;
23489 bool ok;
23490 int i;
23491
23492 if (CONST_INT_P (op2)
23493 && (code == ASHIFT || code == LSHIFTRT || code == ASHIFTRT)
23494 && ix86_expand_vec_shift_qihi_constant (code, dest, op1, op2))
23495 return;
23496
23497 if (ix86_expand_vecop_qihi2 (code, dest, op1, op2))
23498 return;
23499
23500 switch (qimode)
23501 {
23502 case E_V16QImode:
23503 himode = V8HImode;
23504 break;
23505 case E_V32QImode:
23506 himode = V16HImode;
23507 break;
23508 case E_V64QImode:
23509 himode = V32HImode;
23510 break;
23511 default:
23512 gcc_unreachable ();
23513 }
23514
23515 switch (code)
23516 {
23517 case MULT:
23518 gcc_assert (op2vec);
23519 /* Unpack data such that we've got a source byte in each low byte of
23520 each word. We don't care what goes into the high byte of each word.
23521 Rather than trying to get zero in there, most convenient is to let
23522 it be a copy of the low byte. */
23523 switch (qimode)
23524 {
23525 case E_V16QImode:
23526 gen_il = gen_vec_interleave_lowv16qi;
23527 gen_ih = gen_vec_interleave_highv16qi;
23528 break;
23529 case E_V32QImode:
23530 gen_il = gen_avx2_interleave_lowv32qi;
23531 gen_ih = gen_avx2_interleave_highv32qi;
23532 full_interleave = false;
23533 break;
23534 case E_V64QImode:
23535 gen_il = gen_avx512bw_interleave_lowv64qi;
23536 gen_ih = gen_avx512bw_interleave_highv64qi;
23537 full_interleave = false;
23538 break;
23539 default:
23540 gcc_unreachable ();
23541 }
23542
23543 op2_l = gen_reg_rtx (qimode);
23544 op2_h = gen_reg_rtx (qimode);
23545 emit_insn (gen_il (op2_l, op2, op2));
23546 emit_insn (gen_ih (op2_h, op2, op2));
23547
23548 op1_l = gen_reg_rtx (qimode);
23549 op1_h = gen_reg_rtx (qimode);
23550 emit_insn (gen_il (op1_l, op1, op1));
23551 emit_insn (gen_ih (op1_h, op1, op1));
23552 break;
23553
23554 case ASHIFT:
23555 case ASHIFTRT:
23556 case LSHIFTRT:
23557 op1_l = gen_reg_rtx (himode);
23558 op1_h = gen_reg_rtx (himode);
23559 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
23560 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
23561 /* vashr/vlshr/vashl */
23562 if (op2vec)
23563 {
23564 rtx tmp = force_reg (qimode, op2);
23565 op2_l = gen_reg_rtx (himode);
23566 op2_h = gen_reg_rtx (himode);
23567 ix86_expand_sse_unpack (op2_l, tmp, uns_p, false);
23568 ix86_expand_sse_unpack (op2_h, tmp, uns_p, true);
23569 }
23570 else
23571 op2_l = op2_h = op2;
23572
23573 break;
23574 default:
23575 gcc_unreachable ();
23576 }
23577
23578 if (code != MULT && op2vec)
23579 {
23580 /* Expand vashr/vlshr/vashl. */
23581 res_l = gen_reg_rtx (himode);
23582 res_h = gen_reg_rtx (himode);
23583 emit_insn (gen_rtx_SET (res_l,
23584 simplify_gen_binary (code, himode,
23585 op1_l, op2_l)));
23586 emit_insn (gen_rtx_SET (res_h,
23587 simplify_gen_binary (code, himode,
23588 op1_h, op2_h)));
23589 }
23590 else
23591 {
23592 /* Expand mult/ashr/lshr/ashl. */
23593 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
23594 1, OPTAB_DIRECT);
23595 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
23596 1, OPTAB_DIRECT);
23597 }
23598
23599 gcc_assert (res_l && res_h);
23600
23601 /* Merge the data back into the right place. */
23602 d.target = dest;
23603 d.op0 = gen_lowpart (qimode, res_l);
23604 d.op1 = gen_lowpart (qimode, res_h);
23605 d.vmode = qimode;
23606 d.nelt = GET_MODE_NUNITS (qimode);
23607 d.one_operand_p = false;
23608 d.testing_p = false;
23609
23610 if (full_interleave)
23611 {
23612 /* We used the full interleave; the desired
23613 results are in the even elements. */
23614 for (i = 0; i < d.nelt; ++i)
23615 d.perm[i] = i * 2;
23616 }
23617 else
23618 {
23619 /* For AVX, the interleave used above was not cross-lane. So the
23620 extraction is evens but with the second and third quarter swapped.
23621 Happily, that is even one insn shorter than even extraction.
23622 For AVX512BW we have 4 lanes. We extract evens from within a lane,
23623 always first from the first and then from the second source operand,
23624 the index bits above the low 4 bits remain the same.
23625 Thus, for d.nelt == 32 we want permutation
23626 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
23627 and for d.nelt == 64 we want permutation
23628 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
23629 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
23630 for (i = 0; i < d.nelt; ++i)
23631 d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
23632 }
23633
23634 ok = ix86_expand_vec_perm_const_1 (&d);
23635 gcc_assert (ok);
23636 }
23637
23638 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
23639 if op is CONST_VECTOR with all odd elements equal to their
23640 preceding element. */
23641
23642 static bool
23643 const_vector_equal_evenodd_p (rtx op)
23644 {
23645 machine_mode mode = GET_MODE (op);
23646 int i, nunits = GET_MODE_NUNITS (mode);
23647 if (GET_CODE (op) != CONST_VECTOR
23648 || nunits != CONST_VECTOR_NUNITS (op))
23649 return false;
23650 for (i = 0; i < nunits; i += 2)
23651 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
23652 return false;
23653 return true;
23654 }
23655
23656 void
23657 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
23658 bool uns_p, bool odd_p)
23659 {
23660 machine_mode mode = GET_MODE (op1);
23661 machine_mode wmode = GET_MODE (dest);
23662 rtx x;
23663 rtx orig_op1 = op1, orig_op2 = op2;
23664
23665 if (!nonimmediate_operand (op1, mode))
23666 op1 = force_reg (mode, op1);
23667 if (!nonimmediate_operand (op2, mode))
23668 op2 = force_reg (mode, op2);
23669
23670 /* We only play even/odd games with vectors of SImode. */
23671 gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
23672
23673 /* If we're looking for the odd results, shift those members down to
23674 the even slots. For some cpus this is faster than a PSHUFD. */
23675 if (odd_p)
23676 {
23677 /* For XOP use vpmacsdqh, but only for smult, as it is only
23678 signed. */
23679 if (TARGET_XOP && mode == V4SImode && !uns_p)
23680 {
23681 x = force_reg (wmode, CONST0_RTX (wmode));
23682 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
23683 return;
23684 }
23685
23686 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
23687 if (!const_vector_equal_evenodd_p (orig_op1))
23688 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
23689 x, NULL, 1, OPTAB_DIRECT);
23690 if (!const_vector_equal_evenodd_p (orig_op2))
23691 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
23692 x, NULL, 1, OPTAB_DIRECT);
23693 op1 = gen_lowpart (mode, op1);
23694 op2 = gen_lowpart (mode, op2);
23695 }
23696
23697 if (mode == V16SImode)
23698 {
23699 if (uns_p)
23700 x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
23701 else
23702 x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
23703 }
23704 else if (mode == V8SImode)
23705 {
23706 if (uns_p)
23707 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
23708 else
23709 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
23710 }
23711 else if (uns_p)
23712 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
23713 else if (TARGET_SSE4_1)
23714 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
23715 else
23716 {
23717 rtx s1, s2, t0, t1, t2;
23718
23719 /* The easiest way to implement this without PMULDQ is to go through
23720 the motions as if we are performing a full 64-bit multiply. With
23721 the exception that we need to do less shuffling of the elements. */
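/* Sketch of the identity used (explanatory note, not from the original
   source): reading the 32-bit inputs A and B as signed,
   A_signed = A_unsigned - 2^32 * (A < 0), so modulo 2^64
   A_signed * B_signed = A*B - 2^32 * (B * (A < 0) + A * (B < 0)).
   s1 and s2 below are the all-ones masks -(A < 0) and -(B < 0), so the
   two widening unsigned multiplies produce exactly the correction terms
   that are then shifted into the high half and added.  */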
23722
23723 /* Compute the sign-extension, aka highparts, of the two operands. */
23724 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
23725 op1, pc_rtx, pc_rtx);
23726 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
23727 op2, pc_rtx, pc_rtx);
23728
23729 /* Multiply LO(A) * HI(B), and vice-versa. */
23730 t1 = gen_reg_rtx (wmode);
23731 t2 = gen_reg_rtx (wmode);
23732 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
23733 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
23734
23735 /* Multiply LO(A) * LO(B). */
23736 t0 = gen_reg_rtx (wmode);
23737 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
23738
23739 /* Combine and shift the highparts into place. */
23740 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
23741 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
23742 1, OPTAB_DIRECT);
23743
23744 /* Combine high and low parts. */
23745 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
23746 return;
23747 }
23748 emit_insn (x);
23749 }
23750
23751 void
23752 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
23753 bool uns_p, bool high_p)
23754 {
23755 machine_mode wmode = GET_MODE (dest);
23756 machine_mode mode = GET_MODE (op1);
23757 rtx t1, t2, t3, t4, mask;
23758
23759 switch (mode)
23760 {
23761 case E_V4SImode:
23762 t1 = gen_reg_rtx (mode);
23763 t2 = gen_reg_rtx (mode);
23764 if (TARGET_XOP && !uns_p)
23765 {
23766 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
23767 shuffle the elements once so that all elements are in the right
23768 place for immediate use: { A C B D }. */
23769 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
23770 const1_rtx, GEN_INT (3)));
23771 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
23772 const1_rtx, GEN_INT (3)));
23773 }
23774 else
23775 {
23776 /* Put the elements into place for the multiply. */
23777 ix86_expand_vec_interleave (t1, op1, op1, high_p);
23778 ix86_expand_vec_interleave (t2, op2, op2, high_p);
23779 high_p = false;
23780 }
23781 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
23782 break;
23783
23784 case E_V8SImode:
23785 /* Shuffle the elements between the lanes. After this we
23786 have { A B E F | C D G H } for each operand. */
23787 t1 = gen_reg_rtx (V4DImode);
23788 t2 = gen_reg_rtx (V4DImode);
23789 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
23790 const0_rtx, const2_rtx,
23791 const1_rtx, GEN_INT (3)));
23792 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
23793 const0_rtx, const2_rtx,
23794 const1_rtx, GEN_INT (3)));
23795
23796 /* Shuffle the elements within the lanes. After this we
23797 have { A A B B | C C D D } or { E E F F | G G H H }. */
23798 t3 = gen_reg_rtx (V8SImode);
23799 t4 = gen_reg_rtx (V8SImode);
23800 mask = GEN_INT (high_p
23801 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
23802 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
23803 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
23804 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
23805
23806 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
23807 break;
23808
23809 case E_V8HImode:
23810 case E_V16HImode:
23811 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
23812 uns_p, OPTAB_DIRECT);
23813 t2 = expand_binop (mode,
23814 uns_p ? umul_highpart_optab : smul_highpart_optab,
23815 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
23816 gcc_assert (t1 && t2);
23817
23818 t3 = gen_reg_rtx (mode);
23819 ix86_expand_vec_interleave (t3, t1, t2, high_p);
23820 emit_move_insn (dest, gen_lowpart (wmode, t3));
23821 break;
23822
23823 case E_V16QImode:
23824 case E_V32QImode:
23825 case E_V32HImode:
23826 case E_V16SImode:
23827 case E_V64QImode:
23828 t1 = gen_reg_rtx (wmode);
23829 t2 = gen_reg_rtx (wmode);
23830 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
23831 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
23832
23833 emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
23834 break;
23835
23836 default:
23837 gcc_unreachable ();
23838 }
23839 }
23840
23841 void
23842 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
23843 {
23844 rtx res_1, res_2, res_3, res_4;
23845
23846 res_1 = gen_reg_rtx (V4SImode);
23847 res_2 = gen_reg_rtx (V4SImode);
23848 res_3 = gen_reg_rtx (V2DImode);
23849 res_4 = gen_reg_rtx (V2DImode);
23850 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
23851 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
23852
23853 /* Move the results in element 2 down to element 1; we don't care
23854 what goes in elements 2 and 3. Then we can merge the parts
23855 back together with an interleave.
23856
23857 Note that two other sequences were tried:
23858 (1) Use interleaves at the start instead of psrldq, which allows
23859 us to use a single shufps to merge things back at the end.
23860 (2) Use shufps here to combine the two vectors, then pshufd to
23861 put the elements in the correct order.
23862 In both cases the cost of the reformatting stall was too high
23863 and the overall sequence slower. */
23864
23865 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
23866 const0_rtx, const2_rtx,
23867 const0_rtx, const0_rtx));
23868 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
23869 const0_rtx, const2_rtx,
23870 const0_rtx, const0_rtx));
23871 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
23872
23873 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
23874 }
23875
23876 void
23877 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
23878 {
23879 machine_mode mode = GET_MODE (op0);
23880 rtx t1, t2, t3, t4, t5, t6;
23881
23882 if (TARGET_AVX512DQ && mode == V8DImode)
23883 emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
23884 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
23885 emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
23886 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
23887 emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
23888 else if (TARGET_XOP && mode == V2DImode)
23889 {
23890 /* op1: A,B,C,D, op2: E,F,G,H */
23891 op1 = gen_lowpart (V4SImode, op1);
23892 op2 = gen_lowpart (V4SImode, op2);
23893
23894 t1 = gen_reg_rtx (V4SImode);
23895 t2 = gen_reg_rtx (V4SImode);
23896 t3 = gen_reg_rtx (V2DImode);
23897 t4 = gen_reg_rtx (V2DImode);
23898
23899 /* t1: B,A,D,C */
23900 emit_insn (gen_sse2_pshufd_1 (t1, op1,
23901 GEN_INT (1),
23902 GEN_INT (0),
23903 GEN_INT (3),
23904 GEN_INT (2)));
23905
23906 /* t2: (B*E),(A*F),(D*G),(C*H) */
23907 emit_insn (gen_mulv4si3 (t2, t1, op2));
23908
23909 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
23910 emit_insn (gen_xop_phadddq (t3, t2));
23911
23912 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
23913 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
23914
23915 /* Multiply the lower parts and add everything together. */
23916 t5 = gen_reg_rtx (V2DImode);
23917 emit_insn (gen_vec_widen_umult_even_v4si (t5,
23918 gen_lowpart (V4SImode, op1),
23919 gen_lowpart (V4SImode, op2)));
23920 force_expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
23921 }
23922 else
23923 {
23924 machine_mode nmode;
23925 rtx (*umul) (rtx, rtx, rtx);
23926
23927 if (mode == V2DImode)
23928 {
23929 umul = gen_vec_widen_umult_even_v4si;
23930 nmode = V4SImode;
23931 }
23932 else if (mode == V4DImode)
23933 {
23934 umul = gen_vec_widen_umult_even_v8si;
23935 nmode = V8SImode;
23936 }
23937 else if (mode == V8DImode)
23938 {
23939 umul = gen_vec_widen_umult_even_v16si;
23940 nmode = V16SImode;
23941 }
23942 else
23943 gcc_unreachable ();
23944
23945
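/* Explanatory note (added): this is the schoolbook split.  With
   A = 2^32 * a_hi + a_lo and B = 2^32 * b_hi + b_lo,
   A * B mod 2^64 = a_lo * b_lo + ((a_hi * b_lo + a_lo * b_hi) << 32),
   which only needs the widening unsigned even multiplies available
   in SSE2/AVX2/AVX-512.  */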
23946 /* Multiply low parts. */
23947 t1 = gen_reg_rtx (mode);
23948 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
23949
23950 /* Shift input vectors right 32 bits so we can multiply high parts. */
23951 t6 = GEN_INT (32);
23952 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
23953 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
23954
23955 /* Multiply high parts by low parts. */
23956 t4 = gen_reg_rtx (mode);
23957 t5 = gen_reg_rtx (mode);
23958 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
23959 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
23960
23961 /* Combine and shift the highparts back. */
23962 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
23963 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
23964
23965 /* Combine high and low parts. */
23966 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
23967 }
23968
23969 set_unique_reg_note (get_last_insn (), REG_EQUAL,
23970 gen_rtx_MULT (mode, op1, op2));
23971 }
23972
23973 /* Return true if the control transfer instruction INSN
23974 should be encoded with the notrack prefix. */
23975
23976 bool
23977 ix86_notrack_prefixed_insn_p (rtx_insn *insn)
23978 {
23979 if (!insn || !((flag_cf_protection & CF_BRANCH)))
23980 return false;
23981
23982 if (CALL_P (insn))
23983 {
23984 rtx call = get_call_rtx_from (insn);
23985 gcc_assert (call != NULL_RTX);
23986 rtx addr = XEXP (call, 0);
23987
23988 /* Do not emit 'notrack' if it's not an indirect call. */
23989 if (MEM_P (addr)
23990 && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
23991 return false;
23992 else
23993 return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
23994 }
23995
23996 if (JUMP_P (insn) && !flag_cet_switch)
23997 {
23998 rtx target = JUMP_LABEL (insn);
23999 if (target == NULL_RTX || ANY_RETURN_P (target))
24000 return false;
24001
24002 /* Check the jump is a switch table. */
24003 rtx_insn *label = as_a<rtx_insn *> (target);
24004 rtx_insn *table = next_insn (label);
24005 if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
24006 return false;
24007 else
24008 return true;
24009 }
24010 return false;
24011 }
24012
24013 /* Calculate integer abs() using only SSE2 instructions. */
24014
24015 void
24016 ix86_expand_sse2_abs (rtx target, rtx input)
24017 {
24018 machine_mode mode = GET_MODE (target);
24019 rtx tmp0, tmp1, x;
24020
24021 switch (mode)
24022 {
24023 case E_V2DImode:
24024 case E_V4DImode:
24025 /* For 64-bit signed integer X, with SSE4.2 use
24026 pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
24027 Otherwise handle it similarly to V4SImode, except use 64 as W instead of
24028 32, and since a 64-bit arithmetic right shift is not available, build
24029 the sign mask with a logical right shift followed by a negation. */
24030 if (TARGET_SSE4_2)
24031 {
24032 tmp0 = gen_reg_rtx (mode);
24033 tmp1 = gen_reg_rtx (mode);
24034 emit_move_insn (tmp1, CONST0_RTX (mode));
24035 if (mode == E_V2DImode)
24036 emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input));
24037 else
24038 emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input));
24039 }
24040 else
24041 {
24042 tmp0 = expand_simple_binop (mode, LSHIFTRT, input,
24043 GEN_INT (GET_MODE_UNIT_BITSIZE (mode)
24044 - 1), NULL, 0, OPTAB_DIRECT);
24045 tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false);
24046 }
24047
24048 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
24049 NULL, 0, OPTAB_DIRECT);
24050 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
24051 target, 0, OPTAB_DIRECT);
24052 break;
24053
24054 case E_V4SImode:
24055 /* For 32-bit signed integer X, the best way to calculate the absolute
24056 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
24057 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
24058 GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
24059 NULL, 0, OPTAB_DIRECT);
24060 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
24061 NULL, 0, OPTAB_DIRECT);
24062 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
24063 target, 0, OPTAB_DIRECT);
24064 break;
24065
24066 case E_V8HImode:
24067 /* For 16-bit signed integer X, the best way to calculate the absolute
24068 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
24069 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
24070
24071 x = expand_simple_binop (mode, SMAX, tmp0, input,
24072 target, 0, OPTAB_DIRECT);
24073 break;
24074
24075 case E_V16QImode:
24076 /* For 8-bit signed integer X, the best way to calculate the absolute
24077 value of X is min ((unsigned char) X, (unsigned char) (-X)),
24078 as SSE2 provides the PMINUB insn. */
24079 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
24080
24081 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
24082 target, 0, OPTAB_DIRECT);
24083 break;
24084
24085 default:
24086 gcc_unreachable ();
24087 }
24088
24089 if (x != target)
24090 emit_move_insn (target, x);
24091 }
24092
24093 /* Expand an extract from a vector register through pextr insn.
24094 Return true if successful. */
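/* For example (illustration only): extracting a 16-bit value at bit
   position 32 from a V8HImode register uses element index
   pos / size == 2, i.e. a single pextrw with immediate 2.  */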
24095
24096 bool
24097 ix86_expand_pextr (rtx *operands)
24098 {
24099 rtx dst = operands[0];
24100 rtx src = operands[1];
24101
24102 unsigned int size = INTVAL (operands[2]);
24103 unsigned int pos = INTVAL (operands[3]);
24104
24105 if (SUBREG_P (dst))
24106 {
24107 /* Reject non-lowpart subregs. */
24108 if (SUBREG_BYTE (dst) > 0)
24109 return false;
24110 dst = SUBREG_REG (dst);
24111 }
24112
24113 if (SUBREG_P (src))
24114 {
24115 pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
24116 src = SUBREG_REG (src);
24117 }
24118
24119 switch (GET_MODE (src))
24120 {
24121 case E_V16QImode:
24122 case E_V8HImode:
24123 case E_V4SImode:
24124 case E_V2DImode:
24125 case E_V1TImode:
24126 {
24127 machine_mode srcmode, dstmode;
24128 rtx d, pat;
24129
24130 if (!int_mode_for_size (size, 0).exists (&dstmode))
24131 return false;
24132
24133 switch (dstmode)
24134 {
24135 case E_QImode:
24136 if (!TARGET_SSE4_1)
24137 return false;
24138 srcmode = V16QImode;
24139 break;
24140
24141 case E_HImode:
24142 if (!TARGET_SSE2)
24143 return false;
24144 srcmode = V8HImode;
24145 break;
24146
24147 case E_SImode:
24148 if (!TARGET_SSE4_1)
24149 return false;
24150 srcmode = V4SImode;
24151 break;
24152
24153 case E_DImode:
24154 gcc_assert (TARGET_64BIT);
24155 if (!TARGET_SSE4_1)
24156 return false;
24157 srcmode = V2DImode;
24158 break;
24159
24160 default:
24161 return false;
24162 }
24163
24164 /* Reject extractions from misaligned positions. */
24165 if (pos & (size-1))
24166 return false;
24167
24168 if (GET_MODE (dst) == dstmode)
24169 d = dst;
24170 else
24171 d = gen_reg_rtx (dstmode);
24172
24173 /* Construct insn pattern. */
24174 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
24175 pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
24176
24177 /* Let the rtl optimizers know about the zero extension performed. */
24178 if (dstmode == QImode || dstmode == HImode)
24179 {
24180 pat = gen_rtx_ZERO_EXTEND (SImode, pat);
24181 d = gen_lowpart (SImode, d);
24182 }
24183
24184 emit_insn (gen_rtx_SET (d, pat));
24185
24186 if (d != dst)
24187 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
24188 return true;
24189 }
24190
24191 default:
24192 return false;
24193 }
24194 }
24195
24196 /* Expand an insert into a vector register through pinsr insn.
24197 Return true if successful. */
24198
24199 bool
24200 ix86_expand_pinsr (rtx *operands)
24201 {
24202 rtx dst = operands[0];
24203 rtx src = operands[3];
24204
24205 unsigned int size = INTVAL (operands[1]);
24206 unsigned int pos = INTVAL (operands[2]);
24207
24208 if (SUBREG_P (dst))
24209 {
24210 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
24211 dst = SUBREG_REG (dst);
24212 }
24213
24214 switch (GET_MODE (dst))
24215 {
24216 case E_V16QImode:
24217 case E_V8HImode:
24218 case E_V4SImode:
24219 case E_V2DImode:
24220 case E_V1TImode:
24221 {
24222 machine_mode srcmode, dstmode;
24223 rtx (*pinsr)(rtx, rtx, rtx, rtx);
24224 rtx d;
24225
24226 if (!int_mode_for_size (size, 0).exists (&srcmode))
24227 return false;
24228
24229 switch (srcmode)
24230 {
24231 case E_QImode:
24232 if (!TARGET_SSE4_1)
24233 return false;
24234 dstmode = V16QImode;
24235 pinsr = gen_sse4_1_pinsrb;
24236 break;
24237
24238 case E_HImode:
24239 if (!TARGET_SSE2)
24240 return false;
24241 dstmode = V8HImode;
24242 pinsr = gen_sse2_pinsrw;
24243 break;
24244
24245 case E_SImode:
24246 if (!TARGET_SSE4_1)
24247 return false;
24248 dstmode = V4SImode;
24249 pinsr = gen_sse4_1_pinsrd;
24250 break;
24251
24252 case E_DImode:
24253 gcc_assert (TARGET_64BIT);
24254 if (!TARGET_SSE4_1)
24255 return false;
24256 dstmode = V2DImode;
24257 pinsr = gen_sse4_1_pinsrq;
24258 break;
24259
24260 default:
24261 return false;
24262 }
24263
24264 /* Reject insertions to misaligned positions. */
24265 if (pos & (size-1))
24266 return false;
24267
24268 if (SUBREG_P (src))
24269 {
24270 unsigned int srcpos = SUBREG_BYTE (src);
24271
24272 if (srcpos > 0)
24273 {
24274 rtx extr_ops[4];
24275
24276 extr_ops[0] = gen_reg_rtx (srcmode);
24277 extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
24278 extr_ops[2] = GEN_INT (size);
24279 extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
24280
24281 if (!ix86_expand_pextr (extr_ops))
24282 return false;
24283
24284 src = extr_ops[0];
24285 }
24286 else
24287 src = gen_lowpart (srcmode, SUBREG_REG (src));
24288 }
24289
24290 if (GET_MODE (dst) == dstmode)
24291 d = dst;
24292 else
24293 d = gen_reg_rtx (dstmode);
24294
24295 emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
24296 gen_lowpart (srcmode, src),
24297 GEN_INT (1 << (pos / size))));
24298 if (d != dst)
24299 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
24300 return true;
24301 }
24302
24303 default:
24304 return false;
24305 }
24306 }
24307
24308 /* All CPUs prefer to avoid cross-lane operations, so perform reductions
24309 of the upper half against the lower half down to SSE register size. */
24310
24311 machine_mode
24312 ix86_split_reduction (machine_mode mode)
24313 {
24314 /* Reduce lowpart against highpart until we reach SSE reg width to
24315 avoid cross-lane operations. */
24316 switch (mode)
24317 {
24318 case E_V8DImode:
24319 case E_V4DImode:
24320 return V2DImode;
24321 case E_V16SImode:
24322 case E_V8SImode:
24323 return V4SImode;
24324 case E_V32HImode:
24325 case E_V16HImode:
24326 return V8HImode;
24327 case E_V64QImode:
24328 case E_V32QImode:
24329 return V16QImode;
24330 case E_V16SFmode:
24331 case E_V8SFmode:
24332 return V4SFmode;
24333 case E_V8DFmode:
24334 case E_V4DFmode:
24335 return V2DFmode;
24336 default:
24337 return mode;
24338 }
24339 }
24340
24341 /* Generate call to __divmoddi4. */
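/* For the DImode case the libcall has the C prototype (noted here for
   clarity)
     long long __divmoddi4 (long long a, long long b, long long *rem);
   so the quotient is the return value and the remainder is stored
   through the address of the stack slot passed as the last argument.  */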
24342
24343 void
24344 ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
24345 rtx op0, rtx op1,
24346 rtx *quot_p, rtx *rem_p)
24347 {
24348 rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
24349
24350 rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
24351 mode, op0, mode, op1, mode,
24352 XEXP (rem, 0), Pmode);
24353 *quot_p = quot;
24354 *rem_p = rem;
24355 }
24356
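/* Expand an atomic fetch-and-op (AFTER is false) or op-and-fetch (AFTER
   is true) of CODE on MEM with VAL into TARGET, using a compare-and-swap
   loop; TARGET receives the old memory value when AFTER is false and the
   newly computed value otherwise.  Roughly (illustrative sketch, not the
   exact RTL emitted):

     old = *mem;
     do
       new = old CODE val;
     while (!compare_and_swap (mem, &old, new));

   CODE == NOT is expanded as ~(old & val), and DOUBLEWORD selects the
   double-word cmpxchg path.  */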
24357 void
24358 ix86_expand_atomic_fetch_op_loop (rtx target, rtx mem, rtx val,
24359 enum rtx_code code, bool after,
24360 bool doubleword)
24361 {
24362 rtx old_reg, new_reg, old_mem, success;
24363 machine_mode mode = GET_MODE (target);
24364 rtx_code_label *loop_label = NULL;
24365
24366 old_reg = gen_reg_rtx (mode);
24367 new_reg = old_reg;
24368 old_mem = copy_to_reg (mem);
24369 loop_label = gen_label_rtx ();
24370 emit_label (loop_label);
24371 emit_move_insn (old_reg, old_mem);
24372
24373 /* return value for atomic_fetch_op. */
24374 if (!after)
24375 emit_move_insn (target, old_reg);
24376
24377 if (code == NOT)
24378 {
24379 new_reg = expand_simple_binop (mode, AND, new_reg, val, NULL_RTX,
24380 true, OPTAB_LIB_WIDEN);
24381 new_reg = expand_simple_unop (mode, code, new_reg, NULL_RTX, true);
24382 }
24383 else
24384 new_reg = expand_simple_binop (mode, code, new_reg, val, NULL_RTX,
24385 true, OPTAB_LIB_WIDEN);
24386
24387 /* return value for atomic_op_fetch. */
24388 if (after)
24389 emit_move_insn (target, new_reg);
24390
24391 success = NULL_RTX;
24392
24393 ix86_expand_cmpxchg_loop (&success, old_mem, mem, old_reg, new_reg,
24394 gen_int_mode (MEMMODEL_SYNC_SEQ_CST,
24395 SImode),
24396 doubleword, loop_label);
24397 }
24398
24399 /* Relax the cmpxchg instruction. The parameter LOOP_LABEL indicates
24400 whether the instruction should be relaxed with a pause loop. If not,
24401 it is relaxed to an atomic load + compare, and the cmpxchg
24402 instruction is skipped when mem != exp_input. */
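/* Rough shape of the emitted sequence (illustrative sketch only):

     tmp = atomic_load (mem);
     if (tmp != exp_input)
       goto fail;
     cmpxchg (mem, exp_input, new_input);
     ...
   fail:
     target_val = tmp;

   with an additional pause and branch back to LOOP_LABEL in the failure
   path when LOOP_LABEL is given.  */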
24403
24404 void
24405 ix86_expand_cmpxchg_loop (rtx *ptarget_bool, rtx target_val,
24406 rtx mem, rtx exp_input, rtx new_input,
24407 rtx mem_model, bool doubleword,
24408 rtx_code_label *loop_label)
24409 {
24410 rtx_code_label *cmp_label = NULL;
24411 rtx_code_label *done_label = NULL;
24412 rtx target_bool = NULL_RTX, new_mem = NULL_RTX;
24413 rtx (*gen) (rtx, rtx, rtx, rtx, rtx) = NULL;
24414 rtx (*gendw) (rtx, rtx, rtx, rtx, rtx, rtx) = NULL;
24415 machine_mode mode = GET_MODE (target_val), hmode = mode;
24416
24417 if (*ptarget_bool == NULL)
24418 target_bool = gen_reg_rtx (QImode);
24419 else
24420 target_bool = *ptarget_bool;
24421
24422 cmp_label = gen_label_rtx ();
24423 done_label = gen_label_rtx ();
24424
24425 new_mem = gen_reg_rtx (mode);
24426 /* Load memory first. */
24427 expand_atomic_load (new_mem, mem, MEMMODEL_SEQ_CST);
24428
24429 switch (mode)
24430 {
24431 case E_TImode:
24432 gendw = gen_atomic_compare_and_swapti_doubleword;
24433 hmode = DImode;
24434 break;
24435 case E_DImode:
24436 if (doubleword)
24437 {
24438 gendw = gen_atomic_compare_and_swapdi_doubleword;
24439 hmode = SImode;
24440 }
24441 else
24442 gen = gen_atomic_compare_and_swapdi_1;
24443 break;
24444 case E_SImode:
24445 gen = gen_atomic_compare_and_swapsi_1;
24446 break;
24447 case E_HImode:
24448 gen = gen_atomic_compare_and_swaphi_1;
24449 break;
24450 case E_QImode:
24451 gen = gen_atomic_compare_and_swapqi_1;
24452 break;
24453 default:
24454 gcc_unreachable ();
24455 }
24456
24457 /* Compare mem value with expected value. */
24458 if (doubleword)
24459 {
24460 rtx low_new_mem = gen_lowpart (hmode, new_mem);
24461 rtx low_exp_input = gen_lowpart (hmode, exp_input);
24462 rtx high_new_mem = gen_highpart (hmode, new_mem);
24463 rtx high_exp_input = gen_highpart (hmode, exp_input);
24464 emit_cmp_and_jump_insns (low_new_mem, low_exp_input, NE, NULL_RTX,
24465 hmode, 1, cmp_label,
24466 profile_probability::guessed_never ());
24467 emit_cmp_and_jump_insns (high_new_mem, high_exp_input, NE, NULL_RTX,
24468 hmode, 1, cmp_label,
24469 profile_probability::guessed_never ());
24470 }
24471 else
24472 emit_cmp_and_jump_insns (new_mem, exp_input, NE, NULL_RTX,
24473 GET_MODE (exp_input), 1, cmp_label,
24474 profile_probability::guessed_never ());
24475
24476 /* Directly emit the cmpxchg here. */
24477 if (doubleword)
24478 emit_insn (gendw (target_val, mem, exp_input,
24479 gen_lowpart (hmode, new_input),
24480 gen_highpart (hmode, new_input),
24481 mem_model));
24482 else
24483 emit_insn (gen (target_val, mem, exp_input, new_input, mem_model));
24484
24485 if (!loop_label)
24486 {
24487 emit_jump_insn (gen_jump (done_label));
24488 emit_barrier ();
24489 emit_label (cmp_label);
24490 emit_move_insn (target_val, new_mem);
24491 emit_label (done_label);
24492 ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
24493 const0_rtx);
24494 }
24495 else
24496 {
24497 ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
24498 const0_rtx);
24499 emit_cmp_and_jump_insns (target_bool, const0_rtx, EQ, const0_rtx,
24500 GET_MODE (target_bool), 1, loop_label,
24501 profile_probability::guessed_never ());
24502 emit_jump_insn (gen_jump (done_label));
24503 emit_barrier ();
24504
24505 /* If mem does not hold the expected value, pause and loop back. */
24506 emit_label (cmp_label);
24507 emit_move_insn (target_val, new_mem);
24508 emit_insn (gen_pause ());
24509 emit_jump_insn (gen_jump (loop_label));
24510 emit_barrier ();
24511 emit_label (done_label);
24512 }
24513
24514 *ptarget_bool = target_bool;
24515 }
24516
24517 /* Convert a BFmode VAL to SFmode without signaling sNaNs.
24518 This is done by returning SF SUBREG of ((HI SUBREG) (VAL)) << 16. */
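/* For example (illustration only): the BFmode bit pattern 0x3f80, which
   is 1.0 in bfloat16, becomes 0x3f800000 after the 16-bit left shift,
   i.e. exactly 1.0f in SFmode.  */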
24519
24520 rtx
24521 ix86_expand_fast_convert_bf_to_sf (rtx val)
24522 {
24523 rtx op = gen_lowpart (HImode, val), ret;
24524 if (CONST_INT_P (op))
24525 {
24526 ret = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
24527 val, BFmode);
24528 if (ret)
24529 return ret;
24530 /* FLOAT_EXTEND simplification will fail if VAL is a sNaN. */
24531 ret = gen_reg_rtx (SImode);
24532 emit_move_insn (ret, GEN_INT (INTVAL (op) & 0xffff));
24533 emit_insn (gen_ashlsi3 (ret, ret, GEN_INT (16)));
24534 return gen_lowpart (SFmode, ret);
24535 }
24536
24537 ret = gen_reg_rtx (SFmode);
24538 emit_insn (gen_extendbfsf2_1 (ret, force_reg (BFmode, val)));
24539 return ret;
24540 }
24541
24542 #include "gt-i386-expand.h"