1 /* Copyright (C) 1988-2022 Free Software Foundation, Inc.
2
3 This file is part of GCC.
4
5 GCC is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
9
10 GCC is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with GCC; see the file COPYING3. If not see
17 <http://www.gnu.org/licenses/>. */
18
19 #define IN_TARGET_CODE 1
20
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "backend.h"
25 #include "rtl.h"
26 #include "tree.h"
27 #include "memmodel.h"
28 #include "gimple.h"
29 #include "cfghooks.h"
30 #include "cfgloop.h"
31 #include "df.h"
32 #include "tm_p.h"
33 #include "stringpool.h"
34 #include "expmed.h"
35 #include "optabs.h"
36 #include "regs.h"
37 #include "emit-rtl.h"
38 #include "recog.h"
39 #include "cgraph.h"
40 #include "diagnostic.h"
41 #include "cfgbuild.h"
42 #include "alias.h"
43 #include "fold-const.h"
44 #include "attribs.h"
45 #include "calls.h"
46 #include "stor-layout.h"
47 #include "varasm.h"
48 #include "output.h"
49 #include "insn-attr.h"
50 #include "flags.h"
51 #include "except.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "cfgrtl.h"
55 #include "common/common-target.h"
56 #include "langhooks.h"
57 #include "reload.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "tm-constrs.h"
61 #include "cselib.h"
62 #include "sched-int.h"
63 #include "opts.h"
64 #include "tree-pass.h"
65 #include "context.h"
66 #include "pass_manager.h"
67 #include "target-globals.h"
68 #include "gimple-iterator.h"
69 #include "tree-vectorizer.h"
70 #include "shrink-wrap.h"
71 #include "builtins.h"
72 #include "rtl-iter.h"
73 #include "tree-iterator.h"
74 #include "dbgcnt.h"
75 #include "case-cfn-macros.h"
76 #include "dojump.h"
77 #include "fold-const-call.h"
78 #include "tree-vrp.h"
79 #include "tree-ssanames.h"
80 #include "selftest.h"
81 #include "selftest-rtl.h"
82 #include "print-rtl.h"
83 #include "intl.h"
84 #include "ifcvt.h"
85 #include "symbol-summary.h"
86 #include "ipa-prop.h"
87 #include "ipa-fnsummary.h"
88 #include "wide-int-bitmask.h"
89 #include "tree-vector-builder.h"
90 #include "debug.h"
91 #include "dwarf2out.h"
92 #include "i386-options.h"
93 #include "i386-builtins.h"
94 #include "i386-expand.h"
95
96 /* Split one or more double-mode RTL references into pairs of half-mode
97 references. The RTL can be REG, offsettable MEM, integer constant, or
98 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
99 split and "num" is its length. lo_half and hi_half are output arrays
100 that parallel "operands". */
101
102 void
103 split_double_mode (machine_mode mode, rtx operands[],
104 int num, rtx lo_half[], rtx hi_half[])
105 {
106 machine_mode half_mode;
107 unsigned int byte;
108 rtx mem_op = NULL_RTX;
109 int mem_num = 0;
110
111 switch (mode)
112 {
113 case E_TImode:
114 half_mode = DImode;
115 break;
116 case E_DImode:
117 half_mode = SImode;
118 break;
119 case E_P2HImode:
120 half_mode = HImode;
121 break;
122 case E_P2QImode:
123 half_mode = QImode;
124 break;
125 default:
126 gcc_unreachable ();
127 }
128
129 byte = GET_MODE_SIZE (half_mode);
130
131 while (num--)
132 {
133 rtx op = operands[num];
134
135 /* simplify_subreg refuses to split volatile memory addresses,
136 but we still have to handle them. */
137 if (MEM_P (op))
138 {
139 if (mem_op && rtx_equal_p (op, mem_op))
140 {
141 lo_half[num] = lo_half[mem_num];
142 hi_half[num] = hi_half[mem_num];
143 }
144 else
145 {
146 mem_op = op;
147 mem_num = num;
148 lo_half[num] = adjust_address (op, half_mode, 0);
149 hi_half[num] = adjust_address (op, half_mode, byte);
150 }
151 }
152 else
153 {
154 lo_half[num] = simplify_gen_subreg (half_mode, op,
155 GET_MODE (op) == VOIDmode
156 ? mode : GET_MODE (op), 0);
157
158 rtx tmp = simplify_gen_subreg (half_mode, op,
159 GET_MODE (op) == VOIDmode
160 ? mode : GET_MODE (op), byte);
161 /* simplify_gen_subreg will return NULL RTX for the
162 high half of the paradoxical subreg. */
163 hi_half[num] = tmp ? tmp : gen_reg_rtx (half_mode);
164 }
165 }
166 }
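/* For illustration, a minimal usage sketch of split_double_mode: splitting
   a single DImode operand on a 32-bit target,

     rtx lo[1], hi[1];
     split_double_mode (DImode, &operands[0], 1, lo, hi);

   leaves lo[0] and hi[0] referring to the low and high SImode halves of
   operands[0]; an offsettable MEM is split with adjust_address, while a
   REG or constant goes through simplify_gen_subreg.  */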
167
168 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
169 for the target. */
170
171 void
172 ix86_expand_clear (rtx dest)
173 {
174 rtx tmp;
175
176 /* We play register width games, which are only valid after reload. */
177 gcc_assert (reload_completed);
178
179 /* Avoid HImode and its attendant prefix byte. */
180 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
181 dest = gen_rtx_REG (SImode, REGNO (dest));
182 tmp = gen_rtx_SET (dest, const0_rtx);
183
184 if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
185 {
186 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
187 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
188 }
189
190 emit_insn (tmp);
191 }
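/* Concretely, clearing %eax with ix86_expand_clear normally gives the
   flag-clobbering form

	xorl	%eax, %eax

   while on a TARGET_USE_MOV0 tuning (and not optimizing for size) the
   flags are preserved with

	movl	$0, %eax  */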
192
193 /* Return true if V can be broadcast from an integer of WIDTH bits,
194 which is returned in VAL_BROADCAST. Otherwise, return false. */
195
196 static bool
197 ix86_broadcast (HOST_WIDE_INT v, unsigned int width,
198 HOST_WIDE_INT &val_broadcast)
199 {
200 wide_int val = wi::uhwi (v, HOST_BITS_PER_WIDE_INT);
201 val_broadcast = wi::extract_uhwi (val, 0, width);
202 for (unsigned int i = width; i < HOST_BITS_PER_WIDE_INT; i += width)
203 {
204 HOST_WIDE_INT each = wi::extract_uhwi (val, i, width);
205 if (val_broadcast != each)
206 return false;
207 }
208 val_broadcast = sext_hwi (val_broadcast, width);
209 return true;
210 }
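/* A numeric sketch of ix86_broadcast: the value 0x4242424242424242 can be
   broadcast from an 8-bit chunk, so a WIDTH of 8 returns true with
   VAL_BROADCAST == 0x42, whereas 0x0000000100000001 only succeeds for a
   WIDTH of 32 (or 64).  */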
211
212 /* Convert the CONST_WIDE_INT operand OP to a broadcast in MODE. */
213
214 static rtx
215 ix86_convert_const_wide_int_to_broadcast (machine_mode mode, rtx op)
216 {
217 /* Don't use integer vector broadcast if we can't move from GPR to SSE
218 register directly. */
219 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
220 return nullptr;
221
222 /* Convert CONST_WIDE_INT to a non-standard SSE constant integer
223 broadcast only if vector broadcast is available. */
224 if (!TARGET_AVX
225 || !CONST_WIDE_INT_P (op)
226 || standard_sse_constant_p (op, mode))
227 return nullptr;
228
229 HOST_WIDE_INT val = CONST_WIDE_INT_ELT (op, 0);
230 HOST_WIDE_INT val_broadcast;
231 scalar_int_mode broadcast_mode;
232 if (TARGET_AVX2
233 && ix86_broadcast (val, GET_MODE_BITSIZE (QImode),
234 val_broadcast))
235 broadcast_mode = QImode;
236 else if (TARGET_AVX2
237 && ix86_broadcast (val, GET_MODE_BITSIZE (HImode),
238 val_broadcast))
239 broadcast_mode = HImode;
240 else if (ix86_broadcast (val, GET_MODE_BITSIZE (SImode),
241 val_broadcast))
242 broadcast_mode = SImode;
243 else if (TARGET_64BIT
244 && ix86_broadcast (val, GET_MODE_BITSIZE (DImode),
245 val_broadcast))
246 broadcast_mode = DImode;
247 else
248 return nullptr;
249
250 /* Check if OP can be broadcasted from VAL. */
251 for (int i = 1; i < CONST_WIDE_INT_NUNITS (op); i++)
252 if (val != CONST_WIDE_INT_ELT (op, i))
253 return nullptr;
254
255 unsigned int nunits = (GET_MODE_SIZE (mode)
256 / GET_MODE_SIZE (broadcast_mode));
257 machine_mode vector_mode;
258 if (!mode_for_vector (broadcast_mode, nunits).exists (&vector_mode))
259 gcc_unreachable ();
260 rtx target = ix86_gen_scratch_sse_rtx (vector_mode);
261 bool ok = ix86_expand_vector_init_duplicate (false, vector_mode,
262 target,
263 GEN_INT (val_broadcast));
264 gcc_assert (ok);
265 target = lowpart_subreg (mode, target, vector_mode);
266 return target;
267 }
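/* As a sketch of the effect: a CONST_WIDE_INT such as a TImode value whose
   64-bit elements are both 0x2020202020202020 is materialized here as a
   QImode broadcast of 0x20 into a vector scratch register (on AVX2 this
   ends up as a byte broadcast such as vpbroadcastb) and the result is
   returned as a lowpart subreg in MODE, instead of loading the constant
   from memory.  */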
268
269 void
270 ix86_expand_move (machine_mode mode, rtx operands[])
271 {
272 rtx op0, op1;
273 rtx tmp, addend = NULL_RTX;
274 enum tls_model model;
275
276 op0 = operands[0];
277 op1 = operands[1];
278
279 /* Avoid complex sets of likely spilled hard registers before reload. */
280 if (!ix86_hardreg_mov_ok (op0, op1))
281 {
282 tmp = gen_reg_rtx (mode);
283 operands[0] = tmp;
284 ix86_expand_move (mode, operands);
285 operands[0] = op0;
286 operands[1] = tmp;
287 op1 = tmp;
288 }
289
290 switch (GET_CODE (op1))
291 {
292 case CONST:
293 tmp = XEXP (op1, 0);
294
295 if (GET_CODE (tmp) != PLUS
296 || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
297 break;
298
299 op1 = XEXP (tmp, 0);
300 addend = XEXP (tmp, 1);
301 /* FALLTHRU */
302
303 case SYMBOL_REF:
304 model = SYMBOL_REF_TLS_MODEL (op1);
305
306 if (model)
307 op1 = legitimize_tls_address (op1, model, true);
308 else if (ix86_force_load_from_GOT_p (op1))
309 {
310 /* Load the external function address via GOT slot to avoid PLT. */
311 op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
312 (TARGET_64BIT
313 ? UNSPEC_GOTPCREL
314 : UNSPEC_GOT));
315 op1 = gen_rtx_CONST (Pmode, op1);
316 op1 = gen_const_mem (Pmode, op1);
317 set_mem_alias_set (op1, ix86_GOT_alias_set ());
318 }
319 else
320 {
321 tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
322 if (tmp)
323 {
324 op1 = tmp;
325 if (!addend)
326 break;
327 }
328 else
329 {
330 op1 = operands[1];
331 break;
332 }
333 }
334
335 if (addend)
336 {
337 op1 = force_operand (op1, NULL_RTX);
338 op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
339 op0, 1, OPTAB_DIRECT);
340 }
341 else
342 op1 = force_operand (op1, op0);
343
344 if (op1 == op0)
345 return;
346
347 op1 = convert_to_mode (mode, op1, 1);
348
349 default:
350 break;
351 }
352
353 if ((flag_pic || MACHOPIC_INDIRECT)
354 && symbolic_operand (op1, mode))
355 {
356 if (TARGET_MACHO && !TARGET_64BIT)
357 {
358 #if TARGET_MACHO
359 /* dynamic-no-pic */
360 if (MACHOPIC_INDIRECT)
361 {
362 rtx temp = (op0 && REG_P (op0) && mode == Pmode)
363 ? op0 : gen_reg_rtx (Pmode);
364 op1 = machopic_indirect_data_reference (op1, temp);
365 if (MACHOPIC_PURE)
366 op1 = machopic_legitimize_pic_address (op1, mode,
367 temp == op1 ? 0 : temp);
368 }
369 if (op0 != op1 && GET_CODE (op0) != MEM)
370 {
371 rtx insn = gen_rtx_SET (op0, op1);
372 emit_insn (insn);
373 return;
374 }
375 if (GET_CODE (op0) == MEM)
376 op1 = force_reg (Pmode, op1);
377 else
378 {
379 rtx temp = op0;
380 if (GET_CODE (temp) != REG)
381 temp = gen_reg_rtx (Pmode);
382 temp = legitimize_pic_address (op1, temp);
383 if (temp == op0)
384 return;
385 op1 = temp;
386 }
387 /* dynamic-no-pic */
388 #endif
389 }
390 else
391 {
392 if (MEM_P (op0))
393 op1 = force_reg (mode, op1);
394 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
395 {
396 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
397 op1 = legitimize_pic_address (op1, reg);
398 if (op0 == op1)
399 return;
400 op1 = convert_to_mode (mode, op1, 1);
401 }
402 }
403 }
404 else
405 {
406 if (MEM_P (op0)
407 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
408 || !push_operand (op0, mode))
409 && MEM_P (op1))
410 op1 = force_reg (mode, op1);
411
412 if (push_operand (op0, mode)
413 && ! general_no_elim_operand (op1, mode))
414 op1 = copy_to_mode_reg (mode, op1);
415
416 /* Force large constants in 64-bit compilation into a register
417 so that they get CSEd. */
418 if (can_create_pseudo_p ()
419 && (mode == DImode) && TARGET_64BIT
420 && immediate_operand (op1, mode)
421 && !x86_64_zext_immediate_operand (op1, VOIDmode)
422 && !register_operand (op0, mode)
423 && optimize)
424 op1 = copy_to_mode_reg (mode, op1);
425
426 if (can_create_pseudo_p ())
427 {
428 if (CONST_DOUBLE_P (op1))
429 {
430 /* If we are loading a floating point constant to a
431 register, force the value to memory now, since we'll
432 get better code out of the back end. */
433
434 op1 = validize_mem (force_const_mem (mode, op1));
435 if (!register_operand (op0, mode))
436 {
437 rtx temp = gen_reg_rtx (mode);
438 emit_insn (gen_rtx_SET (temp, op1));
439 emit_move_insn (op0, temp);
440 return;
441 }
442 }
443 else if (GET_MODE_SIZE (mode) >= 16)
444 {
445 rtx tmp = ix86_convert_const_wide_int_to_broadcast
446 (GET_MODE (op0), op1);
447 if (tmp != nullptr)
448 op1 = tmp;
449 }
450 }
451 }
452
453 emit_insn (gen_rtx_SET (op0, op1));
454 }
455
456 /* OP is a memref of a CONST_VECTOR; return the scalar constant
457 if the CONST_VECTOR is a vec_duplicate, else return NULL. */
458 static rtx
459 ix86_broadcast_from_constant (machine_mode mode, rtx op)
460 {
461 int nunits = GET_MODE_NUNITS (mode);
462 if (nunits < 2)
463 return nullptr;
464
465 /* Don't use integer vector broadcast if we can't move from GPR to SSE
466 register directly. */
467 if (!TARGET_INTER_UNIT_MOVES_TO_VEC
468 && INTEGRAL_MODE_P (mode))
469 return nullptr;
470
471 /* Convert CONST_VECTOR to a non-standard SSE constant integer
472 broadcast only if vector broadcast is available. */
473 if (!(TARGET_AVX2
474 || (TARGET_AVX
475 && (GET_MODE_INNER (mode) == SImode
476 || GET_MODE_INNER (mode) == DImode))
477 || FLOAT_MODE_P (mode))
478 || standard_sse_constant_p (op, mode))
479 return nullptr;
480
481 /* Don't broadcast from a 64-bit integer constant in 32-bit mode.
482 We can still put a 64-bit integer constant in memory when
483 AVX512 embedded broadcast is available. */
484 if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT
485 && (!TARGET_AVX512F
486 || (GET_MODE_SIZE (mode) < 64 && !TARGET_AVX512VL)))
487 return nullptr;
488
489 if (GET_MODE_INNER (mode) == TImode)
490 return nullptr;
491
492 rtx constant = get_pool_constant (XEXP (op, 0));
493 if (GET_CODE (constant) != CONST_VECTOR)
494 return nullptr;
495
496 /* There could be some rtx like
497 (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
498 but with "*.LC1" referring to a V2DI constant vector. */
499 if (GET_MODE (constant) != mode)
500 {
501 constant = simplify_subreg (mode, constant, GET_MODE (constant),
502 0);
503 if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR)
504 return nullptr;
505 }
506
507 rtx first = XVECEXP (constant, 0, 0);
508
509 for (int i = 1; i < nunits; ++i)
510 {
511 rtx tmp = XVECEXP (constant, 0, i);
512 /* Vector duplicate value. */
513 if (!rtx_equal_p (tmp, first))
514 return nullptr;
515 }
516
517 return first;
518 }
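/* Sketch of what ix86_broadcast_from_constant recognizes: for a load of
   the pool constant { 7, 7, 7, 7 } in V4SImode it returns the scalar
   (const_int 7), so the caller can emit a broadcast of that single element
   rather than a full vector load from the constant pool.  */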
519
520 void
521 ix86_expand_vector_move (machine_mode mode, rtx operands[])
522 {
523 rtx op0 = operands[0], op1 = operands[1];
524 /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for the IA MCU
525 psABI, since the biggest alignment there is 4 bytes. */
526 unsigned int align = (TARGET_IAMCU
527 ? GET_MODE_BITSIZE (mode)
528 : GET_MODE_ALIGNMENT (mode));
529
530 if (push_operand (op0, VOIDmode))
531 op0 = emit_move_resolve_push (mode, op0);
532
533 /* Force constants other than zero into memory. We do not know how
534 the instructions used to build constants modify the upper 64 bits
535 of the register; once we have that information we may be able
536 to handle some of them more efficiently. */
537 if (can_create_pseudo_p ()
538 && (CONSTANT_P (op1)
539 || (SUBREG_P (op1)
540 && CONSTANT_P (SUBREG_REG (op1))))
541 && ((register_operand (op0, mode)
542 && !standard_sse_constant_p (op1, mode))
543 /* ix86_expand_vector_move_misalign() does not like constants. */
544 || (SSE_REG_MODE_P (mode)
545 && MEM_P (op0)
546 && MEM_ALIGN (op0) < align)))
547 {
548 if (SUBREG_P (op1))
549 {
550 machine_mode imode = GET_MODE (SUBREG_REG (op1));
551 rtx r = force_const_mem (imode, SUBREG_REG (op1));
552 if (r)
553 r = validize_mem (r);
554 else
555 r = force_reg (imode, SUBREG_REG (op1));
556 op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
557 }
558 else
559 {
560 machine_mode mode = GET_MODE (op0);
561 rtx tmp = ix86_convert_const_wide_int_to_broadcast
562 (mode, op1);
563 if (tmp == nullptr)
564 op1 = validize_mem (force_const_mem (mode, op1));
565 else
566 op1 = tmp;
567 }
568 }
569
570 if (can_create_pseudo_p ()
571 && GET_MODE_SIZE (mode) >= 16
572 && VECTOR_MODE_P (mode)
573 && (MEM_P (op1)
574 && SYMBOL_REF_P (XEXP (op1, 0))
575 && CONSTANT_POOL_ADDRESS_P (XEXP (op1, 0))))
576 {
577 rtx first = ix86_broadcast_from_constant (mode, op1);
578 if (first != nullptr)
579 {
580 /* Broadcast to XMM/YMM/ZMM register from an integer
581 constant or scalar mem. */
582 op1 = gen_reg_rtx (mode);
583 if (FLOAT_MODE_P (mode)
584 || (!TARGET_64BIT && GET_MODE_INNER (mode) == DImode))
585 first = force_const_mem (GET_MODE_INNER (mode), first);
586 bool ok = ix86_expand_vector_init_duplicate (false, mode,
587 op1, first);
588 gcc_assert (ok);
589 emit_move_insn (op0, op1);
590 return;
591 }
592 }
593
594 /* We need to check memory alignment for SSE modes since attributes
595 can make operands unaligned. */
596 if (can_create_pseudo_p ()
597 && SSE_REG_MODE_P (mode)
598 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
599 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
600 {
601 rtx tmp[2];
602
603 /* ix86_expand_vector_move_misalign() does not like both
604 arguments in memory. */
605 if (!register_operand (op0, mode)
606 && !register_operand (op1, mode))
607 {
608 rtx scratch = ix86_gen_scratch_sse_rtx (mode);
609 emit_move_insn (scratch, op1);
610 op1 = scratch;
611 }
612
613 tmp[0] = op0; tmp[1] = op1;
614 ix86_expand_vector_move_misalign (mode, tmp);
615 return;
616 }
617
618 /* Special case TImode to V1TImode conversions, via V2DI. */
619 if (mode == V1TImode
620 && SUBREG_P (op1)
621 && GET_MODE (SUBREG_REG (op1)) == TImode
622 && TARGET_64BIT && TARGET_SSE
623 && can_create_pseudo_p ())
624 {
625 rtx tmp = gen_reg_rtx (V2DImode);
626 rtx lo = gen_reg_rtx (DImode);
627 rtx hi = gen_reg_rtx (DImode);
628 emit_move_insn (lo, gen_lowpart (DImode, SUBREG_REG (op1)));
629 emit_move_insn (hi, gen_highpart (DImode, SUBREG_REG (op1)));
630 emit_insn (gen_vec_concatv2di (tmp, lo, hi));
631 emit_move_insn (op0, gen_lowpart (V1TImode, tmp));
632 return;
633 }
634
635 /* If operand0 is a hard register, make operand1 a pseudo. */
636 if (can_create_pseudo_p ()
637 && !ix86_hardreg_mov_ok (op0, op1))
638 {
639 rtx tmp = gen_reg_rtx (GET_MODE (op0));
640 emit_move_insn (tmp, op1);
641 emit_move_insn (op0, tmp);
642 return;
643 }
644
645 /* Make operand1 a register if it isn't already. */
646 if (can_create_pseudo_p ()
647 && !register_operand (op0, mode)
648 && !register_operand (op1, mode))
649 {
650 rtx tmp = ix86_gen_scratch_sse_rtx (GET_MODE (op0));
651 emit_move_insn (tmp, op1);
652 emit_move_insn (op0, tmp);
653 return;
654 }
655
656 emit_insn (gen_rtx_SET (op0, op1));
657 }
658
659 /* Split 32-byte AVX unaligned load and store if needed. */
660
661 static void
662 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
663 {
664 rtx m;
665 rtx (*extract) (rtx, rtx, rtx);
666 machine_mode mode;
667
668 if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
669 || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
670 {
671 emit_insn (gen_rtx_SET (op0, op1));
672 return;
673 }
674
675 rtx orig_op0 = NULL_RTX;
676 mode = GET_MODE (op0);
677 switch (GET_MODE_CLASS (mode))
678 {
679 case MODE_VECTOR_INT:
680 case MODE_INT:
681 if (mode != V32QImode)
682 {
683 if (!MEM_P (op0))
684 {
685 orig_op0 = op0;
686 op0 = gen_reg_rtx (V32QImode);
687 }
688 else
689 op0 = gen_lowpart (V32QImode, op0);
690 op1 = gen_lowpart (V32QImode, op1);
691 mode = V32QImode;
692 }
693 break;
694 case MODE_VECTOR_FLOAT:
695 break;
696 default:
697 gcc_unreachable ();
698 }
699
700 switch (mode)
701 {
702 default:
703 gcc_unreachable ();
704 case E_V32QImode:
705 extract = gen_avx_vextractf128v32qi;
706 mode = V16QImode;
707 break;
708 case E_V16HFmode:
709 extract = gen_avx_vextractf128v16hf;
710 mode = V8HFmode;
711 break;
712 case E_V8SFmode:
713 extract = gen_avx_vextractf128v8sf;
714 mode = V4SFmode;
715 break;
716 case E_V4DFmode:
717 extract = gen_avx_vextractf128v4df;
718 mode = V2DFmode;
719 break;
720 }
721
722 if (MEM_P (op1))
723 {
724 rtx r = gen_reg_rtx (mode);
725 m = adjust_address (op1, mode, 0);
726 emit_move_insn (r, m);
727 m = adjust_address (op1, mode, 16);
728 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
729 emit_move_insn (op0, r);
730 }
731 else if (MEM_P (op0))
732 {
733 m = adjust_address (op0, mode, 0);
734 emit_insn (extract (m, op1, const0_rtx));
735 m = adjust_address (op0, mode, 16);
736 emit_insn (extract (m, copy_rtx (op1), const1_rtx));
737 }
738 else
739 gcc_unreachable ();
740
741 if (orig_op0)
742 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
743 }
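/* Roughly, the split form of an unaligned 32-byte V8SFmode load is

	vmovups	mem, %xmm0
	vinsertf128	$1, mem+16, %ymm0, %ymm0

   and of a store

	vextractf128	$0, %ymm0, mem
	vextractf128	$1, %ymm0, mem+16

   where the index-0 extract typically assembles to a plain 16-byte move of
   the low half.  */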
744
745 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
746 straight to ix86_expand_vector_move. */
747 /* Code generation for scalar reg-reg moves of single and double precision data:
748 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
749 movaps reg, reg
750 else
751 movss reg, reg
752 if (x86_sse_partial_reg_dependency == true)
753 movapd reg, reg
754 else
755 movsd reg, reg
756
757 Code generation for scalar loads of double precision data:
758 if (x86_sse_split_regs == true)
759 movlpd mem, reg (gas syntax)
760 else
761 movsd mem, reg
762
763 Code generation for unaligned packed loads of single precision data
764 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
765 if (x86_sse_unaligned_move_optimal)
766 movups mem, reg
767
768 if (x86_sse_partial_reg_dependency == true)
769 {
770 xorps reg, reg
771 movlps mem, reg
772 movhps mem+8, reg
773 }
774 else
775 {
776 movlps mem, reg
777 movhps mem+8, reg
778 }
779
780 Code generation for unaligned packed loads of double precision data
781 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
782 if (x86_sse_unaligned_move_optimal)
783 movupd mem, reg
784
785 if (x86_sse_split_regs == true)
786 {
787 movlpd mem, reg
788 movhpd mem+8, reg
789 }
790 else
791 {
792 movsd mem, reg
793 movhpd mem+8, reg
794 }
795 */
796
797 void
798 ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
799 {
800 rtx op0, op1, m;
801
802 op0 = operands[0];
803 op1 = operands[1];
804
805 /* Use unaligned load/store for AVX512 or when optimizing for size. */
806 if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
807 {
808 emit_insn (gen_rtx_SET (op0, op1));
809 return;
810 }
811
812 if (TARGET_AVX)
813 {
814 if (GET_MODE_SIZE (mode) == 32)
815 ix86_avx256_split_vector_move_misalign (op0, op1);
816 else
817 /* Always use 128-bit mov<mode>_internal pattern for AVX. */
818 emit_insn (gen_rtx_SET (op0, op1));
819 return;
820 }
821
822 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
823 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
824 {
825 emit_insn (gen_rtx_SET (op0, op1));
826 return;
827 }
828
829 /* ??? If we have typed data, then it would appear that using
830 movdqu is the only way to get unaligned data loaded with
831 integer type. */
832 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
833 {
834 emit_insn (gen_rtx_SET (op0, op1));
835 return;
836 }
837
838 if (MEM_P (op1))
839 {
840 if (TARGET_SSE2 && mode == V2DFmode)
841 {
842 rtx zero;
843
844 /* When SSE registers are split into halves, we can avoid
845 writing to the top half twice. */
846 if (TARGET_SSE_SPLIT_REGS)
847 {
848 emit_clobber (op0);
849 zero = op0;
850 }
851 else
852 {
853 /* ??? Not sure about the best option for the Intel chips.
854 The following would seem to satisfy; the register is
855 entirely cleared, breaking the dependency chain. We
856 then store to the upper half, with a dependency depth
857 of one. A rumor has it that Intel recommends two movsd
858 followed by an unpacklpd, but this is unconfirmed. And
859 given that the dependency depth of the unpacklpd would
860 still be one, I'm not sure why this would be better. */
861 zero = CONST0_RTX (V2DFmode);
862 }
863
864 m = adjust_address (op1, DFmode, 0);
865 emit_insn (gen_sse2_loadlpd (op0, zero, m));
866 m = adjust_address (op1, DFmode, 8);
867 emit_insn (gen_sse2_loadhpd (op0, op0, m));
868 }
869 else
870 {
871 rtx t;
872
873 if (mode != V4SFmode)
874 t = gen_reg_rtx (V4SFmode);
875 else
876 t = op0;
877
878 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
879 emit_move_insn (t, CONST0_RTX (V4SFmode));
880 else
881 emit_clobber (t);
882
883 m = adjust_address (op1, V2SFmode, 0);
884 emit_insn (gen_sse_loadlps (t, t, m));
885 m = adjust_address (op1, V2SFmode, 8);
886 emit_insn (gen_sse_loadhps (t, t, m));
887 if (mode != V4SFmode)
888 emit_move_insn (op0, gen_lowpart (mode, t));
889 }
890 }
891 else if (MEM_P (op0))
892 {
893 if (TARGET_SSE2 && mode == V2DFmode)
894 {
895 m = adjust_address (op0, DFmode, 0);
896 emit_insn (gen_sse2_storelpd (m, op1));
897 m = adjust_address (op0, DFmode, 8);
898 emit_insn (gen_sse2_storehpd (m, op1));
899 }
900 else
901 {
902 if (mode != V4SFmode)
903 op1 = gen_lowpart (V4SFmode, op1);
904
905 m = adjust_address (op0, V2SFmode, 0);
906 emit_insn (gen_sse_storelps (m, op1));
907 m = adjust_address (op0, V2SFmode, 8);
908 emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
909 }
910 }
911 else
912 gcc_unreachable ();
913 }
914
915 /* Move bits 64:95 to bits 32:63. */
916
917 void
918 ix86_move_vector_high_sse_to_mmx (rtx op)
919 {
920 rtx mask = gen_rtx_PARALLEL (VOIDmode,
921 gen_rtvec (4, GEN_INT (0), GEN_INT (2),
922 GEN_INT (0), GEN_INT (0)));
923 rtx dest = lowpart_subreg (V4SImode, op, GET_MODE (op));
924 op = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
925 rtx insn = gen_rtx_SET (dest, op);
926 emit_insn (insn);
927 }
928
929 /* Split MMX pack with signed/unsigned saturation with SSE/SSE2. */
930
931 void
932 ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
933 {
934 rtx op0 = operands[0];
935 rtx op1 = operands[1];
936 rtx op2 = operands[2];
937
938 machine_mode dmode = GET_MODE (op0);
939 machine_mode smode = GET_MODE (op1);
940 machine_mode inner_dmode = GET_MODE_INNER (dmode);
941 machine_mode inner_smode = GET_MODE_INNER (smode);
942
943 /* Get the corresponding SSE mode for destination. */
944 int nunits = 16 / GET_MODE_SIZE (inner_dmode);
945 machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode),
946 nunits).require ();
947 machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode),
948 nunits / 2).require ();
949
950 /* Get the corresponding SSE mode for source. */
951 nunits = 16 / GET_MODE_SIZE (inner_smode);
952 machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode),
953 nunits).require ();
954
955 /* Generate SSE pack with signed/unsigned saturation. */
956 rtx dest = lowpart_subreg (sse_dmode, op0, GET_MODE (op0));
957 op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
958 op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));
959
960 op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
961 op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
962 rtx insn = gen_rtx_SET (dest, gen_rtx_VEC_CONCAT (sse_dmode,
963 op1, op2));
964 emit_insn (insn);
965
966 ix86_move_vector_high_sse_to_mmx (op0);
967 }
968
969 /* Split MMX punpcklXX/punpckhXX with SSE punpcklXX. */
970
971 void
972 ix86_split_mmx_punpck (rtx operands[], bool high_p)
973 {
974 rtx op0 = operands[0];
975 rtx op1 = operands[1];
976 rtx op2 = operands[2];
977 machine_mode mode = GET_MODE (op0);
978 rtx mask;
979 /* The corresponding SSE mode. */
980 machine_mode sse_mode, double_sse_mode;
981
982 switch (mode)
983 {
984 case E_V4QImode:
985 case E_V8QImode:
986 sse_mode = V16QImode;
987 double_sse_mode = V32QImode;
988 mask = gen_rtx_PARALLEL (VOIDmode,
989 gen_rtvec (16,
990 GEN_INT (0), GEN_INT (16),
991 GEN_INT (1), GEN_INT (17),
992 GEN_INT (2), GEN_INT (18),
993 GEN_INT (3), GEN_INT (19),
994 GEN_INT (4), GEN_INT (20),
995 GEN_INT (5), GEN_INT (21),
996 GEN_INT (6), GEN_INT (22),
997 GEN_INT (7), GEN_INT (23)));
998 break;
999
1000 case E_V4HImode:
1001 case E_V2HImode:
1002 sse_mode = V8HImode;
1003 double_sse_mode = V16HImode;
1004 mask = gen_rtx_PARALLEL (VOIDmode,
1005 gen_rtvec (8,
1006 GEN_INT (0), GEN_INT (8),
1007 GEN_INT (1), GEN_INT (9),
1008 GEN_INT (2), GEN_INT (10),
1009 GEN_INT (3), GEN_INT (11)));
1010 break;
1011
1012 case E_V2SImode:
1013 sse_mode = V4SImode;
1014 double_sse_mode = V8SImode;
1015 mask = gen_rtx_PARALLEL (VOIDmode,
1016 gen_rtvec (4,
1017 GEN_INT (0), GEN_INT (4),
1018 GEN_INT (1), GEN_INT (5)));
1019 break;
1020
1021 case E_V2SFmode:
1022 sse_mode = V4SFmode;
1023 double_sse_mode = V8SFmode;
1024 mask = gen_rtx_PARALLEL (VOIDmode,
1025 gen_rtvec (4,
1026 GEN_INT (0), GEN_INT (4),
1027 GEN_INT (1), GEN_INT (5)));
1028 break;
1029
1030 default:
1031 gcc_unreachable ();
1032 }
1033
1034 /* Generate SSE punpcklXX. */
1035 rtx dest = lowpart_subreg (sse_mode, op0, GET_MODE (op0));
1036 op1 = lowpart_subreg (sse_mode, op1, GET_MODE (op1));
1037 op2 = lowpart_subreg (sse_mode, op2, GET_MODE (op2));
1038
1039 op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2);
1040 op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask);
1041 rtx insn = gen_rtx_SET (dest, op2);
1042 emit_insn (insn);
1043
1044 /* Move high bits to low bits. */
1045 if (high_p)
1046 {
1047 if (sse_mode == V4SFmode)
1048 {
1049 mask = gen_rtx_PARALLEL (VOIDmode,
1050 gen_rtvec (4, GEN_INT (2), GEN_INT (3),
1051 GEN_INT (4), GEN_INT (5)));
1052 op2 = gen_rtx_VEC_CONCAT (V8SFmode, dest, dest);
1053 op1 = gen_rtx_VEC_SELECT (V4SFmode, op2, mask);
1054 }
1055 else
1056 {
1057 int sz = GET_MODE_SIZE (mode);
1058
1059 if (sz == 4)
1060 mask = gen_rtx_PARALLEL (VOIDmode,
1061 gen_rtvec (4, GEN_INT (1), GEN_INT (0),
1062 GEN_INT (0), GEN_INT (1)));
1063 else if (sz == 8)
1064 mask = gen_rtx_PARALLEL (VOIDmode,
1065 gen_rtvec (4, GEN_INT (2), GEN_INT (3),
1066 GEN_INT (0), GEN_INT (1)));
1067 else
1068 gcc_unreachable ();
1069
1070 dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
1071 op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
1072 }
1073
1074 insn = gen_rtx_SET (dest, op1);
1075 emit_insn (insn);
1076 }
1077 }
1078
1079 /* Helper function of ix86_fixup_binary_operands to canonicalize
1080 operand order. Returns true if the operands should be swapped. */
1081
1082 static bool
1083 ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
1084 rtx operands[])
1085 {
1086 rtx dst = operands[0];
1087 rtx src1 = operands[1];
1088 rtx src2 = operands[2];
1089
1090 /* If the operation is not commutative, we can't do anything. */
1091 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
1092 && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
1093 return false;
1094
1095 /* Highest priority is that src1 should match dst. */
1096 if (rtx_equal_p (dst, src1))
1097 return false;
1098 if (rtx_equal_p (dst, src2))
1099 return true;
1100
1101 /* Next highest priority is that immediate constants come second. */
1102 if (immediate_operand (src2, mode))
1103 return false;
1104 if (immediate_operand (src1, mode))
1105 return true;
1106
1107 /* Lowest priority is that memory references should come second. */
1108 if (MEM_P (src2))
1109 return false;
1110 if (MEM_P (src1))
1111 return true;
1112
1113 return false;
1114 }
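/* For example, with the commutative PLUS, (set r1 (plus r2 r1)) is
   reported as needing a swap so that the two-address "add" can reuse r1 as
   both source and destination, while (set r1 (plus r1 (mem))) is already
   in canonical order and no swap is requested.  */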
1115
1116
1117 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
1118 destination to use for the operation. If different from the true
1119 destination in operands[0], a copy operation will be required. */
1120
1121 rtx
1122 ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
1123 rtx operands[])
1124 {
1125 rtx dst = operands[0];
1126 rtx src1 = operands[1];
1127 rtx src2 = operands[2];
1128
1129 /* Canonicalize operand order. */
1130 if (ix86_swap_binary_operands_p (code, mode, operands))
1131 {
1132 /* It is invalid to swap operands of different modes. */
1133 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
1134
1135 std::swap (src1, src2);
1136 }
1137
1138 /* Both source operands cannot be in memory. */
1139 if (MEM_P (src1) && MEM_P (src2))
1140 {
1141 /* Optimization: Only read from memory once. */
1142 if (rtx_equal_p (src1, src2))
1143 {
1144 src2 = force_reg (mode, src2);
1145 src1 = src2;
1146 }
1147 else if (rtx_equal_p (dst, src1))
1148 src2 = force_reg (mode, src2);
1149 else
1150 src1 = force_reg (mode, src1);
1151 }
1152
1153 /* If the destination is memory, and we do not have matching source
1154 operands, do things in registers. */
1155 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
1156 dst = gen_reg_rtx (mode);
1157
1158 /* Source 1 cannot be a constant. */
1159 if (CONSTANT_P (src1))
1160 src1 = force_reg (mode, src1);
1161
1162 /* Source 1 cannot be a non-matching memory. */
1163 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
1164 src1 = force_reg (mode, src1);
1165
1166 /* Improve address combine. */
1167 if (code == PLUS
1168 && GET_MODE_CLASS (mode) == MODE_INT
1169 && MEM_P (src2))
1170 src2 = force_reg (mode, src2);
1171
1172 operands[1] = src1;
1173 operands[2] = src2;
1174 return dst;
1175 }
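/* A short sketch of the fixups applied: for
   (set (mem:SI A) (xor:SI (mem:SI B) (mem:SI C)))
   the routine loads B into a register, since both sources cannot stay in
   memory, and because the destination then matches neither source it
   returns a fresh pseudo as the destination, leaving the caller to emit
   the final store back to A.  */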
1176
1177 /* Similarly, but assume that the destination has already been
1178 set up properly. */
1179
1180 void
1181 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
1182 machine_mode mode, rtx operands[])
1183 {
1184 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
1185 gcc_assert (dst == operands[0]);
1186 }
1187
1188 /* Attempt to expand a binary operator. Make the expansion closer to the
1189 actual machine than just general_operand, which would allow 3 separate
1190 memory references (one output, two input) in a single insn. */
1191
1192 void
1193 ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
1194 rtx operands[])
1195 {
1196 rtx src1, src2, dst, op, clob;
1197
1198 dst = ix86_fixup_binary_operands (code, mode, operands);
1199 src1 = operands[1];
1200 src2 = operands[2];
1201
1202 /* Emit the instruction. */
1203
1204 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
1205
1206 if (reload_completed
1207 && code == PLUS
1208 && !rtx_equal_p (dst, src1))
1209 {
1210 /* This is going to be an LEA; avoid splitting it later. */
1211 emit_insn (op);
1212 }
1213 else
1214 {
1215 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1216 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1217 }
1218
1219 /* Fix up the destination if needed. */
1220 if (dst != operands[0])
1221 emit_move_insn (operands[0], dst);
1222 }
1223
1224 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
1225 the given OPERANDS. */
1226
1227 void
1228 ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
1229 rtx operands[])
1230 {
1231 rtx op1 = NULL_RTX, op2 = NULL_RTX;
1232 if (SUBREG_P (operands[1]))
1233 {
1234 op1 = operands[1];
1235 op2 = operands[2];
1236 }
1237 else if (SUBREG_P (operands[2]))
1238 {
1239 op1 = operands[2];
1240 op2 = operands[1];
1241 }
1242 /* Optimize (__m128i) d | (__m128i) e and similar code, when d and e
1243 are float vectors, into a float vector logical insn. In C/C++,
1244 without using intrinsics, there is no other way to express a vector
1245 logical operation on float vectors than to cast them temporarily to
1246 integer vectors. */
1247 if (op1
1248 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
1249 && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
1250 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
1251 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
1252 && SUBREG_BYTE (op1) == 0
1253 && (GET_CODE (op2) == CONST_VECTOR
1254 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
1255 && SUBREG_BYTE (op2) == 0))
1256 && can_create_pseudo_p ())
1257 {
1258 rtx dst;
1259 switch (GET_MODE (SUBREG_REG (op1)))
1260 {
1261 case E_V4SFmode:
1262 case E_V8SFmode:
1263 case E_V16SFmode:
1264 case E_V2DFmode:
1265 case E_V4DFmode:
1266 case E_V8DFmode:
1267 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
1268 if (GET_CODE (op2) == CONST_VECTOR)
1269 {
1270 op2 = gen_lowpart (GET_MODE (dst), op2);
1271 op2 = force_reg (GET_MODE (dst), op2);
1272 }
1273 else
1274 {
1275 op1 = operands[1];
1276 op2 = SUBREG_REG (operands[2]);
1277 if (!vector_operand (op2, GET_MODE (dst)))
1278 op2 = force_reg (GET_MODE (dst), op2);
1279 }
1280 op1 = SUBREG_REG (op1);
1281 if (!vector_operand (op1, GET_MODE (dst)))
1282 op1 = force_reg (GET_MODE (dst), op1);
1283 emit_insn (gen_rtx_SET (dst,
1284 gen_rtx_fmt_ee (code, GET_MODE (dst),
1285 op1, op2)));
1286 emit_move_insn (operands[0], gen_lowpart (mode, dst));
1287 return;
1288 default:
1289 break;
1290 }
1291 }
1292 if (!vector_operand (operands[1], mode))
1293 operands[1] = force_reg (mode, operands[1]);
1294 if (!vector_operand (operands[2], mode))
1295 operands[2] = force_reg (mode, operands[2]);
1296 ix86_fixup_binary_operands_no_copy (code, mode, operands);
1297 emit_insn (gen_rtx_SET (operands[0],
1298 gen_rtx_fmt_ee (code, mode, operands[1],
1299 operands[2])));
1300 }
1301
1302 /* Return TRUE or FALSE depending on whether the binary operator meets the
1303 appropriate constraints. */
1304
1305 bool
1306 ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
1307 rtx operands[3])
1308 {
1309 rtx dst = operands[0];
1310 rtx src1 = operands[1];
1311 rtx src2 = operands[2];
1312
1313 /* Both source operands cannot be in memory. */
1314 if ((MEM_P (src1) || bcst_mem_operand (src1, mode))
1315 && (MEM_P (src2) || bcst_mem_operand (src2, mode)))
1316 return false;
1317
1318 /* Canonicalize operand order for commutative operators. */
1319 if (ix86_swap_binary_operands_p (code, mode, operands))
1320 std::swap (src1, src2);
1321
1322 /* If the destination is memory, we must have a matching source operand. */
1323 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
1324 return false;
1325
1326 /* Source 1 cannot be a constant. */
1327 if (CONSTANT_P (src1))
1328 return false;
1329
1330 /* Source 1 cannot be a non-matching memory. */
1331 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
1332 /* Support "andhi/andsi/anddi" as a zero-extending move. */
1333 return (code == AND
1334 && (mode == HImode
1335 || mode == SImode
1336 || (TARGET_64BIT && mode == DImode))
1337 && satisfies_constraint_L (src2));
1338
1339 return true;
1340 }
1341
1342 /* Attempt to expand a unary operator. Make the expansion closer to the
1343 actual machine than just general_operand, which would allow 2 separate
1344 memory references (one output, one input) in a single insn. */
1345
1346 void
1347 ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
1348 rtx operands[])
1349 {
1350 bool matching_memory = false;
1351 rtx src, dst, op, clob;
1352
1353 dst = operands[0];
1354 src = operands[1];
1355
1356 /* If the destination is memory, and we do not have matching source
1357 operands, do things in registers. */
1358 if (MEM_P (dst))
1359 {
1360 if (rtx_equal_p (dst, src))
1361 matching_memory = true;
1362 else
1363 dst = gen_reg_rtx (mode);
1364 }
1365
1366 /* When source operand is memory, destination must match. */
1367 if (MEM_P (src) && !matching_memory)
1368 src = force_reg (mode, src);
1369
1370 /* Emit the instruction. */
1371
1372 op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
1373
1374 if (code == NOT)
1375 emit_insn (op);
1376 else
1377 {
1378 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1379 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1380 }
1381
1382 /* Fix up the destination if needed. */
1383 if (dst != operands[0])
1384 emit_move_insn (operands[0], dst);
1385 }
1386
1387 /* Predict the just-emitted jump instruction to be taken with probability PROB. */
1388
1389 static void
1390 predict_jump (int prob)
1391 {
1392 rtx_insn *insn = get_last_insn ();
1393 gcc_assert (JUMP_P (insn));
1394 add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
1395 }
1396
1397 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
1398 divisor are within the range [0-255]. */
1399
1400 void
1401 ix86_split_idivmod (machine_mode mode, rtx operands[],
1402 bool unsigned_p)
1403 {
1404 rtx_code_label *end_label, *qimode_label;
1405 rtx div, mod;
1406 rtx_insn *insn;
1407 rtx scratch, tmp0, tmp1, tmp2;
1408 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
1409
1410 operands[2] = force_reg (mode, operands[2]);
1411 operands[3] = force_reg (mode, operands[3]);
1412
1413 switch (mode)
1414 {
1415 case E_SImode:
1416 if (GET_MODE (operands[0]) == SImode)
1417 {
1418 if (GET_MODE (operands[1]) == SImode)
1419 gen_divmod4_1 = unsigned_p ? gen_udivmodsi4_1 : gen_divmodsi4_1;
1420 else
1421 gen_divmod4_1
1422 = unsigned_p ? gen_udivmodsi4_zext_2 : gen_divmodsi4_zext_2;
1423 }
1424 else
1425 gen_divmod4_1
1426 = unsigned_p ? gen_udivmodsi4_zext_1 : gen_divmodsi4_zext_1;
1427 break;
1428
1429 case E_DImode:
1430 gen_divmod4_1 = unsigned_p ? gen_udivmoddi4_1 : gen_divmoddi4_1;
1431 break;
1432
1433 default:
1434 gcc_unreachable ();
1435 }
1436
1437 end_label = gen_label_rtx ();
1438 qimode_label = gen_label_rtx ();
1439
1440 scratch = gen_reg_rtx (mode);
1441
1442 /* Use 8bit unsigned divmod if dividend and divisor are within
1443 the range [0-255]. */
1444 emit_move_insn (scratch, operands[2]);
1445 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
1446 scratch, 1, OPTAB_DIRECT);
1447 emit_insn (gen_test_ccno_1 (mode, scratch, GEN_INT (-0x100)));
1448 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
1449 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
1450 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
1451 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
1452 pc_rtx);
1453 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
1454 predict_jump (REG_BR_PROB_BASE * 50 / 100);
1455 JUMP_LABEL (insn) = qimode_label;
1456
1457 /* Generate original signed/unsigned divmod. */
1458 emit_insn (gen_divmod4_1 (operands[0], operands[1],
1459 operands[2], operands[3]));
1460
1461 /* Branch to the end. */
1462 emit_jump_insn (gen_jump (end_label));
1463 emit_barrier ();
1464
1465 /* Generate 8bit unsigned divide. */
1466 emit_label (qimode_label);
1467 /* Don't use operands[0] for result of 8bit divide since not all
1468 registers support QImode ZERO_EXTRACT. */
1469 tmp0 = lowpart_subreg (HImode, scratch, mode);
1470 tmp1 = lowpart_subreg (HImode, operands[2], mode);
1471 tmp2 = lowpart_subreg (QImode, operands[3], mode);
1472 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
1473
1474 if (unsigned_p)
1475 {
1476 div = gen_rtx_UDIV (mode, operands[2], operands[3]);
1477 mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
1478 }
1479 else
1480 {
1481 div = gen_rtx_DIV (mode, operands[2], operands[3]);
1482 mod = gen_rtx_MOD (mode, operands[2], operands[3]);
1483 }
1484 if (mode == SImode)
1485 {
1486 if (GET_MODE (operands[0]) != SImode)
1487 div = gen_rtx_ZERO_EXTEND (DImode, div);
1488 if (GET_MODE (operands[1]) != SImode)
1489 mod = gen_rtx_ZERO_EXTEND (DImode, mod);
1490 }
1491
1492 /* Extract remainder from AH. */
1493 scratch = gen_lowpart (GET_MODE (operands[1]), scratch);
1494 tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]), scratch,
1495 GEN_INT (8), GEN_INT (8));
1496 insn = emit_move_insn (operands[1], tmp1);
1497 set_unique_reg_note (insn, REG_EQUAL, mod);
1498
1499 /* Zero extend quotient from AL. */
1500 tmp1 = gen_lowpart (QImode, tmp0);
1501 insn = emit_insn (gen_extend_insn
1502 (operands[0], tmp1,
1503 GET_MODE (operands[0]), QImode, 1));
1504 set_unique_reg_note (insn, REG_EQUAL, div);
1505
1506 emit_label (end_label);
1507 }
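/* The emitted control flow is, in outline:

	mov	dividend, scratch
	or	divisor, scratch
	test	$-0x100, scratch
	je	.Lqimode
	(full-width signed/unsigned divide, quotient and remainder set)
	jmp	.Lend
   .Lqimode:
	(8-bit divide; quotient taken from AL, remainder from AH)
   .Lend:  */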
1508
1509 /* Emit the x86 binary operator CODE in mode MODE, where the first operand
1510 matches the destination. The emitted RTX includes a clobber of FLAGS_REG. */
1511
1512 void
1513 ix86_emit_binop (enum rtx_code code, machine_mode mode,
1514 rtx dst, rtx src)
1515 {
1516 rtx op, clob;
1517
1518 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
1519 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1520
1521 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1522 }
1523
1524 /* Return true if the def of REGNO1 is nearest to the insn. */
1525
1526 static bool
1527 find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
1528 {
1529 rtx_insn *prev = insn;
1530 rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
1531
1532 if (insn == start)
1533 return false;
1534 while (prev && prev != start)
1535 {
1536 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
1537 {
1538 prev = PREV_INSN (prev);
1539 continue;
1540 }
1541 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
1542 return true;
1543 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
1544 return false;
1545 prev = PREV_INSN (prev);
1546 }
1547
1548 /* None of the regs is defined in the bb. */
1549 return false;
1550 }
1551
1552 /* INSN_UID of the last insn emitted by zero store peephole2s. */
1553 int ix86_last_zero_store_uid;
1554
1555 /* Split an lea instruction into a sequence of instructions
1556 which are executed on the ALU to avoid AGU stalls.
1557 It is assumed that it is allowed to clobber the flags register
1558 at the position of the lea. */
1559
1560 void
1561 ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
1562 {
1563 unsigned int regno0, regno1, regno2;
1564 struct ix86_address parts;
1565 rtx target, tmp;
1566 int ok, adds;
1567
1568 ok = ix86_decompose_address (operands[1], &parts);
1569 gcc_assert (ok);
1570
1571 target = gen_lowpart (mode, operands[0]);
1572
1573 regno0 = true_regnum (target);
1574 regno1 = INVALID_REGNUM;
1575 regno2 = INVALID_REGNUM;
1576
1577 if (parts.base)
1578 {
1579 parts.base = gen_lowpart (mode, parts.base);
1580 regno1 = true_regnum (parts.base);
1581 }
1582
1583 if (parts.index)
1584 {
1585 parts.index = gen_lowpart (mode, parts.index);
1586 regno2 = true_regnum (parts.index);
1587 }
1588
1589 if (parts.disp)
1590 parts.disp = gen_lowpart (mode, parts.disp);
1591
1592 if (parts.scale > 1)
1593 {
1594 /* Case r1 = r1 + ... */
1595 if (regno1 == regno0)
1596 {
1597 /* If we have the case r1 = r1 + C * r2 then we
1598 would have to use multiplication, which is very
1599 expensive. Assume the cost model is wrong if we
1600 get such a case here. */
1601 gcc_assert (regno2 != regno0);
1602
1603 for (adds = parts.scale; adds > 0; adds--)
1604 ix86_emit_binop (PLUS, mode, target, parts.index);
1605 }
1606 else
1607 {
1608 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
1609 if (regno0 != regno2)
1610 emit_insn (gen_rtx_SET (target, parts.index));
1611
1612 /* Use shift for scaling, but emit it as MULT instead
1613 to avoid it being immediately peephole2 optimized back
1614 into lea. */
1615 ix86_emit_binop (MULT, mode, target, GEN_INT (parts.scale));
1616
1617 if (parts.base)
1618 ix86_emit_binop (PLUS, mode, target, parts.base);
1619
1620 if (parts.disp && parts.disp != const0_rtx)
1621 ix86_emit_binop (PLUS, mode, target, parts.disp);
1622 }
1623 }
1624 else if (!parts.base && !parts.index)
1625 {
1626 gcc_assert(parts.disp);
1627 emit_insn (gen_rtx_SET (target, parts.disp));
1628 }
1629 else
1630 {
1631 if (!parts.base)
1632 {
1633 if (regno0 != regno2)
1634 emit_insn (gen_rtx_SET (target, parts.index));
1635 }
1636 else if (!parts.index)
1637 {
1638 if (regno0 != regno1)
1639 emit_insn (gen_rtx_SET (target, parts.base));
1640 }
1641 else
1642 {
1643 if (regno0 == regno1)
1644 tmp = parts.index;
1645 else if (regno0 == regno2)
1646 tmp = parts.base;
1647 else
1648 {
1649 rtx tmp1;
1650
1651 /* Find better operand for SET instruction, depending
1652 on which definition is farther from the insn. */
1653 if (find_nearest_reg_def (insn, regno1, regno2))
1654 tmp = parts.index, tmp1 = parts.base;
1655 else
1656 tmp = parts.base, tmp1 = parts.index;
1657
1658 emit_insn (gen_rtx_SET (target, tmp));
1659
1660 if (parts.disp && parts.disp != const0_rtx)
1661 ix86_emit_binop (PLUS, mode, target, parts.disp);
1662
1663 ix86_emit_binop (PLUS, mode, target, tmp1);
1664 return;
1665 }
1666
1667 ix86_emit_binop (PLUS, mode, target, tmp);
1668 }
1669
1670 if (parts.disp && parts.disp != const0_rtx)
1671 ix86_emit_binop (PLUS, mode, target, parts.disp);
1672 }
1673 }
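/* For example, a scaled lea such as

	lea	0x4(%rdx,%rcx,2), %rax

   is decomposed, roughly, into the ALU sequence

	mov	%rcx, %rax
	shl	$1, %rax
	add	%rdx, %rax
	add	$0x4, %rax

   where the shift is first emitted as a MULT, as noted above, so that
   peephole2 does not immediately fold the sequence back into an lea.  */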
1674
1675 /* Post-reload splitter for converting an SF or DFmode value in an
1676 SSE register into an unsigned SImode value. */
1677
1678 void
1679 ix86_split_convert_uns_si_sse (rtx operands[])
1680 {
1681 machine_mode vecmode;
1682 rtx value, large, zero_or_two31, input, two31, x;
1683
1684 large = operands[1];
1685 zero_or_two31 = operands[2];
1686 input = operands[3];
1687 two31 = operands[4];
1688 vecmode = GET_MODE (large);
1689 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
1690
1691 /* Load up the value into the low element. We must ensure that the other
1692 elements are valid floats -- zero is the easiest such value. */
1693 if (MEM_P (input))
1694 {
1695 if (vecmode == V4SFmode)
1696 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
1697 else
1698 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
1699 }
1700 else
1701 {
1702 input = gen_rtx_REG (vecmode, REGNO (input));
1703 emit_move_insn (value, CONST0_RTX (vecmode));
1704 if (vecmode == V4SFmode)
1705 emit_insn (gen_sse_movss (value, value, input));
1706 else
1707 emit_insn (gen_sse2_movsd (value, value, input));
1708 }
1709
1710 emit_move_insn (large, two31);
1711 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
1712
1713 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
1714 emit_insn (gen_rtx_SET (large, x));
1715
1716 x = gen_rtx_AND (vecmode, zero_or_two31, large);
1717 emit_insn (gen_rtx_SET (zero_or_two31, x));
1718
1719 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
1720 emit_insn (gen_rtx_SET (value, x));
1721
1722 large = gen_rtx_REG (V4SImode, REGNO (large));
1723 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
1724
1725 x = gen_rtx_REG (V4SImode, REGNO (value));
1726 if (vecmode == V4SFmode)
1727 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
1728 else
1729 emit_insn (gen_sse2_cvttpd2dq (x, value));
1730 value = x;
1731
1732 emit_insn (gen_xorv4si3 (value, value, large));
1733 }
1734
1735 static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
1736 machine_mode mode, rtx target,
1737 rtx var, int one_var);
1738
1739 /* Convert an unsigned DImode value into a DFmode, using only SSE.
1740 Expects the 64-bit DImode to be supplied in a pair of integral
1741 registers. Requires SSE2; will use SSE3 if available. For x86_32,
1742 -mfpmath=sse, !optimize_size only. */
1743
1744 void
1745 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
1746 {
1747 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
1748 rtx int_xmm, fp_xmm;
1749 rtx biases, exponents;
1750 rtx x;
1751
1752 int_xmm = gen_reg_rtx (V4SImode);
1753 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
1754 emit_insn (gen_movdi_to_sse (int_xmm, input));
1755 else if (TARGET_SSE_SPLIT_REGS)
1756 {
1757 emit_clobber (int_xmm);
1758 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
1759 }
1760 else
1761 {
1762 x = gen_reg_rtx (V2DImode);
1763 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
1764 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
1765 }
1766
1767 x = gen_rtx_CONST_VECTOR (V4SImode,
1768 gen_rtvec (4, GEN_INT (0x43300000UL),
1769 GEN_INT (0x45300000UL),
1770 const0_rtx, const0_rtx));
1771 exponents = validize_mem (force_const_mem (V4SImode, x));
1772
1773 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
1774 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
1775
1776 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
1777 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
1778 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
1779 (0x1.0p84 + double(fp_value_hi_xmm)).
1780 Note these exponents differ by 32. */
1781
1782 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
1783
1784 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
1785 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
1786 real_ldexp (&bias_lo_rvt, &dconst1, 52);
1787 real_ldexp (&bias_hi_rvt, &dconst1, 84);
1788 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
1789 x = const_double_from_real_value (bias_hi_rvt, DFmode);
1790 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
1791 biases = validize_mem (force_const_mem (V2DFmode, biases));
1792 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
1793
1794 /* Add the upper and lower DFmode values together. */
1795 if (TARGET_SSE3)
1796 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
1797 else
1798 {
1799 x = copy_to_mode_reg (V2DFmode, fp_xmm);
1800 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
1801 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
1802 }
1803
1804 ix86_expand_vector_extract (false, target, fp_xmm, 0);
1805 }
1806
1807 /* Not used, but eases macroization of patterns. */
1808 void
1809 ix86_expand_convert_uns_sixf_sse (rtx, rtx)
1810 {
1811 gcc_unreachable ();
1812 }
1813
1814 static rtx ix86_expand_sse_fabs (rtx op0, rtx *smask);
1815
1816 /* Convert an unsigned SImode value into a DFmode. Only currently used
1817 for SSE, but applicable anywhere. */
1818
1819 void
1820 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
1821 {
1822 REAL_VALUE_TYPE TWO31r;
1823 rtx x, fp;
1824
1825 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
1826 NULL, 1, OPTAB_DIRECT);
1827
1828 fp = gen_reg_rtx (DFmode);
1829 emit_insn (gen_floatsidf2 (fp, x));
1830
1831 real_ldexp (&TWO31r, &dconst1, 31);
1832 x = const_double_from_real_value (TWO31r, DFmode);
1833
1834 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
1835
1836 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
1837 if (HONOR_SIGNED_ZEROS (DFmode) && flag_rounding_math)
1838 x = ix86_expand_sse_fabs (x, NULL);
1839
1840 if (x != target)
1841 emit_move_insn (target, x);
1842 }
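/* Worked numerically: for the input 0xffffffff the PLUS of -2147483648
   wraps to 0x7fffffff, the signed conversion yields 2147483647.0, and
   adding 0x1p31 back produces 4294967295.0, the expected unsigned
   value.  */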
1843
1844 /* Convert a signed DImode value into a DFmode. Only used for SSE in
1845 32-bit mode; otherwise we have a direct convert instruction. */
1846
1847 void
1848 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
1849 {
1850 REAL_VALUE_TYPE TWO32r;
1851 rtx fp_lo, fp_hi, x;
1852
1853 fp_lo = gen_reg_rtx (DFmode);
1854 fp_hi = gen_reg_rtx (DFmode);
1855
1856 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
1857
1858 real_ldexp (&TWO32r, &dconst1, 32);
1859 x = const_double_from_real_value (TWO32r, DFmode);
1860 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
1861
1862 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
1863
1864 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
1865 0, OPTAB_DIRECT);
1866 if (x != target)
1867 emit_move_insn (target, x);
1868 }
1869
1870 /* Convert an unsigned SImode value into a SFmode, using only SSE.
1871 For x86_32, -mfpmath=sse, !optimize_size only. */
1872 void
1873 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
1874 {
1875 REAL_VALUE_TYPE ONE16r;
1876 rtx fp_hi, fp_lo, int_hi, int_lo, x;
1877
1878 real_ldexp (&ONE16r, &dconst1, 16);
1879 x = const_double_from_real_value (ONE16r, SFmode);
1880 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
1881 NULL, 0, OPTAB_DIRECT);
1882 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
1883 NULL, 0, OPTAB_DIRECT);
1884 fp_hi = gen_reg_rtx (SFmode);
1885 fp_lo = gen_reg_rtx (SFmode);
1886 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
1887 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
1888 if (TARGET_FMA)
1889 {
1890 x = validize_mem (force_const_mem (SFmode, x));
1891 fp_hi = gen_rtx_FMA (SFmode, fp_hi, x, fp_lo);
1892 emit_move_insn (target, fp_hi);
1893 }
1894 else
1895 {
1896 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
1897 0, OPTAB_DIRECT);
1898 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
1899 0, OPTAB_DIRECT);
1900 if (!rtx_equal_p (target, fp_hi))
1901 emit_move_insn (target, fp_hi);
1902 }
1903 }
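/* The identity used here: with hi = input >> 16 and lo = input & 0xffff,
   the result is (float) hi * 0x1p16 + (float) lo.  Both halves fit in 16
   bits, so the conversions and the scaling by 0x1p16 are exact; only the
   final addition (or FMA) rounds once.  */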
1904
1905 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
1906 a vector of unsigned ints VAL to a vector of floats TARGET. */
1907
1908 void
1909 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
1910 {
1911 rtx tmp[8];
1912 REAL_VALUE_TYPE TWO16r;
1913 machine_mode intmode = GET_MODE (val);
1914 machine_mode fltmode = GET_MODE (target);
1915 rtx (*cvt) (rtx, rtx);
1916
1917 if (intmode == V4SImode)
1918 cvt = gen_floatv4siv4sf2;
1919 else
1920 cvt = gen_floatv8siv8sf2;
1921 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
1922 tmp[0] = force_reg (intmode, tmp[0]);
1923 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
1924 OPTAB_DIRECT);
1925 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
1926 NULL_RTX, 1, OPTAB_DIRECT);
1927 tmp[3] = gen_reg_rtx (fltmode);
1928 emit_insn (cvt (tmp[3], tmp[1]));
1929 tmp[4] = gen_reg_rtx (fltmode);
1930 emit_insn (cvt (tmp[4], tmp[2]));
1931 real_ldexp (&TWO16r, &dconst1, 16);
1932 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
1933 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
1934 if (TARGET_FMA)
1935 {
1936 tmp[6] = gen_rtx_FMA (fltmode, tmp[4], tmp[5], tmp[3]);
1937 emit_move_insn (target, tmp[6]);
1938 }
1939 else
1940 {
1941 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5],
1942 NULL_RTX, 1, OPTAB_DIRECT);
1943 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6],
1944 target, 1, OPTAB_DIRECT);
1945 if (tmp[7] != target)
1946 emit_move_insn (target, tmp[7]);
1947 }
1948 }
1949
1950 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
1951 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
1952 This is done by doing just signed conversion if < 0x1p31, and otherwise by
1953 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
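/* For example, 3000000000.0 is >= 0x1p31, so 0x1p31 is subtracted to give
   852516352.0, the signed conversion then yields 852516352, and xoring in
   0x80000000 restores 3000000000.  Elements below 0x1p31 get a zero xor
   mask and are left unchanged.  */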
1954
1955 rtx
1956 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
1957 {
1958 REAL_VALUE_TYPE TWO31r;
1959 rtx two31r, tmp[4];
1960 machine_mode mode = GET_MODE (val);
1961 machine_mode scalarmode = GET_MODE_INNER (mode);
1962 machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
1963 rtx (*cmp) (rtx, rtx, rtx, rtx);
1964 int i;
1965
1966 for (i = 0; i < 3; i++)
1967 tmp[i] = gen_reg_rtx (mode);
1968 real_ldexp (&TWO31r, &dconst1, 31);
1969 two31r = const_double_from_real_value (TWO31r, scalarmode);
1970 two31r = ix86_build_const_vector (mode, 1, two31r);
1971 two31r = force_reg (mode, two31r);
1972 switch (mode)
1973 {
1974 case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
1975 case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
1976 case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
1977 case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
1978 default: gcc_unreachable ();
1979 }
1980 tmp[3] = gen_rtx_LE (mode, two31r, val);
1981 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
1982 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
1983 0, OPTAB_DIRECT);
1984 if (intmode == V4SImode || TARGET_AVX2)
1985 *xorp = expand_simple_binop (intmode, ASHIFT,
1986 gen_lowpart (intmode, tmp[0]),
1987 GEN_INT (31), NULL_RTX, 0,
1988 OPTAB_DIRECT);
1989 else
1990 {
1991 rtx two31 = gen_int_mode (HOST_WIDE_INT_1U << 31, SImode);
1992 two31 = ix86_build_const_vector (intmode, 1, two31);
1993 *xorp = expand_simple_binop (intmode, AND,
1994 gen_lowpart (intmode, tmp[0]),
1995 two31, NULL_RTX, 0,
1996 OPTAB_DIRECT);
1997 }
1998 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
1999 0, OPTAB_DIRECT);
2000 }
2001
2002 /* Generate code for floating point ABS or NEG. */
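/* Both operations only touch the sign bit: ABS is an AND with a mask that
   clears the sign bit (e.g. 0x7fffffff for SFmode) and NEG is an XOR with
   the sign-bit mask itself (0x80000000 for SFmode), done either in an SSE
   register or, for the split variant below, on the integer side.  */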
2003
2004 void
2005 ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
2006 rtx operands[])
2007 {
2008 rtx set, dst, src;
2009 bool use_sse = false;
2010 bool vector_mode = VECTOR_MODE_P (mode);
2011 machine_mode vmode = mode;
2012 rtvec par;
2013
2014 if (vector_mode || mode == TFmode || mode == HFmode)
2015 {
2016 use_sse = true;
2017 if (mode == HFmode)
2018 vmode = V8HFmode;
2019 }
2020 else if (TARGET_SSE_MATH)
2021 {
2022 use_sse = SSE_FLOAT_MODE_P (mode);
2023 if (mode == SFmode)
2024 vmode = V4SFmode;
2025 else if (mode == DFmode)
2026 vmode = V2DFmode;
2027 }
2028
2029 dst = operands[0];
2030 src = operands[1];
2031
2032 set = gen_rtx_fmt_e (code, mode, src);
2033 set = gen_rtx_SET (dst, set);
2034
2035 if (use_sse)
2036 {
2037 rtx mask, use, clob;
2038
2039 /* NEG and ABS performed with SSE use bitwise mask operations.
2040 Create the appropriate mask now. */
2041 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
2042 use = gen_rtx_USE (VOIDmode, mask);
2043 if (vector_mode || mode == TFmode)
2044 par = gen_rtvec (2, set, use);
2045 else
2046 {
2047 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
2048 par = gen_rtvec (3, set, use, clob);
2049 }
2050 }
2051 else
2052 {
2053 rtx clob;
2054
2055 /* Changing the sign of FP values can also be done using the integer unit. */
2056 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
2057 par = gen_rtvec (2, set, clob);
2058 }
2059
2060 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
2061 }
2062
2063 /* Deconstruct a floating point ABS or NEG operation
2064 with integer registers into integer operations. */
2065
2066 void
2067 ix86_split_fp_absneg_operator (enum rtx_code code, machine_mode mode,
2068 rtx operands[])
2069 {
2070 enum rtx_code absneg_op;
2071 rtx dst, set;
2072
2073 gcc_assert (operands_match_p (operands[0], operands[1]));
2074
2075 switch (mode)
2076 {
2077 case E_SFmode:
2078 dst = gen_lowpart (SImode, operands[0]);
2079
2080 if (code == ABS)
2081 {
2082 set = gen_int_mode (0x7fffffff, SImode);
2083 absneg_op = AND;
2084 }
2085 else
2086 {
2087 set = gen_int_mode (0x80000000, SImode);
2088 absneg_op = XOR;
2089 }
2090 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
2091 break;
2092
2093 case E_DFmode:
2094 if (TARGET_64BIT)
2095 {
2096 dst = gen_lowpart (DImode, operands[0]);
2097 dst = gen_rtx_ZERO_EXTRACT (DImode, dst, const1_rtx, GEN_INT (63));
2098
2099 if (code == ABS)
2100 set = const0_rtx;
2101 else
2102 set = gen_rtx_NOT (DImode, dst);
2103 }
2104 else
2105 {
2106 dst = gen_highpart (SImode, operands[0]);
2107
2108 if (code == ABS)
2109 {
2110 set = gen_int_mode (0x7fffffff, SImode);
2111 absneg_op = AND;
2112 }
2113 else
2114 {
2115 set = gen_int_mode (0x80000000, SImode);
2116 absneg_op = XOR;
2117 }
2118 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
2119 }
2120 break;
2121
2122 case E_XFmode:
2123 dst = gen_rtx_REG (SImode,
2124 REGNO (operands[0]) + (TARGET_64BIT ? 1 : 2));
2125 if (code == ABS)
2126 {
2127 set = GEN_INT (0x7fff);
2128 absneg_op = AND;
2129 }
2130 else
2131 {
2132 set = GEN_INT (0x8000);
2133 absneg_op = XOR;
2134 }
2135 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
2136 break;
2137
2138 default:
2139 gcc_unreachable ();
2140 }
2141
2142 set = gen_rtx_SET (dst, set);
2143
2144 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
2145 rtvec par = gen_rtvec (2, set, clob);
2146
2147 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
2148 }
2149
2150 /* Expand a copysign operation. Special case operand 0 being a constant. */
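/* The expansion computes
   dest = (operands[1] & ~signmask) | (operands[2] & signmask),
   i.e. the magnitude of operands[1] combined with the sign of operands[2];
   e.g. copysign (3.0, -0.5) yields -3.0.  */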
2151
2152 void
2153 ix86_expand_copysign (rtx operands[])
2154 {
2155 machine_mode mode, vmode;
2156 rtx dest, vdest, op0, op1, mask, op2, op3;
2157
2158 mode = GET_MODE (operands[0]);
2159
2160 if (mode == HFmode)
2161 vmode = V8HFmode;
2162 else if (mode == SFmode)
2163 vmode = V4SFmode;
2164 else if (mode == DFmode)
2165 vmode = V2DFmode;
2166 else if (mode == TFmode)
2167 vmode = mode;
2168 else
2169 gcc_unreachable ();
2170
2171 if (rtx_equal_p (operands[1], operands[2]))
2172 {
2173 emit_move_insn (operands[0], operands[1]);
2174 return;
2175 }
2176
2177 dest = operands[0];
2178 vdest = lowpart_subreg (vmode, dest, mode);
2179 if (vdest == NULL_RTX)
2180 vdest = gen_reg_rtx (vmode);
2181 else
2182 dest = NULL_RTX;
2183 op1 = lowpart_subreg (vmode, force_reg (mode, operands[2]), mode);
2184 mask = ix86_build_signbit_mask (vmode, 0, 0);
2185
2186 if (CONST_DOUBLE_P (operands[1]))
2187 {
2188 op0 = simplify_unary_operation (ABS, mode, operands[1], mode);
2189 /* Optimize for 0: simplify b = copysignf (0.0f, a) to b = mask & a. */
2190 if (op0 == CONST0_RTX (mode))
2191 {
2192 emit_move_insn (vdest, gen_rtx_AND (vmode, mask, op1));
2193 if (dest)
2194 emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
2195 return;
2196 }
2197
2198 if (GET_MODE_SIZE (mode) < 16)
2199 op0 = ix86_build_const_vector (vmode, false, op0);
2200 op0 = force_reg (vmode, op0);
2201 }
2202 else
2203 op0 = lowpart_subreg (vmode, force_reg (mode, operands[1]), mode);
2204
2205 op2 = gen_reg_rtx (vmode);
2206 op3 = gen_reg_rtx (vmode);
2207 emit_move_insn (op2, gen_rtx_AND (vmode,
2208 gen_rtx_NOT (vmode, mask),
2209 op0));
2210 emit_move_insn (op3, gen_rtx_AND (vmode, mask, op1));
2211 emit_move_insn (vdest, gen_rtx_IOR (vmode, op2, op3));
2212 if (dest)
2213 emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
2214 }
2215
2216 /* Expand an xorsign operation. */
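/* xorsign (x, y) is expanded as x ^ (y & signmask), i.e. x is negated iff
   the sign bit of y is set; e.g. xorsign (3.0, -2.0) = -3.0 and
   xorsign (-3.0, -2.0) = 3.0.  */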
2217
2218 void
2219 ix86_expand_xorsign (rtx operands[])
2220 {
2221 machine_mode mode, vmode;
2222 rtx dest, vdest, op0, op1, mask, x, temp;
2223
2224 dest = operands[0];
2225 op0 = operands[1];
2226 op1 = operands[2];
2227
2228 mode = GET_MODE (dest);
2229
2230 if (mode == HFmode)
2231 vmode = V8HFmode;
2232 else if (mode == SFmode)
2233 vmode = V4SFmode;
2234 else if (mode == DFmode)
2235 vmode = V2DFmode;
2236 else
2237 gcc_unreachable ();
2238
2239 temp = gen_reg_rtx (vmode);
2240 mask = ix86_build_signbit_mask (vmode, 0, 0);
2241
2242 op1 = lowpart_subreg (vmode, force_reg (mode, op1), mode);
2243 x = gen_rtx_AND (vmode, op1, mask);
2244 emit_insn (gen_rtx_SET (temp, x));
2245
2246 op0 = lowpart_subreg (vmode, force_reg (mode, op0), mode);
2247 x = gen_rtx_XOR (vmode, temp, op0);
2248
2249 vdest = lowpart_subreg (vmode, dest, mode);
2250 if (vdest == NULL_RTX)
2251 vdest = gen_reg_rtx (vmode);
2252 else
2253 dest = NULL_RTX;
2254 emit_insn (gen_rtx_SET (vdest, x));
2255
2256 if (dest)
2257 emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
2258 }
2259
2260 static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1);
2261
2262 void
2263 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
2264 {
2265 machine_mode mode = GET_MODE (op0);
2266 rtx tmp;
2267
2268 /* Handle the special case of a vector comparison with a boolean result;
2269 transform it using the ptest instruction. */
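/* ptest sets ZF iff the bitwise AND of its two operands is all zeros, so
   xoring the operands and testing the result against itself sets ZF
   exactly when op0 == op1.  */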
2270 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
2271 {
2272 rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
2273 machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
2274
2275 gcc_assert (code == EQ || code == NE);
2276 /* Generate XOR since we can't check that one operand is zero vector. */
2277 tmp = gen_reg_rtx (mode);
2278 emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
2279 tmp = gen_lowpart (p_mode, tmp);
2280 emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
2281 gen_rtx_UNSPEC (CCmode,
2282 gen_rtvec (2, tmp, tmp),
2283 UNSPEC_PTEST)));
2284 tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
2285 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2286 gen_rtx_LABEL_REF (VOIDmode, label),
2287 pc_rtx);
2288 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2289 return;
2290 }
2291
2292 switch (mode)
2293 {
2294 case E_HFmode:
2295 case E_SFmode:
2296 case E_DFmode:
2297 case E_XFmode:
2298 case E_QImode:
2299 case E_HImode:
2300 case E_SImode:
2301 simple:
2302 tmp = ix86_expand_compare (code, op0, op1);
2303 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2304 gen_rtx_LABEL_REF (VOIDmode, label),
2305 pc_rtx);
2306 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2307 return;
2308
2309 case E_DImode:
2310 if (TARGET_64BIT)
2311 goto simple;
2312 /* For 32-bit targets the DImode comparison may be performed
2313 in SSE registers. To allow this we should avoid the split
2314 to SImode, which is achieved by doing the xor in DImode
2315 and then comparing with zero (this is recognized by the
2316 STV pass). We don't compare using xor when optimizing
2317 for size. */
2318 if (!optimize_insn_for_size_p ()
2319 && TARGET_STV
2320 && (code == EQ || code == NE))
2321 {
2322 op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
2323 op1 = const0_rtx;
2324 }
2325 /* FALLTHRU */
2326 case E_TImode:
2327 /* Expand DImode branch into multiple compare+branch. */
2328 {
2329 rtx lo[2], hi[2];
2330 rtx_code_label *label2;
2331 enum rtx_code code1, code2, code3;
2332 machine_mode submode;
2333
2334 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
2335 {
2336 std::swap (op0, op1);
2337 code = swap_condition (code);
2338 }
2339
2340 split_double_mode (mode, &op0, 1, lo+0, hi+0);
2341 split_double_mode (mode, &op1, 1, lo+1, hi+1);
2342
2343 submode = mode == DImode ? SImode : DImode;
2344
2345 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
2346 avoid two branches. This costs one extra insn, so disable when
2347 optimizing for size. */
2348
2349 if ((code == EQ || code == NE)
2350 && (!optimize_insn_for_size_p ()
2351 || hi[1] == const0_rtx || lo[1] == const0_rtx))
2352 {
2353 rtx xor0, xor1;
2354
2355 xor1 = hi[0];
2356 if (hi[1] != const0_rtx)
2357 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
2358 NULL_RTX, 0, OPTAB_WIDEN);
2359
2360 xor0 = lo[0];
2361 if (lo[1] != const0_rtx)
2362 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
2363 NULL_RTX, 0, OPTAB_WIDEN);
2364
2365 tmp = expand_binop (submode, ior_optab, xor1, xor0,
2366 NULL_RTX, 0, OPTAB_WIDEN);
2367
2368 ix86_expand_branch (code, tmp, const0_rtx, label);
2369 return;
2370 }
2371
2372 /* Otherwise, if we are doing less-than or greater-or-equal-than,
2373 op1 is a constant and the low word is zero, then we can just
2374 examine the high word. Similarly for low word -1 and
2375 less-or-equal-than or greater-than. */
2376
2377 if (CONST_INT_P (hi[1]))
2378 switch (code)
2379 {
2380 case LT: case LTU: case GE: case GEU:
2381 if (lo[1] == const0_rtx)
2382 {
2383 ix86_expand_branch (code, hi[0], hi[1], label);
2384 return;
2385 }
2386 break;
2387 case LE: case LEU: case GT: case GTU:
2388 if (lo[1] == constm1_rtx)
2389 {
2390 ix86_expand_branch (code, hi[0], hi[1], label);
2391 return;
2392 }
2393 break;
2394 default:
2395 break;
2396 }
2397
2398 /* Emulate comparisons that do not depend on Zero flag with
2399 double-word subtraction. Note that only Overflow, Sign
2400 and Carry flags are valid, so swap arguments and condition
2401 of comparisons that would otherwise test Zero flag. */
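/* Concretely, a compare of the low halves is followed by a
   subtract-with-borrow of the high halves into a scratch register, so the
   flags reflect the full double-word subtraction; the branch then tests
   CF for unsigned or SF/OF for signed comparisons.  */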
2402
2403 switch (code)
2404 {
2405 case LE: case LEU: case GT: case GTU:
2406 std::swap (lo[0], lo[1]);
2407 std::swap (hi[0], hi[1]);
2408 code = swap_condition (code);
2409 /* FALLTHRU */
2410
2411 case LT: case LTU: case GE: case GEU:
2412 {
2413 bool uns = (code == LTU || code == GEU);
2414 rtx (*sbb_insn) (machine_mode, rtx, rtx, rtx)
2415 = uns ? gen_sub3_carry_ccc : gen_sub3_carry_ccgz;
2416
2417 if (!nonimmediate_operand (lo[0], submode))
2418 lo[0] = force_reg (submode, lo[0]);
2419 if (!x86_64_general_operand (lo[1], submode))
2420 lo[1] = force_reg (submode, lo[1]);
2421
2422 if (!register_operand (hi[0], submode))
2423 hi[0] = force_reg (submode, hi[0]);
2424 if ((uns && !nonimmediate_operand (hi[1], submode))
2425 || (!uns && !x86_64_general_operand (hi[1], submode)))
2426 hi[1] = force_reg (submode, hi[1]);
2427
2428 emit_insn (gen_cmp_1 (submode, lo[0], lo[1]));
2429
2430 tmp = gen_rtx_SCRATCH (submode);
2431 emit_insn (sbb_insn (submode, tmp, hi[0], hi[1]));
2432
2433 tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
2434 ix86_expand_branch (code, tmp, const0_rtx, label);
2435 return;
2436 }
2437
2438 default:
2439 break;
2440 }
2441
2442 /* Otherwise, we need two or three jumps. */
2443
2444 label2 = gen_label_rtx ();
2445
2446 code1 = code;
2447 code2 = swap_condition (code);
2448 code3 = unsigned_condition (code);
2449
2450 switch (code)
2451 {
2452 case LT: case GT: case LTU: case GTU:
2453 break;
2454
2455 case LE: code1 = LT; code2 = GT; break;
2456 case GE: code1 = GT; code2 = LT; break;
2457 case LEU: code1 = LTU; code2 = GTU; break;
2458 case GEU: code1 = GTU; code2 = LTU; break;
2459
2460 case EQ: code1 = UNKNOWN; code2 = NE; break;
2461 case NE: code2 = UNKNOWN; break;
2462
2463 default:
2464 gcc_unreachable ();
2465 }
2466
2467 /*
2468 * a < b =>
2469 * if (hi(a) < hi(b)) goto true;
2470 * if (hi(a) > hi(b)) goto false;
2471 * if (lo(a) < lo(b)) goto true;
2472 * false:
2473 */
2474
2475 if (code1 != UNKNOWN)
2476 ix86_expand_branch (code1, hi[0], hi[1], label);
2477 if (code2 != UNKNOWN)
2478 ix86_expand_branch (code2, hi[0], hi[1], label2);
2479
2480 ix86_expand_branch (code3, lo[0], lo[1], label);
2481
2482 if (code2 != UNKNOWN)
2483 emit_label (label2);
2484 return;
2485 }
2486
2487 default:
2488 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
2489 goto simple;
2490 }
2491 }
2492
2493 /* Figure out whether to use unordered fp comparisons. */
2494
2495 static bool
2496 ix86_unordered_fp_compare (enum rtx_code code)
2497 {
2498 if (!TARGET_IEEE_FP)
2499 return false;
2500
2501 switch (code)
2502 {
2503 case LT:
2504 case LE:
2505 case GT:
2506 case GE:
2507 case LTGT:
2508 return false;
2509
2510 case EQ:
2511 case NE:
2512
2513 case UNORDERED:
2514 case ORDERED:
2515 case UNLT:
2516 case UNLE:
2517 case UNGT:
2518 case UNGE:
2519 case UNEQ:
2520 return true;
2521
2522 default:
2523 gcc_unreachable ();
2524 }
2525 }
2526
2527 /* Return a comparison we can do that is equivalent to
2528 swap_condition (code), apart possibly from orderedness.
2529 Never change orderedness if TARGET_IEEE_FP, returning
2530 UNKNOWN in that case if necessary. */
2531
2532 static enum rtx_code
2533 ix86_fp_swap_condition (enum rtx_code code)
2534 {
2535 switch (code)
2536 {
2537 case GT: /* GTU - CF=0 & ZF=0 */
2538 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
2539 case GE: /* GEU - CF=0 */
2540 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
2541 case UNLT: /* LTU - CF=1 */
2542 return TARGET_IEEE_FP ? UNKNOWN : GT;
2543 case UNLE: /* LEU - CF=1 | ZF=1 */
2544 return TARGET_IEEE_FP ? UNKNOWN : GE;
2545 default:
2546 return swap_condition (code);
2547 }
2548 }
2549
2550 /* Return the cost of comparison CODE using the best strategy for performance.
2551 All of the following functions use the number of instructions as the cost metric.
2552 In the future this should be tweaked to compute bytes for optimize_size and to
2553 take into account the performance of various instructions on various CPUs. */
2554
2555 static int
2556 ix86_fp_comparison_cost (enum rtx_code code)
2557 {
2558 int arith_cost;
2559
2560 /* The cost of code using bit-twiddling on %ah. */
2561 switch (code)
2562 {
2563 case UNLE:
2564 case UNLT:
2565 case LTGT:
2566 case GT:
2567 case GE:
2568 case UNORDERED:
2569 case ORDERED:
2570 case UNEQ:
2571 arith_cost = 4;
2572 break;
2573 case LT:
2574 case NE:
2575 case EQ:
2576 case UNGE:
2577 arith_cost = TARGET_IEEE_FP ? 5 : 4;
2578 break;
2579 case LE:
2580 case UNGT:
2581 arith_cost = TARGET_IEEE_FP ? 6 : 4;
2582 break;
2583 default:
2584 gcc_unreachable ();
2585 }
2586
2587 switch (ix86_fp_comparison_strategy (code))
2588 {
2589 case IX86_FPCMP_COMI:
2590 return arith_cost > 4 ? 3 : 2;
2591 case IX86_FPCMP_SAHF:
2592 return arith_cost > 4 ? 4 : 3;
2593 default:
2594 return arith_cost;
2595 }
2596 }
2597
2598 /* Swap, force into registers, or otherwise massage the two operands
2599 to a fp comparison. The operands are updated in place; the new
2600 comparison code is returned. */
2601
2602 static enum rtx_code
2603 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
2604 {
2605 bool unordered_compare = ix86_unordered_fp_compare (code);
2606 rtx op0 = *pop0, op1 = *pop1;
2607 machine_mode op_mode = GET_MODE (op0);
2608 bool is_sse = SSE_FLOAT_MODE_SSEMATH_OR_HF_P (op_mode);
2609
2610 /* All of the unordered compare instructions only work on registers.
2611 The same is true of the fcomi compare instructions. The XFmode
2612 compare instructions require registers except when comparing
2613 against zero or when converting operand 1 from fixed point to
2614 floating point. */
2615
2616 if (!is_sse
2617 && (unordered_compare
2618 || (op_mode == XFmode
2619 && ! (standard_80387_constant_p (op0) == 1
2620 || standard_80387_constant_p (op1) == 1)
2621 && GET_CODE (op1) != FLOAT)
2622 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
2623 {
2624 op0 = force_reg (op_mode, op0);
2625 op1 = force_reg (op_mode, op1);
2626 }
2627 else
2628 {
2629 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
2630 things around if they appear profitable, otherwise force op0
2631 into a register. */
2632
2633 if (standard_80387_constant_p (op0) == 0
2634 || (MEM_P (op0)
2635 && ! (standard_80387_constant_p (op1) == 0
2636 || MEM_P (op1))))
2637 {
2638 enum rtx_code new_code = ix86_fp_swap_condition (code);
2639 if (new_code != UNKNOWN)
2640 {
2641 std::swap (op0, op1);
2642 code = new_code;
2643 }
2644 }
2645
2646 if (!REG_P (op0))
2647 op0 = force_reg (op_mode, op0);
2648
2649 if (CONSTANT_P (op1))
2650 {
2651 int tmp = standard_80387_constant_p (op1);
2652 if (tmp == 0)
2653 op1 = validize_mem (force_const_mem (op_mode, op1));
2654 else if (tmp == 1)
2655 {
2656 if (TARGET_CMOVE)
2657 op1 = force_reg (op_mode, op1);
2658 }
2659 else
2660 op1 = force_reg (op_mode, op1);
2661 }
2662 }
2663
2664 /* Try to rearrange the comparison to make it cheaper. */
2665 if (ix86_fp_comparison_cost (code)
2666 > ix86_fp_comparison_cost (swap_condition (code))
2667 && (REG_P (op1) || can_create_pseudo_p ()))
2668 {
2669 std::swap (op0, op1);
2670 code = swap_condition (code);
2671 if (!REG_P (op0))
2672 op0 = force_reg (op_mode, op0);
2673 }
2674
2675 *pop0 = op0;
2676 *pop1 = op1;
2677 return code;
2678 }
2679
2680 /* Generate insn patterns to do a floating point compare of OPERANDS. */
2681
2682 static rtx
2683 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)
2684 {
2685 bool unordered_compare = ix86_unordered_fp_compare (code);
2686 machine_mode cmp_mode;
2687 rtx tmp, scratch;
2688
2689 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
2690
2691 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
2692 if (unordered_compare)
2693 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
2694
2695 /* Do fcomi/sahf based test when profitable. */
2696 switch (ix86_fp_comparison_strategy (code))
2697 {
2698 case IX86_FPCMP_COMI:
2699 cmp_mode = CCFPmode;
2700 emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
2701 break;
2702
2703 case IX86_FPCMP_SAHF:
2704 cmp_mode = CCFPmode;
2705 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2706 scratch = gen_reg_rtx (HImode);
2707 emit_insn (gen_rtx_SET (scratch, tmp));
2708 emit_insn (gen_x86_sahf_1 (scratch));
2709 break;
2710
2711 case IX86_FPCMP_ARITH:
2712 cmp_mode = CCNOmode;
2713 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2714 scratch = gen_reg_rtx (HImode);
2715 emit_insn (gen_rtx_SET (scratch, tmp));
2716
2717 /* In the unordered case, we have to check C2 for NaNs, which
2718 doesn't happen to work out to anything nice combination-wise.
2719 So do some bit twiddling on the value we've got in AH to come
2720 up with an appropriate set of condition codes. */
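/* After fnstsw, AH holds C0 in bit 0 (0x01), C2 in bit 2 (0x04) and C3 in
   bit 6 (0x40); a compare sets C0 for "less", C3 for "equal" and all of
   C0/C2/C3 for unordered operands, which is why masks such as 0x45
   (C0|C2|C3) and 0x40 (C3) appear below.  */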
2721
2722 switch (code)
2723 {
2724 case GT:
2725 case UNGT:
2726 if (code == GT || !TARGET_IEEE_FP)
2727 {
2728 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2729 code = EQ;
2730 }
2731 else
2732 {
2733 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2734 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2735 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
2736 cmp_mode = CCmode;
2737 code = GEU;
2738 }
2739 break;
2740 case LT:
2741 case UNLT:
2742 if (code == LT && TARGET_IEEE_FP)
2743 {
2744 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2745 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
2746 cmp_mode = CCmode;
2747 code = EQ;
2748 }
2749 else
2750 {
2751 emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
2752 code = NE;
2753 }
2754 break;
2755 case GE:
2756 case UNGE:
2757 if (code == GE || !TARGET_IEEE_FP)
2758 {
2759 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
2760 code = EQ;
2761 }
2762 else
2763 {
2764 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2765 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
2766 code = NE;
2767 }
2768 break;
2769 case LE:
2770 case UNLE:
2771 if (code == LE && TARGET_IEEE_FP)
2772 {
2773 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2774 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2775 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2776 cmp_mode = CCmode;
2777 code = LTU;
2778 }
2779 else
2780 {
2781 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2782 code = NE;
2783 }
2784 break;
2785 case EQ:
2786 case UNEQ:
2787 if (code == EQ && TARGET_IEEE_FP)
2788 {
2789 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2790 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2791 cmp_mode = CCmode;
2792 code = EQ;
2793 }
2794 else
2795 {
2796 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2797 code = NE;
2798 }
2799 break;
2800 case NE:
2801 case LTGT:
2802 if (code == NE && TARGET_IEEE_FP)
2803 {
2804 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2805 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
2806 GEN_INT (0x40)));
2807 code = NE;
2808 }
2809 else
2810 {
2811 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2812 code = EQ;
2813 }
2814 break;
2815
2816 case UNORDERED:
2817 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2818 code = NE;
2819 break;
2820 case ORDERED:
2821 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2822 code = EQ;
2823 break;
2824
2825 default:
2826 gcc_unreachable ();
2827 }
2828 break;
2829
2830 default:
2831 gcc_unreachable ();
2832 }
2833
2834 /* Return the test that should be put into the flags user, i.e.
2835 the bcc, scc, or cmov instruction. */
2836 return gen_rtx_fmt_ee (code, VOIDmode,
2837 gen_rtx_REG (cmp_mode, FLAGS_REG),
2838 const0_rtx);
2839 }
2840
2841 /* Generate insn patterns to do an integer compare of OPERANDS. */
2842
2843 static rtx
2844 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
2845 {
2846 machine_mode cmpmode;
2847 rtx tmp, flags;
2848
2849 /* Swap operands to emit carry flag comparison. */
2850 if ((code == GTU || code == LEU)
2851 && nonimmediate_operand (op1, VOIDmode))
2852 {
2853 std::swap (op0, op1);
2854 code = swap_condition (code);
2855 }
2856
2857 cmpmode = SELECT_CC_MODE (code, op0, op1);
2858 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
2859
2860 /* This is very simple, but making the interface the same as in the
2861 FP case makes the rest of the code easier. */
2862 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
2863 emit_insn (gen_rtx_SET (flags, tmp));
2864
2865 /* Return the test that should be put into the flags user, i.e.
2866 the bcc, scc, or cmov instruction. */
2867 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
2868 }
2869
2870 static rtx
2871 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
2872 {
2873 rtx ret;
2874
2875 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
2876 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
2877
2878 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
2879 {
2880 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
2881 ret = ix86_expand_fp_compare (code, op0, op1);
2882 }
2883 else
2884 ret = ix86_expand_int_compare (code, op0, op1);
2885
2886 return ret;
2887 }
2888
2889 void
2890 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
2891 {
2892 rtx ret;
2893
2894 gcc_assert (GET_MODE (dest) == QImode);
2895
2896 ret = ix86_expand_compare (code, op0, op1);
2897 PUT_MODE (ret, QImode);
2898 emit_insn (gen_rtx_SET (dest, ret));
2899 }
2900
2901 /* Expand floating point op0 <=> op1, i.e.
2902 dest = op0 == op1 ? 0 : op0 < op1 ? -1 : op0 > op1 ? 1 : 2. */
2903
2904 void
2905 ix86_expand_fp_spaceship (rtx dest, rtx op0, rtx op1)
2906 {
2907 gcc_checking_assert (ix86_fp_comparison_strategy (GT) != IX86_FPCMP_ARITH);
2908 rtx gt = ix86_expand_fp_compare (GT, op0, op1);
2909 rtx l0 = gen_label_rtx ();
2910 rtx l1 = gen_label_rtx ();
2911 rtx l2 = TARGET_IEEE_FP ? gen_label_rtx () : NULL_RTX;
2912 rtx lend = gen_label_rtx ();
2913 rtx tmp;
2914 rtx_insn *jmp;
2915 if (l2)
2916 {
2917 rtx un = gen_rtx_fmt_ee (UNORDERED, VOIDmode,
2918 gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx);
2919 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, un,
2920 gen_rtx_LABEL_REF (VOIDmode, l2), pc_rtx);
2921 jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2922 add_reg_br_prob_note (jmp, profile_probability::very_unlikely ());
2923 }
2924 rtx eq = gen_rtx_fmt_ee (UNEQ, VOIDmode,
2925 gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx);
2926 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, eq,
2927 gen_rtx_LABEL_REF (VOIDmode, l0), pc_rtx);
2928 jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2929 add_reg_br_prob_note (jmp, profile_probability::unlikely ());
2930 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, gt,
2931 gen_rtx_LABEL_REF (VOIDmode, l1), pc_rtx);
2932 jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2933 add_reg_br_prob_note (jmp, profile_probability::even ());
2934 emit_move_insn (dest, constm1_rtx);
2935 emit_jump (lend);
2936 emit_label (l0);
2937 emit_move_insn (dest, const0_rtx);
2938 emit_jump (lend);
2939 emit_label (l1);
2940 emit_move_insn (dest, const1_rtx);
2941 emit_jump (lend);
2942 if (l2)
2943 {
2944 emit_label (l2);
2945 emit_move_insn (dest, const2_rtx);
2946 }
2947 emit_label (lend);
2948 }
2949
2950 /* Expand comparison setting or clearing carry flag. Return true when
2951 successful and set pop for the operation. */
2952 static bool
2953 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
2954 {
2955 machine_mode mode
2956 = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
2957
2958 /* Do not handle double-mode compares that go through the special path. */
2959 if (mode == (TARGET_64BIT ? TImode : DImode))
2960 return false;
2961
2962 if (SCALAR_FLOAT_MODE_P (mode))
2963 {
2964 rtx compare_op;
2965 rtx_insn *compare_seq;
2966
2967 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
2968
2969 /* Shortcut: the following common codes never translate
2970 into carry flag compares. */
2971 if (code == EQ || code == NE || code == UNEQ || code == LTGT
2972 || code == ORDERED || code == UNORDERED)
2973 return false;
2974
2975 /* These comparisons require zero flag; swap operands so they won't. */
2976 if ((code == GT || code == UNLE || code == LE || code == UNGT)
2977 && !TARGET_IEEE_FP)
2978 {
2979 std::swap (op0, op1);
2980 code = swap_condition (code);
2981 }
2982
2983 /* Try to expand the comparison and verify that we end up with
2984 a carry-flag-based comparison. This fails only when we decide
2985 to expand the comparison using arithmetic, which is not a very
2986 common scenario. */
2987 start_sequence ();
2988 compare_op = ix86_expand_fp_compare (code, op0, op1);
2989 compare_seq = get_insns ();
2990 end_sequence ();
2991
2992 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
2993 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
2994 else
2995 code = GET_CODE (compare_op);
2996
2997 if (code != LTU && code != GEU)
2998 return false;
2999
3000 emit_insn (compare_seq);
3001 *pop = compare_op;
3002 return true;
3003 }
3004
3005 if (!INTEGRAL_MODE_P (mode))
3006 return false;
3007
3008 switch (code)
3009 {
3010 case LTU:
3011 case GEU:
3012 break;
3013
3014 /* Convert a==0 into (unsigned)a<1. */
3015 case EQ:
3016 case NE:
3017 if (op1 != const0_rtx)
3018 return false;
3019 op1 = const1_rtx;
3020 code = (code == EQ ? LTU : GEU);
3021 break;
3022
3023 /* Convert a>b into b<a or a>=b+1. */
3024 case GTU:
3025 case LEU:
3026 if (CONST_INT_P (op1))
3027 {
3028 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
3029 /* Bail out on overflow. We could still swap the operands, but
3030 that would force loading of the constant into a register. */
3031 if (op1 == const0_rtx
3032 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
3033 return false;
3034 code = (code == GTU ? GEU : LTU);
3035 }
3036 else
3037 {
3038 std::swap (op0, op1);
3039 code = (code == GTU ? LTU : GEU);
3040 }
3041 break;
3042
3043 /* Convert a>=0 into (unsigned)a<0x80000000. */
3044 case LT:
3045 case GE:
3046 if (mode == DImode || op1 != const0_rtx)
3047 return false;
3048 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
3049 code = (code == LT ? GEU : LTU);
3050 break;
3051 case LE:
3052 case GT:
3053 if (mode == DImode || op1 != constm1_rtx)
3054 return false;
3055 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
3056 code = (code == LE ? GEU : LTU);
3057 break;
3058
3059 default:
3060 return false;
3061 }
3062 /* Swapping operands may cause a constant to appear as the first operand. */
3063 if (!nonimmediate_operand (op0, VOIDmode))
3064 {
3065 if (!can_create_pseudo_p ())
3066 return false;
3067 op0 = force_reg (mode, op0);
3068 }
3069 *pop = ix86_expand_compare (code, op0, op1);
3070 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
3071 return true;
3072 }
3073
3074 /* Expand conditional increment or decrement using adc/sbb instructions.
3075 The default case, using setcc followed by a conditional move, can be
3076 done by generic code. */
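/* For example, x = (a < b) ? x + 1 : x with an unsigned compare becomes a
   compare of a and b followed by adc $0, x, reusing the carry produced by
   the compare.  */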
3077 bool
3078 ix86_expand_int_addcc (rtx operands[])
3079 {
3080 enum rtx_code code = GET_CODE (operands[1]);
3081 rtx flags;
3082 rtx (*insn) (machine_mode, rtx, rtx, rtx, rtx, rtx);
3083 rtx compare_op;
3084 rtx val = const0_rtx;
3085 bool fpcmp = false;
3086 machine_mode mode;
3087 rtx op0 = XEXP (operands[1], 0);
3088 rtx op1 = XEXP (operands[1], 1);
3089
3090 if (operands[3] != const1_rtx
3091 && operands[3] != constm1_rtx)
3092 return false;
3093 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
3094 return false;
3095 code = GET_CODE (compare_op);
3096
3097 flags = XEXP (compare_op, 0);
3098
3099 if (GET_MODE (flags) == CCFPmode)
3100 {
3101 fpcmp = true;
3102 code = ix86_fp_compare_code_to_integer (code);
3103 }
3104
3105 if (code != LTU)
3106 {
3107 val = constm1_rtx;
3108 if (fpcmp)
3109 PUT_CODE (compare_op,
3110 reverse_condition_maybe_unordered
3111 (GET_CODE (compare_op)));
3112 else
3113 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
3114 }
3115
3116 mode = GET_MODE (operands[0]);
3117
3118 /* Construct either adc or sbb insn. */
3119 if ((code == LTU) == (operands[3] == constm1_rtx))
3120 insn = gen_sub3_carry;
3121 else
3122 insn = gen_add3_carry;
3123
3124 emit_insn (insn (mode, operands[0], operands[2], val, flags, compare_op));
3125
3126 return true;
3127 }
3128
3129 bool
3130 ix86_expand_int_movcc (rtx operands[])
3131 {
3132 enum rtx_code code = GET_CODE (operands[1]), compare_code;
3133 rtx_insn *compare_seq;
3134 rtx compare_op;
3135 machine_mode mode = GET_MODE (operands[0]);
3136 bool sign_bit_compare_p = false;
3137 rtx op0 = XEXP (operands[1], 0);
3138 rtx op1 = XEXP (operands[1], 1);
3139 rtx op2 = operands[2];
3140 rtx op3 = operands[3];
3141
3142 if (GET_MODE (op0) == TImode
3143 || (GET_MODE (op0) == DImode
3144 && !TARGET_64BIT))
3145 return false;
3146
3147 start_sequence ();
3148 compare_op = ix86_expand_compare (code, op0, op1);
3149 compare_seq = get_insns ();
3150 end_sequence ();
3151
3152 compare_code = GET_CODE (compare_op);
3153
3154 if ((op1 == const0_rtx && (code == GE || code == LT))
3155 || (op1 == constm1_rtx && (code == GT || code == LE)))
3156 sign_bit_compare_p = true;
3157
3158 /* op0 == op1 ? op0 : op3 is equivalent to op0 == op1 ? op1 : op3,
3159 but if op1 is a constant, the latter form allows more optimizations,
3160 either through the handling of the case where the last two operands
3161 are constants, or the case of one constant and one variable. On the
3162 other hand, for cmov the former might be better, as we don't need to
3163 load the constant into another register. */
3164 if (code == EQ && CONST_INT_P (op1) && rtx_equal_p (op0, op2))
3165 op2 = op1;
3166 /* Similarly for op0 != op1 ? op2 : op0 and op0 != op1 ? op2 : op1. */
3167 else if (code == NE && CONST_INT_P (op1) && rtx_equal_p (op0, op3))
3168 op3 = op1;
3169
3170 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
3171 HImode insns, we'd be swallowed in word prefix ops. */
3172
3173 if ((mode != HImode || TARGET_FAST_PREFIX)
3174 && (mode != (TARGET_64BIT ? TImode : DImode))
3175 && CONST_INT_P (op2)
3176 && CONST_INT_P (op3))
3177 {
3178 rtx out = operands[0];
3179 HOST_WIDE_INT ct = INTVAL (op2);
3180 HOST_WIDE_INT cf = INTVAL (op3);
3181 HOST_WIDE_INT diff;
3182
3183 diff = ct - cf;
3184 /* Sign bit compares are better done using shifts than by using
3185 sbb. */
3186 if (sign_bit_compare_p
3187 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
3188 {
3189 /* Detect overlap between destination and compare sources. */
3190 rtx tmp = out;
3191
3192 if (!sign_bit_compare_p)
3193 {
3194 rtx flags;
3195 bool fpcmp = false;
3196
3197 compare_code = GET_CODE (compare_op);
3198
3199 flags = XEXP (compare_op, 0);
3200
3201 if (GET_MODE (flags) == CCFPmode)
3202 {
3203 fpcmp = true;
3204 compare_code
3205 = ix86_fp_compare_code_to_integer (compare_code);
3206 }
3207
3208 /* To simplify rest of code, restrict to the GEU case. */
3209 if (compare_code == LTU)
3210 {
3211 std::swap (ct, cf);
3212 compare_code = reverse_condition (compare_code);
3213 code = reverse_condition (code);
3214 }
3215 else
3216 {
3217 if (fpcmp)
3218 PUT_CODE (compare_op,
3219 reverse_condition_maybe_unordered
3220 (GET_CODE (compare_op)));
3221 else
3222 PUT_CODE (compare_op,
3223 reverse_condition (GET_CODE (compare_op)));
3224 }
3225 diff = ct - cf;
3226
3227 if (reg_overlap_mentioned_p (out, compare_op))
3228 tmp = gen_reg_rtx (mode);
3229
3230 if (mode == DImode)
3231 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
3232 else
3233 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
3234 flags, compare_op));
3235 }
3236 else
3237 {
3238 if (code == GT || code == GE)
3239 code = reverse_condition (code);
3240 else
3241 {
3242 std::swap (ct, cf);
3243 diff = ct - cf;
3244 }
3245 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
3246 }
3247
3248 if (diff == 1)
3249 {
3250 /*
3251 * cmpl op0,op1
3252 * sbbl dest,dest
3253 * [addl dest, ct]
3254 *
3255 * Size 5 - 8.
3256 */
3257 if (ct)
3258 tmp = expand_simple_binop (mode, PLUS,
3259 tmp, GEN_INT (ct),
3260 copy_rtx (tmp), 1, OPTAB_DIRECT);
3261 }
3262 else if (cf == -1)
3263 {
3264 /*
3265 * cmpl op0,op1
3266 * sbbl dest,dest
3267 * orl $ct, dest
3268 *
3269 * Size 8.
3270 */
3271 tmp = expand_simple_binop (mode, IOR,
3272 tmp, GEN_INT (ct),
3273 copy_rtx (tmp), 1, OPTAB_DIRECT);
3274 }
3275 else if (diff == -1 && ct)
3276 {
3277 /*
3278 * cmpl op0,op1
3279 * sbbl dest,dest
3280 * notl dest
3281 * [addl dest, cf]
3282 *
3283 * Size 8 - 11.
3284 */
3285 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3286 if (cf)
3287 tmp = expand_simple_binop (mode, PLUS,
3288 copy_rtx (tmp), GEN_INT (cf),
3289 copy_rtx (tmp), 1, OPTAB_DIRECT);
3290 }
3291 else
3292 {
3293 /*
3294 * cmpl op0,op1
3295 * sbbl dest,dest
3296 * [notl dest]
3297 * andl cf - ct, dest
3298 * [addl dest, ct]
3299 *
3300 * Size 8 - 11.
3301 */
3302
3303 if (cf == 0)
3304 {
3305 cf = ct;
3306 ct = 0;
3307 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3308 }
3309
3310 tmp = expand_simple_binop (mode, AND,
3311 copy_rtx (tmp),
3312 gen_int_mode (cf - ct, mode),
3313 copy_rtx (tmp), 1, OPTAB_DIRECT);
3314 if (ct)
3315 tmp = expand_simple_binop (mode, PLUS,
3316 copy_rtx (tmp), GEN_INT (ct),
3317 copy_rtx (tmp), 1, OPTAB_DIRECT);
3318 }
3319
3320 if (!rtx_equal_p (tmp, out))
3321 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
3322
3323 return true;
3324 }
3325
3326 if (diff < 0)
3327 {
3328 machine_mode cmp_mode = GET_MODE (op0);
3329 enum rtx_code new_code;
3330
3331 if (SCALAR_FLOAT_MODE_P (cmp_mode))
3332 {
3333 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3334
3335 /* We may be reversing a non-trapping
3336 comparison to a trapping comparison. */
3337 if (HONOR_NANS (cmp_mode) && flag_trapping_math
3338 && code != EQ && code != NE
3339 && code != ORDERED && code != UNORDERED)
3340 new_code = UNKNOWN;
3341 else
3342 new_code = reverse_condition_maybe_unordered (code);
3343 }
3344 else
3345 new_code = ix86_reverse_condition (code, cmp_mode);
3346 if (new_code != UNKNOWN)
3347 {
3348 std::swap (ct, cf);
3349 diff = -diff;
3350 code = new_code;
3351 }
3352 }
3353
3354 compare_code = UNKNOWN;
3355 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
3356 && CONST_INT_P (op1))
3357 {
3358 if (op1 == const0_rtx
3359 && (code == LT || code == GE))
3360 compare_code = code;
3361 else if (op1 == constm1_rtx)
3362 {
3363 if (code == LE)
3364 compare_code = LT;
3365 else if (code == GT)
3366 compare_code = GE;
3367 }
3368 }
3369
3370 /* Optimize dest = (op0 < 0) ? -1 : cf. */
3371 if (compare_code != UNKNOWN
3372 && GET_MODE (op0) == GET_MODE (out)
3373 && (cf == -1 || ct == -1))
3374 {
3375 /* If lea code below could be used, only optimize
3376 if it results in a 2 insn sequence. */
3377
3378 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
3379 || diff == 3 || diff == 5 || diff == 9)
3380 || (compare_code == LT && ct == -1)
3381 || (compare_code == GE && cf == -1))
3382 {
3383 /*
3384 * notl op1 (if necessary)
3385 * sarl $31, op1
3386 * orl cf, op1
3387 */
3388 if (ct != -1)
3389 {
3390 cf = ct;
3391 ct = -1;
3392 code = reverse_condition (code);
3393 }
3394
3395 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3396
3397 out = expand_simple_binop (mode, IOR,
3398 out, GEN_INT (cf),
3399 out, 1, OPTAB_DIRECT);
3400 if (out != operands[0])
3401 emit_move_insn (operands[0], out);
3402
3403 return true;
3404 }
3405 }
3406
3407
3408 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
3409 || diff == 3 || diff == 5 || diff == 9)
3410 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
3411 && (mode != DImode
3412 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
3413 {
3414 /*
3415 * xorl dest,dest
3416 * cmpl op1,op2
3417 * setcc dest
3418 * lea cf(dest*(ct-cf)),dest
3419 *
3420 * Size 14.
3421 *
3422 * This also catches the degenerate setcc-only case.
3423 */
3424
3425 rtx tmp;
3426 int nops;
3427
3428 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3429
3430 nops = 0;
3431 /* On x86_64 the lea instruction operates on Pmode, so we need
3432 to get the arithmetic done in the proper mode to match. */
3433 if (diff == 1)
3434 tmp = copy_rtx (out);
3435 else
3436 {
3437 rtx out1;
3438 out1 = copy_rtx (out);
3439 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
3440 nops++;
3441 if (diff & 1)
3442 {
3443 tmp = gen_rtx_PLUS (mode, tmp, out1);
3444 nops++;
3445 }
3446 }
3447 if (cf != 0)
3448 {
3449 tmp = plus_constant (mode, tmp, cf);
3450 nops++;
3451 }
3452 if (!rtx_equal_p (tmp, out))
3453 {
3454 if (nops == 1)
3455 out = force_operand (tmp, copy_rtx (out));
3456 else
3457 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
3458 }
3459 if (!rtx_equal_p (out, operands[0]))
3460 emit_move_insn (operands[0], copy_rtx (out));
3461
3462 return true;
3463 }
3464
3465 /*
3466 * General case: Jumpful:
3467 * xorl dest,dest cmpl op1, op2
3468 * cmpl op1, op2 movl ct, dest
3469 * setcc dest jcc 1f
3470 * decl dest movl cf, dest
3471 * andl (cf-ct),dest 1:
3472 * addl ct,dest
3473 *
3474 * Size 20. Size 14.
3475 *
3476 * This is reasonably steep, but branch mispredict costs are
3477 * high on modern cpus, so consider failing only if optimizing
3478 * for space.
3479 */
3480
3481 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3482 && BRANCH_COST (optimize_insn_for_speed_p (),
3483 false) >= 2)
3484 {
3485 if (cf == 0)
3486 {
3487 machine_mode cmp_mode = GET_MODE (op0);
3488 enum rtx_code new_code;
3489
3490 if (SCALAR_FLOAT_MODE_P (cmp_mode))
3491 {
3492 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3493
3494 /* We may be reversing a non-trapping
3495 comparison to a trapping comparison. */
3496 if (HONOR_NANS (cmp_mode) && flag_trapping_math
3497 && code != EQ && code != NE
3498 && code != ORDERED && code != UNORDERED)
3499 new_code = UNKNOWN;
3500 else
3501 new_code = reverse_condition_maybe_unordered (code);
3502
3503 }
3504 else
3505 {
3506 new_code = ix86_reverse_condition (code, cmp_mode);
3507 if (compare_code != UNKNOWN && new_code != UNKNOWN)
3508 compare_code = reverse_condition (compare_code);
3509 }
3510
3511 if (new_code != UNKNOWN)
3512 {
3513 cf = ct;
3514 ct = 0;
3515 code = new_code;
3516 }
3517 }
3518
3519 if (compare_code != UNKNOWN)
3520 {
3521 /* notl op1 (if needed)
3522 sarl $31, op1
3523 andl (cf-ct), op1
3524 addl ct, op1
3525
3526 For x < 0 (resp. x <= -1) there will be no notl,
3527 so if possible swap the constants to get rid of the
3528 complement.
3529 True/false will be -1/0 while code below (store flag
3530 followed by decrement) is 0/-1, so the constants need
3531 to be exchanged once more. */
3532
3533 if (compare_code == GE || !cf)
3534 {
3535 code = reverse_condition (code);
3536 compare_code = LT;
3537 }
3538 else
3539 std::swap (ct, cf);
3540
3541 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3542 }
3543 else
3544 {
3545 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3546
3547 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
3548 constm1_rtx,
3549 copy_rtx (out), 1, OPTAB_DIRECT);
3550 }
3551
3552 out = expand_simple_binop (mode, AND, copy_rtx (out),
3553 gen_int_mode (cf - ct, mode),
3554 copy_rtx (out), 1, OPTAB_DIRECT);
3555 if (ct)
3556 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
3557 copy_rtx (out), 1, OPTAB_DIRECT);
3558 if (!rtx_equal_p (out, operands[0]))
3559 emit_move_insn (operands[0], copy_rtx (out));
3560
3561 return true;
3562 }
3563 }
3564
3565 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3566 {
3567 /* Try a few things more with specific constants and a variable. */
3568
3569 optab op;
3570 rtx var, orig_out, out, tmp;
3571
3572 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
3573 return false;
3574
3575 operands[2] = op2;
3576 operands[3] = op3;
3577
3578 /* If one of the two operands is an interesting constant, load a
3579 constant with the above and mask it in with a logical operation. */
3580
3581 if (CONST_INT_P (operands[2]))
3582 {
3583 var = operands[3];
3584 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
3585 operands[3] = constm1_rtx, op = and_optab;
3586 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
3587 operands[3] = const0_rtx, op = ior_optab;
3588 else
3589 return false;
3590 }
3591 else if (CONST_INT_P (operands[3]))
3592 {
3593 var = operands[2];
3594 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
3595 {
3596 /* For smin (x, 0), expand as "x < 0 ? x : 0" instead of
3597 "x <= 0 ? x : 0" to enable sign_bit_compare_p. */
3598 if (code == LE && op1 == const0_rtx && rtx_equal_p (op0, var))
3599 operands[1] = simplify_gen_relational (LT, VOIDmode,
3600 GET_MODE (op0),
3601 op0, const0_rtx);
3602
3603 operands[2] = constm1_rtx;
3604 op = and_optab;
3605 }
3606 else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
3607 operands[2] = const0_rtx, op = ior_optab;
3608 else
3609 return false;
3610 }
3611 else
3612 return false;
3613
3614 orig_out = operands[0];
3615 tmp = gen_reg_rtx (mode);
3616 operands[0] = tmp;
3617
3618 /* Recurse to get the constant loaded. */
3619 if (!ix86_expand_int_movcc (operands))
3620 return false;
3621
3622 /* Mask in the interesting variable. */
3623 out = expand_binop (mode, op, var, tmp, orig_out, 0,
3624 OPTAB_WIDEN);
3625 if (!rtx_equal_p (out, orig_out))
3626 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
3627
3628 return true;
3629 }
3630
3631 /*
3632 * For comparison with above,
3633 *
3634 * movl cf,dest
3635 * movl ct,tmp
3636 * cmpl op1,op2
3637 * cmovcc tmp,dest
3638 *
3639 * Size 15.
3640 */
3641
3642 if (! nonimmediate_operand (operands[2], mode))
3643 operands[2] = force_reg (mode, operands[2]);
3644 if (! nonimmediate_operand (operands[3], mode))
3645 operands[3] = force_reg (mode, operands[3]);
3646
3647 if (! register_operand (operands[2], VOIDmode)
3648 && (mode == QImode
3649 || ! register_operand (operands[3], VOIDmode)))
3650 operands[2] = force_reg (mode, operands[2]);
3651
3652 if (mode == QImode
3653 && ! register_operand (operands[3], VOIDmode))
3654 operands[3] = force_reg (mode, operands[3]);
3655
3656 emit_insn (compare_seq);
3657 emit_insn (gen_rtx_SET (operands[0],
3658 gen_rtx_IF_THEN_ELSE (mode,
3659 compare_op, operands[2],
3660 operands[3])));
3661 return true;
3662 }
3663
3664 /* Detect conditional moves that exactly match min/max operational
3665 semantics. Note that this is IEEE safe, as long as we don't
3666 interchange the operands.
3667
3668 Returns FALSE if this conditional move doesn't match a MIN/MAX,
3669 and TRUE if the operation is successful and instructions are emitted. */
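/* SSE min/max instructions return their second source operand when the
   operands are unordered or both zero, so "a < b ? a : b" maps exactly
   onto min (a, b) as long as the operand order is preserved; interchanging
   the operands would change the NaN and signed-zero behavior.  */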
3670
3671 static bool
3672 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
3673 rtx cmp_op1, rtx if_true, rtx if_false)
3674 {
3675 machine_mode mode;
3676 bool is_min;
3677 rtx tmp;
3678
3679 if (code == LT)
3680 ;
3681 else if (code == UNGE)
3682 std::swap (if_true, if_false);
3683 else
3684 return false;
3685
3686 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
3687 is_min = true;
3688 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
3689 is_min = false;
3690 else
3691 return false;
3692
3693 mode = GET_MODE (dest);
3694
3695 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
3696 but MODE may be a vector mode and thus not appropriate. */
3697 if (!flag_finite_math_only || flag_signed_zeros)
3698 {
3699 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
3700 rtvec v;
3701
3702 if_true = force_reg (mode, if_true);
3703 v = gen_rtvec (2, if_true, if_false);
3704 tmp = gen_rtx_UNSPEC (mode, v, u);
3705 }
3706 else
3707 {
3708 code = is_min ? SMIN : SMAX;
3709 if (MEM_P (if_true) && MEM_P (if_false))
3710 if_true = force_reg (mode, if_true);
3711 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
3712 }
3713
3714 emit_insn (gen_rtx_SET (dest, tmp));
3715 return true;
3716 }
3717
3718 /* Return true if MODE is valid for a vector compare into a mask register,
3719 and likewise for a conditional vector move with a mask register. */
3720 static bool
3721 ix86_valid_mask_cmp_mode (machine_mode mode)
3722 {
3723 /* XOP has its own vector conditional movement. */
3724 if (TARGET_XOP && !TARGET_AVX512F)
3725 return false;
3726
3727 /* HFmode only supports vcmpsh whose dest is mask register. */
3728 if (TARGET_AVX512FP16 && mode == HFmode)
3729 return true;
3730
3731 /* AVX512F is needed for mask operation. */
3732 if (!(TARGET_AVX512F && VECTOR_MODE_P (mode)))
3733 return false;
3734
3735 /* AVX512BW is needed for vector QI/HImode,
3736 AVX512VL is needed for 128/256-bit vector. */
3737 machine_mode inner_mode = GET_MODE_INNER (mode);
3738 int vector_size = GET_MODE_SIZE (mode);
3739 if ((inner_mode == QImode || inner_mode == HImode) && !TARGET_AVX512BW)
3740 return false;
3741
3742 return vector_size == 64 || TARGET_AVX512VL;
3743 }
3744
3745 /* Return true if integer mask comparison should be used. */
3746 static bool
3747 ix86_use_mask_cmp_p (machine_mode mode, machine_mode cmp_mode,
3748 rtx op_true, rtx op_false)
3749 {
3750 int vector_size = GET_MODE_SIZE (mode);
3751
3752 if (cmp_mode == HFmode)
3753 return true;
3754 else if (vector_size < 16)
3755 return false;
3756 else if (vector_size == 64)
3757 return true;
3758 else if (GET_MODE_INNER (cmp_mode) == HFmode)
3759 return true;
3760
3761 /* When op_true is NULL, op_false must be NULL, or vice versa. */
3762 gcc_assert (!op_true == !op_false);
3763
3764 /* When op_true/op_false is NULL or cmp_mode is not valid mask cmp mode,
3765 vector dest is required. */
3766 if (!op_true || !ix86_valid_mask_cmp_mode (cmp_mode))
3767 return false;
3768
3769 /* Exclude those that could be optimized in ix86_expand_sse_movcc. */
3770 if (op_false == CONST0_RTX (mode)
3771 || op_true == CONST0_RTX (mode)
3772 || (INTEGRAL_MODE_P (mode)
3773 && (op_true == CONSTM1_RTX (mode)
3774 || op_false == CONSTM1_RTX (mode))))
3775 return false;
3776
3777 return true;
3778 }
3779
3780 /* Expand an SSE comparison. Return the register with the result. */
3781
3782 static rtx
3783 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
3784 rtx op_true, rtx op_false)
3785 {
3786 machine_mode mode = GET_MODE (dest);
3787 machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
3788
3789 /* In the general case the result of the comparison can differ from the operands' type. */
3790 machine_mode cmp_mode;
3791
3792 /* In AVX512F the result of comparison is an integer mask. */
3793 bool maskcmp = false;
3794 rtx x;
3795
3796 if (ix86_use_mask_cmp_p (mode, cmp_ops_mode, op_true, op_false))
3797 {
3798 unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
3799 maskcmp = true;
3800 cmp_mode = nbits > 8 ? int_mode_for_size (nbits, 0).require () : E_QImode;
3801 }
3802 else
3803 cmp_mode = cmp_ops_mode;
3804
3805 cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
3806
3807 bool (*op1_predicate)(rtx, machine_mode)
3808 = VECTOR_MODE_P (cmp_ops_mode) ? vector_operand : nonimmediate_operand;
3809
3810 if (!op1_predicate (cmp_op1, cmp_ops_mode))
3811 cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
3812
3813 if (optimize
3814 || (maskcmp && cmp_mode != mode)
3815 || (op_true && reg_overlap_mentioned_p (dest, op_true))
3816 || (op_false && reg_overlap_mentioned_p (dest, op_false)))
3817 dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
3818
3819 if (maskcmp)
3820 {
3821 bool ok = ix86_expand_mask_vec_cmp (dest, code, cmp_op0, cmp_op1);
3822 gcc_assert (ok);
3823 return dest;
3824 }
3825
3826 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
3827
3828 if (cmp_mode != mode)
3829 {
3830 x = force_reg (cmp_ops_mode, x);
3831 convert_move (dest, x, false);
3832 }
3833 else
3834 emit_insn (gen_rtx_SET (dest, x));
3835
3836 return dest;
3837 }
3838
3839 /* Emit x86 binary operator CODE in mode MODE for SSE vector
3840 instructions that can be performed using GP registers. */
3841
3842 static void
3843 ix86_emit_vec_binop (enum rtx_code code, machine_mode mode,
3844 rtx dst, rtx src1, rtx src2)
3845 {
3846 rtx tmp;
3847
3848 tmp = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
3849
3850 if (GET_MODE_SIZE (mode) <= GET_MODE_SIZE (SImode)
3851 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
3852 {
3853 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
3854 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
3855 }
3856
3857 emit_insn (tmp);
3858 }
3859
3860 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
3861 operations. This is used for both scalar and vector conditional moves. */
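/* When no blend or mask-move instruction applies, the generic fallback at
   the end computes dest = (cmp & op_true) | (~cmp & op_false), relying on
   the comparison result being all-ones or all-zeros in each element.  */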
3862
3863 void
3864 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
3865 {
3866 machine_mode mode = GET_MODE (dest);
3867 machine_mode cmpmode = GET_MODE (cmp);
3868 rtx x;
3869
3870 /* Simplify trivial VEC_COND_EXPR to avoid ICE in pr97506. */
3871 if (rtx_equal_p (op_true, op_false))
3872 {
3873 emit_move_insn (dest, op_true);
3874 return;
3875 }
3876
3877 /* If we have an integer mask and FP value then we need
3878 to cast mask to FP mode. */
3879 if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
3880 {
3881 cmp = force_reg (cmpmode, cmp);
3882 cmp = gen_rtx_SUBREG (mode, cmp, 0);
3883 }
3884
3885 /* In AVX512F the result of comparison is an integer mask. */
3886 if (mode != cmpmode
3887 && GET_MODE_CLASS (cmpmode) == MODE_INT)
3888 {
3889 gcc_assert (ix86_valid_mask_cmp_mode (mode));
3890 /* Using scalar/vector move with mask register. */
3891 cmp = force_reg (cmpmode, cmp);
3892 /* Optimize for mask zero. */
3893 op_true = (op_true != CONST0_RTX (mode)
3894 ? force_reg (mode, op_true) : op_true);
3895 op_false = (op_false != CONST0_RTX (mode)
3896 ? force_reg (mode, op_false) : op_false);
3897 if (op_true == CONST0_RTX (mode))
3898 {
3899 if (cmpmode == E_DImode && !TARGET_64BIT)
3900 {
3901 x = gen_reg_rtx (cmpmode);
3902 emit_insn (gen_knotdi (x, cmp));
3903 }
3904 else
3905 x = expand_simple_unop (cmpmode, NOT, cmp, NULL, 1);
3906 cmp = x;
3907 /* Reverse op_true op_false. */
3908 std::swap (op_true, op_false);
3909 }
3910
3911 if (mode == HFmode)
3912 emit_insn (gen_movhf_mask (dest, op_true, op_false, cmp));
3913 else
3914 emit_insn (gen_rtx_SET (dest,
3915 gen_rtx_VEC_MERGE (mode,
3916 op_true, op_false, cmp)));
3917 return;
3918 }
3919
3920 if (vector_all_ones_operand (op_true, mode)
3921 && op_false == CONST0_RTX (mode))
3922 {
3923 emit_move_insn (dest, cmp);
3924 return;
3925 }
3926 else if (op_false == CONST0_RTX (mode))
3927 {
3928 x = expand_simple_binop (mode, AND, cmp, op_true,
3929 dest, 1, OPTAB_DIRECT);
3930 if (x != dest)
3931 emit_move_insn (dest, x);
3932 return;
3933 }
3934 else if (op_true == CONST0_RTX (mode))
3935 {
3936 op_false = force_reg (mode, op_false);
3937 x = gen_rtx_NOT (mode, cmp);
3938 ix86_emit_vec_binop (AND, mode, dest, x, op_false);
3939 return;
3940 }
3941 else if (vector_all_ones_operand (op_true, mode))
3942 {
3943 x = expand_simple_binop (mode, IOR, cmp, op_false,
3944 dest, 1, OPTAB_DIRECT);
3945 if (x != dest)
3946 emit_move_insn (dest, x);
3947 return;
3948 }
3949
3950 if (TARGET_XOP)
3951 {
3952 op_true = force_reg (mode, op_true);
3953
3954 if (GET_MODE_SIZE (mode) < 16
3955 || !nonimmediate_operand (op_false, mode))
3956 op_false = force_reg (mode, op_false);
3957
3958 emit_insn (gen_rtx_SET (dest,
3959 gen_rtx_IF_THEN_ELSE (mode, cmp,
3960 op_true, op_false)));
3961 return;
3962 }
3963
3964 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
3965 machine_mode blend_mode = mode;
3966
3967 if (GET_MODE_SIZE (mode) < 16
3968 || !vector_operand (op_true, mode))
3969 op_true = force_reg (mode, op_true);
3970
3971 op_false = force_reg (mode, op_false);
3972
3973 switch (mode)
3974 {
3975 case E_V2SFmode:
3976 if (TARGET_SSE4_1)
3977 gen = gen_mmx_blendvps;
3978 break;
3979 case E_V4SFmode:
3980 if (TARGET_SSE4_1)
3981 gen = gen_sse4_1_blendvps;
3982 break;
3983 case E_V2DFmode:
3984 if (TARGET_SSE4_1)
3985 gen = gen_sse4_1_blendvpd;
3986 break;
3987 case E_SFmode:
3988 if (TARGET_SSE4_1)
3989 gen = gen_sse4_1_blendvss;
3990 break;
3991 case E_DFmode:
3992 if (TARGET_SSE4_1)
3993 gen = gen_sse4_1_blendvsd;
3994 break;
3995 case E_V8QImode:
3996 case E_V4HImode:
3997 case E_V2SImode:
3998 if (TARGET_SSE4_1)
3999 {
4000 gen = gen_mmx_pblendvb_v8qi;
4001 blend_mode = V8QImode;
4002 }
4003 break;
4004 case E_V4QImode:
4005 case E_V2HImode:
4006 if (TARGET_SSE4_1)
4007 {
4008 gen = gen_mmx_pblendvb_v4qi;
4009 blend_mode = V4QImode;
4010 }
4011 break;
4012 case E_V2QImode:
4013 if (TARGET_SSE4_1)
4014 gen = gen_mmx_pblendvb_v2qi;
4015 break;
4016 case E_V16QImode:
4017 case E_V8HImode:
4018 case E_V8HFmode:
4019 case E_V4SImode:
4020 case E_V2DImode:
4021 if (TARGET_SSE4_1)
4022 {
4023 gen = gen_sse4_1_pblendvb;
4024 blend_mode = V16QImode;
4025 }
4026 break;
4027 case E_V8SFmode:
4028 if (TARGET_AVX)
4029 gen = gen_avx_blendvps256;
4030 break;
4031 case E_V4DFmode:
4032 if (TARGET_AVX)
4033 gen = gen_avx_blendvpd256;
4034 break;
4035 case E_V32QImode:
4036 case E_V16HImode:
4037 case E_V16HFmode:
4038 case E_V8SImode:
4039 case E_V4DImode:
4040 if (TARGET_AVX2)
4041 {
4042 gen = gen_avx2_pblendvb;
4043 blend_mode = V32QImode;
4044 }
4045 break;
4046
4047 case E_V64QImode:
4048 gen = gen_avx512bw_blendmv64qi;
4049 break;
4050 case E_V32HImode:
4051 gen = gen_avx512bw_blendmv32hi;
4052 break;
4053 case E_V32HFmode:
4054 gen = gen_avx512bw_blendmv32hf;
4055 break;
4056 case E_V16SImode:
4057 gen = gen_avx512f_blendmv16si;
4058 break;
4059 case E_V8DImode:
4060 gen = gen_avx512f_blendmv8di;
4061 break;
4062 case E_V8DFmode:
4063 gen = gen_avx512f_blendmv8df;
4064 break;
4065 case E_V16SFmode:
4066 gen = gen_avx512f_blendmv16sf;
4067 break;
4068
4069 default:
4070 break;
4071 }
4072
4073 if (gen != NULL)
4074 {
4075 if (blend_mode == mode)
4076 x = dest;
4077 else
4078 {
4079 x = gen_reg_rtx (blend_mode);
4080 op_false = gen_lowpart (blend_mode, op_false);
4081 op_true = gen_lowpart (blend_mode, op_true);
4082 cmp = gen_lowpart (blend_mode, cmp);
4083 }
4084
4085 emit_insn (gen (x, op_false, op_true, cmp));
4086
4087 if (x != dest)
4088 emit_move_insn (dest, gen_lowpart (mode, x));
4089 }
4090 else
4091 {
4092 rtx t2, t3;
4093
4094 t2 = expand_simple_binop (mode, AND, op_true, cmp,
4095 NULL, 1, OPTAB_DIRECT);
4096
4097 t3 = gen_reg_rtx (mode);
4098 x = gen_rtx_NOT (mode, cmp);
4099 ix86_emit_vec_binop (AND, mode, t3, x, op_false);
4100
4101 x = expand_simple_binop (mode, IOR, t3, t2,
4102 dest, 1, OPTAB_DIRECT);
4103 if (x != dest)
4104 emit_move_insn (dest, x);
4105 }
4106 }
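
/* Illustrative sketch (not compiled): when no blend or mask instruction
   applies, the fallback at the end of ix86_expand_sse_movcc implements
   the classic mask-select identity

       dest = (cmp & op_true) | (~cmp & op_false)

   e.g. for V4SFmode this typically assembles to an ANDPS, ANDNPS and
   ORPS sequence, assuming CMP holds per-lane all-ones/all-zeros values
   produced by an earlier CMPPS.  */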
4107
4108 /* Swap, force into registers, or otherwise massage the two operands
4109 to an sse comparison with a mask result. Thus we differ a bit from
4110 ix86_prepare_fp_compare_args which expects to produce a flags result.
4111
4112 The DEST operand exists to help determine whether to commute commutative
4113 operators. The POP0/POP1 operands are updated in place. The new
4114 comparison code is returned, or UNKNOWN if not implementable. */
4115
4116 static enum rtx_code
4117 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
4118 rtx *pop0, rtx *pop1)
4119 {
4120 switch (code)
4121 {
4122 case LTGT:
4123 case UNEQ:
4124 /* AVX supports all the needed comparisons. */
4125 if (TARGET_AVX)
4126 break;
4127 /* We have no LTGT as an operator. We could implement it with
4128 NE & ORDERED, but this requires an extra temporary. It's
4129 not clear that it's worth it. */
4130 return UNKNOWN;
4131
4132 case LT:
4133 case LE:
4134 case UNGT:
4135 case UNGE:
4136 /* These are supported directly. */
4137 break;
4138
4139 case EQ:
4140 case NE:
4141 case UNORDERED:
4142 case ORDERED:
4143 /* AVX has 3-operand comparisons, no need to swap anything. */
4144 if (TARGET_AVX)
4145 break;
4146 /* For commutative operators, try to canonicalize the destination
4147 operand to be first in the comparison - this helps reload to
4148 avoid extra moves. */
4149 if (!dest || !rtx_equal_p (dest, *pop1))
4150 break;
4151 /* FALLTHRU */
4152
4153 case GE:
4154 case GT:
4155 case UNLE:
4156 case UNLT:
4157 /* These are not supported directly before AVX, and furthermore
4158 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
4159 comparison operands to transform into something that is
4160 supported. */
4161 std::swap (*pop0, *pop1);
4162 code = swap_condition (code);
4163 break;
4164
4165 default:
4166 gcc_unreachable ();
4167 }
4168
4169 return code;
4170 }
4171
4172 /* Expand a floating-point conditional move. Return true if successful. */
4173
4174 bool
4175 ix86_expand_fp_movcc (rtx operands[])
4176 {
4177 machine_mode mode = GET_MODE (operands[0]);
4178 enum rtx_code code = GET_CODE (operands[1]);
4179 rtx tmp, compare_op;
4180 rtx op0 = XEXP (operands[1], 0);
4181 rtx op1 = XEXP (operands[1], 1);
4182
4183 if (SSE_FLOAT_MODE_SSEMATH_OR_HF_P (mode))
4184 {
4185 machine_mode cmode;
4186
4187 /* Since we have no cmove for sse registers, don't force bad register
4188 allocation just to gain access to it. Deny movcc when the
4189 comparison mode doesn't match the move mode. */
4190 cmode = GET_MODE (op0);
4191 if (cmode == VOIDmode)
4192 cmode = GET_MODE (op1);
4193 if (cmode != mode)
4194 return false;
4195
4196 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
4197 if (code == UNKNOWN)
4198 return false;
4199
4200 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
4201 operands[2], operands[3]))
4202 return true;
4203
4204 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
4205 operands[2], operands[3]);
4206 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
4207 return true;
4208 }
4209
4210 if (GET_MODE (op0) == TImode
4211 || (GET_MODE (op0) == DImode
4212 && !TARGET_64BIT))
4213 return false;
4214
4215 /* The floating point conditional move instructions don't directly
4216 support conditions resulting from a signed integer comparison. */
4217
4218 compare_op = ix86_expand_compare (code, op0, op1);
4219 if (!fcmov_comparison_operator (compare_op, VOIDmode))
4220 {
4221 tmp = gen_reg_rtx (QImode);
4222 ix86_expand_setcc (tmp, code, op0, op1);
4223
4224 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
4225 }
4226
4227 emit_insn (gen_rtx_SET (operands[0],
4228 gen_rtx_IF_THEN_ELSE (mode, compare_op,
4229 operands[2], operands[3])));
4230
4231 return true;
4232 }
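
/* Illustrative sketch (not compiled): with SSE math a scalar select
   such as

       double r = a < b ? c : d;

   takes the SSE path above, i.e. ix86_expand_sse_cmp materializes an
   all-ones/all-zeros LT mask (e.g. a cmpltsd) and ix86_expand_sse_movcc
   merges C and D with it, while the x87 path later in the function
   instead emits an IF_THEN_ELSE on the flags result (fcmov).  */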
4233
4234 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
4235
4236 static int
4237 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
4238 {
4239 switch (code)
4240 {
4241 case EQ:
4242 return 0;
4243 case LT:
4244 case LTU:
4245 return 1;
4246 case LE:
4247 case LEU:
4248 return 2;
4249 case NE:
4250 return 4;
4251 case GE:
4252 case GEU:
4253 return 5;
4254 case GT:
4255 case GTU:
4256 return 6;
4257 default:
4258 gcc_unreachable ();
4259 }
4260 }
4261
4262 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
4263
4264 static int
4265 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
4266 {
4267 switch (code)
4268 {
4269 case EQ:
4270 return 0x00;
4271 case NE:
4272 return 0x04;
4273 case GT:
4274 return 0x0e;
4275 case LE:
4276 return 0x02;
4277 case GE:
4278 return 0x0d;
4279 case LT:
4280 return 0x01;
4281 case UNLE:
4282 return 0x0a;
4283 case UNLT:
4284 return 0x09;
4285 case UNGE:
4286 return 0x05;
4287 case UNGT:
4288 return 0x06;
4289 case UNEQ:
4290 return 0x18;
4291 case LTGT:
4292 return 0x0c;
4293 case ORDERED:
4294 return 0x07;
4295 case UNORDERED:
4296 return 0x03;
4297 default:
4298 gcc_unreachable ();
4299 }
4300 }
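
/* Illustrative note (assuming the usual VCMPPS/VCMPPD predicate
   encoding): the values above match the AVX comparison predicates,
   e.g. 0x01 is LT_OS and 0x0e is GT_OS, so an RTL comparison such as
   (gt:V16SF x y) is eventually emitted as a vcmpps with immediate 0x0e
   by the mask-compare expander below.  */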
4301
4302 /* Return immediate value to be used in UNSPEC_PCMP
4303 for comparison CODE in MODE. */
4304
4305 static int
4306 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
4307 {
4308 if (FLOAT_MODE_P (mode))
4309 return ix86_fp_cmp_code_to_pcmp_immediate (code);
4310 return ix86_int_cmp_code_to_pcmp_immediate (code);
4311 }
4312
4313 /* Expand AVX-512 vector comparison. */
4314
4315 bool
4316 ix86_expand_mask_vec_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1)
4317 {
4318 machine_mode mask_mode = GET_MODE (dest);
4319 machine_mode cmp_mode = GET_MODE (cmp_op0);
4320 rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
4321 int unspec_code;
4322 rtx unspec;
4323
4324 switch (code)
4325 {
4326 case LEU:
4327 case GTU:
4328 case GEU:
4329 case LTU:
4330 unspec_code = UNSPEC_UNSIGNED_PCMP;
4331 break;
4332
4333 default:
4334 unspec_code = UNSPEC_PCMP;
4335 }
4336
4337 unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, cmp_op0, cmp_op1, imm),
4338 unspec_code);
4339 emit_insn (gen_rtx_SET (dest, unspec));
4340
4341 return true;
4342 }
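
/* Illustrative sketch (not compiled): for a V16SImode GTU comparison
   the function above emits roughly

       (set (reg:HI k1)
            (unspec:HI [(reg:V16SI x) (reg:V16SI y) (const_int 6)]
                       UNSPEC_UNSIGNED_PCMP))

   i.e. a single vpcmpud with predicate 6 writing a 16-bit mask
   register, which ix86_expand_sse_movcc then consumes via VEC_MERGE.  */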
4343
4344 /* Expand fp vector comparison. */
4345
4346 bool
4347 ix86_expand_fp_vec_cmp (rtx operands[])
4348 {
4349 enum rtx_code code = GET_CODE (operands[1]);
4350 rtx cmp;
4351
4352 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
4353 &operands[2], &operands[3]);
4354 if (code == UNKNOWN)
4355 {
4356 rtx temp;
4357 switch (GET_CODE (operands[1]))
4358 {
4359 case LTGT:
4360 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
4361 operands[3], NULL, NULL);
4362 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
4363 operands[3], NULL, NULL);
4364 code = AND;
4365 break;
4366 case UNEQ:
4367 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
4368 operands[3], NULL, NULL);
4369 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
4370 operands[3], NULL, NULL);
4371 code = IOR;
4372 break;
4373 default:
4374 gcc_unreachable ();
4375 }
4376 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
4377 OPTAB_DIRECT);
4378 }
4379 else
4380 cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
4381 NULL, NULL);
4382
4383 if (operands[0] != cmp)
4384 emit_move_insn (operands[0], cmp);
4385
4386 return true;
4387 }
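
/* Illustrative note: the UNKNOWN fallback above relies on the
   identities

       a LTGT b  ==  ORDERED (a, b)   &&  a != b
       a UNEQ b  ==  UNORDERED (a, b) ||  a == b

   so pre-AVX targets, which lack direct LTGT/UNEQ predicates, build
   the result from two cmpps masks combined with an AND resp. IOR.  */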
4388
4389 static rtx
4390 ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
4391 rtx op_true, rtx op_false, bool *negate)
4392 {
4393 machine_mode data_mode = GET_MODE (dest);
4394 machine_mode mode = GET_MODE (cop0);
4395 rtx x;
4396
4397 *negate = false;
4398
4399 /* XOP supports all of the comparisons on all 128-bit vector int types. */
4400 if (TARGET_XOP
4401 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT
4402 && GET_MODE_SIZE (mode) <= 16)
4403 ;
4404 /* AVX512F supports all of the comparisons
4405 on all 128/256/512-bit vector int types. */
4406 else if (ix86_use_mask_cmp_p (data_mode, mode, op_true, op_false))
4407 ;
4408 else
4409 {
4410 /* Canonicalize the comparison to EQ, GT, GTU. */
4411 switch (code)
4412 {
4413 case EQ:
4414 case GT:
4415 case GTU:
4416 break;
4417
4418 case NE:
4419 case LE:
4420 case LEU:
4421 code = reverse_condition (code);
4422 *negate = true;
4423 break;
4424
4425 case GE:
4426 case GEU:
4427 code = reverse_condition (code);
4428 *negate = true;
4429 /* FALLTHRU */
4430
4431 case LT:
4432 case LTU:
4433 std::swap (cop0, cop1);
4434 code = swap_condition (code);
4435 break;
4436
4437 default:
4438 gcc_unreachable ();
4439 }
4440
4441 /* Only SSE4.1/SSE4.2 supports V2DImode. */
4442 if (mode == V2DImode)
4443 {
4444 switch (code)
4445 {
4446 case EQ:
4447 /* SSE4.1 supports EQ. */
4448 if (!TARGET_SSE4_1)
4449 return NULL;
4450 break;
4451
4452 case GT:
4453 case GTU:
4454 /* SSE4.2 supports GT/GTU. */
4455 if (!TARGET_SSE4_2)
4456 return NULL;
4457 break;
4458
4459 default:
4460 gcc_unreachable ();
4461 }
4462 }
4463
4464 rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode);
4465 rtx opfalse = op_false ? op_false : CONST0_RTX (data_mode);
4466 if (*negate)
4467 std::swap (optrue, opfalse);
4468
4469 /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when
4470 not using integer masks into min (x, y) == x ? -1 : 0 (i.e.
4471 min (x, y) == x). While we add one instruction (the minimum),
4472 we remove the need for two instructions in the negation, since the
4473 result is already in the desired form.
4474 When using masks, do it for SI/DImode element types, as it is shorter
4475 than the two subtractions. */
4476 if ((code != EQ
4477 && GET_MODE_SIZE (mode) != 64
4478 && vector_all_ones_operand (opfalse, data_mode)
4479 && optrue == CONST0_RTX (data_mode))
4480 || (code == GTU
4481 && GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4
4482 /* Don't do it, though, if we are not using integer masks and we
4483 would end up with the right values in the registers anyway. */
4484 && (GET_MODE_SIZE (mode) == 64
4485 || !vector_all_ones_operand (optrue, data_mode)
4486 || opfalse != CONST0_RTX (data_mode))))
4487 {
4488 rtx (*gen) (rtx, rtx, rtx) = NULL;
4489
4490 switch (mode)
4491 {
4492 case E_V16SImode:
4493 gen = (code == GTU) ? gen_uminv16si3 : gen_sminv16si3;
4494 break;
4495 case E_V8DImode:
4496 gen = (code == GTU) ? gen_uminv8di3 : gen_sminv8di3;
4497 cop0 = force_reg (mode, cop0);
4498 cop1 = force_reg (mode, cop1);
4499 break;
4500 case E_V32QImode:
4501 if (TARGET_AVX2)
4502 gen = (code == GTU) ? gen_uminv32qi3 : gen_sminv32qi3;
4503 break;
4504 case E_V16HImode:
4505 if (TARGET_AVX2)
4506 gen = (code == GTU) ? gen_uminv16hi3 : gen_sminv16hi3;
4507 break;
4508 case E_V8SImode:
4509 if (TARGET_AVX2)
4510 gen = (code == GTU) ? gen_uminv8si3 : gen_sminv8si3;
4511 break;
4512 case E_V4DImode:
4513 if (TARGET_AVX512VL)
4514 {
4515 gen = (code == GTU) ? gen_uminv4di3 : gen_sminv4di3;
4516 cop0 = force_reg (mode, cop0);
4517 cop1 = force_reg (mode, cop1);
4518 }
4519 break;
4520 case E_V16QImode:
4521 if (code == GTU && TARGET_SSE2)
4522 gen = gen_uminv16qi3;
4523 else if (code == GT && TARGET_SSE4_1)
4524 gen = gen_sminv16qi3;
4525 break;
4526 case E_V8QImode:
4527 if (code == GTU && TARGET_SSE2)
4528 gen = gen_uminv8qi3;
4529 else if (code == GT && TARGET_SSE4_1)
4530 gen = gen_sminv8qi3;
4531 break;
4532 case E_V4QImode:
4533 if (code == GTU && TARGET_SSE2)
4534 gen = gen_uminv4qi3;
4535 else if (code == GT && TARGET_SSE4_1)
4536 gen = gen_sminv4qi3;
4537 break;
4538 case E_V2QImode:
4539 if (code == GTU && TARGET_SSE2)
4540 gen = gen_uminv2qi3;
4541 else if (code == GT && TARGET_SSE4_1)
4542 gen = gen_sminv2qi3;
4543 break;
4544 case E_V8HImode:
4545 if (code == GTU && TARGET_SSE4_1)
4546 gen = gen_uminv8hi3;
4547 else if (code == GT && TARGET_SSE2)
4548 gen = gen_sminv8hi3;
4549 break;
4550 case E_V4HImode:
4551 if (code == GTU && TARGET_SSE4_1)
4552 gen = gen_uminv4hi3;
4553 else if (code == GT && TARGET_SSE2)
4554 gen = gen_sminv4hi3;
4555 break;
4556 case E_V2HImode:
4557 if (code == GTU && TARGET_SSE4_1)
4558 gen = gen_uminv2hi3;
4559 else if (code == GT && TARGET_SSE2)
4560 gen = gen_sminv2hi3;
4561 break;
4562 case E_V4SImode:
4563 if (TARGET_SSE4_1)
4564 gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3;
4565 break;
4566 case E_V2SImode:
4567 if (TARGET_SSE4_1)
4568 gen = (code == GTU) ? gen_uminv2si3 : gen_sminv2si3;
4569 break;
4570 case E_V2DImode:
4571 if (TARGET_AVX512VL)
4572 {
4573 gen = (code == GTU) ? gen_uminv2di3 : gen_sminv2di3;
4574 cop0 = force_reg (mode, cop0);
4575 cop1 = force_reg (mode, cop1);
4576 }
4577 break;
4578 default:
4579 break;
4580 }
4581
4582 if (gen)
4583 {
4584 rtx tem = gen_reg_rtx (mode);
4585 if (!vector_operand (cop0, mode))
4586 cop0 = force_reg (mode, cop0);
4587 if (!vector_operand (cop1, mode))
4588 cop1 = force_reg (mode, cop1);
4589 *negate = !*negate;
4590 emit_insn (gen (tem, cop0, cop1));
4591 cop1 = tem;
4592 code = EQ;
4593 }
4594 }
4595
4596 /* Unsigned parallel compare is not supported by the hardware.
4597 Play some tricks to turn this into a signed comparison
4598 instead. */
4599 if (code == GTU)
4600 {
4601 cop0 = force_reg (mode, cop0);
4602
4603 switch (mode)
4604 {
4605 case E_V16SImode:
4606 case E_V8DImode:
4607 case E_V8SImode:
4608 case E_V4DImode:
4609 case E_V4SImode:
4610 case E_V2SImode:
4611 case E_V2DImode:
4612 {
4613 rtx t1, t2, mask;
4614
4615 /* Subtract (-(INT MAX) - 1) from both operands to make
4616 them signed. */
4617 mask = ix86_build_signbit_mask (mode, true, false);
4618 t1 = gen_reg_rtx (mode);
4619 emit_insn (gen_sub3_insn (t1, cop0, mask));
4620
4621 t2 = gen_reg_rtx (mode);
4622 emit_insn (gen_sub3_insn (t2, cop1, mask));
4623
4624 cop0 = t1;
4625 cop1 = t2;
4626 code = GT;
4627 }
4628 break;
4629
4630 case E_V64QImode:
4631 case E_V32HImode:
4632 case E_V32QImode:
4633 case E_V16HImode:
4634 case E_V16QImode:
4635 case E_V8QImode:
4636 case E_V4QImode:
4637 case E_V2QImode:
4638 case E_V8HImode:
4639 case E_V4HImode:
4640 case E_V2HImode:
4641 /* Perform a parallel unsigned saturating subtraction. */
4642 x = gen_reg_rtx (mode);
4643 emit_insn (gen_rtx_SET
4644 (x, gen_rtx_US_MINUS (mode, cop0, cop1)));
4645 cop0 = x;
4646 cop1 = CONST0_RTX (mode);
4647 code = EQ;
4648 *negate = !*negate;
4649 break;
4650
4651 default:
4652 gcc_unreachable ();
4653 }
4654 }
4655 }
4656
4657 if (*negate)
4658 std::swap (op_true, op_false);
4659
4660 /* Allow the comparison to be done in one mode, but the movcc to
4661 happen in another mode. */
4662 if (data_mode == mode)
4663 {
4664 x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
4665 op_true, op_false);
4666 }
4667 else
4668 {
4669 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
4670 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
4671 op_true, op_false);
4672 if (GET_MODE (x) == mode)
4673 x = gen_lowpart (data_mode, x);
4674 }
4675
4676 return x;
4677 }
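
/* Illustrative sketch (not compiled): the GTU handling above uses two
   standard tricks.  For byte/word elements,

       x >u y   <==>   (x -us y) != 0        (saturating subtract)

   and for dword/qword elements,

       x >u y   <==>   (x - 0x80...0) >s (y - 0x80...0)

   i.e. subtracting the sign-bit mask from both operands turns the
   unsigned compare into the signed pcmpgt the hardware provides.  */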
4678
4679 /* Expand integer vector comparison. */
4680
4681 bool
4682 ix86_expand_int_vec_cmp (rtx operands[])
4683 {
4684 rtx_code code = GET_CODE (operands[1]);
4685 bool negate = false;
4686 rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
4687 operands[3], NULL, NULL, &negate);
4688
4689 if (!cmp)
4690 return false;
4691
4692 if (negate)
4693 cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
4694 CONST0_RTX (GET_MODE (cmp)),
4695 NULL, NULL, &negate);
4696
4697 gcc_assert (!negate);
4698
4699 if (operands[0] != cmp)
4700 emit_move_insn (operands[0], cmp);
4701
4702 return true;
4703 }
4704
4705 /* Expand a floating-point vector conditional move; a vcond operation
4706 rather than a movcc operation. */
4707
4708 bool
4709 ix86_expand_fp_vcond (rtx operands[])
4710 {
4711 enum rtx_code code = GET_CODE (operands[3]);
4712 rtx cmp;
4713
4714 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
4715 &operands[4], &operands[5]);
4716 if (code == UNKNOWN)
4717 {
4718 rtx temp;
4719 switch (GET_CODE (operands[3]))
4720 {
4721 case LTGT:
4722 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
4723 operands[5], operands[0], operands[0]);
4724 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
4725 operands[5], operands[1], operands[2]);
4726 code = AND;
4727 break;
4728 case UNEQ:
4729 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
4730 operands[5], operands[0], operands[0]);
4731 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
4732 operands[5], operands[1], operands[2]);
4733 code = IOR;
4734 break;
4735 default:
4736 gcc_unreachable ();
4737 }
4738 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
4739 OPTAB_DIRECT);
4740 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
4741 return true;
4742 }
4743
4744 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
4745 operands[5], operands[1], operands[2]))
4746 return true;
4747
4748 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
4749 operands[1], operands[2]);
4750 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
4751 return true;
4752 }
4753
4754 /* Expand a signed/unsigned integral vector conditional move. */
4755
4756 bool
4757 ix86_expand_int_vcond (rtx operands[])
4758 {
4759 machine_mode data_mode = GET_MODE (operands[0]);
4760 machine_mode mode = GET_MODE (operands[4]);
4761 enum rtx_code code = GET_CODE (operands[3]);
4762 bool negate = false;
4763 rtx x, cop0, cop1;
4764
4765 cop0 = operands[4];
4766 cop1 = operands[5];
4767
4768 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
4769 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
4770 if ((code == LT || code == GE)
4771 && data_mode == mode
4772 && cop1 == CONST0_RTX (mode)
4773 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
4774 && GET_MODE_UNIT_SIZE (data_mode) > 1
4775 && GET_MODE_UNIT_SIZE (data_mode) <= 8
4776 && (GET_MODE_SIZE (data_mode) == 16
4777 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
4778 {
4779 rtx negop = operands[2 - (code == LT)];
4780 int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
4781 if (negop == CONST1_RTX (data_mode))
4782 {
4783 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
4784 operands[0], 1, OPTAB_DIRECT);
4785 if (res != operands[0])
4786 emit_move_insn (operands[0], res);
4787 return true;
4788 }
4789 else if (GET_MODE_INNER (data_mode) != DImode
4790 && vector_all_ones_operand (negop, data_mode))
4791 {
4792 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
4793 operands[0], 0, OPTAB_DIRECT);
4794 if (res != operands[0])
4795 emit_move_insn (operands[0], res);
4796 return true;
4797 }
4798 }
4799
4800 if (!nonimmediate_operand (cop1, mode))
4801 cop1 = force_reg (mode, cop1);
4802 if (!general_operand (operands[1], data_mode))
4803 operands[1] = force_reg (data_mode, operands[1]);
4804 if (!general_operand (operands[2], data_mode))
4805 operands[2] = force_reg (data_mode, operands[2]);
4806
4807 x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
4808 operands[1], operands[2], &negate);
4809
4810 if (!x)
4811 return false;
4812
4813 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
4814 operands[2-negate]);
4815 return true;
4816 }
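
/* Illustrative note: the shortcut at the top of ix86_expand_int_vcond
   turns

       x < 0 ? -1 : 0   into an arithmetic right shift by (bits - 1), e.g. psrad $31
       x < 0 ?  1 : 0   into a logical right shift by (bits - 1),     e.g. psrld $31

   avoiding the vector compare entirely, since the sign bit already
   carries the answer.  */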
4817
4818 static bool
4819 ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
4820 struct expand_vec_perm_d *d)
4821 {
4822 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
4823 expander, so args are either in d, or in op0, op1 etc. */
4824 machine_mode mode = GET_MODE (d ? d->op0 : op0);
4825 machine_mode maskmode = mode;
4826 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
4827
4828 switch (mode)
4829 {
4830 case E_V16QImode:
4831 if (TARGET_AVX512VL && TARGET_AVX512VBMI)
4832 gen = gen_avx512vl_vpermt2varv16qi3;
4833 break;
4834 case E_V32QImode:
4835 if (TARGET_AVX512VL && TARGET_AVX512VBMI)
4836 gen = gen_avx512vl_vpermt2varv32qi3;
4837 break;
4838 case E_V64QImode:
4839 if (TARGET_AVX512VBMI)
4840 gen = gen_avx512bw_vpermt2varv64qi3;
4841 break;
4842 case E_V8HImode:
4843 if (TARGET_AVX512VL && TARGET_AVX512BW)
4844 gen = gen_avx512vl_vpermt2varv8hi3;
4845 break;
4846 case E_V16HImode:
4847 if (TARGET_AVX512VL && TARGET_AVX512BW)
4848 gen = gen_avx512vl_vpermt2varv16hi3;
4849 break;
4850 case E_V32HImode:
4851 if (TARGET_AVX512BW)
4852 gen = gen_avx512bw_vpermt2varv32hi3;
4853 break;
4854 case E_V4SImode:
4855 if (TARGET_AVX512VL)
4856 gen = gen_avx512vl_vpermt2varv4si3;
4857 break;
4858 case E_V8SImode:
4859 if (TARGET_AVX512VL)
4860 gen = gen_avx512vl_vpermt2varv8si3;
4861 break;
4862 case E_V16SImode:
4863 if (TARGET_AVX512F)
4864 gen = gen_avx512f_vpermt2varv16si3;
4865 break;
4866 case E_V4SFmode:
4867 if (TARGET_AVX512VL)
4868 {
4869 gen = gen_avx512vl_vpermt2varv4sf3;
4870 maskmode = V4SImode;
4871 }
4872 break;
4873 case E_V8SFmode:
4874 if (TARGET_AVX512VL)
4875 {
4876 gen = gen_avx512vl_vpermt2varv8sf3;
4877 maskmode = V8SImode;
4878 }
4879 break;
4880 case E_V16SFmode:
4881 if (TARGET_AVX512F)
4882 {
4883 gen = gen_avx512f_vpermt2varv16sf3;
4884 maskmode = V16SImode;
4885 }
4886 break;
4887 case E_V2DImode:
4888 if (TARGET_AVX512VL)
4889 gen = gen_avx512vl_vpermt2varv2di3;
4890 break;
4891 case E_V4DImode:
4892 if (TARGET_AVX512VL)
4893 gen = gen_avx512vl_vpermt2varv4di3;
4894 break;
4895 case E_V8DImode:
4896 if (TARGET_AVX512F)
4897 gen = gen_avx512f_vpermt2varv8di3;
4898 break;
4899 case E_V2DFmode:
4900 if (TARGET_AVX512VL)
4901 {
4902 gen = gen_avx512vl_vpermt2varv2df3;
4903 maskmode = V2DImode;
4904 }
4905 break;
4906 case E_V4DFmode:
4907 if (TARGET_AVX512VL)
4908 {
4909 gen = gen_avx512vl_vpermt2varv4df3;
4910 maskmode = V4DImode;
4911 }
4912 break;
4913 case E_V8DFmode:
4914 if (TARGET_AVX512F)
4915 {
4916 gen = gen_avx512f_vpermt2varv8df3;
4917 maskmode = V8DImode;
4918 }
4919 break;
4920 default:
4921 break;
4922 }
4923
4924 if (gen == NULL)
4925 return false;
4926
4927 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
4928 expander, so args are either in d, or in op0, op1 etc. */
4929 if (d)
4930 {
4931 rtx vec[64];
4932 target = d->target;
4933 op0 = d->op0;
4934 op1 = d->op1;
4935 for (int i = 0; i < d->nelt; ++i)
4936 vec[i] = GEN_INT (d->perm[i]);
4937 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
4938 }
4939
4940 emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
4941 return true;
4942 }
4943
4944 /* Expand a variable vector permutation. */
4945
4946 void
4947 ix86_expand_vec_perm (rtx operands[])
4948 {
4949 rtx target = operands[0];
4950 rtx op0 = operands[1];
4951 rtx op1 = operands[2];
4952 rtx mask = operands[3];
4953 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
4954 machine_mode mode = GET_MODE (op0);
4955 machine_mode maskmode = GET_MODE (mask);
4956 int w, e, i;
4957 bool one_operand_shuffle = rtx_equal_p (op0, op1);
4958
4959 /* Number of elements in the vector. */
4960 w = GET_MODE_NUNITS (mode);
4961 e = GET_MODE_UNIT_SIZE (mode);
4962 gcc_assert (w <= 64);
4963
4964 /* For HF mode vector, convert it to HI using subreg. */
4965 if (GET_MODE_INNER (mode) == HFmode)
4966 {
4967 machine_mode orig_mode = mode;
4968 mode = mode_for_vector (HImode, w).require ();
4969 target = lowpart_subreg (mode, target, orig_mode);
4970 op0 = lowpart_subreg (mode, op0, orig_mode);
4971 op1 = lowpart_subreg (mode, op1, orig_mode);
4972 }
4973
4974 if (TARGET_AVX512F && one_operand_shuffle)
4975 {
4976 rtx (*gen) (rtx, rtx, rtx) = NULL;
4977 switch (mode)
4978 {
4979 case E_V16SImode:
4980 gen = gen_avx512f_permvarv16si;
4981 break;
4982 case E_V16SFmode:
4983 gen = gen_avx512f_permvarv16sf;
4984 break;
4985 case E_V8DImode:
4986 gen = gen_avx512f_permvarv8di;
4987 break;
4988 case E_V8DFmode:
4989 gen = gen_avx512f_permvarv8df;
4990 break;
4991 default:
4992 break;
4993 }
4994 if (gen != NULL)
4995 {
4996 emit_insn (gen (target, op0, mask));
4997 return;
4998 }
4999 }
5000
5001 if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
5002 return;
5003
5004 if (TARGET_AVX2)
5005 {
5006 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
5007 {
5008 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
5009 a constant shuffle operand. With a tiny bit of effort we can
5010 use VPERMD instead. A re-interpretation stall for V4DFmode is
5011 unfortunate but there's no avoiding it.
5012 Similarly for V16HImode we don't have instructions for variable
5013 shuffling, while for V32QImode we can, after preparing suitable
5014 masks, use vpshufb; vpshufb; vpermq; vpor. */
5015
5016 if (mode == V16HImode)
5017 {
5018 maskmode = mode = V32QImode;
5019 w = 32;
5020 e = 1;
5021 }
5022 else
5023 {
5024 maskmode = mode = V8SImode;
5025 w = 8;
5026 e = 4;
5027 }
5028 t1 = gen_reg_rtx (maskmode);
5029
5030 /* Replicate the low bits of the V4DImode mask into V8SImode:
5031 mask = { A B C D }
5032 t1 = { A A B B C C D D }. */
5033 for (i = 0; i < w / 2; ++i)
5034 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
5035 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
5036 vt = force_reg (maskmode, vt);
5037 mask = gen_lowpart (maskmode, mask);
5038 if (maskmode == V8SImode)
5039 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
5040 else
5041 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
5042
5043 /* Multiply the shuffle indices by two. */
5044 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
5045 OPTAB_DIRECT);
5046
5047 /* Add one to the odd shuffle indices:
5048 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
5049 for (i = 0; i < w / 2; ++i)
5050 {
5051 vec[i * 2] = const0_rtx;
5052 vec[i * 2 + 1] = const1_rtx;
5053 }
5054 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
5055 vt = validize_mem (force_const_mem (maskmode, vt));
5056 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
5057 OPTAB_DIRECT);
5058
5059 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
5060 operands[3] = mask = t1;
5061 target = gen_reg_rtx (mode);
5062 op0 = gen_lowpart (mode, op0);
5063 op1 = gen_lowpart (mode, op1);
5064 }
5065
5066 switch (mode)
5067 {
5068 case E_V8SImode:
5069 /* The VPERMD and VPERMPS instructions already properly ignore
5070 the high bits of the shuffle elements. No need for us to
5071 perform an AND ourselves. */
5072 if (one_operand_shuffle)
5073 {
5074 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
5075 if (target != operands[0])
5076 emit_move_insn (operands[0],
5077 gen_lowpart (GET_MODE (operands[0]), target));
5078 }
5079 else
5080 {
5081 t1 = gen_reg_rtx (V8SImode);
5082 t2 = gen_reg_rtx (V8SImode);
5083 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
5084 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
5085 goto merge_two;
5086 }
5087 return;
5088
5089 case E_V8SFmode:
5090 mask = gen_lowpart (V8SImode, mask);
5091 if (one_operand_shuffle)
5092 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
5093 else
5094 {
5095 t1 = gen_reg_rtx (V8SFmode);
5096 t2 = gen_reg_rtx (V8SFmode);
5097 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
5098 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
5099 goto merge_two;
5100 }
5101 return;
5102
5103 case E_V4SImode:
5104 /* By combining the two 128-bit input vectors into one 256-bit
5105 input vector, we can use VPERMD and VPERMPS for the full
5106 two-operand shuffle. */
5107 t1 = gen_reg_rtx (V8SImode);
5108 t2 = gen_reg_rtx (V8SImode);
5109 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
5110 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
5111 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
5112 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
5113 return;
5114
5115 case E_V4SFmode:
5116 t1 = gen_reg_rtx (V8SFmode);
5117 t2 = gen_reg_rtx (V8SImode);
5118 mask = gen_lowpart (V4SImode, mask);
5119 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
5120 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
5121 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
5122 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
5123 return;
5124
5125 case E_V32QImode:
5126 t1 = gen_reg_rtx (V32QImode);
5127 t2 = gen_reg_rtx (V32QImode);
5128 t3 = gen_reg_rtx (V32QImode);
5129 vt2 = GEN_INT (-128);
5130 vt = gen_const_vec_duplicate (V32QImode, vt2);
5131 vt = force_reg (V32QImode, vt);
5132 for (i = 0; i < 32; i++)
5133 vec[i] = i < 16 ? vt2 : const0_rtx;
5134 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
5135 vt2 = force_reg (V32QImode, vt2);
5136 /* From mask create two adjusted masks, which contain the same
5137 bits as mask in the low 7 bits of each vector element.
5138 The first mask will have the most significant bit clear
5139 if it requests element from the same 128-bit lane
5140 and MSB set if it requests element from the other 128-bit lane.
5141 The second mask will have the opposite values of the MSB,
5142 and additionally will have its 128-bit lanes swapped.
5143 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
5144 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
5145 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
5146 stands for other 12 bytes. */
5147 /* The bit that tells whether an element is from the same lane or
5148 the other lane is bit 4, so shift it up by 3 to the MSB position. */
5149 t5 = gen_reg_rtx (V4DImode);
5150 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
5151 GEN_INT (3)));
5152 /* Clear MSB bits from the mask just in case it had them set. */
5153 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
5154 /* After this t1 will have MSB set for elements from other lane. */
5155 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
5156 /* Clear bits other than MSB. */
5157 emit_insn (gen_andv32qi3 (t1, t1, vt));
5158 /* Or in the lower bits from mask into t3. */
5159 emit_insn (gen_iorv32qi3 (t3, t1, t2));
5160 /* And invert MSB bits in t1, so MSB is set for elements from the same
5161 lane. */
5162 emit_insn (gen_xorv32qi3 (t1, t1, vt));
5163 /* Swap 128-bit lanes in t3. */
5164 t6 = gen_reg_rtx (V4DImode);
5165 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
5166 const2_rtx, GEN_INT (3),
5167 const0_rtx, const1_rtx));
5168 /* And or in the lower bits from mask into t1. */
5169 emit_insn (gen_iorv32qi3 (t1, t1, t2));
5170 if (one_operand_shuffle)
5171 {
5172 /* Each of these shuffles will put 0s in places where
5173 element from the other 128-bit lane is needed, otherwise
5174 will shuffle in the requested value. */
5175 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
5176 gen_lowpart (V32QImode, t6)));
5177 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
5178 /* For t3 the 128-bit lanes are swapped again. */
5179 t7 = gen_reg_rtx (V4DImode);
5180 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
5181 const2_rtx, GEN_INT (3),
5182 const0_rtx, const1_rtx));
5183 /* And oring both together leads to the result. */
5184 emit_insn (gen_iorv32qi3 (target, t1,
5185 gen_lowpart (V32QImode, t7)));
5186 if (target != operands[0])
5187 emit_move_insn (operands[0],
5188 gen_lowpart (GET_MODE (operands[0]), target));
5189 return;
5190 }
5191
5192 t4 = gen_reg_rtx (V32QImode);
5193 /* Similar to the one_operand_shuffle code above, just
5194 repeated twice, once for each operand. The merge_two:
5195 code will merge the two results together. */
5196 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
5197 gen_lowpart (V32QImode, t6)));
5198 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
5199 gen_lowpart (V32QImode, t6)));
5200 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
5201 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
5202 t7 = gen_reg_rtx (V4DImode);
5203 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
5204 const2_rtx, GEN_INT (3),
5205 const0_rtx, const1_rtx));
5206 t8 = gen_reg_rtx (V4DImode);
5207 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
5208 const2_rtx, GEN_INT (3),
5209 const0_rtx, const1_rtx));
5210 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
5211 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
5212 t1 = t4;
5213 t2 = t3;
5214 goto merge_two;
5215
5216 default:
5217 gcc_assert (GET_MODE_SIZE (mode) <= 16);
5218 break;
5219 }
5220 }
5221
5222 if (TARGET_XOP)
5223 {
5224 /* The XOP VPPERM insn supports three inputs. By ignoring the
5225 one_operand_shuffle special case, we avoid creating another
5226 set of constant vectors in memory. */
5227 one_operand_shuffle = false;
5228
5229 /* mask = mask & {2*w-1, ...} */
5230 vt = GEN_INT (2*w - 1);
5231 }
5232 else
5233 {
5234 /* mask = mask & {w-1, ...} */
5235 vt = GEN_INT (w - 1);
5236 }
5237
5238 vt = gen_const_vec_duplicate (maskmode, vt);
5239 mask = expand_simple_binop (maskmode, AND, mask, vt,
5240 NULL_RTX, 0, OPTAB_DIRECT);
5241
5242 /* For non-QImode operations, convert the word permutation control
5243 into a byte permutation control. */
5244 if (mode != V16QImode)
5245 {
5246 mask = expand_simple_binop (maskmode, ASHIFT, mask,
5247 GEN_INT (exact_log2 (e)),
5248 NULL_RTX, 0, OPTAB_DIRECT);
5249
5250 /* Convert mask to vector of chars. */
5251 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
5252
5253 /* Replicate each of the input bytes into byte positions:
5254 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
5255 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
5256 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
5257 for (i = 0; i < 16; ++i)
5258 vec[i] = GEN_INT (i/e * e);
5259 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
5260 vt = validize_mem (force_const_mem (V16QImode, vt));
5261 if (TARGET_XOP)
5262 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
5263 else
5264 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
5265
5266 /* Convert it into the byte positions by doing
5267 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
5268 for (i = 0; i < 16; ++i)
5269 vec[i] = GEN_INT (i % e);
5270 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
5271 vt = validize_mem (force_const_mem (V16QImode, vt));
5272 emit_insn (gen_addv16qi3 (mask, mask, vt));
5273 }
5274
5275 /* The actual shuffle operations all operate on V16QImode. */
5276 op0 = gen_lowpart (V16QImode, op0);
5277 op1 = gen_lowpart (V16QImode, op1);
5278
5279 if (TARGET_XOP)
5280 {
5281 if (GET_MODE (target) != V16QImode)
5282 target = gen_reg_rtx (V16QImode);
5283 emit_insn (gen_xop_pperm (target, op0, op1, mask));
5284 if (target != operands[0])
5285 emit_move_insn (operands[0],
5286 gen_lowpart (GET_MODE (operands[0]), target));
5287 }
5288 else if (one_operand_shuffle)
5289 {
5290 if (GET_MODE (target) != V16QImode)
5291 target = gen_reg_rtx (V16QImode);
5292 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
5293 if (target != operands[0])
5294 emit_move_insn (operands[0],
5295 gen_lowpart (GET_MODE (operands[0]), target));
5296 }
5297 else
5298 {
5299 rtx xops[6];
5300 bool ok;
5301
5302 /* Shuffle the two input vectors independently. */
5303 t1 = gen_reg_rtx (V16QImode);
5304 t2 = gen_reg_rtx (V16QImode);
5305 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
5306 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
5307
5308 merge_two:
5309 /* Then merge them together. The key is whether any given control
5310 element contained a bit set that indicates the second word. */
5311 mask = operands[3];
5312 vt = GEN_INT (w);
5313 if (maskmode == V2DImode && !TARGET_SSE4_1)
5314 {
5315 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
5316 more shuffle to convert the V2DI input mask into a V4SI
5317 input mask. At that point the masking that ix86_expand_int_vcond
5318 performs will work as desired. */
5319 rtx t3 = gen_reg_rtx (V4SImode);
5320 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
5321 const0_rtx, const0_rtx,
5322 const2_rtx, const2_rtx));
5323 mask = t3;
5324 maskmode = V4SImode;
5325 e = w = 4;
5326 }
5327
5328 vt = gen_const_vec_duplicate (maskmode, vt);
5329 vt = force_reg (maskmode, vt);
5330 mask = expand_simple_binop (maskmode, AND, mask, vt,
5331 NULL_RTX, 0, OPTAB_DIRECT);
5332
5333 if (GET_MODE (target) != mode)
5334 target = gen_reg_rtx (mode);
5335 xops[0] = target;
5336 xops[1] = gen_lowpart (mode, t2);
5337 xops[2] = gen_lowpart (mode, t1);
5338 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
5339 xops[4] = mask;
5340 xops[5] = vt;
5341 ok = ix86_expand_int_vcond (xops);
5342 gcc_assert (ok);
5343 if (target != operands[0])
5344 emit_move_insn (operands[0],
5345 gen_lowpart (GET_MODE (operands[0]), target));
5346 }
5347 }
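
/* Illustrative sketch (not compiled): in the generic two-operand SSSE3
   path above, each input is shuffled independently and the results are
   merged by testing the operand-select bit of the original indices:

       t1 = pshufb (op0, mask)
       t2 = pshufb (op1, mask)
       dest = (index & w) ? t2 : t1     -- bit w of an index selects op1

   where w is the number of elements per operand and the final select is
   done through ix86_expand_int_vcond.  */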
5348
5349 /* Unpack SRC into DEST, which has the next wider integer vector type. UNSIGNED_P is
5350 true if we should do zero extension, else sign extension. HIGH_P is
5351 true if we want the N/2 high elements, else the low elements. */
5352
5353 void
5354 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
5355 {
5356 machine_mode imode = GET_MODE (src);
5357 rtx tmp;
5358
5359 if (TARGET_SSE4_1)
5360 {
5361 rtx (*unpack)(rtx, rtx);
5362 rtx (*extract)(rtx, rtx) = NULL;
5363 machine_mode halfmode = BLKmode;
5364
5365 switch (imode)
5366 {
5367 case E_V64QImode:
5368 if (unsigned_p)
5369 unpack = gen_avx512bw_zero_extendv32qiv32hi2;
5370 else
5371 unpack = gen_avx512bw_sign_extendv32qiv32hi2;
5372 halfmode = V32QImode;
5373 extract
5374 = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
5375 break;
5376 case E_V32QImode:
5377 if (unsigned_p)
5378 unpack = gen_avx2_zero_extendv16qiv16hi2;
5379 else
5380 unpack = gen_avx2_sign_extendv16qiv16hi2;
5381 halfmode = V16QImode;
5382 extract
5383 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
5384 break;
5385 case E_V32HImode:
5386 if (unsigned_p)
5387 unpack = gen_avx512f_zero_extendv16hiv16si2;
5388 else
5389 unpack = gen_avx512f_sign_extendv16hiv16si2;
5390 halfmode = V16HImode;
5391 extract
5392 = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
5393 break;
5394 case E_V16HImode:
5395 if (unsigned_p)
5396 unpack = gen_avx2_zero_extendv8hiv8si2;
5397 else
5398 unpack = gen_avx2_sign_extendv8hiv8si2;
5399 halfmode = V8HImode;
5400 extract
5401 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
5402 break;
5403 case E_V16SImode:
5404 if (unsigned_p)
5405 unpack = gen_avx512f_zero_extendv8siv8di2;
5406 else
5407 unpack = gen_avx512f_sign_extendv8siv8di2;
5408 halfmode = V8SImode;
5409 extract
5410 = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
5411 break;
5412 case E_V8SImode:
5413 if (unsigned_p)
5414 unpack = gen_avx2_zero_extendv4siv4di2;
5415 else
5416 unpack = gen_avx2_sign_extendv4siv4di2;
5417 halfmode = V4SImode;
5418 extract
5419 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
5420 break;
5421 case E_V16QImode:
5422 if (unsigned_p)
5423 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
5424 else
5425 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
5426 break;
5427 case E_V8HImode:
5428 if (unsigned_p)
5429 unpack = gen_sse4_1_zero_extendv4hiv4si2;
5430 else
5431 unpack = gen_sse4_1_sign_extendv4hiv4si2;
5432 break;
5433 case E_V4SImode:
5434 if (unsigned_p)
5435 unpack = gen_sse4_1_zero_extendv2siv2di2;
5436 else
5437 unpack = gen_sse4_1_sign_extendv2siv2di2;
5438 break;
5439 case E_V8QImode:
5440 if (unsigned_p)
5441 unpack = gen_sse4_1_zero_extendv4qiv4hi2;
5442 else
5443 unpack = gen_sse4_1_sign_extendv4qiv4hi2;
5444 break;
5445 case E_V4HImode:
5446 if (unsigned_p)
5447 unpack = gen_sse4_1_zero_extendv2hiv2si2;
5448 else
5449 unpack = gen_sse4_1_sign_extendv2hiv2si2;
5450 break;
5451 case E_V4QImode:
5452 if (unsigned_p)
5453 unpack = gen_sse4_1_zero_extendv2qiv2hi2;
5454 else
5455 unpack = gen_sse4_1_sign_extendv2qiv2hi2;
5456 break;
5457 default:
5458 gcc_unreachable ();
5459 }
5460
5461 if (GET_MODE_SIZE (imode) >= 32)
5462 {
5463 tmp = gen_reg_rtx (halfmode);
5464 emit_insn (extract (tmp, src));
5465 }
5466 else if (high_p)
5467 {
5468 switch (GET_MODE_SIZE (imode))
5469 {
5470 case 16:
5471 /* Shift higher 8 bytes to lower 8 bytes. */
5472 tmp = gen_reg_rtx (V1TImode);
5473 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
5474 GEN_INT (64)));
5475 break;
5476 case 8:
5477 /* Shift higher 4 bytes to lower 4 bytes. */
5478 tmp = gen_reg_rtx (V1DImode);
5479 emit_insn (gen_mmx_lshrv1di3 (tmp, gen_lowpart (V1DImode, src),
5480 GEN_INT (32)));
5481 break;
5482 case 4:
5483 /* Shift higher 2 bytes to lower 2 bytes. */
5484 tmp = gen_reg_rtx (V1SImode);
5485 emit_insn (gen_mmx_lshrv1si3 (tmp, gen_lowpart (V1SImode, src),
5486 GEN_INT (16)));
5487 break;
5488 default:
5489 gcc_unreachable ();
5490 }
5491
5492 tmp = gen_lowpart (imode, tmp);
5493 }
5494 else
5495 tmp = src;
5496
5497 emit_insn (unpack (dest, tmp));
5498 }
5499 else
5500 {
5501 rtx (*unpack)(rtx, rtx, rtx);
5502
5503 switch (imode)
5504 {
5505 case E_V16QImode:
5506 if (high_p)
5507 unpack = gen_vec_interleave_highv16qi;
5508 else
5509 unpack = gen_vec_interleave_lowv16qi;
5510 break;
5511 case E_V8HImode:
5512 if (high_p)
5513 unpack = gen_vec_interleave_highv8hi;
5514 else
5515 unpack = gen_vec_interleave_lowv8hi;
5516 break;
5517 case E_V4SImode:
5518 if (high_p)
5519 unpack = gen_vec_interleave_highv4si;
5520 else
5521 unpack = gen_vec_interleave_lowv4si;
5522 break;
5523 case E_V8QImode:
5524 if (high_p)
5525 unpack = gen_mmx_punpckhbw;
5526 else
5527 unpack = gen_mmx_punpcklbw;
5528 break;
5529 case E_V4HImode:
5530 if (high_p)
5531 unpack = gen_mmx_punpckhwd;
5532 else
5533 unpack = gen_mmx_punpcklwd;
5534 break;
5535 case E_V4QImode:
5536 if (high_p)
5537 unpack = gen_mmx_punpckhbw_low;
5538 else
5539 unpack = gen_mmx_punpcklbw_low;
5540 break;
5541 default:
5542 gcc_unreachable ();
5543 }
5544
5545 if (unsigned_p)
5546 tmp = force_reg (imode, CONST0_RTX (imode));
5547 else
5548 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
5549 src, pc_rtx, pc_rtx);
5550
5551 rtx tmp2 = gen_reg_rtx (imode);
5552 emit_insn (unpack (tmp2, src, tmp));
5553 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
5554 }
5555 }
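
/* Illustrative sketch (not compiled): without SSE4.1 the sign-extend
   case is handled by interleaving with a computed sign mask, e.g.
   widening the low half of a V8HImode SRC to V4SImode:

       sign = pcmpgtw (zero, src)       -- all-ones where src < 0
       dest = punpcklwd (src, sign)     -- interleave value and sign

   while for zero extension the second interleave operand is simply a
   zero vector.  */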
5556
5557 /* Return true if MEM is a constant pool reference that contains a
5558 CONST_VECTOR permutation index, and if so assign the index to PERM. */
5559 bool
5560 ix86_extract_perm_from_pool_constant (int* perm, rtx mem)
5561 {
5562 machine_mode mode = GET_MODE (mem);
5563 int nelt = GET_MODE_NUNITS (mode);
5564
5565 if (!INTEGRAL_MODE_P (mode))
5566 return false;
5567
5568 /* Needs to be constant pool. */
5569 if (!(MEM_P (mem))
5570 || !SYMBOL_REF_P (XEXP (mem, 0))
5571 || !CONSTANT_POOL_ADDRESS_P (XEXP (mem, 0)))
5572 return false;
5573
5574 rtx constant = get_pool_constant (XEXP (mem, 0));
5575
5576 if (GET_CODE (constant) != CONST_VECTOR)
5577 return false;
5578
5579 /* There could be some rtx like
5580 (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
5581 but with "*.LC1" referring to a V2DI constant vector. */
5582 if (GET_MODE (constant) != mode)
5583 {
5584 constant = simplify_subreg (mode, constant, GET_MODE (constant), 0);
5585
5586 if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR)
5587 return false;
5588 }
5589
5590 for (int i = 0; i != nelt; i++)
5591 perm[i] = UINTVAL (XVECEXP (constant, 0, i));
5592
5593 return true;
5594 }
5595
5596 /* Split OPERAND into parts, stored in PARTS, and return their number.
5597 Similar to split_double_mode, but works for floating-point parameters
5598 and non-offsettable memories. For pushes, it returns just stack offsets;
5599 the values will be saved in the right order. At most four parts are generated. */
5600
5601 static int
5602 ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
5603 {
5604 int size;
5605
5606 if (!TARGET_64BIT)
5607 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
5608 else
5609 size = (GET_MODE_SIZE (mode) + 4) / 8;
5610
5611 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
5612 gcc_assert (size >= 2 && size <= 4);
5613
5614 /* Optimize constant pool references to immediates. This is used by fp
5615 moves, which force all constants to memory to allow combining. */
5616 if (MEM_P (operand) && MEM_READONLY_P (operand))
5617 operand = avoid_constant_pool_reference (operand);
5618
5619 if (MEM_P (operand) && !offsettable_memref_p (operand))
5620 {
5621 /* The only non-offsettable memories we handle are pushes. */
5622 int ok = push_operand (operand, VOIDmode);
5623
5624 gcc_assert (ok);
5625
5626 operand = copy_rtx (operand);
5627 PUT_MODE (operand, word_mode);
5628 parts[0] = parts[1] = parts[2] = parts[3] = operand;
5629 return size;
5630 }
5631
5632 if (GET_CODE (operand) == CONST_VECTOR)
5633 {
5634 scalar_int_mode imode = int_mode_for_mode (mode).require ();
5635 /* Caution: if we looked through a constant pool memory above,
5636 the operand may actually have a different mode now. That's
5637 ok, since we want to pun this all the way back to an integer. */
5638 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
5639 gcc_assert (operand != NULL);
5640 mode = imode;
5641 }
5642
5643 if (!TARGET_64BIT)
5644 {
5645 if (mode == DImode)
5646 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
5647 else
5648 {
5649 int i;
5650
5651 if (REG_P (operand))
5652 {
5653 gcc_assert (reload_completed);
5654 for (i = 0; i < size; i++)
5655 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
5656 }
5657 else if (offsettable_memref_p (operand))
5658 {
5659 operand = adjust_address (operand, SImode, 0);
5660 parts[0] = operand;
5661 for (i = 1; i < size; i++)
5662 parts[i] = adjust_address (operand, SImode, 4 * i);
5663 }
5664 else if (CONST_DOUBLE_P (operand))
5665 {
5666 const REAL_VALUE_TYPE *r;
5667 long l[4];
5668
5669 r = CONST_DOUBLE_REAL_VALUE (operand);
5670 switch (mode)
5671 {
5672 case E_TFmode:
5673 real_to_target (l, r, mode);
5674 parts[3] = gen_int_mode (l[3], SImode);
5675 parts[2] = gen_int_mode (l[2], SImode);
5676 break;
5677 case E_XFmode:
5678 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
5679 long double may not be 80-bit. */
5680 real_to_target (l, r, mode);
5681 parts[2] = gen_int_mode (l[2], SImode);
5682 break;
5683 case E_DFmode:
5684 REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
5685 break;
5686 default:
5687 gcc_unreachable ();
5688 }
5689 parts[1] = gen_int_mode (l[1], SImode);
5690 parts[0] = gen_int_mode (l[0], SImode);
5691 }
5692 else
5693 gcc_unreachable ();
5694 }
5695 }
5696 else
5697 {
5698 if (mode == TImode)
5699 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
5700 if (mode == XFmode || mode == TFmode)
5701 {
5702 machine_mode upper_mode = mode==XFmode ? SImode : DImode;
5703 if (REG_P (operand))
5704 {
5705 gcc_assert (reload_completed);
5706 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
5707 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
5708 }
5709 else if (offsettable_memref_p (operand))
5710 {
5711 operand = adjust_address (operand, DImode, 0);
5712 parts[0] = operand;
5713 parts[1] = adjust_address (operand, upper_mode, 8);
5714 }
5715 else if (CONST_DOUBLE_P (operand))
5716 {
5717 long l[4];
5718
5719 real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
5720
5721 /* real_to_target puts 32-bit pieces in each long. */
5722 parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
5723 | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
5724 << 32), DImode);
5725
5726 if (upper_mode == SImode)
5727 parts[1] = gen_int_mode (l[2], SImode);
5728 else
5729 parts[1]
5730 = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
5731 | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
5732 << 32), DImode);
5733 }
5734 else
5735 gcc_unreachable ();
5736 }
5737 }
5738
5739 return size;
5740 }
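
/* Illustrative note: for example, on a 32-bit target a DFmode value
   splits into two SImode parts, an XFmode value into three and a
   TFmode value into four, while on a 64-bit target XFmode splits into
   a DImode part plus an SImode upper part.  */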
5741
5742 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
5743 The operands are split into parts (see ix86_split_to_parts) and the
5744 parts are then moved or pushed one by one, in an order chosen to
5745 avoid overwriting a source part before it has been copied. */
5746
5747 void
5748 ix86_split_long_move (rtx operands[])
5749 {
5750 rtx part[2][4];
5751 int nparts, i, j;
5752 int push = 0;
5753 int collisions = 0;
5754 machine_mode mode = GET_MODE (operands[0]);
5755 bool collisionparts[4];
5756
5757 /* The DFmode expanders may ask us to move a double.
5758 For a 64-bit target this is a single move. By hiding that fact
5759 here we simplify the i386.md splitters. */
5760 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
5761 {
5762 /* Optimize constant pool references to immediates. This is used by
5763 fp moves, which force all constants to memory to allow combining. */
5764
5765 if (MEM_P (operands[1])
5766 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
5767 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
5768 operands[1] = get_pool_constant (XEXP (operands[1], 0));
5769 if (push_operand (operands[0], VOIDmode))
5770 {
5771 operands[0] = copy_rtx (operands[0]);
5772 PUT_MODE (operands[0], word_mode);
5773 }
5774 else
5775 operands[0] = gen_lowpart (DImode, operands[0]);
5776 operands[1] = gen_lowpart (DImode, operands[1]);
5777 emit_move_insn (operands[0], operands[1]);
5778 return;
5779 }
5780
5781 /* The only non-offsettable memory we handle is a push. */
5782 if (push_operand (operands[0], VOIDmode))
5783 push = 1;
5784 else
5785 gcc_assert (!MEM_P (operands[0])
5786 || offsettable_memref_p (operands[0]));
5787
5788 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
5789 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
5790
5791 /* When emitting push, take care for source operands on the stack. */
5792 if (push && MEM_P (operands[1])
5793 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
5794 {
5795 rtx src_base = XEXP (part[1][nparts - 1], 0);
5796
5797 /* Compensate for the stack decrement by 4. */
5798 if (!TARGET_64BIT && nparts == 3
5799 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
5800 src_base = plus_constant (Pmode, src_base, 4);
5801
5802 /* src_base refers to the stack pointer and is
5803 automatically decreased by emitted push. */
5804 for (i = 0; i < nparts; i++)
5805 part[1][i] = change_address (part[1][i],
5806 GET_MODE (part[1][i]), src_base);
5807 }
5808
5809 /* We need to do the copy in the right order in case an address register
5810 of the source overlaps the destination. */
5811 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
5812 {
5813 rtx tmp;
5814
5815 for (i = 0; i < nparts; i++)
5816 {
5817 collisionparts[i]
5818 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
5819 if (collisionparts[i])
5820 collisions++;
5821 }
5822
5823 /* Collision in the middle part can be handled by reordering. */
5824 if (collisions == 1 && nparts == 3 && collisionparts [1])
5825 {
5826 std::swap (part[0][1], part[0][2]);
5827 std::swap (part[1][1], part[1][2]);
5828 }
5829 else if (collisions == 1
5830 && nparts == 4
5831 && (collisionparts [1] || collisionparts [2]))
5832 {
5833 if (collisionparts [1])
5834 {
5835 std::swap (part[0][1], part[0][2]);
5836 std::swap (part[1][1], part[1][2]);
5837 }
5838 else
5839 {
5840 std::swap (part[0][2], part[0][3]);
5841 std::swap (part[1][2], part[1][3]);
5842 }
5843 }
5844
5845 /* If there are more collisions, we can't handle them by reordering.
5846 Do an lea to the last part and use only one colliding move. */
5847 else if (collisions > 1)
5848 {
5849 rtx base, addr;
5850
5851 collisions = 1;
5852
5853 base = part[0][nparts - 1];
5854
5855 /* Handle the case when the last part isn't valid for lea.
5856 Happens in 64-bit mode storing the 12-byte XFmode. */
5857 if (GET_MODE (base) != Pmode)
5858 base = gen_rtx_REG (Pmode, REGNO (base));
5859
5860 addr = XEXP (part[1][0], 0);
5861 if (TARGET_TLS_DIRECT_SEG_REFS)
5862 {
5863 struct ix86_address parts;
5864 int ok = ix86_decompose_address (addr, &parts);
5865 gcc_assert (ok);
5866 /* It is not valid to use %gs: or %fs: in lea. */
5867 gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
5868 }
5869 emit_insn (gen_rtx_SET (base, addr));
5870 part[1][0] = replace_equiv_address (part[1][0], base);
5871 for (i = 1; i < nparts; i++)
5872 {
5873 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
5874 part[1][i] = replace_equiv_address (part[1][i], tmp);
5875 }
5876 }
5877 }
5878
5879 if (push)
5880 {
5881 if (!TARGET_64BIT)
5882 {
5883 if (nparts == 3)
5884 {
5885 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
5886 emit_insn (gen_add2_insn (stack_pointer_rtx, GEN_INT (-4)));
5887 emit_move_insn (part[0][2], part[1][2]);
5888 }
5889 else if (nparts == 4)
5890 {
5891 emit_move_insn (part[0][3], part[1][3]);
5892 emit_move_insn (part[0][2], part[1][2]);
5893 }
5894 }
5895 else
5896 {
5897 /* In 64-bit mode we don't have a 32-bit push available. If this is
5898 a register, that is OK - we will just use the larger counterpart.
5899 We also retype memory - this comes from an attempt to avoid a REX
5900 prefix when moving the second half of a TFmode value. */
5901 if (GET_MODE (part[1][1]) == SImode)
5902 {
5903 switch (GET_CODE (part[1][1]))
5904 {
5905 case MEM:
5906 part[1][1] = adjust_address (part[1][1], DImode, 0);
5907 break;
5908
5909 case REG:
5910 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
5911 break;
5912
5913 default:
5914 gcc_unreachable ();
5915 }
5916
5917 if (GET_MODE (part[1][0]) == SImode)
5918 part[1][0] = part[1][1];
5919 }
5920 }
5921 emit_move_insn (part[0][1], part[1][1]);
5922 emit_move_insn (part[0][0], part[1][0]);
5923 return;
5924 }
5925
5926 /* Choose correct order to not overwrite the source before it is copied. */
5927 if ((REG_P (part[0][0])
5928 && REG_P (part[1][1])
5929 && (REGNO (part[0][0]) == REGNO (part[1][1])
5930 || (nparts == 3
5931 && REGNO (part[0][0]) == REGNO (part[1][2]))
5932 || (nparts == 4
5933 && REGNO (part[0][0]) == REGNO (part[1][3]))))
5934 || (collisions > 0
5935 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
5936 {
5937 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
5938 {
5939 operands[2 + i] = part[0][j];
5940 operands[6 + i] = part[1][j];
5941 }
5942 }
5943 else
5944 {
5945 for (i = 0; i < nparts; i++)
5946 {
5947 operands[2 + i] = part[0][i];
5948 operands[6 + i] = part[1][i];
5949 }
5950 }
5951
5952 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
5953 if (optimize_insn_for_size_p ())
5954 {
5955 for (j = 0; j < nparts - 1; j++)
5956 if (CONST_INT_P (operands[6 + j])
5957 && operands[6 + j] != const0_rtx
5958 && REG_P (operands[2 + j]))
5959 for (i = j; i < nparts - 1; i++)
5960 if (CONST_INT_P (operands[7 + i])
5961 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
5962 operands[7 + i] = operands[2 + j];
5963 }
5964
5965 for (i = 0; i < nparts; i++)
5966 emit_move_insn (operands[2 + i], operands[6 + i]);
5967
5968 return;
5969 }
5970
5971 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
5972 left shift by a constant, either using a single shift or
5973 a sequence of add instructions. */
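/* For illustration (the operand here is in the half mode, so SImode when
splitting DImode): a count of 1 is always emitted as a single "add reg, reg";
a count C is emitted as C such adds only when the cost model says C adds are
no more expensive than one constant shift and we are not optimizing for size,
otherwise a single "sal $C, reg" is emitted. */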
5974
5975 static void
5976 ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
5977 {
5978 if (count == 1
5979 || (count * ix86_cost->add <= ix86_cost->shift_const
5980 && !optimize_insn_for_size_p ()))
5981 {
5982 while (count-- > 0)
5983 emit_insn (gen_add2_insn (operand, operand));
5984 }
5985 else
5986 {
5987 rtx (*insn)(rtx, rtx, rtx);
5988
5989 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
5990 emit_insn (insn (operand, operand, GEN_INT (count)));
5991 }
5992 }
5993
5994 void
5995 ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
5996 {
5997 rtx (*gen_ashl3)(rtx, rtx, rtx);
5998 rtx (*gen_shld)(rtx, rtx, rtx);
5999 int half_width = GET_MODE_BITSIZE (mode) >> 1;
6000 machine_mode half_mode;
6001
6002 rtx low[2], high[2];
6003 int count;
6004
6005 if (CONST_INT_P (operands[2]))
6006 {
6007 split_double_mode (mode, operands, 2, low, high);
6008 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
6009
6010 if (count >= half_width)
6011 {
6012 emit_move_insn (high[0], low[1]);
6013 emit_move_insn (low[0], const0_rtx);
6014
6015 if (count > half_width)
6016 ix86_expand_ashl_const (high[0], count - half_width, mode);
6017 }
6018 else
6019 {
6020 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
6021
6022 if (!rtx_equal_p (operands[0], operands[1]))
6023 emit_move_insn (operands[0], operands[1]);
6024
6025 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
6026 ix86_expand_ashl_const (low[0], count, mode);
6027 }
6028 return;
6029 }
6030
6031 split_double_mode (mode, operands, 1, low, high);
6032 half_mode = mode == DImode ? SImode : DImode;
6033
6034 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
6035
6036 if (operands[1] == const1_rtx)
6037 {
6038 /* Assuming we've chosen QImode-capable registers, then 1 << N
6039 can be done with two 32/64-bit shifts, no branches, no cmoves. */
6040 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
6041 {
6042 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
6043
6044 ix86_expand_clear (low[0]);
6045 ix86_expand_clear (high[0]);
6046 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
6047
6048 d = gen_lowpart (QImode, low[0]);
6049 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
6050 s = gen_rtx_EQ (QImode, flags, const0_rtx);
6051 emit_insn (gen_rtx_SET (d, s));
6052
6053 d = gen_lowpart (QImode, high[0]);
6054 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
6055 s = gen_rtx_NE (QImode, flags, const0_rtx);
6056 emit_insn (gen_rtx_SET (d, s));
6057 }
6058
6059 /* Otherwise, we can get the same results by manually performing
6060 a bit extract operation on bit 5/6, and then performing the two
6061 shifts. The two methods of getting 0/1 into low/high are exactly
6062 the same size. Avoiding the shift in the bit extract case helps
6063 pentium4 a bit; no one else seems to care much either way. */
6064 else
6065 {
6066 rtx (*gen_lshr3)(rtx, rtx, rtx);
6067 rtx (*gen_and3)(rtx, rtx, rtx);
6068 rtx (*gen_xor3)(rtx, rtx, rtx);
6069 HOST_WIDE_INT bits;
6070 rtx x;
6071
6072 if (mode == DImode)
6073 {
6074 gen_lshr3 = gen_lshrsi3;
6075 gen_and3 = gen_andsi3;
6076 gen_xor3 = gen_xorsi3;
6077 bits = 5;
6078 }
6079 else
6080 {
6081 gen_lshr3 = gen_lshrdi3;
6082 gen_and3 = gen_anddi3;
6083 gen_xor3 = gen_xordi3;
6084 bits = 6;
6085 }
6086
6087 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
6088 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
6089 else
6090 x = gen_lowpart (half_mode, operands[2]);
6091 emit_insn (gen_rtx_SET (high[0], x));
6092
6093 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
6094 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
6095 emit_move_insn (low[0], high[0]);
6096 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
6097 }
6098
6099 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
6100 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
6101 return;
6102 }
6103
6104 if (operands[1] == constm1_rtx)
6105 {
6106 /* For -1 << N, we can avoid the shld instruction, because we
6107 know that we're shifting 0...31/63 ones into a -1. */
6108 emit_move_insn (low[0], constm1_rtx);
6109 if (optimize_insn_for_size_p ())
6110 emit_move_insn (high[0], low[0]);
6111 else
6112 emit_move_insn (high[0], constm1_rtx);
6113 }
6114 else
6115 {
6116 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
6117
6118 if (!rtx_equal_p (operands[0], operands[1]))
6119 emit_move_insn (operands[0], operands[1]);
6120
6121 split_double_mode (mode, operands, 1, low, high);
6122 emit_insn (gen_shld (high[0], low[0], operands[2]));
6123 }
6124
6125 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
6126
6127 if (TARGET_CMOVE && scratch)
6128 {
6129 ix86_expand_clear (scratch);
6130 emit_insn (gen_x86_shift_adj_1
6131 (half_mode, high[0], low[0], operands[2], scratch));
6132 }
6133 else
6134 emit_insn (gen_x86_shift_adj_2 (half_mode, high[0], low[0], operands[2]));
6135 }
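
/* Example for ix86_split_ashl above: on ia32, a DImode shift left by a
constant 40 becomes roughly "mov lo, hi; xor lo, lo; shl $8, hi", while a
constant count below 32 uses "shld $count, lo, hi" followed by the low-half
shift; non-constant counts additionally need the cmove/branch adjustment
emitted at the end. */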
6136
6137 void
6138 ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
6139 {
6140 rtx (*gen_ashr3)(rtx, rtx, rtx)
6141 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
6142 rtx (*gen_shrd)(rtx, rtx, rtx);
6143 int half_width = GET_MODE_BITSIZE (mode) >> 1;
6144
6145 rtx low[2], high[2];
6146 int count;
6147
6148 if (CONST_INT_P (operands[2]))
6149 {
6150 split_double_mode (mode, operands, 2, low, high);
6151 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
6152
6153 if (count == GET_MODE_BITSIZE (mode) - 1)
6154 {
6155 emit_move_insn (high[0], high[1]);
6156 emit_insn (gen_ashr3 (high[0], high[0],
6157 GEN_INT (half_width - 1)));
6158 emit_move_insn (low[0], high[0]);
6159
6160 }
6161 else if (count >= half_width)
6162 {
6163 emit_move_insn (low[0], high[1]);
6164 emit_move_insn (high[0], low[0]);
6165 emit_insn (gen_ashr3 (high[0], high[0],
6166 GEN_INT (half_width - 1)));
6167
6168 if (count > half_width)
6169 emit_insn (gen_ashr3 (low[0], low[0],
6170 GEN_INT (count - half_width)));
6171 }
6172 else
6173 {
6174 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6175
6176 if (!rtx_equal_p (operands[0], operands[1]))
6177 emit_move_insn (operands[0], operands[1]);
6178
6179 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
6180 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
6181 }
6182 }
6183 else
6184 {
6185 machine_mode half_mode;
6186
6187 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6188
6189 if (!rtx_equal_p (operands[0], operands[1]))
6190 emit_move_insn (operands[0], operands[1]);
6191
6192 split_double_mode (mode, operands, 1, low, high);
6193 half_mode = mode == DImode ? SImode : DImode;
6194
6195 emit_insn (gen_shrd (low[0], high[0], operands[2]));
6196 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
6197
6198 if (TARGET_CMOVE && scratch)
6199 {
6200 emit_move_insn (scratch, high[0]);
6201 emit_insn (gen_ashr3 (scratch, scratch,
6202 GEN_INT (half_width - 1)));
6203 emit_insn (gen_x86_shift_adj_1
6204 (half_mode, low[0], high[0], operands[2], scratch));
6205 }
6206 else
6207 emit_insn (gen_x86_shift_adj_3
6208 (half_mode, low[0], high[0], operands[2]));
6209 }
6210 }
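
/* Example for ix86_split_ashr above: on ia32, an arithmetic DImode shift
right by 63 becomes roughly "mov hi_src, hi; sar $31, hi; mov hi, lo",
leaving the sign mask in both halves; a shift by 40 instead copies the high
source half into the low half, shifts it right by 8 and fills the high half
with the sign via "sar $31". */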
6211
6212 void
6213 ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
6214 {
6215 rtx (*gen_lshr3)(rtx, rtx, rtx)
6216 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
6217 rtx (*gen_shrd)(rtx, rtx, rtx);
6218 int half_width = GET_MODE_BITSIZE (mode) >> 1;
6219
6220 rtx low[2], high[2];
6221 int count;
6222
6223 if (CONST_INT_P (operands[2]))
6224 {
6225 split_double_mode (mode, operands, 2, low, high);
6226 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
6227
6228 if (count >= half_width)
6229 {
6230 emit_move_insn (low[0], high[1]);
6231 ix86_expand_clear (high[0]);
6232
6233 if (count > half_width)
6234 emit_insn (gen_lshr3 (low[0], low[0],
6235 GEN_INT (count - half_width)));
6236 }
6237 else
6238 {
6239 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6240
6241 if (!rtx_equal_p (operands[0], operands[1]))
6242 emit_move_insn (operands[0], operands[1]);
6243
6244 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
6245 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
6246 }
6247 }
6248 else
6249 {
6250 machine_mode half_mode;
6251
6252 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6253
6254 if (!rtx_equal_p (operands[0], operands[1]))
6255 emit_move_insn (operands[0], operands[1]);
6256
6257 split_double_mode (mode, operands, 1, low, high);
6258 half_mode = mode == DImode ? SImode : DImode;
6259
6260 emit_insn (gen_shrd (low[0], high[0], operands[2]));
6261 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
6262
6263 if (TARGET_CMOVE && scratch)
6264 {
6265 ix86_expand_clear (scratch);
6266 emit_insn (gen_x86_shift_adj_1
6267 (half_mode, low[0], high[0], operands[2], scratch));
6268 }
6269 else
6270 emit_insn (gen_x86_shift_adj_2
6271 (half_mode, low[0], high[0], operands[2]));
6272 }
6273 }
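
/* Example for ix86_split_lshr above: on ia32, a logical DImode shift right
by a constant 40 becomes roughly "mov hi_src, lo; xor hi, hi; shr $8, lo";
counts below 32 use "shrd $count, hi, lo" plus a shift of the high half. */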
6274
6275 /* Expand move of V1TI mode register X to a new TI mode register. */
6276 static rtx
6277 ix86_expand_v1ti_to_ti (rtx x)
6278 {
6279 rtx result = gen_reg_rtx (TImode);
6280 if (TARGET_SSE2)
6281 {
6282 rtx temp = force_reg (V2DImode, gen_lowpart (V2DImode, x));
6283 rtx lo = gen_lowpart (DImode, result);
6284 emit_insn (gen_vec_extractv2didi (lo, temp, const0_rtx));
6285 rtx hi = gen_highpart (DImode, result);
6286 emit_insn (gen_vec_extractv2didi (hi, temp, const1_rtx));
6287 }
6288 else
6289 emit_move_insn (result, gen_lowpart (TImode, x));
6290 return result;
6291 }
6292
6293 /* Expand move of TI mode register X to a new V1TI mode register. */
6294 static rtx
6295 ix86_expand_ti_to_v1ti (rtx x)
6296 {
6297 if (TARGET_SSE2)
6298 {
6299 rtx lo = gen_lowpart (DImode, x);
6300 rtx hi = gen_highpart (DImode, x);
6301 rtx tmp = gen_reg_rtx (V2DImode);
6302 emit_insn (gen_vec_concatv2di (tmp, lo, hi));
6303 return force_reg (V1TImode, gen_lowpart (V1TImode, tmp));
6304 }
6305
6306 return force_reg (V1TImode, gen_lowpart (V1TImode, x));
6307 }
6308
6309 /* Expand V1TI mode shift (of rtx_code CODE) by constant. */
6310 void
6311 ix86_expand_v1ti_shift (enum rtx_code code, rtx operands[])
6312 {
6313 rtx op1 = force_reg (V1TImode, operands[1]);
6314
6315 if (!CONST_INT_P (operands[2]))
6316 {
6317 rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
6318 rtx tmp2 = gen_reg_rtx (TImode);
6319 rtx (*shift) (rtx, rtx, rtx)
6320 = (code == ASHIFT) ? gen_ashlti3 : gen_lshrti3;
6321 emit_insn (shift (tmp2, tmp1, operands[2]));
6322 rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
6323 emit_move_insn (operands[0], tmp3);
6324 return;
6325 }
6326
6327 HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
6328
6329 if (bits == 0)
6330 {
6331 emit_move_insn (operands[0], op1);
6332 return;
6333 }
6334
6335 if ((bits & 7) == 0)
6336 {
6337 rtx tmp = gen_reg_rtx (V1TImode);
6338 if (code == ASHIFT)
6339 emit_insn (gen_sse2_ashlv1ti3 (tmp, op1, GEN_INT (bits)));
6340 else
6341 emit_insn (gen_sse2_lshrv1ti3 (tmp, op1, GEN_INT (bits)));
6342 emit_move_insn (operands[0], tmp);
6343 return;
6344 }
6345
6346 rtx tmp1 = gen_reg_rtx (V1TImode);
6347 if (code == ASHIFT)
6348 emit_insn (gen_sse2_ashlv1ti3 (tmp1, op1, GEN_INT (64)));
6349 else
6350 emit_insn (gen_sse2_lshrv1ti3 (tmp1, op1, GEN_INT (64)));
6351
6352 /* tmp2 is operands[1] shifted by 64, in V2DImode. */
6353 rtx tmp2 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
6354
6355 /* tmp3 will be the V2DImode result. */
6356 rtx tmp3 = gen_reg_rtx (V2DImode);
6357
6358 if (bits > 64)
6359 {
6360 if (code == ASHIFT)
6361 emit_insn (gen_ashlv2di3 (tmp3, tmp2, GEN_INT (bits - 64)));
6362 else
6363 emit_insn (gen_lshrv2di3 (tmp3, tmp2, GEN_INT (bits - 64)));
6364 }
6365 else
6366 {
6367 /* tmp4 is operands[1], in V2DImode. */
6368 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
6369
6370 rtx tmp5 = gen_reg_rtx (V2DImode);
6371 if (code == ASHIFT)
6372 emit_insn (gen_ashlv2di3 (tmp5, tmp4, GEN_INT (bits)));
6373 else
6374 emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
6375
6376 rtx tmp6 = gen_reg_rtx (V2DImode);
6377 if (code == ASHIFT)
6378 emit_insn (gen_lshrv2di3 (tmp6, tmp2, GEN_INT (64 - bits)));
6379 else
6380 emit_insn (gen_ashlv2di3 (tmp6, tmp2, GEN_INT (64 - bits)));
6381
6382 emit_insn (gen_iorv2di3 (tmp3, tmp5, tmp6));
6383 }
6384
6385 /* Convert the result back to V1TImode and store in operands[0]. */
6386 rtx tmp7 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
6387 emit_move_insn (operands[0], tmp7);
6388 }
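
/* For example, with the expansion above a V1TImode value X shifted left by
20 becomes roughly: T = X shifted left by 64 bits ("pslldq $8"), then
RESULT = (X << 20 per 64-bit lane) | (T >> 44 per 64-bit lane), where the
second term supplies the 20 bits that cross the 64-bit lane boundary. */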
6389
6390 /* Expand V1TI mode rotate (of rtx_code CODE) by constant. */
6391 void
6392 ix86_expand_v1ti_rotate (enum rtx_code code, rtx operands[])
6393 {
6394 rtx op1 = force_reg (V1TImode, operands[1]);
6395
6396 if (!CONST_INT_P (operands[2]))
6397 {
6398 rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
6399 rtx tmp2 = gen_reg_rtx (TImode);
6400 rtx (*rotate) (rtx, rtx, rtx)
6401 = (code == ROTATE) ? gen_rotlti3 : gen_rotrti3;
6402 emit_insn (rotate (tmp2, tmp1, operands[2]));
6403 rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
6404 emit_move_insn (operands[0], tmp3);
6405 return;
6406 }
6407
6408 HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
6409
6410 if (bits == 0)
6411 {
6412 emit_move_insn (operands[0], op1);
6413 return;
6414 }
6415
6416 if (code == ROTATERT)
6417 bits = 128 - bits;
6418
6419 if ((bits & 31) == 0)
6420 {
6421 rtx tmp2 = gen_reg_rtx (V4SImode);
6422 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6423 if (bits == 32)
6424 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x93)));
6425 else if (bits == 64)
6426 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x4e)));
6427 else
6428 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x39)));
6429 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp2));
6430 return;
6431 }
6432
6433 if ((bits & 7) == 0)
6434 {
6435 rtx tmp1 = gen_reg_rtx (V1TImode);
6436 rtx tmp2 = gen_reg_rtx (V1TImode);
6437 rtx tmp3 = gen_reg_rtx (V1TImode);
6438
6439 emit_insn (gen_sse2_ashlv1ti3 (tmp1, op1, GEN_INT (bits)));
6440 emit_insn (gen_sse2_lshrv1ti3 (tmp2, op1, GEN_INT (128 - bits)));
6441 emit_insn (gen_iorv1ti3 (tmp3, tmp1, tmp2));
6442 emit_move_insn (operands[0], tmp3);
6443 return;
6444 }
6445
6446 rtx op1_v4si = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6447
6448 rtx lobits;
6449 rtx hibits;
6450
6451 switch (bits >> 5)
6452 {
6453 case 0:
6454 lobits = op1_v4si;
6455 hibits = gen_reg_rtx (V4SImode);
6456 emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x93)));
6457 break;
6458
6459 case 1:
6460 lobits = gen_reg_rtx (V4SImode);
6461 hibits = gen_reg_rtx (V4SImode);
6462 emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x93)));
6463 emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x4e)));
6464 break;
6465
6466 case 2:
6467 lobits = gen_reg_rtx (V4SImode);
6468 hibits = gen_reg_rtx (V4SImode);
6469 emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x4e)));
6470 emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x39)));
6471 break;
6472
6473 default:
6474 lobits = gen_reg_rtx (V4SImode);
6475 emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x39)));
6476 hibits = op1_v4si;
6477 break;
6478 }
6479
6480 rtx tmp1 = gen_reg_rtx (V4SImode);
6481 rtx tmp2 = gen_reg_rtx (V4SImode);
6482 rtx tmp3 = gen_reg_rtx (V4SImode);
6483
6484 emit_insn (gen_ashlv4si3 (tmp1, lobits, GEN_INT (bits & 31)));
6485 emit_insn (gen_lshrv4si3 (tmp2, hibits, GEN_INT (32 - (bits & 31))));
6486 emit_insn (gen_iorv4si3 (tmp3, tmp1, tmp2));
6487
6488 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp3));
6489 }
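
/* For example, with the expansion above a V1TImode rotate left by 32 is a
single "pshufd $0x93" and a rotate by 64 is "pshufd $0x4e", while rotates by
other multiples of 8 use a pair of whole-register byte shifts combined with
"por". */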
6490
6491 /* Expand V1TI mode ashiftrt by constant. */
6492 void
6493 ix86_expand_v1ti_ashiftrt (rtx operands[])
6494 {
6495 rtx op1 = force_reg (V1TImode, operands[1]);
6496
6497 if (!CONST_INT_P (operands[2]))
6498 {
6499 rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
6500 rtx tmp2 = gen_reg_rtx (TImode);
6501 emit_insn (gen_ashrti3 (tmp2, tmp1, operands[2]));
6502 rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
6503 emit_move_insn (operands[0], tmp3);
6504 return;
6505 }
6506
6507 HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
6508
6509 if (bits == 0)
6510 {
6511 emit_move_insn (operands[0], op1);
6512 return;
6513 }
6514
6515 if (bits == 127)
6516 {
6517 /* Two operations. */
6518 rtx tmp1 = force_reg(V4SImode, gen_lowpart (V4SImode, op1));
6519 rtx tmp2 = gen_reg_rtx (V4SImode);
6520 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6521
6522 rtx tmp3 = gen_reg_rtx (V4SImode);
6523 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6524
6525 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp3));
6526 return;
6527 }
6528
6529 if (bits == 64)
6530 {
6531 /* Three operations. */
6532 rtx tmp1 = force_reg(V4SImode, gen_lowpart (V4SImode, op1));
6533 rtx tmp2 = gen_reg_rtx (V4SImode);
6534 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6535
6536 rtx tmp3 = gen_reg_rtx (V4SImode);
6537 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6538
6539 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
6540 rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
6541 rtx tmp6 = gen_reg_rtx (V2DImode);
6542 emit_insn (gen_vec_interleave_highv2di (tmp6, tmp4, tmp5));
6543
6544 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
6545 return;
6546 }
6547
6548 if (bits == 96)
6549 {
6550 /* Three operations. */
6551 rtx tmp1 = force_reg(V4SImode, gen_lowpart (V4SImode, op1));
6552 rtx tmp2 = gen_reg_rtx (V4SImode);
6553 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (31)));
6554
6555 rtx tmp3 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
6556 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp2));
6557 rtx tmp5 = gen_reg_rtx (V2DImode);
6558 emit_insn (gen_vec_interleave_highv2di (tmp5, tmp3, tmp4));
6559
6560 rtx tmp6 = force_reg(V4SImode, gen_lowpart (V4SImode, tmp5));
6561 rtx tmp7 = gen_reg_rtx (V4SImode);
6562 emit_insn (gen_sse2_pshufd (tmp7, tmp6, GEN_INT (0xfd)));
6563
6564 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp7));
6565 return;
6566 }
6567
6568 if (bits >= 111)
6569 {
6570 /* Three operations. */
6571 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6572 rtx tmp2 = gen_reg_rtx (V4SImode);
6573 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits - 96)));
6574
6575 rtx tmp3 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
6576 rtx tmp4 = gen_reg_rtx (V8HImode);
6577 emit_insn (gen_sse2_pshufhw (tmp4, tmp3, GEN_INT (0xfe)));
6578
6579 rtx tmp5 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp4));
6580 rtx tmp6 = gen_reg_rtx (V4SImode);
6581 emit_insn (gen_sse2_pshufd (tmp6, tmp5, GEN_INT (0xfe)));
6582
6583 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
6584 return;
6585 }
6586
6587 if (TARGET_AVX2 || TARGET_SSE4_1)
6588 {
6589 /* Three operations. */
6590 if (bits == 32)
6591 {
6592 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6593 rtx tmp2 = gen_reg_rtx (V4SImode);
6594 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (31)));
6595
6596 rtx tmp3 = gen_reg_rtx (V1TImode);
6597 emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (32)));
6598
6599 if (TARGET_AVX2)
6600 {
6601 rtx tmp4 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp3));
6602 rtx tmp5 = gen_reg_rtx (V4SImode);
6603 emit_insn (gen_avx2_pblenddv4si (tmp5, tmp2, tmp4,
6604 GEN_INT (7)));
6605
6606 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp5));
6607 }
6608 else
6609 {
6610 rtx tmp4 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
6611 rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
6612 rtx tmp6 = gen_reg_rtx (V8HImode);
6613 emit_insn (gen_sse4_1_pblendw (tmp6, tmp4, tmp5,
6614 GEN_INT (0x3f)));
6615
6616 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
6617 }
6618 return;
6619 }
6620
6621 /* Three operations. */
6622 if (bits == 8 || bits == 16 || bits == 24)
6623 {
6624 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6625 rtx tmp2 = gen_reg_rtx (V4SImode);
6626 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
6627
6628 rtx tmp3 = gen_reg_rtx (V1TImode);
6629 emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (bits)));
6630
6631 if (TARGET_AVX2)
6632 {
6633 rtx tmp4 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp3));
6634 rtx tmp5 = gen_reg_rtx (V4SImode);
6635 emit_insn (gen_avx2_pblenddv4si (tmp5, tmp2, tmp4,
6636 GEN_INT (7)));
6637
6638 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp5));
6639 }
6640 else
6641 {
6642 rtx tmp4 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
6643 rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
6644 rtx tmp6 = gen_reg_rtx (V8HImode);
6645 emit_insn (gen_sse4_1_pblendw (tmp6, tmp4, tmp5,
6646 GEN_INT (0x3f)));
6647
6648 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
6649 }
6650 return;
6651 }
6652 }
6653
6654 if (bits > 96)
6655 {
6656 /* Four operations. */
6657 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6658 rtx tmp2 = gen_reg_rtx (V4SImode);
6659 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits - 96)));
6660
6661 rtx tmp3 = gen_reg_rtx (V4SImode);
6662 emit_insn (gen_ashrv4si3 (tmp3, tmp1, GEN_INT (31)));
6663
6664 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp2));
6665 rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
6666 rtx tmp6 = gen_reg_rtx (V2DImode);
6667 emit_insn (gen_vec_interleave_highv2di (tmp6, tmp4, tmp5));
6668
6669 rtx tmp7 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp6));
6670 rtx tmp8 = gen_reg_rtx (V4SImode);
6671 emit_insn (gen_sse2_pshufd (tmp8, tmp7, GEN_INT (0xfd)));
6672
6673 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp8));
6674 return;
6675 }
6676
6677 if (TARGET_SSE4_1 && (bits == 48 || bits == 80))
6678 {
6679 /* Four operations. */
6680 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6681 rtx tmp2 = gen_reg_rtx (V4SImode);
6682 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6683
6684 rtx tmp3 = gen_reg_rtx (V4SImode);
6685 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6686
6687 rtx tmp4 = gen_reg_rtx (V1TImode);
6688 emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (bits)));
6689
6690 rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
6691 rtx tmp6 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp4));
6692 rtx tmp7 = gen_reg_rtx (V8HImode);
6693 emit_insn (gen_sse4_1_pblendw (tmp7, tmp5, tmp6,
6694 GEN_INT (bits == 48 ? 0x1f : 0x07)));
6695
6696 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp7));
6697 return;
6698 }
6699
6700 if ((bits & 7) == 0)
6701 {
6702 /* Five operations. */
6703 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6704 rtx tmp2 = gen_reg_rtx (V4SImode);
6705 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6706
6707 rtx tmp3 = gen_reg_rtx (V4SImode);
6708 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6709
6710 rtx tmp4 = gen_reg_rtx (V1TImode);
6711 emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (bits)));
6712
6713 rtx tmp5 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
6714 rtx tmp6 = gen_reg_rtx (V1TImode);
6715 emit_insn (gen_sse2_ashlv1ti3 (tmp6, tmp5, GEN_INT (128 - bits)));
6716
6717 rtx tmp7 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
6718 rtx tmp8 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp6));
6719 rtx tmp9 = gen_reg_rtx (V2DImode);
6720 emit_insn (gen_iorv2di3 (tmp9, tmp7, tmp8));
6721
6722 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp9));
6723 return;
6724 }
6725
6726 if (TARGET_AVX2 && bits < 32)
6727 {
6728 /* Six operations. */
6729 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6730 rtx tmp2 = gen_reg_rtx (V4SImode);
6731 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
6732
6733 rtx tmp3 = gen_reg_rtx (V1TImode);
6734 emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (64)));
6735
6736 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
6737 rtx tmp5 = gen_reg_rtx (V2DImode);
6738 emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
6739
6740 rtx tmp6 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
6741 rtx tmp7 = gen_reg_rtx (V2DImode);
6742 emit_insn (gen_ashlv2di3 (tmp7, tmp6, GEN_INT (64 - bits)));
6743
6744 rtx tmp8 = gen_reg_rtx (V2DImode);
6745 emit_insn (gen_iorv2di3 (tmp8, tmp5, tmp7));
6746
6747 rtx tmp9 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp8));
6748 rtx tmp10 = gen_reg_rtx (V4SImode);
6749 emit_insn (gen_avx2_pblenddv4si (tmp10, tmp2, tmp9, GEN_INT (7)));
6750
6751 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp10));
6752 return;
6753 }
6754
6755 if (TARGET_SSE4_1 && bits < 15)
6756 {
6757 /* Six operations. */
6758 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6759 rtx tmp2 = gen_reg_rtx (V4SImode);
6760 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
6761
6762 rtx tmp3 = gen_reg_rtx (V1TImode);
6763 emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (64)));
6764
6765 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
6766 rtx tmp5 = gen_reg_rtx (V2DImode);
6767 emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
6768
6769 rtx tmp6 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
6770 rtx tmp7 = gen_reg_rtx (V2DImode);
6771 emit_insn (gen_ashlv2di3 (tmp7, tmp6, GEN_INT (64 - bits)));
6772
6773 rtx tmp8 = gen_reg_rtx (V2DImode);
6774 emit_insn (gen_iorv2di3 (tmp8, tmp5, tmp7));
6775
6776 rtx tmp9 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
6777 rtx tmp10 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp8));
6778 rtx tmp11 = gen_reg_rtx (V8HImode);
6779 emit_insn (gen_sse4_1_pblendw (tmp11, tmp9, tmp10, GEN_INT (0x3f)));
6780
6781 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp11));
6782 return;
6783 }
6784
6785 if (bits == 1)
6786 {
6787 /* Eight operations. */
6788 rtx tmp1 = gen_reg_rtx (V1TImode);
6789 emit_insn (gen_sse2_lshrv1ti3 (tmp1, op1, GEN_INT (64)));
6790
6791 rtx tmp2 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
6792 rtx tmp3 = gen_reg_rtx (V2DImode);
6793 emit_insn (gen_lshrv2di3 (tmp3, tmp2, GEN_INT (1)));
6794
6795 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
6796 rtx tmp5 = gen_reg_rtx (V2DImode);
6797 emit_insn (gen_ashlv2di3 (tmp5, tmp4, GEN_INT (63)));
6798
6799 rtx tmp6 = gen_reg_rtx (V2DImode);
6800 emit_insn (gen_iorv2di3 (tmp6, tmp3, tmp5));
6801
6802 rtx tmp7 = gen_reg_rtx (V2DImode);
6803 emit_insn (gen_lshrv2di3 (tmp7, tmp2, GEN_INT (63)));
6804
6805 rtx tmp8 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp7));
6806 rtx tmp9 = gen_reg_rtx (V4SImode);
6807 emit_insn (gen_sse2_pshufd (tmp9, tmp8, GEN_INT (0xbf)));
6808
6809 rtx tmp10 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp9));
6810 rtx tmp11 = gen_reg_rtx (V2DImode);
6811 emit_insn (gen_ashlv2di3 (tmp11, tmp10, GEN_INT (31)));
6812
6813 rtx tmp12 = gen_reg_rtx (V2DImode);
6814 emit_insn (gen_iorv2di3 (tmp12, tmp6, tmp11));
6815
6816 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp12));
6817 return;
6818 }
6819
6820 if (bits > 64)
6821 {
6822 /* Eight operations. */
6823 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6824 rtx tmp2 = gen_reg_rtx (V4SImode);
6825 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6826
6827 rtx tmp3 = gen_reg_rtx (V4SImode);
6828 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6829
6830 rtx tmp4 = gen_reg_rtx (V1TImode);
6831 emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (64)));
6832
6833 rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
6834 rtx tmp6 = gen_reg_rtx (V2DImode);
6835 emit_insn (gen_lshrv2di3 (tmp6, tmp5, GEN_INT (bits - 64)));
6836
6837 rtx tmp7 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
6838 rtx tmp8 = gen_reg_rtx (V1TImode);
6839 emit_insn (gen_sse2_ashlv1ti3 (tmp8, tmp7, GEN_INT (64)));
6840
6841 rtx tmp9 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
6842 rtx tmp10 = gen_reg_rtx (V2DImode);
6843 emit_insn (gen_ashlv2di3 (tmp10, tmp9, GEN_INT (128 - bits)));
6844
6845 rtx tmp11 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp8));
6846 rtx tmp12 = gen_reg_rtx (V2DImode);
6847 emit_insn (gen_iorv2di3 (tmp12, tmp10, tmp11));
6848
6849 rtx tmp13 = gen_reg_rtx (V2DImode);
6850 emit_insn (gen_iorv2di3 (tmp13, tmp6, tmp12));
6851
6852 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp13));
6853 }
6854 else
6855 {
6856 /* Nine operations. */
6857 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6858 rtx tmp2 = gen_reg_rtx (V4SImode);
6859 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6860
6861 rtx tmp3 = gen_reg_rtx (V4SImode);
6862 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6863
6864 rtx tmp4 = gen_reg_rtx (V1TImode);
6865 emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (64)));
6866
6867 rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
6868 rtx tmp6 = gen_reg_rtx (V2DImode);
6869 emit_insn (gen_lshrv2di3 (tmp6, tmp5, GEN_INT (bits)));
6870
6871 rtx tmp7 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
6872 rtx tmp8 = gen_reg_rtx (V2DImode);
6873 emit_insn (gen_ashlv2di3 (tmp8, tmp7, GEN_INT (64 - bits)));
6874
6875 rtx tmp9 = gen_reg_rtx (V2DImode);
6876 emit_insn (gen_iorv2di3 (tmp9, tmp6, tmp8));
6877
6878 rtx tmp10 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
6879 rtx tmp11 = gen_reg_rtx (V1TImode);
6880 emit_insn (gen_sse2_ashlv1ti3 (tmp11, tmp10, GEN_INT (64)));
6881
6882 rtx tmp12 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp11));
6883 rtx tmp13 = gen_reg_rtx (V2DImode);
6884 emit_insn (gen_ashlv2di3 (tmp13, tmp12, GEN_INT (64 - bits)));
6885
6886 rtx tmp14 = gen_reg_rtx (V2DImode);
6887 emit_insn (gen_iorv2di3 (tmp14, tmp9, tmp13));
6888
6889 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp14));
6890 }
6891 }
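
/* For example, with the expansion above an arithmetic V1TImode shift right
by 127 just smears the sign bit: "pshufd $0xff" replicates the top dword and
"psrad $31" turns it into all-zeros or all-ones. */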
6892
6893 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
6894 DImode for constant loop counts. */
6895
6896 static machine_mode
6897 counter_mode (rtx count_exp)
6898 {
6899 if (GET_MODE (count_exp) != VOIDmode)
6900 return GET_MODE (count_exp);
6901 if (!CONST_INT_P (count_exp))
6902 return Pmode;
6903 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
6904 return DImode;
6905 return SImode;
6906 }
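
/* For example, counter_mode above yields DImode for a constant count of
0x100000000 on 64-bit targets, SImode for a constant count of 1000, and the
register's own mode when the count is already in a register. */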
6907
6908 /* When ISSETMEM is FALSE, output a simple loop to move the memory pointed to
6909 by SRCPTR to DESTPTR in chunks of MODE unrolled UNROLL times; the overall
6910 size is COUNT, specified in bytes. When ISSETMEM is TRUE, output the
6911 equivalent loop to set memory to VALUE (supposed to be in MODE).
6912
6913 The size is rounded down to a whole number of chunks moved at once.
6914 SRCMEM and DESTMEM provide MEM rtxen to feed proper aliasing info. */
6915
6916
6917 static void
6918 expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
6919 rtx destptr, rtx srcptr, rtx value,
6920 rtx count, machine_mode mode, int unroll,
6921 int expected_size, bool issetmem)
6922 {
6923 rtx_code_label *out_label, *top_label;
6924 rtx iter, tmp;
6925 machine_mode iter_mode = counter_mode (count);
6926 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
6927 rtx piece_size = GEN_INT (piece_size_n);
6928 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
6929 rtx size;
6930 int i;
6931
6932 top_label = gen_label_rtx ();
6933 out_label = gen_label_rtx ();
6934 iter = gen_reg_rtx (iter_mode);
6935
6936 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
6937 NULL, 1, OPTAB_DIRECT);
6938 /* Those two should combine. */
6939 if (piece_size == const1_rtx)
6940 {
6941 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
6942 true, out_label);
6943 predict_jump (REG_BR_PROB_BASE * 10 / 100);
6944 }
6945 emit_move_insn (iter, const0_rtx);
6946
6947 emit_label (top_label);
6948
6949 tmp = convert_modes (Pmode, iter_mode, iter, true);
6950
6951 /* This assert could be relaxed - in that case we'd need to compute
6952 the smallest power of two containing PIECE_SIZE_N and pass it to
6953 offset_address. */
6954 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
6955 destmem = offset_address (destmem, tmp, piece_size_n);
6956 destmem = adjust_address (destmem, mode, 0);
6957
6958 if (!issetmem)
6959 {
6960 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
6961 srcmem = adjust_address (srcmem, mode, 0);
6962
6963 /* When unrolling for chips that reorder memory reads and writes,
6964 we can save registers by using a single temporary.
6965 Using 4 temporaries is also overkill in 32-bit mode. */
6966 if (!TARGET_64BIT && 0)
6967 {
6968 for (i = 0; i < unroll; i++)
6969 {
6970 if (i)
6971 {
6972 destmem = adjust_address (copy_rtx (destmem), mode,
6973 GET_MODE_SIZE (mode));
6974 srcmem = adjust_address (copy_rtx (srcmem), mode,
6975 GET_MODE_SIZE (mode));
6976 }
6977 emit_move_insn (destmem, srcmem);
6978 }
6979 }
6980 else
6981 {
6982 rtx tmpreg[4];
6983 gcc_assert (unroll <= 4);
6984 for (i = 0; i < unroll; i++)
6985 {
6986 tmpreg[i] = gen_reg_rtx (mode);
6987 if (i)
6988 srcmem = adjust_address (copy_rtx (srcmem), mode,
6989 GET_MODE_SIZE (mode));
6990 emit_move_insn (tmpreg[i], srcmem);
6991 }
6992 for (i = 0; i < unroll; i++)
6993 {
6994 if (i)
6995 destmem = adjust_address (copy_rtx (destmem), mode,
6996 GET_MODE_SIZE (mode));
6997 emit_move_insn (destmem, tmpreg[i]);
6998 }
6999 }
7000 }
7001 else
7002 for (i = 0; i < unroll; i++)
7003 {
7004 if (i)
7005 destmem = adjust_address (copy_rtx (destmem), mode,
7006 GET_MODE_SIZE (mode));
7007 emit_move_insn (destmem, value);
7008 }
7009
7010 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
7011 true, OPTAB_LIB_WIDEN);
7012 if (tmp != iter)
7013 emit_move_insn (iter, tmp);
7014
7015 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
7016 true, top_label);
7017 if (expected_size != -1)
7018 {
7019 expected_size /= GET_MODE_SIZE (mode) * unroll;
7020 if (expected_size == 0)
7021 predict_jump (0);
7022 else if (expected_size > REG_BR_PROB_BASE)
7023 predict_jump (REG_BR_PROB_BASE - 1);
7024 else
7025 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2)
7026 / expected_size);
7027 }
7028 else
7029 predict_jump (REG_BR_PROB_BASE * 80 / 100);
7030 iter = ix86_zero_extend_to_Pmode (iter);
7031 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
7032 true, OPTAB_LIB_WIDEN);
7033 if (tmp != destptr)
7034 emit_move_insn (destptr, tmp);
7035 if (!issetmem)
7036 {
7037 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
7038 true, OPTAB_LIB_WIDEN);
7039 if (tmp != srcptr)
7040 emit_move_insn (srcptr, tmp);
7041 }
7042 emit_label (out_label);
7043 }
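
/* As a sketch, for a copy with MODE == SImode and UNROLL == 4 the loop
emitted above corresponds roughly to:

size = count & ~15;
iter = 0;
do
{
load four SImode pieces from src + iter into temporaries;
store them to dest + iter;
iter += 16;
}
while (iter < size);
destptr += iter;
srcptr += iter; */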
7044
7045 /* Divide COUNTREG by SCALE. */
7046 static rtx
7047 scale_counter (rtx countreg, int scale)
7048 {
7049 rtx sc;
7050
7051 if (scale == 1)
7052 return countreg;
7053 if (CONST_INT_P (countreg))
7054 return GEN_INT (INTVAL (countreg) / scale);
7055 gcc_assert (REG_P (countreg));
7056
7057 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
7058 GEN_INT (exact_log2 (scale)),
7059 NULL, 1, OPTAB_DIRECT);
7060 return sc;
7061 }
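
/* For example, scale_counter above emits a single logical right shift by 2
when scaling a register count by 4, while a constant count of 25 scaled by 8
simply folds to the constant 3. */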
7062
7063 /* Output a "rep; mov" or "rep; stos" instruction depending on the ISSETMEM
7064 argument. When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
7065 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
7066 For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
7067 ORIG_VALUE is the original value passed to memset to fill the memory with.
7068 Other arguments have the same meaning as for the previous function. */
7069
7070 static void
7071 expand_set_or_cpymem_via_rep (rtx destmem, rtx srcmem,
7072 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
7073 rtx count,
7074 machine_mode mode, bool issetmem)
7075 {
7076 rtx destexp;
7077 rtx srcexp;
7078 rtx countreg;
7079 HOST_WIDE_INT rounded_count;
7080
7081 /* If possible, it is shorter to use rep movs.
7082 TODO: Maybe it is better to move this logic to decide_alg. */
7083 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
7084 && !TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
7085 && (!issetmem || orig_value == const0_rtx))
7086 mode = SImode;
7087
7088 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
7089 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
7090
7091 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
7092 GET_MODE_SIZE (mode)));
7093 if (mode != QImode)
7094 {
7095 destexp = gen_rtx_ASHIFT (Pmode, countreg,
7096 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
7097 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
7098 }
7099 else
7100 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
7101 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
7102 {
7103 rounded_count
7104 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
7105 destmem = shallow_copy_rtx (destmem);
7106 set_mem_size (destmem, rounded_count);
7107 }
7108 else if (MEM_SIZE_KNOWN_P (destmem))
7109 clear_mem_size (destmem);
7110
7111 if (issetmem)
7112 {
7113 value = force_reg (mode, gen_lowpart (mode, value));
7114 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
7115 }
7116 else
7117 {
7118 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
7119 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
7120 if (mode != QImode)
7121 {
7122 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
7123 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
7124 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
7125 }
7126 else
7127 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
7128 if (CONST_INT_P (count))
7129 {
7130 rounded_count
7131 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
7132 srcmem = shallow_copy_rtx (srcmem);
7133 set_mem_size (srcmem, rounded_count);
7134 }
7135 else
7136 {
7137 if (MEM_SIZE_KNOWN_P (srcmem))
7138 clear_mem_size (srcmem);
7139 }
7140 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
7141 destexp, srcexp));
7142 }
7143 }
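
/* For example, clearing a constant, multiple-of-4 number of bytes with zero
typically becomes "rep stosd" with the count pre-scaled by 4 (thanks to the
QImode -> SImode promotion above), while a byte-mode request stays as
"rep stosb" / "rep movsb" with the plain byte count. */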
7144
7145 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
7146 DESTMEM.
7147 SRCMEM is passed by pointer so it can be updated on return.
7148 The return value is the updated destination MEM. */
7149 static rtx
7150 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
7151 HOST_WIDE_INT size_to_move)
7152 {
7153 rtx dst = destmem, src = *srcmem, tempreg;
7154 enum insn_code code;
7155 machine_mode move_mode;
7156 int piece_size, i;
7157
7158 /* Find the widest mode in which we could perform moves.
7159 Start with the biggest power of 2 less than SIZE_TO_MOVE and halve
7160 it until a move of that size is supported. */
7161 piece_size = 1 << floor_log2 (size_to_move);
7162 while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
7163 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
7164 {
7165 gcc_assert (piece_size > 1);
7166 piece_size >>= 1;
7167 }
7168
7169 /* Find the corresponding vector mode with the same size as MOVE_MODE.
7170 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
7171 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
7172 {
7173 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
7174 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
7175 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
7176 {
7177 move_mode = word_mode;
7178 piece_size = GET_MODE_SIZE (move_mode);
7179 code = optab_handler (mov_optab, move_mode);
7180 }
7181 }
7182 gcc_assert (code != CODE_FOR_nothing);
7183
7184 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
7185 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
7186
7187 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
7188 gcc_assert (size_to_move % piece_size == 0);
7189
7190 for (i = 0; i < size_to_move; i += piece_size)
7191 {
7192 /* We move from memory to memory, so we'll need to do it via
7193 a temporary register. */
7194 tempreg = gen_reg_rtx (move_mode);
7195 emit_insn (GEN_FCN (code) (tempreg, src));
7196 emit_insn (GEN_FCN (code) (dst, tempreg));
7197
7198 emit_move_insn (destptr,
7199 plus_constant (Pmode, copy_rtx (destptr), piece_size));
7200 emit_move_insn (srcptr,
7201 plus_constant (Pmode, copy_rtx (srcptr), piece_size));
7202
7203 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
7204 piece_size);
7205 src = adjust_automodify_address_nv (src, move_mode, srcptr,
7206 piece_size);
7207 }
7208
7209 /* Update DST and SRC rtx. */
7210 *srcmem = src;
7211 return dst;
7212 }
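
/* For instance, emit_memmov above performs a 16-byte SIZE_TO_MOVE on x86-64
with SSE2 as a single V2DImode load into a temporary followed by a V2DImode
store, with DESTPTR and SRCPTR advanced by 16 afterwards. */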
7213
7214 /* Helper function for the string operations below. Test whether
7215 VARIABLE & VALUE is zero; if it is, jump to the returned label. */
7216
7217 static rtx_code_label *
7218 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
7219 {
7220 rtx_code_label *label = gen_label_rtx ();
7221 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
7222 if (GET_MODE (variable) == DImode)
7223 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
7224 else
7225 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
7226 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
7227 1, label);
7228 if (epilogue)
7229 predict_jump (REG_BR_PROB_BASE * 50 / 100);
7230 else
7231 predict_jump (REG_BR_PROB_BASE * 90 / 100);
7232 return label;
7233 }
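
/* For example, ix86_expand_aligntest (count, 4, true) emits code equivalent
to "test $4, count; je <label>", so the instructions the caller emits before
the returned label only run when that bit is set; EPILOGUE merely selects a
different branch-probability hint. */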
7234
7235
7236 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
7237
7238 static void
7239 expand_cpymem_epilogue (rtx destmem, rtx srcmem,
7240 rtx destptr, rtx srcptr, rtx count, int max_size)
7241 {
7242 rtx src, dest;
7243 if (CONST_INT_P (count))
7244 {
7245 HOST_WIDE_INT countval = INTVAL (count);
7246 HOST_WIDE_INT epilogue_size = countval % max_size;
7247 int i;
7248
7249 /* For now MAX_SIZE should be a power of 2. This assert could be
7250 relaxed, but it'll require a bit more complicated epilogue
7251 expansion. */
7252 gcc_assert ((max_size & (max_size - 1)) == 0);
7253 for (i = max_size; i >= 1; i >>= 1)
7254 {
7255 if (epilogue_size & i)
7256 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
7257 }
7258 return;
7259 }
7260 if (max_size > 8)
7261 {
7262 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
7263 count, 1, OPTAB_DIRECT);
7264 expand_set_or_cpymem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
7265 count, QImode, 1, 4, false);
7266 return;
7267 }
7268
7269 /* When single stringops are cheap, we can simply increase the dest and src
7270 pointers. Otherwise we save code size by maintaining an offset (zero is
7271 readily available from the preceding rep operation) and using x86
7272 addressing modes. */
7273 if (TARGET_SINGLE_STRINGOP)
7274 {
7275 if (max_size > 4)
7276 {
7277 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
7278 src = change_address (srcmem, SImode, srcptr);
7279 dest = change_address (destmem, SImode, destptr);
7280 emit_insn (gen_strmov (destptr, dest, srcptr, src));
7281 emit_label (label);
7282 LABEL_NUSES (label) = 1;
7283 }
7284 if (max_size > 2)
7285 {
7286 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
7287 src = change_address (srcmem, HImode, srcptr);
7288 dest = change_address (destmem, HImode, destptr);
7289 emit_insn (gen_strmov (destptr, dest, srcptr, src));
7290 emit_label (label);
7291 LABEL_NUSES (label) = 1;
7292 }
7293 if (max_size > 1)
7294 {
7295 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
7296 src = change_address (srcmem, QImode, srcptr);
7297 dest = change_address (destmem, QImode, destptr);
7298 emit_insn (gen_strmov (destptr, dest, srcptr, src));
7299 emit_label (label);
7300 LABEL_NUSES (label) = 1;
7301 }
7302 }
7303 else
7304 {
7305 rtx offset = force_reg (Pmode, const0_rtx);
7306 rtx tmp;
7307
7308 if (max_size > 4)
7309 {
7310 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
7311 src = change_address (srcmem, SImode, srcptr);
7312 dest = change_address (destmem, SImode, destptr);
7313 emit_move_insn (dest, src);
7314 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
7315 true, OPTAB_LIB_WIDEN);
7316 if (tmp != offset)
7317 emit_move_insn (offset, tmp);
7318 emit_label (label);
7319 LABEL_NUSES (label) = 1;
7320 }
7321 if (max_size > 2)
7322 {
7323 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
7324 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
7325 src = change_address (srcmem, HImode, tmp);
7326 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
7327 dest = change_address (destmem, HImode, tmp);
7328 emit_move_insn (dest, src);
7329 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
7330 true, OPTAB_LIB_WIDEN);
7331 if (tmp != offset)
7332 emit_move_insn (offset, tmp);
7333 emit_label (label);
7334 LABEL_NUSES (label) = 1;
7335 }
7336 if (max_size > 1)
7337 {
7338 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
7339 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
7340 src = change_address (srcmem, QImode, tmp);
7341 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
7342 dest = change_address (destmem, QImode, tmp);
7343 emit_move_insn (dest, src);
7344 emit_label (label);
7345 LABEL_NUSES (label) = 1;
7346 }
7347 }
7348 }
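
/* For example, with a constant COUNT and MAX_SIZE == 16, the epilogue above
copies a residue of 11 bytes as one 8-byte, one 2-byte and one 1-byte move
(11 = 8 + 2 + 1). */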
7349
7350 /* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
7351 with value PROMOTED_VAL.
7352 DESTPTR is advanced as the stores are emitted.
7353 The return value is the updated destination MEM. */
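/* For instance, filling 4 bytes from a DImode PROMOTED_VAL truncates the
value with gen_lowpart and emits a single SImode strset, while pieces wider
than a word (a vector PROMOTED_VAL) are stored with an explicit move
followed by a manual DESTPTR update. */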
7354 static rtx
7355 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
7356 HOST_WIDE_INT size_to_move)
7357 {
7358 rtx dst = destmem;
7359 enum insn_code code;
7360 machine_mode move_mode;
7361 int piece_size, i;
7362
7363 /* Find the widest mode in which we could perform moves.
7364 Start with the biggest power of 2 less than SIZE_TO_MOVE and half
7365 it until move of such size is supported. */
7366 move_mode = GET_MODE (promoted_val);
7367 if (move_mode == VOIDmode)
7368 move_mode = QImode;
7369 if (size_to_move < GET_MODE_SIZE (move_mode))
7370 {
7371 unsigned int move_bits = size_to_move * BITS_PER_UNIT;
7372 move_mode = int_mode_for_size (move_bits, 0).require ();
7373 promoted_val = gen_lowpart (move_mode, promoted_val);
7374 }
7375 piece_size = GET_MODE_SIZE (move_mode);
7376 code = optab_handler (mov_optab, move_mode);
7377 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
7378
7379 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
7380
7381 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
7382 gcc_assert (size_to_move % piece_size == 0);
7383
7384 for (i = 0; i < size_to_move; i += piece_size)
7385 {
7386 if (piece_size <= GET_MODE_SIZE (word_mode))
7387 {
7388 emit_insn (gen_strset (destptr, dst, promoted_val));
7389 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
7390 piece_size);
7391 continue;
7392 }
7393
7394 emit_insn (GEN_FCN (code) (dst, promoted_val));
7395
7396 emit_move_insn (destptr,
7397 plus_constant (Pmode, copy_rtx (destptr), piece_size));
7398
7399 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
7400 piece_size);
7401 }
7402
7403 /* Update DST rtx. */
7404 return dst;
7405 }
7406 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
7407 static void
7408 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
7409 rtx count, int max_size)
7410 {
7411 count = expand_simple_binop (counter_mode (count), AND, count,
7412 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
7413 expand_set_or_cpymem_via_loop (destmem, NULL, destptr, NULL,
7414 gen_lowpart (QImode, value), count, QImode,
7415 1, max_size / 2, true);
7416 }
7417
7418 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
7419 static void
7420 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
7421 rtx count, int max_size)
7422 {
7423 rtx dest;
7424
7425 if (CONST_INT_P (count))
7426 {
7427 HOST_WIDE_INT countval = INTVAL (count);
7428 HOST_WIDE_INT epilogue_size = countval % max_size;
7429 int i;
7430
7431 /* For now MAX_SIZE should be a power of 2. This assert could be
7432 relaxed, but it'll require a bit more complicated epilogue
7433 expansion. */
7434 gcc_assert ((max_size & (max_size - 1)) == 0);
7435 for (i = max_size; i >= 1; i >>= 1)
7436 {
7437 if (epilogue_size & i)
7438 {
7439 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
7440 destmem = emit_memset (destmem, destptr, vec_value, i);
7441 else
7442 destmem = emit_memset (destmem, destptr, value, i);
7443 }
7444 }
7445 return;
7446 }
7447 if (max_size > 32)
7448 {
7449 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
7450 return;
7451 }
7452 if (max_size > 16)
7453 {
7454 rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
7455 if (TARGET_64BIT)
7456 {
7457 dest = change_address (destmem, DImode, destptr);
7458 emit_insn (gen_strset (destptr, dest, value));
7459 dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
7460 emit_insn (gen_strset (destptr, dest, value));
7461 }
7462 else
7463 {
7464 dest = change_address (destmem, SImode, destptr);
7465 emit_insn (gen_strset (destptr, dest, value));
7466 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
7467 emit_insn (gen_strset (destptr, dest, value));
7468 dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
7469 emit_insn (gen_strset (destptr, dest, value));
7470 dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
7471 emit_insn (gen_strset (destptr, dest, value));
7472 }
7473 emit_label (label);
7474 LABEL_NUSES (label) = 1;
7475 }
7476 if (max_size > 8)
7477 {
7478 rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
7479 if (TARGET_64BIT)
7480 {
7481 dest = change_address (destmem, DImode, destptr);
7482 emit_insn (gen_strset (destptr, dest, value));
7483 }
7484 else
7485 {
7486 dest = change_address (destmem, SImode, destptr);
7487 emit_insn (gen_strset (destptr, dest, value));
7488 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
7489 emit_insn (gen_strset (destptr, dest, value));
7490 }
7491 emit_label (label);
7492 LABEL_NUSES (label) = 1;
7493 }
7494 if (max_size > 4)
7495 {
7496 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
7497 dest = change_address (destmem, SImode, destptr);
7498 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
7499 emit_label (label);
7500 LABEL_NUSES (label) = 1;
7501 }
7502 if (max_size > 2)
7503 {
7504 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
7505 dest = change_address (destmem, HImode, destptr);
7506 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
7507 emit_label (label);
7508 LABEL_NUSES (label) = 1;
7509 }
7510 if (max_size > 1)
7511 {
7512 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
7513 dest = change_address (destmem, QImode, destptr);
7514 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
7515 emit_label (label);
7516 LABEL_NUSES (label) = 1;
7517 }
7518 }
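
/* For example, with a non-constant COUNT and MAX_SIZE == 8 the code above
tests COUNT & 4, COUNT & 2 and COUNT & 1 in turn and conditionally stores 4,
2 and 1 bytes respectively. */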
7519
7520 /* Decrease COUNTREG by VALUE. */
7521 static void
7522 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
7523 {
7524 emit_insn (gen_add2_insn (countreg, GEN_INT (-value)));
7525 }
7526
7527 /* Depending on ISSETMEM, copy enough bytes from SRCMEM to DESTMEM, or store
7528 enough bytes into DESTMEM, to align it to DESIRED_ALIGNMENT. The original
7529 alignment is ALIGN. Depending on ISSETMEM, either SRCMEM/SRCPTR or
7530 VALUE/VEC_VALUE are ignored.
7531 The return value is the updated DESTMEM. */
7532
7533 static rtx
7534 expand_set_or_cpymem_prologue (rtx destmem, rtx srcmem,
7535 rtx destptr, rtx srcptr, rtx value,
7536 rtx vec_value, rtx count, int align,
7537 int desired_alignment, bool issetmem)
7538 {
7539 int i;
7540 for (i = 1; i < desired_alignment; i <<= 1)
7541 {
7542 if (align <= i)
7543 {
7544 rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
7545 if (issetmem)
7546 {
7547 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
7548 destmem = emit_memset (destmem, destptr, vec_value, i);
7549 else
7550 destmem = emit_memset (destmem, destptr, value, i);
7551 }
7552 else
7553 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
7554 ix86_adjust_counter (count, i);
7555 emit_label (label);
7556 LABEL_NUSES (label) = 1;
7557 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
7558 }
7559 }
7560 return destmem;
7561 }
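
/* For example, with ALIGN == 1 and DESIRED_ALIGNMENT == 8 the loop above
emits three conditional blocks handling 1, 2 and 4 bytes, each guarded by a
test of the corresponding bit of DESTPTR, and decreases COUNT by the number
of bytes actually copied or stored. */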
7562
7563 /* Test if COUNT & SIZE is nonzero and if so, expand a cpymem
7564 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
7565 and jump to DONE_LABEL. */
7566 static void
7567 expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem,
7568 rtx destptr, rtx srcptr,
7569 rtx value, rtx vec_value,
7570 rtx count, int size,
7571 rtx done_label, bool issetmem)
7572 {
7573 rtx_code_label *label = ix86_expand_aligntest (count, size, false);
7574 machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
7575 rtx modesize;
7576 int n;
7577
7578 /* If we do not have a vector value to copy, we must reduce the size. */
7579 if (issetmem)
7580 {
7581 if (!vec_value)
7582 {
7583 if (GET_MODE (value) == VOIDmode && size > 8)
7584 mode = Pmode;
7585 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
7586 mode = GET_MODE (value);
7587 }
7588 else
7589 mode = GET_MODE (vec_value), value = vec_value;
7590 }
7591 else
7592 {
7593 /* Choose appropriate vector mode. */
7594 if (size >= 32)
7595 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
7596 else if (size >= 16)
7597 mode = TARGET_SSE ? V16QImode : DImode;
7598 srcmem = change_address (srcmem, mode, srcptr);
7599 }
7600 destmem = change_address (destmem, mode, destptr);
7601 modesize = GEN_INT (GET_MODE_SIZE (mode));
7602 gcc_assert (GET_MODE_SIZE (mode) <= size);
7603 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
7604 {
7605 if (issetmem)
7606 emit_move_insn (destmem, gen_lowpart (mode, value));
7607 else
7608 {
7609 emit_move_insn (destmem, srcmem);
7610 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
7611 }
7612 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
7613 }
7614
7615 destmem = offset_address (destmem, count, 1);
7616 destmem = offset_address (destmem, GEN_INT (-2 * size),
7617 GET_MODE_SIZE (mode));
7618 if (!issetmem)
7619 {
7620 srcmem = offset_address (srcmem, count, 1);
7621 srcmem = offset_address (srcmem, GEN_INT (-2 * size),
7622 GET_MODE_SIZE (mode));
7623 }
7624 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
7625 {
7626 if (issetmem)
7627 emit_move_insn (destmem, gen_lowpart (mode, value));
7628 else
7629 {
7630 emit_move_insn (destmem, srcmem);
7631 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
7632 }
7633 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
7634 }
7635 emit_jump_insn (gen_jump (done_label));
7636 emit_barrier ();
7637
7638 emit_label (label);
7639 LABEL_NUSES (label) = 1;
7640 }
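
/* For example, for a copy with SIZE == 4 the code above (reached when
COUNT & 4 is nonzero) copies 4 bytes from the start of the block and 4 bytes
ending at SRCPTR + COUNT; together these cover any length in the
SIZE..2*SIZE-1 range before jumping to DONE_LABEL. */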
7641
7642 /* Handle a small memcpy (up to SIZE, which is supposed to be a small power
7643 of 2) and get ready for the main memcpy loop by copying the initial
7644 DESIRED_ALIGN-ALIGN bytes and the last SIZE bytes, adjusting
7645 DESTPTR/SRCPTR/COUNT so we can proceed with a loop copying SIZE bytes at
7646 once. Do the moves in MODE. DONE_LABEL is a label after the whole copying
7647 sequence. The label is created on demand if *DONE_LABEL is NULL.
7648 MIN_SIZE is the minimal size of the copied block. This value gets adjusted
7649 for the new bounds after the initial copies.
7650
7651 DESTMEM/SRCMEM are memory expressions pointing to the copied block,
7652 DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicates whether
7653 we will dispatch to a library call for large blocks.
7654
7655 In pseudocode we do:
7656
7657 if (COUNT < SIZE)
7658 {
7659 Assume that SIZE is 4. Bigger sizes are handled analogously
7660 if (COUNT & 4)
7661 {
7662 copy 4 bytes from SRCPTR to DESTPTR
7663 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
7664 goto done_label
7665 }
7666 if (!COUNT)
7667 goto done_label;
7668 copy 1 byte from SRCPTR to DESTPTR
7669 if (COUNT & 2)
7670 {
7671 copy 2 bytes from SRCPTR to DESTPTR
7672 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
7673 }
7674 }
7675 else
7676 {
7677 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
7678 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
7679
7680 OLD_DESTPTR = DESTPTR;
7681 Align DESTPTR up to DESIRED_ALIGN
7682 SRCPTR += DESTPTR - OLD_DESTPTR
7683 COUNT -= DESTPTR - OLD_DESTPTR
7684 if (DYNAMIC_CHECK)
7685 Round COUNT down to multiple of SIZE
7686 << optional caller supplied zero size guard is here >>
7687 << optional caller supplied dynamic check is here >>
7688 << caller supplied main copy loop is here >>
7689 }
7690 done_label:
7691 */
7692 static void
7693 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
7694 rtx *destptr, rtx *srcptr,
7695 machine_mode mode,
7696 rtx value, rtx vec_value,
7697 rtx *count,
7698 rtx_code_label **done_label,
7699 int size,
7700 int desired_align,
7701 int align,
7702 unsigned HOST_WIDE_INT *min_size,
7703 bool dynamic_check,
7704 bool issetmem)
7705 {
7706 rtx_code_label *loop_label = NULL, *label;
7707 int n;
7708 rtx modesize;
7709 int prolog_size = 0;
7710 rtx mode_value;
7711
7712 /* Choose the proper value to copy. */
7713 if (issetmem && VECTOR_MODE_P (mode))
7714 mode_value = vec_value;
7715 else
7716 mode_value = value;
7717 gcc_assert (GET_MODE_SIZE (mode) <= size);
7718
7719 /* See if block is big or small, handle small blocks. */
7720 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
7721 {
7722 int size2 = size;
7723 loop_label = gen_label_rtx ();
7724
7725 if (!*done_label)
7726 *done_label = gen_label_rtx ();
7727
7728 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
7729 1, loop_label);
7730 size2 >>= 1;
7731
7732 /* Handle sizes > 3. */
7733 for (;size2 > 2; size2 >>= 1)
7734 expand_small_cpymem_or_setmem (destmem, srcmem,
7735 *destptr, *srcptr,
7736 value, vec_value,
7737 *count,
7738 size2, *done_label, issetmem);
7739 /* Nothing to copy? Jump to DONE_LABEL if so. */
7740 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
7741 1, *done_label);
7742
7743 /* Do a byte copy. */
7744 destmem = change_address (destmem, QImode, *destptr);
7745 if (issetmem)
7746 emit_move_insn (destmem, gen_lowpart (QImode, value));
7747 else
7748 {
7749 srcmem = change_address (srcmem, QImode, *srcptr);
7750 emit_move_insn (destmem, srcmem);
7751 }
7752
7753 /* Handle sizes 2 and 3. */
7754 label = ix86_expand_aligntest (*count, 2, false);
7755 destmem = change_address (destmem, HImode, *destptr);
7756 destmem = offset_address (destmem, *count, 1);
7757 destmem = offset_address (destmem, GEN_INT (-2), 2);
7758 if (issetmem)
7759 emit_move_insn (destmem, gen_lowpart (HImode, value));
7760 else
7761 {
7762 srcmem = change_address (srcmem, HImode, *srcptr);
7763 srcmem = offset_address (srcmem, *count, 1);
7764 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
7765 emit_move_insn (destmem, srcmem);
7766 }
7767
7768 emit_label (label);
7769 LABEL_NUSES (label) = 1;
7770 emit_jump_insn (gen_jump (*done_label));
7771 emit_barrier ();
7772 }
7773 else
7774 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
7775 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
7776
7777 /* Start memcpy for COUNT >= SIZE. */
7778 if (loop_label)
7779 {
7780 emit_label (loop_label);
7781 LABEL_NUSES (loop_label) = 1;
7782 }
7783
7784 /* Copy first desired_align bytes. */
7785 if (!issetmem)
7786 srcmem = change_address (srcmem, mode, *srcptr);
7787 destmem = change_address (destmem, mode, *destptr);
7788 modesize = GEN_INT (GET_MODE_SIZE (mode));
7789 for (n = 0; prolog_size < desired_align - align; n++)
7790 {
7791 if (issetmem)
7792 emit_move_insn (destmem, mode_value);
7793 else
7794 {
7795 emit_move_insn (destmem, srcmem);
7796 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
7797 }
7798 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
7799 prolog_size += GET_MODE_SIZE (mode);
7800 }
7801
7802
7803 /* Copy last SIZE bytes. */
7804 destmem = offset_address (destmem, *count, 1);
7805 destmem = offset_address (destmem,
7806 GEN_INT (-size - prolog_size),
7807 1);
7808 if (issetmem)
7809 emit_move_insn (destmem, mode_value);
7810 else
7811 {
7812 srcmem = offset_address (srcmem, *count, 1);
7813 srcmem = offset_address (srcmem,
7814 GEN_INT (-size - prolog_size),
7815 1);
7816 emit_move_insn (destmem, srcmem);
7817 }
7818 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
7819 {
7820 destmem = offset_address (destmem, modesize, 1);
7821 if (issetmem)
7822 emit_move_insn (destmem, mode_value);
7823 else
7824 {
7825 srcmem = offset_address (srcmem, modesize, 1);
7826 emit_move_insn (destmem, srcmem);
7827 }
7828 }
7829
7830 /* Align destination. */
7831 if (desired_align > 1 && desired_align > align)
7832 {
7833 rtx saveddest = *destptr;
7834
7835 gcc_assert (desired_align <= size);
7836 /* Align destptr up, placing it in a new register. */
7837 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
7838 GEN_INT (prolog_size),
7839 NULL_RTX, 1, OPTAB_DIRECT);
7840 if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
7841 REG_POINTER (*destptr) = 1;
7842 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
7843 GEN_INT (-desired_align),
7844 *destptr, 1, OPTAB_DIRECT);
7845 /* See how many bytes we skipped. */
7846 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
7847 *destptr,
7848 saveddest, 1, OPTAB_DIRECT);
7849 /* Adjust srcptr and count. */
7850 if (!issetmem)
7851 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
7852 saveddest, *srcptr, 1, OPTAB_DIRECT);
7853 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
7854 saveddest, *count, 1, OPTAB_DIRECT);
7855 /* We copied at most size + prolog_size. */
7856 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
7857 *min_size
7858 = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
7859 else
7860 *min_size = 0;
7861
7862 /* Our loops always round down the block size, but for dispatch to the
7863 library we need the precise value. */
7864 if (dynamic_check)
7865 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
7866 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
7867 }
7868 else
7869 {
7870 gcc_assert (prolog_size == 0);
7871 /* Decrease count, so we won't end up copying last word twice. */
7872 if (!CONST_INT_P (*count))
7873 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
7874 constm1_rtx, *count, 1, OPTAB_DIRECT);
7875 else
7876 *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
7877 (unsigned HOST_WIDE_INT)size));
7878 if (*min_size)
7879 *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
7880 }
7881 }
7882
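/* A concrete walk-through of the scheme above (illustrative values, not
   generated code): with SIZE == 16, ALIGN == 1, DESIRED_ALIGN == 16 and a
   block known to be at least 16 bytes, the code first copies bytes
   [0, 16) and bytes [COUNT - 16, COUNT) with possibly misaligned 16-byte
   moves, then rounds DESTPTR up to a 16-byte boundary and adjusts SRCPTR
   and COUNT by the number of bytes skipped.  The caller's main loop then
   copies aligned 16-byte chunks; any overlap with the head and tail
   stored here is harmless, since the same data is written again.  */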
7883
7884 /* This function is like the previous one, except here we know how many bytes
7885 need to be copied. That allows us to update alignment not only of DST, which
7886 is returned, but also of SRC, which is passed as a pointer for that
7887 reason. */
7888 static rtx
7889 expand_set_or_cpymem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
7890 rtx srcreg, rtx value, rtx vec_value,
7891 int desired_align, int align_bytes,
7892 bool issetmem)
7893 {
7894 rtx src = NULL;
7895 rtx orig_dst = dst;
7896 rtx orig_src = NULL;
7897 int piece_size = 1;
7898 int copied_bytes = 0;
7899
7900 if (!issetmem)
7901 {
7902 gcc_assert (srcp != NULL);
7903 src = *srcp;
7904 orig_src = src;
7905 }
7906
7907 for (piece_size = 1;
7908 piece_size <= desired_align && copied_bytes < align_bytes;
7909 piece_size <<= 1)
7910 {
7911 if (align_bytes & piece_size)
7912 {
7913 if (issetmem)
7914 {
7915 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
7916 dst = emit_memset (dst, destreg, vec_value, piece_size);
7917 else
7918 dst = emit_memset (dst, destreg, value, piece_size);
7919 }
7920 else
7921 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
7922 copied_bytes += piece_size;
7923 }
7924 }
7925 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
7926 set_mem_align (dst, desired_align * BITS_PER_UNIT);
7927 if (MEM_SIZE_KNOWN_P (orig_dst))
7928 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
7929
7930 if (!issetmem)
7931 {
7932 int src_align_bytes = get_mem_align_offset (src, desired_align
7933 * BITS_PER_UNIT);
7934 if (src_align_bytes >= 0)
7935 src_align_bytes = desired_align - src_align_bytes;
7936 if (src_align_bytes >= 0)
7937 {
7938 unsigned int src_align;
7939 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
7940 {
7941 if ((src_align_bytes & (src_align - 1))
7942 == (align_bytes & (src_align - 1)))
7943 break;
7944 }
7945 if (src_align > (unsigned int) desired_align)
7946 src_align = desired_align;
7947 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
7948 set_mem_align (src, src_align * BITS_PER_UNIT);
7949 }
7950 if (MEM_SIZE_KNOWN_P (orig_src))
7951 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
7952 *srcp = src;
7953 }
7954
7955 return dst;
7956 }
7957
7958 /* Return true if ALG can be used in current context.
7959 Assume we expand memset if MEMSET is true. */
7960 static bool
7961 alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
7962 {
7963 if (alg == no_stringop)
7964 return false;
7965 if (alg == vector_loop)
7966 return TARGET_SSE || TARGET_AVX;
7967 /* Algorithms using the rep prefix want at least edi and ecx;
7968 additionally, memset wants eax and memcpy wants esi. Don't
7969 consider such algorithms if the user has appropriated those
7970 registers for their own purposes, or if we have a non-default
7971 address space, since some string insns cannot override the segment. */
7972 if (alg == rep_prefix_1_byte
7973 || alg == rep_prefix_4_byte
7974 || alg == rep_prefix_8_byte)
7975 {
7976 if (have_as)
7977 return false;
7978 if (fixed_regs[CX_REG]
7979 || fixed_regs[DI_REG]
7980 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
7981 return false;
7982 }
7983 return true;
7984 }
7985
7986 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
7987 static enum stringop_alg
7988 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
7989 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
7990 bool memset, bool zero_memset, bool have_as,
7991 int *dynamic_check, bool *noalign, bool recur)
7992 {
7993 const struct stringop_algs *algs;
7994 bool optimize_for_speed;
7995 int max = 0;
7996 const struct processor_costs *cost;
7997 int i;
7998 bool any_alg_usable_p = false;
7999
8000 *noalign = false;
8001 *dynamic_check = -1;
8002
8003 /* Even if the string operation call is cold, we still might spend a lot
8004 of time processing large blocks. */
8005 if (optimize_function_for_size_p (cfun)
8006 || (optimize_insn_for_size_p ()
8007 && (max_size < 256
8008 || (expected_size != -1 && expected_size < 256))))
8009 optimize_for_speed = false;
8010 else
8011 optimize_for_speed = true;
8012
8013 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
8014 if (memset)
8015 algs = &cost->memset[TARGET_64BIT != 0];
8016 else
8017 algs = &cost->memcpy[TARGET_64BIT != 0];
8018
8019 /* Find the maximal size covered by a user-defined (non-libcall) algorithm. */
8020 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
8021 {
8022 enum stringop_alg candidate = algs->size[i].alg;
8023 bool usable = alg_usable_p (candidate, memset, have_as);
8024 any_alg_usable_p |= usable;
8025
8026 if (candidate != libcall && candidate && usable)
8027 max = algs->size[i].max;
8028 }
8029
8030 /* If the expected size is not known but the max size is small enough
8031 that the inline version is a win, set the expected size into
8032 the range. */
8033 if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
8034 && expected_size == -1)
8035 expected_size = min_size / 2 + max_size / 2;
8036
8037 /* If user specified the algorithm, honor it if possible. */
8038 if (ix86_stringop_alg != no_stringop
8039 && alg_usable_p (ix86_stringop_alg, memset, have_as))
8040 return ix86_stringop_alg;
8041 /* rep; movq or rep; movl is the smallest variant. */
8042 else if (!optimize_for_speed)
8043 {
8044 *noalign = true;
8045 if (!count || (count & 3) || (memset && !zero_memset))
8046 return alg_usable_p (rep_prefix_1_byte, memset, have_as)
8047 ? rep_prefix_1_byte : loop_1_byte;
8048 else
8049 return alg_usable_p (rep_prefix_4_byte, memset, have_as)
8050 ? rep_prefix_4_byte : loop;
8051 }
8052 /* Very tiny blocks are best handled via the loop; REP is expensive to
8053 set up. */
8054 else if (expected_size != -1 && expected_size < 4)
8055 return loop_1_byte;
8056 else if (expected_size != -1)
8057 {
8058 enum stringop_alg alg = libcall;
8059 bool alg_noalign = false;
8060 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
8061 {
8062 /* We get here if the algorithms that were not libcall-based
8063 were rep-prefix based and we are unable to use rep prefixes
8064 based on global register usage. Break out of the loop and
8065 use the heuristic below. */
8066 if (algs->size[i].max == 0)
8067 break;
8068 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
8069 {
8070 enum stringop_alg candidate = algs->size[i].alg;
8071
8072 if (candidate != libcall
8073 && alg_usable_p (candidate, memset, have_as))
8074 {
8075 alg = candidate;
8076 alg_noalign = algs->size[i].noalign;
8077 }
8078 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
8079 last non-libcall inline algorithm. */
8080 if (TARGET_INLINE_ALL_STRINGOPS)
8081 {
8082 /* When the current size is best copied by a libcall,
8083 but we are still forced to inline, run the heuristic below
8084 that will pick code for medium sized blocks. */
8085 if (alg != libcall)
8086 {
8087 *noalign = alg_noalign;
8088 return alg;
8089 }
8090 else if (!any_alg_usable_p)
8091 break;
8092 }
8093 else if (alg_usable_p (candidate, memset, have_as)
8094 && !(TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
8095 && candidate == rep_prefix_1_byte
8096 /* NB: If min_size != max_size, size is
8097 unknown. */
8098 && min_size != max_size))
8099 {
8100 *noalign = algs->size[i].noalign;
8101 return candidate;
8102 }
8103 }
8104 }
8105 }
8106 /* When asked to inline the call anyway, try to pick a meaningful choice.
8107 We look for the maximal size of block that is faster to copy by hand and
8108 take blocks of at most that size, guessing that the average size will
8109 be roughly half of the block.
8110
8111 If this turns out to be bad, we might simply specify the preferred
8112 choice in ix86_costs. */
8113 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
8114 && (algs->unknown_size == libcall
8115 || !alg_usable_p (algs->unknown_size, memset, have_as)))
8116 {
8117 enum stringop_alg alg;
8118 HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
8119
8120 /* If there aren't any usable algorithms or if recursing already,
8121 then recursing on smaller sizes or same size isn't going to
8122 find anything. Just return the simple byte-at-a-time copy loop. */
8123 if (!any_alg_usable_p || recur)
8124 {
8125 /* Pick something reasonable. */
8126 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
8127 *dynamic_check = 128;
8128 return loop_1_byte;
8129 }
8130 alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
8131 zero_memset, have_as, dynamic_check, noalign, true);
8132 gcc_assert (*dynamic_check == -1);
8133 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
8134 *dynamic_check = max;
8135 else
8136 gcc_assert (alg != libcall);
8137 return alg;
8138 }
8139 return (alg_usable_p (algs->unknown_size, memset, have_as)
8140 ? algs->unknown_size : libcall);
8141 }
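
/* Illustration of the selection above; the table is made up and does not
   come from any real CPU cost table.  Suppose the active stringop_algs
   entries were (max 256 -> loop), (max 8192 -> rep_prefix_4_byte),
   (max -1 -> libcall).  With EXPECTED_SIZE == 1000 the scan picks the
   first usable entry whose max covers the expected size and returns
   rep_prefix_4_byte; a 100-byte block would pick loop instead, and
   anything past 8192 ends up as libcall unless one of the
   TARGET_INLINE_*_STRINGOPS heuristics overrides that.  */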
8142
8143 /* Decide on alignment. We know that the operand is already aligned to ALIGN
8144 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
8145 static int
8146 decide_alignment (int align,
8147 enum stringop_alg alg,
8148 int expected_size,
8149 machine_mode move_mode)
8150 {
8151 int desired_align = 0;
8152
8153 gcc_assert (alg != no_stringop);
8154
8155 if (alg == libcall)
8156 return 0;
8157 if (move_mode == VOIDmode)
8158 return 0;
8159
8160 desired_align = GET_MODE_SIZE (move_mode);
8161 /* PentiumPro has special logic triggering for 8 byte aligned blocks,
8162 copying the whole cache line at once. */
8163 if (TARGET_CPU_P (PENTIUMPRO)
8164 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
8165 desired_align = 8;
8166
8167 if (optimize_size)
8168 desired_align = 1;
8169 if (desired_align < align)
8170 desired_align = align;
8171 if (expected_size != -1 && expected_size < 4)
8172 desired_align = align;
8173
8174 return desired_align;
8175 }
8176
8177
8178 /* Helper function for memset. For the QImode value 0xXY produce
8179 0xXYXYXYXY of the width specified by MODE. This is essentially
8180 a * 0x01010101, but we can do slightly better than
8181 synth_mult by unwinding the sequence by hand on CPUs with
8182 a slow multiply. */
8183 static rtx
8184 promote_duplicated_reg (machine_mode mode, rtx val)
8185 {
8186 machine_mode valmode = GET_MODE (val);
8187 rtx tmp;
8188 int nops = mode == DImode ? 3 : 2;
8189
8190 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
8191 if (val == const0_rtx)
8192 return copy_to_mode_reg (mode, CONST0_RTX (mode));
8193 if (CONST_INT_P (val))
8194 {
8195 HOST_WIDE_INT v = INTVAL (val) & 255;
8196
8197 v |= v << 8;
8198 v |= v << 16;
8199 if (mode == DImode)
8200 v |= (v << 16) << 16;
8201 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
8202 }
8203
8204 if (valmode == VOIDmode)
8205 valmode = QImode;
8206 if (valmode != QImode)
8207 val = gen_lowpart (QImode, val);
8208 if (mode == QImode)
8209 return val;
8210 if (!TARGET_PARTIAL_REG_STALL)
8211 nops--;
8212 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
8213 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
8214 <= (ix86_cost->shift_const + ix86_cost->add) * nops
8215 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
8216 {
8217 rtx reg = convert_modes (mode, QImode, val, true);
8218 tmp = promote_duplicated_reg (mode, const1_rtx);
8219 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
8220 OPTAB_DIRECT);
8221 }
8222 else
8223 {
8224 rtx reg = convert_modes (mode, QImode, val, true);
8225
8226 if (!TARGET_PARTIAL_REG_STALL)
8227 emit_insn (gen_insv_1 (mode, reg, reg));
8228 else
8229 {
8230 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
8231 NULL, 1, OPTAB_DIRECT);
8232 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1,
8233 OPTAB_DIRECT);
8234 }
8235 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
8236 NULL, 1, OPTAB_DIRECT);
8237 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
8238 if (mode == SImode)
8239 return reg;
8240 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
8241 NULL, 1, OPTAB_DIRECT);
8242 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
8243 return reg;
8244 }
8245 }
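
/* Worked example (values chosen for illustration): promoting the constant
   0xAB follows the CONST_INT path above:
     v = 0xAB;  v |= v << 8;         -> 0xABAB
                v |= v << 16;        -> 0xABABABAB            (SImode result)
                v |= (v << 16) << 16; -> 0xABABABABABABABAB   (DImode)
   A non-constant QImode value instead takes either the single multiply by
   the promoted 0x01...01 constant or the shift-and-IOR sequence, whichever
   the cost comparison above rates cheaper.  */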
8246
8247 /* Duplicate value VAL using promote_duplicated_reg into maximal size that will
8248 be needed by main loop copying SIZE_NEEDED chunks and prologue getting
8249 alignment from ALIGN to DESIRED_ALIGN. */
8250 static rtx
8251 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
8252 int align)
8253 {
8254 rtx promoted_val;
8255
8256 if (TARGET_64BIT
8257 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
8258 promoted_val = promote_duplicated_reg (DImode, val);
8259 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
8260 promoted_val = promote_duplicated_reg (SImode, val);
8261 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
8262 promoted_val = promote_duplicated_reg (HImode, val);
8263 else
8264 promoted_val = val;
8265
8266 return promoted_val;
8267 }
8268
8269 /* Copy the address to a Pmode register. This is used for x32 to
8270 truncate DImode TLS address to a SImode register. */
8271
8272 static rtx
8273 ix86_copy_addr_to_reg (rtx addr)
8274 {
8275 rtx reg;
8276 if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
8277 {
8278 reg = copy_addr_to_reg (addr);
8279 REG_POINTER (reg) = 1;
8280 return reg;
8281 }
8282 else
8283 {
8284 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
8285 reg = copy_to_mode_reg (DImode, addr);
8286 REG_POINTER (reg) = 1;
8287 return gen_rtx_SUBREG (SImode, reg, 0);
8288 }
8289 }
8290
8291 /* Expand string move (memcpy) or store (memset) operation. Use i386 string
8292 operations when profitable. The code depends upon architecture, block size
8293 and alignment, but always has one of the following overall structures:
8294
8295 Aligned move sequence:
8296
8297 1) Prologue guard: Conditional that jumps up to epilogues for small
8298 blocks that can be handled by the epilogue alone. This is faster
8299 but also needed for correctness, since the prologue assumes the block
8300 is larger than the desired alignment.
8301
8302 Optional dynamic check for size and libcall for large
8303 blocks is emitted here too, with -minline-stringops-dynamically.
8304
8305 2) Prologue: copy first few bytes in order to get destination
8306 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
8307 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
8308 copied. We emit either a jump tree on power of two sized
8309 blocks, or a byte loop.
8310
8311 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
8312 with specified algorithm.
8313
8314 4) Epilogue: code copying tail of the block that is too small to be
8315 handled by main body (or up to size guarded by prologue guard).
8316
8317 Misaligned move sequence
8318
8319 1) misaligned move prologue/epilogue containing:
8320 a) Prologue handling small memory blocks and jumping to done_label
8321 (skipped if blocks are known to be large enough)
8322 b) Copy of the first DESIRED_ALIGN-ALIGN bytes if extra alignment is
8323 needed, done by a single possibly misaligned move
8324 (skipped if alignment is not needed)
8325 c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
8326
8327 2) Zero size guard dispatching to done_label, if needed
8328
8329 3) Dispatch to a library call, if needed
8330
8331 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
8332 with specified algorithm. */
8333 bool
8334 ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
8335 rtx align_exp, rtx expected_align_exp,
8336 rtx expected_size_exp, rtx min_size_exp,
8337 rtx max_size_exp, rtx probable_max_size_exp,
8338 bool issetmem)
8339 {
8340 rtx destreg;
8341 rtx srcreg = NULL;
8342 rtx_code_label *label = NULL;
8343 rtx tmp;
8344 rtx_code_label *jump_around_label = NULL;
8345 HOST_WIDE_INT align = 1;
8346 unsigned HOST_WIDE_INT count = 0;
8347 HOST_WIDE_INT expected_size = -1;
8348 int size_needed = 0, epilogue_size_needed;
8349 int desired_align = 0, align_bytes = 0;
8350 enum stringop_alg alg;
8351 rtx promoted_val = NULL;
8352 rtx vec_promoted_val = NULL;
8353 bool force_loopy_epilogue = false;
8354 int dynamic_check;
8355 bool need_zero_guard = false;
8356 bool noalign;
8357 machine_mode move_mode = VOIDmode;
8358 machine_mode wider_mode;
8359 int unroll_factor = 1;
8360 /* TODO: Once value ranges are available, fill in proper data. */
8361 unsigned HOST_WIDE_INT min_size = 0;
8362 unsigned HOST_WIDE_INT max_size = -1;
8363 unsigned HOST_WIDE_INT probable_max_size = -1;
8364 bool misaligned_prologue_used = false;
8365 bool have_as;
8366
8367 if (CONST_INT_P (align_exp))
8368 align = INTVAL (align_exp);
8369 /* i386 can do misaligned access at a reasonably increased cost. */
8370 if (CONST_INT_P (expected_align_exp)
8371 && INTVAL (expected_align_exp) > align)
8372 align = INTVAL (expected_align_exp);
8373 /* ALIGN is the minimum of destination and source alignment, but we care here
8374 just about destination alignment. */
8375 else if (!issetmem
8376 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
8377 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
8378
8379 if (CONST_INT_P (count_exp))
8380 {
8381 min_size = max_size = probable_max_size = count = expected_size
8382 = INTVAL (count_exp);
8383 /* When COUNT is 0, there is nothing to do. */
8384 if (!count)
8385 return true;
8386 }
8387 else
8388 {
8389 if (min_size_exp)
8390 min_size = INTVAL (min_size_exp);
8391 if (max_size_exp)
8392 max_size = INTVAL (max_size_exp);
8393 if (probable_max_size_exp)
8394 probable_max_size = INTVAL (probable_max_size_exp);
8395 if (CONST_INT_P (expected_size_exp))
8396 expected_size = INTVAL (expected_size_exp);
8397 }
8398
8399 /* Make sure we don't need to care about overflow later on. */
8400 if (count > (HOST_WIDE_INT_1U << 30))
8401 return false;
8402
8403 have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
8404 if (!issetmem)
8405 have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
8406
8407 /* Step 0: Decide on preferred algorithm, desired alignment and
8408 size of chunks to be copied by main loop. */
8409 alg = decide_alg (count, expected_size, min_size, probable_max_size,
8410 issetmem,
8411 issetmem && val_exp == const0_rtx, have_as,
8412 &dynamic_check, &noalign, false);
8413
8414 if (dump_file)
8415 fprintf (dump_file, "Selected stringop expansion strategy: %s\n",
8416 stringop_alg_names[alg]);
8417
8418 if (alg == libcall)
8419 return false;
8420 gcc_assert (alg != no_stringop);
8421
8422 /* For now the vector version of memset is generated only for memory zeroing, as
8423 creating the promoted vector value is very cheap in this case. */
8424 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
8425 alg = unrolled_loop;
8426
8427 if (!count)
8428 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
8429 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
8430 if (!issetmem)
8431 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
8432
8433 unroll_factor = 1;
8434 move_mode = word_mode;
8435 switch (alg)
8436 {
8437 case libcall:
8438 case no_stringop:
8439 case last_alg:
8440 gcc_unreachable ();
8441 case loop_1_byte:
8442 need_zero_guard = true;
8443 move_mode = QImode;
8444 break;
8445 case loop:
8446 need_zero_guard = true;
8447 break;
8448 case unrolled_loop:
8449 need_zero_guard = true;
8450 unroll_factor = (TARGET_64BIT ? 4 : 2);
8451 break;
8452 case vector_loop:
8453 need_zero_guard = true;
8454 unroll_factor = 4;
8455 /* Find the widest supported mode. */
8456 move_mode = word_mode;
8457 while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode)
8458 && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
8459 move_mode = wider_mode;
8460
8461 if (TARGET_AVX256_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 128)
8462 move_mode = TImode;
8463
8464 /* Find the corresponding vector mode with the same size as MOVE_MODE.
8465 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
8466 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
8467 {
8468 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
8469 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
8470 || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
8471 move_mode = word_mode;
8472 }
8473 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
8474 break;
8475 case rep_prefix_8_byte:
8476 move_mode = DImode;
8477 break;
8478 case rep_prefix_4_byte:
8479 move_mode = SImode;
8480 break;
8481 case rep_prefix_1_byte:
8482 move_mode = QImode;
8483 break;
8484 }
8485 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
8486 epilogue_size_needed = size_needed;
8487
8488 /* If we are going to make any library calls conditionally, make sure any
8489 pending stack adjustment happens before the first conditional branch,
8490 otherwise it will be emitted only before the library call and won't
8491 happen on the other branches. */
8492 if (dynamic_check != -1)
8493 do_pending_stack_adjust ();
8494
8495 desired_align = decide_alignment (align, alg, expected_size, move_mode);
8496 if (!TARGET_ALIGN_STRINGOPS || noalign)
8497 align = desired_align;
8498
8499 /* Step 1: Prologue guard. */
8500
8501 /* Alignment code needs count to be in register. */
8502 if (CONST_INT_P (count_exp) && desired_align > align)
8503 {
8504 if (INTVAL (count_exp) > desired_align
8505 && INTVAL (count_exp) > size_needed)
8506 {
8507 align_bytes
8508 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
8509 if (align_bytes <= 0)
8510 align_bytes = 0;
8511 else
8512 align_bytes = desired_align - align_bytes;
8513 }
8514 if (align_bytes == 0)
8515 count_exp = force_reg (counter_mode (count_exp), count_exp);
8516 }
8517 gcc_assert (desired_align >= 1 && align >= 1);
8518
8519 /* Misaligned move sequences handle both prologue and epilogue at once.
8520 Default code generation results in smaller code for large alignments
8521 and also avoids redundant work when sizes are known precisely. */
8522 misaligned_prologue_used
8523 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
8524 && MAX (desired_align, epilogue_size_needed) <= 32
8525 && desired_align <= epilogue_size_needed
8526 && ((desired_align > align && !align_bytes)
8527 || (!count && epilogue_size_needed > 1)));
8528
8529 /* Do the cheap promotion to allow better CSE across the
8530 main loop and epilogue (i.e. one load of the big constant in
8531 front of all the code).
8532 For now the misaligned move sequences do not have a fast path
8533 without broadcasting. */
8534 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
8535 {
8536 if (alg == vector_loop)
8537 {
8538 gcc_assert (val_exp == const0_rtx);
8539 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
8540 promoted_val = promote_duplicated_reg_to_size (val_exp,
8541 GET_MODE_SIZE (word_mode),
8542 desired_align, align);
8543 }
8544 else
8545 {
8546 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
8547 desired_align, align);
8548 }
8549 }
8550 /* Misaligned move sequences handle both prologues and epilogues at once.
8551 Default code generation results in smaller code for large alignments and
8552 also avoids redundant work when sizes are known precisely. */
8553 if (misaligned_prologue_used)
8554 {
8555 /* The misaligned move prologue handles small blocks by itself. */
8556 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves
8557 (dst, src, &destreg, &srcreg,
8558 move_mode, promoted_val, vec_promoted_val,
8559 &count_exp,
8560 &jump_around_label,
8561 desired_align < align
8562 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
8563 desired_align, align, &min_size, dynamic_check, issetmem);
8564 if (!issetmem)
8565 src = change_address (src, BLKmode, srcreg);
8566 dst = change_address (dst, BLKmode, destreg);
8567 set_mem_align (dst, desired_align * BITS_PER_UNIT);
8568 epilogue_size_needed = 0;
8569 if (need_zero_guard
8570 && min_size < (unsigned HOST_WIDE_INT) size_needed)
8571 {
8572 /* It is possible that we copied enough so the main loop will not
8573 execute. */
8574 gcc_assert (size_needed > 1);
8575 if (jump_around_label == NULL_RTX)
8576 jump_around_label = gen_label_rtx ();
8577 emit_cmp_and_jump_insns (count_exp,
8578 GEN_INT (size_needed),
8579 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
8580 if (expected_size == -1
8581 || expected_size < (desired_align - align) / 2 + size_needed)
8582 predict_jump (REG_BR_PROB_BASE * 20 / 100);
8583 else
8584 predict_jump (REG_BR_PROB_BASE * 60 / 100);
8585 }
8586 }
8587 /* Ensure that alignment prologue won't copy past end of block. */
8588 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
8589 {
8590 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
8591 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
8592 Make sure it is a power of 2. */
8593 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
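/* For example, with an unrolled DImode loop on 64-bit targets
   (size_needed == 32) and no extra alignment requirement, the MAX above
   is 31 and the rounding above turns it into 32, i.e. the smallest power
   of two strictly greater than the value.  */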
8594
8595 /* To improve performance of small blocks, we jump around the VAL
8596 promoting code. This means that if the promoted VAL is not constant,
8597 we might not use it in the epilogue and have to use the byte
8598 loop variant. */
8599 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
8600 force_loopy_epilogue = true;
8601 if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
8602 || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
8603 {
8604 /* If main algorithm works on QImode, no epilogue is needed.
8605 For small sizes just don't align anything. */
8606 if (size_needed == 1)
8607 desired_align = align;
8608 else
8609 goto epilogue;
8610 }
8611 else if (!count
8612 && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
8613 {
8614 label = gen_label_rtx ();
8615 emit_cmp_and_jump_insns (count_exp,
8616 GEN_INT (epilogue_size_needed),
8617 LTU, 0, counter_mode (count_exp), 1, label);
8618 if (expected_size == -1 || expected_size < epilogue_size_needed)
8619 predict_jump (REG_BR_PROB_BASE * 60 / 100);
8620 else
8621 predict_jump (REG_BR_PROB_BASE * 20 / 100);
8622 }
8623 }
8624
8625 /* Emit code to decide on runtime whether library call or inline should be
8626 used. */
8627 if (dynamic_check != -1)
8628 {
8629 if (!issetmem && CONST_INT_P (count_exp))
8630 {
8631 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
8632 {
8633 emit_block_copy_via_libcall (dst, src, count_exp);
8634 count_exp = const0_rtx;
8635 goto epilogue;
8636 }
8637 }
8638 else
8639 {
8640 rtx_code_label *hot_label = gen_label_rtx ();
8641 if (jump_around_label == NULL_RTX)
8642 jump_around_label = gen_label_rtx ();
8643 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
8644 LEU, 0, counter_mode (count_exp),
8645 1, hot_label);
8646 predict_jump (REG_BR_PROB_BASE * 90 / 100);
8647 if (issetmem)
8648 set_storage_via_libcall (dst, count_exp, val_exp);
8649 else
8650 emit_block_copy_via_libcall (dst, src, count_exp);
8651 emit_jump (jump_around_label);
8652 emit_label (hot_label);
8653 }
8654 }
8655
8656 /* Step 2: Alignment prologue. */
8657 /* Do the expensive promotion once we branched off the small blocks. */
8658 if (issetmem && !promoted_val)
8659 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
8660 desired_align, align);
8661
8662 if (desired_align > align && !misaligned_prologue_used)
8663 {
8664 if (align_bytes == 0)
8665 {
8666 /* Except for the first move in the prologue, we no longer know
8667 the constant offset in the aliasing info. It doesn't seem worth
8668 the pain to maintain it for the first move, so throw away
8669 the info early. */
8670 dst = change_address (dst, BLKmode, destreg);
8671 if (!issetmem)
8672 src = change_address (src, BLKmode, srcreg);
8673 dst = expand_set_or_cpymem_prologue (dst, src, destreg, srcreg,
8674 promoted_val, vec_promoted_val,
8675 count_exp, align, desired_align,
8676 issetmem);
8677 /* At most desired_align - align bytes are copied. */
8678 if (min_size < (unsigned)(desired_align - align))
8679 min_size = 0;
8680 else
8681 min_size -= desired_align - align;
8682 }
8683 else
8684 {
8685 /* If we know how many bytes need to be stored before dst is
8686 sufficiently aligned, maintain aliasing info accurately. */
8687 dst = expand_set_or_cpymem_constant_prologue (dst, &src, destreg,
8688 srcreg,
8689 promoted_val,
8690 vec_promoted_val,
8691 desired_align,
8692 align_bytes,
8693 issetmem);
8694
8695 count_exp = plus_constant (counter_mode (count_exp),
8696 count_exp, -align_bytes);
8697 count -= align_bytes;
8698 min_size -= align_bytes;
8699 max_size -= align_bytes;
8700 }
8701 if (need_zero_guard
8702 && min_size < (unsigned HOST_WIDE_INT) size_needed
8703 && (count < (unsigned HOST_WIDE_INT) size_needed
8704 || (align_bytes == 0
8705 && count < ((unsigned HOST_WIDE_INT) size_needed
8706 + desired_align - align))))
8707 {
8708 /* It is possible that we copied enough so the main loop will not
8709 execute. */
8710 gcc_assert (size_needed > 1);
8711 if (label == NULL_RTX)
8712 label = gen_label_rtx ();
8713 emit_cmp_and_jump_insns (count_exp,
8714 GEN_INT (size_needed),
8715 LTU, 0, counter_mode (count_exp), 1, label);
8716 if (expected_size == -1
8717 || expected_size < (desired_align - align) / 2 + size_needed)
8718 predict_jump (REG_BR_PROB_BASE * 20 / 100);
8719 else
8720 predict_jump (REG_BR_PROB_BASE * 60 / 100);
8721 }
8722 }
8723 if (label && size_needed == 1)
8724 {
8725 emit_label (label);
8726 LABEL_NUSES (label) = 1;
8727 label = NULL;
8728 epilogue_size_needed = 1;
8729 if (issetmem)
8730 promoted_val = val_exp;
8731 }
8732 else if (label == NULL_RTX && !misaligned_prologue_used)
8733 epilogue_size_needed = size_needed;
8734
8735 /* Step 3: Main loop. */
8736
8737 switch (alg)
8738 {
8739 case libcall:
8740 case no_stringop:
8741 case last_alg:
8742 gcc_unreachable ();
8743 case loop_1_byte:
8744 case loop:
8745 case unrolled_loop:
8746 expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg, promoted_val,
8747 count_exp, move_mode, unroll_factor,
8748 expected_size, issetmem);
8749 break;
8750 case vector_loop:
8751 expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg,
8752 vec_promoted_val, count_exp, move_mode,
8753 unroll_factor, expected_size, issetmem);
8754 break;
8755 case rep_prefix_8_byte:
8756 case rep_prefix_4_byte:
8757 case rep_prefix_1_byte:
8758 expand_set_or_cpymem_via_rep (dst, src, destreg, srcreg, promoted_val,
8759 val_exp, count_exp, move_mode, issetmem);
8760 break;
8761 }
8762 /* Properly adjust the offset of src and dest memory for aliasing. */
8763 if (CONST_INT_P (count_exp))
8764 {
8765 if (!issetmem)
8766 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
8767 (count / size_needed) * size_needed);
8768 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
8769 (count / size_needed) * size_needed);
8770 }
8771 else
8772 {
8773 if (!issetmem)
8774 src = change_address (src, BLKmode, srcreg);
8775 dst = change_address (dst, BLKmode, destreg);
8776 }
8777
8778 /* Step 4: Epilogue to copy the remaining bytes. */
8779 epilogue:
8780 if (label)
8781 {
8782 /* When the main loop is done, COUNT_EXP might hold original count,
8783 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
8784 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
8785 bytes. Compensate if needed. */
8786
8787 if (size_needed < epilogue_size_needed)
8788 {
8789 tmp = expand_simple_binop (counter_mode (count_exp), AND, count_exp,
8790 GEN_INT (size_needed - 1), count_exp, 1,
8791 OPTAB_DIRECT);
8792 if (tmp != count_exp)
8793 emit_move_insn (count_exp, tmp);
8794 }
8795 emit_label (label);
8796 LABEL_NUSES (label) = 1;
8797 }
8798
8799 if (count_exp != const0_rtx && epilogue_size_needed > 1)
8800 {
8801 if (force_loopy_epilogue)
8802 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
8803 epilogue_size_needed);
8804 else
8805 {
8806 if (issetmem)
8807 expand_setmem_epilogue (dst, destreg, promoted_val,
8808 vec_promoted_val, count_exp,
8809 epilogue_size_needed);
8810 else
8811 expand_cpymem_epilogue (dst, src, destreg, srcreg, count_exp,
8812 epilogue_size_needed);
8813 }
8814 }
8815 if (jump_around_label)
8816 emit_label (jump_around_label);
8817 return true;
8818 }
8819
8820 /* Expand cmpstrn or memcmp. */
8821
8822 bool
8823 ix86_expand_cmpstrn_or_cmpmem (rtx result, rtx src1, rtx src2,
8824 rtx length, rtx align, bool is_cmpstrn)
8825 {
8826 /* Expand strncmp and memcmp only with -minline-all-stringops since
8827 "repz cmpsb" can be much slower than strncmp and memcmp functions
8828 implemented with vector instructions, see
8829
8830 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43052
8831 */
8832 if (!TARGET_INLINE_ALL_STRINGOPS)
8833 return false;
8834
8835 /* Can't use this if the user has appropriated ecx, esi or edi. */
8836 if (fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG])
8837 return false;
8838
8839 if (is_cmpstrn)
8840 {
8841 /* For strncmp, length is the maximum length, which can be larger
8842 than actual string lengths. We can expand the cmpstrn pattern
8843 to "repz cmpsb" only if one of the strings is a constant so
8844 that expand_builtin_strncmp() can write the length argument to
8845 be the minimum of the const string length and the actual length
8846 argument. Otherwise, "repz cmpsb" may scan past the terminating 0 byte. */
8847 tree t1 = MEM_EXPR (src1);
8848 tree t2 = MEM_EXPR (src2);
8849 if (!((t1 && TREE_CODE (t1) == MEM_REF
8850 && TREE_CODE (TREE_OPERAND (t1, 0)) == ADDR_EXPR
8851 && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t1, 0), 0))
8852 == STRING_CST))
8853 || (t2 && TREE_CODE (t2) == MEM_REF
8854 && TREE_CODE (TREE_OPERAND (t2, 0)) == ADDR_EXPR
8855 && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t2, 0), 0))
8856 == STRING_CST))))
8857 return false;
8858 }
8859
8860 rtx addr1 = copy_addr_to_reg (XEXP (src1, 0));
8861 rtx addr2 = copy_addr_to_reg (XEXP (src2, 0));
8862 if (addr1 != XEXP (src1, 0))
8863 src1 = replace_equiv_address_nv (src1, addr1);
8864 if (addr2 != XEXP (src2, 0))
8865 src2 = replace_equiv_address_nv (src2, addr2);
8866
8867 /* NB: Make a copy of the data length to avoid changing the original
8868 data length by cmpstrnqi patterns. */
8869 length = ix86_zero_extend_to_Pmode (length);
8870 rtx lengthreg = gen_reg_rtx (Pmode);
8871 emit_move_insn (lengthreg, length);
8872
8873 /* If we are testing strict equality, we can use known alignment to
8874 good advantage. This may be possible with combine, particularly
8875 once cc0 is dead. */
8876 if (CONST_INT_P (length))
8877 {
8878 if (length == const0_rtx)
8879 {
8880 emit_move_insn (result, const0_rtx);
8881 return true;
8882 }
8883 emit_insn (gen_cmpstrnqi_nz_1 (addr1, addr2, lengthreg, align,
8884 src1, src2));
8885 }
8886 else
8887 {
8888 emit_insn (gen_cmp_1 (Pmode, lengthreg, lengthreg));
8889 emit_insn (gen_cmpstrnqi_1 (addr1, addr2, lengthreg, align,
8890 src1, src2));
8891 }
8892
8893 rtx out = gen_lowpart (QImode, result);
8894 emit_insn (gen_cmpintqi (out));
8895 emit_move_insn (result, gen_rtx_SIGN_EXTEND (SImode, out));
8896
8897 return true;
8898 }
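
/* For example, for strncmp (s, "abc", n) one operand is a STRING_CST, so
   expand_builtin_strncmp can rewrite the length argument to be no larger
   than the constant string's length and the "repz cmpsb" expansion above
   never scans past its terminating NUL.  With two non-constant strings no
   such bound exists and the function refuses the expansion instead.  */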
8899
8900 /* Expand the appropriate insns for doing strlen if not just doing
8901 repnz; scasb
8902
8903 out = result, initialized with the start address
8904 align_rtx = alignment of the address.
8905 scratch = scratch register, initialized with the start address when
8906 not aligned, otherwise undefined
8907
8908 This is just the body. It needs the initializations mentioned above and
8909 some address computing at the end. These things are done in i386.md. */
8910
8911 static void
8912 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
8913 {
8914 int align;
8915 rtx tmp;
8916 rtx_code_label *align_2_label = NULL;
8917 rtx_code_label *align_3_label = NULL;
8918 rtx_code_label *align_4_label = gen_label_rtx ();
8919 rtx_code_label *end_0_label = gen_label_rtx ();
8920 rtx mem;
8921 rtx tmpreg = gen_reg_rtx (SImode);
8922 rtx scratch = gen_reg_rtx (SImode);
8923 rtx cmp;
8924
8925 align = 0;
8926 if (CONST_INT_P (align_rtx))
8927 align = INTVAL (align_rtx);
8928
8929 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
8930
8931 /* Is there a known alignment and is it less than 4? */
8932 if (align < 4)
8933 {
8934 rtx scratch1 = gen_reg_rtx (Pmode);
8935 emit_move_insn (scratch1, out);
8936 /* Is there a known alignment and is it not 2? */
8937 if (align != 2)
8938 {
8939 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
8940 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
8941
8942 /* Leave just the 3 lower bits. */
8943 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
8944 NULL_RTX, 0, OPTAB_WIDEN);
8945
8946 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
8947 Pmode, 1, align_4_label);
8948 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
8949 Pmode, 1, align_2_label);
8950 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
8951 Pmode, 1, align_3_label);
8952 }
8953 else
8954 {
8955 /* Since the alignment is 2, we have to check 2 or 0 bytes;
8956 check whether it is aligned to 4 bytes. */
8957
8958 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
8959 NULL_RTX, 0, OPTAB_WIDEN);
8960
8961 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
8962 Pmode, 1, align_4_label);
8963 }
8964
8965 mem = change_address (src, QImode, out);
8966
8967 /* Now compare the bytes. */
8968
8969 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
8970 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
8971 QImode, 1, end_0_label);
8972
8973 /* Increment the address. */
8974 emit_insn (gen_add2_insn (out, const1_rtx));
8975
8976 /* Not needed with an alignment of 2 */
8977 if (align != 2)
8978 {
8979 emit_label (align_2_label);
8980
8981 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
8982 end_0_label);
8983
8984 emit_insn (gen_add2_insn (out, const1_rtx));
8985
8986 emit_label (align_3_label);
8987 }
8988
8989 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
8990 end_0_label);
8991
8992 emit_insn (gen_add2_insn (out, const1_rtx));
8993 }
8994
8995 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
8996 align this loop; that only makes the program bigger and does not
8997 help to speed it up. */
8998 emit_label (align_4_label);
8999
9000 mem = change_address (src, SImode, out);
9001 emit_move_insn (scratch, mem);
9002 emit_insn (gen_add2_insn (out, GEN_INT (4)));
9003
9004 /* This formula yields a nonzero result iff one of the bytes is zero.
9005 This saves three branches inside the loop and many cycles. */
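
/* Worked example with made-up values: if the loaded word is 0x41410041
   (its second-lowest byte is zero), the steps below give
     word - 0x01010101  == 0x403fff40
     ~word              == 0xbebeffbe
     AND of those two   == 0x003eff00
     AND with 0x80808080 == 0x00008000   (nonzero: a zero byte was found)
   whereas for 0x41414141 the final result is 0 and the jump back to
   align_4_label keeps scanning.  */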
9006
9007 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
9008 emit_insn (gen_one_cmplsi2 (scratch, scratch));
9009 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
9010 emit_insn (gen_andsi3 (tmpreg, tmpreg,
9011 gen_int_mode (0x80808080, SImode)));
9012 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
9013 align_4_label);
9014
9015 if (TARGET_CMOVE)
9016 {
9017 rtx reg = gen_reg_rtx (SImode);
9018 rtx reg2 = gen_reg_rtx (Pmode);
9019 emit_move_insn (reg, tmpreg);
9020 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
9021
9022 /* If zero is not in the first two bytes, move two bytes forward. */
9023 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
9024 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
9025 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
9026 emit_insn (gen_rtx_SET (tmpreg,
9027 gen_rtx_IF_THEN_ELSE (SImode, tmp,
9028 reg,
9029 tmpreg)));
9030 /* Emit lea manually to avoid clobbering of flags. */
9031 emit_insn (gen_rtx_SET (reg2, plus_constant (Pmode, out, 2)));
9032
9033 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
9034 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
9035 emit_insn (gen_rtx_SET (out,
9036 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
9037 reg2,
9038 out)));
9039 }
9040 else
9041 {
9042 rtx_code_label *end_2_label = gen_label_rtx ();
9043 /* Is zero in the first two bytes? */
9044
9045 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
9046 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
9047 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
9048 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
9049 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
9050 pc_rtx);
9051 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
9052 JUMP_LABEL (tmp) = end_2_label;
9053
9054 /* Not in the first two. Move two bytes forward. */
9055 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
9056 emit_insn (gen_add2_insn (out, const2_rtx));
9057
9058 emit_label (end_2_label);
9059
9060 }
9061
9062 /* Avoid branch in fixing the byte. */
9063 tmpreg = gen_lowpart (QImode, tmpreg);
9064 emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
9065 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
9066 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
9067 emit_insn (gen_sub3_carry (Pmode, out, out, GEN_INT (3), tmp, cmp));
9068
9069 emit_label (end_0_label);
9070 }
9071
9072 /* Expand strlen. */
9073
9074 bool
9075 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
9076 {
9077 if (TARGET_UNROLL_STRLEN
9078 && TARGET_INLINE_ALL_STRINGOPS
9079 && eoschar == const0_rtx
9080 && optimize > 1)
9081 {
9082 /* The generic case of the strlen expander is long. Avoid
9083 expanding it unless TARGET_INLINE_ALL_STRINGOPS. */
9084 rtx addr = force_reg (Pmode, XEXP (src, 0));
9085 /* Well, it seems that some optimizers do not combine a call like
9086 foo(strlen(bar), strlen(bar));
9087 when the move and the subtraction are done here. They do calculate
9088 the length just once when these instructions are done inside
9089 output_strlen_unroll(). But I think that since &bar[strlen(bar)] is
9090 often used and I use one fewer register for the lifetime of
9091 output_strlen_unroll() this is better. */
9092
9093 emit_move_insn (out, addr);
9094
9095 ix86_expand_strlensi_unroll_1 (out, src, align);
9096
9097 /* strlensi_unroll_1 returns the address of the zero at the end of
9098 the string, like memchr(), so compute the length by subtracting
9099 the start address. */
9100 emit_insn (gen_sub2_insn (out, addr));
9101 return true;
9102 }
9103 else
9104 return false;
9105 }
9106
9107 /* For a given symbol (function) construct code to compute the address of its
9108 PLT entry in the large x86-64 PIC model. */
9109
9110 static rtx
9111 construct_plt_address (rtx symbol)
9112 {
9113 rtx tmp, unspec;
9114
9115 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
9116 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
9117 gcc_assert (Pmode == DImode);
9118
9119 tmp = gen_reg_rtx (Pmode);
9120 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
9121
9122 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
9123 emit_insn (gen_add2_insn (tmp, pic_offset_table_rtx));
9124 return tmp;
9125 }
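
/* A rough sketch of what this expands to (register choices are up to the
   allocator and the symbol name is only an example):
     movabs $foo@PLTOFF, %reg
     add    <GOT base held in pic_offset_table_rtx>, %reg
   i.e. the UNSPEC_PLTOFF constant is loaded as a 64-bit immediate and the
   GOT base is added to form the address of the PLT entry.  */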
9126
9127 /* Additional registers that are clobbered by SYSV calls. */
9128
9129 static int const x86_64_ms_sysv_extra_clobbered_registers
9130 [NUM_X86_64_MS_CLOBBERED_REGS] =
9131 {
9132 SI_REG, DI_REG,
9133 XMM6_REG, XMM7_REG,
9134 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
9135 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
9136 };
9137
9138 rtx_insn *
9139 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
9140 rtx callarg2,
9141 rtx pop, bool sibcall)
9142 {
9143 rtx vec[3];
9144 rtx use = NULL, call;
9145 unsigned int vec_len = 0;
9146 tree fndecl;
9147
9148 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
9149 {
9150 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
9151 if (fndecl
9152 && (lookup_attribute ("interrupt",
9153 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
9154 error ("interrupt service routine cannot be called directly");
9155 }
9156 else
9157 fndecl = NULL_TREE;
9158
9159 if (pop == const0_rtx)
9160 pop = NULL;
9161 gcc_assert (!TARGET_64BIT || !pop);
9162
9163 rtx addr = XEXP (fnaddr, 0);
9164 if (TARGET_MACHO && !TARGET_64BIT)
9165 {
9166 #if TARGET_MACHO
9167 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
9168 fnaddr = machopic_indirect_call_target (fnaddr);
9169 #endif
9170 }
9171 else
9172 {
9173 /* Static functions and indirect calls don't need the pic register. Also,
9174 check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
9175 it an indirect call. */
9176 if (flag_pic
9177 && GET_CODE (addr) == SYMBOL_REF
9178 && ix86_call_use_plt_p (addr))
9179 {
9180 if (flag_plt
9181 && (SYMBOL_REF_DECL (addr) == NULL_TREE
9182 || !lookup_attribute ("noplt",
9183 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
9184 {
9185 if (!TARGET_64BIT
9186 || (ix86_cmodel == CM_LARGE_PIC
9187 && DEFAULT_ABI != MS_ABI))
9188 {
9189 use_reg (&use, gen_rtx_REG (Pmode,
9190 REAL_PIC_OFFSET_TABLE_REGNUM));
9191 if (ix86_use_pseudo_pic_reg ())
9192 emit_move_insn (gen_rtx_REG (Pmode,
9193 REAL_PIC_OFFSET_TABLE_REGNUM),
9194 pic_offset_table_rtx);
9195 }
9196 }
9197 else if (!TARGET_PECOFF && !TARGET_MACHO)
9198 {
9199 if (TARGET_64BIT
9200 && ix86_cmodel == CM_LARGE_PIC
9201 && DEFAULT_ABI != MS_ABI)
9202 {
9203 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
9204 UNSPEC_GOT);
9205 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
9206 fnaddr = force_reg (Pmode, fnaddr);
9207 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, fnaddr);
9208 }
9209 else if (TARGET_64BIT)
9210 {
9211 fnaddr = gen_rtx_UNSPEC (Pmode,
9212 gen_rtvec (1, addr),
9213 UNSPEC_GOTPCREL);
9214 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
9215 }
9216 else
9217 {
9218 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
9219 UNSPEC_GOT);
9220 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
9221 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
9222 fnaddr);
9223 }
9224 fnaddr = gen_const_mem (Pmode, fnaddr);
9225 /* Pmode may not be the same as word_mode for x32, which
9226 doesn't support indirect branch via 32-bit memory slot.
9227 Since x32 GOT slot is 64 bit with zero upper 32 bits,
9228 indirect branch via x32 GOT slot is OK. */
9229 if (GET_MODE (fnaddr) != word_mode)
9230 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
9231 fnaddr = gen_rtx_MEM (QImode, fnaddr);
9232 }
9233 }
9234 }
9235
9236 /* Skip setting up RAX register for -mskip-rax-setup when there are no
9237 parameters passed in vector registers. */
9238 if (TARGET_64BIT
9239 && (INTVAL (callarg2) > 0
9240 || (INTVAL (callarg2) == 0
9241 && (TARGET_SSE || !flag_skip_rax_setup))))
9242 {
9243 rtx al = gen_rtx_REG (QImode, AX_REG);
9244 emit_move_insn (al, callarg2);
9245 use_reg (&use, al);
9246 }
9247
9248 if (ix86_cmodel == CM_LARGE_PIC
9249 && !TARGET_PECOFF
9250 && MEM_P (fnaddr)
9251 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
9252 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
9253 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
9254 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
9255 branch via x32 GOT slot is OK. */
9256 else if (!(TARGET_X32
9257 && MEM_P (fnaddr)
9258 && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
9259 && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
9260 && (sibcall
9261 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
9262 : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
9263 {
9264 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
9265 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
9266 }
9267
9268 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
9269
9270 if (retval)
9271 call = gen_rtx_SET (retval, call);
9272 vec[vec_len++] = call;
9273
9274 if (pop)
9275 {
9276 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
9277 pop = gen_rtx_SET (stack_pointer_rtx, pop);
9278 vec[vec_len++] = pop;
9279 }
9280
9281 if (cfun->machine->no_caller_saved_registers
9282 && (!fndecl
9283 || (!TREE_THIS_VOLATILE (fndecl)
9284 && !lookup_attribute ("no_caller_saved_registers",
9285 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
9286 {
9287 static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
9288 bool is_64bit_ms_abi = (TARGET_64BIT
9289 && ix86_function_abi (fndecl) == MS_ABI);
9290 char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
9291
9292 /* If there are no caller-saved registers, add all registers
9293 that are clobbered by the call which returns. */
9294 for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
9295 if (!fixed_regs[i]
9296 && (ix86_call_used_regs[i] == 1
9297 || (ix86_call_used_regs[i] & c_mask))
9298 && !STACK_REGNO_P (i)
9299 && !MMX_REGNO_P (i))
9300 clobber_reg (&use,
9301 gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
9302 }
9303 else if (TARGET_64BIT_MS_ABI
9304 && (!callarg2 || INTVAL (callarg2) != -2))
9305 {
9306 unsigned i;
9307
9308 for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
9309 {
9310 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
9311 machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
9312
9313 clobber_reg (&use, gen_rtx_REG (mode, regno));
9314 }
9315
9316 /* Set here, but it may get cleared later. */
9317 if (TARGET_CALL_MS2SYSV_XLOGUES)
9318 {
9319 if (!TARGET_SSE)
9320 ;
9321
9322 /* Don't break hot-patched functions. */
9323 else if (ix86_function_ms_hook_prologue (current_function_decl))
9324 ;
9325
9326 /* TODO: Cases not yet examined. */
9327 else if (flag_split_stack)
9328 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
9329
9330 else
9331 {
9332 gcc_assert (!reload_completed);
9333 cfun->machine->call_ms2sysv = true;
9334 }
9335 }
9336 }
9337
9338 if (TARGET_MACHO && TARGET_64BIT && !sibcall
9339 && ((GET_CODE (addr) == SYMBOL_REF && !SYMBOL_REF_LOCAL_P (addr))
9340 || !fndecl || TREE_PUBLIC (fndecl)))
9341 {
9342 /* We allow public functions defined in a TU to bind locally for PIC
9343 code (the default) on 64bit Mach-O.
9344 If such functions are not inlined, we cannot tell at compile-time if
9345 they will be called via the lazy symbol resolver (this can depend on
9346 options given at link-time). Therefore, we must assume that the lazy
9347 resolver could be used which clobbers R11 and R10. */
9348 clobber_reg (&use, gen_rtx_REG (DImode, R11_REG));
9349 clobber_reg (&use, gen_rtx_REG (DImode, R10_REG));
9350 }
9351
9352 if (vec_len > 1)
9353 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
9354 rtx_insn *call_insn = emit_call_insn (call);
9355 if (use)
9356 CALL_INSN_FUNCTION_USAGE (call_insn) = use;
9357
9358 return call_insn;
9359 }
9360
9361 /* Split a simple return popping POPC bytes from the stack into an indirect
9362 branch with a stack adjustment. */
9363
9364 void
9365 ix86_split_simple_return_pop_internal (rtx popc)
9366 {
9367 struct machine_function *m = cfun->machine;
9368 rtx ecx = gen_rtx_REG (SImode, CX_REG);
9369 rtx_insn *insn;
9370
9371 /* There is no "pascal" calling convention in any 64bit ABI. */
9372 gcc_assert (!TARGET_64BIT);
9373
9374 insn = emit_insn (gen_pop (ecx));
9375 m->fs.cfa_offset -= UNITS_PER_WORD;
9376 m->fs.sp_offset -= UNITS_PER_WORD;
9377
9378 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
9379 x = gen_rtx_SET (stack_pointer_rtx, x);
9380 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
9381 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
9382 RTX_FRAME_RELATED_P (insn) = 1;
9383
9384 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc);
9385 x = gen_rtx_SET (stack_pointer_rtx, x);
9386 insn = emit_insn (x);
9387 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
9388 RTX_FRAME_RELATED_P (insn) = 1;
9389
9390 /* The return address is now in ECX. */
9391 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
9392 }
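
/* For illustration only (not from the original source): on ia32 this
   splits a "ret $N" into roughly

	popl	%ecx		# return address -> ECX
	addl	$N, %esp	# drop the N bytes of callee-popped args
	jmp	*%ecx		# indirect return

   with the REG_CFA_* notes above keeping the unwind info consistent.  */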
9393
9394 /* Errors in the source file can cause expand_expr to return const0_rtx
9395 where we expect a vector. To avoid crashing, use one of the vector
9396 clear instructions. */
9397
9398 static rtx
9399 safe_vector_operand (rtx x, machine_mode mode)
9400 {
9401 if (x == const0_rtx)
9402 x = CONST0_RTX (mode);
9403 return x;
9404 }
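
/* Minimal sketch (illustrative): if an erroneous argument expands to
   const0_rtx where, say, a V4SFmode vector is expected, then
   safe_vector_operand (op, V4SFmode) substitutes CONST0_RTX (V4SFmode),
   a zero vector of the right mode, so expansion can continue without
   crashing.  */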
9405
9406 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
9407
9408 static rtx
9409 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
9410 {
9411 rtx pat;
9412 tree arg0 = CALL_EXPR_ARG (exp, 0);
9413 tree arg1 = CALL_EXPR_ARG (exp, 1);
9414 rtx op0 = expand_normal (arg0);
9415 rtx op1 = expand_normal (arg1);
9416 machine_mode tmode = insn_data[icode].operand[0].mode;
9417 machine_mode mode0 = insn_data[icode].operand[1].mode;
9418 machine_mode mode1 = insn_data[icode].operand[2].mode;
9419
9420 if (VECTOR_MODE_P (mode0))
9421 op0 = safe_vector_operand (op0, mode0);
9422 if (VECTOR_MODE_P (mode1))
9423 op1 = safe_vector_operand (op1, mode1);
9424
9425 if (optimize || !target
9426 || GET_MODE (target) != tmode
9427 || !insn_data[icode].operand[0].predicate (target, tmode))
9428 target = gen_reg_rtx (tmode);
9429
9430 if (GET_MODE (op1) == SImode && mode1 == TImode)
9431 {
9432 rtx x = gen_reg_rtx (V4SImode);
9433 emit_insn (gen_sse2_loadd (x, op1));
9434 op1 = gen_lowpart (TImode, x);
9435 }
9436
9437 if (!insn_data[icode].operand[1].predicate (op0, mode0))
9438 op0 = copy_to_mode_reg (mode0, op0);
9439 if (!insn_data[icode].operand[2].predicate (op1, mode1))
9440 op1 = copy_to_mode_reg (mode1, op1);
9441
9442 pat = GEN_FCN (icode) (target, op0, op1);
9443 if (! pat)
9444 return 0;
9445
9446 emit_insn (pat);
9447
9448 return target;
9449 }
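
/* Hypothetical example (the builtin name is illustrative only): a call
   such as

     __v4si r = __builtin_ia32_paddd128 (a, b);

   would reach this helper with an icode whose operands are all
   V4SImode; both arguments are expanded, copied into registers when the
   operand predicates reject them, and the single insn produced by
   GEN_FCN (icode) is emitted into the target register.  */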
9450
9451 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
9452
9453 static rtx
9454 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
9455 enum ix86_builtin_func_type m_type,
9456 enum rtx_code sub_code)
9457 {
9458 rtx pat;
9459 unsigned int i, nargs;
9460 bool comparison_p = false;
9461 bool tf_p = false;
9462 bool last_arg_constant = false;
9463 int num_memory = 0;
9464 rtx xops[4];
9465
9466 machine_mode tmode = insn_data[icode].operand[0].mode;
9467
9468 switch (m_type)
9469 {
9470 case MULTI_ARG_4_DF2_DI_I:
9471 case MULTI_ARG_4_DF2_DI_I1:
9472 case MULTI_ARG_4_SF2_SI_I:
9473 case MULTI_ARG_4_SF2_SI_I1:
9474 nargs = 4;
9475 last_arg_constant = true;
9476 break;
9477
9478 case MULTI_ARG_3_SF:
9479 case MULTI_ARG_3_DF:
9480 case MULTI_ARG_3_SF2:
9481 case MULTI_ARG_3_DF2:
9482 case MULTI_ARG_3_DI:
9483 case MULTI_ARG_3_SI:
9484 case MULTI_ARG_3_SI_DI:
9485 case MULTI_ARG_3_HI:
9486 case MULTI_ARG_3_HI_SI:
9487 case MULTI_ARG_3_QI:
9488 case MULTI_ARG_3_DI2:
9489 case MULTI_ARG_3_SI2:
9490 case MULTI_ARG_3_HI2:
9491 case MULTI_ARG_3_QI2:
9492 nargs = 3;
9493 break;
9494
9495 case MULTI_ARG_2_SF:
9496 case MULTI_ARG_2_DF:
9497 case MULTI_ARG_2_DI:
9498 case MULTI_ARG_2_SI:
9499 case MULTI_ARG_2_HI:
9500 case MULTI_ARG_2_QI:
9501 nargs = 2;
9502 break;
9503
9504 case MULTI_ARG_2_DI_IMM:
9505 case MULTI_ARG_2_SI_IMM:
9506 case MULTI_ARG_2_HI_IMM:
9507 case MULTI_ARG_2_QI_IMM:
9508 nargs = 2;
9509 last_arg_constant = true;
9510 break;
9511
9512 case MULTI_ARG_1_SF:
9513 case MULTI_ARG_1_DF:
9514 case MULTI_ARG_1_SF2:
9515 case MULTI_ARG_1_DF2:
9516 case MULTI_ARG_1_DI:
9517 case MULTI_ARG_1_SI:
9518 case MULTI_ARG_1_HI:
9519 case MULTI_ARG_1_QI:
9520 case MULTI_ARG_1_SI_DI:
9521 case MULTI_ARG_1_HI_DI:
9522 case MULTI_ARG_1_HI_SI:
9523 case MULTI_ARG_1_QI_DI:
9524 case MULTI_ARG_1_QI_SI:
9525 case MULTI_ARG_1_QI_HI:
9526 nargs = 1;
9527 break;
9528
9529 case MULTI_ARG_2_DI_CMP:
9530 case MULTI_ARG_2_SI_CMP:
9531 case MULTI_ARG_2_HI_CMP:
9532 case MULTI_ARG_2_QI_CMP:
9533 nargs = 2;
9534 comparison_p = true;
9535 break;
9536
9537 case MULTI_ARG_2_SF_TF:
9538 case MULTI_ARG_2_DF_TF:
9539 case MULTI_ARG_2_DI_TF:
9540 case MULTI_ARG_2_SI_TF:
9541 case MULTI_ARG_2_HI_TF:
9542 case MULTI_ARG_2_QI_TF:
9543 nargs = 2;
9544 tf_p = true;
9545 break;
9546
9547 default:
9548 gcc_unreachable ();
9549 }
9550
9551 if (optimize || !target
9552 || GET_MODE (target) != tmode
9553 || !insn_data[icode].operand[0].predicate (target, tmode))
9554 target = gen_reg_rtx (tmode);
9555 else if (memory_operand (target, tmode))
9556 num_memory++;
9557
9558 gcc_assert (nargs <= ARRAY_SIZE (xops));
9559
9560 for (i = 0; i < nargs; i++)
9561 {
9562 tree arg = CALL_EXPR_ARG (exp, i);
9563 rtx op = expand_normal (arg);
9564 int adjust = (comparison_p) ? 1 : 0;
9565 machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
9566
9567 if (last_arg_constant && i == nargs - 1)
9568 {
9569 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
9570 {
9571 enum insn_code new_icode = icode;
9572 switch (icode)
9573 {
9574 case CODE_FOR_xop_vpermil2v2df3:
9575 case CODE_FOR_xop_vpermil2v4sf3:
9576 case CODE_FOR_xop_vpermil2v4df3:
9577 case CODE_FOR_xop_vpermil2v8sf3:
9578 error ("the last argument must be a 2-bit immediate");
9579 return gen_reg_rtx (tmode);
9580 case CODE_FOR_xop_rotlv2di3:
9581 new_icode = CODE_FOR_rotlv2di3;
9582 goto xop_rotl;
9583 case CODE_FOR_xop_rotlv4si3:
9584 new_icode = CODE_FOR_rotlv4si3;
9585 goto xop_rotl;
9586 case CODE_FOR_xop_rotlv8hi3:
9587 new_icode = CODE_FOR_rotlv8hi3;
9588 goto xop_rotl;
9589 case CODE_FOR_xop_rotlv16qi3:
9590 new_icode = CODE_FOR_rotlv16qi3;
9591 xop_rotl:
9592 if (CONST_INT_P (op))
9593 {
9594 int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
9595 op = GEN_INT (INTVAL (op) & mask);
9596 gcc_checking_assert
9597 (insn_data[icode].operand[i + 1].predicate (op, mode));
9598 }
9599 else
9600 {
9601 gcc_checking_assert
9602 (nargs == 2
9603 && insn_data[new_icode].operand[0].mode == tmode
9604 && insn_data[new_icode].operand[1].mode == tmode
9605 && insn_data[new_icode].operand[2].mode == mode
9606 && insn_data[new_icode].operand[0].predicate
9607 == insn_data[icode].operand[0].predicate
9608 && insn_data[new_icode].operand[1].predicate
9609 == insn_data[icode].operand[1].predicate);
9610 icode = new_icode;
9611 goto non_constant;
9612 }
9613 break;
9614 default:
9615 gcc_unreachable ();
9616 }
9617 }
9618 }
9619 else
9620 {
9621 non_constant:
9622 if (VECTOR_MODE_P (mode))
9623 op = safe_vector_operand (op, mode);
9624
9625 /* If we aren't optimizing, only allow one memory operand to be
9626 generated. */
9627 if (memory_operand (op, mode))
9628 num_memory++;
9629
9630 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
9631
9632 if (optimize
9633 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
9634 || num_memory > 1)
9635 op = force_reg (mode, op);
9636 }
9637
9638 xops[i] = op;
9639 }
9640
9641 switch (nargs)
9642 {
9643 case 1:
9644 pat = GEN_FCN (icode) (target, xops[0]);
9645 break;
9646
9647 case 2:
9648 if (tf_p)
9649 pat = GEN_FCN (icode) (target, xops[0], xops[1],
9650 GEN_INT ((int)sub_code));
9651 else if (! comparison_p)
9652 pat = GEN_FCN (icode) (target, xops[0], xops[1]);
9653 else
9654 {
9655 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
9656 xops[0], xops[1]);
9657
9658 pat = GEN_FCN (icode) (target, cmp_op, xops[0], xops[1]);
9659 }
9660 break;
9661
9662 case 3:
9663 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
9664 break;
9665
9666 case 4:
9667 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2], xops[3]);
9668 break;
9669
9670 default:
9671 gcc_unreachable ();
9672 }
9673
9674 if (! pat)
9675 return 0;
9676
9677 emit_insn (pat);
9678 return target;
9679 }
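
/* Rough sketch (illustrative; the builtin name is an assumption): an
   XOP builtin such as __builtin_ia32_vpcmov falls under one of the
   MULTI_ARG_3_* cases, so nargs is 3, each argument is expanded and
   forced into a register when the predicate or the one-memory-operand
   rule requires it, and GEN_FCN (icode) is called with the target plus
   the three operands.  The *_CMP and *_TF cases instead fold sub_code
   into the pattern as shown in the nargs == 2 arm above.  */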
9680
9681 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
9682 insns with vec_merge. */
9683
9684 static rtx
9685 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
9686 rtx target)
9687 {
9688 rtx pat;
9689 tree arg0 = CALL_EXPR_ARG (exp, 0);
9690 rtx op1, op0 = expand_normal (arg0);
9691 machine_mode tmode = insn_data[icode].operand[0].mode;
9692 machine_mode mode0 = insn_data[icode].operand[1].mode;
9693
9694 if (optimize || !target
9695 || GET_MODE (target) != tmode
9696 || !insn_data[icode].operand[0].predicate (target, tmode))
9697 target = gen_reg_rtx (tmode);
9698
9699 if (VECTOR_MODE_P (mode0))
9700 op0 = safe_vector_operand (op0, mode0);
9701
9702 if ((optimize && !register_operand (op0, mode0))
9703 || !insn_data[icode].operand[1].predicate (op0, mode0))
9704 op0 = copy_to_mode_reg (mode0, op0);
9705
9706 op1 = op0;
9707 if (!insn_data[icode].operand[2].predicate (op1, mode0))
9708 op1 = copy_to_mode_reg (mode0, op1);
9709
9710 pat = GEN_FCN (icode) (target, op0, op1);
9711 if (! pat)
9712 return 0;
9713 emit_insn (pat);
9714 return target;
9715 }
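
/* Illustrative note (not from the original source): this path serves
   scalar insns such as rcpss/rsqrtss whose patterns are written as a
   vec_merge of the operation with the source vector, which is why the
   single builtin argument is used for both op0 and op1 above.  */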
9716
9717 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
9718
9719 static rtx
9720 ix86_expand_sse_compare (const struct builtin_description *d,
9721 tree exp, rtx target, bool swap)
9722 {
9723 rtx pat;
9724 tree arg0 = CALL_EXPR_ARG (exp, 0);
9725 tree arg1 = CALL_EXPR_ARG (exp, 1);
9726 rtx op0 = expand_normal (arg0);
9727 rtx op1 = expand_normal (arg1);
9728 rtx op2;
9729 machine_mode tmode = insn_data[d->icode].operand[0].mode;
9730 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
9731 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
9732 enum rtx_code comparison = d->comparison;
9733
9734 if (VECTOR_MODE_P (mode0))
9735 op0 = safe_vector_operand (op0, mode0);
9736 if (VECTOR_MODE_P (mode1))
9737 op1 = safe_vector_operand (op1, mode1);
9738
9739 /* Swap operands if we have a comparison that isn't available in
9740 hardware. */
9741 if (swap)
9742 std::swap (op0, op1);
9743
9744 if (optimize || !target
9745 || GET_MODE (target) != tmode
9746 || !insn_data[d->icode].operand[0].predicate (target, tmode))
9747 target = gen_reg_rtx (tmode);
9748
9749 if ((optimize && !register_operand (op0, mode0))
9750 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
9751 op0 = copy_to_mode_reg (mode0, op0);
9752 if ((optimize && !register_operand (op1, mode1))
9753 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
9754 op1 = copy_to_mode_reg (mode1, op1);
9755
9756 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
9757 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
9758 if (! pat)
9759 return 0;
9760 emit_insn (pat);
9761 return target;
9762 }
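
/* Hypothetical example (intrinsic name is illustrative): a "greater
   than" comparison like _mm_cmpgt_ps can be implemented with the LT
   pattern by requesting SWAP in the builtin table; the operands are
   exchanged here and the comparison rtx built from d->comparison is
   passed as the third operand of the mask-compare pattern.  */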
9763
9764 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
9765
9766 static rtx
9767 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
9768 rtx target)
9769 {
9770 rtx pat;
9771 tree arg0 = CALL_EXPR_ARG (exp, 0);
9772 tree arg1 = CALL_EXPR_ARG (exp, 1);
9773 rtx op0 = expand_normal (arg0);
9774 rtx op1 = expand_normal (arg1);
9775 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
9776 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
9777 enum rtx_code comparison = d->comparison;
9778
9779 if (VECTOR_MODE_P (mode0))
9780 op0 = safe_vector_operand (op0, mode0);
9781 if (VECTOR_MODE_P (mode1))
9782 op1 = safe_vector_operand (op1, mode1);
9783
9784 target = gen_reg_rtx (SImode);
9785 emit_move_insn (target, const0_rtx);
9786 target = gen_rtx_SUBREG (QImode, target, 0);
9787
9788 if ((optimize && !register_operand (op0, mode0))
9789 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
9790 op0 = copy_to_mode_reg (mode0, op0);
9791 if ((optimize && !register_operand (op1, mode1))
9792 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
9793 op1 = copy_to_mode_reg (mode1, op1);
9794
9795 pat = GEN_FCN (d->icode) (op0, op1);
9796 if (! pat)
9797 return 0;
9798 emit_insn (pat);
9799 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
9800 gen_rtx_fmt_ee (comparison, QImode,
9801 SET_DEST (pat),
9802 const0_rtx)));
9803
9804 return SUBREG_REG (target);
9805 }
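
/* Sketch of what is emitted above (illustrative): the comi pattern
   itself only sets FLAGS_REG, so the integer result is produced by
   zeroing an SImode pseudo and then storing the flags comparison into
   its low byte, roughly

     (set (strict_low_part (subreg:QI target 0))
	  (comparison (flags) (const_int 0)))

   which is why SUBREG_REG (target) is what gets returned.  */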
9806
9807 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
9808
9809 static rtx
9810 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
9811 rtx target)
9812 {
9813 rtx pat;
9814 tree arg0 = CALL_EXPR_ARG (exp, 0);
9815 rtx op1, op0 = expand_normal (arg0);
9816 machine_mode tmode = insn_data[d->icode].operand[0].mode;
9817 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
9818
9819 if (optimize || target == 0
9820 || GET_MODE (target) != tmode
9821 || !insn_data[d->icode].operand[0].predicate (target, tmode))
9822 target = gen_reg_rtx (tmode);
9823
9824 if (VECTOR_MODE_P (mode0))
9825 op0 = safe_vector_operand (op0, mode0);
9826
9827 if ((optimize && !register_operand (op0, mode0))
9828 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
9829 op0 = copy_to_mode_reg (mode0, op0);
9830
9831 op1 = GEN_INT (d->comparison);
9832
9833 pat = GEN_FCN (d->icode) (target, op0, op1);
9834 if (! pat)
9835 return 0;
9836 emit_insn (pat);
9837 return target;
9838 }
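
/* Illustrative note: for the *_ROUND builtins the rounding selector is
   not a user-visible argument; it is stored in d->comparison by the
   builtin table, and the helper above passes it to the insn as the
   GEN_INT (d->comparison) immediate operand.  */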
9839
9840 static rtx
9841 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
9842 tree exp, rtx target)
9843 {
9844 rtx pat;
9845 tree arg0 = CALL_EXPR_ARG (exp, 0);
9846 tree arg1 = CALL_EXPR_ARG (exp, 1);
9847 rtx op0 = expand_normal (arg0);
9848 rtx op1 = expand_normal (arg1);
9849 rtx op2;
9850 machine_mode tmode = insn_data[d->icode].operand[0].mode;
9851 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
9852 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
9853
9854 if (optimize || target == 0
9855 || GET_MODE (target) != tmode
9856 || !insn_data[d->icode].operand[0].predicate (target, tmode))
9857 target = gen_reg_rtx (tmode);
9858
9859 op0 = safe_vector_operand (op0, mode0);
9860 op1 = safe_vector_operand (op1, mode1);
9861
9862 if ((optimize && !register_operand (op0, mode0))
9863 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
9864 op0 = copy_to_mode_reg (mode0, op0);
9865 if ((optimize && !register_operand (op1, mode1))
9866 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
9867 op1 = copy_to_mode_reg (mode1, op1);
9868
9869 op2 = GEN_INT (d->comparison);
9870
9871 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
9872 if (! pat)
9873 return 0;
9874 emit_insn (pat);
9875 return target;
9876 }
9877
9878 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
9879
9880 static rtx
9881 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
9882 rtx target)
9883 {
9884 rtx pat;
9885 tree arg0 = CALL_EXPR_ARG (exp, 0);
9886 tree arg1 = CALL_EXPR_ARG (exp, 1);
9887 rtx op0 = expand_normal (arg0);
9888 rtx op1 = expand_normal (arg1);
9889 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
9890 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
9891 enum rtx_code comparison = d->comparison;
9892
9893 if (VECTOR_MODE_P (mode0))
9894 op0 = safe_vector_operand (op0, mode0);
9895 if (VECTOR_MODE_P (mode1))
9896 op1 = safe_vector_operand (op1, mode1);
9897
9898 target = gen_reg_rtx (SImode);
9899 emit_move_insn (target, const0_rtx);
9900 target = gen_rtx_SUBREG (QImode, target, 0);
9901
9902 if ((optimize && !register_operand (op0, mode0))
9903 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
9904 op0 = copy_to_mode_reg (mode0, op0);
9905 if ((optimize && !register_operand (op1, mode1))
9906 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
9907 op1 = copy_to_mode_reg (mode1, op1);
9908
9909 pat = GEN_FCN (d->icode) (op0, op1);
9910 if (! pat)
9911 return 0;
9912 emit_insn (pat);
9913 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
9914 gen_rtx_fmt_ee (comparison, QImode,
9915 SET_DEST (pat),
9916 const0_rtx)));
9917
9918 return SUBREG_REG (target);
9919 }
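
/* Illustrative note (assumption about the table entries): ptest-style
   builtins such as _mm_testz_si128 only set FLAGS_REG; d->comparison
   then selects which flag condition (EQ for the ZF-based variant,
   other codes for the CF-based ones) is converted into the 0/1 result
   by the strict_low_part store above, mirroring ix86_expand_sse_comi.  */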
9920
9921 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
9922
9923 static rtx
9924 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
9925 tree exp, rtx target)
9926 {
9927 rtx pat;
9928 tree arg0 = CALL_EXPR_ARG (exp, 0);
9929 tree arg1 = CALL_EXPR_ARG (exp, 1);
9930 tree arg2 = CALL_EXPR_ARG (exp, 2);
9931 tree arg3 = CALL_EXPR_ARG (exp, 3);
9932 tree arg4 = CALL_EXPR_ARG (exp, 4);
9933 rtx scratch0, scratch1;
9934 rtx op0 = expand_normal (arg0);
9935 rtx op1 = expand_normal (arg1);
9936 rtx op2 = expand_normal (arg2);
9937 rtx op3 = expand_normal (arg3);
9938 rtx op4 = expand_normal (arg4);
9939 machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
9940
9941 tmode0 = insn_data[d->icode].operand[0].mode;
9942 tmode1 = insn_data[d->icode].operand[1].mode;
9943 modev2 = insn_data[d->icode].operand[2].mode;
9944 modei3 = insn_data[d->icode].operand[3].mode;
9945 modev4 = insn_data[d->icode].operand[4].mode;
9946 modei5 = insn_data[d->icode].operand[5].mode;
9947 modeimm = insn_data[d->icode].operand[6].mode;
9948
9949 if (VECTOR_MODE_P (modev2))
9950 op0 = safe_vector_operand (op0, modev2);
9951 if (VECTOR_MODE_P (modev4))
9952 op2 = safe_vector_operand (op2, modev4);
9953
9954 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
9955 op0 = copy_to_mode_reg (modev2, op0);
9956 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
9957 op1 = copy_to_mode_reg (modei3, op1);
9958 if ((optimize && !register_operand (op2, modev4))
9959 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
9960 op2 = copy_to_mode_reg (modev4, op2);
9961 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
9962 op3 = copy_to_mode_reg (modei5, op3);
9963
9964 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
9965 {
9966 error ("the fifth argument must be an 8-bit immediate");
9967 return const0_rtx;
9968 }
9969
9970 if (d->code == IX86_BUILTIN_PCMPESTRI128)
9971 {
9972 if (optimize || !target
9973 || GET_MODE (target) != tmode0
9974 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
9975 target = gen_reg_rtx (tmode0);
9976
9977 scratch1 = gen_reg_rtx (tmode1);
9978
9979 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
9980 }
9981 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
9982 {
9983 if (optimize || !target
9984 || GET_MODE (target) != tmode1
9985 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
9986 target = gen_reg_rtx (tmode1);
9987
9988 scratch0 = gen_reg_rtx (tmode0);
9989
9990 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
9991 }
9992 else
9993 {
9994 gcc_assert (d->flag);
9995
9996 scratch0 = gen_reg_rtx (tmode0);
9997 scratch1 = gen_reg_rtx (tmode1);
9998
9999 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
10000 }
10001
10002 if (! pat)
10003 return 0;
10004
10005 emit_insn (pat);
10006
10007 if (d->flag)
10008 {
10009 target = gen_reg_rtx (SImode);
10010 emit_move_insn (target, const0_rtx);
10011 target = gen_rtx_SUBREG (QImode, target, 0);
10012
10013 emit_insn
10014 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10015 gen_rtx_fmt_ee (EQ, QImode,
10016 gen_rtx_REG ((machine_mode) d->flag,
10017 FLAGS_REG),
10018 const0_rtx)));
10019 return SUBREG_REG (target);
10020 }
10021 else
10022 return target;
10023 }
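
/* Rough usage sketch (intrinsic mapping is illustrative): _mm_cmpestri
   maps to IX86_BUILTIN_PCMPESTRI128 and returns the index result from
   tmode0, _mm_cmpestrm maps to IX86_BUILTIN_PCMPESTRM128 and returns
   the mask from tmode1, and the remaining variants have d->flag
   nonzero, so both outputs go to scratch registers and a single status
   flag is materialized from FLAGS_REG as shown above.  */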
10024
10025
10026 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
10027
10028 static rtx
10029 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
10030 tree exp, rtx target)
10031 {
10032 rtx pat;
10033 tree arg0 = CALL_EXPR_ARG (exp, 0);
10034 tree arg1 = CALL_EXPR_ARG (exp, 1);
10035 tree arg2 = CALL_EXPR_ARG (exp, 2);
10036 rtx scratch0, scratch1;
10037 rtx op0 = expand_normal (arg0);
10038 rtx op1 = expand_normal (arg1);
10039 rtx op2 = expand_normal (arg2);
10040 machine_mode tmode0, tmode1, modev2, modev3, modeimm;
10041
10042 tmode0 = insn_data[d->icode].operand[0].mode;
10043 tmode1 = insn_data[d->icode].operand[1].mode;
10044 modev2 = insn_data[d->icode].operand[2].mode;
10045 modev3 = insn_data[d->icode].operand[3].mode;
10046 modeimm = insn_data[d->icode].operand[4].mode;
10047
10048 if (VECTOR_MODE_P (modev2))
10049 op0 = safe_vector_operand (op0, modev2);
10050 if (VECTOR_MODE_P (modev3))
10051 op1 = safe_vector_operand (op1, modev3);
10052
10053 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
10054 op0 = copy_to_mode_reg (modev2, op0);
10055 if ((optimize && !register_operand (op1, modev3))
10056 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
10057 op1 = copy_to_mode_reg (modev3, op1);
10058
10059 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
10060 {
10061 error ("the third argument must be an 8-bit immediate");
10062 return const0_rtx;
10063 }
10064
10065 if (d->code == IX86_BUILTIN_PCMPISTRI128)
10066 {
10067 if (optimize || !target
10068 || GET_MODE (target) != tmode0
10069 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
10070 target = gen_reg_rtx (tmode0);
10071
10072 scratch1 = gen_reg_rtx (tmode1);
10073
10074 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
10075 }
10076 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
10077 {
10078 if (optimize || !target
10079 || GET_MODE (target) != tmode1
10080 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
10081 target = gen_reg_rtx (tmode1);
10082
10083 scratch0 = gen_reg_rtx (tmode0);
10084
10085 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
10086 }
10087 else
10088 {
10089 gcc_assert (d->flag);
10090
10091 scratch0 = gen_reg_rtx (tmode0);
10092 scratch1 = gen_reg_rtx (tmode1);
10093
10094 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
10095 }
10096
10097 if (! pat)
10098 return 0;
10099
10100 emit_insn (pat);
10101
10102 if (d->flag)
10103 {
10104 target = gen_reg_rtx (SImode);
10105 emit_move_insn (target, const0_rtx);
10106 target = gen_rtx_SUBREG (QImode, target, 0);
10107
10108 emit_insn
10109 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10110 gen_rtx_fmt_ee (EQ, QImode,
10111 gen_rtx_REG ((machine_mode) d->flag,
10112 FLAGS_REG),
10113 const0_rtx)));
10114 return SUBREG_REG (target);
10115 }
10116 else
10117 return target;
10118 }
10119
10120 /* Fix up modeless constants to fit the required mode. */
10121
10122 static rtx
10123 fixup_modeless_constant (rtx x, machine_mode mode)
10124 {
10125 if (GET_MODE (x) == VOIDmode)
10126 x = convert_to_mode (mode, x, 1);
10127 return x;
10128 }
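
/* Illustrative note: integer constants are modeless (VOIDmode) in RTL,
   so before the operand predicates are consulted such a constant is
   rewritten with convert_to_mode; the trailing 1 treats the value as
   unsigned for the conversion.  */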
10129
10130 /* Subroutine of ix86_expand_builtin to take care of insns with
10131 a variable number of operands. */
10132
10133 static rtx
10134 ix86_expand_args_builtin (const struct builtin_description *d,
10135 tree exp, rtx target)
10136 {
10137 rtx pat, real_target;
10138 unsigned int i, nargs;
10139 unsigned int nargs_constant = 0;
10140 unsigned int mask_pos = 0;
10141 int num_memory = 0;
10142 rtx xops[6];
10143 bool second_arg_count = false;
10144 enum insn_code icode = d->icode;
10145 const struct insn_data_d *insn_p = &insn_data[icode];
10146 machine_mode tmode = insn_p->operand[0].mode;
10147 machine_mode rmode = VOIDmode;
10148 bool swap = false;
10149 enum rtx_code comparison = d->comparison;
10150
10151 switch ((enum ix86_builtin_func_type) d->flag)
10152 {
10153 case V2DF_FTYPE_V2DF_ROUND:
10154 case V4DF_FTYPE_V4DF_ROUND:
10155 case V8DF_FTYPE_V8DF_ROUND:
10156 case V4SF_FTYPE_V4SF_ROUND:
10157 case V8SF_FTYPE_V8SF_ROUND:
10158 case V16SF_FTYPE_V16SF_ROUND:
10159 case V8HF_FTYPE_V8HF_ROUND:
10160 case V16HF_FTYPE_V16HF_ROUND:
10161 case V32HF_FTYPE_V32HF_ROUND:
10162 case V4SI_FTYPE_V4SF_ROUND:
10163 case V8SI_FTYPE_V8SF_ROUND:
10164 case V16SI_FTYPE_V16SF_ROUND:
10165 return ix86_expand_sse_round (d, exp, target);
10166 case V4SI_FTYPE_V2DF_V2DF_ROUND:
10167 case V8SI_FTYPE_V4DF_V4DF_ROUND:
10168 case V16SI_FTYPE_V8DF_V8DF_ROUND:
10169 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
10170 case INT_FTYPE_V8SF_V8SF_PTEST:
10171 case INT_FTYPE_V4DI_V4DI_PTEST:
10172 case INT_FTYPE_V4DF_V4DF_PTEST:
10173 case INT_FTYPE_V4SF_V4SF_PTEST:
10174 case INT_FTYPE_V2DI_V2DI_PTEST:
10175 case INT_FTYPE_V2DF_V2DF_PTEST:
10176 return ix86_expand_sse_ptest (d, exp, target);
10177 case FLOAT128_FTYPE_FLOAT128:
10178 case FLOAT_FTYPE_FLOAT:
10179 case INT_FTYPE_INT:
10180 case UINT_FTYPE_UINT:
10181 case UINT16_FTYPE_UINT16:
10182 case UINT64_FTYPE_INT:
10183 case UINT64_FTYPE_UINT64:
10184 case INT64_FTYPE_INT64:
10185 case INT64_FTYPE_V4SF:
10186 case INT64_FTYPE_V2DF:
10187 case INT_FTYPE_V16QI:
10188 case INT_FTYPE_V8QI:
10189 case INT_FTYPE_V8SF:
10190 case INT_FTYPE_V4DF:
10191 case INT_FTYPE_V4SF:
10192 case INT_FTYPE_V2DF:
10193 case INT_FTYPE_V32QI:
10194 case V16QI_FTYPE_V16QI:
10195 case V8SI_FTYPE_V8SF:
10196 case V8SI_FTYPE_V4SI:
10197 case V8HI_FTYPE_V8HI:
10198 case V8HI_FTYPE_V16QI:
10199 case V8QI_FTYPE_V8QI:
10200 case V8SF_FTYPE_V8SF:
10201 case V8SF_FTYPE_V8SI:
10202 case V8SF_FTYPE_V4SF:
10203 case V8SF_FTYPE_V8HI:
10204 case V4SI_FTYPE_V4SI:
10205 case V4SI_FTYPE_V16QI:
10206 case V4SI_FTYPE_V4SF:
10207 case V4SI_FTYPE_V8SI:
10208 case V4SI_FTYPE_V8HI:
10209 case V4SI_FTYPE_V4DF:
10210 case V4SI_FTYPE_V2DF:
10211 case V4HI_FTYPE_V4HI:
10212 case V4DF_FTYPE_V4DF:
10213 case V4DF_FTYPE_V4SI:
10214 case V4DF_FTYPE_V4SF:
10215 case V4DF_FTYPE_V2DF:
10216 case V4SF_FTYPE_V4SF:
10217 case V4SF_FTYPE_V4SI:
10218 case V4SF_FTYPE_V8SF:
10219 case V4SF_FTYPE_V4DF:
10220 case V4SF_FTYPE_V8HI:
10221 case V4SF_FTYPE_V2DF:
10222 case V2DI_FTYPE_V2DI:
10223 case V2DI_FTYPE_V16QI:
10224 case V2DI_FTYPE_V8HI:
10225 case V2DI_FTYPE_V4SI:
10226 case V2DF_FTYPE_V2DF:
10227 case V2DF_FTYPE_V4SI:
10228 case V2DF_FTYPE_V4DF:
10229 case V2DF_FTYPE_V4SF:
10230 case V2DF_FTYPE_V2SI:
10231 case V2SI_FTYPE_V2SI:
10232 case V2SI_FTYPE_V4SF:
10233 case V2SI_FTYPE_V2SF:
10234 case V2SI_FTYPE_V2DF:
10235 case V2SF_FTYPE_V2SF:
10236 case V2SF_FTYPE_V2SI:
10237 case V32QI_FTYPE_V32QI:
10238 case V32QI_FTYPE_V16QI:
10239 case V16HI_FTYPE_V16HI:
10240 case V16HI_FTYPE_V8HI:
10241 case V8SI_FTYPE_V8SI:
10242 case V16HI_FTYPE_V16QI:
10243 case V8SI_FTYPE_V16QI:
10244 case V4DI_FTYPE_V16QI:
10245 case V8SI_FTYPE_V8HI:
10246 case V4DI_FTYPE_V8HI:
10247 case V4DI_FTYPE_V4SI:
10248 case V4DI_FTYPE_V2DI:
10249 case UQI_FTYPE_UQI:
10250 case UHI_FTYPE_UHI:
10251 case USI_FTYPE_USI:
10252 case USI_FTYPE_UQI:
10253 case USI_FTYPE_UHI:
10254 case UDI_FTYPE_UDI:
10255 case UHI_FTYPE_V16QI:
10256 case USI_FTYPE_V32QI:
10257 case UDI_FTYPE_V64QI:
10258 case V16QI_FTYPE_UHI:
10259 case V32QI_FTYPE_USI:
10260 case V64QI_FTYPE_UDI:
10261 case V8HI_FTYPE_UQI:
10262 case V16HI_FTYPE_UHI:
10263 case V32HI_FTYPE_USI:
10264 case V4SI_FTYPE_UQI:
10265 case V8SI_FTYPE_UQI:
10266 case V4SI_FTYPE_UHI:
10267 case V8SI_FTYPE_UHI:
10268 case UQI_FTYPE_V8HI:
10269 case UHI_FTYPE_V16HI:
10270 case USI_FTYPE_V32HI:
10271 case UQI_FTYPE_V4SI:
10272 case UQI_FTYPE_V8SI:
10273 case UHI_FTYPE_V16SI:
10274 case UQI_FTYPE_V2DI:
10275 case UQI_FTYPE_V4DI:
10276 case UQI_FTYPE_V8DI:
10277 case V16SI_FTYPE_UHI:
10278 case V2DI_FTYPE_UQI:
10279 case V4DI_FTYPE_UQI:
10280 case V16SI_FTYPE_INT:
10281 case V16SF_FTYPE_V8SF:
10282 case V16SI_FTYPE_V8SI:
10283 case V16SF_FTYPE_V4SF:
10284 case V16SI_FTYPE_V4SI:
10285 case V16SI_FTYPE_V16SF:
10286 case V16SI_FTYPE_V16SI:
10287 case V64QI_FTYPE_V64QI:
10288 case V32HI_FTYPE_V32HI:
10289 case V16SF_FTYPE_V16SF:
10290 case V8DI_FTYPE_UQI:
10291 case V8DI_FTYPE_V8DI:
10292 case V8DF_FTYPE_V4DF:
10293 case V8DF_FTYPE_V2DF:
10294 case V8DF_FTYPE_V8DF:
10295 case V4DI_FTYPE_V4DI:
10296 case V16HI_FTYPE_V16SF:
10297 case V8HI_FTYPE_V8SF:
10298 case V8HI_FTYPE_V4SF:
10299 nargs = 1;
10300 break;
10301 case V4SF_FTYPE_V4SF_VEC_MERGE:
10302 case V2DF_FTYPE_V2DF_VEC_MERGE:
10303 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
10304 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
10305 case V16QI_FTYPE_V16QI_V16QI:
10306 case V16QI_FTYPE_V8HI_V8HI:
10307 case V16HF_FTYPE_V16HF_V16HF:
10308 case V16SF_FTYPE_V16SF_V16SF:
10309 case V8QI_FTYPE_V8QI_V8QI:
10310 case V8QI_FTYPE_V4HI_V4HI:
10311 case V8HI_FTYPE_V8HI_V8HI:
10312 case V8HI_FTYPE_V16QI_V16QI:
10313 case V8HI_FTYPE_V4SI_V4SI:
10314 case V8HF_FTYPE_V8HF_V8HF:
10315 case V8SF_FTYPE_V8SF_V8SF:
10316 case V8SF_FTYPE_V8SF_V8SI:
10317 case V8DF_FTYPE_V8DF_V8DF:
10318 case V4SI_FTYPE_V4SI_V4SI:
10319 case V4SI_FTYPE_V8HI_V8HI:
10320 case V4SI_FTYPE_V2DF_V2DF:
10321 case V4HI_FTYPE_V4HI_V4HI:
10322 case V4HI_FTYPE_V8QI_V8QI:
10323 case V4HI_FTYPE_V2SI_V2SI:
10324 case V4DF_FTYPE_V4DF_V4DF:
10325 case V4DF_FTYPE_V4DF_V4DI:
10326 case V4SF_FTYPE_V4SF_V4SF:
10327 case V4SF_FTYPE_V4SF_V4SI:
10328 case V4SF_FTYPE_V4SF_V2SI:
10329 case V4SF_FTYPE_V4SF_V2DF:
10330 case V4SF_FTYPE_V4SF_UINT:
10331 case V4SF_FTYPE_V4SF_DI:
10332 case V4SF_FTYPE_V4SF_SI:
10333 case V2DI_FTYPE_V2DI_V2DI:
10334 case V2DI_FTYPE_V16QI_V16QI:
10335 case V2DI_FTYPE_V4SI_V4SI:
10336 case V2DI_FTYPE_V2DI_V16QI:
10337 case V2SI_FTYPE_V2SI_V2SI:
10338 case V2SI_FTYPE_V4HI_V4HI:
10339 case V2SI_FTYPE_V2SF_V2SF:
10340 case V2DF_FTYPE_V2DF_V2DF:
10341 case V2DF_FTYPE_V2DF_V4SF:
10342 case V2DF_FTYPE_V2DF_V2DI:
10343 case V2DF_FTYPE_V2DF_DI:
10344 case V2DF_FTYPE_V2DF_SI:
10345 case V2DF_FTYPE_V2DF_UINT:
10346 case V2SF_FTYPE_V2SF_V2SF:
10347 case V1DI_FTYPE_V1DI_V1DI:
10348 case V1DI_FTYPE_V8QI_V8QI:
10349 case V1DI_FTYPE_V2SI_V2SI:
10350 case V32QI_FTYPE_V16HI_V16HI:
10351 case V16HI_FTYPE_V8SI_V8SI:
10352 case V64QI_FTYPE_V64QI_V64QI:
10353 case V32QI_FTYPE_V32QI_V32QI:
10354 case V16HI_FTYPE_V32QI_V32QI:
10355 case V16HI_FTYPE_V16HI_V16HI:
10356 case V8SI_FTYPE_V4DF_V4DF:
10357 case V8SI_FTYPE_V8SI_V8SI:
10358 case V8SI_FTYPE_V16HI_V16HI:
10359 case V4DI_FTYPE_V4DI_V4DI:
10360 case V4DI_FTYPE_V8SI_V8SI:
10361 case V8DI_FTYPE_V64QI_V64QI:
10362 if (comparison == UNKNOWN)
10363 return ix86_expand_binop_builtin (icode, exp, target);
10364 nargs = 2;
10365 break;
10366 case V4SF_FTYPE_V4SF_V4SF_SWAP:
10367 case V2DF_FTYPE_V2DF_V2DF_SWAP:
10368 gcc_assert (comparison != UNKNOWN);
10369 nargs = 2;
10370 swap = true;
10371 break;
10372 case V16HI_FTYPE_V16HI_V8HI_COUNT:
10373 case V16HI_FTYPE_V16HI_SI_COUNT:
10374 case V8SI_FTYPE_V8SI_V4SI_COUNT:
10375 case V8SI_FTYPE_V8SI_SI_COUNT:
10376 case V4DI_FTYPE_V4DI_V2DI_COUNT:
10377 case V4DI_FTYPE_V4DI_INT_COUNT:
10378 case V8HI_FTYPE_V8HI_V8HI_COUNT:
10379 case V8HI_FTYPE_V8HI_SI_COUNT:
10380 case V4SI_FTYPE_V4SI_V4SI_COUNT:
10381 case V4SI_FTYPE_V4SI_SI_COUNT:
10382 case V4HI_FTYPE_V4HI_V4HI_COUNT:
10383 case V4HI_FTYPE_V4HI_SI_COUNT:
10384 case V2DI_FTYPE_V2DI_V2DI_COUNT:
10385 case V2DI_FTYPE_V2DI_SI_COUNT:
10386 case V2SI_FTYPE_V2SI_V2SI_COUNT:
10387 case V2SI_FTYPE_V2SI_SI_COUNT:
10388 case V1DI_FTYPE_V1DI_V1DI_COUNT:
10389 case V1DI_FTYPE_V1DI_SI_COUNT:
10390 nargs = 2;
10391 second_arg_count = true;
10392 break;
10393 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
10394 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
10395 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
10396 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
10397 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
10398 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
10399 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
10400 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
10401 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
10402 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
10403 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
10404 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
10405 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
10406 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
10407 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
10408 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
10409 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
10410 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
10411 nargs = 4;
10412 second_arg_count = true;
10413 break;
10414 case UINT64_FTYPE_UINT64_UINT64:
10415 case UINT_FTYPE_UINT_UINT:
10416 case UINT_FTYPE_UINT_USHORT:
10417 case UINT_FTYPE_UINT_UCHAR:
10418 case UINT16_FTYPE_UINT16_INT:
10419 case UINT8_FTYPE_UINT8_INT:
10420 case UQI_FTYPE_UQI_UQI:
10421 case UHI_FTYPE_UHI_UHI:
10422 case USI_FTYPE_USI_USI:
10423 case UDI_FTYPE_UDI_UDI:
10424 case V16SI_FTYPE_V8DF_V8DF:
10425 case V32HI_FTYPE_V16SF_V16SF:
10426 case V16HI_FTYPE_V8SF_V8SF:
10427 case V8HI_FTYPE_V4SF_V4SF:
10428 case V16HI_FTYPE_V16SF_UHI:
10429 case V8HI_FTYPE_V8SF_UQI:
10430 case V8HI_FTYPE_V4SF_UQI:
10431 nargs = 2;
10432 break;
10433 case V2DI_FTYPE_V2DI_INT_CONVERT:
10434 nargs = 2;
10435 rmode = V1TImode;
10436 nargs_constant = 1;
10437 break;
10438 case V4DI_FTYPE_V4DI_INT_CONVERT:
10439 nargs = 2;
10440 rmode = V2TImode;
10441 nargs_constant = 1;
10442 break;
10443 case V8DI_FTYPE_V8DI_INT_CONVERT:
10444 nargs = 2;
10445 rmode = V4TImode;
10446 nargs_constant = 1;
10447 break;
10448 case V8HI_FTYPE_V8HI_INT:
10449 case V8HI_FTYPE_V8SF_INT:
10450 case V16HI_FTYPE_V16SF_INT:
10451 case V8HI_FTYPE_V4SF_INT:
10452 case V8SF_FTYPE_V8SF_INT:
10453 case V4SF_FTYPE_V16SF_INT:
10454 case V16SF_FTYPE_V16SF_INT:
10455 case V4SI_FTYPE_V4SI_INT:
10456 case V4SI_FTYPE_V8SI_INT:
10457 case V4HI_FTYPE_V4HI_INT:
10458 case V4DF_FTYPE_V4DF_INT:
10459 case V4DF_FTYPE_V8DF_INT:
10460 case V4SF_FTYPE_V4SF_INT:
10461 case V4SF_FTYPE_V8SF_INT:
10462 case V2DI_FTYPE_V2DI_INT:
10463 case V2DF_FTYPE_V2DF_INT:
10464 case V2DF_FTYPE_V4DF_INT:
10465 case V16HI_FTYPE_V16HI_INT:
10466 case V8SI_FTYPE_V8SI_INT:
10467 case V16SI_FTYPE_V16SI_INT:
10468 case V4SI_FTYPE_V16SI_INT:
10469 case V4DI_FTYPE_V4DI_INT:
10470 case V2DI_FTYPE_V4DI_INT:
10471 case V4DI_FTYPE_V8DI_INT:
10472 case UQI_FTYPE_UQI_UQI_CONST:
10473 case UHI_FTYPE_UHI_UQI:
10474 case USI_FTYPE_USI_UQI:
10475 case UDI_FTYPE_UDI_UQI:
10476 nargs = 2;
10477 nargs_constant = 1;
10478 break;
10479 case V16QI_FTYPE_V16QI_V16QI_V16QI:
10480 case V8SF_FTYPE_V8SF_V8SF_V8SF:
10481 case V4DF_FTYPE_V4DF_V4DF_V4DF:
10482 case V4SF_FTYPE_V4SF_V4SF_V4SF:
10483 case V2DF_FTYPE_V2DF_V2DF_V2DF:
10484 case V32QI_FTYPE_V32QI_V32QI_V32QI:
10485 case UHI_FTYPE_V16SI_V16SI_UHI:
10486 case UQI_FTYPE_V8DI_V8DI_UQI:
10487 case V16HI_FTYPE_V16SI_V16HI_UHI:
10488 case V16QI_FTYPE_V16SI_V16QI_UHI:
10489 case V16QI_FTYPE_V8DI_V16QI_UQI:
10490 case V32HF_FTYPE_V32HF_V32HF_USI:
10491 case V16SF_FTYPE_V16SF_V16SF_UHI:
10492 case V16SF_FTYPE_V4SF_V16SF_UHI:
10493 case V16SI_FTYPE_SI_V16SI_UHI:
10494 case V16SI_FTYPE_V16HI_V16SI_UHI:
10495 case V16SI_FTYPE_V16QI_V16SI_UHI:
10496 case V8SF_FTYPE_V4SF_V8SF_UQI:
10497 case V4DF_FTYPE_V2DF_V4DF_UQI:
10498 case V8SI_FTYPE_V4SI_V8SI_UQI:
10499 case V8SI_FTYPE_SI_V8SI_UQI:
10500 case V4SI_FTYPE_V4SI_V4SI_UQI:
10501 case V4SI_FTYPE_SI_V4SI_UQI:
10502 case V4DI_FTYPE_V2DI_V4DI_UQI:
10503 case V4DI_FTYPE_DI_V4DI_UQI:
10504 case V2DI_FTYPE_V2DI_V2DI_UQI:
10505 case V2DI_FTYPE_DI_V2DI_UQI:
10506 case V64QI_FTYPE_V64QI_V64QI_UDI:
10507 case V64QI_FTYPE_V16QI_V64QI_UDI:
10508 case V64QI_FTYPE_QI_V64QI_UDI:
10509 case V32QI_FTYPE_V32QI_V32QI_USI:
10510 case V32QI_FTYPE_V16QI_V32QI_USI:
10511 case V32QI_FTYPE_QI_V32QI_USI:
10512 case V16QI_FTYPE_V16QI_V16QI_UHI:
10513 case V16QI_FTYPE_QI_V16QI_UHI:
10514 case V32HI_FTYPE_V8HI_V32HI_USI:
10515 case V32HI_FTYPE_HI_V32HI_USI:
10516 case V16HI_FTYPE_V8HI_V16HI_UHI:
10517 case V16HI_FTYPE_HI_V16HI_UHI:
10518 case V8HI_FTYPE_V8HI_V8HI_UQI:
10519 case V8HI_FTYPE_HI_V8HI_UQI:
10520 case V16HF_FTYPE_V16HF_V16HF_UHI:
10521 case V8SF_FTYPE_V8HI_V8SF_UQI:
10522 case V4SF_FTYPE_V8HI_V4SF_UQI:
10523 case V8SI_FTYPE_V8HF_V8SI_UQI:
10524 case V8SF_FTYPE_V8HF_V8SF_UQI:
10525 case V8SI_FTYPE_V8SF_V8SI_UQI:
10526 case V4SI_FTYPE_V4SF_V4SI_UQI:
10527 case V4SI_FTYPE_V8HF_V4SI_UQI:
10528 case V4SF_FTYPE_V8HF_V4SF_UQI:
10529 case V4DI_FTYPE_V8HF_V4DI_UQI:
10530 case V4DI_FTYPE_V4SF_V4DI_UQI:
10531 case V2DI_FTYPE_V8HF_V2DI_UQI:
10532 case V2DI_FTYPE_V4SF_V2DI_UQI:
10533 case V8HF_FTYPE_V8HF_V8HF_UQI:
10534 case V8HF_FTYPE_V8HF_V8HF_V8HF:
10535 case V8HF_FTYPE_V8HI_V8HF_UQI:
10536 case V8HF_FTYPE_V8SI_V8HF_UQI:
10537 case V8HF_FTYPE_V8SF_V8HF_UQI:
10538 case V8HF_FTYPE_V4SI_V8HF_UQI:
10539 case V8HF_FTYPE_V4SF_V8HF_UQI:
10540 case V8HF_FTYPE_V4DI_V8HF_UQI:
10541 case V8HF_FTYPE_V4DF_V8HF_UQI:
10542 case V8HF_FTYPE_V2DI_V8HF_UQI:
10543 case V8HF_FTYPE_V2DF_V8HF_UQI:
10544 case V4SF_FTYPE_V4DI_V4SF_UQI:
10545 case V4SF_FTYPE_V2DI_V4SF_UQI:
10546 case V4DF_FTYPE_V4DI_V4DF_UQI:
10547 case V4DF_FTYPE_V8HF_V4DF_UQI:
10548 case V2DF_FTYPE_V8HF_V2DF_UQI:
10549 case V2DF_FTYPE_V2DI_V2DF_UQI:
10550 case V16QI_FTYPE_V8HI_V16QI_UQI:
10551 case V16QI_FTYPE_V16HI_V16QI_UHI:
10552 case V16QI_FTYPE_V4SI_V16QI_UQI:
10553 case V16QI_FTYPE_V8SI_V16QI_UQI:
10554 case V8HI_FTYPE_V8HF_V8HI_UQI:
10555 case V8HI_FTYPE_V4SI_V8HI_UQI:
10556 case V8HI_FTYPE_V8SI_V8HI_UQI:
10557 case V16QI_FTYPE_V2DI_V16QI_UQI:
10558 case V16QI_FTYPE_V4DI_V16QI_UQI:
10559 case V8HI_FTYPE_V2DI_V8HI_UQI:
10560 case V8HI_FTYPE_V4DI_V8HI_UQI:
10561 case V4SI_FTYPE_V2DI_V4SI_UQI:
10562 case V4SI_FTYPE_V4DI_V4SI_UQI:
10563 case V32QI_FTYPE_V32HI_V32QI_USI:
10564 case UHI_FTYPE_V16QI_V16QI_UHI:
10565 case USI_FTYPE_V32QI_V32QI_USI:
10566 case UDI_FTYPE_V64QI_V64QI_UDI:
10567 case UQI_FTYPE_V8HI_V8HI_UQI:
10568 case UHI_FTYPE_V16HI_V16HI_UHI:
10569 case USI_FTYPE_V32HI_V32HI_USI:
10570 case UQI_FTYPE_V4SI_V4SI_UQI:
10571 case UQI_FTYPE_V8SI_V8SI_UQI:
10572 case UQI_FTYPE_V2DI_V2DI_UQI:
10573 case UQI_FTYPE_V4DI_V4DI_UQI:
10574 case V4SF_FTYPE_V2DF_V4SF_UQI:
10575 case V4SF_FTYPE_V4DF_V4SF_UQI:
10576 case V16SI_FTYPE_V16SI_V16SI_UHI:
10577 case V16SI_FTYPE_V4SI_V16SI_UHI:
10578 case V2DI_FTYPE_V4SI_V2DI_UQI:
10579 case V2DI_FTYPE_V8HI_V2DI_UQI:
10580 case V2DI_FTYPE_V16QI_V2DI_UQI:
10581 case V4DI_FTYPE_V4DI_V4DI_UQI:
10582 case V4DI_FTYPE_V4SI_V4DI_UQI:
10583 case V4DI_FTYPE_V8HI_V4DI_UQI:
10584 case V4DI_FTYPE_V16QI_V4DI_UQI:
10585 case V4DI_FTYPE_V4DF_V4DI_UQI:
10586 case V2DI_FTYPE_V2DF_V2DI_UQI:
10587 case V4SI_FTYPE_V4DF_V4SI_UQI:
10588 case V4SI_FTYPE_V2DF_V4SI_UQI:
10589 case V4SI_FTYPE_V8HI_V4SI_UQI:
10590 case V4SI_FTYPE_V16QI_V4SI_UQI:
10591 case V4DI_FTYPE_V4DI_V4DI_V4DI:
10592 case V8DF_FTYPE_V2DF_V8DF_UQI:
10593 case V8DF_FTYPE_V4DF_V8DF_UQI:
10594 case V8DF_FTYPE_V8DF_V8DF_UQI:
10595 case V8SF_FTYPE_V8SF_V8SF_UQI:
10596 case V8SF_FTYPE_V8SI_V8SF_UQI:
10597 case V4DF_FTYPE_V4DF_V4DF_UQI:
10598 case V4SF_FTYPE_V4SF_V4SF_UQI:
10599 case V2DF_FTYPE_V2DF_V2DF_UQI:
10600 case V2DF_FTYPE_V4SF_V2DF_UQI:
10601 case V2DF_FTYPE_V4SI_V2DF_UQI:
10602 case V4SF_FTYPE_V4SI_V4SF_UQI:
10603 case V4DF_FTYPE_V4SF_V4DF_UQI:
10604 case V4DF_FTYPE_V4SI_V4DF_UQI:
10605 case V8SI_FTYPE_V8SI_V8SI_UQI:
10606 case V8SI_FTYPE_V8HI_V8SI_UQI:
10607 case V8SI_FTYPE_V16QI_V8SI_UQI:
10608 case V8DF_FTYPE_V8SI_V8DF_UQI:
10609 case V8DI_FTYPE_DI_V8DI_UQI:
10610 case V16SF_FTYPE_V8SF_V16SF_UHI:
10611 case V16SI_FTYPE_V8SI_V16SI_UHI:
10612 case V16HF_FTYPE_V16HI_V16HF_UHI:
10613 case V16HF_FTYPE_V16HF_V16HF_V16HF:
10614 case V16HI_FTYPE_V16HF_V16HI_UHI:
10615 case V16HI_FTYPE_V16HI_V16HI_UHI:
10616 case V8HI_FTYPE_V16QI_V8HI_UQI:
10617 case V16HI_FTYPE_V16QI_V16HI_UHI:
10618 case V32HI_FTYPE_V32HI_V32HI_USI:
10619 case V32HI_FTYPE_V32QI_V32HI_USI:
10620 case V8DI_FTYPE_V16QI_V8DI_UQI:
10621 case V8DI_FTYPE_V2DI_V8DI_UQI:
10622 case V8DI_FTYPE_V4DI_V8DI_UQI:
10623 case V8DI_FTYPE_V8DI_V8DI_UQI:
10624 case V8DI_FTYPE_V8HI_V8DI_UQI:
10625 case V8DI_FTYPE_V8SI_V8DI_UQI:
10626 case V8HI_FTYPE_V8DI_V8HI_UQI:
10627 case V8SI_FTYPE_V8DI_V8SI_UQI:
10628 case V4SI_FTYPE_V4SI_V4SI_V4SI:
10629 case V16SI_FTYPE_V16SI_V16SI_V16SI:
10630 case V8DI_FTYPE_V8DI_V8DI_V8DI:
10631 case V32HI_FTYPE_V32HI_V32HI_V32HI:
10632 case V2DI_FTYPE_V2DI_V2DI_V2DI:
10633 case V16HI_FTYPE_V16HI_V16HI_V16HI:
10634 case V8SI_FTYPE_V8SI_V8SI_V8SI:
10635 case V8HI_FTYPE_V8HI_V8HI_V8HI:
10636 case V32HI_FTYPE_V16SF_V16SF_USI:
10637 case V16HI_FTYPE_V8SF_V8SF_UHI:
10638 case V8HI_FTYPE_V4SF_V4SF_UQI:
10639 case V16HI_FTYPE_V16SF_V16HI_UHI:
10640 case V8HI_FTYPE_V8SF_V8HI_UQI:
10641 case V8HI_FTYPE_V4SF_V8HI_UQI:
10642 case V16SF_FTYPE_V16SF_V32HI_V32HI:
10643 case V8SF_FTYPE_V8SF_V16HI_V16HI:
10644 case V4SF_FTYPE_V4SF_V8HI_V8HI:
10645 nargs = 3;
10646 break;
10647 case V32QI_FTYPE_V32QI_V32QI_INT:
10648 case V16HI_FTYPE_V16HI_V16HI_INT:
10649 case V16QI_FTYPE_V16QI_V16QI_INT:
10650 case V4DI_FTYPE_V4DI_V4DI_INT:
10651 case V8HI_FTYPE_V8HI_V8HI_INT:
10652 case V8SI_FTYPE_V8SI_V8SI_INT:
10653 case V8SI_FTYPE_V8SI_V4SI_INT:
10654 case V8SF_FTYPE_V8SF_V8SF_INT:
10655 case V8SF_FTYPE_V8SF_V4SF_INT:
10656 case V4SI_FTYPE_V4SI_V4SI_INT:
10657 case V4DF_FTYPE_V4DF_V4DF_INT:
10658 case V16SF_FTYPE_V16SF_V16SF_INT:
10659 case V16SF_FTYPE_V16SF_V4SF_INT:
10660 case V16SI_FTYPE_V16SI_V4SI_INT:
10661 case V4DF_FTYPE_V4DF_V2DF_INT:
10662 case V4SF_FTYPE_V4SF_V4SF_INT:
10663 case V2DI_FTYPE_V2DI_V2DI_INT:
10664 case V4DI_FTYPE_V4DI_V2DI_INT:
10665 case V2DF_FTYPE_V2DF_V2DF_INT:
10666 case UQI_FTYPE_V8DI_V8UDI_INT:
10667 case UQI_FTYPE_V8DF_V8DF_INT:
10668 case UQI_FTYPE_V2DF_V2DF_INT:
10669 case UQI_FTYPE_V4SF_V4SF_INT:
10670 case UHI_FTYPE_V16SI_V16SI_INT:
10671 case UHI_FTYPE_V16SF_V16SF_INT:
10672 case V64QI_FTYPE_V64QI_V64QI_INT:
10673 case V32HI_FTYPE_V32HI_V32HI_INT:
10674 case V16SI_FTYPE_V16SI_V16SI_INT:
10675 case V8DI_FTYPE_V8DI_V8DI_INT:
10676 nargs = 3;
10677 nargs_constant = 1;
10678 break;
10679 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
10680 nargs = 3;
10681 rmode = V4DImode;
10682 nargs_constant = 1;
10683 break;
10684 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
10685 nargs = 3;
10686 rmode = V2DImode;
10687 nargs_constant = 1;
10688 break;
10689 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
10690 nargs = 3;
10691 rmode = DImode;
10692 nargs_constant = 1;
10693 break;
10694 case V2DI_FTYPE_V2DI_UINT_UINT:
10695 nargs = 3;
10696 nargs_constant = 2;
10697 break;
10698 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
10699 nargs = 3;
10700 rmode = V8DImode;
10701 nargs_constant = 1;
10702 break;
10703 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
10704 nargs = 5;
10705 rmode = V8DImode;
10706 mask_pos = 2;
10707 nargs_constant = 1;
10708 break;
10709 case QI_FTYPE_V8DF_INT_UQI:
10710 case QI_FTYPE_V4DF_INT_UQI:
10711 case QI_FTYPE_V2DF_INT_UQI:
10712 case HI_FTYPE_V16SF_INT_UHI:
10713 case QI_FTYPE_V8SF_INT_UQI:
10714 case QI_FTYPE_V4SF_INT_UQI:
10715 case QI_FTYPE_V8HF_INT_UQI:
10716 case HI_FTYPE_V16HF_INT_UHI:
10717 case SI_FTYPE_V32HF_INT_USI:
10718 case V4SI_FTYPE_V4SI_V4SI_UHI:
10719 case V8SI_FTYPE_V8SI_V8SI_UHI:
10720 nargs = 3;
10721 mask_pos = 1;
10722 nargs_constant = 1;
10723 break;
10724 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
10725 nargs = 5;
10726 rmode = V4DImode;
10727 mask_pos = 2;
10728 nargs_constant = 1;
10729 break;
10730 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
10731 nargs = 5;
10732 rmode = V2DImode;
10733 mask_pos = 2;
10734 nargs_constant = 1;
10735 break;
10736 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
10737 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
10738 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
10739 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
10740 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
10741 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
10742 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
10743 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
10744 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
10745 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
10746 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
10747 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
10748 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
10749 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
10750 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
10751 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
10752 case V32HF_FTYPE_V32HF_V32HF_V32HF_USI:
10753 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
10754 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
10755 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
10756 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
10757 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
10758 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
10759 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
10760 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
10761 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
10762 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
10763 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
10764 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
10765 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
10766 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
10767 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
10768 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
10769 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
10770 case V16HF_FTYPE_V16HF_V16HF_V16HF_UQI:
10771 case V16HF_FTYPE_V16HF_V16HF_V16HF_UHI:
10772 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
10773 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
10774 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
10775 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
10776 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
10777 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
10778 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
10779 case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI:
10780 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
10781 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
10782 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
10783 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
10784 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
10785 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
10786 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
10787 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
10788 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
10789 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
10790 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
10791 case V32HI_FTYPE_V16SF_V16SF_V32HI_USI:
10792 case V16HI_FTYPE_V8SF_V8SF_V16HI_UHI:
10793 case V8HI_FTYPE_V4SF_V4SF_V8HI_UQI:
10794 nargs = 4;
10795 break;
10796 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
10797 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
10798 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
10799 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
10800 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
10801 nargs = 4;
10802 nargs_constant = 1;
10803 break;
10804 case UQI_FTYPE_V4DI_V4DI_INT_UQI:
10805 case UQI_FTYPE_V8SI_V8SI_INT_UQI:
10806 case QI_FTYPE_V4DF_V4DF_INT_UQI:
10807 case QI_FTYPE_V8SF_V8SF_INT_UQI:
10808 case UHI_FTYPE_V16HF_V16HF_INT_UHI:
10809 case UQI_FTYPE_V2DI_V2DI_INT_UQI:
10810 case UQI_FTYPE_V4SI_V4SI_INT_UQI:
10811 case UQI_FTYPE_V2DF_V2DF_INT_UQI:
10812 case UQI_FTYPE_V4SF_V4SF_INT_UQI:
10813 case UQI_FTYPE_V8HF_V8HF_INT_UQI:
10814 case UDI_FTYPE_V64QI_V64QI_INT_UDI:
10815 case USI_FTYPE_V32QI_V32QI_INT_USI:
10816 case UHI_FTYPE_V16QI_V16QI_INT_UHI:
10817 case USI_FTYPE_V32HI_V32HI_INT_USI:
10818 case USI_FTYPE_V32HF_V32HF_INT_USI:
10819 case UHI_FTYPE_V16HI_V16HI_INT_UHI:
10820 case UQI_FTYPE_V8HI_V8HI_INT_UQI:
10821 nargs = 4;
10822 mask_pos = 1;
10823 nargs_constant = 1;
10824 break;
10825 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
10826 nargs = 4;
10827 nargs_constant = 2;
10828 break;
10829 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
10830 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
10831 case V16SF_FTYPE_V16SF_V32HI_V32HI_UHI:
10832 case V8SF_FTYPE_V8SF_V16HI_V16HI_UQI:
10833 case V4SF_FTYPE_V4SF_V8HI_V8HI_UQI:
10834 nargs = 4;
10835 break;
10836 case UQI_FTYPE_V8DI_V8DI_INT_UQI:
10837 case UHI_FTYPE_V16SI_V16SI_INT_UHI:
10838 mask_pos = 1;
10839 nargs = 4;
10840 nargs_constant = 1;
10841 break;
10842 case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
10843 case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
10844 case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
10845 case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
10846 case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
10847 case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
10848 case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
10849 case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
10850 case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
10851 case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
10852 case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
10853 case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
10854 case V32HI_FTYPE_V32HI_INT_V32HI_USI:
10855 case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
10856 case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
10857 case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
10858 case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
10859 case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
10860 case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
10861 case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
10862 case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
10863 case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
10864 case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
10865 case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
10866 case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
10867 case V16HF_FTYPE_V16HF_INT_V16HF_UHI:
10868 case V8HF_FTYPE_V8HF_INT_V8HF_UQI:
10869 case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
10870 case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
10871 case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
10872 case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
10873 case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
10874 nargs = 4;
10875 mask_pos = 2;
10876 nargs_constant = 1;
10877 break;
10878 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
10879 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
10880 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
10881 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
10882 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
10883 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
10884 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
10885 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
10886 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
10887 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
10888 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
10889 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
10890 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
10891 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
10892 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
10893 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
10894 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
10895 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
10896 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
10897 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
10898 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
10899 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
10900 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
10901 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
10902 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
10903 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
10904 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
10905 nargs = 5;
10906 mask_pos = 2;
10907 nargs_constant = 1;
10908 break;
10909 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
10910 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
10911 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
10912 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
10913 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
10914 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
10915 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
10916 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
10917 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
10918 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
10919 nargs = 5;
10920 mask_pos = 1;
10921 nargs_constant = 1;
10922 break;
10923 case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
10924 case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
10925 case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
10926 case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT:
10927 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT:
10928 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT:
10929 case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT:
10930 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT:
10931 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT:
10932 case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT:
10933 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT:
10934 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT:
10935 nargs = 5;
10936 mask_pos = 1;
10937 nargs_constant = 2;
10938 break;
10939
10940 default:
10941 gcc_unreachable ();
10942 }
10943
10944 gcc_assert (nargs <= ARRAY_SIZE (xops));
10945
10946 if (comparison != UNKNOWN)
10947 {
10948 gcc_assert (nargs == 2);
10949 return ix86_expand_sse_compare (d, exp, target, swap);
10950 }
10951
10952 if (rmode == VOIDmode || rmode == tmode)
10953 {
10954 if (optimize
10955 || target == 0
10956 || GET_MODE (target) != tmode
10957 || !insn_p->operand[0].predicate (target, tmode))
10958 target = gen_reg_rtx (tmode);
10959 else if (memory_operand (target, tmode))
10960 num_memory++;
10961 real_target = target;
10962 }
10963 else
10964 {
10965 real_target = gen_reg_rtx (tmode);
10966 target = lowpart_subreg (rmode, real_target, tmode);
10967 }
10968
10969 for (i = 0; i < nargs; i++)
10970 {
10971 tree arg = CALL_EXPR_ARG (exp, i);
10972 rtx op = expand_normal (arg);
10973 machine_mode mode = insn_p->operand[i + 1].mode;
10974 bool match = insn_p->operand[i + 1].predicate (op, mode);
10975
10976 if (second_arg_count && i == 1)
10977 {
10978 /* SIMD shift insns take either an 8-bit immediate or a
10979 register as the count, but the builtin functions take an
10980 int, so if the count does not match we move it into a
10981 register. The instructions use a 64-bit count; if op is
10982 only 32-bit, zero-extend it, since negative shift counts
10983 are undefined behavior and zero-extension is more
10984 efficient. */
10985 if (!match)
10986 {
10987 if (SCALAR_INT_MODE_P (GET_MODE (op)))
10988 op = convert_modes (mode, GET_MODE (op), op, 1);
10989 else
10990 op = lowpart_subreg (mode, op, GET_MODE (op));
10991 if (!insn_p->operand[i + 1].predicate (op, mode))
10992 op = copy_to_reg (op);
10993 }
10994 }
10995 else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
10996 || (!mask_pos && (nargs - i) <= nargs_constant))
10997 {
10998 if (!match)
10999 switch (icode)
11000 {
11001 case CODE_FOR_avx_vinsertf128v4di:
11002 case CODE_FOR_avx_vextractf128v4di:
11003 error ("the last argument must be an 1-bit immediate");
11004 return const0_rtx;
11005
11006 case CODE_FOR_avx512f_cmpv8di3_mask:
11007 case CODE_FOR_avx512f_cmpv16si3_mask:
11008 case CODE_FOR_avx512f_ucmpv8di3_mask:
11009 case CODE_FOR_avx512f_ucmpv16si3_mask:
11010 case CODE_FOR_avx512vl_cmpv4di3_mask:
11011 case CODE_FOR_avx512vl_cmpv8si3_mask:
11012 case CODE_FOR_avx512vl_ucmpv4di3_mask:
11013 case CODE_FOR_avx512vl_ucmpv8si3_mask:
11014 case CODE_FOR_avx512vl_cmpv2di3_mask:
11015 case CODE_FOR_avx512vl_cmpv4si3_mask:
11016 case CODE_FOR_avx512vl_ucmpv2di3_mask:
11017 case CODE_FOR_avx512vl_ucmpv4si3_mask:
11018 error ("the last argument must be a 3-bit immediate");
11019 return const0_rtx;
11020
11021 case CODE_FOR_sse4_1_roundsd:
11022 case CODE_FOR_sse4_1_roundss:
11023
11024 case CODE_FOR_sse4_1_roundpd:
11025 case CODE_FOR_sse4_1_roundps:
11026 case CODE_FOR_avx_roundpd256:
11027 case CODE_FOR_avx_roundps256:
11028
11029 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
11030 case CODE_FOR_sse4_1_roundps_sfix:
11031 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
11032 case CODE_FOR_avx_roundps_sfix256:
11033
11034 case CODE_FOR_sse4_1_blendps:
11035 case CODE_FOR_avx_blendpd256:
11036 case CODE_FOR_avx_vpermilv4df:
11037 case CODE_FOR_avx_vpermilv4df_mask:
11038 case CODE_FOR_avx512f_getmantv8df_mask:
11039 case CODE_FOR_avx512f_getmantv16sf_mask:
11040 case CODE_FOR_avx512vl_getmantv16hf_mask:
11041 case CODE_FOR_avx512vl_getmantv8sf_mask:
11042 case CODE_FOR_avx512vl_getmantv4df_mask:
11043 case CODE_FOR_avx512fp16_getmantv8hf_mask:
11044 case CODE_FOR_avx512vl_getmantv4sf_mask:
11045 case CODE_FOR_avx512vl_getmantv2df_mask:
11046 case CODE_FOR_avx512dq_rangepv8df_mask_round:
11047 case CODE_FOR_avx512dq_rangepv16sf_mask_round:
11048 case CODE_FOR_avx512dq_rangepv4df_mask:
11049 case CODE_FOR_avx512dq_rangepv8sf_mask:
11050 case CODE_FOR_avx512dq_rangepv2df_mask:
11051 case CODE_FOR_avx512dq_rangepv4sf_mask:
11052 case CODE_FOR_avx_shufpd256_mask:
11053 error ("the last argument must be a 4-bit immediate");
11054 return const0_rtx;
11055
11056 case CODE_FOR_sha1rnds4:
11057 case CODE_FOR_sse4_1_blendpd:
11058 case CODE_FOR_avx_vpermilv2df:
11059 case CODE_FOR_avx_vpermilv2df_mask:
11060 case CODE_FOR_xop_vpermil2v2df3:
11061 case CODE_FOR_xop_vpermil2v4sf3:
11062 case CODE_FOR_xop_vpermil2v4df3:
11063 case CODE_FOR_xop_vpermil2v8sf3:
11064 case CODE_FOR_avx512f_vinsertf32x4_mask:
11065 case CODE_FOR_avx512f_vinserti32x4_mask:
11066 case CODE_FOR_avx512f_vextractf32x4_mask:
11067 case CODE_FOR_avx512f_vextracti32x4_mask:
11068 case CODE_FOR_sse2_shufpd:
11069 case CODE_FOR_sse2_shufpd_mask:
11070 case CODE_FOR_avx512dq_shuf_f64x2_mask:
11071 case CODE_FOR_avx512dq_shuf_i64x2_mask:
11072 case CODE_FOR_avx512vl_shuf_i32x4_mask:
11073 case CODE_FOR_avx512vl_shuf_f32x4_mask:
11074 error ("the last argument must be a 2-bit immediate");
11075 return const0_rtx;
11076
11077 case CODE_FOR_avx_vextractf128v4df:
11078 case CODE_FOR_avx_vextractf128v8sf:
11079 case CODE_FOR_avx_vextractf128v8si:
11080 case CODE_FOR_avx_vinsertf128v4df:
11081 case CODE_FOR_avx_vinsertf128v8sf:
11082 case CODE_FOR_avx_vinsertf128v8si:
11083 case CODE_FOR_avx512f_vinsertf64x4_mask:
11084 case CODE_FOR_avx512f_vinserti64x4_mask:
11085 case CODE_FOR_avx512f_vextractf64x4_mask:
11086 case CODE_FOR_avx512f_vextracti64x4_mask:
11087 case CODE_FOR_avx512dq_vinsertf32x8_mask:
11088 case CODE_FOR_avx512dq_vinserti32x8_mask:
11089 case CODE_FOR_avx512vl_vinsertv4df:
11090 case CODE_FOR_avx512vl_vinsertv4di:
11091 case CODE_FOR_avx512vl_vinsertv8sf:
11092 case CODE_FOR_avx512vl_vinsertv8si:
11093 error ("the last argument must be a 1-bit immediate");
11094 return const0_rtx;
11095
11096 case CODE_FOR_avx_vmcmpv2df3:
11097 case CODE_FOR_avx_vmcmpv4sf3:
11098 case CODE_FOR_avx_cmpv2df3:
11099 case CODE_FOR_avx_cmpv4sf3:
11100 case CODE_FOR_avx_cmpv4df3:
11101 case CODE_FOR_avx_cmpv8sf3:
11102 case CODE_FOR_avx512f_cmpv8df3_mask:
11103 case CODE_FOR_avx512f_cmpv16sf3_mask:
11104 case CODE_FOR_avx512f_vmcmpv2df3_mask:
11105 case CODE_FOR_avx512f_vmcmpv4sf3_mask:
11106 case CODE_FOR_avx512bw_cmpv32hf3_mask:
11107 case CODE_FOR_avx512vl_cmpv16hf3_mask:
11108 case CODE_FOR_avx512fp16_cmpv8hf3_mask:
11109 error ("the last argument must be a 5-bit immediate");
11110 return const0_rtx;
11111
11112 default:
11113 switch (nargs_constant)
11114 {
11115 case 2:
11116 if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
11117 || (!mask_pos && (nargs - i) == nargs_constant))
11118 {
11119 error ("the next to last argument must be an 8-bit immediate");
11120 break;
11121 }
11122 /* FALLTHRU */
11123 case 1:
11124 error ("the last argument must be an 8-bit immediate");
11125 break;
11126 default:
11127 gcc_unreachable ();
11128 }
11129 return const0_rtx;
11130 }
11131 }
11132 else
11133 {
11134 if (VECTOR_MODE_P (mode))
11135 op = safe_vector_operand (op, mode);
11136
11137 /* If we aren't optimizing, only allow one memory operand to
11138 be generated. */
11139 if (memory_operand (op, mode))
11140 num_memory++;
11141
11142 op = fixup_modeless_constant (op, mode);
11143
11144 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
11145 {
11146 if (optimize || !match || num_memory > 1)
11147 op = copy_to_mode_reg (mode, op);
11148 }
11149 else
11150 {
11151 op = copy_to_reg (op);
11152 op = lowpart_subreg (mode, op, GET_MODE (op));
11153 }
11154 }
11155
11156 xops[i] = op;
11157 }
11158
11159 switch (nargs)
11160 {
11161 case 1:
11162 pat = GEN_FCN (icode) (real_target, xops[0]);
11163 break;
11164 case 2:
11165 pat = GEN_FCN (icode) (real_target, xops[0], xops[1]);
11166 break;
11167 case 3:
11168 pat = GEN_FCN (icode) (real_target, xops[0], xops[1], xops[2]);
11169 break;
11170 case 4:
11171 pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
11172 xops[2], xops[3]);
11173 break;
11174 case 5:
11175 pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
11176 xops[2], xops[3], xops[4]);
11177 break;
11178 case 6:
11179 pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
11180 xops[2], xops[3], xops[4], xops[5]);
11181 break;
11182 default:
11183 gcc_unreachable ();
11184 }
11185
11186 if (! pat)
11187 return 0;
11188
11189 emit_insn (pat);
11190 return target;
11191 }
11192
11193 /* Transform a pattern of the following layout:
11194      (set A
11195        (unspec [B C]
11196                UNSPEC_EMBEDDED_ROUNDING))
11197    into:
11198      (set A B)  */
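/* For instance (an illustrative sketch only, not RTL from a real dump):
     (set (reg:V2DF 100)
          (unspec [(plus:V2DF (reg:V2DF 101) (reg:V2DF 102))
                   (const_int 8)]
                  UNSPEC_EMBEDDED_ROUNDING))
   becomes
     (set (reg:V2DF 100)
          (plus:V2DF (reg:V2DF 101) (reg:V2DF 102)))  */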
11199
11200 static rtx
11201 ix86_erase_embedded_rounding (rtx pat)
11202 {
11203 if (GET_CODE (pat) == INSN)
11204 pat = PATTERN (pat);
11205
11206 gcc_assert (GET_CODE (pat) == SET);
11207 rtx src = SET_SRC (pat);
11208 gcc_assert (XVECLEN (src, 0) == 2);
11209 rtx p0 = XVECEXP (src, 0, 0);
11210 gcc_assert (GET_CODE (src) == UNSPEC
11211 && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
11212 rtx res = gen_rtx_SET (SET_DEST (pat), p0);
11213 return res;
11214 }
11215
11216 /* Subroutine of ix86_expand_round_builtin to take care of comi insns
11217 with rounding. */
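/* Reached for the INT_FTYPE_V4SF_V4SF_INT_INT and INT_FTYPE_V2DF_V2DF_INT_INT
   builtins (dispatched from ix86_expand_round_builtin below), which back
   intrinsics such as _mm_comi_round_ss/_mm_comi_round_sd in avx512fintrin.h:
   the third argument selects one of the 32 _CMP_* predicates decoded by the
   tables below, the fourth the rounding/SAE control.  */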
11218 static rtx
11219 ix86_expand_sse_comi_round (const struct builtin_description *d,
11220 tree exp, rtx target)
11221 {
11222 rtx pat, set_dst;
11223 tree arg0 = CALL_EXPR_ARG (exp, 0);
11224 tree arg1 = CALL_EXPR_ARG (exp, 1);
11225 tree arg2 = CALL_EXPR_ARG (exp, 2);
11226 tree arg3 = CALL_EXPR_ARG (exp, 3);
11227 rtx op0 = expand_normal (arg0);
11228 rtx op1 = expand_normal (arg1);
11229 rtx op2 = expand_normal (arg2);
11230 rtx op3 = expand_normal (arg3);
11231 enum insn_code icode = d->icode;
11232 const struct insn_data_d *insn_p = &insn_data[icode];
11233 machine_mode mode0 = insn_p->operand[0].mode;
11234 machine_mode mode1 = insn_p->operand[1].mode;
11235
11236 /* See avxintrin.h for values. */
11237 static const enum rtx_code comparisons[32] =
11238 {
11239 EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
11240 UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED,
11241 EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
11242 UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED
11243 };
11244 static const bool ordereds[32] =
11245 {
11246 true, true, true, false, false, false, false, true,
11247 false, false, false, true, true, true, true, false,
11248 true, true, true, false, false, false, false, true,
11249 false, false, false, true, true, true, true, false
11250 };
11251 static const bool non_signalings[32] =
11252 {
11253 true, false, false, true, true, false, false, true,
11254 true, false, false, true, true, false, false, true,
11255 false, true, true, false, false, true, true, false,
11256 false, true, true, false, false, true, true, false
11257 };
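/* As a worked example (assuming the avxintrin.h encoding where _CMP_LT_OS
   is 1): index 1 decodes to comparison LT with ordered == true and
   non_signaling == false, i.e. an ordered, signaling compare handled by the
   plain COMI path below.  */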
11258
11259 if (!CONST_INT_P (op2))
11260 {
11261 error ("the third argument must be comparison constant");
11262 return const0_rtx;
11263 }
11264 if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
11265 {
11266 error ("incorrect comparison mode");
11267 return const0_rtx;
11268 }
11269
11270 if (!insn_p->operand[2].predicate (op3, SImode))
11271 {
11272 error ("incorrect rounding operand");
11273 return const0_rtx;
11274 }
11275
11276 if (VECTOR_MODE_P (mode0))
11277 op0 = safe_vector_operand (op0, mode0);
11278 if (VECTOR_MODE_P (mode1))
11279 op1 = safe_vector_operand (op1, mode1);
11280
11281 enum rtx_code comparison = comparisons[INTVAL (op2)];
11282 bool ordered = ordereds[INTVAL (op2)];
11283 bool non_signaling = non_signalings[INTVAL (op2)];
11284 rtx const_val = const0_rtx;
11285
11286 bool check_unordered = false;
11287 machine_mode mode = CCFPmode;
11288 switch (comparison)
11289 {
11290 case ORDERED:
11291 if (!ordered)
11292 {
11293 /* NB: Use CCSmode/NE for _CMP_TRUE_UQ/_CMP_TRUE_US. */
11294 if (!non_signaling)
11295 ordered = true;
11296 mode = CCSmode;
11297 }
11298 else
11299 {
11300 /* NB: Use CCPmode/NE for _CMP_ORD_Q/_CMP_ORD_S. */
11301 if (non_signaling)
11302 ordered = false;
11303 mode = CCPmode;
11304 }
11305 comparison = NE;
11306 break;
11307 case UNORDERED:
11308 if (ordered)
11309 {
11310 /* NB: Use CCSmode/EQ for _CMP_FALSE_OQ/_CMP_FALSE_OS. */
11311 if (non_signaling)
11312 ordered = false;
11313 mode = CCSmode;
11314 }
11315 else
11316 {
11317 /* NB: Use CCPmode/NE for _CMP_UNORD_Q/_CMP_UNORD_S. */
11318 if (!non_signaling)
11319 ordered = true;
11320 mode = CCPmode;
11321 }
11322 comparison = EQ;
11323 break;
11324
11325 case LE: /* -> GE */
11326 case LT: /* -> GT */
11327 case UNGE: /* -> UNLE */
11328 case UNGT: /* -> UNLT */
11329 std::swap (op0, op1);
11330 comparison = swap_condition (comparison);
11331 /* FALLTHRU */
11332 case GT:
11333 case GE:
11334 case UNEQ:
11335 case UNLT:
11336 case UNLE:
11337 case LTGT:
11338 /* These are supported by CCFPmode. NB: Use ordered/signaling
11339 COMI or unordered/non-signaling UCOMI. Both set ZF, PF, CF
11340 with NAN operands. */
11341 if (ordered == non_signaling)
11342 ordered = !ordered;
11343 break;
11344 case EQ:
11345 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
11346 _CMP_EQ_OQ/_CMP_EQ_OS. */
11347 check_unordered = true;
11348 mode = CCZmode;
11349 break;
11350 case NE:
11351 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
11352 _CMP_NEQ_UQ/_CMP_NEQ_US. */
11353 gcc_assert (!ordered);
11354 check_unordered = true;
11355 mode = CCZmode;
11356 const_val = const1_rtx;
11357 break;
11358 default:
11359 gcc_unreachable ();
11360 }
11361
11362 target = gen_reg_rtx (SImode);
11363 emit_move_insn (target, const_val);
11364 target = gen_rtx_SUBREG (QImode, target, 0);
11365
11366 if ((optimize && !register_operand (op0, mode0))
11367 || !insn_p->operand[0].predicate (op0, mode0))
11368 op0 = copy_to_mode_reg (mode0, op0);
11369 if ((optimize && !register_operand (op1, mode1))
11370 || !insn_p->operand[1].predicate (op1, mode1))
11371 op1 = copy_to_mode_reg (mode1, op1);
11372
11373 /*
11374 1. COMI: ordered and signaling.
11375 2. UCOMI: unordered and non-signaling.
11376 */
11377 if (non_signaling)
11378 icode = (icode == CODE_FOR_sse_comi_round
11379 ? CODE_FOR_sse_ucomi_round
11380 : CODE_FOR_sse2_ucomi_round);
11381
11382 pat = GEN_FCN (icode) (op0, op1, op3);
11383 if (! pat)
11384 return 0;
11385
11386 /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
11387 if (INTVAL (op3) == NO_ROUND)
11388 {
11389 pat = ix86_erase_embedded_rounding (pat);
11390 if (! pat)
11391 return 0;
11392
11393 set_dst = SET_DEST (pat);
11394 }
11395 else
11396 {
11397 gcc_assert (GET_CODE (pat) == SET);
11398 set_dst = SET_DEST (pat);
11399 }
11400
11401 emit_insn (pat);
11402
11403 rtx_code_label *label = NULL;
11404
11405   /* NB: For ordered EQ or unordered NE, checking ZF alone isn't
11406      sufficient with NAN operands. */
11407 if (check_unordered)
11408 {
11409 gcc_assert (comparison == EQ || comparison == NE);
11410
11411 rtx flag = gen_rtx_REG (CCFPmode, FLAGS_REG);
11412 label = gen_label_rtx ();
11413 rtx tmp = gen_rtx_fmt_ee (UNORDERED, VOIDmode, flag, const0_rtx);
11414 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
11415 gen_rtx_LABEL_REF (VOIDmode, label),
11416 pc_rtx);
11417 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
11418 }
11419
11420   /* NB: Set CCFPmode and check a different CCmode which is a subset
11421      of CCFPmode. */
11422 if (GET_MODE (set_dst) != mode)
11423 {
11424 gcc_assert (mode == CCAmode || mode == CCCmode
11425 || mode == CCOmode || mode == CCPmode
11426 || mode == CCSmode || mode == CCZmode);
11427 set_dst = gen_rtx_REG (mode, FLAGS_REG);
11428 }
11429
11430 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
11431 gen_rtx_fmt_ee (comparison, QImode,
11432 set_dst,
11433 const0_rtx)));
11434
11435 if (label)
11436 emit_label (label);
11437
11438 return SUBREG_REG (target);
11439 }
11440
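/* Subroutine of ix86_expand_builtin to take care of insns whose last
   argument is an embedded rounding / SAE immediate.  */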
11441 static rtx
11442 ix86_expand_round_builtin (const struct builtin_description *d,
11443 tree exp, rtx target)
11444 {
11445 rtx pat;
11446 unsigned int i, nargs;
11447 rtx xops[6];
11448 enum insn_code icode = d->icode;
11449 const struct insn_data_d *insn_p = &insn_data[icode];
11450 machine_mode tmode = insn_p->operand[0].mode;
11451 unsigned int nargs_constant = 0;
11452 unsigned int redundant_embed_rnd = 0;
11453
11454 switch ((enum ix86_builtin_func_type) d->flag)
11455 {
11456 case UINT64_FTYPE_V2DF_INT:
11457 case UINT64_FTYPE_V4SF_INT:
11458 case UINT64_FTYPE_V8HF_INT:
11459 case UINT_FTYPE_V2DF_INT:
11460 case UINT_FTYPE_V4SF_INT:
11461 case UINT_FTYPE_V8HF_INT:
11462 case INT64_FTYPE_V2DF_INT:
11463 case INT64_FTYPE_V4SF_INT:
11464 case INT64_FTYPE_V8HF_INT:
11465 case INT_FTYPE_V2DF_INT:
11466 case INT_FTYPE_V4SF_INT:
11467 case INT_FTYPE_V8HF_INT:
11468 nargs = 2;
11469 break;
11470 case V32HF_FTYPE_V32HF_V32HF_INT:
11471 case V8HF_FTYPE_V8HF_V8HF_INT:
11472 case V8HF_FTYPE_V8HF_INT_INT:
11473 case V8HF_FTYPE_V8HF_UINT_INT:
11474 case V8HF_FTYPE_V8HF_INT64_INT:
11475 case V8HF_FTYPE_V8HF_UINT64_INT:
11476 case V4SF_FTYPE_V4SF_UINT_INT:
11477 case V4SF_FTYPE_V4SF_UINT64_INT:
11478 case V2DF_FTYPE_V2DF_UINT64_INT:
11479 case V4SF_FTYPE_V4SF_INT_INT:
11480 case V4SF_FTYPE_V4SF_INT64_INT:
11481 case V2DF_FTYPE_V2DF_INT64_INT:
11482 case V4SF_FTYPE_V4SF_V4SF_INT:
11483 case V2DF_FTYPE_V2DF_V2DF_INT:
11484 case V4SF_FTYPE_V4SF_V2DF_INT:
11485 case V2DF_FTYPE_V2DF_V4SF_INT:
11486 nargs = 3;
11487 break;
11488 case V8SF_FTYPE_V8DF_V8SF_QI_INT:
11489 case V8DF_FTYPE_V8DF_V8DF_QI_INT:
11490 case V32HI_FTYPE_V32HF_V32HI_USI_INT:
11491 case V8SI_FTYPE_V8DF_V8SI_QI_INT:
11492 case V8DI_FTYPE_V8HF_V8DI_UQI_INT:
11493 case V8DI_FTYPE_V8DF_V8DI_QI_INT:
11494 case V8SF_FTYPE_V8DI_V8SF_QI_INT:
11495 case V8DF_FTYPE_V8DI_V8DF_QI_INT:
11496 case V8DF_FTYPE_V8HF_V8DF_UQI_INT:
11497 case V16SF_FTYPE_V16HF_V16SF_UHI_INT:
11498 case V32HF_FTYPE_V32HI_V32HF_USI_INT:
11499 case V32HF_FTYPE_V32HF_V32HF_USI_INT:
11500 case V32HF_FTYPE_V32HF_V32HF_V32HF_INT:
11501 case V16SF_FTYPE_V16SF_V16SF_HI_INT:
11502 case V8DI_FTYPE_V8SF_V8DI_QI_INT:
11503 case V16SF_FTYPE_V16SI_V16SF_HI_INT:
11504 case V16SI_FTYPE_V16SF_V16SI_HI_INT:
11505 case V16SI_FTYPE_V16HF_V16SI_UHI_INT:
11506 case V16HF_FTYPE_V16SI_V16HF_UHI_INT:
11507 case V8DF_FTYPE_V8SF_V8DF_QI_INT:
11508 case V16SF_FTYPE_V16HI_V16SF_HI_INT:
11509 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
11510 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
11511 case V8HF_FTYPE_V8DI_V8HF_UQI_INT:
11512 case V8HF_FTYPE_V8DF_V8HF_UQI_INT:
11513 case V16HF_FTYPE_V16SF_V16HF_UHI_INT:
11514 case V8HF_FTYPE_V8HF_V8HF_V8HF_INT:
11515 nargs = 4;
11516 break;
11517 case V4SF_FTYPE_V4SF_V4SF_INT_INT:
11518 case V2DF_FTYPE_V2DF_V2DF_INT_INT:
11519 nargs_constant = 2;
11520 nargs = 4;
11521 break;
11522 case INT_FTYPE_V4SF_V4SF_INT_INT:
11523 case INT_FTYPE_V2DF_V2DF_INT_INT:
11524 return ix86_expand_sse_comi_round (d, exp, target);
11525 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
11526 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
11527 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
11528 case V4SF_FTYPE_V8HF_V4SF_V4SF_UQI_INT:
11529 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
11530 case V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT:
11531 case V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT:
11532 case V2DF_FTYPE_V8HF_V2DF_V2DF_UQI_INT:
11533 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
11534 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
11535 case V2DF_FTYPE_V2DF_V4SF_V2DF_UQI_INT:
11536 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
11537 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
11538 case V4SF_FTYPE_V4SF_V2DF_V4SF_UQI_INT:
11539 case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT:
11540 case V8HF_FTYPE_V2DF_V8HF_V8HF_UQI_INT:
11541 case V8HF_FTYPE_V4SF_V8HF_V8HF_UQI_INT:
11542 nargs = 5;
11543 break;
11544 case V32HF_FTYPE_V32HF_INT_V32HF_USI_INT:
11545 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
11546 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
11547 case V8DF_FTYPE_V8DF_INT_V8DF_UQI_INT:
11548 case V16SF_FTYPE_V16SF_INT_V16SF_UHI_INT:
11549 nargs_constant = 4;
11550 nargs = 5;
11551 break;
11552 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
11553 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
11554 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
11555 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
11556 case USI_FTYPE_V32HF_V32HF_INT_USI_INT:
11557 case UQI_FTYPE_V8HF_V8HF_INT_UQI_INT:
11558 nargs_constant = 3;
11559 nargs = 5;
11560 break;
11561 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
11562 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
11563 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
11564 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
11565 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
11566 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
11567 case V8HF_FTYPE_V8HF_V8HF_INT_V8HF_UQI_INT:
11568 nargs = 6;
11569 nargs_constant = 4;
11570 break;
11571 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
11572 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
11573 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
11574 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
11575 nargs = 6;
11576 nargs_constant = 3;
11577 break;
11578 default:
11579 gcc_unreachable ();
11580 }
11581 gcc_assert (nargs <= ARRAY_SIZE (xops));
11582
11583 if (optimize
11584 || target == 0
11585 || GET_MODE (target) != tmode
11586 || !insn_p->operand[0].predicate (target, tmode))
11587 target = gen_reg_rtx (tmode);
11588
11589 for (i = 0; i < nargs; i++)
11590 {
11591 tree arg = CALL_EXPR_ARG (exp, i);
11592 rtx op = expand_normal (arg);
11593 machine_mode mode = insn_p->operand[i + 1].mode;
11594 bool match = insn_p->operand[i + 1].predicate (op, mode);
11595
11596 if (i == nargs - nargs_constant)
11597 {
11598 if (!match)
11599 {
11600 switch (icode)
11601 {
11602 case CODE_FOR_avx512f_getmantv8df_mask_round:
11603 case CODE_FOR_avx512f_getmantv16sf_mask_round:
11604 case CODE_FOR_avx512bw_getmantv32hf_mask_round:
11605 case CODE_FOR_avx512f_vgetmantv2df_round:
11606 case CODE_FOR_avx512f_vgetmantv2df_mask_round:
11607 case CODE_FOR_avx512f_vgetmantv4sf_round:
11608 case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
11609 case CODE_FOR_avx512f_vgetmantv8hf_mask_round:
11610 error ("the immediate argument must be a 4-bit immediate");
11611 return const0_rtx;
11612 case CODE_FOR_avx512f_cmpv8df3_mask_round:
11613 case CODE_FOR_avx512f_cmpv16sf3_mask_round:
11614 case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
11615 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
11616 case CODE_FOR_avx512f_vmcmpv8hf3_mask_round:
11617 case CODE_FOR_avx512bw_cmpv32hf3_mask_round:
11618 error ("the immediate argument must be a 5-bit immediate");
11619 return const0_rtx;
11620 default:
11621 error ("the immediate argument must be an 8-bit immediate");
11622 return const0_rtx;
11623 }
11624 }
11625 }
11626       else if (i == nargs - 1)
11627 {
11628 if (!insn_p->operand[nargs].predicate (op, SImode))
11629 {
11630 error ("incorrect rounding operand");
11631 return const0_rtx;
11632 }
11633
11634 	  /* If there is no rounding, use the normal version of the pattern. */
11635 if (INTVAL (op) == NO_ROUND)
11636 {
11637 	      /* Skip erasing the embedded rounding for the expanders below,
11638 		 which generate multiple insns.  In ix86_erase_embedded_rounding
11639 		 the pattern would be transformed into a single set, and emit_insn
11640 		 would append that set instead of inserting it into the chain, so
11641 		 the insns emitted inside the define_expand would be ignored. */
11642 switch (icode)
11643 {
11644 case CODE_FOR_avx512bw_fmaddc_v32hf_mask1_round:
11645 case CODE_FOR_avx512bw_fcmaddc_v32hf_mask1_round:
11646 case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask1_round:
11647 case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask1_round:
11648 case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask3_round:
11649 case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask3_round:
11650 redundant_embed_rnd = 0;
11651 break;
11652 default:
11653 redundant_embed_rnd = 1;
11654 break;
11655 }
11656 }
11657 }
11658 else
11659 {
11660 if (VECTOR_MODE_P (mode))
11661 op = safe_vector_operand (op, mode);
11662
11663 op = fixup_modeless_constant (op, mode);
11664
11665 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
11666 {
11667 if (optimize || !match)
11668 op = copy_to_mode_reg (mode, op);
11669 }
11670 else
11671 {
11672 op = copy_to_reg (op);
11673 op = lowpart_subreg (mode, op, GET_MODE (op));
11674 }
11675 }
11676
11677 xops[i] = op;
11678 }
11679
11680 switch (nargs)
11681 {
11682 case 1:
11683 pat = GEN_FCN (icode) (target, xops[0]);
11684 break;
11685 case 2:
11686 pat = GEN_FCN (icode) (target, xops[0], xops[1]);
11687 break;
11688 case 3:
11689 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
11690 break;
11691 case 4:
11692 pat = GEN_FCN (icode) (target, xops[0], xops[1],
11693 xops[2], xops[3]);
11694 break;
11695 case 5:
11696 pat = GEN_FCN (icode) (target, xops[0], xops[1],
11697 xops[2], xops[3], xops[4]);
11698 break;
11699 case 6:
11700 pat = GEN_FCN (icode) (target, xops[0], xops[1],
11701 xops[2], xops[3], xops[4], xops[5]);
11702 break;
11703 default:
11704 gcc_unreachable ();
11705 }
11706
11707 if (!pat)
11708 return 0;
11709
11710 if (redundant_embed_rnd)
11711 pat = ix86_erase_embedded_rounding (pat);
11712
11713 emit_insn (pat);
11714 return target;
11715 }
11716
11717 /* Subroutine of ix86_expand_builtin to take care of special insns
11718 with variable number of operands. */
11719
11720 static rtx
11721 ix86_expand_special_args_builtin (const struct builtin_description *d,
11722 tree exp, rtx target)
11723 {
11724 tree arg;
11725 rtx pat, op;
11726 unsigned int i, nargs, arg_adjust, memory;
11727 bool aligned_mem = false;
11728 rtx xops[3];
11729 enum insn_code icode = d->icode;
11730 const struct insn_data_d *insn_p = &insn_data[icode];
11731 machine_mode tmode = insn_p->operand[0].mode;
11732 enum { load, store } klass;
11733
11734 switch ((enum ix86_builtin_func_type) d->flag)
11735 {
11736 case VOID_FTYPE_VOID:
11737 emit_insn (GEN_FCN (icode) (target));
11738 return 0;
11739 case VOID_FTYPE_UINT64:
11740 case VOID_FTYPE_UNSIGNED:
11741 nargs = 0;
11742 klass = store;
11743 memory = 0;
11744 break;
11745
11746 case INT_FTYPE_VOID:
11747 case USHORT_FTYPE_VOID:
11748 case UINT64_FTYPE_VOID:
11749 case UINT_FTYPE_VOID:
11750 case UINT8_FTYPE_VOID:
11751 case UNSIGNED_FTYPE_VOID:
11752 nargs = 0;
11753 klass = load;
11754 memory = 0;
11755 break;
11756 case UINT64_FTYPE_PUNSIGNED:
11757 case V2DI_FTYPE_PV2DI:
11758 case V4DI_FTYPE_PV4DI:
11759 case V32QI_FTYPE_PCCHAR:
11760 case V16QI_FTYPE_PCCHAR:
11761 case V8SF_FTYPE_PCV4SF:
11762 case V8SF_FTYPE_PCFLOAT:
11763 case V4SF_FTYPE_PCFLOAT:
11764 case V4DF_FTYPE_PCV2DF:
11765 case V4DF_FTYPE_PCDOUBLE:
11766 case V2DF_FTYPE_PCDOUBLE:
11767 case VOID_FTYPE_PVOID:
11768 case V8DI_FTYPE_PV8DI:
11769 nargs = 1;
11770 klass = load;
11771 memory = 0;
11772 switch (icode)
11773 {
11774 case CODE_FOR_sse4_1_movntdqa:
11775 case CODE_FOR_avx2_movntdqa:
11776 case CODE_FOR_avx512f_movntdqa:
11777 aligned_mem = true;
11778 break;
11779 default:
11780 break;
11781 }
11782 break;
11783 case VOID_FTYPE_PV2SF_V4SF:
11784 case VOID_FTYPE_PV8DI_V8DI:
11785 case VOID_FTYPE_PV4DI_V4DI:
11786 case VOID_FTYPE_PV2DI_V2DI:
11787 case VOID_FTYPE_PCHAR_V32QI:
11788 case VOID_FTYPE_PCHAR_V16QI:
11789 case VOID_FTYPE_PFLOAT_V16SF:
11790 case VOID_FTYPE_PFLOAT_V8SF:
11791 case VOID_FTYPE_PFLOAT_V4SF:
11792 case VOID_FTYPE_PDOUBLE_V8DF:
11793 case VOID_FTYPE_PDOUBLE_V4DF:
11794 case VOID_FTYPE_PDOUBLE_V2DF:
11795 case VOID_FTYPE_PLONGLONG_LONGLONG:
11796 case VOID_FTYPE_PULONGLONG_ULONGLONG:
11797 case VOID_FTYPE_PUNSIGNED_UNSIGNED:
11798 case VOID_FTYPE_PINT_INT:
11799 nargs = 1;
11800 klass = store;
11801 /* Reserve memory operand for target. */
11802 memory = ARRAY_SIZE (xops);
11803 switch (icode)
11804 {
11805 /* These builtins and instructions require the memory
11806 to be properly aligned. */
11807 case CODE_FOR_avx_movntv4di:
11808 case CODE_FOR_sse2_movntv2di:
11809 case CODE_FOR_avx_movntv8sf:
11810 case CODE_FOR_sse_movntv4sf:
11811 case CODE_FOR_sse4a_vmmovntv4sf:
11812 case CODE_FOR_avx_movntv4df:
11813 case CODE_FOR_sse2_movntv2df:
11814 case CODE_FOR_sse4a_vmmovntv2df:
11815 case CODE_FOR_sse2_movntidi:
11816 case CODE_FOR_sse_movntq:
11817 case CODE_FOR_sse2_movntisi:
11818 case CODE_FOR_avx512f_movntv16sf:
11819 case CODE_FOR_avx512f_movntv8df:
11820 case CODE_FOR_avx512f_movntv8di:
11821 aligned_mem = true;
11822 break;
11823 default:
11824 break;
11825 }
11826 break;
11827 case VOID_FTYPE_PVOID_PCVOID:
11828 nargs = 1;
11829 klass = store;
11830 memory = 0;
11831
11832 break;
11833 case V4SF_FTYPE_V4SF_PCV2SF:
11834 case V2DF_FTYPE_V2DF_PCDOUBLE:
11835 nargs = 2;
11836 klass = load;
11837 memory = 1;
11838 break;
11839 case V8SF_FTYPE_PCV8SF_V8SI:
11840 case V4DF_FTYPE_PCV4DF_V4DI:
11841 case V4SF_FTYPE_PCV4SF_V4SI:
11842 case V2DF_FTYPE_PCV2DF_V2DI:
11843 case V8SI_FTYPE_PCV8SI_V8SI:
11844 case V4DI_FTYPE_PCV4DI_V4DI:
11845 case V4SI_FTYPE_PCV4SI_V4SI:
11846 case V2DI_FTYPE_PCV2DI_V2DI:
11847 case VOID_FTYPE_INT_INT64:
11848 nargs = 2;
11849 klass = load;
11850 memory = 0;
11851 break;
11852 case VOID_FTYPE_PV8DF_V8DF_UQI:
11853 case VOID_FTYPE_PV4DF_V4DF_UQI:
11854 case VOID_FTYPE_PV2DF_V2DF_UQI:
11855 case VOID_FTYPE_PV16SF_V16SF_UHI:
11856 case VOID_FTYPE_PV8SF_V8SF_UQI:
11857 case VOID_FTYPE_PV4SF_V4SF_UQI:
11858 case VOID_FTYPE_PV8DI_V8DI_UQI:
11859 case VOID_FTYPE_PV4DI_V4DI_UQI:
11860 case VOID_FTYPE_PV2DI_V2DI_UQI:
11861 case VOID_FTYPE_PV16SI_V16SI_UHI:
11862 case VOID_FTYPE_PV8SI_V8SI_UQI:
11863 case VOID_FTYPE_PV4SI_V4SI_UQI:
11864 case VOID_FTYPE_PV64QI_V64QI_UDI:
11865 case VOID_FTYPE_PV32HI_V32HI_USI:
11866 case VOID_FTYPE_PV32QI_V32QI_USI:
11867 case VOID_FTYPE_PV16QI_V16QI_UHI:
11868 case VOID_FTYPE_PV16HI_V16HI_UHI:
11869 case VOID_FTYPE_PV8HI_V8HI_UQI:
11870 switch (icode)
11871 {
11872 /* These builtins and instructions require the memory
11873 to be properly aligned. */
11874 case CODE_FOR_avx512f_storev16sf_mask:
11875 case CODE_FOR_avx512f_storev16si_mask:
11876 case CODE_FOR_avx512f_storev8df_mask:
11877 case CODE_FOR_avx512f_storev8di_mask:
11878 case CODE_FOR_avx512vl_storev8sf_mask:
11879 case CODE_FOR_avx512vl_storev8si_mask:
11880 case CODE_FOR_avx512vl_storev4df_mask:
11881 case CODE_FOR_avx512vl_storev4di_mask:
11882 case CODE_FOR_avx512vl_storev4sf_mask:
11883 case CODE_FOR_avx512vl_storev4si_mask:
11884 case CODE_FOR_avx512vl_storev2df_mask:
11885 case CODE_FOR_avx512vl_storev2di_mask:
11886 aligned_mem = true;
11887 break;
11888 default:
11889 break;
11890 }
11891 /* FALLTHRU */
11892 case VOID_FTYPE_PV8SF_V8SI_V8SF:
11893 case VOID_FTYPE_PV4DF_V4DI_V4DF:
11894 case VOID_FTYPE_PV4SF_V4SI_V4SF:
11895 case VOID_FTYPE_PV2DF_V2DI_V2DF:
11896 case VOID_FTYPE_PV8SI_V8SI_V8SI:
11897 case VOID_FTYPE_PV4DI_V4DI_V4DI:
11898 case VOID_FTYPE_PV4SI_V4SI_V4SI:
11899 case VOID_FTYPE_PV2DI_V2DI_V2DI:
11900 case VOID_FTYPE_PV8SI_V8DI_UQI:
11901 case VOID_FTYPE_PV8HI_V8DI_UQI:
11902 case VOID_FTYPE_PV16HI_V16SI_UHI:
11903 case VOID_FTYPE_PUDI_V8DI_UQI:
11904 case VOID_FTYPE_PV16QI_V16SI_UHI:
11905 case VOID_FTYPE_PV4SI_V4DI_UQI:
11906 case VOID_FTYPE_PUDI_V2DI_UQI:
11907 case VOID_FTYPE_PUDI_V4DI_UQI:
11908 case VOID_FTYPE_PUSI_V2DI_UQI:
11909 case VOID_FTYPE_PV8HI_V8SI_UQI:
11910 case VOID_FTYPE_PUDI_V4SI_UQI:
11911 case VOID_FTYPE_PUSI_V4DI_UQI:
11912 case VOID_FTYPE_PUHI_V2DI_UQI:
11913 case VOID_FTYPE_PUDI_V8SI_UQI:
11914 case VOID_FTYPE_PUSI_V4SI_UQI:
11915 case VOID_FTYPE_PCHAR_V64QI_UDI:
11916 case VOID_FTYPE_PCHAR_V32QI_USI:
11917 case VOID_FTYPE_PCHAR_V16QI_UHI:
11918 case VOID_FTYPE_PSHORT_V32HI_USI:
11919 case VOID_FTYPE_PSHORT_V16HI_UHI:
11920 case VOID_FTYPE_PSHORT_V8HI_UQI:
11921 case VOID_FTYPE_PINT_V16SI_UHI:
11922 case VOID_FTYPE_PINT_V8SI_UQI:
11923 case VOID_FTYPE_PINT_V4SI_UQI:
11924 case VOID_FTYPE_PINT64_V8DI_UQI:
11925 case VOID_FTYPE_PINT64_V4DI_UQI:
11926 case VOID_FTYPE_PINT64_V2DI_UQI:
11927 case VOID_FTYPE_PDOUBLE_V8DF_UQI:
11928 case VOID_FTYPE_PDOUBLE_V4DF_UQI:
11929 case VOID_FTYPE_PDOUBLE_V2DF_UQI:
11930 case VOID_FTYPE_PFLOAT_V16SF_UHI:
11931 case VOID_FTYPE_PFLOAT_V8SF_UQI:
11932 case VOID_FTYPE_PFLOAT_V4SF_UQI:
11933 case VOID_FTYPE_PCFLOAT16_V8HF_UQI:
11934 case VOID_FTYPE_PV32QI_V32HI_USI:
11935 case VOID_FTYPE_PV16QI_V16HI_UHI:
11936 case VOID_FTYPE_PUDI_V8HI_UQI:
11937 nargs = 2;
11938 klass = store;
11939 /* Reserve memory operand for target. */
11940 memory = ARRAY_SIZE (xops);
11941 break;
11942 case V4SF_FTYPE_PCV4SF_V4SF_UQI:
11943 case V8SF_FTYPE_PCV8SF_V8SF_UQI:
11944 case V16SF_FTYPE_PCV16SF_V16SF_UHI:
11945 case V4SI_FTYPE_PCV4SI_V4SI_UQI:
11946 case V8SI_FTYPE_PCV8SI_V8SI_UQI:
11947 case V16SI_FTYPE_PCV16SI_V16SI_UHI:
11948 case V2DF_FTYPE_PCV2DF_V2DF_UQI:
11949 case V4DF_FTYPE_PCV4DF_V4DF_UQI:
11950 case V8DF_FTYPE_PCV8DF_V8DF_UQI:
11951 case V2DI_FTYPE_PCV2DI_V2DI_UQI:
11952 case V4DI_FTYPE_PCV4DI_V4DI_UQI:
11953 case V8DI_FTYPE_PCV8DI_V8DI_UQI:
11954 case V64QI_FTYPE_PCV64QI_V64QI_UDI:
11955 case V32HI_FTYPE_PCV32HI_V32HI_USI:
11956 case V32QI_FTYPE_PCV32QI_V32QI_USI:
11957 case V16QI_FTYPE_PCV16QI_V16QI_UHI:
11958 case V16HI_FTYPE_PCV16HI_V16HI_UHI:
11959 case V8HI_FTYPE_PCV8HI_V8HI_UQI:
11960 switch (icode)
11961 {
11962 /* These builtins and instructions require the memory
11963 to be properly aligned. */
11964 case CODE_FOR_avx512f_loadv16sf_mask:
11965 case CODE_FOR_avx512f_loadv16si_mask:
11966 case CODE_FOR_avx512f_loadv8df_mask:
11967 case CODE_FOR_avx512f_loadv8di_mask:
11968 case CODE_FOR_avx512vl_loadv8sf_mask:
11969 case CODE_FOR_avx512vl_loadv8si_mask:
11970 case CODE_FOR_avx512vl_loadv4df_mask:
11971 case CODE_FOR_avx512vl_loadv4di_mask:
11972 case CODE_FOR_avx512vl_loadv4sf_mask:
11973 case CODE_FOR_avx512vl_loadv4si_mask:
11974 case CODE_FOR_avx512vl_loadv2df_mask:
11975 case CODE_FOR_avx512vl_loadv2di_mask:
11976 case CODE_FOR_avx512bw_loadv64qi_mask:
11977 case CODE_FOR_avx512vl_loadv32qi_mask:
11978 case CODE_FOR_avx512vl_loadv16qi_mask:
11979 case CODE_FOR_avx512bw_loadv32hi_mask:
11980 case CODE_FOR_avx512vl_loadv16hi_mask:
11981 case CODE_FOR_avx512vl_loadv8hi_mask:
11982 aligned_mem = true;
11983 break;
11984 default:
11985 break;
11986 }
11987 /* FALLTHRU */
11988 case V64QI_FTYPE_PCCHAR_V64QI_UDI:
11989 case V32QI_FTYPE_PCCHAR_V32QI_USI:
11990 case V16QI_FTYPE_PCCHAR_V16QI_UHI:
11991 case V32HI_FTYPE_PCSHORT_V32HI_USI:
11992 case V16HI_FTYPE_PCSHORT_V16HI_UHI:
11993 case V8HI_FTYPE_PCSHORT_V8HI_UQI:
11994 case V16SI_FTYPE_PCINT_V16SI_UHI:
11995 case V8SI_FTYPE_PCINT_V8SI_UQI:
11996 case V4SI_FTYPE_PCINT_V4SI_UQI:
11997 case V8DI_FTYPE_PCINT64_V8DI_UQI:
11998 case V4DI_FTYPE_PCINT64_V4DI_UQI:
11999 case V2DI_FTYPE_PCINT64_V2DI_UQI:
12000 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
12001 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
12002 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
12003 case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
12004 case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
12005 case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
12006 case V8HF_FTYPE_PCFLOAT16_V8HF_UQI:
12007 nargs = 3;
12008 klass = load;
12009 memory = 0;
12010 break;
12011 default:
12012 gcc_unreachable ();
12013 }
12014
12015 gcc_assert (nargs <= ARRAY_SIZE (xops));
12016
12017 if (klass == store)
12018 {
12019 arg = CALL_EXPR_ARG (exp, 0);
12020 op = expand_normal (arg);
12021 gcc_assert (target == 0);
12022 if (memory)
12023 {
12024 op = ix86_zero_extend_to_Pmode (op);
12025 target = gen_rtx_MEM (tmode, op);
12026 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
12027 on it. Try to improve it using get_pointer_alignment,
12028 and if the special builtin is one that requires strict
12029 	     mode alignment, also from its GET_MODE_ALIGNMENT.
12030 Failure to do so could lead to ix86_legitimate_combined_insn
12031 rejecting all changes to such insns. */
12032 unsigned int align = get_pointer_alignment (arg);
12033 if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
12034 align = GET_MODE_ALIGNMENT (tmode);
12035 if (MEM_ALIGN (target) < align)
12036 set_mem_align (target, align);
12037 }
12038 else
12039 target = force_reg (tmode, op);
12040 arg_adjust = 1;
12041 }
12042 else
12043 {
12044 arg_adjust = 0;
12045 if (optimize
12046 || target == 0
12047 || !register_operand (target, tmode)
12048 || GET_MODE (target) != tmode)
12049 target = gen_reg_rtx (tmode);
12050 }
12051
12052 for (i = 0; i < nargs; i++)
12053 {
12054 machine_mode mode = insn_p->operand[i + 1].mode;
12055
12056 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
12057 op = expand_normal (arg);
12058
12059 if (i == memory)
12060 {
12061 /* This must be the memory operand. */
12062 op = ix86_zero_extend_to_Pmode (op);
12063 op = gen_rtx_MEM (mode, op);
12064 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
12065 on it. Try to improve it using get_pointer_alignment,
12066 and if the special builtin is one that requires strict
12067 	     mode alignment, also from its GET_MODE_ALIGNMENT.
12068 Failure to do so could lead to ix86_legitimate_combined_insn
12069 rejecting all changes to such insns. */
12070 unsigned int align = get_pointer_alignment (arg);
12071 if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
12072 align = GET_MODE_ALIGNMENT (mode);
12073 if (MEM_ALIGN (op) < align)
12074 set_mem_align (op, align);
12075 }
12076 else
12077 {
12078 /* This must be register. */
12079 if (VECTOR_MODE_P (mode))
12080 op = safe_vector_operand (op, mode);
12081
12082 op = fixup_modeless_constant (op, mode);
12083
12084 	  /* NB: A 3-operand load implies a mask load or v{p}expand*,
12085 	     and the mask operand should be at the end.
12086 	     Keep an all-ones mask, which would be simplified by the expander. */
12087 if (nargs == 3 && i == 2 && klass == load
12088 && constm1_operand (op, mode)
12089 && insn_p->operand[i].predicate (op, mode))
12090 ;
12091 else if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
12092 op = copy_to_mode_reg (mode, op);
12093 else
12094 {
12095 op = copy_to_reg (op);
12096 op = lowpart_subreg (mode, op, GET_MODE (op));
12097 }
12098 }
12099
12100       xops[i] = op;
12101 }
12102
12103 switch (nargs)
12104 {
12105 case 0:
12106 pat = GEN_FCN (icode) (target);
12107 break;
12108 case 1:
12109 pat = GEN_FCN (icode) (target, xops[0]);
12110 break;
12111 case 2:
12112 pat = GEN_FCN (icode) (target, xops[0], xops[1]);
12113 break;
12114 case 3:
12115 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
12116 break;
12117 default:
12118 gcc_unreachable ();
12119 }
12120
12121 if (! pat)
12122 return 0;
12123
12124 emit_insn (pat);
12125 return klass == store ? 0 : target;
12126 }
12127
12128 /* Return the integer constant in ARG. Constrain it to be in the range
12129 of the subparts of VEC_TYPE; issue an error if not. */
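/* For example, with a V4SF vector argument TYPE_VECTOR_SUBPARTS is 4, so the
   selector must be a constant in the range [0, 3].  */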
12130
12131 static int
12132 get_element_number (tree vec_type, tree arg)
12133 {
12134 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
12135
12136 if (!tree_fits_uhwi_p (arg)
12137 || (elt = tree_to_uhwi (arg), elt > max))
12138 {
12139 error ("selector must be an integer constant in the range "
12140 "[0, %wi]", max);
12141 return 0;
12142 }
12143
12144 return elt;
12145 }
12146
12147 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
12148 ix86_expand_vector_init. We DO have language-level syntax for this, in
12149 the form of (type){ init-list }. Except that since we can't place emms
12150 instructions from inside the compiler, we can't allow the use of MMX
12151 registers unless the user explicitly asks for it. So we do *not* define
12152 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
12153    we have builtins invoked by mmintrin.h that give us license to emit
12154 these sorts of instructions. */
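/* An illustrative path into these expanders (assuming the current mmintrin.h
   definitions): _mm_set_pi32 is written roughly as
     (__m64) __builtin_ia32_vec_init_v2si (__i0, __i1);
   which arrives here as IX86_BUILTIN_VEC_INIT_V2SI.  */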
12155
12156 static rtx
12157 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
12158 {
12159 machine_mode tmode = TYPE_MODE (type);
12160 machine_mode inner_mode = GET_MODE_INNER (tmode);
12161 int i, n_elt = GET_MODE_NUNITS (tmode);
12162 rtvec v = rtvec_alloc (n_elt);
12163
12164 gcc_assert (VECTOR_MODE_P (tmode));
12165 gcc_assert (call_expr_nargs (exp) == n_elt);
12166
12167 for (i = 0; i < n_elt; ++i)
12168 {
12169 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
12170 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
12171 }
12172
12173 if (!target || !register_operand (target, tmode))
12174 target = gen_reg_rtx (tmode);
12175
12176 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
12177 return target;
12178 }
12179
12180 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
12181 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
12182 had a language-level syntax for referencing vector elements. */
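/* An illustrative caller (assuming the current xmmintrin.h definitions):
   _mm_extract_pi16 expands through __builtin_ia32_vec_ext_v4hi, i.e.
   IX86_BUILTIN_VEC_EXT_V4HI below.  */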
12183
12184 static rtx
12185 ix86_expand_vec_ext_builtin (tree exp, rtx target)
12186 {
12187 machine_mode tmode, mode0;
12188 tree arg0, arg1;
12189 int elt;
12190 rtx op0;
12191
12192 arg0 = CALL_EXPR_ARG (exp, 0);
12193 arg1 = CALL_EXPR_ARG (exp, 1);
12194
12195 op0 = expand_normal (arg0);
12196 elt = get_element_number (TREE_TYPE (arg0), arg1);
12197
12198 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
12199 mode0 = TYPE_MODE (TREE_TYPE (arg0));
12200 gcc_assert (VECTOR_MODE_P (mode0));
12201
12202 op0 = force_reg (mode0, op0);
12203
12204 if (optimize || !target || !register_operand (target, tmode))
12205 target = gen_reg_rtx (tmode);
12206
12207 ix86_expand_vector_extract (true, target, op0, elt);
12208
12209 return target;
12210 }
12211
12212 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
12213 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
12214 a language-level syntax for referencing vector elements. */
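/* An illustrative caller (assuming the current xmmintrin.h definitions):
   _mm_insert_pi16 expands through __builtin_ia32_vec_set_v4hi, i.e.
   IX86_BUILTIN_VEC_SET_V4HI below.  */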
12215
12216 static rtx
12217 ix86_expand_vec_set_builtin (tree exp)
12218 {
12219 machine_mode tmode, mode1;
12220 tree arg0, arg1, arg2;
12221 int elt;
12222 rtx op0, op1, target;
12223
12224 arg0 = CALL_EXPR_ARG (exp, 0);
12225 arg1 = CALL_EXPR_ARG (exp, 1);
12226 arg2 = CALL_EXPR_ARG (exp, 2);
12227
12228 tmode = TYPE_MODE (TREE_TYPE (arg0));
12229 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
12230 gcc_assert (VECTOR_MODE_P (tmode));
12231
12232 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
12233 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
12234 elt = get_element_number (TREE_TYPE (arg0), arg2);
12235
12236 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
12237 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
12238
12239 op0 = force_reg (tmode, op0);
12240 op1 = force_reg (mode1, op1);
12241
12242 /* OP0 is the source of these builtin functions and shouldn't be
12243 modified. Create a copy, use it and return it as target. */
12244 target = gen_reg_rtx (tmode);
12245 emit_move_insn (target, op0);
12246 ix86_expand_vector_set (true, target, op1, elt);
12247
12248 return target;
12249 }
12250
12251 /* Return true if the necessary isa options for this builtin exist,
12252 else false.
12253 fcode = DECL_MD_FUNCTION_CODE (fndecl); */
12254 bool
12255 ix86_check_builtin_isa_match (unsigned int fcode,
12256 HOST_WIDE_INT* pbisa,
12257 HOST_WIDE_INT* pbisa2)
12258 {
12259 HOST_WIDE_INT isa = ix86_isa_flags;
12260 HOST_WIDE_INT isa2 = ix86_isa_flags2;
12261 HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
12262 HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
12263 /* The general case is we require all the ISAs specified in bisa{,2}
12264 to be enabled.
12265 The exceptions are:
12266 OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
12267 OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
12268 OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
12269 (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL) or
12270 OPTION_MASK_ISA2_AVXVNNI
12271 where for each such pair it is sufficient if either of the ISAs is
12272      enabled; any other options OR'ed in with the pair are still required.
12273      OPTION_MASK_ISA_MMX in bisa is satisfied also if TARGET_MMX_WITH_SSE. */
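/* For example, a builtin whose bisa requires both OPTION_MASK_ISA_FMA and
   OPTION_MASK_ISA_FMA4 is accepted when only one of -mfma or -mfma4 is
   enabled, and an AVX512VNNI | AVX512VL builtin is likewise satisfied by
   -mavxvnni alone.  */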
12274 if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
12275 == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
12276 && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0)
12277 isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A);
12278
12279 if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
12280 == (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
12281 && (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0)
12282 isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32);
12283
12284 if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
12285 == (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
12286 && (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0)
12287 isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4);
12288
12289 if ((((bisa & (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
12290 == (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
12291 || (bisa2 & OPTION_MASK_ISA2_AVXVNNI) != 0)
12292 && (((isa & (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
12293 == (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
12294 || (isa2 & OPTION_MASK_ISA2_AVXVNNI) != 0))
12295 {
12296 isa |= OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL;
12297 isa2 |= OPTION_MASK_ISA2_AVXVNNI;
12298 }
12299
12300 if ((bisa & OPTION_MASK_ISA_MMX) && !TARGET_MMX && TARGET_MMX_WITH_SSE
12301 /* __builtin_ia32_maskmovq requires MMX registers. */
12302 && fcode != IX86_BUILTIN_MASKMOVQ)
12303 {
12304 bisa &= ~OPTION_MASK_ISA_MMX;
12305 bisa |= OPTION_MASK_ISA_SSE2;
12306 }
12307
12308 if (pbisa)
12309 *pbisa = bisa;
12310 if (pbisa2)
12311 *pbisa2 = bisa2;
12312
12313 return (bisa & isa) == bisa && (bisa2 & isa2) == bisa2;
12314 }
12315
12316 /* Expand an expression EXP that calls a built-in function,
12317 with result going to TARGET if that's convenient
12318 (and in mode MODE if that's convenient).
12319 SUBTARGET may be used as the target for computing one of EXP's operands.
12320 IGNORE is nonzero if the value is to be ignored. */
12321
12322 rtx
12323 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
12324 machine_mode mode, int ignore)
12325 {
12326 size_t i;
12327 enum insn_code icode, icode2;
12328 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
12329 tree arg0, arg1, arg2, arg3, arg4;
12330 rtx op0, op1, op2, op3, op4, pat, pat2, insn;
12331 machine_mode mode0, mode1, mode2, mode3, mode4;
12332 unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
12333 HOST_WIDE_INT bisa, bisa2;
12334
12335 /* For CPU builtins that can be folded, fold first and expand the fold. */
12336 switch (fcode)
12337 {
12338 case IX86_BUILTIN_CPU_INIT:
12339 {
12340 /* Make it call __cpu_indicator_init in libgcc. */
12341 tree call_expr, fndecl, type;
12342 type = build_function_type_list (integer_type_node, NULL_TREE);
12343 fndecl = build_fn_decl ("__cpu_indicator_init", type);
12344 call_expr = build_call_expr (fndecl, 0);
12345 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
12346 }
12347 case IX86_BUILTIN_CPU_IS:
12348 case IX86_BUILTIN_CPU_SUPPORTS:
12349 {
12350 tree arg0 = CALL_EXPR_ARG (exp, 0);
12351 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
12352 gcc_assert (fold_expr != NULL_TREE);
12353 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
12354 }
12355 }
12356
12357 if (!ix86_check_builtin_isa_match (fcode, &bisa, &bisa2))
12358 {
12359 bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT;
12360 if (TARGET_ABI_X32)
12361 bisa |= OPTION_MASK_ABI_X32;
12362 else
12363 bisa |= OPTION_MASK_ABI_64;
12364 char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
12365 (enum fpmath_unit) 0,
12366 (enum prefer_vector_width) 0,
12367 PVW_NONE, PVW_NONE,
12368 false, add_abi_p);
12369 if (!opts)
12370 error ("%qE needs unknown isa option", fndecl);
12371 else
12372 {
12373 gcc_assert (opts != NULL);
12374 error ("%qE needs isa option %s", fndecl, opts);
12375 free (opts);
12376 }
12377 return expand_call (exp, target, ignore);
12378 }
12379
12380 switch (fcode)
12381 {
12382 case IX86_BUILTIN_MASKMOVQ:
12383 case IX86_BUILTIN_MASKMOVDQU:
12384 icode = (fcode == IX86_BUILTIN_MASKMOVQ
12385 ? CODE_FOR_mmx_maskmovq
12386 : CODE_FOR_sse2_maskmovdqu);
12387 /* Note the arg order is different from the operand order. */
12388 arg1 = CALL_EXPR_ARG (exp, 0);
12389 arg2 = CALL_EXPR_ARG (exp, 1);
12390 arg0 = CALL_EXPR_ARG (exp, 2);
12391 op0 = expand_normal (arg0);
12392 op1 = expand_normal (arg1);
12393 op2 = expand_normal (arg2);
12394 mode0 = insn_data[icode].operand[0].mode;
12395 mode1 = insn_data[icode].operand[1].mode;
12396 mode2 = insn_data[icode].operand[2].mode;
12397
12398 op0 = ix86_zero_extend_to_Pmode (op0);
12399 op0 = gen_rtx_MEM (mode1, op0);
12400
12401 if (!insn_data[icode].operand[0].predicate (op0, mode0))
12402 op0 = copy_to_mode_reg (mode0, op0);
12403 if (!insn_data[icode].operand[1].predicate (op1, mode1))
12404 op1 = copy_to_mode_reg (mode1, op1);
12405 if (!insn_data[icode].operand[2].predicate (op2, mode2))
12406 op2 = copy_to_mode_reg (mode2, op2);
12407 pat = GEN_FCN (icode) (op0, op1, op2);
12408 if (! pat)
12409 return 0;
12410 emit_insn (pat);
12411 return 0;
12412
12413 case IX86_BUILTIN_LDMXCSR:
12414 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
12415 target = assign_386_stack_local (SImode, SLOT_TEMP);
12416 emit_move_insn (target, op0);
12417 emit_insn (gen_sse_ldmxcsr (target));
12418 return 0;
12419
12420 case IX86_BUILTIN_STMXCSR:
12421 target = assign_386_stack_local (SImode, SLOT_TEMP);
12422 emit_insn (gen_sse_stmxcsr (target));
12423 return copy_to_mode_reg (SImode, target);
12424
12425 case IX86_BUILTIN_CLFLUSH:
12426 arg0 = CALL_EXPR_ARG (exp, 0);
12427 op0 = expand_normal (arg0);
12428 icode = CODE_FOR_sse2_clflush;
12429 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12430 op0 = ix86_zero_extend_to_Pmode (op0);
12431
12432 emit_insn (gen_sse2_clflush (op0));
12433 return 0;
12434
12435 case IX86_BUILTIN_CLWB:
12436 arg0 = CALL_EXPR_ARG (exp, 0);
12437 op0 = expand_normal (arg0);
12438 icode = CODE_FOR_clwb;
12439 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12440 op0 = ix86_zero_extend_to_Pmode (op0);
12441
12442 emit_insn (gen_clwb (op0));
12443 return 0;
12444
12445 case IX86_BUILTIN_CLFLUSHOPT:
12446 arg0 = CALL_EXPR_ARG (exp, 0);
12447 op0 = expand_normal (arg0);
12448 icode = CODE_FOR_clflushopt;
12449 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12450 op0 = ix86_zero_extend_to_Pmode (op0);
12451
12452 emit_insn (gen_clflushopt (op0));
12453 return 0;
12454
12455 case IX86_BUILTIN_MONITOR:
12456 case IX86_BUILTIN_MONITORX:
12457 arg0 = CALL_EXPR_ARG (exp, 0);
12458 arg1 = CALL_EXPR_ARG (exp, 1);
12459 arg2 = CALL_EXPR_ARG (exp, 2);
12460 op0 = expand_normal (arg0);
12461 op1 = expand_normal (arg1);
12462 op2 = expand_normal (arg2);
12463 if (!REG_P (op0))
12464 op0 = ix86_zero_extend_to_Pmode (op0);
12465 if (!REG_P (op1))
12466 op1 = copy_to_mode_reg (SImode, op1);
12467 if (!REG_P (op2))
12468 op2 = copy_to_mode_reg (SImode, op2);
12469
12470 emit_insn (fcode == IX86_BUILTIN_MONITOR
12471 ? gen_sse3_monitor (Pmode, op0, op1, op2)
12472 : gen_monitorx (Pmode, op0, op1, op2));
12473 return 0;
12474
12475 case IX86_BUILTIN_MWAIT:
12476 arg0 = CALL_EXPR_ARG (exp, 0);
12477 arg1 = CALL_EXPR_ARG (exp, 1);
12478 op0 = expand_normal (arg0);
12479 op1 = expand_normal (arg1);
12480 if (!REG_P (op0))
12481 op0 = copy_to_mode_reg (SImode, op0);
12482 if (!REG_P (op1))
12483 op1 = copy_to_mode_reg (SImode, op1);
12484 emit_insn (gen_sse3_mwait (op0, op1));
12485 return 0;
12486
12487 case IX86_BUILTIN_MWAITX:
12488 arg0 = CALL_EXPR_ARG (exp, 0);
12489 arg1 = CALL_EXPR_ARG (exp, 1);
12490 arg2 = CALL_EXPR_ARG (exp, 2);
12491 op0 = expand_normal (arg0);
12492 op1 = expand_normal (arg1);
12493 op2 = expand_normal (arg2);
12494 if (!REG_P (op0))
12495 op0 = copy_to_mode_reg (SImode, op0);
12496 if (!REG_P (op1))
12497 op1 = copy_to_mode_reg (SImode, op1);
12498 if (!REG_P (op2))
12499 op2 = copy_to_mode_reg (SImode, op2);
12500 emit_insn (gen_mwaitx (op0, op1, op2));
12501 return 0;
12502
12503 case IX86_BUILTIN_UMONITOR:
12504 arg0 = CALL_EXPR_ARG (exp, 0);
12505 op0 = expand_normal (arg0);
12506
12507 op0 = ix86_zero_extend_to_Pmode (op0);
12508 emit_insn (gen_umonitor (Pmode, op0));
12509 return 0;
12510
12511 case IX86_BUILTIN_UMWAIT:
12512 case IX86_BUILTIN_TPAUSE:
12513 arg0 = CALL_EXPR_ARG (exp, 0);
12514 arg1 = CALL_EXPR_ARG (exp, 1);
12515 op0 = expand_normal (arg0);
12516 op1 = expand_normal (arg1);
12517
12518 if (!REG_P (op0))
12519 op0 = copy_to_mode_reg (SImode, op0);
12520
12521 op1 = force_reg (DImode, op1);
12522
12523 if (TARGET_64BIT)
12524 {
12525 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
12526 NULL, 1, OPTAB_DIRECT);
12527 switch (fcode)
12528 {
12529 case IX86_BUILTIN_UMWAIT:
12530 icode = CODE_FOR_umwait_rex64;
12531 break;
12532 case IX86_BUILTIN_TPAUSE:
12533 icode = CODE_FOR_tpause_rex64;
12534 break;
12535 default:
12536 gcc_unreachable ();
12537 }
12538
12539 op2 = gen_lowpart (SImode, op2);
12540 op1 = gen_lowpart (SImode, op1);
12541 pat = GEN_FCN (icode) (op0, op1, op2);
12542 }
12543 else
12544 {
12545 switch (fcode)
12546 {
12547 case IX86_BUILTIN_UMWAIT:
12548 icode = CODE_FOR_umwait;
12549 break;
12550 case IX86_BUILTIN_TPAUSE:
12551 icode = CODE_FOR_tpause;
12552 break;
12553 default:
12554 gcc_unreachable ();
12555 }
12556 pat = GEN_FCN (icode) (op0, op1);
12557 }
12558
12559 if (!pat)
12560 return 0;
12561
12562 emit_insn (pat);
12563
12564 if (target == 0
12565 || !register_operand (target, QImode))
12566 target = gen_reg_rtx (QImode);
12567
12568 pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
12569 const0_rtx);
12570 emit_insn (gen_rtx_SET (target, pat));
12571
12572 return target;
12573
12574 case IX86_BUILTIN_TESTUI:
12575 emit_insn (gen_testui ());
12576
12577 if (target == 0
12578 || !register_operand (target, QImode))
12579 target = gen_reg_rtx (QImode);
12580
12581 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
12582 const0_rtx);
12583 emit_insn (gen_rtx_SET (target, pat));
12584
12585 return target;
12586
12587 case IX86_BUILTIN_CLZERO:
12588 arg0 = CALL_EXPR_ARG (exp, 0);
12589 op0 = expand_normal (arg0);
12590 if (!REG_P (op0))
12591 op0 = ix86_zero_extend_to_Pmode (op0);
12592 emit_insn (gen_clzero (Pmode, op0));
12593 return 0;
12594
12595 case IX86_BUILTIN_CLDEMOTE:
12596 arg0 = CALL_EXPR_ARG (exp, 0);
12597 op0 = expand_normal (arg0);
12598 icode = CODE_FOR_cldemote;
12599 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12600 op0 = ix86_zero_extend_to_Pmode (op0);
12601
12602 emit_insn (gen_cldemote (op0));
12603 return 0;
12604
12605 case IX86_BUILTIN_LOADIWKEY:
12606 {
12607 arg0 = CALL_EXPR_ARG (exp, 0);
12608 arg1 = CALL_EXPR_ARG (exp, 1);
12609 arg2 = CALL_EXPR_ARG (exp, 2);
12610 arg3 = CALL_EXPR_ARG (exp, 3);
12611
12612 op0 = expand_normal (arg0);
12613 op1 = expand_normal (arg1);
12614 op2 = expand_normal (arg2);
12615 op3 = expand_normal (arg3);
12616
12617 if (!REG_P (op0))
12618 op0 = copy_to_mode_reg (V2DImode, op0);
12619 if (!REG_P (op1))
12620 op1 = copy_to_mode_reg (V2DImode, op1);
12621 if (!REG_P (op2))
12622 op2 = copy_to_mode_reg (V2DImode, op2);
12623 if (!REG_P (op3))
12624 op3 = copy_to_mode_reg (SImode, op3);
12625
12626 emit_insn (gen_loadiwkey (op0, op1, op2, op3));
12627
12628 return 0;
12629 }
12630
12631 case IX86_BUILTIN_AESDEC128KLU8:
12632 icode = CODE_FOR_aesdec128klu8;
12633 goto aesdecenc_expand;
12634
12635 case IX86_BUILTIN_AESDEC256KLU8:
12636 icode = CODE_FOR_aesdec256klu8;
12637 goto aesdecenc_expand;
12638
12639 case IX86_BUILTIN_AESENC128KLU8:
12640 icode = CODE_FOR_aesenc128klu8;
12641 goto aesdecenc_expand;
12642
12643 case IX86_BUILTIN_AESENC256KLU8:
12644 icode = CODE_FOR_aesenc256klu8;
12645
12646 aesdecenc_expand:
12647
12648 arg0 = CALL_EXPR_ARG (exp, 0); // __m128i *odata
12649 arg1 = CALL_EXPR_ARG (exp, 1); // __m128i idata
12650 arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
12651
12652 op0 = expand_normal (arg0);
12653 op1 = expand_normal (arg1);
12654 op2 = expand_normal (arg2);
12655
12656 if (!address_operand (op0, V2DImode))
12657 {
12658 op0 = convert_memory_address (Pmode, op0);
12659 op0 = copy_addr_to_reg (op0);
12660 }
12661 op0 = gen_rtx_MEM (V2DImode, op0);
12662
12663 if (!REG_P (op1))
12664 op1 = copy_to_mode_reg (V2DImode, op1);
12665
12666 if (!address_operand (op2, VOIDmode))
12667 {
12668 op2 = convert_memory_address (Pmode, op2);
12669 op2 = copy_addr_to_reg (op2);
12670 }
12671 op2 = gen_rtx_MEM (BLKmode, op2);
12672
12673 emit_insn (GEN_FCN (icode) (op1, op1, op2));
12674
12675 if (target == 0)
12676 target = gen_reg_rtx (QImode);
12677
12678       /* NB: For the aesenc/aesdec Key Locker insns, ZF will be set when a
12679 	 runtime error occurs.  The output should then be cleared for safety. */
12680 rtx_code_label *ok_label;
12681 rtx tmp;
12682
12683 tmp = gen_rtx_REG (CCZmode, FLAGS_REG);
12684 pat = gen_rtx_EQ (QImode, tmp, const0_rtx);
12685 ok_label = gen_label_rtx ();
12686 emit_cmp_and_jump_insns (tmp, const0_rtx, NE, 0, GET_MODE (tmp),
12687 true, ok_label);
12688       /* The runtime error seldom occurs, so predict the OK path as the
12689 	 hot path so that it is laid out as the fallthrough block. */
12690 predict_jump (REG_BR_PROB_BASE * 90 / 100);
12691
12692 emit_insn (gen_rtx_SET (op1, const0_rtx));
12693
12694 emit_label (ok_label);
12695 emit_insn (gen_rtx_SET (target, pat));
12696 emit_insn (gen_rtx_SET (op0, op1));
12697
12698 return target;
12699
12700 case IX86_BUILTIN_AESDECWIDE128KLU8:
12701 icode = CODE_FOR_aesdecwide128klu8;
12702 goto wideaesdecenc_expand;
12703
12704 case IX86_BUILTIN_AESDECWIDE256KLU8:
12705 icode = CODE_FOR_aesdecwide256klu8;
12706 goto wideaesdecenc_expand;
12707
12708 case IX86_BUILTIN_AESENCWIDE128KLU8:
12709 icode = CODE_FOR_aesencwide128klu8;
12710 goto wideaesdecenc_expand;
12711
12712 case IX86_BUILTIN_AESENCWIDE256KLU8:
12713 icode = CODE_FOR_aesencwide256klu8;
12714
12715 wideaesdecenc_expand:
12716
12717 rtx xmm_regs[8];
12718 rtx op;
12719
12720 arg0 = CALL_EXPR_ARG (exp, 0); // __m128i * odata
12721 arg1 = CALL_EXPR_ARG (exp, 1); // const __m128i * idata
12722 arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
12723
12724 op0 = expand_normal (arg0);
12725 op1 = expand_normal (arg1);
12726 op2 = expand_normal (arg2);
12727
12728 if (!address_operand (op2, VOIDmode))
12729 {
12730 op2 = convert_memory_address (Pmode, op2);
12731 op2 = copy_addr_to_reg (op2);
12732 }
12733 op2 = gen_rtx_MEM (BLKmode, op2);
12734
12735 for (i = 0; i < 8; i++)
12736 {
12737 xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
12738
12739 op = gen_rtx_MEM (V2DImode,
12740 plus_constant (Pmode, op1, (i * 16)));
12741
12742 emit_move_insn (xmm_regs[i], op);
12743 }
12744
12745 emit_insn (GEN_FCN (icode) (op2));
12746
12747 if (target == 0)
12748 target = gen_reg_rtx (QImode);
12749
12750 tmp = gen_rtx_REG (CCZmode, FLAGS_REG);
12751 pat = gen_rtx_EQ (QImode, tmp, const0_rtx);
12752 ok_label = gen_label_rtx ();
12753 emit_cmp_and_jump_insns (tmp, const0_rtx, NE, 0, GET_MODE (tmp),
12754 true, ok_label);
12755 predict_jump (REG_BR_PROB_BASE * 90 / 100);
12756
12757 for (i = 0; i < 8; i++)
12758 emit_insn (gen_rtx_SET (xmm_regs[i], const0_rtx));
12759
12760 emit_label (ok_label);
12761 emit_insn (gen_rtx_SET (target, pat));
12762
12763 for (i = 0; i < 8; i++)
12764 {
12765 op = gen_rtx_MEM (V2DImode,
12766 plus_constant (Pmode, op0, (i * 16)));
12767 emit_move_insn (op, xmm_regs[i]);
12768 }
12769
12770 return target;
12771
12772 case IX86_BUILTIN_ENCODEKEY128U32:
12773 {
12774 rtx op, xmm_regs[7];
12775
12776 arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
12777 arg1 = CALL_EXPR_ARG (exp, 1); // __m128i key
12778 arg2 = CALL_EXPR_ARG (exp, 2); // void *h
12779
12780 op0 = expand_normal (arg0);
12781 op1 = expand_normal (arg1);
12782 op2 = expand_normal (arg2);
12783
12784 if (!REG_P (op0))
12785 op0 = copy_to_mode_reg (SImode, op0);
12786
12787 op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
12788 emit_move_insn (op, op1);
12789
12790 for (i = 0; i < 3; i++)
12791 xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
12792
12793 if (target == 0)
12794 target = gen_reg_rtx (SImode);
12795
12796 emit_insn (gen_encodekey128u32 (target, op0));
12797
12798 for (i = 0; i < 3; i++)
12799 {
12800 op = gen_rtx_MEM (V2DImode,
12801 plus_constant (Pmode, op2, (i * 16)));
12802 emit_move_insn (op, xmm_regs[i]);
12803 }
12804
12805 return target;
12806 }
12807 case IX86_BUILTIN_ENCODEKEY256U32:
12808 {
12809 rtx op, xmm_regs[7];
12810
12811 arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
12812 arg1 = CALL_EXPR_ARG (exp, 1); // __m128i keylow
12813 arg2 = CALL_EXPR_ARG (exp, 2); // __m128i keyhi
12814 arg3 = CALL_EXPR_ARG (exp, 3); // void *h
12815
12816 op0 = expand_normal (arg0);
12817 op1 = expand_normal (arg1);
12818 op2 = expand_normal (arg2);
12819 op3 = expand_normal (arg3);
12820
12821 if (!REG_P (op0))
12822 op0 = copy_to_mode_reg (SImode, op0);
12823
12824 	/* Force the use of xmm0 and xmm1 for keylow and keyhi.  */
12825 op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
12826 emit_move_insn (op, op1);
12827 op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (1));
12828 emit_move_insn (op, op2);
12829
12830 for (i = 0; i < 4; i++)
12831 xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
12832
12833 if (target == 0)
12834 target = gen_reg_rtx (SImode);
12835
12836 emit_insn (gen_encodekey256u32 (target, op0));
12837
12838 for (i = 0; i < 4; i++)
12839 {
12840 op = gen_rtx_MEM (V2DImode,
12841 plus_constant (Pmode, op3, (i * 16)));
12842 emit_move_insn (op, xmm_regs[i]);
12843 }
12844
12845 return target;
12846 }
12847
12848 case IX86_BUILTIN_VEC_INIT_V2SI:
12849 case IX86_BUILTIN_VEC_INIT_V4HI:
12850 case IX86_BUILTIN_VEC_INIT_V8QI:
12851 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
12852
12853 case IX86_BUILTIN_VEC_EXT_V2DF:
12854 case IX86_BUILTIN_VEC_EXT_V2DI:
12855 case IX86_BUILTIN_VEC_EXT_V4SF:
12856 case IX86_BUILTIN_VEC_EXT_V4SI:
12857 case IX86_BUILTIN_VEC_EXT_V8HI:
12858 case IX86_BUILTIN_VEC_EXT_V2SI:
12859 case IX86_BUILTIN_VEC_EXT_V4HI:
12860 case IX86_BUILTIN_VEC_EXT_V16QI:
12861 return ix86_expand_vec_ext_builtin (exp, target);
12862
12863 case IX86_BUILTIN_VEC_SET_V2DI:
12864 case IX86_BUILTIN_VEC_SET_V4SF:
12865 case IX86_BUILTIN_VEC_SET_V4SI:
12866 case IX86_BUILTIN_VEC_SET_V8HI:
12867 case IX86_BUILTIN_VEC_SET_V4HI:
12868 case IX86_BUILTIN_VEC_SET_V16QI:
12869 return ix86_expand_vec_set_builtin (exp);
12870
12871 case IX86_BUILTIN_NANQ:
12872 case IX86_BUILTIN_NANSQ:
12873 return expand_call (exp, target, ignore);
12874
12875 case IX86_BUILTIN_RDPID:
12876
12877 op0 = gen_reg_rtx (word_mode);
12878
12879 if (TARGET_64BIT)
12880 {
12881 insn = gen_rdpid_rex64 (op0);
12882 op0 = convert_to_mode (SImode, op0, 1);
12883 }
12884 else
12885 insn = gen_rdpid (op0);
12886
12887 emit_insn (insn);
12888
12889 if (target == 0
12890 || !register_operand (target, SImode))
12891 target = gen_reg_rtx (SImode);
12892
12893 emit_move_insn (target, op0);
12894 return target;
12895
12896 case IX86_BUILTIN_2INTERSECTD512:
12897 case IX86_BUILTIN_2INTERSECTQ512:
12898 case IX86_BUILTIN_2INTERSECTD256:
12899 case IX86_BUILTIN_2INTERSECTQ256:
12900 case IX86_BUILTIN_2INTERSECTD128:
12901 case IX86_BUILTIN_2INTERSECTQ128:
12902 arg0 = CALL_EXPR_ARG (exp, 0);
12903 arg1 = CALL_EXPR_ARG (exp, 1);
12904 arg2 = CALL_EXPR_ARG (exp, 2);
12905 arg3 = CALL_EXPR_ARG (exp, 3);
12906 op0 = expand_normal (arg0);
12907 op1 = expand_normal (arg1);
12908 op2 = expand_normal (arg2);
12909 op3 = expand_normal (arg3);
12910
12911 if (!address_operand (op0, VOIDmode))
12912 {
12913 op0 = convert_memory_address (Pmode, op0);
12914 op0 = copy_addr_to_reg (op0);
12915 }
12916 if (!address_operand (op1, VOIDmode))
12917 {
12918 op1 = convert_memory_address (Pmode, op1);
12919 op1 = copy_addr_to_reg (op1);
12920 }
12921
12922 switch (fcode)
12923 {
12924 case IX86_BUILTIN_2INTERSECTD512:
12925 mode4 = P2HImode;
12926 icode = CODE_FOR_avx512vp2intersect_2intersectv16si;
12927 break;
12928 case IX86_BUILTIN_2INTERSECTQ512:
12929 mode4 = P2QImode;
12930 icode = CODE_FOR_avx512vp2intersect_2intersectv8di;
12931 break;
12932 case IX86_BUILTIN_2INTERSECTD256:
12933 mode4 = P2QImode;
12934 icode = CODE_FOR_avx512vp2intersect_2intersectv8si;
12935 break;
12936 case IX86_BUILTIN_2INTERSECTQ256:
12937 mode4 = P2QImode;
12938 icode = CODE_FOR_avx512vp2intersect_2intersectv4di;
12939 break;
12940 case IX86_BUILTIN_2INTERSECTD128:
12941 mode4 = P2QImode;
12942 icode = CODE_FOR_avx512vp2intersect_2intersectv4si;
12943 break;
12944 case IX86_BUILTIN_2INTERSECTQ128:
12945 mode4 = P2QImode;
12946 icode = CODE_FOR_avx512vp2intersect_2intersectv2di;
12947 break;
12948 default:
12949 gcc_unreachable ();
12950 }
12951
12952 mode2 = insn_data[icode].operand[1].mode;
12953 mode3 = insn_data[icode].operand[2].mode;
12954 if (!insn_data[icode].operand[1].predicate (op2, mode2))
12955 op2 = copy_to_mode_reg (mode2, op2);
12956 if (!insn_data[icode].operand[2].predicate (op3, mode3))
12957 op3 = copy_to_mode_reg (mode3, op3);
12958
12959 op4 = gen_reg_rtx (mode4);
12960 emit_insn (GEN_FCN (icode) (op4, op2, op3));
12961 mode0 = mode4 == P2HImode ? HImode : QImode;
12962 emit_move_insn (gen_rtx_MEM (mode0, op0),
12963 gen_lowpart (mode0, op4));
12964 emit_move_insn (gen_rtx_MEM (mode0, op1),
12965 gen_highpart (mode0, op4));
12966
12967 return 0;
12968
12969 case IX86_BUILTIN_RDPMC:
12970 case IX86_BUILTIN_RDTSC:
12971 case IX86_BUILTIN_RDTSCP:
12972 case IX86_BUILTIN_XGETBV:
12973
12974 op0 = gen_reg_rtx (DImode);
12975 op1 = gen_reg_rtx (DImode);
12976
12977 if (fcode == IX86_BUILTIN_RDPMC)
12978 {
12979 arg0 = CALL_EXPR_ARG (exp, 0);
12980 op2 = expand_normal (arg0);
12981 if (!register_operand (op2, SImode))
12982 op2 = copy_to_mode_reg (SImode, op2);
12983
12984 insn = (TARGET_64BIT
12985 ? gen_rdpmc_rex64 (op0, op1, op2)
12986 : gen_rdpmc (op0, op2));
12987 emit_insn (insn);
12988 }
12989 else if (fcode == IX86_BUILTIN_XGETBV)
12990 {
12991 arg0 = CALL_EXPR_ARG (exp, 0);
12992 op2 = expand_normal (arg0);
12993 if (!register_operand (op2, SImode))
12994 op2 = copy_to_mode_reg (SImode, op2);
12995
12996 insn = (TARGET_64BIT
12997 ? gen_xgetbv_rex64 (op0, op1, op2)
12998 : gen_xgetbv (op0, op2));
12999 emit_insn (insn);
13000 }
13001 else if (fcode == IX86_BUILTIN_RDTSC)
13002 {
13003 insn = (TARGET_64BIT
13004 ? gen_rdtsc_rex64 (op0, op1)
13005 : gen_rdtsc (op0));
13006 emit_insn (insn);
13007 }
13008 else
13009 {
13010 op2 = gen_reg_rtx (SImode);
13011
13012 insn = (TARGET_64BIT
13013 ? gen_rdtscp_rex64 (op0, op1, op2)
13014 : gen_rdtscp (op0, op2));
13015 emit_insn (insn);
13016
13017 arg0 = CALL_EXPR_ARG (exp, 0);
13018 op4 = expand_normal (arg0);
13019 if (!address_operand (op4, VOIDmode))
13020 {
13021 op4 = convert_memory_address (Pmode, op4);
13022 op4 = copy_addr_to_reg (op4);
13023 }
13024 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
13025 }
13026
13027 if (target == 0
13028 || !register_operand (target, DImode))
13029 target = gen_reg_rtx (DImode);
13030
13031 if (TARGET_64BIT)
13032 {
13033 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
13034 op1, 1, OPTAB_DIRECT);
13035 op0 = expand_simple_binop (DImode, IOR, op0, op1,
13036 op0, 1, OPTAB_DIRECT);
13037 }
13038
13039 emit_move_insn (target, op0);
13040 return target;
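      /* All four builtins above return a 64-bit EDX:EAX pair.  On 64-bit
	 targets the two DImode halves are recombined as roughly

	    result = (hi << 32) | lo;

	 using an ASHIFT and an IOR; on 32-bit targets op0 already holds the
	 full DImode value.  Illustrative use, assuming the ia32intrin.h
	 wrapper: unsigned long long tsc = __rdtsc ();  */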
13041
13042 case IX86_BUILTIN_ENQCMD:
13043 case IX86_BUILTIN_ENQCMDS:
13044 case IX86_BUILTIN_MOVDIR64B:
13045
13046 arg0 = CALL_EXPR_ARG (exp, 0);
13047 arg1 = CALL_EXPR_ARG (exp, 1);
13048 op0 = expand_normal (arg0);
13049 op1 = expand_normal (arg1);
13050
13051 op0 = ix86_zero_extend_to_Pmode (op0);
13052 if (!address_operand (op1, VOIDmode))
13053 {
13054 op1 = convert_memory_address (Pmode, op1);
13055 op1 = copy_addr_to_reg (op1);
13056 }
13057 op1 = gen_rtx_MEM (XImode, op1);
13058
13059 if (fcode == IX86_BUILTIN_MOVDIR64B)
13060 {
13061 emit_insn (gen_movdir64b (Pmode, op0, op1));
13062 return 0;
13063 }
13064 else
13065 {
13066 if (target == 0
13067 || !register_operand (target, SImode))
13068 target = gen_reg_rtx (SImode);
13069
13070 emit_move_insn (target, const0_rtx);
13071 target = gen_rtx_SUBREG (QImode, target, 0);
13072
13073 int unspecv = (fcode == IX86_BUILTIN_ENQCMD
13074 ? UNSPECV_ENQCMD
13075 : UNSPECV_ENQCMDS);
13076 icode = code_for_enqcmd (unspecv, Pmode);
13077 emit_insn (GEN_FCN (icode) (op0, op1));
13078
13079 emit_insn
13080 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
13081 gen_rtx_fmt_ee (EQ, QImode,
13082 gen_rtx_REG (CCZmode, FLAGS_REG),
13083 const0_rtx)));
13084 return SUBREG_REG (target);
13085 }
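      /* MOVDIR64B copies a 64-byte block and returns nothing.  ENQCMD and
	 ENQCMDS additionally report command acceptance in ZF, which the code
	 above materializes as a 0/1 byte: the QImode low part of TARGET is
	 set from an EQ test of the CCZmode flags register.  Illustrative use,
	 assuming the enqcmdintrin.h wrapper (variable names are placeholders):
	 int zf = _enqcmd (dst_portal, src_desc);  */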
13086
13087 case IX86_BUILTIN_FXSAVE:
13088 case IX86_BUILTIN_FXRSTOR:
13089 case IX86_BUILTIN_FXSAVE64:
13090 case IX86_BUILTIN_FXRSTOR64:
13091 case IX86_BUILTIN_FNSTENV:
13092 case IX86_BUILTIN_FLDENV:
13093 mode0 = BLKmode;
13094 switch (fcode)
13095 {
13096 case IX86_BUILTIN_FXSAVE:
13097 icode = CODE_FOR_fxsave;
13098 break;
13099 case IX86_BUILTIN_FXRSTOR:
13100 icode = CODE_FOR_fxrstor;
13101 break;
13102 case IX86_BUILTIN_FXSAVE64:
13103 icode = CODE_FOR_fxsave64;
13104 break;
13105 case IX86_BUILTIN_FXRSTOR64:
13106 icode = CODE_FOR_fxrstor64;
13107 break;
13108 case IX86_BUILTIN_FNSTENV:
13109 icode = CODE_FOR_fnstenv;
13110 break;
13111 case IX86_BUILTIN_FLDENV:
13112 icode = CODE_FOR_fldenv;
13113 break;
13114 default:
13115 gcc_unreachable ();
13116 }
13117
13118 arg0 = CALL_EXPR_ARG (exp, 0);
13119 op0 = expand_normal (arg0);
13120
13121 if (!address_operand (op0, VOIDmode))
13122 {
13123 op0 = convert_memory_address (Pmode, op0);
13124 op0 = copy_addr_to_reg (op0);
13125 }
13126 op0 = gen_rtx_MEM (mode0, op0);
13127
13128 pat = GEN_FCN (icode) (op0);
13129 if (pat)
13130 emit_insn (pat);
13131 return 0;
13132
13133 case IX86_BUILTIN_XSETBV:
13134 arg0 = CALL_EXPR_ARG (exp, 0);
13135 arg1 = CALL_EXPR_ARG (exp, 1);
13136 op0 = expand_normal (arg0);
13137 op1 = expand_normal (arg1);
13138
13139 if (!REG_P (op0))
13140 op0 = copy_to_mode_reg (SImode, op0);
13141
13142 op1 = force_reg (DImode, op1);
13143
13144 if (TARGET_64BIT)
13145 {
13146 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
13147 NULL, 1, OPTAB_DIRECT);
13148
13149 icode = CODE_FOR_xsetbv_rex64;
13150
13151 op2 = gen_lowpart (SImode, op2);
13152 op1 = gen_lowpart (SImode, op1);
13153 pat = GEN_FCN (icode) (op0, op1, op2);
13154 }
13155 else
13156 {
13157 icode = CODE_FOR_xsetbv;
13158
13159 pat = GEN_FCN (icode) (op0, op1);
13160 }
13161 if (pat)
13162 emit_insn (pat);
13163 return 0;
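      /* XSETBV takes the XCR index in a 32-bit register and the value in
	 EDX:EAX.  On 64-bit targets the DImode value is split with a logical
	 shift right by 32 plus two SImode lowparts, i.e. roughly
	 edx = val >> 32 and eax = (unsigned int) val; the 32-bit xsetbv
	 pattern consumes the DImode register pair directly.  Illustrative
	 use (new_mask is a placeholder; 0 selects XCR0):
	 _xsetbv (0, new_mask);  */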
13164
13165 case IX86_BUILTIN_XSAVE:
13166 case IX86_BUILTIN_XRSTOR:
13167 case IX86_BUILTIN_XSAVE64:
13168 case IX86_BUILTIN_XRSTOR64:
13169 case IX86_BUILTIN_XSAVEOPT:
13170 case IX86_BUILTIN_XSAVEOPT64:
13171 case IX86_BUILTIN_XSAVES:
13172 case IX86_BUILTIN_XRSTORS:
13173 case IX86_BUILTIN_XSAVES64:
13174 case IX86_BUILTIN_XRSTORS64:
13175 case IX86_BUILTIN_XSAVEC:
13176 case IX86_BUILTIN_XSAVEC64:
13177 arg0 = CALL_EXPR_ARG (exp, 0);
13178 arg1 = CALL_EXPR_ARG (exp, 1);
13179 op0 = expand_normal (arg0);
13180 op1 = expand_normal (arg1);
13181
13182 if (!address_operand (op0, VOIDmode))
13183 {
13184 op0 = convert_memory_address (Pmode, op0);
13185 op0 = copy_addr_to_reg (op0);
13186 }
13187 op0 = gen_rtx_MEM (BLKmode, op0);
13188
13189 op1 = force_reg (DImode, op1);
13190
13191 if (TARGET_64BIT)
13192 {
13193 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
13194 NULL, 1, OPTAB_DIRECT);
13195 switch (fcode)
13196 {
13197 case IX86_BUILTIN_XSAVE:
13198 icode = CODE_FOR_xsave_rex64;
13199 break;
13200 case IX86_BUILTIN_XRSTOR:
13201 icode = CODE_FOR_xrstor_rex64;
13202 break;
13203 case IX86_BUILTIN_XSAVE64:
13204 icode = CODE_FOR_xsave64;
13205 break;
13206 case IX86_BUILTIN_XRSTOR64:
13207 icode = CODE_FOR_xrstor64;
13208 break;
13209 case IX86_BUILTIN_XSAVEOPT:
13210 icode = CODE_FOR_xsaveopt_rex64;
13211 break;
13212 case IX86_BUILTIN_XSAVEOPT64:
13213 icode = CODE_FOR_xsaveopt64;
13214 break;
13215 case IX86_BUILTIN_XSAVES:
13216 icode = CODE_FOR_xsaves_rex64;
13217 break;
13218 case IX86_BUILTIN_XRSTORS:
13219 icode = CODE_FOR_xrstors_rex64;
13220 break;
13221 case IX86_BUILTIN_XSAVES64:
13222 icode = CODE_FOR_xsaves64;
13223 break;
13224 case IX86_BUILTIN_XRSTORS64:
13225 icode = CODE_FOR_xrstors64;
13226 break;
13227 case IX86_BUILTIN_XSAVEC:
13228 icode = CODE_FOR_xsavec_rex64;
13229 break;
13230 case IX86_BUILTIN_XSAVEC64:
13231 icode = CODE_FOR_xsavec64;
13232 break;
13233 default:
13234 gcc_unreachable ();
13235 }
13236
13237 op2 = gen_lowpart (SImode, op2);
13238 op1 = gen_lowpart (SImode, op1);
13239 pat = GEN_FCN (icode) (op0, op1, op2);
13240 }
13241 else
13242 {
13243 switch (fcode)
13244 {
13245 case IX86_BUILTIN_XSAVE:
13246 icode = CODE_FOR_xsave;
13247 break;
13248 case IX86_BUILTIN_XRSTOR:
13249 icode = CODE_FOR_xrstor;
13250 break;
13251 case IX86_BUILTIN_XSAVEOPT:
13252 icode = CODE_FOR_xsaveopt;
13253 break;
13254 case IX86_BUILTIN_XSAVES:
13255 icode = CODE_FOR_xsaves;
13256 break;
13257 case IX86_BUILTIN_XRSTORS:
13258 icode = CODE_FOR_xrstors;
13259 break;
13260 case IX86_BUILTIN_XSAVEC:
13261 icode = CODE_FOR_xsavec;
13262 break;
13263 default:
13264 gcc_unreachable ();
13265 }
13266 pat = GEN_FCN (icode) (op0, op1);
13267 }
13268
13269 if (pat)
13270 emit_insn (pat);
13271 return 0;
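      /* All of the XSAVE/XRSTOR family funnels through here.  The save area
	 is wrapped in a BLKmode MEM because its size depends on the enabled
	 state components, and the 64-bit feature mask in op1 is split into an
	 EDX:EAX pair on 64-bit targets the same way as for XSETBV above.
	 Illustrative use, assuming the xsaveintrin.h wrapper (area and mask
	 are placeholders): _xsave (area, mask);  */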
13272
13273 case IX86_BUILTIN_LLWPCB:
13274 arg0 = CALL_EXPR_ARG (exp, 0);
13275 op0 = expand_normal (arg0);
13276
13277 if (!register_operand (op0, Pmode))
13278 op0 = ix86_zero_extend_to_Pmode (op0);
13279 emit_insn (gen_lwp_llwpcb (Pmode, op0));
13280 return 0;
13281
13282 case IX86_BUILTIN_SLWPCB:
13283 if (!target
13284 || !register_operand (target, Pmode))
13285 target = gen_reg_rtx (Pmode);
13286 emit_insn (gen_lwp_slwpcb (Pmode, target));
13287 return target;
13288
13289 case IX86_BUILTIN_LWPVAL32:
13290 case IX86_BUILTIN_LWPVAL64:
13291 case IX86_BUILTIN_LWPINS32:
13292 case IX86_BUILTIN_LWPINS64:
13293 mode = ((fcode == IX86_BUILTIN_LWPVAL32
13294 || fcode == IX86_BUILTIN_LWPINS32)
13295 ? SImode : DImode);
13296
13297 if (fcode == IX86_BUILTIN_LWPVAL32
13298 || fcode == IX86_BUILTIN_LWPVAL64)
13299 icode = code_for_lwp_lwpval (mode);
13300 else
13301 icode = code_for_lwp_lwpins (mode);
13302
13303 arg0 = CALL_EXPR_ARG (exp, 0);
13304 arg1 = CALL_EXPR_ARG (exp, 1);
13305 arg2 = CALL_EXPR_ARG (exp, 2);
13306 op0 = expand_normal (arg0);
13307 op1 = expand_normal (arg1);
13308 op2 = expand_normal (arg2);
13309 mode0 = insn_data[icode].operand[0].mode;
13310
13311 if (!insn_data[icode].operand[0].predicate (op0, mode0))
13312 op0 = copy_to_mode_reg (mode0, op0);
13313 if (!insn_data[icode].operand[1].predicate (op1, SImode))
13314 op1 = copy_to_mode_reg (SImode, op1);
13315
13316 if (!CONST_INT_P (op2))
13317 {
13318 error ("the last argument must be a 32-bit immediate");
13319 return const0_rtx;
13320 }
13321
13322 emit_insn (GEN_FCN (icode) (op0, op1, op2));
13323
13324 if (fcode == IX86_BUILTIN_LWPINS32
13325 || fcode == IX86_BUILTIN_LWPINS64)
13326 {
13327 if (target == 0
13328 || !nonimmediate_operand (target, QImode))
13329 target = gen_reg_rtx (QImode);
13330
13331 pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
13332 const0_rtx);
13333 emit_insn (gen_rtx_SET (target, pat));
13334
13335 return target;
13336 }
13337 else
13338 return 0;
13339
13340 case IX86_BUILTIN_BEXTRI32:
13341 case IX86_BUILTIN_BEXTRI64:
13342 mode = (fcode == IX86_BUILTIN_BEXTRI32 ? SImode : DImode);
13343
13344 arg0 = CALL_EXPR_ARG (exp, 0);
13345 arg1 = CALL_EXPR_ARG (exp, 1);
13346 op0 = expand_normal (arg0);
13347 op1 = expand_normal (arg1);
13348
13349 if (!CONST_INT_P (op1))
13350 {
13351 error ("last argument must be an immediate");
13352 return const0_rtx;
13353 }
13354 else
13355 {
13356 unsigned char lsb_index = UINTVAL (op1);
13357 unsigned char length = UINTVAL (op1) >> 8;
13358
13359 unsigned char bitsize = GET_MODE_BITSIZE (mode);
13360
13361 icode = code_for_tbm_bextri (mode);
13362
13363 mode1 = insn_data[icode].operand[1].mode;
13364 if (!insn_data[icode].operand[1].predicate (op0, mode1))
13365 op0 = copy_to_mode_reg (mode1, op0);
13366
13367 mode0 = insn_data[icode].operand[0].mode;
13368 if (target == 0
13369 || !register_operand (target, mode0))
13370 target = gen_reg_rtx (mode0);
13371
13372 if (length == 0 || lsb_index >= bitsize)
13373 {
13374 emit_move_insn (target, const0_rtx);
13375 return target;
13376 }
13377
13378 if (length + lsb_index > bitsize)
13379 length = bitsize - lsb_index;
13380
13381 op1 = GEN_INT (length);
13382 op2 = GEN_INT (lsb_index);
13383
13384 emit_insn (GEN_FCN (icode) (target, op0, op1, op2));
13385 return target;
13386 }
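      /* The TBM BEXTRI immediate packs the starting bit position in bits 7:0
	 and the field length in bits 15:8.  Degenerate encodings are folded
	 at expand time (a zero length or a start past the operand width
	 yields 0, and an over-long field is clamped), so the tbm_bextri
	 pattern only ever sees an in-range (length, lsb) pair.  For in-range
	 values the result is roughly (src >> lsb) & ((1ULL << length) - 1).
	 Illustrative use, assuming the tbmintrin.h wrapper:
	 __bextri_u32 (x, (length << 8) | lsb);  */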
13387
13388 case IX86_BUILTIN_RDRAND16_STEP:
13389 mode = HImode;
13390 goto rdrand_step;
13391
13392 case IX86_BUILTIN_RDRAND32_STEP:
13393 mode = SImode;
13394 goto rdrand_step;
13395
13396 case IX86_BUILTIN_RDRAND64_STEP:
13397 mode = DImode;
13398
13399 rdrand_step:
13400 arg0 = CALL_EXPR_ARG (exp, 0);
13401 op1 = expand_normal (arg0);
13402 if (!address_operand (op1, VOIDmode))
13403 {
13404 op1 = convert_memory_address (Pmode, op1);
13405 op1 = copy_addr_to_reg (op1);
13406 }
13407
13408 op0 = gen_reg_rtx (mode);
13409 emit_insn (gen_rdrand (mode, op0));
13410
13411 emit_move_insn (gen_rtx_MEM (mode, op1), op0);
13412
13413 op1 = force_reg (SImode, const1_rtx);
13414
13415 /* Emit SImode conditional move. */
13416 if (mode == HImode)
13417 {
13418 if (TARGET_ZERO_EXTEND_WITH_AND
13419 && optimize_function_for_speed_p (cfun))
13420 {
13421 op2 = force_reg (SImode, const0_rtx);
13422
13423 emit_insn (gen_movstricthi
13424 (gen_lowpart (HImode, op2), op0));
13425 }
13426 else
13427 {
13428 op2 = gen_reg_rtx (SImode);
13429
13430 emit_insn (gen_zero_extendhisi2 (op2, op0));
13431 }
13432 }
13433 else if (mode == SImode)
13434 op2 = op0;
13435 else
13436 op2 = gen_rtx_SUBREG (SImode, op0, 0);
13437
13438 if (target == 0
13439 || !register_operand (target, SImode))
13440 target = gen_reg_rtx (SImode);
13441
13442 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
13443 const0_rtx);
13444 emit_insn (gen_rtx_SET (target,
13445 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
13446 return target;
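      /* RDRAND reports success in the carry flag.  The generated value is
	 stored through the user pointer first; the SImode return value is
	 then built with a conditional move keyed on CF, giving 1 on success
	 and, relying on RDRAND zeroing its destination on failure, 0
	 otherwise.  Illustrative use, assuming the immintrin.h wrapper (the
	 *_step intrinsics return nonzero on success):

	    unsigned int r;
	    while (!_rdrand32_step (&r))
	      ;  */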
13447
13448 case IX86_BUILTIN_RDSEED16_STEP:
13449 mode = HImode;
13450 goto rdseed_step;
13451
13452 case IX86_BUILTIN_RDSEED32_STEP:
13453 mode = SImode;
13454 goto rdseed_step;
13455
13456 case IX86_BUILTIN_RDSEED64_STEP:
13457 mode = DImode;
13458
13459 rdseed_step:
13460 arg0 = CALL_EXPR_ARG (exp, 0);
13461 op1 = expand_normal (arg0);
13462 if (!address_operand (op1, VOIDmode))
13463 {
13464 op1 = convert_memory_address (Pmode, op1);
13465 op1 = copy_addr_to_reg (op1);
13466 }
13467
13468 op0 = gen_reg_rtx (mode);
13469 emit_insn (gen_rdseed (mode, op0));
13470
13471 emit_move_insn (gen_rtx_MEM (mode, op1), op0);
13472
13473 op2 = gen_reg_rtx (QImode);
13474
13475 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
13476 const0_rtx);
13477 emit_insn (gen_rtx_SET (op2, pat));
13478
13479 if (target == 0
13480 || !register_operand (target, SImode))
13481 target = gen_reg_rtx (SImode);
13482
13483 emit_insn (gen_zero_extendqisi2 (target, op2));
13484 return target;
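      /* RDSEED also reports success in the carry flag, but the result is
	 built more directly: a QImode setcc of CF (LTU on the CCCmode flags
	 register) yields 0 or 1, which is zero-extended into the SImode
	 TARGET.  Illustrative use, assuming the immintrin.h wrapper:
	 unsigned int s; int ok = _rdseed32_step (&s);  */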
13485
13486 case IX86_BUILTIN_SBB32:
13487 icode = CODE_FOR_subborrowsi;
13488 icode2 = CODE_FOR_subborrowsi_0;
13489 mode0 = SImode;
13490 mode1 = DImode;
13491 mode2 = CCmode;
13492 goto handlecarry;
13493
13494 case IX86_BUILTIN_SBB64:
13495 icode = CODE_FOR_subborrowdi;
13496 icode2 = CODE_FOR_subborrowdi_0;
13497 mode0 = DImode;
13498 mode1 = TImode;
13499 mode2 = CCmode;
13500 goto handlecarry;
13501
13502 case IX86_BUILTIN_ADDCARRYX32:
13503 icode = CODE_FOR_addcarrysi;
13504 icode2 = CODE_FOR_addcarrysi_0;
13505 mode0 = SImode;
13506 mode1 = DImode;
13507 mode2 = CCCmode;
13508 goto handlecarry;
13509
13510 case IX86_BUILTIN_ADDCARRYX64:
13511 icode = CODE_FOR_addcarrydi;
13512 icode2 = CODE_FOR_addcarrydi_0;
13513 mode0 = DImode;
13514 mode1 = TImode;
13515 mode2 = CCCmode;
13516
13517 handlecarry:
13518 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
13519 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
13520 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
13521 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
13522
13523 op1 = expand_normal (arg0);
13524 if (!integer_zerop (arg0))
13525 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
13526
13527 op2 = expand_normal (arg1);
13528 if (!register_operand (op2, mode0))
13529 op2 = copy_to_mode_reg (mode0, op2);
13530
13531 op3 = expand_normal (arg2);
13532 if (!register_operand (op3, mode0))
13533 op3 = copy_to_mode_reg (mode0, op3);
13534
13535 op4 = expand_normal (arg3);
13536 if (!address_operand (op4, VOIDmode))
13537 {
13538 op4 = convert_memory_address (Pmode, op4);
13539 op4 = copy_addr_to_reg (op4);
13540 }
13541
13542 op0 = gen_reg_rtx (mode0);
13543 if (integer_zerop (arg0))
13544 {
13545 /* If arg0 is 0, optimize right away into an add or sub
13546 instruction that sets CCCmode flags. */
13547 op1 = gen_rtx_REG (mode2, FLAGS_REG);
13548 emit_insn (GEN_FCN (icode2) (op0, op2, op3));
13549 }
13550 else
13551 {
13552 /* Generate CF from input operand. */
13553 emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
13554
13555 /* Generate instruction that consumes CF. */
13556 op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
13557 pat = gen_rtx_LTU (mode1, op1, const0_rtx);
13558 pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
13559 emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
13560 }
13561
13562 /* Return current CF value. */
13563 if (target == 0)
13564 target = gen_reg_rtx (QImode);
13565
13566 pat = gen_rtx_LTU (QImode, op1, const0_rtx);
13567 emit_insn (gen_rtx_SET (target, pat));
13568
13569 /* Store the result. */
13570 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
13571
13572 return target;
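      /* Common tail for the add-with-carry / subtract-with-borrow builtins.
	 When the carry-in is not a literal zero it is regenerated by adding
	 0xff to it for flags only (addqi3_cconly_overflow with constm1_rtx),
	 which sets CF exactly when c_in is nonzero; the *carry pattern then
	 consumes that CF, the resulting CF becomes the return value via an
	 LTU setcc, and the sum or difference is stored through the last
	 pointer argument.  Illustrative use, assuming the adxintrin.h
	 wrapper:

	    unsigned int sum;
	    unsigned char c_out = _addcarryx_u32 (c_in, a, b, &sum);  */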
13573
13574 case IX86_BUILTIN_READ_FLAGS:
13575 if (ignore)
13576 return const0_rtx;
13577
13578 emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
13579
13580 if (optimize
13581 || target == NULL_RTX
13582 || !nonimmediate_operand (target, word_mode)
13583 || GET_MODE (target) != word_mode)
13584 target = gen_reg_rtx (word_mode);
13585
13586 emit_insn (gen_pop (target));
13587 return target;
13588
13589 case IX86_BUILTIN_WRITE_FLAGS:
13590
13591 arg0 = CALL_EXPR_ARG (exp, 0);
13592 op0 = expand_normal (arg0);
13593 if (!general_no_elim_operand (op0, word_mode))
13594 op0 = copy_to_mode_reg (word_mode, op0);
13595
13596 emit_insn (gen_push (op0));
13597 emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
13598 return 0;
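      /* Reading the flags is just a push of the flags register followed by a
	 pop into TARGET; writing is the reverse, a push of op0 followed by a
	 pop into the flags register.  Illustrative use, assuming the
	 ia32intrin.h wrappers: unsigned long long f = __readeflags ();
	 __writeeflags (f);  */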
13599
13600 case IX86_BUILTIN_KTESTC8:
13601 icode = CODE_FOR_ktestqi;
13602 mode3 = CCCmode;
13603 goto kortest;
13604
13605 case IX86_BUILTIN_KTESTZ8:
13606 icode = CODE_FOR_ktestqi;
13607 mode3 = CCZmode;
13608 goto kortest;
13609
13610 case IX86_BUILTIN_KTESTC16:
13611 icode = CODE_FOR_ktesthi;
13612 mode3 = CCCmode;
13613 goto kortest;
13614
13615 case IX86_BUILTIN_KTESTZ16:
13616 icode = CODE_FOR_ktesthi;
13617 mode3 = CCZmode;
13618 goto kortest;
13619
13620 case IX86_BUILTIN_KTESTC32:
13621 icode = CODE_FOR_ktestsi;
13622 mode3 = CCCmode;
13623 goto kortest;
13624
13625 case IX86_BUILTIN_KTESTZ32:
13626 icode = CODE_FOR_ktestsi;
13627 mode3 = CCZmode;
13628 goto kortest;
13629
13630 case IX86_BUILTIN_KTESTC64:
13631 icode = CODE_FOR_ktestdi;
13632 mode3 = CCCmode;
13633 goto kortest;
13634
13635 case IX86_BUILTIN_KTESTZ64:
13636 icode = CODE_FOR_ktestdi;
13637 mode3 = CCZmode;
13638 goto kortest;
13639
13640 case IX86_BUILTIN_KORTESTC8:
13641 icode = CODE_FOR_kortestqi;
13642 mode3 = CCCmode;
13643 goto kortest;
13644
13645 case IX86_BUILTIN_KORTESTZ8:
13646 icode = CODE_FOR_kortestqi;
13647 mode3 = CCZmode;
13648 goto kortest;
13649
13650 case IX86_BUILTIN_KORTESTC16:
13651 icode = CODE_FOR_kortesthi;
13652 mode3 = CCCmode;
13653 goto kortest;
13654
13655 case IX86_BUILTIN_KORTESTZ16:
13656 icode = CODE_FOR_kortesthi;
13657 mode3 = CCZmode;
13658 goto kortest;
13659
13660 case IX86_BUILTIN_KORTESTC32:
13661 icode = CODE_FOR_kortestsi;
13662 mode3 = CCCmode;
13663 goto kortest;
13664
13665 case IX86_BUILTIN_KORTESTZ32:
13666 icode = CODE_FOR_kortestsi;
13667 mode3 = CCZmode;
13668 goto kortest;
13669
13670 case IX86_BUILTIN_KORTESTC64:
13671 icode = CODE_FOR_kortestdi;
13672 mode3 = CCCmode;
13673 goto kortest;
13674
13675 case IX86_BUILTIN_KORTESTZ64:
13676 icode = CODE_FOR_kortestdi;
13677 mode3 = CCZmode;
13678
13679 kortest:
13680 arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
13681 arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
13682 op0 = expand_normal (arg0);
13683 op1 = expand_normal (arg1);
13684
13685 mode0 = insn_data[icode].operand[0].mode;
13686 mode1 = insn_data[icode].operand[1].mode;
13687
13688 if (GET_MODE (op0) != VOIDmode)
13689 op0 = force_reg (GET_MODE (op0), op0);
13690
13691 op0 = gen_lowpart (mode0, op0);
13692
13693 if (!insn_data[icode].operand[0].predicate (op0, mode0))
13694 op0 = copy_to_mode_reg (mode0, op0);
13695
13696 if (GET_MODE (op1) != VOIDmode)
13697 op1 = force_reg (GET_MODE (op1), op1);
13698
13699 op1 = gen_lowpart (mode1, op1);
13700
13701 if (!insn_data[icode].operand[1].predicate (op1, mode1))
13702 op1 = copy_to_mode_reg (mode1, op1);
13703
13704 target = gen_reg_rtx (QImode);
13705
13706 /* Emit kortest. */
13707 emit_insn (GEN_FCN (icode) (op0, op1));
13708 /* And use setcc to return result from flags. */
13709 ix86_expand_setcc (target, EQ,
13710 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
13711 return target;
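      /* Both the KTEST* and KORTEST* builtins end up here and differ only in
	 ICODE.  MODE3 selects which flag the final setcc reads: CCCmode for
	 the *C variants (carry flag) and CCZmode for the *Z variants (zero
	 flag), so the builtin returns 1 exactly when the tested flag is set.
	 Illustrative use, assuming the avx512fintrin.h wrapper:
	 unsigned char all_zero = _kortestz_mask16_u8 (a, b);  */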
13712
13713 case IX86_BUILTIN_GATHERSIV2DF:
13714 icode = CODE_FOR_avx2_gathersiv2df;
13715 goto gather_gen;
13716 case IX86_BUILTIN_GATHERSIV4DF:
13717 icode = CODE_FOR_avx2_gathersiv4df;
13718 goto gather_gen;
13719 case IX86_BUILTIN_GATHERDIV2DF:
13720 icode = CODE_FOR_avx2_gatherdiv2df;
13721 goto gather_gen;
13722 case IX86_BUILTIN_GATHERDIV4DF:
13723 icode = CODE_FOR_avx2_gatherdiv4df;
13724 goto gather_gen;
13725 case IX86_BUILTIN_GATHERSIV4SF:
13726 icode = CODE_FOR_avx2_gathersiv4sf;
13727 goto gather_gen;
13728 case IX86_BUILTIN_GATHERSIV8SF:
13729 icode = CODE_FOR_avx2_gathersiv8sf;
13730 goto gather_gen;
13731 case IX86_BUILTIN_GATHERDIV4SF:
13732 icode = CODE_FOR_avx2_gatherdiv4sf;
13733 goto gather_gen;
13734 case IX86_BUILTIN_GATHERDIV8SF:
13735 icode = CODE_FOR_avx2_gatherdiv8sf;
13736 goto gather_gen;
13737 case IX86_BUILTIN_GATHERSIV2DI:
13738 icode = CODE_FOR_avx2_gathersiv2di;
13739 goto gather_gen;
13740 case IX86_BUILTIN_GATHERSIV4DI:
13741 icode = CODE_FOR_avx2_gathersiv4di;
13742 goto gather_gen;
13743 case IX86_BUILTIN_GATHERDIV2DI:
13744 icode = CODE_FOR_avx2_gatherdiv2di;
13745 goto gather_gen;
13746 case IX86_BUILTIN_GATHERDIV4DI:
13747 icode = CODE_FOR_avx2_gatherdiv4di;
13748 goto gather_gen;
13749 case IX86_BUILTIN_GATHERSIV4SI:
13750 icode = CODE_FOR_avx2_gathersiv4si;
13751 goto gather_gen;
13752 case IX86_BUILTIN_GATHERSIV8SI:
13753 icode = CODE_FOR_avx2_gathersiv8si;
13754 goto gather_gen;
13755 case IX86_BUILTIN_GATHERDIV4SI:
13756 icode = CODE_FOR_avx2_gatherdiv4si;
13757 goto gather_gen;
13758 case IX86_BUILTIN_GATHERDIV8SI:
13759 icode = CODE_FOR_avx2_gatherdiv8si;
13760 goto gather_gen;
13761 case IX86_BUILTIN_GATHERALTSIV4DF:
13762 icode = CODE_FOR_avx2_gathersiv4df;
13763 goto gather_gen;
13764 case IX86_BUILTIN_GATHERALTDIV8SF:
13765 icode = CODE_FOR_avx2_gatherdiv8sf;
13766 goto gather_gen;
13767 case IX86_BUILTIN_GATHERALTSIV4DI:
13768 icode = CODE_FOR_avx2_gathersiv4di;
13769 goto gather_gen;
13770 case IX86_BUILTIN_GATHERALTDIV8SI:
13771 icode = CODE_FOR_avx2_gatherdiv8si;
13772 goto gather_gen;
13773 case IX86_BUILTIN_GATHER3SIV16SF:
13774 icode = CODE_FOR_avx512f_gathersiv16sf;
13775 goto gather_gen;
13776 case IX86_BUILTIN_GATHER3SIV8DF:
13777 icode = CODE_FOR_avx512f_gathersiv8df;
13778 goto gather_gen;
13779 case IX86_BUILTIN_GATHER3DIV16SF:
13780 icode = CODE_FOR_avx512f_gatherdiv16sf;
13781 goto gather_gen;
13782 case IX86_BUILTIN_GATHER3DIV8DF:
13783 icode = CODE_FOR_avx512f_gatherdiv8df;
13784 goto gather_gen;
13785 case IX86_BUILTIN_GATHER3SIV16SI:
13786 icode = CODE_FOR_avx512f_gathersiv16si;
13787 goto gather_gen;
13788 case IX86_BUILTIN_GATHER3SIV8DI:
13789 icode = CODE_FOR_avx512f_gathersiv8di;
13790 goto gather_gen;
13791 case IX86_BUILTIN_GATHER3DIV16SI:
13792 icode = CODE_FOR_avx512f_gatherdiv16si;
13793 goto gather_gen;
13794 case IX86_BUILTIN_GATHER3DIV8DI:
13795 icode = CODE_FOR_avx512f_gatherdiv8di;
13796 goto gather_gen;
13797 case IX86_BUILTIN_GATHER3ALTSIV8DF:
13798 icode = CODE_FOR_avx512f_gathersiv8df;
13799 goto gather_gen;
13800 case IX86_BUILTIN_GATHER3ALTDIV16SF:
13801 icode = CODE_FOR_avx512f_gatherdiv16sf;
13802 goto gather_gen;
13803 case IX86_BUILTIN_GATHER3ALTSIV8DI:
13804 icode = CODE_FOR_avx512f_gathersiv8di;
13805 goto gather_gen;
13806 case IX86_BUILTIN_GATHER3ALTDIV16SI:
13807 icode = CODE_FOR_avx512f_gatherdiv16si;
13808 goto gather_gen;
13809 case IX86_BUILTIN_GATHER3SIV2DF:
13810 icode = CODE_FOR_avx512vl_gathersiv2df;
13811 goto gather_gen;
13812 case IX86_BUILTIN_GATHER3SIV4DF:
13813 icode = CODE_FOR_avx512vl_gathersiv4df;
13814 goto gather_gen;
13815 case IX86_BUILTIN_GATHER3DIV2DF:
13816 icode = CODE_FOR_avx512vl_gatherdiv2df;
13817 goto gather_gen;
13818 case IX86_BUILTIN_GATHER3DIV4DF:
13819 icode = CODE_FOR_avx512vl_gatherdiv4df;
13820 goto gather_gen;
13821 case IX86_BUILTIN_GATHER3SIV4SF:
13822 icode = CODE_FOR_avx512vl_gathersiv4sf;
13823 goto gather_gen;
13824 case IX86_BUILTIN_GATHER3SIV8SF:
13825 icode = CODE_FOR_avx512vl_gathersiv8sf;
13826 goto gather_gen;
13827 case IX86_BUILTIN_GATHER3DIV4SF:
13828 icode = CODE_FOR_avx512vl_gatherdiv4sf;
13829 goto gather_gen;
13830 case IX86_BUILTIN_GATHER3DIV8SF:
13831 icode = CODE_FOR_avx512vl_gatherdiv8sf;
13832 goto gather_gen;
13833 case IX86_BUILTIN_GATHER3SIV2DI:
13834 icode = CODE_FOR_avx512vl_gathersiv2di;
13835 goto gather_gen;
13836 case IX86_BUILTIN_GATHER3SIV4DI:
13837 icode = CODE_FOR_avx512vl_gathersiv4di;
13838 goto gather_gen;
13839 case IX86_BUILTIN_GATHER3DIV2DI:
13840 icode = CODE_FOR_avx512vl_gatherdiv2di;
13841 goto gather_gen;
13842 case IX86_BUILTIN_GATHER3DIV4DI:
13843 icode = CODE_FOR_avx512vl_gatherdiv4di;
13844 goto gather_gen;
13845 case IX86_BUILTIN_GATHER3SIV4SI:
13846 icode = CODE_FOR_avx512vl_gathersiv4si;
13847 goto gather_gen;
13848 case IX86_BUILTIN_GATHER3SIV8SI:
13849 icode = CODE_FOR_avx512vl_gathersiv8si;
13850 goto gather_gen;
13851 case IX86_BUILTIN_GATHER3DIV4SI:
13852 icode = CODE_FOR_avx512vl_gatherdiv4si;
13853 goto gather_gen;
13854 case IX86_BUILTIN_GATHER3DIV8SI:
13855 icode = CODE_FOR_avx512vl_gatherdiv8si;
13856 goto gather_gen;
13857 case IX86_BUILTIN_GATHER3ALTSIV4DF:
13858 icode = CODE_FOR_avx512vl_gathersiv4df;
13859 goto gather_gen;
13860 case IX86_BUILTIN_GATHER3ALTDIV8SF:
13861 icode = CODE_FOR_avx512vl_gatherdiv8sf;
13862 goto gather_gen;
13863 case IX86_BUILTIN_GATHER3ALTSIV4DI:
13864 icode = CODE_FOR_avx512vl_gathersiv4di;
13865 goto gather_gen;
13866 case IX86_BUILTIN_GATHER3ALTDIV8SI:
13867 icode = CODE_FOR_avx512vl_gatherdiv8si;
13868 goto gather_gen;
13869 case IX86_BUILTIN_SCATTERSIV16SF:
13870 icode = CODE_FOR_avx512f_scattersiv16sf;
13871 goto scatter_gen;
13872 case IX86_BUILTIN_SCATTERSIV8DF:
13873 icode = CODE_FOR_avx512f_scattersiv8df;
13874 goto scatter_gen;
13875 case IX86_BUILTIN_SCATTERDIV16SF:
13876 icode = CODE_FOR_avx512f_scatterdiv16sf;
13877 goto scatter_gen;
13878 case IX86_BUILTIN_SCATTERDIV8DF:
13879 icode = CODE_FOR_avx512f_scatterdiv8df;
13880 goto scatter_gen;
13881 case IX86_BUILTIN_SCATTERSIV16SI:
13882 icode = CODE_FOR_avx512f_scattersiv16si;
13883 goto scatter_gen;
13884 case IX86_BUILTIN_SCATTERSIV8DI:
13885 icode = CODE_FOR_avx512f_scattersiv8di;
13886 goto scatter_gen;
13887 case IX86_BUILTIN_SCATTERDIV16SI:
13888 icode = CODE_FOR_avx512f_scatterdiv16si;
13889 goto scatter_gen;
13890 case IX86_BUILTIN_SCATTERDIV8DI:
13891 icode = CODE_FOR_avx512f_scatterdiv8di;
13892 goto scatter_gen;
13893 case IX86_BUILTIN_SCATTERSIV8SF:
13894 icode = CODE_FOR_avx512vl_scattersiv8sf;
13895 goto scatter_gen;
13896 case IX86_BUILTIN_SCATTERSIV4SF:
13897 icode = CODE_FOR_avx512vl_scattersiv4sf;
13898 goto scatter_gen;
13899 case IX86_BUILTIN_SCATTERSIV4DF:
13900 icode = CODE_FOR_avx512vl_scattersiv4df;
13901 goto scatter_gen;
13902 case IX86_BUILTIN_SCATTERSIV2DF:
13903 icode = CODE_FOR_avx512vl_scattersiv2df;
13904 goto scatter_gen;
13905 case IX86_BUILTIN_SCATTERDIV8SF:
13906 icode = CODE_FOR_avx512vl_scatterdiv8sf;
13907 goto scatter_gen;
13908 case IX86_BUILTIN_SCATTERDIV4SF:
13909 icode = CODE_FOR_avx512vl_scatterdiv4sf;
13910 goto scatter_gen;
13911 case IX86_BUILTIN_SCATTERDIV4DF:
13912 icode = CODE_FOR_avx512vl_scatterdiv4df;
13913 goto scatter_gen;
13914 case IX86_BUILTIN_SCATTERDIV2DF:
13915 icode = CODE_FOR_avx512vl_scatterdiv2df;
13916 goto scatter_gen;
13917 case IX86_BUILTIN_SCATTERSIV8SI:
13918 icode = CODE_FOR_avx512vl_scattersiv8si;
13919 goto scatter_gen;
13920 case IX86_BUILTIN_SCATTERSIV4SI:
13921 icode = CODE_FOR_avx512vl_scattersiv4si;
13922 goto scatter_gen;
13923 case IX86_BUILTIN_SCATTERSIV4DI:
13924 icode = CODE_FOR_avx512vl_scattersiv4di;
13925 goto scatter_gen;
13926 case IX86_BUILTIN_SCATTERSIV2DI:
13927 icode = CODE_FOR_avx512vl_scattersiv2di;
13928 goto scatter_gen;
13929 case IX86_BUILTIN_SCATTERDIV8SI:
13930 icode = CODE_FOR_avx512vl_scatterdiv8si;
13931 goto scatter_gen;
13932 case IX86_BUILTIN_SCATTERDIV4SI:
13933 icode = CODE_FOR_avx512vl_scatterdiv4si;
13934 goto scatter_gen;
13935 case IX86_BUILTIN_SCATTERDIV4DI:
13936 icode = CODE_FOR_avx512vl_scatterdiv4di;
13937 goto scatter_gen;
13938 case IX86_BUILTIN_SCATTERDIV2DI:
13939 icode = CODE_FOR_avx512vl_scatterdiv2di;
13940 goto scatter_gen;
13941 case IX86_BUILTIN_GATHERPFDPD:
13942 icode = CODE_FOR_avx512pf_gatherpfv8sidf;
13943 goto vec_prefetch_gen;
13944 case IX86_BUILTIN_SCATTERALTSIV8DF:
13945 icode = CODE_FOR_avx512f_scattersiv8df;
13946 goto scatter_gen;
13947 case IX86_BUILTIN_SCATTERALTDIV16SF:
13948 icode = CODE_FOR_avx512f_scatterdiv16sf;
13949 goto scatter_gen;
13950 case IX86_BUILTIN_SCATTERALTSIV8DI:
13951 icode = CODE_FOR_avx512f_scattersiv8di;
13952 goto scatter_gen;
13953 case IX86_BUILTIN_SCATTERALTDIV16SI:
13954 icode = CODE_FOR_avx512f_scatterdiv16si;
13955 goto scatter_gen;
13956 case IX86_BUILTIN_SCATTERALTSIV4DF:
13957 icode = CODE_FOR_avx512vl_scattersiv4df;
13958 goto scatter_gen;
13959 case IX86_BUILTIN_SCATTERALTDIV8SF:
13960 icode = CODE_FOR_avx512vl_scatterdiv8sf;
13961 goto scatter_gen;
13962 case IX86_BUILTIN_SCATTERALTSIV4DI:
13963 icode = CODE_FOR_avx512vl_scattersiv4di;
13964 goto scatter_gen;
13965 case IX86_BUILTIN_SCATTERALTDIV8SI:
13966 icode = CODE_FOR_avx512vl_scatterdiv8si;
13967 goto scatter_gen;
13968 case IX86_BUILTIN_SCATTERALTSIV2DF:
13969 icode = CODE_FOR_avx512vl_scattersiv2df;
13970 goto scatter_gen;
13971 case IX86_BUILTIN_SCATTERALTDIV4SF:
13972 icode = CODE_FOR_avx512vl_scatterdiv4sf;
13973 goto scatter_gen;
13974 case IX86_BUILTIN_SCATTERALTSIV2DI:
13975 icode = CODE_FOR_avx512vl_scattersiv2di;
13976 goto scatter_gen;
13977 case IX86_BUILTIN_SCATTERALTDIV4SI:
13978 icode = CODE_FOR_avx512vl_scatterdiv4si;
13979 goto scatter_gen;
13980 case IX86_BUILTIN_GATHERPFDPS:
13981 icode = CODE_FOR_avx512pf_gatherpfv16sisf;
13982 goto vec_prefetch_gen;
13983 case IX86_BUILTIN_GATHERPFQPD:
13984 icode = CODE_FOR_avx512pf_gatherpfv8didf;
13985 goto vec_prefetch_gen;
13986 case IX86_BUILTIN_GATHERPFQPS:
13987 icode = CODE_FOR_avx512pf_gatherpfv8disf;
13988 goto vec_prefetch_gen;
13989 case IX86_BUILTIN_SCATTERPFDPD:
13990 icode = CODE_FOR_avx512pf_scatterpfv8sidf;
13991 goto vec_prefetch_gen;
13992 case IX86_BUILTIN_SCATTERPFDPS:
13993 icode = CODE_FOR_avx512pf_scatterpfv16sisf;
13994 goto vec_prefetch_gen;
13995 case IX86_BUILTIN_SCATTERPFQPD:
13996 icode = CODE_FOR_avx512pf_scatterpfv8didf;
13997 goto vec_prefetch_gen;
13998 case IX86_BUILTIN_SCATTERPFQPS:
13999 icode = CODE_FOR_avx512pf_scatterpfv8disf;
14000 goto vec_prefetch_gen;
14001
14002 gather_gen:
14003 rtx half;
14004 rtx (*gen) (rtx, rtx);
14005
14006 arg0 = CALL_EXPR_ARG (exp, 0);
14007 arg1 = CALL_EXPR_ARG (exp, 1);
14008 arg2 = CALL_EXPR_ARG (exp, 2);
14009 arg3 = CALL_EXPR_ARG (exp, 3);
14010 arg4 = CALL_EXPR_ARG (exp, 4);
14011 op0 = expand_normal (arg0);
14012 op1 = expand_normal (arg1);
14013 op2 = expand_normal (arg2);
14014 op3 = expand_normal (arg3);
14015 op4 = expand_normal (arg4);
14016 /* Note the arg order is different from the operand order. */
14017 mode0 = insn_data[icode].operand[1].mode;
14018 mode2 = insn_data[icode].operand[3].mode;
14019 mode3 = insn_data[icode].operand[4].mode;
14020 mode4 = insn_data[icode].operand[5].mode;
14021
14022 if (target == NULL_RTX
14023 || GET_MODE (target) != insn_data[icode].operand[0].mode
14024 || !insn_data[icode].operand[0].predicate (target,
14025 GET_MODE (target)))
14026 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
14027 else
14028 subtarget = target;
14029
14030 switch (fcode)
14031 {
14032 case IX86_BUILTIN_GATHER3ALTSIV8DF:
14033 case IX86_BUILTIN_GATHER3ALTSIV8DI:
14034 half = gen_reg_rtx (V8SImode);
14035 if (!nonimmediate_operand (op2, V16SImode))
14036 op2 = copy_to_mode_reg (V16SImode, op2);
14037 emit_insn (gen_vec_extract_lo_v16si (half, op2));
14038 op2 = half;
14039 break;
14040 case IX86_BUILTIN_GATHER3ALTSIV4DF:
14041 case IX86_BUILTIN_GATHER3ALTSIV4DI:
14042 case IX86_BUILTIN_GATHERALTSIV4DF:
14043 case IX86_BUILTIN_GATHERALTSIV4DI:
14044 half = gen_reg_rtx (V4SImode);
14045 if (!nonimmediate_operand (op2, V8SImode))
14046 op2 = copy_to_mode_reg (V8SImode, op2);
14047 emit_insn (gen_vec_extract_lo_v8si (half, op2));
14048 op2 = half;
14049 break;
14050 case IX86_BUILTIN_GATHER3ALTDIV16SF:
14051 case IX86_BUILTIN_GATHER3ALTDIV16SI:
14052 half = gen_reg_rtx (mode0);
14053 if (mode0 == V8SFmode)
14054 gen = gen_vec_extract_lo_v16sf;
14055 else
14056 gen = gen_vec_extract_lo_v16si;
14057 if (!nonimmediate_operand (op0, GET_MODE (op0)))
14058 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
14059 emit_insn (gen (half, op0));
14060 op0 = half;
14061 op3 = lowpart_subreg (QImode, op3, HImode);
14062 break;
14063 case IX86_BUILTIN_GATHER3ALTDIV8SF:
14064 case IX86_BUILTIN_GATHER3ALTDIV8SI:
14065 case IX86_BUILTIN_GATHERALTDIV8SF:
14066 case IX86_BUILTIN_GATHERALTDIV8SI:
14067 half = gen_reg_rtx (mode0);
14068 if (mode0 == V4SFmode)
14069 gen = gen_vec_extract_lo_v8sf;
14070 else
14071 gen = gen_vec_extract_lo_v8si;
14072 if (!nonimmediate_operand (op0, GET_MODE (op0)))
14073 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
14074 emit_insn (gen (half, op0));
14075 op0 = half;
14076 if (VECTOR_MODE_P (GET_MODE (op3)))
14077 {
14078 half = gen_reg_rtx (mode0);
14079 if (!nonimmediate_operand (op3, GET_MODE (op3)))
14080 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
14081 emit_insn (gen (half, op3));
14082 op3 = half;
14083 }
14084 break;
14085 default:
14086 break;
14087 }
14088
14089 /* Force the memory operand to use only a base register here.  We
14090 don't want to do this for the memory operands of other builtin
14091 functions. */
14092 op1 = ix86_zero_extend_to_Pmode (op1);
14093
14094 if (!insn_data[icode].operand[1].predicate (op0, mode0))
14095 op0 = copy_to_mode_reg (mode0, op0);
14096 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
14097 op1 = copy_to_mode_reg (Pmode, op1);
14098 if (!insn_data[icode].operand[3].predicate (op2, mode2))
14099 op2 = copy_to_mode_reg (mode2, op2);
14100
14101 op3 = fixup_modeless_constant (op3, mode3);
14102
14103 if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
14104 {
14105 if (!insn_data[icode].operand[4].predicate (op3, mode3))
14106 op3 = copy_to_mode_reg (mode3, op3);
14107 }
14108 else
14109 {
14110 op3 = copy_to_reg (op3);
14111 op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
14112 }
14113 if (!insn_data[icode].operand[5].predicate (op4, mode4))
14114 {
14115 error ("the last argument must be scale 1, 2, 4, 8");
14116 return const0_rtx;
14117 }
14118
14119 /* Optimize. If mask is known to have all high bits set,
14120 replace op0 with pc_rtx to signal that the instruction
14121 overwrites the whole destination and doesn't use its
14122 previous contents. */
14123 if (optimize)
14124 {
14125 if (TREE_CODE (arg3) == INTEGER_CST)
14126 {
14127 if (integer_all_onesp (arg3))
14128 op0 = pc_rtx;
14129 }
14130 else if (TREE_CODE (arg3) == VECTOR_CST)
14131 {
14132 unsigned int negative = 0;
14133 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
14134 {
14135 tree cst = VECTOR_CST_ELT (arg3, i);
14136 if (TREE_CODE (cst) == INTEGER_CST
14137 && tree_int_cst_sign_bit (cst))
14138 negative++;
14139 else if (TREE_CODE (cst) == REAL_CST
14140 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
14141 negative++;
14142 }
14143 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
14144 op0 = pc_rtx;
14145 }
14146 else if (TREE_CODE (arg3) == SSA_NAME
14147 && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
14148 {
14149 /* Recognize also when mask is like:
14150 __v2df src = _mm_setzero_pd ();
14151 __v2df mask = _mm_cmpeq_pd (src, src);
14152 or
14153 __v8sf src = _mm256_setzero_ps ();
14154 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
14155 as that is a cheaper way to load all ones into
14156 a register than having to load a constant from
14157 memory. */
14158 gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
14159 if (is_gimple_call (def_stmt))
14160 {
14161 tree fndecl = gimple_call_fndecl (def_stmt);
14162 if (fndecl
14163 && fndecl_built_in_p (fndecl, BUILT_IN_MD))
14164 switch (DECL_MD_FUNCTION_CODE (fndecl))
14165 {
14166 case IX86_BUILTIN_CMPPD:
14167 case IX86_BUILTIN_CMPPS:
14168 case IX86_BUILTIN_CMPPD256:
14169 case IX86_BUILTIN_CMPPS256:
14170 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
14171 break;
14172 /* FALLTHRU */
14173 case IX86_BUILTIN_CMPEQPD:
14174 case IX86_BUILTIN_CMPEQPS:
14175 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
14176 && initializer_zerop (gimple_call_arg (def_stmt,
14177 1)))
14178 op0 = pc_rtx;
14179 break;
14180 default:
14181 break;
14182 }
14183 }
14184 }
14185 }
14186
14187 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
14188 if (! pat)
14189 return const0_rtx;
14190 emit_insn (pat);
14191
14192 switch (fcode)
14193 {
14194 case IX86_BUILTIN_GATHER3DIV16SF:
14195 if (target == NULL_RTX)
14196 target = gen_reg_rtx (V8SFmode);
14197 emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
14198 break;
14199 case IX86_BUILTIN_GATHER3DIV16SI:
14200 if (target == NULL_RTX)
14201 target = gen_reg_rtx (V8SImode);
14202 emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
14203 break;
14204 case IX86_BUILTIN_GATHER3DIV8SF:
14205 case IX86_BUILTIN_GATHERDIV8SF:
14206 if (target == NULL_RTX)
14207 target = gen_reg_rtx (V4SFmode);
14208 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
14209 break;
14210 case IX86_BUILTIN_GATHER3DIV8SI:
14211 case IX86_BUILTIN_GATHERDIV8SI:
14212 if (target == NULL_RTX)
14213 target = gen_reg_rtx (V4SImode);
14214 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
14215 break;
14216 default:
14217 target = subtarget;
14218 break;
14219 }
14220 return target;
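      /* For the DImode-indexed gathers whose index vector has twice as many
	 elements as the data vector (the ...DIV8SF/DIV8SI/DIV16SF/DIV16SI
	 cases above), the instruction pattern produces a full-width
	 destination, so only its low half is extracted into TARGET; the
	 remaining gathers return SUBTARGET unchanged.  */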
14221
14222 scatter_gen:
14223 arg0 = CALL_EXPR_ARG (exp, 0);
14224 arg1 = CALL_EXPR_ARG (exp, 1);
14225 arg2 = CALL_EXPR_ARG (exp, 2);
14226 arg3 = CALL_EXPR_ARG (exp, 3);
14227 arg4 = CALL_EXPR_ARG (exp, 4);
14228 op0 = expand_normal (arg0);
14229 op1 = expand_normal (arg1);
14230 op2 = expand_normal (arg2);
14231 op3 = expand_normal (arg3);
14232 op4 = expand_normal (arg4);
14233 mode1 = insn_data[icode].operand[1].mode;
14234 mode2 = insn_data[icode].operand[2].mode;
14235 mode3 = insn_data[icode].operand[3].mode;
14236 mode4 = insn_data[icode].operand[4].mode;
14237
14238 /* Scatter instruction stores operand op3 to memory with
14239 indices from op2 and scale from op4 under writemask op1.
14240 If index operand op2 has more elements than source operand
14241 op3, one needs to use only its low half, and vice versa. */
14242 switch (fcode)
14243 {
14244 case IX86_BUILTIN_SCATTERALTSIV8DF:
14245 case IX86_BUILTIN_SCATTERALTSIV8DI:
14246 half = gen_reg_rtx (V8SImode);
14247 if (!nonimmediate_operand (op2, V16SImode))
14248 op2 = copy_to_mode_reg (V16SImode, op2);
14249 emit_insn (gen_vec_extract_lo_v16si (half, op2));
14250 op2 = half;
14251 break;
14252 case IX86_BUILTIN_SCATTERALTDIV16SF:
14253 case IX86_BUILTIN_SCATTERALTDIV16SI:
14254 half = gen_reg_rtx (mode3);
14255 if (mode3 == V8SFmode)
14256 gen = gen_vec_extract_lo_v16sf;
14257 else
14258 gen = gen_vec_extract_lo_v16si;
14259 if (!nonimmediate_operand (op3, GET_MODE (op3)))
14260 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
14261 emit_insn (gen (half, op3));
14262 op3 = half;
14263 break;
14264 case IX86_BUILTIN_SCATTERALTSIV4DF:
14265 case IX86_BUILTIN_SCATTERALTSIV4DI:
14266 half = gen_reg_rtx (V4SImode);
14267 if (!nonimmediate_operand (op2, V8SImode))
14268 op2 = copy_to_mode_reg (V8SImode, op2);
14269 emit_insn (gen_vec_extract_lo_v8si (half, op2));
14270 op2 = half;
14271 break;
14272 case IX86_BUILTIN_SCATTERALTDIV8SF:
14273 case IX86_BUILTIN_SCATTERALTDIV8SI:
14274 half = gen_reg_rtx (mode3);
14275 if (mode3 == V4SFmode)
14276 gen = gen_vec_extract_lo_v8sf;
14277 else
14278 gen = gen_vec_extract_lo_v8si;
14279 if (!nonimmediate_operand (op3, GET_MODE (op3)))
14280 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
14281 emit_insn (gen (half, op3));
14282 op3 = half;
14283 break;
14284 case IX86_BUILTIN_SCATTERALTSIV2DF:
14285 case IX86_BUILTIN_SCATTERALTSIV2DI:
14286 if (!nonimmediate_operand (op2, V4SImode))
14287 op2 = copy_to_mode_reg (V4SImode, op2);
14288 break;
14289 case IX86_BUILTIN_SCATTERALTDIV4SF:
14290 case IX86_BUILTIN_SCATTERALTDIV4SI:
14291 if (!nonimmediate_operand (op3, GET_MODE (op3)))
14292 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
14293 break;
14294 default:
14295 break;
14296 }
14297
14298 /* Force the memory operand to use only a base register here.  We
14299 don't want to do this for the memory operands of other builtin
14300 functions. */
14301 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
14302
14303 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
14304 op0 = copy_to_mode_reg (Pmode, op0);
14305
14306 op1 = fixup_modeless_constant (op1, mode1);
14307
14308 if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
14309 {
14310 if (!insn_data[icode].operand[1].predicate (op1, mode1))
14311 op1 = copy_to_mode_reg (mode1, op1);
14312 }
14313 else
14314 {
14315 op1 = copy_to_reg (op1);
14316 op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
14317 }
14318
14319 if (!insn_data[icode].operand[2].predicate (op2, mode2))
14320 op2 = copy_to_mode_reg (mode2, op2);
14321
14322 if (!insn_data[icode].operand[3].predicate (op3, mode3))
14323 op3 = copy_to_mode_reg (mode3, op3);
14324
14325 if (!insn_data[icode].operand[4].predicate (op4, mode4))
14326 {
14327 error ("the last argument must be scale 1, 2, 4, 8");
14328 return const0_rtx;
14329 }
14330
14331 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
14332 if (! pat)
14333 return const0_rtx;
14334
14335 emit_insn (pat);
14336 return 0;
14337
14338 vec_prefetch_gen:
14339 arg0 = CALL_EXPR_ARG (exp, 0);
14340 arg1 = CALL_EXPR_ARG (exp, 1);
14341 arg2 = CALL_EXPR_ARG (exp, 2);
14342 arg3 = CALL_EXPR_ARG (exp, 3);
14343 arg4 = CALL_EXPR_ARG (exp, 4);
14344 op0 = expand_normal (arg0);
14345 op1 = expand_normal (arg1);
14346 op2 = expand_normal (arg2);
14347 op3 = expand_normal (arg3);
14348 op4 = expand_normal (arg4);
14349 mode0 = insn_data[icode].operand[0].mode;
14350 mode1 = insn_data[icode].operand[1].mode;
14351 mode3 = insn_data[icode].operand[3].mode;
14352 mode4 = insn_data[icode].operand[4].mode;
14353
14354 op0 = fixup_modeless_constant (op0, mode0);
14355
14356 if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
14357 {
14358 if (!insn_data[icode].operand[0].predicate (op0, mode0))
14359 op0 = copy_to_mode_reg (mode0, op0);
14360 }
14361 else
14362 {
14363 op0 = copy_to_reg (op0);
14364 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
14365 }
14366
14367 if (!insn_data[icode].operand[1].predicate (op1, mode1))
14368 op1 = copy_to_mode_reg (mode1, op1);
14369
14370 /* Force the memory operand to use only a base register here.  We
14371 don't want to do this for the memory operands of other builtin
14372 functions. */
14373 op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
14374
14375 if (!insn_data[icode].operand[2].predicate (op2, Pmode))
14376 op2 = copy_to_mode_reg (Pmode, op2);
14377
14378 if (!insn_data[icode].operand[3].predicate (op3, mode3))
14379 {
14380 error ("the fourth argument must be scale 1, 2, 4, 8");
14381 return const0_rtx;
14382 }
14383
14384 if (!insn_data[icode].operand[4].predicate (op4, mode4))
14385 {
14386 error ("incorrect hint operand");
14387 return const0_rtx;
14388 }
14389
14390 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
14391 if (! pat)
14392 return const0_rtx;
14393
14394 emit_insn (pat);
14395
14396 return 0;
14397
14398 case IX86_BUILTIN_XABORT:
14399 icode = CODE_FOR_xabort;
14400 arg0 = CALL_EXPR_ARG (exp, 0);
14401 op0 = expand_normal (arg0);
14402 mode0 = insn_data[icode].operand[0].mode;
14403 if (!insn_data[icode].operand[0].predicate (op0, mode0))
14404 {
14405 error ("the argument to %<xabort%> intrinsic must "
14406 "be an 8-bit immediate");
14407 return const0_rtx;
14408 }
14409 emit_insn (gen_xabort (op0));
14410 return 0;
14411
14412 case IX86_BUILTIN_RDSSPD:
14413 case IX86_BUILTIN_RDSSPQ:
14414 mode = (fcode == IX86_BUILTIN_RDSSPD ? SImode : DImode);
14415
14416 if (target == 0
14417 || !register_operand (target, mode))
14418 target = gen_reg_rtx (mode);
14419
14420 op0 = force_reg (mode, const0_rtx);
14421
14422 emit_insn (gen_rdssp (mode, target, op0));
14423 return target;
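      /* The input operand of the rdssp pattern is forced to zero because
	 RDSSPD/RDSSPQ leave their destination untouched where CET shadow
	 stacks are not active (the instruction then behaves as a NOP), so
	 the builtin returns 0 in that case.  Illustrative use, assuming the
	 cetintrin.h wrapper on a 64-bit target:
	 unsigned long long ssp = _get_ssp ();  */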
14424
14425 case IX86_BUILTIN_INCSSPD:
14426 case IX86_BUILTIN_INCSSPQ:
14427 mode = (fcode == IX86_BUILTIN_INCSSPD ? SImode : DImode);
14428
14429 arg0 = CALL_EXPR_ARG (exp, 0);
14430 op0 = expand_normal (arg0);
14431
14432 op0 = force_reg (mode, op0);
14433
14434 emit_insn (gen_incssp (mode, op0));
14435 return 0;
14436
14437 case IX86_BUILTIN_HRESET:
14438 icode = CODE_FOR_hreset;
14439 arg0 = CALL_EXPR_ARG (exp, 0);
14440 op0 = expand_normal (arg0);
14441 op0 = force_reg (SImode, op0);
14442 emit_insn (gen_hreset (op0));
14443 return 0;
14444
14445 case IX86_BUILTIN_RSTORSSP:
14446 case IX86_BUILTIN_CLRSSBSY:
14447 arg0 = CALL_EXPR_ARG (exp, 0);
14448 op0 = expand_normal (arg0);
14449 icode = (fcode == IX86_BUILTIN_RSTORSSP
14450 ? CODE_FOR_rstorssp
14451 : CODE_FOR_clrssbsy);
14452
14453 if (!address_operand (op0, VOIDmode))
14454 {
14455 op0 = convert_memory_address (Pmode, op0);
14456 op0 = copy_addr_to_reg (op0);
14457 }
14458 emit_insn (GEN_FCN (icode) (gen_rtx_MEM (DImode, op0)));
14459 return 0;
14460
14461 case IX86_BUILTIN_WRSSD:
14462 case IX86_BUILTIN_WRSSQ:
14463 case IX86_BUILTIN_WRUSSD:
14464 case IX86_BUILTIN_WRUSSQ:
14465 mode = ((fcode == IX86_BUILTIN_WRSSD
14466 || fcode == IX86_BUILTIN_WRUSSD)
14467 ? SImode : DImode);
14468
14469 arg0 = CALL_EXPR_ARG (exp, 0);
14470 op0 = expand_normal (arg0);
14471 arg1 = CALL_EXPR_ARG (exp, 1);
14472 op1 = expand_normal (arg1);
14473
14474 op0 = force_reg (mode, op0);
14475
14476 if (!address_operand (op1, VOIDmode))
14477 {
14478 op1 = convert_memory_address (Pmode, op1);
14479 op1 = copy_addr_to_reg (op1);
14480 }
14481 op1 = gen_rtx_MEM (mode, op1);
14482
14483 icode = ((fcode == IX86_BUILTIN_WRSSD
14484 || fcode == IX86_BUILTIN_WRSSQ)
14485 ? code_for_wrss (mode)
14486 : code_for_wruss (mode));
14487 emit_insn (GEN_FCN (icode) (op0, op1));
14488
14489 return 0;
14490
14491 default:
14492 break;
14493 }
14494
14495 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
14496 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
14497 {
14498 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
14499 return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
14500 target);
14501 }
14502
14503 if (fcode >= IX86_BUILTIN__BDESC_PURE_ARGS_FIRST
14504 && fcode <= IX86_BUILTIN__BDESC_PURE_ARGS_LAST)
14505 {
14506 i = fcode - IX86_BUILTIN__BDESC_PURE_ARGS_FIRST;
14507 return ix86_expand_special_args_builtin (bdesc_pure_args + i, exp,
14508 target);
14509 }
14510
14511 if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
14512 && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
14513 {
14514 i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
14515 rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
14516 rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
14517 rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
14518 int masked = 1;
14519 machine_mode mode, wide_mode, nar_mode;
14520
14521 nar_mode = V4SFmode;
14522 mode = V16SFmode;
14523 wide_mode = V64SFmode;
14524 fcn_mask = gen_avx5124fmaddps_4fmaddps_mask;
14525 fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
14526
14527 switch (fcode)
14528 {
14529 case IX86_BUILTIN_4FMAPS:
14530 fcn = gen_avx5124fmaddps_4fmaddps;
14531 masked = 0;
14532 goto v4fma_expand;
14533
14534 case IX86_BUILTIN_4DPWSSD:
14535 nar_mode = V4SImode;
14536 mode = V16SImode;
14537 wide_mode = V64SImode;
14538 fcn = gen_avx5124vnniw_vp4dpwssd;
14539 masked = 0;
14540 goto v4fma_expand;
14541
14542 case IX86_BUILTIN_4DPWSSDS:
14543 nar_mode = V4SImode;
14544 mode = V16SImode;
14545 wide_mode = V64SImode;
14546 fcn = gen_avx5124vnniw_vp4dpwssds;
14547 masked = 0;
14548 goto v4fma_expand;
14549
14550 case IX86_BUILTIN_4FNMAPS:
14551 fcn = gen_avx5124fmaddps_4fnmaddps;
14552 masked = 0;
14553 goto v4fma_expand;
14554
14555 case IX86_BUILTIN_4FNMAPS_MASK:
14556 fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask;
14557 fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
14558 goto v4fma_expand;
14559
14560 case IX86_BUILTIN_4DPWSSD_MASK:
14561 nar_mode = V4SImode;
14562 mode = V16SImode;
14563 wide_mode = V64SImode;
14564 fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask;
14565 fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
14566 goto v4fma_expand;
14567
14568 case IX86_BUILTIN_4DPWSSDS_MASK:
14569 nar_mode = V4SImode;
14570 mode = V16SImode;
14571 wide_mode = V64SImode;
14572 fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask;
14573 fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
14574 goto v4fma_expand;
14575
14576 case IX86_BUILTIN_4FMAPS_MASK:
14577 {
14578 tree args[4];
14579 rtx ops[4];
14580 rtx wide_reg;
14581 rtx accum;
14582 rtx addr;
14583 rtx mem;
14584
14585 v4fma_expand:
14586 wide_reg = gen_reg_rtx (wide_mode);
14587 for (i = 0; i < 4; i++)
14588 {
14589 args[i] = CALL_EXPR_ARG (exp, i);
14590 ops[i] = expand_normal (args[i]);
14591
14592 emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
14593 ops[i]);
14594 }
14595
14596 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
14597 accum = force_reg (mode, accum);
14598
14599 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
14600 addr = force_reg (Pmode, addr);
14601
14602 mem = gen_rtx_MEM (nar_mode, addr);
14603
14604 target = gen_reg_rtx (mode);
14605
14606 emit_move_insn (target, accum);
14607
14608 if (! masked)
14609 emit_insn (fcn (target, accum, wide_reg, mem));
14610 else
14611 {
14612 rtx merge, mask;
14613 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
14614
14615 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
14616
14617 if (CONST_INT_P (mask))
14618 mask = fixup_modeless_constant (mask, HImode);
14619
14620 mask = force_reg (HImode, mask);
14621
14622 if (GET_MODE (mask) != HImode)
14623 mask = gen_rtx_SUBREG (HImode, mask, 0);
14624
14625 /* If merge is 0 then we're about to emit z-masked variant. */
14626 if (const0_operand (merge, mode))
14627 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
14628 /* If merge is the same as accum then emit merge-masked variant. */
14629 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
14630 {
14631 merge = force_reg (mode, merge);
14632 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
14633 }
14634 /* Merge with something unknown might happen if we z-mask with -O0. */
14635 else
14636 {
14637 target = gen_reg_rtx (mode);
14638 emit_move_insn (target, merge);
14639 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
14640 }
14641 }
14642 return target;
14643 }
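	  /* The four 512-bit sources are glued into a single V64SF/V64SImode
	     pseudo by storing each at a 64-byte offset through a SUBREG; the
	     4FMADDPS/VP4DPWSSD patterns then read that wide register plus a
	     16-byte memory operand.  The masked form chosen depends on the
	     merge operand: a zero merge selects the zero-masking pattern, a
	     merge identical to the accumulator selects the merge-masking
	     pattern, and anything else is first copied into a fresh TARGET.  */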
14644
14645 case IX86_BUILTIN_4FNMASS:
14646 fcn = gen_avx5124fmaddps_4fnmaddss;
14647 masked = 0;
14648 goto s4fma_expand;
14649
14650 case IX86_BUILTIN_4FMASS:
14651 fcn = gen_avx5124fmaddps_4fmaddss;
14652 masked = 0;
14653 goto s4fma_expand;
14654
14655 case IX86_BUILTIN_4FNMASS_MASK:
14656 fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
14657 fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
14658 goto s4fma_expand;
14659
14660 case IX86_BUILTIN_4FMASS_MASK:
14661 {
14662 tree args[4];
14663 rtx ops[4];
14664 rtx wide_reg;
14665 rtx accum;
14666 rtx addr;
14667 rtx mem;
14668
14669 fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
14670 fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
14671
14672 s4fma_expand:
14673 mode = V4SFmode;
14674 wide_reg = gen_reg_rtx (V64SFmode);
14675 for (i = 0; i < 4; i++)
14676 {
14677 rtx tmp;
14678 args[i] = CALL_EXPR_ARG (exp, i);
14679 ops[i] = expand_normal (args[i]);
14680
14681 tmp = gen_reg_rtx (SFmode);
14682 emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
14683
14684 emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
14685 gen_rtx_SUBREG (V16SFmode, tmp, 0));
14686 }
14687
14688 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
14689 accum = force_reg (V4SFmode, accum);
14690
14691 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
14692 addr = force_reg (Pmode, addr);
14693
14694 mem = gen_rtx_MEM (V4SFmode, addr);
14695
14696 target = gen_reg_rtx (V4SFmode);
14697
14698 emit_move_insn (target, accum);
14699
14700 if (! masked)
14701 emit_insn (fcn (target, accum, wide_reg, mem));
14702 else
14703 {
14704 rtx merge, mask;
14705 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
14706
14707 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
14708
14709 if (CONST_INT_P (mask))
14710 mask = fixup_modeless_constant (mask, QImode);
14711
14712 mask = force_reg (QImode, mask);
14713
14714 if (GET_MODE (mask) != QImode)
14715 mask = gen_rtx_SUBREG (QImode, mask, 0);
14716
14717 /* If merge is 0 then we're about to emit z-masked variant. */
14718 if (const0_operand (merge, mode))
14719 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
14720 /* If merge is the same as accum then emit merge-masked
14721 variant. */
14722 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
14723 {
14724 merge = force_reg (mode, merge);
14725 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
14726 }
14727 /* Merge with something unknown might happen if we z-mask
14728 with -O0. */
14729 else
14730 {
14731 target = gen_reg_rtx (mode);
14732 emit_move_insn (target, merge);
14733 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
14734 }
14735 }
14736 return target;
14737 }
14738 case IX86_BUILTIN_RDPID:
14739 return ix86_expand_special_args_builtin (bdesc_args + i, exp,
14740 target);
14741 case IX86_BUILTIN_FABSQ:
14742 case IX86_BUILTIN_COPYSIGNQ:
14743 if (!TARGET_SSE)
14744 /* Emit a normal call if SSE isn't available. */
14745 return expand_call (exp, target, ignore);
14746 /* FALLTHRU */
14747 default:
14748 return ix86_expand_args_builtin (bdesc_args + i, exp, target);
14749 }
14750 }
14751
14752 if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
14753 && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
14754 {
14755 i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
14756 return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
14757 }
14758
14759 if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
14760 && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
14761 {
14762 i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
14763 return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
14764 }
14765
14766 if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
14767 && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
14768 {
14769 i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
14770 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
14771 }
14772
14773 if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
14774 && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
14775 {
14776 i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
14777 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
14778 }
14779
14780 if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
14781 && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
14782 {
14783 i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
14784 const struct builtin_description *d = bdesc_multi_arg + i;
14785 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
14786 (enum ix86_builtin_func_type)
14787 d->flag, d->comparison);
14788 }
14789
14790 if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
14791 && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
14792 {
14793 i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
14794 return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
14795 target);
14796 }
14797
14798 gcc_unreachable ();
14799 }
14800
14801 /* A subroutine of ix86_expand_vector_init_duplicate. Tries to
14802 fill target with val via vec_duplicate. */
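/* Illustrative sketch (not a separate code path): for V4SImode this first
   tries to emit
     (set (reg:V4SI target) (vec_duplicate:V4SI val))
   as-is and, only if that RTL is not directly recognizable, retries after
   forcing VAL into an inner-mode register. */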
14803
14804 static bool
14805 ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
14806 {
14807 bool ok;
14808 rtx_insn *insn;
14809 rtx dup;
14810
14811 /* First attempt to recognize VAL as-is. */
14812 dup = gen_vec_duplicate (mode, val);
14813 insn = emit_insn (gen_rtx_SET (target, dup));
14814 if (recog_memoized (insn) < 0)
14815 {
14816 rtx_insn *seq;
14817 machine_mode innermode = GET_MODE_INNER (mode);
14818 rtx reg;
14819
14820 /* If that fails, force VAL into a register. */
14821
14822 start_sequence ();
14823 reg = force_reg (innermode, val);
14824 if (GET_MODE (reg) != innermode)
14825 reg = gen_lowpart (innermode, reg);
14826 SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
14827 seq = get_insns ();
14828 end_sequence ();
14829 if (seq)
14830 emit_insn_before (seq, insn);
14831
14832 ok = recog_memoized (insn) >= 0;
14833 gcc_assert (ok);
14834 }
14835 return true;
14836 }
14837
14838 /* Get a vector mode of the same size as the original but with elements
14839 twice as wide. This is only guaranteed to apply to integral vectors. */
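/* For example (integral vectors only, as noted above): V16QImode maps to
   V8HImode and V8HImode to V4SImode; the total size is unchanged while the
   element width doubles. */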
14840
14841 static machine_mode
14842 get_mode_wider_vector (machine_mode o)
14843 {
14844 /* ??? Rely on the ordering that genmodes.cc gives to vectors. */
14845 machine_mode n = GET_MODE_WIDER_MODE (o).require ();
14846 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
14847 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
14848 return n;
14849 }
14850
14851 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
14852 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
14853
14854 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
14855 with all elements equal to VAR. Return true if successful. */
14856
14857 bool
14858 ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
14859 rtx target, rtx val)
14860 {
14861 bool ok;
14862
14863 switch (mode)
14864 {
14865 case E_V2SImode:
14866 case E_V2SFmode:
14867 if (!mmx_ok)
14868 return false;
14869 /* FALLTHRU */
14870
14871 case E_V4DFmode:
14872 case E_V4DImode:
14873 case E_V8SFmode:
14874 case E_V8SImode:
14875 case E_V2DFmode:
14876 case E_V2DImode:
14877 case E_V4SFmode:
14878 case E_V4SImode:
14879 case E_V16SImode:
14880 case E_V8DImode:
14881 case E_V16SFmode:
14882 case E_V8DFmode:
14883 return ix86_vector_duplicate_value (mode, target, val);
14884
14885 case E_V4HImode:
14886 if (!mmx_ok)
14887 return false;
14888 if (TARGET_SSE || TARGET_3DNOW_A)
14889 {
14890 rtx x;
14891
14892 val = gen_lowpart (SImode, val);
14893 x = gen_rtx_TRUNCATE (HImode, val);
14894 x = gen_rtx_VEC_DUPLICATE (mode, x);
14895 emit_insn (gen_rtx_SET (target, x));
14896 return true;
14897 }
14898 goto widen;
14899
14900 case E_V2HImode:
14901 if (TARGET_SSE2)
14902 {
14903 rtx x;
14904
14905 val = gen_lowpart (SImode, val);
14906 x = gen_rtx_TRUNCATE (HImode, val);
14907 x = gen_rtx_VEC_DUPLICATE (mode, x);
14908 emit_insn (gen_rtx_SET (target, x));
14909 return true;
14910 }
14911 return false;
14912
14913 case E_V8QImode:
14914 case E_V4QImode:
14915 if (!mmx_ok)
14916 return false;
14917 goto widen;
14918
14919 case E_V8HImode:
14920 case E_V8HFmode:
14921 if (TARGET_AVX2)
14922 return ix86_vector_duplicate_value (mode, target, val);
14923
14924 if (TARGET_SSE2)
14925 {
14926 struct expand_vec_perm_d dperm;
14927 rtx tmp1, tmp2;
14928
14929 permute:
14930 memset (&dperm, 0, sizeof (dperm));
14931 dperm.target = target;
14932 dperm.vmode = mode;
14933 dperm.nelt = GET_MODE_NUNITS (mode);
14934 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
14935 dperm.one_operand_p = true;
14936
14937 if (mode == V8HFmode)
14938 {
14939 tmp1 = force_reg (HFmode, val);
14940 tmp2 = gen_reg_rtx (mode);
14941 emit_insn (gen_vec_setv8hf_0 (tmp2, CONST0_RTX (mode), tmp1));
14942 tmp1 = gen_lowpart (mode, tmp2);
14943 }
14944 else
14945 {
14946 /* Extend to SImode using a paradoxical SUBREG. */
14947 tmp1 = gen_reg_rtx (SImode);
14948 emit_move_insn (tmp1, gen_lowpart (SImode, val));
14949
14950 /* Insert the SImode value as
14951 low element of a V4SImode vector. */
14952 tmp2 = gen_reg_rtx (V4SImode);
14953 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
14954 tmp1 = gen_lowpart (mode, tmp2);
14955 }
14956
14957 emit_move_insn (dperm.op0, tmp1);
14958 ok = (expand_vec_perm_1 (&dperm)
14959 || expand_vec_perm_broadcast_1 (&dperm));
14960 gcc_assert (ok);
14961 return ok;
14962 }
14963 goto widen;
14964
14965 case E_V16QImode:
14966 if (TARGET_AVX2)
14967 return ix86_vector_duplicate_value (mode, target, val);
14968
14969 if (TARGET_SSE2)
14970 goto permute;
14971 goto widen;
14972
14973 widen:
14974 /* Replicate the value once into the next wider mode and recurse. */
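      /* Illustrative sketch for the QImode case: to duplicate a QImode value X
         into V8QImode, first form the HImode value (X << 8) | X (or the
         equivalent insert), then recurse to duplicate that HImode value into
         V4HImode; the result is copied back through a lowpart of the original
         mode. */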
14975 {
14976 machine_mode smode, wsmode, wvmode;
14977 rtx x;
14978
14979 smode = GET_MODE_INNER (mode);
14980 wvmode = get_mode_wider_vector (mode);
14981 wsmode = GET_MODE_INNER (wvmode);
14982
14983 val = convert_modes (wsmode, smode, val, true);
14984
14985 if (smode == QImode && !TARGET_PARTIAL_REG_STALL)
14986 emit_insn (gen_insv_1 (wsmode, val, val));
14987 else
14988 {
14989 x = expand_simple_binop (wsmode, ASHIFT, val,
14990 GEN_INT (GET_MODE_BITSIZE (smode)),
14991 NULL_RTX, 1, OPTAB_LIB_WIDEN);
14992 val = expand_simple_binop (wsmode, IOR, val, x, x, 1,
14993 OPTAB_LIB_WIDEN);
14994 }
14995
14996 x = gen_reg_rtx (wvmode);
14997 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
14998 gcc_assert (ok);
14999 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
15000 return ok;
15001 }
15002
15003 case E_V16HImode:
15004 case E_V16HFmode:
15005 case E_V32QImode:
15006 if (TARGET_AVX2)
15007 return ix86_vector_duplicate_value (mode, target, val);
15008 else
15009 {
15010 machine_mode hvmode = (mode == V16HImode ? V8HImode
15011 : mode == V16HFmode ? V8HFmode
15012 : V16QImode);
15013 rtx x = gen_reg_rtx (hvmode);
15014
15015 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
15016 gcc_assert (ok);
15017
15018 x = gen_rtx_VEC_CONCAT (mode, x, x);
15019 emit_insn (gen_rtx_SET (target, x));
15020 }
15021 return true;
15022
15023 case E_V32HImode:
15024 case E_V32HFmode:
15025 case E_V64QImode:
15026 if (TARGET_AVX512BW)
15027 return ix86_vector_duplicate_value (mode, target, val);
15028 else
15029 {
15030 machine_mode hvmode = (mode == V32HImode ? V16HImode
15031 : mode == V32HFmode ? V16HFmode
15032 : V32QImode);
15033 rtx x = gen_reg_rtx (hvmode);
15034
15035 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
15036 gcc_assert (ok);
15037
15038 x = gen_rtx_VEC_CONCAT (mode, x, x);
15039 emit_insn (gen_rtx_SET (target, x));
15040 }
15041 return true;
15042
15043 default:
15044 return false;
15045 }
15046 }
15047
15048 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
15049 whose ONE_VAR element is VAR, and other elements are zero. Return true
15050 if successful. */
15051
15052 static bool
15053 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
15054 rtx target, rtx var, int one_var)
15055 {
15056 machine_mode vsimode;
15057 rtx new_target;
15058 rtx x, tmp;
15059 bool use_vector_set = false;
15060 rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;
15061
15062 switch (mode)
15063 {
15064 case E_V2DImode:
15065 /* For SSE4.1, we normally use vector set. But if the second
15066 element is zero and inter-unit moves are OK, we use movq
15067 instead. */
15068 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
15069 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
15070 && one_var == 0));
15071 break;
15072 case E_V16QImode:
15073 case E_V4SImode:
15074 case E_V4SFmode:
15075 use_vector_set = TARGET_SSE4_1;
15076 break;
15077 case E_V8HImode:
15078 use_vector_set = TARGET_SSE2;
15079 gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0
15080 ? gen_vec_setv8hi_0 : NULL;
15081 break;
15082 case E_V8QImode:
15083 use_vector_set = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
15084 break;
15085 case E_V4HImode:
15086 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
15087 break;
15088 case E_V4QImode:
15089 use_vector_set = TARGET_SSE4_1;
15090 break;
15091 case E_V32QImode:
15092 use_vector_set = TARGET_AVX;
15093 break;
15094 case E_V16HImode:
15095 use_vector_set = TARGET_AVX;
15096 gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0
15097 ? gen_vec_setv16hi_0 : NULL;
15098 break;
15099 case E_V8SImode:
15100 use_vector_set = TARGET_AVX;
15101 gen_vec_set_0 = gen_vec_setv8si_0;
15102 break;
15103 case E_V8SFmode:
15104 use_vector_set = TARGET_AVX;
15105 gen_vec_set_0 = gen_vec_setv8sf_0;
15106 break;
15107 case E_V4DFmode:
15108 use_vector_set = TARGET_AVX;
15109 gen_vec_set_0 = gen_vec_setv4df_0;
15110 break;
15111 case E_V4DImode:
15112 /* Use ix86_expand_vector_set in 64bit mode only. */
15113 use_vector_set = TARGET_AVX && TARGET_64BIT;
15114 gen_vec_set_0 = gen_vec_setv4di_0;
15115 break;
15116 case E_V16SImode:
15117 use_vector_set = TARGET_AVX512F && one_var == 0;
15118 gen_vec_set_0 = gen_vec_setv16si_0;
15119 break;
15120 case E_V16SFmode:
15121 use_vector_set = TARGET_AVX512F && one_var == 0;
15122 gen_vec_set_0 = gen_vec_setv16sf_0;
15123 break;
15124 case E_V8DFmode:
15125 use_vector_set = TARGET_AVX512F && one_var == 0;
15126 gen_vec_set_0 = gen_vec_setv8df_0;
15127 break;
15128 case E_V8DImode:
15129 /* Use ix86_expand_vector_set in 64bit mode only. */
15130 use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
15131 gen_vec_set_0 = gen_vec_setv8di_0;
15132 break;
15133 case E_V8HFmode:
15134 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15135 gen_vec_set_0 = gen_vec_setv8hf_0;
15136 break;
15137 case E_V16HFmode:
15138 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15139 gen_vec_set_0 = gen_vec_setv16hf_0;
15140 break;
15141 case E_V32HFmode:
15142 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15143 gen_vec_set_0 = gen_vec_setv32hf_0;
15144 break;
15145 case E_V32HImode:
15146 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15147 gen_vec_set_0 = gen_vec_setv32hi_0;
15148 default:
15149 break;
15150 }
15151
15152 if (use_vector_set)
15153 {
15154 if (gen_vec_set_0 && one_var == 0)
15155 {
15156 var = force_reg (GET_MODE_INNER (mode), var);
15157 emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
15158 return true;
15159 }
15160 emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
15161 var = force_reg (GET_MODE_INNER (mode), var);
15162 ix86_expand_vector_set (mmx_ok, target, var, one_var);
15163 return true;
15164 }
15165
15166 switch (mode)
15167 {
15168 case E_V2SFmode:
15169 case E_V2SImode:
15170 if (!mmx_ok)
15171 return false;
15172 /* FALLTHRU */
15173
15174 case E_V2DFmode:
15175 case E_V2DImode:
15176 if (one_var != 0)
15177 return false;
15178 var = force_reg (GET_MODE_INNER (mode), var);
15179 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
15180 emit_insn (gen_rtx_SET (target, x));
15181 return true;
15182
15183 case E_V4SFmode:
15184 case E_V4SImode:
15185 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
15186 new_target = gen_reg_rtx (mode);
15187 else
15188 new_target = target;
15189 var = force_reg (GET_MODE_INNER (mode), var);
15190 x = gen_rtx_VEC_DUPLICATE (mode, var);
15191 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
15192 emit_insn (gen_rtx_SET (new_target, x));
15193 if (one_var != 0)
15194 {
15195 /* We need to shuffle the value to the correct position, so
15196 create a new pseudo to store the intermediate result. */
15197
15198 /* With SSE2, we can use the integer shuffle insns. */
15199 if (mode != V4SFmode && TARGET_SSE2)
15200 {
15201 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
15202 const1_rtx,
15203 GEN_INT (one_var == 1 ? 0 : 1),
15204 GEN_INT (one_var == 2 ? 0 : 1),
15205 GEN_INT (one_var == 3 ? 0 : 1)));
15206 if (target != new_target)
15207 emit_move_insn (target, new_target);
15208 return true;
15209 }
15210
15211 /* Otherwise convert the intermediate result to V4SFmode and
15212 use the SSE1 shuffle instructions. */
15213 if (mode != V4SFmode)
15214 {
15215 tmp = gen_reg_rtx (V4SFmode);
15216 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
15217 }
15218 else
15219 tmp = new_target;
15220
15221 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
15222 const1_rtx,
15223 GEN_INT (one_var == 1 ? 0 : 1),
15224 GEN_INT (one_var == 2 ? 0+4 : 1+4),
15225 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
15226
15227 if (mode != V4SFmode)
15228 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
15229 else if (tmp != target)
15230 emit_move_insn (target, tmp);
15231 }
15232 else if (target != new_target)
15233 emit_move_insn (target, new_target);
15234 return true;
15235
15236 case E_V8HImode:
15237 case E_V16QImode:
15238 vsimode = V4SImode;
15239 goto widen;
15240 case E_V4HImode:
15241 case E_V8QImode:
15242 if (!mmx_ok)
15243 return false;
15244 vsimode = V2SImode;
15245 goto widen;
15246 widen:
15247 if (one_var != 0)
15248 return false;
15249
15250 /* Zero extend the variable element to SImode and recurse. */
15251 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
15252
15253 x = gen_reg_rtx (vsimode);
15254 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
15255 var, one_var))
15256 gcc_unreachable ();
15257
15258 emit_move_insn (target, gen_lowpart (mode, x));
15259 return true;
15260
15261 default:
15262 return false;
15263 }
15264 }
15265
15266 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
15267 consisting of the values in VALS. It is known that all elements
15268 except ONE_VAR are constants. Return true if successful. */
15269
15270 static bool
15271 ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
15272 rtx target, rtx vals, int one_var)
15273 {
15274 rtx var = XVECEXP (vals, 0, one_var);
15275 machine_mode wmode;
15276 rtx const_vec, x;
15277
15278 const_vec = copy_rtx (vals);
15279 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
15280 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
15281
15282 switch (mode)
15283 {
15284 case E_V2DFmode:
15285 case E_V2DImode:
15286 case E_V2SFmode:
15287 case E_V2SImode:
15288 /* For the two element vectors, it's just as easy to use
15289 the general case. */
15290 return false;
15291
15292 case E_V4DImode:
15293 /* Use ix86_expand_vector_set in 64bit mode only. */
15294 if (!TARGET_64BIT)
15295 return false;
15296 /* FALLTHRU */
15297 case E_V8HFmode:
15298 case E_V16HFmode:
15299 case E_V4DFmode:
15300 case E_V8SFmode:
15301 case E_V8SImode:
15302 case E_V16HImode:
15303 case E_V32QImode:
15304 case E_V4SFmode:
15305 case E_V4SImode:
15306 case E_V8HImode:
15307 case E_V4HImode:
15308 break;
15309
15310 case E_V16QImode:
15311 if (TARGET_SSE4_1)
15312 break;
15313 wmode = V8HImode;
15314 goto widen;
15315 case E_V8QImode:
15316 if (TARGET_MMX_WITH_SSE && TARGET_SSE4_1)
15317 break;
15318 wmode = V4HImode;
15319 goto widen;
15320 case E_V4QImode:
15321 if (TARGET_SSE4_1)
15322 break;
15323 wmode = V2HImode;
15324 widen:
15325 /* There's no way to set one QImode entry easily. Combine
15326 the variable value with its adjacent constant value, and
15327 promote to an HImode set. */
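      /* For example (illustrative): with ONE_VAR == 2 in V8QImode and the
         adjacent constant c3, this builds the HImode value var | (c3 << 8)
         and sets it at HImode element 1 of the constant vector. */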
15328 x = XVECEXP (vals, 0, one_var ^ 1);
15329 if (one_var & 1)
15330 {
15331 var = convert_modes (HImode, QImode, var, true);
15332 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
15333 NULL_RTX, 1, OPTAB_LIB_WIDEN);
15334 x = GEN_INT (INTVAL (x) & 0xff);
15335 }
15336 else
15337 {
15338 var = convert_modes (HImode, QImode, var, true);
15339 x = gen_int_mode (UINTVAL (x) << 8, HImode);
15340 }
15341 if (x != const0_rtx)
15342 var = expand_simple_binop (HImode, IOR, var, x, var,
15343 1, OPTAB_LIB_WIDEN);
15344
15345 x = gen_reg_rtx (wmode);
15346 emit_move_insn (x, gen_lowpart (wmode, const_vec));
15347 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
15348
15349 emit_move_insn (target, gen_lowpart (mode, x));
15350 return true;
15351
15352 default:
15353 return false;
15354 }
15355
15356 emit_move_insn (target, const_vec);
15357 ix86_expand_vector_set (mmx_ok, target, var, one_var);
15358 return true;
15359 }
15360
15361 /* A subroutine of ix86_expand_vector_init_general. Use vector
15362 concatenate to handle the most general case: all values variable,
15363 and none identical. */
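/* Illustrative sketch for N == 4 (not a separate code path): the inputs are
   paired up into two half-width vectors and those halves are concatenated,
   roughly
     half[1] = {ops[2], ops[3]};  half[0] = {ops[0], ops[1]};
     target  = vec_concat (half[0], half[1]);
   with the pairs processed backward to help register allocation (PR 36222). */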
15364
15365 static void
15366 ix86_expand_vector_init_concat (machine_mode mode,
15367 rtx target, rtx *ops, int n)
15368 {
15369 machine_mode half_mode = VOIDmode;
15370 rtx half[2];
15371 rtvec v;
15372 int i, j;
15373
15374 switch (n)
15375 {
15376 case 2:
15377 switch (mode)
15378 {
15379 case E_V32HFmode:
15380 half_mode = V16HFmode;
15381 break;
15382 case E_V16SImode:
15383 half_mode = V8SImode;
15384 break;
15385 case E_V16SFmode:
15386 half_mode = V8SFmode;
15387 break;
15388 case E_V8DImode:
15389 half_mode = V4DImode;
15390 break;
15391 case E_V8DFmode:
15392 half_mode = V4DFmode;
15393 break;
15394 case E_V16HFmode:
15395 half_mode = V8HFmode;
15396 break;
15397 case E_V8SImode:
15398 half_mode = V4SImode;
15399 break;
15400 case E_V8SFmode:
15401 half_mode = V4SFmode;
15402 break;
15403 case E_V4DImode:
15404 half_mode = V2DImode;
15405 break;
15406 case E_V4DFmode:
15407 half_mode = V2DFmode;
15408 break;
15409 case E_V4SImode:
15410 half_mode = V2SImode;
15411 break;
15412 case E_V4SFmode:
15413 half_mode = V2SFmode;
15414 break;
15415 case E_V2DImode:
15416 half_mode = DImode;
15417 break;
15418 case E_V2SImode:
15419 half_mode = SImode;
15420 break;
15421 case E_V2DFmode:
15422 half_mode = DFmode;
15423 break;
15424 case E_V2SFmode:
15425 half_mode = SFmode;
15426 break;
15427 default:
15428 gcc_unreachable ();
15429 }
15430
15431 if (!register_operand (ops[1], half_mode))
15432 ops[1] = force_reg (half_mode, ops[1]);
15433 if (!register_operand (ops[0], half_mode))
15434 ops[0] = force_reg (half_mode, ops[0]);
15435 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
15436 ops[1])));
15437 break;
15438
15439 case 4:
15440 switch (mode)
15441 {
15442 case E_V4DImode:
15443 half_mode = V2DImode;
15444 break;
15445 case E_V4DFmode:
15446 half_mode = V2DFmode;
15447 break;
15448 case E_V4SImode:
15449 half_mode = V2SImode;
15450 break;
15451 case E_V4SFmode:
15452 half_mode = V2SFmode;
15453 break;
15454 default:
15455 gcc_unreachable ();
15456 }
15457 goto half;
15458
15459 case 8:
15460 switch (mode)
15461 {
15462 case E_V8DImode:
15463 half_mode = V4DImode;
15464 break;
15465 case E_V8DFmode:
15466 half_mode = V4DFmode;
15467 break;
15468 case E_V8SImode:
15469 half_mode = V4SImode;
15470 break;
15471 case E_V8SFmode:
15472 half_mode = V4SFmode;
15473 break;
15474 default:
15475 gcc_unreachable ();
15476 }
15477 goto half;
15478
15479 case 16:
15480 switch (mode)
15481 {
15482 case E_V16SImode:
15483 half_mode = V8SImode;
15484 break;
15485 case E_V16SFmode:
15486 half_mode = V8SFmode;
15487 break;
15488 default:
15489 gcc_unreachable ();
15490 }
15491 goto half;
15492
15493 half:
15494 /* FIXME: We process inputs backward to help RA. PR 36222. */
15495 i = n - 1;
15496 for (j = 1; j != -1; j--)
15497 {
15498 half[j] = gen_reg_rtx (half_mode);
15499 switch (n >> 1)
15500 {
15501 case 2:
15502 v = gen_rtvec (2, ops[i-1], ops[i]);
15503 i -= 2;
15504 break;
15505 case 4:
15506 v = gen_rtvec (4, ops[i-3], ops[i-2], ops[i-1], ops[i]);
15507 i -= 4;
15508 break;
15509 case 8:
15510 v = gen_rtvec (8, ops[i-7], ops[i-6], ops[i-5], ops[i-4],
15511 ops[i-3], ops[i-2], ops[i-1], ops[i]);
15512 i -= 8;
15513 break;
15514 default:
15515 gcc_unreachable ();
15516 }
15517 ix86_expand_vector_init (false, half[j],
15518 gen_rtx_PARALLEL (half_mode, v));
15519 }
15520
15521 ix86_expand_vector_init_concat (mode, target, half, 2);
15522 break;
15523
15524 default:
15525 gcc_unreachable ();
15526 }
15527 }
15528
15529 /* A subroutine of ix86_expand_vector_init_general. Use vector
15530 interleave to handle the most general case: all values variable,
15531 and none identical. */
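/* Illustrative sketch for V8HImode with elements e0..e7: each pair
   (e0,e1), (e2,e3), ... is first placed into the two low HImode slots of its
   own vector; those vectors are then combined pairwise with a V4SImode
   interleave-low and finally with a V2DImode interleave-low to form the full
   result. */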
15532
15533 static void
15534 ix86_expand_vector_init_interleave (machine_mode mode,
15535 rtx target, rtx *ops, int n)
15536 {
15537 machine_mode first_imode, second_imode, third_imode, inner_mode;
15538 int i, j;
15539 rtx op, op0, op1;
15540 rtx (*gen_load_even) (rtx, rtx, rtx);
15541 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
15542 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
15543
15544 switch (mode)
15545 {
15546 case E_V8HFmode:
15547 gen_load_even = gen_vec_interleave_lowv8hf;
15548 gen_interleave_first_low = gen_vec_interleave_lowv4si;
15549 gen_interleave_second_low = gen_vec_interleave_lowv2di;
15550 inner_mode = HFmode;
15551 first_imode = V4SImode;
15552 second_imode = V2DImode;
15553 third_imode = VOIDmode;
15554 break;
15555 case E_V8HImode:
15556 gen_load_even = gen_vec_setv8hi;
15557 gen_interleave_first_low = gen_vec_interleave_lowv4si;
15558 gen_interleave_second_low = gen_vec_interleave_lowv2di;
15559 inner_mode = HImode;
15560 first_imode = V4SImode;
15561 second_imode = V2DImode;
15562 third_imode = VOIDmode;
15563 break;
15564 case E_V16QImode:
15565 gen_load_even = gen_vec_setv16qi;
15566 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
15567 gen_interleave_second_low = gen_vec_interleave_lowv4si;
15568 inner_mode = QImode;
15569 first_imode = V8HImode;
15570 second_imode = V4SImode;
15571 third_imode = V2DImode;
15572 break;
15573 default:
15574 gcc_unreachable ();
15575 }
15576
15577 for (i = 0; i < n; i++)
15578 {
15579 op = ops [i + i];
15580 if (inner_mode == HFmode)
15581 {
15582 rtx even, odd;
15583 /* Use vpunpcklwd to pack two HFmode values. */
15584 op0 = gen_reg_rtx (V8HFmode);
15585 even = lowpart_subreg (V8HFmode, force_reg (HFmode, op), HFmode);
15586 odd = lowpart_subreg (V8HFmode,
15587 force_reg (HFmode, ops[i + i + 1]),
15588 HFmode);
15589 emit_insn (gen_load_even (op0, even, odd));
15590 }
15591 else
15592 {
15593 /* Extend the odd element to SImode using a paradoxical SUBREG. */
15594 op0 = gen_reg_rtx (SImode);
15595 emit_move_insn (op0, gen_lowpart (SImode, op));
15596
15597 /* Insert the SImode value as low element of V4SImode vector. */
15598 op1 = gen_reg_rtx (V4SImode);
15599 op0 = gen_rtx_VEC_MERGE (V4SImode,
15600 gen_rtx_VEC_DUPLICATE (V4SImode,
15601 op0),
15602 CONST0_RTX (V4SImode),
15603 const1_rtx);
15604 emit_insn (gen_rtx_SET (op1, op0));
15605
15606 /* Cast the V4SImode vector back to a vector in the original mode. */
15607 op0 = gen_reg_rtx (mode);
15608 emit_move_insn (op0, gen_lowpart (mode, op1));
15609
15610 /* Load even elements into the second position. */
15611 emit_insn (gen_load_even (op0,
15612 force_reg (inner_mode,
15613 ops[i + i + 1]),
15614 const1_rtx));
15615 }
15616
15617 /* Cast vector to FIRST_IMODE vector. */
15618 ops[i] = gen_reg_rtx (first_imode);
15619 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
15620 }
15621
15622 /* Interleave low FIRST_IMODE vectors. */
15623 for (i = j = 0; i < n; i += 2, j++)
15624 {
15625 op0 = gen_reg_rtx (first_imode);
15626 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
15627
15628 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
15629 ops[j] = gen_reg_rtx (second_imode);
15630 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
15631 }
15632
15633 /* Interleave low SECOND_IMODE vectors. */
15634 switch (second_imode)
15635 {
15636 case E_V4SImode:
15637 for (i = j = 0; i < n / 2; i += 2, j++)
15638 {
15639 op0 = gen_reg_rtx (second_imode);
15640 emit_insn (gen_interleave_second_low (op0, ops[i],
15641 ops[i + 1]));
15642
15643 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
15644 vector. */
15645 ops[j] = gen_reg_rtx (third_imode);
15646 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
15647 }
15648 second_imode = V2DImode;
15649 gen_interleave_second_low = gen_vec_interleave_lowv2di;
15650 /* FALLTHRU */
15651
15652 case E_V2DImode:
15653 op0 = gen_reg_rtx (second_imode);
15654 emit_insn (gen_interleave_second_low (op0, ops[0],
15655 ops[1]));
15656
15657 /* Cast the SECOND_IMODE vector back to a vector in the original
15658 mode. */
15659 emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
15660 break;
15661
15662 default:
15663 gcc_unreachable ();
15664 }
15665 }
15666
15667 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
15668 all values variable, and none identical. */
15669
15670 static void
15671 ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
15672 rtx target, rtx vals)
15673 {
15674 rtx ops[64], op0, op1, op2, op3, op4, op5;
15675 machine_mode half_mode = VOIDmode;
15676 machine_mode quarter_mode = VOIDmode;
15677 int n, i;
15678
15679 switch (mode)
15680 {
15681 case E_V2SFmode:
15682 case E_V2SImode:
15683 if (!mmx_ok && !TARGET_SSE)
15684 break;
15685 /* FALLTHRU */
15686
15687 case E_V16SImode:
15688 case E_V16SFmode:
15689 case E_V8DFmode:
15690 case E_V8DImode:
15691 case E_V8SFmode:
15692 case E_V8SImode:
15693 case E_V4DFmode:
15694 case E_V4DImode:
15695 case E_V4SFmode:
15696 case E_V4SImode:
15697 case E_V2DFmode:
15698 case E_V2DImode:
15699 n = GET_MODE_NUNITS (mode);
15700 for (i = 0; i < n; i++)
15701 ops[i] = XVECEXP (vals, 0, i);
15702 ix86_expand_vector_init_concat (mode, target, ops, n);
15703 return;
15704
15705 case E_V2TImode:
15706 for (i = 0; i < 2; i++)
15707 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
15708 op0 = gen_reg_rtx (V4DImode);
15709 ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
15710 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
15711 return;
15712
15713 case E_V4TImode:
15714 for (i = 0; i < 4; i++)
15715 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
15716 ops[4] = gen_reg_rtx (V4DImode);
15717 ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
15718 ops[5] = gen_reg_rtx (V4DImode);
15719 ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
15720 op0 = gen_reg_rtx (V8DImode);
15721 ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
15722 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
15723 return;
15724
15725 case E_V32QImode:
15726 half_mode = V16QImode;
15727 goto half;
15728
15729 case E_V16HImode:
15730 half_mode = V8HImode;
15731 goto half;
15732
15733 case E_V16HFmode:
15734 half_mode = V8HFmode;
15735 goto half;
15736
15737 half:
15738 n = GET_MODE_NUNITS (mode);
15739 for (i = 0; i < n; i++)
15740 ops[i] = XVECEXP (vals, 0, i);
15741 op0 = gen_reg_rtx (half_mode);
15742 op1 = gen_reg_rtx (half_mode);
15743 ix86_expand_vector_init_interleave (half_mode, op0, ops,
15744 n >> 2);
15745 ix86_expand_vector_init_interleave (half_mode, op1,
15746 &ops [n >> 1], n >> 2);
15747 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
15748 return;
15749
15750 case E_V64QImode:
15751 quarter_mode = V16QImode;
15752 half_mode = V32QImode;
15753 goto quarter;
15754
15755 case E_V32HImode:
15756 quarter_mode = V8HImode;
15757 half_mode = V16HImode;
15758 goto quarter;
15759
15760 case E_V32HFmode:
15761 quarter_mode = V8HFmode;
15762 half_mode = V16HFmode;
15763 goto quarter;
15764
15765 quarter:
15766 n = GET_MODE_NUNITS (mode);
15767 for (i = 0; i < n; i++)
15768 ops[i] = XVECEXP (vals, 0, i);
15769 op0 = gen_reg_rtx (quarter_mode);
15770 op1 = gen_reg_rtx (quarter_mode);
15771 op2 = gen_reg_rtx (quarter_mode);
15772 op3 = gen_reg_rtx (quarter_mode);
15773 op4 = gen_reg_rtx (half_mode);
15774 op5 = gen_reg_rtx (half_mode);
15775 ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
15776 n >> 3);
15777 ix86_expand_vector_init_interleave (quarter_mode, op1,
15778 &ops [n >> 2], n >> 3);
15779 ix86_expand_vector_init_interleave (quarter_mode, op2,
15780 &ops [n >> 1], n >> 3);
15781 ix86_expand_vector_init_interleave (quarter_mode, op3,
15782 &ops [(n >> 1) | (n >> 2)], n >> 3);
15783 emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
15784 emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
15785 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
15786 return;
15787
15788 case E_V16QImode:
15789 if (!TARGET_SSE4_1)
15790 break;
15791 /* FALLTHRU */
15792
15793 case E_V8HImode:
15794 if (!TARGET_SSE2)
15795 break;
15796
15797 /* Don't use ix86_expand_vector_init_interleave if we can't
15798 move from GPR to SSE register directly. */
15799 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
15800 break;
15801 /* FALLTHRU */
15802
15803 case E_V8HFmode:
15804
15805 n = GET_MODE_NUNITS (mode);
15806 for (i = 0; i < n; i++)
15807 ops[i] = XVECEXP (vals, 0, i);
15808 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
15809 return;
15810
15811 case E_V4HImode:
15812 case E_V8QImode:
15813
15814 case E_V2HImode:
15815 case E_V4QImode:
15816 break;
15817
15818 default:
15819 gcc_unreachable ();
15820 }
15821
15822 {
15823 int i, j, n_elts, n_words, n_elt_per_word;
15824 machine_mode tmp_mode, inner_mode;
15825 rtx words[4], shift;
15826
15827 tmp_mode = (GET_MODE_SIZE (mode) < UNITS_PER_WORD) ? SImode : word_mode;
15828
15829 inner_mode = GET_MODE_INNER (mode);
15830 n_elts = GET_MODE_NUNITS (mode);
15831 n_words = GET_MODE_SIZE (mode) / GET_MODE_SIZE (tmp_mode);
15832 n_elt_per_word = n_elts / n_words;
15833 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
15834
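    /* Illustrative sketch (e.g. V4QImode {e0,e1,e2,e3}): each word is
       assembled in a scalar register as e0 | (e1 << 8) | (e2 << 16) | (e3 << 24),
       so element 0 lands in the low bits; the word(s) are then moved into the
       vector (directly, via low/high parts, or by recursing as V4SImode). */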
15835 for (i = 0; i < n_words; ++i)
15836 {
15837 rtx word = NULL_RTX;
15838
15839 for (j = 0; j < n_elt_per_word; ++j)
15840 {
15841 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
15842 elt = convert_modes (tmp_mode, inner_mode, elt, true);
15843
15844 if (j == 0)
15845 word = elt;
15846 else
15847 {
15848 word = expand_simple_binop (tmp_mode, ASHIFT, word, shift,
15849 NULL_RTX, 1, OPTAB_LIB_WIDEN);
15850 word = expand_simple_binop (tmp_mode, IOR, word, elt,
15851 NULL_RTX, 1, OPTAB_LIB_WIDEN);
15852 }
15853 }
15854
15855 words[i] = word;
15856 }
15857
15858 if (n_words == 1)
15859 emit_move_insn (target, gen_lowpart (mode, words[0]));
15860 else if (n_words == 2)
15861 {
15862 rtx tmp = gen_reg_rtx (mode);
15863 emit_clobber (tmp);
15864 emit_move_insn (gen_lowpart (tmp_mode, tmp), words[0]);
15865 emit_move_insn (gen_highpart (tmp_mode, tmp), words[1]);
15866 emit_move_insn (target, tmp);
15867 }
15868 else if (n_words == 4)
15869 {
15870 rtx tmp = gen_reg_rtx (V4SImode);
15871 gcc_assert (tmp_mode == SImode);
15872 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
15873 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
15874 emit_move_insn (target, gen_lowpart (mode, tmp));
15875 }
15876 else
15877 gcc_unreachable ();
15878 }
15879 }
15880
15881 /* Initialize vector TARGET via VALS. Suppress the use of MMX
15882 instructions unless MMX_OK is true. */
15883
15884 void
15885 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
15886 {
15887 machine_mode mode = GET_MODE (target);
15888 machine_mode inner_mode = GET_MODE_INNER (mode);
15889 int n_elts = GET_MODE_NUNITS (mode);
15890 int n_var = 0, one_var = -1;
15891 bool all_same = true, all_const_zero = true;
15892 int i;
15893 rtx x;
15894
15895 /* Handle initialization from elements that are themselves vectors first. */
15896 if (n_elts != XVECLEN (vals, 0))
15897 {
15898 rtx subtarget = target;
15899 x = XVECEXP (vals, 0, 0);
15900 gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
15901 if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
15902 {
15903 rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
15904 if (inner_mode == QImode
15905 || inner_mode == HImode
15906 || inner_mode == TImode
15907 || inner_mode == HFmode)
15908 {
15909 unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
15910 scalar_mode elt_mode = inner_mode == TImode ? DImode : SImode;
15911 n_bits /= GET_MODE_SIZE (elt_mode);
15912 mode = mode_for_vector (elt_mode, n_bits).require ();
15913 inner_mode = mode_for_vector (elt_mode, n_bits / 2).require ();
15914 ops[0] = gen_lowpart (inner_mode, ops[0]);
15915 ops[1] = gen_lowpart (inner_mode, ops[1]);
15916 subtarget = gen_reg_rtx (mode);
15917 }
15918 ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
15919 if (subtarget != target)
15920 emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
15921 return;
15922 }
15923 gcc_unreachable ();
15924 }
15925
15926 for (i = 0; i < n_elts; ++i)
15927 {
15928 x = XVECEXP (vals, 0, i);
15929 if (!(CONST_SCALAR_INT_P (x)
15930 || CONST_DOUBLE_P (x)
15931 || CONST_FIXED_P (x)))
15932 n_var++, one_var = i;
15933 else if (x != CONST0_RTX (inner_mode))
15934 all_const_zero = false;
15935 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
15936 all_same = false;
15937 }
15938
15939 /* Constants are best loaded from the constant pool. */
15940 if (n_var == 0)
15941 {
15942 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
15943 return;
15944 }
15945
15946 /* If all values are identical, broadcast the value. */
15947 if (all_same
15948 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
15949 XVECEXP (vals, 0, 0)))
15950 return;
15951
15952 /* Values where only one field is non-constant are best loaded from
15953 the pool and overwritten via move later. */
15954 if (n_var == 1)
15955 {
15956 if (all_const_zero
15957 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
15958 XVECEXP (vals, 0, one_var),
15959 one_var))
15960 return;
15961
15962 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
15963 return;
15964 }
15965
15966 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
15967 }
15968
15969 /* Implemented as
15970 V setg (V v, int idx, T val)
15971 {
15972 V idxv = (V){idx, idx, idx, idx, idx, idx, idx, idx};
15973 V valv = (V){val, val, val, val, val, val, val, val};
15974 V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == idxv);
15975 v = (v & ~mask) | (valv & mask);
15976 return v;
15977 }. */
15978 void
15979 ix86_expand_vector_set_var (rtx target, rtx val, rtx idx)
15980 {
15981 rtx vec[64];
15982 machine_mode mode = GET_MODE (target);
15983 machine_mode cmp_mode = mode;
15984 int n_elts = GET_MODE_NUNITS (mode);
15985 rtx valv,idxv,constv,idx_tmp;
15986 bool ok = false;
15987
15988 /* 512-bit vector byte/word broadcast and comparison are only available
15989 under TARGET_AVX512BW; without TARGET_AVX512BW, split the 512-bit
15990 vector into two 256-bit vectors and handle each half. */
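  /* Note (assuming IDX is within [0, n_elts)): VAL is pushed through the
     variable-index path for both halves below; in the half whose index range
     does not contain IDX the equality mask comes out all-false, so that half
     is left unchanged. */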
15991 if ((mode == V32HImode || mode == V32HFmode || mode == V64QImode)
15992 && !TARGET_AVX512BW)
15993 {
15994 gcc_assert (TARGET_AVX512F);
15995 rtx vhi, vlo, idx_hi;
15996 machine_mode half_mode;
15997 rtx (*extract_hi)(rtx, rtx);
15998 rtx (*extract_lo)(rtx, rtx);
15999
16000 if (mode == V32HImode)
16001 {
16002 half_mode = V16HImode;
16003 extract_hi = gen_vec_extract_hi_v32hi;
16004 extract_lo = gen_vec_extract_lo_v32hi;
16005 }
16006 else if (mode == V32HFmode)
16007 {
16008 half_mode = V16HFmode;
16009 extract_hi = gen_vec_extract_hi_v32hf;
16010 extract_lo = gen_vec_extract_lo_v32hf;
16011 }
16012 else
16013 {
16014 half_mode = V32QImode;
16015 extract_hi = gen_vec_extract_hi_v64qi;
16016 extract_lo = gen_vec_extract_lo_v64qi;
16017 }
16018
16019 vhi = gen_reg_rtx (half_mode);
16020 vlo = gen_reg_rtx (half_mode);
16021 idx_hi = gen_reg_rtx (GET_MODE (idx));
16022 emit_insn (extract_hi (vhi, target));
16023 emit_insn (extract_lo (vlo, target));
16024 vec[0] = idx_hi;
16025 vec[1] = idx;
16026 vec[2] = GEN_INT (n_elts/2);
16027 ix86_expand_binary_operator (MINUS, GET_MODE (idx), vec);
16028 ix86_expand_vector_set_var (vhi, val, idx_hi);
16029 ix86_expand_vector_set_var (vlo, val, idx);
16030 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, vlo, vhi)));
16031 return;
16032 }
16033
16034 if (FLOAT_MODE_P (GET_MODE_INNER (mode)))
16035 {
16036 switch (mode)
16037 {
16038 case E_V2DFmode:
16039 cmp_mode = V2DImode;
16040 break;
16041 case E_V4DFmode:
16042 cmp_mode = V4DImode;
16043 break;
16044 case E_V8DFmode:
16045 cmp_mode = V8DImode;
16046 break;
16047 case E_V2SFmode:
16048 cmp_mode = V2SImode;
16049 break;
16050 case E_V4SFmode:
16051 cmp_mode = V4SImode;
16052 break;
16053 case E_V8SFmode:
16054 cmp_mode = V8SImode;
16055 break;
16056 case E_V16SFmode:
16057 cmp_mode = V16SImode;
16058 break;
16059 case E_V8HFmode:
16060 cmp_mode = V8HImode;
16061 break;
16062 case E_V16HFmode:
16063 cmp_mode = V16HImode;
16064 break;
16065 case E_V32HFmode:
16066 cmp_mode = V32HImode;
16067 break;
16068 default:
16069 gcc_unreachable ();
16070 }
16071 }
16072
16073 for (int i = 0; i != n_elts; i++)
16074 vec[i] = GEN_INT (i);
16075 constv = gen_rtx_CONST_VECTOR (cmp_mode, gen_rtvec_v (n_elts, vec));
16076 valv = gen_reg_rtx (mode);
16077 idxv = gen_reg_rtx (cmp_mode);
16078 idx_tmp = convert_to_mode (GET_MODE_INNER (cmp_mode), idx, 1);
16079
16080 ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE,
16081 mode, valv, val);
16082 gcc_assert (ok);
16083 ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE,
16084 cmp_mode, idxv, idx_tmp);
16085 gcc_assert (ok);
16086 vec[0] = target;
16087 vec[1] = valv;
16088 vec[2] = target;
16089 vec[3] = gen_rtx_EQ (mode, idxv, constv);
16090 vec[4] = idxv;
16091 vec[5] = constv;
16092 ok = ix86_expand_int_vcond (vec);
16093 gcc_assert (ok);
16094 }
16095
16096 void
16097 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
16098 {
16099 machine_mode mode = GET_MODE (target);
16100 machine_mode inner_mode = GET_MODE_INNER (mode);
16101 machine_mode half_mode;
16102 bool use_vec_merge = false;
16103 bool blendm_const = false;
16104 rtx tmp;
16105 static rtx (*gen_extract[7][2]) (rtx, rtx)
16106 = {
16107 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
16108 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
16109 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
16110 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
16111 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
16112 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df },
16113 { gen_vec_extract_lo_v16hf, gen_vec_extract_hi_v16hf }
16114 };
16115 static rtx (*gen_insert[7][2]) (rtx, rtx, rtx)
16116 = {
16117 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
16118 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
16119 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
16120 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
16121 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
16122 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df },
16123 { gen_vec_set_lo_v16hf, gen_vec_set_hi_v16hf },
16124 };
16125 int i, j, n;
16126 machine_mode mmode = VOIDmode;
16127 rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
16128
16129 switch (mode)
16130 {
16131 case E_V2SImode:
16132 use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
16133 if (use_vec_merge)
16134 break;
16135 /* FALLTHRU */
16136
16137 case E_V2SFmode:
16138 if (mmx_ok)
16139 {
16140 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
16141 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
16142 if (elt == 0)
16143 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
16144 else
16145 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
16146 emit_insn (gen_rtx_SET (target, tmp));
16147 return;
16148 }
16149 break;
16150
16151 case E_V2DImode:
16152 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
16153 if (use_vec_merge)
16154 break;
16155
16156 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
16157 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
16158 if (elt == 0)
16159 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
16160 else
16161 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
16162 emit_insn (gen_rtx_SET (target, tmp));
16163 return;
16164
16165 case E_V2DFmode:
16166 /* NB: For ELT == 0, use standard scalar operation patterns which
16167 preserve the rest of the vector for combiner:
16168
16169 (vec_merge:V2DF
16170 (vec_duplicate:V2DF (reg:DF))
16171 (reg:V2DF)
16172 (const_int 1))
16173 */
16174 if (elt == 0)
16175 goto do_vec_merge;
16176
16177 {
16178 rtx op0, op1;
16179
16180 /* For the two element vectors, we implement a VEC_CONCAT with
16181 the extraction of the other element. */
16182
16183 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
16184 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
16185
16186 if (elt == 0)
16187 op0 = val, op1 = tmp;
16188 else
16189 op0 = tmp, op1 = val;
16190
16191 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
16192 emit_insn (gen_rtx_SET (target, tmp));
16193 }
16194 return;
16195
16196 case E_V4SFmode:
16197 use_vec_merge = TARGET_SSE4_1;
16198 if (use_vec_merge)
16199 break;
16200
16201 switch (elt)
16202 {
16203 case 0:
16204 use_vec_merge = true;
16205 break;
16206
16207 case 1:
16208 /* tmp = target = A B C D */
16209 tmp = copy_to_reg (target);
16210 /* target = A A B B */
16211 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
16212 /* target = X A B B */
16213 ix86_expand_vector_set (false, target, val, 0);
16214 /* target = A X C D */
16215 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
16216 const1_rtx, const0_rtx,
16217 GEN_INT (2+4), GEN_INT (3+4)));
16218 return;
16219
16220 case 2:
16221 /* tmp = target = A B C D */
16222 tmp = copy_to_reg (target);
16223 /* tmp = X B C D */
16224 ix86_expand_vector_set (false, tmp, val, 0);
16225 /* target = A B X D */
16226 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
16227 const0_rtx, const1_rtx,
16228 GEN_INT (0+4), GEN_INT (3+4)));
16229 return;
16230
16231 case 3:
16232 /* tmp = target = A B C D */
16233 tmp = copy_to_reg (target);
16234 /* tmp = X B C D */
16235 ix86_expand_vector_set (false, tmp, val, 0);
16236 /* target = A B C X */
16237 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
16238 const0_rtx, const1_rtx,
16239 GEN_INT (2+4), GEN_INT (0+4)));
16240 return;
16241
16242 default:
16243 gcc_unreachable ();
16244 }
16245 break;
16246
16247 case E_V4SImode:
16248 use_vec_merge = TARGET_SSE4_1;
16249 if (use_vec_merge)
16250 break;
16251
16252 /* Element 0 handled by vec_merge below. */
16253 if (elt == 0)
16254 {
16255 use_vec_merge = true;
16256 break;
16257 }
16258
16259 if (TARGET_SSE2)
16260 {
16261 /* With SSE2, use integer shuffles to swap element 0 and ELT,
16262 store into element 0, then shuffle them back. */
16263
16264 rtx order[4];
16265
16266 order[0] = GEN_INT (elt);
16267 order[1] = const1_rtx;
16268 order[2] = const2_rtx;
16269 order[3] = GEN_INT (3);
16270 order[elt] = const0_rtx;
16271
16272 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
16273 order[1], order[2], order[3]));
16274
16275 ix86_expand_vector_set (false, target, val, 0);
16276
16277 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
16278 order[1], order[2], order[3]));
16279 }
16280 else
16281 {
16282 /* For SSE1, we have to reuse the V4SF code. */
16283 rtx t = gen_reg_rtx (V4SFmode);
16284 emit_move_insn (t, gen_lowpart (V4SFmode, target));
16285 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
16286 emit_move_insn (target, gen_lowpart (mode, t));
16287 }
16288 return;
16289
16290 case E_V8HImode:
16291 case E_V8HFmode:
16292 case E_V2HImode:
16293 use_vec_merge = TARGET_SSE2;
16294 break;
16295 case E_V4HImode:
16296 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
16297 break;
16298
16299 case E_V16QImode:
16300 case E_V4QImode:
16301 use_vec_merge = TARGET_SSE4_1;
16302 break;
16303
16304 case E_V8QImode:
16305 use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
16306 break;
16307
16308 case E_V32QImode:
16309 half_mode = V16QImode;
16310 j = 0;
16311 n = 16;
16312 goto half;
16313
16314 case E_V16HFmode:
16315 /* For ELT == 0, vec_setv8hf_0 can save 1 vpbroadcastw. */
16316 if (TARGET_AVX2 && elt != 0)
16317 {
16318 mmode = SImode;
16319 gen_blendm = gen_avx2_pblendph_1;
16320 blendm_const = true;
16321 break;
16322 }
16323 else
16324 {
16325 half_mode = V8HFmode;
16326 j = 6;
16327 n = 8;
16328 goto half;
16329 }
16330
16331 case E_V16HImode:
16332 half_mode = V8HImode;
16333 j = 1;
16334 n = 8;
16335 goto half;
16336
16337 case E_V8SImode:
16338 half_mode = V4SImode;
16339 j = 2;
16340 n = 4;
16341 goto half;
16342
16343 case E_V4DImode:
16344 half_mode = V2DImode;
16345 j = 3;
16346 n = 2;
16347 goto half;
16348
16349 case E_V8SFmode:
16350 half_mode = V4SFmode;
16351 j = 4;
16352 n = 4;
16353 goto half;
16354
16355 case E_V4DFmode:
16356 half_mode = V2DFmode;
16357 j = 5;
16358 n = 2;
16359 goto half;
16360
16361 half:
16362 /* Compute offset. */
16363 i = elt / n;
16364 elt %= n;
16365
16366 gcc_assert (i <= 1);
16367
16368 /* Extract the half. */
16369 tmp = gen_reg_rtx (half_mode);
16370 emit_insn (gen_extract[j][i] (tmp, target));
16371
16372 /* Put val in tmp at elt. */
16373 ix86_expand_vector_set (false, tmp, val, elt);
16374
16375 /* Put it back. */
16376 emit_insn (gen_insert[j][i] (target, target, tmp));
16377 return;
16378
16379 case E_V8DFmode:
16380 if (TARGET_AVX512F)
16381 {
16382 mmode = QImode;
16383 gen_blendm = gen_avx512f_blendmv8df;
16384 }
16385 break;
16386
16387 case E_V8DImode:
16388 if (TARGET_AVX512F)
16389 {
16390 mmode = QImode;
16391 gen_blendm = gen_avx512f_blendmv8di;
16392 }
16393 break;
16394
16395 case E_V16SFmode:
16396 if (TARGET_AVX512F)
16397 {
16398 mmode = HImode;
16399 gen_blendm = gen_avx512f_blendmv16sf;
16400 }
16401 break;
16402
16403 case E_V16SImode:
16404 if (TARGET_AVX512F)
16405 {
16406 mmode = HImode;
16407 gen_blendm = gen_avx512f_blendmv16si;
16408 }
16409 break;
16410
16411 case E_V32HFmode:
16412 if (TARGET_AVX512BW)
16413 {
16414 mmode = SImode;
16415 gen_blendm = gen_avx512bw_blendmv32hf;
16416 }
16417 break;
16418 case E_V32HImode:
16419 if (TARGET_AVX512BW)
16420 {
16421 mmode = SImode;
16422 gen_blendm = gen_avx512bw_blendmv32hi;
16423 }
16424 else if (TARGET_AVX512F)
16425 {
16426 half_mode = E_V8HImode;
16427 n = 8;
16428 goto quarter;
16429 }
16430 break;
16431
16432 case E_V64QImode:
16433 if (TARGET_AVX512BW)
16434 {
16435 mmode = DImode;
16436 gen_blendm = gen_avx512bw_blendmv64qi;
16437 }
16438 else if (TARGET_AVX512F)
16439 {
16440 half_mode = E_V16QImode;
16441 n = 16;
16442 goto quarter;
16443 }
16444 break;
16445
16446 quarter:
16447 /* Compute offset. */
16448 i = elt / n;
16449 elt %= n;
16450
16451 gcc_assert (i <= 3);
16452
16453 {
16454 /* Extract the quarter. */
16455 tmp = gen_reg_rtx (V4SImode);
16456 rtx tmp2 = gen_lowpart (V16SImode, target);
16457 rtx mask = gen_reg_rtx (QImode);
16458
16459 emit_move_insn (mask, constm1_rtx);
16460 emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i),
16461 tmp, mask));
16462
16463 tmp2 = gen_reg_rtx (half_mode);
16464 emit_move_insn (tmp2, gen_lowpart (half_mode, tmp));
16465 tmp = tmp2;
16466
16467 /* Put val in tmp at elt. */
16468 ix86_expand_vector_set (false, tmp, val, elt);
16469
16470 /* Put it back. */
16471 tmp2 = gen_reg_rtx (V16SImode);
16472 rtx tmp3 = gen_lowpart (V16SImode, target);
16473 mask = gen_reg_rtx (HImode);
16474 emit_move_insn (mask, constm1_rtx);
16475 tmp = gen_lowpart (V4SImode, tmp);
16476 emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i),
16477 tmp3, mask));
16478 emit_move_insn (target, gen_lowpart (mode, tmp2));
16479 }
16480 return;
16481
16482 default:
16483 break;
16484 }
16485
16486 if (mmode != VOIDmode)
16487 {
16488 tmp = gen_reg_rtx (mode);
16489 emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
16490 rtx merge_mask = gen_int_mode (HOST_WIDE_INT_1U << elt, mmode);
16491 /* The avx512*_blendm<mode> expanders have different operand order
16492 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
16493 elements where the mask is set and second input operand otherwise,
16494 in {sse,avx}*_*blend* the first input operand is used for elements
16495 where the mask is clear and second input operand otherwise. */
16496 if (!blendm_const)
16497 merge_mask = force_reg (mmode, merge_mask);
16498 emit_insn (gen_blendm (target, target, tmp, merge_mask));
16499 }
16500 else if (use_vec_merge)
16501 {
16502 do_vec_merge:
16503 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
16504 tmp = gen_rtx_VEC_MERGE (mode, tmp, target,
16505 GEN_INT (HOST_WIDE_INT_1U << elt));
16506 emit_insn (gen_rtx_SET (target, tmp));
16507 }
16508 else
16509 {
16510 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
16511
16512 emit_move_insn (mem, target);
16513
16514 tmp = adjust_address (mem, inner_mode, elt * GET_MODE_SIZE (inner_mode));
16515 emit_move_insn (tmp, val);
16516
16517 emit_move_insn (target, mem);
16518 }
16519 }
16520
16521 void
16522 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
16523 {
16524 machine_mode mode = GET_MODE (vec);
16525 machine_mode inner_mode = GET_MODE_INNER (mode);
16526 bool use_vec_extr = false;
16527 rtx tmp;
16528
16529 switch (mode)
16530 {
16531 case E_V2SImode:
16532 use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
16533 if (use_vec_extr)
16534 break;
16535 /* FALLTHRU */
16536
16537 case E_V2SFmode:
16538 if (!mmx_ok)
16539 break;
16540 /* FALLTHRU */
16541
16542 case E_V2DFmode:
16543 case E_V2DImode:
16544 case E_V2TImode:
16545 case E_V4TImode:
16546 use_vec_extr = true;
16547 break;
16548
16549 case E_V4SFmode:
16550 use_vec_extr = TARGET_SSE4_1;
16551 if (use_vec_extr)
16552 break;
16553
16554 switch (elt)
16555 {
16556 case 0:
16557 tmp = vec;
16558 break;
16559
16560 case 1:
16561 case 3:
16562 tmp = gen_reg_rtx (mode);
16563 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
16564 GEN_INT (elt), GEN_INT (elt),
16565 GEN_INT (elt+4), GEN_INT (elt+4)));
16566 break;
16567
16568 case 2:
16569 tmp = gen_reg_rtx (mode);
16570 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
16571 break;
16572
16573 default:
16574 gcc_unreachable ();
16575 }
16576 vec = tmp;
16577 use_vec_extr = true;
16578 elt = 0;
16579 break;
16580
16581 case E_V4SImode:
16582 use_vec_extr = TARGET_SSE4_1;
16583 if (use_vec_extr)
16584 break;
16585
16586 if (TARGET_SSE2)
16587 {
16588 switch (elt)
16589 {
16590 case 0:
16591 tmp = vec;
16592 break;
16593
16594 case 1:
16595 case 3:
16596 tmp = gen_reg_rtx (mode);
16597 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
16598 GEN_INT (elt), GEN_INT (elt),
16599 GEN_INT (elt), GEN_INT (elt)));
16600 break;
16601
16602 case 2:
16603 tmp = gen_reg_rtx (mode);
16604 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
16605 break;
16606
16607 default:
16608 gcc_unreachable ();
16609 }
16610 vec = tmp;
16611 use_vec_extr = true;
16612 elt = 0;
16613 }
16614 else
16615 {
16616 /* For SSE1, we have to reuse the V4SF code. */
16617 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
16618 gen_lowpart (V4SFmode, vec), elt);
16619 return;
16620 }
16621 break;
16622
16623 case E_V8HImode:
16624 case E_V8HFmode:
16625 case E_V2HImode:
16626 use_vec_extr = TARGET_SSE2;
16627 break;
16628 case E_V4HImode:
16629 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
16630 break;
16631
16632 case E_V16QImode:
16633 use_vec_extr = TARGET_SSE4_1;
16634 if (!use_vec_extr
16635 && TARGET_SSE2
16636 && elt == 0
16637 && (optimize_insn_for_size_p () || TARGET_INTER_UNIT_MOVES_FROM_VEC))
16638 {
16639 tmp = gen_reg_rtx (SImode);
16640 ix86_expand_vector_extract (false, tmp, gen_lowpart (V4SImode, vec),
16641 0);
16642 emit_insn (gen_rtx_SET (target, gen_lowpart (QImode, tmp)));
16643 return;
16644 }
16645 break;
16646 case E_V4QImode:
16647 use_vec_extr = TARGET_SSE4_1;
16648 break;
16649
16650 case E_V8SFmode:
16651 if (TARGET_AVX)
16652 {
16653 tmp = gen_reg_rtx (V4SFmode);
16654 if (elt < 4)
16655 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
16656 else
16657 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
16658 ix86_expand_vector_extract (false, target, tmp, elt & 3);
16659 return;
16660 }
16661 break;
16662
16663 case E_V4DFmode:
16664 if (TARGET_AVX)
16665 {
16666 tmp = gen_reg_rtx (V2DFmode);
16667 if (elt < 2)
16668 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
16669 else
16670 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
16671 ix86_expand_vector_extract (false, target, tmp, elt & 1);
16672 return;
16673 }
16674 break;
16675
16676 case E_V32QImode:
16677 if (TARGET_AVX)
16678 {
16679 tmp = gen_reg_rtx (V16QImode);
16680 if (elt < 16)
16681 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
16682 else
16683 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
16684 ix86_expand_vector_extract (false, target, tmp, elt & 15);
16685 return;
16686 }
16687 break;
16688
16689 case E_V16HImode:
16690 if (TARGET_AVX)
16691 {
16692 tmp = gen_reg_rtx (V8HImode);
16693 if (elt < 8)
16694 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
16695 else
16696 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
16697 ix86_expand_vector_extract (false, target, tmp, elt & 7);
16698 return;
16699 }
16700 break;
16701
16702 case E_V8SImode:
16703 if (TARGET_AVX)
16704 {
16705 tmp = gen_reg_rtx (V4SImode);
16706 if (elt < 4)
16707 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
16708 else
16709 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
16710 ix86_expand_vector_extract (false, target, tmp, elt & 3);
16711 return;
16712 }
16713 break;
16714
16715 case E_V4DImode:
16716 if (TARGET_AVX)
16717 {
16718 tmp = gen_reg_rtx (V2DImode);
16719 if (elt < 2)
16720 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
16721 else
16722 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
16723 ix86_expand_vector_extract (false, target, tmp, elt & 1);
16724 return;
16725 }
16726 break;
16727
16728 case E_V32HImode:
16729 if (TARGET_AVX512BW)
16730 {
16731 tmp = gen_reg_rtx (V16HImode);
16732 if (elt < 16)
16733 emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
16734 else
16735 emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
16736 ix86_expand_vector_extract (false, target, tmp, elt & 15);
16737 return;
16738 }
16739 break;
16740
16741 case E_V64QImode:
16742 if (TARGET_AVX512BW)
16743 {
16744 tmp = gen_reg_rtx (V32QImode);
16745 if (elt < 32)
16746 emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
16747 else
16748 emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
16749 ix86_expand_vector_extract (false, target, tmp, elt & 31);
16750 return;
16751 }
16752 break;
16753
16754 case E_V16SFmode:
16755 tmp = gen_reg_rtx (V8SFmode);
16756 if (elt < 8)
16757 emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
16758 else
16759 emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
16760 ix86_expand_vector_extract (false, target, tmp, elt & 7);
16761 return;
16762
16763 case E_V8DFmode:
16764 tmp = gen_reg_rtx (V4DFmode);
16765 if (elt < 4)
16766 emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
16767 else
16768 emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
16769 ix86_expand_vector_extract (false, target, tmp, elt & 3);
16770 return;
16771
16772 case E_V16SImode:
16773 tmp = gen_reg_rtx (V8SImode);
16774 if (elt < 8)
16775 emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
16776 else
16777 emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
16778 ix86_expand_vector_extract (false, target, tmp, elt & 7);
16779 return;
16780
16781 case E_V8DImode:
16782 tmp = gen_reg_rtx (V4DImode);
16783 if (elt < 4)
16784 emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
16785 else
16786 emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
16787 ix86_expand_vector_extract (false, target, tmp, elt & 3);
16788 return;
16789
16790 case E_V32HFmode:
16791 if (TARGET_AVX512BW)
16792 {
16793 tmp = gen_reg_rtx (V16HFmode);
16794 if (elt < 16)
16795 emit_insn (gen_vec_extract_lo_v32hf (tmp, vec));
16796 else
16797 emit_insn (gen_vec_extract_hi_v32hf (tmp, vec));
16798 ix86_expand_vector_extract (false, target, tmp, elt & 15);
16799 return;
16800 }
16801 break;
16802
16803 case E_V16HFmode:
16804 if (TARGET_AVX)
16805 {
16806 tmp = gen_reg_rtx (V8HFmode);
16807 if (elt < 8)
16808 emit_insn (gen_vec_extract_lo_v16hf (tmp, vec));
16809 else
16810 emit_insn (gen_vec_extract_hi_v16hf (tmp, vec));
16811 ix86_expand_vector_extract (false, target, tmp, elt & 7);
16812 return;
16813 }
16814 break;
16815
16816 case E_V8QImode:
16817 use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
16818 /* ??? Could extract the appropriate HImode element and shift. */
16819 break;
16820
16821 default:
16822 break;
16823 }
16824
16825 if (use_vec_extr)
16826 {
16827 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
16828 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
16829
16830 /* Let the rtl optimizers know about the zero extension performed. */
16831 if (inner_mode == QImode || inner_mode == HImode)
16832 {
16833 rtx reg = gen_reg_rtx (SImode);
16834 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
16835 emit_move_insn (reg, tmp);
16836 tmp = gen_lowpart (inner_mode, reg);
16837 SUBREG_PROMOTED_VAR_P (tmp) = 1;
16838 SUBREG_PROMOTED_SET (tmp, 1);
16839 }
16840
16841 emit_move_insn (target, tmp);
16842 }
16843 else
16844 {
16845 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
16846
16847 emit_move_insn (mem, vec);
16848
16849 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
16850 emit_move_insn (target, tmp);
16851 }
16852 }
16853
16854 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
16855 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
16856 The upper bits of DEST are undefined, though they shouldn't cause
16857 exceptions (some bits from src or all zeros are ok). */
16858
16859 static void
16860 emit_reduc_half (rtx dest, rtx src, int i)
16861 {
16862 rtx tem, d = dest;
16863 switch (GET_MODE (src))
16864 {
16865 case E_V4SFmode:
16866 if (i == 128)
16867 tem = gen_sse_movhlps (dest, src, src);
16868 else
16869 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
16870 GEN_INT (1 + 4), GEN_INT (1 + 4));
16871 break;
16872 case E_V2DFmode:
16873 tem = gen_vec_interleave_highv2df (dest, src, src);
16874 break;
16875 case E_V4QImode:
16876 d = gen_reg_rtx (V1SImode);
16877 tem = gen_mmx_lshrv1si3 (d, gen_lowpart (V1SImode, src),
16878 GEN_INT (i / 2));
16879 break;
16880 case E_V4HImode:
16881 d = gen_reg_rtx (V1DImode);
16882 tem = gen_mmx_lshrv1di3 (d, gen_lowpart (V1DImode, src),
16883 GEN_INT (i / 2));
16884 break;
16885 case E_V16QImode:
16886 case E_V8HImode:
16887 case E_V8HFmode:
16888 case E_V4SImode:
16889 case E_V2DImode:
16890 d = gen_reg_rtx (V1TImode);
16891 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
16892 GEN_INT (i / 2));
16893 break;
16894 case E_V8SFmode:
16895 if (i == 256)
16896 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
16897 else
16898 tem = gen_avx_shufps256 (dest, src, src,
16899 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
16900 break;
16901 case E_V4DFmode:
16902 if (i == 256)
16903 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
16904 else
16905 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
16906 break;
16907 case E_V32QImode:
16908 case E_V16HImode:
16909 case E_V16HFmode:
16910 case E_V8SImode:
16911 case E_V4DImode:
16912 if (i == 256)
16913 {
16914 if (GET_MODE (dest) != V4DImode)
16915 d = gen_reg_rtx (V4DImode);
16916 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
16917 gen_lowpart (V4DImode, src),
16918 const1_rtx);
16919 }
16920 else
16921 {
16922 d = gen_reg_rtx (V2TImode);
16923 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
16924 GEN_INT (i / 2));
16925 }
16926 break;
16927 case E_V64QImode:
16928 case E_V32HImode:
16929 case E_V32HFmode:
16930 if (i < 64)
16931 {
16932 d = gen_reg_rtx (V4TImode);
16933 tem = gen_avx512bw_lshrv4ti3 (d, gen_lowpart (V4TImode, src),
16934 GEN_INT (i / 2));
16935 break;
16936 }
16937 /* FALLTHRU */
16938 case E_V16SImode:
16939 case E_V16SFmode:
16940 case E_V8DImode:
16941 case E_V8DFmode:
16942 if (i > 128)
16943 tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
16944 gen_lowpart (V16SImode, src),
16945 gen_lowpart (V16SImode, src),
16946 GEN_INT (0x4 + (i == 512 ? 4 : 0)),
16947 GEN_INT (0x5 + (i == 512 ? 4 : 0)),
16948 GEN_INT (0x6 + (i == 512 ? 4 : 0)),
16949 GEN_INT (0x7 + (i == 512 ? 4 : 0)),
16950 GEN_INT (0xC), GEN_INT (0xD),
16951 GEN_INT (0xE), GEN_INT (0xF),
16952 GEN_INT (0x10), GEN_INT (0x11),
16953 GEN_INT (0x12), GEN_INT (0x13),
16954 GEN_INT (0x14), GEN_INT (0x15),
16955 GEN_INT (0x16), GEN_INT (0x17));
16956 else
16957 tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
16958 gen_lowpart (V16SImode, src),
16959 GEN_INT (i == 128 ? 0x2 : 0x1),
16960 GEN_INT (0x3),
16961 GEN_INT (0x3),
16962 GEN_INT (0x3),
16963 GEN_INT (i == 128 ? 0x6 : 0x5),
16964 GEN_INT (0x7),
16965 GEN_INT (0x7),
16966 GEN_INT (0x7),
16967 GEN_INT (i == 128 ? 0xA : 0x9),
16968 GEN_INT (0xB),
16969 GEN_INT (0xB),
16970 GEN_INT (0xB),
16971 GEN_INT (i == 128 ? 0xE : 0xD),
16972 GEN_INT (0xF),
16973 GEN_INT (0xF),
16974 GEN_INT (0xF));
16975 break;
16976 default:
16977 gcc_unreachable ();
16978 }
16979 emit_insn (tem);
16980 if (d != dest)
16981 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
16982 }
16983
16984 /* Expand a vector reduction. FN is the binary pattern to reduce;
16985 DEST is the destination; IN is the input vector. */
16986
16987 void
16988 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
16989 {
16990 rtx half, dst, vec = in;
16991 machine_mode mode = GET_MODE (in);
16992 int i;
16993
16994 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
16995 if (TARGET_SSE4_1
16996 && mode == V8HImode
16997 && fn == gen_uminv8hi3)
16998 {
16999 emit_insn (gen_sse4_1_phminposuw (dest, in));
17000 return;
17001 }
17002
17003 for (i = GET_MODE_BITSIZE (mode);
17004 i > GET_MODE_UNIT_BITSIZE (mode);
17005 i >>= 1)
17006 {
17007 half = gen_reg_rtx (mode);
17008 emit_reduc_half (half, vec, i);
17009 if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
17010 dst = dest;
17011 else
17012 dst = gen_reg_rtx (mode);
17013 emit_insn (fn (dst, half, vec));
17014 vec = dst;
17015 }
17016 }
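/* For reference, the loop above is the usual log2 reduction: each step
   uses emit_reduc_half to shift the upper half of the still-active part
   of the vector onto the lower half and combines the two with FN,
   halving the active width until element 0 of DEST holds the full
   reduction (the remaining elements are not meaningful).  A scalar C
   sketch of the same idea, illustration only (a maximum reduction over
   eight ints, using indexing instead of shifts):

     // given int v[8] holding the input elements
     for (int width = 8; width > 1; width /= 2)
       for (int j = 0; j < width / 2; j++)
         v[j] = v[j] > v[j + width / 2] ? v[j] : v[j + width / 2];
     // the reduction result is v[0]
*/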
17017
17018 /* Output code to perform a conditional jump to LABEL if the C2 flag in
17019 the FP status register is set.  */
17020
17021 void
17022 ix86_emit_fp_unordered_jump (rtx label)
17023 {
17024 rtx reg = gen_reg_rtx (HImode);
17025 rtx_insn *insn;
17026 rtx temp;
17027
17028 emit_insn (gen_x86_fnstsw_1 (reg));
17029
17030 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
17031 {
17032 emit_insn (gen_x86_sahf_1 (reg));
17033
17034 temp = gen_rtx_REG (CCmode, FLAGS_REG);
17035 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
17036 }
17037 else
17038 {
17039 emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
17040
17041 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
17042 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
17043 }
17044
17045 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
17046 gen_rtx_LABEL_REF (VOIDmode, label),
17047 pc_rtx);
17048 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
17049 predict_jump (REG_BR_PROB_BASE * 10 / 100);
17050 JUMP_LABEL (insn) = label;
17051 }
17052
17053 /* Output code to perform a sinh XFmode calculation. */
17054
17055 void
17056 ix86_emit_i387_sinh (rtx op0, rtx op1)
17057 {
17058 rtx e1 = gen_reg_rtx (XFmode);
17059 rtx e2 = gen_reg_rtx (XFmode);
17060 rtx scratch = gen_reg_rtx (HImode);
17061 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
17062 rtx half = const_double_from_real_value (dconsthalf, XFmode);
17063 rtx cst1, tmp;
17064 rtx_code_label *jump_label = gen_label_rtx ();
17065 rtx_insn *insn;
17066
17067 /* scratch = fxam (op1) */
17068 emit_insn (gen_fxamxf2_i387 (scratch, op1));
17069
17070 /* e1 = expm1 (|op1|) */
17071 emit_insn (gen_absxf2 (e2, op1));
17072 emit_insn (gen_expm1xf2 (e1, e2));
17073
17074 /* e2 = e1 / (e1 + 1.0) + e1 */
17075 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17076 emit_insn (gen_addxf3 (e2, e1, cst1));
17077 emit_insn (gen_divxf3 (e2, e1, e2));
17078 emit_insn (gen_addxf3 (e2, e2, e1));
17079
17080 /* flags = signbit (op1) */
17081 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
17082
17083 /* if (flags) then e2 = -e2 */
17084 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
17085 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
17086 gen_rtx_LABEL_REF (VOIDmode, jump_label),
17087 pc_rtx);
17088 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17089 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17090 JUMP_LABEL (insn) = jump_label;
17091
17092 emit_insn (gen_negxf2 (e2, e2));
17093
17094 emit_label (jump_label);
17095 LABEL_NUSES (jump_label) = 1;
17096
17097 /* op0 = 0.5 * e2 */
17098 half = force_reg (XFmode, half);
17099 emit_insn (gen_mulxf3 (op0, e2, half));
17100 }
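/* For reference, the sequence above expands the identity

     sinh(x) = sign(x) * 0.5 * (e / (e + 1) + e),  e = expm1 (|x|),

   since e / (e + 1) + e = (1 - exp (-|x|)) + (exp (|x|) - 1)
   = exp (|x|) - exp (-|x|); using expm1 avoids cancellation for small
   |x|.  A minimal C sketch, illustration only, assuming <math.h>:

     double sinh_ref (double x)
     {
       double e = expm1 (fabs (x));
       double r = 0.5 * (e / (e + 1.0) + e);
       return signbit (x) ? -r : r;
     }
*/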
17101
17102 /* Output code to perform a cosh XFmode calculation. */
17103
17104 void
17105 ix86_emit_i387_cosh (rtx op0, rtx op1)
17106 {
17107 rtx e1 = gen_reg_rtx (XFmode);
17108 rtx e2 = gen_reg_rtx (XFmode);
17109 rtx half = const_double_from_real_value (dconsthalf, XFmode);
17110 rtx cst1;
17111
17112 /* e1 = exp (op1) */
17113 emit_insn (gen_expxf2 (e1, op1));
17114
17115 /* e2 = e1 + 1.0 / e1 */
17116 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17117 emit_insn (gen_divxf3 (e2, cst1, e1));
17118 emit_insn (gen_addxf3 (e2, e1, e2));
17119
17120 /* op0 = 0.5 * e2 */
17121 half = force_reg (XFmode, half);
17122 emit_insn (gen_mulxf3 (op0, e2, half));
17123 }
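/* For reference, the sequence above is the plain identity
   cosh(x) = 0.5 * (e + 1/e) with e = exp (x); no sign handling is
   needed because cosh is even.  A minimal C sketch, illustration only:

     double cosh_ref (double x)
     {
       double e = exp (x);
       return 0.5 * (e + 1.0 / e);
     }
*/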
17124
17125 /* Output code to perform a tanh XFmode calculation. */
17126
17127 void
17128 ix86_emit_i387_tanh (rtx op0, rtx op1)
17129 {
17130 rtx e1 = gen_reg_rtx (XFmode);
17131 rtx e2 = gen_reg_rtx (XFmode);
17132 rtx scratch = gen_reg_rtx (HImode);
17133 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
17134 rtx cst2, tmp;
17135 rtx_code_label *jump_label = gen_label_rtx ();
17136 rtx_insn *insn;
17137
17138 /* scratch = fxam (op1) */
17139 emit_insn (gen_fxamxf2_i387 (scratch, op1));
17140
17141 /* e1 = expm1 (-|2 * op1|) */
17142 emit_insn (gen_addxf3 (e2, op1, op1));
17143 emit_insn (gen_absxf2 (e2, e2));
17144 emit_insn (gen_negxf2 (e2, e2));
17145 emit_insn (gen_expm1xf2 (e1, e2));
17146
17147 /* e2 = e1 / (e1 + 2.0) */
17148 cst2 = force_reg (XFmode, CONST2_RTX (XFmode));
17149 emit_insn (gen_addxf3 (e2, e1, cst2));
17150 emit_insn (gen_divxf3 (e2, e1, e2));
17151
17152 /* flags = signbit (op1) */
17153 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
17154
17155 /* if (!flags) then e2 = -e2 */
17156 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
17157 gen_rtx_NE (VOIDmode, flags, const0_rtx),
17158 gen_rtx_LABEL_REF (VOIDmode, jump_label),
17159 pc_rtx);
17160 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17161 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17162 JUMP_LABEL (insn) = jump_label;
17163
17164 emit_insn (gen_negxf2 (e2, e2));
17165
17166 emit_label (jump_label);
17167 LABEL_NUSES (jump_label) = 1;
17168
17169 emit_move_insn (op0, e2);
17170 }
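/* For reference, with e1 = expm1 (-|2*x|) the sequence above computes

     e1 / (e1 + 2) = (exp (-2|x|) - 1) / (exp (-2|x|) + 1) = -tanh (|x|),

   so the intermediate result is negated only when x is non-negative.
   A minimal C sketch, illustration only, assuming <math.h>:

     double tanh_ref (double x)
     {
       double e = expm1 (-fabs (2.0 * x));
       double r = e / (e + 2.0);    // -tanh (|x|)
       return signbit (x) ? r : -r;
     }
*/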
17171
17172 /* Output code to perform an asinh XFmode calculation. */
17173
17174 void
17175 ix86_emit_i387_asinh (rtx op0, rtx op1)
17176 {
17177 rtx e1 = gen_reg_rtx (XFmode);
17178 rtx e2 = gen_reg_rtx (XFmode);
17179 rtx scratch = gen_reg_rtx (HImode);
17180 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
17181 rtx cst1, tmp;
17182 rtx_code_label *jump_label = gen_label_rtx ();
17183 rtx_insn *insn;
17184
17185 /* e2 = sqrt (op1^2 + 1.0) + 1.0 */
17186 emit_insn (gen_mulxf3 (e1, op1, op1));
17187 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17188 emit_insn (gen_addxf3 (e2, e1, cst1));
17189 emit_insn (gen_sqrtxf2 (e2, e2));
17190 emit_insn (gen_addxf3 (e2, e2, cst1));
17191
17192 /* e1 = e1 / e2 */
17193 emit_insn (gen_divxf3 (e1, e1, e2));
17194
17195 /* scratch = fxam (op1) */
17196 emit_insn (gen_fxamxf2_i387 (scratch, op1));
17197
17198 /* e1 = e1 + |op1| */
17199 emit_insn (gen_absxf2 (e2, op1));
17200 emit_insn (gen_addxf3 (e1, e1, e2));
17201
17202 /* e2 = log1p (e1) */
17203 ix86_emit_i387_log1p (e2, e1);
17204
17205 /* flags = signbit (op1) */
17206 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
17207
17208 /* if (flags) then e2 = -e2 */
17209 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
17210 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
17211 gen_rtx_LABEL_REF (VOIDmode, jump_label),
17212 pc_rtx);
17213 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17214 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17215 JUMP_LABEL (insn) = jump_label;
17216
17217 emit_insn (gen_negxf2 (e2, e2));
17218
17219 emit_label (jump_label);
17220 LABEL_NUSES (jump_label) = 1;
17221
17222 emit_move_insn (op0, e2);
17223 }
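/* For reference, the sequence above uses

     asinh(x) = sign(x) * log1p (|x| + x*x / (sqrt (x*x + 1) + 1)),

   which follows from asinh (|x|) = log (|x| + sqrt (x*x + 1)) together
   with sqrt (x*x + 1) - 1 = x*x / (sqrt (x*x + 1) + 1), so that log1p
   keeps accuracy for small |x|.  A minimal C sketch, illustration only:

     double asinh_ref (double x)
     {
       double x2 = x * x;
       double r = log1p (fabs (x) + x2 / (sqrt (x2 + 1.0) + 1.0));
       return signbit (x) ? -r : r;
     }
*/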
17224
17225 /* Output code to perform an acosh XFmode calculation. */
17226
17227 void
17228 ix86_emit_i387_acosh (rtx op0, rtx op1)
17229 {
17230 rtx e1 = gen_reg_rtx (XFmode);
17231 rtx e2 = gen_reg_rtx (XFmode);
17232 rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17233
17234 /* e2 = sqrt (op1 + 1.0) */
17235 emit_insn (gen_addxf3 (e2, op1, cst1));
17236 emit_insn (gen_sqrtxf2 (e2, e2));
17237
17238 /* e1 = sqrt (op1 - 1.0) */
17239 emit_insn (gen_subxf3 (e1, op1, cst1));
17240 emit_insn (gen_sqrtxf2 (e1, e1));
17241
17242 /* e1 = e1 * e2 */
17243 emit_insn (gen_mulxf3 (e1, e1, e2));
17244
17245 /* e1 = e1 + op1 */
17246 emit_insn (gen_addxf3 (e1, e1, op1));
17247
17248 /* op0 = log (e1) */
17249 emit_insn (gen_logxf2 (op0, e1));
17250 }
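/* For reference, the sequence above computes

     acosh(x) = log (x + sqrt (x - 1) * sqrt (x + 1)),

   where sqrt (x*x - 1) is split into two factors, which avoids forming
   x*x and so cannot overflow prematurely for large x.  A minimal C
   sketch, illustration only:

     double acosh_ref (double x)
     {
       return log (x + sqrt (x - 1.0) * sqrt (x + 1.0));
     }
*/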
17251
17252 /* Output code to perform an atanh XFmode calculation. */
17253
17254 void
17255 ix86_emit_i387_atanh (rtx op0, rtx op1)
17256 {
17257 rtx e1 = gen_reg_rtx (XFmode);
17258 rtx e2 = gen_reg_rtx (XFmode);
17259 rtx scratch = gen_reg_rtx (HImode);
17260 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
17261 rtx half = const_double_from_real_value (dconsthalf, XFmode);
17262 rtx cst1, tmp;
17263 rtx_code_label *jump_label = gen_label_rtx ();
17264 rtx_insn *insn;
17265
17266 /* scratch = fxam (op1) */
17267 emit_insn (gen_fxamxf2_i387 (scratch, op1));
17268
17269 /* e2 = |op1| */
17270 emit_insn (gen_absxf2 (e2, op1));
17271
17272 /* e1 = -(e2 + e2) / (e2 + 1.0) */
17273 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17274 emit_insn (gen_addxf3 (e1, e2, cst1));
17275 emit_insn (gen_addxf3 (e2, e2, e2));
17276 emit_insn (gen_negxf2 (e2, e2));
17277 emit_insn (gen_divxf3 (e1, e2, e1));
17278
17279 /* e2 = log1p (e1) */
17280 ix86_emit_i387_log1p (e2, e1);
17281
17282 /* flags = signbit (op1) */
17283 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
17284
17285 /* if (!flags) then e2 = -e2 */
17286 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
17287 gen_rtx_NE (VOIDmode, flags, const0_rtx),
17288 gen_rtx_LABEL_REF (VOIDmode, jump_label),
17289 pc_rtx);
17290 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17291 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17292 JUMP_LABEL (insn) = jump_label;
17293
17294 emit_insn (gen_negxf2 (e2, e2));
17295
17296 emit_label (jump_label);
17297 LABEL_NUSES (jump_label) = 1;
17298
17299 /* op0 = 0.5 * e2 */
17300 half = force_reg (XFmode, half);
17301 emit_insn (gen_mulxf3 (op0, e2, half));
17302 }
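/* For reference, with e1 = -2*|x| / (1 + |x|) the sequence above
   computes

     log1p (e1) = log ((1 - |x|) / (1 + |x|)) = -2 * atanh (|x|),

   so halving and fixing up the sign gives atanh (x).  A minimal C
   sketch, illustration only, assuming <math.h>:

     double atanh_ref (double x)
     {
       double a = fabs (x);
       double r = 0.5 * log1p (-2.0 * a / (1.0 + a));   // -atanh (|x|)
       return signbit (x) ? r : -r;
     }
*/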
17303
17304 /* Output code to perform a log1p XFmode calculation. */
17305
17306 void
17307 ix86_emit_i387_log1p (rtx op0, rtx op1)
17308 {
17309 rtx_code_label *label1 = gen_label_rtx ();
17310 rtx_code_label *label2 = gen_label_rtx ();
17311
17312 rtx tmp = gen_reg_rtx (XFmode);
17313 rtx res = gen_reg_rtx (XFmode);
17314 rtx cst, cstln2, cst1;
17315 rtx_insn *insn;
17316
17317 /* The emit_jump call below would emit any pending stack adjustment
17318 itself; make sure the adjustment is emitted here, before the conditional
17319 jump, otherwise it would only be performed on one path. */
17320 do_pending_stack_adjust ();
17321
17322 cst = const_double_from_real_value
17323 (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode);
17324 cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */
17325
17326 emit_insn (gen_absxf2 (tmp, op1));
17327
17328 cst = force_reg (XFmode, cst);
17329 ix86_expand_branch (GE, tmp, cst, label1);
17330 predict_jump (REG_BR_PROB_BASE * 10 / 100);
17331 insn = get_last_insn ();
17332 JUMP_LABEL (insn) = label1;
17333
17334 emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2));
17335 emit_jump (label2);
17336
17337 emit_label (label1);
17338 LABEL_NUSES (label1) = 1;
17339
17340 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17341 emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1)));
17342 emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2));
17343
17344 emit_label (label2);
17345 LABEL_NUSES (label2) = 1;
17346
17347 emit_move_insn (op0, res);
17348 }
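/* For reference, the threshold constant above is approximately
   1 - sqrt(2)/2, the limit of the argument range for which fyl2xp1 is
   meant to be used; within it, log2 (1 + op1) is computed without first
   forming 1 + op1, which preserves accuracy for small |op1|.  Loading
   ln(2) via fldln2 turns the base-2 logarithm of either path into a
   natural logarithm, since fyl2x/fyl2xp1 compute y * log2 (...).  The
   branch structure corresponds roughly to this sketch (illustration
   only):

     double log1p_ref (double x)
     {
       const double t = 0.2928932188134524756;   // ~ 1 - sqrt(2)/2
       if (fabs (x) < t)
         return log1p (x);       // fyl2xp1 path: no explicit 1 + x
       else
         return log (1.0 + x);   // fyl2x path on tmp = x + 1
     }
*/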
17349
17350 /* Emit code for round calculation. */
17351 void
17352 ix86_emit_i387_round (rtx op0, rtx op1)
17353 {
17354 machine_mode inmode = GET_MODE (op1);
17355 machine_mode outmode = GET_MODE (op0);
17356 rtx e1 = gen_reg_rtx (XFmode);
17357 rtx e2 = gen_reg_rtx (XFmode);
17358 rtx scratch = gen_reg_rtx (HImode);
17359 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
17360 rtx half = const_double_from_real_value (dconsthalf, XFmode);
17361 rtx res = gen_reg_rtx (outmode);
17362 rtx_code_label *jump_label = gen_label_rtx ();
17363 rtx (*floor_insn) (rtx, rtx);
17364 rtx (*neg_insn) (rtx, rtx);
17365 rtx_insn *insn;
17366 rtx tmp;
17367
17368 switch (inmode)
17369 {
17370 case E_SFmode:
17371 case E_DFmode:
17372 tmp = gen_reg_rtx (XFmode);
17373
17374 emit_insn (gen_rtx_SET (tmp, gen_rtx_FLOAT_EXTEND (XFmode, op1)));
17375 op1 = tmp;
17376 break;
17377 case E_XFmode:
17378 break;
17379 default:
17380 gcc_unreachable ();
17381 }
17382
17383 switch (outmode)
17384 {
17385 case E_SFmode:
17386 floor_insn = gen_frndintxf2_floor;
17387 neg_insn = gen_negsf2;
17388 break;
17389 case E_DFmode:
17390 floor_insn = gen_frndintxf2_floor;
17391 neg_insn = gen_negdf2;
17392 break;
17393 case E_XFmode:
17394 floor_insn = gen_frndintxf2_floor;
17395 neg_insn = gen_negxf2;
17396 break;
17397 case E_HImode:
17398 floor_insn = gen_lfloorxfhi2;
17399 neg_insn = gen_neghi2;
17400 break;
17401 case E_SImode:
17402 floor_insn = gen_lfloorxfsi2;
17403 neg_insn = gen_negsi2;
17404 break;
17405 case E_DImode:
17406 floor_insn = gen_lfloorxfdi2;
17407 neg_insn = gen_negdi2;
17408 break;
17409 default:
17410 gcc_unreachable ();
17411 }
17412
17413 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
17414
17415 /* scratch = fxam(op1) */
17416 emit_insn (gen_fxamxf2_i387 (scratch, op1));
17417
17418 /* e1 = fabs(op1) */
17419 emit_insn (gen_absxf2 (e1, op1));
17420
17421 /* e2 = e1 + 0.5 */
17422 half = force_reg (XFmode, half);
17423 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (XFmode, e1, half)));
17424
17425 /* res = floor(e2) */
17426 switch (outmode)
17427 {
17428 case E_SFmode:
17429 case E_DFmode:
17430 {
17431 tmp = gen_reg_rtx (XFmode);
17432
17433 emit_insn (floor_insn (tmp, e2));
17434 emit_insn (gen_rtx_SET (res,
17435 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp),
17436 UNSPEC_TRUNC_NOOP)));
17437 }
17438 break;
17439 default:
17440 emit_insn (floor_insn (res, e2));
17441 }
17442
17443 /* flags = signbit(a) */
17444 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
17445
17446 /* if (flags) then res = -res */
17447 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
17448 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
17449 gen_rtx_LABEL_REF (VOIDmode, jump_label),
17450 pc_rtx);
17451 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17452 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17453 JUMP_LABEL (insn) = jump_label;
17454
17455 emit_insn (neg_insn (res, res));
17456
17457 emit_label (jump_label);
17458 LABEL_NUSES (jump_label) = 1;
17459
17460 emit_move_insn (op0, res);
17461 }
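/* For reference, the expansion above follows
   round (a) = sgn (a) * floor (|a| + 0.5); the sign is taken from fxam
   rather than from a compare so that -0.0 is also treated as negative.
   A scalar C sketch for an integer output mode, illustration only,
   assuming <math.h>:

     long round_to_long_ref (double a)
     {
       long r = (long) floor (fabs (a) + 0.5);
       return signbit (a) ? -r : r;
     }
*/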
17462
17463 /* Output code to perform a Newton-Raphson approximation of a single precision
17464 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
17465
17466 void
17467 ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
17468 {
17469 rtx x0, x1, e0, e1;
17470
17471 x0 = gen_reg_rtx (mode);
17472 e0 = gen_reg_rtx (mode);
17473 e1 = gen_reg_rtx (mode);
17474 x1 = gen_reg_rtx (mode);
17475
17476 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
17477
17478 b = force_reg (mode, b);
17479
17480 /* x0 = rcp(b) estimate */
17481 if (mode == V16SFmode || mode == V8DFmode)
17482 {
17483 if (TARGET_AVX512ER)
17484 {
17485 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
17486 UNSPEC_RCP28)));
17487 /* res = a * x0 */
17488 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
17489 return;
17490 }
17491 else
17492 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
17493 UNSPEC_RCP14)));
17494 }
17495 else
17496 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
17497 UNSPEC_RCP)));
17498
17499 /* e0 = x0 * b */
17500 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
17501
17502 /* e0 = x0 * e0 */
17503 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
17504
17505 /* e1 = x0 + x0 */
17506 emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
17507
17508 /* x1 = e1 - e0 */
17509 emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
17510
17511 /* res = a * x1 */
17512 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
17513 }
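/* For reference, the sequence above is a single Newton-Raphson step for
   the reciprocal: starting from the hardware's low-precision estimate
   x0 ~ 1/b, the refined value is

     x1 = x0 * (2 - b * x0) = (x0 + x0) - (b * x0) * x0,

   which roughly doubles the number of correct bits, and a / b is then
   approximated by a * x1.  A scalar C sketch, illustration only
   (rcp_estimate stands in for the rcpps/vrcp14ps estimate and is
   hypothetical):

     float swdiv_ref (float a, float b)
     {
       float x0 = rcp_estimate (b);
       float x1 = (x0 + x0) - (b * x0) * x0;
       return a * x1;
     }
*/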
17514
17515 /* Output code to perform a Newton-Raphson approximation of a
17516 single precision floating point [reciprocal] square root. */
17517
17518 void
17519 ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
17520 {
17521 rtx x0, e0, e1, e2, e3, mthree, mhalf;
17522 REAL_VALUE_TYPE r;
17523 int unspec;
17524
17525 x0 = gen_reg_rtx (mode);
17526 e0 = gen_reg_rtx (mode);
17527 e1 = gen_reg_rtx (mode);
17528 e2 = gen_reg_rtx (mode);
17529 e3 = gen_reg_rtx (mode);
17530
17531 if (TARGET_AVX512ER && mode == V16SFmode)
17532 {
17533 if (recip)
17534 /* res = rsqrt28(a) estimate */
17535 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
17536 UNSPEC_RSQRT28)));
17537 else
17538 {
17539 /* x0 = rsqrt28(a) estimate */
17540 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
17541 UNSPEC_RSQRT28)));
17542 /* res = rcp28(x0) estimate */
17543 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
17544 UNSPEC_RCP28)));
17545 }
17546 return;
17547 }
17548
17549 real_from_integer (&r, VOIDmode, -3, SIGNED);
17550 mthree = const_double_from_real_value (r, SFmode);
17551
17552 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
17553 mhalf = const_double_from_real_value (r, SFmode);
17554 unspec = UNSPEC_RSQRT;
17555
17556 if (VECTOR_MODE_P (mode))
17557 {
17558 mthree = ix86_build_const_vector (mode, true, mthree);
17559 mhalf = ix86_build_const_vector (mode, true, mhalf);
17560 /* There is no 512-bit rsqrt. There is however rsqrt14. */
17561 if (GET_MODE_SIZE (mode) == 64)
17562 unspec = UNSPEC_RSQRT14;
17563 }
17564
17565 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
17566 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
17567
17568 a = force_reg (mode, a);
17569
17570 /* x0 = rsqrt(a) estimate */
17571 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
17572 unspec)));
17573
17574 /* If a == 0.0, filter out the infinite rsqrt (0.0) estimate to prevent NaN for sqrt (0.0). */
17575 if (!recip)
17576 {
17577 rtx zero = force_reg (mode, CONST0_RTX(mode));
17578 rtx mask;
17579
17580 /* Handle masked compare. */
17581 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
17582 {
17583 mask = gen_reg_rtx (HImode);
17584 /* Imm value 0x4 corresponds to not-equal comparison. */
17585 emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
17586 emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
17587 }
17588 else
17589 {
17590 mask = gen_reg_rtx (mode);
17591 emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
17592 emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
17593 }
17594 }
17595
17596 mthree = force_reg (mode, mthree);
17597
17598 /* e0 = x0 * a */
17599 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
17600
17601 unsigned vector_size = GET_MODE_SIZE (mode);
17602 if (TARGET_FMA
17603 || (TARGET_AVX512F && vector_size == 64)
17604 || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
17605 emit_insn (gen_rtx_SET (e2,
17606 gen_rtx_FMA (mode, e0, x0, mthree)));
17607 else
17608 {
17609 /* e1 = e0 * x0 */
17610 emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
17611
17612 /* e2 = e1 - 3. */
17613 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
17614 }
17615
17616 mhalf = force_reg (mode, mhalf);
17617 if (recip)
17618 /* e3 = -.5 * x0 */
17619 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
17620 else
17621 /* e3 = -.5 * e0 */
17622 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
17623 /* ret = e2 * e3 */
17624 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
17625 }
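/* For reference, the sequence above is a single Newton-Raphson step for
   the reciprocal square root: starting from the hardware estimate
   x0 ~ 1/sqrt(a), the refined value is

     x1 = -0.5 * x0 * (a * x0 * x0 - 3) = 0.5 * x0 * (3 - a * x0 * x0),

   and sqrt(a) is recovered as a * x1, which the code folds in by
   scaling with e0 = a * x0 instead of x0.  A scalar C sketch,
   illustration only (rsqrt_estimate stands in for the
   rsqrtps/vrsqrt14ps estimate and is hypothetical):

     float rsqrt_ref (float a)
     {
       float x0 = rsqrt_estimate (a);
       return -0.5f * x0 * (a * x0 * x0 - 3.0f);
     }

   The a == 0.0 masking above exists because rsqrt (0.0) is +inf and
   0.0 * inf would otherwise produce a NaN for sqrt (0.0).  */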
17626
17627 /* Expand fabs (OP0) and return a new rtx that holds the result. The
17628 mask for masking out the sign-bit is stored in *SMASK, if that is
17629 non-null. */
17630
17631 static rtx
17632 ix86_expand_sse_fabs (rtx op0, rtx *smask)
17633 {
17634 machine_mode vmode, mode = GET_MODE (op0);
17635 rtx xa, mask;
17636
17637 xa = gen_reg_rtx (mode);
17638 if (mode == SFmode)
17639 vmode = V4SFmode;
17640 else if (mode == DFmode)
17641 vmode = V2DFmode;
17642 else
17643 vmode = mode;
17644 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
17645 if (!VECTOR_MODE_P (mode))
17646 {
17647 /* We need to generate a scalar mode mask in this case. */
17648 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
17649 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
17650 mask = gen_reg_rtx (mode);
17651 emit_insn (gen_rtx_SET (mask, tmp));
17652 }
17653 emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
17654
17655 if (smask)
17656 *smask = mask;
17657
17658 return xa;
17659 }
17660
17661 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
17662 swapping the operands if SWAP_OPERANDS is true. The expanded
17663 code is a forward jump to a newly created label in case the
17664 comparison is true. The generated label rtx is returned. */
17665 static rtx_code_label *
17666 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
17667 bool swap_operands)
17668 {
17669 bool unordered_compare = ix86_unordered_fp_compare (code);
17670 rtx_code_label *label;
17671 rtx tmp, reg;
17672
17673 if (swap_operands)
17674 std::swap (op0, op1);
17675
17676 label = gen_label_rtx ();
17677 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
17678 if (unordered_compare)
17679 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
17680 reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
17681 emit_insn (gen_rtx_SET (reg, tmp));
17682 tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
17683 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
17684 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
17685 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17686 JUMP_LABEL (tmp) = label;
17687
17688 return label;
17689 }
17690
17691 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
17692 using comparison code CODE. Operands are swapped for the comparison if
17693 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
17694 static rtx
17695 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
17696 bool swap_operands)
17697 {
17698 rtx (*insn)(rtx, rtx, rtx, rtx);
17699 machine_mode mode = GET_MODE (op0);
17700 rtx mask = gen_reg_rtx (mode);
17701
17702 if (swap_operands)
17703 std::swap (op0, op1);
17704
17705 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
17706
17707 emit_insn (insn (mask, op0, op1,
17708 gen_rtx_fmt_ee (code, mode, op0, op1)));
17709 return mask;
17710 }
17711
17712 /* Expand copysign from SIGN to the positive value ABS_VALUE
17713 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
17714 the sign-bit. */
17715
17716 static void
17717 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
17718 {
17719 machine_mode mode = GET_MODE (sign);
17720 rtx sgn = gen_reg_rtx (mode);
17721 if (mask == NULL_RTX)
17722 {
17723 machine_mode vmode;
17724
17725 if (mode == SFmode)
17726 vmode = V4SFmode;
17727 else if (mode == DFmode)
17728 vmode = V2DFmode;
17729 else
17730 vmode = mode;
17731
17732 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
17733 if (!VECTOR_MODE_P (mode))
17734 {
17735 /* We need to generate a scalar mode mask in this case. */
17736 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
17737 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
17738 mask = gen_reg_rtx (mode);
17739 emit_insn (gen_rtx_SET (mask, tmp));
17740 }
17741 }
17742 else
17743 mask = gen_rtx_NOT (mode, mask);
17744 emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
17745 emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
17746 }
17747
17748 /* Expand SSE sequence for computing lround from OP1 storing
17749 into OP0. */
17750
17751 void
17752 ix86_expand_lround (rtx op0, rtx op1)
17753 {
17754 /* C code for the stuff we're doing below:
17755 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
17756 return (long)tmp;
17757 */
17758 machine_mode mode = GET_MODE (op1);
17759 const struct real_format *fmt;
17760 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
17761 rtx adj;
17762
17763 /* load nextafter (0.5, 0.0) */
17764 fmt = REAL_MODE_FORMAT (mode);
17765 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
17766 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
17767
17768 /* adj = copysign (0.5, op1) */
17769 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
17770 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
17771
17772 /* adj = op1 + adj */
17773 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
17774
17775 /* op0 = (imode)adj */
17776 expand_fix (op0, adj, 0);
17777 }
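/* For reference, nextafter (0.5, 0.0) is used above instead of 0.5 so
   that the largest double below 0.5 still rounds to 0: for
   x = nextafter (0.5, 0.0), x + 0.5 rounds up to exactly 1.0 and would
   truncate to 1, whereas x plus the slightly smaller constant stays
   below 1.0.  A C sketch of the expansion, illustration only:

     long lround_ref (double x)
     {
       double adj = copysign (nextafter (0.5, 0.0), x);
       return (long) (x + adj);
     }
*/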
17778
17779 /* Expand SSE2 sequence for computing lfloor or lceil (depending on
17780 DO_FLOOR) from OPERAND1 storing into OPERAND0. */
17781
17782 void
17783 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
17784 {
17785 /* C code for the stuff we're doing below (for do_floor):
17786 xi = (long)op1;
17787 xi -= (double)xi > op1 ? 1 : 0;
17788 return xi;
17789 */
17790 machine_mode fmode = GET_MODE (op1);
17791 machine_mode imode = GET_MODE (op0);
17792 rtx ireg, freg, tmp;
17793 rtx_code_label *label;
17794
17795 /* reg = (long)op1 */
17796 ireg = gen_reg_rtx (imode);
17797 expand_fix (ireg, op1, 0);
17798
17799 /* freg = (double)reg */
17800 freg = gen_reg_rtx (fmode);
17801 expand_float (freg, ireg, 0);
17802
17803 /* ireg = (freg > op1) ? ireg - 1 : ireg */
17804 label = ix86_expand_sse_compare_and_jump (UNLE,
17805 freg, op1, !do_floor);
17806 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
17807 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
17808 emit_move_insn (ireg, tmp);
17809
17810 emit_label (label);
17811 LABEL_NUSES (label) = 1;
17812
17813 emit_move_insn (op0, ireg);
17814 }
17815
17816 /* Generate and return a rtx of mode MODE for 2**n where n is the number
17817 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
17818
17819 static rtx
17820 ix86_gen_TWO52 (machine_mode mode)
17821 {
17822 const struct real_format *fmt;
17823 REAL_VALUE_TYPE TWO52r;
17824 rtx TWO52;
17825
17826 fmt = REAL_MODE_FORMAT (mode);
17827 real_2expN (&TWO52r, fmt->p - 1, mode);
17828 TWO52 = const_double_from_real_value (TWO52r, mode);
17829 TWO52 = force_reg (mode, TWO52);
17830
17831 return TWO52;
17832 }
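/* For reference, TWO52 is 2**52 for DFmode and 2**23 for SFmode, the
   smallest power of two whose unit in the last place is 1.0.  For
   0 <= xa < TWO52 the sequence

     xa = (xa + TWO52) - TWO52;

   therefore rounds xa to an integer in the current rounding mode: the
   sum lies in [TWO52, 2*TWO52), where every representable value is an
   integer, and the subtraction is exact.  Inputs with |x| >= TWO52 are
   already integral (or NaN), which is why the expanders below branch
   around the whole computation in that case.  */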
17833
17834 /* Expand rint rounding OPERAND1 and storing the result in OPERAND0. */
17835
17836 void
17837 ix86_expand_rint (rtx operand0, rtx operand1)
17838 {
17839 /* C code for the stuff we're doing below:
17840 xa = fabs (operand1);
17841 if (!isless (xa, 2**52))
17842 return operand1;
17843 two52 = 2**52;
17844 if (flag_rounding_math)
17845 {
17846 two52 = copysign (two52, operand1);
17847 xa = operand1;
17848 }
17849 xa = xa + two52 - two52;
17850 return copysign (xa, operand1);
17851 */
17852 machine_mode mode = GET_MODE (operand0);
17853 rtx res, xa, TWO52, mask;
17854 rtx_code_label *label;
17855
17856 TWO52 = ix86_gen_TWO52 (mode);
17857
17858 /* Temporary for holding the result, initialized to the input
17859 operand to ease control flow. */
17860 res = copy_to_reg (operand1);
17861
17862 /* xa = abs (operand1) */
17863 xa = ix86_expand_sse_fabs (res, &mask);
17864
17865 /* if (!isless (xa, TWO52)) goto label; */
17866 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
17867
17868 if (flag_rounding_math)
17869 {
17870 ix86_sse_copysign_to_positive (TWO52, TWO52, res, mask);
17871 xa = res;
17872 }
17873
17874 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
17875 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
17876
17877 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
17878 if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
17879 xa = ix86_expand_sse_fabs (xa, NULL);
17880
17881 ix86_sse_copysign_to_positive (res, xa, res, mask);
17882
17883 emit_label (label);
17884 LABEL_NUSES (label) = 1;
17885
17886 emit_move_insn (operand0, res);
17887 }
17888
17889 /* Expand SSE2 sequence for computing floor or ceil
17890 from OPERAND1 storing into OPERAND0. */
17891 void
17892 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
17893 {
17894 /* C code for the stuff we expand below.
17895 double xa = fabs (x), x2;
17896 if (!isless (xa, TWO52))
17897 return x;
17898 x2 = (double)(long)x;
17899
17900 Compensate. Floor:
17901 if (x2 > x)
17902 x2 -= 1;
17903 Compensate. Ceil:
17904 if (x2 < x)
17905 x2 += 1;
17906
17907 if (HONOR_SIGNED_ZEROS (mode))
17908 return copysign (x2, x);
17909 return x2;
17910 */
17911 machine_mode mode = GET_MODE (operand0);
17912 rtx xa, xi, TWO52, tmp, one, res, mask;
17913 rtx_code_label *label;
17914
17915 TWO52 = ix86_gen_TWO52 (mode);
17916
17917 /* Temporary for holding the result, initialized to the input
17918 operand to ease control flow. */
17919 res = copy_to_reg (operand1);
17920
17921 /* xa = abs (operand1) */
17922 xa = ix86_expand_sse_fabs (res, &mask);
17923
17924 /* if (!isless (xa, TWO52)) goto label; */
17925 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
17926
17927 /* xa = (double)(long)x */
17928 xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
17929 expand_fix (xi, res, 0);
17930 expand_float (xa, xi, 0);
17931
17932 /* generate 1.0 */
17933 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
17934
17935 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
17936 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
17937 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
17938 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
17939 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
17940 if (HONOR_SIGNED_ZEROS (mode))
17941 {
17942 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
17943 if (do_floor && flag_rounding_math)
17944 tmp = ix86_expand_sse_fabs (tmp, NULL);
17945
17946 ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
17947 }
17948 emit_move_insn (res, tmp);
17949
17950 emit_label (label);
17951 LABEL_NUSES (label) = 1;
17952
17953 emit_move_insn (operand0, res);
17954 }
17955
17956 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
17957 into OPERAND0 without relying on DImode truncation via cvttsd2siq
17958 that is only available on 64bit targets. */
17959 void
17960 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
17961 {
17962 /* C code for the stuff we expand below.
17963 double xa = fabs (x), x2;
17964 if (!isless (xa, TWO52))
17965 return x;
17966 xa = xa + TWO52 - TWO52;
17967 x2 = copysign (xa, x);
17968
17969 Compensate. Floor:
17970 if (x2 > x)
17971 x2 -= 1;
17972 Compensate. Ceil:
17973 if (x2 < x)
17974 x2 += 1;
17975
17976 if (HONOR_SIGNED_ZEROS (mode))
17977 x2 = copysign (x2, x);
17978 return x2;
17979 */
17980 machine_mode mode = GET_MODE (operand0);
17981 rtx xa, TWO52, tmp, one, res, mask;
17982 rtx_code_label *label;
17983
17984 TWO52 = ix86_gen_TWO52 (mode);
17985
17986 /* Temporary for holding the result, initialized to the input
17987 operand to ease control flow. */
17988 res = copy_to_reg (operand1);
17989
17990 /* xa = abs (operand1) */
17991 xa = ix86_expand_sse_fabs (res, &mask);
17992
17993 /* if (!isless (xa, TWO52)) goto label; */
17994 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
17995
17996 /* xa = xa + TWO52 - TWO52; */
17997 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
17998 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
17999
18000 /* xa = copysign (xa, operand1) */
18001 ix86_sse_copysign_to_positive (xa, xa, res, mask);
18002
18003 /* generate 1.0 */
18004 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
18005
18006 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
18007 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
18008 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
18009 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
18010 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
18011 if (HONOR_SIGNED_ZEROS (mode))
18012 {
18013 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
18014 if (do_floor && flag_rounding_math)
18015 tmp = ix86_expand_sse_fabs (tmp, NULL);
18016
18017 ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
18018 }
18019 emit_move_insn (res, tmp);
18020
18021 emit_label (label);
18022 LABEL_NUSES (label) = 1;
18023
18024 emit_move_insn (operand0, res);
18025 }
18026
18027 /* Expand SSE sequence for computing trunc
18028 from OPERAND1 storing into OPERAND0. */
18029 void
18030 ix86_expand_trunc (rtx operand0, rtx operand1)
18031 {
18032 /* C code for SSE variant we expand below.
18033 double xa = fabs (x), x2;
18034 if (!isless (xa, TWO52))
18035 return x;
18036 x2 = (double)(long)x;
18037 if (HONOR_SIGNED_ZEROS (mode))
18038 return copysign (x2, x);
18039 return x2;
18040 */
18041 machine_mode mode = GET_MODE (operand0);
18042 rtx xa, xi, TWO52, res, mask;
18043 rtx_code_label *label;
18044
18045 TWO52 = ix86_gen_TWO52 (mode);
18046
18047 /* Temporary for holding the result, initialized to the input
18048 operand to ease control flow. */
18049 res = copy_to_reg (operand1);
18050
18051 /* xa = abs (operand1) */
18052 xa = ix86_expand_sse_fabs (res, &mask);
18053
18054 /* if (!isless (xa, TWO52)) goto label; */
18055 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18056
18057 /* xa = (double)(long)x */
18058 xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
18059 expand_fix (xi, res, 0);
18060 expand_float (xa, xi, 0);
18061
18062 if (HONOR_SIGNED_ZEROS (mode))
18063 ix86_sse_copysign_to_positive (xa, xa, res, mask);
18064
18065 emit_move_insn (res, xa);
18066
18067 emit_label (label);
18068 LABEL_NUSES (label) = 1;
18069
18070 emit_move_insn (operand0, res);
18071 }
18072
18073 /* Expand SSE sequence for computing trunc from OPERAND1 storing
18074 into OPERAND0 without relying on DImode truncation via cvttsd2siq
18075 that is only available on 64bit targets. */
18076 void
18077 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
18078 {
18079 machine_mode mode = GET_MODE (operand0);
18080 rtx xa, xa2, TWO52, tmp, one, res, mask;
18081 rtx_code_label *label;
18082
18083 /* C code for SSE variant we expand below.
18084 double xa = fabs (x), x2;
18085 if (!isless (xa, TWO52))
18086 return x;
18087 xa2 = xa + TWO52 - TWO52;
18088 Compensate:
18089 if (xa2 > xa)
18090 xa2 -= 1.0;
18091 x2 = copysign (xa2, x);
18092 return x2;
18093 */
18094
18095 TWO52 = ix86_gen_TWO52 (mode);
18096
18097 /* Temporary for holding the result, initialized to the input
18098 operand to ease control flow. */
18099 res = copy_to_reg (operand1);
18100
18101 /* xa = abs (operand1) */
18102 xa = ix86_expand_sse_fabs (res, &mask);
18103
18104 /* if (!isless (xa, TWO52)) goto label; */
18105 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18106
18107 /* xa2 = xa + TWO52 - TWO52; */
18108 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
18109 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
18110
18111 /* generate 1.0 */
18112 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
18113
18114 /* Compensate: xa2 = xa2 - (xa2 > xa ? 1 : 0) */
18115 tmp = ix86_expand_sse_compare_mask (UNGT, xa2, xa, false);
18116 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
18117 tmp = expand_simple_binop (mode, MINUS,
18118 xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
18119 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
18120 if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
18121 tmp = ix86_expand_sse_fabs (tmp, NULL);
18122
18123 /* res = copysign (xa2, operand1) */
18124 ix86_sse_copysign_to_positive (res, tmp, res, mask);
18125
18126 emit_label (label);
18127 LABEL_NUSES (label) = 1;
18128
18129 emit_move_insn (operand0, res);
18130 }
18131
18132 /* Expand SSE sequence for computing round
18133 from OPERAND1 storing into OPERAND0. */
18134 void
18135 ix86_expand_round (rtx operand0, rtx operand1)
18136 {
18137 /* C code for the stuff we're doing below:
18138 double xa = fabs (x);
18139 if (!isless (xa, TWO52))
18140 return x;
18141 xa = (double)(long)(xa + nextafter (0.5, 0.0));
18142 return copysign (xa, x);
18143 */
18144 machine_mode mode = GET_MODE (operand0);
18145 rtx res, TWO52, xa, xi, half, mask;
18146 rtx_code_label *label;
18147 const struct real_format *fmt;
18148 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
18149
18150 /* Temporary for holding the result, initialized to the input
18151 operand to ease control flow. */
18152 res = copy_to_reg (operand1);
18153
18154 TWO52 = ix86_gen_TWO52 (mode);
18155 xa = ix86_expand_sse_fabs (res, &mask);
18156 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18157
18158 /* load nextafter (0.5, 0.0) */
18159 fmt = REAL_MODE_FORMAT (mode);
18160 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
18161 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
18162
18163 /* xa = xa + 0.5 */
18164 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
18165 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
18166
18167 /* xa = (double)(int64_t)xa */
18168 xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
18169 expand_fix (xi, xa, 0);
18170 expand_float (xa, xi, 0);
18171
18172 /* res = copysign (xa, operand1) */
18173 ix86_sse_copysign_to_positive (res, xa, res, mask);
18174
18175 emit_label (label);
18176 LABEL_NUSES (label) = 1;
18177
18178 emit_move_insn (operand0, res);
18179 }
18180
18181 /* Expand SSE sequence for computing round from OPERAND1 storing
18182 into OPERAND0 without relying on DImode truncation via cvttsd2siq
18183 that is only available on 64bit targets. */
18184 void
18185 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
18186 {
18187 /* C code for the stuff we expand below.
18188 double xa = fabs (x), xa2, x2;
18189 if (!isless (xa, TWO52))
18190 return x;
18191 Using the absolute value and copying back sign makes
18192 -0.0 -> -0.0 correct.
18193 xa2 = xa + TWO52 - TWO52;
18194 Compensate.
18195 dxa = xa2 - xa;
18196 if (dxa <= -0.5)
18197 xa2 += 1;
18198 else if (dxa > 0.5)
18199 xa2 -= 1;
18200 x2 = copysign (xa2, x);
18201 return x2;
18202 */
18203 machine_mode mode = GET_MODE (operand0);
18204 rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
18205 rtx_code_label *label;
18206
18207 TWO52 = ix86_gen_TWO52 (mode);
18208
18209 /* Temporary for holding the result, initialized to the input
18210 operand to ease control flow. */
18211 res = copy_to_reg (operand1);
18212
18213 /* xa = abs (operand1) */
18214 xa = ix86_expand_sse_fabs (res, &mask);
18215
18216 /* if (!isless (xa, TWO52)) goto label; */
18217 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18218
18219 /* xa2 = xa + TWO52 - TWO52; */
18220 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
18221 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
18222
18223 /* dxa = xa2 - xa; */
18224 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
18225
18226 /* generate 0.5, 1.0 and -0.5 */
18227 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
18228 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
18229 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
18230 0, OPTAB_DIRECT);
18231
18232 /* Compensate. */
18233 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
18234 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
18235 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
18236 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
18237 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
18238 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
18239 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
18240 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
18241
18242 /* res = copysign (xa2, operand1) */
18243 ix86_sse_copysign_to_positive (res, xa2, res, mask);
18244
18245 emit_label (label);
18246 LABEL_NUSES (label) = 1;
18247
18248 emit_move_insn (operand0, res);
18249 }
18250
18251 /* Expand SSE sequence for computing round
18252 from OP1 storing into OP0 using sse4 round insn. */
18253 void
18254 ix86_expand_round_sse4 (rtx op0, rtx op1)
18255 {
18256 machine_mode mode = GET_MODE (op0);
18257 rtx e1, e2, res, half;
18258 const struct real_format *fmt;
18259 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
18260 rtx (*gen_copysign) (rtx, rtx, rtx);
18261 rtx (*gen_round) (rtx, rtx, rtx);
18262
18263 switch (mode)
18264 {
18265 case E_SFmode:
18266 gen_copysign = gen_copysignsf3;
18267 gen_round = gen_sse4_1_roundsf2;
18268 break;
18269 case E_DFmode:
18270 gen_copysign = gen_copysigndf3;
18271 gen_round = gen_sse4_1_rounddf2;
18272 break;
18273 default:
18274 gcc_unreachable ();
18275 }
18276
18277 /* round (a) = trunc (a + copysign (0.5, a)) */
18278
18279 /* load nextafter (0.5, 0.0) */
18280 fmt = REAL_MODE_FORMAT (mode);
18281 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
18282 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
18283 half = const_double_from_real_value (pred_half, mode);
18284
18285 /* e1 = copysign (0.5, op1) */
18286 e1 = gen_reg_rtx (mode);
18287 emit_insn (gen_copysign (e1, half, op1));
18288
18289 /* e2 = op1 + e1 */
18290 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
18291
18292 /* res = trunc (e2) */
18293 res = gen_reg_rtx (mode);
18294 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
18295
18296 emit_move_insn (op0, res);
18297 }
18298
18299 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
18300 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
18301 insn every time. */
18302
18303 static GTY(()) rtx_insn *vselect_insn;
18304
18305 /* Initialize vselect_insn. */
18306
18307 static void
18308 init_vselect_insn (void)
18309 {
18310 unsigned i;
18311 rtx x;
18312
18313 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
18314 for (i = 0; i < MAX_VECT_LEN; ++i)
18315 XVECEXP (x, 0, i) = const0_rtx;
18316 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
18317 const0_rtx), x);
18318 x = gen_rtx_SET (const0_rtx, x);
18319 start_sequence ();
18320 vselect_insn = emit_insn (x);
18321 end_sequence ();
18322 }
18323
18324 /* Construct (set target (vec_select op0 (parallel perm))) and
18325 return true if that's a valid instruction in the active ISA. */
18326
18327 static bool
18328 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
18329 unsigned nelt, bool testing_p)
18330 {
18331 unsigned int i;
18332 rtx x, save_vconcat;
18333 int icode;
18334
18335 if (vselect_insn == NULL_RTX)
18336 init_vselect_insn ();
18337
18338 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
18339 PUT_NUM_ELEM (XVEC (x, 0), nelt);
18340 for (i = 0; i < nelt; ++i)
18341 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
18342 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
18343 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
18344 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
18345 SET_DEST (PATTERN (vselect_insn)) = target;
18346 icode = recog_memoized (vselect_insn);
18347
18348 if (icode >= 0 && !testing_p)
18349 emit_insn (copy_rtx (PATTERN (vselect_insn)));
18350
18351 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
18352 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
18353 INSN_CODE (vselect_insn) = -1;
18354
18355 return icode >= 0;
18356 }
18357
18358 /* Similar, but generate a vec_concat from op0 and op1 as well. */
18359
18360 static bool
18361 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
18362 const unsigned char *perm, unsigned nelt,
18363 bool testing_p)
18364 {
18365 machine_mode v2mode;
18366 rtx x;
18367 bool ok;
18368
18369 if (vselect_insn == NULL_RTX)
18370 init_vselect_insn ();
18371
18372 if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
18373 return false;
18374 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
18375 PUT_MODE (x, v2mode);
18376 XEXP (x, 0) = op0;
18377 XEXP (x, 1) = op1;
18378 ok = expand_vselect (target, x, perm, nelt, testing_p);
18379 XEXP (x, 0) = const0_rtx;
18380 XEXP (x, 1) = const0_rtx;
18381 return ok;
18382 }
18383
18384 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
18385 using movss or movsd. */
18386 static bool
18387 expand_vec_perm_movs (struct expand_vec_perm_d *d)
18388 {
18389 machine_mode vmode = d->vmode;
18390 unsigned i, nelt = d->nelt;
18391 rtx x;
18392
18393 if (d->one_operand_p)
18394 return false;
18395
18396 if (!(TARGET_SSE && vmode == V4SFmode)
18397 && !(TARGET_MMX_WITH_SSE && vmode == V2SFmode)
18398 && !(TARGET_SSE2 && vmode == V2DFmode))
18399 return false;
18400
18401 /* Only the first element is changed. */
18402 if (d->perm[0] != nelt && d->perm[0] != 0)
18403 return false;
18404 for (i = 1; i < nelt; ++i)
18405 if (d->perm[i] != i + nelt - d->perm[0])
18406 return false;
18407
18408 if (d->testing_p)
18409 return true;
18410
18411 if (d->perm[0] == nelt)
18412 x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
18413 else
18414 x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));
18415
18416 emit_insn (gen_rtx_SET (d->target, x));
18417
18418 return true;
18419 }
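/* For reference, the permutations accepted above take element 0 from
   one operand and all remaining elements, in place, from the other,
   e.g. for V4SFmode:

     perm = { 4, 1, 2, 3 }   -> element 0 from op1, elements 1-3 from op0
     perm = { 0, 5, 6, 7 }   -> element 0 from op0, elements 1-3 from op1

   Both are expressed as a VEC_MERGE with mask 1, which matches the
   movss/movsd patterns (and their V2SFmode MMX-with-SSE counterpart).  */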
18420
18421 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
18422 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
18423
18424 static bool
18425 expand_vec_perm_blend (struct expand_vec_perm_d *d)
18426 {
18427 machine_mode mmode, vmode = d->vmode;
18428 unsigned i, nelt = d->nelt;
18429 unsigned HOST_WIDE_INT mask;
18430 rtx target, op0, op1, maskop, x;
18431 rtx rperm[32], vperm;
18432
18433 if (d->one_operand_p)
18434 return false;
18435 if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
18436 && (TARGET_AVX512BW
18437 || GET_MODE_UNIT_SIZE (vmode) >= 4))
18438 ;
18439 else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
18440 ;
18441 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
18442 ;
18443 else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 16
18444 || GET_MODE_SIZE (vmode) == 8
18445 || GET_MODE_SIZE (vmode) == 4))
18446 ;
18447 else
18448 return false;
18449
18450 /* This is a blend, not a permute. Elements must stay in their
18451 respective lanes. */
18452 for (i = 0; i < nelt; ++i)
18453 {
18454 unsigned e = d->perm[i];
18455 if (!(e == i || e == i + nelt))
18456 return false;
18457 }
18458
18459 if (d->testing_p)
18460 return true;
18461
18462 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
18463 decision should be extracted elsewhere, so that we only try that
18464 sequence once all budget==3 options have been tried. */
18465 target = d->target;
18466 op0 = d->op0;
18467 op1 = d->op1;
18468 mask = 0;
18469
18470 switch (vmode)
18471 {
18472 case E_V8DFmode:
18473 case E_V16SFmode:
18474 case E_V4DFmode:
18475 case E_V8SFmode:
18476 case E_V2DFmode:
18477 case E_V4SFmode:
18478 case E_V4HImode:
18479 case E_V8HImode:
18480 case E_V8SImode:
18481 case E_V32HImode:
18482 case E_V64QImode:
18483 case E_V16SImode:
18484 case E_V8DImode:
18485 for (i = 0; i < nelt; ++i)
18486 mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
18487 break;
18488
18489 case E_V2DImode:
18490 for (i = 0; i < 2; ++i)
18491 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
18492 vmode = V8HImode;
18493 goto do_subreg;
18494
18495 case E_V2SImode:
18496 for (i = 0; i < 2; ++i)
18497 mask |= (d->perm[i] >= 2 ? 3 : 0) << (i * 2);
18498 vmode = V4HImode;
18499 goto do_subreg;
18500
18501 case E_V4SImode:
18502 for (i = 0; i < 4; ++i)
18503 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
18504 vmode = V8HImode;
18505 goto do_subreg;
18506
18507 case E_V16QImode:
18508 /* See if bytes move in pairs so we can use pblendw with
18509 an immediate argument, rather than pblendvb with a vector
18510 argument. */
18511 for (i = 0; i < 16; i += 2)
18512 if (d->perm[i] + 1 != d->perm[i + 1])
18513 {
18514 use_pblendvb:
18515 for (i = 0; i < nelt; ++i)
18516 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
18517
18518 finish_pblendvb:
18519 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
18520 vperm = force_reg (vmode, vperm);
18521
18522 if (GET_MODE_SIZE (vmode) == 4)
18523 emit_insn (gen_mmx_pblendvb_v4qi (target, op0, op1, vperm));
18524 else if (GET_MODE_SIZE (vmode) == 8)
18525 emit_insn (gen_mmx_pblendvb_v8qi (target, op0, op1, vperm));
18526 else if (GET_MODE_SIZE (vmode) == 16)
18527 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
18528 else
18529 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
18530 if (target != d->target)
18531 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
18532 return true;
18533 }
18534
18535 for (i = 0; i < 8; ++i)
18536 mask |= (d->perm[i * 2] >= 16) << i;
18537 vmode = V8HImode;
18538 /* FALLTHRU */
18539
18540 do_subreg:
18541 target = gen_reg_rtx (vmode);
18542 op0 = gen_lowpart (vmode, op0);
18543 op1 = gen_lowpart (vmode, op1);
18544 break;
18545
18546 case E_V8QImode:
18547 for (i = 0; i < 8; i += 2)
18548 if (d->perm[i] + 1 != d->perm[i + 1])
18549 goto use_pblendvb;
18550
18551 for (i = 0; i < 4; ++i)
18552 mask |= (d->perm[i * 2] >= 8) << i;
18553 vmode = V4HImode;
18554 goto do_subreg;
18555
18556 case E_V4QImode:
18557 for (i = 0; i < 4; i += 2)
18558 if (d->perm[i] + 1 != d->perm[i + 1])
18559 goto use_pblendvb;
18560
18561 for (i = 0; i < 2; ++i)
18562 mask |= (d->perm[i * 2] >= 4) << i;
18563 vmode = V2HImode;
18564 goto do_subreg;
18565
18566 case E_V32QImode:
18567 /* See if bytes move in pairs. If not, vpblendvb must be used. */
18568 for (i = 0; i < 32; i += 2)
18569 if (d->perm[i] + 1 != d->perm[i + 1])
18570 goto use_pblendvb;
18571 /* See if bytes move in quadruplets. If yes, vpblendd
18572 with immediate can be used. */
18573 for (i = 0; i < 32; i += 4)
18574 if (d->perm[i] + 2 != d->perm[i + 2])
18575 break;
18576 if (i < 32)
18577 {
18578 /* See if bytes move the same in both lanes. If yes,
18579 vpblendw with immediate can be used. */
18580 for (i = 0; i < 16; i += 2)
18581 if (d->perm[i] + 16 != d->perm[i + 16])
18582 goto use_pblendvb;
18583
18584 /* Use vpblendw. */
18585 for (i = 0; i < 16; ++i)
18586 mask |= (d->perm[i * 2] >= 32) << i;
18587 vmode = V16HImode;
18588 goto do_subreg;
18589 }
18590
18591 /* Use vpblendd. */
18592 for (i = 0; i < 8; ++i)
18593 mask |= (d->perm[i * 4] >= 32) << i;
18594 vmode = V8SImode;
18595 goto do_subreg;
18596
18597 case E_V16HImode:
18598 /* See if words move in pairs. If yes, vpblendd can be used. */
18599 for (i = 0; i < 16; i += 2)
18600 if (d->perm[i] + 1 != d->perm[i + 1])
18601 break;
18602 if (i < 16)
18603 {
18604 /* See if words move the same in both lanes. If not,
18605 vpblendvb must be used. */
18606 for (i = 0; i < 8; i++)
18607 if (d->perm[i] + 8 != d->perm[i + 8])
18608 {
18609 /* Use vpblendvb. */
18610 for (i = 0; i < 32; ++i)
18611 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
18612
18613 vmode = V32QImode;
18614 nelt = 32;
18615 target = gen_reg_rtx (vmode);
18616 op0 = gen_lowpart (vmode, op0);
18617 op1 = gen_lowpart (vmode, op1);
18618 goto finish_pblendvb;
18619 }
18620
18621 /* Use vpblendw. */
18622 for (i = 0; i < 16; ++i)
18623 mask |= (d->perm[i] >= 16) << i;
18624 break;
18625 }
18626
18627 /* Use vpblendd. */
18628 for (i = 0; i < 8; ++i)
18629 mask |= (d->perm[i * 2] >= 16) << i;
18630 vmode = V8SImode;
18631 goto do_subreg;
18632
18633 case E_V4DImode:
18634 /* Use vpblendd. */
18635 for (i = 0; i < 4; ++i)
18636 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
18637 vmode = V8SImode;
18638 goto do_subreg;
18639
18640 default:
18641 gcc_unreachable ();
18642 }
18643
18644 switch (vmode)
18645 {
18646 case E_V8DFmode:
18647 case E_V8DImode:
18648 mmode = QImode;
18649 break;
18650 case E_V16SFmode:
18651 case E_V16SImode:
18652 mmode = HImode;
18653 break;
18654 case E_V32HImode:
18655 mmode = SImode;
18656 break;
18657 case E_V64QImode:
18658 mmode = DImode;
18659 break;
18660 default:
18661 mmode = VOIDmode;
18662 }
18663
18664 if (mmode != VOIDmode)
18665 maskop = force_reg (mmode, gen_int_mode (mask, mmode));
18666 else
18667 maskop = GEN_INT (mask);
18668
18669 /* This matches five different patterns with the different modes. */
18670 x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
18671 x = gen_rtx_SET (target, x);
18672 emit_insn (x);
18673 if (target != d->target)
18674 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
18675
18676 return true;
18677 }
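/* For reference, a worked example of the mask computation above: a
   V8SFmode blend with

     perm = { 0, 9, 2, 11, 4, 13, 6, 15 }

   takes the odd-numbered elements from op1, so mask becomes
   0b10101010 = 0xaa and a single blendps-style VEC_MERGE with that
   immediate is emitted.  The narrower integer element modes are first
   re-expressed in a wider mode (V8HImode, V8SImode, ...) when the
   elements move in pairs or quadruplets, so that an immediate
   pblendw/vpblendd can be used instead of pblendvb with a constant
   mask vector.  */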
18678
18679 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
18680 in terms of the variable form of vpermilps.
18681
18682 Note that we will have already failed the immediate input vpermilps,
18683 which requires that the high and low part shuffle be identical; the
18684 variable form doesn't require that. */
18685
18686 static bool
18687 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
18688 {
18689 rtx rperm[8], vperm;
18690 unsigned i;
18691
18692 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
18693 return false;
18694
18695 /* We can only permute within the 128-bit lane. */
18696 for (i = 0; i < 8; ++i)
18697 {
18698 unsigned e = d->perm[i];
18699 if (i < 4 ? e >= 4 : e < 4)
18700 return false;
18701 }
18702
18703 if (d->testing_p)
18704 return true;
18705
18706 for (i = 0; i < 8; ++i)
18707 {
18708 unsigned e = d->perm[i];
18709
18710 /* Within each 128-bit lane, the elements of op0 are numbered
18711 from 0 and the elements of op1 are numbered from 4. */
18712 if (e >= 8 + 4)
18713 e -= 8;
18714 else if (e >= 4)
18715 e -= 4;
18716
18717 rperm[i] = GEN_INT (e);
18718 }
18719
18720 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
18721 vperm = force_reg (V8SImode, vperm);
18722 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
18723
18724 return true;
18725 }
18726
18727 /* For V*[QHS]Imode permutations, check whether the same permutation
18728 can be performed in a 2x, 4x or 8x wider inner mode, and if so describe it in *ND. */
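
/* E.g. the V16QImode permutation
   { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 } moves bytes in
   aligned pairs, so it canonicalizes to the V8HImode permutation
   { 1, 0, 3, 2, 5, 4, 7, 6 }, which cannot be widened further because its
   first element is odd.  */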
18729
18730 static bool
18731 canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
18732 struct expand_vec_perm_d *nd)
18733 {
18734 int i;
18735 machine_mode mode = VOIDmode;
18736
18737 switch (d->vmode)
18738 {
18739 case E_V8QImode: mode = V4HImode; break;
18740 case E_V16QImode: mode = V8HImode; break;
18741 case E_V32QImode: mode = V16HImode; break;
18742 case E_V64QImode: mode = V32HImode; break;
18743 case E_V4HImode: mode = V2SImode; break;
18744 case E_V8HImode: mode = V4SImode; break;
18745 case E_V16HImode: mode = V8SImode; break;
18746 case E_V32HImode: mode = V16SImode; break;
18747 case E_V4SImode: mode = V2DImode; break;
18748 case E_V8SImode: mode = V4DImode; break;
18749 case E_V16SImode: mode = V8DImode; break;
18750 default: return false;
18751 }
18752 for (i = 0; i < d->nelt; i += 2)
18753 if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
18754 return false;
18755 nd->vmode = mode;
18756 nd->nelt = d->nelt / 2;
18757 for (i = 0; i < nd->nelt; i++)
18758 nd->perm[i] = d->perm[2 * i] / 2;
18759 if (GET_MODE_INNER (mode) != DImode)
18760 canonicalize_vector_int_perm (nd, nd);
18761 if (nd != d)
18762 {
18763 nd->one_operand_p = d->one_operand_p;
18764 nd->testing_p = d->testing_p;
18765 if (d->op0 == d->op1)
18766 nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
18767 else
18768 {
18769 nd->op0 = gen_lowpart (nd->vmode, d->op0);
18770 nd->op1 = gen_lowpart (nd->vmode, d->op1);
18771 }
18772 if (d->testing_p)
18773 nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
18774 else
18775 nd->target = gen_reg_rtx (nd->vmode);
18776 }
18777 return true;
18778 }
18779
18780 /* Return true if permutation D can be performed as VMODE permutation
18781 instead. */
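
/* E.g. for a V16QImode permutation checked against V4SImode, CHUNK is 4,
   so each group of four byte indices must start at a multiple of 4 and be
   consecutive: { 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11 }
   is valid, because it is really the V4SImode permutation { 1, 0, 3, 2 }.  */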
18782
18783 static bool
18784 valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
18785 {
18786 unsigned int i, j, chunk;
18787
18788 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
18789 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
18790 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
18791 return false;
18792
18793 if (GET_MODE_NUNITS (vmode) >= d->nelt)
18794 return true;
18795
18796 chunk = d->nelt / GET_MODE_NUNITS (vmode);
18797 for (i = 0; i < d->nelt; i += chunk)
18798 if (d->perm[i] & (chunk - 1))
18799 return false;
18800 else
18801 for (j = 1; j < chunk; ++j)
18802 if (d->perm[i] + j != d->perm[i + j])
18803 return false;
18804
18805 return true;
18806 }
18807
18808 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
18809 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
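
/* Roughly: the two-operand cases are handled with XOP vpperm (16 bytes and
   below) or vperm2i128 (32 bytes), while the one-operand cases prefer the
   wider-element variable permutes (vpermq, vpermd, vpermps) when the
   permutation allows and otherwise fall back to [v]pshufb, which cannot
   move bytes across 128-bit lanes.  */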
18810
18811 static bool
18812 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
18813 {
18814 unsigned i, nelt, eltsz, mask;
18815 unsigned char perm[64];
18816 machine_mode vmode;
18817 struct expand_vec_perm_d nd;
18818 rtx rperm[64], vperm, target, op0, op1;
18819
18820 nelt = d->nelt;
18821
18822 if (!d->one_operand_p)
18823 switch (GET_MODE_SIZE (d->vmode))
18824 {
18825 case 4:
18826 if (!TARGET_XOP)
18827 return false;
18828 vmode = V4QImode;
18829 break;
18830
18831 case 8:
18832 if (!TARGET_XOP)
18833 return false;
18834 vmode = V8QImode;
18835 break;
18836
18837 case 16:
18838 if (!TARGET_XOP)
18839 return false;
18840 vmode = V16QImode;
18841 break;
18842
18843 case 32:
18844 if (!TARGET_AVX2)
18845 return false;
18846
18847 if (valid_perm_using_mode_p (V2TImode, d))
18848 {
18849 if (d->testing_p)
18850 return true;
18851
18852 /* Use vperm2i128 insn. The pattern uses
18853 V4DImode instead of V2TImode. */
18854 target = d->target;
18855 if (d->vmode != V4DImode)
18856 target = gen_reg_rtx (V4DImode);
18857 op0 = gen_lowpart (V4DImode, d->op0);
18858 op1 = gen_lowpart (V4DImode, d->op1);
18859 rperm[0]
18860 = GEN_INT ((d->perm[0] / (nelt / 2))
18861 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
18862 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
18863 if (target != d->target)
18864 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
18865 return true;
18866 }
18867 /* FALLTHRU */
18868
18869 default:
18870 return false;
18871 }
18872 else
18873 switch (GET_MODE_SIZE (d->vmode))
18874 {
18875 case 4:
18876 if (!TARGET_SSSE3)
18877 return false;
18878 vmode = V4QImode;
18879 break;
18880
18881 case 8:
18882 if (!TARGET_SSSE3)
18883 return false;
18884 vmode = V8QImode;
18885 break;
18886
18887 case 16:
18888 if (!TARGET_SSSE3)
18889 return false;
18890 vmode = V16QImode;
18891 break;
18892
18893 case 32:
18894 if (!TARGET_AVX2)
18895 return false;
18896
18897 /* V4DImode should be already handled through
18898 expand_vselect by vpermq instruction. */
18899 gcc_assert (d->vmode != V4DImode);
18900
18901 vmode = V32QImode;
18902 if (d->vmode == V8SImode
18903 || d->vmode == V16HImode
18904 || d->vmode == V32QImode)
18905 {
18906 /* First see if vpermq can be used for
18907 V8SImode/V16HImode/V32QImode. */
18908 if (valid_perm_using_mode_p (V4DImode, d))
18909 {
18910 for (i = 0; i < 4; i++)
18911 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
18912 if (d->testing_p)
18913 return true;
18914 target = gen_reg_rtx (V4DImode);
18915 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
18916 perm, 4, false))
18917 {
18918 emit_move_insn (d->target,
18919 gen_lowpart (d->vmode, target));
18920 return true;
18921 }
18922 return false;
18923 }
18924
18925 /* Next see if vpermd can be used. */
18926 if (valid_perm_using_mode_p (V8SImode, d))
18927 vmode = V8SImode;
18928 }
18929 /* Or if vpermps can be used. */
18930 else if (d->vmode == V8SFmode)
18931 vmode = V8SImode;
18932
18933 if (vmode == V32QImode)
18934 {
18935 /* vpshufb only works intra lanes; it is not
18936 possible to shuffle bytes between the lanes. */
18937 for (i = 0; i < nelt; ++i)
18938 if ((d->perm[i] ^ i) & (nelt / 2))
18939 return false;
18940 }
18941 break;
18942
18943 case 64:
18944 if (!TARGET_AVX512BW)
18945 return false;
18946
18947 /* If vpermq didn't work, vpshufb won't work either. */
18948 if (d->vmode == V8DFmode || d->vmode == V8DImode)
18949 return false;
18950
18951 vmode = V64QImode;
18952 if (d->vmode == V16SImode
18953 || d->vmode == V32HImode
18954 || d->vmode == V64QImode)
18955 {
18956 /* First see if vpermq can be used for
18957 V16SImode/V32HImode/V64QImode. */
18958 if (valid_perm_using_mode_p (V8DImode, d))
18959 {
18960 for (i = 0; i < 8; i++)
18961 perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
18962 if (d->testing_p)
18963 return true;
18964 target = gen_reg_rtx (V8DImode);
18965 if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
18966 perm, 8, false))
18967 {
18968 emit_move_insn (d->target,
18969 gen_lowpart (d->vmode, target));
18970 return true;
18971 }
18972 return false;
18973 }
18974
18975 /* Next see if vpermd can be used. */
18976 if (valid_perm_using_mode_p (V16SImode, d))
18977 vmode = V16SImode;
18978 }
18979 /* Or if vpermps can be used. */
18980 else if (d->vmode == V16SFmode)
18981 vmode = V16SImode;
18982
18983 if (vmode == V64QImode)
18984 {
18985 /* vpshufb only works intra lanes; it is not
18986 possible to shuffle bytes between the lanes. */
18987 for (i = 0; i < nelt; ++i)
18988 if ((d->perm[i] ^ i) & (3 * nelt / 4))
18989 return false;
18990 }
18991 break;
18992
18993 default:
18994 return false;
18995 }
18996
18997 if (d->testing_p)
18998 return true;
18999
19000 /* Try to avoid variable permutation instruction. */
19001 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
19002 {
19003 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
19004 return true;
19005 }
19006
19007 if (vmode == V8SImode)
19008 for (i = 0; i < 8; ++i)
19009 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
19010 else if (vmode == V16SImode)
19011 for (i = 0; i < 16; ++i)
19012 rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
19013 else
19014 {
19015 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
19016 if (!d->one_operand_p)
19017 mask = 2 * nelt - 1;
19018 else if (vmode == V64QImode)
19019 mask = nelt / 4 - 1;
19020 else if (vmode == V32QImode)
19021 mask = nelt / 2 - 1;
19022 else
19023 mask = nelt - 1;
19024
19025 for (i = 0; i < nelt; ++i)
19026 {
19027 unsigned j, e = d->perm[i] & mask;
19028 for (j = 0; j < eltsz; ++j)
19029 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
19030 }
19031 }
19032
19033 machine_mode vpmode = vmode;
19034
19035 nelt = GET_MODE_SIZE (vmode);
19036
19037 /* Emulate narrow modes with V16QI instructions. */
19038 if (nelt < 16)
19039 {
19040 rtx m128 = GEN_INT (-128);
19041
19042 /* Remap elements from the second operand, as we have to
19043 account for inactive top elements from the first operand. */
19044 if (!d->one_operand_p)
19045 {
19046 for (i = 0; i < nelt; ++i)
19047 {
19048 unsigned ival = UINTVAL (rperm[i]);
19049 if (ival >= nelt)
19050 rperm[i] = GEN_INT (ival + 16 - nelt);
19051 }
19052 }
19053
19054 /* Fill inactive elements in the top positions with zeros. */
19055 for (i = nelt; i < 16; ++i)
19056 rperm[i] = m128;
19057
19058 vpmode = V16QImode;
19059 }
19060
19061 vperm = gen_rtx_CONST_VECTOR (vpmode,
19062 gen_rtvec_v (GET_MODE_NUNITS (vpmode), rperm));
19063 vperm = force_reg (vpmode, vperm);
19064
19065 if (vmode == d->vmode)
19066 target = d->target;
19067 else
19068 target = gen_reg_rtx (vmode);
19069
19070 op0 = gen_lowpart (vmode, d->op0);
19071
19072 if (d->one_operand_p)
19073 {
19074 rtx (*gen) (rtx, rtx, rtx);
19075
19076 if (vmode == V4QImode)
19077 gen = gen_mmx_pshufbv4qi3;
19078 else if (vmode == V8QImode)
19079 gen = gen_mmx_pshufbv8qi3;
19080 else if (vmode == V16QImode)
19081 gen = gen_ssse3_pshufbv16qi3;
19082 else if (vmode == V32QImode)
19083 gen = gen_avx2_pshufbv32qi3;
19084 else if (vmode == V64QImode)
19085 gen = gen_avx512bw_pshufbv64qi3;
19086 else if (vmode == V8SFmode)
19087 gen = gen_avx2_permvarv8sf;
19088 else if (vmode == V8SImode)
19089 gen = gen_avx2_permvarv8si;
19090 else if (vmode == V16SFmode)
19091 gen = gen_avx512f_permvarv16sf;
19092 else if (vmode == V16SImode)
19093 gen = gen_avx512f_permvarv16si;
19094 else
19095 gcc_unreachable ();
19096
19097 emit_insn (gen (target, op0, vperm));
19098 }
19099 else
19100 {
19101 rtx (*gen) (rtx, rtx, rtx, rtx);
19102
19103 op1 = gen_lowpart (vmode, d->op1);
19104
19105 if (vmode == V4QImode)
19106 gen = gen_mmx_ppermv32;
19107 else if (vmode == V8QImode)
19108 gen = gen_mmx_ppermv64;
19109 else if (vmode == V16QImode)
19110 gen = gen_xop_pperm;
19111 else
19112 gcc_unreachable ();
19113
19114 emit_insn (gen (target, op0, op1, vperm));
19115 }
19116
19117 if (target != d->target)
19118 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
19119
19120 return true;
19121 }
19122
19123 /* Try to expand one-operand permutation with constant mask. */
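/* This uses the AVX-512 variable-permute instructions with a constant
   selector loaded into a register: vpermd/vpermps/vpermq/vpermpd for
   dword/qword elements, and, when AVX512BW or AVX512VBMI is available,
   vpermw or vpermb for word or byte elements (with AVX512VL additionally
   required for vectors shorter than 512 bits).  */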
19124
19125 static bool
19126 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
19127 {
19128 machine_mode mode = GET_MODE (d->op0);
19129 machine_mode maskmode = mode;
19130 unsigned inner_size = GET_MODE_SIZE (GET_MODE_INNER (mode));
19131 rtx (*gen) (rtx, rtx, rtx) = NULL;
19132 rtx target, op0, mask;
19133 rtx vec[64];
19134
19135 if (!rtx_equal_p (d->op0, d->op1))
19136 return false;
19137
19138 if (!TARGET_AVX512F)
19139 return false;
19140
19141 /* Accept VNxHImode and VNxQImode now. */
19142 if (!TARGET_AVX512VL && GET_MODE_SIZE (mode) < 64)
19143 return false;
19144
19145 /* vpermw. */
19146 if (!TARGET_AVX512BW && inner_size == 2)
19147 return false;
19148
19149 /* vpermb. */
19150 if (!TARGET_AVX512VBMI && inner_size == 1)
19151 return false;
19152
19153 switch (mode)
19154 {
19155 case E_V16SImode:
19156 gen = gen_avx512f_permvarv16si;
19157 break;
19158 case E_V16SFmode:
19159 gen = gen_avx512f_permvarv16sf;
19160 maskmode = V16SImode;
19161 break;
19162 case E_V8DImode:
19163 gen = gen_avx512f_permvarv8di;
19164 break;
19165 case E_V8DFmode:
19166 gen = gen_avx512f_permvarv8df;
19167 maskmode = V8DImode;
19168 break;
19169 case E_V32HImode:
19170 gen = gen_avx512bw_permvarv32hi;
19171 break;
19172 case E_V16HImode:
19173 gen = gen_avx512vl_permvarv16hi;
19174 break;
19175 case E_V8HImode:
19176 gen = gen_avx512vl_permvarv8hi;
19177 break;
19178 case E_V64QImode:
19179 gen = gen_avx512bw_permvarv64qi;
19180 break;
19181 case E_V32QImode:
19182 gen = gen_avx512vl_permvarv32qi;
19183 break;
19184 case E_V16QImode:
19185 gen = gen_avx512vl_permvarv16qi;
19186 break;
19187
19188 default:
19189 return false;
19190 }
19191
19192 if (d->testing_p)
19193 return true;
19194
19195 target = d->target;
19196 op0 = d->op0;
19197 for (int i = 0; i < d->nelt; ++i)
19198 vec[i] = GEN_INT (d->perm[i]);
19199 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
19200 emit_insn (gen (target, op0, force_reg (maskmode, mask)));
19201 return true;
19202 }
19203
19204 static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
19205
19206 /* A subroutine of ix86_expand_vec_perm_const_1. Try to instantiate D
19207 in a single instruction. */
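
/* The strategies below are tried in order: a plain VEC_SELECT (including
   the identity and broadcast special cases), interleave-style SEL+CONCAT
   forms, movss/movsd, the general two-operand SEL+CONCAT, variable blends,
   vpermilps, the pshufb/vpperm/vperm2i128 family, a single vpalignr, the
   AVX-512 one-operand vperm* and two-operand vpermt2 forms, and finally a
   retry of the whole routine in a wider integer element mode.  */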
19208
19209 static bool
19210 expand_vec_perm_1 (struct expand_vec_perm_d *d)
19211 {
19212 unsigned i, nelt = d->nelt;
19213 struct expand_vec_perm_d nd;
19214
19215 /* Check plain VEC_SELECT first, because AVX has instructions that could
19216 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
19217 input where SEL+CONCAT may not. */
19218 if (d->one_operand_p)
19219 {
19220 int mask = nelt - 1;
19221 bool identity_perm = true;
19222 bool broadcast_perm = true;
19223
19224 for (i = 0; i < nelt; i++)
19225 {
19226 nd.perm[i] = d->perm[i] & mask;
19227 if (nd.perm[i] != i)
19228 identity_perm = false;
19229 if (nd.perm[i])
19230 broadcast_perm = false;
19231 }
19232
19233 if (identity_perm)
19234 {
19235 if (!d->testing_p)
19236 emit_move_insn (d->target, d->op0);
19237 return true;
19238 }
19239 else if (broadcast_perm && TARGET_AVX2)
19240 {
19241 /* Use vpbroadcast{b,w,d}. */
19242 rtx (*gen) (rtx, rtx) = NULL;
19243 switch (d->vmode)
19244 {
19245 case E_V64QImode:
19246 if (TARGET_AVX512BW)
19247 gen = gen_avx512bw_vec_dupv64qi_1;
19248 break;
19249 case E_V32QImode:
19250 gen = gen_avx2_pbroadcastv32qi_1;
19251 break;
19252 case E_V32HImode:
19253 if (TARGET_AVX512BW)
19254 gen = gen_avx512bw_vec_dupv32hi_1;
19255 break;
19256 case E_V16HImode:
19257 gen = gen_avx2_pbroadcastv16hi_1;
19258 break;
19259 case E_V16SImode:
19260 if (TARGET_AVX512F)
19261 gen = gen_avx512f_vec_dupv16si_1;
19262 break;
19263 case E_V8SImode:
19264 gen = gen_avx2_pbroadcastv8si_1;
19265 break;
19266 case E_V16QImode:
19267 gen = gen_avx2_pbroadcastv16qi;
19268 break;
19269 case E_V8HImode:
19270 gen = gen_avx2_pbroadcastv8hi;
19271 break;
19272 case E_V16SFmode:
19273 if (TARGET_AVX512F)
19274 gen = gen_avx512f_vec_dupv16sf_1;
19275 break;
19276 case E_V8SFmode:
19277 gen = gen_avx2_vec_dupv8sf_1;
19278 break;
19279 case E_V8DFmode:
19280 if (TARGET_AVX512F)
19281 gen = gen_avx512f_vec_dupv8df_1;
19282 break;
19283 case E_V8DImode:
19284 if (TARGET_AVX512F)
19285 gen = gen_avx512f_vec_dupv8di_1;
19286 break;
19287 /* For other modes prefer other shuffles this function creates. */
19288 default: break;
19289 }
19290 if (gen != NULL)
19291 {
19292 if (!d->testing_p)
19293 emit_insn (gen (d->target, d->op0));
19294 return true;
19295 }
19296 }
19297
19298 if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
19299 return true;
19300
19301 /* There are plenty of patterns in sse.md that are written for
19302 SEL+CONCAT and are not replicated for a single op. Perhaps
19303 that should be changed, to avoid the nastiness here. */
19304
19305 /* Recognize interleave style patterns, which means incrementing
19306 every other permutation operand. */
19307 for (i = 0; i < nelt; i += 2)
19308 {
19309 nd.perm[i] = d->perm[i] & mask;
19310 nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
19311 }
19312 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
19313 d->testing_p))
19314 return true;
19315
19316 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
19317 if (nelt >= 4)
19318 {
19319 for (i = 0; i < nelt; i += 4)
19320 {
19321 nd.perm[i + 0] = d->perm[i + 0] & mask;
19322 nd.perm[i + 1] = d->perm[i + 1] & mask;
19323 nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
19324 nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
19325 }
19326
19327 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
19328 d->testing_p))
19329 return true;
19330 }
19331 }
19332
19333 /* Try movss/movsd instructions. */
19334 if (expand_vec_perm_movs (d))
19335 return true;
19336
19337 /* Finally, try the fully general two operand permute. */
19338 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
19339 d->testing_p))
19340 return true;
19341
19342 /* Recognize interleave style patterns with reversed operands. */
19343 if (!d->one_operand_p)
19344 {
19345 for (i = 0; i < nelt; ++i)
19346 {
19347 unsigned e = d->perm[i];
19348 if (e >= nelt)
19349 e -= nelt;
19350 else
19351 e += nelt;
19352 nd.perm[i] = e;
19353 }
19354
19355 if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
19356 d->testing_p))
19357 return true;
19358 }
19359
19360 /* Try the SSE4.1 blend variable merge instructions. */
19361 if (expand_vec_perm_blend (d))
19362 return true;
19363
19364 /* Try one of the AVX vpermil variable permutations. */
19365 if (expand_vec_perm_vpermil (d))
19366 return true;
19367
19368 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
19369 vpshufb, vpermd, vpermps or vpermq variable permutation. */
19370 if (expand_vec_perm_pshufb (d))
19371 return true;
19372
19373 /* Try the AVX2 vpalignr instruction. */
19374 if (expand_vec_perm_palignr (d, true))
19375 return true;
19376
19377 /* Try the AVX512F vperm{w,b,s,d} instructions. */
19378 if (ix86_expand_vec_one_operand_perm_avx512 (d))
19379 return true;
19380
19381 /* Try the AVX512F vpermt2/vpermi2 instructions. */
19382 if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
19383 return true;
19384
19385 /* See if we can get the same permutation in different vector integer
19386 mode. */
19387 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
19388 {
19389 if (!d->testing_p)
19390 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
19391 return true;
19392 }
19393 return false;
19394 }
19395
19396 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
19397 in terms of a pair of pshuflw + pshufhw instructions. */
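
/* For example, the V8HImode permutation { 2, 0, 3, 1, 5, 7, 4, 6 } keeps
   elements 0-3 in the low quadword and 4-7 in the high quadword, so it can
   be done as a pshuflw reordering the low words followed by a pshufhw
   reordering the high words.  */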
19398
19399 static bool
19400 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
19401 {
19402 unsigned char perm2[MAX_VECT_LEN];
19403 unsigned i;
19404 bool ok;
19405
19406 if (d->vmode != V8HImode || !d->one_operand_p)
19407 return false;
19408
19409 /* The two permutations only operate in 64-bit lanes. */
19410 for (i = 0; i < 4; ++i)
19411 if (d->perm[i] >= 4)
19412 return false;
19413 for (i = 4; i < 8; ++i)
19414 if (d->perm[i] < 4)
19415 return false;
19416
19417 if (d->testing_p)
19418 return true;
19419
19420 /* Emit the pshuflw. */
19421 memcpy (perm2, d->perm, 4);
19422 for (i = 4; i < 8; ++i)
19423 perm2[i] = i;
19424 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
19425 gcc_assert (ok);
19426
19427 /* Emit the pshufhw. */
19428 memcpy (perm2 + 4, d->perm + 4, 4);
19429 for (i = 0; i < 4; ++i)
19430 perm2[i] = i;
19431 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
19432 gcc_assert (ok);
19433
19434 return true;
19435 }
19436
19437 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
19438 the permutation using the SSSE3 palignr instruction. This succeeds
19439 when all of the elements in PERM fit within one vector and we merely
19440 need to shift them down so that a single vector permutation has a
19441 chance to succeed. If SINGLE_INSN_ONLY_P, succeed only if
19442 the vpalignr instruction itself can perform the requested permutation. */
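
/* For example, a V16QImode permutation with d->perm[i] == i + 5 has
   MIN == 5 and MAX == 20, so a single palignr by 5 bytes on the
   concatenation of the two operands already yields the elements in order
   (the degenerate IN_ORDER case below).  */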
19443
19444 static bool
19445 expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
19446 {
19447 unsigned i, nelt = d->nelt;
19448 unsigned min, max, minswap, maxswap;
19449 bool in_order, ok, swap = false;
19450 rtx shift, target;
19451 struct expand_vec_perm_d dcopy;
19452
19453 /* Even with AVX, palignr only operates on 128-bit vectors;
19454 in AVX2, palignr operates within each of the two 128-bit lanes. */
19455 if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
19456 && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
19457 return false;
19458
19459 min = 2 * nelt;
19460 max = 0;
19461 minswap = 2 * nelt;
19462 maxswap = 0;
19463 for (i = 0; i < nelt; ++i)
19464 {
19465 unsigned e = d->perm[i];
19466 unsigned eswap = d->perm[i] ^ nelt;
19467 if (GET_MODE_SIZE (d->vmode) == 32)
19468 {
19469 e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
19470 eswap = e ^ (nelt / 2);
19471 }
19472 if (e < min)
19473 min = e;
19474 if (e > max)
19475 max = e;
19476 if (eswap < minswap)
19477 minswap = eswap;
19478 if (eswap > maxswap)
19479 maxswap = eswap;
19480 }
19481 if (min == 0
19482 || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
19483 {
19484 if (d->one_operand_p
19485 || minswap == 0
19486 || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
19487 ? nelt / 2 : nelt))
19488 return false;
19489 swap = true;
19490 min = minswap;
19491 max = maxswap;
19492 }
19493
19494 /* Given that we have SSSE3, we know we'll be able to implement the
19495 single operand permutation after the palignr with pshufb for
19496 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
19497 first. */
19498 if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
19499 return true;
19500
19501 dcopy = *d;
19502 if (swap)
19503 {
19504 dcopy.op0 = d->op1;
19505 dcopy.op1 = d->op0;
19506 for (i = 0; i < nelt; ++i)
19507 dcopy.perm[i] ^= nelt;
19508 }
19509
19510 in_order = true;
19511 for (i = 0; i < nelt; ++i)
19512 {
19513 unsigned e = dcopy.perm[i];
19514 if (GET_MODE_SIZE (d->vmode) == 32
19515 && e >= nelt
19516 && (e & (nelt / 2 - 1)) < min)
19517 e = e - min - (nelt / 2);
19518 else
19519 e = e - min;
19520 if (e != i)
19521 in_order = false;
19522 dcopy.perm[i] = e;
19523 }
19524 dcopy.one_operand_p = true;
19525
19526 if (single_insn_only_p && !in_order)
19527 return false;
19528
19529 /* For AVX2, test whether we can permute the result in one instruction. */
19530 if (d->testing_p)
19531 {
19532 if (in_order)
19533 return true;
19534 dcopy.op1 = dcopy.op0;
19535 return expand_vec_perm_1 (&dcopy);
19536 }
19537
19538 shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
19539 if (GET_MODE_SIZE (d->vmode) == 16)
19540 {
19541 target = gen_reg_rtx (TImode);
19542 emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
19543 gen_lowpart (TImode, dcopy.op0), shift));
19544 }
19545 else
19546 {
19547 target = gen_reg_rtx (V2TImode);
19548 emit_insn (gen_avx2_palignrv2ti (target,
19549 gen_lowpart (V2TImode, dcopy.op1),
19550 gen_lowpart (V2TImode, dcopy.op0),
19551 shift));
19552 }
19553
19554 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
19555
19556 /* Test for the degenerate case where the alignment by itself
19557 produces the desired permutation. */
19558 if (in_order)
19559 {
19560 emit_move_insn (d->target, dcopy.op0);
19561 return true;
19562 }
19563
19564 ok = expand_vec_perm_1 (&dcopy);
19565 gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
19566
19567 return ok;
19568 }
19569
19570 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
19571 the permutation using the SSE4_1 pblendv instruction. Potentially
19572 reduces the permutation from 2 pshufb insns and an ior to 1 pshufb and a pblendv. */
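
/* Roughly: all out-of-place elements must come from the same operand;
   a one-operand shuffle of that operand moves them into their final
   positions, and a blend then merges that result with the other,
   untouched operand.  */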
19573
19574 static bool
19575 expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
19576 {
19577 unsigned i, which, nelt = d->nelt;
19578 struct expand_vec_perm_d dcopy, dcopy1;
19579 machine_mode vmode = d->vmode;
19580 bool ok;
19581
19582 /* Use the same checks as in expand_vec_perm_blend. */
19583 if (d->one_operand_p)
19584 return false;
19585 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
19586 ;
19587 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
19588 ;
19589 else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 4
19590 || GET_MODE_SIZE (vmode) == 8
19591 || GET_MODE_SIZE (vmode) == 16))
19592 ;
19593 else
19594 return false;
19595
19596 /* Figure out which permutation elements do not stay in their
19597 respective lanes. */
19598 for (i = 0, which = 0; i < nelt; ++i)
19599 {
19600 unsigned e = d->perm[i];
19601 if (e != i)
19602 which |= (e < nelt ? 1 : 2);
19603 }
19604 /* We can pblend the part where elements do not stay in their
19605 respective lanes only when those elements are all in one
19606 half of the permutation.
19607 {0 1 8 3 4 5 9 7} is ok: 8 and 9 are not in their respective
19608 lanes, but both are >= 8.
19609 {0 1 8 3 4 5 2 7} is not ok: 2 and 8 are not in their
19610 respective lanes, and 8 >= 8 but 2 is not. */
19611 if (which != 1 && which != 2)
19612 return false;
19613 if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
19614 return true;
19615
19616 /* First we apply a one-operand permutation to the part whose
19617 elements do not stay in their respective lanes. */
19618 dcopy = *d;
19619 if (which == 2)
19620 dcopy.op0 = dcopy.op1 = d->op1;
19621 else
19622 dcopy.op0 = dcopy.op1 = d->op0;
19623 if (!d->testing_p)
19624 dcopy.target = gen_reg_rtx (vmode);
19625 dcopy.one_operand_p = true;
19626
19627 for (i = 0; i < nelt; ++i)
19628 dcopy.perm[i] = d->perm[i] & (nelt - 1);
19629
19630 ok = expand_vec_perm_1 (&dcopy);
19631 if (GET_MODE_SIZE (vmode) != 16 && !ok)
19632 return false;
19633 else
19634 gcc_assert (ok);
19635 if (d->testing_p)
19636 return true;
19637
19638 /* Next we put permuted elements into their positions. */
19639 dcopy1 = *d;
19640 if (which == 2)
19641 dcopy1.op1 = dcopy.target;
19642 else
19643 dcopy1.op0 = dcopy.target;
19644
19645 for (i = 0; i < nelt; ++i)
19646 dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
19647
19648 ok = expand_vec_perm_blend (&dcopy1);
19649 gcc_assert (ok);
19650
19651 return true;
19652 }
19653
19654 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
19655
19656 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
19657 a two vector permutation into a single vector permutation by using
19658 an interleave operation to merge the vectors. */
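
/* For example, the V4SImode permutation { 0, 4, 1, 5 } draws only from the
   low halves of the two inputs, so a single punpckldq performs the whole
   permutation and the final remap below becomes the identity.  */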
19659
19660 static bool
19661 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
19662 {
19663 struct expand_vec_perm_d dremap, dfinal;
19664 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
19665 unsigned HOST_WIDE_INT contents;
19666 unsigned char remap[2 * MAX_VECT_LEN];
19667 rtx_insn *seq;
19668 bool ok, same_halves = false;
19669
19670 if (GET_MODE_SIZE (d->vmode) == 4
19671 || GET_MODE_SIZE (d->vmode) == 8
19672 || GET_MODE_SIZE (d->vmode) == 16)
19673 {
19674 if (d->one_operand_p)
19675 return false;
19676 }
19677 else if (GET_MODE_SIZE (d->vmode) == 32)
19678 {
19679 if (!TARGET_AVX)
19680 return false;
19681 /* For 32-byte modes allow even d->one_operand_p.
19682 The lack of cross-lane shuffling in some instructions
19683 might prevent a single insn shuffle. */
19684 dfinal = *d;
19685 dfinal.testing_p = true;
19686 /* If expand_vec_perm_interleave3 can expand this into
19687 a 3 insn sequence, give up and let it be expanded as
19688 a 3 insn sequence. While that is one insn longer,
19689 it doesn't need a memory operand, and in the common
19690 case where both the interleave low and interleave high
19691 permutations with the same operands are adjacent, the
19692 pair needs only 4 insns after CSE. */
19693 if (expand_vec_perm_interleave3 (&dfinal))
19694 return false;
19695 }
19696 else
19697 return false;
19698
19699 /* Examine from whence the elements come. */
19700 contents = 0;
19701 for (i = 0; i < nelt; ++i)
19702 contents |= HOST_WIDE_INT_1U << d->perm[i];
19703
19704 memset (remap, 0xff, sizeof (remap));
19705 dremap = *d;
19706
19707 if (GET_MODE_SIZE (d->vmode) == 4
19708 || GET_MODE_SIZE (d->vmode) == 8)
19709 {
19710 unsigned HOST_WIDE_INT h1, h2, h3, h4;
19711
19712 /* Split the two input vectors into 4 halves. */
19713 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
19714 h2 = h1 << nelt2;
19715 h3 = h2 << nelt2;
19716 h4 = h3 << nelt2;
19717
19718 /* If all the elements come from the low halves, use interleave
19719 low; similarly for interleave high. */
19720 if ((contents & (h1 | h3)) == contents)
19721 {
19722 /* punpckl* */
19723 for (i = 0; i < nelt2; ++i)
19724 {
19725 remap[i] = i * 2;
19726 remap[i + nelt] = i * 2 + 1;
19727 dremap.perm[i * 2] = i;
19728 dremap.perm[i * 2 + 1] = i + nelt;
19729 }
19730 }
19731 else if ((contents & (h2 | h4)) == contents)
19732 {
19733 /* punpckh* */
19734 for (i = 0; i < nelt2; ++i)
19735 {
19736 remap[i + nelt2] = i * 2;
19737 remap[i + nelt + nelt2] = i * 2 + 1;
19738 dremap.perm[i * 2] = i + nelt2;
19739 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
19740 }
19741 }
19742 else
19743 return false;
19744 }
19745 else if (GET_MODE_SIZE (d->vmode) == 16)
19746 {
19747 unsigned HOST_WIDE_INT h1, h2, h3, h4;
19748
19749 /* Split the two input vectors into 4 halves. */
19750 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
19751 h2 = h1 << nelt2;
19752 h3 = h2 << nelt2;
19753 h4 = h3 << nelt2;
19754
19755 /* If all the elements come from the low halves, use interleave low;
19756 similarly for interleave high. If the elements are from mis-matched
19757 halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
19758 if ((contents & (h1 | h3)) == contents)
19759 {
19760 /* punpckl* */
19761 for (i = 0; i < nelt2; ++i)
19762 {
19763 remap[i] = i * 2;
19764 remap[i + nelt] = i * 2 + 1;
19765 dremap.perm[i * 2] = i;
19766 dremap.perm[i * 2 + 1] = i + nelt;
19767 }
19768 if (!TARGET_SSE2 && d->vmode == V4SImode)
19769 dremap.vmode = V4SFmode;
19770 }
19771 else if ((contents & (h2 | h4)) == contents)
19772 {
19773 /* punpckh* */
19774 for (i = 0; i < nelt2; ++i)
19775 {
19776 remap[i + nelt2] = i * 2;
19777 remap[i + nelt + nelt2] = i * 2 + 1;
19778 dremap.perm[i * 2] = i + nelt2;
19779 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
19780 }
19781 if (!TARGET_SSE2 && d->vmode == V4SImode)
19782 dremap.vmode = V4SFmode;
19783 }
19784 else if ((contents & (h1 | h4)) == contents)
19785 {
19786 /* shufps */
19787 for (i = 0; i < nelt2; ++i)
19788 {
19789 remap[i] = i;
19790 remap[i + nelt + nelt2] = i + nelt2;
19791 dremap.perm[i] = i;
19792 dremap.perm[i + nelt2] = i + nelt + nelt2;
19793 }
19794 if (nelt != 4)
19795 {
19796 /* shufpd */
19797 dremap.vmode = V2DImode;
19798 dremap.nelt = 2;
19799 dremap.perm[0] = 0;
19800 dremap.perm[1] = 3;
19801 }
19802 }
19803 else if ((contents & (h2 | h3)) == contents)
19804 {
19805 /* shufps */
19806 for (i = 0; i < nelt2; ++i)
19807 {
19808 remap[i + nelt2] = i;
19809 remap[i + nelt] = i + nelt2;
19810 dremap.perm[i] = i + nelt2;
19811 dremap.perm[i + nelt2] = i + nelt;
19812 }
19813 if (nelt != 4)
19814 {
19815 /* shufpd */
19816 dremap.vmode = V2DImode;
19817 dremap.nelt = 2;
19818 dremap.perm[0] = 1;
19819 dremap.perm[1] = 2;
19820 }
19821 }
19822 else
19823 return false;
19824 }
19825 else
19826 {
19827 unsigned int nelt4 = nelt / 4, nzcnt = 0;
19828 unsigned HOST_WIDE_INT q[8];
19829 unsigned int nonzero_halves[4];
19830
19831 /* Split the two input vectors into 8 quarters. */
19832 q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
19833 for (i = 1; i < 8; ++i)
19834 q[i] = q[0] << (nelt4 * i);
19835 for (i = 0; i < 4; ++i)
19836 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
19837 {
19838 nonzero_halves[nzcnt] = i;
19839 ++nzcnt;
19840 }
19841
19842 if (nzcnt == 1)
19843 {
19844 gcc_assert (d->one_operand_p);
19845 nonzero_halves[1] = nonzero_halves[0];
19846 same_halves = true;
19847 }
19848 else if (d->one_operand_p)
19849 {
19850 gcc_assert (nonzero_halves[0] == 0);
19851 gcc_assert (nonzero_halves[1] == 1);
19852 }
19853
19854 if (nzcnt <= 2)
19855 {
19856 if (d->perm[0] / nelt2 == nonzero_halves[1])
19857 {
19858 /* Attempt to increase the likelihood that dfinal
19859 shuffle will be intra-lane. */
19860 std::swap (nonzero_halves[0], nonzero_halves[1]);
19861 }
19862
19863 /* vperm2f128 or vperm2i128. */
19864 for (i = 0; i < nelt2; ++i)
19865 {
19866 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
19867 remap[i + nonzero_halves[0] * nelt2] = i;
19868 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
19869 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
19870 }
19871
19872 if (d->vmode != V8SFmode
19873 && d->vmode != V4DFmode
19874 && d->vmode != V8SImode)
19875 {
19876 dremap.vmode = V8SImode;
19877 dremap.nelt = 8;
19878 for (i = 0; i < 4; ++i)
19879 {
19880 dremap.perm[i] = i + nonzero_halves[0] * 4;
19881 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
19882 }
19883 }
19884 }
19885 else if (d->one_operand_p)
19886 return false;
19887 else if (TARGET_AVX2
19888 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
19889 {
19890 /* vpunpckl* */
19891 for (i = 0; i < nelt4; ++i)
19892 {
19893 remap[i] = i * 2;
19894 remap[i + nelt] = i * 2 + 1;
19895 remap[i + nelt2] = i * 2 + nelt2;
19896 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
19897 dremap.perm[i * 2] = i;
19898 dremap.perm[i * 2 + 1] = i + nelt;
19899 dremap.perm[i * 2 + nelt2] = i + nelt2;
19900 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
19901 }
19902 }
19903 else if (TARGET_AVX2
19904 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
19905 {
19906 /* vpunpckh* */
19907 for (i = 0; i < nelt4; ++i)
19908 {
19909 remap[i + nelt4] = i * 2;
19910 remap[i + nelt + nelt4] = i * 2 + 1;
19911 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
19912 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
19913 dremap.perm[i * 2] = i + nelt4;
19914 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
19915 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
19916 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
19917 }
19918 }
19919 else
19920 return false;
19921 }
19922
19923 /* Use the remapping array set up above to move the elements from their
19924 swizzled locations into their final destinations. */
19925 dfinal = *d;
19926 for (i = 0; i < nelt; ++i)
19927 {
19928 unsigned e = remap[d->perm[i]];
19929 gcc_assert (e < nelt);
19930 /* If same_halves is true, both halves of the remapped vector are the
19931 same. Avoid cross-lane accesses if possible. */
19932 if (same_halves && i >= nelt2)
19933 {
19934 gcc_assert (e < nelt2);
19935 dfinal.perm[i] = e + nelt2;
19936 }
19937 else
19938 dfinal.perm[i] = e;
19939 }
19940 if (!d->testing_p)
19941 {
19942 dremap.target = gen_reg_rtx (dremap.vmode);
19943 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
19944 }
19945 dfinal.op1 = dfinal.op0;
19946 dfinal.one_operand_p = true;
19947
19948 /* Test if the final remap can be done with a single insn. For V4SFmode or
19949 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
19950 start_sequence ();
19951 ok = expand_vec_perm_1 (&dfinal);
19952 seq = get_insns ();
19953 end_sequence ();
19954
19955 if (!ok)
19956 return false;
19957
19958 if (d->testing_p)
19959 return true;
19960
19961 if (dremap.vmode != dfinal.vmode)
19962 {
19963 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
19964 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
19965 }
19966
19967 ok = expand_vec_perm_1 (&dremap);
19968 gcc_assert (ok);
19969
19970 emit_insn (seq);
19971 return true;
19972 }
19973
19974 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
19975 a single vector cross-lane permutation into vpermq followed
19976 by any of the single insn permutations. */
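
/* The requirement checked below is that each half of the result uses at
   most two of the four 64-bit quarters of the input; a vpermq then gathers
   those quarters into the matching 128-bit lane and a single in-lane
   shuffle finishes the permutation.  */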
19977
19978 static bool
19979 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
19980 {
19981 struct expand_vec_perm_d dremap, dfinal;
19982 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
19983 unsigned contents[2];
19984 bool ok;
19985
19986 if (!(TARGET_AVX2
19987 && (d->vmode == V32QImode || d->vmode == V16HImode)
19988 && d->one_operand_p))
19989 return false;
19990
19991 contents[0] = 0;
19992 contents[1] = 0;
19993 for (i = 0; i < nelt2; ++i)
19994 {
19995 contents[0] |= 1u << (d->perm[i] / nelt4);
19996 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
19997 }
19998
19999 for (i = 0; i < 2; ++i)
20000 {
20001 unsigned int cnt = 0;
20002 for (j = 0; j < 4; ++j)
20003 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
20004 return false;
20005 }
20006
20007 if (d->testing_p)
20008 return true;
20009
20010 dremap = *d;
20011 dremap.vmode = V4DImode;
20012 dremap.nelt = 4;
20013 dremap.target = gen_reg_rtx (V4DImode);
20014 dremap.op0 = gen_lowpart (V4DImode, d->op0);
20015 dremap.op1 = dremap.op0;
20016 dremap.one_operand_p = true;
20017 for (i = 0; i < 2; ++i)
20018 {
20019 unsigned int cnt = 0;
20020 for (j = 0; j < 4; ++j)
20021 if ((contents[i] & (1u << j)) != 0)
20022 dremap.perm[2 * i + cnt++] = j;
20023 for (; cnt < 2; ++cnt)
20024 dremap.perm[2 * i + cnt] = 0;
20025 }
20026
20027 dfinal = *d;
20028 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
20029 dfinal.op1 = dfinal.op0;
20030 dfinal.one_operand_p = true;
20031 for (i = 0, j = 0; i < nelt; ++i)
20032 {
20033 if (i == nelt2)
20034 j = 2;
20035 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
20036 if ((d->perm[i] / nelt4) == dremap.perm[j])
20037 ;
20038 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
20039 dfinal.perm[i] |= nelt4;
20040 else
20041 gcc_unreachable ();
20042 }
20043
20044 ok = expand_vec_perm_1 (&dremap);
20045 gcc_assert (ok);
20046
20047 ok = expand_vec_perm_1 (&dfinal);
20048 gcc_assert (ok);
20049
20050 return true;
20051 }
20052
20053 static bool canonicalize_perm (struct expand_vec_perm_d *d);
20054
20055 /* A subroutine of ix86_expand_vec_perm_const_1. Try to expand
20056 a vector permutation using two instructions, vperm2f128 resp.
20057 vperm2i128 followed by any single in-lane permutation. */
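
/* The loop below tries the candidate lane-selection immediates for the
   vperm2[fi]128; a candidate is accepted when the remaining work is a
   single in-lane shuffle combining the vperm2[fi]128 result with one of
   the original operands.  */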
20058
20059 static bool
20060 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
20061 {
20062 struct expand_vec_perm_d dfirst, dsecond;
20063 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
20064 bool ok;
20065
20066 if (!TARGET_AVX
20067 || GET_MODE_SIZE (d->vmode) != 32
20068 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
20069 return false;
20070
20071 dsecond = *d;
20072 dsecond.one_operand_p = false;
20073 dsecond.testing_p = true;
20074
20075 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
20076 immediate. For perm < 16 the second permutation uses
20077 d->op0 as first operand, for perm >= 16 it uses d->op1
20078 as first operand. The second operand is the result of
20079 vperm2[fi]128. */
20080 for (perm = 0; perm < 32; perm++)
20081 {
20082 /* Ignore permutations which do not move anything cross-lane. */
20083 if (perm < 16)
20084 {
20085 /* The second shuffle for e.g. V4DFmode has
20086 0123 and ABCD operands.
20087 Ignore AB23, as 23 is already in the second lane
20088 of the first operand. */
20089 if ((perm & 0xc) == (1 << 2)) continue;
20090 /* And 01CD, as 01 is in the first lane of the first
20091 operand. */
20092 if ((perm & 3) == 0) continue;
20093 /* And 4567, as then the vperm2[fi]128 doesn't change
20094 anything on the original 4567 second operand. */
20095 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
20096 }
20097 else
20098 {
20099 /* The second shuffle for e.g. V4DFmode has
20100 4567 and ABCD operands.
20101 Ignore AB67, as 67 is already in the second lane
20102 of the first operand. */
20103 if ((perm & 0xc) == (3 << 2)) continue;
20104 /* And 45CD, as 45 is in the first lane of the first
20105 operand. */
20106 if ((perm & 3) == 2) continue;
20107 /* And 0123, as then the vperm2[fi]128 doesn't change
20108 anything on the original 0123 first operand. */
20109 if ((perm & 0xf) == (1 << 2)) continue;
20110 }
20111
20112 for (i = 0; i < nelt; i++)
20113 {
20114 j = d->perm[i] / nelt2;
20115 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
20116 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
20117 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
20118 dsecond.perm[i] = d->perm[i] & (nelt - 1);
20119 else
20120 break;
20121 }
20122
20123 if (i == nelt)
20124 {
20125 start_sequence ();
20126 ok = expand_vec_perm_1 (&dsecond);
20127 end_sequence ();
20128 }
20129 else
20130 ok = false;
20131
20132 if (ok)
20133 {
20134 if (d->testing_p)
20135 return true;
20136
20137 /* Found a usable second shuffle. dfirst will be
20138 vperm2f128 on d->op0 and d->op1. */
20139 dsecond.testing_p = false;
20140 dfirst = *d;
20141 dfirst.target = gen_reg_rtx (d->vmode);
20142 for (i = 0; i < nelt; i++)
20143 dfirst.perm[i] = (i & (nelt2 - 1))
20144 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
20145
20146 canonicalize_perm (&dfirst);
20147 ok = expand_vec_perm_1 (&dfirst);
20148 gcc_assert (ok);
20149
20150 /* And dsecond is some single insn shuffle, taking
20151 d->op0 and result of vperm2f128 (if perm < 16) or
20152 d->op1 and result of vperm2f128 (otherwise). */
20153 if (perm >= 16)
20154 dsecond.op0 = dsecond.op1;
20155 dsecond.op1 = dfirst.target;
20156
20157 ok = expand_vec_perm_1 (&dsecond);
20158 gcc_assert (ok);
20159
20160 return true;
20161 }
20162
20163 /* For one operand, the only useful vperm2f128 permutation is 0x01,
20164 i.e. swapping the lanes. */
20165 if (d->one_operand_p)
20166 return false;
20167 }
20168
20169 return false;
20170 }
20171
20172 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
20173 a two vector permutation using 2 intra-lane interleave insns
20174 and cross-lane shuffle for 32-byte vectors. */
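
/* E.g. for V8SImode the permutation { 4, 12, 5, 13, 6, 14, 7, 15 } matches
   the interleave-high form (d->perm[0] == nelt / 2), while
   { 0, 8, 1, 9, 2, 10, 3, 11 } matches the interleave-low form.  */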
20175
20176 static bool
20177 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
20178 {
20179 unsigned i, nelt;
20180 rtx (*gen) (rtx, rtx, rtx);
20181
20182 if (d->one_operand_p)
20183 return false;
20184 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
20185 ;
20186 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
20187 ;
20188 else
20189 return false;
20190
20191 nelt = d->nelt;
20192 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
20193 return false;
20194 for (i = 0; i < nelt; i += 2)
20195 if (d->perm[i] != d->perm[0] + i / 2
20196 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
20197 return false;
20198
20199 if (d->testing_p)
20200 return true;
20201
20202 switch (d->vmode)
20203 {
20204 case E_V32QImode:
20205 if (d->perm[0])
20206 gen = gen_vec_interleave_highv32qi;
20207 else
20208 gen = gen_vec_interleave_lowv32qi;
20209 break;
20210 case E_V16HImode:
20211 if (d->perm[0])
20212 gen = gen_vec_interleave_highv16hi;
20213 else
20214 gen = gen_vec_interleave_lowv16hi;
20215 break;
20216 case E_V8SImode:
20217 if (d->perm[0])
20218 gen = gen_vec_interleave_highv8si;
20219 else
20220 gen = gen_vec_interleave_lowv8si;
20221 break;
20222 case E_V4DImode:
20223 if (d->perm[0])
20224 gen = gen_vec_interleave_highv4di;
20225 else
20226 gen = gen_vec_interleave_lowv4di;
20227 break;
20228 case E_V8SFmode:
20229 if (d->perm[0])
20230 gen = gen_vec_interleave_highv8sf;
20231 else
20232 gen = gen_vec_interleave_lowv8sf;
20233 break;
20234 case E_V4DFmode:
20235 if (d->perm[0])
20236 gen = gen_vec_interleave_highv4df;
20237 else
20238 gen = gen_vec_interleave_lowv4df;
20239 break;
20240 default:
20241 gcc_unreachable ();
20242 }
20243
20244 emit_insn (gen (d->target, d->op0, d->op1));
20245 return true;
20246 }
20247
20248 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
20249 a single vector permutation using a single intra-lane vector
20250 permutation, vperm2f128 swapping the lanes and vblend* insn blending
20251 the non-swapped and swapped vectors together. */
20252
20253 static bool
20254 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
20255 {
20256 struct expand_vec_perm_d dfirst, dsecond;
20257 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
20258 rtx_insn *seq;
20259 bool ok;
20260 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
20261
20262 if (!TARGET_AVX
20263 || TARGET_AVX2
20264 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
20265 || !d->one_operand_p)
20266 return false;
20267
20268 dfirst = *d;
20269 for (i = 0; i < nelt; i++)
20270 dfirst.perm[i] = 0xff;
20271 for (i = 0, msk = 0; i < nelt; i++)
20272 {
20273 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
20274 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
20275 return false;
20276 dfirst.perm[j] = d->perm[i];
20277 if (j != i)
20278 msk |= (1 << i);
20279 }
20280 for (i = 0; i < nelt; i++)
20281 if (dfirst.perm[i] == 0xff)
20282 dfirst.perm[i] = i;
20283
20284 if (!d->testing_p)
20285 dfirst.target = gen_reg_rtx (dfirst.vmode);
20286
20287 start_sequence ();
20288 ok = expand_vec_perm_1 (&dfirst);
20289 seq = get_insns ();
20290 end_sequence ();
20291
20292 if (!ok)
20293 return false;
20294
20295 if (d->testing_p)
20296 return true;
20297
20298 emit_insn (seq);
20299
20300 dsecond = *d;
20301 dsecond.op0 = dfirst.target;
20302 dsecond.op1 = dfirst.target;
20303 dsecond.one_operand_p = true;
20304 dsecond.target = gen_reg_rtx (dsecond.vmode);
20305 for (i = 0; i < nelt; i++)
20306 dsecond.perm[i] = i ^ nelt2;
20307
20308 ok = expand_vec_perm_1 (&dsecond);
20309 gcc_assert (ok);
20310
20311 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
20312 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
20313 return true;
20314 }
20315
20316 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
20317 a two vector permutation using two single vector permutations and
20318 {,v}{,p}unpckl{ps,pd,bw,wd,dq}. If two_insn, succeed only if one
20319 of dfirst or dsecond is an identity permutation. */
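
/* The element test below requires that the result alternates between the
   two operands (even positions from one operand, odd positions from the
   other); each operand is then pre-shuffled so that a single unpckl-style
   interleave produces the final vector.  */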
20320
20321 static bool
20322 expand_vec_perm_2perm_interleave (struct expand_vec_perm_d *d, bool two_insn)
20323 {
20324 unsigned i, nelt = d->nelt, nelt2 = nelt / 2, lane = nelt;
20325 struct expand_vec_perm_d dfirst, dsecond, dfinal;
20326 bool ident1 = true, ident2 = true;
20327
20328 if (d->one_operand_p)
20329 return false;
20330
20331 if (GET_MODE_SIZE (d->vmode) == 16)
20332 {
20333 if (!TARGET_SSE)
20334 return false;
20335 if (d->vmode != V4SFmode && d->vmode != V2DFmode && !TARGET_SSE2)
20336 return false;
20337 }
20338 else if (GET_MODE_SIZE (d->vmode) == 32)
20339 {
20340 if (!TARGET_AVX)
20341 return false;
20342 if (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2)
20343 return false;
20344 lane = nelt2;
20345 }
20346 else
20347 return false;
20348
20349 for (i = 1; i < nelt; i++)
20350 if ((d->perm[i] >= nelt) != ((d->perm[0] >= nelt) ^ (i & 1)))
20351 return false;
20352
20353 dfirst = *d;
20354 dsecond = *d;
20355 dfinal = *d;
20356 dfirst.op1 = dfirst.op0;
20357 dfirst.one_operand_p = true;
20358 dsecond.op0 = dsecond.op1;
20359 dsecond.one_operand_p = true;
20360
20361 for (i = 0; i < nelt; i++)
20362 if (d->perm[i] >= nelt)
20363 {
20364 dsecond.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i] - nelt;
20365 if (d->perm[i] - nelt != i / 2 + (i >= lane ? lane / 2 : 0))
20366 ident2 = false;
20367 dsecond.perm[i / 2 + (i >= lane ? lane : lane / 2)]
20368 = d->perm[i] - nelt;
20369 }
20370 else
20371 {
20372 dfirst.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i];
20373 if (d->perm[i] != i / 2 + (i >= lane ? lane / 2 : 0))
20374 ident1 = false;
20375 dfirst.perm[i / 2 + (i >= lane ? lane : lane / 2)] = d->perm[i];
20376 }
20377
20378 if (two_insn && !ident1 && !ident2)
20379 return false;
20380
20381 if (!d->testing_p)
20382 {
20383 if (!ident1)
20384 dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
20385 if (!ident2)
20386 dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
20387 if (d->perm[0] >= nelt)
20388 std::swap (dfinal.op0, dfinal.op1);
20389 }
20390
20391 bool ok;
20392 rtx_insn *seq1 = NULL, *seq2 = NULL;
20393
20394 if (!ident1)
20395 {
20396 start_sequence ();
20397 ok = expand_vec_perm_1 (&dfirst);
20398 seq1 = get_insns ();
20399 end_sequence ();
20400
20401 if (!ok)
20402 return false;
20403 }
20404
20405 if (!ident2)
20406 {
20407 start_sequence ();
20408 ok = expand_vec_perm_1 (&dsecond);
20409 seq2 = get_insns ();
20410 end_sequence ();
20411
20412 if (!ok)
20413 return false;
20414 }
20415
20416 if (d->testing_p)
20417 return true;
20418
20419 for (i = 0; i < nelt; i++)
20420 {
20421 dfinal.perm[i] = i / 2;
20422 if (i >= lane)
20423 dfinal.perm[i] += lane / 2;
20424 if ((i & 1) != 0)
20425 dfinal.perm[i] += nelt;
20426 }
20427 emit_insn (seq1);
20428 emit_insn (seq2);
20429 ok = expand_vselect_vconcat (dfinal.target, dfinal.op0, dfinal.op1,
20430 dfinal.perm, dfinal.nelt, false);
20431 gcc_assert (ok);
20432 return true;
20433 }
20434
20435 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
20436 the permutation using two single vector permutations and the SSE4_1 pblendv
20437 instruction. If two_insn, succeed only if one of dfirst or dsecond is
20438 an identity permutation. */
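
/* Roughly: elements taken from op0 are first moved into their final
   positions by a one-operand shuffle of op0, likewise for op1, and a
   constant blend then picks, position by position, which of the two
   shuffled vectors supplies the result.  */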
20439
20440 static bool
20441 expand_vec_perm_2perm_pblendv (struct expand_vec_perm_d *d, bool two_insn)
20442 {
20443 unsigned i, nelt = d->nelt;
20444 struct expand_vec_perm_d dfirst, dsecond, dfinal;
20445 machine_mode vmode = d->vmode;
20446 bool ident1 = true, ident2 = true;
20447
20448 /* Use the same checks as in expand_vec_perm_blend. */
20449 if (d->one_operand_p)
20450 return false;
20451 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
20452 ;
20453 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
20454 ;
20455 else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 16
20456 || GET_MODE_SIZE (vmode) == 8
20457 || GET_MODE_SIZE (vmode) == 4))
20458 ;
20459 else
20460 return false;
20461
20462 dfirst = *d;
20463 dsecond = *d;
20464 dfinal = *d;
20465 dfirst.op1 = dfirst.op0;
20466 dfirst.one_operand_p = true;
20467 dsecond.op0 = dsecond.op1;
20468 dsecond.one_operand_p = true;
20469
20470 for (i = 0; i < nelt; ++i)
20471 if (d->perm[i] >= nelt)
20472 {
20473 dfirst.perm[i] = 0xff;
20474 dsecond.perm[i] = d->perm[i] - nelt;
20475 if (d->perm[i] != i + nelt)
20476 ident2 = false;
20477 }
20478 else
20479 {
20480 dsecond.perm[i] = 0xff;
20481 dfirst.perm[i] = d->perm[i];
20482 if (d->perm[i] != i)
20483 ident1 = false;
20484 }
20485
20486 if (two_insn && !ident1 && !ident2)
20487 return false;
20488
20489 /* For now. Ideally treat 0xff as a wildcard. */
20490 for (i = 0; i < nelt; ++i)
20491 if (dfirst.perm[i] == 0xff)
20492 {
20493 if (GET_MODE_SIZE (vmode) == 32
20494 && dfirst.perm[i ^ (nelt / 2)] != 0xff)
20495 dfirst.perm[i] = dfirst.perm[i ^ (nelt / 2)] ^ (nelt / 2);
20496 else
20497 dfirst.perm[i] = i;
20498 }
20499 else
20500 {
20501 if (GET_MODE_SIZE (vmode) == 32
20502 && dsecond.perm[i ^ (nelt / 2)] != 0xff)
20503 dsecond.perm[i] = dsecond.perm[i ^ (nelt / 2)] ^ (nelt / 2);
20504 else
20505 dsecond.perm[i] = i;
20506 }
20507
20508 if (!d->testing_p)
20509 {
20510 if (!ident1)
20511 dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
20512 if (!ident2)
20513 dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
20514 }
20515
20516 bool ok;
20517 rtx_insn *seq1 = NULL, *seq2 = NULL;
20518
20519 if (!ident1)
20520 {
20521 start_sequence ();
20522 ok = expand_vec_perm_1 (&dfirst);
20523 seq1 = get_insns ();
20524 end_sequence ();
20525
20526 if (!ok)
20527 return false;
20528 }
20529
20530 if (!ident2)
20531 {
20532 start_sequence ();
20533 ok = expand_vec_perm_1 (&dsecond);
20534 seq2 = get_insns ();
20535 end_sequence ();
20536
20537 if (!ok)
20538 return false;
20539 }
20540
20541 if (d->testing_p)
20542 return true;
20543
20544 for (i = 0; i < nelt; ++i)
20545 dfinal.perm[i] = (d->perm[i] >= nelt ? i + nelt : i);
20546
20547 emit_insn (seq1);
20548 emit_insn (seq2);
20549 ok = expand_vec_perm_blend (&dfinal);
20550 gcc_assert (ok);
20551 return true;
20552 }
20553
20554 /* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF
20555 permutation using two vperm2f128, followed by a vshufpd insn blending
20556 the two vectors together. */
20557
20558 static bool
20559 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
20560 {
20561 struct expand_vec_perm_d dfirst, dsecond, dthird;
20562 bool ok;
20563
20564 if (!TARGET_AVX || (d->vmode != V4DFmode))
20565 return false;
20566
20567 if (d->testing_p)
20568 return true;
20569
20570 dfirst = *d;
20571 dsecond = *d;
20572 dthird = *d;
20573
20574 dfirst.perm[0] = (d->perm[0] & ~1);
20575 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
20576 dfirst.perm[2] = (d->perm[2] & ~1);
20577 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
20578 dsecond.perm[0] = (d->perm[1] & ~1);
20579 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
20580 dsecond.perm[2] = (d->perm[3] & ~1);
20581 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
20582 dthird.perm[0] = (d->perm[0] % 2);
20583 dthird.perm[1] = (d->perm[1] % 2) + 4;
20584 dthird.perm[2] = (d->perm[2] % 2) + 2;
20585 dthird.perm[3] = (d->perm[3] % 2) + 6;
20586
20587 dfirst.target = gen_reg_rtx (dfirst.vmode);
20588 dsecond.target = gen_reg_rtx (dsecond.vmode);
20589 dthird.op0 = dfirst.target;
20590 dthird.op1 = dsecond.target;
20591 dthird.one_operand_p = false;
20592
20593 canonicalize_perm (&dfirst);
20594 canonicalize_perm (&dsecond);
20595
20596 ok = expand_vec_perm_1 (&dfirst)
20597 && expand_vec_perm_1 (&dsecond)
20598 && expand_vec_perm_1 (&dthird);
20599
20600 gcc_assert (ok);
20601
20602 return true;
20603 }
20604
20605 static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *);
20606
20607 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
20608 a two vector permutation using two intra-lane vector
20609 permutations, vperm2f128 swapping the lanes and vblend* insn blending
20610 the non-swapped and swapped vectors together. */
20611
20612 static bool
20613 expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d *d)
20614 {
20615 struct expand_vec_perm_d dfirst, dsecond, dthird;
20616 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2, which1 = 0, which2 = 0;
20617 rtx_insn *seq1, *seq2;
20618 bool ok;
20619 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
20620
20621 if (!TARGET_AVX
20622 || TARGET_AVX2
20623 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
20624 || d->one_operand_p)
20625 return false;
20626
20627 dfirst = *d;
20628 dsecond = *d;
20629 for (i = 0; i < nelt; i++)
20630 {
20631 dfirst.perm[i] = 0xff;
20632 dsecond.perm[i] = 0xff;
20633 }
20634 for (i = 0, msk = 0; i < nelt; i++)
20635 {
20636 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
20637 if (j == i)
20638 {
20639 dfirst.perm[j] = d->perm[i];
20640 which1 |= (d->perm[i] < nelt ? 1 : 2);
20641 }
20642 else
20643 {
20644 dsecond.perm[j] = d->perm[i];
20645 which2 |= (d->perm[i] < nelt ? 1 : 2);
20646 msk |= (1U << i);
20647 }
20648 }
20649 if (msk == 0 || msk == (1U << nelt) - 1)
20650 return false;
20651
20652 if (!d->testing_p)
20653 {
20654 dfirst.target = gen_reg_rtx (dfirst.vmode);
20655 dsecond.target = gen_reg_rtx (dsecond.vmode);
20656 }
20657
20658 for (i = 0; i < nelt; i++)
20659 {
20660 if (dfirst.perm[i] == 0xff)
20661 dfirst.perm[i] = (which1 == 2 ? i + nelt : i);
20662 if (dsecond.perm[i] == 0xff)
20663 dsecond.perm[i] = (which2 == 2 ? i + nelt : i);
20664 }
20665 canonicalize_perm (&dfirst);
20666 start_sequence ();
20667 ok = ix86_expand_vec_perm_const_1 (&dfirst);
20668 seq1 = get_insns ();
20669 end_sequence ();
20670
20671 if (!ok)
20672 return false;
20673
20674 canonicalize_perm (&dsecond);
20675 start_sequence ();
20676 ok = ix86_expand_vec_perm_const_1 (&dsecond);
20677 seq2 = get_insns ();
20678 end_sequence ();
20679
20680 if (!ok)
20681 return false;
20682
20683 if (d->testing_p)
20684 return true;
20685
20686 emit_insn (seq1);
20687 emit_insn (seq2);
20688
20689 dthird = *d;
20690 dthird.op0 = dsecond.target;
20691 dthird.op1 = dsecond.target;
20692 dthird.one_operand_p = true;
20693 dthird.target = gen_reg_rtx (dthird.vmode);
20694 for (i = 0; i < nelt; i++)
20695 dthird.perm[i] = i ^ nelt2;
20696
20697 ok = expand_vec_perm_1 (&dthird);
20698 gcc_assert (ok);
20699
20700 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
20701 emit_insn (blend (d->target, dfirst.target, dthird.target, GEN_INT (msk)));
20702 return true;
20703 }
20704
20705 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
20706 permutation with two pshufb insns and an ior. We should have already
20707 failed all two instruction sequences. */
20708
20709 static bool
20710 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
20711 {
20712 rtx rperm[2][16], vperm, l, h, op, m128;
20713 unsigned int i, nelt, eltsz;
20714 machine_mode mode;
20715 rtx (*gen) (rtx, rtx, rtx);
20716
20717 if (!TARGET_SSSE3 || (GET_MODE_SIZE (d->vmode) != 16
20718 && GET_MODE_SIZE (d->vmode) != 8
20719 && GET_MODE_SIZE (d->vmode) != 4))
20720 return false;
20721 gcc_assert (!d->one_operand_p);
20722
20723 if (d->testing_p)
20724 return true;
20725
20726 switch (GET_MODE_SIZE (d->vmode))
20727 {
20728 case 4:
20729 mode = V4QImode;
20730 gen = gen_mmx_pshufbv4qi3;
20731 break;
20732 case 8:
20733 mode = V8QImode;
20734 gen = gen_mmx_pshufbv8qi3;
20735 break;
20736 case 16:
20737 mode = V16QImode;
20738 gen = gen_ssse3_pshufbv16qi3;
20739 break;
20740 default:
20741 gcc_unreachable ();
20742 }
20743
20744 nelt = d->nelt;
20745 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
20746
20747 /* Generate two permutation masks. If the required element is within
20748 the given vector it is shuffled into the proper lane. If the required
20749 element is in the other vector, force a zero into the lane by setting
20750 bit 7 in the permutation mask. */
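/* Illustrative example (not from the original sources): for
V16QImode with the selector { 0, 17, 2, 19, ... } the loop below
builds mask0 = { 0, -128, 2, -128, ... } selecting from op0 and
mask1 = { -128, 1, -128, 3, ... } selecting from op1; the final
ior merges the two partially zeroed results. */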
20751 m128 = GEN_INT (-128);
20752 for (i = 0; i < nelt; ++i)
20753 {
20754 unsigned j, k, e = d->perm[i];
20755 unsigned which = (e >= nelt);
20756 if (e >= nelt)
20757 e -= nelt;
20758
20759 for (j = 0; j < eltsz; ++j)
20760 {
20761 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
20762 rperm[1-which][i*eltsz + j] = m128;
20763 }
20764
20765 for (k = i*eltsz + j; k < 16; ++k)
20766 rperm[0][k] = rperm[1][k] = m128;
20767 }
20768
20769 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
20770 vperm = force_reg (V16QImode, vperm);
20771
20772 l = gen_reg_rtx (mode);
20773 op = gen_lowpart (mode, d->op0);
20774 emit_insn (gen (l, op, vperm));
20775
20776 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
20777 vperm = force_reg (V16QImode, vperm);
20778
20779 h = gen_reg_rtx (mode);
20780 op = gen_lowpart (mode, d->op1);
20781 emit_insn (gen (h, op, vperm));
20782
20783 op = d->target;
20784 if (d->vmode != mode)
20785 op = gen_reg_rtx (mode);
20786 ix86_emit_vec_binop (IOR, mode, op, l, h);
20787 if (op != d->target)
20788 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
20789
20790 return true;
20791 }
20792
20793 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
20794 with two vpshufb insns, vpermq and vpor. We should have already failed
20795 all two or three instruction sequences. */
20796
20797 static bool
20798 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
20799 {
20800 rtx rperm[2][32], vperm, l, h, hp, op, m128;
20801 unsigned int i, nelt, eltsz;
20802
20803 if (!TARGET_AVX2
20804 || !d->one_operand_p
20805 || (d->vmode != V32QImode && d->vmode != V16HImode))
20806 return false;
20807
20808 if (d->testing_p)
20809 return true;
20810
20811 nelt = d->nelt;
20812 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
20813
20814 /* Generate two permutation masks. If the required element is within
20815 the same lane, it is shuffled in. If the required element is in the
20816 other lane, force a zero by setting bit 7 in the permutation mask.
20817 The other mask has non-negative elements where the element is
20818 requested from the other lane, but also moved to the other lane,
20819 so that the result of vpshufb can have the two V2TImode halves
20820 swapped. */
20821 m128 = GEN_INT (-128);
20822 for (i = 0; i < nelt; ++i)
20823 {
20824 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
20825 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
20826
20827 for (j = 0; j < eltsz; ++j)
20828 {
20829 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
20830 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
20831 }
20832 }
20833
20834 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
20835 vperm = force_reg (V32QImode, vperm);
20836
20837 h = gen_reg_rtx (V32QImode);
20838 op = gen_lowpart (V32QImode, d->op0);
20839 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
20840
20841 /* Swap the 128-bit lanes of h into hp. */
20842 hp = gen_reg_rtx (V4DImode);
20843 op = gen_lowpart (V4DImode, h);
20844 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
20845 const1_rtx));
20846
20847 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
20848 vperm = force_reg (V32QImode, vperm);
20849
20850 l = gen_reg_rtx (V32QImode);
20851 op = gen_lowpart (V32QImode, d->op0);
20852 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
20853
20854 op = d->target;
20855 if (d->vmode != V32QImode)
20856 op = gen_reg_rtx (V32QImode);
20857 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
20858 if (op != d->target)
20859 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
20860
20861 return true;
20862 }
20863
20864 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
20865 and extract-odd permutations of two V32QImode or V16HImode operands
20866 with two vpshufb insns, vpor and vpermq. We should have already
20867 failed all two or three instruction sequences. */
20868
20869 static bool
20870 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
20871 {
20872 rtx rperm[2][32], vperm, l, h, ior, op, m128;
20873 unsigned int i, nelt, eltsz;
20874
20875 if (!TARGET_AVX2
20876 || d->one_operand_p
20877 || (d->vmode != V32QImode && d->vmode != V16HImode))
20878 return false;
20879
20880 for (i = 0; i < d->nelt; ++i)
20881 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
20882 return false;
20883
20884 if (d->testing_p)
20885 return true;
20886
20887 nelt = d->nelt;
20888 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
20889
20890 /* Generate two permutation masks. In the first permutation mask
20891 the first quarter will contain indexes for the first half
20892 of the op0, the second quarter will contain bit 7 set, third quarter
20893 will contain indexes for the second half of the op0 and the
20894 last quarter bit 7 set. In the second permutation mask
20895 the first quarter will contain bit 7 set, the second quarter
20896 indexes for the first half of the op1, the third quarter bit 7 set
20897 and last quarter indexes for the second half of the op1.
20898 I.e. the first mask e.g. for V32QImode extract even will be:
20899 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
20900 (all values masked with 0xf except for -128) and second mask
20901 for extract even will be
20902 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
20903 m128 = GEN_INT (-128);
20904 for (i = 0; i < nelt; ++i)
20905 {
20906 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
20907 unsigned which = d->perm[i] >= nelt;
20908 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
20909
20910 for (j = 0; j < eltsz; ++j)
20911 {
20912 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
20913 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
20914 }
20915 }
20916
20917 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
20918 vperm = force_reg (V32QImode, vperm);
20919
20920 l = gen_reg_rtx (V32QImode);
20921 op = gen_lowpart (V32QImode, d->op0);
20922 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
20923
20924 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
20925 vperm = force_reg (V32QImode, vperm);
20926
20927 h = gen_reg_rtx (V32QImode);
20928 op = gen_lowpart (V32QImode, d->op1);
20929 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
20930
20931 ior = gen_reg_rtx (V32QImode);
20932 emit_insn (gen_iorv32qi3 (ior, l, h));
20933
20934 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
20935 op = gen_reg_rtx (V4DImode);
20936 ior = gen_lowpart (V4DImode, ior);
20937 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
20938 const1_rtx, GEN_INT (3)));
20939 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
20940
20941 return true;
20942 }
20943
20944 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
20945 and extract-odd permutations of two V8QI, V8HI, V16QI, V16HI or V32QI
20946 operands with two "and" and "pack" or two "shift" and "pack" insns.
20947 We should have already failed all two instruction sequences. */
20948
20949 static bool
20950 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
20951 {
20952 rtx op, dop0, dop1, t;
20953 unsigned i, odd, c, s, nelt = d->nelt;
20954 bool end_perm = false;
20955 machine_mode half_mode;
20956 rtx (*gen_and) (rtx, rtx, rtx);
20957 rtx (*gen_pack) (rtx, rtx, rtx);
20958 rtx (*gen_shift) (rtx, rtx, rtx);
20959
20960 if (d->one_operand_p)
20961 return false;
20962
20963 switch (d->vmode)
20964 {
20965 case E_V4HImode:
20966 /* Required for "pack". */
20967 if (!TARGET_SSE4_1)
20968 return false;
20969 c = 0xffff;
20970 s = 16;
20971 half_mode = V2SImode;
20972 gen_and = gen_andv2si3;
20973 gen_pack = gen_mmx_packusdw;
20974 gen_shift = gen_lshrv2si3;
20975 break;
20976 case E_V8HImode:
20977 /* Required for "pack". */
20978 if (!TARGET_SSE4_1)
20979 return false;
20980 c = 0xffff;
20981 s = 16;
20982 half_mode = V4SImode;
20983 gen_and = gen_andv4si3;
20984 gen_pack = gen_sse4_1_packusdw;
20985 gen_shift = gen_lshrv4si3;
20986 break;
20987 case E_V8QImode:
20988 /* No check as all instructions are SSE2. */
20989 c = 0xff;
20990 s = 8;
20991 half_mode = V4HImode;
20992 gen_and = gen_andv4hi3;
20993 gen_pack = gen_mmx_packuswb;
20994 gen_shift = gen_lshrv4hi3;
20995 break;
20996 case E_V16QImode:
20997 /* No check as all instructions are SSE2. */
20998 c = 0xff;
20999 s = 8;
21000 half_mode = V8HImode;
21001 gen_and = gen_andv8hi3;
21002 gen_pack = gen_sse2_packuswb;
21003 gen_shift = gen_lshrv8hi3;
21004 break;
21005 case E_V16HImode:
21006 if (!TARGET_AVX2)
21007 return false;
21008 c = 0xffff;
21009 s = 16;
21010 half_mode = V8SImode;
21011 gen_and = gen_andv8si3;
21012 gen_pack = gen_avx2_packusdw;
21013 gen_shift = gen_lshrv8si3;
21014 end_perm = true;
21015 break;
21016 case E_V32QImode:
21017 if (!TARGET_AVX2)
21018 return false;
21019 c = 0xff;
21020 s = 8;
21021 half_mode = V16HImode;
21022 gen_and = gen_andv16hi3;
21023 gen_pack = gen_avx2_packuswb;
21024 gen_shift = gen_lshrv16hi3;
21025 end_perm = true;
21026 break;
21027 default:
21028 /* Only V4HI, V8QI, V8HI, V16QI, V16HI and V32QI modes
21029 are more profitable than general shuffles. */
21030 return false;
21031 }
21032
21033 /* Check that permutation is even or odd. */
21034 odd = d->perm[0];
21035 if (odd > 1)
21036 return false;
21037
21038 for (i = 1; i < nelt; ++i)
21039 if (d->perm[i] != 2 * i + odd)
21040 return false;
21041
21042 if (d->testing_p)
21043 return true;
21044
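/* Illustrative example: for V16QImode even extraction the inputs are
viewed as V8HImode; masking each word with 0x00ff keeps only the
even byte, and packuswb then yields
{ op0[0], op0[2], ..., op0[14], op1[0], ..., op1[14] }.
For the odd case a logical right shift by 8 replaces the mask. */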
21045 dop0 = gen_reg_rtx (half_mode);
21046 dop1 = gen_reg_rtx (half_mode);
21047 if (odd == 0)
21048 {
21049 t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
21050 t = force_reg (half_mode, t);
21051 emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
21052 emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
21053 }
21054 else
21055 {
21056 emit_insn (gen_shift (dop0,
21057 gen_lowpart (half_mode, d->op0),
21058 GEN_INT (s)));
21059 emit_insn (gen_shift (dop1,
21060 gen_lowpart (half_mode, d->op1),
21061 GEN_INT (s)));
21062 }
21063 /* In the AVX2 256-bit case we need to permute the pack result. */
21064 if (TARGET_AVX2 && end_perm)
21065 {
21066 op = gen_reg_rtx (d->vmode);
21067 t = gen_reg_rtx (V4DImode);
21068 emit_insn (gen_pack (op, dop0, dop1));
21069 emit_insn (gen_avx2_permv4di_1 (t,
21070 gen_lowpart (V4DImode, op),
21071 const0_rtx,
21072 const2_rtx,
21073 const1_rtx,
21074 GEN_INT (3)));
21075 emit_move_insn (d->target, gen_lowpart (d->vmode, t));
21076 }
21077 else
21078 emit_insn (gen_pack (d->target, dop0, dop1));
21079
21080 return true;
21081 }
21082
21083 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
21084 and extract-odd permutations of two V64QI operands
21085 with two "shifts", two "truncs" and one "concat" insns for "odd"
21086 and two "truncs" and one "concat" insn for "even".
21087 We should have already failed all two-instruction sequences. */
21088
21089 static bool
21090 expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
21091 {
21092 rtx t1, t2, t3, t4;
21093 unsigned i, odd, nelt = d->nelt;
21094
21095 if (!TARGET_AVX512BW
21096 || d->one_operand_p
21097 || d->vmode != V64QImode)
21098 return false;
21099
21100 /* Check that permutation is even or odd. */
21101 odd = d->perm[0];
21102 if (odd > 1)
21103 return false;
21104
21105 for (i = 1; i < nelt; ++i)
21106 if (d->perm[i] != 2 * i + odd)
21107 return false;
21108
21109 if (d->testing_p)
21110 return true;
21111
21112
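/* Illustratively, vpmovwb keeps the low byte of every 16-bit element,
which is exactly the even byte of each pair; for the odd case the
preceding logical right shift by 8 first moves the odd bytes into
those low byte positions. */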
21113 if (odd)
21114 {
21115 t1 = gen_reg_rtx (V32HImode);
21116 t2 = gen_reg_rtx (V32HImode);
21117 emit_insn (gen_lshrv32hi3 (t1,
21118 gen_lowpart (V32HImode, d->op0),
21119 GEN_INT (8)));
21120 emit_insn (gen_lshrv32hi3 (t2,
21121 gen_lowpart (V32HImode, d->op1),
21122 GEN_INT (8)));
21123 }
21124 else
21125 {
21126 t1 = gen_lowpart (V32HImode, d->op0);
21127 t2 = gen_lowpart (V32HImode, d->op1);
21128 }
21129
21130 t3 = gen_reg_rtx (V32QImode);
21131 t4 = gen_reg_rtx (V32QImode);
21132 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
21133 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
21134 emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
21135
21136 return true;
21137 }
21138
21139 /* A subroutine of ix86_expand_vec_perm_const_1. Implement extract-even
21140 and extract-odd permutations. */
21141
21142 static bool
21143 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
21144 {
21145 rtx t1, t2, t3, t4, t5;
21146
21147 switch (d->vmode)
21148 {
21149 case E_V4DFmode:
21150 if (d->testing_p)
21151 break;
21152 t1 = gen_reg_rtx (V4DFmode);
21153 t2 = gen_reg_rtx (V4DFmode);
21154
21155 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
21156 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
21157 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
21158
21159 /* Now an unpck[lh]pd will produce the result required. */
21160 if (odd)
21161 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
21162 else
21163 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
21164 emit_insn (t3);
21165 break;
21166
21167 case E_V8SFmode:
21168 {
21169 int mask = odd ? 0xdd : 0x88;
21170
21171 if (d->testing_p)
21172 break;
21173 t1 = gen_reg_rtx (V8SFmode);
21174 t2 = gen_reg_rtx (V8SFmode);
21175 t3 = gen_reg_rtx (V8SFmode);
21176
21177 /* Shuffle within the 128-bit lanes to produce:
21178 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
21179 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
21180 GEN_INT (mask)));
21181
21182 /* Shuffle the lanes around to produce:
21183 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
21184 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
21185 GEN_INT (0x3)));
21186
21187 /* Shuffle within the 128-bit lanes to produce:
21188 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
21189 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
21190
21191 /* Shuffle within the 128-bit lanes to produce:
21192 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
21193 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
21194
21195 /* Shuffle the lanes around to produce:
21196 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
21197 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
21198 GEN_INT (0x20)));
21199 }
21200 break;
21201
21202 case E_V2DFmode:
21203 case E_V4SFmode:
21204 case E_V2DImode:
21205 case E_V2SImode:
21206 case E_V4SImode:
21207 case E_V2HImode:
21208 /* These are always directly implementable by expand_vec_perm_1. */
21209 gcc_unreachable ();
21210
21211 case E_V2SFmode:
21212 gcc_assert (TARGET_MMX_WITH_SSE);
21213 /* We have no suitable instructions. */
21214 if (d->testing_p)
21215 return false;
21216 break;
21217
21218 case E_V4QImode:
21219 if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
21220 return expand_vec_perm_pshufb2 (d);
21221 else
21222 {
21223 if (d->testing_p)
21224 break;
21225 /* We need 2*log2(N)-1 operations to achieve odd/even
21226 with interleave. */
21227 t1 = gen_reg_rtx (V4QImode);
21228 emit_insn (gen_mmx_punpckhbw_low (t1, d->op0, d->op1));
21229 emit_insn (gen_mmx_punpcklbw_low (d->target, d->op0, d->op1));
21230 if (odd)
21231 t2 = gen_mmx_punpckhbw_low (d->target, d->target, t1);
21232 else
21233 t2 = gen_mmx_punpcklbw_low (d->target, d->target, t1);
21234 emit_insn (t2);
21235 }
21236 break;
21237
21238 case E_V4HImode:
21239 if (TARGET_SSE4_1)
21240 return expand_vec_perm_even_odd_pack (d);
21241 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
21242 return expand_vec_perm_pshufb2 (d);
21243 else
21244 {
21245 if (d->testing_p)
21246 break;
21247 /* We need 2*log2(N)-1 operations to achieve odd/even
21248 with interleave. */
21249 t1 = gen_reg_rtx (V4HImode);
21250 emit_insn (gen_mmx_punpckhwd (t1, d->op0, d->op1));
21251 emit_insn (gen_mmx_punpcklwd (d->target, d->op0, d->op1));
21252 if (odd)
21253 t2 = gen_mmx_punpckhwd (d->target, d->target, t1);
21254 else
21255 t2 = gen_mmx_punpcklwd (d->target, d->target, t1);
21256 emit_insn (t2);
21257 }
21258 break;
21259
21260 case E_V8HImode:
21261 if (TARGET_SSE4_1)
21262 return expand_vec_perm_even_odd_pack (d);
21263 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
21264 return expand_vec_perm_pshufb2 (d);
21265 else
21266 {
21267 if (d->testing_p)
21268 break;
21269 /* We need 2*log2(N)-1 operations to achieve odd/even
21270 with interleave. */
21271 t1 = gen_reg_rtx (V8HImode);
21272 t2 = gen_reg_rtx (V8HImode);
21273 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
21274 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
21275 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
21276 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
21277 if (odd)
21278 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
21279 else
21280 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
21281 emit_insn (t3);
21282 }
21283 break;
21284
21285 case E_V8QImode:
21286 case E_V16QImode:
21287 return expand_vec_perm_even_odd_pack (d);
21288
21289 case E_V16HImode:
21290 case E_V32QImode:
21291 return expand_vec_perm_even_odd_pack (d);
21292
21293 case E_V64QImode:
21294 return expand_vec_perm_even_odd_trunc (d);
21295
21296 case E_V4DImode:
21297 if (!TARGET_AVX2)
21298 {
21299 struct expand_vec_perm_d d_copy = *d;
21300 d_copy.vmode = V4DFmode;
21301 if (d->testing_p)
21302 d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
21303 else
21304 d_copy.target = gen_reg_rtx (V4DFmode);
21305 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
21306 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
21307 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
21308 {
21309 if (!d->testing_p)
21310 emit_move_insn (d->target,
21311 gen_lowpart (V4DImode, d_copy.target));
21312 return true;
21313 }
21314 return false;
21315 }
21316
21317 if (d->testing_p)
21318 break;
21319
21320 t1 = gen_reg_rtx (V4DImode);
21321 t2 = gen_reg_rtx (V4DImode);
21322
21323 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
21324 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
21325 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
21326
21327 /* Now a vpunpck[lh]qdq will produce the result required. */
21328 if (odd)
21329 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
21330 else
21331 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
21332 emit_insn (t3);
21333 break;
21334
21335 case E_V8SImode:
21336 if (!TARGET_AVX2)
21337 {
21338 struct expand_vec_perm_d d_copy = *d;
21339 d_copy.vmode = V8SFmode;
21340 if (d->testing_p)
21341 d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
21342 else
21343 d_copy.target = gen_reg_rtx (V8SFmode);
21344 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
21345 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
21346 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
21347 {
21348 if (!d->testing_p)
21349 emit_move_insn (d->target,
21350 gen_lowpart (V8SImode, d_copy.target));
21351 return true;
21352 }
21353 return false;
21354 }
21355
21356 if (d->testing_p)
21357 break;
21358
21359 t1 = gen_reg_rtx (V8SImode);
21360 t2 = gen_reg_rtx (V8SImode);
21361 t3 = gen_reg_rtx (V4DImode);
21362 t4 = gen_reg_rtx (V4DImode);
21363 t5 = gen_reg_rtx (V4DImode);
21364
21365 /* Shuffle the lanes around into
21366 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
21367 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
21368 gen_lowpart (V4DImode, d->op1),
21369 GEN_INT (0x20)));
21370 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
21371 gen_lowpart (V4DImode, d->op1),
21372 GEN_INT (0x31)));
21373
21374 /* Swap the 2nd and 3rd position in each lane into
21375 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
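/* (Illustrative note: the pshufd immediate 2 * 4 + 1 * 16 + 3 * 64
used below equals 0xd8, i.e. the within-lane element order
{ 0, 2, 1, 3 }.) */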
21376 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
21377 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
21378 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
21379 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
21380
21381 /* Now a vpunpck[lh]qdq will produce
21382 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
21383 if (odd)
21384 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
21385 gen_lowpart (V4DImode, t2));
21386 else
21387 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
21388 gen_lowpart (V4DImode, t2));
21389 emit_insn (t3);
21390 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
21391 break;
21392
21393 default:
21394 gcc_unreachable ();
21395 }
21396
21397 return true;
21398 }
21399
21400 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
21401 extract-even and extract-odd permutations. */
21402
21403 static bool
21404 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
21405 {
21406 unsigned i, odd, nelt = d->nelt;
21407
21408 odd = d->perm[0];
21409 if (odd != 0 && odd != 1)
21410 return false;
21411
21412 for (i = 1; i < nelt; ++i)
21413 if (d->perm[i] != 2 * i + odd)
21414 return false;
21415
21416 if (d->vmode == E_V32HImode
21417 && d->testing_p
21418 && !TARGET_AVX512BW)
21419 return false;
21420
21421 return expand_vec_perm_even_odd_1 (d, odd);
21422 }
21423
21424 /* A subroutine of ix86_expand_vec_perm_const_1. Implement broadcast
21425 permutations. We assume that expand_vec_perm_1 has already failed. */
21426
21427 static bool
21428 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
21429 {
21430 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
21431 machine_mode vmode = d->vmode;
21432 rtx (*gen) (rtx, rtx, rtx);
21433 unsigned char perm2[4];
21434 rtx op0 = d->op0, dest;
21435 bool ok;
21436
21437 switch (vmode)
21438 {
21439 case E_V4DFmode:
21440 case E_V8SFmode:
21441 /* These are special-cased in sse.md so that we can optionally
21442 use the vbroadcast instruction. They expand to two insns
21443 if the input happens to be in a register. */
21444 gcc_unreachable ();
21445
21446 case E_V2DFmode:
21447 case E_V2SFmode:
21448 case E_V4SFmode:
21449 case E_V2DImode:
21450 case E_V2SImode:
21451 case E_V4SImode:
21452 case E_V2HImode:
21453 case E_V4HImode:
21454 /* These are always implementable using standard shuffle patterns. */
21455 gcc_unreachable ();
21456
21457 case E_V4QImode:
21458 /* This can be implemented via interleave and pshuflw. */
21459 if (d->testing_p)
21460 return true;
21461
21462 if (elt >= nelt2)
21463 {
21464 gen = gen_mmx_punpckhbw_low;
21465 elt -= nelt2;
21466 }
21467 else
21468 gen = gen_mmx_punpcklbw_low;
21469
21470 dest = gen_reg_rtx (vmode);
21471 emit_insn (gen (dest, op0, op0));
21472 vmode = get_mode_wider_vector (vmode);
21473 op0 = gen_lowpart (vmode, dest);
21474
21475 memset (perm2, elt, 2);
21476 dest = gen_reg_rtx (vmode);
21477 ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
21478 gcc_assert (ok);
21479
21480 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
21481 return true;
21482
21483 case E_V8QImode:
21484 /* This can be implemented via interleave. We save one insn by
21485 stopping once we have promoted to V2SImode and then use pshufd. */
21486 if (d->testing_p)
21487 return true;
21488 do
21489 {
21490 if (elt >= nelt2)
21491 {
21492 gen = vmode == V8QImode ? gen_mmx_punpckhbw
21493 : gen_mmx_punpckhwd;
21494 elt -= nelt2;
21495 }
21496 else
21497 gen = vmode == V8QImode ? gen_mmx_punpcklbw
21498 : gen_mmx_punpcklwd;
21499 nelt2 /= 2;
21500
21501 dest = gen_reg_rtx (vmode);
21502 emit_insn (gen (dest, op0, op0));
21503 vmode = get_mode_wider_vector (vmode);
21504 op0 = gen_lowpart (vmode, dest);
21505 }
21506 while (vmode != V2SImode);
21507
21508 memset (perm2, elt, 2);
21509 dest = gen_reg_rtx (vmode);
21510 ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
21511 gcc_assert (ok);
21512
21513 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
21514 return true;
21515
21516 case E_V8HImode:
21517 case E_V16QImode:
21518 /* These can be implemented via interleave. We save one insn by
21519 stopping once we have promoted to V4SImode and then use pshufd. */
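/* For instance (illustrative), broadcasting element 5 of a
V16QImode vector: punpcklbw first duplicates it into word 5,
punpckhwd then duplicates that word into dword 1, and the
final pshufd replicates dword 1 across the vector. */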
21520 if (d->testing_p)
21521 return true;
21522 do
21523 {
21524 if (elt >= nelt2)
21525 {
21526 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
21527 : gen_vec_interleave_highv8hi;
21528 elt -= nelt2;
21529 }
21530 else
21531 gen = vmode == V16QImode ? gen_vec_interleave_lowv16qi
21532 : gen_vec_interleave_lowv8hi;
21533 nelt2 /= 2;
21534
21535 dest = gen_reg_rtx (vmode);
21536 emit_insn (gen (dest, op0, op0));
21537 vmode = get_mode_wider_vector (vmode);
21538 op0 = gen_lowpart (vmode, dest);
21539 }
21540 while (vmode != V4SImode);
21541
21542 memset (perm2, elt, 4);
21543 dest = gen_reg_rtx (vmode);
21544 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
21545 gcc_assert (ok);
21546
21547 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
21548 return true;
21549
21550 case E_V8HFmode:
21551 /* This can be implemented via interleave and pshufd. */
21552 if (d->testing_p)
21553 return true;
21554
21555 if (elt >= nelt2)
21556 {
21557 gen = gen_vec_interleave_highv8hf;
21558 elt -= nelt2;
21559 }
21560 else
21561 gen = gen_vec_interleave_lowv8hf;
21562 nelt2 /= 2;
21563
21564 dest = gen_reg_rtx (vmode);
21565 emit_insn (gen (dest, op0, op0));
21566
21567 vmode = V4SImode;
21568 op0 = gen_lowpart (vmode, dest);
21569
21570 memset (perm2, elt, 4);
21571 dest = gen_reg_rtx (vmode);
21572 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
21573 gcc_assert (ok);
21574
21575 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
21576 return true;
21577
21578 case E_V32QImode:
21579 case E_V16HImode:
21580 case E_V8SImode:
21581 case E_V4DImode:
21582 /* For AVX2 broadcasts of the first element vpbroadcast* or
21583 vpermq should be used by expand_vec_perm_1. */
21584 gcc_assert (!TARGET_AVX2 || d->perm[0]);
21585 return false;
21586
21587 case E_V64QImode:
21588 gcc_assert (!TARGET_AVX512BW || d->perm[0]);
21589 return false;
21590
21591 case E_V32HImode:
21592 gcc_assert (!TARGET_AVX512BW);
21593 return false;
21594
21595 default:
21596 gcc_unreachable ();
21597 }
21598 }
21599
21600 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
21601 broadcast permutations. */
21602
21603 static bool
21604 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
21605 {
21606 unsigned i, elt, nelt = d->nelt;
21607
21608 if (!d->one_operand_p)
21609 return false;
21610
21611 elt = d->perm[0];
21612 for (i = 1; i < nelt; ++i)
21613 if (d->perm[i] != elt)
21614 return false;
21615
21616 return expand_vec_perm_broadcast_1 (d);
21617 }
21618
21619 /* Implement arbitrary permutations of two V64QImode operands
21620 with 2 vperm[it]2w, 2 vpshufb and one vpor instruction. */
21621 static bool
21622 expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
21623 {
21624 if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
21625 return false;
21626
21627 if (d->testing_p)
21628 return true;
21629
21630 struct expand_vec_perm_d ds[2];
21631 rtx rperm[128], vperm, target0, target1;
21632 unsigned int i, nelt;
21633 machine_mode vmode;
21634
21635 nelt = d->nelt;
21636 vmode = V64QImode;
21637
21638 for (i = 0; i < 2; i++)
21639 {
21640 ds[i] = *d;
21641 ds[i].vmode = V32HImode;
21642 ds[i].nelt = 32;
21643 ds[i].target = gen_reg_rtx (V32HImode);
21644 ds[i].op0 = gen_lowpart (V32HImode, d->op0);
21645 ds[i].op1 = gen_lowpart (V32HImode, d->op1);
21646 }
21647
21648 /* Prepare permutations such that the first one takes care of
21649 putting the even bytes into the right positions or one position
21650 higher (ds[0]) and the second one takes care of putting the
21651 odd bytes into the right positions or one position lower
21652 (ds[1]). */
21653
21654 for (i = 0; i < nelt; i++)
21655 {
21656 ds[i & 1].perm[i / 2] = d->perm[i] / 2;
21657 if (i & 1)
21658 {
21659 rperm[i] = constm1_rtx;
21660 rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
21661 }
21662 else
21663 {
21664 rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
21665 rperm[i + 64] = constm1_rtx;
21666 }
21667 }
21668
21669 bool ok = expand_vec_perm_1 (&ds[0]);
21670 gcc_assert (ok);
21671 ds[0].target = gen_lowpart (V64QImode, ds[0].target);
21672
21673 ok = expand_vec_perm_1 (&ds[1]);
21674 gcc_assert (ok);
21675 ds[1].target = gen_lowpart (V64QImode, ds[1].target);
21676
21677 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
21678 vperm = force_reg (vmode, vperm);
21679 target0 = gen_reg_rtx (V64QImode);
21680 emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
21681
21682 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
21683 vperm = force_reg (vmode, vperm);
21684 target1 = gen_reg_rtx (V64QImode);
21685 emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
21686
21687 emit_insn (gen_iorv64qi3 (d->target, target0, target1));
21688 return true;
21689 }
21690
21691 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
21692 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
21693 all the shorter instruction sequences. */
21694
21695 static bool
21696 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
21697 {
21698 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
21699 unsigned int i, nelt, eltsz;
21700 bool used[4];
21701
21702 if (!TARGET_AVX2
21703 || d->one_operand_p
21704 || (d->vmode != V32QImode && d->vmode != V16HImode))
21705 return false;
21706
21707 if (d->testing_p)
21708 return true;
21709
21710 nelt = d->nelt;
21711 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
21712
21713 /* Generate 4 permutation masks. If the required element is within
21714 the same lane, it is shuffled in. If the required element is in the
21715 other lane, force a zero by setting bit 7 in the permutation mask.
21716 The other mask has non-negative elements where the element is
21717 requested from the other lane, but also moved to the other lane,
21718 so that the result of vpshufb can have the two V2TImode halves
21719 swapped. */
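/* As an illustration: mask 0 handles op0 bytes that stay in their
128-bit lane, mask 1 handles op0 bytes that must cross lanes (that
vpshufb result is lane-swapped by vpermq afterwards), and masks 2
and 3 play the same roles for op1; the partial results are then
combined with vpor. */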
21720 m128 = GEN_INT (-128);
21721 for (i = 0; i < 32; ++i)
21722 {
21723 rperm[0][i] = m128;
21724 rperm[1][i] = m128;
21725 rperm[2][i] = m128;
21726 rperm[3][i] = m128;
21727 }
21728 used[0] = false;
21729 used[1] = false;
21730 used[2] = false;
21731 used[3] = false;
21732 for (i = 0; i < nelt; ++i)
21733 {
21734 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
21735 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
21736 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
21737
21738 for (j = 0; j < eltsz; ++j)
21739 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
21740 used[which] = true;
21741 }
21742
21743 for (i = 0; i < 2; ++i)
21744 {
21745 if (!used[2 * i + 1])
21746 {
21747 h[i] = NULL_RTX;
21748 continue;
21749 }
21750 vperm = gen_rtx_CONST_VECTOR (V32QImode,
21751 gen_rtvec_v (32, rperm[2 * i + 1]));
21752 vperm = force_reg (V32QImode, vperm);
21753 h[i] = gen_reg_rtx (V32QImode);
21754 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
21755 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
21756 }
21757
21758 /* Swap the 128-bit lanes of h[X]. */
21759 for (i = 0; i < 2; ++i)
21760 {
21761 if (h[i] == NULL_RTX)
21762 continue;
21763 op = gen_reg_rtx (V4DImode);
21764 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
21765 const2_rtx, GEN_INT (3), const0_rtx,
21766 const1_rtx));
21767 h[i] = gen_lowpart (V32QImode, op);
21768 }
21769
21770 for (i = 0; i < 2; ++i)
21771 {
21772 if (!used[2 * i])
21773 {
21774 l[i] = NULL_RTX;
21775 continue;
21776 }
21777 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
21778 vperm = force_reg (V32QImode, vperm);
21779 l[i] = gen_reg_rtx (V32QImode);
21780 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
21781 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
21782 }
21783
21784 for (i = 0; i < 2; ++i)
21785 {
21786 if (h[i] && l[i])
21787 {
21788 op = gen_reg_rtx (V32QImode);
21789 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
21790 l[i] = op;
21791 }
21792 else if (h[i])
21793 l[i] = h[i];
21794 }
21795
21796 gcc_assert (l[0] && l[1]);
21797 op = d->target;
21798 if (d->vmode != V32QImode)
21799 op = gen_reg_rtx (V32QImode);
21800 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
21801 if (op != d->target)
21802 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
21803 return true;
21804 }
21805
21806 /* The guts of ix86_vectorize_vec_perm_const. With all of the interface bits
21807 taken care of, perform the expansion in D and return true on success. */
21808
21809 static bool
21810 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
21811 {
21812 /* Try a single instruction expansion. */
21813 if (expand_vec_perm_1 (d))
21814 return true;
21815
21816 /* Try sequences of two instructions. */
21817
21818 if (expand_vec_perm_pshuflw_pshufhw (d))
21819 return true;
21820
21821 if (expand_vec_perm_palignr (d, false))
21822 return true;
21823
21824 if (expand_vec_perm_interleave2 (d))
21825 return true;
21826
21827 if (expand_vec_perm_broadcast (d))
21828 return true;
21829
21830 if (expand_vec_perm_vpermq_perm_1 (d))
21831 return true;
21832
21833 if (expand_vec_perm_vperm2f128 (d))
21834 return true;
21835
21836 if (expand_vec_perm_pblendv (d))
21837 return true;
21838
21839 if (expand_vec_perm_2perm_interleave (d, true))
21840 return true;
21841
21842 if (expand_vec_perm_2perm_pblendv (d, true))
21843 return true;
21844
21845 /* Try sequences of three instructions. */
21846
21847 if (expand_vec_perm_even_odd_pack (d))
21848 return true;
21849
21850 if (expand_vec_perm_2vperm2f128_vshuf (d))
21851 return true;
21852
21853 if (expand_vec_perm_pshufb2 (d))
21854 return true;
21855
21856 if (expand_vec_perm_interleave3 (d))
21857 return true;
21858
21859 if (expand_vec_perm_vperm2f128_vblend (d))
21860 return true;
21861
21862 if (expand_vec_perm_2perm_interleave (d, false))
21863 return true;
21864
21865 if (expand_vec_perm_2perm_pblendv (d, false))
21866 return true;
21867
21868 /* Try sequences of four instructions. */
21869
21870 if (expand_vec_perm_even_odd_trunc (d))
21871 return true;
21872 if (expand_vec_perm_vpshufb2_vpermq (d))
21873 return true;
21874
21875 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
21876 return true;
21877
21878 if (expand_vec_perm_vpermt2_vpshub2 (d))
21879 return true;
21880
21881 /* ??? Look for narrow permutations whose element orderings would
21882 allow the promotion to a wider mode. */
21883
21884 /* ??? Look for sequences of interleave or a wider permute that place
21885 the data into the correct lanes for a half-vector shuffle like
21886 pshuf[lh]w or vpermilps. */
21887
21888 /* ??? Look for sequences of interleave that produce the desired results.
21889 The combinatorics of punpck[lh] get pretty ugly... */
21890
21891 if (expand_vec_perm_even_odd (d))
21892 return true;
21893
21894 /* Even longer sequences. */
21895 if (expand_vec_perm_vpshufb4_vpermq2 (d))
21896 return true;
21897
21898 /* See if we can get the same permutation in different vector integer
21899 mode. */
21900 struct expand_vec_perm_d nd;
21901 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
21902 {
21903 if (!d->testing_p)
21904 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
21905 return true;
21906 }
21907
21908 /* Even longer, including recursion to ix86_expand_vec_perm_const_1. */
21909 if (expand_vec_perm2_vperm2f128_vblend (d))
21910 return true;
21911
21912 return false;
21913 }
21914
21915 /* If a permutation only uses one operand, make it clear. Returns true
21916 if the permutation references both operands. */
21917
21918 static bool
21919 canonicalize_perm (struct expand_vec_perm_d *d)
21920 {
21921 int i, which, nelt = d->nelt;
21922
21923 for (i = which = 0; i < nelt; ++i)
21924 which |= (d->perm[i] < nelt ? 1 : 2);
21925
21926 d->one_operand_p = true;
21927 switch (which)
21928 {
21929 default:
21930 gcc_unreachable ();
21931
21932 case 3:
21933 if (!rtx_equal_p (d->op0, d->op1))
21934 {
21935 d->one_operand_p = false;
21936 break;
21937 }
21938 /* The elements of PERM do not suggest that only the first operand
21939 is used, but both operands are identical. Allow easier matching
21940 of the permutation by folding the permutation into the single
21941 input vector. */
21942 /* FALLTHRU */
21943
21944 case 2:
21945 for (i = 0; i < nelt; ++i)
21946 d->perm[i] &= nelt - 1;
21947 d->op0 = d->op1;
21948 break;
21949
21950 case 1:
21951 d->op1 = d->op0;
21952 break;
21953 }
21954
21955 return (which == 3);
21956 }
21957
21958 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
21959
21960 bool
21961 ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
21962 rtx op1, const vec_perm_indices &sel)
21963 {
21964 struct expand_vec_perm_d d;
21965 unsigned char perm[MAX_VECT_LEN];
21966 unsigned int i, nelt, which;
21967 bool two_args;
21968
21969 /* For an HFmode vector, convert it to an HImode vector using a subreg. */
21970 if (GET_MODE_INNER (vmode) == HFmode)
21971 {
21972 machine_mode orig_mode = vmode;
21973 vmode = mode_for_vector (HImode,
21974 GET_MODE_NUNITS (vmode)).require ();
21975 if (target)
21976 target = lowpart_subreg (vmode, target, orig_mode);
21977 if (op0)
21978 op0 = lowpart_subreg (vmode, op0, orig_mode);
21979 if (op1)
21980 op1 = lowpart_subreg (vmode, op1, orig_mode);
21981 }
21982
21983 d.target = target;
21984 d.op0 = op0;
21985 d.op1 = op1;
21986
21987 d.vmode = vmode;
21988 gcc_assert (VECTOR_MODE_P (d.vmode));
21989 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
21990 d.testing_p = !target;
21991
21992 gcc_assert (sel.length () == nelt);
21993 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
21994
21995 /* Given sufficient ISA support we can just return true here
21996 for selected vector modes. */
21997 switch (d.vmode)
21998 {
21999 case E_V16SFmode:
22000 case E_V16SImode:
22001 case E_V8DImode:
22002 case E_V8DFmode:
22003 if (!TARGET_AVX512F)
22004 return false;
22005 /* All implementable with a single vperm[it]2 insn. */
22006 if (d.testing_p)
22007 return true;
22008 break;
22009 case E_V32HImode:
22010 if (!TARGET_AVX512F)
22011 return false;
22012 if (d.testing_p && TARGET_AVX512BW)
22013 /* All implementable with a single vperm[it]2 insn. */
22014 return true;
22015 break;
22016 case E_V64QImode:
22017 if (!TARGET_AVX512F)
22018 return false;
22019 if (d.testing_p && TARGET_AVX512BW)
22020 /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */
22021 return true;
22022 break;
22023 case E_V8SImode:
22024 case E_V8SFmode:
22025 case E_V4DFmode:
22026 case E_V4DImode:
22027 if (!TARGET_AVX)
22028 return false;
22029 if (d.testing_p && TARGET_AVX512VL)
22030 /* All implementable with a single vperm[it]2 insn. */
22031 return true;
22032 break;
22033 case E_V16HImode:
22034 if (!TARGET_SSE2)
22035 return false;
22036 if (d.testing_p && TARGET_AVX2)
22037 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
22038 return true;
22039 break;
22040 case E_V32QImode:
22041 if (!TARGET_SSE2)
22042 return false;
22043 if (d.testing_p && TARGET_AVX2)
22044 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
22045 return true;
22046 break;
22047 case E_V8HImode:
22048 case E_V16QImode:
22049 if (!TARGET_SSE2)
22050 return false;
22051 /* Fall through. */
22052 case E_V4SImode:
22053 case E_V4SFmode:
22054 if (!TARGET_SSE)
22055 return false;
22056 /* All implementable with a single vpperm insn. */
22057 if (d.testing_p && TARGET_XOP)
22058 return true;
22059 /* All implementable with 2 pshufb + 1 ior. */
22060 if (d.testing_p && TARGET_SSSE3)
22061 return true;
22062 break;
22063 case E_V2SFmode:
22064 case E_V2SImode:
22065 case E_V4HImode:
22066 case E_V8QImode:
22067 if (!TARGET_MMX_WITH_SSE)
22068 return false;
22069 break;
22070 case E_V2HImode:
22071 if (!TARGET_SSE2)
22072 return false;
22073 /* All implementable with *punpckwd. */
22074 if (d.testing_p)
22075 return true;
22076 break;
22077 case E_V4QImode:
22078 if (!TARGET_SSE2)
22079 return false;
22080 break;
22081 case E_V2DImode:
22082 case E_V2DFmode:
22083 if (!TARGET_SSE)
22084 return false;
22085 /* All implementable with shufpd or unpck[lh]pd. */
22086 if (d.testing_p)
22087 return true;
22088 break;
22089 default:
22090 return false;
22091 }
22092
22093 for (i = which = 0; i < nelt; ++i)
22094 {
22095 unsigned char e = sel[i];
22096 gcc_assert (e < 2 * nelt);
22097 d.perm[i] = e;
22098 perm[i] = e;
22099 which |= (e < nelt ? 1 : 2);
22100 }
22101
22102 if (d.testing_p)
22103 {
22104 /* If all elements are from the second vector, fold them to the first. */
22105 if (which == 2)
22106 for (i = 0; i < nelt; ++i)
22107 d.perm[i] -= nelt;
22108
22109 /* Check whether the mask can be applied to the vector type. */
22110 d.one_operand_p = (which != 3);
22111
22112 /* Implementable with shufps, pshufd or pshuflw. */
22113 if (d.one_operand_p
22114 && (d.vmode == V4SFmode || d.vmode == V2SFmode
22115 || d.vmode == V4SImode || d.vmode == V2SImode
22116 || d.vmode == V4HImode || d.vmode == V2HImode))
22117 return true;
22118
22119 /* Otherwise we have to go through the motions and see if we can
22120 figure out how to generate the requested permutation. */
22121 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
22122 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
22123 if (!d.one_operand_p)
22124 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
22125
22126 start_sequence ();
22127 bool ret = ix86_expand_vec_perm_const_1 (&d);
22128 end_sequence ();
22129
22130 return ret;
22131 }
22132
22133 two_args = canonicalize_perm (&d);
22134
22135 /* If one of the operands is a zero vector, try to match pmovzx. */
22136 if (two_args && (d.op0 == CONST0_RTX (vmode) || d.op1 == CONST0_RTX (vmode)))
22137 {
22138 struct expand_vec_perm_d dzero = d;
22139 if (d.op0 == CONST0_RTX (vmode))
22140 {
22141 d.op1 = dzero.op1 = force_reg (vmode, d.op1);
22142 std::swap (dzero.op0, dzero.op1);
22143 for (i = 0; i < nelt; ++i)
22144 dzero.perm[i] ^= nelt;
22145 }
22146 else
22147 d.op0 = dzero.op0 = force_reg (vmode, d.op0);
22148
22149 if (expand_vselect_vconcat (dzero.target, dzero.op0, dzero.op1,
22150 dzero.perm, nelt, dzero.testing_p))
22151 return true;
22152 }
22153
22154 /* Force operands into registers. */
22155 rtx nop0 = force_reg (vmode, d.op0);
22156 if (d.op0 == d.op1)
22157 d.op1 = nop0;
22158 d.op0 = nop0;
22159 d.op1 = force_reg (vmode, d.op1);
22160
22161 if (ix86_expand_vec_perm_const_1 (&d))
22162 return true;
22163
22164 /* If the selector says both arguments are needed, but the operands are the
22165 same, the above tried to expand with one_operand_p and flattened selector.
22166 If that didn't work, retry without one_operand_p; we succeeded with that
22167 during testing. */
22168 if (two_args && d.one_operand_p)
22169 {
22170 d.one_operand_p = false;
22171 memcpy (d.perm, perm, sizeof (perm));
22172 return ix86_expand_vec_perm_const_1 (&d);
22173 }
22174
22175 return false;
22176 }
22177
22178 void
22179 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
22180 {
22181 struct expand_vec_perm_d d;
22182 unsigned i, nelt;
22183
22184 d.target = targ;
22185 d.op0 = op0;
22186 d.op1 = op1;
22187 d.vmode = GET_MODE (targ);
22188 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
22189 d.one_operand_p = false;
22190 d.testing_p = false;
22191
22192 for (i = 0; i < nelt; ++i)
22193 d.perm[i] = i * 2 + odd;
22194
22195 /* We'll either be able to implement the permutation directly... */
22196 if (expand_vec_perm_1 (&d))
22197 return;
22198
22199 /* ... or we use the special-case patterns. */
22200 expand_vec_perm_even_odd_1 (&d, odd);
22201 }
22202
22203 static void
22204 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
22205 {
22206 struct expand_vec_perm_d d;
22207 unsigned i, nelt, base;
22208 bool ok;
22209
22210 d.target = targ;
22211 d.op0 = op0;
22212 d.op1 = op1;
22213 d.vmode = GET_MODE (targ);
22214 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
22215 d.one_operand_p = false;
22216 d.testing_p = false;
22217
22218 base = high_p ? nelt / 2 : 0;
22219 for (i = 0; i < nelt / 2; ++i)
22220 {
22221 d.perm[i * 2] = i + base;
22222 d.perm[i * 2 + 1] = i + base + nelt;
22223 }
22224
22225 /* Note that for AVX this isn't one instruction. */
22226 ok = ix86_expand_vec_perm_const_1 (&d);
22227 gcc_assert (ok);
22228 }
22229
22230 /* This function is similar to ix86_expand_vecop_qihi,
22231 but optimized under AVX512BW by using vpmovwb.
22232 For example, optimize vector MUL generation like
22233
22234 vpmovzxbw ymm2, xmm0
22235 vpmovzxbw ymm3, xmm1
22236 vpmullw ymm4, ymm2, ymm3
22237 vpmovwb xmm0, ymm4
22238
22239 which takes fewer instructions than ix86_expand_vecop_qihi.
22240 Return true on success. */
22241
22242 static bool
22243 ix86_expand_vecop_qihi2 (enum rtx_code code, rtx dest, rtx op1, rtx op2)
22244 {
22245 machine_mode himode, qimode = GET_MODE (dest);
22246 rtx hop1, hop2, hdest;
22247 rtx (*gen_extend)(rtx, rtx);
22248 rtx (*gen_truncate)(rtx, rtx);
22249 bool uns_p = (code != ASHIFTRT);
22250
22251 /* There's no V64HImode multiplication instruction. */
22252 if (qimode == E_V64QImode)
22253 return false;
22254
22255 /* vpmovwb is only available under AVX512BW. */
22256 if (!TARGET_AVX512BW)
22257 return false;
22258 if ((qimode == V8QImode || qimode == V16QImode)
22259 && !TARGET_AVX512VL)
22260 return false;
22261 /* Do not generate zmm insns when 128/256 bit vector width is preferred. */
22262 if (qimode == V32QImode
22263 && (TARGET_PREFER_AVX128 || TARGET_PREFER_AVX256))
22264 return false;
22265
22266 switch (qimode)
22267 {
22268 case E_V8QImode:
22269 himode = V8HImode;
22270 gen_extend = uns_p ? gen_zero_extendv8qiv8hi2 : gen_extendv8qiv8hi2;
22271 gen_truncate = gen_truncv8hiv8qi2;
22272 break;
22273 case E_V16QImode:
22274 himode = V16HImode;
22275 gen_extend = uns_p ? gen_zero_extendv16qiv16hi2 : gen_extendv16qiv16hi2;
22276 gen_truncate = gen_truncv16hiv16qi2;
22277 break;
22278 case E_V32QImode:
22279 himode = V32HImode;
22280 gen_extend = uns_p ? gen_zero_extendv32qiv32hi2 : gen_extendv32qiv32hi2;
22281 gen_truncate = gen_truncv32hiv32qi2;
22282 break;
22283 default:
22284 gcc_unreachable ();
22285 }
22286
22287 hop1 = gen_reg_rtx (himode);
22288 hop2 = gen_reg_rtx (himode);
22289 hdest = gen_reg_rtx (himode);
22290 emit_insn (gen_extend (hop1, op1));
22291 emit_insn (gen_extend (hop2, op2));
22292 emit_insn (gen_rtx_SET (hdest, simplify_gen_binary (code, himode,
22293 hop1, hop2)));
22294 emit_insn (gen_truncate (dest, hdest));
22295 return true;
22296 }
22297
22298 /* Expand a vector shift by a constant for V*QImode in terms of the
22299 same operation on V*HImode. Return true on success. */
22300 static bool
22301 ix86_expand_vec_shift_qihi_constant (enum rtx_code code,
22302 rtx dest, rtx op1, rtx op2)
22303 {
22304 machine_mode qimode, himode;
22305 HOST_WIDE_INT and_constant, xor_constant;
22306 HOST_WIDE_INT shift_amount;
22307 rtx vec_const_and, vec_const_xor;
22308 rtx tmp, op1_subreg;
22309 rtx (*gen_shift) (rtx, rtx, rtx);
22310 rtx (*gen_and) (rtx, rtx, rtx);
22311 rtx (*gen_xor) (rtx, rtx, rtx);
22312 rtx (*gen_sub) (rtx, rtx, rtx);
22313
22314 /* Only optimize shift by constant. */
22315 if (!CONST_INT_P (op2))
22316 return false;
22317
22318 qimode = GET_MODE (dest);
22319 shift_amount = INTVAL (op2);
22320 /* Do nothing when the shift amount is greater than or equal to 8. */
22321 if (shift_amount > 7)
22322 return false;
22323
22324 gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
22325 /* Record sign bit. */
22326 xor_constant = 1 << (8 - shift_amount - 1);
22327
22328 /* Mask off the bits shifted in from the adjacent byte of the word. */
22329 and_constant
22330 = (code == ASHIFT ? 256 - (1 << shift_amount)
22331 : (1 << (8 - shift_amount)) - 1);
22332
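/* Worked example (illustrative): for ASHIFTRT by 3, the word shift
plus the 0x1f mask leaves each byte logically shifted with its
original sign bit at bit 4; xor_constant is 0x10, and the final
(x ^ 0x10) - 0x10 sign-extends that 5-bit value. */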
22333 switch (qimode)
22334 {
22335 case V16QImode:
22336 himode = V8HImode;
22337 gen_shift =
22338 ((code == ASHIFT)
22339 ? gen_ashlv8hi3
22340 : (code == ASHIFTRT) ? gen_ashrv8hi3 : gen_lshrv8hi3);
22341 gen_and = gen_andv16qi3;
22342 gen_xor = gen_xorv16qi3;
22343 gen_sub = gen_subv16qi3;
22344 break;
22345 case V32QImode:
22346 himode = V16HImode;
22347 gen_shift =
22348 ((code == ASHIFT)
22349 ? gen_ashlv16hi3
22350 : (code == ASHIFTRT) ? gen_ashrv16hi3 : gen_lshrv16hi3);
22351 gen_and = gen_andv32qi3;
22352 gen_xor = gen_xorv32qi3;
22353 gen_sub = gen_subv32qi3;
22354 break;
22355 case V64QImode:
22356 himode = V32HImode;
22357 gen_shift =
22358 ((code == ASHIFT)
22359 ? gen_ashlv32hi3
22360 : (code == ASHIFTRT) ? gen_ashrv32hi3 : gen_lshrv32hi3);
22361 gen_and = gen_andv64qi3;
22362 gen_xor = gen_xorv64qi3;
22363 gen_sub = gen_subv64qi3;
22364 break;
22365 default:
22366 gcc_unreachable ();
22367 }
22368
22369 tmp = gen_reg_rtx (himode);
22370 vec_const_and = gen_reg_rtx (qimode);
22371 op1_subreg = lowpart_subreg (himode, op1, qimode);
22372
22373 /* For ASHIFT and LSHIFTRT, perform operation like
22374 vpsllw/vpsrlw $shift_amount, %op1, %dest.
22375 vpand %vec_const_and, %dest. */
22376 emit_insn (gen_shift (tmp, op1_subreg, op2));
22377 emit_move_insn (dest, simplify_gen_subreg (qimode, tmp, himode, 0));
22378 emit_move_insn (vec_const_and,
22379 ix86_build_const_vector (qimode, true,
22380 gen_int_mode (and_constant, QImode)));
22381 emit_insn (gen_and (dest, dest, vec_const_and));
22382
22383 /* For ASHIFTRT, perform extra operation like
22384 vpxor %vec_const_xor, %dest, %dest
22385 vpsubb %vec_const_xor, %dest, %dest */
22386 if (code == ASHIFTRT)
22387 {
22388 vec_const_xor = gen_reg_rtx (qimode);
22389 emit_move_insn (vec_const_xor,
22390 ix86_build_const_vector (qimode, true,
22391 gen_int_mode (xor_constant, QImode)));
22392 emit_insn (gen_xor (dest, dest, vec_const_xor));
22393 emit_insn (gen_sub (dest, dest, vec_const_xor));
22394 }
22395 return true;
22396 }
22397
22398 /* Expand a vector operation CODE for a V*QImode in terms of the
22399 same operation on V*HImode. */
22400
22401 void
22402 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
22403 {
22404 machine_mode qimode = GET_MODE (dest);
22405 machine_mode himode;
22406 rtx (*gen_il) (rtx, rtx, rtx);
22407 rtx (*gen_ih) (rtx, rtx, rtx);
22408 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
22409 struct expand_vec_perm_d d;
22410 bool ok, full_interleave;
22411 bool uns_p = false;
22412 int i;
22413
22414 if (CONST_INT_P (op2)
22415 && (code == ASHIFT || code == LSHIFTRT || code == ASHIFTRT)
22416 && ix86_expand_vec_shift_qihi_constant (code, dest, op1, op2))
22417 return;
22418
22419 if (TARGET_AVX512BW
22420 && VECTOR_MODE_P (GET_MODE (op2))
22421 && ix86_expand_vecop_qihi2 (code, dest, op1, op2))
22422 return;
22423
22424 switch (qimode)
22425 {
22426 case E_V16QImode:
22427 himode = V8HImode;
22428 gen_il = gen_vec_interleave_lowv16qi;
22429 gen_ih = gen_vec_interleave_highv16qi;
22430 break;
22431 case E_V32QImode:
22432 himode = V16HImode;
22433 gen_il = gen_avx2_interleave_lowv32qi;
22434 gen_ih = gen_avx2_interleave_highv32qi;
22435 break;
22436 case E_V64QImode:
22437 himode = V32HImode;
22438 gen_il = gen_avx512bw_interleave_lowv64qi;
22439 gen_ih = gen_avx512bw_interleave_highv64qi;
22440 break;
22441 default:
22442 gcc_unreachable ();
22443 }
22444
22445 switch (code)
22446 {
22447 case MULT:
22448 /* Unpack data such that we've got a source byte in each low byte of
22449 each word. We don't care what goes into the high byte of each word.
22450 Rather than trying to get zero in there, most convenient is to let
22451 it be a copy of the low byte. */
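/* For instance (V16QImode, illustrative): punpcklbw (a, a) yields
words whose low bytes are a[0..7]; multiplying them with the
similarly unpacked b gives 16-bit products whose low bytes are
a[i] * b[i] modulo 256, which the even-element permutation at
the end of this function then gathers into the result. */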
22452 op2_l = gen_reg_rtx (qimode);
22453 op2_h = gen_reg_rtx (qimode);
22454 emit_insn (gen_il (op2_l, op2, op2));
22455 emit_insn (gen_ih (op2_h, op2, op2));
22456
22457 op1_l = gen_reg_rtx (qimode);
22458 op1_h = gen_reg_rtx (qimode);
22459 emit_insn (gen_il (op1_l, op1, op1));
22460 emit_insn (gen_ih (op1_h, op1, op1));
22461 full_interleave = qimode == V16QImode;
22462 break;
22463
22464 case ASHIFT:
22465 case LSHIFTRT:
22466 uns_p = true;
22467 /* FALLTHRU */
22468 case ASHIFTRT:
22469 op1_l = gen_reg_rtx (himode);
22470 op1_h = gen_reg_rtx (himode);
22471 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
22472 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
22473 /* vashr/vlshr/vashl */
22474 if (GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT)
22475 {
22476 rtx tmp = force_reg (qimode, op2);
22477 op2_l = gen_reg_rtx (himode);
22478 op2_h = gen_reg_rtx (himode);
22479 ix86_expand_sse_unpack (op2_l, tmp, uns_p, false);
22480 ix86_expand_sse_unpack (op2_h, tmp, uns_p, true);
22481 }
22482 else
22483 op2_l = op2_h = op2;
22484
22485 full_interleave = true;
22486 break;
22487 default:
22488 gcc_unreachable ();
22489 }
22490
22491 /* Perform vashr/vlshr/vashl. */
22492 if (code != MULT
22493 && GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT)
22494 {
22495 res_l = gen_reg_rtx (himode);
22496 res_h = gen_reg_rtx (himode);
22497 emit_insn (gen_rtx_SET (res_l,
22498 simplify_gen_binary (code, himode,
22499 op1_l, op2_l)));
22500 emit_insn (gen_rtx_SET (res_h,
22501 simplify_gen_binary (code, himode,
22502 op1_h, op2_h)));
22503 }
22504 /* Perform mult/ashr/lshr/ashl. */
22505 else
22506 {
22507 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
22508 1, OPTAB_DIRECT);
22509 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
22510 1, OPTAB_DIRECT);
22511 }
22512
22513 gcc_assert (res_l && res_h);
22514
22515 /* Merge the data back into the right place. */
22516 d.target = dest;
22517 d.op0 = gen_lowpart (qimode, res_l);
22518 d.op1 = gen_lowpart (qimode, res_h);
22519 d.vmode = qimode;
22520 d.nelt = GET_MODE_NUNITS (qimode);
22521 d.one_operand_p = false;
22522 d.testing_p = false;
22523
22524 if (full_interleave)
22525 {
22526 /* For SSE2, we used a full interleave, so the desired
22527 results are in the even elements. */
22528 for (i = 0; i < d.nelt; ++i)
22529 d.perm[i] = i * 2;
22530 }
22531 else
22532 {
22533 /* For AVX, the interleave used above was not cross-lane. So the
22534 extraction is evens but with the second and third quarter swapped.
22535 Happily, that is even one insn shorter than even extraction.
22536 For AVX512BW we have 4 lanes. We extract evens from within a lane,
22537 always first from the first and then from the second source operand,
22538 the index bits above the low 4 bits remains the same.
22539 Thus, for d.nelt == 32 we want permutation
22540 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
22541 and for d.nelt == 64 we want permutation
22542 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
22543 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
22544 for (i = 0; i < d.nelt; ++i)
22545 d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
22546 }
22547
22548 ok = ix86_expand_vec_perm_const_1 (&d);
22549 gcc_assert (ok);
22550
22551 set_unique_reg_note (get_last_insn (), REG_EQUAL,
22552 gen_rtx_fmt_ee (code, qimode, op1, op2));
22553 }
22554
22555 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
22556 if op is CONST_VECTOR with all odd elements equal to their
22557 preceding element. */
22558
22559 static bool
22560 const_vector_equal_evenodd_p (rtx op)
22561 {
22562 machine_mode mode = GET_MODE (op);
22563 int i, nunits = GET_MODE_NUNITS (mode);
22564 if (GET_CODE (op) != CONST_VECTOR
22565 || nunits != CONST_VECTOR_NUNITS (op))
22566 return false;
22567 for (i = 0; i < nunits; i += 2)
22568 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
22569 return false;
22570 return true;
22571 }
22572
22573 void
22574 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
22575 bool uns_p, bool odd_p)
22576 {
22577 machine_mode mode = GET_MODE (op1);
22578 machine_mode wmode = GET_MODE (dest);
22579 rtx x;
22580 rtx orig_op1 = op1, orig_op2 = op2;
22581
22582 if (!nonimmediate_operand (op1, mode))
22583 op1 = force_reg (mode, op1);
22584 if (!nonimmediate_operand (op2, mode))
22585 op2 = force_reg (mode, op2);
22586
22587 /* We only play even/odd games with vectors of SImode. */
22588 gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
22589
22590 /* If we're looking for the odd results, shift those members down to
22591 the even slots. For some cpus this is faster than a PSHUFD. */
22592 if (odd_p)
22593 {
22594 /* For XOP use vpmacsdqh, but only for smult, as it is only
22595 signed. */
22596 if (TARGET_XOP && mode == V4SImode && !uns_p)
22597 {
22598 x = force_reg (wmode, CONST0_RTX (wmode));
22599 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
22600 return;
22601 }
22602
22603 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
22604 if (!const_vector_equal_evenodd_p (orig_op1))
22605 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
22606 x, NULL, 1, OPTAB_DIRECT);
22607 if (!const_vector_equal_evenodd_p (orig_op2))
22608 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
22609 x, NULL, 1, OPTAB_DIRECT);
22610 op1 = gen_lowpart (mode, op1);
22611 op2 = gen_lowpart (mode, op2);
22612 }
22613
22614 if (mode == V16SImode)
22615 {
22616 if (uns_p)
22617 x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
22618 else
22619 x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
22620 }
22621 else if (mode == V8SImode)
22622 {
22623 if (uns_p)
22624 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
22625 else
22626 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
22627 }
22628 else if (uns_p)
22629 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
22630 else if (TARGET_SSE4_1)
22631 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
22632 else
22633 {
22634 rtx s1, s2, t0, t1, t2;
22635
22636 /* The easiest way to implement this without PMULDQ is to go through
22637 the motions as if we are performing a full 64-bit multiply, except
22638 that we need to do less shuffling of the elements. */
22639
22640 /* Compute the sign-extension, aka highparts, of the two operands. */
22641 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
22642 op1, pc_rtx, pc_rtx);
22643 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
22644 op2, pc_rtx, pc_rtx);
22645
22646 /* Multiply LO(A) * HI(B), and vice-versa. */
22647 t1 = gen_reg_rtx (wmode);
22648 t2 = gen_reg_rtx (wmode);
22649 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
22650 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
22651
22652 /* Multiply LO(A) * LO(B). */
22653 t0 = gen_reg_rtx (wmode);
22654 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
22655
22656 /* Combine and shift the highparts into place. */
22657 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
22658 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
22659 1, OPTAB_DIRECT);
22660
22661 /* Combine high and low parts. */
22662 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
22663 return;
22664 }
22665 emit_insn (x);
22666 }
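
/* Rough justification of the SSE2 fallback above (comment only): writing a
   signed 32-bit element as  a = ua - 2^32 * (a < 0)  with ua its value read
   as unsigned, the signed product satisfies

     a * b = ua * ub - 2^32 * (ub * (a < 0) + ua * (b < 0))   (mod 2^64).

   The masks s1/s2 are 0xffffffff exactly for the negative elements, so the
   unsigned widening products s1 * ub and s2 * ua, once shifted left by 32,
   contribute exactly those correction terms modulo 2^64; that is what the
   add/shift/add sequence computes.  */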
22667
22668 void
22669 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
22670 bool uns_p, bool high_p)
22671 {
22672 machine_mode wmode = GET_MODE (dest);
22673 machine_mode mode = GET_MODE (op1);
22674 rtx t1, t2, t3, t4, mask;
22675
22676 switch (mode)
22677 {
22678 case E_V4SImode:
22679 t1 = gen_reg_rtx (mode);
22680 t2 = gen_reg_rtx (mode);
22681 if (TARGET_XOP && !uns_p)
22682 {
22683 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
22684 shuffle the elements once so that all elements are in the right
22685 place for immediate use: { A C B D }. */
22686 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
22687 const1_rtx, GEN_INT (3)));
22688 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
22689 const1_rtx, GEN_INT (3)));
22690 }
22691 else
22692 {
22693 /* Put the elements into place for the multiply. */
22694 ix86_expand_vec_interleave (t1, op1, op1, high_p);
22695 ix86_expand_vec_interleave (t2, op2, op2, high_p);
22696 high_p = false;
22697 }
22698 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
22699 break;
22700
22701 case E_V8SImode:
22702 /* Shuffle the elements between the lanes. After this we
22703 have { A B E F | C D G H } for each operand. */
22704 t1 = gen_reg_rtx (V4DImode);
22705 t2 = gen_reg_rtx (V4DImode);
22706 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
22707 const0_rtx, const2_rtx,
22708 const1_rtx, GEN_INT (3)));
22709 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
22710 const0_rtx, const2_rtx,
22711 const1_rtx, GEN_INT (3)));
22712
22713 /* Shuffle the elements within the lanes. After this we
22714 have { A A B B | C C D D } or { E E F F | G G H H }. */
22715 t3 = gen_reg_rtx (V8SImode);
22716 t4 = gen_reg_rtx (V8SImode);
22717 mask = GEN_INT (high_p
22718 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
22719 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
22720 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
22721 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
22722
22723 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
22724 break;
22725
22726 case E_V8HImode:
22727 case E_V16HImode:
22728 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
22729 uns_p, OPTAB_DIRECT);
22730 t2 = expand_binop (mode,
22731 uns_p ? umul_highpart_optab : smul_highpart_optab,
22732 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
22733 gcc_assert (t1 && t2);
22734
22735 t3 = gen_reg_rtx (mode);
22736 ix86_expand_vec_interleave (t3, t1, t2, high_p);
22737 emit_move_insn (dest, gen_lowpart (wmode, t3));
22738 break;
22739
22740 case E_V16QImode:
22741 case E_V32QImode:
22742 case E_V32HImode:
22743 case E_V16SImode:
22744 case E_V64QImode:
22745 t1 = gen_reg_rtx (wmode);
22746 t2 = gen_reg_rtx (wmode);
22747 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
22748 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
22749
22750 emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
22751 break;
22752
22753 default:
22754 gcc_unreachable ();
22755 }
22756 }
22757
22758 void
22759 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
22760 {
22761 rtx res_1, res_2, res_3, res_4;
22762
22763 res_1 = gen_reg_rtx (V4SImode);
22764 res_2 = gen_reg_rtx (V4SImode);
22765 res_3 = gen_reg_rtx (V2DImode);
22766 res_4 = gen_reg_rtx (V2DImode);
22767 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
22768 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
22769
22770 /* Move the results in element 2 down to element 1; we don't care
22771 what goes in elements 2 and 3. Then we can merge the parts
22772 back together with an interleave.
22773
22774 Note that two other sequences were tried:
22775 (1) Use interleaves at the start instead of psrldq, which allows
22776 us to use a single shufps to merge things back at the end.
22777 (2) Use shufps here to combine the two vectors, then pshufd to
22778 put the elements in the correct order.
22779 In both cases the cost of the reformatting stall was too high
22780 and the overall sequence slower. */
22781
22782 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
22783 const0_rtx, const2_rtx,
22784 const0_rtx, const0_rtx));
22785 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
22786 const0_rtx, const2_rtx,
22787 const0_rtx, const0_rtx));
22788 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
22789
22790 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
22791 }
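
/* Element-wise illustration (comment only): with op1 = { a0, a1, a2, a3 }
   and op2 = { b0, b1, b2, b3 }, the even/odd widening multiplies give
     res_3 = { a0*b0, a2*b2 }   and   res_4 = { a1*b1, a3*b3 }   (V2DImode).
   The two pshufd insns move the low 32 bits of each 64-bit product into
   elements 0 and 1, and the final interleave-low then produces
     { lo(a0*b0), lo(a1*b1), lo(a2*b2), lo(a3*b3) },
   i.e. the element-wise V4SImode product.  */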
22792
22793 void
22794 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
22795 {
22796 machine_mode mode = GET_MODE (op0);
22797 rtx t1, t2, t3, t4, t5, t6;
22798
22799 if (TARGET_AVX512DQ && mode == V8DImode)
22800 emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
22801 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
22802 emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
22803 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
22804 emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
22805 else if (TARGET_XOP && mode == V2DImode)
22806 {
22807 /* op1: A,B,C,D, op2: E,F,G,H */
22808 op1 = gen_lowpart (V4SImode, op1);
22809 op2 = gen_lowpart (V4SImode, op2);
22810
22811 t1 = gen_reg_rtx (V4SImode);
22812 t2 = gen_reg_rtx (V4SImode);
22813 t3 = gen_reg_rtx (V2DImode);
22814 t4 = gen_reg_rtx (V2DImode);
22815
22816 /* t1: B,A,D,C */
22817 emit_insn (gen_sse2_pshufd_1 (t1, op1,
22818 GEN_INT (1),
22819 GEN_INT (0),
22820 GEN_INT (3),
22821 GEN_INT (2)));
22822
22823 /* t2: (B*E),(A*F),(D*G),(C*H) */
22824 emit_insn (gen_mulv4si3 (t2, t1, op2));
22825
22826 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
22827 emit_insn (gen_xop_phadddq (t3, t2));
22828
22829 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
22830 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
22831
22832 /* Multiply lower parts and add all.  */
22833 t5 = gen_reg_rtx (V2DImode);
22834 emit_insn (gen_vec_widen_umult_even_v4si (t5,
22835 gen_lowpart (V4SImode, op1),
22836 gen_lowpart (V4SImode, op2)));
22837 force_expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
22838 }
22839 else
22840 {
22841 machine_mode nmode;
22842 rtx (*umul) (rtx, rtx, rtx);
22843
22844 if (mode == V2DImode)
22845 {
22846 umul = gen_vec_widen_umult_even_v4si;
22847 nmode = V4SImode;
22848 }
22849 else if (mode == V4DImode)
22850 {
22851 umul = gen_vec_widen_umult_even_v8si;
22852 nmode = V8SImode;
22853 }
22854 else if (mode == V8DImode)
22855 {
22856 umul = gen_vec_widen_umult_even_v16si;
22857 nmode = V16SImode;
22858 }
22859 else
22860 gcc_unreachable ();
22861
22862
22863 /* Multiply low parts. */
22864 t1 = gen_reg_rtx (mode);
22865 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
22866
22867 /* Shift input vectors right 32 bits so we can multiply high parts. */
22868 t6 = GEN_INT (32);
22869 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
22870 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
22871
22872 /* Multiply high parts by low parts. */
22873 t4 = gen_reg_rtx (mode);
22874 t5 = gen_reg_rtx (mode);
22875 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
22876 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
22877
22878 /* Combine and shift the highparts back. */
22879 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
22880 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
22881
22882 /* Combine high and low parts. */
22883 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
22884 }
22885
22886 set_unique_reg_note (get_last_insn (), REG_EQUAL,
22887 gen_rtx_MULT (mode, op1, op2));
22888 }
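
/* The generic path above is the schoolbook decomposition of a 64-bit
   multiply from 32x32->64 partial products.  A scalar sketch of the same
   identity (illustration only, assuming <stdint.h> types; not compiler
   code):

     uint64_t mul64 (uint64_t a, uint64_t b)
     {
       uint64_t lo_lo = (a & 0xffffffffu) * (b & 0xffffffffu);
       uint64_t hi_lo = (a >> 32) * (b & 0xffffffffu);
       uint64_t lo_hi = (a & 0xffffffffu) * (b >> 32);
       return lo_lo + ((hi_lo + lo_hi) << 32);  // hi*hi term vanishes mod 2^64
     }

   In the vector code the 32x32->64 products are formed with pmuludq on the
   low 32-bit half of every 64-bit lane.  */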
22889
22890 /* Return true if control transfer instruction INSN
22891 should be encoded with the notrack prefix. */
22892
22893 bool
22894 ix86_notrack_prefixed_insn_p (rtx_insn *insn)
22895 {
22896 if (!insn || !((flag_cf_protection & CF_BRANCH)))
22897 return false;
22898
22899 if (CALL_P (insn))
22900 {
22901 rtx call = get_call_rtx_from (insn);
22902 gcc_assert (call != NULL_RTX);
22903 rtx addr = XEXP (call, 0);
22904
22905 /* Do not emit 'notrack' if it's not an indirect call. */
22906 if (MEM_P (addr)
22907 && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
22908 return false;
22909 else
22910 return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
22911 }
22912
22913 if (JUMP_P (insn) && !flag_cet_switch)
22914 {
22915 rtx target = JUMP_LABEL (insn);
22916 if (target == NULL_RTX || ANY_RETURN_P (target))
22917 return false;
22918
22919 /* Check whether the jump is a tablejump to a switch table. */
22920 rtx_insn *label = as_a<rtx_insn *> (target);
22921 rtx_insn *table = next_insn (label);
22922 if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
22923 return false;
22924 else
22925 return true;
22926 }
22927 return false;
22928 }
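
/* Example (comment only): with -fcf-protection=branch, an indirect call
   through a pointer whose type carries __attribute__((nocf_check)), e.g.

     void (*fp) (void) __attribute__ ((nocf_check));
     ...
     fp ();

   carries a REG_CALL_NOCF_CHECK note and is therefore emitted with the
   notrack prefix; direct calls and ordinary indirect calls are not.  */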
22929
22930 /* Calculate integer abs() using only SSE2 instructions. */
22931
22932 void
22933 ix86_expand_sse2_abs (rtx target, rtx input)
22934 {
22935 machine_mode mode = GET_MODE (target);
22936 rtx tmp0, tmp1, x;
22937
22938 switch (mode)
22939 {
22940 case E_V2DImode:
22941 case E_V4DImode:
22942 /* For 64-bit signed integer X, with SSE4.2 use
22943 pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
22944 Otherwise handle it similarly to V4SImode, except use 64 as W instead
22945 of 32, and since the arithmetic right shift is unimplemented for V2DI,
22946 use a logical right shift followed by negation to form the sign mask. */
22947 if (TARGET_SSE4_2)
22948 {
22949 tmp0 = gen_reg_rtx (mode);
22950 tmp1 = gen_reg_rtx (mode);
22951 emit_move_insn (tmp1, CONST0_RTX (mode));
22952 if (mode == E_V2DImode)
22953 emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input));
22954 else
22955 emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input));
22956 }
22957 else
22958 {
22959 tmp0 = expand_simple_binop (mode, LSHIFTRT, input,
22960 GEN_INT (GET_MODE_UNIT_BITSIZE (mode)
22961 - 1), NULL, 0, OPTAB_DIRECT);
22962 tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false);
22963 }
22964
22965 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
22966 NULL, 0, OPTAB_DIRECT);
22967 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
22968 target, 0, OPTAB_DIRECT);
22969 break;
22970
22971 case E_V4SImode:
22972 /* For 32-bit signed integer X, the best way to calculate the absolute
22973 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
22974 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
22975 GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
22976 NULL, 0, OPTAB_DIRECT);
22977 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
22978 NULL, 0, OPTAB_DIRECT);
22979 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
22980 target, 0, OPTAB_DIRECT);
22981 break;
22982
22983 case E_V8HImode:
22984 /* For 16-bit signed integer X, the best way to calculate the absolute
22985 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
22986 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
22987
22988 x = expand_simple_binop (mode, SMAX, tmp0, input,
22989 target, 0, OPTAB_DIRECT);
22990 break;
22991
22992 case E_V16QImode:
22993 /* For 8-bit signed integer X, the best way to calculate the absolute
22994 value of X is min ((unsigned char) X, (unsigned char) (-X)),
22995 as SSE2 provides the PMINUB insn. */
22996 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
22997
22998 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
22999 target, 0, OPTAB_DIRECT);
23000 break;
23001
23002 default:
23003 gcc_unreachable ();
23004 }
23005
23006 if (x != target)
23007 emit_move_insn (target, x);
23008 }
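
/* Scalar sketch of the V4SImode identity used above (illustration only):

     int iabs (int x)
     {
       int m = x >> 31;      // 0 or -1, the sign mask
       return (x ^ m) - m;   // i.e. ((x >> 31) ^ x) - (x >> 31)
     }

   The V2DImode path without SSE4.2 forms the same mask with a logical
   right shift by 63 followed by a negation, because the 64-bit arithmetic
   right shift is not available there.  */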
23009
23010 /* Expand an extract from a vector register through pextr insn.
23011 Return true if successful. */
23012
23013 bool
23014 ix86_expand_pextr (rtx *operands)
23015 {
23016 rtx dst = operands[0];
23017 rtx src = operands[1];
23018
23019 unsigned int size = INTVAL (operands[2]);
23020 unsigned int pos = INTVAL (operands[3]);
23021
23022 if (SUBREG_P (dst))
23023 {
23024 /* Reject non-lowpart subregs. */
23025 if (SUBREG_BYTE (dst) > 0)
23026 return false;
23027 dst = SUBREG_REG (dst);
23028 }
23029
23030 if (SUBREG_P (src))
23031 {
23032 pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
23033 src = SUBREG_REG (src);
23034 }
23035
23036 switch (GET_MODE (src))
23037 {
23038 case E_V16QImode:
23039 case E_V8HImode:
23040 case E_V4SImode:
23041 case E_V2DImode:
23042 case E_V1TImode:
23043 {
23044 machine_mode srcmode, dstmode;
23045 rtx d, pat;
23046
23047 if (!int_mode_for_size (size, 0).exists (&dstmode))
23048 return false;
23049
23050 switch (dstmode)
23051 {
23052 case E_QImode:
23053 if (!TARGET_SSE4_1)
23054 return false;
23055 srcmode = V16QImode;
23056 break;
23057
23058 case E_HImode:
23059 if (!TARGET_SSE2)
23060 return false;
23061 srcmode = V8HImode;
23062 break;
23063
23064 case E_SImode:
23065 if (!TARGET_SSE4_1)
23066 return false;
23067 srcmode = V4SImode;
23068 break;
23069
23070 case E_DImode:
23071 gcc_assert (TARGET_64BIT);
23072 if (!TARGET_SSE4_1)
23073 return false;
23074 srcmode = V2DImode;
23075 break;
23076
23077 default:
23078 return false;
23079 }
23080
23081 /* Reject extractions from misaligned positions. */
23082 if (pos & (size-1))
23083 return false;
23084
23085 if (GET_MODE (dst) == dstmode)
23086 d = dst;
23087 else
23088 d = gen_reg_rtx (dstmode);
23089
23090 /* Construct insn pattern. */
23091 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
23092 pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
23093
23094 /* Let the rtl optimizers know about the zero extension performed. */
23095 if (dstmode == QImode || dstmode == HImode)
23096 {
23097 pat = gen_rtx_ZERO_EXTEND (SImode, pat);
23098 d = gen_lowpart (SImode, d);
23099 }
23100
23101 emit_insn (gen_rtx_SET (d, pat));
23102
23103 if (d != dst)
23104 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
23105 return true;
23106 }
23107
23108 default:
23109 return false;
23110 }
23111 }
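
/* Illustration (comment only): extracting 16 bits at bit position 32 from
   a 128-bit register selects element 32 / 16 == 2 of a V8HImode view, so
   the pattern built above is a vec_select of element 2 zero-extended to
   SImode, matching the pextrw instruction.  */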
23112
23113 /* Expand an insert into a vector register through pinsr insn.
23114 Return true if successful. */
23115
23116 bool
23117 ix86_expand_pinsr (rtx *operands)
23118 {
23119 rtx dst = operands[0];
23120 rtx src = operands[3];
23121
23122 unsigned int size = INTVAL (operands[1]);
23123 unsigned int pos = INTVAL (operands[2]);
23124
23125 if (SUBREG_P (dst))
23126 {
23127 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
23128 dst = SUBREG_REG (dst);
23129 }
23130
23131 switch (GET_MODE (dst))
23132 {
23133 case E_V16QImode:
23134 case E_V8HImode:
23135 case E_V4SImode:
23136 case E_V2DImode:
23137 case E_V1TImode:
23138 {
23139 machine_mode srcmode, dstmode;
23140 rtx (*pinsr)(rtx, rtx, rtx, rtx);
23141 rtx d;
23142
23143 if (!int_mode_for_size (size, 0).exists (&srcmode))
23144 return false;
23145
23146 switch (srcmode)
23147 {
23148 case E_QImode:
23149 if (!TARGET_SSE4_1)
23150 return false;
23151 dstmode = V16QImode;
23152 pinsr = gen_sse4_1_pinsrb;
23153 break;
23154
23155 case E_HImode:
23156 if (!TARGET_SSE2)
23157 return false;
23158 dstmode = V8HImode;
23159 pinsr = gen_sse2_pinsrw;
23160 break;
23161
23162 case E_SImode:
23163 if (!TARGET_SSE4_1)
23164 return false;
23165 dstmode = V4SImode;
23166 pinsr = gen_sse4_1_pinsrd;
23167 break;
23168
23169 case E_DImode:
23170 gcc_assert (TARGET_64BIT);
23171 if (!TARGET_SSE4_1)
23172 return false;
23173 dstmode = V2DImode;
23174 pinsr = gen_sse4_1_pinsrq;
23175 break;
23176
23177 default:
23178 return false;
23179 }
23180
23181 /* Reject insertions to misaligned positions. */
23182 if (pos & (size-1))
23183 return false;
23184
23185 if (SUBREG_P (src))
23186 {
23187 unsigned int srcpos = SUBREG_BYTE (src);
23188
23189 if (srcpos > 0)
23190 {
23191 rtx extr_ops[4];
23192
23193 extr_ops[0] = gen_reg_rtx (srcmode);
23194 extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
23195 extr_ops[2] = GEN_INT (size);
23196 extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
23197
23198 if (!ix86_expand_pextr (extr_ops))
23199 return false;
23200
23201 src = extr_ops[0];
23202 }
23203 else
23204 src = gen_lowpart (srcmode, SUBREG_REG (src));
23205 }
23206
23207 if (GET_MODE (dst) == dstmode)
23208 d = dst;
23209 else
23210 d = gen_reg_rtx (dstmode);
23211
23212 emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
23213 gen_lowpart (srcmode, src),
23214 GEN_INT (1 << (pos / size))));
23215 if (d != dst)
23216 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
23217 return true;
23218 }
23219
23220 default:
23221 return false;
23222 }
23223 }
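
/* Illustration (comment only): inserting 32 bits at bit position 64 into a
   V4SImode destination uses gen_sse4_1_pinsrd with GEN_INT (1 << (64 / 32)),
   i.e. a one-bit mask selecting element 2 of the destination.  */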
23224
23225 /* All CPUs prefer to avoid cross-lane operations, so perform reductions
23226 of the upper against the lower halves until we reach SSE register size. */
23227
23228 machine_mode
23229 ix86_split_reduction (machine_mode mode)
23230 {
23231 /* Reduce lowpart against highpart until we reach SSE reg width to
23232 avoid cross-lane operations. */
23233 switch (mode)
23234 {
23235 case E_V8DImode:
23236 case E_V4DImode:
23237 return V2DImode;
23238 case E_V16SImode:
23239 case E_V8SImode:
23240 return V4SImode;
23241 case E_V32HImode:
23242 case E_V16HImode:
23243 return V8HImode;
23244 case E_V64QImode:
23245 case E_V32QImode:
23246 return V16QImode;
23247 case E_V16SFmode:
23248 case E_V8SFmode:
23249 return V4SFmode;
23250 case E_V8DFmode:
23251 case E_V4DFmode:
23252 return V2DFmode;
23253 default:
23254 return mode;
23255 }
23256 }
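
/* Usage illustration (comment only): for a V8SImode reduction the hook
   returns V4SImode, so the caller is expected to combine the upper and
   lower V4SImode halves first and carry on reducing inside a single
   128-bit register, avoiding further cross-lane shuffles.  */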
23257
23258 /* Generate call to __divmoddi4. */
23259
23260 void
23261 ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
23262 rtx op0, rtx op1,
23263 rtx *quot_p, rtx *rem_p)
23264 {
23265 rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
23266
23267 rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
23268 mode, op0, mode, op1, mode,
23269 XEXP (rem, 0), Pmode);
23270 *quot_p = quot;
23271 *rem_p = rem;
23272 }
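
/* The call above assumes the libgcc-style interface, roughly

     long long __divmoddi4 (long long a, long long b, long long *rem);

   i.e. the quotient is the return value and the remainder is stored
   through the last argument, which is why the address of a stack slot is
   passed as the final parameter.  */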
23273
23274 void
23275 ix86_expand_atomic_fetch_op_loop (rtx target, rtx mem, rtx val,
23276 enum rtx_code code, bool after,
23277 bool doubleword)
23278 {
23279 rtx old_reg, new_reg, old_mem, success;
23280 machine_mode mode = GET_MODE (target);
23281 rtx_code_label *loop_label = NULL;
23282
23283 old_reg = gen_reg_rtx (mode);
23284 new_reg = old_reg;
23285 old_mem = copy_to_reg (mem);
23286 loop_label = gen_label_rtx ();
23287 emit_label (loop_label);
23288 emit_move_insn (old_reg, old_mem);
23289
23290 /* The return value of atomic_fetch_op is the value before the operation. */
23291 if (!after)
23292 emit_move_insn (target, old_reg);
23293
23294 if (code == NOT)
23295 {
23296 new_reg = expand_simple_binop (mode, AND, new_reg, val, NULL_RTX,
23297 true, OPTAB_LIB_WIDEN);
23298 new_reg = expand_simple_unop (mode, code, new_reg, NULL_RTX, true);
23299 }
23300 else
23301 new_reg = expand_simple_binop (mode, code, new_reg, val, NULL_RTX,
23302 true, OPTAB_LIB_WIDEN);
23303
23304 /* The return value of atomic_op_fetch is the value after the operation. */
23305 if (after)
23306 emit_move_insn (target, new_reg);
23307
23308 success = NULL_RTX;
23309
23310 ix86_expand_cmpxchg_loop (&success, old_mem, mem, old_reg, new_reg,
23311 gen_int_mode (MEMMODEL_SYNC_SEQ_CST,
23312 SImode),
23313 doubleword, loop_label);
23314 }
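
/* Shape of the emitted sequence in pseudo C (illustration only):

     old_mem = *mem;                    // plain load outside the loop
   loop:
     old = old_mem;
     if (!after) target = old;          // atomic_fetch_<op> returns the old value
     tmp = old <op> val;                // NOT is expanded as ~(old & val)
     if (after) target = tmp;           // atomic_<op>_fetch returns the new value
     // try to compare-and-swap *mem from old to tmp; on failure the
     // cmpxchg helper updates old_mem and branches back to loop.
   */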
23315
23316 /* Relax a cmpxchg instruction: first do an atomic load and compare, and
23317 skip the cmpxchg instruction when the loaded value does not equal
23318 EXP_INPUT.  The parameter LOOP_LABEL, if non-NULL, is the label to loop
23319 back to (after a pause) when the value does not match or the cmpxchg fails. */
23320
23321 void
23322 ix86_expand_cmpxchg_loop (rtx *ptarget_bool, rtx target_val,
23323 rtx mem, rtx exp_input, rtx new_input,
23324 rtx mem_model, bool doubleword,
23325 rtx_code_label *loop_label)
23326 {
23327 rtx_code_label *cmp_label = NULL;
23328 rtx_code_label *done_label = NULL;
23329 rtx target_bool = NULL_RTX, new_mem = NULL_RTX;
23330 rtx (*gen) (rtx, rtx, rtx, rtx, rtx) = NULL;
23331 rtx (*gendw) (rtx, rtx, rtx, rtx, rtx, rtx) = NULL;
23332 machine_mode mode = GET_MODE (target_val), hmode = mode;
23333
23334 if (*ptarget_bool == NULL)
23335 target_bool = gen_reg_rtx (QImode);
23336 else
23337 target_bool = *ptarget_bool;
23338
23339 cmp_label = gen_label_rtx ();
23340 done_label = gen_label_rtx ();
23341
23342 new_mem = gen_reg_rtx (mode);
23343 /* Load memory first. */
23344 expand_atomic_load (new_mem, mem, MEMMODEL_SEQ_CST);
23345
23346 switch (mode)
23347 {
23348 case E_TImode:
23349 gendw = gen_atomic_compare_and_swapti_doubleword;
23350 hmode = DImode;
23351 break;
23352 case E_DImode:
23353 if (doubleword)
23354 {
23355 gendw = gen_atomic_compare_and_swapdi_doubleword;
23356 hmode = SImode;
23357 }
23358 else
23359 gen = gen_atomic_compare_and_swapdi_1;
23360 break;
23361 case E_SImode:
23362 gen = gen_atomic_compare_and_swapsi_1;
23363 break;
23364 case E_HImode:
23365 gen = gen_atomic_compare_and_swaphi_1;
23366 break;
23367 case E_QImode:
23368 gen = gen_atomic_compare_and_swapqi_1;
23369 break;
23370 default:
23371 gcc_unreachable ();
23372 }
23373
23374 /* Compare mem value with expected value. */
23375 if (doubleword)
23376 {
23377 rtx low_new_mem = gen_lowpart (hmode, new_mem);
23378 rtx low_exp_input = gen_lowpart (hmode, exp_input);
23379 rtx high_new_mem = gen_highpart (hmode, new_mem);
23380 rtx high_exp_input = gen_highpart (hmode, exp_input);
23381 emit_cmp_and_jump_insns (low_new_mem, low_exp_input, NE, NULL_RTX,
23382 hmode, 1, cmp_label,
23383 profile_probability::guessed_never ());
23384 emit_cmp_and_jump_insns (high_new_mem, high_exp_input, NE, NULL_RTX,
23385 hmode, 1, cmp_label,
23386 profile_probability::guessed_never ());
23387 }
23388 else
23389 emit_cmp_and_jump_insns (new_mem, exp_input, NE, NULL_RTX,
23390 GET_MODE (exp_input), 1, cmp_label,
23391 profile_probability::guessed_never ());
23392
23393 /* Directly emit cmpxchg here. */
23394 if (doubleword)
23395 emit_insn (gendw (target_val, mem, exp_input,
23396 gen_lowpart (hmode, new_input),
23397 gen_highpart (hmode, new_input),
23398 mem_model));
23399 else
23400 emit_insn (gen (target_val, mem, exp_input, new_input, mem_model));
23401
23402 if (!loop_label)
23403 {
23404 emit_jump_insn (gen_jump (done_label));
23405 emit_barrier ();
23406 emit_label (cmp_label);
23407 emit_move_insn (target_val, new_mem);
23408 emit_label (done_label);
23409 ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
23410 const0_rtx);
23411 }
23412 else
23413 {
23414 ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
23415 const0_rtx);
23416 emit_cmp_and_jump_insns (target_bool, const0_rtx, EQ, const0_rtx,
23417 GET_MODE (target_bool), 1, loop_label,
23418 profile_probability::guessed_never ());
23419 emit_jump_insn (gen_jump (done_label));
23420 emit_barrier ();
23421
23422 /* If mem is not expected, pause and loop back. */
23423 emit_label (cmp_label);
23424 emit_move_insn (target_val, new_mem);
23425 emit_insn (gen_pause ());
23426 emit_jump_insn (gen_jump (loop_label));
23427 emit_barrier ();
23428 emit_label (done_label);
23429 }
23430
23431 *ptarget_bool = target_bool;
23432 }
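
/* Pseudo-C shape of the sequence above when LOOP_LABEL is NULL
   (illustration only):

     tmp = atomic_load (mem);
     if (tmp != exp_input)
       {
         target_val = tmp;              // report the observed value
         success = false;
       }
     else
       success = cmpxchg (mem, exp_input, new_input, &target_val);

   With a non-NULL LOOP_LABEL, a failed cmpxchg branches straight back to
   LOOP_LABEL, and the mismatch path executes a pause before doing the
   same, instead of falling through.  */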
23433
23434 #include "gt-i386-expand.h"