gcc/config/i386/i386-expand.cc
1 /* Copyright (C) 1988-2023 Free Software Foundation, Inc.
2
3 This file is part of GCC.
4
5 GCC is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
9
10 GCC is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with GCC; see the file COPYING3. If not see
17 <http://www.gnu.org/licenses/>. */
18
19 #define IN_TARGET_CODE 1
20
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "backend.h"
25 #include "rtl.h"
26 #include "tree.h"
27 #include "memmodel.h"
28 #include "gimple.h"
29 #include "cfghooks.h"
30 #include "cfgloop.h"
31 #include "df.h"
32 #include "tm_p.h"
33 #include "stringpool.h"
34 #include "expmed.h"
35 #include "optabs.h"
36 #include "regs.h"
37 #include "emit-rtl.h"
38 #include "recog.h"
39 #include "cgraph.h"
40 #include "diagnostic.h"
41 #include "cfgbuild.h"
42 #include "alias.h"
43 #include "fold-const.h"
44 #include "attribs.h"
45 #include "calls.h"
46 #include "stor-layout.h"
47 #include "varasm.h"
48 #include "output.h"
49 #include "insn-attr.h"
50 #include "flags.h"
51 #include "except.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "cfgrtl.h"
55 #include "common/common-target.h"
56 #include "langhooks.h"
57 #include "reload.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "tm-constrs.h"
61 #include "cselib.h"
62 #include "sched-int.h"
63 #include "opts.h"
64 #include "tree-pass.h"
65 #include "context.h"
66 #include "pass_manager.h"
67 #include "target-globals.h"
68 #include "gimple-iterator.h"
69 #include "tree-vectorizer.h"
70 #include "shrink-wrap.h"
71 #include "builtins.h"
72 #include "rtl-iter.h"
73 #include "tree-iterator.h"
74 #include "dbgcnt.h"
75 #include "case-cfn-macros.h"
76 #include "dojump.h"
77 #include "fold-const-call.h"
78 #include "tree-vrp.h"
79 #include "tree-ssanames.h"
80 #include "selftest.h"
81 #include "selftest-rtl.h"
82 #include "print-rtl.h"
83 #include "intl.h"
84 #include "ifcvt.h"
85 #include "symbol-summary.h"
86 #include "ipa-prop.h"
87 #include "ipa-fnsummary.h"
88 #include "wide-int-bitmask.h"
89 #include "tree-vector-builder.h"
90 #include "debug.h"
91 #include "dwarf2out.h"
92 #include "i386-options.h"
93 #include "i386-builtins.h"
94 #include "i386-expand.h"
95 #include "asan.h"
96
97 /* Split one or more double-mode RTL references into pairs of half-mode
98 references. The RTL can be REG, offsettable MEM, integer constant, or
99 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
100 split and "num" is its length. lo_half and hi_half are output arrays
101 that parallel "operands". */
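/* Illustrative sketch (not part of the original sources): on little-endian
   x86, splitting a DImode pseudo such as (reg:DI 100) yields
   (subreg:SI (reg:DI 100) 0) in lo_half[] and (subreg:SI (reg:DI 100) 4)
   in hi_half[], while an offsettable MEM is split with adjust_address
   into MEM+0 and MEM+4.  */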
102
103 void
104 split_double_mode (machine_mode mode, rtx operands[],
105 int num, rtx lo_half[], rtx hi_half[])
106 {
107 machine_mode half_mode;
108 unsigned int byte;
109 rtx mem_op = NULL_RTX;
110 int mem_num = 0;
111
112 switch (mode)
113 {
114 case E_TImode:
115 half_mode = DImode;
116 break;
117 case E_DImode:
118 half_mode = SImode;
119 break;
120 case E_P2HImode:
121 half_mode = HImode;
122 break;
123 case E_P2QImode:
124 half_mode = QImode;
125 break;
126 default:
127 gcc_unreachable ();
128 }
129
130 byte = GET_MODE_SIZE (half_mode);
131
132 while (num--)
133 {
134 rtx op = operands[num];
135
136 /* simplify_subreg refuses to split volatile memory addresses,
137 but we still have to handle them. */
138 if (MEM_P (op))
139 {
140 if (mem_op && rtx_equal_p (op, mem_op))
141 {
142 lo_half[num] = lo_half[mem_num];
143 hi_half[num] = hi_half[mem_num];
144 }
145 else
146 {
147 mem_op = op;
148 mem_num = num;
149 lo_half[num] = adjust_address (op, half_mode, 0);
150 hi_half[num] = adjust_address (op, half_mode, byte);
151 }
152 }
153 else
154 {
155 lo_half[num] = simplify_gen_subreg (half_mode, op,
156 GET_MODE (op) == VOIDmode
157 ? mode : GET_MODE (op), 0);
158
159 rtx tmp = simplify_gen_subreg (half_mode, op,
160 GET_MODE (op) == VOIDmode
161 ? mode : GET_MODE (op), byte);
162 /* simplify_gen_subreg will return NULL RTX for the
163 high half of the paradoxical subreg. */
164 hi_half[num] = tmp ? tmp : gen_reg_rtx (half_mode);
165 }
166 }
167 }
168
169 /* Emit the double word assignment DST = { LO, HI }. */
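/* Illustrative note (not part of the original sources): DST is split into
   half-mode pieces dlo/dhi and at most two moves are emitted, ordered so
   that a MEM operand whose address uses dlo or dhi is read before that
   register is overwritten; if LO and HI already live in DST's halves but
   in the opposite order, a single register swap is emitted instead.  */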
170
171 void
172 split_double_concat (machine_mode mode, rtx dst, rtx lo, rtx hi)
173 {
174 rtx dlo, dhi;
175 int deleted_move_count = 0;
176 split_double_mode (mode, &dst, 1, &dlo, &dhi);
177 /* Constraints ensure that if both lo and hi are MEMs, then
178 dst has early-clobber and thus addresses of MEMs don't use
179 dlo/dhi registers. Otherwise if at least one of lo and hi is a MEM,
180 dlo/dhi are registers. */
181 if (MEM_P (lo)
182 && rtx_equal_p (dlo, hi)
183 && reg_overlap_mentioned_p (dhi, lo))
184 {
185 /* If dlo is same as hi and lo's address uses dhi register,
186 code below would first emit_move_insn (dhi, hi)
187 and then emit_move_insn (dlo, lo). But the former
188 would invalidate lo's address. Load into dhi first,
189 then swap. */
190 emit_move_insn (dhi, lo);
191 lo = dhi;
192 }
193 else if (MEM_P (hi)
194 && !MEM_P (lo)
195 && !rtx_equal_p (dlo, lo)
196 && reg_overlap_mentioned_p (dlo, hi))
197 {
198 /* In this case, code below would first emit_move_insn (dlo, lo)
199 and then emit_move_insn (dhi, hi). But the former would
200 invalidate hi's address. Load into dhi first. */
201 emit_move_insn (dhi, hi);
202 hi = dhi;
203 }
204 if (!rtx_equal_p (dlo, hi))
205 {
206 if (!rtx_equal_p (dlo, lo))
207 emit_move_insn (dlo, lo);
208 else
209 deleted_move_count++;
210 if (!rtx_equal_p (dhi, hi))
211 emit_move_insn (dhi, hi);
212 else
213 deleted_move_count++;
214 }
215 else if (!rtx_equal_p (lo, dhi))
216 {
217 if (!rtx_equal_p (dhi, hi))
218 emit_move_insn (dhi, hi);
219 else
220 deleted_move_count++;
221 if (!rtx_equal_p (dlo, lo))
222 emit_move_insn (dlo, lo);
223 else
224 deleted_move_count++;
225 }
226 else if (mode == TImode)
227 emit_insn (gen_swapdi (dlo, dhi));
228 else
229 emit_insn (gen_swapsi (dlo, dhi));
230
231 if (deleted_move_count == 2)
232 emit_note (NOTE_INSN_DELETED);
233 }
234
235
236 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
237 for the target. */
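/* Illustrative example (not part of the original sources): clearing %eax
   normally becomes "xorl %eax, %eax" wrapped in a PARALLEL with a flags
   clobber; a plain "movl $0, %eax" is kept only when TARGET_USE_MOV0 is
   set and we are not optimizing for size.  Destinations narrower than
   4 bytes are widened to SImode first.  */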
238
239 void
240 ix86_expand_clear (rtx dest)
241 {
242 rtx tmp;
243
244 /* We play register width games, which are only valid after reload. */
245 gcc_assert (reload_completed);
246
247 /* Avoid HImode and its attendant prefix byte. */
248 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
249 dest = gen_rtx_REG (SImode, REGNO (dest));
250 tmp = gen_rtx_SET (dest, const0_rtx);
251
252 if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
253 {
254 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
255 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
256 }
257
258 emit_insn (tmp);
259 }
260
261 /* Return true if V can be broadcast from an integer of WIDTH bits
262 which is returned in VAL_BROADCAST. Otherwise, return false. */
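/* For example (illustrative only, not from the original sources):
   ix86_broadcast (0x1234123412341234, 32, val) returns true with
   val == 0x12341234, while ix86_broadcast (0x1234000012341234, 32, val)
   returns false because the two 32-bit halves differ.  */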
263
264 static bool
265 ix86_broadcast (HOST_WIDE_INT v, unsigned int width,
266 HOST_WIDE_INT &val_broadcast)
267 {
268 wide_int val = wi::uhwi (v, HOST_BITS_PER_WIDE_INT);
269 val_broadcast = wi::extract_uhwi (val, 0, width);
270 for (unsigned int i = width; i < HOST_BITS_PER_WIDE_INT; i += width)
271 {
272 HOST_WIDE_INT each = wi::extract_uhwi (val, i, width);
273 if (val_broadcast != each)
274 return false;
275 }
276 val_broadcast = sext_hwi (val_broadcast, width);
277 return true;
278 }
279
280 /* Convert the CONST_WIDE_INT operand OP to a vector broadcast in MODE, or return NULL. */
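/* Illustrative example (not part of the original sources): a 256-bit
   CONST_WIDE_INT whose 64-bit elements are all 0x0101010101010101 can be
   materialized as a QImode broadcast of 1 into a V32QImode register
   (e.g. vpbroadcastb under AVX2) and then taken as a subreg in MODE,
   instead of being loaded from the constant pool.  */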
281
282 static rtx
283 ix86_convert_const_wide_int_to_broadcast (machine_mode mode, rtx op)
284 {
285 /* Don't use integer vector broadcast if we can't move from GPR to SSE
286 register directly. */
287 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
288 return nullptr;
289
290 /* Convert CONST_WIDE_INT to a non-standard SSE constant integer
291 broadcast only if vector broadcast is available. */
292 if (!TARGET_AVX
293 || !CONST_WIDE_INT_P (op)
294 || standard_sse_constant_p (op, mode))
295 return nullptr;
296
297 HOST_WIDE_INT val = CONST_WIDE_INT_ELT (op, 0);
298 HOST_WIDE_INT val_broadcast;
299 scalar_int_mode broadcast_mode;
300 if (TARGET_AVX2
301 && ix86_broadcast (val, GET_MODE_BITSIZE (QImode),
302 val_broadcast))
303 broadcast_mode = QImode;
304 else if (TARGET_AVX2
305 && ix86_broadcast (val, GET_MODE_BITSIZE (HImode),
306 val_broadcast))
307 broadcast_mode = HImode;
308 else if (ix86_broadcast (val, GET_MODE_BITSIZE (SImode),
309 val_broadcast))
310 broadcast_mode = SImode;
311 else if (TARGET_64BIT
312 && ix86_broadcast (val, GET_MODE_BITSIZE (DImode),
313 val_broadcast))
314 broadcast_mode = DImode;
315 else
316 return nullptr;
317
318 /* Check if OP can be broadcasted from VAL. */
319 for (int i = 1; i < CONST_WIDE_INT_NUNITS (op); i++)
320 if (val != CONST_WIDE_INT_ELT (op, i))
321 return nullptr;
322
323 unsigned int nunits = (GET_MODE_SIZE (mode)
324 / GET_MODE_SIZE (broadcast_mode));
325 machine_mode vector_mode;
326 if (!mode_for_vector (broadcast_mode, nunits).exists (&vector_mode))
327 gcc_unreachable ();
328 rtx target = ix86_gen_scratch_sse_rtx (vector_mode);
329 bool ok = ix86_expand_vector_init_duplicate (false, vector_mode,
330 target,
331 GEN_INT (val_broadcast));
332 gcc_assert (ok);
333 target = lowpart_subreg (mode, target, vector_mode);
334 return target;
335 }
336
337 void
338 ix86_expand_move (machine_mode mode, rtx operands[])
339 {
340 rtx op0, op1;
341 rtx tmp, addend = NULL_RTX;
342 enum tls_model model;
343
344 op0 = operands[0];
345 op1 = operands[1];
346
347 /* Avoid complex sets of likely spilled hard registers before reload. */
348 if (!ix86_hardreg_mov_ok (op0, op1))
349 {
350 tmp = gen_reg_rtx (mode);
351 operands[0] = tmp;
352 ix86_expand_move (mode, operands);
353 operands[0] = op0;
354 operands[1] = tmp;
355 op1 = tmp;
356 }
357
358 switch (GET_CODE (op1))
359 {
360 case CONST:
361 tmp = XEXP (op1, 0);
362
363 if (GET_CODE (tmp) != PLUS
364 || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
365 break;
366
367 op1 = XEXP (tmp, 0);
368 addend = XEXP (tmp, 1);
369 /* FALLTHRU */
370
371 case SYMBOL_REF:
372 model = SYMBOL_REF_TLS_MODEL (op1);
373
374 if (model)
375 op1 = legitimize_tls_address (op1, model, true);
376 else if (ix86_force_load_from_GOT_p (op1))
377 {
378 /* Load the external function address via GOT slot to avoid PLT. */
379 op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
380 (TARGET_64BIT
381 ? UNSPEC_GOTPCREL
382 : UNSPEC_GOT));
383 op1 = gen_rtx_CONST (Pmode, op1);
384 op1 = gen_const_mem (Pmode, op1);
385 set_mem_alias_set (op1, ix86_GOT_alias_set ());
386 }
387 else
388 {
389 tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
390 if (tmp)
391 {
392 op1 = tmp;
393 if (!addend)
394 break;
395 }
396 else
397 {
398 op1 = operands[1];
399 break;
400 }
401 }
402
403 if (addend)
404 {
405 op1 = force_operand (op1, NULL_RTX);
406 op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
407 op0, 1, OPTAB_DIRECT);
408 }
409 else
410 op1 = force_operand (op1, op0);
411
412 if (op1 == op0)
413 return;
414
415 op1 = convert_to_mode (mode, op1, 1);
416
417 default:
418 break;
419 }
420
421 if ((flag_pic || MACHOPIC_INDIRECT)
422 && symbolic_operand (op1, mode))
423 {
424 if (TARGET_MACHO && !TARGET_64BIT)
425 {
426 #if TARGET_MACHO
427 /* dynamic-no-pic */
428 if (MACHOPIC_INDIRECT)
429 {
430 rtx temp = (op0 && REG_P (op0) && mode == Pmode)
431 ? op0 : gen_reg_rtx (Pmode);
432 op1 = machopic_indirect_data_reference (op1, temp);
433 if (MACHOPIC_PURE)
434 op1 = machopic_legitimize_pic_address (op1, mode,
435 temp == op1 ? 0 : temp);
436 }
437 if (op0 != op1 && GET_CODE (op0) != MEM)
438 {
439 rtx insn = gen_rtx_SET (op0, op1);
440 emit_insn (insn);
441 return;
442 }
443 if (GET_CODE (op0) == MEM)
444 op1 = force_reg (Pmode, op1);
445 else
446 {
447 rtx temp = op0;
448 if (GET_CODE (temp) != REG)
449 temp = gen_reg_rtx (Pmode);
450 temp = legitimize_pic_address (op1, temp);
451 if (temp == op0)
452 return;
453 op1 = temp;
454 }
455 /* dynamic-no-pic */
456 #endif
457 }
458 else
459 {
460 if (MEM_P (op0))
461 op1 = force_reg (mode, op1);
462 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
463 {
464 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
465 op1 = legitimize_pic_address (op1, reg);
466 if (op0 == op1)
467 return;
468 op1 = convert_to_mode (mode, op1, 1);
469 }
470 }
471 }
472 else
473 {
474 if (MEM_P (op0)
475 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
476 || !push_operand (op0, mode))
477 && MEM_P (op1))
478 op1 = force_reg (mode, op1);
479
480 if (push_operand (op0, mode)
481 && ! general_no_elim_operand (op1, mode))
482 op1 = copy_to_mode_reg (mode, op1);
483
484 /* Force large constants in 64-bit compilation into a register
485 so that they can be CSEed. */
486 if (can_create_pseudo_p ()
487 && (mode == DImode) && TARGET_64BIT
488 && immediate_operand (op1, mode)
489 && !x86_64_zext_immediate_operand (op1, VOIDmode)
490 && !register_operand (op0, mode)
491 && optimize)
492 op1 = copy_to_mode_reg (mode, op1);
493
494 if (can_create_pseudo_p ())
495 {
496 if (CONST_DOUBLE_P (op1))
497 {
498 /* If we are loading a floating point constant to a
499 register, force the value to memory now, since we'll
500 get better code out of the back end. */
501
502 op1 = validize_mem (force_const_mem (mode, op1));
503 if (!register_operand (op0, mode))
504 {
505 rtx temp = gen_reg_rtx (mode);
506 emit_insn (gen_rtx_SET (temp, op1));
507 emit_move_insn (op0, temp);
508 return;
509 }
510 }
511 else if (GET_MODE_SIZE (mode) >= 16)
512 {
513 rtx tmp = ix86_convert_const_wide_int_to_broadcast
514 (GET_MODE (op0), op1);
515 if (tmp != nullptr)
516 op1 = tmp;
517 }
518 }
519 }
520
521 emit_insn (gen_rtx_SET (op0, op1));
522 }
523
524 /* OP is a memref of a CONST_VECTOR; return the scalar constant mem
525 if the CONST_VECTOR is a vec_duplicate, else return NULL. */
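/* Illustrative example (not part of the original sources): for a V8SFmode
   load of a constant-pool vector whose eight elements are all 4.0f, this
   returns the 4.0f element; the caller (ix86_expand_vector_move) forces it
   to a scalar memory slot for float modes and broadcasts from it
   (e.g. vbroadcastss) instead of loading the full 32-byte constant.  */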
526 static rtx
527 ix86_broadcast_from_constant (machine_mode mode, rtx op)
528 {
529 int nunits = GET_MODE_NUNITS (mode);
530 if (nunits < 2)
531 return nullptr;
532
533 /* Don't use integer vector broadcast if we can't move from GPR to SSE
534 register directly. */
535 if (!TARGET_INTER_UNIT_MOVES_TO_VEC
536 && INTEGRAL_MODE_P (mode))
537 return nullptr;
538
539 /* Convert CONST_VECTOR to a non-standard SSE constant integer
540 broadcast only if vector broadcast is available. */
541 if (!(TARGET_AVX2
542 || (TARGET_AVX
543 && (GET_MODE_INNER (mode) == SImode
544 || GET_MODE_INNER (mode) == DImode))
545 || FLOAT_MODE_P (mode))
546 || standard_sse_constant_p (op, mode))
547 return nullptr;
548
549 /* Don't broadcast from a 64-bit integer constant in 32-bit mode.
550 We can still put a 64-bit integer constant in memory when
551 the AVX512 embedded broadcast is available. */
552 if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT
553 && (!TARGET_AVX512F
554 || (GET_MODE_SIZE (mode) < 64 && !TARGET_AVX512VL)))
555 return nullptr;
556
557 if (GET_MODE_INNER (mode) == TImode)
558 return nullptr;
559
560 rtx constant = get_pool_constant (XEXP (op, 0));
561 if (GET_CODE (constant) != CONST_VECTOR)
562 return nullptr;
563
564 /* There could be some rtx like
565 (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
566 but with "*.LC1" referring to a V2DI constant vector. */
567 if (GET_MODE (constant) != mode)
568 {
569 constant = simplify_subreg (mode, constant, GET_MODE (constant),
570 0);
571 if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR)
572 return nullptr;
573 }
574
575 rtx first = XVECEXP (constant, 0, 0);
576
577 for (int i = 1; i < nunits; ++i)
578 {
579 rtx tmp = XVECEXP (constant, 0, i);
580 /* Vector duplicate value. */
581 if (!rtx_equal_p (tmp, first))
582 return nullptr;
583 }
584
585 return first;
586 }
587
588 void
589 ix86_expand_vector_move (machine_mode mode, rtx operands[])
590 {
591 rtx op0 = operands[0], op1 = operands[1];
592 /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
593 psABI since the biggest alignment is 4 bytes for the IA MCU psABI. */
594 unsigned int align = (TARGET_IAMCU
595 ? GET_MODE_BITSIZE (mode)
596 : GET_MODE_ALIGNMENT (mode));
597
598 if (push_operand (op0, VOIDmode))
599 op0 = emit_move_resolve_push (mode, op0);
600
601 /* Force constants other than zero into memory. We do not know how
602 the instructions used to build constants modify the upper 64 bits
603 of the register, once we have that information we may be able
604 to handle some of them more efficiently. */
605 if (can_create_pseudo_p ()
606 && (CONSTANT_P (op1)
607 || (SUBREG_P (op1)
608 && CONSTANT_P (SUBREG_REG (op1))))
609 && ((register_operand (op0, mode)
610 && !standard_sse_constant_p (op1, mode))
611 /* ix86_expand_vector_move_misalign() does not like constants. */
612 || (SSE_REG_MODE_P (mode)
613 && MEM_P (op0)
614 && MEM_ALIGN (op0) < align)))
615 {
616 if (SUBREG_P (op1))
617 {
618 machine_mode imode = GET_MODE (SUBREG_REG (op1));
619 rtx r = force_const_mem (imode, SUBREG_REG (op1));
620 if (r)
621 r = validize_mem (r);
622 else
623 r = force_reg (imode, SUBREG_REG (op1));
624 op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
625 }
626 else
627 {
628 machine_mode mode = GET_MODE (op0);
629 rtx tmp = ix86_convert_const_wide_int_to_broadcast
630 (mode, op1);
631 if (tmp == nullptr)
632 op1 = validize_mem (force_const_mem (mode, op1));
633 else
634 op1 = tmp;
635 }
636 }
637
638 if (can_create_pseudo_p ()
639 && GET_MODE_SIZE (mode) >= 16
640 && VECTOR_MODE_P (mode)
641 && (MEM_P (op1)
642 && SYMBOL_REF_P (XEXP (op1, 0))
643 && CONSTANT_POOL_ADDRESS_P (XEXP (op1, 0))))
644 {
645 rtx first = ix86_broadcast_from_constant (mode, op1);
646 if (first != nullptr)
647 {
648 /* Broadcast to XMM/YMM/ZMM register from an integer
649 constant or scalar mem. */
650 op1 = gen_reg_rtx (mode);
651 if (FLOAT_MODE_P (mode)
652 || (!TARGET_64BIT && GET_MODE_INNER (mode) == DImode))
653 first = force_const_mem (GET_MODE_INNER (mode), first);
654 bool ok = ix86_expand_vector_init_duplicate (false, mode,
655 op1, first);
656 gcc_assert (ok);
657 emit_move_insn (op0, op1);
658 return;
659 }
660 }
661
662 /* We need to check memory alignment for SSE modes since attributes
663 can make operands unaligned. */
664 if (can_create_pseudo_p ()
665 && SSE_REG_MODE_P (mode)
666 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
667 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
668 {
669 rtx tmp[2];
670
671 /* ix86_expand_vector_move_misalign() does not like both
672 arguments in memory. */
673 if (!register_operand (op0, mode)
674 && !register_operand (op1, mode))
675 {
676 rtx scratch = ix86_gen_scratch_sse_rtx (mode);
677 emit_move_insn (scratch, op1);
678 op1 = scratch;
679 }
680
681 tmp[0] = op0; tmp[1] = op1;
682 ix86_expand_vector_move_misalign (mode, tmp);
683 return;
684 }
685
686 /* Special case TImode to V1TImode conversions, via V2DI. */
687 if (mode == V1TImode
688 && SUBREG_P (op1)
689 && GET_MODE (SUBREG_REG (op1)) == TImode
690 && TARGET_64BIT && TARGET_SSE
691 && can_create_pseudo_p ())
692 {
693 rtx tmp = gen_reg_rtx (V2DImode);
694 rtx lo = gen_reg_rtx (DImode);
695 rtx hi = gen_reg_rtx (DImode);
696 emit_move_insn (lo, gen_lowpart (DImode, SUBREG_REG (op1)));
697 emit_move_insn (hi, gen_highpart (DImode, SUBREG_REG (op1)));
698 emit_insn (gen_vec_concatv2di (tmp, lo, hi));
699 emit_move_insn (op0, gen_lowpart (V1TImode, tmp));
700 return;
701 }
702
703 /* If operand0 is a hard register, make operand1 a pseudo. */
704 if (can_create_pseudo_p ()
705 && !ix86_hardreg_mov_ok (op0, op1))
706 {
707 rtx tmp = gen_reg_rtx (GET_MODE (op0));
708 emit_move_insn (tmp, op1);
709 emit_move_insn (op0, tmp);
710 return;
711 }
712
713 /* Make operand1 a register if it isn't already. */
714 if (can_create_pseudo_p ()
715 && !register_operand (op0, mode)
716 && !register_operand (op1, mode))
717 {
718 rtx tmp = ix86_gen_scratch_sse_rtx (GET_MODE (op0));
719 emit_move_insn (tmp, op1);
720 emit_move_insn (op0, tmp);
721 return;
722 }
723
724 emit_insn (gen_rtx_SET (op0, op1));
725 }
726
727 /* Split 32-byte AVX unaligned load and store if needed. */
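/* Illustrative note (not part of the original sources): when splitting is
   enabled, an unaligned 32-byte load is emitted as a 16-byte load of the
   low half followed by a VEC_CONCAT with the high half (typically
   vmovups + vinsertf128), and an unaligned 32-byte store becomes two
   16-byte vextractf128 stores.  */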
728
729 static void
730 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
731 {
732 rtx m;
733 rtx (*extract) (rtx, rtx, rtx);
734 machine_mode mode;
735
736 if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
737 || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
738 {
739 emit_insn (gen_rtx_SET (op0, op1));
740 return;
741 }
742
743 rtx orig_op0 = NULL_RTX;
744 mode = GET_MODE (op0);
745 switch (GET_MODE_CLASS (mode))
746 {
747 case MODE_VECTOR_INT:
748 case MODE_INT:
749 if (mode != V32QImode)
750 {
751 if (!MEM_P (op0))
752 {
753 orig_op0 = op0;
754 op0 = gen_reg_rtx (V32QImode);
755 }
756 else
757 op0 = gen_lowpart (V32QImode, op0);
758 op1 = gen_lowpart (V32QImode, op1);
759 mode = V32QImode;
760 }
761 break;
762 case MODE_VECTOR_FLOAT:
763 break;
764 default:
765 gcc_unreachable ();
766 }
767
768 switch (mode)
769 {
770 default:
771 gcc_unreachable ();
772 case E_V32QImode:
773 extract = gen_avx_vextractf128v32qi;
774 mode = V16QImode;
775 break;
776 case E_V16BFmode:
777 extract = gen_avx_vextractf128v16bf;
778 mode = V8BFmode;
779 break;
780 case E_V16HFmode:
781 extract = gen_avx_vextractf128v16hf;
782 mode = V8HFmode;
783 break;
784 case E_V8SFmode:
785 extract = gen_avx_vextractf128v8sf;
786 mode = V4SFmode;
787 break;
788 case E_V4DFmode:
789 extract = gen_avx_vextractf128v4df;
790 mode = V2DFmode;
791 break;
792 }
793
794 if (MEM_P (op1))
795 {
796 rtx r = gen_reg_rtx (mode);
797 m = adjust_address (op1, mode, 0);
798 emit_move_insn (r, m);
799 m = adjust_address (op1, mode, 16);
800 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
801 emit_move_insn (op0, r);
802 }
803 else if (MEM_P (op0))
804 {
805 m = adjust_address (op0, mode, 0);
806 emit_insn (extract (m, op1, const0_rtx));
807 m = adjust_address (op0, mode, 16);
808 emit_insn (extract (m, copy_rtx (op1), const1_rtx));
809 }
810 else
811 gcc_unreachable ();
812
813 if (orig_op0)
814 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
815 }
816
817 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
818 straight to ix86_expand_vector_move. */
819 /* Code generation for scalar reg-reg moves of single and double precision data:
820 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
821 movaps reg, reg
822 else
823 movss reg, reg
824 if (x86_sse_partial_reg_dependency == true)
825 movapd reg, reg
826 else
827 movsd reg, reg
828
829 Code generation for scalar loads of double precision data:
830 if (x86_sse_split_regs == true)
831 movlpd mem, reg (gas syntax)
832 else
833 movsd mem, reg
834
835 Code generation for unaligned packed loads of single precision data
836 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
837 if (x86_sse_unaligned_move_optimal)
838 movups mem, reg
839
840 if (x86_sse_partial_reg_dependency == true)
841 {
842 xorps reg, reg
843 movlps mem, reg
844 movhps mem+8, reg
845 }
846 else
847 {
848 movlps mem, reg
849 movhps mem+8, reg
850 }
851
852 Code generation for unaligned packed loads of double precision data
853 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
854 if (x86_sse_unaligned_move_optimal)
855 movupd mem, reg
856
857 if (x86_sse_split_regs == true)
858 {
859 movlpd mem, reg
860 movhpd mem+8, reg
861 }
862 else
863 {
864 movsd mem, reg
865 movhpd mem+8, reg
866 }
867 */
868
869 void
870 ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
871 {
872 rtx op0, op1, m;
873
874 op0 = operands[0];
875 op1 = operands[1];
876
877 /* Use unaligned load/store for AVX512 or when optimizing for size. */
878 if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
879 {
880 emit_insn (gen_rtx_SET (op0, op1));
881 return;
882 }
883
884 if (TARGET_AVX)
885 {
886 if (GET_MODE_SIZE (mode) == 32)
887 ix86_avx256_split_vector_move_misalign (op0, op1);
888 else
889 /* Always use 128-bit mov<mode>_internal pattern for AVX. */
890 emit_insn (gen_rtx_SET (op0, op1));
891 return;
892 }
893
894 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
895 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
896 {
897 emit_insn (gen_rtx_SET (op0, op1));
898 return;
899 }
900
901 /* ??? If we have typed data, then it would appear that using
902 movdqu is the only way to get unaligned data loaded with
903 integer type. */
904 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
905 {
906 emit_insn (gen_rtx_SET (op0, op1));
907 return;
908 }
909
910 if (MEM_P (op1))
911 {
912 if (TARGET_SSE2 && mode == V2DFmode)
913 {
914 rtx zero;
915
916 /* When SSE registers are split into halves, we can avoid
917 writing to the top half twice. */
918 if (TARGET_SSE_SPLIT_REGS)
919 {
920 emit_clobber (op0);
921 zero = op0;
922 }
923 else
924 {
925 /* ??? Not sure about the best option for the Intel chips.
926 The following would seem to satisfy; the register is
927 entirely cleared, breaking the dependency chain. We
928 then store to the upper half, with a dependency depth
929 of one. A rumor has it that Intel recommends two movsd
930 followed by an unpacklpd, but this is unconfirmed. And
931 given that the dependency depth of the unpacklpd would
932 still be one, I'm not sure why this would be better. */
933 zero = CONST0_RTX (V2DFmode);
934 }
935
936 m = adjust_address (op1, DFmode, 0);
937 emit_insn (gen_sse2_loadlpd (op0, zero, m));
938 m = adjust_address (op1, DFmode, 8);
939 emit_insn (gen_sse2_loadhpd (op0, op0, m));
940 }
941 else
942 {
943 rtx t;
944
945 if (mode != V4SFmode)
946 t = gen_reg_rtx (V4SFmode);
947 else
948 t = op0;
949
950 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
951 emit_move_insn (t, CONST0_RTX (V4SFmode));
952 else
953 emit_clobber (t);
954
955 m = adjust_address (op1, V2SFmode, 0);
956 emit_insn (gen_sse_loadlps (t, t, m));
957 m = adjust_address (op1, V2SFmode, 8);
958 emit_insn (gen_sse_loadhps (t, t, m));
959 if (mode != V4SFmode)
960 emit_move_insn (op0, gen_lowpart (mode, t));
961 }
962 }
963 else if (MEM_P (op0))
964 {
965 if (TARGET_SSE2 && mode == V2DFmode)
966 {
967 m = adjust_address (op0, DFmode, 0);
968 emit_insn (gen_sse2_storelpd (m, op1));
969 m = adjust_address (op0, DFmode, 8);
970 emit_insn (gen_sse2_storehpd (m, op1));
971 }
972 else
973 {
974 if (mode != V4SFmode)
975 op1 = gen_lowpart (V4SFmode, op1);
976
977 m = adjust_address (op0, V2SFmode, 0);
978 emit_insn (gen_sse_storelps (m, op1));
979 m = adjust_address (op0, V2SFmode, 8);
980 emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
981 }
982 }
983 else
984 gcc_unreachable ();
985 }
986
987 /* Move bits 64:95 to bits 32:63. */
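/* Illustrative note (not part of the original sources): this emits a V4SI
   vec_select of OP with the permutation { 0, 2, 0, 0 }, which typically
   assembles to a pshufd; only the low two elements matter for the
   64-bit MMX-sized result.  */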
988
989 void
990 ix86_move_vector_high_sse_to_mmx (rtx op)
991 {
992 rtx mask = gen_rtx_PARALLEL (VOIDmode,
993 gen_rtvec (4, GEN_INT (0), GEN_INT (2),
994 GEN_INT (0), GEN_INT (0)));
995 rtx dest = lowpart_subreg (V4SImode, op, GET_MODE (op));
996 op = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
997 rtx insn = gen_rtx_SET (dest, op);
998 emit_insn (insn);
999 }
1000
1001 /* Split MMX pack with signed/unsigned saturation with SSE/SSE2. */
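/* Illustrative sketch (not part of the original sources): a V8QI packsswb
   of two V4HI operands is rewritten as a 128-bit pack (a VEC_CONCAT of two
   saturating truncations) on the corresponding SSE registers; the second
   operand's packed bytes then sit at bits 64:95, so
   ix86_move_vector_high_sse_to_mmx moves them down next to the first
   operand's bytes to form the 64-bit result.  */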
1002
1003 void
1004 ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
1005 {
1006 rtx op0 = operands[0];
1007 rtx op1 = operands[1];
1008 rtx op2 = operands[2];
1009
1010 machine_mode dmode = GET_MODE (op0);
1011 machine_mode smode = GET_MODE (op1);
1012 machine_mode inner_dmode = GET_MODE_INNER (dmode);
1013 machine_mode inner_smode = GET_MODE_INNER (smode);
1014
1015 /* Get the corresponding SSE mode for destination. */
1016 int nunits = 16 / GET_MODE_SIZE (inner_dmode);
1017 machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode),
1018 nunits).require ();
1019 machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode),
1020 nunits / 2).require ();
1021
1022 /* Get the corresponding SSE mode for source. */
1023 nunits = 16 / GET_MODE_SIZE (inner_smode);
1024 machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode),
1025 nunits).require ();
1026
1027 /* Generate SSE pack with signed/unsigned saturation. */
1028 rtx dest = lowpart_subreg (sse_dmode, op0, GET_MODE (op0));
1029 op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
1030 op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));
1031
1032 op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
1033 op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
1034 rtx insn = gen_rtx_SET (dest, gen_rtx_VEC_CONCAT (sse_dmode,
1035 op1, op2));
1036 emit_insn (insn);
1037
1038 ix86_move_vector_high_sse_to_mmx (op0);
1039 }
1040
1041 /* Split MMX punpcklXX/punpckhXX with SSE punpcklXX. */
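/* Illustrative note (not part of the original sources): the interleave is
   always emitted as the SSE "low" punpckl form on 128-bit registers; for
   the punpckh variants (HIGH_P) the wanted data then sits in the upper
   half of the SSE register, so an extra shuffle moves it down into the
   low part.  */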
1042
1043 void
1044 ix86_split_mmx_punpck (rtx operands[], bool high_p)
1045 {
1046 rtx op0 = operands[0];
1047 rtx op1 = operands[1];
1048 rtx op2 = operands[2];
1049 machine_mode mode = GET_MODE (op0);
1050 rtx mask;
1051 /* The corresponding SSE mode. */
1052 machine_mode sse_mode, double_sse_mode;
1053
1054 switch (mode)
1055 {
1056 case E_V4QImode:
1057 case E_V8QImode:
1058 sse_mode = V16QImode;
1059 double_sse_mode = V32QImode;
1060 mask = gen_rtx_PARALLEL (VOIDmode,
1061 gen_rtvec (16,
1062 GEN_INT (0), GEN_INT (16),
1063 GEN_INT (1), GEN_INT (17),
1064 GEN_INT (2), GEN_INT (18),
1065 GEN_INT (3), GEN_INT (19),
1066 GEN_INT (4), GEN_INT (20),
1067 GEN_INT (5), GEN_INT (21),
1068 GEN_INT (6), GEN_INT (22),
1069 GEN_INT (7), GEN_INT (23)));
1070 break;
1071
1072 case E_V4HImode:
1073 case E_V2HImode:
1074 sse_mode = V8HImode;
1075 double_sse_mode = V16HImode;
1076 mask = gen_rtx_PARALLEL (VOIDmode,
1077 gen_rtvec (8,
1078 GEN_INT (0), GEN_INT (8),
1079 GEN_INT (1), GEN_INT (9),
1080 GEN_INT (2), GEN_INT (10),
1081 GEN_INT (3), GEN_INT (11)));
1082 break;
1083
1084 case E_V2SImode:
1085 sse_mode = V4SImode;
1086 double_sse_mode = V8SImode;
1087 mask = gen_rtx_PARALLEL (VOIDmode,
1088 gen_rtvec (4,
1089 GEN_INT (0), GEN_INT (4),
1090 GEN_INT (1), GEN_INT (5)));
1091 break;
1092
1093 case E_V2SFmode:
1094 sse_mode = V4SFmode;
1095 double_sse_mode = V8SFmode;
1096 mask = gen_rtx_PARALLEL (VOIDmode,
1097 gen_rtvec (4,
1098 GEN_INT (0), GEN_INT (4),
1099 GEN_INT (1), GEN_INT (5)));
1100 break;
1101
1102 default:
1103 gcc_unreachable ();
1104 }
1105
1106 /* Generate SSE punpcklXX. */
1107 rtx dest = lowpart_subreg (sse_mode, op0, GET_MODE (op0));
1108 op1 = lowpart_subreg (sse_mode, op1, GET_MODE (op1));
1109 op2 = lowpart_subreg (sse_mode, op2, GET_MODE (op2));
1110
1111 op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2);
1112 op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask);
1113 rtx insn = gen_rtx_SET (dest, op2);
1114 emit_insn (insn);
1115
1116 /* Move high bits to low bits. */
1117 if (high_p)
1118 {
1119 if (sse_mode == V4SFmode)
1120 {
1121 mask = gen_rtx_PARALLEL (VOIDmode,
1122 gen_rtvec (4, GEN_INT (2), GEN_INT (3),
1123 GEN_INT (4), GEN_INT (5)));
1124 op2 = gen_rtx_VEC_CONCAT (V8SFmode, dest, dest);
1125 op1 = gen_rtx_VEC_SELECT (V4SFmode, op2, mask);
1126 }
1127 else
1128 {
1129 int sz = GET_MODE_SIZE (mode);
1130
1131 if (sz == 4)
1132 mask = gen_rtx_PARALLEL (VOIDmode,
1133 gen_rtvec (4, GEN_INT (1), GEN_INT (0),
1134 GEN_INT (0), GEN_INT (1)));
1135 else if (sz == 8)
1136 mask = gen_rtx_PARALLEL (VOIDmode,
1137 gen_rtvec (4, GEN_INT (2), GEN_INT (3),
1138 GEN_INT (0), GEN_INT (1)));
1139 else
1140 gcc_unreachable ();
1141
1142 dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
1143 op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
1144 }
1145
1146 insn = gen_rtx_SET (dest, op1);
1147 emit_insn (insn);
1148 }
1149 }
1150
1151 /* Helper function of ix86_fixup_binary_operands to canonicalize
1152 operand order. Returns true if the operands should be swapped. */
1153
1154 static bool
1155 ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
1156 rtx operands[])
1157 {
1158 rtx dst = operands[0];
1159 rtx src1 = operands[1];
1160 rtx src2 = operands[2];
1161
1162 /* If the operation is not commutative, we can't do anything. */
1163 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
1164 && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
1165 return false;
1166
1167 /* Highest priority is that src1 should match dst. */
1168 if (rtx_equal_p (dst, src1))
1169 return false;
1170 if (rtx_equal_p (dst, src2))
1171 return true;
1172
1173 /* Next highest priority is that immediate constants come second. */
1174 if (immediate_operand (src2, mode))
1175 return false;
1176 if (immediate_operand (src1, mode))
1177 return true;
1178
1179 /* Lowest priority is that memory references should come second. */
1180 if (MEM_P (src2))
1181 return false;
1182 if (MEM_P (src1))
1183 return true;
1184
1185 return false;
1186 }
1187
1188
1189 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
1190 destination to use for the operation. If different from the true
1191 destination in operands[0], a copy operation will be required. */
1192
1193 rtx
1194 ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
1195 rtx operands[])
1196 {
1197 rtx dst = operands[0];
1198 rtx src1 = operands[1];
1199 rtx src2 = operands[2];
1200
1201 /* Canonicalize operand order. */
1202 if (ix86_swap_binary_operands_p (code, mode, operands))
1203 {
1204 /* It is invalid to swap operands of different modes. */
1205 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
1206
1207 std::swap (src1, src2);
1208 }
1209
1210 /* Both source operands cannot be in memory. */
1211 if (MEM_P (src1) && MEM_P (src2))
1212 {
1213 /* Optimization: Only read from memory once. */
1214 if (rtx_equal_p (src1, src2))
1215 {
1216 src2 = force_reg (mode, src2);
1217 src1 = src2;
1218 }
1219 else if (rtx_equal_p (dst, src1))
1220 src2 = force_reg (mode, src2);
1221 else
1222 src1 = force_reg (mode, src1);
1223 }
1224
1225 /* If the destination is memory, and we do not have matching source
1226 operands, do things in registers. */
1227 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
1228 dst = gen_reg_rtx (mode);
1229
1230 /* Source 1 cannot be a constant. */
1231 if (CONSTANT_P (src1))
1232 src1 = force_reg (mode, src1);
1233
1234 /* Source 1 cannot be a non-matching memory. */
1235 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
1236 src1 = force_reg (mode, src1);
1237
1238 /* Improve address combine. */
1239 if (code == PLUS
1240 && GET_MODE_CLASS (mode) == MODE_INT
1241 && MEM_P (src2))
1242 src2 = force_reg (mode, src2);
1243
1244 operands[1] = src1;
1245 operands[2] = src2;
1246 return dst;
1247 }
1248
1249 /* Similarly, but assume that the destination has already been
1250 set up properly. */
1251
1252 void
1253 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
1254 machine_mode mode, rtx operands[])
1255 {
1256 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
1257 gcc_assert (dst == operands[0]);
1258 }
1259
1260 /* Attempt to expand a binary operator. Make the expansion closer to the
1261 actual machine than just general_operand, which would allow 3 separate
1262 memory references (one output, two input) in a single insn. */
1263
1264 void
1265 ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
1266 rtx operands[])
1267 {
1268 rtx src1, src2, dst, op, clob;
1269
1270 dst = ix86_fixup_binary_operands (code, mode, operands);
1271 src1 = operands[1];
1272 src2 = operands[2];
1273
1274 /* Emit the instruction. */
1275
1276 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
1277
1278 if (reload_completed
1279 && code == PLUS
1280 && !rtx_equal_p (dst, src1))
1281 {
1282 /* This is going to be an LEA; avoid splitting it later. */
1283 emit_insn (op);
1284 }
1285 else
1286 {
1287 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1288 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1289 }
1290
1291 /* Fix up the destination if needed. */
1292 if (dst != operands[0])
1293 emit_move_insn (operands[0], dst);
1294 }
1295
1296 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
1297 the given OPERANDS. */
1298
1299 void
1300 ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
1301 rtx operands[])
1302 {
1303 rtx op1 = NULL_RTX, op2 = NULL_RTX;
1304 if (SUBREG_P (operands[1]))
1305 {
1306 op1 = operands[1];
1307 op2 = operands[2];
1308 }
1309 else if (SUBREG_P (operands[2]))
1310 {
1311 op1 = operands[2];
1312 op2 = operands[1];
1313 }
1314 /* Optimize (__m128i) d | (__m128i) e and similar code
1315 when d and e are float vectors into a float vector logical
1316 insn. In C/C++ without using intrinsics there is no other way
1317 to express vector logical operations on float vectors than
1318 to cast them temporarily to integer vectors. */
1319 if (op1
1320 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
1321 && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
1322 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
1323 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
1324 && SUBREG_BYTE (op1) == 0
1325 && (GET_CODE (op2) == CONST_VECTOR
1326 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
1327 && SUBREG_BYTE (op2) == 0))
1328 && can_create_pseudo_p ())
1329 {
1330 rtx dst;
1331 switch (GET_MODE (SUBREG_REG (op1)))
1332 {
1333 case E_V4SFmode:
1334 case E_V8SFmode:
1335 case E_V16SFmode:
1336 case E_V2DFmode:
1337 case E_V4DFmode:
1338 case E_V8DFmode:
1339 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
1340 if (GET_CODE (op2) == CONST_VECTOR)
1341 {
1342 op2 = gen_lowpart (GET_MODE (dst), op2);
1343 op2 = force_reg (GET_MODE (dst), op2);
1344 }
1345 else
1346 {
1347 op1 = operands[1];
1348 op2 = SUBREG_REG (operands[2]);
1349 if (!vector_operand (op2, GET_MODE (dst)))
1350 op2 = force_reg (GET_MODE (dst), op2);
1351 }
1352 op1 = SUBREG_REG (op1);
1353 if (!vector_operand (op1, GET_MODE (dst)))
1354 op1 = force_reg (GET_MODE (dst), op1);
1355 emit_insn (gen_rtx_SET (dst,
1356 gen_rtx_fmt_ee (code, GET_MODE (dst),
1357 op1, op2)));
1358 emit_move_insn (operands[0], gen_lowpart (mode, dst));
1359 return;
1360 default:
1361 break;
1362 }
1363 }
1364 if (!vector_operand (operands[1], mode))
1365 operands[1] = force_reg (mode, operands[1]);
1366 if (!vector_operand (operands[2], mode))
1367 operands[2] = force_reg (mode, operands[2]);
1368 ix86_fixup_binary_operands_no_copy (code, mode, operands);
1369 emit_insn (gen_rtx_SET (operands[0],
1370 gen_rtx_fmt_ee (code, mode, operands[1],
1371 operands[2])));
1372 }
1373
1374 /* Return TRUE or FALSE depending on whether the binary operator meets the
1375 appropriate constraints. */
1376
1377 bool
1378 ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
1379 rtx operands[3])
1380 {
1381 rtx dst = operands[0];
1382 rtx src1 = operands[1];
1383 rtx src2 = operands[2];
1384
1385 /* Both source operands cannot be in memory. */
1386 if ((MEM_P (src1) || bcst_mem_operand (src1, mode))
1387 && (MEM_P (src2) || bcst_mem_operand (src2, mode)))
1388 return false;
1389
1390 /* Canonicalize operand order for commutative operators. */
1391 if (ix86_swap_binary_operands_p (code, mode, operands))
1392 std::swap (src1, src2);
1393
1394 /* If the destination is memory, we must have a matching source operand. */
1395 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
1396 return false;
1397
1398 /* Source 1 cannot be a constant. */
1399 if (CONSTANT_P (src1))
1400 return false;
1401
1402 /* Source 1 cannot be a non-matching memory. */
1403 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
1404 /* Support "andhi/andsi/anddi" as a zero-extending move. */
1405 return (code == AND
1406 && (mode == HImode
1407 || mode == SImode
1408 || (TARGET_64BIT && mode == DImode))
1409 && satisfies_constraint_L (src2));
1410
1411 return true;
1412 }
1413
1414 /* Attempt to expand a unary operator. Make the expansion closer to the
1415 actual machine than just general_operand, which would allow 2 separate
1416 memory references (one output, one input) in a single insn. */
1417
1418 void
1419 ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
1420 rtx operands[])
1421 {
1422 bool matching_memory = false;
1423 rtx src, dst, op, clob;
1424
1425 dst = operands[0];
1426 src = operands[1];
1427
1428 /* If the destination is memory, and we do not have matching source
1429 operands, do things in registers. */
1430 if (MEM_P (dst))
1431 {
1432 if (rtx_equal_p (dst, src))
1433 matching_memory = true;
1434 else
1435 dst = gen_reg_rtx (mode);
1436 }
1437
1438 /* When source operand is memory, destination must match. */
1439 if (MEM_P (src) && !matching_memory)
1440 src = force_reg (mode, src);
1441
1442 /* Emit the instruction. */
1443
1444 op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
1445
1446 if (code == NOT)
1447 emit_insn (op);
1448 else
1449 {
1450 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1451 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1452 }
1453
1454 /* Fix up the destination if needed. */
1455 if (dst != operands[0])
1456 emit_move_insn (operands[0], dst);
1457 }
1458
1459 /* Predict the just-emitted jump instruction to be taken with probability PROB. */
1460
1461 static void
1462 predict_jump (int prob)
1463 {
1464 rtx_insn *insn = get_last_insn ();
1465 gcc_assert (JUMP_P (insn));
1466 add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
1467 }
1468
1469 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
1470 divisor are within the range [0-255]. */
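/* Illustrative sketch of the emitted sequence (not part of the original
   sources): the dividend and divisor are IORed into a scratch and tested
   against -0x100; if neither has bits set above bit 7, control jumps to a
   label that performs the narrow unsigned divide (udivmodhiqi3),
   extracting the quotient from AL and the remainder from AH; otherwise the
   full-width signed/unsigned divmod is used.  */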
1471
1472 void
1473 ix86_split_idivmod (machine_mode mode, rtx operands[],
1474 bool unsigned_p)
1475 {
1476 rtx_code_label *end_label, *qimode_label;
1477 rtx div, mod;
1478 rtx_insn *insn;
1479 rtx scratch, tmp0, tmp1, tmp2;
1480 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
1481
1482 operands[2] = force_reg (mode, operands[2]);
1483 operands[3] = force_reg (mode, operands[3]);
1484
1485 switch (mode)
1486 {
1487 case E_SImode:
1488 if (GET_MODE (operands[0]) == SImode)
1489 {
1490 if (GET_MODE (operands[1]) == SImode)
1491 gen_divmod4_1 = unsigned_p ? gen_udivmodsi4_1 : gen_divmodsi4_1;
1492 else
1493 gen_divmod4_1
1494 = unsigned_p ? gen_udivmodsi4_zext_2 : gen_divmodsi4_zext_2;
1495 }
1496 else
1497 gen_divmod4_1
1498 = unsigned_p ? gen_udivmodsi4_zext_1 : gen_divmodsi4_zext_1;
1499 break;
1500
1501 case E_DImode:
1502 gen_divmod4_1 = unsigned_p ? gen_udivmoddi4_1 : gen_divmoddi4_1;
1503 break;
1504
1505 default:
1506 gcc_unreachable ();
1507 }
1508
1509 end_label = gen_label_rtx ();
1510 qimode_label = gen_label_rtx ();
1511
1512 scratch = gen_reg_rtx (mode);
1513
1514 /* Use 8bit unsigned divmod if dividend and divisor are within
1515 the range [0-255]. */
1516 emit_move_insn (scratch, operands[2]);
1517 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
1518 scratch, 1, OPTAB_DIRECT);
1519 emit_insn (gen_test_ccno_1 (mode, scratch, GEN_INT (-0x100)));
1520 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
1521 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
1522 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
1523 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
1524 pc_rtx);
1525 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
1526 predict_jump (REG_BR_PROB_BASE * 50 / 100);
1527 JUMP_LABEL (insn) = qimode_label;
1528
1529 /* Generate original signed/unsigned divmod. */
1530 emit_insn (gen_divmod4_1 (operands[0], operands[1],
1531 operands[2], operands[3]));
1532
1533 /* Branch to the end. */
1534 emit_jump_insn (gen_jump (end_label));
1535 emit_barrier ();
1536
1537 /* Generate 8bit unsigned divide. */
1538 emit_label (qimode_label);
1539 /* Don't use operands[0] for result of 8bit divide since not all
1540 registers support QImode ZERO_EXTRACT. */
1541 tmp0 = lowpart_subreg (HImode, scratch, mode);
1542 tmp1 = lowpart_subreg (HImode, operands[2], mode);
1543 tmp2 = lowpart_subreg (QImode, operands[3], mode);
1544 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
1545
1546 if (unsigned_p)
1547 {
1548 div = gen_rtx_UDIV (mode, operands[2], operands[3]);
1549 mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
1550 }
1551 else
1552 {
1553 div = gen_rtx_DIV (mode, operands[2], operands[3]);
1554 mod = gen_rtx_MOD (mode, operands[2], operands[3]);
1555 }
1556 if (mode == SImode)
1557 {
1558 if (GET_MODE (operands[0]) != SImode)
1559 div = gen_rtx_ZERO_EXTEND (DImode, div);
1560 if (GET_MODE (operands[1]) != SImode)
1561 mod = gen_rtx_ZERO_EXTEND (DImode, mod);
1562 }
1563
1564 /* Extract remainder from AH. */
1565 scratch = gen_lowpart (GET_MODE (operands[1]), scratch);
1566 tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]), scratch,
1567 GEN_INT (8), GEN_INT (8));
1568 insn = emit_move_insn (operands[1], tmp1);
1569 set_unique_reg_note (insn, REG_EQUAL, mod);
1570
1571 /* Zero extend quotient from AL. */
1572 tmp1 = gen_lowpart (QImode, tmp0);
1573 insn = emit_insn (gen_extend_insn
1574 (operands[0], tmp1,
1575 GET_MODE (operands[0]), QImode, 1));
1576 set_unique_reg_note (insn, REG_EQUAL, div);
1577
1578 emit_label (end_label);
1579 }
1580
1581 /* Emit the x86 binary operator CODE in mode MODE, where the first operand
1582 matches the destination. The emitted RTX includes a clobber of FLAGS_REG. */
1583
1584 void
1585 ix86_emit_binop (enum rtx_code code, machine_mode mode,
1586 rtx dst, rtx src)
1587 {
1588 rtx op, clob;
1589
1590 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
1591 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1592
1593 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1594 }
1595
1596 /* Return true if the definition of regno1 is nearest to the insn. */
1597
1598 static bool
1599 find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
1600 {
1601 rtx_insn *prev = insn;
1602 rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
1603
1604 if (insn == start)
1605 return false;
1606 while (prev && prev != start)
1607 {
1608 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
1609 {
1610 prev = PREV_INSN (prev);
1611 continue;
1612 }
1613 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
1614 return true;
1615 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
1616 return false;
1617 prev = PREV_INSN (prev);
1618 }
1619
1620 /* None of the regs is defined in the bb. */
1621 return false;
1622 }
1623
1624 /* INSN_UID of the last insn emitted by zero store peephole2s. */
1625 int ix86_last_zero_store_uid;
1626
1627 /* Split lea instructions into a sequence of instructions
1628 which are executed on the ALU to avoid AGU stalls.
1629 It is assumed that it is allowed to clobber the flags register
1630 at the lea position. */
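/* Illustrative example (not part of the original sources): an address like
   4(%rbx,%rcx,2) with destination %rax may be split into roughly
   "mov %rcx, %rax", a shift or add that doubles %rax, "add %rbx, %rax",
   "add $4, %rax"; the doubling is emitted as a MULT so that peephole2 does
   not immediately recombine the sequence into an lea.  */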
1631
1632 void
1633 ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
1634 {
1635 unsigned int regno0, regno1, regno2;
1636 struct ix86_address parts;
1637 rtx target, tmp;
1638 int ok, adds;
1639
1640 ok = ix86_decompose_address (operands[1], &parts);
1641 gcc_assert (ok);
1642
1643 target = gen_lowpart (mode, operands[0]);
1644
1645 regno0 = true_regnum (target);
1646 regno1 = INVALID_REGNUM;
1647 regno2 = INVALID_REGNUM;
1648
1649 if (parts.base)
1650 {
1651 parts.base = gen_lowpart (mode, parts.base);
1652 regno1 = true_regnum (parts.base);
1653 }
1654
1655 if (parts.index)
1656 {
1657 parts.index = gen_lowpart (mode, parts.index);
1658 regno2 = true_regnum (parts.index);
1659 }
1660
1661 if (parts.disp)
1662 parts.disp = gen_lowpart (mode, parts.disp);
1663
1664 if (parts.scale > 1)
1665 {
1666 /* Case r1 = r1 + ... */
1667 if (regno1 == regno0)
1668 {
1669 /* If we have a case r1 = r1 + C * r2 then we
1670 would have to use multiplication, which is very
1671 expensive. Assume the cost model is wrong if we
1672 have such a case here. */
1673 gcc_assert (regno2 != regno0);
1674
1675 for (adds = parts.scale; adds > 0; adds--)
1676 ix86_emit_binop (PLUS, mode, target, parts.index);
1677 }
1678 else
1679 {
1680 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
1681 if (regno0 != regno2)
1682 emit_insn (gen_rtx_SET (target, parts.index));
1683
1684 /* Use shift for scaling, but emit it as MULT instead
1685 to avoid it being immediately peephole2 optimized back
1686 into lea. */
1687 ix86_emit_binop (MULT, mode, target, GEN_INT (parts.scale));
1688
1689 if (parts.base)
1690 ix86_emit_binop (PLUS, mode, target, parts.base);
1691
1692 if (parts.disp && parts.disp != const0_rtx)
1693 ix86_emit_binop (PLUS, mode, target, parts.disp);
1694 }
1695 }
1696 else if (!parts.base && !parts.index)
1697 {
1698 gcc_assert(parts.disp);
1699 emit_insn (gen_rtx_SET (target, parts.disp));
1700 }
1701 else
1702 {
1703 if (!parts.base)
1704 {
1705 if (regno0 != regno2)
1706 emit_insn (gen_rtx_SET (target, parts.index));
1707 }
1708 else if (!parts.index)
1709 {
1710 if (regno0 != regno1)
1711 emit_insn (gen_rtx_SET (target, parts.base));
1712 }
1713 else
1714 {
1715 if (regno0 == regno1)
1716 tmp = parts.index;
1717 else if (regno0 == regno2)
1718 tmp = parts.base;
1719 else
1720 {
1721 rtx tmp1;
1722
1723 /* Find better operand for SET instruction, depending
1724 on which definition is farther from the insn. */
1725 if (find_nearest_reg_def (insn, regno1, regno2))
1726 tmp = parts.index, tmp1 = parts.base;
1727 else
1728 tmp = parts.base, tmp1 = parts.index;
1729
1730 emit_insn (gen_rtx_SET (target, tmp));
1731
1732 if (parts.disp && parts.disp != const0_rtx)
1733 ix86_emit_binop (PLUS, mode, target, parts.disp);
1734
1735 ix86_emit_binop (PLUS, mode, target, tmp1);
1736 return;
1737 }
1738
1739 ix86_emit_binop (PLUS, mode, target, tmp);
1740 }
1741
1742 if (parts.disp && parts.disp != const0_rtx)
1743 ix86_emit_binop (PLUS, mode, target, parts.disp);
1744 }
1745 }
1746
1747 /* Post-reload splitter for converting an SF or DFmode value in an
1748 SSE register into an unsigned SImode. */
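/* Illustrative note (not part of the original sources): values >= 2^31 are
   handled by subtracting 2^31 before the signed cvttps2dq/cvttpd2dq
   conversion and then xoring 0x80000000 back into the result; LARGE holds
   the comparison mask and, shifted left by 31, supplies that bit only in
   the lanes that needed the adjustment.  */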
1749
1750 void
1751 ix86_split_convert_uns_si_sse (rtx operands[])
1752 {
1753 machine_mode vecmode;
1754 rtx value, large, zero_or_two31, input, two31, x;
1755
1756 large = operands[1];
1757 zero_or_two31 = operands[2];
1758 input = operands[3];
1759 two31 = operands[4];
1760 vecmode = GET_MODE (large);
1761 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
1762
1763 /* Load up the value into the low element. We must ensure that the other
1764 elements are valid floats -- zero is the easiest such value. */
1765 if (MEM_P (input))
1766 {
1767 if (vecmode == V4SFmode)
1768 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
1769 else
1770 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
1771 }
1772 else
1773 {
1774 input = gen_rtx_REG (vecmode, REGNO (input));
1775 emit_move_insn (value, CONST0_RTX (vecmode));
1776 if (vecmode == V4SFmode)
1777 emit_insn (gen_sse_movss_v4sf (value, value, input));
1778 else
1779 emit_insn (gen_sse2_movsd_v2df (value, value, input));
1780 }
1781
1782 emit_move_insn (large, two31);
1783 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
1784
1785 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
1786 emit_insn (gen_rtx_SET (large, x));
1787
1788 x = gen_rtx_AND (vecmode, zero_or_two31, large);
1789 emit_insn (gen_rtx_SET (zero_or_two31, x));
1790
1791 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
1792 emit_insn (gen_rtx_SET (value, x));
1793
1794 large = gen_rtx_REG (V4SImode, REGNO (large));
1795 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
1796
1797 x = gen_rtx_REG (V4SImode, REGNO (value));
1798 if (vecmode == V4SFmode)
1799 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
1800 else
1801 emit_insn (gen_sse2_cvttpd2dq (x, value));
1802 value = x;
1803
1804 emit_insn (gen_xorv4si3 (value, value, large));
1805 }
1806
1807 static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
1808 machine_mode mode, rtx target,
1809 rtx var, int one_var);
1810
1811 /* Convert an unsigned DImode value into a DFmode, using only SSE.
1812 Expects the 64-bit DImode to be supplied in a pair of integral
1813 registers. Requires SSE2; will use SSE3 if available. For x86_32,
1814 -mfpmath=sse, !optimize_size only. */
1815
1816 void
1817 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
1818 {
1819 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
1820 rtx int_xmm, fp_xmm;
1821 rtx biases, exponents;
1822 rtx x;
1823
1824 int_xmm = gen_reg_rtx (V4SImode);
1825 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
1826 emit_insn (gen_movdi_to_sse (int_xmm, input));
1827 else if (TARGET_SSE_SPLIT_REGS)
1828 {
1829 emit_clobber (int_xmm);
1830 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
1831 }
1832 else
1833 {
1834 x = gen_reg_rtx (V2DImode);
1835 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
1836 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
1837 }
1838
1839 x = gen_rtx_CONST_VECTOR (V4SImode,
1840 gen_rtvec (4, GEN_INT (0x43300000UL),
1841 GEN_INT (0x45300000UL),
1842 const0_rtx, const0_rtx));
1843 exponents = validize_mem (force_const_mem (V4SImode, x));
1844
1845 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
1846 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
1847
1848 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
1849 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
1850 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
1851 (0x1.0p84 + double(fp_value_hi_xmm)).
1852 Note these exponents differ by 32. */
1853
1854 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
1855
1856 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
1857 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
1858 real_ldexp (&bias_lo_rvt, &dconst1, 52);
1859 real_ldexp (&bias_hi_rvt, &dconst1, 84);
1860 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
1861 x = const_double_from_real_value (bias_hi_rvt, DFmode);
1862 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
1863 biases = validize_mem (force_const_mem (V2DFmode, biases));
1864 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
1865
1866 /* Add the upper and lower DFmode values together. */
1867 if (TARGET_SSE3)
1868 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
1869 else
1870 {
1871 x = copy_to_mode_reg (V2DFmode, fp_xmm);
1872 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
1873 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
1874 }
1875
1876 ix86_expand_vector_extract (false, target, fp_xmm, 0);
1877 }
1878
1879 /* Not used, but eases macroization of patterns. */
1880 void
1881 ix86_expand_convert_uns_sixf_sse (rtx, rtx)
1882 {
1883 gcc_unreachable ();
1884 }
1885
1886 static rtx ix86_expand_sse_fabs (rtx op0, rtx *smask);
1887
1888 /* Convert an unsigned SImode value into a DFmode. Only currently used
1889 for SSE, but applicable anywhere. */
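/* Worked example (illustrative, not part of the original sources): for
   input 0xffffffff the PLUS of -2147483648 wraps to 0x7fffffff, which
   converts exactly to 2147483647.0; adding 2^31 back as a double gives
   4294967295.0, the unsigned value of the input.  */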
1890
1891 void
1892 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
1893 {
1894 REAL_VALUE_TYPE TWO31r;
1895 rtx x, fp;
1896
1897 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
1898 NULL, 1, OPTAB_DIRECT);
1899
1900 fp = gen_reg_rtx (DFmode);
1901 emit_insn (gen_floatsidf2 (fp, x));
1902
1903 real_ldexp (&TWO31r, &dconst1, 31);
1904 x = const_double_from_real_value (TWO31r, DFmode);
1905
1906 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
1907
1908 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
1909 if (HONOR_SIGNED_ZEROS (DFmode) && flag_rounding_math)
1910 x = ix86_expand_sse_fabs (x, NULL);
1911
1912 if (x != target)
1913 emit_move_insn (target, x);
1914 }
1915
1916 /* Convert a signed DImode value into a DFmode. Only used for SSE in
1917 32-bit mode; otherwise we have a direct convert instruction. */
1918
1919 void
1920 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
1921 {
1922 REAL_VALUE_TYPE TWO32r;
1923 rtx fp_lo, fp_hi, x;
1924
1925 fp_lo = gen_reg_rtx (DFmode);
1926 fp_hi = gen_reg_rtx (DFmode);
1927
1928 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
1929
1930 real_ldexp (&TWO32r, &dconst1, 32);
1931 x = const_double_from_real_value (TWO32r, DFmode);
1932 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
1933
1934 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
1935
1936 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
1937 0, OPTAB_DIRECT);
1938 if (x != target)
1939 emit_move_insn (target, x);
1940 }
1941
1942 /* Convert an unsigned SImode value into a SFmode, using only SSE.
1943 For x86_32, -mfpmath=sse, !optimize_size only. */
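/* Worked example (illustrative, not part of the original sources): for
   input 0x12345678, int_hi = 0x1234 and int_lo = 0x5678; 0x1234 * 65536.0f
   is exact in SFmode, so the final addition performs the only rounding
   step and matches a correctly rounded direct conversion of the full
   32-bit value.  */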
1944 void
1945 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
1946 {
1947 REAL_VALUE_TYPE ONE16r;
1948 rtx fp_hi, fp_lo, int_hi, int_lo, x;
1949
1950 real_ldexp (&ONE16r, &dconst1, 16);
1951 x = const_double_from_real_value (ONE16r, SFmode);
1952 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
1953 NULL, 0, OPTAB_DIRECT);
1954 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
1955 NULL, 0, OPTAB_DIRECT);
1956 fp_hi = gen_reg_rtx (SFmode);
1957 fp_lo = gen_reg_rtx (SFmode);
1958 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
1959 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
1960 if (TARGET_FMA)
1961 {
1962 x = validize_mem (force_const_mem (SFmode, x));
1963 fp_hi = gen_rtx_FMA (SFmode, fp_hi, x, fp_lo);
1964 emit_move_insn (target, fp_hi);
1965 }
1966 else
1967 {
1968 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
1969 0, OPTAB_DIRECT);
1970 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
1971 0, OPTAB_DIRECT);
1972 if (!rtx_equal_p (target, fp_hi))
1973 emit_move_insn (target, fp_hi);
1974 }
1975 }
1976
1977 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
1978 a vector of unsigned ints VAL to vector of floats TARGET. */
1979
1980 void
1981 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
1982 {
1983 rtx tmp[8];
1984 REAL_VALUE_TYPE TWO16r;
1985 machine_mode intmode = GET_MODE (val);
1986 machine_mode fltmode = GET_MODE (target);
1987 rtx (*cvt) (rtx, rtx);
1988
1989 if (intmode == V4SImode)
1990 cvt = gen_floatv4siv4sf2;
1991 else
1992 cvt = gen_floatv8siv8sf2;
1993 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
1994 tmp[0] = force_reg (intmode, tmp[0]);
1995 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
1996 OPTAB_DIRECT);
1997 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
1998 NULL_RTX, 1, OPTAB_DIRECT);
1999 tmp[3] = gen_reg_rtx (fltmode);
2000 emit_insn (cvt (tmp[3], tmp[1]));
2001 tmp[4] = gen_reg_rtx (fltmode);
2002 emit_insn (cvt (tmp[4], tmp[2]));
2003 real_ldexp (&TWO16r, &dconst1, 16);
2004 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
2005 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
2006 if (TARGET_FMA)
2007 {
2008 tmp[6] = gen_rtx_FMA (fltmode, tmp[4], tmp[5], tmp[3]);
2009 emit_move_insn (target, tmp[6]);
2010 }
2011 else
2012 {
2013 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5],
2014 NULL_RTX, 1, OPTAB_DIRECT);
2015 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6],
2016 target, 1, OPTAB_DIRECT);
2017 if (tmp[7] != target)
2018 emit_move_insn (target, tmp[7]);
2019 }
2020 }
2021
2022 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
2023 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
2024 This is done by using just a signed conversion if < 0x1p31, and otherwise by
2025 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
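/* Illustrative example of the adjustment for one lane: for val = 3e9
   (>= 0x1.0p31) the lane is reduced to 3000000000 - 2147483648
   = 852516352 before the signed truncation, and *XORP carries
   0x80000000 for that lane, so the caller's final xor restores
   852516352 ^ 0x80000000 = 3000000000.  Lanes already below 0x1.0p31
   are left unchanged and get a zero xor mask.  */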
2026
2027 rtx
2028 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
2029 {
2030 REAL_VALUE_TYPE TWO31r;
2031 rtx two31r, tmp[4];
2032 machine_mode mode = GET_MODE (val);
2033 machine_mode scalarmode = GET_MODE_INNER (mode);
2034 machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
2035 rtx (*cmp) (rtx, rtx, rtx, rtx);
2036 int i;
2037
2038 for (i = 0; i < 3; i++)
2039 tmp[i] = gen_reg_rtx (mode);
2040 real_ldexp (&TWO31r, &dconst1, 31);
2041 two31r = const_double_from_real_value (TWO31r, scalarmode);
2042 two31r = ix86_build_const_vector (mode, 1, two31r);
2043 two31r = force_reg (mode, two31r);
2044 switch (mode)
2045 {
2046 case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
2047 case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
2048 case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
2049 case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
2050 default: gcc_unreachable ();
2051 }
2052 tmp[3] = gen_rtx_LE (mode, two31r, val);
2053 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
2054 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
2055 0, OPTAB_DIRECT);
2056 if (intmode == V4SImode || TARGET_AVX2)
2057 *xorp = expand_simple_binop (intmode, ASHIFT,
2058 gen_lowpart (intmode, tmp[0]),
2059 GEN_INT (31), NULL_RTX, 0,
2060 OPTAB_DIRECT);
2061 else
2062 {
2063 rtx two31 = gen_int_mode (HOST_WIDE_INT_1U << 31, SImode);
2064 two31 = ix86_build_const_vector (intmode, 1, two31);
2065 *xorp = expand_simple_binop (intmode, AND,
2066 gen_lowpart (intmode, tmp[0]),
2067 two31, NULL_RTX, 0,
2068 OPTAB_DIRECT);
2069 }
2070 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
2071 0, OPTAB_DIRECT);
2072 }
2073
2074 /* Generate code for floating point ABS or NEG. */
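/* For the SSE path this boils down to a bitwise operation with a sign-bit
   mask built below, roughly, for V4SFmode:
     ABS:  dest = src & { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }
     NEG:  dest = src ^ { 0x80000000, 0x80000000, 0x80000000, 0x80000000 }
   where the mask is attached to the insn via a USE and the actual AND or
   XOR is produced when the pattern is split.  */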
2075
2076 void
2077 ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
2078 rtx operands[])
2079 {
2080 rtx set, dst, src;
2081 bool use_sse = false;
2082 bool vector_mode = VECTOR_MODE_P (mode);
2083 machine_mode vmode = mode;
2084 rtvec par;
2085
2086 if (vector_mode || mode == TFmode || mode == HFmode)
2087 {
2088 use_sse = true;
2089 if (mode == HFmode)
2090 vmode = V8HFmode;
2091 }
2092 else if (TARGET_SSE_MATH)
2093 {
2094 use_sse = SSE_FLOAT_MODE_P (mode);
2095 if (mode == SFmode)
2096 vmode = V4SFmode;
2097 else if (mode == DFmode)
2098 vmode = V2DFmode;
2099 }
2100
2101 dst = operands[0];
2102 src = operands[1];
2103
2104 set = gen_rtx_fmt_e (code, mode, src);
2105 set = gen_rtx_SET (dst, set);
2106
2107 if (use_sse)
2108 {
2109 rtx mask, use, clob;
2110
2111 /* NEG and ABS performed with SSE use bitwise mask operations.
2112 Create the appropriate mask now. */
2113 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
2114 use = gen_rtx_USE (VOIDmode, mask);
2115 if (vector_mode || mode == TFmode)
2116 par = gen_rtvec (2, set, use);
2117 else
2118 {
2119 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
2120 par = gen_rtvec (3, set, use, clob);
2121 }
2122 }
2123 else
2124 {
2125 rtx clob;
2126
2127 /* Changing the sign of FP values can also be done using the integer unit. */
2128 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
2129 par = gen_rtvec (2, set, clob);
2130 }
2131
2132 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
2133 }
2134
2135 /* Deconstruct a floating point ABS or NEG operation
2136 with integer registers into integer operations. */
2137
2138 void
2139 ix86_split_fp_absneg_operator (enum rtx_code code, machine_mode mode,
2140 rtx operands[])
2141 {
2142 enum rtx_code absneg_op;
2143 rtx dst, set;
2144
2145 gcc_assert (operands_match_p (operands[0], operands[1]));
2146
2147 switch (mode)
2148 {
2149 case E_SFmode:
2150 dst = gen_lowpart (SImode, operands[0]);
2151
2152 if (code == ABS)
2153 {
2154 set = gen_int_mode (0x7fffffff, SImode);
2155 absneg_op = AND;
2156 }
2157 else
2158 {
2159 set = gen_int_mode (0x80000000, SImode);
2160 absneg_op = XOR;
2161 }
2162 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
2163 break;
2164
2165 case E_DFmode:
2166 if (TARGET_64BIT)
2167 {
2168 dst = gen_lowpart (DImode, operands[0]);
2169 dst = gen_rtx_ZERO_EXTRACT (DImode, dst, const1_rtx, GEN_INT (63));
2170
2171 if (code == ABS)
2172 set = const0_rtx;
2173 else
2174 set = gen_rtx_NOT (DImode, dst);
2175 }
2176 else
2177 {
2178 dst = gen_highpart (SImode, operands[0]);
2179
2180 if (code == ABS)
2181 {
2182 set = gen_int_mode (0x7fffffff, SImode);
2183 absneg_op = AND;
2184 }
2185 else
2186 {
2187 set = gen_int_mode (0x80000000, SImode);
2188 absneg_op = XOR;
2189 }
2190 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
2191 }
2192 break;
2193
2194 case E_XFmode:
2195 dst = gen_rtx_REG (SImode,
2196 REGNO (operands[0]) + (TARGET_64BIT ? 1 : 2));
2197 if (code == ABS)
2198 {
2199 set = GEN_INT (0x7fff);
2200 absneg_op = AND;
2201 }
2202 else
2203 {
2204 set = GEN_INT (0x8000);
2205 absneg_op = XOR;
2206 }
2207 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
2208 break;
2209
2210 default:
2211 gcc_unreachable ();
2212 }
2213
2214 set = gen_rtx_SET (dst, set);
2215
2216 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
2217 rtvec par = gen_rtvec (2, set, clob);
2218
2219 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
2220 }
2221
2222 /* Expand a copysign operation. Special case a constant magnitude operand. */
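/* The expansion below computes, in the corresponding vector mode, roughly
     dest = (operands[1] & ~SIGNMASK) | (operands[2] & SIGNMASK),
   i.e. the magnitude of operands[1] combined with the sign of operands[2].
   When operands[1] is a constant its absolute value is folded at expand
   time, and a 0.0 magnitude collapses the whole thing to a single AND
   with the sign mask.  */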
2223
2224 void
2225 ix86_expand_copysign (rtx operands[])
2226 {
2227 machine_mode mode, vmode;
2228 rtx dest, vdest, op0, op1, mask, op2, op3;
2229
2230 mode = GET_MODE (operands[0]);
2231
2232 if (mode == HFmode)
2233 vmode = V8HFmode;
2234 else if (mode == SFmode)
2235 vmode = V4SFmode;
2236 else if (mode == DFmode)
2237 vmode = V2DFmode;
2238 else if (mode == TFmode)
2239 vmode = mode;
2240 else
2241 gcc_unreachable ();
2242
2243 if (rtx_equal_p (operands[1], operands[2]))
2244 {
2245 emit_move_insn (operands[0], operands[1]);
2246 return;
2247 }
2248
2249 dest = operands[0];
2250 vdest = lowpart_subreg (vmode, dest, mode);
2251 if (vdest == NULL_RTX)
2252 vdest = gen_reg_rtx (vmode);
2253 else
2254 dest = NULL_RTX;
2255 op1 = lowpart_subreg (vmode, force_reg (mode, operands[2]), mode);
2256 mask = ix86_build_signbit_mask (vmode, 0, 0);
2257
2258 if (CONST_DOUBLE_P (operands[1]))
2259 {
2260 op0 = simplify_unary_operation (ABS, mode, operands[1], mode);
2261 /* Optimize for 0: simplify b = copysignf (0.0f, a) to b = mask & a. */
2262 if (op0 == CONST0_RTX (mode))
2263 {
2264 emit_move_insn (vdest, gen_rtx_AND (vmode, mask, op1));
2265 if (dest)
2266 emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
2267 return;
2268 }
2269
2270 if (GET_MODE_SIZE (mode) < 16)
2271 op0 = ix86_build_const_vector (vmode, false, op0);
2272 op0 = force_reg (vmode, op0);
2273 }
2274 else
2275 op0 = lowpart_subreg (vmode, force_reg (mode, operands[1]), mode);
2276
2277 op2 = gen_reg_rtx (vmode);
2278 op3 = gen_reg_rtx (vmode);
2279 emit_move_insn (op2, gen_rtx_AND (vmode,
2280 gen_rtx_NOT (vmode, mask),
2281 op0));
2282 emit_move_insn (op3, gen_rtx_AND (vmode, mask, op1));
2283 emit_move_insn (vdest, gen_rtx_IOR (vmode, op2, op3));
2284 if (dest)
2285 emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
2286 }
2287
2288 /* Expand an xorsign operation. */
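/* The expansion below computes roughly
     dest = operands[1] ^ (operands[2] & SIGNMASK),
   i.e. the first input with its sign bit flipped whenever the second
   input has its sign bit set (typically the lowering of
   x * copysign (1.0, y)).  */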
2289
2290 void
2291 ix86_expand_xorsign (rtx operands[])
2292 {
2293 machine_mode mode, vmode;
2294 rtx dest, vdest, op0, op1, mask, x, temp;
2295
2296 dest = operands[0];
2297 op0 = operands[1];
2298 op1 = operands[2];
2299
2300 mode = GET_MODE (dest);
2301
2302 if (mode == HFmode)
2303 vmode = V8HFmode;
2304 else if (mode == SFmode)
2305 vmode = V4SFmode;
2306 else if (mode == DFmode)
2307 vmode = V2DFmode;
2308 else
2309 gcc_unreachable ();
2310
2311 temp = gen_reg_rtx (vmode);
2312 mask = ix86_build_signbit_mask (vmode, 0, 0);
2313
2314 op1 = lowpart_subreg (vmode, force_reg (mode, op1), mode);
2315 x = gen_rtx_AND (vmode, op1, mask);
2316 emit_insn (gen_rtx_SET (temp, x));
2317
2318 op0 = lowpart_subreg (vmode, force_reg (mode, op0), mode);
2319 x = gen_rtx_XOR (vmode, temp, op0);
2320
2321 vdest = lowpart_subreg (vmode, dest, mode);
2322 if (vdest == NULL_RTX)
2323 vdest = gen_reg_rtx (vmode);
2324 else
2325 dest = NULL_RTX;
2326 emit_insn (gen_rtx_SET (vdest, x));
2327
2328 if (dest)
2329 emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
2330 }
2331
2332 static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1);
2333
2334 void
2335 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
2336 {
2337 machine_mode mode = GET_MODE (op0);
2338 rtx tmp;
2339
2340 /* Handle the special case of a vector comparison with a boolean result;
2341 transform it using the ptest instruction. */
2342 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
2343 || mode == OImode)
2344 {
2345 rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
2346 machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
2347
2348 gcc_assert (code == EQ || code == NE);
2349
2350 if (mode == OImode)
2351 {
2352 op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
2353 op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
2354 mode = p_mode;
2355 }
2356 /* Generate XOR since we can't check that one operand is a zero vector. */
2357 tmp = gen_reg_rtx (mode);
2358 emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
2359 tmp = gen_lowpart (p_mode, tmp);
2360 emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
2361 gen_rtx_UNSPEC (CCmode,
2362 gen_rtvec (2, tmp, tmp),
2363 UNSPEC_PTEST)));
2364 tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
2365 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2366 gen_rtx_LABEL_REF (VOIDmode, label),
2367 pc_rtx);
2368 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2369 return;
2370 }
2371
2372 switch (mode)
2373 {
2374 case E_HFmode:
2375 case E_SFmode:
2376 case E_DFmode:
2377 case E_XFmode:
2378 case E_QImode:
2379 case E_HImode:
2380 case E_SImode:
2381 simple:
2382 tmp = ix86_expand_compare (code, op0, op1);
2383 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2384 gen_rtx_LABEL_REF (VOIDmode, label),
2385 pc_rtx);
2386 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2387 return;
2388
2389 case E_DImode:
2390 if (TARGET_64BIT)
2391 goto simple;
2392 /* FALLTHRU */
2393 case E_TImode:
2394 /* DI and TI mode equality/inequality comparisons may be performed
2395 on SSE registers. Avoid splitting them, except when optimizing
2396 for size. */
2397 if ((code == EQ || code == NE)
2398 && !optimize_insn_for_size_p ())
2399 goto simple;
2400
2401 /* Expand DImode branch into multiple compare+branch. */
2402 {
2403 rtx lo[2], hi[2];
2404 rtx_code_label *label2;
2405 enum rtx_code code1, code2, code3;
2406 machine_mode submode;
2407
2408 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
2409 {
2410 std::swap (op0, op1);
2411 code = swap_condition (code);
2412 }
2413
2414 split_double_mode (mode, &op0, 1, lo+0, hi+0);
2415 split_double_mode (mode, &op1, 1, lo+1, hi+1);
2416
2417 submode = mode == DImode ? SImode : DImode;
2418
2419 /* If we are doing a less-than or greater-than-or-equal comparison, and
2420 op1 is a constant whose low word is zero, then we can just examine
2421 the high word. Similarly for a low word of -1 and a less-than-or-equal
2422 or greater-than comparison. */
2423
2424 if (CONST_INT_P (hi[1]))
2425 switch (code)
2426 {
2427 case LT: case LTU: case GE: case GEU:
2428 if (lo[1] == const0_rtx)
2429 {
2430 ix86_expand_branch (code, hi[0], hi[1], label);
2431 return;
2432 }
2433 break;
2434 case LE: case LEU: case GT: case GTU:
2435 if (lo[1] == constm1_rtx)
2436 {
2437 ix86_expand_branch (code, hi[0], hi[1], label);
2438 return;
2439 }
2440 break;
2441 default:
2442 break;
2443 }
2444
2445 /* Emulate comparisons that do not depend on Zero flag with
2446 double-word subtraction. Note that only Overflow, Sign
2447 and Carry flags are valid, so swap arguments and condition
2448 of comparisons that would otherwise test Zero flag. */
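/* E.g. for DImode  a < b  on a 32-bit target this becomes a compare of
   the low words followed by a subtract-with-borrow of the high words
   whose result is discarded; the final branch then inspects only the
   sign/overflow flags (or the carry flag in the unsigned case).  */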
2449
2450 switch (code)
2451 {
2452 case LE: case LEU: case GT: case GTU:
2453 std::swap (lo[0], lo[1]);
2454 std::swap (hi[0], hi[1]);
2455 code = swap_condition (code);
2456 /* FALLTHRU */
2457
2458 case LT: case LTU: case GE: case GEU:
2459 {
2460 bool uns = (code == LTU || code == GEU);
2461 rtx (*sbb_insn) (machine_mode, rtx, rtx, rtx)
2462 = uns ? gen_sub3_carry_ccc : gen_sub3_carry_ccgz;
2463
2464 if (!nonimmediate_operand (lo[0], submode))
2465 lo[0] = force_reg (submode, lo[0]);
2466 if (!x86_64_general_operand (lo[1], submode))
2467 lo[1] = force_reg (submode, lo[1]);
2468
2469 if (!register_operand (hi[0], submode))
2470 hi[0] = force_reg (submode, hi[0]);
2471 if ((uns && !nonimmediate_operand (hi[1], submode))
2472 || (!uns && !x86_64_general_operand (hi[1], submode)))
2473 hi[1] = force_reg (submode, hi[1]);
2474
2475 emit_insn (gen_cmp_1 (submode, lo[0], lo[1]));
2476
2477 tmp = gen_rtx_SCRATCH (submode);
2478 emit_insn (sbb_insn (submode, tmp, hi[0], hi[1]));
2479
2480 tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
2481 ix86_expand_branch (code, tmp, const0_rtx, label);
2482 return;
2483 }
2484
2485 default:
2486 break;
2487 }
2488
2489 /* Otherwise, we need two or three jumps. */
2490
2491 label2 = gen_label_rtx ();
2492
2493 code1 = code;
2494 code2 = swap_condition (code);
2495 code3 = unsigned_condition (code);
2496
2497 switch (code)
2498 {
2499 case LT: case GT: case LTU: case GTU:
2500 break;
2501
2502 case LE: code1 = LT; code2 = GT; break;
2503 case GE: code1 = GT; code2 = LT; break;
2504 case LEU: code1 = LTU; code2 = GTU; break;
2505 case GEU: code1 = GTU; code2 = LTU; break;
2506
2507 case EQ: code1 = UNKNOWN; code2 = NE; break;
2508 case NE: code2 = UNKNOWN; break;
2509
2510 default:
2511 gcc_unreachable ();
2512 }
2513
2514 /*
2515 * a < b =>
2516 * if (hi(a) < hi(b)) goto true;
2517 * if (hi(a) > hi(b)) goto false;
2518 * if (lo(a) < lo(b)) goto true;
2519 * false:
2520 */
2521
2522 if (code1 != UNKNOWN)
2523 ix86_expand_branch (code1, hi[0], hi[1], label);
2524 if (code2 != UNKNOWN)
2525 ix86_expand_branch (code2, hi[0], hi[1], label2);
2526
2527 ix86_expand_branch (code3, lo[0], lo[1], label);
2528
2529 if (code2 != UNKNOWN)
2530 emit_label (label2);
2531 return;
2532 }
2533
2534 default:
2535 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
2536 goto simple;
2537 }
2538 }
2539
2540 /* Figure out whether to use unordered fp comparisons. */
2541
2542 static bool
2543 ix86_unordered_fp_compare (enum rtx_code code)
2544 {
2545 if (!TARGET_IEEE_FP)
2546 return false;
2547
2548 switch (code)
2549 {
2550 case LT:
2551 case LE:
2552 case GT:
2553 case GE:
2554 case LTGT:
2555 return false;
2556
2557 case EQ:
2558 case NE:
2559
2560 case UNORDERED:
2561 case ORDERED:
2562 case UNLT:
2563 case UNLE:
2564 case UNGT:
2565 case UNGE:
2566 case UNEQ:
2567 return true;
2568
2569 default:
2570 gcc_unreachable ();
2571 }
2572 }
2573
2574 /* Return a comparison we can do that is equivalent to
2575 swap_condition (code), apart possibly from orderedness.
2576 Never change orderedness if TARGET_IEEE_FP, returning
2577 UNKNOWN in that case if necessary. */
2578
2579 static enum rtx_code
2580 ix86_fp_swap_condition (enum rtx_code code)
2581 {
2582 switch (code)
2583 {
2584 case GT: /* GTU - CF=0 & ZF=0 */
2585 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
2586 case GE: /* GEU - CF=0 */
2587 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
2588 case UNLT: /* LTU - CF=1 */
2589 return TARGET_IEEE_FP ? UNKNOWN : GT;
2590 case UNLE: /* LEU - CF=1 | ZF=1 */
2591 return TARGET_IEEE_FP ? UNKNOWN : GE;
2592 default:
2593 return swap_condition (code);
2594 }
2595 }
2596
2597 /* Return the cost of comparison CODE using the best strategy for performance.
2598 All following functions use the number of instructions as the cost metric.
2599 In the future this should be tweaked to compute bytes for optimize_size and
2600 take into account the performance of various instructions on various CPUs. */
2601
2602 static int
2603 ix86_fp_comparison_cost (enum rtx_code code)
2604 {
2605 int arith_cost;
2606
2607 /* The cost of code using bit-twiddling on %ah. */
2608 switch (code)
2609 {
2610 case UNLE:
2611 case UNLT:
2612 case LTGT:
2613 case GT:
2614 case GE:
2615 case UNORDERED:
2616 case ORDERED:
2617 case UNEQ:
2618 arith_cost = 4;
2619 break;
2620 case LT:
2621 case NE:
2622 case EQ:
2623 case UNGE:
2624 arith_cost = TARGET_IEEE_FP ? 5 : 4;
2625 break;
2626 case LE:
2627 case UNGT:
2628 arith_cost = TARGET_IEEE_FP ? 6 : 4;
2629 break;
2630 default:
2631 gcc_unreachable ();
2632 }
2633
2634 switch (ix86_fp_comparison_strategy (code))
2635 {
2636 case IX86_FPCMP_COMI:
2637 return arith_cost > 4 ? 3 : 2;
2638 case IX86_FPCMP_SAHF:
2639 return arith_cost > 4 ? 4 : 3;
2640 default:
2641 return arith_cost;
2642 }
2643 }
2644
2645 /* Swap, force into registers, or otherwise massage the two operands
2646 to a fp comparison. The operands are updated in place; the new
2647 comparison code is returned. */
2648
2649 static enum rtx_code
2650 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
2651 {
2652 bool unordered_compare = ix86_unordered_fp_compare (code);
2653 rtx op0 = *pop0, op1 = *pop1;
2654 machine_mode op_mode = GET_MODE (op0);
2655 bool is_sse = SSE_FLOAT_MODE_SSEMATH_OR_HF_P (op_mode);
2656
2657 if (op_mode == BFmode)
2658 {
2659 rtx op = gen_lowpart (HImode, op0);
2660 if (CONST_INT_P (op))
2661 op = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
2662 op0, BFmode);
2663 else
2664 {
2665 rtx t1 = gen_reg_rtx (SImode);
2666 emit_insn (gen_zero_extendhisi2 (t1, op));
2667 emit_insn (gen_ashlsi3 (t1, t1, GEN_INT (16)));
2668 op = gen_lowpart (SFmode, t1);
2669 }
2670 *pop0 = op;
2671 op = gen_lowpart (HImode, op1);
2672 if (CONST_INT_P (op))
2673 op = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
2674 op1, BFmode);
2675 else
2676 {
2677 rtx t1 = gen_reg_rtx (SImode);
2678 emit_insn (gen_zero_extendhisi2 (t1, op));
2679 emit_insn (gen_ashlsi3 (t1, t1, GEN_INT (16)));
2680 op = gen_lowpart (SFmode, t1);
2681 }
2682 *pop1 = op;
2683 return ix86_prepare_fp_compare_args (code, pop0, pop1);
2684 }
2685
2686 /* All of the unordered compare instructions only work on registers.
2687 The same is true of the fcomi compare instructions. The XFmode
2688 compare instructions require registers except when comparing
2689 against zero or when converting operand 1 from fixed point to
2690 floating point. */
2691
2692 if (!is_sse
2693 && (unordered_compare
2694 || (op_mode == XFmode
2695 && ! (standard_80387_constant_p (op0) == 1
2696 || standard_80387_constant_p (op1) == 1)
2697 && GET_CODE (op1) != FLOAT)
2698 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
2699 {
2700 op0 = force_reg (op_mode, op0);
2701 op1 = force_reg (op_mode, op1);
2702 }
2703 else
2704 {
2705 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
2706 things around if they appear profitable, otherwise force op0
2707 into a register. */
2708
2709 if (standard_80387_constant_p (op0) == 0
2710 || (MEM_P (op0)
2711 && ! (standard_80387_constant_p (op1) == 0
2712 || MEM_P (op1))))
2713 {
2714 enum rtx_code new_code = ix86_fp_swap_condition (code);
2715 if (new_code != UNKNOWN)
2716 {
2717 std::swap (op0, op1);
2718 code = new_code;
2719 }
2720 }
2721
2722 if (!REG_P (op0))
2723 op0 = force_reg (op_mode, op0);
2724
2725 if (CONSTANT_P (op1))
2726 {
2727 int tmp = standard_80387_constant_p (op1);
2728 if (tmp == 0)
2729 op1 = validize_mem (force_const_mem (op_mode, op1));
2730 else if (tmp == 1)
2731 {
2732 if (TARGET_CMOVE)
2733 op1 = force_reg (op_mode, op1);
2734 }
2735 else
2736 op1 = force_reg (op_mode, op1);
2737 }
2738 }
2739
2740 /* Try to rearrange the comparison to make it cheaper. */
2741 if (ix86_fp_comparison_cost (code)
2742 > ix86_fp_comparison_cost (swap_condition (code))
2743 && (REG_P (op1) || can_create_pseudo_p ()))
2744 {
2745 std::swap (op0, op1);
2746 code = swap_condition (code);
2747 if (!REG_P (op0))
2748 op0 = force_reg (op_mode, op0);
2749 }
2750
2751 *pop0 = op0;
2752 *pop1 = op1;
2753 return code;
2754 }
2755
2756 /* Generate insn patterns to do a floating point compare of OPERANDS. */
2757
2758 static rtx
2759 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)
2760 {
2761 bool unordered_compare = ix86_unordered_fp_compare (code);
2762 machine_mode cmp_mode;
2763 rtx tmp, scratch;
2764
2765 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
2766
2767 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
2768 if (unordered_compare)
2769 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
2770
2771 /* Do fcomi/sahf based test when profitable. */
2772 switch (ix86_fp_comparison_strategy (code))
2773 {
2774 case IX86_FPCMP_COMI:
2775 cmp_mode = CCFPmode;
2776 emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
2777 break;
2778
2779 case IX86_FPCMP_SAHF:
2780 cmp_mode = CCFPmode;
2781 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2782 scratch = gen_reg_rtx (HImode);
2783 emit_insn (gen_rtx_SET (scratch, tmp));
2784 emit_insn (gen_x86_sahf_1 (scratch));
2785 break;
2786
2787 case IX86_FPCMP_ARITH:
2788 cmp_mode = CCNOmode;
2789 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2790 scratch = gen_reg_rtx (HImode);
2791 emit_insn (gen_rtx_SET (scratch, tmp));
2792
2793 /* In the unordered case, we have to check C2 for NaNs, which
2794 doesn't happen to work out to anything nice combination-wise.
2795 So do some bit twiddling on the value we've got in AH to come
2796 up with an appropriate set of condition codes. */
2797
2798 switch (code)
2799 {
2800 case GT:
2801 case UNGT:
2802 if (code == GT || !TARGET_IEEE_FP)
2803 {
2804 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2805 code = EQ;
2806 }
2807 else
2808 {
2809 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2810 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2811 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
2812 cmp_mode = CCmode;
2813 code = GEU;
2814 }
2815 break;
2816 case LT:
2817 case UNLT:
2818 if (code == LT && TARGET_IEEE_FP)
2819 {
2820 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2821 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
2822 cmp_mode = CCmode;
2823 code = EQ;
2824 }
2825 else
2826 {
2827 emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
2828 code = NE;
2829 }
2830 break;
2831 case GE:
2832 case UNGE:
2833 if (code == GE || !TARGET_IEEE_FP)
2834 {
2835 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
2836 code = EQ;
2837 }
2838 else
2839 {
2840 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2841 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
2842 code = NE;
2843 }
2844 break;
2845 case LE:
2846 case UNLE:
2847 if (code == LE && TARGET_IEEE_FP)
2848 {
2849 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2850 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2851 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2852 cmp_mode = CCmode;
2853 code = LTU;
2854 }
2855 else
2856 {
2857 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2858 code = NE;
2859 }
2860 break;
2861 case EQ:
2862 case UNEQ:
2863 if (code == EQ && TARGET_IEEE_FP)
2864 {
2865 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2866 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2867 cmp_mode = CCmode;
2868 code = EQ;
2869 }
2870 else
2871 {
2872 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2873 code = NE;
2874 }
2875 break;
2876 case NE:
2877 case LTGT:
2878 if (code == NE && TARGET_IEEE_FP)
2879 {
2880 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2881 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
2882 GEN_INT (0x40)));
2883 code = NE;
2884 }
2885 else
2886 {
2887 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2888 code = EQ;
2889 }
2890 break;
2891
2892 case UNORDERED:
2893 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2894 code = NE;
2895 break;
2896 case ORDERED:
2897 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2898 code = EQ;
2899 break;
2900
2901 default:
2902 gcc_unreachable ();
2903 }
2904 break;
2905
2906 default:
2907 gcc_unreachable ();
2908 }
2909
2910 /* Return the test that should be put into the flags user, i.e.
2911 the bcc, scc, or cmov instruction. */
2912 return gen_rtx_fmt_ee (code, VOIDmode,
2913 gen_rtx_REG (cmp_mode, FLAGS_REG),
2914 const0_rtx);
2915 }
2916
2917 /* Generate insn patterns to do an integer compare of OPERANDS. */
2918
2919 static rtx
2920 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
2921 {
2922 machine_mode cmpmode;
2923 rtx tmp, flags;
2924
2925 /* Swap operands to emit carry flag comparison. */
2926 if ((code == GTU || code == LEU)
2927 && nonimmediate_operand (op1, VOIDmode))
2928 {
2929 std::swap (op0, op1);
2930 code = swap_condition (code);
2931 }
2932
2933 cmpmode = SELECT_CC_MODE (code, op0, op1);
2934 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
2935
2936 /* This is very simple, but making the interface the same as in the
2937 FP case makes the rest of the code easier. */
2938 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
2939 emit_insn (gen_rtx_SET (flags, tmp));
2940
2941 /* Return the test that should be put into the flags user, i.e.
2942 the bcc, scc, or cmov instruction. */
2943 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
2944 }
2945
2946 static rtx
2947 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
2948 {
2949 rtx ret;
2950
2951 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
2952 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
2953
2954 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
2955 {
2956 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
2957 ret = ix86_expand_fp_compare (code, op0, op1);
2958 }
2959 else
2960 ret = ix86_expand_int_compare (code, op0, op1);
2961
2962 return ret;
2963 }
2964
2965 void
2966 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
2967 {
2968 rtx ret;
2969
2970 gcc_assert (GET_MODE (dest) == QImode);
2971
2972 ret = ix86_expand_compare (code, op0, op1);
2973 PUT_MODE (ret, QImode);
2974 emit_insn (gen_rtx_SET (dest, ret));
2975 }
2976
2977 /* Expand floating point op0 <=> op1, i.e.
2978 dest = op0 == op1 ? 0 : op0 < op1 ? -1 : op0 > op1 ? 1 : 2. */
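/* The expansion below performs a single FP compare and then branches:
   unordered operands (when TARGET_IEEE_FP) store 2, equality stores 0,
   greater-than stores 1, and the fall-through path stores -1.  */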
2979
2980 void
2981 ix86_expand_fp_spaceship (rtx dest, rtx op0, rtx op1)
2982 {
2983 gcc_checking_assert (ix86_fp_comparison_strategy (GT) != IX86_FPCMP_ARITH);
2984 rtx gt = ix86_expand_fp_compare (GT, op0, op1);
2985 rtx l0 = gen_label_rtx ();
2986 rtx l1 = gen_label_rtx ();
2987 rtx l2 = TARGET_IEEE_FP ? gen_label_rtx () : NULL_RTX;
2988 rtx lend = gen_label_rtx ();
2989 rtx tmp;
2990 rtx_insn *jmp;
2991 if (l2)
2992 {
2993 rtx un = gen_rtx_fmt_ee (UNORDERED, VOIDmode,
2994 gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx);
2995 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, un,
2996 gen_rtx_LABEL_REF (VOIDmode, l2), pc_rtx);
2997 jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2998 add_reg_br_prob_note (jmp, profile_probability::very_unlikely ());
2999 }
3000 rtx eq = gen_rtx_fmt_ee (UNEQ, VOIDmode,
3001 gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx);
3002 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, eq,
3003 gen_rtx_LABEL_REF (VOIDmode, l0), pc_rtx);
3004 jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
3005 add_reg_br_prob_note (jmp, profile_probability::unlikely ());
3006 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, gt,
3007 gen_rtx_LABEL_REF (VOIDmode, l1), pc_rtx);
3008 jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
3009 add_reg_br_prob_note (jmp, profile_probability::even ());
3010 emit_move_insn (dest, constm1_rtx);
3011 emit_jump (lend);
3012 emit_label (l0);
3013 emit_move_insn (dest, const0_rtx);
3014 emit_jump (lend);
3015 emit_label (l1);
3016 emit_move_insn (dest, const1_rtx);
3017 emit_jump (lend);
3018 if (l2)
3019 {
3020 emit_label (l2);
3021 emit_move_insn (dest, const2_rtx);
3022 }
3023 emit_label (lend);
3024 }
3025
3026 /* Expand comparison setting or clearing carry flag. Return true when
3027 successful and set pop for the operation. */
3028 static bool
3029 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
3030 {
3031 machine_mode mode
3032 = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
3033
3034 /* Do not handle double-mode compares that go through special path. */
3035 if (mode == (TARGET_64BIT ? TImode : DImode))
3036 return false;
3037
3038 if (SCALAR_FLOAT_MODE_P (mode))
3039 {
3040 rtx compare_op;
3041 rtx_insn *compare_seq;
3042
3043 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
3044
3045 /* Shortcut: the following common codes never translate
3046 into carry flag compares. */
3047 if (code == EQ || code == NE || code == UNEQ || code == LTGT
3048 || code == ORDERED || code == UNORDERED)
3049 return false;
3050
3051 /* These comparisons require the zero flag; swap operands so they won't. */
3052 if ((code == GT || code == UNLE || code == LE || code == UNGT)
3053 && !TARGET_IEEE_FP)
3054 {
3055 std::swap (op0, op1);
3056 code = swap_condition (code);
3057 }
3058
3059 /* Try to expand the comparison and verify that we end up with
3060 a carry-flag-based comparison. This fails only when we decide
3061 to expand the comparison using arithmetic, which is not a
3062 common scenario. */
3063 start_sequence ();
3064 compare_op = ix86_expand_fp_compare (code, op0, op1);
3065 compare_seq = get_insns ();
3066 end_sequence ();
3067
3068 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
3069 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
3070 else
3071 code = GET_CODE (compare_op);
3072
3073 if (code != LTU && code != GEU)
3074 return false;
3075
3076 emit_insn (compare_seq);
3077 *pop = compare_op;
3078 return true;
3079 }
3080
3081 if (!INTEGRAL_MODE_P (mode))
3082 return false;
3083
3084 switch (code)
3085 {
3086 case LTU:
3087 case GEU:
3088 break;
3089
3090 /* Convert a==0 into (unsigned)a<1. */
3091 case EQ:
3092 case NE:
3093 if (op1 != const0_rtx)
3094 return false;
3095 op1 = const1_rtx;
3096 code = (code == EQ ? LTU : GEU);
3097 break;
3098
3099 /* Convert a>b into b<a or a>=b+1. */
3100 case GTU:
3101 case LEU:
3102 if (CONST_INT_P (op1))
3103 {
3104 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
3105 /* Bail out on overflow. We could still swap the operands, but that
3106 would force loading the constant into a register. */
3107 if (op1 == const0_rtx
3108 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
3109 return false;
3110 code = (code == GTU ? GEU : LTU);
3111 }
3112 else
3113 {
3114 std::swap (op0, op1);
3115 code = (code == GTU ? LTU : GEU);
3116 }
3117 break;
3118
3119 /* Convert a>=0 into (unsigned)a<0x80000000. */
3120 case LT:
3121 case GE:
3122 if (mode == DImode || op1 != const0_rtx)
3123 return false;
3124 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
3125 code = (code == LT ? GEU : LTU);
3126 break;
3127 case LE:
3128 case GT:
3129 if (mode == DImode || op1 != constm1_rtx)
3130 return false;
3131 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
3132 code = (code == LE ? GEU : LTU);
3133 break;
3134
3135 default:
3136 return false;
3137 }
3138 /* Swapping operands may cause a constant to appear as the first operand. */
3139 if (!nonimmediate_operand (op0, VOIDmode))
3140 {
3141 if (!can_create_pseudo_p ())
3142 return false;
3143 op0 = force_reg (mode, op0);
3144 }
3145 *pop = ix86_expand_compare (code, op0, op1);
3146 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
3147 return true;
3148 }
3149
3150 /* Expand conditional increment or decrement using adc/sbb instructions.
3151 The default case using setcc followed by the conditional move can be
3152 done by generic code. */
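/* For example, a conditional increment such as
     r = (a < b) ? r + 1 : r     (unsigned comparison)
   can be emitted as a compare that leaves the condition in the carry
   flag followed by an add-with-carry of zero, with no setcc or cmov.  */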
3153 bool
3154 ix86_expand_int_addcc (rtx operands[])
3155 {
3156 enum rtx_code code = GET_CODE (operands[1]);
3157 rtx flags;
3158 rtx (*insn) (machine_mode, rtx, rtx, rtx, rtx, rtx);
3159 rtx compare_op;
3160 rtx val = const0_rtx;
3161 bool fpcmp = false;
3162 machine_mode mode;
3163 rtx op0 = XEXP (operands[1], 0);
3164 rtx op1 = XEXP (operands[1], 1);
3165
3166 if (operands[3] != const1_rtx
3167 && operands[3] != constm1_rtx)
3168 return false;
3169 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
3170 return false;
3171 code = GET_CODE (compare_op);
3172
3173 flags = XEXP (compare_op, 0);
3174
3175 if (GET_MODE (flags) == CCFPmode)
3176 {
3177 fpcmp = true;
3178 code = ix86_fp_compare_code_to_integer (code);
3179 }
3180
3181 if (code != LTU)
3182 {
3183 val = constm1_rtx;
3184 if (fpcmp)
3185 PUT_CODE (compare_op,
3186 reverse_condition_maybe_unordered
3187 (GET_CODE (compare_op)));
3188 else
3189 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
3190 }
3191
3192 mode = GET_MODE (operands[0]);
3193
3194 /* Construct either adc or sbb insn. */
3195 if ((code == LTU) == (operands[3] == constm1_rtx))
3196 insn = gen_sub3_carry;
3197 else
3198 insn = gen_add3_carry;
3199
3200 emit_insn (insn (mode, operands[0], operands[2], val, flags, compare_op));
3201
3202 return true;
3203 }
3204
3205 bool
3206 ix86_expand_int_movcc (rtx operands[])
3207 {
3208 enum rtx_code code = GET_CODE (operands[1]), compare_code;
3209 rtx_insn *compare_seq;
3210 rtx compare_op;
3211 machine_mode mode = GET_MODE (operands[0]);
3212 bool sign_bit_compare_p = false;
3213 bool negate_cc_compare_p = false;
3214 rtx op0 = XEXP (operands[1], 0);
3215 rtx op1 = XEXP (operands[1], 1);
3216 rtx op2 = operands[2];
3217 rtx op3 = operands[3];
3218
3219 if (GET_MODE (op0) == TImode
3220 || (GET_MODE (op0) == DImode
3221 && !TARGET_64BIT))
3222 return false;
3223
3224 if (GET_MODE (op0) == BFmode
3225 && !ix86_fp_comparison_operator (operands[1], VOIDmode))
3226 return false;
3227
3228 start_sequence ();
3229 compare_op = ix86_expand_compare (code, op0, op1);
3230 compare_seq = get_insns ();
3231 end_sequence ();
3232
3233 compare_code = GET_CODE (compare_op);
3234
3235 if ((op1 == const0_rtx && (code == GE || code == LT))
3236 || (op1 == constm1_rtx && (code == GT || code == LE)))
3237 sign_bit_compare_p = true;
3238
3239 /* op0 == op1 ? op0 : op3 is equivalent to op0 == op1 ? op1 : op3,
3240 but if op1 is a constant, the latter form allows more optimizations,
3241 either through the handling of the case where the last two operands
3242 are constants, or the case of one constant and one variable. On the
3243 other hand, for cmov the former might be better as we don't need to
3244 load the constant into another register. */
3245 if (code == EQ && CONST_INT_P (op1) && rtx_equal_p (op0, op2))
3246 op2 = op1;
3247 /* Similarly for op0 != op1 ? op2 : op0 and op0 != op1 ? op2 : op1. */
3248 else if (code == NE && CONST_INT_P (op1) && rtx_equal_p (op0, op3))
3249 op3 = op1;
3250
3251 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
3252 HImode insns, we'd be swallowed in word prefix ops. */
3253
3254 if ((mode != HImode || TARGET_FAST_PREFIX)
3255 && (mode != (TARGET_64BIT ? TImode : DImode))
3256 && CONST_INT_P (op2)
3257 && CONST_INT_P (op3))
3258 {
3259 rtx out = operands[0];
3260 HOST_WIDE_INT ct = INTVAL (op2);
3261 HOST_WIDE_INT cf = INTVAL (op3);
3262 HOST_WIDE_INT diff;
3263
3264 if ((mode == SImode
3265 || (TARGET_64BIT && mode == DImode))
3266 && (GET_MODE (op0) == SImode
3267 || (TARGET_64BIT && GET_MODE (op0) == DImode)))
3268 {
3269 /* Special case x != 0 ? -1 : y. */
3270 if (code == NE && op1 == const0_rtx && ct == -1)
3271 {
3272 negate_cc_compare_p = true;
3273 std::swap (ct, cf);
3274 code = EQ;
3275 }
3276 else if (code == EQ && op1 == const0_rtx && cf == -1)
3277 negate_cc_compare_p = true;
3278 }
3279
3280 diff = ct - cf;
3281 /* Sign bit compares are better done using shifts than by using
3282 sbb. */
3283 if (sign_bit_compare_p
3284 || negate_cc_compare_p
3285 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
3286 {
3287 /* Detect overlap between destination and compare sources. */
3288 rtx tmp = out;
3289
3290 if (negate_cc_compare_p)
3291 {
3292 if (GET_MODE (op0) == DImode)
3293 emit_insn (gen_x86_negdi_ccc (gen_reg_rtx (DImode), op0));
3294 else
3295 emit_insn (gen_x86_negsi_ccc (gen_reg_rtx (SImode),
3296 gen_lowpart (SImode, op0)));
3297
3298 tmp = gen_reg_rtx (mode);
3299 if (mode == DImode)
3300 emit_insn (gen_x86_movdicc_0_m1_neg (tmp));
3301 else
3302 emit_insn (gen_x86_movsicc_0_m1_neg (gen_lowpart (SImode,
3303 tmp)));
3304 }
3305 else if (!sign_bit_compare_p)
3306 {
3307 rtx flags;
3308 bool fpcmp = false;
3309
3310 compare_code = GET_CODE (compare_op);
3311
3312 flags = XEXP (compare_op, 0);
3313
3314 if (GET_MODE (flags) == CCFPmode)
3315 {
3316 fpcmp = true;
3317 compare_code
3318 = ix86_fp_compare_code_to_integer (compare_code);
3319 }
3320
3321 /* To simplify the rest of the code, restrict to the GEU case. */
3322 if (compare_code == LTU)
3323 {
3324 std::swap (ct, cf);
3325 compare_code = reverse_condition (compare_code);
3326 code = reverse_condition (code);
3327 }
3328 else
3329 {
3330 if (fpcmp)
3331 PUT_CODE (compare_op,
3332 reverse_condition_maybe_unordered
3333 (GET_CODE (compare_op)));
3334 else
3335 PUT_CODE (compare_op,
3336 reverse_condition (GET_CODE (compare_op)));
3337 }
3338 diff = ct - cf;
3339
3340 if (reg_overlap_mentioned_p (out, compare_op))
3341 tmp = gen_reg_rtx (mode);
3342
3343 if (mode == DImode)
3344 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
3345 else
3346 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
3347 flags, compare_op));
3348 }
3349 else
3350 {
3351 if (code == GT || code == GE)
3352 code = reverse_condition (code);
3353 else
3354 {
3355 std::swap (ct, cf);
3356 diff = ct - cf;
3357 }
3358 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
3359 }
3360
3361 if (diff == 1)
3362 {
3363 /*
3364 * cmpl op0,op1
3365 * sbbl dest,dest
3366 * [addl dest, ct]
3367 *
3368 * Size 5 - 8.
3369 */
3370 if (ct)
3371 tmp = expand_simple_binop (mode, PLUS,
3372 tmp, GEN_INT (ct),
3373 copy_rtx (tmp), 1, OPTAB_DIRECT);
3374 }
3375 else if (cf == -1)
3376 {
3377 /*
3378 * cmpl op0,op1
3379 * sbbl dest,dest
3380 * orl $ct, dest
3381 *
3382 * Size 8.
3383 */
3384 tmp = expand_simple_binop (mode, IOR,
3385 tmp, GEN_INT (ct),
3386 copy_rtx (tmp), 1, OPTAB_DIRECT);
3387 }
3388 else if (diff == -1 && ct)
3389 {
3390 /*
3391 * cmpl op0,op1
3392 * sbbl dest,dest
3393 * notl dest
3394 * [addl dest, cf]
3395 *
3396 * Size 8 - 11.
3397 */
3398 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3399 if (cf)
3400 tmp = expand_simple_binop (mode, PLUS,
3401 copy_rtx (tmp), GEN_INT (cf),
3402 copy_rtx (tmp), 1, OPTAB_DIRECT);
3403 }
3404 else
3405 {
3406 /*
3407 * cmpl op0,op1
3408 * sbbl dest,dest
3409 * [notl dest]
3410 * andl cf - ct, dest
3411 * [addl dest, ct]
3412 *
3413 * Size 8 - 11.
3414 */
3415
3416 if (cf == 0)
3417 {
3418 cf = ct;
3419 ct = 0;
3420 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3421 }
3422
3423 tmp = expand_simple_binop (mode, AND,
3424 copy_rtx (tmp),
3425 gen_int_mode (cf - ct, mode),
3426 copy_rtx (tmp), 1, OPTAB_DIRECT);
3427 if (ct)
3428 tmp = expand_simple_binop (mode, PLUS,
3429 copy_rtx (tmp), GEN_INT (ct),
3430 copy_rtx (tmp), 1, OPTAB_DIRECT);
3431 }
3432
3433 if (!rtx_equal_p (tmp, out))
3434 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
3435
3436 return true;
3437 }
3438
3439 if (diff < 0)
3440 {
3441 machine_mode cmp_mode = GET_MODE (op0);
3442 enum rtx_code new_code;
3443
3444 if (SCALAR_FLOAT_MODE_P (cmp_mode))
3445 {
3446 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3447
3448 /* We may be reversing a non-trapping
3449 comparison to a trapping comparison. */
3450 if (HONOR_NANS (cmp_mode) && flag_trapping_math
3451 && code != EQ && code != NE
3452 && code != ORDERED && code != UNORDERED)
3453 new_code = UNKNOWN;
3454 else
3455 new_code = reverse_condition_maybe_unordered (code);
3456 }
3457 else
3458 new_code = ix86_reverse_condition (code, cmp_mode);
3459 if (new_code != UNKNOWN)
3460 {
3461 std::swap (ct, cf);
3462 diff = -diff;
3463 code = new_code;
3464 }
3465 }
3466
3467 compare_code = UNKNOWN;
3468 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
3469 && CONST_INT_P (op1))
3470 {
3471 if (op1 == const0_rtx
3472 && (code == LT || code == GE))
3473 compare_code = code;
3474 else if (op1 == constm1_rtx)
3475 {
3476 if (code == LE)
3477 compare_code = LT;
3478 else if (code == GT)
3479 compare_code = GE;
3480 }
3481 }
3482
3483 /* Optimize dest = (op0 < 0) ? -1 : cf. */
3484 if (compare_code != UNKNOWN
3485 && GET_MODE (op0) == GET_MODE (out)
3486 && (cf == -1 || ct == -1))
3487 {
3488 /* If lea code below could be used, only optimize
3489 if it results in a 2 insn sequence. */
3490
3491 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
3492 || diff == 3 || diff == 5 || diff == 9)
3493 || (compare_code == LT && ct == -1)
3494 || (compare_code == GE && cf == -1))
3495 {
3496 /*
3497 * notl op1 (if necessary)
3498 * sarl $31, op1
3499 * orl cf, op1
3500 */
3501 if (ct != -1)
3502 {
3503 cf = ct;
3504 ct = -1;
3505 code = reverse_condition (code);
3506 }
3507
3508 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3509
3510 out = expand_simple_binop (mode, IOR,
3511 out, GEN_INT (cf),
3512 out, 1, OPTAB_DIRECT);
3513 if (out != operands[0])
3514 emit_move_insn (operands[0], out);
3515
3516 return true;
3517 }
3518 }
3519
3520
3521 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
3522 || diff == 3 || diff == 5 || diff == 9)
3523 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
3524 && (mode != DImode
3525 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
3526 {
3527 /*
3528 * xorl dest,dest
3529 * cmpl op1,op2
3530 * setcc dest
3531 * lea cf(dest*(ct-cf)),dest
3532 *
3533 * Size 14.
3534 *
3535 * This also catches the degenerate setcc-only case.
3536 */
3537
3538 rtx tmp;
3539 int nops;
3540
3541 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3542
3543 nops = 0;
3544 /* On x86_64 the lea instruction operates on Pmode, so we need
3545 the arithmetic done in the proper mode to match. */
3546 if (diff == 1)
3547 tmp = copy_rtx (out);
3548 else
3549 {
3550 rtx out1;
3551 out1 = copy_rtx (out);
3552 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
3553 nops++;
3554 if (diff & 1)
3555 {
3556 tmp = gen_rtx_PLUS (mode, tmp, out1);
3557 nops++;
3558 }
3559 }
3560 if (cf != 0)
3561 {
3562 tmp = plus_constant (mode, tmp, cf);
3563 nops++;
3564 }
3565 if (!rtx_equal_p (tmp, out))
3566 {
3567 if (nops == 1)
3568 out = force_operand (tmp, copy_rtx (out));
3569 else
3570 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
3571 }
3572 if (!rtx_equal_p (out, operands[0]))
3573 emit_move_insn (operands[0], copy_rtx (out));
3574
3575 return true;
3576 }
3577
3578 /*
3579 * General case: Jumpful:
3580 * xorl dest,dest cmpl op1, op2
3581 * cmpl op1, op2 movl ct, dest
3582 * setcc dest jcc 1f
3583 * decl dest movl cf, dest
3584 * andl (cf-ct),dest 1:
3585 * addl ct,dest
3586 *
3587 * Size 20. Size 14.
3588 *
3589 * This is reasonably steep, but branch mispredict costs are
3590 * high on modern cpus, so consider failing only if optimizing
3591 * for space.
3592 */
3593
3594 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3595 && BRANCH_COST (optimize_insn_for_speed_p (),
3596 false) >= 2)
3597 {
3598 if (cf == 0)
3599 {
3600 machine_mode cmp_mode = GET_MODE (op0);
3601 enum rtx_code new_code;
3602
3603 if (SCALAR_FLOAT_MODE_P (cmp_mode))
3604 {
3605 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3606
3607 /* We may be reversing a non-trapping
3608 comparison to a trapping comparison. */
3609 if (HONOR_NANS (cmp_mode) && flag_trapping_math
3610 && code != EQ && code != NE
3611 && code != ORDERED && code != UNORDERED)
3612 new_code = UNKNOWN;
3613 else
3614 new_code = reverse_condition_maybe_unordered (code);
3615
3616 }
3617 else
3618 {
3619 new_code = ix86_reverse_condition (code, cmp_mode);
3620 if (compare_code != UNKNOWN && new_code != UNKNOWN)
3621 compare_code = reverse_condition (compare_code);
3622 }
3623
3624 if (new_code != UNKNOWN)
3625 {
3626 cf = ct;
3627 ct = 0;
3628 code = new_code;
3629 }
3630 }
3631
3632 if (compare_code != UNKNOWN)
3633 {
3634 /* notl op1 (if needed)
3635 sarl $31, op1
3636 andl (cf-ct), op1
3637 addl ct, op1
3638
3639 For x < 0 (resp. x <= -1) there will be no notl,
3640 so if possible swap the constants to get rid of the
3641 complement.
3642 True/false will be -1/0 while code below (store flag
3643 followed by decrement) is 0/-1, so the constants need
3644 to be exchanged once more. */
3645
3646 if (compare_code == GE || !cf)
3647 {
3648 code = reverse_condition (code);
3649 compare_code = LT;
3650 }
3651 else
3652 std::swap (ct, cf);
3653
3654 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3655 }
3656 else
3657 {
3658 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3659
3660 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
3661 constm1_rtx,
3662 copy_rtx (out), 1, OPTAB_DIRECT);
3663 }
3664
3665 out = expand_simple_binop (mode, AND, copy_rtx (out),
3666 gen_int_mode (cf - ct, mode),
3667 copy_rtx (out), 1, OPTAB_DIRECT);
3668 if (ct)
3669 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
3670 copy_rtx (out), 1, OPTAB_DIRECT);
3671 if (!rtx_equal_p (out, operands[0]))
3672 emit_move_insn (operands[0], copy_rtx (out));
3673
3674 return true;
3675 }
3676 }
3677
3678 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3679 {
3680 /* Try a few things more with specific constants and a variable. */
3681
3682 optab op;
3683 rtx var, orig_out, out, tmp;
3684
3685 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
3686 return false;
3687
3688 operands[2] = op2;
3689 operands[3] = op3;
3690
3691 /* If one of the two operands is an interesting constant, load -1 or 0 via
3692 the recursive expansion below and mask the variable in with AND or IOR. */
3693
3694 if (CONST_INT_P (operands[2]))
3695 {
3696 var = operands[3];
3697 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
3698 operands[3] = constm1_rtx, op = and_optab;
3699 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
3700 operands[3] = const0_rtx, op = ior_optab;
3701 else
3702 return false;
3703 }
3704 else if (CONST_INT_P (operands[3]))
3705 {
3706 var = operands[2];
3707 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
3708 {
3709 /* For smin (x, 0), expand as "x < 0 ? x : 0" instead of
3710 "x <= 0 ? x : 0" to enable sign_bit_compare_p. */
3711 if (code == LE && op1 == const0_rtx && rtx_equal_p (op0, var))
3712 operands[1] = simplify_gen_relational (LT, VOIDmode,
3713 GET_MODE (op0),
3714 op0, const0_rtx);
3715
3716 operands[2] = constm1_rtx;
3717 op = and_optab;
3718 }
3719 else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
3720 operands[2] = const0_rtx, op = ior_optab;
3721 else
3722 return false;
3723 }
3724 else
3725 return false;
3726
3727 orig_out = operands[0];
3728 tmp = gen_reg_rtx (mode);
3729 operands[0] = tmp;
3730
3731 /* Recurse to get the constant loaded. */
3732 if (!ix86_expand_int_movcc (operands))
3733 return false;
3734
3735 /* Mask in the interesting variable. */
3736 out = expand_binop (mode, op, var, tmp, orig_out, 0,
3737 OPTAB_WIDEN);
3738 if (!rtx_equal_p (out, orig_out))
3739 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
3740
3741 return true;
3742 }
3743
3744 /*
3745 * For comparison with above,
3746 *
3747 * movl cf,dest
3748 * movl ct,tmp
3749 * cmpl op1,op2
3750 * cmovcc tmp,dest
3751 *
3752 * Size 15.
3753 */
3754
3755 if (! nonimmediate_operand (operands[2], mode))
3756 operands[2] = force_reg (mode, operands[2]);
3757 if (! nonimmediate_operand (operands[3], mode))
3758 operands[3] = force_reg (mode, operands[3]);
3759
3760 if (! register_operand (operands[2], VOIDmode)
3761 && (mode == QImode
3762 || ! register_operand (operands[3], VOIDmode)))
3763 operands[2] = force_reg (mode, operands[2]);
3764
3765 if (mode == QImode
3766 && ! register_operand (operands[3], VOIDmode))
3767 operands[3] = force_reg (mode, operands[3]);
3768
3769 emit_insn (compare_seq);
3770 emit_insn (gen_rtx_SET (operands[0],
3771 gen_rtx_IF_THEN_ELSE (mode,
3772 compare_op, operands[2],
3773 operands[3])));
3774 return true;
3775 }
3776
3777 /* Detect conditional moves that exactly match min/max operational
3778 semantics. Note that this is IEEE safe, as long as we don't
3779 interchange the operands.
3780
3781 Returns FALSE if this conditional move doesn't match a MIN/MAX,
3782 and TRUE if the operation is successful and instructions are emitted. */
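/* Only operand orders that map directly onto the SSE min/max semantics
   are accepted here: e.g.  a < b ? a : b  becomes a min and
   a < b ? b : a  becomes a max (likewise for the UNGE forms with the
   arms swapped); any other shape makes this return false.  */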
3783
3784 static bool
3785 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
3786 rtx cmp_op1, rtx if_true, rtx if_false)
3787 {
3788 machine_mode mode;
3789 bool is_min;
3790 rtx tmp;
3791
3792 if (code == LT)
3793 ;
3794 else if (code == UNGE)
3795 std::swap (if_true, if_false);
3796 else
3797 return false;
3798
3799 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
3800 is_min = true;
3801 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
3802 is_min = false;
3803 else
3804 return false;
3805
3806 mode = GET_MODE (dest);
3807
3808 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
3809 but MODE may be a vector mode and thus not appropriate. */
3810 if (!flag_finite_math_only || flag_signed_zeros)
3811 {
3812 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
3813 rtvec v;
3814
3815 if_true = force_reg (mode, if_true);
3816 v = gen_rtvec (2, if_true, if_false);
3817 tmp = gen_rtx_UNSPEC (mode, v, u);
3818 }
3819 else
3820 {
3821 code = is_min ? SMIN : SMAX;
3822 if (MEM_P (if_true) && MEM_P (if_false))
3823 if_true = force_reg (mode, if_true);
3824 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
3825 }
3826
3827 emit_insn (gen_rtx_SET (dest, tmp));
3828 return true;
3829 }
3830
3831 /* Return true if MODE is valid for a vector compare to a mask register;
3832 the same holds for a conditional vector move with a mask register. */
3833 static bool
3834 ix86_valid_mask_cmp_mode (machine_mode mode)
3835 {
3836 /* XOP has its own vector conditional movement. */
3837 if (TARGET_XOP && !TARGET_AVX512F)
3838 return false;
3839
3840 /* HFmode only supports vcmpsh, whose destination is a mask register. */
3841 if (TARGET_AVX512FP16 && mode == HFmode)
3842 return true;
3843
3844 /* AVX512F is needed for mask operation. */
3845 if (!(TARGET_AVX512F && VECTOR_MODE_P (mode)))
3846 return false;
3847
3848 /* AVX512BW is needed for vector QI/HImode,
3849 AVX512VL is needed for 128/256-bit vector. */
3850 machine_mode inner_mode = GET_MODE_INNER (mode);
3851 int vector_size = GET_MODE_SIZE (mode);
3852 if ((inner_mode == QImode || inner_mode == HImode) && !TARGET_AVX512BW)
3853 return false;
3854
3855 return vector_size == 64 || TARGET_AVX512VL;
3856 }
3857
3858 /* Return true if integer mask comparison should be used. */
3859 static bool
3860 ix86_use_mask_cmp_p (machine_mode mode, machine_mode cmp_mode,
3861 rtx op_true, rtx op_false)
3862 {
3863 int vector_size = GET_MODE_SIZE (mode);
3864
3865 if (cmp_mode == HFmode)
3866 return true;
3867 else if (vector_size < 16)
3868 return false;
3869 else if (vector_size == 64)
3870 return true;
3871 else if (GET_MODE_INNER (cmp_mode) == HFmode)
3872 return true;
3873
3874 /* When op_true is NULL, op_false must be NULL, or vice versa. */
3875 gcc_assert (!op_true == !op_false);
3876
3877 /* When op_true/op_false is NULL or cmp_mode is not a valid mask cmp mode,
3878 a vector dest is required. */
3879 if (!op_true || !ix86_valid_mask_cmp_mode (cmp_mode))
3880 return false;
3881
3882 /* Exclude those that could be optimized in ix86_expand_sse_movcc. */
3883 if (op_false == CONST0_RTX (mode)
3884 || op_true == CONST0_RTX (mode)
3885 || (INTEGRAL_MODE_P (mode)
3886 && (op_true == CONSTM1_RTX (mode)
3887 || op_false == CONSTM1_RTX (mode))))
3888 return false;
3889
3890 return true;
3891 }
3892
3893 /* Expand an SSE comparison. Return the register with the result. */
3894
3895 static rtx
3896 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
3897 rtx op_true, rtx op_false)
3898 {
3899 machine_mode mode = GET_MODE (dest);
3900 machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
3901
3902 /* In general the result of the comparison can differ from the operands' type. */
3903 machine_mode cmp_mode;
3904
3905 /* In AVX512F the result of comparison is an integer mask. */
3906 bool maskcmp = false;
3907 rtx x;
3908
3909 if (ix86_use_mask_cmp_p (mode, cmp_ops_mode, op_true, op_false))
3910 {
3911 unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
3912 maskcmp = true;
3913 cmp_mode = nbits > 8 ? int_mode_for_size (nbits, 0).require () : E_QImode;
3914 }
3915 else
3916 cmp_mode = cmp_ops_mode;
3917
3918 cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
3919
3920 bool (*op1_predicate)(rtx, machine_mode)
3921 = VECTOR_MODE_P (cmp_ops_mode) ? vector_operand : nonimmediate_operand;
3922
3923 if (!op1_predicate (cmp_op1, cmp_ops_mode))
3924 cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
3925
3926 if (optimize
3927 || (maskcmp && cmp_mode != mode)
3928 || (op_true && reg_overlap_mentioned_p (dest, op_true))
3929 || (op_false && reg_overlap_mentioned_p (dest, op_false)))
3930 dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
3931
3932 if (maskcmp)
3933 {
3934 bool ok = ix86_expand_mask_vec_cmp (dest, code, cmp_op0, cmp_op1);
3935 gcc_assert (ok);
3936 return dest;
3937 }
3938
3939 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
3940
3941 if (cmp_mode != mode)
3942 {
3943 x = force_reg (cmp_ops_mode, x);
3944 convert_move (dest, x, false);
3945 }
3946 else
3947 emit_insn (gen_rtx_SET (dest, x));
3948
3949 return dest;
3950 }
3951
3952 /* Emit the x86 binary operation CODE in mode MODE for SSE vector
3953 instructions that can also be performed using GP registers. */
3954
3955 static void
3956 ix86_emit_vec_binop (enum rtx_code code, machine_mode mode,
3957 rtx dst, rtx src1, rtx src2)
3958 {
3959 rtx tmp;
3960
3961 tmp = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
3962
3963 if (GET_MODE_SIZE (mode) <= GET_MODE_SIZE (SImode)
3964 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
3965 {
3966 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
3967 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
3968 }
3969
3970 emit_insn (tmp);
3971 }
3972
3973 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
3974 operations. This is used for both scalar and vector conditional moves. */
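/* When no mask register, XOP or SSE4.1 blend form applies, the general
   shape of the expansion is roughly
     dest = (cmp & op_true) | (~cmp & op_false),
   with the AND/ANDNOT/IOR steps dropped whenever one of the arms is all
   zeros or all ones, as in the special cases below.  */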
3975
3976 void
3977 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
3978 {
3979 machine_mode mode = GET_MODE (dest);
3980 machine_mode cmpmode = GET_MODE (cmp);
3981 rtx x;
3982
3983 /* Simplify trivial VEC_COND_EXPR to avoid ICE in pr97506. */
3984 if (rtx_equal_p (op_true, op_false))
3985 {
3986 emit_move_insn (dest, op_true);
3987 return;
3988 }
3989
3990 /* If we have an integer mask and FP value then we need
3991 to cast mask to FP mode. */
3992 if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
3993 {
3994 cmp = force_reg (cmpmode, cmp);
3995 cmp = gen_rtx_SUBREG (mode, cmp, 0);
3996 }
3997
3998 /* In AVX512F the result of comparison is an integer mask. */
3999 if (mode != cmpmode
4000 && GET_MODE_CLASS (cmpmode) == MODE_INT)
4001 {
4002 gcc_assert (ix86_valid_mask_cmp_mode (mode));
4003 /* Using scalar/vector move with mask register. */
4004 cmp = force_reg (cmpmode, cmp);
4005 /* Optimize for mask zero. */
4006 op_true = (op_true != CONST0_RTX (mode)
4007 ? force_reg (mode, op_true) : op_true);
4008 op_false = (op_false != CONST0_RTX (mode)
4009 ? force_reg (mode, op_false) : op_false);
4010 if (op_true == CONST0_RTX (mode))
4011 {
4012 if (cmpmode == E_DImode && !TARGET_64BIT)
4013 {
4014 x = gen_reg_rtx (cmpmode);
4015 emit_insn (gen_knotdi (x, cmp));
4016 }
4017 else
4018 x = expand_simple_unop (cmpmode, NOT, cmp, NULL, 1);
4019 cmp = x;
4020 /* Reverse op_true op_false. */
4021 std::swap (op_true, op_false);
4022 }
4023
4024 if (mode == HFmode)
4025 emit_insn (gen_movhf_mask (dest, op_true, op_false, cmp));
4026 else
4027 emit_insn (gen_rtx_SET (dest,
4028 gen_rtx_VEC_MERGE (mode,
4029 op_true, op_false, cmp)));
4030 return;
4031 }
4032
4033 if (vector_all_ones_operand (op_true, mode)
4034 && op_false == CONST0_RTX (mode))
4035 {
4036 emit_move_insn (dest, cmp);
4037 return;
4038 }
4039 else if (op_false == CONST0_RTX (mode))
4040 {
4041 x = expand_simple_binop (mode, AND, cmp, op_true,
4042 dest, 1, OPTAB_DIRECT);
4043 if (x != dest)
4044 emit_move_insn (dest, x);
4045 return;
4046 }
4047 else if (op_true == CONST0_RTX (mode))
4048 {
4049 op_false = force_reg (mode, op_false);
4050 x = gen_rtx_NOT (mode, cmp);
4051 ix86_emit_vec_binop (AND, mode, dest, x, op_false);
4052 return;
4053 }
4054 else if (vector_all_ones_operand (op_true, mode))
4055 {
4056 x = expand_simple_binop (mode, IOR, cmp, op_false,
4057 dest, 1, OPTAB_DIRECT);
4058 if (x != dest)
4059 emit_move_insn (dest, x);
4060 return;
4061 }
4062
4063 if (TARGET_XOP)
4064 {
4065 op_true = force_reg (mode, op_true);
4066
4067 if (GET_MODE_SIZE (mode) < 16
4068 || !nonimmediate_operand (op_false, mode))
4069 op_false = force_reg (mode, op_false);
4070
4071 emit_insn (gen_rtx_SET (dest,
4072 gen_rtx_IF_THEN_ELSE (mode, cmp,
4073 op_true, op_false)));
4074 return;
4075 }
4076
4077 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
4078 machine_mode blend_mode = mode;
4079
4080 if (GET_MODE_SIZE (mode) < 16
4081 || !vector_operand (op_true, mode))
4082 op_true = force_reg (mode, op_true);
4083
4084 op_false = force_reg (mode, op_false);
4085
4086 switch (mode)
4087 {
4088 case E_V2SFmode:
4089 if (TARGET_SSE4_1)
4090 gen = gen_mmx_blendvps;
4091 break;
4092 case E_V4SFmode:
4093 if (TARGET_SSE4_1)
4094 gen = gen_sse4_1_blendvps;
4095 break;
4096 case E_V2DFmode:
4097 if (TARGET_SSE4_1)
4098 gen = gen_sse4_1_blendvpd;
4099 break;
4100 case E_SFmode:
4101 if (TARGET_SSE4_1)
4102 gen = gen_sse4_1_blendvss;
4103 break;
4104 case E_DFmode:
4105 if (TARGET_SSE4_1)
4106 gen = gen_sse4_1_blendvsd;
4107 break;
4108 case E_V8QImode:
4109 case E_V4HImode:
4110 case E_V2SImode:
4111 if (TARGET_SSE4_1)
4112 {
4113 gen = gen_mmx_pblendvb_v8qi;
4114 blend_mode = V8QImode;
4115 }
4116 break;
4117 case E_V4QImode:
4118 case E_V2HImode:
4119 if (TARGET_SSE4_1)
4120 {
4121 gen = gen_mmx_pblendvb_v4qi;
4122 blend_mode = V4QImode;
4123 }
4124 break;
4125 case E_V2QImode:
4126 if (TARGET_SSE4_1)
4127 gen = gen_mmx_pblendvb_v2qi;
4128 break;
4129 case E_V16QImode:
4130 case E_V8HImode:
4131 case E_V8HFmode:
4132 case E_V8BFmode:
4133 case E_V4SImode:
4134 case E_V2DImode:
4135 case E_V1TImode:
4136 if (TARGET_SSE4_1)
4137 {
4138 gen = gen_sse4_1_pblendvb;
4139 blend_mode = V16QImode;
4140 }
4141 break;
4142 case E_V8SFmode:
4143 if (TARGET_AVX)
4144 gen = gen_avx_blendvps256;
4145 break;
4146 case E_V4DFmode:
4147 if (TARGET_AVX)
4148 gen = gen_avx_blendvpd256;
4149 break;
4150 case E_V32QImode:
4151 case E_V16HImode:
4152 case E_V16HFmode:
4153 case E_V16BFmode:
4154 case E_V8SImode:
4155 case E_V4DImode:
4156 if (TARGET_AVX2)
4157 {
4158 gen = gen_avx2_pblendvb;
4159 blend_mode = V32QImode;
4160 }
4161 break;
4162
4163 case E_V64QImode:
4164 gen = gen_avx512bw_blendmv64qi;
4165 break;
4166 case E_V32HImode:
4167 gen = gen_avx512bw_blendmv32hi;
4168 break;
4169 case E_V32HFmode:
4170 gen = gen_avx512bw_blendmv32hf;
4171 break;
4172 case E_V32BFmode:
4173 gen = gen_avx512bw_blendmv32bf;
4174 break;
4175 case E_V16SImode:
4176 gen = gen_avx512f_blendmv16si;
4177 break;
4178 case E_V8DImode:
4179 gen = gen_avx512f_blendmv8di;
4180 break;
4181 case E_V8DFmode:
4182 gen = gen_avx512f_blendmv8df;
4183 break;
4184 case E_V16SFmode:
4185 gen = gen_avx512f_blendmv16sf;
4186 break;
4187
4188 default:
4189 break;
4190 }
4191
4192 if (gen != NULL)
4193 {
4194 if (blend_mode == mode)
4195 x = dest;
4196 else
4197 {
4198 x = gen_reg_rtx (blend_mode);
4199 op_false = gen_lowpart (blend_mode, op_false);
4200 op_true = gen_lowpart (blend_mode, op_true);
4201 cmp = gen_lowpart (blend_mode, cmp);
4202 }
4203
4204 emit_insn (gen (x, op_false, op_true, cmp));
4205
4206 if (x != dest)
4207 emit_move_insn (dest, gen_lowpart (mode, x));
4208 }
4209 else
4210 {
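/* No blend or mask instruction applies here, so emit the generic
bitwise sequence dest = (cmp & op_true) | (~cmp & op_false). */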
4211 rtx t2, t3;
4212
4213 t2 = expand_simple_binop (mode, AND, op_true, cmp,
4214 NULL, 1, OPTAB_DIRECT);
4215
4216 t3 = gen_reg_rtx (mode);
4217 x = gen_rtx_NOT (mode, cmp);
4218 ix86_emit_vec_binop (AND, mode, t3, x, op_false);
4219
4220 x = expand_simple_binop (mode, IOR, t3, t2,
4221 dest, 1, OPTAB_DIRECT);
4222 if (x != dest)
4223 emit_move_insn (dest, x);
4224 }
4225 }
4226
4227 /* Swap, force into registers, or otherwise massage the two operands
4228 to an sse comparison with a mask result. Thus we differ a bit from
4229 ix86_prepare_fp_compare_args which expects to produce a flags result.
4230
4231 The DEST operand exists to help determine whether to commute commutative
4232 operators. The POP0/POP1 operands are updated in place. The new
4233 comparison code is returned, or UNKNOWN if not implementable. */
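/* For example, without AVX the SSE compare instructions only provide the
EQ/LT/LE/UNORD/NEQ/NLT/NLE/ORD predicates, so a GT comparison must be
rewritten here as LT with the operands swapped. */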
4234
4235 static enum rtx_code
4236 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
4237 rtx *pop0, rtx *pop1)
4238 {
4239 switch (code)
4240 {
4241 case LTGT:
4242 case UNEQ:
4243 /* AVX supports all the needed comparisons. */
4244 if (TARGET_AVX)
4245 break;
4246 /* We have no LTGT as an operator. We could implement it with
4247 NE & ORDERED, but this requires an extra temporary. It's
4248 not clear that it's worth it. */
4249 return UNKNOWN;
4250
4251 case LT:
4252 case LE:
4253 case UNGT:
4254 case UNGE:
4255 /* These are supported directly. */
4256 break;
4257
4258 case EQ:
4259 case NE:
4260 case UNORDERED:
4261 case ORDERED:
4262 /* AVX has 3 operand comparisons, no need to swap anything. */
4263 if (TARGET_AVX)
4264 break;
4265 /* For commutative operators, try to canonicalize the destination
4266 operand to be first in the comparison - this helps reload to
4267 avoid extra moves. */
4268 if (!dest || !rtx_equal_p (dest, *pop1))
4269 break;
4270 /* FALLTHRU */
4271
4272 case GE:
4273 case GT:
4274 case UNLE:
4275 case UNLT:
4276 /* These are not supported directly before AVX, and furthermore
4277 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
4278 comparison operands to transform into something that is
4279 supported. */
4280 std::swap (*pop0, *pop1);
4281 code = swap_condition (code);
4282 break;
4283
4284 default:
4285 gcc_unreachable ();
4286 }
4287
4288 return code;
4289 }
4290
4291 /* Expand a floating-point conditional move. Return true if successful. */
4292
4293 bool
4294 ix86_expand_fp_movcc (rtx operands[])
4295 {
4296 machine_mode mode = GET_MODE (operands[0]);
4297 enum rtx_code code = GET_CODE (operands[1]);
4298 rtx tmp, compare_op;
4299 rtx op0 = XEXP (operands[1], 0);
4300 rtx op1 = XEXP (operands[1], 1);
4301
4302 if (GET_MODE (op0) == BFmode
4303 && !ix86_fp_comparison_operator (operands[1], VOIDmode))
4304 return false;
4305
4306 if (SSE_FLOAT_MODE_SSEMATH_OR_HF_P (mode))
4307 {
4308 machine_mode cmode;
4309
4310 /* Since we've no cmove for sse registers, don't force bad register
4311 allocation just to gain access to it. Deny movcc when the
4312 comparison mode doesn't match the move mode. */
4313 cmode = GET_MODE (op0);
4314 if (cmode == VOIDmode)
4315 cmode = GET_MODE (op1);
4316 if (cmode != mode)
4317 return false;
4318
4319 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
4320 if (code == UNKNOWN)
4321 return false;
4322
4323 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
4324 operands[2], operands[3]))
4325 return true;
4326
4327 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
4328 operands[2], operands[3]);
4329 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
4330 return true;
4331 }
4332
4333 if (GET_MODE (op0) == TImode
4334 || (GET_MODE (op0) == DImode
4335 && !TARGET_64BIT))
4336 return false;
4337
4338 /* The floating point conditional move instructions don't directly
4339 support conditions resulting from a signed integer comparison. */
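/* In that case the comparison is materialized into a QImode value with
ix86_expand_setcc and the conditional move tests that value with NE
against zero instead. */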
4340
4341 compare_op = ix86_expand_compare (code, op0, op1);
4342 if (!fcmov_comparison_operator (compare_op, VOIDmode))
4343 {
4344 tmp = gen_reg_rtx (QImode);
4345 ix86_expand_setcc (tmp, code, op0, op1);
4346
4347 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
4348 }
4349
4350 emit_insn (gen_rtx_SET (operands[0],
4351 gen_rtx_IF_THEN_ELSE (mode, compare_op,
4352 operands[2], operands[3])));
4353
4354 return true;
4355 }
4356
4357 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
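/* The returned value is the VPCMP/VPCMPU predicate immediate:
0 EQ, 1 LT, 2 LE, 4 NE, 5 NLT (i.e. GE), 6 NLE (i.e. GT); predicates
4-6 are the negations of 0-2. */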
4358
4359 static int
4360 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
4361 {
4362 switch (code)
4363 {
4364 case EQ:
4365 return 0;
4366 case LT:
4367 case LTU:
4368 return 1;
4369 case LE:
4370 case LEU:
4371 return 2;
4372 case NE:
4373 return 4;
4374 case GE:
4375 case GEU:
4376 return 5;
4377 case GT:
4378 case GTU:
4379 return 6;
4380 default:
4381 gcc_unreachable ();
4382 }
4383 }
4384
4385 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
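/* The returned values are the AVX/AVX-512 5-bit VCMP predicate immediates,
e.g. 0x0e is GT_OS, 0x0c is NEQ_OQ and 0x18 is EQ_US. */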
4386
4387 static int
4388 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
4389 {
4390 switch (code)
4391 {
4392 case EQ:
4393 return 0x00;
4394 case NE:
4395 return 0x04;
4396 case GT:
4397 return 0x0e;
4398 case LE:
4399 return 0x02;
4400 case GE:
4401 return 0x0d;
4402 case LT:
4403 return 0x01;
4404 case UNLE:
4405 return 0x0a;
4406 case UNLT:
4407 return 0x09;
4408 case UNGE:
4409 return 0x05;
4410 case UNGT:
4411 return 0x06;
4412 case UNEQ:
4413 return 0x18;
4414 case LTGT:
4415 return 0x0c;
4416 case ORDERED:
4417 return 0x07;
4418 case UNORDERED:
4419 return 0x03;
4420 default:
4421 gcc_unreachable ();
4422 }
4423 }
4424
4425 /* Return immediate value to be used in UNSPEC_PCMP
4426 for comparison CODE in MODE. */
4427
4428 static int
4429 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
4430 {
4431 if (FLOAT_MODE_P (mode))
4432 return ix86_fp_cmp_code_to_pcmp_immediate (code);
4433 return ix86_int_cmp_code_to_pcmp_immediate (code);
4434 }
4435
4436 /* Expand AVX-512 vector comparison. */
4437
4438 bool
4439 ix86_expand_mask_vec_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1)
4440 {
4441 machine_mode mask_mode = GET_MODE (dest);
4442 machine_mode cmp_mode = GET_MODE (cmp_op0);
4443 rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
4444 int unspec_code;
4445 rtx unspec;
4446
4447 switch (code)
4448 {
4449 case LEU:
4450 case GTU:
4451 case GEU:
4452 case LTU:
4453 unspec_code = UNSPEC_UNSIGNED_PCMP;
4454 break;
4455
4456 default:
4457 unspec_code = UNSPEC_PCMP;
4458 }
4459
4460 unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, cmp_op0, cmp_op1, imm),
4461 unspec_code);
4462 emit_insn (gen_rtx_SET (dest, unspec));
4463
4464 return true;
4465 }
4466
4467 /* Expand fp vector comparison. */
4468
4469 bool
4470 ix86_expand_fp_vec_cmp (rtx operands[])
4471 {
4472 enum rtx_code code = GET_CODE (operands[1]);
4473 rtx cmp;
4474
4475 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
4476 &operands[2], &operands[3]);
4477 if (code == UNKNOWN)
4478 {
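/* LTGT and UNEQ have no direct encoding here; build them as
ORDERED & NE and UNORDERED | EQ respectively. */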
4479 rtx temp;
4480 switch (GET_CODE (operands[1]))
4481 {
4482 case LTGT:
4483 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
4484 operands[3], NULL, NULL);
4485 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
4486 operands[3], NULL, NULL);
4487 code = AND;
4488 break;
4489 case UNEQ:
4490 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
4491 operands[3], NULL, NULL);
4492 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
4493 operands[3], NULL, NULL);
4494 code = IOR;
4495 break;
4496 default:
4497 gcc_unreachable ();
4498 }
4499 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
4500 OPTAB_DIRECT);
4501 }
4502 else
4503 cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
4504 NULL, NULL);
4505
4506 if (operands[0] != cmp)
4507 emit_move_insn (operands[0], cmp);
4508
4509 return true;
4510 }
4511
4512 static rtx
4513 ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
4514 rtx op_true, rtx op_false, bool *negate)
4515 {
4516 machine_mode data_mode = GET_MODE (dest);
4517 machine_mode mode = GET_MODE (cop0);
4518 rtx x;
4519
4520 *negate = false;
4521
4522 /* XOP supports all of the comparisons on all 128-bit vector int types. */
4523 if (TARGET_XOP
4524 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT
4525 && GET_MODE_SIZE (mode) <= 16)
4526 ;
4527 /* AVX512F supports all of the comparisons
4528 on all 128/256/512-bit vector int types. */
4529 else if (ix86_use_mask_cmp_p (data_mode, mode, op_true, op_false))
4530 ;
4531 else
4532 {
4533 /* Canonicalize the comparison to EQ, GT, GTU. */
4534 switch (code)
4535 {
4536 case EQ:
4537 case GT:
4538 case GTU:
4539 break;
4540
4541 case LE:
4542 case LEU:
4543 /* x <= cst can be handled as x < cst + 1 unless there is
4544 wrap around in cst + 1. */
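/* E.g. x LEU {5,...} becomes {6,...} GTU x after the swap below. */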
4545 if (GET_CODE (cop1) == CONST_VECTOR
4546 && GET_MODE_INNER (mode) != TImode)
4547 {
4548 unsigned int n_elts = GET_MODE_NUNITS (mode), i;
4549 machine_mode eltmode = GET_MODE_INNER (mode);
4550 for (i = 0; i < n_elts; ++i)
4551 {
4552 rtx elt = CONST_VECTOR_ELT (cop1, i);
4553 if (!CONST_INT_P (elt))
4554 break;
4555 if (code == LE)
4556 {
4557 /* For LE punt if some element is signed maximum. */
4558 if ((INTVAL (elt) & (GET_MODE_MASK (eltmode) >> 1))
4559 == (GET_MODE_MASK (eltmode) >> 1))
4560 break;
4561 }
4562 /* For LEU punt if some element is unsigned maximum. */
4563 else if (elt == constm1_rtx)
4564 break;
4565 }
4566 if (i == n_elts)
4567 {
4568 rtvec v = rtvec_alloc (n_elts);
4569 for (i = 0; i < n_elts; ++i)
4570 RTVEC_ELT (v, i)
4571 = gen_int_mode (INTVAL (CONST_VECTOR_ELT (cop1, i)) + 1,
4572 eltmode);
4573 cop1 = gen_rtx_CONST_VECTOR (mode, v);
4574 std::swap (cop0, cop1);
4575 code = code == LE ? GT : GTU;
4576 break;
4577 }
4578 }
4579 /* FALLTHRU */
4580 case NE:
4581 code = reverse_condition (code);
4582 *negate = true;
4583 break;
4584
4585 case GE:
4586 case GEU:
4587 /* x >= cst can be handled as x > cst - 1 unless there is
4588 wrap around in cst - 1. */
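/* E.g. x GEU {5,...} becomes x GTU {4,...}. */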
4589 if (GET_CODE (cop1) == CONST_VECTOR
4590 && GET_MODE_INNER (mode) != TImode)
4591 {
4592 unsigned int n_elts = GET_MODE_NUNITS (mode), i;
4593 machine_mode eltmode = GET_MODE_INNER (mode);
4594 for (i = 0; i < n_elts; ++i)
4595 {
4596 rtx elt = CONST_VECTOR_ELT (cop1, i);
4597 if (!CONST_INT_P (elt))
4598 break;
4599 if (code == GE)
4600 {
4601 /* For GE punt if some element is signed minimum. */
4602 if (INTVAL (elt) < 0
4603 && ((INTVAL (elt) & (GET_MODE_MASK (eltmode) >> 1))
4604 == 0))
4605 break;
4606 }
4607 /* For GEU punt if some element is zero. */
4608 else if (elt == const0_rtx)
4609 break;
4610 }
4611 if (i == n_elts)
4612 {
4613 rtvec v = rtvec_alloc (n_elts);
4614 for (i = 0; i < n_elts; ++i)
4615 RTVEC_ELT (v, i)
4616 = gen_int_mode (INTVAL (CONST_VECTOR_ELT (cop1, i)) - 1,
4617 eltmode);
4618 cop1 = gen_rtx_CONST_VECTOR (mode, v);
4619 code = code == GE ? GT : GTU;
4620 break;
4621 }
4622 }
4623 code = reverse_condition (code);
4624 *negate = true;
4625 /* FALLTHRU */
4626
4627 case LT:
4628 case LTU:
4629 std::swap (cop0, cop1);
4630 code = swap_condition (code);
4631 break;
4632
4633 default:
4634 gcc_unreachable ();
4635 }
4636
4637 /* Only SSE4.1/SSE4.2 supports V2DImode. */
4638 if (mode == V2DImode)
4639 {
4640 switch (code)
4641 {
4642 case EQ:
4643 /* SSE4.1 supports EQ. */
4644 if (!TARGET_SSE4_1)
4645 return NULL;
4646 break;
4647
4648 case GT:
4649 case GTU:
4650 /* SSE4.2 supports GT/GTU. */
4651 if (!TARGET_SSE4_2)
4652 return NULL;
4653 break;
4654
4655 default:
4656 gcc_unreachable ();
4657 }
4658 }
4659
4660 if (GET_CODE (cop0) == CONST_VECTOR)
4661 cop0 = force_reg (mode, cop0);
4662 else if (GET_CODE (cop1) == CONST_VECTOR)
4663 cop1 = force_reg (mode, cop1);
4664
4665 rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode);
4666 rtx opfalse = op_false ? op_false : CONST0_RTX (data_mode);
4667 if (*negate)
4668 std::swap (optrue, opfalse);
4669
4670 /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when
4671 not using integer masks into min (x, y) == x ? -1 : 0 (i.e.
4672 min (x, y) == x). While we add one instruction (the minimum),
4673 we remove the need for the two instructions of the negation, as the
4674 result already comes out in the desired form.
4675 When using masks, do it for SI/DImode element types, as it is shorter
4676 than the two subtractions. */
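/* E.g. x GTU y becomes the negation of umin (x, y) == x, since
umin (x, y) == x is equivalent to x LEU y. */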
4677 if ((code != EQ
4678 && GET_MODE_SIZE (mode) != 64
4679 && vector_all_ones_operand (opfalse, data_mode)
4680 && optrue == CONST0_RTX (data_mode))
4681 || (code == GTU
4682 && GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4
4683 /* Don't do it if not using integer masks and we'd end up with
4684 the right values in the registers though. */
4685 && (GET_MODE_SIZE (mode) == 64
4686 || !vector_all_ones_operand (optrue, data_mode)
4687 || opfalse != CONST0_RTX (data_mode))))
4688 {
4689 rtx (*gen) (rtx, rtx, rtx) = NULL;
4690
4691 switch (mode)
4692 {
4693 case E_V16SImode:
4694 gen = (code == GTU) ? gen_uminv16si3 : gen_sminv16si3;
4695 break;
4696 case E_V8DImode:
4697 gen = (code == GTU) ? gen_uminv8di3 : gen_sminv8di3;
4698 cop0 = force_reg (mode, cop0);
4699 cop1 = force_reg (mode, cop1);
4700 break;
4701 case E_V32QImode:
4702 if (TARGET_AVX2)
4703 gen = (code == GTU) ? gen_uminv32qi3 : gen_sminv32qi3;
4704 break;
4705 case E_V16HImode:
4706 if (TARGET_AVX2)
4707 gen = (code == GTU) ? gen_uminv16hi3 : gen_sminv16hi3;
4708 break;
4709 case E_V8SImode:
4710 if (TARGET_AVX2)
4711 gen = (code == GTU) ? gen_uminv8si3 : gen_sminv8si3;
4712 break;
4713 case E_V4DImode:
4714 if (TARGET_AVX512VL)
4715 {
4716 gen = (code == GTU) ? gen_uminv4di3 : gen_sminv4di3;
4717 cop0 = force_reg (mode, cop0);
4718 cop1 = force_reg (mode, cop1);
4719 }
4720 break;
4721 case E_V16QImode:
4722 if (code == GTU && TARGET_SSE2)
4723 gen = gen_uminv16qi3;
4724 else if (code == GT && TARGET_SSE4_1)
4725 gen = gen_sminv16qi3;
4726 break;
4727 case E_V8QImode:
4728 if (code == GTU && TARGET_SSE2)
4729 gen = gen_uminv8qi3;
4730 else if (code == GT && TARGET_SSE4_1)
4731 gen = gen_sminv8qi3;
4732 break;
4733 case E_V4QImode:
4734 if (code == GTU && TARGET_SSE2)
4735 gen = gen_uminv4qi3;
4736 else if (code == GT && TARGET_SSE4_1)
4737 gen = gen_sminv4qi3;
4738 break;
4739 case E_V2QImode:
4740 if (code == GTU && TARGET_SSE2)
4741 gen = gen_uminv2qi3;
4742 else if (code == GT && TARGET_SSE4_1)
4743 gen = gen_sminv2qi3;
4744 break;
4745 case E_V8HImode:
4746 if (code == GTU && TARGET_SSE4_1)
4747 gen = gen_uminv8hi3;
4748 else if (code == GT && TARGET_SSE2)
4749 gen = gen_sminv8hi3;
4750 break;
4751 case E_V4HImode:
4752 if (code == GTU && TARGET_SSE4_1)
4753 gen = gen_uminv4hi3;
4754 else if (code == GT && TARGET_SSE2)
4755 gen = gen_sminv4hi3;
4756 break;
4757 case E_V2HImode:
4758 if (code == GTU && TARGET_SSE4_1)
4759 gen = gen_uminv2hi3;
4760 else if (code == GT && TARGET_SSE2)
4761 gen = gen_sminv2hi3;
4762 break;
4763 case E_V4SImode:
4764 if (TARGET_SSE4_1)
4765 gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3;
4766 break;
4767 case E_V2SImode:
4768 if (TARGET_SSE4_1)
4769 gen = (code == GTU) ? gen_uminv2si3 : gen_sminv2si3;
4770 break;
4771 case E_V2DImode:
4772 if (TARGET_AVX512VL)
4773 {
4774 gen = (code == GTU) ? gen_uminv2di3 : gen_sminv2di3;
4775 cop0 = force_reg (mode, cop0);
4776 cop1 = force_reg (mode, cop1);
4777 }
4778 break;
4779 default:
4780 break;
4781 }
4782
4783 if (gen)
4784 {
4785 rtx tem = gen_reg_rtx (mode);
4786 if (!vector_operand (cop0, mode))
4787 cop0 = force_reg (mode, cop0);
4788 if (!vector_operand (cop1, mode))
4789 cop1 = force_reg (mode, cop1);
4790 *negate = !*negate;
4791 emit_insn (gen (tem, cop0, cop1));
4792 cop1 = tem;
4793 code = EQ;
4794 }
4795 }
4796
4797 /* Unsigned parallel compare is not supported by the hardware.
4798 Play some tricks to turn this into a signed comparison
4799 against 0. */
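/* For SImode/DImode elements both operands are biased by subtracting the
sign-bit mask, e.g. for V4SImode x GTU y becomes
(x - 0x80000000) GT (y - 0x80000000); for narrower elements a saturating
subtraction whose result is tested against zero is used instead. */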
4800 if (code == GTU)
4801 {
4802 cop0 = force_reg (mode, cop0);
4803
4804 switch (mode)
4805 {
4806 case E_V16SImode:
4807 case E_V8DImode:
4808 case E_V8SImode:
4809 case E_V4DImode:
4810 case E_V4SImode:
4811 case E_V2SImode:
4812 case E_V2DImode:
4813 {
4814 rtx t1, t2, mask;
4815
4816 /* Subtract (-(INT MAX) - 1) from both operands to make
4817 them signed. */
4818 mask = ix86_build_signbit_mask (mode, true, false);
4819 t1 = gen_reg_rtx (mode);
4820 emit_insn (gen_sub3_insn (t1, cop0, mask));
4821
4822 t2 = gen_reg_rtx (mode);
4823 emit_insn (gen_sub3_insn (t2, cop1, mask));
4824
4825 cop0 = t1;
4826 cop1 = t2;
4827 code = GT;
4828 }
4829 break;
4830
4831 case E_V64QImode:
4832 case E_V32HImode:
4833 case E_V32QImode:
4834 case E_V16HImode:
4835 case E_V16QImode:
4836 case E_V8QImode:
4837 case E_V4QImode:
4838 case E_V2QImode:
4839 case E_V8HImode:
4840 case E_V4HImode:
4841 case E_V2HImode:
4842 /* Perform a parallel unsigned saturating subtraction. */
4843 x = gen_reg_rtx (mode);
4844 emit_insn (gen_rtx_SET
4845 (x, gen_rtx_US_MINUS (mode, cop0, cop1)));
4846 cop0 = x;
4847 cop1 = CONST0_RTX (mode);
4848 code = EQ;
4849 *negate = !*negate;
4850 break;
4851
4852 default:
4853 gcc_unreachable ();
4854 }
4855 }
4856 }
4857
4858 if (*negate)
4859 std::swap (op_true, op_false);
4860
4861 if (GET_CODE (cop1) == CONST_VECTOR)
4862 cop1 = force_reg (mode, cop1);
4863
4864 /* Allow the comparison to be done in one mode, but the movcc to
4865 happen in another mode. */
4866 if (data_mode == mode)
4867 x = ix86_expand_sse_cmp (dest, code, cop0, cop1, op_true, op_false);
4868 else
4869 {
4870 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
4871 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
4872 op_true, op_false);
4873 if (GET_MODE (x) == mode)
4874 x = gen_lowpart (data_mode, x);
4875 }
4876
4877 return x;
4878 }
4879
4880 /* Expand integer vector comparison. */
4881
4882 bool
4883 ix86_expand_int_vec_cmp (rtx operands[])
4884 {
4885 rtx_code code = GET_CODE (operands[1]);
4886 bool negate = false;
4887 rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
4888 operands[3], NULL, NULL, &negate);
4889
4890 if (!cmp)
4891 return false;
4892
4893 if (negate)
4894 cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
4895 CONST0_RTX (GET_MODE (cmp)),
4896 NULL, NULL, &negate);
4897
4898 gcc_assert (!negate);
4899
4900 if (operands[0] != cmp)
4901 emit_move_insn (operands[0], cmp);
4902
4903 return true;
4904 }
4905
4906 /* Expand a floating-point vector conditional move; a vcond operation
4907 rather than a movcc operation. */
4908
4909 bool
4910 ix86_expand_fp_vcond (rtx operands[])
4911 {
4912 enum rtx_code code = GET_CODE (operands[3]);
4913 rtx cmp;
4914
4915 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
4916 &operands[4], &operands[5]);
4917 if (code == UNKNOWN)
4918 {
4919 rtx temp;
4920 switch (GET_CODE (operands[3]))
4921 {
4922 case LTGT:
4923 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
4924 operands[5], operands[0], operands[0]);
4925 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
4926 operands[5], operands[1], operands[2]);
4927 code = AND;
4928 break;
4929 case UNEQ:
4930 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
4931 operands[5], operands[0], operands[0]);
4932 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
4933 operands[5], operands[1], operands[2]);
4934 code = IOR;
4935 break;
4936 default:
4937 gcc_unreachable ();
4938 }
4939 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
4940 OPTAB_DIRECT);
4941 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
4942 return true;
4943 }
4944
4945 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
4946 operands[5], operands[1], operands[2]))
4947 return true;
4948
4949 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
4950 operands[1], operands[2]);
4951 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
4952 return true;
4953 }
4954
4955 /* Expand a signed/unsigned integral vector conditional move. */
4956
4957 bool
4958 ix86_expand_int_vcond (rtx operands[])
4959 {
4960 machine_mode data_mode = GET_MODE (operands[0]);
4961 machine_mode mode = GET_MODE (operands[4]);
4962 enum rtx_code code = GET_CODE (operands[3]);
4963 bool negate = false;
4964 rtx x, cop0, cop1;
4965
4966 cop0 = operands[4];
4967 cop1 = operands[5];
4968
4969 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
4970 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
4971 if ((code == LT || code == GE)
4972 && data_mode == mode
4973 && cop1 == CONST0_RTX (mode)
4974 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
4975 && GET_MODE_UNIT_SIZE (data_mode) > 1
4976 && GET_MODE_UNIT_SIZE (data_mode) <= 8
4977 && (GET_MODE_SIZE (data_mode) == 16
4978 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
4979 {
4980 rtx negop = operands[2 - (code == LT)];
4981 int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
4982 if (negop == CONST1_RTX (data_mode))
4983 {
4984 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
4985 operands[0], 1, OPTAB_DIRECT);
4986 if (res != operands[0])
4987 emit_move_insn (operands[0], res);
4988 return true;
4989 }
4990 else if (GET_MODE_INNER (data_mode) != DImode
4991 && vector_all_ones_operand (negop, data_mode))
4992 {
4993 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
4994 operands[0], 0, OPTAB_DIRECT);
4995 if (res != operands[0])
4996 emit_move_insn (operands[0], res);
4997 return true;
4998 }
4999 }
5000
5001 if (!nonimmediate_operand (cop1, mode))
5002 cop1 = force_reg (mode, cop1);
5003 if (!general_operand (operands[1], data_mode))
5004 operands[1] = force_reg (data_mode, operands[1]);
5005 if (!general_operand (operands[2], data_mode))
5006 operands[2] = force_reg (data_mode, operands[2]);
5007
5008 x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
5009 operands[1], operands[2], &negate);
5010
5011 if (!x)
5012 return false;
5013
5014 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
5015 operands[2-negate]);
5016 return true;
5017 }
5018
5019 static bool
5020 ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
5021 struct expand_vec_perm_d *d)
5022 {
5023 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
5024 expander, so args are either in d, or in op0, op1 etc. */
5025 machine_mode mode = GET_MODE (d ? d->op0 : op0);
5026 machine_mode maskmode = mode;
5027 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
5028
5029 switch (mode)
5030 {
5031 case E_V16QImode:
5032 if (TARGET_AVX512VL && TARGET_AVX512VBMI)
5033 gen = gen_avx512vl_vpermt2varv16qi3;
5034 break;
5035 case E_V32QImode:
5036 if (TARGET_AVX512VL && TARGET_AVX512VBMI)
5037 gen = gen_avx512vl_vpermt2varv32qi3;
5038 break;
5039 case E_V64QImode:
5040 if (TARGET_AVX512VBMI)
5041 gen = gen_avx512bw_vpermt2varv64qi3;
5042 break;
5043 case E_V8HImode:
5044 if (TARGET_AVX512VL && TARGET_AVX512BW)
5045 gen = gen_avx512vl_vpermt2varv8hi3;
5046 break;
5047 case E_V16HImode:
5048 if (TARGET_AVX512VL && TARGET_AVX512BW)
5049 gen = gen_avx512vl_vpermt2varv16hi3;
5050 break;
5051 case E_V32HImode:
5052 if (TARGET_AVX512BW)
5053 gen = gen_avx512bw_vpermt2varv32hi3;
5054 break;
5055 case E_V4SImode:
5056 if (TARGET_AVX512VL)
5057 gen = gen_avx512vl_vpermt2varv4si3;
5058 break;
5059 case E_V8SImode:
5060 if (TARGET_AVX512VL)
5061 gen = gen_avx512vl_vpermt2varv8si3;
5062 break;
5063 case E_V16SImode:
5064 if (TARGET_AVX512F)
5065 gen = gen_avx512f_vpermt2varv16si3;
5066 break;
5067 case E_V4SFmode:
5068 if (TARGET_AVX512VL)
5069 {
5070 gen = gen_avx512vl_vpermt2varv4sf3;
5071 maskmode = V4SImode;
5072 }
5073 break;
5074 case E_V8SFmode:
5075 if (TARGET_AVX512VL)
5076 {
5077 gen = gen_avx512vl_vpermt2varv8sf3;
5078 maskmode = V8SImode;
5079 }
5080 break;
5081 case E_V16SFmode:
5082 if (TARGET_AVX512F)
5083 {
5084 gen = gen_avx512f_vpermt2varv16sf3;
5085 maskmode = V16SImode;
5086 }
5087 break;
5088 case E_V2DImode:
5089 if (TARGET_AVX512VL)
5090 gen = gen_avx512vl_vpermt2varv2di3;
5091 break;
5092 case E_V4DImode:
5093 if (TARGET_AVX512VL)
5094 gen = gen_avx512vl_vpermt2varv4di3;
5095 break;
5096 case E_V8DImode:
5097 if (TARGET_AVX512F)
5098 gen = gen_avx512f_vpermt2varv8di3;
5099 break;
5100 case E_V2DFmode:
5101 if (TARGET_AVX512VL)
5102 {
5103 gen = gen_avx512vl_vpermt2varv2df3;
5104 maskmode = V2DImode;
5105 }
5106 break;
5107 case E_V4DFmode:
5108 if (TARGET_AVX512VL)
5109 {
5110 gen = gen_avx512vl_vpermt2varv4df3;
5111 maskmode = V4DImode;
5112 }
5113 break;
5114 case E_V8DFmode:
5115 if (TARGET_AVX512F)
5116 {
5117 gen = gen_avx512f_vpermt2varv8df3;
5118 maskmode = V8DImode;
5119 }
5120 break;
5121 default:
5122 break;
5123 }
5124
5125 if (gen == NULL)
5126 return false;
5127
5128 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
5129 expander, so args are either in d, or in op0, op1 etc. */
5130 if (d)
5131 {
5132 rtx vec[64];
5133 target = d->target;
5134 op0 = d->op0;
5135 op1 = d->op1;
5136 for (int i = 0; i < d->nelt; ++i)
5137 vec[i] = GEN_INT (d->perm[i]);
5138 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
5139 }
5140
5141 emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
5142 return true;
5143 }
5144
5145 /* Expand a variable vector permutation. */
5146
5147 void
5148 ix86_expand_vec_perm (rtx operands[])
5149 {
5150 rtx target = operands[0];
5151 rtx op0 = operands[1];
5152 rtx op1 = operands[2];
5153 rtx mask = operands[3];
5154 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
5155 machine_mode mode = GET_MODE (op0);
5156 machine_mode maskmode = GET_MODE (mask);
5157 int w, e, i;
5158 bool one_operand_shuffle = rtx_equal_p (op0, op1);
5159
5160 /* Number of elements in the vector. */
5161 w = GET_MODE_NUNITS (mode);
5162 e = GET_MODE_UNIT_SIZE (mode);
5163 gcc_assert (w <= 64);
5164
5165 /* For HF mode vector, convert it to HI using subreg. */
5166 if (GET_MODE_INNER (mode) == HFmode)
5167 {
5168 machine_mode orig_mode = mode;
5169 mode = mode_for_vector (HImode, w).require ();
5170 target = lowpart_subreg (mode, target, orig_mode);
5171 op0 = lowpart_subreg (mode, op0, orig_mode);
5172 op1 = lowpart_subreg (mode, op1, orig_mode);
5173 }
5174
5175 if (TARGET_AVX512F && one_operand_shuffle)
5176 {
5177 rtx (*gen) (rtx, rtx, rtx) = NULL;
5178 switch (mode)
5179 {
5180 case E_V16SImode:
5181 gen = gen_avx512f_permvarv16si;
5182 break;
5183 case E_V16SFmode:
5184 gen = gen_avx512f_permvarv16sf;
5185 break;
5186 case E_V8DImode:
5187 gen = gen_avx512f_permvarv8di;
5188 break;
5189 case E_V8DFmode:
5190 gen = gen_avx512f_permvarv8df;
5191 break;
5192 default:
5193 break;
5194 }
5195 if (gen != NULL)
5196 {
5197 emit_insn (gen (target, op0, mask));
5198 return;
5199 }
5200 }
5201
5202 if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
5203 return;
5204
5205 if (TARGET_AVX2)
5206 {
5207 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
5208 {
5209 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
5210 a constant shuffle operand. With a tiny bit of effort we can
5211 use VPERMD instead. A re-interpretation stall for V4DFmode is
5212 unfortunate but there's no avoiding it.
5213 Similarly, for V16HImode we don't have instructions for variable
5214 shuffling, while for V32QImode we can, after preparing suitable
5215 masks, use vpshufb; vpshufb; vpermq; vpor. */
5216
5217 if (mode == V16HImode)
5218 {
5219 maskmode = mode = V32QImode;
5220 w = 32;
5221 e = 1;
5222 }
5223 else
5224 {
5225 maskmode = mode = V8SImode;
5226 w = 8;
5227 e = 4;
5228 }
5229 t1 = gen_reg_rtx (maskmode);
5230
5231 /* Replicate the low bits of the V4DImode mask into V8SImode:
5232 mask = { A B C D }
5233 t1 = { A A B B C C D D }. */
5234 for (i = 0; i < w / 2; ++i)
5235 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
5236 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
5237 vt = force_reg (maskmode, vt);
5238 mask = gen_lowpart (maskmode, mask);
5239 if (maskmode == V8SImode)
5240 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
5241 else
5242 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
5243
5244 /* Multiply the shuffle indices by two. */
5245 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
5246 OPTAB_DIRECT);
5247
5248 /* Add one to the odd shuffle indices:
5249 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
5250 for (i = 0; i < w / 2; ++i)
5251 {
5252 vec[i * 2] = const0_rtx;
5253 vec[i * 2 + 1] = const1_rtx;
5254 }
5255 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
5256 vt = validize_mem (force_const_mem (maskmode, vt));
5257 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
5258 OPTAB_DIRECT);
5259
5260 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
5261 operands[3] = mask = t1;
5262 target = gen_reg_rtx (mode);
5263 op0 = gen_lowpart (mode, op0);
5264 op1 = gen_lowpart (mode, op1);
5265 }
5266
5267 switch (mode)
5268 {
5269 case E_V8SImode:
5270 /* The VPERMD and VPERMPS instructions already properly ignore
5271 the high bits of the shuffle elements. No need for us to
5272 perform an AND ourselves. */
5273 if (one_operand_shuffle)
5274 {
5275 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
5276 if (target != operands[0])
5277 emit_move_insn (operands[0],
5278 gen_lowpart (GET_MODE (operands[0]), target));
5279 }
5280 else
5281 {
5282 t1 = gen_reg_rtx (V8SImode);
5283 t2 = gen_reg_rtx (V8SImode);
5284 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
5285 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
5286 goto merge_two;
5287 }
5288 return;
5289
5290 case E_V8SFmode:
5291 mask = gen_lowpart (V8SImode, mask);
5292 if (one_operand_shuffle)
5293 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
5294 else
5295 {
5296 t1 = gen_reg_rtx (V8SFmode);
5297 t2 = gen_reg_rtx (V8SFmode);
5298 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
5299 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
5300 goto merge_two;
5301 }
5302 return;
5303
5304 case E_V4SImode:
5305 /* By combining the two 128-bit input vectors into one 256-bit
5306 input vector, we can use VPERMD and VPERMPS for the full
5307 two-operand shuffle. */
5308 t1 = gen_reg_rtx (V8SImode);
5309 t2 = gen_reg_rtx (V8SImode);
5310 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
5311 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
5312 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
5313 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
5314 return;
5315
5316 case E_V4SFmode:
5317 t1 = gen_reg_rtx (V8SFmode);
5318 t2 = gen_reg_rtx (V8SImode);
5319 mask = gen_lowpart (V4SImode, mask);
5320 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
5321 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
5322 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
5323 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
5324 return;
5325
5326 case E_V32QImode:
5327 t1 = gen_reg_rtx (V32QImode);
5328 t2 = gen_reg_rtx (V32QImode);
5329 t3 = gen_reg_rtx (V32QImode);
5330 vt2 = GEN_INT (-128);
5331 vt = gen_const_vec_duplicate (V32QImode, vt2);
5332 vt = force_reg (V32QImode, vt);
5333 for (i = 0; i < 32; i++)
5334 vec[i] = i < 16 ? vt2 : const0_rtx;
5335 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
5336 vt2 = force_reg (V32QImode, vt2);
5337 /* From mask create two adjusted masks, which contain the same
5338 bits as mask in the low 7 bits of each vector element.
5339 The first mask will have the most significant bit clear
5340 if it requests element from the same 128-bit lane
5341 and MSB set if it requests element from the other 128-bit lane.
5342 The second mask will have the opposite values of the MSB,
5343 and additionally will have its 128-bit lanes swapped.
5344 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
5345 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
5346 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
5347 stands for other 12 bytes. */
5348 /* The bit telling whether an element is from the same lane or the other
5349 lane is bit 4, so shift it up by 3 to the MSB position. */
5350 t5 = gen_reg_rtx (V4DImode);
5351 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
5352 GEN_INT (3)));
5353 /* Clear MSB bits from the mask just in case it had them set. */
5354 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
5355 /* After this t1 will have MSB set for elements from other lane. */
5356 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
5357 /* Clear bits other than MSB. */
5358 emit_insn (gen_andv32qi3 (t1, t1, vt));
5359 /* Or in the lower bits from mask into t3. */
5360 emit_insn (gen_iorv32qi3 (t3, t1, t2));
5361 /* And invert MSB bits in t1, so MSB is set for elements from the same
5362 lane. */
5363 emit_insn (gen_xorv32qi3 (t1, t1, vt));
5364 /* Swap 128-bit lanes in t3. */
5365 t6 = gen_reg_rtx (V4DImode);
5366 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
5367 const2_rtx, GEN_INT (3),
5368 const0_rtx, const1_rtx));
5369 /* And or in the lower bits from mask into t1. */
5370 emit_insn (gen_iorv32qi3 (t1, t1, t2));
5371 if (one_operand_shuffle)
5372 {
5373 /* Each of these shuffles will put 0s in places where
5374 element from the other 128-bit lane is needed, otherwise
5375 will shuffle in the requested value. */
5376 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
5377 gen_lowpart (V32QImode, t6)));
5378 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
5379 /* For t3 the 128-bit lanes are swapped again. */
5380 t7 = gen_reg_rtx (V4DImode);
5381 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
5382 const2_rtx, GEN_INT (3),
5383 const0_rtx, const1_rtx));
5384 /* ORing both together yields the result. */
5385 emit_insn (gen_iorv32qi3 (target, t1,
5386 gen_lowpart (V32QImode, t7)));
5387 if (target != operands[0])
5388 emit_move_insn (operands[0],
5389 gen_lowpart (GET_MODE (operands[0]), target));
5390 return;
5391 }
5392
5393 t4 = gen_reg_rtx (V32QImode);
5394 /* Similar to the one_operand_shuffle code above,
5395 just repeated twice, once for each operand. The merge_two:
5396 code below will merge the two results together. */
5397 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
5398 gen_lowpart (V32QImode, t6)));
5399 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
5400 gen_lowpart (V32QImode, t6)));
5401 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
5402 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
5403 t7 = gen_reg_rtx (V4DImode);
5404 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
5405 const2_rtx, GEN_INT (3),
5406 const0_rtx, const1_rtx));
5407 t8 = gen_reg_rtx (V4DImode);
5408 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
5409 const2_rtx, GEN_INT (3),
5410 const0_rtx, const1_rtx));
5411 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
5412 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
5413 t1 = t4;
5414 t2 = t3;
5415 goto merge_two;
5416
5417 default:
5418 gcc_assert (GET_MODE_SIZE (mode) <= 16);
5419 break;
5420 }
5421 }
5422
5423 if (TARGET_XOP)
5424 {
5425 /* The XOP VPPERM insn supports three inputs. By ignoring the
5426 one_operand_shuffle special case, we avoid creating another
5427 set of constant vectors in memory. */
5428 one_operand_shuffle = false;
5429
5430 /* mask = mask & {2*w-1, ...} */
5431 vt = GEN_INT (2*w - 1);
5432 }
5433 else
5434 {
5435 /* mask = mask & {w-1, ...} */
5436 vt = GEN_INT (w - 1);
5437 }
5438
5439 vt = gen_const_vec_duplicate (maskmode, vt);
5440 mask = expand_simple_binop (maskmode, AND, mask, vt,
5441 NULL_RTX, 0, OPTAB_DIRECT);
5442
5443 /* For non-QImode operations, convert the word permutation control
5444 into a byte permutation control. */
5445 if (mode != V16QImode)
5446 {
5447 mask = expand_simple_binop (maskmode, ASHIFT, mask,
5448 GEN_INT (exact_log2 (e)),
5449 NULL_RTX, 0, OPTAB_DIRECT);
5450
5451 /* Convert mask to vector of chars. */
5452 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
5453
5454 /* Replicate each of the input bytes into byte positions:
5455 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
5456 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
5457 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
5458 for (i = 0; i < 16; ++i)
5459 vec[i] = GEN_INT (i/e * e);
5460 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
5461 vt = validize_mem (force_const_mem (V16QImode, vt));
5462 if (TARGET_XOP)
5463 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
5464 else
5465 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
5466
5467 /* Convert it into the byte positions by doing
5468 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
5469 for (i = 0; i < 16; ++i)
5470 vec[i] = GEN_INT (i % e);
5471 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
5472 vt = validize_mem (force_const_mem (V16QImode, vt));
5473 emit_insn (gen_addv16qi3 (mask, mask, vt));
5474 }
5475
5476 /* The actual shuffle operations all operate on V16QImode. */
5477 op0 = gen_lowpart (V16QImode, op0);
5478 op1 = gen_lowpart (V16QImode, op1);
5479
5480 if (TARGET_XOP)
5481 {
5482 if (GET_MODE (target) != V16QImode)
5483 target = gen_reg_rtx (V16QImode);
5484 emit_insn (gen_xop_pperm (target, op0, op1, mask));
5485 if (target != operands[0])
5486 emit_move_insn (operands[0],
5487 gen_lowpart (GET_MODE (operands[0]), target));
5488 }
5489 else if (one_operand_shuffle)
5490 {
5491 if (GET_MODE (target) != V16QImode)
5492 target = gen_reg_rtx (V16QImode);
5493 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
5494 if (target != operands[0])
5495 emit_move_insn (operands[0],
5496 gen_lowpart (GET_MODE (operands[0]), target));
5497 }
5498 else
5499 {
5500 rtx xops[6];
5501 bool ok;
5502
5503 /* Shuffle the two input vectors independently. */
5504 t1 = gen_reg_rtx (V16QImode);
5505 t2 = gen_reg_rtx (V16QImode);
5506 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
5507 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
5508
5509 merge_two:
5510 /* Then merge them together. The key is whether any given control
5511 element contained a bit set that indicates the second word. */
5512 mask = operands[3];
5513 vt = GEN_INT (w);
5514 if (maskmode == V2DImode && !TARGET_SSE4_1)
5515 {
5516 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
5517 more shuffle to convert the V2DI input mask into a V4SI
5518 input mask, at which point the masking done by
5519 ix86_expand_int_vcond will work as desired. */
5520 rtx t3 = gen_reg_rtx (V4SImode);
5521 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
5522 const0_rtx, const0_rtx,
5523 const2_rtx, const2_rtx));
5524 mask = t3;
5525 maskmode = V4SImode;
5526 e = w = 4;
5527 }
5528
5529 vt = gen_const_vec_duplicate (maskmode, vt);
5530 vt = force_reg (maskmode, vt);
5531 mask = expand_simple_binop (maskmode, AND, mask, vt,
5532 NULL_RTX, 0, OPTAB_DIRECT);
5533
5534 if (GET_MODE (target) != mode)
5535 target = gen_reg_rtx (mode);
5536 xops[0] = target;
5537 xops[1] = gen_lowpart (mode, t2);
5538 xops[2] = gen_lowpart (mode, t1);
5539 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
5540 xops[4] = mask;
5541 xops[5] = vt;
5542 ok = ix86_expand_int_vcond (xops);
5543 gcc_assert (ok);
5544 if (target != operands[0])
5545 emit_move_insn (operands[0],
5546 gen_lowpart (GET_MODE (operands[0]), target));
5547 }
5548 }
5549
5550 /* Unpack SRC into DEST, which has the next wider integer vector type. UNSIGNED_P is
5551 true if we should do zero extension, else sign extension. HIGH_P is
5552 true if we want the N/2 high elements, else the low elements. */
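/* E.g. a V16QImode SRC is widened to a V8HImode DEST: with SSE4.1 this uses
pmovzxbw/pmovsxbw on the selected half, otherwise punpcklbw/punpckhbw
against zero (zero extension) or against a sign mask (sign extension). */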
5553
5554 void
5555 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
5556 {
5557 machine_mode imode = GET_MODE (src);
5558 rtx tmp;
5559
5560 if (TARGET_SSE4_1)
5561 {
5562 rtx (*unpack)(rtx, rtx);
5563 rtx (*extract)(rtx, rtx) = NULL;
5564 machine_mode halfmode = BLKmode;
5565
5566 switch (imode)
5567 {
5568 case E_V64QImode:
5569 if (unsigned_p)
5570 unpack = gen_avx512bw_zero_extendv32qiv32hi2;
5571 else
5572 unpack = gen_avx512bw_sign_extendv32qiv32hi2;
5573 halfmode = V32QImode;
5574 extract
5575 = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
5576 break;
5577 case E_V32QImode:
5578 if (unsigned_p)
5579 unpack = gen_avx2_zero_extendv16qiv16hi2;
5580 else
5581 unpack = gen_avx2_sign_extendv16qiv16hi2;
5582 halfmode = V16QImode;
5583 extract
5584 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
5585 break;
5586 case E_V32HImode:
5587 if (unsigned_p)
5588 unpack = gen_avx512f_zero_extendv16hiv16si2;
5589 else
5590 unpack = gen_avx512f_sign_extendv16hiv16si2;
5591 halfmode = V16HImode;
5592 extract
5593 = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
5594 break;
5595 case E_V16HImode:
5596 if (unsigned_p)
5597 unpack = gen_avx2_zero_extendv8hiv8si2;
5598 else
5599 unpack = gen_avx2_sign_extendv8hiv8si2;
5600 halfmode = V8HImode;
5601 extract
5602 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
5603 break;
5604 case E_V16SImode:
5605 if (unsigned_p)
5606 unpack = gen_avx512f_zero_extendv8siv8di2;
5607 else
5608 unpack = gen_avx512f_sign_extendv8siv8di2;
5609 halfmode = V8SImode;
5610 extract
5611 = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
5612 break;
5613 case E_V8SImode:
5614 if (unsigned_p)
5615 unpack = gen_avx2_zero_extendv4siv4di2;
5616 else
5617 unpack = gen_avx2_sign_extendv4siv4di2;
5618 halfmode = V4SImode;
5619 extract
5620 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
5621 break;
5622 case E_V16QImode:
5623 if (unsigned_p)
5624 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
5625 else
5626 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
5627 break;
5628 case E_V8HImode:
5629 if (unsigned_p)
5630 unpack = gen_sse4_1_zero_extendv4hiv4si2;
5631 else
5632 unpack = gen_sse4_1_sign_extendv4hiv4si2;
5633 break;
5634 case E_V4SImode:
5635 if (unsigned_p)
5636 unpack = gen_sse4_1_zero_extendv2siv2di2;
5637 else
5638 unpack = gen_sse4_1_sign_extendv2siv2di2;
5639 break;
5640 case E_V8QImode:
5641 if (unsigned_p)
5642 unpack = gen_sse4_1_zero_extendv4qiv4hi2;
5643 else
5644 unpack = gen_sse4_1_sign_extendv4qiv4hi2;
5645 break;
5646 case E_V4HImode:
5647 if (unsigned_p)
5648 unpack = gen_sse4_1_zero_extendv2hiv2si2;
5649 else
5650 unpack = gen_sse4_1_sign_extendv2hiv2si2;
5651 break;
5652 case E_V4QImode:
5653 if (unsigned_p)
5654 unpack = gen_sse4_1_zero_extendv2qiv2hi2;
5655 else
5656 unpack = gen_sse4_1_sign_extendv2qiv2hi2;
5657 break;
5658 default:
5659 gcc_unreachable ();
5660 }
5661
5662 if (GET_MODE_SIZE (imode) >= 32)
5663 {
5664 tmp = gen_reg_rtx (halfmode);
5665 emit_insn (extract (tmp, src));
5666 }
5667 else if (high_p)
5668 {
5669 switch (GET_MODE_SIZE (imode))
5670 {
5671 case 16:
5672 /* Shift higher 8 bytes to lower 8 bytes. */
5673 tmp = gen_reg_rtx (V1TImode);
5674 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
5675 GEN_INT (64)));
5676 break;
5677 case 8:
5678 /* Shift higher 4 bytes to lower 4 bytes. */
5679 tmp = gen_reg_rtx (V1DImode);
5680 emit_insn (gen_mmx_lshrv1di3 (tmp, gen_lowpart (V1DImode, src),
5681 GEN_INT (32)));
5682 break;
5683 case 4:
5684 /* Shift higher 2 bytes to lower 2 bytes. */
5685 tmp = gen_reg_rtx (V1SImode);
5686 emit_insn (gen_mmx_lshrv1si3 (tmp, gen_lowpart (V1SImode, src),
5687 GEN_INT (16)));
5688 break;
5689 default:
5690 gcc_unreachable ();
5691 }
5692
5693 tmp = gen_lowpart (imode, tmp);
5694 }
5695 else
5696 tmp = src;
5697
5698 emit_insn (unpack (dest, tmp));
5699 }
5700 else
5701 {
5702 rtx (*unpack)(rtx, rtx, rtx);
5703
5704 switch (imode)
5705 {
5706 case E_V16QImode:
5707 if (high_p)
5708 unpack = gen_vec_interleave_highv16qi;
5709 else
5710 unpack = gen_vec_interleave_lowv16qi;
5711 break;
5712 case E_V8HImode:
5713 if (high_p)
5714 unpack = gen_vec_interleave_highv8hi;
5715 else
5716 unpack = gen_vec_interleave_lowv8hi;
5717 break;
5718 case E_V4SImode:
5719 if (high_p)
5720 unpack = gen_vec_interleave_highv4si;
5721 else
5722 unpack = gen_vec_interleave_lowv4si;
5723 break;
5724 case E_V8QImode:
5725 if (high_p)
5726 unpack = gen_mmx_punpckhbw;
5727 else
5728 unpack = gen_mmx_punpcklbw;
5729 break;
5730 case E_V4HImode:
5731 if (high_p)
5732 unpack = gen_mmx_punpckhwd;
5733 else
5734 unpack = gen_mmx_punpcklwd;
5735 break;
5736 case E_V4QImode:
5737 if (high_p)
5738 unpack = gen_mmx_punpckhbw_low;
5739 else
5740 unpack = gen_mmx_punpcklbw_low;
5741 break;
5742 default:
5743 gcc_unreachable ();
5744 }
5745
5746 if (unsigned_p)
5747 tmp = force_reg (imode, CONST0_RTX (imode));
5748 else
5749 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
5750 src, pc_rtx, pc_rtx);
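/* TMP is the value interleaved with SRC: zero for zero extension, or
the per-element sign mask (0 > SRC) for sign extension. */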
5751
5752 rtx tmp2 = gen_reg_rtx (imode);
5753 emit_insn (unpack (tmp2, src, tmp));
5754 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
5755 }
5756 }
5757
5758 /* Return true if MEM is a constant pool reference containing a const_vector
5759 permutation index; if so, assign the index to PERM. */
5760 bool
5761 ix86_extract_perm_from_pool_constant (int* perm, rtx mem)
5762 {
5763 machine_mode mode = GET_MODE (mem);
5764 int nelt = GET_MODE_NUNITS (mode);
5765
5766 if (!INTEGRAL_MODE_P (mode))
5767 return false;
5768
5769 /* Needs to be constant pool. */
5770 if (!(MEM_P (mem))
5771 || !SYMBOL_REF_P (XEXP (mem, 0))
5772 || !CONSTANT_POOL_ADDRESS_P (XEXP (mem, 0)))
5773 return false;
5774
5775 rtx constant = get_pool_constant (XEXP (mem, 0));
5776
5777 if (GET_CODE (constant) != CONST_VECTOR)
5778 return false;
5779
5780 /* There could be some rtx like
5781 (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
5782 but with "*.LC1" referring to a V2DI constant vector. */
5783 if (GET_MODE (constant) != mode)
5784 {
5785 constant = simplify_subreg (mode, constant, GET_MODE (constant), 0);
5786
5787 if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR)
5788 return false;
5789 }
5790
5791 for (int i = 0; i != nelt; i++)
5792 perm[i] = UINTVAL (XVECEXP (constant, 0, i));
5793
5794 return true;
5795 }
5796
5797 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
5798 but works for floating point parameters and non-offsettable memories.
5799 For pushes, it returns just stack offsets; the values will be saved
5800 in the right order. Maximally four parts are generated. */
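/* E.g. without TARGET_64BIT a DFmode operand yields two SImode parts,
XFmode yields three and TFmode four; with TARGET_64BIT an XFmode or
TFmode operand yields a DImode low part plus one upper part. */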
5801
5802 static int
5803 ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
5804 {
5805 int size;
5806
5807 if (!TARGET_64BIT)
5808 size = mode == XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
5809 else
5810 size = (GET_MODE_SIZE (mode) + 4) / 8;
5811
5812 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
5813 gcc_assert (size >= 2 && size <= 4);
5814
5815 /* Optimize constant pool reference to immediates. This is used by fp
5816 moves, that force all constants to memory to allow combining. */
5817 if (MEM_P (operand) && MEM_READONLY_P (operand))
5818 operand = avoid_constant_pool_reference (operand);
5819
5820 if (MEM_P (operand) && !offsettable_memref_p (operand))
5821 {
5822 /* The only non-offsettable memories we handle are pushes. */
5823 int ok = push_operand (operand, VOIDmode);
5824
5825 gcc_assert (ok);
5826
5827 operand = copy_rtx (operand);
5828 PUT_MODE (operand, word_mode);
5829 parts[0] = parts[1] = parts[2] = parts[3] = operand;
5830 return size;
5831 }
5832
5833 if (GET_CODE (operand) == CONST_VECTOR)
5834 {
5835 scalar_int_mode imode = int_mode_for_mode (mode).require ();
5836 /* Caution: if we looked through a constant pool memory above,
5837 the operand may actually have a different mode now. That's
5838 ok, since we want to pun this all the way back to an integer. */
5839 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
5840 gcc_assert (operand != NULL);
5841 mode = imode;
5842 }
5843
5844 if (!TARGET_64BIT)
5845 {
5846 if (mode == DImode)
5847 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
5848 else
5849 {
5850 int i;
5851
5852 if (REG_P (operand))
5853 {
5854 gcc_assert (reload_completed);
5855 for (i = 0; i < size; i++)
5856 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
5857 }
5858 else if (offsettable_memref_p (operand))
5859 {
5860 operand = adjust_address (operand, SImode, 0);
5861 parts[0] = operand;
5862 for (i = 1; i < size; i++)
5863 parts[i] = adjust_address (operand, SImode, 4 * i);
5864 }
5865 else if (CONST_DOUBLE_P (operand))
5866 {
5867 const REAL_VALUE_TYPE *r;
5868 long l[4];
5869
5870 r = CONST_DOUBLE_REAL_VALUE (operand);
5871 switch (mode)
5872 {
5873 case E_TFmode:
5874 real_to_target (l, r, mode);
5875 parts[3] = gen_int_mode (l[3], SImode);
5876 parts[2] = gen_int_mode (l[2], SImode);
5877 break;
5878 case E_XFmode:
5879 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
5880 long double may not be 80-bit. */
5881 real_to_target (l, r, mode);
5882 parts[2] = gen_int_mode (l[2], SImode);
5883 break;
5884 case E_DFmode:
5885 REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
5886 break;
5887 default:
5888 gcc_unreachable ();
5889 }
5890 parts[1] = gen_int_mode (l[1], SImode);
5891 parts[0] = gen_int_mode (l[0], SImode);
5892 }
5893 else
5894 gcc_unreachable ();
5895 }
5896 }
5897 else
5898 {
5899 if (mode == TImode)
5900 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
5901 if (mode == XFmode || mode == TFmode)
5902 {
5903 machine_mode upper_mode = mode == XFmode ? SImode : DImode;
5904 if (REG_P (operand))
5905 {
5906 gcc_assert (reload_completed);
5907 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
5908 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
5909 }
5910 else if (offsettable_memref_p (operand))
5911 {
5912 operand = adjust_address (operand, DImode, 0);
5913 parts[0] = operand;
5914 parts[1] = adjust_address (operand, upper_mode, 8);
5915 }
5916 else if (CONST_DOUBLE_P (operand))
5917 {
5918 long l[4];
5919
5920 real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
5921
5922 /* real_to_target puts 32-bit pieces in each long. */
5923 parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
5924 | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
5925 << 32), DImode);
5926
5927 if (upper_mode == SImode)
5928 parts[1] = gen_int_mode (l[2], SImode);
5929 else
5930 parts[1]
5931 = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
5932 | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
5933 << 32), DImode);
5934 }
5935 else
5936 gcc_unreachable ();
5937 }
5938 }
5939
5940 return size;
5941 }
5942
5943 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
5944 All required insns are emitted by this function. Operands 2-4 contain
5945 the input values in the correct order; operands 5-7 contain the output
5946 values. */
5947
5948 void
5949 ix86_split_long_move (rtx operands[])
5950 {
5951 rtx part[2][4];
5952 int nparts, i, j;
5953 int push = 0;
5954 int collisions = 0;
5955 machine_mode mode = GET_MODE (operands[0]);
5956 bool collisionparts[4];
5957
5958 /* The DFmode expanders may ask us to move double.
5959 For a 64-bit target this is a single move. By hiding the fact
5960 here we simplify i386.md splitters. */
5961 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
5962 {
5963 /* Optimize constant pool reference to immediates. This is used by
5964 fp moves, that force all constants to memory to allow combining. */
5965
5966 if (MEM_P (operands[1])
5967 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
5968 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
5969 operands[1] = get_pool_constant (XEXP (operands[1], 0));
5970 if (push_operand (operands[0], VOIDmode))
5971 {
5972 operands[0] = copy_rtx (operands[0]);
5973 PUT_MODE (operands[0], word_mode);
5974 }
5975 else
5976 operands[0] = gen_lowpart (DImode, operands[0]);
5977 operands[1] = gen_lowpart (DImode, operands[1]);
5978 emit_move_insn (operands[0], operands[1]);
5979 return;
5980 }
5981
5982 /* The only non-offsettable memory we handle is push. */
5983 if (push_operand (operands[0], VOIDmode))
5984 push = 1;
5985 else
5986 gcc_assert (!MEM_P (operands[0])
5987 || offsettable_memref_p (operands[0]));
5988
5989 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
5990 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
5991
5992 /* When emitting push, take care for source operands on the stack. */
5993 if (push && MEM_P (operands[1])
5994 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
5995 {
5996 rtx src_base = XEXP (part[1][nparts - 1], 0);
5997
5998 /* Compensate for the stack decrement by 4. */
5999 if (!TARGET_64BIT && nparts == 3
6000 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
6001 src_base = plus_constant (Pmode, src_base, 4);
6002
6003 /* src_base refers to the stack pointer and is
6004 automatically decreased by emitted push. */
6005 for (i = 0; i < nparts; i++)
6006 part[1][i] = change_address (part[1][i],
6007 GET_MODE (part[1][i]), src_base);
6008 }
6009
6010 /* We need to do the copy in the right order in case an address
6011 register of the source overlaps the destination. */
6012 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
6013 {
6014 rtx tmp;
6015
6016 for (i = 0; i < nparts; i++)
6017 {
6018 collisionparts[i]
6019 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
6020 if (collisionparts[i])
6021 collisions++;
6022 }
6023
6024 /* Collision in the middle part can be handled by reordering. */
6025 if (collisions == 1 && nparts == 3 && collisionparts [1])
6026 {
6027 std::swap (part[0][1], part[0][2]);
6028 std::swap (part[1][1], part[1][2]);
6029 }
6030 else if (collisions == 1
6031 && nparts == 4
6032 && (collisionparts [1] || collisionparts [2]))
6033 {
6034 if (collisionparts [1])
6035 {
6036 std::swap (part[0][1], part[0][2]);
6037 std::swap (part[1][1], part[1][2]);
6038 }
6039 else
6040 {
6041 std::swap (part[0][2], part[0][3]);
6042 std::swap (part[1][2], part[1][3]);
6043 }
6044 }
6045
6046 /* If there are more collisions, we can't handle it by reordering.
6047 Do an lea to the last part and use only one colliding move. */
6048 else if (collisions > 1)
6049 {
6050 rtx base, addr;
6051
6052 collisions = 1;
6053
6054 base = part[0][nparts - 1];
6055
6056 /* Handle the case when the last part isn't valid for lea.
6057 Happens in 64-bit mode storing the 12-byte XFmode. */
6058 if (GET_MODE (base) != Pmode)
6059 base = gen_rtx_REG (Pmode, REGNO (base));
6060
6061 addr = XEXP (part[1][0], 0);
6062 if (TARGET_TLS_DIRECT_SEG_REFS)
6063 {
6064 struct ix86_address parts;
6065 int ok = ix86_decompose_address (addr, &parts);
6066 gcc_assert (ok);
6067 /* It is not valid to use %gs: or %fs: in lea. */
6068 gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
6069 }
6070 emit_insn (gen_rtx_SET (base, addr));
6071 part[1][0] = replace_equiv_address (part[1][0], base);
6072 for (i = 1; i < nparts; i++)
6073 {
6074 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
6075 part[1][i] = replace_equiv_address (part[1][i], tmp);
6076 }
6077 }
6078 }
6079
6080 if (push)
6081 {
6082 if (!TARGET_64BIT)
6083 {
6084 if (nparts == 3)
6085 {
6086 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
6087 emit_insn (gen_add2_insn (stack_pointer_rtx, GEN_INT (-4)));
6088 emit_move_insn (part[0][2], part[1][2]);
6089 }
6090 else if (nparts == 4)
6091 {
6092 emit_move_insn (part[0][3], part[1][3]);
6093 emit_move_insn (part[0][2], part[1][2]);
6094 }
6095 }
6096 else
6097 {
6098 /* In 64-bit mode we don't have a 32-bit push available. If this is
6099 a register, that is OK - we just use the larger counterpart. We
6100 also retype memory - these cases come from an attempt to avoid the
6101 REX prefix when moving the second half of a TFmode value. */
6102 if (GET_MODE (part[1][1]) == SImode)
6103 {
6104 switch (GET_CODE (part[1][1]))
6105 {
6106 case MEM:
6107 part[1][1] = adjust_address (part[1][1], DImode, 0);
6108 break;
6109
6110 case REG:
6111 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
6112 break;
6113
6114 default:
6115 gcc_unreachable ();
6116 }
6117
6118 if (GET_MODE (part[1][0]) == SImode)
6119 part[1][0] = part[1][1];
6120 }
6121 }
6122 emit_move_insn (part[0][1], part[1][1]);
6123 emit_move_insn (part[0][0], part[1][0]);
6124 return;
6125 }
6126
6127 /* Choose correct order to not overwrite the source before it is copied. */
6128 if ((REG_P (part[0][0])
6129 && REG_P (part[1][1])
6130 && (REGNO (part[0][0]) == REGNO (part[1][1])
6131 || (nparts == 3
6132 && REGNO (part[0][0]) == REGNO (part[1][2]))
6133 || (nparts == 4
6134 && REGNO (part[0][0]) == REGNO (part[1][3]))))
6135 || (collisions > 0
6136 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
6137 {
6138 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
6139 {
6140 operands[2 + i] = part[0][j];
6141 operands[6 + i] = part[1][j];
6142 }
6143 }
6144 else
6145 {
6146 for (i = 0; i < nparts; i++)
6147 {
6148 operands[2 + i] = part[0][i];
6149 operands[6 + i] = part[1][i];
6150 }
6151 }
6152
6153 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
6154 if (optimize_insn_for_size_p ())
6155 {
6156 for (j = 0; j < nparts - 1; j++)
6157 if (CONST_INT_P (operands[6 + j])
6158 && operands[6 + j] != const0_rtx
6159 && REG_P (operands[2 + j]))
6160 for (i = j; i < nparts - 1; i++)
6161 if (CONST_INT_P (operands[7 + i])
6162 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
6163 operands[7 + i] = operands[2 + j];
6164 }
6165
6166 for (i = 0; i < nparts; i++)
6167 emit_move_insn (operands[2 + i], operands[6 + i]);
6168
6169 return;
6170 }
6171
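/* Illustrative sketch (not part of the compiler): the effect of
   ix86_split_long_move for a DImode move on a 32-bit target is to copy
   the value as two SImode halves, ordered so that a destination register
   that also feeds the source address is overwritten last.  The helper
   below is hypothetical and guarded out of the build; compile it
   separately if desired.  */
#if 0
#include <stdint.h>

struct regpair { uint32_t lo, hi; };

void
move_di_via_halves (struct regpair *dst, const uint32_t *src_mem,
		    int dst_lo_feeds_src_addr)
{
  if (dst_lo_feeds_src_addr)
    {
      /* Copy the high half first; the low destination register, which
	 also forms part of the source address, is clobbered last.  */
      dst->hi = src_mem[1];
      dst->lo = src_mem[0];
    }
  else
    {
      dst->lo = src_mem[0];
      dst->hi = src_mem[1];
    }
}
#endif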
6172 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
6173 left shift by a constant, either using a single shift or
6174 a sequence of add instructions. */
6175
6176 static void
6177 ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
6178 {
6179 if (count == 1
6180 || (count * ix86_cost->add <= ix86_cost->shift_const
6181 && !optimize_insn_for_size_p ()))
6182 {
6183 while (count-- > 0)
6184 emit_insn (gen_add2_insn (operand, operand));
6185 }
6186 else
6187 {
6188 rtx (*insn)(rtx, rtx, rtx);
6189
6190 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
6191 emit_insn (insn (operand, operand, GEN_INT (count)));
6192 }
6193 }
6194
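/* Illustrative sketch (not part of the compiler): when COUNT
   self-additions are cheaper than one shift-by-constant, the expander
   above emits the additions instead; each "add reg, reg" doubles the
   value.  A standalone equivalence check, guarded out of the build.  */
#if 0
#include <assert.h>
#include <stdint.h>

int
main (void)
{
  uint32_t x = 0x12345678u;
  uint32_t by_adds = x;

  /* Three self-additions double the value three times, i.e. x << 3.  */
  for (int count = 3; count-- > 0;)
    by_adds += by_adds;

  assert (by_adds == x << 3);
  return 0;
}
#endif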
6195 void
6196 ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
6197 {
6198 rtx (*gen_ashl3)(rtx, rtx, rtx);
6199 rtx (*gen_shld)(rtx, rtx, rtx);
6200 int half_width = GET_MODE_BITSIZE (mode) >> 1;
6201 machine_mode half_mode;
6202
6203 rtx low[2], high[2];
6204 int count;
6205
6206 if (CONST_INT_P (operands[2]))
6207 {
6208 split_double_mode (mode, operands, 2, low, high);
6209 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
6210
6211 if (count >= half_width)
6212 {
6213 emit_move_insn (high[0], low[1]);
6214 ix86_expand_clear (low[0]);
6215
6216 if (count > half_width)
6217 ix86_expand_ashl_const (high[0], count - half_width, mode);
6218 }
6219 else
6220 {
6221 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
6222
6223 if (!rtx_equal_p (operands[0], operands[1]))
6224 emit_move_insn (operands[0], operands[1]);
6225
6226 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
6227 ix86_expand_ashl_const (low[0], count, mode);
6228 }
6229 return;
6230 }
6231
6232 split_double_mode (mode, operands, 1, low, high);
6233 half_mode = mode == DImode ? SImode : DImode;
6234
6235 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
6236
6237 if (operands[1] == const1_rtx)
6238 {
6239 /* Assuming we've chosen QImode-capable registers, then 1 << N
6240 can be done with two 32/64-bit shifts, no branches, no cmoves. */
6241 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
6242 {
6243 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
6244
6245 ix86_expand_clear (low[0]);
6246 ix86_expand_clear (high[0]);
6247 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
6248
6249 d = gen_lowpart (QImode, low[0]);
6250 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
6251 s = gen_rtx_EQ (QImode, flags, const0_rtx);
6252 emit_insn (gen_rtx_SET (d, s));
6253
6254 d = gen_lowpart (QImode, high[0]);
6255 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
6256 s = gen_rtx_NE (QImode, flags, const0_rtx);
6257 emit_insn (gen_rtx_SET (d, s));
6258 }
6259
6260 /* Otherwise, we can get the same results by manually performing
6261 a bit extract operation on bit 5/6, and then performing the two
6262 shifts. The two methods of getting 0/1 into low/high are exactly
6263 the same size. Avoiding the shift in the bit extract case helps
6264 pentium4 a bit; no one else seems to care much either way. */
6265 else
6266 {
6267 rtx (*gen_lshr3)(rtx, rtx, rtx);
6268 rtx (*gen_and3)(rtx, rtx, rtx);
6269 rtx (*gen_xor3)(rtx, rtx, rtx);
6270 HOST_WIDE_INT bits;
6271 rtx x;
6272
6273 if (mode == DImode)
6274 {
6275 gen_lshr3 = gen_lshrsi3;
6276 gen_and3 = gen_andsi3;
6277 gen_xor3 = gen_xorsi3;
6278 bits = 5;
6279 }
6280 else
6281 {
6282 gen_lshr3 = gen_lshrdi3;
6283 gen_and3 = gen_anddi3;
6284 gen_xor3 = gen_xordi3;
6285 bits = 6;
6286 }
6287
6288 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
6289 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
6290 else
6291 x = gen_lowpart (half_mode, operands[2]);
6292 emit_insn (gen_rtx_SET (high[0], x));
6293
6294 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
6295 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
6296 emit_move_insn (low[0], high[0]);
6297 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
6298 }
6299
6300 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
6301 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
6302 return;
6303 }
6304
6305 if (operands[1] == constm1_rtx)
6306 {
6307 /* For -1 << N, we can avoid the shld instruction, because we
6308 know that we're shifting 0...31/63 ones into a -1. */
6309 emit_move_insn (low[0], constm1_rtx);
6310 if (optimize_insn_for_size_p ())
6311 emit_move_insn (high[0], low[0]);
6312 else
6313 emit_move_insn (high[0], constm1_rtx);
6314 }
6315 else
6316 {
6317 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
6318
6319 if (!rtx_equal_p (operands[0], operands[1]))
6320 emit_move_insn (operands[0], operands[1]);
6321
6322 split_double_mode (mode, operands, 1, low, high);
6323 emit_insn (gen_shld (high[0], low[0], operands[2]));
6324 }
6325
6326 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
6327
6328 if (TARGET_CMOVE && scratch)
6329 {
6330 ix86_expand_clear (scratch);
6331 emit_insn (gen_x86_shift_adj_1
6332 (half_mode, high[0], low[0], operands[2], scratch));
6333 }
6334 else
6335 emit_insn (gen_x86_shift_adj_2 (half_mode, high[0], low[0], operands[2]));
6336 }
6337
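/* Illustrative sketch (not part of the compiler): the constant-count
   path of ix86_split_ashl, modelled on a 64-bit value kept in two
   32-bit halves; the shld step is written out explicitly.  The helper
   name is hypothetical and the block is guarded out of the build.  */
#if 0
#include <stdint.h>

void
shl64_split (uint32_t *lo, uint32_t *hi, unsigned int count)
{
  count &= 63;
  if (count >= 32)
    {
      /* The high half receives the low half; the low half becomes zero.  */
      *hi = *lo << (count - 32);
      *lo = 0;
    }
  else if (count > 0)
    {
      /* This is what the shld instruction computes.  */
      *hi = (*hi << count) | (*lo >> (32 - count));
      *lo <<= count;
    }
}
#endif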
6338 void
6339 ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
6340 {
6341 rtx (*gen_ashr3)(rtx, rtx, rtx)
6342 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
6343 rtx (*gen_shrd)(rtx, rtx, rtx);
6344 int half_width = GET_MODE_BITSIZE (mode) >> 1;
6345
6346 rtx low[2], high[2];
6347 int count;
6348
6349 if (CONST_INT_P (operands[2]))
6350 {
6351 split_double_mode (mode, operands, 2, low, high);
6352 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
6353
6354 if (count == GET_MODE_BITSIZE (mode) - 1)
6355 {
6356 emit_move_insn (high[0], high[1]);
6357 emit_insn (gen_ashr3 (high[0], high[0],
6358 GEN_INT (half_width - 1)));
6359 emit_move_insn (low[0], high[0]);
6360
6361 }
6362 else if (count >= half_width)
6363 {
6364 emit_move_insn (low[0], high[1]);
6365 emit_move_insn (high[0], low[0]);
6366 emit_insn (gen_ashr3 (high[0], high[0],
6367 GEN_INT (half_width - 1)));
6368
6369 if (count > half_width)
6370 emit_insn (gen_ashr3 (low[0], low[0],
6371 GEN_INT (count - half_width)));
6372 }
6373 else
6374 {
6375 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6376
6377 if (!rtx_equal_p (operands[0], operands[1]))
6378 emit_move_insn (operands[0], operands[1]);
6379
6380 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
6381 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
6382 }
6383 }
6384 else
6385 {
6386 machine_mode half_mode;
6387
6388 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6389
6390 if (!rtx_equal_p (operands[0], operands[1]))
6391 emit_move_insn (operands[0], operands[1]);
6392
6393 split_double_mode (mode, operands, 1, low, high);
6394 half_mode = mode == DImode ? SImode : DImode;
6395
6396 emit_insn (gen_shrd (low[0], high[0], operands[2]));
6397 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
6398
6399 if (TARGET_CMOVE && scratch)
6400 {
6401 emit_move_insn (scratch, high[0]);
6402 emit_insn (gen_ashr3 (scratch, scratch,
6403 GEN_INT (half_width - 1)));
6404 emit_insn (gen_x86_shift_adj_1
6405 (half_mode, low[0], high[0], operands[2], scratch));
6406 }
6407 else
6408 emit_insn (gen_x86_shift_adj_3
6409 (half_mode, low[0], high[0], operands[2]));
6410 }
6411 }
6412
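/* Illustrative sketch (not part of the compiler): the constant-count
   path of ix86_split_ashr, modelled on a 64-bit value kept in a 32-bit
   low half and a signed 32-bit high half.  Assumes signed right shifts
   are arithmetic, as GCC guarantees.  Hypothetical helper, guarded out
   of the build.  */
#if 0
#include <stdint.h>

void
ashr64_split (uint32_t *lo, int32_t *hi, unsigned int count)
{
  count &= 63;
  if (count >= 32)
    {
      /* The low half receives the shifted old high half; the high half
	 collapses to copies of the sign bit.  */
      *lo = (uint32_t) (*hi >> (count - 32));
      *hi >>= 31;
    }
  else if (count > 0)
    {
      /* This is what the shrd instruction computes.  */
      *lo = (*lo >> count) | ((uint32_t) *hi << (32 - count));
      *hi >>= count;
    }
}
#endif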
6413 void
6414 ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
6415 {
6416 rtx (*gen_lshr3)(rtx, rtx, rtx)
6417 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
6418 rtx (*gen_shrd)(rtx, rtx, rtx);
6419 int half_width = GET_MODE_BITSIZE (mode) >> 1;
6420
6421 rtx low[2], high[2];
6422 int count;
6423
6424 if (CONST_INT_P (operands[2]))
6425 {
6426 split_double_mode (mode, operands, 2, low, high);
6427 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
6428
6429 if (count >= half_width)
6430 {
6431 emit_move_insn (low[0], high[1]);
6432 ix86_expand_clear (high[0]);
6433
6434 if (count > half_width)
6435 emit_insn (gen_lshr3 (low[0], low[0],
6436 GEN_INT (count - half_width)));
6437 }
6438 else
6439 {
6440 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6441
6442 if (!rtx_equal_p (operands[0], operands[1]))
6443 emit_move_insn (operands[0], operands[1]);
6444
6445 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
6446 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
6447 }
6448 }
6449 else
6450 {
6451 machine_mode half_mode;
6452
6453 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6454
6455 if (!rtx_equal_p (operands[0], operands[1]))
6456 emit_move_insn (operands[0], operands[1]);
6457
6458 split_double_mode (mode, operands, 1, low, high);
6459 half_mode = mode == DImode ? SImode : DImode;
6460
6461 emit_insn (gen_shrd (low[0], high[0], operands[2]));
6462 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
6463
6464 if (TARGET_CMOVE && scratch)
6465 {
6466 ix86_expand_clear (scratch);
6467 emit_insn (gen_x86_shift_adj_1
6468 (half_mode, low[0], high[0], operands[2], scratch));
6469 }
6470 else
6471 emit_insn (gen_x86_shift_adj_2
6472 (half_mode, low[0], high[0], operands[2]));
6473 }
6474 }
6475
6476 /* Expand move of V1TI mode register X to a new TI mode register. */
6477 static rtx
6478 ix86_expand_v1ti_to_ti (rtx x)
6479 {
6480 rtx result = gen_reg_rtx (TImode);
6481 if (TARGET_SSE2)
6482 {
6483 rtx temp = force_reg (V2DImode, gen_lowpart (V2DImode, x));
6484 rtx lo = gen_lowpart (DImode, result);
6485 emit_insn (gen_vec_extractv2didi (lo, temp, const0_rtx));
6486 rtx hi = gen_highpart (DImode, result);
6487 emit_insn (gen_vec_extractv2didi (hi, temp, const1_rtx));
6488 }
6489 else
6490 emit_move_insn (result, gen_lowpart (TImode, x));
6491 return result;
6492 }
6493
6494 /* Expand move of TI mode register X to a new V1TI mode register. */
6495 static rtx
6496 ix86_expand_ti_to_v1ti (rtx x)
6497 {
6498 if (TARGET_SSE2)
6499 {
6500 rtx lo = gen_lowpart (DImode, x);
6501 rtx hi = gen_highpart (DImode, x);
6502 rtx tmp = gen_reg_rtx (V2DImode);
6503 emit_insn (gen_vec_concatv2di (tmp, lo, hi));
6504 return force_reg (V1TImode, gen_lowpart (V1TImode, tmp));
6505 }
6506
6507 return force_reg (V1TImode, gen_lowpart (V1TImode, x));
6508 }
6509
6510 /* Expand V1TI mode shift (of rtx_code CODE) by constant. */
6511 void
6512 ix86_expand_v1ti_shift (enum rtx_code code, rtx operands[])
6513 {
6514 rtx op1 = force_reg (V1TImode, operands[1]);
6515
6516 if (!CONST_INT_P (operands[2]))
6517 {
6518 rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
6519 rtx tmp2 = gen_reg_rtx (TImode);
6520 rtx (*shift) (rtx, rtx, rtx)
6521 = (code == ASHIFT) ? gen_ashlti3 : gen_lshrti3;
6522 emit_insn (shift (tmp2, tmp1, operands[2]));
6523 rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
6524 emit_move_insn (operands[0], tmp3);
6525 return;
6526 }
6527
6528 HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
6529
6530 if (bits == 0)
6531 {
6532 emit_move_insn (operands[0], op1);
6533 return;
6534 }
6535
6536 if ((bits & 7) == 0)
6537 {
6538 rtx tmp = gen_reg_rtx (V1TImode);
6539 if (code == ASHIFT)
6540 emit_insn (gen_sse2_ashlv1ti3 (tmp, op1, GEN_INT (bits)));
6541 else
6542 emit_insn (gen_sse2_lshrv1ti3 (tmp, op1, GEN_INT (bits)));
6543 emit_move_insn (operands[0], tmp);
6544 return;
6545 }
6546
6547 rtx tmp1 = gen_reg_rtx (V1TImode);
6548 if (code == ASHIFT)
6549 emit_insn (gen_sse2_ashlv1ti3 (tmp1, op1, GEN_INT (64)));
6550 else
6551 emit_insn (gen_sse2_lshrv1ti3 (tmp1, op1, GEN_INT (64)));
6552
6553 /* tmp2 is operands[1] shifted by 64, in V2DImode. */
6554 rtx tmp2 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
6555
6556 /* tmp3 will be the V2DImode result. */
6557 rtx tmp3 = gen_reg_rtx (V2DImode);
6558
6559 if (bits > 64)
6560 {
6561 if (code == ASHIFT)
6562 emit_insn (gen_ashlv2di3 (tmp3, tmp2, GEN_INT (bits - 64)));
6563 else
6564 emit_insn (gen_lshrv2di3 (tmp3, tmp2, GEN_INT (bits - 64)));
6565 }
6566 else
6567 {
6568 /* tmp4 is operands[1], in V2DImode. */
6569 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
6570
6571 rtx tmp5 = gen_reg_rtx (V2DImode);
6572 if (code == ASHIFT)
6573 emit_insn (gen_ashlv2di3 (tmp5, tmp4, GEN_INT (bits)));
6574 else
6575 emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
6576
6577 rtx tmp6 = gen_reg_rtx (V2DImode);
6578 if (code == ASHIFT)
6579 emit_insn (gen_lshrv2di3 (tmp6, tmp2, GEN_INT (64 - bits)));
6580 else
6581 emit_insn (gen_ashlv2di3 (tmp6, tmp2, GEN_INT (64 - bits)));
6582
6583 emit_insn (gen_iorv2di3 (tmp3, tmp5, tmp6));
6584 }
6585
6586 /* Convert the result back to V1TImode and store in operands[0]. */
6587 rtx tmp7 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
6588 emit_move_insn (operands[0], tmp7);
6589 }
6590
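/* Illustrative sketch (not part of the compiler): the general
   "0 < bits < 64, not a multiple of 8" case above, written with SSE2
   intrinsics for the left-shift direction.  SHIFT_BITS and the helper
   name are hypothetical; compile separately with -msse2.  */
#if 0
#include <emmintrin.h>

#define SHIFT_BITS 5	/* any constant with 0 < SHIFT_BITS < 64 */

__m128i
shl_v1ti_sketch (__m128i x)
{
  __m128i bytes = _mm_slli_si128 (x, 8);		   /* x << 64 (pslldq) */
  __m128i lanes = _mm_slli_epi64 (x, SHIFT_BITS);	   /* per-lane shift (psllq) */
  __m128i carry = _mm_srli_epi64 (bytes, 64 - SHIFT_BITS); /* bits crossing the lane */
  return _mm_or_si128 (lanes, carry);			   /* por */
}
#endif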
6591 /* Expand V1TI mode rotate (of rtx_code CODE) by constant. */
6592 void
6593 ix86_expand_v1ti_rotate (enum rtx_code code, rtx operands[])
6594 {
6595 rtx op1 = force_reg (V1TImode, operands[1]);
6596
6597 if (!CONST_INT_P (operands[2]))
6598 {
6599 rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
6600 rtx tmp2 = gen_reg_rtx (TImode);
6601 rtx (*rotate) (rtx, rtx, rtx)
6602 = (code == ROTATE) ? gen_rotlti3 : gen_rotrti3;
6603 emit_insn (rotate (tmp2, tmp1, operands[2]));
6604 rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
6605 emit_move_insn (operands[0], tmp3);
6606 return;
6607 }
6608
6609 HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
6610
6611 if (bits == 0)
6612 {
6613 emit_move_insn (operands[0], op1);
6614 return;
6615 }
6616
6617 if (code == ROTATERT)
6618 bits = 128 - bits;
6619
6620 if ((bits & 31) == 0)
6621 {
6622 rtx tmp2 = gen_reg_rtx (V4SImode);
6623 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6624 if (bits == 32)
6625 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x93)));
6626 else if (bits == 64)
6627 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x4e)));
6628 else
6629 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x39)));
6630 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp2));
6631 return;
6632 }
6633
6634 if ((bits & 7) == 0)
6635 {
6636 rtx tmp1 = gen_reg_rtx (V1TImode);
6637 rtx tmp2 = gen_reg_rtx (V1TImode);
6638 rtx tmp3 = gen_reg_rtx (V1TImode);
6639
6640 emit_insn (gen_sse2_ashlv1ti3 (tmp1, op1, GEN_INT (bits)));
6641 emit_insn (gen_sse2_lshrv1ti3 (tmp2, op1, GEN_INT (128 - bits)));
6642 emit_insn (gen_iorv1ti3 (tmp3, tmp1, tmp2));
6643 emit_move_insn (operands[0], tmp3);
6644 return;
6645 }
6646
6647 rtx op1_v4si = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6648
6649 rtx lobits;
6650 rtx hibits;
6651
6652 switch (bits >> 5)
6653 {
6654 case 0:
6655 lobits = op1_v4si;
6656 hibits = gen_reg_rtx (V4SImode);
6657 emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x93)));
6658 break;
6659
6660 case 1:
6661 lobits = gen_reg_rtx (V4SImode);
6662 hibits = gen_reg_rtx (V4SImode);
6663 emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x93)));
6664 emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x4e)));
6665 break;
6666
6667 case 2:
6668 lobits = gen_reg_rtx (V4SImode);
6669 hibits = gen_reg_rtx (V4SImode);
6670 emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x4e)));
6671 emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x39)));
6672 break;
6673
6674 default:
6675 lobits = gen_reg_rtx (V4SImode);
6676 emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x39)));
6677 hibits = op1_v4si;
6678 break;
6679 }
6680
6681 rtx tmp1 = gen_reg_rtx (V4SImode);
6682 rtx tmp2 = gen_reg_rtx (V4SImode);
6683 rtx tmp3 = gen_reg_rtx (V4SImode);
6684
6685 emit_insn (gen_ashlv4si3 (tmp1, lobits, GEN_INT (bits & 31)));
6686 emit_insn (gen_lshrv4si3 (tmp2, hibits, GEN_INT (32 - (bits & 31))));
6687 emit_insn (gen_iorv4si3 (tmp3, tmp1, tmp2));
6688
6689 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp3));
6690 }
6691
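/* Illustrative sketch (not part of the compiler): the "multiple of 32"
   rotate case above needs only one pshufd to permute the four 32-bit
   words.  Hypothetical helper; compile separately with -msse2.  */
#if 0
#include <emmintrin.h>

__m128i
rotl_v1ti_by_32 (__m128i x)
{
  /* 0x93 selects words 3,0,1,2, i.e. a 128-bit rotate left by 32 bits;
     0x4e would rotate by 64 and 0x39 by 96, matching the expander.  */
  return _mm_shuffle_epi32 (x, 0x93);
}
#endif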
6692 /* Expand V1TI mode ashiftrt by constant. */
6693 void
6694 ix86_expand_v1ti_ashiftrt (rtx operands[])
6695 {
6696 rtx op1 = force_reg (V1TImode, operands[1]);
6697
6698 if (!CONST_INT_P (operands[2]))
6699 {
6700 rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
6701 rtx tmp2 = gen_reg_rtx (TImode);
6702 emit_insn (gen_ashrti3 (tmp2, tmp1, operands[2]));
6703 rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
6704 emit_move_insn (operands[0], tmp3);
6705 return;
6706 }
6707
6708 HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
6709
6710 if (bits == 0)
6711 {
6712 emit_move_insn (operands[0], op1);
6713 return;
6714 }
6715
6716 if (bits == 127)
6717 {
6718 /* Two operations. */
6719 rtx tmp1 = force_reg(V4SImode, gen_lowpart (V4SImode, op1));
6720 rtx tmp2 = gen_reg_rtx (V4SImode);
6721 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6722
6723 rtx tmp3 = gen_reg_rtx (V4SImode);
6724 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6725
6726 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp3));
6727 return;
6728 }
6729
6730 if (bits == 64)
6731 {
6732 /* Three operations. */
6733 rtx tmp1 = force_reg(V4SImode, gen_lowpart (V4SImode, op1));
6734 rtx tmp2 = gen_reg_rtx (V4SImode);
6735 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6736
6737 rtx tmp3 = gen_reg_rtx (V4SImode);
6738 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6739
6740 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
6741 rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
6742 rtx tmp6 = gen_reg_rtx (V2DImode);
6743 emit_insn (gen_vec_interleave_highv2di (tmp6, tmp4, tmp5));
6744
6745 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
6746 return;
6747 }
6748
6749 if (bits == 96)
6750 {
6751 /* Three operations. */
6752 rtx tmp1 = force_reg(V4SImode, gen_lowpart (V4SImode, op1));
6753 rtx tmp2 = gen_reg_rtx (V4SImode);
6754 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (31)));
6755
6756 rtx tmp3 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
6757 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp2));
6758 rtx tmp5 = gen_reg_rtx (V2DImode);
6759 emit_insn (gen_vec_interleave_highv2di (tmp5, tmp3, tmp4));
6760
6761 rtx tmp6 = force_reg(V4SImode, gen_lowpart (V4SImode, tmp5));
6762 rtx tmp7 = gen_reg_rtx (V4SImode);
6763 emit_insn (gen_sse2_pshufd (tmp7, tmp6, GEN_INT (0xfd)));
6764
6765 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp7));
6766 return;
6767 }
6768
6769 if (bits >= 111)
6770 {
6771 /* Three operations. */
6772 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6773 rtx tmp2 = gen_reg_rtx (V4SImode);
6774 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits - 96)));
6775
6776 rtx tmp3 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
6777 rtx tmp4 = gen_reg_rtx (V8HImode);
6778 emit_insn (gen_sse2_pshufhw (tmp4, tmp3, GEN_INT (0xfe)));
6779
6780 rtx tmp5 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp4));
6781 rtx tmp6 = gen_reg_rtx (V4SImode);
6782 emit_insn (gen_sse2_pshufd (tmp6, tmp5, GEN_INT (0xfe)));
6783
6784 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
6785 return;
6786 }
6787
6788 if (TARGET_AVX2 || TARGET_SSE4_1)
6789 {
6790 /* Three operations. */
6791 if (bits == 32)
6792 {
6793 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6794 rtx tmp2 = gen_reg_rtx (V4SImode);
6795 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (31)));
6796
6797 rtx tmp3 = gen_reg_rtx (V1TImode);
6798 emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (32)));
6799
6800 if (TARGET_AVX2)
6801 {
6802 rtx tmp4 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp3));
6803 rtx tmp5 = gen_reg_rtx (V4SImode);
6804 emit_insn (gen_avx2_pblenddv4si (tmp5, tmp2, tmp4,
6805 GEN_INT (7)));
6806
6807 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp5));
6808 }
6809 else
6810 {
6811 rtx tmp4 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
6812 rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
6813 rtx tmp6 = gen_reg_rtx (V8HImode);
6814 emit_insn (gen_sse4_1_pblendw (tmp6, tmp4, tmp5,
6815 GEN_INT (0x3f)));
6816
6817 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
6818 }
6819 return;
6820 }
6821
6822 /* Three operations. */
6823 if (bits == 8 || bits == 16 || bits == 24)
6824 {
6825 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6826 rtx tmp2 = gen_reg_rtx (V4SImode);
6827 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
6828
6829 rtx tmp3 = gen_reg_rtx (V1TImode);
6830 emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (bits)));
6831
6832 if (TARGET_AVX2)
6833 {
6834 rtx tmp4 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp3));
6835 rtx tmp5 = gen_reg_rtx (V4SImode);
6836 emit_insn (gen_avx2_pblenddv4si (tmp5, tmp2, tmp4,
6837 GEN_INT (7)));
6838
6839 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp5));
6840 }
6841 else
6842 {
6843 rtx tmp4 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
6844 rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
6845 rtx tmp6 = gen_reg_rtx (V8HImode);
6846 emit_insn (gen_sse4_1_pblendw (tmp6, tmp4, tmp5,
6847 GEN_INT (0x3f)));
6848
6849 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
6850 }
6851 return;
6852 }
6853 }
6854
6855 if (bits > 96)
6856 {
6857 /* Four operations. */
6858 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6859 rtx tmp2 = gen_reg_rtx (V4SImode);
6860 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits - 96)));
6861
6862 rtx tmp3 = gen_reg_rtx (V4SImode);
6863 emit_insn (gen_ashrv4si3 (tmp3, tmp1, GEN_INT (31)));
6864
6865 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp2));
6866 rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
6867 rtx tmp6 = gen_reg_rtx (V2DImode);
6868 emit_insn (gen_vec_interleave_highv2di (tmp6, tmp4, tmp5));
6869
6870 rtx tmp7 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp6));
6871 rtx tmp8 = gen_reg_rtx (V4SImode);
6872 emit_insn (gen_sse2_pshufd (tmp8, tmp7, GEN_INT (0xfd)));
6873
6874 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp8));
6875 return;
6876 }
6877
6878 if (TARGET_SSE4_1 && (bits == 48 || bits == 80))
6879 {
6880 /* Four operations. */
6881 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6882 rtx tmp2 = gen_reg_rtx (V4SImode);
6883 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6884
6885 rtx tmp3 = gen_reg_rtx (V4SImode);
6886 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6887
6888 rtx tmp4 = gen_reg_rtx (V1TImode);
6889 emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (bits)));
6890
6891 rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
6892 rtx tmp6 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp4));
6893 rtx tmp7 = gen_reg_rtx (V8HImode);
6894 emit_insn (gen_sse4_1_pblendw (tmp7, tmp5, tmp6,
6895 GEN_INT (bits == 48 ? 0x1f : 0x07)));
6896
6897 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp7));
6898 return;
6899 }
6900
6901 if ((bits & 7) == 0)
6902 {
6903 /* Five operations. */
6904 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6905 rtx tmp2 = gen_reg_rtx (V4SImode);
6906 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6907
6908 rtx tmp3 = gen_reg_rtx (V4SImode);
6909 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6910
6911 rtx tmp4 = gen_reg_rtx (V1TImode);
6912 emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (bits)));
6913
6914 rtx tmp5 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
6915 rtx tmp6 = gen_reg_rtx (V1TImode);
6916 emit_insn (gen_sse2_ashlv1ti3 (tmp6, tmp5, GEN_INT (128 - bits)));
6917
6918 rtx tmp7 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
6919 rtx tmp8 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp6));
6920 rtx tmp9 = gen_reg_rtx (V2DImode);
6921 emit_insn (gen_iorv2di3 (tmp9, tmp7, tmp8));
6922
6923 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp9));
6924 return;
6925 }
6926
6927 if (TARGET_AVX2 && bits < 32)
6928 {
6929 /* Six operations. */
6930 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6931 rtx tmp2 = gen_reg_rtx (V4SImode);
6932 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
6933
6934 rtx tmp3 = gen_reg_rtx (V1TImode);
6935 emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (64)));
6936
6937 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
6938 rtx tmp5 = gen_reg_rtx (V2DImode);
6939 emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
6940
6941 rtx tmp6 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
6942 rtx tmp7 = gen_reg_rtx (V2DImode);
6943 emit_insn (gen_ashlv2di3 (tmp7, tmp6, GEN_INT (64 - bits)));
6944
6945 rtx tmp8 = gen_reg_rtx (V2DImode);
6946 emit_insn (gen_iorv2di3 (tmp8, tmp5, tmp7));
6947
6948 rtx tmp9 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp8));
6949 rtx tmp10 = gen_reg_rtx (V4SImode);
6950 emit_insn (gen_avx2_pblenddv4si (tmp10, tmp2, tmp9, GEN_INT (7)));
6951
6952 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp10));
6953 return;
6954 }
6955
6956 if (TARGET_SSE4_1 && bits < 15)
6957 {
6958 /* Six operations. */
6959 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6960 rtx tmp2 = gen_reg_rtx (V4SImode);
6961 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
6962
6963 rtx tmp3 = gen_reg_rtx (V1TImode);
6964 emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (64)));
6965
6966 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
6967 rtx tmp5 = gen_reg_rtx (V2DImode);
6968 emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
6969
6970 rtx tmp6 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
6971 rtx tmp7 = gen_reg_rtx (V2DImode);
6972 emit_insn (gen_ashlv2di3 (tmp7, tmp6, GEN_INT (64 - bits)));
6973
6974 rtx tmp8 = gen_reg_rtx (V2DImode);
6975 emit_insn (gen_iorv2di3 (tmp8, tmp5, tmp7));
6976
6977 rtx tmp9 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
6978 rtx tmp10 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp8));
6979 rtx tmp11 = gen_reg_rtx (V8HImode);
6980 emit_insn (gen_sse4_1_pblendw (tmp11, tmp9, tmp10, GEN_INT (0x3f)));
6981
6982 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp11));
6983 return;
6984 }
6985
6986 if (bits == 1)
6987 {
6988 /* Eight operations. */
6989 rtx tmp1 = gen_reg_rtx (V1TImode);
6990 emit_insn (gen_sse2_lshrv1ti3 (tmp1, op1, GEN_INT (64)));
6991
6992 rtx tmp2 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
6993 rtx tmp3 = gen_reg_rtx (V2DImode);
6994 emit_insn (gen_lshrv2di3 (tmp3, tmp2, GEN_INT (1)));
6995
6996 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
6997 rtx tmp5 = gen_reg_rtx (V2DImode);
6998 emit_insn (gen_ashlv2di3 (tmp5, tmp4, GEN_INT (63)));
6999
7000 rtx tmp6 = gen_reg_rtx (V2DImode);
7001 emit_insn (gen_iorv2di3 (tmp6, tmp3, tmp5));
7002
7003 rtx tmp7 = gen_reg_rtx (V2DImode);
7004 emit_insn (gen_lshrv2di3 (tmp7, tmp2, GEN_INT (63)));
7005
7006 rtx tmp8 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp7));
7007 rtx tmp9 = gen_reg_rtx (V4SImode);
7008 emit_insn (gen_sse2_pshufd (tmp9, tmp8, GEN_INT (0xbf)));
7009
7010 rtx tmp10 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp9));
7011 rtx tmp11 = gen_reg_rtx (V2DImode);
7012 emit_insn (gen_ashlv2di3 (tmp11, tmp10, GEN_INT (31)));
7013
7014 rtx tmp12 = gen_reg_rtx (V2DImode);
7015 emit_insn (gen_iorv2di3 (tmp12, tmp6, tmp11));
7016
7017 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp12));
7018 return;
7019 }
7020
7021 if (bits > 64)
7022 {
7023 /* Eight operations. */
7024 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
7025 rtx tmp2 = gen_reg_rtx (V4SImode);
7026 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
7027
7028 rtx tmp3 = gen_reg_rtx (V4SImode);
7029 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
7030
7031 rtx tmp4 = gen_reg_rtx (V1TImode);
7032 emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (64)));
7033
7034 rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
7035 rtx tmp6 = gen_reg_rtx (V2DImode);
7036 emit_insn (gen_lshrv2di3 (tmp6, tmp5, GEN_INT (bits - 64)));
7037
7038 rtx tmp7 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
7039 rtx tmp8 = gen_reg_rtx (V1TImode);
7040 emit_insn (gen_sse2_ashlv1ti3 (tmp8, tmp7, GEN_INT (64)));
7041
7042 rtx tmp9 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
7043 rtx tmp10 = gen_reg_rtx (V2DImode);
7044 emit_insn (gen_ashlv2di3 (tmp10, tmp9, GEN_INT (128 - bits)));
7045
7046 rtx tmp11 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp8));
7047 rtx tmp12 = gen_reg_rtx (V2DImode);
7048 emit_insn (gen_iorv2di3 (tmp12, tmp10, tmp11));
7049
7050 rtx tmp13 = gen_reg_rtx (V2DImode);
7051 emit_insn (gen_iorv2di3 (tmp13, tmp6, tmp12));
7052
7053 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp13));
7054 }
7055 else
7056 {
7057 /* Nine operations. */
7058 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
7059 rtx tmp2 = gen_reg_rtx (V4SImode);
7060 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
7061
7062 rtx tmp3 = gen_reg_rtx (V4SImode);
7063 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
7064
7065 rtx tmp4 = gen_reg_rtx (V1TImode);
7066 emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (64)));
7067
7068 rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
7069 rtx tmp6 = gen_reg_rtx (V2DImode);
7070 emit_insn (gen_lshrv2di3 (tmp6, tmp5, GEN_INT (bits)));
7071
7072 rtx tmp7 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
7073 rtx tmp8 = gen_reg_rtx (V2DImode);
7074 emit_insn (gen_ashlv2di3 (tmp8, tmp7, GEN_INT (64 - bits)));
7075
7076 rtx tmp9 = gen_reg_rtx (V2DImode);
7077 emit_insn (gen_iorv2di3 (tmp9, tmp6, tmp8));
7078
7079 rtx tmp10 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
7080 rtx tmp11 = gen_reg_rtx (V1TImode);
7081 emit_insn (gen_sse2_ashlv1ti3 (tmp11, tmp10, GEN_INT (64)));
7082
7083 rtx tmp12 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp11));
7084 rtx tmp13 = gen_reg_rtx (V2DImode);
7085 emit_insn (gen_ashlv2di3 (tmp13, tmp12, GEN_INT (64 - bits)));
7086
7087 rtx tmp14 = gen_reg_rtx (V2DImode);
7088 emit_insn (gen_iorv2di3 (tmp14, tmp9, tmp13));
7089
7090 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp14));
7091 }
7092 }
7093
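/* Illustrative sketch (not part of the compiler): the bits == 127 case
   above broadcasts the top 32-bit word and arithmetic-shifts every word
   by 31, leaving all-zeros or all-ones depending on bit 127.
   Hypothetical helper; compile separately with -msse2.  */
#if 0
#include <emmintrin.h>

__m128i
ashr_v1ti_by_127 (__m128i x)
{
  __m128i top = _mm_shuffle_epi32 (x, 0xff);	/* replicate word 3 (pshufd) */
  return _mm_srai_epi32 (top, 31);		/* sign fill (psrad) */
}
#endif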
7094 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
7095 DImode for constant loop counts. */
7096
7097 static machine_mode
7098 counter_mode (rtx count_exp)
7099 {
7100 if (GET_MODE (count_exp) != VOIDmode)
7101 return GET_MODE (count_exp);
7102 if (!CONST_INT_P (count_exp))
7103 return Pmode;
7104 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
7105 return DImode;
7106 return SImode;
7107 }
7108
7109 /* When ISSETMEM is FALSE, output a simple loop that copies memory from
7110 SRCPTR to DESTPTR in chunks of MODE, unrolled UNROLL times; the overall
7111 size is COUNT, specified in bytes. When ISSETMEM is TRUE, output the
7112 equivalent loop that sets memory to VALUE (assumed to be in MODE).
7113
7114 The size is rounded down to a whole number of chunks moved at once.
7115 SRCMEM and DESTMEM provide MEM rtx to feed proper aliasing info. */
7116
7117
7118 static void
7119 expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
7120 rtx destptr, rtx srcptr, rtx value,
7121 rtx count, machine_mode mode, int unroll,
7122 int expected_size, bool issetmem)
7123 {
7124 rtx_code_label *out_label, *top_label;
7125 rtx iter, tmp;
7126 machine_mode iter_mode = counter_mode (count);
7127 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
7128 rtx piece_size = GEN_INT (piece_size_n);
7129 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
7130 rtx size;
7131 int i;
7132
7133 top_label = gen_label_rtx ();
7134 out_label = gen_label_rtx ();
7135 iter = gen_reg_rtx (iter_mode);
7136
7137 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
7138 NULL, 1, OPTAB_DIRECT);
7139 /* Those two should combine. */
7140 if (piece_size == const1_rtx)
7141 {
7142 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
7143 true, out_label);
7144 predict_jump (REG_BR_PROB_BASE * 10 / 100);
7145 }
7146 emit_move_insn (iter, const0_rtx);
7147
7148 emit_label (top_label);
7149
7150 tmp = convert_modes (Pmode, iter_mode, iter, true);
7151
7152 /* This assert could be relaxed - in that case we'd need to compute
7153 the largest power of two that divides PIECE_SIZE_N and pass it to
7154 offset_address. */
7155 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
7156 destmem = offset_address (destmem, tmp, piece_size_n);
7157 destmem = adjust_address (destmem, mode, 0);
7158
7159 if (!issetmem)
7160 {
7161 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
7162 srcmem = adjust_address (srcmem, mode, 0);
7163
7164 /* When unrolling for chips that reorder memory reads and writes,
7165 we can save registers by using a single temporary.
7166 Also, using 4 temporaries is overkill in 32-bit mode. */
7167 if (!TARGET_64BIT && 0)
7168 {
7169 for (i = 0; i < unroll; i++)
7170 {
7171 if (i)
7172 {
7173 destmem = adjust_address (copy_rtx (destmem), mode,
7174 GET_MODE_SIZE (mode));
7175 srcmem = adjust_address (copy_rtx (srcmem), mode,
7176 GET_MODE_SIZE (mode));
7177 }
7178 emit_move_insn (destmem, srcmem);
7179 }
7180 }
7181 else
7182 {
7183 rtx tmpreg[4];
7184 gcc_assert (unroll <= 4);
7185 for (i = 0; i < unroll; i++)
7186 {
7187 tmpreg[i] = gen_reg_rtx (mode);
7188 if (i)
7189 srcmem = adjust_address (copy_rtx (srcmem), mode,
7190 GET_MODE_SIZE (mode));
7191 emit_move_insn (tmpreg[i], srcmem);
7192 }
7193 for (i = 0; i < unroll; i++)
7194 {
7195 if (i)
7196 destmem = adjust_address (copy_rtx (destmem), mode,
7197 GET_MODE_SIZE (mode));
7198 emit_move_insn (destmem, tmpreg[i]);
7199 }
7200 }
7201 }
7202 else
7203 for (i = 0; i < unroll; i++)
7204 {
7205 if (i)
7206 destmem = adjust_address (copy_rtx (destmem), mode,
7207 GET_MODE_SIZE (mode));
7208 emit_move_insn (destmem, value);
7209 }
7210
7211 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
7212 true, OPTAB_LIB_WIDEN);
7213 if (tmp != iter)
7214 emit_move_insn (iter, tmp);
7215
7216 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
7217 true, top_label);
7218 if (expected_size != -1)
7219 {
7220 expected_size /= GET_MODE_SIZE (mode) * unroll;
7221 if (expected_size == 0)
7222 predict_jump (0);
7223 else if (expected_size > REG_BR_PROB_BASE)
7224 predict_jump (REG_BR_PROB_BASE - 1);
7225 else
7226 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2)
7227 / expected_size);
7228 }
7229 else
7230 predict_jump (REG_BR_PROB_BASE * 80 / 100);
7231 iter = ix86_zero_extend_to_Pmode (iter);
7232 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
7233 true, OPTAB_LIB_WIDEN);
7234 if (tmp != destptr)
7235 emit_move_insn (destptr, tmp);
7236 if (!issetmem)
7237 {
7238 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
7239 true, OPTAB_LIB_WIDEN);
7240 if (tmp != srcptr)
7241 emit_move_insn (srcptr, tmp);
7242 }
7243 emit_label (out_label);
7244 }
7245
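/* Illustrative sketch (not part of the compiler): the shape of the loop
   emitted above for the copy case.  Whole chunks of
   GET_MODE_SIZE (MODE) * UNROLL bytes are moved, then both pointers are
   advanced by the number of bytes actually copied; the remainder is
   left to the epilogue code elsewhere in this file.  Hypothetical
   helper, guarded out of the build.  */
#if 0
#include <stddef.h>
#include <string.h>

void
copy_by_loop (unsigned char **dstp, const unsigned char **srcp,
	      size_t count, size_t chunk /* power of two: mode size * unroll */)
{
  size_t size = count & ~(chunk - 1);	/* round down to whole chunks */
  size_t iter = 0;

  while (iter < size)
    {
      memcpy (*dstp + iter, *srcp + iter, chunk);
      iter += chunk;
    }

  *dstp += iter;
  *srcp += iter;
}
#endif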
7246 /* Divide COUNTREG by SCALE. */
7247 static rtx
7248 scale_counter (rtx countreg, int scale)
7249 {
7250 rtx sc;
7251
7252 if (scale == 1)
7253 return countreg;
7254 if (CONST_INT_P (countreg))
7255 return GEN_INT (INTVAL (countreg) / scale);
7256 gcc_assert (REG_P (countreg));
7257
7258 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
7259 GEN_INT (exact_log2 (scale)),
7260 NULL, 1, OPTAB_DIRECT);
7261 return sc;
7262 }
7263
7264 /* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
7265 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
7266 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
7267 For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
7268 ORIG_VALUE is the original value passed to memset to fill the memory with.
7269 Other arguments have the same meaning as for the previous function. */
7270
7271 static void
7272 expand_set_or_cpymem_via_rep (rtx destmem, rtx srcmem,
7273 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
7274 rtx count,
7275 machine_mode mode, bool issetmem)
7276 {
7277 rtx destexp;
7278 rtx srcexp;
7279 rtx countreg;
7280 HOST_WIDE_INT rounded_count;
7281
7282 /* If possible, it is shorter to use rep movs.
7283 TODO: Maybe it is better to move this logic to decide_alg. */
7284 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
7285 && !TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
7286 && (!issetmem || orig_value == const0_rtx))
7287 mode = SImode;
7288
7289 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
7290 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
7291
7292 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
7293 GET_MODE_SIZE (mode)));
7294 if (mode != QImode)
7295 {
7296 destexp = gen_rtx_ASHIFT (Pmode, countreg,
7297 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
7298 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
7299 }
7300 else
7301 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
7302 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
7303 {
7304 rounded_count
7305 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
7306 destmem = shallow_copy_rtx (destmem);
7307 set_mem_size (destmem, rounded_count);
7308 }
7309 else if (MEM_SIZE_KNOWN_P (destmem))
7310 clear_mem_size (destmem);
7311
7312 if (issetmem)
7313 {
7314 value = force_reg (mode, gen_lowpart (mode, value));
7315 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
7316 }
7317 else
7318 {
7319 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
7320 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
7321 if (mode != QImode)
7322 {
7323 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
7324 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
7325 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
7326 }
7327 else
7328 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
7329 if (CONST_INT_P (count))
7330 {
7331 rounded_count
7332 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
7333 srcmem = shallow_copy_rtx (srcmem);
7334 set_mem_size (srcmem, rounded_count);
7335 }
7336 else
7337 {
7338 if (MEM_SIZE_KNOWN_P (srcmem))
7339 clear_mem_size (srcmem);
7340 }
7341 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
7342 destexp, srcexp));
7343 }
7344 }
7345
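/* Illustrative sketch (not part of the compiler): for the copy case the
   expansion above ultimately amounts to a "rep movs" with the scaled
   count in %ecx/%rcx, the destination in %edi/%rdi and the source in
   %esi/%rsi (for SImode the count is first divided by 4 and a 4-byte
   rep movs is used).  A GNU inline-asm rendering of the byte variant;
   the helper name is hypothetical and the block is guarded out of the
   build.  */
#if 0
#include <stddef.h>

void
rep_movsb (void *dst, const void *src, size_t count)
{
  __asm__ __volatile__ ("rep movsb"
			: "+D" (dst), "+S" (src), "+c" (count)
			:
			: "memory");
}
#endif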
7346 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
7347 DESTMEM.
7348 SRCMEM is passed by pointer so it can be updated on return.
7349 The return value is the updated DESTMEM. */
7350 static rtx
7351 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
7352 HOST_WIDE_INT size_to_move)
7353 {
7354 rtx dst = destmem, src = *srcmem, tempreg;
7355 enum insn_code code;
7356 machine_mode move_mode;
7357 int piece_size, i;
7358
7359 /* Find the widest mode in which we could perform moves.
7360 Start with the biggest power of 2 not exceeding SIZE_TO_MOVE and
7361 halve it until a move of that size is supported. */
7362 piece_size = 1 << floor_log2 (size_to_move);
7363 while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
7364 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
7365 {
7366 gcc_assert (piece_size > 1);
7367 piece_size >>= 1;
7368 }
7369
7370 /* Find the corresponding vector mode with the same size as MOVE_MODE.
7371 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
7372 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
7373 {
7374 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
7375 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
7376 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
7377 {
7378 move_mode = word_mode;
7379 piece_size = GET_MODE_SIZE (move_mode);
7380 code = optab_handler (mov_optab, move_mode);
7381 }
7382 }
7383 gcc_assert (code != CODE_FOR_nothing);
7384
7385 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
7386 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
7387
7388 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
7389 gcc_assert (size_to_move % piece_size == 0);
7390
7391 for (i = 0; i < size_to_move; i += piece_size)
7392 {
7393 /* We move from memory to memory, so we'll need to do it via
7394 a temporary register. */
7395 tempreg = gen_reg_rtx (move_mode);
7396 emit_insn (GEN_FCN (code) (tempreg, src));
7397 emit_insn (GEN_FCN (code) (dst, tempreg));
7398
7399 emit_move_insn (destptr,
7400 plus_constant (Pmode, copy_rtx (destptr), piece_size));
7401 emit_move_insn (srcptr,
7402 plus_constant (Pmode, copy_rtx (srcptr), piece_size));
7403
7404 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
7405 piece_size);
7406 src = adjust_automodify_address_nv (src, move_mode, srcptr,
7407 piece_size);
7408 }
7409
7410 /* Update DST and SRC rtx. */
7411 *srcmem = src;
7412 return dst;
7413 }
7414
7415 /* Helper function for the string operations below. Test whether VARIABLE
7416 is aligned to VALUE bytes. If it is, jump to the label. */
7417
7418 static rtx_code_label *
7419 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
7420 {
7421 rtx_code_label *label = gen_label_rtx ();
7422 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
7423 if (GET_MODE (variable) == DImode)
7424 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
7425 else
7426 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
7427 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
7428 1, label);
7429 if (epilogue)
7430 predict_jump (REG_BR_PROB_BASE * 50 / 100);
7431 else
7432 predict_jump (REG_BR_PROB_BASE * 90 / 100);
7433 return label;
7434 }
7435
7436
7437 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
7438
7439 static void
7440 expand_cpymem_epilogue (rtx destmem, rtx srcmem,
7441 rtx destptr, rtx srcptr, rtx count, int max_size)
7442 {
7443 rtx src, dest;
7444 if (CONST_INT_P (count))
7445 {
7446 HOST_WIDE_INT countval = INTVAL (count);
7447 HOST_WIDE_INT epilogue_size = countval % max_size;
7448 int i;
7449
7450 /* For now MAX_SIZE should be a power of 2. This assert could be
7451 relaxed, but it'll require a bit more complicated epilogue
7452 expanding. */
7453 gcc_assert ((max_size & (max_size - 1)) == 0);
7454 for (i = max_size; i >= 1; i >>= 1)
7455 {
7456 if (epilogue_size & i)
7457 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
7458 }
7459 return;
7460 }
7461 if (max_size > 8)
7462 {
7463 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
7464 count, 1, OPTAB_DIRECT);
7465 expand_set_or_cpymem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
7466 count, QImode, 1, 4, false);
7467 return;
7468 }
7469
7470 /* When string instructions are available, we can cheaply advance the
7471 dest and src pointers. Otherwise we save code size by maintaining an
7472 offset (zero is readily available from the preceding rep operation)
7473 and using x86 addressing modes. */
7474 if (TARGET_SINGLE_STRINGOP)
7475 {
7476 if (max_size > 4)
7477 {
7478 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
7479 src = change_address (srcmem, SImode, srcptr);
7480 dest = change_address (destmem, SImode, destptr);
7481 emit_insn (gen_strmov (destptr, dest, srcptr, src));
7482 emit_label (label);
7483 LABEL_NUSES (label) = 1;
7484 }
7485 if (max_size > 2)
7486 {
7487 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
7488 src = change_address (srcmem, HImode, srcptr);
7489 dest = change_address (destmem, HImode, destptr);
7490 emit_insn (gen_strmov (destptr, dest, srcptr, src));
7491 emit_label (label);
7492 LABEL_NUSES (label) = 1;
7493 }
7494 if (max_size > 1)
7495 {
7496 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
7497 src = change_address (srcmem, QImode, srcptr);
7498 dest = change_address (destmem, QImode, destptr);
7499 emit_insn (gen_strmov (destptr, dest, srcptr, src));
7500 emit_label (label);
7501 LABEL_NUSES (label) = 1;
7502 }
7503 }
7504 else
7505 {
7506 rtx offset = force_reg (Pmode, const0_rtx);
7507 rtx tmp;
7508
7509 if (max_size > 4)
7510 {
7511 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
7512 src = change_address (srcmem, SImode, srcptr);
7513 dest = change_address (destmem, SImode, destptr);
7514 emit_move_insn (dest, src);
7515 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
7516 true, OPTAB_LIB_WIDEN);
7517 if (tmp != offset)
7518 emit_move_insn (offset, tmp);
7519 emit_label (label);
7520 LABEL_NUSES (label) = 1;
7521 }
7522 if (max_size > 2)
7523 {
7524 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
7525 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
7526 src = change_address (srcmem, HImode, tmp);
7527 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
7528 dest = change_address (destmem, HImode, tmp);
7529 emit_move_insn (dest, src);
7530 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
7531 true, OPTAB_LIB_WIDEN);
7532 if (tmp != offset)
7533 emit_move_insn (offset, tmp);
7534 emit_label (label);
7535 LABEL_NUSES (label) = 1;
7536 }
7537 if (max_size > 1)
7538 {
7539 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
7540 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
7541 src = change_address (srcmem, QImode, tmp);
7542 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
7543 dest = change_address (destmem, QImode, tmp);
7544 emit_move_insn (dest, src);
7545 emit_label (label);
7546 LABEL_NUSES (label) = 1;
7547 }
7548 }
7549 }
7550
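/* Illustrative sketch (not part of the compiler): the constant-count
   branch above copies the residual count % max_size bytes as a
   descending sequence of power-of-two pieces, one piece per set bit.
   Hypothetical helper, guarded out of the build.  */
#if 0
#include <stddef.h>
#include <string.h>

void
copy_epilogue (unsigned char *dst, const unsigned char *src,
	       size_t count, size_t max_size /* power of two */)
{
  size_t left = count % max_size;

  for (size_t piece = max_size; piece >= 1; piece >>= 1)
    if (left & piece)
      {
	memcpy (dst, src, piece);
	dst += piece;
	src += piece;
      }
}
#endif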
7551 /* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
7552 with value PROMOTED_VAL.
7553 DESTPTR is advanced as the stores are emitted.
7554 The return value is the updated DESTMEM. */
7555 static rtx
7556 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
7557 HOST_WIDE_INT size_to_move)
7558 {
7559 rtx dst = destmem;
7560 enum insn_code code;
7561 machine_mode move_mode;
7562 int piece_size, i;
7563
7564 /* Determine the move mode: use the mode of PROMOTED_VAL (QImode if
7565 it has no mode) and narrow it when SIZE_TO_MOVE is smaller than
7566 that mode's size. */
7567 move_mode = GET_MODE (promoted_val);
7568 if (move_mode == VOIDmode)
7569 move_mode = QImode;
7570 if (size_to_move < GET_MODE_SIZE (move_mode))
7571 {
7572 unsigned int move_bits = size_to_move * BITS_PER_UNIT;
7573 move_mode = int_mode_for_size (move_bits, 0).require ();
7574 promoted_val = gen_lowpart (move_mode, promoted_val);
7575 }
7576 piece_size = GET_MODE_SIZE (move_mode);
7577 code = optab_handler (mov_optab, move_mode);
7578 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
7579
7580 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
7581
7582 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
7583 gcc_assert (size_to_move % piece_size == 0);
7584
7585 for (i = 0; i < size_to_move; i += piece_size)
7586 {
7587 if (piece_size <= GET_MODE_SIZE (word_mode))
7588 {
7589 emit_insn (gen_strset (destptr, dst, promoted_val));
7590 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
7591 piece_size);
7592 continue;
7593 }
7594
7595 emit_insn (GEN_FCN (code) (dst, promoted_val));
7596
7597 emit_move_insn (destptr,
7598 plus_constant (Pmode, copy_rtx (destptr), piece_size));
7599
7600 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
7601 piece_size);
7602 }
7603
7604 /* Update DST rtx. */
7605 return dst;
7606 }
7607 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
7608 static void
7609 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
7610 rtx count, int max_size)
7611 {
7612 count = expand_simple_binop (counter_mode (count), AND, count,
7613 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
7614 expand_set_or_cpymem_via_loop (destmem, NULL, destptr, NULL,
7615 gen_lowpart (QImode, value), count, QImode,
7616 1, max_size / 2, true);
7617 }
7618
7619 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
7620 static void
7621 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
7622 rtx count, int max_size)
7623 {
7624 rtx dest;
7625
7626 if (CONST_INT_P (count))
7627 {
7628 HOST_WIDE_INT countval = INTVAL (count);
7629 HOST_WIDE_INT epilogue_size = countval % max_size;
7630 int i;
7631
7632 /* For now MAX_SIZE should be a power of 2. This assert could be
7633 relaxed, but it'll require a bit more complicated epilogue
7634 expanding. */
7635 gcc_assert ((max_size & (max_size - 1)) == 0);
7636 for (i = max_size; i >= 1; i >>= 1)
7637 {
7638 if (epilogue_size & i)
7639 {
7640 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
7641 destmem = emit_memset (destmem, destptr, vec_value, i);
7642 else
7643 destmem = emit_memset (destmem, destptr, value, i);
7644 }
7645 }
7646 return;
7647 }
7648 if (max_size > 32)
7649 {
7650 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
7651 return;
7652 }
7653 if (max_size > 16)
7654 {
7655 rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
7656 if (TARGET_64BIT)
7657 {
7658 dest = change_address (destmem, DImode, destptr);
7659 emit_insn (gen_strset (destptr, dest, value));
7660 dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
7661 emit_insn (gen_strset (destptr, dest, value));
7662 }
7663 else
7664 {
7665 dest = change_address (destmem, SImode, destptr);
7666 emit_insn (gen_strset (destptr, dest, value));
7667 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
7668 emit_insn (gen_strset (destptr, dest, value));
7669 dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
7670 emit_insn (gen_strset (destptr, dest, value));
7671 dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
7672 emit_insn (gen_strset (destptr, dest, value));
7673 }
7674 emit_label (label);
7675 LABEL_NUSES (label) = 1;
7676 }
7677 if (max_size > 8)
7678 {
7679 rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
7680 if (TARGET_64BIT)
7681 {
7682 dest = change_address (destmem, DImode, destptr);
7683 emit_insn (gen_strset (destptr, dest, value));
7684 }
7685 else
7686 {
7687 dest = change_address (destmem, SImode, destptr);
7688 emit_insn (gen_strset (destptr, dest, value));
7689 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
7690 emit_insn (gen_strset (destptr, dest, value));
7691 }
7692 emit_label (label);
7693 LABEL_NUSES (label) = 1;
7694 }
7695 if (max_size > 4)
7696 {
7697 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
7698 dest = change_address (destmem, SImode, destptr);
7699 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
7700 emit_label (label);
7701 LABEL_NUSES (label) = 1;
7702 }
7703 if (max_size > 2)
7704 {
7705 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
7706 dest = change_address (destmem, HImode, destptr);
7707 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
7708 emit_label (label);
7709 LABEL_NUSES (label) = 1;
7710 }
7711 if (max_size > 1)
7712 {
7713 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
7714 dest = change_address (destmem, QImode, destptr);
7715 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
7716 emit_label (label);
7717 LABEL_NUSES (label) = 1;
7718 }
7719 }
7720
7721 /* Adjust COUNTER by the VALUE. */
7722 static void
7723 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
7724 {
7725 emit_insn (gen_add2_insn (countreg, GEN_INT (-value)));
7726 }
7727
7728 /* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
7729 DESTMEM to align it to DESIRED_ALIGNMENT. Original alignment is ALIGN.
7730 Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
7731 ignored.
7732 Return value is updated DESTMEM. */
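/* Illustrative sketch (assumed parameters, not generated literally):
   with ALIGN == 1 and DESIRED_ALIGNMENT == 8 the loop below emits, for
   a memset, roughly

     if (destptr & 1) { store 1 byte;  count -= 1; }
     if (destptr & 2) { store 2 bytes; count -= 2; }
     if (destptr & 4) { store 4 bytes; count -= 4; }

   so that DESTPTR is 8-byte aligned when the main loop starts.  */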
7733
7734 static rtx
7735 expand_set_or_cpymem_prologue (rtx destmem, rtx srcmem,
7736 rtx destptr, rtx srcptr, rtx value,
7737 rtx vec_value, rtx count, int align,
7738 int desired_alignment, bool issetmem)
7739 {
7740 int i;
7741 for (i = 1; i < desired_alignment; i <<= 1)
7742 {
7743 if (align <= i)
7744 {
7745 rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
7746 if (issetmem)
7747 {
7748 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
7749 destmem = emit_memset (destmem, destptr, vec_value, i);
7750 else
7751 destmem = emit_memset (destmem, destptr, value, i);
7752 }
7753 else
7754 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
7755 ix86_adjust_counter (count, i);
7756 emit_label (label);
7757 LABEL_NUSES (label) = 1;
7758 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
7759 }
7760 }
7761 return destmem;
7762 }
7763
7764 /* Test if COUNT&SIZE is nonzero and if so, expand a cpymem
7765 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
7766 and jump to DONE_LABEL. */
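/* Illustrative example (assumed SIZE, not generated literally): for
   SIZE == 4 and a runtime COUNT in the range 4..7 the code below emits
   two possibly overlapping 4-byte moves,

     copy 4 bytes from SRCPTR             to DESTPTR
     copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4

   which together cover the whole block for any COUNT in that range.  */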
7767 static void
7768 expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem,
7769 rtx destptr, rtx srcptr,
7770 rtx value, rtx vec_value,
7771 rtx count, int size,
7772 rtx done_label, bool issetmem)
7773 {
7774 rtx_code_label *label = ix86_expand_aligntest (count, size, false);
7775 machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
7776 rtx modesize;
7777 int n;
7778
7779 /* If we do not have a vector value to copy, we must reduce the size. */
7780 if (issetmem)
7781 {
7782 if (!vec_value)
7783 {
7784 if (GET_MODE (value) == VOIDmode && size > 8)
7785 mode = Pmode;
7786 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
7787 mode = GET_MODE (value);
7788 }
7789 else
7790 mode = GET_MODE (vec_value), value = vec_value;
7791 }
7792 else
7793 {
7794 /* Choose appropriate vector mode. */
7795 if (size >= 32)
7796 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
7797 else if (size >= 16)
7798 mode = TARGET_SSE ? V16QImode : DImode;
7799 srcmem = change_address (srcmem, mode, srcptr);
7800 }
7801 destmem = change_address (destmem, mode, destptr);
7802 modesize = GEN_INT (GET_MODE_SIZE (mode));
7803 gcc_assert (GET_MODE_SIZE (mode) <= size);
7804 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
7805 {
7806 if (issetmem)
7807 emit_move_insn (destmem, gen_lowpart (mode, value));
7808 else
7809 {
7810 emit_move_insn (destmem, srcmem);
7811 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
7812 }
7813 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
7814 }
7815
7816 destmem = offset_address (destmem, count, 1);
7817 destmem = offset_address (destmem, GEN_INT (-2 * size),
7818 GET_MODE_SIZE (mode));
7819 if (!issetmem)
7820 {
7821 srcmem = offset_address (srcmem, count, 1);
7822 srcmem = offset_address (srcmem, GEN_INT (-2 * size),
7823 GET_MODE_SIZE (mode));
7824 }
7825 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
7826 {
7827 if (issetmem)
7828 emit_move_insn (destmem, gen_lowpart (mode, value));
7829 else
7830 {
7831 emit_move_insn (destmem, srcmem);
7832 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
7833 }
7834 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
7835 }
7836 emit_jump_insn (gen_jump (done_label));
7837 emit_barrier ();
7838
7839 emit_label (label);
7840 LABEL_NUSES (label) = 1;
7841 }
7842
7843 /* Handle small memcpy (up to SIZE, which is supposed to be a small power of 2)
7844 and get ready for the main memcpy loop by copying the initial DESIRED_ALIGN-ALIGN
7845 bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT in a way we can
7846 proceed with a loop copying SIZE bytes at once. Do moves in MODE.
7847 DONE_LABEL is a label after the whole copying sequence. The label is created
7848 on demand if *DONE_LABEL is NULL.
7849 MIN_SIZE is the minimal size of the copied block. This value gets adjusted for new
7850 bounds after the initial copies.
7851
7852 DESTMEM/SRCMEM are memory expressions pointing to the copied block,
7853 DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicates whether
7854 we will dispatch to a library call for large blocks.
7855
7856 In pseudocode we do:
7857
7858 if (COUNT < SIZE)
7859 {
7860 Assume that SIZE is 4. Bigger sizes are handled analogously
7861 if (COUNT & 4)
7862 {
7863 copy 4 bytes from SRCPTR to DESTPTR
7864 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
7865 goto done_label
7866 }
7867 if (!COUNT)
7868 goto done_label;
7869 copy 1 byte from SRCPTR to DESTPTR
7870 if (COUNT & 2)
7871 {
7872 copy 2 bytes from SRCPTR to DESTPTR
7873 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
7874 }
7875 }
7876 else
7877 {
7878 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
7879 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
7880
7881 OLD_DESTPTR = DESTPTR;
7882 Align DESTPTR up to DESIRED_ALIGN
7883 SRCPTR += DESTPTR - OLD_DESTPTR
7884 COUNT -= DESTPTR - OLD_DESTPTR
7885 if (DYNAMIC_CHECK)
7886 Round COUNT down to multiple of SIZE
7887 << optional caller supplied zero size guard is here >>
7888 << optional caller supplied dynamic check is here >>
7889 << caller supplied main copy loop is here >>
7890 }
7891 done_label:
7892 */
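/* Illustrative instance of the sequence above (assumed numbers): with
   SIZE == 16, DESIRED_ALIGN == 16 and ALIGN == 1 the code below first
   issues one possibly misaligned 16-byte move at DESTPTR and one at
   DESTPTR + COUNT - 16, then rounds DESTPTR up to a 16-byte boundary
   and shrinks COUNT accordingly, so the caller's main loop only ever
   sees an aligned destination and a tail already covered by the edge
   moves.  */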
7893 static void
7894 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
7895 rtx *destptr, rtx *srcptr,
7896 machine_mode mode,
7897 rtx value, rtx vec_value,
7898 rtx *count,
7899 rtx_code_label **done_label,
7900 int size,
7901 int desired_align,
7902 int align,
7903 unsigned HOST_WIDE_INT *min_size,
7904 bool dynamic_check,
7905 bool issetmem)
7906 {
7907 rtx_code_label *loop_label = NULL, *label;
7908 int n;
7909 rtx modesize;
7910 int prolog_size = 0;
7911 rtx mode_value;
7912
7913 /* Choose the proper value to copy. */
7914 if (issetmem && VECTOR_MODE_P (mode))
7915 mode_value = vec_value;
7916 else
7917 mode_value = value;
7918 gcc_assert (GET_MODE_SIZE (mode) <= size);
7919
7920 /* See if block is big or small, handle small blocks. */
7921 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
7922 {
7923 int size2 = size;
7924 loop_label = gen_label_rtx ();
7925
7926 if (!*done_label)
7927 *done_label = gen_label_rtx ();
7928
7929 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
7930 1, loop_label);
7931 size2 >>= 1;
7932
7933 /* Handle sizes > 3. */
7934 for (;size2 > 2; size2 >>= 1)
7935 expand_small_cpymem_or_setmem (destmem, srcmem,
7936 *destptr, *srcptr,
7937 value, vec_value,
7938 *count,
7939 size2, *done_label, issetmem);
7940 /* Nothing to copy? Jump to DONE_LABEL if so. */
7941 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
7942 1, *done_label);
7943
7944 /* Do a byte copy. */
7945 destmem = change_address (destmem, QImode, *destptr);
7946 if (issetmem)
7947 emit_move_insn (destmem, gen_lowpart (QImode, value));
7948 else
7949 {
7950 srcmem = change_address (srcmem, QImode, *srcptr);
7951 emit_move_insn (destmem, srcmem);
7952 }
7953
7954 /* Handle sizes 2 and 3. */
7955 label = ix86_expand_aligntest (*count, 2, false);
7956 destmem = change_address (destmem, HImode, *destptr);
7957 destmem = offset_address (destmem, *count, 1);
7958 destmem = offset_address (destmem, GEN_INT (-2), 2);
7959 if (issetmem)
7960 emit_move_insn (destmem, gen_lowpart (HImode, value));
7961 else
7962 {
7963 srcmem = change_address (srcmem, HImode, *srcptr);
7964 srcmem = offset_address (srcmem, *count, 1);
7965 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
7966 emit_move_insn (destmem, srcmem);
7967 }
7968
7969 emit_label (label);
7970 LABEL_NUSES (label) = 1;
7971 emit_jump_insn (gen_jump (*done_label));
7972 emit_barrier ();
7973 }
7974 else
7975 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
7976 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
7977
7978 /* Start memcpy for COUNT >= SIZE. */
7979 if (loop_label)
7980 {
7981 emit_label (loop_label);
7982 LABEL_NUSES (loop_label) = 1;
7983 }
7984
7985 /* Copy first desired_align bytes. */
7986 if (!issetmem)
7987 srcmem = change_address (srcmem, mode, *srcptr);
7988 destmem = change_address (destmem, mode, *destptr);
7989 modesize = GEN_INT (GET_MODE_SIZE (mode));
7990 for (n = 0; prolog_size < desired_align - align; n++)
7991 {
7992 if (issetmem)
7993 emit_move_insn (destmem, mode_value);
7994 else
7995 {
7996 emit_move_insn (destmem, srcmem);
7997 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
7998 }
7999 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
8000 prolog_size += GET_MODE_SIZE (mode);
8001 }
8002
8003
8004 /* Copy last SIZE bytes. */
8005 destmem = offset_address (destmem, *count, 1);
8006 destmem = offset_address (destmem,
8007 GEN_INT (-size - prolog_size),
8008 1);
8009 if (issetmem)
8010 emit_move_insn (destmem, mode_value);
8011 else
8012 {
8013 srcmem = offset_address (srcmem, *count, 1);
8014 srcmem = offset_address (srcmem,
8015 GEN_INT (-size - prolog_size),
8016 1);
8017 emit_move_insn (destmem, srcmem);
8018 }
8019 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
8020 {
8021 destmem = offset_address (destmem, modesize, 1);
8022 if (issetmem)
8023 emit_move_insn (destmem, mode_value);
8024 else
8025 {
8026 srcmem = offset_address (srcmem, modesize, 1);
8027 emit_move_insn (destmem, srcmem);
8028 }
8029 }
8030
8031 /* Align destination. */
8032 if (desired_align > 1 && desired_align > align)
8033 {
8034 rtx saveddest = *destptr;
8035
8036 gcc_assert (desired_align <= size);
8037 /* Align destptr up, place it to new register. */
8038 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
8039 GEN_INT (prolog_size),
8040 NULL_RTX, 1, OPTAB_DIRECT);
8041 if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
8042 REG_POINTER (*destptr) = 1;
8043 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
8044 GEN_INT (-desired_align),
8045 *destptr, 1, OPTAB_DIRECT);
8046 /* See how many bytes we skipped. */
8047 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
8048 *destptr,
8049 saveddest, 1, OPTAB_DIRECT);
8050 /* Adjust srcptr and count. */
8051 if (!issetmem)
8052 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
8053 saveddest, *srcptr, 1, OPTAB_DIRECT);
8054 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
8055 saveddest, *count, 1, OPTAB_DIRECT);
8056 /* We copied at most size + prolog_size. */
8057 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
8058 *min_size
8059 = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
8060 else
8061 *min_size = 0;
8062
8063 /* Our loops always round down the block size, but for dispatching to the
8064 library call we need the precise value. */
8065 if (dynamic_check)
8066 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
8067 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
8068 }
8069 else
8070 {
8071 gcc_assert (prolog_size == 0);
8072 /* Decrease count, so we won't end up copying last word twice. */
8073 if (!CONST_INT_P (*count))
8074 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
8075 constm1_rtx, *count, 1, OPTAB_DIRECT);
8076 else
8077 *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
8078 (unsigned HOST_WIDE_INT)size));
8079 if (*min_size)
8080 *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
8081 }
8082 }
8083
8084
8085 /* This function is like the previous one, except here we know how many bytes
8086 need to be copied. That allows us to update alignment not only of DST, which
8087 is returned, but also of SRC, which is passed as a pointer for that
8088 reason. */
8089 static rtx
8090 expand_set_or_cpymem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
8091 rtx srcreg, rtx value, rtx vec_value,
8092 int desired_align, int align_bytes,
8093 bool issetmem)
8094 {
8095 rtx src = NULL;
8096 rtx orig_dst = dst;
8097 rtx orig_src = NULL;
8098 int piece_size = 1;
8099 int copied_bytes = 0;
8100
8101 if (!issetmem)
8102 {
8103 gcc_assert (srcp != NULL);
8104 src = *srcp;
8105 orig_src = src;
8106 }
8107
8108 for (piece_size = 1;
8109 piece_size <= desired_align && copied_bytes < align_bytes;
8110 piece_size <<= 1)
8111 {
8112 if (align_bytes & piece_size)
8113 {
8114 if (issetmem)
8115 {
8116 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
8117 dst = emit_memset (dst, destreg, vec_value, piece_size);
8118 else
8119 dst = emit_memset (dst, destreg, value, piece_size);
8120 }
8121 else
8122 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
8123 copied_bytes += piece_size;
8124 }
8125 }
8126 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
8127 set_mem_align (dst, desired_align * BITS_PER_UNIT);
8128 if (MEM_SIZE_KNOWN_P (orig_dst))
8129 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
8130
8131 if (!issetmem)
8132 {
8133 int src_align_bytes = get_mem_align_offset (src, desired_align
8134 * BITS_PER_UNIT);
8135 if (src_align_bytes >= 0)
8136 src_align_bytes = desired_align - src_align_bytes;
8137 if (src_align_bytes >= 0)
8138 {
8139 unsigned int src_align;
8140 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
8141 {
8142 if ((src_align_bytes & (src_align - 1))
8143 == (align_bytes & (src_align - 1)))
8144 break;
8145 }
8146 if (src_align > (unsigned int) desired_align)
8147 src_align = desired_align;
8148 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
8149 set_mem_align (src, src_align * BITS_PER_UNIT);
8150 }
8151 if (MEM_SIZE_KNOWN_P (orig_src))
8152 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
8153 *srcp = src;
8154 }
8155
8156 return dst;
8157 }
8158
8159 /* Return true if ALG can be used in current context.
8160 Assume we expand memset if MEMSET is true. */
8161 static bool
8162 alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
8163 {
8164 if (alg == no_stringop)
8165 return false;
8166 if (alg == vector_loop)
8167 return TARGET_SSE || TARGET_AVX;
8168 /* Algorithms using the rep prefix want at least edi and ecx;
8169 additionally, memset wants eax and memcpy wants esi. Don't
8170 consider such algorithms if the user has appropriated those
8171 registers for their own purposes, or if we have a non-default
8172 address space, since some string insns cannot override the segment. */
8173 if (alg == rep_prefix_1_byte
8174 || alg == rep_prefix_4_byte
8175 || alg == rep_prefix_8_byte)
8176 {
8177 if (have_as)
8178 return false;
8179 if (fixed_regs[CX_REG]
8180 || fixed_regs[DI_REG]
8181 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
8182 return false;
8183 }
8184 return true;
8185 }
8186
8187 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
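/* Illustrative example (hypothetical cost table, not taken from any real
   tuning; written schematically in the shape of struct stringop_algs):

     size[] = { {256, unrolled_loop, false},
                {8192, rep_prefix_8_byte, false},
                {-1, libcall, false} }

   With EXPECTED_SIZE == 1024 the loop below walks the entries and picks
   rep_prefix_8_byte, since 1024 <= 8192 and that algorithm is usable.  */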
8188 static enum stringop_alg
8189 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
8190 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
8191 bool memset, bool zero_memset, bool have_as,
8192 int *dynamic_check, bool *noalign, bool recur)
8193 {
8194 const struct stringop_algs *algs;
8195 bool optimize_for_speed;
8196 int max = 0;
8197 const struct processor_costs *cost;
8198 int i;
8199 bool any_alg_usable_p = false;
8200
8201 *noalign = false;
8202 *dynamic_check = -1;
8203
8204 /* Even if the string operation call is cold, we still might spend a lot
8205 of time processing large blocks. */
8206 if (optimize_function_for_size_p (cfun)
8207 || (optimize_insn_for_size_p ()
8208 && (max_size < 256
8209 || (expected_size != -1 && expected_size < 256))))
8210 optimize_for_speed = false;
8211 else
8212 optimize_for_speed = true;
8213
8214 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
8215 if (memset)
8216 algs = &cost->memset[TARGET_64BIT != 0];
8217 else
8218 algs = &cost->memcpy[TARGET_64BIT != 0];
8219
8220 /* See maximal size for user defined algorithm. */
8221 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
8222 {
8223 enum stringop_alg candidate = algs->size[i].alg;
8224 bool usable = alg_usable_p (candidate, memset, have_as);
8225 any_alg_usable_p |= usable;
8226
8227 if (candidate != libcall && candidate && usable)
8228 max = algs->size[i].max;
8229 }
8230
8231 /* If expected size is not known but max size is small enough
8232 so that the inline version is a win, set the expected size into
8233 the range. */
8234 if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
8235 && expected_size == -1)
8236 expected_size = min_size / 2 + max_size / 2;
8237
8238 /* If user specified the algorithm, honor it if possible. */
8239 if (ix86_stringop_alg != no_stringop
8240 && alg_usable_p (ix86_stringop_alg, memset, have_as))
8241 return ix86_stringop_alg;
8242 /* rep; movq or rep; movl is the smallest variant. */
8243 else if (!optimize_for_speed)
8244 {
8245 *noalign = true;
8246 if (!count || (count & 3) || (memset && !zero_memset))
8247 return alg_usable_p (rep_prefix_1_byte, memset, have_as)
8248 ? rep_prefix_1_byte : loop_1_byte;
8249 else
8250 return alg_usable_p (rep_prefix_4_byte, memset, have_as)
8251 ? rep_prefix_4_byte : loop;
8252 }
8253 /* Very tiny blocks are best handled via the loop, REP is expensive to
8254 setup. */
8255 else if (expected_size != -1 && expected_size < 4)
8256 return loop_1_byte;
8257 else if (expected_size != -1)
8258 {
8259 enum stringop_alg alg = libcall;
8260 bool alg_noalign = false;
8261 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
8262 {
8263 /* We get here if the algorithms that were not libcall-based
8264 were rep-prefix based and we are unable to use rep prefixes
8265 based on global register usage. Break out of the loop and
8266 use the heuristic below. */
8267 if (algs->size[i].max == 0)
8268 break;
8269 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
8270 {
8271 enum stringop_alg candidate = algs->size[i].alg;
8272
8273 if (candidate != libcall
8274 && alg_usable_p (candidate, memset, have_as))
8275 {
8276 alg = candidate;
8277 alg_noalign = algs->size[i].noalign;
8278 }
8279 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
8280 last non-libcall inline algorithm. */
8281 if (TARGET_INLINE_ALL_STRINGOPS)
8282 {
8283 /* When the current size is best to be copied by a libcall,
8284 but we are still forced to inline, run the heuristic below
8285 that will pick code for medium sized blocks. */
8286 if (alg != libcall)
8287 {
8288 *noalign = alg_noalign;
8289 return alg;
8290 }
8291 else if (!any_alg_usable_p)
8292 break;
8293 }
8294 else if (alg_usable_p (candidate, memset, have_as)
8295 && !(TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
8296 && candidate == rep_prefix_1_byte
8297 /* NB: If min_size != max_size, size is
8298 unknown. */
8299 && min_size != max_size))
8300 {
8301 *noalign = algs->size[i].noalign;
8302 return candidate;
8303 }
8304 }
8305 }
8306 }
8307 /* When asked to inline the call anyway, try to pick a meaningful choice.
8308 We look for the maximal size of block that is faster to copy by hand and
8309 take blocks of at most that size, guessing that the average size will
8310 be roughly half of the block.
8311
8312 If this turns out to be bad, we might simply specify the preferred
8313 choice in ix86_costs. */
8314 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
8315 && (algs->unknown_size == libcall
8316 || !alg_usable_p (algs->unknown_size, memset, have_as)))
8317 {
8318 enum stringop_alg alg;
8319 HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
8320
8321 /* If there aren't any usable algorithms or if recursing already,
8322 then recursing on smaller sizes or same size isn't going to
8323 find anything. Just return the simple byte-at-a-time copy loop. */
8324 if (!any_alg_usable_p || recur)
8325 {
8326 /* Pick something reasonable. */
8327 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
8328 *dynamic_check = 128;
8329 return loop_1_byte;
8330 }
8331 alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
8332 zero_memset, have_as, dynamic_check, noalign, true);
8333 gcc_assert (*dynamic_check == -1);
8334 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
8335 *dynamic_check = max;
8336 else
8337 gcc_assert (alg != libcall);
8338 return alg;
8339 }
8340 return (alg_usable_p (algs->unknown_size, memset, have_as)
8341 ? algs->unknown_size : libcall);
8342 }
8343
8344 /* Decide on alignment. We know that the operand is already aligned to ALIGN
8345 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
8346 static int
8347 decide_alignment (int align,
8348 enum stringop_alg alg,
8349 int expected_size,
8350 machine_mode move_mode)
8351 {
8352 int desired_align = 0;
8353
8354 gcc_assert (alg != no_stringop);
8355
8356 if (alg == libcall)
8357 return 0;
8358 if (move_mode == VOIDmode)
8359 return 0;
8360
8361 desired_align = GET_MODE_SIZE (move_mode);
8362 /* PentiumPro has special logic that triggers for 8-byte-aligned blocks,
8363 copying a whole cache line at once. */
8364 if (TARGET_CPU_P (PENTIUMPRO)
8365 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
8366 desired_align = 8;
8367
8368 if (optimize_size)
8369 desired_align = 1;
8370 if (desired_align < align)
8371 desired_align = align;
8372 if (expected_size != -1 && expected_size < 4)
8373 desired_align = align;
8374
8375 return desired_align;
8376 }
8377
8378
8379 /* Helper function for memset. For QImode value 0xXY produce
8380 0xXYXYXYXY of the width specified by MODE. This is essentially
8381 a * 0x01010101, but we can do slightly better than
8382 synth_mult by unwinding the sequence by hand on CPUs with
8383 a slow multiply. */
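/* Worked example (values chosen for illustration): for VAL == 0xAB and
   MODE == SImode the constant path below yields 0xABABABAB directly,
   while the non-constant shift/or path computes roughly

     reg  = zero_extend (val);   // reg = 0x000000AB
     reg |= reg << 8;            // reg = 0x0000ABAB
     reg |= reg << 16;           // reg = 0xABABABAB

   with one more 32-bit shift/or step for DImode.  */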
8384 static rtx
8385 promote_duplicated_reg (machine_mode mode, rtx val)
8386 {
8387 machine_mode valmode = GET_MODE (val);
8388 rtx tmp;
8389 int nops = mode == DImode ? 3 : 2;
8390
8391 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
8392 if (val == const0_rtx)
8393 return copy_to_mode_reg (mode, CONST0_RTX (mode));
8394 if (CONST_INT_P (val))
8395 {
8396 HOST_WIDE_INT v = INTVAL (val) & 255;
8397
8398 v |= v << 8;
8399 v |= v << 16;
8400 if (mode == DImode)
8401 v |= (v << 16) << 16;
8402 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
8403 }
8404
8405 if (valmode == VOIDmode)
8406 valmode = QImode;
8407 if (valmode != QImode)
8408 val = gen_lowpart (QImode, val);
8409 if (mode == QImode)
8410 return val;
8411 if (!TARGET_PARTIAL_REG_STALL)
8412 nops--;
8413 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
8414 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
8415 <= (ix86_cost->shift_const + ix86_cost->add) * nops
8416 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
8417 {
8418 rtx reg = convert_modes (mode, QImode, val, true);
8419 tmp = promote_duplicated_reg (mode, const1_rtx);
8420 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
8421 OPTAB_DIRECT);
8422 }
8423 else
8424 {
8425 rtx reg = convert_modes (mode, QImode, val, true);
8426
8427 if (!TARGET_PARTIAL_REG_STALL)
8428 emit_insn (gen_insv_1 (mode, reg, reg));
8429 else
8430 {
8431 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
8432 NULL, 1, OPTAB_DIRECT);
8433 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1,
8434 OPTAB_DIRECT);
8435 }
8436 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
8437 NULL, 1, OPTAB_DIRECT);
8438 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
8439 if (mode == SImode)
8440 return reg;
8441 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
8442 NULL, 1, OPTAB_DIRECT);
8443 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
8444 return reg;
8445 }
8446 }
8447
8448 /* Duplicate value VAL using promote_duplicated_reg into the maximal size that
8449 will be needed by the main loop copying SIZE_NEEDED chunks and by the prologue
8450 getting alignment from ALIGN to DESIRED_ALIGN. */
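/* Illustrative example (assumed parameters): on a 64-bit target with
   SIZE_NEEDED == 8 the value is promoted to DImode (the byte repeated in
   all eight positions); with SIZE_NEEDED == 4 and DESIRED_ALIGN == ALIGN
   it is promoted only to SImode, since no wider stores will be emitted.  */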
8451 static rtx
8452 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
8453 int align)
8454 {
8455 rtx promoted_val;
8456
8457 if (TARGET_64BIT
8458 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
8459 promoted_val = promote_duplicated_reg (DImode, val);
8460 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
8461 promoted_val = promote_duplicated_reg (SImode, val);
8462 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
8463 promoted_val = promote_duplicated_reg (HImode, val);
8464 else
8465 promoted_val = val;
8466
8467 return promoted_val;
8468 }
8469
8470 /* Copy the address to a Pmode register. This is used for x32 to
8471 truncate DImode TLS address to a SImode register. */
8472
8473 static rtx
8474 ix86_copy_addr_to_reg (rtx addr)
8475 {
8476 rtx reg;
8477 if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
8478 {
8479 reg = copy_addr_to_reg (addr);
8480 REG_POINTER (reg) = 1;
8481 return reg;
8482 }
8483 else
8484 {
8485 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
8486 reg = copy_to_mode_reg (DImode, addr);
8487 REG_POINTER (reg) = 1;
8488 return gen_rtx_SUBREG (SImode, reg, 0);
8489 }
8490 }
8491
8492 /* Expand string move (memcpy) or store (memset) operation. Use i386 string
8493 operations when profitable. The code depends upon architecture, block size
8494 and alignment, but always has one of the following overall structures:
8495
8496 Aligned move sequence:
8497
8498 1) Prologue guard: Conditional that jumps up to epilogues for small
8499 blocks that can be handled by the epilogue alone. This is faster
8500 but also needed for correctness, since the prologue assumes the block
8501 is larger than the desired alignment.
8502
8503 Optional dynamic check for size and libcall for large
8504 blocks is emitted here too, with -minline-stringops-dynamically.
8505
8506 2) Prologue: copy first few bytes in order to get destination
8507 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
8508 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
8509 copied. We emit either a jump tree on power of two sized
8510 blocks, or a byte loop.
8511
8512 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
8513 with specified algorithm.
8514
8515 4) Epilogue: code copying tail of the block that is too small to be
8516 handled by main body (or up to size guarded by prologue guard).
8517
8518 Misaligned move sequence
8519
8520 1) misaligned move prologue/epilogue containing:
8521 a) Prologue handling small memory blocks and jumping to done_label
8522 (skipped if blocks are known to be large enough)
8523 b) Single move copying the first DESIRED_ALIGN-ALIGN bytes if alignment is
8524 needed by single possibly misaligned move
8525 (skipped if alignment is not needed)
8526 c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
8527
8528 2) Zero size guard dispatching to done_label, if needed
8529
8530 3) Dispatch to a library call, if needed.
8531
8532 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
8533 with the specified algorithm. */
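/* Illustrative shape of the aligned sequence (schematic only; the exact
   code depends on the chosen algorithm and target):

     if (count < epilogue_size_needed) goto epilogue;  // 1) guard
     <copy/set up to desired_align - align bytes>       // 2) prologue
     <main loop, size_needed bytes per iteration>       // 3) main body
   epilogue:
     <handle count & (epilogue_size_needed - 1) bytes>  // 4) epilogue
*/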
8534 bool
8535 ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
8536 rtx align_exp, rtx expected_align_exp,
8537 rtx expected_size_exp, rtx min_size_exp,
8538 rtx max_size_exp, rtx probable_max_size_exp,
8539 bool issetmem)
8540 {
8541 rtx destreg;
8542 rtx srcreg = NULL;
8543 rtx_code_label *label = NULL;
8544 rtx tmp;
8545 rtx_code_label *jump_around_label = NULL;
8546 HOST_WIDE_INT align = 1;
8547 unsigned HOST_WIDE_INT count = 0;
8548 HOST_WIDE_INT expected_size = -1;
8549 int size_needed = 0, epilogue_size_needed;
8550 int desired_align = 0, align_bytes = 0;
8551 enum stringop_alg alg;
8552 rtx promoted_val = NULL;
8553 rtx vec_promoted_val = NULL;
8554 bool force_loopy_epilogue = false;
8555 int dynamic_check;
8556 bool need_zero_guard = false;
8557 bool noalign;
8558 machine_mode move_mode = VOIDmode;
8559 machine_mode wider_mode;
8560 int unroll_factor = 1;
8561 /* TODO: Once value ranges are available, fill in proper data. */
8562 unsigned HOST_WIDE_INT min_size = 0;
8563 unsigned HOST_WIDE_INT max_size = -1;
8564 unsigned HOST_WIDE_INT probable_max_size = -1;
8565 bool misaligned_prologue_used = false;
8566 bool have_as;
8567
8568 if (CONST_INT_P (align_exp))
8569 align = INTVAL (align_exp);
8570 /* i386 can do misaligned access at a reasonably increased cost. */
8571 if (CONST_INT_P (expected_align_exp)
8572 && INTVAL (expected_align_exp) > align)
8573 align = INTVAL (expected_align_exp);
8574 /* ALIGN is the minimum of destination and source alignment, but we care here
8575 just about destination alignment. */
8576 else if (!issetmem
8577 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
8578 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
8579
8580 if (CONST_INT_P (count_exp))
8581 {
8582 min_size = max_size = probable_max_size = count = expected_size
8583 = INTVAL (count_exp);
8584 /* When COUNT is 0, there is nothing to do. */
8585 if (!count)
8586 return true;
8587 }
8588 else
8589 {
8590 if (min_size_exp)
8591 min_size = INTVAL (min_size_exp);
8592 if (max_size_exp)
8593 max_size = INTVAL (max_size_exp);
8594 if (probable_max_size_exp)
8595 probable_max_size = INTVAL (probable_max_size_exp);
8596 if (CONST_INT_P (expected_size_exp))
8597 expected_size = INTVAL (expected_size_exp);
8598 }
8599
8600 /* Make sure we don't need to care about overflow later on. */
8601 if (count > (HOST_WIDE_INT_1U << 30))
8602 return false;
8603
8604 have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
8605 if (!issetmem)
8606 have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
8607
8608 /* Step 0: Decide on preferred algorithm, desired alignment and
8609 size of chunks to be copied by main loop. */
8610 alg = decide_alg (count, expected_size, min_size, probable_max_size,
8611 issetmem,
8612 issetmem && val_exp == const0_rtx, have_as,
8613 &dynamic_check, &noalign, false);
8614
8615 if (dump_file)
8616 fprintf (dump_file, "Selected stringop expansion strategy: %s\n",
8617 stringop_alg_names[alg]);
8618
8619 if (alg == libcall)
8620 return false;
8621 gcc_assert (alg != no_stringop);
8622
8623 /* For now the vector version of memset is generated only for memory zeroing,
8624 as creating the promoted vector value is very cheap in this case. */
8625 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
8626 alg = unrolled_loop;
8627
8628 if (!count)
8629 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
8630 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
8631 if (!issetmem)
8632 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
8633
8634 unroll_factor = 1;
8635 move_mode = word_mode;
8636 switch (alg)
8637 {
8638 case libcall:
8639 case no_stringop:
8640 case last_alg:
8641 gcc_unreachable ();
8642 case loop_1_byte:
8643 need_zero_guard = true;
8644 move_mode = QImode;
8645 break;
8646 case loop:
8647 need_zero_guard = true;
8648 break;
8649 case unrolled_loop:
8650 need_zero_guard = true;
8651 unroll_factor = (TARGET_64BIT ? 4 : 2);
8652 break;
8653 case vector_loop:
8654 need_zero_guard = true;
8655 unroll_factor = 4;
8656 /* Find the widest supported mode. */
8657 move_mode = word_mode;
8658 while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode)
8659 && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
8660 move_mode = wider_mode;
8661
8662 if (TARGET_AVX256_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 128)
8663 move_mode = TImode;
8664 if (TARGET_AVX512_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 256)
8665 move_mode = OImode;
8666
8667 /* Find the corresponding vector mode with the same size as MOVE_MODE.
8668 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
8669 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
8670 {
8671 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
8672 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
8673 || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
8674 move_mode = word_mode;
8675 }
8676 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
8677 break;
8678 case rep_prefix_8_byte:
8679 move_mode = DImode;
8680 break;
8681 case rep_prefix_4_byte:
8682 move_mode = SImode;
8683 break;
8684 case rep_prefix_1_byte:
8685 move_mode = QImode;
8686 break;
8687 }
8688 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
8689 epilogue_size_needed = size_needed;
8690
8691 /* If we are going to emit any library calls conditionally, make sure any
8692 pending stack adjustment happen before the first conditional branch,
8693 otherwise they will be emitted before the library call only and won't
8694 happen from the other branches. */
8695 if (dynamic_check != -1)
8696 do_pending_stack_adjust ();
8697
8698 desired_align = decide_alignment (align, alg, expected_size, move_mode);
8699 if (!TARGET_ALIGN_STRINGOPS || noalign)
8700 align = desired_align;
8701
8702 /* Step 1: Prologue guard. */
8703
8704 /* Alignment code needs count to be in register. */
8705 if (CONST_INT_P (count_exp) && desired_align > align)
8706 {
8707 if (INTVAL (count_exp) > desired_align
8708 && INTVAL (count_exp) > size_needed)
8709 {
8710 align_bytes
8711 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
8712 if (align_bytes <= 0)
8713 align_bytes = 0;
8714 else
8715 align_bytes = desired_align - align_bytes;
8716 }
8717 if (align_bytes == 0)
8718 count_exp = force_reg (counter_mode (count_exp), count_exp);
8719 }
8720 gcc_assert (desired_align >= 1 && align >= 1);
8721
8722 /* Misaligned move sequences handle both prologue and epilogue at once.
8723 Default code generation results in smaller code for large alignments
8724 and also avoids redundant work when sizes are known precisely.
8725 misaligned_prologue_used
8726 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
8727 && MAX (desired_align, epilogue_size_needed) <= 32
8728 && desired_align <= epilogue_size_needed
8729 && ((desired_align > align && !align_bytes)
8730 || (!count && epilogue_size_needed > 1)));
8731
8732 /* Do the cheap promotion to allow better CSE across the
8733 main loop and epilogue (i.e. one load of the big constant in
8734 front of all the code).
8735 For now the misaligned move sequences do not have a fast path
8736 without broadcasting. */
8737 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
8738 {
8739 if (alg == vector_loop)
8740 {
8741 gcc_assert (val_exp == const0_rtx);
8742 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
8743 promoted_val = promote_duplicated_reg_to_size (val_exp,
8744 GET_MODE_SIZE (word_mode),
8745 desired_align, align);
8746 }
8747 else
8748 {
8749 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
8750 desired_align, align);
8751 }
8752 }
8753 /* Misaligned move sequences handle both prologues and epilogues at once.
8754 Default code generation results in smaller code for large alignments and
8755 also avoids redundant work when sizes are known precisely. */
8756 if (misaligned_prologue_used)
8757 {
8758 /* The misaligned move prologue handles small blocks by itself. */
8759 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves
8760 (dst, src, &destreg, &srcreg,
8761 move_mode, promoted_val, vec_promoted_val,
8762 &count_exp,
8763 &jump_around_label,
8764 desired_align < align
8765 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
8766 desired_align, align, &min_size, dynamic_check, issetmem);
8767 if (!issetmem)
8768 src = change_address (src, BLKmode, srcreg);
8769 dst = change_address (dst, BLKmode, destreg);
8770 set_mem_align (dst, desired_align * BITS_PER_UNIT);
8771 epilogue_size_needed = 0;
8772 if (need_zero_guard
8773 && min_size < (unsigned HOST_WIDE_INT) size_needed)
8774 {
8775 /* It is possible that we copied enough so the main loop will not
8776 execute. */
8777 gcc_assert (size_needed > 1);
8778 if (jump_around_label == NULL_RTX)
8779 jump_around_label = gen_label_rtx ();
8780 emit_cmp_and_jump_insns (count_exp,
8781 GEN_INT (size_needed),
8782 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
8783 if (expected_size == -1
8784 || expected_size < (desired_align - align) / 2 + size_needed)
8785 predict_jump (REG_BR_PROB_BASE * 20 / 100);
8786 else
8787 predict_jump (REG_BR_PROB_BASE * 60 / 100);
8788 }
8789 }
8790 /* Ensure that alignment prologue won't copy past end of block. */
8791 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
8792 {
8793 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
8794 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
8795 Make sure it is a power of 2. */
8796 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
8797
8798 /* To improve performance of small blocks, we jump around the VAL
8799 promoting code. This means that if the promoted VAL is not constant,
8800 we might not use it in the epilogue and have to use the byte
8801 loop variant. */
8802 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
8803 force_loopy_epilogue = true;
8804 if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
8805 || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
8806 {
8807 /* If main algorithm works on QImode, no epilogue is needed.
8808 For small sizes just don't align anything. */
8809 if (size_needed == 1)
8810 desired_align = align;
8811 else
8812 goto epilogue;
8813 }
8814 else if (!count
8815 && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
8816 {
8817 label = gen_label_rtx ();
8818 emit_cmp_and_jump_insns (count_exp,
8819 GEN_INT (epilogue_size_needed),
8820 LTU, 0, counter_mode (count_exp), 1, label);
8821 if (expected_size == -1 || expected_size < epilogue_size_needed)
8822 predict_jump (REG_BR_PROB_BASE * 60 / 100);
8823 else
8824 predict_jump (REG_BR_PROB_BASE * 20 / 100);
8825 }
8826 }
8827
8828 /* Emit code to decide at runtime whether a library call or inline code
8829 should be used. */
8830 if (dynamic_check != -1)
8831 {
8832 if (!issetmem && CONST_INT_P (count_exp))
8833 {
8834 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
8835 {
8836 emit_block_copy_via_libcall (dst, src, count_exp);
8837 count_exp = const0_rtx;
8838 goto epilogue;
8839 }
8840 }
8841 else
8842 {
8843 rtx_code_label *hot_label = gen_label_rtx ();
8844 if (jump_around_label == NULL_RTX)
8845 jump_around_label = gen_label_rtx ();
8846 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
8847 LEU, 0, counter_mode (count_exp),
8848 1, hot_label);
8849 predict_jump (REG_BR_PROB_BASE * 90 / 100);
8850 if (issetmem)
8851 set_storage_via_libcall (dst, count_exp, val_exp);
8852 else
8853 emit_block_copy_via_libcall (dst, src, count_exp);
8854 emit_jump (jump_around_label);
8855 emit_label (hot_label);
8856 }
8857 }
8858
8859 /* Step 2: Alignment prologue. */
8860 /* Do the expensive promotion once we branched off the small blocks. */
8861 if (issetmem && !promoted_val)
8862 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
8863 desired_align, align);
8864
8865 if (desired_align > align && !misaligned_prologue_used)
8866 {
8867 if (align_bytes == 0)
8868 {
8869 /* Except for the first move in the prologue, we no longer know
8870 the constant offset in the aliasing info. It doesn't seem worth
8871 the pain to maintain it for the first move, so throw away
8872 the info early. */
8873 dst = change_address (dst, BLKmode, destreg);
8874 if (!issetmem)
8875 src = change_address (src, BLKmode, srcreg);
8876 dst = expand_set_or_cpymem_prologue (dst, src, destreg, srcreg,
8877 promoted_val, vec_promoted_val,
8878 count_exp, align, desired_align,
8879 issetmem);
8880 /* At most desired_align - align bytes are copied. */
8881 if (min_size < (unsigned)(desired_align - align))
8882 min_size = 0;
8883 else
8884 min_size -= desired_align - align;
8885 }
8886 else
8887 {
8888 /* If we know how many bytes need to be stored before dst is
8889 sufficiently aligned, maintain aliasing info accurately. */
8890 dst = expand_set_or_cpymem_constant_prologue (dst, &src, destreg,
8891 srcreg,
8892 promoted_val,
8893 vec_promoted_val,
8894 desired_align,
8895 align_bytes,
8896 issetmem);
8897
8898 count_exp = plus_constant (counter_mode (count_exp),
8899 count_exp, -align_bytes);
8900 count -= align_bytes;
8901 min_size -= align_bytes;
8902 max_size -= align_bytes;
8903 }
8904 if (need_zero_guard
8905 && min_size < (unsigned HOST_WIDE_INT) size_needed
8906 && (count < (unsigned HOST_WIDE_INT) size_needed
8907 || (align_bytes == 0
8908 && count < ((unsigned HOST_WIDE_INT) size_needed
8909 + desired_align - align))))
8910 {
8911 /* It is possible that we copied enough so the main loop will not
8912 execute. */
8913 gcc_assert (size_needed > 1);
8914 if (label == NULL_RTX)
8915 label = gen_label_rtx ();
8916 emit_cmp_and_jump_insns (count_exp,
8917 GEN_INT (size_needed),
8918 LTU, 0, counter_mode (count_exp), 1, label);
8919 if (expected_size == -1
8920 || expected_size < (desired_align - align) / 2 + size_needed)
8921 predict_jump (REG_BR_PROB_BASE * 20 / 100);
8922 else
8923 predict_jump (REG_BR_PROB_BASE * 60 / 100);
8924 }
8925 }
8926 if (label && size_needed == 1)
8927 {
8928 emit_label (label);
8929 LABEL_NUSES (label) = 1;
8930 label = NULL;
8931 epilogue_size_needed = 1;
8932 if (issetmem)
8933 promoted_val = val_exp;
8934 }
8935 else if (label == NULL_RTX && !misaligned_prologue_used)
8936 epilogue_size_needed = size_needed;
8937
8938 /* Step 3: Main loop. */
8939
8940 switch (alg)
8941 {
8942 case libcall:
8943 case no_stringop:
8944 case last_alg:
8945 gcc_unreachable ();
8946 case loop_1_byte:
8947 case loop:
8948 case unrolled_loop:
8949 expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg, promoted_val,
8950 count_exp, move_mode, unroll_factor,
8951 expected_size, issetmem);
8952 break;
8953 case vector_loop:
8954 expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg,
8955 vec_promoted_val, count_exp, move_mode,
8956 unroll_factor, expected_size, issetmem);
8957 break;
8958 case rep_prefix_8_byte:
8959 case rep_prefix_4_byte:
8960 case rep_prefix_1_byte:
8961 expand_set_or_cpymem_via_rep (dst, src, destreg, srcreg, promoted_val,
8962 val_exp, count_exp, move_mode, issetmem);
8963 break;
8964 }
8965 /* Properly adjust the offsets of src and dest memory for aliasing. */
8966 if (CONST_INT_P (count_exp))
8967 {
8968 if (!issetmem)
8969 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
8970 (count / size_needed) * size_needed);
8971 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
8972 (count / size_needed) * size_needed);
8973 }
8974 else
8975 {
8976 if (!issetmem)
8977 src = change_address (src, BLKmode, srcreg);
8978 dst = change_address (dst, BLKmode, destreg);
8979 }
8980
8981 /* Step 4: Epilogue to copy the remaining bytes. */
8982 epilogue:
8983 if (label)
8984 {
8985 /* When the main loop is done, COUNT_EXP might hold the original count,
8986 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
8987 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
8988 bytes. Compensate if needed. */
8989
8990 if (size_needed < epilogue_size_needed)
8991 {
8992 tmp = expand_simple_binop (counter_mode (count_exp), AND, count_exp,
8993 GEN_INT (size_needed - 1), count_exp, 1,
8994 OPTAB_DIRECT);
8995 if (tmp != count_exp)
8996 emit_move_insn (count_exp, tmp);
8997 }
8998 emit_label (label);
8999 LABEL_NUSES (label) = 1;
9000 }
9001
9002 if (count_exp != const0_rtx && epilogue_size_needed > 1)
9003 {
9004 if (force_loopy_epilogue)
9005 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
9006 epilogue_size_needed);
9007 else
9008 {
9009 if (issetmem)
9010 expand_setmem_epilogue (dst, destreg, promoted_val,
9011 vec_promoted_val, count_exp,
9012 epilogue_size_needed);
9013 else
9014 expand_cpymem_epilogue (dst, src, destreg, srcreg, count_exp,
9015 epilogue_size_needed);
9016 }
9017 }
9018 if (jump_around_label)
9019 emit_label (jump_around_label);
9020 return true;
9021 }
9022
9023 /* Expand cmpstrn or memcmp. */
9024
9025 bool
9026 ix86_expand_cmpstrn_or_cmpmem (rtx result, rtx src1, rtx src2,
9027 rtx length, rtx align, bool is_cmpstrn)
9028 {
9029 /* Expand strncmp and memcmp only with -minline-all-stringops since
9030 "repz cmpsb" can be much slower than strncmp and memcmp functions
9031 implemented with vector instructions, see
9032
9033 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43052
9034 */
9035 if (!TARGET_INLINE_ALL_STRINGOPS)
9036 return false;
9037
9038 /* Can't use this if the user has appropriated ecx, esi or edi. */
9039 if (fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG])
9040 return false;
9041
9042 if (is_cmpstrn)
9043 {
9044 /* For strncmp, length is the maximum length, which can be larger
9045 than actual string lengths. We can expand the cmpstrn pattern
9046 to "repz cmpsb" only if one of the strings is a constant so
9047 that expand_builtin_strncmp() can write the length argument to
9048 be the minimum of the const string length and the actual length
9049 argument. Otherwise, "repz cmpsb" may scan past the terminating zero byte. */
9050 tree t1 = MEM_EXPR (src1);
9051 tree t2 = MEM_EXPR (src2);
9052 if (!((t1 && TREE_CODE (t1) == MEM_REF
9053 && TREE_CODE (TREE_OPERAND (t1, 0)) == ADDR_EXPR
9054 && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t1, 0), 0))
9055 == STRING_CST))
9056 || (t2 && TREE_CODE (t2) == MEM_REF
9057 && TREE_CODE (TREE_OPERAND (t2, 0)) == ADDR_EXPR
9058 && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t2, 0), 0))
9059 == STRING_CST))))
9060 return false;
9061 }
9062
9063 rtx addr1 = copy_addr_to_reg (XEXP (src1, 0));
9064 rtx addr2 = copy_addr_to_reg (XEXP (src2, 0));
9065 if (addr1 != XEXP (src1, 0))
9066 src1 = replace_equiv_address_nv (src1, addr1);
9067 if (addr2 != XEXP (src2, 0))
9068 src2 = replace_equiv_address_nv (src2, addr2);
9069
9070 /* NB: Make a copy of the data length to avoid changing the original
9071 data length by cmpstrnqi patterns. */
9072 length = ix86_zero_extend_to_Pmode (length);
9073 rtx lengthreg = gen_reg_rtx (Pmode);
9074 emit_move_insn (lengthreg, length);
9075
9076 /* If we are testing strict equality, we can use known alignment to
9077 good advantage. This may be possible with combine, particularly
9078 once cc0 is dead. */
9079 if (CONST_INT_P (length))
9080 {
9081 if (length == const0_rtx)
9082 {
9083 emit_move_insn (result, const0_rtx);
9084 return true;
9085 }
9086 emit_insn (gen_cmpstrnqi_nz_1 (addr1, addr2, lengthreg, align,
9087 src1, src2));
9088 }
9089 else
9090 {
9091 emit_insn (gen_cmp_1 (Pmode, lengthreg, lengthreg));
9092 emit_insn (gen_cmpstrnqi_1 (addr1, addr2, lengthreg, align,
9093 src1, src2));
9094 }
9095
9096 rtx out = gen_lowpart (QImode, result);
9097 emit_insn (gen_cmpintqi (out));
9098 emit_move_insn (result, gen_rtx_SIGN_EXTEND (SImode, out));
9099
9100 return true;
9101 }
9102
9103 /* Expand the appropriate insns for doing strlen if not just doing
9104 repnz; scasb
9105
9106 out = result, initialized with the start address
9107 align_rtx = alignment of the address.
9108 scratch = scratch register, initialized with the start address when
9109 not aligned, otherwise undefined
9110
9111 This is just the body. It needs the initializations mentioned above and
9112 some address computation at the end. These things are done in i386.md. */
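/* Rough shape of the expansion (schematic; byte_is_zero and
   index_of_zero_byte are shorthand for the bit tricks used below, not
   real helpers):

     while (out not 4-byte aligned)
       if (*out == 0) goto end; else out++;
     do {
       word = *(int *) out;  out += 4;
     } while (!byte_is_zero (word));
     out += index_of_zero_byte (word) - 4;   // step back to the NUL
   end:
*/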
9113
9114 static void
9115 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
9116 {
9117 int align;
9118 rtx tmp;
9119 rtx_code_label *align_2_label = NULL;
9120 rtx_code_label *align_3_label = NULL;
9121 rtx_code_label *align_4_label = gen_label_rtx ();
9122 rtx_code_label *end_0_label = gen_label_rtx ();
9123 rtx mem;
9124 rtx tmpreg = gen_reg_rtx (SImode);
9125 rtx scratch = gen_reg_rtx (SImode);
9126 rtx cmp;
9127
9128 align = 0;
9129 if (CONST_INT_P (align_rtx))
9130 align = INTVAL (align_rtx);
9131
9132 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
9133
9134 /* Is there a known alignment and is it less than 4? */
9135 if (align < 4)
9136 {
9137 rtx scratch1 = gen_reg_rtx (Pmode);
9138 emit_move_insn (scratch1, out);
9139 /* Is there a known alignment and is it not 2? */
9140 if (align != 2)
9141 {
9142 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
9143 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
9144
9145 /* Leave just the two lower bits. */
9146 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
9147 NULL_RTX, 0, OPTAB_WIDEN);
9148
9149 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
9150 Pmode, 1, align_4_label);
9151 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
9152 Pmode, 1, align_2_label);
9153 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
9154 Pmode, 1, align_3_label);
9155 }
9156 else
9157 {
9158 /* Since the alignment is 2, we have to check 2 or 0 bytes;
9159 check if it is aligned to a 4-byte boundary. */
9160
9161 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
9162 NULL_RTX, 0, OPTAB_WIDEN);
9163
9164 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
9165 Pmode, 1, align_4_label);
9166 }
9167
9168 mem = change_address (src, QImode, out);
9169
9170 /* Now compare the bytes. */
9171
9172 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
9173 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
9174 QImode, 1, end_0_label);
9175
9176 /* Increment the address. */
9177 emit_insn (gen_add2_insn (out, const1_rtx));
9178
9179 /* Not needed with an alignment of 2. */
9180 if (align != 2)
9181 {
9182 emit_label (align_2_label);
9183
9184 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
9185 end_0_label);
9186
9187 emit_insn (gen_add2_insn (out, const1_rtx));
9188
9189 emit_label (align_3_label);
9190 }
9191
9192 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
9193 end_0_label);
9194
9195 emit_insn (gen_add2_insn (out, const1_rtx));
9196 }
9197
9198 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
9199 align this loop; it only makes the program larger and does not help
9200 to speed it up. */
9201 emit_label (align_4_label);
9202
9203 mem = change_address (src, SImode, out);
9204 emit_move_insn (scratch, mem);
9205 emit_insn (gen_add2_insn (out, GEN_INT (4)));
9206
9207 /* This formula yields a nonzero result iff one of the bytes is zero.
9208 This saves three branches inside the loop and many cycles. */
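/* The classic zero-byte test, with the four bytes of SCRATCH viewed as X:

     has_zero (X) = ((X - 0x01010101) & ~X) & 0x80808080

   For example X = 0x12345600 gives 0x00000080 (a zero byte was found),
   while X = 0x12345678 gives 0.  The add/not/and insns below compute
   exactly this.  */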
9209
9210 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
9211 emit_insn (gen_one_cmplsi2 (scratch, scratch));
9212 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
9213 emit_insn (gen_andsi3 (tmpreg, tmpreg,
9214 gen_int_mode (0x80808080, SImode)));
9215 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
9216 align_4_label);
9217
9218 if (TARGET_CMOVE)
9219 {
9220 rtx reg = gen_reg_rtx (SImode);
9221 rtx reg2 = gen_reg_rtx (Pmode);
9222 emit_move_insn (reg, tmpreg);
9223 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
9224
9225 /* If zero is not in the first two bytes, move two bytes forward. */
9226 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
9227 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
9228 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
9229 emit_insn (gen_rtx_SET (tmpreg,
9230 gen_rtx_IF_THEN_ELSE (SImode, tmp,
9231 reg,
9232 tmpreg)));
9233 /* Emit lea manually to avoid clobbering of flags. */
9234 emit_insn (gen_rtx_SET (reg2, plus_constant (Pmode, out, 2)));
9235
9236 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
9237 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
9238 emit_insn (gen_rtx_SET (out,
9239 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
9240 reg2,
9241 out)));
9242 }
9243 else
9244 {
9245 rtx_code_label *end_2_label = gen_label_rtx ();
9246 /* Is zero in the first two bytes? */
9247
9248 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
9249 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
9250 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
9251 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
9252 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
9253 pc_rtx);
9254 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
9255 JUMP_LABEL (tmp) = end_2_label;
9256
9257 /* Not in the first two. Move two bytes forward. */
9258 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
9259 emit_insn (gen_add2_insn (out, const2_rtx));
9260
9261 emit_label (end_2_label);
9262
9263 }
9264
9265 /* Avoid branch in fixing the byte. */
9266 tmpreg = gen_lowpart (QImode, tmpreg);
9267 emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
9268 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
9269 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
9270 emit_insn (gen_sub3_carry (Pmode, out, out, GEN_INT (3), tmp, cmp));
9271
9272 emit_label (end_0_label);
9273 }
9274
9275 /* Expand strlen. */
9276
9277 bool
9278 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
9279 {
9280 if (TARGET_UNROLL_STRLEN
9281 && TARGET_INLINE_ALL_STRINGOPS
9282 && eoschar == const0_rtx
9283 && optimize > 1)
9284 {
9285 /* The generic case of the strlen expander is long. Avoid its
9286 expansion unless TARGET_INLINE_ALL_STRINGOPS. */
9287 rtx addr = force_reg (Pmode, XEXP (src, 0));
9288 /* Well it seems that some optimizer does not combine a call like
9289 foo(strlen(bar), strlen(bar));
9290 when the move and the subtraction are done here. It does calculate
9291 the length just once when these instructions are done inside of
9292 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
9293 often used and I use one fewer register for the lifetime of
9294 output_strlen_unroll() this is better. */
9295
9296 emit_move_insn (out, addr);
9297
9298 ix86_expand_strlensi_unroll_1 (out, src, align);
9299
9300 /* strlensi_unroll_1 returns the address of the zero at the end of
9301 the string, like memchr(), so compute the length by subtracting
9302 the start address. */
9303 emit_insn (gen_sub2_insn (out, addr));
9304 return true;
9305 }
9306 else
9307 return false;
9308 }
9309
9310 /* For a given symbol (function), construct code to compute the address of
9311 its PLT entry in the large x86-64 PIC model. */
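/* Sketch of the emitted RTL (schematic):

     tmp = (const:DI (unspec [symbol] UNSPEC_PLTOFF))
     tmp = tmp + pic_offset_table_rtx

   i.e. the PLT offset of the symbol added to the GOT base register, as
   required by the large PIC model where 32-bit PC-relative reach cannot
   be assumed.  */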
9312
9313 static rtx
9314 construct_plt_address (rtx symbol)
9315 {
9316 rtx tmp, unspec;
9317
9318 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
9319 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
9320 gcc_assert (Pmode == DImode);
9321
9322 tmp = gen_reg_rtx (Pmode);
9323 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
9324
9325 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
9326 emit_insn (gen_add2_insn (tmp, pic_offset_table_rtx));
9327 return tmp;
9328 }
9329
9330 /* Additional registers that are clobbered by SYSV calls. */
9331
9332 static int const x86_64_ms_sysv_extra_clobbered_registers
9333 [NUM_X86_64_MS_CLOBBERED_REGS] =
9334 {
9335 SI_REG, DI_REG,
9336 XMM6_REG, XMM7_REG,
9337 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
9338 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
9339 };
9340
9341 rtx_insn *
9342 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
9343 rtx callarg2,
9344 rtx pop, bool sibcall)
9345 {
9346 rtx vec[3];
9347 rtx use = NULL, call;
9348 unsigned int vec_len = 0;
9349 tree fndecl;
9350
9351 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
9352 {
9353 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
9354 if (fndecl
9355 && (lookup_attribute ("interrupt",
9356 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
9357 error ("interrupt service routine cannot be called directly");
9358 }
9359 else
9360 fndecl = NULL_TREE;
9361
9362 if (pop == const0_rtx)
9363 pop = NULL;
9364 gcc_assert (!TARGET_64BIT || !pop);
9365
9366 rtx addr = XEXP (fnaddr, 0);
9367 if (TARGET_MACHO && !TARGET_64BIT)
9368 {
9369 #if TARGET_MACHO
9370 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
9371 fnaddr = machopic_indirect_call_target (fnaddr);
9372 #endif
9373 }
9374 else
9375 {
9376 /* Static functions and indirect calls don't need the pic register. Also,
9377 check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
9378 it an indirect call. */
9379 if (flag_pic
9380 && GET_CODE (addr) == SYMBOL_REF
9381 && ix86_call_use_plt_p (addr))
9382 {
9383 if (flag_plt
9384 && (SYMBOL_REF_DECL (addr) == NULL_TREE
9385 || !lookup_attribute ("noplt",
9386 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
9387 {
9388 if (!TARGET_64BIT
9389 || (ix86_cmodel == CM_LARGE_PIC
9390 && DEFAULT_ABI != MS_ABI))
9391 {
9392 use_reg (&use, gen_rtx_REG (Pmode,
9393 REAL_PIC_OFFSET_TABLE_REGNUM));
9394 if (ix86_use_pseudo_pic_reg ())
9395 emit_move_insn (gen_rtx_REG (Pmode,
9396 REAL_PIC_OFFSET_TABLE_REGNUM),
9397 pic_offset_table_rtx);
9398 }
9399 }
9400 else if (!TARGET_PECOFF && !TARGET_MACHO)
9401 {
9402 if (TARGET_64BIT
9403 && ix86_cmodel == CM_LARGE_PIC
9404 && DEFAULT_ABI != MS_ABI)
9405 {
9406 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
9407 UNSPEC_GOT);
9408 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
9409 fnaddr = force_reg (Pmode, fnaddr);
9410 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, fnaddr);
9411 }
9412 else if (TARGET_64BIT)
9413 {
9414 fnaddr = gen_rtx_UNSPEC (Pmode,
9415 gen_rtvec (1, addr),
9416 UNSPEC_GOTPCREL);
9417 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
9418 }
9419 else
9420 {
9421 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
9422 UNSPEC_GOT);
9423 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
9424 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
9425 fnaddr);
9426 }
9427 fnaddr = gen_const_mem (Pmode, fnaddr);
9428 /* Pmode may not be the same as word_mode for x32, which
9429 doesn't support an indirect branch via a 32-bit memory slot.
9430 Since the x32 GOT slot is 64-bit with the upper 32 bits zero,
9431 an indirect branch via the x32 GOT slot is OK. */
9432 if (GET_MODE (fnaddr) != word_mode)
9433 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
9434 fnaddr = gen_rtx_MEM (QImode, fnaddr);
9435 }
9436 }
9437 }
9438
9439 /* Skip setting up RAX register for -mskip-rax-setup when there are no
9440 parameters passed in vector registers. */
9441 if (TARGET_64BIT
9442 && (INTVAL (callarg2) > 0
9443 || (INTVAL (callarg2) == 0
9444 && (TARGET_SSE || !flag_skip_rax_setup))))
9445 {
9446 rtx al = gen_rtx_REG (QImode, AX_REG);
9447 emit_move_insn (al, callarg2);
9448 use_reg (&use, al);
9449 }
9450
9451 if (ix86_cmodel == CM_LARGE_PIC
9452 && !TARGET_PECOFF
9453 && MEM_P (fnaddr)
9454 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
9455 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
9456 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
9457 /* Since the x32 GOT slot is 64-bit with the upper 32 bits zero, an indirect
9458 branch via the x32 GOT slot is OK. */
9459 else if (!(TARGET_X32
9460 && MEM_P (fnaddr)
9461 && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
9462 && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
9463 && (sibcall
9464 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
9465 : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
9466 {
9467 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
9468 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
9469 }
9470
9471 /* PR100665: HWASAN may tag a code pointer, which is not supported by LAM;
9472 mask off code pointers here.
9473 TODO: indirect jumps also need to be handled. */
9474 if (ix86_memtag_can_tag_addresses () && !fndecl
9475 && sanitize_flags_p (SANITIZE_HWADDRESS))
9476 {
9477 rtx untagged_addr = ix86_memtag_untagged_pointer (XEXP (fnaddr, 0),
9478 NULL_RTX);
9479 fnaddr = gen_rtx_MEM (QImode, untagged_addr);
9480 }
9481
9482 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
9483
9484 if (retval)
9485 call = gen_rtx_SET (retval, call);
9486 vec[vec_len++] = call;
9487
9488 if (pop)
9489 {
9490 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
9491 pop = gen_rtx_SET (stack_pointer_rtx, pop);
9492 vec[vec_len++] = pop;
9493 }
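/* Illustrative example (assumption, not from the source): for a 32-bit
   stdcall callee that pops 8 bytes of arguments, the emitted call insn is
   a PARALLEL of the call and the stack adjustment, roughly

     (parallel [(set (reg retval) (call (mem:QI fnaddr) callarg1))
                (set (reg sp) (plus (reg sp) (const_int 8)))])  */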
9494
9495 if (cfun->machine->no_caller_saved_registers
9496 && (!fndecl
9497 || (!TREE_THIS_VOLATILE (fndecl)
9498 && !lookup_attribute ("no_caller_saved_registers",
9499 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
9500 {
9501 static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
9502 bool is_64bit_ms_abi = (TARGET_64BIT
9503 && ix86_function_abi (fndecl) == MS_ABI);
9504 char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
9505
9506 /* In a function with no caller-saved registers, explicitly clobber every
9507 register that the call (which returns) may clobber. */
9508 for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
9509 if (!fixed_regs[i]
9510 && (ix86_call_used_regs[i] == 1
9511 || (ix86_call_used_regs[i] & c_mask))
9512 && !STACK_REGNO_P (i)
9513 && !MMX_REGNO_P (i))
9514 clobber_reg (&use,
9515 gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
9516 }
9517 else if (TARGET_64BIT_MS_ABI
9518 && (!callarg2 || INTVAL (callarg2) != -2))
9519 {
9520 unsigned i;
9521
9522 for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
9523 {
9524 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
9525 machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
9526
9527 clobber_reg (&use, gen_rtx_REG (mode, regno));
9528 }
9529
9530 /* Set here, but it may get cleared later. */
9531 if (TARGET_CALL_MS2SYSV_XLOGUES)
9532 {
9533 if (!TARGET_SSE)
9534 ;
9535
9536 /* Don't break hot-patched functions. */
9537 else if (ix86_function_ms_hook_prologue (current_function_decl))
9538 ;
9539
9540 /* TODO: Cases not yet examined. */
9541 else if (flag_split_stack)
9542 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
9543
9544 else
9545 {
9546 gcc_assert (!reload_completed);
9547 cfun->machine->call_ms2sysv = true;
9548 }
9549 }
9550 }
9551
9552 if (TARGET_MACHO && TARGET_64BIT && !sibcall
9553 && ((GET_CODE (addr) == SYMBOL_REF && !SYMBOL_REF_LOCAL_P (addr))
9554 || !fndecl || TREE_PUBLIC (fndecl)))
9555 {
9556 /* We allow public functions defined in a TU to bind locally for PIC
9557 code (the default) on 64bit Mach-O.
9558 If such functions are not inlined, we cannot tell at compile-time if
9559 they will be called via the lazy symbol resolver (this can depend on
9560 options given at link-time). Therefore, we must assume that the lazy
9561 resolver could be used, which clobbers R11 and R10. */
9562 clobber_reg (&use, gen_rtx_REG (DImode, R11_REG));
9563 clobber_reg (&use, gen_rtx_REG (DImode, R10_REG));
9564 }
9565
9566 if (vec_len > 1)
9567 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
9568 rtx_insn *call_insn = emit_call_insn (call);
9569 if (use)
9570 CALL_INSN_FUNCTION_USAGE (call_insn) = use;
9571
9572 return call_insn;
9573 }
9574
9575 /* Split a simple return popping POPC bytes from the stack into an indirect
9576 branch with a stack adjustment. */
9577
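/* Rough sketch of the emitted sequence (illustrative only):

     popl  %ecx          # return address -> ECX
     addl  $POPC, %esp   # drop the bytes the callee is expected to pop
     jmp   *%ecx         # return via an indirect branch

   with REG_CFA notes attached so the unwinder can still track the CFA and
   the return address while it lives in ECX.  */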
9578 void
9579 ix86_split_simple_return_pop_internal (rtx popc)
9580 {
9581 struct machine_function *m = cfun->machine;
9582 rtx ecx = gen_rtx_REG (SImode, CX_REG);
9583 rtx_insn *insn;
9584
9585 /* There is no "pascal" calling convention in any 64bit ABI. */
9586 gcc_assert (!TARGET_64BIT);
9587
9588 insn = emit_insn (gen_pop (ecx));
9589 m->fs.cfa_offset -= UNITS_PER_WORD;
9590 m->fs.sp_offset -= UNITS_PER_WORD;
9591
9592 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
9593 x = gen_rtx_SET (stack_pointer_rtx, x);
9594 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
9595 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
9596 RTX_FRAME_RELATED_P (insn) = 1;
9597
9598 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc);
9599 x = gen_rtx_SET (stack_pointer_rtx, x);
9600 insn = emit_insn (x);
9601 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
9602 RTX_FRAME_RELATED_P (insn) = 1;
9603
9604 /* The return address is now in ECX. */
9605 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
9606 }
9607
9608 /* Errors in the source file can cause expand_expr to return const0_rtx
9609 where we expect a vector. To avoid crashing, use one of the vector
9610 clear instructions. */
9611
9612 static rtx
9613 safe_vector_operand (rtx x, machine_mode mode)
9614 {
9615 if (x == const0_rtx)
9616 x = CONST0_RTX (mode);
9617 return x;
9618 }
9619
9620 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
9621
9622 static rtx
9623 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
9624 {
9625 rtx pat;
9626 tree arg0 = CALL_EXPR_ARG (exp, 0);
9627 tree arg1 = CALL_EXPR_ARG (exp, 1);
9628 rtx op0 = expand_normal (arg0);
9629 rtx op1 = expand_normal (arg1);
9630 machine_mode tmode = insn_data[icode].operand[0].mode;
9631 machine_mode mode0 = insn_data[icode].operand[1].mode;
9632 machine_mode mode1 = insn_data[icode].operand[2].mode;
9633
9634 if (VECTOR_MODE_P (mode0))
9635 op0 = safe_vector_operand (op0, mode0);
9636 if (VECTOR_MODE_P (mode1))
9637 op1 = safe_vector_operand (op1, mode1);
9638
9639 if (optimize || !target
9640 || GET_MODE (target) != tmode
9641 || !insn_data[icode].operand[0].predicate (target, tmode))
9642 target = gen_reg_rtx (tmode);
9643
9644 if (GET_MODE (op1) == SImode && mode1 == TImode)
9645 {
9646 rtx x = gen_reg_rtx (V4SImode);
9647 emit_insn (gen_sse2_loadd (x, op1));
9648 op1 = gen_lowpart (TImode, x);
9649 }
9650
9651 if (!insn_data[icode].operand[1].predicate (op0, mode0))
9652 op0 = copy_to_mode_reg (mode0, op0);
9653 if (!insn_data[icode].operand[2].predicate (op1, mode1))
9654 op1 = copy_to_mode_reg (mode1, op1);
9655
9656 pat = GEN_FCN (icode) (target, op0, op1);
9657 if (! pat)
9658 return 0;
9659
9660 emit_insn (pat);
9661
9662 return target;
9663 }
9664
9665 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
9666
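/* Editorial note: the MULTI_ARG_* codes describe the XOP/FMA4 style
   builtins; the suffixes select extra handling below -- _IMM forces the
   last argument to be a constant, _CMP inserts a comparison rtx built from
   SUB_CODE, and _TF passes SUB_CODE itself as a trailing immediate.  */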
9667 static rtx
9668 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
9669 enum ix86_builtin_func_type m_type,
9670 enum rtx_code sub_code)
9671 {
9672 rtx pat;
9673 unsigned int i, nargs;
9674 bool comparison_p = false;
9675 bool tf_p = false;
9676 bool last_arg_constant = false;
9677 int num_memory = 0;
9678 rtx xops[4];
9679
9680 machine_mode tmode = insn_data[icode].operand[0].mode;
9681
9682 switch (m_type)
9683 {
9684 case MULTI_ARG_4_DF2_DI_I:
9685 case MULTI_ARG_4_DF2_DI_I1:
9686 case MULTI_ARG_4_SF2_SI_I:
9687 case MULTI_ARG_4_SF2_SI_I1:
9688 nargs = 4;
9689 last_arg_constant = true;
9690 break;
9691
9692 case MULTI_ARG_3_SF:
9693 case MULTI_ARG_3_DF:
9694 case MULTI_ARG_3_SF2:
9695 case MULTI_ARG_3_DF2:
9696 case MULTI_ARG_3_DI:
9697 case MULTI_ARG_3_SI:
9698 case MULTI_ARG_3_SI_DI:
9699 case MULTI_ARG_3_HI:
9700 case MULTI_ARG_3_HI_SI:
9701 case MULTI_ARG_3_QI:
9702 case MULTI_ARG_3_DI2:
9703 case MULTI_ARG_3_SI2:
9704 case MULTI_ARG_3_HI2:
9705 case MULTI_ARG_3_QI2:
9706 nargs = 3;
9707 break;
9708
9709 case MULTI_ARG_2_SF:
9710 case MULTI_ARG_2_DF:
9711 case MULTI_ARG_2_DI:
9712 case MULTI_ARG_2_SI:
9713 case MULTI_ARG_2_HI:
9714 case MULTI_ARG_2_QI:
9715 nargs = 2;
9716 break;
9717
9718 case MULTI_ARG_2_DI_IMM:
9719 case MULTI_ARG_2_SI_IMM:
9720 case MULTI_ARG_2_HI_IMM:
9721 case MULTI_ARG_2_QI_IMM:
9722 nargs = 2;
9723 last_arg_constant = true;
9724 break;
9725
9726 case MULTI_ARG_1_SF:
9727 case MULTI_ARG_1_DF:
9728 case MULTI_ARG_1_SF2:
9729 case MULTI_ARG_1_DF2:
9730 case MULTI_ARG_1_DI:
9731 case MULTI_ARG_1_SI:
9732 case MULTI_ARG_1_HI:
9733 case MULTI_ARG_1_QI:
9734 case MULTI_ARG_1_SI_DI:
9735 case MULTI_ARG_1_HI_DI:
9736 case MULTI_ARG_1_HI_SI:
9737 case MULTI_ARG_1_QI_DI:
9738 case MULTI_ARG_1_QI_SI:
9739 case MULTI_ARG_1_QI_HI:
9740 nargs = 1;
9741 break;
9742
9743 case MULTI_ARG_2_DI_CMP:
9744 case MULTI_ARG_2_SI_CMP:
9745 case MULTI_ARG_2_HI_CMP:
9746 case MULTI_ARG_2_QI_CMP:
9747 nargs = 2;
9748 comparison_p = true;
9749 break;
9750
9751 case MULTI_ARG_2_SF_TF:
9752 case MULTI_ARG_2_DF_TF:
9753 case MULTI_ARG_2_DI_TF:
9754 case MULTI_ARG_2_SI_TF:
9755 case MULTI_ARG_2_HI_TF:
9756 case MULTI_ARG_2_QI_TF:
9757 nargs = 2;
9758 tf_p = true;
9759 break;
9760
9761 default:
9762 gcc_unreachable ();
9763 }
9764
9765 if (optimize || !target
9766 || GET_MODE (target) != tmode
9767 || !insn_data[icode].operand[0].predicate (target, tmode))
9768 target = gen_reg_rtx (tmode);
9769 else if (memory_operand (target, tmode))
9770 num_memory++;
9771
9772 gcc_assert (nargs <= ARRAY_SIZE (xops));
9773
9774 for (i = 0; i < nargs; i++)
9775 {
9776 tree arg = CALL_EXPR_ARG (exp, i);
9777 rtx op = expand_normal (arg);
9778 int adjust = (comparison_p) ? 1 : 0;
9779 machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
9780
9781 if (last_arg_constant && i == nargs - 1)
9782 {
9783 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
9784 {
9785 enum insn_code new_icode = icode;
9786 switch (icode)
9787 {
9788 case CODE_FOR_xop_vpermil2v2df3:
9789 case CODE_FOR_xop_vpermil2v4sf3:
9790 case CODE_FOR_xop_vpermil2v4df3:
9791 case CODE_FOR_xop_vpermil2v8sf3:
9792 error ("the last argument must be a 2-bit immediate");
9793 return gen_reg_rtx (tmode);
9794 case CODE_FOR_xop_rotlv2di3:
9795 new_icode = CODE_FOR_rotlv2di3;
9796 goto xop_rotl;
9797 case CODE_FOR_xop_rotlv4si3:
9798 new_icode = CODE_FOR_rotlv4si3;
9799 goto xop_rotl;
9800 case CODE_FOR_xop_rotlv8hi3:
9801 new_icode = CODE_FOR_rotlv8hi3;
9802 goto xop_rotl;
9803 case CODE_FOR_xop_rotlv16qi3:
9804 new_icode = CODE_FOR_rotlv16qi3;
9805 xop_rotl:
9806 if (CONST_INT_P (op))
9807 {
9808 int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
9809 op = GEN_INT (INTVAL (op) & mask);
9810 gcc_checking_assert
9811 (insn_data[icode].operand[i + 1].predicate (op, mode));
9812 }
9813 else
9814 {
9815 gcc_checking_assert
9816 (nargs == 2
9817 && insn_data[new_icode].operand[0].mode == tmode
9818 && insn_data[new_icode].operand[1].mode == tmode
9819 && insn_data[new_icode].operand[2].mode == mode
9820 && insn_data[new_icode].operand[0].predicate
9821 == insn_data[icode].operand[0].predicate
9822 && insn_data[new_icode].operand[1].predicate
9823 == insn_data[icode].operand[1].predicate);
9824 icode = new_icode;
9825 goto non_constant;
9826 }
9827 break;
9828 default:
9829 gcc_unreachable ();
9830 }
9831 }
9832 }
9833 else
9834 {
9835 non_constant:
9836 if (VECTOR_MODE_P (mode))
9837 op = safe_vector_operand (op, mode);
9838
9839 /* If we aren't optimizing, only allow one memory operand to be
9840 generated. */
9841 if (memory_operand (op, mode))
9842 num_memory++;
9843
9844 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
9845
9846 if (optimize
9847 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
9848 || num_memory > 1)
9849 op = force_reg (mode, op);
9850 }
9851
9852 xops[i] = op;
9853 }
9854
9855 switch (nargs)
9856 {
9857 case 1:
9858 pat = GEN_FCN (icode) (target, xops[0]);
9859 break;
9860
9861 case 2:
9862 if (tf_p)
9863 pat = GEN_FCN (icode) (target, xops[0], xops[1],
9864 GEN_INT ((int)sub_code));
9865 else if (! comparison_p)
9866 pat = GEN_FCN (icode) (target, xops[0], xops[1]);
9867 else
9868 {
9869 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
9870 xops[0], xops[1]);
9871
9872 pat = GEN_FCN (icode) (target, cmp_op, xops[0], xops[1]);
9873 }
9874 break;
9875
9876 case 3:
9877 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
9878 break;
9879
9880 case 4:
9881 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2], xops[3]);
9882 break;
9883
9884 default:
9885 gcc_unreachable ();
9886 }
9887
9888 if (! pat)
9889 return 0;
9890
9891 emit_insn (pat);
9892 return target;
9893 }
9894
9895 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
9896 insns with vec_merge. */
9897
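/* Editorial note: these are the scalar insns (e.g. the *ss/*sd forms) whose
   pattern is a vec_merge of the operation result with the source vector;
   the same operand is therefore passed twice (op1 = op0) so the untouched
   upper elements come from the input.  */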
9898 static rtx
9899 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
9900 rtx target)
9901 {
9902 rtx pat;
9903 tree arg0 = CALL_EXPR_ARG (exp, 0);
9904 rtx op1, op0 = expand_normal (arg0);
9905 machine_mode tmode = insn_data[icode].operand[0].mode;
9906 machine_mode mode0 = insn_data[icode].operand[1].mode;
9907
9908 if (optimize || !target
9909 || GET_MODE (target) != tmode
9910 || !insn_data[icode].operand[0].predicate (target, tmode))
9911 target = gen_reg_rtx (tmode);
9912
9913 if (VECTOR_MODE_P (mode0))
9914 op0 = safe_vector_operand (op0, mode0);
9915
9916 if ((optimize && !register_operand (op0, mode0))
9917 || !insn_data[icode].operand[1].predicate (op0, mode0))
9918 op0 = copy_to_mode_reg (mode0, op0);
9919
9920 op1 = op0;
9921 if (!insn_data[icode].operand[2].predicate (op1, mode0))
9922 op1 = copy_to_mode_reg (mode0, op1);
9923
9924 pat = GEN_FCN (icode) (target, op0, op1);
9925 if (! pat)
9926 return 0;
9927 emit_insn (pat);
9928 return target;
9929 }
9930
9931 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
9932
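/* Editorial note: the SWAP variants exist because cmpps/cmppd only encode
   the lt/le style predicates, so e.g. a "greater than" builtin is expanded
   as "less than" with the two operands exchanged.  */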
9933 static rtx
9934 ix86_expand_sse_compare (const struct builtin_description *d,
9935 tree exp, rtx target, bool swap)
9936 {
9937 rtx pat;
9938 tree arg0 = CALL_EXPR_ARG (exp, 0);
9939 tree arg1 = CALL_EXPR_ARG (exp, 1);
9940 rtx op0 = expand_normal (arg0);
9941 rtx op1 = expand_normal (arg1);
9942 rtx op2;
9943 machine_mode tmode = insn_data[d->icode].operand[0].mode;
9944 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
9945 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
9946 enum rtx_code comparison = d->comparison;
9947
9948 if (VECTOR_MODE_P (mode0))
9949 op0 = safe_vector_operand (op0, mode0);
9950 if (VECTOR_MODE_P (mode1))
9951 op1 = safe_vector_operand (op1, mode1);
9952
9953 /* Swap operands if we have a comparison that isn't available in
9954 hardware. */
9955 if (swap)
9956 std::swap (op0, op1);
9957
9958 if (optimize || !target
9959 || GET_MODE (target) != tmode
9960 || !insn_data[d->icode].operand[0].predicate (target, tmode))
9961 target = gen_reg_rtx (tmode);
9962
9963 if ((optimize && !register_operand (op0, mode0))
9964 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
9965 op0 = copy_to_mode_reg (mode0, op0);
9966 if ((optimize && !register_operand (op1, mode1))
9967 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
9968 op1 = copy_to_mode_reg (mode1, op1);
9969
9970 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
9971 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
9972 if (! pat)
9973 return 0;
9974 emit_insn (pat);
9975 return target;
9976 }
9977
9978 /* Subroutine of ix86_sse_comi and ix86_sse_comi_round to take care of
9979 ordered EQ or unordered NE by generating a PF jump. */
9980
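/* Illustrative sketch (assumption, not from the source): for an ordered EQ
   such as _mm_comieq_sd the overall sequence is roughly

     movl   $0, %eax
     comisd %xmm1, %xmm0
     jp     .Lunord        # unordered -> keep the default 0
     sete   %al
   .Lunord:

   i.e. the PF jump emitted here skips the setcc when the inputs are
   unordered.  */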
9981 static rtx
9982 ix86_ssecom_setcc (const enum rtx_code comparison,
9983 bool check_unordered, machine_mode mode,
9984 rtx set_dst, rtx target)
9985 {
9986
9987 rtx_code_label *label = NULL;
9988
9989 /* NB: For ordered EQ or unordered NE, checking ZF alone isn't sufficient
9990 with NaN operands. */
9991 if (check_unordered)
9992 {
9993 gcc_assert (comparison == EQ || comparison == NE);
9994
9995 rtx flag = gen_rtx_REG (CCFPmode, FLAGS_REG);
9996 label = gen_label_rtx ();
9997 rtx tmp = gen_rtx_fmt_ee (UNORDERED, VOIDmode, flag, const0_rtx);
9998 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
9999 gen_rtx_LABEL_REF (VOIDmode, label),
10000 pc_rtx);
10001 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
10002 }
10003
10004 /* NB: The comparison sets CCFPmode; check a different CCmode that is
10005 a subset of CCFPmode. */
10006 if (GET_MODE (set_dst) != mode)
10007 {
10008 gcc_assert (mode == CCAmode || mode == CCCmode
10009 || mode == CCOmode || mode == CCPmode
10010 || mode == CCSmode || mode == CCZmode);
10011 set_dst = gen_rtx_REG (mode, FLAGS_REG);
10012 }
10013
10014 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10015 gen_rtx_fmt_ee (comparison, QImode,
10016 set_dst,
10017 const0_rtx)));
10018
10019 if (label)
10020 emit_label (label);
10021
10022 return SUBREG_REG (target);
10023 }
10024
10025 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
10026
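/* Editorial note: LT and LE are rewritten below as GT and GE with the
   operands swapped, because comiss/comisd set CF/ZF/PF like an unsigned
   compare and the below/below-or-equal tests would also be true for
   unordered (NaN) inputs; only EQ and NE need the explicit PF check in
   ix86_ssecom_setcc.  */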
10027 static rtx
10028 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
10029 rtx target)
10030 {
10031 rtx pat, set_dst;
10032 tree arg0 = CALL_EXPR_ARG (exp, 0);
10033 tree arg1 = CALL_EXPR_ARG (exp, 1);
10034 rtx op0 = expand_normal (arg0);
10035 rtx op1 = expand_normal (arg1);
10036 enum insn_code icode = d->icode;
10037 const struct insn_data_d *insn_p = &insn_data[icode];
10038 machine_mode mode0 = insn_p->operand[0].mode;
10039 machine_mode mode1 = insn_p->operand[1].mode;
10040
10041 if (VECTOR_MODE_P (mode0))
10042 op0 = safe_vector_operand (op0, mode0);
10043 if (VECTOR_MODE_P (mode1))
10044 op1 = safe_vector_operand (op1, mode1);
10045
10046 enum rtx_code comparison = d->comparison;
10047 rtx const_val = const0_rtx;
10048
10049 bool check_unordered = false;
10050 machine_mode mode = CCFPmode;
10051 switch (comparison)
10052 {
10053 case LE: /* -> GE */
10054 case LT: /* -> GT */
10055 std::swap (op0, op1);
10056 comparison = swap_condition (comparison);
10057 /* FALLTHRU */
10058 case GT:
10059 case GE:
10060 break;
10061 case EQ:
10062 check_unordered = true;
10063 mode = CCZmode;
10064 break;
10065 case NE:
10066 check_unordered = true;
10067 mode = CCZmode;
10068 const_val = const1_rtx;
10069 break;
10070 default:
10071 gcc_unreachable ();
10072 }
10073
10074 target = gen_reg_rtx (SImode);
10075 emit_move_insn (target, const_val);
10076 target = gen_rtx_SUBREG (QImode, target, 0);
10077
10078 if ((optimize && !register_operand (op0, mode0))
10079 || !insn_p->operand[0].predicate (op0, mode0))
10080 op0 = copy_to_mode_reg (mode0, op0);
10081 if ((optimize && !register_operand (op1, mode1))
10082 || !insn_p->operand[1].predicate (op1, mode1))
10083 op1 = copy_to_mode_reg (mode1, op1);
10084
10085 pat = GEN_FCN (icode) (op0, op1);
10086 if (! pat)
10087 return 0;
10088
10089 set_dst = SET_DEST (pat);
10090 emit_insn (pat);
10091 return ix86_ssecom_setcc (comparison, check_unordered, mode,
10092 set_dst, target);
10093 }
10094
10095 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
10096
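/* Editorial note: for the *_ROUND builtins the "comparison" field of the
   builtin_description is reused to carry the rounding-mode immediate; it is
   passed through below as GEN_INT (d->comparison) rather than interpreted
   as an rtx comparison code.  */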
10097 static rtx
10098 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
10099 rtx target)
10100 {
10101 rtx pat;
10102 tree arg0 = CALL_EXPR_ARG (exp, 0);
10103 rtx op1, op0 = expand_normal (arg0);
10104 machine_mode tmode = insn_data[d->icode].operand[0].mode;
10105 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
10106
10107 if (optimize || target == 0
10108 || GET_MODE (target) != tmode
10109 || !insn_data[d->icode].operand[0].predicate (target, tmode))
10110 target = gen_reg_rtx (tmode);
10111
10112 if (VECTOR_MODE_P (mode0))
10113 op0 = safe_vector_operand (op0, mode0);
10114
10115 if ((optimize && !register_operand (op0, mode0))
10116 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
10117 op0 = copy_to_mode_reg (mode0, op0);
10118
10119 op1 = GEN_INT (d->comparison);
10120
10121 pat = GEN_FCN (d->icode) (target, op0, op1);
10122 if (! pat)
10123 return 0;
10124 emit_insn (pat);
10125 return target;
10126 }
10127
10128 static rtx
10129 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
10130 tree exp, rtx target)
10131 {
10132 rtx pat;
10133 tree arg0 = CALL_EXPR_ARG (exp, 0);
10134 tree arg1 = CALL_EXPR_ARG (exp, 1);
10135 rtx op0 = expand_normal (arg0);
10136 rtx op1 = expand_normal (arg1);
10137 rtx op2;
10138 machine_mode tmode = insn_data[d->icode].operand[0].mode;
10139 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
10140 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
10141
10142 if (optimize || target == 0
10143 || GET_MODE (target) != tmode
10144 || !insn_data[d->icode].operand[0].predicate (target, tmode))
10145 target = gen_reg_rtx (tmode);
10146
10147 op0 = safe_vector_operand (op0, mode0);
10148 op1 = safe_vector_operand (op1, mode1);
10149
10150 if ((optimize && !register_operand (op0, mode0))
10151 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
10152 op0 = copy_to_mode_reg (mode0, op0);
10153 if ((optimize && !register_operand (op1, mode1))
10154 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
10155 op1 = copy_to_mode_reg (mode1, op1);
10156
10157 op2 = GEN_INT (d->comparison);
10158
10159 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
10160 if (! pat)
10161 return 0;
10162 emit_insn (pat);
10163 return target;
10164 }
10165
10166 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
10167
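/* Illustrative example (assumption): _mm_testz_si128 (a, b) expands here to
   a ptest that sets the flags followed by a setcc of the low byte of the
   result, roughly "ptest %xmm1, %xmm0; sete %al".  */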
10168 static rtx
10169 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
10170 rtx target)
10171 {
10172 rtx pat;
10173 tree arg0 = CALL_EXPR_ARG (exp, 0);
10174 tree arg1 = CALL_EXPR_ARG (exp, 1);
10175 rtx op0 = expand_normal (arg0);
10176 rtx op1 = expand_normal (arg1);
10177 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
10178 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
10179 enum rtx_code comparison = d->comparison;
10180
10181 if (VECTOR_MODE_P (mode0))
10182 op0 = safe_vector_operand (op0, mode0);
10183 if (VECTOR_MODE_P (mode1))
10184 op1 = safe_vector_operand (op1, mode1);
10185
10186 target = gen_reg_rtx (SImode);
10187 emit_move_insn (target, const0_rtx);
10188 target = gen_rtx_SUBREG (QImode, target, 0);
10189
10190 if ((optimize && !register_operand (op0, mode0))
10191 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
10192 op0 = copy_to_mode_reg (mode0, op0);
10193 if ((optimize && !register_operand (op1, mode1))
10194 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
10195 op1 = copy_to_mode_reg (mode1, op1);
10196
10197 pat = GEN_FCN (d->icode) (op0, op1);
10198 if (! pat)
10199 return 0;
10200 emit_insn (pat);
10201 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10202 gen_rtx_fmt_ee (comparison, QImode,
10203 SET_DEST (pat),
10204 const0_rtx)));
10205
10206 return SUBREG_REG (target);
10207 }
10208
10209 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
10210
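/* Editorial note: three result flavours are handled below -- the *I form
   returns the index (tmode0), the *M form returns the mask (tmode1), and
   the remaining flag variants (d->flag nonzero) materialize a single bit of
   the flags register via a setcc.  */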
10211 static rtx
10212 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
10213 tree exp, rtx target)
10214 {
10215 rtx pat;
10216 tree arg0 = CALL_EXPR_ARG (exp, 0);
10217 tree arg1 = CALL_EXPR_ARG (exp, 1);
10218 tree arg2 = CALL_EXPR_ARG (exp, 2);
10219 tree arg3 = CALL_EXPR_ARG (exp, 3);
10220 tree arg4 = CALL_EXPR_ARG (exp, 4);
10221 rtx scratch0, scratch1;
10222 rtx op0 = expand_normal (arg0);
10223 rtx op1 = expand_normal (arg1);
10224 rtx op2 = expand_normal (arg2);
10225 rtx op3 = expand_normal (arg3);
10226 rtx op4 = expand_normal (arg4);
10227 machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
10228
10229 tmode0 = insn_data[d->icode].operand[0].mode;
10230 tmode1 = insn_data[d->icode].operand[1].mode;
10231 modev2 = insn_data[d->icode].operand[2].mode;
10232 modei3 = insn_data[d->icode].operand[3].mode;
10233 modev4 = insn_data[d->icode].operand[4].mode;
10234 modei5 = insn_data[d->icode].operand[5].mode;
10235 modeimm = insn_data[d->icode].operand[6].mode;
10236
10237 if (VECTOR_MODE_P (modev2))
10238 op0 = safe_vector_operand (op0, modev2);
10239 if (VECTOR_MODE_P (modev4))
10240 op2 = safe_vector_operand (op2, modev4);
10241
10242 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
10243 op0 = copy_to_mode_reg (modev2, op0);
10244 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
10245 op1 = copy_to_mode_reg (modei3, op1);
10246 if ((optimize && !register_operand (op2, modev4))
10247 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
10248 op2 = copy_to_mode_reg (modev4, op2);
10249 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
10250 op3 = copy_to_mode_reg (modei5, op3);
10251
10252 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
10253 {
10254 error ("the fifth argument must be an 8-bit immediate");
10255 return const0_rtx;
10256 }
10257
10258 if (d->code == IX86_BUILTIN_PCMPESTRI128)
10259 {
10260 if (optimize || !target
10261 || GET_MODE (target) != tmode0
10262 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
10263 target = gen_reg_rtx (tmode0);
10264
10265 scratch1 = gen_reg_rtx (tmode1);
10266
10267 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
10268 }
10269 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
10270 {
10271 if (optimize || !target
10272 || GET_MODE (target) != tmode1
10273 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
10274 target = gen_reg_rtx (tmode1);
10275
10276 scratch0 = gen_reg_rtx (tmode0);
10277
10278 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
10279 }
10280 else
10281 {
10282 gcc_assert (d->flag);
10283
10284 scratch0 = gen_reg_rtx (tmode0);
10285 scratch1 = gen_reg_rtx (tmode1);
10286
10287 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
10288 }
10289
10290 if (! pat)
10291 return 0;
10292
10293 emit_insn (pat);
10294
10295 if (d->flag)
10296 {
10297 target = gen_reg_rtx (SImode);
10298 emit_move_insn (target, const0_rtx);
10299 target = gen_rtx_SUBREG (QImode, target, 0);
10300
10301 emit_insn
10302 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10303 gen_rtx_fmt_ee (EQ, QImode,
10304 gen_rtx_REG ((machine_mode) d->flag,
10305 FLAGS_REG),
10306 const0_rtx)));
10307 return SUBREG_REG (target);
10308 }
10309 else
10310 return target;
10311 }
10312
10313
10314 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
10315
10316 static rtx
10317 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
10318 tree exp, rtx target)
10319 {
10320 rtx pat;
10321 tree arg0 = CALL_EXPR_ARG (exp, 0);
10322 tree arg1 = CALL_EXPR_ARG (exp, 1);
10323 tree arg2 = CALL_EXPR_ARG (exp, 2);
10324 rtx scratch0, scratch1;
10325 rtx op0 = expand_normal (arg0);
10326 rtx op1 = expand_normal (arg1);
10327 rtx op2 = expand_normal (arg2);
10328 machine_mode tmode0, tmode1, modev2, modev3, modeimm;
10329
10330 tmode0 = insn_data[d->icode].operand[0].mode;
10331 tmode1 = insn_data[d->icode].operand[1].mode;
10332 modev2 = insn_data[d->icode].operand[2].mode;
10333 modev3 = insn_data[d->icode].operand[3].mode;
10334 modeimm = insn_data[d->icode].operand[4].mode;
10335
10336 if (VECTOR_MODE_P (modev2))
10337 op0 = safe_vector_operand (op0, modev2);
10338 if (VECTOR_MODE_P (modev3))
10339 op1 = safe_vector_operand (op1, modev3);
10340
10341 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
10342 op0 = copy_to_mode_reg (modev2, op0);
10343 if ((optimize && !register_operand (op1, modev3))
10344 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
10345 op1 = copy_to_mode_reg (modev3, op1);
10346
10347 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
10348 {
10349 error ("the third argument must be an 8-bit immediate");
10350 return const0_rtx;
10351 }
10352
10353 if (d->code == IX86_BUILTIN_PCMPISTRI128)
10354 {
10355 if (optimize || !target
10356 || GET_MODE (target) != tmode0
10357 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
10358 target = gen_reg_rtx (tmode0);
10359
10360 scratch1 = gen_reg_rtx (tmode1);
10361
10362 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
10363 }
10364 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
10365 {
10366 if (optimize || !target
10367 || GET_MODE (target) != tmode1
10368 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
10369 target = gen_reg_rtx (tmode1);
10370
10371 scratch0 = gen_reg_rtx (tmode0);
10372
10373 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
10374 }
10375 else
10376 {
10377 gcc_assert (d->flag);
10378
10379 scratch0 = gen_reg_rtx (tmode0);
10380 scratch1 = gen_reg_rtx (tmode1);
10381
10382 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
10383 }
10384
10385 if (! pat)
10386 return 0;
10387
10388 emit_insn (pat);
10389
10390 if (d->flag)
10391 {
10392 target = gen_reg_rtx (SImode);
10393 emit_move_insn (target, const0_rtx);
10394 target = gen_rtx_SUBREG (QImode, target, 0);
10395
10396 emit_insn
10397 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10398 gen_rtx_fmt_ee (EQ, QImode,
10399 gen_rtx_REG ((machine_mode) d->flag,
10400 FLAGS_REG),
10401 const0_rtx)));
10402 return SUBREG_REG (target);
10403 }
10404 else
10405 return target;
10406 }
10407
10408 /* Fix up modeless constants to fit the required mode. */
10409
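/* Editorial note: CONST_INT rtxes always have VOIDmode, so e.g. an
   immediate builtin argument needs this zero-extending conversion before it
   can match a mode-specific operand predicate.  */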
10410 static rtx
10411 fixup_modeless_constant (rtx x, machine_mode mode)
10412 {
10413 if (GET_MODE (x) == VOIDmode)
10414 x = convert_to_mode (mode, x, 1);
10415 return x;
10416 }
10417
10418 /* Subroutine of ix86_expand_builtin to take care of insns with a
10419 variable number of operands. */
10420
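/* Editorial note: the ix86_builtin_func_type names below encode the
   signature as RETURN_FTYPE_ARG1_ARG2_..., e.g. V8SF_FTYPE_V8SF_V8SI is a
   builtin returning a V8SF from a V8SF and a V8SI argument; trailing tags
   such as _ROUND, _PTEST, _COUNT, _SWAP and _CONVERT select the special
   expansion paths handled in the switch.  */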
10421 static rtx
10422 ix86_expand_args_builtin (const struct builtin_description *d,
10423 tree exp, rtx target)
10424 {
10425 rtx pat, real_target;
10426 unsigned int i, nargs;
10427 unsigned int nargs_constant = 0;
10428 unsigned int mask_pos = 0;
10429 int num_memory = 0;
10430 rtx xops[6];
10431 bool second_arg_count = false;
10432 enum insn_code icode = d->icode;
10433 const struct insn_data_d *insn_p = &insn_data[icode];
10434 machine_mode tmode = insn_p->operand[0].mode;
10435 machine_mode rmode = VOIDmode;
10436 bool swap = false;
10437 enum rtx_code comparison = d->comparison;
10438
10439 switch ((enum ix86_builtin_func_type) d->flag)
10440 {
10441 case V2DF_FTYPE_V2DF_ROUND:
10442 case V4DF_FTYPE_V4DF_ROUND:
10443 case V8DF_FTYPE_V8DF_ROUND:
10444 case V4SF_FTYPE_V4SF_ROUND:
10445 case V8SF_FTYPE_V8SF_ROUND:
10446 case V16SF_FTYPE_V16SF_ROUND:
10447 case V8HF_FTYPE_V8HF_ROUND:
10448 case V16HF_FTYPE_V16HF_ROUND:
10449 case V32HF_FTYPE_V32HF_ROUND:
10450 case V4SI_FTYPE_V4SF_ROUND:
10451 case V8SI_FTYPE_V8SF_ROUND:
10452 case V16SI_FTYPE_V16SF_ROUND:
10453 return ix86_expand_sse_round (d, exp, target);
10454 case V4SI_FTYPE_V2DF_V2DF_ROUND:
10455 case V8SI_FTYPE_V4DF_V4DF_ROUND:
10456 case V16SI_FTYPE_V8DF_V8DF_ROUND:
10457 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
10458 case INT_FTYPE_V8SF_V8SF_PTEST:
10459 case INT_FTYPE_V4DI_V4DI_PTEST:
10460 case INT_FTYPE_V4DF_V4DF_PTEST:
10461 case INT_FTYPE_V4SF_V4SF_PTEST:
10462 case INT_FTYPE_V2DI_V2DI_PTEST:
10463 case INT_FTYPE_V2DF_V2DF_PTEST:
10464 return ix86_expand_sse_ptest (d, exp, target);
10465 case FLOAT128_FTYPE_FLOAT128:
10466 case FLOAT_FTYPE_FLOAT:
10467 case FLOAT_FTYPE_BFLOAT16:
10468 case INT_FTYPE_INT:
10469 case UINT_FTYPE_UINT:
10470 case UINT16_FTYPE_UINT16:
10471 case UINT64_FTYPE_INT:
10472 case UINT64_FTYPE_UINT64:
10473 case INT64_FTYPE_INT64:
10474 case INT64_FTYPE_V4SF:
10475 case INT64_FTYPE_V2DF:
10476 case INT_FTYPE_V16QI:
10477 case INT_FTYPE_V8QI:
10478 case INT_FTYPE_V8SF:
10479 case INT_FTYPE_V4DF:
10480 case INT_FTYPE_V4SF:
10481 case INT_FTYPE_V2DF:
10482 case INT_FTYPE_V32QI:
10483 case V16QI_FTYPE_V16QI:
10484 case V8SI_FTYPE_V8SF:
10485 case V8SI_FTYPE_V4SI:
10486 case V8HI_FTYPE_V8HI:
10487 case V8HI_FTYPE_V16QI:
10488 case V8QI_FTYPE_V8QI:
10489 case V8SF_FTYPE_V8SF:
10490 case V8SF_FTYPE_V8SI:
10491 case V8SF_FTYPE_V4SF:
10492 case V8SF_FTYPE_V8HI:
10493 case V4SI_FTYPE_V4SI:
10494 case V4SI_FTYPE_V16QI:
10495 case V4SI_FTYPE_V4SF:
10496 case V4SI_FTYPE_V8SI:
10497 case V4SI_FTYPE_V8HI:
10498 case V4SI_FTYPE_V4DF:
10499 case V4SI_FTYPE_V2DF:
10500 case V4HI_FTYPE_V4HI:
10501 case V4DF_FTYPE_V4DF:
10502 case V4DF_FTYPE_V4SI:
10503 case V4DF_FTYPE_V4SF:
10504 case V4DF_FTYPE_V2DF:
10505 case V4SF_FTYPE_V4SF:
10506 case V4SF_FTYPE_V4SI:
10507 case V4SF_FTYPE_V8SF:
10508 case V4SF_FTYPE_V4DF:
10509 case V4SF_FTYPE_V8HI:
10510 case V4SF_FTYPE_V2DF:
10511 case V2DI_FTYPE_V2DI:
10512 case V2DI_FTYPE_V16QI:
10513 case V2DI_FTYPE_V8HI:
10514 case V2DI_FTYPE_V4SI:
10515 case V2DF_FTYPE_V2DF:
10516 case V2DF_FTYPE_V4SI:
10517 case V2DF_FTYPE_V4DF:
10518 case V2DF_FTYPE_V4SF:
10519 case V2DF_FTYPE_V2SI:
10520 case V2SI_FTYPE_V2SI:
10521 case V2SI_FTYPE_V4SF:
10522 case V2SI_FTYPE_V2SF:
10523 case V2SI_FTYPE_V2DF:
10524 case V2SF_FTYPE_V2SF:
10525 case V2SF_FTYPE_V2SI:
10526 case V32QI_FTYPE_V32QI:
10527 case V32QI_FTYPE_V16QI:
10528 case V16HI_FTYPE_V16HI:
10529 case V16HI_FTYPE_V8HI:
10530 case V8SI_FTYPE_V8SI:
10531 case V16HI_FTYPE_V16QI:
10532 case V8SI_FTYPE_V16QI:
10533 case V4DI_FTYPE_V16QI:
10534 case V8SI_FTYPE_V8HI:
10535 case V4DI_FTYPE_V8HI:
10536 case V4DI_FTYPE_V4SI:
10537 case V4DI_FTYPE_V2DI:
10538 case UQI_FTYPE_UQI:
10539 case UHI_FTYPE_UHI:
10540 case USI_FTYPE_USI:
10541 case USI_FTYPE_UQI:
10542 case USI_FTYPE_UHI:
10543 case UDI_FTYPE_UDI:
10544 case UHI_FTYPE_V16QI:
10545 case USI_FTYPE_V32QI:
10546 case UDI_FTYPE_V64QI:
10547 case V16QI_FTYPE_UHI:
10548 case V32QI_FTYPE_USI:
10549 case V64QI_FTYPE_UDI:
10550 case V8HI_FTYPE_UQI:
10551 case V16HI_FTYPE_UHI:
10552 case V32HI_FTYPE_USI:
10553 case V4SI_FTYPE_UQI:
10554 case V8SI_FTYPE_UQI:
10555 case V4SI_FTYPE_UHI:
10556 case V8SI_FTYPE_UHI:
10557 case UQI_FTYPE_V8HI:
10558 case UHI_FTYPE_V16HI:
10559 case USI_FTYPE_V32HI:
10560 case UQI_FTYPE_V4SI:
10561 case UQI_FTYPE_V8SI:
10562 case UHI_FTYPE_V16SI:
10563 case UQI_FTYPE_V2DI:
10564 case UQI_FTYPE_V4DI:
10565 case UQI_FTYPE_V8DI:
10566 case V16SI_FTYPE_UHI:
10567 case V2DI_FTYPE_UQI:
10568 case V4DI_FTYPE_UQI:
10569 case V16SI_FTYPE_INT:
10570 case V16SF_FTYPE_V8SF:
10571 case V16SI_FTYPE_V8SI:
10572 case V16SF_FTYPE_V4SF:
10573 case V16SI_FTYPE_V4SI:
10574 case V16SI_FTYPE_V16SF:
10575 case V16SI_FTYPE_V16SI:
10576 case V64QI_FTYPE_V64QI:
10577 case V32HI_FTYPE_V32HI:
10578 case V16SF_FTYPE_V16SF:
10579 case V8DI_FTYPE_UQI:
10580 case V8DI_FTYPE_V8DI:
10581 case V8DF_FTYPE_V4DF:
10582 case V8DF_FTYPE_V2DF:
10583 case V8DF_FTYPE_V8DF:
10584 case V4DI_FTYPE_V4DI:
10585 case V16BF_FTYPE_V16SF:
10586 case V8BF_FTYPE_V8SF:
10587 case V8BF_FTYPE_V4SF:
10588 nargs = 1;
10589 break;
10590 case V4SF_FTYPE_V4SF_VEC_MERGE:
10591 case V2DF_FTYPE_V2DF_VEC_MERGE:
10592 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
10593 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
10594 case V16QI_FTYPE_V16QI_V16QI:
10595 case V16QI_FTYPE_V8HI_V8HI:
10596 case V16HF_FTYPE_V16HF_V16HF:
10597 case V16SF_FTYPE_V16SF_V16SF:
10598 case V8QI_FTYPE_V8QI_V8QI:
10599 case V8QI_FTYPE_V4HI_V4HI:
10600 case V8HI_FTYPE_V8HI_V8HI:
10601 case V8HI_FTYPE_V16QI_V16QI:
10602 case V8HI_FTYPE_V4SI_V4SI:
10603 case V8HF_FTYPE_V8HF_V8HF:
10604 case V8SF_FTYPE_V8SF_V8SF:
10605 case V8SF_FTYPE_V8SF_V8SI:
10606 case V8DF_FTYPE_V8DF_V8DF:
10607 case V4SI_FTYPE_V4SI_V4SI:
10608 case V4SI_FTYPE_V8HI_V8HI:
10609 case V4SI_FTYPE_V2DF_V2DF:
10610 case V4HI_FTYPE_V4HI_V4HI:
10611 case V4HI_FTYPE_V8QI_V8QI:
10612 case V4HI_FTYPE_V2SI_V2SI:
10613 case V4DF_FTYPE_V4DF_V4DF:
10614 case V4DF_FTYPE_V4DF_V4DI:
10615 case V4SF_FTYPE_V4SF_V4SF:
10616 case V4SF_FTYPE_V4SF_V4SI:
10617 case V4SF_FTYPE_V4SF_V2SI:
10618 case V4SF_FTYPE_V4SF_V2DF:
10619 case V4SF_FTYPE_V4SF_UINT:
10620 case V4SF_FTYPE_V4SF_DI:
10621 case V4SF_FTYPE_V4SF_SI:
10622 case V2DI_FTYPE_V2DI_V2DI:
10623 case V2DI_FTYPE_V16QI_V16QI:
10624 case V2DI_FTYPE_V4SI_V4SI:
10625 case V2DI_FTYPE_V2DI_V16QI:
10626 case V2SI_FTYPE_V2SI_V2SI:
10627 case V2SI_FTYPE_V4HI_V4HI:
10628 case V2SI_FTYPE_V2SF_V2SF:
10629 case V2DF_FTYPE_V2DF_V2DF:
10630 case V2DF_FTYPE_V2DF_V4SF:
10631 case V2DF_FTYPE_V2DF_V2DI:
10632 case V2DF_FTYPE_V2DF_DI:
10633 case V2DF_FTYPE_V2DF_SI:
10634 case V2DF_FTYPE_V2DF_UINT:
10635 case V2SF_FTYPE_V2SF_V2SF:
10636 case V1DI_FTYPE_V1DI_V1DI:
10637 case V1DI_FTYPE_V8QI_V8QI:
10638 case V1DI_FTYPE_V2SI_V2SI:
10639 case V32QI_FTYPE_V16HI_V16HI:
10640 case V16HI_FTYPE_V8SI_V8SI:
10641 case V64QI_FTYPE_V64QI_V64QI:
10642 case V32QI_FTYPE_V32QI_V32QI:
10643 case V16HI_FTYPE_V32QI_V32QI:
10644 case V16HI_FTYPE_V16HI_V16HI:
10645 case V8SI_FTYPE_V4DF_V4DF:
10646 case V8SI_FTYPE_V8SI_V8SI:
10647 case V8SI_FTYPE_V16HI_V16HI:
10648 case V4DI_FTYPE_V4DI_V4DI:
10649 case V4DI_FTYPE_V8SI_V8SI:
10650 case V4DI_FTYPE_V32QI_V32QI:
10651 case V8DI_FTYPE_V64QI_V64QI:
10652 if (comparison == UNKNOWN)
10653 return ix86_expand_binop_builtin (icode, exp, target);
10654 nargs = 2;
10655 break;
10656 case V4SF_FTYPE_V4SF_V4SF_SWAP:
10657 case V2DF_FTYPE_V2DF_V2DF_SWAP:
10658 gcc_assert (comparison != UNKNOWN);
10659 nargs = 2;
10660 swap = true;
10661 break;
10662 case V16HI_FTYPE_V16HI_V8HI_COUNT:
10663 case V16HI_FTYPE_V16HI_SI_COUNT:
10664 case V8SI_FTYPE_V8SI_V4SI_COUNT:
10665 case V8SI_FTYPE_V8SI_SI_COUNT:
10666 case V4DI_FTYPE_V4DI_V2DI_COUNT:
10667 case V4DI_FTYPE_V4DI_INT_COUNT:
10668 case V8HI_FTYPE_V8HI_V8HI_COUNT:
10669 case V8HI_FTYPE_V8HI_SI_COUNT:
10670 case V4SI_FTYPE_V4SI_V4SI_COUNT:
10671 case V4SI_FTYPE_V4SI_SI_COUNT:
10672 case V4HI_FTYPE_V4HI_V4HI_COUNT:
10673 case V4HI_FTYPE_V4HI_SI_COUNT:
10674 case V2DI_FTYPE_V2DI_V2DI_COUNT:
10675 case V2DI_FTYPE_V2DI_SI_COUNT:
10676 case V2SI_FTYPE_V2SI_V2SI_COUNT:
10677 case V2SI_FTYPE_V2SI_SI_COUNT:
10678 case V1DI_FTYPE_V1DI_V1DI_COUNT:
10679 case V1DI_FTYPE_V1DI_SI_COUNT:
10680 nargs = 2;
10681 second_arg_count = true;
10682 break;
10683 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
10684 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
10685 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
10686 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
10687 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
10688 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
10689 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
10690 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
10691 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
10692 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
10693 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
10694 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
10695 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
10696 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
10697 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
10698 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
10699 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
10700 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
10701 nargs = 4;
10702 second_arg_count = true;
10703 break;
10704 case UINT64_FTYPE_UINT64_UINT64:
10705 case UINT_FTYPE_UINT_UINT:
10706 case UINT_FTYPE_UINT_USHORT:
10707 case UINT_FTYPE_UINT_UCHAR:
10708 case UINT16_FTYPE_UINT16_INT:
10709 case UINT8_FTYPE_UINT8_INT:
10710 case UQI_FTYPE_UQI_UQI:
10711 case UHI_FTYPE_UHI_UHI:
10712 case USI_FTYPE_USI_USI:
10713 case UDI_FTYPE_UDI_UDI:
10714 case V16SI_FTYPE_V8DF_V8DF:
10715 case V32BF_FTYPE_V16SF_V16SF:
10716 case V16BF_FTYPE_V8SF_V8SF:
10717 case V8BF_FTYPE_V4SF_V4SF:
10718 case V16BF_FTYPE_V16SF_UHI:
10719 case V8BF_FTYPE_V8SF_UQI:
10720 case V8BF_FTYPE_V4SF_UQI:
10721 nargs = 2;
10722 break;
10723 case V2DI_FTYPE_V2DI_INT_CONVERT:
10724 nargs = 2;
10725 rmode = V1TImode;
10726 nargs_constant = 1;
10727 break;
10728 case V4DI_FTYPE_V4DI_INT_CONVERT:
10729 nargs = 2;
10730 rmode = V2TImode;
10731 nargs_constant = 1;
10732 break;
10733 case V8DI_FTYPE_V8DI_INT_CONVERT:
10734 nargs = 2;
10735 rmode = V4TImode;
10736 nargs_constant = 1;
10737 break;
10738 case V8HI_FTYPE_V8HI_INT:
10739 case V8HI_FTYPE_V8SF_INT:
10740 case V16HI_FTYPE_V16SF_INT:
10741 case V8HI_FTYPE_V4SF_INT:
10742 case V8SF_FTYPE_V8SF_INT:
10743 case V4SF_FTYPE_V16SF_INT:
10744 case V16SF_FTYPE_V16SF_INT:
10745 case V4SI_FTYPE_V4SI_INT:
10746 case V4SI_FTYPE_V8SI_INT:
10747 case V4HI_FTYPE_V4HI_INT:
10748 case V4DF_FTYPE_V4DF_INT:
10749 case V4DF_FTYPE_V8DF_INT:
10750 case V4SF_FTYPE_V4SF_INT:
10751 case V4SF_FTYPE_V8SF_INT:
10752 case V2DI_FTYPE_V2DI_INT:
10753 case V2DF_FTYPE_V2DF_INT:
10754 case V2DF_FTYPE_V4DF_INT:
10755 case V16HI_FTYPE_V16HI_INT:
10756 case V8SI_FTYPE_V8SI_INT:
10757 case V16SI_FTYPE_V16SI_INT:
10758 case V4SI_FTYPE_V16SI_INT:
10759 case V4DI_FTYPE_V4DI_INT:
10760 case V2DI_FTYPE_V4DI_INT:
10761 case V4DI_FTYPE_V8DI_INT:
10762 case UQI_FTYPE_UQI_UQI_CONST:
10763 case UHI_FTYPE_UHI_UQI:
10764 case USI_FTYPE_USI_UQI:
10765 case UDI_FTYPE_UDI_UQI:
10766 nargs = 2;
10767 nargs_constant = 1;
10768 break;
10769 case V16QI_FTYPE_V16QI_V16QI_V16QI:
10770 case V8SF_FTYPE_V8SF_V8SF_V8SF:
10771 case V4DF_FTYPE_V4DF_V4DF_V4DF:
10772 case V4SF_FTYPE_V4SF_V4SF_V4SF:
10773 case V2DF_FTYPE_V2DF_V2DF_V2DF:
10774 case V32QI_FTYPE_V32QI_V32QI_V32QI:
10775 case UHI_FTYPE_V16SI_V16SI_UHI:
10776 case UQI_FTYPE_V8DI_V8DI_UQI:
10777 case V16HI_FTYPE_V16SI_V16HI_UHI:
10778 case V16QI_FTYPE_V16SI_V16QI_UHI:
10779 case V16QI_FTYPE_V8DI_V16QI_UQI:
10780 case V32HF_FTYPE_V32HF_V32HF_USI:
10781 case V16SF_FTYPE_V16SF_V16SF_UHI:
10782 case V16SF_FTYPE_V4SF_V16SF_UHI:
10783 case V16SI_FTYPE_SI_V16SI_UHI:
10784 case V16SI_FTYPE_V16HI_V16SI_UHI:
10785 case V16SI_FTYPE_V16QI_V16SI_UHI:
10786 case V8SF_FTYPE_V4SF_V8SF_UQI:
10787 case V4DF_FTYPE_V2DF_V4DF_UQI:
10788 case V8SI_FTYPE_V4SI_V8SI_UQI:
10789 case V8SI_FTYPE_SI_V8SI_UQI:
10790 case V4SI_FTYPE_V4SI_V4SI_UQI:
10791 case V4SI_FTYPE_SI_V4SI_UQI:
10792 case V4DI_FTYPE_V2DI_V4DI_UQI:
10793 case V4DI_FTYPE_DI_V4DI_UQI:
10794 case V2DI_FTYPE_V2DI_V2DI_UQI:
10795 case V2DI_FTYPE_DI_V2DI_UQI:
10796 case V64QI_FTYPE_V64QI_V64QI_UDI:
10797 case V64QI_FTYPE_V16QI_V64QI_UDI:
10798 case V64QI_FTYPE_QI_V64QI_UDI:
10799 case V32QI_FTYPE_V32QI_V32QI_USI:
10800 case V32QI_FTYPE_V16QI_V32QI_USI:
10801 case V32QI_FTYPE_QI_V32QI_USI:
10802 case V16QI_FTYPE_V16QI_V16QI_UHI:
10803 case V16QI_FTYPE_QI_V16QI_UHI:
10804 case V32HI_FTYPE_V8HI_V32HI_USI:
10805 case V32HI_FTYPE_HI_V32HI_USI:
10806 case V16HI_FTYPE_V8HI_V16HI_UHI:
10807 case V16HI_FTYPE_HI_V16HI_UHI:
10808 case V8HI_FTYPE_V8HI_V8HI_UQI:
10809 case V8HI_FTYPE_HI_V8HI_UQI:
10810 case V16HF_FTYPE_V16HF_V16HF_UHI:
10811 case V8SF_FTYPE_V8HI_V8SF_UQI:
10812 case V4SF_FTYPE_V8HI_V4SF_UQI:
10813 case V8SI_FTYPE_V8HF_V8SI_UQI:
10814 case V8SF_FTYPE_V8HF_V8SF_UQI:
10815 case V8SI_FTYPE_V8SF_V8SI_UQI:
10816 case V4SI_FTYPE_V4SF_V4SI_UQI:
10817 case V4SI_FTYPE_V8HF_V4SI_UQI:
10818 case V4SF_FTYPE_V8HF_V4SF_UQI:
10819 case V4DI_FTYPE_V8HF_V4DI_UQI:
10820 case V4DI_FTYPE_V4SF_V4DI_UQI:
10821 case V2DI_FTYPE_V8HF_V2DI_UQI:
10822 case V2DI_FTYPE_V4SF_V2DI_UQI:
10823 case V8HF_FTYPE_V8HF_V8HF_UQI:
10824 case V8HF_FTYPE_V8HF_V8HF_V8HF:
10825 case V8HF_FTYPE_V8HI_V8HF_UQI:
10826 case V8HF_FTYPE_V8SI_V8HF_UQI:
10827 case V8HF_FTYPE_V8SF_V8HF_UQI:
10828 case V8HF_FTYPE_V4SI_V8HF_UQI:
10829 case V8HF_FTYPE_V4SF_V8HF_UQI:
10830 case V8HF_FTYPE_V4DI_V8HF_UQI:
10831 case V8HF_FTYPE_V4DF_V8HF_UQI:
10832 case V8HF_FTYPE_V2DI_V8HF_UQI:
10833 case V8HF_FTYPE_V2DF_V8HF_UQI:
10834 case V4SF_FTYPE_V4DI_V4SF_UQI:
10835 case V4SF_FTYPE_V2DI_V4SF_UQI:
10836 case V4DF_FTYPE_V4DI_V4DF_UQI:
10837 case V4DF_FTYPE_V8HF_V4DF_UQI:
10838 case V2DF_FTYPE_V8HF_V2DF_UQI:
10839 case V2DF_FTYPE_V2DI_V2DF_UQI:
10840 case V16QI_FTYPE_V8HI_V16QI_UQI:
10841 case V16QI_FTYPE_V16HI_V16QI_UHI:
10842 case V16QI_FTYPE_V4SI_V16QI_UQI:
10843 case V16QI_FTYPE_V8SI_V16QI_UQI:
10844 case V8HI_FTYPE_V8HF_V8HI_UQI:
10845 case V8HI_FTYPE_V4SI_V8HI_UQI:
10846 case V8HI_FTYPE_V8SI_V8HI_UQI:
10847 case V16QI_FTYPE_V2DI_V16QI_UQI:
10848 case V16QI_FTYPE_V4DI_V16QI_UQI:
10849 case V8HI_FTYPE_V2DI_V8HI_UQI:
10850 case V8HI_FTYPE_V4DI_V8HI_UQI:
10851 case V4SI_FTYPE_V2DI_V4SI_UQI:
10852 case V4SI_FTYPE_V4DI_V4SI_UQI:
10853 case V32QI_FTYPE_V32HI_V32QI_USI:
10854 case UHI_FTYPE_V16QI_V16QI_UHI:
10855 case USI_FTYPE_V32QI_V32QI_USI:
10856 case UDI_FTYPE_V64QI_V64QI_UDI:
10857 case UQI_FTYPE_V8HI_V8HI_UQI:
10858 case UHI_FTYPE_V16HI_V16HI_UHI:
10859 case USI_FTYPE_V32HI_V32HI_USI:
10860 case UQI_FTYPE_V4SI_V4SI_UQI:
10861 case UQI_FTYPE_V8SI_V8SI_UQI:
10862 case UQI_FTYPE_V2DI_V2DI_UQI:
10863 case UQI_FTYPE_V4DI_V4DI_UQI:
10864 case V4SF_FTYPE_V2DF_V4SF_UQI:
10865 case V4SF_FTYPE_V4DF_V4SF_UQI:
10866 case V16SI_FTYPE_V16SI_V16SI_UHI:
10867 case V16SI_FTYPE_V4SI_V16SI_UHI:
10868 case V2DI_FTYPE_V4SI_V2DI_UQI:
10869 case V2DI_FTYPE_V8HI_V2DI_UQI:
10870 case V2DI_FTYPE_V16QI_V2DI_UQI:
10871 case V4DI_FTYPE_V4DI_V4DI_UQI:
10872 case V4DI_FTYPE_V4SI_V4DI_UQI:
10873 case V4DI_FTYPE_V8HI_V4DI_UQI:
10874 case V4DI_FTYPE_V16QI_V4DI_UQI:
10875 case V4DI_FTYPE_V4DF_V4DI_UQI:
10876 case V2DI_FTYPE_V2DF_V2DI_UQI:
10877 case V4SI_FTYPE_V4DF_V4SI_UQI:
10878 case V4SI_FTYPE_V2DF_V4SI_UQI:
10879 case V4SI_FTYPE_V8HI_V4SI_UQI:
10880 case V4SI_FTYPE_V16QI_V4SI_UQI:
10881 case V4DI_FTYPE_V4DI_V4DI_V4DI:
10882 case V8DF_FTYPE_V2DF_V8DF_UQI:
10883 case V8DF_FTYPE_V4DF_V8DF_UQI:
10884 case V8DF_FTYPE_V8DF_V8DF_UQI:
10885 case V8SF_FTYPE_V8SF_V8SF_UQI:
10886 case V8SF_FTYPE_V8SI_V8SF_UQI:
10887 case V4DF_FTYPE_V4DF_V4DF_UQI:
10888 case V4SF_FTYPE_V4SF_V4SF_UQI:
10889 case V2DF_FTYPE_V2DF_V2DF_UQI:
10890 case V2DF_FTYPE_V4SF_V2DF_UQI:
10891 case V2DF_FTYPE_V4SI_V2DF_UQI:
10892 case V4SF_FTYPE_V4SI_V4SF_UQI:
10893 case V4DF_FTYPE_V4SF_V4DF_UQI:
10894 case V4DF_FTYPE_V4SI_V4DF_UQI:
10895 case V8SI_FTYPE_V8SI_V8SI_UQI:
10896 case V8SI_FTYPE_V8HI_V8SI_UQI:
10897 case V8SI_FTYPE_V16QI_V8SI_UQI:
10898 case V8DF_FTYPE_V8SI_V8DF_UQI:
10899 case V8DI_FTYPE_DI_V8DI_UQI:
10900 case V16SF_FTYPE_V8SF_V16SF_UHI:
10901 case V16SI_FTYPE_V8SI_V16SI_UHI:
10902 case V16HF_FTYPE_V16HI_V16HF_UHI:
10903 case V16HF_FTYPE_V16HF_V16HF_V16HF:
10904 case V16HI_FTYPE_V16HF_V16HI_UHI:
10905 case V16HI_FTYPE_V16HI_V16HI_UHI:
10906 case V8HI_FTYPE_V16QI_V8HI_UQI:
10907 case V16HI_FTYPE_V16QI_V16HI_UHI:
10908 case V32HI_FTYPE_V32HI_V32HI_USI:
10909 case V32HI_FTYPE_V32QI_V32HI_USI:
10910 case V8DI_FTYPE_V16QI_V8DI_UQI:
10911 case V8DI_FTYPE_V2DI_V8DI_UQI:
10912 case V8DI_FTYPE_V4DI_V8DI_UQI:
10913 case V8DI_FTYPE_V8DI_V8DI_UQI:
10914 case V8DI_FTYPE_V8HI_V8DI_UQI:
10915 case V8DI_FTYPE_V8SI_V8DI_UQI:
10916 case V8HI_FTYPE_V8DI_V8HI_UQI:
10917 case V8SI_FTYPE_V8DI_V8SI_UQI:
10918 case V4SI_FTYPE_V4SI_V4SI_V4SI:
10919 case V16SI_FTYPE_V16SI_V16SI_V16SI:
10920 case V8DI_FTYPE_V8DI_V8DI_V8DI:
10921 case V32HI_FTYPE_V32HI_V32HI_V32HI:
10922 case V2DI_FTYPE_V2DI_V2DI_V2DI:
10923 case V16HI_FTYPE_V16HI_V16HI_V16HI:
10924 case V8SI_FTYPE_V8SI_V8SI_V8SI:
10925 case V8HI_FTYPE_V8HI_V8HI_V8HI:
10926 case V32BF_FTYPE_V16SF_V16SF_USI:
10927 case V16BF_FTYPE_V8SF_V8SF_UHI:
10928 case V8BF_FTYPE_V4SF_V4SF_UQI:
10929 case V16BF_FTYPE_V16SF_V16BF_UHI:
10930 case V8BF_FTYPE_V8SF_V8BF_UQI:
10931 case V8BF_FTYPE_V4SF_V8BF_UQI:
10932 case V16SF_FTYPE_V16SF_V32BF_V32BF:
10933 case V8SF_FTYPE_V8SF_V16BF_V16BF:
10934 case V4SF_FTYPE_V4SF_V8BF_V8BF:
10935 nargs = 3;
10936 break;
10937 case V32QI_FTYPE_V32QI_V32QI_INT:
10938 case V16HI_FTYPE_V16HI_V16HI_INT:
10939 case V16QI_FTYPE_V16QI_V16QI_INT:
10940 case V4DI_FTYPE_V4DI_V4DI_INT:
10941 case V8HI_FTYPE_V8HI_V8HI_INT:
10942 case V8SI_FTYPE_V8SI_V8SI_INT:
10943 case V8SI_FTYPE_V8SI_V4SI_INT:
10944 case V8SF_FTYPE_V8SF_V8SF_INT:
10945 case V8SF_FTYPE_V8SF_V4SF_INT:
10946 case V4SI_FTYPE_V4SI_V4SI_INT:
10947 case V4DF_FTYPE_V4DF_V4DF_INT:
10948 case V16SF_FTYPE_V16SF_V16SF_INT:
10949 case V16SF_FTYPE_V16SF_V4SF_INT:
10950 case V16SI_FTYPE_V16SI_V4SI_INT:
10951 case V4DF_FTYPE_V4DF_V2DF_INT:
10952 case V4SF_FTYPE_V4SF_V4SF_INT:
10953 case V2DI_FTYPE_V2DI_V2DI_INT:
10954 case V4DI_FTYPE_V4DI_V2DI_INT:
10955 case V2DF_FTYPE_V2DF_V2DF_INT:
10956 case UQI_FTYPE_V8DI_V8UDI_INT:
10957 case UQI_FTYPE_V8DF_V8DF_INT:
10958 case UQI_FTYPE_V2DF_V2DF_INT:
10959 case UQI_FTYPE_V4SF_V4SF_INT:
10960 case UHI_FTYPE_V16SI_V16SI_INT:
10961 case UHI_FTYPE_V16SF_V16SF_INT:
10962 case V64QI_FTYPE_V64QI_V64QI_INT:
10963 case V32HI_FTYPE_V32HI_V32HI_INT:
10964 case V16SI_FTYPE_V16SI_V16SI_INT:
10965 case V8DI_FTYPE_V8DI_V8DI_INT:
10966 nargs = 3;
10967 nargs_constant = 1;
10968 break;
10969 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
10970 nargs = 3;
10971 rmode = V4DImode;
10972 nargs_constant = 1;
10973 break;
10974 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
10975 nargs = 3;
10976 rmode = V2DImode;
10977 nargs_constant = 1;
10978 break;
10979 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
10980 nargs = 3;
10981 rmode = DImode;
10982 nargs_constant = 1;
10983 break;
10984 case V2DI_FTYPE_V2DI_UINT_UINT:
10985 nargs = 3;
10986 nargs_constant = 2;
10987 break;
10988 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
10989 nargs = 3;
10990 rmode = V8DImode;
10991 nargs_constant = 1;
10992 break;
10993 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
10994 nargs = 5;
10995 rmode = V8DImode;
10996 mask_pos = 2;
10997 nargs_constant = 1;
10998 break;
10999 case QI_FTYPE_V8DF_INT_UQI:
11000 case QI_FTYPE_V4DF_INT_UQI:
11001 case QI_FTYPE_V2DF_INT_UQI:
11002 case HI_FTYPE_V16SF_INT_UHI:
11003 case QI_FTYPE_V8SF_INT_UQI:
11004 case QI_FTYPE_V4SF_INT_UQI:
11005 case QI_FTYPE_V8HF_INT_UQI:
11006 case HI_FTYPE_V16HF_INT_UHI:
11007 case SI_FTYPE_V32HF_INT_USI:
11008 case V4SI_FTYPE_V4SI_V4SI_UHI:
11009 case V8SI_FTYPE_V8SI_V8SI_UHI:
11010 nargs = 3;
11011 mask_pos = 1;
11012 nargs_constant = 1;
11013 break;
11014 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
11015 nargs = 5;
11016 rmode = V4DImode;
11017 mask_pos = 2;
11018 nargs_constant = 1;
11019 break;
11020 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
11021 nargs = 5;
11022 rmode = V2DImode;
11023 mask_pos = 2;
11024 nargs_constant = 1;
11025 break;
11026 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
11027 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
11028 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
11029 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
11030 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
11031 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
11032 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
11033 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
11034 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
11035 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
11036 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
11037 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
11038 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
11039 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
11040 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
11041 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
11042 case V32HF_FTYPE_V32HF_V32HF_V32HF_USI:
11043 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
11044 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
11045 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
11046 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
11047 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
11048 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
11049 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
11050 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
11051 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
11052 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
11053 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
11054 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
11055 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
11056 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
11057 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
11058 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
11059 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
11060 case V16HF_FTYPE_V16HF_V16HF_V16HF_UQI:
11061 case V16HF_FTYPE_V16HF_V16HF_V16HF_UHI:
11062 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
11063 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
11064 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
11065 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
11066 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
11067 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
11068 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
11069 case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI:
11070 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
11071 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
11072 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
11073 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
11074 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
11075 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
11076 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
11077 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
11078 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
11079 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
11080 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
11081 case V32BF_FTYPE_V16SF_V16SF_V32BF_USI:
11082 case V16BF_FTYPE_V8SF_V8SF_V16BF_UHI:
11083 case V8BF_FTYPE_V4SF_V4SF_V8BF_UQI:
11084 nargs = 4;
11085 break;
11086 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
11087 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
11088 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
11089 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
11090 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
11091 nargs = 4;
11092 nargs_constant = 1;
11093 break;
11094 case UQI_FTYPE_V4DI_V4DI_INT_UQI:
11095 case UQI_FTYPE_V8SI_V8SI_INT_UQI:
11096 case QI_FTYPE_V4DF_V4DF_INT_UQI:
11097 case QI_FTYPE_V8SF_V8SF_INT_UQI:
11098 case UHI_FTYPE_V16HF_V16HF_INT_UHI:
11099 case UQI_FTYPE_V2DI_V2DI_INT_UQI:
11100 case UQI_FTYPE_V4SI_V4SI_INT_UQI:
11101 case UQI_FTYPE_V2DF_V2DF_INT_UQI:
11102 case UQI_FTYPE_V4SF_V4SF_INT_UQI:
11103 case UQI_FTYPE_V8HF_V8HF_INT_UQI:
11104 case UDI_FTYPE_V64QI_V64QI_INT_UDI:
11105 case USI_FTYPE_V32QI_V32QI_INT_USI:
11106 case UHI_FTYPE_V16QI_V16QI_INT_UHI:
11107 case USI_FTYPE_V32HI_V32HI_INT_USI:
11108 case USI_FTYPE_V32HF_V32HF_INT_USI:
11109 case UHI_FTYPE_V16HI_V16HI_INT_UHI:
11110 case UQI_FTYPE_V8HI_V8HI_INT_UQI:
11111 nargs = 4;
11112 mask_pos = 1;
11113 nargs_constant = 1;
11114 break;
11115 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
11116 nargs = 4;
11117 nargs_constant = 2;
11118 break;
11119 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
11120 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
11121 case V16SF_FTYPE_V16SF_V32BF_V32BF_UHI:
11122 case V8SF_FTYPE_V8SF_V16BF_V16BF_UQI:
11123 case V4SF_FTYPE_V4SF_V8BF_V8BF_UQI:
11124 nargs = 4;
11125 break;
11126 case UQI_FTYPE_V8DI_V8DI_INT_UQI:
11127 case UHI_FTYPE_V16SI_V16SI_INT_UHI:
11128 mask_pos = 1;
11129 nargs = 4;
11130 nargs_constant = 1;
11131 break;
11132 case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
11133 case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
11134 case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
11135 case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
11136 case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
11137 case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
11138 case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
11139 case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
11140 case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
11141 case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
11142 case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
11143 case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
11144 case V32HI_FTYPE_V32HI_INT_V32HI_USI:
11145 case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
11146 case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
11147 case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
11148 case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
11149 case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
11150 case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
11151 case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
11152 case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
11153 case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
11154 case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
11155 case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
11156 case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
11157 case V16HF_FTYPE_V16HF_INT_V16HF_UHI:
11158 case V8HF_FTYPE_V8HF_INT_V8HF_UQI:
11159 case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
11160 case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
11161 case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
11162 case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
11163 case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
11164 nargs = 4;
11165 mask_pos = 2;
11166 nargs_constant = 1;
11167 break;
11168 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
11169 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
11170 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
11171 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
11172 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
11173 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
11174 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
11175 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
11176 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
11177 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
11178 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
11179 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
11180 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
11181 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
11182 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
11183 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
11184 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
11185 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
11186 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
11187 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
11188 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
11189 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
11190 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
11191 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
11192 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
11193 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
11194 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
11195 nargs = 5;
11196 mask_pos = 2;
11197 nargs_constant = 1;
11198 break;
11199 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
11200 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
11201 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
11202 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
11203 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
11204 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
11205 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
11206 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
11207 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
11208 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
11209 nargs = 5;
11210 mask_pos = 1;
11211 nargs_constant = 1;
11212 break;
11213 case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
11214 case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
11215 case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
11216 case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT:
11217 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT:
11218 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT:
11219 case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT:
11220 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT:
11221 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT:
11222 case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT:
11223 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT:
11224 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT:
11225 nargs = 5;
11226 mask_pos = 1;
11227 nargs_constant = 2;
11228 break;
11229
11230 default:
11231 gcc_unreachable ();
11232 }
11233
11234 gcc_assert (nargs <= ARRAY_SIZE (xops));
11235
11236 if (comparison != UNKNOWN)
11237 {
11238 gcc_assert (nargs == 2);
11239 return ix86_expand_sse_compare (d, exp, target, swap);
11240 }
11241
11242 if (rmode == VOIDmode || rmode == tmode)
11243 {
11244 if (optimize
11245 || target == 0
11246 || GET_MODE (target) != tmode
11247 || !insn_p->operand[0].predicate (target, tmode))
11248 target = gen_reg_rtx (tmode);
11249 else if (memory_operand (target, tmode))
11250 num_memory++;
11251 real_target = target;
11252 }
11253 else
11254 {
11255 real_target = gen_reg_rtx (tmode);
11256 target = lowpart_subreg (rmode, real_target, tmode);
11257 }
11258
11259 for (i = 0; i < nargs; i++)
11260 {
11261 tree arg = CALL_EXPR_ARG (exp, i);
11262 rtx op = expand_normal (arg);
11263 machine_mode mode = insn_p->operand[i + 1].mode;
11264 bool match = insn_p->operand[i + 1].predicate (op, mode);
11265
11266 if (second_arg_count && i == 1)
11267 {
11268 /* SIMD shift insns take either an 8-bit immediate or a
11269 register as the count, but the builtin functions take an int
11270 as the count.  If the count does not match, put it in a register.
11271 The instructions use a 64-bit count; if op is only
11272 32-bit, zero-extend it, since negative shift counts
11273 are undefined behavior and zero-extension is more
11274 efficient. */
11275 if (!match)
11276 {
11277 if (SCALAR_INT_MODE_P (GET_MODE (op)))
11278 op = convert_modes (mode, GET_MODE (op), op, 1);
11279 else
11280 op = lowpart_subreg (mode, op, GET_MODE (op));
11281 if (!insn_p->operand[i + 1].predicate (op, mode))
11282 op = copy_to_reg (op);
11283 }
11284 }
11285 else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
11286 (!mask_pos && (nargs - i) <= nargs_constant))
11287 {
11288 if (!match)
11289 switch (icode)
11290 {
11291 case CODE_FOR_avx_vinsertf128v4di:
11292 case CODE_FOR_avx_vextractf128v4di:
11293 error ("the last argument must be a 1-bit immediate");
11294 return const0_rtx;
11295
11296 case CODE_FOR_avx512f_cmpv8di3_mask:
11297 case CODE_FOR_avx512f_cmpv16si3_mask:
11298 case CODE_FOR_avx512f_ucmpv8di3_mask:
11299 case CODE_FOR_avx512f_ucmpv16si3_mask:
11300 case CODE_FOR_avx512vl_cmpv4di3_mask:
11301 case CODE_FOR_avx512vl_cmpv8si3_mask:
11302 case CODE_FOR_avx512vl_ucmpv4di3_mask:
11303 case CODE_FOR_avx512vl_ucmpv8si3_mask:
11304 case CODE_FOR_avx512vl_cmpv2di3_mask:
11305 case CODE_FOR_avx512vl_cmpv4si3_mask:
11306 case CODE_FOR_avx512vl_ucmpv2di3_mask:
11307 case CODE_FOR_avx512vl_ucmpv4si3_mask:
11308 error ("the last argument must be a 3-bit immediate");
11309 return const0_rtx;
11310
11311 case CODE_FOR_sse4_1_roundsd:
11312 case CODE_FOR_sse4_1_roundss:
11313
11314 case CODE_FOR_sse4_1_roundpd:
11315 case CODE_FOR_sse4_1_roundps:
11316 case CODE_FOR_avx_roundpd256:
11317 case CODE_FOR_avx_roundps256:
11318
11319 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
11320 case CODE_FOR_sse4_1_roundps_sfix:
11321 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
11322 case CODE_FOR_avx_roundps_sfix256:
11323
11324 case CODE_FOR_sse4_1_blendps:
11325 case CODE_FOR_avx_blendpd256:
11326 case CODE_FOR_avx_vpermilv4df:
11327 case CODE_FOR_avx_vpermilv4df_mask:
11328 case CODE_FOR_avx512f_getmantv8df_mask:
11329 case CODE_FOR_avx512f_getmantv16sf_mask:
11330 case CODE_FOR_avx512vl_getmantv16hf_mask:
11331 case CODE_FOR_avx512vl_getmantv8sf_mask:
11332 case CODE_FOR_avx512vl_getmantv4df_mask:
11333 case CODE_FOR_avx512fp16_getmantv8hf_mask:
11334 case CODE_FOR_avx512vl_getmantv4sf_mask:
11335 case CODE_FOR_avx512vl_getmantv2df_mask:
11336 case CODE_FOR_avx512dq_rangepv8df_mask_round:
11337 case CODE_FOR_avx512dq_rangepv16sf_mask_round:
11338 case CODE_FOR_avx512dq_rangepv4df_mask:
11339 case CODE_FOR_avx512dq_rangepv8sf_mask:
11340 case CODE_FOR_avx512dq_rangepv2df_mask:
11341 case CODE_FOR_avx512dq_rangepv4sf_mask:
11342 case CODE_FOR_avx_shufpd256_mask:
11343 error ("the last argument must be a 4-bit immediate");
11344 return const0_rtx;
11345
11346 case CODE_FOR_sha1rnds4:
11347 case CODE_FOR_sse4_1_blendpd:
11348 case CODE_FOR_avx_vpermilv2df:
11349 case CODE_FOR_avx_vpermilv2df_mask:
11350 case CODE_FOR_xop_vpermil2v2df3:
11351 case CODE_FOR_xop_vpermil2v4sf3:
11352 case CODE_FOR_xop_vpermil2v4df3:
11353 case CODE_FOR_xop_vpermil2v8sf3:
11354 case CODE_FOR_avx512f_vinsertf32x4_mask:
11355 case CODE_FOR_avx512f_vinserti32x4_mask:
11356 case CODE_FOR_avx512f_vextractf32x4_mask:
11357 case CODE_FOR_avx512f_vextracti32x4_mask:
11358 case CODE_FOR_sse2_shufpd:
11359 case CODE_FOR_sse2_shufpd_mask:
11360 case CODE_FOR_avx512dq_shuf_f64x2_mask:
11361 case CODE_FOR_avx512dq_shuf_i64x2_mask:
11362 case CODE_FOR_avx512vl_shuf_i32x4_mask:
11363 case CODE_FOR_avx512vl_shuf_f32x4_mask:
11364 error ("the last argument must be a 2-bit immediate");
11365 return const0_rtx;
11366
11367 case CODE_FOR_avx_vextractf128v4df:
11368 case CODE_FOR_avx_vextractf128v8sf:
11369 case CODE_FOR_avx_vextractf128v8si:
11370 case CODE_FOR_avx_vinsertf128v4df:
11371 case CODE_FOR_avx_vinsertf128v8sf:
11372 case CODE_FOR_avx_vinsertf128v8si:
11373 case CODE_FOR_avx512f_vinsertf64x4_mask:
11374 case CODE_FOR_avx512f_vinserti64x4_mask:
11375 case CODE_FOR_avx512f_vextractf64x4_mask:
11376 case CODE_FOR_avx512f_vextracti64x4_mask:
11377 case CODE_FOR_avx512dq_vinsertf32x8_mask:
11378 case CODE_FOR_avx512dq_vinserti32x8_mask:
11379 case CODE_FOR_avx512vl_vinsertv4df:
11380 case CODE_FOR_avx512vl_vinsertv4di:
11381 case CODE_FOR_avx512vl_vinsertv8sf:
11382 case CODE_FOR_avx512vl_vinsertv8si:
11383 error ("the last argument must be a 1-bit immediate");
11384 return const0_rtx;
11385
11386 case CODE_FOR_avx_vmcmpv2df3:
11387 case CODE_FOR_avx_vmcmpv4sf3:
11388 case CODE_FOR_avx_cmpv2df3:
11389 case CODE_FOR_avx_cmpv4sf3:
11390 case CODE_FOR_avx_cmpv4df3:
11391 case CODE_FOR_avx_cmpv8sf3:
11392 case CODE_FOR_avx512f_cmpv8df3_mask:
11393 case CODE_FOR_avx512f_cmpv16sf3_mask:
11394 case CODE_FOR_avx512f_vmcmpv2df3_mask:
11395 case CODE_FOR_avx512f_vmcmpv4sf3_mask:
11396 case CODE_FOR_avx512bw_cmpv32hf3_mask:
11397 case CODE_FOR_avx512vl_cmpv16hf3_mask:
11398 case CODE_FOR_avx512fp16_cmpv8hf3_mask:
11399 error ("the last argument must be a 5-bit immediate");
11400 return const0_rtx;
11401
11402 default:
11403 switch (nargs_constant)
11404 {
11405 case 2:
11406 if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
11407 (!mask_pos && (nargs - i) == nargs_constant))
11408 {
11409 error ("the next to last argument must be an 8-bit immediate");
11410 break;
11411 }
11412 /* FALLTHRU */
11413 case 1:
11414 error ("the last argument must be an 8-bit immediate");
11415 break;
11416 default:
11417 gcc_unreachable ();
11418 }
11419 return const0_rtx;
11420 }
11421 }
11422 else
11423 {
11424 if (VECTOR_MODE_P (mode))
11425 op = safe_vector_operand (op, mode);
11426
11427 /* If we aren't optimizing, only allow one memory operand to
11428 be generated. */
11429 if (memory_operand (op, mode))
11430 num_memory++;
11431
11432 op = fixup_modeless_constant (op, mode);
11433
11434 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
11435 {
11436 if (optimize || !match || num_memory > 1)
11437 op = copy_to_mode_reg (mode, op);
11438 }
11439 else
11440 {
11441 op = copy_to_reg (op);
11442 op = lowpart_subreg (mode, op, GET_MODE (op));
11443 }
11444 }
11445
11446 xops[i] = op;
11447 }
11448
11449 switch (nargs)
11450 {
11451 case 1:
11452 pat = GEN_FCN (icode) (real_target, xops[0]);
11453 break;
11454 case 2:
11455 pat = GEN_FCN (icode) (real_target, xops[0], xops[1]);
11456 break;
11457 case 3:
11458 pat = GEN_FCN (icode) (real_target, xops[0], xops[1], xops[2]);
11459 break;
11460 case 4:
11461 pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
11462 xops[2], xops[3]);
11463 break;
11464 case 5:
11465 pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
11466 xops[2], xops[3], xops[4]);
11467 break;
11468 case 6:
11469 pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
11470 xops[2], xops[3], xops[4], xops[5]);
11471 break;
11472 default:
11473 gcc_unreachable ();
11474 }
11475
11476 if (! pat)
11477 return 0;
11478
11479 emit_insn (pat);
11480 return target;
11481 }
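/* Illustrative sketch (assumption: the AVX-512 integer compare intrinsics
   reach this expander via CODE_FOR_avx512f_cmpv16si3_mask, listed above):

     #include <immintrin.h>
     __mmask16 f (__m512i a, __m512i b)
     {
       return _mm512_cmp_epi32_mask (a, b, 3);
     }

   expands normally because 3 satisfies the 3-bit immediate predicate,
   whereas passing a run-time variable as the predicate would fail the
   operand check and take the error path above ("the last argument must
   be a 3-bit immediate"), returning const0_rtx.  */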
11482
11483 /* Transform a pattern of the following layout:
11484 (set A
11485 (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
11486 into:
11487 (set A B) */
11489
11490 static rtx
11491 ix86_erase_embedded_rounding (rtx pat)
11492 {
11493 if (GET_CODE (pat) == INSN)
11494 pat = PATTERN (pat);
11495
11496 gcc_assert (GET_CODE (pat) == SET);
11497 rtx src = SET_SRC (pat);
11498 gcc_assert (XVECLEN (src, 0) == 2);
11499 rtx p0 = XVECEXP (src, 0, 0);
11500 gcc_assert (GET_CODE (src) == UNSPEC
11501 && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
11502 rtx res = gen_rtx_SET (SET_DEST (pat), p0);
11503 return res;
11504 }
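/* Illustrative sketch of the transform performed above, using the A/B/C
   placeholders from the comment (the V16SF mode is just an example choice):

     (set (reg:V16SF A)
          (unspec:V16SF [(reg:V16SF B) (const_int C)]
                        UNSPEC_EMBEDDED_ROUNDING))

   becomes

     (set (reg:V16SF A) (reg:V16SF B))

   i.e. only the first operand of the unspec survives as the new SET_SRC.  */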
11505
11506 /* Subroutine of ix86_expand_round_builtin to take care of comi insns
11507 with rounding. */
11508 static rtx
11509 ix86_expand_sse_comi_round (const struct builtin_description *d,
11510 tree exp, rtx target)
11511 {
11512 rtx pat, set_dst;
11513 tree arg0 = CALL_EXPR_ARG (exp, 0);
11514 tree arg1 = CALL_EXPR_ARG (exp, 1);
11515 tree arg2 = CALL_EXPR_ARG (exp, 2);
11516 tree arg3 = CALL_EXPR_ARG (exp, 3);
11517 rtx op0 = expand_normal (arg0);
11518 rtx op1 = expand_normal (arg1);
11519 rtx op2 = expand_normal (arg2);
11520 rtx op3 = expand_normal (arg3);
11521 enum insn_code icode = d->icode;
11522 const struct insn_data_d *insn_p = &insn_data[icode];
11523 machine_mode mode0 = insn_p->operand[0].mode;
11524 machine_mode mode1 = insn_p->operand[1].mode;
11525
11526 /* See avxintrin.h for values. */
11527 static const enum rtx_code comparisons[32] =
11528 {
11529 EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
11530 UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED,
11531 EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
11532 UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED
11533 };
11534 static const bool ordereds[32] =
11535 {
11536 true, true, true, false, false, false, false, true,
11537 false, false, false, true, true, true, true, false,
11538 true, true, true, false, false, false, false, true,
11539 false, false, false, true, true, true, true, false
11540 };
11541 static const bool non_signalings[32] =
11542 {
11543 true, false, false, true, true, false, false, true,
11544 true, false, false, true, true, false, false, true,
11545 false, true, true, false, false, true, true, false,
11546 false, true, true, false, false, true, true, false
11547 };
11548
11549 if (!CONST_INT_P (op2))
11550 {
11551 error ("the third argument must be a comparison constant");
11552 return const0_rtx;
11553 }
11554 if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
11555 {
11556 error ("incorrect comparison mode");
11557 return const0_rtx;
11558 }
11559
11560 if (!insn_p->operand[2].predicate (op3, SImode))
11561 {
11562 error ("incorrect rounding operand");
11563 return const0_rtx;
11564 }
11565
11566 if (VECTOR_MODE_P (mode0))
11567 op0 = safe_vector_operand (op0, mode0);
11568 if (VECTOR_MODE_P (mode1))
11569 op1 = safe_vector_operand (op1, mode1);
11570
11571 enum rtx_code comparison = comparisons[INTVAL (op2)];
11572 bool ordered = ordereds[INTVAL (op2)];
11573 bool non_signaling = non_signalings[INTVAL (op2)];
11574 rtx const_val = const0_rtx;
11575
11576 bool check_unordered = false;
11577 machine_mode mode = CCFPmode;
11578 switch (comparison)
11579 {
11580 case ORDERED:
11581 if (!ordered)
11582 {
11583 /* NB: Use CCSmode/NE for _CMP_TRUE_UQ/_CMP_TRUE_US. */
11584 if (!non_signaling)
11585 ordered = true;
11586 mode = CCSmode;
11587 }
11588 else
11589 {
11590 /* NB: Use CCPmode/NE for _CMP_ORD_Q/_CMP_ORD_S. */
11591 if (non_signaling)
11592 ordered = false;
11593 mode = CCPmode;
11594 }
11595 comparison = NE;
11596 break;
11597 case UNORDERED:
11598 if (ordered)
11599 {
11600 /* NB: Use CCSmode/EQ for _CMP_FALSE_OQ/_CMP_FALSE_OS. */
11601 if (non_signaling)
11602 ordered = false;
11603 mode = CCSmode;
11604 }
11605 else
11606 {
11607 /* NB: Use CCPmode/NE for _CMP_UNORD_Q/_CMP_UNORD_S. */
11608 if (!non_signaling)
11609 ordered = true;
11610 mode = CCPmode;
11611 }
11612 comparison = EQ;
11613 break;
11614
11615 case LE: /* -> GE */
11616 case LT: /* -> GT */
11617 case UNGE: /* -> UNLE */
11618 case UNGT: /* -> UNLT */
11619 std::swap (op0, op1);
11620 comparison = swap_condition (comparison);
11621 /* FALLTHRU */
11622 case GT:
11623 case GE:
11624 case UNEQ:
11625 case UNLT:
11626 case UNLE:
11627 case LTGT:
11628 /* These are supported by CCFPmode. NB: Use ordered/signaling
11629 COMI or unordered/non-signaling UCOMI. Both set ZF, PF, CF
11630 with NAN operands. */
11631 if (ordered == non_signaling)
11632 ordered = !ordered;
11633 break;
11634 case EQ:
11635 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
11636 _CMP_EQ_OQ/_CMP_EQ_OS. */
11637 check_unordered = true;
11638 mode = CCZmode;
11639 break;
11640 case NE:
11641 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
11642 _CMP_NEQ_UQ/_CMP_NEQ_US. */
11643 gcc_assert (!ordered);
11644 check_unordered = true;
11645 mode = CCZmode;
11646 const_val = const1_rtx;
11647 break;
11648 default:
11649 gcc_unreachable ();
11650 }
11651
11652 target = gen_reg_rtx (SImode);
11653 emit_move_insn (target, const_val);
11654 target = gen_rtx_SUBREG (QImode, target, 0);
11655
11656 if ((optimize && !register_operand (op0, mode0))
11657 || !insn_p->operand[0].predicate (op0, mode0))
11658 op0 = copy_to_mode_reg (mode0, op0);
11659 if ((optimize && !register_operand (op1, mode1))
11660 || !insn_p->operand[1].predicate (op1, mode1))
11661 op1 = copy_to_mode_reg (mode1, op1);
11662
11663 /*
11664 1. COMI: ordered and signaling.
11665 2. UCOMI: unordered and non-signaling.
11666 */
11667 if (non_signaling)
11668 icode = (icode == CODE_FOR_sse_comi_round
11669 ? CODE_FOR_sse_ucomi_round
11670 : CODE_FOR_sse2_ucomi_round);
11671
11672 pat = GEN_FCN (icode) (op0, op1, op3);
11673 if (! pat)
11674 return 0;
11675
11676 /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
11677 if (INTVAL (op3) == NO_ROUND)
11678 {
11679 pat = ix86_erase_embedded_rounding (pat);
11680 if (! pat)
11681 return 0;
11682
11683 set_dst = SET_DEST (pat);
11684 }
11685 else
11686 {
11687 gcc_assert (GET_CODE (pat) == SET);
11688 set_dst = SET_DEST (pat);
11689 }
11690
11691 emit_insn (pat);
11692
11693 return ix86_ssecom_setcc (comparison, check_unordered, mode,
11694 set_dst, target);
11695 }
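/* Illustrative trace (not authoritative; it simply follows the tables and
   switch above): for

     #include <immintrin.h>
     int f (__m128 a, __m128 b)
     {
       return _mm_comi_round_ss (a, b, _CMP_EQ_OQ, _MM_FROUND_NO_EXC);
     }

   op2 == 0 selects comparison EQ, ordered and non-signaling.  The EQ case
   above therefore uses CCZmode with check_unordered set, the non-signaling
   flag switches the icode to the UCOMI variant, and the final value is
   materialized by ix86_ssecom_setcc.  */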
11696
11697 static rtx
11698 ix86_expand_round_builtin (const struct builtin_description *d,
11699 tree exp, rtx target)
11700 {
11701 rtx pat;
11702 unsigned int i, nargs;
11703 rtx xops[6];
11704 enum insn_code icode = d->icode;
11705 const struct insn_data_d *insn_p = &insn_data[icode];
11706 machine_mode tmode = insn_p->operand[0].mode;
11707 unsigned int nargs_constant = 0;
11708 unsigned int redundant_embed_rnd = 0;
11709
11710 switch ((enum ix86_builtin_func_type) d->flag)
11711 {
11712 case UINT64_FTYPE_V2DF_INT:
11713 case UINT64_FTYPE_V4SF_INT:
11714 case UINT64_FTYPE_V8HF_INT:
11715 case UINT_FTYPE_V2DF_INT:
11716 case UINT_FTYPE_V4SF_INT:
11717 case UINT_FTYPE_V8HF_INT:
11718 case INT64_FTYPE_V2DF_INT:
11719 case INT64_FTYPE_V4SF_INT:
11720 case INT64_FTYPE_V8HF_INT:
11721 case INT_FTYPE_V2DF_INT:
11722 case INT_FTYPE_V4SF_INT:
11723 case INT_FTYPE_V8HF_INT:
11724 nargs = 2;
11725 break;
11726 case V32HF_FTYPE_V32HF_V32HF_INT:
11727 case V8HF_FTYPE_V8HF_V8HF_INT:
11728 case V8HF_FTYPE_V8HF_INT_INT:
11729 case V8HF_FTYPE_V8HF_UINT_INT:
11730 case V8HF_FTYPE_V8HF_INT64_INT:
11731 case V8HF_FTYPE_V8HF_UINT64_INT:
11732 case V4SF_FTYPE_V4SF_UINT_INT:
11733 case V4SF_FTYPE_V4SF_UINT64_INT:
11734 case V2DF_FTYPE_V2DF_UINT64_INT:
11735 case V4SF_FTYPE_V4SF_INT_INT:
11736 case V4SF_FTYPE_V4SF_INT64_INT:
11737 case V2DF_FTYPE_V2DF_INT64_INT:
11738 case V4SF_FTYPE_V4SF_V4SF_INT:
11739 case V2DF_FTYPE_V2DF_V2DF_INT:
11740 case V4SF_FTYPE_V4SF_V2DF_INT:
11741 case V2DF_FTYPE_V2DF_V4SF_INT:
11742 nargs = 3;
11743 break;
11744 case V8SF_FTYPE_V8DF_V8SF_QI_INT:
11745 case V8DF_FTYPE_V8DF_V8DF_QI_INT:
11746 case V32HI_FTYPE_V32HF_V32HI_USI_INT:
11747 case V8SI_FTYPE_V8DF_V8SI_QI_INT:
11748 case V8DI_FTYPE_V8HF_V8DI_UQI_INT:
11749 case V8DI_FTYPE_V8DF_V8DI_QI_INT:
11750 case V8SF_FTYPE_V8DI_V8SF_QI_INT:
11751 case V8DF_FTYPE_V8DI_V8DF_QI_INT:
11752 case V8DF_FTYPE_V8HF_V8DF_UQI_INT:
11753 case V16SF_FTYPE_V16HF_V16SF_UHI_INT:
11754 case V32HF_FTYPE_V32HI_V32HF_USI_INT:
11755 case V32HF_FTYPE_V32HF_V32HF_USI_INT:
11756 case V32HF_FTYPE_V32HF_V32HF_V32HF_INT:
11757 case V16SF_FTYPE_V16SF_V16SF_HI_INT:
11758 case V8DI_FTYPE_V8SF_V8DI_QI_INT:
11759 case V16SF_FTYPE_V16SI_V16SF_HI_INT:
11760 case V16SI_FTYPE_V16SF_V16SI_HI_INT:
11761 case V16SI_FTYPE_V16HF_V16SI_UHI_INT:
11762 case V16HF_FTYPE_V16SI_V16HF_UHI_INT:
11763 case V8DF_FTYPE_V8SF_V8DF_QI_INT:
11764 case V16SF_FTYPE_V16HI_V16SF_HI_INT:
11765 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
11766 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
11767 case V8HF_FTYPE_V8DI_V8HF_UQI_INT:
11768 case V8HF_FTYPE_V8DF_V8HF_UQI_INT:
11769 case V16HF_FTYPE_V16SF_V16HF_UHI_INT:
11770 case V8HF_FTYPE_V8HF_V8HF_V8HF_INT:
11771 nargs = 4;
11772 break;
11773 case V4SF_FTYPE_V4SF_V4SF_INT_INT:
11774 case V2DF_FTYPE_V2DF_V2DF_INT_INT:
11775 nargs_constant = 2;
11776 nargs = 4;
11777 break;
11778 case INT_FTYPE_V4SF_V4SF_INT_INT:
11779 case INT_FTYPE_V2DF_V2DF_INT_INT:
11780 return ix86_expand_sse_comi_round (d, exp, target);
11781 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
11782 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
11783 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
11784 case V4SF_FTYPE_V8HF_V4SF_V4SF_UQI_INT:
11785 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
11786 case V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT:
11787 case V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT:
11788 case V2DF_FTYPE_V8HF_V2DF_V2DF_UQI_INT:
11789 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
11790 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
11791 case V2DF_FTYPE_V2DF_V4SF_V2DF_UQI_INT:
11792 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
11793 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
11794 case V4SF_FTYPE_V4SF_V2DF_V4SF_UQI_INT:
11795 case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT:
11796 case V8HF_FTYPE_V2DF_V8HF_V8HF_UQI_INT:
11797 case V8HF_FTYPE_V4SF_V8HF_V8HF_UQI_INT:
11798 nargs = 5;
11799 break;
11800 case V32HF_FTYPE_V32HF_INT_V32HF_USI_INT:
11801 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
11802 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
11803 case V8DF_FTYPE_V8DF_INT_V8DF_UQI_INT:
11804 case V16SF_FTYPE_V16SF_INT_V16SF_UHI_INT:
11805 nargs_constant = 4;
11806 nargs = 5;
11807 break;
11808 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
11809 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
11810 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
11811 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
11812 case USI_FTYPE_V32HF_V32HF_INT_USI_INT:
11813 case UQI_FTYPE_V8HF_V8HF_INT_UQI_INT:
11814 nargs_constant = 3;
11815 nargs = 5;
11816 break;
11817 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
11818 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
11819 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
11820 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
11821 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
11822 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
11823 case V8HF_FTYPE_V8HF_V8HF_INT_V8HF_UQI_INT:
11824 nargs = 6;
11825 nargs_constant = 4;
11826 break;
11827 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
11828 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
11829 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
11830 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
11831 nargs = 6;
11832 nargs_constant = 3;
11833 break;
11834 default:
11835 gcc_unreachable ();
11836 }
11837 gcc_assert (nargs <= ARRAY_SIZE (xops));
11838
11839 if (optimize
11840 || target == 0
11841 || GET_MODE (target) != tmode
11842 || !insn_p->operand[0].predicate (target, tmode))
11843 target = gen_reg_rtx (tmode);
11844
11845 for (i = 0; i < nargs; i++)
11846 {
11847 tree arg = CALL_EXPR_ARG (exp, i);
11848 rtx op = expand_normal (arg);
11849 machine_mode mode = insn_p->operand[i + 1].mode;
11850 bool match = insn_p->operand[i + 1].predicate (op, mode);
11851
11852 if (i == nargs - nargs_constant)
11853 {
11854 if (!match)
11855 {
11856 switch (icode)
11857 {
11858 case CODE_FOR_avx512f_getmantv8df_mask_round:
11859 case CODE_FOR_avx512f_getmantv16sf_mask_round:
11860 case CODE_FOR_avx512bw_getmantv32hf_mask_round:
11861 case CODE_FOR_avx512f_vgetmantv2df_round:
11862 case CODE_FOR_avx512f_vgetmantv2df_mask_round:
11863 case CODE_FOR_avx512f_vgetmantv4sf_round:
11864 case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
11865 case CODE_FOR_avx512f_vgetmantv8hf_mask_round:
11866 error ("the immediate argument must be a 4-bit immediate");
11867 return const0_rtx;
11868 case CODE_FOR_avx512f_cmpv8df3_mask_round:
11869 case CODE_FOR_avx512f_cmpv16sf3_mask_round:
11870 case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
11871 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
11872 case CODE_FOR_avx512f_vmcmpv8hf3_mask_round:
11873 case CODE_FOR_avx512bw_cmpv32hf3_mask_round:
11874 error ("the immediate argument must be a 5-bit immediate");
11875 return const0_rtx;
11876 default:
11877 error ("the immediate argument must be an 8-bit immediate");
11878 return const0_rtx;
11879 }
11880 }
11881 }
11882 else if (i == nargs-1)
11883 {
11884 if (!insn_p->operand[nargs].predicate (op, SImode))
11885 {
11886 error ("incorrect rounding operand");
11887 return const0_rtx;
11888 }
11889
11890 /* If there is no rounding, use the normal version of the pattern. */
11891 if (INTVAL (op) == NO_ROUND)
11892 {
11893 /* Skip erasing the embedded rounding for the expanders below,
11894 which generate multiple insns.  In ix86_erase_embedded_rounding
11895 the pattern would be transformed into a single set, and emit_insn
11896 appends that set instead of inserting it into the chain, so the
11897 other insns emitted inside the define_expand would be ignored. */
11898 switch (icode)
11899 {
11900 case CODE_FOR_avx512bw_fmaddc_v32hf_mask1_round:
11901 case CODE_FOR_avx512bw_fcmaddc_v32hf_mask1_round:
11902 case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask1_round:
11903 case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask1_round:
11904 case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask3_round:
11905 case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask3_round:
11906 redundant_embed_rnd = 0;
11907 break;
11908 default:
11909 redundant_embed_rnd = 1;
11910 break;
11911 }
11912 }
11913 }
11914 else
11915 {
11916 if (VECTOR_MODE_P (mode))
11917 op = safe_vector_operand (op, mode);
11918
11919 op = fixup_modeless_constant (op, mode);
11920
11921 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
11922 {
11923 if (optimize || !match)
11924 op = copy_to_mode_reg (mode, op);
11925 }
11926 else
11927 {
11928 op = copy_to_reg (op);
11929 op = lowpart_subreg (mode, op, GET_MODE (op));
11930 }
11931 }
11932
11933 xops[i] = op;
11934 }
11935
11936 switch (nargs)
11937 {
11938 case 1:
11939 pat = GEN_FCN (icode) (target, xops[0]);
11940 break;
11941 case 2:
11942 pat = GEN_FCN (icode) (target, xops[0], xops[1]);
11943 break;
11944 case 3:
11945 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
11946 break;
11947 case 4:
11948 pat = GEN_FCN (icode) (target, xops[0], xops[1],
11949 xops[2], xops[3]);
11950 break;
11951 case 5:
11952 pat = GEN_FCN (icode) (target, xops[0], xops[1],
11953 xops[2], xops[3], xops[4]);
11954 break;
11955 case 6:
11956 pat = GEN_FCN (icode) (target, xops[0], xops[1],
11957 xops[2], xops[3], xops[4], xops[5]);
11958 break;
11959 default:
11960 gcc_unreachable ();
11961 }
11962
11963 if (!pat)
11964 return 0;
11965
11966 if (redundant_embed_rnd)
11967 pat = ix86_erase_embedded_rounding (pat);
11968
11969 emit_insn (pat);
11970 return target;
11971 }
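/* Illustrative sketch (assumptions: _MM_FROUND_CUR_DIRECTION corresponds
   to NO_ROUND and _mm512_add_round_ps is expanded through this function):

     #include <immintrin.h>
     __m512 f (__m512 a, __m512 b)
     {
       return _mm512_add_round_ps (a, b, _MM_FROUND_CUR_DIRECTION);
     }

   has its rounding operand recognized as NO_ROUND, so redundant_embed_rnd
   is set and ix86_erase_embedded_rounding strips the embedded-rounding
   unspec, leaving a plain vector add.  Passing
   _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC instead keeps the
   embedded rounding in the emitted pattern.  */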
11972
11973 /* Subroutine of ix86_expand_builtin to take care of special insns
11974 with variable number of operands. */
11975
11976 static rtx
11977 ix86_expand_special_args_builtin (const struct builtin_description *d,
11978 tree exp, rtx target)
11979 {
11980 tree arg;
11981 rtx pat, op;
11982 unsigned int i, nargs, arg_adjust, memory;
11983 unsigned int constant = 100;
11984 bool aligned_mem = false;
11985 rtx xops[4];
11986 enum insn_code icode = d->icode;
11987 const struct insn_data_d *insn_p = &insn_data[icode];
11988 machine_mode tmode = insn_p->operand[0].mode;
11989 enum { load, store } klass;
11990
11991 switch ((enum ix86_builtin_func_type) d->flag)
11992 {
11993 case VOID_FTYPE_VOID:
11994 emit_insn (GEN_FCN (icode) (target));
11995 return 0;
11996 case VOID_FTYPE_UINT64:
11997 case VOID_FTYPE_UNSIGNED:
11998 nargs = 0;
11999 klass = store;
12000 memory = 0;
12001 break;
12002
12003 case INT_FTYPE_VOID:
12004 case USHORT_FTYPE_VOID:
12005 case UINT64_FTYPE_VOID:
12006 case UINT_FTYPE_VOID:
12007 case UINT8_FTYPE_VOID:
12008 case UNSIGNED_FTYPE_VOID:
12009 nargs = 0;
12010 klass = load;
12011 memory = 0;
12012 break;
12013 case UINT64_FTYPE_PUNSIGNED:
12014 case V2DI_FTYPE_PV2DI:
12015 case V4DI_FTYPE_PV4DI:
12016 case V32QI_FTYPE_PCCHAR:
12017 case V16QI_FTYPE_PCCHAR:
12018 case V8SF_FTYPE_PCV4SF:
12019 case V8SF_FTYPE_PCFLOAT:
12020 case V4SF_FTYPE_PCFLOAT:
12021 case V4SF_FTYPE_PCFLOAT16:
12022 case V4SF_FTYPE_PCBFLOAT16:
12023 case V4SF_FTYPE_PCV8BF:
12024 case V4SF_FTYPE_PCV8HF:
12025 case V8SF_FTYPE_PCFLOAT16:
12026 case V8SF_FTYPE_PCBFLOAT16:
12027 case V8SF_FTYPE_PCV16HF:
12028 case V8SF_FTYPE_PCV16BF:
12029 case V4DF_FTYPE_PCV2DF:
12030 case V4DF_FTYPE_PCDOUBLE:
12031 case V2DF_FTYPE_PCDOUBLE:
12032 case VOID_FTYPE_PVOID:
12033 case V8DI_FTYPE_PV8DI:
12034 nargs = 1;
12035 klass = load;
12036 memory = 0;
12037 switch (icode)
12038 {
12039 case CODE_FOR_sse4_1_movntdqa:
12040 case CODE_FOR_avx2_movntdqa:
12041 case CODE_FOR_avx512f_movntdqa:
12042 aligned_mem = true;
12043 break;
12044 default:
12045 break;
12046 }
12047 break;
12048 case VOID_FTYPE_PV2SF_V4SF:
12049 case VOID_FTYPE_PV8DI_V8DI:
12050 case VOID_FTYPE_PV4DI_V4DI:
12051 case VOID_FTYPE_PV2DI_V2DI:
12052 case VOID_FTYPE_PCHAR_V32QI:
12053 case VOID_FTYPE_PCHAR_V16QI:
12054 case VOID_FTYPE_PFLOAT_V16SF:
12055 case VOID_FTYPE_PFLOAT_V8SF:
12056 case VOID_FTYPE_PFLOAT_V4SF:
12057 case VOID_FTYPE_PDOUBLE_V8DF:
12058 case VOID_FTYPE_PDOUBLE_V4DF:
12059 case VOID_FTYPE_PDOUBLE_V2DF:
12060 case VOID_FTYPE_PLONGLONG_LONGLONG:
12061 case VOID_FTYPE_PULONGLONG_ULONGLONG:
12062 case VOID_FTYPE_PUNSIGNED_UNSIGNED:
12063 case VOID_FTYPE_PINT_INT:
12064 nargs = 1;
12065 klass = store;
12066 /* Reserve memory operand for target. */
12067 memory = ARRAY_SIZE (xops);
12068 switch (icode)
12069 {
12070 /* These builtins and instructions require the memory
12071 to be properly aligned. */
12072 case CODE_FOR_avx_movntv4di:
12073 case CODE_FOR_sse2_movntv2di:
12074 case CODE_FOR_avx_movntv8sf:
12075 case CODE_FOR_sse_movntv4sf:
12076 case CODE_FOR_sse4a_vmmovntv4sf:
12077 case CODE_FOR_avx_movntv4df:
12078 case CODE_FOR_sse2_movntv2df:
12079 case CODE_FOR_sse4a_vmmovntv2df:
12080 case CODE_FOR_sse2_movntidi:
12081 case CODE_FOR_sse_movntq:
12082 case CODE_FOR_sse2_movntisi:
12083 case CODE_FOR_avx512f_movntv16sf:
12084 case CODE_FOR_avx512f_movntv8df:
12085 case CODE_FOR_avx512f_movntv8di:
12086 aligned_mem = true;
12087 break;
12088 default:
12089 break;
12090 }
12091 break;
12092 case VOID_FTYPE_PVOID_PCVOID:
12093 nargs = 1;
12094 klass = store;
12095 memory = 0;
12096
12097 break;
12098 case V4SF_FTYPE_V4SF_PCV2SF:
12099 case V2DF_FTYPE_V2DF_PCDOUBLE:
12100 nargs = 2;
12101 klass = load;
12102 memory = 1;
12103 break;
12104 case V8SF_FTYPE_PCV8SF_V8SI:
12105 case V4DF_FTYPE_PCV4DF_V4DI:
12106 case V4SF_FTYPE_PCV4SF_V4SI:
12107 case V2DF_FTYPE_PCV2DF_V2DI:
12108 case V8SI_FTYPE_PCV8SI_V8SI:
12109 case V4DI_FTYPE_PCV4DI_V4DI:
12110 case V4SI_FTYPE_PCV4SI_V4SI:
12111 case V2DI_FTYPE_PCV2DI_V2DI:
12112 case VOID_FTYPE_INT_INT64:
12113 nargs = 2;
12114 klass = load;
12115 memory = 0;
12116 break;
12117 case VOID_FTYPE_PV8DF_V8DF_UQI:
12118 case VOID_FTYPE_PV4DF_V4DF_UQI:
12119 case VOID_FTYPE_PV2DF_V2DF_UQI:
12120 case VOID_FTYPE_PV16SF_V16SF_UHI:
12121 case VOID_FTYPE_PV8SF_V8SF_UQI:
12122 case VOID_FTYPE_PV4SF_V4SF_UQI:
12123 case VOID_FTYPE_PV8DI_V8DI_UQI:
12124 case VOID_FTYPE_PV4DI_V4DI_UQI:
12125 case VOID_FTYPE_PV2DI_V2DI_UQI:
12126 case VOID_FTYPE_PV16SI_V16SI_UHI:
12127 case VOID_FTYPE_PV8SI_V8SI_UQI:
12128 case VOID_FTYPE_PV4SI_V4SI_UQI:
12129 case VOID_FTYPE_PV64QI_V64QI_UDI:
12130 case VOID_FTYPE_PV32HI_V32HI_USI:
12131 case VOID_FTYPE_PV32QI_V32QI_USI:
12132 case VOID_FTYPE_PV16QI_V16QI_UHI:
12133 case VOID_FTYPE_PV16HI_V16HI_UHI:
12134 case VOID_FTYPE_PV8HI_V8HI_UQI:
12135 switch (icode)
12136 {
12137 /* These builtins and instructions require the memory
12138 to be properly aligned. */
12139 case CODE_FOR_avx512f_storev16sf_mask:
12140 case CODE_FOR_avx512f_storev16si_mask:
12141 case CODE_FOR_avx512f_storev8df_mask:
12142 case CODE_FOR_avx512f_storev8di_mask:
12143 case CODE_FOR_avx512vl_storev8sf_mask:
12144 case CODE_FOR_avx512vl_storev8si_mask:
12145 case CODE_FOR_avx512vl_storev4df_mask:
12146 case CODE_FOR_avx512vl_storev4di_mask:
12147 case CODE_FOR_avx512vl_storev4sf_mask:
12148 case CODE_FOR_avx512vl_storev4si_mask:
12149 case CODE_FOR_avx512vl_storev2df_mask:
12150 case CODE_FOR_avx512vl_storev2di_mask:
12151 aligned_mem = true;
12152 break;
12153 default:
12154 break;
12155 }
12156 /* FALLTHRU */
12157 case VOID_FTYPE_PV8SF_V8SI_V8SF:
12158 case VOID_FTYPE_PV4DF_V4DI_V4DF:
12159 case VOID_FTYPE_PV4SF_V4SI_V4SF:
12160 case VOID_FTYPE_PV2DF_V2DI_V2DF:
12161 case VOID_FTYPE_PV8SI_V8SI_V8SI:
12162 case VOID_FTYPE_PV4DI_V4DI_V4DI:
12163 case VOID_FTYPE_PV4SI_V4SI_V4SI:
12164 case VOID_FTYPE_PV2DI_V2DI_V2DI:
12165 case VOID_FTYPE_PV8SI_V8DI_UQI:
12166 case VOID_FTYPE_PV8HI_V8DI_UQI:
12167 case VOID_FTYPE_PV16HI_V16SI_UHI:
12168 case VOID_FTYPE_PUDI_V8DI_UQI:
12169 case VOID_FTYPE_PV16QI_V16SI_UHI:
12170 case VOID_FTYPE_PV4SI_V4DI_UQI:
12171 case VOID_FTYPE_PUDI_V2DI_UQI:
12172 case VOID_FTYPE_PUDI_V4DI_UQI:
12173 case VOID_FTYPE_PUSI_V2DI_UQI:
12174 case VOID_FTYPE_PV8HI_V8SI_UQI:
12175 case VOID_FTYPE_PUDI_V4SI_UQI:
12176 case VOID_FTYPE_PUSI_V4DI_UQI:
12177 case VOID_FTYPE_PUHI_V2DI_UQI:
12178 case VOID_FTYPE_PUDI_V8SI_UQI:
12179 case VOID_FTYPE_PUSI_V4SI_UQI:
12180 case VOID_FTYPE_PCHAR_V64QI_UDI:
12181 case VOID_FTYPE_PCHAR_V32QI_USI:
12182 case VOID_FTYPE_PCHAR_V16QI_UHI:
12183 case VOID_FTYPE_PSHORT_V32HI_USI:
12184 case VOID_FTYPE_PSHORT_V16HI_UHI:
12185 case VOID_FTYPE_PSHORT_V8HI_UQI:
12186 case VOID_FTYPE_PINT_V16SI_UHI:
12187 case VOID_FTYPE_PINT_V8SI_UQI:
12188 case VOID_FTYPE_PINT_V4SI_UQI:
12189 case VOID_FTYPE_PINT64_V8DI_UQI:
12190 case VOID_FTYPE_PINT64_V4DI_UQI:
12191 case VOID_FTYPE_PINT64_V2DI_UQI:
12192 case VOID_FTYPE_PDOUBLE_V8DF_UQI:
12193 case VOID_FTYPE_PDOUBLE_V4DF_UQI:
12194 case VOID_FTYPE_PDOUBLE_V2DF_UQI:
12195 case VOID_FTYPE_PFLOAT_V16SF_UHI:
12196 case VOID_FTYPE_PFLOAT_V8SF_UQI:
12197 case VOID_FTYPE_PFLOAT_V4SF_UQI:
12198 case VOID_FTYPE_PCFLOAT16_V8HF_UQI:
12199 case VOID_FTYPE_PV32QI_V32HI_USI:
12200 case VOID_FTYPE_PV16QI_V16HI_UHI:
12201 case VOID_FTYPE_PUDI_V8HI_UQI:
12202 nargs = 2;
12203 klass = store;
12204 /* Reserve memory operand for target. */
12205 memory = ARRAY_SIZE (xops);
12206 break;
12207 case V4SF_FTYPE_PCV4SF_V4SF_UQI:
12208 case V8SF_FTYPE_PCV8SF_V8SF_UQI:
12209 case V16SF_FTYPE_PCV16SF_V16SF_UHI:
12210 case V4SI_FTYPE_PCV4SI_V4SI_UQI:
12211 case V8SI_FTYPE_PCV8SI_V8SI_UQI:
12212 case V16SI_FTYPE_PCV16SI_V16SI_UHI:
12213 case V2DF_FTYPE_PCV2DF_V2DF_UQI:
12214 case V4DF_FTYPE_PCV4DF_V4DF_UQI:
12215 case V8DF_FTYPE_PCV8DF_V8DF_UQI:
12216 case V2DI_FTYPE_PCV2DI_V2DI_UQI:
12217 case V4DI_FTYPE_PCV4DI_V4DI_UQI:
12218 case V8DI_FTYPE_PCV8DI_V8DI_UQI:
12219 case V64QI_FTYPE_PCV64QI_V64QI_UDI:
12220 case V32HI_FTYPE_PCV32HI_V32HI_USI:
12221 case V32QI_FTYPE_PCV32QI_V32QI_USI:
12222 case V16QI_FTYPE_PCV16QI_V16QI_UHI:
12223 case V16HI_FTYPE_PCV16HI_V16HI_UHI:
12224 case V8HI_FTYPE_PCV8HI_V8HI_UQI:
12225 switch (icode)
12226 {
12227 /* These builtins and instructions require the memory
12228 to be properly aligned. */
12229 case CODE_FOR_avx512f_loadv16sf_mask:
12230 case CODE_FOR_avx512f_loadv16si_mask:
12231 case CODE_FOR_avx512f_loadv8df_mask:
12232 case CODE_FOR_avx512f_loadv8di_mask:
12233 case CODE_FOR_avx512vl_loadv8sf_mask:
12234 case CODE_FOR_avx512vl_loadv8si_mask:
12235 case CODE_FOR_avx512vl_loadv4df_mask:
12236 case CODE_FOR_avx512vl_loadv4di_mask:
12237 case CODE_FOR_avx512vl_loadv4sf_mask:
12238 case CODE_FOR_avx512vl_loadv4si_mask:
12239 case CODE_FOR_avx512vl_loadv2df_mask:
12240 case CODE_FOR_avx512vl_loadv2di_mask:
12241 case CODE_FOR_avx512bw_loadv64qi_mask:
12242 case CODE_FOR_avx512vl_loadv32qi_mask:
12243 case CODE_FOR_avx512vl_loadv16qi_mask:
12244 case CODE_FOR_avx512bw_loadv32hi_mask:
12245 case CODE_FOR_avx512vl_loadv16hi_mask:
12246 case CODE_FOR_avx512vl_loadv8hi_mask:
12247 aligned_mem = true;
12248 break;
12249 default:
12250 break;
12251 }
12252 /* FALLTHRU */
12253 case V64QI_FTYPE_PCCHAR_V64QI_UDI:
12254 case V32QI_FTYPE_PCCHAR_V32QI_USI:
12255 case V16QI_FTYPE_PCCHAR_V16QI_UHI:
12256 case V32HI_FTYPE_PCSHORT_V32HI_USI:
12257 case V16HI_FTYPE_PCSHORT_V16HI_UHI:
12258 case V8HI_FTYPE_PCSHORT_V8HI_UQI:
12259 case V16SI_FTYPE_PCINT_V16SI_UHI:
12260 case V8SI_FTYPE_PCINT_V8SI_UQI:
12261 case V4SI_FTYPE_PCINT_V4SI_UQI:
12262 case V8DI_FTYPE_PCINT64_V8DI_UQI:
12263 case V4DI_FTYPE_PCINT64_V4DI_UQI:
12264 case V2DI_FTYPE_PCINT64_V2DI_UQI:
12265 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
12266 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
12267 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
12268 case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
12269 case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
12270 case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
12271 case V8HF_FTYPE_PCFLOAT16_V8HF_UQI:
12272 nargs = 3;
12273 klass = load;
12274 memory = 0;
12275 break;
12276 case INT_FTYPE_PINT_INT_INT_INT:
12277 case LONGLONG_FTYPE_PLONGLONG_LONGLONG_LONGLONG_INT:
12278 nargs = 4;
12279 klass = load;
12280 memory = 0;
12281 constant = 3;
12282 break;
12283 default:
12284 gcc_unreachable ();
12285 }
12286
12287 gcc_assert (nargs <= ARRAY_SIZE (xops));
12288
12289 if (klass == store)
12290 {
12291 arg = CALL_EXPR_ARG (exp, 0);
12292 op = expand_normal (arg);
12293 gcc_assert (target == 0);
12294 if (memory)
12295 {
12296 op = ix86_zero_extend_to_Pmode (op);
12297 target = gen_rtx_MEM (tmode, op);
12298 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
12299 on it. Try to improve it using get_pointer_alignment,
12300 and if the special builtin is one that requires strict
12301 mode alignment, also from its GET_MODE_ALIGNMENT.
12302 Failure to do so could lead to ix86_legitimate_combined_insn
12303 rejecting all changes to such insns. */
12304 unsigned int align = get_pointer_alignment (arg);
12305 if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
12306 align = GET_MODE_ALIGNMENT (tmode);
12307 if (MEM_ALIGN (target) < align)
12308 set_mem_align (target, align);
12309 }
12310 else
12311 target = force_reg (tmode, op);
12312 arg_adjust = 1;
12313 }
12314 else
12315 {
12316 arg_adjust = 0;
12317 if (optimize
12318 || target == 0
12319 || !register_operand (target, tmode)
12320 || GET_MODE (target) != tmode)
12321 target = gen_reg_rtx (tmode);
12322 }
12323
12324 for (i = 0; i < nargs; i++)
12325 {
12326 machine_mode mode = insn_p->operand[i + 1].mode;
12327
12328 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
12329 op = expand_normal (arg);
12330
12331 if (i == memory)
12332 {
12333 /* This must be the memory operand. */
12334 op = ix86_zero_extend_to_Pmode (op);
12335 op = gen_rtx_MEM (mode, op);
12336 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
12337 on it. Try to improve it using get_pointer_alignment,
12338 and if the special builtin is one that requires strict
12339 mode alignment, also from its GET_MODE_ALIGNMENT.
12340 Failure to do so could lead to ix86_legitimate_combined_insn
12341 rejecting all changes to such insns. */
12342 unsigned int align = get_pointer_alignment (arg);
12343 if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
12344 align = GET_MODE_ALIGNMENT (mode);
12345 if (MEM_ALIGN (op) < align)
12346 set_mem_align (op, align);
12347 }
12348 else if (i == constant)
12349 {
12350 /* This must be the constant. */
12351 if (!insn_p->operand[nargs].predicate (op, SImode))
12352 {
12353 error ("the fourth argument must be one of enum %qs", "_CMPCCX_ENUM");
12354 return const0_rtx;
12355 }
12356 }
12357 else
12358 {
12359 /* This must be a register. */
12360 if (VECTOR_MODE_P (mode))
12361 op = safe_vector_operand (op, mode);
12362
12363 op = fixup_modeless_constant (op, mode);
12364
12365 /* NB: A 3-operand load implies a mask load or v{p}expand*,
12366 and the mask operand should be the last one.
12367 Keep an all-ones mask; it will be simplified by the expander. */
12368 if (nargs == 3 && i == 2 && klass == load
12369 && constm1_operand (op, mode)
12370 && insn_p->operand[i].predicate (op, mode))
12371 ;
12372 else if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
12373 op = copy_to_mode_reg (mode, op);
12374 else
12375 {
12376 op = copy_to_reg (op);
12377 op = lowpart_subreg (mode, op, GET_MODE (op));
12378 }
12379 }
12380
12381 xops[i] = op;
12382 }
12383
12384 switch (nargs)
12385 {
12386 case 0:
12387 pat = GEN_FCN (icode) (target);
12388 break;
12389 case 1:
12390 pat = GEN_FCN (icode) (target, xops[0]);
12391 break;
12392 case 2:
12393 pat = GEN_FCN (icode) (target, xops[0], xops[1]);
12394 break;
12395 case 3:
12396 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
12397 break;
12398 case 4:
12399 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2], xops[3]);
12400 break;
12401 default:
12402 gcc_unreachable ();
12403 }
12404
12405 if (! pat)
12406 return 0;
12407
12408 emit_insn (pat);
12409 return klass == store ? 0 : target;
12410 }
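/* Illustrative sketch (assumption: _mm256_stream_si256 is expanded through
   this function with icode CODE_FOR_avx_movntv4di, one of the aligned_mem
   cases above):

     #include <immintrin.h>
     void f (__m256i *p, __m256i v)
     {
       _mm256_stream_si256 (p, v);
     }

   The builtin is classified as a store, the pointer argument becomes the
   MEM target, and since the icode requires aligned memory the MEM's
   alignment is raised to GET_MODE_ALIGNMENT of the vector mode.  */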
12411
12412 /* Return the integer constant in ARG. Constrain it to be in the range
12413 of the subparts of VEC_TYPE; issue an error if not. */
12414
12415 static int
12416 get_element_number (tree vec_type, tree arg)
12417 {
12418 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
12419
12420 if (!tree_fits_uhwi_p (arg)
12421 || (elt = tree_to_uhwi (arg), elt > max))
12422 {
12423 error ("selector must be an integer constant in the range "
12424 "[0, %wi]", max);
12425 return 0;
12426 }
12427
12428 return elt;
12429 }
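/* Illustrative example: for a V4SF vector type TYPE_VECTOR_SUBPARTS is 4,
   so selectors 0..3 are returned unchanged, while a call along the lines
   of

     __builtin_ia32_vec_ext_v4sf (x, 7)

   (shown only for illustration) is diagnosed with "selector must be an
   integer constant in the range [0, 3]" and the function returns 0.  */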
12430
12431 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
12432 ix86_expand_vector_init. We DO have language-level syntax for this, in
12433 the form of (type){ init-list }. Except that since we can't place emms
12434 instructions from inside the compiler, we can't allow the use of MMX
12435 registers unless the user explicitly asks for it. So we do *not* define
12436 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
12437 we have builtins invoked by mmintrin.h that give us license to emit
12438 these sorts of instructions. */
12439
12440 static rtx
12441 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
12442 {
12443 machine_mode tmode = TYPE_MODE (type);
12444 machine_mode inner_mode = GET_MODE_INNER (tmode);
12445 int i, n_elt = GET_MODE_NUNITS (tmode);
12446 rtvec v = rtvec_alloc (n_elt);
12447
12448 gcc_assert (VECTOR_MODE_P (tmode));
12449 gcc_assert (call_expr_nargs (exp) == n_elt);
12450
12451 for (i = 0; i < n_elt; ++i)
12452 {
12453 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
12454 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
12455 }
12456
12457 if (!target || !register_operand (target, tmode))
12458 target = gen_reg_rtx (tmode);
12459
12460 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
12461 return target;
12462 }
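/* Illustrative sketch (assumption: the MMX set intrinsics are implemented
   on top of these vec_init builtins):

     #include <mmintrin.h>
     __m64 f (int a, int b)
     {
       return _mm_set_pi32 (a, b);
     }

   arrives here with n_elt == 2; each scalar argument is lowered to the
   inner mode and the whole vector is built by ix86_expand_vector_init
   into a fresh V2SI register.  */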
12463
12464 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
12465 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
12466 had a language-level syntax for referencing vector elements. */
12467
12468 static rtx
12469 ix86_expand_vec_ext_builtin (tree exp, rtx target)
12470 {
12471 machine_mode tmode, mode0;
12472 tree arg0, arg1;
12473 int elt;
12474 rtx op0;
12475
12476 arg0 = CALL_EXPR_ARG (exp, 0);
12477 arg1 = CALL_EXPR_ARG (exp, 1);
12478
12479 op0 = expand_normal (arg0);
12480 elt = get_element_number (TREE_TYPE (arg0), arg1);
12481
12482 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
12483 mode0 = TYPE_MODE (TREE_TYPE (arg0));
12484 gcc_assert (VECTOR_MODE_P (mode0));
12485
12486 op0 = force_reg (mode0, op0);
12487
12488 if (optimize || !target || !register_operand (target, tmode))
12489 target = gen_reg_rtx (tmode);
12490
12491 ix86_expand_vector_extract (true, target, op0, elt);
12492
12493 return target;
12494 }
12495
12496 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
12497 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
12498 a language-level syntax for referencing vector elements. */
12499
12500 static rtx
12501 ix86_expand_vec_set_builtin (tree exp)
12502 {
12503 machine_mode tmode, mode1;
12504 tree arg0, arg1, arg2;
12505 int elt;
12506 rtx op0, op1, target;
12507
12508 arg0 = CALL_EXPR_ARG (exp, 0);
12509 arg1 = CALL_EXPR_ARG (exp, 1);
12510 arg2 = CALL_EXPR_ARG (exp, 2);
12511
12512 tmode = TYPE_MODE (TREE_TYPE (arg0));
12513 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
12514 gcc_assert (VECTOR_MODE_P (tmode));
12515
12516 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
12517 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
12518 elt = get_element_number (TREE_TYPE (arg0), arg2);
12519
12520 if (GET_MODE (op1) != mode1)
12521 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
12522
12523 op0 = force_reg (tmode, op0);
12524 op1 = force_reg (mode1, op1);
12525
12526 /* OP0 is the source of these builtin functions and shouldn't be
12527 modified. Create a copy, use it and return it as target. */
12528 target = gen_reg_rtx (tmode);
12529 emit_move_insn (target, op0);
12530 ix86_expand_vector_set (true, target, op1, elt);
12531
12532 return target;
12533 }
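/* Illustrative sketch (assumption: _mm_insert_pi16 is implemented via a
   vec_set builtin that reaches this function):

     #include <xmmintrin.h>
     __m64 f (__m64 v, int s)
     {
       return _mm_insert_pi16 (v, s, 2);
     }

   The source vector is copied into a fresh register, element 2 of the
   copy is overwritten with S via ix86_expand_vector_set, and the copy is
   returned, leaving the original operand unmodified.  */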
12534
12535 /* Return true if the necessary isa options for this builtin exist,
12536 else false.
12537 fcode = DECL_MD_FUNCTION_CODE (fndecl); */
12538 bool
12539 ix86_check_builtin_isa_match (unsigned int fcode,
12540 HOST_WIDE_INT* pbisa,
12541 HOST_WIDE_INT* pbisa2)
12542 {
12543 HOST_WIDE_INT isa = ix86_isa_flags;
12544 HOST_WIDE_INT isa2 = ix86_isa_flags2;
12545 HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
12546 HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
12547 /* The general case is we require all the ISAs specified in bisa{,2}
12548 to be enabled.
12549 The exceptions are:
12550 OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
12551 OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
12552 OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
12553 (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL) or
12554 OPTION_MASK_ISA2_AVXVNNI
12555 (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL) or
12556 OPTION_MASK_ISA2_AVXIFMA
12557 (OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA2_AVX512BF16) or
12558 OPTION_MASK_ISA2_AVXNECONVERT
12559 where for each such pair it is sufficient if either of the ISAs is
12560 enabled, plus, if it is ORed with other options, those others as well.
12561 OPTION_MASK_ISA_MMX in bisa is satisfied also if TARGET_MMX_WITH_SSE. */
12562 if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
12563 == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
12564 && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0)
12565 isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A);
12566
12567 if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
12568 == (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
12569 && (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0)
12570 isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32);
12571
12572 if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
12573 == (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
12574 && (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0)
12575 isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4);
12576
12577 if ((((bisa & (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
12578 == (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
12579 || (bisa2 & OPTION_MASK_ISA2_AVXVNNI) != 0)
12580 && (((isa & (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
12581 == (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
12582 || (isa2 & OPTION_MASK_ISA2_AVXVNNI) != 0))
12583 {
12584 isa |= OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL;
12585 isa2 |= OPTION_MASK_ISA2_AVXVNNI;
12586 }
12587
12588 if ((((bisa & (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL))
12589 == (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL))
12590 || (bisa2 & OPTION_MASK_ISA2_AVXIFMA) != 0)
12591 && (((isa & (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL))
12592 == (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL))
12593 || (isa2 & OPTION_MASK_ISA2_AVXIFMA) != 0))
12594 {
12595 isa |= OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL;
12596 isa2 |= OPTION_MASK_ISA2_AVXIFMA;
12597 }
12598
12599 if ((((bisa & OPTION_MASK_ISA_AVX512VL) != 0
12600 && (bisa2 & OPTION_MASK_ISA2_AVX512BF16) != 0)
12601 || (bisa2 & OPTION_MASK_ISA2_AVXNECONVERT) != 0)
12602 && (((isa & OPTION_MASK_ISA_AVX512VL) != 0
12603 && (isa2 & OPTION_MASK_ISA2_AVX512BF16) != 0)
12604 || (isa2 & OPTION_MASK_ISA2_AVXNECONVERT) != 0))
12605 {
12606 isa |= OPTION_MASK_ISA_AVX512VL;
12607 isa2 |= OPTION_MASK_ISA2_AVXNECONVERT | OPTION_MASK_ISA2_AVX512BF16;
12608 }
12609
12610 if ((bisa & OPTION_MASK_ISA_MMX) && !TARGET_MMX && TARGET_MMX_WITH_SSE
12611 /* __builtin_ia32_maskmovq requires MMX registers. */
12612 && fcode != IX86_BUILTIN_MASKMOVQ)
12613 {
12614 bisa &= ~OPTION_MASK_ISA_MMX;
12615 bisa |= OPTION_MASK_ISA_SSE2;
12616 }
12617
12618 if (pbisa)
12619 *pbisa = bisa;
12620 if (pbisa2)
12621 *pbisa2 = bisa2;
12622
12623 return (bisa & isa) == bisa && (bisa2 & isa2) == bisa2;
12624 }
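/* Illustrative example, grounded in the AVXVNNI case above: a builtin whose
   descriptor requires OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL
   is also accepted when only OPTION_MASK_ISA2_AVXVNNI is enabled (e.g.
   -mavxvnni without -mavx512vnni/-mavx512vl), because both isa and isa2 are
   widened with the alternative masks before the final subset checks.  A
   caller uses it as ix86_expand_builtin does below:

     HOST_WIDE_INT bisa, bisa2;
     if (!ix86_check_builtin_isa_match (fcode, &bisa, &bisa2))
       {
         ... report the missing ISA options described by bisa/bisa2 ...
       }
  */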
12625
12626 /* Expand an expression EXP that calls a built-in function,
12627 with result going to TARGET if that's convenient
12628 (and in mode MODE if that's convenient).
12629 SUBTARGET may be used as the target for computing one of EXP's operands.
12630 IGNORE is nonzero if the value is to be ignored. */
12631
12632 rtx
12633 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
12634 machine_mode mode, int ignore)
12635 {
12636 size_t i;
12637 enum insn_code icode, icode2;
12638 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
12639 tree arg0, arg1, arg2, arg3, arg4;
12640 rtx op0, op1, op2, op3, op4, pat, pat2, insn;
12641 machine_mode mode0, mode1, mode2, mode3, mode4;
12642 unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
12643 HOST_WIDE_INT bisa, bisa2;
12644
12645 /* For CPU builtins that can be folded, fold first and expand the fold. */
12646 switch (fcode)
12647 {
12648 case IX86_BUILTIN_CPU_INIT:
12649 {
12650 /* Make it call __cpu_indicator_init in libgcc. */
12651 tree call_expr, fndecl, type;
12652 type = build_function_type_list (integer_type_node, NULL_TREE);
12653 fndecl = build_fn_decl ("__cpu_indicator_init", type);
12654 call_expr = build_call_expr (fndecl, 0);
12655 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
12656 }
12657 case IX86_BUILTIN_CPU_IS:
12658 case IX86_BUILTIN_CPU_SUPPORTS:
12659 {
12660 tree arg0 = CALL_EXPR_ARG (exp, 0);
12661 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
12662 gcc_assert (fold_expr != NULL_TREE);
12663 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
12664 }
12665 }
12666
12667 if (!ix86_check_builtin_isa_match (fcode, &bisa, &bisa2))
12668 {
12669 bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT;
12670 if (TARGET_ABI_X32)
12671 bisa |= OPTION_MASK_ABI_X32;
12672 else
12673 bisa |= OPTION_MASK_ABI_64;
12674 char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
12675 (enum fpmath_unit) 0,
12676 (enum prefer_vector_width) 0,
12677 PVW_NONE, PVW_NONE,
12678 false, add_abi_p);
12679 if (!opts)
12680 error ("%qE needs unknown isa option", fndecl);
12681 else
12682 {
12683 gcc_assert (opts != NULL);
12684 error ("%qE needs isa option %s", fndecl, opts);
12685 free (opts);
12686 }
12687 return expand_call (exp, target, ignore);
12688 }
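/* Illustrative example (the exact option string comes from
   ix86_target_string and is an assumption here): invoking one of the
   AVX-512 __builtin_ia32_* builtins directly in a translation unit
   compiled without -mavx512f reaches the branch above and is diagnosed
   roughly as

     error: '__builtin_ia32_foo' needs isa option -mavx512f

   where __builtin_ia32_foo stands for a hypothetical builtin name; the
   call is then expanded as an ordinary external call so compilation can
   continue.  */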
12689
12690 switch (fcode)
12691 {
12692 case IX86_BUILTIN_MASKMOVQ:
12693 case IX86_BUILTIN_MASKMOVDQU:
12694 icode = (fcode == IX86_BUILTIN_MASKMOVQ
12695 ? CODE_FOR_mmx_maskmovq
12696 : CODE_FOR_sse2_maskmovdqu);
12697 /* Note the arg order is different from the operand order. */
12698 arg1 = CALL_EXPR_ARG (exp, 0);
12699 arg2 = CALL_EXPR_ARG (exp, 1);
12700 arg0 = CALL_EXPR_ARG (exp, 2);
12701 op0 = expand_normal (arg0);
12702 op1 = expand_normal (arg1);
12703 op2 = expand_normal (arg2);
12704 mode0 = insn_data[icode].operand[0].mode;
12705 mode1 = insn_data[icode].operand[1].mode;
12706 mode2 = insn_data[icode].operand[2].mode;
12707
12708 op0 = ix86_zero_extend_to_Pmode (op0);
12709 op0 = gen_rtx_MEM (mode1, op0);
12710
12711 if (!insn_data[icode].operand[0].predicate (op0, mode0))
12712 op0 = copy_to_mode_reg (mode0, op0);
12713 if (!insn_data[icode].operand[1].predicate (op1, mode1))
12714 op1 = copy_to_mode_reg (mode1, op1);
12715 if (!insn_data[icode].operand[2].predicate (op2, mode2))
12716 op2 = copy_to_mode_reg (mode2, op2);
12717 pat = GEN_FCN (icode) (op0, op1, op2);
12718 if (! pat)
12719 return 0;
12720 emit_insn (pat);
12721 return 0;
12722
12723 case IX86_BUILTIN_LDMXCSR:
12724 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
12725 target = assign_386_stack_local (SImode, SLOT_TEMP);
12726 emit_move_insn (target, op0);
12727 emit_insn (gen_sse_ldmxcsr (target));
12728 return 0;
12729
12730 case IX86_BUILTIN_STMXCSR:
12731 target = assign_386_stack_local (SImode, SLOT_TEMP);
12732 emit_insn (gen_sse_stmxcsr (target));
12733 return copy_to_mode_reg (SImode, target);
12734
12735 case IX86_BUILTIN_CLFLUSH:
12736 arg0 = CALL_EXPR_ARG (exp, 0);
12737 op0 = expand_normal (arg0);
12738 icode = CODE_FOR_sse2_clflush;
12739 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12740 op0 = ix86_zero_extend_to_Pmode (op0);
12741
12742 emit_insn (gen_sse2_clflush (op0));
12743 return 0;
12744
12745 case IX86_BUILTIN_CLWB:
12746 arg0 = CALL_EXPR_ARG (exp, 0);
12747 op0 = expand_normal (arg0);
12748 icode = CODE_FOR_clwb;
12749 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12750 op0 = ix86_zero_extend_to_Pmode (op0);
12751
12752 emit_insn (gen_clwb (op0));
12753 return 0;
12754
12755 case IX86_BUILTIN_CLFLUSHOPT:
12756 arg0 = CALL_EXPR_ARG (exp, 0);
12757 op0 = expand_normal (arg0);
12758 icode = CODE_FOR_clflushopt;
12759 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12760 op0 = ix86_zero_extend_to_Pmode (op0);
12761
12762 emit_insn (gen_clflushopt (op0));
12763 return 0;
12764
12765 case IX86_BUILTIN_MONITOR:
12766 case IX86_BUILTIN_MONITORX:
12767 arg0 = CALL_EXPR_ARG (exp, 0);
12768 arg1 = CALL_EXPR_ARG (exp, 1);
12769 arg2 = CALL_EXPR_ARG (exp, 2);
12770 op0 = expand_normal (arg0);
12771 op1 = expand_normal (arg1);
12772 op2 = expand_normal (arg2);
12773 if (!REG_P (op0))
12774 op0 = ix86_zero_extend_to_Pmode (op0);
12775 if (!REG_P (op1))
12776 op1 = copy_to_mode_reg (SImode, op1);
12777 if (!REG_P (op2))
12778 op2 = copy_to_mode_reg (SImode, op2);
12779
12780 emit_insn (fcode == IX86_BUILTIN_MONITOR
12781 ? gen_sse3_monitor (Pmode, op0, op1, op2)
12782 : gen_monitorx (Pmode, op0, op1, op2));
12783 return 0;
12784
12785 case IX86_BUILTIN_MWAIT:
12786 arg0 = CALL_EXPR_ARG (exp, 0);
12787 arg1 = CALL_EXPR_ARG (exp, 1);
12788 op0 = expand_normal (arg0);
12789 op1 = expand_normal (arg1);
12790 if (!REG_P (op0))
12791 op0 = copy_to_mode_reg (SImode, op0);
12792 if (!REG_P (op1))
12793 op1 = copy_to_mode_reg (SImode, op1);
12794 emit_insn (gen_sse3_mwait (op0, op1));
12795 return 0;
12796
12797 case IX86_BUILTIN_MWAITX:
12798 arg0 = CALL_EXPR_ARG (exp, 0);
12799 arg1 = CALL_EXPR_ARG (exp, 1);
12800 arg2 = CALL_EXPR_ARG (exp, 2);
12801 op0 = expand_normal (arg0);
12802 op1 = expand_normal (arg1);
12803 op2 = expand_normal (arg2);
12804 if (!REG_P (op0))
12805 op0 = copy_to_mode_reg (SImode, op0);
12806 if (!REG_P (op1))
12807 op1 = copy_to_mode_reg (SImode, op1);
12808 if (!REG_P (op2))
12809 op2 = copy_to_mode_reg (SImode, op2);
12810 emit_insn (gen_mwaitx (op0, op1, op2));
12811 return 0;
12812
12813 case IX86_BUILTIN_UMONITOR:
12814 arg0 = CALL_EXPR_ARG (exp, 0);
12815 op0 = expand_normal (arg0);
12816
12817 op0 = ix86_zero_extend_to_Pmode (op0);
12818 emit_insn (gen_umonitor (Pmode, op0));
12819 return 0;
12820
12821 case IX86_BUILTIN_UMWAIT:
12822 case IX86_BUILTIN_TPAUSE:
12823 arg0 = CALL_EXPR_ARG (exp, 0);
12824 arg1 = CALL_EXPR_ARG (exp, 1);
12825 op0 = expand_normal (arg0);
12826 op1 = expand_normal (arg1);
12827
12828 if (!REG_P (op0))
12829 op0 = copy_to_mode_reg (SImode, op0);
12830
12831 op1 = force_reg (DImode, op1);
12832
12833 if (TARGET_64BIT)
12834 {
12835 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
12836 NULL, 1, OPTAB_DIRECT);
12837 switch (fcode)
12838 {
12839 case IX86_BUILTIN_UMWAIT:
12840 icode = CODE_FOR_umwait_rex64;
12841 break;
12842 case IX86_BUILTIN_TPAUSE:
12843 icode = CODE_FOR_tpause_rex64;
12844 break;
12845 default:
12846 gcc_unreachable ();
12847 }
12848
12849 op2 = gen_lowpart (SImode, op2);
12850 op1 = gen_lowpart (SImode, op1);
12851 pat = GEN_FCN (icode) (op0, op1, op2);
12852 }
12853 else
12854 {
12855 switch (fcode)
12856 {
12857 case IX86_BUILTIN_UMWAIT:
12858 icode = CODE_FOR_umwait;
12859 break;
12860 case IX86_BUILTIN_TPAUSE:
12861 icode = CODE_FOR_tpause;
12862 break;
12863 default:
12864 gcc_unreachable ();
12865 }
12866 pat = GEN_FCN (icode) (op0, op1);
12867 }
12868
12869 if (!pat)
12870 return 0;
12871
12872 emit_insn (pat);
12873
12874 if (target == 0
12875 || !register_operand (target, QImode))
12876 target = gen_reg_rtx (QImode);
12877
12878 pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
12879 const0_rtx);
12880 emit_insn (gen_rtx_SET (target, pat));
12881
12882 return target;
12883
12884 case IX86_BUILTIN_TESTUI:
12885 emit_insn (gen_testui ());
12886
12887 if (target == 0
12888 || !register_operand (target, QImode))
12889 target = gen_reg_rtx (QImode);
12890
12891 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
12892 const0_rtx);
12893 emit_insn (gen_rtx_SET (target, pat));
12894
12895 return target;
12896
12897 case IX86_BUILTIN_CLZERO:
12898 arg0 = CALL_EXPR_ARG (exp, 0);
12899 op0 = expand_normal (arg0);
12900 if (!REG_P (op0))
12901 op0 = ix86_zero_extend_to_Pmode (op0);
12902 emit_insn (gen_clzero (Pmode, op0));
12903 return 0;
12904
12905 case IX86_BUILTIN_CLDEMOTE:
12906 arg0 = CALL_EXPR_ARG (exp, 0);
12907 op0 = expand_normal (arg0);
12908 icode = CODE_FOR_cldemote;
12909 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12910 op0 = ix86_zero_extend_to_Pmode (op0);
12911
12912 emit_insn (gen_cldemote (op0));
12913 return 0;
12914
12915 case IX86_BUILTIN_LOADIWKEY:
12916 {
12917 arg0 = CALL_EXPR_ARG (exp, 0);
12918 arg1 = CALL_EXPR_ARG (exp, 1);
12919 arg2 = CALL_EXPR_ARG (exp, 2);
12920 arg3 = CALL_EXPR_ARG (exp, 3);
12921
12922 op0 = expand_normal (arg0);
12923 op1 = expand_normal (arg1);
12924 op2 = expand_normal (arg2);
12925 op3 = expand_normal (arg3);
12926
12927 if (!REG_P (op0))
12928 op0 = copy_to_mode_reg (V2DImode, op0);
12929 if (!REG_P (op1))
12930 op1 = copy_to_mode_reg (V2DImode, op1);
12931 if (!REG_P (op2))
12932 op2 = copy_to_mode_reg (V2DImode, op2);
12933 if (!REG_P (op3))
12934 op3 = copy_to_mode_reg (SImode, op3);
12935
12936 emit_insn (gen_loadiwkey (op0, op1, op2, op3));
12937
12938 return 0;
12939 }
12940
12941 case IX86_BUILTIN_AESDEC128KLU8:
12942 icode = CODE_FOR_aesdec128klu8;
12943 goto aesdecenc_expand;
12944
12945 case IX86_BUILTIN_AESDEC256KLU8:
12946 icode = CODE_FOR_aesdec256klu8;
12947 goto aesdecenc_expand;
12948
12949 case IX86_BUILTIN_AESENC128KLU8:
12950 icode = CODE_FOR_aesenc128klu8;
12951 goto aesdecenc_expand;
12952
12953 case IX86_BUILTIN_AESENC256KLU8:
12954 icode = CODE_FOR_aesenc256klu8;
12955
12956 aesdecenc_expand:
12957
12958 arg0 = CALL_EXPR_ARG (exp, 0); // __m128i *odata
12959 arg1 = CALL_EXPR_ARG (exp, 1); // __m128i idata
12960 arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
12961
12962 op0 = expand_normal (arg0);
12963 op1 = expand_normal (arg1);
12964 op2 = expand_normal (arg2);
12965
12966 if (!address_operand (op0, V2DImode))
12967 {
12968 op0 = convert_memory_address (Pmode, op0);
12969 op0 = copy_addr_to_reg (op0);
12970 }
12971 op0 = gen_rtx_MEM (V2DImode, op0);
12972
12973 if (!REG_P (op1))
12974 op1 = copy_to_mode_reg (V2DImode, op1);
12975
12976 if (!address_operand (op2, VOIDmode))
12977 {
12978 op2 = convert_memory_address (Pmode, op2);
12979 op2 = copy_addr_to_reg (op2);
12980 }
12981 op2 = gen_rtx_MEM (BLKmode, op2);
12982
12983 emit_insn (GEN_FCN (icode) (op1, op1, op2));
12984
12985 if (target == 0)
12986 target = gen_reg_rtx (QImode);
12987
12988 /* NB: For the aesenc/aesdec Key Locker insns, ZF is set when a runtime
12989 error occurs; in that case clear the output for safety.  */
12990 rtx_code_label *ok_label;
12991 rtx tmp;
12992
12993 tmp = gen_rtx_REG (CCZmode, FLAGS_REG);
12994 pat = gen_rtx_EQ (QImode, tmp, const0_rtx);
12995 ok_label = gen_label_rtx ();
12996 emit_cmp_and_jump_insns (tmp, const0_rtx, NE, 0, GET_MODE (tmp),
12997 true, ok_label);
12998 /* The runtime error rarely occurs, so predict the OK path as the
12999 hot one and lay it out as the fallthrough block.  */
13000 predict_jump (REG_BR_PROB_BASE * 90 / 100);
13001
13002 emit_insn (gen_rtx_SET (op1, const0_rtx));
13003
13004 emit_label (ok_label);
13005 emit_insn (gen_rtx_SET (target, pat));
13006 emit_insn (gen_rtx_SET (op0, op1));
13007
13008 return target;
13009
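/* The wide Key Locker variants process eight 128-bit blocks passed
   implicitly in xmm0-xmm7: load the inputs into those hard registers,
   emit the insn, and copy the results back out, zeroing the registers
   on the ZF error path just like the single-block case above.  */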
13010 case IX86_BUILTIN_AESDECWIDE128KLU8:
13011 icode = CODE_FOR_aesdecwide128klu8;
13012 goto wideaesdecenc_expand;
13013
13014 case IX86_BUILTIN_AESDECWIDE256KLU8:
13015 icode = CODE_FOR_aesdecwide256klu8;
13016 goto wideaesdecenc_expand;
13017
13018 case IX86_BUILTIN_AESENCWIDE128KLU8:
13019 icode = CODE_FOR_aesencwide128klu8;
13020 goto wideaesdecenc_expand;
13021
13022 case IX86_BUILTIN_AESENCWIDE256KLU8:
13023 icode = CODE_FOR_aesencwide256klu8;
13024
13025 wideaesdecenc_expand:
13026
13027 rtx xmm_regs[8];
13028 rtx op;
13029
13030 arg0 = CALL_EXPR_ARG (exp, 0); // __m128i * odata
13031 arg1 = CALL_EXPR_ARG (exp, 1); // const __m128i * idata
13032 arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
13033
13034 op0 = expand_normal (arg0);
13035 op1 = expand_normal (arg1);
13036 op2 = expand_normal (arg2);
13037
13038 if (!address_operand (op2, VOIDmode))
13039 {
13040 op2 = convert_memory_address (Pmode, op2);
13041 op2 = copy_addr_to_reg (op2);
13042 }
13043 op2 = gen_rtx_MEM (BLKmode, op2);
13044
13045 for (i = 0; i < 8; i++)
13046 {
13047 xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
13048
13049 op = gen_rtx_MEM (V2DImode,
13050 plus_constant (Pmode, op1, (i * 16)));
13051
13052 emit_move_insn (xmm_regs[i], op);
13053 }
13054
13055 emit_insn (GEN_FCN (icode) (op2));
13056
13057 if (target == 0)
13058 target = gen_reg_rtx (QImode);
13059
13060 tmp = gen_rtx_REG (CCZmode, FLAGS_REG);
13061 pat = gen_rtx_EQ (QImode, tmp, const0_rtx);
13062 ok_label = gen_label_rtx ();
13063 emit_cmp_and_jump_insns (tmp, const0_rtx, NE, 0, GET_MODE (tmp),
13064 true, ok_label);
13065 predict_jump (REG_BR_PROB_BASE * 90 / 100);
13066
13067 for (i = 0; i < 8; i++)
13068 emit_insn (gen_rtx_SET (xmm_regs[i], const0_rtx));
13069
13070 emit_label (ok_label);
13071 emit_insn (gen_rtx_SET (target, pat));
13072
13073 for (i = 0; i < 8; i++)
13074 {
13075 op = gen_rtx_MEM (V2DImode,
13076 plus_constant (Pmode, op0, (i * 16)));
13077 emit_move_insn (op, xmm_regs[i]);
13078 }
13079
13080 return target;
13081
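/* ENCODEKEY128 takes the key implicitly in xmm0 and leaves the handle
   in xmm0-xmm2; the expansion stores those three registers to the
   user-supplied handle buffer.  */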
13082 case IX86_BUILTIN_ENCODEKEY128U32:
13083 {
13084 rtx op, xmm_regs[7];
13085
13086 arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
13087 arg1 = CALL_EXPR_ARG (exp, 1); // __m128i key
13088 arg2 = CALL_EXPR_ARG (exp, 2); // void *h
13089
13090 op0 = expand_normal (arg0);
13091 op1 = expand_normal (arg1);
13092 op2 = expand_normal (arg2);
13093
13094 if (!REG_P (op0))
13095 op0 = copy_to_mode_reg (SImode, op0);
13096
13097 op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
13098 emit_move_insn (op, op1);
13099
13100 for (i = 0; i < 3; i++)
13101 xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
13102
13103 if (target == 0)
13104 target = gen_reg_rtx (SImode);
13105
13106 emit_insn (gen_encodekey128u32 (target, op0));
13107
13108 for (i = 0; i < 3; i++)
13109 {
13110 op = gen_rtx_MEM (V2DImode,
13111 plus_constant (Pmode, op2, (i * 16)));
13112 emit_move_insn (op, xmm_regs[i]);
13113 }
13114
13115 return target;
13116 }
13117 case IX86_BUILTIN_ENCODEKEY256U32:
13118 {
13119 rtx op, xmm_regs[7];
13120
13121 arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
13122 arg1 = CALL_EXPR_ARG (exp, 1); // __m128i keylow
13123 arg2 = CALL_EXPR_ARG (exp, 2); // __m128i keyhi
13124 arg3 = CALL_EXPR_ARG (exp, 3); // void *h
13125
13126 op0 = expand_normal (arg0);
13127 op1 = expand_normal (arg1);
13128 op2 = expand_normal (arg2);
13129 op3 = expand_normal (arg3);
13130
13131 if (!REG_P (op0))
13132 op0 = copy_to_mode_reg (SImode, op0);
13133
13134 /* Force keylow and keyhi into xmm0 and xmm1.  */
13135 op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
13136 emit_move_insn (op, op1);
13137 op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (1));
13138 emit_move_insn (op, op2);
13139
13140 for (i = 0; i < 4; i++)
13141 xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
13142
13143 if (target == 0)
13144 target = gen_reg_rtx (SImode);
13145
13146 emit_insn (gen_encodekey256u32 (target, op0));
13147
13148 for (i = 0; i < 4; i++)
13149 {
13150 op = gen_rtx_MEM (V2DImode,
13151 plus_constant (Pmode, op3, (i * 16)));
13152 emit_move_insn (op, xmm_regs[i]);
13153 }
13154
13155 return target;
13156 }
13157
13158 case IX86_BUILTIN_PREFETCH:
13159 {
13160 arg0 = CALL_EXPR_ARG (exp, 0); // const void *
13161 arg1 = CALL_EXPR_ARG (exp, 1); // const int
13162 arg2 = CALL_EXPR_ARG (exp, 2); // const int
13163 arg3 = CALL_EXPR_ARG (exp, 3); // const int
13164
13165 op0 = expand_normal (arg0);
13166 op1 = expand_normal (arg1);
13167 op2 = expand_normal (arg2);
13168 op3 = expand_normal (arg3);
13169
13170 if (!CONST_INT_P (op1) || !CONST_INT_P (op2) || !CONST_INT_P (op3))
13171 {
13172 error ("second, third and fourth arguments must be constants");
13173 return const0_rtx;
13174 }
13175
13176 if (INTVAL (op3) == 1)
13177 {
13178 if (TARGET_64BIT && TARGET_PREFETCHI
13179 && local_func_symbolic_operand (op0, GET_MODE (op0)))
13180 emit_insn (gen_prefetchi (op0, op2));
13181 else
13182 {
13183 warning (0, "instruction prefetch only applies in 64-bit mode"
13184 " with RIP-relative addressing and"
13185 " option %<-mprefetchi%>;"
13186 " it stays a NOP otherwise");
13187 emit_insn (gen_nop ());
13188 }
13189 }
13190 else
13191 {
13192 if (!address_operand (op0, VOIDmode))
13193 {
13194 op0 = convert_memory_address (Pmode, op0);
13195 op0 = copy_addr_to_reg (op0);
13196 }
13197
13198 if (TARGET_3DNOW || TARGET_PREFETCH_SSE
13199 || TARGET_PRFCHW || TARGET_PREFETCHWT1)
13200 emit_insn (gen_prefetch (op0, op1, op2));
13201 else if (!MEM_P (op0) && side_effects_p (op0))
13202 /* Don't do anything with direct references to volatile memory,
13203 but generate code to handle other side effects. */
13204 emit_insn (op0);
13205 }
13206
13207 return 0;
13208 }
13209
13210 case IX86_BUILTIN_PREFETCHI:
13211 {
13212 arg0 = CALL_EXPR_ARG (exp, 0); // const void *
13213 arg1 = CALL_EXPR_ARG (exp, 1); // const int
13214
13215 op0 = expand_normal (arg0);
13216 op1 = expand_normal (arg1);
13217
13218 if (!CONST_INT_P (op1))
13219 {
13220 error ("second argument must be a const");
13221 return const0_rtx;
13222 }
13223
13224 /* GOT/PLT_PIC addresses should not be used for instruction prefetch;
13225 the operand must be a real instruction address. */
13226 if (TARGET_64BIT
13227 && local_func_symbolic_operand (op0, GET_MODE (op0)))
13228 emit_insn (gen_prefetchi (op0, op1));
13229 else
13230 {
13231 /* Ignore the hint. */
13232 warning (0, "instruction prefetch only applies in 64-bit mode"
13233 " with RIP-relative addressing and"
13234 " option %<-mprefetchi%>;"
13235 " it stays a NOP otherwise");
13236 emit_insn (gen_nop ());
13237 }
13238
13239 return 0;
13240 }
13241
13242 case IX86_BUILTIN_VEC_INIT_V2SI:
13243 case IX86_BUILTIN_VEC_INIT_V4HI:
13244 case IX86_BUILTIN_VEC_INIT_V8QI:
13245 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
13246
13247 case IX86_BUILTIN_VEC_EXT_V2DF:
13248 case IX86_BUILTIN_VEC_EXT_V2DI:
13249 case IX86_BUILTIN_VEC_EXT_V4SF:
13250 case IX86_BUILTIN_VEC_EXT_V4SI:
13251 case IX86_BUILTIN_VEC_EXT_V8HI:
13252 case IX86_BUILTIN_VEC_EXT_V2SI:
13253 case IX86_BUILTIN_VEC_EXT_V4HI:
13254 case IX86_BUILTIN_VEC_EXT_V16QI:
13255 return ix86_expand_vec_ext_builtin (exp, target);
13256
13257 case IX86_BUILTIN_VEC_SET_V2DI:
13258 case IX86_BUILTIN_VEC_SET_V4SF:
13259 case IX86_BUILTIN_VEC_SET_V4SI:
13260 case IX86_BUILTIN_VEC_SET_V8HI:
13261 case IX86_BUILTIN_VEC_SET_V4HI:
13262 case IX86_BUILTIN_VEC_SET_V16QI:
13263 return ix86_expand_vec_set_builtin (exp);
13264
13265 case IX86_BUILTIN_NANQ:
13266 case IX86_BUILTIN_NANSQ:
13267 return expand_call (exp, target, ignore);
13268
13269 case IX86_BUILTIN_RDPID:
13270
13271 op0 = gen_reg_rtx (word_mode);
13272
13273 if (TARGET_64BIT)
13274 {
13275 insn = gen_rdpid_rex64 (op0);
13276 op0 = convert_to_mode (SImode, op0, 1);
13277 }
13278 else
13279 insn = gen_rdpid (op0);
13280
13281 emit_insn (insn);
13282
13283 if (target == 0
13284 || !register_operand (target, SImode))
13285 target = gen_reg_rtx (SImode);
13286
13287 emit_move_insn (target, op0);
13288 return target;
13289
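/* VP2INTERSECT produces a pair of mask registers.  Expand it by
   computing into a P2QI/P2HI mask-pair pseudo and storing the low and
   high parts through the two mask pointer arguments.  */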
13290 case IX86_BUILTIN_2INTERSECTD512:
13291 case IX86_BUILTIN_2INTERSECTQ512:
13292 case IX86_BUILTIN_2INTERSECTD256:
13293 case IX86_BUILTIN_2INTERSECTQ256:
13294 case IX86_BUILTIN_2INTERSECTD128:
13295 case IX86_BUILTIN_2INTERSECTQ128:
13296 arg0 = CALL_EXPR_ARG (exp, 0);
13297 arg1 = CALL_EXPR_ARG (exp, 1);
13298 arg2 = CALL_EXPR_ARG (exp, 2);
13299 arg3 = CALL_EXPR_ARG (exp, 3);
13300 op0 = expand_normal (arg0);
13301 op1 = expand_normal (arg1);
13302 op2 = expand_normal (arg2);
13303 op3 = expand_normal (arg3);
13304
13305 if (!address_operand (op0, VOIDmode))
13306 {
13307 op0 = convert_memory_address (Pmode, op0);
13308 op0 = copy_addr_to_reg (op0);
13309 }
13310 if (!address_operand (op1, VOIDmode))
13311 {
13312 op1 = convert_memory_address (Pmode, op1);
13313 op1 = copy_addr_to_reg (op1);
13314 }
13315
13316 switch (fcode)
13317 {
13318 case IX86_BUILTIN_2INTERSECTD512:
13319 mode4 = P2HImode;
13320 icode = CODE_FOR_avx512vp2intersect_2intersectv16si;
13321 break;
13322 case IX86_BUILTIN_2INTERSECTQ512:
13323 mode4 = P2QImode;
13324 icode = CODE_FOR_avx512vp2intersect_2intersectv8di;
13325 break;
13326 case IX86_BUILTIN_2INTERSECTD256:
13327 mode4 = P2QImode;
13328 icode = CODE_FOR_avx512vp2intersect_2intersectv8si;
13329 break;
13330 case IX86_BUILTIN_2INTERSECTQ256:
13331 mode4 = P2QImode;
13332 icode = CODE_FOR_avx512vp2intersect_2intersectv4di;
13333 break;
13334 case IX86_BUILTIN_2INTERSECTD128:
13335 mode4 = P2QImode;
13336 icode = CODE_FOR_avx512vp2intersect_2intersectv4si;
13337 break;
13338 case IX86_BUILTIN_2INTERSECTQ128:
13339 mode4 = P2QImode;
13340 icode = CODE_FOR_avx512vp2intersect_2intersectv2di;
13341 break;
13342 default:
13343 gcc_unreachable ();
13344 }
13345
13346 mode2 = insn_data[icode].operand[1].mode;
13347 mode3 = insn_data[icode].operand[2].mode;
13348 if (!insn_data[icode].operand[1].predicate (op2, mode2))
13349 op2 = copy_to_mode_reg (mode2, op2);
13350 if (!insn_data[icode].operand[2].predicate (op3, mode3))
13351 op3 = copy_to_mode_reg (mode3, op3);
13352
13353 op4 = gen_reg_rtx (mode4);
13354 emit_insn (GEN_FCN (icode) (op4, op2, op3));
13355 mode0 = mode4 == P2HImode ? HImode : QImode;
13356 emit_move_insn (gen_rtx_MEM (mode0, op0),
13357 gen_lowpart (mode0, op4));
13358 emit_move_insn (gen_rtx_MEM (mode0, op1),
13359 gen_highpart (mode0, op4));
13360
13361 return 0;
13362
13363 case IX86_BUILTIN_RDPMC:
13364 case IX86_BUILTIN_RDTSC:
13365 case IX86_BUILTIN_RDTSCP:
13366 case IX86_BUILTIN_XGETBV:
13367
13368 op0 = gen_reg_rtx (DImode);
13369 op1 = gen_reg_rtx (DImode);
13370
13371 if (fcode == IX86_BUILTIN_RDPMC)
13372 {
13373 arg0 = CALL_EXPR_ARG (exp, 0);
13374 op2 = expand_normal (arg0);
13375 if (!register_operand (op2, SImode))
13376 op2 = copy_to_mode_reg (SImode, op2);
13377
13378 insn = (TARGET_64BIT
13379 ? gen_rdpmc_rex64 (op0, op1, op2)
13380 : gen_rdpmc (op0, op2));
13381 emit_insn (insn);
13382 }
13383 else if (fcode == IX86_BUILTIN_XGETBV)
13384 {
13385 arg0 = CALL_EXPR_ARG (exp, 0);
13386 op2 = expand_normal (arg0);
13387 if (!register_operand (op2, SImode))
13388 op2 = copy_to_mode_reg (SImode, op2);
13389
13390 insn = (TARGET_64BIT
13391 ? gen_xgetbv_rex64 (op0, op1, op2)
13392 : gen_xgetbv (op0, op2));
13393 emit_insn (insn);
13394 }
13395 else if (fcode == IX86_BUILTIN_RDTSC)
13396 {
13397 insn = (TARGET_64BIT
13398 ? gen_rdtsc_rex64 (op0, op1)
13399 : gen_rdtsc (op0));
13400 emit_insn (insn);
13401 }
13402 else
13403 {
13404 op2 = gen_reg_rtx (SImode);
13405
13406 insn = (TARGET_64BIT
13407 ? gen_rdtscp_rex64 (op0, op1, op2)
13408 : gen_rdtscp (op0, op2));
13409 emit_insn (insn);
13410
13411 arg0 = CALL_EXPR_ARG (exp, 0);
13412 op4 = expand_normal (arg0);
13413 if (!address_operand (op4, VOIDmode))
13414 {
13415 op4 = convert_memory_address (Pmode, op4);
13416 op4 = copy_addr_to_reg (op4);
13417 }
13418 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
13419 }
13420
13421 if (target == 0
13422 || !register_operand (target, DImode))
13423 target = gen_reg_rtx (DImode);
13424
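/* On 64-bit targets the result comes back split across two registers;
   recombine it as (high << 32) | low.  */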
13425 if (TARGET_64BIT)
13426 {
13427 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
13428 op1, 1, OPTAB_DIRECT);
13429 op0 = expand_simple_binop (DImode, IOR, op0, op1,
13430 op0, 1, OPTAB_DIRECT);
13431 }
13432
13433 emit_move_insn (target, op0);
13434 return target;
13435
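/* ENQCMD, ENQCMDS and MOVDIR64B all copy a 64-byte command from memory
   to the destination given by the first operand; the ENQCMD forms
   additionally return the ZF status produced by the instruction.  */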
13436 case IX86_BUILTIN_ENQCMD:
13437 case IX86_BUILTIN_ENQCMDS:
13438 case IX86_BUILTIN_MOVDIR64B:
13439
13440 arg0 = CALL_EXPR_ARG (exp, 0);
13441 arg1 = CALL_EXPR_ARG (exp, 1);
13442 op0 = expand_normal (arg0);
13443 op1 = expand_normal (arg1);
13444
13445 op0 = ix86_zero_extend_to_Pmode (op0);
13446 if (!address_operand (op1, VOIDmode))
13447 {
13448 op1 = convert_memory_address (Pmode, op1);
13449 op1 = copy_addr_to_reg (op1);
13450 }
13451 op1 = gen_rtx_MEM (XImode, op1);
13452
13453 if (fcode == IX86_BUILTIN_MOVDIR64B)
13454 {
13455 emit_insn (gen_movdir64b (Pmode, op0, op1));
13456 return 0;
13457 }
13458 else
13459 {
13460 if (target == 0
13461 || !register_operand (target, SImode))
13462 target = gen_reg_rtx (SImode);
13463
13464 emit_move_insn (target, const0_rtx);
13465 target = gen_rtx_SUBREG (QImode, target, 0);
13466
13467 int unspecv = (fcode == IX86_BUILTIN_ENQCMD
13468 ? UNSPECV_ENQCMD
13469 : UNSPECV_ENQCMDS);
13470 icode = code_for_enqcmd (unspecv, Pmode);
13471 emit_insn (GEN_FCN (icode) (op0, op1));
13472
13473 emit_insn
13474 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
13475 gen_rtx_fmt_ee (EQ, QImode,
13476 gen_rtx_REG (CCZmode, FLAGS_REG),
13477 const0_rtx)));
13478 return SUBREG_REG (target);
13479 }
13480
13481 case IX86_BUILTIN_FXSAVE:
13482 case IX86_BUILTIN_FXRSTOR:
13483 case IX86_BUILTIN_FXSAVE64:
13484 case IX86_BUILTIN_FXRSTOR64:
13485 case IX86_BUILTIN_FNSTENV:
13486 case IX86_BUILTIN_FLDENV:
13487 mode0 = BLKmode;
13488 switch (fcode)
13489 {
13490 case IX86_BUILTIN_FXSAVE:
13491 icode = CODE_FOR_fxsave;
13492 break;
13493 case IX86_BUILTIN_FXRSTOR:
13494 icode = CODE_FOR_fxrstor;
13495 break;
13496 case IX86_BUILTIN_FXSAVE64:
13497 icode = CODE_FOR_fxsave64;
13498 break;
13499 case IX86_BUILTIN_FXRSTOR64:
13500 icode = CODE_FOR_fxrstor64;
13501 break;
13502 case IX86_BUILTIN_FNSTENV:
13503 icode = CODE_FOR_fnstenv;
13504 break;
13505 case IX86_BUILTIN_FLDENV:
13506 icode = CODE_FOR_fldenv;
13507 break;
13508 default:
13509 gcc_unreachable ();
13510 }
13511
13512 arg0 = CALL_EXPR_ARG (exp, 0);
13513 op0 = expand_normal (arg0);
13514
13515 if (!address_operand (op0, VOIDmode))
13516 {
13517 op0 = convert_memory_address (Pmode, op0);
13518 op0 = copy_addr_to_reg (op0);
13519 }
13520 op0 = gen_rtx_MEM (mode0, op0);
13521
13522 pat = GEN_FCN (icode) (op0);
13523 if (pat)
13524 emit_insn (pat);
13525 return 0;
13526
13527 case IX86_BUILTIN_XSETBV:
13528 arg0 = CALL_EXPR_ARG (exp, 0);
13529 arg1 = CALL_EXPR_ARG (exp, 1);
13530 op0 = expand_normal (arg0);
13531 op1 = expand_normal (arg1);
13532
13533 if (!REG_P (op0))
13534 op0 = copy_to_mode_reg (SImode, op0);
13535
13536 op1 = force_reg (DImode, op1);
13537
13538 if (TARGET_64BIT)
13539 {
13540 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
13541 NULL, 1, OPTAB_DIRECT);
13542
13543 icode = CODE_FOR_xsetbv_rex64;
13544
13545 op2 = gen_lowpart (SImode, op2);
13546 op1 = gen_lowpart (SImode, op1);
13547 pat = GEN_FCN (icode) (op0, op1, op2);
13548 }
13549 else
13550 {
13551 icode = CODE_FOR_xsetbv;
13552
13553 pat = GEN_FCN (icode) (op0, op1);
13554 }
13555 if (pat)
13556 emit_insn (pat);
13557 return 0;
13558
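/* The XSAVE/XRSTOR family takes a 64-bit feature mask; on 64-bit
   targets split it into the two SImode halves the *_rex64 patterns
   expect (the hardware reads the mask from EDX:EAX).  */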
13559 case IX86_BUILTIN_XSAVE:
13560 case IX86_BUILTIN_XRSTOR:
13561 case IX86_BUILTIN_XSAVE64:
13562 case IX86_BUILTIN_XRSTOR64:
13563 case IX86_BUILTIN_XSAVEOPT:
13564 case IX86_BUILTIN_XSAVEOPT64:
13565 case IX86_BUILTIN_XSAVES:
13566 case IX86_BUILTIN_XRSTORS:
13567 case IX86_BUILTIN_XSAVES64:
13568 case IX86_BUILTIN_XRSTORS64:
13569 case IX86_BUILTIN_XSAVEC:
13570 case IX86_BUILTIN_XSAVEC64:
13571 arg0 = CALL_EXPR_ARG (exp, 0);
13572 arg1 = CALL_EXPR_ARG (exp, 1);
13573 op0 = expand_normal (arg0);
13574 op1 = expand_normal (arg1);
13575
13576 if (!address_operand (op0, VOIDmode))
13577 {
13578 op0 = convert_memory_address (Pmode, op0);
13579 op0 = copy_addr_to_reg (op0);
13580 }
13581 op0 = gen_rtx_MEM (BLKmode, op0);
13582
13583 op1 = force_reg (DImode, op1);
13584
13585 if (TARGET_64BIT)
13586 {
13587 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
13588 NULL, 1, OPTAB_DIRECT);
13589 switch (fcode)
13590 {
13591 case IX86_BUILTIN_XSAVE:
13592 icode = CODE_FOR_xsave_rex64;
13593 break;
13594 case IX86_BUILTIN_XRSTOR:
13595 icode = CODE_FOR_xrstor_rex64;
13596 break;
13597 case IX86_BUILTIN_XSAVE64:
13598 icode = CODE_FOR_xsave64;
13599 break;
13600 case IX86_BUILTIN_XRSTOR64:
13601 icode = CODE_FOR_xrstor64;
13602 break;
13603 case IX86_BUILTIN_XSAVEOPT:
13604 icode = CODE_FOR_xsaveopt_rex64;
13605 break;
13606 case IX86_BUILTIN_XSAVEOPT64:
13607 icode = CODE_FOR_xsaveopt64;
13608 break;
13609 case IX86_BUILTIN_XSAVES:
13610 icode = CODE_FOR_xsaves_rex64;
13611 break;
13612 case IX86_BUILTIN_XRSTORS:
13613 icode = CODE_FOR_xrstors_rex64;
13614 break;
13615 case IX86_BUILTIN_XSAVES64:
13616 icode = CODE_FOR_xsaves64;
13617 break;
13618 case IX86_BUILTIN_XRSTORS64:
13619 icode = CODE_FOR_xrstors64;
13620 break;
13621 case IX86_BUILTIN_XSAVEC:
13622 icode = CODE_FOR_xsavec_rex64;
13623 break;
13624 case IX86_BUILTIN_XSAVEC64:
13625 icode = CODE_FOR_xsavec64;
13626 break;
13627 default:
13628 gcc_unreachable ();
13629 }
13630
13631 op2 = gen_lowpart (SImode, op2);
13632 op1 = gen_lowpart (SImode, op1);
13633 pat = GEN_FCN (icode) (op0, op1, op2);
13634 }
13635 else
13636 {
13637 switch (fcode)
13638 {
13639 case IX86_BUILTIN_XSAVE:
13640 icode = CODE_FOR_xsave;
13641 break;
13642 case IX86_BUILTIN_XRSTOR:
13643 icode = CODE_FOR_xrstor;
13644 break;
13645 case IX86_BUILTIN_XSAVEOPT:
13646 icode = CODE_FOR_xsaveopt;
13647 break;
13648 case IX86_BUILTIN_XSAVES:
13649 icode = CODE_FOR_xsaves;
13650 break;
13651 case IX86_BUILTIN_XRSTORS:
13652 icode = CODE_FOR_xrstors;
13653 break;
13654 case IX86_BUILTIN_XSAVEC:
13655 icode = CODE_FOR_xsavec;
13656 break;
13657 default:
13658 gcc_unreachable ();
13659 }
13660 pat = GEN_FCN (icode) (op0, op1);
13661 }
13662
13663 if (pat)
13664 emit_insn (pat);
13665 return 0;
13666
13667 case IX86_BUILTIN_LLWPCB:
13668 arg0 = CALL_EXPR_ARG (exp, 0);
13669 op0 = expand_normal (arg0);
13670
13671 if (!register_operand (op0, Pmode))
13672 op0 = ix86_zero_extend_to_Pmode (op0);
13673 emit_insn (gen_lwp_llwpcb (Pmode, op0));
13674 return 0;
13675
13676 case IX86_BUILTIN_SLWPCB:
13677 if (!target
13678 || !register_operand (target, Pmode))
13679 target = gen_reg_rtx (Pmode);
13680 emit_insn (gen_lwp_slwpcb (Pmode, target));
13681 return target;
13682
13683 case IX86_BUILTIN_LWPVAL32:
13684 case IX86_BUILTIN_LWPVAL64:
13685 case IX86_BUILTIN_LWPINS32:
13686 case IX86_BUILTIN_LWPINS64:
13687 mode = ((fcode == IX86_BUILTIN_LWPVAL32
13688 || fcode == IX86_BUILTIN_LWPINS32)
13689 ? SImode : DImode);
13690
13691 if (fcode == IX86_BUILTIN_LWPVAL32
13692 || fcode == IX86_BUILTIN_LWPVAL64)
13693 icode = code_for_lwp_lwpval (mode);
13694 else
13695 icode = code_for_lwp_lwpins (mode);
13696
13697 arg0 = CALL_EXPR_ARG (exp, 0);
13698 arg1 = CALL_EXPR_ARG (exp, 1);
13699 arg2 = CALL_EXPR_ARG (exp, 2);
13700 op0 = expand_normal (arg0);
13701 op1 = expand_normal (arg1);
13702 op2 = expand_normal (arg2);
13703 mode0 = insn_data[icode].operand[0].mode;
13704
13705 if (!insn_data[icode].operand[0].predicate (op0, mode0))
13706 op0 = copy_to_mode_reg (mode0, op0);
13707 if (!insn_data[icode].operand[1].predicate (op1, SImode))
13708 op1 = copy_to_mode_reg (SImode, op1);
13709
13710 if (!CONST_INT_P (op2))
13711 {
13712 error ("the last argument must be a 32-bit immediate");
13713 return const0_rtx;
13714 }
13715
13716 emit_insn (GEN_FCN (icode) (op0, op1, op2));
13717
13718 if (fcode == IX86_BUILTIN_LWPINS32
13719 || fcode == IX86_BUILTIN_LWPINS64)
13720 {
13721 if (target == 0
13722 || !nonimmediate_operand (target, QImode))
13723 target = gen_reg_rtx (QImode);
13724
13725 pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
13726 const0_rtx);
13727 emit_insn (gen_rtx_SET (target, pat));
13728
13729 return target;
13730 }
13731 else
13732 return 0;
13733
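/* TBM BEXTRI: the immediate control packs the start bit in bits [7:0]
   and the field length in bits [15:8].  Degenerate controls (zero
   length, or a start bit past the operand width) are folded to zero at
   expand time, and over-long fields are clamped.  */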
13734 case IX86_BUILTIN_BEXTRI32:
13735 case IX86_BUILTIN_BEXTRI64:
13736 mode = (fcode == IX86_BUILTIN_BEXTRI32 ? SImode : DImode);
13737
13738 arg0 = CALL_EXPR_ARG (exp, 0);
13739 arg1 = CALL_EXPR_ARG (exp, 1);
13740 op0 = expand_normal (arg0);
13741 op1 = expand_normal (arg1);
13742
13743 if (!CONST_INT_P (op1))
13744 {
13745 error ("last argument must be an immediate");
13746 return const0_rtx;
13747 }
13748 else
13749 {
13750 unsigned char lsb_index = UINTVAL (op1);
13751 unsigned char length = UINTVAL (op1) >> 8;
13752
13753 unsigned char bitsize = GET_MODE_BITSIZE (mode);
13754
13755 icode = code_for_tbm_bextri (mode);
13756
13757 mode1 = insn_data[icode].operand[1].mode;
13758 if (!insn_data[icode].operand[1].predicate (op0, mode1))
13759 op0 = copy_to_mode_reg (mode1, op0);
13760
13761 mode0 = insn_data[icode].operand[0].mode;
13762 if (target == 0
13763 || !register_operand (target, mode0))
13764 target = gen_reg_rtx (mode0);
13765
13766 if (length == 0 || lsb_index >= bitsize)
13767 {
13768 emit_move_insn (target, const0_rtx);
13769 return target;
13770 }
13771
13772 if (length + lsb_index > bitsize)
13773 length = bitsize - lsb_index;
13774
13775 op1 = GEN_INT (length);
13776 op2 = GEN_INT (lsb_index);
13777
13778 emit_insn (GEN_FCN (icode) (target, op0, op1, op2));
13779 return target;
13780 }
13781
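/* __builtin_ia32_rdrand*_step: emit RDRAND, store the value through the
   pointer argument and return a success indication derived from the
   carry flag (the hardware sets CF when a random value was available).  */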
13782 case IX86_BUILTIN_RDRAND16_STEP:
13783 mode = HImode;
13784 goto rdrand_step;
13785
13786 case IX86_BUILTIN_RDRAND32_STEP:
13787 mode = SImode;
13788 goto rdrand_step;
13789
13790 case IX86_BUILTIN_RDRAND64_STEP:
13791 mode = DImode;
13792
13793 rdrand_step:
13794 arg0 = CALL_EXPR_ARG (exp, 0);
13795 op1 = expand_normal (arg0);
13796 if (!address_operand (op1, VOIDmode))
13797 {
13798 op1 = convert_memory_address (Pmode, op1);
13799 op1 = copy_addr_to_reg (op1);
13800 }
13801
13802 op0 = gen_reg_rtx (mode);
13803 emit_insn (gen_rdrand (mode, op0));
13804
13805 emit_move_insn (gen_rtx_MEM (mode, op1), op0);
13806
13807 op1 = force_reg (SImode, const1_rtx);
13808
13809 /* Emit SImode conditional move. */
13810 if (mode == HImode)
13811 {
13812 if (TARGET_ZERO_EXTEND_WITH_AND
13813 && optimize_function_for_speed_p (cfun))
13814 {
13815 op2 = force_reg (SImode, const0_rtx);
13816
13817 emit_insn (gen_movstricthi
13818 (gen_lowpart (HImode, op2), op0));
13819 }
13820 else
13821 {
13822 op2 = gen_reg_rtx (SImode);
13823
13824 emit_insn (gen_zero_extendhisi2 (op2, op0));
13825 }
13826 }
13827 else if (mode == SImode)
13828 op2 = op0;
13829 else
13830 op2 = gen_rtx_SUBREG (SImode, op0, 0);
13831
13832 if (target == 0
13833 || !register_operand (target, SImode))
13834 target = gen_reg_rtx (SImode);
13835
13836 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
13837 const0_rtx);
13838 emit_insn (gen_rtx_SET (target,
13839 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
13840 return target;
13841
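/* __builtin_ia32_rdseed*_step: like the RDRAND case above, but the
   success bit is read back with a setcc on the carry flag and
   zero-extended into the result.  */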
13842 case IX86_BUILTIN_RDSEED16_STEP:
13843 mode = HImode;
13844 goto rdseed_step;
13845
13846 case IX86_BUILTIN_RDSEED32_STEP:
13847 mode = SImode;
13848 goto rdseed_step;
13849
13850 case IX86_BUILTIN_RDSEED64_STEP:
13851 mode = DImode;
13852
13853 rdseed_step:
13854 arg0 = CALL_EXPR_ARG (exp, 0);
13855 op1 = expand_normal (arg0);
13856 if (!address_operand (op1, VOIDmode))
13857 {
13858 op1 = convert_memory_address (Pmode, op1);
13859 op1 = copy_addr_to_reg (op1);
13860 }
13861
13862 op0 = gen_reg_rtx (mode);
13863 emit_insn (gen_rdseed (mode, op0));
13864
13865 emit_move_insn (gen_rtx_MEM (mode, op1), op0);
13866
13867 op2 = gen_reg_rtx (QImode);
13868
13869 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
13870 const0_rtx);
13871 emit_insn (gen_rtx_SET (op2, pat));
13872
13873 if (target == 0
13874 || !register_operand (target, SImode))
13875 target = gen_reg_rtx (SImode);
13876
13877 emit_insn (gen_zero_extendqisi2 (target, op2));
13878 return target;
13879
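/* The add-with-carry / subtract-with-borrow builtins, e.g. (illustrative
   use, not part of this file):
     unsigned int sum;
     unsigned char c_out = _addcarry_u32 (c_in, a, b, &sum);
   share the handlecarry expansion below: the incoming carry is
   materialised into CF, the flag-consuming add/sub is emitted, the sum
   is stored through the pointer argument and the resulting CF is
   returned.  */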
13880 case IX86_BUILTIN_SBB32:
13881 icode = CODE_FOR_subborrowsi;
13882 icode2 = CODE_FOR_subborrowsi_0;
13883 mode0 = SImode;
13884 mode1 = DImode;
13885 mode2 = CCmode;
13886 goto handlecarry;
13887
13888 case IX86_BUILTIN_SBB64:
13889 icode = CODE_FOR_subborrowdi;
13890 icode2 = CODE_FOR_subborrowdi_0;
13891 mode0 = DImode;
13892 mode1 = TImode;
13893 mode2 = CCmode;
13894 goto handlecarry;
13895
13896 case IX86_BUILTIN_ADDCARRYX32:
13897 icode = CODE_FOR_addcarrysi;
13898 icode2 = CODE_FOR_addcarrysi_0;
13899 mode0 = SImode;
13900 mode1 = DImode;
13901 mode2 = CCCmode;
13902 goto handlecarry;
13903
13904 case IX86_BUILTIN_ADDCARRYX64:
13905 icode = CODE_FOR_addcarrydi;
13906 icode2 = CODE_FOR_addcarrydi_0;
13907 mode0 = DImode;
13908 mode1 = TImode;
13909 mode2 = CCCmode;
13910
13911 handlecarry:
13912 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
13913 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
13914 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
13915 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
13916
13917 op1 = expand_normal (arg0);
13918 if (!integer_zerop (arg0))
13919 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
13920
13921 op2 = expand_normal (arg1);
13922 if (!register_operand (op2, mode0))
13923 op2 = copy_to_mode_reg (mode0, op2);
13924
13925 op3 = expand_normal (arg2);
13926 if (!register_operand (op3, mode0))
13927 op3 = copy_to_mode_reg (mode0, op3);
13928
13929 op4 = expand_normal (arg3);
13930 if (!address_operand (op4, VOIDmode))
13931 {
13932 op4 = convert_memory_address (Pmode, op4);
13933 op4 = copy_addr_to_reg (op4);
13934 }
13935
13936 op0 = gen_reg_rtx (mode0);
13937 if (integer_zerop (arg0))
13938 {
13939 /* If arg0 is 0, optimize right away into an add or sub
13940 instruction that sets the CCCmode flags.  */
13941 op1 = gen_rtx_REG (mode2, FLAGS_REG);
13942 emit_insn (GEN_FCN (icode2) (op0, op2, op3));
13943 }
13944 else
13945 {
13946 /* Generate CF from input operand. */
13947 emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
13948
13949 /* Generate instruction that consumes CF. */
13950 op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
13951 pat = gen_rtx_LTU (mode1, op1, const0_rtx);
13952 pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
13953 emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
13954 }
13955
13956 /* Return current CF value. */
13957 if (target == 0)
13958 target = gen_reg_rtx (QImode);
13959
13960 pat = gen_rtx_LTU (QImode, op1, const0_rtx);
13961 emit_insn (gen_rtx_SET (target, pat));
13962
13963 /* Store the result. */
13964 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
13965
13966 return target;
13967
13968 case IX86_BUILTIN_READ_FLAGS:
13969 if (ignore)
13970 return const0_rtx;
13971
13972 emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
13973
13974 if (optimize
13975 || target == NULL_RTX
13976 || !nonimmediate_operand (target, word_mode)
13977 || GET_MODE (target) != word_mode)
13978 target = gen_reg_rtx (word_mode);
13979
13980 emit_insn (gen_pop (target));
13981 return target;
13982
13983 case IX86_BUILTIN_WRITE_FLAGS:
13984
13985 arg0 = CALL_EXPR_ARG (exp, 0);
13986 op0 = expand_normal (arg0);
13987 if (!general_no_elim_operand (op0, word_mode))
13988 op0 = copy_to_mode_reg (word_mode, op0);
13989
13990 emit_insn (gen_push (op0));
13991 emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
13992 return 0;
13993
13994 case IX86_BUILTIN_KTESTC8:
13995 icode = CODE_FOR_ktestqi;
13996 mode3 = CCCmode;
13997 goto kortest;
13998
13999 case IX86_BUILTIN_KTESTZ8:
14000 icode = CODE_FOR_ktestqi;
14001 mode3 = CCZmode;
14002 goto kortest;
14003
14004 case IX86_BUILTIN_KTESTC16:
14005 icode = CODE_FOR_ktesthi;
14006 mode3 = CCCmode;
14007 goto kortest;
14008
14009 case IX86_BUILTIN_KTESTZ16:
14010 icode = CODE_FOR_ktesthi;
14011 mode3 = CCZmode;
14012 goto kortest;
14013
14014 case IX86_BUILTIN_KTESTC32:
14015 icode = CODE_FOR_ktestsi;
14016 mode3 = CCCmode;
14017 goto kortest;
14018
14019 case IX86_BUILTIN_KTESTZ32:
14020 icode = CODE_FOR_ktestsi;
14021 mode3 = CCZmode;
14022 goto kortest;
14023
14024 case IX86_BUILTIN_KTESTC64:
14025 icode = CODE_FOR_ktestdi;
14026 mode3 = CCCmode;
14027 goto kortest;
14028
14029 case IX86_BUILTIN_KTESTZ64:
14030 icode = CODE_FOR_ktestdi;
14031 mode3 = CCZmode;
14032 goto kortest;
14033
14034 case IX86_BUILTIN_KORTESTC8:
14035 icode = CODE_FOR_kortestqi;
14036 mode3 = CCCmode;
14037 goto kortest;
14038
14039 case IX86_BUILTIN_KORTESTZ8:
14040 icode = CODE_FOR_kortestqi;
14041 mode3 = CCZmode;
14042 goto kortest;
14043
14044 case IX86_BUILTIN_KORTESTC16:
14045 icode = CODE_FOR_kortesthi;
14046 mode3 = CCCmode;
14047 goto kortest;
14048
14049 case IX86_BUILTIN_KORTESTZ16:
14050 icode = CODE_FOR_kortesthi;
14051 mode3 = CCZmode;
14052 goto kortest;
14053
14054 case IX86_BUILTIN_KORTESTC32:
14055 icode = CODE_FOR_kortestsi;
14056 mode3 = CCCmode;
14057 goto kortest;
14058
14059 case IX86_BUILTIN_KORTESTZ32:
14060 icode = CODE_FOR_kortestsi;
14061 mode3 = CCZmode;
14062 goto kortest;
14063
14064 case IX86_BUILTIN_KORTESTC64:
14065 icode = CODE_FOR_kortestdi;
14066 mode3 = CCCmode;
14067 goto kortest;
14068
14069 case IX86_BUILTIN_KORTESTZ64:
14070 icode = CODE_FOR_kortestdi;
14071 mode3 = CCZmode;
14072
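/* KTEST/KORTEST: emit the mask-register test and read the chosen flag
   back with setcc: CCCmode for the *C (carry) variants, CCZmode for the
   *Z (zero) variants.  */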
14073 kortest:
14074 arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
14075 arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
14076 op0 = expand_normal (arg0);
14077 op1 = expand_normal (arg1);
14078
14079 mode0 = insn_data[icode].operand[0].mode;
14080 mode1 = insn_data[icode].operand[1].mode;
14081
14082 if (GET_MODE (op0) != VOIDmode)
14083 op0 = force_reg (GET_MODE (op0), op0);
14084
14085 op0 = gen_lowpart (mode0, op0);
14086
14087 if (!insn_data[icode].operand[0].predicate (op0, mode0))
14088 op0 = copy_to_mode_reg (mode0, op0);
14089
14090 if (GET_MODE (op1) != VOIDmode)
14091 op1 = force_reg (GET_MODE (op1), op1);
14092
14093 op1 = gen_lowpart (mode1, op1);
14094
14095 if (!insn_data[icode].operand[1].predicate (op1, mode1))
14096 op1 = copy_to_mode_reg (mode1, op1);
14097
14098 target = gen_reg_rtx (QImode);
14099
14100 /* Emit the ktest/kortest insn. */
14101 emit_insn (GEN_FCN (icode) (op0, op1));
14102 /* And use setcc to return the result from the flags. */
14103 ix86_expand_setcc (target, EQ,
14104 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
14105 return target;
14106
14107 case IX86_BUILTIN_GATHERSIV2DF:
14108 icode = CODE_FOR_avx2_gathersiv2df;
14109 goto gather_gen;
14110 case IX86_BUILTIN_GATHERSIV4DF:
14111 icode = CODE_FOR_avx2_gathersiv4df;
14112 goto gather_gen;
14113 case IX86_BUILTIN_GATHERDIV2DF:
14114 icode = CODE_FOR_avx2_gatherdiv2df;
14115 goto gather_gen;
14116 case IX86_BUILTIN_GATHERDIV4DF:
14117 icode = CODE_FOR_avx2_gatherdiv4df;
14118 goto gather_gen;
14119 case IX86_BUILTIN_GATHERSIV4SF:
14120 icode = CODE_FOR_avx2_gathersiv4sf;
14121 goto gather_gen;
14122 case IX86_BUILTIN_GATHERSIV8SF:
14123 icode = CODE_FOR_avx2_gathersiv8sf;
14124 goto gather_gen;
14125 case IX86_BUILTIN_GATHERDIV4SF:
14126 icode = CODE_FOR_avx2_gatherdiv4sf;
14127 goto gather_gen;
14128 case IX86_BUILTIN_GATHERDIV8SF:
14129 icode = CODE_FOR_avx2_gatherdiv8sf;
14130 goto gather_gen;
14131 case IX86_BUILTIN_GATHERSIV2DI:
14132 icode = CODE_FOR_avx2_gathersiv2di;
14133 goto gather_gen;
14134 case IX86_BUILTIN_GATHERSIV4DI:
14135 icode = CODE_FOR_avx2_gathersiv4di;
14136 goto gather_gen;
14137 case IX86_BUILTIN_GATHERDIV2DI:
14138 icode = CODE_FOR_avx2_gatherdiv2di;
14139 goto gather_gen;
14140 case IX86_BUILTIN_GATHERDIV4DI:
14141 icode = CODE_FOR_avx2_gatherdiv4di;
14142 goto gather_gen;
14143 case IX86_BUILTIN_GATHERSIV4SI:
14144 icode = CODE_FOR_avx2_gathersiv4si;
14145 goto gather_gen;
14146 case IX86_BUILTIN_GATHERSIV8SI:
14147 icode = CODE_FOR_avx2_gathersiv8si;
14148 goto gather_gen;
14149 case IX86_BUILTIN_GATHERDIV4SI:
14150 icode = CODE_FOR_avx2_gatherdiv4si;
14151 goto gather_gen;
14152 case IX86_BUILTIN_GATHERDIV8SI:
14153 icode = CODE_FOR_avx2_gatherdiv8si;
14154 goto gather_gen;
14155 case IX86_BUILTIN_GATHERALTSIV4DF:
14156 icode = CODE_FOR_avx2_gathersiv4df;
14157 goto gather_gen;
14158 case IX86_BUILTIN_GATHERALTDIV8SF:
14159 icode = CODE_FOR_avx2_gatherdiv8sf;
14160 goto gather_gen;
14161 case IX86_BUILTIN_GATHERALTSIV4DI:
14162 icode = CODE_FOR_avx2_gathersiv4di;
14163 goto gather_gen;
14164 case IX86_BUILTIN_GATHERALTDIV8SI:
14165 icode = CODE_FOR_avx2_gatherdiv8si;
14166 goto gather_gen;
14167 case IX86_BUILTIN_GATHER3SIV16SF:
14168 icode = CODE_FOR_avx512f_gathersiv16sf;
14169 goto gather_gen;
14170 case IX86_BUILTIN_GATHER3SIV8DF:
14171 icode = CODE_FOR_avx512f_gathersiv8df;
14172 goto gather_gen;
14173 case IX86_BUILTIN_GATHER3DIV16SF:
14174 icode = CODE_FOR_avx512f_gatherdiv16sf;
14175 goto gather_gen;
14176 case IX86_BUILTIN_GATHER3DIV8DF:
14177 icode = CODE_FOR_avx512f_gatherdiv8df;
14178 goto gather_gen;
14179 case IX86_BUILTIN_GATHER3SIV16SI:
14180 icode = CODE_FOR_avx512f_gathersiv16si;
14181 goto gather_gen;
14182 case IX86_BUILTIN_GATHER3SIV8DI:
14183 icode = CODE_FOR_avx512f_gathersiv8di;
14184 goto gather_gen;
14185 case IX86_BUILTIN_GATHER3DIV16SI:
14186 icode = CODE_FOR_avx512f_gatherdiv16si;
14187 goto gather_gen;
14188 case IX86_BUILTIN_GATHER3DIV8DI:
14189 icode = CODE_FOR_avx512f_gatherdiv8di;
14190 goto gather_gen;
14191 case IX86_BUILTIN_GATHER3ALTSIV8DF:
14192 icode = CODE_FOR_avx512f_gathersiv8df;
14193 goto gather_gen;
14194 case IX86_BUILTIN_GATHER3ALTDIV16SF:
14195 icode = CODE_FOR_avx512f_gatherdiv16sf;
14196 goto gather_gen;
14197 case IX86_BUILTIN_GATHER3ALTSIV8DI:
14198 icode = CODE_FOR_avx512f_gathersiv8di;
14199 goto gather_gen;
14200 case IX86_BUILTIN_GATHER3ALTDIV16SI:
14201 icode = CODE_FOR_avx512f_gatherdiv16si;
14202 goto gather_gen;
14203 case IX86_BUILTIN_GATHER3SIV2DF:
14204 icode = CODE_FOR_avx512vl_gathersiv2df;
14205 goto gather_gen;
14206 case IX86_BUILTIN_GATHER3SIV4DF:
14207 icode = CODE_FOR_avx512vl_gathersiv4df;
14208 goto gather_gen;
14209 case IX86_BUILTIN_GATHER3DIV2DF:
14210 icode = CODE_FOR_avx512vl_gatherdiv2df;
14211 goto gather_gen;
14212 case IX86_BUILTIN_GATHER3DIV4DF:
14213 icode = CODE_FOR_avx512vl_gatherdiv4df;
14214 goto gather_gen;
14215 case IX86_BUILTIN_GATHER3SIV4SF:
14216 icode = CODE_FOR_avx512vl_gathersiv4sf;
14217 goto gather_gen;
14218 case IX86_BUILTIN_GATHER3SIV8SF:
14219 icode = CODE_FOR_avx512vl_gathersiv8sf;
14220 goto gather_gen;
14221 case IX86_BUILTIN_GATHER3DIV4SF:
14222 icode = CODE_FOR_avx512vl_gatherdiv4sf;
14223 goto gather_gen;
14224 case IX86_BUILTIN_GATHER3DIV8SF:
14225 icode = CODE_FOR_avx512vl_gatherdiv8sf;
14226 goto gather_gen;
14227 case IX86_BUILTIN_GATHER3SIV2DI:
14228 icode = CODE_FOR_avx512vl_gathersiv2di;
14229 goto gather_gen;
14230 case IX86_BUILTIN_GATHER3SIV4DI:
14231 icode = CODE_FOR_avx512vl_gathersiv4di;
14232 goto gather_gen;
14233 case IX86_BUILTIN_GATHER3DIV2DI:
14234 icode = CODE_FOR_avx512vl_gatherdiv2di;
14235 goto gather_gen;
14236 case IX86_BUILTIN_GATHER3DIV4DI:
14237 icode = CODE_FOR_avx512vl_gatherdiv4di;
14238 goto gather_gen;
14239 case IX86_BUILTIN_GATHER3SIV4SI:
14240 icode = CODE_FOR_avx512vl_gathersiv4si;
14241 goto gather_gen;
14242 case IX86_BUILTIN_GATHER3SIV8SI:
14243 icode = CODE_FOR_avx512vl_gathersiv8si;
14244 goto gather_gen;
14245 case IX86_BUILTIN_GATHER3DIV4SI:
14246 icode = CODE_FOR_avx512vl_gatherdiv4si;
14247 goto gather_gen;
14248 case IX86_BUILTIN_GATHER3DIV8SI:
14249 icode = CODE_FOR_avx512vl_gatherdiv8si;
14250 goto gather_gen;
14251 case IX86_BUILTIN_GATHER3ALTSIV4DF:
14252 icode = CODE_FOR_avx512vl_gathersiv4df;
14253 goto gather_gen;
14254 case IX86_BUILTIN_GATHER3ALTDIV8SF:
14255 icode = CODE_FOR_avx512vl_gatherdiv8sf;
14256 goto gather_gen;
14257 case IX86_BUILTIN_GATHER3ALTSIV4DI:
14258 icode = CODE_FOR_avx512vl_gathersiv4di;
14259 goto gather_gen;
14260 case IX86_BUILTIN_GATHER3ALTDIV8SI:
14261 icode = CODE_FOR_avx512vl_gatherdiv8si;
14262 goto gather_gen;
14263 case IX86_BUILTIN_SCATTERSIV16SF:
14264 icode = CODE_FOR_avx512f_scattersiv16sf;
14265 goto scatter_gen;
14266 case IX86_BUILTIN_SCATTERSIV8DF:
14267 icode = CODE_FOR_avx512f_scattersiv8df;
14268 goto scatter_gen;
14269 case IX86_BUILTIN_SCATTERDIV16SF:
14270 icode = CODE_FOR_avx512f_scatterdiv16sf;
14271 goto scatter_gen;
14272 case IX86_BUILTIN_SCATTERDIV8DF:
14273 icode = CODE_FOR_avx512f_scatterdiv8df;
14274 goto scatter_gen;
14275 case IX86_BUILTIN_SCATTERSIV16SI:
14276 icode = CODE_FOR_avx512f_scattersiv16si;
14277 goto scatter_gen;
14278 case IX86_BUILTIN_SCATTERSIV8DI:
14279 icode = CODE_FOR_avx512f_scattersiv8di;
14280 goto scatter_gen;
14281 case IX86_BUILTIN_SCATTERDIV16SI:
14282 icode = CODE_FOR_avx512f_scatterdiv16si;
14283 goto scatter_gen;
14284 case IX86_BUILTIN_SCATTERDIV8DI:
14285 icode = CODE_FOR_avx512f_scatterdiv8di;
14286 goto scatter_gen;
14287 case IX86_BUILTIN_SCATTERSIV8SF:
14288 icode = CODE_FOR_avx512vl_scattersiv8sf;
14289 goto scatter_gen;
14290 case IX86_BUILTIN_SCATTERSIV4SF:
14291 icode = CODE_FOR_avx512vl_scattersiv4sf;
14292 goto scatter_gen;
14293 case IX86_BUILTIN_SCATTERSIV4DF:
14294 icode = CODE_FOR_avx512vl_scattersiv4df;
14295 goto scatter_gen;
14296 case IX86_BUILTIN_SCATTERSIV2DF:
14297 icode = CODE_FOR_avx512vl_scattersiv2df;
14298 goto scatter_gen;
14299 case IX86_BUILTIN_SCATTERDIV8SF:
14300 icode = CODE_FOR_avx512vl_scatterdiv8sf;
14301 goto scatter_gen;
14302 case IX86_BUILTIN_SCATTERDIV4SF:
14303 icode = CODE_FOR_avx512vl_scatterdiv4sf;
14304 goto scatter_gen;
14305 case IX86_BUILTIN_SCATTERDIV4DF:
14306 icode = CODE_FOR_avx512vl_scatterdiv4df;
14307 goto scatter_gen;
14308 case IX86_BUILTIN_SCATTERDIV2DF:
14309 icode = CODE_FOR_avx512vl_scatterdiv2df;
14310 goto scatter_gen;
14311 case IX86_BUILTIN_SCATTERSIV8SI:
14312 icode = CODE_FOR_avx512vl_scattersiv8si;
14313 goto scatter_gen;
14314 case IX86_BUILTIN_SCATTERSIV4SI:
14315 icode = CODE_FOR_avx512vl_scattersiv4si;
14316 goto scatter_gen;
14317 case IX86_BUILTIN_SCATTERSIV4DI:
14318 icode = CODE_FOR_avx512vl_scattersiv4di;
14319 goto scatter_gen;
14320 case IX86_BUILTIN_SCATTERSIV2DI:
14321 icode = CODE_FOR_avx512vl_scattersiv2di;
14322 goto scatter_gen;
14323 case IX86_BUILTIN_SCATTERDIV8SI:
14324 icode = CODE_FOR_avx512vl_scatterdiv8si;
14325 goto scatter_gen;
14326 case IX86_BUILTIN_SCATTERDIV4SI:
14327 icode = CODE_FOR_avx512vl_scatterdiv4si;
14328 goto scatter_gen;
14329 case IX86_BUILTIN_SCATTERDIV4DI:
14330 icode = CODE_FOR_avx512vl_scatterdiv4di;
14331 goto scatter_gen;
14332 case IX86_BUILTIN_SCATTERDIV2DI:
14333 icode = CODE_FOR_avx512vl_scatterdiv2di;
14334 goto scatter_gen;
14335 case IX86_BUILTIN_GATHERPFDPD:
14336 icode = CODE_FOR_avx512pf_gatherpfv8sidf;
14337 goto vec_prefetch_gen;
14338 case IX86_BUILTIN_SCATTERALTSIV8DF:
14339 icode = CODE_FOR_avx512f_scattersiv8df;
14340 goto scatter_gen;
14341 case IX86_BUILTIN_SCATTERALTDIV16SF:
14342 icode = CODE_FOR_avx512f_scatterdiv16sf;
14343 goto scatter_gen;
14344 case IX86_BUILTIN_SCATTERALTSIV8DI:
14345 icode = CODE_FOR_avx512f_scattersiv8di;
14346 goto scatter_gen;
14347 case IX86_BUILTIN_SCATTERALTDIV16SI:
14348 icode = CODE_FOR_avx512f_scatterdiv16si;
14349 goto scatter_gen;
14350 case IX86_BUILTIN_SCATTERALTSIV4DF:
14351 icode = CODE_FOR_avx512vl_scattersiv4df;
14352 goto scatter_gen;
14353 case IX86_BUILTIN_SCATTERALTDIV8SF:
14354 icode = CODE_FOR_avx512vl_scatterdiv8sf;
14355 goto scatter_gen;
14356 case IX86_BUILTIN_SCATTERALTSIV4DI:
14357 icode = CODE_FOR_avx512vl_scattersiv4di;
14358 goto scatter_gen;
14359 case IX86_BUILTIN_SCATTERALTDIV8SI:
14360 icode = CODE_FOR_avx512vl_scatterdiv8si;
14361 goto scatter_gen;
14362 case IX86_BUILTIN_SCATTERALTSIV2DF:
14363 icode = CODE_FOR_avx512vl_scattersiv2df;
14364 goto scatter_gen;
14365 case IX86_BUILTIN_SCATTERALTDIV4SF:
14366 icode = CODE_FOR_avx512vl_scatterdiv4sf;
14367 goto scatter_gen;
14368 case IX86_BUILTIN_SCATTERALTSIV2DI:
14369 icode = CODE_FOR_avx512vl_scattersiv2di;
14370 goto scatter_gen;
14371 case IX86_BUILTIN_SCATTERALTDIV4SI:
14372 icode = CODE_FOR_avx512vl_scatterdiv4si;
14373 goto scatter_gen;
14374 case IX86_BUILTIN_GATHERPFDPS:
14375 icode = CODE_FOR_avx512pf_gatherpfv16sisf;
14376 goto vec_prefetch_gen;
14377 case IX86_BUILTIN_GATHERPFQPD:
14378 icode = CODE_FOR_avx512pf_gatherpfv8didf;
14379 goto vec_prefetch_gen;
14380 case IX86_BUILTIN_GATHERPFQPS:
14381 icode = CODE_FOR_avx512pf_gatherpfv8disf;
14382 goto vec_prefetch_gen;
14383 case IX86_BUILTIN_SCATTERPFDPD:
14384 icode = CODE_FOR_avx512pf_scatterpfv8sidf;
14385 goto vec_prefetch_gen;
14386 case IX86_BUILTIN_SCATTERPFDPS:
14387 icode = CODE_FOR_avx512pf_scatterpfv16sisf;
14388 goto vec_prefetch_gen;
14389 case IX86_BUILTIN_SCATTERPFQPD:
14390 icode = CODE_FOR_avx512pf_scatterpfv8didf;
14391 goto vec_prefetch_gen;
14392 case IX86_BUILTIN_SCATTERPFQPS:
14393 icode = CODE_FOR_avx512pf_scatterpfv8disf;
14394 goto vec_prefetch_gen;
14395
14396 gather_gen:
14397 rtx half;
14398 rtx (*gen) (rtx, rtx);
14399
14400 arg0 = CALL_EXPR_ARG (exp, 0);
14401 arg1 = CALL_EXPR_ARG (exp, 1);
14402 arg2 = CALL_EXPR_ARG (exp, 2);
14403 arg3 = CALL_EXPR_ARG (exp, 3);
14404 arg4 = CALL_EXPR_ARG (exp, 4);
14405 op0 = expand_normal (arg0);
14406 op1 = expand_normal (arg1);
14407 op2 = expand_normal (arg2);
14408 op3 = expand_normal (arg3);
14409 op4 = expand_normal (arg4);
14410 /* Note the arg order is different from the operand order. */
14411 mode0 = insn_data[icode].operand[1].mode;
14412 mode2 = insn_data[icode].operand[3].mode;
14413 mode3 = insn_data[icode].operand[4].mode;
14414 mode4 = insn_data[icode].operand[5].mode;
14415
14416 if (target == NULL_RTX
14417 || GET_MODE (target) != insn_data[icode].operand[0].mode
14418 || !insn_data[icode].operand[0].predicate (target,
14419 GET_MODE (target)))
14420 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
14421 else
14422 subtarget = target;
14423
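/* For the *ALT* gather variants the index vector has a different
   element count than the destination, so extract the low half of the
   wider operand (and adjust the mask mode where needed) before the
   common operand fixups below; the scatter path further down does the
   same.  */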
14424 switch (fcode)
14425 {
14426 case IX86_BUILTIN_GATHER3ALTSIV8DF:
14427 case IX86_BUILTIN_GATHER3ALTSIV8DI:
14428 half = gen_reg_rtx (V8SImode);
14429 if (!nonimmediate_operand (op2, V16SImode))
14430 op2 = copy_to_mode_reg (V16SImode, op2);
14431 emit_insn (gen_vec_extract_lo_v16si (half, op2));
14432 op2 = half;
14433 break;
14434 case IX86_BUILTIN_GATHER3ALTSIV4DF:
14435 case IX86_BUILTIN_GATHER3ALTSIV4DI:
14436 case IX86_BUILTIN_GATHERALTSIV4DF:
14437 case IX86_BUILTIN_GATHERALTSIV4DI:
14438 half = gen_reg_rtx (V4SImode);
14439 if (!nonimmediate_operand (op2, V8SImode))
14440 op2 = copy_to_mode_reg (V8SImode, op2);
14441 emit_insn (gen_vec_extract_lo_v8si (half, op2));
14442 op2 = half;
14443 break;
14444 case IX86_BUILTIN_GATHER3ALTDIV16SF:
14445 case IX86_BUILTIN_GATHER3ALTDIV16SI:
14446 half = gen_reg_rtx (mode0);
14447 if (mode0 == V8SFmode)
14448 gen = gen_vec_extract_lo_v16sf;
14449 else
14450 gen = gen_vec_extract_lo_v16si;
14451 if (!nonimmediate_operand (op0, GET_MODE (op0)))
14452 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
14453 emit_insn (gen (half, op0));
14454 op0 = half;
14455 op3 = lowpart_subreg (QImode, op3, HImode);
14456 break;
14457 case IX86_BUILTIN_GATHER3ALTDIV8SF:
14458 case IX86_BUILTIN_GATHER3ALTDIV8SI:
14459 case IX86_BUILTIN_GATHERALTDIV8SF:
14460 case IX86_BUILTIN_GATHERALTDIV8SI:
14461 half = gen_reg_rtx (mode0);
14462 if (mode0 == V4SFmode)
14463 gen = gen_vec_extract_lo_v8sf;
14464 else
14465 gen = gen_vec_extract_lo_v8si;
14466 if (!nonimmediate_operand (op0, GET_MODE (op0)))
14467 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
14468 emit_insn (gen (half, op0));
14469 op0 = half;
14470 if (VECTOR_MODE_P (GET_MODE (op3)))
14471 {
14472 half = gen_reg_rtx (mode0);
14473 if (!nonimmediate_operand (op3, GET_MODE (op3)))
14474 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
14475 emit_insn (gen (half, op3));
14476 op3 = half;
14477 }
14478 break;
14479 default:
14480 break;
14481 }
14482
14483 /* Force the memory operand to use only a base register here; we
14484 don't want to do this for the memory operands of other builtin
14485 functions.  */
14486 op1 = ix86_zero_extend_to_Pmode (op1);
14487
14488 if (!insn_data[icode].operand[1].predicate (op0, mode0))
14489 op0 = copy_to_mode_reg (mode0, op0);
14490 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
14491 op1 = copy_to_mode_reg (Pmode, op1);
14492 if (!insn_data[icode].operand[3].predicate (op2, mode2))
14493 op2 = copy_to_mode_reg (mode2, op2);
14494
14495 op3 = fixup_modeless_constant (op3, mode3);
14496
14497 if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
14498 {
14499 if (!insn_data[icode].operand[4].predicate (op3, mode3))
14500 op3 = copy_to_mode_reg (mode3, op3);
14501 }
14502 else
14503 {
14504 op3 = copy_to_reg (op3);
14505 op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
14506 }
14507 if (!insn_data[icode].operand[5].predicate (op4, mode4))
14508 {
14509 error ("the last argument must be scale 1, 2, 4, 8");
14510 return const0_rtx;
14511 }
14512
14513 /* Optimize. If mask is known to have all high bits set,
14514 replace op0 with pc_rtx to signal that the instruction
14515 overwrites the whole destination and doesn't use its
14516 previous contents. */
14517 if (optimize)
14518 {
14519 if (TREE_CODE (arg3) == INTEGER_CST)
14520 {
14521 if (integer_all_onesp (arg3))
14522 op0 = pc_rtx;
14523 }
14524 else if (TREE_CODE (arg3) == VECTOR_CST)
14525 {
14526 unsigned int negative = 0;
14527 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
14528 {
14529 tree cst = VECTOR_CST_ELT (arg3, i);
14530 if (TREE_CODE (cst) == INTEGER_CST
14531 && tree_int_cst_sign_bit (cst))
14532 negative++;
14533 else if (TREE_CODE (cst) == REAL_CST
14534 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
14535 negative++;
14536 }
14537 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
14538 op0 = pc_rtx;
14539 }
14540 else if (TREE_CODE (arg3) == SSA_NAME
14541 && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
14542 {
14543 /* Recognize also when mask is like:
14544 __v2df src = _mm_setzero_pd ();
14545 __v2df mask = _mm_cmpeq_pd (src, src);
14546 or
14547 __v8sf src = _mm256_setzero_ps ();
14548 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
14549 as that is a cheaper way to load all ones into
14550 a register than having to load a constant from
14551 memory. */
14552 gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
14553 if (is_gimple_call (def_stmt))
14554 {
14555 tree fndecl = gimple_call_fndecl (def_stmt);
14556 if (fndecl
14557 && fndecl_built_in_p (fndecl, BUILT_IN_MD))
14558 switch (DECL_MD_FUNCTION_CODE (fndecl))
14559 {
14560 case IX86_BUILTIN_CMPPD:
14561 case IX86_BUILTIN_CMPPS:
14562 case IX86_BUILTIN_CMPPD256:
14563 case IX86_BUILTIN_CMPPS256:
14564 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
14565 break;
14566 /* FALLTHRU */
14567 case IX86_BUILTIN_CMPEQPD:
14568 case IX86_BUILTIN_CMPEQPS:
14569 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
14570 && initializer_zerop (gimple_call_arg (def_stmt,
14571 1)))
14572 op0 = pc_rtx;
14573 break;
14574 default:
14575 break;
14576 }
14577 }
14578 }
14579 }
14580
14581 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
14582 if (! pat)
14583 return const0_rtx;
14584 emit_insn (pat);
14585
14586 switch (fcode)
14587 {
14588 case IX86_BUILTIN_GATHER3DIV16SF:
14589 if (target == NULL_RTX)
14590 target = gen_reg_rtx (V8SFmode);
14591 emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
14592 break;
14593 case IX86_BUILTIN_GATHER3DIV16SI:
14594 if (target == NULL_RTX)
14595 target = gen_reg_rtx (V8SImode);
14596 emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
14597 break;
14598 case IX86_BUILTIN_GATHER3DIV8SF:
14599 case IX86_BUILTIN_GATHERDIV8SF:
14600 if (target == NULL_RTX)
14601 target = gen_reg_rtx (V4SFmode);
14602 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
14603 break;
14604 case IX86_BUILTIN_GATHER3DIV8SI:
14605 case IX86_BUILTIN_GATHERDIV8SI:
14606 if (target == NULL_RTX)
14607 target = gen_reg_rtx (V4SImode);
14608 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
14609 break;
14610 default:
14611 target = subtarget;
14612 break;
14613 }
14614 return target;
14615
14616 scatter_gen:
14617 arg0 = CALL_EXPR_ARG (exp, 0);
14618 arg1 = CALL_EXPR_ARG (exp, 1);
14619 arg2 = CALL_EXPR_ARG (exp, 2);
14620 arg3 = CALL_EXPR_ARG (exp, 3);
14621 arg4 = CALL_EXPR_ARG (exp, 4);
14622 op0 = expand_normal (arg0);
14623 op1 = expand_normal (arg1);
14624 op2 = expand_normal (arg2);
14625 op3 = expand_normal (arg3);
14626 op4 = expand_normal (arg4);
14627 mode1 = insn_data[icode].operand[1].mode;
14628 mode2 = insn_data[icode].operand[2].mode;
14629 mode3 = insn_data[icode].operand[3].mode;
14630 mode4 = insn_data[icode].operand[4].mode;
14631
14632 /* The scatter instruction stores operand op3 to memory using
14633 indices from op2 and the scale from op4, under writemask op1.
14634 If the index operand op2 has more elements than the source operand
14635 op3, only its low half is used, and vice versa.  */
14636 switch (fcode)
14637 {
14638 case IX86_BUILTIN_SCATTERALTSIV8DF:
14639 case IX86_BUILTIN_SCATTERALTSIV8DI:
14640 half = gen_reg_rtx (V8SImode);
14641 if (!nonimmediate_operand (op2, V16SImode))
14642 op2 = copy_to_mode_reg (V16SImode, op2);
14643 emit_insn (gen_vec_extract_lo_v16si (half, op2));
14644 op2 = half;
14645 break;
14646 case IX86_BUILTIN_SCATTERALTDIV16SF:
14647 case IX86_BUILTIN_SCATTERALTDIV16SI:
14648 half = gen_reg_rtx (mode3);
14649 if (mode3 == V8SFmode)
14650 gen = gen_vec_extract_lo_v16sf;
14651 else
14652 gen = gen_vec_extract_lo_v16si;
14653 if (!nonimmediate_operand (op3, GET_MODE (op3)))
14654 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
14655 emit_insn (gen (half, op3));
14656 op3 = half;
14657 break;
14658 case IX86_BUILTIN_SCATTERALTSIV4DF:
14659 case IX86_BUILTIN_SCATTERALTSIV4DI:
14660 half = gen_reg_rtx (V4SImode);
14661 if (!nonimmediate_operand (op2, V8SImode))
14662 op2 = copy_to_mode_reg (V8SImode, op2);
14663 emit_insn (gen_vec_extract_lo_v8si (half, op2));
14664 op2 = half;
14665 break;
14666 case IX86_BUILTIN_SCATTERALTDIV8SF:
14667 case IX86_BUILTIN_SCATTERALTDIV8SI:
14668 half = gen_reg_rtx (mode3);
14669 if (mode3 == V4SFmode)
14670 gen = gen_vec_extract_lo_v8sf;
14671 else
14672 gen = gen_vec_extract_lo_v8si;
14673 if (!nonimmediate_operand (op3, GET_MODE (op3)))
14674 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
14675 emit_insn (gen (half, op3));
14676 op3 = half;
14677 break;
14678 case IX86_BUILTIN_SCATTERALTSIV2DF:
14679 case IX86_BUILTIN_SCATTERALTSIV2DI:
14680 if (!nonimmediate_operand (op2, V4SImode))
14681 op2 = copy_to_mode_reg (V4SImode, op2);
14682 break;
14683 case IX86_BUILTIN_SCATTERALTDIV4SF:
14684 case IX86_BUILTIN_SCATTERALTDIV4SI:
14685 if (!nonimmediate_operand (op3, GET_MODE (op3)))
14686 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
14687 break;
14688 default:
14689 break;
14690 }
14691
14692 /* Force the memory operand to use only a base register here; we
14693 don't want to do this for the memory operands of other builtin
14694 functions.  */
14695 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
14696
14697 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
14698 op0 = copy_to_mode_reg (Pmode, op0);
14699
14700 op1 = fixup_modeless_constant (op1, mode1);
14701
14702 if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
14703 {
14704 if (!insn_data[icode].operand[1].predicate (op1, mode1))
14705 op1 = copy_to_mode_reg (mode1, op1);
14706 }
14707 else
14708 {
14709 op1 = copy_to_reg (op1);
14710 op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
14711 }
14712
14713 if (!insn_data[icode].operand[2].predicate (op2, mode2))
14714 op2 = copy_to_mode_reg (mode2, op2);
14715
14716 if (!insn_data[icode].operand[3].predicate (op3, mode3))
14717 op3 = copy_to_mode_reg (mode3, op3);
14718
14719 if (!insn_data[icode].operand[4].predicate (op4, mode4))
14720 {
14721 error ("the last argument must be scale 1, 2, 4, 8");
14722 return const0_rtx;
14723 }
14724
14725 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
14726 if (! pat)
14727 return const0_rtx;
14728
14729 emit_insn (pat);
14730 return 0;
14731
14732 vec_prefetch_gen:
14733 arg0 = CALL_EXPR_ARG (exp, 0);
14734 arg1 = CALL_EXPR_ARG (exp, 1);
14735 arg2 = CALL_EXPR_ARG (exp, 2);
14736 arg3 = CALL_EXPR_ARG (exp, 3);
14737 arg4 = CALL_EXPR_ARG (exp, 4);
14738 op0 = expand_normal (arg0);
14739 op1 = expand_normal (arg1);
14740 op2 = expand_normal (arg2);
14741 op3 = expand_normal (arg3);
14742 op4 = expand_normal (arg4);
14743 mode0 = insn_data[icode].operand[0].mode;
14744 mode1 = insn_data[icode].operand[1].mode;
14745 mode3 = insn_data[icode].operand[3].mode;
14746 mode4 = insn_data[icode].operand[4].mode;
14747
14748 op0 = fixup_modeless_constant (op0, mode0);
14749
14750 if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
14751 {
14752 if (!insn_data[icode].operand[0].predicate (op0, mode0))
14753 op0 = copy_to_mode_reg (mode0, op0);
14754 }
14755 else
14756 {
14757 op0 = copy_to_reg (op0);
14758 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
14759 }
14760
14761 if (!insn_data[icode].operand[1].predicate (op1, mode1))
14762 op1 = copy_to_mode_reg (mode1, op1);
14763
14764 /* Force the memory operand to use only a base register here; we
14765 don't want to do this for the memory operands of other builtin
14766 functions.  */
14767 op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
14768
14769 if (!insn_data[icode].operand[2].predicate (op2, Pmode))
14770 op2 = copy_to_mode_reg (Pmode, op2);
14771
14772 if (!insn_data[icode].operand[3].predicate (op3, mode3))
14773 {
14774 error ("the fourth argument must be scale 1, 2, 4, 8");
14775 return const0_rtx;
14776 }
14777
14778 if (!insn_data[icode].operand[4].predicate (op4, mode4))
14779 {
14780 error ("incorrect hint operand");
14781 return const0_rtx;
14782 }
14783
14784 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
14785 if (! pat)
14786 return const0_rtx;
14787
14788 emit_insn (pat);
14789
14790 return 0;
14791
14792 case IX86_BUILTIN_XABORT:
14793 icode = CODE_FOR_xabort;
14794 arg0 = CALL_EXPR_ARG (exp, 0);
14795 op0 = expand_normal (arg0);
14796 mode0 = insn_data[icode].operand[0].mode;
14797 if (!insn_data[icode].operand[0].predicate (op0, mode0))
14798 {
14799 error ("the argument to %<xabort%> intrinsic must "
14800 "be an 8-bit immediate");
14801 return const0_rtx;
14802 }
14803 emit_insn (gen_xabort (op0));
14804 return 0;
14805
14806 case IX86_BUILTIN_RDSSPD:
14807 case IX86_BUILTIN_RDSSPQ:
14808 mode = (fcode == IX86_BUILTIN_RDSSPD ? SImode : DImode);
14809
14810 if (target == 0
14811 || !register_operand (target, mode))
14812 target = gen_reg_rtx (mode);
14813
14814 op0 = force_reg (mode, const0_rtx);
14815
14816 emit_insn (gen_rdssp (mode, target, op0));
14817 return target;
14818
14819 case IX86_BUILTIN_INCSSPD:
14820 case IX86_BUILTIN_INCSSPQ:
14821 mode = (fcode == IX86_BUILTIN_INCSSPD ? SImode : DImode);
14822
14823 arg0 = CALL_EXPR_ARG (exp, 0);
14824 op0 = expand_normal (arg0);
14825
14826 op0 = force_reg (mode, op0);
14827
14828 emit_insn (gen_incssp (mode, op0));
14829 return 0;
14830
14831 case IX86_BUILTIN_HRESET:
14832 icode = CODE_FOR_hreset;
14833 arg0 = CALL_EXPR_ARG (exp, 0);
14834 op0 = expand_normal (arg0);
14835 op0 = force_reg (SImode, op0);
14836 emit_insn (gen_hreset (op0));
14837 return 0;
14838
14839 case IX86_BUILTIN_RSTORSSP:
14840 case IX86_BUILTIN_CLRSSBSY:
14841 arg0 = CALL_EXPR_ARG (exp, 0);
14842 op0 = expand_normal (arg0);
14843 icode = (fcode == IX86_BUILTIN_RSTORSSP
14844 ? CODE_FOR_rstorssp
14845 : CODE_FOR_clrssbsy);
14846
14847 if (!address_operand (op0, VOIDmode))
14848 {
14849 op0 = convert_memory_address (Pmode, op0);
14850 op0 = copy_addr_to_reg (op0);
14851 }
14852 emit_insn (GEN_FCN (icode) (gen_rtx_MEM (DImode, op0)));
14853 return 0;
14854
14855 case IX86_BUILTIN_WRSSD:
14856 case IX86_BUILTIN_WRSSQ:
14857 case IX86_BUILTIN_WRUSSD:
14858 case IX86_BUILTIN_WRUSSQ:
14859 mode = ((fcode == IX86_BUILTIN_WRSSD
14860 || fcode == IX86_BUILTIN_WRUSSD)
14861 ? SImode : DImode);
14862
14863 arg0 = CALL_EXPR_ARG (exp, 0);
14864 op0 = expand_normal (arg0);
14865 arg1 = CALL_EXPR_ARG (exp, 1);
14866 op1 = expand_normal (arg1);
14867
14868 op0 = force_reg (mode, op0);
14869
14870 if (!address_operand (op1, VOIDmode))
14871 {
14872 op1 = convert_memory_address (Pmode, op1);
14873 op1 = copy_addr_to_reg (op1);
14874 }
14875 op1 = gen_rtx_MEM (mode, op1);
14876
14877 icode = ((fcode == IX86_BUILTIN_WRSSD
14878 || fcode == IX86_BUILTIN_WRSSQ)
14879 ? code_for_wrss (mode)
14880 : code_for_wruss (mode));
14881 emit_insn (GEN_FCN (icode) (op0, op1));
14882
14883 return 0;
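/* For illustration only (assuming the usual <immintrin.h> CET wrappers):
   a shadow-stack write such as _wrssd (value, addr) reaches this code
   with OP0 = value forced into an SImode register and OP1 = addr wrapped
   into an SImode MEM, while the *Q variants use DImode; WRSS vs. WRUSS
   only changes which insn pattern is emitted.  */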
14884
14885 default:
14886 break;
14887 }
14888
14889 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
14890 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
14891 {
14892 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
14893 return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
14894 target);
14895 }
14896
14897 if (fcode >= IX86_BUILTIN__BDESC_PURE_ARGS_FIRST
14898 && fcode <= IX86_BUILTIN__BDESC_PURE_ARGS_LAST)
14899 {
14900 i = fcode - IX86_BUILTIN__BDESC_PURE_ARGS_FIRST;
14901 return ix86_expand_special_args_builtin (bdesc_pure_args + i, exp,
14902 target);
14903 }
14904
14905 if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
14906 && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
14907 {
14908 i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
14909 rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
14910 rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
14911 rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
14912 int masked = 1;
14913 machine_mode mode, wide_mode, nar_mode;
14914
14915 nar_mode = V4SFmode;
14916 mode = V16SFmode;
14917 wide_mode = V64SFmode;
14918 fcn_mask = gen_avx5124fmaddps_4fmaddps_mask;
14919 fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
14920
14921 switch (fcode)
14922 {
14923 case IX86_BUILTIN_4FMAPS:
14924 fcn = gen_avx5124fmaddps_4fmaddps;
14925 masked = 0;
14926 goto v4fma_expand;
14927
14928 case IX86_BUILTIN_4DPWSSD:
14929 nar_mode = V4SImode;
14930 mode = V16SImode;
14931 wide_mode = V64SImode;
14932 fcn = gen_avx5124vnniw_vp4dpwssd;
14933 masked = 0;
14934 goto v4fma_expand;
14935
14936 case IX86_BUILTIN_4DPWSSDS:
14937 nar_mode = V4SImode;
14938 mode = V16SImode;
14939 wide_mode = V64SImode;
14940 fcn = gen_avx5124vnniw_vp4dpwssds;
14941 masked = 0;
14942 goto v4fma_expand;
14943
14944 case IX86_BUILTIN_4FNMAPS:
14945 fcn = gen_avx5124fmaddps_4fnmaddps;
14946 masked = 0;
14947 goto v4fma_expand;
14948
14949 case IX86_BUILTIN_4FNMAPS_MASK:
14950 fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask;
14951 fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
14952 goto v4fma_expand;
14953
14954 case IX86_BUILTIN_4DPWSSD_MASK:
14955 nar_mode = V4SImode;
14956 mode = V16SImode;
14957 wide_mode = V64SImode;
14958 fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask;
14959 fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
14960 goto v4fma_expand;
14961
14962 case IX86_BUILTIN_4DPWSSDS_MASK:
14963 nar_mode = V4SImode;
14964 mode = V16SImode;
14965 wide_mode = V64SImode;
14966 fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask;
14967 fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
14968 goto v4fma_expand;
14969
14970 case IX86_BUILTIN_4FMAPS_MASK:
14971 {
14972 tree args[4];
14973 rtx ops[4];
14974 rtx wide_reg;
14975 rtx accum;
14976 rtx addr;
14977 rtx mem;
14978
14979 v4fma_expand:
14980 wide_reg = gen_reg_rtx (wide_mode);
14981 for (i = 0; i < 4; i++)
14982 {
14983 args[i] = CALL_EXPR_ARG (exp, i);
14984 ops[i] = expand_normal (args[i]);
14985
14986 emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
14987 ops[i]);
14988 }
14989
14990 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
14991 accum = force_reg (mode, accum);
14992
14993 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
14994 addr = force_reg (Pmode, addr);
14995
14996 mem = gen_rtx_MEM (nar_mode, addr);
14997
14998 target = gen_reg_rtx (mode);
14999
15000 emit_move_insn (target, accum);
15001
15002 if (! masked)
15003 emit_insn (fcn (target, accum, wide_reg, mem));
15004 else
15005 {
15006 rtx merge, mask;
15007 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
15008
15009 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
15010
15011 if (CONST_INT_P (mask))
15012 mask = fixup_modeless_constant (mask, HImode);
15013
15014 mask = force_reg (HImode, mask);
15015
15016 if (GET_MODE (mask) != HImode)
15017 mask = gen_rtx_SUBREG (HImode, mask, 0);
15018
15019 /* If merge is 0 then we're about to emit z-masked variant. */
15020 if (const0_operand (merge, mode))
15021 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
15022 /* If merge is the same as accum then emit merge-masked variant. */
15023 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
15024 {
15025 merge = force_reg (mode, merge);
15026 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
15027 }
15028 /* Merging with an unknown value can happen if we z-mask with -O0. */
15029 else
15030 {
15031 target = gen_reg_rtx (mode);
15032 emit_move_insn (target, merge);
15033 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
15034 }
15035 }
15036 return target;
15037 }
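/* A note on the v4fma_expand/s4fma_expand paths: the four narrow source
   vectors are written into consecutive 64-byte chunks of a single
   V64SF/V64SI pseudo via SUBREGs at byte offset i * 64, mirroring the
   register-group operand of the 4FMAPS/4VNNIW instructions; the masked
   variants then pick the maskz form when the merge operand is zero and
   the merge form when it is the same tree as the accumulator.  */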
15038
15039 case IX86_BUILTIN_4FNMASS:
15040 fcn = gen_avx5124fmaddps_4fnmaddss;
15041 masked = 0;
15042 goto s4fma_expand;
15043
15044 case IX86_BUILTIN_4FMASS:
15045 fcn = gen_avx5124fmaddps_4fmaddss;
15046 masked = 0;
15047 goto s4fma_expand;
15048
15049 case IX86_BUILTIN_4FNMASS_MASK:
15050 fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
15051 fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
15052 goto s4fma_expand;
15053
15054 case IX86_BUILTIN_4FMASS_MASK:
15055 {
15056 tree args[4];
15057 rtx ops[4];
15058 rtx wide_reg;
15059 rtx accum;
15060 rtx addr;
15061 rtx mem;
15062
15063 fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
15064 fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
15065
15066 s4fma_expand:
15067 mode = V4SFmode;
15068 wide_reg = gen_reg_rtx (V64SFmode);
15069 for (i = 0; i < 4; i++)
15070 {
15071 rtx tmp;
15072 args[i] = CALL_EXPR_ARG (exp, i);
15073 ops[i] = expand_normal (args[i]);
15074
15075 tmp = gen_reg_rtx (SFmode);
15076 emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
15077
15078 emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
15079 gen_rtx_SUBREG (V16SFmode, tmp, 0));
15080 }
15081
15082 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
15083 accum = force_reg (V4SFmode, accum);
15084
15085 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
15086 addr = force_reg (Pmode, addr);
15087
15088 mem = gen_rtx_MEM (V4SFmode, addr);
15089
15090 target = gen_reg_rtx (V4SFmode);
15091
15092 emit_move_insn (target, accum);
15093
15094 if (! masked)
15095 emit_insn (fcn (target, accum, wide_reg, mem));
15096 else
15097 {
15098 rtx merge, mask;
15099 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
15100
15101 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
15102
15103 if (CONST_INT_P (mask))
15104 mask = fixup_modeless_constant (mask, QImode);
15105
15106 mask = force_reg (QImode, mask);
15107
15108 if (GET_MODE (mask) != QImode)
15109 mask = gen_rtx_SUBREG (QImode, mask, 0);
15110
15111 /* If merge is 0 then we're about to emit z-masked variant. */
15112 if (const0_operand (merge, mode))
15113 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
15114 /* If merge is the same as accum then emit merge-masked
15115 variant. */
15116 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
15117 {
15118 merge = force_reg (mode, merge);
15119 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
15120 }
15121 /* Merging with an unknown value can happen if we z-mask
15122 with -O0. */
15123 else
15124 {
15125 target = gen_reg_rtx (mode);
15126 emit_move_insn (target, merge);
15127 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
15128 }
15129 }
15130 return target;
15131 }
15132 case IX86_BUILTIN_RDPID:
15133 return ix86_expand_special_args_builtin (bdesc_args + i, exp,
15134 target);
15135 case IX86_BUILTIN_FABSQ:
15136 case IX86_BUILTIN_COPYSIGNQ:
15137 if (!TARGET_SSE)
15138 /* Emit a normal call if SSE isn't available. */
15139 return expand_call (exp, target, ignore);
15140 /* FALLTHRU */
15141 default:
15142 return ix86_expand_args_builtin (bdesc_args + i, exp, target);
15143 }
15144 }
15145
15146 if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
15147 && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
15148 {
15149 i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
15150 return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
15151 }
15152
15153 if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
15154 && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
15155 {
15156 i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
15157 return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
15158 }
15159
15160 if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
15161 && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
15162 {
15163 i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
15164 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
15165 }
15166
15167 if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
15168 && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
15169 {
15170 i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
15171 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
15172 }
15173
15174 if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
15175 && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
15176 {
15177 i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
15178 const struct builtin_description *d = bdesc_multi_arg + i;
15179 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
15180 (enum ix86_builtin_func_type)
15181 d->flag, d->comparison);
15182 }
15183
15184 if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
15185 && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
15186 {
15187 i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
15188 return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
15189 target);
15190 }
15191
15192 gcc_unreachable ();
15193 }
15194
15195 /* A subroutine of ix86_expand_vector_init_duplicate. Tries to
15196 fill target with val via vec_duplicate. */
15197
15198 static bool
15199 ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
15200 {
15201 bool ok;
15202 rtx_insn *insn;
15203 rtx dup;
15204 /* Save/restore recog_data in case this is called from splitters
15205 or other routines where recog_data needs to stay valid across
15206 force_reg. See PR106577. */
15207 recog_data_d recog_data_save = recog_data;
15208
15209 /* First attempt to recognize VAL as-is. */
15210 dup = gen_vec_duplicate (mode, val);
15211 insn = emit_insn (gen_rtx_SET (target, dup));
15212 if (recog_memoized (insn) < 0)
15213 {
15214 rtx_insn *seq;
15215 machine_mode innermode = GET_MODE_INNER (mode);
15216 rtx reg;
15217
15218 /* If that fails, force VAL into a register. */
15219
15220 start_sequence ();
15221 reg = force_reg (innermode, val);
15222 if (GET_MODE (reg) != innermode)
15223 reg = gen_lowpart (innermode, reg);
15224 SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
15225 seq = get_insns ();
15226 end_sequence ();
15227 if (seq)
15228 emit_insn_before (seq, insn);
15229
15230 ok = recog_memoized (insn) >= 0;
15231 gcc_assert (ok);
15232 }
15233 recog_data = recog_data_save;
15234 return true;
15235 }
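/* For illustration, a successful expansion for V8SImode emits roughly
     (set (reg:V8SI target) (vec_duplicate:V8SI (reg:SI val)))
   and only if that SET is not recognized is VAL forced into a fresh
   inner-mode register (emitted before the SET) and the SET_SRC rewritten
   to duplicate that register instead.  */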
15236
15237 /* Get a vector mode of the same size as the original but with elements
15238 twice as wide. This is only guaranteed to apply to integral vectors. */
15239
15240 static machine_mode
15241 get_mode_wider_vector (machine_mode o)
15242 {
15243 /* ??? Rely on the ordering that genmodes.cc gives to vectors. */
15244 machine_mode n = GET_MODE_NEXT_MODE (o).require ();
15245 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
15246 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
15247 return n;
15248 }
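/* For example, V16QImode (sixteen 8-bit elements, 16 bytes) maps to
   V8HImode (eight 16-bit elements, still 16 bytes); the asserts above
   check exactly this half-the-lanes, same-size relationship.  */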
15249
15250 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
15251 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
15252
15253 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
15254 with all elements equal to VAL. Return true if successful. */
15255
15256 bool
15257 ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
15258 rtx target, rtx val)
15259 {
15260 bool ok;
15261
15262 switch (mode)
15263 {
15264 case E_V2SImode:
15265 case E_V2SFmode:
15266 if (!mmx_ok)
15267 return false;
15268 /* FALLTHRU */
15269
15270 case E_V4DFmode:
15271 case E_V4DImode:
15272 case E_V8SFmode:
15273 case E_V8SImode:
15274 case E_V2DFmode:
15275 case E_V2DImode:
15276 case E_V4SFmode:
15277 case E_V4SImode:
15278 case E_V16SImode:
15279 case E_V8DImode:
15280 case E_V16SFmode:
15281 case E_V8DFmode:
15282 return ix86_vector_duplicate_value (mode, target, val);
15283
15284 case E_V4HImode:
15285 if (!mmx_ok)
15286 return false;
15287 if (TARGET_SSE || TARGET_3DNOW_A)
15288 {
15289 rtx x;
15290
15291 val = gen_lowpart (SImode, val);
15292 x = gen_rtx_TRUNCATE (HImode, val);
15293 x = gen_rtx_VEC_DUPLICATE (mode, x);
15294 emit_insn (gen_rtx_SET (target, x));
15295 return true;
15296 }
15297 goto widen;
15298
15299 case E_V2HImode:
15300 if (TARGET_SSE2)
15301 {
15302 rtx x;
15303
15304 val = gen_lowpart (SImode, val);
15305 x = gen_rtx_TRUNCATE (HImode, val);
15306 x = gen_rtx_VEC_DUPLICATE (mode, x);
15307 emit_insn (gen_rtx_SET (target, x));
15308 return true;
15309 }
15310 return false;
15311
15312 case E_V8QImode:
15313 case E_V4QImode:
15314 if (!mmx_ok)
15315 return false;
15316 goto widen;
15317
15318 case E_V8HImode:
15319 case E_V8HFmode:
15320 case E_V8BFmode:
15321 if (TARGET_AVX2)
15322 return ix86_vector_duplicate_value (mode, target, val);
15323
15324 if (TARGET_SSE2)
15325 {
15326 struct expand_vec_perm_d dperm;
15327 rtx tmp1, tmp2;
15328
15329 permute:
15330 memset (&dperm, 0, sizeof (dperm));
15331 dperm.target = target;
15332 dperm.vmode = mode;
15333 dperm.nelt = GET_MODE_NUNITS (mode);
15334 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
15335 dperm.one_operand_p = true;
15336
15337 if (mode == V8HFmode || mode == V8BFmode)
15338 {
15339 tmp1 = force_reg (GET_MODE_INNER (mode), val);
15340 tmp2 = gen_reg_rtx (mode);
15341 emit_insn (maybe_gen_vec_set_0 (mode, tmp2,
15342 CONST0_RTX (mode), tmp1));
15343 tmp1 = gen_lowpart (mode, tmp2);
15344 }
15345 else
15346 {
15347 /* Extend to SImode using a paradoxical SUBREG. */
15348 tmp1 = gen_reg_rtx (SImode);
15349 emit_move_insn (tmp1, gen_lowpart (SImode, val));
15350
15351 /* Insert the SImode value as
15352 low element of a V4SImode vector. */
15353 tmp2 = gen_reg_rtx (V4SImode);
15354 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
15355 tmp1 = gen_lowpart (mode, tmp2);
15356 }
15357
15358 emit_move_insn (dperm.op0, tmp1);
15359 ok = (expand_vec_perm_1 (&dperm)
15360 || expand_vec_perm_broadcast_1 (&dperm));
15361 gcc_assert (ok);
15362 return ok;
15363 }
15364 goto widen;
15365
15366 case E_V16QImode:
15367 if (TARGET_AVX2)
15368 return ix86_vector_duplicate_value (mode, target, val);
15369
15370 if (TARGET_SSE2)
15371 goto permute;
15372 goto widen;
15373
15374 widen:
15375 /* Replicate the value once into the next wider mode and recurse. */
15376 {
15377 machine_mode smode, wsmode, wvmode;
15378 rtx x;
15379
15380 smode = GET_MODE_INNER (mode);
15381 wvmode = get_mode_wider_vector (mode);
15382 wsmode = GET_MODE_INNER (wvmode);
15383
15384 val = convert_modes (wsmode, smode, val, true);
15385
15386 if (smode == QImode && !TARGET_PARTIAL_REG_STALL)
15387 emit_insn (gen_insv_1 (wsmode, val, val));
15388 else
15389 {
15390 x = expand_simple_binop (wsmode, ASHIFT, val,
15391 GEN_INT (GET_MODE_BITSIZE (smode)),
15392 NULL_RTX, 1, OPTAB_LIB_WIDEN);
15393 val = expand_simple_binop (wsmode, IOR, val, x, x, 1,
15394 OPTAB_LIB_WIDEN);
15395 }
15396
15397 x = gen_reg_rtx (wvmode);
15398 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
15399 gcc_assert (ok);
15400 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
15401 return ok;
15402 }
15403
15404 case E_V16HImode:
15405 case E_V16HFmode:
15406 case E_V16BFmode:
15407 case E_V32QImode:
15408 if (TARGET_AVX2)
15409 return ix86_vector_duplicate_value (mode, target, val);
15410 else
15411 {
15412 machine_mode hvmode;
15413 switch (mode)
15414 {
15415 case V16HImode:
15416 hvmode = V8HImode;
15417 break;
15418 case V16HFmode:
15419 hvmode = V8HFmode;
15420 break;
15421 case V16BFmode:
15422 hvmode = V8BFmode;
15423 break;
15424 case V32QImode:
15425 hvmode = V16QImode;
15426 break;
15427 default:
15428 gcc_unreachable ();
15429 }
15430 rtx x = gen_reg_rtx (hvmode);
15431
15432 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
15433 gcc_assert (ok);
15434
15435 x = gen_rtx_VEC_CONCAT (mode, x, x);
15436 emit_insn (gen_rtx_SET (target, x));
15437 }
15438 return true;
15439
15440 case E_V32HImode:
15441 case E_V32HFmode:
15442 case E_V32BFmode:
15443 case E_V64QImode:
15444 if (TARGET_AVX512BW)
15445 return ix86_vector_duplicate_value (mode, target, val);
15446 else
15447 {
15448 machine_mode hvmode;
15449 switch (mode)
15450 {
15451 case V32HImode:
15452 hvmode = V16HImode;
15453 break;
15454 case V32HFmode:
15455 hvmode = V16HFmode;
15456 break;
15457 case V32BFmode:
15458 hvmode = V16BFmode;
15459 break;
15460 case V64QImode:
15461 hvmode = V32QImode;
15462 break;
15463 default:
15464 gcc_unreachable ();
15465 }
15466 rtx x = gen_reg_rtx (hvmode);
15467
15468 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
15469 gcc_assert (ok);
15470
15471 x = gen_rtx_VEC_CONCAT (mode, x, x);
15472 emit_insn (gen_rtx_SET (target, x));
15473 }
15474 return true;
15475
15476 default:
15477 return false;
15478 }
15479 }
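/* A rough example of the "widen" fallback above: broadcasting a QImode
   value X into V8QImode first forms the HImode value (X << 8) | X
   (via the insv path when partial-register stalls are not a concern,
   or an explicit shift-and-IOR otherwise), then recurses to broadcast
   that HImode value into V4HImode, and finally reinterprets the result
   in the original mode via gen_lowpart.  */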
15480
15481 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
15482 whose ONE_VAR element is VAR, and other elements are zero. Return true
15483 if successful. */
15484
15485 static bool
15486 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
15487 rtx target, rtx var, int one_var)
15488 {
15489 machine_mode vsimode;
15490 rtx new_target;
15491 rtx x, tmp;
15492 bool use_vector_set = false;
15493 rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;
15494
15495 switch (mode)
15496 {
15497 case E_V2DImode:
15498 /* For SSE4.1, we normally use vector set. But if the second
15499 element is zero and inter-unit moves are OK, we use movq
15500 instead. */
15501 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
15502 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
15503 && one_var == 0));
15504 break;
15505 case E_V16QImode:
15506 case E_V4SImode:
15507 case E_V4SFmode:
15508 use_vector_set = TARGET_SSE4_1;
15509 break;
15510 case E_V8HImode:
15511 use_vector_set = TARGET_SSE2;
15512 gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0
15513 ? gen_vec_setv8hi_0 : NULL;
15514 break;
15515 case E_V8QImode:
15516 use_vector_set = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
15517 break;
15518 case E_V4HImode:
15519 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
15520 break;
15521 case E_V4QImode:
15522 use_vector_set = TARGET_SSE4_1;
15523 break;
15524 case E_V32QImode:
15525 use_vector_set = TARGET_AVX;
15526 break;
15527 case E_V16HImode:
15528 use_vector_set = TARGET_AVX;
15529 gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0
15530 ? gen_vec_setv16hi_0 : NULL;
15531 break;
15532 case E_V8SImode:
15533 use_vector_set = TARGET_AVX;
15534 gen_vec_set_0 = gen_vec_setv8si_0;
15535 break;
15536 case E_V8SFmode:
15537 use_vector_set = TARGET_AVX;
15538 gen_vec_set_0 = gen_vec_setv8sf_0;
15539 break;
15540 case E_V4DFmode:
15541 use_vector_set = TARGET_AVX;
15542 gen_vec_set_0 = gen_vec_setv4df_0;
15543 break;
15544 case E_V4DImode:
15545 /* Use ix86_expand_vector_set in 64bit mode only. */
15546 use_vector_set = TARGET_AVX && TARGET_64BIT;
15547 gen_vec_set_0 = gen_vec_setv4di_0;
15548 break;
15549 case E_V16SImode:
15550 use_vector_set = TARGET_AVX512F && one_var == 0;
15551 gen_vec_set_0 = gen_vec_setv16si_0;
15552 break;
15553 case E_V16SFmode:
15554 use_vector_set = TARGET_AVX512F && one_var == 0;
15555 gen_vec_set_0 = gen_vec_setv16sf_0;
15556 break;
15557 case E_V8DFmode:
15558 use_vector_set = TARGET_AVX512F && one_var == 0;
15559 gen_vec_set_0 = gen_vec_setv8df_0;
15560 break;
15561 case E_V8DImode:
15562 /* Use ix86_expand_vector_set in 64bit mode only. */
15563 use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
15564 gen_vec_set_0 = gen_vec_setv8di_0;
15565 break;
15566 case E_V8HFmode:
15567 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15568 gen_vec_set_0 = gen_vec_setv8hf_0;
15569 break;
15570 case E_V16HFmode:
15571 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15572 gen_vec_set_0 = gen_vec_setv16hf_0;
15573 break;
15574 case E_V32HFmode:
15575 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15576 gen_vec_set_0 = gen_vec_setv32hf_0;
15577 break;
15578 case E_V8BFmode:
15579 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15580 gen_vec_set_0 = gen_vec_setv8bf_0;
15581 break;
15582 case E_V16BFmode:
15583 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15584 gen_vec_set_0 = gen_vec_setv16bf_0;
15585 break;
15586 case E_V32BFmode:
15587 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15588 gen_vec_set_0 = gen_vec_setv32bf_0;
15589 break;
15590 case E_V32HImode:
15591 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15592 gen_vec_set_0 = gen_vec_setv32hi_0;
15593 default:
15594 break;
15595 }
15596
15597 if (use_vector_set)
15598 {
15599 if (gen_vec_set_0 && one_var == 0)
15600 {
15601 var = force_reg (GET_MODE_INNER (mode), var);
15602 emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
15603 return true;
15604 }
15605 emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
15606 var = force_reg (GET_MODE_INNER (mode), var);
15607 ix86_expand_vector_set (mmx_ok, target, var, one_var);
15608 return true;
15609 }
15610
15611 switch (mode)
15612 {
15613 case E_V2SFmode:
15614 case E_V2SImode:
15615 if (!mmx_ok)
15616 return false;
15617 /* FALLTHRU */
15618
15619 case E_V2DFmode:
15620 case E_V2DImode:
15621 if (one_var != 0)
15622 return false;
15623 var = force_reg (GET_MODE_INNER (mode), var);
15624 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
15625 emit_insn (gen_rtx_SET (target, x));
15626 return true;
15627
15628 case E_V4SFmode:
15629 case E_V4SImode:
15630 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
15631 new_target = gen_reg_rtx (mode);
15632 else
15633 new_target = target;
15634 var = force_reg (GET_MODE_INNER (mode), var);
15635 x = gen_rtx_VEC_DUPLICATE (mode, var);
15636 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
15637 emit_insn (gen_rtx_SET (new_target, x));
15638 if (one_var != 0)
15639 {
15640 /* We need to shuffle the value to the correct position, so
15641 create a new pseudo to store the intermediate result. */
15642
15643 /* With SSE2, we can use the integer shuffle insns. */
15644 if (mode != V4SFmode && TARGET_SSE2)
15645 {
15646 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
15647 const1_rtx,
15648 GEN_INT (one_var == 1 ? 0 : 1),
15649 GEN_INT (one_var == 2 ? 0 : 1),
15650 GEN_INT (one_var == 3 ? 0 : 1)));
15651 if (target != new_target)
15652 emit_move_insn (target, new_target);
15653 return true;
15654 }
15655
15656 /* Otherwise convert the intermediate result to V4SFmode and
15657 use the SSE1 shuffle instructions. */
15658 if (mode != V4SFmode)
15659 {
15660 tmp = gen_reg_rtx (V4SFmode);
15661 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
15662 }
15663 else
15664 tmp = new_target;
15665
15666 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
15667 const1_rtx,
15668 GEN_INT (one_var == 1 ? 0 : 1),
15669 GEN_INT (one_var == 2 ? 0+4 : 1+4),
15670 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
15671
15672 if (mode != V4SFmode)
15673 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
15674 else if (tmp != target)
15675 emit_move_insn (target, tmp);
15676 }
15677 else if (target != new_target)
15678 emit_move_insn (target, new_target);
15679 return true;
15680
15681 case E_V8HImode:
15682 case E_V16QImode:
15683 vsimode = V4SImode;
15684 goto widen;
15685 case E_V4HImode:
15686 case E_V8QImode:
15687 if (!mmx_ok)
15688 return false;
15689 vsimode = V2SImode;
15690 goto widen;
15691 widen:
15692 if (one_var != 0)
15693 return false;
15694
15695 /* Zero extend the variable element to SImode and recurse. */
15696 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
15697
15698 x = gen_reg_rtx (vsimode);
15699 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
15700 var, one_var))
15701 gcc_unreachable ();
15702
15703 emit_move_insn (target, gen_lowpart (mode, x));
15704 return true;
15705
15706 default:
15707 return false;
15708 }
15709 }
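/* A worked example of the V4SImode path above with ONE_VAR == 2 and
   SSE2: the variable element is first splatted and merged into lane 0
   of a zeroed vector (vec_duplicate + vec_merge with mask 1), and the
   pshufd that follows then moves lane 0 into lane 2 while filling the
   other lanes from the zero part, giving { 0, 0, var, 0 }.  */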
15710
15711 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
15712 consisting of the values in VALS. It is known that all elements
15713 except ONE_VAR are constants. Return true if successful. */
15714
15715 static bool
15716 ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
15717 rtx target, rtx vals, int one_var)
15718 {
15719 rtx var = XVECEXP (vals, 0, one_var);
15720 machine_mode wmode;
15721 rtx const_vec, x;
15722
15723 const_vec = copy_rtx (vals);
15724 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
15725 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
15726
15727 switch (mode)
15728 {
15729 case E_V2DFmode:
15730 case E_V2DImode:
15731 case E_V2SFmode:
15732 case E_V2SImode:
15733 /* For the two element vectors, it's just as easy to use
15734 the general case. */
15735 return false;
15736
15737 case E_V4DImode:
15738 /* Use ix86_expand_vector_set in 64bit mode only. */
15739 if (!TARGET_64BIT)
15740 return false;
15741 /* FALLTHRU */
15742 case E_V8HFmode:
15743 case E_V16HFmode:
15744 case E_V8BFmode:
15745 case E_V16BFmode:
15746 case E_V4DFmode:
15747 case E_V8SFmode:
15748 case E_V8SImode:
15749 case E_V16HImode:
15750 case E_V32QImode:
15751 case E_V4SFmode:
15752 case E_V4SImode:
15753 case E_V8HImode:
15754 case E_V4HImode:
15755 break;
15756
15757 case E_V16QImode:
15758 if (TARGET_SSE4_1)
15759 break;
15760 wmode = V8HImode;
15761 goto widen;
15762 case E_V8QImode:
15763 if (TARGET_MMX_WITH_SSE && TARGET_SSE4_1)
15764 break;
15765 wmode = V4HImode;
15766 goto widen;
15767 case E_V4QImode:
15768 if (TARGET_SSE4_1)
15769 break;
15770 wmode = V2HImode;
15771 widen:
15772 /* There's no way to set one QImode entry easily. Combine
15773 the variable value with its adjacent constant value, and
15774 promote to an HImode set. */
15775 x = XVECEXP (vals, 0, one_var ^ 1);
15776 if (one_var & 1)
15777 {
15778 var = convert_modes (HImode, QImode, var, true);
15779 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
15780 NULL_RTX, 1, OPTAB_LIB_WIDEN);
15781 x = GEN_INT (INTVAL (x) & 0xff);
15782 }
15783 else
15784 {
15785 var = convert_modes (HImode, QImode, var, true);
15786 x = gen_int_mode (UINTVAL (x) << 8, HImode);
15787 }
15788 if (x != const0_rtx)
15789 var = expand_simple_binop (HImode, IOR, var, x, var,
15790 1, OPTAB_LIB_WIDEN);
15791
15792 x = gen_reg_rtx (wmode);
15793 emit_move_insn (x, gen_lowpart (wmode, const_vec));
15794 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
15795
15796 emit_move_insn (target, gen_lowpart (mode, x));
15797 return true;
15798
15799 default:
15800 return false;
15801 }
15802
15803 emit_move_insn (target, const_vec);
15804 ix86_expand_vector_set (mmx_ok, target, var, one_var);
15805 return true;
15806 }
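/* A rough example of the QImode widening above: for a V16QImode vector
   whose only variable element X sits at an odd index, X is zero-extended
   to HImode, shifted left by 8 and IORed with the adjacent constant byte
   (masked to 0xff), and the combined 16-bit value is then inserted into
   the V8HImode view of the constant vector at position ONE_VAR >> 1.  */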
15807
15808 /* A subroutine of ix86_expand_vector_init_general. Use vector
15809 concatenate to handle the most general case: all values variable,
15810 and none identical. */
15811
15812 static void
15813 ix86_expand_vector_init_concat (machine_mode mode,
15814 rtx target, rtx *ops, int n)
15815 {
15816 machine_mode half_mode = VOIDmode;
15817 rtx half[2];
15818 rtvec v;
15819 int i, j;
15820
15821 switch (n)
15822 {
15823 case 2:
15824 switch (mode)
15825 {
15826 case E_V32HFmode:
15827 half_mode = V16HFmode;
15828 break;
15829 case E_V32BFmode:
15830 half_mode = V16BFmode;
15831 break;
15832 case E_V16SImode:
15833 half_mode = V8SImode;
15834 break;
15835 case E_V16SFmode:
15836 half_mode = V8SFmode;
15837 break;
15838 case E_V8DImode:
15839 half_mode = V4DImode;
15840 break;
15841 case E_V8DFmode:
15842 half_mode = V4DFmode;
15843 break;
15844 case E_V16HFmode:
15845 half_mode = V8HFmode;
15846 break;
15847 case E_V16BFmode:
15848 half_mode = V8BFmode;
15849 break;
15850 case E_V8SImode:
15851 half_mode = V4SImode;
15852 break;
15853 case E_V8SFmode:
15854 half_mode = V4SFmode;
15855 break;
15856 case E_V4DImode:
15857 half_mode = V2DImode;
15858 break;
15859 case E_V4DFmode:
15860 half_mode = V2DFmode;
15861 break;
15862 case E_V4SImode:
15863 half_mode = V2SImode;
15864 break;
15865 case E_V4SFmode:
15866 half_mode = V2SFmode;
15867 break;
15868 case E_V2DImode:
15869 half_mode = DImode;
15870 break;
15871 case E_V2SImode:
15872 half_mode = SImode;
15873 break;
15874 case E_V2DFmode:
15875 half_mode = DFmode;
15876 break;
15877 case E_V2SFmode:
15878 half_mode = SFmode;
15879 break;
15880 default:
15881 gcc_unreachable ();
15882 }
15883
15884 if (!register_operand (ops[1], half_mode))
15885 ops[1] = force_reg (half_mode, ops[1]);
15886 if (!register_operand (ops[0], half_mode))
15887 ops[0] = force_reg (half_mode, ops[0]);
15888 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
15889 ops[1])));
15890 break;
15891
15892 case 4:
15893 switch (mode)
15894 {
15895 case E_V4DImode:
15896 half_mode = V2DImode;
15897 break;
15898 case E_V4DFmode:
15899 half_mode = V2DFmode;
15900 break;
15901 case E_V4SImode:
15902 half_mode = V2SImode;
15903 break;
15904 case E_V4SFmode:
15905 half_mode = V2SFmode;
15906 break;
15907 default:
15908 gcc_unreachable ();
15909 }
15910 goto half;
15911
15912 case 8:
15913 switch (mode)
15914 {
15915 case E_V8DImode:
15916 half_mode = V4DImode;
15917 break;
15918 case E_V8DFmode:
15919 half_mode = V4DFmode;
15920 break;
15921 case E_V8SImode:
15922 half_mode = V4SImode;
15923 break;
15924 case E_V8SFmode:
15925 half_mode = V4SFmode;
15926 break;
15927 default:
15928 gcc_unreachable ();
15929 }
15930 goto half;
15931
15932 case 16:
15933 switch (mode)
15934 {
15935 case E_V16SImode:
15936 half_mode = V8SImode;
15937 break;
15938 case E_V16SFmode:
15939 half_mode = V8SFmode;
15940 break;
15941 default:
15942 gcc_unreachable ();
15943 }
15944 goto half;
15945
15946 half:
15947 /* FIXME: We process inputs backward to help RA. PR 36222. */
15948 i = n - 1;
15949 for (j = 1; j != -1; j--)
15950 {
15951 half[j] = gen_reg_rtx (half_mode);
15952 switch (n >> 1)
15953 {
15954 case 2:
15955 v = gen_rtvec (2, ops[i-1], ops[i]);
15956 i -= 2;
15957 break;
15958 case 4:
15959 v = gen_rtvec (4, ops[i-3], ops[i-2], ops[i-1], ops[i]);
15960 i -= 4;
15961 break;
15962 case 8:
15963 v = gen_rtvec (8, ops[i-7], ops[i-6], ops[i-5], ops[i-4],
15964 ops[i-3], ops[i-2], ops[i-1], ops[i]);
15965 i -= 8;
15966 break;
15967 default:
15968 gcc_unreachable ();
15969 }
15970 ix86_expand_vector_init (false, half[j],
15971 gen_rtx_PARALLEL (half_mode, v));
15972 }
15973
15974 ix86_expand_vector_init_concat (mode, target, half, 2);
15975 break;
15976
15977 default:
15978 gcc_unreachable ();
15979 }
15980 }
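/* For example, with N == 4 and V4SFmode the four scalar operands are
   grouped (backward, see the FIXME above) into two V2SFmode halves,
   each half is initialized recursively via ix86_expand_vector_init,
   and the two halves are then glued together with a single VEC_CONCAT
   into the V4SFmode target.  */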
15981
15982 /* A subroutine of ix86_expand_vector_init_general. Use vector
15983 interleave to handle the most general case: all values variable,
15984 and none identical. */
15985
15986 static void
15987 ix86_expand_vector_init_interleave (machine_mode mode,
15988 rtx target, rtx *ops, int n)
15989 {
15990 machine_mode first_imode, second_imode, third_imode, inner_mode;
15991 int i, j;
15992 rtx op, op0, op1;
15993 rtx (*gen_load_even) (rtx, rtx, rtx);
15994 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
15995 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
15996
15997 switch (mode)
15998 {
15999 case E_V8HFmode:
16000 gen_load_even = gen_vec_interleave_lowv8hf;
16001 gen_interleave_first_low = gen_vec_interleave_lowv4si;
16002 gen_interleave_second_low = gen_vec_interleave_lowv2di;
16003 inner_mode = HFmode;
16004 first_imode = V4SImode;
16005 second_imode = V2DImode;
16006 third_imode = VOIDmode;
16007 break;
16008 case E_V8BFmode:
16009 gen_load_even = gen_vec_interleave_lowv8bf;
16010 gen_interleave_first_low = gen_vec_interleave_lowv4si;
16011 gen_interleave_second_low = gen_vec_interleave_lowv2di;
16012 inner_mode = BFmode;
16013 first_imode = V4SImode;
16014 second_imode = V2DImode;
16015 third_imode = VOIDmode;
16016 break;
16017 case E_V8HImode:
16018 gen_load_even = gen_vec_setv8hi;
16019 gen_interleave_first_low = gen_vec_interleave_lowv4si;
16020 gen_interleave_second_low = gen_vec_interleave_lowv2di;
16021 inner_mode = HImode;
16022 first_imode = V4SImode;
16023 second_imode = V2DImode;
16024 third_imode = VOIDmode;
16025 break;
16026 case E_V16QImode:
16027 gen_load_even = gen_vec_setv16qi;
16028 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
16029 gen_interleave_second_low = gen_vec_interleave_lowv4si;
16030 inner_mode = QImode;
16031 first_imode = V8HImode;
16032 second_imode = V4SImode;
16033 third_imode = V2DImode;
16034 break;
16035 default:
16036 gcc_unreachable ();
16037 }
16038
16039 for (i = 0; i < n; i++)
16040 {
16041 op = ops [i + i];
16042 if (inner_mode == HFmode || inner_mode == BFmode)
16043 {
16044 rtx even, odd;
16045 /* Use vpunpcklwd to pack two HFmode or BFmode elements. */
16046 machine_mode vec_mode =
16047 (inner_mode == HFmode) ? V8HFmode : V8BFmode;
16048 op0 = gen_reg_rtx (vec_mode);
16049 even = lowpart_subreg (vec_mode,
16050 force_reg (inner_mode, op), inner_mode);
16051 odd = lowpart_subreg (vec_mode,
16052 force_reg (inner_mode, ops[i + i + 1]),
16053 inner_mode);
16054 emit_insn (gen_load_even (op0, even, odd));
16055 }
16056 else
16057 {
16058 /* Extend the odd element to SImode using a paradoxical SUBREG. */
16059 op0 = gen_reg_rtx (SImode);
16060 emit_move_insn (op0, gen_lowpart (SImode, op));
16061
16062 /* Insert the SImode value as low element of V4SImode vector. */
16063 op1 = gen_reg_rtx (V4SImode);
16064 op0 = gen_rtx_VEC_MERGE (V4SImode,
16065 gen_rtx_VEC_DUPLICATE (V4SImode,
16066 op0),
16067 CONST0_RTX (V4SImode),
16068 const1_rtx);
16069 emit_insn (gen_rtx_SET (op1, op0));
16070
16071 /* Cast the V4SImode vector back to a vector in the original mode. */
16072 op0 = gen_reg_rtx (mode);
16073 emit_move_insn (op0, gen_lowpart (mode, op1));
16074
16075 /* Load even elements into the second position. */
16076 emit_insn (gen_load_even (op0,
16077 force_reg (inner_mode,
16078 ops[i + i + 1]),
16079 const1_rtx));
16080 }
16081
16082 /* Cast vector to FIRST_IMODE vector. */
16083 ops[i] = gen_reg_rtx (first_imode);
16084 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
16085 }
16086
16087 /* Interleave low FIRST_IMODE vectors. */
16088 for (i = j = 0; i < n; i += 2, j++)
16089 {
16090 op0 = gen_reg_rtx (first_imode);
16091 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
16092
16093 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
16094 ops[j] = gen_reg_rtx (second_imode);
16095 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
16096 }
16097
16098 /* Interleave low SECOND_IMODE vectors. */
16099 switch (second_imode)
16100 {
16101 case E_V4SImode:
16102 for (i = j = 0; i < n / 2; i += 2, j++)
16103 {
16104 op0 = gen_reg_rtx (second_imode);
16105 emit_insn (gen_interleave_second_low (op0, ops[i],
16106 ops[i + 1]));
16107
16108 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
16109 vector. */
16110 ops[j] = gen_reg_rtx (third_imode);
16111 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
16112 }
16113 second_imode = V2DImode;
16114 gen_interleave_second_low = gen_vec_interleave_lowv2di;
16115 /* FALLTHRU */
16116
16117 case E_V2DImode:
16118 op0 = gen_reg_rtx (second_imode);
16119 emit_insn (gen_interleave_second_low (op0, ops[0],
16120 ops[1]));
16121
16122 /* Cast the SECOND_IMODE vector back to a vector in the original
16123 mode. */
16124 emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
16125 break;
16126
16127 default:
16128 gcc_unreachable ();
16129 }
16130 }
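/* A rough sketch of the V8HImode case above with N == 4: each pair of
   HImode operands ends up in HImode lanes 0 and 1 of a temporary (the
   first via a paradoxical SImode SUBREG inserted into lane 0, the
   second via the gen_load_even insertion at position 1); the
   temporaries, viewed as V4SImode, are interleaved pairwise, the
   results are viewed as V2DImode, and a final V2DImode interleave
   yields the full vector, which is then viewed in the original mode
   again.  */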
16131
16132 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
16133 all values variable, and none identical. */
16134
16135 static void
16136 ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
16137 rtx target, rtx vals)
16138 {
16139 rtx ops[64], op0, op1, op2, op3, op4, op5;
16140 machine_mode half_mode = VOIDmode;
16141 machine_mode quarter_mode = VOIDmode;
16142 int n, i;
16143
16144 switch (mode)
16145 {
16146 case E_V2SFmode:
16147 case E_V2SImode:
16148 if (!mmx_ok && !TARGET_SSE)
16149 break;
16150 /* FALLTHRU */
16151
16152 case E_V16SImode:
16153 case E_V16SFmode:
16154 case E_V8DFmode:
16155 case E_V8DImode:
16156 case E_V8SFmode:
16157 case E_V8SImode:
16158 case E_V4DFmode:
16159 case E_V4DImode:
16160 case E_V4SFmode:
16161 case E_V4SImode:
16162 case E_V2DFmode:
16163 case E_V2DImode:
16164 n = GET_MODE_NUNITS (mode);
16165 for (i = 0; i < n; i++)
16166 ops[i] = XVECEXP (vals, 0, i);
16167 ix86_expand_vector_init_concat (mode, target, ops, n);
16168 return;
16169
16170 case E_V2TImode:
16171 for (i = 0; i < 2; i++)
16172 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
16173 op0 = gen_reg_rtx (V4DImode);
16174 ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
16175 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
16176 return;
16177
16178 case E_V4TImode:
16179 for (i = 0; i < 4; i++)
16180 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
16181 ops[4] = gen_reg_rtx (V4DImode);
16182 ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
16183 ops[5] = gen_reg_rtx (V4DImode);
16184 ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
16185 op0 = gen_reg_rtx (V8DImode);
16186 ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
16187 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
16188 return;
16189
16190 case E_V32QImode:
16191 half_mode = V16QImode;
16192 goto half;
16193
16194 case E_V16HImode:
16195 half_mode = V8HImode;
16196 goto half;
16197
16198 case E_V16HFmode:
16199 half_mode = V8HFmode;
16200 goto half;
16201
16202 case E_V16BFmode:
16203 half_mode = V8BFmode;
16204 goto half;
16205
16206 half:
16207 n = GET_MODE_NUNITS (mode);
16208 for (i = 0; i < n; i++)
16209 ops[i] = XVECEXP (vals, 0, i);
16210 op0 = gen_reg_rtx (half_mode);
16211 op1 = gen_reg_rtx (half_mode);
16212 ix86_expand_vector_init_interleave (half_mode, op0, ops,
16213 n >> 2);
16214 ix86_expand_vector_init_interleave (half_mode, op1,
16215 &ops [n >> 1], n >> 2);
16216 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
16217 return;
16218
16219 case E_V64QImode:
16220 quarter_mode = V16QImode;
16221 half_mode = V32QImode;
16222 goto quarter;
16223
16224 case E_V32HImode:
16225 quarter_mode = V8HImode;
16226 half_mode = V16HImode;
16227 goto quarter;
16228
16229 case E_V32HFmode:
16230 quarter_mode = V8HFmode;
16231 half_mode = V16HFmode;
16232 goto quarter;
16233
16234 case E_V32BFmode:
16235 quarter_mode = V8BFmode;
16236 half_mode = V16BFmode;
16237 goto quarter;
16238
16239 quarter:
16240 n = GET_MODE_NUNITS (mode);
16241 for (i = 0; i < n; i++)
16242 ops[i] = XVECEXP (vals, 0, i);
16243 op0 = gen_reg_rtx (quarter_mode);
16244 op1 = gen_reg_rtx (quarter_mode);
16245 op2 = gen_reg_rtx (quarter_mode);
16246 op3 = gen_reg_rtx (quarter_mode);
16247 op4 = gen_reg_rtx (half_mode);
16248 op5 = gen_reg_rtx (half_mode);
16249 ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
16250 n >> 3);
16251 ix86_expand_vector_init_interleave (quarter_mode, op1,
16252 &ops [n >> 2], n >> 3);
16253 ix86_expand_vector_init_interleave (quarter_mode, op2,
16254 &ops [n >> 1], n >> 3);
16255 ix86_expand_vector_init_interleave (quarter_mode, op3,
16256 &ops [(n >> 1) | (n >> 2)], n >> 3);
16257 emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
16258 emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
16259 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
16260 return;
16261
16262 case E_V16QImode:
16263 if (!TARGET_SSE4_1)
16264 break;
16265 /* FALLTHRU */
16266
16267 case E_V8HImode:
16268 if (!TARGET_SSE2)
16269 break;
16270
16271 /* Don't use ix86_expand_vector_init_interleave if we can't
16272 move from GPR to SSE register directly. */
16273 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
16274 break;
16275 /* FALLTHRU */
16276
16277 case E_V8HFmode:
16278 case E_V8BFmode:
16279
16280 n = GET_MODE_NUNITS (mode);
16281 for (i = 0; i < n; i++)
16282 ops[i] = XVECEXP (vals, 0, i);
16283 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
16284 return;
16285
16286 case E_V4HImode:
16287 case E_V8QImode:
16288
16289 case E_V2HImode:
16290 case E_V4QImode:
16291 break;
16292
16293 default:
16294 gcc_unreachable ();
16295 }
16296
16297 {
16298 int i, j, n_elts, n_words, n_elt_per_word;
16299 machine_mode tmp_mode, inner_mode;
16300 rtx words[4], shift;
16301
16302 tmp_mode = (GET_MODE_SIZE (mode) < UNITS_PER_WORD) ? SImode : word_mode;
16303
16304 inner_mode = GET_MODE_INNER (mode);
16305 n_elts = GET_MODE_NUNITS (mode);
16306 n_words = GET_MODE_SIZE (mode) / GET_MODE_SIZE (tmp_mode);
16307 n_elt_per_word = n_elts / n_words;
16308 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
16309
16310 for (i = 0; i < n_words; ++i)
16311 {
16312 rtx word = NULL_RTX;
16313
16314 for (j = 0; j < n_elt_per_word; ++j)
16315 {
16316 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
16317 elt = convert_modes (tmp_mode, inner_mode, elt, true);
16318
16319 if (j == 0)
16320 word = elt;
16321 else
16322 {
16323 word = expand_simple_binop (tmp_mode, ASHIFT, word, shift,
16324 NULL_RTX, 1, OPTAB_LIB_WIDEN);
16325 word = expand_simple_binop (tmp_mode, IOR, word, elt,
16326 NULL_RTX, 1, OPTAB_LIB_WIDEN);
16327 }
16328 }
16329
16330 words[i] = word;
16331 }
16332
16333 if (n_words == 1)
16334 emit_move_insn (target, gen_lowpart (mode, words[0]));
16335 else if (n_words == 2)
16336 {
16337 rtx tmp = gen_reg_rtx (mode);
16338 emit_clobber (tmp);
16339 emit_move_insn (gen_lowpart (tmp_mode, tmp), words[0]);
16340 emit_move_insn (gen_highpart (tmp_mode, tmp), words[1]);
16341 emit_move_insn (target, tmp);
16342 }
16343 else if (n_words == 4)
16344 {
16345 rtx tmp = gen_reg_rtx (V4SImode);
16346 gcc_assert (tmp_mode == SImode);
16347 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
16348 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
16349 emit_move_insn (target, gen_lowpart (mode, tmp));
16350 }
16351 else
16352 gcc_unreachable ();
16353 }
16354 }
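/* A rough example of the word-packing fallback above: initializing
   V4HImode { a, b, c, d } on a target with a 32-bit word_mode builds
   the two SImode words (b << 16) | a and (d << 16) | c by shift-and-IOR
   (elements are accumulated from the last one down, leaving element 0
   in the least significant bits) and then moves them into the low and
   high parts of the target via a clobbered temporary.  */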
16355
16356 /* Initialize vector TARGET via VALS. Suppress the use of MMX
16357 instructions unless MMX_OK is true. */
16358
16359 void
16360 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
16361 {
16362 machine_mode mode = GET_MODE (target);
16363 machine_mode inner_mode = GET_MODE_INNER (mode);
16364 int n_elts = GET_MODE_NUNITS (mode);
16365 int n_var = 0, one_var = -1;
16366 bool all_same = true, all_const_zero = true;
16367 int i;
16368 rtx x;
16369
16370 /* Handle first initialization from vector elts. */
16371 if (n_elts != XVECLEN (vals, 0))
16372 {
16373 rtx subtarget = target;
16374 x = XVECEXP (vals, 0, 0);
16375 gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
16376 if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
16377 {
16378 rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
16379 if (inner_mode == QImode
16380 || inner_mode == HImode
16381 || inner_mode == TImode
16382 || inner_mode == HFmode
16383 || inner_mode == BFmode)
16384 {
16385 unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
16386 scalar_mode elt_mode = inner_mode == TImode ? DImode : SImode;
16387 n_bits /= GET_MODE_SIZE (elt_mode);
16388 mode = mode_for_vector (elt_mode, n_bits).require ();
16389 inner_mode = mode_for_vector (elt_mode, n_bits / 2).require ();
16390 ops[0] = gen_lowpart (inner_mode, ops[0]);
16391 ops[1] = gen_lowpart (inner_mode, ops[1]);
16392 subtarget = gen_reg_rtx (mode);
16393 }
16394 ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
16395 if (subtarget != target)
16396 emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
16397 return;
16398 }
16399 gcc_unreachable ();
16400 }
16401
16402 for (i = 0; i < n_elts; ++i)
16403 {
16404 x = XVECEXP (vals, 0, i);
16405 if (!(CONST_SCALAR_INT_P (x)
16406 || CONST_DOUBLE_P (x)
16407 || CONST_FIXED_P (x)))
16408 n_var++, one_var = i;
16409 else if (x != CONST0_RTX (inner_mode))
16410 all_const_zero = false;
16411 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
16412 all_same = false;
16413 }
16414
16415 /* Constants are best loaded from the constant pool. */
16416 if (n_var == 0)
16417 {
16418 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
16419 return;
16420 }
16421
16422 /* If all values are identical, broadcast the value. */
16423 if (all_same
16424 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
16425 XVECEXP (vals, 0, 0)))
16426 return;
16427
16428 /* Values where only one field is non-constant are best loaded from
16429 the pool and overwritten via move later. */
16430 if (n_var == 1)
16431 {
16432 if (all_const_zero
16433 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
16434 XVECEXP (vals, 0, one_var),
16435 one_var))
16436 return;
16437
16438 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
16439 return;
16440 }
16441
16442 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
16443 }
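/* For illustration of the dispatch above: an all-constant initializer
   is loaded straight from the constant pool; { x, x, x, x } goes
   through ix86_expand_vector_init_duplicate; { 0, x, 0, 0 } through
   ix86_expand_vector_init_one_nonzero; { 1, x, 3, 4 } through
   ix86_expand_vector_init_one_var; and anything with several variable
   elements falls through to ix86_expand_vector_init_general.  */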
16444
16445 /* Implemented as
16446 V setg (V v, int idx, T val)
16447 {
16448 V idxv = (V){idx, idx, idx, idx, idx, idx, idx, idx};
16449 V valv = (V){val, val, val, val, val, val, val, val};
16450 V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == idxv);
16451 v = (v & ~mask) | (valv & mask);
16452 return v;
16453 }. */
16454 void
16455 ix86_expand_vector_set_var (rtx target, rtx val, rtx idx)
16456 {
16457 rtx vec[64];
16458 machine_mode mode = GET_MODE (target);
16459 machine_mode cmp_mode = mode;
16460 int n_elts = GET_MODE_NUNITS (mode);
16461 rtx valv,idxv,constv,idx_tmp;
16462 bool ok = false;
16463
16464 /* 512-bit vector byte/word broadcast and comparison are only available
16465 under TARGET_AVX512BW; without it, break the 512-bit vector into
16466 two 256-bit halves. */
16467 if ((mode == V32HImode || mode == V32HFmode || mode == V32BFmode
16468 || mode == V64QImode)
16469 && !TARGET_AVX512BW)
16470 {
16471 gcc_assert (TARGET_AVX512F);
16472 rtx vhi, vlo, idx_hi;
16473 machine_mode half_mode;
16474 rtx (*extract_hi)(rtx, rtx);
16475 rtx (*extract_lo)(rtx, rtx);
16476
16477 if (mode == V32HImode)
16478 {
16479 half_mode = V16HImode;
16480 extract_hi = gen_vec_extract_hi_v32hi;
16481 extract_lo = gen_vec_extract_lo_v32hi;
16482 }
16483 else if (mode == V32HFmode)
16484 {
16485 half_mode = V16HFmode;
16486 extract_hi = gen_vec_extract_hi_v32hf;
16487 extract_lo = gen_vec_extract_lo_v32hf;
16488 }
16489 else if (mode == V32BFmode)
16490 {
16491 half_mode = V16BFmode;
16492 extract_hi = gen_vec_extract_hi_v32bf;
16493 extract_lo = gen_vec_extract_lo_v32bf;
16494 }
16495 else
16496 {
16497 half_mode = V32QImode;
16498 extract_hi = gen_vec_extract_hi_v64qi;
16499 extract_lo = gen_vec_extract_lo_v64qi;
16500 }
16501
16502 vhi = gen_reg_rtx (half_mode);
16503 vlo = gen_reg_rtx (half_mode);
16504 idx_hi = gen_reg_rtx (GET_MODE (idx));
16505 emit_insn (extract_hi (vhi, target));
16506 emit_insn (extract_lo (vlo, target));
16507 vec[0] = idx_hi;
16508 vec[1] = idx;
16509 vec[2] = GEN_INT (n_elts/2);
16510 ix86_expand_binary_operator (MINUS, GET_MODE (idx), vec);
16511 ix86_expand_vector_set_var (vhi, val, idx_hi);
16512 ix86_expand_vector_set_var (vlo, val, idx);
16513 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, vlo, vhi)));
16514 return;
16515 }
16516
16517 if (FLOAT_MODE_P (GET_MODE_INNER (mode)))
16518 {
16519 switch (mode)
16520 {
16521 case E_V2DFmode:
16522 cmp_mode = V2DImode;
16523 break;
16524 case E_V4DFmode:
16525 cmp_mode = V4DImode;
16526 break;
16527 case E_V8DFmode:
16528 cmp_mode = V8DImode;
16529 break;
16530 case E_V2SFmode:
16531 cmp_mode = V2SImode;
16532 break;
16533 case E_V4SFmode:
16534 cmp_mode = V4SImode;
16535 break;
16536 case E_V8SFmode:
16537 cmp_mode = V8SImode;
16538 break;
16539 case E_V16SFmode:
16540 cmp_mode = V16SImode;
16541 break;
16542 case E_V8HFmode:
16543 cmp_mode = V8HImode;
16544 break;
16545 case E_V16HFmode:
16546 cmp_mode = V16HImode;
16547 break;
16548 case E_V32HFmode:
16549 cmp_mode = V32HImode;
16550 break;
16551 case E_V8BFmode:
16552 cmp_mode = V8HImode;
16553 break;
16554 case E_V16BFmode:
16555 cmp_mode = V16HImode;
16556 break;
16557 case E_V32BFmode:
16558 cmp_mode = V32HImode;
16559 break;
16560 default:
16561 gcc_unreachable ();
16562 }
16563 }
16564
16565 for (int i = 0; i != n_elts; i++)
16566 vec[i] = GEN_INT (i);
16567 constv = gen_rtx_CONST_VECTOR (cmp_mode, gen_rtvec_v (n_elts, vec));
16568 valv = gen_reg_rtx (mode);
16569 idxv = gen_reg_rtx (cmp_mode);
16570 idx_tmp = convert_to_mode (GET_MODE_INNER (cmp_mode), idx, 1);
16571
16572 ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE,
16573 mode, valv, val);
16574 gcc_assert (ok);
16575 ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE,
16576 cmp_mode, idxv, idx_tmp);
16577 gcc_assert (ok);
16578 vec[0] = target;
16579 vec[1] = valv;
16580 vec[2] = target;
16581 vec[3] = gen_rtx_EQ (mode, idxv, constv);
16582 vec[4] = idxv;
16583 vec[5] = constv;
16584 ok = ix86_expand_int_vcond (vec);
16585 gcc_assert (ok);
16586 }
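/* A note on the AVX512BW-less split above: both halves are processed
   with the same VAL; the low half keeps the original IDX and the high
   half uses IDX - N_ELTS/2, so in whichever half the (biased) index is
   out of range the equality compare selects no lane and that half is
   left unchanged before the two halves are concatenated back into the
   target.  */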
16587
16588 void
16589 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
16590 {
16591 machine_mode mode = GET_MODE (target);
16592 machine_mode inner_mode = GET_MODE_INNER (mode);
16593 machine_mode half_mode;
16594 bool use_vec_merge = false;
16595 bool blendm_const = false;
16596 rtx tmp;
16597 static rtx (*gen_extract[8][2]) (rtx, rtx)
16598 = {
16599 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
16600 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
16601 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
16602 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
16603 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
16604 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df },
16605 { gen_vec_extract_lo_v16hf, gen_vec_extract_hi_v16hf },
16606 { gen_vec_extract_lo_v16bf, gen_vec_extract_hi_v16bf }
16607 };
16608 static rtx (*gen_insert[8][2]) (rtx, rtx, rtx)
16609 = {
16610 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
16611 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
16612 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
16613 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
16614 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
16615 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df },
16616 { gen_vec_set_lo_v16hf, gen_vec_set_hi_v16hf },
16617 { gen_vec_set_lo_v16bf, gen_vec_set_hi_v16bf },
16618 };
16619 int i, j, n;
16620 machine_mode mmode = VOIDmode;
16621 rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
16622
16623 switch (mode)
16624 {
16625 case E_V2SImode:
16626 use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
16627 if (use_vec_merge)
16628 break;
16629 /* FALLTHRU */
16630
16631 case E_V2SFmode:
16632 if (mmx_ok)
16633 {
16634 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
16635 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
16636 if (elt == 0)
16637 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
16638 else
16639 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
16640 emit_insn (gen_rtx_SET (target, tmp));
16641 return;
16642 }
16643 break;
16644
16645 case E_V2DImode:
16646 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
16647 if (use_vec_merge)
16648 break;
16649
16650 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
16651 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
16652 if (elt == 0)
16653 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
16654 else
16655 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
16656 emit_insn (gen_rtx_SET (target, tmp));
16657 return;
16658
16659 case E_V2DFmode:
16660 /* NB: For ELT == 0, use standard scalar operation patterns which
16661 preserve the rest of the vector for combiner:
16662
16663 (vec_merge:V2DF
16664 (vec_duplicate:V2DF (reg:DF))
16665 (reg:V2DF)
16666 (const_int 1))
16667 */
16668 if (elt == 0)
16669 goto do_vec_merge;
16670
16671 {
16672 rtx op0, op1;
16673
16674 /* For the two element vectors, we implement a VEC_CONCAT with
16675 the extraction of the other element. */
16676
16677 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
16678 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
16679
16680 if (elt == 0)
16681 op0 = val, op1 = tmp;
16682 else
16683 op0 = tmp, op1 = val;
16684
16685 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
16686 emit_insn (gen_rtx_SET (target, tmp));
16687 }
16688 return;
16689
16690 case E_V4SFmode:
16691 use_vec_merge = TARGET_SSE4_1;
16692 if (use_vec_merge)
16693 break;
16694
16695 switch (elt)
16696 {
16697 case 0:
16698 use_vec_merge = true;
16699 break;
16700
16701 case 1:
16702 /* tmp = target = A B C D */
16703 tmp = copy_to_reg (target);
16704 /* target = A A B B */
16705 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
16706 /* target = X A B B */
16707 ix86_expand_vector_set (false, target, val, 0);
16708 /* target = A X C D */
16709 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
16710 const1_rtx, const0_rtx,
16711 GEN_INT (2+4), GEN_INT (3+4)));
16712 return;
16713
16714 case 2:
16715 /* tmp = target = A B C D */
16716 tmp = copy_to_reg (target);
16717 /* tmp = X B C D */
16718 ix86_expand_vector_set (false, tmp, val, 0);
16719 /* target = A B X D */
16720 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
16721 const0_rtx, const1_rtx,
16722 GEN_INT (0+4), GEN_INT (3+4)));
16723 return;
16724
16725 case 3:
16726 /* tmp = target = A B C D */
16727 tmp = copy_to_reg (target);
16728 /* tmp = X B C D */
16729 ix86_expand_vector_set (false, tmp, val, 0);
16730 /* target = A B C X */
16731 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
16732 const0_rtx, const1_rtx,
16733 GEN_INT (2+4), GEN_INT (0+4)));
16734 return;
16735
16736 default:
16737 gcc_unreachable ();
16738 }
16739 break;
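/* A note on the shufps selectors used above: in the sse_shufps pattern
   the four immediates index the concatenation of the two vector inputs,
   so values 0-3 pick lanes of the first input and 4-7 (written as
   "N+4" here) pick lanes of the second.  */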
16740
16741 case E_V4SImode:
16742 use_vec_merge = TARGET_SSE4_1;
16743 if (use_vec_merge)
16744 break;
16745
16746 /* Element 0 handled by vec_merge below. */
16747 if (elt == 0)
16748 {
16749 use_vec_merge = true;
16750 break;
16751 }
16752
16753 if (TARGET_SSE2)
16754 {
16755 /* With SSE2, use integer shuffles to swap element 0 and ELT,
16756 store into element 0, then shuffle them back. */
16757
16758 rtx order[4];
16759
16760 order[0] = GEN_INT (elt);
16761 order[1] = const1_rtx;
16762 order[2] = const2_rtx;
16763 order[3] = GEN_INT (3);
16764 order[elt] = const0_rtx;
16765
16766 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
16767 order[1], order[2], order[3]));
16768
16769 ix86_expand_vector_set (false, target, val, 0);
16770
16771 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
16772 order[1], order[2], order[3]));
16773 }
16774 else
16775 {
16776 /* For SSE1, we have to reuse the V4SF code. */
16777 rtx t = gen_reg_rtx (V4SFmode);
16778 emit_move_insn (t, gen_lowpart (V4SFmode, target));
16779 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
16780 emit_move_insn (target, gen_lowpart (mode, t));
16781 }
16782 return;
16783
16784 case E_V8HImode:
16785 case E_V8HFmode:
16786 case E_V8BFmode:
16787 case E_V2HImode:
16788 use_vec_merge = TARGET_SSE2;
16789 break;
16790 case E_V4HImode:
16791 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
16792 break;
16793
16794 case E_V16QImode:
16795 case E_V4QImode:
16796 use_vec_merge = TARGET_SSE4_1;
16797 break;
16798
16799 case E_V8QImode:
16800 use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
16801 break;
16802
16803 case E_V32QImode:
16804 half_mode = V16QImode;
16805 j = 0;
16806 n = 16;
16807 goto half;
16808
16809 case E_V16HFmode:
16810 case E_V16BFmode:
16811 /* For ELT == 0, vec_setv8hf_0 can save 1 vpbroadcastw. */
16812 if (TARGET_AVX2 && elt != 0)
16813 {
16814 mmode = SImode;
16815 gen_blendm = ((mode == E_V16HFmode) ? gen_avx2_pblendph_1
16816 : gen_avx2_pblendbf_1);
16817 blendm_const = true;
16818 break;
16819 }
16820 else
16821 {
16822 half_mode = ((mode == E_V16HFmode) ? V8HFmode : V8BFmode);
16823 j = ((mode == E_V16HFmode) ? 6 : 7);
16824 n = 8;
16825 goto half;
16826 }
16827
16828 case E_V16HImode:
16829 half_mode = V8HImode;
16830 j = 1;
16831 n = 8;
16832 goto half;
16833
16834 case E_V8SImode:
16835 half_mode = V4SImode;
16836 j = 2;
16837 n = 4;
16838 goto half;
16839
16840 case E_V4DImode:
16841 half_mode = V2DImode;
16842 j = 3;
16843 n = 2;
16844 goto half;
16845
16846 case E_V8SFmode:
16847 half_mode = V4SFmode;
16848 j = 4;
16849 n = 4;
16850 goto half;
16851
16852 case E_V4DFmode:
16853 half_mode = V2DFmode;
16854 j = 5;
16855 n = 2;
16856 goto half;
16857
16858 half:
16859 /* Compute offset. */
16860 i = elt / n;
16861 elt %= n;
16862
16863 gcc_assert (i <= 1);
16864
16865 /* Extract the half. */
16866 tmp = gen_reg_rtx (half_mode);
16867 emit_insn (gen_extract[j][i] (tmp, target));
16868
16869 /* Put val in tmp at elt. */
16870 ix86_expand_vector_set (false, tmp, val, elt);
16871
16872 /* Put it back. */
16873 emit_insn (gen_insert[j][i] (target, target, tmp));
16874 return;
16875
16876 case E_V8DFmode:
16877 if (TARGET_AVX512F)
16878 {
16879 mmode = QImode;
16880 gen_blendm = gen_avx512f_blendmv8df;
16881 }
16882 break;
16883
16884 case E_V8DImode:
16885 if (TARGET_AVX512F)
16886 {
16887 mmode = QImode;
16888 gen_blendm = gen_avx512f_blendmv8di;
16889 }
16890 break;
16891
16892 case E_V16SFmode:
16893 if (TARGET_AVX512F)
16894 {
16895 mmode = HImode;
16896 gen_blendm = gen_avx512f_blendmv16sf;
16897 }
16898 break;
16899
16900 case E_V16SImode:
16901 if (TARGET_AVX512F)
16902 {
16903 mmode = HImode;
16904 gen_blendm = gen_avx512f_blendmv16si;
16905 }
16906 break;
16907
16908 case E_V32HFmode:
16909 if (TARGET_AVX512BW)
16910 {
16911 mmode = SImode;
16912 gen_blendm = gen_avx512bw_blendmv32hf;
16913 }
16914 break;
16915 case E_V32BFmode:
16916 if (TARGET_AVX512BW)
16917 {
16918 mmode = SImode;
16919 gen_blendm = gen_avx512bw_blendmv32bf;
16920 }
16921 break;
16922 case E_V32HImode:
16923 if (TARGET_AVX512BW)
16924 {
16925 mmode = SImode;
16926 gen_blendm = gen_avx512bw_blendmv32hi;
16927 }
16928 else if (TARGET_AVX512F)
16929 {
16930 half_mode = E_V8HImode;
16931 n = 8;
16932 goto quarter;
16933 }
16934 break;
16935
16936 case E_V64QImode:
16937 if (TARGET_AVX512BW)
16938 {
16939 mmode = DImode;
16940 gen_blendm = gen_avx512bw_blendmv64qi;
16941 }
16942 else if (TARGET_AVX512F)
16943 {
16944 half_mode = E_V16QImode;
16945 n = 16;
16946 goto quarter;
16947 }
16948 break;
16949
16950 quarter:
16951 /* Compute offset. */
16952 i = elt / n;
16953 elt %= n;
16954
16955 gcc_assert (i <= 3);
16956
16957 {
16958 /* Extract the quarter. */
16959 tmp = gen_reg_rtx (V4SImode);
16960 rtx tmp2 = gen_lowpart (V16SImode, target);
16961 rtx mask = gen_reg_rtx (QImode);
16962
16963 emit_move_insn (mask, constm1_rtx);
16964 emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i),
16965 tmp, mask));
16966
16967 tmp2 = gen_reg_rtx (half_mode);
16968 emit_move_insn (tmp2, gen_lowpart (half_mode, tmp));
16969 tmp = tmp2;
16970
16971 /* Put val in tmp at elt. */
16972 ix86_expand_vector_set (false, tmp, val, elt);
16973
16974 /* Put it back. */
16975 tmp2 = gen_reg_rtx (V16SImode);
16976 rtx tmp3 = gen_lowpart (V16SImode, target);
16977 mask = gen_reg_rtx (HImode);
16978 emit_move_insn (mask, constm1_rtx);
16979 tmp = gen_lowpart (V4SImode, tmp);
16980 emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i),
16981 tmp3, mask));
16982 emit_move_insn (target, gen_lowpart (mode, tmp2));
16983 }
16984 return;
16985
16986 default:
16987 break;
16988 }
16989
16990 if (mmode != VOIDmode)
16991 {
16992 tmp = gen_reg_rtx (mode);
16993 emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
16994 rtx merge_mask = gen_int_mode (HOST_WIDE_INT_1U << elt, mmode);
16995 /* The avx512*_blendm<mode> expanders have different operand order
16996 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
16997 elements where the mask is set and second input operand otherwise,
16998 in {sse,avx}*_*blend* the first input operand is used for elements
16999 where the mask is clear and second input operand otherwise. */
17000 if (!blendm_const)
17001 merge_mask = force_reg (mmode, merge_mask);
17002 emit_insn (gen_blendm (target, target, tmp, merge_mask));
17003 }
17004 else if (use_vec_merge)
17005 {
17006 do_vec_merge:
17007 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
17008 tmp = gen_rtx_VEC_MERGE (mode, tmp, target,
17009 GEN_INT (HOST_WIDE_INT_1U << elt));
17010 emit_insn (gen_rtx_SET (target, tmp));
17011 }
17012 else
17013 {
17014 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
17015
17016 emit_move_insn (mem, target);
17017
17018 tmp = adjust_address (mem, inner_mode, elt * GET_MODE_SIZE (inner_mode));
17019 emit_move_insn (tmp, val);
17020
17021 emit_move_insn (target, mem);
17022 }
17023 }
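
/* Editorial sketch (not part of the GCC sources): a plain C model of the
   SSE2 path above for V4SImode, which swaps lane 0 with lane ELT via
   pshufd, stores the scalar into lane 0, then repeats the same shuffle to
   restore the order.  The helper name is hypothetical and only illustrates
   the emitted sequence.  */

static void
sketch_set_v4si_sse2 (int v[4], int val, int elt)
{
  int t = v[0];
  v[0] = v[elt];		/* first pshufd: swap lanes 0 and ELT */
  v[elt] = t;
  v[0] = val;			/* scalar store into lane 0 */
  t = v[0];
  v[0] = v[elt];		/* the same pshufd restores the original order */
  v[elt] = t;
}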
17024
17025 void
17026 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
17027 {
17028 machine_mode mode = GET_MODE (vec);
17029 machine_mode inner_mode = GET_MODE_INNER (mode);
17030 bool use_vec_extr = false;
17031 rtx tmp;
17032
17033 switch (mode)
17034 {
17035 case E_V2SImode:
17036 use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
17037 if (use_vec_extr)
17038 break;
17039 /* FALLTHRU */
17040
17041 case E_V2SFmode:
17042 if (!mmx_ok)
17043 break;
17044 /* FALLTHRU */
17045
17046 case E_V2DFmode:
17047 case E_V2DImode:
17048 case E_V2TImode:
17049 case E_V4TImode:
17050 use_vec_extr = true;
17051 break;
17052
17053 case E_V4SFmode:
17054 use_vec_extr = TARGET_SSE4_1;
17055 if (use_vec_extr)
17056 break;
17057
17058 switch (elt)
17059 {
17060 case 0:
17061 tmp = vec;
17062 break;
17063
17064 case 1:
17065 case 3:
17066 tmp = gen_reg_rtx (mode);
17067 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
17068 GEN_INT (elt), GEN_INT (elt),
17069 GEN_INT (elt+4), GEN_INT (elt+4)));
17070 break;
17071
17072 case 2:
17073 tmp = gen_reg_rtx (mode);
17074 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
17075 break;
17076
17077 default:
17078 gcc_unreachable ();
17079 }
17080 vec = tmp;
17081 use_vec_extr = true;
17082 elt = 0;
17083 break;
17084
17085 case E_V4SImode:
17086 use_vec_extr = TARGET_SSE4_1;
17087 if (use_vec_extr)
17088 break;
17089
17090 if (TARGET_SSE2)
17091 {
17092 switch (elt)
17093 {
17094 case 0:
17095 tmp = vec;
17096 break;
17097
17098 case 1:
17099 case 3:
17100 tmp = gen_reg_rtx (mode);
17101 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
17102 GEN_INT (elt), GEN_INT (elt),
17103 GEN_INT (elt), GEN_INT (elt)));
17104 break;
17105
17106 case 2:
17107 tmp = gen_reg_rtx (mode);
17108 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
17109 break;
17110
17111 default:
17112 gcc_unreachable ();
17113 }
17114 vec = tmp;
17115 use_vec_extr = true;
17116 elt = 0;
17117 }
17118 else
17119 {
17120 /* For SSE1, we have to reuse the V4SF code. */
17121 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
17122 gen_lowpart (V4SFmode, vec), elt);
17123 return;
17124 }
17125 break;
17126
17127 case E_V8HImode:
17128 case E_V8HFmode:
17129 case E_V8BFmode:
17130 case E_V2HImode:
17131 use_vec_extr = TARGET_SSE2;
17132 break;
17133 case E_V4HImode:
17134 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
17135 break;
17136
17137 case E_V16QImode:
17138 use_vec_extr = TARGET_SSE4_1;
17139 if (!use_vec_extr
17140 && TARGET_SSE2
17141 && elt == 0
17142 && (optimize_insn_for_size_p () || TARGET_INTER_UNIT_MOVES_FROM_VEC))
17143 {
17144 tmp = gen_reg_rtx (SImode);
17145 ix86_expand_vector_extract (false, tmp, gen_lowpart (V4SImode, vec),
17146 0);
17147 emit_insn (gen_rtx_SET (target, gen_lowpart (QImode, tmp)));
17148 return;
17149 }
17150 break;
17151 case E_V4QImode:
17152 use_vec_extr = TARGET_SSE4_1;
17153 break;
17154
17155 case E_V8SFmode:
17156 if (TARGET_AVX)
17157 {
17158 tmp = gen_reg_rtx (V4SFmode);
17159 if (elt < 4)
17160 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
17161 else
17162 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
17163 ix86_expand_vector_extract (false, target, tmp, elt & 3);
17164 return;
17165 }
17166 break;
17167
17168 case E_V4DFmode:
17169 if (TARGET_AVX)
17170 {
17171 tmp = gen_reg_rtx (V2DFmode);
17172 if (elt < 2)
17173 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
17174 else
17175 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
17176 ix86_expand_vector_extract (false, target, tmp, elt & 1);
17177 return;
17178 }
17179 break;
17180
17181 case E_V32QImode:
17182 if (TARGET_AVX)
17183 {
17184 tmp = gen_reg_rtx (V16QImode);
17185 if (elt < 16)
17186 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
17187 else
17188 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
17189 ix86_expand_vector_extract (false, target, tmp, elt & 15);
17190 return;
17191 }
17192 break;
17193
17194 case E_V16HImode:
17195 if (TARGET_AVX)
17196 {
17197 tmp = gen_reg_rtx (V8HImode);
17198 if (elt < 8)
17199 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
17200 else
17201 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
17202 ix86_expand_vector_extract (false, target, tmp, elt & 7);
17203 return;
17204 }
17205 break;
17206
17207 case E_V8SImode:
17208 if (TARGET_AVX)
17209 {
17210 tmp = gen_reg_rtx (V4SImode);
17211 if (elt < 4)
17212 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
17213 else
17214 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
17215 ix86_expand_vector_extract (false, target, tmp, elt & 3);
17216 return;
17217 }
17218 break;
17219
17220 case E_V4DImode:
17221 if (TARGET_AVX)
17222 {
17223 tmp = gen_reg_rtx (V2DImode);
17224 if (elt < 2)
17225 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
17226 else
17227 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
17228 ix86_expand_vector_extract (false, target, tmp, elt & 1);
17229 return;
17230 }
17231 break;
17232
17233 case E_V32HImode:
17234 if (TARGET_AVX512BW)
17235 {
17236 tmp = gen_reg_rtx (V16HImode);
17237 if (elt < 16)
17238 emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
17239 else
17240 emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
17241 ix86_expand_vector_extract (false, target, tmp, elt & 15);
17242 return;
17243 }
17244 break;
17245
17246 case E_V64QImode:
17247 if (TARGET_AVX512BW)
17248 {
17249 tmp = gen_reg_rtx (V32QImode);
17250 if (elt < 32)
17251 emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
17252 else
17253 emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
17254 ix86_expand_vector_extract (false, target, tmp, elt & 31);
17255 return;
17256 }
17257 break;
17258
17259 case E_V16SFmode:
17260 tmp = gen_reg_rtx (V8SFmode);
17261 if (elt < 8)
17262 emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
17263 else
17264 emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
17265 ix86_expand_vector_extract (false, target, tmp, elt & 7);
17266 return;
17267
17268 case E_V8DFmode:
17269 tmp = gen_reg_rtx (V4DFmode);
17270 if (elt < 4)
17271 emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
17272 else
17273 emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
17274 ix86_expand_vector_extract (false, target, tmp, elt & 3);
17275 return;
17276
17277 case E_V16SImode:
17278 tmp = gen_reg_rtx (V8SImode);
17279 if (elt < 8)
17280 emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
17281 else
17282 emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
17283 ix86_expand_vector_extract (false, target, tmp, elt & 7);
17284 return;
17285
17286 case E_V8DImode:
17287 tmp = gen_reg_rtx (V4DImode);
17288 if (elt < 4)
17289 emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
17290 else
17291 emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
17292 ix86_expand_vector_extract (false, target, tmp, elt & 3);
17293 return;
17294
17295 case E_V32HFmode:
17296 case E_V32BFmode:
17297 if (TARGET_AVX512BW)
17298 {
17299 tmp = (mode == E_V32HFmode
17300 ? gen_reg_rtx (V16HFmode)
17301 : gen_reg_rtx (V16BFmode));
17302 if (elt < 16)
17303 emit_insn (maybe_gen_vec_extract_lo (mode, tmp, vec));
17304 else
17305 emit_insn (maybe_gen_vec_extract_hi (mode, tmp, vec));
17306 ix86_expand_vector_extract (false, target, tmp, elt & 15);
17307 return;
17308 }
17309 break;
17310
17311 case E_V16HFmode:
17312 case E_V16BFmode:
17313 if (TARGET_AVX)
17314 {
17315 tmp = (mode == E_V16HFmode
17316 ? gen_reg_rtx (V8HFmode)
17317 : gen_reg_rtx (V8BFmode));
17318 if (elt < 8)
17319 emit_insn (maybe_gen_vec_extract_lo (mode, tmp, vec));
17320 else
17321 emit_insn (maybe_gen_vec_extract_hi (mode, tmp, vec));
17322 ix86_expand_vector_extract (false, target, tmp, elt & 7);
17323 return;
17324 }
17325 break;
17326
17327 case E_V8QImode:
17328 use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
17329 /* ??? Could extract the appropriate HImode element and shift. */
17330 break;
17331
17332 default:
17333 break;
17334 }
17335
17336 if (use_vec_extr)
17337 {
17338 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
17339 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
17340
17341 /* Let the rtl optimizers know about the zero extension performed. */
17342 if (inner_mode == QImode || inner_mode == HImode)
17343 {
17344 rtx reg = gen_reg_rtx (SImode);
17345 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
17346 emit_move_insn (reg, tmp);
17347 tmp = gen_lowpart (inner_mode, reg);
17348 SUBREG_PROMOTED_VAR_P (tmp) = 1;
17349 SUBREG_PROMOTED_SET (tmp, 1);
17350 }
17351
17352 emit_move_insn (target, tmp);
17353 }
17354 else
17355 {
17356 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
17357
17358 emit_move_insn (mem, vec);
17359
17360 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
17361 emit_move_insn (target, tmp);
17362 }
17363 }
17364
17365 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
17366 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
17367 The upper bits of DEST are undefined, though they shouldn't cause
17368 exceptions (some bits from src or all zeros are ok). */
17369
17370 static void
17371 emit_reduc_half (rtx dest, rtx src, int i)
17372 {
17373 rtx tem, d = dest;
17374 switch (GET_MODE (src))
17375 {
17376 case E_V4SFmode:
17377 if (i == 128)
17378 tem = gen_sse_movhlps (dest, src, src);
17379 else
17380 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
17381 GEN_INT (1 + 4), GEN_INT (1 + 4));
17382 break;
17383 case E_V2DFmode:
17384 tem = gen_vec_interleave_highv2df (dest, src, src);
17385 break;
17386 case E_V4QImode:
17387 d = gen_reg_rtx (V1SImode);
17388 tem = gen_mmx_lshrv1si3 (d, gen_lowpart (V1SImode, src),
17389 GEN_INT (i / 2));
17390 break;
17391 case E_V4HImode:
17392 d = gen_reg_rtx (V1DImode);
17393 tem = gen_mmx_lshrv1di3 (d, gen_lowpart (V1DImode, src),
17394 GEN_INT (i / 2));
17395 break;
17396 case E_V16QImode:
17397 case E_V8HImode:
17398 case E_V8HFmode:
17399 case E_V4SImode:
17400 case E_V2DImode:
17401 d = gen_reg_rtx (V1TImode);
17402 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
17403 GEN_INT (i / 2));
17404 break;
17405 case E_V8SFmode:
17406 if (i == 256)
17407 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
17408 else
17409 tem = gen_avx_shufps256 (dest, src, src,
17410 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
17411 break;
17412 case E_V4DFmode:
17413 if (i == 256)
17414 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
17415 else
17416 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
17417 break;
17418 case E_V32QImode:
17419 case E_V16HImode:
17420 case E_V16HFmode:
17421 case E_V8SImode:
17422 case E_V4DImode:
17423 if (i == 256)
17424 {
17425 if (GET_MODE (dest) != V4DImode)
17426 d = gen_reg_rtx (V4DImode);
17427 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
17428 gen_lowpart (V4DImode, src),
17429 const1_rtx);
17430 }
17431 else
17432 {
17433 d = gen_reg_rtx (V2TImode);
17434 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
17435 GEN_INT (i / 2));
17436 }
17437 break;
17438 case E_V64QImode:
17439 case E_V32HImode:
17440 case E_V32HFmode:
17441 if (i < 64)
17442 {
17443 d = gen_reg_rtx (V4TImode);
17444 tem = gen_avx512bw_lshrv4ti3 (d, gen_lowpart (V4TImode, src),
17445 GEN_INT (i / 2));
17446 break;
17447 }
17448 /* FALLTHRU */
17449 case E_V16SImode:
17450 case E_V16SFmode:
17451 case E_V8DImode:
17452 case E_V8DFmode:
17453 if (i > 128)
17454 tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
17455 gen_lowpart (V16SImode, src),
17456 gen_lowpart (V16SImode, src),
17457 GEN_INT (0x4 + (i == 512 ? 4 : 0)),
17458 GEN_INT (0x5 + (i == 512 ? 4 : 0)),
17459 GEN_INT (0x6 + (i == 512 ? 4 : 0)),
17460 GEN_INT (0x7 + (i == 512 ? 4 : 0)),
17461 GEN_INT (0xC), GEN_INT (0xD),
17462 GEN_INT (0xE), GEN_INT (0xF),
17463 GEN_INT (0x10), GEN_INT (0x11),
17464 GEN_INT (0x12), GEN_INT (0x13),
17465 GEN_INT (0x14), GEN_INT (0x15),
17466 GEN_INT (0x16), GEN_INT (0x17));
17467 else
17468 tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
17469 gen_lowpart (V16SImode, src),
17470 GEN_INT (i == 128 ? 0x2 : 0x1),
17471 GEN_INT (0x3),
17472 GEN_INT (0x3),
17473 GEN_INT (0x3),
17474 GEN_INT (i == 128 ? 0x6 : 0x5),
17475 GEN_INT (0x7),
17476 GEN_INT (0x7),
17477 GEN_INT (0x7),
17478 GEN_INT (i == 128 ? 0xA : 0x9),
17479 GEN_INT (0xB),
17480 GEN_INT (0xB),
17481 GEN_INT (0xB),
17482 GEN_INT (i == 128 ? 0xE : 0xD),
17483 GEN_INT (0xF),
17484 GEN_INT (0xF),
17485 GEN_INT (0xF));
17486 break;
17487 default:
17488 gcc_unreachable ();
17489 }
17490 emit_insn (tem);
17491 if (d != dest)
17492 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
17493 }
17494
17495 /* Expand a vector reduction. FN is the binary pattern to reduce;
17496 DEST is the destination; IN is the input vector. */
17497
17498 void
17499 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
17500 {
17501 rtx half, dst, vec = in;
17502 machine_mode mode = GET_MODE (in);
17503 int i;
17504
17505 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
17506 if (TARGET_SSE4_1
17507 && mode == V8HImode
17508 && fn == gen_uminv8hi3)
17509 {
17510 emit_insn (gen_sse4_1_phminposuw (dest, in));
17511 return;
17512 }
17513
17514 for (i = GET_MODE_BITSIZE (mode);
17515 i > GET_MODE_UNIT_BITSIZE (mode);
17516 i >>= 1)
17517 {
17518 half = gen_reg_rtx (mode);
17519 emit_reduc_half (half, vec, i);
17520 if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
17521 dst = dest;
17522 else
17523 dst = gen_reg_rtx (mode);
17524 emit_insn (fn (dst, half, vec));
17525 vec = dst;
17526 }
17527 }
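
/* Editorial sketch (not part of the GCC sources): a scalar model of the
   halving reduction driven by emit_reduc_half above, shown for a
   four-element vector and an arbitrary binary operation OP.  Each step
   folds the upper half of the remaining elements onto the lower half;
   after log2(n) steps the result sits in lane 0.  The helper name is
   hypothetical.  */

static int
sketch_reduc_v4si (const int v[4], int (*op) (int, int))
{
  int a = op (v[2], v[0]);	/* fold upper half onto lower half */
  int b = op (v[3], v[1]);
  return op (b, a);		/* fold the remaining pair; this is lane 0 */
}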
17528
17529 /* Output code to perform a conditional jump to LABEL if the C2 flag in the
17530 FP status register is set. */
17531
17532 void
17533 ix86_emit_fp_unordered_jump (rtx label)
17534 {
17535 rtx reg = gen_reg_rtx (HImode);
17536 rtx_insn *insn;
17537 rtx temp;
17538
17539 emit_insn (gen_x86_fnstsw_1 (reg));
17540
17541 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
17542 {
17543 emit_insn (gen_x86_sahf_1 (reg));
17544
17545 temp = gen_rtx_REG (CCmode, FLAGS_REG);
17546 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
17547 }
17548 else
17549 {
17550 emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
17551
17552 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
17553 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
17554 }
17555
17556 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
17557 gen_rtx_LABEL_REF (VOIDmode, label),
17558 pc_rtx);
17559 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
17560 predict_jump (REG_BR_PROB_BASE * 10 / 100);
17561 JUMP_LABEL (insn) = label;
17562 }
17563
17564 /* Output code to perform a sinh XFmode calculation. */
17565
17566 void
17567 ix86_emit_i387_sinh (rtx op0, rtx op1)
17568 {
17569 rtx e1 = gen_reg_rtx (XFmode);
17570 rtx e2 = gen_reg_rtx (XFmode);
17571 rtx scratch = gen_reg_rtx (HImode);
17572 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
17573 rtx half = const_double_from_real_value (dconsthalf, XFmode);
17574 rtx cst1, tmp;
17575 rtx_code_label *jump_label = gen_label_rtx ();
17576 rtx_insn *insn;
17577
17578 /* scratch = fxam (op1) */
17579 emit_insn (gen_fxamxf2_i387 (scratch, op1));
17580
17581 /* e1 = expm1 (|op1|) */
17582 emit_insn (gen_absxf2 (e2, op1));
17583 emit_insn (gen_expm1xf2 (e1, e2));
17584
17585 /* e2 = e1 / (e1 + 1.0) + e1 */
17586 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17587 emit_insn (gen_addxf3 (e2, e1, cst1));
17588 emit_insn (gen_divxf3 (e2, e1, e2));
17589 emit_insn (gen_addxf3 (e2, e2, e1));
17590
17591 /* flags = signbit (op1) */
17592 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
17593
17594 /* if (flags) then e2 = -e2 */
17595 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
17596 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
17597 gen_rtx_LABEL_REF (VOIDmode, jump_label),
17598 pc_rtx);
17599 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17600 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17601 JUMP_LABEL (insn) = jump_label;
17602
17603 emit_insn (gen_negxf2 (e2, e2));
17604
17605 emit_label (jump_label);
17606 LABEL_NUSES (jump_label) = 1;
17607
17608 /* op0 = 0.5 * e2 */
17609 half = force_reg (XFmode, half);
17610 emit_insn (gen_mulxf3 (op0, e2, half));
17611 }
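
/* Editorial sketch (not part of the GCC sources): the scalar identity the
   sinh expander above implements, written as standalone C in double
   precision (the expander itself works in XFmode).  Assumes <math.h>; the
   helper name is hypothetical.

     sinh(x) = sign(x) * 0.5 * (e1/(e1+1) + e1),  e1 = expm1(|x|)  */

#include <math.h>

static double
sketch_sinh (double x)
{
  double e1 = expm1 (fabs (x));		/* e^|x| - 1, accurate near zero */
  double e2 = e1 / (e1 + 1.0) + e1;	/* 2 * sinh(|x|) */
  return copysign (0.5 * e2, x);	/* restore the sign, halve */
}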
17612
17613 /* Output code to perform a cosh XFmode calculation. */
17614
17615 void
17616 ix86_emit_i387_cosh (rtx op0, rtx op1)
17617 {
17618 rtx e1 = gen_reg_rtx (XFmode);
17619 rtx e2 = gen_reg_rtx (XFmode);
17620 rtx half = const_double_from_real_value (dconsthalf, XFmode);
17621 rtx cst1;
17622
17623 /* e1 = exp (op1) */
17624 emit_insn (gen_expxf2 (e1, op1));
17625
17626 /* e2 = e1 + 1.0 / e1 */
17627 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17628 emit_insn (gen_divxf3 (e2, cst1, e1));
17629 emit_insn (gen_addxf3 (e2, e1, e2));
17630
17631 /* op0 = 0.5 * e2 */
17632 half = force_reg (XFmode, half);
17633 emit_insn (gen_mulxf3 (op0, e2, half));
17634 }
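
/* Editorial sketch (not part of the GCC sources): the identity used by the
   cosh expander above, as standalone C.  Assumes <math.h>; the helper name
   is hypothetical.  */

#include <math.h>

static double
sketch_cosh (double x)
{
  double e1 = exp (x);
  return 0.5 * (e1 + 1.0 / e1);		/* cosh(x) = (e^x + e^-x) / 2 */
}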
17635
17636 /* Output code to perform a tanh XFmode calculation. */
17637
17638 void
17639 ix86_emit_i387_tanh (rtx op0, rtx op1)
17640 {
17641 rtx e1 = gen_reg_rtx (XFmode);
17642 rtx e2 = gen_reg_rtx (XFmode);
17643 rtx scratch = gen_reg_rtx (HImode);
17644 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
17645 rtx cst2, tmp;
17646 rtx_code_label *jump_label = gen_label_rtx ();
17647 rtx_insn *insn;
17648
17649 /* scratch = fxam (op1) */
17650 emit_insn (gen_fxamxf2_i387 (scratch, op1));
17651
17652 /* e1 = expm1 (-|2 * op1|) */
17653 emit_insn (gen_addxf3 (e2, op1, op1));
17654 emit_insn (gen_absxf2 (e2, e2));
17655 emit_insn (gen_negxf2 (e2, e2));
17656 emit_insn (gen_expm1xf2 (e1, e2));
17657
17658 /* e2 = e1 / (e1 + 2.0) */
17659 cst2 = force_reg (XFmode, CONST2_RTX (XFmode));
17660 emit_insn (gen_addxf3 (e2, e1, cst2));
17661 emit_insn (gen_divxf3 (e2, e1, e2));
17662
17663 /* flags = signbit (op1) */
17664 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
17665
17666 /* if (!flags) then e2 = -e2 */
17667 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
17668 gen_rtx_NE (VOIDmode, flags, const0_rtx),
17669 gen_rtx_LABEL_REF (VOIDmode, jump_label),
17670 pc_rtx);
17671 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17672 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17673 JUMP_LABEL (insn) = jump_label;
17674
17675 emit_insn (gen_negxf2 (e2, e2));
17676
17677 emit_label (jump_label);
17678 LABEL_NUSES (jump_label) = 1;
17679
17680 emit_move_insn (op0, e2);
17681 }
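
/* Editorial sketch (not part of the GCC sources): the identity used by the
   tanh expander above, as standalone C.  Assumes <math.h>; the helper name
   is hypothetical.

     tanh(x) = -sign(x) * e1/(e1+2),  e1 = expm1(-|2x|)  */

#include <math.h>

static double
sketch_tanh (double x)
{
  double e1 = expm1 (-fabs (x + x));
  double e2 = e1 / (e1 + 2.0);
  return signbit (x) ? e2 : -e2;
}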
17682
17683 /* Output code to perform an asinh XFmode calculation. */
17684
17685 void
17686 ix86_emit_i387_asinh (rtx op0, rtx op1)
17687 {
17688 rtx e1 = gen_reg_rtx (XFmode);
17689 rtx e2 = gen_reg_rtx (XFmode);
17690 rtx scratch = gen_reg_rtx (HImode);
17691 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
17692 rtx cst1, tmp;
17693 rtx_code_label *jump_label = gen_label_rtx ();
17694 rtx_insn *insn;
17695
17696 /* e2 = sqrt (op1^2 + 1.0) + 1.0 */
17697 emit_insn (gen_mulxf3 (e1, op1, op1));
17698 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17699 emit_insn (gen_addxf3 (e2, e1, cst1));
17700 emit_insn (gen_sqrtxf2 (e2, e2));
17701 emit_insn (gen_addxf3 (e2, e2, cst1));
17702
17703 /* e1 = e1 / e2 */
17704 emit_insn (gen_divxf3 (e1, e1, e2));
17705
17706 /* scratch = fxam (op1) */
17707 emit_insn (gen_fxamxf2_i387 (scratch, op1));
17708
17709 /* e1 = e1 + |op1| */
17710 emit_insn (gen_absxf2 (e2, op1));
17711 emit_insn (gen_addxf3 (e1, e1, e2));
17712
17713 /* e2 = log1p (e1) */
17714 ix86_emit_i387_log1p (e2, e1);
17715
17716 /* flags = signbit (op1) */
17717 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
17718
17719 /* if (flags) then e2 = -e2 */
17720 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
17721 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
17722 gen_rtx_LABEL_REF (VOIDmode, jump_label),
17723 pc_rtx);
17724 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17725 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17726 JUMP_LABEL (insn) = jump_label;
17727
17728 emit_insn (gen_negxf2 (e2, e2));
17729
17730 emit_label (jump_label);
17731 LABEL_NUSES (jump_label) = 1;
17732
17733 emit_move_insn (op0, e2);
17734 }
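
/* Editorial sketch (not part of the GCC sources): the identity used by the
   asinh expander above, as standalone C.  Assumes <math.h>; the helper name
   is hypothetical.

     asinh(x) = sign(x) * log1p(|x| + x^2 / (sqrt(x^2 + 1) + 1))  */

#include <math.h>

static double
sketch_asinh (double x)
{
  double x2 = x * x;
  double t = fabs (x) + x2 / (sqrt (x2 + 1.0) + 1.0);
  return copysign (log1p (t), x);
}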
17735
17736 /* Output code to perform an acosh XFmode calculation. */
17737
17738 void
17739 ix86_emit_i387_acosh (rtx op0, rtx op1)
17740 {
17741 rtx e1 = gen_reg_rtx (XFmode);
17742 rtx e2 = gen_reg_rtx (XFmode);
17743 rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17744
17745 /* e2 = sqrt (op1 + 1.0) */
17746 emit_insn (gen_addxf3 (e2, op1, cst1));
17747 emit_insn (gen_sqrtxf2 (e2, e2));
17748
17749 /* e1 = sqrt (op1 - 1.0) */
17750 emit_insn (gen_subxf3 (e1, op1, cst1));
17751 emit_insn (gen_sqrtxf2 (e1, e1));
17752
17753 /* e1 = e1 * e2 */
17754 emit_insn (gen_mulxf3 (e1, e1, e2));
17755
17756 /* e1 = e1 + op1 */
17757 emit_insn (gen_addxf3 (e1, e1, op1));
17758
17759 /* op0 = log (e1) */
17760 emit_insn (gen_logxf2 (op0, e1));
17761 }
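
/* Editorial sketch (not part of the GCC sources): the identity used by the
   acosh expander above, as standalone C for x >= 1.  Assumes <math.h>; the
   helper name is hypothetical.  */

#include <math.h>

static double
sketch_acosh (double x)
{
  return log (x + sqrt (x - 1.0) * sqrt (x + 1.0));
}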
17762
17763 /* Output code to perform an atanh XFmode calculation. */
17764
17765 void
17766 ix86_emit_i387_atanh (rtx op0, rtx op1)
17767 {
17768 rtx e1 = gen_reg_rtx (XFmode);
17769 rtx e2 = gen_reg_rtx (XFmode);
17770 rtx scratch = gen_reg_rtx (HImode);
17771 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
17772 rtx half = const_double_from_real_value (dconsthalf, XFmode);
17773 rtx cst1, tmp;
17774 rtx_code_label *jump_label = gen_label_rtx ();
17775 rtx_insn *insn;
17776
17777 /* scratch = fxam (op1) */
17778 emit_insn (gen_fxamxf2_i387 (scratch, op1));
17779
17780 /* e2 = |op1| */
17781 emit_insn (gen_absxf2 (e2, op1));
17782
17783 /* e1 = -(e2 + e2) / (e2 + 1.0) */
17784 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17785 emit_insn (gen_addxf3 (e1, e2, cst1));
17786 emit_insn (gen_addxf3 (e2, e2, e2));
17787 emit_insn (gen_negxf2 (e2, e2));
17788 emit_insn (gen_divxf3 (e1, e2, e1));
17789
17790 /* e2 = log1p (e1) */
17791 ix86_emit_i387_log1p (e2, e1);
17792
17793 /* flags = signbit (op1) */
17794 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
17795
17796 /* if (!flags) then e2 = -e2 */
17797 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
17798 gen_rtx_NE (VOIDmode, flags, const0_rtx),
17799 gen_rtx_LABEL_REF (VOIDmode, jump_label),
17800 pc_rtx);
17801 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17802 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17803 JUMP_LABEL (insn) = jump_label;
17804
17805 emit_insn (gen_negxf2 (e2, e2));
17806
17807 emit_label (jump_label);
17808 LABEL_NUSES (jump_label) = 1;
17809
17810 /* op0 = 0.5 * e2 */
17811 half = force_reg (XFmode, half);
17812 emit_insn (gen_mulxf3 (op0, e2, half));
17813 }
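
/* Editorial sketch (not part of the GCC sources): the identity used by the
   atanh expander above, as standalone C.  Assumes <math.h>; the helper name
   is hypothetical.

     atanh(x) = -sign(x) * 0.5 * log1p(-2|x| / (|x| + 1))  */

#include <math.h>

static double
sketch_atanh (double x)
{
  double ax = fabs (x);
  double e2 = log1p (-(ax + ax) / (ax + 1.0));
  return signbit (x) ? 0.5 * e2 : -0.5 * e2;
}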
17814
17815 /* Output code to perform a log1p XFmode calculation. */
17816
17817 void
17818 ix86_emit_i387_log1p (rtx op0, rtx op1)
17819 {
17820 rtx_code_label *label1 = gen_label_rtx ();
17821 rtx_code_label *label2 = gen_label_rtx ();
17822
17823 rtx tmp = gen_reg_rtx (XFmode);
17824 rtx res = gen_reg_rtx (XFmode);
17825 rtx cst, cstln2, cst1;
17826 rtx_insn *insn;
17827
17828 /* The emit_jump call emits any pending stack adjustment; make sure it is
17829 emitted before the conditional jump, otherwise the stack adjustment would
17830 be only conditional. */
17831 do_pending_stack_adjust ();
17832
17833 cst = const_double_from_real_value
17834 (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode);
17835 cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */
17836
17837 emit_insn (gen_absxf2 (tmp, op1));
17838
17839 cst = force_reg (XFmode, cst);
17840 ix86_expand_branch (GE, tmp, cst, label1);
17841 predict_jump (REG_BR_PROB_BASE * 10 / 100);
17842 insn = get_last_insn ();
17843 JUMP_LABEL (insn) = label1;
17844
17845 emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2));
17846 emit_jump (label2);
17847
17848 emit_label (label1);
17849 LABEL_NUSES (label1) = 1;
17850
17851 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17852 emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1)));
17853 emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2));
17854
17855 emit_label (label2);
17856 LABEL_NUSES (label2) = 1;
17857
17858 emit_move_insn (op0, res);
17859 }
17860
17861 /* Emit code for round calculation. */
17862 void
17863 ix86_emit_i387_round (rtx op0, rtx op1)
17864 {
17865 machine_mode inmode = GET_MODE (op1);
17866 machine_mode outmode = GET_MODE (op0);
17867 rtx e1 = gen_reg_rtx (XFmode);
17868 rtx e2 = gen_reg_rtx (XFmode);
17869 rtx scratch = gen_reg_rtx (HImode);
17870 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
17871 rtx half = const_double_from_real_value (dconsthalf, XFmode);
17872 rtx res = gen_reg_rtx (outmode);
17873 rtx_code_label *jump_label = gen_label_rtx ();
17874 rtx (*floor_insn) (rtx, rtx);
17875 rtx (*neg_insn) (rtx, rtx);
17876 rtx_insn *insn;
17877 rtx tmp;
17878
17879 switch (inmode)
17880 {
17881 case E_SFmode:
17882 case E_DFmode:
17883 tmp = gen_reg_rtx (XFmode);
17884
17885 emit_insn (gen_rtx_SET (tmp, gen_rtx_FLOAT_EXTEND (XFmode, op1)));
17886 op1 = tmp;
17887 break;
17888 case E_XFmode:
17889 break;
17890 default:
17891 gcc_unreachable ();
17892 }
17893
17894 switch (outmode)
17895 {
17896 case E_SFmode:
17897 floor_insn = gen_frndintxf2_floor;
17898 neg_insn = gen_negsf2;
17899 break;
17900 case E_DFmode:
17901 floor_insn = gen_frndintxf2_floor;
17902 neg_insn = gen_negdf2;
17903 break;
17904 case E_XFmode:
17905 floor_insn = gen_frndintxf2_floor;
17906 neg_insn = gen_negxf2;
17907 break;
17908 case E_HImode:
17909 floor_insn = gen_lfloorxfhi2;
17910 neg_insn = gen_neghi2;
17911 break;
17912 case E_SImode:
17913 floor_insn = gen_lfloorxfsi2;
17914 neg_insn = gen_negsi2;
17915 break;
17916 case E_DImode:
17917 floor_insn = gen_lfloorxfdi2;
17918 neg_insn = gen_negdi2;
17919 break;
17920 default:
17921 gcc_unreachable ();
17922 }
17923
17924 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
17925
17926 /* scratch = fxam(op1) */
17927 emit_insn (gen_fxamxf2_i387 (scratch, op1));
17928
17929 /* e1 = fabs(op1) */
17930 emit_insn (gen_absxf2 (e1, op1));
17931
17932 /* e2 = e1 + 0.5 */
17933 half = force_reg (XFmode, half);
17934 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (XFmode, e1, half)));
17935
17936 /* res = floor(e2) */
17937 switch (outmode)
17938 {
17939 case E_SFmode:
17940 case E_DFmode:
17941 {
17942 tmp = gen_reg_rtx (XFmode);
17943
17944 emit_insn (floor_insn (tmp, e2));
17945 emit_insn (gen_rtx_SET (res,
17946 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp),
17947 UNSPEC_TRUNC_NOOP)));
17948 }
17949 break;
17950 default:
17951 emit_insn (floor_insn (res, e2));
17952 }
17953
17954 /* flags = signbit(a) */
17955 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
17956
17957 /* if (flags) then res = -res */
17958 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
17959 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
17960 gen_rtx_LABEL_REF (VOIDmode, jump_label),
17961 pc_rtx);
17962 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17963 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17964 JUMP_LABEL (insn) = jump_label;
17965
17966 emit_insn (neg_insn (res, res));
17967
17968 emit_label (jump_label);
17969 LABEL_NUSES (jump_label) = 1;
17970
17971 emit_move_insn (op0, res);
17972 }
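
/* Editorial sketch (not part of the GCC sources): the identity from the
   comment above, round(a) = sgn(a) * floor(fabs(a) + 0.5), as standalone C.
   Assumes <math.h>; the helper name is hypothetical.  */

#include <math.h>

static double
sketch_round (double a)
{
  double r = floor (fabs (a) + 0.5);
  return signbit (a) ? -r : r;
}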
17973
17974 /* Output code to perform a Newton-Raphson approximation of a single precision
17975 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
17976
17977 void
17978 ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
17979 {
17980 rtx x0, x1, e0, e1;
17981
17982 x0 = gen_reg_rtx (mode);
17983 e0 = gen_reg_rtx (mode);
17984 e1 = gen_reg_rtx (mode);
17985 x1 = gen_reg_rtx (mode);
17986
17987 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
17988
17989 b = force_reg (mode, b);
17990
17991 /* x0 = rcp(b) estimate */
17992 if (mode == V16SFmode || mode == V8DFmode)
17993 {
17994 if (TARGET_AVX512ER)
17995 {
17996 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
17997 UNSPEC_RCP28)));
17998 /* res = a * x0 */
17999 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
18000 return;
18001 }
18002 else
18003 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
18004 UNSPEC_RCP14)));
18005 }
18006 else
18007 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
18008 UNSPEC_RCP)));
18009
18010 /* e0 = x0 * b */
18011 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
18012
18013 /* e0 = x0 * e0 */
18014 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
18015
18016 /* e1 = x0 + x0 */
18017 emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
18018
18019 /* x1 = e1 - e0 */
18020 emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
18021
18022 /* res = a * x1 */
18023 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
18024 }
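
/* Editorial sketch (not part of the GCC sources): the Newton-Raphson step
   above in scalar C.  X0 stands for the hardware reciprocal estimate
   (RCPSS/RCP14/RCP28); the helper name and the explicit X0 parameter are
   assumptions for illustration only.  */

static float
sketch_swdiv (float a, float b, float x0)
{
  float e0 = x0 * b;
  e0 = x0 * e0;			/* b * x0 * x0 */
  float x1 = (x0 + x0) - e0;	/* x0 * (2 - b * x0): one Newton-Raphson step */
  return a * x1;		/* a / b ~= a * x1 */
}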
18025
18026 /* Output code to perform a Newton-Raphson approximation of a
18027 single precision floating point [reciprocal] square root. */
18028
18029 void
18030 ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
18031 {
18032 rtx x0, e0, e1, e2, e3, mthree, mhalf;
18033 REAL_VALUE_TYPE r;
18034 int unspec;
18035
18036 x0 = gen_reg_rtx (mode);
18037 e0 = gen_reg_rtx (mode);
18038 e1 = gen_reg_rtx (mode);
18039 e2 = gen_reg_rtx (mode);
18040 e3 = gen_reg_rtx (mode);
18041
18042 if (TARGET_AVX512ER && mode == V16SFmode)
18043 {
18044 if (recip)
18045 /* res = rsqrt28(a) estimate */
18046 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
18047 UNSPEC_RSQRT28)));
18048 else
18049 {
18050 /* x0 = rsqrt28(a) estimate */
18051 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
18052 UNSPEC_RSQRT28)));
18053 /* res = rcp28(x0) estimate */
18054 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
18055 UNSPEC_RCP28)));
18056 }
18057 return;
18058 }
18059
18060 real_from_integer (&r, VOIDmode, -3, SIGNED);
18061 mthree = const_double_from_real_value (r, SFmode);
18062
18063 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
18064 mhalf = const_double_from_real_value (r, SFmode);
18065 unspec = UNSPEC_RSQRT;
18066
18067 if (VECTOR_MODE_P (mode))
18068 {
18069 mthree = ix86_build_const_vector (mode, true, mthree);
18070 mhalf = ix86_build_const_vector (mode, true, mhalf);
18071 /* There is no 512-bit rsqrt. There is however rsqrt14. */
18072 if (GET_MODE_SIZE (mode) == 64)
18073 unspec = UNSPEC_RSQRT14;
18074 }
18075
18076 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
18077 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
18078
18079 a = force_reg (mode, a);
18080
18081 /* x0 = rsqrt(a) estimate */
18082 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
18083 unspec)));
18084
18085 /* If (a == 0.0) Filter out infinity to prevent NaN for sqrt(0.0). */
18086 if (!recip)
18087 {
18088 rtx zero = force_reg (mode, CONST0_RTX(mode));
18089 rtx mask;
18090
18091 /* Handle masked compare. */
18092 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
18093 {
18094 mask = gen_reg_rtx (HImode);
18095 /* Imm value 0x4 corresponds to not-equal comparison. */
18096 emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
18097 emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
18098 }
18099 else
18100 {
18101 mask = gen_reg_rtx (mode);
18102 emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
18103 emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
18104 }
18105 }
18106
18107 mthree = force_reg (mode, mthree);
18108
18109 /* e0 = x0 * a */
18110 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
18111
18112 unsigned vector_size = GET_MODE_SIZE (mode);
18113 if (TARGET_FMA
18114 || (TARGET_AVX512F && vector_size == 64)
18115 || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
18116 emit_insn (gen_rtx_SET (e2,
18117 gen_rtx_FMA (mode, e0, x0, mthree)));
18118 else
18119 {
18120 /* e1 = e0 * x0 */
18121 emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
18122
18123 /* e2 = e1 - 3. */
18124 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
18125 }
18126
18127 mhalf = force_reg (mode, mhalf);
18128 if (recip)
18129 /* e3 = -.5 * x0 */
18130 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
18131 else
18132 /* e3 = -.5 * e0 */
18133 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
18134 /* ret = e2 * e3 */
18135 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
18136 }
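
/* Editorial sketch (not part of the GCC sources): the Newton-Raphson step
   above in scalar C, omitting the a == 0.0 masking.  X0 stands for the
   hardware rsqrt estimate (RSQRTSS/RSQRT14); the helper name and the
   explicit X0 parameter are assumptions for illustration only.  */

static float
sketch_swsqrt (float a, float x0, int recip)
{
  float e0 = x0 * a;
  float e2 = e0 * x0 - 3.0f;		/* a * x0 * x0 - 3 */
  float e3 = -0.5f * (recip ? x0 : e0);
  return e2 * e3;			/* recip ? ~1/sqrt(a) : ~sqrt(a) */
}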
18137
18138 /* Expand fabs (OP0) and return a new rtx that holds the result. The
18139 mask for masking out the sign-bit is stored in *SMASK, if that is
18140 non-null. */
18141
18142 static rtx
18143 ix86_expand_sse_fabs (rtx op0, rtx *smask)
18144 {
18145 machine_mode vmode, mode = GET_MODE (op0);
18146 rtx xa, mask;
18147
18148 xa = gen_reg_rtx (mode);
18149 if (mode == SFmode)
18150 vmode = V4SFmode;
18151 else if (mode == DFmode)
18152 vmode = V2DFmode;
18153 else
18154 vmode = mode;
18155 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
18156 if (!VECTOR_MODE_P (mode))
18157 {
18158 /* We need to generate a scalar mode mask in this case. */
18159 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
18160 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
18161 mask = gen_reg_rtx (mode);
18162 emit_insn (gen_rtx_SET (mask, tmp));
18163 }
18164 emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
18165
18166 if (smask)
18167 *smask = mask;
18168
18169 return xa;
18170 }
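
/* Editorial sketch (not part of the GCC sources): the bit-level effect of
   the mask built above, shown for DFmode as standalone C.  Assumes
   <stdint.h> and <string.h>; the helper name is hypothetical.  */

#include <stdint.h>
#include <string.h>

static double
sketch_sse_fabs (double x, uint64_t *smask)
{
  uint64_t mask = ~(UINT64_C (1) << 63);	/* everything but the sign bit */
  uint64_t bits;

  memcpy (&bits, &x, sizeof bits);
  bits &= mask;					/* AND clears the sign bit */
  memcpy (&x, &bits, sizeof bits);
  if (smask)
    *smask = mask;
  return x;
}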
18171
18172 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
18173 swapping the operands if SWAP_OPERANDS is true. The expanded
18174 code is a forward jump to a newly created label in case the
18175 comparison is true. The generated label rtx is returned. */
18176 static rtx_code_label *
18177 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
18178 bool swap_operands)
18179 {
18180 bool unordered_compare = ix86_unordered_fp_compare (code);
18181 rtx_code_label *label;
18182 rtx tmp, reg;
18183
18184 if (swap_operands)
18185 std::swap (op0, op1);
18186
18187 label = gen_label_rtx ();
18188 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
18189 if (unordered_compare)
18190 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
18191 reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
18192 emit_insn (gen_rtx_SET (reg, tmp));
18193 tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
18194 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
18195 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
18196 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
18197 JUMP_LABEL (tmp) = label;
18198
18199 return label;
18200 }
18201
18202 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
18203 using comparison code CODE. Operands are swapped for the comparison if
18204 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
18205 static rtx
18206 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
18207 bool swap_operands)
18208 {
18209 rtx (*insn)(rtx, rtx, rtx, rtx);
18210 machine_mode mode = GET_MODE (op0);
18211 rtx mask = gen_reg_rtx (mode);
18212
18213 if (swap_operands)
18214 std::swap (op0, op1);
18215
18216 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
18217
18218 emit_insn (insn (mask, op0, op1,
18219 gen_rtx_fmt_ee (code, mode, op0, op1)));
18220 return mask;
18221 }
18222
18223 /* Expand copysign from SIGN to the positive value ABS_VALUE
18224 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
18225 the sign-bit. */
18226
18227 static void
18228 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
18229 {
18230 machine_mode mode = GET_MODE (sign);
18231 rtx sgn = gen_reg_rtx (mode);
18232 if (mask == NULL_RTX)
18233 {
18234 machine_mode vmode;
18235
18236 if (mode == SFmode)
18237 vmode = V4SFmode;
18238 else if (mode == DFmode)
18239 vmode = V2DFmode;
18240 else
18241 vmode = mode;
18242
18243 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
18244 if (!VECTOR_MODE_P (mode))
18245 {
18246 /* We need to generate a scalar mode mask in this case. */
18247 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
18248 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
18249 mask = gen_reg_rtx (mode);
18250 emit_insn (gen_rtx_SET (mask, tmp));
18251 }
18252 }
18253 else
18254 mask = gen_rtx_NOT (mode, mask);
18255 emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
18256 emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
18257 }
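
/* Editorial sketch (not part of the GCC sources): the bit-level effect of
   the AND/IOR pair above for DFmode, assuming ABS_VALUE is already
   nonnegative.  Assumes <stdint.h> and <string.h>; the helper name is
   hypothetical.  */

#include <stdint.h>
#include <string.h>

static double
sketch_copysign_to_positive (double abs_value, double sign)
{
  uint64_t a, s;

  memcpy (&a, &abs_value, sizeof a);
  memcpy (&s, &sign, sizeof s);
  a |= s & (UINT64_C (1) << 63);	/* IOR in the sign bit of SIGN */
  memcpy (&abs_value, &a, sizeof a);
  return abs_value;
}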
18258
18259 /* Expand SSE sequence for computing lround from OP1 storing
18260 into OP0. */
18261
18262 void
18263 ix86_expand_lround (rtx op0, rtx op1)
18264 {
18265 /* C code for the stuff we're doing below:
18266 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
18267 return (long)tmp;
18268 */
18269 machine_mode mode = GET_MODE (op1);
18270 const struct real_format *fmt;
18271 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
18272 rtx adj;
18273
18274 /* load nextafter (0.5, 0.0) */
18275 fmt = REAL_MODE_FORMAT (mode);
18276 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
18277 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
18278
18279 /* adj = copysign (0.5, op1) */
18280 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
18281 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
18282
18283 /* adj = op1 + adj */
18284 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
18285
18286 /* op0 = (imode)adj */
18287 expand_fix (op0, adj, 0);
18288 }
18289
18290 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
18291 into OPERAND0. */
18292
18293 void
18294 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
18295 {
18296 /* C code for the stuff we're doing below (for do_floor):
18297 xi = (long)op1;
18298 xi -= (double)xi > op1 ? 1 : 0;
18299 return xi;
18300 */
18301 machine_mode fmode = GET_MODE (op1);
18302 machine_mode imode = GET_MODE (op0);
18303 rtx ireg, freg, tmp;
18304 rtx_code_label *label;
18305
18306 /* reg = (long)op1 */
18307 ireg = gen_reg_rtx (imode);
18308 expand_fix (ireg, op1, 0);
18309
18310 /* freg = (double)reg */
18311 freg = gen_reg_rtx (fmode);
18312 expand_float (freg, ireg, 0);
18313
18314 /* ireg = (freg > op1) ? ireg - 1 : ireg */
18315 label = ix86_expand_sse_compare_and_jump (UNLE,
18316 freg, op1, !do_floor);
18317 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
18318 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
18319 emit_move_insn (ireg, tmp);
18320
18321 emit_label (label);
18322 LABEL_NUSES (label) = 1;
18323
18324 emit_move_insn (op0, ireg);
18325 }
18326
18327 /* Generate and return a rtx of mode MODE for 2**n where n is the number
18328 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
18329
18330 static rtx
18331 ix86_gen_TWO52 (machine_mode mode)
18332 {
18333 const struct real_format *fmt;
18334 REAL_VALUE_TYPE TWO52r;
18335 rtx TWO52;
18336
18337 fmt = REAL_MODE_FORMAT (mode);
18338 real_2expN (&TWO52r, fmt->p - 1, mode);
18339 TWO52 = const_double_from_real_value (TWO52r, mode);
18340 TWO52 = force_reg (mode, TWO52);
18341
18342 return TWO52;
18343 }
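
/* Editorial sketch (not part of the GCC sources): why the expanders below
   add and subtract 2**52.  Doubles with magnitude >= 2**52 have no
   fraction bits, so the addition rounds X to an integer in the current
   rounding mode and the subtraction recovers that integer.  The helper
   name is hypothetical and the sketch assumes 0 <= x < 2**52.  */

static double
sketch_round_via_two52 (double x)
{
  const double two52 = 0x1p52;	/* 2**52 */
  return (x + two52) - two52;
}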
18344
18345 /* Expand rint rounding OPERAND1 and storing the result in OPERAND0. */
18346
18347 void
18348 ix86_expand_rint (rtx operand0, rtx operand1)
18349 {
18350 /* C code for the stuff we're doing below:
18351 xa = fabs (operand1);
18352 if (!isless (xa, 2**52))
18353 return operand1;
18354 two52 = 2**52;
18355 if (flag_rounding_math)
18356 {
18357 two52 = copysign (two52, operand1);
18358 xa = operand1;
18359 }
18360 xa = xa + two52 - two52;
18361 return copysign (xa, operand1);
18362 */
18363 machine_mode mode = GET_MODE (operand0);
18364 rtx res, xa, TWO52, mask;
18365 rtx_code_label *label;
18366
18367 TWO52 = ix86_gen_TWO52 (mode);
18368
18369 /* Temporary for holding the result, initialized to the input
18370 operand to ease control flow. */
18371 res = copy_to_reg (operand1);
18372
18373 /* xa = abs (operand1) */
18374 xa = ix86_expand_sse_fabs (res, &mask);
18375
18376 /* if (!isless (xa, TWO52)) goto label; */
18377 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18378
18379 if (flag_rounding_math)
18380 {
18381 ix86_sse_copysign_to_positive (TWO52, TWO52, res, mask);
18382 xa = res;
18383 }
18384
18385 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
18386 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
18387
18388 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
18389 if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
18390 xa = ix86_expand_sse_fabs (xa, NULL);
18391
18392 ix86_sse_copysign_to_positive (res, xa, res, mask);
18393
18394 emit_label (label);
18395 LABEL_NUSES (label) = 1;
18396
18397 emit_move_insn (operand0, res);
18398 }
18399
18400 /* Expand SSE2 sequence for computing floor or ceil
18401 from OPERAND1 storing into OPERAND0. */
18402 void
18403 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
18404 {
18405 /* C code for the stuff we expand below.
18406 double xa = fabs (x), x2;
18407 if (!isless (xa, TWO52))
18408 return x;
18409 x2 = (double)(long)x;
18410
18411 Compensate. Floor:
18412 if (x2 > x)
18413 x2 -= 1;
18414 Compensate. Ceil:
18415 if (x2 < x)
18416 x2 += 1;
18417
18418 if (HONOR_SIGNED_ZEROS (mode))
18419 return copysign (x2, x);
18420 return x2;
18421 */
18422 machine_mode mode = GET_MODE (operand0);
18423 rtx xa, xi, TWO52, tmp, one, res, mask;
18424 rtx_code_label *label;
18425
18426 TWO52 = ix86_gen_TWO52 (mode);
18427
18428 /* Temporary for holding the result, initialized to the input
18429 operand to ease control flow. */
18430 res = copy_to_reg (operand1);
18431
18432 /* xa = abs (operand1) */
18433 xa = ix86_expand_sse_fabs (res, &mask);
18434
18435 /* if (!isless (xa, TWO52)) goto label; */
18436 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18437
18438 /* xa = (double)(long)x */
18439 xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
18440 expand_fix (xi, res, 0);
18441 expand_float (xa, xi, 0);
18442
18443 /* generate 1.0 */
18444 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
18445
18446 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
18447 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
18448 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
18449 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
18450 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
18451 if (HONOR_SIGNED_ZEROS (mode))
18452 {
18453 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
18454 if (do_floor && flag_rounding_math)
18455 tmp = ix86_expand_sse_fabs (tmp, NULL);
18456
18457 ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
18458 }
18459 emit_move_insn (res, tmp);
18460
18461 emit_label (label);
18462 LABEL_NUSES (label) = 1;
18463
18464 emit_move_insn (operand0, res);
18465 }
18466
18467 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
18468 into OPERAND0 without relying on DImode truncation via cvttsd2siq
18469 that is only available on 64bit targets. */
18470 void
18471 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
18472 {
18473 /* C code for the stuff we expand below.
18474 double xa = fabs (x), x2;
18475 if (!isless (xa, TWO52))
18476 return x;
18477 xa = xa + TWO52 - TWO52;
18478 x2 = copysign (xa, x);
18479
18480 Compensate. Floor:
18481 if (x2 > x)
18482 x2 -= 1;
18483 Compensate. Ceil:
18484 if (x2 < x)
18485 x2 += 1;
18486
18487 if (HONOR_SIGNED_ZEROS (mode))
18488 x2 = copysign (x2, x);
18489 return x2;
18490 */
18491 machine_mode mode = GET_MODE (operand0);
18492 rtx xa, TWO52, tmp, one, res, mask;
18493 rtx_code_label *label;
18494
18495 TWO52 = ix86_gen_TWO52 (mode);
18496
18497 /* Temporary for holding the result, initialized to the input
18498 operand to ease control flow. */
18499 res = copy_to_reg (operand1);
18500
18501 /* xa = abs (operand1) */
18502 xa = ix86_expand_sse_fabs (res, &mask);
18503
18504 /* if (!isless (xa, TWO52)) goto label; */
18505 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18506
18507 /* xa = xa + TWO52 - TWO52; */
18508 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
18509 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
18510
18511 /* xa = copysign (xa, operand1) */
18512 ix86_sse_copysign_to_positive (xa, xa, res, mask);
18513
18514 /* generate 1.0 */
18515 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
18516
18517 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
18518 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
18519 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
18520 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
18521 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
18522 if (HONOR_SIGNED_ZEROS (mode))
18523 {
18524 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
18525 if (do_floor && flag_rounding_math)
18526 tmp = ix86_expand_sse_fabs (tmp, NULL);
18527
18528 ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
18529 }
18530 emit_move_insn (res, tmp);
18531
18532 emit_label (label);
18533 LABEL_NUSES (label) = 1;
18534
18535 emit_move_insn (operand0, res);
18536 }
18537
18538 /* Expand SSE sequence for computing trunc
18539 from OPERAND1 storing into OPERAND0. */
18540 void
18541 ix86_expand_trunc (rtx operand0, rtx operand1)
18542 {
18543 /* C code for SSE variant we expand below.
18544 double xa = fabs (x), x2;
18545 if (!isless (xa, TWO52))
18546 return x;
18547 x2 = (double)(long)x;
18548 if (HONOR_SIGNED_ZEROS (mode))
18549 return copysign (x2, x);
18550 return x2;
18551 */
18552 machine_mode mode = GET_MODE (operand0);
18553 rtx xa, xi, TWO52, res, mask;
18554 rtx_code_label *label;
18555
18556 TWO52 = ix86_gen_TWO52 (mode);
18557
18558 /* Temporary for holding the result, initialized to the input
18559 operand to ease control flow. */
18560 res = copy_to_reg (operand1);
18561
18562 /* xa = abs (operand1) */
18563 xa = ix86_expand_sse_fabs (res, &mask);
18564
18565 /* if (!isless (xa, TWO52)) goto label; */
18566 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18567
18568 /* xa = (double)(long)x */
18569 xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
18570 expand_fix (xi, res, 0);
18571 expand_float (xa, xi, 0);
18572
18573 if (HONOR_SIGNED_ZEROS (mode))
18574 ix86_sse_copysign_to_positive (xa, xa, res, mask);
18575
18576 emit_move_insn (res, xa);
18577
18578 emit_label (label);
18579 LABEL_NUSES (label) = 1;
18580
18581 emit_move_insn (operand0, res);
18582 }
18583
18584 /* Expand SSE sequence for computing trunc from OPERAND1 storing
18585 into OPERAND0 without relying on DImode truncation via cvttsd2siq
18586 that is only available on 64bit targets. */
18587 void
18588 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
18589 {
18590 machine_mode mode = GET_MODE (operand0);
18591 rtx xa, xa2, TWO52, tmp, one, res, mask;
18592 rtx_code_label *label;
18593
18594 /* C code for SSE variant we expand below.
18595 double xa = fabs (x), x2;
18596 if (!isless (xa, TWO52))
18597 return x;
18598 xa2 = xa + TWO52 - TWO52;
18599 Compensate:
18600 if (xa2 > xa)
18601 xa2 -= 1.0;
18602 x2 = copysign (xa2, x);
18603 return x2;
18604 */
18605
18606 TWO52 = ix86_gen_TWO52 (mode);
18607
18608 /* Temporary for holding the result, initialized to the input
18609 operand to ease control flow. */
18610 res = copy_to_reg (operand1);
18611
18612 /* xa = abs (operand1) */
18613 xa = ix86_expand_sse_fabs (res, &mask);
18614
18615 /* if (!isless (xa, TWO52)) goto label; */
18616 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18617
18618 /* xa2 = xa + TWO52 - TWO52; */
18619 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
18620 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
18621
18622 /* generate 1.0 */
18623 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
18624
18625 /* Compensate: xa2 = xa2 - (xa2 > xa ? 1 : 0) */
18626 tmp = ix86_expand_sse_compare_mask (UNGT, xa2, xa, false);
18627 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
18628 tmp = expand_simple_binop (mode, MINUS,
18629 xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
18630 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
18631 if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
18632 tmp = ix86_expand_sse_fabs (tmp, NULL);
18633
18634 /* res = copysign (xa2, operand1) */
18635 ix86_sse_copysign_to_positive (res, tmp, res, mask);
18636
18637 emit_label (label);
18638 LABEL_NUSES (label) = 1;
18639
18640 emit_move_insn (operand0, res);
18641 }
18642
18643 /* Expand SSE sequence for computing round
18644 from OPERAND1 storing into OPERAND0. */
18645 void
18646 ix86_expand_round (rtx operand0, rtx operand1)
18647 {
18648 /* C code for the stuff we're doing below:
18649 double xa = fabs (x);
18650 if (!isless (xa, TWO52))
18651 return x;
18652 xa = (double)(long)(xa + nextafter (0.5, 0.0));
18653 return copysign (xa, x);
18654 */
18655 machine_mode mode = GET_MODE (operand0);
18656 rtx res, TWO52, xa, xi, half, mask;
18657 rtx_code_label *label;
18658 const struct real_format *fmt;
18659 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
18660
18661 /* Temporary for holding the result, initialized to the input
18662 operand to ease control flow. */
18663 res = copy_to_reg (operand1);
18664
18665 TWO52 = ix86_gen_TWO52 (mode);
18666 xa = ix86_expand_sse_fabs (res, &mask);
18667 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18668
18669 /* load nextafter (0.5, 0.0) */
18670 fmt = REAL_MODE_FORMAT (mode);
18671 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
18672 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
18673
18674 /* xa = xa + 0.5 */
18675 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
18676 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
18677
18678 /* xa = (double)(int64_t)xa */
18679 xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
18680 expand_fix (xi, xa, 0);
18681 expand_float (xa, xi, 0);
18682
18683 /* res = copysign (xa, operand1) */
18684 ix86_sse_copysign_to_positive (res, xa, res, mask);
18685
18686 emit_label (label);
18687 LABEL_NUSES (label) = 1;
18688
18689 emit_move_insn (operand0, res);
18690 }
18691
18692 /* Expand SSE sequence for computing round from OPERAND1 storing
18693 into OPERAND0 without relying on DImode truncation via cvttsd2siq
18694 that is only available on 64bit targets. */
18695 void
18696 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
18697 {
18698 /* C code for the stuff we expand below.
18699 double xa = fabs (x), xa2, x2;
18700 if (!isless (xa, TWO52))
18701 return x;
18702 Using the absolute value and copying back sign makes
18703 -0.0 -> -0.0 correct.
18704 xa2 = xa + TWO52 - TWO52;
18705 Compensate.
18706 dxa = xa2 - xa;
18707 if (dxa <= -0.5)
18708 xa2 += 1;
18709 else if (dxa > 0.5)
18710 xa2 -= 1;
18711 x2 = copysign (xa2, x);
18712 return x2;
18713 */
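/* For example, with DFmode and round-to-nearest: x = 2.6 gives
   xa2 = (2.6 + 2^52) - 2^52 = 3.0 and dxa just below 0.4, so no
   compensation triggers and the result is 3.0.  x = 2.5 gives xa2 = 2.0
   (the tie rounds to even) and dxa = -0.5, so the dxa <= -0.5 branch
   adds 1.0 and yields 3.0 = round (2.5).  */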
18714 machine_mode mode = GET_MODE (operand0);
18715 rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
18716 rtx_code_label *label;
18717
18718 TWO52 = ix86_gen_TWO52 (mode);
18719
18720 /* Temporary for holding the result, initialized to the input
18721 operand to ease control flow. */
18722 res = copy_to_reg (operand1);
18723
18724 /* xa = abs (operand1) */
18725 xa = ix86_expand_sse_fabs (res, &mask);
18726
18727 /* if (!isless (xa, TWO52)) goto label; */
18728 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18729
18730 /* xa2 = xa + TWO52 - TWO52; */
18731 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
18732 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
18733
18734 /* dxa = xa2 - xa; */
18735 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
18736
18737 /* generate 0.5, 1.0 and -0.5 */
18738 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
18739 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
18740 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
18741 0, OPTAB_DIRECT);
18742
18743 /* Compensate. */
18744 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
18745 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
18746 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
18747 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
18748 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
18749 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
18750 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
18751 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
18752
18753 /* res = copysign (xa2, operand1) */
18754 ix86_sse_copysign_to_positive (res, xa2, res, mask);
18755
18756 emit_label (label);
18757 LABEL_NUSES (label) = 1;
18758
18759 emit_move_insn (operand0, res);
18760 }
18761
18762 /* Expand SSE sequence for computing round
18763 from OP1 storing into OP0 using sse4 round insn. */
18764 void
18765 ix86_expand_round_sse4 (rtx op0, rtx op1)
18766 {
18767 machine_mode mode = GET_MODE (op0);
18768 rtx e1, e2, res, half;
18769 const struct real_format *fmt;
18770 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
18771 rtx (*gen_copysign) (rtx, rtx, rtx);
18772 rtx (*gen_round) (rtx, rtx, rtx);
18773
18774 switch (mode)
18775 {
18776 case E_SFmode:
18777 gen_copysign = gen_copysignsf3;
18778 gen_round = gen_sse4_1_roundsf2;
18779 break;
18780 case E_DFmode:
18781 gen_copysign = gen_copysigndf3;
18782 gen_round = gen_sse4_1_rounddf2;
18783 break;
18784 default:
18785 gcc_unreachable ();
18786 }
18787
18788 /* round (a) = trunc (a + copysign (0.5, a)) */
18789
18790 /* load nextafter (0.5, 0.0) */
18791 fmt = REAL_MODE_FORMAT (mode);
18792 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
18793 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
18794 half = const_double_from_real_value (pred_half, mode);
18795
18796 /* e1 = copysign (0.5, op1) */
18797 e1 = gen_reg_rtx (mode);
18798 emit_insn (gen_copysign (e1, half, op1));
18799
18800 /* e2 = op1 + e1 */
18801 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
18802
18803 /* res = trunc (e2) */
18804 res = gen_reg_rtx (mode);
18805 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
18806
18807 emit_move_insn (op0, res);
18808 }
18809
18810 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
18811 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
18812 insn every time. */
18813
18814 static GTY(()) rtx_insn *vselect_insn;
18815
18816 /* Initialize vselect_insn. */
18817
18818 static void
18819 init_vselect_insn (void)
18820 {
18821 unsigned i;
18822 rtx x;
18823
18824 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
18825 for (i = 0; i < MAX_VECT_LEN; ++i)
18826 XVECEXP (x, 0, i) = const0_rtx;
18827 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
18828 const0_rtx), x);
18829 x = gen_rtx_SET (const0_rtx, x);
18830 start_sequence ();
18831 vselect_insn = emit_insn (x);
18832 end_sequence ();
18833 }
18834
18835 /* Construct (set target (vec_select op0 (parallel perm))) and
18836 return true if that's a valid instruction in the active ISA. */
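/* For instance, with V4SImode, OP0 in a register and PERM = {2, 3, 0, 1},
   the pattern becomes
     (set target (vec_select:V4SI op0 (parallel [2 3 0 1])))
   which recog can match as a single pshufd when SSE2 is enabled.  */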
18837
18838 static bool
18839 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
18840 unsigned nelt, bool testing_p)
18841 {
18842 unsigned int i;
18843 rtx x, save_vconcat;
18844 int icode;
18845
18846 if (vselect_insn == NULL_RTX)
18847 init_vselect_insn ();
18848
18849 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
18850 PUT_NUM_ELEM (XVEC (x, 0), nelt);
18851 for (i = 0; i < nelt; ++i)
18852 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
18853 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
18854 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
18855 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
18856 SET_DEST (PATTERN (vselect_insn)) = target;
18857 icode = recog_memoized (vselect_insn);
18858
18859 if (icode >= 0 && !testing_p)
18860 emit_insn (copy_rtx (PATTERN (vselect_insn)));
18861
18862 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
18863 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
18864 INSN_CODE (vselect_insn) = -1;
18865
18866 return icode >= 0;
18867 }
18868
18869 /* Similar, but generate a vec_concat from op0 and op1 as well. */
18870
18871 static bool
18872 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
18873 const unsigned char *perm, unsigned nelt,
18874 bool testing_p)
18875 {
18876 machine_mode v2mode;
18877 rtx x;
18878 bool ok;
18879
18880 if (vselect_insn == NULL_RTX)
18881 init_vselect_insn ();
18882
18883 if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
18884 return false;
18885 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
18886 PUT_MODE (x, v2mode);
18887 XEXP (x, 0) = op0;
18888 XEXP (x, 1) = op1;
18889 ok = expand_vselect (target, x, perm, nelt, testing_p);
18890 XEXP (x, 0) = const0_rtx;
18891 XEXP (x, 1) = const0_rtx;
18892 return ok;
18893 }
18894
18895 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
18896 using movss or movsd. */
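/* For example, with V4SFmode the permutation {4, 1, 2, 3} takes element 0
   from op1 and elements 1..3 from op0, which is exactly what movss does;
   {0, 5, 6, 7} is the same operation with the operands swapped.  */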
18897 static bool
18898 expand_vec_perm_movs (struct expand_vec_perm_d *d)
18899 {
18900 machine_mode vmode = d->vmode;
18901 unsigned i, nelt = d->nelt;
18902 rtx x;
18903
18904 if (d->one_operand_p)
18905 return false;
18906
18907 if (!(TARGET_SSE && vmode == V4SFmode)
18908 && !(TARGET_SSE && vmode == V4SImode)
18909 && !(TARGET_MMX_WITH_SSE && vmode == V2SFmode)
18910 && !(TARGET_SSE2 && vmode == V2DFmode)
18911 && !(TARGET_SSE2 && vmode == V2DImode))
18912 return false;
18913
18914 /* Only the first element is changed. */
18915 if (d->perm[0] != nelt && d->perm[0] != 0)
18916 return false;
18917 for (i = 1; i < nelt; ++i)
18918 if (d->perm[i] != i + nelt - d->perm[0])
18919 return false;
18920
18921 if (d->testing_p)
18922 return true;
18923
18924 if (d->perm[0] == nelt)
18925 x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
18926 else
18927 x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));
18928
18929 emit_insn (gen_rtx_SET (d->target, x));
18930
18931 return true;
18932 }
18933
18934 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
18935 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
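/* For example, the V8HImode permutation {0, 9, 2, 11, 4, 13, 6, 15}
   keeps every element in its position, taking the odd positions from op1,
   so it can be emitted as pblendw with immediate mask 0xaa.  */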
18936
18937 static bool
18938 expand_vec_perm_blend (struct expand_vec_perm_d *d)
18939 {
18940 machine_mode mmode, vmode = d->vmode;
18941 unsigned i, nelt = d->nelt;
18942 unsigned HOST_WIDE_INT mask;
18943 rtx target, op0, op1, maskop, x;
18944 rtx rperm[32], vperm;
18945
18946 if (d->one_operand_p)
18947 return false;
18948 if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
18949 && (TARGET_AVX512BW
18950 || GET_MODE_UNIT_SIZE (vmode) >= 4))
18951 ;
18952 else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
18953 ;
18954 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
18955 ;
18956 else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 16
18957 || GET_MODE_SIZE (vmode) == 8
18958 || GET_MODE_SIZE (vmode) == 4))
18959 ;
18960 else
18961 return false;
18962
18963 /* This is a blend, not a permute. Elements must stay in their
18964 respective lanes. */
18965 for (i = 0; i < nelt; ++i)
18966 {
18967 unsigned e = d->perm[i];
18968 if (!(e == i || e == i + nelt))
18969 return false;
18970 }
18971
18972 if (d->testing_p)
18973 return true;
18974
18975 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
18976 decision should be extracted elsewhere, so that we only try that
18977 sequence once all budget==3 options have been tried. */
18978 target = d->target;
18979 op0 = d->op0;
18980 op1 = d->op1;
18981 mask = 0;
18982
18983 switch (vmode)
18984 {
18985 case E_V8DFmode:
18986 case E_V16SFmode:
18987 case E_V4DFmode:
18988 case E_V8SFmode:
18989 case E_V2DFmode:
18990 case E_V4SFmode:
18991 case E_V4HImode:
18992 case E_V8HImode:
18993 case E_V8SImode:
18994 case E_V32HImode:
18995 case E_V64QImode:
18996 case E_V16SImode:
18997 case E_V8DImode:
18998 for (i = 0; i < nelt; ++i)
18999 mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
19000 break;
19001
19002 case E_V2DImode:
19003 for (i = 0; i < 2; ++i)
19004 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
19005 vmode = V8HImode;
19006 goto do_subreg;
19007
19008 case E_V2SImode:
19009 for (i = 0; i < 2; ++i)
19010 mask |= (d->perm[i] >= 2 ? 3 : 0) << (i * 2);
19011 vmode = V4HImode;
19012 goto do_subreg;
19013
19014 case E_V4SImode:
19015 for (i = 0; i < 4; ++i)
19016 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
19017 vmode = V8HImode;
19018 goto do_subreg;
19019
19020 case E_V16QImode:
19021 /* See if bytes move in pairs so we can use pblendw with
19022 an immediate argument, rather than pblendvb with a vector
19023 argument. */
19024 for (i = 0; i < 16; i += 2)
19025 if (d->perm[i] + 1 != d->perm[i + 1])
19026 {
19027 use_pblendvb:
19028 for (i = 0; i < nelt; ++i)
19029 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
19030
19031 finish_pblendvb:
19032 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
19033 vperm = force_reg (vmode, vperm);
19034
19035 if (GET_MODE_SIZE (vmode) == 4)
19036 emit_insn (gen_mmx_pblendvb_v4qi (target, op0, op1, vperm));
19037 else if (GET_MODE_SIZE (vmode) == 8)
19038 emit_insn (gen_mmx_pblendvb_v8qi (target, op0, op1, vperm));
19039 else if (GET_MODE_SIZE (vmode) == 16)
19040 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
19041 else
19042 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
19043 if (target != d->target)
19044 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
19045 return true;
19046 }
19047
19048 for (i = 0; i < 8; ++i)
19049 mask |= (d->perm[i * 2] >= 16) << i;
19050 vmode = V8HImode;
19051 /* FALLTHRU */
19052
19053 do_subreg:
19054 target = gen_reg_rtx (vmode);
19055 op0 = gen_lowpart (vmode, op0);
19056 op1 = gen_lowpart (vmode, op1);
19057 break;
19058
19059 case E_V8QImode:
19060 for (i = 0; i < 8; i += 2)
19061 if (d->perm[i] + 1 != d->perm[i + 1])
19062 goto use_pblendvb;
19063
19064 for (i = 0; i < 4; ++i)
19065 mask |= (d->perm[i * 2] >= 8) << i;
19066 vmode = V4HImode;
19067 goto do_subreg;
19068
19069 case E_V4QImode:
19070 for (i = 0; i < 4; i += 2)
19071 if (d->perm[i] + 1 != d->perm[i + 1])
19072 goto use_pblendvb;
19073
19074 for (i = 0; i < 2; ++i)
19075 mask |= (d->perm[i * 2] >= 4) << i;
19076 vmode = V2HImode;
19077 goto do_subreg;
19078
19079 case E_V32QImode:
19080 /* See if bytes move in pairs. If not, vpblendvb must be used. */
19081 for (i = 0; i < 32; i += 2)
19082 if (d->perm[i] + 1 != d->perm[i + 1])
19083 goto use_pblendvb;
19084 /* See if bytes move in quadruplets. If yes, vpblendd
19085 with immediate can be used. */
19086 for (i = 0; i < 32; i += 4)
19087 if (d->perm[i] + 2 != d->perm[i + 2])
19088 break;
19089 if (i < 32)
19090 {
19091 /* See if bytes move the same in both lanes. If yes,
19092 vpblendw with immediate can be used. */
19093 for (i = 0; i < 16; i += 2)
19094 if (d->perm[i] + 16 != d->perm[i + 16])
19095 goto use_pblendvb;
19096
19097 /* Use vpblendw. */
19098 for (i = 0; i < 16; ++i)
19099 mask |= (d->perm[i * 2] >= 32) << i;
19100 vmode = V16HImode;
19101 goto do_subreg;
19102 }
19103
19104 /* Use vpblendd. */
19105 for (i = 0; i < 8; ++i)
19106 mask |= (d->perm[i * 4] >= 32) << i;
19107 vmode = V8SImode;
19108 goto do_subreg;
19109
19110 case E_V16HImode:
19111 /* See if words move in pairs. If yes, vpblendd can be used. */
19112 for (i = 0; i < 16; i += 2)
19113 if (d->perm[i] + 1 != d->perm[i + 1])
19114 break;
19115 if (i < 16)
19116 {
19117 /* See if words move the same in both lanes. If not,
19118 vpblendvb must be used. */
19119 for (i = 0; i < 8; i++)
19120 if (d->perm[i] + 8 != d->perm[i + 8])
19121 {
19122 /* Use vpblendvb. */
19123 for (i = 0; i < 32; ++i)
19124 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
19125
19126 vmode = V32QImode;
19127 nelt = 32;
19128 target = gen_reg_rtx (vmode);
19129 op0 = gen_lowpart (vmode, op0);
19130 op1 = gen_lowpart (vmode, op1);
19131 goto finish_pblendvb;
19132 }
19133
19134 /* Use vpblendw. */
19135 for (i = 0; i < 16; ++i)
19136 mask |= (d->perm[i] >= 16) << i;
19137 break;
19138 }
19139
19140 /* Use vpblendd. */
19141 for (i = 0; i < 8; ++i)
19142 mask |= (d->perm[i * 2] >= 16) << i;
19143 vmode = V8SImode;
19144 goto do_subreg;
19145
19146 case E_V4DImode:
19147 /* Use vpblendd. */
19148 for (i = 0; i < 4; ++i)
19149 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
19150 vmode = V8SImode;
19151 goto do_subreg;
19152
19153 default:
19154 gcc_unreachable ();
19155 }
19156
19157 switch (vmode)
19158 {
19159 case E_V8DFmode:
19160 case E_V8DImode:
19161 mmode = QImode;
19162 break;
19163 case E_V16SFmode:
19164 case E_V16SImode:
19165 mmode = HImode;
19166 break;
19167 case E_V32HImode:
19168 mmode = SImode;
19169 break;
19170 case E_V64QImode:
19171 mmode = DImode;
19172 break;
19173 default:
19174 mmode = VOIDmode;
19175 }
19176
19177 if (mmode != VOIDmode)
19178 maskop = force_reg (mmode, gen_int_mode (mask, mmode));
19179 else
19180 maskop = GEN_INT (mask);
19181
19182 /* This matches five different patterns with the different modes. */
19183 x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
19184 x = gen_rtx_SET (target, x);
19185 emit_insn (x);
19186 if (target != d->target)
19187 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
19188
19189 return true;
19190 }
19191
19192 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
19193 in terms of the variable form of vpermilps.
19194
19195 Note that we will have already failed the immediate input vpermilps,
19196 which requires that the high and low part shuffle be identical; the
19197 variable form doesn't require that. */
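/* For example, for V8SFmode and the one-operand permutation
   {3, 2, 1, 0, 6, 5, 4, 7} each 128-bit lane is shuffled independently,
   so the index vector {3, 2, 1, 0, 2, 1, 0, 3} is loaded and a single
   variable-control vpermilps is emitted.  */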
19198
19199 static bool
19200 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
19201 {
19202 rtx rperm[8], vperm;
19203 unsigned i;
19204
19205 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
19206 return false;
19207
19208 /* We can only permute within the 128-bit lane. */
19209 for (i = 0; i < 8; ++i)
19210 {
19211 unsigned e = d->perm[i];
19212 if (i < 4 ? e >= 4 : e < 4)
19213 return false;
19214 }
19215
19216 if (d->testing_p)
19217 return true;
19218
19219 for (i = 0; i < 8; ++i)
19220 {
19221 unsigned e = d->perm[i];
19222
19223 /* Within each 128-bit lane, the elements of op0 are numbered
19224 from 0 and the elements of op1 are numbered from 4. */
19225 if (e >= 8 + 4)
19226 e -= 8;
19227 else if (e >= 4)
19228 e -= 4;
19229
19230 rperm[i] = GEN_INT (e);
19231 }
19232
19233 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
19234 vperm = force_reg (V8SImode, vperm);
19235 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
19236
19237 return true;
19238 }
19239
19240 /* For V*[QHS]Imode permutations, check whether the same permutation
19241 can be performed in a 2x, 4x or 8x wider inner mode (and set *ND). */
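/* For example, the V16QImode permutation {2, 3, 0, 1, 6, 7, 4, 5, 10, 11,
   8, 9, 14, 15, 12, 13} moves bytes in aligned pairs, so it is rewritten
   as the V8HImode permutation {1, 0, 3, 2, 5, 4, 7, 6}; that one moves odd
   halfwords, so the recursion stops there.  */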
19242
19243 static bool
19244 canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
19245 struct expand_vec_perm_d *nd)
19246 {
19247 int i;
19248 machine_mode mode = VOIDmode;
19249
19250 switch (d->vmode)
19251 {
19252 case E_V8QImode: mode = V4HImode; break;
19253 case E_V16QImode: mode = V8HImode; break;
19254 case E_V32QImode: mode = V16HImode; break;
19255 case E_V64QImode: mode = V32HImode; break;
19256 case E_V4HImode: mode = V2SImode; break;
19257 case E_V8HImode: mode = V4SImode; break;
19258 case E_V16HImode: mode = V8SImode; break;
19259 case E_V32HImode: mode = V16SImode; break;
19260 case E_V4SImode: mode = V2DImode; break;
19261 case E_V8SImode: mode = V4DImode; break;
19262 case E_V16SImode: mode = V8DImode; break;
19263 default: return false;
19264 }
19265 for (i = 0; i < d->nelt; i += 2)
19266 if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
19267 return false;
19268 nd->vmode = mode;
19269 nd->nelt = d->nelt / 2;
19270 for (i = 0; i < nd->nelt; i++)
19271 nd->perm[i] = d->perm[2 * i] / 2;
19272 if (GET_MODE_INNER (mode) != DImode)
19273 canonicalize_vector_int_perm (nd, nd);
19274 if (nd != d)
19275 {
19276 nd->one_operand_p = d->one_operand_p;
19277 nd->testing_p = d->testing_p;
19278 if (d->op0 == d->op1)
19279 nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
19280 else
19281 {
19282 nd->op0 = gen_lowpart (nd->vmode, d->op0);
19283 nd->op1 = gen_lowpart (nd->vmode, d->op1);
19284 }
19285 if (d->testing_p)
19286 nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
19287 else
19288 nd->target = gen_reg_rtx (nd->vmode);
19289 }
19290 return true;
19291 }
19292
19293 /* Return true if permutation D can be performed as VMODE permutation
19294 instead. */
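/* For example, a V16QImode permutation in which every aligned group of
   four bytes moves together, such as {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14,
   15, 8, 9, 10, 11}, is also valid as the V4SImode permutation
   {1, 0, 3, 2}.  */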
19295
19296 static bool
19297 valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
19298 {
19299 unsigned int i, j, chunk;
19300
19301 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
19302 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
19303 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
19304 return false;
19305
19306 if (GET_MODE_NUNITS (vmode) >= d->nelt)
19307 return true;
19308
19309 chunk = d->nelt / GET_MODE_NUNITS (vmode);
19310 for (i = 0; i < d->nelt; i += chunk)
19311 if (d->perm[i] & (chunk - 1))
19312 return false;
19313 else
19314 for (j = 1; j < chunk; ++j)
19315 if (d->perm[i] + j != d->perm[i + j])
19316 return false;
19317
19318 return true;
19319 }
19320
19321 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
19322 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
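/* When the pshufb path is taken, the control vector is built per byte:
   e.g. for a one-operand V8HImode permutation, element index e expands
   into the two byte indices 2*e and 2*e + 1.  */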
19323
19324 static bool
19325 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
19326 {
19327 unsigned i, nelt, eltsz, mask;
19328 unsigned char perm[64];
19329 machine_mode vmode;
19330 struct expand_vec_perm_d nd;
19331 rtx rperm[64], vperm, target, op0, op1;
19332
19333 nelt = d->nelt;
19334
19335 if (!d->one_operand_p)
19336 switch (GET_MODE_SIZE (d->vmode))
19337 {
19338 case 4:
19339 if (!TARGET_XOP)
19340 return false;
19341 vmode = V4QImode;
19342 break;
19343
19344 case 8:
19345 if (!TARGET_XOP)
19346 return false;
19347 vmode = V8QImode;
19348 break;
19349
19350 case 16:
19351 if (!TARGET_XOP)
19352 return false;
19353 vmode = V16QImode;
19354 break;
19355
19356 case 32:
19357 if (!TARGET_AVX2)
19358 return false;
19359
19360 if (valid_perm_using_mode_p (V2TImode, d))
19361 {
19362 if (d->testing_p)
19363 return true;
19364
19365 /* Use vperm2i128 insn. The pattern uses
19366 V4DImode instead of V2TImode. */
19367 target = d->target;
19368 if (d->vmode != V4DImode)
19369 target = gen_reg_rtx (V4DImode);
19370 op0 = gen_lowpart (V4DImode, d->op0);
19371 op1 = gen_lowpart (V4DImode, d->op1);
19372 rperm[0]
19373 = GEN_INT ((d->perm[0] / (nelt / 2))
19374 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
19375 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
19376 if (target != d->target)
19377 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
19378 return true;
19379 }
19380 /* FALLTHRU */
19381
19382 default:
19383 return false;
19384 }
19385 else
19386 switch (GET_MODE_SIZE (d->vmode))
19387 {
19388 case 4:
19389 if (!TARGET_SSSE3)
19390 return false;
19391 vmode = V4QImode;
19392 break;
19393
19394 case 8:
19395 if (!TARGET_SSSE3)
19396 return false;
19397 vmode = V8QImode;
19398 break;
19399
19400 case 16:
19401 if (!TARGET_SSSE3)
19402 return false;
19403 vmode = V16QImode;
19404 break;
19405
19406 case 32:
19407 if (!TARGET_AVX2)
19408 return false;
19409
19410 /* V4DImode should already be handled through
19411 expand_vselect by the vpermq instruction. */
19412 gcc_assert (d->vmode != V4DImode);
19413
19414 vmode = V32QImode;
19415 if (d->vmode == V8SImode
19416 || d->vmode == V16HImode
19417 || d->vmode == V32QImode)
19418 {
19419 /* First see if vpermq can be used for
19420 V8SImode/V16HImode/V32QImode. */
19421 if (valid_perm_using_mode_p (V4DImode, d))
19422 {
19423 for (i = 0; i < 4; i++)
19424 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
19425 if (d->testing_p)
19426 return true;
19427 target = gen_reg_rtx (V4DImode);
19428 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
19429 perm, 4, false))
19430 {
19431 emit_move_insn (d->target,
19432 gen_lowpart (d->vmode, target));
19433 return true;
19434 }
19435 return false;
19436 }
19437
19438 /* Next see if vpermd can be used. */
19439 if (valid_perm_using_mode_p (V8SImode, d))
19440 vmode = V8SImode;
19441 }
19442 /* Or if vpermps can be used. */
19443 else if (d->vmode == V8SFmode)
19444 vmode = V8SImode;
19445
19446 if (vmode == V32QImode)
19447 {
19448 /* vpshufb only works intra-lane; it is not
19449 possible to shuffle bytes between the lanes. */
19450 for (i = 0; i < nelt; ++i)
19451 if ((d->perm[i] ^ i) & (nelt / 2))
19452 return false;
19453 }
19454 break;
19455
19456 case 64:
19457 if (!TARGET_AVX512BW)
19458 return false;
19459
19460 /* If vpermq didn't work, vpshufb won't work either. */
19461 if (d->vmode == V8DFmode || d->vmode == V8DImode)
19462 return false;
19463
19464 vmode = V64QImode;
19465 if (d->vmode == V16SImode
19466 || d->vmode == V32HImode
19467 || d->vmode == V64QImode)
19468 {
19469 /* First see if vpermq can be used for
19470 V16SImode/V32HImode/V64QImode. */
19471 if (valid_perm_using_mode_p (V8DImode, d))
19472 {
19473 for (i = 0; i < 8; i++)
19474 perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
19475 if (d->testing_p)
19476 return true;
19477 target = gen_reg_rtx (V8DImode);
19478 if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
19479 perm, 8, false))
19480 {
19481 emit_move_insn (d->target,
19482 gen_lowpart (d->vmode, target));
19483 return true;
19484 }
19485 return false;
19486 }
19487
19488 /* Next see if vpermd can be used. */
19489 if (valid_perm_using_mode_p (V16SImode, d))
19490 vmode = V16SImode;
19491 }
19492 /* Or if vpermps can be used. */
19493 else if (d->vmode == V16SFmode)
19494 vmode = V16SImode;
19495
19496 if (vmode == V64QImode)
19497 {
19498 /* vpshufb only works intra-lane; it is not
19499 possible to shuffle bytes between the lanes. */
19500 for (i = 0; i < nelt; ++i)
19501 if ((d->perm[i] ^ i) & (3 * nelt / 4))
19502 return false;
19503 }
19504 break;
19505
19506 default:
19507 return false;
19508 }
19509
19510 if (d->testing_p)
19511 return true;
19512
19513 /* Try to avoid a variable permutation instruction. */
19514 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
19515 {
19516 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
19517 return true;
19518 }
19519
19520 if (vmode == V8SImode)
19521 for (i = 0; i < 8; ++i)
19522 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
19523 else if (vmode == V16SImode)
19524 for (i = 0; i < 16; ++i)
19525 rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
19526 else
19527 {
19528 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
19529 if (!d->one_operand_p)
19530 mask = 2 * nelt - 1;
19531 else if (vmode == V64QImode)
19532 mask = nelt / 4 - 1;
19533 else if (vmode == V32QImode)
19534 mask = nelt / 2 - 1;
19535 else
19536 mask = nelt - 1;
19537
19538 for (i = 0; i < nelt; ++i)
19539 {
19540 unsigned j, e = d->perm[i] & mask;
19541 for (j = 0; j < eltsz; ++j)
19542 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
19543 }
19544 }
19545
19546 machine_mode vpmode = vmode;
19547
19548 nelt = GET_MODE_SIZE (vmode);
19549
19550 /* Emulate narrow modes with V16QI instructions. */
19551 if (nelt < 16)
19552 {
19553 rtx m128 = GEN_INT (-128);
19554
19555 /* Remap elements from the second operand, as we have to
19556 account for inactive top elements from the first operand. */
19557 if (!d->one_operand_p)
19558 {
19559 for (i = 0; i < nelt; ++i)
19560 {
19561 unsigned ival = UINTVAL (rperm[i]);
19562 if (ival >= nelt)
19563 rperm[i] = GEN_INT (ival + 16 - nelt);
19564 }
19565 }
19566
19567 /* Fill inactive elements in the top positions with zeros. */
19568 for (i = nelt; i < 16; ++i)
19569 rperm[i] = m128;
19570
19571 vpmode = V16QImode;
19572 }
19573
19574 vperm = gen_rtx_CONST_VECTOR (vpmode,
19575 gen_rtvec_v (GET_MODE_NUNITS (vpmode), rperm));
19576 vperm = force_reg (vpmode, vperm);
19577
19578 if (vmode == d->vmode)
19579 target = d->target;
19580 else
19581 target = gen_reg_rtx (vmode);
19582
19583 op0 = gen_lowpart (vmode, d->op0);
19584
19585 if (d->one_operand_p)
19586 {
19587 rtx (*gen) (rtx, rtx, rtx);
19588
19589 if (vmode == V4QImode)
19590 gen = gen_mmx_pshufbv4qi3;
19591 else if (vmode == V8QImode)
19592 gen = gen_mmx_pshufbv8qi3;
19593 else if (vmode == V16QImode)
19594 gen = gen_ssse3_pshufbv16qi3;
19595 else if (vmode == V32QImode)
19596 gen = gen_avx2_pshufbv32qi3;
19597 else if (vmode == V64QImode)
19598 gen = gen_avx512bw_pshufbv64qi3;
19599 else if (vmode == V8SFmode)
19600 gen = gen_avx2_permvarv8sf;
19601 else if (vmode == V8SImode)
19602 gen = gen_avx2_permvarv8si;
19603 else if (vmode == V16SFmode)
19604 gen = gen_avx512f_permvarv16sf;
19605 else if (vmode == V16SImode)
19606 gen = gen_avx512f_permvarv16si;
19607 else
19608 gcc_unreachable ();
19609
19610 emit_insn (gen (target, op0, vperm));
19611 }
19612 else
19613 {
19614 rtx (*gen) (rtx, rtx, rtx, rtx);
19615
19616 op1 = gen_lowpart (vmode, d->op1);
19617
19618 if (vmode == V4QImode)
19619 gen = gen_mmx_ppermv32;
19620 else if (vmode == V8QImode)
19621 gen = gen_mmx_ppermv64;
19622 else if (vmode == V16QImode)
19623 gen = gen_xop_pperm;
19624 else
19625 gcc_unreachable ();
19626
19627 emit_insn (gen (target, op0, op1, vperm));
19628 }
19629
19630 if (target != d->target)
19631 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
19632
19633 return true;
19634 }
19635
19636 /* Try to expand one-operand permutation with constant mask. */
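/* For example, a constant V16SImode permutation with d->op0 == d->op1 is
   emitted as a single vpermd: the permutation indices are loaded into a
   V16SImode register and used as the variable selector.  */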
19637
19638 static bool
19639 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
19640 {
19641 machine_mode mode = GET_MODE (d->op0);
19642 machine_mode maskmode = mode;
19643 unsigned inner_size = GET_MODE_SIZE (GET_MODE_INNER (mode));
19644 rtx (*gen) (rtx, rtx, rtx) = NULL;
19645 rtx target, op0, mask;
19646 rtx vec[64];
19647
19648 if (!rtx_equal_p (d->op0, d->op1))
19649 return false;
19650
19651 if (!TARGET_AVX512F)
19652 return false;
19653
19654 /* Accept VNxHImode and VNxQImode now. */
19655 if (!TARGET_AVX512VL && GET_MODE_SIZE (mode) < 64)
19656 return false;
19657
19658 /* vpermw. */
19659 if (!TARGET_AVX512BW && inner_size == 2)
19660 return false;
19661
19662 /* vpermb. */
19663 if (!TARGET_AVX512VBMI && inner_size == 1)
19664 return false;
19665
19666 switch (mode)
19667 {
19668 case E_V16SImode:
19669 gen = gen_avx512f_permvarv16si;
19670 break;
19671 case E_V16SFmode:
19672 gen = gen_avx512f_permvarv16sf;
19673 maskmode = V16SImode;
19674 break;
19675 case E_V8DImode:
19676 gen = gen_avx512f_permvarv8di;
19677 break;
19678 case E_V8DFmode:
19679 gen = gen_avx512f_permvarv8df;
19680 maskmode = V8DImode;
19681 break;
19682 case E_V32HImode:
19683 gen = gen_avx512bw_permvarv32hi;
19684 break;
19685 case E_V16HImode:
19686 gen = gen_avx512vl_permvarv16hi;
19687 break;
19688 case E_V8HImode:
19689 gen = gen_avx512vl_permvarv8hi;
19690 break;
19691 case E_V64QImode:
19692 gen = gen_avx512bw_permvarv64qi;
19693 break;
19694 case E_V32QImode:
19695 gen = gen_avx512vl_permvarv32qi;
19696 break;
19697 case E_V16QImode:
19698 gen = gen_avx512vl_permvarv16qi;
19699 break;
19700
19701 default:
19702 return false;
19703 }
19704
19705 if (d->testing_p)
19706 return true;
19707
19708 target = d->target;
19709 op0 = d->op0;
19710 for (int i = 0; i < d->nelt; ++i)
19711 vec[i] = GEN_INT (d->perm[i]);
19712 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
19713 emit_insn (gen (target, op0, force_reg (maskmode, mask)));
19714 return true;
19715 }
19716
19717 static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
19718
19719 /* A subroutine of ix86_expand_vec_perm_const_1. Try to instantiate D
19720 in a single instruction. */
19721
19722 static bool
19723 expand_vec_perm_1 (struct expand_vec_perm_d *d)
19724 {
19725 unsigned i, nelt = d->nelt;
19726 struct expand_vec_perm_d nd;
19727
19728 /* Check plain VEC_SELECT first, because AVX has instructions that could
19729 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
19730 input where SEL+CONCAT may not. */
19731 if (d->one_operand_p)
19732 {
19733 int mask = nelt - 1;
19734 bool identity_perm = true;
19735 bool broadcast_perm = true;
19736
19737 for (i = 0; i < nelt; i++)
19738 {
19739 nd.perm[i] = d->perm[i] & mask;
19740 if (nd.perm[i] != i)
19741 identity_perm = false;
19742 if (nd.perm[i])
19743 broadcast_perm = false;
19744 }
19745
19746 if (identity_perm)
19747 {
19748 if (!d->testing_p)
19749 emit_move_insn (d->target, d->op0);
19750 return true;
19751 }
19752 else if (broadcast_perm && TARGET_AVX2)
19753 {
19754 /* Use vpbroadcast{b,w,d}. */
19755 rtx (*gen) (rtx, rtx) = NULL;
19756 switch (d->vmode)
19757 {
19758 case E_V64QImode:
19759 if (TARGET_AVX512BW)
19760 gen = gen_avx512bw_vec_dupv64qi_1;
19761 break;
19762 case E_V32QImode:
19763 gen = gen_avx2_pbroadcastv32qi_1;
19764 break;
19765 case E_V32HImode:
19766 if (TARGET_AVX512BW)
19767 gen = gen_avx512bw_vec_dupv32hi_1;
19768 break;
19769 case E_V16HImode:
19770 gen = gen_avx2_pbroadcastv16hi_1;
19771 break;
19772 case E_V16SImode:
19773 if (TARGET_AVX512F)
19774 gen = gen_avx512f_vec_dupv16si_1;
19775 break;
19776 case E_V8SImode:
19777 gen = gen_avx2_pbroadcastv8si_1;
19778 break;
19779 case E_V16QImode:
19780 gen = gen_avx2_pbroadcastv16qi;
19781 break;
19782 case E_V8HImode:
19783 gen = gen_avx2_pbroadcastv8hi;
19784 break;
19785 case E_V16SFmode:
19786 if (TARGET_AVX512F)
19787 gen = gen_avx512f_vec_dupv16sf_1;
19788 break;
19789 case E_V8SFmode:
19790 gen = gen_avx2_vec_dupv8sf_1;
19791 break;
19792 case E_V8DFmode:
19793 if (TARGET_AVX512F)
19794 gen = gen_avx512f_vec_dupv8df_1;
19795 break;
19796 case E_V8DImode:
19797 if (TARGET_AVX512F)
19798 gen = gen_avx512f_vec_dupv8di_1;
19799 break;
19800 /* For other modes prefer other shuffles this function creates. */
19801 default: break;
19802 }
19803 if (gen != NULL)
19804 {
19805 if (!d->testing_p)
19806 emit_insn (gen (d->target, d->op0));
19807 return true;
19808 }
19809 }
19810
19811 if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
19812 return true;
19813
19814 /* There are plenty of patterns in sse.md that are written for
19815 SEL+CONCAT and are not replicated for a single op. Perhaps
19816 that should be changed, to avoid the nastiness here. */
19817
19818 /* Recognize interleave style patterns, which means incrementing
19819 every other permutation operand. */
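/* For example, the one-operand V4SImode permutation {0, 0, 1, 1} is turned
   into the two-operand form {0, 4, 1, 5} on (op0, op0), which matches the
   punpckldq pattern.  */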
19820 for (i = 0; i < nelt; i += 2)
19821 {
19822 nd.perm[i] = d->perm[i] & mask;
19823 nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
19824 }
19825 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
19826 d->testing_p))
19827 return true;
19828
19829 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
19830 if (nelt >= 4)
19831 {
19832 for (i = 0; i < nelt; i += 4)
19833 {
19834 nd.perm[i + 0] = d->perm[i + 0] & mask;
19835 nd.perm[i + 1] = d->perm[i + 1] & mask;
19836 nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
19837 nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
19838 }
19839
19840 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
19841 d->testing_p))
19842 return true;
19843 }
19844 }
19845
19846 /* Try movss/movsd instructions. */
19847 if (expand_vec_perm_movs (d))
19848 return true;
19849
19850 /* Finally, try the fully general two operand permute. */
19851 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
19852 d->testing_p))
19853 return true;
19854
19855 /* Recognize interleave style patterns with reversed operands. */
19856 if (!d->one_operand_p)
19857 {
19858 for (i = 0; i < nelt; ++i)
19859 {
19860 unsigned e = d->perm[i];
19861 if (e >= nelt)
19862 e -= nelt;
19863 else
19864 e += nelt;
19865 nd.perm[i] = e;
19866 }
19867
19868 if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
19869 d->testing_p))
19870 return true;
19871 }
19872
19873 /* Try the SSE4.1 blend variable merge instructions. */
19874 if (expand_vec_perm_blend (d))
19875 return true;
19876
19877 /* Try one of the AVX vpermil variable permutations. */
19878 if (expand_vec_perm_vpermil (d))
19879 return true;
19880
19881 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
19882 vpshufb, vpermd, vpermps or vpermq variable permutation. */
19883 if (expand_vec_perm_pshufb (d))
19884 return true;
19885
19886 /* Try the AVX2 vpalignr instruction. */
19887 if (expand_vec_perm_palignr (d, true))
19888 return true;
19889
19890 /* Try the AVX512F vperm{w,b,s,d} instructions. */
19891 if (ix86_expand_vec_one_operand_perm_avx512 (d))
19892 return true;
19893
19894 /* Try the AVX512F vpermt2/vpermi2 instructions. */
19895 if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
19896 return true;
19897
19898 /* See if we can get the same permutation in different vector integer
19899 mode. */
19900 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
19901 {
19902 if (!d->testing_p)
19903 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
19904 return true;
19905 }
19906 return false;
19907 }
19908
19909 /* Canonicalize the vec_perm index so that the first index
19910 always comes from the first vector. */
19911 static void
19912 ix86_vec_perm_index_canon (struct expand_vec_perm_d *d)
19913 {
19914 unsigned nelt = d->nelt;
19915 if (d->perm[0] < nelt)
19916 return;
19917
19918 for (unsigned i = 0; i != nelt; i++)
19919 d->perm[i] = (d->perm[i] + nelt) % (2 * nelt);
19920
19921 std::swap (d->op0, d->op1);
19922 return;
19923 }
19924
19925 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
19926 in terms of a pair of shufps + shufps/pshufd instructions. */
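/* For example, the V4SImode permutation {2, 4, 0, 6} is expanded as one
   shufps building tmp = {op0[2], op0[0], op1[0], op1[2]} followed by a
   pshufd with {0, 2, 1, 3} to put the elements into their final order.  */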
19927 static bool
19928 expand_vec_perm_shufps_shufps (struct expand_vec_perm_d *d)
19929 {
19930 unsigned char perm1[4];
19931 machine_mode vmode = d->vmode;
19932 bool ok;
19933 unsigned i, j, k, count = 0;
19934
19935 if (d->one_operand_p
19936 || (vmode != V4SImode && vmode != V4SFmode))
19937 return false;
19938
19939 if (d->testing_p)
19940 return true;
19941
19942 ix86_vec_perm_index_canon (d);
19943 for (i = 0; i < 4; ++i)
19944 count += d->perm[i] > 3 ? 1 : 0;
19945
19946 gcc_assert (count & 3);
19947
19948 rtx tmp = gen_reg_rtx (vmode);
19949 /* 2 from op0 and 2 from op1. */
19950 if (count == 2)
19951 {
19952 unsigned char perm2[4];
19953 for (i = 0, j = 0, k = 2; i < 4; ++i)
19954 if (d->perm[i] & 4)
19955 {
19956 perm1[k++] = d->perm[i];
19957 perm2[i] = k - 1;
19958 }
19959 else
19960 {
19961 perm1[j++] = d->perm[i];
19962 perm2[i] = j - 1;
19963 }
19964
19965 /* shufps. */
19966 ok = expand_vselect_vconcat (tmp, d->op0, d->op1,
19967 perm1, d->nelt, false);
19968 gcc_assert (ok);
19969 if (vmode == V4SImode && TARGET_SSE2)
19970 /* pshufd. */
19971 ok = expand_vselect (d->target, tmp,
19972 perm2, d->nelt, false);
19973 else
19974 {
19975 /* shufps. */
19976 perm2[2] += 4;
19977 perm2[3] += 4;
19978 ok = expand_vselect_vconcat (d->target, tmp, tmp,
19979 perm2, d->nelt, false);
19980 }
19981 gcc_assert (ok);
19982 }
19983 /* 3 from one op and 1 from another. */
19984 else
19985 {
19986 unsigned pair_idx = 8, lone_idx = 8, shift;
19987
19988 /* Find the lone index. */
19989 for (i = 0; i < 4; ++i)
19990 if ((d->perm[i] > 3 && count == 1)
19991 || (d->perm[i] < 4 && count == 3))
19992 lone_idx = i;
19993
19994 /* When lone_idx is not 0, it must come from the second op (count == 1). */
19995 gcc_assert (count == (lone_idx ? 1 : 3));
19996
19997 /* Find the pair index that sits in the same half as the lone index. */
19998 shift = lone_idx & 2;
19999 pair_idx = 1 - lone_idx + 2 * shift;
20000
20001 /* First permute the lone index and the pair index into the same vector as
20002 [ lone, lone, pair, pair ]. */
20003 perm1[1] = perm1[0]
20004 = (count == 3) ? d->perm[lone_idx] : d->perm[lone_idx] - 4;
20005 perm1[3] = perm1[2]
20006 = (count == 3) ? d->perm[pair_idx] : d->perm[pair_idx] + 4;
20007
20008 /* Always put the vector that contains the lone index first. */
20009 if (count == 1)
20010 std::swap (d->op0, d->op1);
20011
20012 /* shufps. */
20013 ok = expand_vselect_vconcat (tmp, d->op0, d->op1,
20014 perm1, d->nelt, false);
20015 gcc_assert (ok);
20016
20017 /* Put the lone and pair indices back in their original order. */
20018 perm1[shift] = lone_idx << 1;
20019 perm1[shift + 1] = pair_idx << 1;
20020
20021 /* Select the remaining 2 elements in another vector. */
20022 for (i = 2 - shift; i < 4 - shift; ++i)
20023 perm1[i] = lone_idx == 1 ? d->perm[i] + 4 : d->perm[i];
20024
20025 /* Adjust to original selector. */
20026 if (lone_idx > 1)
20027 std::swap (tmp, d->op1);
20028
20029 /* shufps. */
20030 ok = expand_vselect_vconcat (d->target, tmp, d->op1,
20031 perm1, d->nelt, false);
20032
20033 gcc_assert (ok);
20034 }
20035
20036 return true;
20037 }
20038
20039 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
20040 in terms of a pair of pshuflw + pshufhw instructions. */
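/* For example, the one-operand V8HImode permutation {3, 2, 1, 0, 5, 4, 7, 6}
   is expanded as pshuflw with {3, 2, 1, 0, 4, 5, 6, 7} followed by pshufhw
   with {0, 1, 2, 3, 5, 4, 7, 6}.  */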
20041
20042 static bool
20043 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
20044 {
20045 unsigned char perm2[MAX_VECT_LEN];
20046 unsigned i;
20047 bool ok;
20048
20049 if (d->vmode != V8HImode || !d->one_operand_p)
20050 return false;
20051
20052 /* The two permutations only operate in 64-bit lanes. */
20053 for (i = 0; i < 4; ++i)
20054 if (d->perm[i] >= 4)
20055 return false;
20056 for (i = 4; i < 8; ++i)
20057 if (d->perm[i] < 4)
20058 return false;
20059
20060 if (d->testing_p)
20061 return true;
20062
20063 /* Emit the pshuflw. */
20064 memcpy (perm2, d->perm, 4);
20065 for (i = 4; i < 8; ++i)
20066 perm2[i] = i;
20067 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
20068 gcc_assert (ok);
20069
20070 /* Emit the pshufhw. */
20071 memcpy (perm2 + 4, d->perm + 4, 4);
20072 for (i = 0; i < 4; ++i)
20073 perm2[i] = i;
20074 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
20075 gcc_assert (ok);
20076
20077 return true;
20078 }
20079
20080 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
20081 the permutation using the SSSE3 palignr instruction. This succeeds
20082 when all of the elements in PERM fit within one vector and we merely
20083 need to shift them down so that a single vector permutation has a
20084 chance to succeed. If SINGLE_INSN_ONLY_P, succeed if only
20085 the vpalignr instruction itself can perform the requested permutation. */
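/* For example, the two-operand V16QImode permutation consisting of the
   sixteen consecutive indices 3, 4, ..., 18 has min = 3, so a single
   palignr by 3 bytes on the concatenated operands already produces the
   desired order and no further shuffle is needed.  */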
20086
20087 static bool
20088 expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
20089 {
20090 unsigned i, nelt = d->nelt;
20091 unsigned min, max, minswap, maxswap;
20092 bool in_order, ok, swap = false;
20093 rtx shift, target;
20094 struct expand_vec_perm_d dcopy;
20095
20096 /* Even with AVX, palignr only operates on 128-bit vectors;
20097 with AVX2, palignr operates within each of the two 128-bit lanes. */
20098 if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
20099 && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
20100 return false;
20101
20102 min = 2 * nelt;
20103 max = 0;
20104 minswap = 2 * nelt;
20105 maxswap = 0;
20106 for (i = 0; i < nelt; ++i)
20107 {
20108 unsigned e = d->perm[i];
20109 unsigned eswap = d->perm[i] ^ nelt;
20110 if (GET_MODE_SIZE (d->vmode) == 32)
20111 {
20112 e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
20113 eswap = e ^ (nelt / 2);
20114 }
20115 if (e < min)
20116 min = e;
20117 if (e > max)
20118 max = e;
20119 if (eswap < minswap)
20120 minswap = eswap;
20121 if (eswap > maxswap)
20122 maxswap = eswap;
20123 }
20124 if (min == 0
20125 || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
20126 {
20127 if (d->one_operand_p
20128 || minswap == 0
20129 || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
20130 ? nelt / 2 : nelt))
20131 return false;
20132 swap = true;
20133 min = minswap;
20134 max = maxswap;
20135 }
20136
20137 /* Given that we have SSSE3, we know we'll be able to implement the
20138 single operand permutation after the palignr with pshufb for
20139 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
20140 first. */
20141 if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
20142 return true;
20143
20144 dcopy = *d;
20145 if (swap)
20146 {
20147 dcopy.op0 = d->op1;
20148 dcopy.op1 = d->op0;
20149 for (i = 0; i < nelt; ++i)
20150 dcopy.perm[i] ^= nelt;
20151 }
20152
20153 in_order = true;
20154 for (i = 0; i < nelt; ++i)
20155 {
20156 unsigned e = dcopy.perm[i];
20157 if (GET_MODE_SIZE (d->vmode) == 32
20158 && e >= nelt
20159 && (e & (nelt / 2 - 1)) < min)
20160 e = e - min - (nelt / 2);
20161 else
20162 e = e - min;
20163 if (e != i)
20164 in_order = false;
20165 dcopy.perm[i] = e;
20166 }
20167 dcopy.one_operand_p = true;
20168
20169 if (single_insn_only_p && !in_order)
20170 return false;
20171
20172 /* For AVX2, test whether we can permute the result in one instruction. */
20173 if (d->testing_p)
20174 {
20175 if (in_order)
20176 return true;
20177 dcopy.op1 = dcopy.op0;
20178 return expand_vec_perm_1 (&dcopy);
20179 }
20180
20181 shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
20182 if (GET_MODE_SIZE (d->vmode) == 16)
20183 {
20184 target = gen_reg_rtx (V1TImode);
20185 emit_insn (gen_ssse3_palignrv1ti (target,
20186 gen_lowpart (V1TImode, dcopy.op1),
20187 gen_lowpart (V1TImode, dcopy.op0),
20188 shift));
20189 }
20190 else
20191 {
20192 target = gen_reg_rtx (V2TImode);
20193 emit_insn (gen_avx2_palignrv2ti (target,
20194 gen_lowpart (V2TImode, dcopy.op1),
20195 gen_lowpart (V2TImode, dcopy.op0),
20196 shift));
20197 }
20198
20199 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
20200
20201 /* Test for the degenerate case where the alignment by itself
20202 produces the desired permutation. */
20203 if (in_order)
20204 {
20205 emit_move_insn (d->target, dcopy.op0);
20206 return true;
20207 }
20208
20209 ok = expand_vec_perm_1 (&dcopy);
20210 gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
20211
20212 return ok;
20213 }
20214
20215 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
20216 the permutation using the SSE4_1 pblendv instruction.  This can reduce
20217 the permutation from 2 pshufb plus an or to 1 pshufb plus a pblendv. */
20218
20219 static bool
20220 expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
20221 {
20222 unsigned i, which, nelt = d->nelt;
20223 struct expand_vec_perm_d dcopy, dcopy1;
20224 machine_mode vmode = d->vmode;
20225 bool ok;
20226
20227 /* Use the same checks as in expand_vec_perm_blend. */
20228 if (d->one_operand_p)
20229 return false;
20230 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
20231 ;
20232 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
20233 ;
20234 else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 4
20235 || GET_MODE_SIZE (vmode) == 8
20236 || GET_MODE_SIZE (vmode) == 16))
20237 ;
20238 else
20239 return false;
20240
20241 /* Figure out which permutation elements do not stay in their
20242 respective lanes. */
20243 for (i = 0, which = 0; i < nelt; ++i)
20244 {
20245 unsigned e = d->perm[i];
20246 if (e != i)
20247 which |= (e < nelt ? 1 : 2);
20248 }
20249 /* We can pblend the elements that do not stay in their respective
20250 lanes only when those elements all come from the same operand, i.e.
20251 are all in one half of the permutation index space.  E.g.
20252 {0 1 8 3 4 5 9 7} is ok, as 8 and 9 are not at their respective
20253 lanes but both are >= 8;
20254 {0 1 8 3 4 5 2 7} is not ok, as 2 and 8 are not at their
20255 respective lanes and 8 >= 8 but 2 is not. */
20256 if (which != 1 && which != 2)
20257 return false;
20258 if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
20259 return true;
20260
20261 /* First we apply a one-operand permutation to the elements that do not
20262 stay in their respective lanes. */
20263 dcopy = *d;
20264 if (which == 2)
20265 dcopy.op0 = dcopy.op1 = d->op1;
20266 else
20267 dcopy.op0 = dcopy.op1 = d->op0;
20268 if (!d->testing_p)
20269 dcopy.target = gen_reg_rtx (vmode);
20270 dcopy.one_operand_p = true;
20271
20272 for (i = 0; i < nelt; ++i)
20273 dcopy.perm[i] = d->perm[i] & (nelt - 1);
20274
20275 ok = expand_vec_perm_1 (&dcopy);
20276 if (GET_MODE_SIZE (vmode) != 16 && !ok)
20277 return false;
20278 else
20279 gcc_assert (ok);
20280 if (d->testing_p)
20281 return true;
20282
20283 /* Next we put permuted elements into their positions. */
20284 dcopy1 = *d;
20285 if (which == 2)
20286 dcopy1.op1 = dcopy.target;
20287 else
20288 dcopy1.op0 = dcopy.target;
20289
20290 for (i = 0; i < nelt; ++i)
20291 dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
20292
20293 ok = expand_vec_perm_blend (&dcopy1);
20294 gcc_assert (ok);
20295
20296 return true;
20297 }
20298
20299 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
20300
20301 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
20302 a two vector permutation into a single vector permutation by using
20303 an interleave operation to merge the vectors. */
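/* For example, the V4SImode permutation {1, 5, 0, 4} draws only on the low
   halves of both operands, so a punpckldq first produces {0, 4, 1, 5} and
   the remaining shuffle {2, 3, 0, 1} of that result is a single pshufd.  */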
20304
20305 static bool
20306 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
20307 {
20308 struct expand_vec_perm_d dremap, dfinal;
20309 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
20310 unsigned HOST_WIDE_INT contents;
20311 unsigned char remap[2 * MAX_VECT_LEN];
20312 rtx_insn *seq;
20313 bool ok, same_halves = false;
20314
20315 if (GET_MODE_SIZE (d->vmode) == 4
20316 || GET_MODE_SIZE (d->vmode) == 8
20317 || GET_MODE_SIZE (d->vmode) == 16)
20318 {
20319 if (d->one_operand_p)
20320 return false;
20321 }
20322 else if (GET_MODE_SIZE (d->vmode) == 32)
20323 {
20324 if (!TARGET_AVX)
20325 return false;
20326 /* For 32-byte modes allow even d->one_operand_p.
20327 The lack of cross-lane shuffling in some instructions
20328 might prevent a single insn shuffle. */
20329 dfinal = *d;
20330 dfinal.testing_p = true;
20331 /* If expand_vec_perm_interleave3 can expand this into
20332 a 3 insn sequence, give up and let it be expanded as
20333 a 3 insn sequence.  While that is one insn longer,
20334 it doesn't need a memory operand, and in the common
20335 case where both interleave low and high permutations
20336 with the same operands are adjacent it needs only 4
20337 insns for both after CSE. */
20338 if (expand_vec_perm_interleave3 (&dfinal))
20339 return false;
20340 }
20341 else
20342 return false;
20343
20344 /* Examine from whence the elements come. */
20345 contents = 0;
20346 for (i = 0; i < nelt; ++i)
20347 contents |= HOST_WIDE_INT_1U << d->perm[i];
20348
20349 memset (remap, 0xff, sizeof (remap));
20350 dremap = *d;
20351
20352 if (GET_MODE_SIZE (d->vmode) == 4
20353 || GET_MODE_SIZE (d->vmode) == 8)
20354 {
20355 unsigned HOST_WIDE_INT h1, h2, h3, h4;
20356
20357 /* Split the two input vectors into 4 halves. */
20358 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
20359 h2 = h1 << nelt2;
20360 h3 = h2 << nelt2;
20361 h4 = h3 << nelt2;
20362
20363 /* If all the elements come from the low halves, use interleave low;
20364 similarly for interleave high. */
20365 if ((contents & (h1 | h3)) == contents)
20366 {
20367 /* punpckl* */
20368 for (i = 0; i < nelt2; ++i)
20369 {
20370 remap[i] = i * 2;
20371 remap[i + nelt] = i * 2 + 1;
20372 dremap.perm[i * 2] = i;
20373 dremap.perm[i * 2 + 1] = i + nelt;
20374 }
20375 }
20376 else if ((contents & (h2 | h4)) == contents)
20377 {
20378 /* punpckh* */
20379 for (i = 0; i < nelt2; ++i)
20380 {
20381 remap[i + nelt2] = i * 2;
20382 remap[i + nelt + nelt2] = i * 2 + 1;
20383 dremap.perm[i * 2] = i + nelt2;
20384 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
20385 }
20386 }
20387 else
20388 return false;
20389 }
20390 else if (GET_MODE_SIZE (d->vmode) == 16)
20391 {
20392 unsigned HOST_WIDE_INT h1, h2, h3, h4;
20393
20394 /* Split the two input vectors into 4 halves. */
20395 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
20396 h2 = h1 << nelt2;
20397 h3 = h2 << nelt2;
20398 h4 = h3 << nelt2;
20399
20400 /* If all the elements come from the low halves, use interleave low;
20401 similarly for interleave high.  If the elements are from mis-matched
20402 halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
20403 if ((contents & (h1 | h3)) == contents)
20404 {
20405 /* punpckl* */
20406 for (i = 0; i < nelt2; ++i)
20407 {
20408 remap[i] = i * 2;
20409 remap[i + nelt] = i * 2 + 1;
20410 dremap.perm[i * 2] = i;
20411 dremap.perm[i * 2 + 1] = i + nelt;
20412 }
20413 if (!TARGET_SSE2 && d->vmode == V4SImode)
20414 dremap.vmode = V4SFmode;
20415 }
20416 else if ((contents & (h2 | h4)) == contents)
20417 {
20418 /* punpckh* */
20419 for (i = 0; i < nelt2; ++i)
20420 {
20421 remap[i + nelt2] = i * 2;
20422 remap[i + nelt + nelt2] = i * 2 + 1;
20423 dremap.perm[i * 2] = i + nelt2;
20424 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
20425 }
20426 if (!TARGET_SSE2 && d->vmode == V4SImode)
20427 dremap.vmode = V4SFmode;
20428 }
20429 else if ((contents & (h1 | h4)) == contents)
20430 {
20431 /* shufps */
20432 for (i = 0; i < nelt2; ++i)
20433 {
20434 remap[i] = i;
20435 remap[i + nelt + nelt2] = i + nelt2;
20436 dremap.perm[i] = i;
20437 dremap.perm[i + nelt2] = i + nelt + nelt2;
20438 }
20439 if (nelt != 4)
20440 {
20441 /* shufpd */
20442 dremap.vmode = V2DImode;
20443 dremap.nelt = 2;
20444 dremap.perm[0] = 0;
20445 dremap.perm[1] = 3;
20446 }
20447 }
20448 else if ((contents & (h2 | h3)) == contents)
20449 {
20450 /* shufps */
20451 for (i = 0; i < nelt2; ++i)
20452 {
20453 remap[i + nelt2] = i;
20454 remap[i + nelt] = i + nelt2;
20455 dremap.perm[i] = i + nelt2;
20456 dremap.perm[i + nelt2] = i + nelt;
20457 }
20458 if (nelt != 4)
20459 {
20460 /* shufpd */
20461 dremap.vmode = V2DImode;
20462 dremap.nelt = 2;
20463 dremap.perm[0] = 1;
20464 dremap.perm[1] = 2;
20465 }
20466 }
20467 else
20468 return false;
20469 }
20470 else
20471 {
20472 unsigned int nelt4 = nelt / 4, nzcnt = 0;
20473 unsigned HOST_WIDE_INT q[8];
20474 unsigned int nonzero_halves[4];
20475
20476 /* Split the two input vectors into 8 quarters. */
20477 q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
20478 for (i = 1; i < 8; ++i)
20479 q[i] = q[0] << (nelt4 * i);
20480 for (i = 0; i < 4; ++i)
20481 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
20482 {
20483 nonzero_halves[nzcnt] = i;
20484 ++nzcnt;
20485 }
20486
20487 if (nzcnt == 1)
20488 {
20489 gcc_assert (d->one_operand_p);
20490 nonzero_halves[1] = nonzero_halves[0];
20491 same_halves = true;
20492 }
20493 else if (d->one_operand_p)
20494 {
20495 gcc_assert (nonzero_halves[0] == 0);
20496 gcc_assert (nonzero_halves[1] == 1);
20497 }
20498
20499 if (nzcnt <= 2)
20500 {
20501 if (d->perm[0] / nelt2 == nonzero_halves[1])
20502 {
20503 /* Attempt to increase the likelihood that dfinal
20504 shuffle will be intra-lane. */
20505 std::swap (nonzero_halves[0], nonzero_halves[1]);
20506 }
20507
20508 /* vperm2f128 or vperm2i128. */
20509 for (i = 0; i < nelt2; ++i)
20510 {
20511 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
20512 remap[i + nonzero_halves[0] * nelt2] = i;
20513 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
20514 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
20515 }
20516
20517 if (d->vmode != V8SFmode
20518 && d->vmode != V4DFmode
20519 && d->vmode != V8SImode)
20520 {
20521 dremap.vmode = V8SImode;
20522 dremap.nelt = 8;
20523 for (i = 0; i < 4; ++i)
20524 {
20525 dremap.perm[i] = i + nonzero_halves[0] * 4;
20526 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
20527 }
20528 }
20529 }
20530 else if (d->one_operand_p)
20531 return false;
20532 else if (TARGET_AVX2
20533 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
20534 {
20535 /* vpunpckl* */
20536 for (i = 0; i < nelt4; ++i)
20537 {
20538 remap[i] = i * 2;
20539 remap[i + nelt] = i * 2 + 1;
20540 remap[i + nelt2] = i * 2 + nelt2;
20541 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
20542 dremap.perm[i * 2] = i;
20543 dremap.perm[i * 2 + 1] = i + nelt;
20544 dremap.perm[i * 2 + nelt2] = i + nelt2;
20545 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
20546 }
20547 }
20548 else if (TARGET_AVX2
20549 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
20550 {
20551 /* vpunpckh* */
20552 for (i = 0; i < nelt4; ++i)
20553 {
20554 remap[i + nelt4] = i * 2;
20555 remap[i + nelt + nelt4] = i * 2 + 1;
20556 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
20557 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
20558 dremap.perm[i * 2] = i + nelt4;
20559 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
20560 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
20561 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
20562 }
20563 }
20564 else
20565 return false;
20566 }
20567
20568 /* Use the remapping array set up above to move the elements from their
20569 swizzled locations into their final destinations. */
20570 dfinal = *d;
20571 for (i = 0; i < nelt; ++i)
20572 {
20573 unsigned e = remap[d->perm[i]];
20574 gcc_assert (e < nelt);
20575 /* If same_halves is true, both halves of the remapped vector are the
20576 same. Avoid cross-lane accesses if possible. */
20577 if (same_halves && i >= nelt2)
20578 {
20579 gcc_assert (e < nelt2);
20580 dfinal.perm[i] = e + nelt2;
20581 }
20582 else
20583 dfinal.perm[i] = e;
20584 }
20585 if (!d->testing_p)
20586 {
20587 dremap.target = gen_reg_rtx (dremap.vmode);
20588 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
20589 }
20590 dfinal.op1 = dfinal.op0;
20591 dfinal.one_operand_p = true;
20592
20593 /* Test if the final remap can be done with a single insn. For V4SFmode or
20594 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
20595 start_sequence ();
20596 ok = expand_vec_perm_1 (&dfinal);
20597 seq = get_insns ();
20598 end_sequence ();
20599
20600 if (!ok)
20601 return false;
20602
20603 if (d->testing_p)
20604 return true;
20605
20606 if (dremap.vmode != dfinal.vmode)
20607 {
20608 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
20609 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
20610 }
20611
20612 ok = expand_vec_perm_1 (&dremap);
20613 gcc_assert (ok);
20614
20615 emit_insn (seq);
20616 return true;
20617 }
20618
20619 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
20620 a single vector cross-lane permutation into vpermq followed
20621 by any of the single insn permutations. */
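/* For example, a one-operand V32QImode permutation whose low half reads
   only the 64-bit quarters 0 and 2 of the input and whose high half reads
   only quarters 1 and 3 is expanded as vpermq with {0, 2, 1, 3} followed
   by an in-lane shuffle (typically vpshufb) of the gathered quarters.  */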
20622
20623 static bool
20624 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
20625 {
20626 struct expand_vec_perm_d dremap, dfinal;
20627 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
20628 unsigned contents[2];
20629 bool ok;
20630
20631 if (!(TARGET_AVX2
20632 && (d->vmode == V32QImode || d->vmode == V16HImode)
20633 && d->one_operand_p))
20634 return false;
20635
20636 contents[0] = 0;
20637 contents[1] = 0;
20638 for (i = 0; i < nelt2; ++i)
20639 {
20640 contents[0] |= 1u << (d->perm[i] / nelt4);
20641 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
20642 }
20643
20644 for (i = 0; i < 2; ++i)
20645 {
20646 unsigned int cnt = 0;
20647 for (j = 0; j < 4; ++j)
20648 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
20649 return false;
20650 }
20651
20652 if (d->testing_p)
20653 return true;
20654
20655 dremap = *d;
20656 dremap.vmode = V4DImode;
20657 dremap.nelt = 4;
20658 dremap.target = gen_reg_rtx (V4DImode);
20659 dremap.op0 = gen_lowpart (V4DImode, d->op0);
20660 dremap.op1 = dremap.op0;
20661 dremap.one_operand_p = true;
20662 for (i = 0; i < 2; ++i)
20663 {
20664 unsigned int cnt = 0;
20665 for (j = 0; j < 4; ++j)
20666 if ((contents[i] & (1u << j)) != 0)
20667 dremap.perm[2 * i + cnt++] = j;
20668 for (; cnt < 2; ++cnt)
20669 dremap.perm[2 * i + cnt] = 0;
20670 }
20671
20672 dfinal = *d;
20673 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
20674 dfinal.op1 = dfinal.op0;
20675 dfinal.one_operand_p = true;
20676 for (i = 0, j = 0; i < nelt; ++i)
20677 {
20678 if (i == nelt2)
20679 j = 2;
20680 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
20681 if ((d->perm[i] / nelt4) == dremap.perm[j])
20682 ;
20683 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
20684 dfinal.perm[i] |= nelt4;
20685 else
20686 gcc_unreachable ();
20687 }
20688
20689 ok = expand_vec_perm_1 (&dremap);
20690 gcc_assert (ok);
20691
20692 ok = expand_vec_perm_1 (&dfinal);
20693 gcc_assert (ok);
20694
20695 return true;
20696 }
20697
20698 static bool canonicalize_perm (struct expand_vec_perm_d *d);
20699
20700 /* A subroutine of ix86_expand_vec_perm_const_1. Try to expand
20701 a vector permutation using two instructions, vperm2f128 resp.
20702 vperm2i128 followed by any single in-lane permutation. */
20703
20704 static bool
20705 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
20706 {
20707 struct expand_vec_perm_d dfirst, dsecond;
20708 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
20709 bool ok;
20710
20711 if (!TARGET_AVX
20712 || GET_MODE_SIZE (d->vmode) != 32
20713 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
20714 return false;
20715
20716 dsecond = *d;
20717 dsecond.one_operand_p = false;
20718 dsecond.testing_p = true;
20719
20720 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
20721 immediate. For perm < 16 the second permutation uses
20722 d->op0 as first operand, for perm >= 16 it uses d->op1
20723 as first operand. The second operand is the result of
20724 vperm2[fi]128. */
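/* Illustrative example (added for exposition; the value of perm is
   arbitrary): for V4DFmode and perm == 9, the low-lane selector is
   perm & 3 == 1 and the high-lane selector is (perm >> 2) & 3 == 2,
   giving the immediate ((9 << 2) | 9) & 0x33 == 0x21; the vperm2f128
   result then holds the upper lane of d->op0 followed by the lower
   lane of d->op1, i.e. dfirst.perm below becomes { 2, 3, 4, 5 }.  */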
20725 for (perm = 0; perm < 32; perm++)
20726 {
20727 /* Ignore permutations which do not move anything cross-lane. */
20728 if (perm < 16)
20729 {
20730 /* The second shuffle for e.g. V4DFmode has
20731 0123 and ABCD operands.
20732 Ignore AB23, as 23 is already in the second lane
20733 of the first operand. */
20734 if ((perm & 0xc) == (1 << 2)) continue;
20735 /* And 01CD, as 01 is in the first lane of the first
20736 operand. */
20737 if ((perm & 3) == 0) continue;
20738 /* And 4567, as then the vperm2[fi]128 doesn't change
20739 anything on the original 4567 second operand. */
20740 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
20741 }
20742 else
20743 {
20744 /* The second shuffle for e.g. V4DFmode has
20745 4567 and ABCD operands.
20746 Ignore AB67, as 67 is already in the second lane
20747 of the first operand. */
20748 if ((perm & 0xc) == (3 << 2)) continue;
20749 /* And 45CD, as 45 is in the first lane of the first
20750 operand. */
20751 if ((perm & 3) == 2) continue;
20752 /* And 0123, as then the vperm2[fi]128 doesn't change
20753 anything on the original 0123 first operand. */
20754 if ((perm & 0xf) == (1 << 2)) continue;
20755 }
20756
20757 for (i = 0; i < nelt; i++)
20758 {
20759 j = d->perm[i] / nelt2;
20760 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
20761 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
20762 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
20763 dsecond.perm[i] = d->perm[i] & (nelt - 1);
20764 else
20765 break;
20766 }
20767
20768 if (i == nelt)
20769 {
20770 start_sequence ();
20771 ok = expand_vec_perm_1 (&dsecond);
20772 end_sequence ();
20773 }
20774 else
20775 ok = false;
20776
20777 if (ok)
20778 {
20779 if (d->testing_p)
20780 return true;
20781
20782 /* Found a usable second shuffle. dfirst will be
20783 vperm2f128 on d->op0 and d->op1. */
20784 dsecond.testing_p = false;
20785 dfirst = *d;
20786 dfirst.target = gen_reg_rtx (d->vmode);
20787 for (i = 0; i < nelt; i++)
20788 dfirst.perm[i] = (i & (nelt2 - 1))
20789 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
20790
20791 canonicalize_perm (&dfirst);
20792 ok = expand_vec_perm_1 (&dfirst);
20793 gcc_assert (ok);
20794
20795 /* And dsecond is some single insn shuffle, taking
20796 d->op0 and result of vperm2f128 (if perm < 16) or
20797 d->op1 and result of vperm2f128 (otherwise). */
20798 if (perm >= 16)
20799 dsecond.op0 = dsecond.op1;
20800 dsecond.op1 = dfirst.target;
20801
20802 ok = expand_vec_perm_1 (&dsecond);
20803 gcc_assert (ok);
20804
20805 return true;
20806 }
20807
20808 /* For one operand, the only useful vperm2f128 permutation is 0x01
20809 aka lanes swap. */
20810 if (d->one_operand_p)
20811 return false;
20812 }
20813
20814 return false;
20815 }
20816
20817 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
20818 a two vector permutation using 2 intra-lane interleave insns
20819 and cross-lane shuffle for 32-byte vectors. */
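/* Illustrative example (added for exposition): for V8SImode the
   recognized patterns are the interleaves of the two operands, i.e.
   d->perm == { 0, 8, 1, 9, 2, 10, 3, 11 } (d->perm[0] == 0, low
   interleave) or { 4, 12, 5, 13, 6, 14, 7, 15 } (d->perm[0] == nelt/2,
   high interleave).  */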
20820
20821 static bool
20822 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
20823 {
20824 unsigned i, nelt;
20825 rtx (*gen) (rtx, rtx, rtx);
20826
20827 if (d->one_operand_p)
20828 return false;
20829 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
20830 ;
20831 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
20832 ;
20833 else
20834 return false;
20835
20836 nelt = d->nelt;
20837 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
20838 return false;
20839 for (i = 0; i < nelt; i += 2)
20840 if (d->perm[i] != d->perm[0] + i / 2
20841 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
20842 return false;
20843
20844 if (d->testing_p)
20845 return true;
20846
20847 switch (d->vmode)
20848 {
20849 case E_V32QImode:
20850 if (d->perm[0])
20851 gen = gen_vec_interleave_highv32qi;
20852 else
20853 gen = gen_vec_interleave_lowv32qi;
20854 break;
20855 case E_V16HImode:
20856 if (d->perm[0])
20857 gen = gen_vec_interleave_highv16hi;
20858 else
20859 gen = gen_vec_interleave_lowv16hi;
20860 break;
20861 case E_V8SImode:
20862 if (d->perm[0])
20863 gen = gen_vec_interleave_highv8si;
20864 else
20865 gen = gen_vec_interleave_lowv8si;
20866 break;
20867 case E_V4DImode:
20868 if (d->perm[0])
20869 gen = gen_vec_interleave_highv4di;
20870 else
20871 gen = gen_vec_interleave_lowv4di;
20872 break;
20873 case E_V8SFmode:
20874 if (d->perm[0])
20875 gen = gen_vec_interleave_highv8sf;
20876 else
20877 gen = gen_vec_interleave_lowv8sf;
20878 break;
20879 case E_V4DFmode:
20880 if (d->perm[0])
20881 gen = gen_vec_interleave_highv4df;
20882 else
20883 gen = gen_vec_interleave_lowv4df;
20884 break;
20885 default:
20886 gcc_unreachable ();
20887 }
20888
20889 emit_insn (gen (d->target, d->op0, d->op1));
20890 return true;
20891 }
20892
20893 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
20894 a single vector permutation using a single intra-lane vector
20895 permutation, vperm2f128 swapping the lanes and vblend* insn blending
20896 the non-swapped and swapped vectors together. */
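/* Illustrative example (added for exposition; the permutation is made
   up): for V4DFmode and d->perm == { 1, 2, 3, 0 }, the in-lane shuffle
   below computes { 1, 0, 3, 2 }, vperm2f128 then swaps its two lanes,
   and vblendpd with mask 0b1010 picks elements 0 and 2 from the
   non-swapped vector and elements 1 and 3 from the swapped one.  */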
20897
20898 static bool
20899 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
20900 {
20901 struct expand_vec_perm_d dfirst, dsecond;
20902 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
20903 rtx_insn *seq;
20904 bool ok;
20905 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
20906
20907 if (!TARGET_AVX
20908 || TARGET_AVX2
20909 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
20910 || !d->one_operand_p)
20911 return false;
20912
20913 dfirst = *d;
20914 for (i = 0; i < nelt; i++)
20915 dfirst.perm[i] = 0xff;
20916 for (i = 0, msk = 0; i < nelt; i++)
20917 {
20918 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
20919 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
20920 return false;
20921 dfirst.perm[j] = d->perm[i];
20922 if (j != i)
20923 msk |= (1 << i);
20924 }
20925 for (i = 0; i < nelt; i++)
20926 if (dfirst.perm[i] == 0xff)
20927 dfirst.perm[i] = i;
20928
20929 if (!d->testing_p)
20930 dfirst.target = gen_reg_rtx (dfirst.vmode);
20931
20932 start_sequence ();
20933 ok = expand_vec_perm_1 (&dfirst);
20934 seq = get_insns ();
20935 end_sequence ();
20936
20937 if (!ok)
20938 return false;
20939
20940 if (d->testing_p)
20941 return true;
20942
20943 emit_insn (seq);
20944
20945 dsecond = *d;
20946 dsecond.op0 = dfirst.target;
20947 dsecond.op1 = dfirst.target;
20948 dsecond.one_operand_p = true;
20949 dsecond.target = gen_reg_rtx (dsecond.vmode);
20950 for (i = 0; i < nelt; i++)
20951 dsecond.perm[i] = i ^ nelt2;
20952
20953 ok = expand_vec_perm_1 (&dsecond);
20954 gcc_assert (ok);
20955
20956 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
20957 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
20958 return true;
20959 }
20960
20961 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
20962 a two vector permutation using two single vector permutations and
20963 {,v}{,p}unpckl{ps,pd,bw,wd,dq}. If two_insn, succeed only if one
20964 of dfirst or dsecond is identity permutation. */
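/* Illustrative example (added for exposition; the permutation is made
   up): for V4SFmode and d->perm == { 2, 5, 3, 4 }, the elements
   alternate between the two operands, so op0 is pre-shuffled to
   { 2, 3, 2, 3 } and op1 to { 1, 0, 1, 0 }, and a final unpcklps of
   the two intermediate results yields { 2, 5, 3, 4 }.  When one of
   the pre-shuffles is the identity, only two insns are needed.  */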
20965
20966 static bool
20967 expand_vec_perm_2perm_interleave (struct expand_vec_perm_d *d, bool two_insn)
20968 {
20969 unsigned i, nelt = d->nelt, nelt2 = nelt / 2, lane = nelt;
20970 struct expand_vec_perm_d dfirst, dsecond, dfinal;
20971 bool ident1 = true, ident2 = true;
20972
20973 if (d->one_operand_p)
20974 return false;
20975
20976 if (GET_MODE_SIZE (d->vmode) == 16)
20977 {
20978 if (!TARGET_SSE)
20979 return false;
20980 if (d->vmode != V4SFmode && d->vmode != V2DFmode && !TARGET_SSE2)
20981 return false;
20982 }
20983 else if (GET_MODE_SIZE (d->vmode) == 32)
20984 {
20985 if (!TARGET_AVX)
20986 return false;
20987 if (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2)
20988 return false;
20989 lane = nelt2;
20990 }
20991 else
20992 return false;
20993
20994 for (i = 1; i < nelt; i++)
20995 if ((d->perm[i] >= nelt) != ((d->perm[0] >= nelt) ^ (i & 1)))
20996 return false;
20997
20998 dfirst = *d;
20999 dsecond = *d;
21000 dfinal = *d;
21001 dfirst.op1 = dfirst.op0;
21002 dfirst.one_operand_p = true;
21003 dsecond.op0 = dsecond.op1;
21004 dsecond.one_operand_p = true;
21005
21006 for (i = 0; i < nelt; i++)
21007 if (d->perm[i] >= nelt)
21008 {
21009 dsecond.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i] - nelt;
21010 if (d->perm[i] - nelt != i / 2 + (i >= lane ? lane / 2 : 0))
21011 ident2 = false;
21012 dsecond.perm[i / 2 + (i >= lane ? lane : lane / 2)]
21013 = d->perm[i] - nelt;
21014 }
21015 else
21016 {
21017 dfirst.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i];
21018 if (d->perm[i] != i / 2 + (i >= lane ? lane / 2 : 0))
21019 ident1 = false;
21020 dfirst.perm[i / 2 + (i >= lane ? lane : lane / 2)] = d->perm[i];
21021 }
21022
21023 if (two_insn && !ident1 && !ident2)
21024 return false;
21025
21026 if (!d->testing_p)
21027 {
21028 if (!ident1)
21029 dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
21030 if (!ident2)
21031 dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
21032 if (d->perm[0] >= nelt)
21033 std::swap (dfinal.op0, dfinal.op1);
21034 }
21035
21036 bool ok;
21037 rtx_insn *seq1 = NULL, *seq2 = NULL;
21038
21039 if (!ident1)
21040 {
21041 start_sequence ();
21042 ok = expand_vec_perm_1 (&dfirst);
21043 seq1 = get_insns ();
21044 end_sequence ();
21045
21046 if (!ok)
21047 return false;
21048 }
21049
21050 if (!ident2)
21051 {
21052 start_sequence ();
21053 ok = expand_vec_perm_1 (&dsecond);
21054 seq2 = get_insns ();
21055 end_sequence ();
21056
21057 if (!ok)
21058 return false;
21059 }
21060
21061 if (d->testing_p)
21062 return true;
21063
21064 for (i = 0; i < nelt; i++)
21065 {
21066 dfinal.perm[i] = i / 2;
21067 if (i >= lane)
21068 dfinal.perm[i] += lane / 2;
21069 if ((i & 1) != 0)
21070 dfinal.perm[i] += nelt;
21071 }
21072 emit_insn (seq1);
21073 emit_insn (seq2);
21074 ok = expand_vselect_vconcat (dfinal.target, dfinal.op0, dfinal.op1,
21075 dfinal.perm, dfinal.nelt, false);
21076 gcc_assert (ok);
21077 return true;
21078 }
21079
21080 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
21081 the permutation using two single vector permutations and the SSE4_1 pblendv
21082 instruction. If two_insn, succeed only if one of dfirst or dsecond is
21083 identity permutation. */
21084
21085 static bool
21086 expand_vec_perm_2perm_pblendv (struct expand_vec_perm_d *d, bool two_insn)
21087 {
21088 unsigned i, nelt = d->nelt;
21089 struct expand_vec_perm_d dfirst, dsecond, dfinal;
21090 machine_mode vmode = d->vmode;
21091 bool ident1 = true, ident2 = true;
21092
21093 /* Use the same checks as in expand_vec_perm_blend. */
21094 if (d->one_operand_p)
21095 return false;
21096 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
21097 ;
21098 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
21099 ;
21100 else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 16
21101 || GET_MODE_SIZE (vmode) == 8
21102 || GET_MODE_SIZE (vmode) == 4))
21103 ;
21104 else
21105 return false;
21106
21107 dfirst = *d;
21108 dsecond = *d;
21109 dfinal = *d;
21110 dfirst.op1 = dfirst.op0;
21111 dfirst.one_operand_p = true;
21112 dsecond.op0 = dsecond.op1;
21113 dsecond.one_operand_p = true;
21114
21115 for (i = 0; i < nelt; ++i)
21116 if (d->perm[i] >= nelt)
21117 {
21118 dfirst.perm[i] = 0xff;
21119 dsecond.perm[i] = d->perm[i] - nelt;
21120 if (d->perm[i] != i + nelt)
21121 ident2 = false;
21122 }
21123 else
21124 {
21125 dsecond.perm[i] = 0xff;
21126 dfirst.perm[i] = d->perm[i];
21127 if (d->perm[i] != i)
21128 ident1 = false;
21129 }
21130
21131 if (two_insn && !ident1 && !ident2)
21132 return false;
21133
21134 /* For now. Ideally treat 0xff as a wildcard. */
21135 for (i = 0; i < nelt; ++i)
21136 if (dfirst.perm[i] == 0xff)
21137 {
21138 if (GET_MODE_SIZE (vmode) == 32
21139 && dfirst.perm[i ^ (nelt / 2)] != 0xff)
21140 dfirst.perm[i] = dfirst.perm[i ^ (nelt / 2)] ^ (nelt / 2);
21141 else
21142 dfirst.perm[i] = i;
21143 }
21144 else
21145 {
21146 if (GET_MODE_SIZE (vmode) == 32
21147 && dsecond.perm[i ^ (nelt / 2)] != 0xff)
21148 dsecond.perm[i] = dsecond.perm[i ^ (nelt / 2)] ^ (nelt / 2);
21149 else
21150 dsecond.perm[i] = i;
21151 }
21152
21153 if (!d->testing_p)
21154 {
21155 if (!ident1)
21156 dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
21157 if (!ident2)
21158 dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
21159 }
21160
21161 bool ok;
21162 rtx_insn *seq1 = NULL, *seq2 = NULL;
21163
21164 if (!ident1)
21165 {
21166 start_sequence ();
21167 ok = expand_vec_perm_1 (&dfirst);
21168 seq1 = get_insns ();
21169 end_sequence ();
21170
21171 if (!ok)
21172 return false;
21173 }
21174
21175 if (!ident2)
21176 {
21177 start_sequence ();
21178 ok = expand_vec_perm_1 (&dsecond);
21179 seq2 = get_insns ();
21180 end_sequence ();
21181
21182 if (!ok)
21183 return false;
21184 }
21185
21186 if (d->testing_p)
21187 return true;
21188
21189 for (i = 0; i < nelt; ++i)
21190 dfinal.perm[i] = (d->perm[i] >= nelt ? i + nelt : i);
21191
21192 emit_insn (seq1);
21193 emit_insn (seq2);
21194 ok = expand_vec_perm_blend (&dfinal);
21195 gcc_assert (ok);
21196 return true;
21197 }
21198
21199 /* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF
21200 permutation using two vperm2f128, followed by a vshufpd insn blending
21201 the two vectors together. */
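/* Illustrative example (added for exposition; the permutation is made
   up): for d->perm == { 2, 7, 1, 4 } the two vperm2f128 results are
   { 2, 3, 0, 1 } and { 6, 7, 4, 5 }, and the final vshufpd selects
   { 0, 5, 3, 6 } from their concatenation, i.e. one element from each
   128-bit pair, alternating between the two intermediates.  */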
21202
21203 static bool
21204 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
21205 {
21206 struct expand_vec_perm_d dfirst, dsecond, dthird;
21207 bool ok;
21208
21209 if (!TARGET_AVX || (d->vmode != V4DFmode))
21210 return false;
21211
21212 if (d->testing_p)
21213 return true;
21214
21215 dfirst = *d;
21216 dsecond = *d;
21217 dthird = *d;
21218
21219 dfirst.perm[0] = (d->perm[0] & ~1);
21220 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
21221 dfirst.perm[2] = (d->perm[2] & ~1);
21222 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
21223 dsecond.perm[0] = (d->perm[1] & ~1);
21224 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
21225 dsecond.perm[2] = (d->perm[3] & ~1);
21226 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
21227 dthird.perm[0] = (d->perm[0] % 2);
21228 dthird.perm[1] = (d->perm[1] % 2) + 4;
21229 dthird.perm[2] = (d->perm[2] % 2) + 2;
21230 dthird.perm[3] = (d->perm[3] % 2) + 6;
21231
21232 dfirst.target = gen_reg_rtx (dfirst.vmode);
21233 dsecond.target = gen_reg_rtx (dsecond.vmode);
21234 dthird.op0 = dfirst.target;
21235 dthird.op1 = dsecond.target;
21236 dthird.one_operand_p = false;
21237
21238 canonicalize_perm (&dfirst);
21239 canonicalize_perm (&dsecond);
21240
21241 ok = expand_vec_perm_1 (&dfirst)
21242 && expand_vec_perm_1 (&dsecond)
21243 && expand_vec_perm_1 (&dthird);
21244
21245 gcc_assert (ok);
21246
21247 return true;
21248 }
21249
21250 static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *);
21251
21252 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
21253 a two vector permutation using two intra-lane vector
21254 permutations, vperm2f128 swapping the lanes and vblend* insn blending
21255 the non-swapped and swapped vectors together. */
21256
21257 static bool
21258 expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d *d)
21259 {
21260 struct expand_vec_perm_d dfirst, dsecond, dthird;
21261 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2, which1 = 0, which2 = 0;
21262 rtx_insn *seq1, *seq2;
21263 bool ok;
21264 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
21265
21266 if (!TARGET_AVX
21267 || TARGET_AVX2
21268 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
21269 || d->one_operand_p)
21270 return false;
21271
21272 dfirst = *d;
21273 dsecond = *d;
21274 for (i = 0; i < nelt; i++)
21275 {
21276 dfirst.perm[i] = 0xff;
21277 dsecond.perm[i] = 0xff;
21278 }
21279 for (i = 0, msk = 0; i < nelt; i++)
21280 {
21281 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
21282 if (j == i)
21283 {
21284 dfirst.perm[j] = d->perm[i];
21285 which1 |= (d->perm[i] < nelt ? 1 : 2);
21286 }
21287 else
21288 {
21289 dsecond.perm[j] = d->perm[i];
21290 which2 |= (d->perm[i] < nelt ? 1 : 2);
21291 msk |= (1U << i);
21292 }
21293 }
21294 if (msk == 0 || msk == (1U << nelt) - 1)
21295 return false;
21296
21297 if (!d->testing_p)
21298 {
21299 dfirst.target = gen_reg_rtx (dfirst.vmode);
21300 dsecond.target = gen_reg_rtx (dsecond.vmode);
21301 }
21302
21303 for (i = 0; i < nelt; i++)
21304 {
21305 if (dfirst.perm[i] == 0xff)
21306 dfirst.perm[i] = (which1 == 2 ? i + nelt : i);
21307 if (dsecond.perm[i] == 0xff)
21308 dsecond.perm[i] = (which2 == 2 ? i + nelt : i);
21309 }
21310 canonicalize_perm (&dfirst);
21311 start_sequence ();
21312 ok = ix86_expand_vec_perm_const_1 (&dfirst);
21313 seq1 = get_insns ();
21314 end_sequence ();
21315
21316 if (!ok)
21317 return false;
21318
21319 canonicalize_perm (&dsecond);
21320 start_sequence ();
21321 ok = ix86_expand_vec_perm_const_1 (&dsecond);
21322 seq2 = get_insns ();
21323 end_sequence ();
21324
21325 if (!ok)
21326 return false;
21327
21328 if (d->testing_p)
21329 return true;
21330
21331 emit_insn (seq1);
21332 emit_insn (seq2);
21333
21334 dthird = *d;
21335 dthird.op0 = dsecond.target;
21336 dthird.op1 = dsecond.target;
21337 dthird.one_operand_p = true;
21338 dthird.target = gen_reg_rtx (dthird.vmode);
21339 for (i = 0; i < nelt; i++)
21340 dthird.perm[i] = i ^ nelt2;
21341
21342 ok = expand_vec_perm_1 (&dthird);
21343 gcc_assert (ok);
21344
21345 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
21346 emit_insn (blend (d->target, dfirst.target, dthird.target, GEN_INT (msk)));
21347 return true;
21348 }
21349
21350 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
21351 permutation with two pshufb insns and an ior. We should have already
21352 failed all two instruction sequences. */
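/* Illustrative example (added for exposition; the permutation is made
   up): for a V16QImode shuffle selecting { 0, 17, 2, 19, ... } the
   mask fed to the first pshufb starts { 0, -128, 2, -128, ... } and
   the mask fed to the second starts { -128, 1, -128, 3, ... }; each
   pshufb thus zeroes the lanes owned by the other operand and the
   final ior merges the two partial results.  */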
21353
21354 static bool
21355 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
21356 {
21357 rtx rperm[2][16], vperm, l, h, op, m128;
21358 unsigned int i, nelt, eltsz;
21359 machine_mode mode;
21360 rtx (*gen) (rtx, rtx, rtx);
21361
21362 if (!TARGET_SSSE3 || (GET_MODE_SIZE (d->vmode) != 16
21363 && GET_MODE_SIZE (d->vmode) != 8
21364 && GET_MODE_SIZE (d->vmode) != 4))
21365 return false;
21366 gcc_assert (!d->one_operand_p);
21367
21368 if (d->testing_p)
21369 return true;
21370
21371 switch (GET_MODE_SIZE (d->vmode))
21372 {
21373 case 4:
21374 mode = V4QImode;
21375 gen = gen_mmx_pshufbv4qi3;
21376 break;
21377 case 8:
21378 mode = V8QImode;
21379 gen = gen_mmx_pshufbv8qi3;
21380 break;
21381 case 16:
21382 mode = V16QImode;
21383 gen = gen_ssse3_pshufbv16qi3;
21384 break;
21385 default:
21386 gcc_unreachable ();
21387 }
21388
21389 nelt = d->nelt;
21390 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
21391
21392 /* Generate two permutation masks. If the required element is within
21393 the given vector it is shuffled into the proper lane. If the required
21394 element is in the other vector, force a zero into the lane by setting
21395 bit 7 in the permutation mask. */
21396 m128 = GEN_INT (-128);
21397 for (i = 0; i < nelt; ++i)
21398 {
21399 unsigned j, k, e = d->perm[i];
21400 unsigned which = (e >= nelt);
21401 if (e >= nelt)
21402 e -= nelt;
21403
21404 for (j = 0; j < eltsz; ++j)
21405 {
21406 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
21407 rperm[1-which][i*eltsz + j] = m128;
21408 }
21409
21410 for (k = i*eltsz + j; k < 16; ++k)
21411 rperm[0][k] = rperm[1][k] = m128;
21412 }
21413
21414 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
21415 vperm = force_reg (V16QImode, vperm);
21416
21417 l = gen_reg_rtx (mode);
21418 op = gen_lowpart (mode, d->op0);
21419 emit_insn (gen (l, op, vperm));
21420
21421 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
21422 vperm = force_reg (V16QImode, vperm);
21423
21424 h = gen_reg_rtx (mode);
21425 op = gen_lowpart (mode, d->op1);
21426 emit_insn (gen (h, op, vperm));
21427
21428 op = d->target;
21429 if (d->vmode != mode)
21430 op = gen_reg_rtx (mode);
21431 ix86_emit_vec_binop (IOR, mode, op, l, h);
21432 if (op != d->target)
21433 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
21434
21435 return true;
21436 }
21437
21438 /* Implement an arbitrary permutation of one V32QImode or V16HImode operand
21439 with two vpshufb insns, vpermq and vpor. We should have already failed
21440 all two or three instruction sequences. */
21441
21442 static bool
21443 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
21444 {
21445 rtx rperm[2][32], vperm, l, h, hp, op, m128;
21446 unsigned int i, nelt, eltsz;
21447
21448 if (!TARGET_AVX2
21449 || !d->one_operand_p
21450 || (d->vmode != V32QImode && d->vmode != V16HImode))
21451 return false;
21452
21453 if (d->testing_p)
21454 return true;
21455
21456 nelt = d->nelt;
21457 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
21458
21459 /* Generate two permutation masks. If the required element is within
21460 the same lane, it is shuffled in. If the required element is from the
21461 other lane, force a zero by setting bit 7 in the permutation mask.
21462 The other mask has a non-negative element wherever an element is
21463 requested from the other lane; that element is also moved to the other
21464 lane, so that the result of vpshufb can have its two V2TImode halves
21465 swapped. */
21466 m128 = GEN_INT (-128);
21467 for (i = 0; i < nelt; ++i)
21468 {
21469 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
21470 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
21471
21472 for (j = 0; j < eltsz; ++j)
21473 {
21474 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
21475 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
21476 }
21477 }
21478
21479 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
21480 vperm = force_reg (V32QImode, vperm);
21481
21482 h = gen_reg_rtx (V32QImode);
21483 op = gen_lowpart (V32QImode, d->op0);
21484 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
21485
21486 /* Swap the 128-bit lanes of h into hp. */
21487 hp = gen_reg_rtx (V4DImode);
21488 op = gen_lowpart (V4DImode, h);
21489 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
21490 const1_rtx));
21491
21492 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
21493 vperm = force_reg (V32QImode, vperm);
21494
21495 l = gen_reg_rtx (V32QImode);
21496 op = gen_lowpart (V32QImode, d->op0);
21497 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
21498
21499 op = d->target;
21500 if (d->vmode != V32QImode)
21501 op = gen_reg_rtx (V32QImode);
21502 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
21503 if (op != d->target)
21504 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
21505
21506 return true;
21507 }
21508
21509 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
21510 and extract-odd permutations of two V32QImode or V16HImode operands
21511 with two vpshufb insns, vpor and vpermq. We should have already
21512 failed all two or three instruction sequences. */
21513
21514 static bool
21515 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
21516 {
21517 rtx rperm[2][32], vperm, l, h, ior, op, m128;
21518 unsigned int i, nelt, eltsz;
21519
21520 if (!TARGET_AVX2
21521 || d->one_operand_p
21522 || (d->vmode != V32QImode && d->vmode != V16HImode))
21523 return false;
21524
21525 for (i = 0; i < d->nelt; ++i)
21526 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
21527 return false;
21528
21529 if (d->testing_p)
21530 return true;
21531
21532 nelt = d->nelt;
21533 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
21534
21535 /* Generate two permutation masks. In the first permutation mask
21536 the first quarter will contain indexes for the first half
21537 of the op0, the second quarter will contain bit 7 set, third quarter
21538 will contain indexes for the second half of the op0 and the
21539 last quarter bit 7 set. In the second permutation mask
21540 the first quarter will contain bit 7 set, the second quarter
21541 indexes for the first half of the op1, the third quarter bit 7 set
21542 and last quarter indexes for the second half of the op1.
21543 I.e. the first mask e.g. for V32QImode extract even will be:
21544 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
21545 (all values masked with 0xf except for -128) and second mask
21546 for extract even will be
21547 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
21548 m128 = GEN_INT (-128);
21549 for (i = 0; i < nelt; ++i)
21550 {
21551 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
21552 unsigned which = d->perm[i] >= nelt;
21553 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
21554
21555 for (j = 0; j < eltsz; ++j)
21556 {
21557 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
21558 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
21559 }
21560 }
21561
21562 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
21563 vperm = force_reg (V32QImode, vperm);
21564
21565 l = gen_reg_rtx (V32QImode);
21566 op = gen_lowpart (V32QImode, d->op0);
21567 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
21568
21569 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
21570 vperm = force_reg (V32QImode, vperm);
21571
21572 h = gen_reg_rtx (V32QImode);
21573 op = gen_lowpart (V32QImode, d->op1);
21574 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
21575
21576 ior = gen_reg_rtx (V32QImode);
21577 emit_insn (gen_iorv32qi3 (ior, l, h));
21578
21579 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
21580 op = gen_reg_rtx (V4DImode);
21581 ior = gen_lowpart (V4DImode, ior);
21582 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
21583 const1_rtx, GEN_INT (3)));
21584 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
21585
21586 return true;
21587 }
21588
21589 /* Implement permutation with pslldq + psrldq + por when pshufb is not
21590 available. */
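/* Illustrative example (added for exposition; the permutation is made
   up): for V16QImode and d->perm == { 3, 4, ..., 15, 16, 17, 18 },
   op0 is shifted right by 3 bytes (psrldq), op1 is shifted left by
   13 bytes (pslldq), and the two are combined with por; no pandn/pand
   masking is needed because neither vector keeps stray elements.  */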
21591 static bool
21592 expand_vec_perm_pslldq_psrldq_por (struct expand_vec_perm_d *d, bool pandn)
21593 {
21594 unsigned i, nelt = d->nelt;
21595 unsigned start1, end1 = -1;
21596 machine_mode vmode = d->vmode, imode;
21597 int start2 = -1;
21598 bool clear_op0, clear_op1;
21599 unsigned inner_size;
21600 rtx op0, op1, dop1;
21601 rtx (*gen_vec_shr) (rtx, rtx, rtx);
21602 rtx (*gen_vec_shl) (rtx, rtx, rtx);
21603
21604 /* pshufd can be used for V4SI/V2DI under TARGET_SSE2. */
21605 if (!TARGET_SSE2 || (vmode != E_V16QImode && vmode != E_V8HImode))
21606 return false;
21607
21608 start1 = d->perm[0];
21609 for (i = 1; i < nelt; i++)
21610 {
21611 if (d->perm[i] != d->perm[i-1] + 1
21612 || d->perm[i] == nelt)
21613 {
21614 if (start2 == -1)
21615 {
21616 start2 = d->perm[i];
21617 end1 = d->perm[i-1];
21618 }
21619 else
21620 return false;
21621 }
21622 }
21623
21624 clear_op0 = end1 != nelt - 1;
21625 clear_op1 = start2 % nelt != 0;
21626 /* pandn/pand is needed to clear upper/lower bits of op0/op1. */
21627 if (!pandn && (clear_op0 || clear_op1))
21628 return false;
21629
21630 if (d->testing_p)
21631 return true;
21632
21633 gen_vec_shr = vmode == E_V16QImode ? gen_vec_shr_v16qi : gen_vec_shr_v8hi;
21634 gen_vec_shl = vmode == E_V16QImode ? gen_vec_shl_v16qi : gen_vec_shl_v8hi;
21635 imode = GET_MODE_INNER (vmode);
21636 inner_size = GET_MODE_BITSIZE (imode);
21637 op0 = gen_reg_rtx (vmode);
21638 op1 = gen_reg_rtx (vmode);
21639
21640 if (start1)
21641 emit_insn (gen_vec_shr (op0, d->op0, GEN_INT (start1 * inner_size)));
21642 else
21643 emit_move_insn (op0, d->op0);
21644
21645 dop1 = d->op1;
21646 if (d->one_operand_p)
21647 dop1 = d->op0;
21648
21649 int shl_offset = end1 - start1 + 1 - start2 % nelt;
21650 if (shl_offset)
21651 emit_insn (gen_vec_shl (op1, dop1, GEN_INT (shl_offset * inner_size)));
21652 else
21653 emit_move_insn (op1, dop1);
21654
21655 /* Clear the upper elements of op0 and/or the lower elements of op1. */
21656 if (clear_op0 || clear_op1)
21657 {
21658 rtx vec[16];
21659 rtx const_vec;
21660 rtx clear;
21661 for (i = 0; i != nelt; i++)
21662 {
21663 if (i < (end1 - start1 + 1))
21664 vec[i] = gen_int_mode ((HOST_WIDE_INT_1U << inner_size) - 1, imode);
21665 else
21666 vec[i] = CONST0_RTX (imode);
21667 }
21668 const_vec = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, vec));
21669 const_vec = validize_mem (force_const_mem (vmode, const_vec));
21670 clear = force_reg (vmode, const_vec);
21671
21672 if (clear_op0)
21673 emit_move_insn (op0, gen_rtx_AND (vmode, op0, clear));
21674 if (clear_op1)
21675 emit_move_insn (op1, gen_rtx_AND (vmode,
21676 gen_rtx_NOT (vmode, clear),
21677 op1));
21678 }
21679
21680 emit_move_insn (d->target, gen_rtx_IOR (vmode, op0, op1));
21681 return true;
21682 }
21683
21684 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
21685 and extract-odd permutations of two V8QI, V8HI, V16QI, V16HI or V32QI
21686 operands with two "and" and "pack" or two "shift" and "pack" insns.
21687 We should have already failed all two instruction sequences. */
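/* Illustrative example (added for exposition): extracting the even
   bytes of two V16QImode operands views each operand as V8HImode,
   masks every 16-bit element with 0x00ff and packs the two results
   with packuswb, yielding bytes { 0, 2, ..., 14, 16, 18, ..., 30 };
   the odd variant replaces the masking with a logical right shift
   by 8 bits.  */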
21688
21689 static bool
21690 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
21691 {
21692 rtx op, dop0, dop1, t;
21693 unsigned i, odd, c, s, nelt = d->nelt;
21694 bool end_perm = false;
21695 machine_mode half_mode;
21696 rtx (*gen_and) (rtx, rtx, rtx);
21697 rtx (*gen_pack) (rtx, rtx, rtx);
21698 rtx (*gen_shift) (rtx, rtx, rtx);
21699
21700 if (d->one_operand_p)
21701 return false;
21702
21703 switch (d->vmode)
21704 {
21705 case E_V4HImode:
21706 /* Required for "pack". */
21707 if (!TARGET_SSE4_1)
21708 return false;
21709 c = 0xffff;
21710 s = 16;
21711 half_mode = V2SImode;
21712 gen_and = gen_andv2si3;
21713 gen_pack = gen_mmx_packusdw;
21714 gen_shift = gen_lshrv2si3;
21715 break;
21716 case E_V8HImode:
21717 /* Required for "pack". */
21718 if (!TARGET_SSE4_1)
21719 return false;
21720 c = 0xffff;
21721 s = 16;
21722 half_mode = V4SImode;
21723 gen_and = gen_andv4si3;
21724 gen_pack = gen_sse4_1_packusdw;
21725 gen_shift = gen_lshrv4si3;
21726 break;
21727 case E_V8QImode:
21728 /* No check as all instructions are SSE2. */
21729 c = 0xff;
21730 s = 8;
21731 half_mode = V4HImode;
21732 gen_and = gen_andv4hi3;
21733 gen_pack = gen_mmx_packuswb;
21734 gen_shift = gen_lshrv4hi3;
21735 break;
21736 case E_V16QImode:
21737 /* No check as all instructions are SSE2. */
21738 c = 0xff;
21739 s = 8;
21740 half_mode = V8HImode;
21741 gen_and = gen_andv8hi3;
21742 gen_pack = gen_sse2_packuswb;
21743 gen_shift = gen_lshrv8hi3;
21744 break;
21745 case E_V16HImode:
21746 if (!TARGET_AVX2)
21747 return false;
21748 c = 0xffff;
21749 s = 16;
21750 half_mode = V8SImode;
21751 gen_and = gen_andv8si3;
21752 gen_pack = gen_avx2_packusdw;
21753 gen_shift = gen_lshrv8si3;
21754 end_perm = true;
21755 break;
21756 case E_V32QImode:
21757 if (!TARGET_AVX2)
21758 return false;
21759 c = 0xff;
21760 s = 8;
21761 half_mode = V16HImode;
21762 gen_and = gen_andv16hi3;
21763 gen_pack = gen_avx2_packuswb;
21764 gen_shift = gen_lshrv16hi3;
21765 end_perm = true;
21766 break;
21767 default:
21768 /* Only for V4HI, V8QI, V8HI, V16QI, V16HI and V32QI modes is this
21769 approach more profitable than general shuffles. */
21770 return false;
21771 }
21772
21773 /* Check that permutation is even or odd. */
21774 odd = d->perm[0];
21775 if (odd > 1)
21776 return false;
21777
21778 for (i = 1; i < nelt; ++i)
21779 if (d->perm[i] != 2 * i + odd)
21780 return false;
21781
21782 if (d->testing_p)
21783 return true;
21784
21785 dop0 = gen_reg_rtx (half_mode);
21786 dop1 = gen_reg_rtx (half_mode);
21787 if (odd == 0)
21788 {
21789 t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
21790 t = force_reg (half_mode, t);
21791 emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
21792 emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
21793 }
21794 else
21795 {
21796 emit_insn (gen_shift (dop0,
21797 gen_lowpart (half_mode, d->op0),
21798 GEN_INT (s)));
21799 emit_insn (gen_shift (dop1,
21800 gen_lowpart (half_mode, d->op1),
21801 GEN_INT (s)));
21802 }
21803 /* In the AVX2 256-bit case we need to permute the pack result. */
21804 if (TARGET_AVX2 && end_perm)
21805 {
21806 op = gen_reg_rtx (d->vmode);
21807 t = gen_reg_rtx (V4DImode);
21808 emit_insn (gen_pack (op, dop0, dop1));
21809 emit_insn (gen_avx2_permv4di_1 (t,
21810 gen_lowpart (V4DImode, op),
21811 const0_rtx,
21812 const2_rtx,
21813 const1_rtx,
21814 GEN_INT (3)));
21815 emit_move_insn (d->target, gen_lowpart (d->vmode, t));
21816 }
21817 else
21818 emit_insn (gen_pack (d->target, dop0, dop1));
21819
21820 return true;
21821 }
21822
21823 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
21824 and extract-odd permutations of two V64QImode operands
21825 with two "shift", two "trunc" and one "concat" insns for "odd"
21826 and two "trunc" and one "concat" insn for "even".
21827 We should have already failed all two-instruction sequences. */
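/* Illustrative example (added for exposition): the odd-byte extraction
   of two V64QImode operands views each operand as V32HImode, shifts
   every 16-bit element right by 8 so that the odd bytes land in the
   low byte of each element, truncates both V32HImode vectors to
   V32QImode, and concatenates the two halves; the even variant skips
   the shifts.  */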
21828
21829 static bool
21830 expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
21831 {
21832 rtx t1, t2, t3, t4;
21833 unsigned i, odd, nelt = d->nelt;
21834
21835 if (!TARGET_AVX512BW
21836 || d->one_operand_p
21837 || d->vmode != V64QImode)
21838 return false;
21839
21840 /* Check that permutation is even or odd. */
21841 odd = d->perm[0];
21842 if (odd > 1)
21843 return false;
21844
21845 for (i = 1; i < nelt; ++i)
21846 if (d->perm[i] != 2 * i + odd)
21847 return false;
21848
21849 if (d->testing_p)
21850 return true;
21851
21852
21853 if (odd)
21854 {
21855 t1 = gen_reg_rtx (V32HImode);
21856 t2 = gen_reg_rtx (V32HImode);
21857 emit_insn (gen_lshrv32hi3 (t1,
21858 gen_lowpart (V32HImode, d->op0),
21859 GEN_INT (8)));
21860 emit_insn (gen_lshrv32hi3 (t2,
21861 gen_lowpart (V32HImode, d->op1),
21862 GEN_INT (8)));
21863 }
21864 else
21865 {
21866 t1 = gen_lowpart (V32HImode, d->op0);
21867 t2 = gen_lowpart (V32HImode, d->op1);
21868 }
21869
21870 t3 = gen_reg_rtx (V32QImode);
21871 t4 = gen_reg_rtx (V32QImode);
21872 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
21873 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
21874 emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
21875
21876 return true;
21877 }
21878
21879 /* A subroutine of ix86_expand_vec_perm_const_1. Implement extract-even
21880 and extract-odd permutations. */
21881
21882 static bool
21883 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
21884 {
21885 rtx t1, t2, t3, t4, t5;
21886
21887 switch (d->vmode)
21888 {
21889 case E_V4DFmode:
21890 if (d->testing_p)
21891 break;
21892 t1 = gen_reg_rtx (V4DFmode);
21893 t2 = gen_reg_rtx (V4DFmode);
21894
21895 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
21896 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
21897 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
21898
21899 /* Now an unpck[lh]pd will produce the result required. */
21900 if (odd)
21901 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
21902 else
21903 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
21904 emit_insn (t3);
21905 break;
21906
21907 case E_V8SFmode:
21908 {
21909 int mask = odd ? 0xdd : 0x88;
21910
21911 if (d->testing_p)
21912 break;
21913 t1 = gen_reg_rtx (V8SFmode);
21914 t2 = gen_reg_rtx (V8SFmode);
21915 t3 = gen_reg_rtx (V8SFmode);
21916
21917 /* Shuffle within the 128-bit lanes to produce:
21918 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
21919 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
21920 GEN_INT (mask)));
21921
21922 /* Shuffle the lanes around to produce:
21923 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
21924 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
21925 GEN_INT (0x3)));
21926
21927 /* Shuffle within the 128-bit lanes to produce:
21928 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
21929 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
21930
21931 /* Shuffle within the 128-bit lanes to produce:
21932 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
21933 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
21934
21935 /* Shuffle the lanes around to produce:
21936 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
21937 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
21938 GEN_INT (0x20)));
21939 }
21940 break;
21941
21942 case E_V2DFmode:
21943 case E_V4SFmode:
21944 case E_V2DImode:
21945 case E_V2SImode:
21946 case E_V4SImode:
21947 case E_V2HImode:
21948 /* These are always directly implementable by expand_vec_perm_1. */
21949 gcc_unreachable ();
21950
21951 case E_V2SFmode:
21952 gcc_assert (TARGET_MMX_WITH_SSE);
21953 /* We have no suitable instructions. */
21954 if (d->testing_p)
21955 return false;
21956 break;
21957
21958 case E_V4QImode:
21959 if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
21960 return expand_vec_perm_pshufb2 (d);
21961 else
21962 {
21963 if (d->testing_p)
21964 break;
21965 /* We need 2*log2(N)-1 operations to achieve odd/even
21966 with interleave. */
21967 t1 = gen_reg_rtx (V4QImode);
21968 emit_insn (gen_mmx_punpckhbw_low (t1, d->op0, d->op1));
21969 emit_insn (gen_mmx_punpcklbw_low (d->target, d->op0, d->op1));
21970 if (odd)
21971 t2 = gen_mmx_punpckhbw_low (d->target, d->target, t1);
21972 else
21973 t2 = gen_mmx_punpcklbw_low (d->target, d->target, t1);
21974 emit_insn (t2);
21975 }
21976 break;
21977
21978 case E_V4HImode:
21979 if (TARGET_SSE4_1)
21980 return expand_vec_perm_even_odd_pack (d);
21981 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
21982 return expand_vec_perm_pshufb2 (d);
21983 else
21984 {
21985 if (d->testing_p)
21986 break;
21987 /* We need 2*log2(N)-1 operations to achieve odd/even
21988 with interleave. */
21989 t1 = gen_reg_rtx (V4HImode);
21990 emit_insn (gen_mmx_punpckhwd (t1, d->op0, d->op1));
21991 emit_insn (gen_mmx_punpcklwd (d->target, d->op0, d->op1));
21992 if (odd)
21993 t2 = gen_mmx_punpckhwd (d->target, d->target, t1);
21994 else
21995 t2 = gen_mmx_punpcklwd (d->target, d->target, t1);
21996 emit_insn (t2);
21997 }
21998 break;
21999
22000 case E_V8HImode:
22001 if (TARGET_SSE4_1)
22002 return expand_vec_perm_even_odd_pack (d);
22003 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
22004 return expand_vec_perm_pshufb2 (d);
22005 else
22006 {
22007 if (d->testing_p)
22008 break;
22009 /* We need 2*log2(N)-1 operations to achieve odd/even
22010 with interleave. */
22011 t1 = gen_reg_rtx (V8HImode);
22012 t2 = gen_reg_rtx (V8HImode);
22013 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
22014 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
22015 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
22016 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
22017 if (odd)
22018 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
22019 else
22020 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
22021 emit_insn (t3);
22022 }
22023 break;
22024
22025 case E_V8QImode:
22026 case E_V16QImode:
22027 return expand_vec_perm_even_odd_pack (d);
22028
22029 case E_V16HImode:
22030 case E_V32QImode:
22031 return expand_vec_perm_even_odd_pack (d);
22032
22033 case E_V64QImode:
22034 return expand_vec_perm_even_odd_trunc (d);
22035
22036 case E_V4DImode:
22037 if (!TARGET_AVX2)
22038 {
22039 struct expand_vec_perm_d d_copy = *d;
22040 d_copy.vmode = V4DFmode;
22041 if (d->testing_p)
22042 d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
22043 else
22044 d_copy.target = gen_reg_rtx (V4DFmode);
22045 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
22046 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
22047 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
22048 {
22049 if (!d->testing_p)
22050 emit_move_insn (d->target,
22051 gen_lowpart (V4DImode, d_copy.target));
22052 return true;
22053 }
22054 return false;
22055 }
22056
22057 if (d->testing_p)
22058 break;
22059
22060 t1 = gen_reg_rtx (V4DImode);
22061 t2 = gen_reg_rtx (V4DImode);
22062
22063 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
22064 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
22065 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
22066
22067 /* Now a vpunpck[lh]qdq will produce the result required. */
22068 if (odd)
22069 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
22070 else
22071 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
22072 emit_insn (t3);
22073 break;
22074
22075 case E_V8SImode:
22076 if (!TARGET_AVX2)
22077 {
22078 struct expand_vec_perm_d d_copy = *d;
22079 d_copy.vmode = V8SFmode;
22080 if (d->testing_p)
22081 d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
22082 else
22083 d_copy.target = gen_reg_rtx (V8SFmode);
22084 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
22085 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
22086 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
22087 {
22088 if (!d->testing_p)
22089 emit_move_insn (d->target,
22090 gen_lowpart (V8SImode, d_copy.target));
22091 return true;
22092 }
22093 return false;
22094 }
22095
22096 if (d->testing_p)
22097 break;
22098
22099 t1 = gen_reg_rtx (V8SImode);
22100 t2 = gen_reg_rtx (V8SImode);
22101 t3 = gen_reg_rtx (V4DImode);
22102 t4 = gen_reg_rtx (V4DImode);
22103 t5 = gen_reg_rtx (V4DImode);
22104
22105 /* Shuffle the lanes around into
22106 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
22107 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
22108 gen_lowpart (V4DImode, d->op1),
22109 GEN_INT (0x20)));
22110 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
22111 gen_lowpart (V4DImode, d->op1),
22112 GEN_INT (0x31)));
22113
22114 /* Swap the 2nd and 3rd position in each lane into
22115 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
22116 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
22117 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
22118 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
22119 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
22120
22121 /* Now a vpunpck[lh]qdq will produce
22122 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
22123 if (odd)
22124 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
22125 gen_lowpart (V4DImode, t2));
22126 else
22127 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
22128 gen_lowpart (V4DImode, t2));
22129 emit_insn (t3);
22130 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
22131 break;
22132
22133 default:
22134 gcc_unreachable ();
22135 }
22136
22137 return true;
22138 }
22139
22140 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
22141 extract-even and extract-odd permutations. */
22142
22143 static bool
22144 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
22145 {
22146 unsigned i, odd, nelt = d->nelt;
22147
22148 odd = d->perm[0];
22149 if (odd != 0 && odd != 1)
22150 return false;
22151
22152 for (i = 1; i < nelt; ++i)
22153 if (d->perm[i] != 2 * i + odd)
22154 return false;
22155
22156 if (d->vmode == E_V32HImode
22157 && d->testing_p
22158 && !TARGET_AVX512BW)
22159 return false;
22160
22161 return expand_vec_perm_even_odd_1 (d, odd);
22162 }
22163
22164 /* A subroutine of ix86_expand_vec_perm_const_1. Implement broadcast
22165 permutations. We assume that expand_vec_perm_1 has already failed. */
22166
22167 static bool
22168 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
22169 {
22170 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
22171 machine_mode vmode = d->vmode;
22172 rtx (*gen) (rtx, rtx, rtx);
22173 unsigned char perm2[4];
22174 rtx op0 = d->op0, dest;
22175 bool ok;
22176
22177 switch (vmode)
22178 {
22179 case E_V4DFmode:
22180 case E_V8SFmode:
22181 /* These are special-cased in sse.md so that we can optionally
22182 use the vbroadcast instruction. They expand to two insns
22183 if the input happens to be in a register. */
22184 gcc_unreachable ();
22185
22186 case E_V2DFmode:
22187 case E_V2SFmode:
22188 case E_V4SFmode:
22189 case E_V2DImode:
22190 case E_V2SImode:
22191 case E_V4SImode:
22192 case E_V2HImode:
22193 case E_V4HImode:
22194 /* These are always implementable using standard shuffle patterns. */
22195 gcc_unreachable ();
22196
22197 case E_V4QImode:
22198 /* This can be implemented via interleave and pshuflw. */
22199 if (d->testing_p)
22200 return true;
22201
22202 if (elt >= nelt2)
22203 {
22204 gen = gen_mmx_punpckhbw_low;
22205 elt -= nelt2;
22206 }
22207 else
22208 gen = gen_mmx_punpcklbw_low;
22209
22210 dest = gen_reg_rtx (vmode);
22211 emit_insn (gen (dest, op0, op0));
22212 vmode = get_mode_wider_vector (vmode);
22213 op0 = gen_lowpart (vmode, dest);
22214
22215 memset (perm2, elt, 2);
22216 dest = gen_reg_rtx (vmode);
22217 ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
22218 gcc_assert (ok);
22219
22220 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
22221 return true;
22222
22223 case E_V8QImode:
22224 /* This can be implemented via interleave. We save one insn by
22225 stopping once we have promoted to V2SImode and then use pshufd. */
22226 if (d->testing_p)
22227 return true;
22228 do
22229 {
22230 if (elt >= nelt2)
22231 {
22232 gen = vmode == V8QImode ? gen_mmx_punpckhbw
22233 : gen_mmx_punpckhwd;
22234 elt -= nelt2;
22235 }
22236 else
22237 gen = vmode == V8QImode ? gen_mmx_punpcklbw
22238 : gen_mmx_punpcklwd;
22239 nelt2 /= 2;
22240
22241 dest = gen_reg_rtx (vmode);
22242 emit_insn (gen (dest, op0, op0));
22243 vmode = get_mode_wider_vector (vmode);
22244 op0 = gen_lowpart (vmode, dest);
22245 }
22246 while (vmode != V2SImode);
22247
22248 memset (perm2, elt, 2);
22249 dest = gen_reg_rtx (vmode);
22250 ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
22251 gcc_assert (ok);
22252
22253 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
22254 return true;
22255
22256 case E_V8HImode:
22257 case E_V16QImode:
22258 /* These can be implemented via interleave. We save one insn by
22259 stopping once we have promoted to V4SImode and then use pshufd. */
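/* Illustrative example (added for exposition): broadcasting byte 5 of
   a V16QImode vector first interleaves the low bytes (punpcklbw),
   leaving the value duplicated in halfword 5, then interleaves the
   high halfwords (punpckhwd), leaving it duplicated in dword 1, and
   finally broadcasts dword 1 with pshufd.  */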
22260 if (d->testing_p)
22261 return true;
22262 do
22263 {
22264 if (elt >= nelt2)
22265 {
22266 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
22267 : gen_vec_interleave_highv8hi;
22268 elt -= nelt2;
22269 }
22270 else
22271 gen = vmode == V16QImode ? gen_vec_interleave_lowv16qi
22272 : gen_vec_interleave_lowv8hi;
22273 nelt2 /= 2;
22274
22275 dest = gen_reg_rtx (vmode);
22276 emit_insn (gen (dest, op0, op0));
22277 vmode = get_mode_wider_vector (vmode);
22278 op0 = gen_lowpart (vmode, dest);
22279 }
22280 while (vmode != V4SImode);
22281
22282 memset (perm2, elt, 4);
22283 dest = gen_reg_rtx (vmode);
22284 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
22285 gcc_assert (ok);
22286
22287 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
22288 return true;
22289
22290 case E_V8HFmode:
22291 case E_V8BFmode:
22292 /* This can be implemented via interleave and pshufd. */
22293 if (d->testing_p)
22294 return true;
22295
22296 rtx (*maybe_gen) (machine_mode, int, rtx, rtx, rtx);
22297 if (elt >= nelt2)
22298 {
22299 maybe_gen = maybe_gen_vec_interleave_high;
22300 elt -= nelt2;
22301 }
22302 else
22303 maybe_gen = maybe_gen_vec_interleave_low;
22304 nelt2 /= 2;
22305
22306 dest = gen_reg_rtx (vmode);
22307 emit_insn (maybe_gen (vmode, 1, dest, op0, op0));
22308
22309 vmode = V4SImode;
22310 op0 = gen_lowpart (vmode, dest);
22311
22312 memset (perm2, elt, 4);
22313 dest = gen_reg_rtx (vmode);
22314 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
22315 gcc_assert (ok);
22316
22317 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
22318 return true;
22319
22320 case E_V32QImode:
22321 case E_V16HImode:
22322 case E_V8SImode:
22323 case E_V4DImode:
22324 /* For AVX2 broadcasts of the first element vpbroadcast* or
22325 vpermq should be used by expand_vec_perm_1. */
22326 gcc_assert (!TARGET_AVX2 || d->perm[0]);
22327 return false;
22328
22329 case E_V64QImode:
22330 gcc_assert (!TARGET_AVX512BW || d->perm[0]);
22331 return false;
22332
22333 case E_V32HImode:
22334 gcc_assert (!TARGET_AVX512BW);
22335 return false;
22336
22337 default:
22338 gcc_unreachable ();
22339 }
22340 }
22341
22342 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
22343 broadcast permutations. */
22344
22345 static bool
22346 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
22347 {
22348 unsigned i, elt, nelt = d->nelt;
22349
22350 if (!d->one_operand_p)
22351 return false;
22352
22353 elt = d->perm[0];
22354 for (i = 1; i < nelt; ++i)
22355 if (d->perm[i] != elt)
22356 return false;
22357
22358 return expand_vec_perm_broadcast_1 (d);
22359 }
22360
22361 /* Implement arbitrary permutations of two V64QImode operands
22362 with 2 vperm[it]2w, 2 vpshufb and one vpor instruction. */
22363 static bool
22364 expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
22365 {
22366 if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
22367 return false;
22368
22369 if (d->testing_p)
22370 return true;
22371
22372 struct expand_vec_perm_d ds[2];
22373 rtx rperm[128], vperm, target0, target1;
22374 unsigned int i, nelt;
22375 machine_mode vmode;
22376
22377 nelt = d->nelt;
22378 vmode = V64QImode;
22379
22380 for (i = 0; i < 2; i++)
22381 {
22382 ds[i] = *d;
22383 ds[i].vmode = V32HImode;
22384 ds[i].nelt = 32;
22385 ds[i].target = gen_reg_rtx (V32HImode);
22386 ds[i].op0 = gen_lowpart (V32HImode, d->op0);
22387 ds[i].op1 = gen_lowpart (V32HImode, d->op1);
22388 }
22389
22390 /* Prepare permutations such that the first one (ds[0]) takes care
22391 of putting the even bytes into the right positions or one
22392 position higher, and the second one (ds[1]) takes care of
22393 putting the odd bytes into the right positions or one
22394 position lower. */
22395
22396 for (i = 0; i < nelt; i++)
22397 {
22398 ds[i & 1].perm[i / 2] = d->perm[i] / 2;
22399 if (i & 1)
22400 {
22401 rperm[i] = constm1_rtx;
22402 rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
22403 }
22404 else
22405 {
22406 rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
22407 rperm[i + 64] = constm1_rtx;
22408 }
22409 }
22410
22411 bool ok = expand_vec_perm_1 (&ds[0]);
22412 gcc_assert (ok);
22413 ds[0].target = gen_lowpart (V64QImode, ds[0].target);
22414
22415 ok = expand_vec_perm_1 (&ds[1]);
22416 gcc_assert (ok);
22417 ds[1].target = gen_lowpart (V64QImode, ds[1].target);
22418
22419 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
22420 vperm = force_reg (vmode, vperm);
22421 target0 = gen_reg_rtx (V64QImode);
22422 emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
22423
22424 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
22425 vperm = force_reg (vmode, vperm);
22426 target1 = gen_reg_rtx (V64QImode);
22427 emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
22428
22429 emit_insn (gen_iorv64qi3 (d->target, target0, target1));
22430 return true;
22431 }
22432
22433 /* Implement an arbitrary permutation of two V32QImode or V16HImode operands
22434 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
22435 all the shorter instruction sequences. */
22436
22437 static bool
22438 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
22439 {
22440 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
22441 unsigned int i, nelt, eltsz;
22442 bool used[4];
22443
22444 if (!TARGET_AVX2
22445 || d->one_operand_p
22446 || (d->vmode != V32QImode && d->vmode != V16HImode))
22447 return false;
22448
22449 if (d->testing_p)
22450 return true;
22451
22452 nelt = d->nelt;
22453 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
22454
22455 /* Generate 4 permutation masks. If the required element is within
22456 the same lane, it is shuffled in. If the required element is from the
22457 other lane, force a zero by setting bit 7 in the permutation mask.
22458 The cross-lane masks have a non-negative element wherever an element
22459 is requested from the other lane; that element is also moved to the
22460 other lane, so that the result of vpshufb can have its two V2TImode
22461 halves swapped. */
22462 m128 = GEN_INT (-128);
22463 for (i = 0; i < 32; ++i)
22464 {
22465 rperm[0][i] = m128;
22466 rperm[1][i] = m128;
22467 rperm[2][i] = m128;
22468 rperm[3][i] = m128;
22469 }
22470 used[0] = false;
22471 used[1] = false;
22472 used[2] = false;
22473 used[3] = false;
22474 for (i = 0; i < nelt; ++i)
22475 {
22476 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
22477 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
22478 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
22479
22480 for (j = 0; j < eltsz; ++j)
22481 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
22482 used[which] = true;
22483 }
22484
22485 for (i = 0; i < 2; ++i)
22486 {
22487 if (!used[2 * i + 1])
22488 {
22489 h[i] = NULL_RTX;
22490 continue;
22491 }
22492 vperm = gen_rtx_CONST_VECTOR (V32QImode,
22493 gen_rtvec_v (32, rperm[2 * i + 1]));
22494 vperm = force_reg (V32QImode, vperm);
22495 h[i] = gen_reg_rtx (V32QImode);
22496 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
22497 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
22498 }
22499
22500 /* Swap the 128-bit lanes of h[X]. */
22501 for (i = 0; i < 2; ++i)
22502 {
22503 if (h[i] == NULL_RTX)
22504 continue;
22505 op = gen_reg_rtx (V4DImode);
22506 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
22507 const2_rtx, GEN_INT (3), const0_rtx,
22508 const1_rtx));
22509 h[i] = gen_lowpart (V32QImode, op);
22510 }
22511
22512 for (i = 0; i < 2; ++i)
22513 {
22514 if (!used[2 * i])
22515 {
22516 l[i] = NULL_RTX;
22517 continue;
22518 }
22519 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
22520 vperm = force_reg (V32QImode, vperm);
22521 l[i] = gen_reg_rtx (V32QImode);
22522 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
22523 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
22524 }
22525
22526 for (i = 0; i < 2; ++i)
22527 {
22528 if (h[i] && l[i])
22529 {
22530 op = gen_reg_rtx (V32QImode);
22531 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
22532 l[i] = op;
22533 }
22534 else if (h[i])
22535 l[i] = h[i];
22536 }
22537
22538 gcc_assert (l[0] && l[1]);
22539 op = d->target;
22540 if (d->vmode != V32QImode)
22541 op = gen_reg_rtx (V32QImode);
22542 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
22543 if (op != d->target)
22544 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
22545 return true;
22546 }
22547
22548 /* The guts of ix86_vectorize_vec_perm_const. With all of the interface bits
22549 taken care of, perform the expansion in D and return true on success. */
22550
22551 static bool
22552 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
22553 {
22554 /* Try a single instruction expansion. */
22555 if (expand_vec_perm_1 (d))
22556 return true;
22557
22558 /* Try sequences of two instructions. */
22559
22560 if (expand_vec_perm_pshuflw_pshufhw (d))
22561 return true;
22562
22563 if (expand_vec_perm_palignr (d, false))
22564 return true;
22565
22566 if (expand_vec_perm_interleave2 (d))
22567 return true;
22568
22569 if (expand_vec_perm_broadcast (d))
22570 return true;
22571
22572 if (expand_vec_perm_vpermq_perm_1 (d))
22573 return true;
22574
22575 if (expand_vec_perm_vperm2f128 (d))
22576 return true;
22577
22578 if (expand_vec_perm_pblendv (d))
22579 return true;
22580
22581 if (expand_vec_perm_2perm_interleave (d, true))
22582 return true;
22583
22584 if (expand_vec_perm_2perm_pblendv (d, true))
22585 return true;
22586
22587 if (expand_vec_perm_shufps_shufps (d))
22588 return true;
22589
22590 /* Try sequences of three instructions. */
22591
22592 if (expand_vec_perm_even_odd_pack (d))
22593 return true;
22594
22595 if (expand_vec_perm_2vperm2f128_vshuf (d))
22596 return true;
22597
22598 if (expand_vec_perm_pshufb2 (d))
22599 return true;
22600
22601 if (expand_vec_perm_pslldq_psrldq_por (d, false))
22602 return true;
22603
22604 if (expand_vec_perm_interleave3 (d))
22605 return true;
22606
22607 if (expand_vec_perm_vperm2f128_vblend (d))
22608 return true;
22609
22610 if (expand_vec_perm_2perm_interleave (d, false))
22611 return true;
22612
22613 if (expand_vec_perm_2perm_pblendv (d, false))
22614 return true;
22615
22616 /* Try sequences of four instructions. */
22617
22618 if (expand_vec_perm_even_odd_trunc (d))
22619 return true;
22620 if (expand_vec_perm_vpshufb2_vpermq (d))
22621 return true;
22622
22623 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
22624 return true;
22625
22626 if (expand_vec_perm_vpermt2_vpshub2 (d))
22627 return true;
22628
22629 /* ??? Look for narrow permutations whose element orderings would
22630 allow the promotion to a wider mode. */
22631
22632 /* ??? Look for sequences of interleave or a wider permute that place
22633 the data into the correct lanes for a half-vector shuffle like
22634 pshuf[lh]w or vpermilps. */
22635
22636 /* ??? Look for sequences of interleave that produce the desired results.
22637 The combinatorics of punpck[lh] get pretty ugly... */
22638
22639 if (expand_vec_perm_even_odd (d))
22640 return true;
22641
22642 /* Generate four or five instructions. */
22643 if (expand_vec_perm_pslldq_psrldq_por (d, true))
22644 return true;
22645
22646 /* Even longer sequences. */
22647 if (expand_vec_perm_vpshufb4_vpermq2 (d))
22648 return true;
22649
22650 /* See if we can get the same permutation in different vector integer
22651 mode. */
22652 struct expand_vec_perm_d nd;
22653 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
22654 {
22655 if (!d->testing_p)
22656 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
22657 return true;
22658 }
22659
22660 /* Even longer, including recursion to ix86_expand_vec_perm_const_1. */
22661 if (expand_vec_perm2_vperm2f128_vblend (d))
22662 return true;
22663
22664 return false;
22665 }
22666
22667 /* If a permutation only uses one operand, make that explicit. Return true
22668 if the permutation references both operands. */
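/* Illustrative example (not taken from the sources): with nelt == 4 and
   op0 == op1, the selector { 4, 5, 1, 0 } references both operands, so this
   returns true, but it is folded to { 0, 1, 1, 0 } on a single input so that
   one-operand patterns can match. */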
22669
22670 static bool
22671 canonicalize_perm (struct expand_vec_perm_d *d)
22672 {
22673 int i, which, nelt = d->nelt;
22674
22675 for (i = which = 0; i < nelt; ++i)
22676 which |= (d->perm[i] < nelt ? 1 : 2);
22677
22678 d->one_operand_p = true;
22679 switch (which)
22680 {
22681 default:
22682 gcc_unreachable();
22683
22684 case 3:
22685 if (!rtx_equal_p (d->op0, d->op1))
22686 {
22687 d->one_operand_p = false;
22688 break;
22689 }
22690 /* The elements of PERM do not suggest that only the first operand
22691 is used, but both operands are identical. Allow easier matching
22692 of the permutation by folding the permutation into the single
22693 input vector. */
22694 /* FALLTHRU */
22695
22696 case 2:
22697 for (i = 0; i < nelt; ++i)
22698 d->perm[i] &= nelt - 1;
22699 d->op0 = d->op1;
22700 break;
22701
22702 case 1:
22703 d->op1 = d->op0;
22704 break;
22705 }
22706
22707 return (which == 3);
22708 }
22709
22710 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
22711
22712 bool
22713 ix86_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
22714 rtx target, rtx op0, rtx op1,
22715 const vec_perm_indices &sel)
22716 {
22717 if (vmode != op_mode)
22718 return false;
22719
22720 struct expand_vec_perm_d d;
22721 unsigned char perm[MAX_VECT_LEN];
22722 unsigned int i, nelt, which;
22723 bool two_args;
22724
22725 /* For HF mode vector, convert it to HI using subreg. */
22726 if (GET_MODE_INNER (vmode) == HFmode)
22727 {
22728 machine_mode orig_mode = vmode;
22729 vmode = mode_for_vector (HImode,
22730 GET_MODE_NUNITS (vmode)).require ();
22731 if (target)
22732 target = lowpart_subreg (vmode, target, orig_mode);
22733 if (op0)
22734 op0 = lowpart_subreg (vmode, op0, orig_mode);
22735 if (op1)
22736 op1 = lowpart_subreg (vmode, op1, orig_mode);
22737 }
22738
22739 d.target = target;
22740 d.op0 = op0;
22741 d.op1 = op1;
22742
22743 d.vmode = vmode;
22744 gcc_assert (VECTOR_MODE_P (d.vmode));
22745 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
22746 d.testing_p = !target;
22747
22748 gcc_assert (sel.length () == nelt);
22749 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
22750
22751 /* Given sufficient ISA support we can just return true here
22752 for selected vector modes. */
22753 switch (d.vmode)
22754 {
22755 case E_V16SFmode:
22756 case E_V16SImode:
22757 case E_V8DImode:
22758 case E_V8DFmode:
22759 if (!TARGET_AVX512F)
22760 return false;
22761 /* All implementable with a single vperm[it]2 insn. */
22762 if (d.testing_p)
22763 return true;
22764 break;
22765 case E_V32HImode:
22766 if (!TARGET_AVX512F)
22767 return false;
22768 if (d.testing_p && TARGET_AVX512BW)
22769 /* All implementable with a single vperm[it]2 insn. */
22770 return true;
22771 break;
22772 case E_V64QImode:
22773 if (!TARGET_AVX512F)
22774 return false;
22775 if (d.testing_p && TARGET_AVX512BW)
22776 /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */
22777 return true;
22778 break;
22779 case E_V8SImode:
22780 case E_V8SFmode:
22781 case E_V4DFmode:
22782 case E_V4DImode:
22783 if (!TARGET_AVX)
22784 return false;
22785 if (d.testing_p && TARGET_AVX512VL)
22786 /* All implementable with a single vperm[it]2 insn. */
22787 return true;
22788 break;
22789 case E_V16HImode:
22790 if (!TARGET_SSE2)
22791 return false;
22792 if (d.testing_p && TARGET_AVX2)
22793 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
22794 return true;
22795 break;
22796 case E_V32QImode:
22797 if (!TARGET_SSE2)
22798 return false;
22799 if (d.testing_p && TARGET_AVX2)
22800 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
22801 return true;
22802 break;
22803 case E_V8HImode:
22804 case E_V16QImode:
22805 if (!TARGET_SSE2)
22806 return false;
22807 /* Fall through. */
22808 case E_V4SImode:
22809 case E_V4SFmode:
22810 if (!TARGET_SSE)
22811 return false;
22812 /* All implementable with a single vpperm insn. */
22813 if (d.testing_p && TARGET_XOP)
22814 return true;
22815 /* All implementable with 2 pshufb + 1 ior. */
22816 if (d.testing_p && TARGET_SSSE3)
22817 return true;
22818 break;
22819 case E_V2SFmode:
22820 case E_V2SImode:
22821 case E_V4HImode:
22822 case E_V8QImode:
22823 if (!TARGET_MMX_WITH_SSE)
22824 return false;
22825 break;
22826 case E_V2HImode:
22827 if (!TARGET_SSE2)
22828 return false;
22829 /* All implementable with *punpckwd. */
22830 if (d.testing_p)
22831 return true;
22832 break;
22833 case E_V4QImode:
22834 if (!TARGET_SSE2)
22835 return false;
22836 break;
22837 case E_V2DImode:
22838 case E_V2DFmode:
22839 if (!TARGET_SSE)
22840 return false;
22841 /* All implementable with shufpd or unpck[lh]pd. */
22842 if (d.testing_p)
22843 return true;
22844 break;
22845 default:
22846 return false;
22847 }
22848
22849 for (i = which = 0; i < nelt; ++i)
22850 {
22851 unsigned char e = sel[i];
22852 gcc_assert (e < 2 * nelt);
22853 d.perm[i] = e;
22854 perm[i] = e;
22855 which |= (e < nelt ? 1 : 2);
22856 }
22857
22858 if (d.testing_p)
22859 {
22860 /* If all elements come from the second vector, fold them into the first. */
22861 if (which == 2)
22862 for (i = 0; i < nelt; ++i)
22863 d.perm[i] -= nelt;
22864
22865 /* Check whether the mask can be applied to the vector type. */
22866 d.one_operand_p = (which != 3);
22867
22868 /* Implementable with shufps, pshufd or pshuflw. */
22869 if (d.one_operand_p
22870 && (d.vmode == V4SFmode || d.vmode == V2SFmode
22871 || d.vmode == V4SImode || d.vmode == V2SImode
22872 || d.vmode == V4HImode || d.vmode == V2HImode))
22873 return true;
22874
22875 /* Otherwise we have to go through the motions and see if we can
22876 figure out how to generate the requested permutation. */
22877 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
22878 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
22879 if (!d.one_operand_p)
22880 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
22881
22882 start_sequence ();
22883 bool ret = ix86_expand_vec_perm_const_1 (&d);
22884 end_sequence ();
22885
22886 return ret;
22887 }
22888
22889 two_args = canonicalize_perm (&d);
22890
22891 /* If one of the operands is a zero vector, try to match pmovzx. */
22892 if (two_args && (d.op0 == CONST0_RTX (vmode) || d.op1 == CONST0_RTX (vmode)))
22893 {
22894 struct expand_vec_perm_d dzero = d;
22895 if (d.op0 == CONST0_RTX (vmode))
22896 {
22897 d.op1 = dzero.op1 = force_reg (vmode, d.op1);
22898 std::swap (dzero.op0, dzero.op1);
22899 for (i = 0; i < nelt; ++i)
22900 dzero.perm[i] ^= nelt;
22901 }
22902 else
22903 d.op0 = dzero.op0 = force_reg (vmode, d.op0);
22904
22905 if (expand_vselect_vconcat (dzero.target, dzero.op0, dzero.op1,
22906 dzero.perm, nelt, dzero.testing_p))
22907 return true;
22908 }
22909
22910 /* Force operands into registers. */
22911 rtx nop0 = force_reg (vmode, d.op0);
22912 if (d.op0 == d.op1)
22913 d.op1 = nop0;
22914 d.op0 = nop0;
22915 d.op1 = force_reg (vmode, d.op1);
22916
22917 if (ix86_expand_vec_perm_const_1 (&d))
22918 return true;
22919
22920 /* If the selector says both arguments are needed, but the operands are the
22921 same, the above tried to expand with one_operand_p and flattened selector.
22922 If that didn't work, retry without one_operand_p; we succeeded with that
22923 during testing. */
22924 if (two_args && d.one_operand_p)
22925 {
22926 d.one_operand_p = false;
22927 memcpy (d.perm, perm, sizeof (perm));
22928 return ix86_expand_vec_perm_const_1 (&d);
22929 }
22930
22931 return false;
22932 }
22933
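/* Extract the even (ODD == 0) or odd (ODD == 1) elements of the concatenation
   of OP0 and OP1 into TARG. */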
22934 void
22935 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
22936 {
22937 struct expand_vec_perm_d d;
22938 unsigned i, nelt;
22939
22940 d.target = targ;
22941 d.op0 = op0;
22942 d.op1 = op1;
22943 d.vmode = GET_MODE (targ);
22944 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
22945 d.one_operand_p = false;
22946 d.testing_p = false;
22947
22948 for (i = 0; i < nelt; ++i)
22949 d.perm[i] = i * 2 + odd;
22950
22951 /* We'll either be able to implement the permutation directly... */
22952 if (expand_vec_perm_1 (&d))
22953 return;
22954
22955 /* ... or we use the special-case patterns. */
22956 expand_vec_perm_even_odd_1 (&d, odd);
22957 }
22958
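/* Interleave the elements of OP0 and OP1 into TARG, taking the low halves
   (or the high halves when HIGH_P), i.e.
   TARG = { op0[base], op1[base], op0[base+1], op1[base+1], ... }
   with base either 0 or nelt/2. */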
22959 static void
22960 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
22961 {
22962 struct expand_vec_perm_d d;
22963 unsigned i, nelt, base;
22964 bool ok;
22965
22966 d.target = targ;
22967 d.op0 = op0;
22968 d.op1 = op1;
22969 d.vmode = GET_MODE (targ);
22970 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
22971 d.one_operand_p = false;
22972 d.testing_p = false;
22973
22974 base = high_p ? nelt / 2 : 0;
22975 for (i = 0; i < nelt / 2; ++i)
22976 {
22977 d.perm[i * 2] = i + base;
22978 d.perm[i * 2 + 1] = i + base + nelt;
22979 }
22980
22981 /* Note that for AVX this isn't one instruction. */
22982 ok = ix86_expand_vec_perm_const_1 (&d);
22983 gcc_assert (ok);
22984 }
22985
22986 /* This function is similar to ix86_expand_vecop_qihi,
22987 but optimized under AVX512BW by using vpmovwb.
22988 For example, optimize vector MUL generation like
22989
22990 vpmovzxbw ymm2, xmm0
22991 vpmovzxbw ymm3, xmm1
22992 vpmullw ymm4, ymm2, ymm3
22993 vpmovwb xmm0, ymm4
22994
22995 which takes fewer instructions than ix86_expand_vecop_qihi.
22996 Return true on success. */
22997
22998 static bool
22999 ix86_expand_vecop_qihi2 (enum rtx_code code, rtx dest, rtx op1, rtx op2)
23000 {
23001 machine_mode himode, qimode = GET_MODE (dest);
23002 rtx hop1, hop2, hdest;
23003 rtx (*gen_extend)(rtx, rtx);
23004 rtx (*gen_truncate)(rtx, rtx);
23005 bool uns_p = (code == ASHIFTRT) ? false : true;
23006
23007 /* There's no V64HImode multiplication instruction. */
23008 if (qimode == E_V64QImode)
23009 return false;
23010
23011 /* vpmovwb is only available under AVX512BW. */
23012 if (!TARGET_AVX512BW)
23013 return false;
23014 if ((qimode == V8QImode || qimode == V16QImode)
23015 && !TARGET_AVX512VL)
23016 return false;
23017 /* Do not generate zmm instructions when a 128/256-bit vector width is preferred. */
23018 if (qimode == V32QImode
23019 && (TARGET_PREFER_AVX128 || TARGET_PREFER_AVX256))
23020 return false;
23021
23022 switch (qimode)
23023 {
23024 case E_V8QImode:
23025 himode = V8HImode;
23026 gen_extend = uns_p ? gen_zero_extendv8qiv8hi2 : gen_extendv8qiv8hi2;
23027 gen_truncate = gen_truncv8hiv8qi2;
23028 break;
23029 case E_V16QImode:
23030 himode = V16HImode;
23031 gen_extend = uns_p ? gen_zero_extendv16qiv16hi2 : gen_extendv16qiv16hi2;
23032 gen_truncate = gen_truncv16hiv16qi2;
23033 break;
23034 case E_V32QImode:
23035 himode = V32HImode;
23036 gen_extend = uns_p ? gen_zero_extendv32qiv32hi2 : gen_extendv32qiv32hi2;
23037 gen_truncate = gen_truncv32hiv32qi2;
23038 break;
23039 default:
23040 gcc_unreachable ();
23041 }
23042
23043 hop1 = gen_reg_rtx (himode);
23044 hop2 = gen_reg_rtx (himode);
23045 hdest = gen_reg_rtx (himode);
23046 emit_insn (gen_extend (hop1, op1));
23047 emit_insn (gen_extend (hop2, op2));
23048 emit_insn (gen_rtx_SET (hdest, simplify_gen_binary (code, himode,
23049 hop1, hop2)));
23050 emit_insn (gen_truncate (dest, hdest));
23051 return true;
23052 }
23053
23054 /* Expand a vector shift by a constant for a V*QImode vector in terms of the
23055 same operation on V*HImode. Return true on success. */
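/* A worked example (illustrative only, not emitted verbatim): for an
   arithmetic right shift of a V16QImode vector by 3 the expansion below is
   roughly
     psrlw $3, %xmm0 ; shift as 16-bit elements
     pand {0x1f,...}, %xmm0 ; and_constant = (1 << (8 - 3)) - 1
     pxor {0x10,...}, %xmm0 ; xor_constant = 1 << (8 - 3 - 1)
     psubb {0x10,...}, %xmm0 ; sign-extend the 5-bit results
   since there is no per-byte variant of psra. */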
23056 static bool
23057 ix86_expand_vec_shift_qihi_constant (enum rtx_code code,
23058 rtx dest, rtx op1, rtx op2)
23059 {
23060 machine_mode qimode, himode;
23061 HOST_WIDE_INT and_constant, xor_constant;
23062 HOST_WIDE_INT shift_amount;
23063 rtx vec_const_and, vec_const_xor;
23064 rtx tmp, op1_subreg;
23065 rtx (*gen_shift) (rtx, rtx, rtx);
23066 rtx (*gen_and) (rtx, rtx, rtx);
23067 rtx (*gen_xor) (rtx, rtx, rtx);
23068 rtx (*gen_sub) (rtx, rtx, rtx);
23069
23070 /* Only optimize shift by constant. */
23071 if (!CONST_INT_P (op2))
23072 return false;
23073
23074 qimode = GET_MODE (dest);
23075 shift_amount = INTVAL (op2);
23076 /* Do nothing when the shift amount is greater than or equal to 8. */
23077 if (shift_amount > 7)
23078 return false;
23079
23080 gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
23081 /* Record sign bit. */
23082 xor_constant = 1 << (8 - shift_amount - 1);
23083
23084 /* Mask to zero the bits shifted in from the neighboring element: the low bits for a left shift, the high bits for a right shift. */
23085 and_constant
23086 = (code == ASHIFT ? 256 - (1 << shift_amount)
23087 : (1 << (8 - shift_amount)) - 1);
23088
23089 switch (qimode)
23090 {
23091 case V16QImode:
23092 himode = V8HImode;
23093 gen_shift =
23094 ((code == ASHIFT)
23095 ? gen_ashlv8hi3
23096 : (code == ASHIFTRT) ? gen_ashrv8hi3 : gen_lshrv8hi3);
23097 gen_and = gen_andv16qi3;
23098 gen_xor = gen_xorv16qi3;
23099 gen_sub = gen_subv16qi3;
23100 break;
23101 case V32QImode:
23102 himode = V16HImode;
23103 gen_shift =
23104 ((code == ASHIFT)
23105 ? gen_ashlv16hi3
23106 : (code == ASHIFTRT) ? gen_ashrv16hi3 : gen_lshrv16hi3);
23107 gen_and = gen_andv32qi3;
23108 gen_xor = gen_xorv32qi3;
23109 gen_sub = gen_subv32qi3;
23110 break;
23111 case V64QImode:
23112 himode = V32HImode;
23113 gen_shift =
23114 ((code == ASHIFT)
23115 ? gen_ashlv32hi3
23116 : (code == ASHIFTRT) ? gen_ashrv32hi3 : gen_lshrv32hi3);
23117 gen_and = gen_andv64qi3;
23118 gen_xor = gen_xorv64qi3;
23119 gen_sub = gen_subv64qi3;
23120 break;
23121 default:
23122 gcc_unreachable ();
23123 }
23124
23125 tmp = gen_reg_rtx (himode);
23126 vec_const_and = gen_reg_rtx (qimode);
23127 op1_subreg = lowpart_subreg (himode, op1, qimode);
23128
23129 /* For ASHIFT and LSHIFTRT, perform operation like
23130 vpsllw/vpsrlw $shift_amount, %op1, %dest.
23131 vpand %vec_const_and, %dest. */
23132 emit_insn (gen_shift (tmp, op1_subreg, op2));
23133 emit_move_insn (dest, simplify_gen_subreg (qimode, tmp, himode, 0));
23134 emit_move_insn (vec_const_and,
23135 ix86_build_const_vector (qimode, true,
23136 gen_int_mode (and_constant, QImode)));
23137 emit_insn (gen_and (dest, dest, vec_const_and));
23138
23139 /* For ASHIFTRT, perform extra operation like
23140 vpxor %vec_const_xor, %dest, %dest
23141 vpsubb %vec_const_xor, %dest, %dest */
23142 if (code == ASHIFTRT)
23143 {
23144 vec_const_xor = gen_reg_rtx (qimode);
23145 emit_move_insn (vec_const_xor,
23146 ix86_build_const_vector (qimode, true,
23147 gen_int_mode (xor_constant, QImode)));
23148 emit_insn (gen_xor (dest, dest, vec_const_xor));
23149 emit_insn (gen_sub (dest, dest, vec_const_xor));
23150 }
23151 return true;
23152 }
23153
23154 /* Expand a vector operation CODE for a V*QImode in terms of the
23155 same operation on V*HImode. */
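/* Informal sketch of the strategy used below: widen each QImode operand to
   HImode halves (interleaving with itself for MULT, sign/zero unpacking for
   shifts), perform the operation in HImode, then permute the low byte of
   each HImode result back into a single QImode vector. */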
23156
23157 void
23158 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
23159 {
23160 machine_mode qimode = GET_MODE (dest);
23161 machine_mode himode;
23162 rtx (*gen_il) (rtx, rtx, rtx);
23163 rtx (*gen_ih) (rtx, rtx, rtx);
23164 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
23165 struct expand_vec_perm_d d;
23166 bool ok, full_interleave;
23167 bool uns_p = false;
23168 int i;
23169
23170 if (CONST_INT_P (op2)
23171 && (code == ASHIFT || code == LSHIFTRT || code == ASHIFTRT)
23172 && ix86_expand_vec_shift_qihi_constant (code, dest, op1, op2))
23173 return;
23174
23175 if (TARGET_AVX512BW
23176 && VECTOR_MODE_P (GET_MODE (op2))
23177 && ix86_expand_vecop_qihi2 (code, dest, op1, op2))
23178 return;
23179
23180 switch (qimode)
23181 {
23182 case E_V16QImode:
23183 himode = V8HImode;
23184 gen_il = gen_vec_interleave_lowv16qi;
23185 gen_ih = gen_vec_interleave_highv16qi;
23186 break;
23187 case E_V32QImode:
23188 himode = V16HImode;
23189 gen_il = gen_avx2_interleave_lowv32qi;
23190 gen_ih = gen_avx2_interleave_highv32qi;
23191 break;
23192 case E_V64QImode:
23193 himode = V32HImode;
23194 gen_il = gen_avx512bw_interleave_lowv64qi;
23195 gen_ih = gen_avx512bw_interleave_highv64qi;
23196 break;
23197 default:
23198 gcc_unreachable ();
23199 }
23200
23201 switch (code)
23202 {
23203 case MULT:
23204 /* Unpack data such that we've got a source byte in each low byte of
23205 each word. We don't care what goes into the high byte of each word.
23206 Rather than trying to get zero in there, most convenient is to let
23207 it be a copy of the low byte. */
23208 op2_l = gen_reg_rtx (qimode);
23209 op2_h = gen_reg_rtx (qimode);
23210 emit_insn (gen_il (op2_l, op2, op2));
23211 emit_insn (gen_ih (op2_h, op2, op2));
23212
23213 op1_l = gen_reg_rtx (qimode);
23214 op1_h = gen_reg_rtx (qimode);
23215 emit_insn (gen_il (op1_l, op1, op1));
23216 emit_insn (gen_ih (op1_h, op1, op1));
23217 full_interleave = qimode == V16QImode;
23218 break;
23219
23220 case ASHIFT:
23221 case LSHIFTRT:
23222 uns_p = true;
23223 /* FALLTHRU */
23224 case ASHIFTRT:
23225 op1_l = gen_reg_rtx (himode);
23226 op1_h = gen_reg_rtx (himode);
23227 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
23228 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
23229 /* vashr/vlshr/vashl */
23230 if (GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT)
23231 {
23232 rtx tmp = force_reg (qimode, op2);
23233 op2_l = gen_reg_rtx (himode);
23234 op2_h = gen_reg_rtx (himode);
23235 ix86_expand_sse_unpack (op2_l, tmp, uns_p, false);
23236 ix86_expand_sse_unpack (op2_h, tmp, uns_p, true);
23237 }
23238 else
23239 op2_l = op2_h = op2;
23240
23241 full_interleave = true;
23242 break;
23243 default:
23244 gcc_unreachable ();
23245 }
23246
23247 /* Perform vashr/vlshr/vashl. */
23248 if (code != MULT
23249 && GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT)
23250 {
23251 res_l = gen_reg_rtx (himode);
23252 res_h = gen_reg_rtx (himode);
23253 emit_insn (gen_rtx_SET (res_l,
23254 simplify_gen_binary (code, himode,
23255 op1_l, op2_l)));
23256 emit_insn (gen_rtx_SET (res_h,
23257 simplify_gen_binary (code, himode,
23258 op1_h, op2_h)));
23259 }
23260 /* Perform mult/ashr/lshr/ashl. */
23261 else
23262 {
23263 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
23264 1, OPTAB_DIRECT);
23265 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
23266 1, OPTAB_DIRECT);
23267 }
23268
23269 gcc_assert (res_l && res_h);
23270
23271 /* Merge the data back into the right place. */
23272 d.target = dest;
23273 d.op0 = gen_lowpart (qimode, res_l);
23274 d.op1 = gen_lowpart (qimode, res_h);
23275 d.vmode = qimode;
23276 d.nelt = GET_MODE_NUNITS (qimode);
23277 d.one_operand_p = false;
23278 d.testing_p = false;
23279
23280 if (full_interleave)
23281 {
23282 /* For SSE2, we used a full interleave, so the desired
23283 results are in the even elements. */
23284 for (i = 0; i < d.nelt; ++i)
23285 d.perm[i] = i * 2;
23286 }
23287 else
23288 {
23289 /* For AVX, the interleave used above was not cross-lane. So the
23290 extraction is evens but with the second and third quarter swapped.
23291 Happily, that is even one insn shorter than even extraction.
23292 For AVX512BW we have 4 lanes. We extract evens from within a lane,
23293 always first from the first and then from the second source operand,
23294 the index bits above the low 4 bits remain the same.
23295 Thus, for d.nelt == 32 we want permutation
23296 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
23297 and for d.nelt == 64 we want permutation
23298 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
23299 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
23300 for (i = 0; i < d.nelt; ++i)
23301 d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
23302 }
23303
23304 ok = ix86_expand_vec_perm_const_1 (&d);
23305 gcc_assert (ok);
23306
23307 set_unique_reg_note (get_last_insn (), REG_EQUAL,
23308 gen_rtx_fmt_ee (code, qimode, op1, op2));
23309 }
23310
23311 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
23312 if op is CONST_VECTOR with all odd elements equal to their
23313 preceding element. */
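/* For instance, { 3, 3, 7, 7 } satisfies this predicate while
   { 3, 4, 7, 7 } does not. */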
23314
23315 static bool
23316 const_vector_equal_evenodd_p (rtx op)
23317 {
23318 machine_mode mode = GET_MODE (op);
23319 int i, nunits = GET_MODE_NUNITS (mode);
23320 if (GET_CODE (op) != CONST_VECTOR
23321 || nunits != CONST_VECTOR_NUNITS (op))
23322 return false;
23323 for (i = 0; i < nunits; i += 2)
23324 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
23325 return false;
23326 return true;
23327 }
23328
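/* Multiply the even SImode elements of OP1 and OP2 (or the odd elements when
   ODD_P) into the double-width elements of DEST. UNS_P selects an unsigned
   rather than a signed widening multiply. */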
23329 void
23330 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
23331 bool uns_p, bool odd_p)
23332 {
23333 machine_mode mode = GET_MODE (op1);
23334 machine_mode wmode = GET_MODE (dest);
23335 rtx x;
23336 rtx orig_op1 = op1, orig_op2 = op2;
23337
23338 if (!nonimmediate_operand (op1, mode))
23339 op1 = force_reg (mode, op1);
23340 if (!nonimmediate_operand (op2, mode))
23341 op2 = force_reg (mode, op2);
23342
23343 /* We only play even/odd games with vectors of SImode. */
23344 gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
23345
23346 /* If we're looking for the odd results, shift those members down to
23347 the even slots. For some cpus this is faster than a PSHUFD. */
23348 if (odd_p)
23349 {
23350 /* For XOP use vpmacsdqh, but only for smult, as it is only
23351 signed. */
23352 if (TARGET_XOP && mode == V4SImode && !uns_p)
23353 {
23354 x = force_reg (wmode, CONST0_RTX (wmode));
23355 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
23356 return;
23357 }
23358
23359 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
23360 if (!const_vector_equal_evenodd_p (orig_op1))
23361 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
23362 x, NULL, 1, OPTAB_DIRECT);
23363 if (!const_vector_equal_evenodd_p (orig_op2))
23364 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
23365 x, NULL, 1, OPTAB_DIRECT);
23366 op1 = gen_lowpart (mode, op1);
23367 op2 = gen_lowpart (mode, op2);
23368 }
23369
23370 if (mode == V16SImode)
23371 {
23372 if (uns_p)
23373 x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
23374 else
23375 x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
23376 }
23377 else if (mode == V8SImode)
23378 {
23379 if (uns_p)
23380 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
23381 else
23382 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
23383 }
23384 else if (uns_p)
23385 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
23386 else if (TARGET_SSE4_1)
23387 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
23388 else
23389 {
23390 rtx s1, s2, t0, t1, t2;
23391
23392 /* The easiest way to implement this without PMULDQ is to go through
23393 the motions as if we are performing a full 64-bit multiply. With
23394 the exception that we need to do less shuffling of the elements. */
23395
23396 /* Compute the sign-extension, aka highparts, of the two operands. */
23397 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
23398 op1, pc_rtx, pc_rtx);
23399 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
23400 op2, pc_rtx, pc_rtx);
23401
23402 /* Multiply LO(A) * HI(B), and vice-versa. */
23403 t1 = gen_reg_rtx (wmode);
23404 t2 = gen_reg_rtx (wmode);
23405 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
23406 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
23407
23408 /* Multiply LO(A) * LO(B). */
23409 t0 = gen_reg_rtx (wmode);
23410 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
23411
23412 /* Combine and shift the highparts into place. */
23413 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
23414 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
23415 1, OPTAB_DIRECT);
23416
23417 /* Combine high and low parts. */
23418 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
23419 return;
23420 }
23421 emit_insn (x);
23422 }
23423
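/* Widening multiply of the low (or, when HIGH_P, the high) half of the
   elements of OP1 and OP2 into DEST, implemented by shuffling the inputs so
   that the even/odd widening multiply above can be reused. */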
23424 void
23425 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
23426 bool uns_p, bool high_p)
23427 {
23428 machine_mode wmode = GET_MODE (dest);
23429 machine_mode mode = GET_MODE (op1);
23430 rtx t1, t2, t3, t4, mask;
23431
23432 switch (mode)
23433 {
23434 case E_V4SImode:
23435 t1 = gen_reg_rtx (mode);
23436 t2 = gen_reg_rtx (mode);
23437 if (TARGET_XOP && !uns_p)
23438 {
23439 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
23440 shuffle the elements once so that all elements are in the right
23441 place for immediate use: { A C B D }. */
23442 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
23443 const1_rtx, GEN_INT (3)));
23444 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
23445 const1_rtx, GEN_INT (3)));
23446 }
23447 else
23448 {
23449 /* Put the elements into place for the multiply. */
23450 ix86_expand_vec_interleave (t1, op1, op1, high_p);
23451 ix86_expand_vec_interleave (t2, op2, op2, high_p);
23452 high_p = false;
23453 }
23454 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
23455 break;
23456
23457 case E_V8SImode:
23458 /* Shuffle the elements between the lanes. After this we
23459 have { A B E F | C D G H } for each operand. */
23460 t1 = gen_reg_rtx (V4DImode);
23461 t2 = gen_reg_rtx (V4DImode);
23462 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
23463 const0_rtx, const2_rtx,
23464 const1_rtx, GEN_INT (3)));
23465 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
23466 const0_rtx, const2_rtx,
23467 const1_rtx, GEN_INT (3)));
23468
23469 /* Shuffle the elements within the lanes. After this we
23470 have { A A B B | C C D D } or { E E F F | G G H H }. */
23471 t3 = gen_reg_rtx (V8SImode);
23472 t4 = gen_reg_rtx (V8SImode);
23473 mask = GEN_INT (high_p
23474 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
23475 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
23476 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
23477 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
23478
23479 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
23480 break;
23481
23482 case E_V8HImode:
23483 case E_V16HImode:
23484 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
23485 uns_p, OPTAB_DIRECT);
23486 t2 = expand_binop (mode,
23487 uns_p ? umul_highpart_optab : smul_highpart_optab,
23488 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
23489 gcc_assert (t1 && t2);
23490
23491 t3 = gen_reg_rtx (mode);
23492 ix86_expand_vec_interleave (t3, t1, t2, high_p);
23493 emit_move_insn (dest, gen_lowpart (wmode, t3));
23494 break;
23495
23496 case E_V16QImode:
23497 case E_V32QImode:
23498 case E_V32HImode:
23499 case E_V16SImode:
23500 case E_V64QImode:
23501 t1 = gen_reg_rtx (wmode);
23502 t2 = gen_reg_rtx (wmode);
23503 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
23504 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
23505
23506 emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
23507 break;
23508
23509 default:
23510 gcc_unreachable ();
23511 }
23512 }
23513
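/* Expand OP0 = OP1 * OP2 in V4SImode using only SSE2 (no pmulld): multiply
   the even and the odd element pairs with pmuludq, then shuffle the low
   halves of the 64-bit products back together with an interleave. */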
23514 void
23515 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
23516 {
23517 rtx res_1, res_2, res_3, res_4;
23518
23519 res_1 = gen_reg_rtx (V4SImode);
23520 res_2 = gen_reg_rtx (V4SImode);
23521 res_3 = gen_reg_rtx (V2DImode);
23522 res_4 = gen_reg_rtx (V2DImode);
23523 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
23524 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
23525
23526 /* Move the results in element 2 down to element 1; we don't care
23527 what goes in elements 2 and 3. Then we can merge the parts
23528 back together with an interleave.
23529
23530 Note that two other sequences were tried:
23531 (1) Use interleaves at the start instead of psrldq, which allows
23532 us to use a single shufps to merge things back at the end.
23533 (2) Use shufps here to combine the two vectors, then pshufd to
23534 put the elements in the correct order.
23535 In both cases the cost of the reformatting stall was too high
23536 and the overall sequence slower. */
23537
23538 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
23539 const0_rtx, const2_rtx,
23540 const0_rtx, const0_rtx));
23541 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
23542 const0_rtx, const2_rtx,
23543 const0_rtx, const0_rtx));
23544 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
23545
23546 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
23547 }
23548
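/* Expand OP0 = OP1 * OP2 for V2DI/V4DI/V8DImode. Where AVX512DQ (and, for
   the narrower modes, AVX512VL) is available a single vpmullq is used;
   under XOP a phadddq-based sequence handles V2DImode; otherwise the
   product is built as lo*lo + ((lo*hi + hi*lo) << 32) using pmuludq. */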
23549 void
23550 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
23551 {
23552 machine_mode mode = GET_MODE (op0);
23553 rtx t1, t2, t3, t4, t5, t6;
23554
23555 if (TARGET_AVX512DQ && mode == V8DImode)
23556 emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
23557 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
23558 emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
23559 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
23560 emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
23561 else if (TARGET_XOP && mode == V2DImode)
23562 {
23563 /* op1: A,B,C,D, op2: E,F,G,H */
23564 op1 = gen_lowpart (V4SImode, op1);
23565 op2 = gen_lowpart (V4SImode, op2);
23566
23567 t1 = gen_reg_rtx (V4SImode);
23568 t2 = gen_reg_rtx (V4SImode);
23569 t3 = gen_reg_rtx (V2DImode);
23570 t4 = gen_reg_rtx (V2DImode);
23571
23572 /* t1: B,A,D,C */
23573 emit_insn (gen_sse2_pshufd_1 (t1, op1,
23574 GEN_INT (1),
23575 GEN_INT (0),
23576 GEN_INT (3),
23577 GEN_INT (2)));
23578
23579 /* t2: (B*E),(A*F),(D*G),(C*H) */
23580 emit_insn (gen_mulv4si3 (t2, t1, op2));
23581
23582 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
23583 emit_insn (gen_xop_phadddq (t3, t2));
23584
23585 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
23586 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
23587
23588 /* Multiply the lower parts and add everything together. */
23589 t5 = gen_reg_rtx (V2DImode);
23590 emit_insn (gen_vec_widen_umult_even_v4si (t5,
23591 gen_lowpart (V4SImode, op1),
23592 gen_lowpart (V4SImode, op2)));
23593 force_expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
23594 }
23595 else
23596 {
23597 machine_mode nmode;
23598 rtx (*umul) (rtx, rtx, rtx);
23599
23600 if (mode == V2DImode)
23601 {
23602 umul = gen_vec_widen_umult_even_v4si;
23603 nmode = V4SImode;
23604 }
23605 else if (mode == V4DImode)
23606 {
23607 umul = gen_vec_widen_umult_even_v8si;
23608 nmode = V8SImode;
23609 }
23610 else if (mode == V8DImode)
23611 {
23612 umul = gen_vec_widen_umult_even_v16si;
23613 nmode = V16SImode;
23614 }
23615 else
23616 gcc_unreachable ();
23617
23618
23619 /* Multiply low parts. */
23620 t1 = gen_reg_rtx (mode);
23621 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
23622
23623 /* Shift input vectors right 32 bits so we can multiply high parts. */
23624 t6 = GEN_INT (32);
23625 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
23626 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
23627
23628 /* Multiply high parts by low parts. */
23629 t4 = gen_reg_rtx (mode);
23630 t5 = gen_reg_rtx (mode);
23631 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
23632 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
23633
23634 /* Combine and shift the highparts back. */
23635 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
23636 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
23637
23638 /* Combine high and low parts. */
23639 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
23640 }
23641
23642 set_unique_reg_note (get_last_insn (), REG_EQUAL,
23643 gen_rtx_MULT (mode, op1, op2));
23644 }
23645
23646 /* Return true if the control transfer instruction INSN
23647 should be encoded with the notrack prefix. */
23648
23649 bool
23650 ix86_notrack_prefixed_insn_p (rtx_insn *insn)
23651 {
23652 if (!insn || !((flag_cf_protection & CF_BRANCH)))
23653 return false;
23654
23655 if (CALL_P (insn))
23656 {
23657 rtx call = get_call_rtx_from (insn);
23658 gcc_assert (call != NULL_RTX);
23659 rtx addr = XEXP (call, 0);
23660
23661 /* Do not emit 'notrack' if it's not an indirect call. */
23662 if (MEM_P (addr)
23663 && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
23664 return false;
23665 else
23666 return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
23667 }
23668
23669 if (JUMP_P (insn) && !flag_cet_switch)
23670 {
23671 rtx target = JUMP_LABEL (insn);
23672 if (target == NULL_RTX || ANY_RETURN_P (target))
23673 return false;
23674
23675 /* Check whether the jump is a switch table jump. */
23676 rtx_insn *label = as_a<rtx_insn *> (target);
23677 rtx_insn *table = next_insn (label);
23678 if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
23679 return false;
23680 else
23681 return true;
23682 }
23683 return false;
23684 }
23685
23686 /* Calculate integer abs() using only SSE2 instructions. */
23687
23688 void
23689 ix86_expand_sse2_abs (rtx target, rtx input)
23690 {
23691 machine_mode mode = GET_MODE (target);
23692 rtx tmp0, tmp1, x;
23693
23694 switch (mode)
23695 {
23696 case E_V2DImode:
23697 case E_V4DImode:
23698 /* For 64-bit signed integer X, with SSE4.2 use
23699 pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
23700 Otherwise handle it similarly to V4SImode, except use 64 as W instead of
23701 32 and use logical instead of arithmetic right shift (which is
23702 unimplemented) and subtract. */
23703 if (TARGET_SSE4_2)
23704 {
23705 tmp0 = gen_reg_rtx (mode);
23706 tmp1 = gen_reg_rtx (mode);
23707 emit_move_insn (tmp1, CONST0_RTX (mode));
23708 if (mode == E_V2DImode)
23709 emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input));
23710 else
23711 emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input));
23712 }
23713 else
23714 {
23715 tmp0 = expand_simple_binop (mode, LSHIFTRT, input,
23716 GEN_INT (GET_MODE_UNIT_BITSIZE (mode)
23717 - 1), NULL, 0, OPTAB_DIRECT);
23718 tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false);
23719 }
23720
23721 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
23722 NULL, 0, OPTAB_DIRECT);
23723 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
23724 target, 0, OPTAB_DIRECT);
23725 break;
23726
23727 case E_V4SImode:
23728 /* For 32-bit signed integer X, the best way to calculate the absolute
23729 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
23730 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
23731 GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
23732 NULL, 0, OPTAB_DIRECT);
23733 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
23734 NULL, 0, OPTAB_DIRECT);
23735 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
23736 target, 0, OPTAB_DIRECT);
23737 break;
23738
23739 case E_V8HImode:
23740 /* For 16-bit signed integer X, the best way to calculate the absolute
23741 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
23742 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
23743
23744 x = expand_simple_binop (mode, SMAX, tmp0, input,
23745 target, 0, OPTAB_DIRECT);
23746 break;
23747
23748 case E_V16QImode:
23749 /* For 8-bit signed integer X, the best way to calculate the absolute
23750 value of X is min ((unsigned char) X, (unsigned char) (-X)),
23751 as SSE2 provides the PMINUB insn. */
23752 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
23753
23754 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
23755 target, 0, OPTAB_DIRECT);
23756 break;
23757
23758 default:
23759 gcc_unreachable ();
23760 }
23761
23762 if (x != target)
23763 emit_move_insn (target, x);
23764 }
23765
23766 /* Expand an extract from a vector register through pextr insn.
23767 Return true if successful. */
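/* For instance (illustrative), extracting a 16-bit field at bit position 32
   from a V8HImode source selects lane pos / size == 2 of a pextrw. */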
23768
23769 bool
23770 ix86_expand_pextr (rtx *operands)
23771 {
23772 rtx dst = operands[0];
23773 rtx src = operands[1];
23774
23775 unsigned int size = INTVAL (operands[2]);
23776 unsigned int pos = INTVAL (operands[3]);
23777
23778 if (SUBREG_P (dst))
23779 {
23780 /* Reject non-lowpart subregs. */
23781 if (SUBREG_BYTE (dst) > 0)
23782 return false;
23783 dst = SUBREG_REG (dst);
23784 }
23785
23786 if (SUBREG_P (src))
23787 {
23788 pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
23789 src = SUBREG_REG (src);
23790 }
23791
23792 switch (GET_MODE (src))
23793 {
23794 case E_V16QImode:
23795 case E_V8HImode:
23796 case E_V4SImode:
23797 case E_V2DImode:
23798 case E_V1TImode:
23799 {
23800 machine_mode srcmode, dstmode;
23801 rtx d, pat;
23802
23803 if (!int_mode_for_size (size, 0).exists (&dstmode))
23804 return false;
23805
23806 switch (dstmode)
23807 {
23808 case E_QImode:
23809 if (!TARGET_SSE4_1)
23810 return false;
23811 srcmode = V16QImode;
23812 break;
23813
23814 case E_HImode:
23815 if (!TARGET_SSE2)
23816 return false;
23817 srcmode = V8HImode;
23818 break;
23819
23820 case E_SImode:
23821 if (!TARGET_SSE4_1)
23822 return false;
23823 srcmode = V4SImode;
23824 break;
23825
23826 case E_DImode:
23827 gcc_assert (TARGET_64BIT);
23828 if (!TARGET_SSE4_1)
23829 return false;
23830 srcmode = V2DImode;
23831 break;
23832
23833 default:
23834 return false;
23835 }
23836
23837 /* Reject extractions from misaligned positions. */
23838 if (pos & (size-1))
23839 return false;
23840
23841 if (GET_MODE (dst) == dstmode)
23842 d = dst;
23843 else
23844 d = gen_reg_rtx (dstmode);
23845
23846 /* Construct insn pattern. */
23847 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
23848 pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
23849
23850 /* Let the rtl optimizers know about the zero extension performed. */
23851 if (dstmode == QImode || dstmode == HImode)
23852 {
23853 pat = gen_rtx_ZERO_EXTEND (SImode, pat);
23854 d = gen_lowpart (SImode, d);
23855 }
23856
23857 emit_insn (gen_rtx_SET (d, pat));
23858
23859 if (d != dst)
23860 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
23861 return true;
23862 }
23863
23864 default:
23865 return false;
23866 }
23867 }
23868
23869 /* Expand an insert into a vector register through pinsr insn.
23870 Return true if successful. */
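/* For instance (illustrative), inserting a 32-bit value at bit position 64
   into a V4SImode destination passes GEN_INT (1 << (pos / size)), here 4,
   to gen_sse4_1_pinsrd to select lane 2. */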
23871
23872 bool
23873 ix86_expand_pinsr (rtx *operands)
23874 {
23875 rtx dst = operands[0];
23876 rtx src = operands[3];
23877
23878 unsigned int size = INTVAL (operands[1]);
23879 unsigned int pos = INTVAL (operands[2]);
23880
23881 if (SUBREG_P (dst))
23882 {
23883 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
23884 dst = SUBREG_REG (dst);
23885 }
23886
23887 switch (GET_MODE (dst))
23888 {
23889 case E_V16QImode:
23890 case E_V8HImode:
23891 case E_V4SImode:
23892 case E_V2DImode:
23893 case E_V1TImode:
23894 {
23895 machine_mode srcmode, dstmode;
23896 rtx (*pinsr)(rtx, rtx, rtx, rtx);
23897 rtx d;
23898
23899 if (!int_mode_for_size (size, 0).exists (&srcmode))
23900 return false;
23901
23902 switch (srcmode)
23903 {
23904 case E_QImode:
23905 if (!TARGET_SSE4_1)
23906 return false;
23907 dstmode = V16QImode;
23908 pinsr = gen_sse4_1_pinsrb;
23909 break;
23910
23911 case E_HImode:
23912 if (!TARGET_SSE2)
23913 return false;
23914 dstmode = V8HImode;
23915 pinsr = gen_sse2_pinsrw;
23916 break;
23917
23918 case E_SImode:
23919 if (!TARGET_SSE4_1)
23920 return false;
23921 dstmode = V4SImode;
23922 pinsr = gen_sse4_1_pinsrd;
23923 break;
23924
23925 case E_DImode:
23926 gcc_assert (TARGET_64BIT);
23927 if (!TARGET_SSE4_1)
23928 return false;
23929 dstmode = V2DImode;
23930 pinsr = gen_sse4_1_pinsrq;
23931 break;
23932
23933 default:
23934 return false;
23935 }
23936
23937 /* Reject insertions to misaligned positions. */
23938 if (pos & (size-1))
23939 return false;
23940
23941 if (SUBREG_P (src))
23942 {
23943 unsigned int srcpos = SUBREG_BYTE (src);
23944
23945 if (srcpos > 0)
23946 {
23947 rtx extr_ops[4];
23948
23949 extr_ops[0] = gen_reg_rtx (srcmode);
23950 extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
23951 extr_ops[2] = GEN_INT (size);
23952 extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
23953
23954 if (!ix86_expand_pextr (extr_ops))
23955 return false;
23956
23957 src = extr_ops[0];
23958 }
23959 else
23960 src = gen_lowpart (srcmode, SUBREG_REG (src));
23961 }
23962
23963 if (GET_MODE (dst) == dstmode)
23964 d = dst;
23965 else
23966 d = gen_reg_rtx (dstmode);
23967
23968 emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
23969 gen_lowpart (srcmode, src),
23970 GEN_INT (1 << (pos / size))));
23971 if (d != dst)
23972 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
23973 return true;
23974 }
23975
23976 default:
23977 return false;
23978 }
23979 }
23980
23981 /* All CPUs prefer to avoid cross-lane operations, so reduce the upper
23982 against the lower halves until we reach SSE register width. */
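/* For example, for V8SImode this returns V4SImode and for V8DFmode it
   returns V2DFmode. */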
23983
23984 machine_mode
23985 ix86_split_reduction (machine_mode mode)
23986 {
23987 /* Reduce lowpart against highpart until we reach SSE reg width to
23988 avoid cross-lane operations. */
23989 switch (mode)
23990 {
23991 case E_V8DImode:
23992 case E_V4DImode:
23993 return V2DImode;
23994 case E_V16SImode:
23995 case E_V8SImode:
23996 return V4SImode;
23997 case E_V32HImode:
23998 case E_V16HImode:
23999 return V8HImode;
24000 case E_V64QImode:
24001 case E_V32QImode:
24002 return V16QImode;
24003 case E_V16SFmode:
24004 case E_V8SFmode:
24005 return V4SFmode;
24006 case E_V8DFmode:
24007 case E_V4DFmode:
24008 return V2DFmode;
24009 default:
24010 return mode;
24011 }
24012 }
24013
24014 /* Generate a call to the divmod libfunc (e.g. __divmoddi4), returning the quotient in *QUOT_P and the remainder in *REM_P. */
24015
24016 void
24017 ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
24018 rtx op0, rtx op1,
24019 rtx *quot_p, rtx *rem_p)
24020 {
24021 rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
24022
24023 rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
24024 mode, op0, mode, op1, mode,
24025 XEXP (rem, 0), Pmode);
24026 *quot_p = quot;
24027 *rem_p = rem;
24028 }
24029
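/* Expand an atomic fetch-and-CODE (AFTER is false) or CODE-and-fetch (AFTER
   is true) of VAL on MEM into TARGET by means of a compare-and-swap loop;
   DOUBLEWORD selects the double-word cmpxchg variant. */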
24030 void
24031 ix86_expand_atomic_fetch_op_loop (rtx target, rtx mem, rtx val,
24032 enum rtx_code code, bool after,
24033 bool doubleword)
24034 {
24035 rtx old_reg, new_reg, old_mem, success;
24036 machine_mode mode = GET_MODE (target);
24037 rtx_code_label *loop_label = NULL;
24038
24039 old_reg = gen_reg_rtx (mode);
24040 new_reg = old_reg;
24041 old_mem = copy_to_reg (mem);
24042 loop_label = gen_label_rtx ();
24043 emit_label (loop_label);
24044 emit_move_insn (old_reg, old_mem);
24045
24046 /* Return value for atomic_fetch_op. */
24047 if (!after)
24048 emit_move_insn (target, old_reg);
24049
24050 if (code == NOT)
24051 {
24052 new_reg = expand_simple_binop (mode, AND, new_reg, val, NULL_RTX,
24053 true, OPTAB_LIB_WIDEN);
24054 new_reg = expand_simple_unop (mode, code, new_reg, NULL_RTX, true);
24055 }
24056 else
24057 new_reg = expand_simple_binop (mode, code, new_reg, val, NULL_RTX,
24058 true, OPTAB_LIB_WIDEN);
24059
24060 /* Return value for atomic_op_fetch. */
24061 if (after)
24062 emit_move_insn (target, new_reg);
24063
24064 success = NULL_RTX;
24065
24066 ix86_expand_cmpxchg_loop (&success, old_mem, mem, old_reg, new_reg,
24067 gen_int_mode (MEMMODEL_SYNC_SEQ_CST,
24068 SImode),
24069 doubleword, loop_label);
24070 }
24071
24072 /* Relax the cmpxchg instruction. The parameter LOOP_LABEL indicates whether
24073 the instruction should be relaxed with a pause loop. If not, it is
24074 relaxed to an atomic load + compare, and the cmpxchg instruction is
24075 skipped when mem != exp_input. */
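/* Rough shape of the emitted sequence (illustrative only):
     new_mem = atomic_load (mem);
     if (new_mem == exp_input)
       target_val = cmpxchg (mem, exp_input, new_input);
     else
       { target_val = new_mem; [pause and branch back to LOOP_LABEL] }
     *ptarget_bool = ZF; */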
24076
24077 void
24078 ix86_expand_cmpxchg_loop (rtx *ptarget_bool, rtx target_val,
24079 rtx mem, rtx exp_input, rtx new_input,
24080 rtx mem_model, bool doubleword,
24081 rtx_code_label *loop_label)
24082 {
24083 rtx_code_label *cmp_label = NULL;
24084 rtx_code_label *done_label = NULL;
24085 rtx target_bool = NULL_RTX, new_mem = NULL_RTX;
24086 rtx (*gen) (rtx, rtx, rtx, rtx, rtx) = NULL;
24087 rtx (*gendw) (rtx, rtx, rtx, rtx, rtx, rtx) = NULL;
24088 machine_mode mode = GET_MODE (target_val), hmode = mode;
24089
24090 if (*ptarget_bool == NULL)
24091 target_bool = gen_reg_rtx (QImode);
24092 else
24093 target_bool = *ptarget_bool;
24094
24095 cmp_label = gen_label_rtx ();
24096 done_label = gen_label_rtx ();
24097
24098 new_mem = gen_reg_rtx (mode);
24099 /* Load memory first. */
24100 expand_atomic_load (new_mem, mem, MEMMODEL_SEQ_CST);
24101
24102 switch (mode)
24103 {
24104 case E_TImode:
24105 gendw = gen_atomic_compare_and_swapti_doubleword;
24106 hmode = DImode;
24107 break;
24108 case E_DImode:
24109 if (doubleword)
24110 {
24111 gendw = gen_atomic_compare_and_swapdi_doubleword;
24112 hmode = SImode;
24113 }
24114 else
24115 gen = gen_atomic_compare_and_swapdi_1;
24116 break;
24117 case E_SImode:
24118 gen = gen_atomic_compare_and_swapsi_1;
24119 break;
24120 case E_HImode:
24121 gen = gen_atomic_compare_and_swaphi_1;
24122 break;
24123 case E_QImode:
24124 gen = gen_atomic_compare_and_swapqi_1;
24125 break;
24126 default:
24127 gcc_unreachable ();
24128 }
24129
24130 /* Compare mem value with expected value. */
24131 if (doubleword)
24132 {
24133 rtx low_new_mem = gen_lowpart (hmode, new_mem);
24134 rtx low_exp_input = gen_lowpart (hmode, exp_input);
24135 rtx high_new_mem = gen_highpart (hmode, new_mem);
24136 rtx high_exp_input = gen_highpart (hmode, exp_input);
24137 emit_cmp_and_jump_insns (low_new_mem, low_exp_input, NE, NULL_RTX,
24138 hmode, 1, cmp_label,
24139 profile_probability::guessed_never ());
24140 emit_cmp_and_jump_insns (high_new_mem, high_exp_input, NE, NULL_RTX,
24141 hmode, 1, cmp_label,
24142 profile_probability::guessed_never ());
24143 }
24144 else
24145 emit_cmp_and_jump_insns (new_mem, exp_input, NE, NULL_RTX,
24146 GET_MODE (exp_input), 1, cmp_label,
24147 profile_probability::guessed_never ());
24148
24149 /* Directly emit cmpxchg here. */
24150 if (doubleword)
24151 emit_insn (gendw (target_val, mem, exp_input,
24152 gen_lowpart (hmode, new_input),
24153 gen_highpart (hmode, new_input),
24154 mem_model));
24155 else
24156 emit_insn (gen (target_val, mem, exp_input, new_input, mem_model));
24157
24158 if (!loop_label)
24159 {
24160 emit_jump_insn (gen_jump (done_label));
24161 emit_barrier ();
24162 emit_label (cmp_label);
24163 emit_move_insn (target_val, new_mem);
24164 emit_label (done_label);
24165 ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
24166 const0_rtx);
24167 }
24168 else
24169 {
24170 ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
24171 const0_rtx);
24172 emit_cmp_and_jump_insns (target_bool, const0_rtx, EQ, const0_rtx,
24173 GET_MODE (target_bool), 1, loop_label,
24174 profile_probability::guessed_never ());
24175 emit_jump_insn (gen_jump (done_label));
24176 emit_barrier ();
24177
24178 /* If mem does not hold the expected value, pause and loop back. */
24179 emit_label (cmp_label);
24180 emit_move_insn (target_val, new_mem);
24181 emit_insn (gen_pause ());
24182 emit_jump_insn (gen_jump (loop_label));
24183 emit_barrier ();
24184 emit_label (done_label);
24185 }
24186
24187 *ptarget_bool = target_bool;
24188 }
24189
24190 /* Convert a BFmode VAL to SFmode without signaling sNaNs.
24191 This is done by returning SF SUBREG of ((HI SUBREG) (VAL)) << 16. */
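/* For example (illustrative), the BFmode bit pattern 0x3f80 (1.0bf16) becomes
   the SImode value 0x3f800000, which is 1.0f when viewed as SFmode. */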
24192
24193 rtx
24194 ix86_expand_fast_convert_bf_to_sf (rtx val)
24195 {
24196 rtx op = gen_lowpart (HImode, val), ret;
24197 if (CONST_INT_P (op))
24198 {
24199 ret = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
24200 val, BFmode);
24201 if (ret)
24202 return ret;
24203 /* FLOAT_EXTEND simplification will fail if VAL is a sNaN. */
24204 ret = gen_reg_rtx (SImode);
24205 emit_move_insn (ret, GEN_INT (INTVAL (op) & 0xffff));
24206 emit_insn (gen_ashlsi3 (ret, ret, GEN_INT (16)));
24207 return gen_lowpart (SFmode, ret);
24208 }
24209
24210 ret = gen_reg_rtx (SFmode);
24211 emit_insn (gen_extendbfsf2_1 (ret, force_reg (BFmode, val)));
24212 return ret;
24213 }
24214
24215 #include "gt-i386-expand.h"