gcc/config/i386/i386-expand.cc
1 /* Copyright (C) 1988-2023 Free Software Foundation, Inc.
2
3 This file is part of GCC.
4
5 GCC is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
9
10 GCC is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with GCC; see the file COPYING3. If not see
17 <http://www.gnu.org/licenses/>. */
18
19 #define IN_TARGET_CODE 1
20
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "backend.h"
25 #include "rtl.h"
26 #include "tree.h"
27 #include "memmodel.h"
28 #include "gimple.h"
29 #include "cfghooks.h"
30 #include "cfgloop.h"
31 #include "df.h"
32 #include "tm_p.h"
33 #include "stringpool.h"
34 #include "expmed.h"
35 #include "optabs.h"
36 #include "regs.h"
37 #include "emit-rtl.h"
38 #include "recog.h"
39 #include "cgraph.h"
40 #include "diagnostic.h"
41 #include "cfgbuild.h"
42 #include "alias.h"
43 #include "fold-const.h"
44 #include "attribs.h"
45 #include "calls.h"
46 #include "stor-layout.h"
47 #include "varasm.h"
48 #include "output.h"
49 #include "insn-attr.h"
50 #include "flags.h"
51 #include "except.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "cfgrtl.h"
55 #include "common/common-target.h"
56 #include "langhooks.h"
57 #include "reload.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "tm-constrs.h"
61 #include "cselib.h"
62 #include "sched-int.h"
63 #include "opts.h"
64 #include "tree-pass.h"
65 #include "context.h"
66 #include "pass_manager.h"
67 #include "target-globals.h"
68 #include "gimple-iterator.h"
69 #include "tree-vectorizer.h"
70 #include "shrink-wrap.h"
71 #include "builtins.h"
72 #include "rtl-iter.h"
73 #include "tree-iterator.h"
74 #include "dbgcnt.h"
75 #include "case-cfn-macros.h"
76 #include "dojump.h"
77 #include "fold-const-call.h"
78 #include "tree-vrp.h"
79 #include "tree-ssanames.h"
80 #include "selftest.h"
81 #include "selftest-rtl.h"
82 #include "print-rtl.h"
83 #include "intl.h"
84 #include "ifcvt.h"
85 #include "symbol-summary.h"
86 #include "ipa-prop.h"
87 #include "ipa-fnsummary.h"
88 #include "wide-int-bitmask.h"
89 #include "tree-vector-builder.h"
90 #include "debug.h"
91 #include "dwarf2out.h"
92 #include "i386-options.h"
93 #include "i386-builtins.h"
94 #include "i386-expand.h"
95 #include "asan.h"
96
97 /* Split one or more double-mode RTL references into pairs of half-mode
98 references. The RTL can be REG, offsettable MEM, integer constant, or
99 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
100 split and "num" is its length. lo_half and hi_half are output arrays
101 that parallel "operands". */
102
103 void
104 split_double_mode (machine_mode mode, rtx operands[],
105 int num, rtx lo_half[], rtx hi_half[])
106 {
107 machine_mode half_mode;
108 unsigned int byte;
109 rtx mem_op = NULL_RTX;
110 int mem_num = 0;
111
112 switch (mode)
113 {
114 case E_TImode:
115 half_mode = DImode;
116 break;
117 case E_DImode:
118 half_mode = SImode;
119 break;
120 case E_P2HImode:
121 half_mode = HImode;
122 break;
123 case E_P2QImode:
124 half_mode = QImode;
125 break;
126 default:
127 gcc_unreachable ();
128 }
129
130 byte = GET_MODE_SIZE (half_mode);
131
132 while (num--)
133 {
134 rtx op = operands[num];
135
136 /* simplify_subreg refuses to split volatile memory addresses,
137 but we still have to handle them. */
138 if (MEM_P (op))
139 {
140 if (mem_op && rtx_equal_p (op, mem_op))
141 {
142 lo_half[num] = lo_half[mem_num];
143 hi_half[num] = hi_half[mem_num];
144 }
145 else
146 {
147 mem_op = op;
148 mem_num = num;
149 lo_half[num] = adjust_address (op, half_mode, 0);
150 hi_half[num] = adjust_address (op, half_mode, byte);
151 }
152 }
153 else
154 {
155 lo_half[num] = simplify_gen_subreg (half_mode, op,
156 GET_MODE (op) == VOIDmode
157 ? mode : GET_MODE (op), 0);
158
159 rtx tmp = simplify_gen_subreg (half_mode, op,
160 GET_MODE (op) == VOIDmode
161 ? mode : GET_MODE (op), byte);
162 /* simplify_gen_subreg will return NULL RTX for the
163 high half of the paradoxical subreg. */
164 hi_half[num] = tmp ? tmp : gen_reg_rtx (half_mode);
165 }
166 }
167 }
168
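/* A minimal usage sketch for the splitter above (operand names are
   hypothetical: X is a TImode pseudo, M an offsettable TImode MEM):

     rtx ops[2] = { x, m };
     rtx lo[2], hi[2];
     split_double_mode (TImode, ops, 2, lo, hi);

   Afterwards lo[0]/hi[0] are DImode subregs of X, while lo[1]/hi[1] are
   DImode memory references at offsets 0 and 8 of M.  */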
169 /* Emit the double word assignment DST = { LO, HI }. */
170
171 void
172 split_double_concat (machine_mode mode, rtx dst, rtx lo, rtx hi)
173 {
174 rtx dlo, dhi;
175 int deleted_move_count = 0;
176 split_double_mode (mode, &dst, 1, &dlo, &dhi);
177 /* Constraints ensure that if both lo and hi are MEMs, then
178 dst has early-clobber and thus addresses of MEMs don't use
179 dlo/dhi registers. Otherwise if at least one of lo and hi is a MEM,
180 dlo/dhi are registers. */
181 if (MEM_P (lo)
182 && rtx_equal_p (dlo, hi)
183 && reg_overlap_mentioned_p (dhi, lo))
184 {
185 /* If dlo is same as hi and lo's address uses dhi register,
186 code below would first emit_move_insn (dhi, hi)
187 and then emit_move_insn (dlo, lo). But the former
188 would invalidate lo's address. Load into dhi first,
189 then swap. */
190 emit_move_insn (dhi, lo);
191 lo = dhi;
192 }
193 else if (MEM_P (hi)
194 && !MEM_P (lo)
195 && !rtx_equal_p (dlo, lo)
196 && reg_overlap_mentioned_p (dlo, hi))
197 {
198 /* In this case, code below would first emit_move_insn (dlo, lo)
199 and then emit_move_insn (dhi, hi). But the former would
200 invalidate hi's address. */
201 if (rtx_equal_p (dhi, lo))
202 {
203 /* We can't load into dhi first, so load into dlo
204 first and we'll swap. */
205 emit_move_insn (dlo, hi);
206 hi = dlo;
207 }
208 else
209 {
210 /* Load into dhi first. */
211 emit_move_insn (dhi, hi);
212 hi = dhi;
213 }
214 }
215 if (!rtx_equal_p (dlo, hi))
216 {
217 if (!rtx_equal_p (dlo, lo))
218 emit_move_insn (dlo, lo);
219 else
220 deleted_move_count++;
221 if (!rtx_equal_p (dhi, hi))
222 emit_move_insn (dhi, hi);
223 else
224 deleted_move_count++;
225 }
226 else if (!rtx_equal_p (lo, dhi))
227 {
228 if (!rtx_equal_p (dhi, hi))
229 emit_move_insn (dhi, hi);
230 else
231 deleted_move_count++;
232 if (!rtx_equal_p (dlo, lo))
233 emit_move_insn (dlo, lo);
234 else
235 deleted_move_count++;
236 }
237 else if (mode == TImode)
238 emit_insn (gen_swapdi (dlo, dhi));
239 else
240 emit_insn (gen_swapsi (dlo, dhi));
241
242 if (deleted_move_count == 2)
243 emit_note (NOTE_INSN_DELETED);
244 }
245
246
247 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
248 for the target. */
249
250 void
251 ix86_expand_clear (rtx dest)
252 {
253 rtx tmp;
254
255 /* We play register width games, which are only valid after reload. */
256 gcc_assert (reload_completed);
257
258 /* Avoid HImode and its attendant prefix byte. */
259 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
260 dest = gen_rtx_REG (SImode, REGNO (dest));
261 tmp = gen_rtx_SET (dest, const0_rtx);
262
263 if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
264 {
265 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
266 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
267 }
268
269 emit_insn (tmp);
270 }
271
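/* For instance, clearing %ax after reload is widened to SImode and,
   unless the mov-immediate form is preferred (TARGET_USE_MOV0 and not
   optimizing for size), emits roughly

     (parallel [(set (reg:SI ax) (const_int 0))
                (clobber (reg:CC flags))])

   which assembles to "xorl %eax, %eax".  */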
272 /* Return true if V can be broadcast from an integer of WIDTH bits,
273 which is returned in VAL_BROADCAST. Otherwise, return false. */
274
275 static bool
276 ix86_broadcast (HOST_WIDE_INT v, unsigned int width,
277 HOST_WIDE_INT &val_broadcast)
278 {
279 wide_int val = wi::uhwi (v, HOST_BITS_PER_WIDE_INT);
280 val_broadcast = wi::extract_uhwi (val, 0, width);
281 for (unsigned int i = width; i < HOST_BITS_PER_WIDE_INT; i += width)
282 {
283 HOST_WIDE_INT each = wi::extract_uhwi (val, i, width);
284 if (val_broadcast != each)
285 return false;
286 }
287 val_broadcast = sext_hwi (val_broadcast, width);
288 return true;
289 }
290
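/* Worked examples for the helper above (values illustrative):
   v = 0x8080808080808080 with WIDTH 8 succeeds with VAL_BROADCAST = -128
   (the low byte, sign-extended), since every 8-bit chunk is 0x80;
   v = 0x0000000100000001 fails for WIDTH 8 and 16 but succeeds for
   WIDTH 32 with VAL_BROADCAST = 1.  */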
291 /* Convert the CONST_WIDE_INT operand OP to a vector broadcast in MODE, or return NULL. */
292
293 static rtx
294 ix86_convert_const_wide_int_to_broadcast (machine_mode mode, rtx op)
295 {
296 /* Don't use integer vector broadcast if we can't move from GPR to SSE
297 register directly. */
298 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
299 return nullptr;
300
301 /* Convert CONST_WIDE_INT to a non-standard SSE constant integer
302 broadcast only if vector broadcast is available. */
303 if (!TARGET_AVX
304 || !CONST_WIDE_INT_P (op)
305 || standard_sse_constant_p (op, mode)
306 || (CONST_WIDE_INT_NUNITS (op) * HOST_BITS_PER_WIDE_INT
307 != GET_MODE_BITSIZE (mode)))
308 return nullptr;
309
310 HOST_WIDE_INT val = CONST_WIDE_INT_ELT (op, 0);
311 HOST_WIDE_INT val_broadcast;
312 scalar_int_mode broadcast_mode;
313 if (TARGET_AVX2
314 && ix86_broadcast (val, GET_MODE_BITSIZE (QImode),
315 val_broadcast))
316 broadcast_mode = QImode;
317 else if (TARGET_AVX2
318 && ix86_broadcast (val, GET_MODE_BITSIZE (HImode),
319 val_broadcast))
320 broadcast_mode = HImode;
321 else if (ix86_broadcast (val, GET_MODE_BITSIZE (SImode),
322 val_broadcast))
323 broadcast_mode = SImode;
324 else if (TARGET_64BIT
325 && ix86_broadcast (val, GET_MODE_BITSIZE (DImode),
326 val_broadcast))
327 broadcast_mode = DImode;
328 else
329 return nullptr;
330
331 /* Check if OP can be broadcast from VAL. */
332 for (int i = 1; i < CONST_WIDE_INT_NUNITS (op); i++)
333 if (val != CONST_WIDE_INT_ELT (op, i))
334 return nullptr;
335
336 unsigned int nunits = (GET_MODE_SIZE (mode)
337 / GET_MODE_SIZE (broadcast_mode));
338 machine_mode vector_mode;
339 if (!mode_for_vector (broadcast_mode, nunits).exists (&vector_mode))
340 gcc_unreachable ();
341 rtx target = gen_reg_rtx (vector_mode);
342 bool ok = ix86_expand_vector_init_duplicate (false, vector_mode,
343 target,
344 GEN_INT (val_broadcast));
345 gcc_assert (ok);
346 target = lowpart_subreg (mode, target, vector_mode);
347 return target;
348 }
349
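/* As an illustration, moving a 256-bit integer constant whose bytes are
   all 0x01 (a CONST_WIDE_INT with every 64-bit element equal to
   0x0101010101010101) can, with AVX2, be rewritten by the routine above
   as a V32QImode duplicate of (const_int 1) accessed through a lowpart
   subreg -- i.e. a register broadcast instead of a constant-pool load.  */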
350 void
351 ix86_expand_move (machine_mode mode, rtx operands[])
352 {
353 rtx op0, op1;
354 rtx tmp, addend = NULL_RTX;
355 enum tls_model model;
356
357 op0 = operands[0];
358 op1 = operands[1];
359
360 /* Avoid complex sets of likely spilled hard registers before reload. */
361 if (!ix86_hardreg_mov_ok (op0, op1))
362 {
363 tmp = gen_reg_rtx (mode);
364 operands[0] = tmp;
365 ix86_expand_move (mode, operands);
366 operands[0] = op0;
367 operands[1] = tmp;
368 op1 = tmp;
369 }
370
371 switch (GET_CODE (op1))
372 {
373 case CONST:
374 tmp = XEXP (op1, 0);
375
376 if (GET_CODE (tmp) != PLUS
377 || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
378 break;
379
380 op1 = XEXP (tmp, 0);
381 addend = XEXP (tmp, 1);
382 /* FALLTHRU */
383
384 case SYMBOL_REF:
385 model = SYMBOL_REF_TLS_MODEL (op1);
386
387 if (model)
388 op1 = legitimize_tls_address (op1, model, true);
389 else if (ix86_force_load_from_GOT_p (op1))
390 {
391 /* Load the external function address via GOT slot to avoid PLT. */
392 op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
393 (TARGET_64BIT
394 ? UNSPEC_GOTPCREL
395 : UNSPEC_GOT));
396 op1 = gen_rtx_CONST (Pmode, op1);
397 op1 = gen_const_mem (Pmode, op1);
398 set_mem_alias_set (op1, ix86_GOT_alias_set ());
399 }
400 else
401 {
402 tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
403 if (tmp)
404 {
405 op1 = tmp;
406 if (!addend)
407 break;
408 }
409 else
410 {
411 op1 = operands[1];
412 break;
413 }
414 }
415
416 if (addend)
417 {
418 op1 = force_operand (op1, NULL_RTX);
419 op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
420 op0, 1, OPTAB_DIRECT);
421 }
422 else
423 op1 = force_operand (op1, op0);
424
425 if (op1 == op0)
426 return;
427
428 op1 = convert_to_mode (mode, op1, 1);
429
430 default:
431 break;
432 }
433
434 if ((flag_pic || MACHOPIC_INDIRECT)
435 && symbolic_operand (op1, mode))
436 {
437 if (TARGET_MACHO && !TARGET_64BIT)
438 {
439 #if TARGET_MACHO
440 /* dynamic-no-pic */
441 if (MACHOPIC_INDIRECT)
442 {
443 rtx temp = (op0 && REG_P (op0) && mode == Pmode)
444 ? op0 : gen_reg_rtx (Pmode);
445 op1 = machopic_indirect_data_reference (op1, temp);
446 if (MACHOPIC_PURE)
447 op1 = machopic_legitimize_pic_address (op1, mode,
448 temp == op1 ? 0 : temp);
449 }
450 if (op0 != op1 && GET_CODE (op0) != MEM)
451 {
452 rtx insn = gen_rtx_SET (op0, op1);
453 emit_insn (insn);
454 return;
455 }
456 if (GET_CODE (op0) == MEM)
457 op1 = force_reg (Pmode, op1);
458 else
459 {
460 rtx temp = op0;
461 if (GET_CODE (temp) != REG)
462 temp = gen_reg_rtx (Pmode);
463 temp = legitimize_pic_address (op1, temp);
464 if (temp == op0)
465 return;
466 op1 = temp;
467 }
468 /* dynamic-no-pic */
469 #endif
470 }
471 else
472 {
473 if (MEM_P (op0))
474 op1 = force_reg (mode, op1);
475 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
476 {
477 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
478 op1 = legitimize_pic_address (op1, reg);
479 if (op0 == op1)
480 return;
481 op1 = convert_to_mode (mode, op1, 1);
482 }
483 }
484 }
485 else
486 {
487 if (MEM_P (op0)
488 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
489 || !push_operand (op0, mode))
490 && MEM_P (op1))
491 op1 = force_reg (mode, op1);
492
493 if (push_operand (op0, mode)
494 && ! general_no_elim_operand (op1, mode))
495 op1 = copy_to_mode_reg (mode, op1);
496
497 /* Force large constants in 64-bit compilation into a register
498 to get them CSEd. */
499 if (can_create_pseudo_p ()
500 && (mode == DImode) && TARGET_64BIT
501 && immediate_operand (op1, mode)
502 && !x86_64_zext_immediate_operand (op1, VOIDmode)
503 && !register_operand (op0, mode)
504 && optimize)
505 op1 = copy_to_mode_reg (mode, op1);
506
507 if (can_create_pseudo_p ())
508 {
509 if (CONST_DOUBLE_P (op1))
510 {
511 /* If we are loading a floating point constant to a
512 register, force the value to memory now, since we'll
513 get better code out of the back end. */
514
515 op1 = validize_mem (force_const_mem (mode, op1));
516 if (!register_operand (op0, mode))
517 {
518 rtx temp = gen_reg_rtx (mode);
519 emit_insn (gen_rtx_SET (temp, op1));
520 emit_move_insn (op0, temp);
521 return;
522 }
523 }
524 else if (GET_MODE_SIZE (mode) >= 16)
525 {
526 rtx tmp = ix86_convert_const_wide_int_to_broadcast
527 (GET_MODE (op0), op1);
528 if (tmp != nullptr)
529 op1 = tmp;
530 }
531 }
532 }
533
534 emit_insn (gen_rtx_SET (op0, op1));
535 }
536
537 /* OP is a memref of a CONST_VECTOR; return the duplicated scalar constant
538 if the CONST_VECTOR is a vec_duplicate, else return NULL. */
539 static rtx
540 ix86_broadcast_from_constant (machine_mode mode, rtx op)
541 {
542 int nunits = GET_MODE_NUNITS (mode);
543 if (nunits < 2)
544 return nullptr;
545
546 /* Don't use integer vector broadcast if we can't move from GPR to SSE
547 register directly. */
548 if (!TARGET_INTER_UNIT_MOVES_TO_VEC
549 && INTEGRAL_MODE_P (mode))
550 return nullptr;
551
552 /* Convert CONST_VECTOR to a non-standard SSE constant integer
553 broadcast only if vector broadcast is available. */
554 if (!(TARGET_AVX2
555 || (TARGET_AVX
556 && (GET_MODE_INNER (mode) == SImode
557 || GET_MODE_INNER (mode) == DImode))
558 || FLOAT_MODE_P (mode))
559 || standard_sse_constant_p (op, mode))
560 return nullptr;
561
562 /* Don't broadcast from a 64-bit integer constant in 32-bit mode.
563 We can still put a 64-bit integer constant in memory when
564 AVX512 embedded broadcast is available. */
565 if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT
566 && (!TARGET_AVX512F
567 || (GET_MODE_SIZE (mode) < 64 && !TARGET_AVX512VL)))
568 return nullptr;
569
570 if (GET_MODE_INNER (mode) == TImode)
571 return nullptr;
572
573 rtx constant = get_pool_constant (XEXP (op, 0));
574 if (GET_CODE (constant) != CONST_VECTOR)
575 return nullptr;
576
577 /* There could be some rtx like
578 (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
579 but with "*.LC1" referring to a V2DI constant vector. */
580 if (GET_MODE (constant) != mode)
581 {
582 constant = simplify_subreg (mode, constant, GET_MODE (constant),
583 0);
584 if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR)
585 return nullptr;
586 }
587
588 rtx first = XVECEXP (constant, 0, 0);
589
590 for (int i = 1; i < nunits; ++i)
591 {
592 rtx tmp = XVECEXP (constant, 0, i);
593 /* Vector duplicate value. */
594 if (!rtx_equal_p (tmp, first))
595 return nullptr;
596 }
597
598 return first;
599 }
600
601 void
602 ix86_expand_vector_move (machine_mode mode, rtx operands[])
603 {
604 rtx op0 = operands[0], op1 = operands[1];
605 /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
606 psABI, since its biggest alignment is only 4 bytes. */
607 unsigned int align = (TARGET_IAMCU
608 ? GET_MODE_BITSIZE (mode)
609 : GET_MODE_ALIGNMENT (mode));
610
611 if (push_operand (op0, VOIDmode))
612 op0 = emit_move_resolve_push (mode, op0);
613
614 /* Force constants other than zero into memory. We do not know how
615 the instructions used to build constants modify the upper 64 bits
616 of the register; once we have that information we may be able
617 to handle some of them more efficiently. */
618 if (can_create_pseudo_p ()
619 && (CONSTANT_P (op1)
620 || (SUBREG_P (op1)
621 && CONSTANT_P (SUBREG_REG (op1))))
622 && ((register_operand (op0, mode)
623 && !standard_sse_constant_p (op1, mode))
624 /* ix86_expand_vector_move_misalign() does not like constants. */
625 || (SSE_REG_MODE_P (mode)
626 && MEM_P (op0)
627 && MEM_ALIGN (op0) < align)))
628 {
629 if (SUBREG_P (op1))
630 {
631 machine_mode imode = GET_MODE (SUBREG_REG (op1));
632 rtx r = force_const_mem (imode, SUBREG_REG (op1));
633 if (r)
634 r = validize_mem (r);
635 else
636 r = force_reg (imode, SUBREG_REG (op1));
637 op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
638 }
639 else
640 {
641 machine_mode mode = GET_MODE (op0);
642 rtx tmp = ix86_convert_const_wide_int_to_broadcast
643 (mode, op1);
644 if (tmp == nullptr)
645 op1 = validize_mem (force_const_mem (mode, op1));
646 else
647 op1 = tmp;
648 }
649 }
650
651 if (can_create_pseudo_p ()
652 && GET_MODE_SIZE (mode) >= 16
653 && VECTOR_MODE_P (mode)
654 && (MEM_P (op1)
655 && SYMBOL_REF_P (XEXP (op1, 0))
656 && CONSTANT_POOL_ADDRESS_P (XEXP (op1, 0))))
657 {
658 rtx first = ix86_broadcast_from_constant (mode, op1);
659 if (first != nullptr)
660 {
661 /* Broadcast to XMM/YMM/ZMM register from an integer
662 constant or scalar mem. */
663 op1 = gen_reg_rtx (mode);
664 if (FLOAT_MODE_P (mode)
665 || (!TARGET_64BIT && GET_MODE_INNER (mode) == DImode))
666 first = force_const_mem (GET_MODE_INNER (mode), first);
667 bool ok = ix86_expand_vector_init_duplicate (false, mode,
668 op1, first);
669 gcc_assert (ok);
670 emit_move_insn (op0, op1);
671 return;
672 }
673 }
674
675 /* We need to check memory alignment for SSE modes since attributes
676 can make operands unaligned. */
677 if (can_create_pseudo_p ()
678 && SSE_REG_MODE_P (mode)
679 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
680 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
681 {
682 rtx tmp[2];
683
684 /* ix86_expand_vector_move_misalign() does not like both
685 arguments in memory. */
686 if (!register_operand (op0, mode)
687 && !register_operand (op1, mode))
688 {
689 rtx scratch = gen_reg_rtx (mode);
690 emit_move_insn (scratch, op1);
691 op1 = scratch;
692 }
693
694 tmp[0] = op0; tmp[1] = op1;
695 ix86_expand_vector_move_misalign (mode, tmp);
696 return;
697 }
698
699 /* Special case TImode to V1TImode conversions, via V2DI. */
700 if (mode == V1TImode
701 && SUBREG_P (op1)
702 && GET_MODE (SUBREG_REG (op1)) == TImode
703 && TARGET_64BIT && TARGET_SSE
704 && can_create_pseudo_p ())
705 {
706 rtx tmp = gen_reg_rtx (V2DImode);
707 rtx lo = gen_reg_rtx (DImode);
708 rtx hi = gen_reg_rtx (DImode);
709 emit_move_insn (lo, gen_lowpart (DImode, SUBREG_REG (op1)));
710 emit_move_insn (hi, gen_highpart (DImode, SUBREG_REG (op1)));
711 emit_insn (gen_vec_concatv2di (tmp, lo, hi));
712 emit_move_insn (op0, gen_lowpart (V1TImode, tmp));
713 return;
714 }
715
716 /* If operand0 is a hard register, make operand1 a pseudo. */
717 if (can_create_pseudo_p ()
718 && !ix86_hardreg_mov_ok (op0, op1))
719 {
720 rtx tmp = gen_reg_rtx (GET_MODE (op0));
721 emit_move_insn (tmp, op1);
722 emit_move_insn (op0, tmp);
723 return;
724 }
725
726 /* Make operand1 a register if it isn't already. */
727 if (can_create_pseudo_p ()
728 && !register_operand (op0, mode)
729 && !register_operand (op1, mode))
730 {
731 rtx tmp = gen_reg_rtx (GET_MODE (op0));
732 emit_move_insn (tmp, op1);
733 emit_move_insn (op0, tmp);
734 return;
735 }
736
737 emit_insn (gen_rtx_SET (op0, op1));
738 }
739
740 /* Split 32-byte AVX unaligned load and store if needed. */
741
742 static void
743 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
744 {
745 rtx m;
746 rtx (*extract) (rtx, rtx, rtx);
747 machine_mode mode;
748
749 if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
750 || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
751 {
752 emit_insn (gen_rtx_SET (op0, op1));
753 return;
754 }
755
756 rtx orig_op0 = NULL_RTX;
757 mode = GET_MODE (op0);
758 switch (GET_MODE_CLASS (mode))
759 {
760 case MODE_VECTOR_INT:
761 case MODE_INT:
762 if (mode != V32QImode)
763 {
764 if (!MEM_P (op0))
765 {
766 orig_op0 = op0;
767 op0 = gen_reg_rtx (V32QImode);
768 }
769 else
770 op0 = gen_lowpart (V32QImode, op0);
771 op1 = gen_lowpart (V32QImode, op1);
772 mode = V32QImode;
773 }
774 break;
775 case MODE_VECTOR_FLOAT:
776 break;
777 default:
778 gcc_unreachable ();
779 }
780
781 switch (mode)
782 {
783 default:
784 gcc_unreachable ();
785 case E_V32QImode:
786 extract = gen_avx_vextractf128v32qi;
787 mode = V16QImode;
788 break;
789 case E_V16BFmode:
790 extract = gen_avx_vextractf128v16bf;
791 mode = V8BFmode;
792 break;
793 case E_V16HFmode:
794 extract = gen_avx_vextractf128v16hf;
795 mode = V8HFmode;
796 break;
797 case E_V8SFmode:
798 extract = gen_avx_vextractf128v8sf;
799 mode = V4SFmode;
800 break;
801 case E_V4DFmode:
802 extract = gen_avx_vextractf128v4df;
803 mode = V2DFmode;
804 break;
805 }
806
807 if (MEM_P (op1))
808 {
809 rtx r = gen_reg_rtx (mode);
810 m = adjust_address (op1, mode, 0);
811 emit_move_insn (r, m);
812 m = adjust_address (op1, mode, 16);
813 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
814 emit_move_insn (op0, r);
815 }
816 else if (MEM_P (op0))
817 {
818 m = adjust_address (op0, mode, 0);
819 emit_insn (extract (m, op1, const0_rtx));
820 m = adjust_address (op0, mode, 16);
821 emit_insn (extract (m, copy_rtx (op1), const1_rtx));
822 }
823 else
824 gcc_unreachable ();
825
826 if (orig_op0)
827 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
828 }
829
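/* For example, with -mavx256-split-unaligned-load an unaligned V8SFmode
   load is split by the routine above into two 128-bit halves, roughly

     vmovups     (%rax), %xmm0
     vinsertf128 $0x1, 16(%rax), %ymm0, %ymm0

   and with -mavx256-split-unaligned-store the corresponding store writes
   each 128-bit half separately to offsets 0 and 16.  (Register names and
   addressing are illustrative.)  */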
830 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
831 straight to ix86_expand_vector_move. */
832 /* Code generation for scalar reg-reg moves of single and double precision data:
833 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
834 movaps reg, reg
835 else
836 movss reg, reg
837 if (x86_sse_partial_reg_dependency == true)
838 movapd reg, reg
839 else
840 movsd reg, reg
841
842 Code generation for scalar loads of double precision data:
843 if (x86_sse_split_regs == true)
844 movlpd mem, reg (gas syntax)
845 else
846 movsd mem, reg
847
848 Code generation for unaligned packed loads of single precision data
849 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
850 if (x86_sse_unaligned_move_optimal)
851 movups mem, reg
852
853 if (x86_sse_partial_reg_dependency == true)
854 {
855 xorps reg, reg
856 movlps mem, reg
857 movhps mem+8, reg
858 }
859 else
860 {
861 movlps mem, reg
862 movhps mem+8, reg
863 }
864
865 Code generation for unaligned packed loads of double precision data
866 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
867 if (x86_sse_unaligned_move_optimal)
868 movupd mem, reg
869
870 if (x86_sse_split_regs == true)
871 {
872 movlpd mem, reg
873 movhpd mem+8, reg
874 }
875 else
876 {
877 movsd mem, reg
878 movhpd mem+8, reg
879 }
880 */
881
882 void
883 ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
884 {
885 rtx op0, op1, m;
886
887 op0 = operands[0];
888 op1 = operands[1];
889
890 /* Use unaligned load/store for AVX512 or when optimizing for size. */
891 if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
892 {
893 emit_insn (gen_rtx_SET (op0, op1));
894 return;
895 }
896
897 if (TARGET_AVX)
898 {
899 if (GET_MODE_SIZE (mode) == 32)
900 ix86_avx256_split_vector_move_misalign (op0, op1);
901 else
902 /* Always use 128-bit mov<mode>_internal pattern for AVX. */
903 emit_insn (gen_rtx_SET (op0, op1));
904 return;
905 }
906
907 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
908 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
909 {
910 emit_insn (gen_rtx_SET (op0, op1));
911 return;
912 }
913
914 /* ??? If we have typed data, then it would appear that using
915 movdqu is the only way to get unaligned data loaded with
916 integer type. */
917 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
918 {
919 emit_insn (gen_rtx_SET (op0, op1));
920 return;
921 }
922
923 if (MEM_P (op1))
924 {
925 if (TARGET_SSE2 && mode == V2DFmode)
926 {
927 rtx zero;
928
929 /* When SSE registers are split into halves, we can avoid
930 writing to the top half twice. */
931 if (TARGET_SSE_SPLIT_REGS)
932 {
933 emit_clobber (op0);
934 zero = op0;
935 }
936 else
937 {
938 /* ??? Not sure about the best option for the Intel chips.
939 The following would seem to satisfy; the register is
940 entirely cleared, breaking the dependency chain. We
941 then store to the upper half, with a dependency depth
942 of one. A rumor has it that Intel recommends two movsd
943 followed by an unpacklpd, but this is unconfirmed. And
944 given that the dependency depth of the unpacklpd would
945 still be one, I'm not sure why this would be better. */
946 zero = CONST0_RTX (V2DFmode);
947 }
948
949 m = adjust_address (op1, DFmode, 0);
950 emit_insn (gen_sse2_loadlpd (op0, zero, m));
951 m = adjust_address (op1, DFmode, 8);
952 emit_insn (gen_sse2_loadhpd (op0, op0, m));
953 }
954 else
955 {
956 rtx t;
957
958 if (mode != V4SFmode)
959 t = gen_reg_rtx (V4SFmode);
960 else
961 t = op0;
962
963 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
964 emit_move_insn (t, CONST0_RTX (V4SFmode));
965 else
966 emit_clobber (t);
967
968 m = adjust_address (op1, V2SFmode, 0);
969 emit_insn (gen_sse_loadlps (t, t, m));
970 m = adjust_address (op1, V2SFmode, 8);
971 emit_insn (gen_sse_loadhps (t, t, m));
972 if (mode != V4SFmode)
973 emit_move_insn (op0, gen_lowpart (mode, t));
974 }
975 }
976 else if (MEM_P (op0))
977 {
978 if (TARGET_SSE2 && mode == V2DFmode)
979 {
980 m = adjust_address (op0, DFmode, 0);
981 emit_insn (gen_sse2_storelpd (m, op1));
982 m = adjust_address (op0, DFmode, 8);
983 emit_insn (gen_sse2_storehpd (m, op1));
984 }
985 else
986 {
987 if (mode != V4SFmode)
988 op1 = gen_lowpart (V4SFmode, op1);
989
990 m = adjust_address (op0, V2SFmode, 0);
991 emit_insn (gen_sse_storelps (m, op1));
992 m = adjust_address (op0, V2SFmode, 8);
993 emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
994 }
995 }
996 else
997 gcc_unreachable ();
998 }
999
1000 /* Move bits 64:95 to bits 32:63. */
1001
1002 void
1003 ix86_move_vector_high_sse_to_mmx (rtx op)
1004 {
1005 rtx mask = gen_rtx_PARALLEL (VOIDmode,
1006 gen_rtvec (4, GEN_INT (0), GEN_INT (2),
1007 GEN_INT (0), GEN_INT (0)));
1008 rtx dest = lowpart_subreg (V4SImode, op, GET_MODE (op));
1009 op = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
1010 rtx insn = gen_rtx_SET (dest, op);
1011 emit_insn (insn);
1012 }
1013
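/* The VEC_SELECT above with mask { 0, 2, 0, 0 } corresponds roughly to
   "pshufd $0x8, %xmm0, %xmm0": element 2 (bits 64:95) is copied into
   element 1 (bits 32:63), so the low 8 bytes hold the value expected by
   the MMX-sized destination.  (Operand names illustrative.)  */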
1014 /* Split MMX pack with signed/unsigned saturation with SSE/SSE2. */
1015
1016 void
1017 ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
1018 {
1019 rtx op0 = operands[0];
1020 rtx op1 = operands[1];
1021 rtx op2 = operands[2];
1022
1023 machine_mode dmode = GET_MODE (op0);
1024 machine_mode smode = GET_MODE (op1);
1025 machine_mode inner_dmode = GET_MODE_INNER (dmode);
1026 machine_mode inner_smode = GET_MODE_INNER (smode);
1027
1028 /* Get the corresponding SSE mode for destination. */
1029 int nunits = 16 / GET_MODE_SIZE (inner_dmode);
1030 machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode),
1031 nunits).require ();
1032 machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode),
1033 nunits / 2).require ();
1034
1035 /* Get the corresponding SSE mode for source. */
1036 nunits = 16 / GET_MODE_SIZE (inner_smode);
1037 machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode),
1038 nunits).require ();
1039
1040 /* Generate SSE pack with signed/unsigned saturation. */
1041 rtx dest = lowpart_subreg (sse_dmode, op0, GET_MODE (op0));
1042 op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
1043 op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));
1044
1045 op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
1046 op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
1047 rtx insn = gen_rtx_SET (dest, gen_rtx_VEC_CONCAT (sse_dmode,
1048 op1, op2));
1049 emit_insn (insn);
1050
1051 ix86_move_vector_high_sse_to_mmx (op0);
1052 }
1053
1054 /* Split MMX punpcklXX/punpckhXX with SSE punpcklXX. */
1055
1056 void
1057 ix86_split_mmx_punpck (rtx operands[], bool high_p)
1058 {
1059 rtx op0 = operands[0];
1060 rtx op1 = operands[1];
1061 rtx op2 = operands[2];
1062 machine_mode mode = GET_MODE (op0);
1063 rtx mask;
1064 /* The corresponding SSE mode. */
1065 machine_mode sse_mode, double_sse_mode;
1066
1067 switch (mode)
1068 {
1069 case E_V4QImode:
1070 case E_V8QImode:
1071 sse_mode = V16QImode;
1072 double_sse_mode = V32QImode;
1073 mask = gen_rtx_PARALLEL (VOIDmode,
1074 gen_rtvec (16,
1075 GEN_INT (0), GEN_INT (16),
1076 GEN_INT (1), GEN_INT (17),
1077 GEN_INT (2), GEN_INT (18),
1078 GEN_INT (3), GEN_INT (19),
1079 GEN_INT (4), GEN_INT (20),
1080 GEN_INT (5), GEN_INT (21),
1081 GEN_INT (6), GEN_INT (22),
1082 GEN_INT (7), GEN_INT (23)));
1083 break;
1084
1085 case E_V4HImode:
1086 case E_V2HImode:
1087 sse_mode = V8HImode;
1088 double_sse_mode = V16HImode;
1089 mask = gen_rtx_PARALLEL (VOIDmode,
1090 gen_rtvec (8,
1091 GEN_INT (0), GEN_INT (8),
1092 GEN_INT (1), GEN_INT (9),
1093 GEN_INT (2), GEN_INT (10),
1094 GEN_INT (3), GEN_INT (11)));
1095 break;
1096
1097 case E_V2SImode:
1098 sse_mode = V4SImode;
1099 double_sse_mode = V8SImode;
1100 mask = gen_rtx_PARALLEL (VOIDmode,
1101 gen_rtvec (4,
1102 GEN_INT (0), GEN_INT (4),
1103 GEN_INT (1), GEN_INT (5)));
1104 break;
1105
1106 case E_V2SFmode:
1107 sse_mode = V4SFmode;
1108 double_sse_mode = V8SFmode;
1109 mask = gen_rtx_PARALLEL (VOIDmode,
1110 gen_rtvec (4,
1111 GEN_INT (0), GEN_INT (4),
1112 GEN_INT (1), GEN_INT (5)));
1113 break;
1114
1115 default:
1116 gcc_unreachable ();
1117 }
1118
1119 /* Generate SSE punpcklXX. */
1120 rtx dest = lowpart_subreg (sse_mode, op0, GET_MODE (op0));
1121 op1 = lowpart_subreg (sse_mode, op1, GET_MODE (op1));
1122 op2 = lowpart_subreg (sse_mode, op2, GET_MODE (op2));
1123
1124 op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2);
1125 op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask);
1126 rtx insn = gen_rtx_SET (dest, op2);
1127 emit_insn (insn);
1128
1129 /* Move high bits to low bits. */
1130 if (high_p)
1131 {
1132 if (sse_mode == V4SFmode)
1133 {
1134 mask = gen_rtx_PARALLEL (VOIDmode,
1135 gen_rtvec (4, GEN_INT (2), GEN_INT (3),
1136 GEN_INT (4), GEN_INT (5)));
1137 op2 = gen_rtx_VEC_CONCAT (V8SFmode, dest, dest);
1138 op1 = gen_rtx_VEC_SELECT (V4SFmode, op2, mask);
1139 }
1140 else
1141 {
1142 int sz = GET_MODE_SIZE (mode);
1143
1144 if (sz == 4)
1145 mask = gen_rtx_PARALLEL (VOIDmode,
1146 gen_rtvec (4, GEN_INT (1), GEN_INT (0),
1147 GEN_INT (0), GEN_INT (1)));
1148 else if (sz == 8)
1149 mask = gen_rtx_PARALLEL (VOIDmode,
1150 gen_rtvec (4, GEN_INT (2), GEN_INT (3),
1151 GEN_INT (0), GEN_INT (1)));
1152 else
1153 gcc_unreachable ();
1154
1155 dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
1156 op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
1157 }
1158
1159 insn = gen_rtx_SET (dest, op1);
1160 emit_insn (insn);
1161 }
1162 }
1163
1164 /* Helper function of ix86_fixup_binary_operands to canonicalize
1165 operand order. Returns true if the operands should be swapped. */
1166
1167 static bool
1168 ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
1169 rtx operands[])
1170 {
1171 rtx dst = operands[0];
1172 rtx src1 = operands[1];
1173 rtx src2 = operands[2];
1174
1175 /* If the operation is not commutative, we can't do anything. */
1176 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
1177 && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
1178 return false;
1179
1180 /* Highest priority is that src1 should match dst. */
1181 if (rtx_equal_p (dst, src1))
1182 return false;
1183 if (rtx_equal_p (dst, src2))
1184 return true;
1185
1186 /* Next highest priority is that immediate constants come second. */
1187 if (immediate_operand (src2, mode))
1188 return false;
1189 if (immediate_operand (src1, mode))
1190 return true;
1191
1192 /* Lowest priority is that memory references should come second. */
1193 if (MEM_P (src2))
1194 return false;
1195 if (MEM_P (src1))
1196 return true;
1197
1198 return false;
1199 }
1200
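/* For instance, when expanding r0 = mem + r0 for a commutative PLUS the
   operands are swapped so that src1 (r0) matches the destination,
   enabling the two-address "add" form, while r0 = r1 + 42 is left alone
   because the immediate already comes second.  (Operand names
   hypothetical.)  */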
1201
1202 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
1203 destination to use for the operation. If different from the true
1204 destination in operands[0], a copy operation will be required. */
1205
1206 rtx
1207 ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
1208 rtx operands[])
1209 {
1210 rtx dst = operands[0];
1211 rtx src1 = operands[1];
1212 rtx src2 = operands[2];
1213
1214 /* Canonicalize operand order. */
1215 if (ix86_swap_binary_operands_p (code, mode, operands))
1216 {
1217 /* It is invalid to swap operands of different modes. */
1218 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
1219
1220 std::swap (src1, src2);
1221 }
1222
1223 /* Both source operands cannot be in memory. */
1224 if (MEM_P (src1) && MEM_P (src2))
1225 {
1226 /* Optimization: Only read from memory once. */
1227 if (rtx_equal_p (src1, src2))
1228 {
1229 src2 = force_reg (mode, src2);
1230 src1 = src2;
1231 }
1232 else if (rtx_equal_p (dst, src1))
1233 src2 = force_reg (mode, src2);
1234 else
1235 src1 = force_reg (mode, src1);
1236 }
1237
1238 /* If the destination is memory, and we do not have matching source
1239 operands, do things in registers. */
1240 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
1241 dst = gen_reg_rtx (mode);
1242
1243 /* Source 1 cannot be a constant. */
1244 if (CONSTANT_P (src1))
1245 src1 = force_reg (mode, src1);
1246
1247 /* Source 1 cannot be a non-matching memory. */
1248 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
1249 src1 = force_reg (mode, src1);
1250
1251 /* Improve address combine. */
1252 if (code == PLUS
1253 && GET_MODE_CLASS (mode) == MODE_INT
1254 && MEM_P (src2))
1255 src2 = force_reg (mode, src2);
1256
1257 operands[1] = src1;
1258 operands[2] = src2;
1259 return dst;
1260 }
1261
1262 /* Similarly, but assume that the destination has already been
1263 set up properly. */
1264
1265 void
1266 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
1267 machine_mode mode, rtx operands[])
1268 {
1269 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
1270 gcc_assert (dst == operands[0]);
1271 }
1272
1273 /* Attempt to expand a binary operator. Make the expansion closer to the
1274 actual machine than just general_operand, which would allow 3 separate
1275 memory references (one output, two input) in a single insn. */
1276
1277 void
1278 ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
1279 rtx operands[])
1280 {
1281 rtx src1, src2, dst, op, clob;
1282
1283 dst = ix86_fixup_binary_operands (code, mode, operands);
1284 src1 = operands[1];
1285 src2 = operands[2];
1286
1287 /* Emit the instruction. */
1288
1289 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
1290
1291 if (reload_completed
1292 && code == PLUS
1293 && !rtx_equal_p (dst, src1))
1294 {
1295 /* This is going to be an LEA; avoid splitting it later. */
1296 emit_insn (op);
1297 }
1298 else
1299 {
1300 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1301 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1302 }
1303
1304 /* Fix up the destination if needed. */
1305 if (dst != operands[0])
1306 emit_move_insn (operands[0], dst);
1307 }
1308
1309 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
1310 the given OPERANDS. */
1311
1312 void
1313 ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
1314 rtx operands[])
1315 {
1316 rtx op1 = NULL_RTX, op2 = NULL_RTX;
1317 if (SUBREG_P (operands[1]))
1318 {
1319 op1 = operands[1];
1320 op2 = operands[2];
1321 }
1322 else if (SUBREG_P (operands[2]))
1323 {
1324 op1 = operands[2];
1325 op2 = operands[1];
1326 }
1327 /* Optimize (__m128i) d | (__m128i) e and similar code
1328 when d and e are float vectors into float vector logical
1329 insn. In C/C++ without using intrinsics there is no other way
1330 to express a vector logical operation on float vectors than
1331 to cast them temporarily to integer vectors. */
1332 if (op1
1333 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
1334 && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
1335 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
1336 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
1337 && SUBREG_BYTE (op1) == 0
1338 && (GET_CODE (op2) == CONST_VECTOR
1339 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
1340 && SUBREG_BYTE (op2) == 0))
1341 && can_create_pseudo_p ())
1342 {
1343 rtx dst;
1344 switch (GET_MODE (SUBREG_REG (op1)))
1345 {
1346 case E_V4SFmode:
1347 case E_V8SFmode:
1348 case E_V16SFmode:
1349 case E_V2DFmode:
1350 case E_V4DFmode:
1351 case E_V8DFmode:
1352 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
1353 if (GET_CODE (op2) == CONST_VECTOR)
1354 {
1355 op2 = gen_lowpart (GET_MODE (dst), op2);
1356 op2 = force_reg (GET_MODE (dst), op2);
1357 }
1358 else
1359 {
1360 op1 = operands[1];
1361 op2 = SUBREG_REG (operands[2]);
1362 if (!vector_operand (op2, GET_MODE (dst)))
1363 op2 = force_reg (GET_MODE (dst), op2);
1364 }
1365 op1 = SUBREG_REG (op1);
1366 if (!vector_operand (op1, GET_MODE (dst)))
1367 op1 = force_reg (GET_MODE (dst), op1);
1368 emit_insn (gen_rtx_SET (dst,
1369 gen_rtx_fmt_ee (code, GET_MODE (dst),
1370 op1, op2)));
1371 emit_move_insn (operands[0], gen_lowpart (mode, dst));
1372 return;
1373 default:
1374 break;
1375 }
1376 }
1377 if (!vector_operand (operands[1], mode))
1378 operands[1] = force_reg (mode, operands[1]);
1379 if (!vector_operand (operands[2], mode))
1380 operands[2] = force_reg (mode, operands[2]);
1381 ix86_fixup_binary_operands_no_copy (code, mode, operands);
1382 emit_insn (gen_rtx_SET (operands[0],
1383 gen_rtx_fmt_ee (code, mode, operands[1],
1384 operands[2])));
1385 }
1386
1387 /* Return TRUE or FALSE depending on whether the binary operator meets the
1388 appropriate constraints. */
1389
1390 bool
1391 ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
1392 rtx operands[3])
1393 {
1394 rtx dst = operands[0];
1395 rtx src1 = operands[1];
1396 rtx src2 = operands[2];
1397
1398 /* Both source operands cannot be in memory. */
1399 if ((MEM_P (src1) || bcst_mem_operand (src1, mode))
1400 && (MEM_P (src2) || bcst_mem_operand (src2, mode)))
1401 return false;
1402
1403 /* Canonicalize operand order for commutative operators. */
1404 if (ix86_swap_binary_operands_p (code, mode, operands))
1405 std::swap (src1, src2);
1406
1407 /* If the destination is memory, we must have a matching source operand. */
1408 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
1409 return false;
1410
1411 /* Source 1 cannot be a constant. */
1412 if (CONSTANT_P (src1))
1413 return false;
1414
1415 /* Source 1 cannot be a non-matching memory. */
1416 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
1417 /* Support "andhi/andsi/anddi" as a zero-extending move. */
1418 return (code == AND
1419 && (mode == HImode
1420 || mode == SImode
1421 || (TARGET_64BIT && mode == DImode))
1422 && satisfies_constraint_L (src2));
1423
1424 return true;
1425 }
1426
1427 /* Attempt to expand a unary operator. Make the expansion closer to the
1428 actual machine than just general_operand, which would allow 2 separate
1429 memory references (one output, one input) in a single insn. */
1430
1431 void
1432 ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
1433 rtx operands[])
1434 {
1435 bool matching_memory = false;
1436 rtx src, dst, op, clob;
1437
1438 dst = operands[0];
1439 src = operands[1];
1440
1441 /* If the destination is memory, and we do not have matching source
1442 operands, do things in registers. */
1443 if (MEM_P (dst))
1444 {
1445 if (rtx_equal_p (dst, src))
1446 matching_memory = true;
1447 else
1448 dst = gen_reg_rtx (mode);
1449 }
1450
1451 /* When source operand is memory, destination must match. */
1452 if (MEM_P (src) && !matching_memory)
1453 src = force_reg (mode, src);
1454
1455 /* Emit the instruction. */
1456
1457 op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
1458
1459 if (code == NOT)
1460 emit_insn (op);
1461 else
1462 {
1463 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1464 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1465 }
1466
1467 /* Fix up the destination if needed. */
1468 if (dst != operands[0])
1469 emit_move_insn (operands[0], dst);
1470 }
1471
1472 /* Predict just emitted jump instruction to be taken with probability PROB. */
1473
1474 static void
1475 predict_jump (int prob)
1476 {
1477 rtx_insn *insn = get_last_insn ();
1478 gcc_assert (JUMP_P (insn));
1479 add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
1480 }
1481
1482 /* Split 32-bit/64-bit divmod with 8-bit unsigned divmod if the dividend and
1483 divisor are both within the range [0-255]. */
1484
1485 void
1486 ix86_split_idivmod (machine_mode mode, rtx operands[],
1487 bool unsigned_p)
1488 {
1489 rtx_code_label *end_label, *qimode_label;
1490 rtx div, mod;
1491 rtx_insn *insn;
1492 rtx scratch, tmp0, tmp1, tmp2;
1493 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
1494
1495 operands[2] = force_reg (mode, operands[2]);
1496 operands[3] = force_reg (mode, operands[3]);
1497
1498 switch (mode)
1499 {
1500 case E_SImode:
1501 if (GET_MODE (operands[0]) == SImode)
1502 {
1503 if (GET_MODE (operands[1]) == SImode)
1504 gen_divmod4_1 = unsigned_p ? gen_udivmodsi4_1 : gen_divmodsi4_1;
1505 else
1506 gen_divmod4_1
1507 = unsigned_p ? gen_udivmodsi4_zext_2 : gen_divmodsi4_zext_2;
1508 }
1509 else
1510 gen_divmod4_1
1511 = unsigned_p ? gen_udivmodsi4_zext_1 : gen_divmodsi4_zext_1;
1512 break;
1513
1514 case E_DImode:
1515 gen_divmod4_1 = unsigned_p ? gen_udivmoddi4_1 : gen_divmoddi4_1;
1516 break;
1517
1518 default:
1519 gcc_unreachable ();
1520 }
1521
1522 end_label = gen_label_rtx ();
1523 qimode_label = gen_label_rtx ();
1524
1525 scratch = gen_reg_rtx (mode);
1526
1527 /* Use 8-bit unsigned divmod if the dividend and divisor are within
1528 the range [0-255]. */
1529 emit_move_insn (scratch, operands[2]);
1530 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
1531 scratch, 1, OPTAB_DIRECT);
1532 emit_insn (gen_test_ccno_1 (mode, scratch, GEN_INT (-0x100)));
1533 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
1534 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
1535 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
1536 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
1537 pc_rtx);
1538 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
1539 predict_jump (REG_BR_PROB_BASE * 50 / 100);
1540 JUMP_LABEL (insn) = qimode_label;
1541
1542 /* Generate the original signed/unsigned divmod. */
1543 emit_insn (gen_divmod4_1 (operands[0], operands[1],
1544 operands[2], operands[3]));
1545
1546 /* Branch to the end. */
1547 emit_jump_insn (gen_jump (end_label));
1548 emit_barrier ();
1549
1550 /* Generate 8bit unsigned divide. */
1551 emit_label (qimode_label);
1552 /* Don't use operands[0] for the result of the 8-bit divide since not all
1553 registers support QImode ZERO_EXTRACT. */
1554 tmp0 = lowpart_subreg (HImode, scratch, mode);
1555 tmp1 = lowpart_subreg (HImode, operands[2], mode);
1556 tmp2 = lowpart_subreg (QImode, operands[3], mode);
1557 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
1558
1559 if (unsigned_p)
1560 {
1561 div = gen_rtx_UDIV (mode, operands[2], operands[3]);
1562 mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
1563 }
1564 else
1565 {
1566 div = gen_rtx_DIV (mode, operands[2], operands[3]);
1567 mod = gen_rtx_MOD (mode, operands[2], operands[3]);
1568 }
1569 if (mode == SImode)
1570 {
1571 if (GET_MODE (operands[0]) != SImode)
1572 div = gen_rtx_ZERO_EXTEND (DImode, div);
1573 if (GET_MODE (operands[1]) != SImode)
1574 mod = gen_rtx_ZERO_EXTEND (DImode, mod);
1575 }
1576
1577 /* Extract remainder from AH. */
1578 scratch = gen_lowpart (GET_MODE (operands[1]), scratch);
1579 tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]), scratch,
1580 GEN_INT (8), GEN_INT (8));
1581 insn = emit_move_insn (operands[1], tmp1);
1582 set_unique_reg_note (insn, REG_EQUAL, mod);
1583
1584 /* Zero extend quotient from AL. */
1585 tmp1 = gen_lowpart (QImode, tmp0);
1586 insn = emit_insn (gen_extend_insn
1587 (operands[0], tmp1,
1588 GET_MODE (operands[0]), QImode, 1));
1589 set_unique_reg_note (insn, REG_EQUAL, div);
1590
1591 emit_label (end_label);
1592 }
1593
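/* Sketch of the sequence emitted above for a 32-bit unsigned division
   (register names and labels are illustrative):

       movl   %esi, %ecx        # scratch = dividend
       orl    %edi, %ecx        # scratch |= divisor
       testl  $-256, %ecx
       je     .Lqimode          # both operands fit in 8 bits
       ...                      # full 32-bit divl, then
       jmp    .Ldone
   .Lqimode:
       ...                      # 8-bit divb: quotient in %al,
                                # remainder in %ah
   .Ldone:  */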
1594 /* Emit the x86 binary operator CODE in mode MODE, where the first operand
1595 matches the destination. The RTX includes a clobber of FLAGS_REG. */
1596
1597 void
1598 ix86_emit_binop (enum rtx_code code, machine_mode mode,
1599 rtx dst, rtx src)
1600 {
1601 rtx op, clob;
1602
1603 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
1604 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1605
1606 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1607 }
1608
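/* A minimal usage sketch: ix86_emit_binop (PLUS, SImode, target,
   GEN_INT (8)), with TARGET some SImode register rtx, emits

     (parallel [(set target (plus:SI target (const_int 8)))
                (clobber (reg:CC flags))])  */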
1609 /* Return true if the definition of REGNO1 is nearer to INSN than that of REGNO2. */
1610
1611 static bool
1612 find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
1613 {
1614 rtx_insn *prev = insn;
1615 rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
1616
1617 if (insn == start)
1618 return false;
1619 while (prev && prev != start)
1620 {
1621 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
1622 {
1623 prev = PREV_INSN (prev);
1624 continue;
1625 }
1626 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
1627 return true;
1628 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
1629 return false;
1630 prev = PREV_INSN (prev);
1631 }
1632
1633 /* None of the regs is defined in the bb. */
1634 return false;
1635 }
1636
1637 /* INSN_UID of the last insn emitted by zero store peephole2s. */
1638 int ix86_last_zero_store_uid;
1639
1640 /* Split lea instructions into a sequence of instructions
1641 which are executed on the ALU to avoid AGU stalls.
1642 It is assumed that clobbering the flags register is allowed
1643 at the lea position. */
1644
1645 void
1646 ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
1647 {
1648 unsigned int regno0, regno1, regno2;
1649 struct ix86_address parts;
1650 rtx target, tmp;
1651 int ok, adds;
1652
1653 ok = ix86_decompose_address (operands[1], &parts);
1654 gcc_assert (ok);
1655
1656 target = gen_lowpart (mode, operands[0]);
1657
1658 regno0 = true_regnum (target);
1659 regno1 = INVALID_REGNUM;
1660 regno2 = INVALID_REGNUM;
1661
1662 if (parts.base)
1663 {
1664 parts.base = gen_lowpart (mode, parts.base);
1665 regno1 = true_regnum (parts.base);
1666 }
1667
1668 if (parts.index)
1669 {
1670 parts.index = gen_lowpart (mode, parts.index);
1671 regno2 = true_regnum (parts.index);
1672 }
1673
1674 if (parts.disp)
1675 parts.disp = gen_lowpart (mode, parts.disp);
1676
1677 if (parts.scale > 1)
1678 {
1679 /* Case r1 = r1 + ... */
1680 if (regno1 == regno0)
1681 {
1682 /* In the case r1 = r1 + C * r2 we would have
1683 to use multiplication, which is very
1684 expensive. Assume the cost model is wrong if we
1685 have such a case here. */
1686 gcc_assert (regno2 != regno0);
1687
1688 for (adds = parts.scale; adds > 0; adds--)
1689 ix86_emit_binop (PLUS, mode, target, parts.index);
1690 }
1691 else
1692 {
1693 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
1694 if (regno0 != regno2)
1695 emit_insn (gen_rtx_SET (target, parts.index));
1696
1697 /* Use shift for scaling, but emit it as MULT instead
1698 to avoid it being immediately peephole2 optimized back
1699 into lea. */
1700 ix86_emit_binop (MULT, mode, target, GEN_INT (parts.scale));
1701
1702 if (parts.base)
1703 ix86_emit_binop (PLUS, mode, target, parts.base);
1704
1705 if (parts.disp && parts.disp != const0_rtx)
1706 ix86_emit_binop (PLUS, mode, target, parts.disp);
1707 }
1708 }
1709 else if (!parts.base && !parts.index)
1710 {
1711 gcc_assert (parts.disp);
1712 emit_insn (gen_rtx_SET (target, parts.disp));
1713 }
1714 else
1715 {
1716 if (!parts.base)
1717 {
1718 if (regno0 != regno2)
1719 emit_insn (gen_rtx_SET (target, parts.index));
1720 }
1721 else if (!parts.index)
1722 {
1723 if (regno0 != regno1)
1724 emit_insn (gen_rtx_SET (target, parts.base));
1725 }
1726 else
1727 {
1728 if (regno0 == regno1)
1729 tmp = parts.index;
1730 else if (regno0 == regno2)
1731 tmp = parts.base;
1732 else
1733 {
1734 rtx tmp1;
1735
1736 /* Find better operand for SET instruction, depending
1737 on which definition is farther from the insn. */
1738 if (find_nearest_reg_def (insn, regno1, regno2))
1739 tmp = parts.index, tmp1 = parts.base;
1740 else
1741 tmp = parts.base, tmp1 = parts.index;
1742
1743 emit_insn (gen_rtx_SET (target, tmp));
1744
1745 if (parts.disp && parts.disp != const0_rtx)
1746 ix86_emit_binop (PLUS, mode, target, parts.disp);
1747
1748 ix86_emit_binop (PLUS, mode, target, tmp1);
1749 return;
1750 }
1751
1752 ix86_emit_binop (PLUS, mode, target, tmp);
1753 }
1754
1755 if (parts.disp && parts.disp != const0_rtx)
1756 ix86_emit_binop (PLUS, mode, target, parts.disp);
1757 }
1758 }
1759
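/* For example, "lea 4(%rbx,%rcx,4), %rax" can be rewritten by the
   splitter above into ALU-only instructions, roughly

     movq  %rcx, %rax
     salq  $2, %rax      # emitted as MULT, turned into the shift later
     addq  %rbx, %rax
     addq  $4, %rax

   while "lea (%rax,%rcx,2), %rax" simply becomes two "addq %rcx, %rax"
   instructions.  (Register choices are illustrative.)  */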
1760 /* Post-reload splitter for converting an SF or DFmode value in an
1761 SSE register into an unsigned SImode value. */
1762
1763 void
1764 ix86_split_convert_uns_si_sse (rtx operands[])
1765 {
1766 machine_mode vecmode;
1767 rtx value, large, zero_or_two31, input, two31, x;
1768
1769 large = operands[1];
1770 zero_or_two31 = operands[2];
1771 input = operands[3];
1772 two31 = operands[4];
1773 vecmode = GET_MODE (large);
1774 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
1775
1776 /* Load up the value into the low element. We must ensure that the other
1777 elements are valid floats -- zero is the easiest such value. */
1778 if (MEM_P (input))
1779 {
1780 if (vecmode == V4SFmode)
1781 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
1782 else
1783 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
1784 }
1785 else
1786 {
1787 input = gen_rtx_REG (vecmode, REGNO (input));
1788 emit_move_insn (value, CONST0_RTX (vecmode));
1789 if (vecmode == V4SFmode)
1790 emit_insn (gen_sse_movss_v4sf (value, value, input));
1791 else
1792 emit_insn (gen_sse2_movsd_v2df (value, value, input));
1793 }
1794
1795 emit_move_insn (large, two31);
1796 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
1797
1798 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
1799 emit_insn (gen_rtx_SET (large, x));
1800
1801 x = gen_rtx_AND (vecmode, zero_or_two31, large);
1802 emit_insn (gen_rtx_SET (zero_or_two31, x));
1803
1804 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
1805 emit_insn (gen_rtx_SET (value, x));
1806
1807 large = gen_rtx_REG (V4SImode, REGNO (large));
1808 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
1809
1810 x = gen_rtx_REG (V4SImode, REGNO (value));
1811 if (vecmode == V4SFmode)
1812 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
1813 else
1814 emit_insn (gen_sse2_cvttpd2dq (x, value));
1815 value = x;
1816
1817 emit_insn (gen_xorv4si3 (value, value, large));
1818 }
1819
1820 static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
1821 machine_mode mode, rtx target,
1822 rtx var, int one_var);
1823
1824 /* Convert an unsigned DImode value into a DFmode, using only SSE.
1825 Expects the 64-bit DImode to be supplied in a pair of integral
1826 registers. Requires SSE2; will use SSE3 if available. For x86_32,
1827 -mfpmath=sse, !optimize_size only. */
1828
1829 void
1830 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
1831 {
1832 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
1833 rtx int_xmm, fp_xmm;
1834 rtx biases, exponents;
1835 rtx x;
1836
1837 int_xmm = gen_reg_rtx (V4SImode);
1838 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
1839 emit_insn (gen_movdi_to_sse (int_xmm, input));
1840 else if (TARGET_SSE_SPLIT_REGS)
1841 {
1842 emit_clobber (int_xmm);
1843 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
1844 }
1845 else
1846 {
1847 x = gen_reg_rtx (V2DImode);
1848 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
1849 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
1850 }
1851
1852 x = gen_rtx_CONST_VECTOR (V4SImode,
1853 gen_rtvec (4, GEN_INT (0x43300000UL),
1854 GEN_INT (0x45300000UL),
1855 const0_rtx, const0_rtx));
1856 exponents = validize_mem (force_const_mem (V4SImode, x));
1857
1858 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
1859 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
1860
1861 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
1862 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
1863 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
1864 (0x1.0p84 + double(fp_value_hi_xmm)).
1865 Note these exponents differ by 32. */
1866
1867 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
1868
1869 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
1870 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
1871 real_ldexp (&bias_lo_rvt, &dconst1, 52);
1872 real_ldexp (&bias_hi_rvt, &dconst1, 84);
1873 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
1874 x = const_double_from_real_value (bias_hi_rvt, DFmode);
1875 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
1876 biases = validize_mem (force_const_mem (V2DFmode, biases));
1877 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
1878
1879 /* Add the upper and lower DFmode values together. */
1880 if (TARGET_SSE3)
1881 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
1882 else
1883 {
1884 x = copy_to_mode_reg (V2DFmode, fp_xmm);
1885 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
1886 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
1887 }
1888
1889 ix86_expand_vector_extract (false, target, fp_xmm, 0);
1890 }
1891
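/* Worked example of the bias trick above: for the 64-bit input
   u = 0x100000003 (2^32 + 3) the interleave produces the doubles
   2^52 + 3 and 2^84 + 2^32; subtracting the 2^52 and 2^84 biases leaves
   3.0 and 4294967296.0, and the final horizontal add yields
   4294967299.0 == (double) u.  */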
1892 /* Not used, but eases macroization of patterns. */
1893 void
1894 ix86_expand_convert_uns_sixf_sse (rtx, rtx)
1895 {
1896 gcc_unreachable ();
1897 }
1898
1899 static rtx ix86_expand_sse_fabs (rtx op0, rtx *smask);
1900
1901 /* Convert an unsigned SImode value into a DFmode. Only currently used
1902 for SSE, but applicable anywhere. */
1903
1904 void
1905 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
1906 {
1907 REAL_VALUE_TYPE TWO31r;
1908 rtx x, fp;
1909
1910 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
1911 NULL, 1, OPTAB_DIRECT);
1912
1913 fp = gen_reg_rtx (DFmode);
1914 emit_insn (gen_floatsidf2 (fp, x));
1915
1916 real_ldexp (&TWO31r, &dconst1, 31);
1917 x = const_double_from_real_value (TWO31r, DFmode);
1918
1919 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
1920
1921 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
1922 if (HONOR_SIGNED_ZEROS (DFmode) && flag_rounding_math)
1923 x = ix86_expand_sse_fabs (x, NULL);
1924
1925 if (x != target)
1926 emit_move_insn (target, x);
1927 }
1928
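/* Worked example for the conversion above: for input 0xfffffffe the
   PLUS of -2^31 gives the signed value 0x7ffffffe (2147483646), which
   converts exactly to 2147483646.0; adding back 2^31 yields
   4294967294.0, the correct unsigned result.  */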
1929 /* Convert a signed DImode value into a DFmode. Only used for SSE in
1930 32-bit mode; otherwise we have a direct convert instruction. */
1931
1932 void
1933 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
1934 {
1935 REAL_VALUE_TYPE TWO32r;
1936 rtx fp_lo, fp_hi, x;
1937
1938 fp_lo = gen_reg_rtx (DFmode);
1939 fp_hi = gen_reg_rtx (DFmode);
1940
1941 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
1942
1943 real_ldexp (&TWO32r, &dconst1, 32);
1944 x = const_double_from_real_value (TWO32r, DFmode);
1945 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
1946
1947 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
1948
1949 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
1950 0, OPTAB_DIRECT);
1951 if (x != target)
1952 emit_move_insn (target, x);
1953 }
1954
1955 /* Convert an unsigned SImode value into a SFmode, using only SSE.
1956 For x86_32, -mfpmath=sse, !optimize_size only. */
1957 void
1958 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
1959 {
1960 REAL_VALUE_TYPE ONE16r;
1961 rtx fp_hi, fp_lo, int_hi, int_lo, x;
1962
1963 real_ldexp (&ONE16r, &dconst1, 16);
1964 x = const_double_from_real_value (ONE16r, SFmode);
1965 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
1966 NULL, 0, OPTAB_DIRECT);
1967 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
1968 NULL, 0, OPTAB_DIRECT);
1969 fp_hi = gen_reg_rtx (SFmode);
1970 fp_lo = gen_reg_rtx (SFmode);
1971 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
1972 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
1973 if (TARGET_FMA)
1974 {
1975 x = validize_mem (force_const_mem (SFmode, x));
1976 fp_hi = gen_rtx_FMA (SFmode, fp_hi, x, fp_lo);
1977 emit_move_insn (target, fp_hi);
1978 }
1979 else
1980 {
1981 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
1982 0, OPTAB_DIRECT);
1983 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
1984 0, OPTAB_DIRECT);
1985 if (!rtx_equal_p (target, fp_hi))
1986 emit_move_insn (target, fp_hi);
1987 }
1988 }
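
/* Illustrative sketch, not part of the original file: the 16-bit split used
   above, in scalar C with a hypothetical helper name.  Both halves convert
   to float exactly and the scaling by 2^16 is exact, so the final addition
   is the only rounding; the TARGET_FMA path merely fuses that multiply and
   add into a single instruction.  */

static float
uns32_to_float_sketch (uint32_t x)
{
  float fp_hi = (float) (x >> 16);	/* exact, at most 16 significant bits */
  float fp_lo = (float) (x & 0xffff);	/* exact */
  return fp_hi * 65536.0f + fp_lo;	/* fp_hi * 2^16 + fp_lo */
}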
1989
1990 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
1991 a vector of unsigned ints VAL to vector of floats TARGET. */
1992
1993 void
1994 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
1995 {
1996 rtx tmp[8];
1997 REAL_VALUE_TYPE TWO16r;
1998 machine_mode intmode = GET_MODE (val);
1999 machine_mode fltmode = GET_MODE (target);
2000 rtx (*cvt) (rtx, rtx);
2001
2002 if (intmode == V4SImode)
2003 cvt = gen_floatv4siv4sf2;
2004 else
2005 cvt = gen_floatv8siv8sf2;
2006 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
2007 tmp[0] = force_reg (intmode, tmp[0]);
2008 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
2009 OPTAB_DIRECT);
2010 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
2011 NULL_RTX, 1, OPTAB_DIRECT);
2012 tmp[3] = gen_reg_rtx (fltmode);
2013 emit_insn (cvt (tmp[3], tmp[1]));
2014 tmp[4] = gen_reg_rtx (fltmode);
2015 emit_insn (cvt (tmp[4], tmp[2]));
2016 real_ldexp (&TWO16r, &dconst1, 16);
2017 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
2018 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
2019 if (TARGET_FMA)
2020 {
2021 tmp[6] = gen_rtx_FMA (fltmode, tmp[4], tmp[5], tmp[3]);
2022 emit_move_insn (target, tmp[6]);
2023 }
2024 else
2025 {
2026 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5],
2027 NULL_RTX, 1, OPTAB_DIRECT);
2028 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6],
2029 target, 1, OPTAB_DIRECT);
2030 if (tmp[7] != target)
2031 emit_move_insn (target, tmp[7]);
2032 }
2033 }
2034
2035 /* Adjust a V*SFmode/V*DFmode value VAL so that the *sfix_trunc* resp.
2036    fix_trunc* pattern can be used on it instead of fixuns_trunc*.
2037    This is done by using just the signed conversion if < 0x1p31, and otherwise
2038    by subtracting 0x1p31 first and XORing in 0x80000000 from *XORP afterwards. */
2039
2040 rtx
2041 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
2042 {
2043 REAL_VALUE_TYPE TWO31r;
2044 rtx two31r, tmp[4];
2045 machine_mode mode = GET_MODE (val);
2046 machine_mode scalarmode = GET_MODE_INNER (mode);
2047 machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
2048 rtx (*cmp) (rtx, rtx, rtx, rtx);
2049 int i;
2050
2051 for (i = 0; i < 3; i++)
2052 tmp[i] = gen_reg_rtx (mode);
2053 real_ldexp (&TWO31r, &dconst1, 31);
2054 two31r = const_double_from_real_value (TWO31r, scalarmode);
2055 two31r = ix86_build_const_vector (mode, 1, two31r);
2056 two31r = force_reg (mode, two31r);
2057 switch (mode)
2058 {
2059 case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
2060 case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
2061 case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
2062 case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
2063 default: gcc_unreachable ();
2064 }
2065 tmp[3] = gen_rtx_LE (mode, two31r, val);
2066 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
2067 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
2068 0, OPTAB_DIRECT);
2069 if (intmode == V4SImode || TARGET_AVX2)
2070 *xorp = expand_simple_binop (intmode, ASHIFT,
2071 gen_lowpart (intmode, tmp[0]),
2072 GEN_INT (31), NULL_RTX, 0,
2073 OPTAB_DIRECT);
2074 else
2075 {
2076 rtx two31 = gen_int_mode (HOST_WIDE_INT_1U << 31, SImode);
2077 two31 = ix86_build_const_vector (intmode, 1, two31);
2078 *xorp = expand_simple_binop (intmode, AND,
2079 gen_lowpart (intmode, tmp[0]),
2080 two31, NULL_RTX, 0,
2081 OPTAB_DIRECT);
2082 }
2083 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
2084 0, OPTAB_DIRECT);
2085 }
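
/* Illustrative sketch, not part of the original file: the scalar idea behind
   the adjustment above, with a hypothetical helper name.  Values below 2^31
   truncate directly with the signed conversion; larger values are reduced by
   2^31 first and the caller adds the sign bit back by XORing in 0x80000000.
   The real expansion is branchless: the mask compare selects 0 or 2^31 per
   element instead of the if below.  */

static uint32_t
uns_fix_trunc_sketch (double val)
{
  const double two31 = 2147483648.0;
  if (val < two31)
    return (uint32_t) (int32_t) val;			/* plain fix_trunc */
  return (uint32_t) (int32_t) (val - two31) ^ 0x80000000u;
}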
2086
2087 /* Generate code for floating point ABS or NEG. */
2088
2089 void
2090 ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
2091 rtx operands[])
2092 {
2093 rtx set, dst, src;
2094 bool use_sse = false;
2095 bool vector_mode = VECTOR_MODE_P (mode);
2096 machine_mode vmode = mode;
2097 rtvec par;
2098
2099 if (vector_mode || mode == TFmode || mode == HFmode)
2100 {
2101 use_sse = true;
2102 if (mode == HFmode)
2103 vmode = V8HFmode;
2104 }
2105 else if (TARGET_SSE_MATH)
2106 {
2107 use_sse = SSE_FLOAT_MODE_P (mode);
2108 if (mode == SFmode)
2109 vmode = V4SFmode;
2110 else if (mode == DFmode)
2111 vmode = V2DFmode;
2112 }
2113
2114 dst = operands[0];
2115 src = operands[1];
2116
2117 set = gen_rtx_fmt_e (code, mode, src);
2118 set = gen_rtx_SET (dst, set);
2119
2120 if (use_sse)
2121 {
2122 rtx mask, use, clob;
2123
2124 /* NEG and ABS performed with SSE use bitwise mask operations.
2125 Create the appropriate mask now. */
2126 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
2127 use = gen_rtx_USE (VOIDmode, mask);
2128 if (vector_mode || mode == TFmode)
2129 par = gen_rtvec (2, set, use);
2130 else
2131 {
2132 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
2133 par = gen_rtvec (3, set, use, clob);
2134 }
2135 }
2136 else
2137 {
2138 rtx clob;
2139
2140       /* Changing the sign of FP values can also be done with the integer unit. */
2141 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
2142 par = gen_rtvec (2, set, clob);
2143 }
2144
2145 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
2146 }
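
/* Illustrative sketch, not part of the original file: the bitwise form of
   ABS and NEG that the SSE path above emits, shown for a scalar float with
   a hypothetical helper name.  ix86_build_signbit_mask provides the
   corresponding vector constant.  */

static float
fabs_bits_sketch (float x)
{
  uint32_t bits;
  memcpy (&bits, &x, sizeof bits);
  bits &= 0x7fffffffu;		/* ABS clears the sign bit; NEG would
				   instead use bits ^= 0x80000000u.  */
  memcpy (&x, &bits, sizeof x);
  return x;
}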
2147
2148 /* Deconstruct a floating point ABS or NEG operation
2149 with integer registers into integer operations. */
2150
2151 void
2152 ix86_split_fp_absneg_operator (enum rtx_code code, machine_mode mode,
2153 rtx operands[])
2154 {
2155 enum rtx_code absneg_op;
2156 rtx dst, set;
2157
2158 gcc_assert (operands_match_p (operands[0], operands[1]));
2159
2160 switch (mode)
2161 {
2162 case E_SFmode:
2163 dst = gen_lowpart (SImode, operands[0]);
2164
2165 if (code == ABS)
2166 {
2167 set = gen_int_mode (0x7fffffff, SImode);
2168 absneg_op = AND;
2169 }
2170 else
2171 {
2172 set = gen_int_mode (0x80000000, SImode);
2173 absneg_op = XOR;
2174 }
2175 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
2176 break;
2177
2178 case E_DFmode:
2179 if (TARGET_64BIT)
2180 {
2181 dst = gen_lowpart (DImode, operands[0]);
2182 dst = gen_rtx_ZERO_EXTRACT (DImode, dst, const1_rtx, GEN_INT (63));
2183
2184 if (code == ABS)
2185 set = const0_rtx;
2186 else
2187 set = gen_rtx_NOT (DImode, dst);
2188 }
2189 else
2190 {
2191 dst = gen_highpart (SImode, operands[0]);
2192
2193 if (code == ABS)
2194 {
2195 set = gen_int_mode (0x7fffffff, SImode);
2196 absneg_op = AND;
2197 }
2198 else
2199 {
2200 set = gen_int_mode (0x80000000, SImode);
2201 absneg_op = XOR;
2202 }
2203 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
2204 }
2205 break;
2206
2207 case E_XFmode:
2208 dst = gen_rtx_REG (SImode,
2209 REGNO (operands[0]) + (TARGET_64BIT ? 1 : 2));
2210 if (code == ABS)
2211 {
2212 set = GEN_INT (0x7fff);
2213 absneg_op = AND;
2214 }
2215 else
2216 {
2217 set = GEN_INT (0x8000);
2218 absneg_op = XOR;
2219 }
2220 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
2221 break;
2222
2223 default:
2224 gcc_unreachable ();
2225 }
2226
2227 set = gen_rtx_SET (dst, set);
2228
2229 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
2230 rtvec par = gen_rtvec (2, set, clob);
2231
2232 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
2233 }
2234
2235 /* Expand a copysign operation. Special case operand 0 being a constant. */
2236
2237 void
2238 ix86_expand_copysign (rtx operands[])
2239 {
2240 machine_mode mode, vmode;
2241 rtx dest, vdest, op0, op1, mask, op2, op3;
2242
2243 mode = GET_MODE (operands[0]);
2244
2245 if (mode == HFmode)
2246 vmode = V8HFmode;
2247 else if (mode == SFmode)
2248 vmode = V4SFmode;
2249 else if (mode == DFmode)
2250 vmode = V2DFmode;
2251 else if (mode == TFmode)
2252 vmode = mode;
2253 else
2254 gcc_unreachable ();
2255
2256 if (rtx_equal_p (operands[1], operands[2]))
2257 {
2258 emit_move_insn (operands[0], operands[1]);
2259 return;
2260 }
2261
2262 dest = operands[0];
2263 vdest = lowpart_subreg (vmode, dest, mode);
2264 if (vdest == NULL_RTX)
2265 vdest = gen_reg_rtx (vmode);
2266 else
2267 dest = NULL_RTX;
2268 op1 = lowpart_subreg (vmode, force_reg (mode, operands[2]), mode);
2269 mask = ix86_build_signbit_mask (vmode, 0, 0);
2270
2271 if (CONST_DOUBLE_P (operands[1]))
2272 {
2273 op0 = simplify_unary_operation (ABS, mode, operands[1], mode);
2274 /* Optimize for 0, simplify b = copy_signf (0.0f, a) to b = mask & a. */
2275 if (op0 == CONST0_RTX (mode))
2276 {
2277 emit_move_insn (vdest, gen_rtx_AND (vmode, mask, op1));
2278 if (dest)
2279 emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
2280 return;
2281 }
2282
2283 if (GET_MODE_SIZE (mode) < 16)
2284 op0 = ix86_build_const_vector (vmode, false, op0);
2285 op0 = force_reg (vmode, op0);
2286 }
2287 else
2288 op0 = lowpart_subreg (vmode, force_reg (mode, operands[1]), mode);
2289
2290 op2 = gen_reg_rtx (vmode);
2291 op3 = gen_reg_rtx (vmode);
2292 emit_move_insn (op2, gen_rtx_AND (vmode,
2293 gen_rtx_NOT (vmode, mask),
2294 op0));
2295 emit_move_insn (op3, gen_rtx_AND (vmode, mask, op1));
2296 emit_move_insn (vdest, gen_rtx_IOR (vmode, op2, op3));
2297 if (dest)
2298 emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
2299 }
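
/* Illustrative sketch, not part of the original file: the mask form of
   copysign expanded above, for a scalar float with a hypothetical helper
   name.  MASK plays the role of the ix86_build_signbit_mask constant; when
   |operands[1]| is known to be +0.0 only the second AND is needed, which is
   the CONST_DOUBLE shortcut above.  */

static float
copysign_bits_sketch (float mag, float sgn)
{
  const uint32_t mask = 0x80000000u;
  uint32_t a, b;
  memcpy (&a, &mag, sizeof a);
  memcpy (&b, &sgn, sizeof b);
  a = (a & ~mask) | (b & mask);		/* op2 = ~mask & op0; op3 = mask & op1 */
  memcpy (&mag, &a, sizeof mag);
  return mag;
}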
2300
2301 /* Expand an xorsign operation. */
2302
2303 void
2304 ix86_expand_xorsign (rtx operands[])
2305 {
2306 machine_mode mode, vmode;
2307 rtx dest, vdest, op0, op1, mask, x, temp;
2308
2309 dest = operands[0];
2310 op0 = operands[1];
2311 op1 = operands[2];
2312
2313 mode = GET_MODE (dest);
2314
2315 if (mode == HFmode)
2316 vmode = V8HFmode;
2317 else if (mode == SFmode)
2318 vmode = V4SFmode;
2319 else if (mode == DFmode)
2320 vmode = V2DFmode;
2321 else
2322 gcc_unreachable ();
2323
2324 temp = gen_reg_rtx (vmode);
2325 mask = ix86_build_signbit_mask (vmode, 0, 0);
2326
2327 op1 = lowpart_subreg (vmode, force_reg (mode, op1), mode);
2328 x = gen_rtx_AND (vmode, op1, mask);
2329 emit_insn (gen_rtx_SET (temp, x));
2330
2331 op0 = lowpart_subreg (vmode, force_reg (mode, op0), mode);
2332 x = gen_rtx_XOR (vmode, temp, op0);
2333
2334 vdest = lowpart_subreg (vmode, dest, mode);
2335 if (vdest == NULL_RTX)
2336 vdest = gen_reg_rtx (vmode);
2337 else
2338 dest = NULL_RTX;
2339 emit_insn (gen_rtx_SET (vdest, x));
2340
2341 if (dest)
2342 emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
2343 }
2344
2345 static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1);
2346
2347 void
2348 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
2349 {
2350 machine_mode mode = GET_MODE (op0);
2351 rtx tmp;
2352
2353   /* Handle the special case of a vector comparison with a boolean result;
2354      transform it using the ptest instruction. */
2355 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
2356 || mode == OImode)
2357 {
2358 rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
2359 machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
2360
2361 gcc_assert (code == EQ || code == NE);
2362
2363 if (mode == OImode)
2364 {
2365 op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
2366 op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
2367 mode = p_mode;
2368 }
2369       /* Generate XOR since we can't check that one operand is a zero vector. */
2370 tmp = gen_reg_rtx (mode);
2371 emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
2372 tmp = gen_lowpart (p_mode, tmp);
2373 emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
2374 gen_rtx_UNSPEC (CCmode,
2375 gen_rtvec (2, tmp, tmp),
2376 UNSPEC_PTEST)));
2377 tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
2378 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2379 gen_rtx_LABEL_REF (VOIDmode, label),
2380 pc_rtx);
2381 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2382 return;
2383 }
2384
2385 switch (mode)
2386 {
2387 case E_HFmode:
2388 case E_SFmode:
2389 case E_DFmode:
2390 case E_XFmode:
2391 case E_QImode:
2392 case E_HImode:
2393 case E_SImode:
2394 simple:
2395 tmp = ix86_expand_compare (code, op0, op1);
2396 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2397 gen_rtx_LABEL_REF (VOIDmode, label),
2398 pc_rtx);
2399 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2400 return;
2401
2402 case E_DImode:
2403 if (TARGET_64BIT)
2404 goto simple;
2405 /* FALLTHRU */
2406 case E_TImode:
2407 /* DI and TI mode equality/inequality comparisons may be performed
2408 on SSE registers. Avoid splitting them, except when optimizing
2409 for size. */
2410 if ((code == EQ || code == NE)
2411 && !optimize_insn_for_size_p ())
2412 goto simple;
2413
2414 /* Expand DImode branch into multiple compare+branch. */
2415 {
2416 rtx lo[2], hi[2];
2417 rtx_code_label *label2;
2418 enum rtx_code code1, code2, code3;
2419 machine_mode submode;
2420
2421 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
2422 {
2423 std::swap (op0, op1);
2424 code = swap_condition (code);
2425 }
2426
2427 split_double_mode (mode, &op0, 1, lo+0, hi+0);
2428 split_double_mode (mode, &op1, 1, lo+1, hi+1);
2429
2430 submode = mode == DImode ? SImode : DImode;
2431
2432 /* If we are doing less-than or greater-or-equal-than,
2433 op1 is a constant and the low word is zero, then we can just
2434 examine the high word. Similarly for low word -1 and
2435 less-or-equal-than or greater-than. */
2436
2437 if (CONST_INT_P (hi[1]))
2438 switch (code)
2439 {
2440 case LT: case LTU: case GE: case GEU:
2441 if (lo[1] == const0_rtx)
2442 {
2443 ix86_expand_branch (code, hi[0], hi[1], label);
2444 return;
2445 }
2446 break;
2447 case LE: case LEU: case GT: case GTU:
2448 if (lo[1] == constm1_rtx)
2449 {
2450 ix86_expand_branch (code, hi[0], hi[1], label);
2451 return;
2452 }
2453 break;
2454 default:
2455 break;
2456 }
2457
2458 /* Emulate comparisons that do not depend on Zero flag with
2459 double-word subtraction. Note that only Overflow, Sign
2460 and Carry flags are valid, so swap arguments and condition
2461 of comparisons that would otherwise test Zero flag. */
2462
2463 switch (code)
2464 {
2465 case LE: case LEU: case GT: case GTU:
2466 std::swap (lo[0], lo[1]);
2467 std::swap (hi[0], hi[1]);
2468 code = swap_condition (code);
2469 /* FALLTHRU */
2470
2471 case LT: case LTU: case GE: case GEU:
2472 {
2473 bool uns = (code == LTU || code == GEU);
2474 rtx (*sbb_insn) (machine_mode, rtx, rtx, rtx)
2475 = uns ? gen_sub3_carry_ccc : gen_sub3_carry_ccgz;
2476
2477 if (!nonimmediate_operand (lo[0], submode))
2478 lo[0] = force_reg (submode, lo[0]);
2479 if (!x86_64_general_operand (lo[1], submode))
2480 lo[1] = force_reg (submode, lo[1]);
2481
2482 if (!register_operand (hi[0], submode))
2483 hi[0] = force_reg (submode, hi[0]);
2484 if ((uns && !nonimmediate_operand (hi[1], submode))
2485 || (!uns && !x86_64_general_operand (hi[1], submode)))
2486 hi[1] = force_reg (submode, hi[1]);
2487
2488 emit_insn (gen_cmp_1 (submode, lo[0], lo[1]));
2489
2490 tmp = gen_rtx_SCRATCH (submode);
2491 emit_insn (sbb_insn (submode, tmp, hi[0], hi[1]));
2492
2493 tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
2494 ix86_expand_branch (code, tmp, const0_rtx, label);
2495 return;
2496 }
2497
2498 default:
2499 break;
2500 }
2501
2502 /* Otherwise, we need two or three jumps. */
2503
2504 label2 = gen_label_rtx ();
2505
2506 code1 = code;
2507 code2 = swap_condition (code);
2508 code3 = unsigned_condition (code);
2509
2510 switch (code)
2511 {
2512 case LT: case GT: case LTU: case GTU:
2513 break;
2514
2515 case LE: code1 = LT; code2 = GT; break;
2516 case GE: code1 = GT; code2 = LT; break;
2517 case LEU: code1 = LTU; code2 = GTU; break;
2518 case GEU: code1 = GTU; code2 = LTU; break;
2519
2520 case EQ: code1 = UNKNOWN; code2 = NE; break;
2521 case NE: code2 = UNKNOWN; break;
2522
2523 default:
2524 gcc_unreachable ();
2525 }
2526
2527 /*
2528 * a < b =>
2529 * if (hi(a) < hi(b)) goto true;
2530 * if (hi(a) > hi(b)) goto false;
2531 * if (lo(a) < lo(b)) goto true;
2532 * false:
2533 */
2534
2535 if (code1 != UNKNOWN)
2536 ix86_expand_branch (code1, hi[0], hi[1], label);
2537 if (code2 != UNKNOWN)
2538 ix86_expand_branch (code2, hi[0], hi[1], label2);
2539
2540 ix86_expand_branch (code3, lo[0], lo[1], label);
2541
2542 if (code2 != UNKNOWN)
2543 emit_label (label2);
2544 return;
2545 }
2546
2547 default:
2548 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
2549 goto simple;
2550 }
2551 }
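
/* Illustrative sketch, not part of the original file: the carry-chain form
   of the double-word LTU comparison expanded above, using 32-bit halves in
   plain C with a hypothetical helper name.  The generated code compares the
   low halves, then uses sbb on the high halves and branches on the resulting
   carry (or on the CCGZ flags for signed comparisons).  */

static bool
ltu_doubleword_sketch (uint32_t lo0, uint32_t hi0, uint32_t lo1, uint32_t hi1)
{
  bool borrow = lo0 < lo1;	/* cmp lo0, lo1 leaves this in CF */
  /* sbb computes hi0 - hi1 - CF; the whole value compares below iff that
     subtraction borrows again.  */
  return hi0 < hi1 || (hi0 == hi1 && borrow);
}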
2552
2553 /* Figure out whether to use unordered fp comparisons. */
2554
2555 static bool
2556 ix86_unordered_fp_compare (enum rtx_code code)
2557 {
2558 if (!TARGET_IEEE_FP)
2559 return false;
2560
2561 switch (code)
2562 {
2563 case LT:
2564 case LE:
2565 case GT:
2566 case GE:
2567 case LTGT:
2568 return false;
2569
2570 case EQ:
2571 case NE:
2572
2573 case UNORDERED:
2574 case ORDERED:
2575 case UNLT:
2576 case UNLE:
2577 case UNGT:
2578 case UNGE:
2579 case UNEQ:
2580 return true;
2581
2582 default:
2583 gcc_unreachable ();
2584 }
2585 }
2586
2587 /* Return a comparison we can do that is equivalent to
2588    swap_condition (code), except possibly for orderedness.
2589    But never change orderedness if TARGET_IEEE_FP, returning
2590    UNKNOWN in that case if necessary. */
2591
2592 static enum rtx_code
2593 ix86_fp_swap_condition (enum rtx_code code)
2594 {
2595 switch (code)
2596 {
2597 case GT: /* GTU - CF=0 & ZF=0 */
2598 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
2599 case GE: /* GEU - CF=0 */
2600 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
2601 case UNLT: /* LTU - CF=1 */
2602 return TARGET_IEEE_FP ? UNKNOWN : GT;
2603 case UNLE: /* LEU - CF=1 | ZF=1 */
2604 return TARGET_IEEE_FP ? UNKNOWN : GE;
2605 default:
2606 return swap_condition (code);
2607 }
2608 }
2609
2610 /* Return the cost of comparison CODE using the best strategy for performance.
2611    All of the following functions use the number of instructions as a cost metric.
2612    In the future this should be tweaked to compute bytes for optimize_size and
2613    to take into account the performance of various instructions on various CPUs. */
2614
2615 static int
2616 ix86_fp_comparison_cost (enum rtx_code code)
2617 {
2618 int arith_cost;
2619
2620 /* The cost of code using bit-twiddling on %ah. */
2621 switch (code)
2622 {
2623 case UNLE:
2624 case UNLT:
2625 case LTGT:
2626 case GT:
2627 case GE:
2628 case UNORDERED:
2629 case ORDERED:
2630 case UNEQ:
2631 arith_cost = 4;
2632 break;
2633 case LT:
2634 case NE:
2635 case EQ:
2636 case UNGE:
2637 arith_cost = TARGET_IEEE_FP ? 5 : 4;
2638 break;
2639 case LE:
2640 case UNGT:
2641 arith_cost = TARGET_IEEE_FP ? 6 : 4;
2642 break;
2643 default:
2644 gcc_unreachable ();
2645 }
2646
2647 switch (ix86_fp_comparison_strategy (code))
2648 {
2649 case IX86_FPCMP_COMI:
2650 return arith_cost > 4 ? 3 : 2;
2651 case IX86_FPCMP_SAHF:
2652 return arith_cost > 4 ? 4 : 3;
2653 default:
2654 return arith_cost;
2655 }
2656 }
2657
2658 /* Swap, force into registers, or otherwise massage the two operands
2659 to a fp comparison. The operands are updated in place; the new
2660 comparison code is returned. */
2661
2662 static enum rtx_code
2663 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
2664 {
2665 bool unordered_compare = ix86_unordered_fp_compare (code);
2666 rtx op0 = *pop0, op1 = *pop1;
2667 machine_mode op_mode = GET_MODE (op0);
2668 bool is_sse = SSE_FLOAT_MODE_SSEMATH_OR_HF_P (op_mode);
2669
2670 if (op_mode == BFmode)
2671 {
2672 rtx op = gen_lowpart (HImode, op0);
2673 if (CONST_INT_P (op))
2674 op = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
2675 op0, BFmode);
2676 else
2677 {
2678 rtx t1 = gen_reg_rtx (SImode);
2679 emit_insn (gen_zero_extendhisi2 (t1, op));
2680 emit_insn (gen_ashlsi3 (t1, t1, GEN_INT (16)));
2681 op = gen_lowpart (SFmode, t1);
2682 }
2683 *pop0 = op;
2684 op = gen_lowpart (HImode, op1);
2685 if (CONST_INT_P (op))
2686 op = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
2687 op1, BFmode);
2688 else
2689 {
2690 rtx t1 = gen_reg_rtx (SImode);
2691 emit_insn (gen_zero_extendhisi2 (t1, op));
2692 emit_insn (gen_ashlsi3 (t1, t1, GEN_INT (16)));
2693 op = gen_lowpart (SFmode, t1);
2694 }
2695 *pop1 = op;
2696 return ix86_prepare_fp_compare_args (code, pop0, pop1);
2697 }
2698
2699 /* All of the unordered compare instructions only work on registers.
2700 The same is true of the fcomi compare instructions. The XFmode
2701 compare instructions require registers except when comparing
2702 against zero or when converting operand 1 from fixed point to
2703 floating point. */
2704
2705 if (!is_sse
2706 && (unordered_compare
2707 || (op_mode == XFmode
2708 && ! (standard_80387_constant_p (op0) == 1
2709 || standard_80387_constant_p (op1) == 1)
2710 && GET_CODE (op1) != FLOAT)
2711 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
2712 {
2713 op0 = force_reg (op_mode, op0);
2714 op1 = force_reg (op_mode, op1);
2715 }
2716 else
2717 {
2718 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
2719 things around if they appear profitable, otherwise force op0
2720 into a register. */
2721
2722 if (standard_80387_constant_p (op0) == 0
2723 || (MEM_P (op0)
2724 && ! (standard_80387_constant_p (op1) == 0
2725 || MEM_P (op1))))
2726 {
2727 enum rtx_code new_code = ix86_fp_swap_condition (code);
2728 if (new_code != UNKNOWN)
2729 {
2730 std::swap (op0, op1);
2731 code = new_code;
2732 }
2733 }
2734
2735 if (!REG_P (op0))
2736 op0 = force_reg (op_mode, op0);
2737
2738 if (CONSTANT_P (op1))
2739 {
2740 int tmp = standard_80387_constant_p (op1);
2741 if (tmp == 0)
2742 op1 = validize_mem (force_const_mem (op_mode, op1));
2743 else if (tmp == 1)
2744 {
2745 if (TARGET_CMOVE)
2746 op1 = force_reg (op_mode, op1);
2747 }
2748 else
2749 op1 = force_reg (op_mode, op1);
2750 }
2751 }
2752
2753 /* Try to rearrange the comparison to make it cheaper. */
2754 if (ix86_fp_comparison_cost (code)
2755 > ix86_fp_comparison_cost (swap_condition (code))
2756 && (REG_P (op1) || can_create_pseudo_p ()))
2757 {
2758 std::swap (op0, op1);
2759 code = swap_condition (code);
2760 if (!REG_P (op0))
2761 op0 = force_reg (op_mode, op0);
2762 }
2763
2764 *pop0 = op0;
2765 *pop1 = op1;
2766 return code;
2767 }
2768
2769 /* Generate insn patterns to do a floating point compare of OPERANDS. */
2770
2771 static rtx
2772 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)
2773 {
2774 bool unordered_compare = ix86_unordered_fp_compare (code);
2775 machine_mode cmp_mode;
2776 rtx tmp, scratch;
2777
2778 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
2779
2780 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
2781 if (unordered_compare)
2782 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
2783
2784 /* Do fcomi/sahf based test when profitable. */
2785 switch (ix86_fp_comparison_strategy (code))
2786 {
2787 case IX86_FPCMP_COMI:
2788 cmp_mode = CCFPmode;
2789 emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
2790 break;
2791
2792 case IX86_FPCMP_SAHF:
2793 cmp_mode = CCFPmode;
2794 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2795 scratch = gen_reg_rtx (HImode);
2796 emit_insn (gen_rtx_SET (scratch, tmp));
2797 emit_insn (gen_x86_sahf_1 (scratch));
2798 break;
2799
2800 case IX86_FPCMP_ARITH:
2801 cmp_mode = CCNOmode;
2802 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2803 scratch = gen_reg_rtx (HImode);
2804 emit_insn (gen_rtx_SET (scratch, tmp));
2805
2806       /* In the unordered case, we have to check C2 for NaNs, which
2807	  doesn't work out to any nice flag combination.  So do some bit
2808	  twiddling on the value we've got in AH to come up with an
2809	  appropriate set of condition codes. */
2810
2811 switch (code)
2812 {
2813 case GT:
2814 case UNGT:
2815 if (code == GT || !TARGET_IEEE_FP)
2816 {
2817 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2818 code = EQ;
2819 }
2820 else
2821 {
2822 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2823 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2824 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
2825 cmp_mode = CCmode;
2826 code = GEU;
2827 }
2828 break;
2829 case LT:
2830 case UNLT:
2831 if (code == LT && TARGET_IEEE_FP)
2832 {
2833 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2834 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
2835 cmp_mode = CCmode;
2836 code = EQ;
2837 }
2838 else
2839 {
2840 emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
2841 code = NE;
2842 }
2843 break;
2844 case GE:
2845 case UNGE:
2846 if (code == GE || !TARGET_IEEE_FP)
2847 {
2848 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
2849 code = EQ;
2850 }
2851 else
2852 {
2853 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2854 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
2855 code = NE;
2856 }
2857 break;
2858 case LE:
2859 case UNLE:
2860 if (code == LE && TARGET_IEEE_FP)
2861 {
2862 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2863 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2864 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2865 cmp_mode = CCmode;
2866 code = LTU;
2867 }
2868 else
2869 {
2870 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2871 code = NE;
2872 }
2873 break;
2874 case EQ:
2875 case UNEQ:
2876 if (code == EQ && TARGET_IEEE_FP)
2877 {
2878 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2879 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2880 cmp_mode = CCmode;
2881 code = EQ;
2882 }
2883 else
2884 {
2885 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2886 code = NE;
2887 }
2888 break;
2889 case NE:
2890 case LTGT:
2891 if (code == NE && TARGET_IEEE_FP)
2892 {
2893 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2894 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
2895 GEN_INT (0x40)));
2896 code = NE;
2897 }
2898 else
2899 {
2900 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2901 code = EQ;
2902 }
2903 break;
2904
2905 case UNORDERED:
2906 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2907 code = NE;
2908 break;
2909 case ORDERED:
2910 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2911 code = EQ;
2912 break;
2913
2914 default:
2915 gcc_unreachable ();
2916 }
2917 break;
2918
2919 default:
2920       gcc_unreachable ();
2921 }
2922
2923 /* Return the test that should be put into the flags user, i.e.
2924 the bcc, scc, or cmov instruction. */
2925 return gen_rtx_fmt_ee (code, VOIDmode,
2926 gen_rtx_REG (cmp_mode, FLAGS_REG),
2927 const0_rtx);
2928 }
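
/* Illustrative sketch, not part of the original file: how the masks used in
   the IX86_FPCMP_ARITH case map onto the x87 condition bits once FNSTSW has
   placed the high byte of the status word in AH: C0 = 0x01, C2 = 0x04,
   C3 = 0x40.  After fcom, "greater" leaves all three clear, "less" sets C0,
   "equal" sets C3, and an unordered result sets C0, C2 and C3, which is why
   GT tests (ah & 0x45) == 0 and UNORDERED tests (ah & 0x04) != 0.  The
   helper name is hypothetical.  */

static bool
x87_gt_from_ah_sketch (unsigned char ah)
{
  return (ah & 0x45) == 0;	/* C3 == C2 == C0 == 0: strictly greater */
}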
2929
2930 /* Generate insn patterns to do an integer compare of OPERANDS. */
2931
2932 static rtx
2933 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
2934 {
2935 machine_mode cmpmode;
2936 rtx tmp, flags;
2937
2938 /* Swap operands to emit carry flag comparison. */
2939 if ((code == GTU || code == LEU)
2940 && nonimmediate_operand (op1, VOIDmode))
2941 {
2942 std::swap (op0, op1);
2943 code = swap_condition (code);
2944 }
2945
2946 cmpmode = SELECT_CC_MODE (code, op0, op1);
2947 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
2948
2949 /* This is very simple, but making the interface the same as in the
2950 FP case makes the rest of the code easier. */
2951 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
2952 emit_insn (gen_rtx_SET (flags, tmp));
2953
2954 /* Return the test that should be put into the flags user, i.e.
2955 the bcc, scc, or cmov instruction. */
2956 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
2957 }
2958
2959 static rtx
2960 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
2961 {
2962 rtx ret;
2963
2964 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
2965 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
2966
2967 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
2968 {
2969 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
2970 ret = ix86_expand_fp_compare (code, op0, op1);
2971 }
2972 else
2973 ret = ix86_expand_int_compare (code, op0, op1);
2974
2975 return ret;
2976 }
2977
2978 void
2979 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
2980 {
2981 rtx ret;
2982
2983 gcc_assert (GET_MODE (dest) == QImode);
2984
2985 ret = ix86_expand_compare (code, op0, op1);
2986 PUT_MODE (ret, QImode);
2987 emit_insn (gen_rtx_SET (dest, ret));
2988 }
2989
2990 /* Expand floating point op0 <=> op1, i.e.
2991 dest = op0 == op1 ? 0 : op0 < op1 ? -1 : op0 > op1 ? 1 : 2. */
2992
2993 void
2994 ix86_expand_fp_spaceship (rtx dest, rtx op0, rtx op1)
2995 {
2996 gcc_checking_assert (ix86_fp_comparison_strategy (GT) != IX86_FPCMP_ARITH);
2997 rtx gt = ix86_expand_fp_compare (GT, op0, op1);
2998 rtx l0 = gen_label_rtx ();
2999 rtx l1 = gen_label_rtx ();
3000 rtx l2 = TARGET_IEEE_FP ? gen_label_rtx () : NULL_RTX;
3001 rtx lend = gen_label_rtx ();
3002 rtx tmp;
3003 rtx_insn *jmp;
3004 if (l2)
3005 {
3006 rtx un = gen_rtx_fmt_ee (UNORDERED, VOIDmode,
3007 gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx);
3008 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, un,
3009 gen_rtx_LABEL_REF (VOIDmode, l2), pc_rtx);
3010 jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
3011       add_reg_br_prob_note (jmp, profile_probability::very_unlikely ());
3012 }
3013 rtx eq = gen_rtx_fmt_ee (UNEQ, VOIDmode,
3014 gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx);
3015 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, eq,
3016 gen_rtx_LABEL_REF (VOIDmode, l0), pc_rtx);
3017 jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
3018 add_reg_br_prob_note (jmp, profile_probability::unlikely ());
3019 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, gt,
3020 gen_rtx_LABEL_REF (VOIDmode, l1), pc_rtx);
3021 jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
3022 add_reg_br_prob_note (jmp, profile_probability::even ());
3023 emit_move_insn (dest, constm1_rtx);
3024 emit_jump (lend);
3025 emit_label (l0);
3026 emit_move_insn (dest, const0_rtx);
3027 emit_jump (lend);
3028 emit_label (l1);
3029 emit_move_insn (dest, const1_rtx);
3030 emit_jump (lend);
3031 if (l2)
3032 {
3033 emit_label (l2);
3034 emit_move_insn (dest, const2_rtx);
3035 }
3036 emit_label (lend);
3037 }
3038
3039 /* Expand a comparison setting or clearing the carry flag.  Return true when
3040    successful and set *POP for the operation. */
3041 static bool
3042 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
3043 {
3044 machine_mode mode
3045 = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
3046
3047   /* Do not handle double-mode compares that go through the special path. */
3048 if (mode == (TARGET_64BIT ? TImode : DImode))
3049 return false;
3050
3051 if (SCALAR_FLOAT_MODE_P (mode))
3052 {
3053 rtx compare_op;
3054 rtx_insn *compare_seq;
3055
3056 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
3057
3058       /* Shortcut: the following common codes never translate
3059	 into carry flag compares. */
3060 if (code == EQ || code == NE || code == UNEQ || code == LTGT
3061 || code == ORDERED || code == UNORDERED)
3062 return false;
3063
3064       /* These comparisons require the zero flag; swap operands so they don't. */
3065 if ((code == GT || code == UNLE || code == LE || code == UNGT)
3066 && !TARGET_IEEE_FP)
3067 {
3068 std::swap (op0, op1);
3069 code = swap_condition (code);
3070 }
3071
3072       /* Try to expand the comparison and verify that we end up with
3073	 a carry-flag-based comparison.  This fails only when we decide
3074	 to expand the comparison using arithmetic, which is not a
3075	 common scenario. */
3076 start_sequence ();
3077 compare_op = ix86_expand_fp_compare (code, op0, op1);
3078 compare_seq = get_insns ();
3079 end_sequence ();
3080
3081 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
3082 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
3083 else
3084 code = GET_CODE (compare_op);
3085
3086 if (code != LTU && code != GEU)
3087 return false;
3088
3089 emit_insn (compare_seq);
3090 *pop = compare_op;
3091 return true;
3092 }
3093
3094 if (!INTEGRAL_MODE_P (mode))
3095 return false;
3096
3097 switch (code)
3098 {
3099 case LTU:
3100 case GEU:
3101 break;
3102
3103 /* Convert a==0 into (unsigned)a<1. */
3104 case EQ:
3105 case NE:
3106 if (op1 != const0_rtx)
3107 return false;
3108 op1 = const1_rtx;
3109 code = (code == EQ ? LTU : GEU);
3110 break;
3111
3112 /* Convert a>b into b<a or a>=b-1. */
3113 case GTU:
3114 case LEU:
3115 if (CONST_INT_P (op1))
3116 {
3117 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
3118	  /* Bail out on overflow.  We can still swap operands, but that
3119	     would force loading the constant into a register. */
3120 if (op1 == const0_rtx
3121 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
3122 return false;
3123 code = (code == GTU ? GEU : LTU);
3124 }
3125 else
3126 {
3127 std::swap (op0, op1);
3128 code = (code == GTU ? LTU : GEU);
3129 }
3130 break;
3131
3132 /* Convert a>=0 into (unsigned)a<0x80000000. */
3133 case LT:
3134 case GE:
3135 if (mode == DImode || op1 != const0_rtx)
3136 return false;
3137 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
3138 code = (code == LT ? GEU : LTU);
3139 break;
3140 case LE:
3141 case GT:
3142 if (mode == DImode || op1 != constm1_rtx)
3143 return false;
3144 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
3145 code = (code == LE ? GEU : LTU);
3146 break;
3147
3148 default:
3149 return false;
3150 }
3151   /* Swapping operands may cause a constant to appear as the first operand. */
3152 if (!nonimmediate_operand (op0, VOIDmode))
3153 {
3154 if (!can_create_pseudo_p ())
3155 return false;
3156 op0 = force_reg (mode, op0);
3157 }
3158 *pop = ix86_expand_compare (code, op0, op1);
3159 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
3160 return true;
3161 }
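
/* Illustrative sketch, not part of the original file: the integer rewrites
   performed above, written as plain C identities on 32-bit operands with a
   hypothetical helper name.  K must not be the maximum value, matching the
   overflow bail-out above.  */

static bool
carry_flag_rewrites_hold_sketch (uint32_t a, uint32_t k)
{
  bool ok = true;
  ok &= (a == 0) == (a < 1u);				/* EQ  -> LTU */
  if (k != 0xffffffffu)
    ok &= (a > k) == (a >= k + 1u);			/* GTU -> GEU */
  ok &= ((int32_t) a >= 0) == (a < 0x80000000u);	/* GE  -> LTU */
  ok &= ((int32_t) a <= -1) == (a >= 0x80000000u);	/* LE  -> GEU */
  return ok;
}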
3162
3163 /* Expand a conditional increment or decrement using adc/sbb instructions.
3164    The default case using setcc followed by a conditional move can be
3165    done by generic code. */
3166 bool
3167 ix86_expand_int_addcc (rtx operands[])
3168 {
3169 enum rtx_code code = GET_CODE (operands[1]);
3170 rtx flags;
3171 rtx (*insn) (machine_mode, rtx, rtx, rtx, rtx, rtx);
3172 rtx compare_op;
3173 rtx val = const0_rtx;
3174 bool fpcmp = false;
3175 machine_mode mode;
3176 rtx op0 = XEXP (operands[1], 0);
3177 rtx op1 = XEXP (operands[1], 1);
3178
3179 if (operands[3] != const1_rtx
3180 && operands[3] != constm1_rtx)
3181 return false;
3182 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
3183 return false;
3184 code = GET_CODE (compare_op);
3185
3186 flags = XEXP (compare_op, 0);
3187
3188 if (GET_MODE (flags) == CCFPmode)
3189 {
3190 fpcmp = true;
3191 code = ix86_fp_compare_code_to_integer (code);
3192 }
3193
3194 if (code != LTU)
3195 {
3196 val = constm1_rtx;
3197 if (fpcmp)
3198 PUT_CODE (compare_op,
3199 reverse_condition_maybe_unordered
3200 (GET_CODE (compare_op)));
3201 else
3202 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
3203 }
3204
3205 mode = GET_MODE (operands[0]);
3206
3207 /* Construct either adc or sbb insn. */
3208 if ((code == LTU) == (operands[3] == constm1_rtx))
3209 insn = gen_sub3_carry;
3210 else
3211 insn = gen_add3_carry;
3212
3213 emit_insn (insn (mode, operands[0], operands[2], val, flags, compare_op));
3214
3215 return true;
3216 }
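
/* Illustrative sketch, not part of the original file: the adc form of a
   conditional increment that the expansion above targets, for unsigned
   32-bit operands with a hypothetical helper name.  The compare sets the
   carry flag and the adc folds it into the addition, so no setcc or cmov
   is needed.  */

static uint32_t
cond_inc_sketch (uint32_t x, uint32_t a, uint32_t b)
{
  return x + (a < b);		/* cmp a, b ; adc $0, x */
}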
3217
3218 bool
3219 ix86_expand_int_movcc (rtx operands[])
3220 {
3221 enum rtx_code code = GET_CODE (operands[1]), compare_code;
3222 rtx_insn *compare_seq;
3223 rtx compare_op;
3224 machine_mode mode = GET_MODE (operands[0]);
3225 bool sign_bit_compare_p = false;
3226 bool negate_cc_compare_p = false;
3227 rtx op0 = XEXP (operands[1], 0);
3228 rtx op1 = XEXP (operands[1], 1);
3229 rtx op2 = operands[2];
3230 rtx op3 = operands[3];
3231
3232 if (GET_MODE (op0) == TImode
3233 || (GET_MODE (op0) == DImode
3234 && !TARGET_64BIT))
3235 return false;
3236
3237 if (GET_MODE (op0) == BFmode
3238 && !ix86_fp_comparison_operator (operands[1], VOIDmode))
3239 return false;
3240
3241 start_sequence ();
3242 compare_op = ix86_expand_compare (code, op0, op1);
3243 compare_seq = get_insns ();
3244 end_sequence ();
3245
3246 compare_code = GET_CODE (compare_op);
3247
3248 if ((op1 == const0_rtx && (code == GE || code == LT))
3249 || (op1 == constm1_rtx && (code == GT || code == LE)))
3250 sign_bit_compare_p = true;
3251
3252 /* op0 == op1 ? op0 : op3 is equivalent to op0 == op1 ? op1 : op3,
3253 but if op1 is a constant, the latter form allows more optimizations,
3254 either through the last 2 ops being constant handling, or the one
3255      constant and one variable cases.  On the other hand, for cmov the
3256 former might be better as we don't need to load the constant into
3257 another register. */
3258 if (code == EQ && CONST_INT_P (op1) && rtx_equal_p (op0, op2))
3259 op2 = op1;
3260 /* Similarly for op0 != op1 ? op2 : op0 and op0 != op1 ? op2 : op1. */
3261 else if (code == NE && CONST_INT_P (op1) && rtx_equal_p (op0, op3))
3262 op3 = op1;
3263
3264 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
3265 HImode insns, we'd be swallowed in word prefix ops. */
3266
3267 if ((mode != HImode || TARGET_FAST_PREFIX)
3268 && (mode != (TARGET_64BIT ? TImode : DImode))
3269 && CONST_INT_P (op2)
3270 && CONST_INT_P (op3))
3271 {
3272 rtx out = operands[0];
3273 HOST_WIDE_INT ct = INTVAL (op2);
3274 HOST_WIDE_INT cf = INTVAL (op3);
3275 HOST_WIDE_INT diff;
3276
3277 if ((mode == SImode
3278 || (TARGET_64BIT && mode == DImode))
3279 && (GET_MODE (op0) == SImode
3280 || (TARGET_64BIT && GET_MODE (op0) == DImode)))
3281 {
3282 /* Special case x != 0 ? -1 : y. */
3283 if (code == NE && op1 == const0_rtx && ct == -1)
3284 {
3285 negate_cc_compare_p = true;
3286 std::swap (ct, cf);
3287 code = EQ;
3288 }
3289 else if (code == EQ && op1 == const0_rtx && cf == -1)
3290 negate_cc_compare_p = true;
3291 }
3292
3293 diff = ct - cf;
3294	  /* Sign bit compares are better done using shifts than by using
3295	     sbb. */
3296 if (sign_bit_compare_p
3297 || negate_cc_compare_p
3298 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
3299 {
3300 /* Detect overlap between destination and compare sources. */
3301 rtx tmp = out;
3302
3303 if (negate_cc_compare_p)
3304 {
3305 if (GET_MODE (op0) == DImode)
3306 emit_insn (gen_x86_negdi_ccc (gen_reg_rtx (DImode), op0));
3307 else
3308 emit_insn (gen_x86_negsi_ccc (gen_reg_rtx (SImode),
3309 gen_lowpart (SImode, op0)));
3310
3311 tmp = gen_reg_rtx (mode);
3312 if (mode == DImode)
3313 emit_insn (gen_x86_movdicc_0_m1_neg (tmp));
3314 else
3315 emit_insn (gen_x86_movsicc_0_m1_neg (gen_lowpart (SImode,
3316 tmp)));
3317 }
3318 else if (!sign_bit_compare_p)
3319 {
3320 rtx flags;
3321 bool fpcmp = false;
3322
3323 compare_code = GET_CODE (compare_op);
3324
3325 flags = XEXP (compare_op, 0);
3326
3327 if (GET_MODE (flags) == CCFPmode)
3328 {
3329 fpcmp = true;
3330 compare_code
3331 = ix86_fp_compare_code_to_integer (compare_code);
3332 }
3333
3334	      /* To simplify the rest of the code, restrict to the GEU case. */
3335 if (compare_code == LTU)
3336 {
3337 std::swap (ct, cf);
3338 compare_code = reverse_condition (compare_code);
3339 code = reverse_condition (code);
3340 }
3341 else
3342 {
3343 if (fpcmp)
3344 PUT_CODE (compare_op,
3345 reverse_condition_maybe_unordered
3346 (GET_CODE (compare_op)));
3347 else
3348 PUT_CODE (compare_op,
3349 reverse_condition (GET_CODE (compare_op)));
3350 }
3351 diff = ct - cf;
3352
3353 if (reg_overlap_mentioned_p (out, compare_op))
3354 tmp = gen_reg_rtx (mode);
3355
3356 if (mode == DImode)
3357 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
3358 else
3359 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
3360 flags, compare_op));
3361 }
3362 else
3363 {
3364 if (code == GT || code == GE)
3365 code = reverse_condition (code);
3366 else
3367 {
3368 std::swap (ct, cf);
3369 diff = ct - cf;
3370 }
3371 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
3372 }
3373
3374 if (diff == 1)
3375 {
3376 /*
3377 * cmpl op0,op1
3378 * sbbl dest,dest
3379 * [addl dest, ct]
3380 *
3381 * Size 5 - 8.
3382 */
3383 if (ct)
3384 tmp = expand_simple_binop (mode, PLUS,
3385 tmp, GEN_INT (ct),
3386 copy_rtx (tmp), 1, OPTAB_DIRECT);
3387 }
3388 else if (cf == -1)
3389 {
3390 /*
3391 * cmpl op0,op1
3392 * sbbl dest,dest
3393 * orl $ct, dest
3394 *
3395 * Size 8.
3396 */
3397 tmp = expand_simple_binop (mode, IOR,
3398 tmp, GEN_INT (ct),
3399 copy_rtx (tmp), 1, OPTAB_DIRECT);
3400 }
3401 else if (diff == -1 && ct)
3402 {
3403 /*
3404 * cmpl op0,op1
3405 * sbbl dest,dest
3406 * notl dest
3407 * [addl dest, cf]
3408 *
3409 * Size 8 - 11.
3410 */
3411 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3412 if (cf)
3413 tmp = expand_simple_binop (mode, PLUS,
3414 copy_rtx (tmp), GEN_INT (cf),
3415 copy_rtx (tmp), 1, OPTAB_DIRECT);
3416 }
3417 else
3418 {
3419 /*
3420 * cmpl op0,op1
3421 * sbbl dest,dest
3422 * [notl dest]
3423 * andl cf - ct, dest
3424 * [addl dest, ct]
3425 *
3426 * Size 8 - 11.
3427 */
3428
3429 if (cf == 0)
3430 {
3431 cf = ct;
3432 ct = 0;
3433 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3434 }
3435
3436 tmp = expand_simple_binop (mode, AND,
3437 copy_rtx (tmp),
3438 gen_int_mode (cf - ct, mode),
3439 copy_rtx (tmp), 1, OPTAB_DIRECT);
3440 if (ct)
3441 tmp = expand_simple_binop (mode, PLUS,
3442 copy_rtx (tmp), GEN_INT (ct),
3443 copy_rtx (tmp), 1, OPTAB_DIRECT);
3444 }
3445
3446 if (!rtx_equal_p (tmp, out))
3447 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
3448
3449 return true;
3450 }
3451
3452 if (diff < 0)
3453 {
3454 machine_mode cmp_mode = GET_MODE (op0);
3455 enum rtx_code new_code;
3456
3457 if (SCALAR_FLOAT_MODE_P (cmp_mode))
3458 {
3459 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3460
3461 /* We may be reversing a non-trapping
3462 comparison to a trapping comparison. */
3463 if (HONOR_NANS (cmp_mode) && flag_trapping_math
3464 && code != EQ && code != NE
3465 && code != ORDERED && code != UNORDERED)
3466 new_code = UNKNOWN;
3467 else
3468 new_code = reverse_condition_maybe_unordered (code);
3469 }
3470 else
3471 new_code = ix86_reverse_condition (code, cmp_mode);
3472 if (new_code != UNKNOWN)
3473 {
3474 std::swap (ct, cf);
3475 diff = -diff;
3476 code = new_code;
3477 }
3478 }
3479
3480 compare_code = UNKNOWN;
3481 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
3482 && CONST_INT_P (op1))
3483 {
3484 if (op1 == const0_rtx
3485 && (code == LT || code == GE))
3486 compare_code = code;
3487 else if (op1 == constm1_rtx)
3488 {
3489 if (code == LE)
3490 compare_code = LT;
3491 else if (code == GT)
3492 compare_code = GE;
3493 }
3494 }
3495
3496 /* Optimize dest = (op0 < 0) ? -1 : cf. */
3497 if (compare_code != UNKNOWN
3498 && GET_MODE (op0) == GET_MODE (out)
3499 && (cf == -1 || ct == -1))
3500 {
3501	  /* If the lea code below could be used, only optimize
3502	     if it results in a 2-insn sequence. */
3503
3504 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
3505 || diff == 3 || diff == 5 || diff == 9)
3506 || (compare_code == LT && ct == -1)
3507 || (compare_code == GE && cf == -1))
3508 {
3509 /*
3510 * notl op1 (if necessary)
3511 * sarl $31, op1
3512 * orl cf, op1
3513 */
3514 if (ct != -1)
3515 {
3516 cf = ct;
3517 ct = -1;
3518 code = reverse_condition (code);
3519 }
3520
3521 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3522
3523 out = expand_simple_binop (mode, IOR,
3524 out, GEN_INT (cf),
3525 out, 1, OPTAB_DIRECT);
3526 if (out != operands[0])
3527 emit_move_insn (operands[0], out);
3528
3529 return true;
3530 }
3531 }
3532
3533
3534 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
3535 || diff == 3 || diff == 5 || diff == 9)
3536 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
3537 && (mode != DImode
3538 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
3539 {
3540 /*
3541 * xorl dest,dest
3542 * cmpl op1,op2
3543 * setcc dest
3544 * lea cf(dest*(ct-cf)),dest
3545 *
3546 * Size 14.
3547 *
3548 * This also catches the degenerate setcc-only case.
3549 */
3550
3551 rtx tmp;
3552 int nops;
3553
3554 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3555
3556 nops = 0;
3557	  /* On x86_64 the lea instruction operates on Pmode, so we need
3558	     to get the arithmetic done in the proper mode to match. */
3559 if (diff == 1)
3560 tmp = copy_rtx (out);
3561 else
3562 {
3563 rtx out1;
3564 out1 = copy_rtx (out);
3565 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
3566 nops++;
3567 if (diff & 1)
3568 {
3569 tmp = gen_rtx_PLUS (mode, tmp, out1);
3570 nops++;
3571 }
3572 }
3573 if (cf != 0)
3574 {
3575 tmp = plus_constant (mode, tmp, cf);
3576 nops++;
3577 }
3578 if (!rtx_equal_p (tmp, out))
3579 {
3580 if (nops == 1)
3581 out = force_operand (tmp, copy_rtx (out));
3582 else
3583 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
3584 }
3585 if (!rtx_equal_p (out, operands[0]))
3586 emit_move_insn (operands[0], copy_rtx (out));
3587
3588 return true;
3589 }
3590
3591 /*
3592 * General case: Jumpful:
3593 * xorl dest,dest cmpl op1, op2
3594 * cmpl op1, op2 movl ct, dest
3595 * setcc dest jcc 1f
3596 * decl dest movl cf, dest
3597 * andl (cf-ct),dest 1:
3598 * addl ct,dest
3599 *
3600 * Size 20. Size 14.
3601 *
3602 * This is reasonably steep, but branch mispredict costs are
3603	 * high on modern CPUs, so consider failing only if optimizing
3604 * for space.
3605 */
3606
3607 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3608 && BRANCH_COST (optimize_insn_for_speed_p (),
3609 false) >= 2)
3610 {
3611 if (cf == 0)
3612 {
3613 machine_mode cmp_mode = GET_MODE (op0);
3614 enum rtx_code new_code;
3615
3616 if (SCALAR_FLOAT_MODE_P (cmp_mode))
3617 {
3618 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3619
3620 /* We may be reversing a non-trapping
3621 comparison to a trapping comparison. */
3622 if (HONOR_NANS (cmp_mode) && flag_trapping_math
3623 && code != EQ && code != NE
3624 && code != ORDERED && code != UNORDERED)
3625 new_code = UNKNOWN;
3626 else
3627 new_code = reverse_condition_maybe_unordered (code);
3628
3629 }
3630 else
3631 {
3632 new_code = ix86_reverse_condition (code, cmp_mode);
3633 if (compare_code != UNKNOWN && new_code != UNKNOWN)
3634 compare_code = reverse_condition (compare_code);
3635 }
3636
3637 if (new_code != UNKNOWN)
3638 {
3639 cf = ct;
3640 ct = 0;
3641 code = new_code;
3642 }
3643 }
3644
3645 if (compare_code != UNKNOWN)
3646 {
3647 /* notl op1 (if needed)
3648 sarl $31, op1
3649 andl (cf-ct), op1
3650 addl ct, op1
3651
3652 For x < 0 (resp. x <= -1) there will be no notl,
3653 so if possible swap the constants to get rid of the
3654 complement.
3655 True/false will be -1/0 while code below (store flag
3656 followed by decrement) is 0/-1, so the constants need
3657 to be exchanged once more. */
3658
3659 if (compare_code == GE || !cf)
3660 {
3661 code = reverse_condition (code);
3662 compare_code = LT;
3663 }
3664 else
3665 std::swap (ct, cf);
3666
3667 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3668 }
3669 else
3670 {
3671 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3672
3673 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
3674 constm1_rtx,
3675 copy_rtx (out), 1, OPTAB_DIRECT);
3676 }
3677
3678 out = expand_simple_binop (mode, AND, copy_rtx (out),
3679 gen_int_mode (cf - ct, mode),
3680 copy_rtx (out), 1, OPTAB_DIRECT);
3681 if (ct)
3682 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
3683 copy_rtx (out), 1, OPTAB_DIRECT);
3684 if (!rtx_equal_p (out, operands[0]))
3685 emit_move_insn (operands[0], copy_rtx (out));
3686
3687 return true;
3688 }
3689 }
3690
3691 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3692 {
3693 /* Try a few things more with specific constants and a variable. */
3694
3695 optab op;
3696 rtx var, orig_out, out, tmp;
3697
3698 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
3699 return false;
3700
3701 operands[2] = op2;
3702 operands[3] = op3;
3703
3704 /* If one of the two operands is an interesting constant, load a
3705 constant with the above and mask it in with a logical operation. */
3706
3707 if (CONST_INT_P (operands[2]))
3708 {
3709 var = operands[3];
3710 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
3711 operands[3] = constm1_rtx, op = and_optab;
3712 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
3713 operands[3] = const0_rtx, op = ior_optab;
3714 else
3715 return false;
3716 }
3717 else if (CONST_INT_P (operands[3]))
3718 {
3719 var = operands[2];
3720 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
3721 {
3722 /* For smin (x, 0), expand as "x < 0 ? x : 0" instead of
3723 "x <= 0 ? x : 0" to enable sign_bit_compare_p. */
3724 if (code == LE && op1 == const0_rtx && rtx_equal_p (op0, var))
3725 operands[1] = simplify_gen_relational (LT, VOIDmode,
3726 GET_MODE (op0),
3727 op0, const0_rtx);
3728
3729 operands[2] = constm1_rtx;
3730 op = and_optab;
3731 }
3732	  else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
3733 operands[2] = const0_rtx, op = ior_optab;
3734 else
3735 return false;
3736 }
3737 else
3738 return false;
3739
3740 orig_out = operands[0];
3741 tmp = gen_reg_rtx (mode);
3742 operands[0] = tmp;
3743
3744 /* Recurse to get the constant loaded. */
3745 if (!ix86_expand_int_movcc (operands))
3746 return false;
3747
3748 /* Mask in the interesting variable. */
3749 out = expand_binop (mode, op, var, tmp, orig_out, 0,
3750 OPTAB_WIDEN);
3751 if (!rtx_equal_p (out, orig_out))
3752 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
3753
3754 return true;
3755 }
3756
3757 /*
3758 * For comparison with above,
3759 *
3760 * movl cf,dest
3761 * movl ct,tmp
3762 * cmpl op1,op2
3763 * cmovcc tmp,dest
3764 *
3765 * Size 15.
3766 */
3767
3768 if (! nonimmediate_operand (operands[2], mode))
3769 operands[2] = force_reg (mode, operands[2]);
3770 if (! nonimmediate_operand (operands[3], mode))
3771 operands[3] = force_reg (mode, operands[3]);
3772
3773 if (! register_operand (operands[2], VOIDmode)
3774 && (mode == QImode
3775 || ! register_operand (operands[3], VOIDmode)))
3776 operands[2] = force_reg (mode, operands[2]);
3777
3778 if (mode == QImode
3779 && ! register_operand (operands[3], VOIDmode))
3780 operands[3] = force_reg (mode, operands[3]);
3781
3782 emit_insn (compare_seq);
3783 emit_insn (gen_rtx_SET (operands[0],
3784 gen_rtx_IF_THEN_ELSE (mode,
3785 compare_op, operands[2],
3786 operands[3])));
3787 return true;
3788 }
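
/* Illustrative sketch, not part of the original file: the identity behind
   the branchless cmp/sbb selection used for the constant cases above, for
   unsigned 32-bit operands with a hypothetical helper name.  sbb of a
   register with itself materializes -1 when the carry flag is set and 0
   otherwise; masking and adding then picks CT or CF without a branch.  */

static uint32_t
select_ltu_sketch (uint32_t a, uint32_t b, uint32_t ct, uint32_t cf)
{
  uint32_t mask = a < b ? 0xffffffffu : 0;	/* cmp a, b ; sbb dest, dest */
  return (mask & (ct - cf)) + cf;		/* and $(ct-cf) ; add $cf */
}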
3789
3790 /* Detect conditional moves that exactly match min/max operational
3791 semantics. Note that this is IEEE safe, as long as we don't
3792 interchange the operands.
3793
3794 Returns FALSE if this conditional move doesn't match a MIN/MAX,
3795 and TRUE if the operation is successful and instructions are emitted. */
3796
3797 static bool
3798 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
3799 rtx cmp_op1, rtx if_true, rtx if_false)
3800 {
3801 machine_mode mode;
3802 bool is_min;
3803 rtx tmp;
3804
3805 if (code == LT)
3806 ;
3807 else if (code == UNGE)
3808 std::swap (if_true, if_false);
3809 else
3810 return false;
3811
3812 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
3813 is_min = true;
3814 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
3815 is_min = false;
3816 else
3817 return false;
3818
3819 mode = GET_MODE (dest);
3820
3821 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
3822 but MODE may be a vector mode and thus not appropriate. */
3823 if (!flag_finite_math_only || flag_signed_zeros)
3824 {
3825 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
3826 rtvec v;
3827
3828 if_true = force_reg (mode, if_true);
3829 v = gen_rtvec (2, if_true, if_false);
3830 tmp = gen_rtx_UNSPEC (mode, v, u);
3831 }
3832 else
3833 {
3834 code = is_min ? SMIN : SMAX;
3835 if (MEM_P (if_true) && MEM_P (if_false))
3836 if_true = force_reg (mode, if_true);
3837 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
3838 }
3839
3840 emit_insn (gen_rtx_SET (dest, tmp));
3841 return true;
3842 }
3843
3844 /* Return true if MODE is valid for a vector compare to a mask register;
3845    the same holds for a conditional vector move with a mask register. */
3846 static bool
3847 ix86_valid_mask_cmp_mode (machine_mode mode)
3848 {
3849 /* XOP has its own vector conditional movement. */
3850 if (TARGET_XOP && !TARGET_AVX512F)
3851 return false;
3852
3853 /* HFmode only supports vcmpsh whose dest is mask register. */
3854 if (TARGET_AVX512FP16 && mode == HFmode)
3855 return true;
3856
3857 /* AVX512F is needed for mask operation. */
3858 if (!(TARGET_AVX512F && VECTOR_MODE_P (mode)))
3859 return false;
3860
3861 /* AVX512BW is needed for vector QI/HImode,
3862 AVX512VL is needed for 128/256-bit vector. */
3863 machine_mode inner_mode = GET_MODE_INNER (mode);
3864 int vector_size = GET_MODE_SIZE (mode);
3865 if ((inner_mode == QImode || inner_mode == HImode) && !TARGET_AVX512BW)
3866 return false;
3867
3868 return vector_size == 64 || TARGET_AVX512VL;
3869 }
3870
3871 /* Return true if integer mask comparison should be used. */
3872 static bool
3873 ix86_use_mask_cmp_p (machine_mode mode, machine_mode cmp_mode,
3874 rtx op_true, rtx op_false)
3875 {
3876 int vector_size = GET_MODE_SIZE (mode);
3877
3878 if (cmp_mode == HFmode)
3879 return true;
3880 else if (vector_size < 16)
3881 return false;
3882 else if (vector_size == 64)
3883 return true;
3884 else if (GET_MODE_INNER (cmp_mode) == HFmode)
3885 return true;
3886
3887 /* When op_true is NULL, op_false must be NULL, or vice versa. */
3888 gcc_assert (!op_true == !op_false);
3889
3890   /* When op_true/op_false is NULL or cmp_mode is not a valid mask cmp mode,
3891      a vector dest is required. */
3892 if (!op_true || !ix86_valid_mask_cmp_mode (cmp_mode))
3893 return false;
3894
3895 /* Exclude those that could be optimized in ix86_expand_sse_movcc. */
3896 if (op_false == CONST0_RTX (mode)
3897 || op_true == CONST0_RTX (mode)
3898 || (INTEGRAL_MODE_P (mode)
3899 && (op_true == CONSTM1_RTX (mode)
3900 || op_false == CONSTM1_RTX (mode))))
3901 return false;
3902
3903 return true;
3904 }
3905
3906 /* Expand an SSE comparison. Return the register with the result. */
3907
3908 static rtx
3909 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
3910 rtx op_true, rtx op_false)
3911 {
3912 machine_mode mode = GET_MODE (dest);
3913 machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
3914
3915   /* In the general case the result of the comparison can differ from the operands' type. */
3916 machine_mode cmp_mode;
3917
3918 /* In AVX512F the result of comparison is an integer mask. */
3919 bool maskcmp = false;
3920 rtx x;
3921
3922 if (ix86_use_mask_cmp_p (mode, cmp_ops_mode, op_true, op_false))
3923 {
3924 unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
3925 maskcmp = true;
3926 cmp_mode = nbits > 8 ? int_mode_for_size (nbits, 0).require () : E_QImode;
3927 }
3928 else
3929 cmp_mode = cmp_ops_mode;
3930
3931 cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
3932
3933 bool (*op1_predicate)(rtx, machine_mode)
3934 = VECTOR_MODE_P (cmp_ops_mode) ? vector_operand : nonimmediate_operand;
3935
3936 if (!op1_predicate (cmp_op1, cmp_ops_mode))
3937 cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
3938
3939 if (optimize
3940 || (maskcmp && cmp_mode != mode)
3941 || (op_true && reg_overlap_mentioned_p (dest, op_true))
3942 || (op_false && reg_overlap_mentioned_p (dest, op_false)))
3943 dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
3944
3945 if (maskcmp)
3946 {
3947 bool ok = ix86_expand_mask_vec_cmp (dest, code, cmp_op0, cmp_op1);
3948 gcc_assert (ok);
3949 return dest;
3950 }
3951
3952 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
3953
3954 if (cmp_mode != mode)
3955 {
3956 x = force_reg (cmp_ops_mode, x);
3957 convert_move (dest, x, false);
3958 }
3959 else
3960 emit_insn (gen_rtx_SET (dest, x));
3961
3962 return dest;
3963 }
3964
3965 /* Emit the x86 binary operation CODE in mode MODE for SSE vector
3966    instructions that can be performed using GP registers. */
3967
3968 static void
3969 ix86_emit_vec_binop (enum rtx_code code, machine_mode mode,
3970 rtx dst, rtx src1, rtx src2)
3971 {
3972 rtx tmp;
3973
3974 tmp = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
3975
3976 if (GET_MODE_SIZE (mode) <= GET_MODE_SIZE (SImode)
3977 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
3978 {
3979 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
3980 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
3981 }
3982
3983 emit_insn (tmp);
3984 }
3985
3986 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
3987 operations. This is used for both scalar and vector conditional moves. */
3988
3989 void
3990 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
3991 {
3992 machine_mode mode = GET_MODE (dest);
3993 machine_mode cmpmode = GET_MODE (cmp);
3994 rtx x;
3995
3996 /* Simplify trivial VEC_COND_EXPR to avoid ICE in pr97506. */
3997 if (rtx_equal_p (op_true, op_false))
3998 {
3999 emit_move_insn (dest, op_true);
4000 return;
4001 }
4002
4003 /* If we have an integer mask and FP value then we need
4004 to cast mask to FP mode. */
4005 if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
4006 {
4007 cmp = force_reg (cmpmode, cmp);
4008 cmp = gen_rtx_SUBREG (mode, cmp, 0);
4009 }
4010
4011 /* In AVX512F the result of comparison is an integer mask. */
4012 if (mode != cmpmode
4013 && GET_MODE_CLASS (cmpmode) == MODE_INT)
4014 {
4015 gcc_assert (ix86_valid_mask_cmp_mode (mode));
4016 /* Using scalar/vector move with mask register. */
4017 cmp = force_reg (cmpmode, cmp);
4018 /* Optimize for mask zero. */
4019 op_true = (op_true != CONST0_RTX (mode)
4020 ? force_reg (mode, op_true) : op_true);
4021 op_false = (op_false != CONST0_RTX (mode)
4022 ? force_reg (mode, op_false) : op_false);
4023 if (op_true == CONST0_RTX (mode))
4024 {
4025 if (cmpmode == E_DImode && !TARGET_64BIT)
4026 {
4027 x = gen_reg_rtx (cmpmode);
4028 emit_insn (gen_knotdi (x, cmp));
4029 }
4030 else
4031 x = expand_simple_unop (cmpmode, NOT, cmp, NULL, 1);
4032 cmp = x;
4033 /* Reverse op_true and op_false. */
4034 std::swap (op_true, op_false);
4035 }
4036
4037 if (mode == HFmode)
4038 emit_insn (gen_movhf_mask (dest, op_true, op_false, cmp));
4039 else
4040 emit_insn (gen_rtx_SET (dest,
4041 gen_rtx_VEC_MERGE (mode,
4042 op_true, op_false, cmp)));
4043 return;
4044 }
4045
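/* Since CMP is a full-width element mask (all-ones or all-zeros per
element), several cases reduce to a single logic operation:
cmp ? -1 : 0 is cmp itself, cmp ? x : 0 is cmp & x,
cmp ? 0 : x is ~cmp & x, and cmp ? -1 : x is cmp | x. */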
4046 if (vector_all_ones_operand (op_true, mode)
4047 && op_false == CONST0_RTX (mode))
4048 {
4049 emit_move_insn (dest, cmp);
4050 return;
4051 }
4052 else if (op_false == CONST0_RTX (mode))
4053 {
4054 x = expand_simple_binop (mode, AND, cmp, op_true,
4055 dest, 1, OPTAB_DIRECT);
4056 if (x != dest)
4057 emit_move_insn (dest, x);
4058 return;
4059 }
4060 else if (op_true == CONST0_RTX (mode))
4061 {
4062 op_false = force_reg (mode, op_false);
4063 x = gen_rtx_NOT (mode, cmp);
4064 ix86_emit_vec_binop (AND, mode, dest, x, op_false);
4065 return;
4066 }
4067 else if (vector_all_ones_operand (op_true, mode))
4068 {
4069 x = expand_simple_binop (mode, IOR, cmp, op_false,
4070 dest, 1, OPTAB_DIRECT);
4071 if (x != dest)
4072 emit_move_insn (dest, x);
4073 return;
4074 }
4075
4076 if (TARGET_XOP)
4077 {
4078 op_true = force_reg (mode, op_true);
4079
4080 if (GET_MODE_SIZE (mode) < 16
4081 || !nonimmediate_operand (op_false, mode))
4082 op_false = force_reg (mode, op_false);
4083
4084 emit_insn (gen_rtx_SET (dest,
4085 gen_rtx_IF_THEN_ELSE (mode, cmp,
4086 op_true, op_false)));
4087 return;
4088 }
4089
4090 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
4091 machine_mode blend_mode = mode;
4092
4093 if (GET_MODE_SIZE (mode) < 16
4094 || !vector_operand (op_true, mode))
4095 op_true = force_reg (mode, op_true);
4096
4097 op_false = force_reg (mode, op_false);
4098
4099 switch (mode)
4100 {
4101 case E_V2SFmode:
4102 if (TARGET_SSE4_1)
4103 gen = gen_mmx_blendvps;
4104 break;
4105 case E_V4SFmode:
4106 if (TARGET_SSE4_1)
4107 gen = gen_sse4_1_blendvps;
4108 break;
4109 case E_V2DFmode:
4110 if (TARGET_SSE4_1)
4111 gen = gen_sse4_1_blendvpd;
4112 break;
4113 case E_SFmode:
4114 if (TARGET_SSE4_1)
4115 gen = gen_sse4_1_blendvss;
4116 break;
4117 case E_DFmode:
4118 if (TARGET_SSE4_1)
4119 gen = gen_sse4_1_blendvsd;
4120 break;
4121 case E_V8QImode:
4122 case E_V4HImode:
4123 case E_V2SImode:
4124 if (TARGET_SSE4_1)
4125 {
4126 gen = gen_mmx_pblendvb_v8qi;
4127 blend_mode = V8QImode;
4128 }
4129 break;
4130 case E_V4QImode:
4131 case E_V2HImode:
4132 if (TARGET_SSE4_1)
4133 {
4134 gen = gen_mmx_pblendvb_v4qi;
4135 blend_mode = V4QImode;
4136 }
4137 break;
4138 case E_V2QImode:
4139 if (TARGET_SSE4_1)
4140 gen = gen_mmx_pblendvb_v2qi;
4141 break;
4142 case E_V16QImode:
4143 case E_V8HImode:
4144 case E_V8HFmode:
4145 case E_V8BFmode:
4146 case E_V4SImode:
4147 case E_V2DImode:
4148 case E_V1TImode:
4149 if (TARGET_SSE4_1)
4150 {
4151 gen = gen_sse4_1_pblendvb;
4152 blend_mode = V16QImode;
4153 }
4154 break;
4155 case E_V8SFmode:
4156 if (TARGET_AVX)
4157 gen = gen_avx_blendvps256;
4158 break;
4159 case E_V4DFmode:
4160 if (TARGET_AVX)
4161 gen = gen_avx_blendvpd256;
4162 break;
4163 case E_V32QImode:
4164 case E_V16HImode:
4165 case E_V16HFmode:
4166 case E_V16BFmode:
4167 case E_V8SImode:
4168 case E_V4DImode:
4169 if (TARGET_AVX2)
4170 {
4171 gen = gen_avx2_pblendvb;
4172 blend_mode = V32QImode;
4173 }
4174 break;
4175
4176 case E_V64QImode:
4177 gen = gen_avx512bw_blendmv64qi;
4178 break;
4179 case E_V32HImode:
4180 gen = gen_avx512bw_blendmv32hi;
4181 break;
4182 case E_V32HFmode:
4183 gen = gen_avx512bw_blendmv32hf;
4184 break;
4185 case E_V32BFmode:
4186 gen = gen_avx512bw_blendmv32bf;
4187 break;
4188 case E_V16SImode:
4189 gen = gen_avx512f_blendmv16si;
4190 break;
4191 case E_V8DImode:
4192 gen = gen_avx512f_blendmv8di;
4193 break;
4194 case E_V8DFmode:
4195 gen = gen_avx512f_blendmv8df;
4196 break;
4197 case E_V16SFmode:
4198 gen = gen_avx512f_blendmv16sf;
4199 break;
4200
4201 default:
4202 break;
4203 }
4204
4205 if (gen != NULL)
4206 {
4207 if (blend_mode == mode)
4208 x = dest;
4209 else
4210 {
4211 x = gen_reg_rtx (blend_mode);
4212 op_false = gen_lowpart (blend_mode, op_false);
4213 op_true = gen_lowpart (blend_mode, op_true);
4214 cmp = gen_lowpart (blend_mode, cmp);
4215 }
4216
4217 emit_insn (gen (x, op_false, op_true, cmp));
4218
4219 if (x != dest)
4220 emit_move_insn (dest, gen_lowpart (mode, x));
4221 }
4222 else
4223 {
4224 rtx t2, t3;
4225
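/* No blend instruction is available here; fall back to the classic
bit-select sequence (cmp & op_true) | (~cmp & op_false). */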
4226 t2 = expand_simple_binop (mode, AND, op_true, cmp,
4227 NULL, 1, OPTAB_DIRECT);
4228
4229 t3 = gen_reg_rtx (mode);
4230 x = gen_rtx_NOT (mode, cmp);
4231 ix86_emit_vec_binop (AND, mode, t3, x, op_false);
4232
4233 x = expand_simple_binop (mode, IOR, t3, t2,
4234 dest, 1, OPTAB_DIRECT);
4235 if (x != dest)
4236 emit_move_insn (dest, x);
4237 }
4238 }
4239
4240 /* Swap, force into registers, or otherwise massage the two operands
4241 to an sse comparison with a mask result. Thus we differ a bit from
4242 ix86_prepare_fp_compare_args which expects to produce a flags result.
4243
4244 The DEST operand exists to help determine whether to commute commutative
4245 operators. The POP0/POP1 operands are updated in place. The new
4246 comparison code is returned, or UNKNOWN if not implementable. */
4247
4248 static enum rtx_code
4249 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
4250 rtx *pop0, rtx *pop1)
4251 {
4252 switch (code)
4253 {
4254 case LTGT:
4255 case UNEQ:
4256 /* AVX supports all the needed comparisons. */
4257 if (TARGET_AVX)
4258 break;
4259 /* We have no LTGT as an operator. We could implement it with
4260 NE & ORDERED, but this requires an extra temporary. It's
4261 not clear that it's worth it. */
4262 return UNKNOWN;
4263
4264 case LT:
4265 case LE:
4266 case UNGT:
4267 case UNGE:
4268 /* These are supported directly. */
4269 break;
4270
4271 case EQ:
4272 case NE:
4273 case UNORDERED:
4274 case ORDERED:
4275 /* AVX has 3 operand comparisons, no need to swap anything. */
4276 if (TARGET_AVX)
4277 break;
4278 /* For commutative operators, try to canonicalize the destination
4279 operand to be first in the comparison - this helps reload to
4280 avoid extra moves. */
4281 if (!dest || !rtx_equal_p (dest, *pop1))
4282 break;
4283 /* FALLTHRU */
4284
4285 case GE:
4286 case GT:
4287 case UNLE:
4288 case UNLT:
4289 /* These are not supported directly before AVX, and furthermore
4290 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
4291 comparison operands to transform into something that is
4292 supported. */
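/* E.g. a > b becomes b < a and a unle b becomes b unge a, so only
the directly supported LT/LE/UNGT/UNGE forms remain. */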
4293 std::swap (*pop0, *pop1);
4294 code = swap_condition (code);
4295 break;
4296
4297 default:
4298 gcc_unreachable ();
4299 }
4300
4301 return code;
4302 }
4303
4304 /* Expand a floating-point conditional move. Return true if successful. */
4305
4306 bool
4307 ix86_expand_fp_movcc (rtx operands[])
4308 {
4309 machine_mode mode = GET_MODE (operands[0]);
4310 enum rtx_code code = GET_CODE (operands[1]);
4311 rtx tmp, compare_op;
4312 rtx op0 = XEXP (operands[1], 0);
4313 rtx op1 = XEXP (operands[1], 1);
4314
4315 if (GET_MODE (op0) == BFmode
4316 && !ix86_fp_comparison_operator (operands[1], VOIDmode))
4317 return false;
4318
4319 if (SSE_FLOAT_MODE_SSEMATH_OR_HF_P (mode))
4320 {
4321 machine_mode cmode;
4322
4323 /* Since we've no cmove for sse registers, don't force bad register
4324 allocation just to gain access to it. Deny movcc when the
4325 comparison mode doesn't match the move mode. */
4326 cmode = GET_MODE (op0);
4327 if (cmode == VOIDmode)
4328 cmode = GET_MODE (op1);
4329 if (cmode != mode)
4330 return false;
4331
4332 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
4333 if (code == UNKNOWN)
4334 return false;
4335
4336 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
4337 operands[2], operands[3]))
4338 return true;
4339
4340 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
4341 operands[2], operands[3]);
4342 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
4343 return true;
4344 }
4345
4346 if (GET_MODE (op0) == TImode
4347 || (GET_MODE (op0) == DImode
4348 && !TARGET_64BIT))
4349 return false;
4350
4351 /* The floating point conditional move instructions don't directly
4352 support conditions resulting from a signed integer comparison. */
4353
4354 compare_op = ix86_expand_compare (code, op0, op1);
4355 if (!fcmov_comparison_operator (compare_op, VOIDmode))
4356 {
4357 tmp = gen_reg_rtx (QImode);
4358 ix86_expand_setcc (tmp, code, op0, op1);
4359
4360 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
4361 }
4362
4363 emit_insn (gen_rtx_SET (operands[0],
4364 gen_rtx_IF_THEN_ELSE (mode, compare_op,
4365 operands[2], operands[3])));
4366
4367 return true;
4368 }
4369
4370 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
4371
4372 static int
4373 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
4374 {
4375 switch (code)
4376 {
4377 case EQ:
4378 return 0;
4379 case LT:
4380 case LTU:
4381 return 1;
4382 case LE:
4383 case LEU:
4384 return 2;
4385 case NE:
4386 return 4;
4387 case GE:
4388 case GEU:
4389 return 5;
4390 case GT:
4391 case GTU:
4392 return 6;
4393 default:
4394 gcc_unreachable ();
4395 }
4396 }
4397
4398 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
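/* The returned values follow the VCMPP{S,D}/VCMPS{S,D} predicate
encoding, e.g. 0x03 is UNORD_Q, 0x0c is NEQ_OQ (used for LTGT) and
0x18 is EQ_US (used for UNEQ). */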
4399
4400 static int
4401 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
4402 {
4403 switch (code)
4404 {
4405 case EQ:
4406 return 0x00;
4407 case NE:
4408 return 0x04;
4409 case GT:
4410 return 0x0e;
4411 case LE:
4412 return 0x02;
4413 case GE:
4414 return 0x0d;
4415 case LT:
4416 return 0x01;
4417 case UNLE:
4418 return 0x0a;
4419 case UNLT:
4420 return 0x09;
4421 case UNGE:
4422 return 0x05;
4423 case UNGT:
4424 return 0x06;
4425 case UNEQ:
4426 return 0x18;
4427 case LTGT:
4428 return 0x0c;
4429 case ORDERED:
4430 return 0x07;
4431 case UNORDERED:
4432 return 0x03;
4433 default:
4434 gcc_unreachable ();
4435 }
4436 }
4437
4438 /* Return immediate value to be used in UNSPEC_PCMP
4439 for comparison CODE in MODE. */
4440
4441 static int
4442 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
4443 {
4444 if (FLOAT_MODE_P (mode))
4445 return ix86_fp_cmp_code_to_pcmp_immediate (code);
4446 return ix86_int_cmp_code_to_pcmp_immediate (code);
4447 }
4448
4449 /* Expand AVX-512 vector comparison. */
4450
4451 bool
4452 ix86_expand_mask_vec_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1)
4453 {
4454 machine_mode mask_mode = GET_MODE (dest);
4455 machine_mode cmp_mode = GET_MODE (cmp_op0);
4456 rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
4457 int unspec_code;
4458 rtx unspec;
4459
4460 switch (code)
4461 {
4462 case LEU:
4463 case GTU:
4464 case GEU:
4465 case LTU:
4466 unspec_code = UNSPEC_UNSIGNED_PCMP;
4467 break;
4468
4469 default:
4470 unspec_code = UNSPEC_PCMP;
4471 }
4472
4473 unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, cmp_op0, cmp_op1, imm),
4474 unspec_code);
4475 emit_insn (gen_rtx_SET (dest, unspec));
4476
4477 return true;
4478 }
4479
4480 /* Expand fp vector comparison. */
4481
4482 bool
4483 ix86_expand_fp_vec_cmp (rtx operands[])
4484 {
4485 enum rtx_code code = GET_CODE (operands[1]);
4486 rtx cmp;
4487
4488 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
4489 &operands[2], &operands[3]);
4490 if (code == UNKNOWN)
4491 {
4492 rtx temp;
4493 switch (GET_CODE (operands[1]))
4494 {
4495 case LTGT:
4496 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
4497 operands[3], NULL, NULL);
4498 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
4499 operands[3], NULL, NULL);
4500 code = AND;
4501 break;
4502 case UNEQ:
4503 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
4504 operands[3], NULL, NULL);
4505 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
4506 operands[3], NULL, NULL);
4507 code = IOR;
4508 break;
4509 default:
4510 gcc_unreachable ();
4511 }
4512 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
4513 OPTAB_DIRECT);
4514 }
4515 else
4516 cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
4517 NULL, NULL);
4518
4519 if (operands[0] != cmp)
4520 emit_move_insn (operands[0], cmp);
4521
4522 return true;
4523 }
4524
4525 static rtx
4526 ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
4527 rtx op_true, rtx op_false, bool *negate)
4528 {
4529 machine_mode data_mode = GET_MODE (dest);
4530 machine_mode mode = GET_MODE (cop0);
4531 rtx x;
4532
4533 *negate = false;
4534
4535 /* XOP supports all of the comparisons on all 128-bit vector int types. */
4536 if (TARGET_XOP
4537 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT
4538 && GET_MODE_SIZE (mode) <= 16)
4539 ;
4540 /* AVX512F supports all of the comparisons
4541 on all 128/256/512-bit vector int types. */
4542 else if (ix86_use_mask_cmp_p (data_mode, mode, op_true, op_false))
4543 ;
4544 else
4545 {
4546 /* Canonicalize the comparison to EQ, GT, GTU. */
4547 switch (code)
4548 {
4549 case EQ:
4550 case GT:
4551 case GTU:
4552 break;
4553
4554 case LE:
4555 case LEU:
4556 /* x <= cst can be handled as x < cst + 1 unless there is
4557 wrap around in cst + 1. */
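/* E.g. for V4SImode, x <= {5,5,5,5} becomes {6,6,6,6} > x, i.e. GT
(or GTU for LEU) with the operands swapped. */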
4558 if (GET_CODE (cop1) == CONST_VECTOR
4559 && GET_MODE_INNER (mode) != TImode)
4560 {
4561 unsigned int n_elts = GET_MODE_NUNITS (mode), i;
4562 machine_mode eltmode = GET_MODE_INNER (mode);
4563 for (i = 0; i < n_elts; ++i)
4564 {
4565 rtx elt = CONST_VECTOR_ELT (cop1, i);
4566 if (!CONST_INT_P (elt))
4567 break;
4568 if (code == LE)
4569 {
4570 /* For LE punt if some element is signed maximum. */
4571 if ((INTVAL (elt) & (GET_MODE_MASK (eltmode) >> 1))
4572 == (GET_MODE_MASK (eltmode) >> 1))
4573 break;
4574 }
4575 /* For LEU punt if some element is unsigned maximum. */
4576 else if (elt == constm1_rtx)
4577 break;
4578 }
4579 if (i == n_elts)
4580 {
4581 rtvec v = rtvec_alloc (n_elts);
4582 for (i = 0; i < n_elts; ++i)
4583 RTVEC_ELT (v, i)
4584 = gen_int_mode (INTVAL (CONST_VECTOR_ELT (cop1, i)) + 1,
4585 eltmode);
4586 cop1 = gen_rtx_CONST_VECTOR (mode, v);
4587 std::swap (cop0, cop1);
4588 code = code == LE ? GT : GTU;
4589 break;
4590 }
4591 }
4592 /* FALLTHRU */
4593 case NE:
4594 code = reverse_condition (code);
4595 *negate = true;
4596 break;
4597
4598 case GE:
4599 case GEU:
4600 /* x >= cst can be handled as x > cst - 1 unless there is
4601 wrap around in cst - 1. */
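/* E.g. x >= {5,5,5,5} becomes x > {4,4,4,4}; no operand swap is
needed in this case. */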
4602 if (GET_CODE (cop1) == CONST_VECTOR
4603 && GET_MODE_INNER (mode) != TImode)
4604 {
4605 unsigned int n_elts = GET_MODE_NUNITS (mode), i;
4606 machine_mode eltmode = GET_MODE_INNER (mode);
4607 for (i = 0; i < n_elts; ++i)
4608 {
4609 rtx elt = CONST_VECTOR_ELT (cop1, i);
4610 if (!CONST_INT_P (elt))
4611 break;
4612 if (code == GE)
4613 {
4614 /* For GE punt if some element is signed minimum. */
4615 if (INTVAL (elt) < 0
4616 && ((INTVAL (elt) & (GET_MODE_MASK (eltmode) >> 1))
4617 == 0))
4618 break;
4619 }
4620 /* For GEU punt if some element is zero. */
4621 else if (elt == const0_rtx)
4622 break;
4623 }
4624 if (i == n_elts)
4625 {
4626 rtvec v = rtvec_alloc (n_elts);
4627 for (i = 0; i < n_elts; ++i)
4628 RTVEC_ELT (v, i)
4629 = gen_int_mode (INTVAL (CONST_VECTOR_ELT (cop1, i)) - 1,
4630 eltmode);
4631 cop1 = gen_rtx_CONST_VECTOR (mode, v);
4632 code = code == GE ? GT : GTU;
4633 break;
4634 }
4635 }
4636 code = reverse_condition (code);
4637 *negate = true;
4638 /* FALLTHRU */
4639
4640 case LT:
4641 case LTU:
4642 std::swap (cop0, cop1);
4643 code = swap_condition (code);
4644 break;
4645
4646 default:
4647 gcc_unreachable ();
4648 }
4649
4650 /* Only SSE4.1/SSE4.2 supports V2DImode. */
4651 if (mode == V2DImode)
4652 {
4653 switch (code)
4654 {
4655 case EQ:
4656 /* SSE4.1 supports EQ. */
4657 if (!TARGET_SSE4_1)
4658 return NULL;
4659 break;
4660
4661 case GT:
4662 case GTU:
4663 /* SSE4.2 supports GT/GTU. */
4664 if (!TARGET_SSE4_2)
4665 return NULL;
4666 break;
4667
4668 default:
4669 gcc_unreachable ();
4670 }
4671 }
4672
4673 if (GET_CODE (cop0) == CONST_VECTOR)
4674 cop0 = force_reg (mode, cop0);
4675 else if (GET_CODE (cop1) == CONST_VECTOR)
4676 cop1 = force_reg (mode, cop1);
4677
4678 rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode);
4679 rtx opfalse = op_false ? op_false : CONST0_RTX (data_mode);
4680 if (*negate)
4681 std::swap (optrue, opfalse);
4682
4683 /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when
4684 not using integer masks into min (x, y) == x ? -1 : 0 (i.e.
4685 min (x, y) == x). While we add one instruction (the minimum),
4686 we remove the need for two instructions in the negation, as the
4687 result is done this way.
4688 When using masks, do it for SI/DImode element types, as it is shorter
4689 than the two subtractions. */
4690 if ((code != EQ
4691 && GET_MODE_SIZE (mode) != 64
4692 && vector_all_ones_operand (opfalse, data_mode)
4693 && optrue == CONST0_RTX (data_mode))
4694 || (code == GTU
4695 && GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4
4696 /* Don't do it if not using integer masks and we'd end up with
4697 the right values in the registers though. */
4698 && (GET_MODE_SIZE (mode) == 64
4699 || !vector_all_ones_operand (optrue, data_mode)
4700 || opfalse != CONST0_RTX (data_mode))))
4701 {
4702 rtx (*gen) (rtx, rtx, rtx) = NULL;
4703
4704 switch (mode)
4705 {
4706 case E_V16SImode:
4707 gen = (code == GTU) ? gen_uminv16si3 : gen_sminv16si3;
4708 break;
4709 case E_V8DImode:
4710 gen = (code == GTU) ? gen_uminv8di3 : gen_sminv8di3;
4711 cop0 = force_reg (mode, cop0);
4712 cop1 = force_reg (mode, cop1);
4713 break;
4714 case E_V32QImode:
4715 if (TARGET_AVX2)
4716 gen = (code == GTU) ? gen_uminv32qi3 : gen_sminv32qi3;
4717 break;
4718 case E_V16HImode:
4719 if (TARGET_AVX2)
4720 gen = (code == GTU) ? gen_uminv16hi3 : gen_sminv16hi3;
4721 break;
4722 case E_V8SImode:
4723 if (TARGET_AVX2)
4724 gen = (code == GTU) ? gen_uminv8si3 : gen_sminv8si3;
4725 break;
4726 case E_V4DImode:
4727 if (TARGET_AVX512VL)
4728 {
4729 gen = (code == GTU) ? gen_uminv4di3 : gen_sminv4di3;
4730 cop0 = force_reg (mode, cop0);
4731 cop1 = force_reg (mode, cop1);
4732 }
4733 break;
4734 case E_V16QImode:
4735 if (code == GTU && TARGET_SSE2)
4736 gen = gen_uminv16qi3;
4737 else if (code == GT && TARGET_SSE4_1)
4738 gen = gen_sminv16qi3;
4739 break;
4740 case E_V8QImode:
4741 if (code == GTU && TARGET_SSE2)
4742 gen = gen_uminv8qi3;
4743 else if (code == GT && TARGET_SSE4_1)
4744 gen = gen_sminv8qi3;
4745 break;
4746 case E_V4QImode:
4747 if (code == GTU && TARGET_SSE2)
4748 gen = gen_uminv4qi3;
4749 else if (code == GT && TARGET_SSE4_1)
4750 gen = gen_sminv4qi3;
4751 break;
4752 case E_V2QImode:
4753 if (code == GTU && TARGET_SSE2)
4754 gen = gen_uminv2qi3;
4755 else if (code == GT && TARGET_SSE4_1)
4756 gen = gen_sminv2qi3;
4757 break;
4758 case E_V8HImode:
4759 if (code == GTU && TARGET_SSE4_1)
4760 gen = gen_uminv8hi3;
4761 else if (code == GT && TARGET_SSE2)
4762 gen = gen_sminv8hi3;
4763 break;
4764 case E_V4HImode:
4765 if (code == GTU && TARGET_SSE4_1)
4766 gen = gen_uminv4hi3;
4767 else if (code == GT && TARGET_SSE2)
4768 gen = gen_sminv4hi3;
4769 break;
4770 case E_V2HImode:
4771 if (code == GTU && TARGET_SSE4_1)
4772 gen = gen_uminv2hi3;
4773 else if (code == GT && TARGET_SSE2)
4774 gen = gen_sminv2hi3;
4775 break;
4776 case E_V4SImode:
4777 if (TARGET_SSE4_1)
4778 gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3;
4779 break;
4780 case E_V2SImode:
4781 if (TARGET_SSE4_1)
4782 gen = (code == GTU) ? gen_uminv2si3 : gen_sminv2si3;
4783 break;
4784 case E_V2DImode:
4785 if (TARGET_AVX512VL)
4786 {
4787 gen = (code == GTU) ? gen_uminv2di3 : gen_sminv2di3;
4788 cop0 = force_reg (mode, cop0);
4789 cop1 = force_reg (mode, cop1);
4790 }
4791 break;
4792 default:
4793 break;
4794 }
4795
4796 if (gen)
4797 {
4798 rtx tem = gen_reg_rtx (mode);
4799 if (!vector_operand (cop0, mode))
4800 cop0 = force_reg (mode, cop0);
4801 if (!vector_operand (cop1, mode))
4802 cop1 = force_reg (mode, cop1);
4803 *negate = !*negate;
4804 emit_insn (gen (tem, cop0, cop1));
4805 cop1 = tem;
4806 code = EQ;
4807 }
4808 }
4809
4810 /* Unsigned parallel compare is not supported by the hardware.
4811 Play some tricks to turn this into a signed comparison
4812 against 0. */
4813 if (code == GTU)
4814 {
4815 cop0 = force_reg (mode, cop0);
4816
4817 switch (mode)
4818 {
4819 case E_V16SImode:
4820 case E_V8DImode:
4821 case E_V8SImode:
4822 case E_V4DImode:
4823 case E_V4SImode:
4824 case E_V2SImode:
4825 case E_V2DImode:
4826 {
4827 rtx t1, t2, mask;
4828
4829 /* Subtract (-(INT MAX) - 1) from both operands to make
4830 them signed. */
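/* Subtracting the sign-bit constant (e.g. 0x80000000 for SImode
elements) flips the sign bit of each element, so a signed GT on the
biased values is equivalent to an unsigned GT on the originals. */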
4831 mask = ix86_build_signbit_mask (mode, true, false);
4832 t1 = gen_reg_rtx (mode);
4833 emit_insn (gen_sub3_insn (t1, cop0, mask));
4834
4835 t2 = gen_reg_rtx (mode);
4836 emit_insn (gen_sub3_insn (t2, cop1, mask));
4837
4838 cop0 = t1;
4839 cop1 = t2;
4840 code = GT;
4841 }
4842 break;
4843
4844 case E_V64QImode:
4845 case E_V32HImode:
4846 case E_V32QImode:
4847 case E_V16HImode:
4848 case E_V16QImode:
4849 case E_V8QImode:
4850 case E_V4QImode:
4851 case E_V2QImode:
4852 case E_V8HImode:
4853 case E_V4HImode:
4854 case E_V2HImode:
4855 /* Perform a parallel unsigned saturating subtraction. */
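/* x >u y iff the saturating difference x -us y is nonzero, so
compare that difference against zero and negate the result. */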
4856 x = gen_reg_rtx (mode);
4857 emit_insn (gen_rtx_SET
4858 (x, gen_rtx_US_MINUS (mode, cop0, cop1)));
4859 cop0 = x;
4860 cop1 = CONST0_RTX (mode);
4861 code = EQ;
4862 *negate = !*negate;
4863 break;
4864
4865 default:
4866 gcc_unreachable ();
4867 }
4868 }
4869 }
4870
4871 if (*negate)
4872 std::swap (op_true, op_false);
4873
4874 if (GET_CODE (cop1) == CONST_VECTOR)
4875 cop1 = force_reg (mode, cop1);
4876
4877 /* Allow the comparison to be done in one mode, but the movcc to
4878 happen in another mode. */
4879 if (data_mode == mode)
4880 x = ix86_expand_sse_cmp (dest, code, cop0, cop1, op_true, op_false);
4881 else
4882 {
4883 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
4884 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
4885 op_true, op_false);
4886 if (GET_MODE (x) == mode)
4887 x = gen_lowpart (data_mode, x);
4888 }
4889
4890 return x;
4891 }
4892
4893 /* Expand integer vector comparison. */
4894
4895 bool
4896 ix86_expand_int_vec_cmp (rtx operands[])
4897 {
4898 rtx_code code = GET_CODE (operands[1]);
4899 bool negate = false;
4900 rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
4901 operands[3], NULL, NULL, &negate);
4902
4903 if (!cmp)
4904 return false;
4905
4906 if (negate)
4907 cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
4908 CONST0_RTX (GET_MODE (cmp)),
4909 NULL, NULL, &negate);
4910
4911 gcc_assert (!negate);
4912
4913 if (operands[0] != cmp)
4914 emit_move_insn (operands[0], cmp);
4915
4916 return true;
4917 }
4918
4919 /* Expand a floating-point vector conditional move; a vcond operation
4920 rather than a movcc operation. */
4921
4922 bool
4923 ix86_expand_fp_vcond (rtx operands[])
4924 {
4925 enum rtx_code code = GET_CODE (operands[3]);
4926 rtx cmp;
4927
4928 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
4929 &operands[4], &operands[5]);
4930 if (code == UNKNOWN)
4931 {
4932 rtx temp;
4933 switch (GET_CODE (operands[3]))
4934 {
4935 case LTGT:
4936 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
4937 operands[5], operands[0], operands[0]);
4938 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
4939 operands[5], operands[1], operands[2]);
4940 code = AND;
4941 break;
4942 case UNEQ:
4943 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
4944 operands[5], operands[0], operands[0]);
4945 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
4946 operands[5], operands[1], operands[2]);
4947 code = IOR;
4948 break;
4949 default:
4950 gcc_unreachable ();
4951 }
4952 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
4953 OPTAB_DIRECT);
4954 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
4955 return true;
4956 }
4957
4958 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
4959 operands[5], operands[1], operands[2]))
4960 return true;
4961
4962 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
4963 operands[1], operands[2]);
4964 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
4965 return true;
4966 }
4967
4968 /* Expand a signed/unsigned integral vector conditional move. */
4969
4970 bool
4971 ix86_expand_int_vcond (rtx operands[])
4972 {
4973 machine_mode data_mode = GET_MODE (operands[0]);
4974 machine_mode mode = GET_MODE (operands[4]);
4975 enum rtx_code code = GET_CODE (operands[3]);
4976 bool negate = false;
4977 rtx x, cop0, cop1;
4978
4979 cop0 = operands[4];
4980 cop1 = operands[5];
4981
4982 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
4983 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
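/* The shift count is the element width minus one, e.g. 15 for
HImode elements and 63 for DImode elements. */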
4984 if ((code == LT || code == GE)
4985 && data_mode == mode
4986 && cop1 == CONST0_RTX (mode)
4987 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
4988 && GET_MODE_UNIT_SIZE (data_mode) > 1
4989 && GET_MODE_UNIT_SIZE (data_mode) <= 8
4990 && (GET_MODE_SIZE (data_mode) == 16
4991 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
4992 {
4993 rtx negop = operands[2 - (code == LT)];
4994 int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
4995 if (negop == CONST1_RTX (data_mode))
4996 {
4997 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
4998 operands[0], 1, OPTAB_DIRECT);
4999 if (res != operands[0])
5000 emit_move_insn (operands[0], res);
5001 return true;
5002 }
5003 else if (GET_MODE_INNER (data_mode) != DImode
5004 && vector_all_ones_operand (negop, data_mode))
5005 {
5006 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
5007 operands[0], 0, OPTAB_DIRECT);
5008 if (res != operands[0])
5009 emit_move_insn (operands[0], res);
5010 return true;
5011 }
5012 }
5013
5014 if (!nonimmediate_operand (cop1, mode))
5015 cop1 = force_reg (mode, cop1);
5016 if (!general_operand (operands[1], data_mode))
5017 operands[1] = force_reg (data_mode, operands[1]);
5018 if (!general_operand (operands[2], data_mode))
5019 operands[2] = force_reg (data_mode, operands[2]);
5020
5021 x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
5022 operands[1], operands[2], &negate);
5023
5024 if (!x)
5025 return false;
5026
5027 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
5028 operands[2-negate]);
5029 return true;
5030 }
5031
5032 static bool
5033 ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
5034 struct expand_vec_perm_d *d)
5035 {
5036 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
5037 expander, so args are either in d, or in op0, op1 etc. */
5038 machine_mode mode = GET_MODE (d ? d->op0 : op0);
5039 machine_mode maskmode = mode;
5040 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
5041
5042 switch (mode)
5043 {
5044 case E_V16QImode:
5045 if (TARGET_AVX512VL && TARGET_AVX512VBMI)
5046 gen = gen_avx512vl_vpermt2varv16qi3;
5047 break;
5048 case E_V32QImode:
5049 if (TARGET_AVX512VL && TARGET_AVX512VBMI)
5050 gen = gen_avx512vl_vpermt2varv32qi3;
5051 break;
5052 case E_V64QImode:
5053 if (TARGET_AVX512VBMI)
5054 gen = gen_avx512bw_vpermt2varv64qi3;
5055 break;
5056 case E_V8HImode:
5057 if (TARGET_AVX512VL && TARGET_AVX512BW)
5058 gen = gen_avx512vl_vpermt2varv8hi3;
5059 break;
5060 case E_V16HImode:
5061 if (TARGET_AVX512VL && TARGET_AVX512BW)
5062 gen = gen_avx512vl_vpermt2varv16hi3;
5063 break;
5064 case E_V32HImode:
5065 if (TARGET_AVX512BW)
5066 gen = gen_avx512bw_vpermt2varv32hi3;
5067 break;
5068 case E_V4SImode:
5069 if (TARGET_AVX512VL)
5070 gen = gen_avx512vl_vpermt2varv4si3;
5071 break;
5072 case E_V8SImode:
5073 if (TARGET_AVX512VL)
5074 gen = gen_avx512vl_vpermt2varv8si3;
5075 break;
5076 case E_V16SImode:
5077 if (TARGET_AVX512F)
5078 gen = gen_avx512f_vpermt2varv16si3;
5079 break;
5080 case E_V4SFmode:
5081 if (TARGET_AVX512VL)
5082 {
5083 gen = gen_avx512vl_vpermt2varv4sf3;
5084 maskmode = V4SImode;
5085 }
5086 break;
5087 case E_V8SFmode:
5088 if (TARGET_AVX512VL)
5089 {
5090 gen = gen_avx512vl_vpermt2varv8sf3;
5091 maskmode = V8SImode;
5092 }
5093 break;
5094 case E_V16SFmode:
5095 if (TARGET_AVX512F)
5096 {
5097 gen = gen_avx512f_vpermt2varv16sf3;
5098 maskmode = V16SImode;
5099 }
5100 break;
5101 case E_V2DImode:
5102 if (TARGET_AVX512VL)
5103 gen = gen_avx512vl_vpermt2varv2di3;
5104 break;
5105 case E_V4DImode:
5106 if (TARGET_AVX512VL)
5107 gen = gen_avx512vl_vpermt2varv4di3;
5108 break;
5109 case E_V8DImode:
5110 if (TARGET_AVX512F)
5111 gen = gen_avx512f_vpermt2varv8di3;
5112 break;
5113 case E_V2DFmode:
5114 if (TARGET_AVX512VL)
5115 {
5116 gen = gen_avx512vl_vpermt2varv2df3;
5117 maskmode = V2DImode;
5118 }
5119 break;
5120 case E_V4DFmode:
5121 if (TARGET_AVX512VL)
5122 {
5123 gen = gen_avx512vl_vpermt2varv4df3;
5124 maskmode = V4DImode;
5125 }
5126 break;
5127 case E_V8DFmode:
5128 if (TARGET_AVX512F)
5129 {
5130 gen = gen_avx512f_vpermt2varv8df3;
5131 maskmode = V8DImode;
5132 }
5133 break;
5134 default:
5135 break;
5136 }
5137
5138 if (gen == NULL)
5139 return false;
5140
5141 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
5142 expander, so args are either in d, or in op0, op1 etc. */
5143 if (d)
5144 {
5145 rtx vec[64];
5146 target = d->target;
5147 op0 = d->op0;
5148 op1 = d->op1;
5149 for (int i = 0; i < d->nelt; ++i)
5150 vec[i] = GEN_INT (d->perm[i]);
5151 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
5152 }
5153
5154 emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
5155 return true;
5156 }
5157
5158 /* Expand a variable vector permutation. */
5159
5160 void
5161 ix86_expand_vec_perm (rtx operands[])
5162 {
5163 rtx target = operands[0];
5164 rtx op0 = operands[1];
5165 rtx op1 = operands[2];
5166 rtx mask = operands[3];
5167 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
5168 machine_mode mode = GET_MODE (op0);
5169 machine_mode maskmode = GET_MODE (mask);
5170 int w, e, i;
5171 bool one_operand_shuffle = rtx_equal_p (op0, op1);
5172
5173 /* Number of elements in the vector. */
5174 w = GET_MODE_NUNITS (mode);
5175 e = GET_MODE_UNIT_SIZE (mode);
5176 gcc_assert (w <= 64);
5177
5178 /* For HF mode vector, convert it to HI using subreg. */
5179 if (GET_MODE_INNER (mode) == HFmode)
5180 {
5181 machine_mode orig_mode = mode;
5182 mode = mode_for_vector (HImode, w).require ();
5183 target = lowpart_subreg (mode, target, orig_mode);
5184 op0 = lowpart_subreg (mode, op0, orig_mode);
5185 op1 = lowpart_subreg (mode, op1, orig_mode);
5186 }
5187
5188 if (TARGET_AVX512F && one_operand_shuffle)
5189 {
5190 rtx (*gen) (rtx, rtx, rtx) = NULL;
5191 switch (mode)
5192 {
5193 case E_V16SImode:
5194 gen = gen_avx512f_permvarv16si;
5195 break;
5196 case E_V16SFmode:
5197 gen = gen_avx512f_permvarv16sf;
5198 break;
5199 case E_V8DImode:
5200 gen = gen_avx512f_permvarv8di;
5201 break;
5202 case E_V8DFmode:
5203 gen = gen_avx512f_permvarv8df;
5204 break;
5205 default:
5206 break;
5207 }
5208 if (gen != NULL)
5209 {
5210 emit_insn (gen (target, op0, mask));
5211 return;
5212 }
5213 }
5214
5215 if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
5216 return;
5217
5218 if (TARGET_AVX2)
5219 {
5220 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
5221 {
5222 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
5223 a constant shuffle operand. With a tiny bit of effort we can
5224 use VPERMD instead. A re-interpretation stall for V4DFmode is
5225 unfortunate but there's no avoiding it.
5226 Similarly for V16HImode we don't have instructions for variable
5227 shuffling, while for V32QImode we can, after preparing suitable
5228 masks, use vpshufb; vpshufb; vpermq; vpor. */
5229
5230 if (mode == V16HImode)
5231 {
5232 maskmode = mode = V32QImode;
5233 w = 32;
5234 e = 1;
5235 }
5236 else
5237 {
5238 maskmode = mode = V8SImode;
5239 w = 8;
5240 e = 4;
5241 }
5242 t1 = gen_reg_rtx (maskmode);
5243
5244 /* Replicate the low bits of the V4DImode mask into V8SImode:
5245 mask = { A B C D }
5246 t1 = { A A B B C C D D }. */
5247 for (i = 0; i < w / 2; ++i)
5248 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
5249 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
5250 vt = force_reg (maskmode, vt);
5251 mask = gen_lowpart (maskmode, mask);
5252 if (maskmode == V8SImode)
5253 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
5254 else
5255 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
5256
5257 /* Multiply the shuffle indices by two. */
5258 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
5259 OPTAB_DIRECT);
5260
5261 /* Add one to the odd shuffle indices:
5262 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
5263 for (i = 0; i < w / 2; ++i)
5264 {
5265 vec[i * 2] = const0_rtx;
5266 vec[i * 2 + 1] = const1_rtx;
5267 }
5268 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
5269 vt = validize_mem (force_const_mem (maskmode, vt));
5270 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
5271 OPTAB_DIRECT);
5272
5273 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
5274 operands[3] = mask = t1;
5275 target = gen_reg_rtx (mode);
5276 op0 = gen_lowpart (mode, op0);
5277 op1 = gen_lowpart (mode, op1);
5278 }
5279
5280 switch (mode)
5281 {
5282 case E_V8SImode:
5283 /* The VPERMD and VPERMPS instructions already properly ignore
5284 the high bits of the shuffle elements. No need for us to
5285 perform an AND ourselves. */
5286 if (one_operand_shuffle)
5287 {
5288 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
5289 if (target != operands[0])
5290 emit_move_insn (operands[0],
5291 gen_lowpart (GET_MODE (operands[0]), target));
5292 }
5293 else
5294 {
5295 t1 = gen_reg_rtx (V8SImode);
5296 t2 = gen_reg_rtx (V8SImode);
5297 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
5298 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
5299 goto merge_two;
5300 }
5301 return;
5302
5303 case E_V8SFmode:
5304 mask = gen_lowpart (V8SImode, mask);
5305 if (one_operand_shuffle)
5306 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
5307 else
5308 {
5309 t1 = gen_reg_rtx (V8SFmode);
5310 t2 = gen_reg_rtx (V8SFmode);
5311 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
5312 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
5313 goto merge_two;
5314 }
5315 return;
5316
5317 case E_V4SImode:
5318 /* By combining the two 128-bit input vectors into one 256-bit
5319 input vector, we can use VPERMD and VPERMPS for the full
5320 two-operand shuffle. */
5321 t1 = gen_reg_rtx (V8SImode);
5322 t2 = gen_reg_rtx (V8SImode);
5323 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
5324 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
5325 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
5326 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
5327 return;
5328
5329 case E_V4SFmode:
5330 t1 = gen_reg_rtx (V8SFmode);
5331 t2 = gen_reg_rtx (V8SImode);
5332 mask = gen_lowpart (V4SImode, mask);
5333 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
5334 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
5335 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
5336 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
5337 return;
5338
5339 case E_V32QImode:
5340 t1 = gen_reg_rtx (V32QImode);
5341 t2 = gen_reg_rtx (V32QImode);
5342 t3 = gen_reg_rtx (V32QImode);
5343 vt2 = GEN_INT (-128);
5344 vt = gen_const_vec_duplicate (V32QImode, vt2);
5345 vt = force_reg (V32QImode, vt);
5346 for (i = 0; i < 32; i++)
5347 vec[i] = i < 16 ? vt2 : const0_rtx;
5348 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
5349 vt2 = force_reg (V32QImode, vt2);
5350 /* From mask create two adjusted masks, which contain the same
5351 bits as mask in the low 7 bits of each vector element.
5352 The first mask will have the most significant bit clear
5353 if it requests element from the same 128-bit lane
5354 and MSB set if it requests element from the other 128-bit lane.
5355 The second mask will have the opposite values of the MSB,
5356 and additionally will have its 128-bit lanes swapped.
5357 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
5358 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
5359 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
5360 stands for other 12 bytes. */
5361 /* Whether an element comes from the same lane or the other
5362 lane is indicated by bit 4, so shift it up by 3 to the MSB position. */
5363 t5 = gen_reg_rtx (V4DImode);
5364 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
5365 GEN_INT (3)));
5366 /* Clear MSB bits from the mask just in case it had them set. */
5367 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
5368 /* After this t1 will have MSB set for elements from other lane. */
5369 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
5370 /* Clear bits other than MSB. */
5371 emit_insn (gen_andv32qi3 (t1, t1, vt));
5372 /* Or in the lower bits from mask into t3. */
5373 emit_insn (gen_iorv32qi3 (t3, t1, t2));
5374 /* And invert MSB bits in t1, so MSB is set for elements from the same
5375 lane. */
5376 emit_insn (gen_xorv32qi3 (t1, t1, vt));
5377 /* Swap 128-bit lanes in t3. */
5378 t6 = gen_reg_rtx (V4DImode);
5379 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
5380 const2_rtx, GEN_INT (3),
5381 const0_rtx, const1_rtx));
5382 /* And or in the lower bits from mask into t1. */
5383 emit_insn (gen_iorv32qi3 (t1, t1, t2));
5384 if (one_operand_shuffle)
5385 {
5386 /* Each of these shuffles will put 0s in places where
5387 element from the other 128-bit lane is needed, otherwise
5388 will shuffle in the requested value. */
5389 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
5390 gen_lowpart (V32QImode, t6)));
5391 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
5392 /* For t3 the 128-bit lanes are swapped again. */
5393 t7 = gen_reg_rtx (V4DImode);
5394 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
5395 const2_rtx, GEN_INT (3),
5396 const0_rtx, const1_rtx));
5397 /* And oring both together leads to the result. */
5398 emit_insn (gen_iorv32qi3 (target, t1,
5399 gen_lowpart (V32QImode, t7)));
5400 if (target != operands[0])
5401 emit_move_insn (operands[0],
5402 gen_lowpart (GET_MODE (operands[0]), target));
5403 return;
5404 }
5405
5406 t4 = gen_reg_rtx (V32QImode);
5407 /* Similarly to the above one_operand_shuffle code,
5408 just repeated twice, once for each operand. The merge_two:
5409 code will merge the two results together. */
5410 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
5411 gen_lowpart (V32QImode, t6)));
5412 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
5413 gen_lowpart (V32QImode, t6)));
5414 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
5415 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
5416 t7 = gen_reg_rtx (V4DImode);
5417 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
5418 const2_rtx, GEN_INT (3),
5419 const0_rtx, const1_rtx));
5420 t8 = gen_reg_rtx (V4DImode);
5421 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
5422 const2_rtx, GEN_INT (3),
5423 const0_rtx, const1_rtx));
5424 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
5425 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
5426 t1 = t4;
5427 t2 = t3;
5428 goto merge_two;
5429
5430 default:
5431 gcc_assert (GET_MODE_SIZE (mode) <= 16);
5432 break;
5433 }
5434 }
5435
5436 if (TARGET_XOP)
5437 {
5438 /* The XOP VPPERM insn supports three inputs. By ignoring the
5439 one_operand_shuffle special case, we avoid creating another
5440 set of constant vectors in memory. */
5441 one_operand_shuffle = false;
5442
5443 /* mask = mask & {2*w-1, ...} */
5444 vt = GEN_INT (2*w - 1);
5445 }
5446 else
5447 {
5448 /* mask = mask & {w-1, ...} */
5449 vt = GEN_INT (w - 1);
5450 }
5451
5452 vt = gen_const_vec_duplicate (maskmode, vt);
5453 mask = expand_simple_binop (maskmode, AND, mask, vt,
5454 NULL_RTX, 0, OPTAB_DIRECT);
5455
5456 /* For non-QImode operations, convert the word permutation control
5457 into a byte permutation control. */
5458 if (mode != V16QImode)
5459 {
5460 mask = expand_simple_binop (maskmode, ASHIFT, mask,
5461 GEN_INT (exact_log2 (e)),
5462 NULL_RTX, 0, OPTAB_DIRECT);
5463
5464 /* Convert mask to vector of chars. */
5465 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
5466
5467 /* Replicate each of the input bytes into byte positions:
5468 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
5469 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
5470 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
5471 for (i = 0; i < 16; ++i)
5472 vec[i] = GEN_INT (i/e * e);
5473 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
5474 vt = validize_mem (force_const_mem (V16QImode, vt));
5475 if (TARGET_XOP)
5476 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
5477 else
5478 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
5479
5480 /* Convert it into the byte positions by doing
5481 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
5482 for (i = 0; i < 16; ++i)
5483 vec[i] = GEN_INT (i % e);
5484 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
5485 vt = validize_mem (force_const_mem (V16QImode, vt));
5486 emit_insn (gen_addv16qi3 (mask, mask, vt));
5487 }
5488
5489 /* The actual shuffle operations all operate on V16QImode. */
5490 op0 = gen_lowpart (V16QImode, op0);
5491 op1 = gen_lowpart (V16QImode, op1);
5492
5493 if (TARGET_XOP)
5494 {
5495 if (GET_MODE (target) != V16QImode)
5496 target = gen_reg_rtx (V16QImode);
5497 emit_insn (gen_xop_pperm (target, op0, op1, mask));
5498 if (target != operands[0])
5499 emit_move_insn (operands[0],
5500 gen_lowpart (GET_MODE (operands[0]), target));
5501 }
5502 else if (one_operand_shuffle)
5503 {
5504 if (GET_MODE (target) != V16QImode)
5505 target = gen_reg_rtx (V16QImode);
5506 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
5507 if (target != operands[0])
5508 emit_move_insn (operands[0],
5509 gen_lowpart (GET_MODE (operands[0]), target));
5510 }
5511 else
5512 {
5513 rtx xops[6];
5514 bool ok;
5515
5516 /* Shuffle the two input vectors independently. */
5517 t1 = gen_reg_rtx (V16QImode);
5518 t2 = gen_reg_rtx (V16QImode);
5519 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
5520 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
5521
5522 merge_two:
5523 /* Then merge them together. The key is whether any given control
5524 element contained a bit set that indicates the second word. */
5525 mask = operands[3];
5526 vt = GEN_INT (w);
5527 if (maskmode == V2DImode && !TARGET_SSE4_1)
5528 {
5529 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
5530 more shuffle to convert the V2DI input mask into a V4SI
5531 input mask, at which point the masking done by ix86_expand_int_vcond
5532 will work as desired. */
5533 rtx t3 = gen_reg_rtx (V4SImode);
5534 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
5535 const0_rtx, const0_rtx,
5536 const2_rtx, const2_rtx));
5537 mask = t3;
5538 maskmode = V4SImode;
5539 e = w = 4;
5540 }
5541
5542 vt = gen_const_vec_duplicate (maskmode, vt);
5543 vt = force_reg (maskmode, vt);
5544 mask = expand_simple_binop (maskmode, AND, mask, vt,
5545 NULL_RTX, 0, OPTAB_DIRECT);
5546
5547 if (GET_MODE (target) != mode)
5548 target = gen_reg_rtx (mode);
5549 xops[0] = target;
5550 xops[1] = gen_lowpart (mode, t2);
5551 xops[2] = gen_lowpart (mode, t1);
5552 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
5553 xops[4] = mask;
5554 xops[5] = vt;
5555 ok = ix86_expand_int_vcond (xops);
5556 gcc_assert (ok);
5557 if (target != operands[0])
5558 emit_move_insn (operands[0],
5559 gen_lowpart (GET_MODE (operands[0]), target));
5560 }
5561 }
5562
5563 /* Unpack SRC into the next wider integer vector type. UNSIGNED_P is
5564 true if we should do zero extension, else sign extension. HIGH_P is
5565 true if we want the N/2 high elements, else the low elements. */
5566
5567 void
5568 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
5569 {
5570 machine_mode imode = GET_MODE (src);
5571 rtx tmp;
5572
5573 if (TARGET_SSE4_1)
5574 {
5575 rtx (*unpack)(rtx, rtx);
5576 rtx (*extract)(rtx, rtx) = NULL;
5577 machine_mode halfmode = BLKmode;
5578
5579 switch (imode)
5580 {
5581 case E_V64QImode:
5582 if (unsigned_p)
5583 unpack = gen_avx512bw_zero_extendv32qiv32hi2;
5584 else
5585 unpack = gen_avx512bw_sign_extendv32qiv32hi2;
5586 halfmode = V32QImode;
5587 extract
5588 = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
5589 break;
5590 case E_V32QImode:
5591 if (unsigned_p)
5592 unpack = gen_avx2_zero_extendv16qiv16hi2;
5593 else
5594 unpack = gen_avx2_sign_extendv16qiv16hi2;
5595 halfmode = V16QImode;
5596 extract
5597 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
5598 break;
5599 case E_V32HImode:
5600 if (unsigned_p)
5601 unpack = gen_avx512f_zero_extendv16hiv16si2;
5602 else
5603 unpack = gen_avx512f_sign_extendv16hiv16si2;
5604 halfmode = V16HImode;
5605 extract
5606 = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
5607 break;
5608 case E_V16HImode:
5609 if (unsigned_p)
5610 unpack = gen_avx2_zero_extendv8hiv8si2;
5611 else
5612 unpack = gen_avx2_sign_extendv8hiv8si2;
5613 halfmode = V8HImode;
5614 extract
5615 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
5616 break;
5617 case E_V16SImode:
5618 if (unsigned_p)
5619 unpack = gen_avx512f_zero_extendv8siv8di2;
5620 else
5621 unpack = gen_avx512f_sign_extendv8siv8di2;
5622 halfmode = V8SImode;
5623 extract
5624 = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
5625 break;
5626 case E_V8SImode:
5627 if (unsigned_p)
5628 unpack = gen_avx2_zero_extendv4siv4di2;
5629 else
5630 unpack = gen_avx2_sign_extendv4siv4di2;
5631 halfmode = V4SImode;
5632 extract
5633 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
5634 break;
5635 case E_V16QImode:
5636 if (unsigned_p)
5637 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
5638 else
5639 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
5640 break;
5641 case E_V8HImode:
5642 if (unsigned_p)
5643 unpack = gen_sse4_1_zero_extendv4hiv4si2;
5644 else
5645 unpack = gen_sse4_1_sign_extendv4hiv4si2;
5646 break;
5647 case E_V4SImode:
5648 if (unsigned_p)
5649 unpack = gen_sse4_1_zero_extendv2siv2di2;
5650 else
5651 unpack = gen_sse4_1_sign_extendv2siv2di2;
5652 break;
5653 case E_V8QImode:
5654 if (unsigned_p)
5655 unpack = gen_sse4_1_zero_extendv4qiv4hi2;
5656 else
5657 unpack = gen_sse4_1_sign_extendv4qiv4hi2;
5658 break;
5659 case E_V4HImode:
5660 if (unsigned_p)
5661 unpack = gen_sse4_1_zero_extendv2hiv2si2;
5662 else
5663 unpack = gen_sse4_1_sign_extendv2hiv2si2;
5664 break;
5665 case E_V4QImode:
5666 if (unsigned_p)
5667 unpack = gen_sse4_1_zero_extendv2qiv2hi2;
5668 else
5669 unpack = gen_sse4_1_sign_extendv2qiv2hi2;
5670 break;
5671 default:
5672 gcc_unreachable ();
5673 }
5674
5675 if (GET_MODE_SIZE (imode) >= 32)
5676 {
5677 tmp = gen_reg_rtx (halfmode);
5678 emit_insn (extract (tmp, src));
5679 }
5680 else if (high_p)
5681 {
5682 switch (GET_MODE_SIZE (imode))
5683 {
5684 case 16:
5685 /* Shift higher 8 bytes to lower 8 bytes. */
5686 tmp = gen_reg_rtx (V1TImode);
5687 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
5688 GEN_INT (64)));
5689 break;
5690 case 8:
5691 /* Shift higher 4 bytes to lower 4 bytes. */
5692 tmp = gen_reg_rtx (V1DImode);
5693 emit_insn (gen_mmx_lshrv1di3 (tmp, gen_lowpart (V1DImode, src),
5694 GEN_INT (32)));
5695 break;
5696 case 4:
5697 /* Shift higher 2 bytes to lower 2 bytes. */
5698 tmp = gen_reg_rtx (V1SImode);
5699 emit_insn (gen_mmx_lshrv1si3 (tmp, gen_lowpart (V1SImode, src),
5700 GEN_INT (16)));
5701 break;
5702 default:
5703 gcc_unreachable ();
5704 }
5705
5706 tmp = gen_lowpart (imode, tmp);
5707 }
5708 else
5709 tmp = src;
5710
5711 emit_insn (unpack (dest, tmp));
5712 }
5713 else
5714 {
5715 rtx (*unpack)(rtx, rtx, rtx);
5716
5717 switch (imode)
5718 {
5719 case E_V16QImode:
5720 if (high_p)
5721 unpack = gen_vec_interleave_highv16qi;
5722 else
5723 unpack = gen_vec_interleave_lowv16qi;
5724 break;
5725 case E_V8HImode:
5726 if (high_p)
5727 unpack = gen_vec_interleave_highv8hi;
5728 else
5729 unpack = gen_vec_interleave_lowv8hi;
5730 break;
5731 case E_V4SImode:
5732 if (high_p)
5733 unpack = gen_vec_interleave_highv4si;
5734 else
5735 unpack = gen_vec_interleave_lowv4si;
5736 break;
5737 case E_V8QImode:
5738 if (high_p)
5739 unpack = gen_mmx_punpckhbw;
5740 else
5741 unpack = gen_mmx_punpcklbw;
5742 break;
5743 case E_V4HImode:
5744 if (high_p)
5745 unpack = gen_mmx_punpckhwd;
5746 else
5747 unpack = gen_mmx_punpcklwd;
5748 break;
5749 case E_V4QImode:
5750 if (high_p)
5751 unpack = gen_mmx_punpckhbw_low;
5752 else
5753 unpack = gen_mmx_punpcklbw_low;
5754 break;
5755 default:
5756 gcc_unreachable ();
5757 }
5758
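/* Without the SSE4.1 extension insns, widen by interleaving with the
extension bits: zeros for zero extension, or a per-element copy of
the sign bit (computed as 0 > src) for sign extension. */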
5759 if (unsigned_p)
5760 tmp = force_reg (imode, CONST0_RTX (imode));
5761 else
5762 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
5763 src, pc_rtx, pc_rtx);
5764
5765 rtx tmp2 = gen_reg_rtx (imode);
5766 emit_insn (unpack (tmp2, src, tmp));
5767 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
5768 }
5769 }
5770
5771 /* Return true if MEM is a constant pool constant which contains a const_vector
5772 perm index; if so, assign the index to PERM. */
5773 bool
5774 ix86_extract_perm_from_pool_constant (int* perm, rtx mem)
5775 {
5776 machine_mode mode = GET_MODE (mem);
5777 int nelt = GET_MODE_NUNITS (mode);
5778
5779 if (!INTEGRAL_MODE_P (mode))
5780 return false;
5781
5782 /* Needs to be constant pool. */
5783 if (!(MEM_P (mem))
5784 || !SYMBOL_REF_P (XEXP (mem, 0))
5785 || !CONSTANT_POOL_ADDRESS_P (XEXP (mem, 0)))
5786 return false;
5787
5788 rtx constant = get_pool_constant (XEXP (mem, 0));
5789
5790 if (GET_CODE (constant) != CONST_VECTOR)
5791 return false;
5792
5793 /* There could be some rtx like
5794 (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
5795 but with "*.LC1" refer to V2DI constant vector. */
5796 if (GET_MODE (constant) != mode)
5797 {
5798 constant = simplify_subreg (mode, constant, GET_MODE (constant), 0);
5799
5800 if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR)
5801 return false;
5802 }
5803
5804 for (int i = 0; i != nelt; i++)
5805 perm[i] = UINTVAL (XVECEXP (constant, 0, i));
5806
5807 return true;
5808 }
5809
5810 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
5811 but works for floating point parameters and non-offsettable memories.
5812 For pushes, it returns just stack offsets; the values will be saved
5813 in the right order. At most four parts are generated. */
5814
5815 static int
5816 ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
5817 {
5818 int size;
5819
5820 if (!TARGET_64BIT)
5821 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
5822 else
5823 size = (GET_MODE_SIZE (mode) + 4) / 8;
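/* E.g. on 32-bit targets DImode and DFmode split into 2 SImode parts,
XFmode into 3 and TFmode into 4; on 64-bit targets XFmode and TFmode
split into 2 word-size parts. */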
5824
5825 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
5826 gcc_assert (size >= 2 && size <= 4);
5827
5828 /* Optimize constant pool references to immediates. This is used by fp
5829 moves, which force all constants to memory to allow combining. */
5830 if (MEM_P (operand) && MEM_READONLY_P (operand))
5831 operand = avoid_constant_pool_reference (operand);
5832
5833 if (MEM_P (operand) && !offsettable_memref_p (operand))
5834 {
5835 /* The only non-offsettable memories we handle are pushes. */
5836 int ok = push_operand (operand, VOIDmode);
5837
5838 gcc_assert (ok);
5839
5840 operand = copy_rtx (operand);
5841 PUT_MODE (operand, word_mode);
5842 parts[0] = parts[1] = parts[2] = parts[3] = operand;
5843 return size;
5844 }
5845
5846 if (GET_CODE (operand) == CONST_VECTOR)
5847 {
5848 scalar_int_mode imode = int_mode_for_mode (mode).require ();
5849 /* Caution: if we looked through a constant pool memory above,
5850 the operand may actually have a different mode now. That's
5851 ok, since we want to pun this all the way back to an integer. */
5852 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
5853 gcc_assert (operand != NULL);
5854 mode = imode;
5855 }
5856
5857 if (!TARGET_64BIT)
5858 {
5859 if (mode == DImode)
5860 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
5861 else
5862 {
5863 int i;
5864
5865 if (REG_P (operand))
5866 {
5867 gcc_assert (reload_completed);
5868 for (i = 0; i < size; i++)
5869 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
5870 }
5871 else if (offsettable_memref_p (operand))
5872 {
5873 operand = adjust_address (operand, SImode, 0);
5874 parts[0] = operand;
5875 for (i = 1; i < size; i++)
5876 parts[i] = adjust_address (operand, SImode, 4 * i);
5877 }
5878 else if (CONST_DOUBLE_P (operand))
5879 {
5880 const REAL_VALUE_TYPE *r;
5881 long l[4];
5882
5883 r = CONST_DOUBLE_REAL_VALUE (operand);
5884 switch (mode)
5885 {
5886 case E_TFmode:
5887 real_to_target (l, r, mode);
5888 parts[3] = gen_int_mode (l[3], SImode);
5889 parts[2] = gen_int_mode (l[2], SImode);
5890 break;
5891 case E_XFmode:
5892 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
5893 long double may not be 80-bit. */
5894 real_to_target (l, r, mode);
5895 parts[2] = gen_int_mode (l[2], SImode);
5896 break;
5897 case E_DFmode:
5898 REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
5899 break;
5900 default:
5901 gcc_unreachable ();
5902 }
5903 parts[1] = gen_int_mode (l[1], SImode);
5904 parts[0] = gen_int_mode (l[0], SImode);
5905 }
5906 else
5907 gcc_unreachable ();
5908 }
5909 }
5910 else
5911 {
5912 if (mode == TImode)
5913 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
5914 if (mode == XFmode || mode == TFmode)
5915 {
5916 machine_mode upper_mode = mode==XFmode ? SImode : DImode;
5917 if (REG_P (operand))
5918 {
5919 gcc_assert (reload_completed);
5920 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
5921 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
5922 }
5923 else if (offsettable_memref_p (operand))
5924 {
5925 operand = adjust_address (operand, DImode, 0);
5926 parts[0] = operand;
5927 parts[1] = adjust_address (operand, upper_mode, 8);
5928 }
5929 else if (CONST_DOUBLE_P (operand))
5930 {
5931 long l[4];
5932
5933 real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
5934
5935 /* real_to_target puts 32-bit pieces in each long. */
5936 parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
5937 | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
5938 << 32), DImode);
5939
5940 if (upper_mode == SImode)
5941 parts[1] = gen_int_mode (l[2], SImode);
5942 else
5943 parts[1]
5944 = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
5945 | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
5946 << 32), DImode);
5947 }
5948 else
5949 gcc_unreachable ();
5950 }
5951 }
5952
5953 return size;
5954 }
5955
5956 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
5957 Return false when normal moves are needed; true when all required
5958 insns have been emitted. Operands 2-4 contain the input values
5959 int the correct order; operands 5-7 contain the output values. */
5960
5961 void
5962 ix86_split_long_move (rtx operands[])
5963 {
5964 rtx part[2][4];
5965 int nparts, i, j;
5966 int push = 0;
5967 int collisions = 0;
5968 machine_mode mode = GET_MODE (operands[0]);
5969 bool collisionparts[4];
5970
5971 /* The DFmode expanders may ask us to move a double.
5972 For a 64-bit target this is a single move. By hiding that fact
5973 here we simplify the i386.md splitters. */
5974 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
5975 {
5976 /* Optimize constant pool references to immediates. This is used by
5977 fp moves, which force all constants to memory to allow combining. */
5978
5979 if (MEM_P (operands[1])
5980 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
5981 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
5982 operands[1] = get_pool_constant (XEXP (operands[1], 0));
5983 if (push_operand (operands[0], VOIDmode))
5984 {
5985 operands[0] = copy_rtx (operands[0]);
5986 PUT_MODE (operands[0], word_mode);
5987 }
5988 else
5989 operands[0] = gen_lowpart (DImode, operands[0]);
5990 operands[1] = gen_lowpart (DImode, operands[1]);
5991 emit_move_insn (operands[0], operands[1]);
5992 return;
5993 }
5994
5995 /* The only non-offsettable memory we handle is push. */
5996 if (push_operand (operands[0], VOIDmode))
5997 push = 1;
5998 else
5999 gcc_assert (!MEM_P (operands[0])
6000 || offsettable_memref_p (operands[0]));
6001
6002 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
6003 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
6004
6005 /* When emitting a push, take care of source operands that live on the stack.  */
6006 if (push && MEM_P (operands[1])
6007 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
6008 {
6009 rtx src_base = XEXP (part[1][nparts - 1], 0);
6010
6011 /* Compensate for the stack decrement by 4. */
6012 if (!TARGET_64BIT && nparts == 3
6013 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
6014 src_base = plus_constant (Pmode, src_base, 4);
6015
6016 /* src_base refers to the stack pointer and is
6017 automatically decreased by the emitted pushes.  */
6018 for (i = 0; i < nparts; i++)
6019 part[1][i] = change_address (part[1][i],
6020 GET_MODE (part[1][i]), src_base);
6021 }
6022
6023 /* We need to do the copy in the right order in case an address register
6024 of the source overlaps the destination.  */
6025 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
6026 {
6027 rtx tmp;
6028
6029 for (i = 0; i < nparts; i++)
6030 {
6031 collisionparts[i]
6032 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
6033 if (collisionparts[i])
6034 collisions++;
6035 }
6036
6037 /* Collision in the middle part can be handled by reordering. */
6038 if (collisions == 1 && nparts == 3 && collisionparts [1])
6039 {
6040 std::swap (part[0][1], part[0][2]);
6041 std::swap (part[1][1], part[1][2]);
6042 }
6043 else if (collisions == 1
6044 && nparts == 4
6045 && (collisionparts [1] || collisionparts [2]))
6046 {
6047 if (collisionparts [1])
6048 {
6049 std::swap (part[0][1], part[0][2]);
6050 std::swap (part[1][1], part[1][2]);
6051 }
6052 else
6053 {
6054 std::swap (part[0][2], part[0][3]);
6055 std::swap (part[1][2], part[1][3]);
6056 }
6057 }
6058
6059 /* If there are more collisions, we can't handle them by reordering.
6060 Do an lea of the source address into the last part and use only one colliding move.  */
6061 else if (collisions > 1)
6062 {
6063 rtx base, addr;
6064
6065 collisions = 1;
6066
6067 base = part[0][nparts - 1];
6068
6069 /* Handle the case when the last part isn't valid for lea.
6070 Happens in 64-bit mode storing the 12-byte XFmode. */
6071 if (GET_MODE (base) != Pmode)
6072 base = gen_rtx_REG (Pmode, REGNO (base));
6073
6074 addr = XEXP (part[1][0], 0);
6075 if (TARGET_TLS_DIRECT_SEG_REFS)
6076 {
6077 struct ix86_address parts;
6078 int ok = ix86_decompose_address (addr, &parts);
6079 gcc_assert (ok);
6080 /* It is not valid to use %gs: or %fs: in lea. */
6081 gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
6082 }
6083 emit_insn (gen_rtx_SET (base, addr));
6084 part[1][0] = replace_equiv_address (part[1][0], base);
6085 for (i = 1; i < nparts; i++)
6086 {
6087 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
6088 part[1][i] = replace_equiv_address (part[1][i], tmp);
6089 }
6090 }
6091 }
6092
6093 if (push)
6094 {
6095 if (!TARGET_64BIT)
6096 {
6097 if (nparts == 3)
6098 {
6099 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
6100 emit_insn (gen_add2_insn (stack_pointer_rtx, GEN_INT (-4)));
6101 emit_move_insn (part[0][2], part[1][2]);
6102 }
6103 else if (nparts == 4)
6104 {
6105 emit_move_insn (part[0][3], part[1][3]);
6106 emit_move_insn (part[0][2], part[1][2]);
6107 }
6108 }
6109 else
6110 {
6111 /* In 64-bit mode we don't have a 32-bit push available.  If the operand
6112 is a register, that is OK - we will just use the larger counterpart.
6113 We also retype memory - this comes from an attempt to avoid a REX
6114 prefix when moving the second half of a TFmode value.  */
6115 if (GET_MODE (part[1][1]) == SImode)
6116 {
6117 switch (GET_CODE (part[1][1]))
6118 {
6119 case MEM:
6120 part[1][1] = adjust_address (part[1][1], DImode, 0);
6121 break;
6122
6123 case REG:
6124 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
6125 break;
6126
6127 default:
6128 gcc_unreachable ();
6129 }
6130
6131 if (GET_MODE (part[1][0]) == SImode)
6132 part[1][0] = part[1][1];
6133 }
6134 }
6135 emit_move_insn (part[0][1], part[1][1]);
6136 emit_move_insn (part[0][0], part[1][0]);
6137 return;
6138 }
6139
6140 /* Choose correct order to not overwrite the source before it is copied. */
6141 if ((REG_P (part[0][0])
6142 && REG_P (part[1][1])
6143 && (REGNO (part[0][0]) == REGNO (part[1][1])
6144 || (nparts == 3
6145 && REGNO (part[0][0]) == REGNO (part[1][2]))
6146 || (nparts == 4
6147 && REGNO (part[0][0]) == REGNO (part[1][3]))))
6148 || (collisions > 0
6149 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
6150 {
6151 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
6152 {
6153 operands[2 + i] = part[0][j];
6154 operands[6 + i] = part[1][j];
6155 }
6156 }
6157 else
6158 {
6159 for (i = 0; i < nparts; i++)
6160 {
6161 operands[2 + i] = part[0][i];
6162 operands[6 + i] = part[1][i];
6163 }
6164 }
6165
6166 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
6167 if (optimize_insn_for_size_p ())
6168 {
6169 for (j = 0; j < nparts - 1; j++)
6170 if (CONST_INT_P (operands[6 + j])
6171 && operands[6 + j] != const0_rtx
6172 && REG_P (operands[2 + j]))
6173 for (i = j; i < nparts - 1; i++)
6174 if (CONST_INT_P (operands[7 + i])
6175 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
6176 operands[7 + i] = operands[2 + j];
6177 }
6178
6179 for (i = 0; i < nparts; i++)
6180 emit_move_insn (operands[2 + i], operands[6 + i]);
6181
6182 return;
6183 }
6184
6185 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
6186 left shift by a constant, either using a single shift or
6187 a sequence of add instructions. */
6188
6189 static void
6190 ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
6191 {
6192 if (count == 1
6193 || (count * ix86_cost->add <= ix86_cost->shift_const
6194 && !optimize_insn_for_size_p ()))
6195 {
6196 while (count-- > 0)
6197 emit_insn (gen_add2_insn (operand, operand));
6198 }
6199 else
6200 {
6201 rtx (*insn)(rtx, rtx, rtx);
6202
6203 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
6204 emit_insn (insn (operand, operand, GEN_INT (count)));
6205 }
6206 }
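/* A hedged example of the trade-off above: a left shift of one half-word
   by 1 is emitted as a single "add" of the operand to itself, and a
   shift by 2 may become two adds when that is cheaper than a
   shift-by-constant on the target; otherwise a plain shift with an
   immediate count is used.  */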
6207
6208 void
6209 ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
6210 {
6211 rtx (*gen_ashl3)(rtx, rtx, rtx);
6212 rtx (*gen_shld)(rtx, rtx, rtx);
6213 int half_width = GET_MODE_BITSIZE (mode) >> 1;
6214 machine_mode half_mode;
6215
6216 rtx low[2], high[2];
6217 int count;
6218
6219 if (CONST_INT_P (operands[2]))
6220 {
6221 split_double_mode (mode, operands, 2, low, high);
6222 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
6223
6224 if (count >= half_width)
6225 {
6226 emit_move_insn (high[0], low[1]);
6227 ix86_expand_clear (low[0]);
6228
6229 if (count > half_width)
6230 ix86_expand_ashl_const (high[0], count - half_width, mode);
6231 }
6232 else
6233 {
6234 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
6235
6236 if (!rtx_equal_p (operands[0], operands[1]))
6237 emit_move_insn (operands[0], operands[1]);
6238
6239 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
6240 ix86_expand_ashl_const (low[0], count, mode);
6241 }
6242 return;
6243 }
6244
6245 split_double_mode (mode, operands, 1, low, high);
6246 half_mode = mode == DImode ? SImode : DImode;
6247
6248 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
6249
6250 if (operands[1] == const1_rtx)
6251 {
6252 /* Assuming we've chosen QImode-capable registers, 1 << N
6253 can be done with two 32/64-bit shifts, no branches, no cmoves.  */
6254 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
6255 {
6256 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
6257
6258 ix86_expand_clear (low[0]);
6259 ix86_expand_clear (high[0]);
6260 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
6261
6262 d = gen_lowpart (QImode, low[0]);
6263 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
6264 s = gen_rtx_EQ (QImode, flags, const0_rtx);
6265 emit_insn (gen_rtx_SET (d, s));
6266
6267 d = gen_lowpart (QImode, high[0]);
6268 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
6269 s = gen_rtx_NE (QImode, flags, const0_rtx);
6270 emit_insn (gen_rtx_SET (d, s));
6271 }
6272
6273 /* Otherwise, we can get the same results by manually performing
6274 a bit extract operation on bit 5/6, and then performing the two
6275 shifts. The two methods of getting 0/1 into low/high are exactly
6276 the same size. Avoiding the shift in the bit extract case helps
6277 pentium4 a bit; no one else seems to care much either way. */
6278 else
6279 {
6280 rtx (*gen_lshr3)(rtx, rtx, rtx);
6281 rtx (*gen_and3)(rtx, rtx, rtx);
6282 rtx (*gen_xor3)(rtx, rtx, rtx);
6283 HOST_WIDE_INT bits;
6284 rtx x;
6285
6286 if (mode == DImode)
6287 {
6288 gen_lshr3 = gen_lshrsi3;
6289 gen_and3 = gen_andsi3;
6290 gen_xor3 = gen_xorsi3;
6291 bits = 5;
6292 }
6293 else
6294 {
6295 gen_lshr3 = gen_lshrdi3;
6296 gen_and3 = gen_anddi3;
6297 gen_xor3 = gen_xordi3;
6298 bits = 6;
6299 }
6300
6301 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
6302 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
6303 else
6304 x = gen_lowpart (half_mode, operands[2]);
6305 emit_insn (gen_rtx_SET (high[0], x));
6306
6307 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
6308 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
6309 emit_move_insn (low[0], high[0]);
6310 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
6311 }
6312
6313 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
6314 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
6315 return;
6316 }
6317
6318 if (operands[1] == constm1_rtx)
6319 {
6320 /* For -1 << N, we can avoid the shld instruction, because we
6321 know that we're shifting 0...31/63 ones into a -1. */
6322 emit_move_insn (low[0], constm1_rtx);
6323 if (optimize_insn_for_size_p ())
6324 emit_move_insn (high[0], low[0]);
6325 else
6326 emit_move_insn (high[0], constm1_rtx);
6327 }
6328 else
6329 {
6330 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
6331
6332 if (!rtx_equal_p (operands[0], operands[1]))
6333 emit_move_insn (operands[0], operands[1]);
6334
6335 split_double_mode (mode, operands, 1, low, high);
6336 emit_insn (gen_shld (high[0], low[0], operands[2]));
6337 }
6338
6339 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
6340
6341 if (TARGET_CMOVE && scratch)
6342 {
6343 ix86_expand_clear (scratch);
6344 emit_insn (gen_x86_shift_adj_1
6345 (half_mode, high[0], low[0], operands[2], scratch));
6346 }
6347 else
6348 emit_insn (gen_x86_shift_adj_2 (half_mode, high[0], low[0], operands[2]));
6349 }
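/* Illustrative DImode example on a 32-bit target (register names are only
   an example): for "x << 40" the constant path above emits roughly
	movl	%eax, %edx	// high = low
	xorl	%eax, %eax	// low = 0
	shll	$8, %edx	// high <<= 40 - 32
   while counts below 32 use an shld/shll pair, and variable counts fall
   back to the gen_x86_shift_adj_* fixups emitted above.  */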
6350
6351 void
6352 ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
6353 {
6354 rtx (*gen_ashr3)(rtx, rtx, rtx)
6355 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
6356 rtx (*gen_shrd)(rtx, rtx, rtx);
6357 int half_width = GET_MODE_BITSIZE (mode) >> 1;
6358
6359 rtx low[2], high[2];
6360 int count;
6361
6362 if (CONST_INT_P (operands[2]))
6363 {
6364 split_double_mode (mode, operands, 2, low, high);
6365 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
6366
6367 if (count == GET_MODE_BITSIZE (mode) - 1)
6368 {
6369 emit_move_insn (high[0], high[1]);
6370 emit_insn (gen_ashr3 (high[0], high[0],
6371 GEN_INT (half_width - 1)));
6372 emit_move_insn (low[0], high[0]);
6373
6374 }
6375 else if (count >= half_width)
6376 {
6377 emit_move_insn (low[0], high[1]);
6378 emit_move_insn (high[0], low[0]);
6379 emit_insn (gen_ashr3 (high[0], high[0],
6380 GEN_INT (half_width - 1)));
6381
6382 if (count > half_width)
6383 emit_insn (gen_ashr3 (low[0], low[0],
6384 GEN_INT (count - half_width)));
6385 }
6386 else
6387 {
6388 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6389
6390 if (!rtx_equal_p (operands[0], operands[1]))
6391 emit_move_insn (operands[0], operands[1]);
6392
6393 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
6394 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
6395 }
6396 }
6397 else
6398 {
6399 machine_mode half_mode;
6400
6401 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6402
6403 if (!rtx_equal_p (operands[0], operands[1]))
6404 emit_move_insn (operands[0], operands[1]);
6405
6406 split_double_mode (mode, operands, 1, low, high);
6407 half_mode = mode == DImode ? SImode : DImode;
6408
6409 emit_insn (gen_shrd (low[0], high[0], operands[2]));
6410 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
6411
6412 if (TARGET_CMOVE && scratch)
6413 {
6414 emit_move_insn (scratch, high[0]);
6415 emit_insn (gen_ashr3 (scratch, scratch,
6416 GEN_INT (half_width - 1)));
6417 emit_insn (gen_x86_shift_adj_1
6418 (half_mode, low[0], high[0], operands[2], scratch));
6419 }
6420 else
6421 emit_insn (gen_x86_shift_adj_3
6422 (half_mode, low[0], high[0], operands[2]));
6423 }
6424 }
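/* Illustrative DImode example on a 32-bit target: for "x >> 48" the
   constant path copies the source high word into the low result word,
   fills the high result word with sign bits via an arithmetic shift by
   31, and then shifts the low word right by the remaining 16 bits;
   counts below 32 use a shrd/sar pair, and variable counts go through
   the gen_x86_shift_adj_* fixups.  */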
6425
6426 void
6427 ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
6428 {
6429 rtx (*gen_lshr3)(rtx, rtx, rtx)
6430 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
6431 rtx (*gen_shrd)(rtx, rtx, rtx);
6432 int half_width = GET_MODE_BITSIZE (mode) >> 1;
6433
6434 rtx low[2], high[2];
6435 int count;
6436
6437 if (CONST_INT_P (operands[2]))
6438 {
6439 split_double_mode (mode, operands, 2, low, high);
6440 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
6441
6442 if (count >= half_width)
6443 {
6444 emit_move_insn (low[0], high[1]);
6445 ix86_expand_clear (high[0]);
6446
6447 if (count > half_width)
6448 emit_insn (gen_lshr3 (low[0], low[0],
6449 GEN_INT (count - half_width)));
6450 }
6451 else
6452 {
6453 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6454
6455 if (!rtx_equal_p (operands[0], operands[1]))
6456 emit_move_insn (operands[0], operands[1]);
6457
6458 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
6459 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
6460 }
6461 }
6462 else
6463 {
6464 machine_mode half_mode;
6465
6466 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6467
6468 if (!rtx_equal_p (operands[0], operands[1]))
6469 emit_move_insn (operands[0], operands[1]);
6470
6471 split_double_mode (mode, operands, 1, low, high);
6472 half_mode = mode == DImode ? SImode : DImode;
6473
6474 emit_insn (gen_shrd (low[0], high[0], operands[2]));
6475 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
6476
6477 if (TARGET_CMOVE && scratch)
6478 {
6479 ix86_expand_clear (scratch);
6480 emit_insn (gen_x86_shift_adj_1
6481 (half_mode, low[0], high[0], operands[2], scratch));
6482 }
6483 else
6484 emit_insn (gen_x86_shift_adj_2
6485 (half_mode, low[0], high[0], operands[2]));
6486 }
6487 }
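/* Illustrative DImode example on a 32-bit target: for an unsigned
   "x >> 40" the constant path moves the source high word into the low
   result word, clears the high result word, and shifts the low word
   right by the remaining 8 bits; smaller counts use a shrd/shr pair.  */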
6488
6489 /* Expand move of V1TI mode register X to a new TI mode register. */
6490 static rtx
6491 ix86_expand_v1ti_to_ti (rtx x)
6492 {
6493 rtx result = gen_reg_rtx (TImode);
6494 if (TARGET_SSE2)
6495 {
6496 rtx temp = force_reg (V2DImode, gen_lowpart (V2DImode, x));
6497 rtx lo = gen_lowpart (DImode, result);
6498 emit_insn (gen_vec_extractv2didi (lo, temp, const0_rtx));
6499 rtx hi = gen_highpart (DImode, result);
6500 emit_insn (gen_vec_extractv2didi (hi, temp, const1_rtx));
6501 }
6502 else
6503 emit_move_insn (result, gen_lowpart (TImode, x));
6504 return result;
6505 }
6506
6507 /* Expand move of TI mode register X to a new V1TI mode register. */
6508 static rtx
6509 ix86_expand_ti_to_v1ti (rtx x)
6510 {
6511 if (TARGET_SSE2)
6512 {
6513 rtx lo = gen_lowpart (DImode, x);
6514 rtx hi = gen_highpart (DImode, x);
6515 rtx tmp = gen_reg_rtx (V2DImode);
6516 emit_insn (gen_vec_concatv2di (tmp, lo, hi));
6517 return force_reg (V1TImode, gen_lowpart (V1TImode, tmp));
6518 }
6519
6520 return force_reg (V1TImode, gen_lowpart (V1TImode, x));
6521 }
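/* These two helpers let the V1TImode expanders below fall back to the
   TImode patterns for non-constant shift and rotate counts: with SSE2
   the value is moved between a V1TImode register and a TImode register
   pair via 64-bit vector extract/concat rather than a plain TImode
   lowpart.  */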
6522
6523 /* Expand V1TI mode shift (of rtx_code CODE) by constant. */
6524 void
6525 ix86_expand_v1ti_shift (enum rtx_code code, rtx operands[])
6526 {
6527 rtx op1 = force_reg (V1TImode, operands[1]);
6528
6529 if (!CONST_INT_P (operands[2]))
6530 {
6531 rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
6532 rtx tmp2 = gen_reg_rtx (TImode);
6533 rtx (*shift) (rtx, rtx, rtx)
6534 = (code == ASHIFT) ? gen_ashlti3 : gen_lshrti3;
6535 emit_insn (shift (tmp2, tmp1, operands[2]));
6536 rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
6537 emit_move_insn (operands[0], tmp3);
6538 return;
6539 }
6540
6541 HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
6542
6543 if (bits == 0)
6544 {
6545 emit_move_insn (operands[0], op1);
6546 return;
6547 }
6548
6549 if ((bits & 7) == 0)
6550 {
6551 rtx tmp = gen_reg_rtx (V1TImode);
6552 if (code == ASHIFT)
6553 emit_insn (gen_sse2_ashlv1ti3 (tmp, op1, GEN_INT (bits)));
6554 else
6555 emit_insn (gen_sse2_lshrv1ti3 (tmp, op1, GEN_INT (bits)));
6556 emit_move_insn (operands[0], tmp);
6557 return;
6558 }
6559
6560 rtx tmp1 = gen_reg_rtx (V1TImode);
6561 if (code == ASHIFT)
6562 emit_insn (gen_sse2_ashlv1ti3 (tmp1, op1, GEN_INT (64)));
6563 else
6564 emit_insn (gen_sse2_lshrv1ti3 (tmp1, op1, GEN_INT (64)));
6565
6566 /* tmp2 is operands[1] shifted by 64, in V2DImode. */
6567 rtx tmp2 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
6568
6569 /* tmp3 will be the V2DImode result. */
6570 rtx tmp3 = gen_reg_rtx (V2DImode);
6571
6572 if (bits > 64)
6573 {
6574 if (code == ASHIFT)
6575 emit_insn (gen_ashlv2di3 (tmp3, tmp2, GEN_INT (bits - 64)));
6576 else
6577 emit_insn (gen_lshrv2di3 (tmp3, tmp2, GEN_INT (bits - 64)));
6578 }
6579 else
6580 {
6581 /* tmp4 is operands[1], in V2DImode. */
6582 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
6583
6584 rtx tmp5 = gen_reg_rtx (V2DImode);
6585 if (code == ASHIFT)
6586 emit_insn (gen_ashlv2di3 (tmp5, tmp4, GEN_INT (bits)));
6587 else
6588 emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
6589
6590 rtx tmp6 = gen_reg_rtx (V2DImode);
6591 if (code == ASHIFT)
6592 emit_insn (gen_lshrv2di3 (tmp6, tmp2, GEN_INT (64 - bits)));
6593 else
6594 emit_insn (gen_ashlv2di3 (tmp6, tmp2, GEN_INT (64 - bits)));
6595
6596 emit_insn (gen_iorv2di3 (tmp3, tmp5, tmp6));
6597 }
6598
6599 /* Convert the result back to V1TImode and store in operands[0]. */
6600 rtx tmp7 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
6601 emit_move_insn (operands[0], tmp7);
6602 }
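/* A hedged walk-through of the general case above for ASHIFT by 3:
   tmp1/tmp2 hold op1 shifted left by 64 bits (a whole-register byte
   shift), tmp5 is a per-64-bit-lane left shift of op1 by 3, tmp6 is a
   per-lane logical right shift of tmp2 by 61, and the final OR gives
   each lane the 3 bits shifted out of the lane below it.  */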
6603
6604 /* Expand V1TI mode rotate (of rtx_code CODE) by constant. */
6605 void
6606 ix86_expand_v1ti_rotate (enum rtx_code code, rtx operands[])
6607 {
6608 rtx op1 = force_reg (V1TImode, operands[1]);
6609
6610 if (!CONST_INT_P (operands[2]))
6611 {
6612 rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
6613 rtx tmp2 = gen_reg_rtx (TImode);
6614 rtx (*rotate) (rtx, rtx, rtx)
6615 = (code == ROTATE) ? gen_rotlti3 : gen_rotrti3;
6616 emit_insn (rotate (tmp2, tmp1, operands[2]));
6617 rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
6618 emit_move_insn (operands[0], tmp3);
6619 return;
6620 }
6621
6622 HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
6623
6624 if (bits == 0)
6625 {
6626 emit_move_insn (operands[0], op1);
6627 return;
6628 }
6629
6630 if (code == ROTATERT)
6631 bits = 128 - bits;
6632
6633 if ((bits & 31) == 0)
6634 {
6635 rtx tmp2 = gen_reg_rtx (V4SImode);
6636 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6637 if (bits == 32)
6638 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x93)));
6639 else if (bits == 64)
6640 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x4e)));
6641 else
6642 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x39)));
6643 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp2));
6644 return;
6645 }
6646
6647 if ((bits & 7) == 0)
6648 {
6649 rtx tmp1 = gen_reg_rtx (V1TImode);
6650 rtx tmp2 = gen_reg_rtx (V1TImode);
6651 rtx tmp3 = gen_reg_rtx (V1TImode);
6652
6653 emit_insn (gen_sse2_ashlv1ti3 (tmp1, op1, GEN_INT (bits)));
6654 emit_insn (gen_sse2_lshrv1ti3 (tmp2, op1, GEN_INT (128 - bits)));
6655 emit_insn (gen_iorv1ti3 (tmp3, tmp1, tmp2));
6656 emit_move_insn (operands[0], tmp3);
6657 return;
6658 }
6659
6660 rtx op1_v4si = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6661
6662 rtx lobits;
6663 rtx hibits;
6664
6665 switch (bits >> 5)
6666 {
6667 case 0:
6668 lobits = op1_v4si;
6669 hibits = gen_reg_rtx (V4SImode);
6670 emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x93)));
6671 break;
6672
6673 case 1:
6674 lobits = gen_reg_rtx (V4SImode);
6675 hibits = gen_reg_rtx (V4SImode);
6676 emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x93)));
6677 emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x4e)));
6678 break;
6679
6680 case 2:
6681 lobits = gen_reg_rtx (V4SImode);
6682 hibits = gen_reg_rtx (V4SImode);
6683 emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x4e)));
6684 emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x39)));
6685 break;
6686
6687 default:
6688 lobits = gen_reg_rtx (V4SImode);
6689 emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x39)));
6690 hibits = op1_v4si;
6691 break;
6692 }
6693
6694 rtx tmp1 = gen_reg_rtx (V4SImode);
6695 rtx tmp2 = gen_reg_rtx (V4SImode);
6696 rtx tmp3 = gen_reg_rtx (V4SImode);
6697
6698 emit_insn (gen_ashlv4si3 (tmp1, lobits, GEN_INT (bits & 31)));
6699 emit_insn (gen_lshrv4si3 (tmp2, hibits, GEN_INT (32 - (bits & 31))));
6700 emit_insn (gen_iorv4si3 (tmp3, tmp1, tmp2));
6701
6702 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp3));
6703 }
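/* For example, a V1TImode rotate left by 32 is a single "pshufd $0x93"
   dword shuffle, a rotate by a multiple of 8 becomes two whole-register
   byte shifts combined with a por, and other amounts pair two pshufd
   shuffles with per-dword shifts and a por as in the code above.  */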
6704
6705 /* Expand V1TI mode ashiftrt by constant. */
6706 void
6707 ix86_expand_v1ti_ashiftrt (rtx operands[])
6708 {
6709 rtx op1 = force_reg (V1TImode, operands[1]);
6710
6711 if (!CONST_INT_P (operands[2]))
6712 {
6713 rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
6714 rtx tmp2 = gen_reg_rtx (TImode);
6715 emit_insn (gen_ashrti3 (tmp2, tmp1, operands[2]));
6716 rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
6717 emit_move_insn (operands[0], tmp3);
6718 return;
6719 }
6720
6721 HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
6722
6723 if (bits == 0)
6724 {
6725 emit_move_insn (operands[0], op1);
6726 return;
6727 }
6728
6729 if (bits == 127)
6730 {
6731 /* Two operations. */
6732 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6733 rtx tmp2 = gen_reg_rtx (V4SImode);
6734 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6735
6736 rtx tmp3 = gen_reg_rtx (V4SImode);
6737 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6738
6739 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp3));
6740 return;
6741 }
6742
6743 if (bits == 64)
6744 {
6745 /* Three operations. */
6746 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6747 rtx tmp2 = gen_reg_rtx (V4SImode);
6748 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6749
6750 rtx tmp3 = gen_reg_rtx (V4SImode);
6751 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6752
6753 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
6754 rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
6755 rtx tmp6 = gen_reg_rtx (V2DImode);
6756 emit_insn (gen_vec_interleave_highv2di (tmp6, tmp4, tmp5));
6757
6758 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
6759 return;
6760 }
6761
6762 if (bits == 96)
6763 {
6764 /* Three operations. */
6765 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6766 rtx tmp2 = gen_reg_rtx (V4SImode);
6767 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (31)));
6768
6769 rtx tmp3 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
6770 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp2));
6771 rtx tmp5 = gen_reg_rtx (V2DImode);
6772 emit_insn (gen_vec_interleave_highv2di (tmp5, tmp3, tmp4));
6773
6774 rtx tmp6 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp5));
6775 rtx tmp7 = gen_reg_rtx (V4SImode);
6776 emit_insn (gen_sse2_pshufd (tmp7, tmp6, GEN_INT (0xfd)));
6777
6778 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp7));
6779 return;
6780 }
6781
6782 if (bits >= 111)
6783 {
6784 /* Three operations. */
6785 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6786 rtx tmp2 = gen_reg_rtx (V4SImode);
6787 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits - 96)));
6788
6789 rtx tmp3 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
6790 rtx tmp4 = gen_reg_rtx (V8HImode);
6791 emit_insn (gen_sse2_pshufhw (tmp4, tmp3, GEN_INT (0xfe)));
6792
6793 rtx tmp5 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp4));
6794 rtx tmp6 = gen_reg_rtx (V4SImode);
6795 emit_insn (gen_sse2_pshufd (tmp6, tmp5, GEN_INT (0xfe)));
6796
6797 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
6798 return;
6799 }
6800
6801 if (TARGET_AVX2 || TARGET_SSE4_1)
6802 {
6803 /* Three operations. */
6804 if (bits == 32)
6805 {
6806 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6807 rtx tmp2 = gen_reg_rtx (V4SImode);
6808 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (31)));
6809
6810 rtx tmp3 = gen_reg_rtx (V1TImode);
6811 emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (32)));
6812
6813 if (TARGET_AVX2)
6814 {
6815 rtx tmp4 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp3));
6816 rtx tmp5 = gen_reg_rtx (V4SImode);
6817 emit_insn (gen_avx2_pblenddv4si (tmp5, tmp2, tmp4,
6818 GEN_INT (7)));
6819
6820 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp5));
6821 }
6822 else
6823 {
6824 rtx tmp4 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
6825 rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
6826 rtx tmp6 = gen_reg_rtx (V8HImode);
6827 emit_insn (gen_sse4_1_pblendw (tmp6, tmp4, tmp5,
6828 GEN_INT (0x3f)));
6829
6830 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
6831 }
6832 return;
6833 }
6834
6835 /* Three operations. */
6836 if (bits == 8 || bits == 16 || bits == 24)
6837 {
6838 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6839 rtx tmp2 = gen_reg_rtx (V4SImode);
6840 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
6841
6842 rtx tmp3 = gen_reg_rtx (V1TImode);
6843 emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (bits)));
6844
6845 if (TARGET_AVX2)
6846 {
6847 rtx tmp4 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp3));
6848 rtx tmp5 = gen_reg_rtx (V4SImode);
6849 emit_insn (gen_avx2_pblenddv4si (tmp5, tmp2, tmp4,
6850 GEN_INT (7)));
6851
6852 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp5));
6853 }
6854 else
6855 {
6856 rtx tmp4 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
6857 rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
6858 rtx tmp6 = gen_reg_rtx (V8HImode);
6859 emit_insn (gen_sse4_1_pblendw (tmp6, tmp4, tmp5,
6860 GEN_INT (0x3f)));
6861
6862 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
6863 }
6864 return;
6865 }
6866 }
6867
6868 if (bits > 96)
6869 {
6870 /* Four operations. */
6871 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6872 rtx tmp2 = gen_reg_rtx (V4SImode);
6873 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits - 96)));
6874
6875 rtx tmp3 = gen_reg_rtx (V4SImode);
6876 emit_insn (gen_ashrv4si3 (tmp3, tmp1, GEN_INT (31)));
6877
6878 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp2));
6879 rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
6880 rtx tmp6 = gen_reg_rtx (V2DImode);
6881 emit_insn (gen_vec_interleave_highv2di (tmp6, tmp4, tmp5));
6882
6883 rtx tmp7 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp6));
6884 rtx tmp8 = gen_reg_rtx (V4SImode);
6885 emit_insn (gen_sse2_pshufd (tmp8, tmp7, GEN_INT (0xfd)));
6886
6887 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp8));
6888 return;
6889 }
6890
6891 if (TARGET_SSE4_1 && (bits == 48 || bits == 80))
6892 {
6893 /* Four operations. */
6894 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6895 rtx tmp2 = gen_reg_rtx (V4SImode);
6896 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6897
6898 rtx tmp3 = gen_reg_rtx (V4SImode);
6899 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6900
6901 rtx tmp4 = gen_reg_rtx (V1TImode);
6902 emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (bits)));
6903
6904 rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
6905 rtx tmp6 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp4));
6906 rtx tmp7 = gen_reg_rtx (V8HImode);
6907 emit_insn (gen_sse4_1_pblendw (tmp7, tmp5, tmp6,
6908 GEN_INT (bits == 48 ? 0x1f : 0x07)));
6909
6910 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp7));
6911 return;
6912 }
6913
6914 if ((bits & 7) == 0)
6915 {
6916 /* Five operations. */
6917 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6918 rtx tmp2 = gen_reg_rtx (V4SImode);
6919 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6920
6921 rtx tmp3 = gen_reg_rtx (V4SImode);
6922 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6923
6924 rtx tmp4 = gen_reg_rtx (V1TImode);
6925 emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (bits)));
6926
6927 rtx tmp5 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
6928 rtx tmp6 = gen_reg_rtx (V1TImode);
6929 emit_insn (gen_sse2_ashlv1ti3 (tmp6, tmp5, GEN_INT (128 - bits)));
6930
6931 rtx tmp7 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
6932 rtx tmp8 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp6));
6933 rtx tmp9 = gen_reg_rtx (V2DImode);
6934 emit_insn (gen_iorv2di3 (tmp9, tmp7, tmp8));
6935
6936 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp9));
6937 return;
6938 }
6939
6940 if (TARGET_AVX2 && bits < 32)
6941 {
6942 /* Six operations. */
6943 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6944 rtx tmp2 = gen_reg_rtx (V4SImode);
6945 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
6946
6947 rtx tmp3 = gen_reg_rtx (V1TImode);
6948 emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (64)));
6949
6950 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
6951 rtx tmp5 = gen_reg_rtx (V2DImode);
6952 emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
6953
6954 rtx tmp6 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
6955 rtx tmp7 = gen_reg_rtx (V2DImode);
6956 emit_insn (gen_ashlv2di3 (tmp7, tmp6, GEN_INT (64 - bits)));
6957
6958 rtx tmp8 = gen_reg_rtx (V2DImode);
6959 emit_insn (gen_iorv2di3 (tmp8, tmp5, tmp7));
6960
6961 rtx tmp9 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp8));
6962 rtx tmp10 = gen_reg_rtx (V4SImode);
6963 emit_insn (gen_avx2_pblenddv4si (tmp10, tmp2, tmp9, GEN_INT (7)));
6964
6965 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp10));
6966 return;
6967 }
6968
6969 if (TARGET_SSE4_1 && bits < 15)
6970 {
6971 /* Six operations. */
6972 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6973 rtx tmp2 = gen_reg_rtx (V4SImode);
6974 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
6975
6976 rtx tmp3 = gen_reg_rtx (V1TImode);
6977 emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (64)));
6978
6979 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
6980 rtx tmp5 = gen_reg_rtx (V2DImode);
6981 emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
6982
6983 rtx tmp6 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
6984 rtx tmp7 = gen_reg_rtx (V2DImode);
6985 emit_insn (gen_ashlv2di3 (tmp7, tmp6, GEN_INT (64 - bits)));
6986
6987 rtx tmp8 = gen_reg_rtx (V2DImode);
6988 emit_insn (gen_iorv2di3 (tmp8, tmp5, tmp7));
6989
6990 rtx tmp9 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
6991 rtx tmp10 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp8));
6992 rtx tmp11 = gen_reg_rtx (V8HImode);
6993 emit_insn (gen_sse4_1_pblendw (tmp11, tmp9, tmp10, GEN_INT (0x3f)));
6994
6995 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp11));
6996 return;
6997 }
6998
6999 if (bits == 1)
7000 {
7001 /* Eight operations. */
7002 rtx tmp1 = gen_reg_rtx (V1TImode);
7003 emit_insn (gen_sse2_lshrv1ti3 (tmp1, op1, GEN_INT (64)));
7004
7005 rtx tmp2 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
7006 rtx tmp3 = gen_reg_rtx (V2DImode);
7007 emit_insn (gen_lshrv2di3 (tmp3, tmp2, GEN_INT (1)));
7008
7009 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
7010 rtx tmp5 = gen_reg_rtx (V2DImode);
7011 emit_insn (gen_ashlv2di3 (tmp5, tmp4, GEN_INT (63)));
7012
7013 rtx tmp6 = gen_reg_rtx (V2DImode);
7014 emit_insn (gen_iorv2di3 (tmp6, tmp3, tmp5));
7015
7016 rtx tmp7 = gen_reg_rtx (V2DImode);
7017 emit_insn (gen_lshrv2di3 (tmp7, tmp2, GEN_INT (63)));
7018
7019 rtx tmp8 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp7));
7020 rtx tmp9 = gen_reg_rtx (V4SImode);
7021 emit_insn (gen_sse2_pshufd (tmp9, tmp8, GEN_INT (0xbf)));
7022
7023 rtx tmp10 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp9));
7024 rtx tmp11 = gen_reg_rtx (V2DImode);
7025 emit_insn (gen_ashlv2di3 (tmp11, tmp10, GEN_INT (31)));
7026
7027 rtx tmp12 = gen_reg_rtx (V2DImode);
7028 emit_insn (gen_iorv2di3 (tmp12, tmp6, tmp11));
7029
7030 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp12));
7031 return;
7032 }
7033
7034 if (bits > 64)
7035 {
7036 /* Eight operations. */
7037 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
7038 rtx tmp2 = gen_reg_rtx (V4SImode);
7039 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
7040
7041 rtx tmp3 = gen_reg_rtx (V4SImode);
7042 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
7043
7044 rtx tmp4 = gen_reg_rtx (V1TImode);
7045 emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (64)));
7046
7047 rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
7048 rtx tmp6 = gen_reg_rtx (V2DImode);
7049 emit_insn (gen_lshrv2di3 (tmp6, tmp5, GEN_INT (bits - 64)));
7050
7051 rtx tmp7 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
7052 rtx tmp8 = gen_reg_rtx (V1TImode);
7053 emit_insn (gen_sse2_ashlv1ti3 (tmp8, tmp7, GEN_INT (64)));
7054
7055 rtx tmp9 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
7056 rtx tmp10 = gen_reg_rtx (V2DImode);
7057 emit_insn (gen_ashlv2di3 (tmp10, tmp9, GEN_INT (128 - bits)));
7058
7059 rtx tmp11 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp8));
7060 rtx tmp12 = gen_reg_rtx (V2DImode);
7061 emit_insn (gen_iorv2di3 (tmp12, tmp10, tmp11));
7062
7063 rtx tmp13 = gen_reg_rtx (V2DImode);
7064 emit_insn (gen_iorv2di3 (tmp13, tmp6, tmp12));
7065
7066 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp13));
7067 }
7068 else
7069 {
7070 /* Nine operations. */
7071 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
7072 rtx tmp2 = gen_reg_rtx (V4SImode);
7073 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
7074
7075 rtx tmp3 = gen_reg_rtx (V4SImode);
7076 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
7077
7078 rtx tmp4 = gen_reg_rtx (V1TImode);
7079 emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (64)));
7080
7081 rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
7082 rtx tmp6 = gen_reg_rtx (V2DImode);
7083 emit_insn (gen_lshrv2di3 (tmp6, tmp5, GEN_INT (bits)));
7084
7085 rtx tmp7 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
7086 rtx tmp8 = gen_reg_rtx (V2DImode);
7087 emit_insn (gen_ashlv2di3 (tmp8, tmp7, GEN_INT (64 - bits)));
7088
7089 rtx tmp9 = gen_reg_rtx (V2DImode);
7090 emit_insn (gen_iorv2di3 (tmp9, tmp6, tmp8));
7091
7092 rtx tmp10 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
7093 rtx tmp11 = gen_reg_rtx (V1TImode);
7094 emit_insn (gen_sse2_ashlv1ti3 (tmp11, tmp10, GEN_INT (64)));
7095
7096 rtx tmp12 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp11));
7097 rtx tmp13 = gen_reg_rtx (V2DImode);
7098 emit_insn (gen_ashlv2di3 (tmp13, tmp12, GEN_INT (64 - bits)));
7099
7100 rtx tmp14 = gen_reg_rtx (V2DImode);
7101 emit_insn (gen_iorv2di3 (tmp14, tmp9, tmp13));
7102
7103 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp14));
7104 }
7105 }
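/* All of the arithmetic-shift cases above follow the same pattern:
   materialize the sign bits with a V4SImode arithmetic shift by 31
   (often applied to a pshufd copy of the top dword), shift the value
   logically, and merge the two with pblendd/pblendw, punpckhqdq or por,
   picking the shortest sequence the enabled ISA extensions allow.  */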
7106
7107 /* Replace all occurrences of REG FROM with REG TO in X, including
7108 occurrences with different modes. */
7109
7110 rtx
7111 ix86_replace_reg_with_reg (rtx x, rtx from, rtx to)
7112 {
7113 gcc_checking_assert (REG_P (from)
7114 && REG_P (to)
7115 && GET_MODE (from) == GET_MODE (to));
7116 if (!reg_overlap_mentioned_p (from, x))
7117 return x;
7118 rtx ret = copy_rtx (x);
7119 subrtx_ptr_iterator::array_type array;
7120 FOR_EACH_SUBRTX_PTR (iter, array, &ret, NONCONST)
7121 {
7122 rtx *loc = *iter;
7123 x = *loc;
7124 if (REG_P (x) && REGNO (x) == REGNO (from))
7125 {
7126 if (x == from)
7127 *loc = to;
7128 else
7129 {
7130 gcc_checking_assert (REG_NREGS (x) == 1);
7131 *loc = gen_rtx_REG (GET_MODE (x), REGNO (to));
7132 }
7133 }
7134 }
7135 return ret;
7136 }
7137
7138 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
7139 DImode for constant loop counts. */
7140
7141 static machine_mode
7142 counter_mode (rtx count_exp)
7143 {
7144 if (GET_MODE (count_exp) != VOIDmode)
7145 return GET_MODE (count_exp);
7146 if (!CONST_INT_P (count_exp))
7147 return Pmode;
7148 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
7149 return DImode;
7150 return SImode;
7151 }
7152
7153 /* When ISSETMEM is FALSE, output a simple loop to copy memory pointed to
7154 by SRCPTR to DESTPTR in chunks of MODE, unrolled UNROLL times; the overall
7155 size is COUNT, specified in bytes.  When ISSETMEM is TRUE, output the
7156 equivalent loop to set memory to VALUE (assumed to be in MODE).
7157
7158 The size is rounded down to a whole number of chunks moved at once.
7159 SRCMEM and DESTMEM provide the MEM rtxes to feed proper aliasing info.  */
7160
7161
7162 static void
7163 expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
7164 rtx destptr, rtx srcptr, rtx value,
7165 rtx count, machine_mode mode, int unroll,
7166 int expected_size, bool issetmem)
7167 {
7168 rtx_code_label *out_label, *top_label;
7169 rtx iter, tmp;
7170 machine_mode iter_mode = counter_mode (count);
7171 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
7172 rtx piece_size = GEN_INT (piece_size_n);
7173 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
7174 rtx size;
7175 int i;
7176
7177 top_label = gen_label_rtx ();
7178 out_label = gen_label_rtx ();
7179 iter = gen_reg_rtx (iter_mode);
7180
7181 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
7182 NULL, 1, OPTAB_DIRECT);
7183 /* Those two should combine. */
7184 if (piece_size == const1_rtx)
7185 {
7186 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
7187 true, out_label);
7188 predict_jump (REG_BR_PROB_BASE * 10 / 100);
7189 }
7190 emit_move_insn (iter, const0_rtx);
7191
7192 emit_label (top_label);
7193
7194 tmp = convert_modes (Pmode, iter_mode, iter, true);
7195
7196 /* This assert could be relaxed - in that case we'll need to compute
7197 the smallest power of two containing PIECE_SIZE_N and pass it to
7198 offset_address.  */
7199 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
7200 destmem = offset_address (destmem, tmp, piece_size_n);
7201 destmem = adjust_address (destmem, mode, 0);
7202
7203 if (!issetmem)
7204 {
7205 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
7206 srcmem = adjust_address (srcmem, mode, 0);
7207
7208 /* When unrolling for chips that reorder memory reads and writes,
7209 we can save registers by using a single temporary.
7210 Using 4 temporaries is also overkill in 32-bit mode.  */
7211 if (!TARGET_64BIT && 0)
7212 {
7213 for (i = 0; i < unroll; i++)
7214 {
7215 if (i)
7216 {
7217 destmem = adjust_address (copy_rtx (destmem), mode,
7218 GET_MODE_SIZE (mode));
7219 srcmem = adjust_address (copy_rtx (srcmem), mode,
7220 GET_MODE_SIZE (mode));
7221 }
7222 emit_move_insn (destmem, srcmem);
7223 }
7224 }
7225 else
7226 {
7227 rtx tmpreg[4];
7228 gcc_assert (unroll <= 4);
7229 for (i = 0; i < unroll; i++)
7230 {
7231 tmpreg[i] = gen_reg_rtx (mode);
7232 if (i)
7233 srcmem = adjust_address (copy_rtx (srcmem), mode,
7234 GET_MODE_SIZE (mode));
7235 emit_move_insn (tmpreg[i], srcmem);
7236 }
7237 for (i = 0; i < unroll; i++)
7238 {
7239 if (i)
7240 destmem = adjust_address (copy_rtx (destmem), mode,
7241 GET_MODE_SIZE (mode));
7242 emit_move_insn (destmem, tmpreg[i]);
7243 }
7244 }
7245 }
7246 else
7247 for (i = 0; i < unroll; i++)
7248 {
7249 if (i)
7250 destmem = adjust_address (copy_rtx (destmem), mode,
7251 GET_MODE_SIZE (mode));
7252 emit_move_insn (destmem, value);
7253 }
7254
7255 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
7256 true, OPTAB_LIB_WIDEN);
7257 if (tmp != iter)
7258 emit_move_insn (iter, tmp);
7259
7260 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
7261 true, top_label);
7262 if (expected_size != -1)
7263 {
7264 expected_size /= GET_MODE_SIZE (mode) * unroll;
7265 if (expected_size == 0)
7266 predict_jump (0);
7267 else if (expected_size > REG_BR_PROB_BASE)
7268 predict_jump (REG_BR_PROB_BASE - 1);
7269 else
7270 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2)
7271 / expected_size);
7272 }
7273 else
7274 predict_jump (REG_BR_PROB_BASE * 80 / 100);
7275 iter = ix86_zero_extend_to_Pmode (iter);
7276 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
7277 true, OPTAB_LIB_WIDEN);
7278 if (tmp != destptr)
7279 emit_move_insn (destptr, tmp);
7280 if (!issetmem)
7281 {
7282 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
7283 true, OPTAB_LIB_WIDEN);
7284 if (tmp != srcptr)
7285 emit_move_insn (srcptr, tmp);
7286 }
7287 emit_label (out_label);
7288 }
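/* A sketch of the loop emitted above for the copy case (setmem is
   analogous, storing VALUE instead of loading from SRCMEM):

	 iter = 0;  size = count & -(MODE_SIZE * UNROLL);
       top:
	 copy MODE_SIZE * UNROLL bytes from srcmem + iter to destmem + iter;
	 iter += MODE_SIZE * UNROLL;
	 if (iter < size) goto top;
	 destptr += iter;  srcptr += iter;  */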
7289
7290 /* Divide COUNTREG by SCALE. */
7291 static rtx
7292 scale_counter (rtx countreg, int scale)
7293 {
7294 rtx sc;
7295
7296 if (scale == 1)
7297 return countreg;
7298 if (CONST_INT_P (countreg))
7299 return GEN_INT (INTVAL (countreg) / scale);
7300 gcc_assert (REG_P (countreg));
7301
7302 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
7303 GEN_INT (exact_log2 (scale)),
7304 NULL, 1, OPTAB_DIRECT);
7305 return sc;
7306 }
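/* e.g. scaling a runtime byte count for SImode "rep" operations emits a
   logical right shift of the count register by 2; a constant count is
   simply divided at compile time.  */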
7307
7308 /* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
7309 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
7310 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
7311 In the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
7312 ORIG_VALUE is the original value passed to memset to fill the memory with.
7313 Other arguments have the same meaning as for the previous function.  */
7314
7315 static void
7316 expand_set_or_cpymem_via_rep (rtx destmem, rtx srcmem,
7317 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
7318 rtx count,
7319 machine_mode mode, bool issetmem)
7320 {
7321 rtx destexp;
7322 rtx srcexp;
7323 rtx countreg;
7324 HOST_WIDE_INT rounded_count;
7325
7326 /* If possible, it is shorter to use rep movs.
7327 TODO: Maybe it is better to move this logic to decide_alg. */
7328 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
7329 && !TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
7330 && (!issetmem || orig_value == const0_rtx))
7331 mode = SImode;
7332
7333 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
7334 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
7335
7336 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
7337 GET_MODE_SIZE (mode)));
7338 if (mode != QImode)
7339 {
7340 destexp = gen_rtx_ASHIFT (Pmode, countreg,
7341 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
7342 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
7343 }
7344 else
7345 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
7346 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
7347 {
7348 rounded_count
7349 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
7350 destmem = shallow_copy_rtx (destmem);
7351 set_mem_size (destmem, rounded_count);
7352 }
7353 else if (MEM_SIZE_KNOWN_P (destmem))
7354 clear_mem_size (destmem);
7355
7356 if (issetmem)
7357 {
7358 value = force_reg (mode, gen_lowpart (mode, value));
7359 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
7360 }
7361 else
7362 {
7363 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
7364 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
7365 if (mode != QImode)
7366 {
7367 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
7368 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
7369 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
7370 }
7371 else
7372 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
7373 if (CONST_INT_P (count))
7374 {
7375 rounded_count
7376 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
7377 srcmem = shallow_copy_rtx (srcmem);
7378 set_mem_size (srcmem, rounded_count);
7379 }
7380 else
7381 {
7382 if (MEM_SIZE_KNOWN_P (srcmem))
7383 clear_mem_size (srcmem);
7384 }
7385 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
7386 destexp, srcexp));
7387 }
7388 }
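/* For instance, a memcpy whose constant byte count is a multiple of 4
   and whose requested mode is QImode is rewritten above to use SImode,
   so the emitted sequence is roughly a single "rep movsd" (or "rep
   stosd" for a zeroing memset) with the byte count divided by 4.  */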
7389
7390 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
7391 DESTMEM.
7392 SRCMEM is passed by pointer so it can be updated on return.
7393 The return value is the updated DESTMEM.  */
7394 static rtx
7395 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
7396 HOST_WIDE_INT size_to_move)
7397 {
7398 rtx dst = destmem, src = *srcmem, tempreg;
7399 enum insn_code code;
7400 machine_mode move_mode;
7401 int piece_size, i;
7402
7403 /* Find the widest mode in which we could perform moves.
7404 Start with the biggest power of 2 not greater than SIZE_TO_MOVE and
7405 halve it until a move of that size is supported.  */
7406 piece_size = 1 << floor_log2 (size_to_move);
7407 while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
7408 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
7409 {
7410 gcc_assert (piece_size > 1);
7411 piece_size >>= 1;
7412 }
7413
7414 /* Find the corresponding vector mode with the same size as MOVE_MODE.
7415 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
7416 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
7417 {
7418 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
7419 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
7420 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
7421 {
7422 move_mode = word_mode;
7423 piece_size = GET_MODE_SIZE (move_mode);
7424 code = optab_handler (mov_optab, move_mode);
7425 }
7426 }
7427 gcc_assert (code != CODE_FOR_nothing);
7428
7429 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
7430 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
7431
7432 /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZE moves.  */
7433 gcc_assert (size_to_move % piece_size == 0);
7434
7435 for (i = 0; i < size_to_move; i += piece_size)
7436 {
7437 /* We move from memory to memory, so we'll need to do it via
7438 a temporary register. */
7439 tempreg = gen_reg_rtx (move_mode);
7440 emit_insn (GEN_FCN (code) (tempreg, src));
7441 emit_insn (GEN_FCN (code) (dst, tempreg));
7442
7443 emit_move_insn (destptr,
7444 plus_constant (Pmode, copy_rtx (destptr), piece_size));
7445 emit_move_insn (srcptr,
7446 plus_constant (Pmode, copy_rtx (srcptr), piece_size));
7447
7448 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
7449 piece_size);
7450 src = adjust_automodify_address_nv (src, move_mode, srcptr,
7451 piece_size);
7452 }
7453
7454 /* Update DST and SRC rtx. */
7455 *srcmem = src;
7456 return dst;
7457 }
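/* e.g. a 16-byte piece is copied through a single XMM-sized temporary
   when a 16-byte vector move is available, and otherwise through
   word-mode register moves; DESTPTR and SRCPTR are bumped by the piece
   size after every load/store pair.  */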
7458
7459 /* Helper function for the string operations below.  Test whether the VALUE
7460 bit of VARIABLE is clear (VARIABLE & VALUE == 0); if so, jump to the returned label.  */
7461
7462 static rtx_code_label *
7463 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
7464 {
7465 rtx_code_label *label = gen_label_rtx ();
7466 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
7467 if (GET_MODE (variable) == DImode)
7468 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
7469 else
7470 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
7471 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
7472 1, label);
7473 if (epilogue)
7474 predict_jump (REG_BR_PROB_BASE * 50 / 100);
7475 else
7476 predict_jump (REG_BR_PROB_BASE * 90 / 100);
7477 return label;
7478 }
7479
7480
7481 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
7482
7483 static void
7484 expand_cpymem_epilogue (rtx destmem, rtx srcmem,
7485 rtx destptr, rtx srcptr, rtx count, int max_size)
7486 {
7487 rtx src, dest;
7488 if (CONST_INT_P (count))
7489 {
7490 HOST_WIDE_INT countval = INTVAL (count);
7491 HOST_WIDE_INT epilogue_size = countval % max_size;
7492 int i;
7493
7494 /* For now MAX_SIZE should be a power of 2. This assert could be
7495 relaxed, but it'll require a bit more complicated epilogue
7496 expanding. */
7497 gcc_assert ((max_size & (max_size - 1)) == 0);
7498 for (i = max_size; i >= 1; i >>= 1)
7499 {
7500 if (epilogue_size & i)
7501 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
7502 }
7503 return;
7504 }
7505 if (max_size > 8)
7506 {
7507 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
7508 count, 1, OPTAB_DIRECT);
7509 expand_set_or_cpymem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
7510 count, QImode, 1, 4, false);
7511 return;
7512 }
7513
7514 /* When single string operations are available, we can cheaply advance
7515 the dest and src pointers.  Otherwise we save code size by maintaining
7516 an offset (zero is readily available from the preceding rep operation)
7517 and using x86 addressing modes.  */
7518 if (TARGET_SINGLE_STRINGOP)
7519 {
7520 if (max_size > 4)
7521 {
7522 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
7523 src = change_address (srcmem, SImode, srcptr);
7524 dest = change_address (destmem, SImode, destptr);
7525 emit_insn (gen_strmov (destptr, dest, srcptr, src));
7526 emit_label (label);
7527 LABEL_NUSES (label) = 1;
7528 }
7529 if (max_size > 2)
7530 {
7531 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
7532 src = change_address (srcmem, HImode, srcptr);
7533 dest = change_address (destmem, HImode, destptr);
7534 emit_insn (gen_strmov (destptr, dest, srcptr, src));
7535 emit_label (label);
7536 LABEL_NUSES (label) = 1;
7537 }
7538 if (max_size > 1)
7539 {
7540 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
7541 src = change_address (srcmem, QImode, srcptr);
7542 dest = change_address (destmem, QImode, destptr);
7543 emit_insn (gen_strmov (destptr, dest, srcptr, src));
7544 emit_label (label);
7545 LABEL_NUSES (label) = 1;
7546 }
7547 }
7548 else
7549 {
7550 rtx offset = force_reg (Pmode, const0_rtx);
7551 rtx tmp;
7552
7553 if (max_size > 4)
7554 {
7555 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
7556 src = change_address (srcmem, SImode, srcptr);
7557 dest = change_address (destmem, SImode, destptr);
7558 emit_move_insn (dest, src);
7559 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
7560 true, OPTAB_LIB_WIDEN);
7561 if (tmp != offset)
7562 emit_move_insn (offset, tmp);
7563 emit_label (label);
7564 LABEL_NUSES (label) = 1;
7565 }
7566 if (max_size > 2)
7567 {
7568 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
7569 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
7570 src = change_address (srcmem, HImode, tmp);
7571 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
7572 dest = change_address (destmem, HImode, tmp);
7573 emit_move_insn (dest, src);
7574 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
7575 true, OPTAB_LIB_WIDEN);
7576 if (tmp != offset)
7577 emit_move_insn (offset, tmp);
7578 emit_label (label);
7579 LABEL_NUSES (label) = 1;
7580 }
7581 if (max_size > 1)
7582 {
7583 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
7584 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
7585 src = change_address (srcmem, QImode, tmp);
7586 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
7587 dest = change_address (destmem, QImode, tmp);
7588 emit_move_insn (dest, src);
7589 emit_label (label);
7590 LABEL_NUSES (label) = 1;
7591 }
7592 }
7593 }
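/* For a compile-time COUNT this epilogue simply emits one move per set
   bit of COUNT % MAX_SIZE (e.g. a remainder of 7 with MAX_SIZE 8 yields
   a 4-, a 2- and a 1-byte move); for a runtime COUNT it falls back to
   the byte loop or to the aligntest-guarded moves above.  */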
7594
7595 /* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
7596 with value PROMOTED_VAL.
7597 DESTPTR is advanced as the stores are emitted.
7598 The return value is the updated DESTMEM.  */
7599 static rtx
7600 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
7601 HOST_WIDE_INT size_to_move)
7602 {
7603 rtx dst = destmem;
7604 enum insn_code code;
7605 machine_mode move_mode;
7606 int piece_size, i;
7607
7608 /* Find the mode in which to perform the stores.
7609 Start from the mode of PROMOTED_VAL and narrow it when SIZE_TO_MOVE
7610 is smaller than the size of that mode.  */
7611 move_mode = GET_MODE (promoted_val);
7612 if (move_mode == VOIDmode)
7613 move_mode = QImode;
7614 if (size_to_move < GET_MODE_SIZE (move_mode))
7615 {
7616 unsigned int move_bits = size_to_move * BITS_PER_UNIT;
7617 move_mode = int_mode_for_size (move_bits, 0).require ();
7618 promoted_val = gen_lowpart (move_mode, promoted_val);
7619 }
7620 piece_size = GET_MODE_SIZE (move_mode);
7621 code = optab_handler (mov_optab, move_mode);
7622 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
7623
7624 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
7625
7626 /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZE moves.  */
7627 gcc_assert (size_to_move % piece_size == 0);
7628
7629 for (i = 0; i < size_to_move; i += piece_size)
7630 {
7631 if (piece_size <= GET_MODE_SIZE (word_mode))
7632 {
7633 emit_insn (gen_strset (destptr, dst, promoted_val));
7634 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
7635 piece_size);
7636 continue;
7637 }
7638
7639 emit_insn (GEN_FCN (code) (dst, promoted_val));
7640
7641 emit_move_insn (destptr,
7642 plus_constant (Pmode, copy_rtx (destptr), piece_size));
7643
7644 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
7645 piece_size);
7646 }
7647
7648 /* Update DST rtx. */
7649 return dst;
7650 }
7651 /* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
7652 static void
7653 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
7654 rtx count, int max_size)
7655 {
7656 count = expand_simple_binop (counter_mode (count), AND, count,
7657 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
7658 expand_set_or_cpymem_via_loop (destmem, NULL, destptr, NULL,
7659 gen_lowpart (QImode, value), count, QImode,
7660 1, max_size / 2, true);
7661 }
7662
7663 /* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
7664 static void
7665 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
7666 rtx count, int max_size)
7667 {
7668 rtx dest;
7669
7670 if (CONST_INT_P (count))
7671 {
7672 HOST_WIDE_INT countval = INTVAL (count);
7673 HOST_WIDE_INT epilogue_size = countval % max_size;
7674 int i;
7675
7676 /* For now MAX_SIZE should be a power of 2. This assert could be
7677 relaxed, but it'll require a bit more complicated epilogue
7678 expanding. */
7679 gcc_assert ((max_size & (max_size - 1)) == 0);
7680 for (i = max_size; i >= 1; i >>= 1)
7681 {
7682 if (epilogue_size & i)
7683 {
7684 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
7685 destmem = emit_memset (destmem, destptr, vec_value, i);
7686 else
7687 destmem = emit_memset (destmem, destptr, value, i);
7688 }
7689 }
7690 return;
7691 }
7692 if (max_size > 32)
7693 {
7694 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
7695 return;
7696 }
7697 if (max_size > 16)
7698 {
7699 rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
7700 if (TARGET_64BIT)
7701 {
7702 dest = change_address (destmem, DImode, destptr);
7703 emit_insn (gen_strset (destptr, dest, value));
7704 dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
7705 emit_insn (gen_strset (destptr, dest, value));
7706 }
7707 else
7708 {
7709 dest = change_address (destmem, SImode, destptr);
7710 emit_insn (gen_strset (destptr, dest, value));
7711 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
7712 emit_insn (gen_strset (destptr, dest, value));
7713 dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
7714 emit_insn (gen_strset (destptr, dest, value));
7715 dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
7716 emit_insn (gen_strset (destptr, dest, value));
7717 }
7718 emit_label (label);
7719 LABEL_NUSES (label) = 1;
7720 }
7721 if (max_size > 8)
7722 {
7723 rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
7724 if (TARGET_64BIT)
7725 {
7726 dest = change_address (destmem, DImode, destptr);
7727 emit_insn (gen_strset (destptr, dest, value));
7728 }
7729 else
7730 {
7731 dest = change_address (destmem, SImode, destptr);
7732 emit_insn (gen_strset (destptr, dest, value));
7733 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
7734 emit_insn (gen_strset (destptr, dest, value));
7735 }
7736 emit_label (label);
7737 LABEL_NUSES (label) = 1;
7738 }
7739 if (max_size > 4)
7740 {
7741 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
7742 dest = change_address (destmem, SImode, destptr);
7743 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
7744 emit_label (label);
7745 LABEL_NUSES (label) = 1;
7746 }
7747 if (max_size > 2)
7748 {
7749 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
7750 dest = change_address (destmem, HImode, destptr);
7751 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
7752 emit_label (label);
7753 LABEL_NUSES (label) = 1;
7754 }
7755 if (max_size > 1)
7756 {
7757 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
7758 dest = change_address (destmem, QImode, destptr);
7759 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
7760 emit_label (label);
7761 LABEL_NUSES (label) = 1;
7762 }
7763 }
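/* Worked example for the constant-count case above (hypothetical values):
   with max_size == 16 and countval == 43, the epilogue size is
   43 % 16 == 11 == 8 + 2 + 1, so the loop over the bits of the remainder
   emits three unconditional stores of 8, 2 and 1 bytes.  For a variable
   count the aligntest jump tree above performs the same decomposition,
   with one conditional branch per power of two.  */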
7764
7765 /* Decrease COUNTREG by VALUE. */
7766 static void
7767 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
7768 {
7769 emit_insn (gen_add2_insn (countreg, GEN_INT (-value)));
7770 }
7771
7772 /* Depending on ISSETMEM, either copy enough bytes from SRCMEM to DESTMEM, or
7773 store enough bytes into DESTMEM, to align it to DESIRED_ALIGNMENT. The
7774 original alignment is ALIGN. Depending on ISSETMEM, either the arguments
7775 SRCMEM/SRCPTR or VALUE/VEC_VALUE are ignored.
7776 The return value is the updated DESTMEM. */
7777
7778 static rtx
7779 expand_set_or_cpymem_prologue (rtx destmem, rtx srcmem,
7780 rtx destptr, rtx srcptr, rtx value,
7781 rtx vec_value, rtx count, int align,
7782 int desired_alignment, bool issetmem)
7783 {
7784 int i;
7785 for (i = 1; i < desired_alignment; i <<= 1)
7786 {
7787 if (align <= i)
7788 {
7789 rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
7790 if (issetmem)
7791 {
7792 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
7793 destmem = emit_memset (destmem, destptr, vec_value, i);
7794 else
7795 destmem = emit_memset (destmem, destptr, value, i);
7796 }
7797 else
7798 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
7799 ix86_adjust_counter (count, i);
7800 emit_label (label);
7801 LABEL_NUSES (label) = 1;
7802 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
7803 }
7804 }
7805 return destmem;
7806 }
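/* Sketch of the prologue above for a memset with ALIGN == 1 and
   DESIRED_ALIGNMENT == 8 (hypothetical values): the loop tests bit 0,
   bit 1 and bit 2 of DESTPTR in turn, conditionally storing 1, 2 and 4
   bytes and decreasing COUNT by each piece actually stored, so that
   DESTPTR is 8-byte aligned when the main loop starts.  */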
7807
7808 /* Test if COUNT & SIZE is nonzero and if so, expand a cpymem
7809 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
7810 and jump to DONE_LABEL. */
7811 static void
7812 expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem,
7813 rtx destptr, rtx srcptr,
7814 rtx value, rtx vec_value,
7815 rtx count, int size,
7816 rtx done_label, bool issetmem)
7817 {
7818 rtx_code_label *label = ix86_expand_aligntest (count, size, false);
7819 machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
7820 rtx modesize;
7821 int n;
7822
7823 /* If we do not have a vector value to copy, we must reduce the size. */
7824 if (issetmem)
7825 {
7826 if (!vec_value)
7827 {
7828 if (GET_MODE (value) == VOIDmode && size > 8)
7829 mode = Pmode;
7830 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
7831 mode = GET_MODE (value);
7832 }
7833 else
7834 mode = GET_MODE (vec_value), value = vec_value;
7835 }
7836 else
7837 {
7838 /* Choose appropriate vector mode. */
7839 if (size >= 32)
7840 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
7841 else if (size >= 16)
7842 mode = TARGET_SSE ? V16QImode : DImode;
7843 srcmem = change_address (srcmem, mode, srcptr);
7844 }
7845 destmem = change_address (destmem, mode, destptr);
7846 modesize = GEN_INT (GET_MODE_SIZE (mode));
7847 gcc_assert (GET_MODE_SIZE (mode) <= size);
7848 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
7849 {
7850 if (issetmem)
7851 emit_move_insn (destmem, gen_lowpart (mode, value));
7852 else
7853 {
7854 emit_move_insn (destmem, srcmem);
7855 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
7856 }
7857 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
7858 }
7859
7860 destmem = offset_address (destmem, count, 1);
7861 destmem = offset_address (destmem, GEN_INT (-2 * size),
7862 GET_MODE_SIZE (mode));
7863 if (!issetmem)
7864 {
7865 srcmem = offset_address (srcmem, count, 1);
7866 srcmem = offset_address (srcmem, GEN_INT (-2 * size),
7867 GET_MODE_SIZE (mode));
7868 }
7869 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
7870 {
7871 if (issetmem)
7872 emit_move_insn (destmem, gen_lowpart (mode, value));
7873 else
7874 {
7875 emit_move_insn (destmem, srcmem);
7876 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
7877 }
7878 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
7879 }
7880 emit_jump_insn (gen_jump (done_label));
7881 emit_barrier ();
7882
7883 emit_label (label);
7884 LABEL_NUSES (label) = 1;
7885 }
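/* The trick above handles any length in SIZE..2*SIZE-1 with two
   potentially overlapping accesses.  For example (hypothetical values),
   with SIZE == 4 and COUNT == 6 a copy moves bytes 0..3 and then bytes
   COUNT-4..COUNT-1, i.e. 2..5; together they cover all six bytes, and
   the overlap of bytes 2..3 is harmless.  */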
7886
7887 /* Handle a small memcpy (up to SIZE, which is supposed to be a small power of 2)
7888 and get ready for the main copy loop by copying the initial DESIRED_ALIGN-ALIGN
7889 bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT so that we can
7890 proceed with a loop copying SIZE bytes at once. Do moves in MODE.
7891 DONE_LABEL is a label after the whole copying sequence. The label is created
7892 on demand if *DONE_LABEL is NULL.
7893 MIN_SIZE is the minimal size of the block copied. This value gets adjusted
7894 for the new bounds after the initial copies.
7895
7896 DESTMEM/SRCMEM are memory expressions pointing to the copied block,
7897 DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicates whether
7898 we will dispatch to a library call for large blocks.
7899
7900 In pseudocode we do:
7901
7902 if (COUNT < SIZE)
7903 {
7904 Assume that SIZE is 4. Bigger sizes are handled analogously
7905 if (COUNT & 4)
7906 {
7907 copy 4 bytes from SRCPTR to DESTPTR
7908 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
7909 goto done_label
7910 }
7911 if (!COUNT)
7912 goto done_label;
7913 copy 1 byte from SRCPTR to DESTPTR
7914 if (COUNT & 2)
7915 {
7916 copy 2 bytes from SRCPTR to DESTPTR
7917 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
7918 }
7919 }
7920 else
7921 {
7922 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
7923 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT - SIZE
7924
7925 OLD_DESTPTR = DESTPTR;
7926 Align DESTPTR up to DESIRED_ALIGN
7927 SRCPTR += DESTPTR - OLD_DESTPTR
7928 COUNT -= DESTPTR - OLD_DESTPTR
7929 if (DYNAMIC_CHECK)
7930 Round COUNT down to multiple of SIZE
7931 << optional caller supplied zero size guard is here >>
7932 << optional caller supplied dynamic check is here >>
7933 << caller supplied main copy loop is here >>
7934 }
7935 done_label:
7936 */
7937 static void
7938 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
7939 rtx *destptr, rtx *srcptr,
7940 machine_mode mode,
7941 rtx value, rtx vec_value,
7942 rtx *count,
7943 rtx_code_label **done_label,
7944 int size,
7945 int desired_align,
7946 int align,
7947 unsigned HOST_WIDE_INT *min_size,
7948 bool dynamic_check,
7949 bool issetmem)
7950 {
7951 rtx_code_label *loop_label = NULL, *label;
7952 int n;
7953 rtx modesize;
7954 int prolog_size = 0;
7955 rtx mode_value;
7956
7957 /* Choose the proper value to copy. */
7958 if (issetmem && VECTOR_MODE_P (mode))
7959 mode_value = vec_value;
7960 else
7961 mode_value = value;
7962 gcc_assert (GET_MODE_SIZE (mode) <= size);
7963
7964 /* See if block is big or small, handle small blocks. */
7965 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
7966 {
7967 int size2 = size;
7968 loop_label = gen_label_rtx ();
7969
7970 if (!*done_label)
7971 *done_label = gen_label_rtx ();
7972
7973 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
7974 1, loop_label);
7975 size2 >>= 1;
7976
7977 /* Handle sizes > 3. */
7978 for (;size2 > 2; size2 >>= 1)
7979 expand_small_cpymem_or_setmem (destmem, srcmem,
7980 *destptr, *srcptr,
7981 value, vec_value,
7982 *count,
7983 size2, *done_label, issetmem);
7984 /* Nothing to copy? Jump to DONE_LABEL if so. */
7985 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
7986 1, *done_label);
7987
7988 /* Do a byte copy. */
7989 destmem = change_address (destmem, QImode, *destptr);
7990 if (issetmem)
7991 emit_move_insn (destmem, gen_lowpart (QImode, value));
7992 else
7993 {
7994 srcmem = change_address (srcmem, QImode, *srcptr);
7995 emit_move_insn (destmem, srcmem);
7996 }
7997
7998 /* Handle sizes 2 and 3. */
7999 label = ix86_expand_aligntest (*count, 2, false);
8000 destmem = change_address (destmem, HImode, *destptr);
8001 destmem = offset_address (destmem, *count, 1);
8002 destmem = offset_address (destmem, GEN_INT (-2), 2);
8003 if (issetmem)
8004 emit_move_insn (destmem, gen_lowpart (HImode, value));
8005 else
8006 {
8007 srcmem = change_address (srcmem, HImode, *srcptr);
8008 srcmem = offset_address (srcmem, *count, 1);
8009 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
8010 emit_move_insn (destmem, srcmem);
8011 }
8012
8013 emit_label (label);
8014 LABEL_NUSES (label) = 1;
8015 emit_jump_insn (gen_jump (*done_label));
8016 emit_barrier ();
8017 }
8018 else
8019 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
8020 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
8021
8022 /* Start memcpy for COUNT >= SIZE. */
8023 if (loop_label)
8024 {
8025 emit_label (loop_label);
8026 LABEL_NUSES (loop_label) = 1;
8027 }
8028
8029 /* Copy first desired_align bytes. */
8030 if (!issetmem)
8031 srcmem = change_address (srcmem, mode, *srcptr);
8032 destmem = change_address (destmem, mode, *destptr);
8033 modesize = GEN_INT (GET_MODE_SIZE (mode));
8034 for (n = 0; prolog_size < desired_align - align; n++)
8035 {
8036 if (issetmem)
8037 emit_move_insn (destmem, mode_value);
8038 else
8039 {
8040 emit_move_insn (destmem, srcmem);
8041 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
8042 }
8043 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
8044 prolog_size += GET_MODE_SIZE (mode);
8045 }
8046
8047
8048 /* Copy last SIZE bytes. */
8049 destmem = offset_address (destmem, *count, 1);
8050 destmem = offset_address (destmem,
8051 GEN_INT (-size - prolog_size),
8052 1);
8053 if (issetmem)
8054 emit_move_insn (destmem, mode_value);
8055 else
8056 {
8057 srcmem = offset_address (srcmem, *count, 1);
8058 srcmem = offset_address (srcmem,
8059 GEN_INT (-size - prolog_size),
8060 1);
8061 emit_move_insn (destmem, srcmem);
8062 }
8063 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
8064 {
8065 destmem = offset_address (destmem, modesize, 1);
8066 if (issetmem)
8067 emit_move_insn (destmem, mode_value);
8068 else
8069 {
8070 srcmem = offset_address (srcmem, modesize, 1);
8071 emit_move_insn (destmem, srcmem);
8072 }
8073 }
8074
8075 /* Align destination. */
8076 if (desired_align > 1 && desired_align > align)
8077 {
8078 rtx saveddest = *destptr;
8079
8080 gcc_assert (desired_align <= size);
8081 /* Align destptr up, placing it in a new register. */
8082 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
8083 GEN_INT (prolog_size),
8084 NULL_RTX, 1, OPTAB_DIRECT);
8085 if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
8086 REG_POINTER (*destptr) = 1;
8087 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
8088 GEN_INT (-desired_align),
8089 *destptr, 1, OPTAB_DIRECT);
8090 /* See how many bytes we skipped. */
8091 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
8092 *destptr,
8093 saveddest, 1, OPTAB_DIRECT);
8094 /* Adjust srcptr and count. */
8095 if (!issetmem)
8096 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
8097 saveddest, *srcptr, 1, OPTAB_DIRECT);
8098 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
8099 saveddest, *count, 1, OPTAB_DIRECT);
8100 /* We copied at most size + prolog_size. */
8101 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
8102 *min_size
8103 = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
8104 else
8105 *min_size = 0;
8106
8107 /* Our loops always round down the block size, but for dispatch to
8108 the library we need the precise value. */
8109 if (dynamic_check)
8110 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
8111 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
8112 }
8113 else
8114 {
8115 gcc_assert (prolog_size == 0);
8116 /* Decrease count, so we won't end up copying the last word twice. */
8117 if (!CONST_INT_P (*count))
8118 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
8119 constm1_rtx, *count, 1, OPTAB_DIRECT);
8120 else
8121 *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
8122 (unsigned HOST_WIDE_INT)size));
8123 if (*min_size)
8124 *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
8125 }
8126 }
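/* Numeric sketch of the alignment step above (hypothetical values):
   assume DESTPTR == 0x1003, PROLOG_SIZE == 16 and DESIRED_ALIGN == 16.
   The new DESTPTR is (0x1003 + 16) & -16 == 0x1010, SAVEDDEST becomes
   0x1003 - 0x1010 == -13, so SRCPTR is advanced by 13 bytes and COUNT is
   decreased by 13; the 13 skipped bytes were already covered by the
   possibly misaligned prologue copy.  */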
8127
8128
8129 /* This function is like the previous one, except here we know how many bytes
8130 need to be copied. That allows us to update alignment not only of DST, which
8131 is returned, but also of SRC, which is passed as a pointer for that
8132 reason. */
8133 static rtx
8134 expand_set_or_cpymem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
8135 rtx srcreg, rtx value, rtx vec_value,
8136 int desired_align, int align_bytes,
8137 bool issetmem)
8138 {
8139 rtx src = NULL;
8140 rtx orig_dst = dst;
8141 rtx orig_src = NULL;
8142 int piece_size = 1;
8143 int copied_bytes = 0;
8144
8145 if (!issetmem)
8146 {
8147 gcc_assert (srcp != NULL);
8148 src = *srcp;
8149 orig_src = src;
8150 }
8151
8152 for (piece_size = 1;
8153 piece_size <= desired_align && copied_bytes < align_bytes;
8154 piece_size <<= 1)
8155 {
8156 if (align_bytes & piece_size)
8157 {
8158 if (issetmem)
8159 {
8160 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
8161 dst = emit_memset (dst, destreg, vec_value, piece_size);
8162 else
8163 dst = emit_memset (dst, destreg, value, piece_size);
8164 }
8165 else
8166 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
8167 copied_bytes += piece_size;
8168 }
8169 }
8170 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
8171 set_mem_align (dst, desired_align * BITS_PER_UNIT);
8172 if (MEM_SIZE_KNOWN_P (orig_dst))
8173 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
8174
8175 if (!issetmem)
8176 {
8177 int src_align_bytes = get_mem_align_offset (src, desired_align
8178 * BITS_PER_UNIT);
8179 if (src_align_bytes >= 0)
8180 src_align_bytes = desired_align - src_align_bytes;
8181 if (src_align_bytes >= 0)
8182 {
8183 unsigned int src_align;
8184 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
8185 {
8186 if ((src_align_bytes & (src_align - 1))
8187 == (align_bytes & (src_align - 1)))
8188 break;
8189 }
8190 if (src_align > (unsigned int) desired_align)
8191 src_align = desired_align;
8192 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
8193 set_mem_align (src, src_align * BITS_PER_UNIT);
8194 }
8195 if (MEM_SIZE_KNOWN_P (orig_src))
8196 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
8197 *srcp = src;
8198 }
8199
8200 return dst;
8201 }
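/* Example of the source-alignment search above (hypothetical values):
   with desired_align == 16, align_bytes == 7 and src_align_bytes == 3
   (the source needs 3 bytes to reach a 16-byte boundary), the loop
   checks 16 (3 != 7 mod 16), then 8 (3 != 7 mod 8), then 4
   (3 == 7 mod 4), so after the 7 prologue bytes SRC is known to be
   4-byte aligned and its MEM_ALIGN is raised accordingly.  */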
8202
8203 /* Return true if ALG can be used in current context.
8204 Assume we expand memset if MEMSET is true. */
8205 static bool
8206 alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
8207 {
8208 if (alg == no_stringop)
8209 return false;
8210 if (alg == vector_loop)
8211 return TARGET_SSE || TARGET_AVX;
8212 /* Algorithms using the rep prefix want at least edi and ecx;
8213 additionally, memset wants eax and memcpy wants esi. Don't
8214 consider such algorithms if the user has appropriated those
8215 registers for their own purposes, or if we have a non-default
8216 address space, since some string insns cannot override the segment. */
8217 if (alg == rep_prefix_1_byte
8218 || alg == rep_prefix_4_byte
8219 || alg == rep_prefix_8_byte)
8220 {
8221 if (have_as)
8222 return false;
8223 if (fixed_regs[CX_REG]
8224 || fixed_regs[DI_REG]
8225 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
8226 return false;
8227 }
8228 return true;
8229 }
8230
8231 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
8232 static enum stringop_alg
8233 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
8234 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
8235 bool memset, bool zero_memset, bool have_as,
8236 int *dynamic_check, bool *noalign, bool recur)
8237 {
8238 const struct stringop_algs *algs;
8239 bool optimize_for_speed;
8240 int max = 0;
8241 const struct processor_costs *cost;
8242 int i;
8243 bool any_alg_usable_p = false;
8244
8245 *noalign = false;
8246 *dynamic_check = -1;
8247
8248 /* Even if the string operation call is cold, we still might spend a lot
8249 of time processing large blocks. */
8250 if (optimize_function_for_size_p (cfun)
8251 || (optimize_insn_for_size_p ()
8252 && (max_size < 256
8253 || (expected_size != -1 && expected_size < 256))))
8254 optimize_for_speed = false;
8255 else
8256 optimize_for_speed = true;
8257
8258 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
8259 if (memset)
8260 algs = &cost->memset[TARGET_64BIT != 0];
8261 else
8262 algs = &cost->memcpy[TARGET_64BIT != 0];
8263
8264 /* Find the maximal size handled by a user-defined algorithm. */
8265 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
8266 {
8267 enum stringop_alg candidate = algs->size[i].alg;
8268 bool usable = alg_usable_p (candidate, memset, have_as);
8269 any_alg_usable_p |= usable;
8270
8271 if (candidate != libcall && candidate && usable)
8272 max = algs->size[i].max;
8273 }
8274
8275 /* If the expected size is not known but the max size is small enough
8276 that the inline version is a win, set the expected size into
8277 the range. */
8278 if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
8279 && expected_size == -1)
8280 expected_size = min_size / 2 + max_size / 2;
8281
8282 /* If user specified the algorithm, honor it if possible. */
8283 if (ix86_stringop_alg != no_stringop
8284 && alg_usable_p (ix86_stringop_alg, memset, have_as))
8285 return ix86_stringop_alg;
8286 /* rep; movq or rep; movl is the smallest variant. */
8287 else if (!optimize_for_speed)
8288 {
8289 *noalign = true;
8290 if (!count || (count & 3) || (memset && !zero_memset))
8291 return alg_usable_p (rep_prefix_1_byte, memset, have_as)
8292 ? rep_prefix_1_byte : loop_1_byte;
8293 else
8294 return alg_usable_p (rep_prefix_4_byte, memset, have_as)
8295 ? rep_prefix_4_byte : loop;
8296 }
8297 /* Very tiny blocks are best handled via the loop; REP is expensive to
8298 set up. */
8299 else if (expected_size != -1 && expected_size < 4)
8300 return loop_1_byte;
8301 else if (expected_size != -1)
8302 {
8303 enum stringop_alg alg = libcall;
8304 bool alg_noalign = false;
8305 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
8306 {
8307 /* We get here if the algorithms that were not libcall-based
8308 were rep-prefix based and we are unable to use rep prefixes
8309 based on global register usage. Break out of the loop and
8310 use the heuristic below. */
8311 if (algs->size[i].max == 0)
8312 break;
8313 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
8314 {
8315 enum stringop_alg candidate = algs->size[i].alg;
8316
8317 if (candidate != libcall
8318 && alg_usable_p (candidate, memset, have_as))
8319 {
8320 alg = candidate;
8321 alg_noalign = algs->size[i].noalign;
8322 }
8323 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
8324 last non-libcall inline algorithm. */
8325 if (TARGET_INLINE_ALL_STRINGOPS)
8326 {
8327 /* When the current size is best copied by a libcall,
8328 but we are still forced to inline, run the heuristic below
8329 that will pick code for medium sized blocks. */
8330 if (alg != libcall)
8331 {
8332 *noalign = alg_noalign;
8333 return alg;
8334 }
8335 else if (!any_alg_usable_p)
8336 break;
8337 }
8338 else if (alg_usable_p (candidate, memset, have_as)
8339 && !(TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
8340 && candidate == rep_prefix_1_byte
8341 /* NB: If min_size != max_size, size is
8342 unknown. */
8343 && min_size != max_size))
8344 {
8345 *noalign = algs->size[i].noalign;
8346 return candidate;
8347 }
8348 }
8349 }
8350 }
8351 /* When asked to inline the call anyway, try to pick a meaningful choice.
8352 We look for the maximal size of block that is faster to copy by hand and
8353 take blocks of at most that size, guessing that the average size will
8354 be roughly half of the block.
8355
8356 If this turns out to be bad, we might simply specify the preferred
8357 choice in ix86_costs. */
8358 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
8359 && (algs->unknown_size == libcall
8360 || !alg_usable_p (algs->unknown_size, memset, have_as)))
8361 {
8362 enum stringop_alg alg;
8363 HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
8364
8365 /* If there aren't any usable algorithms or if recursing already,
8366 then recursing on smaller sizes or same size isn't going to
8367 find anything. Just return the simple byte-at-a-time copy loop. */
8368 if (!any_alg_usable_p || recur)
8369 {
8370 /* Pick something reasonable. */
8371 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
8372 *dynamic_check = 128;
8373 return loop_1_byte;
8374 }
8375 alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
8376 zero_memset, have_as, dynamic_check, noalign, true);
8377 gcc_assert (*dynamic_check == -1);
8378 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
8379 *dynamic_check = max;
8380 else
8381 gcc_assert (alg != libcall);
8382 return alg;
8383 }
8384 return (alg_usable_p (algs->unknown_size, memset, have_as)
8385 ? algs->unknown_size : libcall);
8386 }
8387
8388 /* Decide on alignment. We know that the operand is already aligned to ALIGN
8389 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
8390 static int
8391 decide_alignment (int align,
8392 enum stringop_alg alg,
8393 int expected_size,
8394 machine_mode move_mode)
8395 {
8396 int desired_align = 0;
8397
8398 gcc_assert (alg != no_stringop);
8399
8400 if (alg == libcall)
8401 return 0;
8402 if (move_mode == VOIDmode)
8403 return 0;
8404
8405 desired_align = GET_MODE_SIZE (move_mode);
8406 /* PentiumPro has special logic triggering for 8 byte aligned blocks,
8407 copying a whole cache line at once. */
8408 if (TARGET_CPU_P (PENTIUMPRO)
8409 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
8410 desired_align = 8;
8411
8412 if (optimize_size)
8413 desired_align = 1;
8414 if (desired_align < align)
8415 desired_align = align;
8416 if (expected_size != -1 && expected_size < 4)
8417 desired_align = align;
8418
8419 return desired_align;
8420 }
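/* For instance (hypothetical values), a vector_loop with
   move_mode == V16QImode yields desired_align == 16, while
   rep_prefix_4_byte on a PentiumPro is bumped to 8 so that whole cache
   lines are written at once; with -Os desired_align is forced down to 1,
   i.e. no alignment prologue beyond what ALIGN already guarantees.  */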
8421
8422
8423 /* Helper function for memset expansion. For the QImode value 0xXY produce
8424 0xXYXYXYXY of the width specified by MODE. This is essentially
8425 a multiplication by 0x01010101, but we can do slightly better than
8426 synth_mult by unwinding the sequence by hand on CPUs with
8427 slow multiply. */
8428 static rtx
8429 promote_duplicated_reg (machine_mode mode, rtx val)
8430 {
8431 machine_mode valmode = GET_MODE (val);
8432 rtx tmp;
8433 int nops = mode == DImode ? 3 : 2;
8434
8435 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
8436 if (val == const0_rtx)
8437 return copy_to_mode_reg (mode, CONST0_RTX (mode));
8438 if (CONST_INT_P (val))
8439 {
8440 HOST_WIDE_INT v = INTVAL (val) & 255;
8441
8442 v |= v << 8;
8443 v |= v << 16;
8444 if (mode == DImode)
8445 v |= (v << 16) << 16;
8446 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
8447 }
8448
8449 if (valmode == VOIDmode)
8450 valmode = QImode;
8451 if (valmode != QImode)
8452 val = gen_lowpart (QImode, val);
8453 if (mode == QImode)
8454 return val;
8455 if (!TARGET_PARTIAL_REG_STALL)
8456 nops--;
8457 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
8458 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
8459 <= (ix86_cost->shift_const + ix86_cost->add) * nops
8460 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
8461 {
8462 rtx reg = convert_modes (mode, QImode, val, true);
8463 tmp = promote_duplicated_reg (mode, const1_rtx);
8464 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
8465 OPTAB_DIRECT);
8466 }
8467 else
8468 {
8469 rtx reg = convert_modes (mode, QImode, val, true);
8470
8471 if (!TARGET_PARTIAL_REG_STALL)
8472 emit_insn (gen_insv_1 (mode, reg, reg));
8473 else
8474 {
8475 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
8476 NULL, 1, OPTAB_DIRECT);
8477 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1,
8478 OPTAB_DIRECT);
8479 }
8480 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
8481 NULL, 1, OPTAB_DIRECT);
8482 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
8483 if (mode == SImode)
8484 return reg;
8485 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
8486 NULL, 1, OPTAB_DIRECT);
8487 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
8488 return reg;
8489 }
8490 }
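/* Worked example of the promotion above: for SImode and the QImode
   value 0xAB, the multiply path computes 0xAB * 0x01010101 == 0xABABABAB,
   while the shift/or path computes the same value as

     reg = 0x000000AB;
     reg |= reg << 8;      /. 0x0000ABAB ./
     reg |= reg << 16;     /. 0xABABABAB ./

   with an extra "reg |= reg << 32" step for DImode.  */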
8491
8492 /* Duplicate the value VAL using promote_duplicated_reg into the maximal size
8493 that will be needed by the main loop copying SIZE_NEEDED chunks and by the
8494 prologue raising the alignment from ALIGN to DESIRED_ALIGN. */
8495 static rtx
8496 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
8497 int align)
8498 {
8499 rtx promoted_val;
8500
8501 if (TARGET_64BIT
8502 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
8503 promoted_val = promote_duplicated_reg (DImode, val);
8504 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
8505 promoted_val = promote_duplicated_reg (SImode, val);
8506 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
8507 promoted_val = promote_duplicated_reg (HImode, val);
8508 else
8509 promoted_val = val;
8510
8511 return promoted_val;
8512 }
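/* For example (hypothetical values), a 64-bit memset expanded with an
   unrolled DImode loop has size_needed == 32, so the value is promoted
   to DImode; a tiny memset with size_needed == 2 and no extra alignment
   only needs the HImode promotion.  */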
8513
8514 /* Copy the address to a Pmode register. This is used for x32 to
8515 truncate a DImode TLS address to a SImode register. */
8516
8517 static rtx
8518 ix86_copy_addr_to_reg (rtx addr)
8519 {
8520 rtx reg;
8521 if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
8522 {
8523 reg = copy_addr_to_reg (addr);
8524 REG_POINTER (reg) = 1;
8525 return reg;
8526 }
8527 else
8528 {
8529 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
8530 reg = copy_to_mode_reg (DImode, addr);
8531 REG_POINTER (reg) = 1;
8532 return gen_rtx_SUBREG (SImode, reg, 0);
8533 }
8534 }
8535
8536 /* Expand a string move (memcpy) or store (memset) operation. Use i386 string
8537 operations when profitable. The code depends upon architecture, block size
8538 and alignment, but always has one of the following overall structures:
8539
8540 Aligned move sequence:
8541
8542 1) Prologue guard: Conditional that jumps up to epilogues for small
8543 blocks that can be handled by epilogue alone. This is faster
8544 but also needed for correctness, since the prologue assumes the block
8545 is larger than the desired alignment.
8546
8547 Optional dynamic check for size and libcall for large
8548 blocks is emitted here too, with -minline-stringops-dynamically.
8549
8550 2) Prologue: copy first few bytes in order to get destination
8551 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
8552 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
8553 copied. We emit either a jump tree for power-of-two sized
8554 blocks, or a byte loop.
8555
8556 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
8557 with specified algorithm.
8558
8559 4) Epilogue: code copying tail of the block that is too small to be
8560 handled by main body (or up to size guarded by prologue guard).
8561
8562 Misaligned move sequence
8563
8564 1) Misaligned move prologue/epilogue containing:
8565 a) Prologue handling small memory blocks and jumping to done_label
8566 (skipped if blocks are known to be large enough)
8567 b) Single move copying the first DESIRED_ALIGN-ALIGN bytes if alignment
8568 is needed, done by a single possibly misaligned move
8569 (skipped if alignment is not needed)
8570 c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
8571
8572 2) Zero size guard dispatching to done_label, if needed
8573
8574 3) Dispatch to a library call, if needed.
8575
8576 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
8577 with the specified algorithm. */
8578 bool
8579 ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
8580 rtx align_exp, rtx expected_align_exp,
8581 rtx expected_size_exp, rtx min_size_exp,
8582 rtx max_size_exp, rtx probable_max_size_exp,
8583 bool issetmem)
8584 {
8585 rtx destreg;
8586 rtx srcreg = NULL;
8587 rtx_code_label *label = NULL;
8588 rtx tmp;
8589 rtx_code_label *jump_around_label = NULL;
8590 HOST_WIDE_INT align = 1;
8591 unsigned HOST_WIDE_INT count = 0;
8592 HOST_WIDE_INT expected_size = -1;
8593 int size_needed = 0, epilogue_size_needed;
8594 int desired_align = 0, align_bytes = 0;
8595 enum stringop_alg alg;
8596 rtx promoted_val = NULL;
8597 rtx vec_promoted_val = NULL;
8598 bool force_loopy_epilogue = false;
8599 int dynamic_check;
8600 bool need_zero_guard = false;
8601 bool noalign;
8602 machine_mode move_mode = VOIDmode;
8603 machine_mode wider_mode;
8604 int unroll_factor = 1;
8605 /* TODO: Once value ranges are available, fill in proper data. */
8606 unsigned HOST_WIDE_INT min_size = 0;
8607 unsigned HOST_WIDE_INT max_size = -1;
8608 unsigned HOST_WIDE_INT probable_max_size = -1;
8609 bool misaligned_prologue_used = false;
8610 bool have_as;
8611
8612 if (CONST_INT_P (align_exp))
8613 align = INTVAL (align_exp);
8614 /* i386 can do misaligned accesses at a reasonable increase in cost. */
8615 if (CONST_INT_P (expected_align_exp)
8616 && INTVAL (expected_align_exp) > align)
8617 align = INTVAL (expected_align_exp);
8618 /* ALIGN is the minimum of destination and source alignment, but we care here
8619 just about destination alignment. */
8620 else if (!issetmem
8621 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
8622 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
8623
8624 if (CONST_INT_P (count_exp))
8625 {
8626 min_size = max_size = probable_max_size = count = expected_size
8627 = INTVAL (count_exp);
8628 /* When COUNT is 0, there is nothing to do. */
8629 if (!count)
8630 return true;
8631 }
8632 else
8633 {
8634 if (min_size_exp)
8635 min_size = INTVAL (min_size_exp);
8636 if (max_size_exp)
8637 max_size = INTVAL (max_size_exp);
8638 if (probable_max_size_exp)
8639 probable_max_size = INTVAL (probable_max_size_exp);
8640 if (CONST_INT_P (expected_size_exp))
8641 expected_size = INTVAL (expected_size_exp);
8642 }
8643
8644 /* Make sure we don't need to care about overflow later on. */
8645 if (count > (HOST_WIDE_INT_1U << 30))
8646 return false;
8647
8648 have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
8649 if (!issetmem)
8650 have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
8651
8652 /* Step 0: Decide on preferred algorithm, desired alignment and
8653 size of chunks to be copied by main loop. */
8654 alg = decide_alg (count, expected_size, min_size, probable_max_size,
8655 issetmem,
8656 issetmem && val_exp == const0_rtx, have_as,
8657 &dynamic_check, &noalign, false);
8658
8659 if (dump_file)
8660 fprintf (dump_file, "Selected stringop expansion strategy: %s\n",
8661 stringop_alg_names[alg]);
8662
8663 if (alg == libcall)
8664 return false;
8665 gcc_assert (alg != no_stringop);
8666
8667 /* For now the vector version of memset is generated only for memory zeroing,
8668 as creating the promoted vector value is very cheap in this case. */
8669 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
8670 alg = unrolled_loop;
8671
8672 if (!count)
8673 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
8674 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
8675 if (!issetmem)
8676 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
8677
8678 unroll_factor = 1;
8679 move_mode = word_mode;
8680 switch (alg)
8681 {
8682 case libcall:
8683 case no_stringop:
8684 case last_alg:
8685 gcc_unreachable ();
8686 case loop_1_byte:
8687 need_zero_guard = true;
8688 move_mode = QImode;
8689 break;
8690 case loop:
8691 need_zero_guard = true;
8692 break;
8693 case unrolled_loop:
8694 need_zero_guard = true;
8695 unroll_factor = (TARGET_64BIT ? 4 : 2);
8696 break;
8697 case vector_loop:
8698 need_zero_guard = true;
8699 unroll_factor = 4;
8700 /* Find the widest supported mode. */
8701 move_mode = word_mode;
8702 while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode)
8703 && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
8704 move_mode = wider_mode;
8705
8706 if (TARGET_AVX256_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 128)
8707 move_mode = TImode;
8708 if (TARGET_AVX512_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 256)
8709 move_mode = OImode;
8710
8711 /* Find the corresponding vector mode with the same size as MOVE_MODE.
8712 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
8713 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
8714 {
8715 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
8716 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
8717 || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
8718 move_mode = word_mode;
8719 }
8720 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
8721 break;
8722 case rep_prefix_8_byte:
8723 move_mode = DImode;
8724 break;
8725 case rep_prefix_4_byte:
8726 move_mode = SImode;
8727 break;
8728 case rep_prefix_1_byte:
8729 move_mode = QImode;
8730 break;
8731 }
8732 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
8733 epilogue_size_needed = size_needed;
8734
8735 /* If we are going to emit any library calls conditionally, make sure any
8736 pending stack adjustments happen before the first conditional branch,
8737 otherwise they will be emitted before the library call only and won't
8738 happen from the other branches. */
8739 if (dynamic_check != -1)
8740 do_pending_stack_adjust ();
8741
8742 desired_align = decide_alignment (align, alg, expected_size, move_mode);
8743 if (!TARGET_ALIGN_STRINGOPS || noalign)
8744 align = desired_align;
8745
8746 /* Step 1: Prologue guard. */
8747
8748 /* Alignment code needs count to be in register. */
8749 if (CONST_INT_P (count_exp) && desired_align > align)
8750 {
8751 if (INTVAL (count_exp) > desired_align
8752 && INTVAL (count_exp) > size_needed)
8753 {
8754 align_bytes
8755 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
8756 if (align_bytes <= 0)
8757 align_bytes = 0;
8758 else
8759 align_bytes = desired_align - align_bytes;
8760 }
8761 if (align_bytes == 0)
8762 count_exp = force_reg (counter_mode (count_exp), count_exp);
8763 }
8764 gcc_assert (desired_align >= 1 && align >= 1);
8765
8766 /* Misaligned move sequences handle both prologue and epilogue at once.
8767 Default code generation results in smaller code for large alignments
8768 and also avoids redundant work when sizes are known precisely. */
8769 misaligned_prologue_used
8770 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
8771 && MAX (desired_align, epilogue_size_needed) <= 32
8772 && desired_align <= epilogue_size_needed
8773 && ((desired_align > align && !align_bytes)
8774 || (!count && epilogue_size_needed > 1)));
8775
8776 /* Do the cheap promotion to allow better CSE across the
8777 main loop and epilogue (i.e. one load of the big constant in
8778 front of all the code).
8779 For now the misaligned move sequences do not have a fast path
8780 without broadcasting. */
8781 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
8782 {
8783 if (alg == vector_loop)
8784 {
8785 gcc_assert (val_exp == const0_rtx);
8786 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
8787 promoted_val = promote_duplicated_reg_to_size (val_exp,
8788 GET_MODE_SIZE (word_mode),
8789 desired_align, align);
8790 }
8791 else
8792 {
8793 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
8794 desired_align, align);
8795 }
8796 }
8797 /* Misaligned move sequences handle both prologues and epilogues at once.
8798 Default code generation results in smaller code for large alignments and
8799 also avoids redundant work when sizes are known precisely. */
8800 if (misaligned_prologue_used)
8801 {
8802 /* The misaligned move prologue handles small blocks by itself. */
8803 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves
8804 (dst, src, &destreg, &srcreg,
8805 move_mode, promoted_val, vec_promoted_val,
8806 &count_exp,
8807 &jump_around_label,
8808 desired_align < align
8809 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
8810 desired_align, align, &min_size, dynamic_check, issetmem);
8811 if (!issetmem)
8812 src = change_address (src, BLKmode, srcreg);
8813 dst = change_address (dst, BLKmode, destreg);
8814 set_mem_align (dst, desired_align * BITS_PER_UNIT);
8815 epilogue_size_needed = 0;
8816 if (need_zero_guard
8817 && min_size < (unsigned HOST_WIDE_INT) size_needed)
8818 {
8819 /* It is possible that we copied enough so the main loop will not
8820 execute. */
8821 gcc_assert (size_needed > 1);
8822 if (jump_around_label == NULL_RTX)
8823 jump_around_label = gen_label_rtx ();
8824 emit_cmp_and_jump_insns (count_exp,
8825 GEN_INT (size_needed),
8826 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
8827 if (expected_size == -1
8828 || expected_size < (desired_align - align) / 2 + size_needed)
8829 predict_jump (REG_BR_PROB_BASE * 20 / 100);
8830 else
8831 predict_jump (REG_BR_PROB_BASE * 60 / 100);
8832 }
8833 }
8834 /* Ensure that alignment prologue won't copy past end of block. */
8835 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
8836 {
8837 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
8838 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
8839 Make sure it is power of 2. */
8840 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
8841
8842 /* To improve performance for small blocks, we jump around the VAL
8843 promoting code. This means that if the promoted VAL is not constant,
8844 we might not use it in the epilogue and have to use the byte
8845 loop variant. */
8846 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
8847 force_loopy_epilogue = true;
8848 if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
8849 || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
8850 {
8851 /* If the main algorithm works on QImode, no epilogue is needed.
8852 For small sizes just don't align anything. */
8853 if (size_needed == 1)
8854 desired_align = align;
8855 else
8856 goto epilogue;
8857 }
8858 else if (!count
8859 && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
8860 {
8861 label = gen_label_rtx ();
8862 emit_cmp_and_jump_insns (count_exp,
8863 GEN_INT (epilogue_size_needed),
8864 LTU, 0, counter_mode (count_exp), 1, label);
8865 if (expected_size == -1 || expected_size < epilogue_size_needed)
8866 predict_jump (REG_BR_PROB_BASE * 60 / 100);
8867 else
8868 predict_jump (REG_BR_PROB_BASE * 20 / 100);
8869 }
8870 }
8871
8872 /* Emit code to decide at runtime whether a library call or inline code
8873 should be used. */
8874 if (dynamic_check != -1)
8875 {
8876 if (!issetmem && CONST_INT_P (count_exp))
8877 {
8878 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
8879 {
8880 emit_block_copy_via_libcall (dst, src, count_exp);
8881 count_exp = const0_rtx;
8882 goto epilogue;
8883 }
8884 }
8885 else
8886 {
8887 rtx_code_label *hot_label = gen_label_rtx ();
8888 if (jump_around_label == NULL_RTX)
8889 jump_around_label = gen_label_rtx ();
8890 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
8891 LEU, 0, counter_mode (count_exp),
8892 1, hot_label);
8893 predict_jump (REG_BR_PROB_BASE * 90 / 100);
8894 if (issetmem)
8895 set_storage_via_libcall (dst, count_exp, val_exp);
8896 else
8897 emit_block_copy_via_libcall (dst, src, count_exp);
8898 emit_jump (jump_around_label);
8899 emit_label (hot_label);
8900 }
8901 }
8902
8903 /* Step 2: Alignment prologue. */
8904 /* Do the expensive promotion once we branched off the small blocks. */
8905 if (issetmem && !promoted_val)
8906 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
8907 desired_align, align);
8908
8909 if (desired_align > align && !misaligned_prologue_used)
8910 {
8911 if (align_bytes == 0)
8912 {
8913 /* Except for the first move in the prologue, we no longer know
8914 the constant offset in the aliasing info. It doesn't seem worth
8915 the pain to maintain it for the first move, so throw away
8916 the info early. */
8917 dst = change_address (dst, BLKmode, destreg);
8918 if (!issetmem)
8919 src = change_address (src, BLKmode, srcreg);
8920 dst = expand_set_or_cpymem_prologue (dst, src, destreg, srcreg,
8921 promoted_val, vec_promoted_val,
8922 count_exp, align, desired_align,
8923 issetmem);
8924 /* At most desired_align - align bytes are copied. */
8925 if (min_size < (unsigned)(desired_align - align))
8926 min_size = 0;
8927 else
8928 min_size -= desired_align - align;
8929 }
8930 else
8931 {
8932 /* If we know how many bytes need to be stored before dst is
8933 sufficiently aligned, maintain aliasing info accurately. */
8934 dst = expand_set_or_cpymem_constant_prologue (dst, &src, destreg,
8935 srcreg,
8936 promoted_val,
8937 vec_promoted_val,
8938 desired_align,
8939 align_bytes,
8940 issetmem);
8941
8942 count_exp = plus_constant (counter_mode (count_exp),
8943 count_exp, -align_bytes);
8944 count -= align_bytes;
8945 min_size -= align_bytes;
8946 max_size -= align_bytes;
8947 }
8948 if (need_zero_guard
8949 && min_size < (unsigned HOST_WIDE_INT) size_needed
8950 && (count < (unsigned HOST_WIDE_INT) size_needed
8951 || (align_bytes == 0
8952 && count < ((unsigned HOST_WIDE_INT) size_needed
8953 + desired_align - align))))
8954 {
8955 /* It is possible that we copied enough so the main loop will not
8956 execute. */
8957 gcc_assert (size_needed > 1);
8958 if (label == NULL_RTX)
8959 label = gen_label_rtx ();
8960 emit_cmp_and_jump_insns (count_exp,
8961 GEN_INT (size_needed),
8962 LTU, 0, counter_mode (count_exp), 1, label);
8963 if (expected_size == -1
8964 || expected_size < (desired_align - align) / 2 + size_needed)
8965 predict_jump (REG_BR_PROB_BASE * 20 / 100);
8966 else
8967 predict_jump (REG_BR_PROB_BASE * 60 / 100);
8968 }
8969 }
8970 if (label && size_needed == 1)
8971 {
8972 emit_label (label);
8973 LABEL_NUSES (label) = 1;
8974 label = NULL;
8975 epilogue_size_needed = 1;
8976 if (issetmem)
8977 promoted_val = val_exp;
8978 }
8979 else if (label == NULL_RTX && !misaligned_prologue_used)
8980 epilogue_size_needed = size_needed;
8981
8982 /* Step 3: Main loop. */
8983
8984 switch (alg)
8985 {
8986 case libcall:
8987 case no_stringop:
8988 case last_alg:
8989 gcc_unreachable ();
8990 case loop_1_byte:
8991 case loop:
8992 case unrolled_loop:
8993 expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg, promoted_val,
8994 count_exp, move_mode, unroll_factor,
8995 expected_size, issetmem);
8996 break;
8997 case vector_loop:
8998 expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg,
8999 vec_promoted_val, count_exp, move_mode,
9000 unroll_factor, expected_size, issetmem);
9001 break;
9002 case rep_prefix_8_byte:
9003 case rep_prefix_4_byte:
9004 case rep_prefix_1_byte:
9005 expand_set_or_cpymem_via_rep (dst, src, destreg, srcreg, promoted_val,
9006 val_exp, count_exp, move_mode, issetmem);
9007 break;
9008 }
9009 /* Properly adjust the offsets of the src and dest memory for aliasing. */
9010 if (CONST_INT_P (count_exp))
9011 {
9012 if (!issetmem)
9013 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
9014 (count / size_needed) * size_needed);
9015 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
9016 (count / size_needed) * size_needed);
9017 }
9018 else
9019 {
9020 if (!issetmem)
9021 src = change_address (src, BLKmode, srcreg);
9022 dst = change_address (dst, BLKmode, destreg);
9023 }
9024
9025 /* Step 4: Epilogue to copy the remaining bytes. */
9026 epilogue:
9027 if (label)
9028 {
9029 /* When the main loop is done, COUNT_EXP might hold the original count,
9030 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
9031 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
9032 bytes. Compensate if needed. */
9033
9034 if (size_needed < epilogue_size_needed)
9035 {
9036 tmp = expand_simple_binop (counter_mode (count_exp), AND, count_exp,
9037 GEN_INT (size_needed - 1), count_exp, 1,
9038 OPTAB_DIRECT);
9039 if (tmp != count_exp)
9040 emit_move_insn (count_exp, tmp);
9041 }
9042 emit_label (label);
9043 LABEL_NUSES (label) = 1;
9044 }
9045
9046 if (count_exp != const0_rtx && epilogue_size_needed > 1)
9047 {
9048 if (force_loopy_epilogue)
9049 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
9050 epilogue_size_needed);
9051 else
9052 {
9053 if (issetmem)
9054 expand_setmem_epilogue (dst, destreg, promoted_val,
9055 vec_promoted_val, count_exp,
9056 epilogue_size_needed);
9057 else
9058 expand_cpymem_epilogue (dst, src, destreg, srcreg, count_exp,
9059 epilogue_size_needed);
9060 }
9061 }
9062 if (jump_around_label)
9063 emit_label (jump_around_label);
9064 return true;
9065 }
9066
9067 /* Expand cmpstrn or memcmp. */
9068
9069 bool
9070 ix86_expand_cmpstrn_or_cmpmem (rtx result, rtx src1, rtx src2,
9071 rtx length, rtx align, bool is_cmpstrn)
9072 {
9073 /* Expand strncmp and memcmp only with -minline-all-stringops since
9074 "repz cmpsb" can be much slower than strncmp and memcmp functions
9075 implemented with vector instructions, see
9076
9077 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43052
9078 */
9079 if (!TARGET_INLINE_ALL_STRINGOPS)
9080 return false;
9081
9082 /* Can't use this if the user has appropriated ecx, esi or edi. */
9083 if (fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG])
9084 return false;
9085
9086 if (is_cmpstrn)
9087 {
9088 /* For strncmp, length is the maximum length, which can be larger
9089 than actual string lengths. We can expand the cmpstrn pattern
9090 to "repz cmpsb" only if one of the strings is a constant so
9091 that expand_builtin_strncmp() can write the length argument to
9092 be the minimum of the const string length and the actual length
9093 argument. Otherwise, "repz cmpsb" may read past the terminating 0 byte. */
9094 tree t1 = MEM_EXPR (src1);
9095 tree t2 = MEM_EXPR (src2);
9096 if (!((t1 && TREE_CODE (t1) == MEM_REF
9097 && TREE_CODE (TREE_OPERAND (t1, 0)) == ADDR_EXPR
9098 && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t1, 0), 0))
9099 == STRING_CST))
9100 || (t2 && TREE_CODE (t2) == MEM_REF
9101 && TREE_CODE (TREE_OPERAND (t2, 0)) == ADDR_EXPR
9102 && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t2, 0), 0))
9103 == STRING_CST))))
9104 return false;
9105 }
9106
9107 rtx addr1 = copy_addr_to_reg (XEXP (src1, 0));
9108 rtx addr2 = copy_addr_to_reg (XEXP (src2, 0));
9109 if (addr1 != XEXP (src1, 0))
9110 src1 = replace_equiv_address_nv (src1, addr1);
9111 if (addr2 != XEXP (src2, 0))
9112 src2 = replace_equiv_address_nv (src2, addr2);
9113
9114 /* NB: Make a copy of the data length to avoid changing the original
9115 data length by cmpstrnqi patterns. */
9116 length = ix86_zero_extend_to_Pmode (length);
9117 rtx lengthreg = gen_reg_rtx (Pmode);
9118 emit_move_insn (lengthreg, length);
9119
9120 /* If we are testing strict equality, we can use known alignment to
9121 good advantage. This may be possible with combine, particularly
9122 once cc0 is dead. */
9123 if (CONST_INT_P (length))
9124 {
9125 if (length == const0_rtx)
9126 {
9127 emit_move_insn (result, const0_rtx);
9128 return true;
9129 }
9130 emit_insn (gen_cmpstrnqi_nz_1 (addr1, addr2, lengthreg, align,
9131 src1, src2));
9132 }
9133 else
9134 {
9135 emit_insn (gen_cmp_1 (Pmode, lengthreg, lengthreg));
9136 emit_insn (gen_cmpstrnqi_1 (addr1, addr2, lengthreg, align,
9137 src1, src2));
9138 }
9139
9140 rtx out = gen_lowpart (QImode, result);
9141 emit_insn (gen_cmpintqi (out));
9142 emit_move_insn (result, gen_rtx_SIGN_EXTEND (SImode, out));
9143
9144 return true;
9145 }
9146
9147 /* Expand the appropriate insns for doing strlen if not just doing
9148 repnz; scasb
9149
9150 out = result, initialized with the start address
9151 align_rtx = alignment of the address.
9152 scratch = scratch register, initialized with the start address when
9153 not aligned, otherwise undefined
9154
9155 This is just the body. It needs the initializations mentioned above and
9156 some address computing at the end. These things are done in i386.md. */
9157
9158 static void
9159 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
9160 {
9161 int align;
9162 rtx tmp;
9163 rtx_code_label *align_2_label = NULL;
9164 rtx_code_label *align_3_label = NULL;
9165 rtx_code_label *align_4_label = gen_label_rtx ();
9166 rtx_code_label *end_0_label = gen_label_rtx ();
9167 rtx mem;
9168 rtx tmpreg = gen_reg_rtx (SImode);
9169 rtx scratch = gen_reg_rtx (SImode);
9170 rtx cmp;
9171
9172 align = 0;
9173 if (CONST_INT_P (align_rtx))
9174 align = INTVAL (align_rtx);
9175
9176 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
9177
9178 /* Is there a known alignment and is it less than 4? */
9179 if (align < 4)
9180 {
9181 rtx scratch1 = gen_reg_rtx (Pmode);
9182 emit_move_insn (scratch1, out);
9183 /* Is there a known alignment and is it not 2? */
9184 if (align != 2)
9185 {
9186 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
9187 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
9188
9189 /* Leave just the two lower bits. */
9190 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
9191 NULL_RTX, 0, OPTAB_WIDEN);
9192
9193 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
9194 Pmode, 1, align_4_label);
9195 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
9196 Pmode, 1, align_2_label);
9197 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
9198 Pmode, 1, align_3_label);
9199 }
9200 else
9201 {
9202 /* Since the alignment is 2, we have to check 2 or 0 bytes;
9203 check if it is aligned to 4 bytes. */
9204
9205 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
9206 NULL_RTX, 0, OPTAB_WIDEN);
9207
9208 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
9209 Pmode, 1, align_4_label);
9210 }
9211
9212 mem = change_address (src, QImode, out);
9213
9214 /* Now compare the bytes. */
9215
9216 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
9217 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
9218 QImode, 1, end_0_label);
9219
9220 /* Increment the address. */
9221 emit_insn (gen_add2_insn (out, const1_rtx));
9222
9223 /* Not needed with an alignment of 2 */
9224 if (align != 2)
9225 {
9226 emit_label (align_2_label);
9227
9228 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
9229 end_0_label);
9230
9231 emit_insn (gen_add2_insn (out, const1_rtx));
9232
9233 emit_label (align_3_label);
9234 }
9235
9236 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
9237 end_0_label);
9238
9239 emit_insn (gen_add2_insn (out, const1_rtx));
9240 }
9241
9242 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
9243 align this loop; it only makes the program bigger and does not help
9244 to speed it up. */
9245 emit_label (align_4_label);
9246
9247 mem = change_address (src, SImode, out);
9248 emit_move_insn (scratch, mem);
9249 emit_insn (gen_add2_insn (out, GEN_INT (4)));
9250
9251 /* This formula yields a nonzero result iff one of the bytes is zero.
9252 This saves three branches inside the loop and many cycles. */
9253
9254 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
9255 emit_insn (gen_one_cmplsi2 (scratch, scratch));
9256 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
9257 emit_insn (gen_andsi3 (tmpreg, tmpreg,
9258 gen_int_mode (0x80808080, SImode)));
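/* Illustration of the formula with hypothetical values: for
   scratch == 0x11003344 (which contains a zero byte), the computed
   tmpreg is (0x11003344 - 0x01010101) & ~0x11003344 & 0x80808080
   == 0x0FFF3243 & 0xEEFFCCBB & 0x80808080 == 0x00800000, which is
   nonzero, so the loop exits; for scratch == 0x11223344 the same
   expression is 0 and the loop continues.  */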
9259 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
9260 align_4_label);
9261
9262 if (TARGET_CMOVE)
9263 {
9264 rtx reg = gen_reg_rtx (SImode);
9265 rtx reg2 = gen_reg_rtx (Pmode);
9266 emit_move_insn (reg, tmpreg);
9267 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
9268
9269 /* If zero is not in the first two bytes, move two bytes forward. */
9270 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
9271 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
9272 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
9273 emit_insn (gen_rtx_SET (tmpreg,
9274 gen_rtx_IF_THEN_ELSE (SImode, tmp,
9275 reg,
9276 tmpreg)));
9277 /* Emit lea manually to avoid clobbering of flags. */
9278 emit_insn (gen_rtx_SET (reg2, plus_constant (Pmode, out, 2)));
9279
9280 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
9281 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
9282 emit_insn (gen_rtx_SET (out,
9283 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
9284 reg2,
9285 out)));
9286 }
9287 else
9288 {
9289 rtx_code_label *end_2_label = gen_label_rtx ();
9290 /* Is zero in the first two bytes? */
9291
9292 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
9293 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
9294 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
9295 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
9296 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
9297 pc_rtx);
9298 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
9299 JUMP_LABEL (tmp) = end_2_label;
9300
9301 /* Not in the first two. Move two bytes forward. */
9302 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
9303 emit_insn (gen_add2_insn (out, const2_rtx));
9304
9305 emit_label (end_2_label);
9306
9307 }
9308
9309 /* Avoid a branch when fixing up the byte position. */
9310 tmpreg = gen_lowpart (QImode, tmpreg);
9311 emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
9312 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
9313 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
9314 emit_insn (gen_sub3_carry (Pmode, out, out, GEN_INT (3), tmp, cmp));
9315
9316 emit_label (end_0_label);
9317 }
9318
9319 /* Expand strlen. */
9320
9321 bool
9322 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
9323 {
9324 if (TARGET_UNROLL_STRLEN
9325 && TARGET_INLINE_ALL_STRINGOPS
9326 && eoschar == const0_rtx
9327 && optimize > 1)
9328 {
9329 /* The generic case of the strlen expander is long. Avoid
9330 expanding it unless TARGET_INLINE_ALL_STRINGOPS. */
9331 rtx addr = force_reg (Pmode, XEXP (src, 0));
9332 /* Well, it seems that some optimizer does not combine a call like
9333 foo(strlen(bar), strlen(bar));
9334 when the move and the subtraction are done here. It does calculate
9335 the length just once when these instructions are done inside
9336 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
9337 often used and I use one fewer register for the lifetime of
9338 output_strlen_unroll() this is better. */
9339
9340 emit_move_insn (out, addr);
9341
9342 ix86_expand_strlensi_unroll_1 (out, src, align);
9343
9344 /* strlensi_unroll_1 returns the address of the zero at the end of
9345 the string, like memchr(), so compute the length by subtracting
9346 the start address. */
9347 emit_insn (gen_sub2_insn (out, addr));
9348 return true;
9349 }
9350 else
9351 return false;
9352 }
9353
9354 /* For a given symbol (function) construct code to compute the address of its
9355 PLT entry in the large x86-64 PIC model. */
9356
9357 static rtx
9358 construct_plt_address (rtx symbol)
9359 {
9360 rtx tmp, unspec;
9361
9362 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
9363 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
9364 gcc_assert (Pmode == DImode);
9365
9366 tmp = gen_reg_rtx (Pmode);
9367 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
9368
9369 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
9370 emit_insn (gen_add2_insn (tmp, pic_offset_table_rtx));
9371 return tmp;
9372 }
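/* The code built above has roughly the following shape (a sketch of the
   resulting RTL, not a literal dump; "foo" stands for the SYMBOL passed in):

     (set (reg tmp) (const (unspec [(symbol_ref "foo")] UNSPEC_PLTOFF)))
     (set (reg tmp) (plus (reg tmp) (reg pic_offset_table)))

   i.e. the PLT slot address is formed as the GOT base plus the PLTOFF
   of the symbol.  */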
9373
9374 /* Additional registers that are clobbered by SYSV calls. */
9375
9376 static int const x86_64_ms_sysv_extra_clobbered_registers
9377 [NUM_X86_64_MS_CLOBBERED_REGS] =
9378 {
9379 SI_REG, DI_REG,
9380 XMM6_REG, XMM7_REG,
9381 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
9382 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
9383 };
9384
9385 rtx_insn *
9386 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
9387 rtx callarg2,
9388 rtx pop, bool sibcall)
9389 {
9390 rtx vec[3];
9391 rtx use = NULL, call;
9392 unsigned int vec_len = 0;
9393 tree fndecl;
9394
9395 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
9396 {
9397 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
9398 if (fndecl
9399 && (lookup_attribute ("interrupt",
9400 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
9401 error ("interrupt service routine cannot be called directly");
9402 }
9403 else
9404 fndecl = NULL_TREE;
9405
9406 if (pop == const0_rtx)
9407 pop = NULL;
9408 gcc_assert (!TARGET_64BIT || !pop);
9409
9410 rtx addr = XEXP (fnaddr, 0);
9411 if (TARGET_MACHO && !TARGET_64BIT)
9412 {
9413 #if TARGET_MACHO
9414 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
9415 fnaddr = machopic_indirect_call_target (fnaddr);
9416 #endif
9417 }
9418 else
9419 {
9420 /* Static functions and indirect calls don't need the pic register. Also,
9421 check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
9422 it an indirect call. */
9423 if (flag_pic
9424 && GET_CODE (addr) == SYMBOL_REF
9425 && ix86_call_use_plt_p (addr))
9426 {
9427 if (flag_plt
9428 && (SYMBOL_REF_DECL (addr) == NULL_TREE
9429 || !lookup_attribute ("noplt",
9430 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
9431 {
9432 if (!TARGET_64BIT
9433 || (ix86_cmodel == CM_LARGE_PIC
9434 && DEFAULT_ABI != MS_ABI))
9435 {
9436 use_reg (&use, gen_rtx_REG (Pmode,
9437 REAL_PIC_OFFSET_TABLE_REGNUM));
9438 if (ix86_use_pseudo_pic_reg ())
9439 emit_move_insn (gen_rtx_REG (Pmode,
9440 REAL_PIC_OFFSET_TABLE_REGNUM),
9441 pic_offset_table_rtx);
9442 }
9443 }
9444 else if (!TARGET_PECOFF && !TARGET_MACHO)
9445 {
9446 if (TARGET_64BIT
9447 && ix86_cmodel == CM_LARGE_PIC
9448 && DEFAULT_ABI != MS_ABI)
9449 {
9450 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
9451 UNSPEC_GOT);
9452 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
9453 fnaddr = force_reg (Pmode, fnaddr);
9454 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, fnaddr);
9455 }
9456 else if (TARGET_64BIT)
9457 {
9458 fnaddr = gen_rtx_UNSPEC (Pmode,
9459 gen_rtvec (1, addr),
9460 UNSPEC_GOTPCREL);
9461 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
9462 }
9463 else
9464 {
9465 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
9466 UNSPEC_GOT);
9467 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
9468 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
9469 fnaddr);
9470 }
9471 fnaddr = gen_const_mem (Pmode, fnaddr);
9472 /* Pmode may not be the same as word_mode for x32, which
9473 doesn't support indirect branch via 32-bit memory slot.
9474 Since x32 GOT slot is 64 bit with zero upper 32 bits,
9475 indirect branch via x32 GOT slot is OK. */
9476 if (GET_MODE (fnaddr) != word_mode)
9477 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
9478 fnaddr = gen_rtx_MEM (QImode, fnaddr);
9479 }
9480 }
9481 }
9482
9483 /* Skip setting up RAX register for -mskip-rax-setup when there are no
9484 parameters passed in vector registers. */
9485 if (TARGET_64BIT
9486 && (INTVAL (callarg2) > 0
9487 || (INTVAL (callarg2) == 0
9488 && (TARGET_SSE || !flag_skip_rax_setup))))
9489 {
9490 rtx al = gen_rtx_REG (QImode, AX_REG);
9491 emit_move_insn (al, callarg2);
9492 use_reg (&use, al);
9493 }
9494
9495 if (ix86_cmodel == CM_LARGE_PIC
9496 && !TARGET_PECOFF
9497 && MEM_P (fnaddr)
9498 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
9499 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
9500 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
9501 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
9502 branch via x32 GOT slot is OK. */
9503 else if (!(TARGET_X32
9504 && MEM_P (fnaddr)
9505 && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
9506 && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
9507 && (sibcall
9508 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
9509 : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
9510 {
9511 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
9512 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
9513 }
9514
9515   /* PR100665: HWASAN may tag a code pointer, which is not supported by LAM,
9516      so mask off code pointers here.
9517      TODO: indirect jumps also need to be handled.  */
9518 if (ix86_memtag_can_tag_addresses () && !fndecl
9519 && sanitize_flags_p (SANITIZE_HWADDRESS))
9520 {
9521 rtx untagged_addr = ix86_memtag_untagged_pointer (XEXP (fnaddr, 0),
9522 NULL_RTX);
9523 fnaddr = gen_rtx_MEM (QImode, untagged_addr);
9524 }
9525
9526 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
9527
9528 if (retval)
9529 call = gen_rtx_SET (retval, call);
9530 vec[vec_len++] = call;
9531
9532 if (pop)
9533 {
9534 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
9535 pop = gen_rtx_SET (stack_pointer_rtx, pop);
9536 vec[vec_len++] = pop;
9537 }
9538
9539 if (cfun->machine->no_caller_saved_registers
9540 && (!fndecl
9541 || (!TREE_THIS_VOLATILE (fndecl)
9542 && !lookup_attribute ("no_caller_saved_registers",
9543 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
9544 {
9545 static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
9546 bool is_64bit_ms_abi = (TARGET_64BIT
9547 && ix86_function_abi (fndecl) == MS_ABI);
9548 char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
9549
9550       /* If there are no caller-saved registers, add all registers
9551          that are clobbered by a call that returns.  */
9552 for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
9553 if (!fixed_regs[i]
9554 && (ix86_call_used_regs[i] == 1
9555 || (ix86_call_used_regs[i] & c_mask))
9556 && !STACK_REGNO_P (i)
9557 && !MMX_REGNO_P (i))
9558 clobber_reg (&use,
9559 gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
9560 }
9561 else if (TARGET_64BIT_MS_ABI
9562 && (!callarg2 || INTVAL (callarg2) != -2))
9563 {
9564 unsigned i;
9565
9566 for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
9567 {
9568 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
9569 machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
9570
9571 clobber_reg (&use, gen_rtx_REG (mode, regno));
9572 }
9573
9574 /* Set here, but it may get cleared later. */
9575 if (TARGET_CALL_MS2SYSV_XLOGUES)
9576 {
9577 if (!TARGET_SSE)
9578 ;
9579
9580 /* Don't break hot-patched functions. */
9581 else if (ix86_function_ms_hook_prologue (current_function_decl))
9582 ;
9583
9584 /* TODO: Cases not yet examined. */
9585 else if (flag_split_stack)
9586 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
9587
9588 else
9589 {
9590 gcc_assert (!reload_completed);
9591 cfun->machine->call_ms2sysv = true;
9592 }
9593 }
9594 }
9595
9596 if (TARGET_MACHO && TARGET_64BIT && !sibcall
9597 && ((GET_CODE (addr) == SYMBOL_REF && !SYMBOL_REF_LOCAL_P (addr))
9598 || !fndecl || TREE_PUBLIC (fndecl)))
9599 {
9600 /* We allow public functions defined in a TU to bind locally for PIC
9601 code (the default) on 64bit Mach-O.
9602 If such functions are not inlined, we cannot tell at compile-time if
9603 they will be called via the lazy symbol resolver (this can depend on
9604 options given at link-time). Therefore, we must assume that the lazy
9605 resolver could be used which clobbers R11 and R10. */
9606 clobber_reg (&use, gen_rtx_REG (DImode, R11_REG));
9607 clobber_reg (&use, gen_rtx_REG (DImode, R10_REG));
9608 }
9609
9610 if (vec_len > 1)
9611 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
9612 rtx_insn *call_insn = emit_call_insn (call);
9613 if (use)
9614 CALL_INSN_FUNCTION_USAGE (call_insn) = use;
9615
9616 return call_insn;
9617 }
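
/* As one concrete shape, a 64-bit PIC call to an external function foo
   that must avoid the PLT ends up as an indirect call through foo's GOT
   slot, roughly

       (call (mem:QI (mem:DI (const:DI (unspec:DI [(symbol_ref "foo")]
                                                  UNSPEC_GOTPCREL))))
             (const_int 0))

   wrapped in a PARALLEL with the stack-pop SET when POP is given, and
   with any uses and clobbers recorded in CALL_INSN_FUNCTION_USAGE.  */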
9618
9619 /* Split a simple return that pops POPC bytes from the stack into an
9620    indirect branch plus a stack adjustment.  */
9621
9622 void
9623 ix86_split_simple_return_pop_internal (rtx popc)
9624 {
9625 struct machine_function *m = cfun->machine;
9626 rtx ecx = gen_rtx_REG (SImode, CX_REG);
9627 rtx_insn *insn;
9628
9629 /* There is no "pascal" calling convention in any 64bit ABI. */
9630 gcc_assert (!TARGET_64BIT);
9631
9632 insn = emit_insn (gen_pop (ecx));
9633 m->fs.cfa_offset -= UNITS_PER_WORD;
9634 m->fs.sp_offset -= UNITS_PER_WORD;
9635
9636 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
9637 x = gen_rtx_SET (stack_pointer_rtx, x);
9638 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
9639 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
9640 RTX_FRAME_RELATED_P (insn) = 1;
9641
9642 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc);
9643 x = gen_rtx_SET (stack_pointer_rtx, x);
9644 insn = emit_insn (x);
9645 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
9646 RTX_FRAME_RELATED_P (insn) = 1;
9647
9648 /* Now return address is in ECX. */
9649 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
9650 }
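
/* The effect corresponds roughly to rewriting "ret $N" as

       popl  %ecx          # return address -> %ecx
       addl  $N, %esp      # drop the N bytes of stack arguments
       jmp   *%ecx         # return

   with REG_CFA notes attached so the unwinder can still locate the
   return address while it lives in %ecx.  */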
9651
9652 /* Errors in the source file can cause expand_expr to return const0_rtx
9653 where we expect a vector. To avoid crashing, use one of the vector
9654 clear instructions. */
9655
9656 static rtx
9657 safe_vector_operand (rtx x, machine_mode mode)
9658 {
9659 if (x == const0_rtx)
9660 x = CONST0_RTX (mode);
9661 return x;
9662 }
9663
9664 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
9665
9666 static rtx
9667 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
9668 {
9669 rtx pat;
9670 tree arg0 = CALL_EXPR_ARG (exp, 0);
9671 tree arg1 = CALL_EXPR_ARG (exp, 1);
9672 rtx op0 = expand_normal (arg0);
9673 rtx op1 = expand_normal (arg1);
9674 machine_mode tmode = insn_data[icode].operand[0].mode;
9675 machine_mode mode0 = insn_data[icode].operand[1].mode;
9676 machine_mode mode1 = insn_data[icode].operand[2].mode;
9677
9678 if (VECTOR_MODE_P (mode0))
9679 op0 = safe_vector_operand (op0, mode0);
9680 if (VECTOR_MODE_P (mode1))
9681 op1 = safe_vector_operand (op1, mode1);
9682
9683 if (optimize || !target
9684 || GET_MODE (target) != tmode
9685 || !insn_data[icode].operand[0].predicate (target, tmode))
9686 target = gen_reg_rtx (tmode);
9687
9688 if (GET_MODE (op1) == SImode && mode1 == TImode)
9689 {
9690 rtx x = gen_reg_rtx (V4SImode);
9691 emit_insn (gen_sse2_loadd (x, op1));
9692 op1 = gen_lowpart (TImode, x);
9693 }
9694
9695 if (!insn_data[icode].operand[1].predicate (op0, mode0))
9696 op0 = copy_to_mode_reg (mode0, op0);
9697 if (!insn_data[icode].operand[2].predicate (op1, mode1))
9698 op1 = copy_to_mode_reg (mode1, op1);
9699
9700 pat = GEN_FCN (icode) (target, op0, op1);
9701 if (! pat)
9702 return 0;
9703
9704 emit_insn (pat);
9705
9706 return target;
9707 }
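
/* For example, a hypothetical two-operand builtin whose insn pattern is a
   V4SI addition would reach this helper with ICODE naming that pattern
   and be emitted as roughly

       (set (reg:V4SI target) (plus:V4SI (reg:V4SI op0) (reg:V4SI op1)))

   after any operand that fails its predicate has been copied into a
   register, and after an SImode operand destined for a TImode slot has
   been loaded through a V4SI register.  */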
9708
9709 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
9710
9711 static rtx
9712 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
9713 enum ix86_builtin_func_type m_type,
9714 enum rtx_code sub_code)
9715 {
9716 rtx pat;
9717 unsigned int i, nargs;
9718 bool comparison_p = false;
9719 bool tf_p = false;
9720 bool last_arg_constant = false;
9721 int num_memory = 0;
9722 rtx xops[4];
9723
9724 machine_mode tmode = insn_data[icode].operand[0].mode;
9725
9726 switch (m_type)
9727 {
9728 case MULTI_ARG_4_DF2_DI_I:
9729 case MULTI_ARG_4_DF2_DI_I1:
9730 case MULTI_ARG_4_SF2_SI_I:
9731 case MULTI_ARG_4_SF2_SI_I1:
9732 nargs = 4;
9733 last_arg_constant = true;
9734 break;
9735
9736 case MULTI_ARG_3_SF:
9737 case MULTI_ARG_3_DF:
9738 case MULTI_ARG_3_SF2:
9739 case MULTI_ARG_3_DF2:
9740 case MULTI_ARG_3_DI:
9741 case MULTI_ARG_3_SI:
9742 case MULTI_ARG_3_SI_DI:
9743 case MULTI_ARG_3_HI:
9744 case MULTI_ARG_3_HI_SI:
9745 case MULTI_ARG_3_QI:
9746 case MULTI_ARG_3_DI2:
9747 case MULTI_ARG_3_SI2:
9748 case MULTI_ARG_3_HI2:
9749 case MULTI_ARG_3_QI2:
9750 nargs = 3;
9751 break;
9752
9753 case MULTI_ARG_2_SF:
9754 case MULTI_ARG_2_DF:
9755 case MULTI_ARG_2_DI:
9756 case MULTI_ARG_2_SI:
9757 case MULTI_ARG_2_HI:
9758 case MULTI_ARG_2_QI:
9759 nargs = 2;
9760 break;
9761
9762 case MULTI_ARG_2_DI_IMM:
9763 case MULTI_ARG_2_SI_IMM:
9764 case MULTI_ARG_2_HI_IMM:
9765 case MULTI_ARG_2_QI_IMM:
9766 nargs = 2;
9767 last_arg_constant = true;
9768 break;
9769
9770 case MULTI_ARG_1_SF:
9771 case MULTI_ARG_1_DF:
9772 case MULTI_ARG_1_SF2:
9773 case MULTI_ARG_1_DF2:
9774 case MULTI_ARG_1_DI:
9775 case MULTI_ARG_1_SI:
9776 case MULTI_ARG_1_HI:
9777 case MULTI_ARG_1_QI:
9778 case MULTI_ARG_1_SI_DI:
9779 case MULTI_ARG_1_HI_DI:
9780 case MULTI_ARG_1_HI_SI:
9781 case MULTI_ARG_1_QI_DI:
9782 case MULTI_ARG_1_QI_SI:
9783 case MULTI_ARG_1_QI_HI:
9784 nargs = 1;
9785 break;
9786
9787 case MULTI_ARG_2_DI_CMP:
9788 case MULTI_ARG_2_SI_CMP:
9789 case MULTI_ARG_2_HI_CMP:
9790 case MULTI_ARG_2_QI_CMP:
9791 nargs = 2;
9792 comparison_p = true;
9793 break;
9794
9795 case MULTI_ARG_2_SF_TF:
9796 case MULTI_ARG_2_DF_TF:
9797 case MULTI_ARG_2_DI_TF:
9798 case MULTI_ARG_2_SI_TF:
9799 case MULTI_ARG_2_HI_TF:
9800 case MULTI_ARG_2_QI_TF:
9801 nargs = 2;
9802 tf_p = true;
9803 break;
9804
9805 default:
9806 gcc_unreachable ();
9807 }
9808
9809 if (optimize || !target
9810 || GET_MODE (target) != tmode
9811 || !insn_data[icode].operand[0].predicate (target, tmode))
9812 target = gen_reg_rtx (tmode);
9813 else if (memory_operand (target, tmode))
9814 num_memory++;
9815
9816 gcc_assert (nargs <= ARRAY_SIZE (xops));
9817
9818 for (i = 0; i < nargs; i++)
9819 {
9820 tree arg = CALL_EXPR_ARG (exp, i);
9821 rtx op = expand_normal (arg);
9822 int adjust = (comparison_p) ? 1 : 0;
9823 machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
9824
9825 if (last_arg_constant && i == nargs - 1)
9826 {
9827 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
9828 {
9829 enum insn_code new_icode = icode;
9830 switch (icode)
9831 {
9832 case CODE_FOR_xop_vpermil2v2df3:
9833 case CODE_FOR_xop_vpermil2v4sf3:
9834 case CODE_FOR_xop_vpermil2v4df3:
9835 case CODE_FOR_xop_vpermil2v8sf3:
9836 error ("the last argument must be a 2-bit immediate");
9837 return gen_reg_rtx (tmode);
9838 case CODE_FOR_xop_rotlv2di3:
9839 new_icode = CODE_FOR_rotlv2di3;
9840 goto xop_rotl;
9841 case CODE_FOR_xop_rotlv4si3:
9842 new_icode = CODE_FOR_rotlv4si3;
9843 goto xop_rotl;
9844 case CODE_FOR_xop_rotlv8hi3:
9845 new_icode = CODE_FOR_rotlv8hi3;
9846 goto xop_rotl;
9847 case CODE_FOR_xop_rotlv16qi3:
9848 new_icode = CODE_FOR_rotlv16qi3;
9849 xop_rotl:
9850 if (CONST_INT_P (op))
9851 {
9852 int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
9853 op = GEN_INT (INTVAL (op) & mask);
9854 gcc_checking_assert
9855 (insn_data[icode].operand[i + 1].predicate (op, mode));
9856 }
9857 else
9858 {
9859 gcc_checking_assert
9860 (nargs == 2
9861 && insn_data[new_icode].operand[0].mode == tmode
9862 && insn_data[new_icode].operand[1].mode == tmode
9863 && insn_data[new_icode].operand[2].mode == mode
9864 && insn_data[new_icode].operand[0].predicate
9865 == insn_data[icode].operand[0].predicate
9866 && insn_data[new_icode].operand[1].predicate
9867 == insn_data[icode].operand[1].predicate);
9868 icode = new_icode;
9869 goto non_constant;
9870 }
9871 break;
9872 default:
9873 gcc_unreachable ();
9874 }
9875 }
9876 }
9877 else
9878 {
9879 non_constant:
9880 if (VECTOR_MODE_P (mode))
9881 op = safe_vector_operand (op, mode);
9882
9883 /* If we aren't optimizing, only allow one memory operand to be
9884 generated. */
9885 if (memory_operand (op, mode))
9886 num_memory++;
9887
9888 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
9889
9890 if (optimize
9891 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
9892 || num_memory > 1)
9893 op = force_reg (mode, op);
9894 }
9895
9896 xops[i] = op;
9897 }
9898
9899 switch (nargs)
9900 {
9901 case 1:
9902 pat = GEN_FCN (icode) (target, xops[0]);
9903 break;
9904
9905 case 2:
9906 if (tf_p)
9907 pat = GEN_FCN (icode) (target, xops[0], xops[1],
9908 GEN_INT ((int)sub_code));
9909 else if (! comparison_p)
9910 pat = GEN_FCN (icode) (target, xops[0], xops[1]);
9911 else
9912 {
9913 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
9914 xops[0], xops[1]);
9915
9916 pat = GEN_FCN (icode) (target, cmp_op, xops[0], xops[1]);
9917 }
9918 break;
9919
9920 case 3:
9921 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
9922 break;
9923
9924 case 4:
9925 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2], xops[3]);
9926 break;
9927
9928 default:
9929 gcc_unreachable ();
9930 }
9931
9932 if (! pat)
9933 return 0;
9934
9935 emit_insn (pat);
9936 return target;
9937 }
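
/* For example, an XOP rotate of V4SI elements by the constant 35 has its
   count reduced above to 35 & 31 == 3 and is emitted through the XOP
   pattern, whereas a variable count is redirected to the generic
   CODE_FOR_rotlv4si3 pattern via the non_constant path and forced into a
   register if needed.  */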
9938
9939 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
9940 insns with vec_merge. */
9941
9942 static rtx
9943 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
9944 rtx target)
9945 {
9946 rtx pat;
9947 tree arg0 = CALL_EXPR_ARG (exp, 0);
9948 rtx op1, op0 = expand_normal (arg0);
9949 machine_mode tmode = insn_data[icode].operand[0].mode;
9950 machine_mode mode0 = insn_data[icode].operand[1].mode;
9951
9952 if (optimize || !target
9953 || GET_MODE (target) != tmode
9954 || !insn_data[icode].operand[0].predicate (target, tmode))
9955 target = gen_reg_rtx (tmode);
9956
9957 if (VECTOR_MODE_P (mode0))
9958 op0 = safe_vector_operand (op0, mode0);
9959
9960 if ((optimize && !register_operand (op0, mode0))
9961 || !insn_data[icode].operand[1].predicate (op0, mode0))
9962 op0 = copy_to_mode_reg (mode0, op0);
9963
9964 op1 = op0;
9965 if (!insn_data[icode].operand[2].predicate (op1, mode0))
9966 op1 = copy_to_mode_reg (mode0, op1);
9967
9968 pat = GEN_FCN (icode) (target, op0, op1);
9969 if (! pat)
9970 return 0;
9971 emit_insn (pat);
9972 return target;
9973 }
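
/* These are the scalar "vm" patterns, which operate on element 0 and copy
   the remaining elements from a second operand (here the same register),
   giving RTL of roughly the shape

       (set (reg:V4SF target)
            (vec_merge:V4SF (sqrt:V4SF (reg:V4SF op0))
                            (reg:V4SF op0)
                            (const_int 1)))

   e.g. for an sqrtss-style builtin.  */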
9974
9975 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
9976
9977 static rtx
9978 ix86_expand_sse_compare (const struct builtin_description *d,
9979 tree exp, rtx target, bool swap)
9980 {
9981 rtx pat;
9982 tree arg0 = CALL_EXPR_ARG (exp, 0);
9983 tree arg1 = CALL_EXPR_ARG (exp, 1);
9984 rtx op0 = expand_normal (arg0);
9985 rtx op1 = expand_normal (arg1);
9986 rtx op2;
9987 machine_mode tmode = insn_data[d->icode].operand[0].mode;
9988 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
9989 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
9990 enum rtx_code comparison = d->comparison;
9991
9992 if (VECTOR_MODE_P (mode0))
9993 op0 = safe_vector_operand (op0, mode0);
9994 if (VECTOR_MODE_P (mode1))
9995 op1 = safe_vector_operand (op1, mode1);
9996
9997 /* Swap operands if we have a comparison that isn't available in
9998 hardware. */
9999 if (swap)
10000 std::swap (op0, op1);
10001
10002 if (optimize || !target
10003 || GET_MODE (target) != tmode
10004 || !insn_data[d->icode].operand[0].predicate (target, tmode))
10005 target = gen_reg_rtx (tmode);
10006
10007 if ((optimize && !register_operand (op0, mode0))
10008 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
10009 op0 = copy_to_mode_reg (mode0, op0);
10010 if ((optimize && !register_operand (op1, mode1))
10011 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
10012 op1 = copy_to_mode_reg (mode1, op1);
10013
10014 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
10015 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
10016 if (! pat)
10017 return 0;
10018 emit_insn (pat);
10019 return target;
10020 }
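
/* For instance, the greater-than variants of the legacy CMPPS/CMPPD
   builtins are typically described with the less-than comparison plus the
   SWAP flag: the operands are exchanged here and the same maskcmp pattern
   is emitted, producing an all-ones element mask wherever the (swapped)
   comparison holds.  */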
10021
10022 /* Subroutine of ix86_expand_sse_comi and ix86_expand_sse_comi_round to take
10023    care of ordered EQ or unordered NE: generate the PF jump.  */
10024
10025 static rtx
10026 ix86_ssecom_setcc (const enum rtx_code comparison,
10027 bool check_unordered, machine_mode mode,
10028 rtx set_dst, rtx target)
10029 {
10030
10031 rtx_code_label *label = NULL;
10032
10033   /* NB: For ordered EQ or unordered NE, checking ZF alone isn't sufficient
10034      when the operands may be NaN.  */
10035 if (check_unordered)
10036 {
10037 gcc_assert (comparison == EQ || comparison == NE);
10038
10039 rtx flag = gen_rtx_REG (CCFPmode, FLAGS_REG);
10040 label = gen_label_rtx ();
10041 rtx tmp = gen_rtx_fmt_ee (UNORDERED, VOIDmode, flag, const0_rtx);
10042 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
10043 gen_rtx_LABEL_REF (VOIDmode, label),
10044 pc_rtx);
10045 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
10046 }
10047
10048   /* NB: The comparison sets CCFPmode, but we check a different CCmode that
10049      is a subset of CCFPmode.  */
10050 if (GET_MODE (set_dst) != mode)
10051 {
10052 gcc_assert (mode == CCAmode || mode == CCCmode
10053 || mode == CCOmode || mode == CCPmode
10054 || mode == CCSmode || mode == CCZmode);
10055 set_dst = gen_rtx_REG (mode, FLAGS_REG);
10056 }
10057
10058 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10059 gen_rtx_fmt_ee (comparison, QImode,
10060 set_dst,
10061 const0_rtx)));
10062
10063 if (label)
10064 emit_label (label);
10065
10066 return SUBREG_REG (target);
10067 }
10068
10069 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
10070
10071 static rtx
10072 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
10073 rtx target)
10074 {
10075 rtx pat, set_dst;
10076 tree arg0 = CALL_EXPR_ARG (exp, 0);
10077 tree arg1 = CALL_EXPR_ARG (exp, 1);
10078 rtx op0 = expand_normal (arg0);
10079 rtx op1 = expand_normal (arg1);
10080 enum insn_code icode = d->icode;
10081 const struct insn_data_d *insn_p = &insn_data[icode];
10082 machine_mode mode0 = insn_p->operand[0].mode;
10083 machine_mode mode1 = insn_p->operand[1].mode;
10084
10085 if (VECTOR_MODE_P (mode0))
10086 op0 = safe_vector_operand (op0, mode0);
10087 if (VECTOR_MODE_P (mode1))
10088 op1 = safe_vector_operand (op1, mode1);
10089
10090 enum rtx_code comparison = d->comparison;
10091 rtx const_val = const0_rtx;
10092
10093 bool check_unordered = false;
10094 machine_mode mode = CCFPmode;
10095 switch (comparison)
10096 {
10097 case LE: /* -> GE */
10098 case LT: /* -> GT */
10099 std::swap (op0, op1);
10100 comparison = swap_condition (comparison);
10101 /* FALLTHRU */
10102 case GT:
10103 case GE:
10104 break;
10105 case EQ:
10106 check_unordered = true;
10107 mode = CCZmode;
10108 break;
10109 case NE:
10110 check_unordered = true;
10111 mode = CCZmode;
10112 const_val = const1_rtx;
10113 break;
10114 default:
10115 gcc_unreachable ();
10116 }
10117
10118 target = gen_reg_rtx (SImode);
10119 emit_move_insn (target, const_val);
10120 target = gen_rtx_SUBREG (QImode, target, 0);
10121
10122 if ((optimize && !register_operand (op0, mode0))
10123 || !insn_p->operand[0].predicate (op0, mode0))
10124 op0 = copy_to_mode_reg (mode0, op0);
10125 if ((optimize && !register_operand (op1, mode1))
10126 || !insn_p->operand[1].predicate (op1, mode1))
10127 op1 = copy_to_mode_reg (mode1, op1);
10128
10129 pat = GEN_FCN (icode) (op0, op1);
10130 if (! pat)
10131 return 0;
10132
10133 set_dst = SET_DEST (pat);
10134 emit_insn (pat);
10135 return ix86_ssecom_setcc (comparison, check_unordered, mode,
10136 set_dst, target);
10137 }
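
/* For ordered EQ (e.g. a comieq-style builtin) COMISS/COMISD leaves
   ZF=1, PF=0 for equal operands but ZF=PF=CF=1 for unordered ones, so
   testing ZF alone would also accept NaNs.  The expansion therefore
   looks roughly like

       xorl    %eax, %eax        # preloaded result (0 for EQ)
       comiss  %xmm1, %xmm0
       jp      .Lskip            # unordered: keep the preloaded value
       sete    %al               # ordered:   result = ZF
   .Lskip:

   so a NaN operand never reports equality.  */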
10138
10139 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
10140
10141 static rtx
10142 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
10143 rtx target)
10144 {
10145 rtx pat;
10146 tree arg0 = CALL_EXPR_ARG (exp, 0);
10147 rtx op1, op0 = expand_normal (arg0);
10148 machine_mode tmode = insn_data[d->icode].operand[0].mode;
10149 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
10150
10151 if (optimize || target == 0
10152 || GET_MODE (target) != tmode
10153 || !insn_data[d->icode].operand[0].predicate (target, tmode))
10154 target = gen_reg_rtx (tmode);
10155
10156 if (VECTOR_MODE_P (mode0))
10157 op0 = safe_vector_operand (op0, mode0);
10158
10159 if ((optimize && !register_operand (op0, mode0))
10160 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
10161 op0 = copy_to_mode_reg (mode0, op0);
10162
10163 op1 = GEN_INT (d->comparison);
10164
10165 pat = GEN_FCN (d->icode) (target, op0, op1);
10166 if (! pat)
10167 return 0;
10168 emit_insn (pat);
10169 return target;
10170 }
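
/* These builtins keep their rounding-control immediate in the descriptor's
   'comparison' field, so for the classic roundps/roundpd builtins the
   expansion is just the unary pattern with that constant appended, roughly

       (set (reg:V4SF target)
            (unspec:V4SF [(reg:V4SF op0) (const_int imm)] UNSPEC_ROUND))

   i.e. a single round-style instruction.  */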
10171
10172 static rtx
10173 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
10174 tree exp, rtx target)
10175 {
10176 rtx pat;
10177 tree arg0 = CALL_EXPR_ARG (exp, 0);
10178 tree arg1 = CALL_EXPR_ARG (exp, 1);
10179 rtx op0 = expand_normal (arg0);
10180 rtx op1 = expand_normal (arg1);
10181 rtx op2;
10182 machine_mode tmode = insn_data[d->icode].operand[0].mode;
10183 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
10184 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
10185
10186 if (optimize || target == 0
10187 || GET_MODE (target) != tmode
10188 || !insn_data[d->icode].operand[0].predicate (target, tmode))
10189 target = gen_reg_rtx (tmode);
10190
10191 op0 = safe_vector_operand (op0, mode0);
10192 op1 = safe_vector_operand (op1, mode1);
10193
10194 if ((optimize && !register_operand (op0, mode0))
10195 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
10196 op0 = copy_to_mode_reg (mode0, op0);
10197 if ((optimize && !register_operand (op1, mode1))
10198 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
10199 op1 = copy_to_mode_reg (mode1, op1);
10200
10201 op2 = GEN_INT (d->comparison);
10202
10203 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
10204 if (! pat)
10205 return 0;
10206 emit_insn (pat);
10207 return target;
10208 }
10209
10210 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
10211
10212 static rtx
10213 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
10214 rtx target)
10215 {
10216 rtx pat;
10217 tree arg0 = CALL_EXPR_ARG (exp, 0);
10218 tree arg1 = CALL_EXPR_ARG (exp, 1);
10219 rtx op0 = expand_normal (arg0);
10220 rtx op1 = expand_normal (arg1);
10221 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
10222 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
10223 enum rtx_code comparison = d->comparison;
10224
10225 if (VECTOR_MODE_P (mode0))
10226 op0 = safe_vector_operand (op0, mode0);
10227 if (VECTOR_MODE_P (mode1))
10228 op1 = safe_vector_operand (op1, mode1);
10229
10230 target = gen_reg_rtx (SImode);
10231 emit_move_insn (target, const0_rtx);
10232 target = gen_rtx_SUBREG (QImode, target, 0);
10233
10234 if ((optimize && !register_operand (op0, mode0))
10235 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
10236 op0 = copy_to_mode_reg (mode0, op0);
10237 if ((optimize && !register_operand (op1, mode1))
10238 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
10239 op1 = copy_to_mode_reg (mode1, op1);
10240
10241 pat = GEN_FCN (d->icode) (op0, op1);
10242 if (! pat)
10243 return 0;
10244 emit_insn (pat);
10245 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10246 gen_rtx_fmt_ee (comparison, QImode,
10247 SET_DEST (pat),
10248 const0_rtx)));
10249
10250 return SUBREG_REG (target);
10251 }
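
/* For example, a testz-style builtin expands here with COMPARISON == EQ:
   PTEST sets ZF when the AND of its operands is all zero and CF when the
   AND-NOT is all zero, and the result is read back from the requested
   flag, roughly

       ptest  %xmm1, %xmm0
       sete   %al               # testz: result = ZF

   with the testc/testnzc variants using the corresponding carry-based
   conditions instead.  */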
10252
10253 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
10254
10255 static rtx
10256 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
10257 tree exp, rtx target)
10258 {
10259 rtx pat;
10260 tree arg0 = CALL_EXPR_ARG (exp, 0);
10261 tree arg1 = CALL_EXPR_ARG (exp, 1);
10262 tree arg2 = CALL_EXPR_ARG (exp, 2);
10263 tree arg3 = CALL_EXPR_ARG (exp, 3);
10264 tree arg4 = CALL_EXPR_ARG (exp, 4);
10265 rtx scratch0, scratch1;
10266 rtx op0 = expand_normal (arg0);
10267 rtx op1 = expand_normal (arg1);
10268 rtx op2 = expand_normal (arg2);
10269 rtx op3 = expand_normal (arg3);
10270 rtx op4 = expand_normal (arg4);
10271 machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
10272
10273 tmode0 = insn_data[d->icode].operand[0].mode;
10274 tmode1 = insn_data[d->icode].operand[1].mode;
10275 modev2 = insn_data[d->icode].operand[2].mode;
10276 modei3 = insn_data[d->icode].operand[3].mode;
10277 modev4 = insn_data[d->icode].operand[4].mode;
10278 modei5 = insn_data[d->icode].operand[5].mode;
10279 modeimm = insn_data[d->icode].operand[6].mode;
10280
10281 if (VECTOR_MODE_P (modev2))
10282 op0 = safe_vector_operand (op0, modev2);
10283 if (VECTOR_MODE_P (modev4))
10284 op2 = safe_vector_operand (op2, modev4);
10285
10286 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
10287 op0 = copy_to_mode_reg (modev2, op0);
10288 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
10289 op1 = copy_to_mode_reg (modei3, op1);
10290 if ((optimize && !register_operand (op2, modev4))
10291 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
10292 op2 = copy_to_mode_reg (modev4, op2);
10293 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
10294 op3 = copy_to_mode_reg (modei5, op3);
10295
10296 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
10297 {
10298 error ("the fifth argument must be an 8-bit immediate");
10299 return const0_rtx;
10300 }
10301
10302 if (d->code == IX86_BUILTIN_PCMPESTRI128)
10303 {
10304 if (optimize || !target
10305 || GET_MODE (target) != tmode0
10306 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
10307 target = gen_reg_rtx (tmode0);
10308
10309 scratch1 = gen_reg_rtx (tmode1);
10310
10311 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
10312 }
10313 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
10314 {
10315 if (optimize || !target
10316 || GET_MODE (target) != tmode1
10317 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
10318 target = gen_reg_rtx (tmode1);
10319
10320 scratch0 = gen_reg_rtx (tmode0);
10321
10322 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
10323 }
10324 else
10325 {
10326 gcc_assert (d->flag);
10327
10328 scratch0 = gen_reg_rtx (tmode0);
10329 scratch1 = gen_reg_rtx (tmode1);
10330
10331 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
10332 }
10333
10334 if (! pat)
10335 return 0;
10336
10337 emit_insn (pat);
10338
10339 if (d->flag)
10340 {
10341 target = gen_reg_rtx (SImode);
10342 emit_move_insn (target, const0_rtx);
10343 target = gen_rtx_SUBREG (QImode, target, 0);
10344
10345 emit_insn
10346 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10347 gen_rtx_fmt_ee (EQ, QImode,
10348 gen_rtx_REG ((machine_mode) d->flag,
10349 FLAGS_REG),
10350 const0_rtx)));
10351 return SUBREG_REG (target);
10352 }
10353 else
10354 return target;
10355 }
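
/* For example, a pcmpestri-style builtin returns the index register
   written by the instruction (ECX), a pcmpestrm-style builtin returns the
   mask register (XMM0), and the flag variants (D->FLAG nonzero) return a
   0/1 value derived from one of the EFLAGS bits the instruction sets,
   via the setcc sequence above.  */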
10356
10357
10358 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
10359
10360 static rtx
10361 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
10362 tree exp, rtx target)
10363 {
10364 rtx pat;
10365 tree arg0 = CALL_EXPR_ARG (exp, 0);
10366 tree arg1 = CALL_EXPR_ARG (exp, 1);
10367 tree arg2 = CALL_EXPR_ARG (exp, 2);
10368 rtx scratch0, scratch1;
10369 rtx op0 = expand_normal (arg0);
10370 rtx op1 = expand_normal (arg1);
10371 rtx op2 = expand_normal (arg2);
10372 machine_mode tmode0, tmode1, modev2, modev3, modeimm;
10373
10374 tmode0 = insn_data[d->icode].operand[0].mode;
10375 tmode1 = insn_data[d->icode].operand[1].mode;
10376 modev2 = insn_data[d->icode].operand[2].mode;
10377 modev3 = insn_data[d->icode].operand[3].mode;
10378 modeimm = insn_data[d->icode].operand[4].mode;
10379
10380 if (VECTOR_MODE_P (modev2))
10381 op0 = safe_vector_operand (op0, modev2);
10382 if (VECTOR_MODE_P (modev3))
10383 op1 = safe_vector_operand (op1, modev3);
10384
10385 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
10386 op0 = copy_to_mode_reg (modev2, op0);
10387 if ((optimize && !register_operand (op1, modev3))
10388 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
10389 op1 = copy_to_mode_reg (modev3, op1);
10390
10391 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
10392 {
10393 error ("the third argument must be an 8-bit immediate");
10394 return const0_rtx;
10395 }
10396
10397 if (d->code == IX86_BUILTIN_PCMPISTRI128)
10398 {
10399 if (optimize || !target
10400 || GET_MODE (target) != tmode0
10401 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
10402 target = gen_reg_rtx (tmode0);
10403
10404 scratch1 = gen_reg_rtx (tmode1);
10405
10406 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
10407 }
10408 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
10409 {
10410 if (optimize || !target
10411 || GET_MODE (target) != tmode1
10412 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
10413 target = gen_reg_rtx (tmode1);
10414
10415 scratch0 = gen_reg_rtx (tmode0);
10416
10417 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
10418 }
10419 else
10420 {
10421 gcc_assert (d->flag);
10422
10423 scratch0 = gen_reg_rtx (tmode0);
10424 scratch1 = gen_reg_rtx (tmode1);
10425
10426 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
10427 }
10428
10429 if (! pat)
10430 return 0;
10431
10432 emit_insn (pat);
10433
10434 if (d->flag)
10435 {
10436 target = gen_reg_rtx (SImode);
10437 emit_move_insn (target, const0_rtx);
10438 target = gen_rtx_SUBREG (QImode, target, 0);
10439
10440 emit_insn
10441 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10442 gen_rtx_fmt_ee (EQ, QImode,
10443 gen_rtx_REG ((machine_mode) d->flag,
10444 FLAGS_REG),
10445 const0_rtx)));
10446 return SUBREG_REG (target);
10447 }
10448 else
10449 return target;
10450 }
10451
10452 /* Fix up modeless constants to fit the required mode.  */
10453
10454 static rtx
10455 fixup_modeless_constant (rtx x, machine_mode mode)
10456 {
10457 if (GET_MODE (x) == VOIDmode)
10458 x = convert_to_mode (mode, x, 1);
10459 return x;
10460 }
10461
10462 /* Subroutine of ix86_expand_builtin to take care of insns with a
10463    variable number of operands.  */
10464
10465 static rtx
10466 ix86_expand_args_builtin (const struct builtin_description *d,
10467 tree exp, rtx target)
10468 {
10469 rtx pat, real_target;
10470 unsigned int i, nargs;
10471 unsigned int nargs_constant = 0;
10472 unsigned int mask_pos = 0;
10473 int num_memory = 0;
10474 rtx xops[6];
10475 bool second_arg_count = false;
10476 enum insn_code icode = d->icode;
10477 const struct insn_data_d *insn_p = &insn_data[icode];
10478 machine_mode tmode = insn_p->operand[0].mode;
10479 machine_mode rmode = VOIDmode;
10480 bool swap = false;
10481 enum rtx_code comparison = d->comparison;
10482
10483 switch ((enum ix86_builtin_func_type) d->flag)
10484 {
10485 case V2DF_FTYPE_V2DF_ROUND:
10486 case V4DF_FTYPE_V4DF_ROUND:
10487 case V8DF_FTYPE_V8DF_ROUND:
10488 case V4SF_FTYPE_V4SF_ROUND:
10489 case V8SF_FTYPE_V8SF_ROUND:
10490 case V16SF_FTYPE_V16SF_ROUND:
10491 case V8HF_FTYPE_V8HF_ROUND:
10492 case V16HF_FTYPE_V16HF_ROUND:
10493 case V32HF_FTYPE_V32HF_ROUND:
10494 case V4SI_FTYPE_V4SF_ROUND:
10495 case V8SI_FTYPE_V8SF_ROUND:
10496 case V16SI_FTYPE_V16SF_ROUND:
10497 return ix86_expand_sse_round (d, exp, target);
10498 case V4SI_FTYPE_V2DF_V2DF_ROUND:
10499 case V8SI_FTYPE_V4DF_V4DF_ROUND:
10500 case V16SI_FTYPE_V8DF_V8DF_ROUND:
10501 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
10502 case INT_FTYPE_V8SF_V8SF_PTEST:
10503 case INT_FTYPE_V4DI_V4DI_PTEST:
10504 case INT_FTYPE_V4DF_V4DF_PTEST:
10505 case INT_FTYPE_V4SF_V4SF_PTEST:
10506 case INT_FTYPE_V2DI_V2DI_PTEST:
10507 case INT_FTYPE_V2DF_V2DF_PTEST:
10508 return ix86_expand_sse_ptest (d, exp, target);
10509 case FLOAT128_FTYPE_FLOAT128:
10510 case FLOAT_FTYPE_FLOAT:
10511 case FLOAT_FTYPE_BFLOAT16:
10512 case INT_FTYPE_INT:
10513 case UINT_FTYPE_UINT:
10514 case UINT16_FTYPE_UINT16:
10515 case UINT64_FTYPE_INT:
10516 case UINT64_FTYPE_UINT64:
10517 case INT64_FTYPE_INT64:
10518 case INT64_FTYPE_V4SF:
10519 case INT64_FTYPE_V2DF:
10520 case INT_FTYPE_V16QI:
10521 case INT_FTYPE_V8QI:
10522 case INT_FTYPE_V8SF:
10523 case INT_FTYPE_V4DF:
10524 case INT_FTYPE_V4SF:
10525 case INT_FTYPE_V2DF:
10526 case INT_FTYPE_V32QI:
10527 case V16QI_FTYPE_V16QI:
10528 case V8SI_FTYPE_V8SF:
10529 case V8SI_FTYPE_V4SI:
10530 case V8HI_FTYPE_V8HI:
10531 case V8HI_FTYPE_V16QI:
10532 case V8QI_FTYPE_V8QI:
10533 case V8SF_FTYPE_V8SF:
10534 case V8SF_FTYPE_V8SI:
10535 case V8SF_FTYPE_V4SF:
10536 case V8SF_FTYPE_V8HI:
10537 case V4SI_FTYPE_V4SI:
10538 case V4SI_FTYPE_V16QI:
10539 case V4SI_FTYPE_V4SF:
10540 case V4SI_FTYPE_V8SI:
10541 case V4SI_FTYPE_V8HI:
10542 case V4SI_FTYPE_V4DF:
10543 case V4SI_FTYPE_V2DF:
10544 case V4HI_FTYPE_V4HI:
10545 case V4DF_FTYPE_V4DF:
10546 case V4DF_FTYPE_V4SI:
10547 case V4DF_FTYPE_V4SF:
10548 case V4DF_FTYPE_V2DF:
10549 case V4SF_FTYPE_V4SF:
10550 case V4SF_FTYPE_V4SI:
10551 case V4SF_FTYPE_V8SF:
10552 case V4SF_FTYPE_V4DF:
10553 case V4SF_FTYPE_V8HI:
10554 case V4SF_FTYPE_V2DF:
10555 case V2DI_FTYPE_V2DI:
10556 case V2DI_FTYPE_V16QI:
10557 case V2DI_FTYPE_V8HI:
10558 case V2DI_FTYPE_V4SI:
10559 case V2DF_FTYPE_V2DF:
10560 case V2DF_FTYPE_V4SI:
10561 case V2DF_FTYPE_V4DF:
10562 case V2DF_FTYPE_V4SF:
10563 case V2DF_FTYPE_V2SI:
10564 case V2SI_FTYPE_V2SI:
10565 case V2SI_FTYPE_V4SF:
10566 case V2SI_FTYPE_V2SF:
10567 case V2SI_FTYPE_V2DF:
10568 case V2SF_FTYPE_V2SF:
10569 case V2SF_FTYPE_V2SI:
10570 case V32QI_FTYPE_V32QI:
10571 case V32QI_FTYPE_V16QI:
10572 case V16HI_FTYPE_V16HI:
10573 case V16HI_FTYPE_V8HI:
10574 case V8SI_FTYPE_V8SI:
10575 case V16HI_FTYPE_V16QI:
10576 case V8SI_FTYPE_V16QI:
10577 case V4DI_FTYPE_V16QI:
10578 case V8SI_FTYPE_V8HI:
10579 case V4DI_FTYPE_V8HI:
10580 case V4DI_FTYPE_V4SI:
10581 case V4DI_FTYPE_V2DI:
10582 case UQI_FTYPE_UQI:
10583 case UHI_FTYPE_UHI:
10584 case USI_FTYPE_USI:
10585 case USI_FTYPE_UQI:
10586 case USI_FTYPE_UHI:
10587 case UDI_FTYPE_UDI:
10588 case UHI_FTYPE_V16QI:
10589 case USI_FTYPE_V32QI:
10590 case UDI_FTYPE_V64QI:
10591 case V16QI_FTYPE_UHI:
10592 case V32QI_FTYPE_USI:
10593 case V64QI_FTYPE_UDI:
10594 case V8HI_FTYPE_UQI:
10595 case V16HI_FTYPE_UHI:
10596 case V32HI_FTYPE_USI:
10597 case V4SI_FTYPE_UQI:
10598 case V8SI_FTYPE_UQI:
10599 case V4SI_FTYPE_UHI:
10600 case V8SI_FTYPE_UHI:
10601 case UQI_FTYPE_V8HI:
10602 case UHI_FTYPE_V16HI:
10603 case USI_FTYPE_V32HI:
10604 case UQI_FTYPE_V4SI:
10605 case UQI_FTYPE_V8SI:
10606 case UHI_FTYPE_V16SI:
10607 case UQI_FTYPE_V2DI:
10608 case UQI_FTYPE_V4DI:
10609 case UQI_FTYPE_V8DI:
10610 case V16SI_FTYPE_UHI:
10611 case V2DI_FTYPE_UQI:
10612 case V4DI_FTYPE_UQI:
10613 case V16SI_FTYPE_INT:
10614 case V16SF_FTYPE_V8SF:
10615 case V16SI_FTYPE_V8SI:
10616 case V16SF_FTYPE_V4SF:
10617 case V16SI_FTYPE_V4SI:
10618 case V16SI_FTYPE_V16SF:
10619 case V16SI_FTYPE_V16SI:
10620 case V64QI_FTYPE_V64QI:
10621 case V32HI_FTYPE_V32HI:
10622 case V16SF_FTYPE_V16SF:
10623 case V8DI_FTYPE_UQI:
10624 case V8DI_FTYPE_V8DI:
10625 case V8DF_FTYPE_V4DF:
10626 case V8DF_FTYPE_V2DF:
10627 case V8DF_FTYPE_V8DF:
10628 case V4DI_FTYPE_V4DI:
10629 case V16BF_FTYPE_V16SF:
10630 case V8BF_FTYPE_V8SF:
10631 case V8BF_FTYPE_V4SF:
10632 nargs = 1;
10633 break;
10634 case V4SF_FTYPE_V4SF_VEC_MERGE:
10635 case V2DF_FTYPE_V2DF_VEC_MERGE:
10636 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
10637 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
10638 case V16QI_FTYPE_V16QI_V16QI:
10639 case V16QI_FTYPE_V8HI_V8HI:
10640 case V16HF_FTYPE_V16HF_V16HF:
10641 case V16SF_FTYPE_V16SF_V16SF:
10642 case V8QI_FTYPE_V8QI_V8QI:
10643 case V8QI_FTYPE_V4HI_V4HI:
10644 case V8HI_FTYPE_V8HI_V8HI:
10645 case V8HI_FTYPE_V16QI_V16QI:
10646 case V8HI_FTYPE_V4SI_V4SI:
10647 case V8HF_FTYPE_V8HF_V8HF:
10648 case V8SF_FTYPE_V8SF_V8SF:
10649 case V8SF_FTYPE_V8SF_V8SI:
10650 case V8DF_FTYPE_V8DF_V8DF:
10651 case V4SI_FTYPE_V4SI_V4SI:
10652 case V4SI_FTYPE_V8HI_V8HI:
10653 case V4SI_FTYPE_V2DF_V2DF:
10654 case V4HI_FTYPE_V4HI_V4HI:
10655 case V4HI_FTYPE_V8QI_V8QI:
10656 case V4HI_FTYPE_V2SI_V2SI:
10657 case V4DF_FTYPE_V4DF_V4DF:
10658 case V4DF_FTYPE_V4DF_V4DI:
10659 case V4SF_FTYPE_V4SF_V4SF:
10660 case V4SF_FTYPE_V4SF_V4SI:
10661 case V4SF_FTYPE_V4SF_V2SI:
10662 case V4SF_FTYPE_V4SF_V2DF:
10663 case V4SF_FTYPE_V4SF_UINT:
10664 case V4SF_FTYPE_V4SF_DI:
10665 case V4SF_FTYPE_V4SF_SI:
10666 case V2DI_FTYPE_V2DI_V2DI:
10667 case V2DI_FTYPE_V16QI_V16QI:
10668 case V2DI_FTYPE_V4SI_V4SI:
10669 case V2DI_FTYPE_V2DI_V16QI:
10670 case V2SI_FTYPE_V2SI_V2SI:
10671 case V2SI_FTYPE_V4HI_V4HI:
10672 case V2SI_FTYPE_V2SF_V2SF:
10673 case V2DF_FTYPE_V2DF_V2DF:
10674 case V2DF_FTYPE_V2DF_V4SF:
10675 case V2DF_FTYPE_V2DF_V2DI:
10676 case V2DF_FTYPE_V2DF_DI:
10677 case V2DF_FTYPE_V2DF_SI:
10678 case V2DF_FTYPE_V2DF_UINT:
10679 case V2SF_FTYPE_V2SF_V2SF:
10680 case V1DI_FTYPE_V1DI_V1DI:
10681 case V1DI_FTYPE_V8QI_V8QI:
10682 case V1DI_FTYPE_V2SI_V2SI:
10683 case V32QI_FTYPE_V16HI_V16HI:
10684 case V16HI_FTYPE_V8SI_V8SI:
10685 case V64QI_FTYPE_V64QI_V64QI:
10686 case V32QI_FTYPE_V32QI_V32QI:
10687 case V16HI_FTYPE_V32QI_V32QI:
10688 case V16HI_FTYPE_V16HI_V16HI:
10689 case V8SI_FTYPE_V4DF_V4DF:
10690 case V8SI_FTYPE_V8SI_V8SI:
10691 case V8SI_FTYPE_V16HI_V16HI:
10692 case V4DI_FTYPE_V4DI_V4DI:
10693 case V4DI_FTYPE_V8SI_V8SI:
10694 case V4DI_FTYPE_V32QI_V32QI:
10695 case V8DI_FTYPE_V64QI_V64QI:
10696 if (comparison == UNKNOWN)
10697 return ix86_expand_binop_builtin (icode, exp, target);
10698 nargs = 2;
10699 break;
10700 case V4SF_FTYPE_V4SF_V4SF_SWAP:
10701 case V2DF_FTYPE_V2DF_V2DF_SWAP:
10702 gcc_assert (comparison != UNKNOWN);
10703 nargs = 2;
10704 swap = true;
10705 break;
10706 case V16HI_FTYPE_V16HI_V8HI_COUNT:
10707 case V16HI_FTYPE_V16HI_SI_COUNT:
10708 case V8SI_FTYPE_V8SI_V4SI_COUNT:
10709 case V8SI_FTYPE_V8SI_SI_COUNT:
10710 case V4DI_FTYPE_V4DI_V2DI_COUNT:
10711 case V4DI_FTYPE_V4DI_INT_COUNT:
10712 case V8HI_FTYPE_V8HI_V8HI_COUNT:
10713 case V8HI_FTYPE_V8HI_SI_COUNT:
10714 case V4SI_FTYPE_V4SI_V4SI_COUNT:
10715 case V4SI_FTYPE_V4SI_SI_COUNT:
10716 case V4HI_FTYPE_V4HI_V4HI_COUNT:
10717 case V4HI_FTYPE_V4HI_SI_COUNT:
10718 case V2DI_FTYPE_V2DI_V2DI_COUNT:
10719 case V2DI_FTYPE_V2DI_SI_COUNT:
10720 case V2SI_FTYPE_V2SI_V2SI_COUNT:
10721 case V2SI_FTYPE_V2SI_SI_COUNT:
10722 case V1DI_FTYPE_V1DI_V1DI_COUNT:
10723 case V1DI_FTYPE_V1DI_SI_COUNT:
10724 nargs = 2;
10725 second_arg_count = true;
10726 break;
10727 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
10728 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
10729 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
10730 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
10731 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
10732 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
10733 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
10734 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
10735 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
10736 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
10737 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
10738 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
10739 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
10740 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
10741 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
10742 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
10743 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
10744 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
10745 nargs = 4;
10746 second_arg_count = true;
10747 break;
10748 case UINT64_FTYPE_UINT64_UINT64:
10749 case UINT_FTYPE_UINT_UINT:
10750 case UINT_FTYPE_UINT_USHORT:
10751 case UINT_FTYPE_UINT_UCHAR:
10752 case UINT16_FTYPE_UINT16_INT:
10753 case UINT8_FTYPE_UINT8_INT:
10754 case UQI_FTYPE_UQI_UQI:
10755 case UHI_FTYPE_UHI_UHI:
10756 case USI_FTYPE_USI_USI:
10757 case UDI_FTYPE_UDI_UDI:
10758 case V16SI_FTYPE_V8DF_V8DF:
10759 case V32BF_FTYPE_V16SF_V16SF:
10760 case V16BF_FTYPE_V8SF_V8SF:
10761 case V8BF_FTYPE_V4SF_V4SF:
10762 case V16BF_FTYPE_V16SF_UHI:
10763 case V8BF_FTYPE_V8SF_UQI:
10764 case V8BF_FTYPE_V4SF_UQI:
10765 nargs = 2;
10766 break;
10767 case V2DI_FTYPE_V2DI_INT_CONVERT:
10768 nargs = 2;
10769 rmode = V1TImode;
10770 nargs_constant = 1;
10771 break;
10772 case V4DI_FTYPE_V4DI_INT_CONVERT:
10773 nargs = 2;
10774 rmode = V2TImode;
10775 nargs_constant = 1;
10776 break;
10777 case V8DI_FTYPE_V8DI_INT_CONVERT:
10778 nargs = 2;
10779 rmode = V4TImode;
10780 nargs_constant = 1;
10781 break;
10782 case V8HI_FTYPE_V8HI_INT:
10783 case V8HI_FTYPE_V8SF_INT:
10784 case V16HI_FTYPE_V16SF_INT:
10785 case V8HI_FTYPE_V4SF_INT:
10786 case V8SF_FTYPE_V8SF_INT:
10787 case V4SF_FTYPE_V16SF_INT:
10788 case V16SF_FTYPE_V16SF_INT:
10789 case V4SI_FTYPE_V4SI_INT:
10790 case V4SI_FTYPE_V8SI_INT:
10791 case V4HI_FTYPE_V4HI_INT:
10792 case V4DF_FTYPE_V4DF_INT:
10793 case V4DF_FTYPE_V8DF_INT:
10794 case V4SF_FTYPE_V4SF_INT:
10795 case V4SF_FTYPE_V8SF_INT:
10796 case V2DI_FTYPE_V2DI_INT:
10797 case V2DF_FTYPE_V2DF_INT:
10798 case V2DF_FTYPE_V4DF_INT:
10799 case V16HI_FTYPE_V16HI_INT:
10800 case V8SI_FTYPE_V8SI_INT:
10801 case V16SI_FTYPE_V16SI_INT:
10802 case V4SI_FTYPE_V16SI_INT:
10803 case V4DI_FTYPE_V4DI_INT:
10804 case V2DI_FTYPE_V4DI_INT:
10805 case V4DI_FTYPE_V8DI_INT:
10806 case UQI_FTYPE_UQI_UQI_CONST:
10807 case UHI_FTYPE_UHI_UQI:
10808 case USI_FTYPE_USI_UQI:
10809 case UDI_FTYPE_UDI_UQI:
10810 nargs = 2;
10811 nargs_constant = 1;
10812 break;
10813 case V16QI_FTYPE_V16QI_V16QI_V16QI:
10814 case V8SF_FTYPE_V8SF_V8SF_V8SF:
10815 case V4DF_FTYPE_V4DF_V4DF_V4DF:
10816 case V4SF_FTYPE_V4SF_V4SF_V4SF:
10817 case V2DF_FTYPE_V2DF_V2DF_V2DF:
10818 case V32QI_FTYPE_V32QI_V32QI_V32QI:
10819 case UHI_FTYPE_V16SI_V16SI_UHI:
10820 case UQI_FTYPE_V8DI_V8DI_UQI:
10821 case V16HI_FTYPE_V16SI_V16HI_UHI:
10822 case V16QI_FTYPE_V16SI_V16QI_UHI:
10823 case V16QI_FTYPE_V8DI_V16QI_UQI:
10824 case V32HF_FTYPE_V32HF_V32HF_USI:
10825 case V16SF_FTYPE_V16SF_V16SF_UHI:
10826 case V16SF_FTYPE_V4SF_V16SF_UHI:
10827 case V16SI_FTYPE_SI_V16SI_UHI:
10828 case V16SI_FTYPE_V16HI_V16SI_UHI:
10829 case V16SI_FTYPE_V16QI_V16SI_UHI:
10830 case V8SF_FTYPE_V4SF_V8SF_UQI:
10831 case V4DF_FTYPE_V2DF_V4DF_UQI:
10832 case V8SI_FTYPE_V4SI_V8SI_UQI:
10833 case V8SI_FTYPE_SI_V8SI_UQI:
10834 case V4SI_FTYPE_V4SI_V4SI_UQI:
10835 case V4SI_FTYPE_SI_V4SI_UQI:
10836 case V4DI_FTYPE_V2DI_V4DI_UQI:
10837 case V4DI_FTYPE_DI_V4DI_UQI:
10838 case V2DI_FTYPE_V2DI_V2DI_UQI:
10839 case V2DI_FTYPE_DI_V2DI_UQI:
10840 case V64QI_FTYPE_V64QI_V64QI_UDI:
10841 case V64QI_FTYPE_V16QI_V64QI_UDI:
10842 case V64QI_FTYPE_QI_V64QI_UDI:
10843 case V32QI_FTYPE_V32QI_V32QI_USI:
10844 case V32QI_FTYPE_V16QI_V32QI_USI:
10845 case V32QI_FTYPE_QI_V32QI_USI:
10846 case V16QI_FTYPE_V16QI_V16QI_UHI:
10847 case V16QI_FTYPE_QI_V16QI_UHI:
10848 case V32HI_FTYPE_V8HI_V32HI_USI:
10849 case V32HI_FTYPE_HI_V32HI_USI:
10850 case V16HI_FTYPE_V8HI_V16HI_UHI:
10851 case V16HI_FTYPE_HI_V16HI_UHI:
10852 case V8HI_FTYPE_V8HI_V8HI_UQI:
10853 case V8HI_FTYPE_HI_V8HI_UQI:
10854 case V16HF_FTYPE_V16HF_V16HF_UHI:
10855 case V8SF_FTYPE_V8HI_V8SF_UQI:
10856 case V4SF_FTYPE_V8HI_V4SF_UQI:
10857 case V8SI_FTYPE_V8HF_V8SI_UQI:
10858 case V8SF_FTYPE_V8HF_V8SF_UQI:
10859 case V8SI_FTYPE_V8SF_V8SI_UQI:
10860 case V4SI_FTYPE_V4SF_V4SI_UQI:
10861 case V4SI_FTYPE_V8HF_V4SI_UQI:
10862 case V4SF_FTYPE_V8HF_V4SF_UQI:
10863 case V4DI_FTYPE_V8HF_V4DI_UQI:
10864 case V4DI_FTYPE_V4SF_V4DI_UQI:
10865 case V2DI_FTYPE_V8HF_V2DI_UQI:
10866 case V2DI_FTYPE_V4SF_V2DI_UQI:
10867 case V8HF_FTYPE_V8HF_V8HF_UQI:
10868 case V8HF_FTYPE_V8HF_V8HF_V8HF:
10869 case V8HF_FTYPE_V8HI_V8HF_UQI:
10870 case V8HF_FTYPE_V8SI_V8HF_UQI:
10871 case V8HF_FTYPE_V8SF_V8HF_UQI:
10872 case V8HF_FTYPE_V4SI_V8HF_UQI:
10873 case V8HF_FTYPE_V4SF_V8HF_UQI:
10874 case V8HF_FTYPE_V4DI_V8HF_UQI:
10875 case V8HF_FTYPE_V4DF_V8HF_UQI:
10876 case V8HF_FTYPE_V2DI_V8HF_UQI:
10877 case V8HF_FTYPE_V2DF_V8HF_UQI:
10878 case V4SF_FTYPE_V4DI_V4SF_UQI:
10879 case V4SF_FTYPE_V2DI_V4SF_UQI:
10880 case V4DF_FTYPE_V4DI_V4DF_UQI:
10881 case V4DF_FTYPE_V8HF_V4DF_UQI:
10882 case V2DF_FTYPE_V8HF_V2DF_UQI:
10883 case V2DF_FTYPE_V2DI_V2DF_UQI:
10884 case V16QI_FTYPE_V8HI_V16QI_UQI:
10885 case V16QI_FTYPE_V16HI_V16QI_UHI:
10886 case V16QI_FTYPE_V4SI_V16QI_UQI:
10887 case V16QI_FTYPE_V8SI_V16QI_UQI:
10888 case V8HI_FTYPE_V8HF_V8HI_UQI:
10889 case V8HI_FTYPE_V4SI_V8HI_UQI:
10890 case V8HI_FTYPE_V8SI_V8HI_UQI:
10891 case V16QI_FTYPE_V2DI_V16QI_UQI:
10892 case V16QI_FTYPE_V4DI_V16QI_UQI:
10893 case V8HI_FTYPE_V2DI_V8HI_UQI:
10894 case V8HI_FTYPE_V4DI_V8HI_UQI:
10895 case V4SI_FTYPE_V2DI_V4SI_UQI:
10896 case V4SI_FTYPE_V4DI_V4SI_UQI:
10897 case V32QI_FTYPE_V32HI_V32QI_USI:
10898 case UHI_FTYPE_V16QI_V16QI_UHI:
10899 case USI_FTYPE_V32QI_V32QI_USI:
10900 case UDI_FTYPE_V64QI_V64QI_UDI:
10901 case UQI_FTYPE_V8HI_V8HI_UQI:
10902 case UHI_FTYPE_V16HI_V16HI_UHI:
10903 case USI_FTYPE_V32HI_V32HI_USI:
10904 case UQI_FTYPE_V4SI_V4SI_UQI:
10905 case UQI_FTYPE_V8SI_V8SI_UQI:
10906 case UQI_FTYPE_V2DI_V2DI_UQI:
10907 case UQI_FTYPE_V4DI_V4DI_UQI:
10908 case V4SF_FTYPE_V2DF_V4SF_UQI:
10909 case V4SF_FTYPE_V4DF_V4SF_UQI:
10910 case V16SI_FTYPE_V16SI_V16SI_UHI:
10911 case V16SI_FTYPE_V4SI_V16SI_UHI:
10912 case V2DI_FTYPE_V4SI_V2DI_UQI:
10913 case V2DI_FTYPE_V8HI_V2DI_UQI:
10914 case V2DI_FTYPE_V16QI_V2DI_UQI:
10915 case V4DI_FTYPE_V4DI_V4DI_UQI:
10916 case V4DI_FTYPE_V4SI_V4DI_UQI:
10917 case V4DI_FTYPE_V8HI_V4DI_UQI:
10918 case V4DI_FTYPE_V16QI_V4DI_UQI:
10919 case V4DI_FTYPE_V4DF_V4DI_UQI:
10920 case V2DI_FTYPE_V2DF_V2DI_UQI:
10921 case V4SI_FTYPE_V4DF_V4SI_UQI:
10922 case V4SI_FTYPE_V2DF_V4SI_UQI:
10923 case V4SI_FTYPE_V8HI_V4SI_UQI:
10924 case V4SI_FTYPE_V16QI_V4SI_UQI:
10925 case V4DI_FTYPE_V4DI_V4DI_V4DI:
10926 case V8DF_FTYPE_V2DF_V8DF_UQI:
10927 case V8DF_FTYPE_V4DF_V8DF_UQI:
10928 case V8DF_FTYPE_V8DF_V8DF_UQI:
10929 case V8SF_FTYPE_V8SF_V8SF_UQI:
10930 case V8SF_FTYPE_V8SI_V8SF_UQI:
10931 case V4DF_FTYPE_V4DF_V4DF_UQI:
10932 case V4SF_FTYPE_V4SF_V4SF_UQI:
10933 case V2DF_FTYPE_V2DF_V2DF_UQI:
10934 case V2DF_FTYPE_V4SF_V2DF_UQI:
10935 case V2DF_FTYPE_V4SI_V2DF_UQI:
10936 case V4SF_FTYPE_V4SI_V4SF_UQI:
10937 case V4DF_FTYPE_V4SF_V4DF_UQI:
10938 case V4DF_FTYPE_V4SI_V4DF_UQI:
10939 case V8SI_FTYPE_V8SI_V8SI_UQI:
10940 case V8SI_FTYPE_V8HI_V8SI_UQI:
10941 case V8SI_FTYPE_V16QI_V8SI_UQI:
10942 case V8DF_FTYPE_V8SI_V8DF_UQI:
10943 case V8DI_FTYPE_DI_V8DI_UQI:
10944 case V16SF_FTYPE_V8SF_V16SF_UHI:
10945 case V16SI_FTYPE_V8SI_V16SI_UHI:
10946 case V16HF_FTYPE_V16HI_V16HF_UHI:
10947 case V16HF_FTYPE_V16HF_V16HF_V16HF:
10948 case V16HI_FTYPE_V16HF_V16HI_UHI:
10949 case V16HI_FTYPE_V16HI_V16HI_UHI:
10950 case V8HI_FTYPE_V16QI_V8HI_UQI:
10951 case V16HI_FTYPE_V16QI_V16HI_UHI:
10952 case V32HI_FTYPE_V32HI_V32HI_USI:
10953 case V32HI_FTYPE_V32QI_V32HI_USI:
10954 case V8DI_FTYPE_V16QI_V8DI_UQI:
10955 case V8DI_FTYPE_V2DI_V8DI_UQI:
10956 case V8DI_FTYPE_V4DI_V8DI_UQI:
10957 case V8DI_FTYPE_V8DI_V8DI_UQI:
10958 case V8DI_FTYPE_V8HI_V8DI_UQI:
10959 case V8DI_FTYPE_V8SI_V8DI_UQI:
10960 case V8HI_FTYPE_V8DI_V8HI_UQI:
10961 case V8SI_FTYPE_V8DI_V8SI_UQI:
10962 case V4SI_FTYPE_V4SI_V4SI_V4SI:
10963 case V16SI_FTYPE_V16SI_V16SI_V16SI:
10964 case V8DI_FTYPE_V8DI_V8DI_V8DI:
10965 case V32HI_FTYPE_V32HI_V32HI_V32HI:
10966 case V2DI_FTYPE_V2DI_V2DI_V2DI:
10967 case V16HI_FTYPE_V16HI_V16HI_V16HI:
10968 case V8SI_FTYPE_V8SI_V8SI_V8SI:
10969 case V8HI_FTYPE_V8HI_V8HI_V8HI:
10970 case V32BF_FTYPE_V16SF_V16SF_USI:
10971 case V16BF_FTYPE_V8SF_V8SF_UHI:
10972 case V8BF_FTYPE_V4SF_V4SF_UQI:
10973 case V16BF_FTYPE_V16SF_V16BF_UHI:
10974 case V8BF_FTYPE_V8SF_V8BF_UQI:
10975 case V8BF_FTYPE_V4SF_V8BF_UQI:
10976 case V16SF_FTYPE_V16SF_V32BF_V32BF:
10977 case V8SF_FTYPE_V8SF_V16BF_V16BF:
10978 case V4SF_FTYPE_V4SF_V8BF_V8BF:
10979 nargs = 3;
10980 break;
10981 case V32QI_FTYPE_V32QI_V32QI_INT:
10982 case V16HI_FTYPE_V16HI_V16HI_INT:
10983 case V16QI_FTYPE_V16QI_V16QI_INT:
10984 case V4DI_FTYPE_V4DI_V4DI_INT:
10985 case V8HI_FTYPE_V8HI_V8HI_INT:
10986 case V8SI_FTYPE_V8SI_V8SI_INT:
10987 case V8SI_FTYPE_V8SI_V4SI_INT:
10988 case V8SF_FTYPE_V8SF_V8SF_INT:
10989 case V8SF_FTYPE_V8SF_V4SF_INT:
10990 case V4SI_FTYPE_V4SI_V4SI_INT:
10991 case V4DF_FTYPE_V4DF_V4DF_INT:
10992 case V16SF_FTYPE_V16SF_V16SF_INT:
10993 case V16SF_FTYPE_V16SF_V4SF_INT:
10994 case V16SI_FTYPE_V16SI_V4SI_INT:
10995 case V4DF_FTYPE_V4DF_V2DF_INT:
10996 case V4SF_FTYPE_V4SF_V4SF_INT:
10997 case V2DI_FTYPE_V2DI_V2DI_INT:
10998 case V4DI_FTYPE_V4DI_V2DI_INT:
10999 case V2DF_FTYPE_V2DF_V2DF_INT:
11000 case UQI_FTYPE_V8DI_V8UDI_INT:
11001 case UQI_FTYPE_V8DF_V8DF_INT:
11002 case UQI_FTYPE_V2DF_V2DF_INT:
11003 case UQI_FTYPE_V4SF_V4SF_INT:
11004 case UHI_FTYPE_V16SI_V16SI_INT:
11005 case UHI_FTYPE_V16SF_V16SF_INT:
11006 case V64QI_FTYPE_V64QI_V64QI_INT:
11007 case V32HI_FTYPE_V32HI_V32HI_INT:
11008 case V16SI_FTYPE_V16SI_V16SI_INT:
11009 case V8DI_FTYPE_V8DI_V8DI_INT:
11010 nargs = 3;
11011 nargs_constant = 1;
11012 break;
11013 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
11014 nargs = 3;
11015 rmode = V4DImode;
11016 nargs_constant = 1;
11017 break;
11018 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
11019 nargs = 3;
11020 rmode = V2DImode;
11021 nargs_constant = 1;
11022 break;
11023 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
11024 nargs = 3;
11025 rmode = DImode;
11026 nargs_constant = 1;
11027 break;
11028 case V2DI_FTYPE_V2DI_UINT_UINT:
11029 nargs = 3;
11030 nargs_constant = 2;
11031 break;
11032 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
11033 nargs = 3;
11034 rmode = V8DImode;
11035 nargs_constant = 1;
11036 break;
11037 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
11038 nargs = 5;
11039 rmode = V8DImode;
11040 mask_pos = 2;
11041 nargs_constant = 1;
11042 break;
11043 case QI_FTYPE_V8DF_INT_UQI:
11044 case QI_FTYPE_V4DF_INT_UQI:
11045 case QI_FTYPE_V2DF_INT_UQI:
11046 case HI_FTYPE_V16SF_INT_UHI:
11047 case QI_FTYPE_V8SF_INT_UQI:
11048 case QI_FTYPE_V4SF_INT_UQI:
11049 case QI_FTYPE_V8HF_INT_UQI:
11050 case HI_FTYPE_V16HF_INT_UHI:
11051 case SI_FTYPE_V32HF_INT_USI:
11052 case V4SI_FTYPE_V4SI_V4SI_UHI:
11053 case V8SI_FTYPE_V8SI_V8SI_UHI:
11054 nargs = 3;
11055 mask_pos = 1;
11056 nargs_constant = 1;
11057 break;
11058 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
11059 nargs = 5;
11060 rmode = V4DImode;
11061 mask_pos = 2;
11062 nargs_constant = 1;
11063 break;
11064 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
11065 nargs = 5;
11066 rmode = V2DImode;
11067 mask_pos = 2;
11068 nargs_constant = 1;
11069 break;
11070 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
11071 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
11072 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
11073 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
11074 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
11075 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
11076 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
11077 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
11078 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
11079 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
11080 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
11081 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
11082 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
11083 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
11084 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
11085 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
11086 case V32HF_FTYPE_V32HF_V32HF_V32HF_USI:
11087 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
11088 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
11089 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
11090 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
11091 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
11092 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
11093 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
11094 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
11095 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
11096 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
11097 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
11098 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
11099 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
11100 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
11101 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
11102 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
11103 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
11104 case V16HF_FTYPE_V16HF_V16HF_V16HF_UQI:
11105 case V16HF_FTYPE_V16HF_V16HF_V16HF_UHI:
11106 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
11107 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
11108 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
11109 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
11110 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
11111 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
11112 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
11113 case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI:
11114 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
11115 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
11116 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
11117 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
11118 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
11119 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
11120 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
11121 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
11122 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
11123 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
11124 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
11125 case V32BF_FTYPE_V16SF_V16SF_V32BF_USI:
11126 case V16BF_FTYPE_V8SF_V8SF_V16BF_UHI:
11127 case V8BF_FTYPE_V4SF_V4SF_V8BF_UQI:
11128 nargs = 4;
11129 break;
11130 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
11131 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
11132 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
11133 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
11134 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
11135 nargs = 4;
11136 nargs_constant = 1;
11137 break;
11138 case UQI_FTYPE_V4DI_V4DI_INT_UQI:
11139 case UQI_FTYPE_V8SI_V8SI_INT_UQI:
11140 case QI_FTYPE_V4DF_V4DF_INT_UQI:
11141 case QI_FTYPE_V8SF_V8SF_INT_UQI:
11142 case UHI_FTYPE_V16HF_V16HF_INT_UHI:
11143 case UQI_FTYPE_V2DI_V2DI_INT_UQI:
11144 case UQI_FTYPE_V4SI_V4SI_INT_UQI:
11145 case UQI_FTYPE_V2DF_V2DF_INT_UQI:
11146 case UQI_FTYPE_V4SF_V4SF_INT_UQI:
11147 case UQI_FTYPE_V8HF_V8HF_INT_UQI:
11148 case UDI_FTYPE_V64QI_V64QI_INT_UDI:
11149 case USI_FTYPE_V32QI_V32QI_INT_USI:
11150 case UHI_FTYPE_V16QI_V16QI_INT_UHI:
11151 case USI_FTYPE_V32HI_V32HI_INT_USI:
11152 case USI_FTYPE_V32HF_V32HF_INT_USI:
11153 case UHI_FTYPE_V16HI_V16HI_INT_UHI:
11154 case UQI_FTYPE_V8HI_V8HI_INT_UQI:
11155 nargs = 4;
11156 mask_pos = 1;
11157 nargs_constant = 1;
11158 break;
11159 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
11160 nargs = 4;
11161 nargs_constant = 2;
11162 break;
11163 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
11164 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
11165 case V16SF_FTYPE_V16SF_V32BF_V32BF_UHI:
11166 case V8SF_FTYPE_V8SF_V16BF_V16BF_UQI:
11167 case V4SF_FTYPE_V4SF_V8BF_V8BF_UQI:
11168 nargs = 4;
11169 break;
11170 case UQI_FTYPE_V8DI_V8DI_INT_UQI:
11171 case UHI_FTYPE_V16SI_V16SI_INT_UHI:
11172 mask_pos = 1;
11173 nargs = 4;
11174 nargs_constant = 1;
11175 break;
11176 case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
11177 case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
11178 case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
11179 case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
11180 case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
11181 case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
11182 case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
11183 case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
11184 case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
11185 case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
11186 case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
11187 case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
11188 case V32HI_FTYPE_V32HI_INT_V32HI_USI:
11189 case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
11190 case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
11191 case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
11192 case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
11193 case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
11194 case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
11195 case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
11196 case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
11197 case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
11198 case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
11199 case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
11200 case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
11201 case V16HF_FTYPE_V16HF_INT_V16HF_UHI:
11202 case V8HF_FTYPE_V8HF_INT_V8HF_UQI:
11203 case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
11204 case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
11205 case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
11206 case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
11207 case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
11208 nargs = 4;
11209 mask_pos = 2;
11210 nargs_constant = 1;
11211 break;
11212 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
11213 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
11214 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
11215 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
11216 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
11217 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
11218 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
11219 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
11220 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
11221 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
11222 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
11223 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
11224 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
11225 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
11226 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
11227 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
11228 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
11229 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
11230 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
11231 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
11232 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
11233 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
11234 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
11235 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
11236 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
11237 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
11238 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
11239 nargs = 5;
11240 mask_pos = 2;
11241 nargs_constant = 1;
11242 break;
11243 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
11244 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
11245 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
11246 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
11247 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
11248 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
11249 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
11250 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
11251 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
11252 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
11253 nargs = 5;
11254 mask_pos = 1;
11255 nargs_constant = 1;
11256 break;
11257 case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
11258 case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
11259 case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
11260 case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT:
11261 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT:
11262 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT:
11263 case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT:
11264 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT:
11265 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT:
11266 case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT:
11267 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT:
11268 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT:
11269 nargs = 5;
11270 mask_pos = 1;
11271 nargs_constant = 2;
11272 break;
11273
11274 default:
11275 gcc_unreachable ();
11276 }
11277
11278 gcc_assert (nargs <= ARRAY_SIZE (xops));
11279
11280 if (comparison != UNKNOWN)
11281 {
11282 gcc_assert (nargs == 2);
11283 return ix86_expand_sse_compare (d, exp, target, swap);
11284 }
11285
11286 if (rmode == VOIDmode || rmode == tmode)
11287 {
11288 if (optimize
11289 || target == 0
11290 || GET_MODE (target) != tmode
11291 || !insn_p->operand[0].predicate (target, tmode))
11292 target = gen_reg_rtx (tmode);
11293 else if (memory_operand (target, tmode))
11294 num_memory++;
11295 real_target = target;
11296 }
11297 else
11298 {
11299 real_target = gen_reg_rtx (tmode);
11300 target = lowpart_subreg (rmode, real_target, tmode);
11301 }
11302
11303 for (i = 0; i < nargs; i++)
11304 {
11305 tree arg = CALL_EXPR_ARG (exp, i);
11306 rtx op = expand_normal (arg);
11307 machine_mode mode = insn_p->operand[i + 1].mode;
11308 bool match = insn_p->operand[i + 1].predicate (op, mode);
11309
11310 if (second_arg_count && i == 1)
11311 {
11312 /* SIMD shift insns take either an 8-bit immediate or a
11313 register as the count. But the builtin functions take an
11314 int as the count. If the count doesn't match, put it in a
11315 register. The instructions use a 64-bit count; if op is
11316 only 32 bits, zero-extend it, since negative shift counts
11317 are undefined behavior and zero-extension is more
11318 efficient. */
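/* Illustrative sketch, not tied to a particular builtin: when the
   caller passes an SImode count but the insn pattern wants DImode,
   the conversion below behaves roughly like

     count = convert_modes (DImode, SImode, count, 1);  (zero-extend)

   so a count such as 0x80000000 stays a large positive DImode value
   instead of turning into a negative shift, matching the comment
   above.  */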
11319 if (!match)
11320 {
11321 if (SCALAR_INT_MODE_P (GET_MODE (op)))
11322 op = convert_modes (mode, GET_MODE (op), op, 1);
11323 else
11324 op = lowpart_subreg (mode, op, GET_MODE (op));
11325 if (!insn_p->operand[i + 1].predicate (op, mode))
11326 op = copy_to_reg (op);
11327 }
11328 }
11329 else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
11330 || (!mask_pos && (nargs - i) <= nargs_constant))
11331 {
11332 if (!match)
11333 switch (icode)
11334 {
11335 case CODE_FOR_avx_vinsertf128v4di:
11336 case CODE_FOR_avx_vextractf128v4di:
11337 error ("the last argument must be a 1-bit immediate");
11338 return const0_rtx;
11339
11340 case CODE_FOR_avx512f_cmpv8di3_mask:
11341 case CODE_FOR_avx512f_cmpv16si3_mask:
11342 case CODE_FOR_avx512f_ucmpv8di3_mask:
11343 case CODE_FOR_avx512f_ucmpv16si3_mask:
11344 case CODE_FOR_avx512vl_cmpv4di3_mask:
11345 case CODE_FOR_avx512vl_cmpv8si3_mask:
11346 case CODE_FOR_avx512vl_ucmpv4di3_mask:
11347 case CODE_FOR_avx512vl_ucmpv8si3_mask:
11348 case CODE_FOR_avx512vl_cmpv2di3_mask:
11349 case CODE_FOR_avx512vl_cmpv4si3_mask:
11350 case CODE_FOR_avx512vl_ucmpv2di3_mask:
11351 case CODE_FOR_avx512vl_ucmpv4si3_mask:
11352 error ("the last argument must be a 3-bit immediate");
11353 return const0_rtx;
11354
11355 case CODE_FOR_sse4_1_roundsd:
11356 case CODE_FOR_sse4_1_roundss:
11357
11358 case CODE_FOR_sse4_1_roundpd:
11359 case CODE_FOR_sse4_1_roundps:
11360 case CODE_FOR_avx_roundpd256:
11361 case CODE_FOR_avx_roundps256:
11362
11363 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
11364 case CODE_FOR_sse4_1_roundps_sfix:
11365 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
11366 case CODE_FOR_avx_roundps_sfix256:
11367
11368 case CODE_FOR_sse4_1_blendps:
11369 case CODE_FOR_avx_blendpd256:
11370 case CODE_FOR_avx_vpermilv4df:
11371 case CODE_FOR_avx_vpermilv4df_mask:
11372 case CODE_FOR_avx512f_getmantv8df_mask:
11373 case CODE_FOR_avx512f_getmantv16sf_mask:
11374 case CODE_FOR_avx512vl_getmantv16hf_mask:
11375 case CODE_FOR_avx512vl_getmantv8sf_mask:
11376 case CODE_FOR_avx512vl_getmantv4df_mask:
11377 case CODE_FOR_avx512fp16_getmantv8hf_mask:
11378 case CODE_FOR_avx512vl_getmantv4sf_mask:
11379 case CODE_FOR_avx512vl_getmantv2df_mask:
11380 case CODE_FOR_avx512dq_rangepv8df_mask_round:
11381 case CODE_FOR_avx512dq_rangepv16sf_mask_round:
11382 case CODE_FOR_avx512dq_rangepv4df_mask:
11383 case CODE_FOR_avx512dq_rangepv8sf_mask:
11384 case CODE_FOR_avx512dq_rangepv2df_mask:
11385 case CODE_FOR_avx512dq_rangepv4sf_mask:
11386 case CODE_FOR_avx_shufpd256_mask:
11387 error ("the last argument must be a 4-bit immediate");
11388 return const0_rtx;
11389
11390 case CODE_FOR_sha1rnds4:
11391 case CODE_FOR_sse4_1_blendpd:
11392 case CODE_FOR_avx_vpermilv2df:
11393 case CODE_FOR_avx_vpermilv2df_mask:
11394 case CODE_FOR_xop_vpermil2v2df3:
11395 case CODE_FOR_xop_vpermil2v4sf3:
11396 case CODE_FOR_xop_vpermil2v4df3:
11397 case CODE_FOR_xop_vpermil2v8sf3:
11398 case CODE_FOR_avx512f_vinsertf32x4_mask:
11399 case CODE_FOR_avx512f_vinserti32x4_mask:
11400 case CODE_FOR_avx512f_vextractf32x4_mask:
11401 case CODE_FOR_avx512f_vextracti32x4_mask:
11402 case CODE_FOR_sse2_shufpd:
11403 case CODE_FOR_sse2_shufpd_mask:
11404 case CODE_FOR_avx512dq_shuf_f64x2_mask:
11405 case CODE_FOR_avx512dq_shuf_i64x2_mask:
11406 case CODE_FOR_avx512vl_shuf_i32x4_mask:
11407 case CODE_FOR_avx512vl_shuf_f32x4_mask:
11408 error ("the last argument must be a 2-bit immediate");
11409 return const0_rtx;
11410
11411 case CODE_FOR_avx_vextractf128v4df:
11412 case CODE_FOR_avx_vextractf128v8sf:
11413 case CODE_FOR_avx_vextractf128v8si:
11414 case CODE_FOR_avx_vinsertf128v4df:
11415 case CODE_FOR_avx_vinsertf128v8sf:
11416 case CODE_FOR_avx_vinsertf128v8si:
11417 case CODE_FOR_avx512f_vinsertf64x4_mask:
11418 case CODE_FOR_avx512f_vinserti64x4_mask:
11419 case CODE_FOR_avx512f_vextractf64x4_mask:
11420 case CODE_FOR_avx512f_vextracti64x4_mask:
11421 case CODE_FOR_avx512dq_vinsertf32x8_mask:
11422 case CODE_FOR_avx512dq_vinserti32x8_mask:
11423 case CODE_FOR_avx512vl_vinsertv4df:
11424 case CODE_FOR_avx512vl_vinsertv4di:
11425 case CODE_FOR_avx512vl_vinsertv8sf:
11426 case CODE_FOR_avx512vl_vinsertv8si:
11427 error ("the last argument must be a 1-bit immediate");
11428 return const0_rtx;
11429
11430 case CODE_FOR_avx_vmcmpv2df3:
11431 case CODE_FOR_avx_vmcmpv4sf3:
11432 case CODE_FOR_avx_cmpv2df3:
11433 case CODE_FOR_avx_cmpv4sf3:
11434 case CODE_FOR_avx_cmpv4df3:
11435 case CODE_FOR_avx_cmpv8sf3:
11436 case CODE_FOR_avx512f_cmpv8df3_mask:
11437 case CODE_FOR_avx512f_cmpv16sf3_mask:
11438 case CODE_FOR_avx512f_vmcmpv2df3_mask:
11439 case CODE_FOR_avx512f_vmcmpv4sf3_mask:
11440 case CODE_FOR_avx512bw_cmpv32hf3_mask:
11441 case CODE_FOR_avx512vl_cmpv16hf3_mask:
11442 case CODE_FOR_avx512fp16_cmpv8hf3_mask:
11443 error ("the last argument must be a 5-bit immediate");
11444 return const0_rtx;
11445
11446 default:
11447 switch (nargs_constant)
11448 {
11449 case 2:
11450 if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
11451 || (!mask_pos && (nargs - i) == nargs_constant))
11452 {
11453 error ("the next to last argument must be an 8-bit immediate");
11454 break;
11455 }
11456 /* FALLTHRU */
11457 case 1:
11458 error ("the last argument must be an 8-bit immediate");
11459 break;
11460 default:
11461 gcc_unreachable ();
11462 }
11463 return const0_rtx;
11464 }
11465 }
11466 else
11467 {
11468 if (VECTOR_MODE_P (mode))
11469 op = safe_vector_operand (op, mode);
11470
11471 /* If we aren't optimizing, only allow one memory operand to
11472 be generated. */
11473 if (memory_operand (op, mode))
11474 num_memory++;
11475
11476 op = fixup_modeless_constant (op, mode);
11477
11478 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
11479 {
11480 if (optimize || !match || num_memory > 1)
11481 op = copy_to_mode_reg (mode, op);
11482 }
11483 else
11484 {
11485 op = copy_to_reg (op);
11486 op = lowpart_subreg (mode, op, GET_MODE (op));
11487 }
11488 }
11489
11490 xops[i] = op;
11491 }
11492
11493 switch (nargs)
11494 {
11495 case 1:
11496 pat = GEN_FCN (icode) (real_target, xops[0]);
11497 break;
11498 case 2:
11499 pat = GEN_FCN (icode) (real_target, xops[0], xops[1]);
11500 break;
11501 case 3:
11502 pat = GEN_FCN (icode) (real_target, xops[0], xops[1], xops[2]);
11503 break;
11504 case 4:
11505 pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
11506 xops[2], xops[3]);
11507 break;
11508 case 5:
11509 pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
11510 xops[2], xops[3], xops[4]);
11511 break;
11512 case 6:
11513 pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
11514 xops[2], xops[3], xops[4], xops[5]);
11515 break;
11516 default:
11517 gcc_unreachable ();
11518 }
11519
11520 if (! pat)
11521 return 0;
11522
11523 emit_insn (pat);
11524 return target;
11525 }
11526
11527 /* Transform a pattern of the following layout:
11528 (set A
11529 (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
11530 into:
11531 (set A B)
11532 i.e. drop the UNSPEC wrapper and its rounding operand C. */
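/* A minimal illustration (generic RTL, not a specific insn pattern):

     (set (reg:V2DF 100)
          (unspec:V2DF [(plus:V2DF (reg:V2DF 101) (reg:V2DF 102))
                        (const_int 8)]
                       UNSPEC_EMBEDDED_ROUNDING))

   is rewritten by ix86_erase_embedded_rounding below into

     (set (reg:V2DF 100)
          (plus:V2DF (reg:V2DF 101) (reg:V2DF 102)))

   The register numbers and the PLUS body are made up for the example;
   only the two-element UNSPEC shape asserted below matters.  */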
11533
11534 static rtx
11535 ix86_erase_embedded_rounding (rtx pat)
11536 {
11537 if (GET_CODE (pat) == INSN)
11538 pat = PATTERN (pat);
11539
11540 gcc_assert (GET_CODE (pat) == SET);
11541 rtx src = SET_SRC (pat);
11542 gcc_assert (XVECLEN (src, 0) == 2);
11543 rtx p0 = XVECEXP (src, 0, 0);
11544 gcc_assert (GET_CODE (src) == UNSPEC
11545 && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
11546 rtx res = gen_rtx_SET (SET_DEST (pat), p0);
11547 return res;
11548 }
11549
11550 /* Subroutine of ix86_expand_round_builtin to take care of comi insns
11551 with rounding. */
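/* Rough sketch of how this is reached (see the dispatch in
   ix86_expand_round_builtin below): builtins with the
   INT_FTYPE_V2DF_V2DF_INT_INT or INT_FTYPE_V4SF_V4SF_INT_INT
   signatures land here, with

     arg0/arg1  the two vector operands,
     arg2       the comparison predicate (an avxintrin.h _CMP_* value),
     arg3       the rounding/SAE operand (NO_ROUND or ROUND_SAE).  */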
11552 static rtx
11553 ix86_expand_sse_comi_round (const struct builtin_description *d,
11554 tree exp, rtx target)
11555 {
11556 rtx pat, set_dst;
11557 tree arg0 = CALL_EXPR_ARG (exp, 0);
11558 tree arg1 = CALL_EXPR_ARG (exp, 1);
11559 tree arg2 = CALL_EXPR_ARG (exp, 2);
11560 tree arg3 = CALL_EXPR_ARG (exp, 3);
11561 rtx op0 = expand_normal (arg0);
11562 rtx op1 = expand_normal (arg1);
11563 rtx op2 = expand_normal (arg2);
11564 rtx op3 = expand_normal (arg3);
11565 enum insn_code icode = d->icode;
11566 const struct insn_data_d *insn_p = &insn_data[icode];
11567 machine_mode mode0 = insn_p->operand[0].mode;
11568 machine_mode mode1 = insn_p->operand[1].mode;
11569
11570 /* See avxintrin.h for values. */
11571 static const enum rtx_code comparisons[32] =
11572 {
11573 EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
11574 UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED,
11575 EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
11576 UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED
11577 };
11578 static const bool ordereds[32] =
11579 {
11580 true, true, true, false, false, false, false, true,
11581 false, false, false, true, true, true, true, false,
11582 true, true, true, false, false, false, false, true,
11583 false, false, false, true, true, true, true, false
11584 };
11585 static const bool non_signalings[32] =
11586 {
11587 true, false, false, true, true, false, false, true,
11588 true, false, false, true, true, false, false, true,
11589 false, true, true, false, false, true, true, false,
11590 false, true, true, false, false, true, true, false
11591 };
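/* Worked example (assuming the usual avxintrin.h encoding where
   _CMP_GE_OS is 13): comparisons[13] is GE, ordereds[13] is true and
   non_signalings[13] is false, i.e. an ordered, signaling >= compare,
   which the switch below leaves on the COMI path.  */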
11592
11593 if (!CONST_INT_P (op2))
11594 {
11595 error ("the third argument must be a comparison constant");
11596 return const0_rtx;
11597 }
11598 if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
11599 {
11600 error ("incorrect comparison mode");
11601 return const0_rtx;
11602 }
11603
11604 if (!insn_p->operand[2].predicate (op3, SImode))
11605 {
11606 error ("incorrect rounding operand");
11607 return const0_rtx;
11608 }
11609
11610 if (VECTOR_MODE_P (mode0))
11611 op0 = safe_vector_operand (op0, mode0);
11612 if (VECTOR_MODE_P (mode1))
11613 op1 = safe_vector_operand (op1, mode1);
11614
11615 enum rtx_code comparison = comparisons[INTVAL (op2)];
11616 bool ordered = ordereds[INTVAL (op2)];
11617 bool non_signaling = non_signalings[INTVAL (op2)];
11618 rtx const_val = const0_rtx;
11619
11620 bool check_unordered = false;
11621 machine_mode mode = CCFPmode;
11622 switch (comparison)
11623 {
11624 case ORDERED:
11625 if (!ordered)
11626 {
11627 /* NB: Use CCSmode/NE for _CMP_TRUE_UQ/_CMP_TRUE_US. */
11628 if (!non_signaling)
11629 ordered = true;
11630 mode = CCSmode;
11631 }
11632 else
11633 {
11634 /* NB: Use CCPmode/NE for _CMP_ORD_Q/_CMP_ORD_S. */
11635 if (non_signaling)
11636 ordered = false;
11637 mode = CCPmode;
11638 }
11639 comparison = NE;
11640 break;
11641 case UNORDERED:
11642 if (ordered)
11643 {
11644 /* NB: Use CCSmode/EQ for _CMP_FALSE_OQ/_CMP_FALSE_OS. */
11645 if (non_signaling)
11646 ordered = false;
11647 mode = CCSmode;
11648 }
11649 else
11650 {
11651 /* NB: Use CCPmode/NE for _CMP_UNORD_Q/_CMP_UNORD_S. */
11652 if (!non_signaling)
11653 ordered = true;
11654 mode = CCPmode;
11655 }
11656 comparison = EQ;
11657 break;
11658
11659 case LE: /* -> GE */
11660 case LT: /* -> GT */
11661 case UNGE: /* -> UNLE */
11662 case UNGT: /* -> UNLT */
11663 std::swap (op0, op1);
11664 comparison = swap_condition (comparison);
11665 /* FALLTHRU */
11666 case GT:
11667 case GE:
11668 case UNEQ:
11669 case UNLT:
11670 case UNLE:
11671 case LTGT:
11672 /* These are supported by CCFPmode. NB: Use ordered/signaling
11673 COMI or unordered/non-signaling UCOMI. Both set ZF, PF, CF
11674 with NAN operands. */
11675 if (ordered == non_signaling)
11676 ordered = !ordered;
11677 break;
11678 case EQ:
11679 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
11680 _CMP_EQ_OQ/_CMP_EQ_OS. */
11681 check_unordered = true;
11682 mode = CCZmode;
11683 break;
11684 case NE:
11685 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
11686 _CMP_NEQ_UQ/_CMP_NEQ_US. */
11687 gcc_assert (!ordered);
11688 check_unordered = true;
11689 mode = CCZmode;
11690 const_val = const1_rtx;
11691 break;
11692 default:
11693 gcc_unreachable ();
11694 }
11695
11696 target = gen_reg_rtx (SImode);
11697 emit_move_insn (target, const_val);
11698 target = gen_rtx_SUBREG (QImode, target, 0);
11699
11700 if ((optimize && !register_operand (op0, mode0))
11701 || !insn_p->operand[0].predicate (op0, mode0))
11702 op0 = copy_to_mode_reg (mode0, op0);
11703 if ((optimize && !register_operand (op1, mode1))
11704 || !insn_p->operand[1].predicate (op1, mode1))
11705 op1 = copy_to_mode_reg (mode1, op1);
11706
11707 /* Choose between the two flavors:
11708 1. COMI: ordered and signaling.
11709 2. UCOMI: unordered and non-signaling.
11710 Switch to the UCOMI variant when the predicate is non-signaling. */
11711 if (non_signaling)
11712 icode = (icode == CODE_FOR_sse_comi_round
11713 ? CODE_FOR_sse_ucomi_round
11714 : CODE_FOR_sse2_ucomi_round);
11715
11716 pat = GEN_FCN (icode) (op0, op1, op3);
11717 if (! pat)
11718 return 0;
11719
11720 /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
11721 if (INTVAL (op3) == NO_ROUND)
11722 {
11723 pat = ix86_erase_embedded_rounding (pat);
11724 if (! pat)
11725 return 0;
11726
11727 set_dst = SET_DEST (pat);
11728 }
11729 else
11730 {
11731 gcc_assert (GET_CODE (pat) == SET);
11732 set_dst = SET_DEST (pat);
11733 }
11734
11735 emit_insn (pat);
11736
11737 return ix86_ssecom_setcc (comparison, check_unordered, mode,
11738 set_dst, target);
11739 }
11740
11741 static rtx
11742 ix86_expand_round_builtin (const struct builtin_description *d,
11743 tree exp, rtx target)
11744 {
11745 rtx pat;
11746 unsigned int i, nargs;
11747 rtx xops[6];
11748 enum insn_code icode = d->icode;
11749 const struct insn_data_d *insn_p = &insn_data[icode];
11750 machine_mode tmode = insn_p->operand[0].mode;
11751 unsigned int nargs_constant = 0;
11752 unsigned int redundant_embed_rnd = 0;
11753
11754 switch ((enum ix86_builtin_func_type) d->flag)
11755 {
11756 case UINT64_FTYPE_V2DF_INT:
11757 case UINT64_FTYPE_V4SF_INT:
11758 case UINT64_FTYPE_V8HF_INT:
11759 case UINT_FTYPE_V2DF_INT:
11760 case UINT_FTYPE_V4SF_INT:
11761 case UINT_FTYPE_V8HF_INT:
11762 case INT64_FTYPE_V2DF_INT:
11763 case INT64_FTYPE_V4SF_INT:
11764 case INT64_FTYPE_V8HF_INT:
11765 case INT_FTYPE_V2DF_INT:
11766 case INT_FTYPE_V4SF_INT:
11767 case INT_FTYPE_V8HF_INT:
11768 nargs = 2;
11769 break;
11770 case V32HF_FTYPE_V32HF_V32HF_INT:
11771 case V8HF_FTYPE_V8HF_V8HF_INT:
11772 case V8HF_FTYPE_V8HF_INT_INT:
11773 case V8HF_FTYPE_V8HF_UINT_INT:
11774 case V8HF_FTYPE_V8HF_INT64_INT:
11775 case V8HF_FTYPE_V8HF_UINT64_INT:
11776 case V4SF_FTYPE_V4SF_UINT_INT:
11777 case V4SF_FTYPE_V4SF_UINT64_INT:
11778 case V2DF_FTYPE_V2DF_UINT64_INT:
11779 case V4SF_FTYPE_V4SF_INT_INT:
11780 case V4SF_FTYPE_V4SF_INT64_INT:
11781 case V2DF_FTYPE_V2DF_INT64_INT:
11782 case V4SF_FTYPE_V4SF_V4SF_INT:
11783 case V2DF_FTYPE_V2DF_V2DF_INT:
11784 case V4SF_FTYPE_V4SF_V2DF_INT:
11785 case V2DF_FTYPE_V2DF_V4SF_INT:
11786 nargs = 3;
11787 break;
11788 case V8SF_FTYPE_V8DF_V8SF_QI_INT:
11789 case V8DF_FTYPE_V8DF_V8DF_QI_INT:
11790 case V32HI_FTYPE_V32HF_V32HI_USI_INT:
11791 case V8SI_FTYPE_V8DF_V8SI_QI_INT:
11792 case V8DI_FTYPE_V8HF_V8DI_UQI_INT:
11793 case V8DI_FTYPE_V8DF_V8DI_QI_INT:
11794 case V8SF_FTYPE_V8DI_V8SF_QI_INT:
11795 case V8DF_FTYPE_V8DI_V8DF_QI_INT:
11796 case V8DF_FTYPE_V8HF_V8DF_UQI_INT:
11797 case V16SF_FTYPE_V16HF_V16SF_UHI_INT:
11798 case V32HF_FTYPE_V32HI_V32HF_USI_INT:
11799 case V32HF_FTYPE_V32HF_V32HF_USI_INT:
11800 case V32HF_FTYPE_V32HF_V32HF_V32HF_INT:
11801 case V16SF_FTYPE_V16SF_V16SF_HI_INT:
11802 case V8DI_FTYPE_V8SF_V8DI_QI_INT:
11803 case V16SF_FTYPE_V16SI_V16SF_HI_INT:
11804 case V16SI_FTYPE_V16SF_V16SI_HI_INT:
11805 case V16SI_FTYPE_V16HF_V16SI_UHI_INT:
11806 case V16HF_FTYPE_V16SI_V16HF_UHI_INT:
11807 case V8DF_FTYPE_V8SF_V8DF_QI_INT:
11808 case V16SF_FTYPE_V16HI_V16SF_HI_INT:
11809 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
11810 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
11811 case V8HF_FTYPE_V8DI_V8HF_UQI_INT:
11812 case V8HF_FTYPE_V8DF_V8HF_UQI_INT:
11813 case V16HF_FTYPE_V16SF_V16HF_UHI_INT:
11814 case V8HF_FTYPE_V8HF_V8HF_V8HF_INT:
11815 nargs = 4;
11816 break;
11817 case V4SF_FTYPE_V4SF_V4SF_INT_INT:
11818 case V2DF_FTYPE_V2DF_V2DF_INT_INT:
11819 nargs_constant = 2;
11820 nargs = 4;
11821 break;
11822 case INT_FTYPE_V4SF_V4SF_INT_INT:
11823 case INT_FTYPE_V2DF_V2DF_INT_INT:
11824 return ix86_expand_sse_comi_round (d, exp, target);
11825 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
11826 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
11827 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
11828 case V4SF_FTYPE_V8HF_V4SF_V4SF_UQI_INT:
11829 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
11830 case V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT:
11831 case V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT:
11832 case V2DF_FTYPE_V8HF_V2DF_V2DF_UQI_INT:
11833 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
11834 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
11835 case V2DF_FTYPE_V2DF_V4SF_V2DF_UQI_INT:
11836 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
11837 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
11838 case V4SF_FTYPE_V4SF_V2DF_V4SF_UQI_INT:
11839 case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT:
11840 case V8HF_FTYPE_V2DF_V8HF_V8HF_UQI_INT:
11841 case V8HF_FTYPE_V4SF_V8HF_V8HF_UQI_INT:
11842 nargs = 5;
11843 break;
11844 case V32HF_FTYPE_V32HF_INT_V32HF_USI_INT:
11845 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
11846 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
11847 case V8DF_FTYPE_V8DF_INT_V8DF_UQI_INT:
11848 case V16SF_FTYPE_V16SF_INT_V16SF_UHI_INT:
11849 nargs_constant = 4;
11850 nargs = 5;
11851 break;
11852 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
11853 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
11854 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
11855 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
11856 case USI_FTYPE_V32HF_V32HF_INT_USI_INT:
11857 case UQI_FTYPE_V8HF_V8HF_INT_UQI_INT:
11858 nargs_constant = 3;
11859 nargs = 5;
11860 break;
11861 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
11862 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
11863 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
11864 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
11865 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
11866 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
11867 case V8HF_FTYPE_V8HF_V8HF_INT_V8HF_UQI_INT:
11868 nargs = 6;
11869 nargs_constant = 4;
11870 break;
11871 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
11872 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
11873 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
11874 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
11875 nargs = 6;
11876 nargs_constant = 3;
11877 break;
11878 default:
11879 gcc_unreachable ();
11880 }
11881 gcc_assert (nargs <= ARRAY_SIZE (xops));
11882
11883 if (optimize
11884 || target == 0
11885 || GET_MODE (target) != tmode
11886 || !insn_p->operand[0].predicate (target, tmode))
11887 target = gen_reg_rtx (tmode);
11888
11889 for (i = 0; i < nargs; i++)
11890 {
11891 tree arg = CALL_EXPR_ARG (exp, i);
11892 rtx op = expand_normal (arg);
11893 machine_mode mode = insn_p->operand[i + 1].mode;
11894 bool match = insn_p->operand[i + 1].predicate (op, mode);
11895
11896 if (i == nargs - nargs_constant)
11897 {
11898 if (!match)
11899 {
11900 switch (icode)
11901 {
11902 case CODE_FOR_avx512f_getmantv8df_mask_round:
11903 case CODE_FOR_avx512f_getmantv16sf_mask_round:
11904 case CODE_FOR_avx512bw_getmantv32hf_mask_round:
11905 case CODE_FOR_avx512f_vgetmantv2df_round:
11906 case CODE_FOR_avx512f_vgetmantv2df_mask_round:
11907 case CODE_FOR_avx512f_vgetmantv4sf_round:
11908 case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
11909 case CODE_FOR_avx512f_vgetmantv8hf_mask_round:
11910 error ("the immediate argument must be a 4-bit immediate");
11911 return const0_rtx;
11912 case CODE_FOR_avx512f_cmpv8df3_mask_round:
11913 case CODE_FOR_avx512f_cmpv16sf3_mask_round:
11914 case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
11915 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
11916 case CODE_FOR_avx512f_vmcmpv8hf3_mask_round:
11917 case CODE_FOR_avx512bw_cmpv32hf3_mask_round:
11918 error ("the immediate argument must be a 5-bit immediate");
11919 return const0_rtx;
11920 default:
11921 error ("the immediate argument must be an 8-bit immediate");
11922 return const0_rtx;
11923 }
11924 }
11925 }
11926 else if (i == nargs - 1)
11927 {
11928 if (!insn_p->operand[nargs].predicate (op, SImode))
11929 {
11930 error ("incorrect rounding operand");
11931 return const0_rtx;
11932 }
11933
11934 /* If there is no rounding, use the normal version of the pattern. */
11935 if (INTVAL (op) == NO_ROUND)
11936 {
11937 /* Skip erasing the embedded rounding for the expanders below,
11938 which generate multiple insns. In ix86_erase_embedded_rounding
11939 the pattern would be transformed into a single set, and emit_insn
11940 appends that set instead of inserting it into the chain, so the
11941 insns emitted inside the expander would be ignored. */
11942 switch (icode)
11943 {
11944 case CODE_FOR_avx512bw_fmaddc_v32hf_mask1_round:
11945 case CODE_FOR_avx512bw_fcmaddc_v32hf_mask1_round:
11946 case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask1_round:
11947 case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask1_round:
11948 case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask3_round:
11949 case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask3_round:
11950 redundant_embed_rnd = 0;
11951 break;
11952 default:
11953 redundant_embed_rnd = 1;
11954 break;
11955 }
11956 }
11957 }
11958 else
11959 {
11960 if (VECTOR_MODE_P (mode))
11961 op = safe_vector_operand (op, mode);
11962
11963 op = fixup_modeless_constant (op, mode);
11964
11965 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
11966 {
11967 if (optimize || !match)
11968 op = copy_to_mode_reg (mode, op);
11969 }
11970 else
11971 {
11972 op = copy_to_reg (op);
11973 op = lowpart_subreg (mode, op, GET_MODE (op));
11974 }
11975 }
11976
11977 xops[i] = op;
11978 }
11979
11980 switch (nargs)
11981 {
11982 case 1:
11983 pat = GEN_FCN (icode) (target, xops[0]);
11984 break;
11985 case 2:
11986 pat = GEN_FCN (icode) (target, xops[0], xops[1]);
11987 break;
11988 case 3:
11989 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
11990 break;
11991 case 4:
11992 pat = GEN_FCN (icode) (target, xops[0], xops[1],
11993 xops[2], xops[3]);
11994 break;
11995 case 5:
11996 pat = GEN_FCN (icode) (target, xops[0], xops[1],
11997 xops[2], xops[3], xops[4]);
11998 break;
11999 case 6:
12000 pat = GEN_FCN (icode) (target, xops[0], xops[1],
12001 xops[2], xops[3], xops[4], xops[5]);
12002 break;
12003 default:
12004 gcc_unreachable ();
12005 }
12006
12007 if (!pat)
12008 return 0;
12009
12010 if (redundant_embed_rnd)
12011 pat = ix86_erase_embedded_rounding (pat);
12012
12013 emit_insn (pat);
12014 return target;
12015 }
12016
12017 /* Subroutine of ix86_expand_builtin to take care of special insns
12018 with a variable number of operands. */
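/* Overview sketch: each case below classifies the builtin as a load
   (result goes to TARGET) or a store (the first argument is the
   destination memory), records how many operands it has, which one
   (if any) is a MEM, and whether that MEM must be treated as
   naturally aligned.  */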
12019
12020 static rtx
12021 ix86_expand_special_args_builtin (const struct builtin_description *d,
12022 tree exp, rtx target)
12023 {
12024 tree arg;
12025 rtx pat, op;
12026 unsigned int i, nargs, arg_adjust, memory;
12027 unsigned int constant = 100;
12028 bool aligned_mem = false;
12029 rtx xops[4];
12030 enum insn_code icode = d->icode;
12031 const struct insn_data_d *insn_p = &insn_data[icode];
12032 machine_mode tmode = insn_p->operand[0].mode;
12033 enum { load, store } klass;
12034
12035 switch ((enum ix86_builtin_func_type) d->flag)
12036 {
12037 case VOID_FTYPE_VOID:
12038 emit_insn (GEN_FCN (icode) (target));
12039 return 0;
12040 case VOID_FTYPE_UINT64:
12041 case VOID_FTYPE_UNSIGNED:
12042 nargs = 0;
12043 klass = store;
12044 memory = 0;
12045 break;
12046
12047 case INT_FTYPE_VOID:
12048 case USHORT_FTYPE_VOID:
12049 case UINT64_FTYPE_VOID:
12050 case UINT_FTYPE_VOID:
12051 case UINT8_FTYPE_VOID:
12052 case UNSIGNED_FTYPE_VOID:
12053 nargs = 0;
12054 klass = load;
12055 memory = 0;
12056 break;
12057 case UINT64_FTYPE_PUNSIGNED:
12058 case V2DI_FTYPE_PV2DI:
12059 case V4DI_FTYPE_PV4DI:
12060 case V32QI_FTYPE_PCCHAR:
12061 case V16QI_FTYPE_PCCHAR:
12062 case V8SF_FTYPE_PCV4SF:
12063 case V8SF_FTYPE_PCFLOAT:
12064 case V4SF_FTYPE_PCFLOAT:
12065 case V4SF_FTYPE_PCFLOAT16:
12066 case V4SF_FTYPE_PCBFLOAT16:
12067 case V4SF_FTYPE_PCV8BF:
12068 case V4SF_FTYPE_PCV8HF:
12069 case V8SF_FTYPE_PCFLOAT16:
12070 case V8SF_FTYPE_PCBFLOAT16:
12071 case V8SF_FTYPE_PCV16HF:
12072 case V8SF_FTYPE_PCV16BF:
12073 case V4DF_FTYPE_PCV2DF:
12074 case V4DF_FTYPE_PCDOUBLE:
12075 case V2DF_FTYPE_PCDOUBLE:
12076 case VOID_FTYPE_PVOID:
12077 case V8DI_FTYPE_PV8DI:
12078 nargs = 1;
12079 klass = load;
12080 memory = 0;
12081 switch (icode)
12082 {
12083 case CODE_FOR_sse4_1_movntdqa:
12084 case CODE_FOR_avx2_movntdqa:
12085 case CODE_FOR_avx512f_movntdqa:
12086 aligned_mem = true;
12087 break;
12088 default:
12089 break;
12090 }
12091 break;
12092 case VOID_FTYPE_PV2SF_V4SF:
12093 case VOID_FTYPE_PV8DI_V8DI:
12094 case VOID_FTYPE_PV4DI_V4DI:
12095 case VOID_FTYPE_PV2DI_V2DI:
12096 case VOID_FTYPE_PCHAR_V32QI:
12097 case VOID_FTYPE_PCHAR_V16QI:
12098 case VOID_FTYPE_PFLOAT_V16SF:
12099 case VOID_FTYPE_PFLOAT_V8SF:
12100 case VOID_FTYPE_PFLOAT_V4SF:
12101 case VOID_FTYPE_PDOUBLE_V8DF:
12102 case VOID_FTYPE_PDOUBLE_V4DF:
12103 case VOID_FTYPE_PDOUBLE_V2DF:
12104 case VOID_FTYPE_PLONGLONG_LONGLONG:
12105 case VOID_FTYPE_PULONGLONG_ULONGLONG:
12106 case VOID_FTYPE_PUNSIGNED_UNSIGNED:
12107 case VOID_FTYPE_PINT_INT:
12108 nargs = 1;
12109 klass = store;
12110 /* Reserve memory operand for target. */
12111 memory = ARRAY_SIZE (xops);
12112 switch (icode)
12113 {
12114 /* These builtins and instructions require the memory
12115 to be properly aligned. */
12116 case CODE_FOR_avx_movntv4di:
12117 case CODE_FOR_sse2_movntv2di:
12118 case CODE_FOR_avx_movntv8sf:
12119 case CODE_FOR_sse_movntv4sf:
12120 case CODE_FOR_sse4a_vmmovntv4sf:
12121 case CODE_FOR_avx_movntv4df:
12122 case CODE_FOR_sse2_movntv2df:
12123 case CODE_FOR_sse4a_vmmovntv2df:
12124 case CODE_FOR_sse2_movntidi:
12125 case CODE_FOR_sse_movntq:
12126 case CODE_FOR_sse2_movntisi:
12127 case CODE_FOR_avx512f_movntv16sf:
12128 case CODE_FOR_avx512f_movntv8df:
12129 case CODE_FOR_avx512f_movntv8di:
12130 aligned_mem = true;
12131 break;
12132 default:
12133 break;
12134 }
12135 break;
12136 case VOID_FTYPE_PVOID_PCVOID:
12137 nargs = 1;
12138 klass = store;
12139 memory = 0;
12140
12141 break;
12142 case V4SF_FTYPE_V4SF_PCV2SF:
12143 case V2DF_FTYPE_V2DF_PCDOUBLE:
12144 nargs = 2;
12145 klass = load;
12146 memory = 1;
12147 break;
12148 case V8SF_FTYPE_PCV8SF_V8SI:
12149 case V4DF_FTYPE_PCV4DF_V4DI:
12150 case V4SF_FTYPE_PCV4SF_V4SI:
12151 case V2DF_FTYPE_PCV2DF_V2DI:
12152 case V8SI_FTYPE_PCV8SI_V8SI:
12153 case V4DI_FTYPE_PCV4DI_V4DI:
12154 case V4SI_FTYPE_PCV4SI_V4SI:
12155 case V2DI_FTYPE_PCV2DI_V2DI:
12156 case VOID_FTYPE_INT_INT64:
12157 nargs = 2;
12158 klass = load;
12159 memory = 0;
12160 break;
12161 case VOID_FTYPE_PV8DF_V8DF_UQI:
12162 case VOID_FTYPE_PV4DF_V4DF_UQI:
12163 case VOID_FTYPE_PV2DF_V2DF_UQI:
12164 case VOID_FTYPE_PV16SF_V16SF_UHI:
12165 case VOID_FTYPE_PV8SF_V8SF_UQI:
12166 case VOID_FTYPE_PV4SF_V4SF_UQI:
12167 case VOID_FTYPE_PV8DI_V8DI_UQI:
12168 case VOID_FTYPE_PV4DI_V4DI_UQI:
12169 case VOID_FTYPE_PV2DI_V2DI_UQI:
12170 case VOID_FTYPE_PV16SI_V16SI_UHI:
12171 case VOID_FTYPE_PV8SI_V8SI_UQI:
12172 case VOID_FTYPE_PV4SI_V4SI_UQI:
12173 case VOID_FTYPE_PV64QI_V64QI_UDI:
12174 case VOID_FTYPE_PV32HI_V32HI_USI:
12175 case VOID_FTYPE_PV32QI_V32QI_USI:
12176 case VOID_FTYPE_PV16QI_V16QI_UHI:
12177 case VOID_FTYPE_PV16HI_V16HI_UHI:
12178 case VOID_FTYPE_PV8HI_V8HI_UQI:
12179 switch (icode)
12180 {
12181 /* These builtins and instructions require the memory
12182 to be properly aligned. */
12183 case CODE_FOR_avx512f_storev16sf_mask:
12184 case CODE_FOR_avx512f_storev16si_mask:
12185 case CODE_FOR_avx512f_storev8df_mask:
12186 case CODE_FOR_avx512f_storev8di_mask:
12187 case CODE_FOR_avx512vl_storev8sf_mask:
12188 case CODE_FOR_avx512vl_storev8si_mask:
12189 case CODE_FOR_avx512vl_storev4df_mask:
12190 case CODE_FOR_avx512vl_storev4di_mask:
12191 case CODE_FOR_avx512vl_storev4sf_mask:
12192 case CODE_FOR_avx512vl_storev4si_mask:
12193 case CODE_FOR_avx512vl_storev2df_mask:
12194 case CODE_FOR_avx512vl_storev2di_mask:
12195 aligned_mem = true;
12196 break;
12197 default:
12198 break;
12199 }
12200 /* FALLTHRU */
12201 case VOID_FTYPE_PV8SF_V8SI_V8SF:
12202 case VOID_FTYPE_PV4DF_V4DI_V4DF:
12203 case VOID_FTYPE_PV4SF_V4SI_V4SF:
12204 case VOID_FTYPE_PV2DF_V2DI_V2DF:
12205 case VOID_FTYPE_PV8SI_V8SI_V8SI:
12206 case VOID_FTYPE_PV4DI_V4DI_V4DI:
12207 case VOID_FTYPE_PV4SI_V4SI_V4SI:
12208 case VOID_FTYPE_PV2DI_V2DI_V2DI:
12209 case VOID_FTYPE_PV8SI_V8DI_UQI:
12210 case VOID_FTYPE_PV8HI_V8DI_UQI:
12211 case VOID_FTYPE_PV16HI_V16SI_UHI:
12212 case VOID_FTYPE_PUDI_V8DI_UQI:
12213 case VOID_FTYPE_PV16QI_V16SI_UHI:
12214 case VOID_FTYPE_PV4SI_V4DI_UQI:
12215 case VOID_FTYPE_PUDI_V2DI_UQI:
12216 case VOID_FTYPE_PUDI_V4DI_UQI:
12217 case VOID_FTYPE_PUSI_V2DI_UQI:
12218 case VOID_FTYPE_PV8HI_V8SI_UQI:
12219 case VOID_FTYPE_PUDI_V4SI_UQI:
12220 case VOID_FTYPE_PUSI_V4DI_UQI:
12221 case VOID_FTYPE_PUHI_V2DI_UQI:
12222 case VOID_FTYPE_PUDI_V8SI_UQI:
12223 case VOID_FTYPE_PUSI_V4SI_UQI:
12224 case VOID_FTYPE_PCHAR_V64QI_UDI:
12225 case VOID_FTYPE_PCHAR_V32QI_USI:
12226 case VOID_FTYPE_PCHAR_V16QI_UHI:
12227 case VOID_FTYPE_PSHORT_V32HI_USI:
12228 case VOID_FTYPE_PSHORT_V16HI_UHI:
12229 case VOID_FTYPE_PSHORT_V8HI_UQI:
12230 case VOID_FTYPE_PINT_V16SI_UHI:
12231 case VOID_FTYPE_PINT_V8SI_UQI:
12232 case VOID_FTYPE_PINT_V4SI_UQI:
12233 case VOID_FTYPE_PINT64_V8DI_UQI:
12234 case VOID_FTYPE_PINT64_V4DI_UQI:
12235 case VOID_FTYPE_PINT64_V2DI_UQI:
12236 case VOID_FTYPE_PDOUBLE_V8DF_UQI:
12237 case VOID_FTYPE_PDOUBLE_V4DF_UQI:
12238 case VOID_FTYPE_PDOUBLE_V2DF_UQI:
12239 case VOID_FTYPE_PFLOAT_V16SF_UHI:
12240 case VOID_FTYPE_PFLOAT_V8SF_UQI:
12241 case VOID_FTYPE_PFLOAT_V4SF_UQI:
12242 case VOID_FTYPE_PCFLOAT16_V8HF_UQI:
12243 case VOID_FTYPE_PV32QI_V32HI_USI:
12244 case VOID_FTYPE_PV16QI_V16HI_UHI:
12245 case VOID_FTYPE_PUDI_V8HI_UQI:
12246 nargs = 2;
12247 klass = store;
12248 /* Reserve memory operand for target. */
12249 memory = ARRAY_SIZE (xops);
12250 break;
12251 case V4SF_FTYPE_PCV4SF_V4SF_UQI:
12252 case V8SF_FTYPE_PCV8SF_V8SF_UQI:
12253 case V16SF_FTYPE_PCV16SF_V16SF_UHI:
12254 case V4SI_FTYPE_PCV4SI_V4SI_UQI:
12255 case V8SI_FTYPE_PCV8SI_V8SI_UQI:
12256 case V16SI_FTYPE_PCV16SI_V16SI_UHI:
12257 case V2DF_FTYPE_PCV2DF_V2DF_UQI:
12258 case V4DF_FTYPE_PCV4DF_V4DF_UQI:
12259 case V8DF_FTYPE_PCV8DF_V8DF_UQI:
12260 case V2DI_FTYPE_PCV2DI_V2DI_UQI:
12261 case V4DI_FTYPE_PCV4DI_V4DI_UQI:
12262 case V8DI_FTYPE_PCV8DI_V8DI_UQI:
12263 case V64QI_FTYPE_PCV64QI_V64QI_UDI:
12264 case V32HI_FTYPE_PCV32HI_V32HI_USI:
12265 case V32QI_FTYPE_PCV32QI_V32QI_USI:
12266 case V16QI_FTYPE_PCV16QI_V16QI_UHI:
12267 case V16HI_FTYPE_PCV16HI_V16HI_UHI:
12268 case V8HI_FTYPE_PCV8HI_V8HI_UQI:
12269 switch (icode)
12270 {
12271 /* These builtins and instructions require the memory
12272 to be properly aligned. */
12273 case CODE_FOR_avx512f_loadv16sf_mask:
12274 case CODE_FOR_avx512f_loadv16si_mask:
12275 case CODE_FOR_avx512f_loadv8df_mask:
12276 case CODE_FOR_avx512f_loadv8di_mask:
12277 case CODE_FOR_avx512vl_loadv8sf_mask:
12278 case CODE_FOR_avx512vl_loadv8si_mask:
12279 case CODE_FOR_avx512vl_loadv4df_mask:
12280 case CODE_FOR_avx512vl_loadv4di_mask:
12281 case CODE_FOR_avx512vl_loadv4sf_mask:
12282 case CODE_FOR_avx512vl_loadv4si_mask:
12283 case CODE_FOR_avx512vl_loadv2df_mask:
12284 case CODE_FOR_avx512vl_loadv2di_mask:
12285 case CODE_FOR_avx512bw_loadv64qi_mask:
12286 case CODE_FOR_avx512vl_loadv32qi_mask:
12287 case CODE_FOR_avx512vl_loadv16qi_mask:
12288 case CODE_FOR_avx512bw_loadv32hi_mask:
12289 case CODE_FOR_avx512vl_loadv16hi_mask:
12290 case CODE_FOR_avx512vl_loadv8hi_mask:
12291 aligned_mem = true;
12292 break;
12293 default:
12294 break;
12295 }
12296 /* FALLTHRU */
12297 case V64QI_FTYPE_PCCHAR_V64QI_UDI:
12298 case V32QI_FTYPE_PCCHAR_V32QI_USI:
12299 case V16QI_FTYPE_PCCHAR_V16QI_UHI:
12300 case V32HI_FTYPE_PCSHORT_V32HI_USI:
12301 case V16HI_FTYPE_PCSHORT_V16HI_UHI:
12302 case V8HI_FTYPE_PCSHORT_V8HI_UQI:
12303 case V16SI_FTYPE_PCINT_V16SI_UHI:
12304 case V8SI_FTYPE_PCINT_V8SI_UQI:
12305 case V4SI_FTYPE_PCINT_V4SI_UQI:
12306 case V8DI_FTYPE_PCINT64_V8DI_UQI:
12307 case V4DI_FTYPE_PCINT64_V4DI_UQI:
12308 case V2DI_FTYPE_PCINT64_V2DI_UQI:
12309 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
12310 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
12311 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
12312 case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
12313 case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
12314 case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
12315 case V8HF_FTYPE_PCFLOAT16_V8HF_UQI:
12316 nargs = 3;
12317 klass = load;
12318 memory = 0;
12319 break;
12320 case INT_FTYPE_PINT_INT_INT_INT:
12321 case LONGLONG_FTYPE_PLONGLONG_LONGLONG_LONGLONG_INT:
12322 nargs = 4;
12323 klass = load;
12324 memory = 0;
12325 constant = 3;
12326 break;
12327 default:
12328 gcc_unreachable ();
12329 }
12330
12331 gcc_assert (nargs <= ARRAY_SIZE (xops));
12332
12333 if (klass == store)
12334 {
12335 arg = CALL_EXPR_ARG (exp, 0);
12336 op = expand_normal (arg);
12337 gcc_assert (target == 0);
12338 if (memory)
12339 {
12340 op = ix86_zero_extend_to_Pmode (op);
12341 target = gen_rtx_MEM (tmode, op);
12342 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
12343 on it. Try to improve it using get_pointer_alignment,
12344 and if the special builtin is one that requires strict
12345 mode alignment, also from its GET_MODE_ALIGNMENT.
12346 Failure to do so could lead to ix86_legitimate_combined_insn
12347 rejecting all changes to such insns. */
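/* For example (taking one of the cases listed earlier): a
   non-temporal store such as CODE_FOR_sse2_movntv2di sets
   aligned_mem, so even when get_pointer_alignment can only prove
   BITS_PER_UNIT, the MEM created above gets its MEM_ALIGN raised
   below to GET_MODE_ALIGNMENT (tmode); for a 128-bit vector mode
   that is normally 16-byte alignment.  */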
12348 unsigned int align = get_pointer_alignment (arg);
12349 if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
12350 align = GET_MODE_ALIGNMENT (tmode);
12351 if (MEM_ALIGN (target) < align)
12352 set_mem_align (target, align);
12353 }
12354 else
12355 target = force_reg (tmode, op);
12356 arg_adjust = 1;
12357 }
12358 else
12359 {
12360 arg_adjust = 0;
12361 if (optimize
12362 || target == 0
12363 || !register_operand (target, tmode)
12364 || GET_MODE (target) != tmode)
12365 target = gen_reg_rtx (tmode);
12366 }
12367
12368 for (i = 0; i < nargs; i++)
12369 {
12370 machine_mode mode = insn_p->operand[i + 1].mode;
12371
12372 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
12373 op = expand_normal (arg);
12374
12375 if (i == memory)
12376 {
12377 /* This must be the memory operand. */
12378 op = ix86_zero_extend_to_Pmode (op);
12379 op = gen_rtx_MEM (mode, op);
12380 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
12381 on it. Try to improve it using get_pointer_alignment,
12382 and if the special builtin is one that requires strict
12383 mode alignment, also from its GET_MODE_ALIGNMENT.
12384 Failure to do so could lead to ix86_legitimate_combined_insn
12385 rejecting all changes to such insns. */
12386 unsigned int align = get_pointer_alignment (arg);
12387 if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
12388 align = GET_MODE_ALIGNMENT (mode);
12389 if (MEM_ALIGN (op) < align)
12390 set_mem_align (op, align);
12391 }
12392 else if (i == constant)
12393 {
12394 /* This must be the constant. */
12395 if (!insn_p->operand[nargs].predicate (op, SImode))
12396 {
12397 error ("the fourth argument must be one of enum %qs", "_CMPCCX_ENUM");
12398 return const0_rtx;
12399 }
12400 }
12401 else
12402 {
12403 /* This must be a register. */
12404 if (VECTOR_MODE_P (mode))
12405 op = safe_vector_operand (op, mode);
12406
12407 op = fixup_modeless_constant (op, mode);
12408
12409 /* NB: A 3-operand load implies a mask load or v{p}expand*,
12410 and the mask operand should be the last one.
12411 Keep an all-ones mask; it will be simplified by the expander. */
12412 if (nargs == 3 && i == 2 && klass == load
12413 && constm1_operand (op, mode)
12414 && insn_p->operand[i].predicate (op, mode))
12415 ;
12416 else if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
12417 op = copy_to_mode_reg (mode, op);
12418 else
12419 {
12420 op = copy_to_reg (op);
12421 op = lowpart_subreg (mode, op, GET_MODE (op));
12422 }
12423 }
12424
12425 xops[i] = op;
12426 }
12427
12428 switch (nargs)
12429 {
12430 case 0:
12431 pat = GEN_FCN (icode) (target);
12432 break;
12433 case 1:
12434 pat = GEN_FCN (icode) (target, xops[0]);
12435 break;
12436 case 2:
12437 pat = GEN_FCN (icode) (target, xops[0], xops[1]);
12438 break;
12439 case 3:
12440 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
12441 break;
12442 case 4:
12443 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2], xops[3]);
12444 break;
12445 default:
12446 gcc_unreachable ();
12447 }
12448
12449 if (! pat)
12450 return 0;
12451
12452 emit_insn (pat);
12453 return klass == store ? 0 : target;
12454 }
12455
12456 /* Return the integer constant in ARG. Constrain it to be in the range
12457 of the subparts of VEC_TYPE; issue an error if not. */
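/* For instance, with a V4SF vector type TYPE_VECTOR_SUBPARTS is 4, so
   the valid selectors are 0..3; passing 4 (or a non-constant) reports
   "selector must be an integer constant in the range [0, 3]" and 0 is
   returned as a fallback.  */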
12458
12459 static int
12460 get_element_number (tree vec_type, tree arg)
12461 {
12462 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
12463
12464 if (!tree_fits_uhwi_p (arg)
12465 || (elt = tree_to_uhwi (arg), elt > max))
12466 {
12467 error ("selector must be an integer constant in the range "
12468 "[0, %wi]", max);
12469 return 0;
12470 }
12471
12472 return elt;
12473 }
12474
12475 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
12476 ix86_expand_vector_init. We DO have language-level syntax for this, in
12477 the form of (type){ init-list }. Except that since we can't place emms
12478 instructions from inside the compiler, we can't allow the use of MMX
12479 registers unless the user explicitly asks for it. So we do *not* define
12480 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
12481 we have builtins invoked by mmintrin.h that give us license to emit
12482 these sorts of instructions. */
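/* Illustrative example (the authoritative builtin list lives in
   i386-builtin.def and the mmintrin.h wrappers): a vec_init builtin
   call such as

     __builtin_ia32_vec_init_v2si (a, b)

   reaches ix86_expand_vec_init_builtin below with a two-element V2SI
   type; the assert below requires exactly one call argument per
   vector element.  */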
12483
12484 static rtx
12485 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
12486 {
12487 machine_mode tmode = TYPE_MODE (type);
12488 machine_mode inner_mode = GET_MODE_INNER (tmode);
12489 int i, n_elt = GET_MODE_NUNITS (tmode);
12490 rtvec v = rtvec_alloc (n_elt);
12491
12492 gcc_assert (VECTOR_MODE_P (tmode));
12493 gcc_assert (call_expr_nargs (exp) == n_elt);
12494
12495 for (i = 0; i < n_elt; ++i)
12496 {
12497 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
12498 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
12499 }
12500
12501 if (!target || !register_operand (target, tmode))
12502 target = gen_reg_rtx (tmode);
12503
12504 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
12505 return target;
12506 }
12507
12508 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
12509 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
12510 had a language-level syntax for referencing vector elements. */
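/* Sketch of the expected call shape: the vec_ext builtins pass the
   vector as arg0 and a constant selector as arg1, so e.g. extracting
   element 2 of a V4SF vector ends up as
   ix86_expand_vector_extract (true, target, op0, 2), while an
   out-of-range selector is rejected by get_element_number above.  */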
12511
12512 static rtx
12513 ix86_expand_vec_ext_builtin (tree exp, rtx target)
12514 {
12515 machine_mode tmode, mode0;
12516 tree arg0, arg1;
12517 int elt;
12518 rtx op0;
12519
12520 arg0 = CALL_EXPR_ARG (exp, 0);
12521 arg1 = CALL_EXPR_ARG (exp, 1);
12522
12523 op0 = expand_normal (arg0);
12524 elt = get_element_number (TREE_TYPE (arg0), arg1);
12525
12526 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
12527 mode0 = TYPE_MODE (TREE_TYPE (arg0));
12528 gcc_assert (VECTOR_MODE_P (mode0));
12529
12530 op0 = force_reg (mode0, op0);
12531
12532 if (optimize || !target || !register_operand (target, tmode))
12533 target = gen_reg_rtx (tmode);
12534
12535 ix86_expand_vector_extract (true, target, op0, elt);
12536
12537 return target;
12538 }
12539
12540 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
12541 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
12542 a language-level syntax for referencing vector elements. */
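/* Rough picture of the expansion (pseudo-code, not the emitted RTL):

     target = copy of arg0;   (keep the source vector unmodified)
     target[elt] = arg1;      (via ix86_expand_vector_set)
     return target;

   where elt comes from the constant arg2 validated by
   get_element_number.  */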
12543
12544 static rtx
12545 ix86_expand_vec_set_builtin (tree exp)
12546 {
12547 machine_mode tmode, mode1;
12548 tree arg0, arg1, arg2;
12549 int elt;
12550 rtx op0, op1, target;
12551
12552 arg0 = CALL_EXPR_ARG (exp, 0);
12553 arg1 = CALL_EXPR_ARG (exp, 1);
12554 arg2 = CALL_EXPR_ARG (exp, 2);
12555
12556 tmode = TYPE_MODE (TREE_TYPE (arg0));
12557 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
12558 gcc_assert (VECTOR_MODE_P (tmode));
12559
12560 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
12561 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
12562 elt = get_element_number (TREE_TYPE (arg0), arg2);
12563
12564 if (GET_MODE (op1) != mode1)
12565 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
12566
12567 op0 = force_reg (tmode, op0);
12568 op1 = force_reg (mode1, op1);
12569
12570 /* OP0 is the source of these builtin functions and shouldn't be
12571 modified. Create a copy, use it and return it as target. */
12572 target = gen_reg_rtx (tmode);
12573 emit_move_insn (target, op0);
12574 ix86_expand_vector_set (true, target, op1, elt);
12575
12576 return target;
12577 }
12578
12579 /* Return true if the necessary isa options for this builtin exist,
12580 else false.
12581 fcode = DECL_MD_FUNCTION_CODE (fndecl); */
12582 bool
12583 ix86_check_builtin_isa_match (unsigned int fcode,
12584 HOST_WIDE_INT* pbisa,
12585 HOST_WIDE_INT* pbisa2)
12586 {
12587 HOST_WIDE_INT isa = ix86_isa_flags;
12588 HOST_WIDE_INT isa2 = ix86_isa_flags2;
12589 HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
12590 HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
12591 HOST_WIDE_INT tmp_isa = isa, tmp_isa2 = isa2;
12592 /* The general case is that we require all the ISAs specified in bisa{,2}
12593 to be enabled.
12594 The exceptions are:
12595 OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
12596 OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
12597 OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
12598 (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL) or
12599 OPTION_MASK_ISA2_AVXVNNI
12600 (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL) or
12601 OPTION_MASK_ISA2_AVXIFMA
12602 (OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA2_AVX512BF16) or
12603 OPTION_MASK_ISA2_AVXNECONVERT
12604 where for each such pair it is sufficient if either of the ISAs is
12605 enabled; if the pair is ORed with other options, those others must
12606 also be enabled. OPTION_MASK_ISA_MMX in bisa is also satisfied if TARGET_MMX_WITH_SSE. */
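/* Concrete example of the pairing above: a builtin whose bisa requires
   OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL is also
   accepted when only OPTION_MASK_ISA2_AVXVNNI is enabled; the
   SHARE_BUILTIN invocations below OR the satisfied pair back into the
   effective isa/isa2 so that the final subset check passes.  */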
12607
12608 #define SHARE_BUILTIN(A1, A2, B1, B2) \
12609 if ((((bisa & (A1)) == (A1) && (bisa2 & (A2)) == (A2)) \
12610 && ((bisa & (B1)) == (B1) && (bisa2 & (B2)) == (B2))) \
12611 && (((isa & (A1)) == (A1) && (isa2 & (A2)) == (A2)) \
12612 || ((isa & (B1)) == (B1) && (isa2 & (B2)) == (B2)))) \
12613 { \
12614 tmp_isa |= (A1) | (B1); \
12615 tmp_isa2 |= (A2) | (B2); \
12616 }
12617
12618 SHARE_BUILTIN (OPTION_MASK_ISA_SSE, 0, OPTION_MASK_ISA_3DNOW_A, 0);
12619 SHARE_BUILTIN (OPTION_MASK_ISA_SSE4_2, 0, OPTION_MASK_ISA_CRC32, 0);
12620 SHARE_BUILTIN (OPTION_MASK_ISA_FMA, 0, OPTION_MASK_ISA_FMA4, 0);
12621 SHARE_BUILTIN (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, 0,
12622 OPTION_MASK_ISA2_AVXVNNI);
12623 SHARE_BUILTIN (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL, 0, 0,
12624 OPTION_MASK_ISA2_AVXIFMA);
12625 SHARE_BUILTIN (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512BF16, 0,
12626 OPTION_MASK_ISA2_AVXNECONVERT);
12627 SHARE_BUILTIN (OPTION_MASK_ISA_AES, 0, 0, OPTION_MASK_ISA2_VAES);
12628 isa = tmp_isa;
12629 isa2 = tmp_isa2;
12630
12631 if ((bisa & OPTION_MASK_ISA_MMX) && !TARGET_MMX && TARGET_MMX_WITH_SSE
12632 /* __builtin_ia32_maskmovq requires MMX registers. */
12633 && fcode != IX86_BUILTIN_MASKMOVQ)
12634 {
12635 bisa &= ~OPTION_MASK_ISA_MMX;
12636 bisa |= OPTION_MASK_ISA_SSE2;
12637 }
12638
12639 if (pbisa)
12640 *pbisa = bisa;
12641 if (pbisa2)
12642 *pbisa2 = bisa2;
12643
12644 return (bisa & isa) == bisa && (bisa2 & isa2) == bisa2;
12645 }
12646
12647 /* Expand an expression EXP that calls a built-in function,
12648 with result going to TARGET if that's convenient
12649 (and in mode MODE if that's convenient).
12650 SUBTARGET may be used as the target for computing one of EXP's operands.
12651 IGNORE is nonzero if the value is to be ignored. */
12652
12653 rtx
12654 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
12655 machine_mode mode, int ignore)
12656 {
12657 size_t i;
12658 enum insn_code icode, icode2;
12659 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
12660 tree arg0, arg1, arg2, arg3, arg4;
12661 rtx op0, op1, op2, op3, op4, pat, pat2, insn;
12662 machine_mode mode0, mode1, mode2, mode3, mode4;
12663 unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
12664 HOST_WIDE_INT bisa, bisa2;
12665
12666 /* For CPU builtins that can be folded, fold first and expand the fold. */
12667 switch (fcode)
12668 {
12669 case IX86_BUILTIN_CPU_INIT:
12670 {
12671 /* Make it call __cpu_indicator_init in libgcc. */
12672 tree call_expr, fndecl, type;
12673 type = build_function_type_list (integer_type_node, NULL_TREE);
12674 fndecl = build_fn_decl ("__cpu_indicator_init", type);
12675 call_expr = build_call_expr (fndecl, 0);
12676 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
12677 }
12678 case IX86_BUILTIN_CPU_IS:
12679 case IX86_BUILTIN_CPU_SUPPORTS:
12680 {
12681 tree arg0 = CALL_EXPR_ARG (exp, 0);
12682 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
12683 gcc_assert (fold_expr != NULL_TREE);
12684 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
12685 }
12686 }
12687
12688 if (!ix86_check_builtin_isa_match (fcode, &bisa, &bisa2))
12689 {
12690 bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT;
12691 if (TARGET_ABI_X32)
12692 bisa |= OPTION_MASK_ABI_X32;
12693 else
12694 bisa |= OPTION_MASK_ABI_64;
12695 char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
12696 (enum fpmath_unit) 0,
12697 (enum prefer_vector_width) 0,
12698 PVW_NONE, PVW_NONE,
12699 false, add_abi_p);
12700 if (!opts)
12701 error ("%qE needs unknown isa option", fndecl);
12702 else
12703 {
12704 gcc_assert (opts != NULL);
12705 error ("%qE needs isa option %s", fndecl, opts);
12706 free (opts);
12707 }
12708 return expand_call (exp, target, ignore);
12709 }
12710
12711 switch (fcode)
12712 {
12713 case IX86_BUILTIN_MASKMOVQ:
12714 case IX86_BUILTIN_MASKMOVDQU:
12715 icode = (fcode == IX86_BUILTIN_MASKMOVQ
12716 ? CODE_FOR_mmx_maskmovq
12717 : CODE_FOR_sse2_maskmovdqu);
12718 /* Note the arg order is different from the operand order. */
12719 arg1 = CALL_EXPR_ARG (exp, 0);
12720 arg2 = CALL_EXPR_ARG (exp, 1);
12721 arg0 = CALL_EXPR_ARG (exp, 2);
12722 op0 = expand_normal (arg0);
12723 op1 = expand_normal (arg1);
12724 op2 = expand_normal (arg2);
12725 mode0 = insn_data[icode].operand[0].mode;
12726 mode1 = insn_data[icode].operand[1].mode;
12727 mode2 = insn_data[icode].operand[2].mode;
12728
12729 op0 = ix86_zero_extend_to_Pmode (op0);
12730 op0 = gen_rtx_MEM (mode1, op0);
12731
12732 if (!insn_data[icode].operand[0].predicate (op0, mode0))
12733 op0 = copy_to_mode_reg (mode0, op0);
12734 if (!insn_data[icode].operand[1].predicate (op1, mode1))
12735 op1 = copy_to_mode_reg (mode1, op1);
12736 if (!insn_data[icode].operand[2].predicate (op2, mode2))
12737 op2 = copy_to_mode_reg (mode2, op2);
12738 pat = GEN_FCN (icode) (op0, op1, op2);
12739 if (! pat)
12740 return 0;
12741 emit_insn (pat);
12742 return 0;
12743
12744 case IX86_BUILTIN_LDMXCSR:
12745 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
12746 target = assign_386_stack_local (SImode, SLOT_TEMP);
12747 emit_move_insn (target, op0);
12748 emit_insn (gen_sse_ldmxcsr (target));
12749 return 0;
12750
12751 case IX86_BUILTIN_STMXCSR:
12752 target = assign_386_stack_local (SImode, SLOT_TEMP);
12753 emit_insn (gen_sse_stmxcsr (target));
12754 return copy_to_mode_reg (SImode, target);
12755
12756 case IX86_BUILTIN_CLFLUSH:
12757 arg0 = CALL_EXPR_ARG (exp, 0);
12758 op0 = expand_normal (arg0);
12759 icode = CODE_FOR_sse2_clflush;
12760 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12761 op0 = ix86_zero_extend_to_Pmode (op0);
12762
12763 emit_insn (gen_sse2_clflush (op0));
12764 return 0;
12765
12766 case IX86_BUILTIN_CLWB:
12767 arg0 = CALL_EXPR_ARG (exp, 0);
12768 op0 = expand_normal (arg0);
12769 icode = CODE_FOR_clwb;
12770 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12771 op0 = ix86_zero_extend_to_Pmode (op0);
12772
12773 emit_insn (gen_clwb (op0));
12774 return 0;
12775
12776 case IX86_BUILTIN_CLFLUSHOPT:
12777 arg0 = CALL_EXPR_ARG (exp, 0);
12778 op0 = expand_normal (arg0);
12779 icode = CODE_FOR_clflushopt;
12780 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12781 op0 = ix86_zero_extend_to_Pmode (op0);
12782
12783 emit_insn (gen_clflushopt (op0));
12784 return 0;
12785
12786 case IX86_BUILTIN_MONITOR:
12787 case IX86_BUILTIN_MONITORX:
12788 arg0 = CALL_EXPR_ARG (exp, 0);
12789 arg1 = CALL_EXPR_ARG (exp, 1);
12790 arg2 = CALL_EXPR_ARG (exp, 2);
12791 op0 = expand_normal (arg0);
12792 op1 = expand_normal (arg1);
12793 op2 = expand_normal (arg2);
12794 if (!REG_P (op0))
12795 op0 = ix86_zero_extend_to_Pmode (op0);
12796 if (!REG_P (op1))
12797 op1 = copy_to_mode_reg (SImode, op1);
12798 if (!REG_P (op2))
12799 op2 = copy_to_mode_reg (SImode, op2);
12800
12801 emit_insn (fcode == IX86_BUILTIN_MONITOR
12802 ? gen_sse3_monitor (Pmode, op0, op1, op2)
12803 : gen_monitorx (Pmode, op0, op1, op2));
12804 return 0;
12805
12806 case IX86_BUILTIN_MWAIT:
12807 arg0 = CALL_EXPR_ARG (exp, 0);
12808 arg1 = CALL_EXPR_ARG (exp, 1);
12809 op0 = expand_normal (arg0);
12810 op1 = expand_normal (arg1);
12811 if (!REG_P (op0))
12812 op0 = copy_to_mode_reg (SImode, op0);
12813 if (!REG_P (op1))
12814 op1 = copy_to_mode_reg (SImode, op1);
12815 emit_insn (gen_sse3_mwait (op0, op1));
12816 return 0;
12817
12818 case IX86_BUILTIN_MWAITX:
12819 arg0 = CALL_EXPR_ARG (exp, 0);
12820 arg1 = CALL_EXPR_ARG (exp, 1);
12821 arg2 = CALL_EXPR_ARG (exp, 2);
12822 op0 = expand_normal (arg0);
12823 op1 = expand_normal (arg1);
12824 op2 = expand_normal (arg2);
12825 if (!REG_P (op0))
12826 op0 = copy_to_mode_reg (SImode, op0);
12827 if (!REG_P (op1))
12828 op1 = copy_to_mode_reg (SImode, op1);
12829 if (!REG_P (op2))
12830 op2 = copy_to_mode_reg (SImode, op2);
12831 emit_insn (gen_mwaitx (op0, op1, op2));
12832 return 0;
12833
12834 case IX86_BUILTIN_UMONITOR:
12835 arg0 = CALL_EXPR_ARG (exp, 0);
12836 op0 = expand_normal (arg0);
12837
12838 op0 = ix86_zero_extend_to_Pmode (op0);
12839 emit_insn (gen_umonitor (Pmode, op0));
12840 return 0;
12841
12842 case IX86_BUILTIN_UMWAIT:
12843 case IX86_BUILTIN_TPAUSE:
12844 arg0 = CALL_EXPR_ARG (exp, 0);
12845 arg1 = CALL_EXPR_ARG (exp, 1);
12846 op0 = expand_normal (arg0);
12847 op1 = expand_normal (arg1);
12848
12849 if (!REG_P (op0))
12850 op0 = copy_to_mode_reg (SImode, op0);
12851
12852 op1 = force_reg (DImode, op1);
12853
12854 if (TARGET_64BIT)
12855 {
12856 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
12857 NULL, 1, OPTAB_DIRECT);
12858 switch (fcode)
12859 {
12860 case IX86_BUILTIN_UMWAIT:
12861 icode = CODE_FOR_umwait_rex64;
12862 break;
12863 case IX86_BUILTIN_TPAUSE:
12864 icode = CODE_FOR_tpause_rex64;
12865 break;
12866 default:
12867 gcc_unreachable ();
12868 }
12869
12870 op2 = gen_lowpart (SImode, op2);
12871 op1 = gen_lowpart (SImode, op1);
12872 pat = GEN_FCN (icode) (op0, op1, op2);
12873 }
12874 else
12875 {
12876 switch (fcode)
12877 {
12878 case IX86_BUILTIN_UMWAIT:
12879 icode = CODE_FOR_umwait;
12880 break;
12881 case IX86_BUILTIN_TPAUSE:
12882 icode = CODE_FOR_tpause;
12883 break;
12884 default:
12885 gcc_unreachable ();
12886 }
12887 pat = GEN_FCN (icode) (op0, op1);
12888 }
12889
12890 if (!pat)
12891 return 0;
12892
12893 emit_insn (pat);
12894
12895 if (target == 0
12896 || !register_operand (target, QImode))
12897 target = gen_reg_rtx (QImode);
12898
12899 pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
12900 const0_rtx);
12901 emit_insn (gen_rtx_SET (target, pat));
12902
12903 return target;
12904
12905 case IX86_BUILTIN_TESTUI:
12906 emit_insn (gen_testui ());
12907
12908 if (target == 0
12909 || !register_operand (target, QImode))
12910 target = gen_reg_rtx (QImode);
12911
12912 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
12913 const0_rtx);
12914 emit_insn (gen_rtx_SET (target, pat));
12915
12916 return target;
12917
12918 case IX86_BUILTIN_CLZERO:
12919 arg0 = CALL_EXPR_ARG (exp, 0);
12920 op0 = expand_normal (arg0);
12921 if (!REG_P (op0))
12922 op0 = ix86_zero_extend_to_Pmode (op0);
12923 emit_insn (gen_clzero (Pmode, op0));
12924 return 0;
12925
12926 case IX86_BUILTIN_CLDEMOTE:
12927 arg0 = CALL_EXPR_ARG (exp, 0);
12928 op0 = expand_normal (arg0);
12929 icode = CODE_FOR_cldemote;
12930 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12931 op0 = ix86_zero_extend_to_Pmode (op0);
12932
12933 emit_insn (gen_cldemote (op0));
12934 return 0;
12935
12936 case IX86_BUILTIN_LOADIWKEY:
12937 {
12938 arg0 = CALL_EXPR_ARG (exp, 0);
12939 arg1 = CALL_EXPR_ARG (exp, 1);
12940 arg2 = CALL_EXPR_ARG (exp, 2);
12941 arg3 = CALL_EXPR_ARG (exp, 3);
12942
12943 op0 = expand_normal (arg0);
12944 op1 = expand_normal (arg1);
12945 op2 = expand_normal (arg2);
12946 op3 = expand_normal (arg3);
12947
12948 if (!REG_P (op0))
12949 op0 = copy_to_mode_reg (V2DImode, op0);
12950 if (!REG_P (op1))
12951 op1 = copy_to_mode_reg (V2DImode, op1);
12952 if (!REG_P (op2))
12953 op2 = copy_to_mode_reg (V2DImode, op2);
12954 if (!REG_P (op3))
12955 op3 = copy_to_mode_reg (SImode, op3);
12956
12957 emit_insn (gen_loadiwkey (op0, op1, op2, op3));
12958
12959 return 0;
12960 }
12961
12962 case IX86_BUILTIN_AESDEC128KLU8:
12963 icode = CODE_FOR_aesdec128klu8;
12964 goto aesdecenc_expand;
12965
12966 case IX86_BUILTIN_AESDEC256KLU8:
12967 icode = CODE_FOR_aesdec256klu8;
12968 goto aesdecenc_expand;
12969
12970 case IX86_BUILTIN_AESENC128KLU8:
12971 icode = CODE_FOR_aesenc128klu8;
12972 goto aesdecenc_expand;
12973
12974 case IX86_BUILTIN_AESENC256KLU8:
12975 icode = CODE_FOR_aesenc256klu8;
12976
12977 aesdecenc_expand:
12978
12979 arg0 = CALL_EXPR_ARG (exp, 0); // __m128i *odata
12980 arg1 = CALL_EXPR_ARG (exp, 1); // __m128i idata
12981 arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
12982
12983 op0 = expand_normal (arg0);
12984 op1 = expand_normal (arg1);
12985 op2 = expand_normal (arg2);
12986
12987 if (!address_operand (op0, V2DImode))
12988 {
12989 op0 = convert_memory_address (Pmode, op0);
12990 op0 = copy_addr_to_reg (op0);
12991 }
12992 op0 = gen_rtx_MEM (V2DImode, op0);
12993
12994 if (!REG_P (op1))
12995 op1 = copy_to_mode_reg (V2DImode, op1);
12996
12997 if (!address_operand (op2, VOIDmode))
12998 {
12999 op2 = convert_memory_address (Pmode, op2);
13000 op2 = copy_addr_to_reg (op2);
13001 }
13002 op2 = gen_rtx_MEM (BLKmode, op2);
13003
13004 emit_insn (GEN_FCN (icode) (op1, op1, op2));
13005
13006 if (target == 0)
13007 target = gen_reg_rtx (QImode);
13008
13009 /* NB: For the aesenc/aesdec Key Locker insns, ZF is set when a runtime
13010 error occurs; in that case the output should be cleared for safety. */
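/* Hedged mapping note: these builtins back the _mm_aesenc128kl_u8
   family of Key Locker intrinsics (see keylockerintrin.h), which
   write the transformed block through their first argument and
   return a flag derived from ZF.  */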
13011 rtx_code_label *ok_label;
13012 rtx tmp;
13013
13014 tmp = gen_rtx_REG (CCZmode, FLAGS_REG);
13015 pat = gen_rtx_EQ (QImode, tmp, const0_rtx);
13016 ok_label = gen_label_rtx ();
13017 emit_cmp_and_jump_insns (tmp, const0_rtx, NE, 0, GET_MODE (tmp),
13018 true, ok_label);
13019 /* The runtime error seldom occurs, so predict the OK path as hot
13020 so that it becomes the fallthrough block. */
13021 predict_jump (REG_BR_PROB_BASE * 90 / 100);
13022
13023 emit_insn (gen_rtx_SET (op1, const0_rtx));
13024
13025 emit_label (ok_label);
13026 emit_insn (gen_rtx_SET (target, pat));
13027 emit_insn (gen_rtx_SET (op0, op1));
13028
13029 return target;
13030
13031 case IX86_BUILTIN_AESDECWIDE128KLU8:
13032 icode = CODE_FOR_aesdecwide128klu8;
13033 goto wideaesdecenc_expand;
13034
13035 case IX86_BUILTIN_AESDECWIDE256KLU8:
13036 icode = CODE_FOR_aesdecwide256klu8;
13037 goto wideaesdecenc_expand;
13038
13039 case IX86_BUILTIN_AESENCWIDE128KLU8:
13040 icode = CODE_FOR_aesencwide128klu8;
13041 goto wideaesdecenc_expand;
13042
13043 case IX86_BUILTIN_AESENCWIDE256KLU8:
13044 icode = CODE_FOR_aesencwide256klu8;
13045
13046 wideaesdecenc_expand:
13047
13048 rtx xmm_regs[8];
13049 rtx op;
13050
13051 arg0 = CALL_EXPR_ARG (exp, 0); // __m128i * odata
13052 arg1 = CALL_EXPR_ARG (exp, 1); // const __m128i * idata
13053 arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
13054
13055 op0 = expand_normal (arg0);
13056 op1 = expand_normal (arg1);
13057 op2 = expand_normal (arg2);
13058
13059 if (!address_operand (op2, VOIDmode))
13060 {
13061 op2 = convert_memory_address (Pmode, op2);
13062 op2 = copy_addr_to_reg (op2);
13063 }
13064 op2 = gen_rtx_MEM (BLKmode, op2);
13065
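/* The wide Key Locker insns implicitly operate on the hard registers
   xmm0-xmm7, so load the eight 128-bit input blocks into them first
   and copy the results back out afterwards.  */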
13066 for (i = 0; i < 8; i++)
13067 {
13068 xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
13069
13070 op = gen_rtx_MEM (V2DImode,
13071 plus_constant (Pmode, op1, (i * 16)));
13072
13073 emit_move_insn (xmm_regs[i], op);
13074 }
13075
13076 emit_insn (GEN_FCN (icode) (op2));
13077
13078 if (target == 0)
13079 target = gen_reg_rtx (QImode);
13080
13081 tmp = gen_rtx_REG (CCZmode, FLAGS_REG);
13082 pat = gen_rtx_EQ (QImode, tmp, const0_rtx);
13083 ok_label = gen_label_rtx ();
13084 emit_cmp_and_jump_insns (tmp, const0_rtx, NE, 0, GET_MODE (tmp),
13085 true, ok_label);
13086 predict_jump (REG_BR_PROB_BASE * 90 / 100);
13087
13088 for (i = 0; i < 8; i++)
13089 emit_insn (gen_rtx_SET (xmm_regs[i], const0_rtx));
13090
13091 emit_label (ok_label);
13092 emit_insn (gen_rtx_SET (target, pat));
13093
13094 for (i = 0; i < 8; i++)
13095 {
13096 op = gen_rtx_MEM (V2DImode,
13097 plus_constant (Pmode, op0, (i * 16)));
13098 emit_move_insn (op, xmm_regs[i]);
13099 }
13100
13101 return target;
13102
13103 case IX86_BUILTIN_ENCODEKEY128U32:
13104 {
13105 rtx op, xmm_regs[7];
13106
13107 arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
13108 arg1 = CALL_EXPR_ARG (exp, 1); // __m128i key
13109 arg2 = CALL_EXPR_ARG (exp, 2); // void *h
13110
13111 op0 = expand_normal (arg0);
13112 op1 = expand_normal (arg1);
13113 op2 = expand_normal (arg2);
13114
13115 if (!REG_P (op0))
13116 op0 = copy_to_mode_reg (SImode, op0);
13117
13118 op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
13119 emit_move_insn (op, op1);
13120
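/* encodekey128 leaves the 384-bit key handle in xmm0-xmm2; record
   those hard registers so the handle can be stored to *h below.  */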
13121 for (i = 0; i < 3; i++)
13122 xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
13123
13124 if (target == 0)
13125 target = gen_reg_rtx (SImode);
13126
13127 emit_insn (gen_encodekey128u32 (target, op0));
13128
13129 for (i = 0; i < 3; i++)
13130 {
13131 op = gen_rtx_MEM (V2DImode,
13132 plus_constant (Pmode, op2, (i * 16)));
13133 emit_move_insn (op, xmm_regs[i]);
13134 }
13135
13136 return target;
13137 }
13138 case IX86_BUILTIN_ENCODEKEY256U32:
13139 {
13140 rtx op, xmm_regs[7];
13141
13142 arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
13143 arg1 = CALL_EXPR_ARG (exp, 1); // __m128i keylow
13144 arg2 = CALL_EXPR_ARG (exp, 2); // __m128i keyhi
13145 arg3 = CALL_EXPR_ARG (exp, 3); // void *h
13146
13147 op0 = expand_normal (arg0);
13148 op1 = expand_normal (arg1);
13149 op2 = expand_normal (arg2);
13150 op3 = expand_normal (arg3);
13151
13152 if (!REG_P (op0))
13153 op0 = copy_to_mode_reg (SImode, op0);
13154
13155 /* Force keylow into xmm0 and keyhi into xmm1.  */
13156 op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
13157 emit_move_insn (op, op1);
13158 op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (1));
13159 emit_move_insn (op, op2);
13160
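/* encodekey256 leaves the 512-bit key handle in xmm0-xmm3.  */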
13161 for (i = 0; i < 4; i++)
13162 xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
13163
13164 if (target == 0)
13165 target = gen_reg_rtx (SImode);
13166
13167 emit_insn (gen_encodekey256u32 (target, op0));
13168
13169 for (i = 0; i < 4; i++)
13170 {
13171 op = gen_rtx_MEM (V2DImode,
13172 plus_constant (Pmode, op3, (i * 16)));
13173 emit_move_insn (op, xmm_regs[i]);
13174 }
13175
13176 return target;
13177 }
13178
13179 case IX86_BUILTIN_PREFETCH:
13180 {
13181 arg0 = CALL_EXPR_ARG (exp, 0); // const void *
13182 arg1 = CALL_EXPR_ARG (exp, 1); // const int
13183 arg2 = CALL_EXPR_ARG (exp, 2); // const int
13184 arg3 = CALL_EXPR_ARG (exp, 3); // const int
13185
13186 op0 = expand_normal (arg0);
13187 op1 = expand_normal (arg1);
13188 op2 = expand_normal (arg2);
13189 op3 = expand_normal (arg3);
13190
13191 if (!CONST_INT_P (op1) || !CONST_INT_P (op2) || !CONST_INT_P (op3))
13192 {
13193 error ("second, third and fourth argument must be a const");
13194 return const0_rtx;
13195 }
13196
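/* A fourth argument of 1 requests an instruction prefetch
   (prefetchit0/prefetchit1); any other value is handled as a data
   prefetch below.  */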
13197 if (INTVAL (op3) == 1)
13198 {
13199 if (INTVAL (op2) < 2 || INTVAL (op2) > 3)
13200 {
13201 error ("invalid third argument");
13202 return const0_rtx;
13203 }
13204
13205 if (TARGET_64BIT && TARGET_PREFETCHI
13206 && local_func_symbolic_operand (op0, GET_MODE (op0)))
13207 emit_insn (gen_prefetchi (op0, op2));
13208 else
13209 {
13210 warning (0, "instruction prefetch applies when in 64-bit mode"
13211 " with RIP-relative addressing and"
13212 " option %<-mprefetchi%>;"
13213 " they stay NOPs otherwise");
13214 emit_insn (gen_nop ());
13215 }
13216 }
13217 else
13218 {
13219 if (!address_operand (op0, VOIDmode))
13220 {
13221 op0 = convert_memory_address (Pmode, op0);
13222 op0 = copy_addr_to_reg (op0);
13223 }
13224
13225 if (INTVAL (op2) < 0 || INTVAL (op2) > 3)
13226 {
13227 warning (0, "invalid third argument to %<__builtin_ia32_prefetch%>; using zero");
13228 op2 = const0_rtx;
13229 }
13230
13231 if (TARGET_3DNOW || TARGET_PREFETCH_SSE
13232 || TARGET_PRFCHW || TARGET_PREFETCHWT1)
13233 emit_insn (gen_prefetch (op0, op1, op2));
13234 else if (!MEM_P (op0) && side_effects_p (op0))
13235 /* Don't do anything with direct references to volatile memory,
13236 but generate code to handle other side effects. */
13237 emit_insn (op0);
13238 }
13239
13240 return 0;
13241 }
13242
13243 case IX86_BUILTIN_PREFETCHI:
13244 {
13245 arg0 = CALL_EXPR_ARG (exp, 0); // const void *
13246 arg1 = CALL_EXPR_ARG (exp, 1); // const int
13247
13248 op0 = expand_normal (arg0);
13249 op1 = expand_normal (arg1);
13250
13251 if (!CONST_INT_P (op1))
13252 {
13253 error ("second argument must be a const");
13254 return const0_rtx;
13255 }
13256
13257 /* GOT/PLT_PIC should not be used for instruction prefetch;
13258 the operand must be a real instruction address. */
13259 if (TARGET_64BIT
13260 && local_func_symbolic_operand (op0, GET_MODE (op0)))
13261 emit_insn (gen_prefetchi (op0, op1));
13262 else
13263 {
13264 /* Ignore the hint. */
13265 warning (0, "instruction prefetch applies when in 64-bit mode"
13266 " with RIP-relative addressing and"
13267 " option %<-mprefetchi%>;"
13268 " they stay NOPs otherwise");
13269 emit_insn (gen_nop ());
13270 }
13271
13272 return 0;
13273 }
13274
13275 case IX86_BUILTIN_VEC_INIT_V2SI:
13276 case IX86_BUILTIN_VEC_INIT_V4HI:
13277 case IX86_BUILTIN_VEC_INIT_V8QI:
13278 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
13279
13280 case IX86_BUILTIN_VEC_EXT_V2DF:
13281 case IX86_BUILTIN_VEC_EXT_V2DI:
13282 case IX86_BUILTIN_VEC_EXT_V4SF:
13283 case IX86_BUILTIN_VEC_EXT_V4SI:
13284 case IX86_BUILTIN_VEC_EXT_V8HI:
13285 case IX86_BUILTIN_VEC_EXT_V2SI:
13286 case IX86_BUILTIN_VEC_EXT_V4HI:
13287 case IX86_BUILTIN_VEC_EXT_V16QI:
13288 return ix86_expand_vec_ext_builtin (exp, target);
13289
13290 case IX86_BUILTIN_VEC_SET_V2DI:
13291 case IX86_BUILTIN_VEC_SET_V4SF:
13292 case IX86_BUILTIN_VEC_SET_V4SI:
13293 case IX86_BUILTIN_VEC_SET_V8HI:
13294 case IX86_BUILTIN_VEC_SET_V4HI:
13295 case IX86_BUILTIN_VEC_SET_V16QI:
13296 return ix86_expand_vec_set_builtin (exp);
13297
13298 case IX86_BUILTIN_NANQ:
13299 case IX86_BUILTIN_NANSQ:
13300 return expand_call (exp, target, ignore);
13301
13302 case IX86_BUILTIN_RDPID:
13303
13304 op0 = gen_reg_rtx (word_mode);
13305
13306 if (TARGET_64BIT)
13307 {
13308 insn = gen_rdpid_rex64 (op0);
13309 op0 = convert_to_mode (SImode, op0, 1);
13310 }
13311 else
13312 insn = gen_rdpid (op0);
13313
13314 emit_insn (insn);
13315
13316 if (target == 0
13317 || !register_operand (target, SImode))
13318 target = gen_reg_rtx (SImode);
13319
13320 emit_move_insn (target, op0);
13321 return target;
13322
13323 case IX86_BUILTIN_2INTERSECTD512:
13324 case IX86_BUILTIN_2INTERSECTQ512:
13325 case IX86_BUILTIN_2INTERSECTD256:
13326 case IX86_BUILTIN_2INTERSECTQ256:
13327 case IX86_BUILTIN_2INTERSECTD128:
13328 case IX86_BUILTIN_2INTERSECTQ128:
13329 arg0 = CALL_EXPR_ARG (exp, 0);
13330 arg1 = CALL_EXPR_ARG (exp, 1);
13331 arg2 = CALL_EXPR_ARG (exp, 2);
13332 arg3 = CALL_EXPR_ARG (exp, 3);
13333 op0 = expand_normal (arg0);
13334 op1 = expand_normal (arg1);
13335 op2 = expand_normal (arg2);
13336 op3 = expand_normal (arg3);
13337
13338 if (!address_operand (op0, VOIDmode))
13339 {
13340 op0 = convert_memory_address (Pmode, op0);
13341 op0 = copy_addr_to_reg (op0);
13342 }
13343 if (!address_operand (op1, VOIDmode))
13344 {
13345 op1 = convert_memory_address (Pmode, op1);
13346 op1 = copy_addr_to_reg (op1);
13347 }
13348
13349 switch (fcode)
13350 {
13351 case IX86_BUILTIN_2INTERSECTD512:
13352 mode4 = P2HImode;
13353 icode = CODE_FOR_avx512vp2intersect_2intersectv16si;
13354 break;
13355 case IX86_BUILTIN_2INTERSECTQ512:
13356 mode4 = P2QImode;
13357 icode = CODE_FOR_avx512vp2intersect_2intersectv8di;
13358 break;
13359 case IX86_BUILTIN_2INTERSECTD256:
13360 mode4 = P2QImode;
13361 icode = CODE_FOR_avx512vp2intersect_2intersectv8si;
13362 break;
13363 case IX86_BUILTIN_2INTERSECTQ256:
13364 mode4 = P2QImode;
13365 icode = CODE_FOR_avx512vp2intersect_2intersectv4di;
13366 break;
13367 case IX86_BUILTIN_2INTERSECTD128:
13368 mode4 = P2QImode;
13369 icode = CODE_FOR_avx512vp2intersect_2intersectv4si;
13370 break;
13371 case IX86_BUILTIN_2INTERSECTQ128:
13372 mode4 = P2QImode;
13373 icode = CODE_FOR_avx512vp2intersect_2intersectv2di;
13374 break;
13375 default:
13376 gcc_unreachable ();
13377 }
13378
13379 mode2 = insn_data[icode].operand[1].mode;
13380 mode3 = insn_data[icode].operand[2].mode;
13381 if (!insn_data[icode].operand[1].predicate (op2, mode2))
13382 op2 = copy_to_mode_reg (mode2, op2);
13383 if (!insn_data[icode].operand[2].predicate (op3, mode3))
13384 op3 = copy_to_mode_reg (mode3, op3);
13385
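/* vp2intersect produces a pair of mask registers, modelled as a
   P2QI/P2HI value; store its low part through the first pointer and
   its high part through the second.  */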
13386 op4 = gen_reg_rtx (mode4);
13387 emit_insn (GEN_FCN (icode) (op4, op2, op3));
13388 mode0 = mode4 == P2HImode ? HImode : QImode;
13389 emit_move_insn (gen_rtx_MEM (mode0, op0),
13390 gen_lowpart (mode0, op4));
13391 emit_move_insn (gen_rtx_MEM (mode0, op1),
13392 gen_highpart (mode0, op4));
13393
13394 return 0;
13395
13396 case IX86_BUILTIN_RDPMC:
13397 case IX86_BUILTIN_RDTSC:
13398 case IX86_BUILTIN_RDTSCP:
13399 case IX86_BUILTIN_XGETBV:
13400
13401 op0 = gen_reg_rtx (DImode);
13402 op1 = gen_reg_rtx (DImode);
13403
13404 if (fcode == IX86_BUILTIN_RDPMC)
13405 {
13406 arg0 = CALL_EXPR_ARG (exp, 0);
13407 op2 = expand_normal (arg0);
13408 if (!register_operand (op2, SImode))
13409 op2 = copy_to_mode_reg (SImode, op2);
13410
13411 insn = (TARGET_64BIT
13412 ? gen_rdpmc_rex64 (op0, op1, op2)
13413 : gen_rdpmc (op0, op2));
13414 emit_insn (insn);
13415 }
13416 else if (fcode == IX86_BUILTIN_XGETBV)
13417 {
13418 arg0 = CALL_EXPR_ARG (exp, 0);
13419 op2 = expand_normal (arg0);
13420 if (!register_operand (op2, SImode))
13421 op2 = copy_to_mode_reg (SImode, op2);
13422
13423 insn = (TARGET_64BIT
13424 ? gen_xgetbv_rex64 (op0, op1, op2)
13425 : gen_xgetbv (op0, op2));
13426 emit_insn (insn);
13427 }
13428 else if (fcode == IX86_BUILTIN_RDTSC)
13429 {
13430 insn = (TARGET_64BIT
13431 ? gen_rdtsc_rex64 (op0, op1)
13432 : gen_rdtsc (op0));
13433 emit_insn (insn);
13434 }
13435 else
13436 {
13437 op2 = gen_reg_rtx (SImode);
13438
13439 insn = (TARGET_64BIT
13440 ? gen_rdtscp_rex64 (op0, op1, op2)
13441 : gen_rdtscp (op0, op2));
13442 emit_insn (insn);
13443
13444 arg0 = CALL_EXPR_ARG (exp, 0);
13445 op4 = expand_normal (arg0);
13446 if (!address_operand (op4, VOIDmode))
13447 {
13448 op4 = convert_memory_address (Pmode, op4);
13449 op4 = copy_addr_to_reg (op4);
13450 }
13451 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
13452 }
13453
13454 if (target == 0
13455 || !register_operand (target, DImode))
13456 target = gen_reg_rtx (DImode);
13457
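/* These insns return the counter as two 32-bit halves in EDX:EAX; on
   64-bit targets recombine them as (hi << 32) | lo.  A hedged usage
   sketch at the intrinsic level:
     unsigned int aux;
     unsigned long long tsc = __rdtscp (&aux);  */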
13458 if (TARGET_64BIT)
13459 {
13460 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
13461 op1, 1, OPTAB_DIRECT);
13462 op0 = expand_simple_binop (DImode, IOR, op0, op1,
13463 op0, 1, OPTAB_DIRECT);
13464 }
13465
13466 emit_move_insn (target, op0);
13467 return target;
13468
13469 case IX86_BUILTIN_ENQCMD:
13470 case IX86_BUILTIN_ENQCMDS:
13471 case IX86_BUILTIN_MOVDIR64B:
13472
13473 arg0 = CALL_EXPR_ARG (exp, 0);
13474 arg1 = CALL_EXPR_ARG (exp, 1);
13475 op0 = expand_normal (arg0);
13476 op1 = expand_normal (arg1);
13477
13478 op0 = ix86_zero_extend_to_Pmode (op0);
13479 if (!address_operand (op1, VOIDmode))
13480 {
13481 op1 = convert_memory_address (Pmode, op1);
13482 op1 = copy_addr_to_reg (op1);
13483 }
13484 op1 = gen_rtx_MEM (XImode, op1);
13485
13486 if (fcode == IX86_BUILTIN_MOVDIR64B)
13487 {
13488 emit_insn (gen_movdir64b (Pmode, op0, op1));
13489 return 0;
13490 }
13491 else
13492 {
13493 if (target == 0
13494 || !register_operand (target, SImode))
13495 target = gen_reg_rtx (SImode);
13496
13497 emit_move_insn (target, const0_rtx);
13498 target = gen_rtx_SUBREG (QImode, target, 0);
13499
13500 int unspecv = (fcode == IX86_BUILTIN_ENQCMD
13501 ? UNSPECV_ENQCMD
13502 : UNSPECV_ENQCMDS);
13503 icode = code_for_enqcmd (unspecv, Pmode);
13504 emit_insn (GEN_FCN (icode) (op0, op1));
13505
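/* enqcmd/enqcmds report whether the command was accepted through ZF;
   copy that flag into the low byte of the result with a
   STRICT_LOW_PART set so the rest of the register stays zero.  */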
13506 emit_insn
13507 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
13508 gen_rtx_fmt_ee (EQ, QImode,
13509 gen_rtx_REG (CCZmode, FLAGS_REG),
13510 const0_rtx)));
13511 return SUBREG_REG (target);
13512 }
13513
13514 case IX86_BUILTIN_FXSAVE:
13515 case IX86_BUILTIN_FXRSTOR:
13516 case IX86_BUILTIN_FXSAVE64:
13517 case IX86_BUILTIN_FXRSTOR64:
13518 case IX86_BUILTIN_FNSTENV:
13519 case IX86_BUILTIN_FLDENV:
13520 mode0 = BLKmode;
13521 switch (fcode)
13522 {
13523 case IX86_BUILTIN_FXSAVE:
13524 icode = CODE_FOR_fxsave;
13525 break;
13526 case IX86_BUILTIN_FXRSTOR:
13527 icode = CODE_FOR_fxrstor;
13528 break;
13529 case IX86_BUILTIN_FXSAVE64:
13530 icode = CODE_FOR_fxsave64;
13531 break;
13532 case IX86_BUILTIN_FXRSTOR64:
13533 icode = CODE_FOR_fxrstor64;
13534 break;
13535 case IX86_BUILTIN_FNSTENV:
13536 icode = CODE_FOR_fnstenv;
13537 break;
13538 case IX86_BUILTIN_FLDENV:
13539 icode = CODE_FOR_fldenv;
13540 break;
13541 default:
13542 gcc_unreachable ();
13543 }
13544
13545 arg0 = CALL_EXPR_ARG (exp, 0);
13546 op0 = expand_normal (arg0);
13547
13548 if (!address_operand (op0, VOIDmode))
13549 {
13550 op0 = convert_memory_address (Pmode, op0);
13551 op0 = copy_addr_to_reg (op0);
13552 }
13553 op0 = gen_rtx_MEM (mode0, op0);
13554
13555 pat = GEN_FCN (icode) (op0);
13556 if (pat)
13557 emit_insn (pat);
13558 return 0;
13559
13560 case IX86_BUILTIN_XSETBV:
13561 arg0 = CALL_EXPR_ARG (exp, 0);
13562 arg1 = CALL_EXPR_ARG (exp, 1);
13563 op0 = expand_normal (arg0);
13564 op1 = expand_normal (arg1);
13565
13566 if (!REG_P (op0))
13567 op0 = copy_to_mode_reg (SImode, op0);
13568
13569 op1 = force_reg (DImode, op1);
13570
13571 if (TARGET_64BIT)
13572 {
13573 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
13574 NULL, 1, OPTAB_DIRECT);
13575
13576 icode = CODE_FOR_xsetbv_rex64;
13577
13578 op2 = gen_lowpart (SImode, op2);
13579 op1 = gen_lowpart (SImode, op1);
13580 pat = GEN_FCN (icode) (op0, op1, op2);
13581 }
13582 else
13583 {
13584 icode = CODE_FOR_xsetbv;
13585
13586 pat = GEN_FCN (icode) (op0, op1);
13587 }
13588 if (pat)
13589 emit_insn (pat);
13590 return 0;
13591
13592 case IX86_BUILTIN_XSAVE:
13593 case IX86_BUILTIN_XRSTOR:
13594 case IX86_BUILTIN_XSAVE64:
13595 case IX86_BUILTIN_XRSTOR64:
13596 case IX86_BUILTIN_XSAVEOPT:
13597 case IX86_BUILTIN_XSAVEOPT64:
13598 case IX86_BUILTIN_XSAVES:
13599 case IX86_BUILTIN_XRSTORS:
13600 case IX86_BUILTIN_XSAVES64:
13601 case IX86_BUILTIN_XRSTORS64:
13602 case IX86_BUILTIN_XSAVEC:
13603 case IX86_BUILTIN_XSAVEC64:
13604 arg0 = CALL_EXPR_ARG (exp, 0);
13605 arg1 = CALL_EXPR_ARG (exp, 1);
13606 op0 = expand_normal (arg0);
13607 op1 = expand_normal (arg1);
13608
13609 if (!address_operand (op0, VOIDmode))
13610 {
13611 op0 = convert_memory_address (Pmode, op0);
13612 op0 = copy_addr_to_reg (op0);
13613 }
13614 op0 = gen_rtx_MEM (BLKmode, op0);
13615
13616 op1 = force_reg (DImode, op1);
13617
13618 if (TARGET_64BIT)
13619 {
13620 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
13621 NULL, 1, OPTAB_DIRECT);
13622 switch (fcode)
13623 {
13624 case IX86_BUILTIN_XSAVE:
13625 icode = CODE_FOR_xsave_rex64;
13626 break;
13627 case IX86_BUILTIN_XRSTOR:
13628 icode = CODE_FOR_xrstor_rex64;
13629 break;
13630 case IX86_BUILTIN_XSAVE64:
13631 icode = CODE_FOR_xsave64;
13632 break;
13633 case IX86_BUILTIN_XRSTOR64:
13634 icode = CODE_FOR_xrstor64;
13635 break;
13636 case IX86_BUILTIN_XSAVEOPT:
13637 icode = CODE_FOR_xsaveopt_rex64;
13638 break;
13639 case IX86_BUILTIN_XSAVEOPT64:
13640 icode = CODE_FOR_xsaveopt64;
13641 break;
13642 case IX86_BUILTIN_XSAVES:
13643 icode = CODE_FOR_xsaves_rex64;
13644 break;
13645 case IX86_BUILTIN_XRSTORS:
13646 icode = CODE_FOR_xrstors_rex64;
13647 break;
13648 case IX86_BUILTIN_XSAVES64:
13649 icode = CODE_FOR_xsaves64;
13650 break;
13651 case IX86_BUILTIN_XRSTORS64:
13652 icode = CODE_FOR_xrstors64;
13653 break;
13654 case IX86_BUILTIN_XSAVEC:
13655 icode = CODE_FOR_xsavec_rex64;
13656 break;
13657 case IX86_BUILTIN_XSAVEC64:
13658 icode = CODE_FOR_xsavec64;
13659 break;
13660 default:
13661 gcc_unreachable ();
13662 }
13663
13664 op2 = gen_lowpart (SImode, op2);
13665 op1 = gen_lowpart (SImode, op1);
13666 pat = GEN_FCN (icode) (op0, op1, op2);
13667 }
13668 else
13669 {
13670 switch (fcode)
13671 {
13672 case IX86_BUILTIN_XSAVE:
13673 icode = CODE_FOR_xsave;
13674 break;
13675 case IX86_BUILTIN_XRSTOR:
13676 icode = CODE_FOR_xrstor;
13677 break;
13678 case IX86_BUILTIN_XSAVEOPT:
13679 icode = CODE_FOR_xsaveopt;
13680 break;
13681 case IX86_BUILTIN_XSAVES:
13682 icode = CODE_FOR_xsaves;
13683 break;
13684 case IX86_BUILTIN_XRSTORS:
13685 icode = CODE_FOR_xrstors;
13686 break;
13687 case IX86_BUILTIN_XSAVEC:
13688 icode = CODE_FOR_xsavec;
13689 break;
13690 default:
13691 gcc_unreachable ();
13692 }
13693 pat = GEN_FCN (icode) (op0, op1);
13694 }
13695
13696 if (pat)
13697 emit_insn (pat);
13698 return 0;
13699
13700 case IX86_BUILTIN_LLWPCB:
13701 arg0 = CALL_EXPR_ARG (exp, 0);
13702 op0 = expand_normal (arg0);
13703
13704 if (!register_operand (op0, Pmode))
13705 op0 = ix86_zero_extend_to_Pmode (op0);
13706 emit_insn (gen_lwp_llwpcb (Pmode, op0));
13707 return 0;
13708
13709 case IX86_BUILTIN_SLWPCB:
13710 if (!target
13711 || !register_operand (target, Pmode))
13712 target = gen_reg_rtx (Pmode);
13713 emit_insn (gen_lwp_slwpcb (Pmode, target));
13714 return target;
13715
13716 case IX86_BUILTIN_LWPVAL32:
13717 case IX86_BUILTIN_LWPVAL64:
13718 case IX86_BUILTIN_LWPINS32:
13719 case IX86_BUILTIN_LWPINS64:
13720 mode = ((fcode == IX86_BUILTIN_LWPVAL32
13721 || fcode == IX86_BUILTIN_LWPINS32)
13722 ? SImode : DImode);
13723
13724 if (fcode == IX86_BUILTIN_LWPVAL32
13725 || fcode == IX86_BUILTIN_LWPVAL64)
13726 icode = code_for_lwp_lwpval (mode);
13727 else
13728 icode = code_for_lwp_lwpins (mode);
13729
13730 arg0 = CALL_EXPR_ARG (exp, 0);
13731 arg1 = CALL_EXPR_ARG (exp, 1);
13732 arg2 = CALL_EXPR_ARG (exp, 2);
13733 op0 = expand_normal (arg0);
13734 op1 = expand_normal (arg1);
13735 op2 = expand_normal (arg2);
13736 mode0 = insn_data[icode].operand[0].mode;
13737
13738 if (!insn_data[icode].operand[0].predicate (op0, mode0))
13739 op0 = copy_to_mode_reg (mode0, op0);
13740 if (!insn_data[icode].operand[1].predicate (op1, SImode))
13741 op1 = copy_to_mode_reg (SImode, op1);
13742
13743 if (!CONST_INT_P (op2))
13744 {
13745 error ("the last argument must be a 32-bit immediate");
13746 return const0_rtx;
13747 }
13748
13749 emit_insn (GEN_FCN (icode) (op0, op1, op2));
13750
13751 if (fcode == IX86_BUILTIN_LWPINS32
13752 || fcode == IX86_BUILTIN_LWPINS64)
13753 {
13754 if (target == 0
13755 || !nonimmediate_operand (target, QImode))
13756 target = gen_reg_rtx (QImode);
13757
13758 pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
13759 const0_rtx);
13760 emit_insn (gen_rtx_SET (target, pat));
13761
13762 return target;
13763 }
13764 else
13765 return 0;
13766
13767 case IX86_BUILTIN_BEXTRI32:
13768 case IX86_BUILTIN_BEXTRI64:
13769 mode = (fcode == IX86_BUILTIN_BEXTRI32 ? SImode : DImode);
13770
13771 arg0 = CALL_EXPR_ARG (exp, 0);
13772 arg1 = CALL_EXPR_ARG (exp, 1);
13773 op0 = expand_normal (arg0);
13774 op1 = expand_normal (arg1);
13775
13776 if (!CONST_INT_P (op1))
13777 {
13778 error ("last argument must be an immediate");
13779 return const0_rtx;
13780 }
13781 else
13782 {
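/* The bextri control word packs the start bit in bits 7:0 and the
   field length in bits 15:8.  A hedged sketch of the corresponding
   TBM intrinsic:
     unsigned int field = __bextri_u32 (x, (len << 8) | start);  */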
13783 unsigned char lsb_index = UINTVAL (op1);
13784 unsigned char length = UINTVAL (op1) >> 8;
13785
13786 unsigned char bitsize = GET_MODE_BITSIZE (mode);
13787
13788 icode = code_for_tbm_bextri (mode);
13789
13790 mode1 = insn_data[icode].operand[1].mode;
13791 if (!insn_data[icode].operand[1].predicate (op0, mode1))
13792 op0 = copy_to_mode_reg (mode1, op0);
13793
13794 mode0 = insn_data[icode].operand[0].mode;
13795 if (target == 0
13796 || !register_operand (target, mode0))
13797 target = gen_reg_rtx (mode0);
13798
13799 if (length == 0 || lsb_index >= bitsize)
13800 {
13801 emit_move_insn (target, const0_rtx);
13802 return target;
13803 }
13804
13805 if (length + lsb_index > bitsize)
13806 length = bitsize - lsb_index;
13807
13808 op1 = GEN_INT (length);
13809 op2 = GEN_INT (lsb_index);
13810
13811 emit_insn (GEN_FCN (icode) (target, op0, op1, op2));
13812 return target;
13813 }
13814
13815 case IX86_BUILTIN_RDRAND16_STEP:
13816 mode = HImode;
13817 goto rdrand_step;
13818
13819 case IX86_BUILTIN_RDRAND32_STEP:
13820 mode = SImode;
13821 goto rdrand_step;
13822
13823 case IX86_BUILTIN_RDRAND64_STEP:
13824 mode = DImode;
13825
13826 rdrand_step:
13827 arg0 = CALL_EXPR_ARG (exp, 0);
13828 op1 = expand_normal (arg0);
13829 if (!address_operand (op1, VOIDmode))
13830 {
13831 op1 = convert_memory_address (Pmode, op1);
13832 op1 = copy_addr_to_reg (op1);
13833 }
13834
13835 op0 = gen_reg_rtx (mode);
13836 emit_insn (gen_rdrand (mode, op0));
13837
13838 emit_move_insn (gen_rtx_MEM (mode, op1), op0);
13839
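/* rdrand sets CF on success and clears the destination on failure,
   so the conditional move below can reuse the (then zero) value as
   the failure result.  Hedged intrinsic-level sketch:
     unsigned int val;
     int ok = _rdrand32_step (&val);  */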
13840 op1 = force_reg (SImode, const1_rtx);
13841
13842 /* Emit SImode conditional move. */
13843 if (mode == HImode)
13844 {
13845 if (TARGET_ZERO_EXTEND_WITH_AND
13846 && optimize_function_for_speed_p (cfun))
13847 {
13848 op2 = force_reg (SImode, const0_rtx);
13849
13850 emit_insn (gen_movstricthi
13851 (gen_lowpart (HImode, op2), op0));
13852 }
13853 else
13854 {
13855 op2 = gen_reg_rtx (SImode);
13856
13857 emit_insn (gen_zero_extendhisi2 (op2, op0));
13858 }
13859 }
13860 else if (mode == SImode)
13861 op2 = op0;
13862 else
13863 op2 = gen_rtx_SUBREG (SImode, op0, 0);
13864
13865 if (target == 0
13866 || !register_operand (target, SImode))
13867 target = gen_reg_rtx (SImode);
13868
13869 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
13870 const0_rtx);
13871 emit_insn (gen_rtx_SET (target,
13872 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
13873 return target;
13874
13875 case IX86_BUILTIN_RDSEED16_STEP:
13876 mode = HImode;
13877 goto rdseed_step;
13878
13879 case IX86_BUILTIN_RDSEED32_STEP:
13880 mode = SImode;
13881 goto rdseed_step;
13882
13883 case IX86_BUILTIN_RDSEED64_STEP:
13884 mode = DImode;
13885
13886 rdseed_step:
13887 arg0 = CALL_EXPR_ARG (exp, 0);
13888 op1 = expand_normal (arg0);
13889 if (!address_operand (op1, VOIDmode))
13890 {
13891 op1 = convert_memory_address (Pmode, op1);
13892 op1 = copy_addr_to_reg (op1);
13893 }
13894
13895 op0 = gen_reg_rtx (mode);
13896 emit_insn (gen_rdseed (mode, op0));
13897
13898 emit_move_insn (gen_rtx_MEM (mode, op1), op0);
13899
13900 op2 = gen_reg_rtx (QImode);
13901
13902 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
13903 const0_rtx);
13904 emit_insn (gen_rtx_SET (op2, pat));
13905
13906 if (target == 0
13907 || !register_operand (target, SImode))
13908 target = gen_reg_rtx (SImode);
13909
13910 emit_insn (gen_zero_extendqisi2 (target, op2));
13911 return target;
13912
13913 case IX86_BUILTIN_SBB32:
13914 icode = CODE_FOR_subborrowsi;
13915 icode2 = CODE_FOR_subborrowsi_0;
13916 mode0 = SImode;
13917 mode1 = DImode;
13918 mode2 = CCmode;
13919 goto handlecarry;
13920
13921 case IX86_BUILTIN_SBB64:
13922 icode = CODE_FOR_subborrowdi;
13923 icode2 = CODE_FOR_subborrowdi_0;
13924 mode0 = DImode;
13925 mode1 = TImode;
13926 mode2 = CCmode;
13927 goto handlecarry;
13928
13929 case IX86_BUILTIN_ADDCARRYX32:
13930 icode = CODE_FOR_addcarrysi;
13931 icode2 = CODE_FOR_addcarrysi_0;
13932 mode0 = SImode;
13933 mode1 = DImode;
13934 mode2 = CCCmode;
13935 goto handlecarry;
13936
13937 case IX86_BUILTIN_ADDCARRYX64:
13938 icode = CODE_FOR_addcarrydi;
13939 icode2 = CODE_FOR_addcarrydi_0;
13940 mode0 = DImode;
13941 mode1 = TImode;
13942 mode2 = CCCmode;
13943
13944 handlecarry:
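/* Common expansion for the add-with-carry / subtract-with-borrow
   builtins.  Hedged sketch of a typical use at the intrinsic level:
     unsigned int sum;
     unsigned char c_out = _addcarry_u32 (c_in, a, b, &sum);  */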
13945 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
13946 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
13947 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
13948 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
13949
13950 op1 = expand_normal (arg0);
13951 if (!integer_zerop (arg0))
13952 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
13953
13954 op2 = expand_normal (arg1);
13955 if (!register_operand (op2, mode0))
13956 op2 = copy_to_mode_reg (mode0, op2);
13957
13958 op3 = expand_normal (arg2);
13959 if (!register_operand (op3, mode0))
13960 op3 = copy_to_mode_reg (mode0, op3);
13961
13962 op4 = expand_normal (arg3);
13963 if (!address_operand (op4, VOIDmode))
13964 {
13965 op4 = convert_memory_address (Pmode, op4);
13966 op4 = copy_addr_to_reg (op4);
13967 }
13968
13969 op0 = gen_reg_rtx (mode0);
13970 if (integer_zerop (arg0))
13971 {
13972 /* If arg0 is 0, optimize right away into an add or sub
13973 instruction that sets the CCCmode flags. */
13974 op1 = gen_rtx_REG (mode2, FLAGS_REG);
13975 emit_insn (GEN_FCN (icode2) (op0, op2, op3));
13976 }
13977 else
13978 {
13979 /* Generate CF from input operand. */
13980 emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
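/* Adding 0xff to the carry-in byte sets CF exactly when the carry-in
   is nonzero; the adc/sbb pattern below then consumes CF.  */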
13981
13982 /* Generate instruction that consumes CF. */
13983 op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
13984 pat = gen_rtx_LTU (mode1, op1, const0_rtx);
13985 pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
13986 emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
13987 }
13988
13989 /* Return current CF value. */
13990 if (target == 0)
13991 target = gen_reg_rtx (QImode);
13992
13993 pat = gen_rtx_LTU (QImode, op1, const0_rtx);
13994 emit_insn (gen_rtx_SET (target, pat));
13995
13996 /* Store the result. */
13997 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
13998
13999 return target;
14000
14001 case IX86_BUILTIN_READ_FLAGS:
14002 if (ignore)
14003 return const0_rtx;
14004
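/* Read the flags register by pushing it and popping the value into a
   general register (pushf; pop), e.g. for the __readeflags intrinsic.  */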
14005 emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
14006
14007 if (optimize
14008 || target == NULL_RTX
14009 || !nonimmediate_operand (target, word_mode)
14010 || GET_MODE (target) != word_mode)
14011 target = gen_reg_rtx (word_mode);
14012
14013 emit_insn (gen_pop (target));
14014 return target;
14015
14016 case IX86_BUILTIN_WRITE_FLAGS:
14017
14018 arg0 = CALL_EXPR_ARG (exp, 0);
14019 op0 = expand_normal (arg0);
14020 if (!general_no_elim_operand (op0, word_mode))
14021 op0 = copy_to_mode_reg (word_mode, op0);
14022
14023 emit_insn (gen_push (op0));
14024 emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
14025 return 0;
14026
14027 case IX86_BUILTIN_KTESTC8:
14028 icode = CODE_FOR_ktestqi;
14029 mode3 = CCCmode;
14030 goto kortest;
14031
14032 case IX86_BUILTIN_KTESTZ8:
14033 icode = CODE_FOR_ktestqi;
14034 mode3 = CCZmode;
14035 goto kortest;
14036
14037 case IX86_BUILTIN_KTESTC16:
14038 icode = CODE_FOR_ktesthi;
14039 mode3 = CCCmode;
14040 goto kortest;
14041
14042 case IX86_BUILTIN_KTESTZ16:
14043 icode = CODE_FOR_ktesthi;
14044 mode3 = CCZmode;
14045 goto kortest;
14046
14047 case IX86_BUILTIN_KTESTC32:
14048 icode = CODE_FOR_ktestsi;
14049 mode3 = CCCmode;
14050 goto kortest;
14051
14052 case IX86_BUILTIN_KTESTZ32:
14053 icode = CODE_FOR_ktestsi;
14054 mode3 = CCZmode;
14055 goto kortest;
14056
14057 case IX86_BUILTIN_KTESTC64:
14058 icode = CODE_FOR_ktestdi;
14059 mode3 = CCCmode;
14060 goto kortest;
14061
14062 case IX86_BUILTIN_KTESTZ64:
14063 icode = CODE_FOR_ktestdi;
14064 mode3 = CCZmode;
14065 goto kortest;
14066
14067 case IX86_BUILTIN_KORTESTC8:
14068 icode = CODE_FOR_kortestqi;
14069 mode3 = CCCmode;
14070 goto kortest;
14071
14072 case IX86_BUILTIN_KORTESTZ8:
14073 icode = CODE_FOR_kortestqi;
14074 mode3 = CCZmode;
14075 goto kortest;
14076
14077 case IX86_BUILTIN_KORTESTC16:
14078 icode = CODE_FOR_kortesthi;
14079 mode3 = CCCmode;
14080 goto kortest;
14081
14082 case IX86_BUILTIN_KORTESTZ16:
14083 icode = CODE_FOR_kortesthi;
14084 mode3 = CCZmode;
14085 goto kortest;
14086
14087 case IX86_BUILTIN_KORTESTC32:
14088 icode = CODE_FOR_kortestsi;
14089 mode3 = CCCmode;
14090 goto kortest;
14091
14092 case IX86_BUILTIN_KORTESTZ32:
14093 icode = CODE_FOR_kortestsi;
14094 mode3 = CCZmode;
14095 goto kortest;
14096
14097 case IX86_BUILTIN_KORTESTC64:
14098 icode = CODE_FOR_kortestdi;
14099 mode3 = CCCmode;
14100 goto kortest;
14101
14102 case IX86_BUILTIN_KORTESTZ64:
14103 icode = CODE_FOR_kortestdi;
14104 mode3 = CCZmode;
14105
14106 kortest:
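/* Shared expansion for the ktest/kortest builtins: run the mask test
   and return the requested flag (CF for the *C variants, ZF for the
   *Z variants) via setcc.  */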
14107 arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
14108 arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
14109 op0 = expand_normal (arg0);
14110 op1 = expand_normal (arg1);
14111
14112 mode0 = insn_data[icode].operand[0].mode;
14113 mode1 = insn_data[icode].operand[1].mode;
14114
14115 if (GET_MODE (op0) != VOIDmode)
14116 op0 = force_reg (GET_MODE (op0), op0);
14117
14118 op0 = gen_lowpart (mode0, op0);
14119
14120 if (!insn_data[icode].operand[0].predicate (op0, mode0))
14121 op0 = copy_to_mode_reg (mode0, op0);
14122
14123 if (GET_MODE (op1) != VOIDmode)
14124 op1 = force_reg (GET_MODE (op1), op1);
14125
14126 op1 = gen_lowpart (mode1, op1);
14127
14128 if (!insn_data[icode].operand[1].predicate (op1, mode1))
14129 op1 = copy_to_mode_reg (mode1, op1);
14130
14131 target = gen_reg_rtx (QImode);
14132
14133 /* Emit kortest. */
14134 emit_insn (GEN_FCN (icode) (op0, op1));
14135 /* And use setcc to return result from flags. */
14136 ix86_expand_setcc (target, EQ,
14137 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
14138 return target;
14139
14140 case IX86_BUILTIN_GATHERSIV2DF:
14141 icode = CODE_FOR_avx2_gathersiv2df;
14142 goto gather_gen;
14143 case IX86_BUILTIN_GATHERSIV4DF:
14144 icode = CODE_FOR_avx2_gathersiv4df;
14145 goto gather_gen;
14146 case IX86_BUILTIN_GATHERDIV2DF:
14147 icode = CODE_FOR_avx2_gatherdiv2df;
14148 goto gather_gen;
14149 case IX86_BUILTIN_GATHERDIV4DF:
14150 icode = CODE_FOR_avx2_gatherdiv4df;
14151 goto gather_gen;
14152 case IX86_BUILTIN_GATHERSIV4SF:
14153 icode = CODE_FOR_avx2_gathersiv4sf;
14154 goto gather_gen;
14155 case IX86_BUILTIN_GATHERSIV8SF:
14156 icode = CODE_FOR_avx2_gathersiv8sf;
14157 goto gather_gen;
14158 case IX86_BUILTIN_GATHERDIV4SF:
14159 icode = CODE_FOR_avx2_gatherdiv4sf;
14160 goto gather_gen;
14161 case IX86_BUILTIN_GATHERDIV8SF:
14162 icode = CODE_FOR_avx2_gatherdiv8sf;
14163 goto gather_gen;
14164 case IX86_BUILTIN_GATHERSIV2DI:
14165 icode = CODE_FOR_avx2_gathersiv2di;
14166 goto gather_gen;
14167 case IX86_BUILTIN_GATHERSIV4DI:
14168 icode = CODE_FOR_avx2_gathersiv4di;
14169 goto gather_gen;
14170 case IX86_BUILTIN_GATHERDIV2DI:
14171 icode = CODE_FOR_avx2_gatherdiv2di;
14172 goto gather_gen;
14173 case IX86_BUILTIN_GATHERDIV4DI:
14174 icode = CODE_FOR_avx2_gatherdiv4di;
14175 goto gather_gen;
14176 case IX86_BUILTIN_GATHERSIV4SI:
14177 icode = CODE_FOR_avx2_gathersiv4si;
14178 goto gather_gen;
14179 case IX86_BUILTIN_GATHERSIV8SI:
14180 icode = CODE_FOR_avx2_gathersiv8si;
14181 goto gather_gen;
14182 case IX86_BUILTIN_GATHERDIV4SI:
14183 icode = CODE_FOR_avx2_gatherdiv4si;
14184 goto gather_gen;
14185 case IX86_BUILTIN_GATHERDIV8SI:
14186 icode = CODE_FOR_avx2_gatherdiv8si;
14187 goto gather_gen;
14188 case IX86_BUILTIN_GATHERALTSIV4DF:
14189 icode = CODE_FOR_avx2_gathersiv4df;
14190 goto gather_gen;
14191 case IX86_BUILTIN_GATHERALTDIV8SF:
14192 icode = CODE_FOR_avx2_gatherdiv8sf;
14193 goto gather_gen;
14194 case IX86_BUILTIN_GATHERALTSIV4DI:
14195 icode = CODE_FOR_avx2_gathersiv4di;
14196 goto gather_gen;
14197 case IX86_BUILTIN_GATHERALTDIV8SI:
14198 icode = CODE_FOR_avx2_gatherdiv8si;
14199 goto gather_gen;
14200 case IX86_BUILTIN_GATHER3SIV16SF:
14201 icode = CODE_FOR_avx512f_gathersiv16sf;
14202 goto gather_gen;
14203 case IX86_BUILTIN_GATHER3SIV8DF:
14204 icode = CODE_FOR_avx512f_gathersiv8df;
14205 goto gather_gen;
14206 case IX86_BUILTIN_GATHER3DIV16SF:
14207 icode = CODE_FOR_avx512f_gatherdiv16sf;
14208 goto gather_gen;
14209 case IX86_BUILTIN_GATHER3DIV8DF:
14210 icode = CODE_FOR_avx512f_gatherdiv8df;
14211 goto gather_gen;
14212 case IX86_BUILTIN_GATHER3SIV16SI:
14213 icode = CODE_FOR_avx512f_gathersiv16si;
14214 goto gather_gen;
14215 case IX86_BUILTIN_GATHER3SIV8DI:
14216 icode = CODE_FOR_avx512f_gathersiv8di;
14217 goto gather_gen;
14218 case IX86_BUILTIN_GATHER3DIV16SI:
14219 icode = CODE_FOR_avx512f_gatherdiv16si;
14220 goto gather_gen;
14221 case IX86_BUILTIN_GATHER3DIV8DI:
14222 icode = CODE_FOR_avx512f_gatherdiv8di;
14223 goto gather_gen;
14224 case IX86_BUILTIN_GATHER3ALTSIV8DF:
14225 icode = CODE_FOR_avx512f_gathersiv8df;
14226 goto gather_gen;
14227 case IX86_BUILTIN_GATHER3ALTDIV16SF:
14228 icode = CODE_FOR_avx512f_gatherdiv16sf;
14229 goto gather_gen;
14230 case IX86_BUILTIN_GATHER3ALTSIV8DI:
14231 icode = CODE_FOR_avx512f_gathersiv8di;
14232 goto gather_gen;
14233 case IX86_BUILTIN_GATHER3ALTDIV16SI:
14234 icode = CODE_FOR_avx512f_gatherdiv16si;
14235 goto gather_gen;
14236 case IX86_BUILTIN_GATHER3SIV2DF:
14237 icode = CODE_FOR_avx512vl_gathersiv2df;
14238 goto gather_gen;
14239 case IX86_BUILTIN_GATHER3SIV4DF:
14240 icode = CODE_FOR_avx512vl_gathersiv4df;
14241 goto gather_gen;
14242 case IX86_BUILTIN_GATHER3DIV2DF:
14243 icode = CODE_FOR_avx512vl_gatherdiv2df;
14244 goto gather_gen;
14245 case IX86_BUILTIN_GATHER3DIV4DF:
14246 icode = CODE_FOR_avx512vl_gatherdiv4df;
14247 goto gather_gen;
14248 case IX86_BUILTIN_GATHER3SIV4SF:
14249 icode = CODE_FOR_avx512vl_gathersiv4sf;
14250 goto gather_gen;
14251 case IX86_BUILTIN_GATHER3SIV8SF:
14252 icode = CODE_FOR_avx512vl_gathersiv8sf;
14253 goto gather_gen;
14254 case IX86_BUILTIN_GATHER3DIV4SF:
14255 icode = CODE_FOR_avx512vl_gatherdiv4sf;
14256 goto gather_gen;
14257 case IX86_BUILTIN_GATHER3DIV8SF:
14258 icode = CODE_FOR_avx512vl_gatherdiv8sf;
14259 goto gather_gen;
14260 case IX86_BUILTIN_GATHER3SIV2DI:
14261 icode = CODE_FOR_avx512vl_gathersiv2di;
14262 goto gather_gen;
14263 case IX86_BUILTIN_GATHER3SIV4DI:
14264 icode = CODE_FOR_avx512vl_gathersiv4di;
14265 goto gather_gen;
14266 case IX86_BUILTIN_GATHER3DIV2DI:
14267 icode = CODE_FOR_avx512vl_gatherdiv2di;
14268 goto gather_gen;
14269 case IX86_BUILTIN_GATHER3DIV4DI:
14270 icode = CODE_FOR_avx512vl_gatherdiv4di;
14271 goto gather_gen;
14272 case IX86_BUILTIN_GATHER3SIV4SI:
14273 icode = CODE_FOR_avx512vl_gathersiv4si;
14274 goto gather_gen;
14275 case IX86_BUILTIN_GATHER3SIV8SI:
14276 icode = CODE_FOR_avx512vl_gathersiv8si;
14277 goto gather_gen;
14278 case IX86_BUILTIN_GATHER3DIV4SI:
14279 icode = CODE_FOR_avx512vl_gatherdiv4si;
14280 goto gather_gen;
14281 case IX86_BUILTIN_GATHER3DIV8SI:
14282 icode = CODE_FOR_avx512vl_gatherdiv8si;
14283 goto gather_gen;
14284 case IX86_BUILTIN_GATHER3ALTSIV4DF:
14285 icode = CODE_FOR_avx512vl_gathersiv4df;
14286 goto gather_gen;
14287 case IX86_BUILTIN_GATHER3ALTDIV8SF:
14288 icode = CODE_FOR_avx512vl_gatherdiv8sf;
14289 goto gather_gen;
14290 case IX86_BUILTIN_GATHER3ALTSIV4DI:
14291 icode = CODE_FOR_avx512vl_gathersiv4di;
14292 goto gather_gen;
14293 case IX86_BUILTIN_GATHER3ALTDIV8SI:
14294 icode = CODE_FOR_avx512vl_gatherdiv8si;
14295 goto gather_gen;
14296 case IX86_BUILTIN_SCATTERSIV16SF:
14297 icode = CODE_FOR_avx512f_scattersiv16sf;
14298 goto scatter_gen;
14299 case IX86_BUILTIN_SCATTERSIV8DF:
14300 icode = CODE_FOR_avx512f_scattersiv8df;
14301 goto scatter_gen;
14302 case IX86_BUILTIN_SCATTERDIV16SF:
14303 icode = CODE_FOR_avx512f_scatterdiv16sf;
14304 goto scatter_gen;
14305 case IX86_BUILTIN_SCATTERDIV8DF:
14306 icode = CODE_FOR_avx512f_scatterdiv8df;
14307 goto scatter_gen;
14308 case IX86_BUILTIN_SCATTERSIV16SI:
14309 icode = CODE_FOR_avx512f_scattersiv16si;
14310 goto scatter_gen;
14311 case IX86_BUILTIN_SCATTERSIV8DI:
14312 icode = CODE_FOR_avx512f_scattersiv8di;
14313 goto scatter_gen;
14314 case IX86_BUILTIN_SCATTERDIV16SI:
14315 icode = CODE_FOR_avx512f_scatterdiv16si;
14316 goto scatter_gen;
14317 case IX86_BUILTIN_SCATTERDIV8DI:
14318 icode = CODE_FOR_avx512f_scatterdiv8di;
14319 goto scatter_gen;
14320 case IX86_BUILTIN_SCATTERSIV8SF:
14321 icode = CODE_FOR_avx512vl_scattersiv8sf;
14322 goto scatter_gen;
14323 case IX86_BUILTIN_SCATTERSIV4SF:
14324 icode = CODE_FOR_avx512vl_scattersiv4sf;
14325 goto scatter_gen;
14326 case IX86_BUILTIN_SCATTERSIV4DF:
14327 icode = CODE_FOR_avx512vl_scattersiv4df;
14328 goto scatter_gen;
14329 case IX86_BUILTIN_SCATTERSIV2DF:
14330 icode = CODE_FOR_avx512vl_scattersiv2df;
14331 goto scatter_gen;
14332 case IX86_BUILTIN_SCATTERDIV8SF:
14333 icode = CODE_FOR_avx512vl_scatterdiv8sf;
14334 goto scatter_gen;
14335 case IX86_BUILTIN_SCATTERDIV4SF:
14336 icode = CODE_FOR_avx512vl_scatterdiv4sf;
14337 goto scatter_gen;
14338 case IX86_BUILTIN_SCATTERDIV4DF:
14339 icode = CODE_FOR_avx512vl_scatterdiv4df;
14340 goto scatter_gen;
14341 case IX86_BUILTIN_SCATTERDIV2DF:
14342 icode = CODE_FOR_avx512vl_scatterdiv2df;
14343 goto scatter_gen;
14344 case IX86_BUILTIN_SCATTERSIV8SI:
14345 icode = CODE_FOR_avx512vl_scattersiv8si;
14346 goto scatter_gen;
14347 case IX86_BUILTIN_SCATTERSIV4SI:
14348 icode = CODE_FOR_avx512vl_scattersiv4si;
14349 goto scatter_gen;
14350 case IX86_BUILTIN_SCATTERSIV4DI:
14351 icode = CODE_FOR_avx512vl_scattersiv4di;
14352 goto scatter_gen;
14353 case IX86_BUILTIN_SCATTERSIV2DI:
14354 icode = CODE_FOR_avx512vl_scattersiv2di;
14355 goto scatter_gen;
14356 case IX86_BUILTIN_SCATTERDIV8SI:
14357 icode = CODE_FOR_avx512vl_scatterdiv8si;
14358 goto scatter_gen;
14359 case IX86_BUILTIN_SCATTERDIV4SI:
14360 icode = CODE_FOR_avx512vl_scatterdiv4si;
14361 goto scatter_gen;
14362 case IX86_BUILTIN_SCATTERDIV4DI:
14363 icode = CODE_FOR_avx512vl_scatterdiv4di;
14364 goto scatter_gen;
14365 case IX86_BUILTIN_SCATTERDIV2DI:
14366 icode = CODE_FOR_avx512vl_scatterdiv2di;
14367 goto scatter_gen;
14368 case IX86_BUILTIN_GATHERPFDPD:
14369 icode = CODE_FOR_avx512pf_gatherpfv8sidf;
14370 goto vec_prefetch_gen;
14371 case IX86_BUILTIN_SCATTERALTSIV8DF:
14372 icode = CODE_FOR_avx512f_scattersiv8df;
14373 goto scatter_gen;
14374 case IX86_BUILTIN_SCATTERALTDIV16SF:
14375 icode = CODE_FOR_avx512f_scatterdiv16sf;
14376 goto scatter_gen;
14377 case IX86_BUILTIN_SCATTERALTSIV8DI:
14378 icode = CODE_FOR_avx512f_scattersiv8di;
14379 goto scatter_gen;
14380 case IX86_BUILTIN_SCATTERALTDIV16SI:
14381 icode = CODE_FOR_avx512f_scatterdiv16si;
14382 goto scatter_gen;
14383 case IX86_BUILTIN_SCATTERALTSIV4DF:
14384 icode = CODE_FOR_avx512vl_scattersiv4df;
14385 goto scatter_gen;
14386 case IX86_BUILTIN_SCATTERALTDIV8SF:
14387 icode = CODE_FOR_avx512vl_scatterdiv8sf;
14388 goto scatter_gen;
14389 case IX86_BUILTIN_SCATTERALTSIV4DI:
14390 icode = CODE_FOR_avx512vl_scattersiv4di;
14391 goto scatter_gen;
14392 case IX86_BUILTIN_SCATTERALTDIV8SI:
14393 icode = CODE_FOR_avx512vl_scatterdiv8si;
14394 goto scatter_gen;
14395 case IX86_BUILTIN_SCATTERALTSIV2DF:
14396 icode = CODE_FOR_avx512vl_scattersiv2df;
14397 goto scatter_gen;
14398 case IX86_BUILTIN_SCATTERALTDIV4SF:
14399 icode = CODE_FOR_avx512vl_scatterdiv4sf;
14400 goto scatter_gen;
14401 case IX86_BUILTIN_SCATTERALTSIV2DI:
14402 icode = CODE_FOR_avx512vl_scattersiv2di;
14403 goto scatter_gen;
14404 case IX86_BUILTIN_SCATTERALTDIV4SI:
14405 icode = CODE_FOR_avx512vl_scatterdiv4si;
14406 goto scatter_gen;
14407 case IX86_BUILTIN_GATHERPFDPS:
14408 icode = CODE_FOR_avx512pf_gatherpfv16sisf;
14409 goto vec_prefetch_gen;
14410 case IX86_BUILTIN_GATHERPFQPD:
14411 icode = CODE_FOR_avx512pf_gatherpfv8didf;
14412 goto vec_prefetch_gen;
14413 case IX86_BUILTIN_GATHERPFQPS:
14414 icode = CODE_FOR_avx512pf_gatherpfv8disf;
14415 goto vec_prefetch_gen;
14416 case IX86_BUILTIN_SCATTERPFDPD:
14417 icode = CODE_FOR_avx512pf_scatterpfv8sidf;
14418 goto vec_prefetch_gen;
14419 case IX86_BUILTIN_SCATTERPFDPS:
14420 icode = CODE_FOR_avx512pf_scatterpfv16sisf;
14421 goto vec_prefetch_gen;
14422 case IX86_BUILTIN_SCATTERPFQPD:
14423 icode = CODE_FOR_avx512pf_scatterpfv8didf;
14424 goto vec_prefetch_gen;
14425 case IX86_BUILTIN_SCATTERPFQPS:
14426 icode = CODE_FOR_avx512pf_scatterpfv8disf;
14427 goto vec_prefetch_gen;
14428
14429 gather_gen:
14430 rtx half;
14431 rtx (*gen) (rtx, rtx);
14432
14433 arg0 = CALL_EXPR_ARG (exp, 0);
14434 arg1 = CALL_EXPR_ARG (exp, 1);
14435 arg2 = CALL_EXPR_ARG (exp, 2);
14436 arg3 = CALL_EXPR_ARG (exp, 3);
14437 arg4 = CALL_EXPR_ARG (exp, 4);
14438 op0 = expand_normal (arg0);
14439 op1 = expand_normal (arg1);
14440 op2 = expand_normal (arg2);
14441 op3 = expand_normal (arg3);
14442 op4 = expand_normal (arg4);
14443 /* Note the arg order is different from the operand order. */
14444 mode0 = insn_data[icode].operand[1].mode;
14445 mode2 = insn_data[icode].operand[3].mode;
14446 mode3 = insn_data[icode].operand[4].mode;
14447 mode4 = insn_data[icode].operand[5].mode;
14448
14449 if (target == NULL_RTX
14450 || GET_MODE (target) != insn_data[icode].operand[0].mode
14451 || !insn_data[icode].operand[0].predicate (target,
14452 GET_MODE (target)))
14453 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
14454 else
14455 subtarget = target;
14456
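/* For the *ALT* gather variants the index vector and the data vector
   have different element counts; extract the low half of the wider
   operand so the two agree before emitting the gather.  */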
14457 switch (fcode)
14458 {
14459 case IX86_BUILTIN_GATHER3ALTSIV8DF:
14460 case IX86_BUILTIN_GATHER3ALTSIV8DI:
14461 half = gen_reg_rtx (V8SImode);
14462 if (!nonimmediate_operand (op2, V16SImode))
14463 op2 = copy_to_mode_reg (V16SImode, op2);
14464 emit_insn (gen_vec_extract_lo_v16si (half, op2));
14465 op2 = half;
14466 break;
14467 case IX86_BUILTIN_GATHER3ALTSIV4DF:
14468 case IX86_BUILTIN_GATHER3ALTSIV4DI:
14469 case IX86_BUILTIN_GATHERALTSIV4DF:
14470 case IX86_BUILTIN_GATHERALTSIV4DI:
14471 half = gen_reg_rtx (V4SImode);
14472 if (!nonimmediate_operand (op2, V8SImode))
14473 op2 = copy_to_mode_reg (V8SImode, op2);
14474 emit_insn (gen_vec_extract_lo_v8si (half, op2));
14475 op2 = half;
14476 break;
14477 case IX86_BUILTIN_GATHER3ALTDIV16SF:
14478 case IX86_BUILTIN_GATHER3ALTDIV16SI:
14479 half = gen_reg_rtx (mode0);
14480 if (mode0 == V8SFmode)
14481 gen = gen_vec_extract_lo_v16sf;
14482 else
14483 gen = gen_vec_extract_lo_v16si;
14484 if (!nonimmediate_operand (op0, GET_MODE (op0)))
14485 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
14486 emit_insn (gen (half, op0));
14487 op0 = half;
14488 op3 = lowpart_subreg (QImode, op3, HImode);
14489 break;
14490 case IX86_BUILTIN_GATHER3ALTDIV8SF:
14491 case IX86_BUILTIN_GATHER3ALTDIV8SI:
14492 case IX86_BUILTIN_GATHERALTDIV8SF:
14493 case IX86_BUILTIN_GATHERALTDIV8SI:
14494 half = gen_reg_rtx (mode0);
14495 if (mode0 == V4SFmode)
14496 gen = gen_vec_extract_lo_v8sf;
14497 else
14498 gen = gen_vec_extract_lo_v8si;
14499 if (!nonimmediate_operand (op0, GET_MODE (op0)))
14500 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
14501 emit_insn (gen (half, op0));
14502 op0 = half;
14503 if (VECTOR_MODE_P (GET_MODE (op3)))
14504 {
14505 half = gen_reg_rtx (mode0);
14506 if (!nonimmediate_operand (op3, GET_MODE (op3)))
14507 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
14508 emit_insn (gen (half, op3));
14509 op3 = half;
14510 }
14511 break;
14512 default:
14513 break;
14514 }
14515
14516 /* Force the memory operand to use only a base register here; we
14517 don't want to do this to the memory operands of other builtin
14518 functions. */
14519 op1 = ix86_zero_extend_to_Pmode (op1);
14520
14521 if (!insn_data[icode].operand[1].predicate (op0, mode0))
14522 op0 = copy_to_mode_reg (mode0, op0);
14523 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
14524 op1 = copy_to_mode_reg (Pmode, op1);
14525 if (!insn_data[icode].operand[3].predicate (op2, mode2))
14526 op2 = copy_to_mode_reg (mode2, op2);
14527
14528 op3 = fixup_modeless_constant (op3, mode3);
14529
14530 if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
14531 {
14532 if (!insn_data[icode].operand[4].predicate (op3, mode3))
14533 op3 = copy_to_mode_reg (mode3, op3);
14534 }
14535 else
14536 {
14537 op3 = copy_to_reg (op3);
14538 op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
14539 }
14540 if (!insn_data[icode].operand[5].predicate (op4, mode4))
14541 {
14542 error ("the last argument must be scale 1, 2, 4, 8");
14543 return const0_rtx;
14544 }
14545
14546 /* Optimize: if the mask is known to select every element (an all-ones
14547 integer mask, or every vector element with its sign bit set),
14548 replace op0 with pc_rtx to signal that the instruction overwrites
14549 the whole destination and doesn't use its previous contents. */
14550 if (optimize)
14551 {
14552 if (TREE_CODE (arg3) == INTEGER_CST)
14553 {
14554 if (integer_all_onesp (arg3))
14555 op0 = pc_rtx;
14556 }
14557 else if (TREE_CODE (arg3) == VECTOR_CST)
14558 {
14559 unsigned int negative = 0;
14560 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
14561 {
14562 tree cst = VECTOR_CST_ELT (arg3, i);
14563 if (TREE_CODE (cst) == INTEGER_CST
14564 && tree_int_cst_sign_bit (cst))
14565 negative++;
14566 else if (TREE_CODE (cst) == REAL_CST
14567 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
14568 negative++;
14569 }
14570 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
14571 op0 = pc_rtx;
14572 }
14573 else if (TREE_CODE (arg3) == SSA_NAME
14574 && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
14575 {
14576 /* Recognize also when mask is like:
14577 __v2df src = _mm_setzero_pd ();
14578 __v2df mask = _mm_cmpeq_pd (src, src);
14579 or
14580 __v8sf src = _mm256_setzero_ps ();
14581 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
14582 as that is a cheaper way to load all ones into
14583 a register than having to load a constant from
14584 memory. */
14585 gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
14586 if (is_gimple_call (def_stmt))
14587 {
14588 tree fndecl = gimple_call_fndecl (def_stmt);
14589 if (fndecl
14590 && fndecl_built_in_p (fndecl, BUILT_IN_MD))
14591 switch (DECL_MD_FUNCTION_CODE (fndecl))
14592 {
14593 case IX86_BUILTIN_CMPPD:
14594 case IX86_BUILTIN_CMPPS:
14595 case IX86_BUILTIN_CMPPD256:
14596 case IX86_BUILTIN_CMPPS256:
14597 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
14598 break;
14599 /* FALLTHRU */
14600 case IX86_BUILTIN_CMPEQPD:
14601 case IX86_BUILTIN_CMPEQPS:
14602 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
14603 && initializer_zerop (gimple_call_arg (def_stmt,
14604 1)))
14605 op0 = pc_rtx;
14606 break;
14607 default:
14608 break;
14609 }
14610 }
14611 }
14612 }
14613
14614 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
14615 if (! pat)
14616 return const0_rtx;
14617 emit_insn (pat);
14618
14619 switch (fcode)
14620 {
14621 case IX86_BUILTIN_GATHER3DIV16SF:
14622 if (target == NULL_RTX)
14623 target = gen_reg_rtx (V8SFmode);
14624 emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
14625 break;
14626 case IX86_BUILTIN_GATHER3DIV16SI:
14627 if (target == NULL_RTX)
14628 target = gen_reg_rtx (V8SImode);
14629 emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
14630 break;
14631 case IX86_BUILTIN_GATHER3DIV8SF:
14632 case IX86_BUILTIN_GATHERDIV8SF:
14633 if (target == NULL_RTX)
14634 target = gen_reg_rtx (V4SFmode);
14635 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
14636 break;
14637 case IX86_BUILTIN_GATHER3DIV8SI:
14638 case IX86_BUILTIN_GATHERDIV8SI:
14639 if (target == NULL_RTX)
14640 target = gen_reg_rtx (V4SImode);
14641 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
14642 break;
14643 default:
14644 target = subtarget;
14645 break;
14646 }
14647 return target;
14648
14649 scatter_gen:
14650 arg0 = CALL_EXPR_ARG (exp, 0);
14651 arg1 = CALL_EXPR_ARG (exp, 1);
14652 arg2 = CALL_EXPR_ARG (exp, 2);
14653 arg3 = CALL_EXPR_ARG (exp, 3);
14654 arg4 = CALL_EXPR_ARG (exp, 4);
14655 op0 = expand_normal (arg0);
14656 op1 = expand_normal (arg1);
14657 op2 = expand_normal (arg2);
14658 op3 = expand_normal (arg3);
14659 op4 = expand_normal (arg4);
14660 mode1 = insn_data[icode].operand[1].mode;
14661 mode2 = insn_data[icode].operand[2].mode;
14662 mode3 = insn_data[icode].operand[3].mode;
14663 mode4 = insn_data[icode].operand[4].mode;
14664
14665 /* The scatter instruction stores operand op3 to memory, using
14666 indices from op2 and the scale from op4, under writemask op1.
14667 If the index operand op2 has more elements than the source
14668 operand op3, only its low half is used, and vice versa. */
14669 switch (fcode)
14670 {
14671 case IX86_BUILTIN_SCATTERALTSIV8DF:
14672 case IX86_BUILTIN_SCATTERALTSIV8DI:
14673 half = gen_reg_rtx (V8SImode);
14674 if (!nonimmediate_operand (op2, V16SImode))
14675 op2 = copy_to_mode_reg (V16SImode, op2);
14676 emit_insn (gen_vec_extract_lo_v16si (half, op2));
14677 op2 = half;
14678 break;
14679 case IX86_BUILTIN_SCATTERALTDIV16SF:
14680 case IX86_BUILTIN_SCATTERALTDIV16SI:
14681 half = gen_reg_rtx (mode3);
14682 if (mode3 == V8SFmode)
14683 gen = gen_vec_extract_lo_v16sf;
14684 else
14685 gen = gen_vec_extract_lo_v16si;
14686 if (!nonimmediate_operand (op3, GET_MODE (op3)))
14687 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
14688 emit_insn (gen (half, op3));
14689 op3 = half;
14690 break;
14691 case IX86_BUILTIN_SCATTERALTSIV4DF:
14692 case IX86_BUILTIN_SCATTERALTSIV4DI:
14693 half = gen_reg_rtx (V4SImode);
14694 if (!nonimmediate_operand (op2, V8SImode))
14695 op2 = copy_to_mode_reg (V8SImode, op2);
14696 emit_insn (gen_vec_extract_lo_v8si (half, op2));
14697 op2 = half;
14698 break;
14699 case IX86_BUILTIN_SCATTERALTDIV8SF:
14700 case IX86_BUILTIN_SCATTERALTDIV8SI:
14701 half = gen_reg_rtx (mode3);
14702 if (mode3 == V4SFmode)
14703 gen = gen_vec_extract_lo_v8sf;
14704 else
14705 gen = gen_vec_extract_lo_v8si;
14706 if (!nonimmediate_operand (op3, GET_MODE (op3)))
14707 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
14708 emit_insn (gen (half, op3));
14709 op3 = half;
14710 break;
14711 case IX86_BUILTIN_SCATTERALTSIV2DF:
14712 case IX86_BUILTIN_SCATTERALTSIV2DI:
14713 if (!nonimmediate_operand (op2, V4SImode))
14714 op2 = copy_to_mode_reg (V4SImode, op2);
14715 break;
14716 case IX86_BUILTIN_SCATTERALTDIV4SF:
14717 case IX86_BUILTIN_SCATTERALTDIV4SI:
14718 if (!nonimmediate_operand (op3, GET_MODE (op3)))
14719 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
14720 break;
14721 default:
14722 break;
14723 }
14724
14725 /* Force the memory operand to use only a base register here; we
14726 don't want to do this to the memory operands of other builtin
14727 functions. */
14728 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
14729
14730 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
14731 op0 = copy_to_mode_reg (Pmode, op0);
14732
14733 op1 = fixup_modeless_constant (op1, mode1);
14734
14735 if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
14736 {
14737 if (!insn_data[icode].operand[1].predicate (op1, mode1))
14738 op1 = copy_to_mode_reg (mode1, op1);
14739 }
14740 else
14741 {
14742 op1 = copy_to_reg (op1);
14743 op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
14744 }
14745
14746 if (!insn_data[icode].operand[2].predicate (op2, mode2))
14747 op2 = copy_to_mode_reg (mode2, op2);
14748
14749 if (!insn_data[icode].operand[3].predicate (op3, mode3))
14750 op3 = copy_to_mode_reg (mode3, op3);
14751
14752 if (!insn_data[icode].operand[4].predicate (op4, mode4))
14753 {
14754 error ("the last argument must be scale 1, 2, 4, 8");
14755 return const0_rtx;
14756 }
14757
14758 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
14759 if (! pat)
14760 return const0_rtx;
14761
14762 emit_insn (pat);
14763 return 0;
14764
14765 vec_prefetch_gen:
14766 arg0 = CALL_EXPR_ARG (exp, 0);
14767 arg1 = CALL_EXPR_ARG (exp, 1);
14768 arg2 = CALL_EXPR_ARG (exp, 2);
14769 arg3 = CALL_EXPR_ARG (exp, 3);
14770 arg4 = CALL_EXPR_ARG (exp, 4);
14771 op0 = expand_normal (arg0);
14772 op1 = expand_normal (arg1);
14773 op2 = expand_normal (arg2);
14774 op3 = expand_normal (arg3);
14775 op4 = expand_normal (arg4);
14776 mode0 = insn_data[icode].operand[0].mode;
14777 mode1 = insn_data[icode].operand[1].mode;
14778 mode3 = insn_data[icode].operand[3].mode;
14779 mode4 = insn_data[icode].operand[4].mode;
14780
14781 op0 = fixup_modeless_constant (op0, mode0);
14782
14783 if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
14784 {
14785 if (!insn_data[icode].operand[0].predicate (op0, mode0))
14786 op0 = copy_to_mode_reg (mode0, op0);
14787 }
14788 else
14789 {
14790 op0 = copy_to_reg (op0);
14791 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
14792 }
14793
14794 if (!insn_data[icode].operand[1].predicate (op1, mode1))
14795 op1 = copy_to_mode_reg (mode1, op1);
14796
14797 /* Force the memory operand to use only a base register here; we
14798 don't want to do this to the memory operands of other builtin
14799 functions. */
14800 op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
14801
14802 if (!insn_data[icode].operand[2].predicate (op2, Pmode))
14803 op2 = copy_to_mode_reg (Pmode, op2);
14804
14805 if (!insn_data[icode].operand[3].predicate (op3, mode3))
14806 {
14807 error ("the forth argument must be scale 1, 2, 4, 8");
14808 return const0_rtx;
14809 }
14810
14811 if (!insn_data[icode].operand[4].predicate (op4, mode4))
14812 {
14813 error ("incorrect hint operand");
14814 return const0_rtx;
14815 }
14816
14817 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
14818 if (! pat)
14819 return const0_rtx;
14820
14821 emit_insn (pat);
14822
14823 return 0;
14824
14825 case IX86_BUILTIN_XABORT:
14826 icode = CODE_FOR_xabort;
14827 arg0 = CALL_EXPR_ARG (exp, 0);
14828 op0 = expand_normal (arg0);
14829 mode0 = insn_data[icode].operand[0].mode;
14830 if (!insn_data[icode].operand[0].predicate (op0, mode0))
14831 {
14832 error ("the argument to %<xabort%> intrinsic must "
14833 "be an 8-bit immediate");
14834 return const0_rtx;
14835 }
14836 emit_insn (gen_xabort (op0));
14837 return 0;
14838
14839 case IX86_BUILTIN_RDSSPD:
14840 case IX86_BUILTIN_RDSSPQ:
14841 mode = (fcode == IX86_BUILTIN_RDSSPD ? SImode : DImode);
14842
14843 if (target == 0
14844 || !register_operand (target, mode))
14845 target = gen_reg_rtx (mode);
14846
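/* rdssp leaves its operand unchanged (it behaves as a NOP) when shadow
   stacks are not enabled, so the input is pre-zeroed here to make the
   builtin return 0 in that case.  */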
14847 op0 = force_reg (mode, const0_rtx);
14848
14849 emit_insn (gen_rdssp (mode, target, op0));
14850 return target;
14851
14852 case IX86_BUILTIN_INCSSPD:
14853 case IX86_BUILTIN_INCSSPQ:
14854 mode = (fcode == IX86_BUILTIN_INCSSPD ? SImode : DImode);
14855
14856 arg0 = CALL_EXPR_ARG (exp, 0);
14857 op0 = expand_normal (arg0);
14858
14859 op0 = force_reg (mode, op0);
14860
14861 emit_insn (gen_incssp (mode, op0));
14862 return 0;
14863
14864 case IX86_BUILTIN_HRESET:
14865 icode = CODE_FOR_hreset;
14866 arg0 = CALL_EXPR_ARG (exp, 0);
14867 op0 = expand_normal (arg0);
14868 op0 = force_reg (SImode, op0);
14869 emit_insn (gen_hreset (op0));
14870 return 0;
14871
14872 case IX86_BUILTIN_RSTORSSP:
14873 case IX86_BUILTIN_CLRSSBSY:
14874 arg0 = CALL_EXPR_ARG (exp, 0);
14875 op0 = expand_normal (arg0);
14876 icode = (fcode == IX86_BUILTIN_RSTORSSP
14877 ? CODE_FOR_rstorssp
14878 : CODE_FOR_clrssbsy);
14879
14880 if (!address_operand (op0, VOIDmode))
14881 {
14882 op0 = convert_memory_address (Pmode, op0);
14883 op0 = copy_addr_to_reg (op0);
14884 }
14885 emit_insn (GEN_FCN (icode) (gen_rtx_MEM (DImode, op0)));
14886 return 0;
14887
14888 case IX86_BUILTIN_WRSSD:
14889 case IX86_BUILTIN_WRSSQ:
14890 case IX86_BUILTIN_WRUSSD:
14891 case IX86_BUILTIN_WRUSSQ:
14892 mode = ((fcode == IX86_BUILTIN_WRSSD
14893 || fcode == IX86_BUILTIN_WRUSSD)
14894 ? SImode : DImode);
14895
14896 arg0 = CALL_EXPR_ARG (exp, 0);
14897 op0 = expand_normal (arg0);
14898 arg1 = CALL_EXPR_ARG (exp, 1);
14899 op1 = expand_normal (arg1);
14900
14901 op0 = force_reg (mode, op0);
14902
14903 if (!address_operand (op1, VOIDmode))
14904 {
14905 op1 = convert_memory_address (Pmode, op1);
14906 op1 = copy_addr_to_reg (op1);
14907 }
14908 op1 = gen_rtx_MEM (mode, op1);
14909
14910 icode = ((fcode == IX86_BUILTIN_WRSSD
14911 || fcode == IX86_BUILTIN_WRSSQ)
14912 ? code_for_wrss (mode)
14913 : code_for_wruss (mode));
14914 emit_insn (GEN_FCN (icode) (op0, op1));
14915
14916 return 0;
14917
14918 default:
14919 break;
14920 }
14921
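/* Anything not handled explicitly above is expanded through the
   table-driven helpers below; the fcode range selects the descriptor
   table (bdesc_special_args, bdesc_args, bdesc_comi, ...) and the
   offset within it.  */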
14922 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
14923 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
14924 {
14925 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
14926 return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
14927 target);
14928 }
14929
14930 if (fcode >= IX86_BUILTIN__BDESC_PURE_ARGS_FIRST
14931 && fcode <= IX86_BUILTIN__BDESC_PURE_ARGS_LAST)
14932 {
14933 i = fcode - IX86_BUILTIN__BDESC_PURE_ARGS_FIRST;
14934 return ix86_expand_special_args_builtin (bdesc_pure_args + i, exp,
14935 target);
14936 }
14937
14938 if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
14939 && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
14940 {
14941 i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
14942 rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
14943 rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
14944 rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
14945 int masked = 1;
14946 machine_mode mode, wide_mode, nar_mode;
14947
14948 nar_mode = V4SFmode;
14949 mode = V16SFmode;
14950 wide_mode = V64SFmode;
14951 fcn_mask = gen_avx5124fmaddps_4fmaddps_mask;
14952 fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
14953
14954 switch (fcode)
14955 {
14956 case IX86_BUILTIN_4FMAPS:
14957 fcn = gen_avx5124fmaddps_4fmaddps;
14958 masked = 0;
14959 goto v4fma_expand;
14960
14961 case IX86_BUILTIN_4DPWSSD:
14962 nar_mode = V4SImode;
14963 mode = V16SImode;
14964 wide_mode = V64SImode;
14965 fcn = gen_avx5124vnniw_vp4dpwssd;
14966 masked = 0;
14967 goto v4fma_expand;
14968
14969 case IX86_BUILTIN_4DPWSSDS:
14970 nar_mode = V4SImode;
14971 mode = V16SImode;
14972 wide_mode = V64SImode;
14973 fcn = gen_avx5124vnniw_vp4dpwssds;
14974 masked = 0;
14975 goto v4fma_expand;
14976
14977 case IX86_BUILTIN_4FNMAPS:
14978 fcn = gen_avx5124fmaddps_4fnmaddps;
14979 masked = 0;
14980 goto v4fma_expand;
14981
14982 case IX86_BUILTIN_4FNMAPS_MASK:
14983 fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask;
14984 fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
14985 goto v4fma_expand;
14986
14987 case IX86_BUILTIN_4DPWSSD_MASK:
14988 nar_mode = V4SImode;
14989 mode = V16SImode;
14990 wide_mode = V64SImode;
14991 fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask;
14992 fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
14993 goto v4fma_expand;
14994
14995 case IX86_BUILTIN_4DPWSSDS_MASK:
14996 nar_mode = V4SImode;
14997 mode = V16SImode;
14998 wide_mode = V64SImode;
14999 fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask;
15000 fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
15001 goto v4fma_expand;
15002
15003 case IX86_BUILTIN_4FMAPS_MASK:
15004 {
15005 tree args[4];
15006 rtx ops[4];
15007 rtx wide_reg;
15008 rtx accum;
15009 rtx addr;
15010 rtx mem;
15011
15012 v4fma_expand:
15013 wide_reg = gen_reg_rtx (wide_mode);
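/* Pack the four narrow source vectors back to back into one 4x-wide
   register; the 4FMAPS/4VNNIW insn patterns model the group of four
   consecutive registers these instructions read as a single wide-mode
   operand.  */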
15014 for (i = 0; i < 4; i++)
15015 {
15016 args[i] = CALL_EXPR_ARG (exp, i);
15017 ops[i] = expand_normal (args[i]);
15018
15019 emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
15020 ops[i]);
15021 }
15022
15023 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
15024 accum = force_reg (mode, accum);
15025
15026 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
15027 addr = force_reg (Pmode, addr);
15028
15029 mem = gen_rtx_MEM (nar_mode, addr);
15030
15031 target = gen_reg_rtx (mode);
15032
15033 emit_move_insn (target, accum);
15034
15035 if (! masked)
15036 emit_insn (fcn (target, accum, wide_reg, mem));
15037 else
15038 {
15039 rtx merge, mask;
15040 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
15041
15042 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
15043
15044 if (CONST_INT_P (mask))
15045 mask = fixup_modeless_constant (mask, HImode);
15046
15047 mask = force_reg (HImode, mask);
15048
15049 if (GET_MODE (mask) != HImode)
15050 mask = gen_rtx_SUBREG (HImode, mask, 0);
15051
15052 /* If merge is 0 then we're about to emit the z-masked variant. */
15053 if (const0_operand (merge, mode))
15054 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
15055 /* If merge is the same as accum then emit the merge-masked variant. */
15056 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
15057 {
15058 merge = force_reg (mode, merge);
15059 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
15060 }
15061 /* A merge with something unknown might happen if we z-mask with -O0. */
15062 else
15063 {
15064 target = gen_reg_rtx (mode);
15065 emit_move_insn (target, merge);
15066 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
15067 }
15068 }
15069 return target;
15070 }
15071
15072 case IX86_BUILTIN_4FNMASS:
15073 fcn = gen_avx5124fmaddps_4fnmaddss;
15074 masked = 0;
15075 goto s4fma_expand;
15076
15077 case IX86_BUILTIN_4FMASS:
15078 fcn = gen_avx5124fmaddps_4fmaddss;
15079 masked = 0;
15080 goto s4fma_expand;
15081
15082 case IX86_BUILTIN_4FNMASS_MASK:
15083 fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
15084 fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
15085 goto s4fma_expand;
15086
15087 case IX86_BUILTIN_4FMASS_MASK:
15088 {
15089 tree args[4];
15090 rtx ops[4];
15091 rtx wide_reg;
15092 rtx accum;
15093 rtx addr;
15094 rtx mem;
15095
15096 fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
15097 fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
15098
15099 s4fma_expand:
15100 mode = V4SFmode;
15101 wide_reg = gen_reg_rtx (V64SFmode);
15102 for (i = 0; i < 4; i++)
15103 {
15104 rtx tmp;
15105 args[i] = CALL_EXPR_ARG (exp, i);
15106 ops[i] = expand_normal (args[i]);
15107
15108 tmp = gen_reg_rtx (SFmode);
15109 emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
15110
15111 emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
15112 gen_rtx_SUBREG (V16SFmode, tmp, 0));
15113 }
15114
15115 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
15116 accum = force_reg (V4SFmode, accum);
15117
15118 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
15119 addr = force_reg (Pmode, addr);
15120
15121 mem = gen_rtx_MEM (V4SFmode, addr);
15122
15123 target = gen_reg_rtx (V4SFmode);
15124
15125 emit_move_insn (target, accum);
15126
15127 if (! masked)
15128 emit_insn (fcn (target, accum, wide_reg, mem));
15129 else
15130 {
15131 rtx merge, mask;
15132 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
15133
15134 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
15135
15136 if (CONST_INT_P (mask))
15137 mask = fixup_modeless_constant (mask, QImode);
15138
15139 mask = force_reg (QImode, mask);
15140
15141 if (GET_MODE (mask) != QImode)
15142 mask = gen_rtx_SUBREG (QImode, mask, 0);
15143
15144 /* If merge is 0 then we're about to emit the z-masked variant. */
15145 if (const0_operand (merge, mode))
15146 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
15147 /* If merge is the same as accum then emit the merge-masked
15148 variant. */
15149 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
15150 {
15151 merge = force_reg (mode, merge);
15152 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
15153 }
15154 /* A merge with something unknown might happen if we z-mask
15155 with -O0. */
15156 else
15157 {
15158 target = gen_reg_rtx (mode);
15159 emit_move_insn (target, merge);
15160 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
15161 }
15162 }
15163 return target;
15164 }
15165 case IX86_BUILTIN_RDPID:
15166 return ix86_expand_special_args_builtin (bdesc_args + i, exp,
15167 target);
15168 case IX86_BUILTIN_FABSQ:
15169 case IX86_BUILTIN_COPYSIGNQ:
15170 if (!TARGET_SSE)
15171 /* Emit a normal call if SSE isn't available. */
15172 return expand_call (exp, target, ignore);
15173 /* FALLTHRU */
15174 default:
15175 return ix86_expand_args_builtin (bdesc_args + i, exp, target);
15176 }
15177 }
15178
15179 if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
15180 && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
15181 {
15182 i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
15183 return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
15184 }
15185
15186 if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
15187 && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
15188 {
15189 i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
15190 return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
15191 }
15192
15193 if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
15194 && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
15195 {
15196 i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
15197 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
15198 }
15199
15200 if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
15201 && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
15202 {
15203 i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
15204 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
15205 }
15206
15207 if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
15208 && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
15209 {
15210 i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
15211 const struct builtin_description *d = bdesc_multi_arg + i;
15212 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
15213 (enum ix86_builtin_func_type)
15214 d->flag, d->comparison);
15215 }
15216
15217 if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
15218 && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
15219 {
15220 i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
15221 return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
15222 target);
15223 }
15224
15225 gcc_unreachable ();
15226 }
15227
15228 /* A subroutine of ix86_expand_vector_init_duplicate. Tries to
15229 fill target with val via vec_duplicate. */
15230
15231 static bool
15232 ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
15233 {
15234 bool ok;
15235 rtx_insn *insn;
15236 rtx dup;
15237 /* Save/restore recog_data in case this is called from splitters
15238 or other routines where recog_data needs to stay valid across
15239 force_reg. See PR106577. */
15240 recog_data_d recog_data_save = recog_data;
15241
15242 /* First attempt to recognize VAL as-is. */
15243 dup = gen_vec_duplicate (mode, val);
15244 insn = emit_insn (gen_rtx_SET (target, dup));
15245 if (recog_memoized (insn) < 0)
15246 {
15247 rtx_insn *seq;
15248 machine_mode innermode = GET_MODE_INNER (mode);
15249 rtx reg;
15250
15251 /* If that fails, force VAL into a register. */
15252
15253 start_sequence ();
15254 reg = force_reg (innermode, val);
15255 if (GET_MODE (reg) != innermode)
15256 reg = gen_lowpart (innermode, reg);
15257 SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
15258 seq = get_insns ();
15259 end_sequence ();
15260 if (seq)
15261 emit_insn_before (seq, insn);
15262
15263 ok = recog_memoized (insn) >= 0;
15264 gcc_assert (ok);
15265 }
15266 recog_data = recog_data_save;
15267 return true;
15268 }
15269
15270 /* Get a vector mode of the same size as the original but with elements
15271 twice as wide. This is only guaranteed to apply to integral vectors. */
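/* E.g. V16QImode is expected to map to V8HImode: the same vector size
   with half as many elements, each twice as wide.  */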
15272
15273 static machine_mode
15274 get_mode_wider_vector (machine_mode o)
15275 {
15276 /* ??? Rely on the ordering that genmodes.cc gives to vectors. */
15277 machine_mode n = GET_MODE_NEXT_MODE (o).require ();
15278 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
15279 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
15280 return n;
15281 }
15282
15283 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
15284 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
15285
15286 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
15287 with all elements equal to VAL. Return true if successful. */
15288
15289 bool
15290 ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
15291 rtx target, rtx val)
15292 {
15293 bool ok;
15294
15295 switch (mode)
15296 {
15297 case E_V2SImode:
15298 case E_V2SFmode:
15299 if (!mmx_ok)
15300 return false;
15301 /* FALLTHRU */
15302
15303 case E_V4DFmode:
15304 case E_V4DImode:
15305 case E_V8SFmode:
15306 case E_V8SImode:
15307 case E_V2DFmode:
15308 case E_V2DImode:
15309 case E_V4SFmode:
15310 case E_V4SImode:
15311 case E_V16SImode:
15312 case E_V8DImode:
15313 case E_V16SFmode:
15314 case E_V8DFmode:
15315 return ix86_vector_duplicate_value (mode, target, val);
15316
15317 case E_V4HImode:
15318 if (!mmx_ok)
15319 return false;
15320 if (TARGET_SSE || TARGET_3DNOW_A)
15321 {
15322 rtx x;
15323
15324 val = gen_lowpart (SImode, val);
15325 x = gen_rtx_TRUNCATE (HImode, val);
15326 x = gen_rtx_VEC_DUPLICATE (mode, x);
15327 emit_insn (gen_rtx_SET (target, x));
15328 return true;
15329 }
15330 goto widen;
15331
15332 case E_V2HImode:
15333 if (TARGET_SSE2)
15334 {
15335 rtx x;
15336
15337 val = gen_lowpart (SImode, val);
15338 x = gen_rtx_TRUNCATE (HImode, val);
15339 x = gen_rtx_VEC_DUPLICATE (mode, x);
15340 emit_insn (gen_rtx_SET (target, x));
15341 return true;
15342 }
15343 return false;
15344
15345 case E_V8QImode:
15346 case E_V4QImode:
15347 if (!mmx_ok)
15348 return false;
15349 goto widen;
15350
15351 case E_V8HImode:
15352 case E_V8HFmode:
15353 case E_V8BFmode:
15354 if (TARGET_AVX2)
15355 return ix86_vector_duplicate_value (mode, target, val);
15356
15357 if (TARGET_SSE2)
15358 {
15359 struct expand_vec_perm_d dperm;
15360 rtx tmp1, tmp2;
15361
15362 permute:
15363 memset (&dperm, 0, sizeof (dperm));
15364 dperm.target = target;
15365 dperm.vmode = mode;
15366 dperm.nelt = GET_MODE_NUNITS (mode);
15367 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
15368 dperm.one_operand_p = true;
15369
15370 if (mode == V8HFmode || mode == V8BFmode)
15371 {
15372 tmp1 = force_reg (GET_MODE_INNER (mode), val);
15373 tmp2 = gen_reg_rtx (mode);
15374 emit_insn (maybe_gen_vec_set_0 (mode, tmp2,
15375 CONST0_RTX (mode), tmp1));
15376 tmp1 = gen_lowpart (mode, tmp2);
15377 }
15378 else
15379 {
15380 /* Extend to SImode using a paradoxical SUBREG. */
15381 tmp1 = gen_reg_rtx (SImode);
15382 emit_move_insn (tmp1, gen_lowpart (SImode, val));
15383
15384 /* Insert the SImode value as
15385 low element of a V4SImode vector. */
15386 tmp2 = gen_reg_rtx (V4SImode);
15387 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
15388 tmp1 = gen_lowpart (mode, tmp2);
15389 }
15390
15391 emit_move_insn (dperm.op0, tmp1);
15392 ok = (expand_vec_perm_1 (&dperm)
15393 || expand_vec_perm_broadcast_1 (&dperm));
15394 gcc_assert (ok);
15395 return ok;
15396 }
15397 goto widen;
15398
15399 case E_V16QImode:
15400 if (TARGET_AVX2)
15401 return ix86_vector_duplicate_value (mode, target, val);
15402
15403 if (TARGET_SSE2)
15404 goto permute;
15405 goto widen;
15406
15407 widen:
15408 /* Replicate the value once into the next wider mode and recurse. */
15409 {
15410 machine_mode smode, wsmode, wvmode;
15411 rtx x;
15412
15413 smode = GET_MODE_INNER (mode);
15414 wvmode = get_mode_wider_vector (mode);
15415 wsmode = GET_MODE_INNER (wvmode);
15416
15417 val = convert_modes (wsmode, smode, val, true);
15418
15419 if (smode == QImode && !TARGET_PARTIAL_REG_STALL)
15420 emit_insn (gen_insv_1 (wsmode, val, val));
15421 else
15422 {
15423 x = expand_simple_binop (wsmode, ASHIFT, val,
15424 GEN_INT (GET_MODE_BITSIZE (smode)),
15425 NULL_RTX, 1, OPTAB_LIB_WIDEN);
15426 val = expand_simple_binop (wsmode, IOR, val, x, x, 1,
15427 OPTAB_LIB_WIDEN);
15428 }
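/* VAL now holds the original scalar replicated once within the wider
   scalar mode (e.g. a QImode byte B becomes the HImode value
   (B << 8) | B); the recursive call below broadcasts that wider
   scalar.  */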
15429
15430 x = gen_reg_rtx (wvmode);
15431 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
15432 gcc_assert (ok);
15433 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
15434 return ok;
15435 }
15436
15437 case E_V16HImode:
15438 case E_V16HFmode:
15439 case E_V16BFmode:
15440 case E_V32QImode:
15441 if (TARGET_AVX2)
15442 return ix86_vector_duplicate_value (mode, target, val);
15443 else
15444 {
15445 machine_mode hvmode;
15446 switch (mode)
15447 {
15448 case V16HImode:
15449 hvmode = V8HImode;
15450 break;
15451 case V16HFmode:
15452 hvmode = V8HFmode;
15453 break;
15454 case V16BFmode:
15455 hvmode = V8BFmode;
15456 break;
15457 case V32QImode:
15458 hvmode = V16QImode;
15459 break;
15460 default:
15461 gcc_unreachable ();
15462 }
15463 rtx x = gen_reg_rtx (hvmode);
15464
15465 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
15466 gcc_assert (ok);
15467
15468 x = gen_rtx_VEC_CONCAT (mode, x, x);
15469 emit_insn (gen_rtx_SET (target, x));
15470 }
15471 return true;
15472
15473 case E_V32HImode:
15474 case E_V32HFmode:
15475 case E_V32BFmode:
15476 case E_V64QImode:
15477 if (TARGET_AVX512BW)
15478 return ix86_vector_duplicate_value (mode, target, val);
15479 else
15480 {
15481 machine_mode hvmode;
15482 switch (mode)
15483 {
15484 case V32HImode:
15485 hvmode = V16HImode;
15486 break;
15487 case V32HFmode:
15488 hvmode = V16HFmode;
15489 break;
15490 case V32BFmode:
15491 hvmode = V16BFmode;
15492 break;
15493 case V64QImode:
15494 hvmode = V32QImode;
15495 break;
15496 default:
15497 gcc_unreachable ();
15498 }
15499 rtx x = gen_reg_rtx (hvmode);
15500
15501 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
15502 gcc_assert (ok);
15503
15504 x = gen_rtx_VEC_CONCAT (mode, x, x);
15505 emit_insn (gen_rtx_SET (target, x));
15506 }
15507 return true;
15508
15509 default:
15510 return false;
15511 }
15512 }
15513
15514 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
15515 whose ONE_VAR element is VAR, and other elements are zero. Return true
15516 if successful. */
15517
15518 static bool
15519 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
15520 rtx target, rtx var, int one_var)
15521 {
15522 machine_mode vsimode;
15523 rtx new_target;
15524 rtx x, tmp;
15525 bool use_vector_set = false;
15526 rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;
15527
15528 switch (mode)
15529 {
15530 case E_V2DImode:
15531 /* For SSE4.1, we normally use vector set. But if the second
15532 element is zero and inter-unit moves are OK, we use movq
15533 instead. */
15534 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
15535 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
15536 && one_var == 0));
15537 break;
15538 case E_V16QImode:
15539 case E_V4SImode:
15540 case E_V4SFmode:
15541 use_vector_set = TARGET_SSE4_1;
15542 break;
15543 case E_V8HImode:
15544 use_vector_set = TARGET_SSE2;
15545 gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0
15546 ? gen_vec_setv8hi_0 : NULL;
15547 break;
15548 case E_V8QImode:
15549 use_vector_set = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
15550 break;
15551 case E_V4HImode:
15552 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
15553 break;
15554 case E_V4QImode:
15555 use_vector_set = TARGET_SSE4_1;
15556 break;
15557 case E_V32QImode:
15558 use_vector_set = TARGET_AVX;
15559 break;
15560 case E_V16HImode:
15561 use_vector_set = TARGET_AVX;
15562 gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0
15563 ? gen_vec_setv16hi_0 : NULL;
15564 break;
15565 case E_V8SImode:
15566 use_vector_set = TARGET_AVX;
15567 gen_vec_set_0 = gen_vec_setv8si_0;
15568 break;
15569 case E_V8SFmode:
15570 use_vector_set = TARGET_AVX;
15571 gen_vec_set_0 = gen_vec_setv8sf_0;
15572 break;
15573 case E_V4DFmode:
15574 use_vector_set = TARGET_AVX;
15575 gen_vec_set_0 = gen_vec_setv4df_0;
15576 break;
15577 case E_V4DImode:
15578 /* Use ix86_expand_vector_set in 64bit mode only. */
15579 use_vector_set = TARGET_AVX && TARGET_64BIT;
15580 gen_vec_set_0 = gen_vec_setv4di_0;
15581 break;
15582 case E_V16SImode:
15583 use_vector_set = TARGET_AVX512F && one_var == 0;
15584 gen_vec_set_0 = gen_vec_setv16si_0;
15585 break;
15586 case E_V16SFmode:
15587 use_vector_set = TARGET_AVX512F && one_var == 0;
15588 gen_vec_set_0 = gen_vec_setv16sf_0;
15589 break;
15590 case E_V8DFmode:
15591 use_vector_set = TARGET_AVX512F && one_var == 0;
15592 gen_vec_set_0 = gen_vec_setv8df_0;
15593 break;
15594 case E_V8DImode:
15595 /* Use ix86_expand_vector_set in 64bit mode only. */
15596 use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
15597 gen_vec_set_0 = gen_vec_setv8di_0;
15598 break;
15599 case E_V8HFmode:
15600 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15601 gen_vec_set_0 = gen_vec_setv8hf_0;
15602 break;
15603 case E_V16HFmode:
15604 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15605 gen_vec_set_0 = gen_vec_setv16hf_0;
15606 break;
15607 case E_V32HFmode:
15608 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15609 gen_vec_set_0 = gen_vec_setv32hf_0;
15610 break;
15611 case E_V8BFmode:
15612 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15613 gen_vec_set_0 = gen_vec_setv8bf_0;
15614 break;
15615 case E_V16BFmode:
15616 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15617 gen_vec_set_0 = gen_vec_setv16bf_0;
15618 break;
15619 case E_V32BFmode:
15620 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15621 gen_vec_set_0 = gen_vec_setv32bf_0;
15622 break;
15623 case E_V32HImode:
15624 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15625 gen_vec_set_0 = gen_vec_setv32hi_0;
break;
15626 default:
15627 break;
15628 }
15629
15630 if (use_vector_set)
15631 {
15632 if (gen_vec_set_0 && one_var == 0)
15633 {
15634 var = force_reg (GET_MODE_INNER (mode), var);
15635 emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
15636 return true;
15637 }
15638 emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
15639 var = force_reg (GET_MODE_INNER (mode), var);
15640 ix86_expand_vector_set (mmx_ok, target, var, one_var);
15641 return true;
15642 }
15643
15644 switch (mode)
15645 {
15646 case E_V2SFmode:
15647 case E_V2SImode:
15648 if (!mmx_ok)
15649 return false;
15650 /* FALLTHRU */
15651
15652 case E_V2DFmode:
15653 case E_V2DImode:
15654 if (one_var != 0)
15655 return false;
15656 var = force_reg (GET_MODE_INNER (mode), var);
15657 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
15658 emit_insn (gen_rtx_SET (target, x));
15659 return true;
15660
15661 case E_V4SFmode:
15662 case E_V4SImode:
15663 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
15664 new_target = gen_reg_rtx (mode);
15665 else
15666 new_target = target;
15667 var = force_reg (GET_MODE_INNER (mode), var);
15668 x = gen_rtx_VEC_DUPLICATE (mode, var);
15669 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
15670 emit_insn (gen_rtx_SET (new_target, x));
15671 if (one_var != 0)
15672 {
15673 /* We need to shuffle the value to the correct position, so
15674 create a new pseudo to store the intermediate result. */
15675
15676 /* With SSE2, we can use the integer shuffle insns. */
15677 if (mode != V4SFmode && TARGET_SSE2)
15678 {
15679 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
15680 const1_rtx,
15681 GEN_INT (one_var == 1 ? 0 : 1),
15682 GEN_INT (one_var == 2 ? 0 : 1),
15683 GEN_INT (one_var == 3 ? 0 : 1)));
15684 if (target != new_target)
15685 emit_move_insn (target, new_target);
15686 return true;
15687 }
15688
15689 /* Otherwise convert the intermediate result to V4SFmode and
15690 use the SSE1 shuffle instructions. */
15691 if (mode != V4SFmode)
15692 {
15693 tmp = gen_reg_rtx (V4SFmode);
15694 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
15695 }
15696 else
15697 tmp = new_target;
15698
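/* The shufps below moves the value from element 0 of TMP into element
   ONE_VAR of the result and fills the remaining lanes with zeros.  */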
15699 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
15700 const1_rtx,
15701 GEN_INT (one_var == 1 ? 0 : 1),
15702 GEN_INT (one_var == 2 ? 0+4 : 1+4),
15703 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
15704
15705 if (mode != V4SFmode)
15706 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
15707 else if (tmp != target)
15708 emit_move_insn (target, tmp);
15709 }
15710 else if (target != new_target)
15711 emit_move_insn (target, new_target);
15712 return true;
15713
15714 case E_V8HImode:
15715 case E_V16QImode:
15716 vsimode = V4SImode;
15717 goto widen;
15718 case E_V4HImode:
15719 case E_V8QImode:
15720 if (!mmx_ok)
15721 return false;
15722 vsimode = V2SImode;
15723 goto widen;
15724 widen:
15725 if (one_var != 0)
15726 return false;
15727
15728 /* Zero extend the variable element to SImode and recurse. */
15729 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
15730
15731 x = gen_reg_rtx (vsimode);
15732 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
15733 var, one_var))
15734 gcc_unreachable ();
15735
15736 emit_move_insn (target, gen_lowpart (mode, x));
15737 return true;
15738
15739 default:
15740 return false;
15741 }
15742 }
15743
15744 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
15745 consisting of the values in VALS. It is known that all elements
15746 except ONE_VAR are constants. Return true if successful. */
15747
15748 static bool
15749 ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
15750 rtx target, rtx vals, int one_var)
15751 {
15752 rtx var = XVECEXP (vals, 0, one_var);
15753 machine_mode wmode;
15754 rtx const_vec, x;
15755
15756 const_vec = copy_rtx (vals);
15757 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
15758 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
15759
15760 switch (mode)
15761 {
15762 case E_V2DFmode:
15763 case E_V2DImode:
15764 case E_V2SFmode:
15765 case E_V2SImode:
15766 /* For the two element vectors, it's just as easy to use
15767 the general case. */
15768 return false;
15769
15770 case E_V4DImode:
15771 /* Use ix86_expand_vector_set in 64bit mode only. */
15772 if (!TARGET_64BIT)
15773 return false;
15774 /* FALLTHRU */
15775 case E_V8HFmode:
15776 case E_V16HFmode:
15777 case E_V8BFmode:
15778 case E_V16BFmode:
15779 case E_V4DFmode:
15780 case E_V8SFmode:
15781 case E_V8SImode:
15782 case E_V16HImode:
15783 case E_V32QImode:
15784 case E_V4SFmode:
15785 case E_V4SImode:
15786 case E_V8HImode:
15787 case E_V4HImode:
15788 break;
15789
15790 case E_V16QImode:
15791 if (TARGET_SSE4_1)
15792 break;
15793 wmode = V8HImode;
15794 goto widen;
15795 case E_V8QImode:
15796 if (TARGET_MMX_WITH_SSE && TARGET_SSE4_1)
15797 break;
15798 wmode = V4HImode;
15799 goto widen;
15800 case E_V4QImode:
15801 if (TARGET_SSE4_1)
15802 break;
15803 wmode = V2HImode;
15804 widen:
15805 /* There's no way to set one QImode entry easily. Combine
15806 the variable value with its adjacent constant value, and
15807 promote to an HImode set. */
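/* For example, with ONE_VAR == 3 in V16QImode the variable byte is
   combined with the constant byte at index 2 into a single HImode
   value, which is then inserted at word index 1 (ONE_VAR >> 1) of the
   V8HImode view of the constant vector.  */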
15808 x = XVECEXP (vals, 0, one_var ^ 1);
15809 if (one_var & 1)
15810 {
15811 var = convert_modes (HImode, QImode, var, true);
15812 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
15813 NULL_RTX, 1, OPTAB_LIB_WIDEN);
15814 x = GEN_INT (INTVAL (x) & 0xff);
15815 }
15816 else
15817 {
15818 var = convert_modes (HImode, QImode, var, true);
15819 x = gen_int_mode (UINTVAL (x) << 8, HImode);
15820 }
15821 if (x != const0_rtx)
15822 var = expand_simple_binop (HImode, IOR, var, x, var,
15823 1, OPTAB_LIB_WIDEN);
15824
15825 x = gen_reg_rtx (wmode);
15826 emit_move_insn (x, gen_lowpart (wmode, const_vec));
15827 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
15828
15829 emit_move_insn (target, gen_lowpart (mode, x));
15830 return true;
15831
15832 default:
15833 return false;
15834 }
15835
15836 emit_move_insn (target, const_vec);
15837 ix86_expand_vector_set (mmx_ok, target, var, one_var);
15838 return true;
15839 }
15840
15841 /* A subroutine of ix86_expand_vector_init_general. Use vector
15842 concatenate to handle the most general case: all values variable,
15843 and none identical. */
15844
15845 static void
15846 ix86_expand_vector_init_concat (machine_mode mode,
15847 rtx target, rtx *ops, int n)
15848 {
15849 machine_mode half_mode = VOIDmode;
15850 rtx half[2];
15851 rtvec v;
15852 int i, j;
15853
15854 switch (n)
15855 {
15856 case 2:
15857 switch (mode)
15858 {
15859 case E_V32HFmode:
15860 half_mode = V16HFmode;
15861 break;
15862 case E_V32BFmode:
15863 half_mode = V16BFmode;
15864 break;
15865 case E_V16SImode:
15866 half_mode = V8SImode;
15867 break;
15868 case E_V16SFmode:
15869 half_mode = V8SFmode;
15870 break;
15871 case E_V8DImode:
15872 half_mode = V4DImode;
15873 break;
15874 case E_V8DFmode:
15875 half_mode = V4DFmode;
15876 break;
15877 case E_V16HFmode:
15878 half_mode = V8HFmode;
15879 break;
15880 case E_V16BFmode:
15881 half_mode = V8BFmode;
15882 break;
15883 case E_V8SImode:
15884 half_mode = V4SImode;
15885 break;
15886 case E_V8SFmode:
15887 half_mode = V4SFmode;
15888 break;
15889 case E_V4DImode:
15890 half_mode = V2DImode;
15891 break;
15892 case E_V4DFmode:
15893 half_mode = V2DFmode;
15894 break;
15895 case E_V4SImode:
15896 half_mode = V2SImode;
15897 break;
15898 case E_V4SFmode:
15899 half_mode = V2SFmode;
15900 break;
15901 case E_V2DImode:
15902 half_mode = DImode;
15903 break;
15904 case E_V2SImode:
15905 half_mode = SImode;
15906 break;
15907 case E_V2DFmode:
15908 half_mode = DFmode;
15909 break;
15910 case E_V2SFmode:
15911 half_mode = SFmode;
15912 break;
15913 default:
15914 gcc_unreachable ();
15915 }
15916
15917 if (!register_operand (ops[1], half_mode))
15918 ops[1] = force_reg (half_mode, ops[1]);
15919 if (!register_operand (ops[0], half_mode))
15920 ops[0] = force_reg (half_mode, ops[0]);
15921 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
15922 ops[1])));
15923 break;
15924
15925 case 4:
15926 switch (mode)
15927 {
15928 case E_V4DImode:
15929 half_mode = V2DImode;
15930 break;
15931 case E_V4DFmode:
15932 half_mode = V2DFmode;
15933 break;
15934 case E_V4SImode:
15935 half_mode = V2SImode;
15936 break;
15937 case E_V4SFmode:
15938 half_mode = V2SFmode;
15939 break;
15940 default:
15941 gcc_unreachable ();
15942 }
15943 goto half;
15944
15945 case 8:
15946 switch (mode)
15947 {
15948 case E_V8DImode:
15949 half_mode = V4DImode;
15950 break;
15951 case E_V8DFmode:
15952 half_mode = V4DFmode;
15953 break;
15954 case E_V8SImode:
15955 half_mode = V4SImode;
15956 break;
15957 case E_V8SFmode:
15958 half_mode = V4SFmode;
15959 break;
15960 default:
15961 gcc_unreachable ();
15962 }
15963 goto half;
15964
15965 case 16:
15966 switch (mode)
15967 {
15968 case E_V16SImode:
15969 half_mode = V8SImode;
15970 break;
15971 case E_V16SFmode:
15972 half_mode = V8SFmode;
15973 break;
15974 default:
15975 gcc_unreachable ();
15976 }
15977 goto half;
15978
15979 half:
15980 /* FIXME: We process inputs backward to help RA. PR 36222. */
15981 i = n - 1;
15982 for (j = 1; j != -1; j--)
15983 {
15984 half[j] = gen_reg_rtx (half_mode);
15985 switch (n >> 1)
15986 {
15987 case 2:
15988 v = gen_rtvec (2, ops[i-1], ops[i]);
15989 i -= 2;
15990 break;
15991 case 4:
15992 v = gen_rtvec (4, ops[i-3], ops[i-2], ops[i-1], ops[i]);
15993 i -= 4;
15994 break;
15995 case 8:
15996 v = gen_rtvec (8, ops[i-7], ops[i-6], ops[i-5], ops[i-4],
15997 ops[i-3], ops[i-2], ops[i-1], ops[i]);
15998 i -= 8;
15999 break;
16000 default:
16001 gcc_unreachable ();
16002 }
16003 ix86_expand_vector_init (false, half[j],
16004 gen_rtx_PARALLEL (half_mode, v));
16005 }
16006
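/* Each half now holds one half of the input values; concatenate the
   two halves via the n == 2 case above to form the full-width
   vector.  */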
16007 ix86_expand_vector_init_concat (mode, target, half, 2);
16008 break;
16009
16010 default:
16011 gcc_unreachable ();
16012 }
16013 }
16014
16015 /* A subroutine of ix86_expand_vector_init_general. Use vector
16016 interleave to handle the most general case: all values variable,
16017 and none identical. */
16018
16019 static void
16020 ix86_expand_vector_init_interleave (machine_mode mode,
16021 rtx target, rtx *ops, int n)
16022 {
16023 machine_mode first_imode, second_imode, third_imode, inner_mode;
16024 int i, j;
16025 rtx op, op0, op1;
16026 rtx (*gen_load_even) (rtx, rtx, rtx);
16027 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
16028 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
16029
16030 switch (mode)
16031 {
16032 case E_V8HFmode:
16033 gen_load_even = gen_vec_interleave_lowv8hf;
16034 gen_interleave_first_low = gen_vec_interleave_lowv4si;
16035 gen_interleave_second_low = gen_vec_interleave_lowv2di;
16036 inner_mode = HFmode;
16037 first_imode = V4SImode;
16038 second_imode = V2DImode;
16039 third_imode = VOIDmode;
16040 break;
16041 case E_V8BFmode:
16042 gen_load_even = gen_vec_interleave_lowv8bf;
16043 gen_interleave_first_low = gen_vec_interleave_lowv4si;
16044 gen_interleave_second_low = gen_vec_interleave_lowv2di;
16045 inner_mode = BFmode;
16046 first_imode = V4SImode;
16047 second_imode = V2DImode;
16048 third_imode = VOIDmode;
16049 break;
16050 case E_V8HImode:
16051 gen_load_even = gen_vec_setv8hi;
16052 gen_interleave_first_low = gen_vec_interleave_lowv4si;
16053 gen_interleave_second_low = gen_vec_interleave_lowv2di;
16054 inner_mode = HImode;
16055 first_imode = V4SImode;
16056 second_imode = V2DImode;
16057 third_imode = VOIDmode;
16058 break;
16059 case E_V16QImode:
16060 gen_load_even = gen_vec_setv16qi;
16061 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
16062 gen_interleave_second_low = gen_vec_interleave_lowv4si;
16063 inner_mode = QImode;
16064 first_imode = V8HImode;
16065 second_imode = V4SImode;
16066 third_imode = V2DImode;
16067 break;
16068 default:
16069 gcc_unreachable ();
16070 }
16071
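/* Pack the 2*N scalar inputs pairwise so that each of the N vectors
   built below carries one pair in its low elements, then repeatedly
   interleave the low parts of pairs of vectors until a single
   full-width vector remains.  */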
16072 for (i = 0; i < n; i++)
16073 {
16074 op = ops [i + i];
16075 if (inner_mode == HFmode || inner_mode == BFmode)
16076 {
16077 rtx even, odd;
16078 /* Use vpunpcklwd to pack two HFmode or BFmode values. */
16079 machine_mode vec_mode =
16080 (inner_mode == HFmode) ? V8HFmode : V8BFmode;
16081 op0 = gen_reg_rtx (vec_mode);
16082 even = lowpart_subreg (vec_mode,
16083 force_reg (inner_mode, op), inner_mode);
16084 odd = lowpart_subreg (vec_mode,
16085 force_reg (inner_mode, ops[i + i + 1]),
16086 inner_mode);
16087 emit_insn (gen_load_even (op0, even, odd));
16088 }
16089 else
16090 {
16091 /* Extend the odd element to SImode using a paradoxical SUBREG. */
16092 op0 = gen_reg_rtx (SImode);
16093 emit_move_insn (op0, gen_lowpart (SImode, op));
16094
16095 /* Insert the SImode value as low element of V4SImode vector. */
16096 op1 = gen_reg_rtx (V4SImode);
16097 op0 = gen_rtx_VEC_MERGE (V4SImode,
16098 gen_rtx_VEC_DUPLICATE (V4SImode,
16099 op0),
16100 CONST0_RTX (V4SImode),
16101 const1_rtx);
16102 emit_insn (gen_rtx_SET (op1, op0));
16103
16104 /* Cast the V4SImode vector back to a vector in the original mode. */
16105 op0 = gen_reg_rtx (mode);
16106 emit_move_insn (op0, gen_lowpart (mode, op1));
16107
16108 /* Load even elements into the second position. */
16109 emit_insn (gen_load_even (op0,
16110 force_reg (inner_mode,
16111 ops[i + i + 1]),
16112 const1_rtx));
16113 }
16114
16115 /* Cast vector to FIRST_IMODE vector. */
16116 ops[i] = gen_reg_rtx (first_imode);
16117 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
16118 }
16119
16120 /* Interleave low FIRST_IMODE vectors. */
16121 for (i = j = 0; i < n; i += 2, j++)
16122 {
16123 op0 = gen_reg_rtx (first_imode);
16124 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
16125
16126 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
16127 ops[j] = gen_reg_rtx (second_imode);
16128 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
16129 }
16130
16131 /* Interleave low SECOND_IMODE vectors. */
16132 switch (second_imode)
16133 {
16134 case E_V4SImode:
16135 for (i = j = 0; i < n / 2; i += 2, j++)
16136 {
16137 op0 = gen_reg_rtx (second_imode);
16138 emit_insn (gen_interleave_second_low (op0, ops[i],
16139 ops[i + 1]));
16140
16141 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
16142 vector. */
16143 ops[j] = gen_reg_rtx (third_imode);
16144 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
16145 }
16146 second_imode = V2DImode;
16147 gen_interleave_second_low = gen_vec_interleave_lowv2di;
16148 /* FALLTHRU */
16149
16150 case E_V2DImode:
16151 op0 = gen_reg_rtx (second_imode);
16152 emit_insn (gen_interleave_second_low (op0, ops[0],
16153 ops[1]));
16154
16155 /* Cast the SECOND_IMODE vector back to a vector in the original
16156 mode. */
16157 emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
16158 break;
16159
16160 default:
16161 gcc_unreachable ();
16162 }
16163 }
16164
16165 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
16166 all values variable, and none identical. */
16167
16168 static void
16169 ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
16170 rtx target, rtx vals)
16171 {
16172 rtx ops[64], op0, op1, op2, op3, op4, op5;
16173 machine_mode half_mode = VOIDmode;
16174 machine_mode quarter_mode = VOIDmode;
16175 int n, i;
16176
16177 switch (mode)
16178 {
16179 case E_V2SFmode:
16180 case E_V2SImode:
16181 if (!mmx_ok && !TARGET_SSE)
16182 break;
16183 /* FALLTHRU */
16184
16185 case E_V16SImode:
16186 case E_V16SFmode:
16187 case E_V8DFmode:
16188 case E_V8DImode:
16189 case E_V8SFmode:
16190 case E_V8SImode:
16191 case E_V4DFmode:
16192 case E_V4DImode:
16193 case E_V4SFmode:
16194 case E_V4SImode:
16195 case E_V2DFmode:
16196 case E_V2DImode:
16197 n = GET_MODE_NUNITS (mode);
16198 for (i = 0; i < n; i++)
16199 ops[i] = XVECEXP (vals, 0, i);
16200 ix86_expand_vector_init_concat (mode, target, ops, n);
16201 return;
16202
16203 case E_V2TImode:
16204 for (i = 0; i < 2; i++)
16205 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
16206 op0 = gen_reg_rtx (V4DImode);
16207 ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
16208 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
16209 return;
16210
16211 case E_V4TImode:
16212 for (i = 0; i < 4; i++)
16213 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
16214 ops[4] = gen_reg_rtx (V4DImode);
16215 ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
16216 ops[5] = gen_reg_rtx (V4DImode);
16217 ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
16218 op0 = gen_reg_rtx (V8DImode);
16219 ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
16220 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
16221 return;
16222
16223 case E_V32QImode:
16224 half_mode = V16QImode;
16225 goto half;
16226
16227 case E_V16HImode:
16228 half_mode = V8HImode;
16229 goto half;
16230
16231 case E_V16HFmode:
16232 half_mode = V8HFmode;
16233 goto half;
16234
16235 case E_V16BFmode:
16236 half_mode = V8BFmode;
16237 goto half;
16238
16239 half:
16240 n = GET_MODE_NUNITS (mode);
16241 for (i = 0; i < n; i++)
16242 ops[i] = XVECEXP (vals, 0, i);
16243 op0 = gen_reg_rtx (half_mode);
16244 op1 = gen_reg_rtx (half_mode);
16245 ix86_expand_vector_init_interleave (half_mode, op0, ops,
16246 n >> 2);
16247 ix86_expand_vector_init_interleave (half_mode, op1,
16248 &ops [n >> 1], n >> 2);
16249 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
16250 return;
16251
16252 case E_V64QImode:
16253 quarter_mode = V16QImode;
16254 half_mode = V32QImode;
16255 goto quarter;
16256
16257 case E_V32HImode:
16258 quarter_mode = V8HImode;
16259 half_mode = V16HImode;
16260 goto quarter;
16261
16262 case E_V32HFmode:
16263 quarter_mode = V8HFmode;
16264 half_mode = V16HFmode;
16265 goto quarter;
16266
16267 case E_V32BFmode:
16268 quarter_mode = V8BFmode;
16269 half_mode = V16BFmode;
16270 goto quarter;
16271
16272 quarter:
16273 n = GET_MODE_NUNITS (mode);
16274 for (i = 0; i < n; i++)
16275 ops[i] = XVECEXP (vals, 0, i);
16276 op0 = gen_reg_rtx (quarter_mode);
16277 op1 = gen_reg_rtx (quarter_mode);
16278 op2 = gen_reg_rtx (quarter_mode);
16279 op3 = gen_reg_rtx (quarter_mode);
16280 op4 = gen_reg_rtx (half_mode);
16281 op5 = gen_reg_rtx (half_mode);
16282 ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
16283 n >> 3);
16284 ix86_expand_vector_init_interleave (quarter_mode, op1,
16285 &ops [n >> 2], n >> 3);
16286 ix86_expand_vector_init_interleave (quarter_mode, op2,
16287 &ops [n >> 1], n >> 3);
16288 ix86_expand_vector_init_interleave (quarter_mode, op3,
16289 &ops [(n >> 1) | (n >> 2)], n >> 3);
16290 emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
16291 emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
16292 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
16293 return;
16294
16295 case E_V16QImode:
16296 if (!TARGET_SSE4_1)
16297 break;
16298 /* FALLTHRU */
16299
16300 case E_V8HImode:
16301 if (!TARGET_SSE2)
16302 break;
16303
16304 /* Don't use ix86_expand_vector_init_interleave if we can't
16305 move from GPR to SSE register directly. */
16306 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
16307 break;
16308 /* FALLTHRU */
16309
16310 case E_V8HFmode:
16311 case E_V8BFmode:
16312
16313 n = GET_MODE_NUNITS (mode);
16314 for (i = 0; i < n; i++)
16315 ops[i] = XVECEXP (vals, 0, i);
16316 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
16317 return;
16318
16319 case E_V4HImode:
16320 case E_V8QImode:
16321
16322 case E_V2HImode:
16323 case E_V4QImode:
16324 break;
16325
16326 default:
16327 gcc_unreachable ();
16328 }
16329
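/* Fallback for the remaining cases: assemble the scalar elements into
   word-sized integers with shifts and IORs, then build the vector from
   those words.  */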
16330 {
16331 int i, j, n_elts, n_words, n_elt_per_word;
16332 machine_mode tmp_mode, inner_mode;
16333 rtx words[4], shift;
16334
16335 tmp_mode = (GET_MODE_SIZE (mode) < UNITS_PER_WORD) ? SImode : word_mode;
16336
16337 inner_mode = GET_MODE_INNER (mode);
16338 n_elts = GET_MODE_NUNITS (mode);
16339 n_words = GET_MODE_SIZE (mode) / GET_MODE_SIZE (tmp_mode);
16340 n_elt_per_word = n_elts / n_words;
16341 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
16342
16343 for (i = 0; i < n_words; ++i)
16344 {
16345 rtx word = NULL_RTX;
16346
16347 for (j = 0; j < n_elt_per_word; ++j)
16348 {
16349 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
16350 elt = convert_modes (tmp_mode, inner_mode, elt, true);
16351
16352 if (j == 0)
16353 word = elt;
16354 else
16355 {
16356 word = expand_simple_binop (tmp_mode, ASHIFT, word, shift,
16357 NULL_RTX, 1, OPTAB_LIB_WIDEN);
16358 word = expand_simple_binop (tmp_mode, IOR, word, elt,
16359 NULL_RTX, 1, OPTAB_LIB_WIDEN);
16360 }
16361 }
16362
16363 words[i] = word;
16364 }
16365
16366 if (n_words == 1)
16367 emit_move_insn (target, gen_lowpart (mode, words[0]));
16368 else if (n_words == 2)
16369 {
16370 rtx tmp = gen_reg_rtx (mode);
16371 emit_clobber (tmp);
16372 emit_move_insn (gen_lowpart (tmp_mode, tmp), words[0]);
16373 emit_move_insn (gen_highpart (tmp_mode, tmp), words[1]);
16374 emit_move_insn (target, tmp);
16375 }
16376 else if (n_words == 4)
16377 {
16378 rtx tmp = gen_reg_rtx (V4SImode);
16379 gcc_assert (tmp_mode == SImode);
16380 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
16381 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
16382 emit_move_insn (target, gen_lowpart (mode, tmp));
16383 }
16384 else
16385 gcc_unreachable ();
16386 }
16387 }
16388
16389 /* Initialize vector TARGET via VALS. Suppress the use of MMX
16390 instructions unless MMX_OK is true. */
16391
16392 void
16393 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
16394 {
16395 machine_mode mode = GET_MODE (target);
16396 machine_mode inner_mode = GET_MODE_INNER (mode);
16397 int n_elts = GET_MODE_NUNITS (mode);
16398 int n_var = 0, one_var = -1;
16399 bool all_same = true, all_const_zero = true;
16400 int i;
16401 rtx x;
16402
16403 /* Handle initialization from vector elts first, i.e. when VALS has
fewer entries than MODE has elements because each entry is itself a
vector. */
16404 if (n_elts != XVECLEN (vals, 0))
16405 {
16406 rtx subtarget = target;
16407 x = XVECEXP (vals, 0, 0);
16408 gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
16409 if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
16410 {
16411 rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
16412 if (inner_mode == QImode
16413 || inner_mode == HImode
16414 || inner_mode == TImode
16415 || inner_mode == HFmode
16416 || inner_mode == BFmode)
16417 {
16418 unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
16419 scalar_mode elt_mode = inner_mode == TImode ? DImode : SImode;
16420 n_bits /= GET_MODE_SIZE (elt_mode);
16421 mode = mode_for_vector (elt_mode, n_bits).require ();
16422 inner_mode = mode_for_vector (elt_mode, n_bits / 2).require ();
16423 ops[0] = gen_lowpart (inner_mode, ops[0]);
16424 ops[1] = gen_lowpart (inner_mode, ops[1]);
16425 subtarget = gen_reg_rtx (mode);
16426 }
16427 ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
16428 if (subtarget != target)
16429 emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
16430 return;
16431 }
16432 gcc_unreachable ();
16433 }
16434
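/* Classify the initializer: count the non-constant elements, remember
   the position of one of them, and track whether all elements are
   identical and whether all are constant zero.  */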
16435 for (i = 0; i < n_elts; ++i)
16436 {
16437 x = XVECEXP (vals, 0, i);
16438 if (!(CONST_SCALAR_INT_P (x)
16439 || CONST_DOUBLE_P (x)
16440 || CONST_FIXED_P (x)))
16441 n_var++, one_var = i;
16442 else if (x != CONST0_RTX (inner_mode))
16443 all_const_zero = false;
16444 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
16445 all_same = false;
16446 }
16447
16448 /* Constants are best loaded from the constant pool. */
16449 if (n_var == 0)
16450 {
16451 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
16452 return;
16453 }
16454
16455 /* If all values are identical, broadcast the value. */
16456 if (all_same
16457 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
16458 XVECEXP (vals, 0, 0)))
16459 return;
16460
16461 /* Values where only one field is non-constant are best loaded from
16462 the pool and overwritten via move later. */
16463 if (n_var == 1)
16464 {
16465 if (all_const_zero
16466 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
16467 XVECEXP (vals, 0, one_var),
16468 one_var))
16469 return;
16470
16471 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
16472 return;
16473 }
16474
16475 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
16476 }
16477
16478 /* Implemented as
16479 V setg (V v, int idx, T val)
16480 {
16481 V idxv = (V){idx, idx, idx, idx, idx, idx, idx, idx};
16482 V valv = (V){val, val, val, val, val, val, val, val};
16483 V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == idxv);
16484 v = (v & ~mask) | (valv & mask);
16485 return v;
16486 }. */
16487 void
16488 ix86_expand_vector_set_var (rtx target, rtx val, rtx idx)
16489 {
16490 rtx vec[64];
16491 machine_mode mode = GET_MODE (target);
16492 machine_mode cmp_mode = mode;
16493 int n_elts = GET_MODE_NUNITS (mode);
16494 rtx valv,idxv,constv,idx_tmp;
16495 bool ok = false;
16496
16497 /* 512-bit vector byte/word broadcast and comparison are only available
16498 under TARGET_AVX512BW; without TARGET_AVX512BW, break the 512-bit vector
16499 into two 256-bit vectors. */
16500 if ((mode == V32HImode || mode == V32HFmode || mode == V32BFmode
16501 || mode == V64QImode)
16502 && !TARGET_AVX512BW)
16503 {
16504 gcc_assert (TARGET_AVX512F);
16505 rtx vhi, vlo, idx_hi;
16506 machine_mode half_mode;
16507 rtx (*extract_hi)(rtx, rtx);
16508 rtx (*extract_lo)(rtx, rtx);
16509
16510 if (mode == V32HImode)
16511 {
16512 half_mode = V16HImode;
16513 extract_hi = gen_vec_extract_hi_v32hi;
16514 extract_lo = gen_vec_extract_lo_v32hi;
16515 }
16516 else if (mode == V32HFmode)
16517 {
16518 half_mode = V16HFmode;
16519 extract_hi = gen_vec_extract_hi_v32hf;
16520 extract_lo = gen_vec_extract_lo_v32hf;
16521 }
16522 else if (mode == V32BFmode)
16523 {
16524 half_mode = V16BFmode;
16525 extract_hi = gen_vec_extract_hi_v32bf;
16526 extract_lo = gen_vec_extract_lo_v32bf;
16527 }
16528 else
16529 {
16530 half_mode = V32QImode;
16531 extract_hi = gen_vec_extract_hi_v64qi;
16532 extract_lo = gen_vec_extract_lo_v64qi;
16533 }
16534
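/* Extract the two 256-bit halves and recurse into both; in the half
   that does not contain IDX the adjusted index matches no element of
   the constant index vector, so that half is left unchanged before the
   halves are concatenated back together.  */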
16535 vhi = gen_reg_rtx (half_mode);
16536 vlo = gen_reg_rtx (half_mode);
16537 idx_hi = gen_reg_rtx (GET_MODE (idx));
16538 emit_insn (extract_hi (vhi, target));
16539 emit_insn (extract_lo (vlo, target));
16540 vec[0] = idx_hi;
16541 vec[1] = idx;
16542 vec[2] = GEN_INT (n_elts/2);
16543 ix86_expand_binary_operator (MINUS, GET_MODE (idx), vec);
16544 ix86_expand_vector_set_var (vhi, val, idx_hi);
16545 ix86_expand_vector_set_var (vlo, val, idx);
16546 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, vlo, vhi)));
16547 return;
16548 }
16549
16550 if (FLOAT_MODE_P (GET_MODE_INNER (mode)))
16551 {
16552 switch (mode)
16553 {
16554 case E_V2DFmode:
16555 cmp_mode = V2DImode;
16556 break;
16557 case E_V4DFmode:
16558 cmp_mode = V4DImode;
16559 break;
16560 case E_V8DFmode:
16561 cmp_mode = V8DImode;
16562 break;
16563 case E_V2SFmode:
16564 cmp_mode = V2SImode;
16565 break;
16566 case E_V4SFmode:
16567 cmp_mode = V4SImode;
16568 break;
16569 case E_V8SFmode:
16570 cmp_mode = V8SImode;
16571 break;
16572 case E_V16SFmode:
16573 cmp_mode = V16SImode;
16574 break;
16575 case E_V8HFmode:
16576 cmp_mode = V8HImode;
16577 break;
16578 case E_V16HFmode:
16579 cmp_mode = V16HImode;
16580 break;
16581 case E_V32HFmode:
16582 cmp_mode = V32HImode;
16583 break;
16584 case E_V8BFmode:
16585 cmp_mode = V8HImode;
16586 break;
16587 case E_V16BFmode:
16588 cmp_mode = V16HImode;
16589 break;
16590 case E_V32BFmode:
16591 cmp_mode = V32HImode;
16592 break;
16593 default:
16594 gcc_unreachable ();
16595 }
16596 }
16597
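/* General case, as sketched in the comment above the function: build
   the constant index vector {0, 1, ..., n_elts-1}, broadcast VAL and
   IDX, and let ix86_expand_int_vcond emit the compare-and-blend that
   replaces only the selected element.  */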
16598 for (int i = 0; i != n_elts; i++)
16599 vec[i] = GEN_INT (i);
16600 constv = gen_rtx_CONST_VECTOR (cmp_mode, gen_rtvec_v (n_elts, vec));
16601 valv = gen_reg_rtx (mode);
16602 idxv = gen_reg_rtx (cmp_mode);
16603 idx_tmp = convert_to_mode (GET_MODE_INNER (cmp_mode), idx, 1);
16604
16605 ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE,
16606 mode, valv, val);
16607 gcc_assert (ok);
16608 ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE,
16609 cmp_mode, idxv, idx_tmp);
16610 gcc_assert (ok);
16611 vec[0] = target;
16612 vec[1] = valv;
16613 vec[2] = target;
16614 vec[3] = gen_rtx_EQ (mode, idxv, constv);
16615 vec[4] = idxv;
16616 vec[5] = constv;
16617 ok = ix86_expand_int_vcond (vec);
16618 gcc_assert (ok);
16619 }
16620
16621 void
16622 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
16623 {
16624 machine_mode mode = GET_MODE (target);
16625 machine_mode inner_mode = GET_MODE_INNER (mode);
16626 machine_mode half_mode;
16627 bool use_vec_merge = false;
16628 bool blendm_const = false;
16629 rtx tmp;
16630 static rtx (*gen_extract[8][2]) (rtx, rtx)
16631 = {
16632 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
16633 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
16634 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
16635 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
16636 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
16637 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df },
16638 { gen_vec_extract_lo_v16hf, gen_vec_extract_hi_v16hf },
16639 { gen_vec_extract_lo_v16bf, gen_vec_extract_hi_v16bf }
16640 };
16641 static rtx (*gen_insert[8][2]) (rtx, rtx, rtx)
16642 = {
16643 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
16644 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
16645 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
16646 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
16647 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
16648 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df },
16649 { gen_vec_set_lo_v16hf, gen_vec_set_hi_v16hf },
16650 { gen_vec_set_lo_v16bf, gen_vec_set_hi_v16bf },
16651 };
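/* In the tables above, index [j] selects the 256-bit vector mode (in
   the order the entries are listed) and index [i] selects the low or
   high 128-bit half.  */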
16652 int i, j, n;
16653 machine_mode mmode = VOIDmode;
16654 rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
16655
16656 switch (mode)
16657 {
16658 case E_V2SImode:
16659 use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
16660 if (use_vec_merge)
16661 break;
16662 /* FALLTHRU */
16663
16664 case E_V2SFmode:
16665 if (mmx_ok)
16666 {
16667 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
16668 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
16669 if (elt == 0)
16670 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
16671 else
16672 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
16673 emit_insn (gen_rtx_SET (target, tmp));
16674 return;
16675 }
16676 break;
16677
16678 case E_V2DImode:
16679 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
16680 if (use_vec_merge)
16681 break;
16682
16683 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
16684 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
16685 if (elt == 0)
16686 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
16687 else
16688 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
16689 emit_insn (gen_rtx_SET (target, tmp));
16690 return;
16691
16692 case E_V2DFmode:
16693 /* NB: For ELT == 0, use standard scalar operation patterns which
16694 preserve the rest of the vector for the combiner:
16695
16696 (vec_merge:V2DF
16697 (vec_duplicate:V2DF (reg:DF))
16698 (reg:V2DF)
16699 (const_int 1))
16700 */
16701 if (elt == 0)
16702 goto do_vec_merge;
16703
16704 {
16705 rtx op0, op1;
16706
16707 /* For the two element vectors, we implement a VEC_CONCAT with
16708 the extraction of the other element. */
16709
16710 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
16711 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
16712
16713 if (elt == 0)
16714 op0 = val, op1 = tmp;
16715 else
16716 op0 = tmp, op1 = val;
16717
16718 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
16719 emit_insn (gen_rtx_SET (target, tmp));
16720 }
16721 return;
16722
16723 case E_V4SFmode:
16724 use_vec_merge = TARGET_SSE4_1;
16725 if (use_vec_merge)
16726 break;
16727
16728 switch (elt)
16729 {
16730 case 0:
16731 use_vec_merge = true;
16732 break;
16733
16734 case 1:
16735 /* tmp = target = A B C D */
16736 tmp = copy_to_reg (target);
16737 /* target = A A B B */
16738 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
16739 /* target = X A B B */
16740 ix86_expand_vector_set (false, target, val, 0);
16741 /* target = A X C D */
16742 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
16743 const1_rtx, const0_rtx,
16744 GEN_INT (2+4), GEN_INT (3+4)));
16745 return;
16746
16747 case 2:
16748 /* tmp = target = A B C D */
16749 tmp = copy_to_reg (target);
16750 /* tmp = X B C D */
16751 ix86_expand_vector_set (false, tmp, val, 0);
16752 /* target = A B X D */
16753 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
16754 const0_rtx, const1_rtx,
16755 GEN_INT (0+4), GEN_INT (3+4)));
16756 return;
16757
16758 case 3:
16759 /* tmp = target = A B C D */
16760 tmp = copy_to_reg (target);
16761 /* tmp = X B C D */
16762 ix86_expand_vector_set (false, tmp, val, 0);
16763 /* target = A B C X */
16764 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
16765 const0_rtx, const1_rtx,
16766 GEN_INT (2+4), GEN_INT (0+4)));
16767 return;
16768
16769 default:
16770 gcc_unreachable ();
16771 }
16772 break;
16773
16774 case E_V4SImode:
16775 use_vec_merge = TARGET_SSE4_1;
16776 if (use_vec_merge)
16777 break;
16778
16779 /* Element 0 handled by vec_merge below. */
16780 if (elt == 0)
16781 {
16782 use_vec_merge = true;
16783 break;
16784 }
16785
16786 if (TARGET_SSE2)
16787 {
16788 /* With SSE2, use integer shuffles to swap element 0 and ELT,
16789 store into element 0, then shuffle them back. */
16790
16791 rtx order[4];
16792
16793 order[0] = GEN_INT (elt);
16794 order[1] = const1_rtx;
16795 order[2] = const2_rtx;
16796 order[3] = GEN_INT (3);
16797 order[elt] = const0_rtx;
16798
16799 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
16800 order[1], order[2], order[3]));
16801
16802 ix86_expand_vector_set (false, target, val, 0);
16803
16804 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
16805 order[1], order[2], order[3]));
16806 }
16807 else
16808 {
16809 /* For SSE1, we have to reuse the V4SF code. */
16810 rtx t = gen_reg_rtx (V4SFmode);
16811 emit_move_insn (t, gen_lowpart (V4SFmode, target));
16812 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
16813 emit_move_insn (target, gen_lowpart (mode, t));
16814 }
16815 return;
16816
16817 case E_V8HImode:
16818 case E_V8HFmode:
16819 case E_V8BFmode:
16820 case E_V2HImode:
16821 use_vec_merge = TARGET_SSE2;
16822 break;
16823 case E_V4HImode:
16824 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
16825 break;
16826
16827 case E_V16QImode:
16828 case E_V4QImode:
16829 use_vec_merge = TARGET_SSE4_1;
16830 break;
16831
16832 case E_V8QImode:
16833 use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
16834 break;
16835
16836 case E_V32QImode:
16837 half_mode = V16QImode;
16838 j = 0;
16839 n = 16;
16840 goto half;
16841
16842 case E_V16HFmode:
16843 case E_V16BFmode:
16844 /* For ELT == 0, vec_setv8hf_0 can save 1 vpbroadcastw. */
16845 if (TARGET_AVX2 && elt != 0)
16846 {
16847 mmode = SImode;
16848 gen_blendm = ((mode == E_V16HFmode) ? gen_avx2_pblendph_1
16849 : gen_avx2_pblendbf_1);
16850 blendm_const = true;
16851 break;
16852 }
16853 else
16854 {
16855 half_mode = ((mode == E_V16HFmode) ? V8HFmode : V8BFmode);
16856 j = ((mode == E_V16HFmode) ? 6 : 7);
16857 n = 8;
16858 goto half;
16859 }
16860
16861 case E_V16HImode:
16862 half_mode = V8HImode;
16863 j = 1;
16864 n = 8;
16865 goto half;
16866
16867 case E_V8SImode:
16868 half_mode = V4SImode;
16869 j = 2;
16870 n = 4;
16871 goto half;
16872
16873 case E_V4DImode:
16874 half_mode = V2DImode;
16875 j = 3;
16876 n = 2;
16877 goto half;
16878
16879 case E_V8SFmode:
16880 half_mode = V4SFmode;
16881 j = 4;
16882 n = 4;
16883 goto half;
16884
16885 case E_V4DFmode:
16886 half_mode = V2DFmode;
16887 j = 5;
16888 n = 2;
16889 goto half;
16890
16891 half:
16892 /* Compute offset. */
16893 i = elt / n;
16894 elt %= n;
16895
16896 gcc_assert (i <= 1);
16897
16898 /* Extract the half. */
16899 tmp = gen_reg_rtx (half_mode);
16900 emit_insn (gen_extract[j][i] (tmp, target));
16901
16902 /* Put val in tmp at elt. */
16903 ix86_expand_vector_set (false, tmp, val, elt);
16904
16905 /* Put it back. */
16906 emit_insn (gen_insert[j][i] (target, target, tmp));
16907 return;
16908
16909 case E_V8DFmode:
16910 if (TARGET_AVX512F)
16911 {
16912 mmode = QImode;
16913 gen_blendm = gen_avx512f_blendmv8df;
16914 }
16915 break;
16916
16917 case E_V8DImode:
16918 if (TARGET_AVX512F)
16919 {
16920 mmode = QImode;
16921 gen_blendm = gen_avx512f_blendmv8di;
16922 }
16923 break;
16924
16925 case E_V16SFmode:
16926 if (TARGET_AVX512F)
16927 {
16928 mmode = HImode;
16929 gen_blendm = gen_avx512f_blendmv16sf;
16930 }
16931 break;
16932
16933 case E_V16SImode:
16934 if (TARGET_AVX512F)
16935 {
16936 mmode = HImode;
16937 gen_blendm = gen_avx512f_blendmv16si;
16938 }
16939 break;
16940
16941 case E_V32HFmode:
16942 if (TARGET_AVX512BW)
16943 {
16944 mmode = SImode;
16945 gen_blendm = gen_avx512bw_blendmv32hf;
16946 }
16947 break;
16948 case E_V32BFmode:
16949 if (TARGET_AVX512BW)
16950 {
16951 mmode = SImode;
16952 gen_blendm = gen_avx512bw_blendmv32bf;
16953 }
16954 break;
16955 case E_V32HImode:
16956 if (TARGET_AVX512BW)
16957 {
16958 mmode = SImode;
16959 gen_blendm = gen_avx512bw_blendmv32hi;
16960 }
16961 else if (TARGET_AVX512F)
16962 {
16963 half_mode = E_V8HImode;
16964 n = 8;
16965 goto quarter;
16966 }
16967 break;
16968
16969 case E_V64QImode:
16970 if (TARGET_AVX512BW)
16971 {
16972 mmode = DImode;
16973 gen_blendm = gen_avx512bw_blendmv64qi;
16974 }
16975 else if (TARGET_AVX512F)
16976 {
16977 half_mode = E_V16QImode;
16978 n = 16;
16979 goto quarter;
16980 }
16981 break;
16982
16983 quarter:
16984 /* Compute offset. */
16985 i = elt / n;
16986 elt %= n;
16987
16988 gcc_assert (i <= 3);
16989
16990 {
16991 /* Extract the quarter. */
16992 tmp = gen_reg_rtx (V4SImode);
16993 rtx tmp2 = gen_lowpart (V16SImode, target);
16994 rtx mask = gen_reg_rtx (QImode);
16995
16996 emit_move_insn (mask, constm1_rtx);
16997 emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i),
16998 tmp, mask));
16999
17000 tmp2 = gen_reg_rtx (half_mode);
17001 emit_move_insn (tmp2, gen_lowpart (half_mode, tmp));
17002 tmp = tmp2;
17003
17004 /* Put val in tmp at elt. */
17005 ix86_expand_vector_set (false, tmp, val, elt);
17006
17007 /* Put it back. */
17008 tmp2 = gen_reg_rtx (V16SImode);
17009 rtx tmp3 = gen_lowpart (V16SImode, target);
17010 mask = gen_reg_rtx (HImode);
17011 emit_move_insn (mask, constm1_rtx);
17012 tmp = gen_lowpart (V4SImode, tmp);
17013 emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i),
17014 tmp3, mask));
17015 emit_move_insn (target, gen_lowpart (mode, tmp2));
17016 }
17017 return;
17018
17019 default:
17020 break;
17021 }
17022
17023 if (mmode != VOIDmode)
17024 {
17025 tmp = gen_reg_rtx (mode);
17026 emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
17027 rtx merge_mask = gen_int_mode (HOST_WIDE_INT_1U << elt, mmode);
17028 /* The avx512*_blendm<mode> expanders have different operand order
17029 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
17030 elements where the mask is set and second input operand otherwise,
17031 in {sse,avx}*_*blend* the first input operand is used for elements
17032 where the mask is clear and second input operand otherwise. */
17033 if (!blendm_const)
17034 merge_mask = force_reg (mmode, merge_mask);
17035 emit_insn (gen_blendm (target, target, tmp, merge_mask));
17036 }
17037 else if (use_vec_merge)
17038 {
17039 do_vec_merge:
17040 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
17041 tmp = gen_rtx_VEC_MERGE (mode, tmp, target,
17042 GEN_INT (HOST_WIDE_INT_1U << elt));
17043 emit_insn (gen_rtx_SET (target, tmp));
17044 }
17045 else
17046 {
17047 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
17048
17049 emit_move_insn (mem, target);
17050
17051 tmp = adjust_address (mem, inner_mode, elt * GET_MODE_SIZE (inner_mode));
17052 emit_move_insn (tmp, val);
17053
17054 emit_move_insn (target, mem);
17055 }
17056 }
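
/* A minimal C sketch of the memory fallback above, assuming <string.h>;
   the helper name and the fixed 4 x int layout are only for illustration:

     void
     set_elt_sketch (int vec[4], int val, int elt)
     {
       int tmp[4];
       memcpy (tmp, vec, sizeof tmp);    // emit_move_insn (mem, target)
       tmp[elt] = val;                   // store VAL at ELT's byte offset
       memcpy (vec, tmp, sizeof tmp);    // emit_move_insn (target, mem)
     }  */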
17057
17058 void
17059 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
17060 {
17061 machine_mode mode = GET_MODE (vec);
17062 machine_mode inner_mode = GET_MODE_INNER (mode);
17063 bool use_vec_extr = false;
17064 rtx tmp;
17065
17066 switch (mode)
17067 {
17068 case E_V2SImode:
17069 use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
17070 if (use_vec_extr)
17071 break;
17072 /* FALLTHRU */
17073
17074 case E_V2SFmode:
17075 if (!mmx_ok)
17076 break;
17077 /* FALLTHRU */
17078
17079 case E_V2DFmode:
17080 case E_V2DImode:
17081 case E_V2TImode:
17082 case E_V4TImode:
17083 use_vec_extr = true;
17084 break;
17085
17086 case E_V4SFmode:
17087 use_vec_extr = TARGET_SSE4_1;
17088 if (use_vec_extr)
17089 break;
17090
17091 switch (elt)
17092 {
17093 case 0:
17094 tmp = vec;
17095 break;
17096
17097 case 1:
17098 case 3:
17099 tmp = gen_reg_rtx (mode);
17100 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
17101 GEN_INT (elt), GEN_INT (elt),
17102 GEN_INT (elt+4), GEN_INT (elt+4)));
17103 break;
17104
17105 case 2:
17106 tmp = gen_reg_rtx (mode);
17107 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
17108 break;
17109
17110 default:
17111 gcc_unreachable ();
17112 }
17113 vec = tmp;
17114 use_vec_extr = true;
17115 elt = 0;
17116 break;
17117
17118 case E_V4SImode:
17119 use_vec_extr = TARGET_SSE4_1;
17120 if (use_vec_extr)
17121 break;
17122
17123 if (TARGET_SSE2)
17124 {
17125 switch (elt)
17126 {
17127 case 0:
17128 tmp = vec;
17129 break;
17130
17131 case 1:
17132 case 3:
17133 tmp = gen_reg_rtx (mode);
17134 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
17135 GEN_INT (elt), GEN_INT (elt),
17136 GEN_INT (elt), GEN_INT (elt)));
17137 break;
17138
17139 case 2:
17140 tmp = gen_reg_rtx (mode);
17141 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
17142 break;
17143
17144 default:
17145 gcc_unreachable ();
17146 }
17147 vec = tmp;
17148 use_vec_extr = true;
17149 elt = 0;
17150 }
17151 else
17152 {
17153 /* For SSE1, we have to reuse the V4SF code. */
17154 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
17155 gen_lowpart (V4SFmode, vec), elt);
17156 return;
17157 }
17158 break;
17159
17160 case E_V8HImode:
17161 case E_V8HFmode:
17162 case E_V8BFmode:
17163 case E_V2HImode:
17164 use_vec_extr = TARGET_SSE2;
17165 break;
17166 case E_V4HImode:
17167 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
17168 break;
17169
17170 case E_V16QImode:
17171 use_vec_extr = TARGET_SSE4_1;
17172 if (!use_vec_extr
17173 && TARGET_SSE2
17174 && elt == 0
17175 && (optimize_insn_for_size_p () || TARGET_INTER_UNIT_MOVES_FROM_VEC))
17176 {
17177 tmp = gen_reg_rtx (SImode);
17178 ix86_expand_vector_extract (false, tmp, gen_lowpart (V4SImode, vec),
17179 0);
17180 emit_insn (gen_rtx_SET (target, gen_lowpart (QImode, tmp)));
17181 return;
17182 }
17183 break;
17184 case E_V4QImode:
17185 use_vec_extr = TARGET_SSE4_1;
17186 break;
17187
17188 case E_V8SFmode:
17189 if (TARGET_AVX)
17190 {
17191 tmp = gen_reg_rtx (V4SFmode);
17192 if (elt < 4)
17193 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
17194 else
17195 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
17196 ix86_expand_vector_extract (false, target, tmp, elt & 3);
17197 return;
17198 }
17199 break;
17200
17201 case E_V4DFmode:
17202 if (TARGET_AVX)
17203 {
17204 tmp = gen_reg_rtx (V2DFmode);
17205 if (elt < 2)
17206 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
17207 else
17208 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
17209 ix86_expand_vector_extract (false, target, tmp, elt & 1);
17210 return;
17211 }
17212 break;
17213
17214 case E_V32QImode:
17215 if (TARGET_AVX)
17216 {
17217 tmp = gen_reg_rtx (V16QImode);
17218 if (elt < 16)
17219 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
17220 else
17221 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
17222 ix86_expand_vector_extract (false, target, tmp, elt & 15);
17223 return;
17224 }
17225 break;
17226
17227 case E_V16HImode:
17228 if (TARGET_AVX)
17229 {
17230 tmp = gen_reg_rtx (V8HImode);
17231 if (elt < 8)
17232 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
17233 else
17234 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
17235 ix86_expand_vector_extract (false, target, tmp, elt & 7);
17236 return;
17237 }
17238 break;
17239
17240 case E_V8SImode:
17241 if (TARGET_AVX)
17242 {
17243 tmp = gen_reg_rtx (V4SImode);
17244 if (elt < 4)
17245 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
17246 else
17247 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
17248 ix86_expand_vector_extract (false, target, tmp, elt & 3);
17249 return;
17250 }
17251 break;
17252
17253 case E_V4DImode:
17254 if (TARGET_AVX)
17255 {
17256 tmp = gen_reg_rtx (V2DImode);
17257 if (elt < 2)
17258 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
17259 else
17260 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
17261 ix86_expand_vector_extract (false, target, tmp, elt & 1);
17262 return;
17263 }
17264 break;
17265
17266 case E_V32HImode:
17267 if (TARGET_AVX512BW)
17268 {
17269 tmp = gen_reg_rtx (V16HImode);
17270 if (elt < 16)
17271 emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
17272 else
17273 emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
17274 ix86_expand_vector_extract (false, target, tmp, elt & 15);
17275 return;
17276 }
17277 break;
17278
17279 case E_V64QImode:
17280 if (TARGET_AVX512BW)
17281 {
17282 tmp = gen_reg_rtx (V32QImode);
17283 if (elt < 32)
17284 emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
17285 else
17286 emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
17287 ix86_expand_vector_extract (false, target, tmp, elt & 31);
17288 return;
17289 }
17290 break;
17291
17292 case E_V16SFmode:
17293 tmp = gen_reg_rtx (V8SFmode);
17294 if (elt < 8)
17295 emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
17296 else
17297 emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
17298 ix86_expand_vector_extract (false, target, tmp, elt & 7);
17299 return;
17300
17301 case E_V8DFmode:
17302 tmp = gen_reg_rtx (V4DFmode);
17303 if (elt < 4)
17304 emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
17305 else
17306 emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
17307 ix86_expand_vector_extract (false, target, tmp, elt & 3);
17308 return;
17309
17310 case E_V16SImode:
17311 tmp = gen_reg_rtx (V8SImode);
17312 if (elt < 8)
17313 emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
17314 else
17315 emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
17316 ix86_expand_vector_extract (false, target, tmp, elt & 7);
17317 return;
17318
17319 case E_V8DImode:
17320 tmp = gen_reg_rtx (V4DImode);
17321 if (elt < 4)
17322 emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
17323 else
17324 emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
17325 ix86_expand_vector_extract (false, target, tmp, elt & 3);
17326 return;
17327
17328 case E_V32HFmode:
17329 case E_V32BFmode:
17330 if (TARGET_AVX512BW)
17331 {
17332 tmp = (mode == E_V32HFmode
17333 ? gen_reg_rtx (V16HFmode)
17334 : gen_reg_rtx (V16BFmode));
17335 if (elt < 16)
17336 emit_insn (maybe_gen_vec_extract_lo (mode, tmp, vec));
17337 else
17338 emit_insn (maybe_gen_vec_extract_hi (mode, tmp, vec));
17339 ix86_expand_vector_extract (false, target, tmp, elt & 15);
17340 return;
17341 }
17342 break;
17343
17344 case E_V16HFmode:
17345 case E_V16BFmode:
17346 if (TARGET_AVX)
17347 {
17348 tmp = (mode == E_V16HFmode
17349 ? gen_reg_rtx (V8HFmode)
17350 : gen_reg_rtx (V8BFmode));
17351 if (elt < 8)
17352 emit_insn (maybe_gen_vec_extract_lo (mode, tmp, vec));
17353 else
17354 emit_insn (maybe_gen_vec_extract_hi (mode, tmp, vec));
17355 ix86_expand_vector_extract (false, target, tmp, elt & 7);
17356 return;
17357 }
17358 break;
17359
17360 case E_V8QImode:
17361 use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
17362 /* ??? Could extract the appropriate HImode element and shift. */
17363 break;
17364
17365 default:
17366 break;
17367 }
17368
17369 if (use_vec_extr)
17370 {
17371 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
17372 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
17373
17374 /* Let the rtl optimizers know about the zero extension performed. */
17375 if (inner_mode == QImode || inner_mode == HImode)
17376 {
17377 rtx reg = gen_reg_rtx (SImode);
17378 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
17379 emit_move_insn (reg, tmp);
17380 tmp = gen_lowpart (inner_mode, reg);
17381 SUBREG_PROMOTED_VAR_P (tmp) = 1;
17382 SUBREG_PROMOTED_SET (tmp, 1);
17383 }
17384
17385 emit_move_insn (target, tmp);
17386 }
17387 else
17388 {
17389 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
17390
17391 emit_move_insn (mem, vec);
17392
17393 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
17394 emit_move_insn (target, tmp);
17395 }
17396 }
17397
17398 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
17399 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
17400 The upper bits of DEST are undefined, though they shouldn't cause
17401 exceptions (some bits from src or all zeros are ok). */
17402
17403 static void
17404 emit_reduc_half (rtx dest, rtx src, int i)
17405 {
17406 rtx tem, d = dest;
17407 switch (GET_MODE (src))
17408 {
17409 case E_V4SFmode:
17410 if (i == 128)
17411 tem = gen_sse_movhlps (dest, src, src);
17412 else
17413 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
17414 GEN_INT (1 + 4), GEN_INT (1 + 4));
17415 break;
17416 case E_V2DFmode:
17417 tem = gen_vec_interleave_highv2df (dest, src, src);
17418 break;
17419 case E_V4QImode:
17420 d = gen_reg_rtx (V1SImode);
17421 tem = gen_mmx_lshrv1si3 (d, gen_lowpart (V1SImode, src),
17422 GEN_INT (i / 2));
17423 break;
17424 case E_V4HImode:
17425 d = gen_reg_rtx (V1DImode);
17426 tem = gen_mmx_lshrv1di3 (d, gen_lowpart (V1DImode, src),
17427 GEN_INT (i / 2));
17428 break;
17429 case E_V16QImode:
17430 case E_V8HImode:
17431 case E_V8HFmode:
17432 case E_V4SImode:
17433 case E_V2DImode:
17434 d = gen_reg_rtx (V1TImode);
17435 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
17436 GEN_INT (i / 2));
17437 break;
17438 case E_V8SFmode:
17439 if (i == 256)
17440 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
17441 else
17442 tem = gen_avx_shufps256 (dest, src, src,
17443 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
17444 break;
17445 case E_V4DFmode:
17446 if (i == 256)
17447 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
17448 else
17449 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
17450 break;
17451 case E_V32QImode:
17452 case E_V16HImode:
17453 case E_V16HFmode:
17454 case E_V8SImode:
17455 case E_V4DImode:
17456 if (i == 256)
17457 {
17458 if (GET_MODE (dest) != V4DImode)
17459 d = gen_reg_rtx (V4DImode);
17460 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
17461 gen_lowpart (V4DImode, src),
17462 const1_rtx);
17463 }
17464 else
17465 {
17466 d = gen_reg_rtx (V2TImode);
17467 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
17468 GEN_INT (i / 2));
17469 }
17470 break;
17471 case E_V64QImode:
17472 case E_V32HImode:
17473 case E_V32HFmode:
17474 if (i < 64)
17475 {
17476 d = gen_reg_rtx (V4TImode);
17477 tem = gen_avx512bw_lshrv4ti3 (d, gen_lowpart (V4TImode, src),
17478 GEN_INT (i / 2));
17479 break;
17480 }
17481 /* FALLTHRU */
17482 case E_V16SImode:
17483 case E_V16SFmode:
17484 case E_V8DImode:
17485 case E_V8DFmode:
17486 if (i > 128)
17487 tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
17488 gen_lowpart (V16SImode, src),
17489 gen_lowpart (V16SImode, src),
17490 GEN_INT (0x4 + (i == 512 ? 4 : 0)),
17491 GEN_INT (0x5 + (i == 512 ? 4 : 0)),
17492 GEN_INT (0x6 + (i == 512 ? 4 : 0)),
17493 GEN_INT (0x7 + (i == 512 ? 4 : 0)),
17494 GEN_INT (0xC), GEN_INT (0xD),
17495 GEN_INT (0xE), GEN_INT (0xF),
17496 GEN_INT (0x10), GEN_INT (0x11),
17497 GEN_INT (0x12), GEN_INT (0x13),
17498 GEN_INT (0x14), GEN_INT (0x15),
17499 GEN_INT (0x16), GEN_INT (0x17));
17500 else
17501 tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
17502 gen_lowpart (V16SImode, src),
17503 GEN_INT (i == 128 ? 0x2 : 0x1),
17504 GEN_INT (0x3),
17505 GEN_INT (0x3),
17506 GEN_INT (0x3),
17507 GEN_INT (i == 128 ? 0x6 : 0x5),
17508 GEN_INT (0x7),
17509 GEN_INT (0x7),
17510 GEN_INT (0x7),
17511 GEN_INT (i == 128 ? 0xA : 0x9),
17512 GEN_INT (0xB),
17513 GEN_INT (0xB),
17514 GEN_INT (0xB),
17515 GEN_INT (i == 128 ? 0xE : 0xD),
17516 GEN_INT (0xF),
17517 GEN_INT (0xF),
17518 GEN_INT (0xF));
17519 break;
17520 default:
17521 gcc_unreachable ();
17522 }
17523 emit_insn (tem);
17524 if (d != dest)
17525 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
17526 }
17527
17528 /* Expand a vector reduction. FN is the binary pattern to reduce;
17529 DEST is the destination; IN is the input vector. */
17530
17531 void
17532 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
17533 {
17534 rtx half, dst, vec = in;
17535 machine_mode mode = GET_MODE (in);
17536 int i;
17537
17538 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
17539 if (TARGET_SSE4_1
17540 && mode == V8HImode
17541 && fn == gen_uminv8hi3)
17542 {
17543 emit_insn (gen_sse4_1_phminposuw (dest, in));
17544 return;
17545 }
17546
17547 for (i = GET_MODE_BITSIZE (mode);
17548 i > GET_MODE_UNIT_BITSIZE (mode);
17549 i >>= 1)
17550 {
17551 half = gen_reg_rtx (mode);
17552 emit_reduc_half (half, vec, i);
17553 if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
17554 dst = dest;
17555 else
17556 dst = gen_reg_rtx (mode);
17557 emit_insn (fn (dst, half, vec));
17558 vec = dst;
17559 }
17560 }
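
/* A minimal C sketch of the reduction loop above for a 4 x int ADD
   reduction; names are only for illustration.  Each emit_reduc_half step
   folds the upper half of the remaining elements onto the lower half, and
   the result ends up in element 0:

     int
     reduc_add_sketch (const int v[4])
     {
       int h0 = v[0] + v[2];     // i == 128: combine with the upper 64 bits
       int h1 = v[1] + v[3];
       return h0 + h1;           // i == 64: combine with the upper 32 bits
     }  */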
17561
17562 /* Output code to perform a conditional jump to LABEL, if C2 flag in
17563 FP status register is set. */
17564
17565 void
17566 ix86_emit_fp_unordered_jump (rtx label)
17567 {
17568 rtx reg = gen_reg_rtx (HImode);
17569 rtx_insn *insn;
17570 rtx temp;
17571
17572 emit_insn (gen_x86_fnstsw_1 (reg));
17573
17574 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
17575 {
17576 emit_insn (gen_x86_sahf_1 (reg));
17577
17578 temp = gen_rtx_REG (CCmode, FLAGS_REG);
17579 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
17580 }
17581 else
17582 {
17583 emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
17584
17585 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
17586 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
17587 }
17588
17589 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
17590 gen_rtx_LABEL_REF (VOIDmode, label),
17591 pc_rtx);
17592 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
17593 predict_jump (REG_BR_PROB_BASE * 10 / 100);
17594 JUMP_LABEL (insn) = label;
17595 }
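
/* A minimal sketch of the non-SAHF path above, assuming a hypothetical
   fnstsw () helper that returns the x87 status word: the test checks the
   C2 condition bit (0x04 in the high byte of the status word):

     unsigned short sw = fnstsw ();
     if ((sw >> 8) & 0x04)       // C2 set
       goto label;
*/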
17596
17597 /* Output code to perform a sinh XFmode calculation. */
17598
17599 void
17600 ix86_emit_i387_sinh (rtx op0, rtx op1)
17601 {
17602 rtx e1 = gen_reg_rtx (XFmode);
17603 rtx e2 = gen_reg_rtx (XFmode);
17604 rtx scratch = gen_reg_rtx (HImode);
17605 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
17606 rtx half = const_double_from_real_value (dconsthalf, XFmode);
17607 rtx cst1, tmp;
17608 rtx_code_label *jump_label = gen_label_rtx ();
17609 rtx_insn *insn;
17610
17611 /* scratch = fxam (op1) */
17612 emit_insn (gen_fxamxf2_i387 (scratch, op1));
17613
17614 /* e1 = expm1 (|op1|) */
17615 emit_insn (gen_absxf2 (e2, op1));
17616 emit_insn (gen_expm1xf2 (e1, e2));
17617
17618 /* e2 = e1 / (e1 + 1.0) + e1 */
17619 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17620 emit_insn (gen_addxf3 (e2, e1, cst1));
17621 emit_insn (gen_divxf3 (e2, e1, e2));
17622 emit_insn (gen_addxf3 (e2, e2, e1));
17623
17624 /* flags = signbit (op1) */
17625 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
17626
17627 /* if (flags) then e2 = -e2 */
17628 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
17629 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
17630 gen_rtx_LABEL_REF (VOIDmode, jump_label),
17631 pc_rtx);
17632 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17633 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17634 JUMP_LABEL (insn) = jump_label;
17635
17636 emit_insn (gen_negxf2 (e2, e2));
17637
17638 emit_label (jump_label);
17639 LABEL_NUSES (jump_label) = 1;
17640
17641 /* op0 = 0.5 * e2 */
17642 half = force_reg (XFmode, half);
17643 emit_insn (gen_mulxf3 (op0, e2, half));
17644 }
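
/* A minimal C sketch of the sinh expansion above, assuming <math.h>;
   the helper name is only for illustration:

     long double
     sinh_sketch (long double x)
     {
       long double e1 = expm1l (fabsl (x));      // e1 = expm1 (|x|)
       long double e2 = e1 / (e1 + 1.0L) + e1;   // == exp (|x|) - exp (-|x|)
       return copysignl (0.5L * e2, x);          // restore the sign of x
     }  */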
17645
17646 /* Output code to perform a cosh XFmode calculation. */
17647
17648 void
17649 ix86_emit_i387_cosh (rtx op0, rtx op1)
17650 {
17651 rtx e1 = gen_reg_rtx (XFmode);
17652 rtx e2 = gen_reg_rtx (XFmode);
17653 rtx half = const_double_from_real_value (dconsthalf, XFmode);
17654 rtx cst1;
17655
17656 /* e1 = exp (op1) */
17657 emit_insn (gen_expxf2 (e1, op1));
17658
17659 /* e2 = e1 + 1.0 / e1 */
17660 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17661 emit_insn (gen_divxf3 (e2, cst1, e1));
17662 emit_insn (gen_addxf3 (e2, e1, e2));
17663
17664 /* op0 = 0.5 * e2 */
17665 half = force_reg (XFmode, half);
17666 emit_insn (gen_mulxf3 (op0, e2, half));
17667 }
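
/* A minimal C sketch of the cosh expansion above, assuming <math.h>;
   the helper name is only for illustration:

     long double
     cosh_sketch (long double x)
     {
       long double e1 = expl (x);
       return 0.5L * (e1 + 1.0L / e1);
     }  */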
17668
17669 /* Output code to perform a tanh XFmode calculation. */
17670
17671 void
17672 ix86_emit_i387_tanh (rtx op0, rtx op1)
17673 {
17674 rtx e1 = gen_reg_rtx (XFmode);
17675 rtx e2 = gen_reg_rtx (XFmode);
17676 rtx scratch = gen_reg_rtx (HImode);
17677 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
17678 rtx cst2, tmp;
17679 rtx_code_label *jump_label = gen_label_rtx ();
17680 rtx_insn *insn;
17681
17682 /* scratch = fxam (op1) */
17683 emit_insn (gen_fxamxf2_i387 (scratch, op1));
17684
17685 /* e1 = expm1 (-|2 * op1|) */
17686 emit_insn (gen_addxf3 (e2, op1, op1));
17687 emit_insn (gen_absxf2 (e2, e2));
17688 emit_insn (gen_negxf2 (e2, e2));
17689 emit_insn (gen_expm1xf2 (e1, e2));
17690
17691 /* e2 = e1 / (e1 + 2.0) */
17692 cst2 = force_reg (XFmode, CONST2_RTX (XFmode));
17693 emit_insn (gen_addxf3 (e2, e1, cst2));
17694 emit_insn (gen_divxf3 (e2, e1, e2));
17695
17696 /* flags = signbit (op1) */
17697 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
17698
17699 /* if (!flags) then e2 = -e2 */
17700 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
17701 gen_rtx_NE (VOIDmode, flags, const0_rtx),
17702 gen_rtx_LABEL_REF (VOIDmode, jump_label),
17703 pc_rtx);
17704 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17705 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17706 JUMP_LABEL (insn) = jump_label;
17707
17708 emit_insn (gen_negxf2 (e2, e2));
17709
17710 emit_label (jump_label);
17711 LABEL_NUSES (jump_label) = 1;
17712
17713 emit_move_insn (op0, e2);
17714 }
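
/* A minimal C sketch of the tanh expansion above, assuming <math.h>;
   the helper name is only for illustration:

     long double
     tanh_sketch (long double x)
     {
       long double e1 = expm1l (-fabsl (x + x));  // expm1 (-|2x|)
       long double e2 = e1 / (e1 + 2.0L);         // == -tanh (|x|)
       return copysignl (-e2, x);                 // tanh (|x|) with the sign of x
     }  */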
17715
17716 /* Output code to perform an asinh XFmode calculation. */
17717
17718 void
17719 ix86_emit_i387_asinh (rtx op0, rtx op1)
17720 {
17721 rtx e1 = gen_reg_rtx (XFmode);
17722 rtx e2 = gen_reg_rtx (XFmode);
17723 rtx scratch = gen_reg_rtx (HImode);
17724 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
17725 rtx cst1, tmp;
17726 rtx_code_label *jump_label = gen_label_rtx ();
17727 rtx_insn *insn;
17728
17729 /* e2 = sqrt (op1^2 + 1.0) + 1.0 */
17730 emit_insn (gen_mulxf3 (e1, op1, op1));
17731 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17732 emit_insn (gen_addxf3 (e2, e1, cst1));
17733 emit_insn (gen_sqrtxf2 (e2, e2));
17734 emit_insn (gen_addxf3 (e2, e2, cst1));
17735
17736 /* e1 = e1 / e2 */
17737 emit_insn (gen_divxf3 (e1, e1, e2));
17738
17739 /* scratch = fxam (op1) */
17740 emit_insn (gen_fxamxf2_i387 (scratch, op1));
17741
17742 /* e1 = e1 + |op1| */
17743 emit_insn (gen_absxf2 (e2, op1));
17744 emit_insn (gen_addxf3 (e1, e1, e2));
17745
17746 /* e2 = log1p (e1) */
17747 ix86_emit_i387_log1p (e2, e1);
17748
17749 /* flags = signbit (op1) */
17750 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
17751
17752 /* if (flags) then e2 = -e2 */
17753 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
17754 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
17755 gen_rtx_LABEL_REF (VOIDmode, jump_label),
17756 pc_rtx);
17757 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17758 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17759 JUMP_LABEL (insn) = jump_label;
17760
17761 emit_insn (gen_negxf2 (e2, e2));
17762
17763 emit_label (jump_label);
17764 LABEL_NUSES (jump_label) = 1;
17765
17766 emit_move_insn (op0, e2);
17767 }
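
/* A minimal C sketch of the asinh expansion above, assuming <math.h>;
   the helper name is only for illustration.  Note that
   x*x / (sqrt (x*x + 1) + 1) == sqrt (x*x + 1) - 1, so the log1p argument
   below equals |x| + sqrt (x*x + 1) - 1:

     long double
     asinh_sketch (long double x)
     {
       long double x2 = x * x;
       long double t = x2 / (sqrtl (x2 + 1.0L) + 1.0L);
       return copysignl (log1pl (t + fabsl (x)), x);
     }  */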
17768
17769 /* Output code to perform an acosh XFmode calculation. */
17770
17771 void
17772 ix86_emit_i387_acosh (rtx op0, rtx op1)
17773 {
17774 rtx e1 = gen_reg_rtx (XFmode);
17775 rtx e2 = gen_reg_rtx (XFmode);
17776 rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17777
17778 /* e2 = sqrt (op1 + 1.0) */
17779 emit_insn (gen_addxf3 (e2, op1, cst1));
17780 emit_insn (gen_sqrtxf2 (e2, e2));
17781
17782 /* e1 = sqrt (op1 - 1.0) */
17783 emit_insn (gen_subxf3 (e1, op1, cst1));
17784 emit_insn (gen_sqrtxf2 (e1, e1));
17785
17786 /* e1 = e1 * e2 */
17787 emit_insn (gen_mulxf3 (e1, e1, e2));
17788
17789 /* e1 = e1 + op1 */
17790 emit_insn (gen_addxf3 (e1, e1, op1));
17791
17792 /* op0 = log (e1) */
17793 emit_insn (gen_logxf2 (op0, e1));
17794 }
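
/* A minimal C sketch of the acosh expansion above, assuming <math.h>;
   the helper name is only for illustration:

     long double
     acosh_sketch (long double x)
     {
       return logl (x + sqrtl (x - 1.0L) * sqrtl (x + 1.0L));
     }  */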
17795
17796 /* Output code to perform an atanh XFmode calculation. */
17797
17798 void
17799 ix86_emit_i387_atanh (rtx op0, rtx op1)
17800 {
17801 rtx e1 = gen_reg_rtx (XFmode);
17802 rtx e2 = gen_reg_rtx (XFmode);
17803 rtx scratch = gen_reg_rtx (HImode);
17804 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
17805 rtx half = const_double_from_real_value (dconsthalf, XFmode);
17806 rtx cst1, tmp;
17807 rtx_code_label *jump_label = gen_label_rtx ();
17808 rtx_insn *insn;
17809
17810 /* scratch = fxam (op1) */
17811 emit_insn (gen_fxamxf2_i387 (scratch, op1));
17812
17813 /* e2 = |op1| */
17814 emit_insn (gen_absxf2 (e2, op1));
17815
17816 /* e1 = -(e2 + e2) / (e2 + 1.0) */
17817 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17818 emit_insn (gen_addxf3 (e1, e2, cst1));
17819 emit_insn (gen_addxf3 (e2, e2, e2));
17820 emit_insn (gen_negxf2 (e2, e2));
17821 emit_insn (gen_divxf3 (e1, e2, e1));
17822
17823 /* e2 = log1p (e1) */
17824 ix86_emit_i387_log1p (e2, e1);
17825
17826 /* flags = signbit (op1) */
17827 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
17828
17829 /* if (!flags) then e2 = -e2 */
17830 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
17831 gen_rtx_NE (VOIDmode, flags, const0_rtx),
17832 gen_rtx_LABEL_REF (VOIDmode, jump_label),
17833 pc_rtx);
17834 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17835 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17836 JUMP_LABEL (insn) = jump_label;
17837
17838 emit_insn (gen_negxf2 (e2, e2));
17839
17840 emit_label (jump_label);
17841 LABEL_NUSES (jump_label) = 1;
17842
17843 /* op0 = 0.5 * e2 */
17844 half = force_reg (XFmode, half);
17845 emit_insn (gen_mulxf3 (op0, e2, half));
17846 }
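
/* A minimal C sketch of the atanh expansion above, assuming <math.h>;
   the helper name is only for illustration:

     long double
     atanh_sketch (long double x)
     {
       long double ax = fabsl (x);
       long double t = log1pl (-(ax + ax) / (ax + 1.0L));  // == -2 * atanh (|x|)
       return copysignl (-0.5L * t, x);
     }  */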
17847
17848 /* Output code to perform a log1p XFmode calculation. */
17849
17850 void
17851 ix86_emit_i387_log1p (rtx op0, rtx op1)
17852 {
17853 rtx_code_label *label1 = gen_label_rtx ();
17854 rtx_code_label *label2 = gen_label_rtx ();
17855
17856 rtx tmp = gen_reg_rtx (XFmode);
17857 rtx res = gen_reg_rtx (XFmode);
17858 rtx cst, cstln2, cst1;
17859 rtx_insn *insn;
17860
17861 /* The emit_jump call emits any pending stack adjustment; make sure it is
17862 emitted before the conditional jump, otherwise the stack adjustment
17863 would only be applied conditionally. */
17864 do_pending_stack_adjust ();
17865
17866 cst = const_double_from_real_value
17867 (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode);
17868 cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */
17869
17870 emit_insn (gen_absxf2 (tmp, op1));
17871
17872 cst = force_reg (XFmode, cst);
17873 ix86_expand_branch (GE, tmp, cst, label1);
17874 predict_jump (REG_BR_PROB_BASE * 10 / 100);
17875 insn = get_last_insn ();
17876 JUMP_LABEL (insn) = label1;
17877
17878 emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2));
17879 emit_jump (label2);
17880
17881 emit_label (label1);
17882 LABEL_NUSES (label1) = 1;
17883
17884 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17885 emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1)));
17886 emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2));
17887
17888 emit_label (label2);
17889 LABEL_NUSES (label2) = 1;
17890
17891 emit_move_insn (op0, res);
17892 }
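
/* A minimal C sketch of the branch above, assuming <math.h>; the helper
   name is only for illustration.  fyl2xp1 is only accurate for small
   arguments, so larger inputs go through the plain log of 1 + x:

     long double
     log1p_sketch (long double x)
     {
       const long double thresh
         = 0.29289321881345247561810596348408353L;  // 1 - sqrt (1/2)
       if (fabsl (x) < thresh)
         return log1pl (x);       // fyl2xp1 path
       return logl (1.0L + x);    // fyl2x path on 1 + x
     }  */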
17893
17894 /* Emit code for round calculation. */
17895 void
17896 ix86_emit_i387_round (rtx op0, rtx op1)
17897 {
17898 machine_mode inmode = GET_MODE (op1);
17899 machine_mode outmode = GET_MODE (op0);
17900 rtx e1 = gen_reg_rtx (XFmode);
17901 rtx e2 = gen_reg_rtx (XFmode);
17902 rtx scratch = gen_reg_rtx (HImode);
17903 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
17904 rtx half = const_double_from_real_value (dconsthalf, XFmode);
17905 rtx res = gen_reg_rtx (outmode);
17906 rtx_code_label *jump_label = gen_label_rtx ();
17907 rtx (*floor_insn) (rtx, rtx);
17908 rtx (*neg_insn) (rtx, rtx);
17909 rtx_insn *insn;
17910 rtx tmp;
17911
17912 switch (inmode)
17913 {
17914 case E_SFmode:
17915 case E_DFmode:
17916 tmp = gen_reg_rtx (XFmode);
17917
17918 emit_insn (gen_rtx_SET (tmp, gen_rtx_FLOAT_EXTEND (XFmode, op1)));
17919 op1 = tmp;
17920 break;
17921 case E_XFmode:
17922 break;
17923 default:
17924 gcc_unreachable ();
17925 }
17926
17927 switch (outmode)
17928 {
17929 case E_SFmode:
17930 floor_insn = gen_frndintxf2_floor;
17931 neg_insn = gen_negsf2;
17932 break;
17933 case E_DFmode:
17934 floor_insn = gen_frndintxf2_floor;
17935 neg_insn = gen_negdf2;
17936 break;
17937 case E_XFmode:
17938 floor_insn = gen_frndintxf2_floor;
17939 neg_insn = gen_negxf2;
17940 break;
17941 case E_HImode:
17942 floor_insn = gen_lfloorxfhi2;
17943 neg_insn = gen_neghi2;
17944 break;
17945 case E_SImode:
17946 floor_insn = gen_lfloorxfsi2;
17947 neg_insn = gen_negsi2;
17948 break;
17949 case E_DImode:
17950 floor_insn = gen_lfloorxfdi2;
17951 neg_insn = gen_negdi2;
17952 break;
17953 default:
17954 gcc_unreachable ();
17955 }
17956
17957 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
17958
17959 /* scratch = fxam(op1) */
17960 emit_insn (gen_fxamxf2_i387 (scratch, op1));
17961
17962 /* e1 = fabs(op1) */
17963 emit_insn (gen_absxf2 (e1, op1));
17964
17965 /* e2 = e1 + 0.5 */
17966 half = force_reg (XFmode, half);
17967 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (XFmode, e1, half)));
17968
17969 /* res = floor(e2) */
17970 switch (outmode)
17971 {
17972 case E_SFmode:
17973 case E_DFmode:
17974 {
17975 tmp = gen_reg_rtx (XFmode);
17976
17977 emit_insn (floor_insn (tmp, e2));
17978 emit_insn (gen_rtx_SET (res,
17979 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp),
17980 UNSPEC_TRUNC_NOOP)));
17981 }
17982 break;
17983 default:
17984 emit_insn (floor_insn (res, e2));
17985 }
17986
17987 /* flags = signbit(a) */
17988 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
17989
17990 /* if (flags) then res = -res */
17991 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
17992 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
17993 gen_rtx_LABEL_REF (VOIDmode, jump_label),
17994 pc_rtx);
17995 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17996 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17997 JUMP_LABEL (insn) = jump_label;
17998
17999 emit_insn (neg_insn (res, res));
18000
18001 emit_label (jump_label);
18002 LABEL_NUSES (jump_label) = 1;
18003
18004 emit_move_insn (op0, res);
18005 }
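
/* A minimal C sketch of the formula used above, assuming <math.h>; the
   helper name is only for illustration:

     double
     round_sketch (double a)
     {
       double r = floor (fabs (a) + 0.5);   // floor (|a| + 0.5)
       return signbit (a) ? -r : r;         // sgn (a) * ...
     }  */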
18006
18007 /* Output code to perform a Newton-Raphson approximation of a single precision
18008 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
18009
18010 void
18011 ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
18012 {
18013 rtx x0, x1, e0, e1;
18014
18015 x0 = gen_reg_rtx (mode);
18016 e0 = gen_reg_rtx (mode);
18017 e1 = gen_reg_rtx (mode);
18018 x1 = gen_reg_rtx (mode);
18019
18020 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
18021
18022 b = force_reg (mode, b);
18023
18024 /* x0 = rcp(b) estimate */
18025 if (mode == V16SFmode || mode == V8DFmode)
18026 {
18027 if (TARGET_AVX512ER)
18028 {
18029 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
18030 UNSPEC_RCP28)));
18031 /* res = a * x0 */
18032 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
18033 return;
18034 }
18035 else
18036 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
18037 UNSPEC_RCP14)));
18038 }
18039 else
18040 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
18041 UNSPEC_RCP)));
18042
18043 /* e0 = x0 * b */
18044 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
18045
18046 /* e0 = x0 * e0 */
18047 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
18048
18049 /* e1 = x0 + x0 */
18050 emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
18051
18052 /* x1 = e1 - e0 */
18053 emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
18054
18055 /* res = a * x1 */
18056 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
18057 }
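
/* A minimal C sketch of the refinement above, assuming the caller supplies
   the hardware rcpps estimate; names are only for illustration:

     float
     swdiv_sketch (float a, float b, float x0)   // x0 = rcp (b) estimate
     {
       float e0 = x0 * b * x0;                   // b * rcp (b) * rcp (b)
       float x1 = (x0 + x0) - e0;                // one Newton-Raphson step
       return a * x1;
     }  */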
18058
18059 /* Output code to perform a Newton-Raphson approximation of a
18060 single precision floating point [reciprocal] square root. */
18061
18062 void
18063 ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
18064 {
18065 rtx x0, e0, e1, e2, e3, mthree, mhalf;
18066 REAL_VALUE_TYPE r;
18067 int unspec;
18068
18069 x0 = gen_reg_rtx (mode);
18070 e0 = gen_reg_rtx (mode);
18071 e1 = gen_reg_rtx (mode);
18072 e2 = gen_reg_rtx (mode);
18073 e3 = gen_reg_rtx (mode);
18074
18075 if (TARGET_AVX512ER && mode == V16SFmode)
18076 {
18077 if (recip)
18078 /* res = rsqrt28(a) estimate */
18079 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
18080 UNSPEC_RSQRT28)));
18081 else
18082 {
18083 /* x0 = rsqrt28(a) estimate */
18084 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
18085 UNSPEC_RSQRT28)));
18086 /* res = rcp28(x0) estimate */
18087 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
18088 UNSPEC_RCP28)));
18089 }
18090 return;
18091 }
18092
18093 real_from_integer (&r, VOIDmode, -3, SIGNED);
18094 mthree = const_double_from_real_value (r, SFmode);
18095
18096 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
18097 mhalf = const_double_from_real_value (r, SFmode);
18098 unspec = UNSPEC_RSQRT;
18099
18100 if (VECTOR_MODE_P (mode))
18101 {
18102 mthree = ix86_build_const_vector (mode, true, mthree);
18103 mhalf = ix86_build_const_vector (mode, true, mhalf);
18104 /* There is no 512-bit rsqrt. There is however rsqrt14. */
18105 if (GET_MODE_SIZE (mode) == 64)
18106 unspec = UNSPEC_RSQRT14;
18107 }
18108
18109 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
18110 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
18111
18112 a = force_reg (mode, a);
18113
18114 /* x0 = rsqrt(a) estimate */
18115 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
18116 unspec)));
18117
18118 /* If a == 0.0, filter out the infinite rsqrt estimate to prevent NaN for sqrt (0.0). */
18119 if (!recip)
18120 {
18121 rtx zero = force_reg (mode, CONST0_RTX(mode));
18122 rtx mask;
18123
18124 /* Handle masked compare. */
18125 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
18126 {
18127 mask = gen_reg_rtx (HImode);
18128 /* Imm value 0x4 corresponds to not-equal comparison. */
18129 emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
18130 emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
18131 }
18132 else
18133 {
18134 mask = gen_reg_rtx (mode);
18135 emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
18136 emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
18137 }
18138 }
18139
18140 mthree = force_reg (mode, mthree);
18141
18142 /* e0 = x0 * a */
18143 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
18144
18145 unsigned vector_size = GET_MODE_SIZE (mode);
18146 if (TARGET_FMA
18147 || (TARGET_AVX512F && vector_size == 64)
18148 || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
18149 emit_insn (gen_rtx_SET (e2,
18150 gen_rtx_FMA (mode, e0, x0, mthree)));
18151 else
18152 {
18153 /* e1 = e0 * x0 */
18154 emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
18155
18156 /* e2 = e1 - 3. */
18157 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
18158 }
18159
18160 mhalf = force_reg (mode, mhalf);
18161 if (recip)
18162 /* e3 = -.5 * x0 */
18163 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
18164 else
18165 /* e3 = -.5 * e0 */
18166 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
18167 /* ret = e2 * e3 */
18168 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
18169 }
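
/* A minimal C sketch of the refinement above, assuming the caller supplies
   the hardware rsqrtps estimate; names are only for illustration:

     float
     swsqrt_sketch (float a, float x0, int recip)  // x0 = rsqrt (a) estimate
     {
       float e0 = x0 * a;
       float e2 = e0 * x0 - 3.0f;                  // a * x0 * x0 - 3.0
       float e3 = -0.5f * (recip ? x0 : e0);
       return e2 * e3;
     }  */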
18170
18171 /* Expand fabs (OP0) and return a new rtx that holds the result. The
18172 mask for masking out the sign-bit is stored in *SMASK, if that is
18173 non-null. */
18174
18175 static rtx
18176 ix86_expand_sse_fabs (rtx op0, rtx *smask)
18177 {
18178 machine_mode vmode, mode = GET_MODE (op0);
18179 rtx xa, mask;
18180
18181 xa = gen_reg_rtx (mode);
18182 if (mode == SFmode)
18183 vmode = V4SFmode;
18184 else if (mode == DFmode)
18185 vmode = V2DFmode;
18186 else
18187 vmode = mode;
18188 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
18189 if (!VECTOR_MODE_P (mode))
18190 {
18191 /* We need to generate a scalar mode mask in this case. */
18192 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
18193 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
18194 mask = gen_reg_rtx (mode);
18195 emit_insn (gen_rtx_SET (mask, tmp));
18196 }
18197 emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
18198
18199 if (smask)
18200 *smask = mask;
18201
18202 return xa;
18203 }
18204
18205 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
18206 swapping the operands if SWAP_OPERANDS is true. The expanded
18207 code is a forward jump to a newly created label in case the
18208 comparison is true. The generated label rtx is returned. */
18209 static rtx_code_label *
18210 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
18211 bool swap_operands)
18212 {
18213 bool unordered_compare = ix86_unordered_fp_compare (code);
18214 rtx_code_label *label;
18215 rtx tmp, reg;
18216
18217 if (swap_operands)
18218 std::swap (op0, op1);
18219
18220 label = gen_label_rtx ();
18221 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
18222 if (unordered_compare)
18223 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
18224 reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
18225 emit_insn (gen_rtx_SET (reg, tmp));
18226 tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
18227 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
18228 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
18229 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
18230 JUMP_LABEL (tmp) = label;
18231
18232 return label;
18233 }
18234
18235 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
18236 using comparison code CODE. Operands are swapped for the comparison if
18237 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
18238 static rtx
18239 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
18240 bool swap_operands)
18241 {
18242 rtx (*insn)(rtx, rtx, rtx, rtx);
18243 machine_mode mode = GET_MODE (op0);
18244 rtx mask = gen_reg_rtx (mode);
18245
18246 if (swap_operands)
18247 std::swap (op0, op1);
18248
18249 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
18250
18251 emit_insn (insn (mask, op0, op1,
18252 gen_rtx_fmt_ee (code, mode, op0, op1)));
18253 return mask;
18254 }
18255
18256 /* Expand copysign from SIGN to the positive value ABS_VALUE
18257 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
18258 the sign-bit. */
18259
18260 static void
18261 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
18262 {
18263 machine_mode mode = GET_MODE (sign);
18264 rtx sgn = gen_reg_rtx (mode);
18265 if (mask == NULL_RTX)
18266 {
18267 machine_mode vmode;
18268
18269 if (mode == SFmode)
18270 vmode = V4SFmode;
18271 else if (mode == DFmode)
18272 vmode = V2DFmode;
18273 else
18274 vmode = mode;
18275
18276 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
18277 if (!VECTOR_MODE_P (mode))
18278 {
18279 /* We need to generate a scalar mode mask in this case. */
18280 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
18281 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
18282 mask = gen_reg_rtx (mode);
18283 emit_insn (gen_rtx_SET (mask, tmp));
18284 }
18285 }
18286 else
18287 mask = gen_rtx_NOT (mode, mask);
18288 emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
18289 emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
18290 }
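
/* A minimal C sketch of the mask logic above for scalar DFmode, assuming
   <stdint.h> and <string.h>; the helper name is only for illustration:

     double
     copysign_to_positive_sketch (double abs_value, double sign)
     {
       uint64_t a, s;
       memcpy (&a, &abs_value, sizeof a);
       memcpy (&s, &sign, sizeof s);
       a |= s & UINT64_C (0x8000000000000000);   // OR in the sign bit of SIGN
       memcpy (&abs_value, &a, sizeof a);
       return abs_value;
     }  */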
18291
18292 /* Expand SSE sequence for computing lround from OP1 storing
18293 into OP0. */
18294
18295 void
18296 ix86_expand_lround (rtx op0, rtx op1)
18297 {
18298 /* C code for the stuff we're doing below:
18299 tmp = op1 + copysign (nextafter (0.5, 0.0), op1);
18300 return (long)tmp;
18301 */
18302 machine_mode mode = GET_MODE (op1);
18303 const struct real_format *fmt;
18304 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
18305 rtx adj;
18306
18307 /* load nextafter (0.5, 0.0) */
18308 fmt = REAL_MODE_FORMAT (mode);
18309 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
18310 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
18311
18312 /* adj = copysign (0.5, op1) */
18313 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
18314 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
18315
18316 /* adj = op1 + adj */
18317 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
18318
18319 /* op0 = (imode)adj */
18320 expand_fix (op0, adj, 0);
18321 }
18322
18323 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1
18324 storing into OPERAND0. */
18325
18326 void
18327 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
18328 {
18329 /* C code for the stuff we're doing below (for do_floor):
18330 xi = (long)op1;
18331 xi -= (double)xi > op1 ? 1 : 0;
18332 return xi;
18333 */
18334 machine_mode fmode = GET_MODE (op1);
18335 machine_mode imode = GET_MODE (op0);
18336 rtx ireg, freg, tmp;
18337 rtx_code_label *label;
18338
18339 /* reg = (long)op1 */
18340 ireg = gen_reg_rtx (imode);
18341 expand_fix (ireg, op1, 0);
18342
18343 /* freg = (double)reg */
18344 freg = gen_reg_rtx (fmode);
18345 expand_float (freg, ireg, 0);
18346
18347 /* ireg = (freg > op1) ? ireg - 1 : ireg */
18348 label = ix86_expand_sse_compare_and_jump (UNLE,
18349 freg, op1, !do_floor);
18350 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
18351 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
18352 emit_move_insn (ireg, tmp);
18353
18354 emit_label (label);
18355 LABEL_NUSES (label) = 1;
18356
18357 emit_move_insn (op0, ireg);
18358 }
18359
18360 /* Generate and return a rtx of mode MODE for 2**n where n is the number
18361 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
18362
18363 static rtx
18364 ix86_gen_TWO52 (machine_mode mode)
18365 {
18366 const struct real_format *fmt;
18367 REAL_VALUE_TYPE TWO52r;
18368 rtx TWO52;
18369
18370 fmt = REAL_MODE_FORMAT (mode);
18371 real_2expN (&TWO52r, fmt->p - 1, mode);
18372 TWO52 = const_double_from_real_value (TWO52r, mode);
18373 TWO52 = force_reg (mode, TWO52);
18374
18375 return TWO52;
18376 }
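
/* Illustrative note: for DFmode and |x| < 2**52, adding and then subtracting
   TWO52 rounds x to an integer in the current rounding mode, because the sum
   has no bits left for the fraction.  A minimal example, assuming strict
   double evaluation:

     double x = 3.7;
     double t = (x + 0x1p52) - 0x1p52;   // t == 4.0 under round-to-nearest
*/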
18377
18378 /* Expand rint rounding OPERAND1 and storing the result in OPERAND0. */
18379
18380 void
18381 ix86_expand_rint (rtx operand0, rtx operand1)
18382 {
18383 /* C code for the stuff we're doing below:
18384 xa = fabs (operand1);
18385 if (!isless (xa, 2**52))
18386 return operand1;
18387 two52 = 2**52;
18388 if (flag_rounding_math)
18389 {
18390 two52 = copysign (two52, operand1);
18391 xa = operand1;
18392 }
18393 xa = xa + two52 - two52;
18394 return copysign (xa, operand1);
18395 */
18396 machine_mode mode = GET_MODE (operand0);
18397 rtx res, xa, TWO52, mask;
18398 rtx_code_label *label;
18399
18400 TWO52 = ix86_gen_TWO52 (mode);
18401
18402 /* Temporary for holding the result, initialized to the input
18403 operand to ease control flow. */
18404 res = copy_to_reg (operand1);
18405
18406 /* xa = abs (operand1) */
18407 xa = ix86_expand_sse_fabs (res, &mask);
18408
18409 /* if (!isless (xa, TWO52)) goto label; */
18410 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18411
18412 if (flag_rounding_math)
18413 {
18414 ix86_sse_copysign_to_positive (TWO52, TWO52, res, mask);
18415 xa = res;
18416 }
18417
18418 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
18419 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
18420
18421 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
18422 if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
18423 xa = ix86_expand_sse_fabs (xa, NULL);
18424
18425 ix86_sse_copysign_to_positive (res, xa, res, mask);
18426
18427 emit_label (label);
18428 LABEL_NUSES (label) = 1;
18429
18430 emit_move_insn (operand0, res);
18431 }
18432
18433 /* Expand SSE2 sequence for computing floor or ceil
18434 from OPERAND1 storing into OPERAND0. */
18435 void
18436 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
18437 {
18438 /* C code for the stuff we expand below.
18439 double xa = fabs (x), x2;
18440 if (!isless (xa, TWO52))
18441 return x;
18442 x2 = (double)(long)x;
18443
18444 Compensate. Floor:
18445 if (x2 > x)
18446 x2 -= 1;
18447 Compensate. Ceil:
18448 if (x2 < x)
18449 x2 += 1;
18450
18451 if (HONOR_SIGNED_ZEROS (mode))
18452 return copysign (x2, x);
18453 return x2;
18454 */
18455 machine_mode mode = GET_MODE (operand0);
18456 rtx xa, xi, TWO52, tmp, one, res, mask;
18457 rtx_code_label *label;
18458
18459 TWO52 = ix86_gen_TWO52 (mode);
18460
18461 /* Temporary for holding the result, initialized to the input
18462 operand to ease control flow. */
18463 res = copy_to_reg (operand1);
18464
18465 /* xa = abs (operand1) */
18466 xa = ix86_expand_sse_fabs (res, &mask);
18467
18468 /* if (!isless (xa, TWO52)) goto label; */
18469 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18470
18471 /* xa = (double)(long)x */
18472 xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
18473 expand_fix (xi, res, 0);
18474 expand_float (xa, xi, 0);
18475
18476 /* generate 1.0 */
18477 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
18478
18479 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
18480 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
18481 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
18482 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
18483 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
18484 if (HONOR_SIGNED_ZEROS (mode))
18485 {
18486 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
18487 if (do_floor && flag_rounding_math)
18488 tmp = ix86_expand_sse_fabs (tmp, NULL);
18489
18490 ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
18491 }
18492 emit_move_insn (res, tmp);
18493
18494 emit_label (label);
18495 LABEL_NUSES (label) = 1;
18496
18497 emit_move_insn (operand0, res);
18498 }
18499
18500 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
18501 into OPERAND0 without relying on DImode truncation via cvttsd2siq
18502 that is only available on 64bit targets. */
18503 void
18504 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
18505 {
18506 /* C code for the stuff we expand below.
18507 double xa = fabs (x), x2;
18508 if (!isless (xa, TWO52))
18509 return x;
18510 xa = xa + TWO52 - TWO52;
18511 x2 = copysign (xa, x);
18512
18513 Compensate. Floor:
18514 if (x2 > x)
18515 x2 -= 1;
18516 Compensate. Ceil:
18517 if (x2 < x)
18518 x2 += 1;
18519
18520 if (HONOR_SIGNED_ZEROS (mode))
18521 x2 = copysign (x2, x);
18522 return x2;
18523 */
18524 machine_mode mode = GET_MODE (operand0);
18525 rtx xa, TWO52, tmp, one, res, mask;
18526 rtx_code_label *label;
18527
18528 TWO52 = ix86_gen_TWO52 (mode);
18529
18530 /* Temporary for holding the result, initialized to the input
18531 operand to ease control flow. */
18532 res = copy_to_reg (operand1);
18533
18534 /* xa = abs (operand1) */
18535 xa = ix86_expand_sse_fabs (res, &mask);
18536
18537 /* if (!isless (xa, TWO52)) goto label; */
18538 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18539
18540 /* xa = xa + TWO52 - TWO52; */
18541 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
18542 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
18543
18544 /* xa = copysign (xa, operand1) */
18545 ix86_sse_copysign_to_positive (xa, xa, res, mask);
18546
18547 /* generate 1.0 */
18548 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
18549
18550 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
18551 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
18552 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
18553 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
18554 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
18555 if (HONOR_SIGNED_ZEROS (mode))
18556 {
18557 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
18558 if (do_floor && flag_rounding_math)
18559 tmp = ix86_expand_sse_fabs (tmp, NULL);
18560
18561 ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
18562 }
18563 emit_move_insn (res, tmp);
18564
18565 emit_label (label);
18566 LABEL_NUSES (label) = 1;
18567
18568 emit_move_insn (operand0, res);
18569 }
18570
18571 /* Expand SSE sequence for computing trunc
18572 from OPERAND1 storing into OPERAND0. */
18573 void
18574 ix86_expand_trunc (rtx operand0, rtx operand1)
18575 {
18576 /* C code for SSE variant we expand below.
18577 double xa = fabs (x), x2;
18578 if (!isless (xa, TWO52))
18579 return x;
18580 x2 = (double)(long)x;
18581 if (HONOR_SIGNED_ZEROS (mode))
18582 return copysign (x2, x);
18583 return x2;
18584 */
18585 machine_mode mode = GET_MODE (operand0);
18586 rtx xa, xi, TWO52, res, mask;
18587 rtx_code_label *label;
18588
18589 TWO52 = ix86_gen_TWO52 (mode);
18590
18591 /* Temporary for holding the result, initialized to the input
18592 operand to ease control flow. */
18593 res = copy_to_reg (operand1);
18594
18595 /* xa = abs (operand1) */
18596 xa = ix86_expand_sse_fabs (res, &mask);
18597
18598 /* if (!isless (xa, TWO52)) goto label; */
18599 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18600
18601 /* xa = (double)(long)x */
18602 xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
18603 expand_fix (xi, res, 0);
18604 expand_float (xa, xi, 0);
18605
18606 if (HONOR_SIGNED_ZEROS (mode))
18607 ix86_sse_copysign_to_positive (xa, xa, res, mask);
18608
18609 emit_move_insn (res, xa);
18610
18611 emit_label (label);
18612 LABEL_NUSES (label) = 1;
18613
18614 emit_move_insn (operand0, res);
18615 }
18616
18617 /* Expand SSE sequence for computing trunc from OPERAND1 storing
18618 into OPERAND0 without relying on DImode truncation via cvttsd2siq
18619 that is only available on 64bit targets. */
18620 void
18621 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
18622 {
18623 machine_mode mode = GET_MODE (operand0);
18624 rtx xa, xa2, TWO52, tmp, one, res, mask;
18625 rtx_code_label *label;
18626
18627 /* C code for SSE variant we expand below.
18628 double xa = fabs (x), x2;
18629 if (!isless (xa, TWO52))
18630 return x;
18631 xa2 = xa + TWO52 - TWO52;
18632 Compensate:
18633 if (xa2 > xa)
18634 xa2 -= 1.0;
18635 x2 = copysign (xa2, x);
18636 return x2;
18637 */
18638
18639 TWO52 = ix86_gen_TWO52 (mode);
18640
18641 /* Temporary for holding the result, initialized to the input
18642 operand to ease control flow. */
18643 res = copy_to_reg (operand1);
18644
18645 /* xa = abs (operand1) */
18646 xa = ix86_expand_sse_fabs (res, &mask);
18647
18648 /* if (!isless (xa, TWO52)) goto label; */
18649 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18650
18651 /* xa2 = xa + TWO52 - TWO52; */
18652 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
18653 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
18654
18655 /* generate 1.0 */
18656 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
18657
18658 /* Compensate: xa2 = xa2 - (xa2 > xa ? 1 : 0) */
18659 tmp = ix86_expand_sse_compare_mask (UNGT, xa2, xa, false);
18660 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
18661 tmp = expand_simple_binop (mode, MINUS,
18662 xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
18663 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
18664 if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
18665 tmp = ix86_expand_sse_fabs (tmp, NULL);
18666
18667 /* res = copysign (xa2, operand1) */
18668 ix86_sse_copysign_to_positive (res, tmp, res, mask);
18669
18670 emit_label (label);
18671 LABEL_NUSES (label) = 1;
18672
18673 emit_move_insn (operand0, res);
18674 }
18675
18676 /* Expand SSE sequence for computing round
18677 from OPERAND1 storing into OPERAND0. */
18678 void
18679 ix86_expand_round (rtx operand0, rtx operand1)
18680 {
18681 /* C code for the stuff we're doing below:
18682 double xa = fabs (x);
18683 if (!isless (xa, TWO52))
18684 return x;
18685 xa = (double)(long)(xa + nextafter (0.5, 0.0));
18686 return copysign (xa, x);
18687 */
18688 machine_mode mode = GET_MODE (operand0);
18689 rtx res, TWO52, xa, xi, half, mask;
18690 rtx_code_label *label;
18691 const struct real_format *fmt;
18692 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
18693
18694 /* Temporary for holding the result, initialized to the input
18695 operand to ease control flow. */
18696 res = copy_to_reg (operand1);
18697
18698 TWO52 = ix86_gen_TWO52 (mode);
18699 xa = ix86_expand_sse_fabs (res, &mask);
18700 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18701
18702 /* load nextafter (0.5, 0.0) */
18703 fmt = REAL_MODE_FORMAT (mode);
18704 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
18705 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
18706
18707 /* xa = xa + 0.5 */
18708 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
18709 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
18710
18711 /* xa = (double)(int64_t)xa */
18712 xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
18713 expand_fix (xi, xa, 0);
18714 expand_float (xa, xi, 0);
18715
18716 /* res = copysign (xa, operand1) */
18717 ix86_sse_copysign_to_positive (res, xa, res, mask);
18718
18719 emit_label (label);
18720 LABEL_NUSES (label) = 1;
18721
18722 emit_move_insn (operand0, res);
18723 }
18724
18725 /* Expand SSE sequence for computing round from OPERAND1 storing
18726 into OPERAND0 without relying on DImode truncation via cvttsd2siq
18727 that is only available on 64bit targets. */
18728 void
18729 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
18730 {
18731 /* C code for the stuff we expand below.
18732 double xa = fabs (x), xa2, x2;
18733 if (!isless (xa, TWO52))
18734 return x;
18735 Using the absolute value and copying back sign makes
18736 -0.0 -> -0.0 correct.
18737 xa2 = xa + TWO52 - TWO52;
18738 Compensate.
18739 dxa = xa2 - xa;
18740 if (dxa <= -0.5)
18741 xa2 += 1;
18742 else if (dxa > 0.5)
18743 xa2 -= 1;
18744 x2 = copysign (xa2, x);
18745 return x2;
18746 */
18747 machine_mode mode = GET_MODE (operand0);
18748 rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
18749 rtx_code_label *label;
18750
18751 TWO52 = ix86_gen_TWO52 (mode);
18752
18753 /* Temporary for holding the result, initialized to the input
18754 operand to ease control flow. */
18755 res = copy_to_reg (operand1);
18756
18757 /* xa = abs (operand1) */
18758 xa = ix86_expand_sse_fabs (res, &mask);
18759
18760 /* if (!isless (xa, TWO52)) goto label; */
18761 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18762
18763 /* xa2 = xa + TWO52 - TWO52; */
18764 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
18765 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
18766
18767 /* dxa = xa2 - xa; */
18768 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
18769
18770 /* generate 0.5, 1.0 and -0.5 */
18771 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
18772 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
18773 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
18774 0, OPTAB_DIRECT);
18775
18776 /* Compensate. */
18777 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
18778 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
18779 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
18780 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
18781 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
18782 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
18783 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
18784 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
18785
18786 /* res = copysign (xa2, operand1) */
18787 ix86_sse_copysign_to_positive (res, xa2, res, mask);
18788
18789 emit_label (label);
18790 LABEL_NUSES (label) = 1;
18791
18792 emit_move_insn (operand0, res);
18793 }
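
/* Illustration only: the compensation above handles the ties that the TWO52
   trick rounds to even.  E.g. for operand1 = -2.5 (so xa = 2.5),
   xa2 = 2.5 + 2^52 - 2^52 = 2.0 because round-to-nearest-even picks the even
   neighbour; dxa = xa2 - xa = -0.5, the (dxa <= -0.5) mask adds 1.0 back, and
   copysign produces -3.0, matching round (-2.5).  For xa = 2.4, dxa = -0.4
   and neither mask fires, leaving 2.0.  */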
18794
18795 /* Expand SSE sequence for computing round
18796 from OP1 storing into OP0 using sse4 round insn. */
18797 void
18798 ix86_expand_round_sse4 (rtx op0, rtx op1)
18799 {
18800 machine_mode mode = GET_MODE (op0);
18801 rtx e1, e2, res, half;
18802 const struct real_format *fmt;
18803 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
18804 rtx (*gen_copysign) (rtx, rtx, rtx);
18805 rtx (*gen_round) (rtx, rtx, rtx);
18806
18807 switch (mode)
18808 {
18809 case E_SFmode:
18810 gen_copysign = gen_copysignsf3;
18811 gen_round = gen_sse4_1_roundsf2;
18812 break;
18813 case E_DFmode:
18814 gen_copysign = gen_copysigndf3;
18815 gen_round = gen_sse4_1_rounddf2;
18816 break;
18817 default:
18818 gcc_unreachable ();
18819 }
18820
18821 /* round (a) = trunc (a + copysign (0.5, a)) */
18822
18823 /* load nextafter (0.5, 0.0) */
18824 fmt = REAL_MODE_FORMAT (mode);
18825 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
18826 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
18827 half = const_double_from_real_value (pred_half, mode);
18828
18829 /* e1 = copysign (0.5, op1) */
18830 e1 = gen_reg_rtx (mode);
18831 emit_insn (gen_copysign (e1, half, op1));
18832
18833 /* e2 = op1 + e1 */
18834 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
18835
18836 /* res = trunc (e2) */
18837 res = gen_reg_rtx (mode);
18838 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
18839
18840 emit_move_insn (op0, res);
18841 }
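
/* Illustration only, a worked instance of the identity used above,
   round (a) = trunc (a + copysign (nextafter (0.5, 0.0), a)): for a = -2.5
   the copysign gives -(0.5 - 2^-54), the addition rounds to -3.0 exactly and
   the ROUND_TRUNC insn leaves -3.0 = round (-2.5); for
   a = 0.49999999999999994 the sum stays below 1.0, so the result is 0.0
   rather than the 1.0 that a plain 0.5 would produce.  */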
18842
18843 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
18844 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
18845 insn every time. */
18846
18847 static GTY(()) rtx_insn *vselect_insn;
18848
18849 /* Initialize vselect_insn. */
18850
18851 static void
18852 init_vselect_insn (void)
18853 {
18854 unsigned i;
18855 rtx x;
18856
18857 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
18858 for (i = 0; i < MAX_VECT_LEN; ++i)
18859 XVECEXP (x, 0, i) = const0_rtx;
18860 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
18861 const0_rtx), x);
18862 x = gen_rtx_SET (const0_rtx, x);
18863 start_sequence ();
18864 vselect_insn = emit_insn (x);
18865 end_sequence ();
18866 }
18867
18868 /* Construct (set target (vec_select op0 (parallel perm))) and
18869 return true if that's a valid instruction in the active ISA. */
18870
18871 static bool
18872 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
18873 unsigned nelt, bool testing_p)
18874 {
18875 unsigned int i;
18876 rtx x, save_vconcat;
18877 int icode;
18878
18879 if (vselect_insn == NULL_RTX)
18880 init_vselect_insn ();
18881
18882 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
18883 PUT_NUM_ELEM (XVEC (x, 0), nelt);
18884 for (i = 0; i < nelt; ++i)
18885 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
18886 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
18887 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
18888 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
18889 SET_DEST (PATTERN (vselect_insn)) = target;
18890 icode = recog_memoized (vselect_insn);
18891
18892 if (icode >= 0 && !testing_p)
18893 emit_insn (copy_rtx (PATTERN (vselect_insn)));
18894
18895 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
18896 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
18897 INSN_CODE (vselect_insn) = -1;
18898
18899 return icode >= 0;
18900 }
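
/* Illustration only, an assumed call showing the pattern this helper builds:
   expand_vselect (target, op0, perm, 4, false) with V4SFmode TARGET/OP0 and
   perm = { 2, 3, 0, 1 } rewrites the cached insn into

     (set (reg:V4SF target)
          (vec_select:V4SF (reg:V4SF op0)
                           (parallel [(const_int 2) (const_int 3)
                                      (const_int 0) (const_int 1)])))

   and emits a copy of it only if recog_memoized accepts that pattern in the
   active ISA.  */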
18901
18902 /* Similar, but generate a vec_concat from op0 and op1 as well. */
18903
18904 static bool
18905 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
18906 const unsigned char *perm, unsigned nelt,
18907 bool testing_p)
18908 {
18909 machine_mode v2mode;
18910 rtx x;
18911 bool ok;
18912
18913 if (vselect_insn == NULL_RTX)
18914 init_vselect_insn ();
18915
18916 if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
18917 return false;
18918 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
18919 PUT_MODE (x, v2mode);
18920 XEXP (x, 0) = op0;
18921 XEXP (x, 1) = op1;
18922 ok = expand_vselect (target, x, perm, nelt, testing_p);
18923 XEXP (x, 0) = const0_rtx;
18924 XEXP (x, 1) = const0_rtx;
18925 return ok;
18926 }
18927
18928 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
18929 using movss or movsd. */
18930 static bool
18931 expand_vec_perm_movs (struct expand_vec_perm_d *d)
18932 {
18933 machine_mode vmode = d->vmode;
18934 unsigned i, nelt = d->nelt;
18935 rtx x;
18936
18937 if (d->one_operand_p)
18938 return false;
18939
18940 if (!(TARGET_SSE && (vmode == V4SFmode || vmode == V4SImode))
18941 && !(TARGET_MMX_WITH_SSE && (vmode == V2SFmode || vmode == V2SImode))
18942 && !(TARGET_SSE2 && (vmode == V2DFmode || vmode == V2DImode)))
18943 return false;
18944
18945 /* Only the first element is changed. */
18946 if (d->perm[0] != nelt && d->perm[0] != 0)
18947 return false;
18948 for (i = 1; i < nelt; ++i)
18949 if (d->perm[i] != i + nelt - d->perm[0])
18950 return false;
18951
18952 if (d->testing_p)
18953 return true;
18954
18955 if (d->perm[0] == nelt)
18956 x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
18957 else
18958 x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));
18959
18960 emit_insn (gen_rtx_SET (d->target, x));
18961
18962 return true;
18963 }
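
/* Illustration only: with V4SFmode (nelt = 4) the only selectors that survive
   the checks above are
     { 4, 1, 2, 3 } -> (vec_merge:V4SF op1 op0 (const_int 1)),
       element 0 taken from op1 and the rest from op0 (movss), and
     { 0, 5, 6, 7 } -> (vec_merge:V4SF op0 op1 (const_int 1)),
       the mirror case with element 0 taken from op0.  */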
18964
18965 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
18966 using insertps. */
18967 static bool
18968 expand_vec_perm_insertps (struct expand_vec_perm_d *d)
18969 {
18970 machine_mode vmode = d->vmode;
18971 unsigned i, cnt_s, nelt = d->nelt;
18972 int cnt_d = -1;
18973 rtx src, dst;
18974
18975 if (d->one_operand_p)
18976 return false;
18977
18978 if (!(TARGET_SSE4_1
18979 && (vmode == V4SFmode || vmode == V4SImode
18980 || (TARGET_MMX_WITH_SSE
18981 && (vmode == V2SFmode || vmode == V2SImode)))))
18982 return false;
18983
18984 for (i = 0; i < nelt; ++i)
18985 {
18986 if (d->perm[i] == i)
18987 continue;
18988 if (cnt_d != -1)
18989 {
18990 cnt_d = -1;
18991 break;
18992 }
18993 cnt_d = i;
18994 }
18995
18996 if (cnt_d == -1)
18997 {
18998 for (i = 0; i < nelt; ++i)
18999 {
19000 if (d->perm[i] == i + nelt)
19001 continue;
19002 if (cnt_d != -1)
19003 return false;
19004 cnt_d = i;
19005 }
19006
19007 if (cnt_d == -1)
19008 return false;
19009 }
19010
19011 if (d->testing_p)
19012 return true;
19013
19014 gcc_assert (cnt_d != -1);
19015
19016 cnt_s = d->perm[cnt_d];
19017 if (cnt_s < nelt)
19018 {
19019 src = d->op0;
19020 dst = d->op1;
19021 }
19022 else
19023 {
19024 cnt_s -= nelt;
19025 src = d->op1;
19026 dst = d->op0;
19027 }
19028 gcc_assert (cnt_s < nelt);
19029
19030 rtx x = gen_sse4_1_insertps (vmode, d->target, dst, src,
19031 GEN_INT (cnt_s << 6 | cnt_d << 4));
19032 emit_insn (x);
19033
19034 return true;
19035 }
19036
19037 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
19038 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
19039
19040 static bool
19041 expand_vec_perm_blend (struct expand_vec_perm_d *d)
19042 {
19043 machine_mode mmode, vmode = d->vmode;
19044 unsigned i, nelt = d->nelt;
19045 unsigned HOST_WIDE_INT mask;
19046 rtx target, op0, op1, maskop, x;
19047 rtx rperm[32], vperm;
19048
19049 if (d->one_operand_p)
19050 return false;
19051 if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
19052 && (TARGET_AVX512BW
19053 || GET_MODE_UNIT_SIZE (vmode) >= 4))
19054 ;
19055 else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
19056 ;
19057 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
19058 ;
19059 else if (TARGET_SSE4_1
19060 && (GET_MODE_SIZE (vmode) == 16
19061 || (TARGET_MMX_WITH_SSE && GET_MODE_SIZE (vmode) == 8)
19062 || GET_MODE_SIZE (vmode) == 4))
19063 ;
19064 else
19065 return false;
19066
19067 /* This is a blend, not a permute. Elements must stay in their
19068 respective lanes. */
19069 for (i = 0; i < nelt; ++i)
19070 {
19071 unsigned e = d->perm[i];
19072 if (!(e == i || e == i + nelt))
19073 return false;
19074 }
19075
19076 if (d->testing_p)
19077 return true;
19078
19079 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
19080 decision should be extracted elsewhere, so that we only try that
19081 sequence once all budget==3 options have been tried. */
19082 target = d->target;
19083 op0 = d->op0;
19084 op1 = d->op1;
19085 mask = 0;
19086
19087 switch (vmode)
19088 {
19089 case E_V8DFmode:
19090 case E_V16SFmode:
19091 case E_V4DFmode:
19092 case E_V8SFmode:
19093 case E_V2DFmode:
19094 case E_V4SFmode:
19095 case E_V2SFmode:
19096 case E_V2HImode:
19097 case E_V4HImode:
19098 case E_V8HImode:
19099 case E_V8SImode:
19100 case E_V32HImode:
19101 case E_V64QImode:
19102 case E_V16SImode:
19103 case E_V8DImode:
19104 for (i = 0; i < nelt; ++i)
19105 mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
19106 break;
19107
19108 case E_V2DImode:
19109 for (i = 0; i < 2; ++i)
19110 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
19111 vmode = V8HImode;
19112 goto do_subreg;
19113
19114 case E_V2SImode:
19115 for (i = 0; i < 2; ++i)
19116 mask |= (d->perm[i] >= 2 ? 3 : 0) << (i * 2);
19117 vmode = V4HImode;
19118 goto do_subreg;
19119
19120 case E_V4SImode:
19121 if (TARGET_AVX2)
19122 {
19123 /* Use vpblendd instead of vpblendw. */
19124 for (i = 0; i < nelt; ++i)
19125 mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
19126 break;
19127 }
19128 else
19129 {
19130 for (i = 0; i < 4; ++i)
19131 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
19132 vmode = V8HImode;
19133 goto do_subreg;
19134 }
19135
19136 case E_V16QImode:
19137 /* See if bytes move in pairs so we can use pblendw with
19138 an immediate argument, rather than pblendvb with a vector
19139 argument. */
19140 for (i = 0; i < 16; i += 2)
19141 if (d->perm[i] + 1 != d->perm[i + 1])
19142 {
19143 use_pblendvb:
19144 for (i = 0; i < nelt; ++i)
19145 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
19146
19147 finish_pblendvb:
19148 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
19149 vperm = force_reg (vmode, vperm);
19150
19151 if (GET_MODE_SIZE (vmode) == 4)
19152 emit_insn (gen_mmx_pblendvb_v4qi (target, op0, op1, vperm));
19153 else if (GET_MODE_SIZE (vmode) == 8)
19154 emit_insn (gen_mmx_pblendvb_v8qi (target, op0, op1, vperm));
19155 else if (GET_MODE_SIZE (vmode) == 16)
19156 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
19157 else
19158 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
19159 if (target != d->target)
19160 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
19161 return true;
19162 }
19163
19164 for (i = 0; i < 8; ++i)
19165 mask |= (d->perm[i * 2] >= 16) << i;
19166 vmode = V8HImode;
19167 /* FALLTHRU */
19168
19169 do_subreg:
19170 target = gen_reg_rtx (vmode);
19171 op0 = gen_lowpart (vmode, op0);
19172 op1 = gen_lowpart (vmode, op1);
19173 break;
19174
19175 case E_V8QImode:
19176 for (i = 0; i < 8; i += 2)
19177 if (d->perm[i] + 1 != d->perm[i + 1])
19178 goto use_pblendvb;
19179
19180 for (i = 0; i < 4; ++i)
19181 mask |= (d->perm[i * 2] >= 8) << i;
19182 vmode = V4HImode;
19183 goto do_subreg;
19184
19185 case E_V4QImode:
19186 for (i = 0; i < 4; i += 2)
19187 if (d->perm[i] + 1 != d->perm[i + 1])
19188 goto use_pblendvb;
19189
19190 for (i = 0; i < 2; ++i)
19191 mask |= (d->perm[i * 2] >= 4) << i;
19192 vmode = V2HImode;
19193 goto do_subreg;
19194
19195 case E_V32QImode:
19196 /* See if bytes move in pairs. If not, vpblendvb must be used. */
19197 for (i = 0; i < 32; i += 2)
19198 if (d->perm[i] + 1 != d->perm[i + 1])
19199 goto use_pblendvb;
19200 /* See if bytes move in quadruplets. If yes, vpblendd
19201 with immediate can be used. */
19202 for (i = 0; i < 32; i += 4)
19203 if (d->perm[i] + 2 != d->perm[i + 2])
19204 break;
19205 if (i < 32)
19206 {
19207 /* See if bytes move the same in both lanes. If yes,
19208 vpblendw with immediate can be used. */
19209 for (i = 0; i < 16; i += 2)
19210 if (d->perm[i] + 16 != d->perm[i + 16])
19211 goto use_pblendvb;
19212
19213 /* Use vpblendw. */
19214 for (i = 0; i < 16; ++i)
19215 mask |= (d->perm[i * 2] >= 32) << i;
19216 vmode = V16HImode;
19217 goto do_subreg;
19218 }
19219
19220 /* Use vpblendd. */
19221 for (i = 0; i < 8; ++i)
19222 mask |= (d->perm[i * 4] >= 32) << i;
19223 vmode = V8SImode;
19224 goto do_subreg;
19225
19226 case E_V16HImode:
19227 /* See if words move in pairs. If yes, vpblendd can be used. */
19228 for (i = 0; i < 16; i += 2)
19229 if (d->perm[i] + 1 != d->perm[i + 1])
19230 break;
19231 if (i < 16)
19232 {
19233 /* See if words move the same in both lanes. If not,
19234 vpblendvb must be used. */
19235 for (i = 0; i < 8; i++)
19236 if (d->perm[i] + 8 != d->perm[i + 8])
19237 {
19238 /* Use vpblendvb. */
19239 for (i = 0; i < 32; ++i)
19240 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
19241
19242 vmode = V32QImode;
19243 nelt = 32;
19244 target = gen_reg_rtx (vmode);
19245 op0 = gen_lowpart (vmode, op0);
19246 op1 = gen_lowpart (vmode, op1);
19247 goto finish_pblendvb;
19248 }
19249
19250 /* Use vpblendw. */
19251 for (i = 0; i < 16; ++i)
19252 mask |= (d->perm[i] >= 16) << i;
19253 break;
19254 }
19255
19256 /* Use vpblendd. */
19257 for (i = 0; i < 8; ++i)
19258 mask |= (d->perm[i * 2] >= 16) << i;
19259 vmode = V8SImode;
19260 goto do_subreg;
19261
19262 case E_V4DImode:
19263 /* Use vpblendd. */
19264 for (i = 0; i < 4; ++i)
19265 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
19266 vmode = V8SImode;
19267 goto do_subreg;
19268
19269 default:
19270 gcc_unreachable ();
19271 }
19272
19273 switch (vmode)
19274 {
19275 case E_V8DFmode:
19276 case E_V8DImode:
19277 mmode = QImode;
19278 break;
19279 case E_V16SFmode:
19280 case E_V16SImode:
19281 mmode = HImode;
19282 break;
19283 case E_V32HImode:
19284 mmode = SImode;
19285 break;
19286 case E_V64QImode:
19287 mmode = DImode;
19288 break;
19289 default:
19290 mmode = VOIDmode;
19291 }
19292
19293 if (mmode != VOIDmode)
19294 maskop = force_reg (mmode, gen_int_mode (mask, mmode));
19295 else
19296 maskop = GEN_INT (mask);
19297
19298 /* This matches five different patterns with the different modes. */
19299 x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
19300 x = gen_rtx_SET (target, x);
19301 emit_insn (x);
19302 if (target != d->target)
19303 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
19304
19305 return true;
19306 }
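
/* Worked example (illustration only): for V8HImode with
   d->perm = { 0, 9, 2, 11, 4, 13, 6, 15 } every odd element comes from op1,
   so the first switch above computes mask = 0xaa and the expansion is a
   single

     (set target (vec_merge:V8HI op1 op0 (const_int 0xaa)))

   which matches the SSE4.1 pblendw pattern.  */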
19307
19308 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
19309 in terms of the variable form of vpermilps.
19310
19311 Note that we will have already failed the immediate input vpermilps,
19312 which requires that the high and low part shuffle be identical; the
19313 variable form doesn't require that. */
19314
19315 static bool
19316 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
19317 {
19318 rtx rperm[8], vperm;
19319 unsigned i;
19320
19321 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
19322 return false;
19323
19324 /* We can only permute within the 128-bit lane. */
19325 for (i = 0; i < 8; ++i)
19326 {
19327 unsigned e = d->perm[i];
19328 if (i < 4 ? e >= 4 : e < 4)
19329 return false;
19330 }
19331
19332 if (d->testing_p)
19333 return true;
19334
19335 for (i = 0; i < 8; ++i)
19336 {
19337 unsigned e = d->perm[i];
19338
19339 /* Within each 128-bit lane, the elements of op0 are numbered
19340 from 0 and the elements of op1 are numbered from 4. */
19341 if (e >= 8 + 4)
19342 e -= 8;
19343 else if (e >= 4)
19344 e -= 4;
19345
19346 rperm[i] = GEN_INT (e);
19347 }
19348
19349 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
19350 vperm = force_reg (V8SImode, vperm);
19351 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
19352
19353 return true;
19354 }
19355
19356 /* For V*[QHS]Imode permutations, check whether the same permutation
19357 can be performed in a 2x, 4x or 8x wider inner mode (filling in *ND).  */
19358
19359 static bool
19360 canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
19361 struct expand_vec_perm_d *nd)
19362 {
19363 int i;
19364 machine_mode mode = VOIDmode;
19365
19366 switch (d->vmode)
19367 {
19368 case E_V8QImode: mode = V4HImode; break;
19369 case E_V16QImode: mode = V8HImode; break;
19370 case E_V32QImode: mode = V16HImode; break;
19371 case E_V64QImode: mode = V32HImode; break;
19372 case E_V4HImode: mode = V2SImode; break;
19373 case E_V8HImode: mode = V4SImode; break;
19374 case E_V16HImode: mode = V8SImode; break;
19375 case E_V32HImode: mode = V16SImode; break;
19376 case E_V4SImode: mode = V2DImode; break;
19377 case E_V8SImode: mode = V4DImode; break;
19378 case E_V16SImode: mode = V8DImode; break;
19379 default: return false;
19380 }
19381 for (i = 0; i < d->nelt; i += 2)
19382 if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
19383 return false;
19384 nd->vmode = mode;
19385 nd->nelt = d->nelt / 2;
19386 for (i = 0; i < nd->nelt; i++)
19387 nd->perm[i] = d->perm[2 * i] / 2;
19388 if (GET_MODE_INNER (mode) != DImode)
19389 canonicalize_vector_int_perm (nd, nd);
19390 if (nd != d)
19391 {
19392 nd->one_operand_p = d->one_operand_p;
19393 nd->testing_p = d->testing_p;
19394 if (d->op0 == d->op1)
19395 nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
19396 else
19397 {
19398 nd->op0 = gen_lowpart (nd->vmode, d->op0);
19399 nd->op1 = gen_lowpart (nd->vmode, d->op1);
19400 }
19401 if (d->testing_p)
19402 nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
19403 else
19404 nd->target = gen_reg_rtx (nd->vmode);
19405 }
19406 return true;
19407 }
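
/* Worked example (illustration only): the V16QImode selector
   { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 } moves bytes in
   even/odd pairs, so it is narrowed to the V8HImode selector
   { 1, 0, 3, 2, 5, 4, 7, 6 }; the recursive call cannot narrow further (the
   halfwords do not move in pairs), so *ND describes the V8HImode permutation
   for the caller to expand.  */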
19408
19409 /* Return true if permutation D can be performed as VMODE permutation
19410 instead. */
19411
19412 static bool
19413 valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
19414 {
19415 unsigned int i, j, chunk;
19416
19417 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
19418 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
19419 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
19420 return false;
19421
19422 if (GET_MODE_NUNITS (vmode) >= d->nelt)
19423 return true;
19424
19425 chunk = d->nelt / GET_MODE_NUNITS (vmode);
19426 for (i = 0; i < d->nelt; i += chunk)
19427 if (d->perm[i] & (chunk - 1))
19428 return false;
19429 else
19430 for (j = 1; j < chunk; ++j)
19431 if (d->perm[i] + j != d->perm[i + j])
19432 return false;
19433
19434 return true;
19435 }
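
/* Worked example (illustration only): asking whether a V32QImode permutation
   is really a V4DImode one gives chunk = 32 / 4 = 8, so every group of eight
   byte indices must start at a multiple of eight and be consecutive;
   { 8, 9, ..., 15, 0, 1, ..., 7, 24, ..., 31, 16, ..., 23 } passes, while any
   selector whose group starts at e.g. index 1 fails the (chunk - 1) test.  */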
19436
19437 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
19438 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
19439
19440 static bool
19441 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
19442 {
19443 unsigned i, nelt, eltsz, mask;
19444 unsigned char perm[64];
19445 machine_mode vmode;
19446 struct expand_vec_perm_d nd;
19447 rtx rperm[64], vperm, target, op0, op1;
19448
19449 nelt = d->nelt;
19450
19451 if (!d->one_operand_p)
19452 switch (GET_MODE_SIZE (d->vmode))
19453 {
19454 case 4:
19455 if (!TARGET_XOP)
19456 return false;
19457 vmode = V4QImode;
19458 break;
19459
19460 case 8:
19461 if (!TARGET_XOP)
19462 return false;
19463 vmode = V8QImode;
19464 break;
19465
19466 case 16:
19467 if (!TARGET_XOP)
19468 return false;
19469 vmode = V16QImode;
19470 break;
19471
19472 case 32:
19473 if (!TARGET_AVX2)
19474 return false;
19475
19476 if (valid_perm_using_mode_p (V2TImode, d))
19477 {
19478 if (d->testing_p)
19479 return true;
19480
19481 /* Use vperm2i128 insn. The pattern uses
19482 V4DImode instead of V2TImode. */
19483 target = d->target;
19484 if (d->vmode != V4DImode)
19485 target = gen_reg_rtx (V4DImode);
19486 op0 = gen_lowpart (V4DImode, d->op0);
19487 op1 = gen_lowpart (V4DImode, d->op1);
19488 rperm[0]
19489 = GEN_INT ((d->perm[0] / (nelt / 2))
19490 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
19491 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
19492 if (target != d->target)
19493 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
19494 return true;
19495 }
19496 /* FALLTHRU */
19497
19498 default:
19499 return false;
19500 }
19501 else
19502 switch (GET_MODE_SIZE (d->vmode))
19503 {
19504 case 4:
19505 if (!TARGET_SSSE3)
19506 return false;
19507 vmode = V4QImode;
19508 break;
19509
19510 case 8:
19511 if (!TARGET_SSSE3)
19512 return false;
19513 vmode = V8QImode;
19514 break;
19515
19516 case 16:
19517 if (!TARGET_SSSE3)
19518 return false;
19519 vmode = V16QImode;
19520 break;
19521
19522 case 32:
19523 if (!TARGET_AVX2)
19524 return false;
19525
19526 /* V4DImode should already have been handled through
19527 expand_vselect by the vpermq instruction. */
19528 gcc_assert (d->vmode != V4DImode);
19529
19530 vmode = V32QImode;
19531 if (d->vmode == V8SImode
19532 || d->vmode == V16HImode
19533 || d->vmode == V32QImode)
19534 {
19535 /* First see if vpermq can be used for
19536 V8SImode/V16HImode/V32QImode. */
19537 if (valid_perm_using_mode_p (V4DImode, d))
19538 {
19539 for (i = 0; i < 4; i++)
19540 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
19541 if (d->testing_p)
19542 return true;
19543 target = gen_reg_rtx (V4DImode);
19544 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
19545 perm, 4, false))
19546 {
19547 emit_move_insn (d->target,
19548 gen_lowpart (d->vmode, target));
19549 return true;
19550 }
19551 return false;
19552 }
19553
19554 /* Next see if vpermd can be used. */
19555 if (valid_perm_using_mode_p (V8SImode, d))
19556 vmode = V8SImode;
19557 }
19558 /* Or if vpermps can be used. */
19559 else if (d->vmode == V8SFmode)
19560 vmode = V8SImode;
19561
19562 if (vmode == V32QImode)
19563 {
19564 /* vpshufb only works within 128-bit lanes; it is not
19565 possible to shuffle bytes between the lanes. */
19566 for (i = 0; i < nelt; ++i)
19567 if ((d->perm[i] ^ i) & (nelt / 2))
19568 return false;
19569 }
19570 break;
19571
19572 case 64:
19573 if (!TARGET_AVX512BW)
19574 return false;
19575
19576 /* If vpermq didn't work, vpshufb won't work either. */
19577 if (d->vmode == V8DFmode || d->vmode == V8DImode)
19578 return false;
19579
19580 vmode = V64QImode;
19581 if (d->vmode == V16SImode
19582 || d->vmode == V32HImode
19583 || d->vmode == V64QImode)
19584 {
19585 /* First see if vpermq can be used for
19586 V16SImode/V32HImode/V64QImode. */
19587 if (valid_perm_using_mode_p (V8DImode, d))
19588 {
19589 for (i = 0; i < 8; i++)
19590 perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
19591 if (d->testing_p)
19592 return true;
19593 target = gen_reg_rtx (V8DImode);
19594 if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
19595 perm, 8, false))
19596 {
19597 emit_move_insn (d->target,
19598 gen_lowpart (d->vmode, target));
19599 return true;
19600 }
19601 return false;
19602 }
19603
19604 /* Next see if vpermd can be used. */
19605 if (valid_perm_using_mode_p (V16SImode, d))
19606 vmode = V16SImode;
19607 }
19608 /* Or if vpermps can be used. */
19609 else if (d->vmode == V16SFmode)
19610 vmode = V16SImode;
19611
19612 if (vmode == V64QImode)
19613 {
19614 /* vpshufb only works within 128-bit lanes; it is not
19615 possible to shuffle bytes between the lanes. */
19616 for (i = 0; i < nelt; ++i)
19617 if ((d->perm[i] ^ i) & (3 * nelt / 4))
19618 return false;
19619 }
19620 break;
19621
19622 default:
19623 return false;
19624 }
19625
19626 if (d->testing_p)
19627 return true;
19628
19629 /* Try to avoid variable permutation instruction. */
19630 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
19631 {
19632 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
19633 return true;
19634 }
19635
19636 if (vmode == V8SImode)
19637 for (i = 0; i < 8; ++i)
19638 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
19639 else if (vmode == V16SImode)
19640 for (i = 0; i < 16; ++i)
19641 rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
19642 else
19643 {
19644 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
19645 if (!d->one_operand_p)
19646 mask = 2 * nelt - 1;
19647 else if (vmode == V64QImode)
19648 mask = nelt / 4 - 1;
19649 else if (vmode == V32QImode)
19650 mask = nelt / 2 - 1;
19651 else
19652 mask = nelt - 1;
19653
19654 for (i = 0; i < nelt; ++i)
19655 {
19656 unsigned j, e = d->perm[i] & mask;
19657 for (j = 0; j < eltsz; ++j)
19658 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
19659 }
19660 }
19661
19662 machine_mode vpmode = vmode;
19663
19664 nelt = GET_MODE_SIZE (vmode);
19665
19666 /* Emulate narrow modes with V16QI instructions. */
19667 if (nelt < 16)
19668 {
19669 rtx m128 = GEN_INT (-128);
19670
19671 /* Remap elements from the second operand, as we have to
19672 account for inactive top elements from the first operand. */
19673 if (!d->one_operand_p)
19674 {
19675 for (i = 0; i < nelt; ++i)
19676 {
19677 unsigned ival = UINTVAL (rperm[i]);
19678 if (ival >= nelt)
19679 rperm[i] = GEN_INT (ival + 16 - nelt);
19680 }
19681 }
19682
19683 /* Fill inactive elements in the top positions with zeros. */
19684 for (i = nelt; i < 16; ++i)
19685 rperm[i] = m128;
19686
19687 vpmode = V16QImode;
19688 }
19689
19690 vperm = gen_rtx_CONST_VECTOR (vpmode,
19691 gen_rtvec_v (GET_MODE_NUNITS (vpmode), rperm));
19692 vperm = force_reg (vpmode, vperm);
19693
19694 if (vmode == d->vmode)
19695 target = d->target;
19696 else
19697 target = gen_reg_rtx (vmode);
19698
19699 op0 = gen_lowpart (vmode, d->op0);
19700
19701 if (d->one_operand_p)
19702 {
19703 rtx (*gen) (rtx, rtx, rtx);
19704
19705 if (vmode == V4QImode)
19706 gen = gen_mmx_pshufbv4qi3;
19707 else if (vmode == V8QImode)
19708 gen = gen_mmx_pshufbv8qi3;
19709 else if (vmode == V16QImode)
19710 gen = gen_ssse3_pshufbv16qi3;
19711 else if (vmode == V32QImode)
19712 gen = gen_avx2_pshufbv32qi3;
19713 else if (vmode == V64QImode)
19714 gen = gen_avx512bw_pshufbv64qi3;
19715 else if (vmode == V8SFmode)
19716 gen = gen_avx2_permvarv8sf;
19717 else if (vmode == V8SImode)
19718 gen = gen_avx2_permvarv8si;
19719 else if (vmode == V16SFmode)
19720 gen = gen_avx512f_permvarv16sf;
19721 else if (vmode == V16SImode)
19722 gen = gen_avx512f_permvarv16si;
19723 else
19724 gcc_unreachable ();
19725
19726 emit_insn (gen (target, op0, vperm));
19727 }
19728 else
19729 {
19730 rtx (*gen) (rtx, rtx, rtx, rtx);
19731
19732 op1 = gen_lowpart (vmode, d->op1);
19733
19734 if (vmode == V4QImode)
19735 gen = gen_mmx_ppermv32;
19736 else if (vmode == V8QImode)
19737 gen = gen_mmx_ppermv64;
19738 else if (vmode == V16QImode)
19739 gen = gen_xop_pperm;
19740 else
19741 gcc_unreachable ();
19742
19743 emit_insn (gen (target, op0, op1, vperm));
19744 }
19745
19746 if (target != d->target)
19747 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
19748
19749 return true;
19750 }
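
/* Worked example (illustration only): a one-operand V8HImode selector
   { 1, 0, 3, 2, 5, 4, 7, 6 } has eltsz = 2, so each halfword index is
   expanded into two byte indices and the V16QImode pshufb control vector
   becomes { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 }.  */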
19751
19752 /* Try to expand one-operand permutation with constant mask. */
19753
19754 static bool
19755 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
19756 {
19757 machine_mode mode = GET_MODE (d->op0);
19758 machine_mode maskmode = mode;
19759 unsigned inner_size = GET_MODE_SIZE (GET_MODE_INNER (mode));
19760 rtx (*gen) (rtx, rtx, rtx) = NULL;
19761 rtx target, op0, mask;
19762 rtx vec[64];
19763
19764 if (!rtx_equal_p (d->op0, d->op1))
19765 return false;
19766
19767 if (!TARGET_AVX512F)
19768 return false;
19769
19770 /* Accept VNxHImode and VNxQImode now. */
19771 if (!TARGET_AVX512VL && GET_MODE_SIZE (mode) < 64)
19772 return false;
19773
19774 /* vpermw. */
19775 if (!TARGET_AVX512BW && inner_size == 2)
19776 return false;
19777
19778 /* vpermb. */
19779 if (!TARGET_AVX512VBMI && inner_size == 1)
19780 return false;
19781
19782 switch (mode)
19783 {
19784 case E_V16SImode:
19785 gen = gen_avx512f_permvarv16si;
19786 break;
19787 case E_V16SFmode:
19788 gen = gen_avx512f_permvarv16sf;
19789 maskmode = V16SImode;
19790 break;
19791 case E_V8DImode:
19792 gen = gen_avx512f_permvarv8di;
19793 break;
19794 case E_V8DFmode:
19795 gen = gen_avx512f_permvarv8df;
19796 maskmode = V8DImode;
19797 break;
19798 case E_V32HImode:
19799 gen = gen_avx512bw_permvarv32hi;
19800 break;
19801 case E_V16HImode:
19802 gen = gen_avx512vl_permvarv16hi;
19803 break;
19804 case E_V8HImode:
19805 gen = gen_avx512vl_permvarv8hi;
19806 break;
19807 case E_V64QImode:
19808 gen = gen_avx512bw_permvarv64qi;
19809 break;
19810 case E_V32QImode:
19811 gen = gen_avx512vl_permvarv32qi;
19812 break;
19813 case E_V16QImode:
19814 gen = gen_avx512vl_permvarv16qi;
19815 break;
19816
19817 default:
19818 return false;
19819 }
19820
19821 if (d->testing_p)
19822 return true;
19823
19824 target = d->target;
19825 op0 = d->op0;
19826 for (int i = 0; i < d->nelt; ++i)
19827 vec[i] = GEN_INT (d->perm[i]);
19828 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
19829 emit_insn (gen (target, op0, force_reg (maskmode, mask)));
19830 return true;
19831 }
19832
19833 static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
19834
19835 /* A subroutine of ix86_expand_vec_perm_const_1. Try to instantiate D
19836 in a single instruction. */
19837
19838 static bool
19839 expand_vec_perm_1 (struct expand_vec_perm_d *d)
19840 {
19841 unsigned i, nelt = d->nelt;
19842 struct expand_vec_perm_d nd;
19843
19844 /* Check plain VEC_SELECT first, because AVX has instructions that could
19845 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
19846 input where SEL+CONCAT may not. */
19847 if (d->one_operand_p)
19848 {
19849 int mask = nelt - 1;
19850 bool identity_perm = true;
19851 bool broadcast_perm = true;
19852
19853 for (i = 0; i < nelt; i++)
19854 {
19855 nd.perm[i] = d->perm[i] & mask;
19856 if (nd.perm[i] != i)
19857 identity_perm = false;
19858 if (nd.perm[i])
19859 broadcast_perm = false;
19860 }
19861
19862 if (identity_perm)
19863 {
19864 if (!d->testing_p)
19865 emit_move_insn (d->target, d->op0);
19866 return true;
19867 }
19868 else if (broadcast_perm && TARGET_AVX2)
19869 {
19870 /* Use vpbroadcast{b,w,d}. */
19871 rtx (*gen) (rtx, rtx) = NULL;
19872 switch (d->vmode)
19873 {
19874 case E_V64QImode:
19875 if (TARGET_AVX512BW)
19876 gen = gen_avx512bw_vec_dupv64qi_1;
19877 break;
19878 case E_V32QImode:
19879 gen = gen_avx2_pbroadcastv32qi_1;
19880 break;
19881 case E_V32HImode:
19882 if (TARGET_AVX512BW)
19883 gen = gen_avx512bw_vec_dupv32hi_1;
19884 break;
19885 case E_V16HImode:
19886 gen = gen_avx2_pbroadcastv16hi_1;
19887 break;
19888 case E_V16SImode:
19889 if (TARGET_AVX512F)
19890 gen = gen_avx512f_vec_dupv16si_1;
19891 break;
19892 case E_V8SImode:
19893 gen = gen_avx2_pbroadcastv8si_1;
19894 break;
19895 case E_V16QImode:
19896 gen = gen_avx2_pbroadcastv16qi;
19897 break;
19898 case E_V8HImode:
19899 gen = gen_avx2_pbroadcastv8hi;
19900 break;
19901 case E_V16SFmode:
19902 if (TARGET_AVX512F)
19903 gen = gen_avx512f_vec_dupv16sf_1;
19904 break;
19905 case E_V8SFmode:
19906 gen = gen_avx2_vec_dupv8sf_1;
19907 break;
19908 case E_V8DFmode:
19909 if (TARGET_AVX512F)
19910 gen = gen_avx512f_vec_dupv8df_1;
19911 break;
19912 case E_V8DImode:
19913 if (TARGET_AVX512F)
19914 gen = gen_avx512f_vec_dupv8di_1;
19915 break;
19916 /* For other modes prefer other shuffles this function creates. */
19917 default: break;
19918 }
19919 if (gen != NULL)
19920 {
19921 if (!d->testing_p)
19922 emit_insn (gen (d->target, d->op0));
19923 return true;
19924 }
19925 }
19926
19927 if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
19928 return true;
19929
19930 /* There are plenty of patterns in sse.md that are written for
19931 SEL+CONCAT and are not replicated for a single op. Perhaps
19932 that should be changed, to avoid the nastiness here. */
19933
19934 /* Recognize interleave style patterns, which means incrementing
19935 every other permutation operand. */
19936 for (i = 0; i < nelt; i += 2)
19937 {
19938 nd.perm[i] = d->perm[i] & mask;
19939 nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
19940 }
19941 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
19942 d->testing_p))
19943 return true;
19944
19945 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
19946 if (nelt >= 4)
19947 {
19948 for (i = 0; i < nelt; i += 4)
19949 {
19950 nd.perm[i + 0] = d->perm[i + 0] & mask;
19951 nd.perm[i + 1] = d->perm[i + 1] & mask;
19952 nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
19953 nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
19954 }
19955
19956 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
19957 d->testing_p))
19958 return true;
19959 }
19960 }
19961
19962 /* Try the SSE4.1 blend variable merge instructions. */
19963 if (expand_vec_perm_blend (d))
19964 return true;
19965
19966 /* Try movss/movsd instructions. */
19967 if (expand_vec_perm_movs (d))
19968 return true;
19969
19970 /* Try the SSE4.1 insertps instruction. */
19971 if (expand_vec_perm_insertps (d))
19972 return true;
19973
19974 /* Try the fully general two operand permute. */
19975 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
19976 d->testing_p))
19977 return true;
19978
19979 /* Recognize interleave style patterns with reversed operands. */
19980 if (!d->one_operand_p)
19981 {
19982 for (i = 0; i < nelt; ++i)
19983 {
19984 unsigned e = d->perm[i];
19985 if (e >= nelt)
19986 e -= nelt;
19987 else
19988 e += nelt;
19989 nd.perm[i] = e;
19990 }
19991
19992 if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
19993 d->testing_p))
19994 return true;
19995 }
19996
19997 /* Try one of the AVX vpermil variable permutations. */
19998 if (expand_vec_perm_vpermil (d))
19999 return true;
20000
20001 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
20002 vpshufb, vpermd, vpermps or vpermq variable permutation. */
20003 if (expand_vec_perm_pshufb (d))
20004 return true;
20005
20006 /* Try the AVX2 vpalignr instruction. */
20007 if (expand_vec_perm_palignr (d, true))
20008 return true;
20009
20010 /* Try the AVX512F vperm{w,b,s,d} instructions.  */
20011 if (ix86_expand_vec_one_operand_perm_avx512 (d))
20012 return true;
20013
20014 /* Try the AVX512F vpermt2/vpermi2 instructions. */
20015 if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
20016 return true;
20017
20018 /* See if we can perform the same permutation in a different vector
20019 integer mode. */
20020 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
20021 {
20022 if (!d->testing_p)
20023 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
20024 return true;
20025 }
20026 return false;
20027 }
20028
20029 /* Canonicalize the vec_perm index so that the first index
20030 always comes from the first vector. */
20031 static void
20032 ix86_vec_perm_index_canon (struct expand_vec_perm_d *d)
20033 {
20034 unsigned nelt = d->nelt;
20035 if (d->perm[0] < nelt)
20036 return;
20037
20038 for (unsigned i = 0; i != nelt; i++)
20039 d->perm[i] = (d->perm[i] + nelt) % (2 * nelt);
20040
20041 std::swap (d->op0, d->op1);
20042 return;
20043 }
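
/* Worked example (illustration only): with nelt = 4 and
   d->perm = { 6, 1, 7, 4 } the first index selects from the second vector,
   so the selector is rewritten to { 2, 5, 3, 0 } and op0/op1 are swapped;
   the permutation is unchanged, but its first index now refers to the first
   operand.  */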
20044
20045 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
20046 in terms of a pair of shufps + shufps/pshufd instructions. */
20047 static bool
20048 expand_vec_perm_shufps_shufps (struct expand_vec_perm_d *d)
20049 {
20050 unsigned char perm1[4];
20051 machine_mode vmode = d->vmode;
20052 bool ok;
20053 unsigned i, j, k, count = 0;
20054
20055 if (d->one_operand_p
20056 || (vmode != V4SImode && vmode != V4SFmode))
20057 return false;
20058
20059 if (d->testing_p)
20060 return true;
20061
20062 ix86_vec_perm_index_canon (d);
20063 for (i = 0; i < 4; ++i)
20064 count += d->perm[i] > 3 ? 1 : 0;
20065
20066 gcc_assert (count & 3);
20067
20068 rtx tmp = gen_reg_rtx (vmode);
20069 /* 2 from op0 and 2 from op1. */
20070 if (count == 2)
20071 {
20072 unsigned char perm2[4];
20073 for (i = 0, j = 0, k = 2; i < 4; ++i)
20074 if (d->perm[i] & 4)
20075 {
20076 perm1[k++] = d->perm[i];
20077 perm2[i] = k - 1;
20078 }
20079 else
20080 {
20081 perm1[j++] = d->perm[i];
20082 perm2[i] = j - 1;
20083 }
20084
20085 /* shufps. */
20086 ok = expand_vselect_vconcat (tmp, d->op0, d->op1,
20087 perm1, d->nelt, false);
20088 gcc_assert (ok);
20089 if (vmode == V4SImode && TARGET_SSE2)
20090 /* pshufd. */
20091 ok = expand_vselect (d->target, tmp,
20092 perm2, d->nelt, false);
20093 else
20094 {
20095 /* shufps. */
20096 perm2[2] += 4;
20097 perm2[3] += 4;
20098 ok = expand_vselect_vconcat (d->target, tmp, tmp,
20099 perm2, d->nelt, false);
20100 }
20101 gcc_assert (ok);
20102 }
20103 /* 3 from one op and 1 from another. */
20104 else
20105 {
20106 unsigned pair_idx = 8, lone_idx = 8, shift;
20107
20108 /* Find the lone index. */
20109 for (i = 0; i < 4; ++i)
20110 if ((d->perm[i] > 3 && count == 1)
20111 || (d->perm[i] < 4 && count == 3))
20112 lone_idx = i;
20113
20114 /* When lone_idx is not 0, it must come from the second op (count == 1). */
20115 gcc_assert (count == (lone_idx ? 1 : 3));
20116
20117 /* Find the pair index that sits in the same half as the lone index. */
20118 shift = lone_idx & 2;
20119 pair_idx = 1 - lone_idx + 2 * shift;
20120
20121 /* First permute the lone index and the pair index into the same vector,
20122 as [ lone, lone, pair, pair ]. */
20123 perm1[1] = perm1[0]
20124 = (count == 3) ? d->perm[lone_idx] : d->perm[lone_idx] - 4;
20125 perm1[3] = perm1[2]
20126 = (count == 3) ? d->perm[pair_idx] : d->perm[pair_idx] + 4;
20127
20128 /* Always put the vector that contains the lone index first. */
20129 if (count == 1)
20130 std::swap (d->op0, d->op1);
20131
20132 /* shufps. */
20133 ok = expand_vselect_vconcat (tmp, d->op0, d->op1,
20134 perm1, d->nelt, false);
20135 gcc_assert (ok);
20136
20137 /* Refine lone and pair index to original order. */
20138 perm1[shift] = lone_idx << 1;
20139 perm1[shift + 1] = pair_idx << 1;
20140
20141 /* Select the remaining 2 elements in another vector. */
20142 for (i = 2 - shift; i < 4 - shift; ++i)
20143 perm1[i] = lone_idx == 1 ? d->perm[i] + 4 : d->perm[i];
20144
20145 /* Adjust to original selector. */
20146 if (lone_idx > 1)
20147 std::swap (tmp, d->op1);
20148
20149 /* shufps. */
20150 ok = expand_vselect_vconcat (d->target, tmp, d->op1,
20151 perm1, d->nelt, false);
20152
20153 gcc_assert (ok);
20154 }
20155
20156 return true;
20157 }
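
/* Worked example (illustration only) of the count == 2 path: for V4SImode
   (TARGET_SSE2) with d->perm = { 4, 1, 6, 3 } the loop builds
   perm1 = { 1, 3, 4, 6 } and perm2 = { 2, 0, 3, 1 }, so a shufps-style
   vec_concat select produces tmp = { op0[1], op0[3], op1[0], op1[2] } and the
   pshufd with perm2 reorders it to { op1[0], op0[1], op1[2], op0[3] }, i.e.
   exactly the requested selector.  */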
20158
20159 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
20160 in terms of a pair of pshuflw + pshufhw instructions. */
20161
20162 static bool
20163 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
20164 {
20165 unsigned char perm2[MAX_VECT_LEN];
20166 unsigned i;
20167 bool ok;
20168
20169 if (d->vmode != V8HImode || !d->one_operand_p)
20170 return false;
20171
20172 /* The two permutations only operate in 64-bit lanes. */
20173 for (i = 0; i < 4; ++i)
20174 if (d->perm[i] >= 4)
20175 return false;
20176 for (i = 4; i < 8; ++i)
20177 if (d->perm[i] < 4)
20178 return false;
20179
20180 if (d->testing_p)
20181 return true;
20182
20183 /* Emit the pshuflw. */
20184 memcpy (perm2, d->perm, 4);
20185 for (i = 4; i < 8; ++i)
20186 perm2[i] = i;
20187 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
20188 gcc_assert (ok);
20189
20190 /* Emit the pshufhw. */
20191 memcpy (perm2 + 4, d->perm + 4, 4);
20192 for (i = 0; i < 4; ++i)
20193 perm2[i] = i;
20194 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
20195 gcc_assert (ok);
20196
20197 return true;
20198 }
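
/* Worked example (illustration only): the V8HImode selector
   { 3, 1, 2, 0, 7, 5, 6, 4 } keeps its low four indices below 4 and its high
   four at 4 or above, so it is expanded as a pshuflw with
   { 3, 1, 2, 0, 4, 5, 6, 7 } followed by a pshufhw with
   { 0, 1, 2, 3, 7, 5, 6, 4 } on the same register.  */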
20199
20200 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
20201 the permutation using the SSSE3 palignr instruction. This succeeds
20202 when all of the elements in PERM fit within one vector and we merely
20203 need to shift them down so that a single vector permutation has a
20204 chance to succeed. If SINGLE_INSN_ONLY_P, succeed if only
20205 the vpalignr instruction itself can perform the requested permutation. */
20206
20207 static bool
20208 expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
20209 {
20210 unsigned i, nelt = d->nelt;
20211 unsigned min, max, minswap, maxswap;
20212 bool in_order, ok, swap = false;
20213 rtx shift, target;
20214 struct expand_vec_perm_d dcopy;
20215
20216 /* Even with AVX, palignr only operates on 128-bit vectors;
20217 with AVX2, palignr operates on both 128-bit lanes. */
20218 if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
20219 && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
20220 return false;
20221
20222 min = 2 * nelt;
20223 max = 0;
20224 minswap = 2 * nelt;
20225 maxswap = 0;
20226 for (i = 0; i < nelt; ++i)
20227 {
20228 unsigned e = d->perm[i];
20229 unsigned eswap = d->perm[i] ^ nelt;
20230 if (GET_MODE_SIZE (d->vmode) == 32)
20231 {
20232 e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
20233 eswap = e ^ (nelt / 2);
20234 }
20235 if (e < min)
20236 min = e;
20237 if (e > max)
20238 max = e;
20239 if (eswap < minswap)
20240 minswap = eswap;
20241 if (eswap > maxswap)
20242 maxswap = eswap;
20243 }
20244 if (min == 0
20245 || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
20246 {
20247 if (d->one_operand_p
20248 || minswap == 0
20249 || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
20250 ? nelt / 2 : nelt))
20251 return false;
20252 swap = true;
20253 min = minswap;
20254 max = maxswap;
20255 }
20256
20257 /* Given that we have SSSE3, we know we'll be able to implement the
20258 single operand permutation after the palignr with pshufb for
20259 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
20260 first. */
20261 if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
20262 return true;
20263
20264 dcopy = *d;
20265 if (swap)
20266 {
20267 dcopy.op0 = d->op1;
20268 dcopy.op1 = d->op0;
20269 for (i = 0; i < nelt; ++i)
20270 dcopy.perm[i] ^= nelt;
20271 }
20272
20273 in_order = true;
20274 for (i = 0; i < nelt; ++i)
20275 {
20276 unsigned e = dcopy.perm[i];
20277 if (GET_MODE_SIZE (d->vmode) == 32
20278 && e >= nelt
20279 && (e & (nelt / 2 - 1)) < min)
20280 e = e - min - (nelt / 2);
20281 else
20282 e = e - min;
20283 if (e != i)
20284 in_order = false;
20285 dcopy.perm[i] = e;
20286 }
20287 dcopy.one_operand_p = true;
20288
20289 if (single_insn_only_p && !in_order)
20290 return false;
20291
20292 /* For AVX2, test whether we can permute the result in one instruction. */
20293 if (d->testing_p)
20294 {
20295 if (in_order)
20296 return true;
20297 dcopy.op1 = dcopy.op0;
20298 return expand_vec_perm_1 (&dcopy);
20299 }
20300
20301 shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
20302 if (GET_MODE_SIZE (d->vmode) == 16)
20303 {
20304 target = gen_reg_rtx (V1TImode);
20305 emit_insn (gen_ssse3_palignrv1ti (target,
20306 gen_lowpart (V1TImode, dcopy.op1),
20307 gen_lowpart (V1TImode, dcopy.op0),
20308 shift));
20309 }
20310 else
20311 {
20312 target = gen_reg_rtx (V2TImode);
20313 emit_insn (gen_avx2_palignrv2ti (target,
20314 gen_lowpart (V2TImode, dcopy.op1),
20315 gen_lowpart (V2TImode, dcopy.op0),
20316 shift));
20317 }
20318
20319 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
20320
20321 /* Test for the degenerate case where the alignment by itself
20322 produces the desired permutation. */
20323 if (in_order)
20324 {
20325 emit_move_insn (d->target, dcopy.op0);
20326 return true;
20327 }
20328
20329 ok = expand_vec_perm_1 (&dcopy);
20330 gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
20331
20332 return ok;
20333 }
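
/* Worked example (illustration only): the two-operand V16QImode selector
   { 5, 6, ..., 20 } (sixteen consecutive byte indices starting at 5) has
   min = 5 and max = 20, so a single palignr of op1:op0 by 5 bytes
   (shift = 40 bits) already yields the requested bytes; in_order is then
   true and the degenerate path above returns without needing a second,
   single-operand permutation.  */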
20334
20335 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
20336 the permutation using the SSE4_1 pblendv instruction. Potentially
20337 reduces the permutation from 2 pshufb plus an or to 1 pshufb plus a pblendv. */
20338
20339 static bool
20340 expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
20341 {
20342 unsigned i, which, nelt = d->nelt;
20343 struct expand_vec_perm_d dcopy, dcopy1;
20344 machine_mode vmode = d->vmode;
20345 bool ok;
20346
20347 /* Use the same checks as in expand_vec_perm_blend. */
20348 if (d->one_operand_p)
20349 return false;
20350 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
20351 ;
20352 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
20353 ;
20354 else if (TARGET_SSE4_1
20355 && (GET_MODE_SIZE (vmode) == 16
20356 || (TARGET_MMX_WITH_SSE && GET_MODE_SIZE (vmode) == 8)
20357 || GET_MODE_SIZE (vmode) == 4))
20358 ;
20359 else
20360 return false;
20361
20362 /* Figure out which permutation elements do not stay in their
20363 respective lanes. */
20364 for (i = 0, which = 0; i < nelt; ++i)
20365 {
20366 unsigned e = d->perm[i];
20367 if (e != i)
20368 which |= (e < nelt ? 1 : 2);
20369 }
20370 /* We can pblend the part where elements do not stay in their
20371 respective lanes only when these elements all come from the
20372 same half of the permutation.
20373 {0 1 8 3 4 5 9 7} is ok: 8 and 9 are not at their respective
20374 lanes, but both are >= 8.
20375 {0 1 8 3 4 5 2 7} is not ok: 2 and 8 are not at their
20376 respective lanes, and 8 >= 8 but 2 is not. */
20377 if (which != 1 && which != 2)
20378 return false;
20379 if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
20380 return true;
20381
20382 /* First we apply one operand permutation to the part where
20383 elements stay not in their respective lanes. */
20384 dcopy = *d;
20385 if (which == 2)
20386 dcopy.op0 = dcopy.op1 = d->op1;
20387 else
20388 dcopy.op0 = dcopy.op1 = d->op0;
20389 if (!d->testing_p)
20390 dcopy.target = gen_reg_rtx (vmode);
20391 dcopy.one_operand_p = true;
20392
20393 for (i = 0; i < nelt; ++i)
20394 dcopy.perm[i] = d->perm[i] & (nelt - 1);
20395
20396 ok = expand_vec_perm_1 (&dcopy);
20397 if (GET_MODE_SIZE (vmode) != 16 && !ok)
20398 return false;
20399 else
20400 gcc_assert (ok);
20401 if (d->testing_p)
20402 return true;
20403
20404 /* Next we put permuted elements into their positions. */
20405 dcopy1 = *d;
20406 if (which == 2)
20407 dcopy1.op1 = dcopy.target;
20408 else
20409 dcopy1.op0 = dcopy.target;
20410
20411 for (i = 0; i < nelt; ++i)
20412 dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
20413
20414 ok = expand_vec_perm_blend (&dcopy1);
20415 gcc_assert (ok);
20416
20417 return true;
20418 }
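
/* Worked example (illustration only): for V8HImode with
   d->perm = { 0, 1, 10, 3, 4, 5, 11, 7 } the two misplaced elements both
   come from op1 (which == 2), so the first step shuffles op1 alone with
   { 0, 1, 2, 3, 4, 5, 3, 7 } (d->perm masked by nelt - 1) and the second
   step blends that result into op0 using { 0, 1, 10, 3, 4, 5, 14, 7 },
   typically one pshufb followed by one pblendw.  */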
20419
20420 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
20421
20422 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
20423 a two vector permutation into a single vector permutation by using
20424 an interleave operation to merge the vectors. */
20425
20426 static bool
20427 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
20428 {
20429 struct expand_vec_perm_d dremap, dfinal;
20430 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
20431 unsigned HOST_WIDE_INT contents;
20432 unsigned char remap[2 * MAX_VECT_LEN];
20433 rtx_insn *seq;
20434 bool ok, same_halves = false;
20435
20436 if (GET_MODE_SIZE (d->vmode) == 4
20437 || GET_MODE_SIZE (d->vmode) == 8
20438 || GET_MODE_SIZE (d->vmode) == 16)
20439 {
20440 if (d->one_operand_p)
20441 return false;
20442 }
20443 else if (GET_MODE_SIZE (d->vmode) == 32)
20444 {
20445 if (!TARGET_AVX)
20446 return false;
20447 /* For 32-byte modes allow even d->one_operand_p.
20448 The lack of cross-lane shuffling in some instructions
20449 might prevent a single insn shuffle. */
20450 dfinal = *d;
20451 dfinal.testing_p = true;
20452 /* If expand_vec_perm_interleave3 can expand this into
20453 a 3 insn sequence, give up and let it be expanded as
20454 a 3 insn sequence. While that is one insn longer,
20455 it doesn't need a memory operand, and in the common
20456 case where the interleave low and interleave high
20457 permutations with the same operands are adjacent, the
20458 pair needs only 4 insns after CSE. */
20459 if (expand_vec_perm_interleave3 (&dfinal))
20460 return false;
20461 }
20462 else
20463 return false;
20464
20465 /* Examine from whence the elements come. */
20466 contents = 0;
20467 for (i = 0; i < nelt; ++i)
20468 contents |= HOST_WIDE_INT_1U << d->perm[i];
20469
20470 memset (remap, 0xff, sizeof (remap));
20471 dremap = *d;
20472
20473 if (GET_MODE_SIZE (d->vmode) == 4
20474 || GET_MODE_SIZE (d->vmode) == 8)
20475 {
20476 unsigned HOST_WIDE_INT h1, h2, h3, h4;
20477
20478 /* Split the two input vectors into 4 halves. */
20479 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
20480 h2 = h1 << nelt2;
20481 h3 = h2 << nelt2;
20482 h4 = h3 << nelt2;
20483
20484 /* If all elements come from the low halves, use interleave low;
20485 similarly use interleave high for the high halves. */
20486 if ((contents & (h1 | h3)) == contents)
20487 {
20488 /* punpckl* */
20489 for (i = 0; i < nelt2; ++i)
20490 {
20491 remap[i] = i * 2;
20492 remap[i + nelt] = i * 2 + 1;
20493 dremap.perm[i * 2] = i;
20494 dremap.perm[i * 2 + 1] = i + nelt;
20495 }
20496 }
20497 else if ((contents & (h2 | h4)) == contents)
20498 {
20499 /* punpckh* */
20500 for (i = 0; i < nelt2; ++i)
20501 {
20502 remap[i + nelt2] = i * 2;
20503 remap[i + nelt + nelt2] = i * 2 + 1;
20504 dremap.perm[i * 2] = i + nelt2;
20505 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
20506 }
20507 }
20508 else
20509 return false;
20510 }
20511 else if (GET_MODE_SIZE (d->vmode) == 16)
20512 {
20513 unsigned HOST_WIDE_INT h1, h2, h3, h4;
20514
20515 /* Split the two input vectors into 4 halves. */
20516 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
20517 h2 = h1 << nelt2;
20518 h3 = h2 << nelt2;
20519 h4 = h3 << nelt2;
20520
20521 /* If all elements come from the low halves, use interleave low; similarly
20522 use interleave high for the high halves. If the elements come from
20523 mis-matched halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
20524 if ((contents & (h1 | h3)) == contents)
20525 {
20526 /* punpckl* */
20527 for (i = 0; i < nelt2; ++i)
20528 {
20529 remap[i] = i * 2;
20530 remap[i + nelt] = i * 2 + 1;
20531 dremap.perm[i * 2] = i;
20532 dremap.perm[i * 2 + 1] = i + nelt;
20533 }
20534 if (!TARGET_SSE2 && d->vmode == V4SImode)
20535 dremap.vmode = V4SFmode;
20536 }
20537 else if ((contents & (h2 | h4)) == contents)
20538 {
20539 /* punpckh* */
20540 for (i = 0; i < nelt2; ++i)
20541 {
20542 remap[i + nelt2] = i * 2;
20543 remap[i + nelt + nelt2] = i * 2 + 1;
20544 dremap.perm[i * 2] = i + nelt2;
20545 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
20546 }
20547 if (!TARGET_SSE2 && d->vmode == V4SImode)
20548 dremap.vmode = V4SFmode;
20549 }
20550 else if ((contents & (h1 | h4)) == contents)
20551 {
20552 /* shufps */
20553 for (i = 0; i < nelt2; ++i)
20554 {
20555 remap[i] = i;
20556 remap[i + nelt + nelt2] = i + nelt2;
20557 dremap.perm[i] = i;
20558 dremap.perm[i + nelt2] = i + nelt + nelt2;
20559 }
20560 if (nelt != 4)
20561 {
20562 /* shufpd */
20563 dremap.vmode = V2DImode;
20564 dremap.nelt = 2;
20565 dremap.perm[0] = 0;
20566 dremap.perm[1] = 3;
20567 }
20568 }
20569 else if ((contents & (h2 | h3)) == contents)
20570 {
20571 /* shufps */
20572 for (i = 0; i < nelt2; ++i)
20573 {
20574 remap[i + nelt2] = i;
20575 remap[i + nelt] = i + nelt2;
20576 dremap.perm[i] = i + nelt2;
20577 dremap.perm[i + nelt2] = i + nelt;
20578 }
20579 if (nelt != 4)
20580 {
20581 /* shufpd */
20582 dremap.vmode = V2DImode;
20583 dremap.nelt = 2;
20584 dremap.perm[0] = 1;
20585 dremap.perm[1] = 2;
20586 }
20587 }
20588 else
20589 return false;
20590 }
20591 else
20592 {
20593 unsigned int nelt4 = nelt / 4, nzcnt = 0;
20594 unsigned HOST_WIDE_INT q[8];
20595 unsigned int nonzero_halves[4];
20596
20597 /* Split the two input vectors into 8 quarters. */
20598 q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
20599 for (i = 1; i < 8; ++i)
20600 q[i] = q[0] << (nelt4 * i);
20601 for (i = 0; i < 4; ++i)
20602 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
20603 {
20604 nonzero_halves[nzcnt] = i;
20605 ++nzcnt;
20606 }
20607
20608 if (nzcnt == 1)
20609 {
20610 gcc_assert (d->one_operand_p);
20611 nonzero_halves[1] = nonzero_halves[0];
20612 same_halves = true;
20613 }
20614 else if (d->one_operand_p)
20615 {
20616 gcc_assert (nonzero_halves[0] == 0);
20617 gcc_assert (nonzero_halves[1] == 1);
20618 }
20619
20620 if (nzcnt <= 2)
20621 {
20622 if (d->perm[0] / nelt2 == nonzero_halves[1])
20623 {
20624 /* Attempt to increase the likelihood that dfinal
20625 shuffle will be intra-lane. */
20626 std::swap (nonzero_halves[0], nonzero_halves[1]);
20627 }
20628
20629 /* vperm2f128 or vperm2i128. */
20630 for (i = 0; i < nelt2; ++i)
20631 {
20632 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
20633 remap[i + nonzero_halves[0] * nelt2] = i;
20634 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
20635 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
20636 }
20637
20638 if (d->vmode != V8SFmode
20639 && d->vmode != V4DFmode
20640 && d->vmode != V8SImode)
20641 {
20642 dremap.vmode = V8SImode;
20643 dremap.nelt = 8;
20644 for (i = 0; i < 4; ++i)
20645 {
20646 dremap.perm[i] = i + nonzero_halves[0] * 4;
20647 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
20648 }
20649 }
20650 }
20651 else if (d->one_operand_p)
20652 return false;
20653 else if (TARGET_AVX2
20654 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
20655 {
20656 /* vpunpckl* */
20657 for (i = 0; i < nelt4; ++i)
20658 {
20659 remap[i] = i * 2;
20660 remap[i + nelt] = i * 2 + 1;
20661 remap[i + nelt2] = i * 2 + nelt2;
20662 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
20663 dremap.perm[i * 2] = i;
20664 dremap.perm[i * 2 + 1] = i + nelt;
20665 dremap.perm[i * 2 + nelt2] = i + nelt2;
20666 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
20667 }
20668 }
20669 else if (TARGET_AVX2
20670 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
20671 {
20672 /* vpunpckh* */
20673 for (i = 0; i < nelt4; ++i)
20674 {
20675 remap[i + nelt4] = i * 2;
20676 remap[i + nelt + nelt4] = i * 2 + 1;
20677 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
20678 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
20679 dremap.perm[i * 2] = i + nelt4;
20680 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
20681 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
20682 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
20683 }
20684 }
20685 else
20686 return false;
20687 }
20688
20689 /* Use the remapping array set up above to move the elements from their
20690 swizzled locations into their final destinations. */
20691 dfinal = *d;
20692 for (i = 0; i < nelt; ++i)
20693 {
20694 unsigned e = remap[d->perm[i]];
20695 gcc_assert (e < nelt);
20696 /* If same_halves is true, both halves of the remapped vector are the
20697 same. Avoid cross-lane accesses if possible. */
20698 if (same_halves && i >= nelt2)
20699 {
20700 gcc_assert (e < nelt2);
20701 dfinal.perm[i] = e + nelt2;
20702 }
20703 else
20704 dfinal.perm[i] = e;
20705 }
20706 if (!d->testing_p)
20707 {
20708 dremap.target = gen_reg_rtx (dremap.vmode);
20709 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
20710 }
20711 dfinal.op1 = dfinal.op0;
20712 dfinal.one_operand_p = true;
20713
20714 /* Test if the final remap can be done with a single insn. For V4SFmode or
20715 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
20716 start_sequence ();
20717 ok = expand_vec_perm_1 (&dfinal);
20718 seq = get_insns ();
20719 end_sequence ();
20720
20721 if (!ok)
20722 return false;
20723
20724 if (d->testing_p)
20725 return true;
20726
20727 if (dremap.vmode != dfinal.vmode)
20728 {
20729 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
20730 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
20731 }
20732
20733 ok = expand_vec_perm_1 (&dremap);
20734 gcc_assert (ok);
20735
20736 emit_insn (seq);
20737 return true;
20738 }
20739
20740 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
20741 a single vector cross-lane permutation into vpermq followed
20742 by any of the single insn permutations. */
20743
20744 static bool
20745 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
20746 {
20747 struct expand_vec_perm_d dremap, dfinal;
20748 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
20749 unsigned contents[2];
20750 bool ok;
20751
20752 if (!(TARGET_AVX2
20753 && (d->vmode == V32QImode || d->vmode == V16HImode)
20754 && d->one_operand_p))
20755 return false;
20756
20757 contents[0] = 0;
20758 contents[1] = 0;
20759 for (i = 0; i < nelt2; ++i)
20760 {
20761 contents[0] |= 1u << (d->perm[i] / nelt4);
20762 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
20763 }
20764
20765 for (i = 0; i < 2; ++i)
20766 {
20767 unsigned int cnt = 0;
20768 for (j = 0; j < 4; ++j)
20769 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
20770 return false;
20771 }
20772
20773 if (d->testing_p)
20774 return true;
20775
20776 dremap = *d;
20777 dremap.vmode = V4DImode;
20778 dremap.nelt = 4;
20779 dremap.target = gen_reg_rtx (V4DImode);
20780 dremap.op0 = gen_lowpart (V4DImode, d->op0);
20781 dremap.op1 = dremap.op0;
20782 dremap.one_operand_p = true;
20783 for (i = 0; i < 2; ++i)
20784 {
20785 unsigned int cnt = 0;
20786 for (j = 0; j < 4; ++j)
20787 if ((contents[i] & (1u << j)) != 0)
20788 dremap.perm[2 * i + cnt++] = j;
20789 for (; cnt < 2; ++cnt)
20790 dremap.perm[2 * i + cnt] = 0;
20791 }
20792
20793 dfinal = *d;
20794 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
20795 dfinal.op1 = dfinal.op0;
20796 dfinal.one_operand_p = true;
20797 for (i = 0, j = 0; i < nelt; ++i)
20798 {
20799 if (i == nelt2)
20800 j = 2;
20801 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
20802 if ((d->perm[i] / nelt4) == dremap.perm[j])
20803 ;
20804 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
20805 dfinal.perm[i] |= nelt4;
20806 else
20807 gcc_unreachable ();
20808 }
20809
20810 ok = expand_vec_perm_1 (&dremap);
20811 gcc_assert (ok);
20812
20813 ok = expand_vec_perm_1 (&dfinal);
20814 gcc_assert (ok);
20815
20816 return true;
20817 }
20818
20819 static bool canonicalize_perm (struct expand_vec_perm_d *d);
20820
20821 /* A subroutine of ix86_expand_vec_perm_const_1. Try to expand
20822 a vector permutation using two instructions, vperm2f128 or
20823 vperm2i128, followed by any single in-lane permutation. */
20824
20825 static bool
20826 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
20827 {
20828 struct expand_vec_perm_d dfirst, dsecond;
20829 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
20830 bool ok;
20831
20832 if (!TARGET_AVX
20833 || GET_MODE_SIZE (d->vmode) != 32
20834 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
20835 return false;
20836
20837 dsecond = *d;
20838 dsecond.one_operand_p = false;
20839 dsecond.testing_p = true;
20840
20841 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
20842 immediate. For perm < 16 the second permutation uses
20843 d->op0 as first operand, for perm >= 16 it uses d->op1
20844 as first operand. The second operand is the result of
20845 vperm2[fi]128. */
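/* E.g. for V4DFmode operands 0123 and 4567, perm == 0x6 selects the low
   lane of d->op1 ("45") for the low lane of the result and the high lane
   of d->op0 ("23") for its high lane, giving the vperm2[fi]128 immediate
   ((0x6 << 2) | 0x6) & 0x33 == 0x12.  */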
20846 for (perm = 0; perm < 32; perm++)
20847 {
20848 /* Ignore permutations which do not move anything cross-lane. */
20849 if (perm < 16)
20850 {
20851 /* The second shuffle for e.g. V4DFmode has
20852 0123 and ABCD operands.
20853 Ignore AB23, as 23 is already in the second lane
20854 of the first operand. */
20855 if ((perm & 0xc) == (1 << 2)) continue;
20856 /* And 01CD, as 01 is in the first lane of the first
20857 operand. */
20858 if ((perm & 3) == 0) continue;
20859 /* And 4567, as then the vperm2[fi]128 doesn't change
20860 anything on the original 4567 second operand. */
20861 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
20862 }
20863 else
20864 {
20865 /* The second shuffle for e.g. V4DFmode has
20866 4567 and ABCD operands.
20867 Ignore AB67, as 67 is already in the second lane
20868 of the first operand. */
20869 if ((perm & 0xc) == (3 << 2)) continue;
20870 /* And 45CD, as 45 is in the first lane of the first
20871 operand. */
20872 if ((perm & 3) == 2) continue;
20873 /* And 0123, as then the vperm2[fi]128 doesn't change
20874 anything on the original 0123 first operand. */
20875 if ((perm & 0xf) == (1 << 2)) continue;
20876 }
20877
20878 for (i = 0; i < nelt; i++)
20879 {
20880 j = d->perm[i] / nelt2;
20881 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
20882 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
20883 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
20884 dsecond.perm[i] = d->perm[i] & (nelt - 1);
20885 else
20886 break;
20887 }
20888
20889 if (i == nelt)
20890 {
20891 start_sequence ();
20892 ok = expand_vec_perm_1 (&dsecond);
20893 end_sequence ();
20894 }
20895 else
20896 ok = false;
20897
20898 if (ok)
20899 {
20900 if (d->testing_p)
20901 return true;
20902
20903 /* Found a usable second shuffle. dfirst will be
20904 vperm2f128 on d->op0 and d->op1. */
20905 dsecond.testing_p = false;
20906 dfirst = *d;
20907 dfirst.target = gen_reg_rtx (d->vmode);
20908 for (i = 0; i < nelt; i++)
20909 dfirst.perm[i] = (i & (nelt2 - 1))
20910 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
20911
20912 canonicalize_perm (&dfirst);
20913 ok = expand_vec_perm_1 (&dfirst);
20914 gcc_assert (ok);
20915
20916 /* And dsecond is some single insn shuffle, taking
20917 d->op0 and result of vperm2f128 (if perm < 16) or
20918 d->op1 and result of vperm2f128 (otherwise). */
20919 if (perm >= 16)
20920 dsecond.op0 = dsecond.op1;
20921 dsecond.op1 = dfirst.target;
20922
20923 ok = expand_vec_perm_1 (&dsecond);
20924 gcc_assert (ok);
20925
20926 return true;
20927 }
20928
20929 /* For one operand, the only useful vperm2f128 permutation is 0x01
20930 aka lanes swap. */
20931 if (d->one_operand_p)
20932 return false;
20933 }
20934
20935 return false;
20936 }
20937
20938 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
20939 a two vector permutation using 2 intra-lane interleave insns
20940 and cross-lane shuffle for 32-byte vectors. */
20941
20942 static bool
20943 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
20944 {
20945 unsigned i, nelt;
20946 rtx (*gen) (rtx, rtx, rtx);
20947
20948 if (d->one_operand_p)
20949 return false;
20950 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
20951 ;
20952 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
20953 ;
20954 else
20955 return false;
20956
20957 nelt = d->nelt;
20958 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
20959 return false;
20960 for (i = 0; i < nelt; i += 2)
20961 if (d->perm[i] != d->perm[0] + i / 2
20962 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
20963 return false;
20964
20965 if (d->testing_p)
20966 return true;
20967
20968 switch (d->vmode)
20969 {
20970 case E_V32QImode:
20971 if (d->perm[0])
20972 gen = gen_vec_interleave_highv32qi;
20973 else
20974 gen = gen_vec_interleave_lowv32qi;
20975 break;
20976 case E_V16HImode:
20977 if (d->perm[0])
20978 gen = gen_vec_interleave_highv16hi;
20979 else
20980 gen = gen_vec_interleave_lowv16hi;
20981 break;
20982 case E_V8SImode:
20983 if (d->perm[0])
20984 gen = gen_vec_interleave_highv8si;
20985 else
20986 gen = gen_vec_interleave_lowv8si;
20987 break;
20988 case E_V4DImode:
20989 if (d->perm[0])
20990 gen = gen_vec_interleave_highv4di;
20991 else
20992 gen = gen_vec_interleave_lowv4di;
20993 break;
20994 case E_V8SFmode:
20995 if (d->perm[0])
20996 gen = gen_vec_interleave_highv8sf;
20997 else
20998 gen = gen_vec_interleave_lowv8sf;
20999 break;
21000 case E_V4DFmode:
21001 if (d->perm[0])
21002 gen = gen_vec_interleave_highv4df;
21003 else
21004 gen = gen_vec_interleave_lowv4df;
21005 break;
21006 default:
21007 gcc_unreachable ();
21008 }
21009
21010 emit_insn (gen (d->target, d->op0, d->op1));
21011 return true;
21012 }
21013
21014 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
21015 a single vector permutation using a single intra-lane vector
21016 permutation, vperm2f128 swapping the lanes and vblend* insn blending
21017 the non-swapped and swapped vectors together. */
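/* E.g. for a one-operand V4DFmode permutation { 2, 1, 0, 3 } the
   intra-lane shuffle is the identity, the lane swap yields { 2, 3, 0, 1 }
   and a blend with mask 0b0101 takes the swapped copy in positions 0 and
   2, producing { 2, 1, 0, 3 }.  */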
21018
21019 static bool
21020 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
21021 {
21022 struct expand_vec_perm_d dfirst, dsecond;
21023 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
21024 rtx_insn *seq;
21025 bool ok;
21026 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
21027
21028 if (!TARGET_AVX
21029 || TARGET_AVX2
21030 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
21031 || !d->one_operand_p)
21032 return false;
21033
21034 dfirst = *d;
21035 for (i = 0; i < nelt; i++)
21036 dfirst.perm[i] = 0xff;
21037 for (i = 0, msk = 0; i < nelt; i++)
21038 {
21039 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
21040 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
21041 return false;
21042 dfirst.perm[j] = d->perm[i];
21043 if (j != i)
21044 msk |= (1 << i);
21045 }
21046 for (i = 0; i < nelt; i++)
21047 if (dfirst.perm[i] == 0xff)
21048 dfirst.perm[i] = i;
21049
21050 if (!d->testing_p)
21051 dfirst.target = gen_reg_rtx (dfirst.vmode);
21052
21053 start_sequence ();
21054 ok = expand_vec_perm_1 (&dfirst);
21055 seq = get_insns ();
21056 end_sequence ();
21057
21058 if (!ok)
21059 return false;
21060
21061 if (d->testing_p)
21062 return true;
21063
21064 emit_insn (seq);
21065
21066 dsecond = *d;
21067 dsecond.op0 = dfirst.target;
21068 dsecond.op1 = dfirst.target;
21069 dsecond.one_operand_p = true;
21070 dsecond.target = gen_reg_rtx (dsecond.vmode);
21071 for (i = 0; i < nelt; i++)
21072 dsecond.perm[i] = i ^ nelt2;
21073
21074 ok = expand_vec_perm_1 (&dsecond);
21075 gcc_assert (ok);
21076
21077 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
21078 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
21079 return true;
21080 }
21081
21082 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
21083 a two vector permutation using two single vector permutations and
21084 {,v}{,p}unpckl{ps,pd,bw,wd,dq}. If two_insn, succeed only if one
21085 of dfirst or dsecond is an identity permutation. */
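/* E.g. a V4SFmode permutation { 0, 5, 1, 7 } is handled by shuffling
   elements 0 and 1 into the low half of op0, elements 1 and 3 into the
   low half of op1, and interleaving the two intermediate results with
   unpcklps.  */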
21086
21087 static bool
21088 expand_vec_perm_2perm_interleave (struct expand_vec_perm_d *d, bool two_insn)
21089 {
21090 unsigned i, nelt = d->nelt, nelt2 = nelt / 2, lane = nelt;
21091 struct expand_vec_perm_d dfirst, dsecond, dfinal;
21092 bool ident1 = true, ident2 = true;
21093
21094 if (d->one_operand_p)
21095 return false;
21096
21097 if (GET_MODE_SIZE (d->vmode) == 16)
21098 {
21099 if (!TARGET_SSE)
21100 return false;
21101 if (d->vmode != V4SFmode && d->vmode != V2DFmode && !TARGET_SSE2)
21102 return false;
21103 }
21104 else if (GET_MODE_SIZE (d->vmode) == 32)
21105 {
21106 if (!TARGET_AVX)
21107 return false;
21108 if (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2)
21109 return false;
21110 lane = nelt2;
21111 }
21112 else
21113 return false;
21114
21115 for (i = 1; i < nelt; i++)
21116 if ((d->perm[i] >= nelt) != ((d->perm[0] >= nelt) ^ (i & 1)))
21117 return false;
21118
21119 dfirst = *d;
21120 dsecond = *d;
21121 dfinal = *d;
21122 dfirst.op1 = dfirst.op0;
21123 dfirst.one_operand_p = true;
21124 dsecond.op0 = dsecond.op1;
21125 dsecond.one_operand_p = true;
21126
21127 for (i = 0; i < nelt; i++)
21128 if (d->perm[i] >= nelt)
21129 {
21130 dsecond.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i] - nelt;
21131 if (d->perm[i] - nelt != i / 2 + (i >= lane ? lane / 2 : 0))
21132 ident2 = false;
21133 dsecond.perm[i / 2 + (i >= lane ? lane : lane / 2)]
21134 = d->perm[i] - nelt;
21135 }
21136 else
21137 {
21138 dfirst.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i];
21139 if (d->perm[i] != i / 2 + (i >= lane ? lane / 2 : 0))
21140 ident1 = false;
21141 dfirst.perm[i / 2 + (i >= lane ? lane : lane / 2)] = d->perm[i];
21142 }
21143
21144 if (two_insn && !ident1 && !ident2)
21145 return false;
21146
21147 if (!d->testing_p)
21148 {
21149 if (!ident1)
21150 dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
21151 if (!ident2)
21152 dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
21153 if (d->perm[0] >= nelt)
21154 std::swap (dfinal.op0, dfinal.op1);
21155 }
21156
21157 bool ok;
21158 rtx_insn *seq1 = NULL, *seq2 = NULL;
21159
21160 if (!ident1)
21161 {
21162 start_sequence ();
21163 ok = expand_vec_perm_1 (&dfirst);
21164 seq1 = get_insns ();
21165 end_sequence ();
21166
21167 if (!ok)
21168 return false;
21169 }
21170
21171 if (!ident2)
21172 {
21173 start_sequence ();
21174 ok = expand_vec_perm_1 (&dsecond);
21175 seq2 = get_insns ();
21176 end_sequence ();
21177
21178 if (!ok)
21179 return false;
21180 }
21181
21182 if (d->testing_p)
21183 return true;
21184
21185 for (i = 0; i < nelt; i++)
21186 {
21187 dfinal.perm[i] = i / 2;
21188 if (i >= lane)
21189 dfinal.perm[i] += lane / 2;
21190 if ((i & 1) != 0)
21191 dfinal.perm[i] += nelt;
21192 }
21193 emit_insn (seq1);
21194 emit_insn (seq2);
21195 ok = expand_vselect_vconcat (dfinal.target, dfinal.op0, dfinal.op1,
21196 dfinal.perm, dfinal.nelt, false);
21197 gcc_assert (ok);
21198 return true;
21199 }
21200
21201 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
21202 the permutation using two single vector permutations and the SSE4_1 pblendv
21203 instruction. If two_insn, succeed only if one of dfirst or dsecond is
21204 an identity permutation. */
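/* E.g. a V4SFmode permutation { 1, 6, 0, 7 } is handled by shuffling op0
   into { 1, _, 0, _ }, op1 into { _, 2, _, 3 } and blending the two
   results with a mask that selects op1's copy in positions 1 and 3.  */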
21205
21206 static bool
21207 expand_vec_perm_2perm_pblendv (struct expand_vec_perm_d *d, bool two_insn)
21208 {
21209 unsigned i, nelt = d->nelt;
21210 struct expand_vec_perm_d dfirst, dsecond, dfinal;
21211 machine_mode vmode = d->vmode;
21212 bool ident1 = true, ident2 = true;
21213
21214 /* Use the same checks as in expand_vec_perm_blend. */
21215 if (d->one_operand_p)
21216 return false;
21217 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
21218 ;
21219 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
21220 ;
21221 else if (TARGET_SSE4_1
21222 && (GET_MODE_SIZE (vmode) == 16
21223 || (TARGET_MMX_WITH_SSE && GET_MODE_SIZE (vmode) == 8)
21224 || GET_MODE_SIZE (vmode) == 4))
21225 ;
21226 else
21227 return false;
21228
21229 dfirst = *d;
21230 dsecond = *d;
21231 dfinal = *d;
21232 dfirst.op1 = dfirst.op0;
21233 dfirst.one_operand_p = true;
21234 dsecond.op0 = dsecond.op1;
21235 dsecond.one_operand_p = true;
21236
21237 for (i = 0; i < nelt; ++i)
21238 if (d->perm[i] >= nelt)
21239 {
21240 dfirst.perm[i] = 0xff;
21241 dsecond.perm[i] = d->perm[i] - nelt;
21242 if (d->perm[i] != i + nelt)
21243 ident2 = false;
21244 }
21245 else
21246 {
21247 dsecond.perm[i] = 0xff;
21248 dfirst.perm[i] = d->perm[i];
21249 if (d->perm[i] != i)
21250 ident1 = false;
21251 }
21252
21253 if (two_insn && !ident1 && !ident2)
21254 return false;
21255
21256 /* For now. Ideally treat 0xff as a wildcard. */
21257 for (i = 0; i < nelt; ++i)
21258 if (dfirst.perm[i] == 0xff)
21259 {
21260 if (GET_MODE_SIZE (vmode) == 32
21261 && dfirst.perm[i ^ (nelt / 2)] != 0xff)
21262 dfirst.perm[i] = dfirst.perm[i ^ (nelt / 2)] ^ (nelt / 2);
21263 else
21264 dfirst.perm[i] = i;
21265 }
21266 else
21267 {
21268 if (GET_MODE_SIZE (vmode) == 32
21269 && dsecond.perm[i ^ (nelt / 2)] != 0xff)
21270 dsecond.perm[i] = dsecond.perm[i ^ (nelt / 2)] ^ (nelt / 2);
21271 else
21272 dsecond.perm[i] = i;
21273 }
21274
21275 if (!d->testing_p)
21276 {
21277 if (!ident1)
21278 dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
21279 if (!ident2)
21280 dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
21281 }
21282
21283 bool ok;
21284 rtx_insn *seq1 = NULL, *seq2 = NULL;
21285
21286 if (!ident1)
21287 {
21288 start_sequence ();
21289 ok = expand_vec_perm_1 (&dfirst);
21290 seq1 = get_insns ();
21291 end_sequence ();
21292
21293 if (!ok)
21294 return false;
21295 }
21296
21297 if (!ident2)
21298 {
21299 start_sequence ();
21300 ok = expand_vec_perm_1 (&dsecond);
21301 seq2 = get_insns ();
21302 end_sequence ();
21303
21304 if (!ok)
21305 return false;
21306 }
21307
21308 if (d->testing_p)
21309 return true;
21310
21311 for (i = 0; i < nelt; ++i)
21312 dfinal.perm[i] = (d->perm[i] >= nelt ? i + nelt : i);
21313
21314 emit_insn (seq1);
21315 emit_insn (seq2);
21316 ok = expand_vec_perm_blend (&dfinal);
21317 gcc_assert (ok);
21318 return true;
21319 }
21320
21321 /* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF
21322 permutation using two vperm2f128, followed by a vshufpd insn blending
21323 the two vectors together. */
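/* E.g. for a V4DFmode permutation { 3, 7, 0, 5 } the two vperm2f128
   insns produce { 2, 3, 0, 1 } from op0 and { 6, 7, 4, 5 } from op1, and
   the final vshufpd selects { 1, 5, 2, 7 } from that pair, i.e.
   { 3, 7, 0, 5 }.  */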
21324
21325 static bool
21326 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
21327 {
21328 struct expand_vec_perm_d dfirst, dsecond, dthird;
21329 bool ok;
21330
21331 if (!TARGET_AVX || (d->vmode != V4DFmode))
21332 return false;
21333
21334 if (d->testing_p)
21335 return true;
21336
21337 dfirst = *d;
21338 dsecond = *d;
21339 dthird = *d;
21340
21341 dfirst.perm[0] = (d->perm[0] & ~1);
21342 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
21343 dfirst.perm[2] = (d->perm[2] & ~1);
21344 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
21345 dsecond.perm[0] = (d->perm[1] & ~1);
21346 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
21347 dsecond.perm[2] = (d->perm[3] & ~1);
21348 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
21349 dthird.perm[0] = (d->perm[0] % 2);
21350 dthird.perm[1] = (d->perm[1] % 2) + 4;
21351 dthird.perm[2] = (d->perm[2] % 2) + 2;
21352 dthird.perm[3] = (d->perm[3] % 2) + 6;
21353
21354 dfirst.target = gen_reg_rtx (dfirst.vmode);
21355 dsecond.target = gen_reg_rtx (dsecond.vmode);
21356 dthird.op0 = dfirst.target;
21357 dthird.op1 = dsecond.target;
21358 dthird.one_operand_p = false;
21359
21360 canonicalize_perm (&dfirst);
21361 canonicalize_perm (&dsecond);
21362
21363 ok = expand_vec_perm_1 (&dfirst)
21364 && expand_vec_perm_1 (&dsecond)
21365 && expand_vec_perm_1 (&dthird);
21366
21367 gcc_assert (ok);
21368
21369 return true;
21370 }
21371
21372 static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *);
21373
21374 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
21375 a two vector permutation using two intra-lane vector
21376 permutations, vperm2f128 swapping the lanes and vblend* insn blending
21377 the non-swapped and swapped vectors together. */
21378
21379 static bool
21380 expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d *d)
21381 {
21382 struct expand_vec_perm_d dfirst, dsecond, dthird;
21383 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2, which1 = 0, which2 = 0;
21384 rtx_insn *seq1, *seq2;
21385 bool ok;
21386 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
21387
21388 if (!TARGET_AVX
21389 || TARGET_AVX2
21390 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
21391 || d->one_operand_p)
21392 return false;
21393
21394 dfirst = *d;
21395 dsecond = *d;
21396 for (i = 0; i < nelt; i++)
21397 {
21398 dfirst.perm[i] = 0xff;
21399 dsecond.perm[i] = 0xff;
21400 }
21401 for (i = 0, msk = 0; i < nelt; i++)
21402 {
21403 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
21404 if (j == i)
21405 {
21406 dfirst.perm[j] = d->perm[i];
21407 which1 |= (d->perm[i] < nelt ? 1 : 2);
21408 }
21409 else
21410 {
21411 dsecond.perm[j] = d->perm[i];
21412 which2 |= (d->perm[i] < nelt ? 1 : 2);
21413 msk |= (1U << i);
21414 }
21415 }
21416 if (msk == 0 || msk == (1U << nelt) - 1)
21417 return false;
21418
21419 if (!d->testing_p)
21420 {
21421 dfirst.target = gen_reg_rtx (dfirst.vmode);
21422 dsecond.target = gen_reg_rtx (dsecond.vmode);
21423 }
21424
21425 for (i = 0; i < nelt; i++)
21426 {
21427 if (dfirst.perm[i] == 0xff)
21428 dfirst.perm[i] = (which1 == 2 ? i + nelt : i);
21429 if (dsecond.perm[i] == 0xff)
21430 dsecond.perm[i] = (which2 == 2 ? i + nelt : i);
21431 }
21432 canonicalize_perm (&dfirst);
21433 start_sequence ();
21434 ok = ix86_expand_vec_perm_const_1 (&dfirst);
21435 seq1 = get_insns ();
21436 end_sequence ();
21437
21438 if (!ok)
21439 return false;
21440
21441 canonicalize_perm (&dsecond);
21442 start_sequence ();
21443 ok = ix86_expand_vec_perm_const_1 (&dsecond);
21444 seq2 = get_insns ();
21445 end_sequence ();
21446
21447 if (!ok)
21448 return false;
21449
21450 if (d->testing_p)
21451 return true;
21452
21453 emit_insn (seq1);
21454 emit_insn (seq2);
21455
21456 dthird = *d;
21457 dthird.op0 = dsecond.target;
21458 dthird.op1 = dsecond.target;
21459 dthird.one_operand_p = true;
21460 dthird.target = gen_reg_rtx (dthird.vmode);
21461 for (i = 0; i < nelt; i++)
21462 dthird.perm[i] = i ^ nelt2;
21463
21464 ok = expand_vec_perm_1 (&dthird);
21465 gcc_assert (ok);
21466
21467 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
21468 emit_insn (blend (d->target, dfirst.target, dthird.target, GEN_INT (msk)));
21469 return true;
21470 }
21471
21472 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
21473 permutation with two pshufb insns and an ior. We should have already
21474 failed all two instruction sequences. */
21475
21476 static bool
21477 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
21478 {
21479 rtx rperm[2][16], vperm, l, h, op, m128;
21480 unsigned int i, nelt, eltsz;
21481 machine_mode mode;
21482 rtx (*gen) (rtx, rtx, rtx);
21483
21484 if (!TARGET_SSSE3 || (GET_MODE_SIZE (d->vmode) != 16
21485 && GET_MODE_SIZE (d->vmode) != 8
21486 && GET_MODE_SIZE (d->vmode) != 4))
21487 return false;
21488 gcc_assert (!d->one_operand_p);
21489
21490 if (d->testing_p)
21491 return true;
21492
21493 switch (GET_MODE_SIZE (d->vmode))
21494 {
21495 case 4:
21496 mode = V4QImode;
21497 gen = gen_mmx_pshufbv4qi3;
21498 break;
21499 case 8:
21500 mode = V8QImode;
21501 gen = gen_mmx_pshufbv8qi3;
21502 break;
21503 case 16:
21504 mode = V16QImode;
21505 gen = gen_ssse3_pshufbv16qi3;
21506 break;
21507 default:
21508 gcc_unreachable ();
21509 }
21510
21511 nelt = d->nelt;
21512 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
21513
21514 /* Generate two permutation masks. If the required element is within
21515 the given vector it is shuffled into the proper lane. If the required
21516 element is in the other vector, force a zero into the lane by setting
21517 bit 7 in the permutation mask. */
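/* E.g. in V16QImode, if d->perm[0] is 18, byte 0 of the mask applied to
   d->op1 is 2 while byte 0 of the mask applied to d->op0 is -128, so the
   ior of the two pshufb results puts byte 2 of d->op1 into the low byte
   of the target.  */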
21518 m128 = GEN_INT (-128);
21519 for (i = 0; i < nelt; ++i)
21520 {
21521 unsigned j, k, e = d->perm[i];
21522 unsigned which = (e >= nelt);
21523 if (e >= nelt)
21524 e -= nelt;
21525
21526 for (j = 0; j < eltsz; ++j)
21527 {
21528 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
21529 rperm[1-which][i*eltsz + j] = m128;
21530 }
21531
21532 for (k = i*eltsz + j; k < 16; ++k)
21533 rperm[0][k] = rperm[1][k] = m128;
21534 }
21535
21536 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
21537 vperm = force_reg (V16QImode, vperm);
21538
21539 l = gen_reg_rtx (mode);
21540 op = gen_lowpart (mode, d->op0);
21541 emit_insn (gen (l, op, vperm));
21542
21543 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
21544 vperm = force_reg (V16QImode, vperm);
21545
21546 h = gen_reg_rtx (mode);
21547 op = gen_lowpart (mode, d->op1);
21548 emit_insn (gen (h, op, vperm));
21549
21550 op = d->target;
21551 if (d->vmode != mode)
21552 op = gen_reg_rtx (mode);
21553 ix86_emit_vec_binop (IOR, mode, op, l, h);
21554 if (op != d->target)
21555 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
21556
21557 return true;
21558 }
21559
21560 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
21561 with two vpshufb insns, vpermq and vpor. We should have already failed
21562 all two or three instruction sequences. */
21563
21564 static bool
21565 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
21566 {
21567 rtx rperm[2][32], vperm, l, h, hp, op, m128;
21568 unsigned int i, nelt, eltsz;
21569
21570 if (!TARGET_AVX2
21571 || !d->one_operand_p
21572 || (d->vmode != V32QImode && d->vmode != V16HImode))
21573 return false;
21574
21575 if (d->testing_p)
21576 return true;
21577
21578 nelt = d->nelt;
21579 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
21580
21581 /* Generate two permutation masks. If the required element is within
21582 the same lane, it is shuffled in. If the required element is from the
21583 other lane, force a zero by setting bit 7 in the permutation mask.
21584 The other mask has a non-negative element when the element is
21585 requested from the other lane, but that element is also moved to the
21586 other lane, so that the result of vpshufb can have its two V2TImode
21587 halves swapped. */
21588 m128 = GEN_INT (-128);
21589 for (i = 0; i < nelt; ++i)
21590 {
21591 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
21592 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
21593
21594 for (j = 0; j < eltsz; ++j)
21595 {
21596 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
21597 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
21598 }
21599 }
21600
21601 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
21602 vperm = force_reg (V32QImode, vperm);
21603
21604 h = gen_reg_rtx (V32QImode);
21605 op = gen_lowpart (V32QImode, d->op0);
21606 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
21607
21608 /* Swap the 128-bit lanes of h into hp. */
21609 hp = gen_reg_rtx (V4DImode);
21610 op = gen_lowpart (V4DImode, h);
21611 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
21612 const1_rtx));
21613
21614 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
21615 vperm = force_reg (V32QImode, vperm);
21616
21617 l = gen_reg_rtx (V32QImode);
21618 op = gen_lowpart (V32QImode, d->op0);
21619 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
21620
21621 op = d->target;
21622 if (d->vmode != V32QImode)
21623 op = gen_reg_rtx (V32QImode);
21624 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
21625 if (op != d->target)
21626 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
21627
21628 return true;
21629 }
21630
21631 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
21632 and extract-odd permutations of two V32QImode or V16HImode operands
21633 with two vpshufb insns, vpor and vpermq. We should have already
21634 failed all two or three instruction sequences. */
21635
21636 static bool
21637 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
21638 {
21639 rtx rperm[2][32], vperm, l, h, ior, op, m128;
21640 unsigned int i, nelt, eltsz;
21641
21642 if (!TARGET_AVX2
21643 || d->one_operand_p
21644 || (d->vmode != V32QImode && d->vmode != V16HImode))
21645 return false;
21646
21647 for (i = 0; i < d->nelt; ++i)
21648 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
21649 return false;
21650
21651 if (d->testing_p)
21652 return true;
21653
21654 nelt = d->nelt;
21655 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
21656
21657 /* Generate two permutation masks. In the first permutation mask
21658 the first quarter contains indexes for the first half of op0, the
21659 second quarter has bit 7 set, the third quarter contains indexes for
21660 the second half of op0, and the last quarter has bit 7 set. In the
21661 second permutation mask the first quarter has bit 7 set, the second
21662 quarter contains indexes for the first half of op1, the third
21663 quarter has bit 7 set, and the last quarter contains indexes for the
21664 second half of op1.
21665 I.e. the first mask e.g. for V32QImode extract even will be:
21666 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
21667 (all values masked with 0xf except for -128) and second mask
21668 for extract even will be
21669 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
21670 m128 = GEN_INT (-128);
21671 for (i = 0; i < nelt; ++i)
21672 {
21673 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
21674 unsigned which = d->perm[i] >= nelt;
21675 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
21676
21677 for (j = 0; j < eltsz; ++j)
21678 {
21679 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
21680 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
21681 }
21682 }
21683
21684 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
21685 vperm = force_reg (V32QImode, vperm);
21686
21687 l = gen_reg_rtx (V32QImode);
21688 op = gen_lowpart (V32QImode, d->op0);
21689 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
21690
21691 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
21692 vperm = force_reg (V32QImode, vperm);
21693
21694 h = gen_reg_rtx (V32QImode);
21695 op = gen_lowpart (V32QImode, d->op1);
21696 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
21697
21698 ior = gen_reg_rtx (V32QImode);
21699 emit_insn (gen_iorv32qi3 (ior, l, h));
21700
21701 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
21702 op = gen_reg_rtx (V4DImode);
21703 ior = gen_lowpart (V4DImode, ior);
21704 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
21705 const1_rtx, GEN_INT (3)));
21706 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
21707
21708 return true;
21709 }
21710
21711 /* Implement permutation with pslldq + psrldq + por when pshufb is not
21712 available. */
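/* E.g. a V16QImode permutation { 2, 3, ..., 15, 16, 17 } is expanded as
   psrldq of op0 by two bytes, pslldq of op1 by fourteen bytes and a por
   of the two results; when the two index runs would leave stray elements
   behind (end1 != nelt - 1 or start2 % nelt != 0), the pandn variant
   masks them off with a constant vector first.  */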
21713 static bool
21714 expand_vec_perm_pslldq_psrldq_por (struct expand_vec_perm_d *d, bool pandn)
21715 {
21716 unsigned i, nelt = d->nelt;
21717 unsigned start1, end1 = -1;
21718 machine_mode vmode = d->vmode, imode;
21719 int start2 = -1;
21720 bool clear_op0, clear_op1;
21721 unsigned inner_size;
21722 rtx op0, op1, dop1;
21723 rtx (*gen_vec_shr) (rtx, rtx, rtx);
21724 rtx (*gen_vec_shl) (rtx, rtx, rtx);
21725
21726 /* pshufd can be used for V4SImode/V2DImode under TARGET_SSE2; only V16QImode and V8HImode are handled here. */
21727 if (!TARGET_SSE2 || (vmode != E_V16QImode && vmode != E_V8HImode))
21728 return false;
21729
21730 start1 = d->perm[0];
21731 for (i = 1; i < nelt; i++)
21732 {
21733 if (d->perm[i] != d->perm[i-1] + 1
21734 || d->perm[i] == nelt)
21735 {
21736 if (start2 == -1)
21737 {
21738 start2 = d->perm[i];
21739 end1 = d->perm[i-1];
21740 }
21741 else
21742 return false;
21743 }
21744 }
21745
21746 clear_op0 = end1 != nelt - 1;
21747 clear_op1 = start2 % nelt != 0;
21748 /* pandn/pand is needed to clear upper/lower bits of op0/op1. */
21749 if (!pandn && (clear_op0 || clear_op1))
21750 return false;
21751
21752 if (d->testing_p)
21753 return true;
21754
21755 gen_vec_shr = vmode == E_V16QImode ? gen_vec_shr_v16qi : gen_vec_shr_v8hi;
21756 gen_vec_shl = vmode == E_V16QImode ? gen_vec_shl_v16qi : gen_vec_shl_v8hi;
21757 imode = GET_MODE_INNER (vmode);
21758 inner_size = GET_MODE_BITSIZE (imode);
21759 op0 = gen_reg_rtx (vmode);
21760 op1 = gen_reg_rtx (vmode);
21761
21762 if (start1)
21763 emit_insn (gen_vec_shr (op0, d->op0, GEN_INT (start1 * inner_size)));
21764 else
21765 emit_move_insn (op0, d->op0);
21766
21767 dop1 = d->op1;
21768 if (d->one_operand_p)
21769 dop1 = d->op0;
21770
21771 int shl_offset = end1 - start1 + 1 - start2 % nelt;
21772 if (shl_offset)
21773 emit_insn (gen_vec_shl (op1, dop1, GEN_INT (shl_offset * inner_size)));
21774 else
21775 emit_move_insn (op1, dop1);
21776
21777 /* Clear lower/upper bits for op0/op1. */
21778 if (clear_op0 || clear_op1)
21779 {
21780 rtx vec[16];
21781 rtx const_vec;
21782 rtx clear;
21783 for (i = 0; i != nelt; i++)
21784 {
21785 if (i < (end1 - start1 + 1))
21786 vec[i] = gen_int_mode ((HOST_WIDE_INT_1U << inner_size) - 1, imode);
21787 else
21788 vec[i] = CONST0_RTX (imode);
21789 }
21790 const_vec = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, vec));
21791 const_vec = validize_mem (force_const_mem (vmode, const_vec));
21792 clear = force_reg (vmode, const_vec);
21793
21794 if (clear_op0)
21795 emit_move_insn (op0, gen_rtx_AND (vmode, op0, clear));
21796 if (clear_op1)
21797 emit_move_insn (op1, gen_rtx_AND (vmode,
21798 gen_rtx_NOT (vmode, clear),
21799 op1));
21800 }
21801
21802 emit_move_insn (d->target, gen_rtx_IOR (vmode, op0, op1));
21803 return true;
21804 }
21805
21806 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
21807 and extract-odd permutations of two V4HI, V8QI, V8HI, V16QI, V16HI or
21808 V32QI operands with two "and" and one "pack" insn, or two "shift" and one "pack" insn.
21809 We should have already failed all two instruction sequences. */
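/* E.g. extract-even of two V16QImode operands views each operand as
   V8HImode, masks every word with 0x00ff and combines the two results
   with packuswb; extract-odd replaces the mask with a logical right
   shift of every word by 8.  */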
21810
21811 static bool
21812 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
21813 {
21814 rtx op, dop0, dop1, t;
21815 unsigned i, odd, c, s, nelt = d->nelt;
21816 bool end_perm = false;
21817 machine_mode half_mode;
21818 rtx (*gen_and) (rtx, rtx, rtx);
21819 rtx (*gen_pack) (rtx, rtx, rtx);
21820 rtx (*gen_shift) (rtx, rtx, rtx);
21821
21822 if (d->one_operand_p)
21823 return false;
21824
21825 switch (d->vmode)
21826 {
21827 case E_V4HImode:
21828 /* Required for "pack". */
21829 if (!TARGET_SSE4_1)
21830 return false;
21831 c = 0xffff;
21832 s = 16;
21833 half_mode = V2SImode;
21834 gen_and = gen_andv2si3;
21835 gen_pack = gen_mmx_packusdw;
21836 gen_shift = gen_lshrv2si3;
21837 break;
21838 case E_V8HImode:
21839 /* Required for "pack". */
21840 if (!TARGET_SSE4_1)
21841 return false;
21842 c = 0xffff;
21843 s = 16;
21844 half_mode = V4SImode;
21845 gen_and = gen_andv4si3;
21846 gen_pack = gen_sse4_1_packusdw;
21847 gen_shift = gen_lshrv4si3;
21848 break;
21849 case E_V8QImode:
21850 /* No check as all instructions are SSE2. */
21851 c = 0xff;
21852 s = 8;
21853 half_mode = V4HImode;
21854 gen_and = gen_andv4hi3;
21855 gen_pack = gen_mmx_packuswb;
21856 gen_shift = gen_lshrv4hi3;
21857 break;
21858 case E_V16QImode:
21859 /* No check as all instructions are SSE2. */
21860 c = 0xff;
21861 s = 8;
21862 half_mode = V8HImode;
21863 gen_and = gen_andv8hi3;
21864 gen_pack = gen_sse2_packuswb;
21865 gen_shift = gen_lshrv8hi3;
21866 break;
21867 case E_V16HImode:
21868 if (!TARGET_AVX2)
21869 return false;
21870 c = 0xffff;
21871 s = 16;
21872 half_mode = V8SImode;
21873 gen_and = gen_andv8si3;
21874 gen_pack = gen_avx2_packusdw;
21875 gen_shift = gen_lshrv8si3;
21876 end_perm = true;
21877 break;
21878 case E_V32QImode:
21879 if (!TARGET_AVX2)
21880 return false;
21881 c = 0xff;
21882 s = 8;
21883 half_mode = V16HImode;
21884 gen_and = gen_andv16hi3;
21885 gen_pack = gen_avx2_packuswb;
21886 gen_shift = gen_lshrv16hi3;
21887 end_perm = true;
21888 break;
21889 default:
21890 /* Only for V4HI, V8QI, V8HI, V16QI, V16HI and V32QI modes is this
21891 approach more profitable than general shuffles. */
21892 return false;
21893 }
21894
21895 /* Check that permutation is even or odd. */
21896 odd = d->perm[0];
21897 if (odd > 1)
21898 return false;
21899
21900 for (i = 1; i < nelt; ++i)
21901 if (d->perm[i] != 2 * i + odd)
21902 return false;
21903
21904 if (d->testing_p)
21905 return true;
21906
21907 dop0 = gen_reg_rtx (half_mode);
21908 dop1 = gen_reg_rtx (half_mode);
21909 if (odd == 0)
21910 {
21911 t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
21912 t = force_reg (half_mode, t);
21913 emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
21914 emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
21915 }
21916 else
21917 {
21918 emit_insn (gen_shift (dop0,
21919 gen_lowpart (half_mode, d->op0),
21920 GEN_INT (s)));
21921 emit_insn (gen_shift (dop1,
21922 gen_lowpart (half_mode, d->op1),
21923 GEN_INT (s)));
21924 }
21925 /* For the AVX2 256-bit case we need to permute the pack result. */
21926 if (TARGET_AVX2 && end_perm)
21927 {
21928 op = gen_reg_rtx (d->vmode);
21929 t = gen_reg_rtx (V4DImode);
21930 emit_insn (gen_pack (op, dop0, dop1));
21931 emit_insn (gen_avx2_permv4di_1 (t,
21932 gen_lowpart (V4DImode, op),
21933 const0_rtx,
21934 const2_rtx,
21935 const1_rtx,
21936 GEN_INT (3)));
21937 emit_move_insn (d->target, gen_lowpart (d->vmode, t));
21938 }
21939 else
21940 emit_insn (gen_pack (d->target, dop0, dop1));
21941
21942 return true;
21943 }
21944
21945 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
21946 and extract-odd permutations of two V64QImode operands
21947 with two "shift", two "trunc" and one "concat" insns for "odd"
21948 and two "trunc" and one "concat" insn for "even".
21949 We should have already failed all two instruction sequences. */
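/* Viewing each V64QImode operand as V32HImode, the logical right shift
   by 8 moves every odd byte into the low byte of its word, so the
   truncation back to V32QImode keeps exactly the odd bytes; without the
   shift it keeps the even bytes.  */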
21950
21951 static bool
21952 expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
21953 {
21954 rtx t1, t2, t3, t4;
21955 unsigned i, odd, nelt = d->nelt;
21956
21957 if (!TARGET_AVX512BW
21958 || d->one_operand_p
21959 || d->vmode != V64QImode)
21960 return false;
21961
21962 /* Check that permutation is even or odd. */
21963 odd = d->perm[0];
21964 if (odd > 1)
21965 return false;
21966
21967 for (i = 1; i < nelt; ++i)
21968 if (d->perm[i] != 2 * i + odd)
21969 return false;
21970
21971 if (d->testing_p)
21972 return true;
21973
21974
21975 if (odd)
21976 {
21977 t1 = gen_reg_rtx (V32HImode);
21978 t2 = gen_reg_rtx (V32HImode);
21979 emit_insn (gen_lshrv32hi3 (t1,
21980 gen_lowpart (V32HImode, d->op0),
21981 GEN_INT (8)));
21982 emit_insn (gen_lshrv32hi3 (t2,
21983 gen_lowpart (V32HImode, d->op1),
21984 GEN_INT (8)));
21985 }
21986 else
21987 {
21988 t1 = gen_lowpart (V32HImode, d->op0);
21989 t2 = gen_lowpart (V32HImode, d->op1);
21990 }
21991
21992 t3 = gen_reg_rtx (V32QImode);
21993 t4 = gen_reg_rtx (V32QImode);
21994 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
21995 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
21996 emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
21997
21998 return true;
21999 }
22000
22001 /* A subroutine of ix86_expand_vec_perm_const_1. Implement extract-even
22002 and extract-odd permutations. */
22003
22004 static bool
22005 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
22006 {
22007 rtx t1, t2, t3, t4, t5;
22008
22009 switch (d->vmode)
22010 {
22011 case E_V4DFmode:
22012 if (d->testing_p)
22013 break;
22014 t1 = gen_reg_rtx (V4DFmode);
22015 t2 = gen_reg_rtx (V4DFmode);
22016
22017 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
22018 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
22019 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
22020
22021 /* Now an unpck[lh]pd will produce the result required. */
22022 if (odd)
22023 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
22024 else
22025 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
22026 emit_insn (t3);
22027 break;
22028
22029 case E_V8SFmode:
22030 {
22031 int mask = odd ? 0xdd : 0x88;
22032
22033 if (d->testing_p)
22034 break;
22035 t1 = gen_reg_rtx (V8SFmode);
22036 t2 = gen_reg_rtx (V8SFmode);
22037 t3 = gen_reg_rtx (V8SFmode);
22038
22039 /* Shuffle within the 128-bit lanes to produce:
22040 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
22041 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
22042 GEN_INT (mask)));
22043
22044 /* Shuffle the lanes around to produce:
22045 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
22046 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
22047 GEN_INT (0x3)));
22048
22049 /* Shuffle within the 128-bit lanes to produce:
22050 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
22051 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
22052
22053 /* Shuffle within the 128-bit lanes to produce:
22054 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
22055 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
22056
22057 /* Shuffle the lanes around to produce:
22058 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
22059 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
22060 GEN_INT (0x20)));
22061 }
22062 break;
22063
22064 case E_V2DFmode:
22065 case E_V4SFmode:
22066 case E_V2DImode:
22067 case E_V2SImode:
22068 case E_V4SImode:
22069 case E_V2HImode:
22070 /* These are always directly implementable by expand_vec_perm_1. */
22071 gcc_unreachable ();
22072
22073 case E_V2SFmode:
22074 gcc_assert (TARGET_MMX_WITH_SSE);
22075 /* We have no suitable instructions. */
22076 if (d->testing_p)
22077 return false;
22078 break;
22079
22080 case E_V4QImode:
22081 if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
22082 return expand_vec_perm_pshufb2 (d);
22083 else
22084 {
22085 if (d->testing_p)
22086 break;
22087 /* We need 2*log2(N)-1 operations to achieve odd/even
22088 with interleave. */
22089 t1 = gen_reg_rtx (V4QImode);
22090 emit_insn (gen_mmx_punpckhbw_low (t1, d->op0, d->op1));
22091 emit_insn (gen_mmx_punpcklbw_low (d->target, d->op0, d->op1));
22092 if (odd)
22093 t2 = gen_mmx_punpckhbw_low (d->target, d->target, t1);
22094 else
22095 t2 = gen_mmx_punpcklbw_low (d->target, d->target, t1);
22096 emit_insn (t2);
22097 }
22098 break;
22099
22100 case E_V4HImode:
22101 if (TARGET_SSE4_1)
22102 return expand_vec_perm_even_odd_pack (d);
22103 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
22104 return expand_vec_perm_pshufb2 (d);
22105 else
22106 {
22107 if (d->testing_p)
22108 break;
22109 /* We need 2*log2(N)-1 operations to achieve odd/even
22110 with interleave. */
22111 t1 = gen_reg_rtx (V4HImode);
22112 emit_insn (gen_mmx_punpckhwd (t1, d->op0, d->op1));
22113 emit_insn (gen_mmx_punpcklwd (d->target, d->op0, d->op1));
22114 if (odd)
22115 t2 = gen_mmx_punpckhwd (d->target, d->target, t1);
22116 else
22117 t2 = gen_mmx_punpcklwd (d->target, d->target, t1);
22118 emit_insn (t2);
22119 }
22120 break;
22121
22122 case E_V8HImode:
22123 if (TARGET_SSE4_1)
22124 return expand_vec_perm_even_odd_pack (d);
22125 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
22126 return expand_vec_perm_pshufb2 (d);
22127 else
22128 {
22129 if (d->testing_p)
22130 break;
22131 /* We need 2*log2(N)-1 operations to achieve odd/even
22132 with interleave. */
22133 t1 = gen_reg_rtx (V8HImode);
22134 t2 = gen_reg_rtx (V8HImode);
22135 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
22136 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
22137 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
22138 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
22139 if (odd)
22140 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
22141 else
22142 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
22143 emit_insn (t3);
22144 }
22145 break;
22146
22147 case E_V8QImode:
22148 case E_V16QImode:
22149 return expand_vec_perm_even_odd_pack (d);
22150
22151 case E_V16HImode:
22152 case E_V32QImode:
22153 return expand_vec_perm_even_odd_pack (d);
22154
22155 case E_V64QImode:
22156 return expand_vec_perm_even_odd_trunc (d);
22157
22158 case E_V4DImode:
22159 if (!TARGET_AVX2)
22160 {
22161 struct expand_vec_perm_d d_copy = *d;
22162 d_copy.vmode = V4DFmode;
22163 if (d->testing_p)
22164 d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
22165 else
22166 d_copy.target = gen_reg_rtx (V4DFmode);
22167 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
22168 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
22169 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
22170 {
22171 if (!d->testing_p)
22172 emit_move_insn (d->target,
22173 gen_lowpart (V4DImode, d_copy.target));
22174 return true;
22175 }
22176 return false;
22177 }
22178
22179 if (d->testing_p)
22180 break;
22181
22182 t1 = gen_reg_rtx (V4DImode);
22183 t2 = gen_reg_rtx (V4DImode);
22184
22185 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
22186 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
22187 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
22188
22189 /* Now a vpunpck[lh]qdq will produce the required result. */
22190 if (odd)
22191 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
22192 else
22193 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
22194 emit_insn (t3);
22195 break;
22196
22197 case E_V8SImode:
22198 if (!TARGET_AVX2)
22199 {
22200 struct expand_vec_perm_d d_copy = *d;
22201 d_copy.vmode = V8SFmode;
22202 if (d->testing_p)
22203 d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
22204 else
22205 d_copy.target = gen_reg_rtx (V8SFmode);
22206 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
22207 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
22208 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
22209 {
22210 if (!d->testing_p)
22211 emit_move_insn (d->target,
22212 gen_lowpart (V8SImode, d_copy.target));
22213 return true;
22214 }
22215 return false;
22216 }
22217
22218 if (d->testing_p)
22219 break;
22220
22221 t1 = gen_reg_rtx (V8SImode);
22222 t2 = gen_reg_rtx (V8SImode);
22223 t3 = gen_reg_rtx (V4DImode);
22224 t4 = gen_reg_rtx (V4DImode);
22225 t5 = gen_reg_rtx (V4DImode);
22226
22227 /* Shuffle the lanes around into
22228 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
22229 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
22230 gen_lowpart (V4DImode, d->op1),
22231 GEN_INT (0x20)));
22232 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
22233 gen_lowpart (V4DImode, d->op1),
22234 GEN_INT (0x31)));
22235
22236 /* Swap the 2nd and 3rd position in each lane into
22237 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
22238 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
22239 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
22240 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
22241 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
22242
22243 /* Now a vpunpck[lh]qdq will produce
22244 { 0 2 4 6 8 a c e } or { 1 3 5 7 9 b d f } respectively. */
22245 if (odd)
22246 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
22247 gen_lowpart (V4DImode, t2));
22248 else
22249 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
22250 gen_lowpart (V4DImode, t2));
22251 emit_insn (t3);
22252 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
22253 break;
22254
22255 default:
22256 gcc_unreachable ();
22257 }
22258
22259 return true;
22260 }
22261
22262 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
22263 extract-even and extract-odd permutations. */
22264
22265 static bool
22266 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
22267 {
22268 unsigned i, odd, nelt = d->nelt;
22269
22270 odd = d->perm[0];
22271 if (odd != 0 && odd != 1)
22272 return false;
22273
22274 for (i = 1; i < nelt; ++i)
22275 if (d->perm[i] != 2 * i + odd)
22276 return false;
22277
22278 if (d->vmode == E_V32HImode
22279 && d->testing_p
22280 && !TARGET_AVX512BW)
22281 return false;
22282
22283 return expand_vec_perm_even_odd_1 (d, odd);
22284 }
22285
22286 /* A subroutine of ix86_expand_vec_perm_const_1. Implement broadcast
22287 permutations. We assume that expand_vec_perm_1 has already failed. */
22288
22289 static bool
22290 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
22291 {
22292 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
22293 machine_mode vmode = d->vmode;
22294 rtx (*gen) (rtx, rtx, rtx);
22295 unsigned char perm2[4];
22296 rtx op0 = d->op0, dest;
22297 bool ok;
22298
22299 switch (vmode)
22300 {
22301 case E_V4DFmode:
22302 case E_V8SFmode:
22303 /* These are special-cased in sse.md so that we can optionally
22304 use the vbroadcast instruction. They expand to two insns
22305 if the input happens to be in a register. */
22306 gcc_unreachable ();
22307
22308 case E_V2DFmode:
22309 case E_V2SFmode:
22310 case E_V4SFmode:
22311 case E_V2DImode:
22312 case E_V2SImode:
22313 case E_V4SImode:
22314 case E_V2HImode:
22315 case E_V4HImode:
22316 /* These are always implementable using standard shuffle patterns. */
22317 gcc_unreachable ();
22318
22319 case E_V4QImode:
22320 /* This can be implemented via interleave and pshuflw. */
22321 if (d->testing_p)
22322 return true;
22323
22324 if (elt >= nelt2)
22325 {
22326 gen = gen_mmx_punpckhbw_low;
22327 elt -= nelt2;
22328 }
22329 else
22330 gen = gen_mmx_punpcklbw_low;
22331
22332 dest = gen_reg_rtx (vmode);
22333 emit_insn (gen (dest, op0, op0));
22334 vmode = get_mode_wider_vector (vmode);
22335 op0 = gen_lowpart (vmode, dest);
22336
22337 memset (perm2, elt, 2);
22338 dest = gen_reg_rtx (vmode);
22339 ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
22340 gcc_assert (ok);
22341
22342 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
22343 return true;
22344
22345 case E_V8QImode:
22346 /* This can be implemented via interleave. We save one insn by
22347 stopping once we have promoted to V2SImode and then use pshufd. */
22348 if (d->testing_p)
22349 return true;
22350 do
22351 {
22352 if (elt >= nelt2)
22353 {
22354 gen = vmode == V8QImode ? gen_mmx_punpckhbw
22355 : gen_mmx_punpckhwd;
22356 elt -= nelt2;
22357 }
22358 else
22359 gen = vmode == V8QImode ? gen_mmx_punpcklbw
22360 : gen_mmx_punpcklwd;
22361 nelt2 /= 2;
22362
22363 dest = gen_reg_rtx (vmode);
22364 emit_insn (gen (dest, op0, op0));
22365 vmode = get_mode_wider_vector (vmode);
22366 op0 = gen_lowpart (vmode, dest);
22367 }
22368 while (vmode != V2SImode);
22369
22370 memset (perm2, elt, 2);
22371 dest = gen_reg_rtx (vmode);
22372 ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
22373 gcc_assert (ok);
22374
22375 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
22376 return true;
22377
22378 case E_V8HImode:
22379 case E_V16QImode:
22380 /* These can be implemented via interleave. We save one insn by
22381 stopping once we have promoted to V4SImode and then use pshufd. */
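/* E.g. to broadcast byte 5 of a V16QImode vector, punpcklbw with itself
   leaves two copies of it in word 5, punpckhwd then leaves four copies
   in dword 1, and a final pshufd replicates that dword across the whole
   vector.  */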
22382 if (d->testing_p)
22383 return true;
22384 do
22385 {
22386 if (elt >= nelt2)
22387 {
22388 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
22389 : gen_vec_interleave_highv8hi;
22390 elt -= nelt2;
22391 }
22392 else
22393 gen = vmode == V16QImode ? gen_vec_interleave_lowv16qi
22394 : gen_vec_interleave_lowv8hi;
22395 nelt2 /= 2;
22396
22397 dest = gen_reg_rtx (vmode);
22398 emit_insn (gen (dest, op0, op0));
22399 vmode = get_mode_wider_vector (vmode);
22400 op0 = gen_lowpart (vmode, dest);
22401 }
22402 while (vmode != V4SImode);
22403
22404 memset (perm2, elt, 4);
22405 dest = gen_reg_rtx (vmode);
22406 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
22407 gcc_assert (ok);
22408
22409 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
22410 return true;
22411
22412 case E_V8HFmode:
22413 case E_V8BFmode:
22414 /* This can be implemented via interleave and pshufd. */
22415 if (d->testing_p)
22416 return true;
22417
22418 rtx (*maybe_gen) (machine_mode, int, rtx, rtx, rtx);
22419 if (elt >= nelt2)
22420 {
22421 maybe_gen = maybe_gen_vec_interleave_high;
22422 elt -= nelt2;
22423 }
22424 else
22425 maybe_gen = maybe_gen_vec_interleave_low;
22426 nelt2 /= 2;
22427
22428 dest = gen_reg_rtx (vmode);
22429 emit_insn (maybe_gen (vmode, 1, dest, op0, op0));
22430
22431 vmode = V4SImode;
22432 op0 = gen_lowpart (vmode, dest);
22433
22434 memset (perm2, elt, 4);
22435 dest = gen_reg_rtx (vmode);
22436 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
22437 gcc_assert (ok);
22438
22439 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
22440 return true;
22441
22442 case E_V32QImode:
22443 case E_V16HImode:
22444 case E_V8SImode:
22445 case E_V4DImode:
22446 /* For AVX2 broadcasts of the first element vpbroadcast* or
22447 vpermq should be used by expand_vec_perm_1. */
22448 gcc_assert (!TARGET_AVX2 || d->perm[0]);
22449 return false;
22450
22451 case E_V64QImode:
22452 gcc_assert (!TARGET_AVX512BW || d->perm[0]);
22453 return false;
22454
22455 case E_V32HImode:
22456 gcc_assert (!TARGET_AVX512BW);
22457 return false;
22458
22459 default:
22460 gcc_unreachable ();
22461 }
22462 }
22463
22464 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
22465 broadcast permutations. */
22466
22467 static bool
22468 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
22469 {
22470 unsigned i, elt, nelt = d->nelt;
22471
22472 if (!d->one_operand_p)
22473 return false;
22474
22475 elt = d->perm[0];
22476 for (i = 1; i < nelt; ++i)
22477 if (d->perm[i] != elt)
22478 return false;
22479
22480 return expand_vec_perm_broadcast_1 (d);
22481 }
22482
22483 /* Implement arbitrary permutations of two V64QImode operands
22484 with 2 vperm[it]2w, 2 vpshufb and one vpor instruction. */
22485 static bool
22486 expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
22487 {
22488 if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
22489 return false;
22490
22491 if (d->testing_p)
22492 return true;
22493
22494 struct expand_vec_perm_d ds[2];
22495 rtx rperm[128], vperm, target0, target1;
22496 unsigned int i, nelt;
22497 machine_mode vmode;
22498
22499 nelt = d->nelt;
22500 vmode = V64QImode;
22501
22502 for (i = 0; i < 2; i++)
22503 {
22504 ds[i] = *d;
22505 ds[i].vmode = V32HImode;
22506 ds[i].nelt = 32;
22507 ds[i].target = gen_reg_rtx (V32HImode);
22508 ds[i].op0 = gen_lowpart (V32HImode, d->op0);
22509 ds[i].op1 = gen_lowpart (V32HImode, d->op1);
22510 }
22511
22512 /* Prepare the two word permutations: the first one (ds[0]) puts
22513 the even destination bytes into their final positions or one
22514 position higher, and the second one (ds[1]) puts the odd
22515 destination bytes into their final positions or one position
22516 lower. */
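/* Concretely, the word permutation ds[i & 1] moves the word holding
   source byte d->perm[i] into word i / 2 of its target, so that byte
   ends up at byte position (i & ~1) + (d->perm[i] & 1); the vpshufb
   masks built below select it from there (in-lane index
   (i & 14) + (d->perm[i] & 1)) and zero the other byte of each word via
   the constm1_rtx entries, which have bit 7 set.  */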
22517
22518 for (i = 0; i < nelt; i++)
22519 {
22520 ds[i & 1].perm[i / 2] = d->perm[i] / 2;
22521 if (i & 1)
22522 {
22523 rperm[i] = constm1_rtx;
22524 rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
22525 }
22526 else
22527 {
22528 rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
22529 rperm[i + 64] = constm1_rtx;
22530 }
22531 }
22532
22533 bool ok = expand_vec_perm_1 (&ds[0]);
22534 gcc_assert (ok);
22535 ds[0].target = gen_lowpart (V64QImode, ds[0].target);
22536
22537 ok = expand_vec_perm_1 (&ds[1]);
22538 gcc_assert (ok);
22539 ds[1].target = gen_lowpart (V64QImode, ds[1].target);
22540
22541 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
22542 vperm = force_reg (vmode, vperm);
22543 target0 = gen_reg_rtx (V64QImode);
22544 emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
22545
22546 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
22547 vperm = force_reg (vmode, vperm);
22548 target1 = gen_reg_rtx (V64QImode);
22549 emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
22550
22551 emit_insn (gen_iorv64qi3 (d->target, target0, target1));
22552 return true;
22553 }
22554
22555 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
22556 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
22557 all the shorter instruction sequences. */
22558
22559 static bool
22560 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
22561 {
22562 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
22563 unsigned int i, nelt, eltsz;
22564 bool used[4];
22565
22566 if (!TARGET_AVX2
22567 || d->one_operand_p
22568 || (d->vmode != V32QImode && d->vmode != V16HImode))
22569 return false;
22570
22571 if (d->testing_p)
22572 return true;
22573
22574 nelt = d->nelt;
22575 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
22576
22577 /* Generate 4 permutation masks. If the required element is within
22578 the same lane, it is shuffled in. If the required element is from the
22579 other lane, force a zero by setting bit 7 in the permutation mask.
22580 The cross-lane masks have a non-negative element when the element is
22581 requested from the other lane, but that element is also moved to the
22582 other lane, so that the result of vpshufb can have its two V2TImode
22583 halves swapped. */
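/* Mask 0 covers bytes taken from d->op0 that stay in their lane, mask 1
   bytes from d->op0 that must cross lanes, and masks 2 and 3 the same
   for d->op1; masks that end up unused are skipped below.  */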
22584 m128 = GEN_INT (-128);
22585 for (i = 0; i < 32; ++i)
22586 {
22587 rperm[0][i] = m128;
22588 rperm[1][i] = m128;
22589 rperm[2][i] = m128;
22590 rperm[3][i] = m128;
22591 }
22592 used[0] = false;
22593 used[1] = false;
22594 used[2] = false;
22595 used[3] = false;
22596 for (i = 0; i < nelt; ++i)
22597 {
22598 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
22599 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
22600 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
22601
22602 for (j = 0; j < eltsz; ++j)
22603 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
22604 used[which] = true;
22605 }
22606
22607 for (i = 0; i < 2; ++i)
22608 {
22609 if (!used[2 * i + 1])
22610 {
22611 h[i] = NULL_RTX;
22612 continue;
22613 }
22614 vperm = gen_rtx_CONST_VECTOR (V32QImode,
22615 gen_rtvec_v (32, rperm[2 * i + 1]));
22616 vperm = force_reg (V32QImode, vperm);
22617 h[i] = gen_reg_rtx (V32QImode);
22618 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
22619 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
22620 }
22621
22622 /* Swap the 128-bit lanes of h[X]. */
22623 for (i = 0; i < 2; ++i)
22624 {
22625 if (h[i] == NULL_RTX)
22626 continue;
22627 op = gen_reg_rtx (V4DImode);
22628 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
22629 const2_rtx, GEN_INT (3), const0_rtx,
22630 const1_rtx));
22631 h[i] = gen_lowpart (V32QImode, op);
22632 }
22633
22634 for (i = 0; i < 2; ++i)
22635 {
22636 if (!used[2 * i])
22637 {
22638 l[i] = NULL_RTX;
22639 continue;
22640 }
22641 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
22642 vperm = force_reg (V32QImode, vperm);
22643 l[i] = gen_reg_rtx (V32QImode);
22644 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
22645 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
22646 }
22647
22648 for (i = 0; i < 2; ++i)
22649 {
22650 if (h[i] && l[i])
22651 {
22652 op = gen_reg_rtx (V32QImode);
22653 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
22654 l[i] = op;
22655 }
22656 else if (h[i])
22657 l[i] = h[i];
22658 }
22659
22660 gcc_assert (l[0] && l[1]);
22661 op = d->target;
22662 if (d->vmode != V32QImode)
22663 op = gen_reg_rtx (V32QImode);
22664 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
22665 if (op != d->target)
22666 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
22667 return true;
22668 }
22669
22670 /* The guts of ix86_vectorize_vec_perm_const. With all of the interface bits
22671 taken care of, perform the expansion in D and return true on success. */
22672
22673 static bool
22674 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
22675 {
22676 /* Try a single instruction expansion. */
22677 if (expand_vec_perm_1 (d))
22678 return true;
22679
22680 /* Try sequences of two instructions. */
22681
22682 if (expand_vec_perm_pshuflw_pshufhw (d))
22683 return true;
22684
22685 if (expand_vec_perm_palignr (d, false))
22686 return true;
22687
22688 if (expand_vec_perm_interleave2 (d))
22689 return true;
22690
22691 if (expand_vec_perm_broadcast (d))
22692 return true;
22693
22694 if (expand_vec_perm_vpermq_perm_1 (d))
22695 return true;
22696
22697 if (expand_vec_perm_vperm2f128 (d))
22698 return true;
22699
22700 if (expand_vec_perm_pblendv (d))
22701 return true;
22702
22703 if (expand_vec_perm_2perm_interleave (d, true))
22704 return true;
22705
22706 if (expand_vec_perm_2perm_pblendv (d, true))
22707 return true;
22708
22709 if (expand_vec_perm_shufps_shufps (d))
22710 return true;
22711
22712 /* Try sequences of three instructions. */
22713
22714 if (expand_vec_perm_even_odd_pack (d))
22715 return true;
22716
22717 if (expand_vec_perm_2vperm2f128_vshuf (d))
22718 return true;
22719
22720 if (expand_vec_perm_pshufb2 (d))
22721 return true;
22722
22723 if (expand_vec_perm_pslldq_psrldq_por (d, false))
22724 return true;
22725
22726 if (expand_vec_perm_interleave3 (d))
22727 return true;
22728
22729 if (expand_vec_perm_vperm2f128_vblend (d))
22730 return true;
22731
22732 if (expand_vec_perm_2perm_interleave (d, false))
22733 return true;
22734
22735 if (expand_vec_perm_2perm_pblendv (d, false))
22736 return true;
22737
22738 /* Try sequences of four instructions. */
22739
22740 if (expand_vec_perm_even_odd_trunc (d))
22741 return true;
22742 if (expand_vec_perm_vpshufb2_vpermq (d))
22743 return true;
22744
22745 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
22746 return true;
22747
22748 if (expand_vec_perm_vpermt2_vpshub2 (d))
22749 return true;
22750
22751 /* ??? Look for narrow permutations whose element orderings would
22752 allow the promotion to a wider mode. */
22753
22754 /* ??? Look for sequences of interleave or a wider permute that place
22755 the data into the correct lanes for a half-vector shuffle like
22756 pshuf[lh]w or vpermilps. */
22757
22758 /* ??? Look for sequences of interleave that produce the desired results.
22759 The combinatorics of punpck[lh] get pretty ugly... */
22760
22761 if (expand_vec_perm_even_odd (d))
22762 return true;
22763
22764 /* Generate four or five instructions. */
22765 if (expand_vec_perm_pslldq_psrldq_por (d, true))
22766 return true;
22767
22768 /* Even longer sequences. */
22769 if (expand_vec_perm_vpshufb4_vpermq2 (d))
22770 return true;
22771
22772 /* See if we can get the same permutation in different vector integer
22773 mode. */
22774 struct expand_vec_perm_d nd;
22775 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
22776 {
22777 if (!d->testing_p)
22778 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
22779 return true;
22780 }
22781
22782 /* Even longer, including recursion to ix86_expand_vec_perm_const_1. */
22783 if (expand_vec_perm2_vperm2f128_vblend (d))
22784 return true;
22785
22786 return false;
22787 }
22788
22789 /* If a permutation only uses one operand, make it clear. Returns true
22790 if the permutation references both operands. */
22791
22792 static bool
22793 canonicalize_perm (struct expand_vec_perm_d *d)
22794 {
22795 int i, which, nelt = d->nelt;
22796
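  /* Bit 0 of WHICH is set when any element is taken from the first operand,
     bit 1 when any element is taken from the second operand.  */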
22797 for (i = which = 0; i < nelt; ++i)
22798 which |= (d->perm[i] < nelt ? 1 : 2);
22799
22800 d->one_operand_p = true;
22801 switch (which)
22802 {
22803 default:
22804 gcc_unreachable();
22805
22806 case 3:
22807 if (!rtx_equal_p (d->op0, d->op1))
22808 {
22809 d->one_operand_p = false;
22810 break;
22811 }
22812 /* The elements of PERM do not suggest that only the first operand
22813 is used, but both operands are identical. Allow easier matching
22814 of the permutation by folding the permutation into the single
22815 input vector. */
22816 /* FALLTHRU */
22817
22818 case 2:
22819 for (i = 0; i < nelt; ++i)
22820 d->perm[i] &= nelt - 1;
22821 d->op0 = d->op1;
22822 break;
22823
22824 case 1:
22825 d->op1 = d->op0;
22826 break;
22827 }
22828
22829 return (which == 3);
22830 }
22831
22832 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
22833
22834 bool
22835 ix86_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
22836 rtx target, rtx op0, rtx op1,
22837 const vec_perm_indices &sel)
22838 {
22839 if (vmode != op_mode)
22840 return false;
22841
22842 struct expand_vec_perm_d d;
22843 unsigned char perm[MAX_VECT_LEN];
22844 unsigned int i, nelt, which;
22845 bool two_args;
22846
22847 /* For an HFmode vector, convert it to HImode using a subreg. */
22848 if (GET_MODE_INNER (vmode) == HFmode)
22849 {
22850 machine_mode orig_mode = vmode;
22851 vmode = mode_for_vector (HImode,
22852 GET_MODE_NUNITS (vmode)).require ();
22853 if (target)
22854 target = lowpart_subreg (vmode, target, orig_mode);
22855 if (op0)
22856 op0 = lowpart_subreg (vmode, op0, orig_mode);
22857 if (op1)
22858 op1 = lowpart_subreg (vmode, op1, orig_mode);
22859 }
22860
22861 d.target = target;
22862 d.op0 = op0;
22863 d.op1 = op1;
22864
22865 d.vmode = vmode;
22866 gcc_assert (VECTOR_MODE_P (d.vmode));
22867 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
22868 d.testing_p = !target;
22869
22870 gcc_assert (sel.length () == nelt);
22871 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
22872
22873 /* Given sufficient ISA support we can just return true here
22874 for selected vector modes. */
22875 switch (d.vmode)
22876 {
22877 case E_V16SFmode:
22878 case E_V16SImode:
22879 case E_V8DImode:
22880 case E_V8DFmode:
22881 if (!TARGET_AVX512F)
22882 return false;
22883 /* All implementable with a single vperm[it]2 insn. */
22884 if (d.testing_p)
22885 return true;
22886 break;
22887 case E_V32HImode:
22888 if (!TARGET_AVX512F)
22889 return false;
22890 if (d.testing_p && TARGET_AVX512BW)
22891 /* All implementable with a single vperm[it]2 insn. */
22892 return true;
22893 break;
22894 case E_V64QImode:
22895 if (!TARGET_AVX512F)
22896 return false;
22897 if (d.testing_p && TARGET_AVX512BW)
22898 /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */
22899 return true;
22900 break;
22901 case E_V8SImode:
22902 case E_V8SFmode:
22903 case E_V4DFmode:
22904 case E_V4DImode:
22905 if (!TARGET_AVX)
22906 return false;
22907 if (d.testing_p && TARGET_AVX512VL)
22908 /* All implementable with a single vperm[it]2 insn. */
22909 return true;
22910 break;
22911 case E_V16HImode:
22912 if (!TARGET_SSE2)
22913 return false;
22914 if (d.testing_p && TARGET_AVX2)
22915 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
22916 return true;
22917 break;
22918 case E_V32QImode:
22919 if (!TARGET_SSE2)
22920 return false;
22921 if (d.testing_p && TARGET_AVX2)
22922 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
22923 return true;
22924 break;
22925 case E_V8HImode:
22926 case E_V16QImode:
22927 if (!TARGET_SSE2)
22928 return false;
22929 /* Fall through. */
22930 case E_V4SImode:
22931 case E_V4SFmode:
22932 if (!TARGET_SSE)
22933 return false;
22934 /* All implementable with a single vpperm insn. */
22935 if (d.testing_p && TARGET_XOP)
22936 return true;
22937 /* All implementable with 2 pshufb + 1 ior. */
22938 if (d.testing_p && TARGET_SSSE3)
22939 return true;
22940 break;
22941 case E_V2SFmode:
22942 case E_V2SImode:
22943 case E_V4HImode:
22944 case E_V8QImode:
22945 if (!TARGET_MMX_WITH_SSE)
22946 return false;
22947 break;
22948 case E_V2HImode:
22949 if (!TARGET_SSE2)
22950 return false;
22951 /* All implementable with *punpckwd. */
22952 if (d.testing_p)
22953 return true;
22954 break;
22955 case E_V4QImode:
22956 if (!TARGET_SSE2)
22957 return false;
22958 break;
22959 case E_V2DImode:
22960 case E_V2DFmode:
22961 if (!TARGET_SSE)
22962 return false;
22963 /* All implementable with shufpd or unpck[lh]pd. */
22964 if (d.testing_p)
22965 return true;
22966 break;
22967 default:
22968 return false;
22969 }
22970
22971 for (i = which = 0; i < nelt; ++i)
22972 {
22973 unsigned char e = sel[i];
22974 gcc_assert (e < 2 * nelt);
22975 d.perm[i] = e;
22976 perm[i] = e;
22977 which |= (e < nelt ? 1 : 2);
22978 }
22979
22980 if (d.testing_p)
22981 {
22982 /* If all elements are selected from the second vector, fold them onto the first. */
22983 if (which == 2)
22984 for (i = 0; i < nelt; ++i)
22985 d.perm[i] -= nelt;
22986
22987 /* Check whether the mask can be applied to the vector type. */
22988 d.one_operand_p = (which != 3);
22989
22990 /* Implementable with shufps, pshufd or pshuflw. */
22991 if (d.one_operand_p
22992 && (d.vmode == V4SFmode || d.vmode == V2SFmode
22993 || d.vmode == V4SImode || d.vmode == V2SImode
22994 || d.vmode == V4HImode || d.vmode == V2HImode))
22995 return true;
22996
22997 /* Otherwise we have to go through the motions and see if we can
22998 figure out how to generate the requested permutation. */
22999 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
23000 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
23001 if (!d.one_operand_p)
23002 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
23003
23004 start_sequence ();
23005 bool ret = ix86_expand_vec_perm_const_1 (&d);
23006 end_sequence ();
23007
23008 return ret;
23009 }
23010
23011 two_args = canonicalize_perm (&d);
23012
23013 /* If one of the operands is a zero vector, try to match pmovzx. */
23014 if (two_args && (d.op0 == CONST0_RTX (vmode) || d.op1 == CONST0_RTX (vmode)))
23015 {
23016 struct expand_vec_perm_d dzero = d;
23017 if (d.op0 == CONST0_RTX (vmode))
23018 {
23019 d.op1 = dzero.op1 = force_reg (vmode, d.op1);
23020 std::swap (dzero.op0, dzero.op1);
23021 for (i = 0; i < nelt; ++i)
23022 dzero.perm[i] ^= nelt;
23023 }
23024 else
23025 d.op0 = dzero.op0 = force_reg (vmode, d.op0);
23026
23027 if (expand_vselect_vconcat (dzero.target, dzero.op0, dzero.op1,
23028 dzero.perm, nelt, dzero.testing_p))
23029 return true;
23030 }
23031
23032 /* Force operands into registers. */
23033 rtx nop0 = force_reg (vmode, d.op0);
23034 if (d.op0 == d.op1)
23035 d.op1 = nop0;
23036 d.op0 = nop0;
23037 d.op1 = force_reg (vmode, d.op1);
23038
23039 if (ix86_expand_vec_perm_const_1 (&d))
23040 return true;
23041
23042 /* If the selector says both arguments are needed, but the operands are the
23043 same, the above tried to expand with one_operand_p and flattened selector.
23044 If that didn't work, retry without one_operand_p; we succeeded with that
23045 during testing. */
23046 if (two_args && d.one_operand_p)
23047 {
23048 d.one_operand_p = false;
23049 memcpy (d.perm, perm, sizeof (perm));
23050 return ix86_expand_vec_perm_const_1 (&d);
23051 }
23052
23053 return false;
23054 }
23055
23056 void
23057 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
23058 {
23059 struct expand_vec_perm_d d;
23060 unsigned i, nelt;
23061
23062 d.target = targ;
23063 d.op0 = op0;
23064 d.op1 = op1;
23065 d.vmode = GET_MODE (targ);
23066 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
23067 d.one_operand_p = false;
23068 d.testing_p = false;
23069
23070 for (i = 0; i < nelt; ++i)
23071 d.perm[i] = i * 2 + odd;
23072
23073 /* We'll either be able to implement the permutation directly... */
23074 if (expand_vec_perm_1 (&d))
23075 return;
23076
23077 /* ... or we use the special-case patterns. */
23078 expand_vec_perm_even_odd_1 (&d, odd);
23079 }
23080
23081 static void
23082 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
23083 {
23084 struct expand_vec_perm_d d;
23085 unsigned i, nelt, base;
23086 bool ok;
23087
23088 d.target = targ;
23089 d.op0 = op0;
23090 d.op1 = op1;
23091 d.vmode = GET_MODE (targ);
23092 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
23093 d.one_operand_p = false;
23094 d.testing_p = false;
23095
23096 base = high_p ? nelt / 2 : 0;
23097 for (i = 0; i < nelt / 2; ++i)
23098 {
23099 d.perm[i * 2] = i + base;
23100 d.perm[i * 2 + 1] = i + base + nelt;
23101 }
23102
23103 /* Note that for AVX this isn't one instruction. */
23104 ok = ix86_expand_vec_perm_const_1 (&d);
23105 gcc_assert (ok);
23106 }
23107
23108 /* This function is similar to ix86_expand_vecop_qihi,
23109    but optimized under AVX512BW by using vpmovwb.
23110    For example, vector MUL is generated like
23111 
23112    vpmovzxbw ymm2, xmm0
23113    vpmovzxbw ymm3, xmm1
23114    vpmullw ymm4, ymm2, ymm3
23115    vpmovwb xmm0, ymm4
23116 
23117    which takes fewer instructions than ix86_expand_vecop_qihi.
23118    Return true on success.  */
23119
23120 static bool
23121 ix86_expand_vecop_qihi2 (enum rtx_code code, rtx dest, rtx op1, rtx op2)
23122 {
23123 machine_mode himode, qimode = GET_MODE (dest);
23124 rtx hop1, hop2, hdest;
23125 rtx (*gen_truncate)(rtx, rtx);
23126 bool uns_p = (code == ASHIFTRT) ? false : true;
23127
23128 /* There are no V64HImode instructions. */
23129 if (qimode == V64QImode)
23130 return false;
23131
23132 /* vpmovwb is only available under AVX512BW. */
23133 if (!TARGET_AVX512BW)
23134 return false;
23135 if ((qimode == V8QImode || qimode == V16QImode)
23136 && !TARGET_AVX512VL)
23137 return false;
23138 /* Do not generate ymm/zmm instructions when
23139 target prefers 128/256 bit vector width. */
23140 if ((qimode == V16QImode && TARGET_PREFER_AVX128)
23141 || (qimode == V32QImode && TARGET_PREFER_AVX256))
23142 return false;
23143
23144 switch (qimode)
23145 {
23146 case E_V8QImode:
23147 himode = V8HImode;
23148 gen_truncate = gen_truncv8hiv8qi2;
23149 break;
23150 case E_V16QImode:
23151 himode = V16HImode;
23152 gen_truncate = gen_truncv16hiv16qi2;
23153 break;
23154 case E_V32QImode:
23155 himode = V32HImode;
23156 gen_truncate = gen_truncv32hiv32qi2;
23157 break;
23158 default:
23159 gcc_unreachable ();
23160 }
23161
23162 hop1 = gen_reg_rtx (himode);
23163 hop2 = gen_reg_rtx (himode);
23164 hdest = gen_reg_rtx (himode);
23165 emit_insn (gen_extend_insn (hop1, op1, himode, qimode, uns_p));
23166 emit_insn (gen_extend_insn (hop2, op2, himode, qimode, uns_p));
23167 emit_insn (gen_rtx_SET (hdest, simplify_gen_binary (code, himode,
23168 hop1, hop2)));
23169 emit_insn (gen_truncate (dest, hdest));
23170 return true;
23171 }
23172
23173 /* Expand a vector shift by a constant amount for a V*QImode in terms of the
23174    same operation on V*HImode.  Return true on success.  */
23175 static bool
23176 ix86_expand_vec_shift_qihi_constant (enum rtx_code code,
23177 rtx dest, rtx op1, rtx op2)
23178 {
23179 machine_mode qimode, himode;
23180 HOST_WIDE_INT and_constant, xor_constant;
23181 HOST_WIDE_INT shift_amount;
23182 rtx vec_const_and, vec_const_xor;
23183 rtx tmp, op1_subreg;
23184 rtx (*gen_shift) (rtx, rtx, rtx);
23185 rtx (*gen_and) (rtx, rtx, rtx);
23186 rtx (*gen_xor) (rtx, rtx, rtx);
23187 rtx (*gen_sub) (rtx, rtx, rtx);
23188
23189 /* Only optimize shift by constant. */
23190 if (!CONST_INT_P (op2))
23191 return false;
23192
23193 qimode = GET_MODE (dest);
23194 shift_amount = INTVAL (op2);
23195 /* Do nothing when the shift amount is greater than or equal to 8. */
23196 if (shift_amount > 7)
23197 return false;
23198
23199 gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
23200 /* Record the position of the sign bit after the shift. */
23201 xor_constant = 1 << (8 - shift_amount - 1);
23202 
23203 /* Mask off the bits shifted into each byte from its neighbouring byte
      within the word: the low bits for a left shift, the high bits for a
      right shift. */
23204 and_constant
23205 = (code == ASHIFT ? 256 - (1 << shift_amount)
23206 : (1 << (8 - shift_amount)) - 1);
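  /* For example, an LSHIFTRT by 3 is done as a V*HImode psrlw by 3, which can
     shift three bits of the neighbouring byte into the top of a byte; the
     subsequent AND with 0x1f ((1 << (8 - 3)) - 1) clears them again.  */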
23207
23208 switch (qimode)
23209 {
23210 case V16QImode:
23211 himode = V8HImode;
23212 gen_shift =
23213 ((code == ASHIFT)
23214 ? gen_ashlv8hi3
23215 : (code == ASHIFTRT) ? gen_ashrv8hi3 : gen_lshrv8hi3);
23216 gen_and = gen_andv16qi3;
23217 gen_xor = gen_xorv16qi3;
23218 gen_sub = gen_subv16qi3;
23219 break;
23220 case V32QImode:
23221 himode = V16HImode;
23222 gen_shift =
23223 ((code == ASHIFT)
23224 ? gen_ashlv16hi3
23225 : (code == ASHIFTRT) ? gen_ashrv16hi3 : gen_lshrv16hi3);
23226 gen_and = gen_andv32qi3;
23227 gen_xor = gen_xorv32qi3;
23228 gen_sub = gen_subv32qi3;
23229 break;
23230 case V64QImode:
23231 himode = V32HImode;
23232 gen_shift =
23233 ((code == ASHIFT)
23234 ? gen_ashlv32hi3
23235 : (code == ASHIFTRT) ? gen_ashrv32hi3 : gen_lshrv32hi3);
23236 gen_and = gen_andv64qi3;
23237 gen_xor = gen_xorv64qi3;
23238 gen_sub = gen_subv64qi3;
23239 break;
23240 default:
23241 gcc_unreachable ();
23242 }
23243
23244 tmp = gen_reg_rtx (himode);
23245 vec_const_and = gen_reg_rtx (qimode);
23246 op1_subreg = lowpart_subreg (himode, op1, qimode);
23247
23248 /* For ASHIFT and LSHIFTRT, the operation is performed as
23249 vpsllw/vpsrlw $shift_amount, %op1, %dest
23250 vpand %vec_const_and, %dest, %dest.  */
23251 emit_insn (gen_shift (tmp, op1_subreg, op2));
23252 emit_move_insn (dest, simplify_gen_subreg (qimode, tmp, himode, 0));
23253 emit_move_insn (vec_const_and,
23254 ix86_build_const_vector (qimode, true,
23255 gen_int_mode (and_constant, QImode)));
23256 emit_insn (gen_and (dest, dest, vec_const_and));
23257
23258 /* For ASHIFTRT, additionally perform
23259 vpxor %vec_const_xor, %dest, %dest
23260 vpsubb %vec_const_xor, %dest, %dest.  */
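  /* After the word shift and the AND above, each byte holds its result with
     the original sign bit at position 7 - shift_amount and zeros above it.
     The xor/sub pair sign-extends it via the identity (x ^ c) - c, with
     c = xor_constant = 1 << (7 - shift_amount).  */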
23261 if (code == ASHIFTRT)
23262 {
23263 vec_const_xor = gen_reg_rtx (qimode);
23264 emit_move_insn (vec_const_xor,
23265 ix86_build_const_vector (qimode, true,
23266 gen_int_mode (xor_constant, QImode)));
23267 emit_insn (gen_xor (dest, dest, vec_const_xor));
23268 emit_insn (gen_sub (dest, dest, vec_const_xor));
23269 }
23270 return true;
23271 }
23272
23273 void
23274 ix86_expand_vecop_qihi_partial (enum rtx_code code, rtx dest, rtx op1, rtx op2)
23275 {
23276 machine_mode qimode = GET_MODE (dest);
23277 rtx qop1, qop2, hop1, hop2, qdest, hres;
23278 bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
23279 bool uns_p = true;
23280
23281 switch (qimode)
23282 {
23283 case E_V4QImode:
23284 case E_V8QImode:
23285 break;
23286 default:
23287 gcc_unreachable ();
23288 }
23289
23290 qop1 = lowpart_subreg (V16QImode, force_reg (qimode, op1), qimode);
23291
23292 if (op2vec)
23293 qop2 = lowpart_subreg (V16QImode, force_reg (qimode, op2), qimode);
23294 else
23295 qop2 = op2;
23296
23297 switch (code)
23298 {
23299 case MULT:
23300 gcc_assert (op2vec);
23301 /* Unpack data such that we've got a source byte in each low byte of
23302 each word. We don't care what goes into the high byte of each word.
23303 Rather than trying to get zero in there, most convenient is to let
23304 it be a copy of the low byte. */
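  /* The word-level multiply below then produces the desired byte product in
     the low byte of each word; whatever lands in the high byte is discarded
     when the result is packed back to QImode.  */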
23305 hop1 = copy_to_reg (qop1);
23306 hop2 = copy_to_reg (qop2);
23307 emit_insn (gen_vec_interleave_lowv16qi (hop1, hop1, hop1));
23308 emit_insn (gen_vec_interleave_lowv16qi (hop2, hop2, hop2));
23309 break;
23310
23311 case ASHIFTRT:
23312 uns_p = false;
23313 /* FALLTHRU */
23314 case ASHIFT:
23315 case LSHIFTRT:
23316 hop1 = gen_reg_rtx (V8HImode);
23317 ix86_expand_sse_unpack (hop1, qop1, uns_p, false);
23318 /* vashr/vlshr/vashl */
23319 if (op2vec)
23320 {
23321 hop2 = gen_reg_rtx (V8HImode);
23322 ix86_expand_sse_unpack (hop2, qop2, uns_p, false);
23323 }
23324 else
23325 hop2 = qop2;
23326
23327 break;
23328 default:
23329 gcc_unreachable ();
23330 }
23331
23332 if (code != MULT && op2vec)
23333 {
23334 /* Expand vashr/vlshr/vashl. */
23335 hres = gen_reg_rtx (V8HImode);
23336 emit_insn (gen_rtx_SET (hres,
23337 simplify_gen_binary (code, V8HImode,
23338 hop1, hop2)));
23339 }
23340 else
23341 /* Expand mult/ashr/lshr/ashl. */
23342 hres = expand_simple_binop (V8HImode, code, hop1, hop2,
23343 NULL_RTX, 1, OPTAB_DIRECT);
23344
23345 if (TARGET_AVX512BW && TARGET_AVX512VL)
23346 {
23347 if (qimode == V8QImode)
23348 qdest = dest;
23349 else
23350 qdest = gen_reg_rtx (V8QImode);
23351
23352 emit_insn (gen_truncv8hiv8qi2 (qdest, hres));
23353 }
23354 else
23355 {
23356 struct expand_vec_perm_d d;
23357 rtx qres = gen_lowpart (V16QImode, hres);
23358 bool ok;
23359 int i;
23360
23361 qdest = gen_reg_rtx (V16QImode);
23362
23363 /* Merge the data back into the right place. */
23364 d.target = qdest;
23365 d.op0 = qres;
23366 d.op1 = qres;
23367 d.vmode = V16QImode;
23368 d.nelt = 16;
23369 d.one_operand_p = false;
23370 d.testing_p = false;
23371
23372 for (i = 0; i < d.nelt; ++i)
23373 d.perm[i] = i * 2;
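      /* Taking every even byte of the V16QImode view selects the low byte of
	 each HImode result, i.e. the truncated QImode value.  */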
23374
23375 ok = ix86_expand_vec_perm_const_1 (&d);
23376 gcc_assert (ok);
23377 }
23378
23379 if (qdest != dest)
23380 emit_move_insn (dest, gen_lowpart (qimode, qdest));
23381 }
23382
23383 /* Expand a vector operation CODE for a V*QImode in terms of the
23384 same operation on V*HImode. */
23385
23386 void
23387 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
23388 {
23389 machine_mode qimode = GET_MODE (dest);
23390 machine_mode himode;
23391 rtx (*gen_il) (rtx, rtx, rtx);
23392 rtx (*gen_ih) (rtx, rtx, rtx);
23393 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
23394 bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
23395 struct expand_vec_perm_d d;
23396 bool full_interleave = true;
23397 bool uns_p = true;
23398 bool ok;
23399 int i;
23400
23401 if (CONST_INT_P (op2)
23402 && (code == ASHIFT || code == LSHIFTRT || code == ASHIFTRT)
23403 && ix86_expand_vec_shift_qihi_constant (code, dest, op1, op2))
23404 return;
23405
23406 if (TARGET_AVX512BW
23407 && VECTOR_MODE_P (GET_MODE (op2))
23408 && ix86_expand_vecop_qihi2 (code, dest, op1, op2))
23409 return;
23410
23411 switch (qimode)
23412 {
23413 case E_V16QImode:
23414 himode = V8HImode;
23415 break;
23416 case E_V32QImode:
23417 himode = V16HImode;
23418 break;
23419 case E_V64QImode:
23420 himode = V32HImode;
23421 break;
23422 default:
23423 gcc_unreachable ();
23424 }
23425
23426 switch (code)
23427 {
23428 case MULT:
23429 gcc_assert (op2vec);
23430 /* Unpack data such that we've got a source byte in each low byte of
23431 each word. We don't care what goes into the high byte of each word.
23432 Rather than trying to get zero in there, most convenient is to let
23433 it be a copy of the low byte. */
23434 switch (qimode)
23435 {
23436 case E_V16QImode:
23437 gen_il = gen_vec_interleave_lowv16qi;
23438 gen_ih = gen_vec_interleave_highv16qi;
23439 break;
23440 case E_V32QImode:
23441 gen_il = gen_avx2_interleave_lowv32qi;
23442 gen_ih = gen_avx2_interleave_highv32qi;
23443 full_interleave = false;
23444 break;
23445 case E_V64QImode:
23446 gen_il = gen_avx512bw_interleave_lowv64qi;
23447 gen_ih = gen_avx512bw_interleave_highv64qi;
23448 full_interleave = false;
23449 break;
23450 default:
23451 gcc_unreachable ();
23452 }
23453
23454 op2_l = gen_reg_rtx (qimode);
23455 op2_h = gen_reg_rtx (qimode);
23456 emit_insn (gen_il (op2_l, op2, op2));
23457 emit_insn (gen_ih (op2_h, op2, op2));
23458
23459 op1_l = gen_reg_rtx (qimode);
23460 op1_h = gen_reg_rtx (qimode);
23461 emit_insn (gen_il (op1_l, op1, op1));
23462 emit_insn (gen_ih (op1_h, op1, op1));
23463 break;
23464
23465 case ASHIFTRT:
23466 uns_p = false;
23467 /* FALLTHRU */
23468 case ASHIFT:
23469 case LSHIFTRT:
23470 op1_l = gen_reg_rtx (himode);
23471 op1_h = gen_reg_rtx (himode);
23472 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
23473 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
23474 /* vashr/vlshr/vashl */
23475 if (op2vec)
23476 {
23477 rtx tmp = force_reg (qimode, op2);
23478 op2_l = gen_reg_rtx (himode);
23479 op2_h = gen_reg_rtx (himode);
23480 ix86_expand_sse_unpack (op2_l, tmp, uns_p, false);
23481 ix86_expand_sse_unpack (op2_h, tmp, uns_p, true);
23482 }
23483 else
23484 op2_l = op2_h = op2;
23485
23486 break;
23487 default:
23488 gcc_unreachable ();
23489 }
23490
23491 if (code != MULT && op2vec)
23492 {
23493 /* Expand vashr/vlshr/vashl. */
23494 res_l = gen_reg_rtx (himode);
23495 res_h = gen_reg_rtx (himode);
23496 emit_insn (gen_rtx_SET (res_l,
23497 simplify_gen_binary (code, himode,
23498 op1_l, op2_l)));
23499 emit_insn (gen_rtx_SET (res_h,
23500 simplify_gen_binary (code, himode,
23501 op1_h, op2_h)));
23502 }
23503 else
23504 {
23505 /* Expand mult/ashr/lshr/ashl. */
23506 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
23507 1, OPTAB_DIRECT);
23508 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
23509 1, OPTAB_DIRECT);
23510 }
23511
23512 gcc_assert (res_l && res_h);
23513
23514 /* Merge the data back into the right place. */
23515 d.target = dest;
23516 d.op0 = gen_lowpart (qimode, res_l);
23517 d.op1 = gen_lowpart (qimode, res_h);
23518 d.vmode = qimode;
23519 d.nelt = GET_MODE_NUNITS (qimode);
23520 d.one_operand_p = false;
23521 d.testing_p = false;
23522
23523 if (full_interleave)
23524 {
23525 /* We used the full interleave, the desired
23526 results are in the even elements. */
23527 for (i = 0; i < d.nelt; ++i)
23528 d.perm[i] = i * 2;
23529 }
23530 else
23531 {
23532 /* For AVX, the interleave used above was not cross-lane.  So the
23533    extraction takes the even elements, but with the second and third
23534    quarters swapped.  Happily, that is even one insn shorter than plain
23535    even extraction.  For AVX512BW we have 4 lanes.  We extract the even
23536    elements from within a lane, always first from the first and then from
23537    the second source operand; the index bits above the low 4 bits remain the same.
23538 Thus, for d.nelt == 32 we want permutation
23539 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
23540 and for d.nelt == 64 we want permutation
23541 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
23542 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
23543 for (i = 0; i < d.nelt; ++i)
23544 d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
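      /* (i * 2) & 14 is the even byte offset within a 16-byte lane, i & 8
	 picks the low or high result vector, and i & ~15 keeps the lane
	 base unchanged.  */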
23545 }
23546
23547 ok = ix86_expand_vec_perm_const_1 (&d);
23548 gcc_assert (ok);
23549 }
23550
23551 /* Helper function of ix86_expand_mul_widen_evenodd.  Return true
23552    if OP is a CONST_VECTOR with all odd elements equal to their
23553    preceding element.  */
23554
23555 static bool
23556 const_vector_equal_evenodd_p (rtx op)
23557 {
23558 machine_mode mode = GET_MODE (op);
23559 int i, nunits = GET_MODE_NUNITS (mode);
23560 if (GET_CODE (op) != CONST_VECTOR
23561 || nunits != CONST_VECTOR_NUNITS (op))
23562 return false;
23563 for (i = 0; i < nunits; i += 2)
23564 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
23565 return false;
23566 return true;
23567 }
23568
23569 void
23570 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
23571 bool uns_p, bool odd_p)
23572 {
23573 machine_mode mode = GET_MODE (op1);
23574 machine_mode wmode = GET_MODE (dest);
23575 rtx x;
23576 rtx orig_op1 = op1, orig_op2 = op2;
23577
23578 if (!nonimmediate_operand (op1, mode))
23579 op1 = force_reg (mode, op1);
23580 if (!nonimmediate_operand (op2, mode))
23581 op2 = force_reg (mode, op2);
23582
23583 /* We only play even/odd games with vectors of SImode. */
23584 gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
23585
23586 /* If we're looking for the odd results, shift those members down to
23587    the even slots.  For some CPUs this is faster than a PSHUFD. */
23588 if (odd_p)
23589 {
23590 /* For XOP use vpmacsdqh, but only for smult, as it is only
23591 signed. */
23592 if (TARGET_XOP && mode == V4SImode && !uns_p)
23593 {
23594 x = force_reg (wmode, CONST0_RTX (wmode));
23595 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
23596 return;
23597 }
23598
23599 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
23600 if (!const_vector_equal_evenodd_p (orig_op1))
23601 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
23602 x, NULL, 1, OPTAB_DIRECT);
23603 if (!const_vector_equal_evenodd_p (orig_op2))
23604 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
23605 x, NULL, 1, OPTAB_DIRECT);
23606 op1 = gen_lowpart (mode, op1);
23607 op2 = gen_lowpart (mode, op2);
23608 }
23609
23610 if (mode == V16SImode)
23611 {
23612 if (uns_p)
23613 x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
23614 else
23615 x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
23616 }
23617 else if (mode == V8SImode)
23618 {
23619 if (uns_p)
23620 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
23621 else
23622 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
23623 }
23624 else if (uns_p)
23625 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
23626 else if (TARGET_SSE4_1)
23627 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
23628 else
23629 {
23630 rtx s1, s2, t0, t1, t2;
23631
23632 /* The easiest way to implement this without PMULDQ is to go through
23633 the motions as if we are performing a full 64-bit multiply. With
23634 the exception that we need to do less shuffling of the elements. */
23635
23636 /* Compute the sign-extension, aka highparts, of the two operands. */
23637 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
23638 op1, pc_rtx, pc_rtx);
23639 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
23640 op2, pc_rtx, pc_rtx);
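      /* s1 and s2 are 0 or all-ones, i.e. the high halves of the
	 sign-extended operands; multiplied by the other operand and shifted
	 into the high 32 bits below, they supply the signedness correction
	 for the unsigned widening multiply.  */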
23641
23642 /* Multiply LO(A) * HI(B), and vice-versa. */
23643 t1 = gen_reg_rtx (wmode);
23644 t2 = gen_reg_rtx (wmode);
23645 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
23646 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
23647
23648 /* Multiply LO(A) * LO(B). */
23649 t0 = gen_reg_rtx (wmode);
23650 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
23651
23652 /* Combine and shift the highparts into place. */
23653 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
23654 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
23655 1, OPTAB_DIRECT);
23656
23657 /* Combine high and low parts. */
23658 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
23659 return;
23660 }
23661 emit_insn (x);
23662 }
23663
23664 void
23665 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
23666 bool uns_p, bool high_p)
23667 {
23668 machine_mode wmode = GET_MODE (dest);
23669 machine_mode mode = GET_MODE (op1);
23670 rtx t1, t2, t3, t4, mask;
23671
23672 switch (mode)
23673 {
23674 case E_V4SImode:
23675 t1 = gen_reg_rtx (mode);
23676 t2 = gen_reg_rtx (mode);
23677 if (TARGET_XOP && !uns_p)
23678 {
23679 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
23680 shuffle the elements once so that all elements are in the right
23681 place for immediate use: { A C B D }. */
23682 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
23683 const1_rtx, GEN_INT (3)));
23684 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
23685 const1_rtx, GEN_INT (3)));
23686 }
23687 else
23688 {
23689 /* Put the elements into place for the multiply. */
23690 ix86_expand_vec_interleave (t1, op1, op1, high_p);
23691 ix86_expand_vec_interleave (t2, op2, op2, high_p);
23692 high_p = false;
23693 }
23694 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
23695 break;
23696
23697 case E_V8SImode:
23698 /* Shuffle the elements between the lanes. After this we
23699 have { A B E F | C D G H } for each operand. */
23700 t1 = gen_reg_rtx (V4DImode);
23701 t2 = gen_reg_rtx (V4DImode);
23702 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
23703 const0_rtx, const2_rtx,
23704 const1_rtx, GEN_INT (3)));
23705 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
23706 const0_rtx, const2_rtx,
23707 const1_rtx, GEN_INT (3)));
23708
23709 /* Shuffle the elements within the lanes. After this we
23710 have { A A B B | C C D D } or { E E F F | G G H H }. */
23711 t3 = gen_reg_rtx (V8SImode);
23712 t4 = gen_reg_rtx (V8SImode);
23713 mask = GEN_INT (high_p
23714 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
23715 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
23716 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
23717 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
23718
23719 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
23720 break;
23721
23722 case E_V8HImode:
23723 case E_V16HImode:
23724 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
23725 uns_p, OPTAB_DIRECT);
23726 t2 = expand_binop (mode,
23727 uns_p ? umul_highpart_optab : smul_highpart_optab,
23728 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
23729 gcc_assert (t1 && t2);
23730
23731 t3 = gen_reg_rtx (mode);
23732 ix86_expand_vec_interleave (t3, t1, t2, high_p);
23733 emit_move_insn (dest, gen_lowpart (wmode, t3));
23734 break;
23735
23736 case E_V16QImode:
23737 case E_V32QImode:
23738 case E_V32HImode:
23739 case E_V16SImode:
23740 case E_V64QImode:
23741 t1 = gen_reg_rtx (wmode);
23742 t2 = gen_reg_rtx (wmode);
23743 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
23744 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
23745
23746 emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
23747 break;
23748
23749 default:
23750 gcc_unreachable ();
23751 }
23752 }
23753
23754 void
23755 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
23756 {
23757 rtx res_1, res_2, res_3, res_4;
23758
23759 res_1 = gen_reg_rtx (V4SImode);
23760 res_2 = gen_reg_rtx (V4SImode);
23761 res_3 = gen_reg_rtx (V2DImode);
23762 res_4 = gen_reg_rtx (V2DImode);
23763 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
23764 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
23765
23766 /* Move the results in element 2 down to element 1; we don't care
23767 what goes in elements 2 and 3. Then we can merge the parts
23768 back together with an interleave.
23769
23770 Note that two other sequences were tried:
23771 (1) Use interleaves at the start instead of psrldq, which allows
23772 us to use a single shufps to merge things back at the end.
23773 (2) Use shufps here to combine the two vectors, then pshufd to
23774 put the elements in the correct order.
23775 In both cases the cost of the reformatting stall was too high
23776 and the overall sequence slower. */
23777
23778 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
23779 const0_rtx, const2_rtx,
23780 const0_rtx, const0_rtx));
23781 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
23782 const0_rtx, const2_rtx,
23783 const0_rtx, const0_rtx));
23784 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
23785
23786 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
23787 }
23788
23789 void
23790 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
23791 {
23792 machine_mode mode = GET_MODE (op0);
23793 rtx t1, t2, t3, t4, t5, t6;
23794
23795 if (TARGET_AVX512DQ && mode == V8DImode)
23796 emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
23797 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
23798 emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
23799 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
23800 emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
23801 else if (TARGET_XOP && mode == V2DImode)
23802 {
23803 /* op1: A,B,C,D, op2: E,F,G,H */
23804 op1 = gen_lowpart (V4SImode, op1);
23805 op2 = gen_lowpart (V4SImode, op2);
23806
23807 t1 = gen_reg_rtx (V4SImode);
23808 t2 = gen_reg_rtx (V4SImode);
23809 t3 = gen_reg_rtx (V2DImode);
23810 t4 = gen_reg_rtx (V2DImode);
23811
23812 /* t1: B,A,D,C */
23813 emit_insn (gen_sse2_pshufd_1 (t1, op1,
23814 GEN_INT (1),
23815 GEN_INT (0),
23816 GEN_INT (3),
23817 GEN_INT (2)));
23818
23819 /* t2: (B*E),(A*F),(D*G),(C*H) */
23820 emit_insn (gen_mulv4si3 (t2, t1, op2));
23821
23822 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
23823 emit_insn (gen_xop_phadddq (t3, t2));
23824
23825 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
23826 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
23827
23828 /* Multiply lower parts and add all */
23829 t5 = gen_reg_rtx (V2DImode);
23830 emit_insn (gen_vec_widen_umult_even_v4si (t5,
23831 gen_lowpart (V4SImode, op1),
23832 gen_lowpart (V4SImode, op2)));
23833 force_expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
23834 }
23835 else
23836 {
23837 machine_mode nmode;
23838 rtx (*umul) (rtx, rtx, rtx);
23839
23840 if (mode == V2DImode)
23841 {
23842 umul = gen_vec_widen_umult_even_v4si;
23843 nmode = V4SImode;
23844 }
23845 else if (mode == V4DImode)
23846 {
23847 umul = gen_vec_widen_umult_even_v8si;
23848 nmode = V8SImode;
23849 }
23850 else if (mode == V8DImode)
23851 {
23852 umul = gen_vec_widen_umult_even_v16si;
23853 nmode = V16SImode;
23854 }
23855 else
23856 gcc_unreachable ();
23857
23858
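      /* With only an even 32 x 32 -> 64 bit widening multiply available,
	 compute op1 * op2 as lo1*lo2 + ((hi1*lo2 + lo1*hi2) << 32); the
	 hi1*hi2 term only affects bits above bit 63 and is dropped.  */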
23859 /* Multiply low parts. */
23860 t1 = gen_reg_rtx (mode);
23861 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
23862
23863 /* Shift input vectors right 32 bits so we can multiply high parts. */
23864 t6 = GEN_INT (32);
23865 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
23866 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
23867
23868 /* Multiply high parts by low parts. */
23869 t4 = gen_reg_rtx (mode);
23870 t5 = gen_reg_rtx (mode);
23871 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
23872 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
23873
23874 /* Combine and shift the highparts back. */
23875 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
23876 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
23877
23878 /* Combine high and low parts. */
23879 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
23880 }
23881
23882 set_unique_reg_note (get_last_insn (), REG_EQUAL,
23883 gen_rtx_MULT (mode, op1, op2));
23884 }
23885
23886 /* Return true if the control transfer instruction INSN
23887    should be encoded with the notrack prefix. */
23888
23889 bool
23890 ix86_notrack_prefixed_insn_p (rtx_insn *insn)
23891 {
23892 if (!insn || !((flag_cf_protection & CF_BRANCH)))
23893 return false;
23894
23895 if (CALL_P (insn))
23896 {
23897 rtx call = get_call_rtx_from (insn);
23898 gcc_assert (call != NULL_RTX);
23899 rtx addr = XEXP (call, 0);
23900
23901 /* Do not emit 'notrack' if it's not an indirect call. */
23902 if (MEM_P (addr)
23903 && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
23904 return false;
23905 else
23906 return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
23907 }
23908
23909 if (JUMP_P (insn) && !flag_cet_switch)
23910 {
23911 rtx target = JUMP_LABEL (insn);
23912 if (target == NULL_RTX || ANY_RETURN_P (target))
23913 return false;
23914
23915 /* Check the jump is a switch table. */
23916 rtx_insn *label = as_a<rtx_insn *> (target);
23917 rtx_insn *table = next_insn (label);
23918 if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
23919 return false;
23920 else
23921 return true;
23922 }
23923 return false;
23924 }
23925
23926 /* Calculate integer abs() using only SSE2 instructions. */
23927
23928 void
23929 ix86_expand_sse2_abs (rtx target, rtx input)
23930 {
23931 machine_mode mode = GET_MODE (target);
23932 rtx tmp0, tmp1, x;
23933
23934 switch (mode)
23935 {
23936 case E_V2DImode:
23937 case E_V4DImode:
23938 /* For 64-bit signed integer X, with SSE4.2 use
23939 pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
23940 Otherwise handle it similarly to V4SImode, except use 64 as W instead of
23941 32 and use logical instead of arithmetic right shift (which is
23942 unimplemented) and subtract. */
23943 if (TARGET_SSE4_2)
23944 {
23945 tmp0 = gen_reg_rtx (mode);
23946 tmp1 = gen_reg_rtx (mode);
23947 emit_move_insn (tmp1, CONST0_RTX (mode));
23948 if (mode == E_V2DImode)
23949 emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input));
23950 else
23951 emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input));
23952 }
23953 else
23954 {
23955 tmp0 = expand_simple_binop (mode, LSHIFTRT, input,
23956 GEN_INT (GET_MODE_UNIT_BITSIZE (mode)
23957 - 1), NULL, 0, OPTAB_DIRECT);
23958 tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false);
23959 }
23960
23961 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
23962 NULL, 0, OPTAB_DIRECT);
23963 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
23964 target, 0, OPTAB_DIRECT);
23965 break;
23966
23967 case E_V4SImode:
23968 /* For 32-bit signed integer X, the best way to calculate the absolute
23969 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
23970 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
23971 GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
23972 NULL, 0, OPTAB_DIRECT);
23973 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
23974 NULL, 0, OPTAB_DIRECT);
23975 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
23976 target, 0, OPTAB_DIRECT);
23977 break;
23978
23979 case E_V8HImode:
23980 /* For 16-bit signed integer X, the best way to calculate the absolute
23981 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
23982 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
23983
23984 x = expand_simple_binop (mode, SMAX, tmp0, input,
23985 target, 0, OPTAB_DIRECT);
23986 break;
23987
23988 case E_V16QImode:
23989 /* For 8-bit signed integer X, the best way to calculate the absolute
23990 value of X is min ((unsigned char) X, (unsigned char) (-X)),
23991 as SSE2 provides the PMINUB insn. */
23992 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
23993
23994 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
23995 target, 0, OPTAB_DIRECT);
23996 break;
23997
23998 default:
23999 gcc_unreachable ();
24000 }
24001
24002 if (x != target)
24003 emit_move_insn (target, x);
24004 }
24005
24006 /* Expand an extract from a vector register through pextr insn.
24007 Return true if successful. */
24008
24009 bool
24010 ix86_expand_pextr (rtx *operands)
24011 {
24012 rtx dst = operands[0];
24013 rtx src = operands[1];
24014
24015 unsigned int size = INTVAL (operands[2]);
24016 unsigned int pos = INTVAL (operands[3]);
24017
24018 if (SUBREG_P (dst))
24019 {
24020 /* Reject non-lowpart subregs. */
24021 if (SUBREG_BYTE (dst) > 0)
24022 return false;
24023 dst = SUBREG_REG (dst);
24024 }
24025
24026 if (SUBREG_P (src))
24027 {
24028 pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
24029 src = SUBREG_REG (src);
24030 }
24031
24032 switch (GET_MODE (src))
24033 {
24034 case E_V16QImode:
24035 case E_V8HImode:
24036 case E_V4SImode:
24037 case E_V2DImode:
24038 case E_V1TImode:
24039 {
24040 machine_mode srcmode, dstmode;
24041 rtx d, pat;
24042
24043 if (!int_mode_for_size (size, 0).exists (&dstmode))
24044 return false;
24045
24046 switch (dstmode)
24047 {
24048 case E_QImode:
24049 if (!TARGET_SSE4_1)
24050 return false;
24051 srcmode = V16QImode;
24052 break;
24053
24054 case E_HImode:
24055 if (!TARGET_SSE2)
24056 return false;
24057 srcmode = V8HImode;
24058 break;
24059
24060 case E_SImode:
24061 if (!TARGET_SSE4_1)
24062 return false;
24063 srcmode = V4SImode;
24064 break;
24065
24066 case E_DImode:
24067 gcc_assert (TARGET_64BIT);
24068 if (!TARGET_SSE4_1)
24069 return false;
24070 srcmode = V2DImode;
24071 break;
24072
24073 default:
24074 return false;
24075 }
24076
24077 /* Reject extractions from misaligned positions. */
24078 if (pos & (size-1))
24079 return false;
24080
24081 if (GET_MODE (dst) == dstmode)
24082 d = dst;
24083 else
24084 d = gen_reg_rtx (dstmode);
24085
24086 /* Construct insn pattern. */
24087 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
24088 pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
24089
24090 /* Let the rtl optimizers know about the zero extension performed. */
24091 if (dstmode == QImode || dstmode == HImode)
24092 {
24093 pat = gen_rtx_ZERO_EXTEND (SImode, pat);
24094 d = gen_lowpart (SImode, d);
24095 }
24096
24097 emit_insn (gen_rtx_SET (d, pat));
24098
24099 if (d != dst)
24100 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
24101 return true;
24102 }
24103
24104 default:
24105 return false;
24106 }
24107 }
24108
24109 /* Expand an insert into a vector register through pinsr insn.
24110 Return true if successful. */
24111
24112 bool
24113 ix86_expand_pinsr (rtx *operands)
24114 {
24115 rtx dst = operands[0];
24116 rtx src = operands[3];
24117
24118 unsigned int size = INTVAL (operands[1]);
24119 unsigned int pos = INTVAL (operands[2]);
24120
24121 if (SUBREG_P (dst))
24122 {
24123 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
24124 dst = SUBREG_REG (dst);
24125 }
24126
24127 switch (GET_MODE (dst))
24128 {
24129 case E_V16QImode:
24130 case E_V8HImode:
24131 case E_V4SImode:
24132 case E_V2DImode:
24133 case E_V1TImode:
24134 {
24135 machine_mode srcmode, dstmode;
24136 rtx (*pinsr)(rtx, rtx, rtx, rtx);
24137 rtx d;
24138
24139 if (!int_mode_for_size (size, 0).exists (&srcmode))
24140 return false;
24141
24142 switch (srcmode)
24143 {
24144 case E_QImode:
24145 if (!TARGET_SSE4_1)
24146 return false;
24147 dstmode = V16QImode;
24148 pinsr = gen_sse4_1_pinsrb;
24149 break;
24150
24151 case E_HImode:
24152 if (!TARGET_SSE2)
24153 return false;
24154 dstmode = V8HImode;
24155 pinsr = gen_sse2_pinsrw;
24156 break;
24157
24158 case E_SImode:
24159 if (!TARGET_SSE4_1)
24160 return false;
24161 dstmode = V4SImode;
24162 pinsr = gen_sse4_1_pinsrd;
24163 break;
24164
24165 case E_DImode:
24166 gcc_assert (TARGET_64BIT);
24167 if (!TARGET_SSE4_1)
24168 return false;
24169 dstmode = V2DImode;
24170 pinsr = gen_sse4_1_pinsrq;
24171 break;
24172
24173 default:
24174 return false;
24175 }
24176
24177 /* Reject insertions to misaligned positions. */
24178 if (pos & (size-1))
24179 return false;
24180
24181 if (SUBREG_P (src))
24182 {
24183 unsigned int srcpos = SUBREG_BYTE (src);
24184
24185 if (srcpos > 0)
24186 {
24187 rtx extr_ops[4];
24188
24189 extr_ops[0] = gen_reg_rtx (srcmode);
24190 extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
24191 extr_ops[2] = GEN_INT (size);
24192 extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
24193
24194 if (!ix86_expand_pextr (extr_ops))
24195 return false;
24196
24197 src = extr_ops[0];
24198 }
24199 else
24200 src = gen_lowpart (srcmode, SUBREG_REG (src));
24201 }
24202
24203 if (GET_MODE (dst) == dstmode)
24204 d = dst;
24205 else
24206 d = gen_reg_rtx (dstmode);
24207
24208 emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
24209 gen_lowpart (srcmode, src),
24210 GEN_INT (1 << (pos / size))));
24211 if (d != dst)
24212 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
24213 return true;
24214 }
24215
24216 default:
24217 return false;
24218 }
24219 }
24220
24221 /* All CPUs prefer to avoid cross-lane operations, so perform reductions
24222    of the upper halves against the lower halves down to SSE register size. */
24223
24224 machine_mode
24225 ix86_split_reduction (machine_mode mode)
24226 {
24227 /* Reduce lowpart against highpart until we reach SSE reg width to
24228 avoid cross-lane operations. */
24229 switch (mode)
24230 {
24231 case E_V8DImode:
24232 case E_V4DImode:
24233 return V2DImode;
24234 case E_V16SImode:
24235 case E_V8SImode:
24236 return V4SImode;
24237 case E_V32HImode:
24238 case E_V16HImode:
24239 return V8HImode;
24240 case E_V64QImode:
24241 case E_V32QImode:
24242 return V16QImode;
24243 case E_V16SFmode:
24244 case E_V8SFmode:
24245 return V4SFmode;
24246 case E_V8DFmode:
24247 case E_V4DFmode:
24248 return V2DFmode;
24249 default:
24250 return mode;
24251 }
24252 }
24253
24254 /* Generate call to __divmoddi4. */
24255
24256 void
24257 ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
24258 rtx op0, rtx op1,
24259 rtx *quot_p, rtx *rem_p)
24260 {
24261 rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
24262
24263 rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
24264 mode, op0, mode, op1, mode,
24265 XEXP (rem, 0), Pmode);
24266 *quot_p = quot;
24267 *rem_p = rem;
24268 }
24269
24270 void
24271 ix86_expand_atomic_fetch_op_loop (rtx target, rtx mem, rtx val,
24272 enum rtx_code code, bool after,
24273 bool doubleword)
24274 {
24275 rtx old_reg, new_reg, old_mem, success;
24276 machine_mode mode = GET_MODE (target);
24277 rtx_code_label *loop_label = NULL;
24278
24279 old_reg = gen_reg_rtx (mode);
24280 new_reg = old_reg;
24281 old_mem = copy_to_reg (mem);
24282 loop_label = gen_label_rtx ();
24283 emit_label (loop_label);
24284 emit_move_insn (old_reg, old_mem);
24285
24286 /* return value for atomic_fetch_op. */
24287 if (!after)
24288 emit_move_insn (target, old_reg);
24289
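  /* For a code of NOT the new value computed below is ~(old value & val),
     i.e. an atomic NAND.  */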
24290 if (code == NOT)
24291 {
24292 new_reg = expand_simple_binop (mode, AND, new_reg, val, NULL_RTX,
24293 true, OPTAB_LIB_WIDEN);
24294 new_reg = expand_simple_unop (mode, code, new_reg, NULL_RTX, true);
24295 }
24296 else
24297 new_reg = expand_simple_binop (mode, code, new_reg, val, NULL_RTX,
24298 true, OPTAB_LIB_WIDEN);
24299
24300 /* return value for atomic_op_fetch. */
24301 if (after)
24302 emit_move_insn (target, new_reg);
24303
24304 success = NULL_RTX;
24305
24306 ix86_expand_cmpxchg_loop (&success, old_mem, mem, old_reg, new_reg,
24307 gen_int_mode (MEMMODEL_SYNC_SEQ_CST,
24308 SImode),
24309 doubleword, loop_label);
24310 }
24311
24312 /* Relax the cmpxchg instruction.  The parameter loop_label indicates
24313    whether the instruction should be relaxed with a pause loop.  If not,
24314    it is relaxed to an atomic load + compare, and the cmpxchg instruction
24315    is skipped when mem != exp_input.  */
24316
24317 void
24318 ix86_expand_cmpxchg_loop (rtx *ptarget_bool, rtx target_val,
24319 rtx mem, rtx exp_input, rtx new_input,
24320 rtx mem_model, bool doubleword,
24321 rtx_code_label *loop_label)
24322 {
24323 rtx_code_label *cmp_label = NULL;
24324 rtx_code_label *done_label = NULL;
24325 rtx target_bool = NULL_RTX, new_mem = NULL_RTX;
24326 rtx (*gen) (rtx, rtx, rtx, rtx, rtx) = NULL;
24327 rtx (*gendw) (rtx, rtx, rtx, rtx, rtx, rtx) = NULL;
24328 machine_mode mode = GET_MODE (target_val), hmode = mode;
24329
24330 if (*ptarget_bool == NULL)
24331 target_bool = gen_reg_rtx (QImode);
24332 else
24333 target_bool = *ptarget_bool;
24334
24335 cmp_label = gen_label_rtx ();
24336 done_label = gen_label_rtx ();
24337
24338 new_mem = gen_reg_rtx (mode);
24339 /* Load memory first. */
24340 expand_atomic_load (new_mem, mem, MEMMODEL_SEQ_CST);
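  /* Comparing the loaded value first lets us skip the expensive locked
     cmpxchg whenever the memory value already differs from EXP_INPUT.  */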
24341
24342 switch (mode)
24343 {
24344 case E_TImode:
24345 gendw = gen_atomic_compare_and_swapti_doubleword;
24346 hmode = DImode;
24347 break;
24348 case E_DImode:
24349 if (doubleword)
24350 {
24351 gendw = gen_atomic_compare_and_swapdi_doubleword;
24352 hmode = SImode;
24353 }
24354 else
24355 gen = gen_atomic_compare_and_swapdi_1;
24356 break;
24357 case E_SImode:
24358 gen = gen_atomic_compare_and_swapsi_1;
24359 break;
24360 case E_HImode:
24361 gen = gen_atomic_compare_and_swaphi_1;
24362 break;
24363 case E_QImode:
24364 gen = gen_atomic_compare_and_swapqi_1;
24365 break;
24366 default:
24367 gcc_unreachable ();
24368 }
24369
24370 /* Compare mem value with expected value. */
24371 if (doubleword)
24372 {
24373 rtx low_new_mem = gen_lowpart (hmode, new_mem);
24374 rtx low_exp_input = gen_lowpart (hmode, exp_input);
24375 rtx high_new_mem = gen_highpart (hmode, new_mem);
24376 rtx high_exp_input = gen_highpart (hmode, exp_input);
24377 emit_cmp_and_jump_insns (low_new_mem, low_exp_input, NE, NULL_RTX,
24378 hmode, 1, cmp_label,
24379 profile_probability::guessed_never ());
24380 emit_cmp_and_jump_insns (high_new_mem, high_exp_input, NE, NULL_RTX,
24381 hmode, 1, cmp_label,
24382 profile_probability::guessed_never ());
24383 }
24384 else
24385 emit_cmp_and_jump_insns (new_mem, exp_input, NE, NULL_RTX,
24386 GET_MODE (exp_input), 1, cmp_label,
24387 profile_probability::guessed_never ());
24388
24389 /* Directly emits cmpxchg here. */
24390 if (doubleword)
24391 emit_insn (gendw (target_val, mem, exp_input,
24392 gen_lowpart (hmode, new_input),
24393 gen_highpart (hmode, new_input),
24394 mem_model));
24395 else
24396 emit_insn (gen (target_val, mem, exp_input, new_input, mem_model));
24397
24398 if (!loop_label)
24399 {
24400 emit_jump_insn (gen_jump (done_label));
24401 emit_barrier ();
24402 emit_label (cmp_label);
24403 emit_move_insn (target_val, new_mem);
24404 emit_label (done_label);
24405 ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
24406 const0_rtx);
24407 }
24408 else
24409 {
24410 ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
24411 const0_rtx);
24412 emit_cmp_and_jump_insns (target_bool, const0_rtx, EQ, const0_rtx,
24413 GET_MODE (target_bool), 1, loop_label,
24414 profile_probability::guessed_never ());
24415 emit_jump_insn (gen_jump (done_label));
24416 emit_barrier ();
24417
24418 /* If mem is not expected, pause and loop back. */
24419 emit_label (cmp_label);
24420 emit_move_insn (target_val, new_mem);
24421 emit_insn (gen_pause ());
24422 emit_jump_insn (gen_jump (loop_label));
24423 emit_barrier ();
24424 emit_label (done_label);
24425 }
24426
24427 *ptarget_bool = target_bool;
24428 }
24429
24430 /* Convert a BFmode VAL to SFmode without signaling sNaNs.
24431 This is done by returning SF SUBREG of ((HI SUBREG) (VAL)) << 16. */
24432
24433 rtx
24434 ix86_expand_fast_convert_bf_to_sf (rtx val)
24435 {
24436 rtx op = gen_lowpart (HImode, val), ret;
24437 if (CONST_INT_P (op))
24438 {
24439 ret = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
24440 val, BFmode);
24441 if (ret)
24442 return ret;
24443 /* FLOAT_EXTEND simplification will fail if VAL is a sNaN. */
24444 ret = gen_reg_rtx (SImode);
24445 emit_move_insn (ret, GEN_INT (INTVAL (op) & 0xffff));
24446 emit_insn (gen_ashlsi3 (ret, ret, GEN_INT (16)));
24447 return gen_lowpart (SFmode, ret);
24448 }
24449
24450 ret = gen_reg_rtx (SFmode);
24451 emit_insn (gen_extendbfsf2_1 (ret, force_reg (BFmode, val)));
24452 return ret;
24453 }
24454
24455 #include "gt-i386-expand.h"