gcc/config/i386/i386-expand.cc
1 /* Copyright (C) 1988-2023 Free Software Foundation, Inc.
2
3 This file is part of GCC.
4
5 GCC is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
9
10 GCC is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with GCC; see the file COPYING3. If not see
17 <http://www.gnu.org/licenses/>. */
18
19 #define IN_TARGET_CODE 1
20
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "backend.h"
25 #include "rtl.h"
26 #include "tree.h"
27 #include "memmodel.h"
28 #include "gimple.h"
29 #include "cfghooks.h"
30 #include "cfgloop.h"
31 #include "df.h"
32 #include "tm_p.h"
33 #include "stringpool.h"
34 #include "expmed.h"
35 #include "optabs.h"
36 #include "regs.h"
37 #include "emit-rtl.h"
38 #include "recog.h"
39 #include "cgraph.h"
40 #include "diagnostic.h"
41 #include "cfgbuild.h"
42 #include "alias.h"
43 #include "fold-const.h"
44 #include "attribs.h"
45 #include "calls.h"
46 #include "stor-layout.h"
47 #include "varasm.h"
48 #include "output.h"
49 #include "insn-attr.h"
50 #include "flags.h"
51 #include "except.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "cfgrtl.h"
55 #include "common/common-target.h"
56 #include "langhooks.h"
57 #include "reload.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "tm-constrs.h"
61 #include "cselib.h"
62 #include "sched-int.h"
63 #include "opts.h"
64 #include "tree-pass.h"
65 #include "context.h"
66 #include "pass_manager.h"
67 #include "target-globals.h"
68 #include "gimple-iterator.h"
69 #include "shrink-wrap.h"
70 #include "builtins.h"
71 #include "rtl-iter.h"
72 #include "tree-iterator.h"
73 #include "dbgcnt.h"
74 #include "case-cfn-macros.h"
75 #include "dojump.h"
76 #include "fold-const-call.h"
77 #include "tree-vrp.h"
78 #include "tree-ssanames.h"
79 #include "selftest.h"
80 #include "selftest-rtl.h"
81 #include "print-rtl.h"
82 #include "intl.h"
83 #include "ifcvt.h"
84 #include "symbol-summary.h"
85 #include "ipa-prop.h"
86 #include "ipa-fnsummary.h"
87 #include "wide-int-bitmask.h"
88 #include "tree-vector-builder.h"
89 #include "debug.h"
90 #include "dwarf2out.h"
91 #include "i386-options.h"
92 #include "i386-builtins.h"
93 #include "i386-expand.h"
94 #include "asan.h"
95
96 /* Split one or more double-mode RTL references into pairs of half-mode
97 references. The RTL can be REG, offsettable MEM, integer constant, or
98 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
99 split and "num" is its length. lo_half and hi_half are output arrays
100 that parallel "operands". */
101
102 void
103 split_double_mode (machine_mode mode, rtx operands[],
104 int num, rtx lo_half[], rtx hi_half[])
105 {
106 machine_mode half_mode;
107 unsigned int byte;
108 rtx mem_op = NULL_RTX;
109 int mem_num = 0;
110
111 switch (mode)
112 {
113 case E_TImode:
114 half_mode = DImode;
115 break;
116 case E_DImode:
117 half_mode = SImode;
118 break;
119 case E_P2HImode:
120 half_mode = HImode;
121 break;
122 case E_P2QImode:
123 half_mode = QImode;
124 break;
125 default:
126 gcc_unreachable ();
127 }
128
129 byte = GET_MODE_SIZE (half_mode);
130
131 while (num--)
132 {
133 rtx op = operands[num];
134
135 /* simplify_subreg refuses to split volatile memory addresses,
136 but we still have to handle them. */
137 if (MEM_P (op))
138 {
139 if (mem_op && rtx_equal_p (op, mem_op))
140 {
141 lo_half[num] = lo_half[mem_num];
142 hi_half[num] = hi_half[mem_num];
143 }
144 else
145 {
146 mem_op = op;
147 mem_num = num;
148 lo_half[num] = adjust_address (op, half_mode, 0);
149 hi_half[num] = adjust_address (op, half_mode, byte);
150 }
151 }
152 else
153 {
154 lo_half[num] = simplify_gen_subreg (half_mode, op,
155 GET_MODE (op) == VOIDmode
156 ? mode : GET_MODE (op), 0);
157
158 rtx tmp = simplify_gen_subreg (half_mode, op,
159 GET_MODE (op) == VOIDmode
160 ? mode : GET_MODE (op), byte);
161 /* simplify_gen_subreg will return NULL RTX for the
162 high half of the paradoxical subreg. */
163 hi_half[num] = tmp ? tmp : gen_reg_rtx (half_mode);
164 }
165 }
166 }
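/* Illustrative usage sketch (not part of the original source; the variable
   names are hypothetical): splitting a single DImode operand into its
   SImode halves.

     rtx ops[1] = { operands[1] };      /* one DImode REG, MEM or constant */
     rtx lo[1], hi[1];
     split_double_mode (DImode, ops, 1, lo, hi);
     /* On this little-endian target lo[0] holds bits 0..31 and hi[0]
	holds bits 32..63 of the original operand.  */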
167
168 /* Emit the double word assignment DST = { LO, HI }. */
169
170 void
171 split_double_concat (machine_mode mode, rtx dst, rtx lo, rtx hi)
172 {
173 rtx dlo, dhi;
174 int deleted_move_count = 0;
175 split_double_mode (mode, &dst, 1, &dlo, &dhi);
176 /* Constraints ensure that if both lo and hi are MEMs, then
177 dst has early-clobber and thus addresses of MEMs don't use
178 dlo/dhi registers. Otherwise if at least one of lo and hi is a MEM,
179 dlo/dhi are registers. */
180 if (MEM_P (lo)
181 && rtx_equal_p (dlo, hi)
182 && reg_overlap_mentioned_p (dhi, lo))
183 {
184 /* If dlo is same as hi and lo's address uses dhi register,
185 code below would first emit_move_insn (dhi, hi)
186 and then emit_move_insn (dlo, lo). But the former
187 would invalidate lo's address. Load into dhi first,
188 then swap. */
189 emit_move_insn (dhi, lo);
190 lo = dhi;
191 }
192 else if (MEM_P (hi)
193 && !MEM_P (lo)
194 && !rtx_equal_p (dlo, lo)
195 && reg_overlap_mentioned_p (dlo, hi))
196 {
197 /* In this case, code below would first emit_move_insn (dlo, lo)
198 and then emit_move_insn (dhi, hi). But the former would
199 invalidate hi's address. */
200 if (rtx_equal_p (dhi, lo))
201 {
202 /* We can't load into dhi first, so load into dlo
203 first and we'll swap. */
204 emit_move_insn (dlo, hi);
205 hi = dlo;
206 }
207 else
208 {
209 /* Load into dhi first. */
210 emit_move_insn (dhi, hi);
211 hi = dhi;
212 }
213 }
214 if (!rtx_equal_p (dlo, hi))
215 {
216 if (!rtx_equal_p (dlo, lo))
217 emit_move_insn (dlo, lo);
218 else
219 deleted_move_count++;
220 if (!rtx_equal_p (dhi, hi))
221 emit_move_insn (dhi, hi);
222 else
223 deleted_move_count++;
224 }
225 else if (!rtx_equal_p (lo, dhi))
226 {
227 if (!rtx_equal_p (dhi, hi))
228 emit_move_insn (dhi, hi);
229 else
230 deleted_move_count++;
231 if (!rtx_equal_p (dlo, lo))
232 emit_move_insn (dlo, lo);
233 else
234 deleted_move_count++;
235 }
236 else if (mode == TImode)
237 emit_insn (gen_swapdi (dlo, dhi));
238 else
239 emit_insn (gen_swapsi (dlo, dhi));
240
241 if (deleted_move_count == 2)
242 emit_note (NOTE_INSN_DELETED);
243 }
244
245
246 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
247 for the target. */
248
249 void
250 ix86_expand_clear (rtx dest)
251 {
252 rtx tmp;
253
254 /* We play register width games, which are only valid after reload. */
255 gcc_assert (reload_completed);
256
257 /* Avoid HImode and its attendant prefix byte. */
258 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
259 dest = gen_rtx_REG (SImode, REGNO (dest));
260 tmp = gen_rtx_SET (dest, const0_rtx);
261
262 if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
263 {
264 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
265 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
266 }
267
268 emit_insn (tmp);
269 }
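/* Sketch of the intended output (illustrative only; valid after reload,
   register name hypothetical):

     ix86_expand_clear (eax_reg);
       with !TARGET_USE_MOV0 or when optimizing for size:
	 xorl %eax, %eax	  (the PARALLEL clobbers the flags)
       otherwise:
	 movl $0, %eax		  (flags are left untouched)

   HImode/QImode destinations are widened to SImode first to avoid the
   operand-size prefix.  */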
270
271 /* Return true if V can be broadcast from an integer of WIDTH bits,
272 which is returned in VAL_BROADCAST. Otherwise, return false. */
273
274 static bool
275 ix86_broadcast (HOST_WIDE_INT v, unsigned int width,
276 HOST_WIDE_INT &val_broadcast)
277 {
278 wide_int val = wi::uhwi (v, HOST_BITS_PER_WIDE_INT);
279 val_broadcast = wi::extract_uhwi (val, 0, width);
280 for (unsigned int i = width; i < HOST_BITS_PER_WIDE_INT; i += width)
281 {
282 HOST_WIDE_INT each = wi::extract_uhwi (val, i, width);
283 if (val_broadcast != each)
284 return false;
285 }
286 val_broadcast = sext_hwi (val_broadcast, width);
287 return true;
288 }
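/* Worked example (illustrative): for v = 0x1212121212121212 and width = 8
   every 8-bit slice equals 0x12, so VAL_BROADCAST becomes 0x12 and the
   function returns true.  For v = 0x1234123412341234, width = 8 fails
   (0x34 != 0x12) but width = 16 succeeds with VAL_BROADCAST = 0x1234.  */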
289
290 /* Convert the CONST_WIDE_INT operand OP to broadcast in MODE. */
291
292 static rtx
293 ix86_convert_const_wide_int_to_broadcast (machine_mode mode, rtx op)
294 {
295 /* Don't use integer vector broadcast if we can't move from GPR to SSE
296 register directly. */
297 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
298 return nullptr;
299
300 /* Convert CONST_WIDE_INT to a non-standard SSE constant integer
301 broadcast only if vector broadcast is available. */
302 if (!TARGET_AVX
303 || !CONST_WIDE_INT_P (op)
304 || standard_sse_constant_p (op, mode)
305 || (CONST_WIDE_INT_NUNITS (op) * HOST_BITS_PER_WIDE_INT
306 != GET_MODE_BITSIZE (mode)))
307 return nullptr;
308
309 HOST_WIDE_INT val = CONST_WIDE_INT_ELT (op, 0);
310 HOST_WIDE_INT val_broadcast;
311 scalar_int_mode broadcast_mode;
312 if (TARGET_AVX2
313 && ix86_broadcast (val, GET_MODE_BITSIZE (QImode),
314 val_broadcast))
315 broadcast_mode = QImode;
316 else if (TARGET_AVX2
317 && ix86_broadcast (val, GET_MODE_BITSIZE (HImode),
318 val_broadcast))
319 broadcast_mode = HImode;
320 else if (ix86_broadcast (val, GET_MODE_BITSIZE (SImode),
321 val_broadcast))
322 broadcast_mode = SImode;
323 else if (TARGET_64BIT
324 && ix86_broadcast (val, GET_MODE_BITSIZE (DImode),
325 val_broadcast))
326 broadcast_mode = DImode;
327 else
328 return nullptr;
329
330 /* Check if OP can be broadcasted from VAL. */
331 for (int i = 1; i < CONST_WIDE_INT_NUNITS (op); i++)
332 if (val != CONST_WIDE_INT_ELT (op, i))
333 return nullptr;
334
335 unsigned int nunits = (GET_MODE_SIZE (mode)
336 / GET_MODE_SIZE (broadcast_mode));
337 machine_mode vector_mode;
338 if (!mode_for_vector (broadcast_mode, nunits).exists (&vector_mode))
339 gcc_unreachable ();
340 rtx target = gen_reg_rtx (vector_mode);
341 bool ok = ix86_expand_vector_init_duplicate (false, vector_mode,
342 target,
343 GEN_INT (val_broadcast));
344 gcc_assert (ok);
345 target = lowpart_subreg (mode, target, vector_mode);
346 return target;
347 }
348
349 void
350 ix86_expand_move (machine_mode mode, rtx operands[])
351 {
352 rtx op0, op1;
353 rtx tmp, addend = NULL_RTX;
354 enum tls_model model;
355
356 op0 = operands[0];
357 op1 = operands[1];
358
359 /* Avoid complex sets of likely spilled hard registers before reload. */
360 if (!ix86_hardreg_mov_ok (op0, op1))
361 {
362 tmp = gen_reg_rtx (mode);
363 operands[0] = tmp;
364 ix86_expand_move (mode, operands);
365 operands[0] = op0;
366 operands[1] = tmp;
367 op1 = tmp;
368 }
369
370 switch (GET_CODE (op1))
371 {
372 case CONST:
373 tmp = XEXP (op1, 0);
374
375 if (GET_CODE (tmp) != PLUS
376 || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
377 break;
378
379 op1 = XEXP (tmp, 0);
380 addend = XEXP (tmp, 1);
381 /* FALLTHRU */
382
383 case SYMBOL_REF:
384 model = SYMBOL_REF_TLS_MODEL (op1);
385
386 if (model)
387 op1 = legitimize_tls_address (op1, model, true);
388 else if (ix86_force_load_from_GOT_p (op1))
389 {
390 /* Load the external function address via GOT slot to avoid PLT. */
391 op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
392 (TARGET_64BIT
393 ? UNSPEC_GOTPCREL
394 : UNSPEC_GOT));
395 op1 = gen_rtx_CONST (Pmode, op1);
396 op1 = gen_const_mem (Pmode, op1);
397 set_mem_alias_set (op1, ix86_GOT_alias_set ());
398 }
399 else
400 {
401 tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
402 if (tmp)
403 {
404 op1 = tmp;
405 if (!addend)
406 break;
407 }
408 else
409 {
410 op1 = operands[1];
411 break;
412 }
413 }
414
415 if (addend)
416 {
417 op1 = force_operand (op1, NULL_RTX);
418 op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
419 op0, 1, OPTAB_DIRECT);
420 }
421 else
422 op1 = force_operand (op1, op0);
423
424 if (op1 == op0)
425 return;
426
427 op1 = convert_to_mode (mode, op1, 1);
428
429 default:
430 break;
431
432 case SUBREG:
433 /* Transform TImode paradoxical SUBREG into zero_extendditi2. */
434 if (TARGET_64BIT
435 && mode == TImode
436 && SUBREG_P (op1)
437 && GET_MODE (SUBREG_REG (op1)) == DImode
438 && SUBREG_BYTE (op1) == 0)
439 op1 = gen_rtx_ZERO_EXTEND (TImode, SUBREG_REG (op1));
440 break;
441 }
442
443 if ((flag_pic || MACHOPIC_INDIRECT)
444 && symbolic_operand (op1, mode))
445 {
446 if (TARGET_MACHO && !TARGET_64BIT)
447 {
448 #if TARGET_MACHO
449 /* dynamic-no-pic */
450 if (MACHOPIC_INDIRECT)
451 {
452 rtx temp = (op0 && REG_P (op0) && mode == Pmode)
453 ? op0 : gen_reg_rtx (Pmode);
454 op1 = machopic_indirect_data_reference (op1, temp);
455 if (MACHOPIC_PURE)
456 op1 = machopic_legitimize_pic_address (op1, mode,
457 temp == op1 ? 0 : temp);
458 }
459 if (op0 != op1 && GET_CODE (op0) != MEM)
460 {
461 rtx insn = gen_rtx_SET (op0, op1);
462 emit_insn (insn);
463 return;
464 }
465 if (GET_CODE (op0) == MEM)
466 op1 = force_reg (Pmode, op1);
467 else
468 {
469 rtx temp = op0;
470 if (GET_CODE (temp) != REG)
471 temp = gen_reg_rtx (Pmode);
472 temp = legitimize_pic_address (op1, temp);
473 if (temp == op0)
474 return;
475 op1 = temp;
476 }
477 /* dynamic-no-pic */
478 #endif
479 }
480 else
481 {
482 if (MEM_P (op0))
483 op1 = force_reg (mode, op1);
484 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
485 {
486 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
487 op1 = legitimize_pic_address (op1, reg);
488 if (op0 == op1)
489 return;
490 op1 = convert_to_mode (mode, op1, 1);
491 }
492 }
493 }
494 else
495 {
496 if (MEM_P (op0)
497 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
498 || !push_operand (op0, mode))
499 && MEM_P (op1))
500 op1 = force_reg (mode, op1);
501
502 if (push_operand (op0, mode)
503 && ! general_no_elim_operand (op1, mode))
504 op1 = copy_to_mode_reg (mode, op1);
505
506 /* Force large constants in 64-bit compilation into a register
507 so that they can be CSEd. */
508 if (can_create_pseudo_p ()
509 && (mode == DImode) && TARGET_64BIT
510 && immediate_operand (op1, mode)
511 && !x86_64_zext_immediate_operand (op1, VOIDmode)
512 && !register_operand (op0, mode)
513 && optimize)
514 op1 = copy_to_mode_reg (mode, op1);
515
516 if (can_create_pseudo_p ())
517 {
518 if (CONST_DOUBLE_P (op1))
519 {
520 /* If we are loading a floating point constant to a
521 register, force the value to memory now, since we'll
522 get better code out of the back end. */
523
524 op1 = validize_mem (force_const_mem (mode, op1));
525 if (!register_operand (op0, mode))
526 {
527 rtx temp = gen_reg_rtx (mode);
528 emit_insn (gen_rtx_SET (temp, op1));
529 emit_move_insn (op0, temp);
530 return;
531 }
532 }
533 else if (CONST_WIDE_INT_P (op1)
534 && GET_MODE_SIZE (mode) >= 16)
535 {
536 rtx tmp = ix86_convert_const_wide_int_to_broadcast
537 (GET_MODE (op0), op1);
538 if (tmp != nullptr)
539 op1 = tmp;
540 }
541 }
542 }
543
544 /* Special case inserting 64-bit values into a TImode register. */
545 if (TARGET_64BIT
546 /* Disable for -O0 (see PR110587) unless naked (PR110533). */
547 && (optimize || ix86_function_naked (current_function_decl))
548 && (mode == DImode || mode == DFmode)
549 && SUBREG_P (op0)
550 && GET_MODE (SUBREG_REG (op0)) == TImode
551 && REG_P (SUBREG_REG (op0))
552 && REG_P (op1))
553 {
554 /* Use *insvti_lowpart_1 to set lowpart. */
555 if (SUBREG_BYTE (op0) == 0)
556 {
557 wide_int mask = wi::mask (64, true, 128);
558 rtx tmp = immed_wide_int_const (mask, TImode);
559 op0 = SUBREG_REG (op0);
560 tmp = gen_rtx_AND (TImode, copy_rtx (op0), tmp);
561 if (mode == DFmode)
562 op1 = gen_lowpart (DImode, op1);
563 op1 = gen_rtx_ZERO_EXTEND (TImode, op1);
564 op1 = gen_rtx_IOR (TImode, tmp, op1);
565 }
566 /* Use *insvti_highpart_1 to set highpart. */
567 else if (SUBREG_BYTE (op0) == 8)
568 {
569 wide_int mask = wi::mask (64, false, 128);
570 rtx tmp = immed_wide_int_const (mask, TImode);
571 op0 = SUBREG_REG (op0);
572 tmp = gen_rtx_AND (TImode, copy_rtx (op0), tmp);
573 if (mode == DFmode)
574 op1 = gen_lowpart (DImode, op1);
575 op1 = gen_rtx_ZERO_EXTEND (TImode, op1);
576 op1 = gen_rtx_ASHIFT (TImode, op1, GEN_INT (64));
577 op1 = gen_rtx_IOR (TImode, tmp, op1);
578 }
579 }
580
581 emit_insn (gen_rtx_SET (op0, op1));
582 }
583
584 /* OP is a memref of a CONST_VECTOR; return a scalar constant mem
585 if the CONST_VECTOR is a vec_duplicate, else return NULL. */
586 static rtx
587 ix86_broadcast_from_constant (machine_mode mode, rtx op)
588 {
589 int nunits = GET_MODE_NUNITS (mode);
590 if (nunits < 2)
591 return nullptr;
592
593 /* Don't use integer vector broadcast if we can't move from GPR to SSE
594 register directly. */
595 if (!TARGET_INTER_UNIT_MOVES_TO_VEC
596 && INTEGRAL_MODE_P (mode))
597 return nullptr;
598
599 /* Convert CONST_VECTOR to a non-standard SSE constant integer
600 broadcast only if vector broadcast is available. */
601 if (!(TARGET_AVX2
602 || (TARGET_AVX
603 && (GET_MODE_INNER (mode) == SImode
604 || GET_MODE_INNER (mode) == DImode))
605 || FLOAT_MODE_P (mode))
606 || standard_sse_constant_p (op, mode))
607 return nullptr;
608
609 /* Don't broadcast from a 64-bit integer constant in 32-bit mode.
610 We can still put a 64-bit integer constant in memory when
611 AVX512 embedded broadcast is available. */
612 if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT
613 && (!TARGET_AVX512F
614 || (GET_MODE_SIZE (mode) < 64 && !TARGET_AVX512VL)))
615 return nullptr;
616
617 if (GET_MODE_INNER (mode) == TImode)
618 return nullptr;
619
620 rtx constant = get_pool_constant (XEXP (op, 0));
621 if (GET_CODE (constant) != CONST_VECTOR)
622 return nullptr;
623
624 /* There could be some rtx like
625 (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
626 but with "*.LC1" referring to a V2DI constant vector. */
627 if (GET_MODE (constant) != mode)
628 {
629 constant = simplify_subreg (mode, constant, GET_MODE (constant),
630 0);
631 if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR)
632 return nullptr;
633 }
634
635 rtx first = XVECEXP (constant, 0, 0);
636
637 for (int i = 1; i < nunits; ++i)
638 {
639 rtx tmp = XVECEXP (constant, 0, i);
640 /* Vector duplicate value. */
641 if (!rtx_equal_p (tmp, first))
642 return nullptr;
643 }
644
645 return first;
646 }
647
648 void
649 ix86_expand_vector_move (machine_mode mode, rtx operands[])
650 {
651 rtx op0 = operands[0], op1 = operands[1];
652 /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
653 psABI, since the biggest alignment is 4 bytes for the IA MCU psABI. */
654 unsigned int align = (TARGET_IAMCU
655 ? GET_MODE_BITSIZE (mode)
656 : GET_MODE_ALIGNMENT (mode));
657
658 if (push_operand (op0, VOIDmode))
659 op0 = emit_move_resolve_push (mode, op0);
660
661 /* Force constants other than zero into memory. We do not know how
662 the instructions used to build constants modify the upper 64 bits
663 of the register; once we have that information we may be able
664 to handle some of them more efficiently. */
665 if (can_create_pseudo_p ()
666 && (CONSTANT_P (op1)
667 || (SUBREG_P (op1)
668 && CONSTANT_P (SUBREG_REG (op1))))
669 && ((register_operand (op0, mode)
670 && !standard_sse_constant_p (op1, mode))
671 /* ix86_expand_vector_move_misalign() does not like constants. */
672 || (SSE_REG_MODE_P (mode)
673 && MEM_P (op0)
674 && MEM_ALIGN (op0) < align)))
675 {
676 if (SUBREG_P (op1))
677 {
678 machine_mode imode = GET_MODE (SUBREG_REG (op1));
679 rtx r = force_const_mem (imode, SUBREG_REG (op1));
680 if (r)
681 r = validize_mem (r);
682 else
683 r = force_reg (imode, SUBREG_REG (op1));
684 op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
685 }
686 else
687 {
688 machine_mode mode = GET_MODE (op0);
689 rtx tmp = ix86_convert_const_wide_int_to_broadcast
690 (mode, op1);
691 if (tmp == nullptr)
692 op1 = validize_mem (force_const_mem (mode, op1));
693 else
694 op1 = tmp;
695 }
696 }
697
698 if (can_create_pseudo_p ()
699 && GET_MODE_SIZE (mode) >= 16
700 && VECTOR_MODE_P (mode)
701 && (MEM_P (op1)
702 && SYMBOL_REF_P (XEXP (op1, 0))
703 && CONSTANT_POOL_ADDRESS_P (XEXP (op1, 0))))
704 {
705 rtx first = ix86_broadcast_from_constant (mode, op1);
706 if (first != nullptr)
707 {
708 /* Broadcast to XMM/YMM/ZMM register from an integer
709 constant or scalar mem. */
710 op1 = gen_reg_rtx (mode);
711 if (FLOAT_MODE_P (mode)
712 || (!TARGET_64BIT && GET_MODE_INNER (mode) == DImode))
713 first = force_const_mem (GET_MODE_INNER (mode), first);
714 bool ok = ix86_expand_vector_init_duplicate (false, mode,
715 op1, first);
716 gcc_assert (ok);
717 emit_move_insn (op0, op1);
718 return;
719 }
720 }
721
722 /* We need to check memory alignment for SSE modes, since attributes
723 can make operands unaligned. */
724 if (can_create_pseudo_p ()
725 && SSE_REG_MODE_P (mode)
726 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
727 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
728 {
729 rtx tmp[2];
730
731 /* ix86_expand_vector_move_misalign() does not like both
732 arguments in memory. */
733 if (!register_operand (op0, mode)
734 && !register_operand (op1, mode))
735 {
736 rtx scratch = gen_reg_rtx (mode);
737 emit_move_insn (scratch, op1);
738 op1 = scratch;
739 }
740
741 tmp[0] = op0; tmp[1] = op1;
742 ix86_expand_vector_move_misalign (mode, tmp);
743 return;
744 }
745
746 /* Special case TImode to 128-bit vector conversions via V2DI. */
747 if (VECTOR_MODE_P (mode)
748 && GET_MODE_SIZE (mode) == 16
749 && SUBREG_P (op1)
750 && GET_MODE (SUBREG_REG (op1)) == TImode
751 && TARGET_64BIT && TARGET_SSE
752 && can_create_pseudo_p ())
753 {
754 rtx tmp = gen_reg_rtx (V2DImode);
755 rtx lo = gen_reg_rtx (DImode);
756 rtx hi = gen_reg_rtx (DImode);
757 emit_move_insn (lo, gen_lowpart (DImode, SUBREG_REG (op1)));
758 emit_move_insn (hi, gen_highpart (DImode, SUBREG_REG (op1)));
759 emit_insn (gen_vec_concatv2di (tmp, lo, hi));
760 emit_move_insn (op0, gen_lowpart (mode, tmp));
761 return;
762 }
763
764 /* If operand0 is a hard register, make operand1 a pseudo. */
765 if (can_create_pseudo_p ()
766 && !ix86_hardreg_mov_ok (op0, op1))
767 {
768 rtx tmp = gen_reg_rtx (GET_MODE (op0));
769 emit_move_insn (tmp, op1);
770 emit_move_insn (op0, tmp);
771 return;
772 }
773
774 /* Make operand1 a register if it isn't already. */
775 if (can_create_pseudo_p ()
776 && !register_operand (op0, mode)
777 && !register_operand (op1, mode))
778 {
779 rtx tmp = gen_reg_rtx (GET_MODE (op0));
780 emit_move_insn (tmp, op1);
781 emit_move_insn (op0, tmp);
782 return;
783 }
784
785 emit_insn (gen_rtx_SET (op0, op1));
786 }
787
788 /* Split 32-byte AVX unaligned load and store if needed. */
789
790 static void
791 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
792 {
793 rtx m;
794 rtx (*extract) (rtx, rtx, rtx);
795 machine_mode mode;
796
797 if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
798 || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
799 {
800 emit_insn (gen_rtx_SET (op0, op1));
801 return;
802 }
803
804 rtx orig_op0 = NULL_RTX;
805 mode = GET_MODE (op0);
806 switch (GET_MODE_CLASS (mode))
807 {
808 case MODE_VECTOR_INT:
809 case MODE_INT:
810 if (mode != V32QImode)
811 {
812 if (!MEM_P (op0))
813 {
814 orig_op0 = op0;
815 op0 = gen_reg_rtx (V32QImode);
816 }
817 else
818 op0 = gen_lowpart (V32QImode, op0);
819 op1 = gen_lowpart (V32QImode, op1);
820 mode = V32QImode;
821 }
822 break;
823 case MODE_VECTOR_FLOAT:
824 break;
825 default:
826 gcc_unreachable ();
827 }
828
829 switch (mode)
830 {
831 default:
832 gcc_unreachable ();
833 case E_V32QImode:
834 extract = gen_avx_vextractf128v32qi;
835 mode = V16QImode;
836 break;
837 case E_V16BFmode:
838 extract = gen_avx_vextractf128v16bf;
839 mode = V8BFmode;
840 break;
841 case E_V16HFmode:
842 extract = gen_avx_vextractf128v16hf;
843 mode = V8HFmode;
844 break;
845 case E_V8SFmode:
846 extract = gen_avx_vextractf128v8sf;
847 mode = V4SFmode;
848 break;
849 case E_V4DFmode:
850 extract = gen_avx_vextractf128v4df;
851 mode = V2DFmode;
852 break;
853 }
854
855 if (MEM_P (op1))
856 {
857 rtx r = gen_reg_rtx (mode);
858 m = adjust_address (op1, mode, 0);
859 emit_move_insn (r, m);
860 m = adjust_address (op1, mode, 16);
861 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
862 emit_move_insn (op0, r);
863 }
864 else if (MEM_P (op0))
865 {
866 m = adjust_address (op0, mode, 0);
867 emit_insn (extract (m, op1, const0_rtx));
868 m = adjust_address (op0, mode, 16);
869 emit_insn (extract (m, copy_rtx (op1), const1_rtx));
870 }
871 else
872 gcc_unreachable ();
873
874 if (orig_op0)
875 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
876 }
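/* Rough shape of the generated code (illustrative, assuming a V8SF
   unaligned load with -mavx256-split-unaligned-load in effect):

     vmovups	 mem, %xmm0			  ; low 16 bytes
     vinsertf128 $1, mem+16, %ymm0, %ymm0	  ; high 16 bytes

   A split store instead emits two vextractf128-style stores of the
   low and high 128-bit halves.  */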
877
878 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
879 straight to ix86_expand_vector_move. */
880 /* Code generation for scalar reg-reg moves of single and double precision data:
881 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
882 movaps reg, reg
883 else
884 movss reg, reg
885 if (x86_sse_partial_reg_dependency == true)
886 movapd reg, reg
887 else
888 movsd reg, reg
889
890 Code generation for scalar loads of double precision data:
891 if (x86_sse_split_regs == true)
892 movlpd mem, reg (gas syntax)
893 else
894 movsd mem, reg
895
896 Code generation for unaligned packed loads of single precision data
897 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
898 if (x86_sse_unaligned_move_optimal)
899 movups mem, reg
900
901 if (x86_sse_partial_reg_dependency == true)
902 {
903 xorps reg, reg
904 movlps mem, reg
905 movhps mem+8, reg
906 }
907 else
908 {
909 movlps mem, reg
910 movhps mem+8, reg
911 }
912
913 Code generation for unaligned packed loads of double precision data
914 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
915 if (x86_sse_unaligned_move_optimal)
916 movupd mem, reg
917
918 if (x86_sse_split_regs == true)
919 {
920 movlpd mem, reg
921 movhpd mem+8, reg
922 }
923 else
924 {
925 movsd mem, reg
926 movhpd mem+8, reg
927 }
928 */
929
930 void
931 ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
932 {
933 rtx op0, op1, m;
934
935 op0 = operands[0];
936 op1 = operands[1];
937
938 /* Use unaligned load/store for AVX512 or when optimizing for size. */
939 if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
940 {
941 emit_insn (gen_rtx_SET (op0, op1));
942 return;
943 }
944
945 if (TARGET_AVX)
946 {
947 if (GET_MODE_SIZE (mode) == 32)
948 ix86_avx256_split_vector_move_misalign (op0, op1);
949 else
950 /* Always use 128-bit mov<mode>_internal pattern for AVX. */
951 emit_insn (gen_rtx_SET (op0, op1));
952 return;
953 }
954
955 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
956 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
957 {
958 emit_insn (gen_rtx_SET (op0, op1));
959 return;
960 }
961
962 /* ??? If we have typed data, then it would appear that using
963 movdqu is the only way to get unaligned data loaded with
964 integer type. */
965 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
966 {
967 emit_insn (gen_rtx_SET (op0, op1));
968 return;
969 }
970
971 if (MEM_P (op1))
972 {
973 if (TARGET_SSE2 && mode == V2DFmode)
974 {
975 rtx zero;
976
977 /* When SSE registers are split into halves, we can avoid
978 writing to the top half twice. */
979 if (TARGET_SSE_SPLIT_REGS)
980 {
981 emit_clobber (op0);
982 zero = op0;
983 }
984 else
985 {
986 /* ??? Not sure about the best option for the Intel chips.
987 The following would seem to satisfy; the register is
988 entirely cleared, breaking the dependency chain. We
989 then store to the upper half, with a dependency depth
990 of one. A rumor has it that Intel recommends two movsd
991 followed by an unpacklpd, but this is unconfirmed. And
992 given that the dependency depth of the unpacklpd would
993 still be one, I'm not sure why this would be better. */
994 zero = CONST0_RTX (V2DFmode);
995 }
996
997 m = adjust_address (op1, DFmode, 0);
998 emit_insn (gen_sse2_loadlpd (op0, zero, m));
999 m = adjust_address (op1, DFmode, 8);
1000 emit_insn (gen_sse2_loadhpd (op0, op0, m));
1001 }
1002 else
1003 {
1004 rtx t;
1005
1006 if (mode != V4SFmode)
1007 t = gen_reg_rtx (V4SFmode);
1008 else
1009 t = op0;
1010
1011 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
1012 emit_move_insn (t, CONST0_RTX (V4SFmode));
1013 else
1014 emit_clobber (t);
1015
1016 m = adjust_address (op1, V2SFmode, 0);
1017 emit_insn (gen_sse_loadlps (t, t, m));
1018 m = adjust_address (op1, V2SFmode, 8);
1019 emit_insn (gen_sse_loadhps (t, t, m));
1020 if (mode != V4SFmode)
1021 emit_move_insn (op0, gen_lowpart (mode, t));
1022 }
1023 }
1024 else if (MEM_P (op0))
1025 {
1026 if (TARGET_SSE2 && mode == V2DFmode)
1027 {
1028 m = adjust_address (op0, DFmode, 0);
1029 emit_insn (gen_sse2_storelpd (m, op1));
1030 m = adjust_address (op0, DFmode, 8);
1031 emit_insn (gen_sse2_storehpd (m, op1));
1032 }
1033 else
1034 {
1035 if (mode != V4SFmode)
1036 op1 = gen_lowpart (V4SFmode, op1);
1037
1038 m = adjust_address (op0, V2SFmode, 0);
1039 emit_insn (gen_sse_storelps (m, op1));
1040 m = adjust_address (op0, V2SFmode, 8);
1041 emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
1042 }
1043 }
1044 else
1045 gcc_unreachable ();
1046 }
1047
1048 /* Move bits 64:95 to bits 32:63. */
1049
1050 void
1051 ix86_move_vector_high_sse_to_mmx (rtx op)
1052 {
1053 rtx mask = gen_rtx_PARALLEL (VOIDmode,
1054 gen_rtvec (4, GEN_INT (0), GEN_INT (2),
1055 GEN_INT (0), GEN_INT (0)));
1056 rtx dest = lowpart_subreg (V4SImode, op, GET_MODE (op));
1057 op = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
1058 rtx insn = gen_rtx_SET (dest, op);
1059 emit_insn (insn);
1060 }
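/* Element-level example (illustrative): viewing OP as V4SI {a, b, c, d},
   the emitted vec_select with mask {0, 2, 0, 0} produces {a, c, a, a},
   i.e. element c (bits 64:95) is copied into bits 32:63 while bits 0:31
   keep element a; the MMX-sized callers below only care about the
   resulting low 64 bits.  */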
1061
1062 /* Split MMX pack with signed/unsigned saturation with SSE/SSE2. */
1063
1064 void
1065 ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
1066 {
1067 rtx op0 = operands[0];
1068 rtx op1 = operands[1];
1069 rtx op2 = operands[2];
1070 rtx src;
1071
1072 machine_mode dmode = GET_MODE (op0);
1073 machine_mode smode = GET_MODE (op1);
1074 machine_mode inner_dmode = GET_MODE_INNER (dmode);
1075 machine_mode inner_smode = GET_MODE_INNER (smode);
1076
1077 /* Get the corresponding SSE mode for destination. */
1078 int nunits = 16 / GET_MODE_SIZE (inner_dmode);
1079 machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode),
1080 nunits).require ();
1081 machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode),
1082 nunits / 2).require ();
1083
1084 /* Get the corresponding SSE mode for source. */
1085 nunits = 16 / GET_MODE_SIZE (inner_smode);
1086 machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode),
1087 nunits).require ();
1088
1089 /* Generate SSE pack with signed/unsigned saturation. */
1090 rtx dest = lowpart_subreg (sse_dmode, op0, GET_MODE (op0));
1091 op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
1092 op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));
1093
1094 /* packusdw/packuswb does unsigned saturation of a signed source,
1095 which is different from the generic us_truncate RTX. */
1096 if (code == US_TRUNCATE)
1097 src = gen_rtx_UNSPEC (sse_dmode,
1098 gen_rtvec (2, op1, op2),
1099 UNSPEC_US_TRUNCATE);
1100 else
1101 {
1102 op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
1103 op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
1104 src = gen_rtx_VEC_CONCAT (sse_dmode, op1, op2);
1105 }
1106
1107 emit_move_insn (dest, src);
1108
1109 ix86_move_vector_high_sse_to_mmx (op0);
1110 }
1111
1112 /* Split MMX punpcklXX/punpckhXX with SSE punpcklXX. */
1113
1114 void
1115 ix86_split_mmx_punpck (rtx operands[], bool high_p)
1116 {
1117 rtx op0 = operands[0];
1118 rtx op1 = operands[1];
1119 rtx op2 = operands[2];
1120 machine_mode mode = GET_MODE (op0);
1121 rtx mask;
1122 /* The corresponding SSE mode. */
1123 machine_mode sse_mode, double_sse_mode;
1124
1125 switch (mode)
1126 {
1127 case E_V8QImode:
1128 case E_V4QImode:
1129 case E_V2QImode:
1130 sse_mode = V16QImode;
1131 double_sse_mode = V32QImode;
1132 mask = gen_rtx_PARALLEL (VOIDmode,
1133 gen_rtvec (16,
1134 GEN_INT (0), GEN_INT (16),
1135 GEN_INT (1), GEN_INT (17),
1136 GEN_INT (2), GEN_INT (18),
1137 GEN_INT (3), GEN_INT (19),
1138 GEN_INT (4), GEN_INT (20),
1139 GEN_INT (5), GEN_INT (21),
1140 GEN_INT (6), GEN_INT (22),
1141 GEN_INT (7), GEN_INT (23)));
1142 break;
1143
1144 case E_V4HImode:
1145 case E_V2HImode:
1146 sse_mode = V8HImode;
1147 double_sse_mode = V16HImode;
1148 mask = gen_rtx_PARALLEL (VOIDmode,
1149 gen_rtvec (8,
1150 GEN_INT (0), GEN_INT (8),
1151 GEN_INT (1), GEN_INT (9),
1152 GEN_INT (2), GEN_INT (10),
1153 GEN_INT (3), GEN_INT (11)));
1154 break;
1155
1156 case E_V2SImode:
1157 sse_mode = V4SImode;
1158 double_sse_mode = V8SImode;
1159 mask = gen_rtx_PARALLEL (VOIDmode,
1160 gen_rtvec (4,
1161 GEN_INT (0), GEN_INT (4),
1162 GEN_INT (1), GEN_INT (5)));
1163 break;
1164
1165 case E_V2SFmode:
1166 sse_mode = V4SFmode;
1167 double_sse_mode = V8SFmode;
1168 mask = gen_rtx_PARALLEL (VOIDmode,
1169 gen_rtvec (4,
1170 GEN_INT (0), GEN_INT (4),
1171 GEN_INT (1), GEN_INT (5)));
1172 break;
1173
1174 default:
1175 gcc_unreachable ();
1176 }
1177
1178 /* Generate SSE punpcklXX. */
1179 rtx dest = lowpart_subreg (sse_mode, op0, GET_MODE (op0));
1180 op1 = lowpart_subreg (sse_mode, op1, GET_MODE (op1));
1181 op2 = lowpart_subreg (sse_mode, op2, GET_MODE (op2));
1182
1183 op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2);
1184 op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask);
1185 rtx insn = gen_rtx_SET (dest, op2);
1186 emit_insn (insn);
1187
1188 /* Move high bits to low bits. */
1189 if (high_p)
1190 {
1191 if (sse_mode == V4SFmode)
1192 {
1193 mask = gen_rtx_PARALLEL (VOIDmode,
1194 gen_rtvec (4, GEN_INT (2), GEN_INT (3),
1195 GEN_INT (4), GEN_INT (5)));
1196 op2 = gen_rtx_VEC_CONCAT (V8SFmode, dest, dest);
1197 op1 = gen_rtx_VEC_SELECT (V4SFmode, op2, mask);
1198 }
1199 else
1200 {
1201 int sz = GET_MODE_SIZE (mode);
1202
1203 if (sz == 4)
1204 mask = gen_rtx_PARALLEL (VOIDmode,
1205 gen_rtvec (4, GEN_INT (1), GEN_INT (0),
1206 GEN_INT (0), GEN_INT (1)));
1207 else if (sz == 8)
1208 mask = gen_rtx_PARALLEL (VOIDmode,
1209 gen_rtvec (4, GEN_INT (2), GEN_INT (3),
1210 GEN_INT (0), GEN_INT (1)));
1211 else
1212 gcc_unreachable ();
1213
1214 dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
1215 op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
1216 }
1217
1218 insn = gen_rtx_SET (dest, op1);
1219 emit_insn (insn);
1220 }
1221 }
1222
1223 /* Helper function of ix86_fixup_binary_operands to canonicalize
1224 operand order. Returns true if the operands should be swapped. */
1225
1226 static bool
1227 ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
1228 rtx operands[])
1229 {
1230 rtx dst = operands[0];
1231 rtx src1 = operands[1];
1232 rtx src2 = operands[2];
1233
1234 /* If the operation is not commutative, we can't do anything. */
1235 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
1236 && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
1237 return false;
1238
1239 /* Highest priority is that src1 should match dst. */
1240 if (rtx_equal_p (dst, src1))
1241 return false;
1242 if (rtx_equal_p (dst, src2))
1243 return true;
1244
1245 /* Next highest priority is that immediate constants come second. */
1246 if (immediate_operand (src2, mode))
1247 return false;
1248 if (immediate_operand (src1, mode))
1249 return true;
1250
1251 /* Lowest priority is that memory references should come second. */
1252 if (MEM_P (src2))
1253 return false;
1254 if (MEM_P (src1))
1255 return true;
1256
1257 return false;
1258 }
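/* Illustrative examples of the priorities above (hypothetical operands):

     r1 = const + r1	-> swapped to  r1 = r1 + const	 (src1 matches dst)
     r1 = const + r2	-> swapped to  r1 = r2 + const	 (immediate second)
     r1 = mem + r2	-> swapped to  r1 = r2 + mem	 (memory second)

   Non-commutative codes are never swapped.  */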
1259
1260
1261 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
1262 destination to use for the operation. If different from the true
1263 destination in operands[0], a copy operation will be required. */
1264
1265 rtx
1266 ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
1267 rtx operands[])
1268 {
1269 rtx dst = operands[0];
1270 rtx src1 = operands[1];
1271 rtx src2 = operands[2];
1272
1273 /* Canonicalize operand order. */
1274 if (ix86_swap_binary_operands_p (code, mode, operands))
1275 {
1276 /* It is invalid to swap operands of different modes. */
1277 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
1278
1279 std::swap (src1, src2);
1280 }
1281
1282 /* Both source operands cannot be in memory. */
1283 if (MEM_P (src1) && MEM_P (src2))
1284 {
1285 /* Optimization: Only read from memory once. */
1286 if (rtx_equal_p (src1, src2))
1287 {
1288 src2 = force_reg (mode, src2);
1289 src1 = src2;
1290 }
1291 else if (rtx_equal_p (dst, src1))
1292 src2 = force_reg (mode, src2);
1293 else
1294 src1 = force_reg (mode, src1);
1295 }
1296
1297 /* If the destination is memory, and we do not have matching source
1298 operands, do things in registers. */
1299 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
1300 dst = gen_reg_rtx (mode);
1301
1302 /* Source 1 cannot be a constant. */
1303 if (CONSTANT_P (src1))
1304 src1 = force_reg (mode, src1);
1305
1306 /* Source 1 cannot be a non-matching memory. */
1307 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
1308 src1 = force_reg (mode, src1);
1309
1310 /* Improve address combine. */
1311 if (code == PLUS
1312 && GET_MODE_CLASS (mode) == MODE_INT
1313 && MEM_P (src2))
1314 src2 = force_reg (mode, src2);
1315
1316 operands[1] = src1;
1317 operands[2] = src2;
1318 return dst;
1319 }
1320
1321 /* Similarly, but assume that the destination has already been
1322 set up properly. */
1323
1324 void
1325 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
1326 machine_mode mode, rtx operands[])
1327 {
1328 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
1329 gcc_assert (dst == operands[0]);
1330 }
1331
1332 /* Attempt to expand a binary operator. Make the expansion closer to the
1333 actual machine, then just general_operand, which will allow 3 separate
1334 memory references (one output, two input) in a single insn. */
1335
1336 void
1337 ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
1338 rtx operands[])
1339 {
1340 rtx src1, src2, dst, op, clob;
1341
1342 dst = ix86_fixup_binary_operands (code, mode, operands);
1343 src1 = operands[1];
1344 src2 = operands[2];
1345
1346 /* Emit the instruction. */
1347
1348 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
1349
1350 if (reload_completed
1351 && code == PLUS
1352 && !rtx_equal_p (dst, src1))
1353 {
1354 /* This is going to be an LEA; avoid splitting it later. */
1355 emit_insn (op);
1356 }
1357 else
1358 {
1359 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1360 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1361 }
1362
1363 /* Fix up the destination if needed. */
1364 if (dst != operands[0])
1365 emit_move_insn (operands[0], dst);
1366 }
1367
1368 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
1369 the given OPERANDS. */
1370
1371 void
1372 ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
1373 rtx operands[])
1374 {
1375 rtx op1 = NULL_RTX, op2 = NULL_RTX;
1376 if (SUBREG_P (operands[1]))
1377 {
1378 op1 = operands[1];
1379 op2 = operands[2];
1380 }
1381 else if (SUBREG_P (operands[2]))
1382 {
1383 op1 = operands[2];
1384 op2 = operands[1];
1385 }
1386 /* Optimize (__m128i) d | (__m128i) e and similar code
1387 when d and e are float vectors into float vector logical
1388 insn. In C/C++ without using intrinsics there is no other way
1389 to express vector logical operation on float vectors than
1390 to cast them temporarily to integer vectors. */
1391 if (op1
1392 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
1393 && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
1394 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
1395 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
1396 && SUBREG_BYTE (op1) == 0
1397 && (GET_CODE (op2) == CONST_VECTOR
1398 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
1399 && SUBREG_BYTE (op2) == 0))
1400 && can_create_pseudo_p ())
1401 {
1402 rtx dst;
1403 switch (GET_MODE (SUBREG_REG (op1)))
1404 {
1405 case E_V4SFmode:
1406 case E_V8SFmode:
1407 case E_V16SFmode:
1408 case E_V2DFmode:
1409 case E_V4DFmode:
1410 case E_V8DFmode:
1411 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
1412 if (GET_CODE (op2) == CONST_VECTOR)
1413 {
1414 op2 = gen_lowpart (GET_MODE (dst), op2);
1415 op2 = force_reg (GET_MODE (dst), op2);
1416 }
1417 else
1418 {
1419 op1 = operands[1];
1420 op2 = SUBREG_REG (operands[2]);
1421 if (!vector_operand (op2, GET_MODE (dst)))
1422 op2 = force_reg (GET_MODE (dst), op2);
1423 }
1424 op1 = SUBREG_REG (op1);
1425 if (!vector_operand (op1, GET_MODE (dst)))
1426 op1 = force_reg (GET_MODE (dst), op1);
1427 emit_insn (gen_rtx_SET (dst,
1428 gen_rtx_fmt_ee (code, GET_MODE (dst),
1429 op1, op2)));
1430 emit_move_insn (operands[0], gen_lowpart (mode, dst));
1431 return;
1432 default:
1433 break;
1434 }
1435 }
1436 if (!vector_operand (operands[1], mode))
1437 operands[1] = force_reg (mode, operands[1]);
1438 if (!vector_operand (operands[2], mode))
1439 operands[2] = force_reg (mode, operands[2]);
1440 ix86_fixup_binary_operands_no_copy (code, mode, operands);
1441 emit_insn (gen_rtx_SET (operands[0],
1442 gen_rtx_fmt_ee (code, mode, operands[1],
1443 operands[2])));
1444 }
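/* Source-level example of the float-domain optimization above
   (illustrative; relies only on GCC's generic vector extensions):

     __m128 d, e;
     __m128i r = (__m128i) d | (__m128i) e;

   Instead of performing the OR on the integer vectors, the SUBREGs are
   peeled off and the logical operation is emitted on the V4SF values
   (an orps), keeping the computation in the float domain.  */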
1445
1446 /* Return TRUE or FALSE depending on whether the binary operator meets the
1447 appropriate constraints. */
1448
1449 bool
1450 ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
1451 rtx operands[3])
1452 {
1453 rtx dst = operands[0];
1454 rtx src1 = operands[1];
1455 rtx src2 = operands[2];
1456
1457 /* Both source operands cannot be in memory. */
1458 if ((MEM_P (src1) || bcst_mem_operand (src1, mode))
1459 && (MEM_P (src2) || bcst_mem_operand (src2, mode)))
1460 return false;
1461
1462 /* Canonicalize operand order for commutative operators. */
1463 if (ix86_swap_binary_operands_p (code, mode, operands))
1464 std::swap (src1, src2);
1465
1466 /* If the destination is memory, we must have a matching source operand. */
1467 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
1468 return false;
1469
1470 /* Source 1 cannot be a constant. */
1471 if (CONSTANT_P (src1))
1472 return false;
1473
1474 /* Source 1 cannot be a non-matching memory. */
1475 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
1476 /* Support "andhi/andsi/anddi" as a zero-extending move. */
1477 return (code == AND
1478 && (mode == HImode
1479 || mode == SImode
1480 || (TARGET_64BIT && mode == DImode))
1481 && satisfies_constraint_L (src2));
1482
1483 return true;
1484 }
1485
1486 /* Attempt to expand a unary operator. Make the expansion closer to the
1487 actual machine, then just general_operand, which will allow 2 separate
1488 memory references (one output, one input) in a single insn. */
1489
1490 void
1491 ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
1492 rtx operands[])
1493 {
1494 bool matching_memory = false;
1495 rtx src, dst, op, clob;
1496
1497 dst = operands[0];
1498 src = operands[1];
1499
1500 /* If the destination is memory, and we do not have matching source
1501 operands, do things in registers. */
1502 if (MEM_P (dst))
1503 {
1504 if (rtx_equal_p (dst, src))
1505 matching_memory = true;
1506 else
1507 dst = gen_reg_rtx (mode);
1508 }
1509
1510 /* When source operand is memory, destination must match. */
1511 if (MEM_P (src) && !matching_memory)
1512 src = force_reg (mode, src);
1513
1514 /* Emit the instruction. */
1515
1516 op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
1517
1518 if (code == NOT)
1519 emit_insn (op);
1520 else
1521 {
1522 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1523 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1524 }
1525
1526 /* Fix up the destination if needed. */
1527 if (dst != operands[0])
1528 emit_move_insn (operands[0], dst);
1529 }
1530
1531 /* Predict the just-emitted jump instruction to be taken with probability PROB. */
1532
1533 static void
1534 predict_jump (int prob)
1535 {
1536 rtx_insn *insn = get_last_insn ();
1537 gcc_assert (JUMP_P (insn));
1538 add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
1539 }
1540
1541 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
1542 divisor are within the range [0-255]. */
1543
1544 void
1545 ix86_split_idivmod (machine_mode mode, rtx operands[],
1546 bool unsigned_p)
1547 {
1548 rtx_code_label *end_label, *qimode_label;
1549 rtx div, mod;
1550 rtx_insn *insn;
1551 rtx scratch, tmp0, tmp1, tmp2;
1552 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
1553
1554 operands[2] = force_reg (mode, operands[2]);
1555 operands[3] = force_reg (mode, operands[3]);
1556
1557 switch (mode)
1558 {
1559 case E_SImode:
1560 if (GET_MODE (operands[0]) == SImode)
1561 {
1562 if (GET_MODE (operands[1]) == SImode)
1563 gen_divmod4_1 = unsigned_p ? gen_udivmodsi4_1 : gen_divmodsi4_1;
1564 else
1565 gen_divmod4_1
1566 = unsigned_p ? gen_udivmodsi4_zext_2 : gen_divmodsi4_zext_2;
1567 }
1568 else
1569 gen_divmod4_1
1570 = unsigned_p ? gen_udivmodsi4_zext_1 : gen_divmodsi4_zext_1;
1571 break;
1572
1573 case E_DImode:
1574 gen_divmod4_1 = unsigned_p ? gen_udivmoddi4_1 : gen_divmoddi4_1;
1575 break;
1576
1577 default:
1578 gcc_unreachable ();
1579 }
1580
1581 end_label = gen_label_rtx ();
1582 qimode_label = gen_label_rtx ();
1583
1584 scratch = gen_reg_rtx (mode);
1585
1586 /* Use 8bit unsigned divmod if the dividend and divisor are within
1587 the range [0-255]. */
1588 emit_move_insn (scratch, operands[2]);
1589 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
1590 scratch, 1, OPTAB_DIRECT);
1591 emit_insn (gen_test_ccno_1 (mode, scratch, GEN_INT (-0x100)));
1592 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
1593 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
1594 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
1595 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
1596 pc_rtx);
1597 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
1598 predict_jump (REG_BR_PROB_BASE * 50 / 100);
1599 JUMP_LABEL (insn) = qimode_label;
1600
1601 /* Generate the original signed/unsigned divmod. */
1602 emit_insn (gen_divmod4_1 (operands[0], operands[1],
1603 operands[2], operands[3]));
1604
1605 /* Branch to the end. */
1606 emit_jump_insn (gen_jump (end_label));
1607 emit_barrier ();
1608
1609 /* Generate 8bit unsigned divide. */
1610 emit_label (qimode_label);
1611 /* Don't use operands[0] for result of 8bit divide since not all
1612 registers support QImode ZERO_EXTRACT. */
1613 tmp0 = lowpart_subreg (HImode, scratch, mode);
1614 tmp1 = lowpart_subreg (HImode, operands[2], mode);
1615 tmp2 = lowpart_subreg (QImode, operands[3], mode);
1616 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
1617
1618 if (unsigned_p)
1619 {
1620 div = gen_rtx_UDIV (mode, operands[2], operands[3]);
1621 mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
1622 }
1623 else
1624 {
1625 div = gen_rtx_DIV (mode, operands[2], operands[3]);
1626 mod = gen_rtx_MOD (mode, operands[2], operands[3]);
1627 }
1628 if (mode == SImode)
1629 {
1630 if (GET_MODE (operands[0]) != SImode)
1631 div = gen_rtx_ZERO_EXTEND (DImode, div);
1632 if (GET_MODE (operands[1]) != SImode)
1633 mod = gen_rtx_ZERO_EXTEND (DImode, mod);
1634 }
1635
1636 /* Extract remainder from AH. */
1637 scratch = gen_lowpart (GET_MODE (operands[1]), scratch);
1638 tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]), scratch,
1639 GEN_INT (8), GEN_INT (8));
1640 insn = emit_move_insn (operands[1], tmp1);
1641 set_unique_reg_note (insn, REG_EQUAL, mod);
1642
1643 /* Zero extend quotient from AL. */
1644 tmp1 = gen_lowpart (QImode, tmp0);
1645 insn = emit_insn (gen_extend_insn
1646 (operands[0], tmp1,
1647 GET_MODE (operands[0]), QImode, 1));
1648 set_unique_reg_note (insn, REG_EQUAL, div);
1649
1650 emit_label (end_label);
1651 }
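/* Control-flow sketch of the emitted sequence (illustrative; SImode,
   unsigned case, label names hypothetical):

     movl  dividend, scratch
     orl   divisor, scratch	    ; scratch = dividend | divisor
     testl $-0x100, scratch
     je	   .Lqimode		    ; both operands fit in 8 bits
     divl  ...			    ; full-width divide
     jmp   .Lend
   .Lqimode:
     divb  ...			    ; 8-bit unsigned divide:
				    ; AL = quotient, AH = remainder
   .Lend:								    */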
1652
1653 /* Emit x86 binary operand CODE in mode MODE, where the first operand
1654 matches destination. RTX includes clobber of FLAGS_REG. */
1655
1656 void
1657 ix86_emit_binop (enum rtx_code code, machine_mode mode,
1658 rtx dst, rtx src)
1659 {
1660 rtx op, clob;
1661
1662 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
1663 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1664
1665 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1666 }
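/* Illustrative RTL produced by a call such as
   ix86_emit_binop (PLUS, SImode, dst, src):

     (parallel [(set (reg:SI dst) (plus:SI (reg:SI dst) (reg:SI src)))
		(clobber (reg:CC flags))])				    */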
1667
1668 /* Return true if regno1's definition is nearer to INSN than regno2's. */
1669
1670 static bool
1671 find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
1672 {
1673 rtx_insn *prev = insn;
1674 rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
1675
1676 if (insn == start)
1677 return false;
1678 while (prev && prev != start)
1679 {
1680 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
1681 {
1682 prev = PREV_INSN (prev);
1683 continue;
1684 }
1685 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
1686 return true;
1687 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
1688 return false;
1689 prev = PREV_INSN (prev);
1690 }
1691
1692 /* None of the regs is defined in the bb. */
1693 return false;
1694 }
1695
1696 /* INSN_UID of the last insn emitted by zero store peephole2s. */
1697 int ix86_last_zero_store_uid;
1698
1699 /* Split lea instructions into a sequence of instructions
1700 which are executed on the ALU to avoid AGU stalls.
1701 It is assumed that the flags register may be clobbered
1702 at the position of the lea. */
1703
1704 void
1705 ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
1706 {
1707 unsigned int regno0, regno1, regno2;
1708 struct ix86_address parts;
1709 rtx target, tmp;
1710 int ok, adds;
1711
1712 ok = ix86_decompose_address (operands[1], &parts);
1713 gcc_assert (ok);
1714
1715 target = gen_lowpart (mode, operands[0]);
1716
1717 regno0 = true_regnum (target);
1718 regno1 = INVALID_REGNUM;
1719 regno2 = INVALID_REGNUM;
1720
1721 if (parts.base)
1722 {
1723 parts.base = gen_lowpart (mode, parts.base);
1724 regno1 = true_regnum (parts.base);
1725 }
1726
1727 if (parts.index)
1728 {
1729 parts.index = gen_lowpart (mode, parts.index);
1730 regno2 = true_regnum (parts.index);
1731 }
1732
1733 if (parts.disp)
1734 parts.disp = gen_lowpart (mode, parts.disp);
1735
1736 if (parts.scale > 1)
1737 {
1738 /* Case r1 = r1 + ... */
1739 if (regno1 == regno0)
1740 {
1741 /* If we have the case r1 = r1 + C * r2 then we
1742 would have to use multiplication, which is very
1743 expensive. Assume the cost model is wrong if we
1744 see such a case here. */
1745 gcc_assert (regno2 != regno0);
1746
1747 for (adds = parts.scale; adds > 0; adds--)
1748 ix86_emit_binop (PLUS, mode, target, parts.index);
1749 }
1750 else
1751 {
1752 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
1753 if (regno0 != regno2)
1754 emit_insn (gen_rtx_SET (target, parts.index));
1755
1756 /* Use shift for scaling, but emit it as MULT instead
1757 to avoid it being immediately peephole2 optimized back
1758 into lea. */
1759 ix86_emit_binop (MULT, mode, target, GEN_INT (parts.scale));
1760
1761 if (parts.base)
1762 ix86_emit_binop (PLUS, mode, target, parts.base);
1763
1764 if (parts.disp && parts.disp != const0_rtx)
1765 ix86_emit_binop (PLUS, mode, target, parts.disp);
1766 }
1767 }
1768 else if (!parts.base && !parts.index)
1769 {
1770 gcc_assert(parts.disp);
1771 emit_insn (gen_rtx_SET (target, parts.disp));
1772 }
1773 else
1774 {
1775 if (!parts.base)
1776 {
1777 if (regno0 != regno2)
1778 emit_insn (gen_rtx_SET (target, parts.index));
1779 }
1780 else if (!parts.index)
1781 {
1782 if (regno0 != regno1)
1783 emit_insn (gen_rtx_SET (target, parts.base));
1784 }
1785 else
1786 {
1787 if (regno0 == regno1)
1788 tmp = parts.index;
1789 else if (regno0 == regno2)
1790 tmp = parts.base;
1791 else
1792 {
1793 rtx tmp1;
1794
1795 /* Find better operand for SET instruction, depending
1796 on which definition is farther from the insn. */
1797 if (find_nearest_reg_def (insn, regno1, regno2))
1798 tmp = parts.index, tmp1 = parts.base;
1799 else
1800 tmp = parts.base, tmp1 = parts.index;
1801
1802 emit_insn (gen_rtx_SET (target, tmp));
1803
1804 if (parts.disp && parts.disp != const0_rtx)
1805 ix86_emit_binop (PLUS, mode, target, parts.disp);
1806
1807 ix86_emit_binop (PLUS, mode, target, tmp1);
1808 return;
1809 }
1810
1811 ix86_emit_binop (PLUS, mode, target, tmp);
1812 }
1813
1814 if (parts.disp && parts.disp != const0_rtx)
1815 ix86_emit_binop (PLUS, mode, target, parts.disp);
1816 }
1817 }
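/* Illustrative split (hypothetical registers; the scaling step is emitted
   as a MULT and later turned back into a shift/add):

     leal 4(%ebx,%ecx,2), %eax	  is replaced by roughly
       movl %ecx, %eax		  ; target = index
       ; %eax *= 2		  ; the MULT by parts.scale
       addl %ebx, %eax		  ; + base
       addl $4, %eax		  ; + displacement			    */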
1818
1819 /* Post-reload splitter for converting an SF or DFmode value in an
1820 SSE register into an unsigned SImode. */
1821
1822 void
1823 ix86_split_convert_uns_si_sse (rtx operands[])
1824 {
1825 machine_mode vecmode;
1826 rtx value, large, zero_or_two31, input, two31, x;
1827
1828 large = operands[1];
1829 zero_or_two31 = operands[2];
1830 input = operands[3];
1831 two31 = operands[4];
1832 vecmode = GET_MODE (large);
1833 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
1834
1835 /* Load up the value into the low element. We must ensure that the other
1836 elements are valid floats -- zero is the easiest such value. */
1837 if (MEM_P (input))
1838 {
1839 if (vecmode == V4SFmode)
1840 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
1841 else
1842 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
1843 }
1844 else
1845 {
1846 input = gen_rtx_REG (vecmode, REGNO (input));
1847 emit_move_insn (value, CONST0_RTX (vecmode));
1848 if (vecmode == V4SFmode)
1849 emit_insn (gen_sse_movss_v4sf (value, value, input));
1850 else
1851 emit_insn (gen_sse2_movsd_v2df (value, value, input));
1852 }
1853
1854 emit_move_insn (large, two31);
1855 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
1856
1857 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
1858 emit_insn (gen_rtx_SET (large, x));
1859
1860 x = gen_rtx_AND (vecmode, zero_or_two31, large);
1861 emit_insn (gen_rtx_SET (zero_or_two31, x));
1862
1863 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
1864 emit_insn (gen_rtx_SET (value, x));
1865
1866 large = gen_rtx_REG (V4SImode, REGNO (large));
1867 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
1868
1869 x = gen_rtx_REG (V4SImode, REGNO (value));
1870 if (vecmode == V4SFmode)
1871 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
1872 else
1873 emit_insn (gen_sse2_cvttpd2dq (x, value));
1874 value = x;
1875
1876 emit_insn (gen_xorv4si3 (value, value, large));
1877 }
1878
1879 static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
1880 machine_mode mode, rtx target,
1881 rtx var, int one_var);
1882
1883 /* Convert an unsigned DImode value into a DFmode, using only SSE.
1884 Expects the 64-bit DImode to be supplied in a pair of integral
1885 registers. Requires SSE2; will use SSE3 if available. For x86_32,
1886 -mfpmath=sse, !optimize_size only. */
1887
1888 void
1889 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
1890 {
1891 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
1892 rtx int_xmm, fp_xmm;
1893 rtx biases, exponents;
1894 rtx x;
1895
1896 int_xmm = gen_reg_rtx (V4SImode);
1897 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
1898 emit_insn (gen_movdi_to_sse (int_xmm, input));
1899 else if (TARGET_SSE_SPLIT_REGS)
1900 {
1901 emit_clobber (int_xmm);
1902 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
1903 }
1904 else
1905 {
1906 x = gen_reg_rtx (V2DImode);
1907 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
1908 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
1909 }
1910
1911 x = gen_rtx_CONST_VECTOR (V4SImode,
1912 gen_rtvec (4, GEN_INT (0x43300000UL),
1913 GEN_INT (0x45300000UL),
1914 const0_rtx, const0_rtx));
1915 exponents = validize_mem (force_const_mem (V4SImode, x));
1916
1917 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
1918 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
1919
1920 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
1921 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
1922 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
1923 (0x1.0p84 + double(fp_value_hi_xmm)).
1924 Note these exponents differ by 32. */
1925
1926 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
1927
1928 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
1929 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
1930 real_ldexp (&bias_lo_rvt, &dconst1, 52);
1931 real_ldexp (&bias_hi_rvt, &dconst1, 84);
1932 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
1933 x = const_double_from_real_value (bias_hi_rvt, DFmode);
1934 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
1935 biases = validize_mem (force_const_mem (V2DFmode, biases));
1936 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
1937
1938 /* Add the upper and lower DFmode values together. */
1939 if (TARGET_SSE3)
1940 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
1941 else
1942 {
1943 x = copy_to_mode_reg (V2DFmode, fp_xmm);
1944 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
1945 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
1946 }
1947
1948 ix86_expand_vector_extract (false, target, fp_xmm, 0);
1949 }
1950
1951 /* Not used, but eases macroization of patterns. */
1952 void
1953 ix86_expand_convert_uns_sixf_sse (rtx, rtx)
1954 {
1955 gcc_unreachable ();
1956 }
1957
1958 static rtx ix86_expand_sse_fabs (rtx op0, rtx *smask);
1959
1960 /* Convert an unsigned SImode value into a DFmode. Only currently used
1961 for SSE, but applicable anywhere. */
1962
1963 void
1964 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
1965 {
1966 REAL_VALUE_TYPE TWO31r;
1967 rtx x, fp;
1968
1969 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
1970 NULL, 1, OPTAB_DIRECT);
1971
1972 fp = gen_reg_rtx (DFmode);
1973 emit_insn (gen_floatsidf2 (fp, x));
1974
1975 real_ldexp (&TWO31r, &dconst1, 31);
1976 x = const_double_from_real_value (TWO31r, DFmode);
1977
1978 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
1979
1980 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
1981 if (HONOR_SIGNED_ZEROS (DFmode) && flag_rounding_math)
1982 x = ix86_expand_sse_fabs (x, NULL);
1983
1984 if (x != target)
1985 emit_move_insn (target, x);
1986 }
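
/* Scalar sketch of the expansion above (illustration only, assuming the
   usual two's-complement wraparound):

     double u32_to_double (uint32_t x)
     {
       int32_t biased = (int32_t) (x + 0x80000000u);  // i.e. x - 2^31
       return (double) biased + 0x1.0p31;             // DFmode holds 2^31 exactly
     }

   so only the ordinary signed SImode->DFmode conversion is needed.  */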
1987
1988 /* Convert a signed DImode value into a DFmode. Only used for SSE in
1989 32-bit mode; otherwise we have a direct convert instruction. */
1990
1991 void
1992 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
1993 {
1994 REAL_VALUE_TYPE TWO32r;
1995 rtx fp_lo, fp_hi, x;
1996
1997 fp_lo = gen_reg_rtx (DFmode);
1998 fp_hi = gen_reg_rtx (DFmode);
1999
2000 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
2001
2002 real_ldexp (&TWO32r, &dconst1, 32);
2003 x = const_double_from_real_value (TWO32r, DFmode);
2004 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
2005
2006 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
2007
2008 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
2009 0, OPTAB_DIRECT);
2010 if (x != target)
2011 emit_move_insn (target, x);
2012 }
2013
2014 /* Convert an unsigned SImode value into a SFmode, using only SSE.
2015 For x86_32, -mfpmath=sse, !optimize_size only. */
2016 void
2017 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
2018 {
2019 REAL_VALUE_TYPE ONE16r;
2020 rtx fp_hi, fp_lo, int_hi, int_lo, x;
2021
2022 real_ldexp (&ONE16r, &dconst1, 16);
2023 x = const_double_from_real_value (ONE16r, SFmode);
2024 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
2025 NULL, 0, OPTAB_DIRECT);
2026 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
2027 NULL, 0, OPTAB_DIRECT);
2028 fp_hi = gen_reg_rtx (SFmode);
2029 fp_lo = gen_reg_rtx (SFmode);
2030 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
2031 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
2032 if (TARGET_FMA)
2033 {
2034 x = validize_mem (force_const_mem (SFmode, x));
2035 fp_hi = gen_rtx_FMA (SFmode, fp_hi, x, fp_lo);
2036 emit_move_insn (target, fp_hi);
2037 }
2038 else
2039 {
2040 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
2041 0, OPTAB_DIRECT);
2042 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
2043 0, OPTAB_DIRECT);
2044 if (!rtx_equal_p (target, fp_hi))
2045 emit_move_insn (target, fp_hi);
2046 }
2047 }
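
/* Scalar sketch of the 16-bit split used above (illustration only):

     float u32_to_float (uint32_t x)
     {
       float lo = (float) (int) (x & 0xffff);   // exact, fits the mantissa
       float hi = (float) (int) (x >> 16);      // exact
       return hi * 0x1.0p16f + lo;              // fmaf (hi, 0x1.0p16f, lo) with FMA
     }

   The multiply is exact, so the only rounding happens in the final add
   (or in the fused multiply-add when TARGET_FMA is used).  */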
2048
2049 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
2050 a vector of unsigned ints VAL to vector of floats TARGET. */
2051
2052 void
2053 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
2054 {
2055 rtx tmp[8];
2056 REAL_VALUE_TYPE TWO16r;
2057 machine_mode intmode = GET_MODE (val);
2058 machine_mode fltmode = GET_MODE (target);
2059 rtx (*cvt) (rtx, rtx);
2060
2061 if (intmode == V4SImode)
2062 cvt = gen_floatv4siv4sf2;
2063 else
2064 cvt = gen_floatv8siv8sf2;
2065 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
2066 tmp[0] = force_reg (intmode, tmp[0]);
2067 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
2068 OPTAB_DIRECT);
2069 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
2070 NULL_RTX, 1, OPTAB_DIRECT);
2071 tmp[3] = gen_reg_rtx (fltmode);
2072 emit_insn (cvt (tmp[3], tmp[1]));
2073 tmp[4] = gen_reg_rtx (fltmode);
2074 emit_insn (cvt (tmp[4], tmp[2]));
2075 real_ldexp (&TWO16r, &dconst1, 16);
2076 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
2077 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
2078 if (TARGET_FMA)
2079 {
2080 tmp[6] = gen_rtx_FMA (fltmode, tmp[4], tmp[5], tmp[3]);
2081 emit_move_insn (target, tmp[6]);
2082 }
2083 else
2084 {
2085 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5],
2086 NULL_RTX, 1, OPTAB_DIRECT);
2087 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6],
2088 target, 1, OPTAB_DIRECT);
2089 if (tmp[7] != target)
2090 emit_move_insn (target, tmp[7]);
2091 }
2092 }
2093
2094 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
2095 pattern can be used on it instead of fixuns_trunc*.
2096 This is done by doing just signed conversion if < 0x1p31, and otherwise by
2097 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
2098
2099 rtx
2100 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
2101 {
2102 REAL_VALUE_TYPE TWO31r;
2103 rtx two31r, tmp[4];
2104 machine_mode mode = GET_MODE (val);
2105 machine_mode scalarmode = GET_MODE_INNER (mode);
2106 machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
2107 rtx (*cmp) (rtx, rtx, rtx, rtx);
2108 int i;
2109
2110 for (i = 0; i < 3; i++)
2111 tmp[i] = gen_reg_rtx (mode);
2112 real_ldexp (&TWO31r, &dconst1, 31);
2113 two31r = const_double_from_real_value (TWO31r, scalarmode);
2114 two31r = ix86_build_const_vector (mode, 1, two31r);
2115 two31r = force_reg (mode, two31r);
2116 switch (mode)
2117 {
2118 case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
2119 case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
2120 case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
2121 case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
2122 default: gcc_unreachable ();
2123 }
2124 tmp[3] = gen_rtx_LE (mode, two31r, val);
2125 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
2126 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
2127 0, OPTAB_DIRECT);
2128 if (intmode == V4SImode || TARGET_AVX2)
2129 *xorp = expand_simple_binop (intmode, ASHIFT,
2130 gen_lowpart (intmode, tmp[0]),
2131 GEN_INT (31), NULL_RTX, 0,
2132 OPTAB_DIRECT);
2133 else
2134 {
2135 rtx two31 = gen_int_mode (HOST_WIDE_INT_1U << 31, SImode);
2136 two31 = ix86_build_const_vector (intmode, 1, two31);
2137 *xorp = expand_simple_binop (intmode, AND,
2138 gen_lowpart (intmode, tmp[0]),
2139 two31, NULL_RTX, 0,
2140 OPTAB_DIRECT);
2141 }
2142 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
2143 0, OPTAB_DIRECT);
2144 }
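
/* Per element, the branch-free code above computes roughly (illustrative
   sketch, assuming 0 <= x < 0x1.0p32f):

     uint32_t f_to_u32 (float x)
     {
       if (x < 0x1.0p31f)
         return (uint32_t) (int32_t) x;          // plain signed truncation
       return (uint32_t) (int32_t) (x - 0x1.0p31f) ^ 0x80000000u;
     }

   with the compare producing the all-ones mask that drives both the
   subtraction and the value handed back through *XORP.  */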
2145
2146 /* Generate code for floating point ABS or NEG. */
2147
2148 void
2149 ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
2150 rtx operands[])
2151 {
2152 rtx set, dst, src;
2153 bool use_sse = false;
2154 bool vector_mode = VECTOR_MODE_P (mode);
2155 machine_mode vmode = mode;
2156 rtvec par;
2157
2158 if (vector_mode || mode == TFmode || mode == HFmode)
2159 {
2160 use_sse = true;
2161 if (mode == HFmode)
2162 vmode = V8HFmode;
2163 }
2164 else if (TARGET_SSE_MATH)
2165 {
2166 use_sse = SSE_FLOAT_MODE_P (mode);
2167 if (mode == SFmode)
2168 vmode = V4SFmode;
2169 else if (mode == DFmode)
2170 vmode = V2DFmode;
2171 }
2172
2173 dst = operands[0];
2174 src = operands[1];
2175
2176 set = gen_rtx_fmt_e (code, mode, src);
2177 set = gen_rtx_SET (dst, set);
2178
2179 if (use_sse)
2180 {
2181 rtx mask, use, clob;
2182
2183 /* NEG and ABS performed with SSE use bitwise mask operations.
2184 Create the appropriate mask now. */
2185 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
2186 use = gen_rtx_USE (VOIDmode, mask);
2187 if (vector_mode || mode == TFmode)
2188 par = gen_rtvec (2, set, use);
2189 else
2190 {
2191 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
2192 par = gen_rtvec (3, set, use, clob);
2193 }
2194 }
2195 else
2196 {
2197 rtx clob;
2198
2199 /* Changing the sign of FP values can be done using the integer unit too. */
2200 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
2201 par = gen_rtvec (2, set, clob);
2202 }
2203
2204 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
2205 }
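
/* The masks built above implement the usual sign-bit tricks, e.g. for
   SFmode viewed as bits (sketch only):

     u &= 0x7fffffff;   // ABS: clear the sign bit
     u ^= 0x80000000;   // NEG: flip the sign bit

   applied across the whole vector (or the padded scalar) in VMODE.  */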
2206
2207 /* Deconstruct a floating point ABS or NEG operation
2208 with integer registers into integer operations. */
2209
2210 void
2211 ix86_split_fp_absneg_operator (enum rtx_code code, machine_mode mode,
2212 rtx operands[])
2213 {
2214 enum rtx_code absneg_op;
2215 rtx dst, set;
2216
2217 gcc_assert (operands_match_p (operands[0], operands[1]));
2218
2219 switch (mode)
2220 {
2221 case E_SFmode:
2222 dst = gen_lowpart (SImode, operands[0]);
2223
2224 if (code == ABS)
2225 {
2226 set = gen_int_mode (0x7fffffff, SImode);
2227 absneg_op = AND;
2228 }
2229 else
2230 {
2231 set = gen_int_mode (0x80000000, SImode);
2232 absneg_op = XOR;
2233 }
2234 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
2235 break;
2236
2237 case E_DFmode:
2238 if (TARGET_64BIT)
2239 {
2240 dst = gen_lowpart (DImode, operands[0]);
2241 dst = gen_rtx_ZERO_EXTRACT (DImode, dst, const1_rtx, GEN_INT (63));
2242
2243 if (code == ABS)
2244 set = const0_rtx;
2245 else
2246 set = gen_rtx_NOT (DImode, dst);
2247 }
2248 else
2249 {
2250 dst = gen_highpart (SImode, operands[0]);
2251
2252 if (code == ABS)
2253 {
2254 set = gen_int_mode (0x7fffffff, SImode);
2255 absneg_op = AND;
2256 }
2257 else
2258 {
2259 set = gen_int_mode (0x80000000, SImode);
2260 absneg_op = XOR;
2261 }
2262 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
2263 }
2264 break;
2265
2266 case E_XFmode:
2267 dst = gen_rtx_REG (SImode,
2268 REGNO (operands[0]) + (TARGET_64BIT ? 1 : 2));
2269 if (code == ABS)
2270 {
2271 set = GEN_INT (0x7fff);
2272 absneg_op = AND;
2273 }
2274 else
2275 {
2276 set = GEN_INT (0x8000);
2277 absneg_op = XOR;
2278 }
2279 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
2280 break;
2281
2282 default:
2283 gcc_unreachable ();
2284 }
2285
2286 set = gen_rtx_SET (dst, set);
2287
2288 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
2289 rtvec par = gen_rtvec (2, set, clob);
2290
2291 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
2292 }
2293
2294 /* Expand a copysign operation. Special case operand 0 being a constant. */
2295
2296 void
2297 ix86_expand_copysign (rtx operands[])
2298 {
2299 machine_mode mode, vmode;
2300 rtx dest, vdest, op0, op1, mask, op2, op3;
2301
2302 mode = GET_MODE (operands[0]);
2303
2304 if (mode == HFmode)
2305 vmode = V8HFmode;
2306 else if (mode == SFmode)
2307 vmode = V4SFmode;
2308 else if (mode == DFmode)
2309 vmode = V2DFmode;
2310 else if (mode == TFmode)
2311 vmode = mode;
2312 else
2313 gcc_unreachable ();
2314
2315 if (rtx_equal_p (operands[1], operands[2]))
2316 {
2317 emit_move_insn (operands[0], operands[1]);
2318 return;
2319 }
2320
2321 dest = operands[0];
2322 vdest = lowpart_subreg (vmode, dest, mode);
2323 if (vdest == NULL_RTX)
2324 vdest = gen_reg_rtx (vmode);
2325 else
2326 dest = NULL_RTX;
2327 op1 = lowpart_subreg (vmode, force_reg (mode, operands[2]), mode);
2328 mask = ix86_build_signbit_mask (vmode, TARGET_AVX512F && mode != HFmode, 0);
2329
2330 if (CONST_DOUBLE_P (operands[1]))
2331 {
2332 op0 = simplify_unary_operation (ABS, mode, operands[1], mode);
2333 /* Optimize for 0, simplify b = copy_signf (0.0f, a) to b = mask & a. */
2334 if (op0 == CONST0_RTX (mode))
2335 {
2336 emit_move_insn (vdest, gen_rtx_AND (vmode, mask, op1));
2337 if (dest)
2338 emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
2339 return;
2340 }
2341
2342 if (GET_MODE_SIZE (mode) < 16)
2343 op0 = ix86_build_const_vector (vmode, false, op0);
2344 op0 = force_reg (vmode, op0);
2345 }
2346 else
2347 op0 = lowpart_subreg (vmode, force_reg (mode, operands[1]), mode);
2348
2349 op2 = gen_reg_rtx (vmode);
2350 op3 = gen_reg_rtx (vmode);
2351 emit_move_insn (op2, gen_rtx_AND (vmode,
2352 gen_rtx_NOT (vmode, mask),
2353 op0));
2354 emit_move_insn (op3, gen_rtx_AND (vmode, mask, op1));
2355 emit_move_insn (vdest, gen_rtx_IOR (vmode, op2, op3));
2356 if (dest)
2357 emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
2358 }
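
/* Bitwise sketch of the general case above (illustration only):

     result = (bits (operands[1]) & ~signmask) | (bits (operands[2]) & signmask);

   i.e. magnitude from operands[1], sign from operands[2].  When operands[1]
   is +/-0.0 the first term vanishes, which is the CONST_DOUBLE shortcut
   handled above.  */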
2359
2360 /* Expand an xorsign operation. */
2361
2362 void
2363 ix86_expand_xorsign (rtx operands[])
2364 {
2365 machine_mode mode, vmode;
2366 rtx dest, vdest, op0, op1, mask, x, temp;
2367
2368 dest = operands[0];
2369 op0 = operands[1];
2370 op1 = operands[2];
2371
2372 mode = GET_MODE (dest);
2373
2374 if (mode == HFmode)
2375 vmode = V8HFmode;
2376 else if (mode == SFmode)
2377 vmode = V4SFmode;
2378 else if (mode == DFmode)
2379 vmode = V2DFmode;
2380 else
2381 gcc_unreachable ();
2382
2383 temp = gen_reg_rtx (vmode);
2384 mask = ix86_build_signbit_mask (vmode, 0, 0);
2385
2386 op1 = lowpart_subreg (vmode, force_reg (mode, op1), mode);
2387 x = gen_rtx_AND (vmode, op1, mask);
2388 emit_insn (gen_rtx_SET (temp, x));
2389
2390 op0 = lowpart_subreg (vmode, force_reg (mode, op0), mode);
2391 x = gen_rtx_XOR (vmode, temp, op0);
2392
2393 vdest = lowpart_subreg (vmode, dest, mode);
2394 if (vdest == NULL_RTX)
2395 vdest = gen_reg_rtx (vmode);
2396 else
2397 dest = NULL_RTX;
2398 emit_insn (gen_rtx_SET (vdest, x));
2399
2400 if (dest)
2401 emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
2402 }
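
/* Sketch: xorsign (a, b) = a ^ (b & signmask), i.e. flip a's sign bit iff
   b is negative.  Unlike copysign no masking of a's own sign is needed,
   since the operation only toggles the sign.  (Illustration only.)  */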
2403
2404 static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1);
2405
2406 void
2407 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
2408 {
2409 machine_mode mode = GET_MODE (op0);
2410 rtx tmp;
2411
2412 /* Handle the special case of a vector comparison with boolean result;
2413 transform it using the ptest instruction. */
2414 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
2415 || (mode == TImode && !TARGET_64BIT)
2416 || mode == OImode)
2417 {
2418 rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
2419 machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
2420
2421 gcc_assert (code == EQ || code == NE);
2422
2423 if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
2424 {
2425 op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
2426 op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
2427 mode = p_mode;
2428 }
2429 /* Generate XOR since we can't check that one operand is a zero vector. */
2430 tmp = gen_reg_rtx (mode);
2431 emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
2432 tmp = gen_lowpart (p_mode, tmp);
2433 emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
2434 gen_rtx_UNSPEC (CCZmode,
2435 gen_rtvec (2, tmp, tmp),
2436 UNSPEC_PTEST)));
2437 tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
2438 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2439 gen_rtx_LABEL_REF (VOIDmode, label),
2440 pc_rtx);
2441 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2442 return;
2443 }
2444
2445 switch (mode)
2446 {
2447 case E_HFmode:
2448 case E_SFmode:
2449 case E_DFmode:
2450 case E_XFmode:
2451 case E_QImode:
2452 case E_HImode:
2453 case E_SImode:
2454 simple:
2455 tmp = ix86_expand_compare (code, op0, op1);
2456 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2457 gen_rtx_LABEL_REF (VOIDmode, label),
2458 pc_rtx);
2459 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2460 return;
2461
2462 case E_DImode:
2463 if (TARGET_64BIT)
2464 goto simple;
2465 /* FALLTHRU */
2466 case E_TImode:
2467 /* DI and TI mode equality/inequality comparisons may be performed
2468 on SSE registers. Avoid splitting them, except when optimizing
2469 for size. */
2470 if ((code == EQ || code == NE)
2471 && !optimize_insn_for_size_p ())
2472 goto simple;
2473
2474 /* Expand DImode branch into multiple compare+branch. */
2475 {
2476 rtx lo[2], hi[2];
2477 rtx_code_label *label2;
2478 enum rtx_code code1, code2, code3;
2479 machine_mode submode;
2480
2481 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
2482 {
2483 std::swap (op0, op1);
2484 code = swap_condition (code);
2485 }
2486
2487 split_double_mode (mode, &op0, 1, lo+0, hi+0);
2488 split_double_mode (mode, &op1, 1, lo+1, hi+1);
2489
2490 submode = mode == DImode ? SImode : DImode;
2491
2492 /* If we are doing less-than or greater-or-equal-than,
2493 op1 is a constant and the low word is zero, then we can just
2494 examine the high word. Similarly for low word -1 and
2495 less-or-equal-than or greater-than. */
2496
2497 if (CONST_INT_P (hi[1]))
2498 switch (code)
2499 {
2500 case LT: case LTU: case GE: case GEU:
2501 if (lo[1] == const0_rtx)
2502 {
2503 ix86_expand_branch (code, hi[0], hi[1], label);
2504 return;
2505 }
2506 break;
2507 case LE: case LEU: case GT: case GTU:
2508 if (lo[1] == constm1_rtx)
2509 {
2510 ix86_expand_branch (code, hi[0], hi[1], label);
2511 return;
2512 }
2513 break;
2514 default:
2515 break;
2516 }
2517
2518 /* Emulate comparisons that do not depend on Zero flag with
2519 double-word subtraction. Note that only Overflow, Sign
2520 and Carry flags are valid, so swap arguments and condition
2521 of comparisons that would otherwise test Zero flag. */
2522
2523 switch (code)
2524 {
2525 case LE: case LEU: case GT: case GTU:
2526 std::swap (lo[0], lo[1]);
2527 std::swap (hi[0], hi[1]);
2528 code = swap_condition (code);
2529 /* FALLTHRU */
2530
2531 case LT: case LTU: case GE: case GEU:
2532 {
2533 bool uns = (code == LTU || code == GEU);
2534 rtx (*sbb_insn) (machine_mode, rtx, rtx, rtx)
2535 = uns ? gen_sub3_carry_ccc : gen_sub3_carry_ccgz;
2536
2537 if (!nonimmediate_operand (lo[0], submode))
2538 lo[0] = force_reg (submode, lo[0]);
2539 if (!x86_64_general_operand (lo[1], submode))
2540 lo[1] = force_reg (submode, lo[1]);
2541
2542 if (!register_operand (hi[0], submode))
2543 hi[0] = force_reg (submode, hi[0]);
2544 if ((uns && !nonimmediate_operand (hi[1], submode))
2545 || (!uns && !x86_64_general_operand (hi[1], submode)))
2546 hi[1] = force_reg (submode, hi[1]);
2547
2548 emit_insn (gen_cmp_1 (submode, lo[0], lo[1]));
2549
2550 tmp = gen_rtx_SCRATCH (submode);
2551 emit_insn (sbb_insn (submode, tmp, hi[0], hi[1]));
2552
2553 tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
2554 ix86_expand_branch (code, tmp, const0_rtx, label);
2555 return;
2556 }
2557
2558 default:
2559 break;
2560 }
2561
2562 /* Otherwise, we need two or three jumps. */
2563
2564 label2 = gen_label_rtx ();
2565
2566 code1 = code;
2567 code2 = swap_condition (code);
2568 code3 = unsigned_condition (code);
2569
2570 switch (code)
2571 {
2572 case LT: case GT: case LTU: case GTU:
2573 break;
2574
2575 case LE: code1 = LT; code2 = GT; break;
2576 case GE: code1 = GT; code2 = LT; break;
2577 case LEU: code1 = LTU; code2 = GTU; break;
2578 case GEU: code1 = GTU; code2 = LTU; break;
2579
2580 case EQ: code1 = UNKNOWN; code2 = NE; break;
2581 case NE: code2 = UNKNOWN; break;
2582
2583 default:
2584 gcc_unreachable ();
2585 }
2586
2587 /*
2588 * a < b =>
2589 * if (hi(a) < hi(b)) goto true;
2590 * if (hi(a) > hi(b)) goto false;
2591 * if (lo(a) < lo(b)) goto true;
2592 * false:
2593 */
2594
2595 if (code1 != UNKNOWN)
2596 ix86_expand_branch (code1, hi[0], hi[1], label);
2597 if (code2 != UNKNOWN)
2598 ix86_expand_branch (code2, hi[0], hi[1], label2);
2599
2600 ix86_expand_branch (code3, lo[0], lo[1], label);
2601
2602 if (code2 != UNKNOWN)
2603 emit_label (label2);
2604 return;
2605 }
2606
2607 default:
2608 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
2609 goto simple;
2610 }
2611 }
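
/* As a concrete example, the sbb path above decides an unsigned 64-bit
   "a < b" on a 32-bit target roughly as (AT&T-style sketch):

     cmpl  b_lo, a_lo     // a_lo - b_lo, sets CF on borrow
     sbbl  b_hi, a_hi     // a_hi - b_hi - CF; only the flags are kept
     jb    label          // the final borrow is the unsigned a < b

   Signed variants use the same subtraction but test the sign/overflow
   flags (CCGZmode) instead of the carry.  */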
2612
2613 /* Figure out whether to use unordered fp comparisons. */
2614
2615 static bool
2616 ix86_unordered_fp_compare (enum rtx_code code)
2617 {
2618 if (!TARGET_IEEE_FP)
2619 return false;
2620
2621 switch (code)
2622 {
2623 case LT:
2624 case LE:
2625 case GT:
2626 case GE:
2627 case LTGT:
2628 return false;
2629
2630 case EQ:
2631 case NE:
2632
2633 case UNORDERED:
2634 case ORDERED:
2635 case UNLT:
2636 case UNLE:
2637 case UNGT:
2638 case UNGE:
2639 case UNEQ:
2640 return true;
2641
2642 default:
2643 gcc_unreachable ();
2644 }
2645 }
2646
2647 /* Return a comparison we can do that is equivalent to
2648 swap_condition (code), apart possibly from orderedness.
2649 But never change orderedness if TARGET_IEEE_FP, returning
2650 UNKNOWN in that case if necessary. */
2651
2652 static enum rtx_code
2653 ix86_fp_swap_condition (enum rtx_code code)
2654 {
2655 switch (code)
2656 {
2657 case GT: /* GTU - CF=0 & ZF=0 */
2658 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
2659 case GE: /* GEU - CF=0 */
2660 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
2661 case UNLT: /* LTU - CF=1 */
2662 return TARGET_IEEE_FP ? UNKNOWN : GT;
2663 case UNLE: /* LEU - CF=1 | ZF=1 */
2664 return TARGET_IEEE_FP ? UNKNOWN : GE;
2665 default:
2666 return swap_condition (code);
2667 }
2668 }
2669
2670 /* Return the cost of comparison CODE using the best strategy for performance.
2671 All the following functions use the number of instructions as the cost metric.
2672 In the future this should be tweaked to compute bytes for optimize_size and
2673 take into account the performance of various instructions on various CPUs. */
2674
2675 static int
2676 ix86_fp_comparison_cost (enum rtx_code code)
2677 {
2678 int arith_cost;
2679
2680 /* The cost of code using bit-twiddling on %ah. */
2681 switch (code)
2682 {
2683 case UNLE:
2684 case UNLT:
2685 case LTGT:
2686 case GT:
2687 case GE:
2688 case UNORDERED:
2689 case ORDERED:
2690 case UNEQ:
2691 arith_cost = 4;
2692 break;
2693 case LT:
2694 case NE:
2695 case EQ:
2696 case UNGE:
2697 arith_cost = TARGET_IEEE_FP ? 5 : 4;
2698 break;
2699 case LE:
2700 case UNGT:
2701 arith_cost = TARGET_IEEE_FP ? 6 : 4;
2702 break;
2703 default:
2704 gcc_unreachable ();
2705 }
2706
2707 switch (ix86_fp_comparison_strategy (code))
2708 {
2709 case IX86_FPCMP_COMI:
2710 return arith_cost > 4 ? 3 : 2;
2711 case IX86_FPCMP_SAHF:
2712 return arith_cost > 4 ? 4 : 3;
2713 default:
2714 return arith_cost;
2715 }
2716 }
2717
2718 /* Swap, force into registers, or otherwise massage the two operands
2719 to a fp comparison. The operands are updated in place; the new
2720 comparison code is returned. */
2721
2722 static enum rtx_code
2723 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
2724 {
2725 bool unordered_compare = ix86_unordered_fp_compare (code);
2726 rtx op0 = *pop0, op1 = *pop1;
2727 machine_mode op_mode = GET_MODE (op0);
2728 bool is_sse = SSE_FLOAT_MODE_SSEMATH_OR_HF_P (op_mode);
2729
2730 if (op_mode == BFmode)
2731 {
2732 rtx op = gen_lowpart (HImode, op0);
2733 if (CONST_INT_P (op))
2734 op = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
2735 op0, BFmode);
2736 else
2737 {
2738 rtx t1 = gen_reg_rtx (SImode);
2739 emit_insn (gen_zero_extendhisi2 (t1, op));
2740 emit_insn (gen_ashlsi3 (t1, t1, GEN_INT (16)));
2741 op = gen_lowpart (SFmode, t1);
2742 }
2743 *pop0 = op;
2744 op = gen_lowpart (HImode, op1);
2745 if (CONST_INT_P (op))
2746 op = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
2747 op1, BFmode);
2748 else
2749 {
2750 rtx t1 = gen_reg_rtx (SImode);
2751 emit_insn (gen_zero_extendhisi2 (t1, op));
2752 emit_insn (gen_ashlsi3 (t1, t1, GEN_INT (16)));
2753 op = gen_lowpart (SFmode, t1);
2754 }
2755 *pop1 = op;
2756 return ix86_prepare_fp_compare_args (code, pop0, pop1);
2757 }
2758
2759 /* All of the unordered compare instructions only work on registers.
2760 The same is true of the fcomi compare instructions. The XFmode
2761 compare instructions require registers except when comparing
2762 against zero or when converting operand 1 from fixed point to
2763 floating point. */
2764
2765 if (!is_sse
2766 && (unordered_compare
2767 || (op_mode == XFmode
2768 && ! (standard_80387_constant_p (op0) == 1
2769 || standard_80387_constant_p (op1) == 1)
2770 && GET_CODE (op1) != FLOAT)
2771 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
2772 {
2773 op0 = force_reg (op_mode, op0);
2774 op1 = force_reg (op_mode, op1);
2775 }
2776 else
2777 {
2778 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
2779 things around if they appear profitable, otherwise force op0
2780 into a register. */
2781
2782 if (standard_80387_constant_p (op0) == 0
2783 || (MEM_P (op0)
2784 && ! (standard_80387_constant_p (op1) == 0
2785 || MEM_P (op1))))
2786 {
2787 enum rtx_code new_code = ix86_fp_swap_condition (code);
2788 if (new_code != UNKNOWN)
2789 {
2790 std::swap (op0, op1);
2791 code = new_code;
2792 }
2793 }
2794
2795 if (!REG_P (op0))
2796 op0 = force_reg (op_mode, op0);
2797
2798 if (CONSTANT_P (op1))
2799 {
2800 int tmp = standard_80387_constant_p (op1);
2801 if (tmp == 0)
2802 op1 = validize_mem (force_const_mem (op_mode, op1));
2803 else if (tmp == 1)
2804 {
2805 if (TARGET_CMOVE)
2806 op1 = force_reg (op_mode, op1);
2807 }
2808 else
2809 op1 = force_reg (op_mode, op1);
2810 }
2811 }
2812
2813 /* Try to rearrange the comparison to make it cheaper. */
2814 if (ix86_fp_comparison_cost (code)
2815 > ix86_fp_comparison_cost (swap_condition (code))
2816 && (REG_P (op1) || can_create_pseudo_p ()))
2817 {
2818 std::swap (op0, op1);
2819 code = swap_condition (code);
2820 if (!REG_P (op0))
2821 op0 = force_reg (op_mode, op0);
2822 }
2823
2824 *pop0 = op0;
2825 *pop1 = op1;
2826 return code;
2827 }
2828
2829 /* Generate insn patterns to do a floating point compare of OPERANDS. */
2830
2831 static rtx
2832 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)
2833 {
2834 bool unordered_compare = ix86_unordered_fp_compare (code);
2835 machine_mode cmp_mode;
2836 rtx tmp, scratch;
2837
2838 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
2839
2840 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
2841 if (unordered_compare)
2842 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
2843
2844 /* Do fcomi/sahf based test when profitable. */
2845 switch (ix86_fp_comparison_strategy (code))
2846 {
2847 case IX86_FPCMP_COMI:
2848 cmp_mode = CCFPmode;
2849 emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
2850 break;
2851
2852 case IX86_FPCMP_SAHF:
2853 cmp_mode = CCFPmode;
2854 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2855 scratch = gen_reg_rtx (HImode);
2856 emit_insn (gen_rtx_SET (scratch, tmp));
2857 emit_insn (gen_x86_sahf_1 (scratch));
2858 break;
2859
2860 case IX86_FPCMP_ARITH:
2861 cmp_mode = CCNOmode;
2862 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2863 scratch = gen_reg_rtx (HImode);
2864 emit_insn (gen_rtx_SET (scratch, tmp));
2865
2866 /* In the unordered case, we have to check C2 for NaN's, which
2867 doesn't happen to work out to anything nice combination-wise.
2868 So do some bit twiddling on the value we've got in AH to come
2869 up with an appropriate set of condition codes. */
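/* For reference, after fnstsw the copy of the status word placed in %ah has
   C0 = 0x01 (below), C2 = 0x04 (unordered) and C3 = 0x40 (equal), so e.g.
   the GT case below is "test $0x45, %ah; je": taken only when none of
   C0/C2/C3 is set, i.e. op0 > op1 and the operands are ordered.  */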
2870
2871 switch (code)
2872 {
2873 case GT:
2874 case UNGT:
2875 if (code == GT || !TARGET_IEEE_FP)
2876 {
2877 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2878 code = EQ;
2879 }
2880 else
2881 {
2882 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2883 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2884 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
2885 cmp_mode = CCmode;
2886 code = GEU;
2887 }
2888 break;
2889 case LT:
2890 case UNLT:
2891 if (code == LT && TARGET_IEEE_FP)
2892 {
2893 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2894 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
2895 cmp_mode = CCmode;
2896 code = EQ;
2897 }
2898 else
2899 {
2900 emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
2901 code = NE;
2902 }
2903 break;
2904 case GE:
2905 case UNGE:
2906 if (code == GE || !TARGET_IEEE_FP)
2907 {
2908 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
2909 code = EQ;
2910 }
2911 else
2912 {
2913 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2914 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
2915 code = NE;
2916 }
2917 break;
2918 case LE:
2919 case UNLE:
2920 if (code == LE && TARGET_IEEE_FP)
2921 {
2922 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2923 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2924 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2925 cmp_mode = CCmode;
2926 code = LTU;
2927 }
2928 else
2929 {
2930 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2931 code = NE;
2932 }
2933 break;
2934 case EQ:
2935 case UNEQ:
2936 if (code == EQ && TARGET_IEEE_FP)
2937 {
2938 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2939 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2940 cmp_mode = CCmode;
2941 code = EQ;
2942 }
2943 else
2944 {
2945 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2946 code = NE;
2947 }
2948 break;
2949 case NE:
2950 case LTGT:
2951 if (code == NE && TARGET_IEEE_FP)
2952 {
2953 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2954 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
2955 GEN_INT (0x40)));
2956 code = NE;
2957 }
2958 else
2959 {
2960 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2961 code = EQ;
2962 }
2963 break;
2964
2965 case UNORDERED:
2966 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2967 code = NE;
2968 break;
2969 case ORDERED:
2970 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2971 code = EQ;
2972 break;
2973
2974 default:
2975 gcc_unreachable ();
2976 }
2977 break;
2978
2979 default:
2980 gcc_unreachable ();
2981 }
2982
2983 /* Return the test that should be put into the flags user, i.e.
2984 the bcc, scc, or cmov instruction. */
2985 return gen_rtx_fmt_ee (code, VOIDmode,
2986 gen_rtx_REG (cmp_mode, FLAGS_REG),
2987 const0_rtx);
2988 }
2989
2990 /* Generate insn patterns to do an integer compare of OPERANDS. */
2991
2992 static rtx
2993 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
2994 {
2995 machine_mode cmpmode;
2996 rtx tmp, flags;
2997
2998 /* Swap operands to emit carry flag comparison. */
2999 if ((code == GTU || code == LEU)
3000 && nonimmediate_operand (op1, VOIDmode))
3001 {
3002 std::swap (op0, op1);
3003 code = swap_condition (code);
3004 }
3005
3006 cmpmode = SELECT_CC_MODE (code, op0, op1);
3007 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
3008
3009 /* Attempt to use PTEST, if available, when testing vector modes for
3010 equality/inequality against zero. */
3011 if (op1 == const0_rtx
3012 && SUBREG_P (op0)
3013 && cmpmode == CCZmode
3014 && SUBREG_BYTE (op0) == 0
3015 && REG_P (SUBREG_REG (op0))
3016 && VECTOR_MODE_P (GET_MODE (SUBREG_REG (op0)))
3017 && TARGET_SSE4_1
3018 && GET_MODE (op0) == TImode
3019 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op0))) == 16)
3020 {
3021 tmp = SUBREG_REG (op0);
3022 tmp = gen_rtx_UNSPEC (CCZmode, gen_rtvec (2, tmp, tmp), UNSPEC_PTEST);
3023 }
3024 else
3025 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
3026
3027 /* This is very simple, but making the interface the same as in the
3028 FP case makes the rest of the code easier. */
3029 emit_insn (gen_rtx_SET (flags, tmp));
3030
3031 /* Return the test that should be put into the flags user, i.e.
3032 the bcc, scc, or cmov instruction. */
3033 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
3034 }
3035
3036 static rtx
3037 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
3038 {
3039 rtx ret;
3040
3041 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
3042 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
3043
3044 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
3045 {
3046 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
3047 ret = ix86_expand_fp_compare (code, op0, op1);
3048 }
3049 else
3050 ret = ix86_expand_int_compare (code, op0, op1);
3051
3052 return ret;
3053 }
3054
3055 void
3056 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
3057 {
3058 rtx ret;
3059
3060 gcc_assert (GET_MODE (dest) == QImode);
3061
3062 ret = ix86_expand_compare (code, op0, op1);
3063 PUT_MODE (ret, QImode);
3064 emit_insn (gen_rtx_SET (dest, ret));
3065 }
3066
3067 /* Expand floating point op0 <=> op1, i.e.
3068 dest = op0 == op1 ? 0 : op0 < op1 ? -1 : op0 > op1 ? 1 : 2. */
3069
3070 void
3071 ix86_expand_fp_spaceship (rtx dest, rtx op0, rtx op1)
3072 {
3073 gcc_checking_assert (ix86_fp_comparison_strategy (GT) != IX86_FPCMP_ARITH);
3074 rtx gt = ix86_expand_fp_compare (GT, op0, op1);
3075 rtx l0 = gen_label_rtx ();
3076 rtx l1 = gen_label_rtx ();
3077 rtx l2 = TARGET_IEEE_FP ? gen_label_rtx () : NULL_RTX;
3078 rtx lend = gen_label_rtx ();
3079 rtx tmp;
3080 rtx_insn *jmp;
3081 if (l2)
3082 {
3083 rtx un = gen_rtx_fmt_ee (UNORDERED, VOIDmode,
3084 gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx);
3085 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, un,
3086 gen_rtx_LABEL_REF (VOIDmode, l2), pc_rtx);
3087 jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
3088 add_reg_br_prob_note (jmp, profile_probability::very_unlikely ());
3089 }
3090 rtx eq = gen_rtx_fmt_ee (UNEQ, VOIDmode,
3091 gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx);
3092 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, eq,
3093 gen_rtx_LABEL_REF (VOIDmode, l0), pc_rtx);
3094 jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
3095 add_reg_br_prob_note (jmp, profile_probability::unlikely ());
3096 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, gt,
3097 gen_rtx_LABEL_REF (VOIDmode, l1), pc_rtx);
3098 jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
3099 add_reg_br_prob_note (jmp, profile_probability::even ());
3100 emit_move_insn (dest, constm1_rtx);
3101 emit_jump (lend);
3102 emit_label (l0);
3103 emit_move_insn (dest, const0_rtx);
3104 emit_jump (lend);
3105 emit_label (l1);
3106 emit_move_insn (dest, const1_rtx);
3107 emit_jump (lend);
3108 if (l2)
3109 {
3110 emit_label (l2);
3111 emit_move_insn (dest, const2_rtx);
3112 }
3113 emit_label (lend);
3114 }
3115
3116 /* Expand a comparison setting or clearing the carry flag. Return true
3117 when successful, and set *POP to the comparison operation. */
3118 static bool
3119 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
3120 {
3121 machine_mode mode
3122 = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
3123
3124 /* Do not handle double-mode compares that go through the special path. */
3125 if (mode == (TARGET_64BIT ? TImode : DImode))
3126 return false;
3127
3128 if (SCALAR_FLOAT_MODE_P (mode))
3129 {
3130 rtx compare_op;
3131 rtx_insn *compare_seq;
3132
3133 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
3134
3135 /* Shortcut: the following common codes never translate
3136 into carry flag compares. */
3137 if (code == EQ || code == NE || code == UNEQ || code == LTGT
3138 || code == ORDERED || code == UNORDERED)
3139 return false;
3140
3141 /* These comparisons require the zero flag; swap operands so they don't. */
3142 if ((code == GT || code == UNLE || code == LE || code == UNGT)
3143 && !TARGET_IEEE_FP)
3144 {
3145 std::swap (op0, op1);
3146 code = swap_condition (code);
3147 }
3148
3149 /* Try to expand the comparison and verify that we end up with
3150 a carry flag based comparison. This fails to be true only when
3151 we decide to expand the comparison using arithmetic, which is
3152 not a common scenario. */
3153 start_sequence ();
3154 compare_op = ix86_expand_fp_compare (code, op0, op1);
3155 compare_seq = get_insns ();
3156 end_sequence ();
3157
3158 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
3159 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
3160 else
3161 code = GET_CODE (compare_op);
3162
3163 if (code != LTU && code != GEU)
3164 return false;
3165
3166 emit_insn (compare_seq);
3167 *pop = compare_op;
3168 return true;
3169 }
3170
3171 if (!INTEGRAL_MODE_P (mode))
3172 return false;
3173
3174 switch (code)
3175 {
3176 case LTU:
3177 case GEU:
3178 break;
3179
3180 /* Convert a==0 into (unsigned)a<1. */
3181 case EQ:
3182 case NE:
3183 if (op1 != const0_rtx)
3184 return false;
3185 op1 = const1_rtx;
3186 code = (code == EQ ? LTU : GEU);
3187 break;
3188
3189 /* Convert a>b into b<a or a>=b+1. */
3190 case GTU:
3191 case LEU:
3192 if (CONST_INT_P (op1))
3193 {
3194 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
3195 /* Bail out on overflow. We can still swap operands, but that
3196 would force loading the constant into a register. */
3197 if (op1 == const0_rtx
3198 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
3199 return false;
3200 code = (code == GTU ? GEU : LTU);
3201 }
3202 else
3203 {
3204 std::swap (op0, op1);
3205 code = (code == GTU ? LTU : GEU);
3206 }
3207 break;
3208
3209 /* Convert a>=0 into (unsigned)a<0x80000000. */
3210 case LT:
3211 case GE:
3212 if (mode == DImode || op1 != const0_rtx)
3213 return false;
3214 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
3215 code = (code == LT ? GEU : LTU);
3216 break;
3217 case LE:
3218 case GT:
3219 if (mode == DImode || op1 != constm1_rtx)
3220 return false;
3221 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
3222 code = (code == LE ? GEU : LTU);
3223 break;
3224
3225 default:
3226 return false;
3227 }
3228 /* Swapping operands may cause a constant to appear as the first operand. */
3229 if (!nonimmediate_operand (op0, VOIDmode))
3230 {
3231 if (!can_create_pseudo_p ())
3232 return false;
3233 op0 = force_reg (mode, op0);
3234 }
3235 *pop = ix86_expand_compare (code, op0, op1);
3236 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
3237 return true;
3238 }
3239
3240 /* Expand conditional increment or decrement using adc/sbb instructions.
3241 The default case using setcc followed by the conditional move can be
3242 done by generic code. */
3243 bool
3244 ix86_expand_int_addcc (rtx operands[])
3245 {
3246 enum rtx_code code = GET_CODE (operands[1]);
3247 rtx flags;
3248 rtx (*insn) (machine_mode, rtx, rtx, rtx, rtx, rtx);
3249 rtx compare_op;
3250 rtx val = const0_rtx;
3251 bool fpcmp = false;
3252 machine_mode mode;
3253 rtx op0 = XEXP (operands[1], 0);
3254 rtx op1 = XEXP (operands[1], 1);
3255
3256 if (operands[3] != const1_rtx
3257 && operands[3] != constm1_rtx)
3258 return false;
3259 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
3260 return false;
3261 code = GET_CODE (compare_op);
3262
3263 flags = XEXP (compare_op, 0);
3264
3265 if (GET_MODE (flags) == CCFPmode)
3266 {
3267 fpcmp = true;
3268 code = ix86_fp_compare_code_to_integer (code);
3269 }
3270
3271 if (code != LTU)
3272 {
3273 val = constm1_rtx;
3274 if (fpcmp)
3275 PUT_CODE (compare_op,
3276 reverse_condition_maybe_unordered
3277 (GET_CODE (compare_op)));
3278 else
3279 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
3280 }
3281
3282 mode = GET_MODE (operands[0]);
3283
3284 /* Construct either adc or sbb insn. */
3285 if ((code == LTU) == (operands[3] == constm1_rtx))
3286 insn = gen_sub3_carry;
3287 else
3288 insn = gen_add3_carry;
3289
3290 emit_insn (insn (mode, operands[0], operands[2], val, flags, compare_op));
3291
3292 return true;
3293 }
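
/* For example, "x = (a < b) ? x + 1 : x" with an unsigned compare becomes

     cmpl  b, a      // CF = (a < b)
     adcl  $0, x     // x += CF

   and the decrement form uses "sbbl $0, x", with the condition reversed
   above when necessary.  (Illustrative sketch.)  */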
3294
3295 bool
3296 ix86_expand_int_movcc (rtx operands[])
3297 {
3298 enum rtx_code code = GET_CODE (operands[1]), compare_code;
3299 rtx_insn *compare_seq;
3300 rtx compare_op;
3301 machine_mode mode = GET_MODE (operands[0]);
3302 bool sign_bit_compare_p = false;
3303 bool negate_cc_compare_p = false;
3304 rtx op0 = XEXP (operands[1], 0);
3305 rtx op1 = XEXP (operands[1], 1);
3306 rtx op2 = operands[2];
3307 rtx op3 = operands[3];
3308
3309 if (GET_MODE (op0) == TImode
3310 || (GET_MODE (op0) == DImode
3311 && !TARGET_64BIT))
3312 return false;
3313
3314 if (GET_MODE (op0) == BFmode
3315 && !ix86_fp_comparison_operator (operands[1], VOIDmode))
3316 return false;
3317
3318 start_sequence ();
3319 compare_op = ix86_expand_compare (code, op0, op1);
3320 compare_seq = get_insns ();
3321 end_sequence ();
3322
3323 compare_code = GET_CODE (compare_op);
3324
3325 if ((op1 == const0_rtx && (code == GE || code == LT))
3326 || (op1 == constm1_rtx && (code == GT || code == LE)))
3327 sign_bit_compare_p = true;
3328
3329 /* op0 == op1 ? op0 : op3 is equivalent to op0 == op1 ? op1 : op3,
3330 but if op1 is a constant, the latter form allows more optimizations,
3331 either through the handling of two constant operands, or through the
3332 one-constant, one-variable cases. On the other hand, for cmov the
3333 former might be better as we don't need to load the constant into
3334 another register. */
3335 if (code == EQ && CONST_INT_P (op1) && rtx_equal_p (op0, op2))
3336 op2 = op1;
3337 /* Similarly for op0 != op1 ? op2 : op0 and op0 != op1 ? op2 : op1. */
3338 else if (code == NE && CONST_INT_P (op1) && rtx_equal_p (op0, op3))
3339 op3 = op1;
3340
3341 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
3342 HImode insns, we'd be swallowed in word prefix ops. */
3343
3344 if ((mode != HImode || TARGET_FAST_PREFIX)
3345 && (mode != (TARGET_64BIT ? TImode : DImode))
3346 && CONST_INT_P (op2)
3347 && CONST_INT_P (op3))
3348 {
3349 rtx out = operands[0];
3350 HOST_WIDE_INT ct = INTVAL (op2);
3351 HOST_WIDE_INT cf = INTVAL (op3);
3352 HOST_WIDE_INT diff;
3353
3354 if ((mode == SImode
3355 || (TARGET_64BIT && mode == DImode))
3356 && (GET_MODE (op0) == SImode
3357 || (TARGET_64BIT && GET_MODE (op0) == DImode)))
3358 {
3359 /* Special case x != 0 ? -1 : y. */
3360 if (code == NE && op1 == const0_rtx && ct == -1)
3361 {
3362 negate_cc_compare_p = true;
3363 std::swap (ct, cf);
3364 code = EQ;
3365 }
3366 else if (code == EQ && op1 == const0_rtx && cf == -1)
3367 negate_cc_compare_p = true;
3368 }
3369
3370 diff = ct - cf;
3371 /* Sign bit compares are better done using shifts than by using
3372 sbb. */
3373 if (sign_bit_compare_p
3374 || negate_cc_compare_p
3375 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
3376 {
3377 /* Detect overlap between destination and compare sources. */
3378 rtx tmp = out;
3379
3380 if (negate_cc_compare_p)
3381 {
3382 if (GET_MODE (op0) == DImode)
3383 emit_insn (gen_x86_negdi_ccc (gen_reg_rtx (DImode), op0));
3384 else
3385 emit_insn (gen_x86_negsi_ccc (gen_reg_rtx (SImode),
3386 gen_lowpart (SImode, op0)));
3387
3388 tmp = gen_reg_rtx (mode);
3389 if (mode == DImode)
3390 emit_insn (gen_x86_movdicc_0_m1_neg (tmp));
3391 else
3392 emit_insn (gen_x86_movsicc_0_m1_neg (gen_lowpart (SImode,
3393 tmp)));
3394 }
3395 else if (!sign_bit_compare_p)
3396 {
3397 rtx flags;
3398 bool fpcmp = false;
3399
3400 compare_code = GET_CODE (compare_op);
3401
3402 flags = XEXP (compare_op, 0);
3403
3404 if (GET_MODE (flags) == CCFPmode)
3405 {
3406 fpcmp = true;
3407 compare_code
3408 = ix86_fp_compare_code_to_integer (compare_code);
3409 }
3410
3411 /* To simplify rest of code, restrict to the GEU case. */
3412 if (compare_code == LTU)
3413 {
3414 std::swap (ct, cf);
3415 compare_code = reverse_condition (compare_code);
3416 code = reverse_condition (code);
3417 }
3418 else
3419 {
3420 if (fpcmp)
3421 PUT_CODE (compare_op,
3422 reverse_condition_maybe_unordered
3423 (GET_CODE (compare_op)));
3424 else
3425 PUT_CODE (compare_op,
3426 reverse_condition (GET_CODE (compare_op)));
3427 }
3428 diff = ct - cf;
3429
3430 if (reg_overlap_mentioned_p (out, compare_op))
3431 tmp = gen_reg_rtx (mode);
3432
3433 if (mode == DImode)
3434 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
3435 else
3436 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
3437 flags, compare_op));
3438 }
3439 else
3440 {
3441 if (code == GT || code == GE)
3442 code = reverse_condition (code);
3443 else
3444 {
3445 std::swap (ct, cf);
3446 diff = ct - cf;
3447 }
3448 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
3449 }
3450
3451 if (diff == 1)
3452 {
3453 /*
3454 * cmpl op0,op1
3455 * sbbl dest,dest
3456 * [addl dest, ct]
3457 *
3458 * Size 5 - 8.
3459 */
3460 if (ct)
3461 tmp = expand_simple_binop (mode, PLUS,
3462 tmp, GEN_INT (ct),
3463 copy_rtx (tmp), 1, OPTAB_DIRECT);
3464 }
3465 else if (cf == -1)
3466 {
3467 /*
3468 * cmpl op0,op1
3469 * sbbl dest,dest
3470 * orl $ct, dest
3471 *
3472 * Size 8.
3473 */
3474 tmp = expand_simple_binop (mode, IOR,
3475 tmp, GEN_INT (ct),
3476 copy_rtx (tmp), 1, OPTAB_DIRECT);
3477 }
3478 else if (diff == -1 && ct)
3479 {
3480 /*
3481 * cmpl op0,op1
3482 * sbbl dest,dest
3483 * notl dest
3484 * [addl dest, cf]
3485 *
3486 * Size 8 - 11.
3487 */
3488 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3489 if (cf)
3490 tmp = expand_simple_binop (mode, PLUS,
3491 copy_rtx (tmp), GEN_INT (cf),
3492 copy_rtx (tmp), 1, OPTAB_DIRECT);
3493 }
3494 else
3495 {
3496 /*
3497 * cmpl op0,op1
3498 * sbbl dest,dest
3499 * [notl dest]
3500 * andl cf - ct, dest
3501 * [addl dest, ct]
3502 *
3503 * Size 8 - 11.
3504 */
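/* I.e., with mask = 0 or -1 produced by the sbb (and optionally
   complemented by the notl):

     dest = (mask & (cf - ct)) + ct;   // -1 selects cf, 0 selects ct

   which is the branch-free select the size estimates above refer to.  */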
3505
3506 if (cf == 0)
3507 {
3508 cf = ct;
3509 ct = 0;
3510 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3511 }
3512
3513 tmp = expand_simple_binop (mode, AND,
3514 copy_rtx (tmp),
3515 gen_int_mode (cf - ct, mode),
3516 copy_rtx (tmp), 1, OPTAB_DIRECT);
3517 if (ct)
3518 tmp = expand_simple_binop (mode, PLUS,
3519 copy_rtx (tmp), GEN_INT (ct),
3520 copy_rtx (tmp), 1, OPTAB_DIRECT);
3521 }
3522
3523 if (!rtx_equal_p (tmp, out))
3524 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
3525
3526 return true;
3527 }
3528
3529 if (diff < 0)
3530 {
3531 machine_mode cmp_mode = GET_MODE (op0);
3532 enum rtx_code new_code;
3533
3534 if (SCALAR_FLOAT_MODE_P (cmp_mode))
3535 {
3536 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3537
3538 /* We may be reversing a non-trapping
3539 comparison to a trapping comparison. */
3540 if (HONOR_NANS (cmp_mode) && flag_trapping_math
3541 && code != EQ && code != NE
3542 && code != ORDERED && code != UNORDERED)
3543 new_code = UNKNOWN;
3544 else
3545 new_code = reverse_condition_maybe_unordered (code);
3546 }
3547 else
3548 new_code = ix86_reverse_condition (code, cmp_mode);
3549 if (new_code != UNKNOWN)
3550 {
3551 std::swap (ct, cf);
3552 diff = -diff;
3553 code = new_code;
3554 }
3555 }
3556
3557 compare_code = UNKNOWN;
3558 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
3559 && CONST_INT_P (op1))
3560 {
3561 if (op1 == const0_rtx
3562 && (code == LT || code == GE))
3563 compare_code = code;
3564 else if (op1 == constm1_rtx)
3565 {
3566 if (code == LE)
3567 compare_code = LT;
3568 else if (code == GT)
3569 compare_code = GE;
3570 }
3571 }
3572
3573 /* Optimize dest = (op0 < 0) ? -1 : cf. */
3574 if (compare_code != UNKNOWN
3575 && GET_MODE (op0) == GET_MODE (out)
3576 && (cf == -1 || ct == -1))
3577 {
3578 /* If lea code below could be used, only optimize
3579 if it results in a 2 insn sequence. */
3580
3581 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
3582 || diff == 3 || diff == 5 || diff == 9)
3583 || (compare_code == LT && ct == -1)
3584 || (compare_code == GE && cf == -1))
3585 {
3586 /*
3587 * notl op1 (if necessary)
3588 * sarl $31, op1
3589 * orl cf, op1
3590 */
3591 if (ct != -1)
3592 {
3593 cf = ct;
3594 ct = -1;
3595 code = reverse_condition (code);
3596 }
3597
3598 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3599
3600 out = expand_simple_binop (mode, IOR,
3601 out, GEN_INT (cf),
3602 out, 1, OPTAB_DIRECT);
3603 if (out != operands[0])
3604 emit_move_insn (operands[0], out);
3605
3606 return true;
3607 }
3608 }
3609
3610
3611 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
3612 || diff == 3 || diff == 5 || diff == 9)
3613 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
3614 && (mode != DImode
3615 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
3616 {
3617 /*
3618 * xorl dest,dest
3619 * cmpl op1,op2
3620 * setcc dest
3621 * lea cf(dest*(ct-cf)),dest
3622 *
3623 * Size 14.
3624 *
3625 * This also catches the degenerate setcc-only case.
3626 */
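/* Equivalently, with t = 0 or 1 from the setcc:

     dest = cf + t * (ct - cf);

   and for ct - cf in {1, 2, 4, 8, 3, 5, 9} the multiply-add folds into a
   single lea, e.g. ct = 7, cf = 3 gives "leal 3(,%eax,4), %eax".  */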
3627
3628 rtx tmp;
3629 int nops;
3630
3631 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3632
3633 nops = 0;
3634 /* On x86_64 the lea instruction operates on Pmode, so we need
3635 to get the arithmetic done in the proper mode to match. */
3636 if (diff == 1)
3637 tmp = copy_rtx (out);
3638 else
3639 {
3640 rtx out1;
3641 out1 = copy_rtx (out);
3642 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
3643 nops++;
3644 if (diff & 1)
3645 {
3646 tmp = gen_rtx_PLUS (mode, tmp, out1);
3647 nops++;
3648 }
3649 }
3650 if (cf != 0)
3651 {
3652 tmp = plus_constant (mode, tmp, cf);
3653 nops++;
3654 }
3655 if (!rtx_equal_p (tmp, out))
3656 {
3657 if (nops == 1)
3658 out = force_operand (tmp, copy_rtx (out));
3659 else
3660 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
3661 }
3662 if (!rtx_equal_p (out, operands[0]))
3663 emit_move_insn (operands[0], copy_rtx (out));
3664
3665 return true;
3666 }
3667
3668 /*
3669 * General case: Jumpful:
3670 * xorl dest,dest cmpl op1, op2
3671 * cmpl op1, op2 movl ct, dest
3672 * setcc dest jcc 1f
3673 * decl dest movl cf, dest
3674 * andl (cf-ct),dest 1:
3675 * addl ct,dest
3676 *
3677 * Size 20. Size 14.
3678 *
3679 * This is reasonably steep, but branch mispredict costs are
3680 * high on modern cpus, so consider failing only if optimizing
3681 * for space.
3682 */
3683
3684 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3685 && BRANCH_COST (optimize_insn_for_speed_p (),
3686 false) >= 2)
3687 {
3688 if (cf == 0)
3689 {
3690 machine_mode cmp_mode = GET_MODE (op0);
3691 enum rtx_code new_code;
3692
3693 if (SCALAR_FLOAT_MODE_P (cmp_mode))
3694 {
3695 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3696
3697 /* We may be reversing a non-trapping
3698 comparison to a trapping comparison. */
3699 if (HONOR_NANS (cmp_mode) && flag_trapping_math
3700 && code != EQ && code != NE
3701 && code != ORDERED && code != UNORDERED)
3702 new_code = UNKNOWN;
3703 else
3704 new_code = reverse_condition_maybe_unordered (code);
3705
3706 }
3707 else
3708 {
3709 new_code = ix86_reverse_condition (code, cmp_mode);
3710 if (compare_code != UNKNOWN && new_code != UNKNOWN)
3711 compare_code = reverse_condition (compare_code);
3712 }
3713
3714 if (new_code != UNKNOWN)
3715 {
3716 cf = ct;
3717 ct = 0;
3718 code = new_code;
3719 }
3720 }
3721
3722 if (compare_code != UNKNOWN)
3723 {
3724 /* notl op1 (if needed)
3725 sarl $31, op1
3726 andl (cf-ct), op1
3727 addl ct, op1
3728
3729 For x < 0 (resp. x <= -1) there will be no notl,
3730 so if possible swap the constants to get rid of the
3731 complement.
3732 True/false will be -1/0 while code below (store flag
3733 followed by decrement) is 0/-1, so the constants need
3734 to be exchanged once more. */
3735
3736 if (compare_code == GE || !cf)
3737 {
3738 code = reverse_condition (code);
3739 compare_code = LT;
3740 }
3741 else
3742 std::swap (ct, cf);
3743
3744 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3745 }
3746 else
3747 {
3748 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3749
3750 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
3751 constm1_rtx,
3752 copy_rtx (out), 1, OPTAB_DIRECT);
3753 }
3754
3755 out = expand_simple_binop (mode, AND, copy_rtx (out),
3756 gen_int_mode (cf - ct, mode),
3757 copy_rtx (out), 1, OPTAB_DIRECT);
3758 if (ct)
3759 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
3760 copy_rtx (out), 1, OPTAB_DIRECT);
3761 if (!rtx_equal_p (out, operands[0]))
3762 emit_move_insn (operands[0], copy_rtx (out));
3763
3764 return true;
3765 }
3766 }
3767
3768 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3769 {
3770 /* Try a few things more with specific constants and a variable. */
3771
3772 optab op;
3773 rtx var, orig_out, out, tmp;
3774
3775 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
3776 return false;
3777
3778 operands[2] = op2;
3779 operands[3] = op3;
3780
3781 /* If one of the two operands is an interesting constant, load a
3782 constant with the above and mask it in with a logical operation. */
3783
3784 if (CONST_INT_P (operands[2]))
3785 {
3786 var = operands[3];
3787 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
3788 operands[3] = constm1_rtx, op = and_optab;
3789 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
3790 operands[3] = const0_rtx, op = ior_optab;
3791 else
3792 return false;
3793 }
3794 else if (CONST_INT_P (operands[3]))
3795 {
3796 var = operands[2];
3797 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
3798 {
3799 /* For smin (x, 0), expand as "x < 0 ? x : 0" instead of
3800 "x <= 0 ? x : 0" to enable sign_bit_compare_p. */
3801 if (code == LE && op1 == const0_rtx && rtx_equal_p (op0, var))
3802 operands[1] = simplify_gen_relational (LT, VOIDmode,
3803 GET_MODE (op0),
3804 op0, const0_rtx);
3805
3806 operands[2] = constm1_rtx;
3807 op = and_optab;
3808 }
3809 else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
3810 operands[2] = const0_rtx, op = ior_optab;
3811 else
3812 return false;
3813 }
3814 else
3815 return false;
3816
3817 orig_out = operands[0];
3818 tmp = gen_reg_rtx (mode);
3819 operands[0] = tmp;
3820
3821 /* Recurse to get the constant loaded. */
3822 if (!ix86_expand_int_movcc (operands))
3823 return false;
3824
3825 /* Mask in the interesting variable. */
3826 out = expand_binop (mode, op, var, tmp, orig_out, 0,
3827 OPTAB_WIDEN);
3828 if (!rtx_equal_p (out, orig_out))
3829 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
3830
3831 return true;
3832 }
3833
3834 /*
3835 * For comparison with above,
3836 *
3837 * movl cf,dest
3838 * movl ct,tmp
3839 * cmpl op1,op2
3840 * cmovcc tmp,dest
3841 *
3842 * Size 15.
3843 */
3844
3845 if (! nonimmediate_operand (operands[2], mode))
3846 operands[2] = force_reg (mode, operands[2]);
3847 if (! nonimmediate_operand (operands[3], mode))
3848 operands[3] = force_reg (mode, operands[3]);
3849
3850 if (! register_operand (operands[2], VOIDmode)
3851 && (mode == QImode
3852 || ! register_operand (operands[3], VOIDmode)))
3853 operands[2] = force_reg (mode, operands[2]);
3854
3855 if (mode == QImode
3856 && ! register_operand (operands[3], VOIDmode))
3857 operands[3] = force_reg (mode, operands[3]);
3858
3859 emit_insn (compare_seq);
3860 emit_insn (gen_rtx_SET (operands[0],
3861 gen_rtx_IF_THEN_ELSE (mode,
3862 compare_op, operands[2],
3863 operands[3])));
3864 return true;
3865 }
3866
3867 /* Detect conditional moves that exactly match min/max operational
3868 semantics. Note that this is IEEE safe, as long as we don't
3869 interchange the operands.
3870
3871 Returns FALSE if this conditional move doesn't match a MIN/MAX,
3872 and TRUE if the operation is successful and instructions are emitted. */
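/* Roughly: "x = a < b ? a : b" maps onto minss/minsd (and the UNGE form onto
   max after swapping the arms) because those instructions return the second
   source operand when the inputs are unordered or both zero, which is also
   why the operand order is preserved via the IEEE unspecs below unless
   -ffinite-math-only and -fno-signed-zeros are in effect.  (Informal note.)  */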
3873
3874 static bool
3875 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
3876 rtx cmp_op1, rtx if_true, rtx if_false)
3877 {
3878 machine_mode mode;
3879 bool is_min;
3880 rtx tmp;
3881
3882 if (code == LT)
3883 ;
3884 else if (code == UNGE)
3885 std::swap (if_true, if_false);
3886 else
3887 return false;
3888
3889 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
3890 is_min = true;
3891 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
3892 is_min = false;
3893 else
3894 return false;
3895
3896 mode = GET_MODE (dest);
3897
3898 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
3899 but MODE may be a vector mode and thus not appropriate. */
3900 if (!flag_finite_math_only || flag_signed_zeros)
3901 {
3902 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
3903 rtvec v;
3904
3905 if_true = force_reg (mode, if_true);
3906 v = gen_rtvec (2, if_true, if_false);
3907 tmp = gen_rtx_UNSPEC (mode, v, u);
3908 }
3909 else
3910 {
3911 code = is_min ? SMIN : SMAX;
3912 if (MEM_P (if_true) && MEM_P (if_false))
3913 if_true = force_reg (mode, if_true);
3914 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
3915 }
3916
3917 emit_insn (gen_rtx_SET (dest, tmp));
3918 return true;
3919 }
3920
3921 /* Return true if MODE is valid for a vector compare to a mask register;
3922 the same holds for a conditional vector move with a mask register. */
3923 static bool
3924 ix86_valid_mask_cmp_mode (machine_mode mode)
3925 {
3926 /* XOP has its own vector conditional movement. */
3927 if (TARGET_XOP && !TARGET_AVX512F)
3928 return false;
3929
3930 /* HFmode only supports vcmpsh whose dest is mask register. */
3931 if (TARGET_AVX512FP16 && mode == HFmode)
3932 return true;
3933
3934 /* AVX512F is needed for mask operation. */
3935 if (!(TARGET_AVX512F && VECTOR_MODE_P (mode)))
3936 return false;
3937
3938 /* AVX512BW is needed for vector QI/HImode,
3939 AVX512VL is needed for 128/256-bit vector. */
3940 machine_mode inner_mode = GET_MODE_INNER (mode);
3941 int vector_size = GET_MODE_SIZE (mode);
3942 if ((inner_mode == QImode || inner_mode == HImode) && !TARGET_AVX512BW)
3943 return false;
3944
3945 return vector_size == 64 || TARGET_AVX512VL;
3946 }
3947
3948 /* Return true if integer mask comparison should be used. */
3949 static bool
3950 ix86_use_mask_cmp_p (machine_mode mode, machine_mode cmp_mode,
3951 rtx op_true, rtx op_false)
3952 {
3953 int vector_size = GET_MODE_SIZE (mode);
3954
3955 if (cmp_mode == HFmode)
3956 return true;
3957 else if (vector_size < 16)
3958 return false;
3959 else if (vector_size == 64)
3960 return true;
3961 else if (GET_MODE_INNER (cmp_mode) == HFmode)
3962 return true;
3963
3964 /* When op_true is NULL, op_false must be NULL, or vice versa. */
3965 gcc_assert (!op_true == !op_false);
3966
3967 /* When op_true/op_false is NULL or cmp_mode is not a valid mask cmp mode,
3968 a vector dest is required. */
3969 if (!op_true || !ix86_valid_mask_cmp_mode (cmp_mode))
3970 return false;
3971
3972 /* Exclude those that could be optimized in ix86_expand_sse_movcc. */
3973 if (op_false == CONST0_RTX (mode)
3974 || op_true == CONST0_RTX (mode)
3975 || (INTEGRAL_MODE_P (mode)
3976 && (op_true == CONSTM1_RTX (mode)
3977 || op_false == CONSTM1_RTX (mode))))
3978 return false;
3979
3980 return true;
3981 }
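/* Illustrative aside (not part of GCC): a hedged intrinsics sketch of the
   conditions checked above.  512-bit integer compares always produce a
   mask register, while the 128/256-bit forms additionally require
   AVX512VL, matching ix86_valid_mask_cmp_mode.

   #include <immintrin.h>

   // AVX512F: a 512-bit compare writes a k-register mask, one bit per element.
   __mmask16 cmp512 (__m512i a, __m512i b)
   {
     return _mm512_cmpgt_epi32_mask (a, b);
   }

   // AVX512F + AVX512VL: the 256-bit variant also targets a mask register.
   __mmask8 cmp256 (__m256i a, __m256i b)
   {
     return _mm256_cmpgt_epi32_mask (a, b);
   }
*/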
3982
3983 /* Expand an SSE comparison. Return the register with the result. */
3984
3985 static rtx
3986 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
3987 rtx op_true, rtx op_false)
3988 {
3989 machine_mode mode = GET_MODE (dest);
3990 machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
3991
3992 /* In the general case the result of the comparison can differ from the operands' type. */
3993 machine_mode cmp_mode;
3994
3995 /* In AVX512F the result of comparison is an integer mask. */
3996 bool maskcmp = false;
3997 rtx x;
3998
3999 if (ix86_use_mask_cmp_p (mode, cmp_ops_mode, op_true, op_false))
4000 {
4001 unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
4002 maskcmp = true;
4003 cmp_mode = nbits > 8 ? int_mode_for_size (nbits, 0).require () : E_QImode;
4004 }
4005 else
4006 cmp_mode = cmp_ops_mode;
4007
4008 cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
4009
4010 bool (*op1_predicate)(rtx, machine_mode)
4011 = VECTOR_MODE_P (cmp_ops_mode) ? vector_operand : nonimmediate_operand;
4012
4013 if (!op1_predicate (cmp_op1, cmp_ops_mode))
4014 cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
4015
4016 if (optimize
4017 || (maskcmp && cmp_mode != mode)
4018 || (op_true && reg_overlap_mentioned_p (dest, op_true))
4019 || (op_false && reg_overlap_mentioned_p (dest, op_false)))
4020 dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
4021
4022 if (maskcmp)
4023 {
4024 bool ok = ix86_expand_mask_vec_cmp (dest, code, cmp_op0, cmp_op1);
4025 gcc_assert (ok);
4026 return dest;
4027 }
4028
4029 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
4030
4031 if (cmp_mode != mode)
4032 {
4033 x = force_reg (cmp_ops_mode, x);
4034 convert_move (dest, x, false);
4035 }
4036 else
4037 emit_insn (gen_rtx_SET (dest, x));
4038
4039 return dest;
4040 }
4041
4042 /* Emit x86 binary operation CODE in mode MODE for SSE vector
4043 instructions that can be performed using GP registers. */
4044
4045 static void
4046 ix86_emit_vec_binop (enum rtx_code code, machine_mode mode,
4047 rtx dst, rtx src1, rtx src2)
4048 {
4049 rtx tmp;
4050
4051 tmp = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
4052
4053 if (GET_MODE_SIZE (mode) <= GET_MODE_SIZE (SImode)
4054 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
4055 {
4056 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
4057 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
4058 }
4059
4060 emit_insn (tmp);
4061 }
4062
4063 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
4064 operations. This is used for both scalar and vector conditional moves. */
4065
4066 void
4067 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
4068 {
4069 machine_mode mode = GET_MODE (dest);
4070 machine_mode cmpmode = GET_MODE (cmp);
4071 rtx x;
4072
4073 /* Simplify trivial VEC_COND_EXPR to avoid ICE in pr97506. */
4074 if (rtx_equal_p (op_true, op_false))
4075 {
4076 emit_move_insn (dest, op_true);
4077 return;
4078 }
4079
4080 /* If we have an integer mask and an FP value then we need
4081 to cast the mask to FP mode. */
4082 if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
4083 {
4084 cmp = force_reg (cmpmode, cmp);
4085 cmp = gen_rtx_SUBREG (mode, cmp, 0);
4086 }
4087
4088 /* In AVX512F the result of comparison is an integer mask. */
4089 if (mode != cmpmode
4090 && GET_MODE_CLASS (cmpmode) == MODE_INT)
4091 {
4092 gcc_assert (ix86_valid_mask_cmp_mode (mode));
4093 /* Using scalar/vector move with mask register. */
4094 cmp = force_reg (cmpmode, cmp);
4095 /* Optimize for mask zero. */
4096 op_true = (op_true != CONST0_RTX (mode)
4097 ? force_reg (mode, op_true) : op_true);
4098 op_false = (op_false != CONST0_RTX (mode)
4099 ? force_reg (mode, op_false) : op_false);
4100 if (op_true == CONST0_RTX (mode))
4101 {
4102 if (cmpmode == E_DImode && !TARGET_64BIT)
4103 {
4104 x = gen_reg_rtx (cmpmode);
4105 emit_insn (gen_knotdi (x, cmp));
4106 }
4107 else
4108 x = expand_simple_unop (cmpmode, NOT, cmp, NULL, 1);
4109 cmp = x;
4110 /* Swap op_true and op_false. */
4111 std::swap (op_true, op_false);
4112 }
4113
4114 if (mode == HFmode)
4115 emit_insn (gen_movhf_mask (dest, op_true, op_false, cmp));
4116 else
4117 emit_insn (gen_rtx_SET (dest,
4118 gen_rtx_VEC_MERGE (mode,
4119 op_true, op_false, cmp)));
4120 return;
4121 }
4122
4123 if (vector_all_ones_operand (op_true, mode)
4124 && op_false == CONST0_RTX (mode))
4125 {
4126 emit_move_insn (dest, cmp);
4127 return;
4128 }
4129 else if (op_false == CONST0_RTX (mode))
4130 {
4131 x = expand_simple_binop (mode, AND, cmp, op_true,
4132 dest, 1, OPTAB_DIRECT);
4133 if (x != dest)
4134 emit_move_insn (dest, x);
4135 return;
4136 }
4137 else if (op_true == CONST0_RTX (mode))
4138 {
4139 op_false = force_reg (mode, op_false);
4140 x = gen_rtx_NOT (mode, cmp);
4141 ix86_emit_vec_binop (AND, mode, dest, x, op_false);
4142 return;
4143 }
4144 else if (vector_all_ones_operand (op_true, mode))
4145 {
4146 x = expand_simple_binop (mode, IOR, cmp, op_false,
4147 dest, 1, OPTAB_DIRECT);
4148 if (x != dest)
4149 emit_move_insn (dest, x);
4150 return;
4151 }
4152
4153 if (TARGET_XOP)
4154 {
4155 op_true = force_reg (mode, op_true);
4156
4157 if (GET_MODE_SIZE (mode) < 16
4158 || !nonimmediate_operand (op_false, mode))
4159 op_false = force_reg (mode, op_false);
4160
4161 emit_insn (gen_rtx_SET (dest,
4162 gen_rtx_IF_THEN_ELSE (mode, cmp,
4163 op_true, op_false)));
4164 return;
4165 }
4166
4167 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
4168 machine_mode blend_mode = mode;
4169
4170 if (GET_MODE_SIZE (mode) < 16
4171 || !vector_operand (op_true, mode))
4172 op_true = force_reg (mode, op_true);
4173
4174 op_false = force_reg (mode, op_false);
4175
4176 switch (mode)
4177 {
4178 case E_V2SFmode:
4179 if (TARGET_SSE4_1)
4180 gen = gen_mmx_blendvps;
4181 break;
4182 case E_V4SFmode:
4183 if (TARGET_SSE4_1)
4184 gen = gen_sse4_1_blendvps;
4185 break;
4186 case E_V2DFmode:
4187 if (TARGET_SSE4_1)
4188 gen = gen_sse4_1_blendvpd;
4189 break;
4190 case E_SFmode:
4191 if (TARGET_SSE4_1)
4192 gen = gen_sse4_1_blendvss;
4193 break;
4194 case E_DFmode:
4195 if (TARGET_SSE4_1)
4196 gen = gen_sse4_1_blendvsd;
4197 break;
4198 case E_V8QImode:
4199 case E_V4HImode:
4200 case E_V2SImode:
4201 if (TARGET_SSE4_1)
4202 {
4203 gen = gen_mmx_pblendvb_v8qi;
4204 blend_mode = V8QImode;
4205 }
4206 break;
4207 case E_V4QImode:
4208 case E_V2HImode:
4209 if (TARGET_SSE4_1)
4210 {
4211 gen = gen_mmx_pblendvb_v4qi;
4212 blend_mode = V4QImode;
4213 }
4214 break;
4215 case E_V2QImode:
4216 if (TARGET_SSE4_1)
4217 gen = gen_mmx_pblendvb_v2qi;
4218 break;
4219 case E_V16QImode:
4220 case E_V8HImode:
4221 case E_V8HFmode:
4222 case E_V8BFmode:
4223 case E_V4SImode:
4224 case E_V2DImode:
4225 case E_V1TImode:
4226 if (TARGET_SSE4_1)
4227 {
4228 gen = gen_sse4_1_pblendvb;
4229 blend_mode = V16QImode;
4230 }
4231 break;
4232 case E_V8SFmode:
4233 if (TARGET_AVX)
4234 gen = gen_avx_blendvps256;
4235 break;
4236 case E_V4DFmode:
4237 if (TARGET_AVX)
4238 gen = gen_avx_blendvpd256;
4239 break;
4240 case E_V32QImode:
4241 case E_V16HImode:
4242 case E_V16HFmode:
4243 case E_V16BFmode:
4244 case E_V8SImode:
4245 case E_V4DImode:
4246 if (TARGET_AVX2)
4247 {
4248 gen = gen_avx2_pblendvb;
4249 blend_mode = V32QImode;
4250 }
4251 break;
4252
4253 case E_V64QImode:
4254 gen = gen_avx512bw_blendmv64qi;
4255 break;
4256 case E_V32HImode:
4257 gen = gen_avx512bw_blendmv32hi;
4258 break;
4259 case E_V32HFmode:
4260 gen = gen_avx512bw_blendmv32hf;
4261 break;
4262 case E_V32BFmode:
4263 gen = gen_avx512bw_blendmv32bf;
4264 break;
4265 case E_V16SImode:
4266 gen = gen_avx512f_blendmv16si;
4267 break;
4268 case E_V8DImode:
4269 gen = gen_avx512f_blendmv8di;
4270 break;
4271 case E_V8DFmode:
4272 gen = gen_avx512f_blendmv8df;
4273 break;
4274 case E_V16SFmode:
4275 gen = gen_avx512f_blendmv16sf;
4276 break;
4277
4278 default:
4279 break;
4280 }
4281
4282 if (gen != NULL)
4283 {
4284 if (blend_mode == mode)
4285 x = dest;
4286 else
4287 {
4288 x = gen_reg_rtx (blend_mode);
4289 op_false = gen_lowpart (blend_mode, op_false);
4290 op_true = gen_lowpart (blend_mode, op_true);
4291 cmp = gen_lowpart (blend_mode, cmp);
4292 }
4293
4294 emit_insn (gen (x, op_false, op_true, cmp));
4295
4296 if (x != dest)
4297 emit_move_insn (dest, gen_lowpart (mode, x));
4298 }
4299 else
4300 {
4301 rtx t2, t3;
4302
4303 t2 = expand_simple_binop (mode, AND, op_true, cmp,
4304 NULL, 1, OPTAB_DIRECT);
4305
4306 t3 = gen_reg_rtx (mode);
4307 x = gen_rtx_NOT (mode, cmp);
4308 ix86_emit_vec_binop (AND, mode, t3, x, op_false);
4309
4310 x = expand_simple_binop (mode, IOR, t3, t2,
4311 dest, 1, OPTAB_DIRECT);
4312 if (x != dest)
4313 emit_move_insn (dest, x);
4314 }
4315 }
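/* Illustrative aside (not part of GCC): the final fallback above is the
   classic SSE bitwise select.  Given a comparison result CMP whose
   elements are all-ones or all-zeros, the blend reduces to three logical
   operations, sketched here on plain integers.

   #include <stdint.h>

   // dest = cmp ? op_true : op_false, one 32-bit lane at a time.
   static uint32_t bitwise_select (uint32_t cmp, uint32_t op_true,
                                   uint32_t op_false)
   {
     return (cmp & op_true) | (~cmp & op_false);   // pand/pandn/por pattern
   }
*/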
4316
4317 /* Swap, force into registers, or otherwise massage the two operands
4318 to an sse comparison with a mask result. Thus we differ a bit from
4319 ix86_prepare_fp_compare_args which expects to produce a flags result.
4320
4321 The DEST operand exists to help determine whether to commute commutative
4322 operators. The POP0/POP1 operands are updated in place. The new
4323 comparison code is returned, or UNKNOWN if not implementable. */
4324
4325 static enum rtx_code
4326 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
4327 rtx *pop0, rtx *pop1)
4328 {
4329 switch (code)
4330 {
4331 case LTGT:
4332 case UNEQ:
4333 /* AVX supports all the needed comparisons. */
4334 if (TARGET_AVX)
4335 break;
4336 /* We have no LTGT as an operator. We could implement it with
4337 NE & ORDERED, but this requires an extra temporary. It's
4338 not clear that it's worth it. */
4339 return UNKNOWN;
4340
4341 case LT:
4342 case LE:
4343 case UNGT:
4344 case UNGE:
4345 /* These are supported directly. */
4346 break;
4347
4348 case EQ:
4349 case NE:
4350 case UNORDERED:
4351 case ORDERED:
4352 /* AVX has 3 operand comparisons, no need to swap anything. */
4353 if (TARGET_AVX)
4354 break;
4355 /* For commutative operators, try to canonicalize the destination
4356 operand to be first in the comparison - this helps reload to
4357 avoid extra moves. */
4358 if (!dest || !rtx_equal_p (dest, *pop1))
4359 break;
4360 /* FALLTHRU */
4361
4362 case GE:
4363 case GT:
4364 case UNLE:
4365 case UNLT:
4366 /* These are not supported directly before AVX, and furthermore
4367 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
4368 comparison operands to transform into something that is
4369 supported. */
4370 std::swap (*pop0, *pop1);
4371 code = swap_condition (code);
4372 break;
4373
4374 default:
4375 gcc_unreachable ();
4376 }
4377
4378 return code;
4379 }
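/* Illustrative aside (not part of GCC): swapping the comparison operands
   pairs each predicate that is unsupported before AVX with a supported one:

     GE   (a, b)  ==  LE   (b, a)
     GT   (a, b)  ==  LT   (b, a)
     UNLE (a, b)  ==  UNGE (b, a)
     UNLT (a, b)  ==  UNGT (b, a)

   which is exactly what the std::swap plus swap_condition step above
   implements.  */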
4380
4381 /* Expand a floating-point conditional move. Return true if successful. */
4382
4383 bool
4384 ix86_expand_fp_movcc (rtx operands[])
4385 {
4386 machine_mode mode = GET_MODE (operands[0]);
4387 enum rtx_code code = GET_CODE (operands[1]);
4388 rtx tmp, compare_op;
4389 rtx op0 = XEXP (operands[1], 0);
4390 rtx op1 = XEXP (operands[1], 1);
4391
4392 if (GET_MODE (op0) == BFmode
4393 && !ix86_fp_comparison_operator (operands[1], VOIDmode))
4394 return false;
4395
4396 if (SSE_FLOAT_MODE_SSEMATH_OR_HF_P (mode))
4397 {
4398 machine_mode cmode;
4399
4400 /* Since we've no cmove for sse registers, don't force bad register
4401 allocation just to gain access to it. Deny movcc when the
4402 comparison mode doesn't match the move mode. */
4403 cmode = GET_MODE (op0);
4404 if (cmode == VOIDmode)
4405 cmode = GET_MODE (op1);
4406 if (cmode != mode)
4407 return false;
4408
4409 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
4410 if (code == UNKNOWN)
4411 return false;
4412
4413 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
4414 operands[2], operands[3]))
4415 return true;
4416
4417 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
4418 operands[2], operands[3]);
4419 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
4420 return true;
4421 }
4422
4423 if (GET_MODE (op0) == TImode
4424 || (GET_MODE (op0) == DImode
4425 && !TARGET_64BIT))
4426 return false;
4427
4428 /* The floating point conditional move instructions don't directly
4429 support conditions resulting from a signed integer comparison. */
4430
4431 compare_op = ix86_expand_compare (code, op0, op1);
4432 if (!fcmov_comparison_operator (compare_op, VOIDmode))
4433 {
4434 tmp = gen_reg_rtx (QImode);
4435 ix86_expand_setcc (tmp, code, op0, op1);
4436
4437 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
4438 }
4439
4440 emit_insn (gen_rtx_SET (operands[0],
4441 gen_rtx_IF_THEN_ELSE (mode, compare_op,
4442 operands[2], operands[3])));
4443
4444 return true;
4445 }
4446
4447 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
4448
4449 static int
4450 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
4451 {
4452 switch (code)
4453 {
4454 case EQ:
4455 return 0;
4456 case LT:
4457 case LTU:
4458 return 1;
4459 case LE:
4460 case LEU:
4461 return 2;
4462 case NE:
4463 return 4;
4464 case GE:
4465 case GEU:
4466 return 5;
4467 case GT:
4468 case GTU:
4469 return 6;
4470 default:
4471 gcc_unreachable ();
4472 }
4473 }
4474
4475 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
4476
4477 static int
4478 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
4479 {
4480 switch (code)
4481 {
4482 case EQ:
4483 return 0x00;
4484 case NE:
4485 return 0x04;
4486 case GT:
4487 return 0x0e;
4488 case LE:
4489 return 0x02;
4490 case GE:
4491 return 0x0d;
4492 case LT:
4493 return 0x01;
4494 case UNLE:
4495 return 0x0a;
4496 case UNLT:
4497 return 0x09;
4498 case UNGE:
4499 return 0x05;
4500 case UNGT:
4501 return 0x06;
4502 case UNEQ:
4503 return 0x18;
4504 case LTGT:
4505 return 0x0c;
4506 case ORDERED:
4507 return 0x07;
4508 case UNORDERED:
4509 return 0x03;
4510 default:
4511 gcc_unreachable ();
4512 }
4513 }
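/* Illustrative aside (not part of GCC): these immediates follow the
   vcmpps/vcmppd predicate encoding that is also exposed as the _CMP_*
   constants in <immintrin.h>, e.g. 0x00 is _CMP_EQ_OQ, 0x01 is _CMP_LT_OS,
   0x04 is _CMP_NEQ_UQ and 0x0d is _CMP_GE_OS.  A hedged sketch with the
   AVX compare intrinsic:

   #include <immintrin.h>

   // Per-lane a < b, ordered and signaling, i.e. the LT entry (0x01) above.
   __m256 cmp_lt (__m256 a, __m256 b)
   {
     return _mm256_cmp_ps (a, b, _CMP_LT_OS);
   }
*/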
4514
4515 /* Return immediate value to be used in UNSPEC_PCMP
4516 for comparison CODE in MODE. */
4517
4518 static int
4519 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
4520 {
4521 if (FLOAT_MODE_P (mode))
4522 return ix86_fp_cmp_code_to_pcmp_immediate (code);
4523 return ix86_int_cmp_code_to_pcmp_immediate (code);
4524 }
4525
4526 /* Expand AVX-512 vector comparison. */
4527
4528 bool
4529 ix86_expand_mask_vec_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1)
4530 {
4531 machine_mode mask_mode = GET_MODE (dest);
4532 machine_mode cmp_mode = GET_MODE (cmp_op0);
4533 rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
4534 int unspec_code;
4535 rtx unspec;
4536
4537 switch (code)
4538 {
4539 case LEU:
4540 case GTU:
4541 case GEU:
4542 case LTU:
4543 unspec_code = UNSPEC_UNSIGNED_PCMP;
4544 break;
4545
4546 default:
4547 unspec_code = UNSPEC_PCMP;
4548 }
4549
4550 unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, cmp_op0, cmp_op1, imm),
4551 unspec_code);
4552 emit_insn (gen_rtx_SET (dest, unspec));
4553
4554 return true;
4555 }
4556
4557 /* Expand fp vector comparison. */
4558
4559 bool
4560 ix86_expand_fp_vec_cmp (rtx operands[])
4561 {
4562 enum rtx_code code = GET_CODE (operands[1]);
4563 rtx cmp;
4564
4565 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
4566 &operands[2], &operands[3]);
4567 if (code == UNKNOWN)
4568 {
4569 rtx temp;
4570 switch (GET_CODE (operands[1]))
4571 {
4572 case LTGT:
4573 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
4574 operands[3], NULL, NULL);
4575 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
4576 operands[3], NULL, NULL);
4577 code = AND;
4578 break;
4579 case UNEQ:
4580 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
4581 operands[3], NULL, NULL);
4582 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
4583 operands[3], NULL, NULL);
4584 code = IOR;
4585 break;
4586 default:
4587 gcc_unreachable ();
4588 }
4589 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
4590 OPTAB_DIRECT);
4591 }
4592 else
4593 cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
4594 NULL, NULL);
4595
4596 if (operands[0] != cmp)
4597 emit_move_insn (operands[0], cmp);
4598
4599 return true;
4600 }
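/* Illustrative aside (not part of GCC): without AVX there is no direct
   LTGT or UNEQ vector predicate, so they are decomposed above into two
   compares combined with AND/IOR.  A scalar sketch of the identities:

   #include <math.h>
   #include <stdbool.h>

   // LTGT: ordered and not equal.
   static bool ltgt (double a, double b)
   {
     return !isnan (a) && !isnan (b) && a != b;   // ORDERED & NE
   }

   // UNEQ: unordered or equal.
   static bool uneq (double a, double b)
   {
     return isnan (a) || isnan (b) || a == b;     // UNORDERED | EQ
   }
*/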
4601
4602 static rtx
4603 ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
4604 rtx op_true, rtx op_false, bool *negate)
4605 {
4606 machine_mode data_mode = GET_MODE (dest);
4607 machine_mode mode = GET_MODE (cop0);
4608 rtx x;
4609
4610 *negate = false;
4611
4612 /* XOP supports all of the comparisons on all 128-bit vector int types. */
4613 if (TARGET_XOP
4614 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT
4615 && GET_MODE_SIZE (mode) <= 16)
4616 ;
4617 /* AVX512F supports all of the comparisons
4618 on all 128/256/512-bit vector int types. */
4619 else if (ix86_use_mask_cmp_p (data_mode, mode, op_true, op_false))
4620 ;
4621 else
4622 {
4623 /* Canonicalize the comparison to EQ, GT, GTU. */
4624 switch (code)
4625 {
4626 case EQ:
4627 case GT:
4628 case GTU:
4629 break;
4630
4631 case LE:
4632 case LEU:
4633 /* x <= cst can be handled as x < cst + 1 unless there is
4634 wrap around in cst + 1. */
4635 if (GET_CODE (cop1) == CONST_VECTOR
4636 && GET_MODE_INNER (mode) != TImode)
4637 {
4638 unsigned int n_elts = GET_MODE_NUNITS (mode), i;
4639 machine_mode eltmode = GET_MODE_INNER (mode);
4640 for (i = 0; i < n_elts; ++i)
4641 {
4642 rtx elt = CONST_VECTOR_ELT (cop1, i);
4643 if (!CONST_INT_P (elt))
4644 break;
4645 if (code == LE)
4646 {
4647 /* For LE punt if some element is signed maximum. */
4648 if ((INTVAL (elt) & (GET_MODE_MASK (eltmode) >> 1))
4649 == (GET_MODE_MASK (eltmode) >> 1))
4650 break;
4651 }
4652 /* For LEU punt if some element is unsigned maximum. */
4653 else if (elt == constm1_rtx)
4654 break;
4655 }
4656 if (i == n_elts)
4657 {
4658 rtvec v = rtvec_alloc (n_elts);
4659 for (i = 0; i < n_elts; ++i)
4660 RTVEC_ELT (v, i)
4661 = gen_int_mode (INTVAL (CONST_VECTOR_ELT (cop1, i)) + 1,
4662 eltmode);
4663 cop1 = gen_rtx_CONST_VECTOR (mode, v);
4664 std::swap (cop0, cop1);
4665 code = code == LE ? GT : GTU;
4666 break;
4667 }
4668 }
4669 /* FALLTHRU */
4670 case NE:
4671 code = reverse_condition (code);
4672 *negate = true;
4673 break;
4674
4675 case GE:
4676 case GEU:
4677 /* x >= cst can be handled as x > cst - 1 unless there is
4678 wrap around in cst - 1. */
4679 if (GET_CODE (cop1) == CONST_VECTOR
4680 && GET_MODE_INNER (mode) != TImode)
4681 {
4682 unsigned int n_elts = GET_MODE_NUNITS (mode), i;
4683 machine_mode eltmode = GET_MODE_INNER (mode);
4684 for (i = 0; i < n_elts; ++i)
4685 {
4686 rtx elt = CONST_VECTOR_ELT (cop1, i);
4687 if (!CONST_INT_P (elt))
4688 break;
4689 if (code == GE)
4690 {
4691 /* For GE punt if some element is signed minimum. */
4692 if (INTVAL (elt) < 0
4693 && ((INTVAL (elt) & (GET_MODE_MASK (eltmode) >> 1))
4694 == 0))
4695 break;
4696 }
4697 /* For GEU punt if some element is zero. */
4698 else if (elt == const0_rtx)
4699 break;
4700 }
4701 if (i == n_elts)
4702 {
4703 rtvec v = rtvec_alloc (n_elts);
4704 for (i = 0; i < n_elts; ++i)
4705 RTVEC_ELT (v, i)
4706 = gen_int_mode (INTVAL (CONST_VECTOR_ELT (cop1, i)) - 1,
4707 eltmode);
4708 cop1 = gen_rtx_CONST_VECTOR (mode, v);
4709 code = code == GE ? GT : GTU;
4710 break;
4711 }
4712 }
4713 code = reverse_condition (code);
4714 *negate = true;
4715 /* FALLTHRU */
4716
4717 case LT:
4718 case LTU:
4719 std::swap (cop0, cop1);
4720 code = swap_condition (code);
4721 break;
4722
4723 default:
4724 gcc_unreachable ();
4725 }
4726
4727 /* Only SSE4.1/SSE4.2 supports V2DImode. */
4728 if (mode == V2DImode)
4729 {
4730 switch (code)
4731 {
4732 case EQ:
4733 /* SSE4.1 supports EQ. */
4734 if (!TARGET_SSE4_1)
4735 return NULL;
4736 break;
4737
4738 case GT:
4739 case GTU:
4740 /* SSE4.2 supports GT/GTU. */
4741 if (!TARGET_SSE4_2)
4742 return NULL;
4743 break;
4744
4745 default:
4746 gcc_unreachable ();
4747 }
4748 }
4749
4750 if (GET_CODE (cop0) == CONST_VECTOR)
4751 cop0 = force_reg (mode, cop0);
4752 else if (GET_CODE (cop1) == CONST_VECTOR)
4753 cop1 = force_reg (mode, cop1);
4754
4755 rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode);
4756 rtx opfalse = op_false ? op_false : CONST0_RTX (data_mode);
4757 if (*negate)
4758 std::swap (optrue, opfalse);
4759
4760 /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when
4761 not using integer masks into min (x, y) == x ? -1 : 0 (i.e.
4762 min (x, y) == x). While we add one instruction (the minimum),
4763 we remove the need for two instructions in the negation, as the
4764 result is done this way.
4765 When using masks, do it for SI/DImode element types, as it is shorter
4766 than the two subtractions. */
4767 if ((code != EQ
4768 && GET_MODE_SIZE (mode) != 64
4769 && vector_all_ones_operand (opfalse, data_mode)
4770 && optrue == CONST0_RTX (data_mode))
4771 || (code == GTU
4772 && GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4
4773 /* Don't do it if not using integer masks and we'd end up with
4774 the right values in the registers though. */
4775 && (GET_MODE_SIZE (mode) == 64
4776 || !vector_all_ones_operand (optrue, data_mode)
4777 || opfalse != CONST0_RTX (data_mode))))
4778 {
4779 rtx (*gen) (rtx, rtx, rtx) = NULL;
4780
4781 switch (mode)
4782 {
4783 case E_V16SImode:
4784 gen = (code == GTU) ? gen_uminv16si3 : gen_sminv16si3;
4785 break;
4786 case E_V8DImode:
4787 gen = (code == GTU) ? gen_uminv8di3 : gen_sminv8di3;
4788 cop0 = force_reg (mode, cop0);
4789 cop1 = force_reg (mode, cop1);
4790 break;
4791 case E_V32QImode:
4792 if (TARGET_AVX2)
4793 gen = (code == GTU) ? gen_uminv32qi3 : gen_sminv32qi3;
4794 break;
4795 case E_V16HImode:
4796 if (TARGET_AVX2)
4797 gen = (code == GTU) ? gen_uminv16hi3 : gen_sminv16hi3;
4798 break;
4799 case E_V8SImode:
4800 if (TARGET_AVX2)
4801 gen = (code == GTU) ? gen_uminv8si3 : gen_sminv8si3;
4802 break;
4803 case E_V4DImode:
4804 if (TARGET_AVX512VL)
4805 {
4806 gen = (code == GTU) ? gen_uminv4di3 : gen_sminv4di3;
4807 cop0 = force_reg (mode, cop0);
4808 cop1 = force_reg (mode, cop1);
4809 }
4810 break;
4811 case E_V16QImode:
4812 if (code == GTU && TARGET_SSE2)
4813 gen = gen_uminv16qi3;
4814 else if (code == GT && TARGET_SSE4_1)
4815 gen = gen_sminv16qi3;
4816 break;
4817 case E_V8QImode:
4818 if (code == GTU && TARGET_SSE2)
4819 gen = gen_uminv8qi3;
4820 else if (code == GT && TARGET_SSE4_1)
4821 gen = gen_sminv8qi3;
4822 break;
4823 case E_V4QImode:
4824 if (code == GTU && TARGET_SSE2)
4825 gen = gen_uminv4qi3;
4826 else if (code == GT && TARGET_SSE4_1)
4827 gen = gen_sminv4qi3;
4828 break;
4829 case E_V2QImode:
4830 if (code == GTU && TARGET_SSE2)
4831 gen = gen_uminv2qi3;
4832 else if (code == GT && TARGET_SSE4_1)
4833 gen = gen_sminv2qi3;
4834 break;
4835 case E_V8HImode:
4836 if (code == GTU && TARGET_SSE4_1)
4837 gen = gen_uminv8hi3;
4838 else if (code == GT && TARGET_SSE2)
4839 gen = gen_sminv8hi3;
4840 break;
4841 case E_V4HImode:
4842 if (code == GTU && TARGET_SSE4_1)
4843 gen = gen_uminv4hi3;
4844 else if (code == GT && TARGET_SSE2)
4845 gen = gen_sminv4hi3;
4846 break;
4847 case E_V2HImode:
4848 if (code == GTU && TARGET_SSE4_1)
4849 gen = gen_uminv2hi3;
4850 else if (code == GT && TARGET_SSE2)
4851 gen = gen_sminv2hi3;
4852 break;
4853 case E_V4SImode:
4854 if (TARGET_SSE4_1)
4855 gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3;
4856 break;
4857 case E_V2SImode:
4858 if (TARGET_SSE4_1)
4859 gen = (code == GTU) ? gen_uminv2si3 : gen_sminv2si3;
4860 break;
4861 case E_V2DImode:
4862 if (TARGET_AVX512VL)
4863 {
4864 gen = (code == GTU) ? gen_uminv2di3 : gen_sminv2di3;
4865 cop0 = force_reg (mode, cop0);
4866 cop1 = force_reg (mode, cop1);
4867 }
4868 break;
4869 default:
4870 break;
4871 }
4872
4873 if (gen)
4874 {
4875 rtx tem = gen_reg_rtx (mode);
4876 if (!vector_operand (cop0, mode))
4877 cop0 = force_reg (mode, cop0);
4878 if (!vector_operand (cop1, mode))
4879 cop1 = force_reg (mode, cop1);
4880 *negate = !*negate;
4881 emit_insn (gen (tem, cop0, cop1));
4882 cop1 = tem;
4883 code = EQ;
4884 }
4885 }
4886
4887 /* Unsigned parallel compare is not supported by the hardware.
4888 Play some tricks to turn this into a signed comparison
4889 against 0. */
4890 if (code == GTU)
4891 {
4892 cop0 = force_reg (mode, cop0);
4893
4894 switch (mode)
4895 {
4896 case E_V16SImode:
4897 case E_V8DImode:
4898 case E_V8SImode:
4899 case E_V4DImode:
4900 case E_V4SImode:
4901 case E_V2SImode:
4902 case E_V2DImode:
4903 {
4904 rtx t1, t2, mask;
4905
4906 /* Subtract (-(INT MAX) - 1) from both operands to make
4907 them signed. */
4908 mask = ix86_build_signbit_mask (mode, true, false);
4909 t1 = gen_reg_rtx (mode);
4910 emit_insn (gen_sub3_insn (t1, cop0, mask));
4911
4912 t2 = gen_reg_rtx (mode);
4913 emit_insn (gen_sub3_insn (t2, cop1, mask));
4914
4915 cop0 = t1;
4916 cop1 = t2;
4917 code = GT;
4918 }
4919 break;
4920
4921 case E_V64QImode:
4922 case E_V32HImode:
4923 case E_V32QImode:
4924 case E_V16HImode:
4925 case E_V16QImode:
4926 case E_V8QImode:
4927 case E_V4QImode:
4928 case E_V2QImode:
4929 case E_V8HImode:
4930 case E_V4HImode:
4931 case E_V2HImode:
4932 /* Perform a parallel unsigned saturating subtraction. */
4933 x = gen_reg_rtx (mode);
4934 emit_insn (gen_rtx_SET
4935 (x, gen_rtx_US_MINUS (mode, cop0, cop1)));
4936 cop0 = x;
4937 cop1 = CONST0_RTX (mode);
4938 code = EQ;
4939 *negate = !*negate;
4940 break;
4941
4942 default:
4943 gcc_unreachable ();
4944 }
4945 }
4946 }
4947
4948 if (*negate)
4949 std::swap (op_true, op_false);
4950
4951 if (GET_CODE (cop1) == CONST_VECTOR)
4952 cop1 = force_reg (mode, cop1);
4953
4954 /* Allow the comparison to be done in one mode, but the movcc to
4955 happen in another mode. */
4956 if (data_mode == mode)
4957 x = ix86_expand_sse_cmp (dest, code, cop0, cop1, op_true, op_false);
4958 else
4959 {
4960 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
4961 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
4962 op_true, op_false);
4963 if (GET_MODE (x) == mode)
4964 x = gen_lowpart (data_mode, x);
4965 }
4966
4967 return x;
4968 }
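/* Illustrative aside (not part of GCC): scalar sketches of the integer
   canonicalizations performed above, assuming 32-bit and 8-bit elements.

   #include <stdint.h>
   #include <stdbool.h>

   // x <= 5 becomes 6 > x (LE -> GT with cst + 1 and swapped operands);
   // x >= 5 becomes x > 4 (GE -> GT with cst - 1).
   static bool le_as_gt (int32_t x) { return 6 > x; }
   static bool ge_as_gt (int32_t x) { return x > 4; }

   // Unsigned GTU via signed GT: subtracting INT32_MIN (equivalently,
   // flipping the sign bit) biases both operands into signed range.
   static bool gtu_as_gt (uint32_t a, uint32_t b)
   {
     return (int32_t) (a ^ 0x80000000u) > (int32_t) (b ^ 0x80000000u);
   }

   // For narrow elements a saturating subtraction is used instead:
   // a >u b  iff  (a -sat b) != 0, emitted as EQ against zero plus *negate.
   static bool gtu_as_ussub (uint8_t a, uint8_t b)
   {
     uint8_t sat = a > b ? (uint8_t) (a - b) : 0;   // unsigned saturating a - b
     return sat != 0;
   }
*/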
4969
4970 /* Expand integer vector comparison. */
4971
4972 bool
4973 ix86_expand_int_vec_cmp (rtx operands[])
4974 {
4975 rtx_code code = GET_CODE (operands[1]);
4976 bool negate = false;
4977 rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
4978 operands[3], NULL, NULL, &negate);
4979
4980 if (!cmp)
4981 return false;
4982
4983 if (negate)
4984 cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
4985 CONST0_RTX (GET_MODE (cmp)),
4986 NULL, NULL, &negate);
4987
4988 gcc_assert (!negate);
4989
4990 if (operands[0] != cmp)
4991 emit_move_insn (operands[0], cmp);
4992
4993 return true;
4994 }
4995
4996 /* Expand a floating-point vector conditional move; a vcond operation
4997 rather than a movcc operation. */
4998
4999 bool
5000 ix86_expand_fp_vcond (rtx operands[])
5001 {
5002 enum rtx_code code = GET_CODE (operands[3]);
5003 rtx cmp;
5004
5005 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
5006 &operands[4], &operands[5]);
5007 if (code == UNKNOWN)
5008 {
5009 rtx temp;
5010 switch (GET_CODE (operands[3]))
5011 {
5012 case LTGT:
5013 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
5014 operands[5], operands[0], operands[0]);
5015 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
5016 operands[5], operands[1], operands[2]);
5017 code = AND;
5018 break;
5019 case UNEQ:
5020 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
5021 operands[5], operands[0], operands[0]);
5022 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
5023 operands[5], operands[1], operands[2]);
5024 code = IOR;
5025 break;
5026 default:
5027 gcc_unreachable ();
5028 }
5029 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
5030 OPTAB_DIRECT);
5031 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
5032 return true;
5033 }
5034
5035 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
5036 operands[5], operands[1], operands[2]))
5037 return true;
5038
5039 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
5040 operands[1], operands[2]);
5041 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
5042 return true;
5043 }
5044
5045 /* Expand a signed/unsigned integral vector conditional move. */
5046
5047 bool
5048 ix86_expand_int_vcond (rtx operands[])
5049 {
5050 machine_mode data_mode = GET_MODE (operands[0]);
5051 machine_mode mode = GET_MODE (operands[4]);
5052 enum rtx_code code = GET_CODE (operands[3]);
5053 bool negate = false;
5054 rtx x, cop0, cop1;
5055
5056 cop0 = operands[4];
5057 cop1 = operands[5];
5058
5059 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
5060 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
5061 if ((code == LT || code == GE)
5062 && data_mode == mode
5063 && cop1 == CONST0_RTX (mode)
5064 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
5065 && GET_MODE_UNIT_SIZE (data_mode) > 1
5066 && GET_MODE_UNIT_SIZE (data_mode) <= 8
5067 && (GET_MODE_SIZE (data_mode) == 16
5068 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
5069 {
5070 rtx negop = operands[2 - (code == LT)];
5071 int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
5072 if (negop == CONST1_RTX (data_mode))
5073 {
5074 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
5075 operands[0], 1, OPTAB_DIRECT);
5076 if (res != operands[0])
5077 emit_move_insn (operands[0], res);
5078 return true;
5079 }
5080 else if (GET_MODE_INNER (data_mode) != DImode
5081 && vector_all_ones_operand (negop, data_mode))
5082 {
5083 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
5084 operands[0], 0, OPTAB_DIRECT);
5085 if (res != operands[0])
5086 emit_move_insn (operands[0], res);
5087 return true;
5088 }
5089 }
5090
5091 if (!nonimmediate_operand (cop1, mode))
5092 cop1 = force_reg (mode, cop1);
5093 if (!general_operand (operands[1], data_mode))
5094 operands[1] = force_reg (data_mode, operands[1]);
5095 if (!general_operand (operands[2], data_mode))
5096 operands[2] = force_reg (data_mode, operands[2]);
5097
5098 x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
5099 operands[1], operands[2], &negate);
5100
5101 if (!x)
5102 return false;
5103
5104 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
5105 operands[2-negate]);
5106 return true;
5107 }
5108
5109 static bool
5110 ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
5111 struct expand_vec_perm_d *d)
5112 {
5113 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
5114 expander, so args are either in d, or in op0, op1 etc. */
5115 machine_mode mode = GET_MODE (d ? d->op0 : op0);
5116 machine_mode maskmode = mode;
5117 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
5118
5119 switch (mode)
5120 {
5121 case E_V16QImode:
5122 if (TARGET_AVX512VL && TARGET_AVX512VBMI)
5123 gen = gen_avx512vl_vpermt2varv16qi3;
5124 break;
5125 case E_V32QImode:
5126 if (TARGET_AVX512VL && TARGET_AVX512VBMI)
5127 gen = gen_avx512vl_vpermt2varv32qi3;
5128 break;
5129 case E_V64QImode:
5130 if (TARGET_AVX512VBMI)
5131 gen = gen_avx512bw_vpermt2varv64qi3;
5132 break;
5133 case E_V8HImode:
5134 if (TARGET_AVX512VL && TARGET_AVX512BW)
5135 gen = gen_avx512vl_vpermt2varv8hi3;
5136 break;
5137 case E_V16HImode:
5138 if (TARGET_AVX512VL && TARGET_AVX512BW)
5139 gen = gen_avx512vl_vpermt2varv16hi3;
5140 break;
5141 case E_V32HImode:
5142 if (TARGET_AVX512BW)
5143 gen = gen_avx512bw_vpermt2varv32hi3;
5144 break;
5145 case E_V4SImode:
5146 if (TARGET_AVX512VL)
5147 gen = gen_avx512vl_vpermt2varv4si3;
5148 break;
5149 case E_V8SImode:
5150 if (TARGET_AVX512VL)
5151 gen = gen_avx512vl_vpermt2varv8si3;
5152 break;
5153 case E_V16SImode:
5154 if (TARGET_AVX512F)
5155 gen = gen_avx512f_vpermt2varv16si3;
5156 break;
5157 case E_V4SFmode:
5158 if (TARGET_AVX512VL)
5159 {
5160 gen = gen_avx512vl_vpermt2varv4sf3;
5161 maskmode = V4SImode;
5162 }
5163 break;
5164 case E_V8SFmode:
5165 if (TARGET_AVX512VL)
5166 {
5167 gen = gen_avx512vl_vpermt2varv8sf3;
5168 maskmode = V8SImode;
5169 }
5170 break;
5171 case E_V16SFmode:
5172 if (TARGET_AVX512F)
5173 {
5174 gen = gen_avx512f_vpermt2varv16sf3;
5175 maskmode = V16SImode;
5176 }
5177 break;
5178 case E_V2DImode:
5179 if (TARGET_AVX512VL)
5180 gen = gen_avx512vl_vpermt2varv2di3;
5181 break;
5182 case E_V4DImode:
5183 if (TARGET_AVX512VL)
5184 gen = gen_avx512vl_vpermt2varv4di3;
5185 break;
5186 case E_V8DImode:
5187 if (TARGET_AVX512F)
5188 gen = gen_avx512f_vpermt2varv8di3;
5189 break;
5190 case E_V2DFmode:
5191 if (TARGET_AVX512VL)
5192 {
5193 gen = gen_avx512vl_vpermt2varv2df3;
5194 maskmode = V2DImode;
5195 }
5196 break;
5197 case E_V4DFmode:
5198 if (TARGET_AVX512VL)
5199 {
5200 gen = gen_avx512vl_vpermt2varv4df3;
5201 maskmode = V4DImode;
5202 }
5203 break;
5204 case E_V8DFmode:
5205 if (TARGET_AVX512F)
5206 {
5207 gen = gen_avx512f_vpermt2varv8df3;
5208 maskmode = V8DImode;
5209 }
5210 break;
5211 default:
5212 break;
5213 }
5214
5215 if (gen == NULL)
5216 return false;
5217
5218 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
5219 expander, so args are either in d, or in op0, op1 etc. */
5220 if (d)
5221 {
5222 rtx vec[64];
5223 target = d->target;
5224 op0 = d->op0;
5225 op1 = d->op1;
5226 for (int i = 0; i < d->nelt; ++i)
5227 vec[i] = GEN_INT (d->perm[i]);
5228 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
5229 }
5230
5231 emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
5232 return true;
5233 }
5234
5235 /* Expand a variable vector permutation. */
5236
5237 void
5238 ix86_expand_vec_perm (rtx operands[])
5239 {
5240 rtx target = operands[0];
5241 rtx op0 = operands[1];
5242 rtx op1 = operands[2];
5243 rtx mask = operands[3];
5244 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
5245 machine_mode mode = GET_MODE (op0);
5246 machine_mode maskmode = GET_MODE (mask);
5247 int w, e, i;
5248 bool one_operand_shuffle = rtx_equal_p (op0, op1);
5249
5250 /* Number of elements in the vector. */
5251 w = GET_MODE_NUNITS (mode);
5252 e = GET_MODE_UNIT_SIZE (mode);
5253 gcc_assert (w <= 64);
5254
5255 /* For HF mode vector, convert it to HI using subreg. */
5256 if (GET_MODE_INNER (mode) == HFmode)
5257 {
5258 machine_mode orig_mode = mode;
5259 mode = mode_for_vector (HImode, w).require ();
5260 target = lowpart_subreg (mode, target, orig_mode);
5261 op0 = lowpart_subreg (mode, op0, orig_mode);
5262 op1 = lowpart_subreg (mode, op1, orig_mode);
5263 }
5264
5265 if (TARGET_AVX512F && one_operand_shuffle)
5266 {
5267 rtx (*gen) (rtx, rtx, rtx) = NULL;
5268 switch (mode)
5269 {
5270 case E_V16SImode:
5271 gen = gen_avx512f_permvarv16si;
5272 break;
5273 case E_V16SFmode:
5274 gen = gen_avx512f_permvarv16sf;
5275 break;
5276 case E_V8DImode:
5277 gen = gen_avx512f_permvarv8di;
5278 break;
5279 case E_V8DFmode:
5280 gen = gen_avx512f_permvarv8df;
5281 break;
5282 default:
5283 break;
5284 }
5285 if (gen != NULL)
5286 {
5287 emit_insn (gen (target, op0, mask));
5288 return;
5289 }
5290 }
5291
5292 if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
5293 return;
5294
5295 if (TARGET_AVX2)
5296 {
5297 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
5298 {
5299 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
5300 a constant shuffle operand. With a tiny bit of effort we can
5301 use VPERMD instead. A re-interpretation stall for V4DFmode is
5302 unfortunate but there's no avoiding it.
5303 Similarly for V16HImode we don't have instructions for variable
5304 shuffling, while for V32QImode, after preparing suitable masks,
5305 we can use vpshufb; vpshufb; vpermq; vpor. */
5306
5307 if (mode == V16HImode)
5308 {
5309 maskmode = mode = V32QImode;
5310 w = 32;
5311 e = 1;
5312 }
5313 else
5314 {
5315 maskmode = mode = V8SImode;
5316 w = 8;
5317 e = 4;
5318 }
5319 t1 = gen_reg_rtx (maskmode);
5320
5321 /* Replicate the low bits of the V4DImode mask into V8SImode:
5322 mask = { A B C D }
5323 t1 = { A A B B C C D D }. */
5324 for (i = 0; i < w / 2; ++i)
5325 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
5326 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
5327 vt = force_reg (maskmode, vt);
5328 mask = gen_lowpart (maskmode, mask);
5329 if (maskmode == V8SImode)
5330 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
5331 else
5332 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
5333
5334 /* Multiply the shuffle indices by two. */
5335 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
5336 OPTAB_DIRECT);
5337
5338 /* Add one to the odd shuffle indices:
5339 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
5340 for (i = 0; i < w / 2; ++i)
5341 {
5342 vec[i * 2] = const0_rtx;
5343 vec[i * 2 + 1] = const1_rtx;
5344 }
5345 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
5346 vt = validize_mem (force_const_mem (maskmode, vt));
5347 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
5348 OPTAB_DIRECT);
5349
5350 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
5351 operands[3] = mask = t1;
5352 target = gen_reg_rtx (mode);
5353 op0 = gen_lowpart (mode, op0);
5354 op1 = gen_lowpart (mode, op1);
5355 }
5356
5357 switch (mode)
5358 {
5359 case E_V8SImode:
5360 /* The VPERMD and VPERMPS instructions already properly ignore
5361 the high bits of the shuffle elements. No need for us to
5362 perform an AND ourselves. */
5363 if (one_operand_shuffle)
5364 {
5365 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
5366 if (target != operands[0])
5367 emit_move_insn (operands[0],
5368 gen_lowpart (GET_MODE (operands[0]), target));
5369 }
5370 else
5371 {
5372 t1 = gen_reg_rtx (V8SImode);
5373 t2 = gen_reg_rtx (V8SImode);
5374 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
5375 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
5376 goto merge_two;
5377 }
5378 return;
5379
5380 case E_V8SFmode:
5381 mask = gen_lowpart (V8SImode, mask);
5382 if (one_operand_shuffle)
5383 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
5384 else
5385 {
5386 t1 = gen_reg_rtx (V8SFmode);
5387 t2 = gen_reg_rtx (V8SFmode);
5388 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
5389 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
5390 goto merge_two;
5391 }
5392 return;
5393
5394 case E_V4SImode:
5395 /* By combining the two 128-bit input vectors into one 256-bit
5396 input vector, we can use VPERMD and VPERMPS for the full
5397 two-operand shuffle. */
5398 t1 = gen_reg_rtx (V8SImode);
5399 t2 = gen_reg_rtx (V8SImode);
5400 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
5401 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
5402 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
5403 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
5404 return;
5405
5406 case E_V4SFmode:
5407 t1 = gen_reg_rtx (V8SFmode);
5408 t2 = gen_reg_rtx (V8SImode);
5409 mask = gen_lowpart (V4SImode, mask);
5410 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
5411 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
5412 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
5413 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
5414 return;
5415
5416 case E_V32QImode:
5417 t1 = gen_reg_rtx (V32QImode);
5418 t2 = gen_reg_rtx (V32QImode);
5419 t3 = gen_reg_rtx (V32QImode);
5420 vt2 = GEN_INT (-128);
5421 vt = gen_const_vec_duplicate (V32QImode, vt2);
5422 vt = force_reg (V32QImode, vt);
5423 for (i = 0; i < 32; i++)
5424 vec[i] = i < 16 ? vt2 : const0_rtx;
5425 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
5426 vt2 = force_reg (V32QImode, vt2);
5427 /* From mask create two adjusted masks, which contain the same
5428 bits as mask in the low 7 bits of each vector element.
5429 The first mask will have the most significant bit clear
5430 if it requests element from the same 128-bit lane
5431 and MSB set if it requests element from the other 128-bit lane.
5432 The second mask will have the opposite values of the MSB,
5433 and additionally will have its 128-bit lanes swapped.
5434 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
5435 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
5436 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
5437 stands for other 12 bytes. */
5438 /* The bit whether element is from the same lane or the other
5439 lane is bit 4, so shift it up by 3 to the MSB position. */
5440 t5 = gen_reg_rtx (V4DImode);
5441 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
5442 GEN_INT (3)));
5443 /* Clear MSB bits from the mask just in case it had them set. */
5444 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
5445 /* After this t1 will have MSB set for elements from other lane. */
5446 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
5447 /* Clear bits other than MSB. */
5448 emit_insn (gen_andv32qi3 (t1, t1, vt));
5449 /* Or in the lower bits from mask into t3. */
5450 emit_insn (gen_iorv32qi3 (t3, t1, t2));
5451 /* And invert MSB bits in t1, so MSB is set for elements from the same
5452 lane. */
5453 emit_insn (gen_xorv32qi3 (t1, t1, vt));
5454 /* Swap 128-bit lanes in t3. */
5455 t6 = gen_reg_rtx (V4DImode);
5456 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
5457 const2_rtx, GEN_INT (3),
5458 const0_rtx, const1_rtx));
5459 /* And or in the lower bits from mask into t1. */
5460 emit_insn (gen_iorv32qi3 (t1, t1, t2));
5461 if (one_operand_shuffle)
5462 {
5463 /* Each of these shuffles will put 0s in places where
5464 element from the other 128-bit lane is needed, otherwise
5465 will shuffle in the requested value. */
5466 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
5467 gen_lowpart (V32QImode, t6)));
5468 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
5469 /* For t3 the 128-bit lanes are swapped again. */
5470 t7 = gen_reg_rtx (V4DImode);
5471 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
5472 const2_rtx, GEN_INT (3),
5473 const0_rtx, const1_rtx));
5474 /* And oring both together leads to the result. */
5475 emit_insn (gen_iorv32qi3 (target, t1,
5476 gen_lowpart (V32QImode, t7)));
5477 if (target != operands[0])
5478 emit_move_insn (operands[0],
5479 gen_lowpart (GET_MODE (operands[0]), target));
5480 return;
5481 }
5482
5483 t4 = gen_reg_rtx (V32QImode);
5484 /* Similar to the one_operand_shuffle code above,
5485 just repeated twice, once for each operand. The merge_two:
5486 code will merge the two results together. */
5487 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
5488 gen_lowpart (V32QImode, t6)));
5489 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
5490 gen_lowpart (V32QImode, t6)));
5491 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
5492 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
5493 t7 = gen_reg_rtx (V4DImode);
5494 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
5495 const2_rtx, GEN_INT (3),
5496 const0_rtx, const1_rtx));
5497 t8 = gen_reg_rtx (V4DImode);
5498 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
5499 const2_rtx, GEN_INT (3),
5500 const0_rtx, const1_rtx));
5501 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
5502 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
5503 t1 = t4;
5504 t2 = t3;
5505 goto merge_two;
5506
5507 default:
5508 gcc_assert (GET_MODE_SIZE (mode) <= 16);
5509 break;
5510 }
5511 }
5512
5513 if (TARGET_XOP)
5514 {
5515 /* The XOP VPPERM insn supports three inputs. By ignoring the
5516 one_operand_shuffle special case, we avoid creating another
5517 set of constant vectors in memory. */
5518 one_operand_shuffle = false;
5519
5520 /* mask = mask & {2*w-1, ...} */
5521 vt = GEN_INT (2*w - 1);
5522 }
5523 else
5524 {
5525 /* mask = mask & {w-1, ...} */
5526 vt = GEN_INT (w - 1);
5527 }
5528
5529 vt = gen_const_vec_duplicate (maskmode, vt);
5530 mask = expand_simple_binop (maskmode, AND, mask, vt,
5531 NULL_RTX, 0, OPTAB_DIRECT);
5532
5533 /* For non-QImode operations, convert the word permutation control
5534 into a byte permutation control. */
5535 if (mode != V16QImode)
5536 {
5537 mask = expand_simple_binop (maskmode, ASHIFT, mask,
5538 GEN_INT (exact_log2 (e)),
5539 NULL_RTX, 0, OPTAB_DIRECT);
5540
5541 /* Convert mask to vector of chars. */
5542 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
5543
5544 /* Replicate each of the input bytes into byte positions:
5545 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
5546 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
5547 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
5548 for (i = 0; i < 16; ++i)
5549 vec[i] = GEN_INT (i/e * e);
5550 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
5551 vt = validize_mem (force_const_mem (V16QImode, vt));
5552 if (TARGET_XOP)
5553 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
5554 else
5555 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
5556
5557 /* Convert it into the byte positions by doing
5558 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
5559 for (i = 0; i < 16; ++i)
5560 vec[i] = GEN_INT (i % e);
5561 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
5562 vt = validize_mem (force_const_mem (V16QImode, vt));
5563 emit_insn (gen_addv16qi3 (mask, mask, vt));
5564 }
5565
5566 /* The actual shuffle operations all operate on V16QImode. */
5567 op0 = gen_lowpart (V16QImode, op0);
5568 op1 = gen_lowpart (V16QImode, op1);
5569
5570 if (TARGET_XOP)
5571 {
5572 if (GET_MODE (target) != V16QImode)
5573 target = gen_reg_rtx (V16QImode);
5574 emit_insn (gen_xop_pperm (target, op0, op1, mask));
5575 if (target != operands[0])
5576 emit_move_insn (operands[0],
5577 gen_lowpart (GET_MODE (operands[0]), target));
5578 }
5579 else if (one_operand_shuffle)
5580 {
5581 if (GET_MODE (target) != V16QImode)
5582 target = gen_reg_rtx (V16QImode);
5583 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
5584 if (target != operands[0])
5585 emit_move_insn (operands[0],
5586 gen_lowpart (GET_MODE (operands[0]), target));
5587 }
5588 else
5589 {
5590 rtx xops[6];
5591 bool ok;
5592
5593 /* Shuffle the two input vectors independently. */
5594 t1 = gen_reg_rtx (V16QImode);
5595 t2 = gen_reg_rtx (V16QImode);
5596 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
5597 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
5598
5599 merge_two:
5600 /* Then merge them together. The key is whether any given control
5601 element contained a bit set that indicates the second word. */
5602 mask = operands[3];
5603 vt = GEN_INT (w);
5604 if (maskmode == V2DImode && !TARGET_SSE4_1)
5605 {
5606 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
5607 more shuffle to convert the V2DI input mask into a V4SI
5608 input mask. At that point the masking that ix86_expand_int_vcond
5609 performs will work as desired. */
5610 rtx t3 = gen_reg_rtx (V4SImode);
5611 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
5612 const0_rtx, const0_rtx,
5613 const2_rtx, const2_rtx));
5614 mask = t3;
5615 maskmode = V4SImode;
5616 e = w = 4;
5617 }
5618
5619 vt = gen_const_vec_duplicate (maskmode, vt);
5620 vt = force_reg (maskmode, vt);
5621 mask = expand_simple_binop (maskmode, AND, mask, vt,
5622 NULL_RTX, 0, OPTAB_DIRECT);
5623
5624 if (GET_MODE (target) != mode)
5625 target = gen_reg_rtx (mode);
5626 xops[0] = target;
5627 xops[1] = gen_lowpart (mode, t2);
5628 xops[2] = gen_lowpart (mode, t1);
5629 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
5630 xops[4] = mask;
5631 xops[5] = vt;
5632 ok = ix86_expand_int_vcond (xops);
5633 gcc_assert (ok);
5634 if (target != operands[0])
5635 emit_move_insn (operands[0],
5636 gen_lowpart (GET_MODE (operands[0]), target));
5637 }
5638 }
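/* Illustrative aside (not part of GCC): for the pshufb-based paths above,
   a word-level permutation PERM over elements of size E bytes is turned
   into a byte-level control with

     bytectl[j] = perm[j / E] * E + (j % E)      for j = 0 .. 15

   which is what the shift-by-log2(E), replicate and add steps compute.
   E.g. for V4SI (E == 4) and perm = { 2, 0, 3, 1 } the byte control is
   { 8 9 10 11  0 1 2 3  12 13 14 15  4 5 6 7 }.  A C sketch:

   static void word_perm_to_byte_perm (const unsigned char *perm,
                                       unsigned int e, unsigned char *bytectl)
   {
     for (unsigned int j = 0; j < 16; j++)
       bytectl[j] = perm[j / e] * e + j % e;
   }
*/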
5639
5640 /* Extend SRC into the next wider integer vector type. UNSIGNED_P is
5641 true if we should do zero extension, else sign extension. */
5642
5643 void
5644 ix86_expand_sse_extend (rtx dest, rtx src, bool unsigned_p)
5645 {
5646 machine_mode imode = GET_MODE (src);
5647 rtx ops[3];
5648
5649 switch (imode)
5650 {
5651 case E_V8QImode:
5652 case E_V4QImode:
5653 case E_V2QImode:
5654 case E_V4HImode:
5655 case E_V2HImode:
5656 case E_V2SImode:
5657 break;
5658 default:
5659 gcc_unreachable ();
5660 }
5661
5662 ops[0] = gen_reg_rtx (imode);
5663
5664 ops[1] = force_reg (imode, src);
5665
5666 if (unsigned_p)
5667 ops[2] = force_reg (imode, CONST0_RTX (imode));
5668 else
5669 ops[2] = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
5670 ops[1], pc_rtx, pc_rtx);
5671
5672 ix86_split_mmx_punpck (ops, false);
5673 emit_move_insn (dest, lowpart_subreg (GET_MODE (dest), ops[0], imode));
5674 }
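/* Illustrative aside (not part of GCC): the sign-extension path above
   interleaves each element with its sign mask (GT (0, x), i.e. all-ones
   when x is negative), which on a little-endian target yields the widened
   value.  A scalar sketch for QImode -> HImode:

   #include <stdint.h>

   static int16_t extend_via_interleave (int8_t x)
   {
     uint8_t sign_mask = x < 0 ? 0xff : 0x00;    // the GT (0, x) compare
     // punpcklbw places x in the low byte and the mask in the high byte.
     return (int16_t) ((uint16_t) (uint8_t) x | ((uint16_t) sign_mask << 8));
   }
*/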
5675
5676 /* Unpack SRC into the next wider integer vector type. UNSIGNED_P is
5677 true if we should do zero extension, else sign extension. HIGH_P is
5678 true if we want the N/2 high elements, else the low elements. */
5679
5680 void
5681 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
5682 {
5683 machine_mode imode = GET_MODE (src);
5684 rtx tmp;
5685
5686 if (TARGET_SSE4_1)
5687 {
5688 rtx (*unpack)(rtx, rtx);
5689 rtx (*extract)(rtx, rtx) = NULL;
5690 machine_mode halfmode = BLKmode;
5691
5692 switch (imode)
5693 {
5694 case E_V64QImode:
5695 if (unsigned_p)
5696 unpack = gen_avx512bw_zero_extendv32qiv32hi2;
5697 else
5698 unpack = gen_avx512bw_sign_extendv32qiv32hi2;
5699 halfmode = V32QImode;
5700 extract
5701 = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
5702 break;
5703 case E_V32QImode:
5704 if (unsigned_p)
5705 unpack = gen_avx2_zero_extendv16qiv16hi2;
5706 else
5707 unpack = gen_avx2_sign_extendv16qiv16hi2;
5708 halfmode = V16QImode;
5709 extract
5710 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
5711 break;
5712 case E_V32HImode:
5713 if (unsigned_p)
5714 unpack = gen_avx512f_zero_extendv16hiv16si2;
5715 else
5716 unpack = gen_avx512f_sign_extendv16hiv16si2;
5717 halfmode = V16HImode;
5718 extract
5719 = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
5720 break;
5721 case E_V16HImode:
5722 if (unsigned_p)
5723 unpack = gen_avx2_zero_extendv8hiv8si2;
5724 else
5725 unpack = gen_avx2_sign_extendv8hiv8si2;
5726 halfmode = V8HImode;
5727 extract
5728 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
5729 break;
5730 case E_V16SImode:
5731 if (unsigned_p)
5732 unpack = gen_avx512f_zero_extendv8siv8di2;
5733 else
5734 unpack = gen_avx512f_sign_extendv8siv8di2;
5735 halfmode = V8SImode;
5736 extract
5737 = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
5738 break;
5739 case E_V8SImode:
5740 if (unsigned_p)
5741 unpack = gen_avx2_zero_extendv4siv4di2;
5742 else
5743 unpack = gen_avx2_sign_extendv4siv4di2;
5744 halfmode = V4SImode;
5745 extract
5746 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
5747 break;
5748 case E_V16QImode:
5749 if (unsigned_p)
5750 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
5751 else
5752 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
5753 break;
5754 case E_V8HImode:
5755 if (unsigned_p)
5756 unpack = gen_sse4_1_zero_extendv4hiv4si2;
5757 else
5758 unpack = gen_sse4_1_sign_extendv4hiv4si2;
5759 break;
5760 case E_V4SImode:
5761 if (unsigned_p)
5762 unpack = gen_sse4_1_zero_extendv2siv2di2;
5763 else
5764 unpack = gen_sse4_1_sign_extendv2siv2di2;
5765 break;
5766 case E_V8QImode:
5767 if (unsigned_p)
5768 unpack = gen_sse4_1_zero_extendv4qiv4hi2;
5769 else
5770 unpack = gen_sse4_1_sign_extendv4qiv4hi2;
5771 break;
5772 case E_V4HImode:
5773 if (unsigned_p)
5774 unpack = gen_sse4_1_zero_extendv2hiv2si2;
5775 else
5776 unpack = gen_sse4_1_sign_extendv2hiv2si2;
5777 break;
5778 case E_V4QImode:
5779 if (unsigned_p)
5780 unpack = gen_sse4_1_zero_extendv2qiv2hi2;
5781 else
5782 unpack = gen_sse4_1_sign_extendv2qiv2hi2;
5783 break;
5784 default:
5785 gcc_unreachable ();
5786 }
5787
5788 if (GET_MODE_SIZE (imode) >= 32)
5789 {
5790 tmp = gen_reg_rtx (halfmode);
5791 emit_insn (extract (tmp, src));
5792 }
5793 else if (high_p)
5794 {
5795 switch (GET_MODE_SIZE (imode))
5796 {
5797 case 16:
5798 /* Shift higher 8 bytes to lower 8 bytes. */
5799 tmp = gen_reg_rtx (V1TImode);
5800 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
5801 GEN_INT (64)));
5802 break;
5803 case 8:
5804 /* Shift higher 4 bytes to lower 4 bytes. */
5805 tmp = gen_reg_rtx (V1DImode);
5806 emit_insn (gen_mmx_lshrv1di3 (tmp, gen_lowpart (V1DImode, src),
5807 GEN_INT (32)));
5808 break;
5809 case 4:
5810 /* Shift higher 2 bytes to lower 2 bytes. */
5811 tmp = gen_reg_rtx (V1SImode);
5812 emit_insn (gen_mmx_lshrv1si3 (tmp, gen_lowpart (V1SImode, src),
5813 GEN_INT (16)));
5814 break;
5815 default:
5816 gcc_unreachable ();
5817 }
5818
5819 tmp = gen_lowpart (imode, tmp);
5820 }
5821 else
5822 tmp = src;
5823
5824 emit_insn (unpack (dest, tmp));
5825 }
5826 else
5827 {
5828 rtx (*unpack)(rtx, rtx, rtx);
5829
5830 switch (imode)
5831 {
5832 case E_V16QImode:
5833 if (high_p)
5834 unpack = gen_vec_interleave_highv16qi;
5835 else
5836 unpack = gen_vec_interleave_lowv16qi;
5837 break;
5838 case E_V8HImode:
5839 if (high_p)
5840 unpack = gen_vec_interleave_highv8hi;
5841 else
5842 unpack = gen_vec_interleave_lowv8hi;
5843 break;
5844 case E_V4SImode:
5845 if (high_p)
5846 unpack = gen_vec_interleave_highv4si;
5847 else
5848 unpack = gen_vec_interleave_lowv4si;
5849 break;
5850 case E_V8QImode:
5851 if (high_p)
5852 unpack = gen_mmx_punpckhbw;
5853 else
5854 unpack = gen_mmx_punpcklbw;
5855 break;
5856 case E_V4HImode:
5857 if (high_p)
5858 unpack = gen_mmx_punpckhwd;
5859 else
5860 unpack = gen_mmx_punpcklwd;
5861 break;
5862 case E_V4QImode:
5863 if (high_p)
5864 unpack = gen_mmx_punpckhbw_low;
5865 else
5866 unpack = gen_mmx_punpcklbw_low;
5867 break;
5868 default:
5869 gcc_unreachable ();
5870 }
5871
5872 if (unsigned_p)
5873 tmp = force_reg (imode, CONST0_RTX (imode));
5874 else
5875 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
5876 src, pc_rtx, pc_rtx);
5877
5878 rtx tmp2 = gen_reg_rtx (imode);
5879 emit_insn (unpack (tmp2, src, tmp));
5880 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
5881 }
5882 }
5883
5884 /* Return true if MEM is a constant pool reference which contains a
5885 CONST_VECTOR permutation index; if so, assign the index to PERM. */
5886 bool
5887 ix86_extract_perm_from_pool_constant (int* perm, rtx mem)
5888 {
5889 machine_mode mode = GET_MODE (mem);
5890 int nelt = GET_MODE_NUNITS (mode);
5891
5892 if (!INTEGRAL_MODE_P (mode))
5893 return false;
5894
5895 /* Needs to be constant pool. */
5896 if (!(MEM_P (mem))
5897 || !SYMBOL_REF_P (XEXP (mem, 0))
5898 || !CONSTANT_POOL_ADDRESS_P (XEXP (mem, 0)))
5899 return false;
5900
5901 rtx constant = get_pool_constant (XEXP (mem, 0));
5902
5903 if (GET_CODE (constant) != CONST_VECTOR)
5904 return false;
5905
5906 /* There could be some rtx like
5907 (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
5908 but with "*.LC1" referring to a V2DI constant vector. */
5909 if (GET_MODE (constant) != mode)
5910 {
5911 constant = simplify_subreg (mode, constant, GET_MODE (constant), 0);
5912
5913 if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR)
5914 return false;
5915 }
5916
5917 for (int i = 0; i != nelt; i++)
5918 perm[i] = UINTVAL (XVECEXP (constant, 0, i));
5919
5920 return true;
5921 }
5922
5923 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
5924 but works for floating point parameters and non-offsettable memories.
5925 For pushes, it returns just stack offsets; the values will be saved
5926 in the right order. At most four parts are generated. */
5927
5928 static int
5929 ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
5930 {
5931 int size;
5932
5933 if (!TARGET_64BIT)
5934 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
5935 else
5936 size = (GET_MODE_SIZE (mode) + 4) / 8;
5937
5938 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
5939 gcc_assert (size >= 2 && size <= 4);
5940
5941 /* Optimize constant pool reference to immediates. This is used by fp
5942 moves, that force all constants to memory to allow combining. */
5943 if (MEM_P (operand) && MEM_READONLY_P (operand))
5944 operand = avoid_constant_pool_reference (operand);
5945
5946 if (MEM_P (operand) && !offsettable_memref_p (operand))
5947 {
5948 /* The only non-offsettable memories we handle are pushes. */
5949 int ok = push_operand (operand, VOIDmode);
5950
5951 gcc_assert (ok);
5952
5953 operand = copy_rtx (operand);
5954 PUT_MODE (operand, word_mode);
5955 parts[0] = parts[1] = parts[2] = parts[3] = operand;
5956 return size;
5957 }
5958
5959 if (GET_CODE (operand) == CONST_VECTOR)
5960 {
5961 scalar_int_mode imode = int_mode_for_mode (mode).require ();
5962 /* Caution: if we looked through a constant pool memory above,
5963 the operand may actually have a different mode now. That's
5964 ok, since we want to pun this all the way back to an integer. */
5965 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
5966 gcc_assert (operand != NULL);
5967 mode = imode;
5968 }
5969
5970 if (!TARGET_64BIT)
5971 {
5972 if (mode == DImode)
5973 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
5974 else
5975 {
5976 int i;
5977
5978 if (REG_P (operand))
5979 {
5980 gcc_assert (reload_completed);
5981 for (i = 0; i < size; i++)
5982 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
5983 }
5984 else if (offsettable_memref_p (operand))
5985 {
5986 operand = adjust_address (operand, SImode, 0);
5987 parts[0] = operand;
5988 for (i = 1; i < size; i++)
5989 parts[i] = adjust_address (operand, SImode, 4 * i);
5990 }
5991 else if (CONST_DOUBLE_P (operand))
5992 {
5993 const REAL_VALUE_TYPE *r;
5994 long l[4];
5995
5996 r = CONST_DOUBLE_REAL_VALUE (operand);
5997 switch (mode)
5998 {
5999 case E_TFmode:
6000 real_to_target (l, r, mode);
6001 parts[3] = gen_int_mode (l[3], SImode);
6002 parts[2] = gen_int_mode (l[2], SImode);
6003 break;
6004 case E_XFmode:
6005 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
6006 long double may not be 80-bit. */
6007 real_to_target (l, r, mode);
6008 parts[2] = gen_int_mode (l[2], SImode);
6009 break;
6010 case E_DFmode:
6011 REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
6012 break;
6013 default:
6014 gcc_unreachable ();
6015 }
6016 parts[1] = gen_int_mode (l[1], SImode);
6017 parts[0] = gen_int_mode (l[0], SImode);
6018 }
6019 else
6020 gcc_unreachable ();
6021 }
6022 }
6023 else
6024 {
6025 if (mode == TImode)
6026 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
6027 if (mode == XFmode || mode == TFmode)
6028 {
6029 machine_mode upper_mode = mode==XFmode ? SImode : DImode;
6030 if (REG_P (operand))
6031 {
6032 gcc_assert (reload_completed);
6033 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
6034 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
6035 }
6036 else if (offsettable_memref_p (operand))
6037 {
6038 operand = adjust_address (operand, DImode, 0);
6039 parts[0] = operand;
6040 parts[1] = adjust_address (operand, upper_mode, 8);
6041 }
6042 else if (CONST_DOUBLE_P (operand))
6043 {
6044 long l[4];
6045
6046 real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
6047
6048 /* real_to_target puts 32-bit pieces in each long. */
6049 parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
6050 | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
6051 << 32), DImode);
6052
6053 if (upper_mode == SImode)
6054 parts[1] = gen_int_mode (l[2], SImode);
6055 else
6056 parts[1]
6057 = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
6058 | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
6059 << 32), DImode);
6060 }
6061 else
6062 gcc_unreachable ();
6063 }
6064 }
6065
6066 return size;
6067 }
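
/* Illustrative sketch (not part of GCC): a host-side analogue of the
   CONST_DOUBLE handling above for DFmode on a 32-bit target - the bit
   pattern of a double viewed as two 32-bit words, low word first as on
   little-endian x86.  Guarded out of compilation; the function name is
   hypothetical.  */
#if 0
#include <stdint.h>
#include <string.h>

static void
split_double_into_words (double x, uint32_t parts[2])
{
  uint64_t bits;
  memcpy (&bits, &x, sizeof bits);      /* Bit-cast, no FP conversion.  */
  parts[0] = (uint32_t) bits;           /* Low 32 bits -> parts[0].  */
  parts[1] = (uint32_t) (bits >> 32);   /* High 32 bits -> parts[1].  */
}
#endif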
6068
6069 /* Emit insns to perform a move or push of DI, DF, XF, and TF values;
6070 all required insns are emitted here.  Operands 2-5 are used to hold
6071 the destination parts and operands 6-9 the source parts, in the
6072 correct order for the emitted moves.  */
6073
6074 void
6075 ix86_split_long_move (rtx operands[])
6076 {
6077 rtx part[2][4];
6078 int nparts, i, j;
6079 int push = 0;
6080 int collisions = 0;
6081 machine_mode mode = GET_MODE (operands[0]);
6082 bool collisionparts[4];
6083
6084 /* The DFmode expanders may ask us to move a double.
6085 For a 64-bit target this is a single move.  By hiding that fact
6086 here we simplify the i386.md splitters. */
6087 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
6088 {
6089 /* Optimize constant pool reference to immediates. This is used by
6090 fp moves, which force all constants to memory to allow combining. */
6091
6092 if (MEM_P (operands[1])
6093 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
6094 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
6095 operands[1] = get_pool_constant (XEXP (operands[1], 0));
6096 if (push_operand (operands[0], VOIDmode))
6097 {
6098 operands[0] = copy_rtx (operands[0]);
6099 PUT_MODE (operands[0], word_mode);
6100 }
6101 else
6102 operands[0] = gen_lowpart (DImode, operands[0]);
6103 operands[1] = gen_lowpart (DImode, operands[1]);
6104 emit_move_insn (operands[0], operands[1]);
6105 return;
6106 }
6107
6108 /* The only non-offsettable memory we handle is a push. */
6109 if (push_operand (operands[0], VOIDmode))
6110 push = 1;
6111 else
6112 gcc_assert (!MEM_P (operands[0])
6113 || offsettable_memref_p (operands[0]));
6114
6115 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
6116 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
6117
6118 /* When emitting a push, take care of source operands on the stack. */
6119 if (push && MEM_P (operands[1])
6120 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
6121 {
6122 rtx src_base = XEXP (part[1][nparts - 1], 0);
6123
6124 /* Compensate for the stack decrement by 4. */
6125 if (!TARGET_64BIT && nparts == 3
6126 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
6127 src_base = plus_constant (Pmode, src_base, 4);
6128
6129 /* src_base refers to the stack pointer and is
6130 automatically decreased by emitted push. */
6131 for (i = 0; i < nparts; i++)
6132 part[1][i] = change_address (part[1][i],
6133 GET_MODE (part[1][i]), src_base);
6134 }
6135
6136 /* We need to do the copy in the right order in case an address register
6137 of the source overlaps the destination. */
6138 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
6139 {
6140 rtx tmp;
6141
6142 for (i = 0; i < nparts; i++)
6143 {
6144 collisionparts[i]
6145 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
6146 if (collisionparts[i])
6147 collisions++;
6148 }
6149
6150 /* Collision in the middle part can be handled by reordering. */
6151 if (collisions == 1 && nparts == 3 && collisionparts [1])
6152 {
6153 std::swap (part[0][1], part[0][2]);
6154 std::swap (part[1][1], part[1][2]);
6155 }
6156 else if (collisions == 1
6157 && nparts == 4
6158 && (collisionparts [1] || collisionparts [2]))
6159 {
6160 if (collisionparts [1])
6161 {
6162 std::swap (part[0][1], part[0][2]);
6163 std::swap (part[1][1], part[1][2]);
6164 }
6165 else
6166 {
6167 std::swap (part[0][2], part[0][3]);
6168 std::swap (part[1][2], part[1][3]);
6169 }
6170 }
6171
6172 /* If there are more collisions, we can't handle them by reordering.
6173 Do an lea into the last part and use only one colliding move. */
6174 else if (collisions > 1)
6175 {
6176 rtx base, addr;
6177
6178 collisions = 1;
6179
6180 base = part[0][nparts - 1];
6181
6182 /* Handle the case when the last part isn't valid for lea.
6183 Happens in 64-bit mode storing the 12-byte XFmode. */
6184 if (GET_MODE (base) != Pmode)
6185 base = gen_rtx_REG (Pmode, REGNO (base));
6186
6187 addr = XEXP (part[1][0], 0);
6188 if (TARGET_TLS_DIRECT_SEG_REFS)
6189 {
6190 struct ix86_address parts;
6191 int ok = ix86_decompose_address (addr, &parts);
6192 gcc_assert (ok);
6193 /* It is not valid to use %gs: or %fs: in lea. */
6194 gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
6195 }
6196 emit_insn (gen_rtx_SET (base, addr));
6197 part[1][0] = replace_equiv_address (part[1][0], base);
6198 for (i = 1; i < nparts; i++)
6199 {
6200 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
6201 part[1][i] = replace_equiv_address (part[1][i], tmp);
6202 }
6203 }
6204 }
6205
6206 if (push)
6207 {
6208 if (!TARGET_64BIT)
6209 {
6210 if (nparts == 3)
6211 {
6212 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
6213 emit_insn (gen_add2_insn (stack_pointer_rtx, GEN_INT (-4)));
6214 emit_move_insn (part[0][2], part[1][2]);
6215 }
6216 else if (nparts == 4)
6217 {
6218 emit_move_insn (part[0][3], part[1][3]);
6219 emit_move_insn (part[0][2], part[1][2]);
6220 }
6221 }
6222 else
6223 {
6224 /* In 64-bit mode we don't have a 32-bit push available.  If this is a
6225 register, it is OK - we will just use the larger counterpart.  We also
6226 retype the memory - this comes from an attempt to avoid a REX prefix on
6227 moving the second half of a TFmode value. */
6228 if (GET_MODE (part[1][1]) == SImode)
6229 {
6230 switch (GET_CODE (part[1][1]))
6231 {
6232 case MEM:
6233 part[1][1] = adjust_address (part[1][1], DImode, 0);
6234 break;
6235
6236 case REG:
6237 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
6238 break;
6239
6240 default:
6241 gcc_unreachable ();
6242 }
6243
6244 if (GET_MODE (part[1][0]) == SImode)
6245 part[1][0] = part[1][1];
6246 }
6247 }
6248 emit_move_insn (part[0][1], part[1][1]);
6249 emit_move_insn (part[0][0], part[1][0]);
6250 return;
6251 }
6252
6253 /* Choose correct order to not overwrite the source before it is copied. */
6254 if ((REG_P (part[0][0])
6255 && REG_P (part[1][1])
6256 && (REGNO (part[0][0]) == REGNO (part[1][1])
6257 || (nparts == 3
6258 && REGNO (part[0][0]) == REGNO (part[1][2]))
6259 || (nparts == 4
6260 && REGNO (part[0][0]) == REGNO (part[1][3]))))
6261 || (collisions > 0
6262 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
6263 {
6264 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
6265 {
6266 operands[2 + i] = part[0][j];
6267 operands[6 + i] = part[1][j];
6268 }
6269 }
6270 else
6271 {
6272 for (i = 0; i < nparts; i++)
6273 {
6274 operands[2 + i] = part[0][i];
6275 operands[6 + i] = part[1][i];
6276 }
6277 }
6278
6279 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
6280 if (optimize_insn_for_size_p ())
6281 {
6282 for (j = 0; j < nparts - 1; j++)
6283 if (CONST_INT_P (operands[6 + j])
6284 && operands[6 + j] != const0_rtx
6285 && REG_P (operands[2 + j]))
6286 for (i = j; i < nparts - 1; i++)
6287 if (CONST_INT_P (operands[7 + i])
6288 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
6289 operands[7 + i] = operands[2 + j];
6290 }
6291
6292 for (i = 0; i < nparts; i++)
6293 emit_move_insn (operands[2 + i], operands[6 + i]);
6294
6295 return;
6296 }
6297
6298 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
6299 left shift by a constant, either using a single shift or
6300 a sequence of add instructions. */
6301
6302 static void
6303 ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
6304 {
6305 if (count == 1
6306 || (count * ix86_cost->add <= ix86_cost->shift_const
6307 && !optimize_insn_for_size_p ()))
6308 {
6309 while (count-- > 0)
6310 emit_insn (gen_add2_insn (operand, operand));
6311 }
6312 else
6313 {
6314 rtx (*insn)(rtx, rtx, rtx);
6315
6316 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
6317 emit_insn (insn (operand, operand, GEN_INT (count)));
6318 }
6319 }
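
/* Illustrative sketch (not part of GCC): the repeated-doubling alternative
   ix86_expand_ashl_const chooses when COUNT additions are no more expensive
   than one shift by a constant, written as plain C.  Guarded out of
   compilation; the function name is hypothetical.  */
#if 0
static unsigned long
shift_left_via_adds (unsigned long x, unsigned int count)
{
  while (count-- > 0)
    x += x;     /* x + x == x << 1.  */
  return x;
}
#endif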
6320
6321 void
6322 ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
6323 {
6324 rtx (*gen_ashl3)(rtx, rtx, rtx);
6325 rtx (*gen_shld)(rtx, rtx, rtx);
6326 int half_width = GET_MODE_BITSIZE (mode) >> 1;
6327 machine_mode half_mode;
6328
6329 rtx low[2], high[2];
6330 int count;
6331
6332 if (CONST_INT_P (operands[2]))
6333 {
6334 split_double_mode (mode, operands, 2, low, high);
6335 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
6336
6337 if (count >= half_width)
6338 {
6339 emit_move_insn (high[0], low[1]);
6340 ix86_expand_clear (low[0]);
6341
6342 if (count > half_width)
6343 ix86_expand_ashl_const (high[0], count - half_width, mode);
6344 }
6345 else
6346 {
6347 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
6348
6349 if (!rtx_equal_p (operands[0], operands[1]))
6350 emit_move_insn (operands[0], operands[1]);
6351
6352 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
6353 ix86_expand_ashl_const (low[0], count, mode);
6354 }
6355 return;
6356 }
6357
6358 split_double_mode (mode, operands, 1, low, high);
6359 half_mode = mode == DImode ? SImode : DImode;
6360
6361 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
6362
6363 if (operands[1] == const1_rtx)
6364 {
6365 /* Assuming we've chosen QImode-capable registers, 1 << N
6366 can be done with two 32/64-bit shifts, no branches, no cmoves. */
6367 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
6368 {
6369 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
6370
6371 ix86_expand_clear (low[0]);
6372 ix86_expand_clear (high[0]);
6373 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
6374
6375 d = gen_lowpart (QImode, low[0]);
6376 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
6377 s = gen_rtx_EQ (QImode, flags, const0_rtx);
6378 emit_insn (gen_rtx_SET (d, s));
6379
6380 d = gen_lowpart (QImode, high[0]);
6381 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
6382 s = gen_rtx_NE (QImode, flags, const0_rtx);
6383 emit_insn (gen_rtx_SET (d, s));
6384 }
6385
6386 /* Otherwise, we can get the same results by manually performing
6387 a bit extract operation on bit 5/6, and then performing the two
6388 shifts. The two methods of getting 0/1 into low/high are exactly
6389 the same size. Avoiding the shift in the bit extract case helps
6390 pentium4 a bit; no one else seems to care much either way. */
6391 else
6392 {
6393 rtx (*gen_lshr3)(rtx, rtx, rtx);
6394 rtx (*gen_and3)(rtx, rtx, rtx);
6395 rtx (*gen_xor3)(rtx, rtx, rtx);
6396 HOST_WIDE_INT bits;
6397 rtx x;
6398
6399 if (mode == DImode)
6400 {
6401 gen_lshr3 = gen_lshrsi3;
6402 gen_and3 = gen_andsi3;
6403 gen_xor3 = gen_xorsi3;
6404 bits = 5;
6405 }
6406 else
6407 {
6408 gen_lshr3 = gen_lshrdi3;
6409 gen_and3 = gen_anddi3;
6410 gen_xor3 = gen_xordi3;
6411 bits = 6;
6412 }
6413
6414 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
6415 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
6416 else
6417 x = gen_lowpart (half_mode, operands[2]);
6418 emit_insn (gen_rtx_SET (high[0], x));
6419
6420 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
6421 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
6422 emit_move_insn (low[0], high[0]);
6423 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
6424 }
6425
6426 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
6427 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
6428 return;
6429 }
6430
6431 if (operands[1] == constm1_rtx)
6432 {
6433 /* For -1 << N, we can avoid the shld instruction, because we
6434 know that we're shifting 0...31/63 ones into a -1. */
6435 emit_move_insn (low[0], constm1_rtx);
6436 if (optimize_insn_for_size_p ())
6437 emit_move_insn (high[0], low[0]);
6438 else
6439 emit_move_insn (high[0], constm1_rtx);
6440 }
6441 else
6442 {
6443 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
6444
6445 if (!rtx_equal_p (operands[0], operands[1]))
6446 emit_move_insn (operands[0], operands[1]);
6447
6448 split_double_mode (mode, operands, 1, low, high);
6449 emit_insn (gen_shld (high[0], low[0], operands[2]));
6450 }
6451
6452 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
6453
6454 if (TARGET_CMOVE && scratch)
6455 {
6456 ix86_expand_clear (scratch);
6457 emit_insn (gen_x86_shift_adj_1
6458 (half_mode, high[0], low[0], operands[2], scratch));
6459 }
6460 else
6461 emit_insn (gen_x86_shift_adj_2 (half_mode, high[0], low[0], operands[2]));
6462 }
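
/* Illustrative sketch (not part of GCC): the value computed by the
   double-word left-shift split above, written as plain C on a high/low
   pair of 32-bit words.  For a constant COUNT below 32 the high word is
   what the shld instruction produces.  Guarded out of compilation; the
   function name is hypothetical.  */
#if 0
#include <stdint.h>

static void
shift_left_64_via_32 (uint32_t *hi, uint32_t *lo, unsigned int count)
{
  count &= 63;
  if (count == 0)
    return;
  if (count >= 32)
    {
      *hi = *lo << (count - 32);        /* Low half moves into the high half.  */
      *lo = 0;
    }
  else
    {
      *hi = (*hi << count) | (*lo >> (32 - count));     /* shld.  */
      *lo <<= count;
    }
}
#endif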
6463
6464 void
6465 ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
6466 {
6467 rtx (*gen_ashr3)(rtx, rtx, rtx)
6468 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
6469 rtx (*gen_shrd)(rtx, rtx, rtx);
6470 int half_width = GET_MODE_BITSIZE (mode) >> 1;
6471
6472 rtx low[2], high[2];
6473 int count;
6474
6475 if (CONST_INT_P (operands[2]))
6476 {
6477 split_double_mode (mode, operands, 2, low, high);
6478 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
6479
6480 if (count == GET_MODE_BITSIZE (mode) - 1)
6481 {
6482 emit_move_insn (high[0], high[1]);
6483 emit_insn (gen_ashr3 (high[0], high[0],
6484 GEN_INT (half_width - 1)));
6485 emit_move_insn (low[0], high[0]);
6486
6487 }
6488 else if (count >= half_width)
6489 {
6490 emit_move_insn (low[0], high[1]);
6491 emit_move_insn (high[0], low[0]);
6492 emit_insn (gen_ashr3 (high[0], high[0],
6493 GEN_INT (half_width - 1)));
6494
6495 if (count > half_width)
6496 emit_insn (gen_ashr3 (low[0], low[0],
6497 GEN_INT (count - half_width)));
6498 }
6499 else
6500 {
6501 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6502
6503 if (!rtx_equal_p (operands[0], operands[1]))
6504 emit_move_insn (operands[0], operands[1]);
6505
6506 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
6507 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
6508 }
6509 }
6510 else
6511 {
6512 machine_mode half_mode;
6513
6514 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6515
6516 if (!rtx_equal_p (operands[0], operands[1]))
6517 emit_move_insn (operands[0], operands[1]);
6518
6519 split_double_mode (mode, operands, 1, low, high);
6520 half_mode = mode == DImode ? SImode : DImode;
6521
6522 emit_insn (gen_shrd (low[0], high[0], operands[2]));
6523 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
6524
6525 if (TARGET_CMOVE && scratch)
6526 {
6527 emit_move_insn (scratch, high[0]);
6528 emit_insn (gen_ashr3 (scratch, scratch,
6529 GEN_INT (half_width - 1)));
6530 emit_insn (gen_x86_shift_adj_1
6531 (half_mode, low[0], high[0], operands[2], scratch));
6532 }
6533 else
6534 emit_insn (gen_x86_shift_adj_3
6535 (half_mode, low[0], high[0], operands[2]));
6536 }
6537 }
6538
6539 void
6540 ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
6541 {
6542 rtx (*gen_lshr3)(rtx, rtx, rtx)
6543 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
6544 rtx (*gen_shrd)(rtx, rtx, rtx);
6545 int half_width = GET_MODE_BITSIZE (mode) >> 1;
6546
6547 rtx low[2], high[2];
6548 int count;
6549
6550 if (CONST_INT_P (operands[2]))
6551 {
6552 split_double_mode (mode, operands, 2, low, high);
6553 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
6554
6555 if (count >= half_width)
6556 {
6557 emit_move_insn (low[0], high[1]);
6558 ix86_expand_clear (high[0]);
6559
6560 if (count > half_width)
6561 emit_insn (gen_lshr3 (low[0], low[0],
6562 GEN_INT (count - half_width)));
6563 }
6564 else
6565 {
6566 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6567
6568 if (!rtx_equal_p (operands[0], operands[1]))
6569 emit_move_insn (operands[0], operands[1]);
6570
6571 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
6572 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
6573 }
6574 }
6575 else
6576 {
6577 machine_mode half_mode;
6578
6579 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6580
6581 if (!rtx_equal_p (operands[0], operands[1]))
6582 emit_move_insn (operands[0], operands[1]);
6583
6584 split_double_mode (mode, operands, 1, low, high);
6585 half_mode = mode == DImode ? SImode : DImode;
6586
6587 emit_insn (gen_shrd (low[0], high[0], operands[2]));
6588 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
6589
6590 if (TARGET_CMOVE && scratch)
6591 {
6592 ix86_expand_clear (scratch);
6593 emit_insn (gen_x86_shift_adj_1
6594 (half_mode, low[0], high[0], operands[2], scratch));
6595 }
6596 else
6597 emit_insn (gen_x86_shift_adj_2
6598 (half_mode, low[0], high[0], operands[2]));
6599 }
6600 }
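
/* Illustrative sketch (not part of GCC): the corresponding right-shift
   decompositions from ix86_split_ashr and ix86_split_lshr, written as
   plain C on a 32-bit high/low pair.  The low word is what the shrd
   instruction produces; the high word gets a plain arithmetic or logical
   shift.  Assumes >> on a signed operand is an arithmetic shift, as with
   GCC on x86.  Guarded out of compilation; the function name is
   hypothetical.  */
#if 0
#include <stdint.h>

static void
shift_right_64_via_32 (uint32_t *hi, uint32_t *lo, unsigned int count,
                       int arithmetic)
{
  count &= 63;
  if (count == 0)
    return;
  uint32_t sign = arithmetic ? (uint32_t) ((int32_t) *hi >> 31) : 0;
  if (count >= 32)
    {
      *lo = arithmetic ? (uint32_t) ((int32_t) *hi >> (count - 32))
                       : *hi >> (count - 32);
      *hi = sign;
    }
  else
    {
      *lo = (*lo >> count) | (*hi << (32 - count));     /* shrd.  */
      *hi = arithmetic ? (uint32_t) ((int32_t) *hi >> count)
                       : *hi >> count;
    }
}
#endif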
6601
6602 /* Expand move of V1TI mode register X to a new TI mode register. */
6603 static rtx
6604 ix86_expand_v1ti_to_ti (rtx x)
6605 {
6606 rtx result = gen_reg_rtx (TImode);
6607 if (TARGET_SSE2)
6608 {
6609 rtx temp = force_reg (V2DImode, gen_lowpart (V2DImode, x));
6610 rtx lo = gen_lowpart (DImode, result);
6611 emit_insn (gen_vec_extractv2didi (lo, temp, const0_rtx));
6612 rtx hi = gen_highpart (DImode, result);
6613 emit_insn (gen_vec_extractv2didi (hi, temp, const1_rtx));
6614 }
6615 else
6616 emit_move_insn (result, gen_lowpart (TImode, x));
6617 return result;
6618 }
6619
6620 /* Expand move of TI mode register X to a new V1TI mode register. */
6621 static rtx
6622 ix86_expand_ti_to_v1ti (rtx x)
6623 {
6624 if (TARGET_SSE2)
6625 {
6626 rtx lo = gen_lowpart (DImode, x);
6627 rtx hi = gen_highpart (DImode, x);
6628 rtx tmp = gen_reg_rtx (V2DImode);
6629 emit_insn (gen_vec_concatv2di (tmp, lo, hi));
6630 return force_reg (V1TImode, gen_lowpart (V1TImode, tmp));
6631 }
6632
6633 return force_reg (V1TImode, gen_lowpart (V1TImode, x));
6634 }
6635
6636 /* Expand V1TI mode shift (of rtx_code CODE) by constant. */
6637 void
6638 ix86_expand_v1ti_shift (enum rtx_code code, rtx operands[])
6639 {
6640 rtx op1 = force_reg (V1TImode, operands[1]);
6641
6642 if (!CONST_INT_P (operands[2]))
6643 {
6644 rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
6645 rtx tmp2 = gen_reg_rtx (TImode);
6646 rtx (*shift) (rtx, rtx, rtx)
6647 = (code == ASHIFT) ? gen_ashlti3 : gen_lshrti3;
6648 emit_insn (shift (tmp2, tmp1, operands[2]));
6649 rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
6650 emit_move_insn (operands[0], tmp3);
6651 return;
6652 }
6653
6654 HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
6655
6656 if (bits == 0)
6657 {
6658 emit_move_insn (operands[0], op1);
6659 return;
6660 }
6661
6662 if ((bits & 7) == 0)
6663 {
6664 rtx tmp = gen_reg_rtx (V1TImode);
6665 if (code == ASHIFT)
6666 emit_insn (gen_sse2_ashlv1ti3 (tmp, op1, GEN_INT (bits)));
6667 else
6668 emit_insn (gen_sse2_lshrv1ti3 (tmp, op1, GEN_INT (bits)));
6669 emit_move_insn (operands[0], tmp);
6670 return;
6671 }
6672
6673 rtx tmp1 = gen_reg_rtx (V1TImode);
6674 if (code == ASHIFT)
6675 emit_insn (gen_sse2_ashlv1ti3 (tmp1, op1, GEN_INT (64)));
6676 else
6677 emit_insn (gen_sse2_lshrv1ti3 (tmp1, op1, GEN_INT (64)));
6678
6679 /* tmp2 is operands[1] shifted by 64, in V2DImode. */
6680 rtx tmp2 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
6681
6682 /* tmp3 will be the V2DImode result. */
6683 rtx tmp3 = gen_reg_rtx (V2DImode);
6684
6685 if (bits > 64)
6686 {
6687 if (code == ASHIFT)
6688 emit_insn (gen_ashlv2di3 (tmp3, tmp2, GEN_INT (bits - 64)));
6689 else
6690 emit_insn (gen_lshrv2di3 (tmp3, tmp2, GEN_INT (bits - 64)));
6691 }
6692 else
6693 {
6694 /* tmp4 is operands[1], in V2DImode. */
6695 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
6696
6697 rtx tmp5 = gen_reg_rtx (V2DImode);
6698 if (code == ASHIFT)
6699 emit_insn (gen_ashlv2di3 (tmp5, tmp4, GEN_INT (bits)));
6700 else
6701 emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
6702
6703 rtx tmp6 = gen_reg_rtx (V2DImode);
6704 if (code == ASHIFT)
6705 emit_insn (gen_lshrv2di3 (tmp6, tmp2, GEN_INT (64 - bits)));
6706 else
6707 emit_insn (gen_ashlv2di3 (tmp6, tmp2, GEN_INT (64 - bits)));
6708
6709 emit_insn (gen_iorv2di3 (tmp3, tmp5, tmp6));
6710 }
6711
6712 /* Convert the result back to V1TImode and store in operands[0]. */
6713 rtx tmp7 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
6714 emit_move_insn (operands[0], tmp7);
6715 }
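
/* Illustrative sketch (not part of GCC): the value the vector sequence
   above computes for a constant left shift - the 128-bit operand held as
   a 64-bit high/low pair; for counts below 64 the halves are recombined
   with an OR, mirroring the psllq/psrlq/por path.  The right-shift case
   is symmetric.  Guarded out of compilation; the function name is
   hypothetical.  */
#if 0
#include <stdint.h>

static void
shift_left_128_via_64 (uint64_t *hi, uint64_t *lo, unsigned int bits)
{
  bits &= 127;
  if (bits == 0)
    return;
  if (bits >= 64)
    {
      *hi = *lo << (bits - 64);
      *lo = 0;
    }
  else
    {
      *hi = (*hi << bits) | (*lo >> (64 - bits));
      *lo <<= bits;
    }
}
#endif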
6716
6717 /* Expand V1TI mode rotate (of rtx_code CODE) by constant. */
6718 void
6719 ix86_expand_v1ti_rotate (enum rtx_code code, rtx operands[])
6720 {
6721 rtx op1 = force_reg (V1TImode, operands[1]);
6722
6723 if (!CONST_INT_P (operands[2]))
6724 {
6725 rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
6726 rtx tmp2 = gen_reg_rtx (TImode);
6727 rtx (*rotate) (rtx, rtx, rtx)
6728 = (code == ROTATE) ? gen_rotlti3 : gen_rotrti3;
6729 emit_insn (rotate (tmp2, tmp1, operands[2]));
6730 rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
6731 emit_move_insn (operands[0], tmp3);
6732 return;
6733 }
6734
6735 HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
6736
6737 if (bits == 0)
6738 {
6739 emit_move_insn (operands[0], op1);
6740 return;
6741 }
6742
6743 if (code == ROTATERT)
6744 bits = 128 - bits;
6745
6746 if ((bits & 31) == 0)
6747 {
6748 rtx tmp2 = gen_reg_rtx (V4SImode);
6749 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6750 if (bits == 32)
6751 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x93)));
6752 else if (bits == 64)
6753 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x4e)));
6754 else
6755 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x39)));
6756 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp2));
6757 return;
6758 }
6759
6760 if ((bits & 7) == 0)
6761 {
6762 rtx tmp1 = gen_reg_rtx (V1TImode);
6763 rtx tmp2 = gen_reg_rtx (V1TImode);
6764 rtx tmp3 = gen_reg_rtx (V1TImode);
6765
6766 emit_insn (gen_sse2_ashlv1ti3 (tmp1, op1, GEN_INT (bits)));
6767 emit_insn (gen_sse2_lshrv1ti3 (tmp2, op1, GEN_INT (128 - bits)));
6768 emit_insn (gen_iorv1ti3 (tmp3, tmp1, tmp2));
6769 emit_move_insn (operands[0], tmp3);
6770 return;
6771 }
6772
6773 rtx op1_v4si = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6774
6775 rtx lobits;
6776 rtx hibits;
6777
6778 switch (bits >> 5)
6779 {
6780 case 0:
6781 lobits = op1_v4si;
6782 hibits = gen_reg_rtx (V4SImode);
6783 emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x93)));
6784 break;
6785
6786 case 1:
6787 lobits = gen_reg_rtx (V4SImode);
6788 hibits = gen_reg_rtx (V4SImode);
6789 emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x93)));
6790 emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x4e)));
6791 break;
6792
6793 case 2:
6794 lobits = gen_reg_rtx (V4SImode);
6795 hibits = gen_reg_rtx (V4SImode);
6796 emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x4e)));
6797 emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x39)));
6798 break;
6799
6800 default:
6801 lobits = gen_reg_rtx (V4SImode);
6802 emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x39)));
6803 hibits = op1_v4si;
6804 break;
6805 }
6806
6807 rtx tmp1 = gen_reg_rtx (V4SImode);
6808 rtx tmp2 = gen_reg_rtx (V4SImode);
6809 rtx tmp3 = gen_reg_rtx (V4SImode);
6810
6811 emit_insn (gen_ashlv4si3 (tmp1, lobits, GEN_INT (bits & 31)));
6812 emit_insn (gen_lshrv4si3 (tmp2, hibits, GEN_INT (32 - (bits & 31))));
6813 emit_insn (gen_iorv4si3 (tmp3, tmp1, tmp2));
6814
6815 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp3));
6816 }
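
/* Illustrative sketch (not part of GCC): a rotate by a multiple of 32 bits
   is just a permutation of the four 32-bit words, which is why the code
   above uses a single pshufd with immediate 0x93 (rotate by 32), 0x4e
   (by 64) or 0x39 (by 96).  Plain-C equivalent on a four-word array,
   least significant word first.  Guarded out of compilation; the function
   name is hypothetical.  */
#if 0
#include <stdint.h>
#include <string.h>

static void
rotate_left_128_by_words (uint32_t w[4], unsigned int words)
{
  uint32_t r[4];
  for (unsigned int i = 0; i < 4; i++)
    r[(i + words) & 3] = w[i];          /* Word I moves up by WORDS slots.  */
  memcpy (w, r, sizeof r);
}
#endif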
6817
6818 /* Expand V1TI mode ashiftrt by constant. */
6819 void
6820 ix86_expand_v1ti_ashiftrt (rtx operands[])
6821 {
6822 rtx op1 = force_reg (V1TImode, operands[1]);
6823
6824 if (!CONST_INT_P (operands[2]))
6825 {
6826 rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
6827 rtx tmp2 = gen_reg_rtx (TImode);
6828 emit_insn (gen_ashrti3 (tmp2, tmp1, operands[2]));
6829 rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
6830 emit_move_insn (operands[0], tmp3);
6831 return;
6832 }
6833
6834 HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
6835
6836 if (bits == 0)
6837 {
6838 emit_move_insn (operands[0], op1);
6839 return;
6840 }
6841
6842 if (bits == 127)
6843 {
6844 /* Two operations. */
6845 rtx tmp1 = force_reg(V4SImode, gen_lowpart (V4SImode, op1));
6846 rtx tmp2 = gen_reg_rtx (V4SImode);
6847 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6848
6849 rtx tmp3 = gen_reg_rtx (V4SImode);
6850 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6851
6852 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp3));
6853 return;
6854 }
6855
6856 if (bits == 64)
6857 {
6858 /* Three operations. */
6859 rtx tmp1 = force_reg(V4SImode, gen_lowpart (V4SImode, op1));
6860 rtx tmp2 = gen_reg_rtx (V4SImode);
6861 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6862
6863 rtx tmp3 = gen_reg_rtx (V4SImode);
6864 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6865
6866 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
6867 rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
6868 rtx tmp6 = gen_reg_rtx (V2DImode);
6869 emit_insn (gen_vec_interleave_highv2di (tmp6, tmp4, tmp5));
6870
6871 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
6872 return;
6873 }
6874
6875 if (bits == 96)
6876 {
6877 /* Three operations. */
6878 rtx tmp1 = force_reg(V4SImode, gen_lowpart (V4SImode, op1));
6879 rtx tmp2 = gen_reg_rtx (V4SImode);
6880 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (31)));
6881
6882 rtx tmp3 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
6883 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp2));
6884 rtx tmp5 = gen_reg_rtx (V2DImode);
6885 emit_insn (gen_vec_interleave_highv2di (tmp5, tmp3, tmp4));
6886
6887 rtx tmp6 = force_reg(V4SImode, gen_lowpart (V4SImode, tmp5));
6888 rtx tmp7 = gen_reg_rtx (V4SImode);
6889 emit_insn (gen_sse2_pshufd (tmp7, tmp6, GEN_INT (0xfd)));
6890
6891 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp7));
6892 return;
6893 }
6894
6895 if (bits >= 111)
6896 {
6897 /* Three operations. */
6898 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6899 rtx tmp2 = gen_reg_rtx (V4SImode);
6900 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits - 96)));
6901
6902 rtx tmp3 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
6903 rtx tmp4 = gen_reg_rtx (V8HImode);
6904 emit_insn (gen_sse2_pshufhw (tmp4, tmp3, GEN_INT (0xfe)));
6905
6906 rtx tmp5 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp4));
6907 rtx tmp6 = gen_reg_rtx (V4SImode);
6908 emit_insn (gen_sse2_pshufd (tmp6, tmp5, GEN_INT (0xfe)));
6909
6910 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
6911 return;
6912 }
6913
6914 if (TARGET_AVX2 || TARGET_SSE4_1)
6915 {
6916 /* Three operations. */
6917 if (bits == 32)
6918 {
6919 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6920 rtx tmp2 = gen_reg_rtx (V4SImode);
6921 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (31)));
6922
6923 rtx tmp3 = gen_reg_rtx (V1TImode);
6924 emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (32)));
6925
6926 if (TARGET_AVX2)
6927 {
6928 rtx tmp4 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp3));
6929 rtx tmp5 = gen_reg_rtx (V4SImode);
6930 emit_insn (gen_avx2_pblenddv4si (tmp5, tmp2, tmp4,
6931 GEN_INT (7)));
6932
6933 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp5));
6934 }
6935 else
6936 {
6937 rtx tmp4 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
6938 rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
6939 rtx tmp6 = gen_reg_rtx (V8HImode);
6940 emit_insn (gen_sse4_1_pblendw (tmp6, tmp4, tmp5,
6941 GEN_INT (0x3f)));
6942
6943 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
6944 }
6945 return;
6946 }
6947
6948 /* Three operations. */
6949 if (bits == 8 || bits == 16 || bits == 24)
6950 {
6951 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6952 rtx tmp2 = gen_reg_rtx (V4SImode);
6953 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
6954
6955 rtx tmp3 = gen_reg_rtx (V1TImode);
6956 emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (bits)));
6957
6958 if (TARGET_AVX2)
6959 {
6960 rtx tmp4 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp3));
6961 rtx tmp5 = gen_reg_rtx (V4SImode);
6962 emit_insn (gen_avx2_pblenddv4si (tmp5, tmp2, tmp4,
6963 GEN_INT (7)));
6964
6965 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp5));
6966 }
6967 else
6968 {
6969 rtx tmp4 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
6970 rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
6971 rtx tmp6 = gen_reg_rtx (V8HImode);
6972 emit_insn (gen_sse4_1_pblendw (tmp6, tmp4, tmp5,
6973 GEN_INT (0x3f)));
6974
6975 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
6976 }
6977 return;
6978 }
6979 }
6980
6981 if (bits > 96)
6982 {
6983 /* Four operations. */
6984 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6985 rtx tmp2 = gen_reg_rtx (V4SImode);
6986 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits - 96)));
6987
6988 rtx tmp3 = gen_reg_rtx (V4SImode);
6989 emit_insn (gen_ashrv4si3 (tmp3, tmp1, GEN_INT (31)));
6990
6991 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp2));
6992 rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
6993 rtx tmp6 = gen_reg_rtx (V2DImode);
6994 emit_insn (gen_vec_interleave_highv2di (tmp6, tmp4, tmp5));
6995
6996 rtx tmp7 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp6));
6997 rtx tmp8 = gen_reg_rtx (V4SImode);
6998 emit_insn (gen_sse2_pshufd (tmp8, tmp7, GEN_INT (0xfd)));
6999
7000 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp8));
7001 return;
7002 }
7003
7004 if (TARGET_SSE4_1 && (bits == 48 || bits == 80))
7005 {
7006 /* Four operations. */
7007 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
7008 rtx tmp2 = gen_reg_rtx (V4SImode);
7009 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
7010
7011 rtx tmp3 = gen_reg_rtx (V4SImode);
7012 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
7013
7014 rtx tmp4 = gen_reg_rtx (V1TImode);
7015 emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (bits)));
7016
7017 rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
7018 rtx tmp6 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp4));
7019 rtx tmp7 = gen_reg_rtx (V8HImode);
7020 emit_insn (gen_sse4_1_pblendw (tmp7, tmp5, tmp6,
7021 GEN_INT (bits == 48 ? 0x1f : 0x07)));
7022
7023 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp7));
7024 return;
7025 }
7026
7027 if ((bits & 7) == 0)
7028 {
7029 /* Five operations. */
7030 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
7031 rtx tmp2 = gen_reg_rtx (V4SImode);
7032 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
7033
7034 rtx tmp3 = gen_reg_rtx (V4SImode);
7035 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
7036
7037 rtx tmp4 = gen_reg_rtx (V1TImode);
7038 emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (bits)));
7039
7040 rtx tmp5 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
7041 rtx tmp6 = gen_reg_rtx (V1TImode);
7042 emit_insn (gen_sse2_ashlv1ti3 (tmp6, tmp5, GEN_INT (128 - bits)));
7043
7044 rtx tmp7 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
7045 rtx tmp8 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp6));
7046 rtx tmp9 = gen_reg_rtx (V2DImode);
7047 emit_insn (gen_iorv2di3 (tmp9, tmp7, tmp8));
7048
7049 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp9));
7050 return;
7051 }
7052
7053 if (TARGET_AVX2 && bits < 32)
7054 {
7055 /* Six operations. */
7056 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
7057 rtx tmp2 = gen_reg_rtx (V4SImode);
7058 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
7059
7060 rtx tmp3 = gen_reg_rtx (V1TImode);
7061 emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (64)));
7062
7063 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
7064 rtx tmp5 = gen_reg_rtx (V2DImode);
7065 emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
7066
7067 rtx tmp6 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
7068 rtx tmp7 = gen_reg_rtx (V2DImode);
7069 emit_insn (gen_ashlv2di3 (tmp7, tmp6, GEN_INT (64 - bits)));
7070
7071 rtx tmp8 = gen_reg_rtx (V2DImode);
7072 emit_insn (gen_iorv2di3 (tmp8, tmp5, tmp7));
7073
7074 rtx tmp9 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp8));
7075 rtx tmp10 = gen_reg_rtx (V4SImode);
7076 emit_insn (gen_avx2_pblenddv4si (tmp10, tmp2, tmp9, GEN_INT (7)));
7077
7078 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp10));
7079 return;
7080 }
7081
7082 if (TARGET_SSE4_1 && bits < 15)
7083 {
7084 /* Six operations. */
7085 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
7086 rtx tmp2 = gen_reg_rtx (V4SImode);
7087 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
7088
7089 rtx tmp3 = gen_reg_rtx (V1TImode);
7090 emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (64)));
7091
7092 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
7093 rtx tmp5 = gen_reg_rtx (V2DImode);
7094 emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
7095
7096 rtx tmp6 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
7097 rtx tmp7 = gen_reg_rtx (V2DImode);
7098 emit_insn (gen_ashlv2di3 (tmp7, tmp6, GEN_INT (64 - bits)));
7099
7100 rtx tmp8 = gen_reg_rtx (V2DImode);
7101 emit_insn (gen_iorv2di3 (tmp8, tmp5, tmp7));
7102
7103 rtx tmp9 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
7104 rtx tmp10 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp8));
7105 rtx tmp11 = gen_reg_rtx (V8HImode);
7106 emit_insn (gen_sse4_1_pblendw (tmp11, tmp9, tmp10, GEN_INT (0x3f)));
7107
7108 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp11));
7109 return;
7110 }
7111
7112 if (bits == 1)
7113 {
7114 /* Eight operations. */
7115 rtx tmp1 = gen_reg_rtx (V1TImode);
7116 emit_insn (gen_sse2_lshrv1ti3 (tmp1, op1, GEN_INT (64)));
7117
7118 rtx tmp2 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
7119 rtx tmp3 = gen_reg_rtx (V2DImode);
7120 emit_insn (gen_lshrv2di3 (tmp3, tmp2, GEN_INT (1)));
7121
7122 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
7123 rtx tmp5 = gen_reg_rtx (V2DImode);
7124 emit_insn (gen_ashlv2di3 (tmp5, tmp4, GEN_INT (63)));
7125
7126 rtx tmp6 = gen_reg_rtx (V2DImode);
7127 emit_insn (gen_iorv2di3 (tmp6, tmp3, tmp5));
7128
7129 rtx tmp7 = gen_reg_rtx (V2DImode);
7130 emit_insn (gen_lshrv2di3 (tmp7, tmp2, GEN_INT (63)));
7131
7132 rtx tmp8 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp7));
7133 rtx tmp9 = gen_reg_rtx (V4SImode);
7134 emit_insn (gen_sse2_pshufd (tmp9, tmp8, GEN_INT (0xbf)));
7135
7136 rtx tmp10 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp9));
7137 rtx tmp11 = gen_reg_rtx (V2DImode);
7138 emit_insn (gen_ashlv2di3 (tmp11, tmp10, GEN_INT (31)));
7139
7140 rtx tmp12 = gen_reg_rtx (V2DImode);
7141 emit_insn (gen_iorv2di3 (tmp12, tmp6, tmp11));
7142
7143 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp12));
7144 return;
7145 }
7146
7147 if (bits > 64)
7148 {
7149 /* Eight operations. */
7150 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
7151 rtx tmp2 = gen_reg_rtx (V4SImode);
7152 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
7153
7154 rtx tmp3 = gen_reg_rtx (V4SImode);
7155 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
7156
7157 rtx tmp4 = gen_reg_rtx (V1TImode);
7158 emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (64)));
7159
7160 rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
7161 rtx tmp6 = gen_reg_rtx (V2DImode);
7162 emit_insn (gen_lshrv2di3 (tmp6, tmp5, GEN_INT (bits - 64)));
7163
7164 rtx tmp7 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
7165 rtx tmp8 = gen_reg_rtx (V1TImode);
7166 emit_insn (gen_sse2_ashlv1ti3 (tmp8, tmp7, GEN_INT (64)));
7167
7168 rtx tmp9 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
7169 rtx tmp10 = gen_reg_rtx (V2DImode);
7170 emit_insn (gen_ashlv2di3 (tmp10, tmp9, GEN_INT (128 - bits)));
7171
7172 rtx tmp11 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp8));
7173 rtx tmp12 = gen_reg_rtx (V2DImode);
7174 emit_insn (gen_iorv2di3 (tmp12, tmp10, tmp11));
7175
7176 rtx tmp13 = gen_reg_rtx (V2DImode);
7177 emit_insn (gen_iorv2di3 (tmp13, tmp6, tmp12));
7178
7179 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp13));
7180 }
7181 else
7182 {
7183 /* Nine operations. */
7184 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
7185 rtx tmp2 = gen_reg_rtx (V4SImode);
7186 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
7187
7188 rtx tmp3 = gen_reg_rtx (V4SImode);
7189 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
7190
7191 rtx tmp4 = gen_reg_rtx (V1TImode);
7192 emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (64)));
7193
7194 rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
7195 rtx tmp6 = gen_reg_rtx (V2DImode);
7196 emit_insn (gen_lshrv2di3 (tmp6, tmp5, GEN_INT (bits)));
7197
7198 rtx tmp7 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
7199 rtx tmp8 = gen_reg_rtx (V2DImode);
7200 emit_insn (gen_ashlv2di3 (tmp8, tmp7, GEN_INT (64 - bits)));
7201
7202 rtx tmp9 = gen_reg_rtx (V2DImode);
7203 emit_insn (gen_iorv2di3 (tmp9, tmp6, tmp8));
7204
7205 rtx tmp10 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
7206 rtx tmp11 = gen_reg_rtx (V1TImode);
7207 emit_insn (gen_sse2_ashlv1ti3 (tmp11, tmp10, GEN_INT (64)));
7208
7209 rtx tmp12 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp11));
7210 rtx tmp13 = gen_reg_rtx (V2DImode);
7211 emit_insn (gen_ashlv2di3 (tmp13, tmp12, GEN_INT (64 - bits)));
7212
7213 rtx tmp14 = gen_reg_rtx (V2DImode);
7214 emit_insn (gen_iorv2di3 (tmp14, tmp9, tmp13));
7215
7216 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp14));
7217 }
7218 }
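
/* Illustrative sketch (not part of GCC): the value all of the sequences
   above compute - a 128-bit arithmetic right shift, written as plain C on
   a 64-bit high/low pair.  Assumes >> on a signed operand is an arithmetic
   shift, as with GCC on x86.  Guarded out of compilation; the function
   name is hypothetical.  */
#if 0
#include <stdint.h>

static void
ashiftrt_128_via_64 (uint64_t *hi, uint64_t *lo, unsigned int bits)
{
  bits &= 127;
  if (bits == 0)
    return;
  int64_t shi = (int64_t) *hi;
  if (bits >= 64)
    {
      *lo = (uint64_t) (shi >> (bits - 64));
      *hi = (uint64_t) (shi >> 63);     /* All sign bits.  */
    }
  else
    {
      *lo = (*lo >> bits) | ((uint64_t) shi << (64 - bits));
      *hi = (uint64_t) (shi >> bits);
    }
}
#endif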
7219
7220 /* Replace all occurrences of REG FROM with REG TO in X, including
7221 occurrences with different modes. */
7222
7223 rtx
7224 ix86_replace_reg_with_reg (rtx x, rtx from, rtx to)
7225 {
7226 gcc_checking_assert (REG_P (from)
7227 && REG_P (to)
7228 && GET_MODE (from) == GET_MODE (to));
7229 if (!reg_overlap_mentioned_p (from, x))
7230 return x;
7231 rtx ret = copy_rtx (x);
7232 subrtx_ptr_iterator::array_type array;
7233 FOR_EACH_SUBRTX_PTR (iter, array, &ret, NONCONST)
7234 {
7235 rtx *loc = *iter;
7236 x = *loc;
7237 if (REG_P (x) && REGNO (x) == REGNO (from))
7238 {
7239 if (x == from)
7240 *loc = to;
7241 else
7242 {
7243 gcc_checking_assert (REG_NREGS (x) == 1);
7244 *loc = gen_rtx_REG (GET_MODE (x), REGNO (to));
7245 }
7246 }
7247 }
7248 return ret;
7249 }
7250
7251 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
7252 DImode for constant loop counts. */
7253
7254 static machine_mode
7255 counter_mode (rtx count_exp)
7256 {
7257 if (GET_MODE (count_exp) != VOIDmode)
7258 return GET_MODE (count_exp);
7259 if (!CONST_INT_P (count_exp))
7260 return Pmode;
7261 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
7262 return DImode;
7263 return SImode;
7264 }
7265
7266 /* When ISSETMEM is FALSE, output a simple loop to move the memory pointed to
7267 by SRCPTR to DESTPTR via chunks of MODE unrolled UNROLL times; the overall
7268 size is COUNT, specified in bytes.  When ISSETMEM is TRUE, output the
7269 equivalent loop to set the memory to VALUE (assumed to be in MODE).
7270 
7271 The size is rounded down to a whole number of chunks moved at once.
7272 SRCMEM and DESTMEM provide MEM rtxes to feed proper aliasing info. */
7273
7274
7275 static void
7276 expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
7277 rtx destptr, rtx srcptr, rtx value,
7278 rtx count, machine_mode mode, int unroll,
7279 int expected_size, bool issetmem)
7280 {
7281 rtx_code_label *out_label, *top_label;
7282 rtx iter, tmp;
7283 machine_mode iter_mode = counter_mode (count);
7284 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
7285 rtx piece_size = GEN_INT (piece_size_n);
7286 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
7287 rtx size;
7288 int i;
7289
7290 top_label = gen_label_rtx ();
7291 out_label = gen_label_rtx ();
7292 iter = gen_reg_rtx (iter_mode);
7293
7294 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
7295 NULL, 1, OPTAB_DIRECT);
7296 /* Those two should combine. */
7297 if (piece_size == const1_rtx)
7298 {
7299 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
7300 true, out_label);
7301 predict_jump (REG_BR_PROB_BASE * 10 / 100);
7302 }
7303 emit_move_insn (iter, const0_rtx);
7304
7305 emit_label (top_label);
7306
7307 tmp = convert_modes (Pmode, iter_mode, iter, true);
7308
7309 /* This assert could be relaxed - in that case we'll need to compute the
7310 smallest power of two containing PIECE_SIZE_N and pass it to
7311 offset_address. */
7312 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
7313 destmem = offset_address (destmem, tmp, piece_size_n);
7314 destmem = adjust_address (destmem, mode, 0);
7315
7316 if (!issetmem)
7317 {
7318 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
7319 srcmem = adjust_address (srcmem, mode, 0);
7320
7321 /* When unrolling for chips that reorder memory reads and writes,
7322 we can save registers by using a single temporary.
7323 Also, using 4 temporaries is overkill in 32-bit mode. */
7324 if (!TARGET_64BIT && 0)
7325 {
7326 for (i = 0; i < unroll; i++)
7327 {
7328 if (i)
7329 {
7330 destmem = adjust_address (copy_rtx (destmem), mode,
7331 GET_MODE_SIZE (mode));
7332 srcmem = adjust_address (copy_rtx (srcmem), mode,
7333 GET_MODE_SIZE (mode));
7334 }
7335 emit_move_insn (destmem, srcmem);
7336 }
7337 }
7338 else
7339 {
7340 rtx tmpreg[4];
7341 gcc_assert (unroll <= 4);
7342 for (i = 0; i < unroll; i++)
7343 {
7344 tmpreg[i] = gen_reg_rtx (mode);
7345 if (i)
7346 srcmem = adjust_address (copy_rtx (srcmem), mode,
7347 GET_MODE_SIZE (mode));
7348 emit_move_insn (tmpreg[i], srcmem);
7349 }
7350 for (i = 0; i < unroll; i++)
7351 {
7352 if (i)
7353 destmem = adjust_address (copy_rtx (destmem), mode,
7354 GET_MODE_SIZE (mode));
7355 emit_move_insn (destmem, tmpreg[i]);
7356 }
7357 }
7358 }
7359 else
7360 for (i = 0; i < unroll; i++)
7361 {
7362 if (i)
7363 destmem = adjust_address (copy_rtx (destmem), mode,
7364 GET_MODE_SIZE (mode));
7365 emit_move_insn (destmem, value);
7366 }
7367
7368 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
7369 true, OPTAB_LIB_WIDEN);
7370 if (tmp != iter)
7371 emit_move_insn (iter, tmp);
7372
7373 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
7374 true, top_label);
7375 if (expected_size != -1)
7376 {
7377 expected_size /= GET_MODE_SIZE (mode) * unroll;
7378 if (expected_size == 0)
7379 predict_jump (0);
7380 else if (expected_size > REG_BR_PROB_BASE)
7381 predict_jump (REG_BR_PROB_BASE - 1);
7382 else
7383 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2)
7384 / expected_size);
7385 }
7386 else
7387 predict_jump (REG_BR_PROB_BASE * 80 / 100);
7388 iter = ix86_zero_extend_to_Pmode (iter);
7389 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
7390 true, OPTAB_LIB_WIDEN);
7391 if (tmp != destptr)
7392 emit_move_insn (destptr, tmp);
7393 if (!issetmem)
7394 {
7395 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
7396 true, OPTAB_LIB_WIDEN);
7397 if (tmp != srcptr)
7398 emit_move_insn (srcptr, tmp);
7399 }
7400 emit_label (out_label);
7401 }
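
/* Illustrative sketch (not part of GCC): the shape of the copy loop
   generated above, written as plain C for 8-byte chunks unrolled four
   times.  All chunks are loaded before any is stored, matching the
   temporary-register pattern in the !issetmem path; the tail below the
   rounded-down size is left for an epilogue.  Guarded out of compilation;
   names are hypothetical.  */
#if 0
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void
copy_via_unrolled_loop (unsigned char *dst, const unsigned char *src,
                        size_t count)
{
  const size_t chunk = 8, unroll = 4;
  size_t size = count & ~(chunk * unroll - 1);  /* Round down to 32 bytes.  */
  for (size_t iter = 0; iter < size; iter += chunk * unroll)
    {
      uint64_t t[4];
      for (size_t i = 0; i < unroll; i++)
        memcpy (&t[i], src + iter + i * chunk, chunk);
      for (size_t i = 0; i < unroll; i++)
        memcpy (dst + iter + i * chunk, &t[i], chunk);
    }
  /* The remaining COUNT - SIZE bytes are handled by an epilogue such as
     expand_cpymem_epilogue below.  */
}
#endif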
7402
7403 /* Divide COUNTREG by SCALE (a power of two). */
7404 static rtx
7405 scale_counter (rtx countreg, int scale)
7406 {
7407 rtx sc;
7408
7409 if (scale == 1)
7410 return countreg;
7411 if (CONST_INT_P (countreg))
7412 return GEN_INT (INTVAL (countreg) / scale);
7413 gcc_assert (REG_P (countreg));
7414
7415 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
7416 GEN_INT (exact_log2 (scale)),
7417 NULL, 1, OPTAB_DIRECT);
7418 return sc;
7419 }
7420
7421 /* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
7422 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
7423 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
7424 In the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
7425 ORIG_VALUE is the original value passed to memset to fill the memory with.
7426 Other arguments have the same meaning as for the previous function. */
7427
7428 static void
7429 expand_set_or_cpymem_via_rep (rtx destmem, rtx srcmem,
7430 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
7431 rtx count,
7432 machine_mode mode, bool issetmem)
7433 {
7434 rtx destexp;
7435 rtx srcexp;
7436 rtx countreg;
7437 HOST_WIDE_INT rounded_count;
7438
7439 /* If possible, it is shorter to use rep movs.
7440 TODO: Maybe it is better to move this logic to decide_alg. */
7441 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
7442 && !TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
7443 && (!issetmem || orig_value == const0_rtx))
7444 mode = SImode;
7445
7446 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
7447 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
7448
7449 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
7450 GET_MODE_SIZE (mode)));
7451 if (mode != QImode)
7452 {
7453 destexp = gen_rtx_ASHIFT (Pmode, countreg,
7454 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
7455 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
7456 }
7457 else
7458 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
7459 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
7460 {
7461 rounded_count
7462 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
7463 destmem = shallow_copy_rtx (destmem);
7464 set_mem_size (destmem, rounded_count);
7465 }
7466 else if (MEM_SIZE_KNOWN_P (destmem))
7467 clear_mem_size (destmem);
7468
7469 if (issetmem)
7470 {
7471 value = force_reg (mode, gen_lowpart (mode, value));
7472 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
7473 }
7474 else
7475 {
7476 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
7477 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
7478 if (mode != QImode)
7479 {
7480 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
7481 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
7482 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
7483 }
7484 else
7485 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
7486 if (CONST_INT_P (count))
7487 {
7488 rounded_count
7489 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
7490 srcmem = shallow_copy_rtx (srcmem);
7491 set_mem_size (srcmem, rounded_count);
7492 }
7493 else
7494 {
7495 if (MEM_SIZE_KNOWN_P (srcmem))
7496 clear_mem_size (srcmem);
7497 }
7498 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
7499 destexp, srcexp));
7500 }
7501 }
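
/* Illustrative sketch (not part of GCC): the instructions wrapped by
   gen_rep_mov and gen_rep_stos above, shown as GNU inline assembly.
   x86-specific and relies on the GNU asm extension; the helper names are
   hypothetical.  Guarded out of compilation.  */
#if 0
#include <stddef.h>

static void
rep_movsb (void *dst, const void *src, size_t count)
{
  __asm__ volatile ("rep movsb"
                    : "+D" (dst), "+S" (src), "+c" (count)
                    : : "memory");
}

static void
rep_stosb (void *dst, unsigned char value, size_t count)
{
  __asm__ volatile ("rep stosb"
                    : "+D" (dst), "+c" (count)
                    : "a" (value)
                    : "memory");
}
#endif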
7502
7503 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
7504 DESTMEM.
7505 SRCMEM is passed by pointer so it can be updated on return.
7506 The return value is the updated DESTMEM. */
7507 static rtx
7508 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
7509 HOST_WIDE_INT size_to_move)
7510 {
7511 rtx dst = destmem, src = *srcmem, tempreg;
7512 enum insn_code code;
7513 machine_mode move_mode;
7514 int piece_size, i;
7515
7516 /* Find the widest mode in which we could perform moves.
7517 Start with the biggest power of 2 not exceeding SIZE_TO_MOVE and halve
7518 it until a move of that size is supported. */
7519 piece_size = 1 << floor_log2 (size_to_move);
7520 while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
7521 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
7522 {
7523 gcc_assert (piece_size > 1);
7524 piece_size >>= 1;
7525 }
7526
7527 /* Find the corresponding vector mode with the same size as MOVE_MODE.
7528 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
7529 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
7530 {
7531 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
7532 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
7533 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
7534 {
7535 move_mode = word_mode;
7536 piece_size = GET_MODE_SIZE (move_mode);
7537 code = optab_handler (mov_optab, move_mode);
7538 }
7539 }
7540 gcc_assert (code != CODE_FOR_nothing);
7541
7542 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
7543 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
7544
7545 /* Emit moves.  We'll need SIZE_TO_MOVE / PIECE_SIZE moves. */
7546 gcc_assert (size_to_move % piece_size == 0);
7547
7548 for (i = 0; i < size_to_move; i += piece_size)
7549 {
7550 /* We move from memory to memory, so we'll need to do it via
7551 a temporary register. */
7552 tempreg = gen_reg_rtx (move_mode);
7553 emit_insn (GEN_FCN (code) (tempreg, src));
7554 emit_insn (GEN_FCN (code) (dst, tempreg));
7555
7556 emit_move_insn (destptr,
7557 plus_constant (Pmode, copy_rtx (destptr), piece_size));
7558 emit_move_insn (srcptr,
7559 plus_constant (Pmode, copy_rtx (srcptr), piece_size));
7560
7561 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
7562 piece_size);
7563 src = adjust_automodify_address_nv (src, move_mode, srcptr,
7564 piece_size);
7565 }
7566
7567 /* Update DST and SRC rtx. */
7568 *srcmem = src;
7569 return dst;
7570 }
7571
7572 /* Helper function for the string operations below.  Test whether VARIABLE
7573 is aligned to VALUE bytes.  If so, jump to the label. */
7574
7575 static rtx_code_label *
7576 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
7577 {
7578 rtx_code_label *label = gen_label_rtx ();
7579 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
7580 if (GET_MODE (variable) == DImode)
7581 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
7582 else
7583 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
7584 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
7585 1, label);
7586 if (epilogue)
7587 predict_jump (REG_BR_PROB_BASE * 50 / 100);
7588 else
7589 predict_jump (REG_BR_PROB_BASE * 90 / 100);
7590 return label;
7591 }
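
/* Illustrative sketch (not part of GCC): the test emitted above, in plain
   C - AND the count with a single power-of-two VALUE and skip the copy/set
   block when that bit is clear.  Guarded out of compilation; the function
   name is hypothetical.  */
#if 0
static int
alignment_bit_clear (unsigned long variable, int value)
{
  return (variable & (unsigned long) value) == 0;   /* If clear, jump past the block.  */
}
#endif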
7592
7593
7594 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
7595
7596 static void
7597 expand_cpymem_epilogue (rtx destmem, rtx srcmem,
7598 rtx destptr, rtx srcptr, rtx count, int max_size)
7599 {
7600 rtx src, dest;
7601 if (CONST_INT_P (count))
7602 {
7603 HOST_WIDE_INT countval = INTVAL (count);
7604 HOST_WIDE_INT epilogue_size = countval % max_size;
7605 int i;
7606
7607 /* For now MAX_SIZE should be a power of 2. This assert could be
7608 relaxed, but it'll require a bit more complicated epilogue
7609 expanding. */
7610 gcc_assert ((max_size & (max_size - 1)) == 0);
7611 for (i = max_size; i >= 1; i >>= 1)
7612 {
7613 if (epilogue_size & i)
7614 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
7615 }
7616 return;
7617 }
7618 if (max_size > 8)
7619 {
7620 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
7621 count, 1, OPTAB_DIRECT);
7622 expand_set_or_cpymem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
7623 count, QImode, 1, 4, false);
7624 return;
7625 }
7626
7627 /* When single-instruction string operations are available, we can cheaply
7628 advance the dest and src pointers.  Otherwise we save code size by
7629 maintaining an offset (zero is readily available from the preceding rep
7630 operation) and using x86 addressing modes. */
7631 if (TARGET_SINGLE_STRINGOP)
7632 {
7633 if (max_size > 4)
7634 {
7635 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
7636 src = change_address (srcmem, SImode, srcptr);
7637 dest = change_address (destmem, SImode, destptr);
7638 emit_insn (gen_strmov (destptr, dest, srcptr, src));
7639 emit_label (label);
7640 LABEL_NUSES (label) = 1;
7641 }
7642 if (max_size > 2)
7643 {
7644 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
7645 src = change_address (srcmem, HImode, srcptr);
7646 dest = change_address (destmem, HImode, destptr);
7647 emit_insn (gen_strmov (destptr, dest, srcptr, src));
7648 emit_label (label);
7649 LABEL_NUSES (label) = 1;
7650 }
7651 if (max_size > 1)
7652 {
7653 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
7654 src = change_address (srcmem, QImode, srcptr);
7655 dest = change_address (destmem, QImode, destptr);
7656 emit_insn (gen_strmov (destptr, dest, srcptr, src));
7657 emit_label (label);
7658 LABEL_NUSES (label) = 1;
7659 }
7660 }
7661 else
7662 {
7663 rtx offset = force_reg (Pmode, const0_rtx);
7664 rtx tmp;
7665
7666 if (max_size > 4)
7667 {
7668 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
7669 src = change_address (srcmem, SImode, srcptr);
7670 dest = change_address (destmem, SImode, destptr);
7671 emit_move_insn (dest, src);
7672 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
7673 true, OPTAB_LIB_WIDEN);
7674 if (tmp != offset)
7675 emit_move_insn (offset, tmp);
7676 emit_label (label);
7677 LABEL_NUSES (label) = 1;
7678 }
7679 if (max_size > 2)
7680 {
7681 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
7682 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
7683 src = change_address (srcmem, HImode, tmp);
7684 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
7685 dest = change_address (destmem, HImode, tmp);
7686 emit_move_insn (dest, src);
7687 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
7688 true, OPTAB_LIB_WIDEN);
7689 if (tmp != offset)
7690 emit_move_insn (offset, tmp);
7691 emit_label (label);
7692 LABEL_NUSES (label) = 1;
7693 }
7694 if (max_size > 1)
7695 {
7696 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
7697 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
7698 src = change_address (srcmem, QImode, tmp);
7699 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
7700 dest = change_address (destmem, QImode, tmp);
7701 emit_move_insn (dest, src);
7702 emit_label (label);
7703 LABEL_NUSES (label) = 1;
7704 }
7705 }
7706 }
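
/* Illustrative sketch (not part of GCC): the structure of the
   constant-count epilogue above, written as plain C.  Each power of two
   below MAX_SIZE is tested in the residual count, largest first, and a
   single copy of that size is done when the bit is set.  Guarded out of
   compilation; names are hypothetical.  */
#if 0
#include <stddef.h>
#include <string.h>

static void
copy_epilogue (unsigned char *dst, const unsigned char *src,
               size_t count, size_t max_size /* power of two */)
{
  size_t rem = count & (max_size - 1);
  size_t off = 0;
  for (size_t piece = max_size >> 1; piece >= 1; piece >>= 1)
    if (rem & piece)
      {
        memcpy (dst + off, src + off, piece);
        off += piece;
      }
}
#endif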
7707
7708 /* This function emits moves to fill SIZE_TO_MOVE bytes starting at DESTMEM
7709 with the value PROMOTED_VAL.
7710 DESTPTR is advanced as the stores are emitted.
7711 The return value is the updated DESTMEM. */
7712 static rtx
7713 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
7714 HOST_WIDE_INT size_to_move)
7715 {
7716 rtx dst = destmem;
7717 enum insn_code code;
7718 machine_mode move_mode;
7719 int piece_size, i;
7720
7721 /* Find the widest mode in which we could perform moves.
7722 Start with the biggest power of 2 not exceeding SIZE_TO_MOVE and halve
7723 it until a move of that size is supported. */
7724 move_mode = GET_MODE (promoted_val);
7725 if (move_mode == VOIDmode)
7726 move_mode = QImode;
7727 if (size_to_move < GET_MODE_SIZE (move_mode))
7728 {
7729 unsigned int move_bits = size_to_move * BITS_PER_UNIT;
7730 move_mode = int_mode_for_size (move_bits, 0).require ();
7731 promoted_val = gen_lowpart (move_mode, promoted_val);
7732 }
7733 piece_size = GET_MODE_SIZE (move_mode);
7734 code = optab_handler (mov_optab, move_mode);
7735 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
7736
7737 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
7738
7739 /* Emit moves.  We'll need SIZE_TO_MOVE / PIECE_SIZE moves. */
7740 gcc_assert (size_to_move % piece_size == 0);
7741
7742 for (i = 0; i < size_to_move; i += piece_size)
7743 {
7744 if (piece_size <= GET_MODE_SIZE (word_mode))
7745 {
7746 emit_insn (gen_strset (destptr, dst, promoted_val));
7747 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
7748 piece_size);
7749 continue;
7750 }
7751
7752 emit_insn (GEN_FCN (code) (dst, promoted_val));
7753
7754 emit_move_insn (destptr,
7755 plus_constant (Pmode, copy_rtx (destptr), piece_size));
7756
7757 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
7758 piece_size);
7759 }
7760
7761 /* Update DST rtx. */
7762 return dst;
7763 }
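
/* Illustrative sketch (not part of the compiler sources): at the C level the
   RTL emitted by emit_memset above behaves roughly like the hypothetical
   helper below, assuming an 8-byte move mode and a PROMOTED_VAL that already
   has the byte replicated across the whole word.

     static void
     fill_with_promoted_val (unsigned char *dst, unsigned long long promoted,
                             long size_to_move)
     {
       // size_to_move is a multiple of the piece size, as asserted above.
       for (long i = 0; i < size_to_move; i += 8)
         __builtin_memcpy (dst + i, &promoted, 8);  // one strset-sized store
     }

   The real expander advances DESTPTR instead of using an index, but the
   stored bytes are the same.  */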
7764 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
7765 static void
7766 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
7767 rtx count, int max_size)
7768 {
7769 count = expand_simple_binop (counter_mode (count), AND, count,
7770 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
7771 expand_set_or_cpymem_via_loop (destmem, NULL, destptr, NULL,
7772 gen_lowpart (QImode, value), count, QImode,
7773 1, max_size / 2, true);
7774 }
7775
7776 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
7777 static void
7778 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
7779 rtx count, int max_size)
7780 {
7781 rtx dest;
7782
7783 if (CONST_INT_P (count))
7784 {
7785 HOST_WIDE_INT countval = INTVAL (count);
7786 HOST_WIDE_INT epilogue_size = countval % max_size;
7787 int i;
7788
7789 /* For now MAX_SIZE should be a power of 2. This assert could be
7790 relaxed, but it would require a somewhat more complicated epilogue
7791 expansion. */
7792 gcc_assert ((max_size & (max_size - 1)) == 0);
7793 for (i = max_size; i >= 1; i >>= 1)
7794 {
7795 if (epilogue_size & i)
7796 {
7797 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
7798 destmem = emit_memset (destmem, destptr, vec_value, i);
7799 else
7800 destmem = emit_memset (destmem, destptr, value, i);
7801 }
7802 }
7803 return;
7804 }
7805 if (max_size > 32)
7806 {
7807 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
7808 return;
7809 }
7810 if (max_size > 16)
7811 {
7812 rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
7813 if (TARGET_64BIT)
7814 {
7815 dest = change_address (destmem, DImode, destptr);
7816 emit_insn (gen_strset (destptr, dest, value));
7817 dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
7818 emit_insn (gen_strset (destptr, dest, value));
7819 }
7820 else
7821 {
7822 dest = change_address (destmem, SImode, destptr);
7823 emit_insn (gen_strset (destptr, dest, value));
7824 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
7825 emit_insn (gen_strset (destptr, dest, value));
7826 dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
7827 emit_insn (gen_strset (destptr, dest, value));
7828 dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
7829 emit_insn (gen_strset (destptr, dest, value));
7830 }
7831 emit_label (label);
7832 LABEL_NUSES (label) = 1;
7833 }
7834 if (max_size > 8)
7835 {
7836 rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
7837 if (TARGET_64BIT)
7838 {
7839 dest = change_address (destmem, DImode, destptr);
7840 emit_insn (gen_strset (destptr, dest, value));
7841 }
7842 else
7843 {
7844 dest = change_address (destmem, SImode, destptr);
7845 emit_insn (gen_strset (destptr, dest, value));
7846 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
7847 emit_insn (gen_strset (destptr, dest, value));
7848 }
7849 emit_label (label);
7850 LABEL_NUSES (label) = 1;
7851 }
7852 if (max_size > 4)
7853 {
7854 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
7855 dest = change_address (destmem, SImode, destptr);
7856 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
7857 emit_label (label);
7858 LABEL_NUSES (label) = 1;
7859 }
7860 if (max_size > 2)
7861 {
7862 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
7863 dest = change_address (destmem, HImode, destptr);
7864 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
7865 emit_label (label);
7866 LABEL_NUSES (label) = 1;
7867 }
7868 if (max_size > 1)
7869 {
7870 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
7871 dest = change_address (destmem, QImode, destptr);
7872 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
7873 emit_label (label);
7874 LABEL_NUSES (label) = 1;
7875 }
7876 }
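
/* Illustrative sketch (hypothetical code, not taken from the sources): for a
   variable COUNT and MAX_SIZE == 8 on a 64-bit target, the jump tree emitted
   above behaves like the C function below, where val32/val16/val8 stand for
   the promoted value truncated to the respective widths.

     static void
     setmem_epilogue_sketch (unsigned char *dst, unsigned long count,
                             unsigned int val32, unsigned short val16,
                             unsigned char val8)
     {
       // Only count & (MAX_SIZE - 1) bytes remain to be stored here.
       if (count & 4) { __builtin_memcpy (dst, &val32, 4); dst += 4; }
       if (count & 2) { __builtin_memcpy (dst, &val16, 2); dst += 2; }
       if (count & 1) *dst = val8;
     }

   Each "if" corresponds to one ix86_expand_aligntest/gen_strset pair.  */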
7877
7878 /* Adjust COUNTREG by subtracting VALUE from it. */
7879 static void
7880 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
7881 {
7882 emit_insn (gen_add2_insn (countreg, GEN_INT (-value)));
7883 }
7884
7885 /* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
7886 DESTMEM to align it to DESIRED_ALIGNMENT. Original alignment is ALIGN.
7887 Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
7888 ignored.
7889 Return value is updated DESTMEM. */
7890
7891 static rtx
7892 expand_set_or_cpymem_prologue (rtx destmem, rtx srcmem,
7893 rtx destptr, rtx srcptr, rtx value,
7894 rtx vec_value, rtx count, int align,
7895 int desired_alignment, bool issetmem)
7896 {
7897 int i;
7898 for (i = 1; i < desired_alignment; i <<= 1)
7899 {
7900 if (align <= i)
7901 {
7902 rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
7903 if (issetmem)
7904 {
7905 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
7906 destmem = emit_memset (destmem, destptr, vec_value, i);
7907 else
7908 destmem = emit_memset (destmem, destptr, value, i);
7909 }
7910 else
7911 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
7912 ix86_adjust_counter (count, i);
7913 emit_label (label);
7914 LABEL_NUSES (label) = 1;
7915 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
7916 }
7917 }
7918 return destmem;
7919 }
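
/* Illustrative sketch (hypothetical helper, not compiler code): for ISSETMEM
   with ALIGN == 1 and DESIRED_ALIGNMENT == 8, the prologue emitted above is
   roughly equivalent to the following, where val8/val16/val32 stand for the
   promoted value truncated to the respective widths.

     static void
     align_dest_sketch (unsigned char **dst, unsigned long *count,
                        unsigned char val8, unsigned short val16,
                        unsigned int val32)
     {
       // Peel off 1, 2 and 4 byte stores until *dst is 8-byte aligned.
       if ((unsigned long) *dst & 1) { **dst = val8; *dst += 1; *count -= 1; }
       if ((unsigned long) *dst & 2)
         { __builtin_memcpy (*dst, &val16, 2); *dst += 2; *count -= 2; }
       if ((unsigned long) *dst & 4)
         { __builtin_memcpy (*dst, &val32, 4); *dst += 4; *count -= 4; }
     }

   The caller's prologue guard ensures the block is large enough for these
   peeled stores.  */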
7920
7921 /* Test if COUNT & SIZE is nonzero and if so, expand a cpymem
7922 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
7923 and jump to DONE_LABEL. */
7924 static void
7925 expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem,
7926 rtx destptr, rtx srcptr,
7927 rtx value, rtx vec_value,
7928 rtx count, int size,
7929 rtx done_label, bool issetmem)
7930 {
7931 rtx_code_label *label = ix86_expand_aligntest (count, size, false);
7932 machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
7933 rtx modesize;
7934 int n;
7935
7936 /* If we do not have a vector value to copy, we must reduce the size. */
7937 if (issetmem)
7938 {
7939 if (!vec_value)
7940 {
7941 if (GET_MODE (value) == VOIDmode && size > 8)
7942 mode = Pmode;
7943 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
7944 mode = GET_MODE (value);
7945 }
7946 else
7947 mode = GET_MODE (vec_value), value = vec_value;
7948 }
7949 else
7950 {
7951 /* Choose appropriate vector mode. */
7952 if (size >= 32)
7953 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
7954 else if (size >= 16)
7955 mode = TARGET_SSE ? V16QImode : DImode;
7956 srcmem = change_address (srcmem, mode, srcptr);
7957 }
7958 destmem = change_address (destmem, mode, destptr);
7959 modesize = GEN_INT (GET_MODE_SIZE (mode));
7960 gcc_assert (GET_MODE_SIZE (mode) <= size);
7961 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
7962 {
7963 if (issetmem)
7964 emit_move_insn (destmem, gen_lowpart (mode, value));
7965 else
7966 {
7967 emit_move_insn (destmem, srcmem);
7968 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
7969 }
7970 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
7971 }
7972
7973 destmem = offset_address (destmem, count, 1);
7974 destmem = offset_address (destmem, GEN_INT (-2 * size),
7975 GET_MODE_SIZE (mode));
7976 if (!issetmem)
7977 {
7978 srcmem = offset_address (srcmem, count, 1);
7979 srcmem = offset_address (srcmem, GEN_INT (-2 * size),
7980 GET_MODE_SIZE (mode));
7981 }
7982 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
7983 {
7984 if (issetmem)
7985 emit_move_insn (destmem, gen_lowpart (mode, value));
7986 else
7987 {
7988 emit_move_insn (destmem, srcmem);
7989 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
7990 }
7991 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
7992 }
7993 emit_jump_insn (gen_jump (done_label));
7994 emit_barrier ();
7995
7996 emit_label (label);
7997 LABEL_NUSES (label) = 1;
7998 }
7999
8000 /* Handle a small memcpy (up to SIZE, which is supposed to be a small power of 2)
8001 and get ready for the main memcpy loop by copying the initial DESIRED_ALIGN-ALIGN
8002 bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT so that we can
8003 proceed with a loop copying SIZE bytes at once. Do moves in MODE.
8004 DONE_LABEL is a label after the whole copying sequence. The label is created
8005 on demand if *DONE_LABEL is NULL.
8006 MIN_SIZE is the minimal size of the block copied. This value gets adjusted
8007 for the new bounds after the initial copies.
8008
8009 DESTMEM/SRCMEM are memory expressions pointing to the copied block,
8010 DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicates whether
8011 we will dispatch to a library call for large blocks.
8012
8013 In pseudocode we do:
8014
8015 if (COUNT < SIZE)
8016 {
8017 Assume that SIZE is 4. Bigger sizes are handled analogously
8018 if (COUNT & 4)
8019 {
8020 copy 4 bytes from SRCPTR to DESTPTR
8021 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
8022 goto done_label
8023 }
8024 if (!COUNT)
8025 goto done_label;
8026 copy 1 byte from SRCPTR to DESTPTR
8027 if (COUNT & 2)
8028 {
8029 copy 2 bytes from SRCPTR to DESTPTR
8030 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
8031 }
8032 }
8033 else
8034 {
8035 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
8036 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT - SIZE
8037
8038 OLD_DESTPTR = DESTPTR;
8039 Align DESTPTR up to DESIRED_ALIGN
8040 SRCPTR += DESTPTR - OLD_DESTPTR
8041 COUNT -= DESTPTR - OLD_DESTPTR
8042 if (DYNAMIC_CHECK)
8043 Round COUNT down to multiple of SIZE
8044 << optional caller supplied zero size guard is here >>
8045 << optional caller supplied dynamic check is here >>
8046 << caller supplied main copy loop is here >>
8047 }
8048 done_label:
8049 */
8050 static void
8051 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
8052 rtx *destptr, rtx *srcptr,
8053 machine_mode mode,
8054 rtx value, rtx vec_value,
8055 rtx *count,
8056 rtx_code_label **done_label,
8057 int size,
8058 int desired_align,
8059 int align,
8060 unsigned HOST_WIDE_INT *min_size,
8061 bool dynamic_check,
8062 bool issetmem)
8063 {
8064 rtx_code_label *loop_label = NULL, *label;
8065 int n;
8066 rtx modesize;
8067 int prolog_size = 0;
8068 rtx mode_value;
8069
8070 /* Choose the proper value to copy. */
8071 if (issetmem && VECTOR_MODE_P (mode))
8072 mode_value = vec_value;
8073 else
8074 mode_value = value;
8075 gcc_assert (GET_MODE_SIZE (mode) <= size);
8076
8077 /* See if block is big or small, handle small blocks. */
8078 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
8079 {
8080 int size2 = size;
8081 loop_label = gen_label_rtx ();
8082
8083 if (!*done_label)
8084 *done_label = gen_label_rtx ();
8085
8086 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
8087 1, loop_label);
8088 size2 >>= 1;
8089
8090 /* Handle sizes > 3. */
8091 for (;size2 > 2; size2 >>= 1)
8092 expand_small_cpymem_or_setmem (destmem, srcmem,
8093 *destptr, *srcptr,
8094 value, vec_value,
8095 *count,
8096 size2, *done_label, issetmem);
8097 /* Nothing to copy? Jump to DONE_LABEL if so. */
8098 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
8099 1, *done_label);
8100
8101 /* Do a byte copy. */
8102 destmem = change_address (destmem, QImode, *destptr);
8103 if (issetmem)
8104 emit_move_insn (destmem, gen_lowpart (QImode, value));
8105 else
8106 {
8107 srcmem = change_address (srcmem, QImode, *srcptr);
8108 emit_move_insn (destmem, srcmem);
8109 }
8110
8111 /* Handle sizes 2 and 3. */
8112 label = ix86_expand_aligntest (*count, 2, false);
8113 destmem = change_address (destmem, HImode, *destptr);
8114 destmem = offset_address (destmem, *count, 1);
8115 destmem = offset_address (destmem, GEN_INT (-2), 2);
8116 if (issetmem)
8117 emit_move_insn (destmem, gen_lowpart (HImode, value));
8118 else
8119 {
8120 srcmem = change_address (srcmem, HImode, *srcptr);
8121 srcmem = offset_address (srcmem, *count, 1);
8122 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
8123 emit_move_insn (destmem, srcmem);
8124 }
8125
8126 emit_label (label);
8127 LABEL_NUSES (label) = 1;
8128 emit_jump_insn (gen_jump (*done_label));
8129 emit_barrier ();
8130 }
8131 else
8132 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
8133 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
8134
8135 /* Start memcpy for COUNT >= SIZE. */
8136 if (loop_label)
8137 {
8138 emit_label (loop_label);
8139 LABEL_NUSES (loop_label) = 1;
8140 }
8141
8142 /* Copy at least the first DESIRED_ALIGN - ALIGN bytes. */
8143 if (!issetmem)
8144 srcmem = change_address (srcmem, mode, *srcptr);
8145 destmem = change_address (destmem, mode, *destptr);
8146 modesize = GEN_INT (GET_MODE_SIZE (mode));
8147 for (n = 0; prolog_size < desired_align - align; n++)
8148 {
8149 if (issetmem)
8150 emit_move_insn (destmem, mode_value);
8151 else
8152 {
8153 emit_move_insn (destmem, srcmem);
8154 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
8155 }
8156 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
8157 prolog_size += GET_MODE_SIZE (mode);
8158 }
8159
8160
8161 /* Copy last SIZE bytes. */
8162 destmem = offset_address (destmem, *count, 1);
8163 destmem = offset_address (destmem,
8164 GEN_INT (-size - prolog_size),
8165 1);
8166 if (issetmem)
8167 emit_move_insn (destmem, mode_value);
8168 else
8169 {
8170 srcmem = offset_address (srcmem, *count, 1);
8171 srcmem = offset_address (srcmem,
8172 GEN_INT (-size - prolog_size),
8173 1);
8174 emit_move_insn (destmem, srcmem);
8175 }
8176 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
8177 {
8178 destmem = offset_address (destmem, modesize, 1);
8179 if (issetmem)
8180 emit_move_insn (destmem, mode_value);
8181 else
8182 {
8183 srcmem = offset_address (srcmem, modesize, 1);
8184 emit_move_insn (destmem, srcmem);
8185 }
8186 }
8187
8188 /* Align destination. */
8189 if (desired_align > 1 && desired_align > align)
8190 {
8191 rtx saveddest = *destptr;
8192
8193 gcc_assert (desired_align <= size);
8194 /* Align destptr up, placing the result in a new register. */
8195 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
8196 GEN_INT (prolog_size),
8197 NULL_RTX, 1, OPTAB_DIRECT);
8198 if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
8199 REG_POINTER (*destptr) = 1;
8200 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
8201 GEN_INT (-desired_align),
8202 *destptr, 1, OPTAB_DIRECT);
8203 /* See how many bytes we skipped. */
8204 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
8205 *destptr,
8206 saveddest, 1, OPTAB_DIRECT);
8207 /* Adjust srcptr and count. */
8208 if (!issetmem)
8209 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
8210 saveddest, *srcptr, 1, OPTAB_DIRECT);
8211 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
8212 saveddest, *count, 1, OPTAB_DIRECT);
8213 /* We copied at most size + prolog_size. */
8214 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
8215 *min_size
8216 = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
8217 else
8218 *min_size = 0;
8219
8220 /* Our loops always round down the block size, but for dispatch to
8221 a library call we need the precise value. */
8222 if (dynamic_check)
8223 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
8224 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
8225 }
8226 else
8227 {
8228 gcc_assert (prolog_size == 0);
8229 /* Decrease count, so we won't end up copying the last word twice. */
8230 if (!CONST_INT_P (*count))
8231 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
8232 constm1_rtx, *count, 1, OPTAB_DIRECT);
8233 else
8234 *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
8235 (unsigned HOST_WIDE_INT)size));
8236 if (*min_size)
8237 *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
8238 }
8239 }
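
/* Illustrative sketch (hypothetical code) of the destination-alignment
   arithmetic performed above when DESIRED_ALIGN > ALIGN; the bytes skipped
   were already covered by the PROLOG_SIZE possibly misaligned stores.

     #include <stdint.h>

     static void
     align_dest_after_prolog (uintptr_t *destptr, uintptr_t *srcptr,
                              uintptr_t *count, int prolog_size,
                              int desired_align)
     {
       uintptr_t old = *destptr;
       // Advance past the prologue bytes, then round down to DESIRED_ALIGN.
       *destptr = (old + prolog_size) & -(uintptr_t) desired_align;
       uintptr_t skipped = *destptr - old;
       *srcptr += skipped;
       *count -= skipped;
     }

   The RTL version keeps SAVEDDEST = OLD - NEW (a negative delta) and adds it,
   which amounts to the same adjustment.  */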
8240
8241
8242 /* This function is like the previous one, except here we know how many bytes
8243 need to be copied. That allows us to update alignment not only of DST, which
8244 is returned, but also of SRC, which is passed as a pointer for that
8245 reason. */
8246 static rtx
8247 expand_set_or_cpymem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
8248 rtx srcreg, rtx value, rtx vec_value,
8249 int desired_align, int align_bytes,
8250 bool issetmem)
8251 {
8252 rtx src = NULL;
8253 rtx orig_dst = dst;
8254 rtx orig_src = NULL;
8255 int piece_size = 1;
8256 int copied_bytes = 0;
8257
8258 if (!issetmem)
8259 {
8260 gcc_assert (srcp != NULL);
8261 src = *srcp;
8262 orig_src = src;
8263 }
8264
8265 for (piece_size = 1;
8266 piece_size <= desired_align && copied_bytes < align_bytes;
8267 piece_size <<= 1)
8268 {
8269 if (align_bytes & piece_size)
8270 {
8271 if (issetmem)
8272 {
8273 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
8274 dst = emit_memset (dst, destreg, vec_value, piece_size);
8275 else
8276 dst = emit_memset (dst, destreg, value, piece_size);
8277 }
8278 else
8279 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
8280 copied_bytes += piece_size;
8281 }
8282 }
8283 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
8284 set_mem_align (dst, desired_align * BITS_PER_UNIT);
8285 if (MEM_SIZE_KNOWN_P (orig_dst))
8286 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
8287
8288 if (!issetmem)
8289 {
8290 int src_align_bytes = get_mem_align_offset (src, desired_align
8291 * BITS_PER_UNIT);
8292 if (src_align_bytes >= 0)
8293 src_align_bytes = desired_align - src_align_bytes;
8294 if (src_align_bytes >= 0)
8295 {
8296 unsigned int src_align;
8297 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
8298 {
8299 if ((src_align_bytes & (src_align - 1))
8300 == (align_bytes & (src_align - 1)))
8301 break;
8302 }
8303 if (src_align > (unsigned int) desired_align)
8304 src_align = desired_align;
8305 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
8306 set_mem_align (src, src_align * BITS_PER_UNIT);
8307 }
8308 if (MEM_SIZE_KNOWN_P (orig_src))
8309 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
8310 *srcp = src;
8311 }
8312
8313 return dst;
8314 }
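
/* Worked example (illustrative): with DESIRED_ALIGN == 16 and
   ALIGN_BYTES == 11, the loop above decomposes the prologue into
   power-of-two pieces taken from the low bits of ALIGN_BYTES:

     11 = 0b1011  ->  one 1-byte, one 2-byte and one 8-byte move/set,

   i.e. exactly the stores needed to reach a 16-byte aligned destination
   while keeping the MEM alignment and size info of DST and SRC accurate.  */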
8315
8316 /* Return true if ALG can be used in current context.
8317 Assume we expand memset if MEMSET is true. */
8318 static bool
8319 alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
8320 {
8321 if (alg == no_stringop)
8322 return false;
8323 if (alg == vector_loop)
8324 return TARGET_SSE || TARGET_AVX;
8325 /* Algorithms using the rep prefix want at least edi and ecx;
8326 additionally, memset wants eax and memcpy wants esi. Don't
8327 consider such algorithms if the user has appropriated those
8328 registers for their own purposes, or if we have a non-default
8329 address space, since some string insns cannot override the segment. */
8330 if (alg == rep_prefix_1_byte
8331 || alg == rep_prefix_4_byte
8332 || alg == rep_prefix_8_byte)
8333 {
8334 if (have_as)
8335 return false;
8336 if (fixed_regs[CX_REG]
8337 || fixed_regs[DI_REG]
8338 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
8339 return false;
8340 }
8341 return true;
8342 }
8343
8344 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
8345 static enum stringop_alg
8346 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
8347 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
8348 bool memset, bool zero_memset, bool have_as,
8349 int *dynamic_check, bool *noalign, bool recur)
8350 {
8351 const struct stringop_algs *algs;
8352 bool optimize_for_speed;
8353 int max = 0;
8354 const struct processor_costs *cost;
8355 int i;
8356 bool any_alg_usable_p = false;
8357
8358 *noalign = false;
8359 *dynamic_check = -1;
8360
8361 /* Even if the string operation call is cold, we still might spend a lot
8362 of time processing large blocks. */
8363 if (optimize_function_for_size_p (cfun)
8364 || (optimize_insn_for_size_p ()
8365 && (max_size < 256
8366 || (expected_size != -1 && expected_size < 256))))
8367 optimize_for_speed = false;
8368 else
8369 optimize_for_speed = true;
8370
8371 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
8372 if (memset)
8373 algs = &cost->memset[TARGET_64BIT != 0];
8374 else
8375 algs = &cost->memcpy[TARGET_64BIT != 0];
8376
8377 /* See the maximal size for a user-defined algorithm. */
8378 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
8379 {
8380 enum stringop_alg candidate = algs->size[i].alg;
8381 bool usable = alg_usable_p (candidate, memset, have_as);
8382 any_alg_usable_p |= usable;
8383
8384 if (candidate != libcall && candidate && usable)
8385 max = algs->size[i].max;
8386 }
8387
8388 /* If the expected size is not known but the max size is small enough
8389 so that the inline version is a win, set the expected size into
8390 the range. */
8391 if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
8392 && expected_size == -1)
8393 expected_size = min_size / 2 + max_size / 2;
8394
8395 /* If user specified the algorithm, honor it if possible. */
8396 if (ix86_stringop_alg != no_stringop
8397 && alg_usable_p (ix86_stringop_alg, memset, have_as))
8398 return ix86_stringop_alg;
8399 /* rep; movq or rep; movl is the smallest variant. */
8400 else if (!optimize_for_speed)
8401 {
8402 *noalign = true;
8403 if (!count || (count & 3) || (memset && !zero_memset))
8404 return alg_usable_p (rep_prefix_1_byte, memset, have_as)
8405 ? rep_prefix_1_byte : loop_1_byte;
8406 else
8407 return alg_usable_p (rep_prefix_4_byte, memset, have_as)
8408 ? rep_prefix_4_byte : loop;
8409 }
8410 /* Very tiny blocks are best handled via the loop; REP is expensive to
8411 set up. */
8412 else if (expected_size != -1 && expected_size < 4)
8413 return loop_1_byte;
8414 else if (expected_size != -1)
8415 {
8416 enum stringop_alg alg = libcall;
8417 bool alg_noalign = false;
8418 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
8419 {
8420 /* We get here if the algorithms that were not libcall-based
8421 were rep-prefix based and we are unable to use rep prefixes
8422 based on global register usage. Break out of the loop and
8423 use the heuristic below. */
8424 if (algs->size[i].max == 0)
8425 break;
8426 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
8427 {
8428 enum stringop_alg candidate = algs->size[i].alg;
8429
8430 if (candidate != libcall
8431 && alg_usable_p (candidate, memset, have_as))
8432 {
8433 alg = candidate;
8434 alg_noalign = algs->size[i].noalign;
8435 }
8436 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking the
8437 last non-libcall inline algorithm. */
8438 if (TARGET_INLINE_ALL_STRINGOPS)
8439 {
8440 /* When the current size is best copied by a libcall,
8441 but we are still forced to inline, run the heuristic below
8442 that will pick code for medium-sized blocks. */
8443 if (alg != libcall)
8444 {
8445 *noalign = alg_noalign;
8446 return alg;
8447 }
8448 else if (!any_alg_usable_p)
8449 break;
8450 }
8451 else if (alg_usable_p (candidate, memset, have_as)
8452 && !(TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
8453 && candidate == rep_prefix_1_byte
8454 /* NB: If min_size != max_size, size is
8455 unknown. */
8456 && min_size != max_size))
8457 {
8458 *noalign = algs->size[i].noalign;
8459 return candidate;
8460 }
8461 }
8462 }
8463 }
8464 /* When asked to inline the call anyway, try to pick a meaningful choice.
8465 We look for the maximal size of block that is faster to copy by hand
8466 and take blocks of at most that size, guessing that the average size
8467 will be roughly half of the block.
8468
8469 If this turns out to be bad, we might simply specify the preferred
8470 choice in ix86_costs. */
8471 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
8472 && (algs->unknown_size == libcall
8473 || !alg_usable_p (algs->unknown_size, memset, have_as)))
8474 {
8475 enum stringop_alg alg;
8476 HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
8477
8478 /* If there aren't any usable algorithms or if recursing already,
8479 then recursing on smaller sizes or same size isn't going to
8480 find anything. Just return the simple byte-at-a-time copy loop. */
8481 if (!any_alg_usable_p || recur)
8482 {
8483 /* Pick something reasonable. */
8484 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
8485 *dynamic_check = 128;
8486 return loop_1_byte;
8487 }
8488 alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
8489 zero_memset, have_as, dynamic_check, noalign, true);
8490 gcc_assert (*dynamic_check == -1);
8491 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
8492 *dynamic_check = max;
8493 else
8494 gcc_assert (alg != libcall);
8495 return alg;
8496 }
8497 return (alg_usable_p (algs->unknown_size, memset, have_as)
8498 ? algs->unknown_size : libcall);
8499 }
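
/* Illustrative sketch (hypothetical code, not the GCC implementation): the
   core table lookup in decide_alg walks the per-CPU stringop_algs table and
   picks the first entry whose size bound covers the expected size.  Here
   alg_entry and pick_alg_sketch are made-up names standing in for
   algs->size[] and the loop above.

     struct alg_entry { long max; int alg; };

     static int
     pick_alg_sketch (const struct alg_entry *entries, int n,
                      long expected_size)
     {
       for (int i = 0; i < n; i++)
         if (entries[i].max == -1 || entries[i].max >= expected_size)
           return entries[i].alg;   // first bucket large enough wins
       return -1;                   // fall back to the unknown-size choice
     }

   The real code additionally filters out algorithms unusable in the current
   context (alg_usable_p) and honors TARGET_INLINE_ALL_STRINGOPS.  */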
8500
8501 /* Decide on alignment. We know that the operand is already aligned to ALIGN
8502 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
8503 static int
8504 decide_alignment (int align,
8505 enum stringop_alg alg,
8506 int expected_size,
8507 machine_mode move_mode)
8508 {
8509 int desired_align = 0;
8510
8511 gcc_assert (alg != no_stringop);
8512
8513 if (alg == libcall)
8514 return 0;
8515 if (move_mode == VOIDmode)
8516 return 0;
8517
8518 desired_align = GET_MODE_SIZE (move_mode);
8519 /* PentiumPro has special logic triggering for 8-byte aligned blocks,
8520 copying a whole cache line at once. */
8521 if (TARGET_CPU_P (PENTIUMPRO)
8522 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
8523 desired_align = 8;
8524
8525 if (optimize_size)
8526 desired_align = 1;
8527 if (desired_align < align)
8528 desired_align = align;
8529 if (expected_size != -1 && expected_size < 4)
8530 desired_align = align;
8531
8532 return desired_align;
8533 }
8534
8535
8536 /* Helper function for memset expansion. For the QImode value 0xXY produce
8537 0xXYXYXYXY of the width specified by MODE. This is essentially
8538 a * 0x01010101, but we can do slightly better than
8539 synth_mult by unwinding the sequence by hand on CPUs with
8540 slow multiply. */
8541 static rtx
8542 promote_duplicated_reg (machine_mode mode, rtx val)
8543 {
8544 machine_mode valmode = GET_MODE (val);
8545 rtx tmp;
8546 int nops = mode == DImode ? 3 : 2;
8547
8548 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
8549 if (val == const0_rtx)
8550 return copy_to_mode_reg (mode, CONST0_RTX (mode));
8551 if (CONST_INT_P (val))
8552 {
8553 HOST_WIDE_INT v = INTVAL (val) & 255;
8554
8555 v |= v << 8;
8556 v |= v << 16;
8557 if (mode == DImode)
8558 v |= (v << 16) << 16;
8559 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
8560 }
8561
8562 if (valmode == VOIDmode)
8563 valmode = QImode;
8564 if (valmode != QImode)
8565 val = gen_lowpart (QImode, val);
8566 if (mode == QImode)
8567 return val;
8568 if (!TARGET_PARTIAL_REG_STALL)
8569 nops--;
8570 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
8571 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
8572 <= (ix86_cost->shift_const + ix86_cost->add) * nops
8573 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
8574 {
8575 rtx reg = convert_modes (mode, QImode, val, true);
8576 tmp = promote_duplicated_reg (mode, const1_rtx);
8577 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
8578 OPTAB_DIRECT);
8579 }
8580 else
8581 {
8582 rtx reg = convert_modes (mode, QImode, val, true);
8583
8584 if (!TARGET_PARTIAL_REG_STALL)
8585 emit_insn (gen_insv_1 (mode, reg, reg));
8586 else
8587 {
8588 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
8589 NULL, 1, OPTAB_DIRECT);
8590 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1,
8591 OPTAB_DIRECT);
8592 }
8593 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
8594 NULL, 1, OPTAB_DIRECT);
8595 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
8596 if (mode == SImode)
8597 return reg;
8598 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
8599 NULL, 1, OPTAB_DIRECT);
8600 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
8601 return reg;
8602 }
8603 }
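
/* Worked example (illustrative): promoting VAL = 0xAB to DImode via the
   constant path above computes

     v  = 0xAB;
     v |= v << 8;             now 0xABAB
     v |= v << 16;            now 0xABABABAB
     v |= (v << 16) << 16;    now 0xABABABABABABABAB

   which equals 0xAB * 0x0101010101010101 -- the multiplication used by the
   non-constant path when the CPU's multiplier is cheap enough.  */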
8604
8605 /* Duplicate the value VAL using promote_duplicated_reg into the maximal size
8606 that will be needed by the main loop copying SIZE_NEEDED chunks and by the
8607 prologue getting the alignment from ALIGN to DESIRED_ALIGN. */
8608 static rtx
8609 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
8610 int align)
8611 {
8612 rtx promoted_val;
8613
8614 if (TARGET_64BIT
8615 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
8616 promoted_val = promote_duplicated_reg (DImode, val);
8617 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
8618 promoted_val = promote_duplicated_reg (SImode, val);
8619 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
8620 promoted_val = promote_duplicated_reg (HImode, val);
8621 else
8622 promoted_val = val;
8623
8624 return promoted_val;
8625 }
8626
8627 /* Copy the address to a Pmode register. This is used for x32 to
8628 truncate DImode TLS address to a SImode register. */
8629
8630 static rtx
8631 ix86_copy_addr_to_reg (rtx addr)
8632 {
8633 rtx reg;
8634 if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
8635 {
8636 reg = copy_addr_to_reg (addr);
8637 REG_POINTER (reg) = 1;
8638 return reg;
8639 }
8640 else
8641 {
8642 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
8643 reg = copy_to_mode_reg (DImode, addr);
8644 REG_POINTER (reg) = 1;
8645 return gen_rtx_SUBREG (SImode, reg, 0);
8646 }
8647 }
8648
8649 /* Expand a string move (memcpy) or store (memset) operation. Use i386 string
8650 operations when profitable. The code depends upon architecture, block size
8651 and alignment, but always has one of the following overall structures:
8652
8653 Aligned move sequence:
8654
8655 1) Prologue guard: Conditional that jumps up to the epilogue for small
8656 blocks that can be handled by the epilogue alone. This is faster
8657 but also needed for correctness, since the prologue assumes the block
8658 is larger than the desired alignment.
8659
8660 Optional dynamic check for size and libcall for large
8661 blocks is emitted here too, with -minline-stringops-dynamically.
8662
8663 2) Prologue: copy first few bytes in order to get destination
8664 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
8665 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
8666 copied. We emit either a jump tree on power of two sized
8667 blocks, or a byte loop.
8668
8669 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
8670 with specified algorithm.
8671
8672 4) Epilogue: code copying tail of the block that is too small to be
8673 handled by main body (or up to size guarded by prologue guard).
8674
8675 Misaligned move sequence
8676
8677 1) Misaligned move prologue/epilogue containing:
8678 a) Prologue handling small memory blocks and jumping to done_label
8679 (skipped if blocks are known to be large enough)
8680 b) Single move copying the first DESIRED_ALIGN-ALIGN bytes if alignment
8681 is needed, done by a single possibly misaligned move
8682 (skipped if alignment is not needed)
8683 c) Copy of the last SIZE_NEEDED bytes by possibly misaligned moves
8684
8685 2) Zero size guard dispatching to done_label, if needed
8686
8687 3) Dispatch to a library call, if needed
8688
8689 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
8690 with specified algorithm. */
8691 bool
8692 ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
8693 rtx align_exp, rtx expected_align_exp,
8694 rtx expected_size_exp, rtx min_size_exp,
8695 rtx max_size_exp, rtx probable_max_size_exp,
8696 bool issetmem)
8697 {
8698 rtx destreg;
8699 rtx srcreg = NULL;
8700 rtx_code_label *label = NULL;
8701 rtx tmp;
8702 rtx_code_label *jump_around_label = NULL;
8703 HOST_WIDE_INT align = 1;
8704 unsigned HOST_WIDE_INT count = 0;
8705 HOST_WIDE_INT expected_size = -1;
8706 int size_needed = 0, epilogue_size_needed;
8707 int desired_align = 0, align_bytes = 0;
8708 enum stringop_alg alg;
8709 rtx promoted_val = NULL;
8710 rtx vec_promoted_val = NULL;
8711 bool force_loopy_epilogue = false;
8712 int dynamic_check;
8713 bool need_zero_guard = false;
8714 bool noalign;
8715 machine_mode move_mode = VOIDmode;
8716 machine_mode wider_mode;
8717 int unroll_factor = 1;
8718 /* TODO: Once value ranges are available, fill in proper data. */
8719 unsigned HOST_WIDE_INT min_size = 0;
8720 unsigned HOST_WIDE_INT max_size = -1;
8721 unsigned HOST_WIDE_INT probable_max_size = -1;
8722 bool misaligned_prologue_used = false;
8723 bool have_as;
8724
8725 if (CONST_INT_P (align_exp))
8726 align = INTVAL (align_exp);
8727 /* i386 can do misaligned access at a reasonably increased cost. */
8728 if (CONST_INT_P (expected_align_exp)
8729 && INTVAL (expected_align_exp) > align)
8730 align = INTVAL (expected_align_exp);
8731 /* ALIGN is the minimum of destination and source alignment, but we care here
8732 just about destination alignment. */
8733 else if (!issetmem
8734 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
8735 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
8736
8737 if (CONST_INT_P (count_exp))
8738 {
8739 min_size = max_size = probable_max_size = count = expected_size
8740 = INTVAL (count_exp);
8741 /* When COUNT is 0, there is nothing to do. */
8742 if (!count)
8743 return true;
8744 }
8745 else
8746 {
8747 if (min_size_exp)
8748 min_size = INTVAL (min_size_exp);
8749 if (max_size_exp)
8750 max_size = INTVAL (max_size_exp);
8751 if (probable_max_size_exp)
8752 probable_max_size = INTVAL (probable_max_size_exp);
8753 if (CONST_INT_P (expected_size_exp))
8754 expected_size = INTVAL (expected_size_exp);
8755 }
8756
8757 /* Make sure we don't need to care about overflow later on. */
8758 if (count > (HOST_WIDE_INT_1U << 30))
8759 return false;
8760
8761 have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
8762 if (!issetmem)
8763 have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
8764
8765 /* Step 0: Decide on preferred algorithm, desired alignment and
8766 size of chunks to be copied by main loop. */
8767 alg = decide_alg (count, expected_size, min_size, probable_max_size,
8768 issetmem,
8769 issetmem && val_exp == const0_rtx, have_as,
8770 &dynamic_check, &noalign, false);
8771
8772 if (dump_file)
8773 fprintf (dump_file, "Selected stringop expansion strategy: %s\n",
8774 stringop_alg_names[alg]);
8775
8776 if (alg == libcall)
8777 return false;
8778 gcc_assert (alg != no_stringop);
8779
8780 /* For now the vector version of memset is generated only for memory zeroing,
8781 as creating the promoted vector value is very cheap in this case. */
8782 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
8783 alg = unrolled_loop;
8784
8785 if (!count)
8786 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
8787 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
8788 if (!issetmem)
8789 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
8790
8791 unroll_factor = 1;
8792 move_mode = word_mode;
8793 switch (alg)
8794 {
8795 case libcall:
8796 case no_stringop:
8797 case last_alg:
8798 gcc_unreachable ();
8799 case loop_1_byte:
8800 need_zero_guard = true;
8801 move_mode = QImode;
8802 break;
8803 case loop:
8804 need_zero_guard = true;
8805 break;
8806 case unrolled_loop:
8807 need_zero_guard = true;
8808 unroll_factor = (TARGET_64BIT ? 4 : 2);
8809 break;
8810 case vector_loop:
8811 need_zero_guard = true;
8812 unroll_factor = 4;
8813 /* Find the widest supported mode. */
8814 move_mode = word_mode;
8815 while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode)
8816 && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
8817 move_mode = wider_mode;
8818
8819 if (TARGET_AVX256_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 128)
8820 move_mode = TImode;
8821 if (TARGET_AVX512_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 256)
8822 move_mode = OImode;
8823
8824 /* Find the corresponding vector mode with the same size as MOVE_MODE.
8825 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
8826 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
8827 {
8828 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
8829 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
8830 || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
8831 move_mode = word_mode;
8832 }
8833 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
8834 break;
8835 case rep_prefix_8_byte:
8836 move_mode = DImode;
8837 break;
8838 case rep_prefix_4_byte:
8839 move_mode = SImode;
8840 break;
8841 case rep_prefix_1_byte:
8842 move_mode = QImode;
8843 break;
8844 }
8845 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
8846 epilogue_size_needed = size_needed;
8847
8848 /* If we are going to emit any library calls conditionally, make sure any
8849 pending stack adjustments happen before the first conditional branch,
8850 otherwise they will be emitted before the library call only and won't
8851 happen from the other branches. */
8852 if (dynamic_check != -1)
8853 do_pending_stack_adjust ();
8854
8855 desired_align = decide_alignment (align, alg, expected_size, move_mode);
8856 if (!TARGET_ALIGN_STRINGOPS || noalign)
8857 align = desired_align;
8858
8859 /* Step 1: Prologue guard. */
8860
8861 /* Alignment code needs the count to be in a register. */
8862 if (CONST_INT_P (count_exp) && desired_align > align)
8863 {
8864 if (INTVAL (count_exp) > desired_align
8865 && INTVAL (count_exp) > size_needed)
8866 {
8867 align_bytes
8868 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
8869 if (align_bytes <= 0)
8870 align_bytes = 0;
8871 else
8872 align_bytes = desired_align - align_bytes;
8873 }
8874 if (align_bytes == 0)
8875 count_exp = force_reg (counter_mode (count_exp), count_exp);
8876 }
8877 gcc_assert (desired_align >= 1 && align >= 1);
8878
8879 /* Misaligned move sequences handle both prologue and epilogue at once.
8880 Default code generation results in smaller code for large alignments
8881 and also avoids redundant work when sizes are known precisely. */
8882 misaligned_prologue_used
8883 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
8884 && MAX (desired_align, epilogue_size_needed) <= 32
8885 && desired_align <= epilogue_size_needed
8886 && ((desired_align > align && !align_bytes)
8887 || (!count && epilogue_size_needed > 1)));
8888
8889 /* Do the cheap promotion to allow better CSE across the
8890 main loop and epilogue (i.e. one load of the big constant in
8891 front of all the code).
8892 For now the misaligned move sequences do not have a fast path
8893 without broadcasting. */
8894 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
8895 {
8896 if (alg == vector_loop)
8897 {
8898 gcc_assert (val_exp == const0_rtx);
8899 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
8900 promoted_val = promote_duplicated_reg_to_size (val_exp,
8901 GET_MODE_SIZE (word_mode),
8902 desired_align, align);
8903 }
8904 else
8905 {
8906 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
8907 desired_align, align);
8908 }
8909 }
8910 /* Misaligned move sequences handle both prologues and epilogues at once.
8911 Default code generation results in smaller code for large alignments and
8912 also avoids redundant work when sizes are known precisely. */
8913 if (misaligned_prologue_used)
8914 {
8915 /* The misaligned move prologue handles small blocks by itself. */
8916 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves
8917 (dst, src, &destreg, &srcreg,
8918 move_mode, promoted_val, vec_promoted_val,
8919 &count_exp,
8920 &jump_around_label,
8921 desired_align < align
8922 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
8923 desired_align, align, &min_size, dynamic_check, issetmem);
8924 if (!issetmem)
8925 src = change_address (src, BLKmode, srcreg);
8926 dst = change_address (dst, BLKmode, destreg);
8927 set_mem_align (dst, desired_align * BITS_PER_UNIT);
8928 epilogue_size_needed = 0;
8929 if (need_zero_guard
8930 && min_size < (unsigned HOST_WIDE_INT) size_needed)
8931 {
8932 /* It is possible that we copied enough so the main loop will not
8933 execute. */
8934 gcc_assert (size_needed > 1);
8935 if (jump_around_label == NULL_RTX)
8936 jump_around_label = gen_label_rtx ();
8937 emit_cmp_and_jump_insns (count_exp,
8938 GEN_INT (size_needed),
8939 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
8940 if (expected_size == -1
8941 || expected_size < (desired_align - align) / 2 + size_needed)
8942 predict_jump (REG_BR_PROB_BASE * 20 / 100);
8943 else
8944 predict_jump (REG_BR_PROB_BASE * 60 / 100);
8945 }
8946 }
8947 /* Ensure that the alignment prologue won't copy past the end of the block. */
8948 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
8949 {
8950 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
8951 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
8952 Make sure it is a power of 2. */
8953 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
8954
8955 /* To improve the performance of small blocks, we jump around the VAL
8956 promoting code. This means that if the promoted VAL is not constant,
8957 we might not use it in the epilogue and have to use the byte
8958 loop variant. */
8959 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
8960 force_loopy_epilogue = true;
8961 if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
8962 || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
8963 {
8964 /* If main algorithm works on QImode, no epilogue is needed.
8965 For small sizes just don't align anything. */
8966 if (size_needed == 1)
8967 desired_align = align;
8968 else
8969 goto epilogue;
8970 }
8971 else if (!count
8972 && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
8973 {
8974 label = gen_label_rtx ();
8975 emit_cmp_and_jump_insns (count_exp,
8976 GEN_INT (epilogue_size_needed),
8977 LTU, 0, counter_mode (count_exp), 1, label);
8978 if (expected_size == -1 || expected_size < epilogue_size_needed)
8979 predict_jump (REG_BR_PROB_BASE * 60 / 100);
8980 else
8981 predict_jump (REG_BR_PROB_BASE * 20 / 100);
8982 }
8983 }
8984
8985 /* Emit code to decide at runtime whether a library call or inline code should
8986 be used. */
8987 if (dynamic_check != -1)
8988 {
8989 if (!issetmem && CONST_INT_P (count_exp))
8990 {
8991 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
8992 {
8993 emit_block_copy_via_libcall (dst, src, count_exp);
8994 count_exp = const0_rtx;
8995 goto epilogue;
8996 }
8997 }
8998 else
8999 {
9000 rtx_code_label *hot_label = gen_label_rtx ();
9001 if (jump_around_label == NULL_RTX)
9002 jump_around_label = gen_label_rtx ();
9003 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
9004 LEU, 0, counter_mode (count_exp),
9005 1, hot_label);
9006 predict_jump (REG_BR_PROB_BASE * 90 / 100);
9007 if (issetmem)
9008 set_storage_via_libcall (dst, count_exp, val_exp);
9009 else
9010 emit_block_copy_via_libcall (dst, src, count_exp);
9011 emit_jump (jump_around_label);
9012 emit_label (hot_label);
9013 }
9014 }
9015
9016 /* Step 2: Alignment prologue. */
9017 /* Do the expensive promotion once we branched off the small blocks. */
9018 if (issetmem && !promoted_val)
9019 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
9020 desired_align, align);
9021
9022 if (desired_align > align && !misaligned_prologue_used)
9023 {
9024 if (align_bytes == 0)
9025 {
9026 /* Except for the first move in the prologue, we no longer know
9027 the constant offset in aliasing info. It doesn't seem worth
9028 the pain to maintain it for the first move, so throw away
9029 the info early. */
9030 dst = change_address (dst, BLKmode, destreg);
9031 if (!issetmem)
9032 src = change_address (src, BLKmode, srcreg);
9033 dst = expand_set_or_cpymem_prologue (dst, src, destreg, srcreg,
9034 promoted_val, vec_promoted_val,
9035 count_exp, align, desired_align,
9036 issetmem);
9037 /* At most desired_align - align bytes are copied. */
9038 if (min_size < (unsigned)(desired_align - align))
9039 min_size = 0;
9040 else
9041 min_size -= desired_align - align;
9042 }
9043 else
9044 {
9045 /* If we know how many bytes need to be stored before dst is
9046 sufficiently aligned, maintain aliasing info accurately. */
9047 dst = expand_set_or_cpymem_constant_prologue (dst, &src, destreg,
9048 srcreg,
9049 promoted_val,
9050 vec_promoted_val,
9051 desired_align,
9052 align_bytes,
9053 issetmem);
9054
9055 count_exp = plus_constant (counter_mode (count_exp),
9056 count_exp, -align_bytes);
9057 count -= align_bytes;
9058 min_size -= align_bytes;
9059 max_size -= align_bytes;
9060 }
9061 if (need_zero_guard
9062 && min_size < (unsigned HOST_WIDE_INT) size_needed
9063 && (count < (unsigned HOST_WIDE_INT) size_needed
9064 || (align_bytes == 0
9065 && count < ((unsigned HOST_WIDE_INT) size_needed
9066 + desired_align - align))))
9067 {
9068 /* It is possible that we copied enough so the main loop will not
9069 execute. */
9070 gcc_assert (size_needed > 1);
9071 if (label == NULL_RTX)
9072 label = gen_label_rtx ();
9073 emit_cmp_and_jump_insns (count_exp,
9074 GEN_INT (size_needed),
9075 LTU, 0, counter_mode (count_exp), 1, label);
9076 if (expected_size == -1
9077 || expected_size < (desired_align - align) / 2 + size_needed)
9078 predict_jump (REG_BR_PROB_BASE * 20 / 100);
9079 else
9080 predict_jump (REG_BR_PROB_BASE * 60 / 100);
9081 }
9082 }
9083 if (label && size_needed == 1)
9084 {
9085 emit_label (label);
9086 LABEL_NUSES (label) = 1;
9087 label = NULL;
9088 epilogue_size_needed = 1;
9089 if (issetmem)
9090 promoted_val = val_exp;
9091 }
9092 else if (label == NULL_RTX && !misaligned_prologue_used)
9093 epilogue_size_needed = size_needed;
9094
9095 /* Step 3: Main loop. */
9096
9097 switch (alg)
9098 {
9099 case libcall:
9100 case no_stringop:
9101 case last_alg:
9102 gcc_unreachable ();
9103 case loop_1_byte:
9104 case loop:
9105 case unrolled_loop:
9106 expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg, promoted_val,
9107 count_exp, move_mode, unroll_factor,
9108 expected_size, issetmem);
9109 break;
9110 case vector_loop:
9111 expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg,
9112 vec_promoted_val, count_exp, move_mode,
9113 unroll_factor, expected_size, issetmem);
9114 break;
9115 case rep_prefix_8_byte:
9116 case rep_prefix_4_byte:
9117 case rep_prefix_1_byte:
9118 expand_set_or_cpymem_via_rep (dst, src, destreg, srcreg, promoted_val,
9119 val_exp, count_exp, move_mode, issetmem);
9120 break;
9121 }
9122 /* Properly adjust the offsets of src and dest memory for aliasing. */
9123 if (CONST_INT_P (count_exp))
9124 {
9125 if (!issetmem)
9126 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
9127 (count / size_needed) * size_needed);
9128 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
9129 (count / size_needed) * size_needed);
9130 }
9131 else
9132 {
9133 if (!issetmem)
9134 src = change_address (src, BLKmode, srcreg);
9135 dst = change_address (dst, BLKmode, destreg);
9136 }
9137
9138 /* Step 4: Epilogue to copy the remaining bytes. */
9139 epilogue:
9140 if (label)
9141 {
9142 /* When the main loop is done, COUNT_EXP might hold the original count,
9143 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
9144 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
9145 bytes. Compensate if needed. */
9146
9147 if (size_needed < epilogue_size_needed)
9148 {
9149 tmp = expand_simple_binop (counter_mode (count_exp), AND, count_exp,
9150 GEN_INT (size_needed - 1), count_exp, 1,
9151 OPTAB_DIRECT);
9152 if (tmp != count_exp)
9153 emit_move_insn (count_exp, tmp);
9154 }
9155 emit_label (label);
9156 LABEL_NUSES (label) = 1;
9157 }
9158
9159 if (count_exp != const0_rtx && epilogue_size_needed > 1)
9160 {
9161 if (force_loopy_epilogue)
9162 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
9163 epilogue_size_needed);
9164 else
9165 {
9166 if (issetmem)
9167 expand_setmem_epilogue (dst, destreg, promoted_val,
9168 vec_promoted_val, count_exp,
9169 epilogue_size_needed);
9170 else
9171 expand_cpymem_epilogue (dst, src, destreg, srcreg, count_exp,
9172 epilogue_size_needed);
9173 }
9174 }
9175 if (jump_around_label)
9176 emit_label (jump_around_label);
9177 return true;
9178 }
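
/* Illustrative sketch (hypothetical code, not actual compiler output): for a
   memset of a variable N bytes with the rep_prefix_8_byte strategy on
   x86-64, the expansion produced above behaves roughly like

     static void
     setmem_sketch (unsigned char *dst, unsigned char c, unsigned long n)
     {
       unsigned long long v = c * 0x0101010101010101ull;  // promoted value
       // Prologue: peel stores until dst is 8-byte aligned (guarded so it
       // never runs past the block).
       while (((unsigned long) dst & 7) && n)
         { *dst++ = c; n--; }
       // Main body: the "rep stosq" equivalent, 8 bytes per iteration.
       for (; n >= 8; n -= 8, dst += 8)
         __builtin_memcpy (dst, &v, 8);
       // Epilogue: jump tree storing the remaining n & 7 bytes.
       if (n & 4) { __builtin_memcpy (dst, &v, 4); dst += 4; }
       if (n & 2) { __builtin_memcpy (dst, &v, 2); dst += 2; }
       if (n & 1) *dst = c;
     }

   The real expansion uses a prologue guard that jumps straight to the
   epilogue for small N instead of the byte-peeling loop shown here.  */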
9179
9180 /* Expand cmpstrn or memcmp. */
9181
9182 bool
9183 ix86_expand_cmpstrn_or_cmpmem (rtx result, rtx src1, rtx src2,
9184 rtx length, rtx align, bool is_cmpstrn)
9185 {
9186 /* Expand strncmp and memcmp only with -minline-all-stringops since
9187 "repz cmpsb" can be much slower than strncmp and memcmp functions
9188 implemented with vector instructions, see
9189
9190 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43052
9191 */
9192 if (!TARGET_INLINE_ALL_STRINGOPS)
9193 return false;
9194
9195 /* Can't use this if the user has appropriated ecx, esi or edi. */
9196 if (fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG])
9197 return false;
9198
9199 if (is_cmpstrn)
9200 {
9201 /* For strncmp, length is the maximum length, which can be larger
9202 than actual string lengths. We can expand the cmpstrn pattern
9203 to "repz cmpsb" only if one of the strings is a constant so
9204 that expand_builtin_strncmp() can write the length argument to
9205 be the minimum of the const string length and the actual length
9206 argument. Otherwise, "repz cmpsb" may pass the 0 byte. */
9207 tree t1 = MEM_EXPR (src1);
9208 tree t2 = MEM_EXPR (src2);
9209 if (!((t1 && TREE_CODE (t1) == MEM_REF
9210 && TREE_CODE (TREE_OPERAND (t1, 0)) == ADDR_EXPR
9211 && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t1, 0), 0))
9212 == STRING_CST))
9213 || (t2 && TREE_CODE (t2) == MEM_REF
9214 && TREE_CODE (TREE_OPERAND (t2, 0)) == ADDR_EXPR
9215 && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t2, 0), 0))
9216 == STRING_CST))))
9217 return false;
9218 }
9219
9220 rtx addr1 = copy_addr_to_reg (XEXP (src1, 0));
9221 rtx addr2 = copy_addr_to_reg (XEXP (src2, 0));
9222 if (addr1 != XEXP (src1, 0))
9223 src1 = replace_equiv_address_nv (src1, addr1);
9224 if (addr2 != XEXP (src2, 0))
9225 src2 = replace_equiv_address_nv (src2, addr2);
9226
9227 /* NB: Make a copy of the data length to avoid changing the original
9228 data length by cmpstrnqi patterns. */
9229 length = ix86_zero_extend_to_Pmode (length);
9230 rtx lengthreg = gen_reg_rtx (Pmode);
9231 emit_move_insn (lengthreg, length);
9232
9233 /* If we are testing strict equality, we can use known alignment to
9234 good advantage. This may be possible with combine, particularly
9235 once cc0 is dead. */
9236 if (CONST_INT_P (length))
9237 {
9238 if (length == const0_rtx)
9239 {
9240 emit_move_insn (result, const0_rtx);
9241 return true;
9242 }
9243 emit_insn (gen_cmpstrnqi_nz_1 (addr1, addr2, lengthreg, align,
9244 src1, src2));
9245 }
9246 else
9247 {
9248 emit_insn (gen_cmp_1 (Pmode, lengthreg, lengthreg));
9249 emit_insn (gen_cmpstrnqi_1 (addr1, addr2, lengthreg, align,
9250 src1, src2));
9251 }
9252
9253 rtx out = gen_lowpart (QImode, result);
9254 emit_insn (gen_cmpintqi (out));
9255 emit_move_insn (result, gen_rtx_SIGN_EXTEND (SImode, out));
9256
9257 return true;
9258 }
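
/* Illustrative sketch (hypothetical code): the "repz cmpsb" sequence emitted
   above compares at most LENGTH bytes, stops after the first differing byte,
   and the final cmpintqi materializes the flags as a negative, zero or
   positive integer.  In C terms the expansion behaves like

     static int
     cmpmem_sketch (const unsigned char *s1, const unsigned char *s2,
                    unsigned long length)
     {
       while (length--)
         {
           if (*s1 != *s2)
             return *s1 < *s2 ? -1 : 1;   // sign of the last comparison
           s1++, s2++;
         }
       return 0;                          // all LENGTH bytes compared equal
     }

   For cmpstrn the check above requires one operand to be a constant string
   so that the caller can clamp LENGTH, as explained in the comment.  */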
9259
9260 /* Expand the appropriate insns for doing strlen if not just doing
9261 repnz; scasb
9262
9263 out = result, initialized with the start address
9264 align_rtx = alignment of the address.
9265 scratch = scratch register, initialized with the start address when
9266 not aligned, otherwise undefined
9267
9268 This is just the body. It needs the initializations mentioned above and
9269 some address computing at the end. These things are done in i386.md. */
9270
9271 static void
9272 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
9273 {
9274 int align;
9275 rtx tmp;
9276 rtx_code_label *align_2_label = NULL;
9277 rtx_code_label *align_3_label = NULL;
9278 rtx_code_label *align_4_label = gen_label_rtx ();
9279 rtx_code_label *end_0_label = gen_label_rtx ();
9280 rtx mem;
9281 rtx tmpreg = gen_reg_rtx (SImode);
9282 rtx scratch = gen_reg_rtx (SImode);
9283 rtx cmp;
9284
9285 align = 0;
9286 if (CONST_INT_P (align_rtx))
9287 align = INTVAL (align_rtx);
9288
9289 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
9290
9291 /* Is there a known alignment and is it less than 4? */
9292 if (align < 4)
9293 {
9294 rtx scratch1 = gen_reg_rtx (Pmode);
9295 emit_move_insn (scratch1, out);
9296 /* Is there a known alignment and is it not 2? */
9297 if (align != 2)
9298 {
9299 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
9300 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
9301
9302 /* Leave just the 3 lower bits. */
9303 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
9304 NULL_RTX, 0, OPTAB_WIDEN);
9305
9306 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
9307 Pmode, 1, align_4_label);
9308 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
9309 Pmode, 1, align_2_label);
9310 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
9311 Pmode, 1, align_3_label);
9312 }
9313 else
9314 {
9315 /* Since the alignment is 2, we have to check 2 or 0 bytes;
9316 check if it is aligned to 4 bytes. */
9317
9318 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
9319 NULL_RTX, 0, OPTAB_WIDEN);
9320
9321 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
9322 Pmode, 1, align_4_label);
9323 }
9324
9325 mem = change_address (src, QImode, out);
9326
9327 /* Now compare the bytes. */
9328
9329 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
9330 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
9331 QImode, 1, end_0_label);
9332
9333 /* Increment the address. */
9334 emit_insn (gen_add2_insn (out, const1_rtx));
9335
9336 /* Not needed with an alignment of 2. */
9337 if (align != 2)
9338 {
9339 emit_label (align_2_label);
9340
9341 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
9342 end_0_label);
9343
9344 emit_insn (gen_add2_insn (out, const1_rtx));
9345
9346 emit_label (align_3_label);
9347 }
9348
9349 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
9350 end_0_label);
9351
9352 emit_insn (gen_add2_insn (out, const1_rtx));
9353 }
9354
9355 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
9356 align this loop: it only makes the program bigger and does not help
9357 to speed it up. */
9358 emit_label (align_4_label);
9359
9360 mem = change_address (src, SImode, out);
9361 emit_move_insn (scratch, mem);
9362 emit_insn (gen_add2_insn (out, GEN_INT (4)));
9363
9364 /* This formula yields a nonzero result iff one of the bytes is zero.
9365 This saves three branches inside the loop and many cycles. */
9366
9367 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
9368 emit_insn (gen_one_cmplsi2 (scratch, scratch));
9369 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
9370 emit_insn (gen_andsi3 (tmpreg, tmpreg,
9371 gen_int_mode (0x80808080, SImode)));
9372 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
9373 align_4_label);
9374
9375 if (TARGET_CMOVE)
9376 {
9377 rtx reg = gen_reg_rtx (SImode);
9378 rtx reg2 = gen_reg_rtx (Pmode);
9379 emit_move_insn (reg, tmpreg);
9380 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
9381
9382 /* If zero is not in the first two bytes, move two bytes forward. */
9383 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
9384 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
9385 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
9386 emit_insn (gen_rtx_SET (tmpreg,
9387 gen_rtx_IF_THEN_ELSE (SImode, tmp,
9388 reg,
9389 tmpreg)));
9390 /* Emit lea manually to avoid clobbering of flags. */
9391 emit_insn (gen_rtx_SET (reg2, plus_constant (Pmode, out, 2)));
9392
9393 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
9394 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
9395 emit_insn (gen_rtx_SET (out,
9396 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
9397 reg2,
9398 out)));
9399 }
9400 else
9401 {
9402 rtx_code_label *end_2_label = gen_label_rtx ();
9403 /* Is zero in the first two bytes? */
9404
9405 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
9406 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
9407 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
9408 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
9409 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
9410 pc_rtx);
9411 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
9412 JUMP_LABEL (tmp) = end_2_label;
9413
9414 /* Not in the first two. Move two bytes forward. */
9415 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
9416 emit_insn (gen_add2_insn (out, const2_rtx));
9417
9418 emit_label (end_2_label);
9419
9420 }
9421
9422 /* Avoid branch in fixing the byte. */
9423 tmpreg = gen_lowpart (QImode, tmpreg);
9424 emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
9425 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
9426 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
9427 emit_insn (gen_sub3_carry (Pmode, out, out, GEN_INT (3), tmp, cmp));
9428
9429 emit_label (end_0_label);
9430 }
9431
9432 /* Expand strlen. */
9433
9434 bool
9435 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
9436 {
9437 if (TARGET_UNROLL_STRLEN
9438 && TARGET_INLINE_ALL_STRINGOPS
9439 && eoschar == const0_rtx
9440 && optimize > 1)
9441 {
9442 /* The generic case of the strlen expander is long. Avoid expanding
9443 it unless TARGET_INLINE_ALL_STRINGOPS. */
9444 rtx addr = force_reg (Pmode, XEXP (src, 0));
9445 /* It seems that some optimizers do not combine a call like
9446 foo (strlen (bar), strlen (bar));
9447 when the move and the subtraction are done here; they only compute
9448 the length once when these instructions are emitted inside
9449 output_strlen_unroll ().  But since &bar[strlen (bar)] is often
9450 used, and this way one fewer register is live for the lifetime of
9451 output_strlen_unroll (), this is better. */
9452
9453 emit_move_insn (out, addr);
9454
9455 ix86_expand_strlensi_unroll_1 (out, src, align);
9456
9457 /* strlensi_unroll_1 returns the address of the zero at the end of
9458 the string, like memchr(), so compute the length by subtracting
9459 the start address. */
9460 emit_insn (gen_sub2_insn (out, addr));
9461 return true;
9462 }
9463 else
9464 return false;
9465 }
9466
9467 /* For a given symbol (function), construct code to compute the address of its
9468 PLT entry in the large x86-64 PIC model. */
9469
9470 static rtx
9471 construct_plt_address (rtx symbol)
9472 {
9473 rtx tmp, unspec;
9474
9475 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
9476 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
9477 gcc_assert (Pmode == DImode);
9478
9479 tmp = gen_reg_rtx (Pmode);
9480 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
9481
9482 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
9483 emit_insn (gen_add2_insn (tmp, pic_offset_table_rtx));
9484 return tmp;
9485 }
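/* Roughly, the generated large-model sequence looks like this (the
   register names are only illustrative; the PIC base lives in whatever
   register pic_offset_table_rtx designates):
       movabs  $func@PLTOFF, %rax
       add     %r15, %rax
   and the resulting address is then used for the indirect call.  */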
9486
9487 /* Additional registers that are clobbered by SysV calls but preserved by MS ABI calls. */
9488
9489 static int const x86_64_ms_sysv_extra_clobbered_registers
9490 [NUM_X86_64_MS_CLOBBERED_REGS] =
9491 {
9492 SI_REG, DI_REG,
9493 XMM6_REG, XMM7_REG,
9494 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
9495 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
9496 };
9497
9498 rtx_insn *
9499 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
9500 rtx callarg2,
9501 rtx pop, bool sibcall)
9502 {
9503 rtx vec[3];
9504 rtx use = NULL, call;
9505 unsigned int vec_len = 0;
9506 tree fndecl;
9507
9508 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
9509 {
9510 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
9511 if (fndecl
9512 && (lookup_attribute ("interrupt",
9513 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
9514 error ("interrupt service routine cannot be called directly");
9515 }
9516 else
9517 fndecl = NULL_TREE;
9518
9519 if (pop == const0_rtx)
9520 pop = NULL;
9521 gcc_assert (!TARGET_64BIT || !pop);
9522
9523 rtx addr = XEXP (fnaddr, 0);
9524 if (TARGET_MACHO && !TARGET_64BIT)
9525 {
9526 #if TARGET_MACHO
9527 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
9528 fnaddr = machopic_indirect_call_target (fnaddr);
9529 #endif
9530 }
9531 else
9532 {
9533 /* Static functions and indirect calls don't need the PIC register. Also
9534 check whether the PLT was explicitly avoided via -fno-plt or the "noplt"
9535 attribute, which makes this an indirect call. */
9536 if (flag_pic
9537 && GET_CODE (addr) == SYMBOL_REF
9538 && ix86_call_use_plt_p (addr))
9539 {
9540 if (flag_plt
9541 && (SYMBOL_REF_DECL (addr) == NULL_TREE
9542 || !lookup_attribute ("noplt",
9543 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
9544 {
9545 if (!TARGET_64BIT
9546 || (ix86_cmodel == CM_LARGE_PIC
9547 && DEFAULT_ABI != MS_ABI))
9548 {
9549 use_reg (&use, gen_rtx_REG (Pmode,
9550 REAL_PIC_OFFSET_TABLE_REGNUM));
9551 if (ix86_use_pseudo_pic_reg ())
9552 emit_move_insn (gen_rtx_REG (Pmode,
9553 REAL_PIC_OFFSET_TABLE_REGNUM),
9554 pic_offset_table_rtx);
9555 }
9556 }
9557 else if (!TARGET_PECOFF && !TARGET_MACHO)
9558 {
9559 if (TARGET_64BIT
9560 && ix86_cmodel == CM_LARGE_PIC
9561 && DEFAULT_ABI != MS_ABI)
9562 {
9563 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
9564 UNSPEC_GOT);
9565 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
9566 fnaddr = force_reg (Pmode, fnaddr);
9567 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, fnaddr);
9568 }
9569 else if (TARGET_64BIT)
9570 {
9571 fnaddr = gen_rtx_UNSPEC (Pmode,
9572 gen_rtvec (1, addr),
9573 UNSPEC_GOTPCREL);
9574 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
9575 }
9576 else
9577 {
9578 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
9579 UNSPEC_GOT);
9580 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
9581 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
9582 fnaddr);
9583 }
9584 fnaddr = gen_const_mem (Pmode, fnaddr);
9585 /* Pmode may not be the same as word_mode for x32, which
9586 doesn't support an indirect branch via a 32-bit memory slot.
9587 Since the x32 GOT slot is 64 bits with the upper 32 bits zero,
9588 an indirect branch via the x32 GOT slot is OK. */
9589 if (GET_MODE (fnaddr) != word_mode)
9590 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
9591 fnaddr = gen_rtx_MEM (QImode, fnaddr);
9592 }
9593 }
9594 }
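/* Rough shapes of the calls produced by the logic above (register and
   symbol names are illustrative; the exact form depends on the target
   and options):
       PIC call through the PLT:      call  func@PLT
       -fno-plt or "noplt", 64-bit:   call  *func@GOTPCREL(%rip)
       -fno-plt or "noplt", 32-bit:   call  *func@GOT(%ebx)  */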
9595
9596 /* Skip setting up RAX register for -mskip-rax-setup when there are no
9597 parameters passed in vector registers. */
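/* In the x86-64 SysV ABI, AL carries an upper bound on the number of
   vector registers used to pass arguments to a variadic function; for
   example, printf ("%f", x) requires AL to be at least 1 at the call
   site.  */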
9598 if (TARGET_64BIT
9599 && (INTVAL (callarg2) > 0
9600 || (INTVAL (callarg2) == 0
9601 && (TARGET_SSE || !flag_skip_rax_setup))))
9602 {
9603 rtx al = gen_rtx_REG (QImode, AX_REG);
9604 emit_move_insn (al, callarg2);
9605 use_reg (&use, al);
9606 }
9607
9608 if (ix86_cmodel == CM_LARGE_PIC
9609 && !TARGET_PECOFF
9610 && MEM_P (fnaddr)
9611 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
9612 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
9613 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
9614 /* Since the x32 GOT slot is 64 bits with the upper 32 bits zero, an
9615 indirect branch via the x32 GOT slot is OK. */
9616 else if (!(TARGET_X32
9617 && MEM_P (fnaddr)
9618 && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
9619 && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
9620 && (sibcall
9621 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
9622 : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
9623 {
9624 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
9625 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
9626 }
9627
9628 /* PR100665: Hwasan may tag a code pointer, which is not supported by LAM,
9629 so mask off code pointers here.
9630 TODO: indirect jumps also need to be handled. */
9631 if (ix86_memtag_can_tag_addresses () && !fndecl
9632 && sanitize_flags_p (SANITIZE_HWADDRESS))
9633 {
9634 rtx untagged_addr = ix86_memtag_untagged_pointer (XEXP (fnaddr, 0),
9635 NULL_RTX);
9636 fnaddr = gen_rtx_MEM (QImode, untagged_addr);
9637 }
9638
9639 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
9640
9641 if (retval)
9642 call = gen_rtx_SET (retval, call);
9643 vec[vec_len++] = call;
9644
9645 if (pop)
9646 {
9647 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
9648 pop = gen_rtx_SET (stack_pointer_rtx, pop);
9649 vec[vec_len++] = pop;
9650 }
9651
9652 if (cfun->machine->no_caller_saved_registers
9653 && (!fndecl
9654 || (!TREE_THIS_VOLATILE (fndecl)
9655 && !lookup_attribute ("no_caller_saved_registers",
9656 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
9657 {
9658 static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
9659 bool is_64bit_ms_abi = (TARGET_64BIT
9660 && ix86_function_abi (fndecl) == MS_ABI);
9661 char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
9662
9663 /* Since the function has no caller-saved registers, explicitly add
9664 all registers that may be clobbered by a call which returns. */
9665 for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
9666 if (!fixed_regs[i]
9667 && (ix86_call_used_regs[i] == 1
9668 || (ix86_call_used_regs[i] & c_mask))
9669 && !STACK_REGNO_P (i)
9670 && !MMX_REGNO_P (i))
9671 clobber_reg (&use,
9672 gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
9673 }
9674 else if (TARGET_64BIT_MS_ABI
9675 && (!callarg2 || INTVAL (callarg2) != -2))
9676 {
9677 unsigned i;
9678
9679 for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
9680 {
9681 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
9682 machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
9683
9684 clobber_reg (&use, gen_rtx_REG (mode, regno));
9685 }
9686
9687 /* Set here, but it may get cleared later. */
9688 if (TARGET_CALL_MS2SYSV_XLOGUES)
9689 {
9690 if (!TARGET_SSE)
9691 ;
9692
9693 /* Don't break hot-patched functions. */
9694 else if (ix86_function_ms_hook_prologue (current_function_decl))
9695 ;
9696
9697 /* TODO: Cases not yet examined. */
9698 else if (flag_split_stack)
9699 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
9700
9701 else
9702 {
9703 gcc_assert (!reload_completed);
9704 cfun->machine->call_ms2sysv = true;
9705 }
9706 }
9707 }
9708
9709 if (TARGET_MACHO && TARGET_64BIT && !sibcall
9710 && ((GET_CODE (addr) == SYMBOL_REF && !SYMBOL_REF_LOCAL_P (addr))
9711 || !fndecl || TREE_PUBLIC (fndecl)))
9712 {
9713 /* We allow public functions defined in a TU to bind locally for PIC
9714 code (the default) on 64-bit Mach-O.
9715 If such functions are not inlined, we cannot tell at compile time
9716 whether they will be called via the lazy symbol resolver (this can
9717 depend on options given at link time). Therefore, we must assume that
9718 the lazy resolver could be used, which clobbers R11 and R10. */
9719 clobber_reg (&use, gen_rtx_REG (DImode, R11_REG));
9720 clobber_reg (&use, gen_rtx_REG (DImode, R10_REG));
9721 }
9722
9723 if (vec_len > 1)
9724 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
9725 rtx_insn *call_insn = emit_call_insn (call);
9726 if (use)
9727 CALL_INSN_FUNCTION_USAGE (call_insn) = use;
9728
9729 return call_insn;
9730 }
9731
9732 /* Split a simple return that pops POPC bytes from the stack into an
9733 indirect branch with a stack adjustment. */
9734
9735 void
9736 ix86_split_simple_return_pop_internal (rtx popc)
9737 {
9738 struct machine_function *m = cfun->machine;
9739 rtx ecx = gen_rtx_REG (SImode, CX_REG);
9740 rtx_insn *insn;
9741
9742 /* There is no "pascal" calling convention in any 64-bit ABI. */
9743 gcc_assert (!TARGET_64BIT);
9744
9745 insn = emit_insn (gen_pop (ecx));
9746 m->fs.cfa_offset -= UNITS_PER_WORD;
9747 m->fs.sp_offset -= UNITS_PER_WORD;
9748
9749 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
9750 x = gen_rtx_SET (stack_pointer_rtx, x);
9751 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
9752 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
9753 RTX_FRAME_RELATED_P (insn) = 1;
9754
9755 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc);
9756 x = gen_rtx_SET (stack_pointer_rtx, x);
9757 insn = emit_insn (x);
9758 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
9759 RTX_FRAME_RELATED_P (insn) = 1;
9760
9761 /* Now return address is in ECX. */
9762 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
9763 }
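/* Illustrative shape of the split for "ret $N" (32-bit only; ECX is the
   scratch register chosen above):
       pop    %ecx           return address goes into ECX
       add    $N, %esp       drop the N bytes of callee-popped arguments
       jmp    *%ecx  */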
9764
9765 /* Errors in the source file can cause expand_expr to return const0_rtx
9766 where we expect a vector. To avoid crashing, use one of the vector
9767 clear instructions. */
9768
9769 static rtx
9770 safe_vector_operand (rtx x, machine_mode mode)
9771 {
9772 if (x == const0_rtx)
9773 x = CONST0_RTX (mode);
9774 return x;
9775 }
9776
9777 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
9778
9779 static rtx
9780 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
9781 {
9782 rtx pat;
9783 tree arg0 = CALL_EXPR_ARG (exp, 0);
9784 tree arg1 = CALL_EXPR_ARG (exp, 1);
9785 rtx op0 = expand_normal (arg0);
9786 rtx op1 = expand_normal (arg1);
9787 machine_mode tmode = insn_data[icode].operand[0].mode;
9788 machine_mode mode0 = insn_data[icode].operand[1].mode;
9789 machine_mode mode1 = insn_data[icode].operand[2].mode;
9790
9791 if (VECTOR_MODE_P (mode0))
9792 op0 = safe_vector_operand (op0, mode0);
9793 if (VECTOR_MODE_P (mode1))
9794 op1 = safe_vector_operand (op1, mode1);
9795
9796 if (optimize || !target
9797 || GET_MODE (target) != tmode
9798 || !insn_data[icode].operand[0].predicate (target, tmode))
9799 target = gen_reg_rtx (tmode);
9800
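/* Some patterns take a TImode second operand where the user passes a
   plain 32-bit integer; load the integer into the low element of a
   V4SI vector and view that vector as TImode.  */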
9801 if (GET_MODE (op1) == SImode && mode1 == TImode)
9802 {
9803 rtx x = gen_reg_rtx (V4SImode);
9804 emit_insn (gen_sse2_loadd (x, op1));
9805 op1 = gen_lowpart (TImode, x);
9806 }
9807
9808 if (!insn_data[icode].operand[1].predicate (op0, mode0))
9809 op0 = copy_to_mode_reg (mode0, op0);
9810 if (!insn_data[icode].operand[2].predicate (op1, mode1))
9811 op1 = copy_to_mode_reg (mode1, op1);
9812
9813 pat = GEN_FCN (icode) (target, op0, op1);
9814 if (! pat)
9815 return 0;
9816
9817 emit_insn (pat);
9818
9819 return target;
9820 }
9821
9822 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
9823
9824 static rtx
9825 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
9826 enum ix86_builtin_func_type m_type,
9827 enum rtx_code sub_code)
9828 {
9829 rtx pat;
9830 unsigned int i, nargs;
9831 bool comparison_p = false;
9832 bool tf_p = false;
9833 bool last_arg_constant = false;
9834 int num_memory = 0;
9835 rtx xops[4];
9836
9837 machine_mode tmode = insn_data[icode].operand[0].mode;
9838
9839 switch (m_type)
9840 {
9841 case MULTI_ARG_4_DF2_DI_I:
9842 case MULTI_ARG_4_DF2_DI_I1:
9843 case MULTI_ARG_4_SF2_SI_I:
9844 case MULTI_ARG_4_SF2_SI_I1:
9845 nargs = 4;
9846 last_arg_constant = true;
9847 break;
9848
9849 case MULTI_ARG_3_SF:
9850 case MULTI_ARG_3_DF:
9851 case MULTI_ARG_3_SF2:
9852 case MULTI_ARG_3_DF2:
9853 case MULTI_ARG_3_DI:
9854 case MULTI_ARG_3_SI:
9855 case MULTI_ARG_3_SI_DI:
9856 case MULTI_ARG_3_HI:
9857 case MULTI_ARG_3_HI_SI:
9858 case MULTI_ARG_3_QI:
9859 case MULTI_ARG_3_DI2:
9860 case MULTI_ARG_3_SI2:
9861 case MULTI_ARG_3_HI2:
9862 case MULTI_ARG_3_QI2:
9863 nargs = 3;
9864 break;
9865
9866 case MULTI_ARG_2_SF:
9867 case MULTI_ARG_2_DF:
9868 case MULTI_ARG_2_DI:
9869 case MULTI_ARG_2_SI:
9870 case MULTI_ARG_2_HI:
9871 case MULTI_ARG_2_QI:
9872 nargs = 2;
9873 break;
9874
9875 case MULTI_ARG_2_DI_IMM:
9876 case MULTI_ARG_2_SI_IMM:
9877 case MULTI_ARG_2_HI_IMM:
9878 case MULTI_ARG_2_QI_IMM:
9879 nargs = 2;
9880 last_arg_constant = true;
9881 break;
9882
9883 case MULTI_ARG_1_SF:
9884 case MULTI_ARG_1_DF:
9885 case MULTI_ARG_1_SF2:
9886 case MULTI_ARG_1_DF2:
9887 case MULTI_ARG_1_DI:
9888 case MULTI_ARG_1_SI:
9889 case MULTI_ARG_1_HI:
9890 case MULTI_ARG_1_QI:
9891 case MULTI_ARG_1_SI_DI:
9892 case MULTI_ARG_1_HI_DI:
9893 case MULTI_ARG_1_HI_SI:
9894 case MULTI_ARG_1_QI_DI:
9895 case MULTI_ARG_1_QI_SI:
9896 case MULTI_ARG_1_QI_HI:
9897 nargs = 1;
9898 break;
9899
9900 case MULTI_ARG_2_DI_CMP:
9901 case MULTI_ARG_2_SI_CMP:
9902 case MULTI_ARG_2_HI_CMP:
9903 case MULTI_ARG_2_QI_CMP:
9904 nargs = 2;
9905 comparison_p = true;
9906 break;
9907
9908 case MULTI_ARG_2_SF_TF:
9909 case MULTI_ARG_2_DF_TF:
9910 case MULTI_ARG_2_DI_TF:
9911 case MULTI_ARG_2_SI_TF:
9912 case MULTI_ARG_2_HI_TF:
9913 case MULTI_ARG_2_QI_TF:
9914 nargs = 2;
9915 tf_p = true;
9916 break;
9917
9918 default:
9919 gcc_unreachable ();
9920 }
9921
9922 if (optimize || !target
9923 || GET_MODE (target) != tmode
9924 || !insn_data[icode].operand[0].predicate (target, tmode))
9925 target = gen_reg_rtx (tmode);
9926 else if (memory_operand (target, tmode))
9927 num_memory++;
9928
9929 gcc_assert (nargs <= ARRAY_SIZE (xops));
9930
9931 for (i = 0; i < nargs; i++)
9932 {
9933 tree arg = CALL_EXPR_ARG (exp, i);
9934 rtx op = expand_normal (arg);
9935 int adjust = (comparison_p) ? 1 : 0;
9936 machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
9937
9938 if (last_arg_constant && i == nargs - 1)
9939 {
9940 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
9941 {
9942 enum insn_code new_icode = icode;
9943 switch (icode)
9944 {
9945 case CODE_FOR_xop_vpermil2v2df3:
9946 case CODE_FOR_xop_vpermil2v4sf3:
9947 case CODE_FOR_xop_vpermil2v4df3:
9948 case CODE_FOR_xop_vpermil2v8sf3:
9949 error ("the last argument must be a 2-bit immediate");
9950 return gen_reg_rtx (tmode);
9951 case CODE_FOR_xop_rotlv2di3:
9952 new_icode = CODE_FOR_rotlv2di3;
9953 goto xop_rotl;
9954 case CODE_FOR_xop_rotlv4si3:
9955 new_icode = CODE_FOR_rotlv4si3;
9956 goto xop_rotl;
9957 case CODE_FOR_xop_rotlv8hi3:
9958 new_icode = CODE_FOR_rotlv8hi3;
9959 goto xop_rotl;
9960 case CODE_FOR_xop_rotlv16qi3:
9961 new_icode = CODE_FOR_rotlv16qi3;
9962 xop_rotl:
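/* A constant rotate count is reduced modulo the element width (for
   V4SI, e.g., a count of 35 becomes 3) so that it satisfies the
   rotate pattern's predicate.  */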
9963 if (CONST_INT_P (op))
9964 {
9965 int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
9966 op = GEN_INT (INTVAL (op) & mask);
9967 gcc_checking_assert
9968 (insn_data[icode].operand[i + 1].predicate (op, mode));
9969 }
9970 else
9971 {
9972 gcc_checking_assert
9973 (nargs == 2
9974 && insn_data[new_icode].operand[0].mode == tmode
9975 && insn_data[new_icode].operand[1].mode == tmode
9976 && insn_data[new_icode].operand[2].mode == mode
9977 && insn_data[new_icode].operand[0].predicate
9978 == insn_data[icode].operand[0].predicate
9979 && insn_data[new_icode].operand[1].predicate
9980 == insn_data[icode].operand[1].predicate);
9981 icode = new_icode;
9982 goto non_constant;
9983 }
9984 break;
9985 default:
9986 gcc_unreachable ();
9987 }
9988 }
9989 }
9990 else
9991 {
9992 non_constant:
9993 if (VECTOR_MODE_P (mode))
9994 op = safe_vector_operand (op, mode);
9995
9996 /* If we aren't optimizing, only allow one memory operand to be
9997 generated. */
9998 if (memory_operand (op, mode))
9999 num_memory++;
10000
10001 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
10002
10003 if (optimize
10004 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
10005 || num_memory > 1)
10006 op = force_reg (mode, op);
10007 }
10008
10009 xops[i] = op;
10010 }
10011
10012 switch (nargs)
10013 {
10014 case 1:
10015 pat = GEN_FCN (icode) (target, xops[0]);
10016 break;
10017
10018 case 2:
10019 if (tf_p)
10020 pat = GEN_FCN (icode) (target, xops[0], xops[1],
10021 GEN_INT ((int)sub_code));
10022 else if (! comparison_p)
10023 pat = GEN_FCN (icode) (target, xops[0], xops[1]);
10024 else
10025 {
10026 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
10027 xops[0], xops[1]);
10028
10029 pat = GEN_FCN (icode) (target, cmp_op, xops[0], xops[1]);
10030 }
10031 break;
10032
10033 case 3:
10034 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
10035 break;
10036
10037 case 4:
10038 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2], xops[3]);
10039 break;
10040
10041 default:
10042 gcc_unreachable ();
10043 }
10044
10045 if (! pat)
10046 return 0;
10047
10048 emit_insn (pat);
10049 return target;
10050 }
10051
10052 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
10053 insns with vec_merge. */
10054
10055 static rtx
10056 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
10057 rtx target)
10058 {
10059 rtx pat;
10060 tree arg0 = CALL_EXPR_ARG (exp, 0);
10061 rtx op1, op0 = expand_normal (arg0);
10062 machine_mode tmode = insn_data[icode].operand[0].mode;
10063 machine_mode mode0 = insn_data[icode].operand[1].mode;
10064
10065 if (optimize || !target
10066 || GET_MODE (target) != tmode
10067 || !insn_data[icode].operand[0].predicate (target, tmode))
10068 target = gen_reg_rtx (tmode);
10069
10070 if (VECTOR_MODE_P (mode0))
10071 op0 = safe_vector_operand (op0, mode0);
10072
10073 if ((optimize && !register_operand (op0, mode0))
10074 || !insn_data[icode].operand[1].predicate (op0, mode0))
10075 op0 = copy_to_mode_reg (mode0, op0);
10076
10077 op1 = op0;
10078 if (!insn_data[icode].operand[2].predicate (op1, mode0))
10079 op1 = copy_to_mode_reg (mode0, op1);
10080
10081 pat = GEN_FCN (icode) (target, op0, op1);
10082 if (! pat)
10083 return 0;
10084 emit_insn (pat);
10085 return target;
10086 }
10087
10088 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
10089
10090 static rtx
10091 ix86_expand_sse_compare (const struct builtin_description *d,
10092 tree exp, rtx target, bool swap)
10093 {
10094 rtx pat;
10095 tree arg0 = CALL_EXPR_ARG (exp, 0);
10096 tree arg1 = CALL_EXPR_ARG (exp, 1);
10097 rtx op0 = expand_normal (arg0);
10098 rtx op1 = expand_normal (arg1);
10099 rtx op2;
10100 machine_mode tmode = insn_data[d->icode].operand[0].mode;
10101 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
10102 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
10103 enum rtx_code comparison = d->comparison;
10104
10105 if (VECTOR_MODE_P (mode0))
10106 op0 = safe_vector_operand (op0, mode0);
10107 if (VECTOR_MODE_P (mode1))
10108 op1 = safe_vector_operand (op1, mode1);
10109
10110 /* Swap operands if we have a comparison that isn't available in
10111 hardware. */
10112 if (swap)
10113 std::swap (op0, op1);
10114
10115 if (optimize || !target
10116 || GET_MODE (target) != tmode
10117 || !insn_data[d->icode].operand[0].predicate (target, tmode))
10118 target = gen_reg_rtx (tmode);
10119
10120 if ((optimize && !register_operand (op0, mode0))
10121 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
10122 op0 = copy_to_mode_reg (mode0, op0);
10123 if ((optimize && !register_operand (op1, mode1))
10124 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
10125 op1 = copy_to_mode_reg (mode1, op1);
10126
10127 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
10128 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
10129 if (! pat)
10130 return 0;
10131 emit_insn (pat);
10132 return target;
10133 }
10134
10135 /* Subroutine of ix86_expand_sse_comi and ix86_expand_sse_comi_round to take
10136 care of ordered EQ or unordered NE: generate a PF jump. */
10137
10138 static rtx
10139 ix86_ssecom_setcc (const enum rtx_code comparison,
10140 bool check_unordered, machine_mode mode,
10141 rtx set_dst, rtx target)
10142 {
10143
10144 rtx_code_label *label = NULL;
10145
10146 /* NB: For ordered EQ or unordered NE, checking ZF alone isn't sufficient
10147 with NaN operands. */
10148 if (check_unordered)
10149 {
10150 gcc_assert (comparison == EQ || comparison == NE);
10151
10152 rtx flag = gen_rtx_REG (CCFPmode, FLAGS_REG);
10153 label = gen_label_rtx ();
10154 rtx tmp = gen_rtx_fmt_ee (UNORDERED, VOIDmode, flag, const0_rtx);
10155 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
10156 gen_rtx_LABEL_REF (VOIDmode, label),
10157 pc_rtx);
10158 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
10159 }
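/* COMIS/UCOMIS set ZF, PF and CF: an equal result gives ZF=1 PF=0 CF=0,
   while an unordered (NaN) result sets all three.  Ordered EQ is thus
   ZF=1 && PF=0 and unordered NE is ZF=0 || PF=1, which is why the PF
   jump above transfers control past the setcc below when the result is
   unordered.  */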
10160
10161 /* NB: The comparison sets CCFPmode; check a different CCmode that is a
10162 subset of CCFPmode. */
10163 if (GET_MODE (set_dst) != mode)
10164 {
10165 gcc_assert (mode == CCAmode || mode == CCCmode
10166 || mode == CCOmode || mode == CCPmode
10167 || mode == CCSmode || mode == CCZmode);
10168 set_dst = gen_rtx_REG (mode, FLAGS_REG);
10169 }
10170
10171 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10172 gen_rtx_fmt_ee (comparison, QImode,
10173 set_dst,
10174 const0_rtx)));
10175
10176 if (label)
10177 emit_label (label);
10178
10179 return SUBREG_REG (target);
10180 }
10181
10182 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
10183
10184 static rtx
10185 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
10186 rtx target)
10187 {
10188 rtx pat, set_dst;
10189 tree arg0 = CALL_EXPR_ARG (exp, 0);
10190 tree arg1 = CALL_EXPR_ARG (exp, 1);
10191 rtx op0 = expand_normal (arg0);
10192 rtx op1 = expand_normal (arg1);
10193 enum insn_code icode = d->icode;
10194 const struct insn_data_d *insn_p = &insn_data[icode];
10195 machine_mode mode0 = insn_p->operand[0].mode;
10196 machine_mode mode1 = insn_p->operand[1].mode;
10197
10198 if (VECTOR_MODE_P (mode0))
10199 op0 = safe_vector_operand (op0, mode0);
10200 if (VECTOR_MODE_P (mode1))
10201 op1 = safe_vector_operand (op1, mode1);
10202
10203 enum rtx_code comparison = d->comparison;
10204 rtx const_val = const0_rtx;
10205
10206 bool check_unordered = false;
10207 machine_mode mode = CCFPmode;
10208 switch (comparison)
10209 {
10210 case LE: /* -> GE */
10211 case LT: /* -> GT */
10212 std::swap (op0, op1);
10213 comparison = swap_condition (comparison);
10214 /* FALLTHRU */
10215 case GT:
10216 case GE:
10217 break;
10218 case EQ:
10219 check_unordered = true;
10220 mode = CCZmode;
10221 break;
10222 case NE:
10223 check_unordered = true;
10224 mode = CCZmode;
10225 const_val = const1_rtx;
10226 break;
10227 default:
10228 gcc_unreachable ();
10229 }
10230
10231 target = gen_reg_rtx (SImode);
10232 emit_move_insn (target, const_val);
10233 target = gen_rtx_SUBREG (QImode, target, 0);
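/* TARGET is pre-loaded with the value the EQ/NE intrinsics must return
   for an unordered comparison (0 for comieq, 1 for comineq); in that
   case ix86_ssecom_setcc skips the setcc and the preset value is
   returned unchanged.  */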
10234
10235 if ((optimize && !register_operand (op0, mode0))
10236 || !insn_p->operand[0].predicate (op0, mode0))
10237 op0 = copy_to_mode_reg (mode0, op0);
10238 if ((optimize && !register_operand (op1, mode1))
10239 || !insn_p->operand[1].predicate (op1, mode1))
10240 op1 = copy_to_mode_reg (mode1, op1);
10241
10242 pat = GEN_FCN (icode) (op0, op1);
10243 if (! pat)
10244 return 0;
10245
10246 set_dst = SET_DEST (pat);
10247 emit_insn (pat);
10248 return ix86_ssecom_setcc (comparison, check_unordered, mode,
10249 set_dst, target);
10250 }
10251
10252 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
10253
10254 static rtx
10255 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
10256 rtx target)
10257 {
10258 rtx pat;
10259 tree arg0 = CALL_EXPR_ARG (exp, 0);
10260 rtx op1, op0 = expand_normal (arg0);
10261 machine_mode tmode = insn_data[d->icode].operand[0].mode;
10262 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
10263
10264 if (optimize || target == 0
10265 || GET_MODE (target) != tmode
10266 || !insn_data[d->icode].operand[0].predicate (target, tmode))
10267 target = gen_reg_rtx (tmode);
10268
10269 if (VECTOR_MODE_P (mode0))
10270 op0 = safe_vector_operand (op0, mode0);
10271
10272 if ((optimize && !register_operand (op0, mode0))
10273 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
10274 op0 = copy_to_mode_reg (mode0, op0);
10275
10276 op1 = GEN_INT (d->comparison);
10277
10278 pat = GEN_FCN (d->icode) (target, op0, op1);
10279 if (! pat)
10280 return 0;
10281 emit_insn (pat);
10282 return target;
10283 }
10284
10285 static rtx
10286 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
10287 tree exp, rtx target)
10288 {
10289 rtx pat;
10290 tree arg0 = CALL_EXPR_ARG (exp, 0);
10291 tree arg1 = CALL_EXPR_ARG (exp, 1);
10292 rtx op0 = expand_normal (arg0);
10293 rtx op1 = expand_normal (arg1);
10294 rtx op2;
10295 machine_mode tmode = insn_data[d->icode].operand[0].mode;
10296 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
10297 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
10298
10299 if (optimize || target == 0
10300 || GET_MODE (target) != tmode
10301 || !insn_data[d->icode].operand[0].predicate (target, tmode))
10302 target = gen_reg_rtx (tmode);
10303
10304 op0 = safe_vector_operand (op0, mode0);
10305 op1 = safe_vector_operand (op1, mode1);
10306
10307 if ((optimize && !register_operand (op0, mode0))
10308 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
10309 op0 = copy_to_mode_reg (mode0, op0);
10310 if ((optimize && !register_operand (op1, mode1))
10311 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
10312 op1 = copy_to_mode_reg (mode1, op1);
10313
10314 op2 = GEN_INT (d->comparison);
10315
10316 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
10317 if (! pat)
10318 return 0;
10319 emit_insn (pat);
10320 return target;
10321 }
10322
10323 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
10324
10325 static rtx
10326 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
10327 rtx target)
10328 {
10329 rtx pat;
10330 tree arg0 = CALL_EXPR_ARG (exp, 0);
10331 tree arg1 = CALL_EXPR_ARG (exp, 1);
10332 rtx op0 = expand_normal (arg0);
10333 rtx op1 = expand_normal (arg1);
10334 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
10335 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
10336 enum rtx_code comparison = d->comparison;
10337
10338 /* ptest reg, reg sets the carry flag. */
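/* PTEST computes CF from the AND of one operand with the complement of
   the other, which is all-zero when both operands are the same
   register, so _mm_testc_si128 (X, X) and its 256-bit variant are
   known to be 1 at compile time.  */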
10339 if (comparison == LTU
10340 && (d->code == IX86_BUILTIN_PTESTC
10341 || d->code == IX86_BUILTIN_PTESTC256)
10342 && rtx_equal_p (op0, op1))
10343 {
10344 if (!target)
10345 target = gen_reg_rtx (SImode);
10346 emit_move_insn (target, const1_rtx);
10347 return target;
10348 }
10349
10350 if (VECTOR_MODE_P (mode0))
10351 op0 = safe_vector_operand (op0, mode0);
10352 if (VECTOR_MODE_P (mode1))
10353 op1 = safe_vector_operand (op1, mode1);
10354
10355 target = gen_reg_rtx (SImode);
10356 emit_move_insn (target, const0_rtx);
10357 target = gen_rtx_SUBREG (QImode, target, 0);
10358
10359 if ((optimize && !register_operand (op0, mode0))
10360 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
10361 op0 = copy_to_mode_reg (mode0, op0);
10362 if ((optimize && !register_operand (op1, mode1))
10363 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
10364 op1 = copy_to_mode_reg (mode1, op1);
10365
10366 pat = GEN_FCN (d->icode) (op0, op1);
10367 if (! pat)
10368 return 0;
10369 emit_insn (pat);
10370 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10371 gen_rtx_fmt_ee (comparison, QImode,
10372 SET_DEST (pat),
10373 const0_rtx)));
10374
10375 return SUBREG_REG (target);
10376 }
10377
10378 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
10379
10380 static rtx
10381 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
10382 tree exp, rtx target)
10383 {
10384 rtx pat;
10385 tree arg0 = CALL_EXPR_ARG (exp, 0);
10386 tree arg1 = CALL_EXPR_ARG (exp, 1);
10387 tree arg2 = CALL_EXPR_ARG (exp, 2);
10388 tree arg3 = CALL_EXPR_ARG (exp, 3);
10389 tree arg4 = CALL_EXPR_ARG (exp, 4);
10390 rtx scratch0, scratch1;
10391 rtx op0 = expand_normal (arg0);
10392 rtx op1 = expand_normal (arg1);
10393 rtx op2 = expand_normal (arg2);
10394 rtx op3 = expand_normal (arg3);
10395 rtx op4 = expand_normal (arg4);
10396 machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
10397
10398 tmode0 = insn_data[d->icode].operand[0].mode;
10399 tmode1 = insn_data[d->icode].operand[1].mode;
10400 modev2 = insn_data[d->icode].operand[2].mode;
10401 modei3 = insn_data[d->icode].operand[3].mode;
10402 modev4 = insn_data[d->icode].operand[4].mode;
10403 modei5 = insn_data[d->icode].operand[5].mode;
10404 modeimm = insn_data[d->icode].operand[6].mode;
10405
10406 if (VECTOR_MODE_P (modev2))
10407 op0 = safe_vector_operand (op0, modev2);
10408 if (VECTOR_MODE_P (modev4))
10409 op2 = safe_vector_operand (op2, modev4);
10410
10411 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
10412 op0 = copy_to_mode_reg (modev2, op0);
10413 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
10414 op1 = copy_to_mode_reg (modei3, op1);
10415 if ((optimize && !register_operand (op2, modev4))
10416 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
10417 op2 = copy_to_mode_reg (modev4, op2);
10418 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
10419 op3 = copy_to_mode_reg (modei5, op3);
10420
10421 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
10422 {
10423 error ("the fifth argument must be an 8-bit immediate");
10424 return const0_rtx;
10425 }
10426
10427 if (d->code == IX86_BUILTIN_PCMPESTRI128)
10428 {
10429 if (optimize || !target
10430 || GET_MODE (target) != tmode0
10431 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
10432 target = gen_reg_rtx (tmode0);
10433
10434 scratch1 = gen_reg_rtx (tmode1);
10435
10436 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
10437 }
10438 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
10439 {
10440 if (optimize || !target
10441 || GET_MODE (target) != tmode1
10442 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
10443 target = gen_reg_rtx (tmode1);
10444
10445 scratch0 = gen_reg_rtx (tmode0);
10446
10447 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
10448 }
10449 else
10450 {
10451 gcc_assert (d->flag);
10452
10453 scratch0 = gen_reg_rtx (tmode0);
10454 scratch1 = gen_reg_rtx (tmode1);
10455
10456 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
10457 }
10458
10459 if (! pat)
10460 return 0;
10461
10462 emit_insn (pat);
10463
10464 if (d->flag)
10465 {
10466 target = gen_reg_rtx (SImode);
10467 emit_move_insn (target, const0_rtx);
10468 target = gen_rtx_SUBREG (QImode, target, 0);
10469
10470 emit_insn
10471 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10472 gen_rtx_fmt_ee (EQ, QImode,
10473 gen_rtx_REG ((machine_mode) d->flag,
10474 FLAGS_REG),
10475 const0_rtx)));
10476 return SUBREG_REG (target);
10477 }
10478 else
10479 return target;
10480 }
10481
10482
10483 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
10484
10485 static rtx
10486 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
10487 tree exp, rtx target)
10488 {
10489 rtx pat;
10490 tree arg0 = CALL_EXPR_ARG (exp, 0);
10491 tree arg1 = CALL_EXPR_ARG (exp, 1);
10492 tree arg2 = CALL_EXPR_ARG (exp, 2);
10493 rtx scratch0, scratch1;
10494 rtx op0 = expand_normal (arg0);
10495 rtx op1 = expand_normal (arg1);
10496 rtx op2 = expand_normal (arg2);
10497 machine_mode tmode0, tmode1, modev2, modev3, modeimm;
10498
10499 tmode0 = insn_data[d->icode].operand[0].mode;
10500 tmode1 = insn_data[d->icode].operand[1].mode;
10501 modev2 = insn_data[d->icode].operand[2].mode;
10502 modev3 = insn_data[d->icode].operand[3].mode;
10503 modeimm = insn_data[d->icode].operand[4].mode;
10504
10505 if (VECTOR_MODE_P (modev2))
10506 op0 = safe_vector_operand (op0, modev2);
10507 if (VECTOR_MODE_P (modev3))
10508 op1 = safe_vector_operand (op1, modev3);
10509
10510 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
10511 op0 = copy_to_mode_reg (modev2, op0);
10512 if ((optimize && !register_operand (op1, modev3))
10513 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
10514 op1 = copy_to_mode_reg (modev3, op1);
10515
10516 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
10517 {
10518 error ("the third argument must be an 8-bit immediate");
10519 return const0_rtx;
10520 }
10521
10522 if (d->code == IX86_BUILTIN_PCMPISTRI128)
10523 {
10524 if (optimize || !target
10525 || GET_MODE (target) != tmode0
10526 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
10527 target = gen_reg_rtx (tmode0);
10528
10529 scratch1 = gen_reg_rtx (tmode1);
10530
10531 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
10532 }
10533 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
10534 {
10535 if (optimize || !target
10536 || GET_MODE (target) != tmode1
10537 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
10538 target = gen_reg_rtx (tmode1);
10539
10540 scratch0 = gen_reg_rtx (tmode0);
10541
10542 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
10543 }
10544 else
10545 {
10546 gcc_assert (d->flag);
10547
10548 scratch0 = gen_reg_rtx (tmode0);
10549 scratch1 = gen_reg_rtx (tmode1);
10550
10551 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
10552 }
10553
10554 if (! pat)
10555 return 0;
10556
10557 emit_insn (pat);
10558
10559 if (d->flag)
10560 {
10561 target = gen_reg_rtx (SImode);
10562 emit_move_insn (target, const0_rtx);
10563 target = gen_rtx_SUBREG (QImode, target, 0);
10564
10565 emit_insn
10566 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10567 gen_rtx_fmt_ee (EQ, QImode,
10568 gen_rtx_REG ((machine_mode) d->flag,
10569 FLAGS_REG),
10570 const0_rtx)));
10571 return SUBREG_REG (target);
10572 }
10573 else
10574 return target;
10575 }
10576
10577 /* Fix up modeless constants to fit the required mode. */
10578
10579 static rtx
10580 fixup_modeless_constant (rtx x, machine_mode mode)
10581 {
10582 if (GET_MODE (x) == VOIDmode)
10583 x = convert_to_mode (mode, x, 1);
10584 return x;
10585 }
10586
10587 /* Subroutine of ix86_expand_builtin to take care of insns with a
10588 variable number of operands. */
10589
10590 static rtx
10591 ix86_expand_args_builtin (const struct builtin_description *d,
10592 tree exp, rtx target)
10593 {
10594 rtx pat, real_target;
10595 unsigned int i, nargs;
10596 unsigned int nargs_constant = 0;
10597 unsigned int mask_pos = 0;
10598 int num_memory = 0;
10599 rtx xops[6];
10600 bool second_arg_count = false;
10601 enum insn_code icode = d->icode;
10602 const struct insn_data_d *insn_p = &insn_data[icode];
10603 machine_mode tmode = insn_p->operand[0].mode;
10604 machine_mode rmode = VOIDmode;
10605 bool swap = false;
10606 enum rtx_code comparison = d->comparison;
10607
10608 switch ((enum ix86_builtin_func_type) d->flag)
10609 {
10610 case V2DF_FTYPE_V2DF_ROUND:
10611 case V4DF_FTYPE_V4DF_ROUND:
10612 case V8DF_FTYPE_V8DF_ROUND:
10613 case V4SF_FTYPE_V4SF_ROUND:
10614 case V8SF_FTYPE_V8SF_ROUND:
10615 case V16SF_FTYPE_V16SF_ROUND:
10616 case V8HF_FTYPE_V8HF_ROUND:
10617 case V16HF_FTYPE_V16HF_ROUND:
10618 case V32HF_FTYPE_V32HF_ROUND:
10619 case V4SI_FTYPE_V4SF_ROUND:
10620 case V8SI_FTYPE_V8SF_ROUND:
10621 case V16SI_FTYPE_V16SF_ROUND:
10622 return ix86_expand_sse_round (d, exp, target);
10623 case V4SI_FTYPE_V2DF_V2DF_ROUND:
10624 case V8SI_FTYPE_V4DF_V4DF_ROUND:
10625 case V16SI_FTYPE_V8DF_V8DF_ROUND:
10626 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
10627 case INT_FTYPE_V8SF_V8SF_PTEST:
10628 case INT_FTYPE_V4DI_V4DI_PTEST:
10629 case INT_FTYPE_V4DF_V4DF_PTEST:
10630 case INT_FTYPE_V4SF_V4SF_PTEST:
10631 case INT_FTYPE_V2DI_V2DI_PTEST:
10632 case INT_FTYPE_V2DF_V2DF_PTEST:
10633 return ix86_expand_sse_ptest (d, exp, target);
10634 case FLOAT128_FTYPE_FLOAT128:
10635 case FLOAT_FTYPE_FLOAT:
10636 case FLOAT_FTYPE_BFLOAT16:
10637 case INT_FTYPE_INT:
10638 case UINT_FTYPE_UINT:
10639 case UINT16_FTYPE_UINT16:
10640 case UINT64_FTYPE_INT:
10641 case UINT64_FTYPE_UINT64:
10642 case INT64_FTYPE_INT64:
10643 case INT64_FTYPE_V4SF:
10644 case INT64_FTYPE_V2DF:
10645 case INT_FTYPE_V16QI:
10646 case INT_FTYPE_V8QI:
10647 case INT_FTYPE_V8SF:
10648 case INT_FTYPE_V4DF:
10649 case INT_FTYPE_V4SF:
10650 case INT_FTYPE_V2DF:
10651 case INT_FTYPE_V32QI:
10652 case V16QI_FTYPE_V16QI:
10653 case V8SI_FTYPE_V8SF:
10654 case V8SI_FTYPE_V4SI:
10655 case V8HI_FTYPE_V8HI:
10656 case V8HI_FTYPE_V16QI:
10657 case V8QI_FTYPE_V8QI:
10658 case V8SF_FTYPE_V8SF:
10659 case V8SF_FTYPE_V8SI:
10660 case V8SF_FTYPE_V4SF:
10661 case V8SF_FTYPE_V8HI:
10662 case V4SI_FTYPE_V4SI:
10663 case V4SI_FTYPE_V16QI:
10664 case V4SI_FTYPE_V4SF:
10665 case V4SI_FTYPE_V8SI:
10666 case V4SI_FTYPE_V8HI:
10667 case V4SI_FTYPE_V4DF:
10668 case V4SI_FTYPE_V2DF:
10669 case V4HI_FTYPE_V4HI:
10670 case V4DF_FTYPE_V4DF:
10671 case V4DF_FTYPE_V4SI:
10672 case V4DF_FTYPE_V4SF:
10673 case V4DF_FTYPE_V2DF:
10674 case V4SF_FTYPE_V4SF:
10675 case V4SF_FTYPE_V4SI:
10676 case V4SF_FTYPE_V8SF:
10677 case V4SF_FTYPE_V4DF:
10678 case V4SF_FTYPE_V8HI:
10679 case V4SF_FTYPE_V2DF:
10680 case V2DI_FTYPE_V2DI:
10681 case V2DI_FTYPE_V16QI:
10682 case V2DI_FTYPE_V8HI:
10683 case V2DI_FTYPE_V4SI:
10684 case V2DF_FTYPE_V2DF:
10685 case V2DF_FTYPE_V4SI:
10686 case V2DF_FTYPE_V4DF:
10687 case V2DF_FTYPE_V4SF:
10688 case V2DF_FTYPE_V2SI:
10689 case V2SI_FTYPE_V2SI:
10690 case V2SI_FTYPE_V4SF:
10691 case V2SI_FTYPE_V2SF:
10692 case V2SI_FTYPE_V2DF:
10693 case V2SF_FTYPE_V2SF:
10694 case V2SF_FTYPE_V2SI:
10695 case V32QI_FTYPE_V32QI:
10696 case V32QI_FTYPE_V16QI:
10697 case V16HI_FTYPE_V16HI:
10698 case V16HI_FTYPE_V8HI:
10699 case V8SI_FTYPE_V8SI:
10700 case V16HI_FTYPE_V16QI:
10701 case V8SI_FTYPE_V16QI:
10702 case V4DI_FTYPE_V16QI:
10703 case V8SI_FTYPE_V8HI:
10704 case V4DI_FTYPE_V8HI:
10705 case V4DI_FTYPE_V4SI:
10706 case V4DI_FTYPE_V2DI:
10707 case UQI_FTYPE_UQI:
10708 case UHI_FTYPE_UHI:
10709 case USI_FTYPE_USI:
10710 case USI_FTYPE_UQI:
10711 case USI_FTYPE_UHI:
10712 case UDI_FTYPE_UDI:
10713 case UHI_FTYPE_V16QI:
10714 case USI_FTYPE_V32QI:
10715 case UDI_FTYPE_V64QI:
10716 case V16QI_FTYPE_UHI:
10717 case V32QI_FTYPE_USI:
10718 case V64QI_FTYPE_UDI:
10719 case V8HI_FTYPE_UQI:
10720 case V16HI_FTYPE_UHI:
10721 case V32HI_FTYPE_USI:
10722 case V4SI_FTYPE_UQI:
10723 case V8SI_FTYPE_UQI:
10724 case V4SI_FTYPE_UHI:
10725 case V8SI_FTYPE_UHI:
10726 case UQI_FTYPE_V8HI:
10727 case UHI_FTYPE_V16HI:
10728 case USI_FTYPE_V32HI:
10729 case UQI_FTYPE_V4SI:
10730 case UQI_FTYPE_V8SI:
10731 case UHI_FTYPE_V16SI:
10732 case UQI_FTYPE_V2DI:
10733 case UQI_FTYPE_V4DI:
10734 case UQI_FTYPE_V8DI:
10735 case V16SI_FTYPE_UHI:
10736 case V2DI_FTYPE_UQI:
10737 case V4DI_FTYPE_UQI:
10738 case V16SI_FTYPE_INT:
10739 case V16SF_FTYPE_V8SF:
10740 case V16SI_FTYPE_V8SI:
10741 case V16SF_FTYPE_V4SF:
10742 case V16SI_FTYPE_V4SI:
10743 case V16SI_FTYPE_V16SF:
10744 case V16SI_FTYPE_V16SI:
10745 case V64QI_FTYPE_V64QI:
10746 case V32HI_FTYPE_V32HI:
10747 case V16SF_FTYPE_V16SF:
10748 case V8DI_FTYPE_UQI:
10749 case V8DI_FTYPE_V8DI:
10750 case V8DF_FTYPE_V4DF:
10751 case V8DF_FTYPE_V2DF:
10752 case V8DF_FTYPE_V8DF:
10753 case V4DI_FTYPE_V4DI:
10754 case V16BF_FTYPE_V16SF:
10755 case V8BF_FTYPE_V8SF:
10756 case V8BF_FTYPE_V4SF:
10757 nargs = 1;
10758 break;
10759 case V4SF_FTYPE_V4SF_VEC_MERGE:
10760 case V2DF_FTYPE_V2DF_VEC_MERGE:
10761 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
10762 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
10763 case V16QI_FTYPE_V16QI_V16QI:
10764 case V16QI_FTYPE_V8HI_V8HI:
10765 case V16HF_FTYPE_V16HF_V16HF:
10766 case V16SF_FTYPE_V16SF_V16SF:
10767 case V8QI_FTYPE_V8QI_V8QI:
10768 case V8QI_FTYPE_V4HI_V4HI:
10769 case V8HI_FTYPE_V8HI_V8HI:
10770 case V8HI_FTYPE_V16QI_V16QI:
10771 case V8HI_FTYPE_V4SI_V4SI:
10772 case V8HF_FTYPE_V8HF_V8HF:
10773 case V8SF_FTYPE_V8SF_V8SF:
10774 case V8SF_FTYPE_V8SF_V8SI:
10775 case V8DF_FTYPE_V8DF_V8DF:
10776 case V4SI_FTYPE_V4SI_V4SI:
10777 case V4SI_FTYPE_V8HI_V8HI:
10778 case V4SI_FTYPE_V2DF_V2DF:
10779 case V4HI_FTYPE_V4HI_V4HI:
10780 case V4HI_FTYPE_V8QI_V8QI:
10781 case V4HI_FTYPE_V2SI_V2SI:
10782 case V4DF_FTYPE_V4DF_V4DF:
10783 case V4DF_FTYPE_V4DF_V4DI:
10784 case V4SF_FTYPE_V4SF_V4SF:
10785 case V4SF_FTYPE_V4SF_V4SI:
10786 case V4SF_FTYPE_V4SF_V2SI:
10787 case V4SF_FTYPE_V4SF_V2DF:
10788 case V4SF_FTYPE_V4SF_UINT:
10789 case V4SF_FTYPE_V4SF_DI:
10790 case V4SF_FTYPE_V4SF_SI:
10791 case V4DI_FTYPE_V4DI_V2DI:
10792 case V2DI_FTYPE_V2DI_V2DI:
10793 case V2DI_FTYPE_V16QI_V16QI:
10794 case V2DI_FTYPE_V4SI_V4SI:
10795 case V2DI_FTYPE_V2DI_V16QI:
10796 case V2SI_FTYPE_V2SI_V2SI:
10797 case V2SI_FTYPE_V4HI_V4HI:
10798 case V2SI_FTYPE_V2SF_V2SF:
10799 case V2DF_FTYPE_V2DF_V2DF:
10800 case V2DF_FTYPE_V2DF_V4SF:
10801 case V2DF_FTYPE_V2DF_V2DI:
10802 case V2DF_FTYPE_V2DF_DI:
10803 case V2DF_FTYPE_V2DF_SI:
10804 case V2DF_FTYPE_V2DF_UINT:
10805 case V2SF_FTYPE_V2SF_V2SF:
10806 case V1DI_FTYPE_V1DI_V1DI:
10807 case V1DI_FTYPE_V8QI_V8QI:
10808 case V1DI_FTYPE_V2SI_V2SI:
10809 case V32QI_FTYPE_V16HI_V16HI:
10810 case V16HI_FTYPE_V8SI_V8SI:
10811 case V64QI_FTYPE_V64QI_V64QI:
10812 case V32QI_FTYPE_V32QI_V32QI:
10813 case V16HI_FTYPE_V32QI_V32QI:
10814 case V16HI_FTYPE_V16HI_V16HI:
10815 case V8SI_FTYPE_V4DF_V4DF:
10816 case V8SI_FTYPE_V8SI_V8SI:
10817 case V8SI_FTYPE_V16HI_V16HI:
10818 case V4DI_FTYPE_V4DI_V4DI:
10819 case V4DI_FTYPE_V8SI_V8SI:
10820 case V4DI_FTYPE_V32QI_V32QI:
10821 case V8DI_FTYPE_V64QI_V64QI:
10822 if (comparison == UNKNOWN)
10823 return ix86_expand_binop_builtin (icode, exp, target);
10824 nargs = 2;
10825 break;
10826 case V4SF_FTYPE_V4SF_V4SF_SWAP:
10827 case V2DF_FTYPE_V2DF_V2DF_SWAP:
10828 gcc_assert (comparison != UNKNOWN);
10829 nargs = 2;
10830 swap = true;
10831 break;
10832 case V16HI_FTYPE_V16HI_V8HI_COUNT:
10833 case V16HI_FTYPE_V16HI_SI_COUNT:
10834 case V8SI_FTYPE_V8SI_V4SI_COUNT:
10835 case V8SI_FTYPE_V8SI_SI_COUNT:
10836 case V4DI_FTYPE_V4DI_V2DI_COUNT:
10837 case V4DI_FTYPE_V4DI_INT_COUNT:
10838 case V8HI_FTYPE_V8HI_V8HI_COUNT:
10839 case V8HI_FTYPE_V8HI_SI_COUNT:
10840 case V4SI_FTYPE_V4SI_V4SI_COUNT:
10841 case V4SI_FTYPE_V4SI_SI_COUNT:
10842 case V4HI_FTYPE_V4HI_V4HI_COUNT:
10843 case V4HI_FTYPE_V4HI_SI_COUNT:
10844 case V2DI_FTYPE_V2DI_V2DI_COUNT:
10845 case V2DI_FTYPE_V2DI_SI_COUNT:
10846 case V2SI_FTYPE_V2SI_V2SI_COUNT:
10847 case V2SI_FTYPE_V2SI_SI_COUNT:
10848 case V1DI_FTYPE_V1DI_V1DI_COUNT:
10849 case V1DI_FTYPE_V1DI_SI_COUNT:
10850 nargs = 2;
10851 second_arg_count = true;
10852 break;
10853 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
10854 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
10855 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
10856 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
10857 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
10858 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
10859 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
10860 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
10861 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
10862 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
10863 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
10864 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
10865 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
10866 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
10867 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
10868 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
10869 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
10870 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
10871 nargs = 4;
10872 second_arg_count = true;
10873 break;
10874 case UINT64_FTYPE_UINT64_UINT64:
10875 case UINT_FTYPE_UINT_UINT:
10876 case UINT_FTYPE_UINT_USHORT:
10877 case UINT_FTYPE_UINT_UCHAR:
10878 case UINT16_FTYPE_UINT16_INT:
10879 case UINT8_FTYPE_UINT8_INT:
10880 case UQI_FTYPE_UQI_UQI:
10881 case UHI_FTYPE_UHI_UHI:
10882 case USI_FTYPE_USI_USI:
10883 case UDI_FTYPE_UDI_UDI:
10884 case V16SI_FTYPE_V8DF_V8DF:
10885 case V32BF_FTYPE_V16SF_V16SF:
10886 case V16BF_FTYPE_V8SF_V8SF:
10887 case V8BF_FTYPE_V4SF_V4SF:
10888 case V16BF_FTYPE_V16SF_UHI:
10889 case V8BF_FTYPE_V8SF_UQI:
10890 case V8BF_FTYPE_V4SF_UQI:
10891 nargs = 2;
10892 break;
10893 case V2DI_FTYPE_V2DI_INT_CONVERT:
10894 nargs = 2;
10895 rmode = V1TImode;
10896 nargs_constant = 1;
10897 break;
10898 case V4DI_FTYPE_V4DI_INT_CONVERT:
10899 nargs = 2;
10900 rmode = V2TImode;
10901 nargs_constant = 1;
10902 break;
10903 case V8DI_FTYPE_V8DI_INT_CONVERT:
10904 nargs = 2;
10905 rmode = V4TImode;
10906 nargs_constant = 1;
10907 break;
10908 case V8HI_FTYPE_V8HI_INT:
10909 case V8HI_FTYPE_V8SF_INT:
10910 case V16HI_FTYPE_V16SF_INT:
10911 case V8HI_FTYPE_V4SF_INT:
10912 case V8SF_FTYPE_V8SF_INT:
10913 case V4SF_FTYPE_V16SF_INT:
10914 case V16SF_FTYPE_V16SF_INT:
10915 case V4SI_FTYPE_V4SI_INT:
10916 case V4SI_FTYPE_V8SI_INT:
10917 case V4HI_FTYPE_V4HI_INT:
10918 case V4DF_FTYPE_V4DF_INT:
10919 case V4DF_FTYPE_V8DF_INT:
10920 case V4SF_FTYPE_V4SF_INT:
10921 case V4SF_FTYPE_V8SF_INT:
10922 case V2DI_FTYPE_V2DI_INT:
10923 case V2DF_FTYPE_V2DF_INT:
10924 case V2DF_FTYPE_V4DF_INT:
10925 case V16HI_FTYPE_V16HI_INT:
10926 case V8SI_FTYPE_V8SI_INT:
10927 case V16SI_FTYPE_V16SI_INT:
10928 case V4SI_FTYPE_V16SI_INT:
10929 case V4DI_FTYPE_V4DI_INT:
10930 case V2DI_FTYPE_V4DI_INT:
10931 case V4DI_FTYPE_V8DI_INT:
10932 case UQI_FTYPE_UQI_UQI_CONST:
10933 case UHI_FTYPE_UHI_UQI:
10934 case USI_FTYPE_USI_UQI:
10935 case UDI_FTYPE_UDI_UQI:
10936 nargs = 2;
10937 nargs_constant = 1;
10938 break;
10939 case V16QI_FTYPE_V16QI_V16QI_V16QI:
10940 case V8SF_FTYPE_V8SF_V8SF_V8SF:
10941 case V4DF_FTYPE_V4DF_V4DF_V4DF:
10942 case V4SF_FTYPE_V4SF_V4SF_V4SF:
10943 case V2DF_FTYPE_V2DF_V2DF_V2DF:
10944 case V32QI_FTYPE_V32QI_V32QI_V32QI:
10945 case UHI_FTYPE_V16SI_V16SI_UHI:
10946 case UQI_FTYPE_V8DI_V8DI_UQI:
10947 case V16HI_FTYPE_V16SI_V16HI_UHI:
10948 case V16QI_FTYPE_V16SI_V16QI_UHI:
10949 case V16QI_FTYPE_V8DI_V16QI_UQI:
10950 case V32HF_FTYPE_V32HF_V32HF_USI:
10951 case V16SF_FTYPE_V16SF_V16SF_UHI:
10952 case V16SF_FTYPE_V4SF_V16SF_UHI:
10953 case V16SI_FTYPE_SI_V16SI_UHI:
10954 case V16SI_FTYPE_V16HI_V16SI_UHI:
10955 case V16SI_FTYPE_V16QI_V16SI_UHI:
10956 case V8SF_FTYPE_V4SF_V8SF_UQI:
10957 case V4DF_FTYPE_V2DF_V4DF_UQI:
10958 case V8SI_FTYPE_V4SI_V8SI_UQI:
10959 case V8SI_FTYPE_SI_V8SI_UQI:
10960 case V4SI_FTYPE_V4SI_V4SI_UQI:
10961 case V4SI_FTYPE_SI_V4SI_UQI:
10962 case V4DI_FTYPE_V2DI_V4DI_UQI:
10963 case V4DI_FTYPE_DI_V4DI_UQI:
10964 case V2DI_FTYPE_V2DI_V2DI_UQI:
10965 case V2DI_FTYPE_DI_V2DI_UQI:
10966 case V64QI_FTYPE_V64QI_V64QI_UDI:
10967 case V64QI_FTYPE_V16QI_V64QI_UDI:
10968 case V64QI_FTYPE_QI_V64QI_UDI:
10969 case V32QI_FTYPE_V32QI_V32QI_USI:
10970 case V32QI_FTYPE_V16QI_V32QI_USI:
10971 case V32QI_FTYPE_QI_V32QI_USI:
10972 case V16QI_FTYPE_V16QI_V16QI_UHI:
10973 case V16QI_FTYPE_QI_V16QI_UHI:
10974 case V32HI_FTYPE_V8HI_V32HI_USI:
10975 case V32HI_FTYPE_HI_V32HI_USI:
10976 case V16HI_FTYPE_V8HI_V16HI_UHI:
10977 case V16HI_FTYPE_HI_V16HI_UHI:
10978 case V8HI_FTYPE_V8HI_V8HI_UQI:
10979 case V8HI_FTYPE_HI_V8HI_UQI:
10980 case V16HF_FTYPE_V16HF_V16HF_UHI:
10981 case V8SF_FTYPE_V8HI_V8SF_UQI:
10982 case V4SF_FTYPE_V8HI_V4SF_UQI:
10983 case V8SI_FTYPE_V8HF_V8SI_UQI:
10984 case V8SF_FTYPE_V8HF_V8SF_UQI:
10985 case V8SI_FTYPE_V8SF_V8SI_UQI:
10986 case V4SI_FTYPE_V4SF_V4SI_UQI:
10987 case V4SI_FTYPE_V8HF_V4SI_UQI:
10988 case V4SF_FTYPE_V8HF_V4SF_UQI:
10989 case V4DI_FTYPE_V8HF_V4DI_UQI:
10990 case V4DI_FTYPE_V4SF_V4DI_UQI:
10991 case V2DI_FTYPE_V8HF_V2DI_UQI:
10992 case V2DI_FTYPE_V4SF_V2DI_UQI:
10993 case V8HF_FTYPE_V8HF_V8HF_UQI:
10994 case V8HF_FTYPE_V8HF_V8HF_V8HF:
10995 case V8HF_FTYPE_V8HI_V8HF_UQI:
10996 case V8HF_FTYPE_V8SI_V8HF_UQI:
10997 case V8HF_FTYPE_V8SF_V8HF_UQI:
10998 case V8HF_FTYPE_V4SI_V8HF_UQI:
10999 case V8HF_FTYPE_V4SF_V8HF_UQI:
11000 case V8HF_FTYPE_V4DI_V8HF_UQI:
11001 case V8HF_FTYPE_V4DF_V8HF_UQI:
11002 case V8HF_FTYPE_V2DI_V8HF_UQI:
11003 case V8HF_FTYPE_V2DF_V8HF_UQI:
11004 case V4SF_FTYPE_V4DI_V4SF_UQI:
11005 case V4SF_FTYPE_V2DI_V4SF_UQI:
11006 case V4DF_FTYPE_V4DI_V4DF_UQI:
11007 case V4DF_FTYPE_V8HF_V4DF_UQI:
11008 case V2DF_FTYPE_V8HF_V2DF_UQI:
11009 case V2DF_FTYPE_V2DI_V2DF_UQI:
11010 case V16QI_FTYPE_V8HI_V16QI_UQI:
11011 case V16QI_FTYPE_V16HI_V16QI_UHI:
11012 case V16QI_FTYPE_V4SI_V16QI_UQI:
11013 case V16QI_FTYPE_V8SI_V16QI_UQI:
11014 case V8HI_FTYPE_V8HF_V8HI_UQI:
11015 case V8HI_FTYPE_V4SI_V8HI_UQI:
11016 case V8HI_FTYPE_V8SI_V8HI_UQI:
11017 case V16QI_FTYPE_V2DI_V16QI_UQI:
11018 case V16QI_FTYPE_V4DI_V16QI_UQI:
11019 case V8HI_FTYPE_V2DI_V8HI_UQI:
11020 case V8HI_FTYPE_V4DI_V8HI_UQI:
11021 case V4SI_FTYPE_V2DI_V4SI_UQI:
11022 case V4SI_FTYPE_V4DI_V4SI_UQI:
11023 case V32QI_FTYPE_V32HI_V32QI_USI:
11024 case UHI_FTYPE_V16QI_V16QI_UHI:
11025 case USI_FTYPE_V32QI_V32QI_USI:
11026 case UDI_FTYPE_V64QI_V64QI_UDI:
11027 case UQI_FTYPE_V8HI_V8HI_UQI:
11028 case UHI_FTYPE_V16HI_V16HI_UHI:
11029 case USI_FTYPE_V32HI_V32HI_USI:
11030 case UQI_FTYPE_V4SI_V4SI_UQI:
11031 case UQI_FTYPE_V8SI_V8SI_UQI:
11032 case UQI_FTYPE_V2DI_V2DI_UQI:
11033 case UQI_FTYPE_V4DI_V4DI_UQI:
11034 case V4SF_FTYPE_V2DF_V4SF_UQI:
11035 case V4SF_FTYPE_V4DF_V4SF_UQI:
11036 case V16SI_FTYPE_V16SI_V16SI_UHI:
11037 case V16SI_FTYPE_V4SI_V16SI_UHI:
11038 case V2DI_FTYPE_V4SI_V2DI_UQI:
11039 case V2DI_FTYPE_V8HI_V2DI_UQI:
11040 case V2DI_FTYPE_V16QI_V2DI_UQI:
11041 case V4DI_FTYPE_V4DI_V4DI_UQI:
11042 case V4DI_FTYPE_V4SI_V4DI_UQI:
11043 case V4DI_FTYPE_V8HI_V4DI_UQI:
11044 case V4DI_FTYPE_V16QI_V4DI_UQI:
11045 case V4DI_FTYPE_V4DF_V4DI_UQI:
11046 case V2DI_FTYPE_V2DF_V2DI_UQI:
11047 case V4SI_FTYPE_V4DF_V4SI_UQI:
11048 case V4SI_FTYPE_V2DF_V4SI_UQI:
11049 case V4SI_FTYPE_V8HI_V4SI_UQI:
11050 case V4SI_FTYPE_V16QI_V4SI_UQI:
11051 case V4DI_FTYPE_V4DI_V4DI_V4DI:
11052 case V8DF_FTYPE_V2DF_V8DF_UQI:
11053 case V8DF_FTYPE_V4DF_V8DF_UQI:
11054 case V8DF_FTYPE_V8DF_V8DF_UQI:
11055 case V8SF_FTYPE_V8SF_V8SF_UQI:
11056 case V8SF_FTYPE_V8SI_V8SF_UQI:
11057 case V4DF_FTYPE_V4DF_V4DF_UQI:
11058 case V4SF_FTYPE_V4SF_V4SF_UQI:
11059 case V2DF_FTYPE_V2DF_V2DF_UQI:
11060 case V2DF_FTYPE_V4SF_V2DF_UQI:
11061 case V2DF_FTYPE_V4SI_V2DF_UQI:
11062 case V4SF_FTYPE_V4SI_V4SF_UQI:
11063 case V4DF_FTYPE_V4SF_V4DF_UQI:
11064 case V4DF_FTYPE_V4SI_V4DF_UQI:
11065 case V8SI_FTYPE_V8SI_V8SI_UQI:
11066 case V8SI_FTYPE_V8HI_V8SI_UQI:
11067 case V8SI_FTYPE_V16QI_V8SI_UQI:
11068 case V8DF_FTYPE_V8SI_V8DF_UQI:
11069 case V8DI_FTYPE_DI_V8DI_UQI:
11070 case V16SF_FTYPE_V8SF_V16SF_UHI:
11071 case V16SI_FTYPE_V8SI_V16SI_UHI:
11072 case V16HF_FTYPE_V16HI_V16HF_UHI:
11073 case V16HF_FTYPE_V16HF_V16HF_V16HF:
11074 case V16HI_FTYPE_V16HF_V16HI_UHI:
11075 case V16HI_FTYPE_V16HI_V16HI_UHI:
11076 case V8HI_FTYPE_V16QI_V8HI_UQI:
11077 case V16HI_FTYPE_V16QI_V16HI_UHI:
11078 case V32HI_FTYPE_V32HI_V32HI_USI:
11079 case V32HI_FTYPE_V32QI_V32HI_USI:
11080 case V8DI_FTYPE_V16QI_V8DI_UQI:
11081 case V8DI_FTYPE_V2DI_V8DI_UQI:
11082 case V8DI_FTYPE_V4DI_V8DI_UQI:
11083 case V8DI_FTYPE_V8DI_V8DI_UQI:
11084 case V8DI_FTYPE_V8HI_V8DI_UQI:
11085 case V8DI_FTYPE_V8SI_V8DI_UQI:
11086 case V8HI_FTYPE_V8DI_V8HI_UQI:
11087 case V8SI_FTYPE_V8DI_V8SI_UQI:
11088 case V4SI_FTYPE_V4SI_V4SI_V4SI:
11089 case V4DI_FTYPE_V4DI_V4DI_V2DI:
11090 case V16SI_FTYPE_V16SI_V16SI_V16SI:
11091 case V8DI_FTYPE_V8DI_V8DI_V8DI:
11092 case V32HI_FTYPE_V32HI_V32HI_V32HI:
11093 case V2DI_FTYPE_V2DI_V2DI_V2DI:
11094 case V16HI_FTYPE_V16HI_V16HI_V16HI:
11095 case V8SI_FTYPE_V8SI_V8SI_V8SI:
11096 case V8HI_FTYPE_V8HI_V8HI_V8HI:
11097 case V32BF_FTYPE_V16SF_V16SF_USI:
11098 case V16BF_FTYPE_V8SF_V8SF_UHI:
11099 case V8BF_FTYPE_V4SF_V4SF_UQI:
11100 case V16BF_FTYPE_V16SF_V16BF_UHI:
11101 case V8BF_FTYPE_V8SF_V8BF_UQI:
11102 case V8BF_FTYPE_V4SF_V8BF_UQI:
11103 case V16SF_FTYPE_V16SF_V32BF_V32BF:
11104 case V8SF_FTYPE_V8SF_V16BF_V16BF:
11105 case V4SF_FTYPE_V4SF_V8BF_V8BF:
11106 nargs = 3;
11107 break;
11108 case V32QI_FTYPE_V32QI_V32QI_INT:
11109 case V16HI_FTYPE_V16HI_V16HI_INT:
11110 case V16QI_FTYPE_V16QI_V16QI_INT:
11111 case V4DI_FTYPE_V4DI_V4DI_INT:
11112 case V8HI_FTYPE_V8HI_V8HI_INT:
11113 case V8SI_FTYPE_V8SI_V8SI_INT:
11114 case V8SI_FTYPE_V8SI_V4SI_INT:
11115 case V8SF_FTYPE_V8SF_V8SF_INT:
11116 case V8SF_FTYPE_V8SF_V4SF_INT:
11117 case V4SI_FTYPE_V4SI_V4SI_INT:
11118 case V4DF_FTYPE_V4DF_V4DF_INT:
11119 case V16SF_FTYPE_V16SF_V16SF_INT:
11120 case V16SF_FTYPE_V16SF_V4SF_INT:
11121 case V16SI_FTYPE_V16SI_V4SI_INT:
11122 case V4DF_FTYPE_V4DF_V2DF_INT:
11123 case V4SF_FTYPE_V4SF_V4SF_INT:
11124 case V2DI_FTYPE_V2DI_V2DI_INT:
11125 case V4DI_FTYPE_V4DI_V2DI_INT:
11126 case V2DF_FTYPE_V2DF_V2DF_INT:
11127 case UQI_FTYPE_V8DI_V8UDI_INT:
11128 case UQI_FTYPE_V8DF_V8DF_INT:
11129 case UQI_FTYPE_V2DF_V2DF_INT:
11130 case UQI_FTYPE_V4SF_V4SF_INT:
11131 case UHI_FTYPE_V16SI_V16SI_INT:
11132 case UHI_FTYPE_V16SF_V16SF_INT:
11133 case V64QI_FTYPE_V64QI_V64QI_INT:
11134 case V32HI_FTYPE_V32HI_V32HI_INT:
11135 case V16SI_FTYPE_V16SI_V16SI_INT:
11136 case V8DI_FTYPE_V8DI_V8DI_INT:
11137 nargs = 3;
11138 nargs_constant = 1;
11139 break;
11140 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
11141 nargs = 3;
11142 rmode = V4DImode;
11143 nargs_constant = 1;
11144 break;
11145 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
11146 nargs = 3;
11147 rmode = V2DImode;
11148 nargs_constant = 1;
11149 break;
11150 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
11151 nargs = 3;
11152 rmode = DImode;
11153 nargs_constant = 1;
11154 break;
11155 case V2DI_FTYPE_V2DI_UINT_UINT:
11156 nargs = 3;
11157 nargs_constant = 2;
11158 break;
11159 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
11160 nargs = 3;
11161 rmode = V8DImode;
11162 nargs_constant = 1;
11163 break;
11164 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
11165 nargs = 5;
11166 rmode = V8DImode;
11167 mask_pos = 2;
11168 nargs_constant = 1;
11169 break;
11170 case QI_FTYPE_V8DF_INT_UQI:
11171 case QI_FTYPE_V4DF_INT_UQI:
11172 case QI_FTYPE_V2DF_INT_UQI:
11173 case HI_FTYPE_V16SF_INT_UHI:
11174 case QI_FTYPE_V8SF_INT_UQI:
11175 case QI_FTYPE_V4SF_INT_UQI:
11176 case QI_FTYPE_V8HF_INT_UQI:
11177 case HI_FTYPE_V16HF_INT_UHI:
11178 case SI_FTYPE_V32HF_INT_USI:
11179 case V4SI_FTYPE_V4SI_V4SI_UHI:
11180 case V8SI_FTYPE_V8SI_V8SI_UHI:
11181 nargs = 3;
11182 mask_pos = 1;
11183 nargs_constant = 1;
11184 break;
11185 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
11186 nargs = 5;
11187 rmode = V4DImode;
11188 mask_pos = 2;
11189 nargs_constant = 1;
11190 break;
11191 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
11192 nargs = 5;
11193 rmode = V2DImode;
11194 mask_pos = 2;
11195 nargs_constant = 1;
11196 break;
11197 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
11198 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
11199 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
11200 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
11201 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
11202 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
11203 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
11204 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
11205 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
11206 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
11207 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
11208 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
11209 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
11210 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
11211 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
11212 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
11213 case V32HF_FTYPE_V32HF_V32HF_V32HF_USI:
11214 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
11215 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
11216 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
11217 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
11218 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
11219 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
11220 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
11221 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
11222 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
11223 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
11224 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
11225 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
11226 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
11227 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
11228 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
11229 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
11230 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
11231 case V16HF_FTYPE_V16HF_V16HF_V16HF_UQI:
11232 case V16HF_FTYPE_V16HF_V16HF_V16HF_UHI:
11233 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
11234 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
11235 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
11236 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
11237 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
11238 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
11239 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
11240 case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI:
11241 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
11242 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
11243 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
11244 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
11245 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
11246 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
11247 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
11248 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
11249 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
11250 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
11251 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
11252 case V32BF_FTYPE_V16SF_V16SF_V32BF_USI:
11253 case V16BF_FTYPE_V8SF_V8SF_V16BF_UHI:
11254 case V8BF_FTYPE_V4SF_V4SF_V8BF_UQI:
11255 nargs = 4;
11256 break;
11257 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
11258 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
11259 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
11260 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
11261 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
11262 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT:
11263 nargs = 4;
11264 nargs_constant = 1;
11265 break;
11266 case UQI_FTYPE_V4DI_V4DI_INT_UQI:
11267 case UQI_FTYPE_V8SI_V8SI_INT_UQI:
11268 case QI_FTYPE_V4DF_V4DF_INT_UQI:
11269 case QI_FTYPE_V8SF_V8SF_INT_UQI:
11270 case UHI_FTYPE_V16HF_V16HF_INT_UHI:
11271 case UQI_FTYPE_V2DI_V2DI_INT_UQI:
11272 case UQI_FTYPE_V4SI_V4SI_INT_UQI:
11273 case UQI_FTYPE_V2DF_V2DF_INT_UQI:
11274 case UQI_FTYPE_V4SF_V4SF_INT_UQI:
11275 case UQI_FTYPE_V8HF_V8HF_INT_UQI:
11276 case UDI_FTYPE_V64QI_V64QI_INT_UDI:
11277 case USI_FTYPE_V32QI_V32QI_INT_USI:
11278 case UHI_FTYPE_V16QI_V16QI_INT_UHI:
11279 case USI_FTYPE_V32HI_V32HI_INT_USI:
11280 case USI_FTYPE_V32HF_V32HF_INT_USI:
11281 case UHI_FTYPE_V16HI_V16HI_INT_UHI:
11282 case UQI_FTYPE_V8HI_V8HI_INT_UQI:
11283 nargs = 4;
11284 mask_pos = 1;
11285 nargs_constant = 1;
11286 break;
11287 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
11288 nargs = 4;
11289 nargs_constant = 2;
11290 break;
11291 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
11292 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
11293 case V16SF_FTYPE_V16SF_V32BF_V32BF_UHI:
11294 case V8SF_FTYPE_V8SF_V16BF_V16BF_UQI:
11295 case V4SF_FTYPE_V4SF_V8BF_V8BF_UQI:
11296 nargs = 4;
11297 break;
11298 case UQI_FTYPE_V8DI_V8DI_INT_UQI:
11299 case UHI_FTYPE_V16SI_V16SI_INT_UHI:
11300 mask_pos = 1;
11301 nargs = 4;
11302 nargs_constant = 1;
11303 break;
11304 case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
11305 case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
11306 case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
11307 case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
11308 case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
11309 case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
11310 case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
11311 case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
11312 case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
11313 case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
11314 case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
11315 case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
11316 case V32HI_FTYPE_V32HI_INT_V32HI_USI:
11317 case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
11318 case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
11319 case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
11320 case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
11321 case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
11322 case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
11323 case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
11324 case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
11325 case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
11326 case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
11327 case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
11328 case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
11329 case V16HF_FTYPE_V16HF_INT_V16HF_UHI:
11330 case V8HF_FTYPE_V8HF_INT_V8HF_UQI:
11331 case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
11332 case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
11333 case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
11334 case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
11335 case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
11336 nargs = 4;
11337 mask_pos = 2;
11338 nargs_constant = 1;
11339 break;
11340 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
11341 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
11342 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
11343 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
11344 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
11345 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
11346 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
11347 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
11348 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
11349 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
11350 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
11351 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
11352 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
11353 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
11354 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
11355 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
11356 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
11357 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
11358 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
11359 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
11360 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
11361 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
11362 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
11363 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
11364 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
11365 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
11366 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
11367 nargs = 5;
11368 mask_pos = 2;
11369 nargs_constant = 1;
11370 break;
11371 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
11372 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
11373 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
11374 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
11375 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
11376 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
11377 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
11378 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
11379 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
11380 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
11381 nargs = 5;
11382 mask_pos = 1;
11383 nargs_constant = 1;
11384 break;
11385 case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
11386 case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
11387 case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
11388 case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT:
11389 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT:
11390 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT:
11391 case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT:
11392 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT:
11393 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT:
11394 case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT:
11395 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT:
11396 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT:
11397 nargs = 5;
11398 mask_pos = 1;
11399 nargs_constant = 2;
11400 break;
11401
11402 default:
11403 gcc_unreachable ();
11404 }
11405
11406 gcc_assert (nargs <= ARRAY_SIZE (xops));
11407
11408 if (comparison != UNKNOWN)
11409 {
11410 gcc_assert (nargs == 2);
11411 return ix86_expand_sse_compare (d, exp, target, swap);
11412 }
11413
11414 if (rmode == VOIDmode || rmode == tmode)
11415 {
11416 if (optimize
11417 || target == 0
11418 || GET_MODE (target) != tmode
11419 || !insn_p->operand[0].predicate (target, tmode))
11420 target = gen_reg_rtx (tmode);
11421 else if (memory_operand (target, tmode))
11422 num_memory++;
11423 real_target = target;
11424 }
11425 else
11426 {
11427 real_target = gen_reg_rtx (tmode);
11428 target = lowpart_subreg (rmode, real_target, tmode);
11429 }
11430
11431 for (i = 0; i < nargs; i++)
11432 {
11433 tree arg = CALL_EXPR_ARG (exp, i);
11434 rtx op = expand_normal (arg);
11435 machine_mode mode = insn_p->operand[i + 1].mode;
11436 bool match = insn_p->operand[i + 1].predicate (op, mode);
11437
11438 if (second_arg_count && i == 1)
11439 {
11440 /* SIMD shift insns take either an 8-bit immediate or a
11441 register as the count, but the builtin functions take an
11442 int. If the count doesn't match, put it in a register.
11443 The instructions use a 64-bit count; if op is only
11444 32-bit, zero-extend it, since negative shift counts are
11445 undefined behavior and zero-extension is more
11446 efficient. */
11447 if (!match)
11448 {
11449 if (SCALAR_INT_MODE_P (GET_MODE (op)))
11450 op = convert_modes (mode, GET_MODE (op), op, 1);
11451 else
11452 op = lowpart_subreg (mode, op, GET_MODE (op));
11453 if (!insn_p->operand[i + 1].predicate (op, mode))
11454 op = copy_to_reg (op);
11455 }
11456 }
11457 else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
11458 (!mask_pos && (nargs - i) <= nargs_constant))
11459 {
11460 if (!match)
11461 switch (icode)
11462 {
11463 case CODE_FOR_avx_vinsertf128v4di:
11464 case CODE_FOR_avx_vextractf128v4di:
11465 error ("the last argument must be a 1-bit immediate");
11466 return const0_rtx;
11467
11468 case CODE_FOR_avx512f_cmpv8di3_mask:
11469 case CODE_FOR_avx512f_cmpv16si3_mask:
11470 case CODE_FOR_avx512f_ucmpv8di3_mask:
11471 case CODE_FOR_avx512f_ucmpv16si3_mask:
11472 case CODE_FOR_avx512vl_cmpv4di3_mask:
11473 case CODE_FOR_avx512vl_cmpv8si3_mask:
11474 case CODE_FOR_avx512vl_ucmpv4di3_mask:
11475 case CODE_FOR_avx512vl_ucmpv8si3_mask:
11476 case CODE_FOR_avx512vl_cmpv2di3_mask:
11477 case CODE_FOR_avx512vl_cmpv4si3_mask:
11478 case CODE_FOR_avx512vl_ucmpv2di3_mask:
11479 case CODE_FOR_avx512vl_ucmpv4si3_mask:
11480 error ("the last argument must be a 3-bit immediate");
11481 return const0_rtx;
11482
11483 case CODE_FOR_sse4_1_roundsd:
11484 case CODE_FOR_sse4_1_roundss:
11485
11486 case CODE_FOR_sse4_1_roundpd:
11487 case CODE_FOR_sse4_1_roundps:
11488 case CODE_FOR_avx_roundpd256:
11489 case CODE_FOR_avx_roundps256:
11490
11491 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
11492 case CODE_FOR_sse4_1_roundps_sfix:
11493 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
11494 case CODE_FOR_avx_roundps_sfix256:
11495
11496 case CODE_FOR_sse4_1_blendps:
11497 case CODE_FOR_avx_blendpd256:
11498 case CODE_FOR_avx_vpermilv4df:
11499 case CODE_FOR_avx_vpermilv4df_mask:
11500 case CODE_FOR_avx512f_getmantv8df_mask:
11501 case CODE_FOR_avx512f_getmantv16sf_mask:
11502 case CODE_FOR_avx512vl_getmantv16hf_mask:
11503 case CODE_FOR_avx512vl_getmantv8sf_mask:
11504 case CODE_FOR_avx512vl_getmantv4df_mask:
11505 case CODE_FOR_avx512fp16_getmantv8hf_mask:
11506 case CODE_FOR_avx512vl_getmantv4sf_mask:
11507 case CODE_FOR_avx512vl_getmantv2df_mask:
11508 case CODE_FOR_avx512dq_rangepv8df_mask_round:
11509 case CODE_FOR_avx512dq_rangepv16sf_mask_round:
11510 case CODE_FOR_avx512dq_rangepv4df_mask:
11511 case CODE_FOR_avx512dq_rangepv8sf_mask:
11512 case CODE_FOR_avx512dq_rangepv2df_mask:
11513 case CODE_FOR_avx512dq_rangepv4sf_mask:
11514 case CODE_FOR_avx_shufpd256_mask:
11515 error ("the last argument must be a 4-bit immediate");
11516 return const0_rtx;
11517
11518 case CODE_FOR_sha1rnds4:
11519 case CODE_FOR_sse4_1_blendpd:
11520 case CODE_FOR_avx_vpermilv2df:
11521 case CODE_FOR_avx_vpermilv2df_mask:
11522 case CODE_FOR_xop_vpermil2v2df3:
11523 case CODE_FOR_xop_vpermil2v4sf3:
11524 case CODE_FOR_xop_vpermil2v4df3:
11525 case CODE_FOR_xop_vpermil2v8sf3:
11526 case CODE_FOR_avx512f_vinsertf32x4_mask:
11527 case CODE_FOR_avx512f_vinserti32x4_mask:
11528 case CODE_FOR_avx512f_vextractf32x4_mask:
11529 case CODE_FOR_avx512f_vextracti32x4_mask:
11530 case CODE_FOR_sse2_shufpd:
11531 case CODE_FOR_sse2_shufpd_mask:
11532 case CODE_FOR_avx512dq_shuf_f64x2_mask:
11533 case CODE_FOR_avx512dq_shuf_i64x2_mask:
11534 case CODE_FOR_avx512vl_shuf_i32x4_mask:
11535 case CODE_FOR_avx512vl_shuf_f32x4_mask:
11536 error ("the last argument must be a 2-bit immediate");
11537 return const0_rtx;
11538
11539 case CODE_FOR_avx_vextractf128v4df:
11540 case CODE_FOR_avx_vextractf128v8sf:
11541 case CODE_FOR_avx_vextractf128v8si:
11542 case CODE_FOR_avx_vinsertf128v4df:
11543 case CODE_FOR_avx_vinsertf128v8sf:
11544 case CODE_FOR_avx_vinsertf128v8si:
11545 case CODE_FOR_avx512f_vinsertf64x4_mask:
11546 case CODE_FOR_avx512f_vinserti64x4_mask:
11547 case CODE_FOR_avx512f_vextractf64x4_mask:
11548 case CODE_FOR_avx512f_vextracti64x4_mask:
11549 case CODE_FOR_avx512dq_vinsertf32x8_mask:
11550 case CODE_FOR_avx512dq_vinserti32x8_mask:
11551 case CODE_FOR_avx512vl_vinsertv4df:
11552 case CODE_FOR_avx512vl_vinsertv4di:
11553 case CODE_FOR_avx512vl_vinsertv8sf:
11554 case CODE_FOR_avx512vl_vinsertv8si:
11555 error ("the last argument must be a 1-bit immediate");
11556 return const0_rtx;
11557
11558 case CODE_FOR_avx_vmcmpv2df3:
11559 case CODE_FOR_avx_vmcmpv4sf3:
11560 case CODE_FOR_avx_cmpv2df3:
11561 case CODE_FOR_avx_cmpv4sf3:
11562 case CODE_FOR_avx_cmpv4df3:
11563 case CODE_FOR_avx_cmpv8sf3:
11564 case CODE_FOR_avx512f_cmpv8df3_mask:
11565 case CODE_FOR_avx512f_cmpv16sf3_mask:
11566 case CODE_FOR_avx512f_vmcmpv2df3_mask:
11567 case CODE_FOR_avx512f_vmcmpv4sf3_mask:
11568 case CODE_FOR_avx512bw_cmpv32hf3_mask:
11569 case CODE_FOR_avx512vl_cmpv16hf3_mask:
11570 case CODE_FOR_avx512fp16_cmpv8hf3_mask:
11571 error ("the last argument must be a 5-bit immediate");
11572 return const0_rtx;
11573
11574 default:
11575 switch (nargs_constant)
11576 {
11577 case 2:
11578 if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
11579 (!mask_pos && (nargs - i) == nargs_constant))
11580 {
11581 error ("the next to last argument must be an 8-bit immediate");
11582 break;
11583 }
11584 /* FALLTHRU */
11585 case 1:
11586 error ("the last argument must be an 8-bit immediate");
11587 break;
11588 default:
11589 gcc_unreachable ();
11590 }
11591 return const0_rtx;
11592 }
11593 }
11594 else
11595 {
11596 if (VECTOR_MODE_P (mode))
11597 op = safe_vector_operand (op, mode);
11598
11599 /* If we aren't optimizing, only allow one memory operand to
11600 be generated. */
11601 if (memory_operand (op, mode))
11602 num_memory++;
11603
11604 op = fixup_modeless_constant (op, mode);
11605
11606 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
11607 {
11608 if (optimize || !match || num_memory > 1)
11609 op = copy_to_mode_reg (mode, op);
11610 }
11611 else
11612 {
11613 op = copy_to_reg (op);
11614 op = lowpart_subreg (mode, op, GET_MODE (op));
11615 }
11616 }
11617
11618 xops[i] = op;
11619 }
11620
11621 switch (nargs)
11622 {
11623 case 1:
11624 pat = GEN_FCN (icode) (real_target, xops[0]);
11625 break;
11626 case 2:
11627 pat = GEN_FCN (icode) (real_target, xops[0], xops[1]);
11628 break;
11629 case 3:
11630 pat = GEN_FCN (icode) (real_target, xops[0], xops[1], xops[2]);
11631 break;
11632 case 4:
11633 pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
11634 xops[2], xops[3]);
11635 break;
11636 case 5:
11637 pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
11638 xops[2], xops[3], xops[4]);
11639 break;
11640 case 6:
11641 pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
11642 xops[2], xops[3], xops[4], xops[5]);
11643 break;
11644 default:
11645 gcc_unreachable ();
11646 }
11647
11648 if (! pat)
11649 return 0;
11650
11651 emit_insn (pat);
11652 return target;
11653 }
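
/* Illustration only (assumed intrinsic mapping, not part of the original
   sources): a call such as the sketch below reaches the expander above
   through __builtin_ia32_blendps.  Since CODE_FOR_sse4_1_blendps is in
   the 4-bit group of the switch above, a non-constant or too-wide
   selector is diagnosed with "the last argument must be a 4-bit
   immediate".

     #include <smmintrin.h>
     __m128
     blend_even_odd (__m128 a, __m128 b)
     {
       return _mm_blend_ps (a, b, 0x5);   // 0x5 fits in 4 bits
     }
 */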
11654
11655 /* Transform a pattern of the following layout:
11656 (set A
11657 (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
11658 into:
11659 (set A B),
11660 i.e. drop the embedded-rounding wrapper and its operand C. */
11661
11662 static rtx
11663 ix86_erase_embedded_rounding (rtx pat)
11664 {
11665 if (GET_CODE (pat) == INSN)
11666 pat = PATTERN (pat);
11667
11668 gcc_assert (GET_CODE (pat) == SET);
11669 rtx src = SET_SRC (pat);
11670 gcc_assert (XVECLEN (src, 0) == 2);
11671 rtx p0 = XVECEXP (src, 0, 0);
11672 gcc_assert (GET_CODE (src) == UNSPEC
11673 && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
11674 rtx res = gen_rtx_SET (SET_DEST (pat), p0);
11675 return res;
11676 }
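
/* A shape-only sketch of the transformation (illustrative RTL with
   made-up register numbers, not taken from sse.md):

     (set (reg:V2DF 100)
          (unspec:V2DF [(plus:V2DF (reg:V2DF 101) (reg:V2DF 102))
                        (const_int 8)]
                       UNSPEC_EMBEDDED_ROUNDING))

   becomes

     (set (reg:V2DF 100)
          (plus:V2DF (reg:V2DF 101) (reg:V2DF 102)))

   i.e. the destination is set directly from the first element of the
   unspec vector and the rounding operand is dropped.  */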
11677
11678 /* Subroutine of ix86_expand_round_builtin to take care of comi insns
11679 with rounding. */
11680 static rtx
11681 ix86_expand_sse_comi_round (const struct builtin_description *d,
11682 tree exp, rtx target)
11683 {
11684 rtx pat, set_dst;
11685 tree arg0 = CALL_EXPR_ARG (exp, 0);
11686 tree arg1 = CALL_EXPR_ARG (exp, 1);
11687 tree arg2 = CALL_EXPR_ARG (exp, 2);
11688 tree arg3 = CALL_EXPR_ARG (exp, 3);
11689 rtx op0 = expand_normal (arg0);
11690 rtx op1 = expand_normal (arg1);
11691 rtx op2 = expand_normal (arg2);
11692 rtx op3 = expand_normal (arg3);
11693 enum insn_code icode = d->icode;
11694 const struct insn_data_d *insn_p = &insn_data[icode];
11695 machine_mode mode0 = insn_p->operand[0].mode;
11696 machine_mode mode1 = insn_p->operand[1].mode;
11697
11698 /* See avxintrin.h for values. */
11699 static const enum rtx_code comparisons[32] =
11700 {
11701 EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
11702 UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED,
11703 EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
11704 UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED
11705 };
11706 static const bool ordereds[32] =
11707 {
11708 true, true, true, false, false, false, false, true,
11709 false, false, false, true, true, true, true, false,
11710 true, true, true, false, false, false, false, true,
11711 false, false, false, true, true, true, true, false
11712 };
11713 static const bool non_signalings[32] =
11714 {
11715 true, false, false, true, true, false, false, true,
11716 true, false, false, true, true, false, false, true,
11717 false, true, true, false, false, true, true, false,
11718 false, true, true, false, false, true, true, false
11719 };
11720
11721 if (!CONST_INT_P (op2))
11722 {
11723 error ("the third argument must be a comparison constant");
11724 return const0_rtx;
11725 }
11726 if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
11727 {
11728 error ("incorrect comparison mode");
11729 return const0_rtx;
11730 }
11731
11732 if (!insn_p->operand[2].predicate (op3, SImode))
11733 {
11734 error ("incorrect rounding operand");
11735 return const0_rtx;
11736 }
11737
11738 if (VECTOR_MODE_P (mode0))
11739 op0 = safe_vector_operand (op0, mode0);
11740 if (VECTOR_MODE_P (mode1))
11741 op1 = safe_vector_operand (op1, mode1);
11742
11743 enum rtx_code comparison = comparisons[INTVAL (op2)];
11744 bool ordered = ordereds[INTVAL (op2)];
11745 bool non_signaling = non_signalings[INTVAL (op2)];
11746 rtx const_val = const0_rtx;
11747
11748 bool check_unordered = false;
11749 machine_mode mode = CCFPmode;
11750 switch (comparison)
11751 {
11752 case ORDERED:
11753 if (!ordered)
11754 {
11755 /* NB: Use CCSmode/NE for _CMP_TRUE_UQ/_CMP_TRUE_US. */
11756 if (!non_signaling)
11757 ordered = true;
11758 mode = CCSmode;
11759 }
11760 else
11761 {
11762 /* NB: Use CCPmode/NE for _CMP_ORD_Q/_CMP_ORD_S. */
11763 if (non_signaling)
11764 ordered = false;
11765 mode = CCPmode;
11766 }
11767 comparison = NE;
11768 break;
11769 case UNORDERED:
11770 if (ordered)
11771 {
11772 /* NB: Use CCSmode/EQ for _CMP_FALSE_OQ/_CMP_FALSE_OS. */
11773 if (non_signaling)
11774 ordered = false;
11775 mode = CCSmode;
11776 }
11777 else
11778 {
11779 /* NB: Use CCPmode/NE for _CMP_UNORD_Q/_CMP_UNORD_S. */
11780 if (!non_signaling)
11781 ordered = true;
11782 mode = CCPmode;
11783 }
11784 comparison = EQ;
11785 break;
11786
11787 case LE: /* -> GE */
11788 case LT: /* -> GT */
11789 case UNGE: /* -> UNLE */
11790 case UNGT: /* -> UNLT */
11791 std::swap (op0, op1);
11792 comparison = swap_condition (comparison);
11793 /* FALLTHRU */
11794 case GT:
11795 case GE:
11796 case UNEQ:
11797 case UNLT:
11798 case UNLE:
11799 case LTGT:
11800 /* These are supported by CCFPmode. NB: Use ordered/signaling
11801 COMI or unordered/non-signaling UCOMI. Both set ZF, PF, CF
11802 with NAN operands. */
11803 if (ordered == non_signaling)
11804 ordered = !ordered;
11805 break;
11806 case EQ:
11807 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
11808 _CMP_EQ_OQ/_CMP_EQ_OS. */
11809 check_unordered = true;
11810 mode = CCZmode;
11811 break;
11812 case NE:
11813 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
11814 _CMP_NEQ_UQ/_CMP_NEQ_US. */
11815 gcc_assert (!ordered);
11816 check_unordered = true;
11817 mode = CCZmode;
11818 const_val = const1_rtx;
11819 break;
11820 default:
11821 gcc_unreachable ();
11822 }
11823
11824 target = gen_reg_rtx (SImode);
11825 emit_move_insn (target, const_val);
11826 target = gen_rtx_SUBREG (QImode, target, 0);
11827
11828 if ((optimize && !register_operand (op0, mode0))
11829 || !insn_p->operand[0].predicate (op0, mode0))
11830 op0 = copy_to_mode_reg (mode0, op0);
11831 if ((optimize && !register_operand (op1, mode1))
11832 || !insn_p->operand[1].predicate (op1, mode1))
11833 op1 = copy_to_mode_reg (mode1, op1);
11834
11835 /*
11836 1. COMI: ordered and signaling.
11837 2. UCOMI: unordered and non-signaling.
11838 */
11839 if (non_signaling)
11840 icode = (icode == CODE_FOR_sse_comi_round
11841 ? CODE_FOR_sse_ucomi_round
11842 : CODE_FOR_sse2_ucomi_round);
11843
11844 pat = GEN_FCN (icode) (op0, op1, op3);
11845 if (! pat)
11846 return 0;
11847
11848 /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
11849 if (INTVAL (op3) == NO_ROUND)
11850 {
11851 pat = ix86_erase_embedded_rounding (pat);
11852 if (! pat)
11853 return 0;
11854
11855 set_dst = SET_DEST (pat);
11856 }
11857 else
11858 {
11859 gcc_assert (GET_CODE (pat) == SET);
11860 set_dst = SET_DEST (pat);
11861 }
11862
11863 emit_insn (pat);
11864
11865 return ix86_ssecom_setcc (comparison, check_unordered, mode,
11866 set_dst, target);
11867 }
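
/* Illustration only (assumed intrinsic mapping): the path above is
   reached from code such as

     #include <immintrin.h>
     int
     ge_no_exc (__m128d a, __m128d b)
     {
       return _mm_comi_round_sd (a, b, _CMP_GE_OQ, _MM_FROUND_NO_EXC);
     }

   where the third argument indexes the comparisons[] / ordereds[] /
   non_signalings[] tables above and the last one must satisfy the
   rounding-operand predicate.  */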
11868
11869 static rtx
11870 ix86_expand_round_builtin (const struct builtin_description *d,
11871 tree exp, rtx target)
11872 {
11873 rtx pat;
11874 unsigned int i, nargs;
11875 rtx xops[6];
11876 enum insn_code icode = d->icode;
11877 const struct insn_data_d *insn_p = &insn_data[icode];
11878 machine_mode tmode = insn_p->operand[0].mode;
11879 unsigned int nargs_constant = 0;
11880 unsigned int redundant_embed_rnd = 0;
11881
11882 switch ((enum ix86_builtin_func_type) d->flag)
11883 {
11884 case UINT64_FTYPE_V2DF_INT:
11885 case UINT64_FTYPE_V4SF_INT:
11886 case UINT64_FTYPE_V8HF_INT:
11887 case UINT_FTYPE_V2DF_INT:
11888 case UINT_FTYPE_V4SF_INT:
11889 case UINT_FTYPE_V8HF_INT:
11890 case INT64_FTYPE_V2DF_INT:
11891 case INT64_FTYPE_V4SF_INT:
11892 case INT64_FTYPE_V8HF_INT:
11893 case INT_FTYPE_V2DF_INT:
11894 case INT_FTYPE_V4SF_INT:
11895 case INT_FTYPE_V8HF_INT:
11896 nargs = 2;
11897 break;
11898 case V32HF_FTYPE_V32HF_V32HF_INT:
11899 case V8HF_FTYPE_V8HF_V8HF_INT:
11900 case V8HF_FTYPE_V8HF_INT_INT:
11901 case V8HF_FTYPE_V8HF_UINT_INT:
11902 case V8HF_FTYPE_V8HF_INT64_INT:
11903 case V8HF_FTYPE_V8HF_UINT64_INT:
11904 case V4SF_FTYPE_V4SF_UINT_INT:
11905 case V4SF_FTYPE_V4SF_UINT64_INT:
11906 case V2DF_FTYPE_V2DF_UINT64_INT:
11907 case V4SF_FTYPE_V4SF_INT_INT:
11908 case V4SF_FTYPE_V4SF_INT64_INT:
11909 case V2DF_FTYPE_V2DF_INT64_INT:
11910 case V4SF_FTYPE_V4SF_V4SF_INT:
11911 case V2DF_FTYPE_V2DF_V2DF_INT:
11912 case V4SF_FTYPE_V4SF_V2DF_INT:
11913 case V2DF_FTYPE_V2DF_V4SF_INT:
11914 nargs = 3;
11915 break;
11916 case V8SF_FTYPE_V8DF_V8SF_QI_INT:
11917 case V8DF_FTYPE_V8DF_V8DF_QI_INT:
11918 case V32HI_FTYPE_V32HF_V32HI_USI_INT:
11919 case V8SI_FTYPE_V8DF_V8SI_QI_INT:
11920 case V8DI_FTYPE_V8HF_V8DI_UQI_INT:
11921 case V8DI_FTYPE_V8DF_V8DI_QI_INT:
11922 case V8SF_FTYPE_V8DI_V8SF_QI_INT:
11923 case V8DF_FTYPE_V8DI_V8DF_QI_INT:
11924 case V8DF_FTYPE_V8HF_V8DF_UQI_INT:
11925 case V16SF_FTYPE_V16HF_V16SF_UHI_INT:
11926 case V32HF_FTYPE_V32HI_V32HF_USI_INT:
11927 case V32HF_FTYPE_V32HF_V32HF_USI_INT:
11928 case V32HF_FTYPE_V32HF_V32HF_V32HF_INT:
11929 case V16SF_FTYPE_V16SF_V16SF_HI_INT:
11930 case V8DI_FTYPE_V8SF_V8DI_QI_INT:
11931 case V16SF_FTYPE_V16SI_V16SF_HI_INT:
11932 case V16SI_FTYPE_V16SF_V16SI_HI_INT:
11933 case V16SI_FTYPE_V16HF_V16SI_UHI_INT:
11934 case V16HF_FTYPE_V16SI_V16HF_UHI_INT:
11935 case V8DF_FTYPE_V8SF_V8DF_QI_INT:
11936 case V16SF_FTYPE_V16HI_V16SF_HI_INT:
11937 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
11938 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
11939 case V8HF_FTYPE_V8DI_V8HF_UQI_INT:
11940 case V8HF_FTYPE_V8DF_V8HF_UQI_INT:
11941 case V16HF_FTYPE_V16SF_V16HF_UHI_INT:
11942 case V8HF_FTYPE_V8HF_V8HF_V8HF_INT:
11943 nargs = 4;
11944 break;
11945 case V4SF_FTYPE_V4SF_V4SF_INT_INT:
11946 case V2DF_FTYPE_V2DF_V2DF_INT_INT:
11947 nargs_constant = 2;
11948 nargs = 4;
11949 break;
11950 case INT_FTYPE_V4SF_V4SF_INT_INT:
11951 case INT_FTYPE_V2DF_V2DF_INT_INT:
11952 return ix86_expand_sse_comi_round (d, exp, target);
11953 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
11954 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
11955 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
11956 case V4SF_FTYPE_V8HF_V4SF_V4SF_UQI_INT:
11957 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
11958 case V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT:
11959 case V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT:
11960 case V2DF_FTYPE_V8HF_V2DF_V2DF_UQI_INT:
11961 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
11962 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
11963 case V2DF_FTYPE_V2DF_V4SF_V2DF_UQI_INT:
11964 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
11965 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
11966 case V4SF_FTYPE_V4SF_V2DF_V4SF_UQI_INT:
11967 case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT:
11968 case V8HF_FTYPE_V2DF_V8HF_V8HF_UQI_INT:
11969 case V8HF_FTYPE_V4SF_V8HF_V8HF_UQI_INT:
11970 nargs = 5;
11971 break;
11972 case V32HF_FTYPE_V32HF_INT_V32HF_USI_INT:
11973 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
11974 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
11975 case V8DF_FTYPE_V8DF_INT_V8DF_UQI_INT:
11976 case V16SF_FTYPE_V16SF_INT_V16SF_UHI_INT:
11977 nargs_constant = 4;
11978 nargs = 5;
11979 break;
11980 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
11981 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
11982 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
11983 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
11984 case USI_FTYPE_V32HF_V32HF_INT_USI_INT:
11985 case UQI_FTYPE_V8HF_V8HF_INT_UQI_INT:
11986 nargs_constant = 3;
11987 nargs = 5;
11988 break;
11989 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
11990 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
11991 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
11992 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
11993 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
11994 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
11995 case V8HF_FTYPE_V8HF_V8HF_INT_V8HF_UQI_INT:
11996 nargs = 6;
11997 nargs_constant = 4;
11998 break;
11999 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
12000 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
12001 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
12002 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
12003 nargs = 6;
12004 nargs_constant = 3;
12005 break;
12006 default:
12007 gcc_unreachable ();
12008 }
12009 gcc_assert (nargs <= ARRAY_SIZE (xops));
12010
12011 if (optimize
12012 || target == 0
12013 || GET_MODE (target) != tmode
12014 || !insn_p->operand[0].predicate (target, tmode))
12015 target = gen_reg_rtx (tmode);
12016
12017 for (i = 0; i < nargs; i++)
12018 {
12019 tree arg = CALL_EXPR_ARG (exp, i);
12020 rtx op = expand_normal (arg);
12021 machine_mode mode = insn_p->operand[i + 1].mode;
12022 bool match = insn_p->operand[i + 1].predicate (op, mode);
12023
12024 if (i == nargs - nargs_constant)
12025 {
12026 if (!match)
12027 {
12028 switch (icode)
12029 {
12030 case CODE_FOR_avx512f_getmantv8df_mask_round:
12031 case CODE_FOR_avx512f_getmantv16sf_mask_round:
12032 case CODE_FOR_avx512bw_getmantv32hf_mask_round:
12033 case CODE_FOR_avx512f_vgetmantv2df_round:
12034 case CODE_FOR_avx512f_vgetmantv2df_mask_round:
12035 case CODE_FOR_avx512f_vgetmantv4sf_round:
12036 case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
12037 case CODE_FOR_avx512f_vgetmantv8hf_mask_round:
12038 error ("the immediate argument must be a 4-bit immediate");
12039 return const0_rtx;
12040 case CODE_FOR_avx512f_cmpv8df3_mask_round:
12041 case CODE_FOR_avx512f_cmpv16sf3_mask_round:
12042 case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
12043 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
12044 case CODE_FOR_avx512f_vmcmpv8hf3_mask_round:
12045 case CODE_FOR_avx512bw_cmpv32hf3_mask_round:
12046 error ("the immediate argument must be a 5-bit immediate");
12047 return const0_rtx;
12048 default:
12049 error ("the immediate argument must be an 8-bit immediate");
12050 return const0_rtx;
12051 }
12052 }
12053 }
12054 else if (i == nargs - 1)
12055 {
12056 if (!insn_p->operand[nargs].predicate (op, SImode))
12057 {
12058 error ("incorrect rounding operand");
12059 return const0_rtx;
12060 }
12061
12062 /* If there is no rounding, use the normal version of the pattern. */
12063 if (INTVAL (op) == NO_ROUND)
12064 {
12065 /* Skip erasing embedded rounding for the expanders below, which
12066 generate multiple insns. In ix86_erase_embedded_rounding the
12067 pattern would be reduced to a single set, and emit_insn appends
12068 that set instead of inserting it into the chain, so the insns
12069 emitted inside the define_expand would be ignored. */
12070 switch (icode)
12071 {
12072 case CODE_FOR_avx512bw_fmaddc_v32hf_mask1_round:
12073 case CODE_FOR_avx512bw_fcmaddc_v32hf_mask1_round:
12074 case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask1_round:
12075 case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask1_round:
12076 case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask3_round:
12077 case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask3_round:
12078 redundant_embed_rnd = 0;
12079 break;
12080 default:
12081 redundant_embed_rnd = 1;
12082 break;
12083 }
12084 }
12085 }
12086 else
12087 {
12088 if (VECTOR_MODE_P (mode))
12089 op = safe_vector_operand (op, mode);
12090
12091 op = fixup_modeless_constant (op, mode);
12092
12093 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
12094 {
12095 if (optimize || !match)
12096 op = copy_to_mode_reg (mode, op);
12097 }
12098 else
12099 {
12100 op = copy_to_reg (op);
12101 op = lowpart_subreg (mode, op, GET_MODE (op));
12102 }
12103 }
12104
12105 xops[i] = op;
12106 }
12107
12108 switch (nargs)
12109 {
12110 case 1:
12111 pat = GEN_FCN (icode) (target, xops[0]);
12112 break;
12113 case 2:
12114 pat = GEN_FCN (icode) (target, xops[0], xops[1]);
12115 break;
12116 case 3:
12117 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
12118 break;
12119 case 4:
12120 pat = GEN_FCN (icode) (target, xops[0], xops[1],
12121 xops[2], xops[3]);
12122 break;
12123 case 5:
12124 pat = GEN_FCN (icode) (target, xops[0], xops[1],
12125 xops[2], xops[3], xops[4]);
12126 break;
12127 case 6:
12128 pat = GEN_FCN (icode) (target, xops[0], xops[1],
12129 xops[2], xops[3], xops[4], xops[5]);
12130 break;
12131 default:
12132 gcc_unreachable ();
12133 }
12134
12135 if (!pat)
12136 return 0;
12137
12138 if (redundant_embed_rnd)
12139 pat = ix86_erase_embedded_rounding (pat);
12140
12141 emit_insn (pat);
12142 return target;
12143 }
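
/* Illustration only (assumed intrinsic mapping): an explicit-rounding
   intrinsic such as

     #include <immintrin.h>
     __m512d
     add_round_nearest (__m512d a, __m512d b)
     {
       return _mm512_add_round_pd (a, b, _MM_FROUND_TO_NEAREST_INT
                                         | _MM_FROUND_NO_EXC);
     }

   is expanded here.  Had the last argument been _MM_FROUND_CUR_DIRECTION
   (NO_ROUND), redundant_embed_rnd would be set and the embedded-rounding
   wrapper erased, so the plain pattern is used instead.  */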
12144
12145 /* Subroutine of ix86_expand_builtin to take care of special insns
12146 with a variable number of operands. */
12147
12148 static rtx
12149 ix86_expand_special_args_builtin (const struct builtin_description *d,
12150 tree exp, rtx target)
12151 {
12152 tree arg;
12153 rtx pat, op;
12154 unsigned int i, nargs, arg_adjust, memory;
12155 unsigned int constant = 100;
12156 bool aligned_mem = false;
12157 rtx xops[4];
12158 enum insn_code icode = d->icode;
12159 const struct insn_data_d *insn_p = &insn_data[icode];
12160 machine_mode tmode = insn_p->operand[0].mode;
12161 enum { load, store } klass;
12162
12163 switch ((enum ix86_builtin_func_type) d->flag)
12164 {
12165 case VOID_FTYPE_VOID:
12166 emit_insn (GEN_FCN (icode) (target));
12167 return 0;
12168 case VOID_FTYPE_UINT64:
12169 case VOID_FTYPE_UNSIGNED:
12170 nargs = 0;
12171 klass = store;
12172 memory = 0;
12173 break;
12174
12175 case INT_FTYPE_VOID:
12176 case USHORT_FTYPE_VOID:
12177 case UINT64_FTYPE_VOID:
12178 case UINT_FTYPE_VOID:
12179 case UINT8_FTYPE_VOID:
12180 case UNSIGNED_FTYPE_VOID:
12181 nargs = 0;
12182 klass = load;
12183 memory = 0;
12184 break;
12185 case UINT64_FTYPE_PUNSIGNED:
12186 case V2DI_FTYPE_PV2DI:
12187 case V4DI_FTYPE_PV4DI:
12188 case V32QI_FTYPE_PCCHAR:
12189 case V16QI_FTYPE_PCCHAR:
12190 case V8SF_FTYPE_PCV4SF:
12191 case V8SF_FTYPE_PCFLOAT:
12192 case V4SF_FTYPE_PCFLOAT:
12193 case V4SF_FTYPE_PCFLOAT16:
12194 case V4SF_FTYPE_PCBFLOAT16:
12195 case V4SF_FTYPE_PCV8BF:
12196 case V4SF_FTYPE_PCV8HF:
12197 case V8SF_FTYPE_PCFLOAT16:
12198 case V8SF_FTYPE_PCBFLOAT16:
12199 case V8SF_FTYPE_PCV16HF:
12200 case V8SF_FTYPE_PCV16BF:
12201 case V4DF_FTYPE_PCV2DF:
12202 case V4DF_FTYPE_PCDOUBLE:
12203 case V2DF_FTYPE_PCDOUBLE:
12204 case VOID_FTYPE_PVOID:
12205 case V8DI_FTYPE_PV8DI:
12206 nargs = 1;
12207 klass = load;
12208 memory = 0;
12209 switch (icode)
12210 {
12211 case CODE_FOR_sse4_1_movntdqa:
12212 case CODE_FOR_avx2_movntdqa:
12213 case CODE_FOR_avx512f_movntdqa:
12214 aligned_mem = true;
12215 break;
12216 default:
12217 break;
12218 }
12219 break;
12220 case VOID_FTYPE_PV2SF_V4SF:
12221 case VOID_FTYPE_PV8DI_V8DI:
12222 case VOID_FTYPE_PV4DI_V4DI:
12223 case VOID_FTYPE_PV2DI_V2DI:
12224 case VOID_FTYPE_PCHAR_V32QI:
12225 case VOID_FTYPE_PCHAR_V16QI:
12226 case VOID_FTYPE_PFLOAT_V16SF:
12227 case VOID_FTYPE_PFLOAT_V8SF:
12228 case VOID_FTYPE_PFLOAT_V4SF:
12229 case VOID_FTYPE_PDOUBLE_V8DF:
12230 case VOID_FTYPE_PDOUBLE_V4DF:
12231 case VOID_FTYPE_PDOUBLE_V2DF:
12232 case VOID_FTYPE_PLONGLONG_LONGLONG:
12233 case VOID_FTYPE_PULONGLONG_ULONGLONG:
12234 case VOID_FTYPE_PUNSIGNED_UNSIGNED:
12235 case VOID_FTYPE_PINT_INT:
12236 nargs = 1;
12237 klass = store;
12238 /* Reserve memory operand for target. */
12239 memory = ARRAY_SIZE (xops);
12240 switch (icode)
12241 {
12242 /* These builtins and instructions require the memory
12243 to be properly aligned. */
12244 case CODE_FOR_avx_movntv4di:
12245 case CODE_FOR_sse2_movntv2di:
12246 case CODE_FOR_avx_movntv8sf:
12247 case CODE_FOR_sse_movntv4sf:
12248 case CODE_FOR_sse4a_vmmovntv4sf:
12249 case CODE_FOR_avx_movntv4df:
12250 case CODE_FOR_sse2_movntv2df:
12251 case CODE_FOR_sse4a_vmmovntv2df:
12252 case CODE_FOR_sse2_movntidi:
12253 case CODE_FOR_sse_movntq:
12254 case CODE_FOR_sse2_movntisi:
12255 case CODE_FOR_avx512f_movntv16sf:
12256 case CODE_FOR_avx512f_movntv8df:
12257 case CODE_FOR_avx512f_movntv8di:
12258 aligned_mem = true;
12259 break;
12260 default:
12261 break;
12262 }
12263 break;
12264 case VOID_FTYPE_PVOID_PCVOID:
12265 nargs = 1;
12266 klass = store;
12267 memory = 0;
12268
12269 break;
12270 case V4SF_FTYPE_V4SF_PCV2SF:
12271 case V2DF_FTYPE_V2DF_PCDOUBLE:
12272 nargs = 2;
12273 klass = load;
12274 memory = 1;
12275 break;
12276 case V8SF_FTYPE_PCV8SF_V8SI:
12277 case V4DF_FTYPE_PCV4DF_V4DI:
12278 case V4SF_FTYPE_PCV4SF_V4SI:
12279 case V2DF_FTYPE_PCV2DF_V2DI:
12280 case V8SI_FTYPE_PCV8SI_V8SI:
12281 case V4DI_FTYPE_PCV4DI_V4DI:
12282 case V4SI_FTYPE_PCV4SI_V4SI:
12283 case V2DI_FTYPE_PCV2DI_V2DI:
12284 case VOID_FTYPE_INT_INT64:
12285 nargs = 2;
12286 klass = load;
12287 memory = 0;
12288 break;
12289 case VOID_FTYPE_PV8DF_V8DF_UQI:
12290 case VOID_FTYPE_PV4DF_V4DF_UQI:
12291 case VOID_FTYPE_PV2DF_V2DF_UQI:
12292 case VOID_FTYPE_PV16SF_V16SF_UHI:
12293 case VOID_FTYPE_PV8SF_V8SF_UQI:
12294 case VOID_FTYPE_PV4SF_V4SF_UQI:
12295 case VOID_FTYPE_PV8DI_V8DI_UQI:
12296 case VOID_FTYPE_PV4DI_V4DI_UQI:
12297 case VOID_FTYPE_PV2DI_V2DI_UQI:
12298 case VOID_FTYPE_PV16SI_V16SI_UHI:
12299 case VOID_FTYPE_PV8SI_V8SI_UQI:
12300 case VOID_FTYPE_PV4SI_V4SI_UQI:
12301 case VOID_FTYPE_PV64QI_V64QI_UDI:
12302 case VOID_FTYPE_PV32HI_V32HI_USI:
12303 case VOID_FTYPE_PV32QI_V32QI_USI:
12304 case VOID_FTYPE_PV16QI_V16QI_UHI:
12305 case VOID_FTYPE_PV16HI_V16HI_UHI:
12306 case VOID_FTYPE_PV8HI_V8HI_UQI:
12307 switch (icode)
12308 {
12309 /* These builtins and instructions require the memory
12310 to be properly aligned. */
12311 case CODE_FOR_avx512f_storev16sf_mask:
12312 case CODE_FOR_avx512f_storev16si_mask:
12313 case CODE_FOR_avx512f_storev8df_mask:
12314 case CODE_FOR_avx512f_storev8di_mask:
12315 case CODE_FOR_avx512vl_storev8sf_mask:
12316 case CODE_FOR_avx512vl_storev8si_mask:
12317 case CODE_FOR_avx512vl_storev4df_mask:
12318 case CODE_FOR_avx512vl_storev4di_mask:
12319 case CODE_FOR_avx512vl_storev4sf_mask:
12320 case CODE_FOR_avx512vl_storev4si_mask:
12321 case CODE_FOR_avx512vl_storev2df_mask:
12322 case CODE_FOR_avx512vl_storev2di_mask:
12323 aligned_mem = true;
12324 break;
12325 default:
12326 break;
12327 }
12328 /* FALLTHRU */
12329 case VOID_FTYPE_PV8SF_V8SI_V8SF:
12330 case VOID_FTYPE_PV4DF_V4DI_V4DF:
12331 case VOID_FTYPE_PV4SF_V4SI_V4SF:
12332 case VOID_FTYPE_PV2DF_V2DI_V2DF:
12333 case VOID_FTYPE_PV8SI_V8SI_V8SI:
12334 case VOID_FTYPE_PV4DI_V4DI_V4DI:
12335 case VOID_FTYPE_PV4SI_V4SI_V4SI:
12336 case VOID_FTYPE_PV2DI_V2DI_V2DI:
12337 case VOID_FTYPE_PV8SI_V8DI_UQI:
12338 case VOID_FTYPE_PV8HI_V8DI_UQI:
12339 case VOID_FTYPE_PV16HI_V16SI_UHI:
12340 case VOID_FTYPE_PUDI_V8DI_UQI:
12341 case VOID_FTYPE_PV16QI_V16SI_UHI:
12342 case VOID_FTYPE_PV4SI_V4DI_UQI:
12343 case VOID_FTYPE_PUDI_V2DI_UQI:
12344 case VOID_FTYPE_PUDI_V4DI_UQI:
12345 case VOID_FTYPE_PUSI_V2DI_UQI:
12346 case VOID_FTYPE_PV8HI_V8SI_UQI:
12347 case VOID_FTYPE_PUDI_V4SI_UQI:
12348 case VOID_FTYPE_PUSI_V4DI_UQI:
12349 case VOID_FTYPE_PUHI_V2DI_UQI:
12350 case VOID_FTYPE_PUDI_V8SI_UQI:
12351 case VOID_FTYPE_PUSI_V4SI_UQI:
12352 case VOID_FTYPE_PCHAR_V64QI_UDI:
12353 case VOID_FTYPE_PCHAR_V32QI_USI:
12354 case VOID_FTYPE_PCHAR_V16QI_UHI:
12355 case VOID_FTYPE_PSHORT_V32HI_USI:
12356 case VOID_FTYPE_PSHORT_V16HI_UHI:
12357 case VOID_FTYPE_PSHORT_V8HI_UQI:
12358 case VOID_FTYPE_PINT_V16SI_UHI:
12359 case VOID_FTYPE_PINT_V8SI_UQI:
12360 case VOID_FTYPE_PINT_V4SI_UQI:
12361 case VOID_FTYPE_PINT64_V8DI_UQI:
12362 case VOID_FTYPE_PINT64_V4DI_UQI:
12363 case VOID_FTYPE_PINT64_V2DI_UQI:
12364 case VOID_FTYPE_PDOUBLE_V8DF_UQI:
12365 case VOID_FTYPE_PDOUBLE_V4DF_UQI:
12366 case VOID_FTYPE_PDOUBLE_V2DF_UQI:
12367 case VOID_FTYPE_PFLOAT_V16SF_UHI:
12368 case VOID_FTYPE_PFLOAT_V8SF_UQI:
12369 case VOID_FTYPE_PFLOAT_V4SF_UQI:
12370 case VOID_FTYPE_PCFLOAT16_V8HF_UQI:
12371 case VOID_FTYPE_PV32QI_V32HI_USI:
12372 case VOID_FTYPE_PV16QI_V16HI_UHI:
12373 case VOID_FTYPE_PUDI_V8HI_UQI:
12374 nargs = 2;
12375 klass = store;
12376 /* Reserve memory operand for target. */
12377 memory = ARRAY_SIZE (xops);
12378 break;
12379 case V4SF_FTYPE_PCV4SF_V4SF_UQI:
12380 case V8SF_FTYPE_PCV8SF_V8SF_UQI:
12381 case V16SF_FTYPE_PCV16SF_V16SF_UHI:
12382 case V4SI_FTYPE_PCV4SI_V4SI_UQI:
12383 case V8SI_FTYPE_PCV8SI_V8SI_UQI:
12384 case V16SI_FTYPE_PCV16SI_V16SI_UHI:
12385 case V2DF_FTYPE_PCV2DF_V2DF_UQI:
12386 case V4DF_FTYPE_PCV4DF_V4DF_UQI:
12387 case V8DF_FTYPE_PCV8DF_V8DF_UQI:
12388 case V2DI_FTYPE_PCV2DI_V2DI_UQI:
12389 case V4DI_FTYPE_PCV4DI_V4DI_UQI:
12390 case V8DI_FTYPE_PCV8DI_V8DI_UQI:
12391 case V64QI_FTYPE_PCV64QI_V64QI_UDI:
12392 case V32HI_FTYPE_PCV32HI_V32HI_USI:
12393 case V32QI_FTYPE_PCV32QI_V32QI_USI:
12394 case V16QI_FTYPE_PCV16QI_V16QI_UHI:
12395 case V16HI_FTYPE_PCV16HI_V16HI_UHI:
12396 case V8HI_FTYPE_PCV8HI_V8HI_UQI:
12397 switch (icode)
12398 {
12399 /* These builtins and instructions require the memory
12400 to be properly aligned. */
12401 case CODE_FOR_avx512f_loadv16sf_mask:
12402 case CODE_FOR_avx512f_loadv16si_mask:
12403 case CODE_FOR_avx512f_loadv8df_mask:
12404 case CODE_FOR_avx512f_loadv8di_mask:
12405 case CODE_FOR_avx512vl_loadv8sf_mask:
12406 case CODE_FOR_avx512vl_loadv8si_mask:
12407 case CODE_FOR_avx512vl_loadv4df_mask:
12408 case CODE_FOR_avx512vl_loadv4di_mask:
12409 case CODE_FOR_avx512vl_loadv4sf_mask:
12410 case CODE_FOR_avx512vl_loadv4si_mask:
12411 case CODE_FOR_avx512vl_loadv2df_mask:
12412 case CODE_FOR_avx512vl_loadv2di_mask:
12413 case CODE_FOR_avx512bw_loadv64qi_mask:
12414 case CODE_FOR_avx512vl_loadv32qi_mask:
12415 case CODE_FOR_avx512vl_loadv16qi_mask:
12416 case CODE_FOR_avx512bw_loadv32hi_mask:
12417 case CODE_FOR_avx512vl_loadv16hi_mask:
12418 case CODE_FOR_avx512vl_loadv8hi_mask:
12419 aligned_mem = true;
12420 break;
12421 default:
12422 break;
12423 }
12424 /* FALLTHRU */
12425 case V64QI_FTYPE_PCCHAR_V64QI_UDI:
12426 case V32QI_FTYPE_PCCHAR_V32QI_USI:
12427 case V16QI_FTYPE_PCCHAR_V16QI_UHI:
12428 case V32HI_FTYPE_PCSHORT_V32HI_USI:
12429 case V16HI_FTYPE_PCSHORT_V16HI_UHI:
12430 case V8HI_FTYPE_PCSHORT_V8HI_UQI:
12431 case V16SI_FTYPE_PCINT_V16SI_UHI:
12432 case V8SI_FTYPE_PCINT_V8SI_UQI:
12433 case V4SI_FTYPE_PCINT_V4SI_UQI:
12434 case V8DI_FTYPE_PCINT64_V8DI_UQI:
12435 case V4DI_FTYPE_PCINT64_V4DI_UQI:
12436 case V2DI_FTYPE_PCINT64_V2DI_UQI:
12437 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
12438 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
12439 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
12440 case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
12441 case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
12442 case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
12443 case V8HF_FTYPE_PCFLOAT16_V8HF_UQI:
12444 nargs = 3;
12445 klass = load;
12446 memory = 0;
12447 break;
12448 case INT_FTYPE_PINT_INT_INT_INT:
12449 case LONGLONG_FTYPE_PLONGLONG_LONGLONG_LONGLONG_INT:
12450 nargs = 4;
12451 klass = load;
12452 memory = 0;
12453 constant = 3;
12454 break;
12455 default:
12456 gcc_unreachable ();
12457 }
12458
12459 gcc_assert (nargs <= ARRAY_SIZE (xops));
12460
12461 if (klass == store)
12462 {
12463 arg = CALL_EXPR_ARG (exp, 0);
12464 op = expand_normal (arg);
12465 gcc_assert (target == 0);
12466 if (memory)
12467 {
12468 op = ix86_zero_extend_to_Pmode (op);
12469 target = gen_rtx_MEM (tmode, op);
12470 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
12471 on it. Try to improve it using get_pointer_alignment,
12472 and if the special builtin is one that requires strict
12473 mode alignment, also from its GET_MODE_ALIGNMENT.
12474 Failure to do so could lead to ix86_legitimate_combined_insn
12475 rejecting all changes to such insns. */
12476 unsigned int align = get_pointer_alignment (arg);
12477 if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
12478 align = GET_MODE_ALIGNMENT (tmode);
12479 if (MEM_ALIGN (target) < align)
12480 set_mem_align (target, align);
12481 }
12482 else
12483 target = force_reg (tmode, op);
12484 arg_adjust = 1;
12485 }
12486 else
12487 {
12488 arg_adjust = 0;
12489 if (optimize
12490 || target == 0
12491 || !register_operand (target, tmode)
12492 || GET_MODE (target) != tmode)
12493 target = gen_reg_rtx (tmode);
12494 }
12495
12496 for (i = 0; i < nargs; i++)
12497 {
12498 machine_mode mode = insn_p->operand[i + 1].mode;
12499
12500 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
12501 op = expand_normal (arg);
12502
12503 if (i == memory)
12504 {
12505 /* This must be the memory operand. */
12506 op = ix86_zero_extend_to_Pmode (op);
12507 op = gen_rtx_MEM (mode, op);
12508 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
12509 on it. Try to improve it using get_pointer_alignment,
12510 and if the special builtin is one that requires strict
12511 mode alignment, also from its GET_MODE_ALIGNMENT.
12512 Failure to do so could lead to ix86_legitimate_combined_insn
12513 rejecting all changes to such insns. */
12514 unsigned int align = get_pointer_alignment (arg);
12515 if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
12516 align = GET_MODE_ALIGNMENT (mode);
12517 if (MEM_ALIGN (op) < align)
12518 set_mem_align (op, align);
12519 }
12520 else if (i == constant)
12521 {
12522 /* This must be the constant. */
12523 if (!insn_p->operand[nargs].predicate (op, SImode))
12524 {
12525 error ("the fourth argument must be one of enum %qs", "_CMPCCX_ENUM");
12526 return const0_rtx;
12527 }
12528 }
12529 else
12530 {
12531 /* This must be a register. */
12532 if (VECTOR_MODE_P (mode))
12533 op = safe_vector_operand (op, mode);
12534
12535 op = fixup_modeless_constant (op, mode);
12536
12537 /* NB: a 3-operand load implies it's a mask load or v{p}expand*,
12538 and the mask operand should be at the end.
12539 Keep an all-ones mask, which will be simplified by the expander. */
12540 if (nargs == 3 && i == 2 && klass == load
12541 && constm1_operand (op, mode)
12542 && insn_p->operand[i].predicate (op, mode))
12543 ;
12544 else if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
12545 op = copy_to_mode_reg (mode, op);
12546 else
12547 {
12548 op = copy_to_reg (op);
12549 op = lowpart_subreg (mode, op, GET_MODE (op));
12550 }
12551 }
12552
12553 xops[i] = op;
12554 }
12555
12556 switch (nargs)
12557 {
12558 case 0:
12559 pat = GEN_FCN (icode) (target);
12560 break;
12561 case 1:
12562 pat = GEN_FCN (icode) (target, xops[0]);
12563 break;
12564 case 2:
12565 pat = GEN_FCN (icode) (target, xops[0], xops[1]);
12566 break;
12567 case 3:
12568 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
12569 break;
12570 case 4:
12571 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2], xops[3]);
12572 break;
12573 default:
12574 gcc_unreachable ();
12575 }
12576
12577 if (! pat)
12578 return 0;
12579
12580 emit_insn (pat);
12581 return klass == store ? 0 : target;
12582 }
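
/* Illustration only (assumed intrinsic mapping): a non-temporal store
   such as

     #include <xmmintrin.h>
     void
     stream_store (float *p, __m128 v)
     {
       _mm_stream_ps (p, v);
     }

   goes through the VOID_FTYPE_PFLOAT_V4SF store case above.  Because
   CODE_FOR_sse_movntv4sf is in the aligned_mem list, the generated MEM
   is given at least GET_MODE_ALIGNMENT (V4SFmode) rather than only the
   alignment recoverable from the pointer argument.  */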
12583
12584 /* Return the integer constant in ARG. Constrain it to be in the range
12585 of the subparts of VEC_TYPE; issue an error if not. */
12586
12587 static int
12588 get_element_number (tree vec_type, tree arg)
12589 {
12590 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
12591
12592 if (!tree_fits_uhwi_p (arg)
12593 || (elt = tree_to_uhwi (arg), elt > max))
12594 {
12595 error ("selector must be an integer constant in the range "
12596 "[0, %wi]", max);
12597 return 0;
12598 }
12599
12600 return elt;
12601 }
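
/* Worked example (builtin name given for illustration): for a V4SF
   vector type TYPE_VECTOR_SUBPARTS is 4, so max is 3 and

     __builtin_ia32_vec_ext_v4sf (v, 7)

   is diagnosed with "selector must be an integer constant in the range
   [0, 3]"; 0 is returned so that expansion can continue with element
   0.  */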
12602
12603 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
12604 ix86_expand_vector_init. We DO have language-level syntax for this, in
12605 the form of (type){ init-list }. Except that since we can't place emms
12606 instructions from inside the compiler, we can't allow the use of MMX
12607 registers unless the user explicitly asks for it. So we do *not* define
12608 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
12609 we have builtins invoked by mmintrin.h that give us license to emit
12610 these sorts of instructions. */
12611
12612 static rtx
12613 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
12614 {
12615 machine_mode tmode = TYPE_MODE (type);
12616 machine_mode inner_mode = GET_MODE_INNER (tmode);
12617 int i, n_elt = GET_MODE_NUNITS (tmode);
12618 rtvec v = rtvec_alloc (n_elt);
12619
12620 gcc_assert (VECTOR_MODE_P (tmode));
12621 gcc_assert (call_expr_nargs (exp) == n_elt);
12622
12623 for (i = 0; i < n_elt; ++i)
12624 {
12625 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
12626 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
12627 }
12628
12629 if (!target || !register_operand (target, tmode))
12630 target = gen_reg_rtx (tmode);
12631
12632 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
12633 return target;
12634 }
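
/* Illustration only (assumed builtin mapping): mmintrin.h routes MMX
   vector construction through builtins that land here, e.g.

     #include <mmintrin.h>
     __m64
     from_int (int i)
     {
       return _mm_cvtsi32_si64 (i);   // uses __builtin_ia32_vec_init_v2si
     }

   Each argument is lowered to the inner mode and the resulting PARALLEL
   is handed to ix86_expand_vector_init.  */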
12635
12636 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
12637 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
12638 had a language-level syntax for referencing vector elements. */
12639
12640 static rtx
12641 ix86_expand_vec_ext_builtin (tree exp, rtx target)
12642 {
12643 machine_mode tmode, mode0;
12644 tree arg0, arg1;
12645 int elt;
12646 rtx op0;
12647
12648 arg0 = CALL_EXPR_ARG (exp, 0);
12649 arg1 = CALL_EXPR_ARG (exp, 1);
12650
12651 op0 = expand_normal (arg0);
12652 elt = get_element_number (TREE_TYPE (arg0), arg1);
12653
12654 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
12655 mode0 = TYPE_MODE (TREE_TYPE (arg0));
12656 gcc_assert (VECTOR_MODE_P (mode0));
12657
12658 op0 = force_reg (mode0, op0);
12659
12660 if (optimize || !target || !register_operand (target, tmode))
12661 target = gen_reg_rtx (tmode);
12662
12663 ix86_expand_vector_extract (true, target, op0, elt);
12664
12665 return target;
12666 }
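
/* Illustration only (assumed builtin mapping): xmmintrin.h's
   _mm_cvtss_f32 is built on __builtin_ia32_vec_ext_v4sf and ends up
   here, extracting element 0:

     #include <xmmintrin.h>
     float
     first_lane (__m128 v)
     {
       return _mm_cvtss_f32 (v);
     }
 */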
12667
12668 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
12669 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
12670 a language-level syntax for referencing vector elements. */
12671
12672 static rtx
12673 ix86_expand_vec_set_builtin (tree exp)
12674 {
12675 machine_mode tmode, mode1;
12676 tree arg0, arg1, arg2;
12677 int elt;
12678 rtx op0, op1, target;
12679
12680 arg0 = CALL_EXPR_ARG (exp, 0);
12681 arg1 = CALL_EXPR_ARG (exp, 1);
12682 arg2 = CALL_EXPR_ARG (exp, 2);
12683
12684 tmode = TYPE_MODE (TREE_TYPE (arg0));
12685 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
12686 gcc_assert (VECTOR_MODE_P (tmode));
12687
12688 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
12689 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
12690 elt = get_element_number (TREE_TYPE (arg0), arg2);
12691
12692 if (GET_MODE (op1) != mode1)
12693 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
12694
12695 op0 = force_reg (tmode, op0);
12696 op1 = force_reg (mode1, op1);
12697
12698 /* OP0 is the source of these builtin functions and shouldn't be
12699 modified. Create a copy, use it and return it as target. */
12700 target = gen_reg_rtx (tmode);
12701 emit_move_insn (target, op0);
12702 ix86_expand_vector_set (true, target, op1, elt);
12703
12704 return target;
12705 }
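
/* Illustration only (assumed builtin mapping): _mm_insert_pi16 from
   xmmintrin.h is built on __builtin_ia32_vec_set_v4hi and ends up here:

     #include <xmmintrin.h>
     __m64
     set_lane2 (__m64 v, int x)
     {
       return _mm_insert_pi16 (v, x, 2);   // element index must be constant
     }

   Note that the source vector is copied into a fresh register first, so
   the original operand is left unmodified.  */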
12706
12707 /* Return true if the necessary isa options for this builtin exist,
12708 else false.
12709 fcode = DECL_MD_FUNCTION_CODE (fndecl); */
12710 bool
12711 ix86_check_builtin_isa_match (unsigned int fcode,
12712 HOST_WIDE_INT* pbisa,
12713 HOST_WIDE_INT* pbisa2)
12714 {
12715 HOST_WIDE_INT isa = ix86_isa_flags;
12716 HOST_WIDE_INT isa2 = ix86_isa_flags2;
12717 HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
12718 HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
12719 HOST_WIDE_INT tmp_isa = isa, tmp_isa2 = isa2;
12720 /* The general case is that we require all the ISAs specified in bisa{,2}
12721 to be enabled.
12722 The exceptions are:
12723 OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
12724 OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
12725 OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
12726 (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL) or
12727 OPTION_MASK_ISA2_AVXVNNI
12728 (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL) or
12729 OPTION_MASK_ISA2_AVXIFMA
12730 (OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA2_AVX512BF16) or
12731 OPTION_MASK_ISA2_AVXNECONVERT
12732 OPTION_MASK_ISA_AES or (OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA2_VAES)
12733 where for each such pair it is sufficient if either of the ISAs is
12734 enabled; if the pair is ORed with other options, those must be enabled as well.
12735 OPTION_MASK_ISA_MMX in bisa is satisfied also if TARGET_MMX_WITH_SSE. */
12736
12737 #define SHARE_BUILTIN(A1, A2, B1, B2) \
12738 if ((((bisa & (A1)) == (A1) && (bisa2 & (A2)) == (A2)) \
12739 && ((bisa & (B1)) == (B1) && (bisa2 & (B2)) == (B2))) \
12740 && (((isa & (A1)) == (A1) && (isa2 & (A2)) == (A2)) \
12741 || ((isa & (B1)) == (B1) && (isa2 & (B2)) == (B2)))) \
12742 { \
12743 tmp_isa |= (A1) | (B1); \
12744 tmp_isa2 |= (A2) | (B2); \
12745 }
12746
12747 SHARE_BUILTIN (OPTION_MASK_ISA_SSE, 0, OPTION_MASK_ISA_3DNOW_A, 0);
12748 SHARE_BUILTIN (OPTION_MASK_ISA_SSE4_2, 0, OPTION_MASK_ISA_CRC32, 0);
12749 SHARE_BUILTIN (OPTION_MASK_ISA_FMA, 0, OPTION_MASK_ISA_FMA4, 0);
12750 SHARE_BUILTIN (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, 0,
12751 OPTION_MASK_ISA2_AVXVNNI);
12752 SHARE_BUILTIN (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL, 0, 0,
12753 OPTION_MASK_ISA2_AVXIFMA);
12754 SHARE_BUILTIN (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512BF16, 0,
12755 OPTION_MASK_ISA2_AVXNECONVERT);
12756 SHARE_BUILTIN (OPTION_MASK_ISA_AES, 0, OPTION_MASK_ISA_AVX512VL,
12757 OPTION_MASK_ISA2_VAES);
12758 isa = tmp_isa;
12759 isa2 = tmp_isa2;
12760
12761 if ((bisa & OPTION_MASK_ISA_MMX) && !TARGET_MMX && TARGET_MMX_WITH_SSE
12762 /* __builtin_ia32_maskmovq requires MMX registers. */
12763 && fcode != IX86_BUILTIN_MASKMOVQ)
12764 {
12765 bisa &= ~OPTION_MASK_ISA_MMX;
12766 bisa |= OPTION_MASK_ISA_SSE2;
12767 }
12768
12769 if (pbisa)
12770 *pbisa = bisa;
12771 if (pbisa2)
12772 *pbisa2 = bisa2;
12773
12774 return (bisa & isa) == bisa && (bisa2 & isa2) == bisa2;
12775 }
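
/* Worked example (illustrative) of the sharing logic above: a VNNI
   builtin recorded with
     bisa  = OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL
     bisa2 = OPTION_MASK_ISA2_AVXVNNI
   is accepted when compiling with just -mavxvnni: isa2 then contains
   AVXVNNI, so the corresponding SHARE_BUILTIN invocation ORs both
   alternatives into tmp_isa/tmp_isa2 and the final containment check
   (bisa & isa) == bisa && (bisa2 & isa2) == bisa2 succeeds even though
   the AVX512 bits are not enabled.  */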
12776
12777 /* Emit instructions to set the carry flag from ARG. */
12778
12779 void
12780 ix86_expand_carry (rtx arg)
12781 {
12782 if (!CONST_INT_P (arg) || arg == const0_rtx)
12783 {
12784 arg = convert_to_mode (QImode, arg, 1);
12785 arg = copy_to_mode_reg (QImode, arg);
12786 emit_insn (gen_addqi3_cconly_overflow (arg, constm1_rtx));
12787 }
12788 else
12789 emit_insn (gen_x86_stc ());
12790 }
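
/* Illustrative sketch (assumed caller): when an add-with-carry builtin
   is expanded, a compile-time non-zero carry-in takes the cheap path
   above and emits "stc"; anything else is narrowed to QImode and added
   to -1, which sets CF exactly when the argument was non-zero.

     #include <immintrin.h>
     unsigned char
     add_with_carry (unsigned int a, unsigned int b, unsigned int *out)
     {
       return _addcarry_u32 (1, a, b, out);   // constant carry-in of 1
     }
 */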
12791
12792 /* Expand an expression EXP that calls a built-in function,
12793 with result going to TARGET if that's convenient
12794 (and in mode MODE if that's convenient).
12795 SUBTARGET may be used as the target for computing one of EXP's operands.
12796 IGNORE is nonzero if the value is to be ignored. */
12797
12798 rtx
12799 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
12800 machine_mode mode, int ignore)
12801 {
12802 size_t i;
12803 enum insn_code icode, icode2;
12804 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
12805 tree arg0, arg1, arg2, arg3, arg4;
12806 rtx op0, op1, op2, op3, op4, pat, pat2, insn;
12807 machine_mode mode0, mode1, mode2, mode3, mode4;
12808 unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
12809 HOST_WIDE_INT bisa, bisa2;
12810
12811 /* For CPU builtins that can be folded, fold first and expand the fold. */
12812 switch (fcode)
12813 {
12814 case IX86_BUILTIN_CPU_INIT:
12815 {
12816 /* Make it call __cpu_indicator_init in libgcc. */
12817 tree call_expr, fndecl, type;
12818 type = build_function_type_list (integer_type_node, NULL_TREE);
12819 fndecl = build_fn_decl ("__cpu_indicator_init", type);
12820 call_expr = build_call_expr (fndecl, 0);
12821 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
12822 }
12823 case IX86_BUILTIN_CPU_IS:
12824 case IX86_BUILTIN_CPU_SUPPORTS:
12825 {
12826 tree arg0 = CALL_EXPR_ARG (exp, 0);
12827 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
12828 gcc_assert (fold_expr != NULL_TREE);
12829 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
12830 }
12831 }
12832
12833 if (!ix86_check_builtin_isa_match (fcode, &bisa, &bisa2))
12834 {
12835 bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT;
12836 if (TARGET_ABI_X32)
12837 bisa |= OPTION_MASK_ABI_X32;
12838 else
12839 bisa |= OPTION_MASK_ABI_64;
12840 char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
12841 (enum fpmath_unit) 0,
12842 (enum prefer_vector_width) 0,
12843 PVW_NONE, PVW_NONE,
12844 false, add_abi_p);
12845 if (!opts)
12846 error ("%qE needs unknown isa option", fndecl);
12847 else
12848 {
12849 gcc_assert (opts != NULL);
12850 error ("%qE needs isa option %s", fndecl, opts);
12851 free (opts);
12852 }
12853 return expand_call (exp, target, ignore);
12854 }
12855
12856 switch (fcode)
12857 {
12858 case IX86_BUILTIN_MASKMOVQ:
12859 case IX86_BUILTIN_MASKMOVDQU:
12860 icode = (fcode == IX86_BUILTIN_MASKMOVQ
12861 ? CODE_FOR_mmx_maskmovq
12862 : CODE_FOR_sse2_maskmovdqu);
12863 /* Note the arg order is different from the operand order. */
12864 arg1 = CALL_EXPR_ARG (exp, 0);
12865 arg2 = CALL_EXPR_ARG (exp, 1);
12866 arg0 = CALL_EXPR_ARG (exp, 2);
12867 op0 = expand_normal (arg0);
12868 op1 = expand_normal (arg1);
12869 op2 = expand_normal (arg2);
12870 mode0 = insn_data[icode].operand[0].mode;
12871 mode1 = insn_data[icode].operand[1].mode;
12872 mode2 = insn_data[icode].operand[2].mode;
12873
12874 op0 = ix86_zero_extend_to_Pmode (op0);
12875 op0 = gen_rtx_MEM (mode1, op0);
12876
12877 if (!insn_data[icode].operand[0].predicate (op0, mode0))
12878 op0 = copy_to_mode_reg (mode0, op0);
12879 if (!insn_data[icode].operand[1].predicate (op1, mode1))
12880 op1 = copy_to_mode_reg (mode1, op1);
12881 if (!insn_data[icode].operand[2].predicate (op2, mode2))
12882 op2 = copy_to_mode_reg (mode2, op2);
12883 pat = GEN_FCN (icode) (op0, op1, op2);
12884 if (! pat)
12885 return 0;
12886 emit_insn (pat);
12887 return 0;
12888
12889 case IX86_BUILTIN_LDMXCSR:
12890 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
12891 target = assign_386_stack_local (SImode, SLOT_TEMP);
12892 emit_move_insn (target, op0);
12893 emit_insn (gen_sse_ldmxcsr (target));
12894 return 0;
12895
12896 case IX86_BUILTIN_STMXCSR:
12897 target = assign_386_stack_local (SImode, SLOT_TEMP);
12898 emit_insn (gen_sse_stmxcsr (target));
12899 return copy_to_mode_reg (SImode, target);
12900
12901 case IX86_BUILTIN_CLFLUSH:
12902 arg0 = CALL_EXPR_ARG (exp, 0);
12903 op0 = expand_normal (arg0);
12904 icode = CODE_FOR_sse2_clflush;
12905 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12906 op0 = ix86_zero_extend_to_Pmode (op0);
12907
12908 emit_insn (gen_sse2_clflush (op0));
12909 return 0;
12910
12911 case IX86_BUILTIN_CLWB:
12912 arg0 = CALL_EXPR_ARG (exp, 0);
12913 op0 = expand_normal (arg0);
12914 icode = CODE_FOR_clwb;
12915 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12916 op0 = ix86_zero_extend_to_Pmode (op0);
12917
12918 emit_insn (gen_clwb (op0));
12919 return 0;
12920
12921 case IX86_BUILTIN_CLFLUSHOPT:
12922 arg0 = CALL_EXPR_ARG (exp, 0);
12923 op0 = expand_normal (arg0);
12924 icode = CODE_FOR_clflushopt;
12925 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12926 op0 = ix86_zero_extend_to_Pmode (op0);
12927
12928 emit_insn (gen_clflushopt (op0));
12929 return 0;
12930
12931 case IX86_BUILTIN_MONITOR:
12932 case IX86_BUILTIN_MONITORX:
12933 arg0 = CALL_EXPR_ARG (exp, 0);
12934 arg1 = CALL_EXPR_ARG (exp, 1);
12935 arg2 = CALL_EXPR_ARG (exp, 2);
12936 op0 = expand_normal (arg0);
12937 op1 = expand_normal (arg1);
12938 op2 = expand_normal (arg2);
12939 if (!REG_P (op0))
12940 op0 = ix86_zero_extend_to_Pmode (op0);
12941 if (!REG_P (op1))
12942 op1 = copy_to_mode_reg (SImode, op1);
12943 if (!REG_P (op2))
12944 op2 = copy_to_mode_reg (SImode, op2);
12945
12946 emit_insn (fcode == IX86_BUILTIN_MONITOR
12947 ? gen_sse3_monitor (Pmode, op0, op1, op2)
12948 : gen_monitorx (Pmode, op0, op1, op2));
12949 return 0;
12950
12951 case IX86_BUILTIN_MWAIT:
12952 arg0 = CALL_EXPR_ARG (exp, 0);
12953 arg1 = CALL_EXPR_ARG (exp, 1);
12954 op0 = expand_normal (arg0);
12955 op1 = expand_normal (arg1);
12956 if (!REG_P (op0))
12957 op0 = copy_to_mode_reg (SImode, op0);
12958 if (!REG_P (op1))
12959 op1 = copy_to_mode_reg (SImode, op1);
12960 emit_insn (gen_sse3_mwait (op0, op1));
12961 return 0;
12962
12963 case IX86_BUILTIN_MWAITX:
12964 arg0 = CALL_EXPR_ARG (exp, 0);
12965 arg1 = CALL_EXPR_ARG (exp, 1);
12966 arg2 = CALL_EXPR_ARG (exp, 2);
12967 op0 = expand_normal (arg0);
12968 op1 = expand_normal (arg1);
12969 op2 = expand_normal (arg2);
12970 if (!REG_P (op0))
12971 op0 = copy_to_mode_reg (SImode, op0);
12972 if (!REG_P (op1))
12973 op1 = copy_to_mode_reg (SImode, op1);
12974 if (!REG_P (op2))
12975 op2 = copy_to_mode_reg (SImode, op2);
12976 emit_insn (gen_mwaitx (op0, op1, op2));
12977 return 0;
12978
12979 case IX86_BUILTIN_UMONITOR:
12980 arg0 = CALL_EXPR_ARG (exp, 0);
12981 op0 = expand_normal (arg0);
12982
12983 op0 = ix86_zero_extend_to_Pmode (op0);
12984 emit_insn (gen_umonitor (Pmode, op0));
12985 return 0;
12986
12987 case IX86_BUILTIN_UMWAIT:
12988 case IX86_BUILTIN_TPAUSE:
12989 arg0 = CALL_EXPR_ARG (exp, 0);
12990 arg1 = CALL_EXPR_ARG (exp, 1);
12991 op0 = expand_normal (arg0);
12992 op1 = expand_normal (arg1);
12993
12994 if (!REG_P (op0))
12995 op0 = copy_to_mode_reg (SImode, op0);
12996
12997 op1 = force_reg (DImode, op1);
12998
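/* Editorial note (illustrative): the 64-bit wait counter is split into
   two SImode halves because the rex64 patterns model the EDX:EAX input
   of UMWAIT/TPAUSE explicitly.  A hedged usage sketch, assuming the
   waitpkg intrinsics:
     unsigned char r = _tpause (ctrl, deadline);
   The QImode result materialized below is derived from the carry flag
   set by the instruction.  */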
12999 if (TARGET_64BIT)
13000 {
13001 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
13002 NULL, 1, OPTAB_DIRECT);
13003 switch (fcode)
13004 {
13005 case IX86_BUILTIN_UMWAIT:
13006 icode = CODE_FOR_umwait_rex64;
13007 break;
13008 case IX86_BUILTIN_TPAUSE:
13009 icode = CODE_FOR_tpause_rex64;
13010 break;
13011 default:
13012 gcc_unreachable ();
13013 }
13014
13015 op2 = gen_lowpart (SImode, op2);
13016 op1 = gen_lowpart (SImode, op1);
13017 pat = GEN_FCN (icode) (op0, op1, op2);
13018 }
13019 else
13020 {
13021 switch (fcode)
13022 {
13023 case IX86_BUILTIN_UMWAIT:
13024 icode = CODE_FOR_umwait;
13025 break;
13026 case IX86_BUILTIN_TPAUSE:
13027 icode = CODE_FOR_tpause;
13028 break;
13029 default:
13030 gcc_unreachable ();
13031 }
13032 pat = GEN_FCN (icode) (op0, op1);
13033 }
13034
13035 if (!pat)
13036 return 0;
13037
13038 emit_insn (pat);
13039
13040 if (target == 0
13041 || !register_operand (target, QImode))
13042 target = gen_reg_rtx (QImode);
13043
13044 pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
13045 const0_rtx);
13046 emit_insn (gen_rtx_SET (target, pat));
13047
13048 return target;
13049
13050 case IX86_BUILTIN_TESTUI:
13051 emit_insn (gen_testui ());
13052
13053 if (target == 0
13054 || !register_operand (target, QImode))
13055 target = gen_reg_rtx (QImode);
13056
13057 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
13058 const0_rtx);
13059 emit_insn (gen_rtx_SET (target, pat));
13060
13061 return target;
13062
13063 case IX86_BUILTIN_CLZERO:
13064 arg0 = CALL_EXPR_ARG (exp, 0);
13065 op0 = expand_normal (arg0);
13066 if (!REG_P (op0))
13067 op0 = ix86_zero_extend_to_Pmode (op0);
13068 emit_insn (gen_clzero (Pmode, op0));
13069 return 0;
13070
13071 case IX86_BUILTIN_CLDEMOTE:
13072 arg0 = CALL_EXPR_ARG (exp, 0);
13073 op0 = expand_normal (arg0);
13074 icode = CODE_FOR_cldemote;
13075 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
13076 op0 = ix86_zero_extend_to_Pmode (op0);
13077
13078 emit_insn (gen_cldemote (op0));
13079 return 0;
13080
13081 case IX86_BUILTIN_LOADIWKEY:
13082 {
13083 arg0 = CALL_EXPR_ARG (exp, 0);
13084 arg1 = CALL_EXPR_ARG (exp, 1);
13085 arg2 = CALL_EXPR_ARG (exp, 2);
13086 arg3 = CALL_EXPR_ARG (exp, 3);
13087
13088 op0 = expand_normal (arg0);
13089 op1 = expand_normal (arg1);
13090 op2 = expand_normal (arg2);
13091 op3 = expand_normal (arg3);
13092
13093 if (!REG_P (op0))
13094 op0 = copy_to_mode_reg (V2DImode, op0);
13095 if (!REG_P (op1))
13096 op1 = copy_to_mode_reg (V2DImode, op1);
13097 if (!REG_P (op2))
13098 op2 = copy_to_mode_reg (V2DImode, op2);
13099 if (!REG_P (op3))
13100 op3 = copy_to_mode_reg (SImode, op3);
13101
13102 emit_insn (gen_loadiwkey (op0, op1, op2, op3));
13103
13104 return 0;
13105 }
13106
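/* Editorial note (illustrative): the cases below back the Key Locker
   single-block intrinsics, e.g., assuming the usual wrappers,
     unsigned char ok = _mm_aesdec128kl_u8 (&out, in, handle);
   The result is taken from ZF; see the NB comment further down about
   clearing the output when the instruction reports a runtime error.  */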
13107 case IX86_BUILTIN_AESDEC128KLU8:
13108 icode = CODE_FOR_aesdec128klu8;
13109 goto aesdecenc_expand;
13110
13111 case IX86_BUILTIN_AESDEC256KLU8:
13112 icode = CODE_FOR_aesdec256klu8;
13113 goto aesdecenc_expand;
13114
13115 case IX86_BUILTIN_AESENC128KLU8:
13116 icode = CODE_FOR_aesenc128klu8;
13117 goto aesdecenc_expand;
13118
13119 case IX86_BUILTIN_AESENC256KLU8:
13120 icode = CODE_FOR_aesenc256klu8;
13121
13122 aesdecenc_expand:
13123
13124 arg0 = CALL_EXPR_ARG (exp, 0); // __m128i *odata
13125 arg1 = CALL_EXPR_ARG (exp, 1); // __m128i idata
13126 arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
13127
13128 op0 = expand_normal (arg0);
13129 op1 = expand_normal (arg1);
13130 op2 = expand_normal (arg2);
13131
13132 if (!address_operand (op0, V2DImode))
13133 {
13134 op0 = convert_memory_address (Pmode, op0);
13135 op0 = copy_addr_to_reg (op0);
13136 }
13137 op0 = gen_rtx_MEM (V2DImode, op0);
13138
13139 if (!REG_P (op1))
13140 op1 = copy_to_mode_reg (V2DImode, op1);
13141
13142 if (!address_operand (op2, VOIDmode))
13143 {
13144 op2 = convert_memory_address (Pmode, op2);
13145 op2 = copy_addr_to_reg (op2);
13146 }
13147 op2 = gen_rtx_MEM (BLKmode, op2);
13148
13149 emit_insn (GEN_FCN (icode) (op1, op1, op2));
13150
13151 if (target == 0)
13152 target = gen_reg_rtx (QImode);
13153
13154 /* NB: For the aesenc/aesdec keylocker insns, ZF is set when a runtime
13155 error occurs, and the output should then be cleared for safety. */
13156 rtx_code_label *ok_label;
13157 rtx tmp;
13158
13159 tmp = gen_rtx_REG (CCZmode, FLAGS_REG);
13160 pat = gen_rtx_EQ (QImode, tmp, const0_rtx);
13161 ok_label = gen_label_rtx ();
13162 emit_cmp_and_jump_insns (tmp, const0_rtx, NE, 0, GET_MODE (tmp),
13163 true, ok_label);
13164 /* The runtime error seldom occurs, so predict the OK path as the
13165 hot path and lay it out as the fallthrough block. */
13166 predict_jump (REG_BR_PROB_BASE * 90 / 100);
13167
13168 emit_insn (gen_rtx_SET (op1, const0_rtx));
13169
13170 emit_label (ok_label);
13171 emit_insn (gen_rtx_SET (target, pat));
13172 emit_insn (gen_rtx_SET (op0, op1));
13173
13174 return target;
13175
13176 case IX86_BUILTIN_AESDECWIDE128KLU8:
13177 icode = CODE_FOR_aesdecwide128klu8;
13178 goto wideaesdecenc_expand;
13179
13180 case IX86_BUILTIN_AESDECWIDE256KLU8:
13181 icode = CODE_FOR_aesdecwide256klu8;
13182 goto wideaesdecenc_expand;
13183
13184 case IX86_BUILTIN_AESENCWIDE128KLU8:
13185 icode = CODE_FOR_aesencwide128klu8;
13186 goto wideaesdecenc_expand;
13187
13188 case IX86_BUILTIN_AESENCWIDE256KLU8:
13189 icode = CODE_FOR_aesencwide256klu8;
13190
13191 wideaesdecenc_expand:
13192
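/* Editorial note (illustrative): the wide variants process eight blocks
   through the hard registers xmm0..xmm7, which are implicit operands of
   the insn, so the inputs are loaded into those registers up front and
   the results are copied back out afterwards.  */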
13193 rtx xmm_regs[8];
13194 rtx op;
13195
13196 arg0 = CALL_EXPR_ARG (exp, 0); // __m128i * odata
13197 arg1 = CALL_EXPR_ARG (exp, 1); // const __m128i * idata
13198 arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
13199
13200 op0 = expand_normal (arg0);
13201 op1 = expand_normal (arg1);
13202 op2 = expand_normal (arg2);
13203
13204 if (!address_operand (op2, VOIDmode))
13205 {
13206 op2 = convert_memory_address (Pmode, op2);
13207 op2 = copy_addr_to_reg (op2);
13208 }
13209 op2 = gen_rtx_MEM (BLKmode, op2);
13210
13211 for (i = 0; i < 8; i++)
13212 {
13213 xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
13214
13215 op = gen_rtx_MEM (V2DImode,
13216 plus_constant (Pmode, op1, (i * 16)));
13217
13218 emit_move_insn (xmm_regs[i], op);
13219 }
13220
13221 emit_insn (GEN_FCN (icode) (op2));
13222
13223 if (target == 0)
13224 target = gen_reg_rtx (QImode);
13225
13226 tmp = gen_rtx_REG (CCZmode, FLAGS_REG);
13227 pat = gen_rtx_EQ (QImode, tmp, const0_rtx);
13228 ok_label = gen_label_rtx ();
13229 emit_cmp_and_jump_insns (tmp, const0_rtx, NE, 0, GET_MODE (tmp),
13230 true, ok_label);
13231 predict_jump (REG_BR_PROB_BASE * 90 / 100);
13232
13233 for (i = 0; i < 8; i++)
13234 emit_insn (gen_rtx_SET (xmm_regs[i], const0_rtx));
13235
13236 emit_label (ok_label);
13237 emit_insn (gen_rtx_SET (target, pat));
13238
13239 for (i = 0; i < 8; i++)
13240 {
13241 op = gen_rtx_MEM (V2DImode,
13242 plus_constant (Pmode, op0, (i * 16)));
13243 emit_move_insn (op, xmm_regs[i]);
13244 }
13245
13246 return target;
13247
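/* Editorial note (illustrative): ENCODEKEY128/256 likewise use fixed
   xmm registers for the produced handle.  A hedged usage sketch:
     unsigned int htype_out = _mm_encodekey128_u32 (htype, key, h);
   where h receives the handle copied out of xmm0..xmm2 below (xmm0..xmm3
   for the 256-bit form).  */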
13248 case IX86_BUILTIN_ENCODEKEY128U32:
13249 {
13250 rtx op, xmm_regs[7];
13251
13252 arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
13253 arg1 = CALL_EXPR_ARG (exp, 1); // __m128i key
13254 arg2 = CALL_EXPR_ARG (exp, 2); // void *h
13255
13256 op0 = expand_normal (arg0);
13257 op1 = expand_normal (arg1);
13258 op2 = expand_normal (arg2);
13259
13260 if (!REG_P (op0))
13261 op0 = copy_to_mode_reg (SImode, op0);
13262
13263 op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
13264 emit_move_insn (op, op1);
13265
13266 for (i = 0; i < 3; i++)
13267 xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
13268
13269 if (target == 0)
13270 target = gen_reg_rtx (SImode);
13271
13272 emit_insn (gen_encodekey128u32 (target, op0));
13273
13274 for (i = 0; i < 3; i++)
13275 {
13276 op = gen_rtx_MEM (V2DImode,
13277 plus_constant (Pmode, op2, (i * 16)));
13278 emit_move_insn (op, xmm_regs[i]);
13279 }
13280
13281 return target;
13282 }
13283 case IX86_BUILTIN_ENCODEKEY256U32:
13284 {
13285 rtx op, xmm_regs[7];
13286
13287 arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
13288 arg1 = CALL_EXPR_ARG (exp, 1); // __m128i keylow
13289 arg2 = CALL_EXPR_ARG (exp, 2); // __m128i keyhi
13290 arg3 = CALL_EXPR_ARG (exp, 3); // void *h
13291
13292 op0 = expand_normal (arg0);
13293 op1 = expand_normal (arg1);
13294 op2 = expand_normal (arg2);
13295 op3 = expand_normal (arg3);
13296
13297 if (!REG_P (op0))
13298 op0 = copy_to_mode_reg (SImode, op0);
13299
13300 /* Force keylow and keyhi into xmm0 and xmm1. */
13301 op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
13302 emit_move_insn (op, op1);
13303 op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (1));
13304 emit_move_insn (op, op2);
13305
13306 for (i = 0; i < 4; i++)
13307 xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
13308
13309 if (target == 0)
13310 target = gen_reg_rtx (SImode);
13311
13312 emit_insn (gen_encodekey256u32 (target, op0));
13313
13314 for (i = 0; i < 4; i++)
13315 {
13316 op = gen_rtx_MEM (V2DImode,
13317 plus_constant (Pmode, op3, (i * 16)));
13318 emit_move_insn (op, xmm_regs[i]);
13319 }
13320
13321 return target;
13322 }
13323
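/* Editorial note (illustrative): this expands the generic
   __builtin_ia32_prefetch (addr, rw, locality, kind) form; a fourth
   argument of 1 selects instruction prefetch (PREFETCHI), anything
   else falls through to the ordinary data-prefetch path below.  */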
13324 case IX86_BUILTIN_PREFETCH:
13325 {
13326 arg0 = CALL_EXPR_ARG (exp, 0); // const void *
13327 arg1 = CALL_EXPR_ARG (exp, 1); // const int
13328 arg2 = CALL_EXPR_ARG (exp, 2); // const int
13329 arg3 = CALL_EXPR_ARG (exp, 3); // const int
13330
13331 op0 = expand_normal (arg0);
13332 op1 = expand_normal (arg1);
13333 op2 = expand_normal (arg2);
13334 op3 = expand_normal (arg3);
13335
13336 if (!CONST_INT_P (op1) || !CONST_INT_P (op2) || !CONST_INT_P (op3))
13337 {
13338 error ("second, third and fourth argument must be a const");
13339 return const0_rtx;
13340 }
13341
13342 if (INTVAL (op3) == 1)
13343 {
13344 if (INTVAL (op2) < 2 || INTVAL (op2) > 3)
13345 {
13346 error ("invalid third argument");
13347 return const0_rtx;
13348 }
13349
13350 if (TARGET_64BIT && TARGET_PREFETCHI
13351 && local_func_symbolic_operand (op0, GET_MODE (op0)))
13352 emit_insn (gen_prefetchi (op0, op2));
13353 else
13354 {
13355 warning (0, "instruction prefetch applies when in 64-bit mode"
13356 " with RIP-relative addressing and"
13357 " option %<-mprefetchi%>;"
13358 " they stay NOPs otherwise");
13359 emit_insn (gen_nop ());
13360 }
13361 }
13362 else
13363 {
13364 if (!address_operand (op0, VOIDmode))
13365 {
13366 op0 = convert_memory_address (Pmode, op0);
13367 op0 = copy_addr_to_reg (op0);
13368 }
13369
13370 if (INTVAL (op2) < 0 || INTVAL (op2) > 3)
13371 {
13372 warning (0, "invalid third argument to %<__builtin_ia32_prefetch%>; using zero");
13373 op2 = const0_rtx;
13374 }
13375
13376 if (TARGET_3DNOW || TARGET_PREFETCH_SSE
13377 || TARGET_PRFCHW || TARGET_PREFETCHWT1)
13378 emit_insn (gen_prefetch (op0, op1, op2));
13379 else if (!MEM_P (op0) && side_effects_p (op0))
13380 /* Don't do anything with direct references to volatile memory,
13381 but generate code to handle other side effects. */
13382 emit_insn (op0);
13383 }
13384
13385 return 0;
13386 }
13387
13388 case IX86_BUILTIN_PREFETCHI:
13389 {
13390 arg0 = CALL_EXPR_ARG (exp, 0); // const void *
13391 arg1 = CALL_EXPR_ARG (exp, 1); // const int
13392
13393 op0 = expand_normal (arg0);
13394 op1 = expand_normal (arg1);
13395
13396 if (!CONST_INT_P (op1))
13397 {
13398 error ("second argument must be a const");
13399 return const0_rtx;
13400 }
13401
13402 /* GOT/PLT_PIC addresses are not suitable for instruction prefetch;
13403 the operand must be a real instruction address. */
13404 if (TARGET_64BIT
13405 && local_func_symbolic_operand (op0, GET_MODE (op0)))
13406 emit_insn (gen_prefetchi (op0, op1));
13407 else
13408 {
13409 /* Ignore the hint. */
13410 warning (0, "instruction prefetch applies when in 64-bit mode"
13411 " with RIP-relative addressing and"
13412 " option %<-mprefetchi%>;"
13413 " they stay NOPs otherwise");
13414 emit_insn (gen_nop ());
13415 }
13416
13417 return 0;
13418 }
13419
13420 case IX86_BUILTIN_VEC_INIT_V2SI:
13421 case IX86_BUILTIN_VEC_INIT_V4HI:
13422 case IX86_BUILTIN_VEC_INIT_V8QI:
13423 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
13424
13425 case IX86_BUILTIN_VEC_EXT_V2DF:
13426 case IX86_BUILTIN_VEC_EXT_V2DI:
13427 case IX86_BUILTIN_VEC_EXT_V4SF:
13428 case IX86_BUILTIN_VEC_EXT_V4SI:
13429 case IX86_BUILTIN_VEC_EXT_V8HI:
13430 case IX86_BUILTIN_VEC_EXT_V2SI:
13431 case IX86_BUILTIN_VEC_EXT_V4HI:
13432 case IX86_BUILTIN_VEC_EXT_V16QI:
13433 return ix86_expand_vec_ext_builtin (exp, target);
13434
13435 case IX86_BUILTIN_VEC_SET_V2DI:
13436 case IX86_BUILTIN_VEC_SET_V4SF:
13437 case IX86_BUILTIN_VEC_SET_V4SI:
13438 case IX86_BUILTIN_VEC_SET_V8HI:
13439 case IX86_BUILTIN_VEC_SET_V4HI:
13440 case IX86_BUILTIN_VEC_SET_V16QI:
13441 return ix86_expand_vec_set_builtin (exp);
13442
13443 case IX86_BUILTIN_NANQ:
13444 case IX86_BUILTIN_NANSQ:
13445 return expand_call (exp, target, ignore);
13446
13447 case IX86_BUILTIN_RDPID:
13448
13449 op0 = gen_reg_rtx (word_mode);
13450
13451 if (TARGET_64BIT)
13452 {
13453 insn = gen_rdpid_rex64 (op0);
13454 op0 = convert_to_mode (SImode, op0, 1);
13455 }
13456 else
13457 insn = gen_rdpid (op0);
13458
13459 emit_insn (insn);
13460
13461 if (target == 0
13462 || !register_operand (target, SImode))
13463 target = gen_reg_rtx (SImode);
13464
13465 emit_move_insn (target, op0);
13466 return target;
13467
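/* Editorial note (illustrative): VP2INTERSECT produces a pair of mask
   registers, modelled here as a single P2HImode/P2QImode value whose
   low and high parts are stored through the two mask pointers.  A
   hedged usage sketch, assuming the usual intrinsic:
     _mm512_2intersect_epi32 (a, b, &m1, &m2);  */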
13468 case IX86_BUILTIN_2INTERSECTD512:
13469 case IX86_BUILTIN_2INTERSECTQ512:
13470 case IX86_BUILTIN_2INTERSECTD256:
13471 case IX86_BUILTIN_2INTERSECTQ256:
13472 case IX86_BUILTIN_2INTERSECTD128:
13473 case IX86_BUILTIN_2INTERSECTQ128:
13474 arg0 = CALL_EXPR_ARG (exp, 0);
13475 arg1 = CALL_EXPR_ARG (exp, 1);
13476 arg2 = CALL_EXPR_ARG (exp, 2);
13477 arg3 = CALL_EXPR_ARG (exp, 3);
13478 op0 = expand_normal (arg0);
13479 op1 = expand_normal (arg1);
13480 op2 = expand_normal (arg2);
13481 op3 = expand_normal (arg3);
13482
13483 if (!address_operand (op0, VOIDmode))
13484 {
13485 op0 = convert_memory_address (Pmode, op0);
13486 op0 = copy_addr_to_reg (op0);
13487 }
13488 if (!address_operand (op1, VOIDmode))
13489 {
13490 op1 = convert_memory_address (Pmode, op1);
13491 op1 = copy_addr_to_reg (op1);
13492 }
13493
13494 switch (fcode)
13495 {
13496 case IX86_BUILTIN_2INTERSECTD512:
13497 mode4 = P2HImode;
13498 icode = CODE_FOR_avx512vp2intersect_2intersectv16si;
13499 break;
13500 case IX86_BUILTIN_2INTERSECTQ512:
13501 mode4 = P2QImode;
13502 icode = CODE_FOR_avx512vp2intersect_2intersectv8di;
13503 break;
13504 case IX86_BUILTIN_2INTERSECTD256:
13505 mode4 = P2QImode;
13506 icode = CODE_FOR_avx512vp2intersect_2intersectv8si;
13507 break;
13508 case IX86_BUILTIN_2INTERSECTQ256:
13509 mode4 = P2QImode;
13510 icode = CODE_FOR_avx512vp2intersect_2intersectv4di;
13511 break;
13512 case IX86_BUILTIN_2INTERSECTD128:
13513 mode4 = P2QImode;
13514 icode = CODE_FOR_avx512vp2intersect_2intersectv4si;
13515 break;
13516 case IX86_BUILTIN_2INTERSECTQ128:
13517 mode4 = P2QImode;
13518 icode = CODE_FOR_avx512vp2intersect_2intersectv2di;
13519 break;
13520 default:
13521 gcc_unreachable ();
13522 }
13523
13524 mode2 = insn_data[icode].operand[1].mode;
13525 mode3 = insn_data[icode].operand[2].mode;
13526 if (!insn_data[icode].operand[1].predicate (op2, mode2))
13527 op2 = copy_to_mode_reg (mode2, op2);
13528 if (!insn_data[icode].operand[2].predicate (op3, mode3))
13529 op3 = copy_to_mode_reg (mode3, op3);
13530
13531 op4 = gen_reg_rtx (mode4);
13532 emit_insn (GEN_FCN (icode) (op4, op2, op3));
13533 mode0 = mode4 == P2HImode ? HImode : QImode;
13534 emit_move_insn (gen_rtx_MEM (mode0, op0),
13535 gen_lowpart (mode0, op4));
13536 emit_move_insn (gen_rtx_MEM (mode0, op1),
13537 gen_highpart (mode0, op4));
13538
13539 return 0;
13540
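/* Editorial note (illustrative): these builtins assemble a 64-bit
   result from the EDX:EAX halves on 64-bit targets (see the shift/IOR
   below); RDTSCP additionally stores the TSC_AUX value through the
   pointer argument.  A hedged usage sketch:
     unsigned long long t = __rdtsc ();  */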
13541 case IX86_BUILTIN_RDPMC:
13542 case IX86_BUILTIN_RDTSC:
13543 case IX86_BUILTIN_RDTSCP:
13544 case IX86_BUILTIN_XGETBV:
13545
13546 op0 = gen_reg_rtx (DImode);
13547 op1 = gen_reg_rtx (DImode);
13548
13549 if (fcode == IX86_BUILTIN_RDPMC)
13550 {
13551 arg0 = CALL_EXPR_ARG (exp, 0);
13552 op2 = expand_normal (arg0);
13553 if (!register_operand (op2, SImode))
13554 op2 = copy_to_mode_reg (SImode, op2);
13555
13556 insn = (TARGET_64BIT
13557 ? gen_rdpmc_rex64 (op0, op1, op2)
13558 : gen_rdpmc (op0, op2));
13559 emit_insn (insn);
13560 }
13561 else if (fcode == IX86_BUILTIN_XGETBV)
13562 {
13563 arg0 = CALL_EXPR_ARG (exp, 0);
13564 op2 = expand_normal (arg0);
13565 if (!register_operand (op2, SImode))
13566 op2 = copy_to_mode_reg (SImode, op2);
13567
13568 insn = (TARGET_64BIT
13569 ? gen_xgetbv_rex64 (op0, op1, op2)
13570 : gen_xgetbv (op0, op2));
13571 emit_insn (insn);
13572 }
13573 else if (fcode == IX86_BUILTIN_RDTSC)
13574 {
13575 insn = (TARGET_64BIT
13576 ? gen_rdtsc_rex64 (op0, op1)
13577 : gen_rdtsc (op0));
13578 emit_insn (insn);
13579 }
13580 else
13581 {
13582 op2 = gen_reg_rtx (SImode);
13583
13584 insn = (TARGET_64BIT
13585 ? gen_rdtscp_rex64 (op0, op1, op2)
13586 : gen_rdtscp (op0, op2));
13587 emit_insn (insn);
13588
13589 arg0 = CALL_EXPR_ARG (exp, 0);
13590 op4 = expand_normal (arg0);
13591 if (!address_operand (op4, VOIDmode))
13592 {
13593 op4 = convert_memory_address (Pmode, op4);
13594 op4 = copy_addr_to_reg (op4);
13595 }
13596 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
13597 }
13598
13599 if (target == 0
13600 || !register_operand (target, DImode))
13601 target = gen_reg_rtx (DImode);
13602
13603 if (TARGET_64BIT)
13604 {
13605 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
13606 op1, 1, OPTAB_DIRECT);
13607 op0 = expand_simple_binop (DImode, IOR, op0, op1,
13608 op0, 1, OPTAB_DIRECT);
13609 }
13610
13611 emit_move_insn (target, op0);
13612 return target;
13613
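/* Editorial note (illustrative): MOVDIR64B copies a 64-byte block and
   returns nothing, while ENQCMD/ENQCMDS report success via ZF, which is
   turned into the QImode return value below.  A hedged usage sketch,
   assuming the enqcmd intrinsic:
     int retry = _enqcmd (dst_portal, src_desc);  */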
13614 case IX86_BUILTIN_ENQCMD:
13615 case IX86_BUILTIN_ENQCMDS:
13616 case IX86_BUILTIN_MOVDIR64B:
13617
13618 arg0 = CALL_EXPR_ARG (exp, 0);
13619 arg1 = CALL_EXPR_ARG (exp, 1);
13620 op0 = expand_normal (arg0);
13621 op1 = expand_normal (arg1);
13622
13623 op0 = ix86_zero_extend_to_Pmode (op0);
13624 if (!address_operand (op1, VOIDmode))
13625 {
13626 op1 = convert_memory_address (Pmode, op1);
13627 op1 = copy_addr_to_reg (op1);
13628 }
13629 op1 = gen_rtx_MEM (XImode, op1);
13630
13631 if (fcode == IX86_BUILTIN_MOVDIR64B)
13632 {
13633 emit_insn (gen_movdir64b (Pmode, op0, op1));
13634 return 0;
13635 }
13636 else
13637 {
13638 if (target == 0
13639 || !register_operand (target, SImode))
13640 target = gen_reg_rtx (SImode);
13641
13642 emit_move_insn (target, const0_rtx);
13643 target = gen_rtx_SUBREG (QImode, target, 0);
13644
13645 int unspecv = (fcode == IX86_BUILTIN_ENQCMD
13646 ? UNSPECV_ENQCMD
13647 : UNSPECV_ENQCMDS);
13648 icode = code_for_enqcmd (unspecv, Pmode);
13649 emit_insn (GEN_FCN (icode) (op0, op1));
13650
13651 emit_insn
13652 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
13653 gen_rtx_fmt_ee (EQ, QImode,
13654 gen_rtx_REG (CCZmode, FLAGS_REG),
13655 const0_rtx)));
13656 return SUBREG_REG (target);
13657 }
13658
13659 case IX86_BUILTIN_FXSAVE:
13660 case IX86_BUILTIN_FXRSTOR:
13661 case IX86_BUILTIN_FXSAVE64:
13662 case IX86_BUILTIN_FXRSTOR64:
13663 case IX86_BUILTIN_FNSTENV:
13664 case IX86_BUILTIN_FLDENV:
13665 mode0 = BLKmode;
13666 switch (fcode)
13667 {
13668 case IX86_BUILTIN_FXSAVE:
13669 icode = CODE_FOR_fxsave;
13670 break;
13671 case IX86_BUILTIN_FXRSTOR:
13672 icode = CODE_FOR_fxrstor;
13673 break;
13674 case IX86_BUILTIN_FXSAVE64:
13675 icode = CODE_FOR_fxsave64;
13676 break;
13677 case IX86_BUILTIN_FXRSTOR64:
13678 icode = CODE_FOR_fxrstor64;
13679 break;
13680 case IX86_BUILTIN_FNSTENV:
13681 icode = CODE_FOR_fnstenv;
13682 break;
13683 case IX86_BUILTIN_FLDENV:
13684 icode = CODE_FOR_fldenv;
13685 break;
13686 default:
13687 gcc_unreachable ();
13688 }
13689
13690 arg0 = CALL_EXPR_ARG (exp, 0);
13691 op0 = expand_normal (arg0);
13692
13693 if (!address_operand (op0, VOIDmode))
13694 {
13695 op0 = convert_memory_address (Pmode, op0);
13696 op0 = copy_addr_to_reg (op0);
13697 }
13698 op0 = gen_rtx_MEM (mode0, op0);
13699
13700 pat = GEN_FCN (icode) (op0);
13701 if (pat)
13702 emit_insn (pat);
13703 return 0;
13704
13705 case IX86_BUILTIN_XSETBV:
13706 arg0 = CALL_EXPR_ARG (exp, 0);
13707 arg1 = CALL_EXPR_ARG (exp, 1);
13708 op0 = expand_normal (arg0);
13709 op1 = expand_normal (arg1);
13710
13711 if (!REG_P (op0))
13712 op0 = copy_to_mode_reg (SImode, op0);
13713
13714 op1 = force_reg (DImode, op1);
13715
13716 if (TARGET_64BIT)
13717 {
13718 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
13719 NULL, 1, OPTAB_DIRECT);
13720
13721 icode = CODE_FOR_xsetbv_rex64;
13722
13723 op2 = gen_lowpart (SImode, op2);
13724 op1 = gen_lowpart (SImode, op1);
13725 pat = GEN_FCN (icode) (op0, op1, op2);
13726 }
13727 else
13728 {
13729 icode = CODE_FOR_xsetbv;
13730
13731 pat = GEN_FCN (icode) (op0, op1);
13732 }
13733 if (pat)
13734 emit_insn (pat);
13735 return 0;
13736
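/* Editorial note (illustrative): for the XSAVE family the 64-bit
   feature mask is split into EDX:EAX as the instructions require; a
   hedged usage sketch, assuming the xsave intrinsic:
     _xsave (buf, mask);  */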
13737 case IX86_BUILTIN_XSAVE:
13738 case IX86_BUILTIN_XRSTOR:
13739 case IX86_BUILTIN_XSAVE64:
13740 case IX86_BUILTIN_XRSTOR64:
13741 case IX86_BUILTIN_XSAVEOPT:
13742 case IX86_BUILTIN_XSAVEOPT64:
13743 case IX86_BUILTIN_XSAVES:
13744 case IX86_BUILTIN_XRSTORS:
13745 case IX86_BUILTIN_XSAVES64:
13746 case IX86_BUILTIN_XRSTORS64:
13747 case IX86_BUILTIN_XSAVEC:
13748 case IX86_BUILTIN_XSAVEC64:
13749 arg0 = CALL_EXPR_ARG (exp, 0);
13750 arg1 = CALL_EXPR_ARG (exp, 1);
13751 op0 = expand_normal (arg0);
13752 op1 = expand_normal (arg1);
13753
13754 if (!address_operand (op0, VOIDmode))
13755 {
13756 op0 = convert_memory_address (Pmode, op0);
13757 op0 = copy_addr_to_reg (op0);
13758 }
13759 op0 = gen_rtx_MEM (BLKmode, op0);
13760
13761 op1 = force_reg (DImode, op1);
13762
13763 if (TARGET_64BIT)
13764 {
13765 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
13766 NULL, 1, OPTAB_DIRECT);
13767 switch (fcode)
13768 {
13769 case IX86_BUILTIN_XSAVE:
13770 icode = CODE_FOR_xsave_rex64;
13771 break;
13772 case IX86_BUILTIN_XRSTOR:
13773 icode = CODE_FOR_xrstor_rex64;
13774 break;
13775 case IX86_BUILTIN_XSAVE64:
13776 icode = CODE_FOR_xsave64;
13777 break;
13778 case IX86_BUILTIN_XRSTOR64:
13779 icode = CODE_FOR_xrstor64;
13780 break;
13781 case IX86_BUILTIN_XSAVEOPT:
13782 icode = CODE_FOR_xsaveopt_rex64;
13783 break;
13784 case IX86_BUILTIN_XSAVEOPT64:
13785 icode = CODE_FOR_xsaveopt64;
13786 break;
13787 case IX86_BUILTIN_XSAVES:
13788 icode = CODE_FOR_xsaves_rex64;
13789 break;
13790 case IX86_BUILTIN_XRSTORS:
13791 icode = CODE_FOR_xrstors_rex64;
13792 break;
13793 case IX86_BUILTIN_XSAVES64:
13794 icode = CODE_FOR_xsaves64;
13795 break;
13796 case IX86_BUILTIN_XRSTORS64:
13797 icode = CODE_FOR_xrstors64;
13798 break;
13799 case IX86_BUILTIN_XSAVEC:
13800 icode = CODE_FOR_xsavec_rex64;
13801 break;
13802 case IX86_BUILTIN_XSAVEC64:
13803 icode = CODE_FOR_xsavec64;
13804 break;
13805 default:
13806 gcc_unreachable ();
13807 }
13808
13809 op2 = gen_lowpart (SImode, op2);
13810 op1 = gen_lowpart (SImode, op1);
13811 pat = GEN_FCN (icode) (op0, op1, op2);
13812 }
13813 else
13814 {
13815 switch (fcode)
13816 {
13817 case IX86_BUILTIN_XSAVE:
13818 icode = CODE_FOR_xsave;
13819 break;
13820 case IX86_BUILTIN_XRSTOR:
13821 icode = CODE_FOR_xrstor;
13822 break;
13823 case IX86_BUILTIN_XSAVEOPT:
13824 icode = CODE_FOR_xsaveopt;
13825 break;
13826 case IX86_BUILTIN_XSAVES:
13827 icode = CODE_FOR_xsaves;
13828 break;
13829 case IX86_BUILTIN_XRSTORS:
13830 icode = CODE_FOR_xrstors;
13831 break;
13832 case IX86_BUILTIN_XSAVEC:
13833 icode = CODE_FOR_xsavec;
13834 break;
13835 default:
13836 gcc_unreachable ();
13837 }
13838 pat = GEN_FCN (icode) (op0, op1);
13839 }
13840
13841 if (pat)
13842 emit_insn (pat);
13843 return 0;
13844
13845 case IX86_BUILTIN_LLWPCB:
13846 arg0 = CALL_EXPR_ARG (exp, 0);
13847 op0 = expand_normal (arg0);
13848
13849 if (!register_operand (op0, Pmode))
13850 op0 = ix86_zero_extend_to_Pmode (op0);
13851 emit_insn (gen_lwp_llwpcb (Pmode, op0));
13852 return 0;
13853
13854 case IX86_BUILTIN_SLWPCB:
13855 if (!target
13856 || !register_operand (target, Pmode))
13857 target = gen_reg_rtx (Pmode);
13858 emit_insn (gen_lwp_slwpcb (Pmode, target));
13859 return target;
13860
13861 case IX86_BUILTIN_LWPVAL32:
13862 case IX86_BUILTIN_LWPVAL64:
13863 case IX86_BUILTIN_LWPINS32:
13864 case IX86_BUILTIN_LWPINS64:
13865 mode = ((fcode == IX86_BUILTIN_LWPVAL32
13866 || fcode == IX86_BUILTIN_LWPINS32)
13867 ? SImode : DImode);
13868
13869 if (fcode == IX86_BUILTIN_LWPVAL32
13870 || fcode == IX86_BUILTIN_LWPVAL64)
13871 icode = code_for_lwp_lwpval (mode);
13872 else
13873 icode = code_for_lwp_lwpins (mode);
13874
13875 arg0 = CALL_EXPR_ARG (exp, 0);
13876 arg1 = CALL_EXPR_ARG (exp, 1);
13877 arg2 = CALL_EXPR_ARG (exp, 2);
13878 op0 = expand_normal (arg0);
13879 op1 = expand_normal (arg1);
13880 op2 = expand_normal (arg2);
13881 mode0 = insn_data[icode].operand[0].mode;
13882
13883 if (!insn_data[icode].operand[0].predicate (op0, mode0))
13884 op0 = copy_to_mode_reg (mode0, op0);
13885 if (!insn_data[icode].operand[1].predicate (op1, SImode))
13886 op1 = copy_to_mode_reg (SImode, op1);
13887
13888 if (!CONST_INT_P (op2))
13889 {
13890 error ("the last argument must be a 32-bit immediate");
13891 return const0_rtx;
13892 }
13893
13894 emit_insn (GEN_FCN (icode) (op0, op1, op2));
13895
13896 if (fcode == IX86_BUILTIN_LWPINS32
13897 || fcode == IX86_BUILTIN_LWPINS64)
13898 {
13899 if (target == 0
13900 || !nonimmediate_operand (target, QImode))
13901 target = gen_reg_rtx (QImode);
13902
13903 pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
13904 const0_rtx);
13905 emit_insn (gen_rtx_SET (target, pat));
13906
13907 return target;
13908 }
13909 else
13910 return 0;
13911
13912 case IX86_BUILTIN_BEXTRI32:
13913 case IX86_BUILTIN_BEXTRI64:
13914 mode = (fcode == IX86_BUILTIN_BEXTRI32 ? SImode : DImode);
13915
13916 arg0 = CALL_EXPR_ARG (exp, 0);
13917 arg1 = CALL_EXPR_ARG (exp, 1);
13918 op0 = expand_normal (arg0);
13919 op1 = expand_normal (arg1);
13920
13921 if (!CONST_INT_P (op1))
13922 {
13923 error ("last argument must be an immediate");
13924 return const0_rtx;
13925 }
13926 else
13927 {
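/* Editorial note (illustrative): the TBM BEXTRI immediate packs the
   start bit in bits 0-7 and the field length in bits 8-15, so a call
   like __bextri_u32 (x, (len << 8) | lsb) is decoded here and folded
   to zero when the field is empty or starts past the operand width.  */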
13928 unsigned char lsb_index = UINTVAL (op1);
13929 unsigned char length = UINTVAL (op1) >> 8;
13930
13931 unsigned char bitsize = GET_MODE_BITSIZE (mode);
13932
13933 icode = code_for_tbm_bextri (mode);
13934
13935 mode1 = insn_data[icode].operand[1].mode;
13936 if (!insn_data[icode].operand[1].predicate (op0, mode1))
13937 op0 = copy_to_mode_reg (mode1, op0);
13938
13939 mode0 = insn_data[icode].operand[0].mode;
13940 if (target == 0
13941 || !register_operand (target, mode0))
13942 target = gen_reg_rtx (mode0);
13943
13944 if (length == 0 || lsb_index >= bitsize)
13945 {
13946 emit_move_insn (target, const0_rtx);
13947 return target;
13948 }
13949
13950 if (length + lsb_index > bitsize)
13951 length = bitsize - lsb_index;
13952
13953 op1 = GEN_INT (length);
13954 op2 = GEN_INT (lsb_index);
13955
13956 emit_insn (GEN_FCN (icode) (target, op0, op1, op2));
13957 return target;
13958 }
13959
13960 case IX86_BUILTIN_RDRAND16_STEP:
13961 mode = HImode;
13962 goto rdrand_step;
13963
13964 case IX86_BUILTIN_RDRAND32_STEP:
13965 mode = SImode;
13966 goto rdrand_step;
13967
13968 case IX86_BUILTIN_RDRAND64_STEP:
13969 mode = DImode;
13970
13971 rdrand_step:
13972 arg0 = CALL_EXPR_ARG (exp, 0);
13973 op1 = expand_normal (arg0);
13974 if (!address_operand (op1, VOIDmode))
13975 {
13976 op1 = convert_memory_address (Pmode, op1);
13977 op1 = copy_addr_to_reg (op1);
13978 }
13979
13980 op0 = gen_reg_rtx (mode);
13981 emit_insn (gen_rdrand (mode, op0));
13982
13983 emit_move_insn (gen_rtx_MEM (mode, op1), op0);
13984
13985 op1 = force_reg (SImode, const1_rtx);
13986
13987 /* Emit SImode conditional move. */
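/* Editorial note (illustrative): per the SDM, RDRAND clears the
   destination register when CF is 0, so selecting the (zero-extended)
   result when the carry is clear and the constant 1 otherwise yields
   the usual 0/1 status of, e.g., _rdrand32_step (&val).  */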
13988 if (mode == HImode)
13989 {
13990 if (TARGET_ZERO_EXTEND_WITH_AND
13991 && optimize_function_for_speed_p (cfun))
13992 {
13993 op2 = force_reg (SImode, const0_rtx);
13994
13995 emit_insn (gen_movstricthi
13996 (gen_lowpart (HImode, op2), op0));
13997 }
13998 else
13999 {
14000 op2 = gen_reg_rtx (SImode);
14001
14002 emit_insn (gen_zero_extendhisi2 (op2, op0));
14003 }
14004 }
14005 else if (mode == SImode)
14006 op2 = op0;
14007 else
14008 op2 = gen_rtx_SUBREG (SImode, op0, 0);
14009
14010 if (target == 0
14011 || !register_operand (target, SImode))
14012 target = gen_reg_rtx (SImode);
14013
14014 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
14015 const0_rtx);
14016 emit_insn (gen_rtx_SET (target,
14017 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
14018 return target;
14019
14020 case IX86_BUILTIN_RDSEED16_STEP:
14021 mode = HImode;
14022 goto rdseed_step;
14023
14024 case IX86_BUILTIN_RDSEED32_STEP:
14025 mode = SImode;
14026 goto rdseed_step;
14027
14028 case IX86_BUILTIN_RDSEED64_STEP:
14029 mode = DImode;
14030
14031 rdseed_step:
14032 arg0 = CALL_EXPR_ARG (exp, 0);
14033 op1 = expand_normal (arg0);
14034 if (!address_operand (op1, VOIDmode))
14035 {
14036 op1 = convert_memory_address (Pmode, op1);
14037 op1 = copy_addr_to_reg (op1);
14038 }
14039
14040 op0 = gen_reg_rtx (mode);
14041 emit_insn (gen_rdseed (mode, op0));
14042
14043 emit_move_insn (gen_rtx_MEM (mode, op1), op0);
14044
14045 op2 = gen_reg_rtx (QImode);
14046
14047 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
14048 const0_rtx);
14049 emit_insn (gen_rtx_SET (op2, pat));
14050
14051 if (target == 0
14052 || !register_operand (target, SImode))
14053 target = gen_reg_rtx (SImode);
14054
14055 emit_insn (gen_zero_extendqisi2 (target, op2));
14056 return target;
14057
14058 case IX86_BUILTIN_SBB32:
14059 icode = CODE_FOR_subborrowsi;
14060 icode2 = CODE_FOR_subborrowsi_0;
14061 mode0 = SImode;
14062 mode1 = DImode;
14063 mode2 = CCmode;
14064 goto handlecarry;
14065
14066 case IX86_BUILTIN_SBB64:
14067 icode = CODE_FOR_subborrowdi;
14068 icode2 = CODE_FOR_subborrowdi_0;
14069 mode0 = DImode;
14070 mode1 = TImode;
14071 mode2 = CCmode;
14072 goto handlecarry;
14073
14074 case IX86_BUILTIN_ADDCARRYX32:
14075 icode = CODE_FOR_addcarrysi;
14076 icode2 = CODE_FOR_addcarrysi_0;
14077 mode0 = SImode;
14078 mode1 = DImode;
14079 mode2 = CCCmode;
14080 goto handlecarry;
14081
14082 case IX86_BUILTIN_ADDCARRYX64:
14083 icode = CODE_FOR_addcarrydi;
14084 icode2 = CODE_FOR_addcarrydi_0;
14085 mode0 = DImode;
14086 mode1 = TImode;
14087 mode2 = CCCmode;
14088
14089 handlecarry:
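/* Editorial note (illustrative): this block backs the adc/sbb style
   builtins, e.g., assuming the usual intrinsics,
     carry_out = _addcarry_u32 (carry_in, a, b, &sum);
   The incoming carry is materialized in CF, the widened add/sub pattern
   consumes it, and the QImode return value is CF read back afterwards.  */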
14090 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
14091 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
14092 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
14093 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
14094
14095 op1 = expand_normal (arg0);
14096
14097 op2 = expand_normal (arg1);
14098 if (!register_operand (op2, mode0))
14099 op2 = copy_to_mode_reg (mode0, op2);
14100
14101 op3 = expand_normal (arg2);
14102 if (!register_operand (op3, mode0))
14103 op3 = copy_to_mode_reg (mode0, op3);
14104
14105 op4 = expand_normal (arg3);
14106 if (!address_operand (op4, VOIDmode))
14107 {
14108 op4 = convert_memory_address (Pmode, op4);
14109 op4 = copy_addr_to_reg (op4);
14110 }
14111
14112 op0 = gen_reg_rtx (mode0);
14113 if (op1 == const0_rtx)
14114 {
14115 /* If arg0 is 0, optimize right away into an add or sub
14116 instruction that sets the CCCmode flags. */
14117 op1 = gen_rtx_REG (mode2, FLAGS_REG);
14118 emit_insn (GEN_FCN (icode2) (op0, op2, op3));
14119 }
14120 else
14121 {
14122 /* Generate CF from input operand. */
14123 ix86_expand_carry (op1);
14124
14125 /* Generate instruction that consumes CF. */
14126 op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
14127 pat = gen_rtx_LTU (mode1, op1, const0_rtx);
14128 pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
14129 emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
14130 }
14131
14132 /* Return current CF value. */
14133 if (target == 0)
14134 target = gen_reg_rtx (QImode);
14135
14136 pat = gen_rtx_LTU (QImode, op1, const0_rtx);
14137 emit_insn (gen_rtx_SET (target, pat));
14138
14139 /* Store the result. */
14140 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
14141
14142 return target;
14143
14144 case IX86_BUILTIN_READ_FLAGS:
14145 if (ignore)
14146 return const0_rtx;
14147
14148 emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
14149
14150 if (optimize
14151 || target == NULL_RTX
14152 || !nonimmediate_operand (target, word_mode)
14153 || GET_MODE (target) != word_mode)
14154 target = gen_reg_rtx (word_mode);
14155
14156 emit_insn (gen_pop (target));
14157 return target;
14158
14159 case IX86_BUILTIN_WRITE_FLAGS:
14160
14161 arg0 = CALL_EXPR_ARG (exp, 0);
14162 op0 = expand_normal (arg0);
14163 if (!general_no_elim_operand (op0, word_mode))
14164 op0 = copy_to_mode_reg (word_mode, op0);
14165
14166 emit_insn (gen_push (op0));
14167 emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
14168 return 0;
14169
14170 case IX86_BUILTIN_KTESTC8:
14171 icode = CODE_FOR_ktestqi;
14172 mode3 = CCCmode;
14173 goto kortest;
14174
14175 case IX86_BUILTIN_KTESTZ8:
14176 icode = CODE_FOR_ktestqi;
14177 mode3 = CCZmode;
14178 goto kortest;
14179
14180 case IX86_BUILTIN_KTESTC16:
14181 icode = CODE_FOR_ktesthi;
14182 mode3 = CCCmode;
14183 goto kortest;
14184
14185 case IX86_BUILTIN_KTESTZ16:
14186 icode = CODE_FOR_ktesthi;
14187 mode3 = CCZmode;
14188 goto kortest;
14189
14190 case IX86_BUILTIN_KTESTC32:
14191 icode = CODE_FOR_ktestsi;
14192 mode3 = CCCmode;
14193 goto kortest;
14194
14195 case IX86_BUILTIN_KTESTZ32:
14196 icode = CODE_FOR_ktestsi;
14197 mode3 = CCZmode;
14198 goto kortest;
14199
14200 case IX86_BUILTIN_KTESTC64:
14201 icode = CODE_FOR_ktestdi;
14202 mode3 = CCCmode;
14203 goto kortest;
14204
14205 case IX86_BUILTIN_KTESTZ64:
14206 icode = CODE_FOR_ktestdi;
14207 mode3 = CCZmode;
14208 goto kortest;
14209
14210 case IX86_BUILTIN_KORTESTC8:
14211 icode = CODE_FOR_kortestqi;
14212 mode3 = CCCmode;
14213 goto kortest;
14214
14215 case IX86_BUILTIN_KORTESTZ8:
14216 icode = CODE_FOR_kortestqi;
14217 mode3 = CCZmode;
14218 goto kortest;
14219
14220 case IX86_BUILTIN_KORTESTC16:
14221 icode = CODE_FOR_kortesthi;
14222 mode3 = CCCmode;
14223 goto kortest;
14224
14225 case IX86_BUILTIN_KORTESTZ16:
14226 icode = CODE_FOR_kortesthi;
14227 mode3 = CCZmode;
14228 goto kortest;
14229
14230 case IX86_BUILTIN_KORTESTC32:
14231 icode = CODE_FOR_kortestsi;
14232 mode3 = CCCmode;
14233 goto kortest;
14234
14235 case IX86_BUILTIN_KORTESTZ32:
14236 icode = CODE_FOR_kortestsi;
14237 mode3 = CCZmode;
14238 goto kortest;
14239
14240 case IX86_BUILTIN_KORTESTC64:
14241 icode = CODE_FOR_kortestdi;
14242 mode3 = CCCmode;
14243 goto kortest;
14244
14245 case IX86_BUILTIN_KORTESTZ64:
14246 icode = CODE_FOR_kortestdi;
14247 mode3 = CCZmode;
14248
14249 kortest:
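/* Editorial note (illustrative): KORTEST/KTEST set both CF and ZF; the
   *C variants below test the carry flag (CCCmode) and the *Z variants
   the zero flag (CCZmode), with setcc producing the int result of,
   e.g., _mm512_kortestz (m1, m2).  */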
14250 arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
14251 arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
14252 op0 = expand_normal (arg0);
14253 op1 = expand_normal (arg1);
14254
14255 mode0 = insn_data[icode].operand[0].mode;
14256 mode1 = insn_data[icode].operand[1].mode;
14257
14258 if (GET_MODE (op0) != VOIDmode)
14259 op0 = force_reg (GET_MODE (op0), op0);
14260
14261 op0 = gen_lowpart (mode0, op0);
14262
14263 if (!insn_data[icode].operand[0].predicate (op0, mode0))
14264 op0 = copy_to_mode_reg (mode0, op0);
14265
14266 if (GET_MODE (op1) != VOIDmode)
14267 op1 = force_reg (GET_MODE (op1), op1);
14268
14269 op1 = gen_lowpart (mode1, op1);
14270
14271 if (!insn_data[icode].operand[1].predicate (op1, mode1))
14272 op1 = copy_to_mode_reg (mode1, op1);
14273
14274 target = gen_reg_rtx (QImode);
14275
14276 /* Emit kortest. */
14277 emit_insn (GEN_FCN (icode) (op0, op1));
14278 /* And use setcc to return result from flags. */
14279 ix86_expand_setcc (target, EQ,
14280 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
14281 return target;
14282
14283 case IX86_BUILTIN_GATHERSIV2DF:
14284 icode = CODE_FOR_avx2_gathersiv2df;
14285 goto gather_gen;
14286 case IX86_BUILTIN_GATHERSIV4DF:
14287 icode = CODE_FOR_avx2_gathersiv4df;
14288 goto gather_gen;
14289 case IX86_BUILTIN_GATHERDIV2DF:
14290 icode = CODE_FOR_avx2_gatherdiv2df;
14291 goto gather_gen;
14292 case IX86_BUILTIN_GATHERDIV4DF:
14293 icode = CODE_FOR_avx2_gatherdiv4df;
14294 goto gather_gen;
14295 case IX86_BUILTIN_GATHERSIV4SF:
14296 icode = CODE_FOR_avx2_gathersiv4sf;
14297 goto gather_gen;
14298 case IX86_BUILTIN_GATHERSIV8SF:
14299 icode = CODE_FOR_avx2_gathersiv8sf;
14300 goto gather_gen;
14301 case IX86_BUILTIN_GATHERDIV4SF:
14302 icode = CODE_FOR_avx2_gatherdiv4sf;
14303 goto gather_gen;
14304 case IX86_BUILTIN_GATHERDIV8SF:
14305 icode = CODE_FOR_avx2_gatherdiv8sf;
14306 goto gather_gen;
14307 case IX86_BUILTIN_GATHERSIV2DI:
14308 icode = CODE_FOR_avx2_gathersiv2di;
14309 goto gather_gen;
14310 case IX86_BUILTIN_GATHERSIV4DI:
14311 icode = CODE_FOR_avx2_gathersiv4di;
14312 goto gather_gen;
14313 case IX86_BUILTIN_GATHERDIV2DI:
14314 icode = CODE_FOR_avx2_gatherdiv2di;
14315 goto gather_gen;
14316 case IX86_BUILTIN_GATHERDIV4DI:
14317 icode = CODE_FOR_avx2_gatherdiv4di;
14318 goto gather_gen;
14319 case IX86_BUILTIN_GATHERSIV4SI:
14320 icode = CODE_FOR_avx2_gathersiv4si;
14321 goto gather_gen;
14322 case IX86_BUILTIN_GATHERSIV8SI:
14323 icode = CODE_FOR_avx2_gathersiv8si;
14324 goto gather_gen;
14325 case IX86_BUILTIN_GATHERDIV4SI:
14326 icode = CODE_FOR_avx2_gatherdiv4si;
14327 goto gather_gen;
14328 case IX86_BUILTIN_GATHERDIV8SI:
14329 icode = CODE_FOR_avx2_gatherdiv8si;
14330 goto gather_gen;
14331 case IX86_BUILTIN_GATHERALTSIV4DF:
14332 icode = CODE_FOR_avx2_gathersiv4df;
14333 goto gather_gen;
14334 case IX86_BUILTIN_GATHERALTDIV8SF:
14335 icode = CODE_FOR_avx2_gatherdiv8sf;
14336 goto gather_gen;
14337 case IX86_BUILTIN_GATHERALTSIV4DI:
14338 icode = CODE_FOR_avx2_gathersiv4di;
14339 goto gather_gen;
14340 case IX86_BUILTIN_GATHERALTDIV8SI:
14341 icode = CODE_FOR_avx2_gatherdiv8si;
14342 goto gather_gen;
14343 case IX86_BUILTIN_GATHER3SIV16SF:
14344 icode = CODE_FOR_avx512f_gathersiv16sf;
14345 goto gather_gen;
14346 case IX86_BUILTIN_GATHER3SIV8DF:
14347 icode = CODE_FOR_avx512f_gathersiv8df;
14348 goto gather_gen;
14349 case IX86_BUILTIN_GATHER3DIV16SF:
14350 icode = CODE_FOR_avx512f_gatherdiv16sf;
14351 goto gather_gen;
14352 case IX86_BUILTIN_GATHER3DIV8DF:
14353 icode = CODE_FOR_avx512f_gatherdiv8df;
14354 goto gather_gen;
14355 case IX86_BUILTIN_GATHER3SIV16SI:
14356 icode = CODE_FOR_avx512f_gathersiv16si;
14357 goto gather_gen;
14358 case IX86_BUILTIN_GATHER3SIV8DI:
14359 icode = CODE_FOR_avx512f_gathersiv8di;
14360 goto gather_gen;
14361 case IX86_BUILTIN_GATHER3DIV16SI:
14362 icode = CODE_FOR_avx512f_gatherdiv16si;
14363 goto gather_gen;
14364 case IX86_BUILTIN_GATHER3DIV8DI:
14365 icode = CODE_FOR_avx512f_gatherdiv8di;
14366 goto gather_gen;
14367 case IX86_BUILTIN_GATHER3ALTSIV8DF:
14368 icode = CODE_FOR_avx512f_gathersiv8df;
14369 goto gather_gen;
14370 case IX86_BUILTIN_GATHER3ALTDIV16SF:
14371 icode = CODE_FOR_avx512f_gatherdiv16sf;
14372 goto gather_gen;
14373 case IX86_BUILTIN_GATHER3ALTSIV8DI:
14374 icode = CODE_FOR_avx512f_gathersiv8di;
14375 goto gather_gen;
14376 case IX86_BUILTIN_GATHER3ALTDIV16SI:
14377 icode = CODE_FOR_avx512f_gatherdiv16si;
14378 goto gather_gen;
14379 case IX86_BUILTIN_GATHER3SIV2DF:
14380 icode = CODE_FOR_avx512vl_gathersiv2df;
14381 goto gather_gen;
14382 case IX86_BUILTIN_GATHER3SIV4DF:
14383 icode = CODE_FOR_avx512vl_gathersiv4df;
14384 goto gather_gen;
14385 case IX86_BUILTIN_GATHER3DIV2DF:
14386 icode = CODE_FOR_avx512vl_gatherdiv2df;
14387 goto gather_gen;
14388 case IX86_BUILTIN_GATHER3DIV4DF:
14389 icode = CODE_FOR_avx512vl_gatherdiv4df;
14390 goto gather_gen;
14391 case IX86_BUILTIN_GATHER3SIV4SF:
14392 icode = CODE_FOR_avx512vl_gathersiv4sf;
14393 goto gather_gen;
14394 case IX86_BUILTIN_GATHER3SIV8SF:
14395 icode = CODE_FOR_avx512vl_gathersiv8sf;
14396 goto gather_gen;
14397 case IX86_BUILTIN_GATHER3DIV4SF:
14398 icode = CODE_FOR_avx512vl_gatherdiv4sf;
14399 goto gather_gen;
14400 case IX86_BUILTIN_GATHER3DIV8SF:
14401 icode = CODE_FOR_avx512vl_gatherdiv8sf;
14402 goto gather_gen;
14403 case IX86_BUILTIN_GATHER3SIV2DI:
14404 icode = CODE_FOR_avx512vl_gathersiv2di;
14405 goto gather_gen;
14406 case IX86_BUILTIN_GATHER3SIV4DI:
14407 icode = CODE_FOR_avx512vl_gathersiv4di;
14408 goto gather_gen;
14409 case IX86_BUILTIN_GATHER3DIV2DI:
14410 icode = CODE_FOR_avx512vl_gatherdiv2di;
14411 goto gather_gen;
14412 case IX86_BUILTIN_GATHER3DIV4DI:
14413 icode = CODE_FOR_avx512vl_gatherdiv4di;
14414 goto gather_gen;
14415 case IX86_BUILTIN_GATHER3SIV4SI:
14416 icode = CODE_FOR_avx512vl_gathersiv4si;
14417 goto gather_gen;
14418 case IX86_BUILTIN_GATHER3SIV8SI:
14419 icode = CODE_FOR_avx512vl_gathersiv8si;
14420 goto gather_gen;
14421 case IX86_BUILTIN_GATHER3DIV4SI:
14422 icode = CODE_FOR_avx512vl_gatherdiv4si;
14423 goto gather_gen;
14424 case IX86_BUILTIN_GATHER3DIV8SI:
14425 icode = CODE_FOR_avx512vl_gatherdiv8si;
14426 goto gather_gen;
14427 case IX86_BUILTIN_GATHER3ALTSIV4DF:
14428 icode = CODE_FOR_avx512vl_gathersiv4df;
14429 goto gather_gen;
14430 case IX86_BUILTIN_GATHER3ALTDIV8SF:
14431 icode = CODE_FOR_avx512vl_gatherdiv8sf;
14432 goto gather_gen;
14433 case IX86_BUILTIN_GATHER3ALTSIV4DI:
14434 icode = CODE_FOR_avx512vl_gathersiv4di;
14435 goto gather_gen;
14436 case IX86_BUILTIN_GATHER3ALTDIV8SI:
14437 icode = CODE_FOR_avx512vl_gatherdiv8si;
14438 goto gather_gen;
14439 case IX86_BUILTIN_SCATTERSIV16SF:
14440 icode = CODE_FOR_avx512f_scattersiv16sf;
14441 goto scatter_gen;
14442 case IX86_BUILTIN_SCATTERSIV8DF:
14443 icode = CODE_FOR_avx512f_scattersiv8df;
14444 goto scatter_gen;
14445 case IX86_BUILTIN_SCATTERDIV16SF:
14446 icode = CODE_FOR_avx512f_scatterdiv16sf;
14447 goto scatter_gen;
14448 case IX86_BUILTIN_SCATTERDIV8DF:
14449 icode = CODE_FOR_avx512f_scatterdiv8df;
14450 goto scatter_gen;
14451 case IX86_BUILTIN_SCATTERSIV16SI:
14452 icode = CODE_FOR_avx512f_scattersiv16si;
14453 goto scatter_gen;
14454 case IX86_BUILTIN_SCATTERSIV8DI:
14455 icode = CODE_FOR_avx512f_scattersiv8di;
14456 goto scatter_gen;
14457 case IX86_BUILTIN_SCATTERDIV16SI:
14458 icode = CODE_FOR_avx512f_scatterdiv16si;
14459 goto scatter_gen;
14460 case IX86_BUILTIN_SCATTERDIV8DI:
14461 icode = CODE_FOR_avx512f_scatterdiv8di;
14462 goto scatter_gen;
14463 case IX86_BUILTIN_SCATTERSIV8SF:
14464 icode = CODE_FOR_avx512vl_scattersiv8sf;
14465 goto scatter_gen;
14466 case IX86_BUILTIN_SCATTERSIV4SF:
14467 icode = CODE_FOR_avx512vl_scattersiv4sf;
14468 goto scatter_gen;
14469 case IX86_BUILTIN_SCATTERSIV4DF:
14470 icode = CODE_FOR_avx512vl_scattersiv4df;
14471 goto scatter_gen;
14472 case IX86_BUILTIN_SCATTERSIV2DF:
14473 icode = CODE_FOR_avx512vl_scattersiv2df;
14474 goto scatter_gen;
14475 case IX86_BUILTIN_SCATTERDIV8SF:
14476 icode = CODE_FOR_avx512vl_scatterdiv8sf;
14477 goto scatter_gen;
14478 case IX86_BUILTIN_SCATTERDIV4SF:
14479 icode = CODE_FOR_avx512vl_scatterdiv4sf;
14480 goto scatter_gen;
14481 case IX86_BUILTIN_SCATTERDIV4DF:
14482 icode = CODE_FOR_avx512vl_scatterdiv4df;
14483 goto scatter_gen;
14484 case IX86_BUILTIN_SCATTERDIV2DF:
14485 icode = CODE_FOR_avx512vl_scatterdiv2df;
14486 goto scatter_gen;
14487 case IX86_BUILTIN_SCATTERSIV8SI:
14488 icode = CODE_FOR_avx512vl_scattersiv8si;
14489 goto scatter_gen;
14490 case IX86_BUILTIN_SCATTERSIV4SI:
14491 icode = CODE_FOR_avx512vl_scattersiv4si;
14492 goto scatter_gen;
14493 case IX86_BUILTIN_SCATTERSIV4DI:
14494 icode = CODE_FOR_avx512vl_scattersiv4di;
14495 goto scatter_gen;
14496 case IX86_BUILTIN_SCATTERSIV2DI:
14497 icode = CODE_FOR_avx512vl_scattersiv2di;
14498 goto scatter_gen;
14499 case IX86_BUILTIN_SCATTERDIV8SI:
14500 icode = CODE_FOR_avx512vl_scatterdiv8si;
14501 goto scatter_gen;
14502 case IX86_BUILTIN_SCATTERDIV4SI:
14503 icode = CODE_FOR_avx512vl_scatterdiv4si;
14504 goto scatter_gen;
14505 case IX86_BUILTIN_SCATTERDIV4DI:
14506 icode = CODE_FOR_avx512vl_scatterdiv4di;
14507 goto scatter_gen;
14508 case IX86_BUILTIN_SCATTERDIV2DI:
14509 icode = CODE_FOR_avx512vl_scatterdiv2di;
14510 goto scatter_gen;
14511 case IX86_BUILTIN_GATHERPFDPD:
14512 icode = CODE_FOR_avx512pf_gatherpfv8sidf;
14513 goto vec_prefetch_gen;
14514 case IX86_BUILTIN_SCATTERALTSIV8DF:
14515 icode = CODE_FOR_avx512f_scattersiv8df;
14516 goto scatter_gen;
14517 case IX86_BUILTIN_SCATTERALTDIV16SF:
14518 icode = CODE_FOR_avx512f_scatterdiv16sf;
14519 goto scatter_gen;
14520 case IX86_BUILTIN_SCATTERALTSIV8DI:
14521 icode = CODE_FOR_avx512f_scattersiv8di;
14522 goto scatter_gen;
14523 case IX86_BUILTIN_SCATTERALTDIV16SI:
14524 icode = CODE_FOR_avx512f_scatterdiv16si;
14525 goto scatter_gen;
14526 case IX86_BUILTIN_SCATTERALTSIV4DF:
14527 icode = CODE_FOR_avx512vl_scattersiv4df;
14528 goto scatter_gen;
14529 case IX86_BUILTIN_SCATTERALTDIV8SF:
14530 icode = CODE_FOR_avx512vl_scatterdiv8sf;
14531 goto scatter_gen;
14532 case IX86_BUILTIN_SCATTERALTSIV4DI:
14533 icode = CODE_FOR_avx512vl_scattersiv4di;
14534 goto scatter_gen;
14535 case IX86_BUILTIN_SCATTERALTDIV8SI:
14536 icode = CODE_FOR_avx512vl_scatterdiv8si;
14537 goto scatter_gen;
14538 case IX86_BUILTIN_SCATTERALTSIV2DF:
14539 icode = CODE_FOR_avx512vl_scattersiv2df;
14540 goto scatter_gen;
14541 case IX86_BUILTIN_SCATTERALTDIV4SF:
14542 icode = CODE_FOR_avx512vl_scatterdiv4sf;
14543 goto scatter_gen;
14544 case IX86_BUILTIN_SCATTERALTSIV2DI:
14545 icode = CODE_FOR_avx512vl_scattersiv2di;
14546 goto scatter_gen;
14547 case IX86_BUILTIN_SCATTERALTDIV4SI:
14548 icode = CODE_FOR_avx512vl_scatterdiv4si;
14549 goto scatter_gen;
14550 case IX86_BUILTIN_GATHERPFDPS:
14551 icode = CODE_FOR_avx512pf_gatherpfv16sisf;
14552 goto vec_prefetch_gen;
14553 case IX86_BUILTIN_GATHERPFQPD:
14554 icode = CODE_FOR_avx512pf_gatherpfv8didf;
14555 goto vec_prefetch_gen;
14556 case IX86_BUILTIN_GATHERPFQPS:
14557 icode = CODE_FOR_avx512pf_gatherpfv8disf;
14558 goto vec_prefetch_gen;
14559 case IX86_BUILTIN_SCATTERPFDPD:
14560 icode = CODE_FOR_avx512pf_scatterpfv8sidf;
14561 goto vec_prefetch_gen;
14562 case IX86_BUILTIN_SCATTERPFDPS:
14563 icode = CODE_FOR_avx512pf_scatterpfv16sisf;
14564 goto vec_prefetch_gen;
14565 case IX86_BUILTIN_SCATTERPFQPD:
14566 icode = CODE_FOR_avx512pf_scatterpfv8didf;
14567 goto vec_prefetch_gen;
14568 case IX86_BUILTIN_SCATTERPFQPS:
14569 icode = CODE_FOR_avx512pf_scatterpfv8disf;
14570 goto vec_prefetch_gen;
14571
14572 gather_gen:
14573 rtx half;
14574 rtx (*gen) (rtx, rtx);
14575
14576 arg0 = CALL_EXPR_ARG (exp, 0);
14577 arg1 = CALL_EXPR_ARG (exp, 1);
14578 arg2 = CALL_EXPR_ARG (exp, 2);
14579 arg3 = CALL_EXPR_ARG (exp, 3);
14580 arg4 = CALL_EXPR_ARG (exp, 4);
14581 op0 = expand_normal (arg0);
14582 op1 = expand_normal (arg1);
14583 op2 = expand_normal (arg2);
14584 op3 = expand_normal (arg3);
14585 op4 = expand_normal (arg4);
14586 /* Note the arg order is different from the operand order. */
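/* Editorial sketch of the gather operand layout (illustrative): operand
   0 is the destination, 1 the merge/source vector, 2 the base pointer,
   3 the index vector, 4 the mask and 5 the scale, matching intrinsics
   such as _mm256_mask_i32gather_pd (src, base, index, mask, 8).  */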
14587 mode0 = insn_data[icode].operand[1].mode;
14588 mode2 = insn_data[icode].operand[3].mode;
14589 mode3 = insn_data[icode].operand[4].mode;
14590 mode4 = insn_data[icode].operand[5].mode;
14591
14592 if (target == NULL_RTX
14593 || GET_MODE (target) != insn_data[icode].operand[0].mode
14594 || !insn_data[icode].operand[0].predicate (target,
14595 GET_MODE (target)))
14596 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
14597 else
14598 subtarget = target;
14599
14600 switch (fcode)
14601 {
14602 case IX86_BUILTIN_GATHER3ALTSIV8DF:
14603 case IX86_BUILTIN_GATHER3ALTSIV8DI:
14604 half = gen_reg_rtx (V8SImode);
14605 if (!nonimmediate_operand (op2, V16SImode))
14606 op2 = copy_to_mode_reg (V16SImode, op2);
14607 emit_insn (gen_vec_extract_lo_v16si (half, op2));
14608 op2 = half;
14609 break;
14610 case IX86_BUILTIN_GATHER3ALTSIV4DF:
14611 case IX86_BUILTIN_GATHER3ALTSIV4DI:
14612 case IX86_BUILTIN_GATHERALTSIV4DF:
14613 case IX86_BUILTIN_GATHERALTSIV4DI:
14614 half = gen_reg_rtx (V4SImode);
14615 if (!nonimmediate_operand (op2, V8SImode))
14616 op2 = copy_to_mode_reg (V8SImode, op2);
14617 emit_insn (gen_vec_extract_lo_v8si (half, op2));
14618 op2 = half;
14619 break;
14620 case IX86_BUILTIN_GATHER3ALTDIV16SF:
14621 case IX86_BUILTIN_GATHER3ALTDIV16SI:
14622 half = gen_reg_rtx (mode0);
14623 if (mode0 == V8SFmode)
14624 gen = gen_vec_extract_lo_v16sf;
14625 else
14626 gen = gen_vec_extract_lo_v16si;
14627 if (!nonimmediate_operand (op0, GET_MODE (op0)))
14628 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
14629 emit_insn (gen (half, op0));
14630 op0 = half;
14631 op3 = lowpart_subreg (QImode, op3, HImode);
14632 break;
14633 case IX86_BUILTIN_GATHER3ALTDIV8SF:
14634 case IX86_BUILTIN_GATHER3ALTDIV8SI:
14635 case IX86_BUILTIN_GATHERALTDIV8SF:
14636 case IX86_BUILTIN_GATHERALTDIV8SI:
14637 half = gen_reg_rtx (mode0);
14638 if (mode0 == V4SFmode)
14639 gen = gen_vec_extract_lo_v8sf;
14640 else
14641 gen = gen_vec_extract_lo_v8si;
14642 if (!nonimmediate_operand (op0, GET_MODE (op0)))
14643 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
14644 emit_insn (gen (half, op0));
14645 op0 = half;
14646 if (VECTOR_MODE_P (GET_MODE (op3)))
14647 {
14648 half = gen_reg_rtx (mode0);
14649 if (!nonimmediate_operand (op3, GET_MODE (op3)))
14650 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
14651 emit_insn (gen (half, op3));
14652 op3 = half;
14653 }
14654 break;
14655 default:
14656 break;
14657 }
14658
14659 /* Force the memory operand to use only a base register here; we
14660 do not want to do this for the memory operands of other
14661 builtin functions. */
14662 op1 = ix86_zero_extend_to_Pmode (op1);
14663
14664 if (!insn_data[icode].operand[1].predicate (op0, mode0))
14665 op0 = copy_to_mode_reg (mode0, op0);
14666 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
14667 op1 = copy_to_mode_reg (Pmode, op1);
14668 if (!insn_data[icode].operand[3].predicate (op2, mode2))
14669 op2 = copy_to_mode_reg (mode2, op2);
14670
14671 op3 = fixup_modeless_constant (op3, mode3);
14672
14673 if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
14674 {
14675 if (!insn_data[icode].operand[4].predicate (op3, mode3))
14676 op3 = copy_to_mode_reg (mode3, op3);
14677 }
14678 else
14679 {
14680 op3 = copy_to_reg (op3);
14681 op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
14682 }
14683 if (!insn_data[icode].operand[5].predicate (op4, mode4))
14684 {
14685 error ("the last argument must be scale 1, 2, 4, 8");
14686 return const0_rtx;
14687 }
14688
14689 /* Optimize. If mask is known to have all high bits set,
14690 replace op0 with pc_rtx to signal that the instruction
14691 overwrites the whole destination and doesn't use its
14692 previous contents. */
14693 if (optimize)
14694 {
14695 if (TREE_CODE (arg3) == INTEGER_CST)
14696 {
14697 if (integer_all_onesp (arg3))
14698 op0 = pc_rtx;
14699 }
14700 else if (TREE_CODE (arg3) == VECTOR_CST)
14701 {
14702 unsigned int negative = 0;
14703 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
14704 {
14705 tree cst = VECTOR_CST_ELT (arg3, i);
14706 if (TREE_CODE (cst) == INTEGER_CST
14707 && tree_int_cst_sign_bit (cst))
14708 negative++;
14709 else if (TREE_CODE (cst) == REAL_CST
14710 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
14711 negative++;
14712 }
14713 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
14714 op0 = pc_rtx;
14715 }
14716 else if (TREE_CODE (arg3) == SSA_NAME
14717 && VECTOR_TYPE_P (TREE_TYPE (arg3)))
14718 {
14719 /* Recognize also when mask is like:
14720 __v2df src = _mm_setzero_pd ();
14721 __v2df mask = _mm_cmpeq_pd (src, src);
14722 or
14723 __v8sf src = _mm256_setzero_ps ();
14724 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
14725 as that is a cheaper way to load all ones into
14726 a register than having to load a constant from
14727 memory. */
14728 gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
14729 if (is_gimple_call (def_stmt))
14730 {
14731 tree fndecl = gimple_call_fndecl (def_stmt);
14732 if (fndecl
14733 && fndecl_built_in_p (fndecl, BUILT_IN_MD))
14734 switch (DECL_MD_FUNCTION_CODE (fndecl))
14735 {
14736 case IX86_BUILTIN_CMPPD:
14737 case IX86_BUILTIN_CMPPS:
14738 case IX86_BUILTIN_CMPPD256:
14739 case IX86_BUILTIN_CMPPS256:
14740 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
14741 break;
14742 /* FALLTHRU */
14743 case IX86_BUILTIN_CMPEQPD:
14744 case IX86_BUILTIN_CMPEQPS:
14745 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
14746 && initializer_zerop (gimple_call_arg (def_stmt,
14747 1)))
14748 op0 = pc_rtx;
14749 break;
14750 default:
14751 break;
14752 }
14753 }
14754 }
14755 }
14756
14757 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
14758 if (! pat)
14759 return const0_rtx;
14760 emit_insn (pat);
14761
14762 switch (fcode)
14763 {
14764 case IX86_BUILTIN_GATHER3DIV16SF:
14765 if (target == NULL_RTX)
14766 target = gen_reg_rtx (V8SFmode);
14767 emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
14768 break;
14769 case IX86_BUILTIN_GATHER3DIV16SI:
14770 if (target == NULL_RTX)
14771 target = gen_reg_rtx (V8SImode);
14772 emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
14773 break;
14774 case IX86_BUILTIN_GATHER3DIV8SF:
14775 case IX86_BUILTIN_GATHERDIV8SF:
14776 if (target == NULL_RTX)
14777 target = gen_reg_rtx (V4SFmode);
14778 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
14779 break;
14780 case IX86_BUILTIN_GATHER3DIV8SI:
14781 case IX86_BUILTIN_GATHERDIV8SI:
14782 if (target == NULL_RTX)
14783 target = gen_reg_rtx (V4SImode);
14784 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
14785 break;
14786 default:
14787 target = subtarget;
14788 break;
14789 }
14790 return target;
14791
14792 scatter_gen:
14793 arg0 = CALL_EXPR_ARG (exp, 0);
14794 arg1 = CALL_EXPR_ARG (exp, 1);
14795 arg2 = CALL_EXPR_ARG (exp, 2);
14796 arg3 = CALL_EXPR_ARG (exp, 3);
14797 arg4 = CALL_EXPR_ARG (exp, 4);
14798 op0 = expand_normal (arg0);
14799 op1 = expand_normal (arg1);
14800 op2 = expand_normal (arg2);
14801 op3 = expand_normal (arg3);
14802 op4 = expand_normal (arg4);
14803 mode1 = insn_data[icode].operand[1].mode;
14804 mode2 = insn_data[icode].operand[2].mode;
14805 mode3 = insn_data[icode].operand[3].mode;
14806 mode4 = insn_data[icode].operand[4].mode;
14807
14808 /* The scatter instruction stores operand op3 to memory, using
14809 indices from op2 and the scale from op4, under writemask op1.
14810 If index operand op2 has more elements than source operand
14811 op3, only its low half is used, and vice versa. */
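/* A hedged usage sketch (illustrative), assuming the usual scatter
   intrinsic:
     _mm512_i32scatter_ps (addr, index, vals, 4);  */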
14812 switch (fcode)
14813 {
14814 case IX86_BUILTIN_SCATTERALTSIV8DF:
14815 case IX86_BUILTIN_SCATTERALTSIV8DI:
14816 half = gen_reg_rtx (V8SImode);
14817 if (!nonimmediate_operand (op2, V16SImode))
14818 op2 = copy_to_mode_reg (V16SImode, op2);
14819 emit_insn (gen_vec_extract_lo_v16si (half, op2));
14820 op2 = half;
14821 break;
14822 case IX86_BUILTIN_SCATTERALTDIV16SF:
14823 case IX86_BUILTIN_SCATTERALTDIV16SI:
14824 half = gen_reg_rtx (mode3);
14825 if (mode3 == V8SFmode)
14826 gen = gen_vec_extract_lo_v16sf;
14827 else
14828 gen = gen_vec_extract_lo_v16si;
14829 if (!nonimmediate_operand (op3, GET_MODE (op3)))
14830 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
14831 emit_insn (gen (half, op3));
14832 op3 = half;
14833 break;
14834 case IX86_BUILTIN_SCATTERALTSIV4DF:
14835 case IX86_BUILTIN_SCATTERALTSIV4DI:
14836 half = gen_reg_rtx (V4SImode);
14837 if (!nonimmediate_operand (op2, V8SImode))
14838 op2 = copy_to_mode_reg (V8SImode, op2);
14839 emit_insn (gen_vec_extract_lo_v8si (half, op2));
14840 op2 = half;
14841 break;
14842 case IX86_BUILTIN_SCATTERALTDIV8SF:
14843 case IX86_BUILTIN_SCATTERALTDIV8SI:
14844 half = gen_reg_rtx (mode3);
14845 if (mode3 == V4SFmode)
14846 gen = gen_vec_extract_lo_v8sf;
14847 else
14848 gen = gen_vec_extract_lo_v8si;
14849 if (!nonimmediate_operand (op3, GET_MODE (op3)))
14850 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
14851 emit_insn (gen (half, op3));
14852 op3 = half;
14853 break;
14854 case IX86_BUILTIN_SCATTERALTSIV2DF:
14855 case IX86_BUILTIN_SCATTERALTSIV2DI:
14856 if (!nonimmediate_operand (op2, V4SImode))
14857 op2 = copy_to_mode_reg (V4SImode, op2);
14858 break;
14859 case IX86_BUILTIN_SCATTERALTDIV4SF:
14860 case IX86_BUILTIN_SCATTERALTDIV4SI:
14861 if (!nonimmediate_operand (op3, GET_MODE (op3)))
14862 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
14863 break;
14864 default:
14865 break;
14866 }
14867
14868 /* Force the address into a register so the memory operand uses
14869 only a base register. We don't want to do this to the memory
14870 operands of other builtin functions. */
14871 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
14872
14873 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
14874 op0 = copy_to_mode_reg (Pmode, op0);
14875
14876 op1 = fixup_modeless_constant (op1, mode1);
14877
14878 if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
14879 {
14880 if (!insn_data[icode].operand[1].predicate (op1, mode1))
14881 op1 = copy_to_mode_reg (mode1, op1);
14882 }
14883 else
14884 {
14885 op1 = copy_to_reg (op1);
14886 op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
14887 }
14888
14889 if (!insn_data[icode].operand[2].predicate (op2, mode2))
14890 op2 = copy_to_mode_reg (mode2, op2);
14891
14892 if (!insn_data[icode].operand[3].predicate (op3, mode3))
14893 op3 = copy_to_mode_reg (mode3, op3);
14894
14895 if (!insn_data[icode].operand[4].predicate (op4, mode4))
14896 {
14897 error ("the last argument must be scale 1, 2, 4, 8");
14898 return const0_rtx;
14899 }
14900
14901 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
14902 if (! pat)
14903 return const0_rtx;
14904
14905 emit_insn (pat);
14906 return 0;
14907
14908 vec_prefetch_gen:
14909 arg0 = CALL_EXPR_ARG (exp, 0);
14910 arg1 = CALL_EXPR_ARG (exp, 1);
14911 arg2 = CALL_EXPR_ARG (exp, 2);
14912 arg3 = CALL_EXPR_ARG (exp, 3);
14913 arg4 = CALL_EXPR_ARG (exp, 4);
14914 op0 = expand_normal (arg0);
14915 op1 = expand_normal (arg1);
14916 op2 = expand_normal (arg2);
14917 op3 = expand_normal (arg3);
14918 op4 = expand_normal (arg4);
14919 mode0 = insn_data[icode].operand[0].mode;
14920 mode1 = insn_data[icode].operand[1].mode;
14921 mode3 = insn_data[icode].operand[3].mode;
14922 mode4 = insn_data[icode].operand[4].mode;
14923
14924 op0 = fixup_modeless_constant (op0, mode0);
14925
14926 if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
14927 {
14928 if (!insn_data[icode].operand[0].predicate (op0, mode0))
14929 op0 = copy_to_mode_reg (mode0, op0);
14930 }
14931 else
14932 {
14933 op0 = copy_to_reg (op0);
14934 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
14935 }
14936
14937 if (!insn_data[icode].operand[1].predicate (op1, mode1))
14938 op1 = copy_to_mode_reg (mode1, op1);
14939
14940 /* Force the address into a register so the memory operand uses
14941 only a base register. We don't want to do this to the memory
14942 operands of other builtin functions. */
14943 op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
14944
14945 if (!insn_data[icode].operand[2].predicate (op2, Pmode))
14946 op2 = copy_to_mode_reg (Pmode, op2);
14947
14948 if (!insn_data[icode].operand[3].predicate (op3, mode3))
14949 {
14950 error ("the forth argument must be scale 1, 2, 4, 8");
14951 return const0_rtx;
14952 }
14953
14954 if (!insn_data[icode].operand[4].predicate (op4, mode4))
14955 {
14956 error ("incorrect hint operand");
14957 return const0_rtx;
14958 }
14959
14960 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
14961 if (! pat)
14962 return const0_rtx;
14963
14964 emit_insn (pat);
14965
14966 return 0;
14967
14968 case IX86_BUILTIN_XABORT:
14969 icode = CODE_FOR_xabort;
14970 arg0 = CALL_EXPR_ARG (exp, 0);
14971 op0 = expand_normal (arg0);
14972 mode0 = insn_data[icode].operand[0].mode;
14973 if (!insn_data[icode].operand[0].predicate (op0, mode0))
14974 {
14975 error ("the argument to %<xabort%> intrinsic must "
14976 "be an 8-bit immediate");
14977 return const0_rtx;
14978 }
14979 emit_insn (gen_xabort (op0));
14980 return 0;
14981
14982 case IX86_BUILTIN_RDSSPD:
14983 case IX86_BUILTIN_RDSSPQ:
14984 mode = (fcode == IX86_BUILTIN_RDSSPD ? SImode : DImode);
14985
14986 if (target == 0
14987 || !register_operand (target, mode))
14988 target = gen_reg_rtx (mode);
14989
14990 op0 = force_reg (mode, const0_rtx);
14991
14992 emit_insn (gen_rdssp (mode, target, op0));
14993 return target;
14994
14995 case IX86_BUILTIN_INCSSPD:
14996 case IX86_BUILTIN_INCSSPQ:
14997 mode = (fcode == IX86_BUILTIN_INCSSPD ? SImode : DImode);
14998
14999 arg0 = CALL_EXPR_ARG (exp, 0);
15000 op0 = expand_normal (arg0);
15001
15002 op0 = force_reg (mode, op0);
15003
15004 emit_insn (gen_incssp (mode, op0));
15005 return 0;
15006
15007 case IX86_BUILTIN_HRESET:
15008 icode = CODE_FOR_hreset;
15009 arg0 = CALL_EXPR_ARG (exp, 0);
15010 op0 = expand_normal (arg0);
15011 op0 = force_reg (SImode, op0);
15012 emit_insn (gen_hreset (op0));
15013 return 0;
15014
15015 case IX86_BUILTIN_RSTORSSP:
15016 case IX86_BUILTIN_CLRSSBSY:
15017 arg0 = CALL_EXPR_ARG (exp, 0);
15018 op0 = expand_normal (arg0);
15019 icode = (fcode == IX86_BUILTIN_RSTORSSP
15020 ? CODE_FOR_rstorssp
15021 : CODE_FOR_clrssbsy);
15022
15023 if (!address_operand (op0, VOIDmode))
15024 {
15025 op0 = convert_memory_address (Pmode, op0);
15026 op0 = copy_addr_to_reg (op0);
15027 }
15028 emit_insn (GEN_FCN (icode) (gen_rtx_MEM (DImode, op0)));
15029 return 0;
15030
15031 case IX86_BUILTIN_WRSSD:
15032 case IX86_BUILTIN_WRSSQ:
15033 case IX86_BUILTIN_WRUSSD:
15034 case IX86_BUILTIN_WRUSSQ:
15035 mode = ((fcode == IX86_BUILTIN_WRSSD
15036 || fcode == IX86_BUILTIN_WRUSSD)
15037 ? SImode : DImode);
15038
15039 arg0 = CALL_EXPR_ARG (exp, 0);
15040 op0 = expand_normal (arg0);
15041 arg1 = CALL_EXPR_ARG (exp, 1);
15042 op1 = expand_normal (arg1);
15043
15044 op0 = force_reg (mode, op0);
15045
15046 if (!address_operand (op1, VOIDmode))
15047 {
15048 op1 = convert_memory_address (Pmode, op1);
15049 op1 = copy_addr_to_reg (op1);
15050 }
15051 op1 = gen_rtx_MEM (mode, op1);
15052
15053 icode = ((fcode == IX86_BUILTIN_WRSSD
15054 || fcode == IX86_BUILTIN_WRSSQ)
15055 ? code_for_wrss (mode)
15056 : code_for_wruss (mode));
15057 emit_insn (GEN_FCN (icode) (op0, op1));
15058
15059 return 0;
15060
15061 default:
15062 break;
15063 }
15064
15065 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
15066 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
15067 {
15068 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
15069 return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
15070 target);
15071 }
15072
15073 if (fcode >= IX86_BUILTIN__BDESC_PURE_ARGS_FIRST
15074 && fcode <= IX86_BUILTIN__BDESC_PURE_ARGS_LAST)
15075 {
15076 i = fcode - IX86_BUILTIN__BDESC_PURE_ARGS_FIRST;
15077 return ix86_expand_special_args_builtin (bdesc_pure_args + i, exp,
15078 target);
15079 }
15080
15081 if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
15082 && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
15083 {
15084 i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
15085 rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
15086 rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
15087 rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
15088 int masked = 1;
15089 machine_mode mode, wide_mode, nar_mode;
15090
15091 nar_mode = V4SFmode;
15092 mode = V16SFmode;
15093 wide_mode = V64SFmode;
15094 fcn_mask = gen_avx5124fmaddps_4fmaddps_mask;
15095 fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
15096
15097 switch (fcode)
15098 {
15099 case IX86_BUILTIN_4FMAPS:
15100 fcn = gen_avx5124fmaddps_4fmaddps;
15101 masked = 0;
15102 goto v4fma_expand;
15103
15104 case IX86_BUILTIN_4DPWSSD:
15105 nar_mode = V4SImode;
15106 mode = V16SImode;
15107 wide_mode = V64SImode;
15108 fcn = gen_avx5124vnniw_vp4dpwssd;
15109 masked = 0;
15110 goto v4fma_expand;
15111
15112 case IX86_BUILTIN_4DPWSSDS:
15113 nar_mode = V4SImode;
15114 mode = V16SImode;
15115 wide_mode = V64SImode;
15116 fcn = gen_avx5124vnniw_vp4dpwssds;
15117 masked = 0;
15118 goto v4fma_expand;
15119
15120 case IX86_BUILTIN_4FNMAPS:
15121 fcn = gen_avx5124fmaddps_4fnmaddps;
15122 masked = 0;
15123 goto v4fma_expand;
15124
15125 case IX86_BUILTIN_4FNMAPS_MASK:
15126 fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask;
15127 fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
15128 goto v4fma_expand;
15129
15130 case IX86_BUILTIN_4DPWSSD_MASK:
15131 nar_mode = V4SImode;
15132 mode = V16SImode;
15133 wide_mode = V64SImode;
15134 fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask;
15135 fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
15136 goto v4fma_expand;
15137
15138 case IX86_BUILTIN_4DPWSSDS_MASK:
15139 nar_mode = V4SImode;
15140 mode = V16SImode;
15141 wide_mode = V64SImode;
15142 fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask;
15143 fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
15144 goto v4fma_expand;
15145
15146 case IX86_BUILTIN_4FMAPS_MASK:
15147 {
15148 tree args[4];
15149 rtx ops[4];
15150 rtx wide_reg;
15151 rtx accum;
15152 rtx addr;
15153 rtx mem;
15154
15155 v4fma_expand:
15156 wide_reg = gen_reg_rtx (wide_mode);
15157 for (i = 0; i < 4; i++)
15158 {
15159 args[i] = CALL_EXPR_ARG (exp, i);
15160 ops[i] = expand_normal (args[i]);
15161
15162 emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
15163 ops[i]);
15164 }
15165
15166 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
15167 accum = force_reg (mode, accum);
15168
15169 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
15170 addr = force_reg (Pmode, addr);
15171
15172 mem = gen_rtx_MEM (nar_mode, addr);
15173
15174 target = gen_reg_rtx (mode);
15175
15176 emit_move_insn (target, accum);
15177
15178 if (! masked)
15179 emit_insn (fcn (target, accum, wide_reg, mem));
15180 else
15181 {
15182 rtx merge, mask;
15183 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
15184
15185 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
15186
15187 if (CONST_INT_P (mask))
15188 mask = fixup_modeless_constant (mask, HImode);
15189
15190 mask = force_reg (HImode, mask);
15191
15192 if (GET_MODE (mask) != HImode)
15193 mask = gen_rtx_SUBREG (HImode, mask, 0);
15194
15195 /* If merge is 0 then we're about to emit z-masked variant. */
15196 if (const0_operand (merge, mode))
15197 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
15198 /* If merge is the same as accum then emit merge-masked variant. */
15199 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
15200 {
15201 merge = force_reg (mode, merge);
15202 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
15203 }
15204 /* Merging with something unknown can happen if we z-mask with -O0. */
15205 else
15206 {
15207 target = gen_reg_rtx (mode);
15208 emit_move_insn (target, merge);
15209 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
15210 }
15211 }
15212 return target;
15213 }
15214
15215 case IX86_BUILTIN_4FNMASS:
15216 fcn = gen_avx5124fmaddps_4fnmaddss;
15217 masked = 0;
15218 goto s4fma_expand;
15219
15220 case IX86_BUILTIN_4FMASS:
15221 fcn = gen_avx5124fmaddps_4fmaddss;
15222 masked = 0;
15223 goto s4fma_expand;
15224
15225 case IX86_BUILTIN_4FNMASS_MASK:
15226 fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
15227 fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
15228 goto s4fma_expand;
15229
15230 case IX86_BUILTIN_4FMASS_MASK:
15231 {
15232 tree args[4];
15233 rtx ops[4];
15234 rtx wide_reg;
15235 rtx accum;
15236 rtx addr;
15237 rtx mem;
15238
15239 fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
15240 fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
15241
15242 s4fma_expand:
15243 mode = V4SFmode;
15244 wide_reg = gen_reg_rtx (V64SFmode);
15245 for (i = 0; i < 4; i++)
15246 {
15247 rtx tmp;
15248 args[i] = CALL_EXPR_ARG (exp, i);
15249 ops[i] = expand_normal (args[i]);
15250
15251 tmp = gen_reg_rtx (SFmode);
15252 emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
15253
15254 emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
15255 gen_rtx_SUBREG (V16SFmode, tmp, 0));
15256 }
15257
15258 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
15259 accum = force_reg (V4SFmode, accum);
15260
15261 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
15262 addr = force_reg (Pmode, addr);
15263
15264 mem = gen_rtx_MEM (V4SFmode, addr);
15265
15266 target = gen_reg_rtx (V4SFmode);
15267
15268 emit_move_insn (target, accum);
15269
15270 if (! masked)
15271 emit_insn (fcn (target, accum, wide_reg, mem));
15272 else
15273 {
15274 rtx merge, mask;
15275 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
15276
15277 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
15278
15279 if (CONST_INT_P (mask))
15280 mask = fixup_modeless_constant (mask, QImode);
15281
15282 mask = force_reg (QImode, mask);
15283
15284 if (GET_MODE (mask) != QImode)
15285 mask = gen_rtx_SUBREG (QImode, mask, 0);
15286
15287 /* If merge is 0 then we're about to emit z-masked variant. */
15288 if (const0_operand (merge, mode))
15289 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
15290 /* If merge is the same as accum then emit merge-masked
15291 variant. */
15292 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
15293 {
15294 merge = force_reg (mode, merge);
15295 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
15296 }
15297 /* Merging with something unknown can happen if we z-mask
15298 with -O0. */
15299 else
15300 {
15301 target = gen_reg_rtx (mode);
15302 emit_move_insn (target, merge);
15303 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
15304 }
15305 }
15306 return target;
15307 }
15308 case IX86_BUILTIN_RDPID:
15309 return ix86_expand_special_args_builtin (bdesc_args + i, exp,
15310 target);
15311 case IX86_BUILTIN_FABSQ:
15312 case IX86_BUILTIN_COPYSIGNQ:
15313 if (!TARGET_SSE)
15314 /* Emit a normal call if SSE isn't available. */
15315 return expand_call (exp, target, ignore);
15316 /* FALLTHRU */
15317 default:
15318 return ix86_expand_args_builtin (bdesc_args + i, exp, target);
15319 }
15320 }
15321
15322 if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
15323 && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
15324 {
15325 i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
15326 return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
15327 }
15328
15329 if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
15330 && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
15331 {
15332 i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
15333 return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
15334 }
15335
15336 if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
15337 && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
15338 {
15339 i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
15340 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
15341 }
15342
15343 if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
15344 && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
15345 {
15346 i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
15347 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
15348 }
15349
15350 if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
15351 && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
15352 {
15353 i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
15354 const struct builtin_description *d = bdesc_multi_arg + i;
15355 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
15356 (enum ix86_builtin_func_type)
15357 d->flag, d->comparison);
15358 }
15359
15360 if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
15361 && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
15362 {
15363 i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
15364 return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
15365 target);
15366 }
15367
15368 gcc_unreachable ();
15369 }
15370
15371 /* A subroutine of ix86_expand_vector_init_duplicate. Tries to
15372 fill target with val via vec_duplicate. */
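/* A minimal sketch of what this helper first tries to emit, assuming
   MODE has a recognizable vec_duplicate pattern (V4SFmode used purely
   as an example):

     (set (reg:V4SF target)
          (vec_duplicate:V4SF (reg:SF val)))

   If that insn is not recognized as-is, VAL is forced into a register
   of the inner mode and the SET_SRC is rewritten before recognizing
   again.  */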
15373
15374 static bool
15375 ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
15376 {
15377 bool ok;
15378 rtx_insn *insn;
15379 rtx dup;
15380 /* Save/restore recog_data in case this is called from splitters
15381 or other routines where recog_data needs to stay valid across
15382 force_reg. See PR106577. */
15383 recog_data_d recog_data_save = recog_data;
15384
15385 /* First attempt to recognize VAL as-is. */
15386 dup = gen_vec_duplicate (mode, val);
15387 insn = emit_insn (gen_rtx_SET (target, dup));
15388 if (recog_memoized (insn) < 0)
15389 {
15390 rtx_insn *seq;
15391 machine_mode innermode = GET_MODE_INNER (mode);
15392 rtx reg;
15393
15394 /* If that fails, force VAL into a register. */
15395
15396 start_sequence ();
15397 reg = force_reg (innermode, val);
15398 if (GET_MODE (reg) != innermode)
15399 reg = gen_lowpart (innermode, reg);
15400 SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
15401 seq = get_insns ();
15402 end_sequence ();
15403 if (seq)
15404 emit_insn_before (seq, insn);
15405
15406 ok = recog_memoized (insn) >= 0;
15407 gcc_assert (ok);
15408 }
15409 recog_data = recog_data_save;
15410 return true;
15411 }
15412
15413 /* Get a vector mode of the same size as the original but with elements
15414 twice as wide. This is only guaranteed to apply to integral vectors. */
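/* Illustrative expectation only (the asserts below check the real
   invariant): V16QImode should yield V8HImode, and V8HImode should
   yield V4SImode -- same total size, half as many, twice-as-wide
   elements.  */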
15415
15416 static machine_mode
15417 get_mode_wider_vector (machine_mode o)
15418 {
15419 /* ??? Rely on the ordering that genmodes.cc gives to vectors. */
15420 machine_mode n = GET_MODE_NEXT_MODE (o).require ();
15421 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
15422 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
15423 return n;
15424 }
15425
15426 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
15427 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
15428
15429 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
15430 with all elements equal to VAR. Return true if successful. */
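/* Illustrative example of a caller-visible effect (intrinsic named
   only as an example): expanding _mm_set1_epi16 (x) wants a V8HImode
   broadcast of X, which can become a single broadcast insn under
   AVX2, a shuffle-based broadcast under SSE2, or the widen-and-recurse
   fallback below otherwise.  */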
15431
15432 bool
15433 ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
15434 rtx target, rtx val)
15435 {
15436 bool ok;
15437
15438 switch (mode)
15439 {
15440 case E_V2SImode:
15441 case E_V2SFmode:
15442 if (!mmx_ok)
15443 return false;
15444 /* FALLTHRU */
15445
15446 case E_V4DFmode:
15447 case E_V4DImode:
15448 case E_V8SFmode:
15449 case E_V8SImode:
15450 case E_V2DFmode:
15451 case E_V2DImode:
15452 case E_V4SFmode:
15453 case E_V4SImode:
15454 case E_V16SImode:
15455 case E_V8DImode:
15456 case E_V16SFmode:
15457 case E_V8DFmode:
15458 return ix86_vector_duplicate_value (mode, target, val);
15459
15460 case E_V4HImode:
15461 if (!mmx_ok)
15462 return false;
15463 if (TARGET_SSE || TARGET_3DNOW_A)
15464 {
15465 rtx x;
15466
15467 val = gen_lowpart (SImode, val);
15468 x = gen_rtx_TRUNCATE (HImode, val);
15469 x = gen_rtx_VEC_DUPLICATE (mode, x);
15470 emit_insn (gen_rtx_SET (target, x));
15471 return true;
15472 }
15473 goto widen;
15474
15475 case E_V2HImode:
15476 if (TARGET_SSE2)
15477 {
15478 rtx x;
15479
15480 val = gen_lowpart (SImode, val);
15481 x = gen_rtx_TRUNCATE (HImode, val);
15482 x = gen_rtx_VEC_DUPLICATE (mode, x);
15483 emit_insn (gen_rtx_SET (target, x));
15484 return true;
15485 }
15486 return false;
15487
15488 case E_V8QImode:
15489 case E_V4QImode:
15490 if (!mmx_ok)
15491 return false;
15492 goto widen;
15493
15494 case E_V8HImode:
15495 case E_V8HFmode:
15496 case E_V8BFmode:
15497 if (TARGET_AVX2)
15498 return ix86_vector_duplicate_value (mode, target, val);
15499
15500 if (TARGET_SSE2)
15501 {
15502 struct expand_vec_perm_d dperm;
15503 rtx tmp1, tmp2;
15504
15505 permute:
15506 memset (&dperm, 0, sizeof (dperm));
15507 dperm.target = target;
15508 dperm.vmode = mode;
15509 dperm.nelt = GET_MODE_NUNITS (mode);
15510 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
15511 dperm.one_operand_p = true;
15512
15513 if (mode == V8HFmode || mode == V8BFmode)
15514 {
15515 tmp1 = force_reg (GET_MODE_INNER (mode), val);
15516 tmp2 = gen_reg_rtx (mode);
15517 emit_insn (gen_vec_set_0 (mode, tmp2, CONST0_RTX (mode), tmp1));
15518 tmp1 = gen_lowpart (mode, tmp2);
15519 }
15520 else
15521 {
15522 /* Extend to SImode using a paradoxical SUBREG. */
15523 tmp1 = gen_reg_rtx (SImode);
15524 emit_move_insn (tmp1, gen_lowpart (SImode, val));
15525
15526 /* Insert the SImode value as
15527 low element of a V4SImode vector. */
15528 tmp2 = gen_reg_rtx (V4SImode);
15529 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
15530 tmp1 = gen_lowpart (mode, tmp2);
15531 }
15532
15533 emit_move_insn (dperm.op0, tmp1);
15534 ok = (expand_vec_perm_1 (&dperm)
15535 || expand_vec_perm_broadcast_1 (&dperm));
15536 gcc_assert (ok);
15537 return ok;
15538 }
15539 goto widen;
15540
15541 case E_V16QImode:
15542 if (TARGET_AVX2)
15543 return ix86_vector_duplicate_value (mode, target, val);
15544
15545 if (TARGET_SSE2)
15546 goto permute;
15547 goto widen;
15548
15549 widen:
15550 /* Replicate the value once into the next wider mode and recurse. */
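/* Worked example of one widening step, assuming QImode elements and
   VAL == 0xab: the insv/shift-and-IOR below forms 0xabab in HImode,
   the recursive call broadcasts that into the vector with
   twice-as-wide elements, and the result is reinterpreted in the
   original mode via gen_lowpart.  */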
15551 {
15552 machine_mode smode, wsmode, wvmode;
15553 rtx x;
15554
15555 smode = GET_MODE_INNER (mode);
15556 wvmode = get_mode_wider_vector (mode);
15557 wsmode = GET_MODE_INNER (wvmode);
15558
15559 val = convert_modes (wsmode, smode, val, true);
15560
15561 if (smode == QImode && !TARGET_PARTIAL_REG_STALL)
15562 emit_insn (gen_insv_1 (wsmode, val, val));
15563 else
15564 {
15565 x = expand_simple_binop (wsmode, ASHIFT, val,
15566 GEN_INT (GET_MODE_BITSIZE (smode)),
15567 NULL_RTX, 1, OPTAB_LIB_WIDEN);
15568 val = expand_simple_binop (wsmode, IOR, val, x, x, 1,
15569 OPTAB_LIB_WIDEN);
15570 }
15571
15572 x = gen_reg_rtx (wvmode);
15573 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
15574 gcc_assert (ok);
15575 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
15576 return ok;
15577 }
15578
15579 case E_V16HImode:
15580 case E_V16HFmode:
15581 case E_V16BFmode:
15582 case E_V32QImode:
15583 if (TARGET_AVX2)
15584 return ix86_vector_duplicate_value (mode, target, val);
15585 else
15586 {
15587 machine_mode hvmode;
15588 switch (mode)
15589 {
15590 case V16HImode:
15591 hvmode = V8HImode;
15592 break;
15593 case V16HFmode:
15594 hvmode = V8HFmode;
15595 break;
15596 case V16BFmode:
15597 hvmode = V8BFmode;
15598 break;
15599 case V32QImode:
15600 hvmode = V16QImode;
15601 break;
15602 default:
15603 gcc_unreachable ();
15604 }
15605 rtx x = gen_reg_rtx (hvmode);
15606
15607 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
15608 gcc_assert (ok);
15609
15610 x = gen_rtx_VEC_CONCAT (mode, x, x);
15611 emit_insn (gen_rtx_SET (target, x));
15612 }
15613 return true;
15614
15615 case E_V32HImode:
15616 case E_V32HFmode:
15617 case E_V32BFmode:
15618 case E_V64QImode:
15619 if (TARGET_AVX512BW)
15620 return ix86_vector_duplicate_value (mode, target, val);
15621 else
15622 {
15623 machine_mode hvmode;
15624 switch (mode)
15625 {
15626 case V32HImode:
15627 hvmode = V16HImode;
15628 break;
15629 case V32HFmode:
15630 hvmode = V16HFmode;
15631 break;
15632 case V32BFmode:
15633 hvmode = V16BFmode;
15634 break;
15635 case V64QImode:
15636 hvmode = V32QImode;
15637 break;
15638 default:
15639 gcc_unreachable ();
15640 }
15641 rtx x = gen_reg_rtx (hvmode);
15642
15643 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
15644 gcc_assert (ok);
15645
15646 x = gen_rtx_VEC_CONCAT (mode, x, x);
15647 emit_insn (gen_rtx_SET (target, x));
15648 }
15649 return true;
15650
15651 default:
15652 return false;
15653 }
15654 }
15655
15656 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
15657 whose ONE_VAR element is VAR, and other elements are zero. Return true
15658 if successful. */
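/* For instance, for V4SFmode and ONE_VAR == 2 the goal is the vector
   { 0, 0, var, 0 }; depending on the ISA this is done with a single
   vector-set insn, a VEC_MERGE of a vec_duplicate, or a shuffle of
   the zero-extended scalar, as selected below.  */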
15659
15660 static bool
15661 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
15662 rtx target, rtx var, int one_var)
15663 {
15664 machine_mode vsimode;
15665 rtx new_target;
15666 rtx x, tmp;
15667 bool use_vector_set = false;
15668 rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;
15669
15670 switch (mode)
15671 {
15672 case E_V2DImode:
15673 /* For SSE4.1, we normally use vector set. But if the second
15674 element is zero and inter-unit moves are OK, we use movq
15675 instead. */
15676 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
15677 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
15678 && one_var == 0));
15679 break;
15680 case E_V16QImode:
15681 case E_V4SImode:
15682 case E_V4SFmode:
15683 use_vector_set = TARGET_SSE4_1;
15684 break;
15685 case E_V8HImode:
15686 use_vector_set = TARGET_SSE2;
15687 gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0
15688 ? gen_vec_setv8hi_0 : NULL;
15689 break;
15690 case E_V8QImode:
15691 use_vector_set = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
15692 break;
15693 case E_V4HImode:
15694 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
15695 break;
15696 case E_V4QImode:
15697 use_vector_set = TARGET_SSE4_1;
15698 break;
15699 case E_V32QImode:
15700 use_vector_set = TARGET_AVX;
15701 break;
15702 case E_V16HImode:
15703 use_vector_set = TARGET_AVX;
15704 gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0
15705 ? gen_vec_setv16hi_0 : NULL;
15706 break;
15707 case E_V8SImode:
15708 use_vector_set = TARGET_AVX;
15709 gen_vec_set_0 = gen_vec_setv8si_0;
15710 break;
15711 case E_V8SFmode:
15712 use_vector_set = TARGET_AVX;
15713 gen_vec_set_0 = gen_vec_setv8sf_0;
15714 break;
15715 case E_V4DFmode:
15716 use_vector_set = TARGET_AVX;
15717 gen_vec_set_0 = gen_vec_setv4df_0;
15718 break;
15719 case E_V4DImode:
15720 /* Use ix86_expand_vector_set in 64bit mode only. */
15721 use_vector_set = TARGET_AVX && TARGET_64BIT;
15722 gen_vec_set_0 = gen_vec_setv4di_0;
15723 break;
15724 case E_V16SImode:
15725 use_vector_set = TARGET_AVX512F && one_var == 0;
15726 gen_vec_set_0 = gen_vec_setv16si_0;
15727 break;
15728 case E_V16SFmode:
15729 use_vector_set = TARGET_AVX512F && one_var == 0;
15730 gen_vec_set_0 = gen_vec_setv16sf_0;
15731 break;
15732 case E_V8DFmode:
15733 use_vector_set = TARGET_AVX512F && one_var == 0;
15734 gen_vec_set_0 = gen_vec_setv8df_0;
15735 break;
15736 case E_V8DImode:
15737 /* Use ix86_expand_vector_set in 64bit mode only. */
15738 use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
15739 gen_vec_set_0 = gen_vec_setv8di_0;
15740 break;
15741 case E_V8HFmode:
15742 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15743 gen_vec_set_0 = gen_vec_setv8hf_0;
15744 break;
15745 case E_V16HFmode:
15746 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15747 gen_vec_set_0 = gen_vec_setv16hf_0;
15748 break;
15749 case E_V32HFmode:
15750 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15751 gen_vec_set_0 = gen_vec_setv32hf_0;
15752 break;
15753 case E_V8BFmode:
15754 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15755 gen_vec_set_0 = gen_vec_setv8bf_0;
15756 break;
15757 case E_V16BFmode:
15758 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15759 gen_vec_set_0 = gen_vec_setv16bf_0;
15760 break;
15761 case E_V32BFmode:
15762 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15763 gen_vec_set_0 = gen_vec_setv32bf_0;
15764 break;
15765 case E_V32HImode:
15766 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15767 gen_vec_set_0 = gen_vec_setv32hi_0;
break;
15768 default:
15769 break;
15770 }
15771
15772 if (use_vector_set)
15773 {
15774 if (gen_vec_set_0 && one_var == 0)
15775 {
15776 var = force_reg (GET_MODE_INNER (mode), var);
15777 emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
15778 return true;
15779 }
15780 emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
15781 var = force_reg (GET_MODE_INNER (mode), var);
15782 ix86_expand_vector_set (mmx_ok, target, var, one_var);
15783 return true;
15784 }
15785
15786 switch (mode)
15787 {
15788 case E_V2SFmode:
15789 case E_V2SImode:
15790 if (!mmx_ok)
15791 return false;
15792 /* FALLTHRU */
15793
15794 case E_V2DFmode:
15795 case E_V2DImode:
15796 if (one_var != 0)
15797 return false;
15798 var = force_reg (GET_MODE_INNER (mode), var);
15799 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
15800 emit_insn (gen_rtx_SET (target, x));
15801 return true;
15802
15803 case E_V4SFmode:
15804 case E_V4SImode:
15805 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
15806 new_target = gen_reg_rtx (mode);
15807 else
15808 new_target = target;
15809 var = force_reg (GET_MODE_INNER (mode), var);
15810 x = gen_rtx_VEC_DUPLICATE (mode, var);
15811 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
15812 emit_insn (gen_rtx_SET (new_target, x));
15813 if (one_var != 0)
15814 {
15815 /* We need to shuffle the value to the correct position, so
15816 create a new pseudo to store the intermediate result. */
15817
15818 /* With SSE2, we can use the integer shuffle insns. */
15819 if (mode != V4SFmode && TARGET_SSE2)
15820 {
15821 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
15822 const1_rtx,
15823 GEN_INT (one_var == 1 ? 0 : 1),
15824 GEN_INT (one_var == 2 ? 0 : 1),
15825 GEN_INT (one_var == 3 ? 0 : 1)));
15826 if (target != new_target)
15827 emit_move_insn (target, new_target);
15828 return true;
15829 }
15830
15831 /* Otherwise convert the intermediate result to V4SFmode and
15832 use the SSE1 shuffle instructions. */
15833 if (mode != V4SFmode)
15834 {
15835 tmp = gen_reg_rtx (V4SFmode);
15836 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
15837 }
15838 else
15839 tmp = new_target;
15840
15841 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
15842 const1_rtx,
15843 GEN_INT (one_var == 1 ? 0 : 1),
15844 GEN_INT (one_var == 2 ? 0+4 : 1+4),
15845 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
15846
15847 if (mode != V4SFmode)
15848 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
15849 else if (tmp != target)
15850 emit_move_insn (target, tmp);
15851 }
15852 else if (target != new_target)
15853 emit_move_insn (target, new_target);
15854 return true;
15855
15856 case E_V8HImode:
15857 case E_V16QImode:
15858 vsimode = V4SImode;
15859 goto widen;
15860 case E_V4HImode:
15861 case E_V8QImode:
15862 if (!mmx_ok)
15863 return false;
15864 vsimode = V2SImode;
15865 goto widen;
15866 widen:
15867 if (one_var != 0)
15868 return false;
15869
15870 /* Zero extend the variable element to SImode and recurse. */
15871 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
15872
15873 x = gen_reg_rtx (vsimode);
15874 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
15875 var, one_var))
15876 gcc_unreachable ();
15877
15878 emit_move_insn (target, gen_lowpart (mode, x));
15879 return true;
15880
15881 default:
15882 return false;
15883 }
15884 }
15885
15886 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
15887 consisting of the values in VALS. It is known that all elements
15888 except ONE_VAR are constants. Return true if successful. */
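/* E.g. (values chosen only to illustrate the shape of the
   transformation) for { 1, 2, x, 4 } in V4SImode the constant vector
   { 1, 2, 0, 4 } is loaded first and element 2 is then overwritten
   with X via ix86_expand_vector_set.  */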
15889
15890 static bool
15891 ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
15892 rtx target, rtx vals, int one_var)
15893 {
15894 rtx var = XVECEXP (vals, 0, one_var);
15895 machine_mode wmode;
15896 rtx const_vec, x;
15897
15898 const_vec = copy_rtx (vals);
15899 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
15900 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
15901
15902 switch (mode)
15903 {
15904 case E_V2DFmode:
15905 case E_V2DImode:
15906 case E_V2SFmode:
15907 case E_V2SImode:
15908 /* For the two element vectors, it's just as easy to use
15909 the general case. */
15910 return false;
15911
15912 case E_V4DImode:
15913 /* Use ix86_expand_vector_set in 64bit mode only. */
15914 if (!TARGET_64BIT)
15915 return false;
15916 /* FALLTHRU */
15917 case E_V8HFmode:
15918 case E_V16HFmode:
15919 case E_V8BFmode:
15920 case E_V16BFmode:
15921 case E_V4DFmode:
15922 case E_V8SFmode:
15923 case E_V8SImode:
15924 case E_V16HImode:
15925 case E_V32QImode:
15926 case E_V4SFmode:
15927 case E_V4SImode:
15928 case E_V8HImode:
15929 case E_V4HImode:
15930 break;
15931
15932 case E_V16QImode:
15933 if (TARGET_SSE4_1)
15934 break;
15935 wmode = V8HImode;
15936 goto widen;
15937 case E_V8QImode:
15938 if (TARGET_MMX_WITH_SSE && TARGET_SSE4_1)
15939 break;
15940 wmode = V4HImode;
15941 goto widen;
15942 case E_V4QImode:
15943 if (TARGET_SSE4_1)
15944 break;
15945 wmode = V2HImode;
15946 widen:
15947 /* There's no way to set one QImode entry easily. Combine
15948 the variable value with its adjacent constant value, and
15949 promote to an HImode set. */
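/* Sketch for ONE_VAR == 5 (odd) with constant neighbor C at index 4:
   VAR is zero-extended to HImode, shifted left by 8 and IORed with
   C & 0xff, and the combined HImode value replaces element
   5 >> 1 == 2 of the wider-element view of the constant vector.  */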
15950 x = XVECEXP (vals, 0, one_var ^ 1);
15951 if (one_var & 1)
15952 {
15953 var = convert_modes (HImode, QImode, var, true);
15954 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
15955 NULL_RTX, 1, OPTAB_LIB_WIDEN);
15956 x = GEN_INT (INTVAL (x) & 0xff);
15957 }
15958 else
15959 {
15960 var = convert_modes (HImode, QImode, var, true);
15961 x = gen_int_mode (UINTVAL (x) << 8, HImode);
15962 }
15963 if (x != const0_rtx)
15964 var = expand_simple_binop (HImode, IOR, var, x, var,
15965 1, OPTAB_LIB_WIDEN);
15966
15967 x = gen_reg_rtx (wmode);
15968 emit_move_insn (x, gen_lowpart (wmode, const_vec));
15969 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
15970
15971 emit_move_insn (target, gen_lowpart (mode, x));
15972 return true;
15973
15974 default:
15975 return false;
15976 }
15977
15978 emit_move_insn (target, const_vec);
15979 ix86_expand_vector_set (mmx_ok, target, var, one_var);
15980 return true;
15981 }
15982
15983 /* A subroutine of ix86_expand_vector_init_general. Use vector
15984 concatenate to handle the most general case: all values variable,
15985 and none identical. */
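/* Conceptual shape of the expansion for N == 4 in V4SImode (built
   recursively, and backwards as noted below):

     half[1] = { ops[2], ops[3] }		-- V2SImode
     half[0] = { ops[0], ops[1] }		-- V2SImode
     target  = vec_concat (half[0], half[1])  */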
15986
15987 static void
15988 ix86_expand_vector_init_concat (machine_mode mode,
15989 rtx target, rtx *ops, int n)
15990 {
15991 machine_mode half_mode = VOIDmode;
15992 rtx half[2];
15993 rtvec v;
15994 int i, j;
15995
15996 switch (n)
15997 {
15998 case 2:
15999 switch (mode)
16000 {
16001 case E_V32HFmode:
16002 half_mode = V16HFmode;
16003 break;
16004 case E_V32BFmode:
16005 half_mode = V16BFmode;
16006 break;
16007 case E_V16SImode:
16008 half_mode = V8SImode;
16009 break;
16010 case E_V16SFmode:
16011 half_mode = V8SFmode;
16012 break;
16013 case E_V8DImode:
16014 half_mode = V4DImode;
16015 break;
16016 case E_V8DFmode:
16017 half_mode = V4DFmode;
16018 break;
16019 case E_V16HFmode:
16020 half_mode = V8HFmode;
16021 break;
16022 case E_V16BFmode:
16023 half_mode = V8BFmode;
16024 break;
16025 case E_V8SImode:
16026 half_mode = V4SImode;
16027 break;
16028 case E_V8SFmode:
16029 half_mode = V4SFmode;
16030 break;
16031 case E_V4DImode:
16032 half_mode = V2DImode;
16033 break;
16034 case E_V4DFmode:
16035 half_mode = V2DFmode;
16036 break;
16037 case E_V4SImode:
16038 half_mode = V2SImode;
16039 break;
16040 case E_V4SFmode:
16041 half_mode = V2SFmode;
16042 break;
16043 case E_V2DImode:
16044 half_mode = DImode;
16045 break;
16046 case E_V2SImode:
16047 half_mode = SImode;
16048 break;
16049 case E_V2DFmode:
16050 half_mode = DFmode;
16051 break;
16052 case E_V2SFmode:
16053 half_mode = SFmode;
16054 break;
16055 default:
16056 gcc_unreachable ();
16057 }
16058
16059 if (!register_operand (ops[1], half_mode))
16060 ops[1] = force_reg (half_mode, ops[1]);
16061 if (!register_operand (ops[0], half_mode))
16062 ops[0] = force_reg (half_mode, ops[0]);
16063 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
16064 ops[1])));
16065 break;
16066
16067 case 4:
16068 switch (mode)
16069 {
16070 case E_V4DImode:
16071 half_mode = V2DImode;
16072 break;
16073 case E_V4DFmode:
16074 half_mode = V2DFmode;
16075 break;
16076 case E_V4SImode:
16077 half_mode = V2SImode;
16078 break;
16079 case E_V4SFmode:
16080 half_mode = V2SFmode;
16081 break;
16082 default:
16083 gcc_unreachable ();
16084 }
16085 goto half;
16086
16087 case 8:
16088 switch (mode)
16089 {
16090 case E_V8DImode:
16091 half_mode = V4DImode;
16092 break;
16093 case E_V8DFmode:
16094 half_mode = V4DFmode;
16095 break;
16096 case E_V8SImode:
16097 half_mode = V4SImode;
16098 break;
16099 case E_V8SFmode:
16100 half_mode = V4SFmode;
16101 break;
16102 default:
16103 gcc_unreachable ();
16104 }
16105 goto half;
16106
16107 case 16:
16108 switch (mode)
16109 {
16110 case E_V16SImode:
16111 half_mode = V8SImode;
16112 break;
16113 case E_V16SFmode:
16114 half_mode = V8SFmode;
16115 break;
16116 default:
16117 gcc_unreachable ();
16118 }
16119 goto half;
16120
16121 half:
16122 /* FIXME: We process inputs backward to help RA. PR 36222. */
16123 i = n - 1;
16124 for (j = 1; j != -1; j--)
16125 {
16126 half[j] = gen_reg_rtx (half_mode);
16127 switch (n >> 1)
16128 {
16129 case 2:
16130 v = gen_rtvec (2, ops[i-1], ops[i]);
16131 i -= 2;
16132 break;
16133 case 4:
16134 v = gen_rtvec (4, ops[i-3], ops[i-2], ops[i-1], ops[i]);
16135 i -= 4;
16136 break;
16137 case 8:
16138 v = gen_rtvec (8, ops[i-7], ops[i-6], ops[i-5], ops[i-4],
16139 ops[i-3], ops[i-2], ops[i-1], ops[i]);
16140 i -= 8;
16141 break;
16142 default:
16143 gcc_unreachable ();
16144 }
16145 ix86_expand_vector_init (false, half[j],
16146 gen_rtx_PARALLEL (half_mode, v));
16147 }
16148
16149 ix86_expand_vector_init_concat (mode, target, half, 2);
16150 break;
16151
16152 default:
16153 gcc_unreachable ();
16154 }
16155 }
16156
16157 /* A subroutine of ix86_expand_vector_init_general. Use vector
16158 interleave to handle the most general case: all values variable,
16159 and none identical. */
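/* Rough shape of the expansion for V8HImode (four pairs of scalars):
   each pair is packed into the low two elements of a vector, adjacent
   vectors are combined with a low V4SImode interleave, and the two
   results are combined with a low V2DImode interleave, leaving all
   eight elements in order.  (Instruction choice is indicative; the
   exact patterns are picked below.)  */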
16160
16161 static void
16162 ix86_expand_vector_init_interleave (machine_mode mode,
16163 rtx target, rtx *ops, int n)
16164 {
16165 machine_mode first_imode, second_imode, third_imode, inner_mode;
16166 int i, j;
16167 rtx op, op0, op1;
16168 rtx (*gen_load_even) (rtx, rtx, rtx);
16169 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
16170 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
16171
16172 switch (mode)
16173 {
16174 case E_V8HFmode:
16175 gen_load_even = gen_vec_interleave_lowv8hf;
16176 gen_interleave_first_low = gen_vec_interleave_lowv4si;
16177 gen_interleave_second_low = gen_vec_interleave_lowv2di;
16178 inner_mode = HFmode;
16179 first_imode = V4SImode;
16180 second_imode = V2DImode;
16181 third_imode = VOIDmode;
16182 break;
16183 case E_V8BFmode:
16184 gen_load_even = gen_vec_interleave_lowv8bf;
16185 gen_interleave_first_low = gen_vec_interleave_lowv4si;
16186 gen_interleave_second_low = gen_vec_interleave_lowv2di;
16187 inner_mode = BFmode;
16188 first_imode = V4SImode;
16189 second_imode = V2DImode;
16190 third_imode = VOIDmode;
16191 break;
16192 case E_V8HImode:
16193 gen_load_even = gen_vec_setv8hi;
16194 gen_interleave_first_low = gen_vec_interleave_lowv4si;
16195 gen_interleave_second_low = gen_vec_interleave_lowv2di;
16196 inner_mode = HImode;
16197 first_imode = V4SImode;
16198 second_imode = V2DImode;
16199 third_imode = VOIDmode;
16200 break;
16201 case E_V16QImode:
16202 gen_load_even = gen_vec_setv16qi;
16203 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
16204 gen_interleave_second_low = gen_vec_interleave_lowv4si;
16205 inner_mode = QImode;
16206 first_imode = V8HImode;
16207 second_imode = V4SImode;
16208 third_imode = V2DImode;
16209 break;
16210 default:
16211 gcc_unreachable ();
16212 }
16213
16214 for (i = 0; i < n; i++)
16215 {
16216 op = ops [i + i];
16217 if (inner_mode == HFmode || inner_mode == BFmode)
16218 {
16219 rtx even, odd;
16220 /* Use vpunpcklwd to pack two HFmode or BFmode elements. */
16221 machine_mode vec_mode =
16222 (inner_mode == HFmode) ? V8HFmode : V8BFmode;
16223 op0 = gen_reg_rtx (vec_mode);
16224 even = lowpart_subreg (vec_mode,
16225 force_reg (inner_mode, op), inner_mode);
16226 odd = lowpart_subreg (vec_mode,
16227 force_reg (inner_mode, ops[i + i + 1]),
16228 inner_mode);
16229 emit_insn (gen_load_even (op0, even, odd));
16230 }
16231 else
16232 {
16233 /* Extend the odd element to SImode using a paradoxical SUBREG. */
16234 op0 = gen_reg_rtx (SImode);
16235 emit_move_insn (op0, gen_lowpart (SImode, op));
16236
16237 /* Insert the SImode value as low element of V4SImode vector. */
16238 op1 = gen_reg_rtx (V4SImode);
16239 op0 = gen_rtx_VEC_MERGE (V4SImode,
16240 gen_rtx_VEC_DUPLICATE (V4SImode,
16241 op0),
16242 CONST0_RTX (V4SImode),
16243 const1_rtx);
16244 emit_insn (gen_rtx_SET (op1, op0));
16245
16246 /* Cast the V4SImode vector back to a vector in the original mode. */
16247 op0 = gen_reg_rtx (mode);
16248 emit_move_insn (op0, gen_lowpart (mode, op1));
16249
16250 /* Load even elements into the second position. */
16251 emit_insn (gen_load_even (op0,
16252 force_reg (inner_mode,
16253 ops[i + i + 1]),
16254 const1_rtx));
16255 }
16256
16257 /* Cast vector to FIRST_IMODE vector. */
16258 ops[i] = gen_reg_rtx (first_imode);
16259 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
16260 }
16261
16262 /* Interleave low FIRST_IMODE vectors. */
16263 for (i = j = 0; i < n; i += 2, j++)
16264 {
16265 op0 = gen_reg_rtx (first_imode);
16266 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
16267
16268 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
16269 ops[j] = gen_reg_rtx (second_imode);
16270 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
16271 }
16272
16273 /* Interleave low SECOND_IMODE vectors. */
16274 switch (second_imode)
16275 {
16276 case E_V4SImode:
16277 for (i = j = 0; i < n / 2; i += 2, j++)
16278 {
16279 op0 = gen_reg_rtx (second_imode);
16280 emit_insn (gen_interleave_second_low (op0, ops[i],
16281 ops[i + 1]));
16282
16283 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
16284 vector. */
16285 ops[j] = gen_reg_rtx (third_imode);
16286 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
16287 }
16288 second_imode = V2DImode;
16289 gen_interleave_second_low = gen_vec_interleave_lowv2di;
16290 /* FALLTHRU */
16291
16292 case E_V2DImode:
16293 op0 = gen_reg_rtx (second_imode);
16294 emit_insn (gen_interleave_second_low (op0, ops[0],
16295 ops[1]));
16296
16297 /* Cast the SECOND_IMODE vector back to a vector in the original
16298 mode. */
16299 emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
16300 break;
16301
16302 default:
16303 gcc_unreachable ();
16304 }
16305 }
16306
16307 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
16308 all values variable, and none identical. */
16309
16310 static void
16311 ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
16312 rtx target, rtx vals)
16313 {
16314 rtx ops[64], op0, op1, op2, op3, op4, op5;
16315 machine_mode half_mode = VOIDmode;
16316 machine_mode quarter_mode = VOIDmode;
16317 int n, i;
16318
16319 switch (mode)
16320 {
16321 case E_V2SFmode:
16322 case E_V2SImode:
16323 if (!mmx_ok && !TARGET_SSE)
16324 break;
16325 /* FALLTHRU */
16326
16327 case E_V16SImode:
16328 case E_V16SFmode:
16329 case E_V8DFmode:
16330 case E_V8DImode:
16331 case E_V8SFmode:
16332 case E_V8SImode:
16333 case E_V4DFmode:
16334 case E_V4DImode:
16335 case E_V4SFmode:
16336 case E_V4SImode:
16337 case E_V2DFmode:
16338 case E_V2DImode:
16339 n = GET_MODE_NUNITS (mode);
16340 for (i = 0; i < n; i++)
16341 ops[i] = XVECEXP (vals, 0, i);
16342 ix86_expand_vector_init_concat (mode, target, ops, n);
16343 return;
16344
16345 case E_V2TImode:
16346 for (i = 0; i < 2; i++)
16347 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
16348 op0 = gen_reg_rtx (V4DImode);
16349 ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
16350 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
16351 return;
16352
16353 case E_V4TImode:
16354 for (i = 0; i < 4; i++)
16355 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
16356 ops[4] = gen_reg_rtx (V4DImode);
16357 ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
16358 ops[5] = gen_reg_rtx (V4DImode);
16359 ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
16360 op0 = gen_reg_rtx (V8DImode);
16361 ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
16362 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
16363 return;
16364
16365 case E_V32QImode:
16366 half_mode = V16QImode;
16367 goto half;
16368
16369 case E_V16HImode:
16370 half_mode = V8HImode;
16371 goto half;
16372
16373 case E_V16HFmode:
16374 half_mode = V8HFmode;
16375 goto half;
16376
16377 case E_V16BFmode:
16378 half_mode = V8BFmode;
16379 goto half;
16380
16381 half:
16382 n = GET_MODE_NUNITS (mode);
16383 for (i = 0; i < n; i++)
16384 ops[i] = XVECEXP (vals, 0, i);
16385 op0 = gen_reg_rtx (half_mode);
16386 op1 = gen_reg_rtx (half_mode);
16387 ix86_expand_vector_init_interleave (half_mode, op0, ops,
16388 n >> 2);
16389 ix86_expand_vector_init_interleave (half_mode, op1,
16390 &ops [n >> 1], n >> 2);
16391 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
16392 return;
16393
16394 case E_V64QImode:
16395 quarter_mode = V16QImode;
16396 half_mode = V32QImode;
16397 goto quarter;
16398
16399 case E_V32HImode:
16400 quarter_mode = V8HImode;
16401 half_mode = V16HImode;
16402 goto quarter;
16403
16404 case E_V32HFmode:
16405 quarter_mode = V8HFmode;
16406 half_mode = V16HFmode;
16407 goto quarter;
16408
16409 case E_V32BFmode:
16410 quarter_mode = V8BFmode;
16411 half_mode = V16BFmode;
16412 goto quarter;
16413
16414 quarter:
16415 n = GET_MODE_NUNITS (mode);
16416 for (i = 0; i < n; i++)
16417 ops[i] = XVECEXP (vals, 0, i);
16418 op0 = gen_reg_rtx (quarter_mode);
16419 op1 = gen_reg_rtx (quarter_mode);
16420 op2 = gen_reg_rtx (quarter_mode);
16421 op3 = gen_reg_rtx (quarter_mode);
16422 op4 = gen_reg_rtx (half_mode);
16423 op5 = gen_reg_rtx (half_mode);
16424 ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
16425 n >> 3);
16426 ix86_expand_vector_init_interleave (quarter_mode, op1,
16427 &ops [n >> 2], n >> 3);
16428 ix86_expand_vector_init_interleave (quarter_mode, op2,
16429 &ops [n >> 1], n >> 3);
16430 ix86_expand_vector_init_interleave (quarter_mode, op3,
16431 &ops [(n >> 1) | (n >> 2)], n >> 3);
16432 emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
16433 emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
16434 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
16435 return;
16436
16437 case E_V16QImode:
16438 if (!TARGET_SSE4_1)
16439 break;
16440 /* FALLTHRU */
16441
16442 case E_V8HImode:
16443 if (!TARGET_SSE2)
16444 break;
16445
16446 /* Don't use ix86_expand_vector_init_interleave if we can't
16447 move from GPR to SSE register directly. */
16448 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
16449 break;
16450 /* FALLTHRU */
16451
16452 case E_V8HFmode:
16453 case E_V8BFmode:
16454
16455 n = GET_MODE_NUNITS (mode);
16456 for (i = 0; i < n; i++)
16457 ops[i] = XVECEXP (vals, 0, i);
16458 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
16459 return;
16460
16461 case E_V4HImode:
16462 case E_V8QImode:
16463
16464 case E_V2HImode:
16465 case E_V4QImode:
16466 break;
16467
16468 default:
16469 gcc_unreachable ();
16470 }
16471
16472 {
16473 int i, j, n_elts, n_words, n_elt_per_word;
16474 machine_mode tmp_mode, inner_mode;
16475 rtx words[4], shift;
16476
16477 tmp_mode = (GET_MODE_SIZE (mode) < UNITS_PER_WORD) ? SImode : word_mode;
16478
16479 inner_mode = GET_MODE_INNER (mode);
16480 n_elts = GET_MODE_NUNITS (mode);
16481 n_words = GET_MODE_SIZE (mode) / GET_MODE_SIZE (tmp_mode);
16482 n_elt_per_word = n_elts / n_words;
16483 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
16484
16485 for (i = 0; i < n_words; ++i)
16486 {
16487 rtx word = NULL_RTX;
16488
16489 for (j = 0; j < n_elt_per_word; ++j)
16490 {
16491 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
16492 elt = convert_modes (tmp_mode, inner_mode, elt, true);
16493
16494 if (j == 0)
16495 word = elt;
16496 else
16497 {
16498 word = expand_simple_binop (tmp_mode, ASHIFT, word, shift,
16499 NULL_RTX, 1, OPTAB_LIB_WIDEN);
16500 word = expand_simple_binop (tmp_mode, IOR, word, elt,
16501 NULL_RTX, 1, OPTAB_LIB_WIDEN);
16502 }
16503 }
16504
16505 words[i] = word;
16506 }
16507
16508 if (n_words == 1)
16509 emit_move_insn (target, gen_lowpart (mode, words[0]));
16510 else if (n_words == 2)
16511 {
16512 gcc_assert (tmp_mode == DImode || tmp_mode == SImode);
16513 machine_mode concat_mode = tmp_mode == DImode ? V2DImode : V2SImode;
16514 rtx tmp = gen_reg_rtx (concat_mode);
16515 vals = gen_rtx_PARALLEL (concat_mode, gen_rtvec_v (2, words));
16516 ix86_expand_vector_init_general (mmx_ok, concat_mode, tmp, vals);
16517 emit_move_insn (target, gen_lowpart (mode, tmp));
16518 }
16519 else if (n_words == 4)
16520 {
16521 rtx tmp = gen_reg_rtx (V4SImode);
16522 gcc_assert (tmp_mode == SImode);
16523 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
16524 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
16525 emit_move_insn (target, gen_lowpart (mode, tmp));
16526 }
16527 else
16528 gcc_unreachable ();
16529 }
16530 }
16531
16532 /* Initialize vector TARGET via VALS. Suppress the use of MMX
16533 instructions unless MMX_OK is true. */
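/* Rough decision order used below: an all-constant initializer is
   loaded from the constant pool, an all-equal initializer is
   broadcast, a single variable element is patched into an otherwise
   constant vector, and anything else goes through the fully general
   expansion.  For example (hypothetical source), __v4si v = { a, 1,
   2, 3 } takes the one-variable path when A is not constant.  */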
16534
16535 void
16536 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
16537 {
16538 machine_mode mode = GET_MODE (target);
16539 machine_mode inner_mode = GET_MODE_INNER (mode);
16540 int n_elts = GET_MODE_NUNITS (mode);
16541 int n_var = 0, one_var = -1;
16542 bool all_same = true, all_const_zero = true;
16543 int i;
16544 rtx x;
16545
16546 /* First, handle initialization from vector elts. */
16547 if (n_elts != XVECLEN (vals, 0))
16548 {
16549 rtx subtarget = target;
16550 x = XVECEXP (vals, 0, 0);
16551 gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
16552 if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
16553 {
16554 rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
16555 if (inner_mode == QImode
16556 || inner_mode == HImode
16557 || inner_mode == TImode
16558 || inner_mode == HFmode
16559 || inner_mode == BFmode)
16560 {
16561 unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
16562 scalar_mode elt_mode = inner_mode == TImode ? DImode : SImode;
16563 n_bits /= GET_MODE_SIZE (elt_mode);
16564 mode = mode_for_vector (elt_mode, n_bits).require ();
16565 inner_mode = mode_for_vector (elt_mode, n_bits / 2).require ();
16566 ops[0] = gen_lowpart (inner_mode, ops[0]);
16567 ops[1] = gen_lowpart (inner_mode, ops[1]);
16568 subtarget = gen_reg_rtx (mode);
16569 }
16570 ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
16571 if (subtarget != target)
16572 emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
16573 return;
16574 }
16575 gcc_unreachable ();
16576 }
16577
16578 for (i = 0; i < n_elts; ++i)
16579 {
16580 x = XVECEXP (vals, 0, i);
16581 if (!(CONST_SCALAR_INT_P (x)
16582 || CONST_DOUBLE_P (x)
16583 || CONST_FIXED_P (x)))
16584 n_var++, one_var = i;
16585 else if (x != CONST0_RTX (inner_mode))
16586 all_const_zero = false;
16587 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
16588 all_same = false;
16589 }
16590
16591 /* Constants are best loaded from the constant pool. */
16592 if (n_var == 0)
16593 {
16594 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
16595 return;
16596 }
16597
16598 /* If all values are identical, broadcast the value. */
16599 if (all_same
16600 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
16601 XVECEXP (vals, 0, 0)))
16602 return;
16603
16604 /* Values where only one field is non-constant are best loaded from
16605 the pool and overwritten via move later. */
16606 if (n_var == 1)
16607 {
16608 if (all_const_zero
16609 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
16610 XVECEXP (vals, 0, one_var),
16611 one_var))
16612 return;
16613
16614 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
16615 return;
16616 }
16617
16618 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
16619 }
16620
16621 /* Implemented as
16622 V setg (V v, int idx, T val)
16623 {
16624 V idxv = (V){idx, idx, idx, idx, idx, idx, idx, idx};
16625 V valv = (V){val, val, val, val, val, val, val, val};
16626 V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == idxv);
16627 v = (v & ~mask) | (valv & mask);
16628 return v;
16629 }. */
16630 void
16631 ix86_expand_vector_set_var (rtx target, rtx val, rtx idx)
16632 {
16633 rtx vec[64];
16634 machine_mode mode = GET_MODE (target);
16635 machine_mode cmp_mode = mode;
16636 int n_elts = GET_MODE_NUNITS (mode);
16637 rtx valv,idxv,constv,idx_tmp;
16638 bool ok = false;
16639
16640 /* 512-bit vector byte/word broadcast and comparison are only available
16641 under TARGET_AVX512BW; without it, split the 512-bit vector into two
16642 256-bit vectors. */
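/* Sketch of the split, using V64QImode as the example: the high and
   low V32QImode halves are extracted, the index for the high half is
   rebased as idx_hi = idx - 32, and both halves are updated
   recursively; the equality mask in the recursive call is all-false
   for the half whose index range does not contain IDX, so that half
   is left unchanged before the halves are concatenated back.  */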
16643 if ((mode == V32HImode || mode == V32HFmode || mode == V32BFmode
16644 || mode == V64QImode)
16645 && !TARGET_AVX512BW)
16646 {
16647 gcc_assert (TARGET_AVX512F);
16648 rtx vhi, vlo, idx_hi;
16649 machine_mode half_mode;
16650 rtx (*extract_hi)(rtx, rtx);
16651 rtx (*extract_lo)(rtx, rtx);
16652
16653 if (mode == V32HImode)
16654 {
16655 half_mode = V16HImode;
16656 extract_hi = gen_vec_extract_hi_v32hi;
16657 extract_lo = gen_vec_extract_lo_v32hi;
16658 }
16659 else if (mode == V32HFmode)
16660 {
16661 half_mode = V16HFmode;
16662 extract_hi = gen_vec_extract_hi_v32hf;
16663 extract_lo = gen_vec_extract_lo_v32hf;
16664 }
16665 else if (mode == V32BFmode)
16666 {
16667 half_mode = V16BFmode;
16668 extract_hi = gen_vec_extract_hi_v32bf;
16669 extract_lo = gen_vec_extract_lo_v32bf;
16670 }
16671 else
16672 {
16673 half_mode = V32QImode;
16674 extract_hi = gen_vec_extract_hi_v64qi;
16675 extract_lo = gen_vec_extract_lo_v64qi;
16676 }
16677
16678 vhi = gen_reg_rtx (half_mode);
16679 vlo = gen_reg_rtx (half_mode);
16680 idx_hi = gen_reg_rtx (GET_MODE (idx));
16681 emit_insn (extract_hi (vhi, target));
16682 emit_insn (extract_lo (vlo, target));
16683 vec[0] = idx_hi;
16684 vec[1] = idx;
16685 vec[2] = GEN_INT (n_elts/2);
16686 ix86_expand_binary_operator (MINUS, GET_MODE (idx), vec);
16687 ix86_expand_vector_set_var (vhi, val, idx_hi);
16688 ix86_expand_vector_set_var (vlo, val, idx);
16689 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, vlo, vhi)));
16690 return;
16691 }
16692
16693 if (FLOAT_MODE_P (GET_MODE_INNER (mode)))
16694 {
16695 switch (mode)
16696 {
16697 case E_V2DFmode:
16698 cmp_mode = V2DImode;
16699 break;
16700 case E_V4DFmode:
16701 cmp_mode = V4DImode;
16702 break;
16703 case E_V8DFmode:
16704 cmp_mode = V8DImode;
16705 break;
16706 case E_V2SFmode:
16707 cmp_mode = V2SImode;
16708 break;
16709 case E_V4SFmode:
16710 cmp_mode = V4SImode;
16711 break;
16712 case E_V8SFmode:
16713 cmp_mode = V8SImode;
16714 break;
16715 case E_V16SFmode:
16716 cmp_mode = V16SImode;
16717 break;
16718 case E_V8HFmode:
16719 cmp_mode = V8HImode;
16720 break;
16721 case E_V16HFmode:
16722 cmp_mode = V16HImode;
16723 break;
16724 case E_V32HFmode:
16725 cmp_mode = V32HImode;
16726 break;
16727 case E_V8BFmode:
16728 cmp_mode = V8HImode;
16729 break;
16730 case E_V16BFmode:
16731 cmp_mode = V16HImode;
16732 break;
16733 case E_V32BFmode:
16734 cmp_mode = V32HImode;
16735 break;
16736 default:
16737 gcc_unreachable ();
16738 }
16739 }
16740
16741 for (int i = 0; i != n_elts; i++)
16742 vec[i] = GEN_INT (i);
16743 constv = gen_rtx_CONST_VECTOR (cmp_mode, gen_rtvec_v (n_elts, vec));
16744 valv = gen_reg_rtx (mode);
16745 idxv = gen_reg_rtx (cmp_mode);
16746 idx_tmp = convert_to_mode (GET_MODE_INNER (cmp_mode), idx, 1);
16747
16748 ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE,
16749 mode, valv, val);
16750 gcc_assert (ok);
16751 ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE,
16752 cmp_mode, idxv, idx_tmp);
16753 gcc_assert (ok);
16754 vec[0] = target;
16755 vec[1] = valv;
16756 vec[2] = target;
16757 vec[3] = gen_rtx_EQ (mode, idxv, constv);
16758 vec[4] = idxv;
16759 vec[5] = constv;
16760 ok = ix86_expand_int_vcond (vec);
16761 gcc_assert (ok);
16762 }
16763
16764 void
16765 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
16766 {
16767 machine_mode mode = GET_MODE (target);
16768 machine_mode inner_mode = GET_MODE_INNER (mode);
16769 machine_mode half_mode;
16770 bool use_vec_merge = false;
16771 bool blendm_const = false;
16772 rtx tmp;
16773 static rtx (*gen_extract[8][2]) (rtx, rtx)
16774 = {
16775 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
16776 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
16777 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
16778 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
16779 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
16780 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df },
16781 { gen_vec_extract_lo_v16hf, gen_vec_extract_hi_v16hf },
16782 { gen_vec_extract_lo_v16bf, gen_vec_extract_hi_v16bf }
16783 };
16784 static rtx (*gen_insert[8][2]) (rtx, rtx, rtx)
16785 = {
16786 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
16787 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
16788 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
16789 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
16790 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
16791 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df },
16792 { gen_vec_set_lo_v16hf, gen_vec_set_hi_v16hf },
16793 { gen_vec_set_lo_v16bf, gen_vec_set_hi_v16bf },
16794 };
16795 int i, j, n;
16796 machine_mode mmode = VOIDmode;
16797 rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
16798
16799 switch (mode)
16800 {
16801 case E_V2SImode:
16802 use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
16803 if (use_vec_merge)
16804 break;
16805 /* FALLTHRU */
16806
16807 case E_V2SFmode:
16808 if (mmx_ok)
16809 {
16810 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
16811 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
16812 if (elt == 0)
16813 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
16814 else
16815 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
16816 emit_insn (gen_rtx_SET (target, tmp));
16817 return;
16818 }
16819 break;
16820
16821 case E_V2DImode:
16822 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
16823 if (use_vec_merge)
16824 break;
16825
16826 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
16827 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
16828 if (elt == 0)
16829 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
16830 else
16831 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
16832 emit_insn (gen_rtx_SET (target, tmp));
16833 return;
16834
16835 case E_V2DFmode:
16836 /* NB: For ELT == 0, use standard scalar operation patterns which
16837 preserve the rest of the vector for combiner:
16838
16839 (vec_merge:V2DF
16840 (vec_duplicate:V2DF (reg:DF))
16841 (reg:V2DF)
16842 (const_int 1))
16843 */
16844 if (elt == 0)
16845 goto do_vec_merge;
16846
16847 {
16848 rtx op0, op1;
16849
16850 /* For the two element vectors, we implement a VEC_CONCAT with
16851 the extraction of the other element. */
16852
16853 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
16854 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
16855
16856 if (elt == 0)
16857 op0 = val, op1 = tmp;
16858 else
16859 op0 = tmp, op1 = val;
16860
16861 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
16862 emit_insn (gen_rtx_SET (target, tmp));
16863 }
16864 return;
16865
16866 case E_V4SFmode:
16867 use_vec_merge = TARGET_SSE4_1;
16868 if (use_vec_merge)
16869 break;
16870
16871 switch (elt)
16872 {
16873 case 0:
16874 use_vec_merge = true;
16875 break;
16876
16877 case 1:
16878 /* tmp = target = A B C D */
16879 tmp = copy_to_reg (target);
16880 /* target = A A B B */
16881 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
16882 /* target = X A B B */
16883 ix86_expand_vector_set (false, target, val, 0);
16884 /* target = A X C D */
16885 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
16886 const1_rtx, const0_rtx,
16887 GEN_INT (2+4), GEN_INT (3+4)));
16888 return;
16889
16890 case 2:
16891 /* tmp = target = A B C D */
16892 tmp = copy_to_reg (target);
16893 /* tmp = X B C D */
16894 ix86_expand_vector_set (false, tmp, val, 0);
16895 /* target = A B X D */
16896 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
16897 const0_rtx, const1_rtx,
16898 GEN_INT (0+4), GEN_INT (3+4)));
16899 return;
16900
16901 case 3:
16902 /* tmp = target = A B C D */
16903 tmp = copy_to_reg (target);
16904 /* tmp = X B C D */
16905 ix86_expand_vector_set (false, tmp, val, 0);
16906 	  /* target = A B C X */
16907 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
16908 const0_rtx, const1_rtx,
16909 GEN_INT (2+4), GEN_INT (0+4)));
16910 return;
16911
16912 default:
16913 gcc_unreachable ();
16914 }
16915 break;
16916
16917 case E_V4SImode:
16918 use_vec_merge = TARGET_SSE4_1;
16919 if (use_vec_merge)
16920 break;
16921
16922 /* Element 0 handled by vec_merge below. */
16923 if (elt == 0)
16924 {
16925 use_vec_merge = true;
16926 break;
16927 }
16928
16929 if (TARGET_SSE2)
16930 {
16931 /* With SSE2, use integer shuffles to swap element 0 and ELT,
16932 store into element 0, then shuffle them back. */
16933
16934 rtx order[4];
16935
16936 order[0] = GEN_INT (elt);
16937 order[1] = const1_rtx;
16938 order[2] = const2_rtx;
16939 order[3] = GEN_INT (3);
16940 order[elt] = const0_rtx;
16941
16942 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
16943 order[1], order[2], order[3]));
16944
16945 ix86_expand_vector_set (false, target, val, 0);
16946
16947 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
16948 order[1], order[2], order[3]));
16949 }
16950 else
16951 {
16952 /* For SSE1, we have to reuse the V4SF code. */
16953 rtx t = gen_reg_rtx (V4SFmode);
16954 emit_move_insn (t, gen_lowpart (V4SFmode, target));
16955 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
16956 emit_move_insn (target, gen_lowpart (mode, t));
16957 }
16958 return;
16959
16960 case E_V8HImode:
16961 case E_V8HFmode:
16962 case E_V8BFmode:
16963 case E_V2HImode:
16964 use_vec_merge = TARGET_SSE2;
16965 break;
16966 case E_V4HImode:
16967 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
16968 break;
16969
16970 case E_V16QImode:
16971 case E_V4QImode:
16972 use_vec_merge = TARGET_SSE4_1;
16973 break;
16974
16975 case E_V8QImode:
16976 use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
16977 break;
16978
16979 case E_V32QImode:
16980 half_mode = V16QImode;
16981 j = 0;
16982 n = 16;
16983 goto half;
16984
16985 case E_V16HFmode:
16986 case E_V16BFmode:
16987 /* For ELT == 0, vec_setv8hf_0 can save 1 vpbroadcastw. */
16988 if (TARGET_AVX2 && elt != 0)
16989 {
16990 mmode = SImode;
16991 gen_blendm = ((mode == E_V16HFmode) ? gen_avx2_pblendph_1
16992 : gen_avx2_pblendbf_1);
16993 blendm_const = true;
16994 break;
16995 }
16996 else
16997 {
16998 half_mode = ((mode == E_V16HFmode) ? V8HFmode : V8BFmode);
16999 j = ((mode == E_V16HFmode) ? 6 : 7);
17000 n = 8;
17001 goto half;
17002 }
17003
17004 case E_V16HImode:
17005 half_mode = V8HImode;
17006 j = 1;
17007 n = 8;
17008 goto half;
17009
17010 case E_V8SImode:
17011 half_mode = V4SImode;
17012 j = 2;
17013 n = 4;
17014 goto half;
17015
17016 case E_V4DImode:
17017 half_mode = V2DImode;
17018 j = 3;
17019 n = 2;
17020 goto half;
17021
17022 case E_V8SFmode:
17023 half_mode = V4SFmode;
17024 j = 4;
17025 n = 4;
17026 goto half;
17027
17028 case E_V4DFmode:
17029 half_mode = V2DFmode;
17030 j = 5;
17031 n = 2;
17032 goto half;
17033
17034 half:
17035 /* Compute offset. */
17036 i = elt / n;
17037 elt %= n;
17038
17039 gcc_assert (i <= 1);
17040
17041 /* Extract the half. */
17042 tmp = gen_reg_rtx (half_mode);
17043 emit_insn (gen_extract[j][i] (tmp, target));
17044
17045 /* Put val in tmp at elt. */
17046 ix86_expand_vector_set (false, tmp, val, elt);
17047
17048 /* Put it back. */
17049 emit_insn (gen_insert[j][i] (target, target, tmp));
17050 return;
17051
17052 case E_V8DFmode:
17053 if (TARGET_AVX512F)
17054 {
17055 mmode = QImode;
17056 gen_blendm = gen_avx512f_blendmv8df;
17057 }
17058 break;
17059
17060 case E_V8DImode:
17061 if (TARGET_AVX512F)
17062 {
17063 mmode = QImode;
17064 gen_blendm = gen_avx512f_blendmv8di;
17065 }
17066 break;
17067
17068 case E_V16SFmode:
17069 if (TARGET_AVX512F)
17070 {
17071 mmode = HImode;
17072 gen_blendm = gen_avx512f_blendmv16sf;
17073 }
17074 break;
17075
17076 case E_V16SImode:
17077 if (TARGET_AVX512F)
17078 {
17079 mmode = HImode;
17080 gen_blendm = gen_avx512f_blendmv16si;
17081 }
17082 break;
17083
17084 case E_V32HFmode:
17085 if (TARGET_AVX512BW)
17086 {
17087 mmode = SImode;
17088 gen_blendm = gen_avx512bw_blendmv32hf;
17089 }
17090 break;
17091 case E_V32BFmode:
17092 if (TARGET_AVX512BW)
17093 {
17094 mmode = SImode;
17095 gen_blendm = gen_avx512bw_blendmv32bf;
17096 }
17097 break;
17098 case E_V32HImode:
17099 if (TARGET_AVX512BW)
17100 {
17101 mmode = SImode;
17102 gen_blendm = gen_avx512bw_blendmv32hi;
17103 }
17104 else if (TARGET_AVX512F)
17105 {
17106 half_mode = E_V8HImode;
17107 n = 8;
17108 goto quarter;
17109 }
17110 break;
17111
17112 case E_V64QImode:
17113 if (TARGET_AVX512BW)
17114 {
17115 mmode = DImode;
17116 gen_blendm = gen_avx512bw_blendmv64qi;
17117 }
17118 else if (TARGET_AVX512F)
17119 {
17120 half_mode = E_V16QImode;
17121 n = 16;
17122 goto quarter;
17123 }
17124 break;
17125
17126 quarter:
17127 /* Compute offset. */
17128 i = elt / n;
17129 elt %= n;
17130
17131 gcc_assert (i <= 3);
17132
17133 {
17134 /* Extract the quarter. */
17135 tmp = gen_reg_rtx (V4SImode);
17136 rtx tmp2 = gen_lowpart (V16SImode, target);
17137 rtx mask = gen_reg_rtx (QImode);
17138
17139 emit_move_insn (mask, constm1_rtx);
17140 emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i),
17141 tmp, mask));
17142
17143 tmp2 = gen_reg_rtx (half_mode);
17144 emit_move_insn (tmp2, gen_lowpart (half_mode, tmp));
17145 tmp = tmp2;
17146
17147 /* Put val in tmp at elt. */
17148 ix86_expand_vector_set (false, tmp, val, elt);
17149
17150 /* Put it back. */
17151 tmp2 = gen_reg_rtx (V16SImode);
17152 rtx tmp3 = gen_lowpart (V16SImode, target);
17153 mask = gen_reg_rtx (HImode);
17154 emit_move_insn (mask, constm1_rtx);
17155 tmp = gen_lowpart (V4SImode, tmp);
17156 emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i),
17157 tmp3, mask));
17158 emit_move_insn (target, gen_lowpart (mode, tmp2));
17159 }
17160 return;
17161
17162 default:
17163 break;
17164 }
17165
17166 if (mmode != VOIDmode)
17167 {
17168 tmp = gen_reg_rtx (mode);
17169 emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
17170 rtx merge_mask = gen_int_mode (HOST_WIDE_INT_1U << elt, mmode);
17171 /* The avx512*_blendm<mode> expanders have different operand order
17172 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
17173 elements where the mask is set and second input operand otherwise,
17174 in {sse,avx}*_*blend* the first input operand is used for elements
17175 where the mask is clear and second input operand otherwise. */
17176 if (!blendm_const)
17177 merge_mask = force_reg (mmode, merge_mask);
17178 emit_insn (gen_blendm (target, target, tmp, merge_mask));
17179 }
17180 else if (use_vec_merge)
17181 {
17182 do_vec_merge:
17183 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
17184 tmp = gen_rtx_VEC_MERGE (mode, tmp, target,
17185 GEN_INT (HOST_WIDE_INT_1U << elt));
17186 emit_insn (gen_rtx_SET (target, tmp));
17187 }
17188 else
17189 {
17190 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
17191
17192 emit_move_insn (mem, target);
17193
17194 tmp = adjust_address (mem, inner_mode, elt * GET_MODE_SIZE (inner_mode));
17195 emit_move_insn (tmp, val);
17196
17197 emit_move_insn (target, mem);
17198 }
17199 }
17200
17201 void
17202 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
17203 {
17204 machine_mode mode = GET_MODE (vec);
17205 machine_mode inner_mode = GET_MODE_INNER (mode);
17206 bool use_vec_extr = false;
17207 rtx tmp;
17208
17209 switch (mode)
17210 {
17211 case E_V2SImode:
17212 use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
17213 if (use_vec_extr)
17214 break;
17215 /* FALLTHRU */
17216
17217 case E_V2SFmode:
17218 if (!mmx_ok)
17219 break;
17220 /* FALLTHRU */
17221
17222 case E_V2DFmode:
17223 case E_V2DImode:
17224 case E_V2TImode:
17225 case E_V4TImode:
17226 use_vec_extr = true;
17227 break;
17228
17229 case E_V4SFmode:
17230 use_vec_extr = TARGET_SSE4_1;
17231 if (use_vec_extr)
17232 break;
17233
17234 switch (elt)
17235 {
17236 case 0:
17237 tmp = vec;
17238 break;
17239
17240 case 1:
17241 case 3:
17242 tmp = gen_reg_rtx (mode);
17243 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
17244 GEN_INT (elt), GEN_INT (elt),
17245 GEN_INT (elt+4), GEN_INT (elt+4)));
17246 break;
17247
17248 case 2:
17249 tmp = gen_reg_rtx (mode);
17250 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
17251 break;
17252
17253 default:
17254 gcc_unreachable ();
17255 }
17256 vec = tmp;
17257 use_vec_extr = true;
17258 elt = 0;
17259 break;
17260
17261 case E_V4SImode:
17262 use_vec_extr = TARGET_SSE4_1;
17263 if (use_vec_extr)
17264 break;
17265
17266 if (TARGET_SSE2)
17267 {
17268 switch (elt)
17269 {
17270 case 0:
17271 tmp = vec;
17272 break;
17273
17274 case 1:
17275 case 3:
17276 tmp = gen_reg_rtx (mode);
17277 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
17278 GEN_INT (elt), GEN_INT (elt),
17279 GEN_INT (elt), GEN_INT (elt)));
17280 break;
17281
17282 case 2:
17283 tmp = gen_reg_rtx (mode);
17284 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
17285 break;
17286
17287 default:
17288 gcc_unreachable ();
17289 }
17290 vec = tmp;
17291 use_vec_extr = true;
17292 elt = 0;
17293 }
17294 else
17295 {
17296 /* For SSE1, we have to reuse the V4SF code. */
17297 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
17298 gen_lowpart (V4SFmode, vec), elt);
17299 return;
17300 }
17301 break;
17302
17303 case E_V8HImode:
17304 case E_V8HFmode:
17305 case E_V8BFmode:
17306 case E_V2HImode:
17307 use_vec_extr = TARGET_SSE2;
17308 break;
17309 case E_V4HImode:
17310 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
17311 break;
17312
17313 case E_V16QImode:
17314 use_vec_extr = TARGET_SSE4_1;
17315 if (!use_vec_extr
17316 && TARGET_SSE2
17317 && elt == 0
17318 && (optimize_insn_for_size_p () || TARGET_INTER_UNIT_MOVES_FROM_VEC))
17319 {
17320 tmp = gen_reg_rtx (SImode);
17321 ix86_expand_vector_extract (false, tmp, gen_lowpart (V4SImode, vec),
17322 0);
17323 emit_insn (gen_rtx_SET (target, gen_lowpart (QImode, tmp)));
17324 return;
17325 }
17326 break;
17327 case E_V4QImode:
17328 use_vec_extr = TARGET_SSE4_1;
17329 break;
17330
17331 case E_V8SFmode:
17332 if (TARGET_AVX)
17333 {
17334 tmp = gen_reg_rtx (V4SFmode);
17335 if (elt < 4)
17336 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
17337 else
17338 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
17339 ix86_expand_vector_extract (false, target, tmp, elt & 3);
17340 return;
17341 }
17342 break;
17343
17344 case E_V4DFmode:
17345 if (TARGET_AVX)
17346 {
17347 tmp = gen_reg_rtx (V2DFmode);
17348 if (elt < 2)
17349 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
17350 else
17351 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
17352 ix86_expand_vector_extract (false, target, tmp, elt & 1);
17353 return;
17354 }
17355 break;
17356
17357 case E_V32QImode:
17358 if (TARGET_AVX)
17359 {
17360 tmp = gen_reg_rtx (V16QImode);
17361 if (elt < 16)
17362 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
17363 else
17364 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
17365 ix86_expand_vector_extract (false, target, tmp, elt & 15);
17366 return;
17367 }
17368 break;
17369
17370 case E_V16HImode:
17371 if (TARGET_AVX)
17372 {
17373 tmp = gen_reg_rtx (V8HImode);
17374 if (elt < 8)
17375 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
17376 else
17377 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
17378 ix86_expand_vector_extract (false, target, tmp, elt & 7);
17379 return;
17380 }
17381 break;
17382
17383 case E_V8SImode:
17384 if (TARGET_AVX)
17385 {
17386 tmp = gen_reg_rtx (V4SImode);
17387 if (elt < 4)
17388 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
17389 else
17390 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
17391 ix86_expand_vector_extract (false, target, tmp, elt & 3);
17392 return;
17393 }
17394 break;
17395
17396 case E_V4DImode:
17397 if (TARGET_AVX)
17398 {
17399 tmp = gen_reg_rtx (V2DImode);
17400 if (elt < 2)
17401 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
17402 else
17403 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
17404 ix86_expand_vector_extract (false, target, tmp, elt & 1);
17405 return;
17406 }
17407 break;
17408
17409 case E_V32HImode:
17410 if (TARGET_AVX512BW)
17411 {
17412 tmp = gen_reg_rtx (V16HImode);
17413 if (elt < 16)
17414 emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
17415 else
17416 emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
17417 ix86_expand_vector_extract (false, target, tmp, elt & 15);
17418 return;
17419 }
17420 break;
17421
17422 case E_V64QImode:
17423 if (TARGET_AVX512BW)
17424 {
17425 tmp = gen_reg_rtx (V32QImode);
17426 if (elt < 32)
17427 emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
17428 else
17429 emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
17430 ix86_expand_vector_extract (false, target, tmp, elt & 31);
17431 return;
17432 }
17433 break;
17434
17435 case E_V16SFmode:
17436 tmp = gen_reg_rtx (V8SFmode);
17437 if (elt < 8)
17438 emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
17439 else
17440 emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
17441 ix86_expand_vector_extract (false, target, tmp, elt & 7);
17442 return;
17443
17444 case E_V8DFmode:
17445 tmp = gen_reg_rtx (V4DFmode);
17446 if (elt < 4)
17447 emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
17448 else
17449 emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
17450 ix86_expand_vector_extract (false, target, tmp, elt & 3);
17451 return;
17452
17453 case E_V16SImode:
17454 tmp = gen_reg_rtx (V8SImode);
17455 if (elt < 8)
17456 emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
17457 else
17458 emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
17459 ix86_expand_vector_extract (false, target, tmp, elt & 7);
17460 return;
17461
17462 case E_V8DImode:
17463 tmp = gen_reg_rtx (V4DImode);
17464 if (elt < 4)
17465 emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
17466 else
17467 emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
17468 ix86_expand_vector_extract (false, target, tmp, elt & 3);
17469 return;
17470
17471 case E_V32HFmode:
17472 case E_V32BFmode:
17473 if (TARGET_AVX512BW)
17474 {
17475 tmp = (mode == E_V32HFmode
17476 ? gen_reg_rtx (V16HFmode)
17477 : gen_reg_rtx (V16BFmode));
17478 if (elt < 16)
17479 emit_insn (gen_vec_extract_lo (mode, tmp, vec));
17480 else
17481 emit_insn (gen_vec_extract_hi (mode, tmp, vec));
17482 ix86_expand_vector_extract (false, target, tmp, elt & 15);
17483 return;
17484 }
17485 break;
17486
17487 case E_V16HFmode:
17488 case E_V16BFmode:
17489 if (TARGET_AVX)
17490 {
17491 tmp = (mode == E_V16HFmode
17492 ? gen_reg_rtx (V8HFmode)
17493 : gen_reg_rtx (V8BFmode));
17494 if (elt < 8)
17495 emit_insn (gen_vec_extract_lo (mode, tmp, vec));
17496 else
17497 emit_insn (gen_vec_extract_hi (mode, tmp, vec));
17498 ix86_expand_vector_extract (false, target, tmp, elt & 7);
17499 return;
17500 }
17501 break;
17502
17503 case E_V8QImode:
17504 use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
17505 /* ??? Could extract the appropriate HImode element and shift. */
17506 break;
17507
17508 default:
17509 break;
17510 }
17511
17512 if (use_vec_extr)
17513 {
17514 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
17515 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
17516
17517 /* Let the rtl optimizers know about the zero extension performed. */
17518 if (inner_mode == QImode || inner_mode == HImode)
17519 {
17520 rtx reg = gen_reg_rtx (SImode);
17521 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
17522 emit_move_insn (reg, tmp);
17523 tmp = gen_lowpart (inner_mode, reg);
17524 SUBREG_PROMOTED_VAR_P (tmp) = 1;
17525 SUBREG_PROMOTED_SET (tmp, 1);
17526 }
17527
17528 emit_move_insn (target, tmp);
17529 }
17530 else
17531 {
17532 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
17533
17534 emit_move_insn (mem, vec);
17535
17536 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
17537 emit_move_insn (target, tmp);
17538 }
17539 }
17540
17541 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
17542 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
17543 The upper bits of DEST are undefined, though they shouldn't cause
17544 exceptions (some bits from src or all zeros are ok). */
17545
17546 static void
17547 emit_reduc_half (rtx dest, rtx src, int i)
17548 {
17549 rtx tem, d = dest;
17550 switch (GET_MODE (src))
17551 {
17552 case E_V4SFmode:
17553 if (i == 128)
17554 tem = gen_sse_movhlps (dest, src, src);
17555 else
17556 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
17557 GEN_INT (1 + 4), GEN_INT (1 + 4));
17558 break;
17559 case E_V2DFmode:
17560 tem = gen_vec_interleave_highv2df (dest, src, src);
17561 break;
17562 case E_V4QImode:
17563 d = gen_reg_rtx (V1SImode);
17564 tem = gen_mmx_lshrv1si3 (d, gen_lowpart (V1SImode, src),
17565 GEN_INT (i / 2));
17566 break;
17567 case E_V4HImode:
17568 d = gen_reg_rtx (V1DImode);
17569 tem = gen_mmx_lshrv1di3 (d, gen_lowpart (V1DImode, src),
17570 GEN_INT (i / 2));
17571 break;
17572 case E_V16QImode:
17573 case E_V8HImode:
17574 case E_V8HFmode:
17575 case E_V4SImode:
17576 case E_V2DImode:
17577 d = gen_reg_rtx (V1TImode);
17578 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
17579 GEN_INT (i / 2));
17580 break;
17581 case E_V8SFmode:
17582 if (i == 256)
17583 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
17584 else
17585 tem = gen_avx_shufps256 (dest, src, src,
17586 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
17587 break;
17588 case E_V4DFmode:
17589 if (i == 256)
17590 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
17591 else
17592 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
17593 break;
17594 case E_V32QImode:
17595 case E_V16HImode:
17596 case E_V16HFmode:
17597 case E_V8SImode:
17598 case E_V4DImode:
17599 if (i == 256)
17600 {
17601 if (GET_MODE (dest) != V4DImode)
17602 d = gen_reg_rtx (V4DImode);
17603 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
17604 gen_lowpart (V4DImode, src),
17605 const1_rtx);
17606 }
17607 else
17608 {
17609 d = gen_reg_rtx (V2TImode);
17610 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
17611 GEN_INT (i / 2));
17612 }
17613 break;
17614 case E_V64QImode:
17615 case E_V32HImode:
17616 case E_V32HFmode:
17617 if (i < 64)
17618 {
17619 d = gen_reg_rtx (V4TImode);
17620 tem = gen_avx512bw_lshrv4ti3 (d, gen_lowpart (V4TImode, src),
17621 GEN_INT (i / 2));
17622 break;
17623 }
17624 /* FALLTHRU */
17625 case E_V16SImode:
17626 case E_V16SFmode:
17627 case E_V8DImode:
17628 case E_V8DFmode:
17629 if (i > 128)
17630 tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
17631 gen_lowpart (V16SImode, src),
17632 gen_lowpart (V16SImode, src),
17633 GEN_INT (0x4 + (i == 512 ? 4 : 0)),
17634 GEN_INT (0x5 + (i == 512 ? 4 : 0)),
17635 GEN_INT (0x6 + (i == 512 ? 4 : 0)),
17636 GEN_INT (0x7 + (i == 512 ? 4 : 0)),
17637 GEN_INT (0xC), GEN_INT (0xD),
17638 GEN_INT (0xE), GEN_INT (0xF),
17639 GEN_INT (0x10), GEN_INT (0x11),
17640 GEN_INT (0x12), GEN_INT (0x13),
17641 GEN_INT (0x14), GEN_INT (0x15),
17642 GEN_INT (0x16), GEN_INT (0x17));
17643 else
17644 tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
17645 gen_lowpart (V16SImode, src),
17646 GEN_INT (i == 128 ? 0x2 : 0x1),
17647 GEN_INT (0x3),
17648 GEN_INT (0x3),
17649 GEN_INT (0x3),
17650 GEN_INT (i == 128 ? 0x6 : 0x5),
17651 GEN_INT (0x7),
17652 GEN_INT (0x7),
17653 GEN_INT (0x7),
17654 GEN_INT (i == 128 ? 0xA : 0x9),
17655 GEN_INT (0xB),
17656 GEN_INT (0xB),
17657 GEN_INT (0xB),
17658 GEN_INT (i == 128 ? 0xE : 0xD),
17659 GEN_INT (0xF),
17660 GEN_INT (0xF),
17661 GEN_INT (0xF));
17662 break;
17663 default:
17664 gcc_unreachable ();
17665 }
17666 emit_insn (tem);
17667 if (d != dest)
17668 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
17669 }
17670
17671 /* Expand a vector reduction. FN is the binary pattern to reduce;
17672 DEST is the destination; IN is the input vector. */
17673
17674 void
17675 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
17676 {
17677 rtx half, dst, vec = in;
17678 machine_mode mode = GET_MODE (in);
17679 int i;
17680
17681 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
17682 if (TARGET_SSE4_1
17683 && mode == V8HImode
17684 && fn == gen_uminv8hi3)
17685 {
17686 emit_insn (gen_sse4_1_phminposuw (dest, in));
17687 return;
17688 }
17689
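/* Generic reduction: repeatedly fold the upper half of VEC onto the lower
   half with FN.  E.g. for V4SImode, I starts at 128: the high 64 bits are
   shifted down and combined, then the remaining 64 bits are halved once
   more, leaving the reduction result in element 0 of DEST.  */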
17690 for (i = GET_MODE_BITSIZE (mode);
17691 i > GET_MODE_UNIT_BITSIZE (mode);
17692 i >>= 1)
17693 {
17694 half = gen_reg_rtx (mode);
17695 emit_reduc_half (half, vec, i);
17696 if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
17697 dst = dest;
17698 else
17699 dst = gen_reg_rtx (mode);
17700 emit_insn (fn (dst, half, vec));
17701 vec = dst;
17702 }
17703 }
17704
17705 /* Output code to perform a conditional jump to LABEL, if C2 flag in
17706 FP status register is set. */
17707
17708 void
17709 ix86_emit_fp_unordered_jump (rtx label)
17710 {
17711 rtx reg = gen_reg_rtx (HImode);
17712 rtx_insn *insn;
17713 rtx temp;
17714
17715 emit_insn (gen_x86_fnstsw_1 (reg));
17716
17717 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
17718 {
17719 emit_insn (gen_x86_sahf_1 (reg));
17720
17721 temp = gen_rtx_REG (CCmode, FLAGS_REG);
17722 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
17723 }
17724 else
17725 {
17726 emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
17727
17728 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
17729 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
17730 }
17731
17732 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
17733 gen_rtx_LABEL_REF (VOIDmode, label),
17734 pc_rtx);
17735 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
17736 predict_jump (REG_BR_PROB_BASE * 10 / 100);
17737 JUMP_LABEL (insn) = label;
17738 }
17739
17740 /* Output code to perform a sinh XFmode calculation.  */
17741
17742 void
17743 ix86_emit_i387_sinh (rtx op0, rtx op1)
17744 {
17745 rtx e1 = gen_reg_rtx (XFmode);
17746 rtx e2 = gen_reg_rtx (XFmode);
17747 rtx scratch = gen_reg_rtx (HImode);
17748 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
17749 rtx half = const_double_from_real_value (dconsthalf, XFmode);
17750 rtx cst1, tmp;
17751 rtx_code_label *jump_label = gen_label_rtx ();
17752 rtx_insn *insn;
17753
17754 /* scratch = fxam (op1) */
17755 emit_insn (gen_fxamxf2_i387 (scratch, op1));
17756
17757 /* e1 = expm1 (|op1|) */
17758 emit_insn (gen_absxf2 (e2, op1));
17759 emit_insn (gen_expm1xf2 (e1, e2));
17760
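  /* With t = expm1 (|op1|), e^|op1| = t + 1 and e^-|op1| = 1 / (t + 1), so
     2 * sinh (|op1|) = t + 1 - 1 / (t + 1) = t + t / (t + 1), which is the
     value computed into e2 below; the sign of op1 and the factor 0.5 are
     applied afterwards.  */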
17761 /* e2 = e1 / (e1 + 1.0) + e1 */
17762 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17763 emit_insn (gen_addxf3 (e2, e1, cst1));
17764 emit_insn (gen_divxf3 (e2, e1, e2));
17765 emit_insn (gen_addxf3 (e2, e2, e1));
17766
17767 /* flags = signbit (op1) */
17768 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
17769
17770 /* if (flags) then e2 = -e2 */
17771 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
17772 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
17773 gen_rtx_LABEL_REF (VOIDmode, jump_label),
17774 pc_rtx);
17775 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17776 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17777 JUMP_LABEL (insn) = jump_label;
17778
17779 emit_insn (gen_negxf2 (e2, e2));
17780
17781 emit_label (jump_label);
17782 LABEL_NUSES (jump_label) = 1;
17783
17784 /* op0 = 0.5 * e2 */
17785 half = force_reg (XFmode, half);
17786 emit_insn (gen_mulxf3 (op0, e2, half));
17787 }
17788
17789 /* Output code to perform a cosh XFmode calculation.  */
17790
17791 void
17792 ix86_emit_i387_cosh (rtx op0, rtx op1)
17793 {
17794 rtx e1 = gen_reg_rtx (XFmode);
17795 rtx e2 = gen_reg_rtx (XFmode);
17796 rtx half = const_double_from_real_value (dconsthalf, XFmode);
17797 rtx cst1;
17798
17799 /* e1 = exp (op1) */
17800 emit_insn (gen_expxf2 (e1, op1));
17801
17802 /* e2 = e1 + 1.0 / e1 */
17803 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17804 emit_insn (gen_divxf3 (e2, cst1, e1));
17805 emit_insn (gen_addxf3 (e2, e1, e2));
17806
17807 /* op0 = 0.5 * e2 */
17808 half = force_reg (XFmode, half);
17809 emit_insn (gen_mulxf3 (op0, e2, half));
17810 }
17811
17812 /* Output code to perform a tanh XFmode calculation.  */
17813
17814 void
17815 ix86_emit_i387_tanh (rtx op0, rtx op1)
17816 {
17817 rtx e1 = gen_reg_rtx (XFmode);
17818 rtx e2 = gen_reg_rtx (XFmode);
17819 rtx scratch = gen_reg_rtx (HImode);
17820 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
17821 rtx cst2, tmp;
17822 rtx_code_label *jump_label = gen_label_rtx ();
17823 rtx_insn *insn;
17824
17825 /* scratch = fxam (op1) */
17826 emit_insn (gen_fxamxf2_i387 (scratch, op1));
17827
17828 /* e1 = expm1 (-|2 * op1|) */
17829 emit_insn (gen_addxf3 (e2, op1, op1));
17830 emit_insn (gen_absxf2 (e2, e2));
17831 emit_insn (gen_negxf2 (e2, e2));
17832 emit_insn (gen_expm1xf2 (e1, e2));
17833
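  /* With t = expm1 (-2 * |op1|), tanh (|op1|) = (1 - e^(-2*|op1|))
     / (1 + e^(-2*|op1|)) = -t / (t + 2), so e2 below holds -tanh (|op1|);
     the conditional negation that follows restores the sign of op1.  */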
17834 /* e2 = e1 / (e1 + 2.0) */
17835 cst2 = force_reg (XFmode, CONST2_RTX (XFmode));
17836 emit_insn (gen_addxf3 (e2, e1, cst2));
17837 emit_insn (gen_divxf3 (e2, e1, e2));
17838
17839 /* flags = signbit (op1) */
17840 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
17841
17842 /* if (!flags) then e2 = -e2 */
17843 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
17844 gen_rtx_NE (VOIDmode, flags, const0_rtx),
17845 gen_rtx_LABEL_REF (VOIDmode, jump_label),
17846 pc_rtx);
17847 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17848 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17849 JUMP_LABEL (insn) = jump_label;
17850
17851 emit_insn (gen_negxf2 (e2, e2));
17852
17853 emit_label (jump_label);
17854 LABEL_NUSES (jump_label) = 1;
17855
17856 emit_move_insn (op0, e2);
17857 }
17858
17859 /* Output code to perform an asinh XFmode calculation. */
17860
17861 void
17862 ix86_emit_i387_asinh (rtx op0, rtx op1)
17863 {
17864 rtx e1 = gen_reg_rtx (XFmode);
17865 rtx e2 = gen_reg_rtx (XFmode);
17866 rtx scratch = gen_reg_rtx (HImode);
17867 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
17868 rtx cst1, tmp;
17869 rtx_code_label *jump_label = gen_label_rtx ();
17870 rtx_insn *insn;
17871
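  /* asinh (|op1|) = log1p (|op1| + op1^2 / (sqrt (op1^2 + 1.0) + 1.0)).
     The second term equals sqrt (op1^2 + 1.0) - 1.0, but written as a
     quotient it avoids cancellation for small |op1|.  The sign of op1 is
     applied at the end.  */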
17872 /* e2 = sqrt (op1^2 + 1.0) + 1.0 */
17873 emit_insn (gen_mulxf3 (e1, op1, op1));
17874 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17875 emit_insn (gen_addxf3 (e2, e1, cst1));
17876 emit_insn (gen_sqrtxf2 (e2, e2));
17877 emit_insn (gen_addxf3 (e2, e2, cst1));
17878
17879 /* e1 = e1 / e2 */
17880 emit_insn (gen_divxf3 (e1, e1, e2));
17881
17882 /* scratch = fxam (op1) */
17883 emit_insn (gen_fxamxf2_i387 (scratch, op1));
17884
17885 /* e1 = e1 + |op1| */
17886 emit_insn (gen_absxf2 (e2, op1));
17887 emit_insn (gen_addxf3 (e1, e1, e2));
17888
17889 /* e2 = log1p (e1) */
17890 ix86_emit_i387_log1p (e2, e1);
17891
17892 /* flags = signbit (op1) */
17893 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
17894
17895 /* if (flags) then e2 = -e2 */
17896 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
17897 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
17898 gen_rtx_LABEL_REF (VOIDmode, jump_label),
17899 pc_rtx);
17900 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17901 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17902 JUMP_LABEL (insn) = jump_label;
17903
17904 emit_insn (gen_negxf2 (e2, e2));
17905
17906 emit_label (jump_label);
17907 LABEL_NUSES (jump_label) = 1;
17908
17909 emit_move_insn (op0, e2);
17910 }
17911
17912 /* Output code to perform an acosh XFmode calculation. */
17913
17914 void
17915 ix86_emit_i387_acosh (rtx op0, rtx op1)
17916 {
17917 rtx e1 = gen_reg_rtx (XFmode);
17918 rtx e2 = gen_reg_rtx (XFmode);
17919 rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17920
17921 /* e2 = sqrt (op1 + 1.0) */
17922 emit_insn (gen_addxf3 (e2, op1, cst1));
17923 emit_insn (gen_sqrtxf2 (e2, e2));
17924
17925 /* e1 = sqrt (op1 - 1.0) */
17926 emit_insn (gen_subxf3 (e1, op1, cst1));
17927 emit_insn (gen_sqrtxf2 (e1, e1));
17928
17929 /* e1 = e1 * e2 */
17930 emit_insn (gen_mulxf3 (e1, e1, e2));
17931
17932 /* e1 = e1 + op1 */
17933 emit_insn (gen_addxf3 (e1, e1, op1));
17934
17935 /* op0 = log (e1) */
17936 emit_insn (gen_logxf2 (op0, e1));
17937 }
17938
17939 /* Output code to perform an atanh XFmode calculation. */
17940
17941 void
17942 ix86_emit_i387_atanh (rtx op0, rtx op1)
17943 {
17944 rtx e1 = gen_reg_rtx (XFmode);
17945 rtx e2 = gen_reg_rtx (XFmode);
17946 rtx scratch = gen_reg_rtx (HImode);
17947 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
17948 rtx half = const_double_from_real_value (dconsthalf, XFmode);
17949 rtx cst1, tmp;
17950 rtx_code_label *jump_label = gen_label_rtx ();
17951 rtx_insn *insn;
17952
17953 /* scratch = fxam (op1) */
17954 emit_insn (gen_fxamxf2_i387 (scratch, op1));
17955
17956 /* e2 = |op1| */
17957 emit_insn (gen_absxf2 (e2, op1));
17958
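  /* With u = |op1|, log1p (-2u / (1 + u)) = log ((1 - u) / (1 + u))
     = -2 * atanh (u), so e2 below ends up holding -2 * atanh (|op1|);
     the conditional negation and the final multiply by 0.5 then yield
     atanh (op1).  */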
17959 /* e1 = -(e2 + e2) / (e2 + 1.0) */
17960 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17961 emit_insn (gen_addxf3 (e1, e2, cst1));
17962 emit_insn (gen_addxf3 (e2, e2, e2));
17963 emit_insn (gen_negxf2 (e2, e2));
17964 emit_insn (gen_divxf3 (e1, e2, e1));
17965
17966 /* e2 = log1p (e1) */
17967 ix86_emit_i387_log1p (e2, e1);
17968
17969 /* flags = signbit (op1) */
17970 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
17971
17972 /* if (!flags) then e2 = -e2 */
17973 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
17974 gen_rtx_NE (VOIDmode, flags, const0_rtx),
17975 gen_rtx_LABEL_REF (VOIDmode, jump_label),
17976 pc_rtx);
17977 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17978 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17979 JUMP_LABEL (insn) = jump_label;
17980
17981 emit_insn (gen_negxf2 (e2, e2));
17982
17983 emit_label (jump_label);
17984 LABEL_NUSES (jump_label) = 1;
17985
17986 /* op0 = 0.5 * e2 */
17987 half = force_reg (XFmode, half);
17988 emit_insn (gen_mulxf3 (op0, e2, half));
17989 }
17990
17991 /* Output code to perform a log1p XFmode calculation. */
17992
17993 void
17994 ix86_emit_i387_log1p (rtx op0, rtx op1)
17995 {
17996 rtx_code_label *label1 = gen_label_rtx ();
17997 rtx_code_label *label2 = gen_label_rtx ();
17998
17999 rtx tmp = gen_reg_rtx (XFmode);
18000 rtx res = gen_reg_rtx (XFmode);
18001 rtx cst, cstln2, cst1;
18002 rtx_insn *insn;
18003
18004   /* The emit_jump call emits any pending stack adjustment; make sure it is
18005      emitted before the conditional jump, otherwise the stack adjustment
18006      would only be conditional.  */
18007 do_pending_stack_adjust ();
18008
18009 cst = const_double_from_real_value
18010 (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode);
18011 cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */
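  /* The x87 fyl2xp1 instruction computes y * log2 (x + 1.0) accurately for
     x near 0.0, but its operand must satisfy |x| < 1 - sqrt (2) / 2, which
     is the constant 0.29289... used for the branch below; for larger |op1|
     fyl2x is applied to 1.0 + op1 instead.  Using y = fldln2 = ln (2)
     converts the base-2 logarithm into the natural logarithm.  */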
18012
18013 emit_insn (gen_absxf2 (tmp, op1));
18014
18015 cst = force_reg (XFmode, cst);
18016 ix86_expand_branch (GE, tmp, cst, label1);
18017 predict_jump (REG_BR_PROB_BASE * 10 / 100);
18018 insn = get_last_insn ();
18019 JUMP_LABEL (insn) = label1;
18020
18021 emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2));
18022 emit_jump (label2);
18023
18024 emit_label (label1);
18025 LABEL_NUSES (label1) = 1;
18026
18027 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
18028 emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1)));
18029 emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2));
18030
18031 emit_label (label2);
18032 LABEL_NUSES (label2) = 1;
18033
18034 emit_move_insn (op0, res);
18035 }
18036
18037 /* Emit code for round calculation. */
18038 void
18039 ix86_emit_i387_round (rtx op0, rtx op1)
18040 {
18041 machine_mode inmode = GET_MODE (op1);
18042 machine_mode outmode = GET_MODE (op0);
18043 rtx e1 = gen_reg_rtx (XFmode);
18044 rtx e2 = gen_reg_rtx (XFmode);
18045 rtx scratch = gen_reg_rtx (HImode);
18046 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
18047 rtx half = const_double_from_real_value (dconsthalf, XFmode);
18048 rtx res = gen_reg_rtx (outmode);
18049 rtx_code_label *jump_label = gen_label_rtx ();
18050 rtx (*floor_insn) (rtx, rtx);
18051 rtx (*neg_insn) (rtx, rtx);
18052 rtx_insn *insn;
18053 rtx tmp;
18054
18055 switch (inmode)
18056 {
18057 case E_SFmode:
18058 case E_DFmode:
18059 tmp = gen_reg_rtx (XFmode);
18060
18061 emit_insn (gen_rtx_SET (tmp, gen_rtx_FLOAT_EXTEND (XFmode, op1)));
18062 op1 = tmp;
18063 break;
18064 case E_XFmode:
18065 break;
18066 default:
18067 gcc_unreachable ();
18068 }
18069
18070 switch (outmode)
18071 {
18072 case E_SFmode:
18073 floor_insn = gen_frndintxf2_floor;
18074 neg_insn = gen_negsf2;
18075 break;
18076 case E_DFmode:
18077 floor_insn = gen_frndintxf2_floor;
18078 neg_insn = gen_negdf2;
18079 break;
18080 case E_XFmode:
18081 floor_insn = gen_frndintxf2_floor;
18082 neg_insn = gen_negxf2;
18083 break;
18084 case E_HImode:
18085 floor_insn = gen_lfloorxfhi2;
18086 neg_insn = gen_neghi2;
18087 break;
18088 case E_SImode:
18089 floor_insn = gen_lfloorxfsi2;
18090 neg_insn = gen_negsi2;
18091 break;
18092 case E_DImode:
18093 floor_insn = gen_lfloorxfdi2;
18094 neg_insn = gen_negdi2;
18095 break;
18096 default:
18097 gcc_unreachable ();
18098 }
18099
18100 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
18101
18102 /* scratch = fxam(op1) */
18103 emit_insn (gen_fxamxf2_i387 (scratch, op1));
18104
18105 /* e1 = fabs(op1) */
18106 emit_insn (gen_absxf2 (e1, op1));
18107
18108 /* e2 = e1 + 0.5 */
18109 half = force_reg (XFmode, half);
18110 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (XFmode, e1, half)));
18111
18112 /* res = floor(e2) */
18113 switch (outmode)
18114 {
18115 case E_SFmode:
18116 case E_DFmode:
18117 {
18118 tmp = gen_reg_rtx (XFmode);
18119
18120 emit_insn (floor_insn (tmp, e2));
18121 emit_insn (gen_rtx_SET (res,
18122 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp),
18123 UNSPEC_TRUNC_NOOP)));
18124 }
18125 break;
18126 default:
18127 emit_insn (floor_insn (res, e2));
18128 }
18129
18130 /* flags = signbit(a) */
18131 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
18132
18133 /* if (flags) then res = -res */
18134 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
18135 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
18136 gen_rtx_LABEL_REF (VOIDmode, jump_label),
18137 pc_rtx);
18138 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
18139 predict_jump (REG_BR_PROB_BASE * 50 / 100);
18140 JUMP_LABEL (insn) = jump_label;
18141
18142 emit_insn (neg_insn (res, res));
18143
18144 emit_label (jump_label);
18145 LABEL_NUSES (jump_label) = 1;
18146
18147 emit_move_insn (op0, res);
18148 }
18149
18150 /* Output code to perform a Newton-Raphson approximation of a single precision
18151 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
18152
18153 void
18154 ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
18155 {
18156 rtx x0, x1, e0, e1;
18157
18158 x0 = gen_reg_rtx (mode);
18159 e0 = gen_reg_rtx (mode);
18160 e1 = gen_reg_rtx (mode);
18161 x1 = gen_reg_rtx (mode);
18162
18163 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
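  /* This is the Newton-Raphson step x1 = x0 * (2 - b * x0) for the
     reciprocal of b, rewritten as x1 = (x0 + x0) - (b * x0 * x0) = e1 - e0
     to match the multiplies and adds emitted below; one step roughly
     doubles the number of accurate bits of the hardware rcp estimate.  */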
18164
18165 b = force_reg (mode, b);
18166
18167 /* x0 = rcp(b) estimate */
18168 if (mode == V16SFmode || mode == V8DFmode)
18169 {
18170 if (TARGET_AVX512ER)
18171 {
18172 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
18173 UNSPEC_RCP28)));
18174 /* res = a * x0 */
18175 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
18176 return;
18177 }
18178 else
18179 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
18180 UNSPEC_RCP14)));
18181 }
18182 else
18183 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
18184 UNSPEC_RCP)));
18185
18186 /* e0 = x0 * b */
18187 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
18188
18189 /* e0 = x0 * e0 */
18190 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
18191
18192 /* e1 = x0 + x0 */
18193 emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
18194
18195 /* x1 = e1 - e0 */
18196 emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
18197
18198 /* res = a * x1 */
18199 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
18200 }
18201
18202 /* Output code to perform a Newton-Raphson approximation of a
18203 single precision floating point [reciprocal] square root. */
18204
18205 void
18206 ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
18207 {
18208 rtx x0, e0, e1, e2, e3, mthree, mhalf;
18209 REAL_VALUE_TYPE r;
18210 int unspec;
18211
18212 x0 = gen_reg_rtx (mode);
18213 e0 = gen_reg_rtx (mode);
18214 e1 = gen_reg_rtx (mode);
18215 e2 = gen_reg_rtx (mode);
18216 e3 = gen_reg_rtx (mode);
18217
18218 if (TARGET_AVX512ER && mode == V16SFmode)
18219 {
18220 if (recip)
18221 /* res = rsqrt28(a) estimate */
18222 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
18223 UNSPEC_RSQRT28)));
18224 else
18225 {
18226 /* x0 = rsqrt28(a) estimate */
18227 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
18228 UNSPEC_RSQRT28)));
18229 /* res = rcp28(x0) estimate */
18230 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
18231 UNSPEC_RCP28)));
18232 }
18233 return;
18234 }
18235
18236 real_from_integer (&r, VOIDmode, -3, SIGNED);
18237 mthree = const_double_from_real_value (r, SFmode);
18238
18239 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
18240 mhalf = const_double_from_real_value (r, SFmode);
18241 unspec = UNSPEC_RSQRT;
18242
18243 if (VECTOR_MODE_P (mode))
18244 {
18245 mthree = ix86_build_const_vector (mode, true, mthree);
18246 mhalf = ix86_build_const_vector (mode, true, mhalf);
18247 /* There is no 512-bit rsqrt. There is however rsqrt14. */
18248 if (GET_MODE_SIZE (mode) == 64)
18249 unspec = UNSPEC_RSQRT14;
18250 }
18251
18252 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
18253 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
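  /* Both forms follow from one Newton-Raphson step for y ~ 1/sqrt(a):
     y1 = 0.5 * y0 * (3 - a * y0 * y0) = -0.5 * y0 * (a * y0 * y0 - 3),
     together with sqrt(a) = a * y1; the step roughly doubles the number
     of accurate bits of the hardware rsqrt estimate.  */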
18254
18255 a = force_reg (mode, a);
18256
18257 /* x0 = rsqrt(a) estimate */
18258 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
18259 unspec)));
18260
18261   /* If a == 0.0, filter out the infinite rsqrt (a) estimate to prevent
	 NaN for sqrt (0.0).  */
18262 if (!recip)
18263 {
18264 rtx zero = force_reg (mode, CONST0_RTX(mode));
18265 rtx mask;
18266
18267 /* Handle masked compare. */
18268 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
18269 {
18270 mask = gen_reg_rtx (HImode);
18271 /* Imm value 0x4 corresponds to not-equal comparison. */
18272 emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
18273 emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
18274 }
18275 else
18276 {
18277 mask = gen_reg_rtx (mode);
18278 emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
18279 emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
18280 }
18281 }
18282
18283 mthree = force_reg (mode, mthree);
18284
18285 /* e0 = x0 * a */
18286 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
18287
18288 unsigned vector_size = GET_MODE_SIZE (mode);
18289 if (TARGET_FMA
18290 || (TARGET_AVX512F && vector_size == 64)
18291 || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
18292 emit_insn (gen_rtx_SET (e2,
18293 gen_rtx_FMA (mode, e0, x0, mthree)));
18294 else
18295 {
18296 /* e1 = e0 * x0 */
18297 emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
18298
18299 /* e2 = e1 - 3. */
18300 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
18301 }
18302
18303 mhalf = force_reg (mode, mhalf);
18304 if (recip)
18305 /* e3 = -.5 * x0 */
18306 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
18307 else
18308 /* e3 = -.5 * e0 */
18309 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
18310 /* ret = e2 * e3 */
18311 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
18312 }
18313
18314 /* Expand fabs (OP0) and return a new rtx that holds the result. The
18315 mask for masking out the sign-bit is stored in *SMASK, if that is
18316 non-null. */
18317
18318 static rtx
18319 ix86_expand_sse_fabs (rtx op0, rtx *smask)
18320 {
18321 machine_mode vmode, mode = GET_MODE (op0);
18322 rtx xa, mask;
18323
18324 xa = gen_reg_rtx (mode);
18325 if (mode == SFmode)
18326 vmode = V4SFmode;
18327 else if (mode == DFmode)
18328 vmode = V2DFmode;
18329 else
18330 vmode = mode;
18331 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
18332 if (!VECTOR_MODE_P (mode))
18333 {
18334 /* We need to generate a scalar mode mask in this case. */
18335 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
18336 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
18337 mask = gen_reg_rtx (mode);
18338 emit_insn (gen_rtx_SET (mask, tmp));
18339 }
18340 emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
18341
18342 if (smask)
18343 *smask = mask;
18344
18345 return xa;
18346 }
18347
18348 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
18349 swapping the operands if SWAP_OPERANDS is true. The expanded
18350 code is a forward jump to a newly created label in case the
18351 comparison is true. The generated label rtx is returned. */
18352 static rtx_code_label *
18353 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
18354 bool swap_operands)
18355 {
18356 bool unordered_compare = ix86_unordered_fp_compare (code);
18357 rtx_code_label *label;
18358 rtx tmp, reg;
18359
18360 if (swap_operands)
18361 std::swap (op0, op1);
18362
18363 label = gen_label_rtx ();
18364 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
18365 if (unordered_compare)
18366 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
18367 reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
18368 emit_insn (gen_rtx_SET (reg, tmp));
18369 tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
18370 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
18371 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
18372 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
18373 JUMP_LABEL (tmp) = label;
18374
18375 return label;
18376 }
18377
18378 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
18379 using comparison code CODE. Operands are swapped for the comparison if
18380 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
18381 static rtx
18382 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
18383 bool swap_operands)
18384 {
18385 rtx (*insn)(rtx, rtx, rtx, rtx);
18386 machine_mode mode = GET_MODE (op0);
18387 rtx mask = gen_reg_rtx (mode);
18388
18389 if (swap_operands)
18390 std::swap (op0, op1);
18391
18392 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
18393
18394 emit_insn (insn (mask, op0, op1,
18395 gen_rtx_fmt_ee (code, mode, op0, op1)));
18396 return mask;
18397 }
18398
18399 /* Expand copysign from SIGN to the positive value ABS_VALUE
18400 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
18401 the sign-bit. */
18402
18403 static void
18404 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
18405 {
18406 machine_mode mode = GET_MODE (sign);
18407 rtx sgn = gen_reg_rtx (mode);
18408 if (mask == NULL_RTX)
18409 {
18410 machine_mode vmode;
18411
18412 if (mode == SFmode)
18413 vmode = V4SFmode;
18414 else if (mode == DFmode)
18415 vmode = V2DFmode;
18416 else
18417 vmode = mode;
18418
18419 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
18420 if (!VECTOR_MODE_P (mode))
18421 {
18422 /* We need to generate a scalar mode mask in this case. */
18423 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
18424 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
18425 mask = gen_reg_rtx (mode);
18426 emit_insn (gen_rtx_SET (mask, tmp));
18427 }
18428 }
18429 else
18430 mask = gen_rtx_NOT (mode, mask);
18431 emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
18432 emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
18433 }
18434
18435 /* Expand SSE sequence for computing lround from OP1 storing
18436 into OP0. */
18437
18438 void
18439 ix86_expand_lround (rtx op0, rtx op1)
18440 {
18441 /* C code for the stuff we're doing below:
18442      tmp = op1 + copysign (nextafter (0.5, 0.0), op1);
18443 return (long)tmp;
18444 */
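  /* nextafter (0.5, 0.0) is used instead of 0.5 to avoid rounding across the
     halfway point: for the DFmode value x just below 0.5, x + 0.5 rounds up
     to 1.0 under ties-to-even, so (long) (x + 0.5) would be 1, whereas
     x + nextafter (0.5, 0.0) is exactly 1.0 - 2^-53 and converts to 0.  */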
18445 machine_mode mode = GET_MODE (op1);
18446 const struct real_format *fmt;
18447 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
18448 rtx adj;
18449
18450 /* load nextafter (0.5, 0.0) */
18451 fmt = REAL_MODE_FORMAT (mode);
18452 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
18453 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
18454
18455 /* adj = copysign (0.5, op1) */
18456 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
18457 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
18458
18459 /* adj = op1 + adj */
18460 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
18461
18462 /* op0 = (imode)adj */
18463 expand_fix (op0, adj, 0);
18464 }
18465
18466 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
18467 into OPERAND0. */
18468
18469 void
18470 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
18471 {
18472 /* C code for the stuff we're doing below (for do_floor):
18473 xi = (long)op1;
18474 xi -= (double)xi > op1 ? 1 : 0;
18475 return xi;
18476 */
18477 machine_mode fmode = GET_MODE (op1);
18478 machine_mode imode = GET_MODE (op0);
18479 rtx ireg, freg, tmp;
18480 rtx_code_label *label;
18481
18482 /* reg = (long)op1 */
18483 ireg = gen_reg_rtx (imode);
18484 expand_fix (ireg, op1, 0);
18485
18486 /* freg = (double)reg */
18487 freg = gen_reg_rtx (fmode);
18488 expand_float (freg, ireg, 0);
18489
18490 /* ireg = (freg > op1) ? ireg - 1 : ireg */
18491 label = ix86_expand_sse_compare_and_jump (UNLE,
18492 freg, op1, !do_floor);
18493 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
18494 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
18495 emit_move_insn (ireg, tmp);
18496
18497 emit_label (label);
18498 LABEL_NUSES (label) = 1;
18499
18500 emit_move_insn (op0, ireg);
18501 }
18502
18503 /* Generate and return a rtx of mode MODE for 2**n where n is the number
18504 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
18505
18506 static rtx
18507 ix86_gen_TWO52 (machine_mode mode)
18508 {
18509 const struct real_format *fmt;
18510 REAL_VALUE_TYPE TWO52r;
18511 rtx TWO52;
18512
18513 fmt = REAL_MODE_FORMAT (mode);
18514 real_2expN (&TWO52r, fmt->p - 1, mode);
18515 TWO52 = const_double_from_real_value (TWO52r, mode);
18516 TWO52 = force_reg (mode, TWO52);
18517
18518 return TWO52;
18519 }
18520
18521 /* Expand rint rounding OPERAND1 and storing the result in OPERAND0. */
18522
18523 void
18524 ix86_expand_rint (rtx operand0, rtx operand1)
18525 {
18526 /* C code for the stuff we're doing below:
18527 xa = fabs (operand1);
18528 if (!isless (xa, 2**52))
18529 return operand1;
18530 two52 = 2**52;
18531 if (flag_rounding_math)
18532 {
18533 two52 = copysign (two52, operand1);
18534 xa = operand1;
18535 }
18536 xa = xa + two52 - two52;
18537 return copysign (xa, operand1);
18538 */
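  /* TWO52 is 2^(p-1) for the mode (2^52 for DFmode, 2^23 for SFmode).  For
     |xa| below it, xa + TWO52 falls into a binade whose spacing is exactly
     1.0, so the addition rounds xa to an integer in the current rounding
     mode and the following subtraction recovers that integer exactly.  */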
18539 machine_mode mode = GET_MODE (operand0);
18540 rtx res, xa, TWO52, mask;
18541 rtx_code_label *label;
18542
18543 TWO52 = ix86_gen_TWO52 (mode);
18544
18545 /* Temporary for holding the result, initialized to the input
18546 operand to ease control flow. */
18547 res = copy_to_reg (operand1);
18548
18549 /* xa = abs (operand1) */
18550 xa = ix86_expand_sse_fabs (res, &mask);
18551
18552 /* if (!isless (xa, TWO52)) goto label; */
18553 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18554
18555 if (flag_rounding_math)
18556 {
18557 ix86_sse_copysign_to_positive (TWO52, TWO52, res, mask);
18558 xa = res;
18559 }
18560
18561 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
18562 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
18563
18564 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
18565 if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
18566 xa = ix86_expand_sse_fabs (xa, NULL);
18567
18568 ix86_sse_copysign_to_positive (res, xa, res, mask);
18569
18570 emit_label (label);
18571 LABEL_NUSES (label) = 1;
18572
18573 emit_move_insn (operand0, res);
18574 }
18575
18576 /* Expand SSE2 sequence for computing floor or ceil
18577 from OPERAND1 storing into OPERAND0. */
18578 void
18579 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
18580 {
18581 /* C code for the stuff we expand below.
18582 double xa = fabs (x), x2;
18583 if (!isless (xa, TWO52))
18584 return x;
18585 x2 = (double)(long)x;
18586
18587 Compensate. Floor:
18588 if (x2 > x)
18589 x2 -= 1;
18590 Compensate. Ceil:
18591 if (x2 < x)
18592 x2 += 1;
18593
18594 if (HONOR_SIGNED_ZEROS (mode))
18595 return copysign (x2, x);
18596 return x2;
18597 */
18598 machine_mode mode = GET_MODE (operand0);
18599 rtx xa, xi, TWO52, tmp, one, res, mask;
18600 rtx_code_label *label;
18601
18602 TWO52 = ix86_gen_TWO52 (mode);
18603
18604 /* Temporary for holding the result, initialized to the input
18605 operand to ease control flow. */
18606 res = copy_to_reg (operand1);
18607
18608 /* xa = abs (operand1) */
18609 xa = ix86_expand_sse_fabs (res, &mask);
18610
18611 /* if (!isless (xa, TWO52)) goto label; */
18612 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18613
18614 /* xa = (double)(long)x */
18615 xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
18616 expand_fix (xi, res, 0);
18617 expand_float (xa, xi, 0);
18618
18619 /* generate 1.0 */
18620 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
18621
18622 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
18623 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
18624 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
18625 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
18626 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
18627 if (HONOR_SIGNED_ZEROS (mode))
18628 {
18629 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
18630 if (do_floor && flag_rounding_math)
18631 tmp = ix86_expand_sse_fabs (tmp, NULL);
18632
18633 ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
18634 }
18635 emit_move_insn (res, tmp);
18636
18637 emit_label (label);
18638 LABEL_NUSES (label) = 1;
18639
18640 emit_move_insn (operand0, res);
18641 }
18642
18643 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
18644 into OPERAND0 without relying on DImode truncation via cvttsd2siq
18645 that is only available on 64bit targets. */
18646 void
18647 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
18648 {
18649 /* C code for the stuff we expand below.
18650 double xa = fabs (x), x2;
18651 if (!isless (xa, TWO52))
18652 return x;
18653 xa = xa + TWO52 - TWO52;
18654 x2 = copysign (xa, x);
18655
18656 Compensate. Floor:
18657 if (x2 > x)
18658 x2 -= 1;
18659 Compensate. Ceil:
18660 if (x2 < x)
18661 x2 += 1;
18662
18663 if (HONOR_SIGNED_ZEROS (mode))
18664 x2 = copysign (x2, x);
18665 return x2;
18666 */
18667 machine_mode mode = GET_MODE (operand0);
18668 rtx xa, TWO52, tmp, one, res, mask;
18669 rtx_code_label *label;
18670
18671 TWO52 = ix86_gen_TWO52 (mode);
18672
18673 /* Temporary for holding the result, initialized to the input
18674 operand to ease control flow. */
18675 res = copy_to_reg (operand1);
18676
18677 /* xa = abs (operand1) */
18678 xa = ix86_expand_sse_fabs (res, &mask);
18679
18680 /* if (!isless (xa, TWO52)) goto label; */
18681 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18682
18683 /* xa = xa + TWO52 - TWO52; */
18684 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
18685 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
18686
18687 /* xa = copysign (xa, operand1) */
18688 ix86_sse_copysign_to_positive (xa, xa, res, mask);
18689
18690 /* generate 1.0 */
18691 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
18692
18693 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
18694 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
18695 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
18696 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
18697 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
18698 if (HONOR_SIGNED_ZEROS (mode))
18699 {
18700 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
18701 if (do_floor && flag_rounding_math)
18702 tmp = ix86_expand_sse_fabs (tmp, NULL);
18703
18704 ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
18705 }
18706 emit_move_insn (res, tmp);
18707
18708 emit_label (label);
18709 LABEL_NUSES (label) = 1;
18710
18711 emit_move_insn (operand0, res);
18712 }
18713
18714 /* Expand SSE sequence for computing trunc
18715 from OPERAND1 storing into OPERAND0. */
18716 void
18717 ix86_expand_trunc (rtx operand0, rtx operand1)
18718 {
18719 /* C code for SSE variant we expand below.
18720 double xa = fabs (x), x2;
18721 if (!isless (xa, TWO52))
18722 return x;
18723 x2 = (double)(long)x;
18724 if (HONOR_SIGNED_ZEROS (mode))
18725 return copysign (x2, x);
18726 return x2;
18727 */
18728 machine_mode mode = GET_MODE (operand0);
18729 rtx xa, xi, TWO52, res, mask;
18730 rtx_code_label *label;
18731
18732 TWO52 = ix86_gen_TWO52 (mode);
18733
18734 /* Temporary for holding the result, initialized to the input
18735 operand to ease control flow. */
18736 res = copy_to_reg (operand1);
18737
18738 /* xa = abs (operand1) */
18739 xa = ix86_expand_sse_fabs (res, &mask);
18740
18741 /* if (!isless (xa, TWO52)) goto label; */
18742 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18743
18744 /* xa = (double)(long)x */
18745 xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
18746 expand_fix (xi, res, 0);
18747 expand_float (xa, xi, 0);
18748
18749 if (HONOR_SIGNED_ZEROS (mode))
18750 ix86_sse_copysign_to_positive (xa, xa, res, mask);
18751
18752 emit_move_insn (res, xa);
18753
18754 emit_label (label);
18755 LABEL_NUSES (label) = 1;
18756
18757 emit_move_insn (operand0, res);
18758 }
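/* A rough scalar model of the sequence in ix86_expand_trunc above
   (illustrative sketch only; assumes <math.h> and that the integer
   conversion is done in DImode, i.e. a 64-bit "long" here):

     double trunc_sketch (double x)
     {
       double xa = fabs (x);
       if (!(xa < 0x1p52))              // already integral (or NaN)
         return x;
       double x2 = (double) (long) x;   // cvttsd2si truncates toward zero
       return copysign (x2, x);         // keeps e.g. -0.3 -> -0.0
     }

   The real expansion only performs the final copysign when
   HONOR_SIGNED_ZEROS. The 2^52 guard serves two purposes: values that
   large are already integral, and it keeps the conversion well inside
   the DImode range.  */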
18759
18760 /* Expand SSE sequence for computing trunc from OPERAND1 storing
18761 into OPERAND0 without relying on DImode truncation via cvttsd2siq
18762 that is only available on 64bit targets. */
18763 void
18764 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
18765 {
18766 machine_mode mode = GET_MODE (operand0);
18767 rtx xa, xa2, TWO52, tmp, one, res, mask;
18768 rtx_code_label *label;
18769
18770 /* C code for SSE variant we expand below.
18771 double xa = fabs (x), xa2, x2;
18772 if (!isless (xa, TWO52))
18773 return x;
18774 xa2 = xa + TWO52 - TWO52;
18775 Compensate:
18776 if (xa2 > xa)
18777 xa2 -= 1.0;
18778 x2 = copysign (xa2, x);
18779 return x2;
18780 */
18781
18782 TWO52 = ix86_gen_TWO52 (mode);
18783
18784 /* Temporary for holding the result, initialized to the input
18785 operand to ease control flow. */
18786 res = copy_to_reg (operand1);
18787
18788 /* xa = abs (operand1) */
18789 xa = ix86_expand_sse_fabs (res, &mask);
18790
18791 /* if (!isless (xa, TWO52)) goto label; */
18792 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18793
18794 /* xa2 = xa + TWO52 - TWO52; */
18795 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
18796 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
18797
18798 /* generate 1.0 */
18799 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
18800
18801 /* Compensate: xa2 = xa2 - (xa2 > xa ? 1 : 0) */
18802 tmp = ix86_expand_sse_compare_mask (UNGT, xa2, xa, false);
18803 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
18804 tmp = expand_simple_binop (mode, MINUS,
18805 xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
18806 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
18807 if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
18808 tmp = ix86_expand_sse_fabs (tmp, NULL);
18809
18810 /* res = copysign (xa2, operand1) */
18811 ix86_sse_copysign_to_positive (res, tmp, res, mask);
18812
18813 emit_label (label);
18814 LABEL_NUSES (label) = 1;
18815
18816 emit_move_insn (operand0, res);
18817 }
18818
18819 /* Expand SSE sequence for computing round
18820 from OPERAND1 storing into OPERAND0. */
18821 void
18822 ix86_expand_round (rtx operand0, rtx operand1)
18823 {
18824 /* C code for the stuff we're doing below:
18825 double xa = fabs (x);
18826 if (!isless (xa, TWO52))
18827 return x;
18828 xa = (double)(long)(xa + nextafter (0.5, 0.0));
18829 return copysign (xa, x);
18830 */
18831 machine_mode mode = GET_MODE (operand0);
18832 rtx res, TWO52, xa, xi, half, mask;
18833 rtx_code_label *label;
18834 const struct real_format *fmt;
18835 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
18836
18837 /* Temporary for holding the result, initialized to the input
18838 operand to ease control flow. */
18839 res = copy_to_reg (operand1);
18840
18841 TWO52 = ix86_gen_TWO52 (mode);
18842 xa = ix86_expand_sse_fabs (res, &mask);
18843 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18844
18845 /* load nextafter (0.5, 0.0) */
18846 fmt = REAL_MODE_FORMAT (mode);
18847 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
18848 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
18849
18850 /* xa = xa + 0.5 */
18851 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
18852 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
18853
18854 /* xa = (double)(int64_t)xa */
18855 xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
18856 expand_fix (xi, xa, 0);
18857 expand_float (xa, xi, 0);
18858
18859 /* res = copysign (xa, operand1) */
18860 ix86_sse_copysign_to_positive (res, xa, res, mask);
18861
18862 emit_label (label);
18863 LABEL_NUSES (label) = 1;
18864
18865 emit_move_insn (operand0, res);
18866 }
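/* Why nextafter (0.5, 0.0) rather than 0.5 in ix86_expand_round above
   (illustrative note): for x = 0.49999999999999994 (the largest double
   below 0.5), x + 0.5 is not representable and rounds up to 1.0, so the
   truncation would give 1.0 instead of the correct 0.0. With
   pred_half = 0.5 - 2^-54 the sum is exactly 1 - 2^-53, which truncates
   to 0.0. Exact halves such as 2.5 still round away from zero, because
   2.5 + pred_half rounds up to 3.0.  */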
18867
18868 /* Expand SSE sequence for computing round from OPERAND1 storing
18869 into OPERAND0 without relying on DImode truncation via cvttsd2siq
18870 that is only available on 64bit targets. */
18871 void
18872 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
18873 {
18874 /* C code for the stuff we expand below.
18875 double xa = fabs (x), xa2, x2;
18876 if (!isless (xa, TWO52))
18877 return x;
18878 Using the absolute value and copying back sign makes
18879 -0.0 -> -0.0 correct.
18880 xa2 = xa + TWO52 - TWO52;
18881 Compensate.
18882 dxa = xa2 - xa;
18883 if (dxa <= -0.5)
18884 xa2 += 1;
18885 else if (dxa > 0.5)
18886 xa2 -= 1;
18887 x2 = copysign (xa2, x);
18888 return x2;
18889 */
18890 machine_mode mode = GET_MODE (operand0);
18891 rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
18892 rtx_code_label *label;
18893
18894 TWO52 = ix86_gen_TWO52 (mode);
18895
18896 /* Temporary for holding the result, initialized to the input
18897 operand to ease control flow. */
18898 res = copy_to_reg (operand1);
18899
18900 /* xa = abs (operand1) */
18901 xa = ix86_expand_sse_fabs (res, &mask);
18902
18903 /* if (!isless (xa, TWO52)) goto label; */
18904 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18905
18906 /* xa2 = xa + TWO52 - TWO52; */
18907 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
18908 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
18909
18910 /* dxa = xa2 - xa; */
18911 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
18912
18913 /* generate 0.5, 1.0 and -0.5 */
18914 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
18915 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
18916 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
18917 0, OPTAB_DIRECT);
18918
18919 /* Compensate. */
18920 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
18921 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
18922 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
18923 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
18924 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
18925 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
18926 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
18927 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
18928
18929 /* res = copysign (xa2, operand1) */
18930 ix86_sse_copysign_to_positive (res, xa2, res, mask);
18931
18932 emit_label (label);
18933 LABEL_NUSES (label) = 1;
18934
18935 emit_move_insn (operand0, res);
18936 }
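/* Worked trace of the compensation in ix86_expand_rounddf_32 above
   (illustrative only; assumes the default round-to-nearest-even mode).
   x = 2.5: xa2 = 2.5 + 2^52 - 2^52 = 2.0 (the tie rounds to the even
   candidate 2^52 + 2), dxa = -0.5, and since dxa <= -0.5 we add 1.0,
   giving round (2.5) = 3.0.
   x = 3.5: xa2 = 4.0 (the tie rounds to 2^52 + 4), dxa = 0.5, no
   compensation is needed, and the result is 4.0.  */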
18937
18938 /* Expand SSE sequence for computing round
18939 from OP1 storing into OP0 using sse4 round insn. */
18940 void
18941 ix86_expand_round_sse4 (rtx op0, rtx op1)
18942 {
18943 machine_mode mode = GET_MODE (op0);
18944 rtx e1, e2, res, half;
18945 const struct real_format *fmt;
18946 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
18947 rtx (*gen_copysign) (rtx, rtx, rtx);
18948 rtx (*gen_round) (rtx, rtx, rtx);
18949
18950 switch (mode)
18951 {
18952 case E_SFmode:
18953 gen_copysign = gen_copysignsf3;
18954 gen_round = gen_sse4_1_roundsf2;
18955 break;
18956 case E_DFmode:
18957 gen_copysign = gen_copysigndf3;
18958 gen_round = gen_sse4_1_rounddf2;
18959 break;
18960 default:
18961 gcc_unreachable ();
18962 }
18963
18964 /* round (a) = trunc (a + copysign (0.5, a)) */
18965
18966 /* load nextafter (0.5, 0.0) */
18967 fmt = REAL_MODE_FORMAT (mode);
18968 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
18969 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
18970 half = const_double_from_real_value (pred_half, mode);
18971
18972 /* e1 = copysign (0.5, op1) */
18973 e1 = gen_reg_rtx (mode);
18974 emit_insn (gen_copysign (e1, half, op1));
18975
18976 /* e2 = op1 + e1 */
18977 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
18978
18979 /* res = trunc (e2) */
18980 res = gen_reg_rtx (mode);
18981 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
18982
18983 emit_move_insn (op0, res);
18984 }
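/* Why the copysign in ix86_expand_round_sse4 above (illustrative note):
   trunc rounds toward zero, so the half-unit bias must carry the sign of
   the input. For x = -2.6, e1 is about -0.5 and e2 about -3.1, which
   truncates to -3.0 as desired; adding +0.5 instead would give about
   -2.1 and truncate to -2.0. Using nextafter (0.5, 0.0) instead of 0.5
   avoids rounding inputs just below 0.5 up to 1.0, as explained for
   ix86_expand_round above.  */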
18985
18986 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
18987 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
18988 insn every time. */
18989
18990 static GTY(()) rtx_insn *vselect_insn;
18991
18992 /* Initialize vselect_insn. */
18993
18994 static void
18995 init_vselect_insn (void)
18996 {
18997 unsigned i;
18998 rtx x;
18999
19000 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
19001 for (i = 0; i < MAX_VECT_LEN; ++i)
19002 XVECEXP (x, 0, i) = const0_rtx;
19003 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
19004 const0_rtx), x);
19005 x = gen_rtx_SET (const0_rtx, x);
19006 start_sequence ();
19007 vselect_insn = emit_insn (x);
19008 end_sequence ();
19009 }
19010
19011 /* Construct (set target (vec_select op0 (parallel perm))) and
19012 return true if that's a valid instruction in the active ISA. */
19013
19014 static bool
19015 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
19016 unsigned nelt, bool testing_p)
19017 {
19018 unsigned int i;
19019 rtx x, save_vconcat;
19020 int icode;
19021
19022 if (vselect_insn == NULL_RTX)
19023 init_vselect_insn ();
19024
19025 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
19026 PUT_NUM_ELEM (XVEC (x, 0), nelt);
19027 for (i = 0; i < nelt; ++i)
19028 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
19029 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
19030 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
19031 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
19032 SET_DEST (PATTERN (vselect_insn)) = target;
19033 icode = recog_memoized (vselect_insn);
19034
19035 if (icode >= 0 && !testing_p)
19036 emit_insn (copy_rtx (PATTERN (vselect_insn)));
19037
19038 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
19039 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
19040 INSN_CODE (vselect_insn) = -1;
19041
19042 return icode >= 0;
19043 }
19044
19045 /* Similar, but generate a vec_concat from op0 and op1 as well. */
19046
19047 static bool
19048 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
19049 const unsigned char *perm, unsigned nelt,
19050 bool testing_p)
19051 {
19052 machine_mode v2mode;
19053 rtx x;
19054 bool ok;
19055
19056 if (vselect_insn == NULL_RTX)
19057 init_vselect_insn ();
19058
19059 if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
19060 return false;
19061 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
19062 PUT_MODE (x, v2mode);
19063 XEXP (x, 0) = op0;
19064 XEXP (x, 1) = op1;
19065 ok = expand_vselect (target, x, perm, nelt, testing_p);
19066 XEXP (x, 0) = const0_rtx;
19067 XEXP (x, 1) = const0_rtx;
19068 return ok;
19069 }
19070
19071 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
19072 using movss or movsd. */
19073 static bool
19074 expand_vec_perm_movs (struct expand_vec_perm_d *d)
19075 {
19076 machine_mode vmode = d->vmode;
19077 unsigned i, nelt = d->nelt;
19078 rtx x;
19079
19080 if (d->one_operand_p)
19081 return false;
19082
19083 if (!(TARGET_SSE && (vmode == V4SFmode || vmode == V4SImode))
19084 && !(TARGET_MMX_WITH_SSE && (vmode == V2SFmode || vmode == V2SImode))
19085 && !(TARGET_SSE2 && (vmode == V2DFmode || vmode == V2DImode)))
19086 return false;
19087
19088 /* Only the first element is changed. */
19089 if (d->perm[0] != nelt && d->perm[0] != 0)
19090 return false;
19091 for (i = 1; i < nelt; ++i)
19092 if (d->perm[i] != i + nelt - d->perm[0])
19093 return false;
19094
19095 if (d->testing_p)
19096 return true;
19097
19098 if (d->perm[0] == nelt)
19099 x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
19100 else
19101 x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));
19102
19103 emit_insn (gen_rtx_SET (d->target, x));
19104
19105 return true;
19106 }
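/* Example of a permutation expand_vec_perm_movs matches (illustrative):
   V4SFmode with perm {4, 1, 2, 3} takes element 0 from op1 and elements
   1..3 from op0, which is exactly movss; it is emitted as
   (vec_merge:V4SF op1 op0 (const_int 1)), where mask bit i selects the
   i-th element from the first vec_merge operand.  */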
19107
19108 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
19109 using insertps. */
19110 static bool
19111 expand_vec_perm_insertps (struct expand_vec_perm_d *d)
19112 {
19113 machine_mode vmode = d->vmode;
19114 unsigned i, cnt_s, nelt = d->nelt;
19115 int cnt_d = -1;
19116 rtx src, dst;
19117
19118 if (d->one_operand_p)
19119 return false;
19120
19121 if (!(TARGET_SSE4_1
19122 && (vmode == V4SFmode || vmode == V4SImode
19123 || (TARGET_MMX_WITH_SSE
19124 && (vmode == V2SFmode || vmode == V2SImode)))))
19125 return false;
19126
19127 for (i = 0; i < nelt; ++i)
19128 {
19129 if (d->perm[i] == i)
19130 continue;
19131 if (cnt_d != -1)
19132 {
19133 cnt_d = -1;
19134 break;
19135 }
19136 cnt_d = i;
19137 }
19138
19139 if (cnt_d == -1)
19140 {
19141 for (i = 0; i < nelt; ++i)
19142 {
19143 if (d->perm[i] == i + nelt)
19144 continue;
19145 if (cnt_d != -1)
19146 return false;
19147 cnt_d = i;
19148 }
19149
19150 if (cnt_d == -1)
19151 return false;
19152 }
19153
19154 if (d->testing_p)
19155 return true;
19156
19157 gcc_assert (cnt_d != -1);
19158
19159 cnt_s = d->perm[cnt_d];
19160 if (cnt_s < nelt)
19161 {
19162 src = d->op0;
19163 dst = d->op1;
19164 }
19165 else
19166 {
19167 cnt_s -= nelt;
19168 src = d->op1;
19169 dst = d->op0;
19170 }
19171 gcc_assert (cnt_s < nelt);
19172
19173 rtx x = gen_sse4_1_insertps (vmode, d->target, dst, src,
19174 GEN_INT (cnt_s << 6 | cnt_d << 4));
19175 emit_insn (x);
19176
19177 return true;
19178 }
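/* Worked immediate example for expand_vec_perm_insertps (illustrative):
   V4SFmode perm {0, 6, 2, 3} only changes element 1, taking it from
   element 2 of op1, so cnt_d = 1 and cnt_s = 2. The insertps immediate
   is cnt_s << 6 | cnt_d << 4 = 0x90: bits 7:6 select the source
   element, bits 5:4 the destination slot, and bits 3:0 (zero here) form
   the zero mask.  */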
19179
19180 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
19181 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
19182
19183 static bool
19184 expand_vec_perm_blend (struct expand_vec_perm_d *d)
19185 {
19186 machine_mode mmode, vmode = d->vmode;
19187 unsigned i, nelt = d->nelt;
19188 unsigned HOST_WIDE_INT mask;
19189 rtx target, op0, op1, maskop, x;
19190 rtx rperm[32], vperm;
19191
19192 if (d->one_operand_p)
19193 return false;
19194 if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
19195 && (TARGET_AVX512BW
19196 || GET_MODE_UNIT_SIZE (vmode) >= 4))
19197 ;
19198 else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
19199 ;
19200 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
19201 ;
19202 else if (TARGET_SSE4_1
19203 && (GET_MODE_SIZE (vmode) == 16
19204 || (TARGET_MMX_WITH_SSE && GET_MODE_SIZE (vmode) == 8)
19205 || GET_MODE_SIZE (vmode) == 4))
19206 ;
19207 else
19208 return false;
19209
19210 /* This is a blend, not a permute. Elements must stay in their
19211 respective lanes. */
19212 for (i = 0; i < nelt; ++i)
19213 {
19214 unsigned e = d->perm[i];
19215 if (!(e == i || e == i + nelt))
19216 return false;
19217 }
19218
19219 if (d->testing_p)
19220 return true;
19221
19222 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
19223 decision should be extracted elsewhere, so that we only try that
19224 sequence once all budget==3 options have been tried. */
19225 target = d->target;
19226 op0 = d->op0;
19227 op1 = d->op1;
19228 mask = 0;
19229
19230 switch (vmode)
19231 {
19232 case E_V8DFmode:
19233 case E_V16SFmode:
19234 case E_V4DFmode:
19235 case E_V8SFmode:
19236 case E_V2DFmode:
19237 case E_V4SFmode:
19238 case E_V2SFmode:
19239 case E_V2HImode:
19240 case E_V4HImode:
19241 case E_V8HImode:
19242 case E_V8SImode:
19243 case E_V32HImode:
19244 case E_V64QImode:
19245 case E_V16SImode:
19246 case E_V8DImode:
19247 for (i = 0; i < nelt; ++i)
19248 mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
19249 break;
19250
19251 case E_V2DImode:
19252 for (i = 0; i < 2; ++i)
19253 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
19254 vmode = V8HImode;
19255 goto do_subreg;
19256
19257 case E_V2SImode:
19258 for (i = 0; i < 2; ++i)
19259 mask |= (d->perm[i] >= 2 ? 3 : 0) << (i * 2);
19260 vmode = V4HImode;
19261 goto do_subreg;
19262
19263 case E_V4SImode:
19264 if (TARGET_AVX2)
19265 {
19266 /* Use vpblendd instead of vpblendw. */
19267 for (i = 0; i < nelt; ++i)
19268 mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
19269 break;
19270 }
19271 else
19272 {
19273 for (i = 0; i < 4; ++i)
19274 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
19275 vmode = V8HImode;
19276 goto do_subreg;
19277 }
19278
19279 case E_V16QImode:
19280 /* See if bytes move in pairs so we can use pblendw with
19281 an immediate argument, rather than pblendvb with a vector
19282 argument. */
19283 for (i = 0; i < 16; i += 2)
19284 if (d->perm[i] + 1 != d->perm[i + 1])
19285 {
19286 use_pblendvb:
19287 for (i = 0; i < nelt; ++i)
19288 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
19289
19290 finish_pblendvb:
19291 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
19292 vperm = force_reg (vmode, vperm);
19293
19294 if (GET_MODE_SIZE (vmode) == 4)
19295 emit_insn (gen_mmx_pblendvb_v4qi (target, op0, op1, vperm));
19296 else if (GET_MODE_SIZE (vmode) == 8)
19297 emit_insn (gen_mmx_pblendvb_v8qi (target, op0, op1, vperm));
19298 else if (GET_MODE_SIZE (vmode) == 16)
19299 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
19300 else
19301 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
19302 if (target != d->target)
19303 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
19304 return true;
19305 }
19306
19307 for (i = 0; i < 8; ++i)
19308 mask |= (d->perm[i * 2] >= 16) << i;
19309 vmode = V8HImode;
19310 /* FALLTHRU */
19311
19312 do_subreg:
19313 target = gen_reg_rtx (vmode);
19314 op0 = gen_lowpart (vmode, op0);
19315 op1 = gen_lowpart (vmode, op1);
19316 break;
19317
19318 case E_V8QImode:
19319 for (i = 0; i < 8; i += 2)
19320 if (d->perm[i] + 1 != d->perm[i + 1])
19321 goto use_pblendvb;
19322
19323 for (i = 0; i < 4; ++i)
19324 mask |= (d->perm[i * 2] >= 8) << i;
19325 vmode = V4HImode;
19326 goto do_subreg;
19327
19328 case E_V4QImode:
19329 for (i = 0; i < 4; i += 2)
19330 if (d->perm[i] + 1 != d->perm[i + 1])
19331 goto use_pblendvb;
19332
19333 for (i = 0; i < 2; ++i)
19334 mask |= (d->perm[i * 2] >= 4) << i;
19335 vmode = V2HImode;
19336 goto do_subreg;
19337
19338 case E_V32QImode:
19339 /* See if bytes move in pairs. If not, vpblendvb must be used. */
19340 for (i = 0; i < 32; i += 2)
19341 if (d->perm[i] + 1 != d->perm[i + 1])
19342 goto use_pblendvb;
19343 /* See if bytes move in quadruplets. If yes, vpblendd
19344 with immediate can be used. */
19345 for (i = 0; i < 32; i += 4)
19346 if (d->perm[i] + 2 != d->perm[i + 2])
19347 break;
19348 if (i < 32)
19349 {
19350 /* See if bytes move the same in both lanes. If yes,
19351 vpblendw with immediate can be used. */
19352 for (i = 0; i < 16; i += 2)
19353 if (d->perm[i] + 16 != d->perm[i + 16])
19354 goto use_pblendvb;
19355
19356 /* Use vpblendw. */
19357 for (i = 0; i < 16; ++i)
19358 mask |= (d->perm[i * 2] >= 32) << i;
19359 vmode = V16HImode;
19360 goto do_subreg;
19361 }
19362
19363 /* Use vpblendd. */
19364 for (i = 0; i < 8; ++i)
19365 mask |= (d->perm[i * 4] >= 32) << i;
19366 vmode = V8SImode;
19367 goto do_subreg;
19368
19369 case E_V16HImode:
19370 /* See if words move in pairs. If yes, vpblendd can be used. */
19371 for (i = 0; i < 16; i += 2)
19372 if (d->perm[i] + 1 != d->perm[i + 1])
19373 break;
19374 if (i < 16)
19375 {
19376 /* See if words move the same in both lanes. If not,
19377 vpblendvb must be used. */
19378 for (i = 0; i < 8; i++)
19379 if (d->perm[i] + 8 != d->perm[i + 8])
19380 {
19381 /* Use vpblendvb. */
19382 for (i = 0; i < 32; ++i)
19383 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
19384
19385 vmode = V32QImode;
19386 nelt = 32;
19387 target = gen_reg_rtx (vmode);
19388 op0 = gen_lowpart (vmode, op0);
19389 op1 = gen_lowpart (vmode, op1);
19390 goto finish_pblendvb;
19391 }
19392
19393 /* Use vpblendw. */
19394 for (i = 0; i < 16; ++i)
19395 mask |= (d->perm[i] >= 16) << i;
19396 break;
19397 }
19398
19399 /* Use vpblendd. */
19400 for (i = 0; i < 8; ++i)
19401 mask |= (d->perm[i * 2] >= 16) << i;
19402 vmode = V8SImode;
19403 goto do_subreg;
19404
19405 case E_V4DImode:
19406 /* Use vpblendd. */
19407 for (i = 0; i < 4; ++i)
19408 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
19409 vmode = V8SImode;
19410 goto do_subreg;
19411
19412 default:
19413 gcc_unreachable ();
19414 }
19415
19416 switch (vmode)
19417 {
19418 case E_V8DFmode:
19419 case E_V8DImode:
19420 mmode = QImode;
19421 break;
19422 case E_V16SFmode:
19423 case E_V16SImode:
19424 mmode = HImode;
19425 break;
19426 case E_V32HImode:
19427 mmode = SImode;
19428 break;
19429 case E_V64QImode:
19430 mmode = DImode;
19431 break;
19432 default:
19433 mmode = VOIDmode;
19434 }
19435
19436 if (mmode != VOIDmode)
19437 maskop = force_reg (mmode, gen_int_mode (mask, mmode));
19438 else
19439 maskop = GEN_INT (mask);
19440
19441 /* This matches five different patterns with the different modes. */
19442 x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
19443 x = gen_rtx_SET (target, x);
19444 emit_insn (x);
19445 if (target != d->target)
19446 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
19447
19448 return true;
19449 }
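/* Worked mask example for expand_vec_perm_blend (illustrative):
   V8HImode perm {0, 9, 2, 11, 4, 13, 6, 15} keeps every element in its
   lane and takes the odd elements from op1, so mask = 0xaa and the
   result is (vec_merge:V8HI op1 op0 (const_int 0xaa)), i.e. pblendw
   with immediate 0xaa.  */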
19450
19451 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
19452 in terms of the variable form of vpermilps.
19453
19454 Note that we will have already failed the immediate input vpermilps,
19455 which requires that the high and low part shuffle be identical; the
19456 variable form doesn't require that. */
19457
19458 static bool
19459 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
19460 {
19461 rtx rperm[8], vperm;
19462 unsigned i;
19463
19464 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
19465 return false;
19466
19467 /* We can only permute within the 128-bit lane. */
19468 for (i = 0; i < 8; ++i)
19469 {
19470 unsigned e = d->perm[i];
19471 if (i < 4 ? e >= 4 : e < 4)
19472 return false;
19473 }
19474
19475 if (d->testing_p)
19476 return true;
19477
19478 for (i = 0; i < 8; ++i)
19479 {
19480 unsigned e = d->perm[i];
19481
19482 /* Within each 128-bit lane, the elements of op0 are numbered
19483 from 0 and the elements of op1 are numbered from 4. */
19484 if (e >= 8 + 4)
19485 e -= 8;
19486 else if (e >= 4)
19487 e -= 4;
19488
19489 rperm[i] = GEN_INT (e);
19490 }
19491
19492 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
19493 vperm = force_reg (V8SImode, vperm);
19494 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
19495
19496 return true;
19497 }
19498
19499 /* For V*[QHS]Imode permutations, check whether the same permutation
19500 can be performed in a 2x, 4x or 8x wider inner mode. */
19501
19502 static bool
19503 canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
19504 struct expand_vec_perm_d *nd)
19505 {
19506 int i;
19507 machine_mode mode = VOIDmode;
19508
19509 switch (d->vmode)
19510 {
19511 case E_V8QImode: mode = V4HImode; break;
19512 case E_V16QImode: mode = V8HImode; break;
19513 case E_V32QImode: mode = V16HImode; break;
19514 case E_V64QImode: mode = V32HImode; break;
19515 case E_V4HImode: mode = V2SImode; break;
19516 case E_V8HImode: mode = V4SImode; break;
19517 case E_V16HImode: mode = V8SImode; break;
19518 case E_V32HImode: mode = V16SImode; break;
19519 case E_V4SImode: mode = V2DImode; break;
19520 case E_V8SImode: mode = V4DImode; break;
19521 case E_V16SImode: mode = V8DImode; break;
19522 default: return false;
19523 }
19524 for (i = 0; i < d->nelt; i += 2)
19525 if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
19526 return false;
19527 nd->vmode = mode;
19528 nd->nelt = d->nelt / 2;
19529 for (i = 0; i < nd->nelt; i++)
19530 nd->perm[i] = d->perm[2 * i] / 2;
19531 if (GET_MODE_INNER (mode) != DImode)
19532 canonicalize_vector_int_perm (nd, nd);
19533 if (nd != d)
19534 {
19535 nd->one_operand_p = d->one_operand_p;
19536 nd->testing_p = d->testing_p;
19537 if (d->op0 == d->op1)
19538 nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
19539 else
19540 {
19541 nd->op0 = gen_lowpart (nd->vmode, d->op0);
19542 nd->op1 = gen_lowpart (nd->vmode, d->op1);
19543 }
19544 if (d->testing_p)
19545 nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
19546 else
19547 nd->target = gen_reg_rtx (nd->vmode);
19548 }
19549 return true;
19550 }
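/* Example for canonicalize_vector_int_perm (illustrative): the
   V16QImode permutation
   {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13} moves bytes in
   aligned pairs, so it is narrowed to the V8HImode permutation
   {1, 0, 3, 2, 5, 4, 7, 6}; the recursion stops there because the word
   indices are no longer pairwise consecutive.  */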
19551
19552 /* Return true if permutation D can be performed as VMODE permutation
19553 instead. */
19554
19555 static bool
19556 valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
19557 {
19558 unsigned int i, j, chunk;
19559
19560 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
19561 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
19562 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
19563 return false;
19564
19565 if (GET_MODE_NUNITS (vmode) >= d->nelt)
19566 return true;
19567
19568 chunk = d->nelt / GET_MODE_NUNITS (vmode);
19569 for (i = 0; i < d->nelt; i += chunk)
19570 if (d->perm[i] & (chunk - 1))
19571 return false;
19572 else
19573 for (j = 1; j < chunk; ++j)
19574 if (d->perm[i] + j != d->perm[i + j])
19575 return false;
19576
19577 return true;
19578 }
19579
19580 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
19581 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
19582
19583 static bool
19584 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
19585 {
19586 unsigned i, nelt, eltsz, mask;
19587 unsigned char perm[64];
19588 machine_mode vmode;
19589 struct expand_vec_perm_d nd;
19590 rtx rperm[64], vperm, target, op0, op1;
19591
19592 nelt = d->nelt;
19593
19594 if (!d->one_operand_p)
19595 switch (GET_MODE_SIZE (d->vmode))
19596 {
19597 case 4:
19598 if (!TARGET_XOP)
19599 return false;
19600 vmode = V4QImode;
19601 break;
19602
19603 case 8:
19604 if (!TARGET_XOP)
19605 return false;
19606 vmode = V8QImode;
19607 break;
19608
19609 case 16:
19610 if (!TARGET_XOP)
19611 return false;
19612 vmode = V16QImode;
19613 break;
19614
19615 case 32:
19616 if (!TARGET_AVX2)
19617 return false;
19618
19619 if (valid_perm_using_mode_p (V2TImode, d))
19620 {
19621 if (d->testing_p)
19622 return true;
19623
19624 /* Use vperm2i128 insn. The pattern uses
19625 V4DImode instead of V2TImode. */
19626 target = d->target;
19627 if (d->vmode != V4DImode)
19628 target = gen_reg_rtx (V4DImode);
19629 op0 = gen_lowpart (V4DImode, d->op0);
19630 op1 = gen_lowpart (V4DImode, d->op1);
19631 rperm[0]
19632 = GEN_INT ((d->perm[0] / (nelt / 2))
19633 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
19634 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
19635 if (target != d->target)
19636 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
19637 return true;
19638 }
19639 /* FALLTHRU */
19640
19641 default:
19642 return false;
19643 }
19644 else
19645 switch (GET_MODE_SIZE (d->vmode))
19646 {
19647 case 4:
19648 if (!TARGET_SSSE3)
19649 return false;
19650 vmode = V4QImode;
19651 break;
19652
19653 case 8:
19654 if (!TARGET_SSSE3)
19655 return false;
19656 vmode = V8QImode;
19657 break;
19658
19659 case 16:
19660 if (!TARGET_SSSE3)
19661 return false;
19662 vmode = V16QImode;
19663 break;
19664
19665 case 32:
19666 if (!TARGET_AVX2)
19667 return false;
19668
19669 /* V4DImode should already be handled through
19670 expand_vselect by the vpermq instruction. */
19671 gcc_assert (d->vmode != V4DImode);
19672
19673 vmode = V32QImode;
19674 if (d->vmode == V8SImode
19675 || d->vmode == V16HImode
19676 || d->vmode == V32QImode)
19677 {
19678 /* First see if vpermq can be used for
19679 V8SImode/V16HImode/V32QImode. */
19680 if (valid_perm_using_mode_p (V4DImode, d))
19681 {
19682 for (i = 0; i < 4; i++)
19683 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
19684 if (d->testing_p)
19685 return true;
19686 target = gen_reg_rtx (V4DImode);
19687 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
19688 perm, 4, false))
19689 {
19690 emit_move_insn (d->target,
19691 gen_lowpart (d->vmode, target));
19692 return true;
19693 }
19694 return false;
19695 }
19696
19697 /* Next see if vpermd can be used. */
19698 if (valid_perm_using_mode_p (V8SImode, d))
19699 vmode = V8SImode;
19700 }
19701 /* Or if vpermps can be used. */
19702 else if (d->vmode == V8SFmode)
19703 vmode = V8SImode;
19704
19705 if (vmode == V32QImode)
19706 {
19707 /* vpshufb only works intra lanes; it is not
19708 possible to shuffle bytes in between the lanes. */
19709 for (i = 0; i < nelt; ++i)
19710 if ((d->perm[i] ^ i) & (nelt / 2))
19711 return false;
19712 }
19713 break;
19714
19715 case 64:
19716 if (!TARGET_AVX512BW)
19717 return false;
19718
19719 /* If vpermq didn't work, vpshufb won't work either. */
19720 if (d->vmode == V8DFmode || d->vmode == V8DImode)
19721 return false;
19722
19723 vmode = V64QImode;
19724 if (d->vmode == V16SImode
19725 || d->vmode == V32HImode
19726 || d->vmode == V64QImode)
19727 {
19728 /* First see if vpermq can be used for
19729 V16SImode/V32HImode/V64QImode. */
19730 if (valid_perm_using_mode_p (V8DImode, d))
19731 {
19732 for (i = 0; i < 8; i++)
19733 perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
19734 if (d->testing_p)
19735 return true;
19736 target = gen_reg_rtx (V8DImode);
19737 if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
19738 perm, 8, false))
19739 {
19740 emit_move_insn (d->target,
19741 gen_lowpart (d->vmode, target));
19742 return true;
19743 }
19744 return false;
19745 }
19746
19747 /* Next see if vpermd can be used. */
19748 if (valid_perm_using_mode_p (V16SImode, d))
19749 vmode = V16SImode;
19750 }
19751 /* Or if vpermps can be used. */
19752 else if (d->vmode == V16SFmode)
19753 vmode = V16SImode;
19754
19755 if (vmode == V64QImode)
19756 {
19757 /* vpshufb only works intra lanes; it is not
19758 possible to shuffle bytes in between the lanes. */
19759 for (i = 0; i < nelt; ++i)
19760 if ((d->perm[i] ^ i) & (3 * nelt / 4))
19761 return false;
19762 }
19763 break;
19764
19765 default:
19766 return false;
19767 }
19768
19769 if (d->testing_p)
19770 return true;
19771
19772 /* Try to avoid a variable permutation instruction. */
19773 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
19774 {
19775 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
19776 return true;
19777 }
19778
19779 if (vmode == V8SImode)
19780 for (i = 0; i < 8; ++i)
19781 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
19782 else if (vmode == V16SImode)
19783 for (i = 0; i < 16; ++i)
19784 rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
19785 else
19786 {
19787 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
19788 if (!d->one_operand_p)
19789 mask = 2 * nelt - 1;
19790 else if (vmode == V64QImode)
19791 mask = nelt / 4 - 1;
19792 else if (vmode == V32QImode)
19793 mask = nelt / 2 - 1;
19794 else
19795 mask = nelt - 1;
19796
19797 for (i = 0; i < nelt; ++i)
19798 {
19799 unsigned j, e = d->perm[i] & mask;
19800 for (j = 0; j < eltsz; ++j)
19801 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
19802 }
19803 }
19804
19805 machine_mode vpmode = vmode;
19806
19807 nelt = GET_MODE_SIZE (vmode);
19808
19809 /* Emulate narrow modes with V16QI instructions. */
19810 if (nelt < 16)
19811 {
19812 rtx m128 = GEN_INT (-128);
19813
19814 /* Remap elements from the second operand, as we have to
19815 account for inactive top elements from the first operand. */
19816 if (!d->one_operand_p)
19817 {
19818 for (i = 0; i < nelt; ++i)
19819 {
19820 unsigned ival = UINTVAL (rperm[i]);
19821 if (ival >= nelt)
19822 rperm[i] = GEN_INT (ival + 16 - nelt);
19823 }
19824 }
19825
19826 /* Fill inactive elements in the top positions with zeros. */
19827 for (i = nelt; i < 16; ++i)
19828 rperm[i] = m128;
19829
19830 vpmode = V16QImode;
19831 }
19832
19833 vperm = gen_rtx_CONST_VECTOR (vpmode,
19834 gen_rtvec_v (GET_MODE_NUNITS (vpmode), rperm));
19835 vperm = force_reg (vpmode, vperm);
19836
19837 if (vmode == d->vmode)
19838 target = d->target;
19839 else
19840 target = gen_reg_rtx (vmode);
19841
19842 op0 = gen_lowpart (vmode, d->op0);
19843
19844 if (d->one_operand_p)
19845 {
19846 rtx (*gen) (rtx, rtx, rtx);
19847
19848 if (vmode == V4QImode)
19849 gen = gen_mmx_pshufbv4qi3;
19850 else if (vmode == V8QImode)
19851 gen = gen_mmx_pshufbv8qi3;
19852 else if (vmode == V16QImode)
19853 gen = gen_ssse3_pshufbv16qi3;
19854 else if (vmode == V32QImode)
19855 gen = gen_avx2_pshufbv32qi3;
19856 else if (vmode == V64QImode)
19857 gen = gen_avx512bw_pshufbv64qi3;
19858 else if (vmode == V8SFmode)
19859 gen = gen_avx2_permvarv8sf;
19860 else if (vmode == V8SImode)
19861 gen = gen_avx2_permvarv8si;
19862 else if (vmode == V16SFmode)
19863 gen = gen_avx512f_permvarv16sf;
19864 else if (vmode == V16SImode)
19865 gen = gen_avx512f_permvarv16si;
19866 else
19867 gcc_unreachable ();
19868
19869 emit_insn (gen (target, op0, vperm));
19870 }
19871 else
19872 {
19873 rtx (*gen) (rtx, rtx, rtx, rtx);
19874
19875 op1 = gen_lowpart (vmode, d->op1);
19876
19877 if (vmode == V4QImode)
19878 gen = gen_mmx_ppermv32;
19879 else if (vmode == V8QImode)
19880 gen = gen_mmx_ppermv64;
19881 else if (vmode == V16QImode)
19882 gen = gen_xop_pperm;
19883 else
19884 gcc_unreachable ();
19885
19886 emit_insn (gen (target, op0, op1, vperm));
19887 }
19888
19889 if (target != d->target)
19890 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
19891
19892 return true;
19893 }
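/* Worked control-vector example for expand_vec_perm_pshufb
   (illustrative): a one-operand V8HImode permutation
   {3, 2, 1, 0, 7, 6, 5, 4} goes the V16QImode pshufb route with
   eltsz = 2, so each word index e expands to the byte pair
   {2*e, 2*e + 1} and the control vector becomes
   {6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9}.  */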
19894
19895 /* Try to expand one-operand permutation with constant mask. */
19896
19897 static bool
19898 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
19899 {
19900 machine_mode mode = GET_MODE (d->op0);
19901 machine_mode maskmode = mode;
19902 unsigned inner_size = GET_MODE_SIZE (GET_MODE_INNER (mode));
19903 rtx (*gen) (rtx, rtx, rtx) = NULL;
19904 rtx target, op0, mask;
19905 rtx vec[64];
19906
19907 if (!rtx_equal_p (d->op0, d->op1))
19908 return false;
19909
19910 if (!TARGET_AVX512F)
19911 return false;
19912
19913 /* Accept VNxHImode and VNxQImode now. */
19914 if (!TARGET_AVX512VL && GET_MODE_SIZE (mode) < 64)
19915 return false;
19916
19917 /* vpermw. */
19918 if (!TARGET_AVX512BW && inner_size == 2)
19919 return false;
19920
19921 /* vpermb. */
19922 if (!TARGET_AVX512VBMI && inner_size == 1)
19923 return false;
19924
19925 switch (mode)
19926 {
19927 case E_V16SImode:
19928 gen = gen_avx512f_permvarv16si;
19929 break;
19930 case E_V16SFmode:
19931 gen = gen_avx512f_permvarv16sf;
19932 maskmode = V16SImode;
19933 break;
19934 case E_V8DImode:
19935 gen = gen_avx512f_permvarv8di;
19936 break;
19937 case E_V8DFmode:
19938 gen = gen_avx512f_permvarv8df;
19939 maskmode = V8DImode;
19940 break;
19941 case E_V32HImode:
19942 gen = gen_avx512bw_permvarv32hi;
19943 break;
19944 case E_V16HImode:
19945 gen = gen_avx512vl_permvarv16hi;
19946 break;
19947 case E_V8HImode:
19948 gen = gen_avx512vl_permvarv8hi;
19949 break;
19950 case E_V64QImode:
19951 gen = gen_avx512bw_permvarv64qi;
19952 break;
19953 case E_V32QImode:
19954 gen = gen_avx512vl_permvarv32qi;
19955 break;
19956 case E_V16QImode:
19957 gen = gen_avx512vl_permvarv16qi;
19958 break;
19959
19960 default:
19961 return false;
19962 }
19963
19964 if (d->testing_p)
19965 return true;
19966
19967 target = d->target;
19968 op0 = d->op0;
19969 for (int i = 0; i < d->nelt; ++i)
19970 vec[i] = GEN_INT (d->perm[i]);
19971 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
19972 emit_insn (gen (target, op0, force_reg (maskmode, mask)));
19973 return true;
19974 }
19975
19976 static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
19977
19978 /* A subroutine of ix86_expand_vec_perm_const_1. Try to instantiate D
19979 in a single instruction. */
19980
19981 static bool
19982 expand_vec_perm_1 (struct expand_vec_perm_d *d)
19983 {
19984 unsigned i, nelt = d->nelt;
19985 struct expand_vec_perm_d nd;
19986
19987 /* Check plain VEC_SELECT first, because AVX has instructions that could
19988 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
19989 input where SEL+CONCAT may not. */
19990 if (d->one_operand_p)
19991 {
19992 int mask = nelt - 1;
19993 bool identity_perm = true;
19994 bool broadcast_perm = true;
19995
19996 for (i = 0; i < nelt; i++)
19997 {
19998 nd.perm[i] = d->perm[i] & mask;
19999 if (nd.perm[i] != i)
20000 identity_perm = false;
20001 if (nd.perm[i])
20002 broadcast_perm = false;
20003 }
20004
20005 if (identity_perm)
20006 {
20007 if (!d->testing_p)
20008 emit_move_insn (d->target, d->op0);
20009 return true;
20010 }
20011 else if (broadcast_perm && TARGET_AVX2)
20012 {
20013 /* Use vpbroadcast{b,w,d}. */
20014 rtx (*gen) (rtx, rtx) = NULL;
20015 switch (d->vmode)
20016 {
20017 case E_V64QImode:
20018 if (TARGET_AVX512BW)
20019 gen = gen_avx512bw_vec_dupv64qi_1;
20020 break;
20021 case E_V32QImode:
20022 gen = gen_avx2_pbroadcastv32qi_1;
20023 break;
20024 case E_V32HImode:
20025 if (TARGET_AVX512BW)
20026 gen = gen_avx512bw_vec_dupv32hi_1;
20027 break;
20028 case E_V16HImode:
20029 gen = gen_avx2_pbroadcastv16hi_1;
20030 break;
20031 case E_V16SImode:
20032 if (TARGET_AVX512F)
20033 gen = gen_avx512f_vec_dupv16si_1;
20034 break;
20035 case E_V8SImode:
20036 gen = gen_avx2_pbroadcastv8si_1;
20037 break;
20038 case E_V16QImode:
20039 gen = gen_avx2_pbroadcastv16qi;
20040 break;
20041 case E_V8HImode:
20042 gen = gen_avx2_pbroadcastv8hi;
20043 break;
20044 case E_V16SFmode:
20045 if (TARGET_AVX512F)
20046 gen = gen_avx512f_vec_dupv16sf_1;
20047 break;
20048 case E_V8SFmode:
20049 gen = gen_avx2_vec_dupv8sf_1;
20050 break;
20051 case E_V8DFmode:
20052 if (TARGET_AVX512F)
20053 gen = gen_avx512f_vec_dupv8df_1;
20054 break;
20055 case E_V8DImode:
20056 if (TARGET_AVX512F)
20057 gen = gen_avx512f_vec_dupv8di_1;
20058 break;
20059 /* For other modes prefer other shuffles this function creates. */
20060 default: break;
20061 }
20062 if (gen != NULL)
20063 {
20064 if (!d->testing_p)
20065 emit_insn (gen (d->target, d->op0));
20066 return true;
20067 }
20068 }
20069
20070 if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
20071 return true;
20072
20073 /* There are plenty of patterns in sse.md that are written for
20074 SEL+CONCAT and are not replicated for a single op. Perhaps
20075 that should be changed, to avoid the nastiness here. */
20076
20077 /* Recognize interleave style patterns, which means incrementing
20078 every other permutation operand. */
20079 for (i = 0; i < nelt; i += 2)
20080 {
20081 nd.perm[i] = d->perm[i] & mask;
20082 nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
20083 }
20084 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
20085 d->testing_p))
20086 return true;
20087
20088 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
20089 if (nelt >= 4)
20090 {
20091 for (i = 0; i < nelt; i += 4)
20092 {
20093 nd.perm[i + 0] = d->perm[i + 0] & mask;
20094 nd.perm[i + 1] = d->perm[i + 1] & mask;
20095 nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
20096 nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
20097 }
20098
20099 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
20100 d->testing_p))
20101 return true;
20102 }
20103 }
20104
20105 /* Try the SSE4.1 blend variable merge instructions. */
20106 if (expand_vec_perm_blend (d))
20107 return true;
20108
20109 /* Try movss/movsd instructions. */
20110 if (expand_vec_perm_movs (d))
20111 return true;
20112
20113 /* Try the SSE4.1 insertps instruction. */
20114 if (expand_vec_perm_insertps (d))
20115 return true;
20116
20117 /* Try the fully general two operand permute. */
20118 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
20119 d->testing_p))
20120 return true;
20121
20122 /* Recognize interleave style patterns with reversed operands. */
20123 if (!d->one_operand_p)
20124 {
20125 for (i = 0; i < nelt; ++i)
20126 {
20127 unsigned e = d->perm[i];
20128 if (e >= nelt)
20129 e -= nelt;
20130 else
20131 e += nelt;
20132 nd.perm[i] = e;
20133 }
20134
20135 if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
20136 d->testing_p))
20137 return true;
20138 }
20139
20140 /* Try one of the AVX vpermil variable permutations. */
20141 if (expand_vec_perm_vpermil (d))
20142 return true;
20143
20144 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
20145 vpshufb, vpermd, vpermps or vpermq variable permutation. */
20146 if (expand_vec_perm_pshufb (d))
20147 return true;
20148
20149 /* Try the AVX2 vpalignr instruction. */
20150 if (expand_vec_perm_palignr (d, true))
20151 return true;
20152
20153 /* Try the AVX512F vperm{w,b,s,d} instructions */
20154 if (ix86_expand_vec_one_operand_perm_avx512 (d))
20155 return true;
20156
20157 /* Try the AVX512F vpermt2/vpermi2 instructions. */
20158 if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
20159 return true;
20160
20161 /* See if we can get the same permutation in different vector integer
20162 mode. */
20163 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
20164 {
20165 if (!d->testing_p)
20166 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
20167 return true;
20168 }
20169 return false;
20170 }
20171
20172 /* Canonicalize the vec_perm index so that the first index
20173 always comes from the first vector. */
20174 static void
20175 ix86_vec_perm_index_canon (struct expand_vec_perm_d *d)
20176 {
20177 unsigned nelt = d->nelt;
20178 if (d->perm[0] < nelt)
20179 return;
20180
20181 for (unsigned i = 0; i != nelt; i++)
20182 d->perm[i] = (d->perm[i] + nelt) % (2 * nelt);
20183
20184 std::swap (d->op0, d->op1);
20185 return;
20186 }
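/* Example for ix86_vec_perm_index_canon (illustrative): with nelt = 4
   and perm {5, 1, 6, 2}, the first index refers to the second vector,
   so the indices are remapped to {1, 5, 2, 6} and the two operands are
   swapped; adding nelt modulo 2 * nelt simply flips which operand each
   index refers to, so the selection is unchanged.  */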
20187
20188 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
20189 in terms of a pair of shufps + shufps/pshufd instructions. */
20190 static bool
20191 expand_vec_perm_shufps_shufps (struct expand_vec_perm_d *d)
20192 {
20193 unsigned char perm1[4];
20194 machine_mode vmode = d->vmode;
20195 bool ok;
20196 unsigned i, j, k, count = 0;
20197
20198 if (d->one_operand_p
20199 || (vmode != V4SImode && vmode != V4SFmode))
20200 return false;
20201
20202 if (d->testing_p)
20203 return true;
20204
20205 ix86_vec_perm_index_canon (d);
20206 for (i = 0; i < 4; ++i)
20207 count += d->perm[i] > 3 ? 1 : 0;
20208
20209 gcc_assert (count & 3);
20210
20211 rtx tmp = gen_reg_rtx (vmode);
20212 /* 2 from op0 and 2 from op1. */
20213 if (count == 2)
20214 {
20215 unsigned char perm2[4];
20216 for (i = 0, j = 0, k = 2; i < 4; ++i)
20217 if (d->perm[i] & 4)
20218 {
20219 perm1[k++] = d->perm[i];
20220 perm2[i] = k - 1;
20221 }
20222 else
20223 {
20224 perm1[j++] = d->perm[i];
20225 perm2[i] = j - 1;
20226 }
20227
20228 /* shufps. */
20229 ok = expand_vselect_vconcat (tmp, d->op0, d->op1,
20230 perm1, d->nelt, false);
20231 gcc_assert (ok);
20232 if (vmode == V4SImode && TARGET_SSE2)
20233 /* pshufd. */
20234 ok = expand_vselect (d->target, tmp,
20235 perm2, d->nelt, false);
20236 else
20237 {
20238 /* shufps. */
20239 perm2[2] += 4;
20240 perm2[3] += 4;
20241 ok = expand_vselect_vconcat (d->target, tmp, tmp,
20242 perm2, d->nelt, false);
20243 }
20244 gcc_assert (ok);
20245 }
20246 /* 3 from one op and 1 from another. */
20247 else
20248 {
20249 unsigned pair_idx = 8, lone_idx = 8, shift;
20250
20251 /* Find the lone index. */
20252 for (i = 0; i < 4; ++i)
20253 if ((d->perm[i] > 3 && count == 1)
20254 || (d->perm[i] < 4 && count == 3))
20255 lone_idx = i;
20256
20257 /* When lone_idx is not 0, it must come from the second op (count == 1). */
20258 gcc_assert (count == (lone_idx ? 1 : 3));
20259
20260 /* Find the pair index that sits in the same half as the lone index. */
20261 shift = lone_idx & 2;
20262 pair_idx = 1 - lone_idx + 2 * shift;
20263
20264 /* First permute the lone index and the pair index into the same vector as
20265 [ lone, lone, pair, pair ]. */
20266 perm1[1] = perm1[0]
20267 = (count == 3) ? d->perm[lone_idx] : d->perm[lone_idx] - 4;
20268 perm1[3] = perm1[2]
20269 = (count == 3) ? d->perm[pair_idx] : d->perm[pair_idx] + 4;
20270
20271 /* Always put the vector containing the lone index first. */
20272 if (count == 1)
20273 std::swap (d->op0, d->op1);
20274
20275 /* shufps. */
20276 ok = expand_vselect_vconcat (tmp, d->op0, d->op1,
20277 perm1, d->nelt, false);
20278 gcc_assert (ok);
20279
20280 /* Refine lone and pair index to original order. */
20281 perm1[shift] = lone_idx << 1;
20282 perm1[shift + 1] = pair_idx << 1;
20283
20284 /* Select the remaining 2 elements in another vector. */
20285 for (i = 2 - shift; i < 4 - shift; ++i)
20286 perm1[i] = lone_idx == 1 ? d->perm[i] + 4 : d->perm[i];
20287
20288 /* Adjust to original selector. */
20289 if (lone_idx > 1)
20290 std::swap (tmp, d->op1);
20291
20292 /* shufps. */
20293 ok = expand_vselect_vconcat (d->target, tmp, d->op1,
20294 perm1, d->nelt, false);
20295
20296 gcc_assert (ok);
20297 }
20298
20299 return true;
20300 }
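/* Worked trace of the 2 + 2 case in expand_vec_perm_shufps_shufps
   (illustrative): V4SFmode perm {0, 5, 2, 7} has two elements from each
   operand. The first shufps gathers them as
   tmp = {op0[0], op0[2], op1[1], op1[3]} (perm1 = {0, 2, 5, 7}), and
   the second shuffle reorders tmp with perm2 = {0, 2, 1, 3}, giving
   {op0[0], op1[1], op0[2], op1[3]} as requested.  */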
20301
20302 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
20303 in terms of a pair of pshuflw + pshufhw instructions. */
20304
20305 static bool
20306 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
20307 {
20308 unsigned char perm2[MAX_VECT_LEN];
20309 unsigned i;
20310 bool ok;
20311
20312 if (d->vmode != V8HImode || !d->one_operand_p)
20313 return false;
20314
20315 /* The two permutations only operate in 64-bit lanes. */
20316 for (i = 0; i < 4; ++i)
20317 if (d->perm[i] >= 4)
20318 return false;
20319 for (i = 4; i < 8; ++i)
20320 if (d->perm[i] < 4)
20321 return false;
20322
20323 if (d->testing_p)
20324 return true;
20325
20326 /* Emit the pshuflw. */
20327 memcpy (perm2, d->perm, 4);
20328 for (i = 4; i < 8; ++i)
20329 perm2[i] = i;
20330 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
20331 gcc_assert (ok);
20332
20333 /* Emit the pshufhw. */
20334 memcpy (perm2 + 4, d->perm + 4, 4);
20335 for (i = 0; i < 4; ++i)
20336 perm2[i] = i;
20337 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
20338 gcc_assert (ok);
20339
20340 return true;
20341 }
20342
20343 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
20344 the permutation using the SSSE3 palignr instruction. This succeeds
20345 when all of the elements in PERM fit within one vector and we merely
20346 need to shift them down so that a single vector permutation has a
20347 chance to succeed. If SINGLE_INSN_ONLY_P, succeed only if
20348 the vpalignr instruction itself can perform the requested permutation. */
20349
20350 static bool
20351 expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
20352 {
20353 unsigned i, nelt = d->nelt;
20354 unsigned min, max, minswap, maxswap;
20355 bool in_order, ok, swap = false;
20356 rtx shift, target;
20357 struct expand_vec_perm_d dcopy;
20358
20359 /* Even with AVX, palignr only operates on 128-bit vectors;
20360 in AVX2, palignr operates on each of the two 128-bit lanes separately. */
20361 if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
20362 && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
20363 return false;
20364
20365 min = 2 * nelt;
20366 max = 0;
20367 minswap = 2 * nelt;
20368 maxswap = 0;
20369 for (i = 0; i < nelt; ++i)
20370 {
20371 unsigned e = d->perm[i];
20372 unsigned eswap = d->perm[i] ^ nelt;
20373 if (GET_MODE_SIZE (d->vmode) == 32)
20374 {
20375 e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
20376 eswap = e ^ (nelt / 2);
20377 }
20378 if (e < min)
20379 min = e;
20380 if (e > max)
20381 max = e;
20382 if (eswap < minswap)
20383 minswap = eswap;
20384 if (eswap > maxswap)
20385 maxswap = eswap;
20386 }
20387 if (min == 0
20388 || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
20389 {
20390 if (d->one_operand_p
20391 || minswap == 0
20392 || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
20393 ? nelt / 2 : nelt))
20394 return false;
20395 swap = true;
20396 min = minswap;
20397 max = maxswap;
20398 }
20399
20400 /* Given that we have SSSE3, we know we'll be able to implement the
20401 single operand permutation after the palignr with pshufb for
20402 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
20403 first. */
20404 if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
20405 return true;
20406
20407 dcopy = *d;
20408 if (swap)
20409 {
20410 dcopy.op0 = d->op1;
20411 dcopy.op1 = d->op0;
20412 for (i = 0; i < nelt; ++i)
20413 dcopy.perm[i] ^= nelt;
20414 }
20415
20416 in_order = true;
20417 for (i = 0; i < nelt; ++i)
20418 {
20419 unsigned e = dcopy.perm[i];
20420 if (GET_MODE_SIZE (d->vmode) == 32
20421 && e >= nelt
20422 && (e & (nelt / 2 - 1)) < min)
20423 e = e - min - (nelt / 2);
20424 else
20425 e = e - min;
20426 if (e != i)
20427 in_order = false;
20428 dcopy.perm[i] = e;
20429 }
20430 dcopy.one_operand_p = true;
20431
20432 if (single_insn_only_p && !in_order)
20433 return false;
20434
20435 /* For AVX2, test whether we can permute the result in one instruction. */
20436 if (d->testing_p)
20437 {
20438 if (in_order)
20439 return true;
20440 dcopy.op1 = dcopy.op0;
20441 return expand_vec_perm_1 (&dcopy);
20442 }
20443
20444 shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
20445 if (GET_MODE_SIZE (d->vmode) == 16)
20446 {
20447 target = gen_reg_rtx (V1TImode);
20448 emit_insn (gen_ssse3_palignrv1ti (target,
20449 gen_lowpart (V1TImode, dcopy.op1),
20450 gen_lowpart (V1TImode, dcopy.op0),
20451 shift));
20452 }
20453 else
20454 {
20455 target = gen_reg_rtx (V2TImode);
20456 emit_insn (gen_avx2_palignrv2ti (target,
20457 gen_lowpart (V2TImode, dcopy.op1),
20458 gen_lowpart (V2TImode, dcopy.op0),
20459 shift));
20460 }
20461
20462 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
20463
20464 /* Test for the degenerate case where the alignment by itself
20465 produces the desired permutation. */
20466 if (in_order)
20467 {
20468 emit_move_insn (d->target, dcopy.op0);
20469 return true;
20470 }
20471
20472 ok = expand_vec_perm_1 (&dcopy);
20473 gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
20474
20475 return ok;
20476 }
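/* Example for expand_vec_perm_palignr (illustrative): a two-operand
   V16QImode permutation selecting bytes {3, 4, ..., 18} has min = 3 and
   max = 18, so max - min < nelt. palignr shifts the op1:op0
   concatenation right by 3 bytes, which already yields the bytes in
   order (in_order), so no second permutation is needed.  */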
20477
20478 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
20479 the permutation using the SSE4_1 pblendv instruction. Potentially
20480 reduces the permutation from 2 pshufb insns and an or to 1 pshufb and a pblendv. */
20481
20482 static bool
20483 expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
20484 {
20485 unsigned i, which, nelt = d->nelt;
20486 struct expand_vec_perm_d dcopy, dcopy1;
20487 machine_mode vmode = d->vmode;
20488 bool ok;
20489
20490 /* Use the same checks as in expand_vec_perm_blend. */
20491 if (d->one_operand_p)
20492 return false;
20493 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
20494 ;
20495 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
20496 ;
20497 else if (TARGET_SSE4_1
20498 && (GET_MODE_SIZE (vmode) == 16
20499 || (TARGET_MMX_WITH_SSE && GET_MODE_SIZE (vmode) == 8)
20500 || GET_MODE_SIZE (vmode) == 4))
20501 ;
20502 else
20503 return false;
20504
20505 /* Figure out which permutation elements do not stay in their
20506 respective lanes. */
20507 for (i = 0, which = 0; i < nelt; ++i)
20508 {
20509 unsigned e = d->perm[i];
20510 if (e != i)
20511 which |= (e < nelt ? 1 : 2);
20512 }
20513 /* We can pblend the part where elements do not stay in their
20514 respective lanes only when these elements all come from one
20515 half of the permutation.
20516 {0 1 8 3 4 5 9 7} is ok: 8 and 9 are not in their respective
20517 lanes, but both are >= 8.
20518 {0 1 8 3 4 5 2 7} is not ok: 2 and 8 are not in their
20519 respective lanes, and 8 >= 8 but 2 is not. */
20520 if (which != 1 && which != 2)
20521 return false;
20522 if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
20523 return true;
20524
20525 /* First we apply a one-operand permutation to the part where
20526 elements do not stay in their respective lanes. */
20527 dcopy = *d;
20528 if (which == 2)
20529 dcopy.op0 = dcopy.op1 = d->op1;
20530 else
20531 dcopy.op0 = dcopy.op1 = d->op0;
20532 if (!d->testing_p)
20533 dcopy.target = gen_reg_rtx (vmode);
20534 dcopy.one_operand_p = true;
20535
20536 for (i = 0; i < nelt; ++i)
20537 dcopy.perm[i] = d->perm[i] & (nelt - 1);
20538
20539 ok = expand_vec_perm_1 (&dcopy);
20540 if (GET_MODE_SIZE (vmode) != 16 && !ok)
20541 return false;
20542 else
20543 gcc_assert (ok);
20544 if (d->testing_p)
20545 return true;
20546
20547 /* Next we put permuted elements into their positions. */
20548 dcopy1 = *d;
20549 if (which == 2)
20550 dcopy1.op1 = dcopy.target;
20551 else
20552 dcopy1.op0 = dcopy.target;
20553
20554 for (i = 0; i < nelt; ++i)
20555 dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
20556
20557 ok = expand_vec_perm_blend (&dcopy1);
20558 gcc_assert (ok);
20559
20560 return true;
20561 }
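/* Worked example for expand_vec_perm_pblendv (illustrative), using the
   permutation from the comment above: for V8HImode perm
   {0, 1, 8, 3, 4, 5, 9, 7}, the out-of-place elements (8 and 9) all
   come from op1, so which == 2. Step one permutes op1 alone with
   {0, 1, 0, 3, 4, 5, 1, 7}, placing op1[0] and op1[1] in lanes 2 and 6;
   step two blends with {0, 1, 10, 3, 4, 5, 14, 7}, taking lanes 2 and 6
   from that result and the rest from op0.  */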
20562
20563 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
20564
20565 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
20566 a two vector permutation into a single vector permutation by using
20567 an interleave operation to merge the vectors. */
20568
20569 static bool
20570 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
20571 {
20572 struct expand_vec_perm_d dremap, dfinal;
20573 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
20574 unsigned HOST_WIDE_INT contents;
20575 unsigned char remap[2 * MAX_VECT_LEN];
20576 rtx_insn *seq;
20577 bool ok, same_halves = false;
20578
20579 if (GET_MODE_SIZE (d->vmode) == 4
20580 || GET_MODE_SIZE (d->vmode) == 8
20581 || GET_MODE_SIZE (d->vmode) == 16)
20582 {
20583 if (d->one_operand_p)
20584 return false;
20585 }
20586 else if (GET_MODE_SIZE (d->vmode) == 32)
20587 {
20588 if (!TARGET_AVX)
20589 return false;
20590 /* For 32-byte modes allow even d->one_operand_p.
20591 The lack of cross-lane shuffling in some instructions
20592 might prevent a single insn shuffle. */
20593 dfinal = *d;
20594 dfinal.testing_p = true;
20595 /* If expand_vec_perm_interleave3 can expand this into
20596 a 3 insn sequence, give up and let it be expanded as a
20597 3 insn sequence. While that is one insn longer,
20598 it doesn't need a memory operand, and in the common
20599 case where the interleave low and high permutations
20600 of the same operands are adjacent, only 4 insns are
20601 needed for both after CSE. */
20602 if (expand_vec_perm_interleave3 (&dfinal))
20603 return false;
20604 }
20605 else
20606 return false;
20607
20608 /* Examine from whence the elements come. */
20609 contents = 0;
20610 for (i = 0; i < nelt; ++i)
20611 contents |= HOST_WIDE_INT_1U << d->perm[i];
20612
20613 memset (remap, 0xff, sizeof (remap));
20614 dremap = *d;
20615
20616 if (GET_MODE_SIZE (d->vmode) == 4
20617 || GET_MODE_SIZE (d->vmode) == 8)
20618 {
20619 unsigned HOST_WIDE_INT h1, h2, h3, h4;
20620
20621 /* Split the two input vectors into 4 halves. */
20622 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
20623 h2 = h1 << nelt2;
20624 h3 = h2 << nelt2;
20625 h4 = h3 << nelt2;
20626
20627 /* If all elements come from the low halves, use interleave low,
20628 and similarly for interleave high. */
20629 if ((contents & (h1 | h3)) == contents)
20630 {
20631 /* punpckl* */
20632 for (i = 0; i < nelt2; ++i)
20633 {
20634 remap[i] = i * 2;
20635 remap[i + nelt] = i * 2 + 1;
20636 dremap.perm[i * 2] = i;
20637 dremap.perm[i * 2 + 1] = i + nelt;
20638 }
20639 }
20640 else if ((contents & (h2 | h4)) == contents)
20641 {
20642 /* punpckh* */
20643 for (i = 0; i < nelt2; ++i)
20644 {
20645 remap[i + nelt2] = i * 2;
20646 remap[i + nelt + nelt2] = i * 2 + 1;
20647 dremap.perm[i * 2] = i + nelt2;
20648 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
20649 }
20650 }
20651 else
20652 return false;
20653 }
20654 else if (GET_MODE_SIZE (d->vmode) == 16)
20655 {
20656 unsigned HOST_WIDE_INT h1, h2, h3, h4;
20657
20658 /* Split the two input vectors into 4 halves. */
20659 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
20660 h2 = h1 << nelt2;
20661 h3 = h2 << nelt2;
20662 h4 = h3 << nelt2;
20663
20664 /* If all elements come from the low halves, use interleave low, and
20665 similarly for interleave high. If the elements are from mis-matched
20666 halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
20667 if ((contents & (h1 | h3)) == contents)
20668 {
20669 /* punpckl* */
20670 for (i = 0; i < nelt2; ++i)
20671 {
20672 remap[i] = i * 2;
20673 remap[i + nelt] = i * 2 + 1;
20674 dremap.perm[i * 2] = i;
20675 dremap.perm[i * 2 + 1] = i + nelt;
20676 }
20677 if (!TARGET_SSE2 && d->vmode == V4SImode)
20678 dremap.vmode = V4SFmode;
20679 }
20680 else if ((contents & (h2 | h4)) == contents)
20681 {
20682 /* punpckh* */
20683 for (i = 0; i < nelt2; ++i)
20684 {
20685 remap[i + nelt2] = i * 2;
20686 remap[i + nelt + nelt2] = i * 2 + 1;
20687 dremap.perm[i * 2] = i + nelt2;
20688 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
20689 }
20690 if (!TARGET_SSE2 && d->vmode == V4SImode)
20691 dremap.vmode = V4SFmode;
20692 }
20693 else if ((contents & (h1 | h4)) == contents)
20694 {
20695 /* shufps */
20696 for (i = 0; i < nelt2; ++i)
20697 {
20698 remap[i] = i;
20699 remap[i + nelt + nelt2] = i + nelt2;
20700 dremap.perm[i] = i;
20701 dremap.perm[i + nelt2] = i + nelt + nelt2;
20702 }
20703 if (nelt != 4)
20704 {
20705 /* shufpd */
20706 dremap.vmode = V2DImode;
20707 dremap.nelt = 2;
20708 dremap.perm[0] = 0;
20709 dremap.perm[1] = 3;
20710 }
20711 }
20712 else if ((contents & (h2 | h3)) == contents)
20713 {
20714 /* shufps */
20715 for (i = 0; i < nelt2; ++i)
20716 {
20717 remap[i + nelt2] = i;
20718 remap[i + nelt] = i + nelt2;
20719 dremap.perm[i] = i + nelt2;
20720 dremap.perm[i + nelt2] = i + nelt;
20721 }
20722 if (nelt != 4)
20723 {
20724 /* shufpd */
20725 dremap.vmode = V2DImode;
20726 dremap.nelt = 2;
20727 dremap.perm[0] = 1;
20728 dremap.perm[1] = 2;
20729 }
20730 }
20731 else
20732 return false;
20733 }
20734 else
20735 {
20736 unsigned int nelt4 = nelt / 4, nzcnt = 0;
20737 unsigned HOST_WIDE_INT q[8];
20738 unsigned int nonzero_halves[4];
20739
20740 /* Split the two input vectors into 8 quarters. */
20741 q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
20742 for (i = 1; i < 8; ++i)
20743 q[i] = q[0] << (nelt4 * i);
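      /* nonzero_halves records which of the four 128-bit input lanes
	 (two per operand) actually supply elements.  */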
20744 for (i = 0; i < 4; ++i)
20745 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
20746 {
20747 nonzero_halves[nzcnt] = i;
20748 ++nzcnt;
20749 }
20750
20751 if (nzcnt == 1)
20752 {
20753 gcc_assert (d->one_operand_p);
20754 nonzero_halves[1] = nonzero_halves[0];
20755 same_halves = true;
20756 }
20757 else if (d->one_operand_p)
20758 {
20759 gcc_assert (nonzero_halves[0] == 0);
20760 gcc_assert (nonzero_halves[1] == 1);
20761 }
20762
20763 if (nzcnt <= 2)
20764 {
20765 if (d->perm[0] / nelt2 == nonzero_halves[1])
20766 {
20767 /* Attempt to increase the likelihood that dfinal
20768 shuffle will be intra-lane. */
20769 std::swap (nonzero_halves[0], nonzero_halves[1]);
20770 }
20771
20772 /* vperm2f128 or vperm2i128. */
20773 for (i = 0; i < nelt2; ++i)
20774 {
20775 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
20776 remap[i + nonzero_halves[0] * nelt2] = i;
20777 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
20778 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
20779 }
20780
20781 if (d->vmode != V8SFmode
20782 && d->vmode != V4DFmode
20783 && d->vmode != V8SImode)
20784 {
20785 dremap.vmode = V8SImode;
20786 dremap.nelt = 8;
20787 for (i = 0; i < 4; ++i)
20788 {
20789 dremap.perm[i] = i + nonzero_halves[0] * 4;
20790 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
20791 }
20792 }
20793 }
20794 else if (d->one_operand_p)
20795 return false;
20796 else if (TARGET_AVX2
20797 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
20798 {
20799 /* vpunpckl* */
20800 for (i = 0; i < nelt4; ++i)
20801 {
20802 remap[i] = i * 2;
20803 remap[i + nelt] = i * 2 + 1;
20804 remap[i + nelt2] = i * 2 + nelt2;
20805 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
20806 dremap.perm[i * 2] = i;
20807 dremap.perm[i * 2 + 1] = i + nelt;
20808 dremap.perm[i * 2 + nelt2] = i + nelt2;
20809 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
20810 }
20811 }
20812 else if (TARGET_AVX2
20813 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
20814 {
20815 /* vpunpckh* */
20816 for (i = 0; i < nelt4; ++i)
20817 {
20818 remap[i + nelt4] = i * 2;
20819 remap[i + nelt + nelt4] = i * 2 + 1;
20820 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
20821 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
20822 dremap.perm[i * 2] = i + nelt4;
20823 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
20824 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
20825 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
20826 }
20827 }
20828 else
20829 return false;
20830 }
20831
20832 /* Use the remapping array set up above to move the elements from their
20833 swizzled locations into their final destinations. */
20834 dfinal = *d;
20835 for (i = 0; i < nelt; ++i)
20836 {
20837 unsigned e = remap[d->perm[i]];
20838 gcc_assert (e < nelt);
20839 /* If same_halves is true, both halves of the remapped vector are the
20840 same. Avoid cross-lane accesses if possible. */
20841 if (same_halves && i >= nelt2)
20842 {
20843 gcc_assert (e < nelt2);
20844 dfinal.perm[i] = e + nelt2;
20845 }
20846 else
20847 dfinal.perm[i] = e;
20848 }
20849 if (!d->testing_p)
20850 {
20851 dremap.target = gen_reg_rtx (dremap.vmode);
20852 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
20853 }
20854 dfinal.op1 = dfinal.op0;
20855 dfinal.one_operand_p = true;
20856
20857 /* Test if the final remap can be done with a single insn. For V4SFmode or
20858 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
20859 start_sequence ();
20860 ok = expand_vec_perm_1 (&dfinal);
20861 seq = get_insns ();
20862 end_sequence ();
20863
20864 if (!ok)
20865 return false;
20866
20867 if (d->testing_p)
20868 return true;
20869
20870 if (dremap.vmode != dfinal.vmode)
20871 {
20872 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
20873 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
20874 }
20875
20876 ok = expand_vec_perm_1 (&dremap);
20877 gcc_assert (ok);
20878
20879 emit_insn (seq);
20880 return true;
20881 }
20882
20883 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
20884 a single vector cross-lane permutation into vpermq followed
20885 by any of the single insn permutations. */
20886
20887 static bool
20888 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
20889 {
20890 struct expand_vec_perm_d dremap, dfinal;
20891 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
20892 unsigned contents[2];
20893 bool ok;
20894
20895 if (!(TARGET_AVX2
20896 && (d->vmode == V32QImode || d->vmode == V16HImode)
20897 && d->one_operand_p))
20898 return false;
20899
20900 contents[0] = 0;
20901 contents[1] = 0;
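  /* contents[0]/contents[1] record which 64-bit quarters of the input
     feed the low/high half of the result; a single vpermq can gather at
     most two quarters into each 128-bit half.  */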
20902 for (i = 0; i < nelt2; ++i)
20903 {
20904 contents[0] |= 1u << (d->perm[i] / nelt4);
20905 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
20906 }
20907
20908 for (i = 0; i < 2; ++i)
20909 {
20910 unsigned int cnt = 0;
20911 for (j = 0; j < 4; ++j)
20912 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
20913 return false;
20914 }
20915
20916 if (d->testing_p)
20917 return true;
20918
20919 dremap = *d;
20920 dremap.vmode = V4DImode;
20921 dremap.nelt = 4;
20922 dremap.target = gen_reg_rtx (V4DImode);
20923 dremap.op0 = gen_lowpart (V4DImode, d->op0);
20924 dremap.op1 = dremap.op0;
20925 dremap.one_operand_p = true;
20926 for (i = 0; i < 2; ++i)
20927 {
20928 unsigned int cnt = 0;
20929 for (j = 0; j < 4; ++j)
20930 if ((contents[i] & (1u << j)) != 0)
20931 dremap.perm[2 * i + cnt++] = j;
20932 for (; cnt < 2; ++cnt)
20933 dremap.perm[2 * i + cnt] = 0;
20934 }
20935
20936 dfinal = *d;
20937 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
20938 dfinal.op1 = dfinal.op0;
20939 dfinal.one_operand_p = true;
20940 for (i = 0, j = 0; i < nelt; ++i)
20941 {
20942 if (i == nelt2)
20943 j = 2;
20944 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
20945 if ((d->perm[i] / nelt4) == dremap.perm[j])
20946 ;
20947 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
20948 dfinal.perm[i] |= nelt4;
20949 else
20950 gcc_unreachable ();
20951 }
20952
20953 ok = expand_vec_perm_1 (&dremap);
20954 gcc_assert (ok);
20955
20956 ok = expand_vec_perm_1 (&dfinal);
20957 gcc_assert (ok);
20958
20959 return true;
20960 }
20961
20962 static bool canonicalize_perm (struct expand_vec_perm_d *d);
20963
20964 /* A subroutine of ix86_expand_vec_perm_const_1. Try to expand
20965 a vector permutation using two instructions, vperm2f128 resp.
20966 vperm2i128 followed by any single in-lane permutation. */
20967
20968 static bool
20969 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
20970 {
20971 struct expand_vec_perm_d dfirst, dsecond;
20972 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
20973 bool ok;
20974
20975 if (!TARGET_AVX
20976 || GET_MODE_SIZE (d->vmode) != 32
20977 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
20978 return false;
20979
20980 dsecond = *d;
20981 dsecond.one_operand_p = false;
20982 dsecond.testing_p = true;
20983
20984 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
20985 immediate. For perm < 16 the second permutation uses
20986 d->op0 as first operand, for perm >= 16 it uses d->op1
20987 as first operand. The second operand is the result of
20988 vperm2[fi]128. */
20989 for (perm = 0; perm < 32; perm++)
20990 {
20991 /* Ignore permutations which do not move anything cross-lane. */
20992 if (perm < 16)
20993 {
20994 /* The second shuffle for e.g. V4DFmode has
20995 0123 and ABCD operands.
20996 Ignore AB23, as 23 is already in the second lane
20997 of the first operand. */
20998 if ((perm & 0xc) == (1 << 2)) continue;
20999 /* And 01CD, as 01 is in the first lane of the first
21000 operand. */
21001 if ((perm & 3) == 0) continue;
21002 /* And 4567, as then the vperm2[fi]128 doesn't change
21003 anything on the original 4567 second operand. */
21004 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
21005 }
21006 else
21007 {
21008 /* The second shuffle for e.g. V4DFmode has
21009 4567 and ABCD operands.
21010 Ignore AB67, as 67 is already in the second lane
21011 of the first operand. */
21012 if ((perm & 0xc) == (3 << 2)) continue;
21013 /* And 45CD, as 45 is in the first lane of the first
21014 operand. */
21015 if ((perm & 3) == 2) continue;
21016 /* And 0123, as then the vperm2[fi]128 doesn't change
21017 anything on the original 0123 first operand. */
21018 if ((perm & 0xf) == (1 << 2)) continue;
21019 }
21020
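      /* Check whether, with this lane selection, every result element is
	 available either from the matching lane of the vperm2[fi]128 result
	 or from the corresponding lane of the unchanged operand.  */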
21021 for (i = 0; i < nelt; i++)
21022 {
21023 j = d->perm[i] / nelt2;
21024 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
21025 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
21026 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
21027 dsecond.perm[i] = d->perm[i] & (nelt - 1);
21028 else
21029 break;
21030 }
21031
21032 if (i == nelt)
21033 {
21034 start_sequence ();
21035 ok = expand_vec_perm_1 (&dsecond);
21036 end_sequence ();
21037 }
21038 else
21039 ok = false;
21040
21041 if (ok)
21042 {
21043 if (d->testing_p)
21044 return true;
21045
21046 /* Found a usable second shuffle. dfirst will be
21047 vperm2f128 on d->op0 and d->op1. */
21048 dsecond.testing_p = false;
21049 dfirst = *d;
21050 dfirst.target = gen_reg_rtx (d->vmode);
21051 for (i = 0; i < nelt; i++)
21052 dfirst.perm[i] = (i & (nelt2 - 1))
21053 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
21054
21055 canonicalize_perm (&dfirst);
21056 ok = expand_vec_perm_1 (&dfirst);
21057 gcc_assert (ok);
21058
21059 /* And dsecond is some single insn shuffle, taking
21060 d->op0 and result of vperm2f128 (if perm < 16) or
21061 d->op1 and result of vperm2f128 (otherwise). */
21062 if (perm >= 16)
21063 dsecond.op0 = dsecond.op1;
21064 dsecond.op1 = dfirst.target;
21065
21066 ok = expand_vec_perm_1 (&dsecond);
21067 gcc_assert (ok);
21068
21069 return true;
21070 }
21071
21072 /* For one operand, the only useful vperm2f128 permutation is 0x01,
21073 i.e. swapping the two 128-bit lanes. */
21074 if (d->one_operand_p)
21075 return false;
21076 }
21077
21078 return false;
21079 }
21080
21081 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
21082 a two vector permutation using 2 intra-lane interleave insns
21083 and cross-lane shuffle for 32-byte vectors. */
21084
21085 static bool
21086 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
21087 {
21088 unsigned i, nelt;
21089 rtx (*gen) (rtx, rtx, rtx);
21090
21091 if (d->one_operand_p)
21092 return false;
21093 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
21094 ;
21095 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
21096 ;
21097 else
21098 return false;
21099
21100 nelt = d->nelt;
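  /* Require the interleave pattern { p, p + nelt, p + 1, p + 1 + nelt, ... }
     with p == 0 (interleave low) or p == nelt / 2 (interleave high).  */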
21101 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
21102 return false;
21103 for (i = 0; i < nelt; i += 2)
21104 if (d->perm[i] != d->perm[0] + i / 2
21105 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
21106 return false;
21107
21108 if (d->testing_p)
21109 return true;
21110
21111 switch (d->vmode)
21112 {
21113 case E_V32QImode:
21114 if (d->perm[0])
21115 gen = gen_vec_interleave_highv32qi;
21116 else
21117 gen = gen_vec_interleave_lowv32qi;
21118 break;
21119 case E_V16HImode:
21120 if (d->perm[0])
21121 gen = gen_vec_interleave_highv16hi;
21122 else
21123 gen = gen_vec_interleave_lowv16hi;
21124 break;
21125 case E_V8SImode:
21126 if (d->perm[0])
21127 gen = gen_vec_interleave_highv8si;
21128 else
21129 gen = gen_vec_interleave_lowv8si;
21130 break;
21131 case E_V4DImode:
21132 if (d->perm[0])
21133 gen = gen_vec_interleave_highv4di;
21134 else
21135 gen = gen_vec_interleave_lowv4di;
21136 break;
21137 case E_V8SFmode:
21138 if (d->perm[0])
21139 gen = gen_vec_interleave_highv8sf;
21140 else
21141 gen = gen_vec_interleave_lowv8sf;
21142 break;
21143 case E_V4DFmode:
21144 if (d->perm[0])
21145 gen = gen_vec_interleave_highv4df;
21146 else
21147 gen = gen_vec_interleave_lowv4df;
21148 break;
21149 default:
21150 gcc_unreachable ();
21151 }
21152
21153 emit_insn (gen (d->target, d->op0, d->op1));
21154 return true;
21155 }
21156
21157 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
21158 a single vector permutation using a single intra-lane vector
21159 permutation, vperm2f128 swapping the lanes and vblend* insn blending
21160 the non-swapped and swapped vectors together. */
21161
21162 static bool
21163 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
21164 {
21165 struct expand_vec_perm_d dfirst, dsecond;
21166 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
21167 rtx_insn *seq;
21168 bool ok;
21169 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
21170
21171 if (!TARGET_AVX
21172 || TARGET_AVX2
21173 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
21174 || !d->one_operand_p)
21175 return false;
21176
21177 dfirst = *d;
21178 for (i = 0; i < nelt; i++)
21179 dfirst.perm[i] = 0xff;
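  /* Build an intra-lane permutation that places each requested element at
     i's in-lane position within the lane the element already lives in; MSK
     marks the result positions whose element lands in the other lane and
     must therefore be taken from the lane-swapped copy by the final blend.  */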
21180 for (i = 0, msk = 0; i < nelt; i++)
21181 {
21182 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
21183 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
21184 return false;
21185 dfirst.perm[j] = d->perm[i];
21186 if (j != i)
21187 msk |= (1 << i);
21188 }
21189 for (i = 0; i < nelt; i++)
21190 if (dfirst.perm[i] == 0xff)
21191 dfirst.perm[i] = i;
21192
21193 if (!d->testing_p)
21194 dfirst.target = gen_reg_rtx (dfirst.vmode);
21195
21196 start_sequence ();
21197 ok = expand_vec_perm_1 (&dfirst);
21198 seq = get_insns ();
21199 end_sequence ();
21200
21201 if (!ok)
21202 return false;
21203
21204 if (d->testing_p)
21205 return true;
21206
21207 emit_insn (seq);
21208
21209 dsecond = *d;
21210 dsecond.op0 = dfirst.target;
21211 dsecond.op1 = dfirst.target;
21212 dsecond.one_operand_p = true;
21213 dsecond.target = gen_reg_rtx (dsecond.vmode);
21214 for (i = 0; i < nelt; i++)
21215 dsecond.perm[i] = i ^ nelt2;
21216
21217 ok = expand_vec_perm_1 (&dsecond);
21218 gcc_assert (ok);
21219
21220 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
21221 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
21222 return true;
21223 }
21224
21225 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
21226 a two vector permutation using two single vector permutations and
21227 {,v}{,p}unpckl{ps,pd,bw,wd,dq}. If two_insn, succeed only if one
21228 of dfirst or dsecond is an identity permutation. */
21229
21230 static bool
21231 expand_vec_perm_2perm_interleave (struct expand_vec_perm_d *d, bool two_insn)
21232 {
21233 unsigned i, nelt = d->nelt, nelt2 = nelt / 2, lane = nelt;
21234 struct expand_vec_perm_d dfirst, dsecond, dfinal;
21235 bool ident1 = true, ident2 = true;
21236
21237 if (d->one_operand_p)
21238 return false;
21239
21240 if (GET_MODE_SIZE (d->vmode) == 16)
21241 {
21242 if (!TARGET_SSE)
21243 return false;
21244 if (d->vmode != V4SFmode && d->vmode != V2DFmode && !TARGET_SSE2)
21245 return false;
21246 }
21247 else if (GET_MODE_SIZE (d->vmode) == 32)
21248 {
21249 if (!TARGET_AVX)
21250 return false;
21251 if (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2)
21252 return false;
21253 lane = nelt2;
21254 }
21255 else
21256 return false;
21257
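  /* Successive result elements must alternate between the two input
     operands (which operand they come from, not which element).  */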
21258 for (i = 1; i < nelt; i++)
21259 if ((d->perm[i] >= nelt) != ((d->perm[0] >= nelt) ^ (i & 1)))
21260 return false;
21261
21262 dfirst = *d;
21263 dsecond = *d;
21264 dfinal = *d;
21265 dfirst.op1 = dfirst.op0;
21266 dfirst.one_operand_p = true;
21267 dsecond.op0 = dsecond.op1;
21268 dsecond.one_operand_p = true;
21269
21270 for (i = 0; i < nelt; i++)
21271 if (d->perm[i] >= nelt)
21272 {
21273 dsecond.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i] - nelt;
21274 if (d->perm[i] - nelt != i / 2 + (i >= lane ? lane / 2 : 0))
21275 ident2 = false;
21276 dsecond.perm[i / 2 + (i >= lane ? lane : lane / 2)]
21277 = d->perm[i] - nelt;
21278 }
21279 else
21280 {
21281 dfirst.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i];
21282 if (d->perm[i] != i / 2 + (i >= lane ? lane / 2 : 0))
21283 ident1 = false;
21284 dfirst.perm[i / 2 + (i >= lane ? lane : lane / 2)] = d->perm[i];
21285 }
21286
21287 if (two_insn && !ident1 && !ident2)
21288 return false;
21289
21290 if (!d->testing_p)
21291 {
21292 if (!ident1)
21293 dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
21294 if (!ident2)
21295 dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
21296 if (d->perm[0] >= nelt)
21297 std::swap (dfinal.op0, dfinal.op1);
21298 }
21299
21300 bool ok;
21301 rtx_insn *seq1 = NULL, *seq2 = NULL;
21302
21303 if (!ident1)
21304 {
21305 start_sequence ();
21306 ok = expand_vec_perm_1 (&dfirst);
21307 seq1 = get_insns ();
21308 end_sequence ();
21309
21310 if (!ok)
21311 return false;
21312 }
21313
21314 if (!ident2)
21315 {
21316 start_sequence ();
21317 ok = expand_vec_perm_1 (&dsecond);
21318 seq2 = get_insns ();
21319 end_sequence ();
21320
21321 if (!ok)
21322 return false;
21323 }
21324
21325 if (d->testing_p)
21326 return true;
21327
21328 for (i = 0; i < nelt; i++)
21329 {
21330 dfinal.perm[i] = i / 2;
21331 if (i >= lane)
21332 dfinal.perm[i] += lane / 2;
21333 if ((i & 1) != 0)
21334 dfinal.perm[i] += nelt;
21335 }
21336 emit_insn (seq1);
21337 emit_insn (seq2);
21338 ok = expand_vselect_vconcat (dfinal.target, dfinal.op0, dfinal.op1,
21339 dfinal.perm, dfinal.nelt, false);
21340 gcc_assert (ok);
21341 return true;
21342 }
21343
21344 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
21345 the permutation using two single vector permutations and the SSE4_1 pblendv
21346 instruction. If two_insn, succeed only if one of dfirst or dsecond is
21347 an identity permutation. */
21348
21349 static bool
21350 expand_vec_perm_2perm_pblendv (struct expand_vec_perm_d *d, bool two_insn)
21351 {
21352 unsigned i, nelt = d->nelt;
21353 struct expand_vec_perm_d dfirst, dsecond, dfinal;
21354 machine_mode vmode = d->vmode;
21355 bool ident1 = true, ident2 = true;
21356
21357 /* Use the same checks as in expand_vec_perm_blend. */
21358 if (d->one_operand_p)
21359 return false;
21360 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
21361 ;
21362 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
21363 ;
21364 else if (TARGET_SSE4_1
21365 && (GET_MODE_SIZE (vmode) == 16
21366 || (TARGET_MMX_WITH_SSE && GET_MODE_SIZE (vmode) == 8)
21367 || GET_MODE_SIZE (vmode) == 4))
21368 ;
21369 else
21370 return false;
21371
21372 dfirst = *d;
21373 dsecond = *d;
21374 dfinal = *d;
21375 dfirst.op1 = dfirst.op0;
21376 dfirst.one_operand_p = true;
21377 dsecond.op0 = dsecond.op1;
21378 dsecond.one_operand_p = true;
21379
21380 for (i = 0; i < nelt; ++i)
21381 if (d->perm[i] >= nelt)
21382 {
21383 dfirst.perm[i] = 0xff;
21384 dsecond.perm[i] = d->perm[i] - nelt;
21385 if (d->perm[i] != i + nelt)
21386 ident2 = false;
21387 }
21388 else
21389 {
21390 dsecond.perm[i] = 0xff;
21391 dfirst.perm[i] = d->perm[i];
21392 if (d->perm[i] != i)
21393 ident1 = false;
21394 }
21395
21396 if (two_insn && !ident1 && !ident2)
21397 return false;
21398
21399 /* For now. Ideally treat 0xff as a wildcard. */
21400 for (i = 0; i < nelt; ++i)
21401 if (dfirst.perm[i] == 0xff)
21402 {
21403 if (GET_MODE_SIZE (vmode) == 32
21404 && dfirst.perm[i ^ (nelt / 2)] != 0xff)
21405 dfirst.perm[i] = dfirst.perm[i ^ (nelt / 2)] ^ (nelt / 2);
21406 else
21407 dfirst.perm[i] = i;
21408 }
21409 else
21410 {
21411 if (GET_MODE_SIZE (vmode) == 32
21412 && dsecond.perm[i ^ (nelt / 2)] != 0xff)
21413 dsecond.perm[i] = dsecond.perm[i ^ (nelt / 2)] ^ (nelt / 2);
21414 else
21415 dsecond.perm[i] = i;
21416 }
21417
21418 if (!d->testing_p)
21419 {
21420 if (!ident1)
21421 dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
21422 if (!ident2)
21423 dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
21424 }
21425
21426 bool ok;
21427 rtx_insn *seq1 = NULL, *seq2 = NULL;
21428
21429 if (!ident1)
21430 {
21431 start_sequence ();
21432 ok = expand_vec_perm_1 (&dfirst);
21433 seq1 = get_insns ();
21434 end_sequence ();
21435
21436 if (!ok)
21437 return false;
21438 }
21439
21440 if (!ident2)
21441 {
21442 start_sequence ();
21443 ok = expand_vec_perm_1 (&dsecond);
21444 seq2 = get_insns ();
21445 end_sequence ();
21446
21447 if (!ok)
21448 return false;
21449 }
21450
21451 if (d->testing_p)
21452 return true;
21453
21454 for (i = 0; i < nelt; ++i)
21455 dfinal.perm[i] = (d->perm[i] >= nelt ? i + nelt : i);
21456
21457 emit_insn (seq1);
21458 emit_insn (seq2);
21459 ok = expand_vec_perm_blend (&dfinal);
21460 gcc_assert (ok);
21461 return true;
21462 }
21463
21464 /* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF
21465 permutation using two vperm2f128, followed by a vshufpd insn blending
21466 the two vectors together. */
21467
21468 static bool
21469 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
21470 {
21471 struct expand_vec_perm_d dfirst, dsecond, dthird;
21472 bool ok;
21473
21474 if (!TARGET_AVX || (d->vmode != V4DFmode))
21475 return false;
21476
21477 if (d->testing_p)
21478 return true;
21479
21480 dfirst = *d;
21481 dsecond = *d;
21482 dthird = *d;
21483
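  /* dfirst gathers the 128-bit lanes containing result elements 0 and 2,
     dsecond those containing elements 1 and 3, and dthird is the final
     vshufpd selecting the required double from each lane pair.  */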
21484 dfirst.perm[0] = (d->perm[0] & ~1);
21485 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
21486 dfirst.perm[2] = (d->perm[2] & ~1);
21487 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
21488 dsecond.perm[0] = (d->perm[1] & ~1);
21489 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
21490 dsecond.perm[2] = (d->perm[3] & ~1);
21491 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
21492 dthird.perm[0] = (d->perm[0] % 2);
21493 dthird.perm[1] = (d->perm[1] % 2) + 4;
21494 dthird.perm[2] = (d->perm[2] % 2) + 2;
21495 dthird.perm[3] = (d->perm[3] % 2) + 6;
21496
21497 dfirst.target = gen_reg_rtx (dfirst.vmode);
21498 dsecond.target = gen_reg_rtx (dsecond.vmode);
21499 dthird.op0 = dfirst.target;
21500 dthird.op1 = dsecond.target;
21501 dthird.one_operand_p = false;
21502
21503 canonicalize_perm (&dfirst);
21504 canonicalize_perm (&dsecond);
21505
21506 ok = expand_vec_perm_1 (&dfirst)
21507 && expand_vec_perm_1 (&dsecond)
21508 && expand_vec_perm_1 (&dthird);
21509
21510 gcc_assert (ok);
21511
21512 return true;
21513 }
21514
21515 static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *);
21516
21517 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
21518 a two vector permutation using two intra-lane vector
21519 permutations, vperm2f128 swapping the lanes and vblend* insn blending
21520 the non-swapped and swapped vectors together. */
21521
21522 static bool
21523 expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d *d)
21524 {
21525 struct expand_vec_perm_d dfirst, dsecond, dthird;
21526 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2, which1 = 0, which2 = 0;
21527 rtx_insn *seq1, *seq2;
21528 bool ok;
21529 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
21530
21531 if (!TARGET_AVX
21532 || TARGET_AVX2
21533 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
21534 || d->one_operand_p)
21535 return false;
21536
21537 dfirst = *d;
21538 dsecond = *d;
21539 for (i = 0; i < nelt; i++)
21540 {
21541 dfirst.perm[i] = 0xff;
21542 dsecond.perm[i] = 0xff;
21543 }
21544 for (i = 0, msk = 0; i < nelt; i++)
21545 {
21546 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
21547 if (j == i)
21548 {
21549 dfirst.perm[j] = d->perm[i];
21550 which1 |= (d->perm[i] < nelt ? 1 : 2);
21551 }
21552 else
21553 {
21554 dsecond.perm[j] = d->perm[i];
21555 which2 |= (d->perm[i] < nelt ? 1 : 2);
21556 msk |= (1U << i);
21557 }
21558 }
21559 if (msk == 0 || msk == (1U << nelt) - 1)
21560 return false;
21561
21562 if (!d->testing_p)
21563 {
21564 dfirst.target = gen_reg_rtx (dfirst.vmode);
21565 dsecond.target = gen_reg_rtx (dsecond.vmode);
21566 }
21567
21568 for (i = 0; i < nelt; i++)
21569 {
21570 if (dfirst.perm[i] == 0xff)
21571 dfirst.perm[i] = (which1 == 2 ? i + nelt : i);
21572 if (dsecond.perm[i] == 0xff)
21573 dsecond.perm[i] = (which2 == 2 ? i + nelt : i);
21574 }
21575 canonicalize_perm (&dfirst);
21576 start_sequence ();
21577 ok = ix86_expand_vec_perm_const_1 (&dfirst);
21578 seq1 = get_insns ();
21579 end_sequence ();
21580
21581 if (!ok)
21582 return false;
21583
21584 canonicalize_perm (&dsecond);
21585 start_sequence ();
21586 ok = ix86_expand_vec_perm_const_1 (&dsecond);
21587 seq2 = get_insns ();
21588 end_sequence ();
21589
21590 if (!ok)
21591 return false;
21592
21593 if (d->testing_p)
21594 return true;
21595
21596 emit_insn (seq1);
21597 emit_insn (seq2);
21598
21599 dthird = *d;
21600 dthird.op0 = dsecond.target;
21601 dthird.op1 = dsecond.target;
21602 dthird.one_operand_p = true;
21603 dthird.target = gen_reg_rtx (dthird.vmode);
21604 for (i = 0; i < nelt; i++)
21605 dthird.perm[i] = i ^ nelt2;
21606
21607 ok = expand_vec_perm_1 (&dthird);
21608 gcc_assert (ok);
21609
21610 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
21611 emit_insn (blend (d->target, dfirst.target, dthird.target, GEN_INT (msk)));
21612 return true;
21613 }
21614
21615 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
21616 permutation with two pshufb insns and an ior. We should have already
21617 failed all two instruction sequences. */
21618
21619 static bool
21620 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
21621 {
21622 rtx rperm[2][16], vperm, l, h, op, m128;
21623 unsigned int i, nelt, eltsz;
21624 machine_mode mode;
21625 rtx (*gen) (rtx, rtx, rtx);
21626
21627 if (!TARGET_SSSE3 || (GET_MODE_SIZE (d->vmode) != 16
21628 && GET_MODE_SIZE (d->vmode) != 8
21629 && GET_MODE_SIZE (d->vmode) != 4))
21630 return false;
21631 gcc_assert (!d->one_operand_p);
21632
21633 if (d->testing_p)
21634 return true;
21635
21636 switch (GET_MODE_SIZE (d->vmode))
21637 {
21638 case 4:
21639 mode = V4QImode;
21640 gen = gen_mmx_pshufbv4qi3;
21641 break;
21642 case 8:
21643 mode = V8QImode;
21644 gen = gen_mmx_pshufbv8qi3;
21645 break;
21646 case 16:
21647 mode = V16QImode;
21648 gen = gen_ssse3_pshufbv16qi3;
21649 break;
21650 default:
21651 gcc_unreachable ();
21652 }
21653
21654 nelt = d->nelt;
21655 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
21656
21657 /* Generate two permutation masks. If the required element is within
21658 the given vector it is shuffled into the proper lane. If the required
21659 element is in the other vector, force a zero into the lane by setting
21660 bit 7 in the permutation mask. */
21661 m128 = GEN_INT (-128);
21662 for (i = 0; i < nelt; ++i)
21663 {
21664 unsigned j, k, e = d->perm[i];
21665 unsigned which = (e >= nelt);
21666 if (e >= nelt)
21667 e -= nelt;
21668
21669 for (j = 0; j < eltsz; ++j)
21670 {
21671 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
21672 rperm[1-which][i*eltsz + j] = m128;
21673 }
21674
21675 for (k = i*eltsz + j; k < 16; ++k)
21676 rperm[0][k] = rperm[1][k] = m128;
21677 }
21678
21679 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
21680 vperm = force_reg (V16QImode, vperm);
21681
21682 l = gen_reg_rtx (mode);
21683 op = gen_lowpart (mode, d->op0);
21684 emit_insn (gen (l, op, vperm));
21685
21686 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
21687 vperm = force_reg (V16QImode, vperm);
21688
21689 h = gen_reg_rtx (mode);
21690 op = gen_lowpart (mode, d->op1);
21691 emit_insn (gen (h, op, vperm));
21692
21693 op = d->target;
21694 if (d->vmode != mode)
21695 op = gen_reg_rtx (mode);
21696 ix86_emit_vec_binop (IOR, mode, op, l, h);
21697 if (op != d->target)
21698 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
21699
21700 return true;
21701 }
21702
21703 /* Implement arbitrary permutation of one V32QImode or V16HImode
21704 operand with two vpshufb insns, vpermq and vpor. We should have
21705 already failed all two or three instruction sequences. */
21706
21707 static bool
21708 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
21709 {
21710 rtx rperm[2][32], vperm, l, h, hp, op, m128;
21711 unsigned int i, nelt, eltsz;
21712
21713 if (!TARGET_AVX2
21714 || !d->one_operand_p
21715 || (d->vmode != V32QImode && d->vmode != V16HImode))
21716 return false;
21717
21718 if (d->testing_p)
21719 return true;
21720
21721 nelt = d->nelt;
21722 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
21723
21724 /* Generate two permutation masks. If the required element is within
21725 the same lane, it is shuffled in. If the required element is from
21726 the other lane, force a zero by setting bit 7 in the permutation mask.
21727 The other mask has non-negative elements where the element is
21728 requested from the other lane, but is also moved to the other lane,
21729 so that the result of vpshufb can have its two V2TImode halves
21730 swapped. */
21731 m128 = GEN_INT (-128);
21732 for (i = 0; i < nelt; ++i)
21733 {
21734 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
21735 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
21736
21737 for (j = 0; j < eltsz; ++j)
21738 {
21739 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
21740 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
21741 }
21742 }
21743
21744 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
21745 vperm = force_reg (V32QImode, vperm);
21746
21747 h = gen_reg_rtx (V32QImode);
21748 op = gen_lowpart (V32QImode, d->op0);
21749 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
21750
21751 /* Swap the 128-bit lanes of h into hp. */
21752 hp = gen_reg_rtx (V4DImode);
21753 op = gen_lowpart (V4DImode, h);
21754 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
21755 const1_rtx));
21756
21757 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
21758 vperm = force_reg (V32QImode, vperm);
21759
21760 l = gen_reg_rtx (V32QImode);
21761 op = gen_lowpart (V32QImode, d->op0);
21762 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
21763
21764 op = d->target;
21765 if (d->vmode != V32QImode)
21766 op = gen_reg_rtx (V32QImode);
21767 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
21768 if (op != d->target)
21769 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
21770
21771 return true;
21772 }
21773
21774 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
21775 and extract-odd permutations of two V32QImode or V16HImode operands
21776 with two vpshufb insns, vpor and vpermq. We should have already
21777 failed all two or three instruction sequences. */
21778
21779 static bool
21780 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
21781 {
21782 rtx rperm[2][32], vperm, l, h, ior, op, m128;
21783 unsigned int i, nelt, eltsz;
21784
21785 if (!TARGET_AVX2
21786 || d->one_operand_p
21787 || (d->vmode != V32QImode && d->vmode != V16HImode))
21788 return false;
21789
21790 for (i = 0; i < d->nelt; ++i)
21791 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
21792 return false;
21793
21794 if (d->testing_p)
21795 return true;
21796
21797 nelt = d->nelt;
21798 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
21799
21800 /* Generate two permutation masks. In the first permutation mask
21801 the first quarter will contain indexes for the first half
21802 of the op0, the second quarter will contain bit 7 set, third quarter
21803 will contain indexes for the second half of the op0 and the
21804 last quarter bit 7 set. In the second permutation mask
21805 the first quarter will contain bit 7 set, the second quarter
21806 indexes for the first half of the op1, the third quarter bit 7 set
21807 and last quarter indexes for the second half of the op1.
21808 I.e. the first mask e.g. for V32QImode extract even will be:
21809 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
21810 (all values masked with 0xf except for -128) and second mask
21811 for extract even will be
21812 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
21813 m128 = GEN_INT (-128);
21814 for (i = 0; i < nelt; ++i)
21815 {
21816 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
21817 unsigned which = d->perm[i] >= nelt;
21818 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
21819
21820 for (j = 0; j < eltsz; ++j)
21821 {
21822 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
21823 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
21824 }
21825 }
21826
21827 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
21828 vperm = force_reg (V32QImode, vperm);
21829
21830 l = gen_reg_rtx (V32QImode);
21831 op = gen_lowpart (V32QImode, d->op0);
21832 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
21833
21834 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
21835 vperm = force_reg (V32QImode, vperm);
21836
21837 h = gen_reg_rtx (V32QImode);
21838 op = gen_lowpart (V32QImode, d->op1);
21839 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
21840
21841 ior = gen_reg_rtx (V32QImode);
21842 emit_insn (gen_iorv32qi3 (ior, l, h));
21843
21844 /* Permute the V4DImode quarters using the { 0, 2, 1, 3 } permutation. */
21845 op = gen_reg_rtx (V4DImode);
21846 ior = gen_lowpart (V4DImode, ior);
21847 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
21848 const1_rtx, GEN_INT (3)));
21849 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
21850
21851 return true;
21852 }
21853
21854 /* Implement permutation with pslldq + psrldq + por when pshufb is not
21855 available. */
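/* For instance, the V16QImode permutation { 3, 4, ..., 15, 16, 17, 18 } can
   be done as psrldq $3 on op0, pslldq $13 on op1 and a por of the results.  */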
21856 static bool
21857 expand_vec_perm_pslldq_psrldq_por (struct expand_vec_perm_d *d, bool pandn)
21858 {
21859 unsigned i, nelt = d->nelt;
21860 unsigned start1, end1 = -1;
21861 machine_mode vmode = d->vmode, imode;
21862 int start2 = -1;
21863 bool clear_op0, clear_op1;
21864 unsigned inner_size;
21865 rtx op0, op1, dop1;
21866 rtx (*gen_vec_shr) (rtx, rtx, rtx);
21867 rtx (*gen_vec_shl) (rtx, rtx, rtx);
21868
21869 /* pshufd can be used for V4SI/V2DI under TARGET_SSE2. */
21870 if (!TARGET_SSE2 || (vmode != E_V16QImode && vmode != E_V8HImode))
21871 return false;
21872
21873 start1 = d->perm[0];
21874 for (i = 1; i < nelt; i++)
21875 {
21876 if (d->perm[i] != d->perm[i-1] + 1
21877 || d->perm[i] == nelt)
21878 {
21879 if (start2 == -1)
21880 {
21881 start2 = d->perm[i];
21882 end1 = d->perm[i-1];
21883 }
21884 else
21885 return false;
21886 }
21887 }
21888
21889 clear_op0 = end1 != nelt - 1;
21890 clear_op1 = start2 % nelt != 0;
21891 /* pandn/pand is needed to clear upper/lower bits of op0/op1. */
21892 if (!pandn && (clear_op0 || clear_op1))
21893 return false;
21894
21895 if (d->testing_p)
21896 return true;
21897
21898 gen_vec_shr = vmode == E_V16QImode ? gen_vec_shr_v16qi : gen_vec_shr_v8hi;
21899 gen_vec_shl = vmode == E_V16QImode ? gen_vec_shl_v16qi : gen_vec_shl_v8hi;
21900 imode = GET_MODE_INNER (vmode);
21901 inner_size = GET_MODE_BITSIZE (imode);
21902 op0 = gen_reg_rtx (vmode);
21903 op1 = gen_reg_rtx (vmode);
21904
21905 if (start1)
21906 emit_insn (gen_vec_shr (op0, d->op0, GEN_INT (start1 * inner_size)));
21907 else
21908 emit_move_insn (op0, d->op0);
21909
21910 dop1 = d->op1;
21911 if (d->one_operand_p)
21912 dop1 = d->op0;
21913
21914 int shl_offset = end1 - start1 + 1 - start2 % nelt;
21915 if (shl_offset)
21916 emit_insn (gen_vec_shl (op1, dop1, GEN_INT (shl_offset * inner_size)));
21917 else
21918 emit_move_insn (op1, dop1);
21919
21920 /* Clear lower/upper bits for op0/op1. */
21921 if (clear_op0 || clear_op1)
21922 {
21923 rtx vec[16];
21924 rtx const_vec;
21925 rtx clear;
21926 for (i = 0; i != nelt; i++)
21927 {
21928 if (i < (end1 - start1 + 1))
21929 vec[i] = gen_int_mode ((HOST_WIDE_INT_1U << inner_size) - 1, imode);
21930 else
21931 vec[i] = CONST0_RTX (imode);
21932 }
21933 const_vec = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, vec));
21934 const_vec = validize_mem (force_const_mem (vmode, const_vec));
21935 clear = force_reg (vmode, const_vec);
21936
21937 if (clear_op0)
21938 emit_move_insn (op0, gen_rtx_AND (vmode, op0, clear));
21939 if (clear_op1)
21940 emit_move_insn (op1, gen_rtx_AND (vmode,
21941 gen_rtx_NOT (vmode, clear),
21942 op1));
21943 }
21944
21945 emit_move_insn (d->target, gen_rtx_IOR (vmode, op0, op1));
21946 return true;
21947 }
21948
21949 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
21950 and extract-odd permutations of two V8QI, V8HI, V16QI, V16HI or V32QI
21951 operands with two "and" and "pack" or two "shift" and "pack" insns.
21952 We should have already failed all two instruction sequences. */
21953
21954 static bool
21955 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
21956 {
21957 rtx op, dop0, dop1, t;
21958 unsigned i, odd, c, s, nelt = d->nelt;
21959 bool end_perm = false;
21960 machine_mode half_mode;
21961 rtx (*gen_and) (rtx, rtx, rtx);
21962 rtx (*gen_pack) (rtx, rtx, rtx);
21963 rtx (*gen_shift) (rtx, rtx, rtx);
21964
21965 if (d->one_operand_p)
21966 return false;
21967
21968 switch (d->vmode)
21969 {
21970 case E_V4HImode:
21971 /* Required for "pack". */
21972 if (!TARGET_SSE4_1)
21973 return false;
21974 c = 0xffff;
21975 s = 16;
21976 half_mode = V2SImode;
21977 gen_and = gen_andv2si3;
21978 gen_pack = gen_mmx_packusdw;
21979 gen_shift = gen_lshrv2si3;
21980 break;
21981 case E_V8HImode:
21982 /* Required for "pack". */
21983 if (!TARGET_SSE4_1)
21984 return false;
21985 c = 0xffff;
21986 s = 16;
21987 half_mode = V4SImode;
21988 gen_and = gen_andv4si3;
21989 gen_pack = gen_sse4_1_packusdw;
21990 gen_shift = gen_lshrv4si3;
21991 break;
21992 case E_V8QImode:
21993 /* No check as all instructions are SSE2. */
21994 c = 0xff;
21995 s = 8;
21996 half_mode = V4HImode;
21997 gen_and = gen_andv4hi3;
21998 gen_pack = gen_mmx_packuswb;
21999 gen_shift = gen_lshrv4hi3;
22000 break;
22001 case E_V16QImode:
22002 /* No check as all instructions are SSE2. */
22003 c = 0xff;
22004 s = 8;
22005 half_mode = V8HImode;
22006 gen_and = gen_andv8hi3;
22007 gen_pack = gen_sse2_packuswb;
22008 gen_shift = gen_lshrv8hi3;
22009 break;
22010 case E_V16HImode:
22011 if (!TARGET_AVX2)
22012 return false;
22013 c = 0xffff;
22014 s = 16;
22015 half_mode = V8SImode;
22016 gen_and = gen_andv8si3;
22017 gen_pack = gen_avx2_packusdw;
22018 gen_shift = gen_lshrv8si3;
22019 end_perm = true;
22020 break;
22021 case E_V32QImode:
22022 if (!TARGET_AVX2)
22023 return false;
22024 c = 0xff;
22025 s = 8;
22026 half_mode = V16HImode;
22027 gen_and = gen_andv16hi3;
22028 gen_pack = gen_avx2_packuswb;
22029 gen_shift = gen_lshrv16hi3;
22030 end_perm = true;
22031 break;
22032 default:
22033 /* Only for V4HI, V8QI, V8HI, V16QI, V16HI and V32QI modes is this
22034 approach more profitable than general shuffles. */
22035 return false;
22036 }
22037
22038 /* Check that permutation is even or odd. */
22039 odd = d->perm[0];
22040 if (odd > 1)
22041 return false;
22042
22043 for (i = 1; i < nelt; ++i)
22044 if (d->perm[i] != 2 * i + odd)
22045 return false;
22046
22047 if (d->testing_p)
22048 return true;
22049
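  /* For the even permutation, AND each wide element with C to keep only its
     low (even-indexed) narrow half; for the odd permutation, logical shift
     right by S to move the high (odd-indexed) half down.  The unsigned
     saturating pack then narrows both operands into the result.  */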
22050 dop0 = gen_reg_rtx (half_mode);
22051 dop1 = gen_reg_rtx (half_mode);
22052 if (odd == 0)
22053 {
22054 t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
22055 t = force_reg (half_mode, t);
22056 emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
22057 emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
22058 }
22059 else
22060 {
22061 emit_insn (gen_shift (dop0,
22062 gen_lowpart (half_mode, d->op0),
22063 GEN_INT (s)));
22064 emit_insn (gen_shift (dop1,
22065 gen_lowpart (half_mode, d->op1),
22066 GEN_INT (s)));
22067 }
22068 /* In the AVX2 256-bit case we need to permute the pack result. */
22069 if (TARGET_AVX2 && end_perm)
22070 {
22071 op = gen_reg_rtx (d->vmode);
22072 t = gen_reg_rtx (V4DImode);
22073 emit_insn (gen_pack (op, dop0, dop1));
22074 emit_insn (gen_avx2_permv4di_1 (t,
22075 gen_lowpart (V4DImode, op),
22076 const0_rtx,
22077 const2_rtx,
22078 const1_rtx,
22079 GEN_INT (3)));
22080 emit_move_insn (d->target, gen_lowpart (d->vmode, t));
22081 }
22082 else
22083 emit_insn (gen_pack (d->target, dop0, dop1));
22084
22085 return true;
22086 }
22087
22088 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
22089 and extract-odd permutations of two V64QImode operands
22090 with two "shift", two "trunc" and one "concat" insns for "odd"
22091 and two "trunc" and one "concat" insn for "even".
22092 We should have already failed all two instruction sequences. */
22093
22094 static bool
22095 expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
22096 {
22097 rtx t1, t2, t3, t4;
22098 unsigned i, odd, nelt = d->nelt;
22099
22100 if (!TARGET_AVX512BW
22101 || d->one_operand_p
22102 || d->vmode != V64QImode)
22103 return false;
22104
22105 /* Check that permutation is even or odd. */
22106 odd = d->perm[0];
22107 if (odd > 1)
22108 return false;
22109
22110 for (i = 1; i < nelt; ++i)
22111 if (d->perm[i] != 2 * i + odd)
22112 return false;
22113
22114 if (d->testing_p)
22115 return true;
22116
22117
22118 if (odd)
22119 {
22120 t1 = gen_reg_rtx (V32HImode);
22121 t2 = gen_reg_rtx (V32HImode);
22122 emit_insn (gen_lshrv32hi3 (t1,
22123 gen_lowpart (V32HImode, d->op0),
22124 GEN_INT (8)));
22125 emit_insn (gen_lshrv32hi3 (t2,
22126 gen_lowpart (V32HImode, d->op1),
22127 GEN_INT (8)));
22128 }
22129 else
22130 {
22131 t1 = gen_lowpart (V32HImode, d->op0);
22132 t2 = gen_lowpart (V32HImode, d->op1);
22133 }
22134
22135 t3 = gen_reg_rtx (V32QImode);
22136 t4 = gen_reg_rtx (V32QImode);
22137 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
22138 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
22139 emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
22140
22141 return true;
22142 }
22143
22144 /* A subroutine of ix86_expand_vec_perm_const_1. Implement extract-even
22145 and extract-odd permutations. */
22146
22147 static bool
22148 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
22149 {
22150 rtx t1, t2, t3, t4, t5;
22151
22152 switch (d->vmode)
22153 {
22154 case E_V4DFmode:
22155 if (d->testing_p)
22156 break;
22157 t1 = gen_reg_rtx (V4DFmode);
22158 t2 = gen_reg_rtx (V4DFmode);
22159
22160 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
22161 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
22162 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
22163
22164 /* Now an unpck[lh]pd will produce the result required. */
22165 if (odd)
22166 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
22167 else
22168 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
22169 emit_insn (t3);
22170 break;
22171
22172 case E_V8SFmode:
22173 {
22174 int mask = odd ? 0xdd : 0x88;
22175
22176 if (d->testing_p)
22177 break;
22178 t1 = gen_reg_rtx (V8SFmode);
22179 t2 = gen_reg_rtx (V8SFmode);
22180 t3 = gen_reg_rtx (V8SFmode);
22181
22182 /* Shuffle within the 128-bit lanes to produce:
22183 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
22184 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
22185 GEN_INT (mask)));
22186
22187 /* Shuffle the lanes around to produce:
22188 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
22189 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
22190 GEN_INT (0x3)));
22191
22192 /* Shuffle within the 128-bit lanes to produce:
22193 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
22194 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
22195
22196 /* Shuffle within the 128-bit lanes to produce:
22197 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
22198 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
22199
22200 /* Shuffle the lanes around to produce:
22201 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
22202 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
22203 GEN_INT (0x20)));
22204 }
22205 break;
22206
22207 case E_V2DFmode:
22208 case E_V4SFmode:
22209 case E_V2DImode:
22210 case E_V2SImode:
22211 case E_V4SImode:
22212 case E_V2HImode:
22213 /* These are always directly implementable by expand_vec_perm_1. */
22214 gcc_unreachable ();
22215
22216 case E_V2SFmode:
22217 gcc_assert (TARGET_MMX_WITH_SSE);
22218 /* We have no suitable instructions. */
22219 if (d->testing_p)
22220 return false;
22221 break;
22222
22223 case E_V4QImode:
22224 if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
22225 return expand_vec_perm_pshufb2 (d);
22226 else
22227 {
22228 if (d->testing_p)
22229 break;
22230 /* We need 2*log2(N)-1 operations to achieve odd/even
22231 with interleave. */
22232 t1 = gen_reg_rtx (V4QImode);
22233 emit_insn (gen_mmx_punpckhbw_low (t1, d->op0, d->op1));
22234 emit_insn (gen_mmx_punpcklbw_low (d->target, d->op0, d->op1));
22235 if (odd)
22236 t2 = gen_mmx_punpckhbw_low (d->target, d->target, t1);
22237 else
22238 t2 = gen_mmx_punpcklbw_low (d->target, d->target, t1);
22239 emit_insn (t2);
22240 }
22241 break;
22242
22243 case E_V4HImode:
22244 if (TARGET_SSE4_1)
22245 return expand_vec_perm_even_odd_pack (d);
22246 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
22247 return expand_vec_perm_pshufb2 (d);
22248 else
22249 {
22250 if (d->testing_p)
22251 break;
22252 /* We need 2*log2(N)-1 operations to achieve odd/even
22253 with interleave. */
22254 t1 = gen_reg_rtx (V4HImode);
22255 emit_insn (gen_mmx_punpckhwd (t1, d->op0, d->op1));
22256 emit_insn (gen_mmx_punpcklwd (d->target, d->op0, d->op1));
22257 if (odd)
22258 t2 = gen_mmx_punpckhwd (d->target, d->target, t1);
22259 else
22260 t2 = gen_mmx_punpcklwd (d->target, d->target, t1);
22261 emit_insn (t2);
22262 }
22263 break;
22264
22265 case E_V8HImode:
22266 if (TARGET_SSE4_1)
22267 return expand_vec_perm_even_odd_pack (d);
22268 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
22269 return expand_vec_perm_pshufb2 (d);
22270 else
22271 {
22272 if (d->testing_p)
22273 break;
22274 /* We need 2*log2(N)-1 operations to achieve odd/even
22275 with interleave. */
22276 t1 = gen_reg_rtx (V8HImode);
22277 t2 = gen_reg_rtx (V8HImode);
22278 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
22279 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
22280 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
22281 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
22282 if (odd)
22283 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
22284 else
22285 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
22286 emit_insn (t3);
22287 }
22288 break;
22289
22290 case E_V8QImode:
22291 case E_V16QImode:
22292 return expand_vec_perm_even_odd_pack (d);
22293
22294 case E_V16HImode:
22295 case E_V32QImode:
22296 return expand_vec_perm_even_odd_pack (d);
22297
22298 case E_V64QImode:
22299 return expand_vec_perm_even_odd_trunc (d);
22300
22301 case E_V4DImode:
22302 if (!TARGET_AVX2)
22303 {
22304 struct expand_vec_perm_d d_copy = *d;
22305 d_copy.vmode = V4DFmode;
22306 if (d->testing_p)
22307 d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
22308 else
22309 d_copy.target = gen_reg_rtx (V4DFmode);
22310 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
22311 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
22312 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
22313 {
22314 if (!d->testing_p)
22315 emit_move_insn (d->target,
22316 gen_lowpart (V4DImode, d_copy.target));
22317 return true;
22318 }
22319 return false;
22320 }
22321
22322 if (d->testing_p)
22323 break;
22324
22325 t1 = gen_reg_rtx (V4DImode);
22326 t2 = gen_reg_rtx (V4DImode);
22327
22328 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
22329 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
22330 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
22331
22332 /* Now a vpunpck[lh]qdq will produce the result required. */
22333 if (odd)
22334 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
22335 else
22336 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
22337 emit_insn (t3);
22338 break;
22339
22340 case E_V8SImode:
22341 if (!TARGET_AVX2)
22342 {
22343 struct expand_vec_perm_d d_copy = *d;
22344 d_copy.vmode = V8SFmode;
22345 if (d->testing_p)
22346 d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
22347 else
22348 d_copy.target = gen_reg_rtx (V8SFmode);
22349 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
22350 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
22351 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
22352 {
22353 if (!d->testing_p)
22354 emit_move_insn (d->target,
22355 gen_lowpart (V8SImode, d_copy.target));
22356 return true;
22357 }
22358 return false;
22359 }
22360
22361 if (d->testing_p)
22362 break;
22363
22364 t1 = gen_reg_rtx (V8SImode);
22365 t2 = gen_reg_rtx (V8SImode);
22366 t3 = gen_reg_rtx (V4DImode);
22367 t4 = gen_reg_rtx (V4DImode);
22368 t5 = gen_reg_rtx (V4DImode);
22369
22370 /* Shuffle the lanes around into
22371 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
22372 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
22373 gen_lowpart (V4DImode, d->op1),
22374 GEN_INT (0x20)));
22375 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
22376 gen_lowpart (V4DImode, d->op1),
22377 GEN_INT (0x31)));
22378
22379 /* Swap the 2nd and 3rd position in each lane into
22380 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
22381 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
22382 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
22383 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
22384 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
22385
22386 /* Now a vpunpck[lh]qdq will produce
22387 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
22388 if (odd)
22389 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
22390 gen_lowpart (V4DImode, t2));
22391 else
22392 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
22393 gen_lowpart (V4DImode, t2));
22394 emit_insn (t3);
22395 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
22396 break;
22397
22398 default:
22399 gcc_unreachable ();
22400 }
22401
22402 return true;
22403 }
22404
22405 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
22406 extract-even and extract-odd permutations. */
22407
22408 static bool
22409 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
22410 {
22411 unsigned i, odd, nelt = d->nelt;
22412
22413 odd = d->perm[0];
22414 if (odd != 0 && odd != 1)
22415 return false;
22416
22417 for (i = 1; i < nelt; ++i)
22418 if (d->perm[i] != 2 * i + odd)
22419 return false;
22420
22421 if (d->vmode == E_V32HImode
22422 && d->testing_p
22423 && !TARGET_AVX512BW)
22424 return false;
22425
22426 return expand_vec_perm_even_odd_1 (d, odd);
22427 }
22428
22429 /* A subroutine of ix86_expand_vec_perm_const_1. Implement broadcast
22430 permutations. We assume that expand_vec_perm_1 has already failed. */
22431
22432 static bool
22433 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
22434 {
22435 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
22436 machine_mode vmode = d->vmode;
22437 rtx (*gen) (rtx, rtx, rtx);
22438 unsigned char perm2[4];
22439 rtx op0 = d->op0, dest;
22440 bool ok;
22441
22442 switch (vmode)
22443 {
22444 case E_V4DFmode:
22445 case E_V8SFmode:
22446 /* These are special-cased in sse.md so that we can optionally
22447 use the vbroadcast instruction. They expand to two insns
22448 if the input happens to be in a register. */
22449 gcc_unreachable ();
22450
22451 case E_V2DFmode:
22452 case E_V2SFmode:
22453 case E_V4SFmode:
22454 case E_V2DImode:
22455 case E_V2SImode:
22456 case E_V4SImode:
22457 case E_V2HImode:
22458 case E_V4HImode:
22459 /* These are always implementable using standard shuffle patterns. */
22460 gcc_unreachable ();
22461
22462 case E_V4QImode:
22463 /* This can be implemented via interleave and pshuflw. */
22464 if (d->testing_p)
22465 return true;
22466
22467 if (elt >= nelt2)
22468 {
22469 gen = gen_mmx_punpckhbw_low;
22470 elt -= nelt2;
22471 }
22472 else
22473 gen = gen_mmx_punpcklbw_low;
22474
22475 dest = gen_reg_rtx (vmode);
22476 emit_insn (gen (dest, op0, op0));
22477 vmode = get_mode_wider_vector (vmode);
22478 op0 = gen_lowpart (vmode, dest);
22479
22480 memset (perm2, elt, 2);
22481 dest = gen_reg_rtx (vmode);
22482 ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
22483 gcc_assert (ok);
22484
22485 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
22486 return true;
22487
22488 case E_V8QImode:
22489 /* This can be implemented via interleave. We save one insn by
22490 stopping once we have promoted to V2SImode and then using pshufd. */
22491 if (d->testing_p)
22492 return true;
22493 do
22494 {
22495 if (elt >= nelt2)
22496 {
22497 gen = vmode == V8QImode ? gen_mmx_punpckhbw
22498 : gen_mmx_punpckhwd;
22499 elt -= nelt2;
22500 }
22501 else
22502 gen = vmode == V8QImode ? gen_mmx_punpcklbw
22503 : gen_mmx_punpcklwd;
22504 nelt2 /= 2;
22505
22506 dest = gen_reg_rtx (vmode);
22507 emit_insn (gen (dest, op0, op0));
22508 vmode = get_mode_wider_vector (vmode);
22509 op0 = gen_lowpart (vmode, dest);
22510 }
22511 while (vmode != V2SImode);
22512
22513 memset (perm2, elt, 2);
22514 dest = gen_reg_rtx (vmode);
22515 ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
22516 gcc_assert (ok);
22517
22518 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
22519 return true;
22520
22521 case E_V8HImode:
22522 case E_V16QImode:
22523 /* These can be implemented via interleave. We save one insn by
22524 stopping once we have promoted to V4SImode and then using pshufd. */
22525 if (d->testing_p)
22526 return true;
22527 do
22528 {
22529 if (elt >= nelt2)
22530 {
22531 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
22532 : gen_vec_interleave_highv8hi;
22533 elt -= nelt2;
22534 }
22535 else
22536 gen = vmode == V16QImode ? gen_vec_interleave_lowv16qi
22537 : gen_vec_interleave_lowv8hi;
22538 nelt2 /= 2;
22539
22540 dest = gen_reg_rtx (vmode);
22541 emit_insn (gen (dest, op0, op0));
22542 vmode = get_mode_wider_vector (vmode);
22543 op0 = gen_lowpart (vmode, dest);
22544 }
22545 while (vmode != V4SImode);
22546
22547 memset (perm2, elt, 4);
22548 dest = gen_reg_rtx (vmode);
22549 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
22550 gcc_assert (ok);
22551
22552 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
22553 return true;
22554
22555 case E_V8HFmode:
22556 case E_V8BFmode:
22557 /* This can be implemented via interleave and pshufd. */
22558 if (d->testing_p)
22559 return true;
22560
22561 rtx (*gen_interleave) (machine_mode, int, rtx, rtx, rtx);
22562 if (elt >= nelt2)
22563 {
22564 gen_interleave = gen_vec_interleave_high;
22565 elt -= nelt2;
22566 }
22567 else
22568 gen_interleave = gen_vec_interleave_low;
22569 nelt2 /= 2;
22570
22571 dest = gen_reg_rtx (vmode);
22572 emit_insn (gen_interleave (vmode, 1, dest, op0, op0));
22573
22574 vmode = V4SImode;
22575 op0 = gen_lowpart (vmode, dest);
22576
22577 memset (perm2, elt, 4);
22578 dest = gen_reg_rtx (vmode);
22579 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
22580 gcc_assert (ok);
22581
22582 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
22583 return true;
22584
22585 case E_V32QImode:
22586 case E_V16HImode:
22587 case E_V8SImode:
22588 case E_V4DImode:
22589 /* For AVX2 broadcasts of the first element vpbroadcast* or
22590 vpermq should be used by expand_vec_perm_1. */
22591 gcc_assert (!TARGET_AVX2 || d->perm[0]);
22592 return false;
22593
22594 case E_V64QImode:
22595 gcc_assert (!TARGET_AVX512BW || d->perm[0]);
22596 return false;
22597
22598 case E_V32HImode:
22599 gcc_assert (!TARGET_AVX512BW);
22600 return false;
22601
22602 default:
22603 gcc_unreachable ();
22604 }
22605 }
22606
22607 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
22608 broadcast permutations. */
22609
22610 static bool
22611 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
22612 {
22613 unsigned i, elt, nelt = d->nelt;
22614
22615 if (!d->one_operand_p)
22616 return false;
22617
22618 elt = d->perm[0];
22619 for (i = 1; i < nelt; ++i)
22620 if (d->perm[i] != elt)
22621 return false;
22622
22623 return expand_vec_perm_broadcast_1 (d);
22624 }
22625
22626 /* Implement arbitrary permutations of two V64QImode operands
22627 with 2 vperm[it]2w, 2 vpshufb and one vpor instruction. */
22628 static bool
22629 expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
22630 {
22631 if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
22632 return false;
22633
22634 if (d->testing_p)
22635 return true;
22636
22637 struct expand_vec_perm_d ds[2];
22638 rtx rperm[128], vperm, target0, target1;
22639 unsigned int i, nelt;
22640 machine_mode vmode;
22641
22642 nelt = d->nelt;
22643 vmode = V64QImode;
22644
22645 for (i = 0; i < 2; i++)
22646 {
22647 ds[i] = *d;
22648 ds[i].vmode = V32HImode;
22649 ds[i].nelt = 32;
22650 ds[i].target = gen_reg_rtx (V32HImode);
22651 ds[i].op0 = gen_lowpart (V32HImode, d->op0);
22652 ds[i].op1 = gen_lowpart (V32HImode, d->op1);
22653 }
22654
22655   /* Prepare permutations such that the first one takes care of
22656      putting the even bytes into the right positions or one
22657      position higher (ds[0]) and the second one takes care of
22658      putting the odd bytes into the right positions or one
22659      position below (ds[1]).  */
22660
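  /* rperm[0..63] is the vpshufb selector applied to ds[0].target: it keeps
     the bytes destined for even output positions and zeroes the others
     (a -1 selector byte makes vpshufb write zero).  rperm[64..127] does the
     same for ds[1].target and the odd output positions; the final vpor
     merges the two results.  */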
22661 for (i = 0; i < nelt; i++)
22662 {
22663 ds[i & 1].perm[i / 2] = d->perm[i] / 2;
22664 if (i & 1)
22665 {
22666 rperm[i] = constm1_rtx;
22667 rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
22668 }
22669 else
22670 {
22671 rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
22672 rperm[i + 64] = constm1_rtx;
22673 }
22674 }
22675
22676 bool ok = expand_vec_perm_1 (&ds[0]);
22677 gcc_assert (ok);
22678 ds[0].target = gen_lowpart (V64QImode, ds[0].target);
22679
22680 ok = expand_vec_perm_1 (&ds[1]);
22681 gcc_assert (ok);
22682 ds[1].target = gen_lowpart (V64QImode, ds[1].target);
22683
22684 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
22685 vperm = force_reg (vmode, vperm);
22686 target0 = gen_reg_rtx (V64QImode);
22687 emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
22688
22689 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
22690 vperm = force_reg (vmode, vperm);
22691 target1 = gen_reg_rtx (V64QImode);
22692 emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
22693
22694 emit_insn (gen_iorv64qi3 (d->target, target0, target1));
22695 return true;
22696 }
22697
22698 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
22699    with 4 vpshufb insns, 2 vpermq and 3 vpor.  We should have already failed
22700    all the shorter instruction sequences.  */
22701
22702 static bool
22703 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
22704 {
22705 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
22706 unsigned int i, nelt, eltsz;
22707 bool used[4];
22708
22709 if (!TARGET_AVX2
22710 || d->one_operand_p
22711 || (d->vmode != V32QImode && d->vmode != V16HImode))
22712 return false;
22713
22714 if (d->testing_p)
22715 return true;
22716
22717 nelt = d->nelt;
22718 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
22719
22720   /* Generate 4 permutation masks.  If the required element is within
22721      the same lane, it is shuffled in.  If the required element is from
22722      the other lane, force a zero by setting bit 7 in the permutation mask.
22723      The companion mask has non-negative elements when the element is
22724      requested from the other lane, and also moves it to the other lane,
22725      so that the result of vpshufb can have its two V2TImode halves
22726      swapped.  */
22727 m128 = GEN_INT (-128);
22728 for (i = 0; i < 32; ++i)
22729 {
22730 rperm[0][i] = m128;
22731 rperm[1][i] = m128;
22732 rperm[2][i] = m128;
22733 rperm[3][i] = m128;
22734 }
22735 used[0] = false;
22736 used[1] = false;
22737 used[2] = false;
22738 used[3] = false;
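  /* Classify each element: bit 1 of WHICH selects the source operand and
     bit 0 says whether the element has to cross a 128-bit lane, so masks
     0 and 2 are the in-lane selectors and masks 1 and 3 the cross-lane
     ones.  */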
22739 for (i = 0; i < nelt; ++i)
22740 {
22741 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
22742 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
22743 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
22744
22745 for (j = 0; j < eltsz; ++j)
22746 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
22747 used[which] = true;
22748 }
22749
22750 for (i = 0; i < 2; ++i)
22751 {
22752 if (!used[2 * i + 1])
22753 {
22754 h[i] = NULL_RTX;
22755 continue;
22756 }
22757 vperm = gen_rtx_CONST_VECTOR (V32QImode,
22758 gen_rtvec_v (32, rperm[2 * i + 1]));
22759 vperm = force_reg (V32QImode, vperm);
22760 h[i] = gen_reg_rtx (V32QImode);
22761 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
22762 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
22763 }
22764
22765   /* Swap the 128-bit lanes of h[X].  */
22766 for (i = 0; i < 2; ++i)
22767 {
22768 if (h[i] == NULL_RTX)
22769 continue;
22770 op = gen_reg_rtx (V4DImode);
22771 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
22772 const2_rtx, GEN_INT (3), const0_rtx,
22773 const1_rtx));
22774 h[i] = gen_lowpart (V32QImode, op);
22775 }
22776
22777 for (i = 0; i < 2; ++i)
22778 {
22779 if (!used[2 * i])
22780 {
22781 l[i] = NULL_RTX;
22782 continue;
22783 }
22784 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
22785 vperm = force_reg (V32QImode, vperm);
22786 l[i] = gen_reg_rtx (V32QImode);
22787 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
22788 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
22789 }
22790
22791 for (i = 0; i < 2; ++i)
22792 {
22793 if (h[i] && l[i])
22794 {
22795 op = gen_reg_rtx (V32QImode);
22796 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
22797 l[i] = op;
22798 }
22799 else if (h[i])
22800 l[i] = h[i];
22801 }
22802
22803 gcc_assert (l[0] && l[1]);
22804 op = d->target;
22805 if (d->vmode != V32QImode)
22806 op = gen_reg_rtx (V32QImode);
22807 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
22808 if (op != d->target)
22809 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
22810 return true;
22811 }
22812
22813 /* The guts of ix86_vectorize_vec_perm_const. With all of the interface bits
22814 taken care of, perform the expansion in D and return true on success. */
22815
22816 static bool
22817 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
22818 {
22819 /* Try a single instruction expansion. */
22820 if (expand_vec_perm_1 (d))
22821 return true;
22822
22823 /* Try sequences of two instructions. */
22824
22825 if (expand_vec_perm_pshuflw_pshufhw (d))
22826 return true;
22827
22828 if (expand_vec_perm_palignr (d, false))
22829 return true;
22830
22831 if (expand_vec_perm_interleave2 (d))
22832 return true;
22833
22834 if (expand_vec_perm_broadcast (d))
22835 return true;
22836
22837 if (expand_vec_perm_vpermq_perm_1 (d))
22838 return true;
22839
22840 if (expand_vec_perm_vperm2f128 (d))
22841 return true;
22842
22843 if (expand_vec_perm_pblendv (d))
22844 return true;
22845
22846 if (expand_vec_perm_2perm_interleave (d, true))
22847 return true;
22848
22849 if (expand_vec_perm_2perm_pblendv (d, true))
22850 return true;
22851
22852 if (expand_vec_perm_shufps_shufps (d))
22853 return true;
22854
22855 /* Try sequences of three instructions. */
22856
22857 if (expand_vec_perm_even_odd_pack (d))
22858 return true;
22859
22860 if (expand_vec_perm_2vperm2f128_vshuf (d))
22861 return true;
22862
22863 if (expand_vec_perm_pshufb2 (d))
22864 return true;
22865
22866 if (expand_vec_perm_pslldq_psrldq_por (d, false))
22867 return true;
22868
22869 if (expand_vec_perm_interleave3 (d))
22870 return true;
22871
22872 if (expand_vec_perm_vperm2f128_vblend (d))
22873 return true;
22874
22875 if (expand_vec_perm_2perm_interleave (d, false))
22876 return true;
22877
22878 if (expand_vec_perm_2perm_pblendv (d, false))
22879 return true;
22880
22881 /* Try sequences of four instructions. */
22882
22883 if (expand_vec_perm_even_odd_trunc (d))
22884 return true;
22885 if (expand_vec_perm_vpshufb2_vpermq (d))
22886 return true;
22887
22888 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
22889 return true;
22890
22891 if (expand_vec_perm_vpermt2_vpshub2 (d))
22892 return true;
22893
22894 /* ??? Look for narrow permutations whose element orderings would
22895 allow the promotion to a wider mode. */
22896
22897 /* ??? Look for sequences of interleave or a wider permute that place
22898 the data into the correct lanes for a half-vector shuffle like
22899 pshuf[lh]w or vpermilps. */
22900
22901 /* ??? Look for sequences of interleave that produce the desired results.
22902 The combinatorics of punpck[lh] get pretty ugly... */
22903
22904 if (expand_vec_perm_even_odd (d))
22905 return true;
22906
22907 /* Generate four or five instructions. */
22908 if (expand_vec_perm_pslldq_psrldq_por (d, true))
22909 return true;
22910
22911 /* Even longer sequences. */
22912 if (expand_vec_perm_vpshufb4_vpermq2 (d))
22913 return true;
22914
22915 /* See if we can get the same permutation in different vector integer
22916 mode. */
22917 struct expand_vec_perm_d nd;
22918 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
22919 {
22920 if (!d->testing_p)
22921 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
22922 return true;
22923 }
22924
22925 /* Even longer, including recursion to ix86_expand_vec_perm_const_1. */
22926 if (expand_vec_perm2_vperm2f128_vblend (d))
22927 return true;
22928
22929 return false;
22930 }
22931
22932 /* If a permutation only uses one operand, make it clear. Returns true
22933 if the permutation references both operands. */
22934
22935 static bool
22936 canonicalize_perm (struct expand_vec_perm_d *d)
22937 {
22938 int i, which, nelt = d->nelt;
22939
22940 for (i = which = 0; i < nelt; ++i)
22941 which |= (d->perm[i] < nelt ? 1 : 2);
22942
22943 d->one_operand_p = true;
22944 switch (which)
22945 {
22946 default:
22947 gcc_unreachable();
22948
22949 case 3:
22950 if (!rtx_equal_p (d->op0, d->op1))
22951 {
22952 d->one_operand_p = false;
22953 break;
22954 }
22955 /* The elements of PERM do not suggest that only the first operand
22956 is used, but both operands are identical. Allow easier matching
22957 of the permutation by folding the permutation into the single
22958 input vector. */
22959 /* FALLTHRU */
22960
22961 case 2:
22962 for (i = 0; i < nelt; ++i)
22963 d->perm[i] &= nelt - 1;
22964 d->op0 = d->op1;
22965 break;
22966
22967 case 1:
22968 d->op1 = d->op0;
22969 break;
22970 }
22971
22972 return (which == 3);
22973 }
22974
22975 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
22976
22977 bool
22978 ix86_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
22979 rtx target, rtx op0, rtx op1,
22980 const vec_perm_indices &sel)
22981 {
22982 if (vmode != op_mode)
22983 return false;
22984
22985 struct expand_vec_perm_d d;
22986 unsigned char perm[MAX_VECT_LEN];
22987 unsigned int i, nelt, which;
22988 bool two_args;
22989
22990   /* For an HFmode vector, convert it to an HImode vector using a subreg.  */
22991 if (GET_MODE_INNER (vmode) == HFmode)
22992 {
22993 machine_mode orig_mode = vmode;
22994 vmode = mode_for_vector (HImode,
22995 GET_MODE_NUNITS (vmode)).require ();
22996 if (target)
22997 target = lowpart_subreg (vmode, target, orig_mode);
22998 if (op0)
22999 op0 = lowpart_subreg (vmode, op0, orig_mode);
23000 if (op1)
23001 op1 = lowpart_subreg (vmode, op1, orig_mode);
23002 }
23003
23004 d.target = target;
23005 d.op0 = op0;
23006 d.op1 = op1;
23007
23008 d.vmode = vmode;
23009 gcc_assert (VECTOR_MODE_P (d.vmode));
23010 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
23011 d.testing_p = !target;
23012
23013 gcc_assert (sel.length () == nelt);
23014 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
23015
23016 /* Given sufficient ISA support we can just return true here
23017 for selected vector modes. */
23018 switch (d.vmode)
23019 {
23020 case E_V16SFmode:
23021 case E_V16SImode:
23022 case E_V8DImode:
23023 case E_V8DFmode:
23024 if (!TARGET_AVX512F)
23025 return false;
23026 /* All implementable with a single vperm[it]2 insn. */
23027 if (d.testing_p)
23028 return true;
23029 break;
23030 case E_V32HImode:
23031 if (!TARGET_AVX512F)
23032 return false;
23033 if (d.testing_p && TARGET_AVX512BW)
23034 /* All implementable with a single vperm[it]2 insn. */
23035 return true;
23036 break;
23037 case E_V64QImode:
23038 if (!TARGET_AVX512F)
23039 return false;
23040 if (d.testing_p && TARGET_AVX512BW)
23041 /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */
23042 return true;
23043 break;
23044 case E_V8SImode:
23045 case E_V8SFmode:
23046 case E_V4DFmode:
23047 case E_V4DImode:
23048 if (!TARGET_AVX)
23049 return false;
23050 if (d.testing_p && TARGET_AVX512VL)
23051 /* All implementable with a single vperm[it]2 insn. */
23052 return true;
23053 break;
23054 case E_V16HImode:
23055 if (!TARGET_SSE2)
23056 return false;
23057 if (d.testing_p && TARGET_AVX2)
23058 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
23059 return true;
23060 break;
23061 case E_V32QImode:
23062 if (!TARGET_SSE2)
23063 return false;
23064 if (d.testing_p && TARGET_AVX2)
23065 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
23066 return true;
23067 break;
23068 case E_V8HImode:
23069 case E_V16QImode:
23070 if (!TARGET_SSE2)
23071 return false;
23072 /* Fall through. */
23073 case E_V4SImode:
23074 case E_V4SFmode:
23075 if (!TARGET_SSE)
23076 return false;
23077 /* All implementable with a single vpperm insn. */
23078 if (d.testing_p && TARGET_XOP)
23079 return true;
23080 /* All implementable with 2 pshufb + 1 ior. */
23081 if (d.testing_p && TARGET_SSSE3)
23082 return true;
23083 break;
23084 case E_V2SFmode:
23085 case E_V2SImode:
23086 case E_V4HImode:
23087 case E_V8QImode:
23088 if (!TARGET_MMX_WITH_SSE)
23089 return false;
23090 break;
23091 case E_V2HImode:
23092 if (!TARGET_SSE2)
23093 return false;
23094 /* All implementable with *punpckwd. */
23095 if (d.testing_p)
23096 return true;
23097 break;
23098 case E_V4QImode:
23099 if (!TARGET_SSE2)
23100 return false;
23101 break;
23102 case E_V2DImode:
23103 case E_V2DFmode:
23104 if (!TARGET_SSE)
23105 return false;
23106 /* All implementable with shufpd or unpck[lh]pd. */
23107 if (d.testing_p)
23108 return true;
23109 break;
23110 default:
23111 return false;
23112 }
23113
23114 for (i = which = 0; i < nelt; ++i)
23115 {
23116 unsigned char e = sel[i];
23117 gcc_assert (e < 2 * nelt);
23118 d.perm[i] = e;
23119 perm[i] = e;
23120 which |= (e < nelt ? 1 : 2);
23121 }
23122
23123 if (d.testing_p)
23124 {
23125       /* If all elements are from the second vector, fold them to the first.  */
23126 if (which == 2)
23127 for (i = 0; i < nelt; ++i)
23128 d.perm[i] -= nelt;
23129
23130 /* Check whether the mask can be applied to the vector type. */
23131 d.one_operand_p = (which != 3);
23132
23133 /* Implementable with shufps, pshufd or pshuflw. */
23134 if (d.one_operand_p
23135 && (d.vmode == V4SFmode || d.vmode == V2SFmode
23136 || d.vmode == V4SImode || d.vmode == V2SImode
23137 || d.vmode == V4HImode || d.vmode == V2HImode))
23138 return true;
23139
23140 /* Otherwise we have to go through the motions and see if we can
23141 figure out how to generate the requested permutation. */
23142 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
23143 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
23144 if (!d.one_operand_p)
23145 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
23146
23147 start_sequence ();
23148 bool ret = ix86_expand_vec_perm_const_1 (&d);
23149 end_sequence ();
23150
23151 return ret;
23152 }
23153
23154 two_args = canonicalize_perm (&d);
23155
23156 /* If one of the operands is a zero vector, try to match pmovzx. */
23157 if (two_args && (d.op0 == CONST0_RTX (vmode) || d.op1 == CONST0_RTX (vmode)))
23158 {
23159 struct expand_vec_perm_d dzero = d;
23160 if (d.op0 == CONST0_RTX (vmode))
23161 {
23162 d.op1 = dzero.op1 = force_reg (vmode, d.op1);
23163 std::swap (dzero.op0, dzero.op1);
23164 for (i = 0; i < nelt; ++i)
23165 dzero.perm[i] ^= nelt;
23166 }
23167 else
23168 d.op0 = dzero.op0 = force_reg (vmode, d.op0);
23169
23170 if (expand_vselect_vconcat (dzero.target, dzero.op0, dzero.op1,
23171 dzero.perm, nelt, dzero.testing_p))
23172 return true;
23173 }
23174
23175 /* Force operands into registers. */
23176 rtx nop0 = force_reg (vmode, d.op0);
23177 if (d.op0 == d.op1)
23178 d.op1 = nop0;
23179 d.op0 = nop0;
23180 d.op1 = force_reg (vmode, d.op1);
23181
23182 if (ix86_expand_vec_perm_const_1 (&d))
23183 return true;
23184
23185 /* If the selector says both arguments are needed, but the operands are the
23186 same, the above tried to expand with one_operand_p and flattened selector.
23187 If that didn't work, retry without one_operand_p; we succeeded with that
23188 during testing. */
23189 if (two_args && d.one_operand_p)
23190 {
23191 d.one_operand_p = false;
23192 memcpy (d.perm, perm, sizeof (perm));
23193 return ix86_expand_vec_perm_const_1 (&d);
23194 }
23195
23196 return false;
23197 }
23198
23199 void
23200 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
23201 {
23202 struct expand_vec_perm_d d;
23203 unsigned i, nelt;
23204
23205 d.target = targ;
23206 d.op0 = op0;
23207 d.op1 = op1;
23208 d.vmode = GET_MODE (targ);
23209 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
23210 d.one_operand_p = false;
23211 d.testing_p = false;
23212
23213 for (i = 0; i < nelt; ++i)
23214 d.perm[i] = i * 2 + odd;
23215
23216 /* We'll either be able to implement the permutation directly... */
23217 if (expand_vec_perm_1 (&d))
23218 return;
23219
23220 /* ... or we use the special-case patterns. */
23221 expand_vec_perm_even_odd_1 (&d, odd);
23222 }
23223
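/* Interleave the low (HIGH_P false) or high (HIGH_P true) halves of OP0 and
   OP1 into TARG via a constant permutation.  */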
23224 static void
23225 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
23226 {
23227 struct expand_vec_perm_d d;
23228 unsigned i, nelt, base;
23229 bool ok;
23230
23231 d.target = targ;
23232 d.op0 = op0;
23233 d.op1 = op1;
23234 d.vmode = GET_MODE (targ);
23235 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
23236 d.one_operand_p = false;
23237 d.testing_p = false;
23238
23239 base = high_p ? nelt / 2 : 0;
23240 for (i = 0; i < nelt / 2; ++i)
23241 {
23242 d.perm[i * 2] = i + base;
23243 d.perm[i * 2 + 1] = i + base + nelt;
23244 }
23245
23246 /* Note that for AVX this isn't one instruction. */
23247 ok = ix86_expand_vec_perm_const_1 (&d);
23248 gcc_assert (ok);
23249 }
23250
23251 /* Expand a vector shift by a constant amount for V*QImode in terms of the
23252    same operation on V*HImode.  Return true on success.  */
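/* A sketch of the idea: a V16QImode ASHIFT by 3 becomes a V8HImode shift by
   3 followed by an AND with a vector of 0xf8 bytes, clearing the bits that
   leaked in from the neighbouring byte; for ASHIFTRT the result is
   additionally sign-extended with the (x ^ m) - m trick, where m = 0x10
   marks the position the sign bit lands in after the shift.  */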
23253 static bool
23254 ix86_expand_vec_shift_qihi_constant (enum rtx_code code,
23255 rtx dest, rtx op1, rtx op2)
23256 {
23257 machine_mode qimode, himode;
23258 HOST_WIDE_INT and_constant, xor_constant;
23259 HOST_WIDE_INT shift_amount;
23260 rtx vec_const_and, vec_const_xor;
23261 rtx tmp, op1_subreg;
23262 rtx (*gen_shift) (rtx, rtx, rtx);
23263 rtx (*gen_and) (rtx, rtx, rtx);
23264 rtx (*gen_xor) (rtx, rtx, rtx);
23265 rtx (*gen_sub) (rtx, rtx, rtx);
23266
23267 /* Only optimize shift by constant. */
23268 if (!CONST_INT_P (op2))
23269 return false;
23270
23271 qimode = GET_MODE (dest);
23272 shift_amount = INTVAL (op2);
23273   /* Do nothing when the shift amount is greater than or equal to 8.  */
23274 if (shift_amount > 7)
23275 return false;
23276
23277 gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
23278 /* Record sign bit. */
23279 xor_constant = 1 << (8 - shift_amount - 1);
23280
23281   /* Mask off the bits shifted in from the adjacent byte element.  */
23282 and_constant
23283 = (code == ASHIFT ? 256 - (1 << shift_amount)
23284 : (1 << (8 - shift_amount)) - 1);
23285
23286 switch (qimode)
23287 {
23288 case V16QImode:
23289 himode = V8HImode;
23290 gen_shift =
23291 ((code == ASHIFT)
23292 ? gen_ashlv8hi3
23293 : (code == ASHIFTRT) ? gen_ashrv8hi3 : gen_lshrv8hi3);
23294 gen_and = gen_andv16qi3;
23295 gen_xor = gen_xorv16qi3;
23296 gen_sub = gen_subv16qi3;
23297 break;
23298 case V32QImode:
23299 himode = V16HImode;
23300 gen_shift =
23301 ((code == ASHIFT)
23302 ? gen_ashlv16hi3
23303 : (code == ASHIFTRT) ? gen_ashrv16hi3 : gen_lshrv16hi3);
23304 gen_and = gen_andv32qi3;
23305 gen_xor = gen_xorv32qi3;
23306 gen_sub = gen_subv32qi3;
23307 break;
23308 case V64QImode:
23309 himode = V32HImode;
23310 gen_shift =
23311 ((code == ASHIFT)
23312 ? gen_ashlv32hi3
23313 : (code == ASHIFTRT) ? gen_ashrv32hi3 : gen_lshrv32hi3);
23314 gen_and = gen_andv64qi3;
23315 gen_xor = gen_xorv64qi3;
23316 gen_sub = gen_subv64qi3;
23317 break;
23318 default:
23319 gcc_unreachable ();
23320 }
23321
23322 tmp = gen_reg_rtx (himode);
23323 vec_const_and = gen_reg_rtx (qimode);
23324 op1_subreg = lowpart_subreg (himode, op1, qimode);
23325
23326   /* For ASHIFT and LSHIFTRT, perform the operation as
23327      vpsllw/vpsrlw $shift_amount, %op1, %dest
23328      vpand %vec_const_and, %dest.  */
23329 emit_insn (gen_shift (tmp, op1_subreg, op2));
23330 emit_move_insn (dest, simplify_gen_subreg (qimode, tmp, himode, 0));
23331 emit_move_insn (vec_const_and,
23332 ix86_build_const_vector (qimode, true,
23333 gen_int_mode (and_constant, QImode)));
23334 emit_insn (gen_and (dest, dest, vec_const_and));
23335
23336   /* For ASHIFTRT, perform the extra sign-extension operations
23337      vpxor %vec_const_xor, %dest, %dest
23338      vpsubb %vec_const_xor, %dest, %dest.  */
23339 if (code == ASHIFTRT)
23340 {
23341 vec_const_xor = gen_reg_rtx (qimode);
23342 emit_move_insn (vec_const_xor,
23343 ix86_build_const_vector (qimode, true,
23344 gen_int_mode (xor_constant, QImode)));
23345 emit_insn (gen_xor (dest, dest, vec_const_xor));
23346 emit_insn (gen_sub (dest, dest, vec_const_xor));
23347 }
23348 return true;
23349 }
23350
23351 void
23352 ix86_expand_vecop_qihi_partial (enum rtx_code code, rtx dest, rtx op1, rtx op2)
23353 {
23354 machine_mode qimode = GET_MODE (dest);
23355 rtx qop1, qop2, hop1, hop2, qdest, hdest;
23356 bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
23357 bool uns_p = code != ASHIFTRT;
23358
23359 switch (qimode)
23360 {
23361 case E_V4QImode:
23362 case E_V8QImode:
23363 break;
23364 default:
23365 gcc_unreachable ();
23366 }
23367
23368 qop1 = lowpart_subreg (V16QImode, force_reg (qimode, op1), qimode);
23369
23370 if (op2vec)
23371 qop2 = lowpart_subreg (V16QImode, force_reg (qimode, op2), qimode);
23372 else
23373 qop2 = op2;
23374
23375 qdest = gen_reg_rtx (V16QImode);
23376
23377 if (CONST_INT_P (op2)
23378 && (code == ASHIFT || code == LSHIFTRT || code == ASHIFTRT)
23379 && ix86_expand_vec_shift_qihi_constant (code, qdest, qop1, qop2))
23380 {
23381 emit_move_insn (dest, gen_lowpart (qimode, qdest));
23382 return;
23383 }
23384
23385 switch (code)
23386 {
23387 case MULT:
23388 gcc_assert (op2vec);
23389 if (!TARGET_SSE4_1)
23390 {
23391 /* Unpack data such that we've got a source byte in each low byte
23392 of each word. We don't care what goes into the high byte of
23393 each word. Rather than trying to get zero in there, most
23394 convenient is to let it be a copy of the low byte. */
23395 hop1 = copy_to_reg (qop1);
23396 hop2 = copy_to_reg (qop2);
23397 emit_insn (gen_vec_interleave_lowv16qi (hop1, hop1, hop1));
23398 emit_insn (gen_vec_interleave_lowv16qi (hop2, hop2, hop2));
23399 break;
23400 }
23401 /* FALLTHRU */
23402 case ASHIFT:
23403 case ASHIFTRT:
23404 case LSHIFTRT:
23405 hop1 = gen_reg_rtx (V8HImode);
23406 ix86_expand_sse_unpack (hop1, qop1, uns_p, false);
23407 /* mult/vashr/vlshr/vashl */
23408 if (op2vec)
23409 {
23410 hop2 = gen_reg_rtx (V8HImode);
23411 ix86_expand_sse_unpack (hop2, qop2, uns_p, false);
23412 }
23413 else
23414 hop2 = qop2;
23415
23416 break;
23417 default:
23418 gcc_unreachable ();
23419 }
23420
23421 if (code != MULT && op2vec)
23422 {
23423 /* Expand vashr/vlshr/vashl. */
23424 hdest = gen_reg_rtx (V8HImode);
23425 emit_insn (gen_rtx_SET (hdest,
23426 simplify_gen_binary (code, V8HImode,
23427 hop1, hop2)));
23428 }
23429 else
23430 /* Expand mult/ashr/lshr/ashl. */
23431 hdest = expand_simple_binop (V8HImode, code, hop1, hop2,
23432 NULL_RTX, 1, OPTAB_DIRECT);
23433
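  /* With AVX512BW and AVX512VL the V8HImode result can be narrowed back to
     V8QImode directly with vpmovwb; otherwise pick out the even bytes with
     a constant permutation below.  */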
23434 if (TARGET_AVX512BW && TARGET_AVX512VL)
23435 {
23436 if (qimode == V8QImode)
23437 qdest = dest;
23438 else
23439 qdest = gen_reg_rtx (V8QImode);
23440
23441 emit_insn (gen_truncv8hiv8qi2 (qdest, hdest));
23442 }
23443 else
23444 {
23445 struct expand_vec_perm_d d;
23446 rtx qres = gen_lowpart (V16QImode, hdest);
23447 bool ok;
23448 int i;
23449
23450 /* Merge the data back into the right place. */
23451 d.target = qdest;
23452 d.op0 = d.op1 = qres;
23453 d.vmode = V16QImode;
23454 d.nelt = 16;
23455 d.one_operand_p = false;
23456 d.testing_p = false;
23457
23458 for (i = 0; i < d.nelt; ++i)
23459 d.perm[i] = i * 2;
23460
23461 ok = ix86_expand_vec_perm_const_1 (&d);
23462 gcc_assert (ok);
23463 }
23464
23465 if (qdest != dest)
23466 emit_move_insn (dest, gen_lowpart (qimode, qdest));
23467 }
23468
23469 /* Emit the operation in a 2x wider mode.  For example, optimizing
23470    vector MUL generation like
23471
23472 vpmovzxbw ymm2, xmm0
23473 vpmovzxbw ymm3, xmm1
23474 vpmullw ymm4, ymm2, ymm3
23475 vpmovwb xmm0, ymm4
23476
23477    takes fewer instructions than ix86_expand_vecop_qihi.
23478    Return true on success.  */
23479
23480 static bool
23481 ix86_expand_vecop_qihi2 (enum rtx_code code, rtx dest, rtx op1, rtx op2)
23482 {
23483 machine_mode himode, qimode = GET_MODE (dest);
23484 machine_mode wqimode;
23485 rtx qop1, qop2, hop1, hop2, hdest;
23486 rtx (*gen_truncate)(rtx, rtx) = NULL;
23487 bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
23488 bool uns_p = code != ASHIFTRT;
23489
23490 if ((qimode == V16QImode && !TARGET_AVX2)
23491 || (qimode == V32QImode && !TARGET_AVX512BW)
23492 /* There are no V64HImode instructions. */
23493 || qimode == V64QImode)
23494 return false;
23495
23496   /* Do not generate ymm/zmm instructions when the
23497      target prefers a 128/256-bit vector width.  */
23498 if ((qimode == V16QImode && TARGET_PREFER_AVX128)
23499 || (qimode == V32QImode && TARGET_PREFER_AVX256))
23500 return false;
23501
23502 switch (qimode)
23503 {
23504 case E_V16QImode:
23505 himode = V16HImode;
23506 if (TARGET_AVX512VL && TARGET_AVX512BW)
23507 gen_truncate = gen_truncv16hiv16qi2;
23508 break;
23509 case E_V32QImode:
23510 himode = V32HImode;
23511 gen_truncate = gen_truncv32hiv32qi2;
23512 break;
23513 default:
23514 gcc_unreachable ();
23515 }
23516
23517 wqimode = GET_MODE_2XWIDER_MODE (qimode).require ();
23518 qop1 = lowpart_subreg (wqimode, force_reg (qimode, op1), qimode);
23519
23520 if (op2vec)
23521 qop2 = lowpart_subreg (wqimode, force_reg (qimode, op2), qimode);
23522 else
23523 qop2 = op2;
23524
23525 hop1 = gen_reg_rtx (himode);
23526 ix86_expand_sse_unpack (hop1, qop1, uns_p, false);
23527
23528 if (op2vec)
23529 {
23530 hop2 = gen_reg_rtx (himode);
23531 ix86_expand_sse_unpack (hop2, qop2, uns_p, false);
23532 }
23533 else
23534 hop2 = qop2;
23535
23536 if (code != MULT && op2vec)
23537 {
23538 /* Expand vashr/vlshr/vashl. */
23539 hdest = gen_reg_rtx (himode);
23540 emit_insn (gen_rtx_SET (hdest,
23541 simplify_gen_binary (code, himode,
23542 hop1, hop2)));
23543 }
23544 else
23545 /* Expand mult/ashr/lshr/ashl. */
23546 hdest = expand_simple_binop (himode, code, hop1, hop2,
23547 NULL_RTX, 1, OPTAB_DIRECT);
23548
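  /* If a truncate pattern is available (vpmovwb), narrow the widened result
     back directly; otherwise select the even bytes with a constant
     permutation.  */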
23549 if (gen_truncate)
23550 emit_insn (gen_truncate (dest, hdest));
23551 else
23552 {
23553 struct expand_vec_perm_d d;
23554 rtx wqdest = gen_reg_rtx (wqimode);
23555 rtx wqres = gen_lowpart (wqimode, hdest);
23556 bool ok;
23557 int i;
23558
23559 /* Merge the data back into the right place. */
23560 d.target = wqdest;
23561 d.op0 = d.op1 = wqres;
23562 d.vmode = wqimode;
23563 d.nelt = GET_MODE_NUNITS (wqimode);
23564 d.one_operand_p = false;
23565 d.testing_p = false;
23566
23567 for (i = 0; i < d.nelt; ++i)
23568 d.perm[i] = i * 2;
23569
23570 ok = ix86_expand_vec_perm_const_1 (&d);
23571 gcc_assert (ok);
23572
23573 emit_move_insn (dest, gen_lowpart (qimode, wqdest));
23574 }
23575
23576 return true;
23577 }
23578
23579 /* Expand a vector operation CODE for a V*QImode in terms of the
23580 same operation on V*HImode. */
23581
23582 void
23583 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
23584 {
23585 machine_mode qimode = GET_MODE (dest);
23586 machine_mode himode;
23587 rtx (*gen_il) (rtx, rtx, rtx);
23588 rtx (*gen_ih) (rtx, rtx, rtx);
23589 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
23590 bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
23591 struct expand_vec_perm_d d;
23592 bool full_interleave = true;
23593 bool uns_p = code != ASHIFTRT;
23594 bool ok;
23595 int i;
23596
23597 if (CONST_INT_P (op2)
23598 && (code == ASHIFT || code == LSHIFTRT || code == ASHIFTRT)
23599 && ix86_expand_vec_shift_qihi_constant (code, dest, op1, op2))
23600 return;
23601
23602 if (ix86_expand_vecop_qihi2 (code, dest, op1, op2))
23603 return;
23604
23605 switch (qimode)
23606 {
23607 case E_V16QImode:
23608 himode = V8HImode;
23609 break;
23610 case E_V32QImode:
23611 himode = V16HImode;
23612 break;
23613 case E_V64QImode:
23614 himode = V32HImode;
23615 break;
23616 default:
23617 gcc_unreachable ();
23618 }
23619
23620 switch (code)
23621 {
23622 case MULT:
23623 gcc_assert (op2vec);
23624 /* Unpack data such that we've got a source byte in each low byte of
23625 each word. We don't care what goes into the high byte of each word.
23626 Rather than trying to get zero in there, most convenient is to let
23627 it be a copy of the low byte. */
23628 switch (qimode)
23629 {
23630 case E_V16QImode:
23631 gen_il = gen_vec_interleave_lowv16qi;
23632 gen_ih = gen_vec_interleave_highv16qi;
23633 break;
23634 case E_V32QImode:
23635 gen_il = gen_avx2_interleave_lowv32qi;
23636 gen_ih = gen_avx2_interleave_highv32qi;
23637 full_interleave = false;
23638 break;
23639 case E_V64QImode:
23640 gen_il = gen_avx512bw_interleave_lowv64qi;
23641 gen_ih = gen_avx512bw_interleave_highv64qi;
23642 full_interleave = false;
23643 break;
23644 default:
23645 gcc_unreachable ();
23646 }
23647
23648 op2_l = gen_reg_rtx (qimode);
23649 op2_h = gen_reg_rtx (qimode);
23650 emit_insn (gen_il (op2_l, op2, op2));
23651 emit_insn (gen_ih (op2_h, op2, op2));
23652
23653 op1_l = gen_reg_rtx (qimode);
23654 op1_h = gen_reg_rtx (qimode);
23655 emit_insn (gen_il (op1_l, op1, op1));
23656 emit_insn (gen_ih (op1_h, op1, op1));
23657 break;
23658
23659 case ASHIFT:
23660 case ASHIFTRT:
23661 case LSHIFTRT:
23662 op1_l = gen_reg_rtx (himode);
23663 op1_h = gen_reg_rtx (himode);
23664 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
23665 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
23666 /* vashr/vlshr/vashl */
23667 if (op2vec)
23668 {
23669 rtx tmp = force_reg (qimode, op2);
23670 op2_l = gen_reg_rtx (himode);
23671 op2_h = gen_reg_rtx (himode);
23672 ix86_expand_sse_unpack (op2_l, tmp, uns_p, false);
23673 ix86_expand_sse_unpack (op2_h, tmp, uns_p, true);
23674 }
23675 else
23676 op2_l = op2_h = op2;
23677
23678 break;
23679 default:
23680 gcc_unreachable ();
23681 }
23682
23683 if (code != MULT && op2vec)
23684 {
23685 /* Expand vashr/vlshr/vashl. */
23686 res_l = gen_reg_rtx (himode);
23687 res_h = gen_reg_rtx (himode);
23688 emit_insn (gen_rtx_SET (res_l,
23689 simplify_gen_binary (code, himode,
23690 op1_l, op2_l)));
23691 emit_insn (gen_rtx_SET (res_h,
23692 simplify_gen_binary (code, himode,
23693 op1_h, op2_h)));
23694 }
23695 else
23696 {
23697 /* Expand mult/ashr/lshr/ashl. */
23698 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
23699 1, OPTAB_DIRECT);
23700 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
23701 1, OPTAB_DIRECT);
23702 }
23703
23704 gcc_assert (res_l && res_h);
23705
23706 /* Merge the data back into the right place. */
23707 d.target = dest;
23708 d.op0 = gen_lowpart (qimode, res_l);
23709 d.op1 = gen_lowpart (qimode, res_h);
23710 d.vmode = qimode;
23711 d.nelt = GET_MODE_NUNITS (qimode);
23712 d.one_operand_p = false;
23713 d.testing_p = false;
23714
23715 if (full_interleave)
23716 {
23717       /* We used the full interleave; the desired
23718 	 results are in the even elements.  */
23719 for (i = 0; i < d.nelt; ++i)
23720 d.perm[i] = i * 2;
23721 }
23722 else
23723 {
23724       /* For AVX, the interleave used above was not cross-lane.  So the
23725 	 extraction is evens but with the second and third quarters swapped.
23726 	 Happily, that is even one insn shorter than even extraction.
23727 	 For AVX512BW we have 4 lanes.  We extract evens from within a lane,
23728 	 always first from the first and then from the second source operand;
23729 	 the index bits above the low 4 bits remain the same.
23730 Thus, for d.nelt == 32 we want permutation
23731 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
23732 and for d.nelt == 64 we want permutation
23733 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
23734 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
23735 for (i = 0; i < d.nelt; ++i)
23736 d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
23737 }
23738
23739 ok = ix86_expand_vec_perm_const_1 (&d);
23740 gcc_assert (ok);
23741 }
23742
23743 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
23744 if op is CONST_VECTOR with all odd elements equal to their
23745 preceding element. */
23746
23747 static bool
23748 const_vector_equal_evenodd_p (rtx op)
23749 {
23750 machine_mode mode = GET_MODE (op);
23751 int i, nunits = GET_MODE_NUNITS (mode);
23752 if (GET_CODE (op) != CONST_VECTOR
23753 || nunits != CONST_VECTOR_NUNITS (op))
23754 return false;
23755 for (i = 0; i < nunits; i += 2)
23756 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
23757 return false;
23758 return true;
23759 }
23760
23761 void
23762 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
23763 bool uns_p, bool odd_p)
23764 {
23765 machine_mode mode = GET_MODE (op1);
23766 machine_mode wmode = GET_MODE (dest);
23767 rtx x;
23768 rtx orig_op1 = op1, orig_op2 = op2;
23769
23770 if (!nonimmediate_operand (op1, mode))
23771 op1 = force_reg (mode, op1);
23772 if (!nonimmediate_operand (op2, mode))
23773 op2 = force_reg (mode, op2);
23774
23775 /* We only play even/odd games with vectors of SImode. */
23776 gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
23777
23778 /* If we're looking for the odd results, shift those members down to
23779 the even slots. For some cpus this is faster than a PSHUFD. */
23780 if (odd_p)
23781 {
23782 /* For XOP use vpmacsdqh, but only for smult, as it is only
23783 signed. */
23784 if (TARGET_XOP && mode == V4SImode && !uns_p)
23785 {
23786 x = force_reg (wmode, CONST0_RTX (wmode));
23787 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
23788 return;
23789 }
23790
23791 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
23792 if (!const_vector_equal_evenodd_p (orig_op1))
23793 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
23794 x, NULL, 1, OPTAB_DIRECT);
23795 if (!const_vector_equal_evenodd_p (orig_op2))
23796 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
23797 x, NULL, 1, OPTAB_DIRECT);
23798 op1 = gen_lowpart (mode, op1);
23799 op2 = gen_lowpart (mode, op2);
23800 }
23801
23802 if (mode == V16SImode)
23803 {
23804 if (uns_p)
23805 x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
23806 else
23807 x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
23808 }
23809 else if (mode == V8SImode)
23810 {
23811 if (uns_p)
23812 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
23813 else
23814 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
23815 }
23816 else if (uns_p)
23817 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
23818 else if (TARGET_SSE4_1)
23819 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
23820 else
23821 {
23822 rtx s1, s2, t0, t1, t2;
23823
23824       /* The easiest way to implement this without PMULDQ is to go through
23825 	 the motions as if we are performing a full 64-bit multiply, except
23826 	 that we need to do less shuffling of the elements.  */
23827
23828 /* Compute the sign-extension, aka highparts, of the two operands. */
23829 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
23830 op1, pc_rtx, pc_rtx);
23831 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
23832 op2, pc_rtx, pc_rtx);
23833
23834 /* Multiply LO(A) * HI(B), and vice-versa. */
23835 t1 = gen_reg_rtx (wmode);
23836 t2 = gen_reg_rtx (wmode);
23837 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
23838 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
23839
23840 /* Multiply LO(A) * LO(B). */
23841 t0 = gen_reg_rtx (wmode);
23842 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
23843
23844 /* Combine and shift the highparts into place. */
23845 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
23846 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
23847 1, OPTAB_DIRECT);
23848
23849 /* Combine high and low parts. */
23850 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
23851 return;
23852 }
23853 emit_insn (x);
23854 }
23855
23856 void
23857 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
23858 bool uns_p, bool high_p)
23859 {
23860 machine_mode wmode = GET_MODE (dest);
23861 machine_mode mode = GET_MODE (op1);
23862 rtx t1, t2, t3, t4, mask;
23863
23864 switch (mode)
23865 {
23866 case E_V4SImode:
23867 t1 = gen_reg_rtx (mode);
23868 t2 = gen_reg_rtx (mode);
23869 if (TARGET_XOP && !uns_p)
23870 {
23871 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
23872 shuffle the elements once so that all elements are in the right
23873 place for immediate use: { A C B D }. */
23874 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
23875 const1_rtx, GEN_INT (3)));
23876 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
23877 const1_rtx, GEN_INT (3)));
23878 }
23879 else
23880 {
23881 /* Put the elements into place for the multiply. */
23882 ix86_expand_vec_interleave (t1, op1, op1, high_p);
23883 ix86_expand_vec_interleave (t2, op2, op2, high_p);
23884 high_p = false;
23885 }
23886 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
23887 break;
23888
23889 case E_V8SImode:
23890 /* Shuffle the elements between the lanes. After this we
23891 have { A B E F | C D G H } for each operand. */
23892 t1 = gen_reg_rtx (V4DImode);
23893 t2 = gen_reg_rtx (V4DImode);
23894 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
23895 const0_rtx, const2_rtx,
23896 const1_rtx, GEN_INT (3)));
23897 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
23898 const0_rtx, const2_rtx,
23899 const1_rtx, GEN_INT (3)));
23900
23901 /* Shuffle the elements within the lanes. After this we
23902 have { A A B B | C C D D } or { E E F F | G G H H }. */
23903 t3 = gen_reg_rtx (V8SImode);
23904 t4 = gen_reg_rtx (V8SImode);
23905 mask = GEN_INT (high_p
23906 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
23907 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
23908 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
23909 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
23910
23911 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
23912 break;
23913
23914 case E_V8HImode:
23915 case E_V16HImode:
23916 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
23917 uns_p, OPTAB_DIRECT);
23918 t2 = expand_binop (mode,
23919 uns_p ? umul_highpart_optab : smul_highpart_optab,
23920 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
23921 gcc_assert (t1 && t2);
23922
23923 t3 = gen_reg_rtx (mode);
23924 ix86_expand_vec_interleave (t3, t1, t2, high_p);
23925 emit_move_insn (dest, gen_lowpart (wmode, t3));
23926 break;
23927
23928 case E_V16QImode:
23929 case E_V32QImode:
23930 case E_V32HImode:
23931 case E_V16SImode:
23932 case E_V64QImode:
23933 t1 = gen_reg_rtx (wmode);
23934 t2 = gen_reg_rtx (wmode);
23935 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
23936 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
23937
23938 emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
23939 break;
23940
23941 default:
23942 gcc_unreachable ();
23943 }
23944 }
23945
23946 void
23947 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
23948 {
23949 rtx res_1, res_2, res_3, res_4;
23950
23951 res_1 = gen_reg_rtx (V4SImode);
23952 res_2 = gen_reg_rtx (V4SImode);
23953 res_3 = gen_reg_rtx (V2DImode);
23954 res_4 = gen_reg_rtx (V2DImode);
23955 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
23956 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
23957
23958 /* Move the results in element 2 down to element 1; we don't care
23959 what goes in elements 2 and 3. Then we can merge the parts
23960 back together with an interleave.
23961
23962 Note that two other sequences were tried:
23963 (1) Use interleaves at the start instead of psrldq, which allows
23964 us to use a single shufps to merge things back at the end.
23965 (2) Use shufps here to combine the two vectors, then pshufd to
23966 put the elements in the correct order.
23967 In both cases the cost of the reformatting stall was too high
23968 and the overall sequence slower. */
23969
23970 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
23971 const0_rtx, const2_rtx,
23972 const0_rtx, const0_rtx));
23973 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
23974 const0_rtx, const2_rtx,
23975 const0_rtx, const0_rtx));
23976 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
23977
23978 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
23979 }
23980
23981 void
23982 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
23983 {
23984 machine_mode mode = GET_MODE (op0);
23985 rtx t1, t2, t3, t4, t5, t6;
23986
23987 if (TARGET_AVX512DQ && mode == V8DImode)
23988 emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
23989 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
23990 emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
23991 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
23992 emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
23993 else if (TARGET_XOP && mode == V2DImode)
23994 {
23995 /* op1: A,B,C,D, op2: E,F,G,H */
23996 op1 = gen_lowpart (V4SImode, op1);
23997 op2 = gen_lowpart (V4SImode, op2);
23998
23999 t1 = gen_reg_rtx (V4SImode);
24000 t2 = gen_reg_rtx (V4SImode);
24001 t3 = gen_reg_rtx (V2DImode);
24002 t4 = gen_reg_rtx (V2DImode);
24003
24004 /* t1: B,A,D,C */
24005 emit_insn (gen_sse2_pshufd_1 (t1, op1,
24006 GEN_INT (1),
24007 GEN_INT (0),
24008 GEN_INT (3),
24009 GEN_INT (2)));
24010
24011 /* t2: (B*E),(A*F),(D*G),(C*H) */
24012 emit_insn (gen_mulv4si3 (t2, t1, op2));
24013
24014 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
24015 emit_insn (gen_xop_phadddq (t3, t2));
24016
24017 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
24018 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
24019
24020       /* Multiply the low halves and add the shifted partial products.  */
24021 t5 = gen_reg_rtx (V2DImode);
24022 emit_insn (gen_vec_widen_umult_even_v4si (t5,
24023 gen_lowpart (V4SImode, op1),
24024 gen_lowpart (V4SImode, op2)));
24025 force_expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
24026 }
24027 else
24028 {
24029 machine_mode nmode;
24030 rtx (*umul) (rtx, rtx, rtx);
24031
24032 if (mode == V2DImode)
24033 {
24034 umul = gen_vec_widen_umult_even_v4si;
24035 nmode = V4SImode;
24036 }
24037 else if (mode == V4DImode)
24038 {
24039 umul = gen_vec_widen_umult_even_v8si;
24040 nmode = V8SImode;
24041 }
24042 else if (mode == V8DImode)
24043 {
24044 umul = gen_vec_widen_umult_even_v16si;
24045 nmode = V16SImode;
24046 }
24047 else
24048 gcc_unreachable ();
24049
24050
24051 /* Multiply low parts. */
24052 t1 = gen_reg_rtx (mode);
24053 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
24054
24055 /* Shift input vectors right 32 bits so we can multiply high parts. */
24056 t6 = GEN_INT (32);
24057 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
24058 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
24059
24060 /* Multiply high parts by low parts. */
24061 t4 = gen_reg_rtx (mode);
24062 t5 = gen_reg_rtx (mode);
24063 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
24064 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
24065
24066 /* Combine and shift the highparts back. */
24067 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
24068 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
24069
24070 /* Combine high and low parts. */
24071 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
24072 }
24073
24074 set_unique_reg_note (get_last_insn (), REG_EQUAL,
24075 gen_rtx_MULT (mode, op1, op2));
24076 }
24077
24078 /* Return true if the control transfer instruction INSN
24079    should be encoded with the notrack prefix.  */
24080
24081 bool
24082 ix86_notrack_prefixed_insn_p (rtx_insn *insn)
24083 {
24084 if (!insn || !((flag_cf_protection & CF_BRANCH)))
24085 return false;
24086
24087 if (CALL_P (insn))
24088 {
24089 rtx call = get_call_rtx_from (insn);
24090 gcc_assert (call != NULL_RTX);
24091 rtx addr = XEXP (call, 0);
24092
24093 /* Do not emit 'notrack' if it's not an indirect call. */
24094 if (MEM_P (addr)
24095 && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
24096 return false;
24097 else
24098 return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
24099 }
24100
24101 if (JUMP_P (insn) && !flag_cet_switch)
24102 {
24103 rtx target = JUMP_LABEL (insn);
24104 if (target == NULL_RTX || ANY_RETURN_P (target))
24105 return false;
24106
24107       /* Check whether the jump goes through a switch jump table.  */
24108 rtx_insn *label = as_a<rtx_insn *> (target);
24109 rtx_insn *table = next_insn (label);
24110 if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
24111 return false;
24112 else
24113 return true;
24114 }
24115 return false;
24116 }
24117
24118 /* Calculate integer abs() using only SSE2 instructions. */
24119
24120 void
24121 ix86_expand_sse2_abs (rtx target, rtx input)
24122 {
24123 machine_mode mode = GET_MODE (target);
24124 rtx tmp0, tmp1, x;
24125
24126 switch (mode)
24127 {
24128 case E_V2DImode:
24129 case E_V4DImode:
24130 /* For 64-bit signed integer X, with SSE4.2 use
24131 pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
24132 Otherwise handle it similarly to V4SImode, except use 64 as W instead of
24133 	 32, and use a logical instead of an arithmetic right shift (which is
24134 	 unimplemented) followed by a negation to form the mask.  */
24135 if (TARGET_SSE4_2)
24136 {
24137 tmp0 = gen_reg_rtx (mode);
24138 tmp1 = gen_reg_rtx (mode);
24139 emit_move_insn (tmp1, CONST0_RTX (mode));
24140 if (mode == E_V2DImode)
24141 emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input));
24142 else
24143 emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input));
24144 }
24145 else
24146 {
24147 tmp0 = expand_simple_binop (mode, LSHIFTRT, input,
24148 GEN_INT (GET_MODE_UNIT_BITSIZE (mode)
24149 - 1), NULL, 0, OPTAB_DIRECT);
24150 tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false);
24151 }
24152
24153 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
24154 NULL, 0, OPTAB_DIRECT);
24155 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
24156 target, 0, OPTAB_DIRECT);
24157 break;
24158
24159 case E_V4SImode:
24160 /* For 32-bit signed integer X, the best way to calculate the absolute
24161 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
24162 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
24163 GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
24164 NULL, 0, OPTAB_DIRECT);
24165 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
24166 NULL, 0, OPTAB_DIRECT);
24167 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
24168 target, 0, OPTAB_DIRECT);
24169 break;
24170
24171 case E_V8HImode:
24172 /* For 16-bit signed integer X, the best way to calculate the absolute
24173 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
24174 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
24175
24176 x = expand_simple_binop (mode, SMAX, tmp0, input,
24177 target, 0, OPTAB_DIRECT);
24178 break;
24179
24180 case E_V16QImode:
24181 /* For 8-bit signed integer X, the best way to calculate the absolute
24182 value of X is min ((unsigned char) X, (unsigned char) (-X)),
24183 as SSE2 provides the PMINUB insn. */
24184 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
24185
24186 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
24187 target, 0, OPTAB_DIRECT);
24188 break;
24189
24190 default:
24191 gcc_unreachable ();
24192 }
24193
24194 if (x != target)
24195 emit_move_insn (target, x);
24196 }
24197
24198 /* Expand an extract from a vector register through pextr insn.
24199 Return true if successful. */
24200
24201 bool
24202 ix86_expand_pextr (rtx *operands)
24203 {
24204 rtx dst = operands[0];
24205 rtx src = operands[1];
24206
24207 unsigned int size = INTVAL (operands[2]);
24208 unsigned int pos = INTVAL (operands[3]);
24209
24210 if (SUBREG_P (dst))
24211 {
24212 /* Reject non-lowpart subregs. */
24213 if (SUBREG_BYTE (dst) > 0)
24214 return false;
24215 dst = SUBREG_REG (dst);
24216 }
24217
24218 if (SUBREG_P (src))
24219 {
24220 pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
24221 src = SUBREG_REG (src);
24222 }
24223
24224 switch (GET_MODE (src))
24225 {
24226 case E_V16QImode:
24227 case E_V8HImode:
24228 case E_V4SImode:
24229 case E_V2DImode:
24230 case E_V1TImode:
24231 {
24232 machine_mode srcmode, dstmode;
24233 rtx d, pat;
24234
24235 if (!int_mode_for_size (size, 0).exists (&dstmode))
24236 return false;
24237
24238 switch (dstmode)
24239 {
24240 case E_QImode:
24241 if (!TARGET_SSE4_1)
24242 return false;
24243 srcmode = V16QImode;
24244 break;
24245
24246 case E_HImode:
24247 if (!TARGET_SSE2)
24248 return false;
24249 srcmode = V8HImode;
24250 break;
24251
24252 case E_SImode:
24253 if (!TARGET_SSE4_1)
24254 return false;
24255 srcmode = V4SImode;
24256 break;
24257
24258 case E_DImode:
24259 gcc_assert (TARGET_64BIT);
24260 if (!TARGET_SSE4_1)
24261 return false;
24262 srcmode = V2DImode;
24263 break;
24264
24265 default:
24266 return false;
24267 }
24268
24269 /* Reject extractions from misaligned positions. */
24270 if (pos & (size-1))
24271 return false;
24272
24273 if (GET_MODE (dst) == dstmode)
24274 d = dst;
24275 else
24276 d = gen_reg_rtx (dstmode);
24277
24278 /* Construct insn pattern. */
24279 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
24280 pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
24281
24282 /* Let the rtl optimizers know about the zero extension performed. */
24283 if (dstmode == QImode || dstmode == HImode)
24284 {
24285 pat = gen_rtx_ZERO_EXTEND (SImode, pat);
24286 d = gen_lowpart (SImode, d);
24287 }
24288
24289 emit_insn (gen_rtx_SET (d, pat));
24290
24291 if (d != dst)
24292 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
24293 return true;
24294 }
24295
24296 default:
24297 return false;
24298 }
24299 }
24300
24301 /* Expand an insert into a vector register through pinsr insn.
24302 Return true if successful. */
24303
24304 bool
24305 ix86_expand_pinsr (rtx *operands)
24306 {
24307 rtx dst = operands[0];
24308 rtx src = operands[3];
24309
24310 unsigned int size = INTVAL (operands[1]);
24311 unsigned int pos = INTVAL (operands[2]);
24312
24313 if (SUBREG_P (dst))
24314 {
24315 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
24316 dst = SUBREG_REG (dst);
24317 }
24318
24319 switch (GET_MODE (dst))
24320 {
24321 case E_V16QImode:
24322 case E_V8HImode:
24323 case E_V4SImode:
24324 case E_V2DImode:
24325 case E_V1TImode:
24326 {
24327 machine_mode srcmode, dstmode;
24328 rtx (*pinsr)(rtx, rtx, rtx, rtx);
24329 rtx d;
24330
24331 if (!int_mode_for_size (size, 0).exists (&srcmode))
24332 return false;
24333
24334 switch (srcmode)
24335 {
24336 case E_QImode:
24337 if (!TARGET_SSE4_1)
24338 return false;
24339 dstmode = V16QImode;
24340 pinsr = gen_sse4_1_pinsrb;
24341 break;
24342
24343 case E_HImode:
24344 if (!TARGET_SSE2)
24345 return false;
24346 dstmode = V8HImode;
24347 pinsr = gen_sse2_pinsrw;
24348 break;
24349
24350 case E_SImode:
24351 if (!TARGET_SSE4_1)
24352 return false;
24353 dstmode = V4SImode;
24354 pinsr = gen_sse4_1_pinsrd;
24355 break;
24356
24357 case E_DImode:
24358 gcc_assert (TARGET_64BIT);
24359 if (!TARGET_SSE4_1)
24360 return false;
24361 dstmode = V2DImode;
24362 pinsr = gen_sse4_1_pinsrq;
24363 break;
24364
24365 default:
24366 return false;
24367 }
24368
24369 /* Reject insertions to misaligned positions. */
24370 if (pos & (size-1))
24371 return false;
24372
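	/* If SRC is a non-lowpart subreg, first extract the required bits
	   into a fresh register via ix86_expand_pextr; a lowpart subreg can
	   be used directly.  */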
24373 if (SUBREG_P (src))
24374 {
24375 unsigned int srcpos = SUBREG_BYTE (src);
24376
24377 if (srcpos > 0)
24378 {
24379 rtx extr_ops[4];
24380
24381 extr_ops[0] = gen_reg_rtx (srcmode);
24382 extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
24383 extr_ops[2] = GEN_INT (size);
24384 extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
24385
24386 if (!ix86_expand_pextr (extr_ops))
24387 return false;
24388
24389 src = extr_ops[0];
24390 }
24391 else
24392 src = gen_lowpart (srcmode, SUBREG_REG (src));
24393 }
24394
24395 if (GET_MODE (dst) == dstmode)
24396 d = dst;
24397 else
24398 d = gen_reg_rtx (dstmode);
24399
24400 emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
24401 gen_lowpart (srcmode, src),
24402 GEN_INT (1 << (pos / size))));
24403 if (d != dst)
24404 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
24405 return true;
24406 }
24407
24408 default:
24409 return false;
24410 }
24411 }
24412
24413 /* All CPUs prefer to avoid cross-lane operations, so perform reductions
24414    of the upper against the lower halves down to SSE register size.  */
24415
24416 machine_mode
24417 ix86_split_reduction (machine_mode mode)
24418 {
24419 /* Reduce lowpart against highpart until we reach SSE reg width to
24420 avoid cross-lane operations. */
24421 switch (mode)
24422 {
24423 case E_V8DImode:
24424 case E_V4DImode:
24425 return V2DImode;
24426 case E_V16SImode:
24427 case E_V8SImode:
24428 return V4SImode;
24429 case E_V32HImode:
24430 case E_V16HImode:
24431 return V8HImode;
24432 case E_V64QImode:
24433 case E_V32QImode:
24434 return V16QImode;
24435 case E_V16SFmode:
24436 case E_V8SFmode:
24437 return V4SFmode;
24438 case E_V8DFmode:
24439 case E_V4DFmode:
24440 return V2DFmode;
24441 default:
24442 return mode;
24443 }
24444 }
24445
24446 /* Generate call to __divmoddi4. */
24447
24448 void
24449 ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
24450 rtx op0, rtx op1,
24451 rtx *quot_p, rtx *rem_p)
24452 {
24453 rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
24454
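  /* The libgcc helper returns the quotient in the normal return value and
     stores the remainder through the pointer passed as its last argument,
     which is why a stack temporary is used for REM.  */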
24455 rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
24456 mode, op0, mode, op1, mode,
24457 XEXP (rem, 0), Pmode);
24458 *quot_p = quot;
24459 *rem_p = rem;
24460 }
24461
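/* Expand an atomic fetch_op/op_fetch operation on MEM with operand VAL via
   a compare-and-swap loop.  The old value is loaded, the new value is
   computed (CODE == NOT computes ~(old & VAL), i.e. the nand variant), and
   ix86_expand_cmpxchg_loop retries from LOOP_LABEL until the exchange
   succeeds.  TARGET receives the old value when AFTER is false and the new
   value when AFTER is true.  */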
24462 void
24463 ix86_expand_atomic_fetch_op_loop (rtx target, rtx mem, rtx val,
24464 enum rtx_code code, bool after,
24465 bool doubleword)
24466 {
24467 rtx old_reg, new_reg, old_mem, success;
24468 machine_mode mode = GET_MODE (target);
24469 rtx_code_label *loop_label = NULL;
24470
24471 old_reg = gen_reg_rtx (mode);
24472 new_reg = old_reg;
24473 old_mem = copy_to_reg (mem);
24474 loop_label = gen_label_rtx ();
24475 emit_label (loop_label);
24476 emit_move_insn (old_reg, old_mem);
24477
24478 /* return value for atomic_fetch_op. */
24479 if (!after)
24480 emit_move_insn (target, old_reg);
24481
24482 if (code == NOT)
24483 {
24484 new_reg = expand_simple_binop (mode, AND, new_reg, val, NULL_RTX,
24485 true, OPTAB_LIB_WIDEN);
24486 new_reg = expand_simple_unop (mode, code, new_reg, NULL_RTX, true);
24487 }
24488 else
24489 new_reg = expand_simple_binop (mode, code, new_reg, val, NULL_RTX,
24490 true, OPTAB_LIB_WIDEN);
24491
24492 /* return value for atomic_op_fetch. */
24493 if (after)
24494 emit_move_insn (target, new_reg);
24495
24496 success = NULL_RTX;
24497
24498 ix86_expand_cmpxchg_loop (&success, old_mem, mem, old_reg, new_reg,
24499 gen_int_mode (MEMMODEL_SYNC_SEQ_CST,
24500 SImode),
24501 doubleword, loop_label);
24502 }
24503
24504 /* Relax the cmpxchg instruction.  The parameter LOOP_LABEL indicates
24505    whether the instruction should be relaxed with a pause loop.  If not,
24506    it is relaxed to an atomic load + compare, and the cmpxchg instruction
24507    is skipped when mem != exp_input.  */
24508
24509 void
24510 ix86_expand_cmpxchg_loop (rtx *ptarget_bool, rtx target_val,
24511 rtx mem, rtx exp_input, rtx new_input,
24512 rtx mem_model, bool doubleword,
24513 rtx_code_label *loop_label)
24514 {
24515 rtx_code_label *cmp_label = NULL;
24516 rtx_code_label *done_label = NULL;
24517 rtx target_bool = NULL_RTX, new_mem = NULL_RTX;
24518 rtx (*gen) (rtx, rtx, rtx, rtx, rtx) = NULL;
24519 rtx (*gendw) (rtx, rtx, rtx, rtx, rtx, rtx) = NULL;
24520 machine_mode mode = GET_MODE (target_val), hmode = mode;
24521
24522 if (*ptarget_bool == NULL)
24523 target_bool = gen_reg_rtx (QImode);
24524 else
24525 target_bool = *ptarget_bool;
24526
24527 cmp_label = gen_label_rtx ();
24528 done_label = gen_label_rtx ();
24529
24530 new_mem = gen_reg_rtx (mode);
24531 /* Load memory first. */
24532 expand_atomic_load (new_mem, mem, MEMMODEL_SEQ_CST);
24533
24534 switch (mode)
24535 {
24536 case E_TImode:
24537 gendw = gen_atomic_compare_and_swapti_doubleword;
24538 hmode = DImode;
24539 break;
24540 case E_DImode:
24541 if (doubleword)
24542 {
24543 gendw = gen_atomic_compare_and_swapdi_doubleword;
24544 hmode = SImode;
24545 }
24546 else
24547 gen = gen_atomic_compare_and_swapdi_1;
24548 break;
24549 case E_SImode:
24550 gen = gen_atomic_compare_and_swapsi_1;
24551 break;
24552 case E_HImode:
24553 gen = gen_atomic_compare_and_swaphi_1;
24554 break;
24555 case E_QImode:
24556 gen = gen_atomic_compare_and_swapqi_1;
24557 break;
24558 default:
24559 gcc_unreachable ();
24560 }
24561
24562 /* Compare mem value with expected value. */
24563 if (doubleword)
24564 {
24565 rtx low_new_mem = gen_lowpart (hmode, new_mem);
24566 rtx low_exp_input = gen_lowpart (hmode, exp_input);
24567 rtx high_new_mem = gen_highpart (hmode, new_mem);
24568 rtx high_exp_input = gen_highpart (hmode, exp_input);
24569 emit_cmp_and_jump_insns (low_new_mem, low_exp_input, NE, NULL_RTX,
24570 hmode, 1, cmp_label,
24571 profile_probability::guessed_never ());
24572 emit_cmp_and_jump_insns (high_new_mem, high_exp_input, NE, NULL_RTX,
24573 hmode, 1, cmp_label,
24574 profile_probability::guessed_never ());
24575 }
24576 else
24577 emit_cmp_and_jump_insns (new_mem, exp_input, NE, NULL_RTX,
24578 GET_MODE (exp_input), 1, cmp_label,
24579 profile_probability::guessed_never ());
24580
24581   /* Directly emit the cmpxchg here.  */
24582 if (doubleword)
24583 emit_insn (gendw (target_val, mem, exp_input,
24584 gen_lowpart (hmode, new_input),
24585 gen_highpart (hmode, new_input),
24586 mem_model));
24587 else
24588 emit_insn (gen (target_val, mem, exp_input, new_input, mem_model));
24589
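  /* Without LOOP_LABEL only report success or failure in TARGET_BOOL; with
     LOOP_LABEL additionally branch back to retry when the compare-exchange
     failed, pausing first when the initial load already saw a mismatch.  */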
24590 if (!loop_label)
24591 {
24592 emit_jump_insn (gen_jump (done_label));
24593 emit_barrier ();
24594 emit_label (cmp_label);
24595 emit_move_insn (target_val, new_mem);
24596 emit_label (done_label);
24597 ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
24598 const0_rtx);
24599 }
24600 else
24601 {
24602 ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
24603 const0_rtx);
24604 emit_cmp_and_jump_insns (target_bool, const0_rtx, EQ, const0_rtx,
24605 GET_MODE (target_bool), 1, loop_label,
24606 profile_probability::guessed_never ());
24607 emit_jump_insn (gen_jump (done_label));
24608 emit_barrier ();
24609
24610 /* If mem is not expected, pause and loop back. */
24611 emit_label (cmp_label);
24612 emit_move_insn (target_val, new_mem);
24613 emit_insn (gen_pause ());
24614 emit_jump_insn (gen_jump (loop_label));
24615 emit_barrier ();
24616 emit_label (done_label);
24617 }
24618
24619 *ptarget_bool = target_bool;
24620 }
24621
24622 /* Convert a BFmode VAL to SFmode without signaling sNaNs.
24623 This is done by returning SF SUBREG of ((HI SUBREG) (VAL)) << 16. */
24624
24625 rtx
24626 ix86_expand_fast_convert_bf_to_sf (rtx val)
24627 {
24628 rtx op = gen_lowpart (HImode, val), ret;
24629 if (CONST_INT_P (op))
24630 {
24631 ret = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
24632 val, BFmode);
24633 if (ret)
24634 return ret;
24635 /* FLOAT_EXTEND simplification will fail if VAL is a sNaN. */
24636 ret = gen_reg_rtx (SImode);
24637 emit_move_insn (ret, GEN_INT (INTVAL (op) & 0xffff));
24638 emit_insn (gen_ashlsi3 (ret, ret, GEN_INT (16)));
24639 return gen_lowpart (SFmode, ret);
24640 }
24641
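  /* For a non-constant VAL use the extendbfsf2_1 pattern, which performs
     the shift by 16 described above without a signaling conversion.  */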
24642 ret = gen_reg_rtx (SFmode);
24643 emit_insn (gen_extendbfsf2_1 (ret, force_reg (BFmode, val)));
24644 return ret;
24645 }
24646
24647 #include "gt-i386-expand.h"