/* Copyright (C) 1988-2023 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#define IN_TARGET_CODE 1

#include "coretypes.h"
#include "stringpool.h"
#include "diagnostic.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "insn-attr.h"
#include "common/common-target.h"
#include "langhooks.h"
#include "tm-constrs.h"
#include "sched-int.h"
#include "tree-pass.h"
#include "pass_manager.h"
#include "target-globals.h"
#include "gimple-iterator.h"
#include "shrink-wrap.h"
#include "tree-iterator.h"
#include "case-cfn-macros.h"
#include "fold-const-call.h"
#include "tree-ssanames.h"
#include "selftest-rtl.h"
#include "print-rtl.h"
#include "symbol-summary.h"
#include "ipa-fnsummary.h"
#include "wide-int-bitmask.h"
#include "tree-vector-builder.h"
#include "dwarf2out.h"
#include "i386-options.h"
#include "i386-builtins.h"
#include "i386-expand.h"
/* Split one or more double-mode RTL references into pairs of half-mode
   references.  The RTL can be REG, offsettable MEM, integer constant, or
   CONST_DOUBLE.  "operands" is a pointer to an array of double-mode RTLs to
   split and "num" is its length.  lo_half and hi_half are output arrays
   that parallel "operands".  */

void
split_double_mode (machine_mode mode, rtx operands[],
		   int num, rtx lo_half[], rtx hi_half[])
{
  machine_mode half_mode;
  unsigned int byte;
  rtx mem_op = NULL_RTX;
  int mem_num = 0;

  switch (mode)
    {
    case E_TImode:
      half_mode = DImode;
      break;
    case E_DImode:
      half_mode = SImode;
      break;
    default:
      gcc_unreachable ();
    }

  byte = GET_MODE_SIZE (half_mode);

  while (num--)
    {
      rtx op = operands[num];

      /* simplify_subreg refuses to split volatile memory addresses,
	 but we still have to handle it.  */
      if (MEM_P (op))
	{
	  if (mem_op && rtx_equal_p (op, mem_op))
	    {
	      lo_half[num] = lo_half[mem_num];
	      hi_half[num] = hi_half[mem_num];
	    }
	  else
	    {
	      mem_op = op;
	      mem_num = num;
	      lo_half[num] = adjust_address (op, half_mode, 0);
	      hi_half[num] = adjust_address (op, half_mode, byte);
	    }
	}
      else
	{
	  lo_half[num] = simplify_gen_subreg (half_mode, op,
					      GET_MODE (op) == VOIDmode
					      ? mode : GET_MODE (op), 0);

	  rtx tmp = simplify_gen_subreg (half_mode, op,
					 GET_MODE (op) == VOIDmode
					 ? mode : GET_MODE (op), byte);
	  /* simplify_gen_subreg will return NULL RTX for the
	     high half of the paradoxical subreg.  */
	  hi_half[num] = tmp ? tmp : gen_reg_rtx (half_mode);
	}
    }
}
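
/* Illustrative usage sketch (not part of the original file; the names
   below are hypothetical): splitting one DImode register into its
   SImode halves.

     rtx lo[1], hi[1];
     rtx op = gen_reg_rtx (DImode);
     split_double_mode (DImode, &op, 1, lo, hi);

   afterwards lo[0] and hi[0] reference the low and high SImode words
   of OP.  */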
/* Emit the double word assignment DST = { LO, HI }.  */

void
split_double_concat (machine_mode mode, rtx dst, rtx lo, rtx hi)
{
  rtx dlo, dhi;
  int deleted_move_count = 0;
  split_double_mode (mode, &dst, 1, &dlo, &dhi);
  /* Constraints ensure that if both lo and hi are MEMs, then
     dst has early-clobber and thus addresses of MEMs don't use
     dlo/dhi registers.  Otherwise if at least one of lo and hi are MEMs,
     dlo/dhi are registers.  */
  if (MEM_P (lo)
      && rtx_equal_p (dlo, hi)
      && reg_overlap_mentioned_p (dhi, lo))
    {
      /* If dlo is same as hi and lo's address uses dhi register,
	 code below would first emit_move_insn (dhi, hi)
	 and then emit_move_insn (dlo, lo).  But the former
	 would invalidate lo's address.  Load into dhi first.  */
      emit_move_insn (dhi, lo);
      lo = dhi;
    }
  else if (MEM_P (hi)
	   && !rtx_equal_p (dlo, lo)
	   && reg_overlap_mentioned_p (dlo, hi))
    {
      /* In this case, code below would first emit_move_insn (dlo, lo)
	 and then emit_move_insn (dhi, hi).  But the former would
	 invalidate hi's address.  */
      if (rtx_equal_p (dhi, lo))
	{
	  /* We can't load into dhi first, so load into dlo
	     first and we'll swap.  */
	  emit_move_insn (dlo, hi);
	  hi = dlo;
	}
      else
	{
	  /* Load into dhi first.  */
	  emit_move_insn (dhi, hi);
	  hi = dhi;
	}
    }
  if (!rtx_equal_p (dlo, hi))
    {
      if (!rtx_equal_p (dlo, lo))
	emit_move_insn (dlo, lo);
      else
	deleted_move_count++;
      if (!rtx_equal_p (dhi, hi))
	emit_move_insn (dhi, hi);
      else
	deleted_move_count++;
    }
  else if (!rtx_equal_p (lo, dhi))
    {
      if (!rtx_equal_p (dhi, hi))
	emit_move_insn (dhi, hi);
      else
	deleted_move_count++;
      if (!rtx_equal_p (dlo, lo))
	emit_move_insn (dlo, lo);
      else
	deleted_move_count++;
    }
  else if (mode == TImode)
    emit_insn (gen_swapdi (dlo, dhi));
  else
    emit_insn (gen_swapsi (dlo, dhi));

  if (deleted_move_count == 2)
    emit_note (NOTE_INSN_DELETED);
}
/* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
   for the target.  */

void
ix86_expand_clear (rtx dest)
{
  rtx tmp;

  /* We play register width games, which are only valid after reload.  */
  gcc_assert (reload_completed);

  /* Avoid HImode and its attendant prefix byte.  */
  if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
    dest = gen_rtx_REG (SImode, REGNO (dest));
  tmp = gen_rtx_SET (dest, const0_rtx);

  if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
    {
      rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
    }

  emit_insn (tmp);
}
/* Return true if V can be broadcast from an integer of WIDTH bits
   which is returned in VAL_BROADCAST.  Otherwise, return false.  */

static bool
ix86_broadcast (HOST_WIDE_INT v, unsigned int width,
		HOST_WIDE_INT &val_broadcast)
{
  wide_int val = wi::uhwi (v, HOST_BITS_PER_WIDE_INT);
  val_broadcast = wi::extract_uhwi (val, 0, width);
  for (unsigned int i = width; i < HOST_BITS_PER_WIDE_INT; i += width)
    {
      HOST_WIDE_INT each = wi::extract_uhwi (val, i, width);
      if (val_broadcast != each)
	return false;
    }
  val_broadcast = sext_hwi (val_broadcast, width);
  return true;
}
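
/* Worked example (illustrative, not part of the original file): for
   v = 0x4242424242424242 and width = 8 every 8-bit chunk equals 0x42,
   so the function stores sext_hwi (0x42, 8) = 0x42 in VAL_BROADCAST
   and returns true; changing any single byte makes it return false.  */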
/* Convert the CONST_WIDE_INT operand OP to broadcast in MODE.  */

static rtx
ix86_convert_const_wide_int_to_broadcast (machine_mode mode, rtx op)
{
  /* Don't use integer vector broadcast if we can't move from GPR to SSE
     register directly.  */
  if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
    return nullptr;

  /* Convert CONST_WIDE_INT to a non-standard SSE constant integer
     broadcast only if vector broadcast is available.  */
  if (!TARGET_AVX
      || !CONST_WIDE_INT_P (op)
      || standard_sse_constant_p (op, mode)
      || (CONST_WIDE_INT_NUNITS (op) * HOST_BITS_PER_WIDE_INT
	  != GET_MODE_BITSIZE (mode)))
    return nullptr;

  HOST_WIDE_INT val = CONST_WIDE_INT_ELT (op, 0);
  HOST_WIDE_INT val_broadcast;
  scalar_int_mode broadcast_mode;
  if (TARGET_AVX2
      && ix86_broadcast (val, GET_MODE_BITSIZE (QImode),
			 val_broadcast))
    broadcast_mode = QImode;
  else if (TARGET_AVX2
	   && ix86_broadcast (val, GET_MODE_BITSIZE (HImode),
			      val_broadcast))
    broadcast_mode = HImode;
  else if (ix86_broadcast (val, GET_MODE_BITSIZE (SImode),
			   val_broadcast))
    broadcast_mode = SImode;
  else if (TARGET_64BIT
	   && ix86_broadcast (val, GET_MODE_BITSIZE (DImode),
			      val_broadcast))
    broadcast_mode = DImode;
  else
    return nullptr;

  /* Check if OP can be broadcast from VAL.  */
  for (int i = 1; i < CONST_WIDE_INT_NUNITS (op); i++)
    if (val != CONST_WIDE_INT_ELT (op, i))
      return nullptr;

  unsigned int nunits = (GET_MODE_SIZE (mode)
			 / GET_MODE_SIZE (broadcast_mode));
  machine_mode vector_mode;
  if (!mode_for_vector (broadcast_mode, nunits).exists (&vector_mode))
    gcc_unreachable ();
  rtx target = gen_reg_rtx (vector_mode);
  bool ok = ix86_expand_vector_init_duplicate (false, vector_mode,
					       target,
					       GEN_INT (val_broadcast));
  gcc_assert (ok);
  target = lowpart_subreg (mode, target, vector_mode);
  return target;
}
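
/* Illustrative effect (not part of the original file): with AVX2, a
   CONST_WIDE_INT such as a V4DImode constant whose 64-bit elements
   all equal 0x0101010101010101 is re-materialized as a broadcast of
   the QImode value 1 into a vector register, rather than being
   loaded from the constant pool.  */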
void
ix86_expand_move (machine_mode mode, rtx operands[])
{
  rtx op0 = operands[0], op1 = operands[1];
  rtx tmp, addend = NULL_RTX;
  enum tls_model model;

  /* Avoid complex sets of likely spilled hard registers before reload.  */
  if (!ix86_hardreg_mov_ok (op0, op1))
    {
      tmp = gen_reg_rtx (mode);
      operands[0] = tmp;
      ix86_expand_move (mode, operands);
      operands[0] = op0;
      operands[1] = tmp;
      op1 = tmp;
    }

  switch (GET_CODE (op1))
    {
    case CONST:
      tmp = XEXP (op1, 0);

      if (GET_CODE (tmp) != PLUS
	  || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
	break;

      op1 = XEXP (tmp, 0);
      addend = XEXP (tmp, 1);
      /* FALLTHRU */

    case SYMBOL_REF:
      model = SYMBOL_REF_TLS_MODEL (op1);

      if (model)
	op1 = legitimize_tls_address (op1, model, true);
      else if (ix86_force_load_from_GOT_p (op1))
	{
	  /* Load the external function address via GOT slot to avoid PLT.  */
	  op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
				UNSPEC_GOTPCREL);
	  op1 = gen_rtx_CONST (Pmode, op1);
	  op1 = gen_const_mem (Pmode, op1);
	  set_mem_alias_set (op1, ix86_GOT_alias_set ());
	}
      else
	{
	  tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
	  if (tmp)
	    {
	      op1 = tmp;
	      if (!addend)
		break;
	    }
	}

      if (addend)
	{
	  op1 = force_operand (op1, NULL_RTX);
	  op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
				     op0, 1, OPTAB_DIRECT);
	}
      else
	op1 = force_operand (op1, op0);

      if (op1 == op0)
	return;

      if (GET_MODE (op1) != mode)
	op1 = convert_to_mode (mode, op1, 1);
      break;

    default:
      break;
    }

  /* Transform TImode paradoxical SUBREG into zero_extendditi2.  */
  if (TARGET_64BIT && mode == TImode
      && SUBREG_P (op1)
      && GET_MODE (SUBREG_REG (op1)) == DImode
      && SUBREG_BYTE (op1) == 0)
    op1 = gen_rtx_ZERO_EXTEND (TImode, SUBREG_REG (op1));

  if ((flag_pic || MACHOPIC_INDIRECT)
      && symbolic_operand (op1, mode))
    {
      if (TARGET_MACHO && !TARGET_64BIT)
	{
#if TARGET_MACHO
	  if (MACHOPIC_INDIRECT)
	    {
	      rtx temp = (op0 && REG_P (op0) && mode == Pmode)
			 ? op0 : gen_reg_rtx (Pmode);
	      op1 = machopic_indirect_data_reference (op1, temp);
	      if (MACHOPIC_PURE)
		op1 = machopic_legitimize_pic_address (op1, mode,
						       temp == op1 ? 0 : temp);
	    }
	  if (op0 != op1 && GET_CODE (op0) != MEM)
	    {
	      rtx insn = gen_rtx_SET (op0, op1);
	      emit_insn (insn);
	      return;
	    }
	  if (GET_CODE (op0) == MEM)
	    op1 = force_reg (Pmode, op1);
	  else
	    {
	      rtx temp = op0;
	      if (GET_CODE (temp) != REG)
		temp = gen_reg_rtx (Pmode);
	      temp = legitimize_pic_address (op1, temp);
	      if (temp == op0)
		return;
	      op1 = temp;
	    }
#endif
	}
      else
	{
	  if (MEM_P (op0))
	    op1 = force_reg (mode, op1);
	  else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
	    {
	      rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
	      op1 = legitimize_pic_address (op1, reg);
	      if (op0 == op1)
		return;
	      if (GET_MODE (op1) != mode)
		op1 = convert_to_mode (mode, op1, 1);
	    }
	}
    }
  else
    {
      if (MEM_P (op0)
	  && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
	      || !push_operand (op0, mode))
	  && MEM_P (op1))
	op1 = force_reg (mode, op1);

      if (push_operand (op0, mode)
	  && ! general_no_elim_operand (op1, mode))
	op1 = copy_to_mode_reg (mode, op1);

      /* Force large constants in 64bit compilation into register
	 to get them CSEed.  */
      if (can_create_pseudo_p ()
	  && (mode == DImode) && TARGET_64BIT
	  && immediate_operand (op1, mode)
	  && !x86_64_zext_immediate_operand (op1, VOIDmode)
	  && !register_operand (op0, mode)
	  && optimize)
	op1 = copy_to_mode_reg (mode, op1);

      if (can_create_pseudo_p ())
	{
	  if (CONST_DOUBLE_P (op1))
	    {
	      /* If we are loading a floating point constant to a
		 register, force the value to memory now, since we'll
		 get better code out the back end.  */

	      op1 = validize_mem (force_const_mem (mode, op1));
	      if (!register_operand (op0, mode))
		{
		  rtx temp = gen_reg_rtx (mode);
		  emit_insn (gen_rtx_SET (temp, op1));
		  emit_move_insn (op0, temp);
		  return;
		}
	    }
	  else if (CONST_WIDE_INT_P (op1)
		   && GET_MODE_SIZE (mode) >= 16)
	    {
	      rtx tmp = ix86_convert_const_wide_int_to_broadcast
		(GET_MODE (op0), op1);
	      if (tmp != nullptr)
		op1 = tmp;
	    }
	}
    }

  /* Special case inserting 64-bit values into a TImode register.  */
  if (TARGET_64BIT
      /* Disable for -O0 (see PR110587) unless naked (PR110533).  */
      && (optimize || ix86_function_naked (current_function_decl))
      && (mode == DImode || mode == DFmode)
      && SUBREG_P (op0)
      && GET_MODE (SUBREG_REG (op0)) == TImode
      && REG_P (SUBREG_REG (op0))
      && REG_P (op1))
    {
      /* Use *insvti_lowpart_1 to set lowpart.  */
      if (SUBREG_BYTE (op0) == 0)
	{
	  wide_int mask = wi::mask (64, true, 128);
	  rtx tmp = immed_wide_int_const (mask, TImode);
	  op0 = SUBREG_REG (op0);
	  tmp = gen_rtx_AND (TImode, copy_rtx (op0), tmp);
	  op1 = gen_lowpart (DImode, op1);
	  op1 = gen_rtx_ZERO_EXTEND (TImode, op1);
	  op1 = gen_rtx_IOR (TImode, tmp, op1);
	}
      /* Use *insvti_highpart_1 to set highpart.  */
      else if (SUBREG_BYTE (op0) == 8)
	{
	  wide_int mask = wi::mask (64, false, 128);
	  rtx tmp = immed_wide_int_const (mask, TImode);
	  op0 = SUBREG_REG (op0);
	  tmp = gen_rtx_AND (TImode, copy_rtx (op0), tmp);
	  op1 = gen_lowpart (DImode, op1);
	  op1 = gen_rtx_ZERO_EXTEND (TImode, op1);
	  op1 = gen_rtx_ASHIFT (TImode, op1, GEN_INT (64));
	  op1 = gen_rtx_IOR (TImode, tmp, op1);
	}
    }

  emit_insn (gen_rtx_SET (op0, op1));
}
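
/* Illustrative example (not part of the original file): for a write
   of a DImode value into the low half of an __int128 object held in
   a TImode register, the SUBREG_BYTE == 0 path above rewrites the
   move as

     (set (reg:TI r) (ior:TI (and:TI (reg:TI r) (const_wide_int ...))
			     (zero_extend:TI (reg:DI x))))

   which matches the *insvti_lowpart_1 pattern instead of forcing the
   TImode register to be split.  */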
/* OP is a memref of CONST_VECTOR, return scalar constant mem
   if CONST_VECTOR is a vec_duplicate, else return NULL.  */

static rtx
ix86_broadcast_from_constant (machine_mode mode, rtx op)
{
  int nunits = GET_MODE_NUNITS (mode);
  if (nunits < 2)
    return nullptr;

  /* Don't use integer vector broadcast if we can't move from GPR to SSE
     register directly.  */
  if (!TARGET_INTER_UNIT_MOVES_TO_VEC
      && INTEGRAL_MODE_P (mode))
    return nullptr;

  /* Convert CONST_VECTOR to a non-standard SSE constant integer
     broadcast only if vector broadcast is available.  */
  if (!(TARGET_AVX2
	|| (TARGET_AVX
	    && (GET_MODE_INNER (mode) == SImode
		|| GET_MODE_INNER (mode) == DImode))
	|| FLOAT_MODE_P (mode))
      || standard_sse_constant_p (op, mode))
    return nullptr;

  /* Don't broadcast from a 64-bit integer constant in 32-bit mode.
     We can still put 64-bit integer constant in memory when
     avx512 embed broadcast is available.  */
  if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT
      && (!TARGET_AVX512F
	  || (GET_MODE_SIZE (mode) < 64 && !TARGET_AVX512VL)))
    return nullptr;

  if (GET_MODE_INNER (mode) == TImode)
    return nullptr;

  rtx constant = get_pool_constant (XEXP (op, 0));
  if (GET_CODE (constant) != CONST_VECTOR)
    return nullptr;

  /* There could be some rtx like
     (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
     but with "*.LC1" refer to V2DI constant vector.  */
  if (GET_MODE (constant) != mode)
    {
      constant = simplify_subreg (mode, constant, GET_MODE (constant),
				  0);
      if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR)
	return nullptr;
    }

  rtx first = XVECEXP (constant, 0, 0);

  for (int i = 1; i < nunits; ++i)
    {
      rtx tmp = XVECEXP (constant, 0, i);
      /* Vector duplicate value.  */
      if (!rtx_equal_p (tmp, first))
	return nullptr;
    }

  return first;
}
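
/* Illustrative example (not part of the original file): a V4SImode
   constant-pool load of {4, 4, 4, 4} returns the scalar
   (const_int 4) here, letting the caller emit a broadcast of a
   single element instead of loading the full 16-byte vector
   constant.  */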
void
ix86_expand_vector_move (machine_mode mode, rtx operands[])
{
  rtx op0 = operands[0], op1 = operands[1];
  /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
     psABI since the biggest alignment is 4 byte for IA MCU psABI.  */
  unsigned int align = (TARGET_IAMCU
			? GET_MODE_BITSIZE (mode)
			: GET_MODE_ALIGNMENT (mode));

  if (push_operand (op0, VOIDmode))
    op0 = emit_move_resolve_push (mode, op0);

  /* Force constants other than zero into memory.  We do not know how
     the instructions used to build constants modify the upper 64 bits
     of the register, once we have that information we may be able
     to handle some of them more efficiently.  */
  if (can_create_pseudo_p ()
      && (CONSTANT_P (op1)
	  || (SUBREG_P (op1)
	      && CONSTANT_P (SUBREG_REG (op1))))
      && ((register_operand (op0, mode)
	   && !standard_sse_constant_p (op1, mode))
	  /* ix86_expand_vector_move_misalign() does not like constants.  */
	  || (SSE_REG_MODE_P (mode)
	      && MEM_P (op0)
	      && MEM_ALIGN (op0) < align)))
    {
      if (SUBREG_P (op1))
	{
	  machine_mode imode = GET_MODE (SUBREG_REG (op1));
	  rtx r = force_const_mem (imode, SUBREG_REG (op1));
	  if (r)
	    r = validize_mem (r);
	  else
	    r = force_reg (imode, SUBREG_REG (op1));
	  op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
	}
      else
	{
	  machine_mode mode = GET_MODE (op0);
	  rtx tmp = ix86_convert_const_wide_int_to_broadcast
	    (mode, op1);
	  if (tmp == nullptr)
	    op1 = validize_mem (force_const_mem (mode, op1));
	  else
	    op1 = tmp;
	}
    }

  if (can_create_pseudo_p ()
      && GET_MODE_SIZE (mode) >= 16
      && VECTOR_MODE_P (mode)
      && (MEM_P (op1)
	  && SYMBOL_REF_P (XEXP (op1, 0))
	  && CONSTANT_POOL_ADDRESS_P (XEXP (op1, 0))))
    {
      rtx first = ix86_broadcast_from_constant (mode, op1);
      if (first != nullptr)
	{
	  /* Broadcast to XMM/YMM/ZMM register from an integer
	     constant or scalar mem.  */
	  op1 = gen_reg_rtx (mode);
	  if (FLOAT_MODE_P (mode)
	      || (!TARGET_64BIT && GET_MODE_INNER (mode) == DImode))
	    first = force_const_mem (GET_MODE_INNER (mode), first);
	  bool ok = ix86_expand_vector_init_duplicate (false, mode,
						       op1, first);
	  gcc_assert (ok);
	  emit_move_insn (op0, op1);
	  return;
	}
    }

  /* We need to check memory alignment for SSE mode since attribute
     can make operands unaligned.  */
  if (can_create_pseudo_p ()
      && SSE_REG_MODE_P (mode)
      && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
	  || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
    {
      rtx tmp[2];

      /* ix86_expand_vector_move_misalign() does not like both
	 arguments in memory.  */
      if (!register_operand (op0, mode)
	  && !register_operand (op1, mode))
	{
	  rtx scratch = gen_reg_rtx (mode);
	  emit_move_insn (scratch, op1);
	  op1 = scratch;
	}

      tmp[0] = op0; tmp[1] = op1;
      ix86_expand_vector_move_misalign (mode, tmp);
      return;
    }

  /* Special case TImode to 128-bit vector conversions via V2DI.  */
  if (VECTOR_MODE_P (mode)
      && GET_MODE_SIZE (mode) == 16
      && SUBREG_P (op1)
      && GET_MODE (SUBREG_REG (op1)) == TImode
      && TARGET_64BIT && TARGET_SSE
      && can_create_pseudo_p ())
    {
      rtx tmp = gen_reg_rtx (V2DImode);
      rtx lo = gen_reg_rtx (DImode);
      rtx hi = gen_reg_rtx (DImode);
      emit_move_insn (lo, gen_lowpart (DImode, SUBREG_REG (op1)));
      emit_move_insn (hi, gen_highpart (DImode, SUBREG_REG (op1)));
      emit_insn (gen_vec_concatv2di (tmp, lo, hi));
      emit_move_insn (op0, gen_lowpart (mode, tmp));
      return;
    }

  /* If operand0 is a hard register, make operand1 a pseudo.  */
  if (can_create_pseudo_p ()
      && !ix86_hardreg_mov_ok (op0, op1))
    {
      rtx tmp = gen_reg_rtx (GET_MODE (op0));
      emit_move_insn (tmp, op1);
      emit_move_insn (op0, tmp);
      return;
    }

  /* Make operand1 a register if it isn't already.  */
  if (can_create_pseudo_p ()
      && !register_operand (op0, mode)
      && !register_operand (op1, mode))
    {
      rtx tmp = gen_reg_rtx (GET_MODE (op0));
      emit_move_insn (tmp, op1);
      emit_move_insn (op0, tmp);
      return;
    }

  emit_insn (gen_rtx_SET (op0, op1));
}
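
/* Illustrative example (not part of the original file): given

     typedef int v4si __attribute__ ((vector_size (16), aligned (4)));

   a v4si access only guarantees 4-byte alignment, so the
   MEM_ALIGN < align test above routes the move through
   ix86_expand_vector_move_misalign (), which emits unaligned
   (movups/movdqu style) accesses instead of movaps.  */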
/* Split 32-byte AVX unaligned load and store if needed.  */

static void
ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
{
  rtx m;
  rtx (*extract) (rtx, rtx, rtx);
  machine_mode mode;

  if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
      || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  rtx orig_op0 = NULL_RTX;
  mode = GET_MODE (op0);
  switch (GET_MODE_CLASS (mode))
    {
    case MODE_VECTOR_INT:
    case MODE_INT:
      if (mode != V32QImode)
	{
	  if (!MEM_P (op0))
	    {
	      orig_op0 = op0;
	      op0 = gen_reg_rtx (V32QImode);
	    }
	  else
	    op0 = gen_lowpart (V32QImode, op0);
	  op1 = gen_lowpart (V32QImode, op1);
	  mode = V32QImode;
	}
      break;
    case MODE_VECTOR_FLOAT:
      break;
    default:
      gcc_unreachable ();
    }

  switch (mode)
    {
    default:
      gcc_unreachable ();
    case E_V32QImode:
      extract = gen_avx_vextractf128v32qi;
      mode = V16QImode;
      break;
    case E_V16BFmode:
      extract = gen_avx_vextractf128v16bf;
      mode = V8BFmode;
      break;
    case E_V16HFmode:
      extract = gen_avx_vextractf128v16hf;
      mode = V8HFmode;
      break;
    case E_V8SFmode:
      extract = gen_avx_vextractf128v8sf;
      mode = V4SFmode;
      break;
    case E_V4DFmode:
      extract = gen_avx_vextractf128v4df;
      mode = V2DFmode;
      break;
    }

  if (MEM_P (op1))
    {
      rtx r = gen_reg_rtx (mode);
      m = adjust_address (op1, mode, 0);
      emit_move_insn (r, m);
      m = adjust_address (op1, mode, 16);
      r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
      emit_move_insn (op0, r);
    }
  else if (MEM_P (op0))
    {
      m = adjust_address (op0, mode, 0);
      emit_insn (extract (m, op1, const0_rtx));
      m = adjust_address (op0, mode, 16);
      emit_insn (extract (m, copy_rtx (op1), const1_rtx));
    }
  else
    gcc_unreachable ();

  if (orig_op0)
    emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
}
/* Implement the movmisalign patterns for SSE.  Non-SSE modes go
   straight to ix86_expand_vector_move.  */
/* Code generation for scalar reg-reg moves of single and double precision data:
     if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
       movaps reg, reg
     else
       movss reg, reg
     if (x86_sse_partial_reg_dependency == true)
       movapd reg, reg
     else
       movsd reg, reg

   Code generation for scalar loads of double precision data:
     if (x86_sse_split_regs == true)
       movlpd mem, reg      (gas syntax)
     else
       movsd mem, reg

   Code generation for unaligned packed loads of single precision data
   (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
     if (x86_sse_unaligned_move_optimal)
       movups mem, reg

     if (x86_sse_partial_reg_dependency == true)
       {
	 xorps  reg, reg
	 movlps mem, reg
	 movhps mem+8, reg
       }
     else
       {
	 movlps mem, reg
	 movhps mem+8, reg
       }

   Code generation for unaligned packed loads of double precision data
   (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
     if (x86_sse_unaligned_move_optimal)
       movupd mem, reg

     if (x86_sse_split_regs == true)
       {
	 movlpd mem, reg
	 movhpd mem+8, reg
       }
     else
       {
	 movsd  mem, reg
	 movhpd mem+8, reg
       }
 */
void
ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
{
  rtx op0 = operands[0], op1 = operands[1], m;

  /* Use unaligned load/store for AVX512 or when optimizing for size.  */
  if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  if (TARGET_AVX)
    {
      if (GET_MODE_SIZE (mode) == 32)
	ix86_avx256_split_vector_move_misalign (op0, op1);
      else
	/* Always use 128-bit mov<mode>_internal pattern for AVX.  */
	emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
      || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  /* ??? If we have typed data, then it would appear that using
     movdqu is the only way to get unaligned data loaded with
     integer type.  */
  if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  if (MEM_P (op1))
    {
      if (TARGET_SSE2 && mode == V2DFmode)
	{
	  rtx zero;

	  /* When SSE registers are split into halves, we can avoid
	     writing to the top half twice.  */
	  if (TARGET_SSE_SPLIT_REGS)
	    {
	      emit_clobber (op0);
	      zero = op0;
	    }
	  else
	    {
	      /* ??? Not sure about the best option for the Intel chips.
		 The following would seem to satisfy; the register is
		 entirely cleared, breaking the dependency chain.  We
		 then store to the upper half, with a dependency depth
		 of one.  A rumor has it that Intel recommends two movsd
		 followed by an unpacklpd, but this is unconfirmed.  And
		 given that the dependency depth of the unpacklpd would
		 still be one, I'm not sure why this would be better.  */
	      zero = CONST0_RTX (V2DFmode);
	    }

	  m = adjust_address (op1, DFmode, 0);
	  emit_insn (gen_sse2_loadlpd (op0, zero, m));
	  m = adjust_address (op1, DFmode, 8);
	  emit_insn (gen_sse2_loadhpd (op0, op0, m));
	}
      else
	{
	  rtx t;

	  if (mode != V4SFmode)
	    t = gen_reg_rtx (V4SFmode);
	  else
	    t = op0;

	  if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
	    emit_move_insn (t, CONST0_RTX (V4SFmode));
	  else
	    emit_clobber (t);

	  m = adjust_address (op1, V2SFmode, 0);
	  emit_insn (gen_sse_loadlps (t, t, m));
	  m = adjust_address (op1, V2SFmode, 8);
	  emit_insn (gen_sse_loadhps (t, t, m));
	  if (mode != V4SFmode)
	    emit_move_insn (op0, gen_lowpart (mode, t));
	}
    }
  else if (MEM_P (op0))
    {
      if (TARGET_SSE2 && mode == V2DFmode)
	{
	  m = adjust_address (op0, DFmode, 0);
	  emit_insn (gen_sse2_storelpd (m, op1));
	  m = adjust_address (op0, DFmode, 8);
	  emit_insn (gen_sse2_storehpd (m, op1));
	}
      else
	{
	  if (mode != V4SFmode)
	    op1 = gen_lowpart (V4SFmode, op1);

	  m = adjust_address (op0, V2SFmode, 0);
	  emit_insn (gen_sse_storelps (m, op1));
	  m = adjust_address (op0, V2SFmode, 8);
	  emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
	}
    }
  else
    gcc_unreachable ();
}
/* Move bits 64:95 to bits 32:63.  */

static void
ix86_move_vector_high_sse_to_mmx (rtx op)
{
  rtx mask = gen_rtx_PARALLEL (VOIDmode,
			       gen_rtvec (4, GEN_INT (0), GEN_INT (2),
					  GEN_INT (0), GEN_INT (0)));
  rtx dest = lowpart_subreg (V4SImode, op, GET_MODE (op));
  op = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
  rtx insn = gen_rtx_SET (dest, op);
  emit_insn (insn);
}
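
/* Illustrative note (not part of the original file): the vec_select
   uses indices {0, 2, 0, 0}, i.e. a pshufd copying V4SI element 2
   (bits 64:95) into element 1 (bits 32:63), so a 64-bit MMX-style
   result ends up contiguous in the low half of the SSE register.  */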
/* Split MMX pack with signed/unsigned saturation with SSE/SSE2.  */

void
ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
{
  rtx op0 = operands[0];
  rtx op1 = operands[1];
  rtx op2 = operands[2];
  rtx src;

  machine_mode dmode = GET_MODE (op0);
  machine_mode smode = GET_MODE (op1);
  machine_mode inner_dmode = GET_MODE_INNER (dmode);
  machine_mode inner_smode = GET_MODE_INNER (smode);

  /* Get the corresponding SSE mode for destination.  */
  int nunits = 16 / GET_MODE_SIZE (inner_dmode);
  machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode),
					    nunits).require ();
  machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode),
						 nunits / 2).require ();

  /* Get the corresponding SSE mode for source.  */
  nunits = 16 / GET_MODE_SIZE (inner_smode);
  machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode),
					    nunits).require ();

  /* Generate SSE pack with signed/unsigned saturation.  */
  rtx dest = lowpart_subreg (sse_dmode, op0, GET_MODE (op0));
  op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
  op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));

  /* packusdw/packuswb does unsigned saturation of a signed source
     which is different from generic us_truncate RTX.  */
  if (code == US_TRUNCATE)
    src = gen_rtx_UNSPEC (sse_dmode,
			  gen_rtvec (2, op1, op2),
			  UNSPEC_US_TRUNCATE);
  else
    {
      op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
      op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
      src = gen_rtx_VEC_CONCAT (sse_dmode, op1, op2);
    }

  emit_move_insn (dest, src);

  ix86_move_vector_high_sse_to_mmx (op0);
}
/* Split MMX punpcklXX/punpckhXX with SSE punpcklXX.  */

void
ix86_split_mmx_punpck (rtx operands[], bool high_p)
{
  rtx op0 = operands[0];
  rtx op1 = operands[1];
  rtx op2 = operands[2];
  machine_mode mode = GET_MODE (op0);
  rtx mask;
  /* The corresponding SSE mode.  */
  machine_mode sse_mode, double_sse_mode;

  switch (mode)
    {
    case E_V8QImode:
    case E_V4QImode:
    case E_V2QImode:
      sse_mode = V16QImode;
      double_sse_mode = V32QImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
			       gen_rtvec (16,
					  GEN_INT (0), GEN_INT (16),
					  GEN_INT (1), GEN_INT (17),
					  GEN_INT (2), GEN_INT (18),
					  GEN_INT (3), GEN_INT (19),
					  GEN_INT (4), GEN_INT (20),
					  GEN_INT (5), GEN_INT (21),
					  GEN_INT (6), GEN_INT (22),
					  GEN_INT (7), GEN_INT (23)));
      break;

    case E_V4HImode:
    case E_V2HImode:
      sse_mode = V8HImode;
      double_sse_mode = V16HImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
			       gen_rtvec (8,
					  GEN_INT (0), GEN_INT (8),
					  GEN_INT (1), GEN_INT (9),
					  GEN_INT (2), GEN_INT (10),
					  GEN_INT (3), GEN_INT (11)));
      break;

    case E_V2SImode:
      sse_mode = V4SImode;
      double_sse_mode = V8SImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
			       gen_rtvec (4,
					  GEN_INT (0), GEN_INT (4),
					  GEN_INT (1), GEN_INT (5)));
      break;

    case E_V2SFmode:
      sse_mode = V4SFmode;
      double_sse_mode = V8SFmode;
      mask = gen_rtx_PARALLEL (VOIDmode,
			       gen_rtvec (4,
					  GEN_INT (0), GEN_INT (4),
					  GEN_INT (1), GEN_INT (5)));
      break;

    default:
      gcc_unreachable ();
    }

  /* Generate SSE punpcklXX.  */
  rtx dest = lowpart_subreg (sse_mode, op0, GET_MODE (op0));
  op1 = lowpart_subreg (sse_mode, op1, GET_MODE (op1));
  op2 = lowpart_subreg (sse_mode, op2, GET_MODE (op2));

  op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2);
  op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask);
  rtx insn = gen_rtx_SET (dest, op2);
  emit_insn (insn);

  /* Move high bits to low bits.  */
  if (high_p)
    {
      if (sse_mode == V4SFmode)
	{
	  mask = gen_rtx_PARALLEL (VOIDmode,
				   gen_rtvec (4, GEN_INT (2), GEN_INT (3),
					      GEN_INT (4), GEN_INT (5)));
	  op2 = gen_rtx_VEC_CONCAT (V8SFmode, dest, dest);
	  op1 = gen_rtx_VEC_SELECT (V4SFmode, op2, mask);
	}
      else
	{
	  int sz = GET_MODE_SIZE (mode);

	  if (sz == 4)
	    mask = gen_rtx_PARALLEL (VOIDmode,
				     gen_rtvec (4, GEN_INT (1), GEN_INT (0),
						GEN_INT (0), GEN_INT (1)));
	  else if (sz == 8)
	    mask = gen_rtx_PARALLEL (VOIDmode,
				     gen_rtvec (4, GEN_INT (2), GEN_INT (3),
						GEN_INT (0), GEN_INT (1)));
	  else
	    gcc_unreachable ();

	  dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
	  op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
	}

      insn = gen_rtx_SET (dest, op1);
      emit_insn (insn);
    }
}
/* Helper function of ix86_fixup_binary_operands to canonicalize
   operand order.  Returns true if the operands should be swapped.  */

static bool
ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
			     rtx operands[])
{
  rtx dst = operands[0];
  rtx src1 = operands[1];
  rtx src2 = operands[2];

  /* If the operation is not commutative, we can't do anything.  */
  if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
      && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
    return false;

  /* Highest priority is that src1 should match dst.  */
  if (rtx_equal_p (dst, src1))
    return false;
  if (rtx_equal_p (dst, src2))
    return true;

  /* Next highest priority is that immediate constants come second.  */
  if (immediate_operand (src2, mode))
    return false;
  if (immediate_operand (src1, mode))
    return true;

  /* Lowest priority is that memory references should come second.  */
  if (MEM_P (src2))
    return false;
  if (MEM_P (src1))
    return true;

  return false;
}
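
/* Illustrative example (not part of the original file): expanding
   (set (reg 100) (plus:SI (mem:SI X) (reg 100))) hits the
   "src2 matches dst" rule above and requests a swap, giving
   (plus:SI (reg 100) (mem:SI X)), which satisfies the two-address
   requirement that src1 match the destination without an extra
   copy.  */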
/* Fix up OPERANDS to satisfy ix86_binary_operator_ok.  Return the
   destination to use for the operation.  If different from the true
   destination in operands[0], a copy operation will be required.  */

rtx
ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
			    rtx operands[])
{
  rtx dst = operands[0];
  rtx src1 = operands[1];
  rtx src2 = operands[2];

  /* Canonicalize operand order.  */
  if (ix86_swap_binary_operands_p (code, mode, operands))
    {
      /* It is invalid to swap operands of different modes.  */
      gcc_assert (GET_MODE (src1) == GET_MODE (src2));

      std::swap (src1, src2);
    }

  /* Both source operands cannot be in memory.  */
  if (MEM_P (src1) && MEM_P (src2))
    {
      /* Optimization: Only read from memory once.  */
      if (rtx_equal_p (src1, src2))
	{
	  src2 = force_reg (mode, src2);
	  src1 = src2;
	}
      else if (rtx_equal_p (dst, src1))
	src2 = force_reg (mode, src2);
      else
	src1 = force_reg (mode, src1);
    }

  /* If the destination is memory, and we do not have matching source
     operands, do things in registers.  */
  if (MEM_P (dst) && !rtx_equal_p (dst, src1))
    dst = gen_reg_rtx (mode);

  /* Source 1 cannot be a constant.  */
  if (CONSTANT_P (src1))
    src1 = force_reg (mode, src1);

  /* Source 1 cannot be a non-matching memory.  */
  if (MEM_P (src1) && !rtx_equal_p (dst, src1))
    src1 = force_reg (mode, src1);

  /* Improve address combine.  */
  if (code == PLUS
      && GET_MODE_CLASS (mode) == MODE_INT
      && MEM_P (src2))
    src2 = force_reg (mode, src2);

  operands[1] = src1;
  operands[2] = src2;
  return dst;
}
/* Similarly, but assume that the destination has already been
   set up properly.  */

void
ix86_fixup_binary_operands_no_copy (enum rtx_code code,
				    machine_mode mode, rtx operands[])
{
  rtx dst = ix86_fixup_binary_operands (code, mode, operands);
  gcc_assert (dst == operands[0]);
}
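
/* Usage sketch (illustrative, not part of the original file): a
   binary expander typically calls

     ix86_fixup_binary_operands_no_copy (PLUS, SImode, operands);

   before emitting its insn; the assert documents the caller's claim
   that no separate copy to operands[0] can be required.  */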
/* Attempt to expand a binary operator.  Make the expansion closer to the
   actual machine, than just general_operand, which will allow 3 separate
   memory references (one output, two input) in a single insn.  */

void
ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
			     rtx operands[])
{
  rtx src1, src2, dst, op, clob;

  dst = ix86_fixup_binary_operands (code, mode, operands);
  src1 = operands[1];
  src2 = operands[2];

  /* Emit the instruction.  */

  op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));

  if (reload_completed
      && code == PLUS
      && !rtx_equal_p (dst, src1))
    {
      /* This is going to be an LEA; avoid splitting it later.  */
      emit_insn (op);
    }
  else
    {
      clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
    }

  /* Fix up the destination if needed.  */
  if (dst != operands[0])
    emit_move_insn (operands[0], dst);
}
/* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
   the given OPERANDS.  */

void
ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
				     rtx operands[])
{
  rtx op1 = NULL_RTX, op2 = NULL_RTX;
  if (SUBREG_P (operands[1]))
    {
      op1 = operands[1];
      op2 = operands[2];
    }
  else if (SUBREG_P (operands[2]))
    {
      op1 = operands[2];
      op2 = operands[1];
    }
  /* Optimize (__m128i) d | (__m128i) e and similar code
     when d and e are float vectors into float vector logical
     insn.  In C/C++ without using intrinsics there is no other way
     to express vector logical operation on float vectors than
     to cast them temporarily to integer vectors.  */
  if (op1
      && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
      && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
      && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
      && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
      && SUBREG_BYTE (op1) == 0
      && (GET_CODE (op2) == CONST_VECTOR
	  || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
	      && SUBREG_BYTE (op2) == 0))
      && can_create_pseudo_p ())
    {
      rtx dst;
      switch (GET_MODE (SUBREG_REG (op1)))
	{
	case E_V4SFmode:
	case E_V8SFmode:
	case E_V16SFmode:
	case E_V2DFmode:
	case E_V4DFmode:
	case E_V8DFmode:
	  dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
	  if (GET_CODE (op2) == CONST_VECTOR)
	    {
	      op2 = gen_lowpart (GET_MODE (dst), op2);
	      op2 = force_reg (GET_MODE (dst), op2);
	    }
	  else
	    {
	      op2 = SUBREG_REG (operands[2]);
	      if (!vector_operand (op2, GET_MODE (dst)))
		op2 = force_reg (GET_MODE (dst), op2);
	    }
	  op1 = SUBREG_REG (op1);
	  if (!vector_operand (op1, GET_MODE (dst)))
	    op1 = force_reg (GET_MODE (dst), op1);
	  emit_insn (gen_rtx_SET (dst,
				  gen_rtx_fmt_ee (code, GET_MODE (dst),
						  op1, op2)));
	  emit_move_insn (operands[0], gen_lowpart (mode, dst));
	  return;
	default:
	  break;
	}
    }
  if (!vector_operand (operands[1], mode))
    operands[1] = force_reg (mode, operands[1]);
  if (!vector_operand (operands[2], mode))
    operands[2] = force_reg (mode, operands[2]);
  ix86_fixup_binary_operands_no_copy (code, mode, operands);
  emit_insn (gen_rtx_SET (operands[0],
			  gen_rtx_fmt_ee (code, mode, operands[1],
					  operands[2])));
}
/* Return TRUE or FALSE depending on whether the binary operator meets the
   appropriate constraints.  */

bool
ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
			 rtx operands[])
{
  rtx dst = operands[0];
  rtx src1 = operands[1];
  rtx src2 = operands[2];

  /* Both source operands cannot be in memory.  */
  if ((MEM_P (src1) || bcst_mem_operand (src1, mode))
      && (MEM_P (src2) || bcst_mem_operand (src2, mode)))
    return false;

  /* Canonicalize operand order for commutative operators.  */
  if (ix86_swap_binary_operands_p (code, mode, operands))
    std::swap (src1, src2);

  /* If the destination is memory, we must have a matching source operand.  */
  if (MEM_P (dst) && !rtx_equal_p (dst, src1))
    return false;

  /* Source 1 cannot be a constant.  */
  if (CONSTANT_P (src1))
    return false;

  /* Source 1 cannot be a non-matching memory.  */
  if (MEM_P (src1) && !rtx_equal_p (dst, src1))
    /* Support "andhi/andsi/anddi" as a zero-extending move.  */
    return (code == AND
	    && (mode == HImode
		|| mode == SImode
		|| (TARGET_64BIT && mode == DImode))
	    && satisfies_constraint_L (src2));

  return true;
}
/* Attempt to expand a unary operator.  Make the expansion closer to the
   actual machine, than just general_operand, which will allow 2 separate
   memory references (one output, one input) in a single insn.  */

void
ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
			    rtx operands[])
{
  bool matching_memory = false;
  rtx src, dst, op, clob;

  dst = operands[0];
  src = operands[1];

  /* If the destination is memory, and we do not have matching source
     operands, do things in registers.  */
  if (MEM_P (dst))
    {
      if (rtx_equal_p (dst, src))
	matching_memory = true;
      else
	dst = gen_reg_rtx (mode);
    }

  /* When source operand is memory, destination must match.  */
  if (MEM_P (src) && !matching_memory)
    src = force_reg (mode, src);

  /* Emit the instruction.  */

  op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));

  if (code == NOT)
    emit_insn (op);
  else
    {
      clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
    }

  /* Fix up the destination if needed.  */
  if (dst != operands[0])
    emit_move_insn (operands[0], dst);
}
/* Predict just emitted jump instruction to be taken with probability PROB.  */

static void
predict_jump (int prob)
{
  rtx_insn *insn = get_last_insn ();
  gcc_assert (JUMP_P (insn));
  add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
}
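
/* Usage sketch (illustrative, not part of the original file): right
   after emit_jump_insn (...), a branch assumed to be taken half the
   time is annotated with

     predict_jump (REG_BR_PROB_BASE * 50 / 100);

   exactly as ix86_split_idivmod does below.  */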
/* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
   divisor are within the range [0-255].  */

void
ix86_split_idivmod (machine_mode mode, rtx operands[],
		    bool unsigned_p)
{
  rtx_code_label *end_label, *qimode_label;
  rtx div, mod;
  rtx_insn *insn;
  rtx scratch, tmp0, tmp1, tmp2;
  rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);

  operands[2] = force_reg (mode, operands[2]);
  operands[3] = force_reg (mode, operands[3]);

  switch (mode)
    {
    case E_SImode:
      if (GET_MODE (operands[0]) == SImode)
	{
	  if (GET_MODE (operands[1]) == SImode)
	    gen_divmod4_1 = unsigned_p ? gen_udivmodsi4_1 : gen_divmodsi4_1;
	  else
	    gen_divmod4_1
	      = unsigned_p ? gen_udivmodsi4_zext_2 : gen_divmodsi4_zext_2;
	}
      else
	gen_divmod4_1
	  = unsigned_p ? gen_udivmodsi4_zext_1 : gen_divmodsi4_zext_1;
      break;
    case E_DImode:
      gen_divmod4_1 = unsigned_p ? gen_udivmoddi4_1 : gen_divmoddi4_1;
      break;
    default:
      gcc_unreachable ();
    }

  end_label = gen_label_rtx ();
  qimode_label = gen_label_rtx ();

  scratch = gen_reg_rtx (mode);

  /* Use 8bit unsigned divmod if dividend and divisor are within
     the range [0-255].  */
  emit_move_insn (scratch, operands[2]);
  scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
				 scratch, 1, OPTAB_DIRECT);
  emit_insn (gen_test_ccno_1 (mode, scratch, GEN_INT (-0x100)));
  tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
  tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
  tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
			       gen_rtx_LABEL_REF (VOIDmode, qimode_label),
			       pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = qimode_label;

  /* Generate original signed/unsigned divmod.  */
  emit_insn (gen_divmod4_1 (operands[0], operands[1],
			    operands[2], operands[3]));

  /* Branch to the end.  */
  emit_jump_insn (gen_jump (end_label));
  emit_barrier ();

  /* Generate 8bit unsigned divide.  */
  emit_label (qimode_label);
  /* Don't use operands[0] for result of 8bit divide since not all
     registers support QImode ZERO_EXTRACT.  */
  tmp0 = lowpart_subreg (HImode, scratch, mode);
  tmp1 = lowpart_subreg (HImode, operands[2], mode);
  tmp2 = lowpart_subreg (QImode, operands[3], mode);
  emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));

  if (unsigned_p)
    {
      div = gen_rtx_UDIV (mode, operands[2], operands[3]);
      mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
    }
  else
    {
      div = gen_rtx_DIV (mode, operands[2], operands[3]);
      mod = gen_rtx_MOD (mode, operands[2], operands[3]);
    }

  if (GET_MODE (operands[0]) != SImode)
    div = gen_rtx_ZERO_EXTEND (DImode, div);
  if (GET_MODE (operands[1]) != SImode)
    mod = gen_rtx_ZERO_EXTEND (DImode, mod);

  /* Extract remainder from AH.  */
  scratch = gen_lowpart (GET_MODE (operands[1]), scratch);
  tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]), scratch,
			       GEN_INT (8), GEN_INT (8));
  insn = emit_move_insn (operands[1], tmp1);
  set_unique_reg_note (insn, REG_EQUAL, mod);

  /* Zero extend quotient from AL.  */
  tmp1 = gen_lowpart (QImode, tmp0);
  insn = emit_insn (gen_extend_insn
		    (operands[0], tmp1,
		     GET_MODE (operands[0]), QImode, 1));
  set_unique_reg_note (insn, REG_EQUAL, div);

  emit_label (end_label);
}
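
/* Illustrative effect (not part of the original file): for an
   unsigned 32-bit division the expansion tests (a | b) against
   -0x100; when both values fit in 8 bits it takes the cheap
   udivmodhiqi3 path (quotient in AL, remainder in AH), otherwise it
   falls through to the full-width divide.  */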
/* Emit x86 binary operand CODE in mode MODE, where the first operand
   matches destination.  RTX includes clobber of FLAGS_REG.  */

void
ix86_emit_binop (enum rtx_code code, machine_mode mode,
		 rtx dst, rtx src)
{
  rtx op, clob;

  op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
  clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));

  emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
}
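
/* Usage sketch (illustrative, not part of the original file):

     ix86_emit_binop (PLUS, SImode, target, GEN_INT (4));

   emits (parallel [(set (reg) (plus:SI (reg) (const_int 4)))
		    (clobber (reg:CC flags))]),
   the two-address ALU form used by ix86_split_lea_for_addr below.  */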
/* Return true if regno1 def is nearest to the insn.  */

static bool
find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
{
  rtx_insn *prev = insn;
  rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));

  if (insn == start)
    return false;
  while (prev && prev != start)
    {
      if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
	{
	  prev = PREV_INSN (prev);
	  continue;
	}
      if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
	return true;
      else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
	return false;
      prev = PREV_INSN (prev);
    }

  /* None of the regs is defined in the bb.  */
  return false;
}

/* INSN_UID of the last insn emitted by zero store peephole2s.  */
int ix86_last_zero_store_uid;
/* Split lea instructions into a sequence of instructions
   which are executed on ALU to avoid AGU stalls.
   It is assumed that it is allowed to clobber flags register
   at lea position.  */

void
ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
{
  unsigned int regno0, regno1, regno2;
  struct ix86_address parts;
  rtx target, tmp, tmp1;
  int ok, adds;

  ok = ix86_decompose_address (operands[1], &parts);
  gcc_assert (ok);

  target = gen_lowpart (mode, operands[0]);

  regno0 = true_regnum (target);
  regno1 = INVALID_REGNUM;
  regno2 = INVALID_REGNUM;

  if (parts.base)
    {
      parts.base = gen_lowpart (mode, parts.base);
      regno1 = true_regnum (parts.base);
    }

  if (parts.index)
    {
      parts.index = gen_lowpart (mode, parts.index);
      regno2 = true_regnum (parts.index);
    }

  if (parts.disp)
    parts.disp = gen_lowpart (mode, parts.disp);

  if (parts.scale > 1)
    {
      /* Case r1 = r1 + ...  */
      if (regno1 == regno0)
	{
	  /* If we have a case r1 = r1 + C * r2 then we
	     should use multiplication which is very
	     expensive.  Assume cost model is wrong if we
	     have such case here.  */
	  gcc_assert (regno2 != regno0);

	  for (adds = parts.scale; adds > 0; adds--)
	    ix86_emit_binop (PLUS, mode, target, parts.index);
	}
      else
	{
	  /* r1 = r2 + r3 * C case.  Need to move r3 into r1.  */
	  if (regno0 != regno2)
	    emit_insn (gen_rtx_SET (target, parts.index));

	  /* Use shift for scaling, but emit it as MULT instead
	     to avoid it being immediately peephole2 optimized back
	     into lea.  */
	  ix86_emit_binop (MULT, mode, target, GEN_INT (parts.scale));

	  if (parts.base)
	    ix86_emit_binop (PLUS, mode, target, parts.base);

	  if (parts.disp && parts.disp != const0_rtx)
	    ix86_emit_binop (PLUS, mode, target, parts.disp);
	}
    }
  else if (!parts.base && !parts.index)
    {
      gcc_assert (parts.disp);
      emit_insn (gen_rtx_SET (target, parts.disp));
    }
  else
    {
      if (!parts.base)
	{
	  if (regno0 != regno2)
	    emit_insn (gen_rtx_SET (target, parts.index));
	}
      else if (!parts.index)
	{
	  if (regno0 != regno1)
	    emit_insn (gen_rtx_SET (target, parts.base));
	}
      else
	{
	  if (regno0 == regno1)
	    tmp = parts.index;
	  else if (regno0 == regno2)
	    tmp = parts.base;
	  else
	    {
	      /* Find better operand for SET instruction, depending
		 on which definition is farther from the insn.  */
	      if (find_nearest_reg_def (insn, regno1, regno2))
		tmp = parts.index, tmp1 = parts.base;
	      else
		tmp = parts.base, tmp1 = parts.index;

	      emit_insn (gen_rtx_SET (target, tmp));

	      if (parts.disp && parts.disp != const0_rtx)
		ix86_emit_binop (PLUS, mode, target, parts.disp);

	      ix86_emit_binop (PLUS, mode, target, tmp1);
	      return;
	    }

	  ix86_emit_binop (PLUS, mode, target, tmp);
	}

      if (parts.disp && parts.disp != const0_rtx)
	ix86_emit_binop (PLUS, mode, target, parts.disp);
    }
}
/* Post-reload splitter for converting an SF or DFmode value in an
   SSE register into an unsigned SImode.  */

void
ix86_split_convert_uns_si_sse (rtx operands[])
{
  machine_mode vecmode;
  rtx value, large, zero_or_two31, input, two31, x;

  large = operands[1];
  zero_or_two31 = operands[2];
  input = operands[3];
  two31 = operands[4];
  vecmode = GET_MODE (large);
  value = gen_rtx_REG (vecmode, REGNO (operands[0]));

  /* Load up the value into the low element.  We must ensure that the other
     elements are valid floats -- zero is the easiest such value.  */
  if (MEM_P (input))
    {
      if (vecmode == V4SFmode)
	emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
      else
	emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
    }
  else
    {
      input = gen_rtx_REG (vecmode, REGNO (input));
      emit_move_insn (value, CONST0_RTX (vecmode));
      if (vecmode == V4SFmode)
	emit_insn (gen_sse_movss_v4sf (value, value, input));
      else
	emit_insn (gen_sse2_movsd_v2df (value, value, input));
    }

  emit_move_insn (large, two31);
  emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);

  x = gen_rtx_fmt_ee (LE, vecmode, large, value);
  emit_insn (gen_rtx_SET (large, x));

  x = gen_rtx_AND (vecmode, zero_or_two31, large);
  emit_insn (gen_rtx_SET (zero_or_two31, x));

  x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
  emit_insn (gen_rtx_SET (value, x));

  large = gen_rtx_REG (V4SImode, REGNO (large));
  emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));

  x = gen_rtx_REG (V4SImode, REGNO (value));
  if (vecmode == V4SFmode)
    emit_insn (gen_fix_truncv4sfv4si2 (x, value));
  else
    emit_insn (gen_sse2_cvttpd2dq (x, value));
  value = x;

  emit_insn (gen_xorv4si3 (value, value, large));
}
static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
						 machine_mode mode, rtx target,
						 rtx var, int one_var);
/* Convert an unsigned DImode value into a DFmode, using only SSE.
   Expects the 64-bit DImode to be supplied in a pair of integral
   registers.  Requires SSE2; will use SSE3 if available.  For x86_32,
   -mfpmath=sse, !optimize_size only.  */

void
ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
  rtx int_xmm, fp_xmm;
  rtx biases, exponents;
  rtx x;

  int_xmm = gen_reg_rtx (V4SImode);
  if (TARGET_INTER_UNIT_MOVES_TO_VEC)
    emit_insn (gen_movdi_to_sse (int_xmm, input));
  else if (TARGET_SSE_SPLIT_REGS)
    {
      emit_clobber (int_xmm);
      emit_move_insn (gen_lowpart (DImode, int_xmm), input);
    }
  else
    {
      x = gen_reg_rtx (V2DImode);
      ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
      emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
    }

  x = gen_rtx_CONST_VECTOR (V4SImode,
			    gen_rtvec (4, GEN_INT (0x43300000UL),
				       GEN_INT (0x45300000UL),
				       const0_rtx, const0_rtx));
  exponents = validize_mem (force_const_mem (V4SImode, x));

  /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
  emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));

  /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
     yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
     Similarly (0x45300000UL ## fp_value_hi_xmm) yields
     (0x1.0p84 + double(fp_value_hi_xmm)).
     Note these exponents differ by 32.  */

  fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));

  /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
     in [0,2**32-1] and [0]+[2**32,2**64-1] respectively.  */
  real_ldexp (&bias_lo_rvt, &dconst1, 52);
  real_ldexp (&bias_hi_rvt, &dconst1, 84);
  biases = const_double_from_real_value (bias_lo_rvt, DFmode);
  x = const_double_from_real_value (bias_hi_rvt, DFmode);
  biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
  biases = validize_mem (force_const_mem (V2DFmode, biases));
  emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));

  /* Add the upper and lower DFmode values together.  */
  if (TARGET_SSE3)
    emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
  else
    {
      x = copy_to_mode_reg (V2DFmode, fp_xmm);
      emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
      emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
    }

  ix86_expand_vector_extract (false, target, fp_xmm, 0);
}
/* Not used, but eases macroization of patterns.  */
void
ix86_expand_convert_uns_sixf_sse (rtx, rtx)
{
  gcc_unreachable ();
}

static rtx ix86_expand_sse_fabs (rtx op0, rtx *smask);
/* Convert an unsigned SImode value into a DFmode.  Only currently used
   for SSE, but applicable anywhere.  */

void
ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE TWO31r;
  rtx x, fp;

  x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
			   NULL, 1, OPTAB_DIRECT);

  fp = gen_reg_rtx (DFmode);
  emit_insn (gen_floatsidf2 (fp, x));

  real_ldexp (&TWO31r, &dconst1, 31);
  x = const_double_from_real_value (TWO31r, DFmode);

  x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);

  /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
  if (HONOR_SIGNED_ZEROS (DFmode) && flag_rounding_math)
    x = ix86_expand_sse_fabs (x, NULL);

  if (x != target)
    emit_move_insn (target, x);
}
/* Convert a signed DImode value into a DFmode.  Only used for SSE in
   32-bit mode; otherwise we have a direct convert instruction.  */

void
ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE TWO32r;
  rtx fp_lo, fp_hi, x;

  fp_lo = gen_reg_rtx (DFmode);
  fp_hi = gen_reg_rtx (DFmode);

  emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));

  real_ldexp (&TWO32r, &dconst1, 32);
  x = const_double_from_real_value (TWO32r, DFmode);
  fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);

  ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));

  x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
			   0, OPTAB_DIRECT);

  if (x != target)
    emit_move_insn (target, x);
}
/* Convert an unsigned SImode value into a SFmode, using only SSE.
   For x86_32, -mfpmath=sse, !optimize_size only.  */

void
ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE ONE16r;
  rtx fp_hi, fp_lo, int_hi, int_lo, x;

  real_ldexp (&ONE16r, &dconst1, 16);
  x = const_double_from_real_value (ONE16r, SFmode);
  int_lo = expand_simple_binop (SImode, AND, input, GEN_INT (0xffff),
				NULL, 0, OPTAB_DIRECT);
  int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT (16),
				NULL, 0, OPTAB_DIRECT);
  fp_hi = gen_reg_rtx (SFmode);
  fp_lo = gen_reg_rtx (SFmode);
  emit_insn (gen_floatsisf2 (fp_hi, int_hi));
  emit_insn (gen_floatsisf2 (fp_lo, int_lo));
  if (TARGET_FMA)
    {
      x = validize_mem (force_const_mem (SFmode, x));
      fp_hi = gen_rtx_FMA (SFmode, fp_hi, x, fp_lo);
      emit_move_insn (target, fp_hi);
    }
  else
    {
      fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
				   0, OPTAB_DIRECT);
      fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
				   0, OPTAB_DIRECT);
      if (!rtx_equal_p (target, fp_hi))
	emit_move_insn (target, fp_hi);
    }
}
/* floatunsv{4,8}siv{4,8}sf2 expander.  Expand code to convert
   a vector of unsigned ints VAL to vector of floats TARGET.  */

void
ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
{
  rtx tmp[8];
  REAL_VALUE_TYPE TWO16r;
  machine_mode intmode = GET_MODE (val);
  machine_mode fltmode = GET_MODE (target);
  rtx (*cvt) (rtx, rtx);

  if (intmode == V4SImode)
    cvt = gen_floatv4siv4sf2;
  else
    cvt = gen_floatv8siv8sf2;
  tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
  tmp[0] = force_reg (intmode, tmp[0]);
  tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
				OPTAB_DIRECT);
  tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
				NULL_RTX, 1, OPTAB_DIRECT);
  tmp[3] = gen_reg_rtx (fltmode);
  emit_insn (cvt (tmp[3], tmp[1]));
  tmp[4] = gen_reg_rtx (fltmode);
  emit_insn (cvt (tmp[4], tmp[2]));
  real_ldexp (&TWO16r, &dconst1, 16);
  tmp[5] = const_double_from_real_value (TWO16r, SFmode);
  tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
  if (TARGET_FMA)
    {
      tmp[6] = gen_rtx_FMA (fltmode, tmp[4], tmp[5], tmp[3]);
      emit_move_insn (target, tmp[6]);
    }
  else
    {
      tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5],
				    NULL_RTX, 1, OPTAB_DIRECT);
      tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6],
				    target, 1, OPTAB_DIRECT);
      if (tmp[7] != target)
	emit_move_insn (target, tmp[7]);
    }
}
/* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
   pattern can be used on it instead of fixuns_trunc*.
   This is done by doing just signed conversion if < 0x1p31, and otherwise by
   subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards.  */

rtx
ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
{
  REAL_VALUE_TYPE TWO31r;
  rtx two31r, tmp[4];
  machine_mode mode = GET_MODE (val);
  machine_mode scalarmode = GET_MODE_INNER (mode);
  machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
  rtx (*cmp) (rtx, rtx, rtx, rtx);
  int i;

  for (i = 0; i < 3; i++)
    tmp[i] = gen_reg_rtx (mode);
  real_ldexp (&TWO31r, &dconst1, 31);
  two31r = const_double_from_real_value (TWO31r, scalarmode);
  two31r = ix86_build_const_vector (mode, 1, two31r);
  two31r = force_reg (mode, two31r);
  switch (mode)
    {
    case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
    case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
    case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
    case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
    default: gcc_unreachable ();
    }
  tmp[3] = gen_rtx_LE (mode, two31r, val);
  emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
  tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
				0, OPTAB_DIRECT);
  if (intmode == V4SImode || TARGET_AVX2)
    *xorp = expand_simple_binop (intmode, ASHIFT,
				 gen_lowpart (intmode, tmp[0]),
				 GEN_INT (31), NULL_RTX, 0,
				 OPTAB_DIRECT);
  else
    {
      rtx two31 = gen_int_mode (HOST_WIDE_INT_1U << 31, SImode);
      two31 = ix86_build_const_vector (intmode, 1, two31);
      *xorp = expand_simple_binop (intmode, AND,
				   gen_lowpart (intmode, tmp[0]),
				   two31, NULL_RTX, 0,
				   OPTAB_DIRECT);
    }
  return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
			      0, OPTAB_DIRECT);
}
/* Generate code for floating point ABS or NEG.  */

void
ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
				rtx operands[])
{
  rtx set, dst, src;
  bool use_sse = false;
  bool vector_mode = VECTOR_MODE_P (mode);
  machine_mode vmode = mode;
  rtvec par;

  if (vector_mode || mode == TFmode || mode == HFmode)
    {
      use_sse = true;
      if (mode == HFmode)
	vmode = V8HFmode;
    }
  else if (TARGET_SSE_MATH)
    {
      use_sse = SSE_FLOAT_MODE_P (mode);
      if (mode == SFmode)
	vmode = V4SFmode;
      else if (mode == DFmode)
	vmode = V2DFmode;
    }

  dst = operands[0];
  src = operands[1];

  set = gen_rtx_fmt_e (code, mode, src);
  set = gen_rtx_SET (dst, set);

  if (use_sse)
    {
      rtx mask, use, clob;

      /* NEG and ABS performed with SSE use bitwise mask operations.
	 Create the appropriate mask now.  */
      mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
      use = gen_rtx_USE (VOIDmode, mask);
      if (vector_mode || mode == TFmode)
	par = gen_rtvec (2, set, use);
      else
	{
	  clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
	  par = gen_rtvec (3, set, use, clob);
	}
    }
  else
    {
      rtx clob;

      /* Changing of sign for FP values is doable using integer unit too.  */
      clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      par = gen_rtvec (2, set, clob);
    }

  emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
}
/* Deconstruct a floating point ABS or NEG operation
   with integer registers into integer operations.  */

void
ix86_split_fp_absneg_operator (enum rtx_code code, machine_mode mode,
			       rtx operands[])
{
  enum rtx_code absneg_op;
  rtx dst, set;

  gcc_assert (operands_match_p (operands[0], operands[1]));

  switch (mode)
    {
    case E_SFmode:
      dst = gen_lowpart (SImode, operands[0]);

      if (code == ABS)
	{
	  set = gen_int_mode (0x7fffffff, SImode);
	  absneg_op = AND;
	}
      else
	{
	  set = gen_int_mode (0x80000000, SImode);
	  absneg_op = XOR;
	}
      set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
      break;

    case E_DFmode:
      if (TARGET_64BIT)
	{
	  dst = gen_lowpart (DImode, operands[0]);
	  dst = gen_rtx_ZERO_EXTRACT (DImode, dst, const1_rtx, GEN_INT (63));

	  if (code == ABS)
	    set = const0_rtx;
	  else
	    set = gen_rtx_NOT (DImode, dst);
	}
      else
	{
	  dst = gen_highpart (SImode, operands[0]);

	  if (code == ABS)
	    {
	      set = gen_int_mode (0x7fffffff, SImode);
	      absneg_op = AND;
	    }
	  else
	    {
	      set = gen_int_mode (0x80000000, SImode);
	      absneg_op = XOR;
	    }
	  set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
	}
      break;

    case E_XFmode:
      dst = gen_rtx_REG (SImode,
			 REGNO (operands[0]) + (TARGET_64BIT ? 1 : 2));
      if (code == ABS)
	{
	  set = GEN_INT (0x7fff);
	  absneg_op = AND;
	}
      else
	{
	  set = GEN_INT (0x8000);
	  absneg_op = XOR;
	}
      set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
      break;

    default:
      gcc_unreachable ();
    }

  set = gen_rtx_SET (dst, set);

  rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
  rtvec par = gen_rtvec (2, set, clob);

  emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
}
/* Expand a copysign operation.  Special case operand 0 being a constant.  */

void
ix86_expand_copysign (rtx operands[])
{
  machine_mode mode, vmode;
  rtx dest, vdest, op0, op1, mask, op2, op3;

  mode = GET_MODE (operands[0]);

  if (mode == HFmode)
    vmode = V8HFmode;
  else if (mode == SFmode)
    vmode = V4SFmode;
  else if (mode == DFmode)
    vmode = V2DFmode;
  else if (mode == TFmode)
    vmode = mode;
  else
    gcc_unreachable ();

  if (rtx_equal_p (operands[1], operands[2]))
    {
      emit_move_insn (operands[0], operands[1]);
      return;
    }

  dest = operands[0];
  vdest = lowpart_subreg (vmode, dest, mode);
  if (vdest == NULL_RTX)
    vdest = gen_reg_rtx (vmode);
  else
    dest = NULL_RTX;
  op1 = lowpart_subreg (vmode, force_reg (mode, operands[2]), mode);
  mask = ix86_build_signbit_mask (vmode, TARGET_AVX512F && mode != HFmode, 0);

  if (CONST_DOUBLE_P (operands[1]))
    {
      op0 = simplify_unary_operation (ABS, mode, operands[1], mode);
      /* Optimize for 0, simplify b = copy_signf (0.0f, a) to b = mask & a.  */
      if (op0 == CONST0_RTX (mode))
	{
	  emit_move_insn (vdest, gen_rtx_AND (vmode, mask, op1));
	  if (dest)
	    emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
	  return;
	}

      if (GET_MODE_SIZE (mode) < 16)
	op0 = ix86_build_const_vector (vmode, false, op0);
      op0 = force_reg (vmode, op0);
    }
  else
    op0 = lowpart_subreg (vmode, force_reg (mode, operands[1]), mode);

  op2 = gen_reg_rtx (vmode);
  op3 = gen_reg_rtx (vmode);
  emit_move_insn (op2, gen_rtx_AND (vmode,
				    gen_rtx_NOT (vmode, mask),
				    op0));
  emit_move_insn (op3, gen_rtx_AND (vmode, mask, op1));
  emit_move_insn (vdest, gen_rtx_IOR (vmode, op2, op3));
  if (dest)
    emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
}
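
/* Illustrative effect (not part of the original file): for
   double r = copysign (x, y);  the expansion computes
   (x & ~SIGNMASK) | (y & SIGNMASK) on the V2DFmode views of the
   scalar operands, i.e. an andnpd, an andpd and an orpd.  */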
/* Expand an xorsign operation.  */

void
ix86_expand_xorsign (rtx operands[])
{
  machine_mode mode, vmode;
  rtx dest, vdest, op0, op1, mask, x, temp;

  dest = operands[0];
  op0 = operands[1];
  op1 = operands[2];

  mode = GET_MODE (dest);

  if (mode == HFmode)
    vmode = V8HFmode;
  else if (mode == SFmode)
    vmode = V4SFmode;
  else if (mode == DFmode)
    vmode = V2DFmode;
  else
    gcc_unreachable ();

  temp = gen_reg_rtx (vmode);
  mask = ix86_build_signbit_mask (vmode, 0, 0);

  op1 = lowpart_subreg (vmode, force_reg (mode, op1), mode);
  x = gen_rtx_AND (vmode, op1, mask);
  emit_insn (gen_rtx_SET (temp, x));

  op0 = lowpart_subreg (vmode, force_reg (mode, op0), mode);
  x = gen_rtx_XOR (vmode, temp, op0);

  vdest = lowpart_subreg (vmode, dest, mode);
  if (vdest == NULL_RTX)
    vdest = gen_reg_rtx (vmode);
  else
    dest = NULL_RTX;
  emit_insn (gen_rtx_SET (vdest, x));

  if (dest)
    emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
}
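
/* Illustrative sketch, not part of GCC: xorsign(a, b) = a * copysign(1, b)
   collapses to a single XOR with the masked sign of B, which is exactly
   the AND followed by XOR emitted above.  Hypothetical helper, same
   assumptions as the copysign sketch.  */

static double
sketch_xorsign (double a, double b)
{
  uint64_t ab, bb, mask = UINT64_C (1) << 63;
  memcpy (&ab, &a, sizeof (ab));
  memcpy (&bb, &b, sizeof (bb));
  ab ^= bb & mask;			/* temp = b & mask;  a = a ^ temp */
  memcpy (&a, &ab, sizeof (a));
  return a;
}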
static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1);

void
ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
{
  machine_mode mode = GET_MODE (op0);
  rtx tmp;

  /* Handle special case - vector comparison with boolean result, transform
     it using ptest instruction.  */
  if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
      || (mode == TImode && !TARGET_64BIT)
      || mode == OImode)
    {
      rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
      machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;

      gcc_assert (code == EQ || code == NE);

      if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
	{
	  op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
	  op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
	  mode = p_mode;
	}

      /* Generate XOR since we can't check that one operand is zero vector.  */
      tmp = gen_reg_rtx (mode);
      emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
      tmp = gen_lowpart (p_mode, tmp);
      emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
			      gen_rtx_UNSPEC (CCZmode,
					      gen_rtvec (2, tmp, tmp),
					      UNSPEC_PTEST)));
      tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
				  gen_rtx_LABEL_REF (VOIDmode, label),
				  pc_rtx);
      emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      return;
    }

  switch (mode)
    {
    case E_SFmode:
    case E_DFmode:
    case E_XFmode:
    case E_HFmode:
      /* Don't expand the comparison early, so that we get better code
	 when jump or whoever decides to reverse the comparison.  */
      /* FALLTHRU */

    case E_QImode:
    case E_HImode:
    case E_SImode:
    simple:
      tmp = ix86_expand_compare (code, op0, op1);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
				  gen_rtx_LABEL_REF (VOIDmode, label),
				  pc_rtx);
      emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      return;
    case E_DImode:
      if (TARGET_64BIT)
	goto simple;
      /* FALLTHRU */

    case E_TImode:
      /* DI and TI mode equality/inequality comparisons may be performed
	 on SSE registers.  Avoid splitting them, except when optimizing
	 for size.  */
      if ((code == EQ || code == NE)
	  && !optimize_insn_for_size_p ())
	goto simple;

      /* Expand DImode branch into multiple compare+branch.  */
      {
	rtx lo[2], hi[2];
	rtx_code_label *label2;
	enum rtx_code code1, code2, code3;
	machine_mode submode;

	if (CONSTANT_P (op0) && !CONSTANT_P (op1))
	  {
	    std::swap (op0, op1);
	    code = swap_condition (code);
	  }

	split_double_mode (mode, &op0, 1, lo+0, hi+0);
	split_double_mode (mode, &op1, 1, lo+1, hi+1);

	submode = mode == DImode ? SImode : DImode;

	/* If we are doing less-than or greater-or-equal-than,
	   op1 is a constant and the low word is zero, then we can just
	   examine the high word.  Similarly for low word -1 and
	   less-or-equal-than or greater-than.  */

	if (CONST_INT_P (hi[1]))
	  switch (code)
	    {
	    case LT: case LTU: case GE: case GEU:
	      if (lo[1] == const0_rtx)
		{
		  ix86_expand_branch (code, hi[0], hi[1], label);
		  return;
		}
	      break;
	    case LE: case LEU: case GT: case GTU:
	      if (lo[1] == constm1_rtx)
		{
		  ix86_expand_branch (code, hi[0], hi[1], label);
		  return;
		}
	      break;
	    default:
	      break;
	    }

	/* Emulate comparisons that do not depend on Zero flag with
	   double-word subtraction.  Note that only Overflow, Sign
	   and Carry flags are valid, so swap arguments and condition
	   of comparisons that would otherwise test Zero flag.  */

	switch (code)
	  {
	  case LE: case LEU: case GT: case GTU:
	    std::swap (lo[0], lo[1]);
	    std::swap (hi[0], hi[1]);
	    code = swap_condition (code);
	    /* FALLTHRU */

	  case LT: case LTU: case GE: case GEU:
	    {
	      bool uns = (code == LTU || code == GEU);
	      rtx (*sbb_insn) (machine_mode, rtx, rtx, rtx)
		= uns ? gen_sub3_carry_ccc : gen_sub3_carry_ccgz;

	      if (!nonimmediate_operand (lo[0], submode))
		lo[0] = force_reg (submode, lo[0]);
	      if (!x86_64_general_operand (lo[1], submode))
		lo[1] = force_reg (submode, lo[1]);

	      if (!register_operand (hi[0], submode))
		hi[0] = force_reg (submode, hi[0]);
	      if ((uns && !nonimmediate_operand (hi[1], submode))
		  || (!uns && !x86_64_general_operand (hi[1], submode)))
		hi[1] = force_reg (submode, hi[1]);

	      emit_insn (gen_cmp_1 (submode, lo[0], lo[1]));

	      tmp = gen_rtx_SCRATCH (submode);
	      emit_insn (sbb_insn (submode, tmp, hi[0], hi[1]));

	      tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
	      ix86_expand_branch (code, tmp, const0_rtx, label);
	      return;
	    }

	  default:
	    break;
	  }

	/* Otherwise, we need two or three jumps.  */

	label2 = gen_label_rtx ();

	code1 = code;
	code2 = swap_condition (code);
	code3 = unsigned_condition (code);

	switch (code)
	  {
	  case LT: case GT: case LTU: case GTU:
	    break;

	  case LE:   code1 = LT;  code2 = GT;  break;
	  case GE:   code1 = GT;  code2 = LT;  break;
	  case LEU:  code1 = LTU; code2 = GTU; break;
	  case GEU:  code1 = GTU; code2 = LTU; break;

	  case EQ:   code1 = UNKNOWN; code2 = NE;  break;
	  case NE:   code2 = UNKNOWN; break;

	  default:
	    gcc_unreachable ();
	  }

	/*
	 * a < b =>
	 *    if (hi(a) < hi(b)) goto true;
	 *    if (hi(a) > hi(b)) goto false;
	 *    if (lo(a) < lo(b)) goto true;
	 *  false:
	 */

	if (code1 != UNKNOWN)
	  ix86_expand_branch (code1, hi[0], hi[1], label);
	if (code2 != UNKNOWN)
	  ix86_expand_branch (code2, hi[0], hi[1], label2);

	ix86_expand_branch (code3, lo[0], lo[1], label);

	if (code2 != UNKNOWN)
	  emit_label (label2);

	return;
      }

    default:
      gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
      goto simple;
    }
}
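
/* Illustrative sketch, not part of GCC: the decision tree that the
   two/three-jump fallback above encodes for a signed 64-bit '<' built
   from 32-bit halves.  The low words are always compared unsigned,
   which is why code3 is unsigned_condition (code).  Hypothetical
   helper.  */

static bool
sketch_di_less_than (int32_t hi_a, uint32_t lo_a, int32_t hi_b, uint32_t lo_b)
{
  if (hi_a < hi_b)		/* if (hi(a) < hi(b)) goto true  */
    return true;
  if (hi_a > hi_b)		/* if (hi(a) > hi(b)) goto false */
    return false;
  return lo_a < lo_b;		/* if (lo(a) < lo(b)) goto true  */
}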
/* Figure out whether to use unordered fp comparisons.  */

static bool
ix86_unordered_fp_compare (enum rtx_code code)
{
  if (!TARGET_IEEE_FP)
    return false;

  switch (code)
    {
    case LT:
    case LE:
    case GT:
    case GE:
    case LTGT:
      return false;

    default:
      return true;
    }
}
/* Return a comparison we can do and that it is equivalent to
   swap_condition (code) apart possibly from orderedness.
   But, never change orderedness if TARGET_IEEE_FP, returning
   UNKNOWN in that case if necessary.  */

static enum rtx_code
ix86_fp_swap_condition (enum rtx_code code)
{
  switch (code)
    {
    case GT:			/* GTU - CF=0 & ZF=0 */
      return TARGET_IEEE_FP ? UNKNOWN : UNLT;
    case GE:			/* GEU - CF=0 */
      return TARGET_IEEE_FP ? UNKNOWN : UNLE;
    case UNLT:			/* LTU - CF=1 */
      return TARGET_IEEE_FP ? UNKNOWN : GT;
    case UNLE:			/* LEU - CF=1 | ZF=1 */
      return TARGET_IEEE_FP ? UNKNOWN : GE;
    default:
      return swap_condition (code);
    }
}
/* Return cost of comparison CODE using the best strategy for performance.
   All following functions do use number of instructions as a cost metrics.
   In future this should be tweaked to compute bytes for optimize_size and
   take into account performance of various instructions on various CPUs.  */

static int
ix86_fp_comparison_cost (enum rtx_code code)
{
  int arith_cost;

  /* The cost of code using bit-twiddling on %ah.  */
  switch (code)
    {
    case UNLE:
    case UNLT:
    case LTGT:
    case GT:
    case GE:
    case UNORDERED:
    case ORDERED:
    case UNEQ:
      arith_cost = 4;
      break;
    case LT:
    case NE:
    case EQ:
    case UNGE:
      arith_cost = TARGET_IEEE_FP ? 5 : 4;
      break;
    case LE:
    case UNGT:
      arith_cost = TARGET_IEEE_FP ? 6 : 4;
      break;
    default:
      gcc_unreachable ();
    }

  switch (ix86_fp_comparison_strategy (code))
    {
    case IX86_FPCMP_COMI:
      return arith_cost > 4 ? 3 : 2;
    case IX86_FPCMP_SAHF:
      return arith_cost > 4 ? 4 : 3;
    default:
      return arith_cost;
    }
}
/* Swap, force into registers, or otherwise massage the two operands
   to a fp comparison.  The operands are updated in place; the new
   comparison code is returned.  */

static enum rtx_code
ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
{
  bool unordered_compare = ix86_unordered_fp_compare (code);
  rtx op0 = *pop0, op1 = *pop1;
  machine_mode op_mode = GET_MODE (op0);
  bool is_sse = SSE_FLOAT_MODE_SSEMATH_OR_HF_P (op_mode);

  if (op_mode == BFmode)
    {
      rtx op = gen_lowpart (HImode, op0);
      if (CONST_INT_P (op))
	op = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
					     op0, BFmode);
      else
	{
	  rtx t1 = gen_reg_rtx (SImode);
	  emit_insn (gen_zero_extendhisi2 (t1, op));
	  emit_insn (gen_ashlsi3 (t1, t1, GEN_INT (16)));
	  op = gen_lowpart (SFmode, t1);
	}
      *pop0 = op;

      op = gen_lowpart (HImode, op1);
      if (CONST_INT_P (op))
	op = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
					     op1, BFmode);
      else
	{
	  rtx t1 = gen_reg_rtx (SImode);
	  emit_insn (gen_zero_extendhisi2 (t1, op));
	  emit_insn (gen_ashlsi3 (t1, t1, GEN_INT (16)));
	  op = gen_lowpart (SFmode, t1);
	}
      *pop1 = op;

      return ix86_prepare_fp_compare_args (code, pop0, pop1);
    }

  /* All of the unordered compare instructions only work on registers.
     The same is true of the fcomi compare instructions.  The XFmode
     compare instructions require registers except when comparing
     against zero or when converting operand 1 from fixed point to
     floating point.  */

  if (!is_sse
      && (unordered_compare
	  || (op_mode == XFmode
	      && ! (standard_80387_constant_p (op0) == 1
		    || standard_80387_constant_p (op1) == 1)
	      && GET_CODE (op1) != FLOAT)
	  || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
    {
      op0 = force_reg (op_mode, op0);
      op1 = force_reg (op_mode, op1);
    }
  else
    {
      /* %%% We only allow op1 in memory; op0 must be st(0).  So swap
	 things around if they appear profitable, otherwise force op0
	 into a register.  */

      if (standard_80387_constant_p (op0) == 0
	  || (MEM_P (op0)
	      && ! (standard_80387_constant_p (op1) == 0
		    || MEM_P (op1))))
	{
	  enum rtx_code new_code = ix86_fp_swap_condition (code);
	  if (new_code != UNKNOWN)
	    {
	      std::swap (op0, op1);
	      code = new_code;
	    }
	}

      if (!REG_P (op0))
	op0 = force_reg (op_mode, op0);

      if (CONSTANT_P (op1))
	{
	  int tmp = standard_80387_constant_p (op1);
	  if (tmp == 0)
	    op1 = validize_mem (force_const_mem (op_mode, op1));
	  else if (tmp == 1)
	    {
	      if (TARGET_CMOVE)
		op1 = force_reg (op_mode, op1);
	    }
	  else
	    op1 = force_reg (op_mode, op1);
	}
    }

  /* Try to rearrange the comparison to make it cheaper.  */
  if (ix86_fp_comparison_cost (code)
      > ix86_fp_comparison_cost (swap_condition (code))
      && (REG_P (op1) || can_create_pseudo_p ()))
    {
      std::swap (op0, op1);
      code = swap_condition (code);
      if (!REG_P (op0))
	op0 = force_reg (op_mode, op0);
    }

  *pop0 = op0;
  *pop1 = op1;
  return code;
}
/* Generate insn patterns to do a floating point compare of OPERANDS.  */

static rtx
ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)
{
  bool unordered_compare = ix86_unordered_fp_compare (code);
  machine_mode cmp_mode;
  rtx tmp, scratch;

  code = ix86_prepare_fp_compare_args (code, &op0, &op1);

  tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
  if (unordered_compare)
    tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);

  /* Do fcomi/sahf based test when profitable.  */
  switch (ix86_fp_comparison_strategy (code))
    {
    case IX86_FPCMP_COMI:
      cmp_mode = CCFPmode;
      emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
      break;

    case IX86_FPCMP_SAHF:
      cmp_mode = CCFPmode;
      tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
      scratch = gen_reg_rtx (HImode);
      emit_insn (gen_rtx_SET (scratch, tmp));
      emit_insn (gen_x86_sahf_1 (scratch));
      break;

    case IX86_FPCMP_ARITH:
      cmp_mode = CCNOmode;
      tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
      scratch = gen_reg_rtx (HImode);
      emit_insn (gen_rtx_SET (scratch, tmp));

      /* In the unordered case, we have to check C2 for NaN's, which
	 doesn't happen to work out to anything nice combination-wise.
	 So do some bit twiddling on the value we've got in AH to come
	 up with an appropriate set of condition codes.  */

      switch (code)
	{
	case GT:
	case UNGT:
	  if (code == GT || !TARGET_IEEE_FP)
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
	      code = EQ;
	    }
	  else
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
	      cmp_mode = CCmode;
	      code = GEU;
	    }
	  break;
	case LT:
	case UNLT:
	  if (code == LT && TARGET_IEEE_FP)
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
	      cmp_mode = CCmode;
	      code = EQ;
	    }
	  else
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
	      code = NE;
	    }
	  break;
	case GE:
	case UNGE:
	  if (code == GE || !TARGET_IEEE_FP)
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
	      code = EQ;
	    }
	  else
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
	      code = NE;
	    }
	  break;
	case LE:
	case UNLE:
	  if (code == LE && TARGET_IEEE_FP)
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
	      cmp_mode = CCmode;
	      code = LTU;
	    }
	  else
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
	      code = NE;
	    }
	  break;
	case EQ:
	case UNEQ:
	  if (code == EQ && TARGET_IEEE_FP)
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
	      cmp_mode = CCmode;
	      code = EQ;
	    }
	  else
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
	      code = NE;
	    }
	  break;
	case NE:
	case LTGT:
	  if (code == NE && TARGET_IEEE_FP)
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
					     GEN_INT (0x40)));
	      code = NE;
	    }
	  else
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
	      code = EQ;
	    }
	  break;

	case UNORDERED:
	  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
	  code = NE;
	  break;
	case ORDERED:
	  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
	  code = EQ;
	  break;

	default:
	  gcc_unreachable ();
	}
      break;

    default:
      gcc_unreachable ();
    }

  /* Return the test that should be put into the flags user, i.e.
     the bcc, scc, or cmov instruction.  */
  return gen_rtx_fmt_ee (code, VOIDmode,
			 gen_rtx_REG (cmp_mode, FLAGS_REG),
			 const0_rtx);
}
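
/* Illustrative sketch, not part of GCC: what the magic constants in the
   IX86_FPCMP_ARITH path above select.  After fnstsw the x87 condition
   bits land in AH as C0 = 0x01, C2 = 0x04 (unordered) and C3 = 0x40,
   so 0x45 tests all three at once.  Hypothetical helper and macro
   names.  */

#define SKETCH_X87_C0 0x01	/* "below" */
#define SKETCH_X87_C2 0x04	/* "unordered", set for NaN operands */
#define SKETCH_X87_C3 0x40	/* "equal" */

static bool
sketch_x87_gt (unsigned int ah)
{
  /* GT holds only when none of C0, C2, C3 is set: test $0x45, %ah.  */
  return (ah & (SKETCH_X87_C0 | SKETCH_X87_C2 | SKETCH_X87_C3)) == 0;
}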
/* Generate insn patterns to do an integer compare of OPERANDS.  */

static rtx
ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
{
  machine_mode cmpmode;
  rtx tmp, flags;

  /* Swap operands to emit carry flag comparison.  */
  if ((code == GTU || code == LEU)
      && nonimmediate_operand (op1, VOIDmode))
    {
      std::swap (op0, op1);
      code = swap_condition (code);
    }

  cmpmode = SELECT_CC_MODE (code, op0, op1);
  flags = gen_rtx_REG (cmpmode, FLAGS_REG);

  /* Attempt to use PTEST, if available, when testing vector modes for
     equality/inequality against zero.  */
  if (op1 == const0_rtx
      && SUBREG_P (op0)
      && cmpmode == CCZmode
      && SUBREG_BYTE (op0) == 0
      && REG_P (SUBREG_REG (op0))
      && VECTOR_MODE_P (GET_MODE (SUBREG_REG (op0)))
      && TARGET_SSE4_1
      && GET_MODE (op0) == TImode
      && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op0))) == 16)
    {
      tmp = SUBREG_REG (op0);
      tmp = gen_rtx_UNSPEC (CCZmode, gen_rtvec (2, tmp, tmp), UNSPEC_PTEST);
    }
  else
    tmp = gen_rtx_COMPARE (cmpmode, op0, op1);

  /* This is very simple, but making the interface the same as in the
     FP case makes the rest of the code easier.  */
  emit_insn (gen_rtx_SET (flags, tmp));

  /* Return the test that should be put into the flags user, i.e.
     the bcc, scc, or cmov instruction.  */
  return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
}
static rtx
ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
{
  rtx ret;

  if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
    ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);

  else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
    {
      gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
      ret = ix86_expand_fp_compare (code, op0, op1);
    }
  else
    ret = ix86_expand_int_compare (code, op0, op1);

  return ret;
}
void
ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
{
  rtx ret;

  gcc_assert (GET_MODE (dest) == QImode);

  ret = ix86_expand_compare (code, op0, op1);
  PUT_MODE (ret, QImode);
  emit_insn (gen_rtx_SET (dest, ret));
}
/* Expand floating point op0 <=> op1, i.e.
   dest = op0 == op1 ? 0 : op0 < op1 ? -1 : op0 > op1 ? 1 : 2.  */

void
ix86_expand_fp_spaceship (rtx dest, rtx op0, rtx op1)
{
  gcc_checking_assert (ix86_fp_comparison_strategy (GT) != IX86_FPCMP_ARITH);
  rtx gt = ix86_expand_fp_compare (GT, op0, op1);
  rtx l0 = gen_label_rtx ();
  rtx l1 = gen_label_rtx ();
  rtx l2 = TARGET_IEEE_FP ? gen_label_rtx () : NULL_RTX;
  rtx lend = gen_label_rtx ();
  rtx tmp;
  rtx_insn *jmp;

  if (l2)
    {
      rtx un = gen_rtx_fmt_ee (UNORDERED, VOIDmode,
			       gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, un,
				  gen_rtx_LABEL_REF (VOIDmode, l2), pc_rtx);
      jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      add_reg_br_prob_note (jmp, profile_probability::very_unlikely ());
    }
  rtx eq = gen_rtx_fmt_ee (UNEQ, VOIDmode,
			   gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx);
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, eq,
			      gen_rtx_LABEL_REF (VOIDmode, l0), pc_rtx);
  jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  add_reg_br_prob_note (jmp, profile_probability::unlikely ());
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, gt,
			      gen_rtx_LABEL_REF (VOIDmode, l1), pc_rtx);
  jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  add_reg_br_prob_note (jmp, profile_probability::even ());
  emit_move_insn (dest, constm1_rtx);
  emit_jump (lend);
  emit_label (l0);
  emit_move_insn (dest, const0_rtx);
  emit_jump (lend);
  emit_label (l1);
  emit_move_insn (dest, const1_rtx);
  if (l2)
    {
      emit_jump (lend);
      emit_label (l2);
      emit_move_insn (dest, const2_rtx);
    }
  emit_label (lend);
}
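
/* Illustrative sketch, not part of GCC: the value mapping the jump
   ladder above implements, with 2 standing for the unordered result
   materialized at label l2 on TARGET_IEEE_FP.  Hypothetical helper.  */

static int
sketch_fp_spaceship (double a, double b)
{
  if (a != a || b != b)		/* unordered: one operand is a NaN */
    return 2;
  if (a == b)
    return 0;
  return a < b ? -1 : 1;
}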
/* Expand comparison setting or clearing carry flag.  Return true when
   successful and set pop for the operation.  */

static bool
ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
{
  machine_mode mode
    = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);

  /* Do not handle double-mode compares that go through special path.  */
  if (mode == (TARGET_64BIT ? TImode : DImode))
    return false;

  if (SCALAR_FLOAT_MODE_P (mode))
    {
      rtx compare_op;
      rtx_insn *compare_seq;

      gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));

      /* Shortcut:  following common codes never translate
	 into carry flag compares.  */
      if (code == EQ || code == NE || code == UNEQ || code == LTGT
	  || code == ORDERED || code == UNORDERED)
	return false;

      /* These comparisons require zero flag; swap operands so they won't.  */
      if ((code == GT || code == UNLE || code == LE || code == UNGT)
	  && !TARGET_IEEE_FP)
	{
	  std::swap (op0, op1);
	  code = swap_condition (code);
	}

      /* Try to expand the comparison and verify that we end up with
	 carry flag based comparison.  This fails to be true only when
	 we decide to expand comparison using arithmetic that is not
	 too common scenario.  */
      start_sequence ();
      compare_op = ix86_expand_fp_compare (code, op0, op1);
      compare_seq = get_insns ();
      end_sequence ();

      if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
	code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
      else
	code = GET_CODE (compare_op);

      if (code != LTU && code != GEU)
	return false;

      emit_insn (compare_seq);
      *pop = compare_op;
      return true;
    }

  if (!INTEGRAL_MODE_P (mode))
    return false;

  switch (code)
    {
    case LTU:
    case GEU:
      break;

    /* Convert a==0 into (unsigned)a<1.  */
    case EQ:
    case NE:
      if (op1 != const0_rtx)
	return false;
      op1 = const1_rtx;
      code = (code == EQ ? LTU : GEU);
      break;

    /* Convert a>b into b<a or a>=b-1.  */
    case GTU:
    case LEU:
      if (CONST_INT_P (op1))
	{
	  op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
	  /* Bail out on overflow.  We still can swap operands but that
	     would force loading of the constant into register.  */
	  if (op1 == const0_rtx
	      || !x86_64_immediate_operand (op1, GET_MODE (op1)))
	    return false;
	  code = (code == GTU ? GEU : LTU);
	}
      else
	{
	  std::swap (op0, op1);
	  code = (code == GTU ? LTU : GEU);
	}
      break;

    /* Convert a>=0 into (unsigned)a<0x80000000.  */
    case LT:
    case GE:
      if (mode == DImode || op1 != const0_rtx)
	return false;
      op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
      code = (code == LT ? GEU : LTU);
      break;
    case LE:
    case GT:
      if (mode == DImode || op1 != constm1_rtx)
	return false;
      op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
      code = (code == LE ? GEU : LTU);
      break;

    default:
      return false;
    }

  /* Swapping operands may cause constant to appear as first operand.  */
  if (!nonimmediate_operand (op0, VOIDmode))
    {
      if (!can_create_pseudo_p ())
	return false;
      op0 = force_reg (mode, op0);
    }
  *pop = ix86_expand_compare (code, op0, op1);
  gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
  return true;
}
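
/* Illustrative sketch, not part of GCC: two of the rewrites above that
   turn common predicates into the carry-flag shapes LTU/GEU.
   Hypothetical helpers over 32-bit operands.  */

static bool
sketch_eq_zero_as_ltu (uint32_t a)
{
  /* a == 0 becomes the carry-producing unsigned compare a < 1.  */
  return a < 1;
}

static bool
sketch_gtu_as_geu (uint32_t a, uint32_t b)
{
  /* For constant B, a > b becomes a >= b + 1 unless b + 1 wraps to 0;
     when it would wrap, a > b is simply false.  */
  return b != UINT32_MAX && a >= b + 1;
}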
/* Expand conditional increment or decrement using adc/sbb instructions.
   The default case using setcc followed by the conditional move can be
   done by generic code.  */
bool
ix86_expand_int_addcc (rtx operands[])
{
  enum rtx_code code = GET_CODE (operands[1]);
  rtx flags;
  rtx (*insn) (machine_mode, rtx, rtx, rtx, rtx, rtx);
  rtx compare_op;
  rtx val = const0_rtx;
  bool fpcmp = false;
  machine_mode mode;
  rtx op0 = XEXP (operands[1], 0);
  rtx op1 = XEXP (operands[1], 1);

  if (operands[3] != const1_rtx
      && operands[3] != constm1_rtx)
    return false;
  if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
    return false;
  code = GET_CODE (compare_op);

  flags = XEXP (compare_op, 0);

  if (GET_MODE (flags) == CCFPmode)
    {
      fpcmp = true;
      code = ix86_fp_compare_code_to_integer (code);
    }

  if (code != LTU)
    {
      val = constm1_rtx;
      if (fpcmp)
	PUT_CODE (compare_op,
		  reverse_condition_maybe_unordered
		    (GET_CODE (compare_op)));
      else
	PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
    }

  mode = GET_MODE (operands[0]);

  /* Construct either adc or sbb insn.  */
  if ((code == LTU) == (operands[3] == constm1_rtx))
    insn = gen_sub3_carry;
  else
    insn = gen_add3_carry;

  emit_insn (insn (mode, operands[0], operands[2], val, flags, compare_op));

  return true;
}
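
/* Illustrative sketch, not part of GCC: conditional increment through
   the carry flag.  Once the compare leaves its result in CF, adc adds
   the flag itself, so no setcc or cmov is needed.  Hypothetical
   helper.  */

static uint32_t
sketch_int_addcc (uint32_t x, uint32_t a, uint32_t b)
{
  uint32_t carry = a < b;	/* cmp %b, %a sets CF to this value */
  return x + carry;		/* adc $0, %x */
}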
bool
ix86_expand_int_movcc (rtx operands[])
{
  enum rtx_code code = GET_CODE (operands[1]), compare_code;
  rtx_insn *compare_seq;
  rtx compare_op;
  machine_mode mode = GET_MODE (operands[0]);
  bool sign_bit_compare_p = false;
  bool negate_cc_compare_p = false;
  rtx op0 = XEXP (operands[1], 0);
  rtx op1 = XEXP (operands[1], 1);
  rtx op2 = operands[2];
  rtx op3 = operands[3];

  if (GET_MODE (op0) == TImode
      || (GET_MODE (op0) == DImode
	  && !TARGET_64BIT))
    return false;

  if (GET_MODE (op0) == BFmode
      && !ix86_fp_comparison_operator (operands[1], VOIDmode))
    return false;

  start_sequence ();
  compare_op = ix86_expand_compare (code, op0, op1);
  compare_seq = get_insns ();
  end_sequence ();

  compare_code = GET_CODE (compare_op);

  if ((op1 == const0_rtx && (code == GE || code == LT))
      || (op1 == constm1_rtx && (code == GT || code == LE)))
    sign_bit_compare_p = true;

  /* op0 == op1 ? op0 : op3 is equivalent to op0 == op1 ? op1 : op3,
     but if op1 is a constant, the latter form allows more optimizations,
     either through the last 2 ops being constant handling, or the one
     constant and one variable cases.  On the other side, for cmov the
     former might be better as we don't need to load the constant into
     another register.  */
  if (code == EQ && CONST_INT_P (op1) && rtx_equal_p (op0, op2))
    op2 = op1;
  /* Similarly for op0 != op1 ? op2 : op0 and op0 != op1 ? op2 : op1.  */
  else if (code == NE && CONST_INT_P (op1) && rtx_equal_p (op0, op3))
    op3 = op1;

  /* Don't attempt mode expansion here -- if we had to expand 5 or 6
     HImode insns, we'd be swallowed in word prefix ops.  */

  if ((mode != HImode || TARGET_FAST_PREFIX)
      && (mode != (TARGET_64BIT ? TImode : DImode))
      && CONST_INT_P (op2)
      && CONST_INT_P (op3))
    {
      rtx out = operands[0];
      HOST_WIDE_INT ct = INTVAL (op2);
      HOST_WIDE_INT cf = INTVAL (op3);
      HOST_WIDE_INT diff;

      if ((mode == SImode
	   || (TARGET_64BIT && mode == DImode))
	  && (GET_MODE (op0) == SImode
	      || (TARGET_64BIT && GET_MODE (op0) == DImode)))
	{
	  /* Special case x != 0 ? -1 : y.  */
	  if (code == NE && op1 == const0_rtx && ct == -1)
	    {
	      negate_cc_compare_p = true;
	      std::swap (ct, cf);
	      code = EQ;
	    }
	  else if (code == EQ && op1 == const0_rtx && cf == -1)
	    negate_cc_compare_p = true;
	}

      diff = ct - cf;
      /* Sign bit compares are better done using shifts than we do by using
	 sbb.  */
      if (sign_bit_compare_p
	  || negate_cc_compare_p
	  || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
	{
	  /* Detect overlap between destination and compare sources.  */
	  rtx tmp = out;

	  if (negate_cc_compare_p)
	    {
	      if (GET_MODE (op0) == DImode)
		emit_insn (gen_x86_negdi_ccc (gen_reg_rtx (DImode), op0));
	      else
		emit_insn (gen_x86_negsi_ccc (gen_reg_rtx (SImode),
					      gen_lowpart (SImode, op0)));

	      tmp = gen_reg_rtx (mode);
	      if (mode == DImode)
		emit_insn (gen_x86_movdicc_0_m1_neg (tmp));
	      else
		emit_insn (gen_x86_movsicc_0_m1_neg (gen_lowpart (SImode,
								  tmp)));
	    }
	  else if (!sign_bit_compare_p)
	    {
	      rtx flags;
	      bool fpcmp = false;

	      compare_code = GET_CODE (compare_op);

	      flags = XEXP (compare_op, 0);

	      if (GET_MODE (flags) == CCFPmode)
		{
		  fpcmp = true;
		  compare_code
		    = ix86_fp_compare_code_to_integer (compare_code);
		}

	      /* To simplify rest of code, restrict to the GEU case.  */
	      if (compare_code == LTU)
		{
		  std::swap (ct, cf);
		  compare_code = reverse_condition (compare_code);
		  code = reverse_condition (code);
		}
	      else
		{
		  if (fpcmp)
		    PUT_CODE (compare_op,
			      reverse_condition_maybe_unordered
			        (GET_CODE (compare_op)));
		  else
		    PUT_CODE (compare_op,
			      reverse_condition (GET_CODE (compare_op)));
		}
	      diff = ct - cf;

	      if (reg_overlap_mentioned_p (out, compare_op))
		tmp = gen_reg_rtx (mode);

	      if (mode == DImode)
		emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
	      else
		emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
						 flags, compare_op));
	    }
	  else
	    {
	      if (code == GT || code == GE)
		code = reverse_condition (code);
	      else
		{
		  std::swap (ct, cf);
		  diff = ct - cf;
		}
	      tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
	    }

	  if (diff == 1)
	    {
	      /*
	       * cmpl op0,op1
	       * sbbl dest,dest
	       * [addl dest, ct]
	       *
	       * Size 5 - 8.
	       */
	      if (ct)
		tmp = expand_simple_binop (mode, PLUS,
					   tmp, GEN_INT (ct),
					   copy_rtx (tmp), 1, OPTAB_DIRECT);
	    }
	  else if (cf == -1)
	    {
	      /*
	       * cmpl op0,op1
	       * sbbl dest,dest
	       * orl $ct, dest
	       *
	       * Size 8.
	       */
	      tmp = expand_simple_binop (mode, IOR,
					 tmp, GEN_INT (ct),
					 copy_rtx (tmp), 1, OPTAB_DIRECT);
	    }
	  else if (diff == -1 && ct)
	    {
	      /*
	       * cmpl op0,op1
	       * sbbl dest,dest
	       * notl dest
	       * [addl dest, cf]
	       *
	       * Size 8 - 11.
	       */
	      tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
	      if (cf)
		tmp = expand_simple_binop (mode, PLUS,
					   copy_rtx (tmp), GEN_INT (cf),
					   copy_rtx (tmp), 1, OPTAB_DIRECT);
	    }
	  else
	    {
	      /*
	       * cmpl op0,op1
	       * sbbl dest,dest
	       * [notl dest]
	       * andl cf - ct, dest
	       * [addl dest, ct]
	       *
	       * Size 8 - 11.
	       */

	      if (cf == 0)
		{
		  cf = ct;
		  ct = 0;
		  tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
		}

	      tmp = expand_simple_binop (mode, AND,
					 copy_rtx (tmp),
					 gen_int_mode (cf - ct, mode),
					 copy_rtx (tmp), 1, OPTAB_DIRECT);
	      if (ct)
		tmp = expand_simple_binop (mode, PLUS,
					   copy_rtx (tmp), GEN_INT (ct),
					   copy_rtx (tmp), 1, OPTAB_DIRECT);
	    }

	  if (!rtx_equal_p (tmp, out))
	    emit_move_insn (copy_rtx (out), copy_rtx (tmp));

	  return true;
	}

      if (diff < 0)
	{
	  machine_mode cmp_mode = GET_MODE (op0);
	  enum rtx_code new_code;

	  if (SCALAR_FLOAT_MODE_P (cmp_mode))
	    {
	      gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));

	      /* We may be reversing a non-trapping
		 comparison to a trapping comparison.  */
	      if (HONOR_NANS (cmp_mode) && flag_trapping_math
		  && code != EQ && code != NE
		  && code != ORDERED && code != UNORDERED)
		new_code = UNKNOWN;
	      else
		new_code = reverse_condition_maybe_unordered (code);
	    }
	  else
	    new_code = ix86_reverse_condition (code, cmp_mode);
	  if (new_code != UNKNOWN)
	    {
	      std::swap (ct, cf);
	      diff = -diff;
	      code = new_code;
	    }
	}

      compare_code = UNKNOWN;
      if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
	  && CONST_INT_P (op1))
	{
	  if (op1 == const0_rtx
	      && (code == LT || code == GE))
	    compare_code = code;
	  else if (op1 == constm1_rtx)
	    {
	      if (code == LE)
		compare_code = LT;
	      else if (code == GT)
		compare_code = GE;
	    }
	}

      /* Optimize dest = (op0 < 0) ? -1 : cf.  */
      if (compare_code != UNKNOWN
	  && GET_MODE (op0) == GET_MODE (out)
	  && (cf == -1 || ct == -1))
	{
	  /* If lea code below could be used, only optimize
	     if it results in a 2 insn sequence.  */

	  if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
		 || diff == 3 || diff == 5 || diff == 9)
	      || (compare_code == LT && ct == -1)
	      || (compare_code == GE && cf == -1))
	    {
	      /*
	       * notl op1	(if necessary)
	       * sarl $31, op1
	       * orl cf, op1
	       */
	      if (ct != -1)
		{
		  cf = ct;
		  ct = -1;
		  code = reverse_condition (code);
		}

	      out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);

	      out = expand_simple_binop (mode, IOR,
					 out, GEN_INT (cf),
					 out, 1, OPTAB_DIRECT);
	      if (out != operands[0])
		emit_move_insn (operands[0], out);

	      return true;
	    }
	}

      if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
	   || diff == 3 || diff == 5 || diff == 9)
	  && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
	  && (mode != DImode
	      || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
	{
	  /*
	   * xorl dest,dest
	   * cmpl op1,op2
	   * setcc dest
	   * lea cf(dest*(ct-cf)),dest
	   *
	   * Size 14.
	   *
	   * This also catches the degenerate setcc-only case.
	   */

	  rtx tmp;
	  int nops;

	  out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);

	  nops = 0;
	  /* On x86_64 the lea instruction operates on Pmode, so we need
	     to get arithmetics done in proper mode to match.  */
	  if (diff == 1)
	    tmp = copy_rtx (out);
	  else
	    {
	      rtx out1;
	      out1 = copy_rtx (out);
	      tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
	      nops++;
	      if (diff & 1)
		{
		  tmp = gen_rtx_PLUS (mode, tmp, out1);
		  nops++;
		}
	    }
	  if (cf != 0)
	    {
	      tmp = plus_constant (mode, tmp, cf);
	      nops++;
	    }
	  if (!rtx_equal_p (tmp, out))
	    {
	      if (nops == 1)
		out = force_operand (tmp, copy_rtx (out));
	      else
		emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
	    }
	  if (!rtx_equal_p (out, operands[0]))
	    emit_move_insn (operands[0], copy_rtx (out));

	  return true;
	}

      /*
       * General case:			Jumpful:
       *   xorl dest,dest		cmpl op1, op2
       *   cmpl op1, op2		movl ct, dest
       *   setcc dest			jcc 1f
       *   decl dest			movl cf, dest
       *   andl (cf-ct),dest		1:
       *   addl ct,dest
       *
       * Size 20.			Size 14.
       *
       * This is reasonably steep, but branch mispredict costs are
       * high on modern cpus, so consider failing only if optimizing
       * for space.
       */

      if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
	  && BRANCH_COST (optimize_insn_for_speed_p (),
			  false) >= 2)
	{
	  if (cf == 0)
	    {
	      machine_mode cmp_mode = GET_MODE (op0);
	      enum rtx_code new_code;

	      if (SCALAR_FLOAT_MODE_P (cmp_mode))
		{
		  gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));

		  /* We may be reversing a non-trapping
		     comparison to a trapping comparison.  */
		  if (HONOR_NANS (cmp_mode) && flag_trapping_math
		      && code != EQ && code != NE
		      && code != ORDERED && code != UNORDERED)
		    new_code = UNKNOWN;
		  else
		    new_code = reverse_condition_maybe_unordered (code);
		}
	      else
		{
		  new_code = ix86_reverse_condition (code, cmp_mode);
		  if (compare_code != UNKNOWN && new_code != UNKNOWN)
		    compare_code = reverse_condition (compare_code);
		}

	      if (new_code != UNKNOWN)
		{
		  cf = ct;
		  ct = 0;
		  code = new_code;
		}
	    }

	  if (compare_code != UNKNOWN)
	    {
	      /* notl op1	(if needed)
		 sarl $31, op1
		 andl (cf-ct), op1
		 addl ct, op1

		 For x < 0 (resp. x <= -1) there will be no notl,
		 so if possible swap the constants to get rid of the
		 complement.
		 True/false will be -1/0 while code below (store flag
		 followed by decrement) is 0/-1, so the constants need
		 to be exchanged once more.  */

	      if (compare_code == GE || !cf)
		{
		  code = reverse_condition (code);
		  std::swap (ct, cf);
		}

	      out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
	    }
	  else
	    {
	      out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);

	      out = expand_simple_binop (mode, PLUS, copy_rtx (out),
					 constm1_rtx,
					 copy_rtx (out), 1, OPTAB_DIRECT);
	    }

	  out = expand_simple_binop (mode, AND, copy_rtx (out),
				     gen_int_mode (cf - ct, mode),
				     copy_rtx (out), 1, OPTAB_DIRECT);
	  if (ct)
	    out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
				       copy_rtx (out), 1, OPTAB_DIRECT);
	  if (!rtx_equal_p (out, operands[0]))
	    emit_move_insn (operands[0], copy_rtx (out));

	  return true;
	}
    }

  if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
    {
      /* Try a few things more with specific constants and a variable.  */

      optab op;
      rtx var, orig_out, out, tmp;

      if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
	return false;

      /* If one of the two operands is an interesting constant, load a
	 constant with the above and mask it in with a logical operation.  */

      if (CONST_INT_P (operands[2]))
	{
	  var = operands[3];
	  if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
	    operands[3] = constm1_rtx, op = and_optab;
	  else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
	    operands[3] = const0_rtx, op = ior_optab;
	  else
	    return false;
	}
      else if (CONST_INT_P (operands[3]))
	{
	  var = operands[2];
	  if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
	    {
	      /* For smin (x, 0), expand as "x < 0 ? x : 0" instead of
		 "x <= 0 ? x : 0" to enable sign_bit_compare_p.  */
	      if (code == LE && op1 == const0_rtx && rtx_equal_p (op0, var))
		operands[1] = simplify_gen_relational (LT, VOIDmode,
						       GET_MODE (op0),
						       op0, const0_rtx);

	      operands[2] = constm1_rtx;
	      op = and_optab;
	    }
	  else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
	    operands[2] = const0_rtx, op = ior_optab;
	  else
	    return false;
	}
      else
	return false;

      orig_out = operands[0];
      tmp = gen_reg_rtx (mode);
      operands[0] = tmp;

      /* Recurse to get the constant loaded.  */
      if (!ix86_expand_int_movcc (operands))
	return false;

      /* Mask in the interesting variable.  */
      out = expand_binop (mode, op, var, tmp, orig_out, 0,
			  OPTAB_WIDEN);
      if (!rtx_equal_p (out, orig_out))
	emit_move_insn (copy_rtx (orig_out), copy_rtx (out));

      return true;
    }

  /*
   * For comparison with above,
   *
   * movl cf,dest
   * movl ct,tmp
   * cmpl op1,op2
   * cmovcc tmp,dest
   *
   * Size 15.
   */

  if (! nonimmediate_operand (operands[2], mode))
    operands[2] = force_reg (mode, operands[2]);
  if (! nonimmediate_operand (operands[3], mode))
    operands[3] = force_reg (mode, operands[3]);

  if (! register_operand (operands[2], VOIDmode)
      && (mode == QImode
	  || ! register_operand (operands[3], VOIDmode)))
    operands[2] = force_reg (mode, operands[2]);

  if (mode == QImode
      && ! register_operand (operands[3], VOIDmode))
    operands[3] = force_reg (mode, operands[3]);

  emit_insn (compare_seq);
  emit_insn (gen_rtx_SET (operands[0],
			  gen_rtx_IF_THEN_ELSE (mode,
						compare_op, operands[2],
						operands[3])));
  return true;
}
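
/* Illustrative sketch, not part of GCC: the branchless select that the
   setcc/sbb sequences above compute for two constants.  A 0/-1 mask
   derived from the condition picks between CT and CF without a jump.
   Hypothetical helper.  */

static int32_t
sketch_movcc_const (bool cond, int32_t ct, int32_t cf)
{
  int32_t mask = (int32_t) cond - 1;	/* setcc; decl: 0 if COND, else -1 */
  return (mask & (cf - ct)) + ct;	/* andl (cf-ct); addl ct */
}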
/* Detect conditional moves that exactly match min/max operational
   semantics.  Note that this is IEEE safe, as long as we don't
   interchange the operands.

   Returns FALSE if this conditional move doesn't match a MIN/MAX,
   and TRUE if the operation is successful and instructions are emitted.  */

static bool
ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
			   rtx cmp_op1, rtx if_true, rtx if_false)
{
  machine_mode mode;
  bool is_min;
  rtx tmp;

  if (code == LT)
    ;
  else if (code == UNGE)
    std::swap (if_true, if_false);
  else
    return false;

  if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
    is_min = true;
  else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
    is_min = false;
  else
    return false;

  mode = GET_MODE (dest);

  /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
     but MODE may be a vector mode and thus not appropriate.  */
  if (!flag_finite_math_only || flag_signed_zeros)
    {
      int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
      rtvec v;

      if_true = force_reg (mode, if_true);
      v = gen_rtvec (2, if_true, if_false);
      tmp = gen_rtx_UNSPEC (mode, v, u);
    }
  else
    {
      code = is_min ? SMIN : SMAX;
      if (MEM_P (if_true) && MEM_P (if_false))
	if_true = force_reg (mode, if_true);
      tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
    }

  emit_insn (gen_rtx_SET (dest, tmp));
  return true;
}
/* Return true if MODE is valid for vector compare to mask register,
   Same result for conditional vector move with mask register.  */

static bool
ix86_valid_mask_cmp_mode (machine_mode mode)
{
  /* XOP has its own vector conditional movement.  */
  if (TARGET_XOP && !TARGET_AVX512F)
    return false;

  /* HFmode only supports vcmpsh whose dest is mask register.  */
  if (TARGET_AVX512FP16 && mode == HFmode)
    return true;

  /* AVX512F is needed for mask operation.  */
  if (!(TARGET_AVX512F && VECTOR_MODE_P (mode)))
    return false;

  /* AVX512BW is needed for vector QI/HImode,
     AVX512VL is needed for 128/256-bit vector.  */
  machine_mode inner_mode = GET_MODE_INNER (mode);
  int vector_size = GET_MODE_SIZE (mode);
  if ((inner_mode == QImode || inner_mode == HImode) && !TARGET_AVX512BW)
    return false;

  return vector_size == 64 || TARGET_AVX512VL;
}
/* Return true if integer mask comparison should be used.  */

static bool
ix86_use_mask_cmp_p (machine_mode mode, machine_mode cmp_mode,
		     rtx op_true, rtx op_false)
{
  int vector_size = GET_MODE_SIZE (mode);

  if (cmp_mode == HFmode)
    return true;
  else if (vector_size < 16)
    return false;
  else if (vector_size == 64)
    return true;
  else if (GET_MODE_INNER (cmp_mode) == HFmode)
    return true;

  /* When op_true is NULL, op_false must be NULL, or vice versa.  */
  gcc_assert (!op_true == !op_false);

  /* When op_true/op_false is NULL or cmp_mode is not valid mask cmp mode,
     vector dest is required.  */
  if (!op_true || !ix86_valid_mask_cmp_mode (cmp_mode))
    return false;

  /* Exclude those that could be optimized in ix86_expand_sse_movcc.  */
  if (op_false == CONST0_RTX (mode)
      || op_true == CONST0_RTX (mode)
      || (INTEGRAL_MODE_P (mode)
	  && (op_true == CONSTM1_RTX (mode)
	      || op_false == CONSTM1_RTX (mode))))
    return false;

  return true;
}
/* Expand an SSE comparison.  Return the register with the result.  */

static rtx
ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
		     rtx op_true, rtx op_false)
{
  machine_mode mode = GET_MODE (dest);
  machine_mode cmp_ops_mode = GET_MODE (cmp_op0);

  /* In general case result of comparison can differ from operands' type.  */
  machine_mode cmp_mode;

  /* In AVX512F the result of comparison is an integer mask.  */
  bool maskcmp = false;
  rtx x;

  if (ix86_use_mask_cmp_p (mode, cmp_ops_mode, op_true, op_false))
    {
      unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
      maskcmp = true;
      cmp_mode = nbits > 8 ? int_mode_for_size (nbits, 0).require () : E_QImode;
    }
  else
    cmp_mode = cmp_ops_mode;

  cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);

  bool (*op1_predicate)(rtx, machine_mode)
    = VECTOR_MODE_P (cmp_ops_mode) ? vector_operand : nonimmediate_operand;

  if (!op1_predicate (cmp_op1, cmp_ops_mode))
    cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);

  if (optimize
      || (maskcmp && cmp_mode != mode)
      || (op_true && reg_overlap_mentioned_p (dest, op_true))
      || (op_false && reg_overlap_mentioned_p (dest, op_false)))
    dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);

  if (maskcmp)
    {
      bool ok = ix86_expand_mask_vec_cmp (dest, code, cmp_op0, cmp_op1);
      gcc_assert (ok);
      return dest;
    }

  x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);

  if (cmp_mode != mode)
    {
      x = force_reg (cmp_ops_mode, x);
      convert_move (dest, x, false);
    }
  else
    emit_insn (gen_rtx_SET (dest, x));

  return dest;
}
/* Emit x86 binary operand CODE in mode MODE for SSE vector
   instructions that can be performed using GP registers.  */

static void
ix86_emit_vec_binop (enum rtx_code code, machine_mode mode,
		     rtx dst, rtx src1, rtx src2)
{
  rtx tmp;

  tmp = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));

  if (GET_MODE_SIZE (mode) <= GET_MODE_SIZE (SImode)
      && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
    {
      rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
    }

  emit_insn (tmp);
}
/* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
   operations.  This is used for both scalar and vector conditional moves.  */

void
ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
{
  machine_mode mode = GET_MODE (dest);
  machine_mode cmpmode = GET_MODE (cmp);
  rtx x;

  /* Simplify trivial VEC_COND_EXPR to avoid ICE in pr97506.  */
  if (rtx_equal_p (op_true, op_false))
    {
      emit_move_insn (dest, op_true);
      return;
    }

  /* If we have an integer mask and FP value then we need
     to cast mask to FP mode.  */
  if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
    {
      cmp = force_reg (cmpmode, cmp);
      cmp = gen_rtx_SUBREG (mode, cmp, 0);
    }

  /* In AVX512F the result of comparison is an integer mask.  */
  if (mode != cmpmode
      && GET_MODE_CLASS (cmpmode) == MODE_INT)
    {
      gcc_assert (ix86_valid_mask_cmp_mode (mode));
      /* Using scalar/vector move with mask register.  */
      cmp = force_reg (cmpmode, cmp);
      /* Optimize for mask zero.  */
      op_true = (op_true != CONST0_RTX (mode)
		 ? force_reg (mode, op_true) : op_true);
      op_false = (op_false != CONST0_RTX (mode)
		  ? force_reg (mode, op_false) : op_false);
      if (op_true == CONST0_RTX (mode))
	{
	  if (cmpmode == E_DImode && !TARGET_64BIT)
	    {
	      x = gen_reg_rtx (cmpmode);
	      emit_insn (gen_knotdi (x, cmp));
	    }
	  else
	    x = expand_simple_unop (cmpmode, NOT, cmp, NULL, 1);
	  cmp = x;
	  /* Reverse op_true op_false.  */
	  std::swap (op_true, op_false);
	}

      if (mode == HFmode)
	emit_insn (gen_movhf_mask (dest, op_true, op_false, cmp));
      else
	emit_insn (gen_rtx_SET (dest,
				gen_rtx_VEC_MERGE (mode,
						   op_true, op_false, cmp)));
      return;
    }

  if (vector_all_ones_operand (op_true, mode)
      && op_false == CONST0_RTX (mode))
    {
      emit_move_insn (dest, cmp);
      return;
    }
  else if (op_false == CONST0_RTX (mode))
    {
      x = expand_simple_binop (mode, AND, cmp, op_true,
			       dest, 1, OPTAB_DIRECT);
      if (x != dest)
	emit_move_insn (dest, x);
      return;
    }
  else if (op_true == CONST0_RTX (mode))
    {
      op_false = force_reg (mode, op_false);
      x = gen_rtx_NOT (mode, cmp);
      ix86_emit_vec_binop (AND, mode, dest, x, op_false);
      return;
    }
  else if (vector_all_ones_operand (op_true, mode))
    {
      x = expand_simple_binop (mode, IOR, cmp, op_false,
			       dest, 1, OPTAB_DIRECT);
      if (x != dest)
	emit_move_insn (dest, x);
      return;
    }

  if (TARGET_XOP)
    {
      op_true = force_reg (mode, op_true);

      if (GET_MODE_SIZE (mode) < 16
	  || !nonimmediate_operand (op_false, mode))
	op_false = force_reg (mode, op_false);

      emit_insn (gen_rtx_SET (dest,
			      gen_rtx_IF_THEN_ELSE (mode, cmp,
						    op_true, op_false)));
      return;
    }

  rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
  machine_mode blend_mode = mode;

  if (GET_MODE_SIZE (mode) < 16
      || !vector_operand (op_true, mode))
    op_true = force_reg (mode, op_true);

  op_false = force_reg (mode, op_false);

  switch (mode)
    {
    case E_V2SFmode:
      if (TARGET_SSE4_1)
	gen = gen_mmx_blendvps;
      break;
    case E_V4SFmode:
      if (TARGET_SSE4_1)
	gen = gen_sse4_1_blendvps;
      break;
    case E_V2DFmode:
      if (TARGET_SSE4_1)
	gen = gen_sse4_1_blendvpd;
      break;
    case E_SFmode:
      if (TARGET_SSE4_1)
	gen = gen_sse4_1_blendvss;
      break;
    case E_DFmode:
      if (TARGET_SSE4_1)
	gen = gen_sse4_1_blendvsd;
      break;
    case E_V8QImode:
    case E_V4HImode:
    case E_V2SImode:
      if (TARGET_SSE4_1)
	{
	  gen = gen_mmx_pblendvb_v8qi;
	  blend_mode = V8QImode;
	}
      break;
    case E_V4QImode:
    case E_V2HImode:
      if (TARGET_SSE4_1)
	{
	  gen = gen_mmx_pblendvb_v4qi;
	  blend_mode = V4QImode;
	}
      break;
    case E_V2QImode:
      if (TARGET_SSE4_1)
	gen = gen_mmx_pblendvb_v2qi;
      break;
    case E_V16QImode:
    case E_V8HImode:
    case E_V8HFmode:
    case E_V8BFmode:
    case E_V4SImode:
    case E_V2DImode:
      if (TARGET_SSE4_1)
	{
	  gen = gen_sse4_1_pblendvb;
	  blend_mode = V16QImode;
	}
      break;
    case E_V8SFmode:
      if (TARGET_AVX)
	gen = gen_avx_blendvps256;
      break;
    case E_V4DFmode:
      if (TARGET_AVX)
	gen = gen_avx_blendvpd256;
      break;
    case E_V32QImode:
    case E_V16HImode:
    case E_V16HFmode:
    case E_V16BFmode:
    case E_V8SImode:
    case E_V4DImode:
      if (TARGET_AVX2)
	{
	  gen = gen_avx2_pblendvb;
	  blend_mode = V32QImode;
	}
      break;

    case E_V64QImode:
      gen = gen_avx512bw_blendmv64qi;
      break;
    case E_V32HImode:
      gen = gen_avx512bw_blendmv32hi;
      break;
    case E_V32HFmode:
      gen = gen_avx512bw_blendmv32hf;
      break;
    case E_V32BFmode:
      gen = gen_avx512bw_blendmv32bf;
      break;
    case E_V16SImode:
      gen = gen_avx512f_blendmv16si;
      break;
    case E_V8DImode:
      gen = gen_avx512f_blendmv8di;
      break;
    case E_V8DFmode:
      gen = gen_avx512f_blendmv8df;
      break;
    case E_V16SFmode:
      gen = gen_avx512f_blendmv16sf;
      break;

    default:
      break;
    }

  if (gen != NULL)
    {
      if (blend_mode == mode)
	x = dest;
      else
	{
	  x = gen_reg_rtx (blend_mode);
	  op_false = gen_lowpart (blend_mode, op_false);
	  op_true = gen_lowpart (blend_mode, op_true);
	  cmp = gen_lowpart (blend_mode, cmp);
	}

      emit_insn (gen (x, op_false, op_true, cmp));

      if (x != dest)
	emit_move_insn (dest, gen_lowpart (mode, x));
    }
  else
    {
      rtx t2, t3;

      t2 = expand_simple_binop (mode, AND, op_true, cmp,
				NULL, 1, OPTAB_DIRECT);

      t3 = gen_reg_rtx (mode);
      x = gen_rtx_NOT (mode, cmp);
      ix86_emit_vec_binop (AND, mode, t3, x, op_false);

      x = expand_simple_binop (mode, IOR, t3, t2,
			       dest, 1, OPTAB_DIRECT);
      if (x != dest)
	emit_move_insn (dest, x);
    }
}
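
/* Illustrative sketch, not part of GCC: the pand/pandn/por fallback at
   the bottom of the function, shown on one 64-bit lane.  CMP must hold
   all-ones where the predicate is true.  Hypothetical helper.  */

static uint64_t
sketch_mask_blend (uint64_t cmp, uint64_t t, uint64_t f)
{
  uint64_t t2 = t & cmp;	/* pand: true value where mask set */
  uint64_t t3 = f & ~cmp;	/* pandn: false value elsewhere */
  return t2 | t3;		/* por: combine the two halves */
}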
/* Swap, force into registers, or otherwise massage the two operands
   to an sse comparison with a mask result.  Thus we differ a bit from
   ix86_prepare_fp_compare_args which expects to produce a flags result.

   The DEST operand exists to help determine whether to commute commutative
   operators.  The POP0/POP1 operands are updated in place.  The new
   comparison code is returned, or UNKNOWN if not implementable.  */

static enum rtx_code
ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
				  rtx *pop0, rtx *pop1)
{
  switch (code)
    {
    case LTGT:
    case UNEQ:
      /* AVX supports all the needed comparisons.  */
      if (TARGET_AVX)
	break;
      /* We have no LTGT as an operator.  We could implement it with
	 NE & ORDERED, but this requires an extra temporary.  It's
	 not clear that it's worth it.  */
      return UNKNOWN;

    case LT:
    case LE:
    case UNGT:
    case UNGE:
      /* These are supported directly.  */
      break;

    case EQ:
    case NE:
    case UNORDERED:
    case ORDERED:
      /* AVX has 3 operand comparisons, no need to swap anything.  */
      if (TARGET_AVX)
	break;
      /* For commutative operators, try to canonicalize the destination
	 operand to be first in the comparison - this helps reload to
	 avoid extra moves.  */
      if (!dest || !rtx_equal_p (dest, *pop1))
	break;
      /* FALLTHRU */

    case GE:
    case GT:
    case UNLE:
    case UNLT:
      /* These are not supported directly before AVX, and furthermore
	 ix86_expand_sse_fp_minmax only optimizes LT/UNGE.  Swap the
	 comparison operands to transform into something that is
	 supported.  */
      std::swap (*pop0, *pop1);
      code = swap_condition (code);
      break;

    default:
      gcc_unreachable ();
    }

  return code;
}
/* Expand a floating-point conditional move.  Return true if successful.  */

bool
ix86_expand_fp_movcc (rtx operands[])
{
  machine_mode mode = GET_MODE (operands[0]);
  enum rtx_code code = GET_CODE (operands[1]);
  rtx tmp, compare_op;
  rtx op0 = XEXP (operands[1], 0);
  rtx op1 = XEXP (operands[1], 1);

  if (GET_MODE (op0) == BFmode
      && !ix86_fp_comparison_operator (operands[1], VOIDmode))
    return false;

  if (SSE_FLOAT_MODE_SSEMATH_OR_HF_P (mode))
    {
      machine_mode cmode;

      /* Since we've no cmove for sse registers, don't force bad register
	 allocation just to gain access to it.  Deny movcc when the
	 comparison mode doesn't match the move mode.  */
      cmode = GET_MODE (op0);
      if (cmode == VOIDmode)
	cmode = GET_MODE (op1);
      if (cmode != mode)
	return false;

      code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
      if (code == UNKNOWN)
	return false;

      if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
				     operands[2], operands[3]))
	return true;

      tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
				 operands[2], operands[3]);
      ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
      return true;
    }

  if (GET_MODE (op0) == TImode
      || (GET_MODE (op0) == DImode
	  && !TARGET_64BIT))
    return false;

  /* The floating point conditional move instructions don't directly
     support conditions resulting from a signed integer comparison.  */

  compare_op = ix86_expand_compare (code, op0, op1);
  if (!fcmov_comparison_operator (compare_op, VOIDmode))
    {
      tmp = gen_reg_rtx (QImode);
      ix86_expand_setcc (tmp, code, op0, op1);

      compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
    }

  emit_insn (gen_rtx_SET (operands[0],
			  gen_rtx_IF_THEN_ELSE (mode, compare_op,
						operands[2], operands[3])));

  return true;
}
/* Helper for ix86_cmp_code_to_pcmp_immediate for int modes.  */

static int
ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
{
  switch (code)
    {
    case EQ:
      return 0;
    case LT:
    case LTU:
      return 1;
    case LE:
    case LEU:
      return 2;
    case NE:
      return 4;
    case GE:
    case GEU:
      return 5;
    case GT:
    case GTU:
      return 6;
    default:
      gcc_unreachable ();
    }
}

/* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes.  */

static int
ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
{
  switch (code)
    {
    case EQ:
      return 0x00;
    case NE:
      return 0x04;
    case GT:
      return 0x0e;
    case LE:
      return 0x02;
    case GE:
      return 0x0d;
    case LT:
      return 0x01;
    case UNLE:
      return 0x0a;
    case UNLT:
      return 0x09;
    case UNGE:
      return 0x05;
    case UNGT:
      return 0x06;
    case UNEQ:
      return 0x08;
    case LTGT:
      return 0x0c;
    case ORDERED:
      return 0x07;
    case UNORDERED:
      return 0x03;
    default:
      gcc_unreachable ();
    }
}

/* Return immediate value to be used in UNSPEC_PCMP
   for comparison CODE in MODE.  */

static int
ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
{
  if (FLOAT_MODE_P (mode))
    return ix86_fp_cmp_code_to_pcmp_immediate (code);
  return ix86_int_cmp_code_to_pcmp_immediate (code);
}
4529 ix86_expand_mask_vec_cmp (rtx dest
, enum rtx_code code
, rtx cmp_op0
, rtx cmp_op1
)
4531 machine_mode mask_mode
= GET_MODE (dest
);
4532 machine_mode cmp_mode
= GET_MODE (cmp_op0
);
4533 rtx imm
= GEN_INT (ix86_cmp_code_to_pcmp_immediate (code
, cmp_mode
));
4543 unspec_code
= UNSPEC_UNSIGNED_PCMP
;
4547 unspec_code
= UNSPEC_PCMP
;
4550 unspec
= gen_rtx_UNSPEC (mask_mode
, gen_rtvec (3, cmp_op0
, cmp_op1
, imm
),
4552 emit_insn (gen_rtx_SET (dest
, unspec
));
/* Expand fp vector comparison.  */

bool
ix86_expand_fp_vec_cmp (rtx operands[])
{
  enum rtx_code code = GET_CODE (operands[1]);
  rtx cmp;

  code = ix86_prepare_sse_fp_compare_args (operands[0], code,
					   &operands[2], &operands[3]);
  if (code == UNKNOWN)
    {
      rtx temp;
      switch (GET_CODE (operands[1]))
	{
	case LTGT:
	  temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
				      operands[3], NULL, NULL);
	  cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
				     operands[3], NULL, NULL);
	  code = AND;
	  break;
	case UNEQ:
	  temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
				      operands[3], NULL, NULL);
	  cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
				     operands[3], NULL, NULL);
	  code = IOR;
	  break;
	default:
	  gcc_unreachable ();
	}
      cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
				 OPTAB_DIRECT);
    }
  else
    cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
			       NULL, NULL);

  if (operands[0] != cmp)
    emit_move_insn (operands[0], cmp);

  return true;
}
static rtx
ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
			 rtx op_true, rtx op_false, bool *negate)
{
  machine_mode data_mode = GET_MODE (dest);
  machine_mode mode = GET_MODE (cop0);
  rtx x;

  *negate = false;

  /* XOP supports all of the comparisons on all 128-bit vector int types.  */
  if (TARGET_XOP
      && GET_MODE_CLASS (mode) == MODE_VECTOR_INT
      && GET_MODE_SIZE (mode) <= 16)
    ;
  /* AVX512F supports all of the comparisons
     on all 128/256/512-bit vector int types.  */
  else if (ix86_use_mask_cmp_p (data_mode, mode, op_true, op_false))
    ;
  else
    {
      /* Canonicalize the comparison to EQ, GT, GTU.  */
      switch (code)
	{
	case EQ:
	case GT:
	case GTU:
	  break;

	case LE:
	case LEU:
	  /* x <= cst can be handled as x < cst + 1 unless there is
	     wrap around in cst + 1.  */
	  if (GET_CODE (cop1) == CONST_VECTOR
	      && GET_MODE_INNER (mode) != TImode)
	    {
	      unsigned int n_elts = GET_MODE_NUNITS (mode), i;
	      machine_mode eltmode = GET_MODE_INNER (mode);
	      for (i = 0; i < n_elts; ++i)
		{
		  rtx elt = CONST_VECTOR_ELT (cop1, i);
		  if (!CONST_INT_P (elt))
		    break;
		  if (code == LE)
		    {
		      /* For LE punt if some element is signed maximum.  */
		      if ((INTVAL (elt) & (GET_MODE_MASK (eltmode) >> 1))
			  == (GET_MODE_MASK (eltmode) >> 1))
			break;
		    }
		  /* For LEU punt if some element is unsigned maximum.  */
		  else if (elt == constm1_rtx)
		    break;
		}
	      if (i == n_elts)
		{
		  rtvec v = rtvec_alloc (n_elts);
		  for (i = 0; i < n_elts; ++i)
		    RTVEC_ELT (v, i)
		      = gen_int_mode (INTVAL (CONST_VECTOR_ELT (cop1, i)) + 1,
				      eltmode);
		  cop1 = gen_rtx_CONST_VECTOR (mode, v);
		  std::swap (cop0, cop1);
		  code = code == LE ? GT : GTU;
		  break;
		}
	    }
	  /* FALLTHRU */
	case NE:
	  code = reverse_condition (code);
	  *negate = true;
	  break;

	case GE:
	case GEU:
	  /* x >= cst can be handled as x > cst - 1 unless there is
	     wrap around in cst - 1.  */
	  if (GET_CODE (cop1) == CONST_VECTOR
	      && GET_MODE_INNER (mode) != TImode)
	    {
	      unsigned int n_elts = GET_MODE_NUNITS (mode), i;
	      machine_mode eltmode = GET_MODE_INNER (mode);
	      for (i = 0; i < n_elts; ++i)
		{
		  rtx elt = CONST_VECTOR_ELT (cop1, i);
		  if (!CONST_INT_P (elt))
		    break;
		  if (code == GE)
		    {
		      /* For GE punt if some element is signed minimum.  */
		      if (INTVAL (elt) < 0
			  && ((INTVAL (elt) & (GET_MODE_MASK (eltmode) >> 1))
			      == 0))
			break;
		    }
		  /* For GEU punt if some element is zero.  */
		  else if (elt == const0_rtx)
		    break;
		}
	      if (i == n_elts)
		{
		  rtvec v = rtvec_alloc (n_elts);
		  for (i = 0; i < n_elts; ++i)
		    RTVEC_ELT (v, i)
		      = gen_int_mode (INTVAL (CONST_VECTOR_ELT (cop1, i)) - 1,
				      eltmode);
		  cop1 = gen_rtx_CONST_VECTOR (mode, v);
		  code = code == GE ? GT : GTU;
		  break;
		}
	    }
	  code = reverse_condition (code);
	  *negate = true;
	  /* FALLTHRU */

	case LT:
	case LTU:
	  std::swap (cop0, cop1);
	  code = swap_condition (code);
	  break;

	default:
	  gcc_unreachable ();
	}

      /* Only SSE4.1/SSE4.2 supports V2DImode.  */
      if (mode == V2DImode)
	{
	  switch (code)
	    {
	    case EQ:
	      /* SSE4.1 supports EQ.  */
	      if (!TARGET_SSE4_1)
		return NULL;
	      break;

	    case GT:
	    case GTU:
	      /* SSE4.2 supports GT/GTU.  */
	      if (!TARGET_SSE4_2)
		return NULL;
	      break;

	    default:
	      gcc_unreachable ();
	    }
	}

      if (GET_CODE (cop0) == CONST_VECTOR)
	cop0 = force_reg (mode, cop0);
      else if (GET_CODE (cop1) == CONST_VECTOR)
	cop1 = force_reg (mode, cop1);

      rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode);
      rtx opfalse = op_false ? op_false : CONST0_RTX (data_mode);
      if (*negate)
	std::swap (optrue, opfalse);

      /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when
	 not using integer masks into min (x, y) == x ? -1 : 0 (i.e.
	 min (x, y) == x).  While we add one instruction (the minimum),
	 we remove the need for two instructions in the negation, as the
	 result is done this way.
	 When using masks, do it for SI/DImode element types, as it is shorter
	 than the two subtractions.  */
      if ((code != EQ
	   && GET_MODE_SIZE (mode) != 64
	   && vector_all_ones_operand (opfalse, data_mode)
	   && optrue == CONST0_RTX (data_mode))
	  || (code == GTU
	      && GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4
	      /* Don't do it if not using integer masks and we'd end up with
		 the right values in the registers though.  */
	      && (GET_MODE_SIZE (mode) == 64
		  || !vector_all_ones_operand (optrue, data_mode)
		  || opfalse != CONST0_RTX (data_mode))))
	{
	  rtx (*gen) (rtx, rtx, rtx) = NULL;

	  switch (mode)
	    {
	    case E_V16SImode:
	      gen = (code == GTU) ? gen_uminv16si3 : gen_sminv16si3;
	      break;
	    case E_V8DImode:
	      gen = (code == GTU) ? gen_uminv8di3 : gen_sminv8di3;
	      cop0 = force_reg (mode, cop0);
	      cop1 = force_reg (mode, cop1);
	      break;
	    case E_V32QImode:
	      if (TARGET_AVX2)
		gen = (code == GTU) ? gen_uminv32qi3 : gen_sminv32qi3;
	      break;
	    case E_V16HImode:
	      if (TARGET_AVX2)
		gen = (code == GTU) ? gen_uminv16hi3 : gen_sminv16hi3;
	      break;
	    case E_V8SImode:
	      if (TARGET_AVX2)
		gen = (code == GTU) ? gen_uminv8si3 : gen_sminv8si3;
	      break;
	    case E_V4DImode:
	      if (TARGET_AVX512VL)
		{
		  gen = (code == GTU) ? gen_uminv4di3 : gen_sminv4di3;
		  cop0 = force_reg (mode, cop0);
		  cop1 = force_reg (mode, cop1);
		}
	      break;
	    case E_V16QImode:
	      if (code == GTU && TARGET_SSE2)
		gen = gen_uminv16qi3;
	      else if (code == GT && TARGET_SSE4_1)
		gen = gen_sminv16qi3;
	      break;
	    case E_V8QImode:
	      if (code == GTU && TARGET_SSE2)
		gen = gen_uminv8qi3;
	      else if (code == GT && TARGET_SSE4_1)
		gen = gen_sminv8qi3;
	      break;
	    case E_V4QImode:
	      if (code == GTU && TARGET_SSE2)
		gen = gen_uminv4qi3;
	      else if (code == GT && TARGET_SSE4_1)
		gen = gen_sminv4qi3;
	      break;
	    case E_V2QImode:
	      if (code == GTU && TARGET_SSE2)
		gen = gen_uminv2qi3;
	      else if (code == GT && TARGET_SSE4_1)
		gen = gen_sminv2qi3;
	      break;
	    case E_V8HImode:
	      if (code == GTU && TARGET_SSE4_1)
		gen = gen_uminv8hi3;
	      else if (code == GT && TARGET_SSE2)
		gen = gen_sminv8hi3;
	      break;
	    case E_V4HImode:
	      if (code == GTU && TARGET_SSE4_1)
		gen = gen_uminv4hi3;
	      else if (code == GT && TARGET_SSE2)
		gen = gen_sminv4hi3;
	      break;
	    case E_V2HImode:
	      if (code == GTU && TARGET_SSE4_1)
		gen = gen_uminv2hi3;
	      else if (code == GT && TARGET_SSE2)
		gen = gen_sminv2hi3;
	      break;
	    case E_V4SImode:
	      if (TARGET_SSE4_1)
		gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3;
	      break;
	    case E_V2SImode:
	      if (TARGET_SSE4_1)
		gen = (code == GTU) ? gen_uminv2si3 : gen_sminv2si3;
	      break;
	    case E_V2DImode:
	      if (TARGET_AVX512VL)
		{
		  gen = (code == GTU) ? gen_uminv2di3 : gen_sminv2di3;
		  cop0 = force_reg (mode, cop0);
		  cop1 = force_reg (mode, cop1);
		}
	      break;
	    default:
	      break;
	    }

	  if (gen)
	    {
	      rtx tem = gen_reg_rtx (mode);
	      if (!vector_operand (cop0, mode))
		cop0 = force_reg (mode, cop0);
	      if (!vector_operand (cop1, mode))
		cop1 = force_reg (mode, cop1);
	      *negate = !*negate;
	      emit_insn (gen (tem, cop0, cop1));
	      cop1 = tem;
	      code = EQ;
	    }
	}

      /* Unsigned parallel compare is not supported by the hardware.
	 Play some tricks to turn this into a signed comparison
	 against 0.  */
      if (code == GTU)
	{
	  cop0 = force_reg (mode, cop0);

	  switch (mode)
	    {
	    case E_V16SImode:
	    case E_V8DImode:
	    case E_V8SImode:
	    case E_V4DImode:
	    case E_V4SImode:
	    case E_V2SImode:
	    case E_V2DImode:
	      {
		rtx t1, t2, mask;

		/* Subtract (-(INT MAX) - 1) from both operands to make
		   them signed.  */
		mask = ix86_build_signbit_mask (mode, true, false);
		t1 = gen_reg_rtx (mode);
		emit_insn (gen_sub3_insn (t1, cop0, mask));

		t2 = gen_reg_rtx (mode);
		emit_insn (gen_sub3_insn (t2, cop1, mask));

		cop0 = t1;
		cop1 = t2;
		code = GT;
	      }
	      break;

	    case E_V64QImode:
	    case E_V32HImode:
	    case E_V32QImode:
	    case E_V16HImode:
	    case E_V16QImode:
	    case E_V8QImode:
	    case E_V4QImode:
	    case E_V2QImode:
	    case E_V8HImode:
	    case E_V4HImode:
	    case E_V2HImode:
	      {
		/* Perform a parallel unsigned saturating subtraction.  */
		x = gen_reg_rtx (mode);
		emit_insn (gen_rtx_SET
			   (x, gen_rtx_US_MINUS (mode, cop0, cop1)));
		cop0 = x;
		cop1 = CONST0_RTX (mode);
		code = EQ;
		*negate = !*negate;
	      }
	      break;

	    default:
	      gcc_unreachable ();
	    }
	}
    }

  if (*negate)
    std::swap (op_true, op_false);

  if (GET_CODE (cop1) == CONST_VECTOR)
    cop1 = force_reg (mode, cop1);

  /* Allow the comparison to be done in one mode, but the movcc to
     happen in another mode.  */
  if (data_mode == mode)
    x = ix86_expand_sse_cmp (dest, code, cop0, cop1, op_true, op_false);
  else
    {
      gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
      x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
			       op_true, op_false);
      if (GET_MODE (x) == mode)
	x = gen_lowpart (data_mode, x);
    }

  return x;
}
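
/* Illustrative sketch, not part of GCC: the sign-bit bias used above for
   element types with no unsigned compare instruction.  Subtracting
   0x80000000 from both sides maps unsigned order onto signed order, so
   pcmpgt can be used.  Hypothetical helper on one 32-bit lane; the
   wrapping subtraction and conversion assume two's complement, as GCC
   itself does.  */

static bool
sketch_gtu_via_signed_bias (uint32_t a, uint32_t b)
{
  int32_t sa = (int32_t) (a - UINT32_C (0x80000000));	/* psubd mask lane */
  int32_t sb = (int32_t) (b - UINT32_C (0x80000000));
  return sa > sb;					/* pcmpgtd lane */
}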
/* Expand integer vector comparison.  */

bool
ix86_expand_int_vec_cmp (rtx operands[])
{
  rtx_code code = GET_CODE (operands[1]);
  bool negate = false;
  rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
				     operands[3], NULL, NULL, &negate);

  if (!cmp)
    return false;

  if (negate)
    cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
				   CONST0_RTX (GET_MODE (cmp)),
				   NULL, NULL, &negate);

  gcc_assert (!negate);

  if (operands[0] != cmp)
    emit_move_insn (operands[0], cmp);

  return true;
}
/* Expand a floating-point vector conditional move; a vcond operation
   rather than a movcc operation.  */

bool
ix86_expand_fp_vcond (rtx operands[])
{
  enum rtx_code code = GET_CODE (operands[3]);
  rtx cmp;

  code = ix86_prepare_sse_fp_compare_args (operands[0], code,
					   &operands[4], &operands[5]);
  if (code == UNKNOWN)
    {
      rtx temp;
      switch (GET_CODE (operands[3]))
	{
	case LTGT:
	  temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
				      operands[5], operands[0], operands[0]);
	  cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
				     operands[5], operands[1], operands[2]);
	  code = AND;
	  break;
	case UNEQ:
	  temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
				      operands[5], operands[0], operands[0]);
	  cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
				     operands[5], operands[1], operands[2]);
	  code = IOR;
	  break;
	default:
	  gcc_unreachable ();
	}
      cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
				 OPTAB_DIRECT);
      ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
      return true;
    }

  if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
				 operands[5], operands[1], operands[2]))
    return true;

  cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
			     operands[1], operands[2]);
  ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
  return true;
}
5045 /* Expand a signed/unsigned integral vector conditional move. */
5048 ix86_expand_int_vcond (rtx operands
[])
5050 machine_mode data_mode
= GET_MODE (operands
[0]);
5051 machine_mode mode
= GET_MODE (operands
[4]);
5052 enum rtx_code code
= GET_CODE (operands
[3]);
5053 bool negate
= false;
5059 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
5060 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
5061 if ((code
== LT
|| code
== GE
)
5062 && data_mode
== mode
5063 && cop1
== CONST0_RTX (mode
)
5064 && operands
[1 + (code
== LT
)] == CONST0_RTX (data_mode
)
5065 && GET_MODE_UNIT_SIZE (data_mode
) > 1
5066 && GET_MODE_UNIT_SIZE (data_mode
) <= 8
5067 && (GET_MODE_SIZE (data_mode
) == 16
5068 || (TARGET_AVX2
&& GET_MODE_SIZE (data_mode
) == 32)))
5070 rtx negop
= operands
[2 - (code
== LT
)];
5071 int shift
= GET_MODE_UNIT_BITSIZE (data_mode
) - 1;
5072 if (negop
== CONST1_RTX (data_mode
))
5074 rtx res
= expand_simple_binop (mode
, LSHIFTRT
, cop0
, GEN_INT (shift
),
5075 operands
[0], 1, OPTAB_DIRECT
);
5076 if (res
!= operands
[0])
5077 emit_move_insn (operands
[0], res
);
5080 else if (GET_MODE_INNER (data_mode
) != DImode
5081 && vector_all_ones_operand (negop
, data_mode
))
5083 rtx res
= expand_simple_binop (mode
, ASHIFTRT
, cop0
, GEN_INT (shift
),
5084 operands
[0], 0, OPTAB_DIRECT
);
5085 if (res
!= operands
[0])
5086 emit_move_insn (operands
[0], res
);
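/* Worked scalar forms of the two shortcuts above, for 32-bit elements
   (added for exposition; both rely on the hardware shift semantics):

     x < 0 ? -1 : 0   is   x >> 31             (arithmetic, sign-propagating)
     x < 0 ?  1 : 0   is   (unsigned) x >> 31  (logical, bit-isolating)  */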
5091 if (!nonimmediate_operand (cop1
, mode
))
5092 cop1
= force_reg (mode
, cop1
);
5093 if (!general_operand (operands
[1], data_mode
))
5094 operands
[1] = force_reg (data_mode
, operands
[1]);
5095 if (!general_operand (operands
[2], data_mode
))
5096 operands
[2] = force_reg (data_mode
, operands
[2]);
5098 x
= ix86_expand_int_sse_cmp (operands
[0], code
, cop0
, cop1
,
5099 operands
[1], operands
[2], &negate
);
5104 ix86_expand_sse_movcc (operands
[0], x
, operands
[1+negate
],
5105 operands
[2-negate
]);
5110 ix86_expand_vec_perm_vpermt2 (rtx target
, rtx mask
, rtx op0
, rtx op1
,
5111 struct expand_vec_perm_d
*d
)
/* ix86_expand_vec_perm_vpermt2 is called from both the const and
   non-const expanders, so its arguments are either in d, or in op0,
   op1, etc.  */
5115 machine_mode mode
= GET_MODE (d
? d
->op0
: op0
);
5116 machine_mode maskmode
= mode
;
5117 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
) = NULL
;
5122 if (TARGET_AVX512VL
&& TARGET_AVX512VBMI
)
5123 gen
= gen_avx512vl_vpermt2varv16qi3
;
5126 if (TARGET_AVX512VL
&& TARGET_AVX512VBMI
)
5127 gen
= gen_avx512vl_vpermt2varv32qi3
;
5130 if (TARGET_AVX512VBMI
)
5131 gen
= gen_avx512bw_vpermt2varv64qi3
;
5134 if (TARGET_AVX512VL
&& TARGET_AVX512BW
)
5135 gen
= gen_avx512vl_vpermt2varv8hi3
;
5138 if (TARGET_AVX512VL
&& TARGET_AVX512BW
)
5139 gen
= gen_avx512vl_vpermt2varv16hi3
;
5142 if (TARGET_AVX512BW
)
5143 gen
= gen_avx512bw_vpermt2varv32hi3
;
5146 if (TARGET_AVX512VL
)
5147 gen
= gen_avx512vl_vpermt2varv4si3
;
5150 if (TARGET_AVX512VL
)
5151 gen
= gen_avx512vl_vpermt2varv8si3
;
5155 gen
= gen_avx512f_vpermt2varv16si3
;
5158 if (TARGET_AVX512VL
)
5160 gen
= gen_avx512vl_vpermt2varv4sf3
;
5161 maskmode
= V4SImode
;
5165 if (TARGET_AVX512VL
)
5167 gen
= gen_avx512vl_vpermt2varv8sf3
;
5168 maskmode
= V8SImode
;
5174 gen
= gen_avx512f_vpermt2varv16sf3
;
5175 maskmode
= V16SImode
;
5179 if (TARGET_AVX512VL
)
5180 gen
= gen_avx512vl_vpermt2varv2di3
;
5183 if (TARGET_AVX512VL
)
5184 gen
= gen_avx512vl_vpermt2varv4di3
;
5188 gen
= gen_avx512f_vpermt2varv8di3
;
5191 if (TARGET_AVX512VL
)
5193 gen
= gen_avx512vl_vpermt2varv2df3
;
5194 maskmode
= V2DImode
;
5198 if (TARGET_AVX512VL
)
5200 gen
= gen_avx512vl_vpermt2varv4df3
;
5201 maskmode
= V4DImode
;
5207 gen
= gen_avx512f_vpermt2varv8df3
;
5208 maskmode
= V8DImode
;
/* ix86_expand_vec_perm_vpermt2 is called from both the const and
   non-const expanders, so its arguments are either in d, or in op0,
   op1, etc.  */
5226 for (int i
= 0; i
< d
->nelt
; ++i
)
5227 vec
[i
] = GEN_INT (d
->perm
[i
]);
5228 mask
= gen_rtx_CONST_VECTOR (maskmode
, gen_rtvec_v (d
->nelt
, vec
));
5231 emit_insn (gen (target
, force_reg (maskmode
, mask
), op0
, op1
));
5235 /* Expand a variable vector permutation. */
5238 ix86_expand_vec_perm (rtx operands
[])
5240 rtx target
= operands
[0];
5241 rtx op0
= operands
[1];
5242 rtx op1
= operands
[2];
5243 rtx mask
= operands
[3];
5244 rtx t1
, t2
, t3
, t4
, t5
, t6
, t7
, t8
, vt
, vt2
, vec
[32];
5245 machine_mode mode
= GET_MODE (op0
);
5246 machine_mode maskmode
= GET_MODE (mask
);
5248 bool one_operand_shuffle
= rtx_equal_p (op0
, op1
);
5250 /* Number of elements in the vector. */
5251 w
= GET_MODE_NUNITS (mode
);
5252 e
= GET_MODE_UNIT_SIZE (mode
);
5253 gcc_assert (w
<= 64);
/* For an HFmode vector, convert it to HImode using a subreg.  */
5256 if (GET_MODE_INNER (mode
) == HFmode
)
5258 machine_mode orig_mode
= mode
;
5259 mode
= mode_for_vector (HImode
, w
).require ();
5260 target
= lowpart_subreg (mode
, target
, orig_mode
);
5261 op0
= lowpart_subreg (mode
, op0
, orig_mode
);
5262 op1
= lowpart_subreg (mode
, op1
, orig_mode
);
5265 if (TARGET_AVX512F
&& one_operand_shuffle
)
5267 rtx (*gen
) (rtx
, rtx
, rtx
) = NULL
;
gen = gen_avx512f_permvarv16si;
5274 gen
= gen_avx512f_permvarv16sf
;
5277 gen
= gen_avx512f_permvarv8di
;
5280 gen
= gen_avx512f_permvarv8df
;
5287 emit_insn (gen (target
, op0
, mask
));
5292 if (ix86_expand_vec_perm_vpermt2 (target
, mask
, op0
, op1
, NULL
))
5297 if (mode
== V4DImode
|| mode
== V4DFmode
|| mode
== V16HImode
)
/* Unfortunately, the VPERMQ and VPERMPD instructions only support
   a constant shuffle operand.  With a tiny bit of effort we can
   use VPERMD instead.  A re-interpretation stall for V4DFmode is
   unfortunate but there's no avoiding it.
   Similarly, for V16HImode we don't have instructions for variable
   shuffling, while for V32QImode we can, after preparing suitable
   masks, use vpshufb; vpshufb; vpermq; vpor.  */
5307 if (mode
== V16HImode
)
5309 maskmode
= mode
= V32QImode
;
5315 maskmode
= mode
= V8SImode
;
5319 t1
= gen_reg_rtx (maskmode
);
/* Replicate the low bits of the V4DImode mask into V8SImode:
   mask = { A B C D }, t1 = { A A B B C C D D }.  */
5324 for (i
= 0; i
< w
/ 2; ++i
)
5325 vec
[i
*2 + 1] = vec
[i
*2] = GEN_INT (i
* 2);
5326 vt
= gen_rtx_CONST_VECTOR (maskmode
, gen_rtvec_v (w
, vec
));
5327 vt
= force_reg (maskmode
, vt
);
5328 mask
= gen_lowpart (maskmode
, mask
);
5329 if (maskmode
== V8SImode
)
5330 emit_insn (gen_avx2_permvarv8si (t1
, mask
, vt
));
5332 emit_insn (gen_avx2_pshufbv32qi3 (t1
, mask
, vt
));
/* Multiply the shuffle indices by two.  */
5335 t1
= expand_simple_binop (maskmode
, PLUS
, t1
, t1
, t1
, 1,
/* Add one to the odd shuffle indices:
   t1 = { A*2, A*2+1, B*2, B*2+1, ... }.  */
5340 for (i
= 0; i
< w
/ 2; ++i
)
5342 vec
[i
* 2] = const0_rtx
;
5343 vec
[i
* 2 + 1] = const1_rtx
;
5345 vt
= gen_rtx_CONST_VECTOR (maskmode
, gen_rtvec_v (w
, vec
));
5346 vt
= validize_mem (force_const_mem (maskmode
, vt
));
5347 t1
= expand_simple_binop (maskmode
, PLUS
, t1
, vt
, t1
, 1,
5350 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
5351 operands
[3] = mask
= t1
;
5352 target
= gen_reg_rtx (mode
);
5353 op0
= gen_lowpart (mode
, op0
);
5354 op1
= gen_lowpart (mode
, op1
);
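/* A worked model of the control transformation built above (added for
   exposition): a V4DImode permutation { A B C D } becomes the V8SImode
   control { 2A 2A+1 2B 2B+1 2C 2C+1 2D 2D+1 }:

     static void
     model_v4di_to_v8si_control (const unsigned idx[4], unsigned out[8])
     {
       for (int i = 0; i < 4; i++)
	 {
	   out[2 * i] = 2 * idx[i];
	   out[2 * i + 1] = 2 * idx[i] + 1;
	 }
     }

   Each qword index is rewritten as the pair of dword indices that make
   up that qword.  */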
5360 /* The VPERMD and VPERMPS instructions already properly ignore
5361 the high bits of the shuffle elements. No need for us to
5362 perform an AND ourselves. */
5363 if (one_operand_shuffle
)
5365 emit_insn (gen_avx2_permvarv8si (target
, op0
, mask
));
5366 if (target
!= operands
[0])
5367 emit_move_insn (operands
[0],
5368 gen_lowpart (GET_MODE (operands
[0]), target
));
5372 t1
= gen_reg_rtx (V8SImode
);
5373 t2
= gen_reg_rtx (V8SImode
);
5374 emit_insn (gen_avx2_permvarv8si (t1
, op0
, mask
));
5375 emit_insn (gen_avx2_permvarv8si (t2
, op1
, mask
));
5381 mask
= gen_lowpart (V8SImode
, mask
);
5382 if (one_operand_shuffle
)
5383 emit_insn (gen_avx2_permvarv8sf (target
, op0
, mask
));
5386 t1
= gen_reg_rtx (V8SFmode
);
5387 t2
= gen_reg_rtx (V8SFmode
);
5388 emit_insn (gen_avx2_permvarv8sf (t1
, op0
, mask
));
5389 emit_insn (gen_avx2_permvarv8sf (t2
, op1
, mask
));
5395 /* By combining the two 128-bit input vectors into one 256-bit
5396 input vector, we can use VPERMD and VPERMPS for the full
5397 two-operand shuffle. */
5398 t1
= gen_reg_rtx (V8SImode
);
5399 t2
= gen_reg_rtx (V8SImode
);
5400 emit_insn (gen_avx_vec_concatv8si (t1
, op0
, op1
));
5401 emit_insn (gen_avx_vec_concatv8si (t2
, mask
, mask
));
5402 emit_insn (gen_avx2_permvarv8si (t1
, t1
, t2
));
5403 emit_insn (gen_avx_vextractf128v8si (target
, t1
, const0_rtx
));
5407 t1
= gen_reg_rtx (V8SFmode
);
5408 t2
= gen_reg_rtx (V8SImode
);
5409 mask
= gen_lowpart (V4SImode
, mask
);
5410 emit_insn (gen_avx_vec_concatv8sf (t1
, op0
, op1
));
5411 emit_insn (gen_avx_vec_concatv8si (t2
, mask
, mask
));
5412 emit_insn (gen_avx2_permvarv8sf (t1
, t1
, t2
));
5413 emit_insn (gen_avx_vextractf128v8sf (target
, t1
, const0_rtx
));
5417 t1
= gen_reg_rtx (V32QImode
);
5418 t2
= gen_reg_rtx (V32QImode
);
5419 t3
= gen_reg_rtx (V32QImode
);
5420 vt2
= GEN_INT (-128);
5421 vt
= gen_const_vec_duplicate (V32QImode
, vt2
);
5422 vt
= force_reg (V32QImode
, vt
);
5423 for (i
= 0; i
< 32; i
++)
5424 vec
[i
] = i
< 16 ? vt2
: const0_rtx
;
5425 vt2
= gen_rtx_CONST_VECTOR (V32QImode
, gen_rtvec_v (32, vec
));
5426 vt2
= force_reg (V32QImode
, vt2
);
/* From mask create two adjusted masks, which contain the same
   bits as mask in the low 7 bits of each vector element.
   The first mask will have the most significant bit clear
   if it requests an element from the same 128-bit lane
   and MSB set if it requests an element from the other 128-bit lane.
   The second mask will have the opposite values of the MSB,
   and additionally will have its 128-bit lanes swapped.
   E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
   t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
   t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
   stands for the other 12 bytes.  */
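/* A scalar model of why the MSB manipulation works (added for
   exposition): within one 128-bit lane, vpshufb produces 0 for any
   control byte whose MSB is set and otherwise selects by the low bits:

     static unsigned char
     model_pshufb_byte (const unsigned char lane[16], unsigned char ctl)
     {
       return (ctl & 0x80) ? 0 : lane[ctl & 0x0f];
     }

   So each of the two adjusted masks lets a shuffle supply only the
   bytes its lane actually holds, and an OR merges the results.  */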
/* The bit that says whether an element is from the same lane or the
   other lane is bit 4, so shift it up by 3 to the MSB position.  */
5440 t5
= gen_reg_rtx (V4DImode
);
5441 emit_insn (gen_ashlv4di3 (t5
, gen_lowpart (V4DImode
, mask
),
5443 /* Clear MSB bits from the mask just in case it had them set. */
5444 emit_insn (gen_avx2_andnotv32qi3 (t2
, vt
, mask
));
/* After this, t1 will have the MSB set for elements from the other lane.  */
5446 emit_insn (gen_xorv32qi3 (t1
, gen_lowpart (V32QImode
, t5
), vt2
));
5447 /* Clear bits other than MSB. */
5448 emit_insn (gen_andv32qi3 (t1
, t1
, vt
));
5449 /* Or in the lower bits from mask into t3. */
5450 emit_insn (gen_iorv32qi3 (t3
, t1
, t2
));
/* And invert MSB bits in t1, so MSB is set for elements from the
   same lane.  */
5453 emit_insn (gen_xorv32qi3 (t1
, t1
, vt
));
5454 /* Swap 128-bit lanes in t3. */
5455 t6
= gen_reg_rtx (V4DImode
);
5456 emit_insn (gen_avx2_permv4di_1 (t6
, gen_lowpart (V4DImode
, t3
),
5457 const2_rtx
, GEN_INT (3),
5458 const0_rtx
, const1_rtx
));
5459 /* And or in the lower bits from mask into t1. */
5460 emit_insn (gen_iorv32qi3 (t1
, t1
, t2
));
5461 if (one_operand_shuffle
)
/* Each of these shuffles will put 0s in places where an
   element from the other 128-bit lane is needed; otherwise
   it will shuffle in the requested value.  */
5466 emit_insn (gen_avx2_pshufbv32qi3 (t3
, op0
,
5467 gen_lowpart (V32QImode
, t6
)));
5468 emit_insn (gen_avx2_pshufbv32qi3 (t1
, op0
, t1
));
5469 /* For t3 the 128-bit lanes are swapped again. */
5470 t7
= gen_reg_rtx (V4DImode
);
5471 emit_insn (gen_avx2_permv4di_1 (t7
, gen_lowpart (V4DImode
, t3
),
5472 const2_rtx
, GEN_INT (3),
5473 const0_rtx
, const1_rtx
));
5474 /* And oring both together leads to the result. */
5475 emit_insn (gen_iorv32qi3 (target
, t1
,
5476 gen_lowpart (V32QImode
, t7
)));
5477 if (target
!= operands
[0])
5478 emit_move_insn (operands
[0],
5479 gen_lowpart (GET_MODE (operands
[0]), target
));
5483 t4
= gen_reg_rtx (V32QImode
);
/* Similarly to the one_operand_shuffle code above, just
   repeated twice for each operand.  The merge_two: code
   will merge the two results together.  */
5487 emit_insn (gen_avx2_pshufbv32qi3 (t4
, op0
,
5488 gen_lowpart (V32QImode
, t6
)));
5489 emit_insn (gen_avx2_pshufbv32qi3 (t3
, op1
,
5490 gen_lowpart (V32QImode
, t6
)));
5491 emit_insn (gen_avx2_pshufbv32qi3 (t2
, op0
, t1
));
5492 emit_insn (gen_avx2_pshufbv32qi3 (t1
, op1
, t1
));
5493 t7
= gen_reg_rtx (V4DImode
);
5494 emit_insn (gen_avx2_permv4di_1 (t7
, gen_lowpart (V4DImode
, t4
),
5495 const2_rtx
, GEN_INT (3),
5496 const0_rtx
, const1_rtx
));
5497 t8
= gen_reg_rtx (V4DImode
);
5498 emit_insn (gen_avx2_permv4di_1 (t8
, gen_lowpart (V4DImode
, t3
),
5499 const2_rtx
, GEN_INT (3),
5500 const0_rtx
, const1_rtx
));
5501 emit_insn (gen_iorv32qi3 (t4
, t2
, gen_lowpart (V32QImode
, t7
)));
5502 emit_insn (gen_iorv32qi3 (t3
, t1
, gen_lowpart (V32QImode
, t8
)));
5508 gcc_assert (GET_MODE_SIZE (mode
) <= 16);
5515 /* The XOP VPPERM insn supports three inputs. By ignoring the
5516 one_operand_shuffle special case, we avoid creating another
5517 set of constant vectors in memory. */
5518 one_operand_shuffle
= false;
5520 /* mask = mask & {2*w-1, ...} */
5521 vt
= GEN_INT (2*w
- 1);
5525 /* mask = mask & {w-1, ...} */
5526 vt
= GEN_INT (w
- 1);
5529 vt
= gen_const_vec_duplicate (maskmode
, vt
);
5530 mask
= expand_simple_binop (maskmode
, AND
, mask
, vt
,
5531 NULL_RTX
, 0, OPTAB_DIRECT
);
5533 /* For non-QImode operations, convert the word permutation control
5534 into a byte permutation control. */
5535 if (mode
!= V16QImode
)
5537 mask
= expand_simple_binop (maskmode
, ASHIFT
, mask
,
5538 GEN_INT (exact_log2 (e
)),
5539 NULL_RTX
, 0, OPTAB_DIRECT
);
5541 /* Convert mask to vector of chars. */
5542 mask
= force_reg (V16QImode
, gen_lowpart (V16QImode
, mask
));
5544 /* Replicate each of the input bytes into byte positions:
5545 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
5546 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
5547 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
5548 for (i
= 0; i
< 16; ++i
)
5549 vec
[i
] = GEN_INT (i
/e
* e
);
5550 vt
= gen_rtx_CONST_VECTOR (V16QImode
, gen_rtvec_v (16, vec
));
5551 vt
= validize_mem (force_const_mem (V16QImode
, vt
));
5553 emit_insn (gen_xop_pperm (mask
, mask
, mask
, vt
));
5555 emit_insn (gen_ssse3_pshufbv16qi3 (mask
, mask
, vt
));
5557 /* Convert it into the byte positions by doing
5558 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
5559 for (i
= 0; i
< 16; ++i
)
5560 vec
[i
] = GEN_INT (i
% e
);
5561 vt
= gen_rtx_CONST_VECTOR (V16QImode
, gen_rtvec_v (16, vec
));
5562 vt
= validize_mem (force_const_mem (V16QImode
, vt
));
5563 emit_insn (gen_addv16qi3 (mask
, mask
, vt
));
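/* A scalar model of the complete control expansion above (added for
   exposition; E is the element size in bytes and widx[] holds one word
   index per element):

     static void
     model_word_to_byte_control (int e, const unsigned char widx[],
				 unsigned char ctl[16])
     {
       for (int i = 0; i < 16; i++)
	 ctl[i] = (unsigned char) (widx[i / e] * e + i % e);
     }

   Byte i of the final control selects byte widx[i/E]*E + i%E of the
   source, which is what the shift, replication and addition steps
   compute together.  */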
5566 /* The actual shuffle operations all operate on V16QImode. */
5567 op0
= gen_lowpart (V16QImode
, op0
);
5568 op1
= gen_lowpart (V16QImode
, op1
);
5572 if (GET_MODE (target
) != V16QImode
)
5573 target
= gen_reg_rtx (V16QImode
);
5574 emit_insn (gen_xop_pperm (target
, op0
, op1
, mask
));
5575 if (target
!= operands
[0])
5576 emit_move_insn (operands
[0],
5577 gen_lowpart (GET_MODE (operands
[0]), target
));
5579 else if (one_operand_shuffle
)
5581 if (GET_MODE (target
) != V16QImode
)
5582 target
= gen_reg_rtx (V16QImode
);
5583 emit_insn (gen_ssse3_pshufbv16qi3 (target
, op0
, mask
));
5584 if (target
!= operands
[0])
5585 emit_move_insn (operands
[0],
5586 gen_lowpart (GET_MODE (operands
[0]), target
));
5593 /* Shuffle the two input vectors independently. */
5594 t1
= gen_reg_rtx (V16QImode
);
5595 t2
= gen_reg_rtx (V16QImode
);
5596 emit_insn (gen_ssse3_pshufbv16qi3 (t1
, op0
, mask
));
5597 emit_insn (gen_ssse3_pshufbv16qi3 (t2
, op1
, mask
));
5600 /* Then merge them together. The key is whether any given control
5601 element contained a bit set that indicates the second word. */
5604 if (maskmode
== V2DImode
&& !TARGET_SSE4_1
)
/* Without SSE4.1, we don't have V2DImode EQ.  Perform one
   more shuffle to convert the V2DI input mask into a V4SI
   input mask.  At that point the masking done by
   expand_int_vcond will work as desired.  */
5610 rtx t3
= gen_reg_rtx (V4SImode
);
5611 emit_insn (gen_sse2_pshufd_1 (t3
, gen_lowpart (V4SImode
, mask
),
5612 const0_rtx
, const0_rtx
,
5613 const2_rtx
, const2_rtx
));
5615 maskmode
= V4SImode
;
5619 vt
= gen_const_vec_duplicate (maskmode
, vt
);
5620 vt
= force_reg (maskmode
, vt
);
5621 mask
= expand_simple_binop (maskmode
, AND
, mask
, vt
,
5622 NULL_RTX
, 0, OPTAB_DIRECT
);
5624 if (GET_MODE (target
) != mode
)
5625 target
= gen_reg_rtx (mode
);
5627 xops
[1] = gen_lowpart (mode
, t2
);
5628 xops
[2] = gen_lowpart (mode
, t1
);
5629 xops
[3] = gen_rtx_EQ (maskmode
, mask
, vt
);
5632 ok
= ix86_expand_int_vcond (xops
);
5634 if (target
!= operands
[0])
5635 emit_move_insn (operands
[0],
5636 gen_lowpart (GET_MODE (operands
[0]), target
));
/* Extend SRC into the next wider integer vector type.  UNSIGNED_P is
   true if we should do zero extension, else sign extension.  */
5644 ix86_expand_sse_extend (rtx dest
, rtx src
, bool unsigned_p
)
5646 machine_mode imode
= GET_MODE (src
);
5662 ops
[0] = gen_reg_rtx (imode
);
5664 ops
[1] = force_reg (imode
, src
);
5667 ops
[2] = force_reg (imode
, CONST0_RTX (imode
));
5669 ops
[2] = ix86_expand_sse_cmp (gen_reg_rtx (imode
), GT
, CONST0_RTX (imode
),
5670 ops
[1], pc_rtx
, pc_rtx
);
5672 ix86_split_mmx_punpck (ops
, false);
5673 emit_move_insn (dest
, lowpart_subreg (GET_MODE (dest
), ops
[0], imode
));
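/* A scalar model of one element of the interleave-based extension
   above, for the 8-bit to 16-bit case (added for exposition):

     static short
     model_extend_elt (unsigned char x, int unsigned_p)
     {
       unsigned char hi = unsigned_p ? 0 : ((signed char) x < 0 ? 0xff : 0);
       return (short) ((hi << 8) | x);
     }

   The GT comparison against zero supplies the 0xff sign bytes that the
   punpck then interleaves with the value bytes.  */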
5676 /* Unpack SRC into the next wider integer vector type. UNSIGNED_P is
5677 true if we should do zero extension, else sign extension. HIGH_P is
5678 true if we want the N/2 high elements, else the low elements. */
5681 ix86_expand_sse_unpack (rtx dest
, rtx src
, bool unsigned_p
, bool high_p
)
5683 machine_mode imode
= GET_MODE (src
);
5688 rtx (*unpack
)(rtx
, rtx
);
5689 rtx (*extract
)(rtx
, rtx
) = NULL
;
5690 machine_mode halfmode
= BLKmode
;
5696 unpack
= gen_avx512bw_zero_extendv32qiv32hi2
;
5698 unpack
= gen_avx512bw_sign_extendv32qiv32hi2
;
5699 halfmode
= V32QImode
;
5701 = high_p
? gen_vec_extract_hi_v64qi
: gen_vec_extract_lo_v64qi
;
5705 unpack
= gen_avx2_zero_extendv16qiv16hi2
;
5707 unpack
= gen_avx2_sign_extendv16qiv16hi2
;
5708 halfmode
= V16QImode
;
5710 = high_p
? gen_vec_extract_hi_v32qi
: gen_vec_extract_lo_v32qi
;
5714 unpack
= gen_avx512f_zero_extendv16hiv16si2
;
5716 unpack
= gen_avx512f_sign_extendv16hiv16si2
;
5717 halfmode
= V16HImode
;
5719 = high_p
? gen_vec_extract_hi_v32hi
: gen_vec_extract_lo_v32hi
;
5723 unpack
= gen_avx2_zero_extendv8hiv8si2
;
5725 unpack
= gen_avx2_sign_extendv8hiv8si2
;
5726 halfmode
= V8HImode
;
5728 = high_p
? gen_vec_extract_hi_v16hi
: gen_vec_extract_lo_v16hi
;
5732 unpack
= gen_avx512f_zero_extendv8siv8di2
;
5734 unpack
= gen_avx512f_sign_extendv8siv8di2
;
5735 halfmode
= V8SImode
;
5737 = high_p
? gen_vec_extract_hi_v16si
: gen_vec_extract_lo_v16si
;
5741 unpack
= gen_avx2_zero_extendv4siv4di2
;
5743 unpack
= gen_avx2_sign_extendv4siv4di2
;
5744 halfmode
= V4SImode
;
5746 = high_p
? gen_vec_extract_hi_v8si
: gen_vec_extract_lo_v8si
;
5750 unpack
= gen_sse4_1_zero_extendv8qiv8hi2
;
5752 unpack
= gen_sse4_1_sign_extendv8qiv8hi2
;
5756 unpack
= gen_sse4_1_zero_extendv4hiv4si2
;
5758 unpack
= gen_sse4_1_sign_extendv4hiv4si2
;
5762 unpack
= gen_sse4_1_zero_extendv2siv2di2
;
5764 unpack
= gen_sse4_1_sign_extendv2siv2di2
;
5768 unpack
= gen_sse4_1_zero_extendv4qiv4hi2
;
5770 unpack
= gen_sse4_1_sign_extendv4qiv4hi2
;
5774 unpack
= gen_sse4_1_zero_extendv2hiv2si2
;
5776 unpack
= gen_sse4_1_sign_extendv2hiv2si2
;
5780 unpack
= gen_sse4_1_zero_extendv2qiv2hi2
;
5782 unpack
= gen_sse4_1_sign_extendv2qiv2hi2
;
5788 if (GET_MODE_SIZE (imode
) >= 32)
5790 tmp
= gen_reg_rtx (halfmode
);
5791 emit_insn (extract (tmp
, src
));
5795 switch (GET_MODE_SIZE (imode
))
5798 /* Shift higher 8 bytes to lower 8 bytes. */
5799 tmp
= gen_reg_rtx (V1TImode
);
5800 emit_insn (gen_sse2_lshrv1ti3 (tmp
, gen_lowpart (V1TImode
, src
),
5804 /* Shift higher 4 bytes to lower 4 bytes. */
5805 tmp
= gen_reg_rtx (V1DImode
);
5806 emit_insn (gen_mmx_lshrv1di3 (tmp
, gen_lowpart (V1DImode
, src
),
5810 /* Shift higher 2 bytes to lower 2 bytes. */
5811 tmp
= gen_reg_rtx (V1SImode
);
5812 emit_insn (gen_mmx_lshrv1si3 (tmp
, gen_lowpart (V1SImode
, src
),
5819 tmp
= gen_lowpart (imode
, tmp
);
5824 emit_insn (unpack (dest
, tmp
));
5828 rtx (*unpack
)(rtx
, rtx
, rtx
);
5834 unpack
= gen_vec_interleave_highv16qi
;
5836 unpack
= gen_vec_interleave_lowv16qi
;
5840 unpack
= gen_vec_interleave_highv8hi
;
5842 unpack
= gen_vec_interleave_lowv8hi
;
5846 unpack
= gen_vec_interleave_highv4si
;
5848 unpack
= gen_vec_interleave_lowv4si
;
5852 unpack
= gen_mmx_punpckhbw
;
5854 unpack
= gen_mmx_punpcklbw
;
5858 unpack
= gen_mmx_punpckhwd
;
5860 unpack
= gen_mmx_punpcklwd
;
5864 unpack
= gen_mmx_punpckhbw_low
;
5866 unpack
= gen_mmx_punpcklbw_low
;
5873 tmp
= force_reg (imode
, CONST0_RTX (imode
));
5875 tmp
= ix86_expand_sse_cmp (gen_reg_rtx (imode
), GT
, CONST0_RTX (imode
),
5876 src
, pc_rtx
, pc_rtx
);
5878 rtx tmp2
= gen_reg_rtx (imode
);
5879 emit_insn (unpack (tmp2
, src
, tmp
));
5880 emit_move_insn (dest
, gen_lowpart (GET_MODE (dest
), tmp2
));
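/* A scalar model of the overall unpack semantics, for the 8-bit to
   16-bit case with N source elements (added for exposition):

     static void
     model_unpack (const unsigned char *src, int n, int unsigned_p,
		   int high_p, short *dst)
     {
       const unsigned char *p = src + (high_p ? n / 2 : 0);
       for (int i = 0; i < n / 2; i++)
	 dst[i] = unsigned_p ? (short) p[i] : (short) (signed char) p[i];
     }
*/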
/* Return true if MEM is a constant-pool reference to a CONST_VECTOR
   permutation index; if so, assign the index to PERM.  */
5887 ix86_extract_perm_from_pool_constant (int* perm
, rtx mem
)
5889 machine_mode mode
= GET_MODE (mem
);
5890 int nelt
= GET_MODE_NUNITS (mode
);
5892 if (!INTEGRAL_MODE_P (mode
))
5895 /* Needs to be constant pool. */
5897 || !SYMBOL_REF_P (XEXP (mem
, 0))
5898 || !CONSTANT_POOL_ADDRESS_P (XEXP (mem
, 0)))
5901 rtx constant
= get_pool_constant (XEXP (mem
, 0));
5903 if (GET_CODE (constant
) != CONST_VECTOR
)
/* There could be some rtx like
   (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
   but with "*.LC1" referring to a V2DI constant vector.  */
5909 if (GET_MODE (constant
) != mode
)
5911 constant
= simplify_subreg (mode
, constant
, GET_MODE (constant
), 0);
5913 if (constant
== nullptr || GET_CODE (constant
) != CONST_VECTOR
)
5917 for (int i
= 0; i
!= nelt
; i
++)
5918 perm
[i
] = UINTVAL (XVECEXP (constant
, 0, i
));
/* Split operands 0 and 1 into half-mode parts.  Similar to
   split_double_mode, but works for floating point parameters and
   non-offsettable memories.  For pushes, it returns just stack offsets;
   the values will be saved in the right order.  At most three parts
   are generated.  */
5929 ix86_split_to_parts (rtx operand
, rtx
*parts
, machine_mode mode
)
5934 size
= mode
==XFmode
? 3 : GET_MODE_SIZE (mode
) / 4;
5936 size
= (GET_MODE_SIZE (mode
) + 4) / 8;
5938 gcc_assert (!REG_P (operand
) || !MMX_REGNO_P (REGNO (operand
)));
5939 gcc_assert (size
>= 2 && size
<= 4);
/* Optimize constant pool references to immediates.  This is used by fp
   moves, which force all constants to memory to allow combining.  */
5943 if (MEM_P (operand
) && MEM_READONLY_P (operand
))
5944 operand
= avoid_constant_pool_reference (operand
);
5946 if (MEM_P (operand
) && !offsettable_memref_p (operand
))
/* The only non-offsettable memories we handle are pushes.  */
5949 int ok
= push_operand (operand
, VOIDmode
);
5953 operand
= copy_rtx (operand
);
5954 PUT_MODE (operand
, word_mode
);
5955 parts
[0] = parts
[1] = parts
[2] = parts
[3] = operand
;
5959 if (GET_CODE (operand
) == CONST_VECTOR
)
5961 scalar_int_mode imode
= int_mode_for_mode (mode
).require ();
5962 /* Caution: if we looked through a constant pool memory above,
5963 the operand may actually have a different mode now. That's
5964 ok, since we want to pun this all the way back to an integer. */
5965 operand
= simplify_subreg (imode
, operand
, GET_MODE (operand
), 0);
5966 gcc_assert (operand
!= NULL
);
5973 split_double_mode (mode
, &operand
, 1, &parts
[0], &parts
[1]);
5978 if (REG_P (operand
))
5980 gcc_assert (reload_completed
);
5981 for (i
= 0; i
< size
; i
++)
5982 parts
[i
] = gen_rtx_REG (SImode
, REGNO (operand
) + i
);
5984 else if (offsettable_memref_p (operand
))
5986 operand
= adjust_address (operand
, SImode
, 0);
5988 for (i
= 1; i
< size
; i
++)
5989 parts
[i
] = adjust_address (operand
, SImode
, 4 * i
);
5991 else if (CONST_DOUBLE_P (operand
))
5993 const REAL_VALUE_TYPE
*r
;
5996 r
= CONST_DOUBLE_REAL_VALUE (operand
);
6000 real_to_target (l
, r
, mode
);
6001 parts
[3] = gen_int_mode (l
[3], SImode
);
6002 parts
[2] = gen_int_mode (l
[2], SImode
);
6005 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
6006 long double may not be 80-bit. */
6007 real_to_target (l
, r
, mode
);
6008 parts
[2] = gen_int_mode (l
[2], SImode
);
6011 REAL_VALUE_TO_TARGET_DOUBLE (*r
, l
);
6016 parts
[1] = gen_int_mode (l
[1], SImode
);
6017 parts
[0] = gen_int_mode (l
[0], SImode
);
6026 split_double_mode (mode
, &operand
, 1, &parts
[0], &parts
[1]);
6027 if (mode
== XFmode
|| mode
== TFmode
)
6029 machine_mode upper_mode
= mode
==XFmode
? SImode
: DImode
;
6030 if (REG_P (operand
))
6032 gcc_assert (reload_completed
);
6033 parts
[0] = gen_rtx_REG (DImode
, REGNO (operand
) + 0);
6034 parts
[1] = gen_rtx_REG (upper_mode
, REGNO (operand
) + 1);
6036 else if (offsettable_memref_p (operand
))
6038 operand
= adjust_address (operand
, DImode
, 0);
6040 parts
[1] = adjust_address (operand
, upper_mode
, 8);
6042 else if (CONST_DOUBLE_P (operand
))
6046 real_to_target (l
, CONST_DOUBLE_REAL_VALUE (operand
), mode
);
6048 /* real_to_target puts 32-bit pieces in each long. */
6049 parts
[0] = gen_int_mode ((l
[0] & HOST_WIDE_INT_C (0xffffffff))
6050 | ((l
[1] & HOST_WIDE_INT_C (0xffffffff))
6053 if (upper_mode
== SImode
)
6054 parts
[1] = gen_int_mode (l
[2], SImode
);
6057 = gen_int_mode ((l
[2] & HOST_WIDE_INT_C (0xffffffff))
6058 | ((l
[3] & HOST_WIDE_INT_C (0xffffffff))
/* Emit insns to perform a move or push of DI, DF, XF, and TF values.
   Return false when normal moves are needed; true when all required
   insns have been emitted.  Operands 2-4 contain the input values
   in the correct order; operands 5-7 contain the output values.  */
6075 ix86_split_long_move (rtx operands
[])
6081 machine_mode mode
= GET_MODE (operands
[0]);
6082 bool collisionparts
[4];
/* The DFmode expanders may ask us to move double.
   For a 64-bit target this is a single move.  By hiding the fact
   here we simplify i386.md splitters.  */
6087 if (TARGET_64BIT
&& GET_MODE_SIZE (GET_MODE (operands
[0])) == 8)
6089 /* Optimize constant pool reference to immediates. This is used by
6090 fp moves, that force all constants to memory to allow combining. */
6092 if (MEM_P (operands
[1])
6093 && GET_CODE (XEXP (operands
[1], 0)) == SYMBOL_REF
6094 && CONSTANT_POOL_ADDRESS_P (XEXP (operands
[1], 0)))
6095 operands
[1] = get_pool_constant (XEXP (operands
[1], 0));
6096 if (push_operand (operands
[0], VOIDmode
))
6098 operands
[0] = copy_rtx (operands
[0]);
6099 PUT_MODE (operands
[0], word_mode
);
6102 operands
[0] = gen_lowpart (DImode
, operands
[0]);
6103 operands
[1] = gen_lowpart (DImode
, operands
[1]);
6104 emit_move_insn (operands
[0], operands
[1]);
6108 /* The only non-offsettable memory we handle is push. */
6109 if (push_operand (operands
[0], VOIDmode
))
6112 gcc_assert (!MEM_P (operands
[0])
6113 || offsettable_memref_p (operands
[0]));
6115 nparts
= ix86_split_to_parts (operands
[1], part
[1], GET_MODE (operands
[0]));
6116 ix86_split_to_parts (operands
[0], part
[0], GET_MODE (operands
[0]));
/* When emitting a push, take care of source operands on the stack.  */
6119 if (push
&& MEM_P (operands
[1])
6120 && reg_overlap_mentioned_p (stack_pointer_rtx
, operands
[1]))
6122 rtx src_base
= XEXP (part
[1][nparts
- 1], 0);
6124 /* Compensate for the stack decrement by 4. */
6125 if (!TARGET_64BIT
&& nparts
== 3
6126 && mode
== XFmode
&& TARGET_128BIT_LONG_DOUBLE
)
6127 src_base
= plus_constant (Pmode
, src_base
, 4);
/* src_base refers to the stack pointer and is
   automatically decreased by the emitted push.  */
6131 for (i
= 0; i
< nparts
; i
++)
6132 part
[1][i
] = change_address (part
[1][i
],
6133 GET_MODE (part
[1][i
]), src_base
);
/* We need to do the copy in the right order in case an address register
   of the source overlaps the destination.  */
6138 if (REG_P (part
[0][0]) && MEM_P (part
[1][0]))
6142 for (i
= 0; i
< nparts
; i
++)
6145 = reg_overlap_mentioned_p (part
[0][i
], XEXP (part
[1][0], 0));
6146 if (collisionparts
[i
])
/* A collision in the middle part can be handled by reordering.  */
6151 if (collisions
== 1 && nparts
== 3 && collisionparts
[1])
6153 std::swap (part
[0][1], part
[0][2]);
6154 std::swap (part
[1][1], part
[1][2]);
6156 else if (collisions
== 1
6158 && (collisionparts
[1] || collisionparts
[2]))
6160 if (collisionparts
[1])
6162 std::swap (part
[0][1], part
[0][2]);
6163 std::swap (part
[1][1], part
[1][2]);
6167 std::swap (part
[0][2], part
[0][3]);
6168 std::swap (part
[1][2], part
[1][3]);
/* If there are more collisions, we can't handle them by reordering.
   Do an lea to the last part and use only one colliding move.  */
6174 else if (collisions
> 1)
6180 base
= part
[0][nparts
- 1];
6182 /* Handle the case when the last part isn't valid for lea.
6183 Happens in 64-bit mode storing the 12-byte XFmode. */
6184 if (GET_MODE (base
) != Pmode
)
6185 base
= gen_rtx_REG (Pmode
, REGNO (base
));
6187 addr
= XEXP (part
[1][0], 0);
6188 if (TARGET_TLS_DIRECT_SEG_REFS
)
6190 struct ix86_address parts
;
6191 int ok
= ix86_decompose_address (addr
, &parts
);
6193 /* It is not valid to use %gs: or %fs: in lea. */
6194 gcc_assert (parts
.seg
== ADDR_SPACE_GENERIC
);
6196 emit_insn (gen_rtx_SET (base
, addr
));
6197 part
[1][0] = replace_equiv_address (part
[1][0], base
);
6198 for (i
= 1; i
< nparts
; i
++)
6200 tmp
= plus_constant (Pmode
, base
, UNITS_PER_WORD
* i
);
6201 part
[1][i
] = replace_equiv_address (part
[1][i
], tmp
);
6212 if (TARGET_128BIT_LONG_DOUBLE
&& mode
== XFmode
)
6213 emit_insn (gen_add2_insn (stack_pointer_rtx
, GEN_INT (-4)));
6214 emit_move_insn (part
[0][2], part
[1][2]);
6216 else if (nparts
== 4)
6218 emit_move_insn (part
[0][3], part
[1][3]);
6219 emit_move_insn (part
[0][2], part
[1][2]);
/* In 64-bit mode we don't have a 32-bit push available.  In case this
   is a register, it is OK - we will just use the larger counterpart.
   We also retype memory - these come from an attempt to avoid the REX
   prefix on moving the second half of a TFmode value.  */
6228 if (GET_MODE (part
[1][1]) == SImode
)
6230 switch (GET_CODE (part
[1][1]))
6233 part
[1][1] = adjust_address (part
[1][1], DImode
, 0);
6237 part
[1][1] = gen_rtx_REG (DImode
, REGNO (part
[1][1]));
6244 if (GET_MODE (part
[1][0]) == SImode
)
6245 part
[1][0] = part
[1][1];
6248 emit_move_insn (part
[0][1], part
[1][1]);
6249 emit_move_insn (part
[0][0], part
[1][0]);
/* Choose the correct order so as not to overwrite the source before it is copied.  */
6254 if ((REG_P (part
[0][0])
6255 && REG_P (part
[1][1])
6256 && (REGNO (part
[0][0]) == REGNO (part
[1][1])
6258 && REGNO (part
[0][0]) == REGNO (part
[1][2]))
6260 && REGNO (part
[0][0]) == REGNO (part
[1][3]))))
6262 && reg_overlap_mentioned_p (part
[0][0], XEXP (part
[1][0], 0))))
6264 for (i
= 0, j
= nparts
- 1; i
< nparts
; i
++, j
--)
6266 operands
[2 + i
] = part
[0][j
];
6267 operands
[6 + i
] = part
[1][j
];
6272 for (i
= 0; i
< nparts
; i
++)
6274 operands
[2 + i
] = part
[0][i
];
6275 operands
[6 + i
] = part
[1][i
];
6279 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
6280 if (optimize_insn_for_size_p ())
6282 for (j
= 0; j
< nparts
- 1; j
++)
6283 if (CONST_INT_P (operands
[6 + j
])
6284 && operands
[6 + j
] != const0_rtx
6285 && REG_P (operands
[2 + j
]))
6286 for (i
= j
; i
< nparts
- 1; i
++)
6287 if (CONST_INT_P (operands
[7 + i
])
6288 && INTVAL (operands
[7 + i
]) == INTVAL (operands
[6 + j
]))
6289 operands
[7 + i
] = operands
[2 + j
];
6292 for (i
= 0; i
< nparts
; i
++)
6293 emit_move_insn (operands
[2 + i
], operands
[6 + i
]);
6298 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
6299 left shift by a constant, either using a single shift or
6300 a sequence of add instructions. */
6303 ix86_expand_ashl_const (rtx operand
, int count
, machine_mode mode
)
6306 || (count
* ix86_cost
->add
<= ix86_cost
->shift_const
6307 && !optimize_insn_for_size_p ()))
6310 emit_insn (gen_add2_insn (operand
, operand
));
6314 rtx (*insn
)(rtx
, rtx
, rtx
);
6316 insn
= mode
== DImode
? gen_ashlsi3
: gen_ashldi3
;
6317 emit_insn (insn (operand
, operand
, GEN_INT (count
)));
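/* A scalar model of the add sequence above (added for exposition):

     static unsigned int
     model_shl_via_adds (unsigned int x, int count)
     {
       while (count-- > 0)
	 x += x;
       return x;
     }

   Each self-addition doubles the value, so COUNT additions equal a
   single left shift by COUNT; the cost test above decides which form
   is cheaper.  */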
6322 ix86_split_ashl (rtx
*operands
, rtx scratch
, machine_mode mode
)
6324 rtx (*gen_ashl3
)(rtx
, rtx
, rtx
);
6325 rtx (*gen_shld
)(rtx
, rtx
, rtx
);
6326 int half_width
= GET_MODE_BITSIZE (mode
) >> 1;
6327 machine_mode half_mode
;
6329 rtx low
[2], high
[2];
6332 if (CONST_INT_P (operands
[2]))
6334 split_double_mode (mode
, operands
, 2, low
, high
);
6335 count
= INTVAL (operands
[2]) & (GET_MODE_BITSIZE (mode
) - 1);
6337 if (count
>= half_width
)
6339 emit_move_insn (high
[0], low
[1]);
6340 ix86_expand_clear (low
[0]);
6342 if (count
> half_width
)
6343 ix86_expand_ashl_const (high
[0], count
- half_width
, mode
);
6347 gen_shld
= mode
== DImode
? gen_x86_shld
: gen_x86_64_shld
;
6349 if (!rtx_equal_p (operands
[0], operands
[1]))
6350 emit_move_insn (operands
[0], operands
[1]);
6352 emit_insn (gen_shld (high
[0], low
[0], GEN_INT (count
)));
6353 ix86_expand_ashl_const (low
[0], count
, mode
);
6358 split_double_mode (mode
, operands
, 1, low
, high
);
6359 half_mode
= mode
== DImode
? SImode
: DImode
;
6361 gen_ashl3
= mode
== DImode
? gen_ashlsi3
: gen_ashldi3
;
6363 if (operands
[1] == const1_rtx
)
/* Assuming we've chosen QImode-capable registers, 1 << N
   can be done with two 32/64-bit shifts, no branches, no cmoves.  */
6367 if (ANY_QI_REG_P (low
[0]) && ANY_QI_REG_P (high
[0]))
6369 rtx s
, d
, flags
= gen_rtx_REG (CCZmode
, FLAGS_REG
);
6371 ix86_expand_clear (low
[0]);
6372 ix86_expand_clear (high
[0]);
6373 emit_insn (gen_testqi_ccz_1 (operands
[2], GEN_INT (half_width
)));
6375 d
= gen_lowpart (QImode
, low
[0]);
6376 d
= gen_rtx_STRICT_LOW_PART (VOIDmode
, d
);
6377 s
= gen_rtx_EQ (QImode
, flags
, const0_rtx
);
6378 emit_insn (gen_rtx_SET (d
, s
));
6380 d
= gen_lowpart (QImode
, high
[0]);
6381 d
= gen_rtx_STRICT_LOW_PART (VOIDmode
, d
);
6382 s
= gen_rtx_NE (QImode
, flags
, const0_rtx
);
6383 emit_insn (gen_rtx_SET (d
, s
));
/* Otherwise, we can get the same results by manually performing
   a bit extract operation on bit 5/6, and then performing the two
   shifts.  The two methods of getting 0/1 into low/high are exactly
   the same size.  Avoiding the shift in the bit extract case helps
   Pentium 4 a bit; no one else seems to care much either way.  */
6393 rtx (*gen_lshr3
)(rtx
, rtx
, rtx
);
6394 rtx (*gen_and3
)(rtx
, rtx
, rtx
);
6395 rtx (*gen_xor3
)(rtx
, rtx
, rtx
);
6401 gen_lshr3
= gen_lshrsi3
;
6402 gen_and3
= gen_andsi3
;
6403 gen_xor3
= gen_xorsi3
;
6408 gen_lshr3
= gen_lshrdi3
;
6409 gen_and3
= gen_anddi3
;
6410 gen_xor3
= gen_xordi3
;
6414 if (TARGET_PARTIAL_REG_STALL
&& !optimize_insn_for_size_p ())
6415 x
= gen_rtx_ZERO_EXTEND (half_mode
, operands
[2]);
6417 x
= gen_lowpart (half_mode
, operands
[2]);
6418 emit_insn (gen_rtx_SET (high
[0], x
));
6420 emit_insn (gen_lshr3 (high
[0], high
[0], GEN_INT (bits
)));
6421 emit_insn (gen_and3 (high
[0], high
[0], const1_rtx
));
6422 emit_move_insn (low
[0], high
[0]);
6423 emit_insn (gen_xor3 (low
[0], low
[0], const1_rtx
));
6426 emit_insn (gen_ashl3 (low
[0], low
[0], operands
[2]));
6427 emit_insn (gen_ashl3 (high
[0], high
[0], operands
[2]));
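/* A scalar model of the branch-free 1 << N split above, assuming
   32-bit halves (added for exposition):

     static void
     model_one_shl (unsigned int n, unsigned int *lo, unsigned int *hi)
     {
       unsigned int hi_bit = (n >> 5) & 1;
       unsigned int lo_bit = hi_bit ^ 1;
       *lo = lo_bit << (n & 31);
       *hi = hi_bit << (n & 31);
     }

   Bit 5 of the count selects which half receives the single set bit;
   the hardware shift already masks the count to the low five bits.  */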
6431 if (operands
[1] == constm1_rtx
)
6433 /* For -1 << N, we can avoid the shld instruction, because we
6434 know that we're shifting 0...31/63 ones into a -1. */
6435 emit_move_insn (low
[0], constm1_rtx
);
6436 if (optimize_insn_for_size_p ())
6437 emit_move_insn (high
[0], low
[0]);
6439 emit_move_insn (high
[0], constm1_rtx
);
6443 gen_shld
= mode
== DImode
? gen_x86_shld
: gen_x86_64_shld
;
6445 if (!rtx_equal_p (operands
[0], operands
[1]))
6446 emit_move_insn (operands
[0], operands
[1]);
6448 split_double_mode (mode
, operands
, 1, low
, high
);
6449 emit_insn (gen_shld (high
[0], low
[0], operands
[2]));
6452 emit_insn (gen_ashl3 (low
[0], low
[0], operands
[2]));
6454 if (TARGET_CMOVE
&& scratch
)
6456 ix86_expand_clear (scratch
);
6457 emit_insn (gen_x86_shift_adj_1
6458 (half_mode
, high
[0], low
[0], operands
[2], scratch
));
6461 emit_insn (gen_x86_shift_adj_2 (half_mode
, high
[0], low
[0], operands
[2]));
6465 ix86_split_ashr (rtx
*operands
, rtx scratch
, machine_mode mode
)
6467 rtx (*gen_ashr3
)(rtx
, rtx
, rtx
)
6468 = mode
== DImode
? gen_ashrsi3
: gen_ashrdi3
;
6469 rtx (*gen_shrd
)(rtx
, rtx
, rtx
);
6470 int half_width
= GET_MODE_BITSIZE (mode
) >> 1;
6472 rtx low
[2], high
[2];
6475 if (CONST_INT_P (operands
[2]))
6477 split_double_mode (mode
, operands
, 2, low
, high
);
6478 count
= INTVAL (operands
[2]) & (GET_MODE_BITSIZE (mode
) - 1);
6480 if (count
== GET_MODE_BITSIZE (mode
) - 1)
6482 emit_move_insn (high
[0], high
[1]);
6483 emit_insn (gen_ashr3 (high
[0], high
[0],
6484 GEN_INT (half_width
- 1)));
6485 emit_move_insn (low
[0], high
[0]);
6488 else if (count
>= half_width
)
6490 emit_move_insn (low
[0], high
[1]);
6491 emit_move_insn (high
[0], low
[0]);
6492 emit_insn (gen_ashr3 (high
[0], high
[0],
6493 GEN_INT (half_width
- 1)));
6495 if (count
> half_width
)
6496 emit_insn (gen_ashr3 (low
[0], low
[0],
6497 GEN_INT (count
- half_width
)));
6501 gen_shrd
= mode
== DImode
? gen_x86_shrd
: gen_x86_64_shrd
;
6503 if (!rtx_equal_p (operands
[0], operands
[1]))
6504 emit_move_insn (operands
[0], operands
[1]);
6506 emit_insn (gen_shrd (low
[0], high
[0], GEN_INT (count
)));
6507 emit_insn (gen_ashr3 (high
[0], high
[0], GEN_INT (count
)));
6512 machine_mode half_mode
;
6514 gen_shrd
= mode
== DImode
? gen_x86_shrd
: gen_x86_64_shrd
;
6516 if (!rtx_equal_p (operands
[0], operands
[1]))
6517 emit_move_insn (operands
[0], operands
[1]);
6519 split_double_mode (mode
, operands
, 1, low
, high
);
6520 half_mode
= mode
== DImode
? SImode
: DImode
;
6522 emit_insn (gen_shrd (low
[0], high
[0], operands
[2]));
6523 emit_insn (gen_ashr3 (high
[0], high
[0], operands
[2]));
6525 if (TARGET_CMOVE
&& scratch
)
6527 emit_move_insn (scratch
, high
[0]);
6528 emit_insn (gen_ashr3 (scratch
, scratch
,
6529 GEN_INT (half_width
- 1)));
6530 emit_insn (gen_x86_shift_adj_1
6531 (half_mode
, low
[0], high
[0], operands
[2], scratch
));
6534 emit_insn (gen_x86_shift_adj_3
6535 (half_mode
, low
[0], high
[0], operands
[2]));
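/* A scalar model of the constant-count paths above, for a 64-bit value
   held in 32-bit halves and 1 <= n <= 63 (added for exposition):

     static void
     model_ashr64 (unsigned int lo, unsigned int hi, int n,
		   unsigned int *rlo, unsigned int *rhi)
     {
       if (n >= 32)
	 {
	   *rlo = (unsigned int) ((int) hi >> (n - 32));
	   *rhi = (unsigned int) ((int) hi >> 31);
	 }
       else
	 {
	   *rlo = (lo >> n) | (hi << (32 - n));
	   *rhi = (unsigned int) ((int) hi >> n);
	 }
     }

   The n < 32 low half is exactly what the shrd instruction computes.  */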
6540 ix86_split_lshr (rtx
*operands
, rtx scratch
, machine_mode mode
)
6542 rtx (*gen_lshr3
)(rtx
, rtx
, rtx
)
6543 = mode
== DImode
? gen_lshrsi3
: gen_lshrdi3
;
6544 rtx (*gen_shrd
)(rtx
, rtx
, rtx
);
6545 int half_width
= GET_MODE_BITSIZE (mode
) >> 1;
6547 rtx low
[2], high
[2];
6550 if (CONST_INT_P (operands
[2]))
6552 split_double_mode (mode
, operands
, 2, low
, high
);
6553 count
= INTVAL (operands
[2]) & (GET_MODE_BITSIZE (mode
) - 1);
6555 if (count
>= half_width
)
6557 emit_move_insn (low
[0], high
[1]);
6558 ix86_expand_clear (high
[0]);
6560 if (count
> half_width
)
6561 emit_insn (gen_lshr3 (low
[0], low
[0],
6562 GEN_INT (count
- half_width
)));
6566 gen_shrd
= mode
== DImode
? gen_x86_shrd
: gen_x86_64_shrd
;
6568 if (!rtx_equal_p (operands
[0], operands
[1]))
6569 emit_move_insn (operands
[0], operands
[1]);
6571 emit_insn (gen_shrd (low
[0], high
[0], GEN_INT (count
)));
6572 emit_insn (gen_lshr3 (high
[0], high
[0], GEN_INT (count
)));
6577 machine_mode half_mode
;
6579 gen_shrd
= mode
== DImode
? gen_x86_shrd
: gen_x86_64_shrd
;
6581 if (!rtx_equal_p (operands
[0], operands
[1]))
6582 emit_move_insn (operands
[0], operands
[1]);
6584 split_double_mode (mode
, operands
, 1, low
, high
);
6585 half_mode
= mode
== DImode
? SImode
: DImode
;
6587 emit_insn (gen_shrd (low
[0], high
[0], operands
[2]));
6588 emit_insn (gen_lshr3 (high
[0], high
[0], operands
[2]));
6590 if (TARGET_CMOVE
&& scratch
)
6592 ix86_expand_clear (scratch
);
6593 emit_insn (gen_x86_shift_adj_1
6594 (half_mode
, low
[0], high
[0], operands
[2], scratch
));
6597 emit_insn (gen_x86_shift_adj_2
6598 (half_mode
, low
[0], high
[0], operands
[2]));
6602 /* Expand move of V1TI mode register X to a new TI mode register. */
6604 ix86_expand_v1ti_to_ti (rtx x
)
6606 rtx result
= gen_reg_rtx (TImode
);
6609 rtx temp
= force_reg (V2DImode
, gen_lowpart (V2DImode
, x
));
6610 rtx lo
= gen_lowpart (DImode
, result
);
6611 emit_insn (gen_vec_extractv2didi (lo
, temp
, const0_rtx
));
6612 rtx hi
= gen_highpart (DImode
, result
);
6613 emit_insn (gen_vec_extractv2didi (hi
, temp
, const1_rtx
));
6616 emit_move_insn (result
, gen_lowpart (TImode
, x
));
6620 /* Expand move of TI mode register X to a new V1TI mode register. */
6622 ix86_expand_ti_to_v1ti (rtx x
)
6626 rtx lo
= gen_lowpart (DImode
, x
);
6627 rtx hi
= gen_highpart (DImode
, x
);
6628 rtx tmp
= gen_reg_rtx (V2DImode
);
6629 emit_insn (gen_vec_concatv2di (tmp
, lo
, hi
));
6630 return force_reg (V1TImode
, gen_lowpart (V1TImode
, tmp
));
6633 return force_reg (V1TImode
, gen_lowpart (V1TImode
, x
));
6636 /* Expand V1TI mode shift (of rtx_code CODE) by constant. */
6638 ix86_expand_v1ti_shift (enum rtx_code code
, rtx operands
[])
6640 rtx op1
= force_reg (V1TImode
, operands
[1]);
6642 if (!CONST_INT_P (operands
[2]))
6644 rtx tmp1
= ix86_expand_v1ti_to_ti (op1
);
6645 rtx tmp2
= gen_reg_rtx (TImode
);
6646 rtx (*shift
) (rtx
, rtx
, rtx
)
6647 = (code
== ASHIFT
) ? gen_ashlti3
: gen_lshrti3
;
6648 emit_insn (shift (tmp2
, tmp1
, operands
[2]));
6649 rtx tmp3
= ix86_expand_ti_to_v1ti (tmp2
);
6650 emit_move_insn (operands
[0], tmp3
);
6654 HOST_WIDE_INT bits
= INTVAL (operands
[2]) & 127;
6658 emit_move_insn (operands
[0], op1
);
6662 if ((bits
& 7) == 0)
6664 rtx tmp
= gen_reg_rtx (V1TImode
);
6666 emit_insn (gen_sse2_ashlv1ti3 (tmp
, op1
, GEN_INT (bits
)));
6668 emit_insn (gen_sse2_lshrv1ti3 (tmp
, op1
, GEN_INT (bits
)));
6669 emit_move_insn (operands
[0], tmp
);
6673 rtx tmp1
= gen_reg_rtx (V1TImode
);
6675 emit_insn (gen_sse2_ashlv1ti3 (tmp1
, op1
, GEN_INT (64)));
6677 emit_insn (gen_sse2_lshrv1ti3 (tmp1
, op1
, GEN_INT (64)));
6679 /* tmp2 is operands[1] shifted by 64, in V2DImode. */
6680 rtx tmp2
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp1
));
6682 /* tmp3 will be the V2DImode result. */
6683 rtx tmp3
= gen_reg_rtx (V2DImode
);
6688 emit_insn (gen_ashlv2di3 (tmp3
, tmp2
, GEN_INT (bits
- 64)));
6690 emit_insn (gen_lshrv2di3 (tmp3
, tmp2
, GEN_INT (bits
- 64)));
6694 /* tmp4 is operands[1], in V2DImode. */
6695 rtx tmp4
= force_reg (V2DImode
, gen_lowpart (V2DImode
, op1
));
6697 rtx tmp5
= gen_reg_rtx (V2DImode
);
6699 emit_insn (gen_ashlv2di3 (tmp5
, tmp4
, GEN_INT (bits
)));
6701 emit_insn (gen_lshrv2di3 (tmp5
, tmp4
, GEN_INT (bits
)));
6703 rtx tmp6
= gen_reg_rtx (V2DImode
);
6705 emit_insn (gen_lshrv2di3 (tmp6
, tmp2
, GEN_INT (64 - bits
)));
6707 emit_insn (gen_ashlv2di3 (tmp6
, tmp2
, GEN_INT (64 - bits
)));
6709 emit_insn (gen_iorv2di3 (tmp3
, tmp5
, tmp6
));
6712 /* Convert the result back to V1TImode and store in operands[0]. */
6713 rtx tmp7
= force_reg (V1TImode
, gen_lowpart (V1TImode
, tmp3
));
6714 emit_move_insn (operands
[0], tmp7
);
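/* A scalar model of the general case above (added for exposition): a
   128-bit shift left by 1 <= BITS <= 63 is composed from two 64-bit
   shifts plus the bits carried across the middle:

     static void
     model_shl128 (unsigned long long lo, unsigned long long hi, int bits,
		   unsigned long long *rlo, unsigned long long *rhi)
     {
       *rlo = lo << bits;
       *rhi = (hi << bits) | (lo >> (64 - bits));
     }

   The logical right shift case is the mirror image.  */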
6717 /* Expand V1TI mode rotate (of rtx_code CODE) by constant. */
6719 ix86_expand_v1ti_rotate (enum rtx_code code
, rtx operands
[])
6721 rtx op1
= force_reg (V1TImode
, operands
[1]);
6723 if (!CONST_INT_P (operands
[2]))
6725 rtx tmp1
= ix86_expand_v1ti_to_ti (op1
);
6726 rtx tmp2
= gen_reg_rtx (TImode
);
6727 rtx (*rotate
) (rtx
, rtx
, rtx
)
6728 = (code
== ROTATE
) ? gen_rotlti3
: gen_rotrti3
;
6729 emit_insn (rotate (tmp2
, tmp1
, operands
[2]));
6730 rtx tmp3
= ix86_expand_ti_to_v1ti (tmp2
);
6731 emit_move_insn (operands
[0], tmp3
);
6735 HOST_WIDE_INT bits
= INTVAL (operands
[2]) & 127;
6739 emit_move_insn (operands
[0], op1
);
6743 if (code
== ROTATERT
)
6746 if ((bits
& 31) == 0)
6748 rtx tmp2
= gen_reg_rtx (V4SImode
);
6749 rtx tmp1
= force_reg (V4SImode
, gen_lowpart (V4SImode
, op1
));
6751 emit_insn (gen_sse2_pshufd (tmp2
, tmp1
, GEN_INT (0x93)));
6752 else if (bits
== 64)
6753 emit_insn (gen_sse2_pshufd (tmp2
, tmp1
, GEN_INT (0x4e)));
6755 emit_insn (gen_sse2_pshufd (tmp2
, tmp1
, GEN_INT (0x39)));
6756 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp2
));
6760 if ((bits
& 7) == 0)
6762 rtx tmp1
= gen_reg_rtx (V1TImode
);
6763 rtx tmp2
= gen_reg_rtx (V1TImode
);
6764 rtx tmp3
= gen_reg_rtx (V1TImode
);
6766 emit_insn (gen_sse2_ashlv1ti3 (tmp1
, op1
, GEN_INT (bits
)));
6767 emit_insn (gen_sse2_lshrv1ti3 (tmp2
, op1
, GEN_INT (128 - bits
)));
6768 emit_insn (gen_iorv1ti3 (tmp3
, tmp1
, tmp2
));
6769 emit_move_insn (operands
[0], tmp3
);
6773 rtx op1_v4si
= force_reg (V4SImode
, gen_lowpart (V4SImode
, op1
));
6782 hibits
= gen_reg_rtx (V4SImode
);
6783 emit_insn (gen_sse2_pshufd (hibits
, op1_v4si
, GEN_INT (0x93)));
6787 lobits
= gen_reg_rtx (V4SImode
);
6788 hibits
= gen_reg_rtx (V4SImode
);
6789 emit_insn (gen_sse2_pshufd (lobits
, op1_v4si
, GEN_INT (0x93)));
6790 emit_insn (gen_sse2_pshufd (hibits
, op1_v4si
, GEN_INT (0x4e)));
6794 lobits
= gen_reg_rtx (V4SImode
);
6795 hibits
= gen_reg_rtx (V4SImode
);
6796 emit_insn (gen_sse2_pshufd (lobits
, op1_v4si
, GEN_INT (0x4e)));
6797 emit_insn (gen_sse2_pshufd (hibits
, op1_v4si
, GEN_INT (0x39)));
6801 lobits
= gen_reg_rtx (V4SImode
);
6802 emit_insn (gen_sse2_pshufd (lobits
, op1_v4si
, GEN_INT (0x39)));
6807 rtx tmp1
= gen_reg_rtx (V4SImode
);
6808 rtx tmp2
= gen_reg_rtx (V4SImode
);
6809 rtx tmp3
= gen_reg_rtx (V4SImode
);
6811 emit_insn (gen_ashlv4si3 (tmp1
, lobits
, GEN_INT (bits
& 31)));
6812 emit_insn (gen_lshrv4si3 (tmp2
, hibits
, GEN_INT (32 - (bits
& 31))));
6813 emit_insn (gen_iorv4si3 (tmp3
, tmp1
, tmp2
));
6815 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp3
));
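/* A model of the pshufd immediates used above (added for exposition):
   each two-bit field of the immediate selects a source dword,

     static void
     model_pshufd (const unsigned int src[4], unsigned int dst[4],
		   unsigned char imm)
     {
       for (int i = 0; i < 4; i++)
	 dst[i] = src[(imm >> (2 * i)) & 3];
     }

   so 0x93 rotates the four dwords up by one position (a rotate left by
   32 bits), 0x4e by two (64 bits) and 0x39 by three (96 bits).  */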
6818 /* Expand V1TI mode ashiftrt by constant. */
6820 ix86_expand_v1ti_ashiftrt (rtx operands
[])
6822 rtx op1
= force_reg (V1TImode
, operands
[1]);
6824 if (!CONST_INT_P (operands
[2]))
6826 rtx tmp1
= ix86_expand_v1ti_to_ti (op1
);
6827 rtx tmp2
= gen_reg_rtx (TImode
);
6828 emit_insn (gen_ashrti3 (tmp2
, tmp1
, operands
[2]));
6829 rtx tmp3
= ix86_expand_ti_to_v1ti (tmp2
);
6830 emit_move_insn (operands
[0], tmp3
);
6834 HOST_WIDE_INT bits
= INTVAL (operands
[2]) & 127;
6838 emit_move_insn (operands
[0], op1
);
6844 /* Two operations. */
6845 rtx tmp1
= force_reg(V4SImode
, gen_lowpart (V4SImode
, op1
));
6846 rtx tmp2
= gen_reg_rtx (V4SImode
);
6847 emit_insn (gen_sse2_pshufd (tmp2
, tmp1
, GEN_INT (0xff)));
6849 rtx tmp3
= gen_reg_rtx (V4SImode
);
6850 emit_insn (gen_ashrv4si3 (tmp3
, tmp2
, GEN_INT (31)));
6852 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp3
));
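/* A scalar model of the two-operation case above, which appears to
   handle the maximal shift count of 127 (added for exposition; shown
   on 64-bit halves):

     static void
     model_ashr_max (unsigned long long hi, unsigned long long out[2])
     {
       unsigned long long sign = (unsigned long long) ((long long) hi >> 63);
       out[0] = out[1] = sign;
     }

   An arithmetic shift all the way down leaves only the broadcast sign
   bit, so a pshufd plus a psrad by 31 suffice.  */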
6858 /* Three operations. */
6859 rtx tmp1
= force_reg(V4SImode
, gen_lowpart (V4SImode
, op1
));
6860 rtx tmp2
= gen_reg_rtx (V4SImode
);
6861 emit_insn (gen_sse2_pshufd (tmp2
, tmp1
, GEN_INT (0xff)));
6863 rtx tmp3
= gen_reg_rtx (V4SImode
);
6864 emit_insn (gen_ashrv4si3 (tmp3
, tmp2
, GEN_INT (31)));
6866 rtx tmp4
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp1
));
6867 rtx tmp5
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp3
));
6868 rtx tmp6
= gen_reg_rtx (V2DImode
);
6869 emit_insn (gen_vec_interleave_highv2di (tmp6
, tmp4
, tmp5
));
6871 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp6
));
6877 /* Three operations. */
6878 rtx tmp1
= force_reg(V4SImode
, gen_lowpart (V4SImode
, op1
));
6879 rtx tmp2
= gen_reg_rtx (V4SImode
);
6880 emit_insn (gen_ashrv4si3 (tmp2
, tmp1
, GEN_INT (31)));
6882 rtx tmp3
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp1
));
6883 rtx tmp4
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp2
));
6884 rtx tmp5
= gen_reg_rtx (V2DImode
);
6885 emit_insn (gen_vec_interleave_highv2di (tmp5
, tmp3
, tmp4
));
6887 rtx tmp6
= force_reg(V4SImode
, gen_lowpart (V4SImode
, tmp5
));
6888 rtx tmp7
= gen_reg_rtx (V4SImode
);
6889 emit_insn (gen_sse2_pshufd (tmp7
, tmp6
, GEN_INT (0xfd)));
6891 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp7
));
6897 /* Three operations. */
6898 rtx tmp1
= force_reg (V4SImode
, gen_lowpart (V4SImode
, op1
));
6899 rtx tmp2
= gen_reg_rtx (V4SImode
);
6900 emit_insn (gen_ashrv4si3 (tmp2
, tmp1
, GEN_INT (bits
- 96)));
6902 rtx tmp3
= force_reg (V8HImode
, gen_lowpart (V8HImode
, tmp2
));
6903 rtx tmp4
= gen_reg_rtx (V8HImode
);
6904 emit_insn (gen_sse2_pshufhw (tmp4
, tmp3
, GEN_INT (0xfe)));
6906 rtx tmp5
= force_reg (V4SImode
, gen_lowpart (V4SImode
, tmp4
));
6907 rtx tmp6
= gen_reg_rtx (V4SImode
);
6908 emit_insn (gen_sse2_pshufd (tmp6
, tmp5
, GEN_INT (0xfe)));
6910 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp6
));
6914 if (TARGET_AVX2
|| TARGET_SSE4_1
)
6916 /* Three operations. */
6919 rtx tmp1
= force_reg (V4SImode
, gen_lowpart (V4SImode
, op1
));
6920 rtx tmp2
= gen_reg_rtx (V4SImode
);
6921 emit_insn (gen_ashrv4si3 (tmp2
, tmp1
, GEN_INT (31)));
6923 rtx tmp3
= gen_reg_rtx (V1TImode
);
6924 emit_insn (gen_sse2_lshrv1ti3 (tmp3
, op1
, GEN_INT (32)));
6928 rtx tmp4
= force_reg (V4SImode
, gen_lowpart (V4SImode
, tmp3
));
6929 rtx tmp5
= gen_reg_rtx (V4SImode
);
6930 emit_insn (gen_avx2_pblenddv4si (tmp5
, tmp2
, tmp4
,
6933 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp5
));
6937 rtx tmp4
= force_reg (V8HImode
, gen_lowpart (V8HImode
, tmp2
));
6938 rtx tmp5
= force_reg (V8HImode
, gen_lowpart (V8HImode
, tmp3
));
6939 rtx tmp6
= gen_reg_rtx (V8HImode
);
6940 emit_insn (gen_sse4_1_pblendw (tmp6
, tmp4
, tmp5
,
6943 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp6
));
6948 /* Three operations. */
6949 if (bits
== 8 || bits
== 16 || bits
== 24)
6951 rtx tmp1
= force_reg (V4SImode
, gen_lowpart (V4SImode
, op1
));
6952 rtx tmp2
= gen_reg_rtx (V4SImode
);
6953 emit_insn (gen_ashrv4si3 (tmp2
, tmp1
, GEN_INT (bits
)));
6955 rtx tmp3
= gen_reg_rtx (V1TImode
);
6956 emit_insn (gen_sse2_lshrv1ti3 (tmp3
, op1
, GEN_INT (bits
)));
6960 rtx tmp4
= force_reg (V4SImode
, gen_lowpart (V4SImode
, tmp3
));
6961 rtx tmp5
= gen_reg_rtx (V4SImode
);
6962 emit_insn (gen_avx2_pblenddv4si (tmp5
, tmp2
, tmp4
,
6965 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp5
));
6969 rtx tmp4
= force_reg (V8HImode
, gen_lowpart (V8HImode
, tmp2
));
6970 rtx tmp5
= force_reg (V8HImode
, gen_lowpart (V8HImode
, tmp3
));
6971 rtx tmp6
= gen_reg_rtx (V8HImode
);
6972 emit_insn (gen_sse4_1_pblendw (tmp6
, tmp4
, tmp5
,
6975 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp6
));
6983 /* Four operations. */
6984 rtx tmp1
= force_reg (V4SImode
, gen_lowpart (V4SImode
, op1
));
6985 rtx tmp2
= gen_reg_rtx (V4SImode
);
6986 emit_insn (gen_ashrv4si3 (tmp2
, tmp1
, GEN_INT (bits
- 96)));
6988 rtx tmp3
= gen_reg_rtx (V4SImode
);
6989 emit_insn (gen_ashrv4si3 (tmp3
, tmp1
, GEN_INT (31)));
6991 rtx tmp4
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp2
));
6992 rtx tmp5
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp3
));
6993 rtx tmp6
= gen_reg_rtx (V2DImode
);
6994 emit_insn (gen_vec_interleave_highv2di (tmp6
, tmp4
, tmp5
));
6996 rtx tmp7
= force_reg (V4SImode
, gen_lowpart (V4SImode
, tmp6
));
6997 rtx tmp8
= gen_reg_rtx (V4SImode
);
6998 emit_insn (gen_sse2_pshufd (tmp8
, tmp7
, GEN_INT (0xfd)));
7000 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp8
));
7004 if (TARGET_SSE4_1
&& (bits
== 48 || bits
== 80))
7006 /* Four operations. */
7007 rtx tmp1
= force_reg (V4SImode
, gen_lowpart (V4SImode
, op1
));
7008 rtx tmp2
= gen_reg_rtx (V4SImode
);
7009 emit_insn (gen_sse2_pshufd (tmp2
, tmp1
, GEN_INT (0xff)));
7011 rtx tmp3
= gen_reg_rtx (V4SImode
);
7012 emit_insn (gen_ashrv4si3 (tmp3
, tmp2
, GEN_INT (31)));
7014 rtx tmp4
= gen_reg_rtx (V1TImode
);
7015 emit_insn (gen_sse2_lshrv1ti3 (tmp4
, op1
, GEN_INT (bits
)));
7017 rtx tmp5
= force_reg (V8HImode
, gen_lowpart (V8HImode
, tmp3
));
7018 rtx tmp6
= force_reg (V8HImode
, gen_lowpart (V8HImode
, tmp4
));
7019 rtx tmp7
= gen_reg_rtx (V8HImode
);
7020 emit_insn (gen_sse4_1_pblendw (tmp7
, tmp5
, tmp6
,
7021 GEN_INT (bits
== 48 ? 0x1f : 0x07)));
7023 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp7
));
7027 if ((bits
& 7) == 0)
7029 /* Five operations. */
7030 rtx tmp1
= force_reg (V4SImode
, gen_lowpart (V4SImode
, op1
));
7031 rtx tmp2
= gen_reg_rtx (V4SImode
);
7032 emit_insn (gen_sse2_pshufd (tmp2
, tmp1
, GEN_INT (0xff)));
7034 rtx tmp3
= gen_reg_rtx (V4SImode
);
7035 emit_insn (gen_ashrv4si3 (tmp3
, tmp2
, GEN_INT (31)));
7037 rtx tmp4
= gen_reg_rtx (V1TImode
);
7038 emit_insn (gen_sse2_lshrv1ti3 (tmp4
, op1
, GEN_INT (bits
)));
7040 rtx tmp5
= force_reg (V1TImode
, gen_lowpart (V1TImode
, tmp3
));
7041 rtx tmp6
= gen_reg_rtx (V1TImode
);
7042 emit_insn (gen_sse2_ashlv1ti3 (tmp6
, tmp5
, GEN_INT (128 - bits
)));
7044 rtx tmp7
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp4
));
7045 rtx tmp8
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp6
));
7046 rtx tmp9
= gen_reg_rtx (V2DImode
);
7047 emit_insn (gen_iorv2di3 (tmp9
, tmp7
, tmp8
));
7049 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp9
));
7053 if (TARGET_AVX2
&& bits
< 32)
7055 /* Six operations. */
7056 rtx tmp1
= force_reg (V4SImode
, gen_lowpart (V4SImode
, op1
));
7057 rtx tmp2
= gen_reg_rtx (V4SImode
);
7058 emit_insn (gen_ashrv4si3 (tmp2
, tmp1
, GEN_INT (bits
)));
7060 rtx tmp3
= gen_reg_rtx (V1TImode
);
7061 emit_insn (gen_sse2_lshrv1ti3 (tmp3
, op1
, GEN_INT (64)));
7063 rtx tmp4
= force_reg (V2DImode
, gen_lowpart (V2DImode
, op1
));
7064 rtx tmp5
= gen_reg_rtx (V2DImode
);
7065 emit_insn (gen_lshrv2di3 (tmp5
, tmp4
, GEN_INT (bits
)));
7067 rtx tmp6
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp3
));
7068 rtx tmp7
= gen_reg_rtx (V2DImode
);
7069 emit_insn (gen_ashlv2di3 (tmp7
, tmp6
, GEN_INT (64 - bits
)));
7071 rtx tmp8
= gen_reg_rtx (V2DImode
);
7072 emit_insn (gen_iorv2di3 (tmp8
, tmp5
, tmp7
));
7074 rtx tmp9
= force_reg (V4SImode
, gen_lowpart (V4SImode
, tmp8
));
7075 rtx tmp10
= gen_reg_rtx (V4SImode
);
7076 emit_insn (gen_avx2_pblenddv4si (tmp10
, tmp2
, tmp9
, GEN_INT (7)));
7078 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp10
));
7082 if (TARGET_SSE4_1
&& bits
< 15)
7084 /* Six operations. */
7085 rtx tmp1
= force_reg (V4SImode
, gen_lowpart (V4SImode
, op1
));
7086 rtx tmp2
= gen_reg_rtx (V4SImode
);
7087 emit_insn (gen_ashrv4si3 (tmp2
, tmp1
, GEN_INT (bits
)));
7089 rtx tmp3
= gen_reg_rtx (V1TImode
);
7090 emit_insn (gen_sse2_lshrv1ti3 (tmp3
, op1
, GEN_INT (64)));
7092 rtx tmp4
= force_reg (V2DImode
, gen_lowpart (V2DImode
, op1
));
7093 rtx tmp5
= gen_reg_rtx (V2DImode
);
7094 emit_insn (gen_lshrv2di3 (tmp5
, tmp4
, GEN_INT (bits
)));
7096 rtx tmp6
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp3
));
7097 rtx tmp7
= gen_reg_rtx (V2DImode
);
7098 emit_insn (gen_ashlv2di3 (tmp7
, tmp6
, GEN_INT (64 - bits
)));
7100 rtx tmp8
= gen_reg_rtx (V2DImode
);
7101 emit_insn (gen_iorv2di3 (tmp8
, tmp5
, tmp7
));
7103 rtx tmp9
= force_reg (V8HImode
, gen_lowpart (V8HImode
, tmp2
));
7104 rtx tmp10
= force_reg (V8HImode
, gen_lowpart (V8HImode
, tmp8
));
7105 rtx tmp11
= gen_reg_rtx (V8HImode
);
7106 emit_insn (gen_sse4_1_pblendw (tmp11
, tmp9
, tmp10
, GEN_INT (0x3f)));
7108 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp11
));
7114 /* Eight operations. */
7115 rtx tmp1
= gen_reg_rtx (V1TImode
);
7116 emit_insn (gen_sse2_lshrv1ti3 (tmp1
, op1
, GEN_INT (64)));
7118 rtx tmp2
= force_reg (V2DImode
, gen_lowpart (V2DImode
, op1
));
7119 rtx tmp3
= gen_reg_rtx (V2DImode
);
7120 emit_insn (gen_lshrv2di3 (tmp3
, tmp2
, GEN_INT (1)));
7122 rtx tmp4
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp1
));
7123 rtx tmp5
= gen_reg_rtx (V2DImode
);
7124 emit_insn (gen_ashlv2di3 (tmp5
, tmp4
, GEN_INT (63)));
7126 rtx tmp6
= gen_reg_rtx (V2DImode
);
7127 emit_insn (gen_iorv2di3 (tmp6
, tmp3
, tmp5
));
7129 rtx tmp7
= gen_reg_rtx (V2DImode
);
7130 emit_insn (gen_lshrv2di3 (tmp7
, tmp2
, GEN_INT (63)));
7132 rtx tmp8
= force_reg (V4SImode
, gen_lowpart (V4SImode
, tmp7
));
7133 rtx tmp9
= gen_reg_rtx (V4SImode
);
7134 emit_insn (gen_sse2_pshufd (tmp9
, tmp8
, GEN_INT (0xbf)));
7136 rtx tmp10
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp9
));
7137 rtx tmp11
= gen_reg_rtx (V2DImode
);
7138 emit_insn (gen_ashlv2di3 (tmp11
, tmp10
, GEN_INT (31)));
7140 rtx tmp12
= gen_reg_rtx (V2DImode
);
7141 emit_insn (gen_iorv2di3 (tmp12
, tmp6
, tmp11
));
7143 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp12
));
7149 /* Eight operations. */
7150 rtx tmp1
= force_reg (V4SImode
, gen_lowpart (V4SImode
, op1
));
7151 rtx tmp2
= gen_reg_rtx (V4SImode
);
7152 emit_insn (gen_sse2_pshufd (tmp2
, tmp1
, GEN_INT (0xff)));
7154 rtx tmp3
= gen_reg_rtx (V4SImode
);
7155 emit_insn (gen_ashrv4si3 (tmp3
, tmp2
, GEN_INT (31)));
7157 rtx tmp4
= gen_reg_rtx (V1TImode
);
7158 emit_insn (gen_sse2_lshrv1ti3 (tmp4
, op1
, GEN_INT (64)));
7160 rtx tmp5
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp4
));
7161 rtx tmp6
= gen_reg_rtx (V2DImode
);
7162 emit_insn (gen_lshrv2di3 (tmp6
, tmp5
, GEN_INT (bits
- 64)));
7164 rtx tmp7
= force_reg (V1TImode
, gen_lowpart (V1TImode
, tmp3
));
7165 rtx tmp8
= gen_reg_rtx (V1TImode
);
7166 emit_insn (gen_sse2_ashlv1ti3 (tmp8
, tmp7
, GEN_INT (64)));
7168 rtx tmp9
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp3
));
7169 rtx tmp10
= gen_reg_rtx (V2DImode
);
7170 emit_insn (gen_ashlv2di3 (tmp10
, tmp9
, GEN_INT (128 - bits
)));
7172 rtx tmp11
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp8
));
7173 rtx tmp12
= gen_reg_rtx (V2DImode
);
7174 emit_insn (gen_iorv2di3 (tmp12
, tmp10
, tmp11
));
7176 rtx tmp13
= gen_reg_rtx (V2DImode
);
7177 emit_insn (gen_iorv2di3 (tmp13
, tmp6
, tmp12
));
7179 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp13
));
7183 /* Nine operations. */
7184 rtx tmp1
= force_reg (V4SImode
, gen_lowpart (V4SImode
, op1
));
7185 rtx tmp2
= gen_reg_rtx (V4SImode
);
7186 emit_insn (gen_sse2_pshufd (tmp2
, tmp1
, GEN_INT (0xff)));
7188 rtx tmp3
= gen_reg_rtx (V4SImode
);
7189 emit_insn (gen_ashrv4si3 (tmp3
, tmp2
, GEN_INT (31)));
7191 rtx tmp4
= gen_reg_rtx (V1TImode
);
7192 emit_insn (gen_sse2_lshrv1ti3 (tmp4
, op1
, GEN_INT (64)));
7194 rtx tmp5
= force_reg (V2DImode
, gen_lowpart (V2DImode
, op1
));
7195 rtx tmp6
= gen_reg_rtx (V2DImode
);
7196 emit_insn (gen_lshrv2di3 (tmp6
, tmp5
, GEN_INT (bits
)));
7198 rtx tmp7
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp4
));
7199 rtx tmp8
= gen_reg_rtx (V2DImode
);
7200 emit_insn (gen_ashlv2di3 (tmp8
, tmp7
, GEN_INT (64 - bits
)));
7202 rtx tmp9
= gen_reg_rtx (V2DImode
);
7203 emit_insn (gen_iorv2di3 (tmp9
, tmp6
, tmp8
));
7205 rtx tmp10
= force_reg (V1TImode
, gen_lowpart (V1TImode
, tmp3
));
7206 rtx tmp11
= gen_reg_rtx (V1TImode
);
7207 emit_insn (gen_sse2_ashlv1ti3 (tmp11
, tmp10
, GEN_INT (64)));
7209 rtx tmp12
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp11
));
7210 rtx tmp13
= gen_reg_rtx (V2DImode
);
7211 emit_insn (gen_ashlv2di3 (tmp13
, tmp12
, GEN_INT (64 - bits
)));
7213 rtx tmp14
= gen_reg_rtx (V2DImode
);
7214 emit_insn (gen_iorv2di3 (tmp14
, tmp9
, tmp13
));
7216 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp14
));
7220 /* Replace all occurrences of REG FROM with REG TO in X, including
7221 occurrences with different modes. */
7224 ix86_replace_reg_with_reg (rtx x
, rtx from
, rtx to
)
7226 gcc_checking_assert (REG_P (from
)
7228 && GET_MODE (from
) == GET_MODE (to
));
7229 if (!reg_overlap_mentioned_p (from
, x
))
7231 rtx ret
= copy_rtx (x
);
7232 subrtx_ptr_iterator::array_type array
;
7233 FOR_EACH_SUBRTX_PTR (iter
, array
, &ret
, NONCONST
)
7237 if (REG_P (x
) && REGNO (x
) == REGNO (from
))
7243 gcc_checking_assert (REG_NREGS (x
) == 1);
7244 *loc
= gen_rtx_REG (GET_MODE (x
), REGNO (to
));
7251 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
7252 DImode for constant loop counts. */
7255 counter_mode (rtx count_exp
)
7257 if (GET_MODE (count_exp
) != VOIDmode
)
7258 return GET_MODE (count_exp
);
7259 if (!CONST_INT_P (count_exp
))
7261 if (TARGET_64BIT
&& (INTVAL (count_exp
) & ~0xffffffff))
7266 /* When ISSETMEM is FALSE, output simple loop to move memory pointer to SRCPTR
7267 to DESTPTR via chunks of MODE unrolled UNROLL times, overall size is COUNT
7268 specified in bytes. When ISSETMEM is TRUE, output the equivalent loop to set
7269 memory by VALUE (supposed to be in MODE).
7271 The size is rounded down to whole number of chunk size moved at once.
7272 SRCMEM and DESTMEM provide MEMrtx to feed proper aliasing info. */
7276 expand_set_or_cpymem_via_loop (rtx destmem
, rtx srcmem
,
7277 rtx destptr
, rtx srcptr
, rtx value
,
7278 rtx count
, machine_mode mode
, int unroll
,
7279 int expected_size
, bool issetmem
)
7281 rtx_code_label
*out_label
, *top_label
;
7283 machine_mode iter_mode
= counter_mode (count
);
7284 int piece_size_n
= GET_MODE_SIZE (mode
) * unroll
;
7285 rtx piece_size
= GEN_INT (piece_size_n
);
7286 rtx piece_size_mask
= GEN_INT (~((GET_MODE_SIZE (mode
) * unroll
) - 1));
7290 top_label
= gen_label_rtx ();
7291 out_label
= gen_label_rtx ();
7292 iter
= gen_reg_rtx (iter_mode
);
7294 size
= expand_simple_binop (iter_mode
, AND
, count
, piece_size_mask
,
7295 NULL
, 1, OPTAB_DIRECT
);
7296 /* Those two should combine. */
7297 if (piece_size
== const1_rtx
)
7299 emit_cmp_and_jump_insns (size
, const0_rtx
, EQ
, NULL_RTX
, iter_mode
,
7301 predict_jump (REG_BR_PROB_BASE
* 10 / 100);
7303 emit_move_insn (iter
, const0_rtx
);
7305 emit_label (top_label
);
7307 tmp
= convert_modes (Pmode
, iter_mode
, iter
, true);
7309 /* This assert could be relaxed - in this case we'll need to compute
7310 smallest power of two, containing in PIECE_SIZE_N and pass it to
7312 gcc_assert ((piece_size_n
& (piece_size_n
- 1)) == 0);
7313 destmem
= offset_address (destmem
, tmp
, piece_size_n
);
7314 destmem
= adjust_address (destmem
, mode
, 0);
7318 srcmem
= offset_address (srcmem
, copy_rtx (tmp
), piece_size_n
);
7319 srcmem
= adjust_address (srcmem
, mode
, 0);
7321 /* When unrolling for chips that reorder memory reads and writes,
7322 we can save registers by using single temporary.
7323 Also using 4 temporaries is overkill in 32bit mode. */
7324 if (!TARGET_64BIT
&& 0)
7326 for (i
= 0; i
< unroll
; i
++)
7330 destmem
= adjust_address (copy_rtx (destmem
), mode
,
7331 GET_MODE_SIZE (mode
));
7332 srcmem
= adjust_address (copy_rtx (srcmem
), mode
,
7333 GET_MODE_SIZE (mode
));
7335 emit_move_insn (destmem
, srcmem
);
7341 gcc_assert (unroll
<= 4);
7342 for (i
= 0; i
< unroll
; i
++)
7344 tmpreg
[i
] = gen_reg_rtx (mode
);
7346 srcmem
= adjust_address (copy_rtx (srcmem
), mode
,
7347 GET_MODE_SIZE (mode
));
7348 emit_move_insn (tmpreg
[i
], srcmem
);
7350 for (i
= 0; i
< unroll
; i
++)
7353 destmem
= adjust_address (copy_rtx (destmem
), mode
,
7354 GET_MODE_SIZE (mode
));
7355 emit_move_insn (destmem
, tmpreg
[i
]);
7360 for (i
= 0; i
< unroll
; i
++)
7363 destmem
= adjust_address (copy_rtx (destmem
), mode
,
7364 GET_MODE_SIZE (mode
));
7365 emit_move_insn (destmem
, value
);
7368 tmp
= expand_simple_binop (iter_mode
, PLUS
, iter
, piece_size
, iter
,
7369 true, OPTAB_LIB_WIDEN
);
7371 emit_move_insn (iter
, tmp
);
7373 emit_cmp_and_jump_insns (iter
, size
, LT
, NULL_RTX
, iter_mode
,
7375 if (expected_size
!= -1)
7377 expected_size
/= GET_MODE_SIZE (mode
) * unroll
;
7378 if (expected_size
== 0)
7380 else if (expected_size
> REG_BR_PROB_BASE
)
7381 predict_jump (REG_BR_PROB_BASE
- 1);
7383 predict_jump (REG_BR_PROB_BASE
- (REG_BR_PROB_BASE
+ expected_size
/ 2)
7387 predict_jump (REG_BR_PROB_BASE
* 80 / 100);
7388 iter
= ix86_zero_extend_to_Pmode (iter
);
7389 tmp
= expand_simple_binop (Pmode
, PLUS
, destptr
, iter
, destptr
,
7390 true, OPTAB_LIB_WIDEN
);
7392 emit_move_insn (destptr
, tmp
);
7395 tmp
= expand_simple_binop (Pmode
, PLUS
, srcptr
, iter
, srcptr
,
7396 true, OPTAB_LIB_WIDEN
);
7398 emit_move_insn (srcptr
, tmp
);
7400 emit_label (out_label
);
7403 /* Divide COUNTREG by SCALE. */
7405 scale_counter (rtx countreg
, int scale
)
7411 if (CONST_INT_P (countreg
))
7412 return GEN_INT (INTVAL (countreg
) / scale
);
7413 gcc_assert (REG_P (countreg
));
7415 sc
= expand_simple_binop (GET_MODE (countreg
), LSHIFTRT
, countreg
,
7416 GEN_INT (exact_log2 (scale
)),
7417 NULL
, 1, OPTAB_DIRECT
);
7421 /* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
7422 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
7423 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
7424 For setmem case, VALUE is a promoted to a wider size ORIG_VALUE.
7425 ORIG_VALUE is the original value passed to memset to fill the memory with.
7426 Other arguments have same meaning as for previous function. */
7429 expand_set_or_cpymem_via_rep (rtx destmem
, rtx srcmem
,
7430 rtx destptr
, rtx srcptr
, rtx value
, rtx orig_value
,
7432 machine_mode mode
, bool issetmem
)
7437 HOST_WIDE_INT rounded_count
;
7439 /* If possible, it is shorter to use rep movs.
7440 TODO: Maybe it is better to move this logic to decide_alg. */
7441 if (mode
== QImode
&& CONST_INT_P (count
) && !(INTVAL (count
) & 3)
7442 && !TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
7443 && (!issetmem
|| orig_value
== const0_rtx
))
7446 if (destptr
!= XEXP (destmem
, 0) || GET_MODE (destmem
) != BLKmode
)
7447 destmem
= adjust_automodify_address_nv (destmem
, BLKmode
, destptr
, 0);
7449 countreg
= ix86_zero_extend_to_Pmode (scale_counter (count
,
7450 GET_MODE_SIZE (mode
)));
7453 destexp
= gen_rtx_ASHIFT (Pmode
, countreg
,
7454 GEN_INT (exact_log2 (GET_MODE_SIZE (mode
))));
7455 destexp
= gen_rtx_PLUS (Pmode
, destexp
, destptr
);
7458 destexp
= gen_rtx_PLUS (Pmode
, destptr
, countreg
);
7459 if ((!issetmem
|| orig_value
== const0_rtx
) && CONST_INT_P (count
))
7462 = ROUND_DOWN (INTVAL (count
), (HOST_WIDE_INT
) GET_MODE_SIZE (mode
));
7463 destmem
= shallow_copy_rtx (destmem
);
7464 set_mem_size (destmem
, rounded_count
);
7466 else if (MEM_SIZE_KNOWN_P (destmem
))
7467 clear_mem_size (destmem
);
7471 value
= force_reg (mode
, gen_lowpart (mode
, value
));
7472 emit_insn (gen_rep_stos (destptr
, countreg
, destmem
, value
, destexp
));
7476 if (srcptr
!= XEXP (srcmem
, 0) || GET_MODE (srcmem
) != BLKmode
)
7477 srcmem
= adjust_automodify_address_nv (srcmem
, BLKmode
, srcptr
, 0);
7480 srcexp
= gen_rtx_ASHIFT (Pmode
, countreg
,
7481 GEN_INT (exact_log2 (GET_MODE_SIZE (mode
))));
7482 srcexp
= gen_rtx_PLUS (Pmode
, srcexp
, srcptr
);
7485 srcexp
= gen_rtx_PLUS (Pmode
, srcptr
, countreg
);
7486 if (CONST_INT_P (count
))
7489 = ROUND_DOWN (INTVAL (count
), (HOST_WIDE_INT
) GET_MODE_SIZE (mode
));
7490 srcmem
= shallow_copy_rtx (srcmem
);
7491 set_mem_size (srcmem
, rounded_count
);
7495 if (MEM_SIZE_KNOWN_P (srcmem
))
7496 clear_mem_size (srcmem
);
7498 emit_insn (gen_rep_mov (destptr
, destmem
, srcptr
, srcmem
, countreg
,
7503 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
7505 SRC is passed by pointer to be updated on return.
7506 Return value is updated DST. */
7508 emit_memmov (rtx destmem
, rtx
*srcmem
, rtx destptr
, rtx srcptr
,
7509 HOST_WIDE_INT size_to_move
)
7511 rtx dst
= destmem
, src
= *srcmem
, tempreg
;
7512 enum insn_code code
;
7513 machine_mode move_mode
;
7516 /* Find the widest mode in which we could perform moves.
7517 Start with the biggest power of 2 less than SIZE_TO_MOVE and half
7518 it until move of such size is supported. */
7519 piece_size
= 1 << floor_log2 (size_to_move
);
7520 while (!int_mode_for_size (piece_size
* BITS_PER_UNIT
, 0).exists (&move_mode
)
7521 || (code
= optab_handler (mov_optab
, move_mode
)) == CODE_FOR_nothing
)
7523 gcc_assert (piece_size
> 1);
7527 /* Find the corresponding vector mode with the same size as MOVE_MODE.
7528 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
7529 if (GET_MODE_SIZE (move_mode
) > GET_MODE_SIZE (word_mode
))
7531 int nunits
= GET_MODE_SIZE (move_mode
) / GET_MODE_SIZE (word_mode
);
7532 if (!mode_for_vector (word_mode
, nunits
).exists (&move_mode
)
7533 || (code
= optab_handler (mov_optab
, move_mode
)) == CODE_FOR_nothing
)
7535 move_mode
= word_mode
;
7536 piece_size
= GET_MODE_SIZE (move_mode
);
7537 code
= optab_handler (mov_optab
, move_mode
);
7540 gcc_assert (code
!= CODE_FOR_nothing
);
7542 dst
= adjust_automodify_address_nv (dst
, move_mode
, destptr
, 0);
7543 src
= adjust_automodify_address_nv (src
, move_mode
, srcptr
, 0);
7545 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
7546 gcc_assert (size_to_move
% piece_size
== 0);
7548 for (i
= 0; i
< size_to_move
; i
+= piece_size
)
7550 /* We move from memory to memory, so we'll need to do it via
7551 a temporary register. */
7552 tempreg
= gen_reg_rtx (move_mode
);
7553 emit_insn (GEN_FCN (code
) (tempreg
, src
));
7554 emit_insn (GEN_FCN (code
) (dst
, tempreg
));
7556 emit_move_insn (destptr
,
7557 plus_constant (Pmode
, copy_rtx (destptr
), piece_size
));
7558 emit_move_insn (srcptr
,
7559 plus_constant (Pmode
, copy_rtx (srcptr
), piece_size
));
7561 dst
= adjust_automodify_address_nv (dst
, move_mode
, destptr
,
7563 src
= adjust_automodify_address_nv (src
, move_mode
, srcptr
,
7567 /* Update DST and SRC rtx. */
7572 /* Helper function for the string operations below. Dest VARIABLE whether
7573 it is aligned to VALUE bytes. If true, jump to the label. */
7575 static rtx_code_label
*
7576 ix86_expand_aligntest (rtx variable
, int value
, bool epilogue
)
7578 rtx_code_label
*label
= gen_label_rtx ();
7579 rtx tmpcount
= gen_reg_rtx (GET_MODE (variable
));
7580 if (GET_MODE (variable
) == DImode
)
7581 emit_insn (gen_anddi3 (tmpcount
, variable
, GEN_INT (value
)));
7583 emit_insn (gen_andsi3 (tmpcount
, variable
, GEN_INT (value
)));
7584 emit_cmp_and_jump_insns (tmpcount
, const0_rtx
, EQ
, 0, GET_MODE (variable
),
7587 predict_jump (REG_BR_PROB_BASE
* 50 / 100);
7589 predict_jump (REG_BR_PROB_BASE
* 90 / 100);
7594 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
7597 expand_cpymem_epilogue (rtx destmem
, rtx srcmem
,
7598 rtx destptr
, rtx srcptr
, rtx count
, int max_size
)
7601 if (CONST_INT_P (count
))
7603 HOST_WIDE_INT countval
= INTVAL (count
);
7604 HOST_WIDE_INT epilogue_size
= countval
% max_size
;
7607 /* For now MAX_SIZE should be a power of 2. This assert could be
7608 relaxed, but it'll require a bit more complicated epilogue
7610 gcc_assert ((max_size
& (max_size
- 1)) == 0);
7611 for (i
= max_size
; i
>= 1; i
>>= 1)
7613 if (epilogue_size
& i
)
7614 destmem
= emit_memmov (destmem
, &srcmem
, destptr
, srcptr
, i
);
7620 count
= expand_simple_binop (GET_MODE (count
), AND
, count
, GEN_INT (max_size
- 1),
7621 count
, 1, OPTAB_DIRECT
);
7622 expand_set_or_cpymem_via_loop (destmem
, srcmem
, destptr
, srcptr
, NULL
,
7623 count
, QImode
, 1, 4, false);
7627 /* When there are stringops, we can cheaply increase dest and src pointers.
7628 Otherwise we save code size by maintaining offset (zero is readily
7629 available from preceding rep operation) and using x86 addressing modes.
7631 if (TARGET_SINGLE_STRINGOP
)
7635 rtx_code_label
*label
= ix86_expand_aligntest (count
, 4, true);
7636 src
= change_address (srcmem
, SImode
, srcptr
);
7637 dest
= change_address (destmem
, SImode
, destptr
);
7638 emit_insn (gen_strmov (destptr
, dest
, srcptr
, src
));
7640 LABEL_NUSES (label
) = 1;
7644 rtx_code_label
*label
= ix86_expand_aligntest (count
, 2, true);
7645 src
= change_address (srcmem
, HImode
, srcptr
);
7646 dest
= change_address (destmem
, HImode
, destptr
);
7647 emit_insn (gen_strmov (destptr
, dest
, srcptr
, src
));
7649 LABEL_NUSES (label
) = 1;
7653 rtx_code_label
*label
= ix86_expand_aligntest (count
, 1, true);
7654 src
= change_address (srcmem
, QImode
, srcptr
);
7655 dest
= change_address (destmem
, QImode
, destptr
);
7656 emit_insn (gen_strmov (destptr
, dest
, srcptr
, src
));
7658 LABEL_NUSES (label
) = 1;
7663 rtx offset
= force_reg (Pmode
, const0_rtx
);
7668 rtx_code_label
*label
= ix86_expand_aligntest (count
, 4, true);
7669 src
= change_address (srcmem
, SImode
, srcptr
);
7670 dest
= change_address (destmem
, SImode
, destptr
);
7671 emit_move_insn (dest
, src
);
7672 tmp
= expand_simple_binop (Pmode
, PLUS
, offset
, GEN_INT (4), NULL
,
7673 true, OPTAB_LIB_WIDEN
);
7675 emit_move_insn (offset
, tmp
);
7677 LABEL_NUSES (label
) = 1;
7681 rtx_code_label
*label
= ix86_expand_aligntest (count
, 2, true);
7682 tmp
= gen_rtx_PLUS (Pmode
, srcptr
, offset
);
7683 src
= change_address (srcmem
, HImode
, tmp
);
7684 tmp
= gen_rtx_PLUS (Pmode
, destptr
, offset
);
7685 dest
= change_address (destmem
, HImode
, tmp
);
7686 emit_move_insn (dest
, src
);
7687 tmp
= expand_simple_binop (Pmode
, PLUS
, offset
, GEN_INT (2), tmp
,
7688 true, OPTAB_LIB_WIDEN
);
7690 emit_move_insn (offset
, tmp
);
7692 LABEL_NUSES (label
) = 1;
7696 rtx_code_label
*label
= ix86_expand_aligntest (count
, 1, true);
7697 tmp
= gen_rtx_PLUS (Pmode
, srcptr
, offset
);
7698 src
= change_address (srcmem
, QImode
, tmp
);
7699 tmp
= gen_rtx_PLUS (Pmode
, destptr
, offset
);
7700 dest
= change_address (destmem
, QImode
, tmp
);
7701 emit_move_insn (dest
, src
);
7703 LABEL_NUSES (label
) = 1;
7708 /* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
7709 with value PROMOTED_VAL.
7710 SRC is passed by pointer to be updated on return.
7711 Return value is updated DST. */
7713 emit_memset (rtx destmem
, rtx destptr
, rtx promoted_val
,
7714 HOST_WIDE_INT size_to_move
)
7717 enum insn_code code
;
7718 machine_mode move_mode
;
7721 /* Find the widest mode in which we could perform moves.
7722 Start with the biggest power of 2 less than SIZE_TO_MOVE and half
7723 it until move of such size is supported. */
7724 move_mode
= GET_MODE (promoted_val
);
7725 if (move_mode
== VOIDmode
)
7727 if (size_to_move
< GET_MODE_SIZE (move_mode
))
7729 unsigned int move_bits
= size_to_move
* BITS_PER_UNIT
;
7730 move_mode
= int_mode_for_size (move_bits
, 0).require ();
7731 promoted_val
= gen_lowpart (move_mode
, promoted_val
);
7733 piece_size
= GET_MODE_SIZE (move_mode
);
7734 code
= optab_handler (mov_optab
, move_mode
);
7735 gcc_assert (code
!= CODE_FOR_nothing
&& promoted_val
!= NULL_RTX
);
7737 dst
= adjust_automodify_address_nv (dst
, move_mode
, destptr
, 0);
7739 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
7740 gcc_assert (size_to_move
% piece_size
== 0);
7742 for (i
= 0; i
< size_to_move
; i
+= piece_size
)
7744 if (piece_size
<= GET_MODE_SIZE (word_mode
))
7746 emit_insn (gen_strset (destptr
, dst
, promoted_val
));
7747 dst
= adjust_automodify_address_nv (dst
, move_mode
, destptr
,
7752 emit_insn (GEN_FCN (code
) (dst
, promoted_val
));
7754 emit_move_insn (destptr
,
7755 plus_constant (Pmode
, copy_rtx (destptr
), piece_size
));
7757 dst
= adjust_automodify_address_nv (dst
, move_mode
, destptr
,
7761 /* Update DST rtx. */
7764 /* Output code to set at most count & (max_size - 1) bytes starting by DEST. */
7766 expand_setmem_epilogue_via_loop (rtx destmem
, rtx destptr
, rtx value
,
7767 rtx count
, int max_size
)
7769 count
= expand_simple_binop (counter_mode (count
), AND
, count
,
7770 GEN_INT (max_size
- 1), count
, 1, OPTAB_DIRECT
);
7771 expand_set_or_cpymem_via_loop (destmem
, NULL
, destptr
, NULL
,
7772 gen_lowpart (QImode
, value
), count
, QImode
,
7773 1, max_size
/ 2, true);
7776 /* Output code to set at most count & (max_size - 1) bytes starting by DEST. */
7778 expand_setmem_epilogue (rtx destmem
, rtx destptr
, rtx value
, rtx vec_value
,
7779 rtx count
, int max_size
)
7783 if (CONST_INT_P (count
))
7785 HOST_WIDE_INT countval
= INTVAL (count
);
7786 HOST_WIDE_INT epilogue_size
= countval
% max_size
;
7789 /* For now MAX_SIZE should be a power of 2. This assert could be
7790 relaxed, but it'll require a bit more complicated epilogue
7792 gcc_assert ((max_size
& (max_size
- 1)) == 0);
7793 for (i
= max_size
; i
>= 1; i
>>= 1)
7795 if (epilogue_size
& i
)
7797 if (vec_value
&& i
> GET_MODE_SIZE (GET_MODE (value
)))
7798 destmem
= emit_memset (destmem
, destptr
, vec_value
, i
);
7800 destmem
= emit_memset (destmem
, destptr
, value
, i
);
7807 expand_setmem_epilogue_via_loop (destmem
, destptr
, value
, count
, max_size
);
7812 rtx_code_label
*label
= ix86_expand_aligntest (count
, 16, true);
7815 dest
= change_address (destmem
, DImode
, destptr
);
7816 emit_insn (gen_strset (destptr
, dest
, value
));
7817 dest
= adjust_automodify_address_nv (dest
, DImode
, destptr
, 8);
7818 emit_insn (gen_strset (destptr
, dest
, value
));
7822 dest
= change_address (destmem
, SImode
, destptr
);
7823 emit_insn (gen_strset (destptr
, dest
, value
));
7824 dest
= adjust_automodify_address_nv (dest
, SImode
, destptr
, 4);
7825 emit_insn (gen_strset (destptr
, dest
, value
));
7826 dest
= adjust_automodify_address_nv (dest
, SImode
, destptr
, 8);
7827 emit_insn (gen_strset (destptr
, dest
, value
));
7828 dest
= adjust_automodify_address_nv (dest
, SImode
, destptr
, 12);
7829 emit_insn (gen_strset (destptr
, dest
, value
));
7832 LABEL_NUSES (label
) = 1;
7836 rtx_code_label
*label
= ix86_expand_aligntest (count
, 8, true);
7839 dest
= change_address (destmem
, DImode
, destptr
);
7840 emit_insn (gen_strset (destptr
, dest
, value
));
7844 dest
= change_address (destmem
, SImode
, destptr
);
7845 emit_insn (gen_strset (destptr
, dest
, value
));
7846 dest
= adjust_automodify_address_nv (dest
, SImode
, destptr
, 4);
7847 emit_insn (gen_strset (destptr
, dest
, value
));
7850 LABEL_NUSES (label
) = 1;
7854 rtx_code_label
*label
= ix86_expand_aligntest (count
, 4, true);
7855 dest
= change_address (destmem
, SImode
, destptr
);
7856 emit_insn (gen_strset (destptr
, dest
, gen_lowpart (SImode
, value
)));
7858 LABEL_NUSES (label
) = 1;
7862 rtx_code_label
*label
= ix86_expand_aligntest (count
, 2, true);
7863 dest
= change_address (destmem
, HImode
, destptr
);
7864 emit_insn (gen_strset (destptr
, dest
, gen_lowpart (HImode
, value
)));
7866 LABEL_NUSES (label
) = 1;
7870 rtx_code_label
*label
= ix86_expand_aligntest (count
, 1, true);
7871 dest
= change_address (destmem
, QImode
, destptr
);
7872 emit_insn (gen_strset (destptr
, dest
, gen_lowpart (QImode
, value
)));
7874 LABEL_NUSES (label
) = 1;
7878 /* Adjust COUNTER by the VALUE. */
7880 ix86_adjust_counter (rtx countreg
, HOST_WIDE_INT value
)
7882 emit_insn (gen_add2_insn (countreg
, GEN_INT (-value
)));
7885 /* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
7886 DESTMEM to align it to DESIRED_ALIGNMENT. Original alignment is ALIGN.
7887 Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
7889 Return value is updated DESTMEM. */
7892 expand_set_or_cpymem_prologue (rtx destmem
, rtx srcmem
,
7893 rtx destptr
, rtx srcptr
, rtx value
,
7894 rtx vec_value
, rtx count
, int align
,
7895 int desired_alignment
, bool issetmem
)
7898 for (i
= 1; i
< desired_alignment
; i
<<= 1)
7902 rtx_code_label
*label
= ix86_expand_aligntest (destptr
, i
, false);
7905 if (vec_value
&& i
> GET_MODE_SIZE (GET_MODE (value
)))
7906 destmem
= emit_memset (destmem
, destptr
, vec_value
, i
);
7908 destmem
= emit_memset (destmem
, destptr
, value
, i
);
7911 destmem
= emit_memmov (destmem
, &srcmem
, destptr
, srcptr
, i
);
7912 ix86_adjust_counter (count
, i
);
7914 LABEL_NUSES (label
) = 1;
7915 set_mem_align (destmem
, i
* 2 * BITS_PER_UNIT
);
7921 /* Test if COUNT&SIZE is nonzero and if so, expand movme
7922 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
7923 and jump to DONE_LABEL. */
7925 expand_small_cpymem_or_setmem (rtx destmem
, rtx srcmem
,
7926 rtx destptr
, rtx srcptr
,
7927 rtx value
, rtx vec_value
,
7928 rtx count
, int size
,
7929 rtx done_label
, bool issetmem
)
7931 rtx_code_label
*label
= ix86_expand_aligntest (count
, size
, false);
7932 machine_mode mode
= int_mode_for_size (size
* BITS_PER_UNIT
, 1).else_blk ();
7936 /* If we do not have vector value to copy, we must reduce size. */
7941 if (GET_MODE (value
) == VOIDmode
&& size
> 8)
7943 else if (GET_MODE_SIZE (mode
) > GET_MODE_SIZE (GET_MODE (value
)))
7944 mode
= GET_MODE (value
);
7947 mode
= GET_MODE (vec_value
), value
= vec_value
;
7951 /* Choose appropriate vector mode. */
7953 mode
= TARGET_AVX
? V32QImode
: TARGET_SSE
? V16QImode
: DImode
;
7954 else if (size
>= 16)
7955 mode
= TARGET_SSE
? V16QImode
: DImode
;
7956 srcmem
= change_address (srcmem
, mode
, srcptr
);
7958 destmem
= change_address (destmem
, mode
, destptr
);
7959 modesize
= GEN_INT (GET_MODE_SIZE (mode
));
7960 gcc_assert (GET_MODE_SIZE (mode
) <= size
);
7961 for (n
= 0; n
* GET_MODE_SIZE (mode
) < size
; n
++)
7964 emit_move_insn (destmem
, gen_lowpart (mode
, value
));
7967 emit_move_insn (destmem
, srcmem
);
7968 srcmem
= offset_address (srcmem
, modesize
, GET_MODE_SIZE (mode
));
7970 destmem
= offset_address (destmem
, modesize
, GET_MODE_SIZE (mode
));
7973 destmem
= offset_address (destmem
, count
, 1);
7974 destmem
= offset_address (destmem
, GEN_INT (-2 * size
),
7975 GET_MODE_SIZE (mode
));
7978 srcmem
= offset_address (srcmem
, count
, 1);
7979 srcmem
= offset_address (srcmem
, GEN_INT (-2 * size
),
7980 GET_MODE_SIZE (mode
));
7982 for (n
= 0; n
* GET_MODE_SIZE (mode
) < size
; n
++)
7985 emit_move_insn (destmem
, gen_lowpart (mode
, value
));
7988 emit_move_insn (destmem
, srcmem
);
7989 srcmem
= offset_address (srcmem
, modesize
, GET_MODE_SIZE (mode
));
7991 destmem
= offset_address (destmem
, modesize
, GET_MODE_SIZE (mode
));
7993 emit_jump_insn (gen_jump (done_label
));
7997 LABEL_NUSES (label
) = 1;
8000 /* Handle small memcpy (up to SIZE that is supposed to be small power of 2.
8001 and get ready for the main memcpy loop by copying iniital DESIRED_ALIGN-ALIGN
8002 bytes and last SIZE bytes adjusitng DESTPTR/SRCPTR/COUNT in a way we can
8003 proceed with an loop copying SIZE bytes at once. Do moves in MODE.
8004 DONE_LABEL is a label after the whole copying sequence. The label is created
8005 on demand if *DONE_LABEL is NULL.
8006 MIN_SIZE is minimal size of block copied. This value gets adjusted for new
8007 bounds after the initial copies.
8009 DESTMEM/SRCMEM are memory expressions pointing to the copies block,
8010 DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicate whether
8011 we will dispatch to a library call for large blocks.
8013 In pseudocode we do:
8017 Assume that SIZE is 4. Bigger sizes are handled analogously
8020 copy 4 bytes from SRCPTR to DESTPTR
8021 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
8026 copy 1 byte from SRCPTR to DESTPTR
8029 copy 2 bytes from SRCPTR to DESTPTR
8030 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
8035 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
8036 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
8038 OLD_DESPTR = DESTPTR;
8039 Align DESTPTR up to DESIRED_ALIGN
8040 SRCPTR += DESTPTR - OLD_DESTPTR
8041 COUNT -= DEST_PTR - OLD_DESTPTR
8043 Round COUNT down to multiple of SIZE
8044 << optional caller supplied zero size guard is here >>
8045 << optional caller supplied dynamic check is here >>
8046 << caller supplied main copy loop is here >>
8051 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem
, rtx srcmem
,
8052 rtx
*destptr
, rtx
*srcptr
,
8054 rtx value
, rtx vec_value
,
8056 rtx_code_label
**done_label
,
8060 unsigned HOST_WIDE_INT
*min_size
,
8064 rtx_code_label
*loop_label
= NULL
, *label
;
8067 int prolog_size
= 0;
8070 /* Chose proper value to copy. */
8071 if (issetmem
&& VECTOR_MODE_P (mode
))
8072 mode_value
= vec_value
;
8075 gcc_assert (GET_MODE_SIZE (mode
) <= size
);
8077 /* See if block is big or small, handle small blocks. */
8078 if (!CONST_INT_P (*count
) && *min_size
< (unsigned HOST_WIDE_INT
)size
)
8081 loop_label
= gen_label_rtx ();
8084 *done_label
= gen_label_rtx ();
8086 emit_cmp_and_jump_insns (*count
, GEN_INT (size2
), GE
, 0, GET_MODE (*count
),
8090 /* Handle sizes > 3. */
8091 for (;size2
> 2; size2
>>= 1)
8092 expand_small_cpymem_or_setmem (destmem
, srcmem
,
8096 size2
, *done_label
, issetmem
);
8097 /* Nothing to copy? Jump to DONE_LABEL if so */
8098 emit_cmp_and_jump_insns (*count
, const0_rtx
, EQ
, 0, GET_MODE (*count
),
8101 /* Do a byte copy. */
8102 destmem
= change_address (destmem
, QImode
, *destptr
);
8104 emit_move_insn (destmem
, gen_lowpart (QImode
, value
));
8107 srcmem
= change_address (srcmem
, QImode
, *srcptr
);
8108 emit_move_insn (destmem
, srcmem
);
8111 /* Handle sizes 2 and 3. */
8112 label
= ix86_expand_aligntest (*count
, 2, false);
8113 destmem
= change_address (destmem
, HImode
, *destptr
);
8114 destmem
= offset_address (destmem
, *count
, 1);
8115 destmem
= offset_address (destmem
, GEN_INT (-2), 2);
8117 emit_move_insn (destmem
, gen_lowpart (HImode
, value
));
8120 srcmem
= change_address (srcmem
, HImode
, *srcptr
);
8121 srcmem
= offset_address (srcmem
, *count
, 1);
8122 srcmem
= offset_address (srcmem
, GEN_INT (-2), 2);
8123 emit_move_insn (destmem
, srcmem
);
8127 LABEL_NUSES (label
) = 1;
8128 emit_jump_insn (gen_jump (*done_label
));
8132 gcc_assert (*min_size
>= (unsigned HOST_WIDE_INT
)size
8133 || UINTVAL (*count
) >= (unsigned HOST_WIDE_INT
)size
);
8135 /* Start memcpy for COUNT >= SIZE. */
8138 emit_label (loop_label
);
8139 LABEL_NUSES (loop_label
) = 1;
8142 /* Copy first desired_align bytes. */
8144 srcmem
= change_address (srcmem
, mode
, *srcptr
);
8145 destmem
= change_address (destmem
, mode
, *destptr
);
8146 modesize
= GEN_INT (GET_MODE_SIZE (mode
));
8147 for (n
= 0; prolog_size
< desired_align
- align
; n
++)
8150 emit_move_insn (destmem
, mode_value
);
8153 emit_move_insn (destmem
, srcmem
);
8154 srcmem
= offset_address (srcmem
, modesize
, GET_MODE_SIZE (mode
));
8156 destmem
= offset_address (destmem
, modesize
, GET_MODE_SIZE (mode
));
8157 prolog_size
+= GET_MODE_SIZE (mode
);
8161 /* Copy last SIZE bytes. */
8162 destmem
= offset_address (destmem
, *count
, 1);
8163 destmem
= offset_address (destmem
,
8164 GEN_INT (-size
- prolog_size
),
8167 emit_move_insn (destmem
, mode_value
);
8170 srcmem
= offset_address (srcmem
, *count
, 1);
8171 srcmem
= offset_address (srcmem
,
8172 GEN_INT (-size
- prolog_size
),
8174 emit_move_insn (destmem
, srcmem
);
8176 for (n
= 1; n
* GET_MODE_SIZE (mode
) < size
; n
++)
8178 destmem
= offset_address (destmem
, modesize
, 1);
8180 emit_move_insn (destmem
, mode_value
);
8183 srcmem
= offset_address (srcmem
, modesize
, 1);
8184 emit_move_insn (destmem
, srcmem
);
8188 /* Align destination. */
8189 if (desired_align
> 1 && desired_align
> align
)
8191 rtx saveddest
= *destptr
;
8193 gcc_assert (desired_align
<= size
);
8194 /* Align destptr up, place it to new register. */
8195 *destptr
= expand_simple_binop (GET_MODE (*destptr
), PLUS
, *destptr
,
8196 GEN_INT (prolog_size
),
8197 NULL_RTX
, 1, OPTAB_DIRECT
);
8198 if (REG_P (*destptr
) && REG_P (saveddest
) && REG_POINTER (saveddest
))
8199 REG_POINTER (*destptr
) = 1;
8200 *destptr
= expand_simple_binop (GET_MODE (*destptr
), AND
, *destptr
,
8201 GEN_INT (-desired_align
),
8202 *destptr
, 1, OPTAB_DIRECT
);
8203 /* See how many bytes we skipped. */
8204 saveddest
= expand_simple_binop (GET_MODE (*destptr
), MINUS
, saveddest
,
8206 saveddest
, 1, OPTAB_DIRECT
);
8207 /* Adjust srcptr and count. */
8209 *srcptr
= expand_simple_binop (GET_MODE (*srcptr
), MINUS
, *srcptr
,
8210 saveddest
, *srcptr
, 1, OPTAB_DIRECT
);
8211 *count
= expand_simple_binop (GET_MODE (*count
), PLUS
, *count
,
8212 saveddest
, *count
, 1, OPTAB_DIRECT
);
8213 /* We copied at most size + prolog_size. */
8214 if (*min_size
> (unsigned HOST_WIDE_INT
)(size
+ prolog_size
))
8216 = ROUND_DOWN (*min_size
- size
, (unsigned HOST_WIDE_INT
)size
);
8220 /* Our loops always round down the block size, but for dispatch to
8221 library we need precise value. */
8223 *count
= expand_simple_binop (GET_MODE (*count
), AND
, *count
,
8224 GEN_INT (-size
), *count
, 1, OPTAB_DIRECT
);
8228 gcc_assert (prolog_size
== 0);
8229 /* Decrease count, so we won't end up copying last word twice. */
8230 if (!CONST_INT_P (*count
))
8231 *count
= expand_simple_binop (GET_MODE (*count
), PLUS
, *count
,
8232 constm1_rtx
, *count
, 1, OPTAB_DIRECT
);
8234 *count
= GEN_INT (ROUND_DOWN (UINTVAL (*count
) - 1,
8235 (unsigned HOST_WIDE_INT
)size
));
8237 *min_size
= ROUND_DOWN (*min_size
- 1, (unsigned HOST_WIDE_INT
)size
);
8242 /* This function is like the previous one, except here we know how many bytes
8243 need to be copied. That allows us to update alignment not only of DST, which
8244 is returned, but also of SRC, which is passed as a pointer for that
8247 expand_set_or_cpymem_constant_prologue (rtx dst
, rtx
*srcp
, rtx destreg
,
8248 rtx srcreg
, rtx value
, rtx vec_value
,
8249 int desired_align
, int align_bytes
,
8254 rtx orig_src
= NULL
;
8256 int copied_bytes
= 0;
8260 gcc_assert (srcp
!= NULL
);
8265 for (piece_size
= 1;
8266 piece_size
<= desired_align
&& copied_bytes
< align_bytes
;
8269 if (align_bytes
& piece_size
)
8273 if (vec_value
&& piece_size
> GET_MODE_SIZE (GET_MODE (value
)))
8274 dst
= emit_memset (dst
, destreg
, vec_value
, piece_size
);
8276 dst
= emit_memset (dst
, destreg
, value
, piece_size
);
8279 dst
= emit_memmov (dst
, &src
, destreg
, srcreg
, piece_size
);
8280 copied_bytes
+= piece_size
;
8283 if (MEM_ALIGN (dst
) < (unsigned int) desired_align
* BITS_PER_UNIT
)
8284 set_mem_align (dst
, desired_align
* BITS_PER_UNIT
);
8285 if (MEM_SIZE_KNOWN_P (orig_dst
))
8286 set_mem_size (dst
, MEM_SIZE (orig_dst
) - align_bytes
);
8290 int src_align_bytes
= get_mem_align_offset (src
, desired_align
8292 if (src_align_bytes
>= 0)
8293 src_align_bytes
= desired_align
- src_align_bytes
;
8294 if (src_align_bytes
>= 0)
8296 unsigned int src_align
;
8297 for (src_align
= desired_align
; src_align
>= 2; src_align
>>= 1)
8299 if ((src_align_bytes
& (src_align
- 1))
8300 == (align_bytes
& (src_align
- 1)))
8303 if (src_align
> (unsigned int) desired_align
)
8304 src_align
= desired_align
;
8305 if (MEM_ALIGN (src
) < src_align
* BITS_PER_UNIT
)
8306 set_mem_align (src
, src_align
* BITS_PER_UNIT
);
8308 if (MEM_SIZE_KNOWN_P (orig_src
))
8309 set_mem_size (src
, MEM_SIZE (orig_src
) - align_bytes
);
8316 /* Return true if ALG can be used in current context.
8317 Assume we expand memset if MEMSET is true. */
8319 alg_usable_p (enum stringop_alg alg
, bool memset
, bool have_as
)
8321 if (alg
== no_stringop
)
8323 if (alg
== vector_loop
)
8324 return TARGET_SSE
|| TARGET_AVX
;
8325 /* Algorithms using the rep prefix want at least edi and ecx;
8326 additionally, memset wants eax and memcpy wants esi. Don't
8327 consider such algorithms if the user has appropriated those
8328 registers for their own purposes, or if we have a non-default
8329 address space, since some string insns cannot override the segment. */
8330 if (alg
== rep_prefix_1_byte
8331 || alg
== rep_prefix_4_byte
8332 || alg
== rep_prefix_8_byte
)
8336 if (fixed_regs
[CX_REG
]
8337 || fixed_regs
[DI_REG
]
8338 || (memset
? fixed_regs
[AX_REG
] : fixed_regs
[SI_REG
]))
8344 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
8345 static enum stringop_alg
8346 decide_alg (HOST_WIDE_INT count
, HOST_WIDE_INT expected_size
,
8347 unsigned HOST_WIDE_INT min_size
, unsigned HOST_WIDE_INT max_size
,
8348 bool memset
, bool zero_memset
, bool have_as
,
8349 int *dynamic_check
, bool *noalign
, bool recur
)
8351 const struct stringop_algs
*algs
;
8352 bool optimize_for_speed
;
8354 const struct processor_costs
*cost
;
8356 bool any_alg_usable_p
= false;
8359 *dynamic_check
= -1;
8361 /* Even if the string operation call is cold, we still might spend a lot
8362 of time processing large blocks. */
8363 if (optimize_function_for_size_p (cfun
)
8364 || (optimize_insn_for_size_p ()
8366 || (expected_size
!= -1 && expected_size
< 256))))
8367 optimize_for_speed
= false;
8369 optimize_for_speed
= true;
8371 cost
= optimize_for_speed
? ix86_cost
: &ix86_size_cost
;
8373 algs
= &cost
->memset
[TARGET_64BIT
!= 0];
8375 algs
= &cost
->memcpy
[TARGET_64BIT
!= 0];
8377 /* See maximal size for user defined algorithm. */
8378 for (i
= 0; i
< MAX_STRINGOP_ALGS
; i
++)
8380 enum stringop_alg candidate
= algs
->size
[i
].alg
;
8381 bool usable
= alg_usable_p (candidate
, memset
, have_as
);
8382 any_alg_usable_p
|= usable
;
8384 if (candidate
!= libcall
&& candidate
&& usable
)
8385 max
= algs
->size
[i
].max
;
8388 /* If expected size is not known but max size is small enough
8389 so inline version is a win, set expected size into
8391 if (((max
> 1 && (unsigned HOST_WIDE_INT
) max
>= max_size
) || max
== -1)
8392 && expected_size
== -1)
8393 expected_size
= min_size
/ 2 + max_size
/ 2;
8395 /* If user specified the algorithm, honor it if possible. */
8396 if (ix86_stringop_alg
!= no_stringop
8397 && alg_usable_p (ix86_stringop_alg
, memset
, have_as
))
8398 return ix86_stringop_alg
;
8399 /* rep; movq or rep; movl is the smallest variant. */
8400 else if (!optimize_for_speed
)
8403 if (!count
|| (count
& 3) || (memset
&& !zero_memset
))
8404 return alg_usable_p (rep_prefix_1_byte
, memset
, have_as
)
8405 ? rep_prefix_1_byte
: loop_1_byte
;
8407 return alg_usable_p (rep_prefix_4_byte
, memset
, have_as
)
8408 ? rep_prefix_4_byte
: loop
;
8410 /* Very tiny blocks are best handled via the loop, REP is expensive to
8412 else if (expected_size
!= -1 && expected_size
< 4)
8414 else if (expected_size
!= -1)
8416 enum stringop_alg alg
= libcall
;
8417 bool alg_noalign
= false;
8418 for (i
= 0; i
< MAX_STRINGOP_ALGS
; i
++)
8420 /* We get here if the algorithms that were not libcall-based
8421 were rep-prefix based and we are unable to use rep prefixes
8422 based on global register usage. Break out of the loop and
8423 use the heuristic below. */
8424 if (algs
->size
[i
].max
== 0)
8426 if (algs
->size
[i
].max
>= expected_size
|| algs
->size
[i
].max
== -1)
8428 enum stringop_alg candidate
= algs
->size
[i
].alg
;
8430 if (candidate
!= libcall
8431 && alg_usable_p (candidate
, memset
, have_as
))
8434 alg_noalign
= algs
->size
[i
].noalign
;
8436 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
8437 last non-libcall inline algorithm. */
8438 if (TARGET_INLINE_ALL_STRINGOPS
)
8440 /* When the current size is best to be copied by a libcall,
8441 but we are still forced to inline, run the heuristic below
8442 that will pick code for medium sized blocks. */
8445 *noalign
= alg_noalign
;
8448 else if (!any_alg_usable_p
)
8451 else if (alg_usable_p (candidate
, memset
, have_as
)
8452 && !(TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
8453 && candidate
== rep_prefix_1_byte
8454 /* NB: If min_size != max_size, size is
8456 && min_size
!= max_size
))
8458 *noalign
= algs
->size
[i
].noalign
;
8464 /* When asked to inline the call anyway, try to pick meaningful choice.
8465 We look for maximal size of block that is faster to copy by hand and
8466 take blocks of at most of that size guessing that average size will
8467 be roughly half of the block.
8469 If this turns out to be bad, we might simply specify the preferred
8470 choice in ix86_costs. */
8471 if ((TARGET_INLINE_ALL_STRINGOPS
|| TARGET_INLINE_STRINGOPS_DYNAMICALLY
)
8472 && (algs
->unknown_size
== libcall
8473 || !alg_usable_p (algs
->unknown_size
, memset
, have_as
)))
8475 enum stringop_alg alg
;
8476 HOST_WIDE_INT new_expected_size
= (max
> 0 ? max
: 4096) / 2;
8478 /* If there aren't any usable algorithms or if recursing already,
8479 then recursing on smaller sizes or same size isn't going to
8480 find anything. Just return the simple byte-at-a-time copy loop. */
8481 if (!any_alg_usable_p
|| recur
)
8483 /* Pick something reasonable. */
8484 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY
&& !recur
)
8485 *dynamic_check
= 128;
8488 alg
= decide_alg (count
, new_expected_size
, min_size
, max_size
, memset
,
8489 zero_memset
, have_as
, dynamic_check
, noalign
, true);
8490 gcc_assert (*dynamic_check
== -1);
8491 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY
)
8492 *dynamic_check
= max
;
8494 gcc_assert (alg
!= libcall
);
8497 return (alg_usable_p (algs
->unknown_size
, memset
, have_as
)
8498 ? algs
->unknown_size
: libcall
);
8501 /* Decide on alignment. We know that the operand is already aligned to ALIGN
8502 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
8504 decide_alignment (int align
,
8505 enum stringop_alg alg
,
8507 machine_mode move_mode
)
8509 int desired_align
= 0;
8511 gcc_assert (alg
!= no_stringop
);
8515 if (move_mode
== VOIDmode
)
8518 desired_align
= GET_MODE_SIZE (move_mode
);
8519 /* PentiumPro has special logic triggering for 8 byte aligned blocks.
8520 copying whole cacheline at once. */
8521 if (TARGET_CPU_P (PENTIUMPRO
)
8522 && (alg
== rep_prefix_4_byte
|| alg
== rep_prefix_1_byte
))
8527 if (desired_align
< align
)
8528 desired_align
= align
;
8529 if (expected_size
!= -1 && expected_size
< 4)
8530 desired_align
= align
;
8532 return desired_align
;
8536 /* Helper function for memcpy. For QImode value 0xXY produce
8537 0xXYXYXYXY of wide specified by MODE. This is essentially
8538 a * 0x10101010, but we can do slightly better than
8539 synth_mult by unwinding the sequence by hand on CPUs with
8542 promote_duplicated_reg (machine_mode mode
, rtx val
)
8544 machine_mode valmode
= GET_MODE (val
);
8546 int nops
= mode
== DImode
? 3 : 2;
8548 gcc_assert (mode
== SImode
|| mode
== DImode
|| val
== const0_rtx
);
8549 if (val
== const0_rtx
)
8550 return copy_to_mode_reg (mode
, CONST0_RTX (mode
));
8551 if (CONST_INT_P (val
))
8553 HOST_WIDE_INT v
= INTVAL (val
) & 255;
8558 v
|= (v
<< 16) << 16;
8559 return copy_to_mode_reg (mode
, gen_int_mode (v
, mode
));
8562 if (valmode
== VOIDmode
)
8564 if (valmode
!= QImode
)
8565 val
= gen_lowpart (QImode
, val
);
8568 if (!TARGET_PARTIAL_REG_STALL
)
8570 if (ix86_cost
->mult_init
[mode
== DImode
? 3 : 2]
8571 + ix86_cost
->mult_bit
* (mode
== DImode
? 8 : 4)
8572 <= (ix86_cost
->shift_const
+ ix86_cost
->add
) * nops
8573 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL
== 0)))
8575 rtx reg
= convert_modes (mode
, QImode
, val
, true);
8576 tmp
= promote_duplicated_reg (mode
, const1_rtx
);
8577 return expand_simple_binop (mode
, MULT
, reg
, tmp
, NULL
, 1,
8582 rtx reg
= convert_modes (mode
, QImode
, val
, true);
8584 if (!TARGET_PARTIAL_REG_STALL
)
8585 emit_insn (gen_insv_1 (mode
, reg
, reg
));
8588 tmp
= expand_simple_binop (mode
, ASHIFT
, reg
, GEN_INT (8),
8589 NULL
, 1, OPTAB_DIRECT
);
8590 reg
= expand_simple_binop (mode
, IOR
, reg
, tmp
, reg
, 1,
8593 tmp
= expand_simple_binop (mode
, ASHIFT
, reg
, GEN_INT (16),
8594 NULL
, 1, OPTAB_DIRECT
);
8595 reg
= expand_simple_binop (mode
, IOR
, reg
, tmp
, reg
, 1, OPTAB_DIRECT
);
8598 tmp
= expand_simple_binop (mode
, ASHIFT
, reg
, GEN_INT (32),
8599 NULL
, 1, OPTAB_DIRECT
);
8600 reg
= expand_simple_binop (mode
, IOR
, reg
, tmp
, reg
, 1, OPTAB_DIRECT
);
8605 /* Duplicate value VAL using promote_duplicated_reg into maximal size that will
8606 be needed by main loop copying SIZE_NEEDED chunks and prologue getting
8607 alignment from ALIGN to DESIRED_ALIGN. */
8609 promote_duplicated_reg_to_size (rtx val
, int size_needed
, int desired_align
,
8615 && (size_needed
> 4 || (desired_align
> align
&& desired_align
> 4)))
8616 promoted_val
= promote_duplicated_reg (DImode
, val
);
8617 else if (size_needed
> 2 || (desired_align
> align
&& desired_align
> 2))
8618 promoted_val
= promote_duplicated_reg (SImode
, val
);
8619 else if (size_needed
> 1 || (desired_align
> align
&& desired_align
> 1))
8620 promoted_val
= promote_duplicated_reg (HImode
, val
);
8624 return promoted_val
;
8627 /* Copy the address to a Pmode register. This is used for x32 to
8628 truncate DImode TLS address to a SImode register. */
8631 ix86_copy_addr_to_reg (rtx addr
)
8634 if (GET_MODE (addr
) == Pmode
|| GET_MODE (addr
) == VOIDmode
)
8636 reg
= copy_addr_to_reg (addr
);
8637 REG_POINTER (reg
) = 1;
8642 gcc_assert (GET_MODE (addr
) == DImode
&& Pmode
== SImode
);
8643 reg
= copy_to_mode_reg (DImode
, addr
);
8644 REG_POINTER (reg
) = 1;
8645 return gen_rtx_SUBREG (SImode
, reg
, 0);
8649 /* Expand string move (memcpy) ot store (memset) operation. Use i386 string
8650 operations when profitable. The code depends upon architecture, block size
8651 and alignment, but always has one of the following overall structures:
8653 Aligned move sequence:
8655 1) Prologue guard: Conditional that jumps up to epilogues for small
8656 blocks that can be handled by epilogue alone. This is faster
8657 but also needed for correctness, since prologue assume the block
8658 is larger than the desired alignment.
8660 Optional dynamic check for size and libcall for large
8661 blocks is emitted here too, with -minline-stringops-dynamically.
8663 2) Prologue: copy first few bytes in order to get destination
8664 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
8665 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
8666 copied. We emit either a jump tree on power of two sized
8667 blocks, or a byte loop.
8669 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
8670 with specified algorithm.
8672 4) Epilogue: code copying tail of the block that is too small to be
8673 handled by main body (or up to size guarded by prologue guard).
8675 Misaligned move sequence
8677 1) missaligned move prologue/epilogue containing:
8678 a) Prologue handling small memory blocks and jumping to done_label
8679 (skipped if blocks are known to be large enough)
8680 b) Signle move copying first DESIRED_ALIGN-ALIGN bytes if alignment is
8681 needed by single possibly misaligned move
8682 (skipped if alignment is not needed)
8683 c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
8685 2) Zero size guard dispatching to done_label, if needed
8687 3) dispatch to library call, if needed,
8689 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
8690 with specified algorithm. */
8692 ix86_expand_set_or_cpymem (rtx dst
, rtx src
, rtx count_exp
, rtx val_exp
,
8693 rtx align_exp
, rtx expected_align_exp
,
8694 rtx expected_size_exp
, rtx min_size_exp
,
8695 rtx max_size_exp
, rtx probable_max_size_exp
,
8700 rtx_code_label
*label
= NULL
;
8702 rtx_code_label
*jump_around_label
= NULL
;
8703 HOST_WIDE_INT align
= 1;
8704 unsigned HOST_WIDE_INT count
= 0;
8705 HOST_WIDE_INT expected_size
= -1;
8706 int size_needed
= 0, epilogue_size_needed
;
8707 int desired_align
= 0, align_bytes
= 0;
8708 enum stringop_alg alg
;
8709 rtx promoted_val
= NULL
;
8710 rtx vec_promoted_val
= NULL
;
8711 bool force_loopy_epilogue
= false;
8713 bool need_zero_guard
= false;
8715 machine_mode move_mode
= VOIDmode
;
8716 machine_mode wider_mode
;
8717 int unroll_factor
= 1;
8718 /* TODO: Once value ranges are available, fill in proper data. */
8719 unsigned HOST_WIDE_INT min_size
= 0;
8720 unsigned HOST_WIDE_INT max_size
= -1;
8721 unsigned HOST_WIDE_INT probable_max_size
= -1;
8722 bool misaligned_prologue_used
= false;
8725 if (CONST_INT_P (align_exp
))
8726 align
= INTVAL (align_exp
);
8727 /* i386 can do misaligned access on reasonably increased cost. */
8728 if (CONST_INT_P (expected_align_exp
)
8729 && INTVAL (expected_align_exp
) > align
)
8730 align
= INTVAL (expected_align_exp
);
8731 /* ALIGN is the minimum of destination and source alignment, but we care here
8732 just about destination alignment. */
8734 && MEM_ALIGN (dst
) > (unsigned HOST_WIDE_INT
) align
* BITS_PER_UNIT
)
8735 align
= MEM_ALIGN (dst
) / BITS_PER_UNIT
;
8737 if (CONST_INT_P (count_exp
))
8739 min_size
= max_size
= probable_max_size
= count
= expected_size
8740 = INTVAL (count_exp
);
8741 /* When COUNT is 0, there is nothing to do. */
8748 min_size
= INTVAL (min_size_exp
);
8750 max_size
= INTVAL (max_size_exp
);
8751 if (probable_max_size_exp
)
8752 probable_max_size
= INTVAL (probable_max_size_exp
);
8753 if (CONST_INT_P (expected_size_exp
))
8754 expected_size
= INTVAL (expected_size_exp
);
8757 /* Make sure we don't need to care about overflow later on. */
8758 if (count
> (HOST_WIDE_INT_1U
<< 30))
8761 have_as
= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst
));
8763 have_as
|= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src
));
8765 /* Step 0: Decide on preferred algorithm, desired alignment and
8766 size of chunks to be copied by main loop. */
8767 alg
= decide_alg (count
, expected_size
, min_size
, probable_max_size
,
8769 issetmem
&& val_exp
== const0_rtx
, have_as
,
8770 &dynamic_check
, &noalign
, false);
8773 fprintf (dump_file
, "Selected stringop expansion strategy: %s\n",
8774 stringop_alg_names
[alg
]);
8778 gcc_assert (alg
!= no_stringop
);
8780 /* For now vector-version of memset is generated only for memory zeroing, as
8781 creating of promoted vector value is very cheap in this case. */
8782 if (issetmem
&& alg
== vector_loop
&& val_exp
!= const0_rtx
)
8783 alg
= unrolled_loop
;
8786 count_exp
= copy_to_mode_reg (GET_MODE (count_exp
), count_exp
);
8787 destreg
= ix86_copy_addr_to_reg (XEXP (dst
, 0));
8789 srcreg
= ix86_copy_addr_to_reg (XEXP (src
, 0));
8792 move_mode
= word_mode
;
8800 need_zero_guard
= true;
8804 need_zero_guard
= true;
8807 need_zero_guard
= true;
8808 unroll_factor
= (TARGET_64BIT
? 4 : 2);
8811 need_zero_guard
= true;
8813 /* Find the widest supported mode. */
8814 move_mode
= word_mode
;
8815 while (GET_MODE_WIDER_MODE (move_mode
).exists (&wider_mode
)
8816 && optab_handler (mov_optab
, wider_mode
) != CODE_FOR_nothing
)
8817 move_mode
= wider_mode
;
8819 if (TARGET_AVX256_SPLIT_REGS
&& GET_MODE_BITSIZE (move_mode
) > 128)
8821 if (TARGET_AVX512_SPLIT_REGS
&& GET_MODE_BITSIZE (move_mode
) > 256)
8824 /* Find the corresponding vector mode with the same size as MOVE_MODE.
8825 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
8826 if (GET_MODE_SIZE (move_mode
) > GET_MODE_SIZE (word_mode
))
8828 int nunits
= GET_MODE_SIZE (move_mode
) / GET_MODE_SIZE (word_mode
);
8829 if (!mode_for_vector (word_mode
, nunits
).exists (&move_mode
)
8830 || optab_handler (mov_optab
, move_mode
) == CODE_FOR_nothing
)
8831 move_mode
= word_mode
;
8833 gcc_assert (optab_handler (mov_optab
, move_mode
) != CODE_FOR_nothing
);
8835 case rep_prefix_8_byte
:
8838 case rep_prefix_4_byte
:
8841 case rep_prefix_1_byte
:
8845 size_needed
= GET_MODE_SIZE (move_mode
) * unroll_factor
;
8846 epilogue_size_needed
= size_needed
;
8848 /* If we are going to call any library calls conditionally, make sure any
8849 pending stack adjustment happen before the first conditional branch,
8850 otherwise they will be emitted before the library call only and won't
8851 happen from the other branches. */
8852 if (dynamic_check
!= -1)
8853 do_pending_stack_adjust ();
8855 desired_align
= decide_alignment (align
, alg
, expected_size
, move_mode
);
8856 if (!TARGET_ALIGN_STRINGOPS
|| noalign
)
8857 align
= desired_align
;
8859 /* Step 1: Prologue guard. */
8861 /* Alignment code needs count to be in register. */
8862 if (CONST_INT_P (count_exp
) && desired_align
> align
)
8864 if (INTVAL (count_exp
) > desired_align
8865 && INTVAL (count_exp
) > size_needed
)
8868 = get_mem_align_offset (dst
, desired_align
* BITS_PER_UNIT
);
8869 if (align_bytes
<= 0)
8872 align_bytes
= desired_align
- align_bytes
;
8874 if (align_bytes
== 0)
8875 count_exp
= force_reg (counter_mode (count_exp
), count_exp
);
8877 gcc_assert (desired_align
>= 1 && align
>= 1);
8879 /* Misaligned move sequences handle both prologue and epilogue at once.
8880 Default code generation results in a smaller code for large alignments
8881 and also avoids redundant job when sizes are known precisely. */
8882 misaligned_prologue_used
8883 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
8884 && MAX (desired_align
, epilogue_size_needed
) <= 32
8885 && desired_align
<= epilogue_size_needed
8886 && ((desired_align
> align
&& !align_bytes
)
8887 || (!count
&& epilogue_size_needed
> 1)));
8889 /* Do the cheap promotion to allow better CSE across the
8890 main loop and epilogue (ie one load of the big constant in the
8892 For now the misaligned move sequences do not have fast path
8893 without broadcasting. */
8894 if (issetmem
&& ((CONST_INT_P (val_exp
) || misaligned_prologue_used
)))
8896 if (alg
== vector_loop
)
8898 gcc_assert (val_exp
== const0_rtx
);
8899 vec_promoted_val
= promote_duplicated_reg (move_mode
, val_exp
);
8900 promoted_val
= promote_duplicated_reg_to_size (val_exp
,
8901 GET_MODE_SIZE (word_mode
),
8902 desired_align
, align
);
8906 promoted_val
= promote_duplicated_reg_to_size (val_exp
, size_needed
,
8907 desired_align
, align
);
8910 /* Misaligned move sequences handles both prologues and epilogues at once.
8911 Default code generation results in smaller code for large alignments and
8912 also avoids redundant job when sizes are known precisely. */
8913 if (misaligned_prologue_used
)
8915 /* Misaligned move prologue handled small blocks by itself. */
8916 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves
8917 (dst
, src
, &destreg
, &srcreg
,
8918 move_mode
, promoted_val
, vec_promoted_val
,
8921 desired_align
< align
8922 ? MAX (desired_align
, epilogue_size_needed
) : epilogue_size_needed
,
8923 desired_align
, align
, &min_size
, dynamic_check
, issetmem
);
8925 src
= change_address (src
, BLKmode
, srcreg
);
8926 dst
= change_address (dst
, BLKmode
, destreg
);
8927 set_mem_align (dst
, desired_align
* BITS_PER_UNIT
);
8928 epilogue_size_needed
= 0;
8930 && min_size
< (unsigned HOST_WIDE_INT
) size_needed
)
8932 /* It is possible that we copied enough so the main loop will not
8934 gcc_assert (size_needed
> 1);
8935 if (jump_around_label
== NULL_RTX
)
8936 jump_around_label
= gen_label_rtx ();
8937 emit_cmp_and_jump_insns (count_exp
,
8938 GEN_INT (size_needed
),
8939 LTU
, 0, counter_mode (count_exp
), 1, jump_around_label
);
8940 if (expected_size
== -1
8941 || expected_size
< (desired_align
- align
) / 2 + size_needed
)
8942 predict_jump (REG_BR_PROB_BASE
* 20 / 100);
8944 predict_jump (REG_BR_PROB_BASE
* 60 / 100);
8947 /* Ensure that alignment prologue won't copy past end of block. */
8948 else if (size_needed
> 1 || (desired_align
> 1 && desired_align
> align
))
8950 epilogue_size_needed
= MAX (size_needed
- 1, desired_align
- align
);
8951 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
8952 Make sure it is power of 2. */
8953 epilogue_size_needed
= 1 << (floor_log2 (epilogue_size_needed
) + 1);
8955 /* To improve performance of small blocks, we jump around the VAL
8956 promoting mode. This mean that if the promoted VAL is not constant,
8957 we might not use it in the epilogue and have to use byte
8959 if (issetmem
&& epilogue_size_needed
> 2 && !promoted_val
)
8960 force_loopy_epilogue
= true;
8961 if ((count
&& count
< (unsigned HOST_WIDE_INT
) epilogue_size_needed
)
8962 || max_size
< (unsigned HOST_WIDE_INT
) epilogue_size_needed
)
8964 /* If main algorithm works on QImode, no epilogue is needed.
8965 For small sizes just don't align anything. */
8966 if (size_needed
== 1)
8967 desired_align
= align
;
8972 && min_size
< (unsigned HOST_WIDE_INT
) epilogue_size_needed
)
8974 label
= gen_label_rtx ();
8975 emit_cmp_and_jump_insns (count_exp
,
8976 GEN_INT (epilogue_size_needed
),
8977 LTU
, 0, counter_mode (count_exp
), 1, label
);
8978 if (expected_size
== -1 || expected_size
< epilogue_size_needed
)
8979 predict_jump (REG_BR_PROB_BASE
* 60 / 100);
8981 predict_jump (REG_BR_PROB_BASE
* 20 / 100);
8985 /* Emit code to decide on runtime whether library call or inline should be
8987 if (dynamic_check
!= -1)
8989 if (!issetmem
&& CONST_INT_P (count_exp
))
8991 if (UINTVAL (count_exp
) >= (unsigned HOST_WIDE_INT
)dynamic_check
)
8993 emit_block_copy_via_libcall (dst
, src
, count_exp
);
8994 count_exp
= const0_rtx
;
9000 rtx_code_label
*hot_label
= gen_label_rtx ();
9001 if (jump_around_label
== NULL_RTX
)
9002 jump_around_label
= gen_label_rtx ();
9003 emit_cmp_and_jump_insns (count_exp
, GEN_INT (dynamic_check
- 1),
9004 LEU
, 0, counter_mode (count_exp
),
9006 predict_jump (REG_BR_PROB_BASE
* 90 / 100);
9008 set_storage_via_libcall (dst
, count_exp
, val_exp
);
9010 emit_block_copy_via_libcall (dst
, src
, count_exp
);
9011 emit_jump (jump_around_label
);
9012 emit_label (hot_label
);
9016 /* Step 2: Alignment prologue. */
9017 /* Do the expensive promotion once we branched off the small blocks. */
9018 if (issetmem
&& !promoted_val
)
9019 promoted_val
= promote_duplicated_reg_to_size (val_exp
, size_needed
,
9020 desired_align
, align
);
9022 if (desired_align
> align
&& !misaligned_prologue_used
)
9024 if (align_bytes
== 0)
9026 /* Except for the first move in prologue, we no longer know
9027 constant offset in aliasing info. It don't seems to worth
9028 the pain to maintain it for the first move, so throw away
9030 dst
= change_address (dst
, BLKmode
, destreg
);
9032 src
= change_address (src
, BLKmode
, srcreg
);
9033 dst
= expand_set_or_cpymem_prologue (dst
, src
, destreg
, srcreg
,
9034 promoted_val
, vec_promoted_val
,
9035 count_exp
, align
, desired_align
,
9037 /* At most desired_align - align bytes are copied. */
9038 if (min_size
< (unsigned)(desired_align
- align
))
9041 min_size
-= desired_align
- align
;
9045 /* If we know how many bytes need to be stored before dst is
9046 sufficiently aligned, maintain aliasing info accurately. */
9047 dst
= expand_set_or_cpymem_constant_prologue (dst
, &src
, destreg
,
9055 count_exp
= plus_constant (counter_mode (count_exp
),
9056 count_exp
, -align_bytes
);
9057 count
-= align_bytes
;
9058 min_size
-= align_bytes
;
9059 max_size
-= align_bytes
;
9062 && min_size
< (unsigned HOST_WIDE_INT
) size_needed
9063 && (count
< (unsigned HOST_WIDE_INT
) size_needed
9064 || (align_bytes
== 0
9065 && count
< ((unsigned HOST_WIDE_INT
) size_needed
9066 + desired_align
- align
))))
9068 /* It is possible that we copied enough so the main loop will not
9070 gcc_assert (size_needed
> 1);
9071 if (label
== NULL_RTX
)
9072 label
= gen_label_rtx ();
9073 emit_cmp_and_jump_insns (count_exp
,
9074 GEN_INT (size_needed
),
9075 LTU
, 0, counter_mode (count_exp
), 1, label
);
9076 if (expected_size
== -1
9077 || expected_size
< (desired_align
- align
) / 2 + size_needed
)
9078 predict_jump (REG_BR_PROB_BASE
* 20 / 100);
9080 predict_jump (REG_BR_PROB_BASE
* 60 / 100);
9083 if (label
&& size_needed
== 1)
9086 LABEL_NUSES (label
) = 1;
9088 epilogue_size_needed
= 1;
9090 promoted_val
= val_exp
;
9092 else if (label
== NULL_RTX
&& !misaligned_prologue_used
)
9093 epilogue_size_needed
= size_needed
;

  /* Step 3: Main loop.  */

  switch (alg)
    {
    case libcall:
    case no_stringop:
    case last_alg:
      gcc_unreachable ();
    case loop_1_byte:
    case loop:
    case unrolled_loop:
      expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg, promoted_val,
				     count_exp, move_mode, unroll_factor,
				     expected_size, issetmem);
      break;
    case vector_loop:
      expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg,
				     vec_promoted_val, count_exp, move_mode,
				     unroll_factor, expected_size, issetmem);
      break;
    case rep_prefix_8_byte:
    case rep_prefix_4_byte:
    case rep_prefix_1_byte:
      expand_set_or_cpymem_via_rep (dst, src, destreg, srcreg, promoted_val,
				    val_exp, count_exp, move_mode, issetmem);
      break;
    }

  /* Properly adjust the offset of src and dest memory for aliasing.  */
  if (CONST_INT_P (count_exp))
    {
      if (!issetmem)
	src = adjust_automodify_address_nv (src, BLKmode, srcreg,
					    (count / size_needed)
					    * size_needed);
      dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
					  (count / size_needed)
					  * size_needed);
    }
  else
    {
      if (!issetmem)
	src = change_address (src, BLKmode, srcreg);
      dst = change_address (dst, BLKmode, destreg);
    }

  /* Step 4: Epilogue to copy the remaining bytes.  */
 epilogue:
  if (label)
    {
      /* When the main loop is done, COUNT_EXP might hold the original count,
	 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
	 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
	 bytes.  Compensate if needed.  */

      if (size_needed < epilogue_size_needed)
	{
	  tmp = expand_simple_binop (counter_mode (count_exp), AND, count_exp,
				     GEN_INT (size_needed - 1), count_exp, 1,
				     OPTAB_DIRECT);
	  if (tmp != count_exp)
	    emit_move_insn (count_exp, tmp);
	}
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }

  if (count_exp != const0_rtx && epilogue_size_needed > 1)
    {
      if (force_loopy_epilogue)
	expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
					 epilogue_size_needed);
      else if (issetmem)
	expand_setmem_epilogue (dst, destreg, promoted_val,
				vec_promoted_val, count_exp,
				epilogue_size_needed);
      else
	expand_cpymem_epilogue (dst, src, destreg, srcreg, count_exp,
				epilogue_size_needed);
    }
  if (jump_around_label)
    emit_label (jump_around_label);
  return true;
}

/* Expand cmpstrn or memcmp.  */

bool
ix86_expand_cmpstrn_or_cmpmem (rtx result, rtx src1, rtx src2,
			       rtx length, rtx align, bool is_cmpstrn)
{
  /* Expand strncmp and memcmp only with -minline-all-stringops since
     "repz cmpsb" can be much slower than strncmp and memcmp functions
     implemented with vector instructions, see

     https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43052
   */
  if (!TARGET_INLINE_ALL_STRINGOPS)
    return false;

  /* Can't use this if the user has appropriated ecx, esi or edi.  */
  if (fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG])
    return false;

  if (is_cmpstrn)
    {
      /* For strncmp, length is the maximum length, which can be larger
	 than actual string lengths.  We can expand the cmpstrn pattern
	 to "repz cmpsb" only if one of the strings is a constant so
	 that expand_builtin_strncmp() can write the length argument to
	 be the minimum of the const string length and the actual length
	 argument.  Otherwise, "repz cmpsb" may pass the 0 byte.  */
      tree t1 = MEM_EXPR (src1);
      tree t2 = MEM_EXPR (src2);
      if (!((t1 && TREE_CODE (t1) == MEM_REF
	     && TREE_CODE (TREE_OPERAND (t1, 0)) == ADDR_EXPR
	     && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t1, 0), 0))
		 == STRING_CST))
	    || (t2 && TREE_CODE (t2) == MEM_REF
		&& TREE_CODE (TREE_OPERAND (t2, 0)) == ADDR_EXPR
		&& (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t2, 0), 0))
		    == STRING_CST))))
	return false;
    }

  rtx addr1 = copy_addr_to_reg (XEXP (src1, 0));
  rtx addr2 = copy_addr_to_reg (XEXP (src2, 0));
  if (addr1 != XEXP (src1, 0))
    src1 = replace_equiv_address_nv (src1, addr1);
  if (addr2 != XEXP (src2, 0))
    src2 = replace_equiv_address_nv (src2, addr2);

  /* NB: Make a copy of the data length to avoid changing the original
     data length by cmpstrnqi patterns.  */
  length = ix86_zero_extend_to_Pmode (length);
  rtx lengthreg = gen_reg_rtx (Pmode);
  emit_move_insn (lengthreg, length);

  /* If we are testing strict equality, we can use known alignment to
     good advantage.  This may be possible with combine, particularly
     once cc0 is dead.  */
  if (CONST_INT_P (length))
    {
      if (length == const0_rtx)
	{
	  emit_move_insn (result, const0_rtx);
	  return true;
	}
      emit_insn (gen_cmpstrnqi_nz_1 (addr1, addr2, lengthreg, align,
				     src1, src2));
    }
  else
    {
      emit_insn (gen_cmp_1 (Pmode, lengthreg, lengthreg));
      emit_insn (gen_cmpstrnqi_1 (addr1, addr2, lengthreg, align,
				  src1, src2));
    }

  rtx out = gen_lowpart (QImode, result);
  emit_insn (gen_cmpintqi (out));
  emit_move_insn (result, gen_rtx_SIGN_EXTEND (SImode, out));

  return true;
}
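
/* Example of the hazard the STRING_CST check above guards against: for
     strncmp (a, b, 100)
   with neither string constant, "repz cmpsb" could scan past the
   terminating NUL of both strings.  With a constant operand such as
     strncmp (a, "hi", 100)
   expand_builtin_strncmp() rewrites the length to the minimum of the
   constant string length and the original bound, so the scan stops at
   the NUL.  */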

/* Expand the appropriate insns for doing strlen if not just doing
   repnz; scasb

   out = result, initialized with the start address
   align_rtx = alignment of the address.
   scratch = scratch register, initialized with the start address when
	not aligned, otherwise undefined

   This is just the body.  It needs the initializations mentioned above and
   some address computing at the end.  These things are done in i386.md.  */

static void
ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
{
  int align;
  rtx tmp;
  rtx_code_label *align_2_label = NULL;
  rtx_code_label *align_3_label = NULL;
  rtx_code_label *align_4_label = gen_label_rtx ();
  rtx_code_label *end_0_label = gen_label_rtx ();
  rtx mem;
  rtx tmpreg = gen_reg_rtx (SImode);
  rtx scratch = gen_reg_rtx (SImode);
  rtx cmp;

  align = 0;
  if (CONST_INT_P (align_rtx))
    align = INTVAL (align_rtx);

  /* Loop to check 1..3 bytes for null to get an aligned pointer.  */

  /* Is there a known alignment and is it less than 4?  */
  if (align < 4)
    {
      rtx scratch1 = gen_reg_rtx (Pmode);
      emit_move_insn (scratch1, out);
      /* Is there a known alignment and is it not 2? */
      if (align != 2)
	{
	  align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
	  align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */

	  /* Leave just the 3 lower bits.  */
	  align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
				    NULL_RTX, 0, OPTAB_WIDEN);

	  emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
				   Pmode, 1, align_4_label);
	  emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
				   Pmode, 1, align_2_label);
	  emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
				   Pmode, 1, align_3_label);
	}
      else
	{
	  /* Since the alignment is 2, we have to check 2 or 0 bytes;
	     check whether we are already aligned to a 4-byte boundary.  */

	  align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
				    NULL_RTX, 0, OPTAB_WIDEN);

	  emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
				   Pmode, 1, align_4_label);
	}

      mem = change_address (src, QImode, out);

      /* Now compare the bytes.  */

      /* Compare the first n unaligned bytes on a byte-per-byte basis.  */
      emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
			       QImode, 1, end_0_label);

      /* Increment the address.  */
      emit_insn (gen_add2_insn (out, const1_rtx));

      /* Not needed with an alignment of 2.  */
      if (align != 2)
	{
	  emit_label (align_2_label);

	  emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
				   end_0_label);

	  emit_insn (gen_add2_insn (out, const1_rtx));

	  emit_label (align_3_label);
	}

      emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
			       end_0_label);

      emit_insn (gen_add2_insn (out, const1_rtx));
    }

  /* Generate a loop to check 4 bytes at a time.  It is not a good idea to
     align this loop: that only enlarges programs without helping
     speed.  */
  emit_label (align_4_label);

  mem = change_address (src, SImode, out);
  emit_move_insn (scratch, mem);
  emit_insn (gen_add2_insn (out, GEN_INT (4)));

  /* This formula yields a nonzero result iff one of the bytes is zero.
     This saves three branches inside the loop and many cycles.  */

  emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
  emit_insn (gen_one_cmplsi2 (scratch, scratch));
  emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
  emit_insn (gen_andsi3 (tmpreg, tmpreg,
			 gen_int_mode (0x80808080, SImode)));
  emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
			   align_4_label);
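
/* A standalone illustration of the zero-byte test emitted above; this
   helper is a documentation sketch only and is not used by the expander.
   For a 32-bit word X, (X - 0x01010101) & ~X & 0x80808080 is nonzero iff
   some byte of X is zero: subtracting 1 from each byte borrows into bit 7
   only when the byte was 0, and ~X masks out bytes whose bit 7 was
   already set.  */

static inline bool ATTRIBUTE_UNUSED
word_has_zero_byte_p (unsigned int x)
{
  /* E.g. x = 0x41420043 ('A','B','\0','C') yields 0x00008000, nonzero.  */
  return ((x - 0x01010101U) & ~x & 0x80808080U) != 0;
}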

  if (TARGET_CMOVE)
    {
      rtx reg = gen_reg_rtx (SImode);
      rtx reg2 = gen_reg_rtx (Pmode);
      emit_move_insn (reg, tmpreg);
      emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));

      /* If zero is not in the first two bytes, move two bytes forward.  */
      emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
      tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
      tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
      emit_insn (gen_rtx_SET (tmpreg,
			      gen_rtx_IF_THEN_ELSE (SImode, tmp,
						    reg, tmpreg)));
      /* Emit lea manually to avoid clobbering of flags.  */
      emit_insn (gen_rtx_SET (reg2, plus_constant (Pmode, out, 2)));

      tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
      tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
      emit_insn (gen_rtx_SET (out,
			      gen_rtx_IF_THEN_ELSE (Pmode, tmp,
						    reg2, out)));
    }
  else
    {
      rtx_code_label *end_2_label = gen_label_rtx ();
      /* Is zero in the first two bytes?  */

      emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
      tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
      tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
				  gen_rtx_LABEL_REF (VOIDmode, end_2_label),
				  pc_rtx);
      tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      JUMP_LABEL (tmp) = end_2_label;

      /* Not in the first two.  Move two bytes forward.  */
      emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
      emit_insn (gen_add2_insn (out, const2_rtx));

      emit_label (end_2_label);
    }

  /* Avoid a branch in fixing the byte.  */
  tmpreg = gen_lowpart (QImode, tmpreg);
  emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
  tmp = gen_rtx_REG (CCmode, FLAGS_REG);
  cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
  emit_insn (gen_sub3_carry (Pmode, out, out, GEN_INT (3), tmp, cmp));

  emit_label (end_0_label);
}

/* Expand strlen.  */

bool
ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
{
  if (TARGET_UNROLL_STRLEN
      && TARGET_INLINE_ALL_STRINGOPS
      && eoschar == const0_rtx
      && optimize > 1)
    {
      /* The generic case of the strlen expander is long.  Avoid expanding
	 it unless TARGET_INLINE_ALL_STRINGOPS.  */
      rtx addr = force_reg (Pmode, XEXP (src, 0));
      /* It seems that some optimizers do not combine a call like
	 foo (strlen (bar), strlen (bar));
	 when the move and the subtraction are done here; the length is
	 calculated just once when these instructions are done inside
	 output_strlen_unroll().  But since &bar[strlen (bar)] is often
	 used, and this uses one fewer register for the lifetime of
	 output_strlen_unroll(), it is better this way.  */

      emit_move_insn (out, addr);

      ix86_expand_strlensi_unroll_1 (out, src, align);

      /* strlensi_unroll_1 returns the address of the zero at the end of
	 the string, like memchr(), so compute the length by subtracting
	 the start address.  */
      emit_insn (gen_sub2_insn (out, addr));
      return true;
    }
  else
    return false;
}
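
/* Shape of the expansion above, for reference:

       out = addr;				// start address
       ix86_expand_strlensi_unroll_1 (...);	// out = address of the NUL
       out -= addr;				// out = string length

   so OUT finally holds strlen of the string at SRC.  */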

/* For a given symbol (function), construct code to compute the address of
   its PLT entry in the large x86-64 PIC model.  */

static rtx
construct_plt_address (rtx symbol)
{
  rtx tmp, unspec;

  gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
  gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
  gcc_assert (Pmode == DImode);

  tmp = gen_reg_rtx (Pmode);
  unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);

  emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
  emit_insn (gen_add2_insn (tmp, pic_offset_table_rtx));
  return tmp;
}

/* Additional registers that are clobbered by SYSV calls.  */

static int const x86_64_ms_sysv_extra_clobbered_registers
		 [NUM_X86_64_MS_CLOBBERED_REGS] =
{
  SI_REG, DI_REG,
  XMM6_REG, XMM7_REG,
  XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
  XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
};

rtx_insn *
ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
		  rtx callarg2,
		  rtx pop, bool sibcall)
{
  rtx vec[3];
  rtx use = NULL, call;
  unsigned int vec_len = 0;
  tree fndecl;

  if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
    {
      fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
      if (fndecl
	  && (lookup_attribute ("interrupt",
				TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
	error ("interrupt service routine cannot be called directly");
    }
  else
    fndecl = NULL_TREE;

  if (pop == const0_rtx)
    pop = NULL;
  gcc_assert (!TARGET_64BIT || !pop);

  rtx addr = XEXP (fnaddr, 0);
  if (TARGET_MACHO && !TARGET_64BIT)
    {
      if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
	fnaddr = machopic_indirect_call_target (fnaddr);
    }
  else
    {
      /* Static functions and indirect calls don't need the pic register.
	 Also, check if PLT was explicitly avoided via no-plt or the "noplt"
	 attribute, making it an indirect call.  */
      if (flag_pic
	  && GET_CODE (addr) == SYMBOL_REF
	  && ix86_call_use_plt_p (addr))
	{
	  if (flag_plt
	      && (SYMBOL_REF_DECL (addr) == NULL_TREE
		  || !lookup_attribute ("noplt",
					DECL_ATTRIBUTES
					  (SYMBOL_REF_DECL (addr)))))
	    {
	      if (!TARGET_64BIT
		  || (ix86_cmodel == CM_LARGE_PIC
		      && DEFAULT_ABI != MS_ABI))
		{
		  use_reg (&use, gen_rtx_REG (Pmode,
					      REAL_PIC_OFFSET_TABLE_REGNUM));
		  if (ix86_use_pseudo_pic_reg ())
		    emit_move_insn (gen_rtx_REG (Pmode,
						 REAL_PIC_OFFSET_TABLE_REGNUM),
				    pic_offset_table_rtx);
		}
	    }
	  else if (!TARGET_PECOFF && !TARGET_MACHO)
	    {
	      if (TARGET_64BIT
		  && ix86_cmodel == CM_LARGE_PIC
		  && DEFAULT_ABI != MS_ABI)
		{
		  fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
					   UNSPEC_GOT);
		  fnaddr = gen_rtx_CONST (Pmode, fnaddr);
		  fnaddr = force_reg (Pmode, fnaddr);
		  fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, fnaddr);
		}
	      else if (TARGET_64BIT)
		{
		  fnaddr = gen_rtx_UNSPEC (Pmode,
					   gen_rtvec (1, addr),
					   UNSPEC_GOTPCREL);
		  fnaddr = gen_rtx_CONST (Pmode, fnaddr);
		}
	      else
		{
		  fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
					   UNSPEC_GOT);
		  fnaddr = gen_rtx_CONST (Pmode, fnaddr);
		  fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
					 fnaddr);
		}
	      fnaddr = gen_const_mem (Pmode, fnaddr);
	      /* Pmode may not be the same as word_mode for x32, which
		 doesn't support indirect branch via 32-bit memory slot.
		 Since x32 GOT slot is 64 bit with zero upper 32 bits,
		 indirect branch via x32 GOT slot is OK.  */
	      if (GET_MODE (fnaddr) != word_mode)
		fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
	      fnaddr = gen_rtx_MEM (QImode, fnaddr);
	    }
	}
    }

  /* Skip setting up the RAX register for -mskip-rax-setup when there are no
     parameters passed in vector registers.  */
  if (TARGET_64BIT && callarg2 && CONST_INT_P (callarg2)
      && (INTVAL (callarg2) > 0
	  || (INTVAL (callarg2) == 0
	      && (TARGET_SSE || !flag_skip_rax_setup))))
    {
      rtx al = gen_rtx_REG (QImode, AX_REG);
      emit_move_insn (al, callarg2);
      use_reg (&use, al);
    }
== CM_LARGE_PIC
9611 && GET_CODE (XEXP (fnaddr
, 0)) == SYMBOL_REF
9612 && !local_symbolic_operand (XEXP (fnaddr
, 0), VOIDmode
))
9613 fnaddr
= gen_rtx_MEM (QImode
, construct_plt_address (XEXP (fnaddr
, 0)));
9614 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
9615 branch via x32 GOT slot is OK. */
9616 else if (!(TARGET_X32
9618 && GET_CODE (XEXP (fnaddr
, 0)) == ZERO_EXTEND
9619 && GOT_memory_operand (XEXP (XEXP (fnaddr
, 0), 0), Pmode
))
9621 ? !sibcall_insn_operand (XEXP (fnaddr
, 0), word_mode
)
9622 : !call_insn_operand (XEXP (fnaddr
, 0), word_mode
)))
9624 fnaddr
= convert_to_mode (word_mode
, XEXP (fnaddr
, 0), 1);
9625 fnaddr
= gen_rtx_MEM (QImode
, copy_to_mode_reg (word_mode
, fnaddr
));
9628 /* PR100665: Hwasan may tag code pointer which is not supported by LAM,
9629 mask off code pointers here.
9630 TODO: also need to handle indirect jump. */
9631 if (ix86_memtag_can_tag_addresses () && !fndecl
9632 && sanitize_flags_p (SANITIZE_HWADDRESS
))
9634 rtx untagged_addr
= ix86_memtag_untagged_pointer (XEXP (fnaddr
, 0),
9636 fnaddr
= gen_rtx_MEM (QImode
, untagged_addr
);
9639 call
= gen_rtx_CALL (VOIDmode
, fnaddr
, callarg1
);
9642 call
= gen_rtx_SET (retval
, call
);
9643 vec
[vec_len
++] = call
;
9647 pop
= gen_rtx_PLUS (Pmode
, stack_pointer_rtx
, pop
);
9648 pop
= gen_rtx_SET (stack_pointer_rtx
, pop
);
9649 vec
[vec_len
++] = pop
;
9652 if (cfun
->machine
->no_caller_saved_registers
9654 || (!TREE_THIS_VOLATILE (fndecl
)
9655 && !lookup_attribute ("no_caller_saved_registers",
9656 TYPE_ATTRIBUTES (TREE_TYPE (fndecl
))))))
9658 static const char ix86_call_used_regs
[] = CALL_USED_REGISTERS
;
9659 bool is_64bit_ms_abi
= (TARGET_64BIT
9660 && ix86_function_abi (fndecl
) == MS_ABI
);
9661 char c_mask
= CALL_USED_REGISTERS_MASK (is_64bit_ms_abi
);
9663 /* If there are no caller-saved registers, add all registers
9664 that are clobbered by the call which returns. */
9665 for (int i
= 0; i
< FIRST_PSEUDO_REGISTER
; i
++)
9667 && (ix86_call_used_regs
[i
] == 1
9668 || (ix86_call_used_regs
[i
] & c_mask
))
9669 && !STACK_REGNO_P (i
)
9670 && !MMX_REGNO_P (i
))
9672 gen_rtx_REG (GET_MODE (regno_reg_rtx
[i
]), i
));
9674 else if (TARGET_64BIT_MS_ABI
9675 && (!callarg2
|| INTVAL (callarg2
) != -2))
9679 for (i
= 0; i
< NUM_X86_64_MS_CLOBBERED_REGS
; i
++)
9681 int regno
= x86_64_ms_sysv_extra_clobbered_registers
[i
];
9682 machine_mode mode
= SSE_REGNO_P (regno
) ? TImode
: DImode
;
9684 clobber_reg (&use
, gen_rtx_REG (mode
, regno
));
9687 /* Set here, but it may get cleared later. */
9688 if (TARGET_CALL_MS2SYSV_XLOGUES
)
9693 /* Don't break hot-patched functions. */
9694 else if (ix86_function_ms_hook_prologue (current_function_decl
))
9697 /* TODO: Cases not yet examined. */
9698 else if (flag_split_stack
)
9699 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
9703 gcc_assert (!reload_completed
);
9704 cfun
->machine
->call_ms2sysv
= true;
9709 if (TARGET_MACHO
&& TARGET_64BIT
&& !sibcall
9710 && ((GET_CODE (addr
) == SYMBOL_REF
&& !SYMBOL_REF_LOCAL_P (addr
))
9711 || !fndecl
|| TREE_PUBLIC (fndecl
)))
9713 /* We allow public functions defined in a TU to bind locally for PIC
9714 code (the default) on 64bit Mach-O.
9715 If such functions are not inlined, we cannot tell at compile-time if
9716 they will be called via the lazy symbol resolver (this can depend on
9717 options given at link-time). Therefore, we must assume that the lazy
9718 resolver could be used which clobbers R11 and R10. */
9719 clobber_reg (&use
, gen_rtx_REG (DImode
, R11_REG
));
9720 clobber_reg (&use
, gen_rtx_REG (DImode
, R10_REG
));
9724 call
= gen_rtx_PARALLEL (VOIDmode
, gen_rtvec_v (vec_len
, vec
));
9725 rtx_insn
*call_insn
= emit_call_insn (call
);
9727 CALL_INSN_FUNCTION_USAGE (call_insn
) = use
;

/* Split a simple return that pops POPC bytes from the stack into an
   indirect branch with a stack adjustment.  */

void
ix86_split_simple_return_pop_internal (rtx popc)
{
  struct machine_function *m = cfun->machine;
  rtx ecx = gen_rtx_REG (SImode, CX_REG);
  rtx_insn *insn;

  /* There is no "pascal" calling convention in any 64bit ABI.  */
  gcc_assert (!TARGET_64BIT);

  insn = emit_insn (gen_pop (ecx));
  m->fs.cfa_offset -= UNITS_PER_WORD;
  m->fs.sp_offset -= UNITS_PER_WORD;

  rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
  x = gen_rtx_SET (stack_pointer_rtx, x);
  add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
  add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
  RTX_FRAME_RELATED_P (insn) = 1;

  x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc);
  x = gen_rtx_SET (stack_pointer_rtx, x);
  insn = emit_insn (x);
  add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
  RTX_FRAME_RELATED_P (insn) = 1;

  /* Now the return address is in ECX.  */
  emit_jump_insn (gen_simple_return_indirect_internal (ecx));
}

/* Errors in the source file can cause expand_expr to return const0_rtx
   where we expect a vector.  To avoid crashing, use one of the vector
   clear instructions.  */

static rtx
safe_vector_operand (rtx x, machine_mode mode)
{
  if (x == const0_rtx)
    x = CONST0_RTX (mode);
  return x;
}

/* Subroutine of ix86_expand_builtin to take care of binop insns.  */

static rtx
ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  machine_mode tmode = insn_data[icode].operand[0].mode;
  machine_mode mode0 = insn_data[icode].operand[1].mode;
  machine_mode mode1 = insn_data[icode].operand[2].mode;

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);
  if (VECTOR_MODE_P (mode1))
    op1 = safe_vector_operand (op1, mode1);

  if (optimize || !target
      || GET_MODE (target) != tmode
      || !insn_data[icode].operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);

  if (GET_MODE (op1) == SImode && mode1 == TImode)
    {
      rtx x = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_loadd (x, op1));
      op1 = gen_lowpart (TImode, x);
    }

  if (!insn_data[icode].operand[1].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
  if (!insn_data[icode].operand[2].predicate (op1, mode1))
    op1 = copy_to_mode_reg (mode1, op1);

  pat = GEN_FCN (icode) (target, op0, op1);
  if (! pat)
    return 0;

  emit_insn (pat);

  return target;
}
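
/* How the helper above is driven, as a sketch: for a two-operand builtin,
   the insn_data[] table supplies the modes and predicates of the chosen
   pattern, e.g. (assuming a hypothetical addv8hi3-style icode)

       operand[0] = V8HImode destination
       operand[1] / operand[2] = V8HImode sources

   and operands failing their predicate are forced into registers before
   GEN_FCN (icode) instantiates the pattern.  */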

/* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns.  */

static rtx
ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
			       enum ix86_builtin_func_type m_type,
			       enum rtx_code sub_code)
{
  rtx pat;
  unsigned int i, nargs;
  bool comparison_p = false;
  bool tf_p = false;
  bool last_arg_constant = false;
  int num_memory = 0;
  rtx xops[4];

  machine_mode tmode = insn_data[icode].operand[0].mode;

  switch (m_type)
    {
    case MULTI_ARG_4_DF2_DI_I:
    case MULTI_ARG_4_DF2_DI_I1:
    case MULTI_ARG_4_SF2_SI_I:
    case MULTI_ARG_4_SF2_SI_I1:
      nargs = 4;
      last_arg_constant = true;
      break;

    case MULTI_ARG_3_SF:
    case MULTI_ARG_3_DF:
    case MULTI_ARG_3_SF2:
    case MULTI_ARG_3_DF2:
    case MULTI_ARG_3_DI:
    case MULTI_ARG_3_SI:
    case MULTI_ARG_3_SI_DI:
    case MULTI_ARG_3_HI:
    case MULTI_ARG_3_HI_SI:
    case MULTI_ARG_3_QI:
    case MULTI_ARG_3_DI2:
    case MULTI_ARG_3_SI2:
    case MULTI_ARG_3_HI2:
    case MULTI_ARG_3_QI2:
      nargs = 3;
      break;

    case MULTI_ARG_2_SF:
    case MULTI_ARG_2_DF:
    case MULTI_ARG_2_DI:
    case MULTI_ARG_2_SI:
    case MULTI_ARG_2_HI:
    case MULTI_ARG_2_QI:
      nargs = 2;
      break;

    case MULTI_ARG_2_DI_IMM:
    case MULTI_ARG_2_SI_IMM:
    case MULTI_ARG_2_HI_IMM:
    case MULTI_ARG_2_QI_IMM:
      nargs = 2;
      last_arg_constant = true;
      break;

    case MULTI_ARG_1_SF:
    case MULTI_ARG_1_DF:
    case MULTI_ARG_1_SF2:
    case MULTI_ARG_1_DF2:
    case MULTI_ARG_1_DI:
    case MULTI_ARG_1_SI:
    case MULTI_ARG_1_HI:
    case MULTI_ARG_1_QI:
    case MULTI_ARG_1_SI_DI:
    case MULTI_ARG_1_HI_DI:
    case MULTI_ARG_1_HI_SI:
    case MULTI_ARG_1_QI_DI:
    case MULTI_ARG_1_QI_SI:
    case MULTI_ARG_1_QI_HI:
      nargs = 1;
      break;

    case MULTI_ARG_2_DI_CMP:
    case MULTI_ARG_2_SI_CMP:
    case MULTI_ARG_2_HI_CMP:
    case MULTI_ARG_2_QI_CMP:
      nargs = 2;
      comparison_p = true;
      break;

    case MULTI_ARG_2_SF_TF:
    case MULTI_ARG_2_DF_TF:
    case MULTI_ARG_2_DI_TF:
    case MULTI_ARG_2_SI_TF:
    case MULTI_ARG_2_HI_TF:
    case MULTI_ARG_2_QI_TF:
      nargs = 2;
      tf_p = true;
      break;

    default:
      gcc_unreachable ();
    }

  if (optimize || !target
      || GET_MODE (target) != tmode
      || !insn_data[icode].operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);
  else if (memory_operand (target, tmode))
    num_memory++;

  gcc_assert (nargs <= ARRAY_SIZE (xops));

  for (i = 0; i < nargs; i++)
    {
      tree arg = CALL_EXPR_ARG (exp, i);
      rtx op = expand_normal (arg);
      int adjust = (comparison_p) ? 1 : 0;
      machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;

      if (last_arg_constant && i == nargs - 1)
	{
	  if (!insn_data[icode].operand[i + 1].predicate (op, mode))
	    {
	      enum insn_code new_icode = icode;
	      switch (icode)
		{
		case CODE_FOR_xop_vpermil2v2df3:
		case CODE_FOR_xop_vpermil2v4sf3:
		case CODE_FOR_xop_vpermil2v4df3:
		case CODE_FOR_xop_vpermil2v8sf3:
		  error ("the last argument must be a 2-bit immediate");
		  return gen_reg_rtx (tmode);
		case CODE_FOR_xop_rotlv2di3:
		  new_icode = CODE_FOR_rotlv2di3;
		  goto xop_rotl;
		case CODE_FOR_xop_rotlv4si3:
		  new_icode = CODE_FOR_rotlv4si3;
		  goto xop_rotl;
		case CODE_FOR_xop_rotlv8hi3:
		  new_icode = CODE_FOR_rotlv8hi3;
		  goto xop_rotl;
		case CODE_FOR_xop_rotlv16qi3:
		  new_icode = CODE_FOR_rotlv16qi3;
		xop_rotl:
		  if (CONST_INT_P (op))
		    {
		      int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
		      op = GEN_INT (INTVAL (op) & mask);
		      gcc_checking_assert
			(insn_data[icode].operand[i + 1].predicate (op,
								    mode));
		    }
		  else
		    {
		      gcc_checking_assert
			(nargs == 2
			 && insn_data[new_icode].operand[0].mode == tmode
			 && insn_data[new_icode].operand[1].mode == tmode
			 && insn_data[new_icode].operand[2].mode == mode
			 && insn_data[new_icode].operand[0].predicate
			    == insn_data[icode].operand[0].predicate
			 && insn_data[new_icode].operand[1].predicate
			    == insn_data[icode].operand[1].predicate);
		      icode = new_icode;
		    }
		  break;
		default:
		  gcc_unreachable ();
		}
	    }
	}
      else
	{
	  if (VECTOR_MODE_P (mode))
	    op = safe_vector_operand (op, mode);

	  /* If we aren't optimizing, only allow one memory operand to be
	     generated.  */
	  if (memory_operand (op, mode))
	    num_memory++;

	  gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);

	  if (optimize
	      || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
	      || num_memory > 1)
	    op = force_reg (mode, op);
	}

      xops[i] = op;
    }

  switch (nargs)
    {
    case 1:
      pat = GEN_FCN (icode) (target, xops[0]);
      break;

    case 2:
      if (tf_p)
	pat = GEN_FCN (icode) (target, xops[0], xops[1],
			       GEN_INT ((int) sub_code));
      else if (! comparison_p)
	pat = GEN_FCN (icode) (target, xops[0], xops[1]);
      else
	{
	  rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
				       xops[0], xops[1]);

	  pat = GEN_FCN (icode) (target, cmp_op, xops[0], xops[1]);
	}
      break;

    case 3:
      pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
      break;

    case 4:
      pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2], xops[3]);
      break;

    default:
      gcc_unreachable ();
    }

  if (! pat)
    return 0;

  emit_insn (pat);
  return target;
}

/* Subroutine of ix86_expand_args_builtin to take care of scalar unop
   insns with vec_merge.  */

static rtx
ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
				    rtx target)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  rtx op1, op0 = expand_normal (arg0);
  machine_mode tmode = insn_data[icode].operand[0].mode;
  machine_mode mode0 = insn_data[icode].operand[1].mode;

  if (optimize || !target
      || GET_MODE (target) != tmode
      || !insn_data[icode].operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);

  if ((optimize && !register_operand (op0, mode0))
      || !insn_data[icode].operand[1].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);

  op1 = op0;
  if (!insn_data[icode].operand[2].predicate (op1, mode0))
    op1 = copy_to_mode_reg (mode0, op1);

  pat = GEN_FCN (icode) (target, op0, op1);
  if (! pat)
    return 0;
  emit_insn (pat);
  return target;
}

/* Subroutine of ix86_expand_builtin to take care of comparison insns.  */

static rtx
ix86_expand_sse_compare (const struct builtin_description *d,
			 tree exp, rtx target, bool swap)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  rtx op2;
  machine_mode tmode = insn_data[d->icode].operand[0].mode;
  machine_mode mode0 = insn_data[d->icode].operand[1].mode;
  machine_mode mode1 = insn_data[d->icode].operand[2].mode;
  enum rtx_code comparison = d->comparison;

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);
  if (VECTOR_MODE_P (mode1))
    op1 = safe_vector_operand (op1, mode1);

  /* Swap operands if we have a comparison that isn't available in
     hardware.  */
  if (swap)
    std::swap (op0, op1);

  if (optimize || !target
      || GET_MODE (target) != tmode
      || !insn_data[d->icode].operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);

  if ((optimize && !register_operand (op0, mode0))
      || !insn_data[d->icode].operand[1].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
  if ((optimize && !register_operand (op1, mode1))
      || !insn_data[d->icode].operand[2].predicate (op1, mode1))
    op1 = copy_to_mode_reg (mode1, op1);

  op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
  pat = GEN_FCN (d->icode) (target, op0, op1, op2);
  if (! pat)
    return 0;
  emit_insn (pat);
  return target;
}

/* Subroutine of ix86_sse_comi and ix86_sse_comi_round to take care of
   ordered EQ or unordered NE; generate a PF jump.  */

static rtx
ix86_ssecom_setcc (const enum rtx_code comparison,
		   bool check_unordered, machine_mode mode,
		   rtx set_dst, rtx target)
{
  rtx_code_label *label = NULL;

  /* NB: For ordered EQ or unordered NE, checking ZF alone isn't sufficient
     with NAN operands.  */
  if (check_unordered)
    {
      gcc_assert (comparison == EQ || comparison == NE);

      rtx flag = gen_rtx_REG (CCFPmode, FLAGS_REG);
      label = gen_label_rtx ();
      rtx tmp = gen_rtx_fmt_ee (UNORDERED, VOIDmode, flag, const0_rtx);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
				  gen_rtx_LABEL_REF (VOIDmode, label),
				  pc_rtx);
      emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
    }

  /* NB: Set CCFPmode and check a different CCmode which is a subset
     of CCFPmode.  */
  if (GET_MODE (set_dst) != mode)
    {
      gcc_assert (mode == CCAmode || mode == CCCmode
		  || mode == CCOmode || mode == CCPmode
		  || mode == CCSmode || mode == CCZmode);
      set_dst = gen_rtx_REG (mode, FLAGS_REG);
    }

  emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
			  gen_rtx_fmt_ee (comparison, QImode,
					  set_dst,
					  const0_rtx)));

  if (label)
    emit_label (label);

  return SUBREG_REG (target);
}

/* Subroutine of ix86_expand_builtin to take care of comi insns.  */

static rtx
ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
		      rtx target)
{
  rtx pat, set_dst;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  enum insn_code icode = d->icode;
  const struct insn_data_d *insn_p = &insn_data[icode];
  machine_mode mode0 = insn_p->operand[0].mode;
  machine_mode mode1 = insn_p->operand[1].mode;

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);
  if (VECTOR_MODE_P (mode1))
    op1 = safe_vector_operand (op1, mode1);

  enum rtx_code comparison = d->comparison;
  rtx const_val = const0_rtx;

  bool check_unordered = false;
  machine_mode mode = CCFPmode;
  switch (comparison)
    {
    case LE:	/* -> GE  */
    case LT:	/* -> GT  */
      std::swap (op0, op1);
      comparison = swap_condition (comparison);
      /* FALLTHRU */
    case GT:
    case GE:
      break;
    case EQ:
      check_unordered = true;
      mode = CCZmode;
      break;
    case NE:
      check_unordered = true;
      mode = CCZmode;
      const_val = const1_rtx;
      break;
    default:
      gcc_unreachable ();
    }

  target = gen_reg_rtx (SImode);
  emit_move_insn (target, const_val);
  target = gen_rtx_SUBREG (QImode, target, 0);

  if ((optimize && !register_operand (op0, mode0))
      || !insn_p->operand[0].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
  if ((optimize && !register_operand (op1, mode1))
      || !insn_p->operand[1].predicate (op1, mode1))
    op1 = copy_to_mode_reg (mode1, op1);

  pat = GEN_FCN (icode) (op0, op1);
  if (! pat)
    return 0;

  set_dst = SET_DEST (pat);
  emit_insn (pat);
  return ix86_ssecom_setcc (comparison, check_unordered, mode,
			    set_dst, target);
}

/* Subroutines of ix86_expand_args_builtin to take care of round insns.  */

static rtx
ix86_expand_sse_round (const struct builtin_description *d, tree exp,
		       rtx target)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  rtx op1, op0 = expand_normal (arg0);
  machine_mode tmode = insn_data[d->icode].operand[0].mode;
  machine_mode mode0 = insn_data[d->icode].operand[1].mode;

  if (optimize || target == 0
      || GET_MODE (target) != tmode
      || !insn_data[d->icode].operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);

  if ((optimize && !register_operand (op0, mode0))
      || !insn_data[d->icode].operand[0].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);

  op1 = GEN_INT (d->comparison);

  pat = GEN_FCN (d->icode) (target, op0, op1);
  if (! pat)
    return 0;
  emit_insn (pat);
  return target;
}

static rtx
ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
				     tree exp, rtx target)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  rtx op2;
  machine_mode tmode = insn_data[d->icode].operand[0].mode;
  machine_mode mode0 = insn_data[d->icode].operand[1].mode;
  machine_mode mode1 = insn_data[d->icode].operand[2].mode;

  if (optimize || target == 0
      || GET_MODE (target) != tmode
      || !insn_data[d->icode].operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);

  op0 = safe_vector_operand (op0, mode0);
  op1 = safe_vector_operand (op1, mode1);

  if ((optimize && !register_operand (op0, mode0))
      || !insn_data[d->icode].operand[0].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
  if ((optimize && !register_operand (op1, mode1))
      || !insn_data[d->icode].operand[1].predicate (op1, mode1))
    op1 = copy_to_mode_reg (mode1, op1);

  op2 = GEN_INT (d->comparison);

  pat = GEN_FCN (d->icode) (target, op0, op1, op2);
  if (! pat)
    return 0;
  emit_insn (pat);
  return target;
}

/* Subroutine of ix86_expand_builtin to take care of ptest insns.  */

static rtx
ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
		       rtx target)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  machine_mode mode0 = insn_data[d->icode].operand[0].mode;
  machine_mode mode1 = insn_data[d->icode].operand[1].mode;
  enum rtx_code comparison = d->comparison;

  /* ptest reg, reg sets the carry flag.  */
  if (comparison == LTU
      && (d->code == IX86_BUILTIN_PTESTC
	  || d->code == IX86_BUILTIN_PTESTC256)
      && rtx_equal_p (op0, op1))
    {
      if (!target)
	target = gen_reg_rtx (SImode);
      emit_move_insn (target, const1_rtx);
      return target;
    }

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);
  if (VECTOR_MODE_P (mode1))
    op1 = safe_vector_operand (op1, mode1);

  target = gen_reg_rtx (SImode);
  emit_move_insn (target, const0_rtx);
  target = gen_rtx_SUBREG (QImode, target, 0);

  if ((optimize && !register_operand (op0, mode0))
      || !insn_data[d->icode].operand[0].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
  if ((optimize && !register_operand (op1, mode1))
      || !insn_data[d->icode].operand[1].predicate (op1, mode1))
    op1 = copy_to_mode_reg (mode1, op1);

  pat = GEN_FCN (d->icode) (op0, op1);
  if (! pat)
    return 0;
  emit_insn (pat);
  emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
			  gen_rtx_fmt_ee (comparison, QImode,
					  SET_DEST (pat),
					  const0_rtx)));

  return SUBREG_REG (target);
}

/* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns.  */

static rtx
ix86_expand_sse_pcmpestr (const struct builtin_description *d,
			  tree exp, rtx target)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  tree arg2 = CALL_EXPR_ARG (exp, 2);
  tree arg3 = CALL_EXPR_ARG (exp, 3);
  tree arg4 = CALL_EXPR_ARG (exp, 4);
  rtx scratch0, scratch1;
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  rtx op2 = expand_normal (arg2);
  rtx op3 = expand_normal (arg3);
  rtx op4 = expand_normal (arg4);
  machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;

  tmode0 = insn_data[d->icode].operand[0].mode;
  tmode1 = insn_data[d->icode].operand[1].mode;
  modev2 = insn_data[d->icode].operand[2].mode;
  modei3 = insn_data[d->icode].operand[3].mode;
  modev4 = insn_data[d->icode].operand[4].mode;
  modei5 = insn_data[d->icode].operand[5].mode;
  modeimm = insn_data[d->icode].operand[6].mode;

  if (VECTOR_MODE_P (modev2))
    op0 = safe_vector_operand (op0, modev2);
  if (VECTOR_MODE_P (modev4))
    op2 = safe_vector_operand (op2, modev4);

  if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
    op0 = copy_to_mode_reg (modev2, op0);
  if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
    op1 = copy_to_mode_reg (modei3, op1);
  if ((optimize && !register_operand (op2, modev4))
      || !insn_data[d->icode].operand[4].predicate (op2, modev4))
    op2 = copy_to_mode_reg (modev4, op2);
  if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
    op3 = copy_to_mode_reg (modei5, op3);

  if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
    {
      error ("the fifth argument must be an 8-bit immediate");
      return const0_rtx;
    }

  if (d->code == IX86_BUILTIN_PCMPESTRI128)
    {
      if (optimize || !target
	  || GET_MODE (target) != tmode0
	  || !insn_data[d->icode].operand[0].predicate (target, tmode0))
	target = gen_reg_rtx (tmode0);

      scratch1 = gen_reg_rtx (tmode1);

      pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
    }
  else if (d->code == IX86_BUILTIN_PCMPESTRM128)
    {
      if (optimize || !target
	  || GET_MODE (target) != tmode1
	  || !insn_data[d->icode].operand[1].predicate (target, tmode1))
	target = gen_reg_rtx (tmode1);

      scratch0 = gen_reg_rtx (tmode0);

      pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
    }
  else
    {
      gcc_assert (d->flag);

      scratch0 = gen_reg_rtx (tmode0);
      scratch1 = gen_reg_rtx (tmode1);

      pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
    }

  if (! pat)
    return 0;

  emit_insn (pat);

  if (d->flag)
    {
      target = gen_reg_rtx (SImode);
      emit_move_insn (target, const0_rtx);
      target = gen_rtx_SUBREG (QImode, target, 0);

      emit_insn
	(gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
		      gen_rtx_fmt_ee (EQ, QImode,
				      gen_rtx_REG ((machine_mode) d->flag,
						   FLAGS_REG),
				      const0_rtx)));
      return SUBREG_REG (target);
    }
  else
    return target;
}

/* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns.  */

static rtx
ix86_expand_sse_pcmpistr (const struct builtin_description *d,
			  tree exp, rtx target)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  tree arg2 = CALL_EXPR_ARG (exp, 2);
  rtx scratch0, scratch1;
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  rtx op2 = expand_normal (arg2);
  machine_mode tmode0, tmode1, modev2, modev3, modeimm;

  tmode0 = insn_data[d->icode].operand[0].mode;
  tmode1 = insn_data[d->icode].operand[1].mode;
  modev2 = insn_data[d->icode].operand[2].mode;
  modev3 = insn_data[d->icode].operand[3].mode;
  modeimm = insn_data[d->icode].operand[4].mode;

  if (VECTOR_MODE_P (modev2))
    op0 = safe_vector_operand (op0, modev2);
  if (VECTOR_MODE_P (modev3))
    op1 = safe_vector_operand (op1, modev3);

  if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
    op0 = copy_to_mode_reg (modev2, op0);
  if ((optimize && !register_operand (op1, modev3))
      || !insn_data[d->icode].operand[3].predicate (op1, modev3))
    op1 = copy_to_mode_reg (modev3, op1);

  if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
    {
      error ("the third argument must be an 8-bit immediate");
      return const0_rtx;
    }

  if (d->code == IX86_BUILTIN_PCMPISTRI128)
    {
      if (optimize || !target
	  || GET_MODE (target) != tmode0
	  || !insn_data[d->icode].operand[0].predicate (target, tmode0))
	target = gen_reg_rtx (tmode0);

      scratch1 = gen_reg_rtx (tmode1);

      pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
    }
  else if (d->code == IX86_BUILTIN_PCMPISTRM128)
    {
      if (optimize || !target
	  || GET_MODE (target) != tmode1
	  || !insn_data[d->icode].operand[1].predicate (target, tmode1))
	target = gen_reg_rtx (tmode1);

      scratch0 = gen_reg_rtx (tmode0);

      pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
    }
  else
    {
      gcc_assert (d->flag);

      scratch0 = gen_reg_rtx (tmode0);
      scratch1 = gen_reg_rtx (tmode1);

      pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
    }

  if (! pat)
    return 0;

  emit_insn (pat);

  if (d->flag)
    {
      target = gen_reg_rtx (SImode);
      emit_move_insn (target, const0_rtx);
      target = gen_rtx_SUBREG (QImode, target, 0);

      emit_insn
	(gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
		      gen_rtx_fmt_ee (EQ, QImode,
				      gen_rtx_REG ((machine_mode) d->flag,
						   FLAGS_REG),
				      const0_rtx)));
      return SUBREG_REG (target);
    }
  else
    return target;
}

/* Fixup modeless constants to fit the required mode.  */

static rtx
fixup_modeless_constant (rtx x, machine_mode mode)
{
  if (GET_MODE (x) == VOIDmode)
    x = convert_to_mode (mode, x, 1);
  return x;
}
10587 /* Subroutine of ix86_expand_builtin to take care of insns with
10588 variable number of operands. */
10591 ix86_expand_args_builtin (const struct builtin_description
*d
,
10592 tree exp
, rtx target
)
10594 rtx pat
, real_target
;
10595 unsigned int i
, nargs
;
10596 unsigned int nargs_constant
= 0;
10597 unsigned int mask_pos
= 0;
10598 int num_memory
= 0;
10600 bool second_arg_count
= false;
10601 enum insn_code icode
= d
->icode
;
10602 const struct insn_data_d
*insn_p
= &insn_data
[icode
];
10603 machine_mode tmode
= insn_p
->operand
[0].mode
;
10604 machine_mode rmode
= VOIDmode
;
10606 enum rtx_code comparison
= d
->comparison
;
10608 switch ((enum ix86_builtin_func_type
) d
->flag
)
10610 case V2DF_FTYPE_V2DF_ROUND
:
10611 case V4DF_FTYPE_V4DF_ROUND
:
10612 case V8DF_FTYPE_V8DF_ROUND
:
10613 case V4SF_FTYPE_V4SF_ROUND
:
10614 case V8SF_FTYPE_V8SF_ROUND
:
10615 case V16SF_FTYPE_V16SF_ROUND
:
10616 case V8HF_FTYPE_V8HF_ROUND
:
10617 case V16HF_FTYPE_V16HF_ROUND
:
10618 case V32HF_FTYPE_V32HF_ROUND
:
10619 case V4SI_FTYPE_V4SF_ROUND
:
10620 case V8SI_FTYPE_V8SF_ROUND
:
10621 case V16SI_FTYPE_V16SF_ROUND
:
10622 return ix86_expand_sse_round (d
, exp
, target
);
10623 case V4SI_FTYPE_V2DF_V2DF_ROUND
:
10624 case V8SI_FTYPE_V4DF_V4DF_ROUND
:
10625 case V16SI_FTYPE_V8DF_V8DF_ROUND
:
10626 return ix86_expand_sse_round_vec_pack_sfix (d
, exp
, target
);
10627 case INT_FTYPE_V8SF_V8SF_PTEST
:
10628 case INT_FTYPE_V4DI_V4DI_PTEST
:
10629 case INT_FTYPE_V4DF_V4DF_PTEST
:
10630 case INT_FTYPE_V4SF_V4SF_PTEST
:
10631 case INT_FTYPE_V2DI_V2DI_PTEST
:
10632 case INT_FTYPE_V2DF_V2DF_PTEST
:
10633 return ix86_expand_sse_ptest (d
, exp
, target
);
10634 case FLOAT128_FTYPE_FLOAT128
:
10635 case FLOAT_FTYPE_FLOAT
:
10636 case FLOAT_FTYPE_BFLOAT16
:
10637 case INT_FTYPE_INT
:
10638 case UINT_FTYPE_UINT
:
10639 case UINT16_FTYPE_UINT16
:
10640 case UINT64_FTYPE_INT
:
10641 case UINT64_FTYPE_UINT64
:
10642 case INT64_FTYPE_INT64
:
10643 case INT64_FTYPE_V4SF
:
10644 case INT64_FTYPE_V2DF
:
10645 case INT_FTYPE_V16QI
:
10646 case INT_FTYPE_V8QI
:
10647 case INT_FTYPE_V8SF
:
10648 case INT_FTYPE_V4DF
:
10649 case INT_FTYPE_V4SF
:
10650 case INT_FTYPE_V2DF
:
10651 case INT_FTYPE_V32QI
:
10652 case V16QI_FTYPE_V16QI
:
10653 case V8SI_FTYPE_V8SF
:
10654 case V8SI_FTYPE_V4SI
:
10655 case V8HI_FTYPE_V8HI
:
10656 case V8HI_FTYPE_V16QI
:
10657 case V8QI_FTYPE_V8QI
:
10658 case V8SF_FTYPE_V8SF
:
10659 case V8SF_FTYPE_V8SI
:
10660 case V8SF_FTYPE_V4SF
:
10661 case V8SF_FTYPE_V8HI
:
10662 case V4SI_FTYPE_V4SI
:
10663 case V4SI_FTYPE_V16QI
:
10664 case V4SI_FTYPE_V4SF
:
10665 case V4SI_FTYPE_V8SI
:
10666 case V4SI_FTYPE_V8HI
:
10667 case V4SI_FTYPE_V4DF
:
10668 case V4SI_FTYPE_V2DF
:
10669 case V4HI_FTYPE_V4HI
:
10670 case V4DF_FTYPE_V4DF
:
10671 case V4DF_FTYPE_V4SI
:
10672 case V4DF_FTYPE_V4SF
:
10673 case V4DF_FTYPE_V2DF
:
10674 case V4SF_FTYPE_V4SF
:
10675 case V4SF_FTYPE_V4SI
:
10676 case V4SF_FTYPE_V8SF
:
10677 case V4SF_FTYPE_V4DF
:
10678 case V4SF_FTYPE_V8HI
:
10679 case V4SF_FTYPE_V2DF
:
10680 case V2DI_FTYPE_V2DI
:
10681 case V2DI_FTYPE_V16QI
:
10682 case V2DI_FTYPE_V8HI
:
10683 case V2DI_FTYPE_V4SI
:
10684 case V2DF_FTYPE_V2DF
:
10685 case V2DF_FTYPE_V4SI
:
10686 case V2DF_FTYPE_V4DF
:
10687 case V2DF_FTYPE_V4SF
:
10688 case V2DF_FTYPE_V2SI
:
10689 case V2SI_FTYPE_V2SI
:
10690 case V2SI_FTYPE_V4SF
:
10691 case V2SI_FTYPE_V2SF
:
10692 case V2SI_FTYPE_V2DF
:
10693 case V2SF_FTYPE_V2SF
:
10694 case V2SF_FTYPE_V2SI
:
10695 case V32QI_FTYPE_V32QI
:
10696 case V32QI_FTYPE_V16QI
:
10697 case V16HI_FTYPE_V16HI
:
10698 case V16HI_FTYPE_V8HI
:
10699 case V8SI_FTYPE_V8SI
:
10700 case V16HI_FTYPE_V16QI
:
10701 case V8SI_FTYPE_V16QI
:
10702 case V4DI_FTYPE_V16QI
:
10703 case V8SI_FTYPE_V8HI
:
10704 case V4DI_FTYPE_V8HI
:
10705 case V4DI_FTYPE_V4SI
:
10706 case V4DI_FTYPE_V2DI
:
10707 case UQI_FTYPE_UQI
:
10708 case UHI_FTYPE_UHI
:
10709 case USI_FTYPE_USI
:
10710 case USI_FTYPE_UQI
:
10711 case USI_FTYPE_UHI
:
10712 case UDI_FTYPE_UDI
:
10713 case UHI_FTYPE_V16QI
:
10714 case USI_FTYPE_V32QI
:
10715 case UDI_FTYPE_V64QI
:
10716 case V16QI_FTYPE_UHI
:
10717 case V32QI_FTYPE_USI
:
10718 case V64QI_FTYPE_UDI
:
10719 case V8HI_FTYPE_UQI
:
10720 case V16HI_FTYPE_UHI
:
10721 case V32HI_FTYPE_USI
:
10722 case V4SI_FTYPE_UQI
:
10723 case V8SI_FTYPE_UQI
:
10724 case V4SI_FTYPE_UHI
:
10725 case V8SI_FTYPE_UHI
:
10726 case UQI_FTYPE_V8HI
:
10727 case UHI_FTYPE_V16HI
:
10728 case USI_FTYPE_V32HI
:
10729 case UQI_FTYPE_V4SI
:
10730 case UQI_FTYPE_V8SI
:
10731 case UHI_FTYPE_V16SI
:
10732 case UQI_FTYPE_V2DI
:
10733 case UQI_FTYPE_V4DI
:
10734 case UQI_FTYPE_V8DI
:
10735 case V16SI_FTYPE_UHI
:
10736 case V2DI_FTYPE_UQI
:
10737 case V4DI_FTYPE_UQI
:
10738 case V16SI_FTYPE_INT
:
10739 case V16SF_FTYPE_V8SF
:
10740 case V16SI_FTYPE_V8SI
:
10741 case V16SF_FTYPE_V4SF
:
10742 case V16SI_FTYPE_V4SI
:
10743 case V16SI_FTYPE_V16SF
:
10744 case V16SI_FTYPE_V16SI
:
10745 case V64QI_FTYPE_V64QI
:
10746 case V32HI_FTYPE_V32HI
:
10747 case V16SF_FTYPE_V16SF
:
10748 case V8DI_FTYPE_UQI
:
10749 case V8DI_FTYPE_V8DI
:
10750 case V8DF_FTYPE_V4DF
:
10751 case V8DF_FTYPE_V2DF
:
10752 case V8DF_FTYPE_V8DF
:
10753 case V4DI_FTYPE_V4DI
:
10754 case V16BF_FTYPE_V16SF
:
10755 case V8BF_FTYPE_V8SF
:
10756 case V8BF_FTYPE_V4SF
:
10759 case V4SF_FTYPE_V4SF_VEC_MERGE
:
10760 case V2DF_FTYPE_V2DF_VEC_MERGE
:
10761 return ix86_expand_unop_vec_merge_builtin (icode
, exp
, target
);
10762 case FLOAT128_FTYPE_FLOAT128_FLOAT128
:
10763 case V16QI_FTYPE_V16QI_V16QI
:
10764 case V16QI_FTYPE_V8HI_V8HI
:
10765 case V16HF_FTYPE_V16HF_V16HF
:
10766 case V16SF_FTYPE_V16SF_V16SF
:
10767 case V8QI_FTYPE_V8QI_V8QI
:
10768 case V8QI_FTYPE_V4HI_V4HI
:
10769 case V8HI_FTYPE_V8HI_V8HI
:
10770 case V8HI_FTYPE_V16QI_V16QI
:
10771 case V8HI_FTYPE_V4SI_V4SI
:
10772 case V8HF_FTYPE_V8HF_V8HF
:
10773 case V8SF_FTYPE_V8SF_V8SF
:
10774 case V8SF_FTYPE_V8SF_V8SI
:
10775 case V8DF_FTYPE_V8DF_V8DF
:
10776 case V4SI_FTYPE_V4SI_V4SI
:
10777 case V4SI_FTYPE_V8HI_V8HI
:
10778 case V4SI_FTYPE_V2DF_V2DF
:
10779 case V4HI_FTYPE_V4HI_V4HI
:
10780 case V4HI_FTYPE_V8QI_V8QI
:
10781 case V4HI_FTYPE_V2SI_V2SI
:
10782 case V4DF_FTYPE_V4DF_V4DF
:
10783 case V4DF_FTYPE_V4DF_V4DI
:
10784 case V4SF_FTYPE_V4SF_V4SF
:
10785 case V4SF_FTYPE_V4SF_V4SI
:
10786 case V4SF_FTYPE_V4SF_V2SI
:
10787 case V4SF_FTYPE_V4SF_V2DF
:
10788 case V4SF_FTYPE_V4SF_UINT
:
10789 case V4SF_FTYPE_V4SF_DI
:
10790 case V4SF_FTYPE_V4SF_SI
:
10791 case V4DI_FTYPE_V4DI_V2DI
:
10792 case V2DI_FTYPE_V2DI_V2DI
:
10793 case V2DI_FTYPE_V16QI_V16QI
:
10794 case V2DI_FTYPE_V4SI_V4SI
:
10795 case V2DI_FTYPE_V2DI_V16QI
:
10796 case V2SI_FTYPE_V2SI_V2SI
:
10797 case V2SI_FTYPE_V4HI_V4HI
:
10798 case V2SI_FTYPE_V2SF_V2SF
:
10799 case V2DF_FTYPE_V2DF_V2DF
:
10800 case V2DF_FTYPE_V2DF_V4SF
:
10801 case V2DF_FTYPE_V2DF_V2DI
:
10802 case V2DF_FTYPE_V2DF_DI
:
10803 case V2DF_FTYPE_V2DF_SI
:
10804 case V2DF_FTYPE_V2DF_UINT
:
10805 case V2SF_FTYPE_V2SF_V2SF
:
10806 case V1DI_FTYPE_V1DI_V1DI
:
10807 case V1DI_FTYPE_V8QI_V8QI
:
10808 case V1DI_FTYPE_V2SI_V2SI
:
10809 case V32QI_FTYPE_V16HI_V16HI
:
10810 case V16HI_FTYPE_V8SI_V8SI
:
10811 case V64QI_FTYPE_V64QI_V64QI
:
10812 case V32QI_FTYPE_V32QI_V32QI
:
10813 case V16HI_FTYPE_V32QI_V32QI
:
10814 case V16HI_FTYPE_V16HI_V16HI
:
10815 case V8SI_FTYPE_V4DF_V4DF
:
10816 case V8SI_FTYPE_V8SI_V8SI
:
10817 case V8SI_FTYPE_V16HI_V16HI
:
10818 case V4DI_FTYPE_V4DI_V4DI
:
10819 case V4DI_FTYPE_V8SI_V8SI
:
10820 case V4DI_FTYPE_V32QI_V32QI
:
10821 case V8DI_FTYPE_V64QI_V64QI
:
10822 if (comparison
== UNKNOWN
)
10823 return ix86_expand_binop_builtin (icode
, exp
, target
);
10826 case V4SF_FTYPE_V4SF_V4SF_SWAP
:
10827 case V2DF_FTYPE_V2DF_V2DF_SWAP
:
10828 gcc_assert (comparison
!= UNKNOWN
);
10832 case V16HI_FTYPE_V16HI_V8HI_COUNT
:
10833 case V16HI_FTYPE_V16HI_SI_COUNT
:
10834 case V8SI_FTYPE_V8SI_V4SI_COUNT
:
10835 case V8SI_FTYPE_V8SI_SI_COUNT
:
10836 case V4DI_FTYPE_V4DI_V2DI_COUNT
:
10837 case V4DI_FTYPE_V4DI_INT_COUNT
:
10838 case V8HI_FTYPE_V8HI_V8HI_COUNT
:
10839 case V8HI_FTYPE_V8HI_SI_COUNT
:
10840 case V4SI_FTYPE_V4SI_V4SI_COUNT
:
10841 case V4SI_FTYPE_V4SI_SI_COUNT
:
10842 case V4HI_FTYPE_V4HI_V4HI_COUNT
:
10843 case V4HI_FTYPE_V4HI_SI_COUNT
:
10844 case V2DI_FTYPE_V2DI_V2DI_COUNT
:
10845 case V2DI_FTYPE_V2DI_SI_COUNT
:
10846 case V2SI_FTYPE_V2SI_V2SI_COUNT
:
10847 case V2SI_FTYPE_V2SI_SI_COUNT
:
10848 case V1DI_FTYPE_V1DI_V1DI_COUNT
:
10849 case V1DI_FTYPE_V1DI_SI_COUNT
:
10851 second_arg_count
= true;
10853 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT
:
10854 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT
:
10855 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT
:
10856 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT
:
10857 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT
:
10858 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT
:
10859 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT
:
10860 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT
:
10861 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT
:
10862 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT
:
10863 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT
:
10864 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT
:
10865 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT
:
10866 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT
:
10867 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT
:
10868 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT
:
10869 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT
:
10870 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT
:
10872 second_arg_count
= true;
10874 case UINT64_FTYPE_UINT64_UINT64
:
10875 case UINT_FTYPE_UINT_UINT
:
10876 case UINT_FTYPE_UINT_USHORT
:
10877 case UINT_FTYPE_UINT_UCHAR
:
10878 case UINT16_FTYPE_UINT16_INT
:
10879 case UINT8_FTYPE_UINT8_INT
:
10880 case UQI_FTYPE_UQI_UQI
:
10881 case UHI_FTYPE_UHI_UHI
:
10882 case USI_FTYPE_USI_USI
:
10883 case UDI_FTYPE_UDI_UDI
:
10884 case V16SI_FTYPE_V8DF_V8DF
:
10885 case V32BF_FTYPE_V16SF_V16SF
:
10886 case V16BF_FTYPE_V8SF_V8SF
:
10887 case V8BF_FTYPE_V4SF_V4SF
:
10888 case V16BF_FTYPE_V16SF_UHI
:
10889 case V8BF_FTYPE_V8SF_UQI
:
10890 case V8BF_FTYPE_V4SF_UQI
:
10893 case V2DI_FTYPE_V2DI_INT_CONVERT
:
10896 nargs_constant
= 1;
10898 case V4DI_FTYPE_V4DI_INT_CONVERT
:
10901 nargs_constant
= 1;
10903 case V8DI_FTYPE_V8DI_INT_CONVERT
:
10906 nargs_constant
= 1;
10908 case V8HI_FTYPE_V8HI_INT
:
10909 case V8HI_FTYPE_V8SF_INT
:
10910 case V16HI_FTYPE_V16SF_INT
:
10911 case V8HI_FTYPE_V4SF_INT
:
10912 case V8SF_FTYPE_V8SF_INT
:
10913 case V4SF_FTYPE_V16SF_INT
:
10914 case V16SF_FTYPE_V16SF_INT
:
10915 case V4SI_FTYPE_V4SI_INT
:
10916 case V4SI_FTYPE_V8SI_INT
:
10917 case V4HI_FTYPE_V4HI_INT
:
10918 case V4DF_FTYPE_V4DF_INT
:
10919 case V4DF_FTYPE_V8DF_INT
:
10920 case V4SF_FTYPE_V4SF_INT
:
10921 case V4SF_FTYPE_V8SF_INT
:
10922 case V2DI_FTYPE_V2DI_INT
:
10923 case V2DF_FTYPE_V2DF_INT
:
10924 case V2DF_FTYPE_V4DF_INT
:
10925 case V16HI_FTYPE_V16HI_INT
:
10926 case V8SI_FTYPE_V8SI_INT
:
10927 case V16SI_FTYPE_V16SI_INT
:
10928 case V4SI_FTYPE_V16SI_INT
:
10929 case V4DI_FTYPE_V4DI_INT
:
10930 case V2DI_FTYPE_V4DI_INT
:
10931 case V4DI_FTYPE_V8DI_INT
:
10932 case UQI_FTYPE_UQI_UQI_CONST
:
10933 case UHI_FTYPE_UHI_UQI
:
10934 case USI_FTYPE_USI_UQI
:
10935 case UDI_FTYPE_UDI_UQI
:
10937 nargs_constant
= 1;
10939 case V16QI_FTYPE_V16QI_V16QI_V16QI
:
10940 case V8SF_FTYPE_V8SF_V8SF_V8SF
:
10941 case V4DF_FTYPE_V4DF_V4DF_V4DF
:
10942 case V4SF_FTYPE_V4SF_V4SF_V4SF
:
10943 case V2DF_FTYPE_V2DF_V2DF_V2DF
:
10944 case V32QI_FTYPE_V32QI_V32QI_V32QI
:
10945 case UHI_FTYPE_V16SI_V16SI_UHI
:
10946 case UQI_FTYPE_V8DI_V8DI_UQI
:
10947 case V16HI_FTYPE_V16SI_V16HI_UHI
:
10948 case V16QI_FTYPE_V16SI_V16QI_UHI
:
10949 case V16QI_FTYPE_V8DI_V16QI_UQI
:
10950 case V32HF_FTYPE_V32HF_V32HF_USI
:
10951 case V16SF_FTYPE_V16SF_V16SF_UHI
:
10952 case V16SF_FTYPE_V4SF_V16SF_UHI
:
10953 case V16SI_FTYPE_SI_V16SI_UHI
:
10954 case V16SI_FTYPE_V16HI_V16SI_UHI
:
10955 case V16SI_FTYPE_V16QI_V16SI_UHI
:
10956 case V8SF_FTYPE_V4SF_V8SF_UQI
:
10957 case V4DF_FTYPE_V2DF_V4DF_UQI
:
10958 case V8SI_FTYPE_V4SI_V8SI_UQI
:
10959 case V8SI_FTYPE_SI_V8SI_UQI
:
10960 case V4SI_FTYPE_V4SI_V4SI_UQI
:
10961 case V4SI_FTYPE_SI_V4SI_UQI
:
10962 case V4DI_FTYPE_V2DI_V4DI_UQI
:
10963 case V4DI_FTYPE_DI_V4DI_UQI
:
10964 case V2DI_FTYPE_V2DI_V2DI_UQI
:
10965 case V2DI_FTYPE_DI_V2DI_UQI
:
10966 case V64QI_FTYPE_V64QI_V64QI_UDI
:
10967 case V64QI_FTYPE_V16QI_V64QI_UDI
:
10968 case V64QI_FTYPE_QI_V64QI_UDI
:
10969 case V32QI_FTYPE_V32QI_V32QI_USI
:
10970 case V32QI_FTYPE_V16QI_V32QI_USI
:
    case V32QI_FTYPE_QI_V32QI_USI:
    case V16QI_FTYPE_V16QI_V16QI_UHI:
    case V16QI_FTYPE_QI_V16QI_UHI:
    case V32HI_FTYPE_V8HI_V32HI_USI:
    case V32HI_FTYPE_HI_V32HI_USI:
    case V16HI_FTYPE_V8HI_V16HI_UHI:
    case V16HI_FTYPE_HI_V16HI_UHI:
    case V8HI_FTYPE_V8HI_V8HI_UQI:
    case V8HI_FTYPE_HI_V8HI_UQI:
    case V16HF_FTYPE_V16HF_V16HF_UHI:
    case V8SF_FTYPE_V8HI_V8SF_UQI:
    case V4SF_FTYPE_V8HI_V4SF_UQI:
    case V8SI_FTYPE_V8HF_V8SI_UQI:
    case V8SF_FTYPE_V8HF_V8SF_UQI:
    case V8SI_FTYPE_V8SF_V8SI_UQI:
    case V4SI_FTYPE_V4SF_V4SI_UQI:
    case V4SI_FTYPE_V8HF_V4SI_UQI:
    case V4SF_FTYPE_V8HF_V4SF_UQI:
    case V4DI_FTYPE_V8HF_V4DI_UQI:
    case V4DI_FTYPE_V4SF_V4DI_UQI:
    case V2DI_FTYPE_V8HF_V2DI_UQI:
    case V2DI_FTYPE_V4SF_V2DI_UQI:
    case V8HF_FTYPE_V8HF_V8HF_UQI:
    case V8HF_FTYPE_V8HF_V8HF_V8HF:
    case V8HF_FTYPE_V8HI_V8HF_UQI:
    case V8HF_FTYPE_V8SI_V8HF_UQI:
    case V8HF_FTYPE_V8SF_V8HF_UQI:
    case V8HF_FTYPE_V4SI_V8HF_UQI:
    case V8HF_FTYPE_V4SF_V8HF_UQI:
    case V8HF_FTYPE_V4DI_V8HF_UQI:
    case V8HF_FTYPE_V4DF_V8HF_UQI:
    case V8HF_FTYPE_V2DI_V8HF_UQI:
    case V8HF_FTYPE_V2DF_V8HF_UQI:
    case V4SF_FTYPE_V4DI_V4SF_UQI:
    case V4SF_FTYPE_V2DI_V4SF_UQI:
    case V4DF_FTYPE_V4DI_V4DF_UQI:
    case V4DF_FTYPE_V8HF_V4DF_UQI:
    case V2DF_FTYPE_V8HF_V2DF_UQI:
    case V2DF_FTYPE_V2DI_V2DF_UQI:
    case V16QI_FTYPE_V8HI_V16QI_UQI:
    case V16QI_FTYPE_V16HI_V16QI_UHI:
    case V16QI_FTYPE_V4SI_V16QI_UQI:
    case V16QI_FTYPE_V8SI_V16QI_UQI:
    case V8HI_FTYPE_V8HF_V8HI_UQI:
    case V8HI_FTYPE_V4SI_V8HI_UQI:
    case V8HI_FTYPE_V8SI_V8HI_UQI:
    case V16QI_FTYPE_V2DI_V16QI_UQI:
    case V16QI_FTYPE_V4DI_V16QI_UQI:
    case V8HI_FTYPE_V2DI_V8HI_UQI:
    case V8HI_FTYPE_V4DI_V8HI_UQI:
    case V4SI_FTYPE_V2DI_V4SI_UQI:
    case V4SI_FTYPE_V4DI_V4SI_UQI:
    case V32QI_FTYPE_V32HI_V32QI_USI:
    case UHI_FTYPE_V16QI_V16QI_UHI:
    case USI_FTYPE_V32QI_V32QI_USI:
    case UDI_FTYPE_V64QI_V64QI_UDI:
    case UQI_FTYPE_V8HI_V8HI_UQI:
    case UHI_FTYPE_V16HI_V16HI_UHI:
    case USI_FTYPE_V32HI_V32HI_USI:
    case UQI_FTYPE_V4SI_V4SI_UQI:
    case UQI_FTYPE_V8SI_V8SI_UQI:
    case UQI_FTYPE_V2DI_V2DI_UQI:
    case UQI_FTYPE_V4DI_V4DI_UQI:
    case V4SF_FTYPE_V2DF_V4SF_UQI:
    case V4SF_FTYPE_V4DF_V4SF_UQI:
    case V16SI_FTYPE_V16SI_V16SI_UHI:
    case V16SI_FTYPE_V4SI_V16SI_UHI:
    case V2DI_FTYPE_V4SI_V2DI_UQI:
    case V2DI_FTYPE_V8HI_V2DI_UQI:
    case V2DI_FTYPE_V16QI_V2DI_UQI:
    case V4DI_FTYPE_V4DI_V4DI_UQI:
    case V4DI_FTYPE_V4SI_V4DI_UQI:
    case V4DI_FTYPE_V8HI_V4DI_UQI:
    case V4DI_FTYPE_V16QI_V4DI_UQI:
    case V4DI_FTYPE_V4DF_V4DI_UQI:
    case V2DI_FTYPE_V2DF_V2DI_UQI:
    case V4SI_FTYPE_V4DF_V4SI_UQI:
    case V4SI_FTYPE_V2DF_V4SI_UQI:
    case V4SI_FTYPE_V8HI_V4SI_UQI:
    case V4SI_FTYPE_V16QI_V4SI_UQI:
    case V4DI_FTYPE_V4DI_V4DI_V4DI:
    case V8DF_FTYPE_V2DF_V8DF_UQI:
    case V8DF_FTYPE_V4DF_V8DF_UQI:
    case V8DF_FTYPE_V8DF_V8DF_UQI:
    case V8SF_FTYPE_V8SF_V8SF_UQI:
    case V8SF_FTYPE_V8SI_V8SF_UQI:
    case V4DF_FTYPE_V4DF_V4DF_UQI:
    case V4SF_FTYPE_V4SF_V4SF_UQI:
    case V2DF_FTYPE_V2DF_V2DF_UQI:
    case V2DF_FTYPE_V4SF_V2DF_UQI:
    case V2DF_FTYPE_V4SI_V2DF_UQI:
    case V4SF_FTYPE_V4SI_V4SF_UQI:
    case V4DF_FTYPE_V4SF_V4DF_UQI:
    case V4DF_FTYPE_V4SI_V4DF_UQI:
    case V8SI_FTYPE_V8SI_V8SI_UQI:
    case V8SI_FTYPE_V8HI_V8SI_UQI:
    case V8SI_FTYPE_V16QI_V8SI_UQI:
    case V8DF_FTYPE_V8SI_V8DF_UQI:
    case V8DI_FTYPE_DI_V8DI_UQI:
    case V16SF_FTYPE_V8SF_V16SF_UHI:
    case V16SI_FTYPE_V8SI_V16SI_UHI:
    case V16HF_FTYPE_V16HI_V16HF_UHI:
    case V16HF_FTYPE_V16HF_V16HF_V16HF:
    case V16HI_FTYPE_V16HF_V16HI_UHI:
    case V16HI_FTYPE_V16HI_V16HI_UHI:
    case V8HI_FTYPE_V16QI_V8HI_UQI:
    case V16HI_FTYPE_V16QI_V16HI_UHI:
    case V32HI_FTYPE_V32HI_V32HI_USI:
    case V32HI_FTYPE_V32QI_V32HI_USI:
    case V8DI_FTYPE_V16QI_V8DI_UQI:
    case V8DI_FTYPE_V2DI_V8DI_UQI:
    case V8DI_FTYPE_V4DI_V8DI_UQI:
    case V8DI_FTYPE_V8DI_V8DI_UQI:
    case V8DI_FTYPE_V8HI_V8DI_UQI:
    case V8DI_FTYPE_V8SI_V8DI_UQI:
    case V8HI_FTYPE_V8DI_V8HI_UQI:
    case V8SI_FTYPE_V8DI_V8SI_UQI:
    case V4SI_FTYPE_V4SI_V4SI_V4SI:
    case V4DI_FTYPE_V4DI_V4DI_V2DI:
    case V16SI_FTYPE_V16SI_V16SI_V16SI:
    case V8DI_FTYPE_V8DI_V8DI_V8DI:
    case V32HI_FTYPE_V32HI_V32HI_V32HI:
    case V2DI_FTYPE_V2DI_V2DI_V2DI:
    case V16HI_FTYPE_V16HI_V16HI_V16HI:
    case V8SI_FTYPE_V8SI_V8SI_V8SI:
    case V8HI_FTYPE_V8HI_V8HI_V8HI:
    case V32BF_FTYPE_V16SF_V16SF_USI:
    case V16BF_FTYPE_V8SF_V8SF_UHI:
    case V8BF_FTYPE_V4SF_V4SF_UQI:
    case V16BF_FTYPE_V16SF_V16BF_UHI:
    case V8BF_FTYPE_V8SF_V8BF_UQI:
    case V8BF_FTYPE_V4SF_V8BF_UQI:
    case V16SF_FTYPE_V16SF_V32BF_V32BF:
    case V8SF_FTYPE_V8SF_V16BF_V16BF:
    case V4SF_FTYPE_V4SF_V8BF_V8BF:
      nargs = 3;
      break;
    case V32QI_FTYPE_V32QI_V32QI_INT:
    case V16HI_FTYPE_V16HI_V16HI_INT:
    case V16QI_FTYPE_V16QI_V16QI_INT:
    case V4DI_FTYPE_V4DI_V4DI_INT:
    case V8HI_FTYPE_V8HI_V8HI_INT:
    case V8SI_FTYPE_V8SI_V8SI_INT:
    case V8SI_FTYPE_V8SI_V4SI_INT:
    case V8SF_FTYPE_V8SF_V8SF_INT:
    case V8SF_FTYPE_V8SF_V4SF_INT:
    case V4SI_FTYPE_V4SI_V4SI_INT:
    case V4DF_FTYPE_V4DF_V4DF_INT:
    case V16SF_FTYPE_V16SF_V16SF_INT:
    case V16SF_FTYPE_V16SF_V4SF_INT:
    case V16SI_FTYPE_V16SI_V4SI_INT:
    case V4DF_FTYPE_V4DF_V2DF_INT:
    case V4SF_FTYPE_V4SF_V4SF_INT:
    case V2DI_FTYPE_V2DI_V2DI_INT:
    case V4DI_FTYPE_V4DI_V2DI_INT:
    case V2DF_FTYPE_V2DF_V2DF_INT:
    case UQI_FTYPE_V8DI_V8UDI_INT:
    case UQI_FTYPE_V8DF_V8DF_INT:
    case UQI_FTYPE_V2DF_V2DF_INT:
    case UQI_FTYPE_V4SF_V4SF_INT:
    case UHI_FTYPE_V16SI_V16SI_INT:
    case UHI_FTYPE_V16SF_V16SF_INT:
    case V64QI_FTYPE_V64QI_V64QI_INT:
    case V32HI_FTYPE_V32HI_V32HI_INT:
    case V16SI_FTYPE_V16SI_V16SI_INT:
    case V8DI_FTYPE_V8DI_V8DI_INT:
      nargs = 3;
      nargs_constant = 1;
      break;
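
      /* For the *_INT_CONVERT variants the builtin's declared return
         mode differs from the insn's result mode; RMODE records the
         mode the caller expects so the result can be re-interpreted
         via lowpart_subreg below.  */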
    case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
      nargs = 3;
      rmode = V4DImode;
      nargs_constant = 1;
      break;
    case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
      nargs = 3;
      rmode = V2DImode;
      nargs_constant = 1;
      break;
    case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
      nargs = 3;
      rmode = V1DImode;
      nargs_constant = 1;
      break;
    case V2DI_FTYPE_V2DI_UINT_UINT:
      nargs = 3;
      nargs_constant = 2;
      break;
    case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
      nargs = 3;
      rmode = V8DImode;
      nargs_constant = 1;
      break;
    case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
      nargs = 5;
      rmode = V8DImode;
      mask_pos = 2;
      nargs_constant = 1;
      break;
    case QI_FTYPE_V8DF_INT_UQI:
    case QI_FTYPE_V4DF_INT_UQI:
    case QI_FTYPE_V2DF_INT_UQI:
    case HI_FTYPE_V16SF_INT_UHI:
    case QI_FTYPE_V8SF_INT_UQI:
    case QI_FTYPE_V4SF_INT_UQI:
    case QI_FTYPE_V8HF_INT_UQI:
    case HI_FTYPE_V16HF_INT_UHI:
    case SI_FTYPE_V32HF_INT_USI:
    case V4SI_FTYPE_V4SI_V4SI_UHI:
    case V8SI_FTYPE_V8SI_V8SI_UHI:
      nargs = 3;
      mask_pos = 1;
      nargs_constant = 1;
      break;
    case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
      nargs = 5;
      rmode = V4DImode;
      mask_pos = 2;
      nargs_constant = 1;
      break;
    case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
      nargs = 5;
      rmode = V2DImode;
      mask_pos = 2;
      nargs_constant = 1;
      break;
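
      /* Masked forms with four arguments: two sources, a pass-through
         operand to merge with, and the writemask itself.  */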
    case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
    case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
    case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
    case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
    case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
    case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
    case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
    case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
    case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
    case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
    case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
    case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
    case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
    case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
    case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
    case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
    case V32HF_FTYPE_V32HF_V32HF_V32HF_USI:
    case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
    case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
    case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
    case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
    case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
    case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
    case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
    case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
    case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
    case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
    case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
    case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
    case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
    case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
    case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
    case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
    case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
    case V16HF_FTYPE_V16HF_V16HF_V16HF_UQI:
    case V16HF_FTYPE_V16HF_V16HF_V16HF_UHI:
    case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
    case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
    case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
    case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
    case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
    case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
    case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
    case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI:
    case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
    case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
    case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
    case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
    case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
    case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
    case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
    case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
    case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
    case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
    case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
    case V32BF_FTYPE_V16SF_V16SF_V32BF_USI:
    case V16BF_FTYPE_V8SF_V8SF_V16BF_UHI:
    case V8BF_FTYPE_V4SF_V4SF_V8BF_UQI:
      nargs = 4;
      break;
    case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
    case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
    case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
    case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
    case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
    case V4SI_FTYPE_V4SI_V4SI_V4SI_INT:
      nargs = 4;
      nargs_constant = 1;
      break;
    case UQI_FTYPE_V4DI_V4DI_INT_UQI:
    case UQI_FTYPE_V8SI_V8SI_INT_UQI:
    case QI_FTYPE_V4DF_V4DF_INT_UQI:
    case QI_FTYPE_V8SF_V8SF_INT_UQI:
    case UHI_FTYPE_V16HF_V16HF_INT_UHI:
    case UQI_FTYPE_V2DI_V2DI_INT_UQI:
    case UQI_FTYPE_V4SI_V4SI_INT_UQI:
    case UQI_FTYPE_V2DF_V2DF_INT_UQI:
    case UQI_FTYPE_V4SF_V4SF_INT_UQI:
    case UQI_FTYPE_V8HF_V8HF_INT_UQI:
    case UDI_FTYPE_V64QI_V64QI_INT_UDI:
    case USI_FTYPE_V32QI_V32QI_INT_USI:
    case UHI_FTYPE_V16QI_V16QI_INT_UHI:
    case USI_FTYPE_V32HI_V32HI_INT_USI:
    case USI_FTYPE_V32HF_V32HF_INT_USI:
    case UHI_FTYPE_V16HI_V16HI_INT_UHI:
    case UQI_FTYPE_V8HI_V8HI_INT_UQI:
      nargs = 4;
      mask_pos = 1;
      nargs_constant = 1;
      break;
    case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
      nargs = 4;
      nargs_constant = 2;
      break;
    case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
    case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
    case V16SF_FTYPE_V16SF_V32BF_V32BF_UHI:
    case V8SF_FTYPE_V8SF_V16BF_V16BF_UQI:
    case V4SF_FTYPE_V4SF_V8BF_V8BF_UQI:
      nargs = 4;
      break;
    case UQI_FTYPE_V8DI_V8DI_INT_UQI:
    case UHI_FTYPE_V16SI_V16SI_INT_UHI:
      nargs = 4;
      mask_pos = 1;
      nargs_constant = 1;
      break;
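
      /* In these signatures the 8-bit immediate is the second argument,
         with the pass-through operand and the writemask following it.  */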
    case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
    case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
    case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
    case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
    case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
    case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
    case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
    case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
    case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
    case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
    case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
    case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
    case V32HI_FTYPE_V32HI_INT_V32HI_USI:
    case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
    case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
    case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
    case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
    case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
    case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
    case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
    case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
    case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
    case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
    case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
    case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
    case V16HF_FTYPE_V16HF_INT_V16HF_UHI:
    case V8HF_FTYPE_V8HF_INT_V8HF_UQI:
    case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
    case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
    case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
    case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
    case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
      nargs = 4;
      mask_pos = 2;
      nargs_constant = 1;
      break;
    case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
    case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
    case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
    case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
    case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
    case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
    case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
    case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
    case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
    case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
    case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
    case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
    case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
    case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
    case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
    case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
    case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
    case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
    case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
    case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
    case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
    case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
    case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
    case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
    case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
    case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
    case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
      nargs = 5;
      mask_pos = 2;
      nargs_constant = 1;
      break;
    case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
    case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
    case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
    case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
    case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
    case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
    case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
    case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
    case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
    case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
      nargs = 5;
      mask_pos = 1;
      nargs_constant = 1;
      break;
    case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
    case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
    case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
    case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT:
    case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT:
    case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT:
    case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT:
    case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT:
    case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT:
    case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT:
    case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT:
    case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT:
      nargs = 5;
      mask_pos = 1;
      nargs_constant = 2;
      break;
    default:
      gcc_unreachable ();
    }

  gcc_assert (nargs <= ARRAY_SIZE (xops));
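
  /* Comparison builtins carry their condition in COMPARISON and are
     expanded through the common SSE compare helper instead.  */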
11408 if (comparison
!= UNKNOWN
)
11410 gcc_assert (nargs
== 2);
11411 return ix86_expand_sse_compare (d
, exp
, target
, swap
);
  if (rmode == VOIDmode || rmode == tmode)
    {
      if (optimize
          || target == 0
          || GET_MODE (target) != tmode
          || !insn_p->operand[0].predicate (target, tmode))
        target = gen_reg_rtx (tmode);
      else if (memory_operand (target, tmode))
        num_memory++;
      real_target = target;
    }
  else
    {
      real_target = gen_reg_rtx (tmode);
      target = lowpart_subreg (rmode, real_target, tmode);
    }
  for (i = 0; i < nargs; i++)
    {
      tree arg = CALL_EXPR_ARG (exp, i);
      rtx op = expand_normal (arg);
      machine_mode mode = insn_p->operand[i + 1].mode;
      bool match = insn_p->operand[i + 1].predicate (op, mode);

      if (second_arg_count && i == 1)
        {
          /* SIMD shift insns take either an 8-bit immediate or
             register as count.  But builtin functions take int as
             count.  If count doesn't match, we put it in register.
             The instructions are using 64-bit count, if op is just
             32-bit, zero-extend it, as negative shift counts
             are undefined behavior and zero-extension is more
             efficient.  */
          if (!CONST_INT_P (op))
            {
              if (SCALAR_INT_MODE_P (GET_MODE (op)))
                op = convert_modes (mode, GET_MODE (op), op, 1);
              else
                op = lowpart_subreg (mode, op, GET_MODE (op));
              if (!insn_p->operand[i + 1].predicate (op, mode))
                op = copy_to_reg (op);
            }
        }
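
      /* An operand in an immediate position must already satisfy the
         insn's predicate; the required immediate width depends on the
         insn, so diagnose per icode.  */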
      else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
               (!mask_pos && (nargs - i) <= nargs_constant))
        {
          if (!match)
            switch (icode)
              {
              case CODE_FOR_avx_vinsertf128v4di:
              case CODE_FOR_avx_vextractf128v4di:
                error ("the last argument must be an 1-bit immediate");
                return const0_rtx;

              case CODE_FOR_avx512f_cmpv8di3_mask:
              case CODE_FOR_avx512f_cmpv16si3_mask:
              case CODE_FOR_avx512f_ucmpv8di3_mask:
              case CODE_FOR_avx512f_ucmpv16si3_mask:
              case CODE_FOR_avx512vl_cmpv4di3_mask:
              case CODE_FOR_avx512vl_cmpv8si3_mask:
              case CODE_FOR_avx512vl_ucmpv4di3_mask:
              case CODE_FOR_avx512vl_ucmpv8si3_mask:
              case CODE_FOR_avx512vl_cmpv2di3_mask:
              case CODE_FOR_avx512vl_cmpv4si3_mask:
              case CODE_FOR_avx512vl_ucmpv2di3_mask:
              case CODE_FOR_avx512vl_ucmpv4si3_mask:
                error ("the last argument must be a 3-bit immediate");
                return const0_rtx;

              case CODE_FOR_sse4_1_roundsd:
              case CODE_FOR_sse4_1_roundss:

              case CODE_FOR_sse4_1_roundpd:
              case CODE_FOR_sse4_1_roundps:
              case CODE_FOR_avx_roundpd256:
              case CODE_FOR_avx_roundps256:

              case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
              case CODE_FOR_sse4_1_roundps_sfix:
              case CODE_FOR_avx_roundpd_vec_pack_sfix256:
              case CODE_FOR_avx_roundps_sfix256:

              case CODE_FOR_sse4_1_blendps:
              case CODE_FOR_avx_blendpd256:
              case CODE_FOR_avx_vpermilv4df:
              case CODE_FOR_avx_vpermilv4df_mask:
              case CODE_FOR_avx512f_getmantv8df_mask:
              case CODE_FOR_avx512f_getmantv16sf_mask:
              case CODE_FOR_avx512vl_getmantv16hf_mask:
              case CODE_FOR_avx512vl_getmantv8sf_mask:
              case CODE_FOR_avx512vl_getmantv4df_mask:
              case CODE_FOR_avx512fp16_getmantv8hf_mask:
              case CODE_FOR_avx512vl_getmantv4sf_mask:
              case CODE_FOR_avx512vl_getmantv2df_mask:
              case CODE_FOR_avx512dq_rangepv8df_mask_round:
              case CODE_FOR_avx512dq_rangepv16sf_mask_round:
              case CODE_FOR_avx512dq_rangepv4df_mask:
              case CODE_FOR_avx512dq_rangepv8sf_mask:
              case CODE_FOR_avx512dq_rangepv2df_mask:
              case CODE_FOR_avx512dq_rangepv4sf_mask:
              case CODE_FOR_avx_shufpd256_mask:
                error ("the last argument must be a 4-bit immediate");
                return const0_rtx;

              case CODE_FOR_sha1rnds4:
              case CODE_FOR_sse4_1_blendpd:
              case CODE_FOR_avx_vpermilv2df:
              case CODE_FOR_avx_vpermilv2df_mask:
              case CODE_FOR_xop_vpermil2v2df3:
              case CODE_FOR_xop_vpermil2v4sf3:
              case CODE_FOR_xop_vpermil2v4df3:
              case CODE_FOR_xop_vpermil2v8sf3:
              case CODE_FOR_avx512f_vinsertf32x4_mask:
              case CODE_FOR_avx512f_vinserti32x4_mask:
              case CODE_FOR_avx512f_vextractf32x4_mask:
              case CODE_FOR_avx512f_vextracti32x4_mask:
              case CODE_FOR_sse2_shufpd:
              case CODE_FOR_sse2_shufpd_mask:
              case CODE_FOR_avx512dq_shuf_f64x2_mask:
              case CODE_FOR_avx512dq_shuf_i64x2_mask:
              case CODE_FOR_avx512vl_shuf_i32x4_mask:
              case CODE_FOR_avx512vl_shuf_f32x4_mask:
                error ("the last argument must be a 2-bit immediate");
                return const0_rtx;

              case CODE_FOR_avx_vextractf128v4df:
              case CODE_FOR_avx_vextractf128v8sf:
              case CODE_FOR_avx_vextractf128v8si:
              case CODE_FOR_avx_vinsertf128v4df:
              case CODE_FOR_avx_vinsertf128v8sf:
              case CODE_FOR_avx_vinsertf128v8si:
              case CODE_FOR_avx512f_vinsertf64x4_mask:
              case CODE_FOR_avx512f_vinserti64x4_mask:
              case CODE_FOR_avx512f_vextractf64x4_mask:
              case CODE_FOR_avx512f_vextracti64x4_mask:
              case CODE_FOR_avx512dq_vinsertf32x8_mask:
              case CODE_FOR_avx512dq_vinserti32x8_mask:
              case CODE_FOR_avx512vl_vinsertv4df:
              case CODE_FOR_avx512vl_vinsertv4di:
              case CODE_FOR_avx512vl_vinsertv8sf:
              case CODE_FOR_avx512vl_vinsertv8si:
                error ("the last argument must be a 1-bit immediate");
                return const0_rtx;

              case CODE_FOR_avx_vmcmpv2df3:
              case CODE_FOR_avx_vmcmpv4sf3:
              case CODE_FOR_avx_cmpv2df3:
              case CODE_FOR_avx_cmpv4sf3:
              case CODE_FOR_avx_cmpv4df3:
              case CODE_FOR_avx_cmpv8sf3:
              case CODE_FOR_avx512f_cmpv8df3_mask:
              case CODE_FOR_avx512f_cmpv16sf3_mask:
              case CODE_FOR_avx512f_vmcmpv2df3_mask:
              case CODE_FOR_avx512f_vmcmpv4sf3_mask:
              case CODE_FOR_avx512bw_cmpv32hf3_mask:
              case CODE_FOR_avx512vl_cmpv16hf3_mask:
              case CODE_FOR_avx512fp16_cmpv8hf3_mask:
                error ("the last argument must be a 5-bit immediate");
                return const0_rtx;

              default:
                switch (nargs_constant)
                  {
                  case 2:
                    if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
                        (!mask_pos && (nargs - i) == nargs_constant))
                      {
                        error ("the next to last argument must be an 8-bit immediate");
                        break;
                      }
                    /* FALLTHRU */
                  case 1:
                    error ("the last argument must be an 8-bit immediate");
                    break;
                  default:
                    gcc_unreachable ();
                  }
                return const0_rtx;
              }
        }
      else
        {
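          /* Ordinary input operand: legitimize it into a register
             (or a single allowed memory operand).  */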
          if (VECTOR_MODE_P (mode))
            op = safe_vector_operand (op, mode);

          /* If we aren't optimizing, only allow one memory operand to
             be generated.  */
          if (memory_operand (op, mode))
            num_memory++;

          op = fixup_modeless_constant (op, mode);

          if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
            {
              if (optimize || !match || num_memory > 1)
                op = copy_to_mode_reg (mode, op);
            }
          else
            {
              op = copy_to_reg (op);
              op = lowpart_subreg (mode, op, GET_MODE (op));
            }
        }

      xops[i] = op;
    }

  switch (nargs)
    {
    case 1:
      pat = GEN_FCN (icode) (real_target, xops[0]);
      break;
    case 2:
      pat = GEN_FCN (icode) (real_target, xops[0], xops[1]);
      break;
    case 3:
      pat = GEN_FCN (icode) (real_target, xops[0], xops[1], xops[2]);
      break;
    case 4:
      pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
                             xops[2], xops[3]);
      break;
    case 5:
      pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
                             xops[2], xops[3], xops[4]);
      break;
    case 6:
      pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
                             xops[2], xops[3], xops[4], xops[5]);
      break;
    default:
      gcc_unreachable ();
    }

  if (! pat)
    return 0;

  emit_insn (pat);
  return target;
}

/* Transform pattern of following layout:
     (set A
       (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
   into:
     (set (A B))  */

static rtx
ix86_erase_embedded_rounding (rtx pat)
{
  if (GET_CODE (pat) == INSN)
    pat = PATTERN (pat);

  gcc_assert (GET_CODE (pat) == SET);
  rtx src = SET_SRC (pat);
  gcc_assert (XVECLEN (src, 0) == 2);
  rtx p0 = XVECEXP (src, 0, 0);
  gcc_assert (GET_CODE (src) == UNSPEC
              && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
  rtx res = gen_rtx_SET (SET_DEST (pat), p0);
  return res;
}

/* Subroutine of ix86_expand_round_builtin to take care of comi insns
   with rounding.  */

static rtx
ix86_expand_sse_comi_round (const struct builtin_description *d,
                            tree exp, rtx target)
{
  rtx pat, set_dst;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  tree arg2 = CALL_EXPR_ARG (exp, 2);
  tree arg3 = CALL_EXPR_ARG (exp, 3);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  rtx op2 = expand_normal (arg2);
  rtx op3 = expand_normal (arg3);
  enum insn_code icode = d->icode;
  const struct insn_data_d *insn_p = &insn_data[icode];
  machine_mode mode0 = insn_p->operand[0].mode;
  machine_mode mode1 = insn_p->operand[1].mode;

  /* See avxintrin.h for values.  */
  static const enum rtx_code comparisons[32] =
    {
      EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
      UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED,
      EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
      UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED
    };
  static const bool ordereds[32] =
    {
      true,  true,  true,  false, false, false, false, true,
      false, false, false, true,  true,  true,  true,  false,
      true,  true,  true,  false, false, false, false, true,
      false, false, false, true,  true,  true,  true,  false
    };
  static const bool non_signalings[32] =
    {
      true,  false, false, true,  true,  false, false, true,
      true,  false, false, true,  true,  false, false, true,
      false, true,  true,  false, false, true,  true,  false,
      false, true,  true,  false, false, true,  true,  false
    };
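
  /* OP2 selects the comparison predicate and OP3 the rounding mode;
     both come straight from user code, so validate them first.  */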
  if (!CONST_INT_P (op2))
    {
      error ("the third argument must be comparison constant");
      return const0_rtx;
    }
  if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
    {
      error ("incorrect comparison mode");
      return const0_rtx;
    }

  if (!insn_p->operand[2].predicate (op3, SImode))
    {
      error ("incorrect rounding operand");
      return const0_rtx;
    }

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);
  if (VECTOR_MODE_P (mode1))
    op1 = safe_vector_operand (op1, mode1);

  enum rtx_code comparison = comparisons[INTVAL (op2)];
  bool ordered = ordereds[INTVAL (op2)];
  bool non_signaling = non_signalings[INTVAL (op2)];
  rtx const_val = const0_rtx;

  bool check_unordered = false;
  machine_mode mode = CCFPmode;
  switch (comparison)
    {
    case ORDERED:
      if (!ordered)
        {
          /* NB: Use CCSmode/NE for _CMP_TRUE_UQ/_CMP_TRUE_US.  */
          if (!non_signaling)
            ordered = true;
          mode = CCSmode;
        }
      else
        {
          /* NB: Use CCPmode/NE for _CMP_ORD_Q/_CMP_ORD_S.  */
          if (non_signaling)
            ordered = false;
          mode = CCPmode;
        }
      comparison = NE;
      break;
    case UNORDERED:
      if (ordered)
        {
          /* NB: Use CCSmode/EQ for _CMP_FALSE_OQ/_CMP_FALSE_OS.  */
          if (non_signaling)
            ordered = false;
          mode = CCSmode;
        }
      else
        {
          /* NB: Use CCPmode/NE for _CMP_UNORD_Q/_CMP_UNORD_S.  */
          if (!non_signaling)
            ordered = true;
          mode = CCPmode;
        }
      comparison = EQ;
      break;

    case LE:    /* -> GE  */
    case LT:    /* -> GT  */
    case UNGE:  /* -> UNLE  */
    case UNGT:  /* -> UNLT  */
      std::swap (op0, op1);
      comparison = swap_condition (comparison);
      /* FALLTHRU */
    case GT:
    case GE:
    case UNEQ:
    case UNLT:
    case UNLE:
    case LTGT:
      /* These are supported by CCFPmode.  NB: Use ordered/signaling
         COMI or unordered/non-signaling UCOMI.  Both set ZF, PF, CF
         with NAN operands.  */
      if (ordered == non_signaling)
        ordered = !ordered;
      break;
    case EQ:
      /* NB: COMI/UCOMI will set ZF with NAN operands.  Use CCZmode for
         _CMP_EQ_OQ/_CMP_EQ_OS.  */
      check_unordered = true;
      mode = CCZmode;
      break;
    case NE:
      /* NB: COMI/UCOMI will set ZF with NAN operands.  Use CCZmode for
         _CMP_NEQ_UQ/_CMP_NEQ_US.  */
      gcc_assert (!ordered);
      check_unordered = true;
      mode = CCZmode;
      const_val = const1_rtx;
      break;
    default:
      gcc_unreachable ();
    }

  target = gen_reg_rtx (SImode);
  emit_move_insn (target, const_val);
  target = gen_rtx_SUBREG (QImode, target, 0);

  if ((optimize && !register_operand (op0, mode0))
      || !insn_p->operand[0].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
  if ((optimize && !register_operand (op1, mode1))
      || !insn_p->operand[1].predicate (op1, mode1))
    op1 = copy_to_mode_reg (mode1, op1);

  /*
     1. COMI: ordered and signaling.
     2. UCOMI: unordered and non-signaling.
   */
  if (non_signaling)
    icode = (icode == CODE_FOR_sse_comi_round
             ? CODE_FOR_sse_ucomi_round
             : CODE_FOR_sse2_ucomi_round);

  pat = GEN_FCN (icode) (op0, op1, op3);
  if (! pat)
    return 0;

  /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point.  */
  if (INTVAL (op3) == NO_ROUND)
    {
      pat = ix86_erase_embedded_rounding (pat);
      if (! pat)
        return 0;

      set_dst = SET_DEST (pat);
    }
  else
    {
      gcc_assert (GET_CODE (pat) == SET);
      set_dst = SET_DEST (pat);
    }

  emit_insn (pat);

  return ix86_ssecom_setcc (comparison, check_unordered, mode,
                            set_dst, target);
}
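
/* Subroutine of ix86_expand_builtin to take care of builtins whose
   last argument is an explicit rounding-mode immediate.  */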
static rtx
ix86_expand_round_builtin (const struct builtin_description *d,
                           tree exp, rtx target)
{
  rtx pat;
  unsigned int i, nargs;
  rtx xops[6];
  enum insn_code icode = d->icode;
  const struct insn_data_d *insn_p = &insn_data[icode];
  machine_mode tmode = insn_p->operand[0].mode;
  unsigned int nargs_constant = 0;
  unsigned int redundant_embed_rnd = 0;

  switch ((enum ix86_builtin_func_type) d->flag)
    {
    case UINT64_FTYPE_V2DF_INT:
    case UINT64_FTYPE_V4SF_INT:
    case UINT64_FTYPE_V8HF_INT:
    case UINT_FTYPE_V2DF_INT:
    case UINT_FTYPE_V4SF_INT:
    case UINT_FTYPE_V8HF_INT:
    case INT64_FTYPE_V2DF_INT:
    case INT64_FTYPE_V4SF_INT:
    case INT64_FTYPE_V8HF_INT:
    case INT_FTYPE_V2DF_INT:
    case INT_FTYPE_V4SF_INT:
    case INT_FTYPE_V8HF_INT:
      nargs = 2;
      break;
    case V32HF_FTYPE_V32HF_V32HF_INT:
    case V8HF_FTYPE_V8HF_V8HF_INT:
    case V8HF_FTYPE_V8HF_INT_INT:
    case V8HF_FTYPE_V8HF_UINT_INT:
    case V8HF_FTYPE_V8HF_INT64_INT:
    case V8HF_FTYPE_V8HF_UINT64_INT:
    case V4SF_FTYPE_V4SF_UINT_INT:
    case V4SF_FTYPE_V4SF_UINT64_INT:
    case V2DF_FTYPE_V2DF_UINT64_INT:
    case V4SF_FTYPE_V4SF_INT_INT:
    case V4SF_FTYPE_V4SF_INT64_INT:
    case V2DF_FTYPE_V2DF_INT64_INT:
    case V4SF_FTYPE_V4SF_V4SF_INT:
    case V2DF_FTYPE_V2DF_V2DF_INT:
    case V4SF_FTYPE_V4SF_V2DF_INT:
    case V2DF_FTYPE_V2DF_V4SF_INT:
      nargs = 3;
      break;
    case V8SF_FTYPE_V8DF_V8SF_QI_INT:
    case V8DF_FTYPE_V8DF_V8DF_QI_INT:
    case V32HI_FTYPE_V32HF_V32HI_USI_INT:
    case V8SI_FTYPE_V8DF_V8SI_QI_INT:
    case V8DI_FTYPE_V8HF_V8DI_UQI_INT:
    case V8DI_FTYPE_V8DF_V8DI_QI_INT:
    case V8SF_FTYPE_V8DI_V8SF_QI_INT:
    case V8DF_FTYPE_V8DI_V8DF_QI_INT:
    case V8DF_FTYPE_V8HF_V8DF_UQI_INT:
    case V16SF_FTYPE_V16HF_V16SF_UHI_INT:
    case V32HF_FTYPE_V32HI_V32HF_USI_INT:
    case V32HF_FTYPE_V32HF_V32HF_USI_INT:
    case V32HF_FTYPE_V32HF_V32HF_V32HF_INT:
    case V16SF_FTYPE_V16SF_V16SF_HI_INT:
    case V8DI_FTYPE_V8SF_V8DI_QI_INT:
    case V16SF_FTYPE_V16SI_V16SF_HI_INT:
    case V16SI_FTYPE_V16SF_V16SI_HI_INT:
    case V16SI_FTYPE_V16HF_V16SI_UHI_INT:
    case V16HF_FTYPE_V16SI_V16HF_UHI_INT:
    case V8DF_FTYPE_V8SF_V8DF_QI_INT:
    case V16SF_FTYPE_V16HI_V16SF_HI_INT:
    case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
    case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
    case V8HF_FTYPE_V8DI_V8HF_UQI_INT:
    case V8HF_FTYPE_V8DF_V8HF_UQI_INT:
    case V16HF_FTYPE_V16SF_V16HF_UHI_INT:
    case V8HF_FTYPE_V8HF_V8HF_V8HF_INT:
      nargs = 4;
      break;
    case V4SF_FTYPE_V4SF_V4SF_INT_INT:
    case V2DF_FTYPE_V2DF_V2DF_INT_INT:
      nargs_constant = 2;
      nargs = 4;
      break;
    case INT_FTYPE_V4SF_V4SF_INT_INT:
    case INT_FTYPE_V2DF_V2DF_INT_INT:
      return ix86_expand_sse_comi_round (d, exp, target);
    case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
    case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
    case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
    case V4SF_FTYPE_V8HF_V4SF_V4SF_UQI_INT:
    case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
    case V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT:
    case V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT:
    case V2DF_FTYPE_V8HF_V2DF_V2DF_UQI_INT:
    case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
    case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
    case V2DF_FTYPE_V2DF_V4SF_V2DF_UQI_INT:
    case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
    case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
    case V4SF_FTYPE_V4SF_V2DF_V4SF_UQI_INT:
    case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT:
    case V8HF_FTYPE_V2DF_V8HF_V8HF_UQI_INT:
    case V8HF_FTYPE_V4SF_V8HF_V8HF_UQI_INT:
      nargs = 5;
      break;
    case V32HF_FTYPE_V32HF_INT_V32HF_USI_INT:
    case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
    case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
    case V8DF_FTYPE_V8DF_INT_V8DF_UQI_INT:
    case V16SF_FTYPE_V16SF_INT_V16SF_UHI_INT:
      nargs_constant = 4;
      nargs = 5;
      break;
    case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
    case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
    case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
    case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
    case USI_FTYPE_V32HF_V32HF_INT_USI_INT:
    case UQI_FTYPE_V8HF_V8HF_INT_UQI_INT:
      nargs_constant = 3;
      nargs = 5;
      break;
    case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
    case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
    case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
    case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
    case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
    case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
    case V8HF_FTYPE_V8HF_V8HF_INT_V8HF_UQI_INT:
      nargs = 6;
      nargs_constant = 4;
      break;
    case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
    case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
    case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
    case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
      nargs = 6;
      nargs_constant = 3;
      break;
    default:
      gcc_unreachable ();
    }
  gcc_assert (nargs <= ARRAY_SIZE (xops));
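
  /* Allocate a fresh result register unless the caller-provided target
     already satisfies the insn's operand 0 predicate.  */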
  if (optimize
      || target == 0
      || GET_MODE (target) != tmode
      || !insn_p->operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);

  for (i = 0; i < nargs; i++)
    {
      tree arg = CALL_EXPR_ARG (exp, i);
      rtx op = expand_normal (arg);
      machine_mode mode = insn_p->operand[i + 1].mode;
      bool match = insn_p->operand[i + 1].predicate (op, mode);

      if (i == nargs - nargs_constant)
        {
          if (!match)
            {
              switch (icode)
                {
                case CODE_FOR_avx512f_getmantv8df_mask_round:
                case CODE_FOR_avx512f_getmantv16sf_mask_round:
                case CODE_FOR_avx512bw_getmantv32hf_mask_round:
                case CODE_FOR_avx512f_vgetmantv2df_round:
                case CODE_FOR_avx512f_vgetmantv2df_mask_round:
                case CODE_FOR_avx512f_vgetmantv4sf_round:
                case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
                case CODE_FOR_avx512f_vgetmantv8hf_mask_round:
                  error ("the immediate argument must be a 4-bit immediate");
                  return const0_rtx;
                case CODE_FOR_avx512f_cmpv8df3_mask_round:
                case CODE_FOR_avx512f_cmpv16sf3_mask_round:
                case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
                case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
                case CODE_FOR_avx512f_vmcmpv8hf3_mask_round:
                case CODE_FOR_avx512bw_cmpv32hf3_mask_round:
                  error ("the immediate argument must be a 5-bit immediate");
                  return const0_rtx;
                default:
                  error ("the immediate argument must be an 8-bit immediate");
                  return const0_rtx;
                }
            }
        }
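      /* The final argument carries the rounding-mode immediate.  */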
      else if (i == nargs-1)
        {
          if (!insn_p->operand[nargs].predicate (op, SImode))
            {
              error ("incorrect rounding operand");
              return const0_rtx;
            }

          /* If there is no rounding use the normal version of the pattern.  */
          if (INTVAL (op) == NO_ROUND)
            {
              /* Skip erasing the embedded rounding for the expanders
                 below, which generate multiple insns.  In
                 ix86_erase_embedded_rounding the pattern will be
                 transformed to a single set, and emit_insn appends the
                 set instead of inserting it into the chain, so the insns
                 emitted inside the define_expand would be ignored.  */
              switch (icode)
                {
                case CODE_FOR_avx512bw_fmaddc_v32hf_mask1_round:
                case CODE_FOR_avx512bw_fcmaddc_v32hf_mask1_round:
                case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask1_round:
                case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask1_round:
                case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask3_round:
                case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask3_round:
                  redundant_embed_rnd = 0;
                  break;
                default:
                  redundant_embed_rnd = 1;
                  break;
                }
            }
        }
      else
        {
          if (VECTOR_MODE_P (mode))
            op = safe_vector_operand (op, mode);

          op = fixup_modeless_constant (op, mode);

          if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
            {
              if (optimize || !match)
                op = copy_to_mode_reg (mode, op);
            }
          else
            {
              op = copy_to_reg (op);
              op = lowpart_subreg (mode, op, GET_MODE (op));
            }
        }

      xops[i] = op;
    }

  switch (nargs)
    {
    case 1:
      pat = GEN_FCN (icode) (target, xops[0]);
      break;
    case 2:
      pat = GEN_FCN (icode) (target, xops[0], xops[1]);
      break;
    case 3:
      pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
      break;
    case 4:
      pat = GEN_FCN (icode) (target, xops[0], xops[1],
                             xops[2], xops[3]);
      break;
    case 5:
      pat = GEN_FCN (icode) (target, xops[0], xops[1],
                             xops[2], xops[3], xops[4]);
      break;
    case 6:
      pat = GEN_FCN (icode) (target, xops[0], xops[1],
                             xops[2], xops[3], xops[4], xops[5]);
      break;
    default:
      gcc_unreachable ();
    }

  if (!pat)
    return 0;

  if (redundant_embed_rnd)
    pat = ix86_erase_embedded_rounding (pat);

  emit_insn (pat);
  return target;
}

/* Subroutine of ix86_expand_builtin to take care of special insns
   with variable number of operands.  */

static rtx
ix86_expand_special_args_builtin (const struct builtin_description *d,
                                  tree exp, rtx target)
{
  tree arg;
  rtx pat, op;
  unsigned int i, nargs, arg_adjust, memory;
  unsigned int constant = 100;
  bool aligned_mem = false;
  rtx xops[4];
  enum insn_code icode = d->icode;
  const struct insn_data_d *insn_p = &insn_data[icode];
  machine_mode tmode = insn_p->operand[0].mode;
  enum { load, store } klass;
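
  /* Classify the builtin as a load or a store and record which operand,
     if any, is the memory operand and which the constant one.  */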
  switch ((enum ix86_builtin_func_type) d->flag)
    {
    case VOID_FTYPE_VOID:
      emit_insn (GEN_FCN (icode) (target));
      return 0;
    case VOID_FTYPE_UINT64:
    case VOID_FTYPE_UNSIGNED:
      nargs = 0;
      klass = store;
      memory = 0;
      break;

    case INT_FTYPE_VOID:
    case USHORT_FTYPE_VOID:
    case UINT64_FTYPE_VOID:
    case UINT_FTYPE_VOID:
    case UINT8_FTYPE_VOID:
    case UNSIGNED_FTYPE_VOID:
      nargs = 0;
      klass = load;
      memory = 0;
      break;
    case UINT64_FTYPE_PUNSIGNED:
    case V2DI_FTYPE_PV2DI:
    case V4DI_FTYPE_PV4DI:
    case V32QI_FTYPE_PCCHAR:
    case V16QI_FTYPE_PCCHAR:
    case V8SF_FTYPE_PCV4SF:
    case V8SF_FTYPE_PCFLOAT:
    case V4SF_FTYPE_PCFLOAT:
    case V4SF_FTYPE_PCFLOAT16:
    case V4SF_FTYPE_PCBFLOAT16:
    case V4SF_FTYPE_PCV8BF:
    case V4SF_FTYPE_PCV8HF:
    case V8SF_FTYPE_PCFLOAT16:
    case V8SF_FTYPE_PCBFLOAT16:
    case V8SF_FTYPE_PCV16HF:
    case V8SF_FTYPE_PCV16BF:
    case V4DF_FTYPE_PCV2DF:
    case V4DF_FTYPE_PCDOUBLE:
    case V2DF_FTYPE_PCDOUBLE:
    case VOID_FTYPE_PVOID:
    case V8DI_FTYPE_PV8DI:
      nargs = 1;
      klass = load;
      memory = 0;
      switch (icode)
        {
        case CODE_FOR_sse4_1_movntdqa:
        case CODE_FOR_avx2_movntdqa:
        case CODE_FOR_avx512f_movntdqa:
          aligned_mem = true;
          break;
        default:
          break;
        }
      break;
    case VOID_FTYPE_PV2SF_V4SF:
    case VOID_FTYPE_PV8DI_V8DI:
    case VOID_FTYPE_PV4DI_V4DI:
    case VOID_FTYPE_PV2DI_V2DI:
    case VOID_FTYPE_PCHAR_V32QI:
    case VOID_FTYPE_PCHAR_V16QI:
    case VOID_FTYPE_PFLOAT_V16SF:
    case VOID_FTYPE_PFLOAT_V8SF:
    case VOID_FTYPE_PFLOAT_V4SF:
    case VOID_FTYPE_PDOUBLE_V8DF:
    case VOID_FTYPE_PDOUBLE_V4DF:
    case VOID_FTYPE_PDOUBLE_V2DF:
    case VOID_FTYPE_PLONGLONG_LONGLONG:
    case VOID_FTYPE_PULONGLONG_ULONGLONG:
    case VOID_FTYPE_PUNSIGNED_UNSIGNED:
    case VOID_FTYPE_PINT_INT:
      nargs = 1;
      klass = store;
      /* Reserve memory operand for target.  */
      memory = ARRAY_SIZE (xops);
      switch (icode)
        {
        /* These builtins and instructions require the memory
           to be properly aligned.  */
        case CODE_FOR_avx_movntv4di:
        case CODE_FOR_sse2_movntv2di:
        case CODE_FOR_avx_movntv8sf:
        case CODE_FOR_sse_movntv4sf:
        case CODE_FOR_sse4a_vmmovntv4sf:
        case CODE_FOR_avx_movntv4df:
        case CODE_FOR_sse2_movntv2df:
        case CODE_FOR_sse4a_vmmovntv2df:
        case CODE_FOR_sse2_movntidi:
        case CODE_FOR_sse_movntq:
        case CODE_FOR_sse2_movntisi:
        case CODE_FOR_avx512f_movntv16sf:
        case CODE_FOR_avx512f_movntv8df:
        case CODE_FOR_avx512f_movntv8di:
          aligned_mem = true;
          break;
        default:
          break;
        }
      break;
    case VOID_FTYPE_PVOID_PCVOID:
      nargs = 1;
      klass = store;
      memory = 0;

      break;
    case V4SF_FTYPE_V4SF_PCV2SF:
    case V2DF_FTYPE_V2DF_PCDOUBLE:
      nargs = 2;
      klass = load;
      memory = 1;
      break;
    case V8SF_FTYPE_PCV8SF_V8SI:
    case V4DF_FTYPE_PCV4DF_V4DI:
    case V4SF_FTYPE_PCV4SF_V4SI:
    case V2DF_FTYPE_PCV2DF_V2DI:
    case V8SI_FTYPE_PCV8SI_V8SI:
    case V4DI_FTYPE_PCV4DI_V4DI:
    case V4SI_FTYPE_PCV4SI_V4SI:
    case V2DI_FTYPE_PCV2DI_V2DI:
    case VOID_FTYPE_INT_INT64:
      nargs = 2;
      klass = load;
      memory = 0;
      break;
    case VOID_FTYPE_PV8DF_V8DF_UQI:
    case VOID_FTYPE_PV4DF_V4DF_UQI:
    case VOID_FTYPE_PV2DF_V2DF_UQI:
    case VOID_FTYPE_PV16SF_V16SF_UHI:
    case VOID_FTYPE_PV8SF_V8SF_UQI:
    case VOID_FTYPE_PV4SF_V4SF_UQI:
    case VOID_FTYPE_PV8DI_V8DI_UQI:
    case VOID_FTYPE_PV4DI_V4DI_UQI:
    case VOID_FTYPE_PV2DI_V2DI_UQI:
    case VOID_FTYPE_PV16SI_V16SI_UHI:
    case VOID_FTYPE_PV8SI_V8SI_UQI:
    case VOID_FTYPE_PV4SI_V4SI_UQI:
    case VOID_FTYPE_PV64QI_V64QI_UDI:
    case VOID_FTYPE_PV32HI_V32HI_USI:
    case VOID_FTYPE_PV32QI_V32QI_USI:
    case VOID_FTYPE_PV16QI_V16QI_UHI:
    case VOID_FTYPE_PV16HI_V16HI_UHI:
    case VOID_FTYPE_PV8HI_V8HI_UQI:
      switch (icode)
        {
        /* These builtins and instructions require the memory
           to be properly aligned.  */
        case CODE_FOR_avx512f_storev16sf_mask:
        case CODE_FOR_avx512f_storev16si_mask:
        case CODE_FOR_avx512f_storev8df_mask:
        case CODE_FOR_avx512f_storev8di_mask:
        case CODE_FOR_avx512vl_storev8sf_mask:
        case CODE_FOR_avx512vl_storev8si_mask:
        case CODE_FOR_avx512vl_storev4df_mask:
        case CODE_FOR_avx512vl_storev4di_mask:
        case CODE_FOR_avx512vl_storev4sf_mask:
        case CODE_FOR_avx512vl_storev4si_mask:
        case CODE_FOR_avx512vl_storev2df_mask:
        case CODE_FOR_avx512vl_storev2di_mask:
          aligned_mem = true;
          break;
        default:
          break;
        }
      /* FALLTHRU */
    case VOID_FTYPE_PV8SF_V8SI_V8SF:
    case VOID_FTYPE_PV4DF_V4DI_V4DF:
    case VOID_FTYPE_PV4SF_V4SI_V4SF:
    case VOID_FTYPE_PV2DF_V2DI_V2DF:
    case VOID_FTYPE_PV8SI_V8SI_V8SI:
    case VOID_FTYPE_PV4DI_V4DI_V4DI:
    case VOID_FTYPE_PV4SI_V4SI_V4SI:
    case VOID_FTYPE_PV2DI_V2DI_V2DI:
    case VOID_FTYPE_PV8SI_V8DI_UQI:
    case VOID_FTYPE_PV8HI_V8DI_UQI:
    case VOID_FTYPE_PV16HI_V16SI_UHI:
    case VOID_FTYPE_PUDI_V8DI_UQI:
    case VOID_FTYPE_PV16QI_V16SI_UHI:
    case VOID_FTYPE_PV4SI_V4DI_UQI:
    case VOID_FTYPE_PUDI_V2DI_UQI:
    case VOID_FTYPE_PUDI_V4DI_UQI:
    case VOID_FTYPE_PUSI_V2DI_UQI:
    case VOID_FTYPE_PV8HI_V8SI_UQI:
    case VOID_FTYPE_PUDI_V4SI_UQI:
    case VOID_FTYPE_PUSI_V4DI_UQI:
    case VOID_FTYPE_PUHI_V2DI_UQI:
    case VOID_FTYPE_PUDI_V8SI_UQI:
    case VOID_FTYPE_PUSI_V4SI_UQI:
    case VOID_FTYPE_PCHAR_V64QI_UDI:
    case VOID_FTYPE_PCHAR_V32QI_USI:
    case VOID_FTYPE_PCHAR_V16QI_UHI:
    case VOID_FTYPE_PSHORT_V32HI_USI:
    case VOID_FTYPE_PSHORT_V16HI_UHI:
    case VOID_FTYPE_PSHORT_V8HI_UQI:
    case VOID_FTYPE_PINT_V16SI_UHI:
    case VOID_FTYPE_PINT_V8SI_UQI:
    case VOID_FTYPE_PINT_V4SI_UQI:
    case VOID_FTYPE_PINT64_V8DI_UQI:
    case VOID_FTYPE_PINT64_V4DI_UQI:
    case VOID_FTYPE_PINT64_V2DI_UQI:
    case VOID_FTYPE_PDOUBLE_V8DF_UQI:
    case VOID_FTYPE_PDOUBLE_V4DF_UQI:
    case VOID_FTYPE_PDOUBLE_V2DF_UQI:
    case VOID_FTYPE_PFLOAT_V16SF_UHI:
    case VOID_FTYPE_PFLOAT_V8SF_UQI:
    case VOID_FTYPE_PFLOAT_V4SF_UQI:
    case VOID_FTYPE_PCFLOAT16_V8HF_UQI:
    case VOID_FTYPE_PV32QI_V32HI_USI:
    case VOID_FTYPE_PV16QI_V16HI_UHI:
    case VOID_FTYPE_PUDI_V8HI_UQI:
      nargs = 2;
      klass = store;
      /* Reserve memory operand for target.  */
      memory = ARRAY_SIZE (xops);
      break;
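
    /* Masked loads: the trailing mask operand is kept as-is when it is
       all-ones, so the expander can simplify it to an unmasked load.  */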
    case V4SF_FTYPE_PCV4SF_V4SF_UQI:
    case V8SF_FTYPE_PCV8SF_V8SF_UQI:
    case V16SF_FTYPE_PCV16SF_V16SF_UHI:
    case V4SI_FTYPE_PCV4SI_V4SI_UQI:
    case V8SI_FTYPE_PCV8SI_V8SI_UQI:
    case V16SI_FTYPE_PCV16SI_V16SI_UHI:
    case V2DF_FTYPE_PCV2DF_V2DF_UQI:
    case V4DF_FTYPE_PCV4DF_V4DF_UQI:
    case V8DF_FTYPE_PCV8DF_V8DF_UQI:
    case V2DI_FTYPE_PCV2DI_V2DI_UQI:
    case V4DI_FTYPE_PCV4DI_V4DI_UQI:
    case V8DI_FTYPE_PCV8DI_V8DI_UQI:
    case V64QI_FTYPE_PCV64QI_V64QI_UDI:
    case V32HI_FTYPE_PCV32HI_V32HI_USI:
    case V32QI_FTYPE_PCV32QI_V32QI_USI:
    case V16QI_FTYPE_PCV16QI_V16QI_UHI:
    case V16HI_FTYPE_PCV16HI_V16HI_UHI:
    case V8HI_FTYPE_PCV8HI_V8HI_UQI:
      switch (icode)
        {
        /* These builtins and instructions require the memory
           to be properly aligned.  */
        case CODE_FOR_avx512f_loadv16sf_mask:
        case CODE_FOR_avx512f_loadv16si_mask:
        case CODE_FOR_avx512f_loadv8df_mask:
        case CODE_FOR_avx512f_loadv8di_mask:
        case CODE_FOR_avx512vl_loadv8sf_mask:
        case CODE_FOR_avx512vl_loadv8si_mask:
        case CODE_FOR_avx512vl_loadv4df_mask:
        case CODE_FOR_avx512vl_loadv4di_mask:
        case CODE_FOR_avx512vl_loadv4sf_mask:
        case CODE_FOR_avx512vl_loadv4si_mask:
        case CODE_FOR_avx512vl_loadv2df_mask:
        case CODE_FOR_avx512vl_loadv2di_mask:
        case CODE_FOR_avx512bw_loadv64qi_mask:
        case CODE_FOR_avx512vl_loadv32qi_mask:
        case CODE_FOR_avx512vl_loadv16qi_mask:
        case CODE_FOR_avx512bw_loadv32hi_mask:
        case CODE_FOR_avx512vl_loadv16hi_mask:
        case CODE_FOR_avx512vl_loadv8hi_mask:
          aligned_mem = true;
          break;
        default:
          break;
        }
      /* FALLTHRU */
    case V64QI_FTYPE_PCCHAR_V64QI_UDI:
    case V32QI_FTYPE_PCCHAR_V32QI_USI:
    case V16QI_FTYPE_PCCHAR_V16QI_UHI:
    case V32HI_FTYPE_PCSHORT_V32HI_USI:
    case V16HI_FTYPE_PCSHORT_V16HI_UHI:
    case V8HI_FTYPE_PCSHORT_V8HI_UQI:
    case V16SI_FTYPE_PCINT_V16SI_UHI:
    case V8SI_FTYPE_PCINT_V8SI_UQI:
    case V4SI_FTYPE_PCINT_V4SI_UQI:
    case V8DI_FTYPE_PCINT64_V8DI_UQI:
    case V4DI_FTYPE_PCINT64_V4DI_UQI:
    case V2DI_FTYPE_PCINT64_V2DI_UQI:
    case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
    case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
    case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
    case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
    case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
    case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
    case V8HF_FTYPE_PCFLOAT16_V8HF_UQI:
      nargs = 3;
      klass = load;
      memory = 0;
      break;
    case INT_FTYPE_PINT_INT_INT_INT:
    case LONGLONG_FTYPE_PLONGLONG_LONGLONG_LONGLONG_INT:
      nargs = 4;
      klass = load;
      memory = 0;
      constant = 3;
      break;
    default:
      gcc_unreachable ();
    }
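
  /* For stores the first call argument is the destination, so it becomes
     the insn's target; loads get an ordinary register target.  */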
  gcc_assert (nargs <= ARRAY_SIZE (xops));

  if (klass == store)
    {
      arg = CALL_EXPR_ARG (exp, 0);
      op = expand_normal (arg);
      gcc_assert (target == 0);
      if (memory)
        {
          op = ix86_zero_extend_to_Pmode (op);
          target = gen_rtx_MEM (tmode, op);
          /* target at this point has just BITS_PER_UNIT MEM_ALIGN
             on it.  Try to improve it using get_pointer_alignment,
             and if the special builtin is one that requires strict
             mode alignment, also from its GET_MODE_ALIGNMENT.
             Failure to do so could lead to ix86_legitimate_combined_insn
             rejecting all changes to such insns.  */
          unsigned int align = get_pointer_alignment (arg);
          if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
            align = GET_MODE_ALIGNMENT (tmode);
          if (MEM_ALIGN (target) < align)
            set_mem_align (target, align);
        }
      else
        target = force_reg (tmode, op);
      arg_adjust = 1;
    }
  else
    {
      arg_adjust = 0;
      if (optimize
          || target == 0
          || !register_operand (target, tmode)
          || GET_MODE (target) != tmode)
        target = gen_reg_rtx (tmode);
    }

  for (i = 0; i < nargs; i++)
    {
      machine_mode mode = insn_p->operand[i + 1].mode;

      arg = CALL_EXPR_ARG (exp, i + arg_adjust);
      op = expand_normal (arg);

      if (i == memory)
        {
          /* This must be the memory operand.  */
          op = ix86_zero_extend_to_Pmode (op);
          op = gen_rtx_MEM (mode, op);
          /* op at this point has just BITS_PER_UNIT MEM_ALIGN
             on it.  Try to improve it using get_pointer_alignment,
             and if the special builtin is one that requires strict
             mode alignment, also from its GET_MODE_ALIGNMENT.
             Failure to do so could lead to ix86_legitimate_combined_insn
             rejecting all changes to such insns.  */
          unsigned int align = get_pointer_alignment (arg);
          if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
            align = GET_MODE_ALIGNMENT (mode);
          if (MEM_ALIGN (op) < align)
            set_mem_align (op, align);
        }
      else if (i == constant)
        {
          /* This must be the constant.  */
          if (!insn_p->operand[nargs].predicate (op, SImode))
            {
              error ("the fourth argument must be one of enum %qs", "_CMPCCX_ENUM");
              return const0_rtx;
            }
        }
      else
        {
          /* This must be a register.  */
          if (VECTOR_MODE_P (mode))
            op = safe_vector_operand (op, mode);

          op = fixup_modeless_constant (op, mode);

          /* NB: A 3-operand load implies a mask load or v{p}expand*,
             and the mask operand should be at the end.
             Keep an all-ones mask, which would be simplified by the
             expander.  */
          if (nargs == 3 && i == 2 && klass == load
              && constm1_operand (op, mode)
              && insn_p->operand[i].predicate (op, mode))
            ;
          else if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
            op = copy_to_mode_reg (mode, op);
          else
            {
              op = copy_to_reg (op);
              op = lowpart_subreg (mode, op, GET_MODE (op));
            }
        }

      xops[i] = op;
    }

  switch (nargs)
    {
    case 0:
      pat = GEN_FCN (icode) (target);
      break;
    case 1:
      pat = GEN_FCN (icode) (target, xops[0]);
      break;
    case 2:
      pat = GEN_FCN (icode) (target, xops[0], xops[1]);
      break;
    case 3:
      pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
      break;
    case 4:
      pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2], xops[3]);
      break;
    default:
      gcc_unreachable ();
    }

  if (! pat)
    return 0;

  emit_insn (pat);
  return klass == store ? 0 : target;
}

/* Return the integer constant in ARG.  Constrain it to be in the range
   of the subparts of VEC_TYPE; issue an error if not.  */

static unsigned HOST_WIDE_INT
get_element_number (tree vec_type, tree arg)
{
  unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;

  if (!tree_fits_uhwi_p (arg)
      || (elt = tree_to_uhwi (arg), elt > max))
    {
      error ("selector must be an integer constant in the range "
             "[0, %wu]", max);
      return 0;
    }

  return elt;
}

/* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
   ix86_expand_vector_init.  We DO have language-level syntax for this, in
   the form of (type){ init-list }.  Except that since we can't place emms
   instructions from inside the compiler, we can't allow the use of MMX
   registers unless the user explicitly asks for it.  So we do *not* define
   vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md.  Instead
   we have builtins invoked by mmintrin.h that gives us license to emit
   these sorts of instructions.  */

static rtx
ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
{
  machine_mode tmode = TYPE_MODE (type);
  machine_mode inner_mode = GET_MODE_INNER (tmode);
  int i, n_elt = GET_MODE_NUNITS (tmode);
  rtvec v = rtvec_alloc (n_elt);

  gcc_assert (VECTOR_MODE_P (tmode));
  gcc_assert (call_expr_nargs (exp) == n_elt);

  for (i = 0; i < n_elt; ++i)
    {
      rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
      RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
    }

  if (!target || !register_operand (target, tmode))
    target = gen_reg_rtx (tmode);

  ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
  return target;
}

/* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
   ix86_expand_vector_extract.  They would be redundant (for non-MMX) if we
   had a language-level syntax for referencing vector elements.  */

static rtx
ix86_expand_vec_ext_builtin (tree exp, rtx target)
{
  machine_mode tmode, mode0;
  tree arg0, arg1;
  int elt;
  rtx op0;

  arg0 = CALL_EXPR_ARG (exp, 0);
  arg1 = CALL_EXPR_ARG (exp, 1);

  op0 = expand_normal (arg0);
  elt = get_element_number (TREE_TYPE (arg0), arg1);

  tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
  mode0 = TYPE_MODE (TREE_TYPE (arg0));
  gcc_assert (VECTOR_MODE_P (mode0));

  op0 = force_reg (mode0, op0);

  if (optimize || !target || !register_operand (target, tmode))
    target = gen_reg_rtx (tmode);

  ix86_expand_vector_extract (true, target, op0, elt);

  return target;
}

/* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
   ix86_expand_vector_set.  They would be redundant (for non-MMX) if we had
   a language-level syntax for referencing vector elements.  */

static rtx
ix86_expand_vec_set_builtin (tree exp)
{
  machine_mode tmode, mode1;
  tree arg0, arg1, arg2;
  int elt;
  rtx op0, op1, target;

  arg0 = CALL_EXPR_ARG (exp, 0);
  arg1 = CALL_EXPR_ARG (exp, 1);
  arg2 = CALL_EXPR_ARG (exp, 2);

  tmode = TYPE_MODE (TREE_TYPE (arg0));
  mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
  gcc_assert (VECTOR_MODE_P (tmode));

  op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
  op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
  elt = get_element_number (TREE_TYPE (arg0), arg2);

  if (GET_MODE (op1) != mode1)
    op1 = convert_modes (mode1, GET_MODE (op1), op1, true);

  op0 = force_reg (tmode, op0);
  op1 = force_reg (mode1, op1);

  /* OP0 is the source of these builtin functions and shouldn't be
     modified.  Create a copy, use it and return it as target.  */
  target = gen_reg_rtx (tmode);
  emit_move_insn (target, op0);
  ix86_expand_vector_set (true, target, op1, elt);

  return target;
}

/* Return true if the necessary isa options for this builtin exist,
   else false.
   fcode = DECL_MD_FUNCTION_CODE (fndecl);  */
bool
ix86_check_builtin_isa_match (unsigned int fcode,
                              HOST_WIDE_INT* pbisa,
                              HOST_WIDE_INT* pbisa2)
{
  HOST_WIDE_INT isa = ix86_isa_flags;
  HOST_WIDE_INT isa2 = ix86_isa_flags2;
  HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
  HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
  HOST_WIDE_INT tmp_isa = isa, tmp_isa2 = isa2;
  /* The general case is we require all the ISAs specified in bisa{,2}
     to be enabled.
     The exceptions are:
     OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
     OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
     OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
     (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL) or
       OPTION_MASK_ISA2_AVXVNNI
     (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL) or
       OPTION_MASK_ISA2_AVXIFMA
     (OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA2_AVX512BF16) or
       OPTION_MASK_ISA2_AVXNECONVERT
     OPTION_MASK_ISA_AES or (OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA2_VAES)
     where for each such pair it is sufficient if either of the ISAs is
     enabled, plus if it is ored with other options also those others.
     OPTION_MASK_ISA_MMX in bisa is satisfied also if TARGET_MMX_WITH_SSE.  */

#define SHARE_BUILTIN(A1, A2, B1, B2) \
  if ((((bisa & (A1)) == (A1) && (bisa2 & (A2)) == (A2)) \
       && ((bisa & (B1)) == (B1) && (bisa2 & (B2)) == (B2))) \
      && (((isa & (A1)) == (A1) && (isa2 & (A2)) == (A2)) \
          || ((isa & (B1)) == (B1) && (isa2 & (B2)) == (B2)))) \
    { \
      tmp_isa |= (A1) | (B1); \
      tmp_isa2 |= (A2) | (B2); \
    }
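
  /* Each SHARE_BUILTIN invocation below names two ISA combinations,
     either of which is sufficient to enable the shared builtins.  */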
  SHARE_BUILTIN (OPTION_MASK_ISA_SSE, 0, OPTION_MASK_ISA_3DNOW_A, 0);
  SHARE_BUILTIN (OPTION_MASK_ISA_SSE4_2, 0, OPTION_MASK_ISA_CRC32, 0);
  SHARE_BUILTIN (OPTION_MASK_ISA_FMA, 0, OPTION_MASK_ISA_FMA4, 0);
  SHARE_BUILTIN (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, 0,
                 OPTION_MASK_ISA2_AVXVNNI);
  SHARE_BUILTIN (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL, 0, 0,
                 OPTION_MASK_ISA2_AVXIFMA);
  SHARE_BUILTIN (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512BF16, 0,
                 OPTION_MASK_ISA2_AVXNECONVERT);
  SHARE_BUILTIN (OPTION_MASK_ISA_AES, 0, OPTION_MASK_ISA_AVX512VL,
                 OPTION_MASK_ISA2_VAES);
  isa = tmp_isa;
  isa2 = tmp_isa2;

  if ((bisa & OPTION_MASK_ISA_MMX) && !TARGET_MMX && TARGET_MMX_WITH_SSE
      /* __builtin_ia32_maskmovq requires MMX registers.  */
      && fcode != IX86_BUILTIN_MASKMOVQ)
    {
      bisa &= ~OPTION_MASK_ISA_MMX;
      bisa |= OPTION_MASK_ISA_SSE2;
    }

  if (pbisa)
    *pbisa = bisa;
  if (pbisa2)
    *pbisa2 = bisa2;

  return (bisa & isa) == bisa && (bisa2 & isa2) == bisa2;
}

/* Emit instructions to set the carry flag from ARG.  */

static void
ix86_expand_carry (rtx arg)
{
  if (!CONST_INT_P (arg) || arg == const0_rtx)
    {
      arg = convert_to_mode (QImode, arg, 1);
      arg = copy_to_mode_reg (QImode, arg);
      emit_insn (gen_addqi3_cconly_overflow (arg, constm1_rtx));
    }
  else
    emit_insn (gen_x86_stc ());
}

/* Expand an expression EXP that calls a built-in function,
   with result going to TARGET if that's convenient
   (and in mode MODE if that's convenient).
   SUBTARGET may be used as the target for computing one of EXP's operands.
   IGNORE is nonzero if the value is to be ignored.  */

rtx
ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
                     machine_mode mode, int ignore)
{
  size_t i;
  enum insn_code icode, icode2;
  tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
  tree arg0, arg1, arg2, arg3, arg4;
  rtx op0, op1, op2, op3, op4, pat, pat2, insn;
  machine_mode mode0, mode1, mode2, mode3, mode4;
  unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
  HOST_WIDE_INT bisa, bisa2;

  /* For CPU builtins that can be folded, fold first and expand the fold.  */
  switch (fcode)
    {
    case IX86_BUILTIN_CPU_INIT:
      {
        /* Make it call __cpu_indicator_init in libgcc.  */
        tree call_expr, fndecl, type;
        type = build_function_type_list (integer_type_node, NULL_TREE);
        fndecl = build_fn_decl ("__cpu_indicator_init", type);
        call_expr = build_call_expr (fndecl, 0);
        return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
      }
    case IX86_BUILTIN_CPU_IS:
    case IX86_BUILTIN_CPU_SUPPORTS:
      {
        tree arg0 = CALL_EXPR_ARG (exp, 0);
        tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
        gcc_assert (fold_expr != NULL_TREE);
        return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
      }
    default:
      break;
    }

  if (!ix86_check_builtin_isa_match (fcode, &bisa, &bisa2))
    {
      bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT;
      if (TARGET_ABI_X32)
        bisa |= OPTION_MASK_ABI_X32;
      else
        bisa |= OPTION_MASK_ABI_64;
      char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
                                       (enum fpmath_unit) 0,
                                       (enum prefer_vector_width) 0,
                                       PVW_NONE, PVW_NONE,
                                       false, add_abi_p);
      if (!opts)
        error ("%qE needs unknown isa option", fndecl);
      else
        {
          gcc_assert (opts != NULL);
          error ("%qE needs isa option %s", fndecl, opts);
          free (opts);
        }
      return expand_call (exp, target, ignore);
    }
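
  /* The remaining special-case builtins are expanded individually
     below.  */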
  switch (fcode)
    {
    case IX86_BUILTIN_MASKMOVQ:
    case IX86_BUILTIN_MASKMOVDQU:
      icode = (fcode == IX86_BUILTIN_MASKMOVQ
               ? CODE_FOR_mmx_maskmovq
               : CODE_FOR_sse2_maskmovdqu);
      /* Note the arg order is different from the operand order.  */
      arg1 = CALL_EXPR_ARG (exp, 0);
      arg2 = CALL_EXPR_ARG (exp, 1);
      arg0 = CALL_EXPR_ARG (exp, 2);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      mode0 = insn_data[icode].operand[0].mode;
      mode1 = insn_data[icode].operand[1].mode;
      mode2 = insn_data[icode].operand[2].mode;

      op0 = ix86_zero_extend_to_Pmode (op0);
      op0 = gen_rtx_MEM (mode1, op0);

      if (!insn_data[icode].operand[0].predicate (op0, mode0))
        op0 = copy_to_mode_reg (mode0, op0);
      if (!insn_data[icode].operand[1].predicate (op1, mode1))
        op1 = copy_to_mode_reg (mode1, op1);
      if (!insn_data[icode].operand[2].predicate (op2, mode2))
        op2 = copy_to_mode_reg (mode2, op2);
      pat = GEN_FCN (icode) (op0, op1, op2);
      if (! pat)
        return 0;
      emit_insn (pat);
      return 0;

    case IX86_BUILTIN_LDMXCSR:
      op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
      target = assign_386_stack_local (SImode, SLOT_TEMP);
      emit_move_insn (target, op0);
      emit_insn (gen_sse_ldmxcsr (target));
      return 0;

    case IX86_BUILTIN_STMXCSR:
      target = assign_386_stack_local (SImode, SLOT_TEMP);
      emit_insn (gen_sse_stmxcsr (target));
      return copy_to_mode_reg (SImode, target);

    case IX86_BUILTIN_CLFLUSH:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      icode = CODE_FOR_sse2_clflush;
      if (!insn_data[icode].operand[0].predicate (op0, Pmode))
        op0 = ix86_zero_extend_to_Pmode (op0);

      emit_insn (gen_sse2_clflush (op0));
      return 0;

    case IX86_BUILTIN_CLWB:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      icode = CODE_FOR_clwb;
      if (!insn_data[icode].operand[0].predicate (op0, Pmode))
        op0 = ix86_zero_extend_to_Pmode (op0);

      emit_insn (gen_clwb (op0));
      return 0;

    case IX86_BUILTIN_CLFLUSHOPT:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      icode = CODE_FOR_clflushopt;
      if (!insn_data[icode].operand[0].predicate (op0, Pmode))
        op0 = ix86_zero_extend_to_Pmode (op0);

      emit_insn (gen_clflushopt (op0));
      return 0;

    case IX86_BUILTIN_MONITOR:
    case IX86_BUILTIN_MONITORX:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      arg2 = CALL_EXPR_ARG (exp, 2);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      if (!REG_P (op0))
        op0 = ix86_zero_extend_to_Pmode (op0);
      if (!REG_P (op1))
        op1 = copy_to_mode_reg (SImode, op1);
      if (!REG_P (op2))
        op2 = copy_to_mode_reg (SImode, op2);

      emit_insn (fcode == IX86_BUILTIN_MONITOR
                 ? gen_sse3_monitor (Pmode, op0, op1, op2)
                 : gen_monitorx (Pmode, op0, op1, op2));
      return 0;

    case IX86_BUILTIN_MWAIT:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      if (!REG_P (op0))
        op0 = copy_to_mode_reg (SImode, op0);
      if (!REG_P (op1))
        op1 = copy_to_mode_reg (SImode, op1);
      emit_insn (gen_sse3_mwait (op0, op1));
      return 0;
12963 case IX86_BUILTIN_MWAITX
:
12964 arg0
= CALL_EXPR_ARG (exp
, 0);
12965 arg1
= CALL_EXPR_ARG (exp
, 1);
12966 arg2
= CALL_EXPR_ARG (exp
, 2);
12967 op0
= expand_normal (arg0
);
12968 op1
= expand_normal (arg1
);
12969 op2
= expand_normal (arg2
);
12971 op0
= copy_to_mode_reg (SImode
, op0
);
12973 op1
= copy_to_mode_reg (SImode
, op1
);
12975 op2
= copy_to_mode_reg (SImode
, op2
);
12976 emit_insn (gen_mwaitx (op0
, op1
, op2
));
    case IX86_BUILTIN_UMONITOR:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);

      op0 = ix86_zero_extend_to_Pmode (op0);
      emit_insn (gen_umonitor (Pmode, op0));
      return 0;

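    /* umwait/tpause take a 64-bit TSC deadline in edx:eax.  On 64-bit
       targets the DImode argument is therefore split into two SImode
       halves below; 32-bit targets hand the register pair to the
       pattern directly.  */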
    case IX86_BUILTIN_UMWAIT:
    case IX86_BUILTIN_TPAUSE:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);

      if (!REG_P (op0))
	op0 = copy_to_mode_reg (SImode, op0);

      op1 = force_reg (DImode, op1);

      if (TARGET_64BIT)
	{
	  op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
				     NULL, 1, OPTAB_DIRECT);
	  switch (fcode)
	    {
	    case IX86_BUILTIN_UMWAIT:
	      icode = CODE_FOR_umwait_rex64;
	      break;
	    case IX86_BUILTIN_TPAUSE:
	      icode = CODE_FOR_tpause_rex64;
	      break;
	    default:
	      gcc_unreachable ();
	    }

	  op2 = gen_lowpart (SImode, op2);
	  op1 = gen_lowpart (SImode, op1);
	  pat = GEN_FCN (icode) (op0, op1, op2);
	}
      else
	{
	  switch (fcode)
	    {
	    case IX86_BUILTIN_UMWAIT:
	      icode = CODE_FOR_umwait;
	      break;
	    case IX86_BUILTIN_TPAUSE:
	      icode = CODE_FOR_tpause;
	      break;
	    default:
	      gcc_unreachable ();
	    }
	  pat = GEN_FCN (icode) (op0, op1);
	}

      if (!pat)
	return 0;

      emit_insn (pat);

      if (target == 0
	  || !register_operand (target, QImode))
	target = gen_reg_rtx (QImode);

      pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
			const0_rtx);
      emit_insn (gen_rtx_SET (target, pat));
      return target;

    case IX86_BUILTIN_TESTUI:
      emit_insn (gen_testui ());

      if (target == 0
	  || !register_operand (target, QImode))
	target = gen_reg_rtx (QImode);

      pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
			 const0_rtx);
      emit_insn (gen_rtx_SET (target, pat));
      return target;

    case IX86_BUILTIN_CLZERO:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      if (!REG_P (op0))
	op0 = ix86_zero_extend_to_Pmode (op0);
      emit_insn (gen_clzero (Pmode, op0));
      return 0;

    case IX86_BUILTIN_CLDEMOTE:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      icode = CODE_FOR_cldemote;
      if (!insn_data[icode].operand[0].predicate (op0, Pmode))
	op0 = ix86_zero_extend_to_Pmode (op0);

      emit_insn (gen_cldemote (op0));
      return 0;

    case IX86_BUILTIN_LOADIWKEY:
      {
	arg0 = CALL_EXPR_ARG (exp, 0);
	arg1 = CALL_EXPR_ARG (exp, 1);
	arg2 = CALL_EXPR_ARG (exp, 2);
	arg3 = CALL_EXPR_ARG (exp, 3);

	op0 = expand_normal (arg0);
	op1 = expand_normal (arg1);
	op2 = expand_normal (arg2);
	op3 = expand_normal (arg3);

	if (!REG_P (op0))
	  op0 = copy_to_mode_reg (V2DImode, op0);
	if (!REG_P (op1))
	  op1 = copy_to_mode_reg (V2DImode, op1);
	if (!REG_P (op2))
	  op2 = copy_to_mode_reg (V2DImode, op2);
	if (!REG_P (op3))
	  op3 = copy_to_mode_reg (SImode, op3);

	emit_insn (gen_loadiwkey (op0, op1, op2, op3));
	return 0;
      }

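    /* The four Key Locker AESENC/AESDEC variants share one expansion:
       each case only selects the insn code and then jumps to the
       common aesdecenc_expand path.  */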
    case IX86_BUILTIN_AESDEC128KLU8:
      icode = CODE_FOR_aesdec128klu8;
      goto aesdecenc_expand;

    case IX86_BUILTIN_AESDEC256KLU8:
      icode = CODE_FOR_aesdec256klu8;
      goto aesdecenc_expand;

    case IX86_BUILTIN_AESENC128KLU8:
      icode = CODE_FOR_aesenc128klu8;
      goto aesdecenc_expand;

    case IX86_BUILTIN_AESENC256KLU8:
      icode = CODE_FOR_aesenc256klu8;

    aesdecenc_expand:

      arg0 = CALL_EXPR_ARG (exp, 0); // __m128i *odata
      arg1 = CALL_EXPR_ARG (exp, 1); // __m128i idata
      arg2 = CALL_EXPR_ARG (exp, 2); // const void *p

      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);

      if (!address_operand (op0, V2DImode))
	{
	  op0 = convert_memory_address (Pmode, op0);
	  op0 = copy_addr_to_reg (op0);
	}
      op0 = gen_rtx_MEM (V2DImode, op0);

      if (!REG_P (op1))
	op1 = copy_to_mode_reg (V2DImode, op1);

      if (!address_operand (op2, VOIDmode))
	{
	  op2 = convert_memory_address (Pmode, op2);
	  op2 = copy_addr_to_reg (op2);
	}
      op2 = gen_rtx_MEM (BLKmode, op2);

      emit_insn (GEN_FCN (icode) (op1, op1, op2));

      if (target == 0)
	target = gen_reg_rtx (QImode);

      /* NB: For the aesenc/aesdec keylocker insns, ZF is set when a
	 runtime error occurs, and the output should then be cleared
	 for safety.  */
      rtx_code_label *ok_label;
      rtx tmp;

      tmp = gen_rtx_REG (CCZmode, FLAGS_REG);
      pat = gen_rtx_EQ (QImode, tmp, const0_rtx);
      ok_label = gen_label_rtx ();
      emit_cmp_and_jump_insns (tmp, const0_rtx, NE, 0, GET_MODE (tmp),
			       true, ok_label);
      /* The runtime error seldom occurs, so predict the OK path as the
	 hotspot and optimize it as the fallthrough block.  */
      predict_jump (REG_BR_PROB_BASE * 90 / 100);

      emit_insn (gen_rtx_SET (op1, const0_rtx));

      emit_label (ok_label);
      emit_insn (gen_rtx_SET (target, pat));
      emit_insn (gen_rtx_SET (op0, op1));

      return target;

    case IX86_BUILTIN_AESDECWIDE128KLU8:
      icode = CODE_FOR_aesdecwide128klu8;
      goto wideaesdecenc_expand;

    case IX86_BUILTIN_AESDECWIDE256KLU8:
      icode = CODE_FOR_aesdecwide256klu8;
      goto wideaesdecenc_expand;

    case IX86_BUILTIN_AESENCWIDE128KLU8:
      icode = CODE_FOR_aesencwide128klu8;
      goto wideaesdecenc_expand;

    case IX86_BUILTIN_AESENCWIDE256KLU8:
      icode = CODE_FOR_aesencwide256klu8;

    wideaesdecenc_expand:
      {
	rtx xmm_regs[8];
	rtx op;

	arg0 = CALL_EXPR_ARG (exp, 0); // __m128i * odata
	arg1 = CALL_EXPR_ARG (exp, 1); // const __m128i * idata
	arg2 = CALL_EXPR_ARG (exp, 2); // const void *p

	op0 = expand_normal (arg0);
	op1 = expand_normal (arg1);
	op2 = expand_normal (arg2);

	if (!address_operand (op2, VOIDmode))
	  {
	    op2 = convert_memory_address (Pmode, op2);
	    op2 = copy_addr_to_reg (op2);
	  }
	op2 = gen_rtx_MEM (BLKmode, op2);

	for (i = 0; i < 8; i++)
	  {
	    xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));

	    op = gen_rtx_MEM (V2DImode,
			      plus_constant (Pmode, op1, (i * 16)));

	    emit_move_insn (xmm_regs[i], op);
	  }

	emit_insn (GEN_FCN (icode) (op2));

	if (target == 0)
	  target = gen_reg_rtx (QImode);

	tmp = gen_rtx_REG (CCZmode, FLAGS_REG);
	pat = gen_rtx_EQ (QImode, tmp, const0_rtx);
	ok_label = gen_label_rtx ();
	emit_cmp_and_jump_insns (tmp, const0_rtx, NE, 0, GET_MODE (tmp),
				 true, ok_label);
	predict_jump (REG_BR_PROB_BASE * 90 / 100);

	for (i = 0; i < 8; i++)
	  emit_insn (gen_rtx_SET (xmm_regs[i], const0_rtx));

	emit_label (ok_label);
	emit_insn (gen_rtx_SET (target, pat));

	for (i = 0; i < 8; i++)
	  {
	    op = gen_rtx_MEM (V2DImode,
			      plus_constant (Pmode, op0, (i * 16)));
	    emit_move_insn (op, xmm_regs[i]);
	  }

	return target;
      }

    case IX86_BUILTIN_ENCODEKEY128U32:
      {
	rtx op, xmm_regs[7];

	arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
	arg1 = CALL_EXPR_ARG (exp, 1); // __m128i key
	arg2 = CALL_EXPR_ARG (exp, 2); // void *h

	op0 = expand_normal (arg0);
	op1 = expand_normal (arg1);
	op2 = expand_normal (arg2);

	if (!REG_P (op0))
	  op0 = copy_to_mode_reg (SImode, op0);

	op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
	emit_move_insn (op, op1);

	for (i = 0; i < 3; i++)
	  xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));

	if (target == 0)
	  target = gen_reg_rtx (SImode);

	emit_insn (gen_encodekey128u32 (target, op0));

	for (i = 0; i < 3; i++)
	  {
	    op = gen_rtx_MEM (V2DImode,
			      plus_constant (Pmode, op2, (i * 16)));
	    emit_move_insn (op, xmm_regs[i]);
	  }

	return target;
      }

    case IX86_BUILTIN_ENCODEKEY256U32:
      {
	rtx op, xmm_regs[7];

	arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
	arg1 = CALL_EXPR_ARG (exp, 1); // __m128i keylow
	arg2 = CALL_EXPR_ARG (exp, 2); // __m128i keyhi
	arg3 = CALL_EXPR_ARG (exp, 3); // void *h

	op0 = expand_normal (arg0);
	op1 = expand_normal (arg1);
	op2 = expand_normal (arg2);
	op3 = expand_normal (arg3);

	if (!REG_P (op0))
	  op0 = copy_to_mode_reg (SImode, op0);

	/* Force the use of xmm0, xmm1 for keylow, keyhi.  */
	op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
	emit_move_insn (op, op1);
	op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (1));
	emit_move_insn (op, op2);

	for (i = 0; i < 4; i++)
	  xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));

	if (target == 0)
	  target = gen_reg_rtx (SImode);

	emit_insn (gen_encodekey256u32 (target, op0));

	for (i = 0; i < 4; i++)
	  {
	    op = gen_rtx_MEM (V2DImode,
			      plus_constant (Pmode, op3, (i * 16)));
	    emit_move_insn (op, xmm_regs[i]);
	  }

	return target;
      }

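    /* For __builtin_ia32_prefetch a fourth argument of 1 selects an
       instruction prefetch (expanded via gen_prefetchi when the target
       supports it); any other value requests a data prefetch.  */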
    case IX86_BUILTIN_PREFETCH:
      {
	arg0 = CALL_EXPR_ARG (exp, 0); // const void *
	arg1 = CALL_EXPR_ARG (exp, 1); // const int
	arg2 = CALL_EXPR_ARG (exp, 2); // const int
	arg3 = CALL_EXPR_ARG (exp, 3); // const int

	op0 = expand_normal (arg0);
	op1 = expand_normal (arg1);
	op2 = expand_normal (arg2);
	op3 = expand_normal (arg3);

	if (!CONST_INT_P (op1) || !CONST_INT_P (op2) || !CONST_INT_P (op3))
	  {
	    error ("second, third and fourth argument must be a const");
	    return const0_rtx;
	  }

	if (INTVAL (op3) == 1)
	  {
	    if (INTVAL (op2) < 2 || INTVAL (op2) > 3)
	      {
		error ("invalid third argument");
		return const0_rtx;
	      }

	    if (TARGET_64BIT && TARGET_PREFETCHI
		&& local_func_symbolic_operand (op0, GET_MODE (op0)))
	      emit_insn (gen_prefetchi (op0, op2));
	    else
	      {
		warning (0, "instruction prefetch applies when in 64-bit mode"
			    " with RIP-relative addressing and"
			    " option %<-mprefetchi%>;"
			    " they stay NOPs otherwise");
		emit_insn (gen_nop ());
	      }
	  }
	else
	  {
	    if (!address_operand (op0, VOIDmode))
	      {
		op0 = convert_memory_address (Pmode, op0);
		op0 = copy_addr_to_reg (op0);
	      }

	    if (INTVAL (op2) < 0 || INTVAL (op2) > 3)
	      {
		warning (0, "invalid third argument to "
			    "%<__builtin_ia32_prefetch%>; using zero");
		op2 = const0_rtx;
	      }

	    if (TARGET_3DNOW || TARGET_PREFETCH_SSE
		|| TARGET_PRFCHW || TARGET_PREFETCHWT1)
	      emit_insn (gen_prefetch (op0, op1, op2));
	    else if (!MEM_P (op0) && side_effects_p (op0))
	      /* Don't do anything with direct references to volatile memory,
		 but generate code to handle other side effects.  */
	      emit_insn (op0);
	  }

	return 0;
      }

    case IX86_BUILTIN_PREFETCHI:
      {
	arg0 = CALL_EXPR_ARG (exp, 0); // const void *
	arg1 = CALL_EXPR_ARG (exp, 1); // const int

	op0 = expand_normal (arg0);
	op1 = expand_normal (arg1);

	if (!CONST_INT_P (op1))
	  {
	    error ("second argument must be a const");
	    return const0_rtx;
	  }

	/* GOT/PLT_PIC should not be available for instruction prefetch.
	   It must be a real instruction address.  */
	if (TARGET_64BIT
	    && local_func_symbolic_operand (op0, GET_MODE (op0)))
	  emit_insn (gen_prefetchi (op0, op1));
	else
	  {
	    /* Ignore the hint.  */
	    warning (0, "instruction prefetch applies when in 64-bit mode"
			" with RIP-relative addressing and"
			" option %<-mprefetchi%>;"
			" they stay NOPs otherwise");
	    emit_insn (gen_nop ());
	  }

	return 0;
      }

    case IX86_BUILTIN_VEC_INIT_V2SI:
    case IX86_BUILTIN_VEC_INIT_V4HI:
    case IX86_BUILTIN_VEC_INIT_V8QI:
      return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);

    case IX86_BUILTIN_VEC_EXT_V2DF:
    case IX86_BUILTIN_VEC_EXT_V2DI:
    case IX86_BUILTIN_VEC_EXT_V4SF:
    case IX86_BUILTIN_VEC_EXT_V4SI:
    case IX86_BUILTIN_VEC_EXT_V8HI:
    case IX86_BUILTIN_VEC_EXT_V2SI:
    case IX86_BUILTIN_VEC_EXT_V4HI:
    case IX86_BUILTIN_VEC_EXT_V16QI:
      return ix86_expand_vec_ext_builtin (exp, target);

    case IX86_BUILTIN_VEC_SET_V2DI:
    case IX86_BUILTIN_VEC_SET_V4SF:
    case IX86_BUILTIN_VEC_SET_V4SI:
    case IX86_BUILTIN_VEC_SET_V8HI:
    case IX86_BUILTIN_VEC_SET_V4HI:
    case IX86_BUILTIN_VEC_SET_V16QI:
      return ix86_expand_vec_set_builtin (exp);

    case IX86_BUILTIN_NANQ:
    case IX86_BUILTIN_NANSQ:
      return expand_call (exp, target, ignore);

    case IX86_BUILTIN_RDPID:

      op0 = gen_reg_rtx (word_mode);

      if (TARGET_64BIT)
	{
	  insn = gen_rdpid_rex64 (op0);
	  op0 = convert_to_mode (SImode, op0, 1);
	}
      else
	insn = gen_rdpid (op0);

      emit_insn (insn);

      if (target == 0
	  || !register_operand (target, SImode))
	target = gen_reg_rtx (SImode);

      emit_move_insn (target, op0);
      return target;

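    /* vp2intersect produces a pair of mask registers.  The result is
       modelled as a P2HImode/P2QImode register pair whose low and high
       parts are stored through the two pointer arguments below.  */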
    case IX86_BUILTIN_2INTERSECTD512:
    case IX86_BUILTIN_2INTERSECTQ512:
    case IX86_BUILTIN_2INTERSECTD256:
    case IX86_BUILTIN_2INTERSECTQ256:
    case IX86_BUILTIN_2INTERSECTD128:
    case IX86_BUILTIN_2INTERSECTQ128:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      arg2 = CALL_EXPR_ARG (exp, 2);
      arg3 = CALL_EXPR_ARG (exp, 3);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      op3 = expand_normal (arg3);

      if (!address_operand (op0, VOIDmode))
	{
	  op0 = convert_memory_address (Pmode, op0);
	  op0 = copy_addr_to_reg (op0);
	}
      if (!address_operand (op1, VOIDmode))
	{
	  op1 = convert_memory_address (Pmode, op1);
	  op1 = copy_addr_to_reg (op1);
	}

      switch (fcode)
	{
	case IX86_BUILTIN_2INTERSECTD512:
	  mode4 = P2HImode;
	  icode = CODE_FOR_avx512vp2intersect_2intersectv16si;
	  break;
	case IX86_BUILTIN_2INTERSECTQ512:
	  mode4 = P2QImode;
	  icode = CODE_FOR_avx512vp2intersect_2intersectv8di;
	  break;
	case IX86_BUILTIN_2INTERSECTD256:
	  mode4 = P2QImode;
	  icode = CODE_FOR_avx512vp2intersect_2intersectv8si;
	  break;
	case IX86_BUILTIN_2INTERSECTQ256:
	  mode4 = P2QImode;
	  icode = CODE_FOR_avx512vp2intersect_2intersectv4di;
	  break;
	case IX86_BUILTIN_2INTERSECTD128:
	  mode4 = P2QImode;
	  icode = CODE_FOR_avx512vp2intersect_2intersectv4si;
	  break;
	case IX86_BUILTIN_2INTERSECTQ128:
	  mode4 = P2QImode;
	  icode = CODE_FOR_avx512vp2intersect_2intersectv2di;
	  break;
	default:
	  gcc_unreachable ();
	}

      mode2 = insn_data[icode].operand[1].mode;
      mode3 = insn_data[icode].operand[2].mode;
      if (!insn_data[icode].operand[1].predicate (op2, mode2))
	op2 = copy_to_mode_reg (mode2, op2);
      if (!insn_data[icode].operand[2].predicate (op3, mode3))
	op3 = copy_to_mode_reg (mode3, op3);

      op4 = gen_reg_rtx (mode4);
      emit_insn (GEN_FCN (icode) (op4, op2, op3));
      mode0 = mode4 == P2HImode ? HImode : QImode;
      emit_move_insn (gen_rtx_MEM (mode0, op0),
		      gen_lowpart (mode0, op4));
      emit_move_insn (gen_rtx_MEM (mode0, op1),
		      gen_highpart (mode0, op4));

      return 0;

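    /* rdtsc, rdtscp, rdpmc and xgetbv return a 64-bit value in
       edx:eax.  On 64-bit targets the two DImode halves are recombined
       below with a 32-bit shift and an ior.  */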
    case IX86_BUILTIN_RDPMC:
    case IX86_BUILTIN_RDTSC:
    case IX86_BUILTIN_RDTSCP:
    case IX86_BUILTIN_XGETBV:

      op0 = gen_reg_rtx (DImode);
      op1 = gen_reg_rtx (DImode);

      if (fcode == IX86_BUILTIN_RDPMC)
	{
	  arg0 = CALL_EXPR_ARG (exp, 0);
	  op2 = expand_normal (arg0);
	  if (!register_operand (op2, SImode))
	    op2 = copy_to_mode_reg (SImode, op2);

	  insn = (TARGET_64BIT
		  ? gen_rdpmc_rex64 (op0, op1, op2)
		  : gen_rdpmc (op0, op2));
	  emit_insn (insn);
	}
      else if (fcode == IX86_BUILTIN_XGETBV)
	{
	  arg0 = CALL_EXPR_ARG (exp, 0);
	  op2 = expand_normal (arg0);
	  if (!register_operand (op2, SImode))
	    op2 = copy_to_mode_reg (SImode, op2);

	  insn = (TARGET_64BIT
		  ? gen_xgetbv_rex64 (op0, op1, op2)
		  : gen_xgetbv (op0, op2));
	  emit_insn (insn);
	}
      else if (fcode == IX86_BUILTIN_RDTSC)
	{
	  insn = (TARGET_64BIT
		  ? gen_rdtsc_rex64 (op0, op1)
		  : gen_rdtsc (op0));
	  emit_insn (insn);
	}
      else
	{
	  op2 = gen_reg_rtx (SImode);

	  insn = (TARGET_64BIT
		  ? gen_rdtscp_rex64 (op0, op1, op2)
		  : gen_rdtscp (op0, op2));
	  emit_insn (insn);

	  arg0 = CALL_EXPR_ARG (exp, 0);
	  op4 = expand_normal (arg0);
	  if (!address_operand (op4, VOIDmode))
	    {
	      op4 = convert_memory_address (Pmode, op4);
	      op4 = copy_addr_to_reg (op4);
	    }
	  emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
	}

      if (target == 0
	  || !register_operand (target, DImode))
	target = gen_reg_rtx (DImode);

      if (TARGET_64BIT)
	{
	  op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
				     op1, 1, OPTAB_DIRECT);
	  op0 = expand_simple_binop (DImode, IOR, op0, op1,
				     op0, 1, OPTAB_DIRECT);
	}

      emit_move_insn (target, op0);
      return target;

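    /* enqcmd/enqcmds report their status in ZF; the boolean result is
       materialized below with a setcc into the low byte of the target
       register via STRICT_LOW_PART.  */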
    case IX86_BUILTIN_ENQCMD:
    case IX86_BUILTIN_ENQCMDS:
    case IX86_BUILTIN_MOVDIR64B:

      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);

      op0 = ix86_zero_extend_to_Pmode (op0);
      if (!address_operand (op1, VOIDmode))
	{
	  op1 = convert_memory_address (Pmode, op1);
	  op1 = copy_addr_to_reg (op1);
	}
      op1 = gen_rtx_MEM (XImode, op1);

      if (fcode == IX86_BUILTIN_MOVDIR64B)
	{
	  emit_insn (gen_movdir64b (Pmode, op0, op1));
	  return 0;
	}
      else
	{
	  if (target == 0
	      || !register_operand (target, SImode))
	    target = gen_reg_rtx (SImode);

	  emit_move_insn (target, const0_rtx);
	  target = gen_rtx_SUBREG (QImode, target, 0);

	  int unspecv = (fcode == IX86_BUILTIN_ENQCMD
			 ? UNSPECV_ENQCMD
			 : UNSPECV_ENQCMDS);
	  icode = code_for_enqcmd (unspecv, Pmode);
	  emit_insn (GEN_FCN (icode) (op0, op1));

	  emit_insn
	    (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
			  gen_rtx_fmt_ee (EQ, QImode,
					  gen_rtx_REG (CCZmode, FLAGS_REG),
					  const0_rtx)));

	  return SUBREG_REG (target);
	}

    case IX86_BUILTIN_FXSAVE:
    case IX86_BUILTIN_FXRSTOR:
    case IX86_BUILTIN_FXSAVE64:
    case IX86_BUILTIN_FXRSTOR64:
    case IX86_BUILTIN_FNSTENV:
    case IX86_BUILTIN_FLDENV:
      mode0 = BLKmode;
      switch (fcode)
	{
	case IX86_BUILTIN_FXSAVE:
	  icode = CODE_FOR_fxsave;
	  break;
	case IX86_BUILTIN_FXRSTOR:
	  icode = CODE_FOR_fxrstor;
	  break;
	case IX86_BUILTIN_FXSAVE64:
	  icode = CODE_FOR_fxsave64;
	  break;
	case IX86_BUILTIN_FXRSTOR64:
	  icode = CODE_FOR_fxrstor64;
	  break;
	case IX86_BUILTIN_FNSTENV:
	  icode = CODE_FOR_fnstenv;
	  break;
	case IX86_BUILTIN_FLDENV:
	  icode = CODE_FOR_fldenv;
	  break;
	default:
	  gcc_unreachable ();
	}

      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);

      if (!address_operand (op0, VOIDmode))
	{
	  op0 = convert_memory_address (Pmode, op0);
	  op0 = copy_addr_to_reg (op0);
	}
      op0 = gen_rtx_MEM (mode0, op0);

      pat = GEN_FCN (icode) (op0);
      if (pat)
	emit_insn (pat);
      return 0;

    case IX86_BUILTIN_XSETBV:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);

      if (!REG_P (op0))
	op0 = copy_to_mode_reg (SImode, op0);

      op1 = force_reg (DImode, op1);

      if (TARGET_64BIT)
	{
	  op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
				     NULL, 1, OPTAB_DIRECT);

	  icode = CODE_FOR_xsetbv_rex64;

	  op2 = gen_lowpart (SImode, op2);
	  op1 = gen_lowpart (SImode, op1);
	  pat = GEN_FCN (icode) (op0, op1, op2);
	}
      else
	{
	  icode = CODE_FOR_xsetbv;
	  pat = GEN_FCN (icode) (op0, op1);
	}

      if (pat)
	emit_insn (pat);
      return 0;

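    /* All xsave/xrstor variants take the feature mask in edx:eax, so
       the DImode mask argument is split with a 32-bit shift on 64-bit
       targets, mirroring the xsetbv case above.  */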
    case IX86_BUILTIN_XSAVE:
    case IX86_BUILTIN_XRSTOR:
    case IX86_BUILTIN_XSAVE64:
    case IX86_BUILTIN_XRSTOR64:
    case IX86_BUILTIN_XSAVEOPT:
    case IX86_BUILTIN_XSAVEOPT64:
    case IX86_BUILTIN_XSAVES:
    case IX86_BUILTIN_XRSTORS:
    case IX86_BUILTIN_XSAVES64:
    case IX86_BUILTIN_XRSTORS64:
    case IX86_BUILTIN_XSAVEC:
    case IX86_BUILTIN_XSAVEC64:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);

      if (!address_operand (op0, VOIDmode))
	{
	  op0 = convert_memory_address (Pmode, op0);
	  op0 = copy_addr_to_reg (op0);
	}
      op0 = gen_rtx_MEM (BLKmode, op0);

      op1 = force_reg (DImode, op1);

      if (TARGET_64BIT)
	{
	  op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
				     NULL, 1, OPTAB_DIRECT);
	  switch (fcode)
	    {
	    case IX86_BUILTIN_XSAVE:
	      icode = CODE_FOR_xsave_rex64;
	      break;
	    case IX86_BUILTIN_XRSTOR:
	      icode = CODE_FOR_xrstor_rex64;
	      break;
	    case IX86_BUILTIN_XSAVE64:
	      icode = CODE_FOR_xsave64;
	      break;
	    case IX86_BUILTIN_XRSTOR64:
	      icode = CODE_FOR_xrstor64;
	      break;
	    case IX86_BUILTIN_XSAVEOPT:
	      icode = CODE_FOR_xsaveopt_rex64;
	      break;
	    case IX86_BUILTIN_XSAVEOPT64:
	      icode = CODE_FOR_xsaveopt64;
	      break;
	    case IX86_BUILTIN_XSAVES:
	      icode = CODE_FOR_xsaves_rex64;
	      break;
	    case IX86_BUILTIN_XRSTORS:
	      icode = CODE_FOR_xrstors_rex64;
	      break;
	    case IX86_BUILTIN_XSAVES64:
	      icode = CODE_FOR_xsaves64;
	      break;
	    case IX86_BUILTIN_XRSTORS64:
	      icode = CODE_FOR_xrstors64;
	      break;
	    case IX86_BUILTIN_XSAVEC:
	      icode = CODE_FOR_xsavec_rex64;
	      break;
	    case IX86_BUILTIN_XSAVEC64:
	      icode = CODE_FOR_xsavec64;
	      break;
	    default:
	      gcc_unreachable ();
	    }

	  op2 = gen_lowpart (SImode, op2);
	  op1 = gen_lowpart (SImode, op1);
	  pat = GEN_FCN (icode) (op0, op1, op2);
	}
      else
	{
	  switch (fcode)
	    {
	    case IX86_BUILTIN_XSAVE:
	      icode = CODE_FOR_xsave;
	      break;
	    case IX86_BUILTIN_XRSTOR:
	      icode = CODE_FOR_xrstor;
	      break;
	    case IX86_BUILTIN_XSAVEOPT:
	      icode = CODE_FOR_xsaveopt;
	      break;
	    case IX86_BUILTIN_XSAVES:
	      icode = CODE_FOR_xsaves;
	      break;
	    case IX86_BUILTIN_XRSTORS:
	      icode = CODE_FOR_xrstors;
	      break;
	    case IX86_BUILTIN_XSAVEC:
	      icode = CODE_FOR_xsavec;
	      break;
	    default:
	      gcc_unreachable ();
	    }
	  pat = GEN_FCN (icode) (op0, op1);
	}

      if (pat)
	emit_insn (pat);
      return 0;

    case IX86_BUILTIN_LLWPCB:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);

      if (!register_operand (op0, Pmode))
	op0 = ix86_zero_extend_to_Pmode (op0);
      emit_insn (gen_lwp_llwpcb (Pmode, op0));
      return 0;

    case IX86_BUILTIN_SLWPCB:
      if (!target
	  || !register_operand (target, Pmode))
	target = gen_reg_rtx (Pmode);
      emit_insn (gen_lwp_slwpcb (Pmode, target));
      return target;

    case IX86_BUILTIN_LWPVAL32:
    case IX86_BUILTIN_LWPVAL64:
    case IX86_BUILTIN_LWPINS32:
    case IX86_BUILTIN_LWPINS64:
      mode = ((fcode == IX86_BUILTIN_LWPVAL32
	       || fcode == IX86_BUILTIN_LWPINS32)
	      ? SImode : DImode);

      if (fcode == IX86_BUILTIN_LWPVAL32
	  || fcode == IX86_BUILTIN_LWPVAL64)
	icode = code_for_lwp_lwpval (mode);
      else
	icode = code_for_lwp_lwpins (mode);

      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      arg2 = CALL_EXPR_ARG (exp, 2);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      mode0 = insn_data[icode].operand[0].mode;

      if (!insn_data[icode].operand[0].predicate (op0, mode0))
	op0 = copy_to_mode_reg (mode0, op0);
      if (!insn_data[icode].operand[1].predicate (op1, SImode))
	op1 = copy_to_mode_reg (SImode, op1);

      if (!CONST_INT_P (op2))
	{
	  error ("the last argument must be a 32-bit immediate");
	  return const0_rtx;
	}

      emit_insn (GEN_FCN (icode) (op0, op1, op2));

      if (fcode == IX86_BUILTIN_LWPINS32
	  || fcode == IX86_BUILTIN_LWPINS64)
	{
	  if (target == 0
	      || !nonimmediate_operand (target, QImode))
	    target = gen_reg_rtx (QImode);

	  pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
			    const0_rtx);
	  emit_insn (gen_rtx_SET (target, pat));

	  return target;
	}
      else
	return 0;

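    /* The bextri immediate packs the field start in bits 7:0 and the
       field length in bits 15:8.  Degenerate extractions are folded to
       zero, and over-long fields are clamped, before the insn is
       emitted.  */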
    case IX86_BUILTIN_BEXTRI32:
    case IX86_BUILTIN_BEXTRI64:
      mode = (fcode == IX86_BUILTIN_BEXTRI32 ? SImode : DImode);

      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);

      if (!CONST_INT_P (op1))
	{
	  error ("last argument must be an immediate");
	  return const0_rtx;
	}
      else
	{
	  unsigned char lsb_index = UINTVAL (op1);
	  unsigned char length = UINTVAL (op1) >> 8;

	  unsigned char bitsize = GET_MODE_BITSIZE (mode);

	  icode = code_for_tbm_bextri (mode);

	  mode1 = insn_data[icode].operand[1].mode;
	  if (!insn_data[icode].operand[1].predicate (op0, mode1))
	    op0 = copy_to_mode_reg (mode1, op0);

	  mode0 = insn_data[icode].operand[0].mode;
	  if (target == 0
	      || !register_operand (target, mode0))
	    target = gen_reg_rtx (mode0);

	  if (length == 0 || lsb_index >= bitsize)
	    {
	      emit_move_insn (target, const0_rtx);
	      return target;
	    }

	  if (length + lsb_index > bitsize)
	    length = bitsize - lsb_index;

	  op1 = GEN_INT (length);
	  op2 = GEN_INT (lsb_index);

	  emit_insn (GEN_FCN (icode) (target, op0, op1, op2));
	  return target;
	}

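    /* rdrand/rdseed set the carry flag when a random value was
       delivered.  The *_step builtins store the value through the
       pointer argument and return the CF-derived status.  */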
    case IX86_BUILTIN_RDRAND16_STEP:
      mode = HImode;
      goto rdrand_step;

    case IX86_BUILTIN_RDRAND32_STEP:
      mode = SImode;
      goto rdrand_step;

    case IX86_BUILTIN_RDRAND64_STEP:
      mode = DImode;

    rdrand_step:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op1 = expand_normal (arg0);
      if (!address_operand (op1, VOIDmode))
	{
	  op1 = convert_memory_address (Pmode, op1);
	  op1 = copy_addr_to_reg (op1);
	}

      op0 = gen_reg_rtx (mode);
      emit_insn (gen_rdrand (mode, op0));

      emit_move_insn (gen_rtx_MEM (mode, op1), op0);

      op1 = force_reg (SImode, const1_rtx);

      /* Emit SImode conditional move.  */
      if (mode == HImode)
	{
	  if (TARGET_ZERO_EXTEND_WITH_AND
	      && optimize_function_for_speed_p (cfun))
	    {
	      op2 = force_reg (SImode, const0_rtx);

	      emit_insn (gen_movstricthi
			 (gen_lowpart (HImode, op2), op0));
	    }
	  else
	    {
	      op2 = gen_reg_rtx (SImode);

	      emit_insn (gen_zero_extendhisi2 (op2, op0));
	    }
	}
      else if (mode == SImode)
	op2 = op0;
      else
	op2 = gen_rtx_SUBREG (SImode, op0, 0);

      if (target == 0
	  || !register_operand (target, SImode))
	target = gen_reg_rtx (SImode);

      pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
			 const0_rtx);
      emit_insn (gen_rtx_SET (target,
			      gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
      return target;

    case IX86_BUILTIN_RDSEED16_STEP:
      mode = HImode;
      goto rdseed_step;

    case IX86_BUILTIN_RDSEED32_STEP:
      mode = SImode;
      goto rdseed_step;

    case IX86_BUILTIN_RDSEED64_STEP:
      mode = DImode;

    rdseed_step:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op1 = expand_normal (arg0);
      if (!address_operand (op1, VOIDmode))
	{
	  op1 = convert_memory_address (Pmode, op1);
	  op1 = copy_addr_to_reg (op1);
	}

      op0 = gen_reg_rtx (mode);
      emit_insn (gen_rdseed (mode, op0));

      emit_move_insn (gen_rtx_MEM (mode, op1), op0);

      op2 = gen_reg_rtx (QImode);

      pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
			 const0_rtx);
      emit_insn (gen_rtx_SET (op2, pat));

      if (target == 0
	  || !register_operand (target, SImode))
	target = gen_reg_rtx (SImode);

      emit_insn (gen_zero_extendqisi2 (target, op2));
      return target;

    case IX86_BUILTIN_SBB32:
      icode = CODE_FOR_subborrowsi;
      icode2 = CODE_FOR_subborrowsi_0;
      mode0 = SImode;
      mode1 = DImode;
      mode2 = CCmode;
      goto handlecarry;

    case IX86_BUILTIN_SBB64:
      icode = CODE_FOR_subborrowdi;
      icode2 = CODE_FOR_subborrowdi_0;
      mode0 = DImode;
      mode1 = TImode;
      mode2 = CCmode;
      goto handlecarry;

    case IX86_BUILTIN_ADDCARRYX32:
      icode = CODE_FOR_addcarrysi;
      icode2 = CODE_FOR_addcarrysi_0;
      mode0 = SImode;
      mode1 = DImode;
      mode2 = CCCmode;
      goto handlecarry;

    case IX86_BUILTIN_ADDCARRYX64:
      icode = CODE_FOR_addcarrydi;
      icode2 = CODE_FOR_addcarrydi_0;
      mode0 = DImode;
      mode1 = TImode;
      mode2 = CCCmode;

    handlecarry:
      arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in.  */
      arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1.  */
      arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2.  */
      arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out.  */

      op1 = expand_normal (arg0);

      op2 = expand_normal (arg1);
      if (!register_operand (op2, mode0))
	op2 = copy_to_mode_reg (mode0, op2);

      op3 = expand_normal (arg2);
      if (!register_operand (op3, mode0))
	op3 = copy_to_mode_reg (mode0, op3);

      op4 = expand_normal (arg3);
      if (!address_operand (op4, VOIDmode))
	{
	  op4 = convert_memory_address (Pmode, op4);
	  op4 = copy_addr_to_reg (op4);
	}

      op0 = gen_reg_rtx (mode0);
      if (op1 == const0_rtx)
	{
	  /* If arg0 is 0, optimize right away into an add or sub
	     instruction that sets CCCmode flags.  */
	  op1 = gen_rtx_REG (mode2, FLAGS_REG);
	  emit_insn (GEN_FCN (icode2) (op0, op2, op3));
	}
      else
	{
	  /* Generate CF from input operand.  */
	  ix86_expand_carry (op1);

	  /* Generate instruction that consumes CF.  */
	  op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
	  pat = gen_rtx_LTU (mode1, op1, const0_rtx);
	  pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
	  emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
	}

      /* Return current CF value.  */
      if (target == 0)
	target = gen_reg_rtx (QImode);

      pat = gen_rtx_LTU (QImode, op1, const0_rtx);
      emit_insn (gen_rtx_SET (target, pat));

      /* Store the result.  */
      emit_move_insn (gen_rtx_MEM (mode0, op4), op0);

      return target;

    case IX86_BUILTIN_READ_FLAGS:
      emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));

      if (optimize
	  || target == NULL_RTX
	  || !nonimmediate_operand (target, word_mode)
	  || GET_MODE (target) != word_mode)
	target = gen_reg_rtx (word_mode);

      emit_insn (gen_pop (target));
      return target;

    case IX86_BUILTIN_WRITE_FLAGS:

      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      if (!general_no_elim_operand (op0, word_mode))
	op0 = copy_to_mode_reg (word_mode, op0);

      emit_insn (gen_push (op0));
      emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
      return 0;

    case IX86_BUILTIN_KTESTC8:
      icode = CODE_FOR_ktestqi;
      mode3 = CCCmode;
      goto kortest;

    case IX86_BUILTIN_KTESTZ8:
      icode = CODE_FOR_ktestqi;
      mode3 = CCZmode;
      goto kortest;

    case IX86_BUILTIN_KTESTC16:
      icode = CODE_FOR_ktesthi;
      mode3 = CCCmode;
      goto kortest;

    case IX86_BUILTIN_KTESTZ16:
      icode = CODE_FOR_ktesthi;
      mode3 = CCZmode;
      goto kortest;

    case IX86_BUILTIN_KTESTC32:
      icode = CODE_FOR_ktestsi;
      mode3 = CCCmode;
      goto kortest;

    case IX86_BUILTIN_KTESTZ32:
      icode = CODE_FOR_ktestsi;
      mode3 = CCZmode;
      goto kortest;

    case IX86_BUILTIN_KTESTC64:
      icode = CODE_FOR_ktestdi;
      mode3 = CCCmode;
      goto kortest;

    case IX86_BUILTIN_KTESTZ64:
      icode = CODE_FOR_ktestdi;
      mode3 = CCZmode;
      goto kortest;

    case IX86_BUILTIN_KORTESTC8:
      icode = CODE_FOR_kortestqi;
      mode3 = CCCmode;
      goto kortest;

    case IX86_BUILTIN_KORTESTZ8:
      icode = CODE_FOR_kortestqi;
      mode3 = CCZmode;
      goto kortest;

    case IX86_BUILTIN_KORTESTC16:
      icode = CODE_FOR_kortesthi;
      mode3 = CCCmode;
      goto kortest;

    case IX86_BUILTIN_KORTESTZ16:
      icode = CODE_FOR_kortesthi;
      mode3 = CCZmode;
      goto kortest;

    case IX86_BUILTIN_KORTESTC32:
      icode = CODE_FOR_kortestsi;
      mode3 = CCCmode;
      goto kortest;

    case IX86_BUILTIN_KORTESTZ32:
      icode = CODE_FOR_kortestsi;
      mode3 = CCZmode;
      goto kortest;

    case IX86_BUILTIN_KORTESTC64:
      icode = CODE_FOR_kortestdi;
      mode3 = CCCmode;
      goto kortest;

    case IX86_BUILTIN_KORTESTZ64:
      icode = CODE_FOR_kortestdi;
      mode3 = CCZmode;

    kortest:
      arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1.  */
      arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2.  */
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);

      mode0 = insn_data[icode].operand[0].mode;
      mode1 = insn_data[icode].operand[1].mode;

      if (GET_MODE (op0) != VOIDmode)
	op0 = force_reg (GET_MODE (op0), op0);

      op0 = gen_lowpart (mode0, op0);

      if (!insn_data[icode].operand[0].predicate (op0, mode0))
	op0 = copy_to_mode_reg (mode0, op0);

      if (GET_MODE (op1) != VOIDmode)
	op1 = force_reg (GET_MODE (op1), op1);

      op1 = gen_lowpart (mode1, op1);

      if (!insn_data[icode].operand[1].predicate (op1, mode1))
	op1 = copy_to_mode_reg (mode1, op1);

      target = gen_reg_rtx (QImode);

      /* Emit kortest.  */
      emit_insn (GEN_FCN (icode) (op0, op1));
      /* And use setcc to return the result from the flags.  */
      ix86_expand_setcc (target, EQ,
			 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
      return target;

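    /* The gather, scatter and gather/scatter-prefetch builtins below
       only select an insn code; operand legalization is shared by the
       gather_gen, scatter_gen and vec_prefetch_gen paths that follow
       the case ladder.  */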
    case IX86_BUILTIN_GATHERSIV2DF:
      icode = CODE_FOR_avx2_gathersiv2df;
      goto gather_gen;
    case IX86_BUILTIN_GATHERSIV4DF:
      icode = CODE_FOR_avx2_gathersiv4df;
      goto gather_gen;
    case IX86_BUILTIN_GATHERDIV2DF:
      icode = CODE_FOR_avx2_gatherdiv2df;
      goto gather_gen;
    case IX86_BUILTIN_GATHERDIV4DF:
      icode = CODE_FOR_avx2_gatherdiv4df;
      goto gather_gen;
    case IX86_BUILTIN_GATHERSIV4SF:
      icode = CODE_FOR_avx2_gathersiv4sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHERSIV8SF:
      icode = CODE_FOR_avx2_gathersiv8sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHERDIV4SF:
      icode = CODE_FOR_avx2_gatherdiv4sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHERDIV8SF:
      icode = CODE_FOR_avx2_gatherdiv8sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHERSIV2DI:
      icode = CODE_FOR_avx2_gathersiv2di;
      goto gather_gen;
    case IX86_BUILTIN_GATHERSIV4DI:
      icode = CODE_FOR_avx2_gathersiv4di;
      goto gather_gen;
    case IX86_BUILTIN_GATHERDIV2DI:
      icode = CODE_FOR_avx2_gatherdiv2di;
      goto gather_gen;
    case IX86_BUILTIN_GATHERDIV4DI:
      icode = CODE_FOR_avx2_gatherdiv4di;
      goto gather_gen;
    case IX86_BUILTIN_GATHERSIV4SI:
      icode = CODE_FOR_avx2_gathersiv4si;
      goto gather_gen;
    case IX86_BUILTIN_GATHERSIV8SI:
      icode = CODE_FOR_avx2_gathersiv8si;
      goto gather_gen;
    case IX86_BUILTIN_GATHERDIV4SI:
      icode = CODE_FOR_avx2_gatherdiv4si;
      goto gather_gen;
    case IX86_BUILTIN_GATHERDIV8SI:
      icode = CODE_FOR_avx2_gatherdiv8si;
      goto gather_gen;
    case IX86_BUILTIN_GATHERALTSIV4DF:
      icode = CODE_FOR_avx2_gathersiv4df;
      goto gather_gen;
    case IX86_BUILTIN_GATHERALTDIV8SF:
      icode = CODE_FOR_avx2_gatherdiv8sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHERALTSIV4DI:
      icode = CODE_FOR_avx2_gathersiv4di;
      goto gather_gen;
    case IX86_BUILTIN_GATHERALTDIV8SI:
      icode = CODE_FOR_avx2_gatherdiv8si;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV16SF:
      icode = CODE_FOR_avx512f_gathersiv16sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV8DF:
      icode = CODE_FOR_avx512f_gathersiv8df;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV16SF:
      icode = CODE_FOR_avx512f_gatherdiv16sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV8DF:
      icode = CODE_FOR_avx512f_gatherdiv8df;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV16SI:
      icode = CODE_FOR_avx512f_gathersiv16si;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV8DI:
      icode = CODE_FOR_avx512f_gathersiv8di;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV16SI:
      icode = CODE_FOR_avx512f_gatherdiv16si;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV8DI:
      icode = CODE_FOR_avx512f_gatherdiv8di;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3ALTSIV8DF:
      icode = CODE_FOR_avx512f_gathersiv8df;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3ALTDIV16SF:
      icode = CODE_FOR_avx512f_gatherdiv16sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3ALTSIV8DI:
      icode = CODE_FOR_avx512f_gathersiv8di;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3ALTDIV16SI:
      icode = CODE_FOR_avx512f_gatherdiv16si;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV2DF:
      icode = CODE_FOR_avx512vl_gathersiv2df;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV4DF:
      icode = CODE_FOR_avx512vl_gathersiv4df;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV2DF:
      icode = CODE_FOR_avx512vl_gatherdiv2df;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV4DF:
      icode = CODE_FOR_avx512vl_gatherdiv4df;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV4SF:
      icode = CODE_FOR_avx512vl_gathersiv4sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV8SF:
      icode = CODE_FOR_avx512vl_gathersiv8sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV4SF:
      icode = CODE_FOR_avx512vl_gatherdiv4sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV8SF:
      icode = CODE_FOR_avx512vl_gatherdiv8sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV2DI:
      icode = CODE_FOR_avx512vl_gathersiv2di;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV4DI:
      icode = CODE_FOR_avx512vl_gathersiv4di;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV2DI:
      icode = CODE_FOR_avx512vl_gatherdiv2di;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV4DI:
      icode = CODE_FOR_avx512vl_gatherdiv4di;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV4SI:
      icode = CODE_FOR_avx512vl_gathersiv4si;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV8SI:
      icode = CODE_FOR_avx512vl_gathersiv8si;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV4SI:
      icode = CODE_FOR_avx512vl_gatherdiv4si;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV8SI:
      icode = CODE_FOR_avx512vl_gatherdiv8si;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3ALTSIV4DF:
      icode = CODE_FOR_avx512vl_gathersiv4df;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3ALTDIV8SF:
      icode = CODE_FOR_avx512vl_gatherdiv8sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3ALTSIV4DI:
      icode = CODE_FOR_avx512vl_gathersiv4di;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3ALTDIV8SI:
      icode = CODE_FOR_avx512vl_gatherdiv8si;
      goto gather_gen;
    case IX86_BUILTIN_SCATTERSIV16SF:
      icode = CODE_FOR_avx512f_scattersiv16sf;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV8DF:
      icode = CODE_FOR_avx512f_scattersiv8df;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV16SF:
      icode = CODE_FOR_avx512f_scatterdiv16sf;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV8DF:
      icode = CODE_FOR_avx512f_scatterdiv8df;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV16SI:
      icode = CODE_FOR_avx512f_scattersiv16si;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV8DI:
      icode = CODE_FOR_avx512f_scattersiv8di;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV16SI:
      icode = CODE_FOR_avx512f_scatterdiv16si;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV8DI:
      icode = CODE_FOR_avx512f_scatterdiv8di;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV8SF:
      icode = CODE_FOR_avx512vl_scattersiv8sf;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV4SF:
      icode = CODE_FOR_avx512vl_scattersiv4sf;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV4DF:
      icode = CODE_FOR_avx512vl_scattersiv4df;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV2DF:
      icode = CODE_FOR_avx512vl_scattersiv2df;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV8SF:
      icode = CODE_FOR_avx512vl_scatterdiv8sf;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV4SF:
      icode = CODE_FOR_avx512vl_scatterdiv4sf;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV4DF:
      icode = CODE_FOR_avx512vl_scatterdiv4df;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV2DF:
      icode = CODE_FOR_avx512vl_scatterdiv2df;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV8SI:
      icode = CODE_FOR_avx512vl_scattersiv8si;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV4SI:
      icode = CODE_FOR_avx512vl_scattersiv4si;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV4DI:
      icode = CODE_FOR_avx512vl_scattersiv4di;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV2DI:
      icode = CODE_FOR_avx512vl_scattersiv2di;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV8SI:
      icode = CODE_FOR_avx512vl_scatterdiv8si;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV4SI:
      icode = CODE_FOR_avx512vl_scatterdiv4si;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV4DI:
      icode = CODE_FOR_avx512vl_scatterdiv4di;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV2DI:
      icode = CODE_FOR_avx512vl_scatterdiv2di;
      goto scatter_gen;
    case IX86_BUILTIN_GATHERPFDPD:
      icode = CODE_FOR_avx512pf_gatherpfv8sidf;
      goto vec_prefetch_gen;
    case IX86_BUILTIN_SCATTERALTSIV8DF:
      icode = CODE_FOR_avx512f_scattersiv8df;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTDIV16SF:
      icode = CODE_FOR_avx512f_scatterdiv16sf;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTSIV8DI:
      icode = CODE_FOR_avx512f_scattersiv8di;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTDIV16SI:
      icode = CODE_FOR_avx512f_scatterdiv16si;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTSIV4DF:
      icode = CODE_FOR_avx512vl_scattersiv4df;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTDIV8SF:
      icode = CODE_FOR_avx512vl_scatterdiv8sf;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTSIV4DI:
      icode = CODE_FOR_avx512vl_scattersiv4di;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTDIV8SI:
      icode = CODE_FOR_avx512vl_scatterdiv8si;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTSIV2DF:
      icode = CODE_FOR_avx512vl_scattersiv2df;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTDIV4SF:
      icode = CODE_FOR_avx512vl_scatterdiv4sf;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTSIV2DI:
      icode = CODE_FOR_avx512vl_scattersiv2di;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTDIV4SI:
      icode = CODE_FOR_avx512vl_scatterdiv4si;
      goto scatter_gen;
    case IX86_BUILTIN_GATHERPFDPS:
      icode = CODE_FOR_avx512pf_gatherpfv16sisf;
      goto vec_prefetch_gen;
    case IX86_BUILTIN_GATHERPFQPD:
      icode = CODE_FOR_avx512pf_gatherpfv8didf;
      goto vec_prefetch_gen;
    case IX86_BUILTIN_GATHERPFQPS:
      icode = CODE_FOR_avx512pf_gatherpfv8disf;
      goto vec_prefetch_gen;
    case IX86_BUILTIN_SCATTERPFDPD:
      icode = CODE_FOR_avx512pf_scatterpfv8sidf;
      goto vec_prefetch_gen;
    case IX86_BUILTIN_SCATTERPFDPS:
      icode = CODE_FOR_avx512pf_scatterpfv16sisf;
      goto vec_prefetch_gen;
    case IX86_BUILTIN_SCATTERPFQPD:
      icode = CODE_FOR_avx512pf_scatterpfv8didf;
      goto vec_prefetch_gen;
    case IX86_BUILTIN_SCATTERPFQPS:
      icode = CODE_FOR_avx512pf_scatterpfv8disf;
      goto vec_prefetch_gen;

    gather_gen:
      rtx half;
      rtx (*gen) (rtx, rtx);

      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      arg2 = CALL_EXPR_ARG (exp, 2);
      arg3 = CALL_EXPR_ARG (exp, 3);
      arg4 = CALL_EXPR_ARG (exp, 4);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      op3 = expand_normal (arg3);
      op4 = expand_normal (arg4);
      /* Note the arg order is different from the operand order.  */
      mode0 = insn_data[icode].operand[1].mode;
      mode2 = insn_data[icode].operand[3].mode;
      mode3 = insn_data[icode].operand[4].mode;
      mode4 = insn_data[icode].operand[5].mode;

      if (target == NULL_RTX
	  || GET_MODE (target) != insn_data[icode].operand[0].mode
	  || !insn_data[icode].operand[0].predicate (target,
						     GET_MODE (target)))
	subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
      else
	subtarget = target;

      switch (fcode)
	{
	case IX86_BUILTIN_GATHER3ALTSIV8DF:
	case IX86_BUILTIN_GATHER3ALTSIV8DI:
	  half = gen_reg_rtx (V8SImode);
	  if (!nonimmediate_operand (op2, V16SImode))
	    op2 = copy_to_mode_reg (V16SImode, op2);
	  emit_insn (gen_vec_extract_lo_v16si (half, op2));
	  op2 = half;
	  break;
	case IX86_BUILTIN_GATHER3ALTSIV4DF:
	case IX86_BUILTIN_GATHER3ALTSIV4DI:
	case IX86_BUILTIN_GATHERALTSIV4DF:
	case IX86_BUILTIN_GATHERALTSIV4DI:
	  half = gen_reg_rtx (V4SImode);
	  if (!nonimmediate_operand (op2, V8SImode))
	    op2 = copy_to_mode_reg (V8SImode, op2);
	  emit_insn (gen_vec_extract_lo_v8si (half, op2));
	  op2 = half;
	  break;
	case IX86_BUILTIN_GATHER3ALTDIV16SF:
	case IX86_BUILTIN_GATHER3ALTDIV16SI:
	  half = gen_reg_rtx (mode0);
	  if (mode0 == V8SFmode)
	    gen = gen_vec_extract_lo_v16sf;
	  else
	    gen = gen_vec_extract_lo_v16si;
	  if (!nonimmediate_operand (op0, GET_MODE (op0)))
	    op0 = copy_to_mode_reg (GET_MODE (op0), op0);
	  emit_insn (gen (half, op0));
	  op0 = half;
	  op3 = lowpart_subreg (QImode, op3, HImode);
	  break;
	case IX86_BUILTIN_GATHER3ALTDIV8SF:
	case IX86_BUILTIN_GATHER3ALTDIV8SI:
	case IX86_BUILTIN_GATHERALTDIV8SF:
	case IX86_BUILTIN_GATHERALTDIV8SI:
	  half = gen_reg_rtx (mode0);
	  if (mode0 == V4SFmode)
	    gen = gen_vec_extract_lo_v8sf;
	  else
	    gen = gen_vec_extract_lo_v8si;
	  if (!nonimmediate_operand (op0, GET_MODE (op0)))
	    op0 = copy_to_mode_reg (GET_MODE (op0), op0);
	  emit_insn (gen (half, op0));
	  op0 = half;
	  if (VECTOR_MODE_P (GET_MODE (op3)))
	    {
	      half = gen_reg_rtx (mode0);
	      if (!nonimmediate_operand (op3, GET_MODE (op3)))
		op3 = copy_to_mode_reg (GET_MODE (op3), op3);
	      emit_insn (gen (half, op3));
	      op3 = half;
	    }
	  break;
	default:
	  break;
	}

      /* Force memory operand only with base register here.  But we
	 don't want to do it on memory operand for other builtin
	 functions.  */
      op1 = ix86_zero_extend_to_Pmode (op1);

      if (!insn_data[icode].operand[1].predicate (op0, mode0))
	op0 = copy_to_mode_reg (mode0, op0);
      if (!insn_data[icode].operand[2].predicate (op1, Pmode))
	op1 = copy_to_mode_reg (Pmode, op1);
      if (!insn_data[icode].operand[3].predicate (op2, mode2))
	op2 = copy_to_mode_reg (mode2, op2);

      op3 = fixup_modeless_constant (op3, mode3);

      if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
	{
	  if (!insn_data[icode].operand[4].predicate (op3, mode3))
	    op3 = copy_to_mode_reg (mode3, op3);
	}
      else
	{
	  op3 = copy_to_reg (op3);
	  op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
	}
      if (!insn_data[icode].operand[5].predicate (op4, mode4))
	{
	  error ("the last argument must be scale 1, 2, 4, 8");
	  return const0_rtx;
	}

      /* Optimize.  If mask is known to have all high bits set,
	 replace op0 with pc_rtx to signal that the instruction
	 overwrites the whole destination and doesn't use its
	 previous contents.  */
      if (optimize)
	{
	  if (TREE_CODE (arg3) == INTEGER_CST)
	    {
	      if (integer_all_onesp (arg3))
		op0 = pc_rtx;
	    }
	  else if (TREE_CODE (arg3) == VECTOR_CST)
	    {
	      unsigned int negative = 0;
	      for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
		{
		  tree cst = VECTOR_CST_ELT (arg3, i);
		  if (TREE_CODE (cst) == INTEGER_CST
		      && tree_int_cst_sign_bit (cst))
		    negative++;
		  else if (TREE_CODE (cst) == REAL_CST
			   && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
		    negative++;
		}
	      if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
		op0 = pc_rtx;
	    }
	  else if (TREE_CODE (arg3) == SSA_NAME
		   && VECTOR_TYPE_P (TREE_TYPE (arg3)))
	    {
	      /* Recognize also when mask is like:
		 __v2df src = _mm_setzero_pd ();
		 __v2df mask = _mm_cmpeq_pd (src, src);
		 or
		 __v8sf src = _mm256_setzero_ps ();
		 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
		 as that is a cheaper way to load all ones into
		 a register than having to load a constant from
		 memory.  */
	      gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
	      if (is_gimple_call (def_stmt))
		{
		  tree fndecl = gimple_call_fndecl (def_stmt);
		  if (fndecl
		      && fndecl_built_in_p (fndecl, BUILT_IN_MD))
		    switch (DECL_MD_FUNCTION_CODE (fndecl))
		      {
		      case IX86_BUILTIN_CMPPD:
		      case IX86_BUILTIN_CMPPS:
		      case IX86_BUILTIN_CMPPD256:
		      case IX86_BUILTIN_CMPPS256:
			if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
			  break;
			/* FALLTHRU */
		      case IX86_BUILTIN_CMPEQPD:
		      case IX86_BUILTIN_CMPEQPS:
			if (initializer_zerop (gimple_call_arg (def_stmt, 0))
			    && initializer_zerop (gimple_call_arg (def_stmt,
								   1)))
			  op0 = pc_rtx;
			break;
		      default:
			break;
		      }
		}
	    }
	}

      pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
      if (! pat)
	return const0_rtx;
      emit_insn (pat);

      switch (fcode)
	{
	case IX86_BUILTIN_GATHER3DIV16SF:
	  if (target == NULL_RTX)
	    target = gen_reg_rtx (V8SFmode);
	  emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
	  break;
	case IX86_BUILTIN_GATHER3DIV16SI:
	  if (target == NULL_RTX)
	    target = gen_reg_rtx (V8SImode);
	  emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
	  break;
	case IX86_BUILTIN_GATHER3DIV8SF:
	case IX86_BUILTIN_GATHERDIV8SF:
	  if (target == NULL_RTX)
	    target = gen_reg_rtx (V4SFmode);
	  emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
	  break;
	case IX86_BUILTIN_GATHER3DIV8SI:
	case IX86_BUILTIN_GATHERDIV8SI:
	  if (target == NULL_RTX)
	    target = gen_reg_rtx (V4SImode);
	  emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
	  break;
	default:
	  target = subtarget;
	  break;
	}
      return target;

    scatter_gen:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      arg2 = CALL_EXPR_ARG (exp, 2);
      arg3 = CALL_EXPR_ARG (exp, 3);
      arg4 = CALL_EXPR_ARG (exp, 4);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      op3 = expand_normal (arg3);
      op4 = expand_normal (arg4);
      mode1 = insn_data[icode].operand[1].mode;
      mode2 = insn_data[icode].operand[2].mode;
      mode3 = insn_data[icode].operand[3].mode;
      mode4 = insn_data[icode].operand[4].mode;

      /* Scatter instruction stores operand op3 to memory with
	 indices from op2 and scale from op4 under writemask op1.
	 If index operand op2 has more elements than source operand
	 op3, one needs to use only its low half.  And vice versa.  */
      switch (fcode)
	{
	case IX86_BUILTIN_SCATTERALTSIV8DF:
	case IX86_BUILTIN_SCATTERALTSIV8DI:
	  half = gen_reg_rtx (V8SImode);
	  if (!nonimmediate_operand (op2, V16SImode))
	    op2 = copy_to_mode_reg (V16SImode, op2);
	  emit_insn (gen_vec_extract_lo_v16si (half, op2));
	  op2 = half;
	  break;
	case IX86_BUILTIN_SCATTERALTDIV16SF:
	case IX86_BUILTIN_SCATTERALTDIV16SI:
	  half = gen_reg_rtx (mode3);
	  if (mode3 == V8SFmode)
	    gen = gen_vec_extract_lo_v16sf;
	  else
	    gen = gen_vec_extract_lo_v16si;
	  if (!nonimmediate_operand (op3, GET_MODE (op3)))
	    op3 = copy_to_mode_reg (GET_MODE (op3), op3);
	  emit_insn (gen (half, op3));
	  op3 = half;
	  break;
	case IX86_BUILTIN_SCATTERALTSIV4DF:
	case IX86_BUILTIN_SCATTERALTSIV4DI:
	  half = gen_reg_rtx (V4SImode);
	  if (!nonimmediate_operand (op2, V8SImode))
	    op2 = copy_to_mode_reg (V8SImode, op2);
	  emit_insn (gen_vec_extract_lo_v8si (half, op2));
	  op2 = half;
	  break;
	case IX86_BUILTIN_SCATTERALTDIV8SF:
	case IX86_BUILTIN_SCATTERALTDIV8SI:
	  half = gen_reg_rtx (mode3);
	  if (mode3 == V4SFmode)
	    gen = gen_vec_extract_lo_v8sf;
	  else
	    gen = gen_vec_extract_lo_v8si;
	  if (!nonimmediate_operand (op3, GET_MODE (op3)))
	    op3 = copy_to_mode_reg (GET_MODE (op3), op3);
	  emit_insn (gen (half, op3));
	  op3 = half;
	  break;
	case IX86_BUILTIN_SCATTERALTSIV2DF:
	case IX86_BUILTIN_SCATTERALTSIV2DI:
	  if (!nonimmediate_operand (op2, V4SImode))
	    op2 = copy_to_mode_reg (V4SImode, op2);
	  break;
	case IX86_BUILTIN_SCATTERALTDIV4SF:
	case IX86_BUILTIN_SCATTERALTDIV4SI:
	  if (!nonimmediate_operand (op3, GET_MODE (op3)))
	    op3 = copy_to_mode_reg (GET_MODE (op3), op3);
	  break;
	default:
	  break;
	}

      /* Force memory operand only with base register here.  But we
	 don't want to do it on memory operand for other builtin
	 functions.  */
      op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));

      if (!insn_data[icode].operand[0].predicate (op0, Pmode))
	op0 = copy_to_mode_reg (Pmode, op0);

      op1 = fixup_modeless_constant (op1, mode1);

      if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
	{
	  if (!insn_data[icode].operand[1].predicate (op1, mode1))
	    op1 = copy_to_mode_reg (mode1, op1);
	}
      else
	{
	  op1 = copy_to_reg (op1);
	  op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
	}

      if (!insn_data[icode].operand[2].predicate (op2, mode2))
	op2 = copy_to_mode_reg (mode2, op2);

      if (!insn_data[icode].operand[3].predicate (op3, mode3))
	op3 = copy_to_mode_reg (mode3, op3);

      if (!insn_data[icode].operand[4].predicate (op4, mode4))
	{
	  error ("the last argument must be scale 1, 2, 4, 8");
	  return const0_rtx;
	}

      pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
      if (! pat)
	return const0_rtx;

      emit_insn (pat);
      return 0;

    vec_prefetch_gen:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      arg2 = CALL_EXPR_ARG (exp, 2);
      arg3 = CALL_EXPR_ARG (exp, 3);
      arg4 = CALL_EXPR_ARG (exp, 4);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      op3 = expand_normal (arg3);
      op4 = expand_normal (arg4);
      mode0 = insn_data[icode].operand[0].mode;
      mode1 = insn_data[icode].operand[1].mode;
      mode3 = insn_data[icode].operand[3].mode;
      mode4 = insn_data[icode].operand[4].mode;

      op0 = fixup_modeless_constant (op0, mode0);

      if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
	{
	  if (!insn_data[icode].operand[0].predicate (op0, mode0))
	    op0 = copy_to_mode_reg (mode0, op0);
	}
      else
	{
	  op0 = copy_to_reg (op0);
	  op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
	}

      if (!insn_data[icode].operand[1].predicate (op1, mode1))
	op1 = copy_to_mode_reg (mode1, op1);

      /* Force memory operand only with base register here.  But we
	 don't want to do it on memory operand for other builtin
	 functions.  */
      op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));

      if (!insn_data[icode].operand[2].predicate (op2, Pmode))
	op2 = copy_to_mode_reg (Pmode, op2);

      if (!insn_data[icode].operand[3].predicate (op3, mode3))
	{
	  error ("the fourth argument must be scale 1, 2, 4, 8");
	  return const0_rtx;
	}

      if (!insn_data[icode].operand[4].predicate (op4, mode4))
	{
	  error ("incorrect hint operand");
	  return const0_rtx;
	}

      pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
      if (! pat)
	return const0_rtx;

      emit_insn (pat);
      return 0;

    case IX86_BUILTIN_XABORT:
      icode = CODE_FOR_xabort;
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      mode0 = insn_data[icode].operand[0].mode;
      if (!insn_data[icode].operand[0].predicate (op0, mode0))
	{
	  error ("the argument to %<xabort%> intrinsic must "
		 "be an 8-bit immediate");
	  return const0_rtx;
	}
      emit_insn (gen_xabort (op0));
      return 0;

    case IX86_BUILTIN_RDSSPD:
    case IX86_BUILTIN_RDSSPQ:
      mode = (fcode == IX86_BUILTIN_RDSSPD ? SImode : DImode);

      if (target == 0
	  || !register_operand (target, mode))
	target = gen_reg_rtx (mode);

      op0 = force_reg (mode, const0_rtx);

      emit_insn (gen_rdssp (mode, target, op0));
      return target;

    case IX86_BUILTIN_INCSSPD:
    case IX86_BUILTIN_INCSSPQ:
      mode = (fcode == IX86_BUILTIN_INCSSPD ? SImode : DImode);

      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);

      op0 = force_reg (mode, op0);

      emit_insn (gen_incssp (mode, op0));
      return 0;

    case IX86_BUILTIN_HRESET:
      icode = CODE_FOR_hreset;
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      op0 = force_reg (SImode, op0);
      emit_insn (gen_hreset (op0));
      return 0;

    case IX86_BUILTIN_RSTORSSP:
    case IX86_BUILTIN_CLRSSBSY:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      icode = (fcode == IX86_BUILTIN_RSTORSSP
	       ? CODE_FOR_rstorssp
	       : CODE_FOR_clrssbsy);

      if (!address_operand (op0, VOIDmode))
	{
	  op0 = convert_memory_address (Pmode, op0);
	  op0 = copy_addr_to_reg (op0);
	}
      emit_insn (GEN_FCN (icode) (gen_rtx_MEM (DImode, op0)));
      return 0;

    case IX86_BUILTIN_WRSSD:
    case IX86_BUILTIN_WRSSQ:
    case IX86_BUILTIN_WRUSSD:
    case IX86_BUILTIN_WRUSSQ:
      mode = ((fcode == IX86_BUILTIN_WRSSD
	       || fcode == IX86_BUILTIN_WRUSSD)
	      ? SImode : DImode);

      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      op1 = expand_normal (arg1);

      op0 = force_reg (mode, op0);

      if (!address_operand (op1, VOIDmode))
	{
	  op1 = convert_memory_address (Pmode, op1);
	  op1 = copy_addr_to_reg (op1);
	}
      op1 = gen_rtx_MEM (mode, op1);

      icode = ((fcode == IX86_BUILTIN_WRSSD
		|| fcode == IX86_BUILTIN_WRSSQ)
	       ? code_for_wrss (mode)
	       : code_for_wruss (mode));
      emit_insn (GEN_FCN (icode) (op0, op1));

      return 0;

    default:
      break;
    }

  if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
      && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
    {
      i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
      return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
					       target);
    }

  if (fcode >= IX86_BUILTIN__BDESC_PURE_ARGS_FIRST
      && fcode <= IX86_BUILTIN__BDESC_PURE_ARGS_LAST)
    {
      i = fcode - IX86_BUILTIN__BDESC_PURE_ARGS_FIRST;
      return ix86_expand_special_args_builtin (bdesc_pure_args + i, exp,
					       target);
    }

  if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
      && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
    {
      i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
      rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
      rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
      rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
      int masked = 1;
      machine_mode mode, wide_mode, nar_mode;

      nar_mode = V4SFmode;
      mode = V16SFmode;
      wide_mode = V64SFmode;
      fcn_mask = gen_avx5124fmaddps_4fmaddps_mask;
      fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;

      switch (fcode)
	{
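	/* The AVX512_4FMAPS/4VNNIW builtins consume four consecutive
	   vector operands; the expansions below first pack the four
	   arguments into one wide V64SF/V64SI pseudo, one 64-byte
	   chunk each, before emitting the multi-input insn.  */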
	case IX86_BUILTIN_4FMAPS:
	  fcn = gen_avx5124fmaddps_4fmaddps;
	  masked = 0;
	  goto v4fma_expand;

	case IX86_BUILTIN_4DPWSSD:
	  nar_mode = V4SImode;
	  mode = V16SImode;
	  wide_mode = V64SImode;
	  fcn = gen_avx5124vnniw_vp4dpwssd;
	  masked = 0;
	  goto v4fma_expand;

	case IX86_BUILTIN_4DPWSSDS:
	  nar_mode = V4SImode;
	  mode = V16SImode;
	  wide_mode = V64SImode;
	  fcn = gen_avx5124vnniw_vp4dpwssds;
	  masked = 0;
	  goto v4fma_expand;

	case IX86_BUILTIN_4FNMAPS:
	  fcn = gen_avx5124fmaddps_4fnmaddps;
	  masked = 0;
	  goto v4fma_expand;

	case IX86_BUILTIN_4FNMAPS_MASK:
	  fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask;
	  fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
	  goto v4fma_expand;

	case IX86_BUILTIN_4DPWSSD_MASK:
	  nar_mode = V4SImode;
	  mode = V16SImode;
	  wide_mode = V64SImode;
	  fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask;
	  fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
	  goto v4fma_expand;

	case IX86_BUILTIN_4DPWSSDS_MASK:
	  nar_mode = V4SImode;
	  mode = V16SImode;
	  wide_mode = V64SImode;
	  fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask;
	  fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
	  goto v4fma_expand;

	case IX86_BUILTIN_4FMAPS_MASK:
	  {
	  v4fma_expand:
	    wide_reg = gen_reg_rtx (wide_mode);
	    for (i = 0; i < 4; i++)
	      {
		args[i] = CALL_EXPR_ARG (exp, i);
		ops[i] = expand_normal (args[i]);

		emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
				ops[i]);
	      }

	    accum = expand_normal (CALL_EXPR_ARG (exp, 4));
	    accum = force_reg (mode, accum);

	    addr = expand_normal (CALL_EXPR_ARG (exp, 5));
	    addr = force_reg (Pmode, addr);

	    mem = gen_rtx_MEM (nar_mode, addr);

	    target = gen_reg_rtx (mode);

	    emit_move_insn (target, accum);

	    if (! masked)
	      emit_insn (fcn (target, accum, wide_reg, mem));
	    else
	      {
		rtx merge, mask;
		merge = expand_normal (CALL_EXPR_ARG (exp, 6));

		mask = expand_normal (CALL_EXPR_ARG (exp, 7));

		if (CONST_INT_P (mask))
		  mask = fixup_modeless_constant (mask, HImode);

		mask = force_reg (HImode, mask);

		if (GET_MODE (mask) != HImode)
		  mask = gen_rtx_SUBREG (HImode, mask, 0);

		/* If merge is 0 then we're about to emit z-masked variant.  */
		if (const0_operand (merge, mode))
		  emit_insn (fcn_maskz (target, accum, wide_reg, mem,
					merge, mask));
		/* If merge is the same as accum then emit merge-masked variant.  */
		else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
		  {
		    merge = force_reg (mode, merge);
		    emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
		  }
		/* Merge with something unknown might happen if we z-mask w/ -O0.  */
		else
		  {
		    target = gen_reg_rtx (mode);
		    emit_move_insn (target, merge);
		    emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
		  }
	      }
	    return target;
	  }

	case IX86_BUILTIN_4FNMASS:
	  fcn = gen_avx5124fmaddps_4fnmaddss;
	  masked = 0;
	  goto s4fma_expand;

	case IX86_BUILTIN_4FMASS:
	  fcn = gen_avx5124fmaddps_4fmaddss;
	  masked = 0;
	  goto s4fma_expand;

	case IX86_BUILTIN_4FNMASS_MASK:
	  fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
	  fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
	  goto s4fma_expand;

	case IX86_BUILTIN_4FMASS_MASK:
	  {
	    fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
	    fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;

	  s4fma_expand:
	    mode = V4SFmode;
	    wide_reg = gen_reg_rtx (V64SFmode);
	    for (i = 0; i < 4; i++)
	      {
		rtx tmp;
		args[i] = CALL_EXPR_ARG (exp, i);
		ops[i] = expand_normal (args[i]);

		tmp = gen_reg_rtx (SFmode);
		emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));

		emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
				gen_rtx_SUBREG (V16SFmode, tmp, 0));
	      }

	    accum = expand_normal (CALL_EXPR_ARG (exp, 4));
	    accum = force_reg (V4SFmode, accum);

	    addr = expand_normal (CALL_EXPR_ARG (exp, 5));
	    addr = force_reg (Pmode, addr);

	    mem = gen_rtx_MEM (V4SFmode, addr);

	    target = gen_reg_rtx (V4SFmode);

	    emit_move_insn (target, accum);

	    if (! masked)
	      emit_insn (fcn (target, accum, wide_reg, mem));
	    else
	      {
		rtx merge, mask;
		merge = expand_normal (CALL_EXPR_ARG (exp, 6));

		mask = expand_normal (CALL_EXPR_ARG (exp, 7));

		if (CONST_INT_P (mask))
		  mask = fixup_modeless_constant (mask, QImode);

		mask = force_reg (QImode, mask);

		if (GET_MODE (mask) != QImode)
		  mask = gen_rtx_SUBREG (QImode, mask, 0);

		/* If merge is 0 then we're about to emit z-masked variant.  */
		if (const0_operand (merge, mode))
		  emit_insn (fcn_maskz (target, accum, wide_reg, mem,
					merge, mask));
		/* If merge is the same as accum then emit merge-masked
		   variant.  */
		else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
		  {
		    merge = force_reg (mode, merge);
		    emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
		  }
		/* Merge with something unknown might happen if we z-mask
		   w/ -O0.  */
		else
		  {
		    target = gen_reg_rtx (mode);
		    emit_move_insn (target, merge);
		    emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
		  }
	      }
	    return target;
	  }

15308 case IX86_BUILTIN_RDPID
:
15309 return ix86_expand_special_args_builtin (bdesc_args
+ i
, exp
,
15311 case IX86_BUILTIN_FABSQ
:
15312 case IX86_BUILTIN_COPYSIGNQ
:
15314 /* Emit a normal call if SSE isn't available. */
15315 return expand_call (exp
, target
, ignore
);
15318 return ix86_expand_args_builtin (bdesc_args
+ i
, exp
, target
);
15322 if (fcode
>= IX86_BUILTIN__BDESC_COMI_FIRST
15323 && fcode
<= IX86_BUILTIN__BDESC_COMI_LAST
)
15325 i
= fcode
- IX86_BUILTIN__BDESC_COMI_FIRST
;
15326 return ix86_expand_sse_comi (bdesc_comi
+ i
, exp
, target
);
15329 if (fcode
>= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
15330 && fcode
<= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST
)
15332 i
= fcode
- IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
;
15333 return ix86_expand_round_builtin (bdesc_round_args
+ i
, exp
, target
);
15336 if (fcode
>= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
15337 && fcode
<= IX86_BUILTIN__BDESC_PCMPESTR_LAST
)
15339 i
= fcode
- IX86_BUILTIN__BDESC_PCMPESTR_FIRST
;
15340 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr
+ i
, exp
, target
);
15343 if (fcode
>= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
15344 && fcode
<= IX86_BUILTIN__BDESC_PCMPISTR_LAST
)
15346 i
= fcode
- IX86_BUILTIN__BDESC_PCMPISTR_FIRST
;
15347 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr
+ i
, exp
, target
);
15350 if (fcode
>= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
15351 && fcode
<= IX86_BUILTIN__BDESC_MULTI_ARG_LAST
)
15353 i
= fcode
- IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
;
15354 const struct builtin_description
*d
= bdesc_multi_arg
+ i
;
15355 return ix86_expand_multi_arg_builtin (d
->icode
, exp
, target
,
15356 (enum ix86_builtin_func_type
)
15357 d
->flag
, d
->comparison
);
15360 if (fcode
>= IX86_BUILTIN__BDESC_CET_FIRST
15361 && fcode
<= IX86_BUILTIN__BDESC_CET_LAST
)
15363 i
= fcode
- IX86_BUILTIN__BDESC_CET_FIRST
;
15364 return ix86_expand_special_args_builtin (bdesc_cet
+ i
, exp
,
15368 gcc_unreachable ();
/* A subroutine of ix86_expand_vector_init_duplicate.  Tries to
   fill target with val via vec_duplicate.  */

static bool
ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
{
  bool ok;
  rtx_insn *insn;
  rtx dup;
  /* Save/restore recog_data in case this is called from splitters
     or other routines where recog_data needs to stay valid across
     force_reg.  See PR106577.  */
  recog_data_d recog_data_save = recog_data;

  /* First attempt to recognize VAL as-is.  */
  dup = gen_vec_duplicate (mode, val);
  insn = emit_insn (gen_rtx_SET (target, dup));
  if (recog_memoized (insn) < 0)
    {
      rtx_insn *seq;
      machine_mode innermode = GET_MODE_INNER (mode);
      rtx reg;

      /* If that fails, force VAL into a register.  */

      start_sequence ();
      reg = force_reg (innermode, val);
      if (GET_MODE (reg) != innermode)
	reg = gen_lowpart (innermode, reg);
      SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
      seq = get_insns ();
      end_sequence ();
      if (seq)
	emit_insn_before (seq, insn);

      ok = recog_memoized (insn) >= 0;
      gcc_assert (ok);
    }
  recog_data = recog_data_save;
  return true;
}
/* Get a vector mode of the same size as the original but with elements
   twice as wide.  This is only guaranteed to apply to integral vectors.  */

static machine_mode
get_mode_wider_vector (machine_mode o)
{
  /* ??? Rely on the ordering that genmodes.cc gives to vectors.  */
  machine_mode n = GET_MODE_NEXT_MODE (o).require ();
  gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
  gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
  return n;
}
static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
/* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
   with all elements equal to VAR.  Return true if successful.  */

static bool
ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
				   rtx target, rtx val)
{
  bool ok;

  switch (mode)
    {
    case E_V2SImode:
    case E_V2SFmode:
      if (!mmx_ok)
	return false;
      /* FALLTHRU */

    case E_V4DFmode:
    case E_V4DImode:
    case E_V8SFmode:
    case E_V8SImode:
    case E_V2DFmode:
    case E_V2DImode:
    case E_V4SFmode:
    case E_V4SImode:
    case E_V16SImode:
    case E_V8DImode:
    case E_V16SFmode:
    case E_V8DFmode:
      return ix86_vector_duplicate_value (mode, target, val);

    case E_V4HImode:
      if (!mmx_ok)
	return false;
      if (TARGET_SSE || TARGET_3DNOW_A)
	{
	  rtx x;

	  val = gen_lowpart (SImode, val);
	  x = gen_rtx_TRUNCATE (HImode, val);
	  x = gen_rtx_VEC_DUPLICATE (mode, x);
	  emit_insn (gen_rtx_SET (target, x));
	  return true;
	}
      goto widen;

    case E_V2HImode:
      if (TARGET_SSE2)
	{
	  rtx x;

	  val = gen_lowpart (SImode, val);
	  x = gen_rtx_TRUNCATE (HImode, val);
	  x = gen_rtx_VEC_DUPLICATE (mode, x);
	  emit_insn (gen_rtx_SET (target, x));
	  return true;
	}
      return false;

    case E_V8QImode:
    case E_V4QImode:
      if (!mmx_ok)
	return false;
      goto widen;

    case E_V8HImode:
    case E_V8HFmode:
    case E_V8BFmode:
      if (TARGET_AVX2)
	return ix86_vector_duplicate_value (mode, target, val);

      if (TARGET_SSE2)
	{
	  struct expand_vec_perm_d dperm;
	  rtx tmp1, tmp2;

	permute:
	  memset (&dperm, 0, sizeof (dperm));
	  dperm.target = target;
	  dperm.vmode = mode;
	  dperm.nelt = GET_MODE_NUNITS (mode);
	  dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
	  dperm.one_operand_p = true;

	  if (mode == V8HFmode || mode == V8BFmode)
	    {
	      tmp1 = force_reg (GET_MODE_INNER (mode), val);
	      tmp2 = gen_reg_rtx (mode);
	      emit_insn (gen_vec_set_0 (mode, tmp2, CONST0_RTX (mode), tmp1));
	      tmp1 = gen_lowpart (mode, tmp2);
	    }
	  else
	    {
	      /* Extend to SImode using a paradoxical SUBREG.  */
	      tmp1 = gen_reg_rtx (SImode);
	      emit_move_insn (tmp1, gen_lowpart (SImode, val));

	      /* Insert the SImode value as
		 low element of a V4SImode vector.  */
	      tmp2 = gen_reg_rtx (V4SImode);
	      emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
	      tmp1 = gen_lowpart (mode, tmp2);
	    }

	  emit_move_insn (dperm.op0, tmp1);
	  ok = (expand_vec_perm_1 (&dperm)
		|| expand_vec_perm_broadcast_1 (&dperm));
	  gcc_assert (ok);
	  return ok;
	}
      goto widen;

    case E_V16QImode:
      if (TARGET_AVX2)
	return ix86_vector_duplicate_value (mode, target, val);

      if (TARGET_SSE2)
	goto permute;
      goto widen;

    widen:
      /* Replicate the value once into the next wider mode and recurse.  */
      {
	machine_mode smode, wsmode, wvmode;
	rtx x;

	smode = GET_MODE_INNER (mode);
	wvmode = get_mode_wider_vector (mode);
	wsmode = GET_MODE_INNER (wvmode);

	val = convert_modes (wsmode, smode, val, true);

	if (smode == QImode && !TARGET_PARTIAL_REG_STALL)
	  emit_insn (gen_insv_1 (wsmode, val, val));
	else
	  {
	    x = expand_simple_binop (wsmode, ASHIFT, val,
				     GEN_INT (GET_MODE_BITSIZE (smode)),
				     NULL_RTX, 1, OPTAB_LIB_WIDEN);
	    val = expand_simple_binop (wsmode, IOR, val, x, x, 1,
				       OPTAB_LIB_WIDEN);
	  }

	x = gen_reg_rtx (wvmode);
	ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
	gcc_assert (ok);
	emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
	return ok;
      }

    case E_V16HImode:
    case E_V16HFmode:
    case E_V16BFmode:
    case E_V32QImode:
      if (TARGET_AVX2)
	return ix86_vector_duplicate_value (mode, target, val);
      else
	{
	  machine_mode hvmode;
	  switch (mode)
	    {
	    case E_V16HImode:
	      hvmode = V8HImode;
	      break;
	    case E_V16HFmode:
	      hvmode = V8HFmode;
	      break;
	    case E_V16BFmode:
	      hvmode = V8BFmode;
	      break;
	    case E_V32QImode:
	      hvmode = V16QImode;
	      break;
	    default:
	      gcc_unreachable ();
	    }
	  rtx x = gen_reg_rtx (hvmode);

	  ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
	  gcc_assert (ok);

	  x = gen_rtx_VEC_CONCAT (mode, x, x);
	  emit_insn (gen_rtx_SET (target, x));
	}
      return true;

    case E_V64QImode:
    case E_V32HImode:
    case E_V32HFmode:
    case E_V32BFmode:
      if (TARGET_AVX512BW)
	return ix86_vector_duplicate_value (mode, target, val);
      else
	{
	  machine_mode hvmode;
	  switch (mode)
	    {
	    case E_V32HImode:
	      hvmode = V16HImode;
	      break;
	    case E_V32HFmode:
	      hvmode = V16HFmode;
	      break;
	    case E_V32BFmode:
	      hvmode = V16BFmode;
	      break;
	    case E_V64QImode:
	      hvmode = V32QImode;
	      break;
	    default:
	      gcc_unreachable ();
	    }
	  rtx x = gen_reg_rtx (hvmode);

	  ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
	  gcc_assert (ok);

	  x = gen_rtx_VEC_CONCAT (mode, x, x);
	  emit_insn (gen_rtx_SET (target, x));
	}
      return true;

    default:
      return false;
    }
}
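
/* A sketch of the "widen" strategy above (illustrative, QImode case
   without the insv path): the scalar is first replicated within a
   wider scalar,

     val16 = (val << 8) | val;	/* HImode */

   and the recursion then broadcasts val16 in the wider vector mode,
   so the original byte ends up duplicated in every lane.  */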
/* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
   whose ONE_VAR element is VAR, and other elements are zero.  Return true
   if successful.  */

static bool
ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
				     rtx target, rtx var, int one_var)
{
  machine_mode vsimode;
  rtx new_target;
  rtx x, tmp;
  bool use_vector_set = false;
  rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;

  switch (mode)
    {
    case E_V2DImode:
      /* For SSE4.1, we normally use vector set.  But if the second
	 element is zero and inter-unit moves are OK, we use movq
	 instead.  */
      use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
			&& !(TARGET_INTER_UNIT_MOVES_TO_VEC
			     && one_var == 0));
      break;
    case E_V16QImode:
    case E_V4SImode:
    case E_V4SFmode:
      use_vector_set = TARGET_SSE4_1;
      break;
    case E_V8HImode:
      use_vector_set = TARGET_SSE2;
      gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0
		      ? gen_vec_setv8hi_0 : NULL;
      break;
    case E_V8QImode:
      use_vector_set = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
      break;
    case E_V4HImode:
      use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
      break;
    case E_V4QImode:
      use_vector_set = TARGET_SSE4_1;
      break;
    case E_V32QImode:
      use_vector_set = TARGET_AVX;
      break;
    case E_V16HImode:
      use_vector_set = TARGET_AVX;
      gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0
		      ? gen_vec_setv16hi_0 : NULL;
      break;
    case E_V8SImode:
      use_vector_set = TARGET_AVX;
      gen_vec_set_0 = gen_vec_setv8si_0;
      break;
    case E_V8SFmode:
      use_vector_set = TARGET_AVX;
      gen_vec_set_0 = gen_vec_setv8sf_0;
      break;
    case E_V4DFmode:
      use_vector_set = TARGET_AVX;
      gen_vec_set_0 = gen_vec_setv4df_0;
      break;
    case E_V4DImode:
      /* Use ix86_expand_vector_set in 64bit mode only.  */
      use_vector_set = TARGET_AVX && TARGET_64BIT;
      gen_vec_set_0 = gen_vec_setv4di_0;
      break;
    case E_V16SImode:
      use_vector_set = TARGET_AVX512F && one_var == 0;
      gen_vec_set_0 = gen_vec_setv16si_0;
      break;
    case E_V16SFmode:
      use_vector_set = TARGET_AVX512F && one_var == 0;
      gen_vec_set_0 = gen_vec_setv16sf_0;
      break;
    case E_V8DFmode:
      use_vector_set = TARGET_AVX512F && one_var == 0;
      gen_vec_set_0 = gen_vec_setv8df_0;
      break;
    case E_V8DImode:
      /* Use ix86_expand_vector_set in 64bit mode only.  */
      use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
      gen_vec_set_0 = gen_vec_setv8di_0;
      break;
    case E_V8HFmode:
      use_vector_set = TARGET_AVX512FP16 && one_var == 0;
      gen_vec_set_0 = gen_vec_setv8hf_0;
      break;
    case E_V16HFmode:
      use_vector_set = TARGET_AVX512FP16 && one_var == 0;
      gen_vec_set_0 = gen_vec_setv16hf_0;
      break;
    case E_V32HFmode:
      use_vector_set = TARGET_AVX512FP16 && one_var == 0;
      gen_vec_set_0 = gen_vec_setv32hf_0;
      break;
    case E_V8BFmode:
      use_vector_set = TARGET_AVX512FP16 && one_var == 0;
      gen_vec_set_0 = gen_vec_setv8bf_0;
      break;
    case E_V16BFmode:
      use_vector_set = TARGET_AVX512FP16 && one_var == 0;
      gen_vec_set_0 = gen_vec_setv16bf_0;
      break;
    case E_V32BFmode:
      use_vector_set = TARGET_AVX512FP16 && one_var == 0;
      gen_vec_set_0 = gen_vec_setv32bf_0;
      break;
    case E_V32HImode:
      use_vector_set = TARGET_AVX512FP16 && one_var == 0;
      gen_vec_set_0 = gen_vec_setv32hi_0;
      break;
    default:
      break;
    }

  if (use_vector_set)
    {
      if (gen_vec_set_0 && one_var == 0)
	{
	  var = force_reg (GET_MODE_INNER (mode), var);
	  emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
	  return true;
	}
      emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
      var = force_reg (GET_MODE_INNER (mode), var);
      ix86_expand_vector_set (mmx_ok, target, var, one_var);
      return true;
    }

  switch (mode)
    {
    case E_V2SFmode:
    case E_V2SImode:
      if (!mmx_ok)
	return false;
      /* FALLTHRU */

    case E_V2DFmode:
    case E_V2DImode:
      if (one_var != 0)
	return false;
      var = force_reg (GET_MODE_INNER (mode), var);
      x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
      emit_insn (gen_rtx_SET (target, x));
      return true;

    case E_V4SFmode:
    case E_V4SImode:
      if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
	new_target = gen_reg_rtx (mode);
      else
	new_target = target;
      var = force_reg (GET_MODE_INNER (mode), var);
      x = gen_rtx_VEC_DUPLICATE (mode, var);
      x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
      emit_insn (gen_rtx_SET (new_target, x));
      if (one_var != 0)
	{
	  /* We need to shuffle the value to the correct position, so
	     create a new pseudo to store the intermediate result.  */

	  /* With SSE2, we can use the integer shuffle insns.  */
	  if (mode != V4SFmode && TARGET_SSE2)
	    {
	      emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
					    const1_rtx,
					    GEN_INT (one_var == 1 ? 0 : 1),
					    GEN_INT (one_var == 2 ? 0 : 1),
					    GEN_INT (one_var == 3 ? 0 : 1)));
	      if (target != new_target)
		emit_move_insn (target, new_target);
	      return true;
	    }

	  /* Otherwise convert the intermediate result to V4SFmode and
	     use the SSE1 shuffle instructions.  */
	  if (mode != V4SFmode)
	    {
	      tmp = gen_reg_rtx (V4SFmode);
	      emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
	    }
	  else
	    tmp = new_target;

	  emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
					  const1_rtx,
					  GEN_INT (one_var == 1 ? 0 : 1),
					  GEN_INT (one_var == 2 ? 0+4 : 1+4),
					  GEN_INT (one_var == 3 ? 0+4 : 1+4)));

	  if (mode != V4SFmode)
	    emit_move_insn (target, gen_lowpart (V4SImode, tmp));
	  else if (tmp != target)
	    emit_move_insn (target, tmp);
	}
      else if (target != new_target)
	emit_move_insn (target, new_target);
      return true;

    case E_V8HImode:
    case E_V16QImode:
      vsimode = V4SImode;
      goto widen;
    case E_V4HImode:
    case E_V8QImode:
      if (!mmx_ok)
	return false;
      vsimode = V2SImode;
      goto widen;
    widen:
      if (one_var != 0)
	return false;

      /* Zero extend the variable element to SImode and recurse.  */
      var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);

      x = gen_reg_rtx (vsimode);
      if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
						var, one_var))
	gcc_unreachable ();

      emit_move_insn (target, gen_lowpart (mode, x));
      return true;

    default:
      return false;
    }
}
/* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
   consisting of the values in VALS.  It is known that all elements
   except ONE_VAR are constants.  Return true if successful.  */

static bool
ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
				 rtx target, rtx vals, int one_var)
{
  rtx var = XVECEXP (vals, 0, one_var);
  machine_mode wmode;
  rtx const_vec, x;

  const_vec = copy_rtx (vals);
  XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
  const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));

  switch (mode)
    {
    case E_V2DFmode:
    case E_V2DImode:
    case E_V2SFmode:
    case E_V2SImode:
      /* For the two element vectors, it's just as easy to use
	 the general case.  */
      return false;

    case E_V4DImode:
      /* Use ix86_expand_vector_set in 64bit mode only.  */
      if (!TARGET_64BIT)
	return false;
      /* FALLTHRU */
    case E_V8HFmode:
    case E_V16HFmode:
    case E_V8BFmode:
    case E_V16BFmode:
    case E_V4DFmode:
    case E_V8SFmode:
    case E_V8SImode:
    case E_V16HImode:
    case E_V32QImode:
    case E_V4SFmode:
    case E_V4SImode:
    case E_V8HImode:
    case E_V4HImode:
      break;

    case E_V16QImode:
      if (TARGET_SSE4_1)
	break;
      wmode = V8HImode;
      goto widen;

    case E_V8QImode:
      if (TARGET_MMX_WITH_SSE && TARGET_SSE4_1)
	break;
      wmode = V4HImode;
      goto widen;

    case E_V4QImode:
      wmode = V2HImode;
    widen:
      /* There's no way to set one QImode entry easily.  Combine
	 the variable value with its adjacent constant value, and
	 promote to an HImode set.  */
      x = XVECEXP (vals, 0, one_var ^ 1);
      if (one_var & 1)
	{
	  var = convert_modes (HImode, QImode, var, true);
	  var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
				     NULL_RTX, 1, OPTAB_LIB_WIDEN);
	  x = GEN_INT (INTVAL (x) & 0xff);
	}
      else
	{
	  var = convert_modes (HImode, QImode, var, true);
	  x = gen_int_mode (UINTVAL (x) << 8, HImode);
	}
      if (x != const0_rtx)
	var = expand_simple_binop (HImode, IOR, var, x, var,
				   1, OPTAB_LIB_WIDEN);

      x = gen_reg_rtx (wmode);
      emit_move_insn (x, gen_lowpart (wmode, const_vec));
      ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);

      emit_move_insn (target, gen_lowpart (mode, x));
      return true;

    default:
      return false;
    }

  emit_move_insn (target, const_vec);
  ix86_expand_vector_set (mmx_ok, target, var, one_var);
  return true;
}
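
/* A sketch of the QImode combining trick above (illustrative): for a
   byte vector with ONE_VAR == 2 and constant neighbour C3 at index 3,

     hi16 = (C3 << 8) | zero_extend (var);	/* HImode */

   is formed and then set into the HImode view of the constant-pool
   copy at index ONE_VAR >> 1 == 1.  */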
/* A subroutine of ix86_expand_vector_init_general.  Use vector
   concatenate to handle the most general case: all values variable,
   and none identical.  */

static void
ix86_expand_vector_init_concat (machine_mode mode,
				rtx target, rtx *ops, int n)
{
  machine_mode half_mode = VOIDmode;
  rtx half[2];
  rtvec v;
  int i, j;

  switch (n)
    {
    case 2:
      switch (mode)
	{
	case E_V32HFmode:
	  half_mode = V16HFmode;
	  break;
	case E_V32BFmode:
	  half_mode = V16BFmode;
	  break;
	case E_V16SImode:
	  half_mode = V8SImode;
	  break;
	case E_V16SFmode:
	  half_mode = V8SFmode;
	  break;
	case E_V8DImode:
	  half_mode = V4DImode;
	  break;
	case E_V8DFmode:
	  half_mode = V4DFmode;
	  break;
	case E_V16HFmode:
	  half_mode = V8HFmode;
	  break;
	case E_V16BFmode:
	  half_mode = V8BFmode;
	  break;
	case E_V8SImode:
	  half_mode = V4SImode;
	  break;
	case E_V8SFmode:
	  half_mode = V4SFmode;
	  break;
	case E_V4DImode:
	  half_mode = V2DImode;
	  break;
	case E_V4DFmode:
	  half_mode = V2DFmode;
	  break;
	case E_V4SImode:
	  half_mode = V2SImode;
	  break;
	case E_V4SFmode:
	  half_mode = V2SFmode;
	  break;
	case E_V2DImode:
	  half_mode = DImode;
	  break;
	case E_V2SImode:
	  half_mode = SImode;
	  break;
	case E_V2DFmode:
	  half_mode = DFmode;
	  break;
	case E_V2SFmode:
	  half_mode = SFmode;
	  break;
	default:
	  gcc_unreachable ();
	}

      if (!register_operand (ops[1], half_mode))
	ops[1] = force_reg (half_mode, ops[1]);
      if (!register_operand (ops[0], half_mode))
	ops[0] = force_reg (half_mode, ops[0]);
      emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
							  ops[1])));
      break;

    case 4:
      switch (mode)
	{
	case E_V4DImode:
	  half_mode = V2DImode;
	  break;
	case E_V4DFmode:
	  half_mode = V2DFmode;
	  break;
	case E_V4SImode:
	  half_mode = V2SImode;
	  break;
	case E_V4SFmode:
	  half_mode = V2SFmode;
	  break;
	default:
	  gcc_unreachable ();
	}
      goto half;

    case 8:
      switch (mode)
	{
	case E_V8DImode:
	  half_mode = V4DImode;
	  break;
	case E_V8DFmode:
	  half_mode = V4DFmode;
	  break;
	case E_V8SImode:
	  half_mode = V4SImode;
	  break;
	case E_V8SFmode:
	  half_mode = V4SFmode;
	  break;
	default:
	  gcc_unreachable ();
	}
      goto half;

    case 16:
      switch (mode)
	{
	case E_V16SImode:
	  half_mode = V8SImode;
	  break;
	case E_V16SFmode:
	  half_mode = V8SFmode;
	  break;
	default:
	  gcc_unreachable ();
	}
      goto half;

half:
      /* FIXME: We process inputs backward to help RA.  PR 36222.  */
      i = n - 1;
      for (j = 1; j != -1; j--)
	{
	  half[j] = gen_reg_rtx (half_mode);
	  switch (n >> 1)
	    {
	    case 2:
	      v = gen_rtvec (2, ops[i-1], ops[i]);
	      i -= 2;
	      break;
	    case 4:
	      v = gen_rtvec (4, ops[i-3], ops[i-2], ops[i-1], ops[i]);
	      i -= 4;
	      break;
	    case 8:
	      v = gen_rtvec (8, ops[i-7], ops[i-6], ops[i-5], ops[i-4],
			     ops[i-3], ops[i-2], ops[i-1], ops[i]);
	      i -= 8;
	      break;
	    default:
	      gcc_unreachable ();
	    }
	  ix86_expand_vector_init (false, half[j],
				   gen_rtx_PARALLEL (half_mode, v));
	}

      ix86_expand_vector_init_concat (mode, target, half, 2);
      break;

    default:
      gcc_unreachable ();
    }
}
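
/* The recursion above halves N each step.  For example (illustrative),
   eight variable DFmode values destined for V8DFmode are first built
   as two V4DFmode halves -- each of those in turn from V2DFmode pairs
   -- and joined with a final VEC_CONCAT.  */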
/* A subroutine of ix86_expand_vector_init_general.  Use vector
   interleave to handle the most general case: all values variable,
   and none identical.  */

static void
ix86_expand_vector_init_interleave (machine_mode mode,
				    rtx target, rtx *ops, int n)
{
  machine_mode first_imode, second_imode, third_imode, inner_mode;
  int i, j;
  rtx op, op0, op1;
  rtx (*gen_load_even) (rtx, rtx, rtx);
  rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
  rtx (*gen_interleave_second_low) (rtx, rtx, rtx);

  switch (mode)
    {
    case E_V8HFmode:
      gen_load_even = gen_vec_interleave_lowv8hf;
      gen_interleave_first_low = gen_vec_interleave_lowv4si;
      gen_interleave_second_low = gen_vec_interleave_lowv2di;
      inner_mode = HFmode;
      first_imode = V4SImode;
      second_imode = V2DImode;
      third_imode = VOIDmode;
      break;
    case E_V8BFmode:
      gen_load_even = gen_vec_interleave_lowv8bf;
      gen_interleave_first_low = gen_vec_interleave_lowv4si;
      gen_interleave_second_low = gen_vec_interleave_lowv2di;
      inner_mode = BFmode;
      first_imode = V4SImode;
      second_imode = V2DImode;
      third_imode = VOIDmode;
      break;
    case E_V8HImode:
      gen_load_even = gen_vec_setv8hi;
      gen_interleave_first_low = gen_vec_interleave_lowv4si;
      gen_interleave_second_low = gen_vec_interleave_lowv2di;
      inner_mode = HImode;
      first_imode = V4SImode;
      second_imode = V2DImode;
      third_imode = VOIDmode;
      break;
    case E_V16QImode:
      gen_load_even = gen_vec_setv16qi;
      gen_interleave_first_low = gen_vec_interleave_lowv8hi;
      gen_interleave_second_low = gen_vec_interleave_lowv4si;
      inner_mode = QImode;
      first_imode = V8HImode;
      second_imode = V4SImode;
      third_imode = V2DImode;
      break;
    default:
      gcc_unreachable ();
    }

  for (i = 0; i < n; i++)
    {
      op = ops [i + i];
      if (inner_mode == HFmode || inner_mode == BFmode)
	{
	  rtx even, odd;
	  /* Use vpuncklwd to pack 2 HFmode or BFmode.  */
	  machine_mode vec_mode =
	    (inner_mode == HFmode) ? V8HFmode : V8BFmode;
	  op0 = gen_reg_rtx (vec_mode);
	  even = lowpart_subreg (vec_mode,
				 force_reg (inner_mode, op), inner_mode);
	  odd = lowpart_subreg (vec_mode,
				force_reg (inner_mode, ops [i + i + 1]),
				inner_mode);
	  emit_insn (gen_load_even (op0, even, odd));
	}
      else
	{
	  /* Extend the odd element to SImode using a paradoxical SUBREG.  */
	  op0 = gen_reg_rtx (SImode);
	  emit_move_insn (op0, gen_lowpart (SImode, op));

	  /* Insert the SImode value as low element of V4SImode vector.  */
	  op1 = gen_reg_rtx (V4SImode);
	  op0 = gen_rtx_VEC_MERGE (V4SImode,
				   gen_rtx_VEC_DUPLICATE (V4SImode,
							  op0),
				   CONST0_RTX (V4SImode),
				   const1_rtx);
	  emit_insn (gen_rtx_SET (op1, op0));

	  /* Cast the V4SImode vector back to a vector in original mode.  */
	  op0 = gen_reg_rtx (mode);
	  emit_move_insn (op0, gen_lowpart (mode, op1));

	  /* Load even elements into the second position.  */
	  emit_insn (gen_load_even (op0,
				    force_reg (inner_mode,
					       ops [i + i + 1]),
				    const1_rtx));
	}

      /* Cast vector to FIRST_IMODE vector.  */
      ops[i] = gen_reg_rtx (first_imode);
      emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
    }

  /* Interleave low FIRST_IMODE vectors.  */
  for (i = j = 0; i < n; i += 2, j++)
    {
      op0 = gen_reg_rtx (first_imode);
      emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));

      /* Cast FIRST_IMODE vector to SECOND_IMODE vector.  */
      ops[j] = gen_reg_rtx (second_imode);
      emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
    }

  /* Interleave low SECOND_IMODE vectors.  */
  switch (second_imode)
    {
    case E_V4SImode:
      for (i = j = 0; i < n / 2; i += 2, j++)
	{
	  op0 = gen_reg_rtx (second_imode);
	  emit_insn (gen_interleave_second_low (op0, ops[i],
						ops[i + 1]));

	  /* Cast the SECOND_IMODE vector to the THIRD_IMODE
	     vector.  */
	  ops[j] = gen_reg_rtx (third_imode);
	  emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
	}
      second_imode = V2DImode;
      gen_interleave_second_low = gen_vec_interleave_lowv2di;
      /* FALLTHRU */

    case E_V2DImode:
      op0 = gen_reg_rtx (second_imode);
      emit_insn (gen_interleave_second_low (op0, ops[0],
					    ops[1]));

      /* Cast the SECOND_IMODE vector back to a vector on original
	 mode.  */
      emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
      break;

    default:
      gcc_unreachable ();
    }
}
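
/* A rough picture of the interleave tree above for V16QImode (n == 8):
   the sixteen bytes are first packed as eight element pairs, those
   pairs are interleaved as V8HImode, then V4SImode, then V2DImode,
   doubling the run of consecutive source elements at each level until
   the whole vector is assembled.  */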
/* A subroutine of ix86_expand_vector_init.  Handle the most general case:
   all values variable, and none identical.  */

static void
ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
				 rtx target, rtx vals)
{
  rtx ops[64], op0, op1, op2, op3, op4, op5;
  machine_mode half_mode = VOIDmode;
  machine_mode quarter_mode = VOIDmode;
  int n, i;

  switch (mode)
    {
    case E_V2SFmode:
    case E_V2SImode:
      if (!mmx_ok && !TARGET_SSE)
	break;
      /* FALLTHRU */

    case E_V16SImode:
    case E_V16SFmode:
    case E_V8DFmode:
    case E_V8DImode:
    case E_V8SFmode:
    case E_V8SImode:
    case E_V4DFmode:
    case E_V4DImode:
    case E_V4SFmode:
    case E_V4SImode:
    case E_V2DFmode:
    case E_V2DImode:
      n = GET_MODE_NUNITS (mode);
      for (i = 0; i < n; i++)
	ops[i] = XVECEXP (vals, 0, i);
      ix86_expand_vector_init_concat (mode, target, ops, n);
      return;

    case E_V2TImode:
      for (i = 0; i < 2; i++)
	ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
      op0 = gen_reg_rtx (V4DImode);
      ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
      emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
      return;

    case E_V4TImode:
      for (i = 0; i < 4; i++)
	ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
      ops[4] = gen_reg_rtx (V4DImode);
      ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
      ops[5] = gen_reg_rtx (V4DImode);
      ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
      op0 = gen_reg_rtx (V8DImode);
      ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
      emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
      return;

    case E_V32QImode:
      half_mode = V16QImode;
      goto half;

    case E_V16HImode:
      half_mode = V8HImode;
      goto half;

    case E_V16HFmode:
      half_mode = V8HFmode;
      goto half;

    case E_V16BFmode:
      half_mode = V8BFmode;
      goto half;

half:
      n = GET_MODE_NUNITS (mode);
      for (i = 0; i < n; i++)
	ops[i] = XVECEXP (vals, 0, i);
      op0 = gen_reg_rtx (half_mode);
      op1 = gen_reg_rtx (half_mode);
      ix86_expand_vector_init_interleave (half_mode, op0, ops,
					  n >> 2);
      ix86_expand_vector_init_interleave (half_mode, op1,
					  &ops [n >> 1], n >> 2);
      emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
      return;

    case E_V64QImode:
      quarter_mode = V16QImode;
      half_mode = V32QImode;
      goto quarter;

    case E_V32HImode:
      quarter_mode = V8HImode;
      half_mode = V16HImode;
      goto quarter;

    case E_V32HFmode:
      quarter_mode = V8HFmode;
      half_mode = V16HFmode;
      goto quarter;

    case E_V32BFmode:
      quarter_mode = V8BFmode;
      half_mode = V16BFmode;
      goto quarter;

quarter:
      n = GET_MODE_NUNITS (mode);
      for (i = 0; i < n; i++)
	ops[i] = XVECEXP (vals, 0, i);
      op0 = gen_reg_rtx (quarter_mode);
      op1 = gen_reg_rtx (quarter_mode);
      op2 = gen_reg_rtx (quarter_mode);
      op3 = gen_reg_rtx (quarter_mode);
      op4 = gen_reg_rtx (half_mode);
      op5 = gen_reg_rtx (half_mode);
      ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
					  n >> 3);
      ix86_expand_vector_init_interleave (quarter_mode, op1,
					  &ops [n >> 2], n >> 3);
      ix86_expand_vector_init_interleave (quarter_mode, op2,
					  &ops [n >> 1], n >> 3);
      ix86_expand_vector_init_interleave (quarter_mode, op3,
					  &ops [(n >> 1) | (n >> 2)], n >> 3);
      emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
      emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
      emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
      return;

    case E_V16QImode:
      if (!TARGET_SSE4_1)
	break;
      /* FALLTHRU */

    case E_V8HImode:
      if (!TARGET_SSE2)
	break;

      /* Don't use ix86_expand_vector_init_interleave if we can't
	 move from GPR to SSE register directly.  */
      if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
	break;
      /* FALLTHRU */

    case E_V8HFmode:
    case E_V8BFmode:

      n = GET_MODE_NUNITS (mode);
      for (i = 0; i < n; i++)
	ops[i] = XVECEXP (vals, 0, i);
      ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
      return;

    case E_V4HImode:
    case E_V8QImode:
    case E_V2HImode:
    case E_V4QImode:
      break;

    default:
      gcc_unreachable ();
    }

    {
      int i, j, n_elts, n_words, n_elt_per_word;
      machine_mode tmp_mode, inner_mode;
      rtx words[4], shift;

      tmp_mode = (GET_MODE_SIZE (mode) < UNITS_PER_WORD) ? SImode : word_mode;

      inner_mode = GET_MODE_INNER (mode);
      n_elts = GET_MODE_NUNITS (mode);
      n_words = GET_MODE_SIZE (mode) / GET_MODE_SIZE (tmp_mode);
      n_elt_per_word = n_elts / n_words;
      shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));

      for (i = 0; i < n_words; ++i)
	{
	  rtx word = NULL_RTX;

	  for (j = 0; j < n_elt_per_word; ++j)
	    {
	      rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
	      elt = convert_modes (tmp_mode, inner_mode, elt, true);

	      if (j == 0)
		word = elt;
	      else
		{
		  word = expand_simple_binop (tmp_mode, ASHIFT, word, shift,
					      NULL_RTX, 1, OPTAB_LIB_WIDEN);
		  word = expand_simple_binop (tmp_mode, IOR, word, elt,
					      NULL_RTX, 1, OPTAB_LIB_WIDEN);
		}
	    }

	  words[i] = word;
	}

      if (n_words == 1)
	emit_move_insn (target, gen_lowpart (mode, words[0]));
      else if (n_words == 2)
	{
	  gcc_assert (tmp_mode == DImode || tmp_mode == SImode);
	  machine_mode concat_mode = tmp_mode == DImode ? V2DImode : V2SImode;
	  rtx tmp = gen_reg_rtx (concat_mode);
	  vals = gen_rtx_PARALLEL (concat_mode, gen_rtvec_v (2, words));
	  ix86_expand_vector_init_general (mmx_ok, concat_mode, tmp, vals);
	  emit_move_insn (target, gen_lowpart (mode, tmp));
	}
      else if (n_words == 4)
	{
	  rtx tmp = gen_reg_rtx (V4SImode);
	  gcc_assert (tmp_mode == SImode);
	  vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
	  ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
	  emit_move_insn (target, gen_lowpart (mode, tmp));
	}
      else
	gcc_unreachable ();
    }
}
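
/* A sketch of the word-building fallback above for V4QImode
   {a, b, c, d}: the shift/IOR loop consumes elements last-first,
   producing

     word = a | (b << 8) | (c << 16) | (d << 24);

   which is then moved into the vector register via gen_lowpart.  */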
/* Initialize vector TARGET via VALS.  Suppress the use of MMX
   instructions unless MMX_OK is true.  */

void
ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
{
  machine_mode mode = GET_MODE (target);
  machine_mode inner_mode = GET_MODE_INNER (mode);
  int n_elts = GET_MODE_NUNITS (mode);
  int n_var = 0, one_var = -1;
  bool all_same = true, all_const_zero = true;
  int i;
  rtx x;

  /* Handle first initialization from vector elts.  */
  if (n_elts != XVECLEN (vals, 0))
    {
      rtx subtarget = target;
      x = XVECEXP (vals, 0, 0);
      gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
      if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
	{
	  rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
	  if (inner_mode == QImode
	      || inner_mode == HImode
	      || inner_mode == TImode
	      || inner_mode == HFmode
	      || inner_mode == BFmode)
	    {
	      unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
	      scalar_mode elt_mode = inner_mode == TImode ? DImode : SImode;
	      n_bits /= GET_MODE_SIZE (elt_mode);
	      mode = mode_for_vector (elt_mode, n_bits).require ();
	      inner_mode = mode_for_vector (elt_mode, n_bits / 2).require ();
	      ops[0] = gen_lowpart (inner_mode, ops[0]);
	      ops[1] = gen_lowpart (inner_mode, ops[1]);
	      subtarget = gen_reg_rtx (mode);
	    }
	  ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
	  if (subtarget != target)
	    emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
	  return;
	}
      gcc_unreachable ();
    }

  for (i = 0; i < n_elts; ++i)
    {
      x = XVECEXP (vals, 0, i);
      if (!(CONST_SCALAR_INT_P (x)
	    || CONST_DOUBLE_P (x)
	    || CONST_FIXED_P (x)))
	n_var++, one_var = i;
      else if (x != CONST0_RTX (inner_mode))
	all_const_zero = false;
      if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
	all_same = false;
    }

  /* Constants are best loaded from the constant pool.  */
  if (n_var == 0)
    {
      emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
      return;
    }

  /* If all values are identical, broadcast the value.  */
  if (all_same
      && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
					    XVECEXP (vals, 0, 0)))
    return;

  /* Values where only one field is non-constant are best loaded from
     the pool and overwritten via move later.  */
  if (n_var == 1)
    {
      if (all_const_zero
	  && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
						  XVECEXP (vals, 0, one_var),
						  one_var))
	return;

      if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
	return;
    }

  ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
}
/* Implemented as
   V setg (V v, int idx, T val)
   {
     V idxv = (V){idx, idx, idx, idx, idx, idx, idx, idx};
     V valv = (V){val, val, val, val, val, val, val, val};
     V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == idxv);
     v = (v & ~mask) | (valv & mask);
     return v;
   }.  */

void
ix86_expand_vector_set_var (rtx target, rtx val, rtx idx)
{
  rtx vec[64];
  machine_mode mode = GET_MODE (target);
  machine_mode cmp_mode = mode;
  int n_elts = GET_MODE_NUNITS (mode);
  rtx valv, idxv, constv, idx_tmp;
  bool ok = false;

  /* 512-bit vector byte/word broadcast and comparison are only available
     under TARGET_AVX512BW; without it, split the 512-bit vector into two
     256-bit vectors.  */
  if ((mode == V32HImode || mode == V32HFmode || mode == V32BFmode
       || mode == V64QImode)
      && !TARGET_AVX512BW)
    {
      gcc_assert (TARGET_AVX512F);
      rtx vhi, vlo, idx_hi;
      machine_mode half_mode;
      rtx (*extract_hi)(rtx, rtx);
      rtx (*extract_lo)(rtx, rtx);

      if (mode == V32HImode)
	{
	  half_mode = V16HImode;
	  extract_hi = gen_vec_extract_hi_v32hi;
	  extract_lo = gen_vec_extract_lo_v32hi;
	}
      else if (mode == V32HFmode)
	{
	  half_mode = V16HFmode;
	  extract_hi = gen_vec_extract_hi_v32hf;
	  extract_lo = gen_vec_extract_lo_v32hf;
	}
      else if (mode == V32BFmode)
	{
	  half_mode = V16BFmode;
	  extract_hi = gen_vec_extract_hi_v32bf;
	  extract_lo = gen_vec_extract_lo_v32bf;
	}
      else
	{
	  half_mode = V32QImode;
	  extract_hi = gen_vec_extract_hi_v64qi;
	  extract_lo = gen_vec_extract_lo_v64qi;
	}

      vhi = gen_reg_rtx (half_mode);
      vlo = gen_reg_rtx (half_mode);
      idx_hi = gen_reg_rtx (GET_MODE (idx));
      emit_insn (extract_hi (vhi, target));
      emit_insn (extract_lo (vlo, target));
      vec[0] = idx_hi;
      vec[1] = idx;
      vec[2] = GEN_INT (n_elts/2);
      ix86_expand_binary_operator (MINUS, GET_MODE (idx), vec);
      ix86_expand_vector_set_var (vhi, val, idx_hi);
      ix86_expand_vector_set_var (vlo, val, idx);
      emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, vlo, vhi)));
      return;
    }

  if (FLOAT_MODE_P (GET_MODE_INNER (mode)))
    {
      switch (mode)
	{
	case E_V2DFmode:
	  cmp_mode = V2DImode;
	  break;
	case E_V4DFmode:
	  cmp_mode = V4DImode;
	  break;
	case E_V8DFmode:
	  cmp_mode = V8DImode;
	  break;
	case E_V2SFmode:
	  cmp_mode = V2SImode;
	  break;
	case E_V4SFmode:
	  cmp_mode = V4SImode;
	  break;
	case E_V8SFmode:
	  cmp_mode = V8SImode;
	  break;
	case E_V16SFmode:
	  cmp_mode = V16SImode;
	  break;
	case E_V8HFmode:
	  cmp_mode = V8HImode;
	  break;
	case E_V16HFmode:
	  cmp_mode = V16HImode;
	  break;
	case E_V32HFmode:
	  cmp_mode = V32HImode;
	  break;
	case E_V8BFmode:
	  cmp_mode = V8HImode;
	  break;
	case E_V16BFmode:
	  cmp_mode = V16HImode;
	  break;
	case E_V32BFmode:
	  cmp_mode = V32HImode;
	  break;
	default:
	  gcc_unreachable ();
	}
    }

  for (int i = 0; i != n_elts; i++)
    vec[i] = GEN_INT (i);
  constv = gen_rtx_CONST_VECTOR (cmp_mode, gen_rtvec_v (n_elts, vec));
  valv = gen_reg_rtx (mode);
  idxv = gen_reg_rtx (cmp_mode);
  idx_tmp = convert_to_mode (GET_MODE_INNER (cmp_mode), idx, 1);

  ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE,
					  mode, valv, val);
  gcc_assert (ok);
  ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE,
					  cmp_mode, idxv, idx_tmp);
  gcc_assert (ok);
  vec[0] = target;
  vec[1] = valv;
  vec[2] = target;
  vec[3] = gen_rtx_EQ (mode, idxv, constv);
  vec[4] = idxv;
  vec[5] = constv;
  ok = ix86_expand_int_vcond (vec);
  gcc_assert (ok);
}
void
ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
{
  machine_mode mode = GET_MODE (target);
  machine_mode inner_mode = GET_MODE_INNER (mode);
  machine_mode half_mode;
  bool use_vec_merge = false;
  bool blendm_const = false;
  rtx tmp;
  int i, j, n;
  static rtx (*gen_extract[8][2]) (rtx, rtx)
    = {
	{ gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
	{ gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
	{ gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
	{ gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
	{ gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
	{ gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df },
	{ gen_vec_extract_lo_v16hf, gen_vec_extract_hi_v16hf },
	{ gen_vec_extract_lo_v16bf, gen_vec_extract_hi_v16bf }
      };
  static rtx (*gen_insert[8][2]) (rtx, rtx, rtx)
    = {
	{ gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
	{ gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
	{ gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
	{ gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
	{ gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
	{ gen_vec_set_lo_v4df, gen_vec_set_hi_v4df },
	{ gen_vec_set_lo_v16hf, gen_vec_set_hi_v16hf },
	{ gen_vec_set_lo_v16bf, gen_vec_set_hi_v16bf },
      };
  machine_mode mmode = VOIDmode;
  rtx (*gen_blendm) (rtx, rtx, rtx, rtx);

  switch (mode)
    {
    case E_V2SImode:
      use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
      if (use_vec_merge)
	break;
      /* FALLTHRU */

    case E_V2SFmode:
      if (mmx_ok)
	{
	  tmp = gen_reg_rtx (GET_MODE_INNER (mode));
	  ix86_expand_vector_extract (true, tmp, target, 1 - elt);
	  if (elt == 0)
	    tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
	  else
	    tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
	  emit_insn (gen_rtx_SET (target, tmp));
	  return;
	}
      break;

    case E_V2DImode:
      use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
      if (use_vec_merge)
	break;

      tmp = gen_reg_rtx (GET_MODE_INNER (mode));
      ix86_expand_vector_extract (false, tmp, target, 1 - elt);
      if (elt == 0)
	tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
      else
	tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
      emit_insn (gen_rtx_SET (target, tmp));
      return;

    case E_V2DFmode:
      /* NB: For ELT == 0, use standard scalar operation patterns which
	 preserve the rest of the vector for combiner:

	 (vec_merge:V2DF
	   (vec_duplicate:V2DF (reg:DF))
	   (reg:V2DF)
	   (const_int 1))
       */
      if (elt == 0)
	{
	  use_vec_merge = true;
	  break;
	}

      {
	rtx op0, op1;

	/* For the two element vectors, we implement a VEC_CONCAT with
	   the extraction of the other element.  */

	tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
	tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);

	if (elt == 0)
	  op0 = val, op1 = tmp;
	else
	  op0 = tmp, op1 = val;

	tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
	emit_insn (gen_rtx_SET (target, tmp));
      }
      return;

    case E_V4SFmode:
      use_vec_merge = TARGET_SSE4_1;
      if (use_vec_merge)
	break;

      switch (elt)
	{
	case 0:
	  use_vec_merge = true;
	  break;

	case 1:
	  /* tmp = target = A B C D */
	  tmp = copy_to_reg (target);
	  /* target = A A B B */
	  emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
	  /* target = X A B B */
	  ix86_expand_vector_set (false, target, val, 0);
	  /* target = A X C D  */
	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
					  const1_rtx, const0_rtx,
					  GEN_INT (2+4), GEN_INT (3+4)));
	  return;

	case 2:
	  /* tmp = target = A B C D */
	  tmp = copy_to_reg (target);
	  /* tmp = X B C D */
	  ix86_expand_vector_set (false, tmp, val, 0);
	  /* target = A B X D */
	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
					  const0_rtx, const1_rtx,
					  GEN_INT (0+4), GEN_INT (3+4)));
	  return;

	case 3:
	  /* tmp = target = A B C D */
	  tmp = copy_to_reg (target);
	  /* tmp = X B C D */
	  ix86_expand_vector_set (false, tmp, val, 0);
	  /* target = A B X D */
	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
					  const0_rtx, const1_rtx,
					  GEN_INT (2+4), GEN_INT (0+4)));
	  return;

	default:
	  gcc_unreachable ();
	}
      break;

    case E_V4SImode:
      use_vec_merge = TARGET_SSE4_1;
      if (use_vec_merge)
	break;

      /* Element 0 handled by vec_merge below.  */
      if (elt == 0)
	{
	  use_vec_merge = true;
	  break;
	}

      if (TARGET_SSE2)
	{
	  /* With SSE2, use integer shuffles to swap element 0 and ELT,
	     store into element 0, then shuffle them back.  */

	  rtx order[4];

	  order[0] = GEN_INT (elt);
	  order[1] = const1_rtx;
	  order[2] = const2_rtx;
	  order[3] = GEN_INT (3);
	  order[elt] = const0_rtx;

	  emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
					order[1], order[2], order[3]));

	  ix86_expand_vector_set (false, target, val, 0);

	  emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
					order[1], order[2], order[3]));
	}
      else
	{
	  /* For SSE1, we have to reuse the V4SF code.  */
	  rtx t = gen_reg_rtx (V4SFmode);
	  emit_move_insn (t, gen_lowpart (V4SFmode, target));
	  ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
	  emit_move_insn (target, gen_lowpart (mode, t));
	}
      return;

    case E_V8HImode:
    case E_V8HFmode:
    case E_V8BFmode:
    case E_V2HImode:
      use_vec_merge = TARGET_SSE2;
      break;
    case E_V4HImode:
      use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
      break;

    case E_V16QImode:
    case E_V4QImode:
      use_vec_merge = TARGET_SSE4_1;
      break;

    case E_V8QImode:
      use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
      break;

    case E_V32QImode:
      half_mode = V16QImode;
      j = 0;
      n = 16;
      goto half;

    case E_V16HFmode:
    case E_V16BFmode:
      /* For ELT == 0, vec_setv8hf_0 can save 1 vpbroadcastw.  */
      if (TARGET_AVX2 && elt != 0)
	{
	  mmode = SImode;
	  gen_blendm = ((mode == E_V16HFmode) ? gen_avx2_pblendph_1
			: gen_avx2_pblendbf_1);
	  blendm_const = true;
	  break;
	}
      else
	{
	  half_mode = ((mode == E_V16HFmode) ? V8HFmode : V8BFmode);
	  j = ((mode == E_V16HFmode) ? 6 : 7);
	  n = 8;
	  goto half;
	}

    case E_V16HImode:
      half_mode = V8HImode;
      j = 1;
      n = 8;
      goto half;

    case E_V8SImode:
      half_mode = V4SImode;
      j = 2;
      n = 4;
      goto half;

    case E_V4DImode:
      half_mode = V2DImode;
      j = 3;
      n = 2;
      goto half;

    case E_V8SFmode:
      half_mode = V4SFmode;
      j = 4;
      n = 4;
      goto half;

    case E_V4DFmode:
      half_mode = V2DFmode;
      j = 5;
      n = 2;
      goto half;

half:
      /* Compute offset.  */
      i = elt / n;
      elt %= n;

      gcc_assert (i <= 1);

      /* Extract the half.  */
      tmp = gen_reg_rtx (half_mode);
      emit_insn (gen_extract[j][i] (tmp, target));

      /* Put val in tmp at elt.  */
      ix86_expand_vector_set (false, tmp, val, elt);

      /* Put it back.  */
      emit_insn (gen_insert[j][i] (target, target, tmp));
      return;

    case E_V8DFmode:
      if (TARGET_AVX512F)
	{
	  mmode = QImode;
	  gen_blendm = gen_avx512f_blendmv8df;
	}
      break;

    case E_V8DImode:
      if (TARGET_AVX512F)
	{
	  mmode = QImode;
	  gen_blendm = gen_avx512f_blendmv8di;
	}
      break;

    case E_V16SFmode:
      if (TARGET_AVX512F)
	{
	  mmode = HImode;
	  gen_blendm = gen_avx512f_blendmv16sf;
	}
      break;

    case E_V16SImode:
      if (TARGET_AVX512F)
	{
	  mmode = HImode;
	  gen_blendm = gen_avx512f_blendmv16si;
	}
      break;

    case E_V32HFmode:
      if (TARGET_AVX512BW)
	{
	  mmode = SImode;
	  gen_blendm = gen_avx512bw_blendmv32hf;
	}
      break;

    case E_V32BFmode:
      if (TARGET_AVX512BW)
	{
	  mmode = SImode;
	  gen_blendm = gen_avx512bw_blendmv32bf;
	}
      break;

    case E_V32HImode:
      if (TARGET_AVX512BW)
	{
	  mmode = SImode;
	  gen_blendm = gen_avx512bw_blendmv32hi;
	}
      else if (TARGET_AVX512F)
	{
	  half_mode = E_V8HImode;
	  n = 8;
	  goto quarter;
	}
      break;

    case E_V64QImode:
      if (TARGET_AVX512BW)
	{
	  mmode = DImode;
	  gen_blendm = gen_avx512bw_blendmv64qi;
	}
      else if (TARGET_AVX512F)
	{
	  half_mode = E_V16QImode;
	  n = 16;
	  goto quarter;
	}
      break;

quarter:
      /* Compute offset.  */
      i = elt / n;
      elt %= n;

      gcc_assert (i <= 3);

      {
	/* Extract the quarter.  */
	tmp = gen_reg_rtx (V4SImode);
	rtx tmp2 = gen_lowpart (V16SImode, target);
	rtx mask = gen_reg_rtx (QImode);

	emit_move_insn (mask, constm1_rtx);
	emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i),
						   tmp, mask));

	tmp2 = gen_reg_rtx (half_mode);
	emit_move_insn (tmp2, gen_lowpart (half_mode, tmp));
	tmp = tmp2;

	/* Put val in tmp at elt.  */
	ix86_expand_vector_set (false, tmp, val, elt);

	/* Put it back.  */
	tmp2 = gen_reg_rtx (V16SImode);
	rtx tmp3 = gen_lowpart (V16SImode, target);
	mask = gen_reg_rtx (HImode);
	emit_move_insn (mask, constm1_rtx);
	tmp = gen_lowpart (V4SImode, tmp);
	emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i),
						  tmp3, mask));
	emit_move_insn (target, gen_lowpart (mode, tmp2));
      }
      return;

    default:
      break;
    }

  if (mmode != VOIDmode)
    {
      tmp = gen_reg_rtx (mode);
      emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
      rtx merge_mask = gen_int_mode (HOST_WIDE_INT_1U << elt, mmode);
      /* The avx512*_blendm<mode> expanders have different operand order
	 from VEC_MERGE.  In VEC_MERGE, the first input operand is used for
	 elements where the mask is set and second input operand otherwise,
	 in {sse,avx}*_*blend* the first input operand is used for elements
	 where the mask is clear and second input operand otherwise.  */
      if (!blendm_const)
	merge_mask = force_reg (mmode, merge_mask);
      emit_insn (gen_blendm (target, target, tmp, merge_mask));
    }
  else if (use_vec_merge)
    {
      tmp = gen_rtx_VEC_DUPLICATE (mode, val);
      tmp = gen_rtx_VEC_MERGE (mode, tmp, target,
			       GEN_INT (HOST_WIDE_INT_1U << elt));
      emit_insn (gen_rtx_SET (target, tmp));
    }
  else
    {
      rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));

      emit_move_insn (mem, target);

      tmp = adjust_address (mem, inner_mode, elt * GET_MODE_SIZE (inner_mode));
      emit_move_insn (tmp, val);

      emit_move_insn (target, mem);
    }
}
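
/* When no insn-based strategy applies, the fallback above simply goes
   through memory; roughly (illustrative):

     mem = target;  mem[elt] = val;  target = mem;

   using a stack temporary of the full vector mode.  */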
void
ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
{
  machine_mode mode = GET_MODE (vec);
  machine_mode inner_mode = GET_MODE_INNER (mode);
  bool use_vec_extr = false;
  rtx tmp;

  switch (mode)
    {
    case E_V2SImode:
      use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
      if (use_vec_extr)
	break;
      /* FALLTHRU */

    case E_V2SFmode:
      if (!mmx_ok)
	break;
      /* FALLTHRU */

    case E_V2DFmode:
    case E_V2DImode:
    case E_V2TImode:
    case E_V4TImode:
      use_vec_extr = true;
      break;

    case E_V4SFmode:
      use_vec_extr = TARGET_SSE4_1;
      if (use_vec_extr)
	break;

      switch (elt)
	{
	case 0:
	  tmp = vec;
	  break;

	case 1:
	case 3:
	  tmp = gen_reg_rtx (mode);
	  emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
					  GEN_INT (elt), GEN_INT (elt),
					  GEN_INT (elt+4), GEN_INT (elt+4)));
	  break;

	case 2:
	  tmp = gen_reg_rtx (mode);
	  emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
	  break;

	default:
	  gcc_unreachable ();
	}
      vec = tmp;
      use_vec_extr = true;
      elt = 0;
      break;

    case E_V4SImode:
      use_vec_extr = TARGET_SSE4_1;
      if (use_vec_extr)
	break;

      if (TARGET_SSE2)
	{
	  switch (elt)
	    {
	    case 0:
	      tmp = vec;
	      break;

	    case 1:
	    case 3:
	      tmp = gen_reg_rtx (mode);
	      emit_insn (gen_sse2_pshufd_1 (tmp, vec,
					    GEN_INT (elt), GEN_INT (elt),
					    GEN_INT (elt), GEN_INT (elt)));
	      break;

	    case 2:
	      tmp = gen_reg_rtx (mode);
	      emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
	      break;

	    default:
	      gcc_unreachable ();
	    }
	  vec = tmp;
	  use_vec_extr = true;
	  elt = 0;
	}
      else
	{
	  /* For SSE1, we have to reuse the V4SF code.  */
	  ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
				      gen_lowpart (V4SFmode, vec), elt);
	  return;
	}
      break;

    case E_V8HImode:
    case E_V8HFmode:
    case E_V8BFmode:
    case E_V2HImode:
      use_vec_extr = TARGET_SSE2;
      break;
    case E_V4HImode:
      use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
      break;

    case E_V16QImode:
      use_vec_extr = TARGET_SSE4_1;
      if (!use_vec_extr
	  && TARGET_SSE2
	  && elt == 0
	  && (optimize_insn_for_size_p () || TARGET_INTER_UNIT_MOVES_FROM_VEC))
	{
	  tmp = gen_reg_rtx (SImode);
	  ix86_expand_vector_extract (false, tmp, gen_lowpart (V4SImode, vec),
				      0);
	  emit_insn (gen_rtx_SET (target, gen_lowpart (QImode, tmp)));
	  return;
	}
      break;
    case E_V4QImode:
      use_vec_extr = TARGET_SSE4_1;
      break;

    case E_V8SFmode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V4SFmode);
	  if (elt < 4)
	    emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 3);
	  return;
	}
      break;

    case E_V4DFmode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V2DFmode);
	  if (elt < 2)
	    emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 1);
	  return;
	}
      break;

    case E_V32QImode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V16QImode);
	  if (elt < 16)
	    emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 15);
	  return;
	}
      break;

    case E_V16HImode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V8HImode);
	  if (elt < 8)
	    emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 7);
	  return;
	}
      break;

    case E_V8SImode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V4SImode);
	  if (elt < 4)
	    emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 3);
	  return;
	}
      break;

    case E_V4DImode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V2DImode);
	  if (elt < 2)
	    emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 1);
	  return;
	}
      break;

    case E_V32HImode:
      if (TARGET_AVX512BW)
	{
	  tmp = gen_reg_rtx (V16HImode);
	  if (elt < 16)
	    emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 15);
	  return;
	}
      break;

    case E_V64QImode:
      if (TARGET_AVX512BW)
	{
	  tmp = gen_reg_rtx (V32QImode);
	  if (elt < 32)
	    emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 31);
	  return;
	}
      break;

    case E_V16SFmode:
      tmp = gen_reg_rtx (V8SFmode);
      if (elt < 8)
	emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
      else
	emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
      ix86_expand_vector_extract (false, target, tmp, elt & 7);
      return;

    case E_V8DFmode:
      tmp = gen_reg_rtx (V4DFmode);
      if (elt < 4)
	emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
      else
	emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
      ix86_expand_vector_extract (false, target, tmp, elt & 3);
      return;

    case E_V16SImode:
      tmp = gen_reg_rtx (V8SImode);
      if (elt < 8)
	emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
      else
	emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
      ix86_expand_vector_extract (false, target, tmp, elt & 7);
      return;

    case E_V8DImode:
      tmp = gen_reg_rtx (V4DImode);
      if (elt < 4)
	emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
      else
	emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
      ix86_expand_vector_extract (false, target, tmp, elt & 3);
      return;

    case E_V32HFmode:
    case E_V32BFmode:
      if (TARGET_AVX512BW)
	{
	  tmp = (mode == E_V32HFmode
		 ? gen_reg_rtx (V16HFmode)
		 : gen_reg_rtx (V16BFmode));
	  if (elt < 16)
	    emit_insn (gen_vec_extract_lo (mode, tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi (mode, tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 15);
	  return;
	}
      break;

    case E_V16HFmode:
    case E_V16BFmode:
      tmp = (mode == E_V16HFmode
	     ? gen_reg_rtx (V8HFmode)
	     : gen_reg_rtx (V8BFmode));
      if (elt < 8)
	emit_insn (gen_vec_extract_lo (mode, tmp, vec));
      else
	emit_insn (gen_vec_extract_hi (mode, tmp, vec));
      ix86_expand_vector_extract (false, target, tmp, elt & 7);
      return;

    case E_V8QImode:
      use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
      /* ??? Could extract the appropriate HImode element and shift.  */
      break;

    default:
      break;
    }

  if (use_vec_extr)
    {
      tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
      tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);

      /* Let the rtl optimizers know about the zero extension performed.  */
      if (inner_mode == QImode || inner_mode == HImode)
	{
	  rtx reg = gen_reg_rtx (SImode);
	  tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
	  emit_move_insn (reg, tmp);
	  tmp = gen_lowpart (inner_mode, reg);
	  SUBREG_PROMOTED_VAR_P (tmp) = 1;
	  SUBREG_PROMOTED_SET (tmp, 1);
	}

      emit_move_insn (target, tmp);
    }
  else
    {
      rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));

      emit_move_insn (mem, vec);

      tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
      emit_move_insn (target, tmp);
    }
}
/* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
   to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
   The upper bits of DEST are undefined, though they shouldn't cause
   exceptions (some bits from src or all zeros are ok).  */

static void
emit_reduc_half (rtx dest, rtx src, int i)
{
  rtx tem, d = dest;
  switch (GET_MODE (src))
    {
    case E_V4SFmode:
      if (i == 128)
	tem = gen_sse_movhlps (dest, src, src);
      else
	tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
				   GEN_INT (1 + 4), GEN_INT (1 + 4));
      break;
    case E_V2DFmode:
      tem = gen_vec_interleave_highv2df (dest, src, src);
      break;

    case E_V4QImode:
      d = gen_reg_rtx (V1SImode);
      tem = gen_mmx_lshrv1si3 (d, gen_lowpart (V1SImode, src),
			       GEN_INT (i / 2));
      break;

    case E_V4HImode:
    case E_V8QImode:
      d = gen_reg_rtx (V1DImode);
      tem = gen_mmx_lshrv1di3 (d, gen_lowpart (V1DImode, src),
			       GEN_INT (i / 2));
      break;

    case E_V16QImode:
    case E_V8HImode:
    case E_V8HFmode:
    case E_V4SImode:
    case E_V2DImode:
      d = gen_reg_rtx (V1TImode);
      tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
				GEN_INT (i / 2));
      break;

    case E_V8SFmode:
      if (i == 256)
	tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
      else
	tem = gen_avx_shufps256 (dest, src, src,
				 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
      break;

    case E_V4DFmode:
      if (i == 256)
	tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
      else
	tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
      break;

    case E_V32QImode:
    case E_V16HImode:
    case E_V16HFmode:
    case E_V8SImode:
    case E_V4DImode:
      if (i == 256)
	{
	  if (GET_MODE (dest) != V4DImode)
	    d = gen_reg_rtx (V4DImode);
	  tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
				   gen_lowpart (V4DImode, src),
				   const1_rtx);
	}
      else
	{
	  d = gen_reg_rtx (V2TImode);
	  tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
				    GEN_INT (i / 2));
	}
      break;

    case E_V64QImode:
    case E_V32HImode:
    case E_V32HFmode:
      if (i < 64)
	{
	  d = gen_reg_rtx (V4TImode);
	  tem = gen_avx512bw_lshrv4ti3 (d, gen_lowpart (V4TImode, src),
					GEN_INT (i / 2));
	  break;
	}
      /* FALLTHRU */

    case E_V16SImode:
    case E_V16SFmode:
    case E_V8DImode:
    case E_V8DFmode:
      if (i > 128)
	tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
					gen_lowpart (V16SImode, src),
					gen_lowpart (V16SImode, src),
					GEN_INT (0x4 + (i == 512 ? 4 : 0)),
					GEN_INT (0x5 + (i == 512 ? 4 : 0)),
					GEN_INT (0x6 + (i == 512 ? 4 : 0)),
					GEN_INT (0x7 + (i == 512 ? 4 : 0)),
					GEN_INT (0xC), GEN_INT (0xD),
					GEN_INT (0xE), GEN_INT (0xF),
					GEN_INT (0x10), GEN_INT (0x11),
					GEN_INT (0x12), GEN_INT (0x13),
					GEN_INT (0x14), GEN_INT (0x15),
					GEN_INT (0x16), GEN_INT (0x17));
      else
	tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
				    gen_lowpart (V16SImode, src),
				    GEN_INT (i == 128 ? 0x2 : 0x1),
				    GEN_INT (0x3),
				    GEN_INT (i == 128 ? 0x6 : 0x5),
				    GEN_INT (0x7),
				    GEN_INT (i == 128 ? 0xA : 0x9),
				    GEN_INT (0xB),
				    GEN_INT (i == 128 ? 0xE : 0xD),
				    GEN_INT (0xF));
      break;

    default:
      gcc_unreachable ();
    }
  emit_insn (tem);
  if (d != dest)
    emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
}
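
/* Illustrative example of the contract above: for V8HImode and
   i == 32, the V1TImode view of SRC is shifted right by 16 bits, so
   the element that was in lane 1 lands in lane 0, ready to be combined
   with the original lane 0 by the caller's binary operation.  */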
/* Expand a vector reduction.  FN is the binary pattern to reduce;
   DEST is the destination; IN is the input vector.  */

void
ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
{
  rtx half, dst, vec = in;
  machine_mode mode = GET_MODE (in);
  int i;

  /* SSE4 has a special instruction for V8HImode UMIN reduction.  */
  if (TARGET_SSE4_1
      && mode == V8HImode
      && fn == gen_uminv8hi3)
    {
      emit_insn (gen_sse4_1_phminposuw (dest, in));
      return;
    }

  for (i = GET_MODE_BITSIZE (mode);
       i > GET_MODE_UNIT_BITSIZE (mode);
       i >>= 1)
    {
      half = gen_reg_rtx (mode);
      emit_reduc_half (half, vec, i);
      if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
	dst = dest;
      else
	dst = gen_reg_rtx (mode);
      emit_insn (fn (dst, half, vec));
      vec = dst;
    }
}
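
/* Sketch of the loop above for a V8HImode minimum (illustrative):
   I walks 128 -> 64 -> 32, and each step combines VEC with its shifted
   half, effectively

     v = min (v, v >> 64);  v = min (v, v >> 32);  v = min (v, v >> 16);

   after which lane 0 of DEST holds the reduced value.  */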
/* Output code to perform a conditional jump to LABEL, if C2 flag in
   FP status register is set.  */

static void
ix86_emit_fp_unordered_jump (rtx label)
{
  rtx reg = gen_reg_rtx (HImode);
  rtx_insn *insn;
  rtx temp;

  emit_insn (gen_x86_fnstsw_1 (reg));

  if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
    {
      emit_insn (gen_x86_sahf_1 (reg));

      temp = gen_rtx_REG (CCmode, FLAGS_REG);
      temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
    }
  else
    {
      emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));

      temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
      temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
    }

  temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
			       gen_rtx_LABEL_REF (VOIDmode, label),
			       pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
  predict_jump (REG_BR_PROB_BASE * 10 / 100);
  JUMP_LABEL (insn) = label;
}
/* Output code to perform a sinh XFmode calculation.  */

void
ix86_emit_i387_sinh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx cst1, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = expm1 (|op1|) */
  emit_insn (gen_absxf2 (e2, op1));
  emit_insn (gen_expm1xf2 (e1, e2));

  /* e2 = e1 / (e1 + 1.0) + e1 */
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_addxf3 (e2, e1, cst1));
  emit_insn (gen_divxf3 (e2, e1, e2));
  emit_insn (gen_addxf3 (e2, e2, e1));

  /* flags = signbit (op1) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (flags) then e2 = -e2 */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  /* op0 = 0.5 * e2 */
  half = force_reg (XFmode, half);
  emit_insn (gen_mulxf3 (op0, e2, half));
}
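
/* For reference, the identity exploited above: with u = expm1 (|x|),

     sinh (|x|) = (e^|x| - e^-|x|) / 2 = (u + u / (u + 1)) / 2,

   computed via expm1 so small |x| does not suffer cancellation; the
   sign of x is restored afterwards from the fxam sign bit.  */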
/* Output code to perform a cosh XFmode calculation.  */

void
ix86_emit_i387_cosh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx cst1;

  /* e1 = exp (op1) */
  emit_insn (gen_expxf2 (e1, op1));

  /* e2 = e1 + 1.0 / e1 */
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_divxf3 (e2, cst1, e1));
  emit_insn (gen_addxf3 (e2, e1, e2));

  /* op0 = 0.5 * e2 */
  half = force_reg (XFmode, half);
  emit_insn (gen_mulxf3 (op0, e2, half));
}
/* Output code to perform a tanh XFmode calculation.  */

void
ix86_emit_i387_tanh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx cst2, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = expm1 (-|2 * op1|) */
  emit_insn (gen_addxf3 (e2, op1, op1));
  emit_insn (gen_absxf2 (e2, e2));
  emit_insn (gen_negxf2 (e2, e2));
  emit_insn (gen_expm1xf2 (e1, e2));

  /* e2 = e1 / (e1 + 2.0) */
  cst2 = force_reg (XFmode, CONST2_RTX (XFmode));
  emit_insn (gen_addxf3 (e2, e1, cst2));
  emit_insn (gen_divxf3 (e2, e1, e2));

  /* flags = signbit (op1) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (!flags) then e2 = -e2 */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_NE (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  emit_move_insn (op0, e2);
}
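
/* For reference, the identity used above: with u = expm1 (-2|x|),

     tanh (|x|) = -u / (u + 2),

   again via expm1 to avoid cancellation near zero; the fxam sign bit
   selects whether the final result is negated.  */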
/* Output code to perform an asinh XFmode calculation.  */

void
ix86_emit_i387_asinh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx cst1, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* e2 = sqrt (op1^2 + 1.0) + 1.0 */
  emit_insn (gen_mulxf3 (e1, op1, op1));
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_addxf3 (e2, e1, cst1));
  emit_insn (gen_sqrtxf2 (e2, e2));
  emit_insn (gen_addxf3 (e2, e2, cst1));

  /* e1 = e1 / e2 */
  emit_insn (gen_divxf3 (e1, e1, e2));

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = e1 + |op1| */
  emit_insn (gen_absxf2 (e2, op1));
  emit_insn (gen_addxf3 (e1, e1, e2));

  /* e2 = log1p (e1) */
  ix86_emit_i387_log1p (e2, e1);

  /* flags = signbit (op1) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (flags) then e2 = -e2 */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  emit_move_insn (op0, e2);
}
/* Output code to perform an acosh XFmode calculation.  */

void
ix86_emit_i387_acosh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode));

  /* e2 = sqrt (op1 + 1.0) */
  emit_insn (gen_addxf3 (e2, op1, cst1));
  emit_insn (gen_sqrtxf2 (e2, e2));

  /* e1 = sqrt (op1 - 1.0) */
  emit_insn (gen_subxf3 (e1, op1, cst1));
  emit_insn (gen_sqrtxf2 (e1, e1));

  /* e1 = e1 * e2 */
  emit_insn (gen_mulxf3 (e1, e1, e2));

  /* e1 = e1 + op1 */
  emit_insn (gen_addxf3 (e1, e1, op1));

  /* op0 = log (e1) */
  emit_insn (gen_logxf2 (op0, e1));
}
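
/* The acosh expansion above computes
     acosh(x) = log (x + sqrt (x - 1) * sqrt (x + 1)),
   which equals log (x + sqrt (x^2 - 1)) for x >= 1; factoring the
   square root this way avoids forming x^2 - 1 directly.  */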
/* Output code to perform an atanh XFmode calculation.  */

void
ix86_emit_i387_atanh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx cst1, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e2 = |op1| */
  emit_insn (gen_absxf2 (e2, op1));

  /* e1 = -(e2 + e2) / (e2 + 1.0) */
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_addxf3 (e1, e2, cst1));
  emit_insn (gen_addxf3 (e2, e2, e2));
  emit_insn (gen_negxf2 (e2, e2));
  emit_insn (gen_divxf3 (e1, e2, e1));

  /* e2 = log1p (e1) */
  ix86_emit_i387_log1p (e2, e1);

  /* flags = signbit (op1) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (!flags) then e2 = -e2 */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_NE (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  /* op0 = 0.5 * e2 */
  half = force_reg (XFmode, half);
  emit_insn (gen_mulxf3 (op0, e2, half));
}
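
/* The atanh expansion above uses
     log1p (-2|x| / (|x| + 1)) = log ((1 - |x|) / (1 + |x|))
			       = -2 * atanh (|x|),
   so halving the log1p result and fixing the sign from fxam yields
   atanh(x); the identity follows from
   atanh(x) = 0.5 * log ((1 + x) / (1 - x)).  */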
/* Output code to perform a log1p XFmode calculation.  */

void
ix86_emit_i387_log1p (rtx op0, rtx op1)
{
  rtx_code_label *label1 = gen_label_rtx ();
  rtx_code_label *label2 = gen_label_rtx ();

  rtx tmp = gen_reg_rtx (XFmode);
  rtx res = gen_reg_rtx (XFmode);
  rtx cst, cstln2, cst1;
  rtx_insn *insn;

  /* The emit_jump call emits pending stack adjust, make sure it is emitted
     before the conditional jump, otherwise the stack adjustment will be
     only conditional.  */
  do_pending_stack_adjust ();

  cst = const_double_from_real_value
    (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode);
  cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */

  emit_insn (gen_absxf2 (tmp, op1));

  cst = force_reg (XFmode, cst);
  ix86_expand_branch (GE, tmp, cst, label1);
  predict_jump (REG_BR_PROB_BASE * 10 / 100);
  insn = get_last_insn ();
  JUMP_LABEL (insn) = label1;

  emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2));
  emit_jump (label2);

  emit_label (label1);
  LABEL_NUSES (label1) = 1;

  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1)));
  emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2));

  emit_label (label2);
  LABEL_NUSES (label2) = 1;

  emit_move_insn (op0, res);
}
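
/* The cutoff constant above is 1 - sqrt(2)/2 ~= 0.29289321881: fyl2xp1
   is only specified for |x| below that bound, so larger inputs take the
   label1 path and use plain fyl2x on 1 + x instead.  Either way the
   result is y * log2(...) with y = ln(2), i.e. a natural logarithm.  */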
/* Emit code for round calculation.  */

void
ix86_emit_i387_round (rtx op0, rtx op1)
{
  machine_mode inmode = GET_MODE (op1);
  machine_mode outmode = GET_MODE (op0);
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx res = gen_reg_rtx (outmode);
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx (*floor_insn) (rtx, rtx);
  rtx (*neg_insn) (rtx, rtx);
  rtx_insn *insn;
  rtx tmp;

  switch (inmode)
    {
    case E_SFmode:
    case E_DFmode:
      tmp = gen_reg_rtx (XFmode);

      emit_insn (gen_rtx_SET (tmp, gen_rtx_FLOAT_EXTEND (XFmode, op1)));
      op1 = tmp;
      break;
    case E_XFmode:
      break;
    default:
      gcc_unreachable ();
    }

  switch (outmode)
    {
    case E_SFmode:
      floor_insn = gen_frndintxf2_floor;
      neg_insn = gen_negsf2;
      break;
    case E_DFmode:
      floor_insn = gen_frndintxf2_floor;
      neg_insn = gen_negdf2;
      break;
    case E_XFmode:
      floor_insn = gen_frndintxf2_floor;
      neg_insn = gen_negxf2;
      break;
    case E_HImode:
      floor_insn = gen_lfloorxfhi2;
      neg_insn = gen_neghi2;
      break;
    case E_SImode:
      floor_insn = gen_lfloorxfsi2;
      neg_insn = gen_negsi2;
      break;
    case E_DImode:
      floor_insn = gen_lfloorxfdi2;
      neg_insn = gen_negdi2;
      break;
    default:
      gcc_unreachable ();
    }

  /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */

  /* scratch = fxam(op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = fabs(op1) */
  emit_insn (gen_absxf2 (e1, op1));

  /* e2 = e1 + 0.5 */
  half = force_reg (XFmode, half);
  emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (XFmode, e1, half)));

  /* res = floor(e2) */
  if (outmode == SFmode || outmode == DFmode)
    {
      tmp = gen_reg_rtx (XFmode);

      emit_insn (floor_insn (tmp, e2));
      emit_insn (gen_rtx_SET (res,
			      gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp),
					      UNSPEC_TRUNC_NOOP)));
    }
  else
    emit_insn (floor_insn (res, e2));

  /* flags = signbit(a) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (flags) then res = -res */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (neg_insn (res, res));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  emit_move_insn (op0, res);
}
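
/* A scalar sketch of the identity the sequence above implements
   (illustrative only; assumes <math.h>):

     double round_sketch (double a)
     {
       double r = floor (fabs (a) + 0.5);
       return signbit (a) ? -r : r;
     }
*/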
/* Output code to perform a Newton-Raphson approximation of a single precision
   floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm].  */

void
ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
{
  rtx x0, x1, e0, e1;

  x0 = gen_reg_rtx (mode);
  e0 = gen_reg_rtx (mode);
  e1 = gen_reg_rtx (mode);
  x1 = gen_reg_rtx (mode);

  /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */

  b = force_reg (mode, b);

  /* x0 = rcp(b) estimate */
  if (mode == V16SFmode || mode == V8DFmode)
    {
      if (TARGET_AVX512ER)
	{
	  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
						      UNSPEC_RCP28)));

	  /* res = a * x0 */
	  emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
	  return;
	}
      emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
						  UNSPEC_RCP14)));
    }
  else
    emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
						UNSPEC_RCP)));

  /* e0 = x0 * b */
  emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));

  /* e0 = x0 * e0 */
  emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));

  /* e1 = x0 + x0 */
  emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));

  /* x1 = e1 - e0 */
  emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));

  /* res = a * x1 */
  emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
}
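
/* One Newton-Raphson step for 1/b refines an estimate x0 via
   x1 = x0 * (2 - b * x0) = (x0 + x0) - (b * x0 * x0), which is exactly
   the e0/e1/x1 sequence above.  A scalar sketch (illustrative only;
   rcp_estimate is a hypothetical stand-in for the ~12-bit rcpss/vrcpps
   hardware estimate):

     float swdiv_sketch (float a, float b)
     {
       float x0 = rcp_estimate (b);
       float x1 = (x0 + x0) - (b * x0 * x0);	/* one NR step */
       return a * x1;
     }
*/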
/* Output code to perform a Newton-Raphson approximation of a
   single precision floating point [reciprocal] square root.  */

void
ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
{
  rtx x0, e0, e1, e2, e3, mthree, mhalf;
  REAL_VALUE_TYPE r;
  int unspec;

  x0 = gen_reg_rtx (mode);
  e0 = gen_reg_rtx (mode);
  e1 = gen_reg_rtx (mode);
  e2 = gen_reg_rtx (mode);
  e3 = gen_reg_rtx (mode);

  if (TARGET_AVX512ER && mode == V16SFmode)
    {
      if (recip)
	/* res = rsqrt28(a) estimate */
	emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
						     UNSPEC_RSQRT28)));
      else
	{
	  /* x0 = rsqrt28(a) estimate */
	  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
						      UNSPEC_RSQRT28)));
	  /* res = rcp28(x0) estimate */
	  emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
						       UNSPEC_RCP28)));
	}
      return;
    }

  real_from_integer (&r, VOIDmode, -3, SIGNED);
  mthree = const_double_from_real_value (r, SFmode);

  real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
  mhalf = const_double_from_real_value (r, SFmode);
  unspec = UNSPEC_RSQRT;

  if (VECTOR_MODE_P (mode))
    {
      mthree = ix86_build_const_vector (mode, true, mthree);
      mhalf = ix86_build_const_vector (mode, true, mhalf);
      /* There is no 512-bit rsqrt.  There is however rsqrt14.  */
      if (GET_MODE_SIZE (mode) == 64)
	unspec = UNSPEC_RSQRT14;
    }

  /* sqrt(a)  = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
     rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */

  a = force_reg (mode, a);

  /* x0 = rsqrt(a) estimate */
  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
					      unspec)));

  /* If (a == 0.0) Filter out infinity to prevent NaN for sqrt(0.0).  */
  if (!recip)
    {
      rtx zero = force_reg (mode, CONST0_RTX (mode));
      rtx mask;

      /* Handle masked compare.  */
      if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
	{
	  mask = gen_reg_rtx (HImode);
	  /* Imm value 0x4 corresponds to not-equal comparison.  */
	  emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
	  emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
	}
      else
	{
	  mask = gen_reg_rtx (mode);
	  emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
	  emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
	}
    }

  mthree = force_reg (mode, mthree);

  /* e0 = x0 * a */
  emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));

  unsigned vector_size = GET_MODE_SIZE (mode);
  if (TARGET_FMA
      || (TARGET_AVX512F && vector_size == 64)
      || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
    emit_insn (gen_rtx_SET (e2,
			    gen_rtx_FMA (mode, e0, x0, mthree)));
  else
    {
      /* e1 = e0 * x0 */
      emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));

      /* e2 = e1 - 3.0 */
      emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
    }

  mhalf = force_reg (mode, mhalf);
  if (recip)
    /* e3 = -.5 * x0 */
    emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
  else
    /* e3 = -.5 * e0 */
    emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
  /* ret = e2 * e3 */
  emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
}
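
/* The refinement above is one Newton-Raphson step for 1/sqrt(a):
   x1 = x0 * (3 - a * x0^2) / 2 = -0.5 * x0 * (a * x0^2 - 3); computing
   e0 = a * x0 first turns the same step into an approximation of
   sqrt(a).  A scalar sketch (illustrative only; rsqrt_estimate is a
   hypothetical stand-in for the rsqrtss/vrsqrtps hardware estimate):

     float rsqrt_sketch (float a)
     {
       float x0 = rsqrt_estimate (a);
       return -0.5f * x0 * (a * x0 * x0 - 3.0f);	/* ~1/sqrt(a) */
     }
*/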
/* Expand fabs (OP0) and return a new rtx that holds the result.  The
   mask for masking out the sign-bit is stored in *SMASK, if that is
   non-null.  */

static rtx
ix86_expand_sse_fabs (rtx op0, rtx *smask)
{
  machine_mode vmode, mode = GET_MODE (op0);
  rtx xa, mask;

  xa = gen_reg_rtx (mode);
  if (mode == SFmode)
    vmode = V4SFmode;
  else if (mode == DFmode)
    vmode = V2DFmode;
  else
    vmode = mode;

  mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
  if (!VECTOR_MODE_P (mode))
    {
      /* We need to generate a scalar mode mask in this case.  */
      rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
      tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
      mask = gen_reg_rtx (mode);
      emit_insn (gen_rtx_SET (mask, tmp));
    }

  emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));

  if (smask)
    *smask = mask;

  return xa;
}
/* Expands a comparison of OP0 with OP1 using comparison code CODE,
   swapping the operands if SWAP_OPERANDS is true.  The expanded
   code is a forward jump to a newly created label in case the
   comparison is true.  The generated label rtx is returned.  */
static rtx_code_label *
ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
				  bool swap_operands)
{
  bool unordered_compare = ix86_unordered_fp_compare (code);
  rtx_code_label *label;
  rtx tmp, reg;

  if (swap_operands)
    std::swap (op0, op1);

  label = gen_label_rtx ();
  tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
  if (unordered_compare)
    tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
  reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
  emit_insn (gen_rtx_SET (reg, tmp));
  tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
			      gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
  tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  JUMP_LABEL (tmp) = label;

  return label;
}
/* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
   using comparison code CODE.  Operands are swapped for the comparison if
   SWAP_OPERANDS is true.  Returns a rtx for the generated mask.  */

static rtx
ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
			      bool swap_operands)
{
  rtx (*insn)(rtx, rtx, rtx, rtx);
  machine_mode mode = GET_MODE (op0);
  rtx mask = gen_reg_rtx (mode);

  if (swap_operands)
    std::swap (op0, op1);

  insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;

  emit_insn (insn (mask, op0, op1,
		   gen_rtx_fmt_ee (code, mode, op0, op1)));
  return mask;
}
/* Expand copysign from SIGN to the positive value ABS_VALUE
   storing in RESULT.  If MASK is non-null, it shall be a mask to mask out
   the sign-bit.  */

static void
ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
{
  machine_mode mode = GET_MODE (sign);
  rtx sgn = gen_reg_rtx (mode);
  if (mask == NULL_RTX)
    {
      machine_mode vmode;

      if (mode == SFmode)
	vmode = V4SFmode;
      else if (mode == DFmode)
	vmode = V2DFmode;
      else
	vmode = mode;

      mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
      if (!VECTOR_MODE_P (mode))
	{
	  /* We need to generate a scalar mode mask in this case.  */
	  rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
	  tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
	  mask = gen_reg_rtx (mode);
	  emit_insn (gen_rtx_SET (mask, tmp));
	}
    }
  else
    mask = gen_rtx_NOT (mode, mask);

  emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
  emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
}
/* Expand SSE sequence for computing lround from OP1 storing
   into OP0.  */

void
ix86_expand_lround (rtx op0, rtx op1)
{
  /* C code for the stuff we're doing below:
	tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
	op0 = (long)tmp
   */
  machine_mode mode = GET_MODE (op1);
  const struct real_format *fmt;
  REAL_VALUE_TYPE pred_half, half_minus_pred_half;
  rtx adj;

  /* load nextafter (0.5, 0.0) */
  fmt = REAL_MODE_FORMAT (mode);
  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
  real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);

  /* adj = copysign (0.5, op1) */
  adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
  ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);

  /* adj = op1 + adj */
  adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);

  /* op0 = (imode)adj */
  expand_fix (op0, adj, 0);
}
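
/* nextafter (0.5, 0.0), i.e. 0.5 - 2**(-p-1) for a p-bit significand,
   is used instead of plain 0.5 so that inputs just below 0.5 do not get
   bumped across the halfway point: adding exactly 0.5 to the largest
   representable value below 0.5 would round the sum up to 1.0, giving
   the wrong lround result, while adding the slightly smaller constant
   does not.  */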
/* Expand SSE2 sequence for computing lfloor or lceil
   from OPERAND1 storing into OPERAND0.  */

void
ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
{
  /* C code for the stuff we're doing below (for do_floor):
	xi = (long)op1;
	xi -= (double)xi > op1 ? 1 : 0;
	return xi;
   */
  machine_mode fmode = GET_MODE (op1);
  machine_mode imode = GET_MODE (op0);
  rtx ireg, freg, tmp;
  rtx_code_label *label;

  /* reg = (long)op1 */
  ireg = gen_reg_rtx (imode);
  expand_fix (ireg, op1, 0);

  /* freg = (double)reg */
  freg = gen_reg_rtx (fmode);
  expand_float (freg, ireg, 0);

  /* ireg = (freg > op1) ? ireg - 1 : ireg */
  label = ix86_expand_sse_compare_and_jump (UNLE,
					    freg, op1, !do_floor);
  tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
			     ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
  emit_move_insn (ireg, tmp);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (op0, ireg);
}
/* Generate and return a rtx of mode MODE for 2**n where n is the number
   of bits of the mantissa of MODE, which must be one of DFmode or SFmode.  */

static rtx
ix86_gen_TWO52 (machine_mode mode)
{
  const struct real_format *fmt;
  REAL_VALUE_TYPE TWO52r;
  rtx TWO52;

  fmt = REAL_MODE_FORMAT (mode);
  real_2expN (&TWO52r, fmt->p - 1, mode);
  TWO52 = const_double_from_real_value (TWO52r, mode);
  TWO52 = force_reg (mode, TWO52);

  return TWO52;
}
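
/* Adding and then subtracting 2**(p-1) (2**52 for DFmode) rounds a
   nonnegative value below that threshold to an integer in the current
   rounding mode, because the addition shifts every fraction bit out of
   the p-bit significand.  E.g. for x = 3.7: 3.7 + 2**52 rounds to
   4 + 2**52, and subtracting 2**52 leaves 4.0.  The expanders below
   all build on this trick.  */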
/* Expand rint rounding OPERAND1 and storing the result in OPERAND0.  */

void
ix86_expand_rint (rtx operand0, rtx operand1)
{
  /* C code for the stuff we're doing below:
	xa = fabs (operand1);
	if (!isless (xa, 2**52))
	  return operand1;
	if (flag_rounding_math)
	  {
	    two52 = copysign (two52, operand1);
	    xa = operand1;
	  }
	xa = xa + two52 - two52;
	return copysign (xa, operand1);
   */
  machine_mode mode = GET_MODE (operand0);
  rtx res, xa, TWO52, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  if (flag_rounding_math)
    {
      ix86_sse_copysign_to_positive (TWO52, TWO52, res, mask);
      xa = res;
    }

  xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
  xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);

  /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
  if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
    xa = ix86_expand_sse_fabs (xa, NULL);

  ix86_sse_copysign_to_positive (res, xa, res, mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
/* Expand SSE2 sequence for computing floor or ceil
   from OPERAND1 storing into OPERAND0.  */

void
ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
{
  /* C code for the stuff we expand below.
	double xa = fabs (x), x2;
	if (!isless (xa, TWO52))
	  return x;
	x2 = (double)(long)x;

     Compensate.  Floor:
	if (x2 > x)
	  x2 -= 1;
     Compensate.  Ceil:
	if (x2 < x)
	  x2 += 1;

	if (HONOR_SIGNED_ZEROS (mode))
	  return copysign (x2, x);
	return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, xi, TWO52, tmp, one, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa = (double)(long)x */
  xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
  expand_fix (xi, res, 0);
  expand_float (xa, xi, 0);

  /* generate 1.0 */
  one = force_reg (mode, const_double_from_real_value (dconst1, mode));

  /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
  tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
  tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
			     xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
  if (HONOR_SIGNED_ZEROS (mode))
    {
      /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
      if (do_floor && flag_rounding_math)
	tmp = ix86_expand_sse_fabs (tmp, NULL);

      ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
    }
  emit_move_insn (res, tmp);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
/* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
   into OPERAND0 without relying on DImode truncation via cvttsd2siq
   that is only available on 64bit targets.  */

void
ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
{
  /* C code for the stuff we expand below.
	double xa = fabs (x), x2;
	if (!isless (xa, TWO52))
	  return x;
	xa = xa + TWO52 - TWO52;
	x2 = copysign (xa, x);

     Compensate.  Floor:
	if (x2 > x)
	  x2 -= 1;
     Compensate.  Ceil:
	if (x2 < x)
	  x2 += 1;

	if (HONOR_SIGNED_ZEROS (mode))
	  x2 = copysign (x2, x);
	return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, TWO52, tmp, one, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa = xa + TWO52 - TWO52; */
  xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
  xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);

  /* xa = copysign (xa, operand1) */
  ix86_sse_copysign_to_positive (xa, xa, res, mask);

  /* generate 1.0 */
  one = force_reg (mode, const_double_from_real_value (dconst1, mode));

  /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
  tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
  tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
			     xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
  if (HONOR_SIGNED_ZEROS (mode))
    {
      /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
      if (do_floor && flag_rounding_math)
	tmp = ix86_expand_sse_fabs (tmp, NULL);

      ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
    }
  emit_move_insn (res, tmp);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
/* Expand SSE sequence for computing trunc
   from OPERAND1 storing into OPERAND0.  */

void
ix86_expand_trunc (rtx operand0, rtx operand1)
{
  /* C code for SSE variant we expand below.
	double xa = fabs (x), x2;
	if (!isless (xa, TWO52))
	  return x;
	x2 = (double)(long)x;
	if (HONOR_SIGNED_ZEROS (mode))
	  return copysign (x2, x);
	return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, xi, TWO52, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa = (double)(long)x */
  xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
  expand_fix (xi, res, 0);
  expand_float (xa, xi, 0);

  if (HONOR_SIGNED_ZEROS (mode))
    ix86_sse_copysign_to_positive (xa, xa, res, mask);

  emit_move_insn (res, xa);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
/* Expand SSE sequence for computing trunc from OPERAND1 storing
   into OPERAND0 without relying on DImode truncation via cvttsd2siq
   that is only available on 64bit targets.  */

void
ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
{
  machine_mode mode = GET_MODE (operand0);
  rtx xa, xa2, TWO52, tmp, one, res, mask;
  rtx_code_label *label;

  /* C code for SSE variant we expand below.
	double xa = fabs (x), x2;
	if (!isless (xa, TWO52))
	  return x;
	xa2 = xa + TWO52 - TWO52;
     Compensate:
	if (xa2 > xa)
	  xa2 -= 1.0;
	x2 = copysign (xa2, x);
	return x2;
   */

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa2 = xa + TWO52 - TWO52; */
  xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
  xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);

  /* generate 1.0 */
  one = force_reg (mode, const_double_from_real_value (dconst1, mode));

  /* Compensate: xa2 = xa2 - (xa2 > xa ? 1 : 0) */
  tmp = ix86_expand_sse_compare_mask (UNGT, xa2, xa, false);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
  tmp = expand_simple_binop (mode, MINUS,
			     xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
  /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
  if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
    tmp = ix86_expand_sse_fabs (tmp, NULL);

  /* res = copysign (xa2, operand1) */
  ix86_sse_copysign_to_positive (res, tmp, res, mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
/* Expand SSE sequence for computing round
   from OPERAND1 storing into OPERAND0.  */

void
ix86_expand_round (rtx operand0, rtx operand1)
{
  /* C code for the stuff we're doing below:
	double xa = fabs (x);
	if (!isless (xa, TWO52))
	  return x;
	xa = (double)(long)(xa + nextafter (0.5, 0.0));
	return copysign (xa, x);
   */
  machine_mode mode = GET_MODE (operand0);
  rtx res, TWO52, xa, xi, half, mask;
  rtx_code_label *label;
  const struct real_format *fmt;
  REAL_VALUE_TYPE pred_half, half_minus_pred_half;

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  TWO52 = ix86_gen_TWO52 (mode);
  xa = ix86_expand_sse_fabs (res, &mask);
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* load nextafter (0.5, 0.0) */
  fmt = REAL_MODE_FORMAT (mode);
  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
  real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);

  /* xa = xa + 0.5 */
  half = force_reg (mode, const_double_from_real_value (pred_half, mode));
  xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);

  /* xa = (double)(int64_t)xa */
  xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
  expand_fix (xi, xa, 0);
  expand_float (xa, xi, 0);

  /* res = copysign (xa, operand1) */
  ix86_sse_copysign_to_positive (res, xa, res, mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
/* Expand SSE sequence for computing round from OPERAND1 storing
   into OPERAND0 without relying on DImode truncation via cvttsd2siq
   that is only available on 64bit targets.  */

void
ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
{
  /* C code for the stuff we expand below.
	double xa = fabs (x), xa2, x2;
	if (!isless (xa, TWO52))
	  return x;
     Using the absolute value and copying back sign makes
     -0.0 -> -0.0 correct.
	xa2 = xa + TWO52 - TWO52;
     Compensate.
	dxa = xa2 - xa;
	if (dxa <= -0.5)
	  xa2 += 1;
	else if (dxa > 0.5)
	  xa2 -= 1;
	x2 = copysign (xa2, x);
	return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa2 = xa + TWO52 - TWO52; */
  xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
  xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);

  /* dxa = xa2 - xa; */
  dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);

  /* generate 0.5, 1.0 and -0.5 */
  half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
  one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
  mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
			       0, OPTAB_DIRECT);

  /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
  tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
  xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
  /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
  tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
  xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);

  /* res = copysign (xa2, operand1) */
  ix86_sse_copysign_to_positive (res, xa2, res, mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
/* Expand SSE sequence for computing round
   from OP1 storing into OP0 using sse4 round insn.  */

void
ix86_expand_round_sse4 (rtx op0, rtx op1)
{
  machine_mode mode = GET_MODE (op0);
  rtx e1, e2, res, half;
  const struct real_format *fmt;
  REAL_VALUE_TYPE pred_half, half_minus_pred_half;
  rtx (*gen_copysign) (rtx, rtx, rtx);
  rtx (*gen_round) (rtx, rtx, rtx);

  switch (mode)
    {
    case E_SFmode:
      gen_copysign = gen_copysignsf3;
      gen_round = gen_sse4_1_roundsf2;
      break;
    case E_DFmode:
      gen_copysign = gen_copysigndf3;
      gen_round = gen_sse4_1_rounddf2;
      break;
    default:
      gcc_unreachable ();
    }

  /* round (a) = trunc (a + copysign (0.5, a)) */

  /* load nextafter (0.5, 0.0) */
  fmt = REAL_MODE_FORMAT (mode);
  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
  real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
  half = const_double_from_real_value (pred_half, mode);

  /* e1 = copysign (0.5, op1) */
  e1 = gen_reg_rtx (mode);
  emit_insn (gen_copysign (e1, half, op1));

  /* e2 = op1 + e1 */
  e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);

  /* res = trunc (e2) */
  res = gen_reg_rtx (mode);
  emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));

  emit_move_insn (op0, res);
}
/* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
   insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
   insn every time.  */

static GTY(()) rtx_insn *vselect_insn;

/* Initialize vselect_insn.  */

static void
init_vselect_insn (void)
{
  unsigned i;
  rtx x;

  x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
  for (i = 0; i < MAX_VECT_LEN; ++i)
    XVECEXP (x, 0, i) = const0_rtx;
  x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
							const0_rtx), x);
  x = gen_rtx_SET (const0_rtx, x);
  start_sequence ();
  vselect_insn = emit_insn (x);
  end_sequence ();
}
/* Construct (set target (vec_select op0 (parallel perm))) and
   return true if that's a valid instruction in the active ISA.  */

static bool
expand_vselect (rtx target, rtx op0, const unsigned char *perm,
		unsigned nelt, bool testing_p)
{
  unsigned int i;
  rtx x, save_vconcat;
  int icode;

  if (vselect_insn == NULL_RTX)
    init_vselect_insn ();

  x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
  PUT_NUM_ELEM (XVEC (x, 0), nelt);
  for (i = 0; i < nelt; ++i)
    XVECEXP (x, 0, i) = GEN_INT (perm[i]);
  save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
  XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
  PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
  SET_DEST (PATTERN (vselect_insn)) = target;
  icode = recog_memoized (vselect_insn);

  if (icode >= 0 && !testing_p)
    emit_insn (copy_rtx (PATTERN (vselect_insn)));

  SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
  XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
  INSN_CODE (vselect_insn) = -1;

  return icode >= 0;
}
/* Similar, but generate a vec_concat from op0 and op1 as well.  */

static bool
expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
			const unsigned char *perm, unsigned nelt,
			bool testing_p)
{
  machine_mode v2mode;
  rtx x;
  bool ok;

  if (vselect_insn == NULL_RTX)
    init_vselect_insn ();

  if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
    return false;
  x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
  PUT_MODE (x, v2mode);
  XEXP (x, 0) = op0;
  XEXP (x, 1) = op1;
  ok = expand_vselect (target, x, perm, nelt, testing_p);
  XEXP (x, 0) = const0_rtx;
  XEXP (x, 1) = const0_rtx;
  return ok;
}
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   using movss or movsd.  */

static bool
expand_vec_perm_movs (struct expand_vec_perm_d *d)
{
  machine_mode vmode = d->vmode;
  unsigned i, nelt = d->nelt;
  rtx x;

  if (d->one_operand_p)
    return false;

  if (!(TARGET_SSE && (vmode == V4SFmode || vmode == V4SImode))
      && !(TARGET_MMX_WITH_SSE && (vmode == V2SFmode || vmode == V2SImode))
      && !(TARGET_SSE2 && (vmode == V2DFmode || vmode == V2DImode)))
    return false;

  /* Only the first element is changed.  */
  if (d->perm[0] != nelt && d->perm[0] != 0)
    return false;
  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != i + nelt - d->perm[0])
      return false;

  if (d->testing_p)
    return true;

  if (d->perm[0] == nelt)
    x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
  else
    x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));

  emit_insn (gen_rtx_SET (d->target, x));

  return true;
}
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   using insertps.  */

static bool
expand_vec_perm_insertps (struct expand_vec_perm_d *d)
{
  machine_mode vmode = d->vmode;
  unsigned i, cnt_s, nelt = d->nelt;
  int cnt_d = -1;
  rtx src, dst;

  if (d->one_operand_p)
    return false;

  if (!(TARGET_SSE4_1
	&& (vmode == V4SFmode || vmode == V4SImode
	    || (TARGET_MMX_WITH_SSE
		&& (vmode == V2SFmode || vmode == V2SImode)))))
    return false;

  for (i = 0; i < nelt; ++i)
    {
      if (d->perm[i] == i)
	continue;
      if (cnt_d != -1)
	{
	  cnt_d = -1;
	  break;
	}
      cnt_d = i;
    }

  if (cnt_d == -1)
    {
      for (i = 0; i < nelt; ++i)
	{
	  if (d->perm[i] == i + nelt)
	    continue;
	  if (cnt_d != -1)
	    return false;
	  cnt_d = i;
	}

      if (cnt_d == -1)
	return false;
    }

  if (d->testing_p)
    return true;

  gcc_assert (cnt_d != -1);

  cnt_s = d->perm[cnt_d];
  if (cnt_s < nelt)
    {
      src = d->op0;
      dst = d->op1;
    }
  else
    {
      cnt_s -= nelt;
      src = d->op1;
      dst = d->op0;
    }
  gcc_assert (cnt_s < nelt);

  rtx x = gen_sse4_1_insertps (vmode, d->target, dst, src,
			       GEN_INT (cnt_s << 6 | cnt_d << 4));
  emit_insn (x);

  return true;
}
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   in terms of blendp[sd] / pblendw / pblendvb / vpblendd.  */

static bool
expand_vec_perm_blend (struct expand_vec_perm_d *d)
{
  machine_mode mmode, vmode = d->vmode;
  unsigned i, nelt = d->nelt;
  unsigned HOST_WIDE_INT mask;
  rtx target, op0, op1, maskop, x;
  rtx rperm[32], vperm;

  if (d->one_operand_p)
    return false;
  if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
      && (TARGET_AVX512BW
	  || GET_MODE_UNIT_SIZE (vmode) >= 4))
    ;
  else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
    ;
  else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
    ;
  else if (TARGET_SSE4_1
	   && (GET_MODE_SIZE (vmode) == 16
	       || (TARGET_MMX_WITH_SSE && GET_MODE_SIZE (vmode) == 8)
	       || GET_MODE_SIZE (vmode) == 4))
    ;
  else
    return false;

  /* This is a blend, not a permute.  Elements must stay in their
     respective lanes.  */
  for (i = 0; i < nelt; ++i)
    {
      unsigned e = d->perm[i];
      if (!(e == i || e == i + nelt))
	return false;
    }

  if (d->testing_p)
    return true;

  /* ??? Without SSE4.1, we could implement this with and/andn/or.  This
     decision should be extracted elsewhere, so that we only try that
     sequence once all budget==3 options have been tried.  */
  target = d->target;
  op0 = d->op0;
  op1 = d->op1;
  mask = 0;

  switch (vmode)
    {
    case E_V8DFmode:
    case E_V16SFmode:
    case E_V4DFmode:
    case E_V8SFmode:
    case E_V2DFmode:
    case E_V4SFmode:
    case E_V4HImode:
    case E_V8HImode:
    case E_V8SImode:
    case E_V32HImode:
    case E_V64QImode:
    case E_V16SImode:
    case E_V8DImode:
      for (i = 0; i < nelt; ++i)
	mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
      break;

    case E_V2DImode:
      for (i = 0; i < 2; ++i)
	mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
      vmode = V8HImode;
      goto do_subreg;

    case E_V2SImode:
      for (i = 0; i < 2; ++i)
	mask |= (d->perm[i] >= 2 ? 3 : 0) << (i * 2);
      vmode = V4HImode;
      goto do_subreg;

    case E_V4SImode:
      if (TARGET_AVX2)
	{
	  /* Use vpblendd instead of vpblendw.  */
	  for (i = 0; i < nelt; ++i)
	    mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
	  break;
	}
      else
	{
	  for (i = 0; i < 4; ++i)
	    mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
	  vmode = V8HImode;
	  goto do_subreg;
	}

    case E_V16QImode:
      /* See if bytes move in pairs so we can use pblendw with
	 an immediate argument, rather than pblendvb with a vector
	 argument.  */
      for (i = 0; i < 16; i += 2)
	if (d->perm[i] + 1 != d->perm[i + 1])
	  {
	  use_pblendvb:
	    for (i = 0; i < nelt; ++i)
	      rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);

	  finish_pblendvb:
	    vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
	    vperm = force_reg (vmode, vperm);

	    if (GET_MODE_SIZE (vmode) == 4)
	      emit_insn (gen_mmx_pblendvb_v4qi (target, op0, op1, vperm));
	    else if (GET_MODE_SIZE (vmode) == 8)
	      emit_insn (gen_mmx_pblendvb_v8qi (target, op0, op1, vperm));
	    else if (GET_MODE_SIZE (vmode) == 16)
	      emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
	    else
	      emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
	    if (target != d->target)
	      emit_move_insn (d->target, gen_lowpart (d->vmode, target));
	    return true;
	  }

      for (i = 0; i < 8; ++i)
	mask |= (d->perm[i * 2] >= 16) << i;
      vmode = V8HImode;
      /* FALLTHRU */

    do_subreg:
      target = gen_reg_rtx (vmode);
      op0 = gen_lowpart (vmode, op0);
      op1 = gen_lowpart (vmode, op1);
      break;

    case E_V8QImode:
      /* See if bytes move in pairs.  */
      for (i = 0; i < 8; i += 2)
	if (d->perm[i] + 1 != d->perm[i + 1])
	  goto use_pblendvb;

      for (i = 0; i < 4; ++i)
	mask |= (d->perm[i * 2] >= 8) << i;
      vmode = V4HImode;
      goto do_subreg;

    case E_V4QImode:
      /* See if bytes move in pairs.  */
      for (i = 0; i < 4; i += 2)
	if (d->perm[i] + 1 != d->perm[i + 1])
	  goto use_pblendvb;

      for (i = 0; i < 2; ++i)
	mask |= (d->perm[i * 2] >= 4) << i;
      vmode = V2HImode;
      goto do_subreg;

    case E_V32QImode:
      /* See if bytes move in pairs.  If not, vpblendvb must be used.  */
      for (i = 0; i < 32; i += 2)
	if (d->perm[i] + 1 != d->perm[i + 1])
	  goto use_pblendvb;
      /* See if bytes move in quadruplets.  If yes, vpblendd
	 with immediate can be used.  */
      for (i = 0; i < 32; i += 4)
	if (d->perm[i] + 2 != d->perm[i + 2])
	  break;
      if (i < 32)
	{
	  /* See if bytes move the same in both lanes.  If yes,
	     vpblendw with immediate can be used.  */
	  for (i = 0; i < 16; i += 2)
	    if (d->perm[i] + 16 != d->perm[i + 16])
	      goto use_pblendvb;

	  /* Use vpblendw.  */
	  for (i = 0; i < 16; ++i)
	    mask |= (d->perm[i * 2] >= 32) << i;
	  vmode = V16HImode;
	  goto do_subreg;
	}

      /* Use vpblendd.  */
      for (i = 0; i < 8; ++i)
	mask |= (d->perm[i * 4] >= 32) << i;
      vmode = V8SImode;
      goto do_subreg;

    case E_V16HImode:
      /* See if words move in pairs.  If yes, vpblendd can be used.  */
      for (i = 0; i < 16; i += 2)
	if (d->perm[i] + 1 != d->perm[i + 1])
	  break;
      if (i < 16)
	{
	  /* See if words move the same in both lanes.  If not,
	     vpblendvb must be used.  */
	  for (i = 0; i < 8; i++)
	    if (d->perm[i] + 8 != d->perm[i + 8])
	      {
		/* Use vpblendvb.  */
		for (i = 0; i < 32; ++i)
		  rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);

		vmode = V32QImode;
		nelt = 32;
		target = gen_reg_rtx (vmode);
		op0 = gen_lowpart (vmode, op0);
		op1 = gen_lowpart (vmode, op1);
		goto finish_pblendvb;
	      }

	  /* Use vpblendw.  */
	  for (i = 0; i < 16; ++i)
	    mask |= (d->perm[i] >= 16) << i;
	  break;
	}

      /* Use vpblendd.  */
      for (i = 0; i < 8; ++i)
	mask |= (d->perm[i * 2] >= 16) << i;
      vmode = V8SImode;
      goto do_subreg;

    case E_V4DImode:
      /* Use vpblendd.  */
      for (i = 0; i < 4; ++i)
	mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
      vmode = V8SImode;
      goto do_subreg;

    default:
      gcc_unreachable ();
    }

  switch (vmode)
    {
    case E_V8DFmode:
    case E_V8DImode:
      mmode = QImode;
      break;
    case E_V16SFmode:
    case E_V16SImode:
      mmode = HImode;
      break;
    case E_V32HImode:
      mmode = SImode;
      break;
    case E_V64QImode:
      mmode = DImode;
      break;
    default:
      mmode = VOIDmode;
    }

  if (mmode != VOIDmode)
    maskop = force_reg (mmode, gen_int_mode (mask, mmode));
  else
    maskop = GEN_INT (mask);

  /* This matches five different patterns with the different modes.  */
  x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
  x = gen_rtx_SET (target, x);
  emit_insn (x);
  if (target != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, target));

  return true;
}
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   in terms of the variable form of vpermilps.

   Note that we will have already failed the immediate input vpermilps,
   which requires that the high and low part shuffle be identical; the
   variable form doesn't require that.  */

static bool
expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
{
  rtx rperm[8], vperm;
  unsigned i;

  if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
    return false;

  /* We can only permute within the 128-bit lane.  */
  for (i = 0; i < 8; ++i)
    {
      unsigned e = d->perm[i];
      if (i < 4 ? e >= 4 : e < 4)
	return false;
    }

  if (d->testing_p)
    return true;

  for (i = 0; i < 8; ++i)
    {
      unsigned e = d->perm[i];

      /* Within each 128-bit lane, the elements of op0 are numbered
	 from 0 and the elements of op1 are numbered from 4.  */
      if (e >= 8 + 4)
	e -= (8 + 4);
      else if (e >= 4)
	e -= 4;

      rperm[i] = GEN_INT (e);
    }

  vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
  vperm = force_reg (V8SImode, vperm);
  emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));

  return true;
}
/* For V*[QHS]Imode permutations, check if the same permutation
   can't be performed in a 2x, 4x or 8x wider inner mode.  */

static bool
canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
			      struct expand_vec_perm_d *nd)
{
  int i;
  machine_mode mode = VOIDmode;

  switch (d->vmode)
    {
    case E_V8QImode: mode = V4HImode; break;
    case E_V16QImode: mode = V8HImode; break;
    case E_V32QImode: mode = V16HImode; break;
    case E_V64QImode: mode = V32HImode; break;
    case E_V4HImode: mode = V2SImode; break;
    case E_V8HImode: mode = V4SImode; break;
    case E_V16HImode: mode = V8SImode; break;
    case E_V32HImode: mode = V16SImode; break;
    case E_V4SImode: mode = V2DImode; break;
    case E_V8SImode: mode = V4DImode; break;
    case E_V16SImode: mode = V8DImode; break;
    default: return false;
    }
  for (i = 0; i < d->nelt; i += 2)
    if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
      return false;
  nd->vmode = mode;
  nd->nelt = d->nelt / 2;
  for (i = 0; i < nd->nelt; i++)
    nd->perm[i] = d->perm[2 * i] / 2;
  if (GET_MODE_INNER (mode) != DImode)
    canonicalize_vector_int_perm (nd, nd);
  if (nd != d)
    {
      nd->one_operand_p = d->one_operand_p;
      nd->testing_p = d->testing_p;
      if (d->op0 == d->op1)
	nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
      else
	{
	  nd->op0 = gen_lowpart (nd->vmode, d->op0);
	  nd->op1 = gen_lowpart (nd->vmode, d->op1);
	}
      if (d->testing_p)
	nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
      else
	nd->target = gen_reg_rtx (nd->vmode);
    }
  return true;
}
/* Return true if permutation D can be performed as VMODE permutation
   instead.  */

static bool
valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
{
  unsigned int i, j, chunk;

  if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
      || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
      || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
    return false;

  if (GET_MODE_NUNITS (vmode) >= d->nelt)
    return true;

  chunk = d->nelt / GET_MODE_NUNITS (vmode);
  for (i = 0; i < d->nelt; i += chunk)
    if (d->perm[i] & (chunk - 1))
      return false;
    else
      for (j = 1; j < chunk; ++j)
	if (d->perm[i] + j != d->perm[i + j])
	  return false;

  return true;
}
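
/* For example, a V16QImode permutation whose indices move in aligned
   groups of four consecutive bytes, such as
   { 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11 }, passes this
   check for V4SImode (chunk == 4) and can instead be performed as the
   word permutation { 1, 0, 3, 2 }.  */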
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128.  */

static bool
expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
{
  unsigned i, nelt, eltsz, mask;
  unsigned char perm[64];
  machine_mode vmode;
  struct expand_vec_perm_d nd;
  rtx rperm[64], vperm, target, op0, op1;

  nelt = d->nelt;

  if (!d->one_operand_p)
    switch (GET_MODE_SIZE (d->vmode))
      {
      case 4:
	if (!TARGET_XOP)
	  return false;
	vmode = V4QImode;
	break;

      case 8:
	if (!TARGET_XOP)
	  return false;
	vmode = V8QImode;
	break;

      case 16:
	if (!TARGET_XOP)
	  return false;
	vmode = V16QImode;
	break;

      case 32:
	if (!TARGET_AVX2)
	  return false;

	if (valid_perm_using_mode_p (V2TImode, d))
	  {
	    if (d->testing_p)
	      return true;

	    /* Use vperm2i128 insn.  The pattern uses
	       V4DImode instead of V2TImode.  */
	    target = d->target;
	    if (d->vmode != V4DImode)
	      target = gen_reg_rtx (V4DImode);
	    op0 = gen_lowpart (V4DImode, d->op0);
	    op1 = gen_lowpart (V4DImode, d->op1);
	    rperm[0]
	      = GEN_INT ((d->perm[0] / (nelt / 2))
			 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
	    emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
	    if (target != d->target)
	      emit_move_insn (d->target, gen_lowpart (d->vmode, target));
	    return true;
	  }
	/* FALLTHRU */

      default:
	return false;
      }
  else
    switch (GET_MODE_SIZE (d->vmode))
      {
      case 4:
	if (!TARGET_SSSE3)
	  return false;
	vmode = V4QImode;
	break;

      case 8:
	if (!TARGET_SSSE3)
	  return false;
	vmode = V8QImode;
	break;

      case 16:
	if (!TARGET_SSSE3)
	  return false;
	vmode = V16QImode;
	break;

      case 32:
	if (!TARGET_AVX2)
	  return false;

	/* V4DImode should be already handled through
	   expand_vselect by vpermq instruction.  */
	gcc_assert (d->vmode != V4DImode);

	vmode = V32QImode;
	if (d->vmode == V8SImode
	    || d->vmode == V16HImode
	    || d->vmode == V32QImode)
	  {
	    /* First see if vpermq can be used for
	       V8SImode/V16HImode/V32QImode.  */
	    if (valid_perm_using_mode_p (V4DImode, d))
	      {
		for (i = 0; i < 4; i++)
		  perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
		if (d->testing_p)
		  return true;
		target = gen_reg_rtx (V4DImode);
		if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
				    perm, 4, false))
		  {
		    emit_move_insn (d->target,
				    gen_lowpart (d->vmode, target));
		    return true;
		  }
		return false;
	      }

	    /* Next see if vpermd can be used.  */
	    if (valid_perm_using_mode_p (V8SImode, d))
	      vmode = V8SImode;
	  }
	/* Or if vpermps can be used.  */
	else if (d->vmode == V8SFmode)
	  vmode = V8SImode;

	if (vmode == V32QImode)
	  {
	    /* vpshufb only works intra lanes, it is not
	       possible to shuffle bytes in between the lanes.  */
	    for (i = 0; i < nelt; ++i)
	      if ((d->perm[i] ^ i) & (nelt / 2))
		return false;
	  }
	break;

      case 64:
	if (!TARGET_AVX512BW)
	  return false;

	/* If vpermq didn't work, vpshufb won't work either.  */
	if (d->vmode == V8DFmode || d->vmode == V8DImode)
	  return false;

	vmode = V64QImode;
	if (d->vmode == V16SImode
	    || d->vmode == V32HImode
	    || d->vmode == V64QImode)
	  {
	    /* First see if vpermq can be used for
	       V16SImode/V32HImode/V64QImode.  */
	    if (valid_perm_using_mode_p (V8DImode, d))
	      {
		for (i = 0; i < 8; i++)
		  perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
		if (d->testing_p)
		  return true;
		target = gen_reg_rtx (V8DImode);
		if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
				    perm, 8, false))
		  {
		    emit_move_insn (d->target,
				    gen_lowpart (d->vmode, target));
		    return true;
		  }
		return false;
	      }

	    /* Next see if vpermd can be used.  */
	    if (valid_perm_using_mode_p (V16SImode, d))
	      vmode = V16SImode;
	  }
	/* Or if vpermps can be used.  */
	else if (d->vmode == V16SFmode)
	  vmode = V16SImode;

	if (vmode == V64QImode)
	  {
	    /* vpshufb only works intra lanes, it is not
	       possible to shuffle bytes in between the lanes.  */
	    for (i = 0; i < nelt; ++i)
	      if ((d->perm[i] ^ i) & (3 * nelt / 4))
		return false;
	  }
	break;

      default:
	return false;
      }

  if (d->testing_p)
    return true;

  /* Try to avoid variable permutation instruction.  */
  if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
    {
      emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
      return true;
    }

  if (vmode == V8SImode)
    for (i = 0; i < 8; ++i)
      rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
  else if (vmode == V16SImode)
    for (i = 0; i < 16; ++i)
      rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
  else
    {
      eltsz = GET_MODE_UNIT_SIZE (d->vmode);
      if (!d->one_operand_p)
	mask = 2 * nelt - 1;
      else if (vmode == V64QImode)
	mask = nelt / 4 - 1;
      else if (vmode == V32QImode)
	mask = nelt / 2 - 1;
      else
	mask = nelt - 1;

      for (i = 0; i < nelt; ++i)
	{
	  unsigned j, e = d->perm[i] & mask;
	  for (j = 0; j < eltsz; ++j)
	    rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
	}
    }

  machine_mode vpmode = vmode;

  if (vmode == V4QImode || vmode == V8QImode)
    {
      nelt = GET_MODE_SIZE (vmode);

      /* Emulate narrow modes with V16QI instructions.  */
      rtx m128 = GEN_INT (-128);

      /* Remap elements from the second operand, as we have to
	 account for inactive top elements from the first operand.  */
      if (!d->one_operand_p)
	for (i = 0; i < nelt; ++i)
	  {
	    unsigned ival = UINTVAL (rperm[i]);
	    if (ival >= nelt)
	      rperm[i] = GEN_INT (ival + 16 - nelt);
	  }

      /* Fill inactive elements in the top positions with zeros.  */
      for (i = nelt; i < 16; ++i)
	rperm[i] = m128;

      vpmode = V16QImode;
    }

  vperm = gen_rtx_CONST_VECTOR (vpmode,
				gen_rtvec_v (GET_MODE_NUNITS (vpmode), rperm));
  vperm = force_reg (vpmode, vperm);

  if (vmode == d->vmode)
    target = d->target;
  else
    target = gen_reg_rtx (vmode);

  op0 = gen_lowpart (vmode, d->op0);

  if (d->one_operand_p)
    {
      rtx (*gen) (rtx, rtx, rtx);

      if (vmode == V4QImode)
	gen = gen_mmx_pshufbv4qi3;
      else if (vmode == V8QImode)
	gen = gen_mmx_pshufbv8qi3;
      else if (vmode == V16QImode)
	gen = gen_ssse3_pshufbv16qi3;
      else if (vmode == V32QImode)
	gen = gen_avx2_pshufbv32qi3;
      else if (vmode == V64QImode)
	gen = gen_avx512bw_pshufbv64qi3;
      else if (vmode == V8SFmode)
	gen = gen_avx2_permvarv8sf;
      else if (vmode == V8SImode)
	gen = gen_avx2_permvarv8si;
      else if (vmode == V16SFmode)
	gen = gen_avx512f_permvarv16sf;
      else if (vmode == V16SImode)
	gen = gen_avx512f_permvarv16si;
      else
	gcc_unreachable ();

      emit_insn (gen (target, op0, vperm));
    }
  else
    {
      rtx (*gen) (rtx, rtx, rtx, rtx);

      op1 = gen_lowpart (vmode, d->op1);

      if (vmode == V4QImode)
	gen = gen_mmx_ppermv32;
      else if (vmode == V8QImode)
	gen = gen_mmx_ppermv64;
      else if (vmode == V16QImode)
	gen = gen_xop_pperm;
      else
	gcc_unreachable ();

      emit_insn (gen (target, op0, op1, vperm));
    }

  if (target != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, target));

  return true;
}
/* Try to expand one-operand permutation with constant mask.  */

static bool
ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
{
  machine_mode mode = GET_MODE (d->op0);
  machine_mode maskmode = mode;
  unsigned inner_size = GET_MODE_SIZE (GET_MODE_INNER (mode));
  rtx (*gen) (rtx, rtx, rtx) = NULL;
  rtx target, op0, mask;
  rtx vec[64];

  if (!rtx_equal_p (d->op0, d->op1))
    return false;

  if (!TARGET_AVX512F)
    return false;

  /* Accept VNxHImode and VNxQImode now.  */
  if (!TARGET_AVX512VL && GET_MODE_SIZE (mode) < 64)
    return false;

  /* vpermw.  */
  if (!TARGET_AVX512BW && inner_size == 2)
    return false;

  /* vpermb.  */
  if (!TARGET_AVX512VBMI && inner_size == 1)
    return false;

  switch (mode)
    {
    case E_V16SImode:
      gen = gen_avx512f_permvarv16si;
      break;
    case E_V16SFmode:
      gen = gen_avx512f_permvarv16sf;
      maskmode = V16SImode;
      break;
    case E_V8DImode:
      gen = gen_avx512f_permvarv8di;
      break;
    case E_V8DFmode:
      gen = gen_avx512f_permvarv8df;
      maskmode = V8DImode;
      break;
    case E_V32HImode:
      gen = gen_avx512bw_permvarv32hi;
      break;
    case E_V16HImode:
      gen = gen_avx512vl_permvarv16hi;
      break;
    case E_V8HImode:
      gen = gen_avx512vl_permvarv8hi;
      break;
    case E_V64QImode:
      gen = gen_avx512bw_permvarv64qi;
      break;
    case E_V32QImode:
      gen = gen_avx512vl_permvarv32qi;
      break;
    case E_V16QImode:
      gen = gen_avx512vl_permvarv16qi;
      break;
    default:
      return false;
    }

  if (d->testing_p)
    return true;

  target = d->target;
  op0 = d->op0;
  for (int i = 0; i < d->nelt; ++i)
    vec[i] = GEN_INT (d->perm[i]);
  mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
  emit_insn (gen (target, op0, force_reg (maskmode, mask)));
  return true;
}
static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);

/* A subroutine of ix86_expand_vec_perm_const_1.  Try to instantiate D
   in a single instruction.  */

static bool
expand_vec_perm_1 (struct expand_vec_perm_d *d)
{
  unsigned i, nelt = d->nelt;
  struct expand_vec_perm_d nd;

  /* Check plain VEC_SELECT first, because AVX has instructions that could
     match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
     input where SEL+CONCAT may not.  */
  if (d->one_operand_p)
    {
      int mask = nelt - 1;
      bool identity_perm = true;
      bool broadcast_perm = true;

      for (i = 0; i < nelt; i++)
	{
	  nd.perm[i] = d->perm[i] & mask;
	  if (nd.perm[i] != i)
	    identity_perm = false;
	  if (nd.perm[i])
	    broadcast_perm = false;
	}

      if (identity_perm)
	{
	  if (!d->testing_p)
	    emit_move_insn (d->target, d->op0);
	  return true;
	}
      else if (broadcast_perm && TARGET_AVX2)
	{
	  /* Use vpbroadcast{b,w,d}.  */
	  rtx (*gen) (rtx, rtx) = NULL;
	  switch (d->vmode)
	    {
	    case E_V64QImode:
	      if (TARGET_AVX512BW)
		gen = gen_avx512bw_vec_dupv64qi_1;
	      break;
	    case E_V32QImode:
	      gen = gen_avx2_pbroadcastv32qi_1;
	      break;
	    case E_V32HImode:
	      if (TARGET_AVX512BW)
		gen = gen_avx512bw_vec_dupv32hi_1;
	      break;
	    case E_V16HImode:
	      gen = gen_avx2_pbroadcastv16hi_1;
	      break;
	    case E_V16SImode:
	      if (TARGET_AVX512F)
		gen = gen_avx512f_vec_dupv16si_1;
	      break;
	    case E_V8SImode:
	      gen = gen_avx2_pbroadcastv8si_1;
	      break;
	    case E_V16QImode:
	      gen = gen_avx2_pbroadcastv16qi;
	      break;
	    case E_V8HImode:
	      gen = gen_avx2_pbroadcastv8hi;
	      break;
	    case E_V16SFmode:
	      if (TARGET_AVX512F)
		gen = gen_avx512f_vec_dupv16sf_1;
	      break;
	    case E_V8SFmode:
	      gen = gen_avx2_vec_dupv8sf_1;
	      break;
	    case E_V8DFmode:
	      if (TARGET_AVX512F)
		gen = gen_avx512f_vec_dupv8df_1;
	      break;
	    case E_V8DImode:
	      if (TARGET_AVX512F)
		gen = gen_avx512f_vec_dupv8di_1;
	      break;
	    /* For other modes prefer other shuffles this function creates.  */
	    default: break;
	    }
	  if (gen != NULL)
	    {
	      if (!d->testing_p)
		emit_insn (gen (d->target, d->op0));
	      return true;
	    }
	}

      if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
	return true;

      /* There are plenty of patterns in sse.md that are written for
	 SEL+CONCAT and are not replicated for a single op.  Perhaps
	 that should be changed, to avoid the nastiness here.  */

      /* Recognize interleave style patterns, which means incrementing
	 every other permutation operand.  */
      for (i = 0; i < nelt; i += 2)
	{
	  nd.perm[i] = d->perm[i] & mask;
	  nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
	}
      if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
				  d->testing_p))
	return true;

      /* Recognize shufps, which means adding {0, 0, nelt, nelt}.  */
      if (nelt >= 4)
	{
	  for (i = 0; i < nelt; i += 4)
	    {
	      nd.perm[i + 0] = d->perm[i + 0] & mask;
	      nd.perm[i + 1] = d->perm[i + 1] & mask;
	      nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
	      nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
	    }

	  if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
				      d->testing_p))
	    return true;
	}
    }

  /* Try the SSE4.1 blend variable merge instructions.  */
  if (expand_vec_perm_blend (d))
    return true;

  /* Try movss/movsd instructions.  */
  if (expand_vec_perm_movs (d))
    return true;

  /* Try the SSE4.1 insertps instruction.  */
  if (expand_vec_perm_insertps (d))
    return true;

  /* Try the fully general two operand permute.  */
  if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
			      d->testing_p))
    return true;

  /* Recognize interleave style patterns with reversed operands.  */
  if (!d->one_operand_p)
    {
      for (i = 0; i < nelt; ++i)
	{
	  unsigned e = d->perm[i];
	  if (e >= nelt)
	    e -= nelt;
	  else
	    e += nelt;
	  nd.perm[i] = e;
	}

      if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
				  d->testing_p))
	return true;
    }

  /* Try one of the AVX vpermil variable permutations.  */
  if (expand_vec_perm_vpermil (d))
    return true;

  /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
     vpshufb, vpermd, vpermps or vpermq variable permutation.  */
  if (expand_vec_perm_pshufb (d))
    return true;

  /* Try the AVX2 vpalignr instruction.  */
  if (expand_vec_perm_palignr (d, true))
    return true;

  /* Try the AVX512F vperm{w,b,s,d} instructions  */
  if (ix86_expand_vec_one_operand_perm_avx512 (d))
    return true;

  /* Try the AVX512F vpermt2/vpermi2 instructions.  */
  if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
    return true;

  /* See if we can get the same permutation in different vector integer
     mode.  */
  if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
    {
      if (!d->testing_p)
	emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
      return true;
    }
  return false;
}
/* Canonicalize vec_perm index to make the first index
   always come from the first vector.  */
static void
ix86_vec_perm_index_canon (struct expand_vec_perm_d *d)
{
  unsigned nelt = d->nelt;
  if (d->perm[0] < nelt)
    return;

  for (unsigned i = 0; i != nelt; i++)
    d->perm[i] = (d->perm[i] + nelt) % (2 * nelt);

  std::swap (d->op0, d->op1);
}
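
/* For example, with nelt == 4 and perm = { 5, 1, 6, 2 }, the first
   index selects from the second vector, so the operands are swapped and
   the indices are remapped modulo 2 * nelt to { 1, 5, 2, 6 }.  */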

/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   in terms of a pair of shufps + shufps/pshufd instructions.  */

static bool
expand_vec_perm_shufps_shufps (struct expand_vec_perm_d *d)
{
  unsigned char perm1[4];
  machine_mode vmode = d->vmode;
  bool ok;
  unsigned i, j, k, count = 0;

  if (d->one_operand_p
      || (vmode != V4SImode && vmode != V4SFmode))
    return false;

  if (d->testing_p)
    return true;

  ix86_vec_perm_index_canon (d);
  for (i = 0; i < 4; ++i)
    count += d->perm[i] > 3 ? 1 : 0;

  gcc_assert (count & 3);

  rtx tmp = gen_reg_rtx (vmode);
  /* 2 from op0 and 2 from op1.  */
  if (count == 2)
    {
      unsigned char perm2[4];
      for (i = 0, j = 0, k = 2; i < 4; ++i)
	if (d->perm[i] & 4)
	  {
	    perm1[k++] = d->perm[i];
	    perm2[i] = k - 1;
	  }
	else
	  {
	    perm1[j++] = d->perm[i];
	    perm2[i] = j - 1;
	  }

      /* shufps.  */
      ok = expand_vselect_vconcat (tmp, d->op0, d->op1,
				   perm1, d->nelt, false);
      gcc_assert (ok);
      if (vmode == V4SImode && TARGET_SSE2)
	/* pshufd.  */
	ok = expand_vselect (d->target, tmp,
			     perm2, d->nelt, false);
      else
	{
	  /* shufps.  */
	  perm2[2] += 4;
	  perm2[3] += 4;
	  ok = expand_vselect_vconcat (d->target, tmp, tmp,
				       perm2, d->nelt, false);
	}
      gcc_assert (ok);
    }
  /* 3 from one op and 1 from another.  */
  else
    {
      unsigned pair_idx = 8, lone_idx = 8, shift;

      /* Find the lone index.  */
      for (i = 0; i < 4; ++i)
	if ((d->perm[i] > 3 && count == 1)
	    || (d->perm[i] < 4 && count == 3))
	  lone_idx = i;

      /* When lone_idx is not 0, it must come from the second op
	 (count == 1).  */
      gcc_assert (count == (lone_idx ? 1 : 3));

      /* Find the pair index that sits in the same half as the lone index.  */
      shift = lone_idx & 2;
      pair_idx = 1 - lone_idx + 2 * shift;

      /* First permute the lone index and the pair index into the same
	 vector as [ lone, lone, pair, pair ].  */
      perm1[1] = perm1[0]
	= (count == 3) ? d->perm[lone_idx] : d->perm[lone_idx] - 4;
      perm1[3] = perm1[2]
	= (count == 3) ? d->perm[pair_idx] : d->perm[pair_idx] + 4;

      /* Always put the vector containing the lone index first.  */
      if (count == 1)
	std::swap (d->op0, d->op1);

      /* shufps.  */
      ok = expand_vselect_vconcat (tmp, d->op0, d->op1,
				   perm1, d->nelt, false);
      gcc_assert (ok);

      /* Refine lone and pair index to original order.  */
      perm1[shift] = lone_idx << 1;
      perm1[shift + 1] = pair_idx << 1;

      /* Select the remaining 2 elements in another vector.  */
      for (i = 2 - shift; i < 4 - shift; ++i)
	perm1[i] = lone_idx == 1 ? d->perm[i] + 4 : d->perm[i];

      /* Adjust to original selector.  */
      if (lone_idx > 1)
	std::swap (tmp, d->op1);

      /* shufps.  */
      ok = expand_vselect_vconcat (d->target, tmp, d->op1,
				   perm1, d->nelt, false);
      gcc_assert (ok);
    }

  return true;
}
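
/* Illustrative sketch of the two-insn shape this routine emits, written
   with SSE intrinsics.  The function name and values are invented for
   illustration; this is not part of GCC.  For V4SFmode and
   d->perm == { 0, 4, 1, 5 }, count == 2, perm1 == { 0, 1, 4, 5 } and
   perm2 == { 0, 2, 1, 3 }:

     #include <xmmintrin.h>

     __m128 perm_0415 (__m128 a, __m128 b)
     {
       // shufps: t = { a0, a1, b0, b1 }
       __m128 t = _mm_shuffle_ps (a, b, _MM_SHUFFLE (1, 0, 1, 0));
       // shufps: { t0, t2, t1, t3 } = { a0, b0, a1, b1 }
       return _mm_shuffle_ps (t, t, _MM_SHUFFLE (3, 1, 2, 0));
     }
*/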

/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   in terms of a pair of pshuflw + pshufhw instructions.  */

static bool
expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
{
  unsigned char perm2[MAX_VECT_LEN];
  unsigned i;
  bool ok;

  if (d->vmode != V8HImode || !d->one_operand_p)
    return false;

  /* The two permutations only operate in 64-bit lanes.  */
  for (i = 0; i < 4; ++i)
    if (d->perm[i] >= 4)
      return false;
  for (i = 4; i < 8; ++i)
    if (d->perm[i] < 4)
      return false;

  if (d->testing_p)
    return true;

  /* Emit the pshuflw.  */
  memcpy (perm2, d->perm, 4);
  for (i = 4; i < 8; ++i)
    perm2[i] = i;
  ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
  gcc_assert (ok);

  /* Emit the pshufhw.  */
  memcpy (perm2 + 4, d->perm + 4, 4);
  for (i = 0; i < 4; ++i)
    perm2[i] = i;
  ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
  gcc_assert (ok);

  return true;
}
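
/* Worked example (illustrative; not part of the original sources):
   for d->perm == { 2, 0, 3, 1, 6, 4, 7, 5 } the low four indices are
   all < 4 and the high four all >= 4, so the expansion is pshuflw with
   word selector { 2, 0, 3, 1 } (immediate 0x72) followed by pshufhw
   with selector { 2, 0, 3, 1 } within the high half (also 0x72).  */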

/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   the permutation using the SSSE3 palignr instruction.  This succeeds
   when all of the elements in PERM fit within one vector and we merely
   need to shift them down so that a single vector permutation has a
   chance to succeed.  If SINGLE_INSN_ONLY_P, succeed if only
   the vpalignr instruction itself can perform the requested permutation.  */

static bool
expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
{
  unsigned i, nelt = d->nelt;
  unsigned min, max, minswap, maxswap;
  bool in_order, ok, swap = false;
  rtx shift, target;
  struct expand_vec_perm_d dcopy;

  /* Even with AVX, palignr only operates on 128-bit vectors,
     in AVX2 palignr operates on both 128-bit lanes.  */
  if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
      && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
    return false;

  min = 2 * nelt;
  max = 0;
  minswap = 2 * nelt;
  maxswap = 0;
  for (i = 0; i < nelt; ++i)
    {
      unsigned e = d->perm[i];
      unsigned eswap = d->perm[i] ^ nelt;
      if (GET_MODE_SIZE (d->vmode) == 32)
	{
	  e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
	  eswap = e ^ (nelt / 2);
	}
      if (e < min)
	min = e;
      if (e > max)
	max = e;
      if (eswap < minswap)
	minswap = eswap;
      if (eswap > maxswap)
	maxswap = eswap;
    }
  if (min == 0
      || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
    {
      if (d->one_operand_p
	  || minswap == 0
	  || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
				   ? nelt / 2 : nelt))
	return false;
      swap = true;
      min = minswap;
      max = maxswap;
    }

  /* Given that we have SSSE3, we know we'll be able to implement the
     single operand permutation after the palignr with pshufb for
     128-bit vectors.  If SINGLE_INSN_ONLY_P, in_order has to be computed
     first.  */
  if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
    return true;

  dcopy = *d;
  if (swap)
    {
      dcopy.op0 = d->op1;
      dcopy.op1 = d->op0;
      for (i = 0; i < nelt; ++i)
	dcopy.perm[i] ^= nelt;
    }

  in_order = true;
  for (i = 0; i < nelt; ++i)
    {
      unsigned e = dcopy.perm[i];
      if (GET_MODE_SIZE (d->vmode) == 32
	  && e >= nelt
	  && (e & (nelt / 2 - 1)) < min)
	e = e - min - (nelt / 2);
      else
	e = e - min;
      if (e != i)
	in_order = false;
      dcopy.perm[i] = e;
    }
  dcopy.one_operand_p = true;

  if (single_insn_only_p && !in_order)
    return false;

  /* For AVX2, test whether we can permute the result in one instruction.  */
  if (d->testing_p)
    {
      if (in_order)
	return true;
      dcopy.op1 = dcopy.op0;
      return expand_vec_perm_1 (&dcopy);
    }

  shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
  if (GET_MODE_SIZE (d->vmode) == 16)
    {
      target = gen_reg_rtx (V1TImode);
      emit_insn (gen_ssse3_palignrv1ti (target,
					gen_lowpart (V1TImode, dcopy.op1),
					gen_lowpart (V1TImode, dcopy.op0),
					shift));
    }
  else
    {
      target = gen_reg_rtx (V2TImode);
      emit_insn (gen_avx2_palignrv2ti (target,
				       gen_lowpart (V2TImode, dcopy.op1),
				       gen_lowpart (V2TImode, dcopy.op0),
				       shift));
    }

  dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);

  /* Test for the degenerate case where the alignment by itself
     produces the desired permutation.  */
  if (in_order)
    {
      emit_move_insn (d->target, dcopy.op0);
      return true;
    }

  ok = expand_vec_perm_1 (&dcopy);
  gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);

  return ok;
}

/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   the permutation using the SSE4_1 pblendv instruction.  Potentially
   reduces permutation from 2 pshufb and or to 1 pshufb and pblendv.  */

static bool
expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
{
  unsigned i, which, nelt = d->nelt;
  struct expand_vec_perm_d dcopy, dcopy1;
  machine_mode vmode = d->vmode;
  bool ok;

  /* Use the same checks as in expand_vec_perm_blend.  */
  if (d->one_operand_p)
    return false;
  if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
    ;
  else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
    ;
  else if (TARGET_SSE4_1
	   && (GET_MODE_SIZE (vmode) == 16
	       || (TARGET_MMX_WITH_SSE && GET_MODE_SIZE (vmode) == 8)
	       || GET_MODE_SIZE (vmode) == 4))
    ;
  else
    return false;

  /* Figure out where permutation elements stay not in their
     respective lanes.  */
  for (i = 0, which = 0; i < nelt; ++i)
    {
      unsigned e = d->perm[i];
      if (e != i)
	which |= (e < nelt ? 1 : 2);
    }
  /* We can pblend the part where elements stay not in their
     respective lanes only when these elements are all in one
     half of a permutation.
     {0 1 8 3 4 5 9 7} is ok, as 8 and 9 are not in their respective
     lanes, but both 8 and 9 are >= 8.
     {0 1 8 3 4 5 2 7} is not ok, as 2 and 8 are not in their
     respective lanes and 8 >= 8, but 2 is not.  */
  if (which != 1 && which != 2)
    return false;

  if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
    return true;

  /* First we apply one operand permutation to the part where
     elements stay not in their respective lanes.  */
  dcopy = *d;
  if (which == 2)
    dcopy.op0 = dcopy.op1 = d->op1;
  else
    dcopy.op0 = dcopy.op1 = d->op0;
  if (!d->testing_p)
    dcopy.target = gen_reg_rtx (vmode);
  dcopy.one_operand_p = true;

  for (i = 0; i < nelt; ++i)
    dcopy.perm[i] = d->perm[i] & (nelt - 1);

  ok = expand_vec_perm_1 (&dcopy);
  if (GET_MODE_SIZE (vmode) != 16 && !ok)
    return false;
  else
    gcc_assert (ok);
  if (d->testing_p)
    return true;

  /* Next we put permuted elements into their positions.  */
  dcopy1 = *d;
  if (which == 2)
    dcopy1.op1 = dcopy.target;
  else
    dcopy1.op0 = dcopy.target;

  for (i = 0; i < nelt; ++i)
    dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);

  ok = expand_vec_perm_blend (&dcopy1);
  gcc_assert (ok);

  return true;
}
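
/* Worked example (illustrative; not part of the original sources):
   for V8HImode and d->perm == { 0, 1, 8, 3, 4, 5, 9, 7 }, the
   out-of-place elements 8 and 9 both come from op1, so which == 2.
   dcopy permutes op1 alone with { 0, 1, 0, 3, 4, 5, 1, 7 }, and dcopy1
   blends with { 0, 1, 10, 3, 4, 5, 14, 7 }, i.e. lanes 2 and 6 are
   taken from the permuted op1 and the rest from op0.  */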

static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);

/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   a two vector permutation into a single vector permutation by using
   an interleave operation to merge the vectors.  */

static bool
expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dremap, dfinal;
  unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
  unsigned HOST_WIDE_INT contents;
  unsigned char remap[2 * MAX_VECT_LEN];
  rtx_insn *seq;
  bool ok, same_halves = false;

  if (GET_MODE_SIZE (d->vmode) == 4
      || GET_MODE_SIZE (d->vmode) == 8
      || GET_MODE_SIZE (d->vmode) == 16)
    {
      if (d->one_operand_p)
	return false;
    }
  else if (GET_MODE_SIZE (d->vmode) == 32)
    {
      if (!TARGET_AVX)
	return false;
      /* For 32-byte modes allow even d->one_operand_p.
	 The lack of cross-lane shuffling in some instructions
	 might prevent a single insn shuffle.  */
      dfinal = *d;
      dfinal.testing_p = true;
      /* If expand_vec_perm_interleave3 can expand this into
	 a 3 insn sequence, give up and let it be expanded as
	 3 insn sequence.  While that is one insn longer,
	 it doesn't need a memory operand and in the common
	 case that both interleave low and high permutations
	 with the same operands are adjacent needs 4 insns
	 for both after CSE.  */
      if (expand_vec_perm_interleave3 (&dfinal))
	return false;
    }
  else
    return false;

  /* Examine from whence the elements come.  */
  contents = 0;
  for (i = 0; i < nelt; ++i)
    contents |= HOST_WIDE_INT_1U << d->perm[i];

  memset (remap, 0xff, sizeof (remap));
  dremap = *d;

  if (GET_MODE_SIZE (d->vmode) == 4
      || GET_MODE_SIZE (d->vmode) == 8)
    {
      unsigned HOST_WIDE_INT h1, h2, h3, h4;

      /* Split the two input vectors into 4 halves.  */
      h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
      h2 = h1 << nelt2;
      h3 = h2 << nelt2;
      h4 = h3 << nelt2;

      /* If the elements from the low halves use interleave low,
	 and similarly for interleave high.  */
      if ((contents & (h1 | h3)) == contents)
	{
	  /* punpckl* */
	  for (i = 0; i < nelt2; ++i)
	    {
	      remap[i] = i * 2;
	      remap[i + nelt] = i * 2 + 1;
	      dremap.perm[i * 2] = i;
	      dremap.perm[i * 2 + 1] = i + nelt;
	    }
	}
      else if ((contents & (h2 | h4)) == contents)
	{
	  /* punpckh* */
	  for (i = 0; i < nelt2; ++i)
	    {
	      remap[i + nelt2] = i * 2;
	      remap[i + nelt + nelt2] = i * 2 + 1;
	      dremap.perm[i * 2] = i + nelt2;
	      dremap.perm[i * 2 + 1] = i + nelt + nelt2;
	    }
	}
      else
	return false;
    }
  else if (GET_MODE_SIZE (d->vmode) == 16)
    {
      unsigned HOST_WIDE_INT h1, h2, h3, h4;

      /* Split the two input vectors into 4 halves.  */
      h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
      h2 = h1 << nelt2;
      h3 = h2 << nelt2;
      h4 = h3 << nelt2;

      /* If the elements from the low halves use interleave low, and similarly
	 for interleave high.  If the elements are from mis-matched halves, we
	 can use shufps for V4SF/V4SI or do a DImode shuffle.  */
      if ((contents & (h1 | h3)) == contents)
	{
	  /* punpckl* */
	  for (i = 0; i < nelt2; ++i)
	    {
	      remap[i] = i * 2;
	      remap[i + nelt] = i * 2 + 1;
	      dremap.perm[i * 2] = i;
	      dremap.perm[i * 2 + 1] = i + nelt;
	    }
	  if (!TARGET_SSE2 && d->vmode == V4SImode)
	    dremap.vmode = V4SFmode;
	}
      else if ((contents & (h2 | h4)) == contents)
	{
	  /* punpckh* */
	  for (i = 0; i < nelt2; ++i)
	    {
	      remap[i + nelt2] = i * 2;
	      remap[i + nelt + nelt2] = i * 2 + 1;
	      dremap.perm[i * 2] = i + nelt2;
	      dremap.perm[i * 2 + 1] = i + nelt + nelt2;
	    }
	  if (!TARGET_SSE2 && d->vmode == V4SImode)
	    dremap.vmode = V4SFmode;
	}
      else if ((contents & (h1 | h4)) == contents)
	{
	  /* shufps */
	  for (i = 0; i < nelt2; ++i)
	    {
	      remap[i] = i;
	      remap[i + nelt + nelt2] = i + nelt2;
	      dremap.perm[i] = i;
	      dremap.perm[i + nelt2] = i + nelt + nelt2;
	    }
	  if (nelt != 4)
	    {
	      /* shufpd */
	      dremap.vmode = V2DImode;
	      dremap.nelt = 2;
	      dremap.perm[0] = 0;
	      dremap.perm[1] = 3;
	    }
	}
      else if ((contents & (h2 | h3)) == contents)
	{
	  /* shufps */
	  for (i = 0; i < nelt2; ++i)
	    {
	      remap[i + nelt2] = i;
	      remap[i + nelt] = i + nelt2;
	      dremap.perm[i] = i + nelt2;
	      dremap.perm[i + nelt2] = i + nelt;
	    }
	  if (nelt != 4)
	    {
	      /* shufpd */
	      dremap.vmode = V2DImode;
	      dremap.nelt = 2;
	      dremap.perm[0] = 1;
	      dremap.perm[1] = 2;
	    }
	}
      else
	return false;
    }
  else
    {
      unsigned int nelt4 = nelt / 4, nzcnt = 0;
      unsigned HOST_WIDE_INT q[8];
      unsigned int nonzero_halves[4];

      /* Split the two input vectors into 8 quarters.  */
      q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
      for (i = 1; i < 8; ++i)
	q[i] = q[0] << (nelt4 * i);
      for (i = 0; i < 4; ++i)
	if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
	  {
	    nonzero_halves[nzcnt] = i;
	    ++nzcnt;
	  }

      if (nzcnt == 1)
	{
	  gcc_assert (d->one_operand_p);
	  nonzero_halves[1] = nonzero_halves[0];
	  same_halves = true;
	}
      else if (d->one_operand_p)
	{
	  gcc_assert (nonzero_halves[0] == 0);
	  gcc_assert (nonzero_halves[1] == 1);
	}

      if (nzcnt <= 2)
	{
	  if (d->perm[0] / nelt2 == nonzero_halves[1])
	    {
	      /* Attempt to increase the likelihood that dfinal
		 shuffle will be intra-lane.  */
	      std::swap (nonzero_halves[0], nonzero_halves[1]);
	    }

	  /* vperm2f128 or vperm2i128.  */
	  for (i = 0; i < nelt2; ++i)
	    {
	      remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
	      remap[i + nonzero_halves[0] * nelt2] = i;
	      dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
	      dremap.perm[i] = i + nonzero_halves[0] * nelt2;
	    }

	  if (d->vmode != V8SFmode
	      && d->vmode != V4DFmode
	      && d->vmode != V8SImode)
	    {
	      dremap.vmode = V8SImode;
	      dremap.nelt = 8;
	      for (i = 0; i < 4; ++i)
		{
		  dremap.perm[i] = i + nonzero_halves[0] * 4;
		  dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
		}
	    }
	}
      else if (d->one_operand_p)
	return false;
      else if (TARGET_AVX2
	       && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
	{
	  /* vpunpckl* */
	  for (i = 0; i < nelt4; ++i)
	    {
	      remap[i] = i * 2;
	      remap[i + nelt] = i * 2 + 1;
	      remap[i + nelt2] = i * 2 + nelt2;
	      remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
	      dremap.perm[i * 2] = i;
	      dremap.perm[i * 2 + 1] = i + nelt;
	      dremap.perm[i * 2 + nelt2] = i + nelt2;
	      dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
	    }
	}
      else if (TARGET_AVX2
	       && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
	{
	  /* vpunpckh* */
	  for (i = 0; i < nelt4; ++i)
	    {
	      remap[i + nelt4] = i * 2;
	      remap[i + nelt + nelt4] = i * 2 + 1;
	      remap[i + nelt2 + nelt4] = i * 2 + nelt2;
	      remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
	      dremap.perm[i * 2] = i + nelt4;
	      dremap.perm[i * 2 + 1] = i + nelt + nelt4;
	      dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
	      dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
	    }
	}
      else
	return false;
    }

  /* Use the remapping array set up above to move the elements from their
     swizzled locations into their final destinations.  */
  dfinal = *d;
  for (i = 0; i < nelt; ++i)
    {
      unsigned e = remap[d->perm[i]];
      gcc_assert (e < nelt);
      /* If same_halves is true, both halves of the remapped vector are the
	 same.  Avoid cross-lane accesses if possible.  */
      if (same_halves && i >= nelt2)
	{
	  gcc_assert (e < nelt2);
	  dfinal.perm[i] = e + nelt2;
	}
      else
	dfinal.perm[i] = e;
    }
  if (!d->testing_p)
    {
      dremap.target = gen_reg_rtx (dremap.vmode);
      dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
    }
  dfinal.op1 = dfinal.op0;
  dfinal.one_operand_p = true;

  /* Test if the final remap can be done with a single insn.  For V4SFmode or
     V4SImode this *will* succeed.  For V8HImode or V16QImode it may not.  */
  start_sequence ();
  ok = expand_vec_perm_1 (&dfinal);
  seq = get_insns ();
  end_sequence ();

  if (!ok)
    return false;

  if (d->testing_p)
    return true;

  if (dremap.vmode != dfinal.vmode)
    {
      dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
      dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
    }

  ok = expand_vec_perm_1 (&dremap);
  gcc_assert (ok);

  emit_insn (seq);
  return true;
}
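
/* Worked example (illustrative; not part of the original sources):
   for V8HImode and d->perm == { 1, 9, 0, 8, 3, 11, 2, 10 }, contents
   covers only elements { 0-3, 8-11 }, the low halves of both inputs,
   so dremap becomes the interleave-low { 0, 8, 1, 9, 2, 10, 3, 11 }
   (punpcklwd).  remap[] then maps 0->0, 8->1, 1->2, 9->3, ..., giving
   dfinal.perm == { 2, 3, 0, 1, 6, 7, 4, 5 }, which is a single-insn
   pshufd with dword selector { 1, 0, 3, 2 }.  */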

/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   a single vector cross-lane permutation into vpermq followed
   by any of the single insn permutations.  */

static bool
expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dremap, dfinal;
  unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
  unsigned contents[2];
  bool ok;

  if (!(TARGET_AVX2
	&& (d->vmode == V32QImode || d->vmode == V16HImode)
	&& d->one_operand_p))
    return false;

  contents[0] = 0;
  contents[1] = 0;
  for (i = 0; i < nelt2; ++i)
    {
      contents[0] |= 1u << (d->perm[i] / nelt4);
      contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
    }

  for (i = 0; i < 2; ++i)
    {
      unsigned int cnt = 0;
      for (j = 0; j < 4; ++j)
	if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
	  return false;
    }

  if (d->testing_p)
    return true;

  dremap = *d;
  dremap.vmode = V4DImode;
  dremap.nelt = 4;
  dremap.target = gen_reg_rtx (V4DImode);
  dremap.op0 = gen_lowpart (V4DImode, d->op0);
  dremap.op1 = dremap.op0;
  dremap.one_operand_p = true;
  for (i = 0; i < 2; ++i)
    {
      unsigned int cnt = 0;
      for (j = 0; j < 4; ++j)
	if ((contents[i] & (1u << j)) != 0)
	  dremap.perm[2 * i + cnt++] = j;
      for (; cnt < 2; ++cnt)
	dremap.perm[2 * i + cnt] = 0;
    }

  dfinal = *d;
  dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
  dfinal.op1 = dfinal.op0;
  dfinal.one_operand_p = true;
  for (i = 0, j = 0; i < nelt; ++i)
    {
      if (i == nelt2)
	j = 2;
      dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
      if ((d->perm[i] / nelt4) == dremap.perm[j])
	;
      else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
	dfinal.perm[i] |= nelt4;
      else
	gcc_unreachable ();
    }

  ok = expand_vec_perm_1 (&dremap);
  gcc_assert (ok);

  ok = expand_vec_perm_1 (&dfinal);
  gcc_assert (ok);

  return true;
}

static bool canonicalize_perm (struct expand_vec_perm_d *d);

/* A subroutine of ix86_expand_vec_perm_const_1.  Try to expand
   a vector permutation using two instructions, vperm2f128 resp.
   vperm2i128 followed by any single in-lane permutation.  */

static bool
expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dfirst, dsecond;
  unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
  bool ok;

  if (!TARGET_AVX
      || GET_MODE_SIZE (d->vmode) != 32
      || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
    return false;

  dsecond = *d;
  dsecond.one_operand_p = false;
  dsecond.testing_p = true;

  /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
     immediate.  For perm < 16 the second permutation uses
     d->op0 as first operand, for perm >= 16 it uses d->op1
     as first operand.  The second operand is the result of
     vperm2[fi]128.  */
  for (perm = 0; perm < 32; perm++)
    {
      /* Ignore permutations which do not move anything cross-lane.  */
      if (perm < 16)
	{
	  /* The second shuffle for e.g. V4DFmode has
	     0123 and ABCD operands.
	     Ignore AB23, as 23 is already in the second lane
	     of the first operand.  */
	  if ((perm & 0xc) == (1 << 2)) continue;
	  /* And 01CD, as 01 is in the first lane of the first
	     operand.  */
	  if ((perm & 3) == 0) continue;
	  /* And 4567, as then the vperm2[fi]128 doesn't change
	     anything on the original 4567 second operand.  */
	  if ((perm & 0xf) == ((3 << 2) | 2)) continue;
	}
      else
	{
	  /* The second shuffle for e.g. V4DFmode has
	     4567 and ABCD operands.
	     Ignore AB67, as 67 is already in the second lane
	     of the first operand.  */
	  if ((perm & 0xc) == (3 << 2)) continue;
	  /* And 45CD, as 45 is in the first lane of the first
	     operand.  */
	  if ((perm & 3) == 2) continue;
	  /* And 0123, as then the vperm2[fi]128 doesn't change
	     anything on the original 0123 first operand.  */
	  if ((perm & 0xf) == (1 << 2)) continue;
	}

      for (i = 0; i < nelt; i++)
	{
	  j = d->perm[i] / nelt2;
	  if (j == ((perm >> (2 * (i >= nelt2))) & 3))
	    dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
	  else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
	    dsecond.perm[i] = d->perm[i] & (nelt - 1);
	  else
	    break;
	}

      if (i == nelt)
	{
	  start_sequence ();
	  ok = expand_vec_perm_1 (&dsecond);
	  end_sequence ();
	}
      else
	ok = false;

      if (ok)
	{
	  if (d->testing_p)
	    return true;

	  /* Found a usable second shuffle.  dfirst will be
	     vperm2f128 on d->op0 and d->op1.  */
	  dsecond.testing_p = false;
	  dfirst = *d;
	  dfirst.target = gen_reg_rtx (d->vmode);
	  for (i = 0; i < nelt; i++)
	    dfirst.perm[i] = (i & (nelt2 - 1))
			     + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;

	  canonicalize_perm (&dfirst);
	  ok = expand_vec_perm_1 (&dfirst);
	  gcc_assert (ok);

	  /* And dsecond is some single insn shuffle, taking
	     d->op0 and result of vperm2f128 (if perm < 16) or
	     d->op1 and result of vperm2f128 (otherwise).  */
	  if (perm >= 16)
	    dsecond.op0 = dsecond.op1;
	  dsecond.op1 = dfirst.target;

	  ok = expand_vec_perm_1 (&dsecond);
	  gcc_assert (ok);

	  return true;
	}

      /* For one operand, the only useful vperm2f128 permutation is 0x01
	 aka lanes swap.  */
      if (d->one_operand_p)
	return false;
    }

  return false;
}
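
/* Note on the immediate encoding (illustration only): the
   vperm2[fi]128 immediate built as ((perm << 2) | perm) & 0x33 holds a
   2-bit lane selector in bits 0-1 (result low lane) and bits 4-5
   (result high lane), where 0/1 select the lanes of the first operand
   and 2/3 the lanes of the second.  E.g. for V4DFmode the selector
   pair lo == 3, hi == 1 encodes as 0x13 and yields
   { op1[2], op1[3], op0[2], op0[3] }, i.e. elements { 6, 7, 2, 3 }.  */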

/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   a two vector permutation using 2 intra-lane interleave insns
   and cross-lane shuffle for 32-byte vectors.  */

static bool
expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
{
  unsigned i, nelt;
  rtx (*gen) (rtx, rtx, rtx);

  if (d->one_operand_p)
    return false;
  if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
    ;
  else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
    ;
  else
    return false;

  nelt = d->nelt;
  if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
    return false;
  for (i = 0; i < nelt; i += 2)
    if (d->perm[i] != d->perm[0] + i / 2
	|| d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
      return false;

  if (d->testing_p)
    return true;

  switch (d->vmode)
    {
    case E_V32QImode:
      if (d->perm[0])
	gen = gen_vec_interleave_highv32qi;
      else
	gen = gen_vec_interleave_lowv32qi;
      break;
    case E_V16HImode:
      if (d->perm[0])
	gen = gen_vec_interleave_highv16hi;
      else
	gen = gen_vec_interleave_lowv16hi;
      break;
    case E_V8SImode:
      if (d->perm[0])
	gen = gen_vec_interleave_highv8si;
      else
	gen = gen_vec_interleave_lowv8si;
      break;
    case E_V4DImode:
      if (d->perm[0])
	gen = gen_vec_interleave_highv4di;
      else
	gen = gen_vec_interleave_lowv4di;
      break;
    case E_V8SFmode:
      if (d->perm[0])
	gen = gen_vec_interleave_highv8sf;
      else
	gen = gen_vec_interleave_lowv8sf;
      break;
    case E_V4DFmode:
      if (d->perm[0])
	gen = gen_vec_interleave_highv4df;
      else
	gen = gen_vec_interleave_lowv4df;
      break;
    default:
      gcc_unreachable ();
    }

  emit_insn (gen (d->target, d->op0, d->op1));
  return true;
}

/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
   a single vector permutation using a single intra-lane vector
   permutation, vperm2f128 swapping the lanes and vblend* insn blending
   the non-swapped and swapped vectors together.  */

static bool
expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dfirst, dsecond;
  unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
  rtx_insn *seq;
  bool ok;
  rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;

  if (!TARGET_AVX
      || TARGET_AVX2
      || (d->vmode != V8SFmode && d->vmode != V4DFmode)
      || !d->one_operand_p)
    return false;

  dfirst = *d;
  for (i = 0; i < nelt; i++)
    dfirst.perm[i] = 0xff;
  for (i = 0, msk = 0; i < nelt; i++)
    {
      j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
      if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
	return false;
      dfirst.perm[j] = d->perm[i];
      if (j != i)
	msk |= (1 << i);
    }
  for (i = 0; i < nelt; i++)
    if (dfirst.perm[i] == 0xff)
      dfirst.perm[i] = i;

  if (!d->testing_p)
    dfirst.target = gen_reg_rtx (dfirst.vmode);

  start_sequence ();
  ok = expand_vec_perm_1 (&dfirst);
  seq = get_insns ();
  end_sequence ();

  if (!ok)
    return false;

  if (d->testing_p)
    return true;

  emit_insn (seq);

  dsecond = *d;
  dsecond.op0 = dfirst.target;
  dsecond.op1 = dfirst.target;
  dsecond.one_operand_p = true;
  dsecond.target = gen_reg_rtx (dsecond.vmode);
  for (i = 0; i < nelt; i++)
    dsecond.perm[i] = i ^ nelt2;

  ok = expand_vec_perm_1 (&dsecond);
  gcc_assert (ok);

  blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
  emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
  return true;
}

/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
   a two vector permutation using two single vector permutations and
   {,v}{,p}unpckl{ps,pd,bw,wd,dq}.  If two_insn, succeed only if one
   of dfirst or dsecond is identity permutation.  */

static bool
expand_vec_perm_2perm_interleave (struct expand_vec_perm_d *d, bool two_insn)
{
  unsigned i, nelt = d->nelt, nelt2 = nelt / 2, lane = nelt;
  struct expand_vec_perm_d dfirst, dsecond, dfinal;
  bool ident1 = true, ident2 = true;

  if (d->one_operand_p)
    return false;

  if (GET_MODE_SIZE (d->vmode) == 16)
    {
      if (!TARGET_SSE)
	return false;
      if (d->vmode != V4SFmode && d->vmode != V2DFmode && !TARGET_SSE2)
	return false;
    }
  else if (GET_MODE_SIZE (d->vmode) == 32)
    {
      if (!TARGET_AVX)
	return false;
      if (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2)
	return false;
      lane = nelt2;
    }
  else
    return false;

  for (i = 1; i < nelt; i++)
    if ((d->perm[i] >= nelt) != ((d->perm[0] >= nelt) ^ (i & 1)))
      return false;

  dfirst = *d;
  dsecond = *d;
  dfinal = *d;
  dfirst.op1 = dfirst.op0;
  dfirst.one_operand_p = true;
  dsecond.op0 = dsecond.op1;
  dsecond.one_operand_p = true;

  for (i = 0; i < nelt; i++)
    if (d->perm[i] >= nelt)
      {
	dsecond.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i] - nelt;
	if (d->perm[i] - nelt != i / 2 + (i >= lane ? lane / 2 : 0))
	  ident2 = false;
	dsecond.perm[i / 2 + (i >= lane ? lane : lane / 2)]
	  = d->perm[i] - nelt;
      }
    else
      {
	dfirst.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i];
	if (d->perm[i] != i / 2 + (i >= lane ? lane / 2 : 0))
	  ident1 = false;
	dfirst.perm[i / 2 + (i >= lane ? lane : lane / 2)] = d->perm[i];
      }

  if (two_insn && !ident1 && !ident2)
    return false;

  if (!d->testing_p)
    {
      if (!ident1)
	dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
      if (!ident2)
	dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
      if (d->perm[0] >= nelt)
	std::swap (dfinal.op0, dfinal.op1);
    }

  bool ok;
  rtx_insn *seq1 = NULL, *seq2 = NULL;

  if (!ident1)
    {
      start_sequence ();
      ok = expand_vec_perm_1 (&dfirst);
      seq1 = get_insns ();
      end_sequence ();

      if (!ok)
	return false;
    }

  if (!ident2)
    {
      start_sequence ();
      ok = expand_vec_perm_1 (&dsecond);
      seq2 = get_insns ();
      end_sequence ();

      if (!ok)
	return false;
    }

  if (d->testing_p)
    return true;

  for (i = 0; i < nelt; i++)
    {
      dfinal.perm[i] = i / 2;
      if (i >= lane)
	dfinal.perm[i] += lane / 2;
      if ((i & 1) != 0)
	dfinal.perm[i] += nelt;
    }
  emit_insn (seq1);
  emit_insn (seq2);
  ok = expand_vselect_vconcat (dfinal.target, dfinal.op0, dfinal.op1,
			       dfinal.perm, dfinal.nelt, false);
  gcc_assert (ok);
  return true;
}
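
/* Worked example (illustrative; not part of the original sources):
   for V4SFmode and d->perm == { 0, 6, 1, 7 }, even positions come from
   op0 and odd positions from op1; dfirst is the identity, and dsecond
   moves op1 elements { 2, 3 } into its low half.  The final unpcklps
   then produces { op0[0], s[0], op0[1], s[1] } == { 0, 6, 1, 7 }.  */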

/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   the permutation using two single vector permutations and the SSE4_1 pblendv
   instruction.  If two_insn, succeed only if one of dfirst or dsecond is
   identity permutation.  */

static bool
expand_vec_perm_2perm_pblendv (struct expand_vec_perm_d *d, bool two_insn)
{
  unsigned i, nelt = d->nelt;
  struct expand_vec_perm_d dfirst, dsecond, dfinal;
  machine_mode vmode = d->vmode;
  bool ident1 = true, ident2 = true;

  /* Use the same checks as in expand_vec_perm_blend.  */
  if (d->one_operand_p)
    return false;
  if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
    ;
  else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
    ;
  else if (TARGET_SSE4_1
	   && (GET_MODE_SIZE (vmode) == 16
	       || (TARGET_MMX_WITH_SSE && GET_MODE_SIZE (vmode) == 8)
	       || GET_MODE_SIZE (vmode) == 4))
    ;
  else
    return false;

  dfirst = *d;
  dsecond = *d;
  dfinal = *d;
  dfirst.op1 = dfirst.op0;
  dfirst.one_operand_p = true;
  dsecond.op0 = dsecond.op1;
  dsecond.one_operand_p = true;

  for (i = 0; i < nelt; ++i)
    if (d->perm[i] >= nelt)
      {
	dfirst.perm[i] = 0xff;
	dsecond.perm[i] = d->perm[i] - nelt;
	if (d->perm[i] != i + nelt)
	  ident2 = false;
      }
    else
      {
	dsecond.perm[i] = 0xff;
	dfirst.perm[i] = d->perm[i];
	if (d->perm[i] != i)
	  ident1 = false;
      }

  if (two_insn && !ident1 && !ident2)
    return false;

  /* For now.  Ideally treat 0xff as a wildcard.  */
  for (i = 0; i < nelt; ++i)
    if (dfirst.perm[i] == 0xff)
      {
	if (GET_MODE_SIZE (vmode) == 32
	    && dfirst.perm[i ^ (nelt / 2)] != 0xff)
	  dfirst.perm[i] = dfirst.perm[i ^ (nelt / 2)] ^ (nelt / 2);
	else
	  dfirst.perm[i] = i;
      }
    else
      {
	if (GET_MODE_SIZE (vmode) == 32
	    && dsecond.perm[i ^ (nelt / 2)] != 0xff)
	  dsecond.perm[i] = dsecond.perm[i ^ (nelt / 2)] ^ (nelt / 2);
	else
	  dsecond.perm[i] = i;
      }

  if (!d->testing_p)
    {
      if (!ident1)
	dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
      if (!ident2)
	dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
    }

  bool ok;
  rtx_insn *seq1 = NULL, *seq2 = NULL;

  if (!ident1)
    {
      start_sequence ();
      ok = expand_vec_perm_1 (&dfirst);
      seq1 = get_insns ();
      end_sequence ();

      if (!ok)
	return false;
    }

  if (!ident2)
    {
      start_sequence ();
      ok = expand_vec_perm_1 (&dsecond);
      seq2 = get_insns ();
      end_sequence ();

      if (!ok)
	return false;
    }

  if (d->testing_p)
    return true;

  for (i = 0; i < nelt; ++i)
    dfinal.perm[i] = (d->perm[i] >= nelt ? i + nelt : i);

  emit_insn (seq1);
  emit_insn (seq2);
  ok = expand_vec_perm_blend (&dfinal);
  gcc_assert (ok);
  return true;
}

/* A subroutine of ix86_expand_vec_perm_const_1.  Implement a V4DF
   permutation using two vperm2f128, followed by a vshufpd insn blending
   the two vectors together.  */

static bool
expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dfirst, dsecond, dthird;
  bool ok;

  if (!TARGET_AVX || (d->vmode != V4DFmode))
    return false;

  if (d->testing_p)
    return true;

  dfirst = *d;
  dsecond = *d;
  dthird = *d;

  dfirst.perm[0] = (d->perm[0] & ~1);
  dfirst.perm[1] = (d->perm[0] & ~1) + 1;
  dfirst.perm[2] = (d->perm[2] & ~1);
  dfirst.perm[3] = (d->perm[2] & ~1) + 1;
  dsecond.perm[0] = (d->perm[1] & ~1);
  dsecond.perm[1] = (d->perm[1] & ~1) + 1;
  dsecond.perm[2] = (d->perm[3] & ~1);
  dsecond.perm[3] = (d->perm[3] & ~1) + 1;
  dthird.perm[0] = (d->perm[0] % 2);
  dthird.perm[1] = (d->perm[1] % 2) + 4;
  dthird.perm[2] = (d->perm[2] % 2) + 2;
  dthird.perm[3] = (d->perm[3] % 2) + 6;

  dfirst.target = gen_reg_rtx (dfirst.vmode);
  dsecond.target = gen_reg_rtx (dsecond.vmode);
  dthird.op0 = dfirst.target;
  dthird.op1 = dsecond.target;
  dthird.one_operand_p = false;

  canonicalize_perm (&dfirst);
  canonicalize_perm (&dsecond);

  ok = expand_vec_perm_1 (&dfirst)
       && expand_vec_perm_1 (&dsecond)
       && expand_vec_perm_1 (&dthird);

  gcc_assert (ok);

  return true;
}

static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *);

/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
   a two vector permutation using two intra-lane vector
   permutations, vperm2f128 swapping the lanes and vblend* insn blending
   the non-swapped and swapped vectors together.  */

static bool
expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dfirst, dsecond, dthird;
  unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2, which1 = 0, which2 = 0;
  rtx_insn *seq1, *seq2;
  bool ok;
  rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;

  if (!TARGET_AVX
      || TARGET_AVX2
      || (d->vmode != V8SFmode && d->vmode != V4DFmode)
      || d->one_operand_p)
    return false;

  dfirst = *d;
  dsecond = *d;
  for (i = 0; i < nelt; i++)
    {
      dfirst.perm[i] = 0xff;
      dsecond.perm[i] = 0xff;
    }
  for (i = 0, msk = 0; i < nelt; i++)
    {
      j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
      if (j == i)
	{
	  dfirst.perm[j] = d->perm[i];
	  which1 |= (d->perm[i] < nelt ? 1 : 2);
	}
      else
	{
	  dsecond.perm[j] = d->perm[i];
	  which2 |= (d->perm[i] < nelt ? 1 : 2);
	  msk |= (1U << i);
	}
    }
  if (msk == 0 || msk == (1U << nelt) - 1)
    return false;

  if (!d->testing_p)
    {
      dfirst.target = gen_reg_rtx (dfirst.vmode);
      dsecond.target = gen_reg_rtx (dsecond.vmode);
    }

  for (i = 0; i < nelt; i++)
    {
      if (dfirst.perm[i] == 0xff)
	dfirst.perm[i] = (which1 == 2 ? i + nelt : i);
      if (dsecond.perm[i] == 0xff)
	dsecond.perm[i] = (which2 == 2 ? i + nelt : i);
    }
  canonicalize_perm (&dfirst);
  start_sequence ();
  ok = ix86_expand_vec_perm_const_1 (&dfirst);
  seq1 = get_insns ();
  end_sequence ();

  if (!ok)
    return false;

  canonicalize_perm (&dsecond);
  start_sequence ();
  ok = ix86_expand_vec_perm_const_1 (&dsecond);
  seq2 = get_insns ();
  end_sequence ();

  if (!ok)
    return false;

  if (d->testing_p)
    return true;

  emit_insn (seq1);
  emit_insn (seq2);

  dthird = *d;
  dthird.op0 = dsecond.target;
  dthird.op1 = dsecond.target;
  dthird.one_operand_p = true;
  dthird.target = gen_reg_rtx (dthird.vmode);
  for (i = 0; i < nelt; i++)
    dthird.perm[i] = i ^ nelt2;

  ok = expand_vec_perm_1 (&dthird);
  gcc_assert (ok);

  blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
  emit_insn (blend (d->target, dfirst.target, dthird.target, GEN_INT (msk)));
  return true;
}

/* A subroutine of expand_vec_perm_even_odd_1.  Implement the double-word
   permutation with two pshufb insns and an ior.  We should have already
   failed all two instruction sequences.  */

static bool
expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
{
  rtx rperm[2][16], vperm, l, h, op, m128;
  unsigned int i, nelt, eltsz;
  machine_mode mode;
  rtx (*gen) (rtx, rtx, rtx);

  if (!TARGET_SSSE3 || (GET_MODE_SIZE (d->vmode) != 16
			&& GET_MODE_SIZE (d->vmode) != 8
			&& GET_MODE_SIZE (d->vmode) != 4))
    return false;
  gcc_assert (!d->one_operand_p);

  if (d->testing_p)
    return true;

  switch (GET_MODE_SIZE (d->vmode))
    {
    case 4:
      mode = V4QImode;
      gen = gen_mmx_pshufbv4qi3;
      break;
    case 8:
      mode = V8QImode;
      gen = gen_mmx_pshufbv8qi3;
      break;
    case 16:
      mode = V16QImode;
      gen = gen_ssse3_pshufbv16qi3;
      break;
    default:
      gcc_unreachable ();
    }

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate two permutation masks.  If the required element is within
     the given vector it is shuffled into the proper lane.  If the required
     element is in the other vector, force a zero into the lane by setting
     bit 7 in the permutation mask.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < nelt; ++i)
    {
      unsigned j, k, e = d->perm[i];
      unsigned which = (e >= nelt);
      if (e >= nelt)
	e -= nelt;

      for (j = 0; j < eltsz; ++j)
	{
	  rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
	  rperm[1-which][i*eltsz + j] = m128;
	}

      for (k = i*eltsz + j; k < 16; ++k)
	rperm[0][k] = rperm[1][k] = m128;
    }

  vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
  vperm = force_reg (V16QImode, vperm);

  l = gen_reg_rtx (mode);
  op = gen_lowpart (mode, d->op0);
  emit_insn (gen (l, op, vperm));

  vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
  vperm = force_reg (V16QImode, vperm);

  h = gen_reg_rtx (mode);
  op = gen_lowpart (mode, d->op1);
  emit_insn (gen (h, op, vperm));

  op = d->target;
  if (d->vmode != mode)
    op = gen_reg_rtx (mode);
  ix86_emit_vec_binop (IOR, mode, op, l, h);
  if (op != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}
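
/* Worked example (illustrative; not part of the original sources):
   for V8HImode and d->perm == { 0, 8, 2, 10, 4, 12, 6, 14 }
   (eltsz == 2), the op0 mask gets byte indexes
   { 0, 1, -128, -128, 4, 5, -128, -128, ... } and the op1 mask the
   complementary pattern, so each pshufb contributes alternate words
   with zeros elsewhere and the ior merges them.  */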

/* Implement arbitrary permutation of one V32QImode and V16QImode operand
   with two vpshufb insns, vpermq and vpor.  We should have already failed
   all two or three instruction sequences.  */

static bool
expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
{
  rtx rperm[2][32], vperm, l, h, hp, op, m128;
  unsigned int i, nelt, eltsz;

  if (!TARGET_AVX2
      || !d->one_operand_p
      || (d->vmode != V32QImode && d->vmode != V16HImode))
    return false;

  if (d->testing_p)
    return true;

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate two permutation masks.  If the required element is within
     the same lane, it is shuffled in.  If the required element is from the
     other lane, force a zero by setting bit 7 in the permutation mask.
     In the other mask the mask has non-negative elements if the element
     is requested from the other lane, but also moved to the other lane,
     so that the result of vpshufb can have the two V2TImode halves
     swapped.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < nelt; ++i)
    {
      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
      unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;

      for (j = 0; j < eltsz; ++j)
	{
	  rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
	  rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
	}
    }

  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
  vperm = force_reg (V32QImode, vperm);

  h = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op0);
  emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));

  /* Swap the 128-bit lanes of h into hp.  */
  hp = gen_reg_rtx (V4DImode);
  op = gen_lowpart (V4DImode, h);
  emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
				  const1_rtx));

  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
  vperm = force_reg (V32QImode, vperm);

  l = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op0);
  emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));

  op = d->target;
  if (d->vmode != V32QImode)
    op = gen_reg_rtx (V32QImode);
  emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
  if (op != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}

/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   and extract-odd permutations of two V32QImode and V16QImode operand
   with two vpshufb insns, vpor and vpermq.  We should have already
   failed all two or three instruction sequences.  */

static bool
expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
{
  rtx rperm[2][32], vperm, l, h, ior, op, m128;
  unsigned int i, nelt, eltsz;

  if (!TARGET_AVX2
      || d->one_operand_p
      || (d->vmode != V32QImode && d->vmode != V16HImode))
    return false;

  for (i = 0; i < d->nelt; ++i)
    if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
      return false;

  if (d->testing_p)
    return true;

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate two permutation masks.  In the first permutation mask
     the first quarter will contain indexes for the first half
     of the op0, the second quarter will contain bit 7 set, third quarter
     will contain indexes for the second half of the op0 and the
     last quarter bit 7 set.  In the second permutation mask
     the first quarter will contain bit 7 set, the second quarter
     indexes for the first half of the op1, the third quarter bit 7 set
     and last quarter indexes for the second half of the op1.
     I.e. the first mask e.g. for V32QImode extract even will be:
     0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
     (all values masked with 0xf except for -128) and second mask
     for extract even will be
     -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < nelt; ++i)
    {
      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
      unsigned which = d->perm[i] >= nelt;
      unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;

      for (j = 0; j < eltsz; ++j)
	{
	  rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
	  rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
	}
    }

  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
  vperm = force_reg (V32QImode, vperm);

  l = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op0);
  emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));

  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
  vperm = force_reg (V32QImode, vperm);

  h = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op1);
  emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));

  ior = gen_reg_rtx (V32QImode);
  emit_insn (gen_iorv32qi3 (ior, l, h));

  /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation.  */
  op = gen_reg_rtx (V4DImode);
  ior = gen_lowpart (V4DImode, ior);
  emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
				  const1_rtx, GEN_INT (3)));
  emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}

/* Implement permutation with pslldq + psrldq + por when pshufb is not
   available.  */

static bool
expand_vec_perm_pslldq_psrldq_por (struct expand_vec_perm_d *d, bool pandn)
{
  unsigned i, nelt = d->nelt;
  unsigned start1, end1 = -1;
  machine_mode vmode = d->vmode, imode;
  int start2 = -1;
  bool clear_op0, clear_op1;
  unsigned inner_size;
  rtx op0, op1, dop1;
  rtx (*gen_vec_shr) (rtx, rtx, rtx);
  rtx (*gen_vec_shl) (rtx, rtx, rtx);

  /* pshufd can be used for V4SI/V2DI under TARGET_SSE2.  */
  if (!TARGET_SSE2 || (vmode != E_V16QImode && vmode != E_V8HImode))
    return false;

  start1 = d->perm[0];
  for (i = 1; i < nelt; i++)
    {
      if (d->perm[i] != d->perm[i-1] + 1
	  || d->perm[i] == nelt)
	{
	  if (start2 == -1)
	    {
	      start2 = d->perm[i];
	      end1 = d->perm[i-1];
	    }
	  else
	    return false;
	}
    }

  clear_op0 = end1 != nelt - 1;
  clear_op1 = start2 % nelt != 0;
  /* pandn/pand is needed to clear upper/lower bits of op0/op1.  */
  if (!pandn && (clear_op0 || clear_op1))
    return false;

  if (d->testing_p)
    return true;

  gen_vec_shr = vmode == E_V16QImode ? gen_vec_shr_v16qi : gen_vec_shr_v8hi;
  gen_vec_shl = vmode == E_V16QImode ? gen_vec_shl_v16qi : gen_vec_shl_v8hi;
  imode = GET_MODE_INNER (vmode);
  inner_size = GET_MODE_BITSIZE (imode);
  op0 = gen_reg_rtx (vmode);
  op1 = gen_reg_rtx (vmode);

  if (start1)
    emit_insn (gen_vec_shr (op0, d->op0, GEN_INT (start1 * inner_size)));
  else
    emit_move_insn (op0, d->op0);

  dop1 = d->op1;
  if (d->one_operand_p)
    dop1 = d->op0;

  int shl_offset = end1 - start1 + 1 - start2 % nelt;
  if (shl_offset)
    emit_insn (gen_vec_shl (op1, dop1, GEN_INT (shl_offset * inner_size)));
  else
    emit_move_insn (op1, dop1);

  /* Clear lower/upper bits for op0/op1.  */
  if (clear_op0 || clear_op1)
    {
      rtx vec[16];
      rtx const_vec, clear;
      for (i = 0; i != nelt; i++)
	{
	  if (i < (end1 - start1 + 1))
	    vec[i] = gen_int_mode ((HOST_WIDE_INT_1U << inner_size) - 1,
				   imode);
	  else
	    vec[i] = CONST0_RTX (imode);
	}
      const_vec = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, vec));
      const_vec = validize_mem (force_const_mem (vmode, const_vec));
      clear = force_reg (vmode, const_vec);

      if (clear_op0)
	emit_move_insn (op0, gen_rtx_AND (vmode, op0, clear));
      if (clear_op1)
	emit_move_insn (op1, gen_rtx_AND (vmode,
					  gen_rtx_NOT (vmode, clear),
					  op1));
    }

  emit_move_insn (d->target, gen_rtx_IOR (vmode, op0, op1));
  return true;
}
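
/* Worked example (illustrative; not part of the original sources):
   the one-operand byte rotate d->perm == { 5, 6, ..., 15, 0, 1, 2, 3, 4 }
   on V16QImode has start1 == 5, end1 == 15 and start2 == 0, so neither
   operand needs clearing: psrldq by 5 bytes, pslldq by 11 bytes and
   por reassemble the rotated vector.  */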

/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   and extract-odd permutations of two V8QI, V8HI, V16QI, V16HI or V32QI
   operands with two "and" and "pack" or two "shift" and "pack" insns.
   We should have already failed all two instruction sequences.  */

static bool
expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
{
  rtx op, dop0, dop1, t;
  unsigned i, odd, c, s, nelt = d->nelt;
  bool end_perm = false;
  machine_mode half_mode;
  rtx (*gen_and) (rtx, rtx, rtx);
  rtx (*gen_pack) (rtx, rtx, rtx);
  rtx (*gen_shift) (rtx, rtx, rtx);

  if (d->one_operand_p)
    return false;

  switch (d->vmode)
    {
    case E_V4HImode:
      /* Required for "pack".  */
      if (!TARGET_SSE4_1)
	return false;
      c = 0xffff;
      s = 16;
      half_mode = V2SImode;
      gen_and = gen_andv2si3;
      gen_pack = gen_mmx_packusdw;
      gen_shift = gen_lshrv2si3;
      break;
    case E_V8HImode:
      /* Required for "pack".  */
      if (!TARGET_SSE4_1)
	return false;
      c = 0xffff;
      s = 16;
      half_mode = V4SImode;
      gen_and = gen_andv4si3;
      gen_pack = gen_sse4_1_packusdw;
      gen_shift = gen_lshrv4si3;
      break;
    case E_V8QImode:
      /* No check as all instructions are SSE2.  */
      c = 0xff;
      s = 8;
      half_mode = V4HImode;
      gen_and = gen_andv4hi3;
      gen_pack = gen_mmx_packuswb;
      gen_shift = gen_lshrv4hi3;
      break;
    case E_V16QImode:
      /* No check as all instructions are SSE2.  */
      c = 0xff;
      s = 8;
      half_mode = V8HImode;
      gen_and = gen_andv8hi3;
      gen_pack = gen_sse2_packuswb;
      gen_shift = gen_lshrv8hi3;
      break;
    case E_V16HImode:
      if (!TARGET_AVX2)
	return false;
      c = 0xffff;
      s = 16;
      half_mode = V8SImode;
      gen_and = gen_andv8si3;
      gen_pack = gen_avx2_packusdw;
      gen_shift = gen_lshrv8si3;
      end_perm = true;
      break;
    case E_V32QImode:
      if (!TARGET_AVX2)
	return false;
      c = 0xff;
      s = 8;
      half_mode = V16HImode;
      gen_and = gen_andv16hi3;
      gen_pack = gen_avx2_packuswb;
      gen_shift = gen_lshrv16hi3;
      end_perm = true;
      break;
    default:
      /* Only V4HI, V8QI, V8HI, V16QI, V16HI and V32QI modes
	 are more profitable than general shuffles.  */
      return false;
    }

  /* Check that permutation is even or odd.  */
  odd = d->perm[0];
  if (odd > 1)
    return false;

  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != 2 * i + odd)
      return false;

  if (d->testing_p)
    return true;

  dop0 = gen_reg_rtx (half_mode);
  dop1 = gen_reg_rtx (half_mode);
  if (odd == 0)
    {
      t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
      t = force_reg (half_mode, t);
      emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
      emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
    }
  else
    {
      emit_insn (gen_shift (dop0,
			    gen_lowpart (half_mode, d->op0),
			    GEN_INT (s)));
      emit_insn (gen_shift (dop1,
			    gen_lowpart (half_mode, d->op1),
			    GEN_INT (s)));
    }
  /* In AVX2 for 256 bit case we need to permute pack result.  */
  if (TARGET_AVX2 && end_perm)
    {
      op = gen_reg_rtx (d->vmode);
      t = gen_reg_rtx (V4DImode);
      emit_insn (gen_pack (op, dop0, dop1));
      emit_insn (gen_avx2_permv4di_1 (t,
				      gen_lowpart (V4DImode, op),
				      const0_rtx, const2_rtx,
				      const1_rtx, GEN_INT (3)));
      emit_move_insn (d->target, gen_lowpart (d->vmode, t));
    }
  else
    emit_insn (gen_pack (d->target, dop0, dop1));

  return true;
}
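
/* Worked example (illustrative; not part of the original sources):
   extract-even on V8HImode (d->perm == { 0, 2, 4, 6, 8, 10, 12, 14 })
   masks each V4SImode element of both operands with 0xffff and packs
   with packusdw; extract-odd instead shifts each element right by 16
   before the pack.  */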

/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   and extract-odd permutations of two V64QI operands
   with two "shifts", two "truncs" and one "concat" insns for "odd"
   and two "truncs" and one concat insn for "even".
   We have already failed all two instruction sequences.  */

static bool
expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
{
  rtx t1, t2, t3, t4;
  unsigned i, odd, nelt = d->nelt;

  if (!TARGET_AVX512BW
      || d->one_operand_p
      || d->vmode != V64QImode)
    return false;

  /* Check that permutation is even or odd.  */
  odd = d->perm[0];
  if (odd > 1)
    return false;

  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != 2 * i + odd)
      return false;

  if (d->testing_p)
    return true;

  if (odd)
    {
      t1 = gen_reg_rtx (V32HImode);
      t2 = gen_reg_rtx (V32HImode);
      emit_insn (gen_lshrv32hi3 (t1,
				 gen_lowpart (V32HImode, d->op0),
				 GEN_INT (8)));
      emit_insn (gen_lshrv32hi3 (t2,
				 gen_lowpart (V32HImode, d->op1),
				 GEN_INT (8)));
    }
  else
    {
      t1 = gen_lowpart (V32HImode, d->op0);
      t2 = gen_lowpart (V32HImode, d->op1);
    }

  t3 = gen_reg_rtx (V32QImode);
  t4 = gen_reg_rtx (V32QImode);
  emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
  emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
  emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));

  return true;
}
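
/* Worked example (illustrative; not part of the original sources):
   extract-odd of two V64QImode operands shifts every 16-bit element
   right by 8 (vpsrlw), truncates each V32HImode temporary to V32QImode
   (vpmovwb) and concatenates the halves; extract-even skips the shifts
   because vpmovwb already keeps the low (even) byte of every word.  */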

/* A subroutine of ix86_expand_vec_perm_const_1.  Implement extract-even
   and extract-odd permutations.  */

static bool
expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
{
  rtx t1, t2, t3, t4, t5;

  switch (d->vmode)
    {
    case E_V4DFmode:
      if (d->testing_p)
	break;
      t1 = gen_reg_rtx (V4DFmode);
      t2 = gen_reg_rtx (V4DFmode);

      /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
      emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
      emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));

      /* Now an unpck[lh]pd will produce the result required.  */
      if (odd)
	t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
      else
	t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
      emit_insn (t3);
      break;

    case E_V8SFmode:
      {
	int mask = odd ? 0xdd : 0x88;

	if (d->testing_p)
	  break;
	t1 = gen_reg_rtx (V8SFmode);
	t2 = gen_reg_rtx (V8SFmode);
	t3 = gen_reg_rtx (V8SFmode);

	/* Shuffle within the 128-bit lanes to produce:
	   { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }.  */
	emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
				      GEN_INT (mask)));

	/* Shuffle the lanes around to produce:
	   { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }.  */
	emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
					    GEN_INT (0x3)));

	/* Shuffle within the 128-bit lanes to produce:
	   { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }.  */
	emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));

	/* Shuffle within the 128-bit lanes to produce:
	   { 8 a c e c e 8 a } | { 9 b d f d f 9 b }.  */
	emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));

	/* Shuffle the lanes around to produce:
	   { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }.  */
	emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
					    GEN_INT (0x20)));
      }
      break;

    case E_V2DFmode:
    case E_V4SFmode:
    case E_V2DImode:
    case E_V2SImode:
    case E_V4SImode:
    case E_V2HImode:
      /* These are always directly implementable by expand_vec_perm_1.  */
      gcc_unreachable ();

    case E_V2SFmode:
      gcc_assert (TARGET_MMX_WITH_SSE);
      /* We have no suitable instructions.  */
      return false;

    case E_V4QImode:
      if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
	return expand_vec_perm_pshufb2 (d);
      else
	{
	  if (d->testing_p)
	    break;
	  /* We need 2*log2(N)-1 operations to achieve odd/even
	     with interleave.  */
	  t1 = gen_reg_rtx (V4QImode);
	  emit_insn (gen_mmx_punpckhbw_low (t1, d->op0, d->op1));
	  emit_insn (gen_mmx_punpcklbw_low (d->target, d->op0, d->op1));
	  if (odd)
	    t2 = gen_mmx_punpckhbw_low (d->target, d->target, t1);
	  else
	    t2 = gen_mmx_punpcklbw_low (d->target, d->target, t1);
	  emit_insn (t2);
	}
      break;

    case E_V4HImode:
      if (TARGET_SSE4_1)
	return expand_vec_perm_even_odd_pack (d);
      else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
	return expand_vec_perm_pshufb2 (d);
      else
	{
	  if (d->testing_p)
	    break;
	  /* We need 2*log2(N)-1 operations to achieve odd/even
	     with interleave.  */
	  t1 = gen_reg_rtx (V4HImode);
	  emit_insn (gen_mmx_punpckhwd (t1, d->op0, d->op1));
	  emit_insn (gen_mmx_punpcklwd (d->target, d->op0, d->op1));
	  if (odd)
	    t2 = gen_mmx_punpckhwd (d->target, d->target, t1);
	  else
	    t2 = gen_mmx_punpcklwd (d->target, d->target, t1);
	  emit_insn (t2);
	}
      break;

    case E_V8HImode:
      if (TARGET_SSE4_1)
	return expand_vec_perm_even_odd_pack (d);
      else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
	return expand_vec_perm_pshufb2 (d);
      else
	{
	  if (d->testing_p)
	    break;
	  /* We need 2*log2(N)-1 operations to achieve odd/even
	     with interleave.  */
	  t1 = gen_reg_rtx (V8HImode);
	  t2 = gen_reg_rtx (V8HImode);
	  emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
	  emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
	  emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
	  emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
	  if (odd)
	    t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
	  else
	    t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
	  emit_insn (t3);
	}
      break;

    case E_V8QImode:
    case E_V16QImode:
      return expand_vec_perm_even_odd_pack (d);

    case E_V16HImode:
    case E_V32QImode:
      return expand_vec_perm_even_odd_pack (d);

    case E_V64QImode:
      return expand_vec_perm_even_odd_trunc (d);

    case E_V4DImode:
      if (!TARGET_AVX2)
	{
	  struct expand_vec_perm_d d_copy = *d;
	  d_copy.vmode = V4DFmode;
	  if (d->testing_p)
	    d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
	  else
	    d_copy.target = gen_reg_rtx (V4DFmode);
	  d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
	  d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
	  if (expand_vec_perm_even_odd_1 (&d_copy, odd))
	    {
	      if (!d->testing_p)
		emit_move_insn (d->target,
				gen_lowpart (V4DImode, d_copy.target));
	      return true;
	    }
	  return false;
	}

      if (d->testing_p)
	break;

      t1 = gen_reg_rtx (V4DImode);
      t2 = gen_reg_rtx (V4DImode);

      /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
      emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
      emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));

      /* Now a vpunpck[lh]qdq will produce the result required.  */
      if (odd)
	t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
      else
	t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
      emit_insn (t3);
      break;

    case E_V8SImode:
      if (!TARGET_AVX2)
	{
	  struct expand_vec_perm_d d_copy = *d;
	  d_copy.vmode = V8SFmode;
	  if (d->testing_p)
	    d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
	  else
	    d_copy.target = gen_reg_rtx (V8SFmode);
	  d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
	  d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
	  if (expand_vec_perm_even_odd_1 (&d_copy, odd))
	    {
	      if (!d->testing_p)
		emit_move_insn (d->target,
				gen_lowpart (V8SImode, d_copy.target));
	      return true;
	    }
	  return false;
	}

      if (d->testing_p)
	break;

      t1 = gen_reg_rtx (V8SImode);
      t2 = gen_reg_rtx (V8SImode);
      t3 = gen_reg_rtx (V4DImode);
      t4 = gen_reg_rtx (V4DImode);
      t5 = gen_reg_rtx (V4DImode);

      /* Shuffle the lanes around into
	 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }.  */
      emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
				    gen_lowpart (V4DImode, d->op1),
				    GEN_INT (0x20)));
      emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
				    gen_lowpart (V4DImode, d->op1),
				    GEN_INT (0x31)));

      /* Swap the 2nd and 3rd position in each lane into
	 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }.  */
      emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
				    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
      emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
				    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));

      /* Now a vpunpck[lh]qdq will produce
	 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }.  */
      if (odd)
	t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
					   gen_lowpart (V4DImode, t2));
      else
	t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
					  gen_lowpart (V4DImode, t2));
      emit_insn (t3);
      emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
      break;

    default:
      gcc_unreachable ();
    }

  return true;
}

/* A subroutine of ix86_expand_vec_perm_const_1.  Pattern match
   extract-even and extract-odd permutations.  */

static bool
expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
{
  unsigned i, odd, nelt = d->nelt;

  odd = d->perm[0];
  if (odd != 0 && odd != 1)
    return false;

  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != 2 * i + odd)
      return false;

  if (d->vmode == E_V32HImode
      && d->testing_p
      && !TARGET_AVX512BW)
    return false;

  return expand_vec_perm_even_odd_1 (d, odd);
}
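
/* Note (illustration only): the pattern matched above is
   d->perm == { odd, odd + 2, odd + 4, ... }, e.g.
   { 1, 3, 5, 7, 9, 11, 13, 15 } for an odd extraction from two
   V8HImode operands.  */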

/* A subroutine of ix86_expand_vec_perm_const_1.  Implement broadcast
   permutations.  We assume that expand_vec_perm_1 has already failed.  */

static bool
expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
{
  unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
  machine_mode vmode = d->vmode;
  rtx (*gen) (rtx, rtx, rtx);
  unsigned char perm2[4];
  rtx op0 = d->op0, dest;
  bool ok;

  switch (vmode)
    {
    case E_V4DFmode:
    case E_V8SFmode:
      /* These are special-cased in sse.md so that we can optionally
	 use the vbroadcast instruction.  They expand to two insns
	 if the input happens to be in a register.  */
      gcc_unreachable ();

    case E_V2DFmode:
    case E_V2SFmode:
    case E_V4SFmode:
    case E_V2DImode:
    case E_V2SImode:
    case E_V4SImode:
    case E_V2HImode:
    case E_V4HImode:
      /* These are always implementable using standard shuffle patterns.  */
      gcc_unreachable ();

    case E_V4QImode:
      /* This can be implemented via interleave and pshuflw.  */
      if (d->testing_p)
	return true;

      if (elt >= nelt2)
	{
	  gen = gen_mmx_punpckhbw_low;
	  elt -= nelt2;
	}
      else
	gen = gen_mmx_punpcklbw_low;

      dest = gen_reg_rtx (vmode);
      emit_insn (gen (dest, op0, op0));
      vmode = get_mode_wider_vector (vmode);
      op0 = gen_lowpart (vmode, dest);

      memset (perm2, elt, 2);
      dest = gen_reg_rtx (vmode);
      ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
      gcc_assert (ok);

      emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
      return true;

    case E_V8QImode:
      /* This can be implemented via interleave.  We save one insn by
	 stopping once we have promoted to V2SImode and then use pshufd.  */
      if (d->testing_p)
	return true;
      do
	{
	  if (elt >= nelt2)
	    {
	      gen = vmode == V8QImode ? gen_mmx_punpckhbw
				      : gen_mmx_punpckhwd;
	      elt -= nelt2;
	    }
	  else
	    gen = vmode == V8QImode ? gen_mmx_punpcklbw
				    : gen_mmx_punpcklwd;
	  nelt2 /= 2;

	  dest = gen_reg_rtx (vmode);
	  emit_insn (gen (dest, op0, op0));
	  vmode = get_mode_wider_vector (vmode);
	  op0 = gen_lowpart (vmode, dest);
	}
      while (vmode != V2SImode);

      memset (perm2, elt, 2);
      dest = gen_reg_rtx (vmode);
      ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
      gcc_assert (ok);

      emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
      return true;

    case E_V8HImode:
    case E_V16QImode:
      /* These can be implemented via interleave.  We save one insn by
	 stopping once we have promoted to V4SImode and then use pshufd.  */
      if (d->testing_p)
	return true;
      do
	{
	  if (elt >= nelt2)
	    {
	      gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
				       : gen_vec_interleave_highv8hi;
	      elt -= nelt2;
	    }
	  else
	    gen = vmode == V16QImode ? gen_vec_interleave_lowv16qi
				     : gen_vec_interleave_lowv8hi;
	  nelt2 /= 2;

	  dest = gen_reg_rtx (vmode);
	  emit_insn (gen (dest, op0, op0));
	  vmode = get_mode_wider_vector (vmode);
	  op0 = gen_lowpart (vmode, dest);
	}
      while (vmode != V4SImode);

      memset (perm2, elt, 4);
      dest = gen_reg_rtx (vmode);
      ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
      gcc_assert (ok);

      emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
      return true;

    case E_V8HFmode:
    case E_V8BFmode:
      /* This can be implemented via interleave and pshufd.  */
      if (d->testing_p)
	return true;

      rtx (*gen_interleave) (machine_mode, int, rtx, rtx, rtx);
      if (elt >= nelt2)
	{
	  gen_interleave = gen_vec_interleave_high;
	  elt -= nelt2;
	}
      else
	gen_interleave = gen_vec_interleave_low;

      dest = gen_reg_rtx (vmode);
      emit_insn (gen_interleave (vmode, 1, dest, op0, op0));

      vmode = V4SImode;
      op0 = gen_lowpart (vmode, dest);

      memset (perm2, elt, 4);
      dest = gen_reg_rtx (vmode);
      ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
      gcc_assert (ok);

      emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
      return true;

    case E_V32QImode:
    case E_V16HImode:
    case E_V8SImode:
    case E_V4DImode:
      /* For AVX2 broadcasts of the first element vpbroadcast* or
	 vpermq should be used by expand_vec_perm_1.  */
      gcc_assert (!TARGET_AVX2 || d->perm[0]);
      return false;

    case E_V64QImode:
      gcc_assert (!TARGET_AVX512BW || d->perm[0]);
      return false;

    case E_V32HImode:
      gcc_assert (!TARGET_AVX512BW);
      return false;

    default:
      gcc_unreachable ();
    }
}

/* A subroutine of ix86_expand_vec_perm_const_1.  Pattern match
   broadcast permutations.  */

static bool
expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
{
  unsigned i, elt, nelt = d->nelt;

  if (!d->one_operand_p)
    return false;

  elt = d->perm[0];
  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != elt)
      return false;

  return expand_vec_perm_broadcast_1 (d);
}
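
/* Worked example (illustrative; not part of the original sources):
   broadcasting element 3 of a V16QImode vector goes punpcklbw
   (bytes { b0, b0, b1, b1, ... }), then punpcklwd (each word
   duplicated again), then pshufd with selector { 3, 3, 3, 3 }, at
   which point dword 3 already holds four copies of byte 3.  */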

/* Implement arbitrary permutations of two V64QImode operands
   with 2 vperm[it]2w, 2 vpshufb and one vpor instruction.  */

static bool
expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
{
  if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
    return false;

  if (d->testing_p)
    return true;

  struct expand_vec_perm_d ds[2];
  rtx rperm[128], vperm, target0, target1;
  unsigned int i, nelt;
  machine_mode vmode;

  nelt = d->nelt;
  vmode = V64QImode;

  for (i = 0; i < 2; i++)
    {
      ds[i] = *d;
      ds[i].vmode = V32HImode;
      ds[i].nelt = 32;
      ds[i].target = gen_reg_rtx (V32HImode);
      ds[i].op0 = gen_lowpart (V32HImode, d->op0);
      ds[i].op1 = gen_lowpart (V32HImode, d->op1);
    }

  /* Prepare permutations such that the first one takes care of
     putting the even bytes into the right positions or one higher
     positions (ds[0]) and the second one takes care of
     putting the odd bytes into the right positions or one below
     (ds[1]).  */

  for (i = 0; i < nelt; i++)
    {
      ds[i & 1].perm[i / 2] = d->perm[i] / 2;
      if (i & 1)
	{
	  rperm[i] = constm1_rtx;
	  rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
	}
      else
	{
	  rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
	  rperm[i + 64] = constm1_rtx;
	}
    }

  bool ok = expand_vec_perm_1 (&ds[0]);
  gcc_assert (ok);
  ds[0].target = gen_lowpart (V64QImode, ds[0].target);

  ok = expand_vec_perm_1 (&ds[1]);
  gcc_assert (ok);
  ds[1].target = gen_lowpart (V64QImode, ds[1].target);

  vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
  vperm = force_reg (vmode, vperm);
  target0 = gen_reg_rtx (V64QImode);
  emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));

  vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
  vperm = force_reg (vmode, vperm);
  target1 = gen_reg_rtx (V64QImode);
  emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));

  emit_insn (gen_iorv64qi3 (d->target, target0, target1));
  return true;
}

/* Implement arbitrary permutation of two V32QImode and V16QImode operands
   with 4 vpshufb insns, 2 vpermq and 3 vpor.  We should have already failed
   all the shorter instruction sequences.  */

static bool
expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
{
  rtx rperm[4][32], vperm, l[2], h[2], op, m128;
  unsigned int i, nelt, eltsz;
  bool used[4];

  if (!TARGET_AVX2
      || d->one_operand_p
      || (d->vmode != V32QImode && d->vmode != V16HImode))
    return false;

  if (d->testing_p)
    return true;

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate 4 permutation masks.  If the required element is within
     the same lane, it is shuffled in.  If the required element is from the
     other lane, force a zero by setting bit 7 in the permutation mask.
     In the other mask the mask has non-negative elements if the element
     is requested from the other lane, but also moved to the other lane,
     so that the result of vpshufb can have the two V2TImode halves
     swapped.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < 32; ++i)
    {
      rperm[0][i] = m128;
      rperm[1][i] = m128;
      rperm[2][i] = m128;
      rperm[3][i] = m128;
    }
  used[0] = false;
  used[1] = false;
  used[2] = false;
  used[3] = false;
  for (i = 0; i < nelt; ++i)
    {
      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
      unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
      unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);

      for (j = 0; j < eltsz; ++j)
	rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
      used[which] = true;
    }

  for (i = 0; i < 2; ++i)
    {
      if (!used[2 * i + 1])
	{
	  h[i] = NULL_RTX;
	  continue;
	}
      vperm = gen_rtx_CONST_VECTOR (V32QImode,
				    gen_rtvec_v (32, rperm[2 * i + 1]));
      vperm = force_reg (V32QImode, vperm);
      h[i] = gen_reg_rtx (V32QImode);
      op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
      emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
    }

  /* Swap the 128-bit lanes of h[X].  */
  for (i = 0; i < 2; ++i)
    {
      if (h[i] == NULL_RTX)
	continue;
      op = gen_reg_rtx (V4DImode);
      emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
				      const2_rtx, GEN_INT (3), const0_rtx,
				      const1_rtx));
      h[i] = gen_lowpart (V32QImode, op);
    }

  for (i = 0; i < 2; ++i)
    {
      if (!used[2 * i])
	{
	  l[i] = NULL_RTX;
	  continue;
	}
      vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
      vperm = force_reg (V32QImode, vperm);
      l[i] = gen_reg_rtx (V32QImode);
      op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
      emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
    }

  for (i = 0; i < 2; ++i)
    {
      if (l[i] == NULL_RTX)
	{
	  l[i] = h[i];
	  continue;
	}
      if (h[i] == NULL_RTX)
	continue;
      op = gen_reg_rtx (V32QImode);
      emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
      l[i] = op;
    }

  gcc_assert (l[0] && l[1]);
  op = d->target;
  if (d->vmode != V32QImode)
    op = gen_reg_rtx (V32QImode);
  emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
  if (op != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, op));
  return true;
}
/* The guts of ix86_vectorize_vec_perm_const.  With all of the interface bits
   taken care of, perform the expansion in D and return true on success.  */

static bool
ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
{
  /* Try a single instruction expansion.  */
  if (expand_vec_perm_1 (d))
    return true;

  /* Try sequences of two instructions.  */

  if (expand_vec_perm_pshuflw_pshufhw (d))
    return true;

  if (expand_vec_perm_palignr (d, false))
    return true;

  if (expand_vec_perm_interleave2 (d))
    return true;

  if (expand_vec_perm_broadcast (d))
    return true;

  if (expand_vec_perm_vpermq_perm_1 (d))
    return true;

  if (expand_vec_perm_vperm2f128 (d))
    return true;

  if (expand_vec_perm_pblendv (d))
    return true;

  if (expand_vec_perm_2perm_interleave (d, true))
    return true;

  if (expand_vec_perm_2perm_pblendv (d, true))
    return true;

  if (expand_vec_perm_shufps_shufps (d))
    return true;

  /* Try sequences of three instructions.  */

  if (expand_vec_perm_even_odd_pack (d))
    return true;

  if (expand_vec_perm_2vperm2f128_vshuf (d))
    return true;

  if (expand_vec_perm_pshufb2 (d))
    return true;

  if (expand_vec_perm_pslldq_psrldq_por (d, false))
    return true;

  if (expand_vec_perm_interleave3 (d))
    return true;

  if (expand_vec_perm_vperm2f128_vblend (d))
    return true;

  if (expand_vec_perm_2perm_interleave (d, false))
    return true;

  if (expand_vec_perm_2perm_pblendv (d, false))
    return true;

  /* Try sequences of four instructions.  */

  if (expand_vec_perm_even_odd_trunc (d))
    return true;

  if (expand_vec_perm_vpshufb2_vpermq (d))
    return true;

  if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
    return true;

  if (expand_vec_perm_vpermt2_vpshub2 (d))
    return true;

  /* ??? Look for narrow permutations whose element orderings would
     allow the promotion to a wider mode.  */

  /* ??? Look for sequences of interleave or a wider permute that place
     the data into the correct lanes for a half-vector shuffle like
     pshuf[lh]w or vpermilps.  */

  /* ??? Look for sequences of interleave that produce the desired results.
     The combinatorics of punpck[lh] get pretty ugly...  */

  if (expand_vec_perm_even_odd (d))
    return true;

  /* Generate four or five instructions.  */
  if (expand_vec_perm_pslldq_psrldq_por (d, true))
    return true;

  /* Even longer sequences.  */
  if (expand_vec_perm_vpshufb4_vpermq2 (d))
    return true;

  /* See if we can get the same permutation in a different vector integer
     mode.  */
  struct expand_vec_perm_d nd;
  if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
    {
      if (!d->testing_p)
	emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
      return true;
    }

  /* Even longer, including recursion to ix86_expand_vec_perm_const_1.  */
  if (expand_vec_perm2_vperm2f128_vblend (d))
    return true;

  return false;
}
/* If a permutation only uses one operand, make it clear.  Returns true
   if the permutation references both operands.  */

static bool
canonicalize_perm (struct expand_vec_perm_d *d)
{
  int i, which, nelt = d->nelt;

  for (i = which = 0; i < nelt; ++i)
    which |= (d->perm[i] < nelt ? 1 : 2);

  d->one_operand_p = true;
  switch (which)
    {
    default:
      gcc_unreachable ();

    case 3:
      if (!rtx_equal_p (d->op0, d->op1))
	{
	  d->one_operand_p = false;
	  break;
	}
      /* The elements of PERM do not suggest that only the first operand
	 is used, but both operands are identical.  Allow easier matching
	 of the permutation by folding the permutation into the single
	 input vector.  */
      /* FALLTHRU */

    case 2:
      for (i = 0; i < nelt; ++i)
	d->perm[i] &= nelt - 1;
      d->op0 = d->op1;
      break;

    case 1:
      d->op1 = d->op0;
      break;
    }

  return (which == 3);
}
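
/* Editor's note (illustrative comment added during editing, not part of the
   original source): for nelt == 4, a selector {4,5,6,7} references only the
   second operand, so which == 2: the indices are masked down to {0,1,2,3}
   and op0 is replaced by op1, leaving a one-operand permutation, and the
   function returns false.  A selector such as {0,5,2,7} gives which == 3;
   when op0 != op1, one_operand_p stays false and the function returns
   true.  */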
/* Implement TARGET_VECTORIZE_VEC_PERM_CONST.  */

static bool
ix86_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
			       rtx target, rtx op0, rtx op1,
			       const vec_perm_indices &sel)
{
  if (vmode != op_mode)
    return false;

  struct expand_vec_perm_d d;
  unsigned char perm[MAX_VECT_LEN];
  unsigned int i, nelt, which;
  bool two_args;

  /* For HF mode vector, convert it to HI using subreg.  */
  if (GET_MODE_INNER (vmode) == HFmode)
    {
      machine_mode orig_mode = vmode;
      vmode = mode_for_vector (HImode,
			       GET_MODE_NUNITS (vmode)).require ();
      if (target)
	target = lowpart_subreg (vmode, target, orig_mode);
      if (op0)
	op0 = lowpart_subreg (vmode, op0, orig_mode);
      if (op1)
	op1 = lowpart_subreg (vmode, op1, orig_mode);
    }

  d.target = target;
  d.op0 = op0;
  d.op1 = op1;

  d.vmode = vmode;
  gcc_assert (VECTOR_MODE_P (d.vmode));
  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
  d.testing_p = !target;

  gcc_assert (sel.length () == nelt);
  gcc_checking_assert (sizeof (d.perm) == sizeof (perm));

  /* Given sufficient ISA support we can just return true here
     for selected vector modes.  */
  switch (d.vmode)
    {
    case E_V16SFmode:
    case E_V16SImode:
    case E_V8DImode:
    case E_V8DFmode:
      if (!TARGET_AVX512F)
	return false;
      /* All implementable with a single vperm[it]2 insn.  */
      if (d.testing_p)
	return true;
      break;
    case E_V32HImode:
      if (!TARGET_AVX512F)
	return false;
      if (d.testing_p && TARGET_AVX512BW)
	/* All implementable with a single vperm[it]2 insn.  */
	return true;
      break;
    case E_V64QImode:
      if (!TARGET_AVX512F)
	return false;
      if (d.testing_p && TARGET_AVX512BW)
	/* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn.  */
	return true;
      break;
    case E_V8SImode:
    case E_V8SFmode:
    case E_V4DFmode:
    case E_V4DImode:
      if (!TARGET_AVX)
	return false;
      if (d.testing_p && TARGET_AVX512VL)
	/* All implementable with a single vperm[it]2 insn.  */
	return true;
      break;
    case E_V16HImode:
      if (!TARGET_SSE2)
	return false;
      if (d.testing_p && TARGET_AVX2)
	/* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns.  */
	return true;
      break;
    case E_V32QImode:
      if (!TARGET_SSE2)
	return false;
      if (d.testing_p && TARGET_AVX2)
	/* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns.  */
	return true;
      break;
    case E_V8HImode:
    case E_V16QImode:
      if (!TARGET_SSE2)
	return false;
      /* Fall through.  */
    case E_V4SImode:
    case E_V4SFmode:
      if (!TARGET_SSE)
	return false;
      /* All implementable with a single vpperm insn.  */
      if (d.testing_p && TARGET_XOP)
	return true;
      /* All implementable with 2 pshufb + 1 ior.  */
      if (d.testing_p && TARGET_SSSE3)
	return true;
      break;
    case E_V2SFmode:
    case E_V2SImode:
    case E_V4HImode:
    case E_V8QImode:
      if (!TARGET_MMX_WITH_SSE)
	return false;
      break;
    case E_V2HImode:
      if (!TARGET_SSE2)
	return false;
      /* All implementable with *punpckwd.  */
      if (d.testing_p)
	return true;
      break;
    case E_V2DImode:
    case E_V2DFmode:
      if (!TARGET_SSE)
	return false;
      /* All implementable with shufpd or unpck[lh]pd.  */
      if (d.testing_p)
	return true;
      break;
    default:
      return false;
    }

  for (i = which = 0; i < nelt; ++i)
    {
      unsigned char e = sel[i];
      gcc_assert (e < 2 * nelt);
      d.perm[i] = e;
      perm[i] = e;
      which |= (e < nelt ? 1 : 2);
    }

  if (d.testing_p)
    {
      /* For all elements from second vector, fold the elements to first.  */
      if (which == 2)
	for (i = 0; i < nelt; ++i)
	  d.perm[i] -= nelt;

      /* Check whether the mask can be applied to the vector type.  */
      d.one_operand_p = (which != 3);

      /* Implementable with shufps, pshufd or pshuflw.  */
      if (d.one_operand_p
	  && (d.vmode == V4SFmode || d.vmode == V2SFmode
	      || d.vmode == V4SImode || d.vmode == V2SImode
	      || d.vmode == V4HImode || d.vmode == V2HImode))
	return true;

      /* Otherwise we have to go through the motions and see if we can
	 figure out how to generate the requested permutation.  */
      d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
      d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
      if (!d.one_operand_p)
	d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);

      start_sequence ();
      bool ret = ix86_expand_vec_perm_const_1 (&d);
      end_sequence ();

      return ret;
    }

  two_args = canonicalize_perm (&d);

  /* If one of the operands is a zero vector, try to match pmovzx.  */
  if (two_args && (d.op0 == CONST0_RTX (vmode) || d.op1 == CONST0_RTX (vmode)))
    {
      struct expand_vec_perm_d dzero = d;
      if (d.op0 == CONST0_RTX (vmode))
	{
	  d.op1 = dzero.op1 = force_reg (vmode, d.op1);
	  std::swap (dzero.op0, dzero.op1);
	  for (i = 0; i < nelt; ++i)
	    dzero.perm[i] ^= nelt;
	}
      else
	d.op0 = dzero.op0 = force_reg (vmode, d.op0);

      if (expand_vselect_vconcat (dzero.target, dzero.op0, dzero.op1,
				  dzero.perm, nelt, dzero.testing_p))
	return true;
    }

  /* Force operands into registers.  */
  rtx nop0 = force_reg (vmode, d.op0);
  if (d.op0 == d.op1)
    d.op1 = nop0;
  d.op0 = nop0;
  d.op1 = force_reg (vmode, d.op1);

  if (ix86_expand_vec_perm_const_1 (&d))
    return true;

  /* If the selector says both arguments are needed, but the operands are the
     same, the above tried to expand with one_operand_p and flattened selector.
     If that didn't work, retry without one_operand_p; we succeeded with that
     during testing.  */
  if (two_args && d.one_operand_p)
    {
      d.one_operand_p = false;
      memcpy (d.perm, perm, sizeof (perm));
      return ix86_expand_vec_perm_const_1 (&d);
    }

  return false;
}
void
ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
{
  struct expand_vec_perm_d d;
  unsigned i, nelt;

  d.target = targ;
  d.op0 = op0;
  d.op1 = op1;
  d.vmode = GET_MODE (targ);
  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
  d.one_operand_p = false;
  d.testing_p = false;

  for (i = 0; i < nelt; ++i)
    d.perm[i] = i * 2 + odd;

  /* We'll either be able to implement the permutation directly...  */
  if (expand_vec_perm_1 (&d))
    return;

  /* ... or we use the special-case patterns.  */
  expand_vec_perm_even_odd_1 (&d, odd);
}
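
/* Editor's note (illustrative comment added during editing, not part of the
   original source): for a V4SImode target and odd == 1 the selector built
   above is {1,3,5,7}, i.e. the odd elements of the concatenated op0:op1
   pair; with odd == 0 it is {0,2,4,6}.  */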
void
ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
{
  struct expand_vec_perm_d d;
  unsigned i, nelt, base;
  bool ok;

  d.target = targ;
  d.op0 = op0;
  d.op1 = op1;
  d.vmode = GET_MODE (targ);
  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
  d.one_operand_p = false;
  d.testing_p = false;

  base = high_p ? nelt / 2 : 0;
  for (i = 0; i < nelt / 2; ++i)
    {
      d.perm[i * 2] = i + base;
      d.perm[i * 2 + 1] = i + base + nelt;
    }

  /* Note that for AVX this isn't one instruction.  */
  ok = ix86_expand_vec_perm_const_1 (&d);
  gcc_assert (ok);
}
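
/* Editor's note (illustrative comment added during editing, not part of the
   original source): for V4SImode and high_p == false the selector above is
   {0,4,1,5}, matching punpckldq; with high_p == true it is {2,6,3,7},
   matching punpckhdq.  */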
/* Expand a vector operation shift by constant for a V*QImode in terms of the
   same operation on V*HImode.  Return true on success.  */

static bool
ix86_expand_vec_shift_qihi_constant (enum rtx_code code,
				     rtx dest, rtx op1, rtx op2)
{
  machine_mode qimode, himode;
  HOST_WIDE_INT and_constant, xor_constant;
  HOST_WIDE_INT shift_amount;
  rtx vec_const_and, vec_const_xor;
  rtx tmp, op1_subreg;
  rtx (*gen_shift) (rtx, rtx, rtx);
  rtx (*gen_and) (rtx, rtx, rtx);
  rtx (*gen_xor) (rtx, rtx, rtx);
  rtx (*gen_sub) (rtx, rtx, rtx);

  /* Only optimize shift by constant.  */
  if (!CONST_INT_P (op2))
    return false;

  qimode = GET_MODE (dest);
  shift_amount = INTVAL (op2);
  /* Do nothing when the shift amount is greater than or equal to 8.  */
  if (shift_amount > 7)
    return false;

  gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
  /* Record the sign bit.  */
  xor_constant = 1 << (8 - shift_amount - 1);

  /* Zero the upper/lower bits shifted in from the neighboring element.  */
  and_constant
    = (code == ASHIFT ? 256 - (1 << shift_amount)
       : (1 << (8 - shift_amount)) - 1);

  switch (qimode)
    {
    case E_V16QImode:
      himode = V8HImode;
      gen_shift
	= ((code == ASHIFT)
	   ? gen_ashlv8hi3
	   : (code == ASHIFTRT) ? gen_ashrv8hi3 : gen_lshrv8hi3);
      gen_and = gen_andv16qi3;
      gen_xor = gen_xorv16qi3;
      gen_sub = gen_subv16qi3;
      break;
    case E_V32QImode:
      himode = V16HImode;
      gen_shift
	= ((code == ASHIFT)
	   ? gen_ashlv16hi3
	   : (code == ASHIFTRT) ? gen_ashrv16hi3 : gen_lshrv16hi3);
      gen_and = gen_andv32qi3;
      gen_xor = gen_xorv32qi3;
      gen_sub = gen_subv32qi3;
      break;
    case E_V64QImode:
      himode = V32HImode;
      gen_shift
	= ((code == ASHIFT)
	   ? gen_ashlv32hi3
	   : (code == ASHIFTRT) ? gen_ashrv32hi3 : gen_lshrv32hi3);
      gen_and = gen_andv64qi3;
      gen_xor = gen_xorv64qi3;
      gen_sub = gen_subv64qi3;
      break;
    default:
      gcc_unreachable ();
    }

  tmp = gen_reg_rtx (himode);
  vec_const_and = gen_reg_rtx (qimode);
  op1_subreg = lowpart_subreg (himode, op1, qimode);

  /* For ASHIFT and LSHIFTRT, perform the operation like
     vpsllw/vpsrlw $shift_amount, %op1, %dest
     vpand %vec_const_and, %dest.  */
  emit_insn (gen_shift (tmp, op1_subreg, op2));
  emit_move_insn (dest, simplify_gen_subreg (qimode, tmp, himode, 0));
  emit_move_insn (vec_const_and,
		  ix86_build_const_vector (qimode, true,
					   gen_int_mode (and_constant,
							 QImode)));
  emit_insn (gen_and (dest, dest, vec_const_and));

  /* For ASHIFTRT, perform the extra operations
     vpxor %vec_const_xor, %dest, %dest
     vpsubb %vec_const_xor, %dest, %dest.  */
  if (code == ASHIFTRT)
    {
      vec_const_xor = gen_reg_rtx (qimode);
      emit_move_insn (vec_const_xor,
		      ix86_build_const_vector (qimode, true,
					       gen_int_mode (xor_constant,
							     QImode)));
      emit_insn (gen_xor (dest, dest, vec_const_xor));
      emit_insn (gen_sub (dest, dest, vec_const_xor));
    }

  return true;
}
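
/* Editor's note: a worked example of the masking above (added during
   editing, not part of the original source).  For V16QImode LSHIFTRT by 2,
   the vector is viewed as V8HImode and shifted with psrlw $2; bits that
   leak in from each word's high byte are cleared with pand against the
   per-byte mask (1 << (8 - 2)) - 1 = 0x3f.  For ASHIFTRT the sign is then
   restored from bit (8 - 2 - 1): xor_constant = 0x20, and
   (x ^ 0x20) - 0x20 sign-extends the 6-bit result to 8 bits.  */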
void
ix86_expand_vecop_qihi_partial (enum rtx_code code, rtx dest, rtx op1, rtx op2)
{
  machine_mode qimode = GET_MODE (dest);
  rtx qop1, qop2, hop1, hop2, qdest, hdest;
  bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
  bool uns_p = code != ASHIFTRT;

  switch (qimode)
    {
    case E_V4QImode:
    case E_V8QImode:
      break;
    default:
      gcc_unreachable ();
    }

  qop1 = lowpart_subreg (V16QImode, force_reg (qimode, op1), qimode);

  if (op2vec)
    qop2 = lowpart_subreg (V16QImode, force_reg (qimode, op2), qimode);
  else
    qop2 = op2;

  qdest = gen_reg_rtx (V16QImode);

  if (CONST_INT_P (op2)
      && (code == ASHIFT || code == LSHIFTRT || code == ASHIFTRT)
      && ix86_expand_vec_shift_qihi_constant (code, qdest, qop1, qop2))
    {
      emit_move_insn (dest, gen_lowpart (qimode, qdest));
      return;
    }

  switch (code)
    {
    case MULT:
      gcc_assert (op2vec);
      if (!TARGET_SSE4_1)
	{
	  /* Unpack data such that we've got a source byte in each low byte
	     of each word.  We don't care what goes into the high byte of
	     each word.  Rather than trying to get zero in there, most
	     convenient is to let it be a copy of the low byte.  */
	  hop1 = copy_to_reg (qop1);
	  hop2 = copy_to_reg (qop2);
	  emit_insn (gen_vec_interleave_lowv16qi (hop1, hop1, hop1));
	  emit_insn (gen_vec_interleave_lowv16qi (hop2, hop2, hop2));
	  break;
	}
      /* FALLTHRU */
    case ASHIFT:
    case ASHIFTRT:
    case LSHIFTRT:
      hop1 = gen_reg_rtx (V8HImode);
      ix86_expand_sse_unpack (hop1, qop1, uns_p, false);
      /* mult/vashr/vlshr/vashl  */
      if (op2vec)
	{
	  hop2 = gen_reg_rtx (V8HImode);
	  ix86_expand_sse_unpack (hop2, qop2, uns_p, false);
	}
      else
	hop2 = qop2;
      break;
    default:
      gcc_unreachable ();
    }

  if (code != MULT && op2vec)
    {
      /* Expand vashr/vlshr/vashl.  */
      hdest = gen_reg_rtx (V8HImode);
      emit_insn (gen_rtx_SET (hdest,
			      simplify_gen_binary (code, V8HImode,
						   hop1, hop2)));
    }
  else
    /* Expand mult/ashr/lshr/ashl.  */
    hdest = expand_simple_binop (V8HImode, code, hop1, hop2,
				 NULL_RTX, 1, OPTAB_DIRECT);

  if (TARGET_AVX512BW && TARGET_AVX512VL)
    {
      if (qimode == V8QImode)
	qdest = dest;
      else
	qdest = gen_reg_rtx (V8QImode);

      emit_insn (gen_truncv8hiv8qi2 (qdest, hdest));
    }
  else
    {
      struct expand_vec_perm_d d;
      rtx qres = gen_lowpart (V16QImode, hdest);
      bool ok;
      int i;

      /* Merge the data back into the right place.  */
      d.target = qdest;
      d.op0 = d.op1 = qres;
      d.vmode = V16QImode;
      d.nelt = 16;
      d.one_operand_p = false;
      d.testing_p = false;

      for (i = 0; i < d.nelt; ++i)
	d.perm[i] = i * 2;

      ok = ix86_expand_vec_perm_const_1 (&d);
      gcc_assert (ok);
    }

  if (qdest != dest)
    emit_move_insn (dest, gen_lowpart (qimode, qdest));
}
/* Emit the instruction in 2x wider mode.  For example, optimize
   vector MUL generation like

   vpmovzxbw ymm2, xmm0
   vpmovzxbw ymm3, xmm1
   vpmullw   ymm4, ymm2, ymm3
   vpmovwb   xmm0, ymm4

   which takes fewer instructions than ix86_expand_vecop_qihi.
   Return true on success.  */

static bool
ix86_expand_vecop_qihi2 (enum rtx_code code, rtx dest, rtx op1, rtx op2)
{
  machine_mode himode, qimode = GET_MODE (dest);
  machine_mode wqimode;
  rtx qop1, qop2, hop1, hop2, hdest;
  rtx (*gen_truncate) (rtx, rtx) = NULL;
  bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
  bool uns_p = code != ASHIFTRT;

  if ((qimode == V16QImode && !TARGET_AVX2)
      || (qimode == V32QImode && !TARGET_AVX512BW)
      /* There are no V64HImode instructions.  */
      || qimode == V64QImode)
    return false;

  /* Do not generate ymm/zmm instructions when
     the target prefers 128/256 bit vector width.  */
  if ((qimode == V16QImode && TARGET_PREFER_AVX128)
      || (qimode == V32QImode && TARGET_PREFER_AVX256))
    return false;

  switch (qimode)
    {
    case E_V16QImode:
      himode = V16HImode;
      if (TARGET_AVX512VL && TARGET_AVX512BW)
	gen_truncate = gen_truncv16hiv16qi2;
      break;
    case E_V32QImode:
      himode = V32HImode;
      gen_truncate = gen_truncv32hiv32qi2;
      break;
    default:
      gcc_unreachable ();
    }

  wqimode = GET_MODE_2XWIDER_MODE (qimode).require ();
  qop1 = lowpart_subreg (wqimode, force_reg (qimode, op1), qimode);

  if (op2vec)
    qop2 = lowpart_subreg (wqimode, force_reg (qimode, op2), qimode);
  else
    qop2 = op2;

  hop1 = gen_reg_rtx (himode);
  ix86_expand_sse_unpack (hop1, qop1, uns_p, false);

  if (op2vec)
    {
      hop2 = gen_reg_rtx (himode);
      ix86_expand_sse_unpack (hop2, qop2, uns_p, false);
    }
  else
    hop2 = qop2;

  if (code != MULT && op2vec)
    {
      /* Expand vashr/vlshr/vashl.  */
      hdest = gen_reg_rtx (himode);
      emit_insn (gen_rtx_SET (hdest,
			      simplify_gen_binary (code, himode,
						   hop1, hop2)));
    }
  else
    /* Expand mult/ashr/lshr/ashl.  */
    hdest = expand_simple_binop (himode, code, hop1, hop2,
				 NULL_RTX, 1, OPTAB_DIRECT);

  if (gen_truncate)
    emit_insn (gen_truncate (dest, hdest));
  else
    {
      struct expand_vec_perm_d d;
      rtx wqdest = gen_reg_rtx (wqimode);
      rtx wqres = gen_lowpart (wqimode, hdest);
      bool ok;
      int i;

      /* Merge the data back into the right place.  */
      d.target = wqdest;
      d.op0 = d.op1 = wqres;
      d.vmode = wqimode;
      d.nelt = GET_MODE_NUNITS (wqimode);
      d.one_operand_p = false;
      d.testing_p = false;

      for (i = 0; i < d.nelt; ++i)
	d.perm[i] = i * 2;

      ok = ix86_expand_vec_perm_const_1 (&d);
      gcc_assert (ok);

      emit_move_insn (dest, gen_lowpart (qimode, wqdest));
    }

  return true;
}
/* Expand a vector operation CODE for a V*QImode in terms of the
   same operation on V*HImode.  */

void
ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
{
  machine_mode qimode = GET_MODE (dest);
  machine_mode himode;
  rtx (*gen_il) (rtx, rtx, rtx);
  rtx (*gen_ih) (rtx, rtx, rtx);
  rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
  bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
  struct expand_vec_perm_d d;
  bool full_interleave = true;
  bool uns_p = code != ASHIFTRT;
  bool ok;
  int i;

  if (CONST_INT_P (op2)
      && (code == ASHIFT || code == LSHIFTRT || code == ASHIFTRT)
      && ix86_expand_vec_shift_qihi_constant (code, dest, op1, op2))
    return;

  if (ix86_expand_vecop_qihi2 (code, dest, op1, op2))
    return;

  switch (qimode)
    {
    case E_V16QImode:
      himode = V8HImode;
      break;
    case E_V32QImode:
      himode = V16HImode;
      break;
    case E_V64QImode:
      himode = V32HImode;
      break;
    default:
      gcc_unreachable ();
    }

  switch (code)
    {
    case MULT:
      gcc_assert (op2vec);
      /* Unpack data such that we've got a source byte in each low byte of
	 each word.  We don't care what goes into the high byte of each word.
	 Rather than trying to get zero in there, most convenient is to let
	 it be a copy of the low byte.  */
      switch (qimode)
	{
	case E_V16QImode:
	  gen_il = gen_vec_interleave_lowv16qi;
	  gen_ih = gen_vec_interleave_highv16qi;
	  break;
	case E_V32QImode:
	  gen_il = gen_avx2_interleave_lowv32qi;
	  gen_ih = gen_avx2_interleave_highv32qi;
	  full_interleave = false;
	  break;
	case E_V64QImode:
	  gen_il = gen_avx512bw_interleave_lowv64qi;
	  gen_ih = gen_avx512bw_interleave_highv64qi;
	  full_interleave = false;
	  break;
	default:
	  gcc_unreachable ();
	}

      op2_l = gen_reg_rtx (qimode);
      op2_h = gen_reg_rtx (qimode);
      emit_insn (gen_il (op2_l, op2, op2));
      emit_insn (gen_ih (op2_h, op2, op2));

      op1_l = gen_reg_rtx (qimode);
      op1_h = gen_reg_rtx (qimode);
      emit_insn (gen_il (op1_l, op1, op1));
      emit_insn (gen_ih (op1_h, op1, op1));
      break;

    case ASHIFT:
    case ASHIFTRT:
    case LSHIFTRT:
      op1_l = gen_reg_rtx (himode);
      op1_h = gen_reg_rtx (himode);
      ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
      ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
      /* vashr/vlshr/vashl  */
      if (op2vec)
	{
	  rtx tmp = force_reg (qimode, op2);
	  op2_l = gen_reg_rtx (himode);
	  op2_h = gen_reg_rtx (himode);
	  ix86_expand_sse_unpack (op2_l, tmp, uns_p, false);
	  ix86_expand_sse_unpack (op2_h, tmp, uns_p, true);
	}
      else
	op2_l = op2_h = op2;
      break;

    default:
      gcc_unreachable ();
    }

  if (code != MULT && op2vec)
    {
      /* Expand vashr/vlshr/vashl.  */
      res_l = gen_reg_rtx (himode);
      res_h = gen_reg_rtx (himode);
      emit_insn (gen_rtx_SET (res_l,
			      simplify_gen_binary (code, himode,
						   op1_l, op2_l)));
      emit_insn (gen_rtx_SET (res_h,
			      simplify_gen_binary (code, himode,
						   op1_h, op2_h)));
    }
  else
    {
      /* Expand mult/ashr/lshr/ashl.  */
      res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
				   1, OPTAB_DIRECT);
      res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
				   1, OPTAB_DIRECT);
    }

  gcc_assert (res_l && res_h);

  /* Merge the data back into the right place.  */
  d.target = dest;
  d.op0 = gen_lowpart (qimode, res_l);
  d.op1 = gen_lowpart (qimode, res_h);
  d.vmode = qimode;
  d.nelt = GET_MODE_NUNITS (qimode);
  d.one_operand_p = false;
  d.testing_p = false;

  if (full_interleave)
    {
      /* We used the full interleave, the desired
	 results are in the even elements.  */
      for (i = 0; i < d.nelt; ++i)
	d.perm[i] = i * 2;
    }
  else
    {
      /* For AVX, the interleave used above was not cross-lane.  So the
	 extraction is evens but with the second and third quarter swapped.
	 Happily, that is even one insn shorter than even extraction.
	 For AVX512BW we have 4 lanes.  We extract evens from within a lane,
	 always first from the first and then from the second source operand,
	 the index bits above the low 4 bits remain the same.
	 Thus, for d.nelt == 32 we want permutation
	 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
	 and for d.nelt == 64 we want permutation
	 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
	 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126.  */
      for (i = 0; i < d.nelt; ++i)
	d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
    }

  ok = ix86_expand_vec_perm_const_1 (&d);
  gcc_assert (ok);
}
/* Helper function of ix86_expand_mul_widen_evenodd.  Return true
   if op is CONST_VECTOR with all odd elements equal to their
   preceding element.  */

static bool
const_vector_equal_evenodd_p (rtx op)
{
  machine_mode mode = GET_MODE (op);
  int i, nunits = GET_MODE_NUNITS (mode);
  if (GET_CODE (op) != CONST_VECTOR
      || nunits != CONST_VECTOR_NUNITS (op))
    return false;
  for (i = 0; i < nunits; i += 2)
    if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
      return false;
  return true;
}
void
ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
			       bool uns_p, bool odd_p)
{
  machine_mode mode = GET_MODE (op1);
  machine_mode wmode = GET_MODE (dest);
  rtx x;
  rtx orig_op1 = op1, orig_op2 = op2;

  if (!nonimmediate_operand (op1, mode))
    op1 = force_reg (mode, op1);
  if (!nonimmediate_operand (op2, mode))
    op2 = force_reg (mode, op2);

  /* We only play even/odd games with vectors of SImode.  */
  gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);

  /* If we're looking for the odd results, shift those members down to
     the even slots.  For some cpus this is faster than a PSHUFD.  */
  if (odd_p)
    {
      /* For XOP use vpmacsdqh, but only for smult, as it is only
	 signed.  */
      if (TARGET_XOP && mode == V4SImode && !uns_p)
	{
	  x = force_reg (wmode, CONST0_RTX (wmode));
	  emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
	  return;
	}

      x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
      if (!const_vector_equal_evenodd_p (orig_op1))
	op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
			    x, NULL, 1, OPTAB_DIRECT);
      if (!const_vector_equal_evenodd_p (orig_op2))
	op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
			    x, NULL, 1, OPTAB_DIRECT);
      op1 = gen_lowpart (mode, op1);
      op2 = gen_lowpart (mode, op2);
    }

  if (mode == V16SImode)
    {
      if (uns_p)
	x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
      else
	x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
    }
  else if (mode == V8SImode)
    {
      if (uns_p)
	x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
      else
	x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
    }
  else if (uns_p)
    x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
  else if (TARGET_SSE4_1)
    x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
  else
    {
      rtx s1, s2, t0, t1, t2;

      /* The easiest way to implement this without PMULDQ is to go through
	 the motions as if we are performing a full 64-bit multiply.  With
	 the exception that we need to do less shuffling of the elements.  */

      /* Compute the sign-extension, aka highparts, of the two operands.  */
      s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
				op1, pc_rtx, pc_rtx);
      s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
				op2, pc_rtx, pc_rtx);

      /* Multiply LO(A) * HI(B), and vice-versa.  */
      t1 = gen_reg_rtx (wmode);
      t2 = gen_reg_rtx (wmode);
      emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
      emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));

      /* Multiply LO(A) * LO(B).  */
      t0 = gen_reg_rtx (wmode);
      emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));

      /* Combine and shift the highparts into place.  */
      t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
      t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
			 1, OPTAB_DIRECT);

      /* Combine high and low parts.  */
      force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
      return;
    }
  emit_insn (x);
}
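
/* Editor's note (illustrative comment added during editing, not part of the
   original source): modulo 2^64,
     (s64) a * (s64) b
       = (u64) a * (u64) b - (((a < 0 ? b : 0) + (b < 0 ? a : 0)) << 32).
   The pcmpgtd results s1/s2 above are all-ones exactly where the operand
   is negative, so the two umult-even products t1/t2, once summed and
   shifted left by 32, contribute exactly that correction term.  */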
void
ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
			    bool uns_p, bool high_p)
{
  machine_mode wmode = GET_MODE (dest);
  machine_mode mode = GET_MODE (op1);
  rtx t1, t2, t3, t4, mask;

  switch (mode)
    {
    case E_V4SImode:
      t1 = gen_reg_rtx (mode);
      t2 = gen_reg_rtx (mode);
      if (TARGET_XOP && !uns_p)
	{
	  /* With XOP, we have pmacsdqh, aka mul_widen_odd.  In this case,
	     shuffle the elements once so that all elements are in the right
	     place for immediate use: { A C B D }.  */
	  emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
					const1_rtx, GEN_INT (3)));
	  emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
					const1_rtx, GEN_INT (3)));
	}
      else
	{
	  /* Put the elements into place for the multiply.  */
	  ix86_expand_vec_interleave (t1, op1, op1, high_p);
	  ix86_expand_vec_interleave (t2, op2, op2, high_p);
	  high_p = false;
	}
      ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
      break;

    case E_V8SImode:
      /* Shuffle the elements between the lanes.  After this we
	 have { A B E F | C D G H } for each operand.  */
      t1 = gen_reg_rtx (V4DImode);
      t2 = gen_reg_rtx (V4DImode);
      emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
				      const0_rtx, const2_rtx,
				      const1_rtx, GEN_INT (3)));
      emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
				      const0_rtx, const2_rtx,
				      const1_rtx, GEN_INT (3)));

      /* Shuffle the elements within the lanes.  After this we
	 have { A A B B | C C D D } or { E E F F | G G H H }.  */
      t3 = gen_reg_rtx (V8SImode);
      t4 = gen_reg_rtx (V8SImode);
      mask = GEN_INT (high_p
		      ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
		      : 0 + (0 << 2) + (1 << 4) + (1 << 6));
      emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
      emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));

      ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
      break;

    case E_V8HImode:
    case E_V16HImode:
      t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
			 uns_p, OPTAB_DIRECT);
      t2 = expand_binop (mode,
			 uns_p ? umul_highpart_optab : smul_highpart_optab,
			 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
      gcc_assert (t1 && t2);

      t3 = gen_reg_rtx (mode);
      ix86_expand_vec_interleave (t3, t1, t2, high_p);
      emit_move_insn (dest, gen_lowpart (wmode, t3));
      break;

    case E_V16QImode:
    case E_V32QImode:
    case E_V32HImode:
    case E_V64QImode:
      t1 = gen_reg_rtx (wmode);
      t2 = gen_reg_rtx (wmode);
      ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
      ix86_expand_sse_unpack (t2, op2, uns_p, high_p);

      emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
      break;

    default:
      gcc_unreachable ();
    }
}
void
ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
{
  rtx res_1, res_2, res_3, res_4;

  res_1 = gen_reg_rtx (V4SImode);
  res_2 = gen_reg_rtx (V4SImode);
  res_3 = gen_reg_rtx (V2DImode);
  res_4 = gen_reg_rtx (V2DImode);
  ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
  ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);

  /* Move the results in element 2 down to element 1; we don't care
     what goes in elements 2 and 3.  Then we can merge the parts
     back together with an interleave.

     Note that two other sequences were tried:
     (1) Use interleaves at the start instead of psrldq, which allows
     us to use a single shufps to merge things back at the end.
     (2) Use shufps here to combine the two vectors, then pshufd to
     put the elements in the correct order.
     In both cases the cost of the reformatting stall was too high
     and the overall sequence slower.  */

  emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
				const0_rtx, const2_rtx,
				const0_rtx, const0_rtx));
  emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
				const0_rtx, const2_rtx,
				const0_rtx, const0_rtx));
  res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));

  set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
}
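
/* Editor's note (illustrative comment added during editing, not part of the
   original source): the V4SImode multiply above decomposes as
     even products  {a0*b0, a2*b2}   (pmuludq)
     odd products   {a1*b1, a3*b3}   (pmuludq on inputs shifted right 32)
   after which the two pshufd insns move the low halves of the 64-bit
   products into elements 0 and 1, and punpckldq interleaves them back
   into {a0*b0, a1*b1, a2*b2, a3*b3} mod 2^32.  */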
void
ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
{
  machine_mode mode = GET_MODE (op0);
  rtx t1, t2, t3, t4, t5, t6;

  if (TARGET_AVX512DQ && mode == V8DImode)
    emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
  else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
    emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
  else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
    emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
  else if (TARGET_XOP && mode == V2DImode)
    {
      /* op1: A,B,C,D, op2: E,F,G,H */
      op1 = gen_lowpart (V4SImode, op1);
      op2 = gen_lowpart (V4SImode, op2);

      t1 = gen_reg_rtx (V4SImode);
      t2 = gen_reg_rtx (V4SImode);
      t3 = gen_reg_rtx (V2DImode);
      t4 = gen_reg_rtx (V2DImode);

      /* t1: B,A,D,C */
      emit_insn (gen_sse2_pshufd_1 (t1, op1,
				    GEN_INT (1), GEN_INT (0),
				    GEN_INT (3), GEN_INT (2)));

      /* t2: (B*E),(A*F),(D*G),(C*H) */
      emit_insn (gen_mulv4si3 (t2, t1, op2));

      /* t3: (B*E)+(A*F), (D*G)+(C*H) */
      emit_insn (gen_xop_phadddq (t3, t2));

      /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
      emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));

      /* Multiply lower parts and add all.  */
      t5 = gen_reg_rtx (V2DImode);
      emit_insn (gen_vec_widen_umult_even_v4si (t5,
						gen_lowpart (V4SImode, op1),
						gen_lowpart (V4SImode, op2)));
      force_expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
    }
  else
    {
      machine_mode nmode;
      rtx (*umul) (rtx, rtx, rtx);

      if (mode == V2DImode)
	{
	  umul = gen_vec_widen_umult_even_v4si;
	  nmode = V4SImode;
	}
      else if (mode == V4DImode)
	{
	  umul = gen_vec_widen_umult_even_v8si;
	  nmode = V8SImode;
	}
      else if (mode == V8DImode)
	{
	  umul = gen_vec_widen_umult_even_v16si;
	  nmode = V16SImode;
	}
      else
	gcc_unreachable ();

      /* Multiply low parts.  */
      t1 = gen_reg_rtx (mode);
      emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));

      /* Shift input vectors right 32 bits so we can multiply high parts.  */
      t6 = GEN_INT (32);
      t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
      t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);

      /* Multiply high parts by low parts.  */
      t4 = gen_reg_rtx (mode);
      t5 = gen_reg_rtx (mode);
      emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
      emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));

      /* Combine and shift the highparts back.  */
      t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
      t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);

      /* Combine high and low parts.  */
      force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
    }

  set_unique_reg_note (get_last_insn (), REG_EQUAL,
		       gen_rtx_MULT (mode, op1, op2));
}
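
/* Editor's note (illustrative comment added during editing, not part of the
   original source): the generic path above is the schoolbook decomposition,
   modulo 2^64:
     a * b = lo(a)*lo(b) + ((lo(a)*hi(b) + hi(a)*lo(b)) << 32)
   with each 32x32->64 partial product formed by pmuludq on the even dword
   lanes.  The hi(a)*hi(b) term only affects bits >= 64 and is dropped.  */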
/* Return 1 if control transfer instruction INSN
   should be encoded with the notrack prefix.  */

bool
ix86_notrack_prefixed_insn_p (rtx_insn *insn)
{
  if (!insn || !((flag_cf_protection & CF_BRANCH)))
    return false;

  if (CALL_P (insn))
    {
      rtx call = get_call_rtx_from (insn);
      gcc_assert (call != NULL_RTX);
      rtx addr = XEXP (call, 0);

      /* Do not emit 'notrack' if it's not an indirect call.  */
      if (MEM_P (addr)
	  && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
	return false;
      else
	return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
    }

  if (JUMP_P (insn) && !flag_cet_switch)
    {
      rtx target = JUMP_LABEL (insn);
      if (target == NULL_RTX || ANY_RETURN_P (target))
	return false;

      /* Check the jump is a switch table.  */
      rtx_insn *label = as_a<rtx_insn *> (target);
      rtx_insn *table = next_insn (label);
      if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
	return false;
      else
	return true;
    }
  return false;
}
/* Calculate integer abs() using only SSE2 instructions.  */

void
ix86_expand_sse2_abs (rtx target, rtx input)
{
  machine_mode mode = GET_MODE (target);
  rtx tmp0, tmp1, x;

  switch (mode)
    {
    case E_V2DImode:
    case E_V4DImode:
      /* For 64-bit signed integer X, with SSE4.2 use
	 pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
	 Otherwise handle it similarly to V4SImode, except use 64 as W
	 instead of 32 and use logical instead of arithmetic right shift
	 (which is unimplemented) and subtract.  */
      if (TARGET_SSE4_2)
	{
	  tmp0 = gen_reg_rtx (mode);
	  tmp1 = gen_reg_rtx (mode);
	  emit_move_insn (tmp1, CONST0_RTX (mode));
	  if (mode == E_V2DImode)
	    emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input));
	  else
	    emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input));
	}
      else
	{
	  tmp0 = expand_simple_binop (mode, LSHIFTRT, input,
				      GEN_INT (GET_MODE_UNIT_BITSIZE (mode)
					       - 1), NULL, 0, OPTAB_DIRECT);
	  tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false);
	}

      tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
				  NULL, 0, OPTAB_DIRECT);
      x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
			       target, 0, OPTAB_DIRECT);
      break;

    case E_V4SImode:
      /* For 32-bit signed integer X, the best way to calculate the absolute
	 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)).  */
      tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
				  GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
				  NULL, 0, OPTAB_DIRECT);
      tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
				  NULL, 0, OPTAB_DIRECT);
      x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
			       target, 0, OPTAB_DIRECT);
      break;

    case E_V8HImode:
      /* For 16-bit signed integer X, the best way to calculate the absolute
	 value of X is max (X, -X), as SSE2 provides the PMAXSW insn.  */
      tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);

      x = expand_simple_binop (mode, SMAX, tmp0, input,
			       target, 0, OPTAB_DIRECT);
      break;

    case E_V16QImode:
      /* For 8-bit signed integer X, the best way to calculate the absolute
	 value of X is min ((unsigned char) X, (unsigned char) (-X)),
	 as SSE2 provides the PMINUB insn.  */
      tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);

      x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
			       target, 0, OPTAB_DIRECT);
      break;

    default:
      gcc_unreachable ();
    }

  if (x != target)
    emit_move_insn (target, x);
}
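
/* Editor's note: a worked example of the V4SImode branch (added during
   editing, not part of the original source).  For x = -5:
   t = x >> 31 = -1 (all ones), (x ^ t) = 4 (one's complement), and
   4 - (-1) = 5.  For x >= 0 the mask t is 0 and both operations are
   no-ops, so abs (x) = ((x >> 31) ^ x) - (x >> 31) holds for all x
   except INT_MIN, which wraps to itself just as a plain negate would.  */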
/* Expand an extract from a vector register through pextr insn.
   Return true if successful.  */

bool
ix86_expand_pextr (rtx *operands)
{
  rtx dst = operands[0];
  rtx src = operands[1];

  unsigned int size = INTVAL (operands[2]);
  unsigned int pos = INTVAL (operands[3]);

  if (SUBREG_P (dst))
    {
      /* Reject non-lowpart subregs.  */
      if (SUBREG_BYTE (dst) > 0)
	return false;
      dst = SUBREG_REG (dst);
    }

  if (SUBREG_P (src))
    {
      pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
      src = SUBREG_REG (src);
    }

  switch (GET_MODE (src))
    {
    case E_V16QImode:
    case E_V8HImode:
    case E_V4SImode:
    case E_V2DImode:
    case E_V1TImode:
      {
	machine_mode srcmode, dstmode;
	rtx d, pat;

	if (!int_mode_for_size (size, 0).exists (&dstmode))
	  return false;

	switch (dstmode)
	  {
	  case E_QImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    srcmode = V16QImode;
	    break;

	  case E_HImode:
	    if (!TARGET_SSE2)
	      return false;
	    srcmode = V8HImode;
	    break;

	  case E_SImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    srcmode = V4SImode;
	    break;

	  case E_DImode:
	    gcc_assert (TARGET_64BIT);
	    if (!TARGET_SSE4_1)
	      return false;
	    srcmode = V2DImode;
	    break;

	  default:
	    return false;
	  }

	/* Reject extractions from misaligned positions.  */
	if (pos & (size - 1))
	  return false;

	if (GET_MODE (dst) == dstmode)
	  d = dst;
	else
	  d = gen_reg_rtx (dstmode);

	/* Construct insn pattern.  */
	pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
	pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);

	/* Let the rtl optimizers know about the zero extension performed.  */
	if (dstmode == QImode || dstmode == HImode)
	  {
	    pat = gen_rtx_ZERO_EXTEND (SImode, pat);
	    d = gen_lowpart (SImode, d);
	  }

	emit_insn (gen_rtx_SET (d, pat));

	if (d != dst)
	  emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
	return true;
      }

    default:
      return false;
    }
}
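
/* Editor's note (illustrative comment added during editing, not part of the
   original source): given a V8HImode source, extracting the 16 bits at bit
   position 32 maps to pextrw with lane index pos / size = 2 (dstmode
   HImode, srcmode V8HImode).  A request with pos = 40 would be rejected by
   the alignment check, since 40 & (16 - 1) != 0.  */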
/* Expand an insert into a vector register through pinsr insn.
   Return true if successful.  */

bool
ix86_expand_pinsr (rtx *operands)
{
  rtx dst = operands[0];
  rtx src = operands[3];

  unsigned int size = INTVAL (operands[1]);
  unsigned int pos = INTVAL (operands[2]);

  if (SUBREG_P (dst))
    {
      pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
      dst = SUBREG_REG (dst);
    }

  switch (GET_MODE (dst))
    {
    case E_V16QImode:
    case E_V8HImode:
    case E_V4SImode:
    case E_V2DImode:
    case E_V1TImode:
      {
	machine_mode srcmode, dstmode;
	rtx (*pinsr)(rtx, rtx, rtx, rtx);
	rtx d;

	if (!int_mode_for_size (size, 0).exists (&srcmode))
	  return false;

	switch (srcmode)
	  {
	  case E_QImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    dstmode = V16QImode;
	    pinsr = gen_sse4_1_pinsrb;
	    break;

	  case E_HImode:
	    if (!TARGET_SSE2)
	      return false;
	    dstmode = V8HImode;
	    pinsr = gen_sse2_pinsrw;
	    break;

	  case E_SImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    dstmode = V4SImode;
	    pinsr = gen_sse4_1_pinsrd;
	    break;

	  case E_DImode:
	    gcc_assert (TARGET_64BIT);
	    if (!TARGET_SSE4_1)
	      return false;
	    dstmode = V2DImode;
	    pinsr = gen_sse4_1_pinsrq;
	    break;

	  default:
	    return false;
	  }

	/* Reject insertions to misaligned positions.  */
	if (pos & (size - 1))
	  return false;

	if (SUBREG_P (src))
	  {
	    unsigned int srcpos = SUBREG_BYTE (src);

	    if (srcpos > 0)
	      {
		rtx extr_ops[4];

		extr_ops[0] = gen_reg_rtx (srcmode);
		extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
		extr_ops[2] = GEN_INT (size);
		extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);

		if (!ix86_expand_pextr (extr_ops))
		  return false;

		src = extr_ops[0];
	      }
	    else
	      src = gen_lowpart (srcmode, SUBREG_REG (src));
	  }

	if (GET_MODE (dst) == dstmode)
	  d = dst;
	else
	  d = gen_reg_rtx (dstmode);

	emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
			  gen_lowpart (srcmode, src),
			  GEN_INT (1 << (pos / size))));

	if (d != dst)
	  emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
	return true;
      }

    default:
      return false;
    }
}
/* All CPUs prefer to avoid cross-lane operations, so perform reductions
   upper against lower halves up to SSE reg size.  */

machine_mode
ix86_split_reduction (machine_mode mode)
{
  /* Reduce lowpart against highpart until we reach SSE reg width to
     avoid cross-lane operations.  */
  switch (mode)
    {
    case E_V8DImode:
    case E_V4DImode:
      return V2DImode;
    case E_V16SImode:
    case E_V8SImode:
      return V4SImode;
    case E_V32HImode:
    case E_V16HImode:
      return V8HImode;
    case E_V64QImode:
    case E_V32QImode:
      return V16QImode;
    case E_V16SFmode:
    case E_V8SFmode:
      return V4SFmode;
    case E_V8DFmode:
    case E_V4DFmode:
      return V2DFmode;
    default:
      return mode;
    }
}
/* Generate call to __divmoddi4.  */

void
ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
			    rtx op0, rtx op1,
			    rtx *quot_p, rtx *rem_p)
{
  rtx rem = assign_386_stack_local (mode, SLOT_TEMP);

  rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
				      mode, op0, mode, op1, mode,
				      XEXP (rem, 0), Pmode);
  *quot_p = quot;
  *rem_p = rem;
}
void
ix86_expand_atomic_fetch_op_loop (rtx target, rtx mem, rtx val,
				  enum rtx_code code, bool after,
				  bool doubleword)
{
  rtx old_reg, new_reg, old_mem, success;
  machine_mode mode = GET_MODE (target);
  rtx_code_label *loop_label = NULL;

  old_reg = gen_reg_rtx (mode);
  new_reg = old_reg;
  old_mem = copy_to_reg (mem);
  loop_label = gen_label_rtx ();
  emit_label (loop_label);
  emit_move_insn (old_reg, old_mem);

  /* return value for atomic_fetch_op.  */
  if (!after)
    emit_move_insn (target, old_reg);

  if (code == NOT)
    {
      new_reg = expand_simple_binop (mode, AND, new_reg, val, NULL_RTX,
				     true, OPTAB_LIB_WIDEN);
      new_reg = expand_simple_unop (mode, code, new_reg, NULL_RTX, true);
    }
  else
    new_reg = expand_simple_binop (mode, code, new_reg, val, NULL_RTX,
				   true, OPTAB_LIB_WIDEN);

  /* return value for atomic_op_fetch.  */
  if (after)
    emit_move_insn (target, new_reg);

  success = NULL_RTX;

  ix86_expand_cmpxchg_loop (&success, old_mem, mem, old_reg, new_reg,
			    gen_int_mode (MEMMODEL_SYNC_SEQ_CST,
					  SImode),
			    doubleword, loop_label);
}
/* Relax the cmpxchg instruction; the param loop_label indicates whether
   the instruction should be relaxed with a pause loop.  If not,
   it is relaxed to an atomic load + compare, and the cmpxchg
   instruction is skipped when mem != exp_input.  */

void
ix86_expand_cmpxchg_loop (rtx *ptarget_bool, rtx target_val,
			  rtx mem, rtx exp_input, rtx new_input,
			  rtx mem_model, bool doubleword,
			  rtx_code_label *loop_label)
{
  rtx_code_label *cmp_label = NULL;
  rtx_code_label *done_label = NULL;
  rtx target_bool = NULL_RTX, new_mem = NULL_RTX;
  rtx (*gen) (rtx, rtx, rtx, rtx, rtx) = NULL;
  rtx (*gendw) (rtx, rtx, rtx, rtx, rtx, rtx) = NULL;
  machine_mode mode = GET_MODE (target_val), hmode = mode;

  if (*ptarget_bool == NULL)
    target_bool = gen_reg_rtx (QImode);
  else
    target_bool = *ptarget_bool;

  cmp_label = gen_label_rtx ();
  done_label = gen_label_rtx ();

  new_mem = gen_reg_rtx (mode);
  /* Load memory first.  */
  expand_atomic_load (new_mem, mem, MEMMODEL_SEQ_CST);

  switch (mode)
    {
    case E_TImode:
      gendw = gen_atomic_compare_and_swapti_doubleword;
      hmode = DImode;
      break;
    case E_DImode:
      if (doubleword)
	{
	  gendw = gen_atomic_compare_and_swapdi_doubleword;
	  hmode = SImode;
	}
      else
	gen = gen_atomic_compare_and_swapdi_1;
      break;
    case E_SImode:
      gen = gen_atomic_compare_and_swapsi_1;
      break;
    case E_HImode:
      gen = gen_atomic_compare_and_swaphi_1;
      break;
    case E_QImode:
      gen = gen_atomic_compare_and_swapqi_1;
      break;
    default:
      gcc_unreachable ();
    }

  /* Compare mem value with expected value.  */
  if (doubleword)
    {
      rtx low_new_mem = gen_lowpart (hmode, new_mem);
      rtx low_exp_input = gen_lowpart (hmode, exp_input);
      rtx high_new_mem = gen_highpart (hmode, new_mem);
      rtx high_exp_input = gen_highpart (hmode, exp_input);
      emit_cmp_and_jump_insns (low_new_mem, low_exp_input, NE, NULL_RTX,
			       hmode, 1, cmp_label,
			       profile_probability::guessed_never ());
      emit_cmp_and_jump_insns (high_new_mem, high_exp_input, NE, NULL_RTX,
			       hmode, 1, cmp_label,
			       profile_probability::guessed_never ());
    }
  else
    emit_cmp_and_jump_insns (new_mem, exp_input, NE, NULL_RTX,
			     GET_MODE (exp_input), 1, cmp_label,
			     profile_probability::guessed_never ());

  /* Directly emit cmpxchg here.  */
  if (doubleword)
    emit_insn (gendw (target_val, mem, exp_input,
		      gen_lowpart (hmode, new_input),
		      gen_highpart (hmode, new_input),
		      mem_model));
  else
    emit_insn (gen (target_val, mem, exp_input, new_input, mem_model));

  if (!loop_label)
    {
      emit_jump_insn (gen_jump (done_label));
      emit_barrier ();
      emit_label (cmp_label);
      emit_move_insn (target_val, new_mem);
      emit_label (done_label);
      ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
			 const0_rtx);
    }
  else
    {
      ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
			 const0_rtx);
      emit_cmp_and_jump_insns (target_bool, const0_rtx, EQ, const0_rtx,
			       GET_MODE (target_bool), 1, loop_label,
			       profile_probability::guessed_never ());
      emit_jump_insn (gen_jump (done_label));
      emit_barrier ();

      /* If mem is not expected, pause and loop back.  */
      emit_label (cmp_label);
      emit_move_insn (target_val, new_mem);
      emit_insn (gen_pause ());
      emit_jump_insn (gen_jump (loop_label));
      emit_barrier ();
      emit_label (done_label);
    }

  *ptarget_bool = target_bool;
}
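
/* Editor's note (illustrative comment added during editing, not part of the
   original source): with a non-null loop_label the emitted skeleton is
   roughly

     loop_label:  ... caller recomputes the new value ...
                  atomic load mem -> new_mem
                  cmp new_mem, exp_input;  jne cmp_label
                  lock cmpxchg
                  setz bool;  if (!bool) goto loop_label
                  goto done_label
     cmp_label:   target_val = new_mem;  pause;  goto loop_label
     done_label:

   so the plain load-and-compare skips the expensive locked instruction
   while the memory value is visibly different, and pause relaxes the spin
   between retries.  */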
/* Convert a BFmode VAL to SFmode without signaling sNaNs.
   This is done by returning SF SUBREG of ((HI SUBREG) (VAL)) << 16.  */

rtx
ix86_expand_fast_convert_bf_to_sf (rtx val)
{
  rtx op = gen_lowpart (HImode, val), ret;
  if (CONST_INT_P (op))
    {
      ret = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
					    val, BFmode);
      if (ret)
	return ret;
      /* FLOAT_EXTEND simplification will fail if VAL is a sNaN.  */
      ret = gen_reg_rtx (SImode);
      emit_move_insn (ret, GEN_INT (INTVAL (op) & 0xffff));
      emit_insn (gen_ashlsi3 (ret, ret, GEN_INT (16)));
      return gen_lowpart (SFmode, ret);
    }

  ret = gen_reg_rtx (SFmode);
  emit_insn (gen_extendbfsf2_1 (ret, force_reg (BFmode, val)));
  return ret;
}
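
/* Editor's note (illustrative comment added during editing, not part of the
   original source): bfloat16 is the high half of an IEEE binary32, so
   shifting the 16 payload bits left by 16 yields the SFmode bit pattern
   directly; e.g. 0x3f80 (bfloat16 1.0) << 16 = 0x3f800000, the binary32
   encoding of 1.0.  No FP operation is involved, so a signaling NaN
   payload is copied through without raising an exception.  */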
#include "gt-i386-expand.h"