/* Copyright (C) 1988-2023 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#define IN_TARGET_CODE 1

#include "coretypes.h"
#include "stringpool.h"
#include "diagnostic.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "insn-attr.h"
#include "common/common-target.h"
#include "langhooks.h"
#include "tm-constrs.h"
#include "sched-int.h"
#include "tree-pass.h"
#include "pass_manager.h"
#include "target-globals.h"
#include "gimple-iterator.h"
#include "tree-vectorizer.h"
#include "shrink-wrap.h"
#include "tree-iterator.h"
#include "case-cfn-macros.h"
#include "fold-const-call.h"
#include "tree-ssanames.h"
#include "selftest-rtl.h"
#include "print-rtl.h"
#include "symbol-summary.h"
#include "ipa-fnsummary.h"
#include "wide-int-bitmask.h"
#include "tree-vector-builder.h"
#include "dwarf2out.h"
#include "i386-options.h"
#include "i386-builtins.h"
#include "i386-expand.h"
/* Split one or more double-mode RTL references into pairs of half-mode
   references.  The RTL can be REG, offsettable MEM, integer constant, or
   CONST_DOUBLE.  "operands" is a pointer to an array of double-mode RTLs to
   split and "num" is its length.  lo_half and hi_half are output arrays
   that parallel "operands".  */

split_double_mode (machine_mode mode, rtx operands[],
                   int num, rtx lo_half[], rtx hi_half[])

  machine_mode half_mode;

  rtx mem_op = NULL_RTX;

      byte = GET_MODE_SIZE (half_mode);

      rtx op = operands[num];

      /* simplify_subreg refuses to split volatile memory addresses,
         but we still have to handle them.  */
      if (mem_op && rtx_equal_p (op, mem_op))

          lo_half[num] = lo_half[mem_num];
          hi_half[num] = hi_half[mem_num];

          lo_half[num] = adjust_address (op, half_mode, 0);
          hi_half[num] = adjust_address (op, half_mode, byte);

          lo_half[num] = simplify_gen_subreg (half_mode, op,
                                              GET_MODE (op) == VOIDmode
                                              ? mode : GET_MODE (op), 0);

          rtx tmp = simplify_gen_subreg (half_mode, op,
                                         GET_MODE (op) == VOIDmode
                                         ? mode : GET_MODE (op), byte);
          /* simplify_gen_subreg will return NULL RTX for the
             high half of the paradoxical subreg.  */
          hi_half[num] = tmp ? tmp : gen_reg_rtx (half_mode);
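
/* Illustrative usage sketch (not part of the original file; the array
   sizes and modes here are hypothetical): a caller typically splits each
   double-mode operand into half-mode words like so.

     rtx lo[2], hi[2];
     split_double_mode (DImode, operands, 2, lo, hi);
     // lo[i]/hi[i] now hold the low/high SImode halves of operands[i].  */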
/* Emit the double word assignment DST = { LO, HI }.  */

split_double_concat (machine_mode mode, rtx dst, rtx lo, rtx hi)

  int deleted_move_count = 0;
  split_double_mode (mode, &dst, 1, &dlo, &dhi);
  /* Constraints ensure that if both lo and hi are MEMs, then
     dst has early-clobber and thus addresses of MEMs don't use
     dlo/dhi registers.  Otherwise if at least one of lo and hi are MEMs,
     dlo/dhi are registers.  */

      && rtx_equal_p (dlo, hi)
      && reg_overlap_mentioned_p (dhi, lo))

      /* If dlo is same as hi and lo's address uses dhi register,
         code below would first emit_move_insn (dhi, hi)
         and then emit_move_insn (dlo, lo).  But the former
         would invalidate lo's address.  Load into dhi first.  */
      emit_move_insn (dhi, lo);

      && !rtx_equal_p (dlo, lo)
      && reg_overlap_mentioned_p (dlo, hi))

      /* In this case, code below would first emit_move_insn (dlo, lo)
         and then emit_move_insn (dhi, hi).  But the former would
         invalidate hi's address.  Load into dhi first.  */
      emit_move_insn (dhi, hi);

  if (!rtx_equal_p (dlo, hi))

      if (!rtx_equal_p (dlo, lo))
        emit_move_insn (dlo, lo);

        deleted_move_count++;
      if (!rtx_equal_p (dhi, hi))
        emit_move_insn (dhi, hi);

        deleted_move_count++;

  else if (!rtx_equal_p (lo, dhi))

      if (!rtx_equal_p (dhi, hi))
        emit_move_insn (dhi, hi);

        deleted_move_count++;
      if (!rtx_equal_p (dlo, lo))
        emit_move_insn (dlo, lo);

        deleted_move_count++;

  else if (mode == TImode)
    emit_insn (gen_swapdi (dlo, dhi));

    emit_insn (gen_swapsi (dlo, dhi));

  if (deleted_move_count == 2)
    emit_note (NOTE_INSN_DELETED);
/* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
   for the target.  */

ix86_expand_clear (rtx dest)

  /* We play register width games, which are only valid after reload.  */
  gcc_assert (reload_completed);

  /* Avoid HImode and its attendant prefix byte.  */
  if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
    dest = gen_rtx_REG (SImode, REGNO (dest));
  tmp = gen_rtx_SET (dest, const0_rtx);

  if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())

      rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
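
/* For reference (added note): when TARGET_USE_MOV0 is false or we are
   optimizing for size, the code above wraps the clearing SET in a
   PARALLEL with a flags clobber, e.g. for %eax:

     (parallel [(set (reg:SI ax) (const_int 0))
                (clobber (reg:CC flags))])

   which assembles to "xorl %eax, %eax" instead of "movl $0, %eax".  */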
/* Return true if V can be broadcast from an integer of WIDTH bits
   which is returned in VAL_BROADCAST.  Otherwise, return false.  */

ix86_broadcast (HOST_WIDE_INT v, unsigned int width,
                HOST_WIDE_INT &val_broadcast)

  wide_int val = wi::uhwi (v, HOST_BITS_PER_WIDE_INT);
  val_broadcast = wi::extract_uhwi (val, 0, width);
  for (unsigned int i = width; i < HOST_BITS_PER_WIDE_INT; i += width)

      HOST_WIDE_INT each = wi::extract_uhwi (val, i, width);
      if (val_broadcast != each)

  val_broadcast = sext_hwi (val_broadcast, width);
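
/* Worked example (added for illustration): for v = 0x1212121212121212
   and width = 8, val_broadcast is extracted as 0x12 and every other
   8-bit chunk compares equal, so the value is sign-extended and the
   function succeeds.  For v = 0x1234123412341234, width = 8 fails on
   the second chunk (0x34), but width = 16 succeeds with 0x1234.  */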
/* Convert the CONST_WIDE_INT operand OP to broadcast in MODE.  */

ix86_convert_const_wide_int_to_broadcast (machine_mode mode, rtx op)

  /* Don't use integer vector broadcast if we can't move from GPR to SSE
     register directly.  */
  if (!TARGET_INTER_UNIT_MOVES_TO_VEC)

  /* Convert CONST_WIDE_INT to a non-standard SSE constant integer
     broadcast only if vector broadcast is available.  */

      || !CONST_WIDE_INT_P (op)
      || standard_sse_constant_p (op, mode))

  HOST_WIDE_INT val = CONST_WIDE_INT_ELT (op, 0);
  HOST_WIDE_INT val_broadcast;
  scalar_int_mode broadcast_mode;

      && ix86_broadcast (val, GET_MODE_BITSIZE (QImode),

    broadcast_mode = QImode;

      && ix86_broadcast (val, GET_MODE_BITSIZE (HImode),

    broadcast_mode = HImode;
  else if (ix86_broadcast (val, GET_MODE_BITSIZE (SImode),

    broadcast_mode = SImode;
  else if (TARGET_64BIT
           && ix86_broadcast (val, GET_MODE_BITSIZE (DImode),

    broadcast_mode = DImode;

  /* Check if OP can be broadcast from VAL.  */
  for (int i = 1; i < CONST_WIDE_INT_NUNITS (op); i++)
    if (val != CONST_WIDE_INT_ELT (op, i))

  unsigned int nunits = (GET_MODE_SIZE (mode)
                         / GET_MODE_SIZE (broadcast_mode));
  machine_mode vector_mode;
  if (!mode_for_vector (broadcast_mode, nunits).exists (&vector_mode))

  rtx target = ix86_gen_scratch_sse_rtx (vector_mode);
  bool ok = ix86_expand_vector_init_duplicate (false, vector_mode,
                                               target,
                                               GEN_INT (val_broadcast));

  target = lowpart_subreg (mode, target, vector_mode);
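
/* Illustrative example (added, not from the original sources): a
   V4DImode CONST_WIDE_INT whose elements all equal 0x0101010101010101
   can be broadcast from the QImode value 1; the routine then builds a
   V32QImode duplicate of (const_int 1) in an SSE scratch register and
   returns it as a V4DImode lowpart subreg.  */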
ix86_expand_move (machine_mode mode, rtx operands[])

  rtx tmp, addend = NULL_RTX;
  enum tls_model model;

  /* Avoid complex sets of likely spilled hard registers before reload.  */
  if (!ix86_hardreg_mov_ok (op0, op1))

      tmp = gen_reg_rtx (mode);

      ix86_expand_move (mode, operands);

  switch (GET_CODE (op1))

      if (GET_CODE (tmp) != PLUS
          || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)

      addend = XEXP (tmp, 1);

      model = SYMBOL_REF_TLS_MODEL (op1);

        op1 = legitimize_tls_address (op1, model, true);
      else if (ix86_force_load_from_GOT_p (op1))

          /* Load the external function address via GOT slot to avoid PLT.  */
          op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
                                UNSPEC_GOTPCREL);
          op1 = gen_rtx_CONST (Pmode, op1);
          op1 = gen_const_mem (Pmode, op1);
          set_mem_alias_set (op1, ix86_GOT_alias_set ());

          tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);

          op1 = force_operand (op1, NULL_RTX);
          op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
                                     op0, 1, OPTAB_DIRECT);

            op1 = force_operand (op1, op0);

        op1 = convert_to_mode (mode, op1, 1);

  if ((flag_pic || MACHOPIC_INDIRECT)
      && symbolic_operand (op1, mode))

      if (TARGET_MACHO && !TARGET_64BIT)

          if (MACHOPIC_INDIRECT)

              rtx temp = (op0 && REG_P (op0) && mode == Pmode)
                         ? op0 : gen_reg_rtx (Pmode);
              op1 = machopic_indirect_data_reference (op1, temp);

                op1 = machopic_legitimize_pic_address (op1, mode,
                                                       temp == op1 ? 0 : temp);

          if (op0 != op1 && GET_CODE (op0) != MEM)

              rtx insn = gen_rtx_SET (op0, op1);

          if (GET_CODE (op0) == MEM)
            op1 = force_reg (Pmode, op1);

              if (GET_CODE (temp) != REG)
                temp = gen_reg_rtx (Pmode);
              temp = legitimize_pic_address (op1, temp);

            op1 = force_reg (mode, op1);
          else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))

              rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
              op1 = legitimize_pic_address (op1, reg);

                op1 = convert_to_mode (mode, op1, 1);

          && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
              || !push_operand (op0, mode))

        op1 = force_reg (mode, op1);

      if (push_operand (op0, mode)
          && ! general_no_elim_operand (op1, mode))
        op1 = copy_to_mode_reg (mode, op1);

      /* Force large constants in 64bit compilation into register
         to get them CSEed.  */
      if (can_create_pseudo_p ()
          && (mode == DImode) && TARGET_64BIT
          && immediate_operand (op1, mode)
          && !x86_64_zext_immediate_operand (op1, VOIDmode)
          && !register_operand (op0, mode)

        op1 = copy_to_mode_reg (mode, op1);

      if (can_create_pseudo_p ())

          if (CONST_DOUBLE_P (op1))

              /* If we are loading a floating point constant to a
                 register, force the value to memory now, since we'll
                 get better code out of the back end.  */

              op1 = validize_mem (force_const_mem (mode, op1));
              if (!register_operand (op0, mode))

                  rtx temp = gen_reg_rtx (mode);
                  emit_insn (gen_rtx_SET (temp, op1));
                  emit_move_insn (op0, temp);

          else if (GET_MODE_SIZE (mode) >= 16)

              rtx tmp = ix86_convert_const_wide_int_to_broadcast
                (GET_MODE (op0), op1);

  emit_insn (gen_rtx_SET (op0, op1));
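
/* Sketch of a typical call site (added for illustration; the expander
   body is paraphrased, not quoted): the mov<mode> define_expands in
   i386.md funnel scalar moves through this routine, roughly as

     (define_expand "movsi"
       [(set (match_operand:SI 0 "nonimmediate_operand")
             (match_operand:SI 1 "general_operand"))]
       ""
       "ix86_expand_move (SImode, operands); DONE;")  */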
/* OP is a memref of CONST_VECTOR, return scalar constant mem
   if CONST_VECTOR is a vec_duplicate, else return NULL.  */

ix86_broadcast_from_constant (machine_mode mode, rtx op)

  int nunits = GET_MODE_NUNITS (mode);

  /* Don't use integer vector broadcast if we can't move from GPR to SSE
     register directly.  */
  if (!TARGET_INTER_UNIT_MOVES_TO_VEC
      && INTEGRAL_MODE_P (mode))

  /* Convert CONST_VECTOR to a non-standard SSE constant integer
     broadcast only if vector broadcast is available.  */

           && (GET_MODE_INNER (mode) == SImode
               || GET_MODE_INNER (mode) == DImode))
          || FLOAT_MODE_P (mode))
      || standard_sse_constant_p (op, mode))

  /* Don't broadcast from a 64-bit integer constant in 32-bit mode.
     We can still put 64-bit integer constant in memory when
     avx512 embed broadcast is available.  */
  if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT

          || (GET_MODE_SIZE (mode) < 64 && !TARGET_AVX512VL)))

  if (GET_MODE_INNER (mode) == TImode)

  rtx constant = get_pool_constant (XEXP (op, 0));
  if (GET_CODE (constant) != CONST_VECTOR)

  /* There could be some rtx like
     (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
     but with "*.LC1" referring to a V2DI constant vector.  */
  if (GET_MODE (constant) != mode)

      constant = simplify_subreg (mode, constant, GET_MODE (constant),
                                  0);
      if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR)

  rtx first = XVECEXP (constant, 0, 0);

  for (int i = 1; i < nunits; ++i)

      rtx tmp = XVECEXP (constant, 0, i);
      /* Vector duplicate value.  */
      if (!rtx_equal_p (tmp, first))
ix86_expand_vector_move (machine_mode mode, rtx operands[])

  rtx op0 = operands[0], op1 = operands[1];
  /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
     psABI since the biggest alignment is 4 byte for IA MCU psABI.  */
  unsigned int align = (TARGET_IAMCU
                        ? GET_MODE_BITSIZE (mode)
                        : GET_MODE_ALIGNMENT (mode));

  if (push_operand (op0, VOIDmode))
    op0 = emit_move_resolve_push (mode, op0);

  /* Force constants other than zero into memory.  We do not know how
     the instructions used to build constants modify the upper 64 bits
     of the register, once we have that information we may be able
     to handle some of them more efficiently.  */
  if (can_create_pseudo_p ()

          && CONSTANT_P (SUBREG_REG (op1))))
      && ((register_operand (op0, mode)
           && !standard_sse_constant_p (op1, mode))
          /* ix86_expand_vector_move_misalign() does not like constants.  */
          || (SSE_REG_MODE_P (mode)

              && MEM_ALIGN (op0) < align)))

      machine_mode imode = GET_MODE (SUBREG_REG (op1));
      rtx r = force_const_mem (imode, SUBREG_REG (op1));

        r = validize_mem (r);

        r = force_reg (imode, SUBREG_REG (op1));
      op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));

          machine_mode mode = GET_MODE (op0);
          rtx tmp = ix86_convert_const_wide_int_to_broadcast
            (mode, op1);

        op1 = validize_mem (force_const_mem (mode, op1));

  if (can_create_pseudo_p ()
      && GET_MODE_SIZE (mode) >= 16
      && VECTOR_MODE_P (mode)

          && SYMBOL_REF_P (XEXP (op1, 0))
          && CONSTANT_POOL_ADDRESS_P (XEXP (op1, 0))))

      rtx first = ix86_broadcast_from_constant (mode, op1);
      if (first != nullptr)

          /* Broadcast to XMM/YMM/ZMM register from an integer
             constant or scalar mem.  */
          op1 = gen_reg_rtx (mode);
          if (FLOAT_MODE_P (mode)
              || (!TARGET_64BIT && GET_MODE_INNER (mode) == DImode))
            first = force_const_mem (GET_MODE_INNER (mode), first);
          bool ok = ix86_expand_vector_init_duplicate (false, mode,
                                                       op1, first);

          emit_move_insn (op0, op1);

  /* We need to check memory alignment for SSE mode since attribute
     can make operands unaligned.  */
  if (can_create_pseudo_p ()
      && SSE_REG_MODE_P (mode)
      && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
          || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))

      /* ix86_expand_vector_move_misalign() does not like both
         arguments in memory.  */
      if (!register_operand (op0, mode)
          && !register_operand (op1, mode))

          rtx scratch = ix86_gen_scratch_sse_rtx (mode);
          emit_move_insn (scratch, op1);

      tmp[0] = op0; tmp[1] = op1;
      ix86_expand_vector_move_misalign (mode, tmp);

  /* Special case TImode to V1TImode conversions, via V2DI.  */

      && GET_MODE (SUBREG_REG (op1)) == TImode
      && TARGET_64BIT && TARGET_SSE
      && can_create_pseudo_p ())

      rtx tmp = gen_reg_rtx (V2DImode);
      rtx lo = gen_reg_rtx (DImode);
      rtx hi = gen_reg_rtx (DImode);
      emit_move_insn (lo, gen_lowpart (DImode, SUBREG_REG (op1)));
      emit_move_insn (hi, gen_highpart (DImode, SUBREG_REG (op1)));
      emit_insn (gen_vec_concatv2di (tmp, lo, hi));
      emit_move_insn (op0, gen_lowpart (V1TImode, tmp));

  /* If operand0 is a hard register, make operand1 a pseudo.  */
  if (can_create_pseudo_p ()
      && !ix86_hardreg_mov_ok (op0, op1))

      rtx tmp = gen_reg_rtx (GET_MODE (op0));
      emit_move_insn (tmp, op1);
      emit_move_insn (op0, tmp);

  /* Make operand1 a register if it isn't already.  */
  if (can_create_pseudo_p ()
      && !register_operand (op0, mode)
      && !register_operand (op1, mode))

      rtx tmp = ix86_gen_scratch_sse_rtx (GET_MODE (op0));
      emit_move_insn (tmp, op1);
      emit_move_insn (op0, tmp);

  emit_insn (gen_rtx_SET (op0, op1));
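
/* Illustration (added note): the TImode-to-V1TImode special case above
   rewrites

     (set (reg:V1TI x) (subreg:V1TI (reg:TI t) 0))

   as two DImode moves of the TImode halves, a vec_concat into V2DImode,
   and a V1TImode lowpart move, avoiding a spill of the 128-bit pair.  */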
/* Split 32-byte AVX unaligned load and store if needed.  */

ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)

  rtx (*extract) (rtx, rtx, rtx);

  if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
      || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))

      emit_insn (gen_rtx_SET (op0, op1));

  rtx orig_op0 = NULL_RTX;
  mode = GET_MODE (op0);
  switch (GET_MODE_CLASS (mode))

    case MODE_VECTOR_INT:

      if (mode != V32QImode)

          op0 = gen_reg_rtx (V32QImode);

            op0 = gen_lowpart (V32QImode, op0);
          op1 = gen_lowpart (V32QImode, op1);

    case MODE_VECTOR_FLOAT:

      extract = gen_avx_vextractf128v32qi;

      extract = gen_avx_vextractf128v16bf;

      extract = gen_avx_vextractf128v16hf;

      extract = gen_avx_vextractf128v8sf;

      extract = gen_avx_vextractf128v4df;

      rtx r = gen_reg_rtx (mode);
      m = adjust_address (op1, mode, 0);
      emit_move_insn (r, m);
      m = adjust_address (op1, mode, 16);
      r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
      emit_move_insn (op0, r);

  else if (MEM_P (op0))

      m = adjust_address (op0, mode, 0);
      emit_insn (extract (m, op1, const0_rtx));
      m = adjust_address (op0, mode, 16);
      emit_insn (extract (m, copy_rtx (op1), const1_rtx));

    emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
/* Implement the movmisalign patterns for SSE.  Non-SSE modes go
   straight to ix86_expand_vector_move.  */

/* Code generation for scalar reg-reg moves of single and double precision data:
     if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
     if (x86_sse_partial_reg_dependency == true)

   Code generation for scalar loads of double precision data:
     if (x86_sse_split_regs == true)
       movlpd mem, reg      (gas syntax)

   Code generation for unaligned packed loads of single precision data
   (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
     if (x86_sse_unaligned_move_optimal)
     if (x86_sse_partial_reg_dependency == true)

   Code generation for unaligned packed loads of double precision data
   (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
     if (x86_sse_unaligned_move_optimal)
     if (x86_sse_split_regs == true)  */

ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])

  /* Use unaligned load/store for AVX512 or when optimizing for size.  */
  if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())

      emit_insn (gen_rtx_SET (op0, op1));

  if (GET_MODE_SIZE (mode) == 32)
    ix86_avx256_split_vector_move_misalign (op0, op1);

    /* Always use 128-bit mov<mode>_internal pattern for AVX.  */
    emit_insn (gen_rtx_SET (op0, op1));

  if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
      || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)

      emit_insn (gen_rtx_SET (op0, op1));

  /* ??? If we have typed data, then it would appear that using
     movdqu is the only way to get unaligned data loaded with
     integer type.  */
  if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)

      emit_insn (gen_rtx_SET (op0, op1));

      if (TARGET_SSE2 && mode == V2DFmode)

          /* When SSE registers are split into halves, we can avoid
             writing to the top half twice.  */
          if (TARGET_SSE_SPLIT_REGS)

              /* ??? Not sure about the best option for the Intel chips.
                 The following would seem to satisfy; the register is
                 entirely cleared, breaking the dependency chain.  We
                 then store to the upper half, with a dependency depth
                 of one.  A rumor has it that Intel recommends two movsd
                 followed by an unpacklpd, but this is unconfirmed.  And
                 given that the dependency depth of the unpacklpd would
                 still be one, I'm not sure why this would be better.  */
              zero = CONST0_RTX (V2DFmode);

          m = adjust_address (op1, DFmode, 0);
          emit_insn (gen_sse2_loadlpd (op0, zero, m));
          m = adjust_address (op1, DFmode, 8);
          emit_insn (gen_sse2_loadhpd (op0, op0, m));

          if (mode != V4SFmode)
            t = gen_reg_rtx (V4SFmode);

          if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
            emit_move_insn (t, CONST0_RTX (V4SFmode));

          m = adjust_address (op1, V2SFmode, 0);
          emit_insn (gen_sse_loadlps (t, t, m));
          m = adjust_address (op1, V2SFmode, 8);
          emit_insn (gen_sse_loadhps (t, t, m));
          if (mode != V4SFmode)
            emit_move_insn (op0, gen_lowpart (mode, t));

  else if (MEM_P (op0))

      if (TARGET_SSE2 && mode == V2DFmode)

          m = adjust_address (op0, DFmode, 0);
          emit_insn (gen_sse2_storelpd (m, op1));
          m = adjust_address (op0, DFmode, 8);
          emit_insn (gen_sse2_storehpd (m, op1));

          if (mode != V4SFmode)
            op1 = gen_lowpart (V4SFmode, op1);

          m = adjust_address (op0, V2SFmode, 0);
          emit_insn (gen_sse_storelps (m, op1));
          m = adjust_address (op0, V2SFmode, 8);
          emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
/* Move bits 64:95 to bits 32:63.  */

ix86_move_vector_high_sse_to_mmx (rtx op)

  rtx mask = gen_rtx_PARALLEL (VOIDmode,
                               gen_rtvec (4, GEN_INT (0), GEN_INT (2),
                                          GEN_INT (0), GEN_INT (0)));
  rtx dest = lowpart_subreg (V4SImode, op, GET_MODE (op));
  op = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
  rtx insn = gen_rtx_SET (dest, op);
/* Split MMX pack with signed/unsigned saturation with SSE/SSE2.  */

ix86_split_mmx_pack (rtx operands[], enum rtx_code code)

  rtx op0 = operands[0];
  rtx op1 = operands[1];
  rtx op2 = operands[2];

  machine_mode dmode = GET_MODE (op0);
  machine_mode smode = GET_MODE (op1);
  machine_mode inner_dmode = GET_MODE_INNER (dmode);
  machine_mode inner_smode = GET_MODE_INNER (smode);

  /* Get the corresponding SSE mode for destination.  */
  int nunits = 16 / GET_MODE_SIZE (inner_dmode);
  machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode),
                                            nunits).require ();
  machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode),
                                                 nunits / 2).require ();

  /* Get the corresponding SSE mode for source.  */
  nunits = 16 / GET_MODE_SIZE (inner_smode);
  machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode),
                                            nunits).require ();

  /* Generate SSE pack with signed/unsigned saturation.  */
  rtx dest = lowpart_subreg (sse_dmode, op0, GET_MODE (op0));
  op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
  op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));

  op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
  op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
  rtx insn = gen_rtx_SET (dest, gen_rtx_VEC_CONCAT (sse_dmode,
                                                    op1, op2));

  ix86_move_vector_high_sse_to_mmx (op0);
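
/* Example (added for illustration): for a 64-bit packsswb, op0 is
   V8QImode and op1/op2 are V4HImode, so sse_dmode is V16QImode,
   sse_half_dmode is V8QImode and sse_smode is V8HImode.  The emitted
   SSE2 pack is roughly

     (set (subreg:V16QI (reg:V8QI op0) 0)
          (vec_concat:V16QI (ss_truncate:V8QI (subreg:V8HI op1 0))
                            (ss_truncate:V8QI (subreg:V8HI op2 0))))

   after which bits 64:95 are shuffled down to 32:63 so the V8QI result
   sits in the low half.  */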
/* Split MMX punpcklXX/punpckhXX with SSE punpcklXX.  */

ix86_split_mmx_punpck (rtx operands[], bool high_p)

  rtx op0 = operands[0];
  rtx op1 = operands[1];
  rtx op2 = operands[2];
  machine_mode mode = GET_MODE (op0);

  /* The corresponding SSE mode.  */
  machine_mode sse_mode, double_sse_mode;

      sse_mode = V16QImode;
      double_sse_mode = V32QImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
                               gen_rtvec (16,
                                          GEN_INT (0), GEN_INT (16),
                                          GEN_INT (1), GEN_INT (17),
                                          GEN_INT (2), GEN_INT (18),
                                          GEN_INT (3), GEN_INT (19),
                                          GEN_INT (4), GEN_INT (20),
                                          GEN_INT (5), GEN_INT (21),
                                          GEN_INT (6), GEN_INT (22),
                                          GEN_INT (7), GEN_INT (23)));

      sse_mode = V8HImode;
      double_sse_mode = V16HImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
                               gen_rtvec (8,
                                          GEN_INT (0), GEN_INT (8),
                                          GEN_INT (1), GEN_INT (9),
                                          GEN_INT (2), GEN_INT (10),
                                          GEN_INT (3), GEN_INT (11)));

      sse_mode = V4SImode;
      double_sse_mode = V8SImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
                               gen_rtvec (4,
                                          GEN_INT (0), GEN_INT (4),
                                          GEN_INT (1), GEN_INT (5)));

      sse_mode = V4SFmode;
      double_sse_mode = V8SFmode;
      mask = gen_rtx_PARALLEL (VOIDmode,
                               gen_rtvec (4,
                                          GEN_INT (0), GEN_INT (4),
                                          GEN_INT (1), GEN_INT (5)));

  /* Generate SSE punpcklXX.  */
  rtx dest = lowpart_subreg (sse_mode, op0, GET_MODE (op0));
  op1 = lowpart_subreg (sse_mode, op1, GET_MODE (op1));
  op2 = lowpart_subreg (sse_mode, op2, GET_MODE (op2));

  op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2);
  op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask);
  rtx insn = gen_rtx_SET (dest, op2);

  /* Move high bits to low bits.  */

      if (sse_mode == V4SFmode)

          mask = gen_rtx_PARALLEL (VOIDmode,
                                   gen_rtvec (4, GEN_INT (2), GEN_INT (3),
                                              GEN_INT (4), GEN_INT (5)));
          op2 = gen_rtx_VEC_CONCAT (V8SFmode, dest, dest);
          op1 = gen_rtx_VEC_SELECT (V4SFmode, op2, mask);

          int sz = GET_MODE_SIZE (mode);

            mask = gen_rtx_PARALLEL (VOIDmode,
                                     gen_rtvec (4, GEN_INT (1), GEN_INT (0),
                                                GEN_INT (0), GEN_INT (1)));

            mask = gen_rtx_PARALLEL (VOIDmode,
                                     gen_rtvec (4, GEN_INT (2), GEN_INT (3),
                                                GEN_INT (0), GEN_INT (1)));

          dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
          op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);

      insn = gen_rtx_SET (dest, op1);
/* Helper function of ix86_fixup_binary_operands to canonicalize
   operand order.  Returns true if the operands should be swapped.  */

ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
                             rtx operands[])

  rtx dst = operands[0];
  rtx src1 = operands[1];
  rtx src2 = operands[2];

  /* If the operation is not commutative, we can't do anything.  */
  if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
      && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)

  /* Highest priority is that src1 should match dst.  */
  if (rtx_equal_p (dst, src1))

  if (rtx_equal_p (dst, src2))

  /* Next highest priority is that immediate constants come second.  */
  if (immediate_operand (src2, mode))

  if (immediate_operand (src1, mode))

  /* Lowest priority is that memory references should come second.  */
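
/* Example (added): for an addsi3 with dst = r1, src1 = (const_int 3),
   src2 = r1, src1 does not match dst but dst equals src2, so the
   operands are swapped; the add then becomes the two-address form
   r1 = r1 + 3 without an extra copy.  */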
/* Fix up OPERANDS to satisfy ix86_binary_operator_ok.  Return the
   destination to use for the operation.  If different from the true
   destination in operands[0], a copy operation will be required.  */

ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
                            rtx operands[])

  rtx dst = operands[0];
  rtx src1 = operands[1];
  rtx src2 = operands[2];

  /* Canonicalize operand order.  */
  if (ix86_swap_binary_operands_p (code, mode, operands))

      /* It is invalid to swap operands of different modes.  */
      gcc_assert (GET_MODE (src1) == GET_MODE (src2));

      std::swap (src1, src2);

  /* Both source operands cannot be in memory.  */
  if (MEM_P (src1) && MEM_P (src2))

      /* Optimization: Only read from memory once.  */
      if (rtx_equal_p (src1, src2))

          src2 = force_reg (mode, src2);

      else if (rtx_equal_p (dst, src1))
        src2 = force_reg (mode, src2);

        src1 = force_reg (mode, src1);

  /* If the destination is memory, and we do not have matching source
     operands, do things in registers.  */
  if (MEM_P (dst) && !rtx_equal_p (dst, src1))
    dst = gen_reg_rtx (mode);

  /* Source 1 cannot be a constant.  */
  if (CONSTANT_P (src1))
    src1 = force_reg (mode, src1);

  /* Source 1 cannot be a non-matching memory.  */
  if (MEM_P (src1) && !rtx_equal_p (dst, src1))
    src1 = force_reg (mode, src1);

  /* Improve address combine.  */

      && GET_MODE_CLASS (mode) == MODE_INT

    src2 = force_reg (mode, src2);

/* Similarly, but assume that the destination has already been
   fixed up.  */

ix86_fixup_binary_operands_no_copy (enum rtx_code code,
                                    machine_mode mode, rtx operands[])

  rtx dst = ix86_fixup_binary_operands (code, mode, operands);
  gcc_assert (dst == operands[0]);
/* Attempt to expand a binary operator.  Make the expansion closer to the
   actual machine, than just general_operand, which will allow 3 separate
   memory references (one output, two input) in a single insn.  */

ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
                             rtx operands[])

  rtx src1, src2, dst, op, clob;

  dst = ix86_fixup_binary_operands (code, mode, operands);

  /* Emit the instruction.  */

  op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));

  if (reload_completed

      && !rtx_equal_p (dst, src1))

      /* This is going to be an LEA; avoid splitting it later.  */

      clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));

  /* Fix up the destination if needed.  */
  if (dst != operands[0])
    emit_move_insn (operands[0], dst);
/* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
   the given OPERANDS.  */

ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
                                     rtx operands[])

  rtx op1 = NULL_RTX, op2 = NULL_RTX;
  if (SUBREG_P (operands[1]))

  else if (SUBREG_P (operands[2]))

  /* Optimize (__m128i) d | (__m128i) e and similar code
     when d and e are float vectors into float vector logical
     insn.  In C/C++ without using intrinsics there is no other way
     to express vector logical operation on float vectors than
     to cast them temporarily to integer vectors.  */

      && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
      && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
      && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
      && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
      && SUBREG_BYTE (op1) == 0
      && (GET_CODE (op2) == CONST_VECTOR
          || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
              && SUBREG_BYTE (op2) == 0))
      && can_create_pseudo_p ())

      switch (GET_MODE (SUBREG_REG (op1)))

      dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
      if (GET_CODE (op2) == CONST_VECTOR)

          op2 = gen_lowpart (GET_MODE (dst), op2);
          op2 = force_reg (GET_MODE (dst), op2);

          op2 = SUBREG_REG (operands[2]);
          if (!vector_operand (op2, GET_MODE (dst)))
            op2 = force_reg (GET_MODE (dst), op2);

      op1 = SUBREG_REG (op1);
      if (!vector_operand (op1, GET_MODE (dst)))
        op1 = force_reg (GET_MODE (dst), op1);
      emit_insn (gen_rtx_SET (dst,
                              gen_rtx_fmt_ee (code, GET_MODE (dst),
                                              op1, op2)));
      emit_move_insn (operands[0], gen_lowpart (mode, dst));

  if (!vector_operand (operands[1], mode))
    operands[1] = force_reg (mode, operands[1]);
  if (!vector_operand (operands[2], mode))
    operands[2] = force_reg (mode, operands[2]);
  ix86_fixup_binary_operands_no_copy (code, mode, operands);
  emit_insn (gen_rtx_SET (operands[0],
                          gen_rtx_fmt_ee (code, mode, operands[1],
                                          operands[2])));
/* Return TRUE or FALSE depending on whether the binary operator meets the
   appropriate constraints.  */

ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
                         rtx operands[])

  rtx dst = operands[0];
  rtx src1 = operands[1];
  rtx src2 = operands[2];

  /* Both source operands cannot be in memory.  */
  if ((MEM_P (src1) || bcst_mem_operand (src1, mode))
      && (MEM_P (src2) || bcst_mem_operand (src2, mode)))

  /* Canonicalize operand order for commutative operators.  */
  if (ix86_swap_binary_operands_p (code, mode, operands))
    std::swap (src1, src2);

  /* If the destination is memory, we must have a matching source operand.  */
  if (MEM_P (dst) && !rtx_equal_p (dst, src1))

  /* Source 1 cannot be a constant.  */
  if (CONSTANT_P (src1))

  /* Source 1 cannot be a non-matching memory.  */
  if (MEM_P (src1) && !rtx_equal_p (dst, src1))

  /* Support "andhi/andsi/anddi" as a zero-extending move.  */

              || (TARGET_64BIT && mode == DImode))
          && satisfies_constraint_L (src2));
/* Attempt to expand a unary operator.  Make the expansion closer to the
   actual machine, than just general_operand, which will allow 2 separate
   memory references (one output, one input) in a single insn.  */

ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
                            rtx operands[])

  bool matching_memory = false;
  rtx src, dst, op, clob;

  /* If the destination is memory, and we do not have matching source
     operands, do things in registers.  */

      if (rtx_equal_p (dst, src))
        matching_memory = true;

        dst = gen_reg_rtx (mode);

  /* When source operand is memory, destination must match.  */
  if (MEM_P (src) && !matching_memory)
    src = force_reg (mode, src);

  /* Emit the instruction.  */

  op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));

      clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));

  /* Fix up the destination if needed.  */
  if (dst != operands[0])
    emit_move_insn (operands[0], dst);
/* Predict just emitted jump instruction to be taken with probability PROB.  */

predict_jump (int prob)

  rtx_insn *insn = get_last_insn ();
  gcc_assert (JUMP_P (insn));
  add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
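
/* Usage sketch (added): callers emit a conditional jump and then attach
   the prediction, with PROB scaled by REG_BR_PROB_BASE; e.g. the divmod
   splitter below predicts its range check as a coin flip:

     predict_jump (REG_BR_PROB_BASE * 50 / 100);  */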
/* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
   divisor are within the range [0-255].  */

ix86_split_idivmod (machine_mode mode, rtx operands[],
                    bool unsigned_p)

  rtx_code_label *end_label, *qimode_label;

  rtx scratch, tmp0, tmp1, tmp2;
  rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);

  operands[2] = force_reg (mode, operands[2]);
  operands[3] = force_reg (mode, operands[3]);

  if (GET_MODE (operands[0]) == SImode)

      if (GET_MODE (operands[1]) == SImode)
        gen_divmod4_1 = unsigned_p ? gen_udivmodsi4_1 : gen_divmodsi4_1;

          = unsigned_p ? gen_udivmodsi4_zext_2 : gen_divmodsi4_zext_2;

          = unsigned_p ? gen_udivmodsi4_zext_1 : gen_divmodsi4_zext_1;

      gen_divmod4_1 = unsigned_p ? gen_udivmoddi4_1 : gen_divmoddi4_1;

  end_label = gen_label_rtx ();
  qimode_label = gen_label_rtx ();

  scratch = gen_reg_rtx (mode);

  /* Use 8bit unsigned divmod if dividend and divisor are within
     the range [0-255].  */
  emit_move_insn (scratch, operands[2]);
  scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
                                 scratch, 1, OPTAB_DIRECT);
  emit_insn (gen_test_ccno_1 (mode, scratch, GEN_INT (-0x100)));
  tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
  tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
  tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
                               gen_rtx_LABEL_REF (VOIDmode, qimode_label),
                               pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = qimode_label;

  /* Generate original signed/unsigned divmod.  */
  emit_insn (gen_divmod4_1 (operands[0], operands[1],
                            operands[2], operands[3]));

  /* Branch to the end.  */
  emit_jump_insn (gen_jump (end_label));

  /* Generate 8bit unsigned divide.  */
  emit_label (qimode_label);
  /* Don't use operands[0] for result of 8bit divide since not all
     registers support QImode ZERO_EXTRACT.  */
  tmp0 = lowpart_subreg (HImode, scratch, mode);
  tmp1 = lowpart_subreg (HImode, operands[2], mode);
  tmp2 = lowpart_subreg (QImode, operands[3], mode);
  emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));

      div = gen_rtx_UDIV (mode, operands[2], operands[3]);
      mod = gen_rtx_UMOD (mode, operands[2], operands[3]);

      div = gen_rtx_DIV (mode, operands[2], operands[3]);
      mod = gen_rtx_MOD (mode, operands[2], operands[3]);

  if (GET_MODE (operands[0]) != SImode)
    div = gen_rtx_ZERO_EXTEND (DImode, div);
  if (GET_MODE (operands[1]) != SImode)
    mod = gen_rtx_ZERO_EXTEND (DImode, mod);

  /* Extract remainder from AH.  */
  scratch = gen_lowpart (GET_MODE (operands[1]), scratch);
  tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]), scratch,
                               GEN_INT (8), GEN_INT (8));
  insn = emit_move_insn (operands[1], tmp1);
  set_unique_reg_note (insn, REG_EQUAL, mod);

  /* Zero extend quotient from AL.  */
  tmp1 = gen_lowpart (QImode, tmp0);
  insn = emit_insn (gen_extend_insn
                    (operands[0], tmp1,
                     GET_MODE (operands[0]), QImode, 1));
  set_unique_reg_note (insn, REG_EQUAL, div);

  emit_label (end_label);
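
/* For reference (added; register names and labels are schematic, not
   taken from the source): the emitted control flow approximates

        mov     op2, scratch
        or      op3, scratch
        test    $-0x100, scratch        ; any bit above bit 7 set?
        je      .Lqimode                ; no - both fit in [0-255]
        <full 32/64-bit divide>
        jmp     .Lend
     .Lqimode:
        <8-bit unsigned divide: AL = quotient, AH = remainder>
     .Lend:

   with REG_EQUAL notes recording the original DIV/MOD results.  */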
/* Emit x86 binary operand CODE in mode MODE, where the first operand
   matches destination.  RTX includes clobber of FLAGS_REG.  */

ix86_emit_binop (enum rtx_code code, machine_mode mode,
                 rtx dst, rtx src)

  op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
  clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));

  emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
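
/* Illustration (added): ix86_emit_binop (PLUS, SImode, target, x)
   emits the two-address ALU form

     (parallel [(set (reg:SI target) (plus:SI (reg:SI target) x))
                (clobber (reg:CC flags))])

   i.e. the flags clobber that every x86 arithmetic insn carries.  */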
/* Return true if regno1 def is nearest to the insn.  */

find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)

  rtx_insn *prev = insn;
  rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));

  while (prev && prev != start)

      if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))

          prev = PREV_INSN (prev);

      if (insn_defines_reg (regno1, INVALID_REGNUM, prev))

      else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))

      prev = PREV_INSN (prev);

  /* None of the regs is defined in the bb.  */

/* INSN_UID of the last insn emitted by zero store peephole2s.  */
int ix86_last_zero_store_uid;
/* Split lea instructions into a sequence of instructions
   which are executed on ALU to avoid AGU stalls.
   It is assumed that it is allowed to clobber flags register
   at lea position.  */

ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)

  unsigned int regno0, regno1, regno2;
  struct ix86_address parts;

  ok = ix86_decompose_address (operands[1], &parts);

  target = gen_lowpart (mode, operands[0]);

  regno0 = true_regnum (target);
  regno1 = INVALID_REGNUM;
  regno2 = INVALID_REGNUM;

      parts.base = gen_lowpart (mode, parts.base);
      regno1 = true_regnum (parts.base);

      parts.index = gen_lowpart (mode, parts.index);
      regno2 = true_regnum (parts.index);

    parts.disp = gen_lowpart (mode, parts.disp);

  if (parts.scale > 1)

      /* Case r1 = r1 + ...  */
      if (regno1 == regno0)

          /* If we have a case r1 = r1 + C * r2 then we
             should use multiplication which is very
             expensive.  Assume cost model is wrong if we
             have such case here.  */
          gcc_assert (regno2 != regno0);

          for (adds = parts.scale; adds > 0; adds--)
            ix86_emit_binop (PLUS, mode, target, parts.index);

          /* r1 = r2 + r3 * C case.  Need to move r3 into r1.  */
          if (regno0 != regno2)
            emit_insn (gen_rtx_SET (target, parts.index));

          /* Use shift for scaling, but emit it as MULT instead
             to avoid it being immediately peephole2 optimized back
             into lea.  */
          ix86_emit_binop (MULT, mode, target, GEN_INT (parts.scale));

            ix86_emit_binop (PLUS, mode, target, parts.base);

          if (parts.disp && parts.disp != const0_rtx)
            ix86_emit_binop (PLUS, mode, target, parts.disp);

  else if (!parts.base && !parts.index)

      gcc_assert (parts.disp);
      emit_insn (gen_rtx_SET (target, parts.disp));

          if (regno0 != regno2)
            emit_insn (gen_rtx_SET (target, parts.index));

      else if (!parts.index)

          if (regno0 != regno1)
            emit_insn (gen_rtx_SET (target, parts.base));

          if (regno0 == regno1)

          else if (regno0 == regno2)

              /* Find better operand for SET instruction, depending
                 on which definition is farther from the insn.  */
              if (find_nearest_reg_def (insn, regno1, regno2))
                tmp = parts.index, tmp1 = parts.base;

                tmp = parts.base, tmp1 = parts.index;

              emit_insn (gen_rtx_SET (target, tmp));

          if (parts.disp && parts.disp != const0_rtx)
            ix86_emit_binop (PLUS, mode, target, parts.disp);

          ix86_emit_binop (PLUS, mode, target, tmp1);

        ix86_emit_binop (PLUS, mode, target, tmp);

      if (parts.disp && parts.disp != const0_rtx)
        ix86_emit_binop (PLUS, mode, target, parts.disp);
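
/* Example (added; registers are schematic): splitting

     lea 0x4(%rbx,%rcx,2), %rax

   where all three registers differ yields roughly

     mov %rcx, %rax             ; move index into the destination
     add %rax, %rax             ; scale 2, emitted as MULT, later a shift
     add %rbx, %rax             ; add the base
     add $0x4, %rax             ; add the displacement

   trading the AGU-bound lea for a chain of ALU instructions.  */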
/* Post-reload splitter for converting an SF or DFmode value in an
   SSE register into an unsigned SImode.  */

ix86_split_convert_uns_si_sse (rtx operands[])

  machine_mode vecmode;
  rtx value, large, zero_or_two31, input, two31, x;

  large = operands[1];
  zero_or_two31 = operands[2];
  input = operands[3];
  two31 = operands[4];
  vecmode = GET_MODE (large);
  value = gen_rtx_REG (vecmode, REGNO (operands[0]));

  /* Load up the value into the low element.  We must ensure that the other
     elements are valid floats -- zero is the easiest such value.  */

      if (vecmode == V4SFmode)
        emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));

        emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));

      input = gen_rtx_REG (vecmode, REGNO (input));
      emit_move_insn (value, CONST0_RTX (vecmode));
      if (vecmode == V4SFmode)
        emit_insn (gen_sse_movss_v4sf (value, value, input));

        emit_insn (gen_sse2_movsd_v2df (value, value, input));

  emit_move_insn (large, two31);
  emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);

  x = gen_rtx_fmt_ee (LE, vecmode, large, value);
  emit_insn (gen_rtx_SET (large, x));

  x = gen_rtx_AND (vecmode, zero_or_two31, large);
  emit_insn (gen_rtx_SET (zero_or_two31, x));

  x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
  emit_insn (gen_rtx_SET (value, x));

  large = gen_rtx_REG (V4SImode, REGNO (large));
  emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));

  x = gen_rtx_REG (V4SImode, REGNO (value));
  if (vecmode == V4SFmode)
    emit_insn (gen_fix_truncv4sfv4si2 (x, value));

    emit_insn (gen_sse2_cvttpd2dq (x, value));

  emit_insn (gen_xorv4si3 (value, value, large));

static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
                                                 machine_mode mode, rtx target,
                                                 rtx var, int one_var);
/* Convert an unsigned DImode value into a DFmode, using only SSE.
   Expects the 64-bit DImode to be supplied in a pair of integral
   registers.  Requires SSE2; will use SSE3 if available.  For x86_32,
   -mfpmath=sse, !optimize_size only.  */

ix86_expand_convert_uns_didf_sse (rtx target, rtx input)

  REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
  rtx int_xmm, fp_xmm;
  rtx biases, exponents;

  int_xmm = gen_reg_rtx (V4SImode);
  if (TARGET_INTER_UNIT_MOVES_TO_VEC)
    emit_insn (gen_movdi_to_sse (int_xmm, input));
  else if (TARGET_SSE_SPLIT_REGS)

      emit_clobber (int_xmm);
      emit_move_insn (gen_lowpart (DImode, int_xmm), input);

      x = gen_reg_rtx (V2DImode);
      ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
      emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));

  x = gen_rtx_CONST_VECTOR (V4SImode,
                            gen_rtvec (4, GEN_INT (0x43300000UL),
                                       GEN_INT (0x45300000UL),
                                       const0_rtx, const0_rtx));
  exponents = validize_mem (force_const_mem (V4SImode, x));

  /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
  emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));

  /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
     yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
     Similarly (0x45300000UL ## fp_value_hi_xmm) yields
     (0x1.0p84 + double(fp_value_hi_xmm)).
     Note these exponents differ by 32.  */

  fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));

  /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
     in [0,2**32-1] and [0]+[2**32,2**64-1] respectively.  */
  real_ldexp (&bias_lo_rvt, &dconst1, 52);
  real_ldexp (&bias_hi_rvt, &dconst1, 84);
  biases = const_double_from_real_value (bias_lo_rvt, DFmode);
  x = const_double_from_real_value (bias_hi_rvt, DFmode);
  biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
  biases = validize_mem (force_const_mem (V2DFmode, biases));
  emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));

  /* Add the upper and lower DFmode values together.  */

    emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));

      x = copy_to_mode_reg (V2DFmode, fp_xmm);
      emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
      emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));

  ix86_expand_vector_extract (false, target, fp_xmm, 0);

/* Not used, but eases macroization of patterns.  */

ix86_expand_convert_uns_sixf_sse (rtx, rtx)

static rtx ix86_expand_sse_fabs (rtx op0, rtx *smask);
/* Convert an unsigned SImode value into a DFmode.  Only currently used
   for SSE, but applicable anywhere.  */

ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)

  REAL_VALUE_TYPE TWO31r;

  x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
                           NULL, 1, OPTAB_DIRECT);

  fp = gen_reg_rtx (DFmode);
  emit_insn (gen_floatsidf2 (fp, x));

  real_ldexp (&TWO31r, &dconst1, 31);
  x = const_double_from_real_value (TWO31r, DFmode);

  x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);

  /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
  if (HONOR_SIGNED_ZEROS (DFmode) && flag_rounding_math)
    x = ix86_expand_sse_fabs (x, NULL);

    emit_move_insn (target, x);
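
/* The identity behind the routine above (added note): adding INT_MIN
   wraps the unsigned input u into the signed value u - 2**31 in
   [-2**31, 2**31), which floatsidf2 converts exactly; adding the
   constant 0x1.0p31 back then yields (double) u for every u in
   [0, 2**32).  */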
/* Convert a signed DImode value into a DFmode.  Only used for SSE in
   32-bit mode; otherwise we have a direct convert instruction.  */

ix86_expand_convert_sign_didf_sse (rtx target, rtx input)

  REAL_VALUE_TYPE TWO32r;
  rtx fp_lo, fp_hi, x;

  fp_lo = gen_reg_rtx (DFmode);
  fp_hi = gen_reg_rtx (DFmode);

  emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));

  real_ldexp (&TWO32r, &dconst1, 32);
  x = const_double_from_real_value (TWO32r, DFmode);
  fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);

  ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));

  x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
                           0, OPTAB_DIRECT);

    emit_move_insn (target, x);
/* Convert an unsigned SImode value into a SFmode, using only SSE.
   For x86_32, -mfpmath=sse, !optimize_size only.  */

ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)

  REAL_VALUE_TYPE ONE16r;
  rtx fp_hi, fp_lo, int_hi, int_lo, x;

  real_ldexp (&ONE16r, &dconst1, 16);
  x = const_double_from_real_value (ONE16r, SFmode);
  int_lo = expand_simple_binop (SImode, AND, input, GEN_INT (0xffff),
                                NULL, 0, OPTAB_DIRECT);
  int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT (16),
                                NULL, 0, OPTAB_DIRECT);
  fp_hi = gen_reg_rtx (SFmode);
  fp_lo = gen_reg_rtx (SFmode);
  emit_insn (gen_floatsisf2 (fp_hi, int_hi));
  emit_insn (gen_floatsisf2 (fp_lo, int_lo));

      x = validize_mem (force_const_mem (SFmode, x));
      fp_hi = gen_rtx_FMA (SFmode, fp_hi, x, fp_lo);
      emit_move_insn (target, fp_hi);

      fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
                                   0, OPTAB_DIRECT);
      fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
                                   0, OPTAB_DIRECT);
      if (!rtx_equal_p (target, fp_hi))
        emit_move_insn (target, fp_hi);
/* floatunsv{4,8}siv{4,8}sf2 expander.  Expand code to convert
   a vector of unsigned ints VAL to vector of floats TARGET.  */

ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)

  REAL_VALUE_TYPE TWO16r;
  machine_mode intmode = GET_MODE (val);
  machine_mode fltmode = GET_MODE (target);
  rtx (*cvt) (rtx, rtx);

  if (intmode == V4SImode)
    cvt = gen_floatv4siv4sf2;

    cvt = gen_floatv8siv8sf2;
  tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
  tmp[0] = force_reg (intmode, tmp[0]);
  tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
                                OPTAB_DIRECT);
  tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
                                NULL_RTX, 1, OPTAB_DIRECT);
  tmp[3] = gen_reg_rtx (fltmode);
  emit_insn (cvt (tmp[3], tmp[1]));
  tmp[4] = gen_reg_rtx (fltmode);
  emit_insn (cvt (tmp[4], tmp[2]));
  real_ldexp (&TWO16r, &dconst1, 16);
  tmp[5] = const_double_from_real_value (TWO16r, SFmode);
  tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));

      tmp[6] = gen_rtx_FMA (fltmode, tmp[4], tmp[5], tmp[3]);
      emit_move_insn (target, tmp[6]);

      tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5],
                                    NULL_RTX, 1, OPTAB_DIRECT);
      tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6],
                                    target, 1, OPTAB_DIRECT);
      if (tmp[7] != target)
        emit_move_insn (target, tmp[7]);
/* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
   pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
   This is done by doing just signed conversion if < 0x1p31, and otherwise by
   subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards.  */

ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)

  REAL_VALUE_TYPE TWO31r;

  machine_mode mode = GET_MODE (val);
  machine_mode scalarmode = GET_MODE_INNER (mode);
  machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
  rtx (*cmp) (rtx, rtx, rtx, rtx);

  for (i = 0; i < 3; i++)
    tmp[i] = gen_reg_rtx (mode);
  real_ldexp (&TWO31r, &dconst1, 31);
  two31r = const_double_from_real_value (TWO31r, scalarmode);
  two31r = ix86_build_const_vector (mode, 1, two31r);
  two31r = force_reg (mode, two31r);

    case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
    case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
    case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
    case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
    default: gcc_unreachable ();

  tmp[3] = gen_rtx_LE (mode, two31r, val);
  emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
  tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
                                0, OPTAB_DIRECT);
  if (intmode == V4SImode || TARGET_AVX2)
    *xorp = expand_simple_binop (intmode, ASHIFT,
                                 gen_lowpart (intmode, tmp[0]),
                                 GEN_INT (31), NULL_RTX, 0,
                                 OPTAB_DIRECT);

      rtx two31 = gen_int_mode (HOST_WIDE_INT_1U << 31, SImode);
      two31 = ix86_build_const_vector (intmode, 1, two31);
      *xorp = expand_simple_binop (intmode, AND,
                                   gen_lowpart (intmode, tmp[0]),
                                   two31, NULL_RTX, 0, OPTAB_DIRECT);

  return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
                              0, OPTAB_DIRECT);
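
/* Worked example (added): for a V4SFmode lane x = 3e9 (>= 0x1p31), the
   mask compare selects two31r, so the lane is converted as x - 2**31
   and *xorp contributes 0x80000000 to flip the sign bit of the result
   back, matching fixuns_trunc; lanes below 0x1p31 subtract zero and
   xor with zero, i.e. take the plain signed path.  */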
/* Generate code for floating point ABS or NEG.  */

ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
                                rtx operands[])

  bool use_sse = false;
  bool vector_mode = VECTOR_MODE_P (mode);
  machine_mode vmode = mode;

  if (vector_mode || mode == TFmode || mode == HFmode)

  else if (TARGET_SSE_MATH)

      use_sse = SSE_FLOAT_MODE_P (mode);

  else if (mode == DFmode)

  set = gen_rtx_fmt_e (code, mode, src);
  set = gen_rtx_SET (dst, set);

      rtx mask, use, clob;

      /* NEG and ABS performed with SSE use bitwise mask operations.
         Create the appropriate mask now.  */
      mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
      use = gen_rtx_USE (VOIDmode, mask);
      if (vector_mode || mode == TFmode)
        par = gen_rtvec (2, set, use);

          clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
          par = gen_rtvec (3, set, use, clob);

      /* Changing of sign for FP values is doable using integer unit too.  */
      clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      par = gen_rtvec (2, set, clob);

  emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
/* Deconstruct a floating point ABS or NEG operation
   with integer registers into integer operations.  */

ix86_split_fp_absneg_operator (enum rtx_code code, machine_mode mode,
                               rtx operands[])

  enum rtx_code absneg_op;

  gcc_assert (operands_match_p (operands[0], operands[1]));

      dst = gen_lowpart (SImode, operands[0]);

          set = gen_int_mode (0x7fffffff, SImode);

          set = gen_int_mode (0x80000000, SImode);

      set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);

          dst = gen_lowpart (DImode, operands[0]);
          dst = gen_rtx_ZERO_EXTRACT (DImode, dst, const1_rtx, GEN_INT (63));

          set = gen_rtx_NOT (DImode, dst);

          dst = gen_highpart (SImode, operands[0]);

              set = gen_int_mode (0x7fffffff, SImode);

              set = gen_int_mode (0x80000000, SImode);

          set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);

      dst = gen_rtx_REG (SImode,
                         REGNO (operands[0]) + (TARGET_64BIT ? 1 : 2));

          set = GEN_INT (0x7fff);

          set = GEN_INT (0x8000);

      set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);

  set = gen_rtx_SET (dst, set);

  rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
  rtvec par = gen_rtvec (2, set, clob);

  emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
/* Expand a copysign operation.  Special case operand 0 being a constant.  */

ix86_expand_copysign (rtx operands[])

  machine_mode mode, vmode;
  rtx dest, vdest, op0, op1, mask, op2, op3;

  mode = GET_MODE (operands[0]);

  else if (mode == SFmode)

  else if (mode == DFmode)

  else if (mode == TFmode)

  if (rtx_equal_p (operands[1], operands[2]))

      emit_move_insn (operands[0], operands[1]);

  vdest = lowpart_subreg (vmode, dest, mode);
  if (vdest == NULL_RTX)
    vdest = gen_reg_rtx (vmode);

  op1 = lowpart_subreg (vmode, force_reg (mode, operands[2]), mode);
  mask = ix86_build_signbit_mask (vmode, 0, 0);

  if (CONST_DOUBLE_P (operands[1]))

      op0 = simplify_unary_operation (ABS, mode, operands[1], mode);
      /* Optimize for 0, simplify b = copysignf (0.0f, a) to b = mask & a.  */
      if (op0 == CONST0_RTX (mode))

          emit_move_insn (vdest, gen_rtx_AND (vmode, mask, op1));

            emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));

      if (GET_MODE_SIZE (mode) < 16)
        op0 = ix86_build_const_vector (vmode, false, op0);
      op0 = force_reg (vmode, op0);

    op0 = lowpart_subreg (vmode, force_reg (mode, operands[1]), mode);

  op2 = gen_reg_rtx (vmode);
  op3 = gen_reg_rtx (vmode);
  emit_move_insn (op2, gen_rtx_AND (vmode,
                                    gen_rtx_NOT (vmode, mask),
                                    op0));
  emit_move_insn (op3, gen_rtx_AND (vmode, mask, op1));
  emit_move_insn (vdest, gen_rtx_IOR (vmode, op2, op3));

    emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
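
/* Bit-level sketch (added note): with mask holding only the sign bit,
   the general path above computes

     op2 = ~mask & op0          ; magnitude of operands[1]
     op3 =  mask & op1          ; sign of operands[2]
     vdest = op2 | op3

   the classic branch-free copysign.  */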
/* Expand an xorsign operation.  */

ix86_expand_xorsign (rtx operands[])

  machine_mode mode, vmode;
  rtx dest, vdest, op0, op1, mask, x, temp;

  mode = GET_MODE (dest);

  else if (mode == SFmode)

  else if (mode == DFmode)

  temp = gen_reg_rtx (vmode);
  mask = ix86_build_signbit_mask (vmode, 0, 0);

  op1 = lowpart_subreg (vmode, force_reg (mode, op1), mode);
  x = gen_rtx_AND (vmode, op1, mask);
  emit_insn (gen_rtx_SET (temp, x));

  op0 = lowpart_subreg (vmode, force_reg (mode, op0), mode);
  x = gen_rtx_XOR (vmode, temp, op0);

  vdest = lowpart_subreg (vmode, dest, mode);
  if (vdest == NULL_RTX)
    vdest = gen_reg_rtx (vmode);

  emit_insn (gen_rtx_SET (vdest, x));

  emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));

static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1);
2335 ix86_expand_branch (enum rtx_code code
, rtx op0
, rtx op1
, rtx label
)
2337 machine_mode mode
= GET_MODE (op0
);
2340 /* Handle special case - vector comparsion with boolean result, transform
2341 it using ptest instruction. */
2342 if (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
2345 rtx flag
= gen_rtx_REG (CCZmode
, FLAGS_REG
);
2346 machine_mode p_mode
= GET_MODE_SIZE (mode
) == 32 ? V4DImode
: V2DImode
;
2348 gcc_assert (code
== EQ
|| code
== NE
);
2352 op0
= lowpart_subreg (p_mode
, force_reg (mode
, op0
), mode
);
2353 op1
= lowpart_subreg (p_mode
, force_reg (mode
, op1
), mode
);
2356 /* Generate XOR since we can't check that one operand is zero vector. */
2357 tmp
= gen_reg_rtx (mode
);
2358 emit_insn (gen_rtx_SET (tmp
, gen_rtx_XOR (mode
, op0
, op1
)));
2359 tmp
= gen_lowpart (p_mode
, tmp
);
2360 emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode
, FLAGS_REG
),
2361 gen_rtx_UNSPEC (CCmode
,
2362 gen_rtvec (2, tmp
, tmp
),
2364 tmp
= gen_rtx_fmt_ee (code
, VOIDmode
, flag
, const0_rtx
);
2365 tmp
= gen_rtx_IF_THEN_ELSE (VOIDmode
, tmp
,
2366 gen_rtx_LABEL_REF (VOIDmode
, label
),
2368 emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp
));
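
  /* Illustrative sequence for a V2DImode equality branch (SSE4.1 ptest
     assumed, registers are placeholders):
	pxor	%xmm1, %xmm0	# tmp = op0 ^ op1
	ptest	%xmm0, %xmm0	# ZF = (tmp == 0)
	je	.Llabel		# or jne for NE  */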
  switch (mode)
    {
    case E_SFmode:
    case E_DFmode:
    case E_XFmode:
    case E_HFmode:
    case E_BFmode:
    case E_QImode:
    case E_HImode:
    case E_SImode:
      simple:
      tmp = ix86_expand_compare (code, op0, op1);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
				  gen_rtx_LABEL_REF (VOIDmode, label),
				  pc_rtx);
      emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      return;

    case E_DImode:
      if (TARGET_64BIT)
	goto simple;
      /* FALLTHRU */
    case E_TImode:
      /* DI and TI mode equality/inequality comparisons may be performed
	 on SSE registers.  Avoid splitting them, except when optimizing
	 for size.  */
      if ((code == EQ || code == NE)
	  && !optimize_insn_for_size_p ())
	goto simple;
      /* Expand DImode branch into multiple compare+branch.  */
      {
	rtx lo[2], hi[2];
	rtx_code_label *label2;
	enum rtx_code code1, code2, code3;
	machine_mode submode;

	if (CONSTANT_P (op0) && !CONSTANT_P (op1))
	  {
	    std::swap (op0, op1);
	    code = swap_condition (code);
	  }

	split_double_mode (mode, &op0, 1, lo+0, hi+0);
	split_double_mode (mode, &op1, 1, lo+1, hi+1);

	submode = mode == DImode ? SImode : DImode;

	/* If we are doing less-than or greater-or-equal-than,
	   op1 is a constant and the low word is zero, then we can just
	   examine the high word.  Similarly for low word -1 and
	   less-or-equal-than or greater-than.  */

	if (CONST_INT_P (hi[1]))
	  switch (code)
	    {
	    case LT: case LTU: case GE: case GEU:
	      if (lo[1] == const0_rtx)
		{
		  ix86_expand_branch (code, hi[0], hi[1], label);
		  return;
		}
	      break;
	    case LE: case LEU: case GT: case GTU:
	      if (lo[1] == constm1_rtx)
		{
		  ix86_expand_branch (code, hi[0], hi[1], label);
		  return;
		}
	      break;
	    default:
	      break;
	    }

	/* Emulate comparisons that do not depend on Zero flag with
	   double-word subtraction.  Note that only Overflow, Sign
	   and Carry flags are valid, so swap arguments and condition
	   of comparisons that would otherwise test Zero flag.  */

	switch (code)
	  {
	  case LE: case LEU: case GT: case GTU:
	    std::swap (lo[0], lo[1]);
	    std::swap (hi[0], hi[1]);
	    code = swap_condition (code);
	    /* FALLTHRU */

	  case LT: case LTU: case GE: case GEU:
	    {
	      bool uns = (code == LTU || code == GEU);
	      rtx (*sbb_insn) (machine_mode, rtx, rtx, rtx)
		= uns ? gen_sub3_carry_ccc : gen_sub3_carry_ccgz;

	      if (!nonimmediate_operand (lo[0], submode))
		lo[0] = force_reg (submode, lo[0]);
	      if (!x86_64_general_operand (lo[1], submode))
		lo[1] = force_reg (submode, lo[1]);

	      if (!register_operand (hi[0], submode))
		hi[0] = force_reg (submode, hi[0]);
	      if ((uns && !nonimmediate_operand (hi[1], submode))
		  || (!uns && !x86_64_general_operand (hi[1], submode)))
		hi[1] = force_reg (submode, hi[1]);

	      emit_insn (gen_cmp_1 (submode, lo[0], lo[1]));

	      tmp = gen_rtx_SCRATCH (submode);
	      emit_insn (sbb_insn (submode, tmp, hi[0], hi[1]));

	      tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
	      ix86_expand_branch (code, tmp, const0_rtx, label);
	      return;
	    }
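
	    /* Illustrative 32-bit output of the path above for a signed
	       DImode "a < b" branch (register names are placeholders):
		cmpl	lo(b), lo(a)	# compare low words, set borrow
		sbbl	hi(b), hi(a)	# high words minus borrow, discarded
		jl	.Llabel		# decided from OF/SF only; ZF invalid  */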
	  default:
	    /* Otherwise, we need two or three jumps.  */

	    label2 = gen_label_rtx ();

	    code1 = code;
	    code2 = swap_condition (code);
	    code3 = unsigned_condition (code);

	    switch (code)
	      {
	      case LT: case GT: case LTU: case GTU:
		break;

	      case LE:   code1 = LT;  code2 = GT;  break;
	      case GE:   code1 = GT;  code2 = LT;  break;
	      case LEU:  code1 = LTU; code2 = GTU; break;
	      case GEU:  code1 = GTU; code2 = LTU; break;

	      case EQ:   code1 = UNKNOWN; code2 = NE;  break;
	      case NE:   code2 = UNKNOWN; break;

	      default:
		gcc_unreachable ();
	      }

	    /*
	     * a < b =>
	     *    if (hi(a) < hi(b)) goto true;
	     *    if (hi(a) > hi(b)) goto false;
	     *    if (lo(a) < lo(b)) goto true;
	     *  false:
	     */

	    if (code1 != UNKNOWN)
	      ix86_expand_branch (code1, hi[0], hi[1], label);
	    if (code2 != UNKNOWN)
	      ix86_expand_branch (code2, hi[0], hi[1], label2);

	    ix86_expand_branch (code3, lo[0], lo[1], label);

	    if (code2 != UNKNOWN)
	      emit_label (label2);
	    return;
	  }
      }

    default:
      gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
      goto simple;
    }
}
/* Figure out whether to use unordered fp comparisons.  */

static bool
ix86_unordered_fp_compare (enum rtx_code code)
{
  if (!TARGET_IEEE_FP)
    return false;

  switch (code)
    {
    case LT:
    case LE:
    case GT:
    case GE:
    case LTGT:
      return false;

    default:
      return true;
    }
}
/* Return a comparison we can do that is equivalent to
   swap_condition (code), apart possibly from orderedness.
   But, never change orderedness if TARGET_IEEE_FP, returning
   UNKNOWN in that case if necessary.  */

static enum rtx_code
ix86_fp_swap_condition (enum rtx_code code)
{
  switch (code)
    {
    case GT:			/* GTU - CF=0 & ZF=0 */
      return TARGET_IEEE_FP ? UNKNOWN : UNLT;
    case GE:			/* GEU - CF=0 */
      return TARGET_IEEE_FP ? UNKNOWN : UNLE;
    case UNLT:			/* LTU - CF=1 */
      return TARGET_IEEE_FP ? UNKNOWN : GT;
    case UNLE:			/* LEU - CF=1 | ZF=1 */
      return TARGET_IEEE_FP ? UNKNOWN : GE;
    default:
      return swap_condition (code);
    }
}
/* Return cost of comparison CODE using the best strategy for performance.
   All following functions use the number of instructions as a cost metric.
   In future this should be tweaked to compute bytes for optimize_size and
   take into account performance of various instructions on various CPUs.  */

static int
ix86_fp_comparison_cost (enum rtx_code code)
{
  int arith_cost;

  /* The cost of code using bit-twiddling on %ah.  */
  switch (code)
    {
    case UNLE:
    case UNLT:
    case LTGT:
    case GT:
    case GE:
    case UNORDERED:
    case ORDERED:
    case UNEQ:
      arith_cost = 4;
      break;

    case LT:
    case NE:
    case EQ:
    case UNGE:
      arith_cost = TARGET_IEEE_FP ? 5 : 4;
      break;

    case LE:
    case UNGT:
      arith_cost = TARGET_IEEE_FP ? 6 : 4;
      break;

    default:
      gcc_unreachable ();
    }

  switch (ix86_fp_comparison_strategy (code))
    {
    case IX86_FPCMP_COMI:
      return arith_cost > 4 ? 3 : 2;
    case IX86_FPCMP_SAHF:
      return arith_cost > 4 ? 4 : 3;
    default:
      return arith_cost;
    }
}
/* Swap, force into registers, or otherwise massage the two operands
   to a fp comparison.  The operands are updated in place; the new
   comparison code is returned.  */

static enum rtx_code
ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
{
  bool unordered_compare = ix86_unordered_fp_compare (code);
  rtx op0 = *pop0, op1 = *pop1;
  machine_mode op_mode = GET_MODE (op0);
  bool is_sse = SSE_FLOAT_MODE_SSEMATH_OR_HF_P (op_mode);

  if (op_mode == BFmode)
    {
      rtx op = gen_lowpart (HImode, op0);
      if (CONST_INT_P (op))
	op = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
					     op0, BFmode);
      else
	{
	  rtx t1 = gen_reg_rtx (SImode);
	  emit_insn (gen_zero_extendhisi2 (t1, op));
	  emit_insn (gen_ashlsi3 (t1, t1, GEN_INT (16)));
	  op = gen_lowpart (SFmode, t1);
	}
      *pop0 = op;

      op = gen_lowpart (HImode, op1);
      if (CONST_INT_P (op))
	op = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
					     op1, BFmode);
      else
	{
	  rtx t1 = gen_reg_rtx (SImode);
	  emit_insn (gen_zero_extendhisi2 (t1, op));
	  emit_insn (gen_ashlsi3 (t1, t1, GEN_INT (16)));
	  op = gen_lowpart (SFmode, t1);
	}
      *pop1 = op;
      return ix86_prepare_fp_compare_args (code, pop0, pop1);
    }
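
  /* Worked example of the BFmode widening above: __bf16 is the high half
     of an IEEE SFmode value, so e.g. the bits 0x3f80 (1.0bf16), zero
     extended to SImode and shifted left by 16, give 0x3f800000 == 1.0f,
     after which the comparison proceeds as a plain SFmode compare.  */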
  /* All of the unordered compare instructions only work on registers.
     The same is true of the fcomi compare instructions.  The XFmode
     compare instructions require registers except when comparing
     against zero or when converting operand 1 from fixed point to
     floating point.  */

  if (!is_sse
      && (unordered_compare
	  || (op_mode == XFmode
	      && ! (standard_80387_constant_p (op0) == 1
		    || standard_80387_constant_p (op1) == 1)
	      && GET_CODE (op1) != FLOAT)
	  || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
    {
      op0 = force_reg (op_mode, op0);
      op1 = force_reg (op_mode, op1);
    }
  else
    {
      /* %%% We only allow op1 in memory; op0 must be st(0).  So swap
	 things around if they appear profitable, otherwise force op0
	 into a register.  */

      if (standard_80387_constant_p (op0) == 0
	  || (MEM_P (op0)
	      && ! (standard_80387_constant_p (op1) == 0
		    || MEM_P (op1))))
	{
	  enum rtx_code new_code = ix86_fp_swap_condition (code);
	  if (new_code != UNKNOWN)
	    {
	      std::swap (op0, op1);
	      code = new_code;
	    }
	}

      if (!REG_P (op0))
	op0 = force_reg (op_mode, op0);

      if (CONSTANT_P (op1))
	{
	  int tmp = standard_80387_constant_p (op1);
	  if (tmp == 0)
	    op1 = validize_mem (force_const_mem (op_mode, op1));
	  else if (tmp == 1)
	    {
	      if (TARGET_CMOVE)
		op1 = force_reg (op_mode, op1);
	    }
	  else
	    op1 = force_reg (op_mode, op1);
	}
    }

  /* Try to rearrange the comparison to make it cheaper.  */
  if (ix86_fp_comparison_cost (code)
      > ix86_fp_comparison_cost (swap_condition (code))
      && (REG_P (op1) || can_create_pseudo_p ()))
    {
      std::swap (op0, op1);
      code = swap_condition (code);
      if (!REG_P (op0))
	op0 = force_reg (op_mode, op0);
    }

  *pop0 = op0;
  *pop1 = op1;
  return code;
}
/* Generate insn patterns to do a floating point compare of OPERANDS.  */

static rtx
ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)
{
  bool unordered_compare = ix86_unordered_fp_compare (code);
  machine_mode cmp_mode;
  rtx tmp, scratch;

  code = ix86_prepare_fp_compare_args (code, &op0, &op1);

  tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
  if (unordered_compare)
    tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);

  /* Do fcomi/sahf based test when profitable.  */
  switch (ix86_fp_comparison_strategy (code))
    {
    case IX86_FPCMP_COMI:
      cmp_mode = CCFPmode;
      emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
      break;

    case IX86_FPCMP_SAHF:
      cmp_mode = CCFPmode;
      tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
      scratch = gen_reg_rtx (HImode);
      emit_insn (gen_rtx_SET (scratch, tmp));
      emit_insn (gen_x86_sahf_1 (scratch));
      break;

    case IX86_FPCMP_ARITH:
      cmp_mode = CCNOmode;
      tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
      scratch = gen_reg_rtx (HImode);
      emit_insn (gen_rtx_SET (scratch, tmp));

      /* In the unordered case, we have to check C2 for NaN's, which
	 doesn't happen to work out to anything nice combination-wise.
	 So do some bit twiddling on the value we've got in AH to come
	 up with an appropriate set of condition codes.  */
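
      /* For orientation (a summary, not emitted code): fnstsw stores the
	 FPU status word, whose condition bits C0/C2/C3 land in %ah as
	 0x01, 0x04 and 0x40 respectively; fcom yields C3 C2 C0 = 0 0 0
	 for >, 0 0 1 for <, 1 0 0 for ==, and 1 1 1 for unordered.  The
	 masks 0x45, 0x05, 0x44 etc. below select combinations of those
	 bits.  */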
      switch (code)
	{
	case GT:
	case UNGT:
	  if (code == GT || !TARGET_IEEE_FP)
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
	      code = EQ;
	    }
	  else
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
	      cmp_mode = CCmode;
	      code = GEU;
	    }
	  break;

	case LT:
	case UNLT:
	  if (code == LT && TARGET_IEEE_FP)
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
	      cmp_mode = CCmode;
	      code = EQ;
	    }
	  else
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
	      code = NE;
	    }
	  break;

	case GE:
	case UNGE:
	  if (code == GE || !TARGET_IEEE_FP)
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
	      code = EQ;
	    }
	  else
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
	      code = NE;
	    }
	  break;

	case LE:
	case UNLE:
	  if (code == LE && TARGET_IEEE_FP)
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
	      cmp_mode = CCmode;
	      code = LTU;
	    }
	  else
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
	      code = NE;
	    }
	  break;

	case EQ:
	case UNEQ:
	  if (code == EQ && TARGET_IEEE_FP)
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
	      cmp_mode = CCmode;
	      code = EQ;
	    }
	  else
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
	      code = NE;
	    }
	  break;

	case NE:
	case LTGT:
	  if (code == NE && TARGET_IEEE_FP)
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
					     GEN_INT (0x40)));
	      code = NE;
	    }
	  else
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
	      code = EQ;
	    }
	  break;

	case UNORDERED:
	  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
	  code = NE;
	  break;
	case ORDERED:
	  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
	  code = EQ;
	  break;

	default:
	  gcc_unreachable ();
	}
      break;

    default:
      gcc_unreachable ();
    }

  /* Return the test that should be put into the flags user, i.e.
     the bcc, scc, or cmov instruction.  */
  return gen_rtx_fmt_ee (code, VOIDmode,
			 gen_rtx_REG (cmp_mode, FLAGS_REG),
			 const0_rtx);
}
/* Generate insn patterns to do an integer compare of OPERANDS.  */

static rtx
ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
{
  machine_mode cmpmode;
  rtx tmp, flags;

  /* Swap operands to emit carry flag comparison.  */
  if ((code == GTU || code == LEU)
      && nonimmediate_operand (op1, VOIDmode))
    {
      std::swap (op0, op1);
      code = swap_condition (code);
    }

  cmpmode = SELECT_CC_MODE (code, op0, op1);
  flags = gen_rtx_REG (cmpmode, FLAGS_REG);

  /* This is very simple, but making the interface the same as in the
     FP case makes the rest of the code easier.  */
  tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
  emit_insn (gen_rtx_SET (flags, tmp));

  /* Return the test that should be put into the flags user, i.e.
     the bcc, scc, or cmov instruction.  */
  return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
}
static rtx
ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
{
  rtx ret;

  if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
    ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
  else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
    {
      gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
      ret = ix86_expand_fp_compare (code, op0, op1);
    }
  else
    ret = ix86_expand_int_compare (code, op0, op1);

  return ret;
}
void
ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
{
  rtx ret;

  gcc_assert (GET_MODE (dest) == QImode);

  ret = ix86_expand_compare (code, op0, op1);
  PUT_MODE (ret, QImode);
  emit_insn (gen_rtx_SET (dest, ret));
}
/* Expand floating point op0 <=> op1, i.e.
   dest = op0 == op1 ? 0 : op0 < op1 ? -1 : op0 > op1 ? 1 : 2.  */

void
ix86_expand_fp_spaceship (rtx dest, rtx op0, rtx op1)
{
  gcc_checking_assert (ix86_fp_comparison_strategy (GT) != IX86_FPCMP_ARITH);
  rtx gt = ix86_expand_fp_compare (GT, op0, op1);
  rtx l0 = gen_label_rtx ();
  rtx l1 = gen_label_rtx ();
  rtx l2 = TARGET_IEEE_FP ? gen_label_rtx () : NULL_RTX;
  rtx lend = gen_label_rtx ();
  rtx tmp;
  rtx_insn *jmp;

  if (l2)
    {
      rtx un = gen_rtx_fmt_ee (UNORDERED, VOIDmode,
			       gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, un,
				  gen_rtx_LABEL_REF (VOIDmode, l2), pc_rtx);
      jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      add_reg_br_prob_note (jmp, profile_probability::very_unlikely ());
    }
  rtx eq = gen_rtx_fmt_ee (UNEQ, VOIDmode,
			   gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx);
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, eq,
			      gen_rtx_LABEL_REF (VOIDmode, l0), pc_rtx);
  jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  add_reg_br_prob_note (jmp, profile_probability::unlikely ());
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, gt,
			      gen_rtx_LABEL_REF (VOIDmode, l1), pc_rtx);
  jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  add_reg_br_prob_note (jmp, profile_probability::even ());
  emit_move_insn (dest, constm1_rtx);
  emit_jump (lend);
  emit_label (l0);
  emit_move_insn (dest, const0_rtx);
  emit_jump (lend);
  emit_label (l1);
  emit_move_insn (dest, const1_rtx);
  if (l2)
    {
      emit_jump (lend);
      emit_label (l2);
      emit_move_insn (dest, const2_rtx);
    }
  emit_label (lend);
}
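
/* Control-flow sketch of the expansion above, using the labels created
   here (comi-style compare assumed; probabilities as annotated):
	<compare op0, op1>
	jp	l2		# unordered -> dest = 2 (TARGET_IEEE_FP only)
	je	l0		# equal     -> dest = 0
	ja	l1		# greater   -> dest = 1
	dest = -1; jmp lend	# less
   with l0/l1/l2 storing 0/1/2 respectively and falling through to lend.  */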
/* Expand comparison setting or clearing carry flag.  Return true when
   successful and set pop for the operation.  */

static bool
ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
{
  machine_mode mode
    = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);

  /* Do not handle double-mode compares that go through special path.  */
  if (mode == (TARGET_64BIT ? TImode : DImode))
    return false;

  if (SCALAR_FLOAT_MODE_P (mode))
    {
      rtx compare_op;
      rtx_insn *compare_seq;

      gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));

      /* Shortcut:  following common codes never translate
	 into carry flag compares.  */
      if (code == EQ || code == NE || code == UNEQ || code == LTGT
	  || code == ORDERED || code == UNORDERED)
	return false;

      /* These comparisons require zero flag; swap operands so they won't.  */
      if ((code == GT || code == UNLE || code == LE || code == UNGT)
	  && !TARGET_IEEE_FP)
	{
	  std::swap (op0, op1);
	  code = swap_condition (code);
	}

      /* Try to expand the comparison and verify that we end up with
	 carry flag based comparison.  This fails to be true only when
	 we decide to expand comparison using arithmetic that is not
	 too common scenario.  */
      start_sequence ();
      compare_op = ix86_expand_fp_compare (code, op0, op1);
      compare_seq = get_insns ();
      end_sequence ();

      if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
	code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
      else
	code = GET_CODE (compare_op);

      if (code != LTU && code != GEU)
	return false;

      emit_insn (compare_seq);
      *pop = compare_op;
      return true;
    }

  if (!INTEGRAL_MODE_P (mode))
    return false;

  switch (code)
    {
    case LTU:
    case GEU:
      break;

    /* Convert a==0 into (unsigned)a<1.  */
    case EQ:
    case NE:
      if (op1 != const0_rtx)
	return false;
      op1 = const1_rtx;
      code = (code == EQ ? LTU : GEU);
      break;

    /* Convert a>b into b<a or a>=b-1.  */
    case GTU:
    case LEU:
      if (CONST_INT_P (op1))
	{
	  op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
	  /* Bail out on overflow.  We still can swap operands but that
	     would force loading of the constant into register.  */
	  if (op1 == const0_rtx
	      || !x86_64_immediate_operand (op1, GET_MODE (op1)))
	    return false;
	  code = (code == GTU ? GEU : LTU);
	}
      else
	{
	  std::swap (op0, op1);
	  code = (code == GTU ? LTU : GEU);
	}
      break;

    /* Convert a>=0 into (unsigned)a<0x80000000.  */
    case LT:
    case GE:
      if (mode == DImode || op1 != const0_rtx)
	return false;
      op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
      code = (code == LT ? GEU : LTU);
      break;
    case LE:
    case GT:
      if (mode == DImode || op1 != constm1_rtx)
	return false;
      op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
      code = (code == LE ? GEU : LTU);
      break;

    default:
      return false;
    }

  /* Swapping operands may cause constant to appear as first operand.  */
  if (!nonimmediate_operand (op0, VOIDmode))
    {
      if (!can_create_pseudo_p ())
	return false;
      op0 = force_reg (mode, op0);
    }
  *pop = ix86_expand_compare (code, op0, op1);
  gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
  return true;
}
/* Expand conditional increment or decrement using adc/sbb instructions.
   The default case using setcc followed by the conditional move can be
   done by generic code.  */

bool
ix86_expand_int_addcc (rtx operands[])
{
  enum rtx_code code = GET_CODE (operands[1]);
  rtx flags;
  rtx (*insn) (machine_mode, rtx, rtx, rtx, rtx, rtx);
  rtx compare_op;
  rtx val = const0_rtx;
  bool fpcmp = false;
  machine_mode mode;
  rtx op0 = XEXP (operands[1], 0);
  rtx op1 = XEXP (operands[1], 1);

  if (operands[3] != const1_rtx
      && operands[3] != constm1_rtx)
    return false;
  if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
    return false;
  code = GET_CODE (compare_op);

  flags = XEXP (compare_op, 0);

  if (GET_MODE (flags) == CCFPmode)
    {
      fpcmp = true;
      code = ix86_fp_compare_code_to_integer (code);
    }

  if (code != LTU)
    {
      val = constm1_rtx;
      if (fpcmp)
	PUT_CODE (compare_op,
		  reverse_condition_maybe_unordered
		    (GET_CODE (compare_op)));
      else
	PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
    }

  mode = GET_MODE (operands[0]);

  /* Construct either adc or sbb insn.  */
  if ((code == LTU) == (operands[3] == constm1_rtx))
    insn = gen_sub3_carry;
  else
    insn = gen_add3_carry;

  emit_insn (insn (mode, operands[0], operands[2], val, flags, compare_op));

  return true;
}
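
/* Illustrative output for the conditional increment x = (a <u b) ? x + 1 : x
   (register names are placeholders):
	cmpl	%ebx, %eax	# CF = (a <u b)
	adcl	$0, %ecx	# x += CF
   the decrement variant uses sbb instead, with the condition reversed as
   arranged above.  */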
bool
ix86_expand_int_movcc (rtx operands[])
{
  enum rtx_code code = GET_CODE (operands[1]), compare_code;
  rtx_insn *compare_seq;
  rtx compare_op;
  machine_mode mode = GET_MODE (operands[0]);
  bool sign_bit_compare_p = false;
  bool negate_cc_compare_p = false;
  rtx op0 = XEXP (operands[1], 0);
  rtx op1 = XEXP (operands[1], 1);
  rtx op2 = operands[2];
  rtx op3 = operands[3];

  if (GET_MODE (op0) == TImode
      || (GET_MODE (op0) == DImode
	  && !TARGET_64BIT))
    return false;

  if (GET_MODE (op0) == BFmode
      && !ix86_fp_comparison_operator (operands[1], VOIDmode))
    return false;

  start_sequence ();
  compare_op = ix86_expand_compare (code, op0, op1);
  compare_seq = get_insns ();
  end_sequence ();

  compare_code = GET_CODE (compare_op);

  if ((op1 == const0_rtx && (code == GE || code == LT))
      || (op1 == constm1_rtx && (code == GT || code == LE)))
    sign_bit_compare_p = true;

  /* op0 == op1 ? op0 : op3 is equivalent to op0 == op1 ? op1 : op3,
     but if op1 is a constant, the latter form allows more optimizations,
     either through the last 2 ops being constant handling, or the one
     constant and one variable cases.  On the other side, for cmov the
     former might be better as we don't need to load the constant into
     another register.  */
  if (code == EQ && CONST_INT_P (op1) && rtx_equal_p (op0, op2))
    op2 = op1;
  /* Similarly for op0 != op1 ? op2 : op0 and op0 != op1 ? op2 : op1.  */
  else if (code == NE && CONST_INT_P (op1) && rtx_equal_p (op0, op3))
    op3 = op1;

  /* Don't attempt mode expansion here -- if we had to expand 5 or 6
     HImode insns, we'd be swallowed in word prefix ops.  */

  if ((mode != HImode || TARGET_FAST_PREFIX)
      && (mode != (TARGET_64BIT ? TImode : DImode))
      && CONST_INT_P (op2)
      && CONST_INT_P (op3))
    {
      rtx out = operands[0];
      HOST_WIDE_INT ct = INTVAL (op2);
      HOST_WIDE_INT cf = INTVAL (op3);
      HOST_WIDE_INT diff;

      if ((mode == SImode
	   || (TARGET_64BIT && mode == DImode))
	  && (GET_MODE (op0) == SImode
	      || (TARGET_64BIT && GET_MODE (op0) == DImode)))
	{
	  /* Special case x != 0 ? -1 : y.  */
	  if (code == NE && op1 == const0_rtx && ct == -1)
	    negate_cc_compare_p = true;
	  /* Special case x == 0 ? -1 : y.  */
	  else if (code == EQ && op1 == const0_rtx && cf == -1)
	    negate_cc_compare_p = true;
	}

      diff = ct - cf;
      /* Sign bit compares are better done using shifts than we do by using
	 sbb.  */
      if (sign_bit_compare_p
	  || negate_cc_compare_p
	  || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
	{
	  /* Detect overlap between destination and compare sources.  */
	  rtx tmp = out;

	  if (negate_cc_compare_p)
	    {
	      if (GET_MODE (op0) == DImode)
		emit_insn (gen_x86_negdi_ccc (gen_reg_rtx (DImode), op0));
	      else
		emit_insn (gen_x86_negsi_ccc (gen_reg_rtx (SImode),
					      gen_lowpart (SImode, op0)));

	      tmp = gen_reg_rtx (mode);
	      if (mode == DImode)
		emit_insn (gen_x86_movdicc_0_m1_neg (tmp));
	      else
		emit_insn (gen_x86_movsicc_0_m1_neg (gen_lowpart (SImode,
								  tmp)));
	    }
	  else if (!sign_bit_compare_p)
	    {
	      rtx flags;
	      bool fpcmp = false;

	      compare_code = GET_CODE (compare_op);

	      flags = XEXP (compare_op, 0);

	      if (GET_MODE (flags) == CCFPmode)
		{
		  fpcmp = true;
		  compare_code
		    = ix86_fp_compare_code_to_integer (compare_code);
		}

	      /* To simplify rest of code, restrict to the GEU case.  */
	      if (compare_code == LTU)
		{
		  std::swap (ct, cf);
		  compare_code = reverse_condition (compare_code);
		  code = reverse_condition (code);
		}
	      else
		{
		  if (fpcmp)
		    PUT_CODE (compare_op,
			      reverse_condition_maybe_unordered
			        (GET_CODE (compare_op)));
		  else
		    PUT_CODE (compare_op,
			      reverse_condition (GET_CODE (compare_op)));
		}
	      diff = ct - cf;

	      if (reg_overlap_mentioned_p (out, compare_op))
		tmp = gen_reg_rtx (mode);

	      if (mode == DImode)
		emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
	      else
		emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
						 flags, compare_op));
	    }
	  else
	    {
	      if (code == GT || code == GE)
		code = reverse_condition (code);
	      else
		{
		  std::swap (ct, cf);
		  diff = ct - cf;
		}
	      tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
	    }

	  if (diff == 1)
	    {
	      /*
	       * cmpl op0,op1
	       * sbbl dest,dest
	       * [addl dest, ct]
	       *
	       * Size 5 - 8.
	       */
	      if (ct)
		tmp = expand_simple_binop (mode, PLUS,
					   tmp, GEN_INT (ct),
					   copy_rtx (tmp), 1, OPTAB_DIRECT);
	    }
	  else if (cf == -1)
	    {
	      /*
	       * cmpl op0,op1
	       * sbbl dest,dest
	       * orl $ct, dest
	       *
	       * Size 8.
	       */
	      tmp = expand_simple_binop (mode, IOR,
					 tmp, GEN_INT (ct),
					 copy_rtx (tmp), 1, OPTAB_DIRECT);
	    }
	  else if (diff == -1 && ct)
	    {
	      /*
	       * cmpl op0,op1
	       * sbbl dest,dest
	       * notl dest
	       * [addl dest, cf]
	       *
	       * Size 8 - 11.
	       */
	      tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
	      if (cf)
		tmp = expand_simple_binop (mode, PLUS,
					   copy_rtx (tmp), GEN_INT (cf),
					   copy_rtx (tmp), 1, OPTAB_DIRECT);
	    }
	  else
	    {
	      /*
	       * cmpl op0,op1
	       * sbbl dest,dest
	       * [notl dest]
	       * andl cf - ct, dest
	       * [addl dest, ct]
	       *
	       * Size 8 - 11.
	       */

	      if (cf == 0)
		{
		  cf = ct;
		  ct = 0;
		  tmp = expand_simple_unop (mode, NOT, tmp,
					    copy_rtx (tmp), 1);
		}

	      tmp = expand_simple_binop (mode, AND,
					 copy_rtx (tmp),
					 gen_int_mode (cf - ct, mode),
					 copy_rtx (tmp), 1, OPTAB_DIRECT);
	      if (ct)
		tmp = expand_simple_binop (mode, PLUS,
					   copy_rtx (tmp), GEN_INT (ct),
					   copy_rtx (tmp), 1, OPTAB_DIRECT);
	    }

	  if (!rtx_equal_p (tmp, out))
	    emit_move_insn (copy_rtx (out), copy_rtx (tmp));

	  return true;
	}
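
      /* Illustrative shape of the sbb-based selects built above, for
	 dest = (a <u b) ? ct : cf in the general case (placeholders):
		cmpl	b, a		# CF = (a <u b)
		sbbl	%eax, %eax	# eax = CF ? -1 : 0
		andl	$(ct-cf), %eax
		addl	$cf, %eax  */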
      if (diff < 0)
	{
	  machine_mode cmp_mode = GET_MODE (op0);
	  enum rtx_code new_code;

	  if (SCALAR_FLOAT_MODE_P (cmp_mode))
	    {
	      gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));

	      /* We may be reversing a non-trapping
		 comparison to a trapping comparison.  */
	      if (HONOR_NANS (cmp_mode) && flag_trapping_math
		  && code != EQ && code != NE
		  && code != ORDERED && code != UNORDERED)
		new_code = UNKNOWN;
	      else
		new_code = reverse_condition_maybe_unordered (code);
	    }
	  else
	    new_code = ix86_reverse_condition (code, cmp_mode);
	  if (new_code != UNKNOWN)
	    {
	      std::swap (ct, cf);
	      diff = -diff;
	      code = new_code;
	    }
	}

      compare_code = UNKNOWN;
      if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
	  && CONST_INT_P (op1))
	{
	  if (op1 == const0_rtx
	      && (code == LT || code == GE))
	    compare_code = code;
	  else if (op1 == constm1_rtx)
	    {
	      if (code == LE)
		compare_code = LT;
	      else if (code == GT)
		compare_code = GE;
	    }
	}

      /* Optimize dest = (op0 < 0) ? -1 : cf.  */
      if (compare_code != UNKNOWN
	  && GET_MODE (op0) == GET_MODE (out)
	  && (cf == -1 || ct == -1))
	{
	  /* If lea code below could be used, only optimize
	     if it results in a 2 insn sequence.  */

	  if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
		 || diff == 3 || diff == 5 || diff == 9)
	      || (compare_code == LT && ct == -1)
	      || (compare_code == GE && cf == -1))
	    {
	      /*
	       * notl op1	(if necessary)
	       * sarl $31, op1
	       * orl cf, op1
	       */
	      if (ct != -1)
		{
		  cf = ct;
		  ct = -1;
		  code = reverse_condition (code);
		}

	      out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);

	      out = expand_simple_binop (mode, IOR,
					 out, GEN_INT (cf),
					 out, 1, OPTAB_DIRECT);
	      if (out != operands[0])
		emit_move_insn (operands[0], out);

	      return true;
	    }
	}

      if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
	   || diff == 3 || diff == 5 || diff == 9)
	  && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
	  && (mode != DImode
	      || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
	{
	  /*
	   * xorl dest,dest
	   * cmpl op1,op2
	   * setcc dest
	   * lea cf(dest*(ct-cf)),dest
	   *
	   * Size 14.
	   *
	   * This also catches the degenerate setcc-only case.
	   */

	  rtx tmp;
	  int nops;

	  out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);

	  nops = 0;
	  /* On x86_64 the lea instruction operates on Pmode, so we need
	     to get arithmetics done in proper mode to match.  */
	  if (diff == 1)
	    tmp = copy_rtx (out);
	  else
	    {
	      rtx out1;
	      out1 = copy_rtx (out);
	      tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
	      nops++;
	      if (diff & 1)
		{
		  tmp = gen_rtx_PLUS (mode, tmp, out1);
		  nops++;
		}
	    }
	  if (cf != 0)
	    {
	      tmp = plus_constant (mode, tmp, cf);
	      nops++;
	    }
	  if (!rtx_equal_p (tmp, out))
	    {
	      if (nops == 1)
		out = force_operand (tmp, copy_rtx (out));
	      else
		emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
	    }
	  if (!rtx_equal_p (out, operands[0]))
	    emit_move_insn (operands[0], copy_rtx (out));

	  return true;
	}

      /*
       * General case:			Jumpful:
       *   xorl dest,dest		cmpl op1, op2
       *   cmpl op1, op2		movl ct, dest
       *   setcc dest			jcc 1f
       *   decl dest			movl cf, dest
       *   andl (cf-ct),dest		1:
       *   addl ct,dest
       *
       * Size 20.			Size 14.
       *
       * This is reasonably steep, but branch mispredict costs are
       * high on modern cpus, so consider failing only if optimizing
       * for space.
       */

      if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
	  && BRANCH_COST (optimize_insn_for_speed_p (),
			  false) >= 2)
	{
	  if (cf == 0)
	    {
	      machine_mode cmp_mode = GET_MODE (op0);
	      enum rtx_code new_code;

	      if (SCALAR_FLOAT_MODE_P (cmp_mode))
		{
		  gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));

		  /* We may be reversing a non-trapping
		     comparison to a trapping comparison.  */
		  if (HONOR_NANS (cmp_mode) && flag_trapping_math
		      && code != EQ && code != NE
		      && code != ORDERED && code != UNORDERED)
		    new_code = UNKNOWN;
		  else
		    new_code = reverse_condition_maybe_unordered (code);
		}
	      else
		{
		  new_code = ix86_reverse_condition (code, cmp_mode);
		  if (compare_code != UNKNOWN && new_code != UNKNOWN)
		    compare_code = reverse_condition (compare_code);
		}

	      if (new_code != UNKNOWN)
		{
		  cf = ct;
		  ct = 0;
		  code = new_code;
		}
	    }

	  if (compare_code != UNKNOWN)
	    {
	      /* notl op1	(if needed)
		 sarl $31, op1
		 andl (cf-ct), op1
		 addl ct, op1

		 For x < 0 (resp. x <= -1) there will be no notl,
		 so if possible swap the constants to get rid of the
		 complement.
		 True/false will be -1/0 while code below (store flag
		 followed by decrement) is 0/-1, so the constants need
		 to be exchanged once more.  */

	      if (compare_code == GE || !cf)
		{
		  code = reverse_condition (code);
		  std::swap (ct, cf);
		}

	      out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
	    }
	  else
	    {
	      out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);

	      out = expand_simple_binop (mode, PLUS, copy_rtx (out),
					 constm1_rtx,
					 copy_rtx (out), 1, OPTAB_DIRECT);
	    }

	  out = expand_simple_binop (mode, AND, copy_rtx (out),
				     gen_int_mode (cf - ct, mode),
				     copy_rtx (out), 1, OPTAB_DIRECT);
	  if (ct)
	    out = expand_simple_binop (mode, PLUS, copy_rtx (out),
				       GEN_INT (ct),
				       copy_rtx (out), 1, OPTAB_DIRECT);
	  if (!rtx_equal_p (out, operands[0]))
	    emit_move_insn (operands[0], copy_rtx (out));

	  return true;
	}
    }

  if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
    {
      /* Try a few things more with specific constants and a variable.  */

      optab op;
      rtx var, orig_out, out, tmp;

      if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
	return false;

      /* If one of the two operands is an interesting constant, load a
	 constant with the above and mask it in with a logical operation.  */

      if (CONST_INT_P (operands[2]))
	{
	  var = operands[3];
	  if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
	    operands[3] = constm1_rtx, op = and_optab;
	  else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
	    operands[3] = const0_rtx, op = ior_optab;
	  else
	    return false;
	}
      else if (CONST_INT_P (operands[3]))
	{
	  var = operands[2];
	  if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
	    {
	      /* For smin (x, 0), expand as "x < 0 ? x : 0" instead of
		 "x <= 0 ? x : 0" to enable sign_bit_compare_p.  */
	      if (code == LE && op1 == const0_rtx && rtx_equal_p (op0, var))
		operands[1] = simplify_gen_relational (LT, VOIDmode,
						       GET_MODE (op0),
						       op0, const0_rtx);

	      operands[2] = constm1_rtx;
	      op = and_optab;
	    }
	  else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
	    operands[2] = const0_rtx, op = ior_optab;
	  else
	    return false;
	}
      else
	return false;

      orig_out = operands[0];
      tmp = gen_reg_rtx (mode);
      operands[0] = tmp;

      /* Recurse to get the constant loaded.  */
      if (!ix86_expand_int_movcc (operands))
	return false;

      /* Mask in the interesting variable.  */
      out = expand_binop (mode, op, var, tmp, orig_out, 0,
			  OPTAB_WIDEN);
      if (!rtx_equal_p (out, orig_out))
	emit_move_insn (copy_rtx (orig_out), copy_rtx (out));

      return true;
    }

  /*
   * For comparison with above,
   *
   * movl cf,dest
   * movl ct,tmp
   * cmpl op1,op2
   * cmovcc tmp,dest
   *
   * Size 15.
   */

  if (! nonimmediate_operand (operands[2], mode))
    operands[2] = force_reg (mode, operands[2]);
  if (! nonimmediate_operand (operands[3], mode))
    operands[3] = force_reg (mode, operands[3]);

  if (! register_operand (operands[2], VOIDmode)
      && (mode == QImode
	  || ! register_operand (operands[3], VOIDmode)))
    operands[2] = force_reg (mode, operands[2]);

  if (mode == QImode
      && ! register_operand (operands[3], VOIDmode))
    operands[3] = force_reg (mode, operands[3]);

  emit_insn (compare_seq);
  emit_insn (gen_rtx_SET (operands[0],
			  gen_rtx_IF_THEN_ELSE (mode,
						compare_op, operands[2],
						operands[3])));
  return true;
}
/* Detect conditional moves that exactly match min/max operational
   semantics.  Note that this is IEEE safe, as long as we don't
   interchange the operands.

   Returns FALSE if this conditional move doesn't match a MIN/MAX,
   and TRUE if the operation is successful and instructions are emitted.  */

static bool
ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
			   rtx cmp_op1, rtx if_true, rtx if_false)
{
  machine_mode mode;
  bool is_min;
  rtx tmp;

  if (code == LT)
    ;
  else if (code == UNGE)
    std::swap (if_true, if_false);
  else
    return false;

  if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
    is_min = true;
  else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
    is_min = false;
  else
    return false;

  mode = GET_MODE (dest);

  /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
     but MODE may be a vector mode and thus not appropriate.  */
  if (!flag_finite_math_only || flag_signed_zeros)
    {
      int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
      rtvec v;

      if_true = force_reg (mode, if_true);
      v = gen_rtvec (2, if_true, if_false);
      tmp = gen_rtx_UNSPEC (mode, v, u);
    }
  else
    {
      code = is_min ? SMIN : SMAX;
      if (MEM_P (if_true) && MEM_P (if_false))
	if_true = force_reg (mode, if_true);
      tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
    }

  emit_insn (gen_rtx_SET (dest, tmp));
  return true;
}
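
/* Design note (a summary of the constraint documented above): SSE
   minss/minps return the second source operand when the operands are
   unordered or compare equal (e.g. -0.0 vs 0.0), so "a < b ? a : b" maps
   onto the instruction only with this exact operand order; commuting it
   would change the NaN and signed-zero behaviour.  */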
/* Return true if MODE is valid for vector compare to mask register,
   Same result for conditional vector move with mask register.  */

static bool
ix86_valid_mask_cmp_mode (machine_mode mode)
{
  /* XOP has its own vector conditional movement.  */
  if (TARGET_XOP && !TARGET_AVX512F)
    return false;

  /* HFmode only supports vcmpsh whose dest is mask register.  */
  if (TARGET_AVX512FP16 && mode == HFmode)
    return true;

  /* AVX512F is needed for mask operation.  */
  if (!(TARGET_AVX512F && VECTOR_MODE_P (mode)))
    return false;

  /* AVX512BW is needed for vector QI/HImode,
     AVX512VL is needed for 128/256-bit vector.  */
  machine_mode inner_mode = GET_MODE_INNER (mode);
  int vector_size = GET_MODE_SIZE (mode);
  if ((inner_mode == QImode || inner_mode == HImode) && !TARGET_AVX512BW)
    return false;

  return vector_size == 64 || TARGET_AVX512VL;
}

/* Return true if integer mask comparison should be used.  */

static bool
ix86_use_mask_cmp_p (machine_mode mode, machine_mode cmp_mode,
		     rtx op_true, rtx op_false)
{
  int vector_size = GET_MODE_SIZE (mode);

  if (cmp_mode == HFmode)
    return true;
  else if (vector_size < 16)
    return false;
  else if (vector_size == 64)
    return true;
  else if (GET_MODE_INNER (cmp_mode) == HFmode)
    return true;

  /* When op_true is NULL, op_false must be NULL, or vice versa.  */
  gcc_assert (!op_true == !op_false);

  /* When op_true/op_false is NULL or cmp_mode is not valid mask cmp mode,
     vector dest is required.  */
  if (!op_true || !ix86_valid_mask_cmp_mode (cmp_mode))
    return false;

  /* Exclude those that could be optimized in ix86_expand_sse_movcc.  */
  if (op_false == CONST0_RTX (mode)
      || op_true == CONST0_RTX (mode)
      || (INTEGRAL_MODE_P (mode)
	  && (op_true == CONSTM1_RTX (mode)
	      || op_false == CONSTM1_RTX (mode))))
    return false;

  return true;
}
/* Expand an SSE comparison.  Return the register with the result.  */

static rtx
ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
		     rtx op_true, rtx op_false)
{
  machine_mode mode = GET_MODE (dest);
  machine_mode cmp_ops_mode = GET_MODE (cmp_op0);

  /* In general case result of comparison can differ from operands' type.  */
  machine_mode cmp_mode;

  /* In AVX512F the result of comparison is an integer mask.  */
  bool maskcmp = false;
  rtx x;

  if (ix86_use_mask_cmp_p (mode, cmp_ops_mode, op_true, op_false))
    {
      unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
      maskcmp = true;
      cmp_mode = nbits > 8 ? int_mode_for_size (nbits, 0).require ()
			   : E_QImode;
    }
  else
    cmp_mode = cmp_ops_mode;

  cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);

  bool (*op1_predicate) (rtx, machine_mode)
    = VECTOR_MODE_P (cmp_ops_mode) ? vector_operand : nonimmediate_operand;

  if (!op1_predicate (cmp_op1, cmp_ops_mode))
    cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);

  if (optimize
      || (maskcmp && cmp_mode != mode)
      || (op_true && reg_overlap_mentioned_p (dest, op_true))
      || (op_false && reg_overlap_mentioned_p (dest, op_false)))
    dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);

  if (maskcmp)
    {
      bool ok = ix86_expand_mask_vec_cmp (dest, code, cmp_op0, cmp_op1);
      gcc_assert (ok);
      return dest;
    }

  x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);

  if (cmp_mode != mode)
    {
      x = force_reg (cmp_ops_mode, x);
      convert_move (dest, x, false);
    }
  else
    emit_insn (gen_rtx_SET (dest, x));

  return dest;
}
/* Emit x86 binary operand CODE in mode MODE for SSE vector
   instructions that can be performed using GP registers.  */

static void
ix86_emit_vec_binop (enum rtx_code code, machine_mode mode,
		     rtx dst, rtx src1, rtx src2)
{
  rtx tmp;

  tmp = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));

  if (GET_MODE_SIZE (mode) <= GET_MODE_SIZE (SImode)
      && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
    {
      rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
    }

  emit_insn (tmp);
}
/* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
   operations.  This is used for both scalar and vector conditional moves.  */

void
ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
{
  machine_mode mode = GET_MODE (dest);
  machine_mode cmpmode = GET_MODE (cmp);
  rtx x;

  /* Simplify trivial VEC_COND_EXPR to avoid ICE in pr97506.  */
  if (rtx_equal_p (op_true, op_false))
    {
      emit_move_insn (dest, op_true);
      return;
    }

  /* If we have an integer mask and FP value then we need
     to cast mask to FP mode.  */
  if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
    {
      cmp = force_reg (cmpmode, cmp);
      cmp = gen_rtx_SUBREG (mode, cmp, 0);
    }

  /* In AVX512F the result of comparison is an integer mask.  */
  if (mode != cmpmode
      && GET_MODE_CLASS (cmpmode) == MODE_INT)
    {
      gcc_assert (ix86_valid_mask_cmp_mode (mode));
      /* Using scalar/vector move with mask register.  */
      cmp = force_reg (cmpmode, cmp);
      /* Optimize for mask zero.  */
      op_true = (op_true != CONST0_RTX (mode)
		 ? force_reg (mode, op_true) : op_true);
      op_false = (op_false != CONST0_RTX (mode)
		  ? force_reg (mode, op_false) : op_false);
      if (op_true == CONST0_RTX (mode))
	{
	  if (cmpmode == E_DImode && !TARGET_64BIT)
	    {
	      x = gen_reg_rtx (cmpmode);
	      emit_insn (gen_knotdi (x, cmp));
	    }
	  else
	    x = expand_simple_unop (cmpmode, NOT, cmp, NULL, 1);
	  cmp = x;
	  /* Reverse op_true op_false.  */
	  std::swap (op_true, op_false);
	}

      if (mode == HFmode)
	emit_insn (gen_movhf_mask (dest, op_true, op_false, cmp));
      else
	emit_insn (gen_rtx_SET (dest,
				gen_rtx_VEC_MERGE (mode,
						   op_true, op_false, cmp)));
      return;
    }

  if (vector_all_ones_operand (op_true, mode)
      && op_false == CONST0_RTX (mode))
    {
      emit_move_insn (dest, cmp);
      return;
    }
  else if (op_false == CONST0_RTX (mode))
    {
      x = expand_simple_binop (mode, AND, cmp, op_true,
			       dest, 1, OPTAB_DIRECT);
      if (x != dest)
	emit_move_insn (dest, x);
      return;
    }
  else if (op_true == CONST0_RTX (mode))
    {
      op_false = force_reg (mode, op_false);
      x = gen_rtx_NOT (mode, cmp);
      ix86_emit_vec_binop (AND, mode, dest, x, op_false);
      return;
    }
  else if (vector_all_ones_operand (op_true, mode))
    {
      x = expand_simple_binop (mode, IOR, cmp, op_false,
			       dest, 1, OPTAB_DIRECT);
      if (x != dest)
	emit_move_insn (dest, x);
      return;
    }

  if (TARGET_XOP)
    {
      op_true = force_reg (mode, op_true);

      if (GET_MODE_SIZE (mode) < 16
	  || !nonimmediate_operand (op_false, mode))
	op_false = force_reg (mode, op_false);

      emit_insn (gen_rtx_SET (dest,
			      gen_rtx_IF_THEN_ELSE (mode, cmp,
						    op_true, op_false)));
      return;
    }

  rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
  machine_mode blend_mode = mode;

  if (GET_MODE_SIZE (mode) < 16
      || !vector_operand (op_true, mode))
    op_true = force_reg (mode, op_true);

  op_false = force_reg (mode, op_false);

  switch (mode)
    {
    case E_V2SFmode:
      if (TARGET_SSE4_1)
	gen = gen_mmx_blendvps;
      break;
    case E_V4SFmode:
      if (TARGET_SSE4_1)
	gen = gen_sse4_1_blendvps;
      break;
    case E_V2DFmode:
      if (TARGET_SSE4_1)
	gen = gen_sse4_1_blendvpd;
      break;
    case E_SFmode:
      if (TARGET_SSE4_1)
	gen = gen_sse4_1_blendvss;
      break;
    case E_DFmode:
      if (TARGET_SSE4_1)
	gen = gen_sse4_1_blendvsd;
      break;
    case E_V8QImode:
    case E_V4HImode:
    case E_V2SImode:
      if (TARGET_SSE4_1)
	{
	  gen = gen_mmx_pblendvb_v8qi;
	  blend_mode = V8QImode;
	}
      break;
    case E_V4QImode:
    case E_V2HImode:
      if (TARGET_SSE4_1)
	{
	  gen = gen_mmx_pblendvb_v4qi;
	  blend_mode = V4QImode;
	}
      break;
    case E_V2QImode:
      if (TARGET_SSE4_1)
	gen = gen_mmx_pblendvb_v2qi;
      break;
    case E_V16QImode:
    case E_V8HImode:
    case E_V8HFmode:
    case E_V4SImode:
    case E_V2DImode:
      if (TARGET_SSE4_1)
	{
	  gen = gen_sse4_1_pblendvb;
	  blend_mode = V16QImode;
	}
      break;
    case E_V8SFmode:
      if (TARGET_AVX)
	gen = gen_avx_blendvps256;
      break;
    case E_V4DFmode:
      if (TARGET_AVX)
	gen = gen_avx_blendvpd256;
      break;
    case E_V32QImode:
    case E_V16HImode:
    case E_V16HFmode:
    case E_V8SImode:
    case E_V4DImode:
      if (TARGET_AVX2)
	{
	  gen = gen_avx2_pblendvb;
	  blend_mode = V32QImode;
	}
      break;

    case E_V64QImode:
      gen = gen_avx512bw_blendmv64qi;
      break;
    case E_V32HImode:
      gen = gen_avx512bw_blendmv32hi;
      break;
    case E_V32HFmode:
      gen = gen_avx512bw_blendmv32hf;
      break;
    case E_V32BFmode:
      gen = gen_avx512bw_blendmv32bf;
      break;
    case E_V16SImode:
      gen = gen_avx512f_blendmv16si;
      break;
    case E_V8DImode:
      gen = gen_avx512f_blendmv8di;
      break;
    case E_V8DFmode:
      gen = gen_avx512f_blendmv8df;
      break;
    case E_V16SFmode:
      gen = gen_avx512f_blendmv16sf;
      break;

    default:
      break;
    }

  if (gen != NULL)
    {
      if (blend_mode == mode)
	x = dest;
      else
	{
	  x = gen_reg_rtx (blend_mode);
	  op_false = gen_lowpart (blend_mode, op_false);
	  op_true = gen_lowpart (blend_mode, op_true);
	  cmp = gen_lowpart (blend_mode, cmp);
	}

      emit_insn (gen (x, op_false, op_true, cmp));

      if (x != dest)
	emit_move_insn (dest, gen_lowpart (mode, x));
    }
  else
    {
      rtx t2, t3;

      t2 = expand_simple_binop (mode, AND, op_true, cmp,
				NULL, 1, OPTAB_DIRECT);

      t3 = gen_reg_rtx (mode);
      x = gen_rtx_NOT (mode, cmp);
      ix86_emit_vec_binop (AND, mode, t3, x, op_false);

      x = expand_simple_binop (mode, IOR, t3, t2,
			       dest, 1, OPTAB_DIRECT);
      if (x != dest)
	emit_move_insn (dest, x);
    }
}
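
/* Fallback sketch used when no blend instruction applies (SSE2-style,
   registers are placeholders), relying on CMP being all-ones or all-zeros
   per element:
	pand	%cmp, %op_true		# t2 = cmp & op_true
	pandn	%op_false, %cmp		# t3 = ~cmp & op_false
	por	%t2, %t3		# dest = t2 | t3  */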
/* Swap, force into registers, or otherwise massage the two operands
   to an sse comparison with a mask result.  Thus we differ a bit from
   ix86_prepare_fp_compare_args which expects to produce a flags result.

   The DEST operand exists to help determine whether to commute commutative
   operators.  The POP0/POP1 operands are updated in place.  The new
   comparison code is returned, or UNKNOWN if not implementable.  */

static enum rtx_code
ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
				  rtx *pop0, rtx *pop1)
{
  switch (code)
    {
    case LTGT:
    case UNEQ:
      /* AVX supports all the needed comparisons.  */
      if (TARGET_AVX)
	break;
      /* We have no LTGT as an operator.  We could implement it with
	 NE & ORDERED, but this requires an extra temporary.  It's
	 not clear that it's worth it.  */
      return UNKNOWN;

    case LT:
    case LE:
    case UNGT:
    case UNGE:
      /* These are supported directly.  */
      break;

    case EQ:
    case NE:
    case UNORDERED:
    case ORDERED:
      /* AVX has 3 operand comparisons, no need to swap anything.  */
      if (TARGET_AVX)
	break;
      /* For commutative operators, try to canonicalize the destination
	 operand to be first in the comparison - this helps reload to
	 avoid extra moves.  */
      if (!dest || !rtx_equal_p (dest, *pop1))
	break;
      /* FALLTHRU */

    case GE:
    case GT:
    case UNLE:
    case UNLT:
      /* These are not supported directly before AVX, and furthermore
	 ix86_expand_sse_fp_minmax only optimizes LT/UNGE.  Swap the
	 comparison operands to transform into something that is
	 supported.  */
      std::swap (*pop0, *pop1);
      code = swap_condition (code);
      break;

    default:
      gcc_unreachable ();
    }

  return code;
}
/* Expand a floating-point conditional move.  Return true if successful.  */

bool
ix86_expand_fp_movcc (rtx operands[])
{
  machine_mode mode = GET_MODE (operands[0]);
  enum rtx_code code = GET_CODE (operands[1]);
  rtx tmp, compare_op;
  rtx op0 = XEXP (operands[1], 0);
  rtx op1 = XEXP (operands[1], 1);

  if (GET_MODE (op0) == BFmode
      && !ix86_fp_comparison_operator (operands[1], VOIDmode))
    return false;

  if (SSE_FLOAT_MODE_SSEMATH_OR_HF_P (mode))
    {
      machine_mode cmode;

      /* Since we've no cmove for sse registers, don't force bad register
	 allocation just to gain access to it.  Deny movcc when the
	 comparison mode doesn't match the move mode.  */
      cmode = GET_MODE (op0);
      if (cmode == VOIDmode)
	cmode = GET_MODE (op1);
      if (cmode != mode)
	return false;

      code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
      if (code == UNKNOWN)
	return false;

      if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
				     operands[2], operands[3]))
	return true;

      tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
				 operands[2], operands[3]);
      ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
      return true;
    }

  if (GET_MODE (op0) == TImode
      || (GET_MODE (op0) == DImode
	  && !TARGET_64BIT))
    return false;

  /* The floating point conditional move instructions don't directly
     support conditions resulting from a signed integer comparison.  */

  compare_op = ix86_expand_compare (code, op0, op1);
  if (!fcmov_comparison_operator (compare_op, VOIDmode))
    {
      tmp = gen_reg_rtx (QImode);
      ix86_expand_setcc (tmp, code, op0, op1);

      compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
    }

  emit_insn (gen_rtx_SET (operands[0],
			  gen_rtx_IF_THEN_ELSE (mode, compare_op,
						operands[2], operands[3])));

  return true;
}
/* Helper for ix86_cmp_code_to_pcmp_immediate for int modes.  */

static int
ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
{
  switch (code)
    {
    case EQ:
      return 0;
    case LT:
    case LTU:
      return 1;
    case LE:
    case LEU:
      return 2;
    case NE:
      return 4;
    case GE:
    case GEU:
      return 5;
    case GT:
    case GTU:
      return 6;
    default:
      gcc_unreachable ();
    }
}

/* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes.  */

static int
ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
{
  switch (code)
    {
    case EQ:
      return 0x00;
    case NE:
      return 0x04;
    case GT:
      return 0x0e;
    case LE:
      return 0x02;
    case GE:
      return 0x0d;
    case LT:
      return 0x01;
    case UNLE:
      return 0x0a;
    case UNLT:
      return 0x09;
    case UNGE:
      return 0x05;
    case UNGT:
      return 0x06;
    case UNEQ:
      return 0x08;
    case LTGT:
      return 0x0c;
    case ORDERED:
      return 0x07;
    case UNORDERED:
      return 0x03;
    default:
      gcc_unreachable ();
    }
}

/* Return immediate value to be used in UNSPEC_PCMP
   for comparison CODE in MODE.  */

static int
ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
{
  if (FLOAT_MODE_P (mode))
    return ix86_fp_cmp_code_to_pcmp_immediate (code);
  return ix86_int_cmp_code_to_pcmp_immediate (code);
}

/* Expand AVX-512 vector comparison.  */

bool
ix86_expand_mask_vec_cmp (rtx dest, enum rtx_code code, rtx cmp_op0,
			  rtx cmp_op1)
{
  machine_mode mask_mode = GET_MODE (dest);
  machine_mode cmp_mode = GET_MODE (cmp_op0);
  rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
  int unspec_code;
  rtx unspec;

  switch (code)
    {
    case LEU:
    case GTU:
    case GEU:
    case LTU:
      unspec_code = UNSPEC_UNSIGNED_PCMP;
      break;

    default:
      unspec_code = UNSPEC_PCMP;
    }

  unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, cmp_op0, cmp_op1, imm),
			   unspec_code);
  emit_insn (gen_rtx_SET (dest, unspec));

  return true;
}
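
/* Illustrative AVX-512 output (operands are placeholders):
	vpcmpd	$1, %zmm1, %zmm0, %k1	# k1 = (zmm0 <s zmm1); imm 1 == LT
   where the immediate comes from ix86_cmp_code_to_pcmp_immediate and
   vpcmpud is used for the unsigned codes.  */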
/* Expand fp vector comparison.  */

bool
ix86_expand_fp_vec_cmp (rtx operands[])
{
  enum rtx_code code = GET_CODE (operands[1]);
  rtx cmp;

  code = ix86_prepare_sse_fp_compare_args (operands[0], code,
					   &operands[2], &operands[3]);
  if (code == UNKNOWN)
    {
      rtx temp;
      switch (GET_CODE (operands[1]))
	{
	case LTGT:
	  temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
				      operands[3], NULL, NULL);
	  cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
				     operands[3], NULL, NULL);
	  code = AND;
	  break;
	case UNEQ:
	  temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
				      operands[3], NULL, NULL);
	  cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
				     operands[3], NULL, NULL);
	  code = IOR;
	  break;
	default:
	  gcc_unreachable ();
	}
      cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
				 OPTAB_DIRECT);
    }
  else
    cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
			       NULL, NULL);

  if (operands[0] != cmp)
    emit_move_insn (operands[0], cmp);

  return true;
}
static rtx
ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
			 rtx op_true, rtx op_false, bool *negate)
{
  machine_mode data_mode = GET_MODE (dest);
  machine_mode mode = GET_MODE (cop0);
  rtx x;

  *negate = false;

  /* XOP supports all of the comparisons on all 128-bit vector int types.  */
  if (TARGET_XOP
      && GET_MODE_CLASS (mode) == MODE_VECTOR_INT
      && GET_MODE_SIZE (mode) <= 16)
    ;
  /* AVX512F supports all of the comparisons
     on all 128/256/512-bit vector int types.  */
  else if (ix86_use_mask_cmp_p (data_mode, mode, op_true, op_false))
    ;
  else
    {
      /* Canonicalize the comparison to EQ, GT, GTU.  */
      switch (code)
	{
	case EQ:
	case GT:
	case GTU:
	  break;

	case LE:
	case LEU:
	  /* x <= cst can be handled as x < cst + 1 unless there is
	     wrap around in cst + 1.  */
	  if (GET_CODE (cop1) == CONST_VECTOR
	      && GET_MODE_INNER (mode) != TImode)
	    {
	      unsigned int n_elts = GET_MODE_NUNITS (mode), i;
	      machine_mode eltmode = GET_MODE_INNER (mode);
	      for (i = 0; i < n_elts; ++i)
		{
		  rtx elt = CONST_VECTOR_ELT (cop1, i);
		  if (!CONST_INT_P (elt))
		    break;
		  if (code == LE)
		    {
		      /* For LE punt if some element is signed maximum.  */
		      if ((INTVAL (elt) & (GET_MODE_MASK (eltmode) >> 1))
			  == (GET_MODE_MASK (eltmode) >> 1))
			break;
		    }
		  /* For LEU punt if some element is unsigned maximum.  */
		  else if (elt == constm1_rtx)
		    break;
		}
	      if (i == n_elts)
		{
		  rtvec v = rtvec_alloc (n_elts);
		  for (i = 0; i < n_elts; ++i)
		    RTVEC_ELT (v, i)
		      = gen_int_mode (INTVAL (CONST_VECTOR_ELT (cop1, i)) + 1,
				      eltmode);
		  cop1 = gen_rtx_CONST_VECTOR (mode, v);
		  std::swap (cop0, cop1);
		  code = code == LE ? GT : GTU;
		  break;
		}
	    }
	  code = reverse_condition (code);
	  *negate = true;
	  break;

	case GE:
	case GEU:
	  /* x >= cst can be handled as x > cst - 1 unless there is
	     wrap around in cst - 1.  */
	  if (GET_CODE (cop1) == CONST_VECTOR
	      && GET_MODE_INNER (mode) != TImode)
	    {
	      unsigned int n_elts = GET_MODE_NUNITS (mode), i;
	      machine_mode eltmode = GET_MODE_INNER (mode);
	      for (i = 0; i < n_elts; ++i)
		{
		  rtx elt = CONST_VECTOR_ELT (cop1, i);
		  if (!CONST_INT_P (elt))
		    break;
		  if (code == GE)
		    {
		      /* For GE punt if some element is signed minimum.  */
		      if (INTVAL (elt) < 0
			  && ((INTVAL (elt) & (GET_MODE_MASK (eltmode) >> 1))
			      == 0))
			break;
		    }
		  /* For GEU punt if some element is zero.  */
		  else if (elt == const0_rtx)
		    break;
		}
	      if (i == n_elts)
		{
		  rtvec v = rtvec_alloc (n_elts);
		  for (i = 0; i < n_elts; ++i)
		    RTVEC_ELT (v, i)
		      = gen_int_mode (INTVAL (CONST_VECTOR_ELT (cop1, i)) - 1,
				      eltmode);
		  cop1 = gen_rtx_CONST_VECTOR (mode, v);
		  code = code == GE ? GT : GTU;
		  break;
		}
	    }
	  code = reverse_condition (code);
	  *negate = true;
	  break;

	case LT:
	case LTU:
	  std::swap (cop0, cop1);
	  code = swap_condition (code);
	  break;

	default:
	  gcc_unreachable ();
	}

      /* Only SSE4.1/SSE4.2 supports V2DImode.  */
      if (mode == V2DImode)
	{
	  switch (code)
	    {
	    case EQ:
	      /* SSE4.1 supports EQ.  */
	      if (!TARGET_SSE4_1)
		return NULL;
	      break;

	    case GT:
	    case GTU:
	      /* SSE4.2 supports GT/GTU.  */
	      if (!TARGET_SSE4_2)
		return NULL;
	      break;

	    default:
	      gcc_unreachable ();
	    }
	}

      if (GET_CODE (cop0) == CONST_VECTOR)
	cop0 = force_reg (mode, cop0);
      else if (GET_CODE (cop1) == CONST_VECTOR)
	cop1 = force_reg (mode, cop1);

      rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode);
      rtx opfalse = op_false ? op_false : CONST0_RTX (data_mode);
      if (*negate)
	std::swap (optrue, opfalse);

      /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when
	 not using integer masks into min (x, y) == x ? -1 : 0 (i.e.
	 min (x, y) == x).  While we add one instruction (the minimum),
	 we remove the need for two instructions in the negation, as the
	 result is done this way.
	 When using masks, do it for SI/DImode element types, as it is shorter
	 than the two subtractions.  */
      if ((code != EQ
	   && GET_MODE_SIZE (mode) != 64
	   && vector_all_ones_operand (opfalse, data_mode)
	   && optrue == CONST0_RTX (data_mode))
	  || (code == GTU
	      && GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4
	      /* Don't do it if not using integer masks and we'd end up with
		 the right values in the registers though.  */
	      && (GET_MODE_SIZE (mode) == 64
		  || !vector_all_ones_operand (optrue, data_mode)
		  || opfalse != CONST0_RTX (data_mode))))
	{
	  rtx (*gen) (rtx, rtx, rtx) = NULL;

	  switch (mode)
	    {
	    case E_V16SImode:
	      gen = (code == GTU) ? gen_uminv16si3 : gen_sminv16si3;
	      break;
	    case E_V8DImode:
	      gen = (code == GTU) ? gen_uminv8di3 : gen_sminv8di3;
	      cop0 = force_reg (mode, cop0);
	      cop1 = force_reg (mode, cop1);
	      break;
	    case E_V32QImode:
	      if (TARGET_AVX2)
		gen = (code == GTU) ? gen_uminv32qi3 : gen_sminv32qi3;
	      break;
	    case E_V16HImode:
	      if (TARGET_AVX2)
		gen = (code == GTU) ? gen_uminv16hi3 : gen_sminv16hi3;
	      break;
	    case E_V8SImode:
	      if (TARGET_AVX2)
		gen = (code == GTU) ? gen_uminv8si3 : gen_sminv8si3;
	      break;
	    case E_V4DImode:
	      if (TARGET_AVX512VL)
		{
		  gen = (code == GTU) ? gen_uminv4di3 : gen_sminv4di3;
		  cop0 = force_reg (mode, cop0);
		  cop1 = force_reg (mode, cop1);
		}
	      break;
	    case E_V16QImode:
	      if (code == GTU && TARGET_SSE2)
		gen = gen_uminv16qi3;
	      else if (code == GT && TARGET_SSE4_1)
		gen = gen_sminv16qi3;
	      break;
	    case E_V8QImode:
	      if (code == GTU && TARGET_SSE2)
		gen = gen_uminv8qi3;
	      else if (code == GT && TARGET_SSE4_1)
		gen = gen_sminv8qi3;
	      break;
	    case E_V4QImode:
	      if (code == GTU && TARGET_SSE2)
		gen = gen_uminv4qi3;
	      else if (code == GT && TARGET_SSE4_1)
		gen = gen_sminv4qi3;
	      break;
	    case E_V2QImode:
	      if (code == GTU && TARGET_SSE2)
		gen = gen_uminv2qi3;
	      else if (code == GT && TARGET_SSE4_1)
		gen = gen_sminv2qi3;
	      break;
	    case E_V8HImode:
	      if (code == GTU && TARGET_SSE4_1)
		gen = gen_uminv8hi3;
	      else if (code == GT && TARGET_SSE2)
		gen = gen_sminv8hi3;
	      break;
	    case E_V4HImode:
	      if (code == GTU && TARGET_SSE4_1)
		gen = gen_uminv4hi3;
	      else if (code == GT && TARGET_SSE2)
		gen = gen_sminv4hi3;
	      break;
	    case E_V2HImode:
	      if (code == GTU && TARGET_SSE4_1)
		gen = gen_uminv2hi3;
	      else if (code == GT && TARGET_SSE2)
		gen = gen_sminv2hi3;
	      break;
	    case E_V4SImode:
	      if (TARGET_SSE4_1)
		gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3;
	      break;
	    case E_V2SImode:
	      if (TARGET_SSE4_1)
		gen = (code == GTU) ? gen_uminv2si3 : gen_sminv2si3;
	      break;
	    case E_V2DImode:
	      if (TARGET_AVX512VL)
		{
		  gen = (code == GTU) ? gen_uminv2di3 : gen_sminv2di3;
		  cop0 = force_reg (mode, cop0);
		  cop1 = force_reg (mode, cop1);
		}
	      break;
	    default:
	      break;
	    }

	  if (gen)
	    {
	      rtx tem = gen_reg_rtx (mode);
	      if (!vector_operand (cop0, mode))
		cop0 = force_reg (mode, cop0);
	      if (!vector_operand (cop1, mode))
		cop1 = force_reg (mode, cop1);
	      *negate = !*negate;
	      emit_insn (gen (tem, cop0, cop1));
	      cop1 = tem;
	      code = EQ;
	    }
	}

      /* Unsigned parallel compare is not supported by the hardware.
	 Play some tricks to turn this into a signed comparison
	 against 0.  */
      if (code == GTU)
	{
	  cop0 = force_reg (mode, cop0);

	  switch (mode)
	    {
	    case E_V16SImode:
	    case E_V8DImode:
	    case E_V8SImode:
	    case E_V4DImode:
	    case E_V4SImode:
	    case E_V2SImode:
	    case E_V2DImode:
	      {
		rtx t1, t2, mask;

		/* Subtract (-(INT MAX) - 1) from both operands to make
		   them signed.  */
		mask = ix86_build_signbit_mask (mode, true, false);
		t1 = gen_reg_rtx (mode);
		emit_insn (gen_sub3_insn (t1, cop0, mask));

		t2 = gen_reg_rtx (mode);
		emit_insn (gen_sub3_insn (t2, cop1, mask));

		cop0 = t1;
		cop1 = t2;
		code = GT;
	      }
	      break;

	    case E_V64QImode:
	    case E_V32HImode:
	    case E_V32QImode:
	    case E_V16HImode:
	    case E_V16QImode:
	    case E_V8QImode:
	    case E_V4QImode:
	    case E_V2QImode:
	    case E_V8HImode:
	    case E_V4HImode:
	    case E_V2HImode:
	      /* Perform a parallel unsigned saturating subtraction.  */
	      x = gen_reg_rtx (mode);
	      emit_insn (gen_rtx_SET
			 (x, gen_rtx_US_MINUS (mode, cop0, cop1)));
	      cop0 = x;
	      cop1 = CONST0_RTX (mode);
	      code = EQ;
	      *negate = !*negate;
	      break;

	    default:
	      gcc_unreachable ();
	    }
	}
    }
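
  /* Worked example of the two GTU tricks above (illustrative): for
     V4SImode, x >u y is computed as (x - 0x80000000) >s (y - 0x80000000),
     i.e. flipping the sign bit of both operands lets pcmpgtd answer the
     unsigned compare; for QI/HI elements, psubus computes max (x - y, 0),
     which is zero iff x <=u y, so an EQ against zero plus the recorded
     negation answers GTU.  */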
  if (*negate)
    std::swap (op_true, op_false);

  if (GET_CODE (cop1) == CONST_VECTOR)
    cop1 = force_reg (mode, cop1);

  /* Allow the comparison to be done in one mode, but the movcc to
     happen in another mode.  */
  if (data_mode == mode)
    x = ix86_expand_sse_cmp (dest, code, cop0, cop1, op_true, op_false);
  else
    {
      gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
      x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
			       op_true, op_false);
      if (GET_MODE (x) == mode)
	x = gen_lowpart (data_mode, x);
    }

  return x;
}
/* Expand integer vector comparison.  */

bool
ix86_expand_int_vec_cmp (rtx operands[])
{
  rtx_code code = GET_CODE (operands[1]);
  bool negate = false;
  rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
				     operands[3], NULL, NULL, &negate);

  if (!cmp)
    return false;

  if (negate)
    cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
				   CONST0_RTX (GET_MODE (cmp)),
				   NULL, NULL, &negate);

  gcc_assert (!negate);

  if (operands[0] != cmp)
    emit_move_insn (operands[0], cmp);

  return true;
}
/* Expand a floating-point vector conditional move; a vcond operation
   rather than a movcc operation.  */

bool
ix86_expand_fp_vcond (rtx operands[])
{
  enum rtx_code code = GET_CODE (operands[3]);
  rtx cmp;

  code = ix86_prepare_sse_fp_compare_args (operands[0], code,
					   &operands[4], &operands[5]);
  if (code == UNKNOWN)
    {
      rtx temp;
      switch (GET_CODE (operands[3]))
	{
	case LTGT:
	  temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
				      operands[5], operands[0], operands[0]);
	  cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
				     operands[5], operands[1], operands[2]);
	  code = AND;
	  break;
	case UNEQ:
	  temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
				      operands[5], operands[0], operands[0]);
	  cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
				     operands[5], operands[1], operands[2]);
	  code = IOR;
	  break;
	default:
	  gcc_unreachable ();
	}
      cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
				 OPTAB_DIRECT);
      ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
      return true;
    }

  if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
				 operands[5], operands[1], operands[2]))
    return true;

  cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
			     operands[1], operands[2]);
  ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
  return true;
}
/* Expand a signed/unsigned integral vector conditional move.  */

bool
ix86_expand_int_vcond (rtx operands[])
{
  machine_mode data_mode = GET_MODE (operands[0]);
  machine_mode mode = GET_MODE (operands[4]);
  enum rtx_code code = GET_CODE (operands[3]);
  bool negate = false;
  rtx x, cop0, cop1;

  cop0 = operands[4];
  cop1 = operands[5];

  /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
     and x < 0 ? 1 : 0 into (unsigned) x >> 31.  */
  if ((code == LT || code == GE)
      && data_mode == mode
      && cop1 == CONST0_RTX (mode)
      && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
      && GET_MODE_UNIT_SIZE (data_mode) > 1
      && GET_MODE_UNIT_SIZE (data_mode) <= 8
      && (GET_MODE_SIZE (data_mode) == 16
	  || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
    {
      rtx negop = operands[2 - (code == LT)];
      int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
      if (negop == CONST1_RTX (data_mode))
	{
	  rtx res = expand_simple_binop (mode, LSHIFTRT, cop0,
					 GEN_INT (shift),
					 operands[0], 1, OPTAB_DIRECT);
	  if (res != operands[0])
	    emit_move_insn (operands[0], res);
	  return true;
	}
      else if (GET_MODE_INNER (data_mode) != DImode
	       && vector_all_ones_operand (negop, data_mode))
	{
	  rtx res = expand_simple_binop (mode, ASHIFTRT, cop0,
					 GEN_INT (shift),
					 operands[0], 0, OPTAB_DIRECT);
	  if (res != operands[0])
	    emit_move_insn (operands[0], res);
	  return true;
	}
    }

  if (!nonimmediate_operand (cop1, mode))
    cop1 = force_reg (mode, cop1);
  if (!general_operand (operands[1], data_mode))
    operands[1] = force_reg (data_mode, operands[1]);
  if (!general_operand (operands[2], data_mode))
    operands[2] = force_reg (data_mode, operands[2]);

  x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
			       operands[1], operands[2], &negate);

  if (!x)
    return false;

  ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
			 operands[2-negate]);
  return true;
}
/* Expand a vector permutation using the AVX-512 vpermt2var patterns, if
   possible.  Return true on success.  */

bool
ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
			      struct expand_vec_perm_d *d)
{
  /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
     expander, so args are either in d, or in op0, op1 etc.  */
  machine_mode mode = GET_MODE (d ? d->op0 : op0);
  machine_mode maskmode = mode;
  rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;

  switch (mode)
    {
    case E_V16QImode:
      if (TARGET_AVX512VL && TARGET_AVX512VBMI)
	gen = gen_avx512vl_vpermt2varv16qi3;
      break;
    case E_V32QImode:
      if (TARGET_AVX512VL && TARGET_AVX512VBMI)
	gen = gen_avx512vl_vpermt2varv32qi3;
      break;
    case E_V64QImode:
      if (TARGET_AVX512VBMI)
	gen = gen_avx512bw_vpermt2varv64qi3;
      break;
    case E_V8HImode:
      if (TARGET_AVX512VL && TARGET_AVX512BW)
	gen = gen_avx512vl_vpermt2varv8hi3;
      break;
    case E_V16HImode:
      if (TARGET_AVX512VL && TARGET_AVX512BW)
	gen = gen_avx512vl_vpermt2varv16hi3;
      break;
    case E_V32HImode:
      if (TARGET_AVX512BW)
	gen = gen_avx512bw_vpermt2varv32hi3;
      break;
    case E_V4SImode:
      if (TARGET_AVX512VL)
	gen = gen_avx512vl_vpermt2varv4si3;
      break;
    case E_V8SImode:
      if (TARGET_AVX512VL)
	gen = gen_avx512vl_vpermt2varv8si3;
      break;
    case E_V16SImode:
      gen = gen_avx512f_vpermt2varv16si3;
      break;
    case E_V4SFmode:
      if (TARGET_AVX512VL)
	{
	  gen = gen_avx512vl_vpermt2varv4sf3;
	  maskmode = V4SImode;
	}
      break;
    case E_V8SFmode:
      if (TARGET_AVX512VL)
	{
	  gen = gen_avx512vl_vpermt2varv8sf3;
	  maskmode = V8SImode;
	}
      break;
    case E_V16SFmode:
      gen = gen_avx512f_vpermt2varv16sf3;
      maskmode = V16SImode;
      break;
    case E_V2DImode:
      if (TARGET_AVX512VL)
	gen = gen_avx512vl_vpermt2varv2di3;
      break;
    case E_V4DImode:
      if (TARGET_AVX512VL)
	gen = gen_avx512vl_vpermt2varv4di3;
      break;
    case E_V8DImode:
      gen = gen_avx512f_vpermt2varv8di3;
      break;
    case E_V2DFmode:
      if (TARGET_AVX512VL)
	{
	  gen = gen_avx512vl_vpermt2varv2df3;
	  maskmode = V2DImode;
	}
      break;
    case E_V4DFmode:
      if (TARGET_AVX512VL)
	{
	  gen = gen_avx512vl_vpermt2varv4df3;
	  maskmode = V4DImode;
	}
      break;
    case E_V8DFmode:
      gen = gen_avx512f_vpermt2varv8df3;
      maskmode = V8DImode;
      break;
    default:
      break;
    }

  if (gen == NULL)
    return false;

  /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
     expander, so args are either in d, or in op0, op1 etc.  */
  if (d)
    {
      rtx vec[64];
      target = d->target;
      op0 = d->op0;
      op1 = d->op1;
      for (int i = 0; i < d->nelt; ++i)
	vec[i] = GEN_INT (d->perm[i]);
      mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
    }

  emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
  return true;
}
5145 /* Expand a variable vector permutation. */
5148 ix86_expand_vec_perm (rtx operands
[])
5150 rtx target
= operands
[0];
5151 rtx op0
= operands
[1];
5152 rtx op1
= operands
[2];
5153 rtx mask
= operands
[3];
5154 rtx t1
, t2
, t3
, t4
, t5
, t6
, t7
, t8
, vt
, vt2
, vec
[32];
5155 machine_mode mode
= GET_MODE (op0
);
5156 machine_mode maskmode
= GET_MODE (mask
);
5158 bool one_operand_shuffle
= rtx_equal_p (op0
, op1
);
5160 /* Number of elements in the vector. */
5161 w
= GET_MODE_NUNITS (mode
);
5162 e
= GET_MODE_UNIT_SIZE (mode
);
5163 gcc_assert (w
<= 64);
5165 /* For HF mode vector, convert it to HI using subreg. */
5166 if (GET_MODE_INNER (mode
) == HFmode
)
5168 machine_mode orig_mode
= mode
;
5169 mode
= mode_for_vector (HImode
, w
).require ();
5170 target
= lowpart_subreg (mode
, target
, orig_mode
);
5171 op0
= lowpart_subreg (mode
, op0
, orig_mode
);
5172 op1
= lowpart_subreg (mode
, op1
, orig_mode
);
5175 if (TARGET_AVX512F
&& one_operand_shuffle
)
5177 rtx (*gen
) (rtx
, rtx
, rtx
) = NULL
;
5181 gen
=gen_avx512f_permvarv16si
;
5184 gen
= gen_avx512f_permvarv16sf
;
5187 gen
= gen_avx512f_permvarv8di
;
5190 gen
= gen_avx512f_permvarv8df
;
5197 emit_insn (gen (target
, op0
, mask
));
5202 if (ix86_expand_vec_perm_vpermt2 (target
, mask
, op0
, op1
, NULL
))
5207 if (mode
== V4DImode
|| mode
== V4DFmode
|| mode
== V16HImode
)
      /* Unfortunately, the VPERMQ and VPERMPD instructions only support
	 a constant shuffle operand.  With a tiny bit of effort we can
	 use VPERMD instead.  A re-interpretation stall for V4DFmode is
	 unfortunate but there's no avoiding it.
	 Similarly, for V16HImode we don't have instructions for variable
	 shuffling, while for V32QImode we can, after preparing suitable
	 masks, use vpshufb; vpshufb; vpermq; vpor.  */
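      /* Worked example (illustrative sketch, assuming a V4DImode selector
	 {3,1,2,0}; the numbers are not from the original sources): the
	 rewrite below replicates the low halves to get t1 = {3,3,1,1,2,2,0,0},
	 doubles them to {6,6,2,2,4,4,0,0}, then adds {0,1,0,1,...} to the odd
	 positions, giving {6,7,2,3,4,5,0,1} - the V8SImode control that makes
	 VPERMD select the same 64-bit elements as the requested V4DImode
	 shuffle.  */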
5217 if (mode
== V16HImode
)
5219 maskmode
= mode
= V32QImode
;
5225 maskmode
= mode
= V8SImode
;
5229 t1
= gen_reg_rtx (maskmode
);
5231 /* Replicate the low bits of the V4DImode mask into V8SImode:
5233 t1 = { A A B B C C D D }. */
5234 for (i
= 0; i
< w
/ 2; ++i
)
5235 vec
[i
*2 + 1] = vec
[i
*2] = GEN_INT (i
* 2);
5236 vt
= gen_rtx_CONST_VECTOR (maskmode
, gen_rtvec_v (w
, vec
));
5237 vt
= force_reg (maskmode
, vt
);
5238 mask
= gen_lowpart (maskmode
, mask
);
5239 if (maskmode
== V8SImode
)
5240 emit_insn (gen_avx2_permvarv8si (t1
, mask
, vt
));
5242 emit_insn (gen_avx2_pshufbv32qi3 (t1
, mask
, vt
));
      /* Multiply the shuffle indices by two.  */
5245 t1
= expand_simple_binop (maskmode
, PLUS
, t1
, t1
, t1
, 1,
      /* Add one to the odd shuffle indices:
		t1 = { A*2, A*2+1, B*2, B*2+1, ... }.  */
5250 for (i
= 0; i
< w
/ 2; ++i
)
5252 vec
[i
* 2] = const0_rtx
;
5253 vec
[i
* 2 + 1] = const1_rtx
;
5255 vt
= gen_rtx_CONST_VECTOR (maskmode
, gen_rtvec_v (w
, vec
));
5256 vt
= validize_mem (force_const_mem (maskmode
, vt
));
5257 t1
= expand_simple_binop (maskmode
, PLUS
, t1
, vt
, t1
, 1,
5260 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
5261 operands
[3] = mask
= t1
;
5262 target
= gen_reg_rtx (mode
);
5263 op0
= gen_lowpart (mode
, op0
);
5264 op1
= gen_lowpart (mode
, op1
);
5270 /* The VPERMD and VPERMPS instructions already properly ignore
5271 the high bits of the shuffle elements. No need for us to
5272 perform an AND ourselves. */
5273 if (one_operand_shuffle
)
5275 emit_insn (gen_avx2_permvarv8si (target
, op0
, mask
));
5276 if (target
!= operands
[0])
5277 emit_move_insn (operands
[0],
5278 gen_lowpart (GET_MODE (operands
[0]), target
));
5282 t1
= gen_reg_rtx (V8SImode
);
5283 t2
= gen_reg_rtx (V8SImode
);
5284 emit_insn (gen_avx2_permvarv8si (t1
, op0
, mask
));
5285 emit_insn (gen_avx2_permvarv8si (t2
, op1
, mask
));
5291 mask
= gen_lowpart (V8SImode
, mask
);
5292 if (one_operand_shuffle
)
5293 emit_insn (gen_avx2_permvarv8sf (target
, op0
, mask
));
5296 t1
= gen_reg_rtx (V8SFmode
);
5297 t2
= gen_reg_rtx (V8SFmode
);
5298 emit_insn (gen_avx2_permvarv8sf (t1
, op0
, mask
));
5299 emit_insn (gen_avx2_permvarv8sf (t2
, op1
, mask
));
5305 /* By combining the two 128-bit input vectors into one 256-bit
5306 input vector, we can use VPERMD and VPERMPS for the full
5307 two-operand shuffle. */
5308 t1
= gen_reg_rtx (V8SImode
);
5309 t2
= gen_reg_rtx (V8SImode
);
5310 emit_insn (gen_avx_vec_concatv8si (t1
, op0
, op1
));
5311 emit_insn (gen_avx_vec_concatv8si (t2
, mask
, mask
));
5312 emit_insn (gen_avx2_permvarv8si (t1
, t1
, t2
));
5313 emit_insn (gen_avx_vextractf128v8si (target
, t1
, const0_rtx
));
5317 t1
= gen_reg_rtx (V8SFmode
);
5318 t2
= gen_reg_rtx (V8SImode
);
5319 mask
= gen_lowpart (V4SImode
, mask
);
5320 emit_insn (gen_avx_vec_concatv8sf (t1
, op0
, op1
));
5321 emit_insn (gen_avx_vec_concatv8si (t2
, mask
, mask
));
5322 emit_insn (gen_avx2_permvarv8sf (t1
, t1
, t2
));
5323 emit_insn (gen_avx_vextractf128v8sf (target
, t1
, const0_rtx
));
5327 t1
= gen_reg_rtx (V32QImode
);
5328 t2
= gen_reg_rtx (V32QImode
);
5329 t3
= gen_reg_rtx (V32QImode
);
5330 vt2
= GEN_INT (-128);
5331 vt
= gen_const_vec_duplicate (V32QImode
, vt2
);
5332 vt
= force_reg (V32QImode
, vt
);
5333 for (i
= 0; i
< 32; i
++)
5334 vec
[i
] = i
< 16 ? vt2
: const0_rtx
;
5335 vt2
= gen_rtx_CONST_VECTOR (V32QImode
, gen_rtvec_v (32, vec
));
5336 vt2
= force_reg (V32QImode
, vt2
);
5337 /* From mask create two adjusted masks, which contain the same
5338 bits as mask in the low 7 bits of each vector element.
5339 The first mask will have the most significant bit clear
5340 if it requests element from the same 128-bit lane
5341 and MSB set if it requests element from the other 128-bit lane.
5342 The second mask will have the opposite values of the MSB,
5343 and additionally will have its 128-bit lanes swapped.
5344 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
5345 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
5346 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
5347 stands for other 12 bytes. */
5348 /* The bit whether element is from the same lane or the other
5349 lane is bit 4, so shift it up by 3 to the MSB position. */
5350 t5
= gen_reg_rtx (V4DImode
);
5351 emit_insn (gen_ashlv4di3 (t5
, gen_lowpart (V4DImode
, mask
),
5353 /* Clear MSB bits from the mask just in case it had them set. */
5354 emit_insn (gen_avx2_andnotv32qi3 (t2
, vt
, mask
));
5355 /* After this t1 will have MSB set for elements from other lane. */
5356 emit_insn (gen_xorv32qi3 (t1
, gen_lowpart (V32QImode
, t5
), vt2
));
5357 /* Clear bits other than MSB. */
5358 emit_insn (gen_andv32qi3 (t1
, t1
, vt
));
5359 /* Or in the lower bits from mask into t3. */
5360 emit_insn (gen_iorv32qi3 (t3
, t1
, t2
));
5361 /* And invert MSB bits in t1, so MSB is set for elements from the same
5363 emit_insn (gen_xorv32qi3 (t1
, t1
, vt
));
5364 /* Swap 128-bit lanes in t3. */
5365 t6
= gen_reg_rtx (V4DImode
);
5366 emit_insn (gen_avx2_permv4di_1 (t6
, gen_lowpart (V4DImode
, t3
),
5367 const2_rtx
, GEN_INT (3),
5368 const0_rtx
, const1_rtx
));
5369 /* And or in the lower bits from mask into t1. */
5370 emit_insn (gen_iorv32qi3 (t1
, t1
, t2
));
5371 if (one_operand_shuffle
)
5373 /* Each of these shuffles will put 0s in places where
5374 element from the other 128-bit lane is needed, otherwise
5375 will shuffle in the requested value. */
5376 emit_insn (gen_avx2_pshufbv32qi3 (t3
, op0
,
5377 gen_lowpart (V32QImode
, t6
)));
5378 emit_insn (gen_avx2_pshufbv32qi3 (t1
, op0
, t1
));
5379 /* For t3 the 128-bit lanes are swapped again. */
5380 t7
= gen_reg_rtx (V4DImode
);
5381 emit_insn (gen_avx2_permv4di_1 (t7
, gen_lowpart (V4DImode
, t3
),
5382 const2_rtx
, GEN_INT (3),
5383 const0_rtx
, const1_rtx
));
5384 /* And oring both together leads to the result. */
5385 emit_insn (gen_iorv32qi3 (target
, t1
,
5386 gen_lowpart (V32QImode
, t7
)));
5387 if (target
!= operands
[0])
5388 emit_move_insn (operands
[0],
5389 gen_lowpart (GET_MODE (operands
[0]), target
));
5393 t4
= gen_reg_rtx (V32QImode
);
      /* Similar to the one_operand_shuffle code above, just repeated
	 twice for each operand.  The merge_two: code below will merge
	 the two results together.  */
5397 emit_insn (gen_avx2_pshufbv32qi3 (t4
, op0
,
5398 gen_lowpart (V32QImode
, t6
)));
5399 emit_insn (gen_avx2_pshufbv32qi3 (t3
, op1
,
5400 gen_lowpart (V32QImode
, t6
)));
5401 emit_insn (gen_avx2_pshufbv32qi3 (t2
, op0
, t1
));
5402 emit_insn (gen_avx2_pshufbv32qi3 (t1
, op1
, t1
));
5403 t7
= gen_reg_rtx (V4DImode
);
5404 emit_insn (gen_avx2_permv4di_1 (t7
, gen_lowpart (V4DImode
, t4
),
5405 const2_rtx
, GEN_INT (3),
5406 const0_rtx
, const1_rtx
));
5407 t8
= gen_reg_rtx (V4DImode
);
5408 emit_insn (gen_avx2_permv4di_1 (t8
, gen_lowpart (V4DImode
, t3
),
5409 const2_rtx
, GEN_INT (3),
5410 const0_rtx
, const1_rtx
));
5411 emit_insn (gen_iorv32qi3 (t4
, t2
, gen_lowpart (V32QImode
, t7
)));
5412 emit_insn (gen_iorv32qi3 (t3
, t1
, gen_lowpart (V32QImode
, t8
)));
5418 gcc_assert (GET_MODE_SIZE (mode
) <= 16);
5425 /* The XOP VPPERM insn supports three inputs. By ignoring the
5426 one_operand_shuffle special case, we avoid creating another
5427 set of constant vectors in memory. */
5428 one_operand_shuffle
= false;
5430 /* mask = mask & {2*w-1, ...} */
5431 vt
= GEN_INT (2*w
- 1);
5435 /* mask = mask & {w-1, ...} */
5436 vt
= GEN_INT (w
- 1);
5439 vt
= gen_const_vec_duplicate (maskmode
, vt
);
5440 mask
= expand_simple_binop (maskmode
, AND
, mask
, vt
,
5441 NULL_RTX
, 0, OPTAB_DIRECT
);
5443 /* For non-QImode operations, convert the word permutation control
5444 into a byte permutation control. */
5445 if (mode
!= V16QImode
)
5447 mask
= expand_simple_binop (maskmode
, ASHIFT
, mask
,
5448 GEN_INT (exact_log2 (e
)),
5449 NULL_RTX
, 0, OPTAB_DIRECT
);
5451 /* Convert mask to vector of chars. */
5452 mask
= force_reg (V16QImode
, gen_lowpart (V16QImode
, mask
));
5454 /* Replicate each of the input bytes into byte positions:
5455 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
5456 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
5457 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
5458 for (i
= 0; i
< 16; ++i
)
5459 vec
[i
] = GEN_INT (i
/e
* e
);
5460 vt
= gen_rtx_CONST_VECTOR (V16QImode
, gen_rtvec_v (16, vec
));
5461 vt
= validize_mem (force_const_mem (V16QImode
, vt
));
5463 emit_insn (gen_xop_pperm (mask
, mask
, mask
, vt
));
5465 emit_insn (gen_ssse3_pshufbv16qi3 (mask
, mask
, vt
));
5467 /* Convert it into the byte positions by doing
5468 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
5469 for (i
= 0; i
< 16; ++i
)
5470 vec
[i
] = GEN_INT (i
% e
);
5471 vt
= gen_rtx_CONST_VECTOR (V16QImode
, gen_rtvec_v (16, vec
));
5472 vt
= validize_mem (force_const_mem (V16QImode
, vt
));
5473 emit_insn (gen_addv16qi3 (mask
, mask
, vt
));
5476 /* The actual shuffle operations all operate on V16QImode. */
5477 op0
= gen_lowpart (V16QImode
, op0
);
5478 op1
= gen_lowpart (V16QImode
, op1
);
5482 if (GET_MODE (target
) != V16QImode
)
5483 target
= gen_reg_rtx (V16QImode
);
5484 emit_insn (gen_xop_pperm (target
, op0
, op1
, mask
));
5485 if (target
!= operands
[0])
5486 emit_move_insn (operands
[0],
5487 gen_lowpart (GET_MODE (operands
[0]), target
));
5489 else if (one_operand_shuffle
)
5491 if (GET_MODE (target
) != V16QImode
)
5492 target
= gen_reg_rtx (V16QImode
);
5493 emit_insn (gen_ssse3_pshufbv16qi3 (target
, op0
, mask
));
5494 if (target
!= operands
[0])
5495 emit_move_insn (operands
[0],
5496 gen_lowpart (GET_MODE (operands
[0]), target
));
5503 /* Shuffle the two input vectors independently. */
5504 t1
= gen_reg_rtx (V16QImode
);
5505 t2
= gen_reg_rtx (V16QImode
);
5506 emit_insn (gen_ssse3_pshufbv16qi3 (t1
, op0
, mask
));
5507 emit_insn (gen_ssse3_pshufbv16qi3 (t2
, op1
, mask
));
5510 /* Then merge them together. The key is whether any given control
5511 element contained a bit set that indicates the second word. */
5514 if (maskmode
== V2DImode
&& !TARGET_SSE4_1
)
5516 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
5517 more shuffle to convert the V2DI input mask into a V4SI
5518 input mask. At which point the masking that expand_int_vcond
5519 will work as desired. */
5520 rtx t3
= gen_reg_rtx (V4SImode
);
5521 emit_insn (gen_sse2_pshufd_1 (t3
, gen_lowpart (V4SImode
, mask
),
5522 const0_rtx
, const0_rtx
,
5523 const2_rtx
, const2_rtx
));
5525 maskmode
= V4SImode
;
5529 vt
= gen_const_vec_duplicate (maskmode
, vt
);
5530 vt
= force_reg (maskmode
, vt
);
5531 mask
= expand_simple_binop (maskmode
, AND
, mask
, vt
,
5532 NULL_RTX
, 0, OPTAB_DIRECT
);
5534 if (GET_MODE (target
) != mode
)
5535 target
= gen_reg_rtx (mode
);
5537 xops
[1] = gen_lowpart (mode
, t2
);
5538 xops
[2] = gen_lowpart (mode
, t1
);
5539 xops
[3] = gen_rtx_EQ (maskmode
, mask
, vt
);
5542 ok
= ix86_expand_int_vcond (xops
);
5544 if (target
!= operands
[0])
5545 emit_move_insn (operands
[0],
5546 gen_lowpart (GET_MODE (operands
[0]), target
));
5550 /* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is
5551 true if we should do zero extension, else sign extension. HIGH_P is
5552 true if we want the N/2 high elements, else the low elements. */
5555 ix86_expand_sse_unpack (rtx dest
, rtx src
, bool unsigned_p
, bool high_p
)
5557 machine_mode imode
= GET_MODE (src
);
5562 rtx (*unpack
)(rtx
, rtx
);
5563 rtx (*extract
)(rtx
, rtx
) = NULL
;
5564 machine_mode halfmode
= BLKmode
;
5570 unpack
= gen_avx512bw_zero_extendv32qiv32hi2
;
5572 unpack
= gen_avx512bw_sign_extendv32qiv32hi2
;
5573 halfmode
= V32QImode
;
5575 = high_p
? gen_vec_extract_hi_v64qi
: gen_vec_extract_lo_v64qi
;
5579 unpack
= gen_avx2_zero_extendv16qiv16hi2
;
5581 unpack
= gen_avx2_sign_extendv16qiv16hi2
;
5582 halfmode
= V16QImode
;
5584 = high_p
? gen_vec_extract_hi_v32qi
: gen_vec_extract_lo_v32qi
;
5588 unpack
= gen_avx512f_zero_extendv16hiv16si2
;
5590 unpack
= gen_avx512f_sign_extendv16hiv16si2
;
5591 halfmode
= V16HImode
;
5593 = high_p
? gen_vec_extract_hi_v32hi
: gen_vec_extract_lo_v32hi
;
5597 unpack
= gen_avx2_zero_extendv8hiv8si2
;
5599 unpack
= gen_avx2_sign_extendv8hiv8si2
;
5600 halfmode
= V8HImode
;
5602 = high_p
? gen_vec_extract_hi_v16hi
: gen_vec_extract_lo_v16hi
;
5606 unpack
= gen_avx512f_zero_extendv8siv8di2
;
5608 unpack
= gen_avx512f_sign_extendv8siv8di2
;
5609 halfmode
= V8SImode
;
5611 = high_p
? gen_vec_extract_hi_v16si
: gen_vec_extract_lo_v16si
;
5615 unpack
= gen_avx2_zero_extendv4siv4di2
;
5617 unpack
= gen_avx2_sign_extendv4siv4di2
;
5618 halfmode
= V4SImode
;
5620 = high_p
? gen_vec_extract_hi_v8si
: gen_vec_extract_lo_v8si
;
5624 unpack
= gen_sse4_1_zero_extendv8qiv8hi2
;
5626 unpack
= gen_sse4_1_sign_extendv8qiv8hi2
;
5630 unpack
= gen_sse4_1_zero_extendv4hiv4si2
;
5632 unpack
= gen_sse4_1_sign_extendv4hiv4si2
;
5636 unpack
= gen_sse4_1_zero_extendv2siv2di2
;
5638 unpack
= gen_sse4_1_sign_extendv2siv2di2
;
5642 unpack
= gen_sse4_1_zero_extendv4qiv4hi2
;
5644 unpack
= gen_sse4_1_sign_extendv4qiv4hi2
;
5648 unpack
= gen_sse4_1_zero_extendv2hiv2si2
;
5650 unpack
= gen_sse4_1_sign_extendv2hiv2si2
;
5654 unpack
= gen_sse4_1_zero_extendv2qiv2hi2
;
5656 unpack
= gen_sse4_1_sign_extendv2qiv2hi2
;
5662 if (GET_MODE_SIZE (imode
) >= 32)
5664 tmp
= gen_reg_rtx (halfmode
);
5665 emit_insn (extract (tmp
, src
));
5669 switch (GET_MODE_SIZE (imode
))
5672 /* Shift higher 8 bytes to lower 8 bytes. */
5673 tmp
= gen_reg_rtx (V1TImode
);
5674 emit_insn (gen_sse2_lshrv1ti3 (tmp
, gen_lowpart (V1TImode
, src
),
5678 /* Shift higher 4 bytes to lower 4 bytes. */
5679 tmp
= gen_reg_rtx (V1DImode
);
5680 emit_insn (gen_mmx_lshrv1di3 (tmp
, gen_lowpart (V1DImode
, src
),
5684 /* Shift higher 2 bytes to lower 2 bytes. */
5685 tmp
= gen_reg_rtx (V1SImode
);
5686 emit_insn (gen_mmx_lshrv1si3 (tmp
, gen_lowpart (V1SImode
, src
),
5693 tmp
= gen_lowpart (imode
, tmp
);
5698 emit_insn (unpack (dest
, tmp
));
5702 rtx (*unpack
)(rtx
, rtx
, rtx
);
5708 unpack
= gen_vec_interleave_highv16qi
;
5710 unpack
= gen_vec_interleave_lowv16qi
;
5714 unpack
= gen_vec_interleave_highv8hi
;
5716 unpack
= gen_vec_interleave_lowv8hi
;
5720 unpack
= gen_vec_interleave_highv4si
;
5722 unpack
= gen_vec_interleave_lowv4si
;
5726 unpack
= gen_mmx_punpckhbw
;
5728 unpack
= gen_mmx_punpcklbw
;
5732 unpack
= gen_mmx_punpckhwd
;
5734 unpack
= gen_mmx_punpcklwd
;
5738 unpack
= gen_mmx_punpckhbw_low
;
5740 unpack
= gen_mmx_punpcklbw_low
;
5747 tmp
= force_reg (imode
, CONST0_RTX (imode
));
5749 tmp
= ix86_expand_sse_cmp (gen_reg_rtx (imode
), GT
, CONST0_RTX (imode
),
5750 src
, pc_rtx
, pc_rtx
);
5752 rtx tmp2
= gen_reg_rtx (imode
);
5753 emit_insn (unpack (tmp2
, src
, tmp
));
5754 emit_move_insn (dest
, gen_lowpart (GET_MODE (dest
), tmp2
));
/* Return true if mem is pool constant which contains a const_vector
   perm index, assign the index to PERM.  */
bool
ix86_extract_perm_from_pool_constant (int* perm, rtx mem)
{
  machine_mode mode = GET_MODE (mem);
  int nelt = GET_MODE_NUNITS (mode);

  if (!INTEGRAL_MODE_P (mode))
    return false;

  /* Needs to be constant pool.  */
  if (!(MEM_P (mem))
      || !SYMBOL_REF_P (XEXP (mem, 0))
      || !CONSTANT_POOL_ADDRESS_P (XEXP (mem, 0)))
   return false;

  rtx constant = get_pool_constant (XEXP (mem, 0));

  if (GET_CODE (constant) != CONST_VECTOR)
    return false;

  /* There could be some rtx like
     (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
     but with "*.LC1" refer to V2DI constant vector.  */
  if (GET_MODE (constant) != mode)
    {
      constant = simplify_subreg (mode, constant, GET_MODE (constant), 0);

      if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR)
	return false;
    }

  for (int i = 0; i != nelt; i++)
    perm[i] = UINTVAL (XVECEXP (constant, 0, i));

  return true;
}
/* Split operands 0 and 1 into half-mode parts.  Similar to split_double_mode,
   but works for floating-point parameters and non-offsettable memories.
   For pushes, it returns just stack offsets; the values will be saved
   in the right order.  Maximally three parts are generated.  */
5803 ix86_split_to_parts (rtx operand
, rtx
*parts
, machine_mode mode
)
5808 size
= mode
==XFmode
? 3 : GET_MODE_SIZE (mode
) / 4;
5810 size
= (GET_MODE_SIZE (mode
) + 4) / 8;
5812 gcc_assert (!REG_P (operand
) || !MMX_REGNO_P (REGNO (operand
)));
5813 gcc_assert (size
>= 2 && size
<= 4);
5815 /* Optimize constant pool reference to immediates. This is used by fp
5816 moves, that force all constants to memory to allow combining. */
5817 if (MEM_P (operand
) && MEM_READONLY_P (operand
))
5818 operand
= avoid_constant_pool_reference (operand
);
5820 if (MEM_P (operand
) && !offsettable_memref_p (operand
))
      /* The only non-offsettable memories we handle are pushes.  */
5823 int ok
= push_operand (operand
, VOIDmode
);
5827 operand
= copy_rtx (operand
);
5828 PUT_MODE (operand
, word_mode
);
5829 parts
[0] = parts
[1] = parts
[2] = parts
[3] = operand
;
5833 if (GET_CODE (operand
) == CONST_VECTOR
)
5835 scalar_int_mode imode
= int_mode_for_mode (mode
).require ();
5836 /* Caution: if we looked through a constant pool memory above,
5837 the operand may actually have a different mode now. That's
5838 ok, since we want to pun this all the way back to an integer. */
5839 operand
= simplify_subreg (imode
, operand
, GET_MODE (operand
), 0);
5840 gcc_assert (operand
!= NULL
);
5847 split_double_mode (mode
, &operand
, 1, &parts
[0], &parts
[1]);
5852 if (REG_P (operand
))
5854 gcc_assert (reload_completed
);
5855 for (i
= 0; i
< size
; i
++)
5856 parts
[i
] = gen_rtx_REG (SImode
, REGNO (operand
) + i
);
5858 else if (offsettable_memref_p (operand
))
5860 operand
= adjust_address (operand
, SImode
, 0);
5862 for (i
= 1; i
< size
; i
++)
5863 parts
[i
] = adjust_address (operand
, SImode
, 4 * i
);
5865 else if (CONST_DOUBLE_P (operand
))
5867 const REAL_VALUE_TYPE
*r
;
5870 r
= CONST_DOUBLE_REAL_VALUE (operand
);
5874 real_to_target (l
, r
, mode
);
5875 parts
[3] = gen_int_mode (l
[3], SImode
);
5876 parts
[2] = gen_int_mode (l
[2], SImode
);
5879 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
5880 long double may not be 80-bit. */
5881 real_to_target (l
, r
, mode
);
5882 parts
[2] = gen_int_mode (l
[2], SImode
);
5885 REAL_VALUE_TO_TARGET_DOUBLE (*r
, l
);
5890 parts
[1] = gen_int_mode (l
[1], SImode
);
5891 parts
[0] = gen_int_mode (l
[0], SImode
);
5900 split_double_mode (mode
, &operand
, 1, &parts
[0], &parts
[1]);
5901 if (mode
== XFmode
|| mode
== TFmode
)
5903 machine_mode upper_mode
= mode
==XFmode
? SImode
: DImode
;
5904 if (REG_P (operand
))
5906 gcc_assert (reload_completed
);
5907 parts
[0] = gen_rtx_REG (DImode
, REGNO (operand
) + 0);
5908 parts
[1] = gen_rtx_REG (upper_mode
, REGNO (operand
) + 1);
5910 else if (offsettable_memref_p (operand
))
5912 operand
= adjust_address (operand
, DImode
, 0);
5914 parts
[1] = adjust_address (operand
, upper_mode
, 8);
5916 else if (CONST_DOUBLE_P (operand
))
5920 real_to_target (l
, CONST_DOUBLE_REAL_VALUE (operand
), mode
);
5922 /* real_to_target puts 32-bit pieces in each long. */
5923 parts
[0] = gen_int_mode ((l
[0] & HOST_WIDE_INT_C (0xffffffff))
5924 | ((l
[1] & HOST_WIDE_INT_C (0xffffffff))
5927 if (upper_mode
== SImode
)
5928 parts
[1] = gen_int_mode (l
[2], SImode
);
5931 = gen_int_mode ((l
[2] & HOST_WIDE_INT_C (0xffffffff))
5932 | ((l
[3] & HOST_WIDE_INT_C (0xffffffff))
/* Emit insns to perform a move or push of DI, DF, XF, and TF values.
   Return false when normal moves are needed; true when all required
   insns have been emitted.  Operands 2-4 contain the input values
   in the correct order; operands 5-7 contain the output values.  */
5949 ix86_split_long_move (rtx operands
[])
5955 machine_mode mode
= GET_MODE (operands
[0]);
5956 bool collisionparts
[4];
5958 /* The DFmode expanders may ask us to move double.
5959 For 64bit target this is single move. By hiding the fact
5960 here we simplify i386.md splitters. */
5961 if (TARGET_64BIT
&& GET_MODE_SIZE (GET_MODE (operands
[0])) == 8)
5963 /* Optimize constant pool reference to immediates. This is used by
5964 fp moves, that force all constants to memory to allow combining. */
5966 if (MEM_P (operands
[1])
5967 && GET_CODE (XEXP (operands
[1], 0)) == SYMBOL_REF
5968 && CONSTANT_POOL_ADDRESS_P (XEXP (operands
[1], 0)))
5969 operands
[1] = get_pool_constant (XEXP (operands
[1], 0));
5970 if (push_operand (operands
[0], VOIDmode
))
5972 operands
[0] = copy_rtx (operands
[0]);
5973 PUT_MODE (operands
[0], word_mode
);
5976 operands
[0] = gen_lowpart (DImode
, operands
[0]);
5977 operands
[1] = gen_lowpart (DImode
, operands
[1]);
5978 emit_move_insn (operands
[0], operands
[1]);
5982 /* The only non-offsettable memory we handle is push. */
5983 if (push_operand (operands
[0], VOIDmode
))
5986 gcc_assert (!MEM_P (operands
[0])
5987 || offsettable_memref_p (operands
[0]));
5989 nparts
= ix86_split_to_parts (operands
[1], part
[1], GET_MODE (operands
[0]));
5990 ix86_split_to_parts (operands
[0], part
[0], GET_MODE (operands
[0]));
5992 /* When emitting push, take care for source operands on the stack. */
5993 if (push
&& MEM_P (operands
[1])
5994 && reg_overlap_mentioned_p (stack_pointer_rtx
, operands
[1]))
5996 rtx src_base
= XEXP (part
[1][nparts
- 1], 0);
5998 /* Compensate for the stack decrement by 4. */
5999 if (!TARGET_64BIT
&& nparts
== 3
6000 && mode
== XFmode
&& TARGET_128BIT_LONG_DOUBLE
)
6001 src_base
= plus_constant (Pmode
, src_base
, 4);
6003 /* src_base refers to the stack pointer and is
6004 automatically decreased by emitted push. */
6005 for (i
= 0; i
< nparts
; i
++)
6006 part
[1][i
] = change_address (part
[1][i
],
6007 GET_MODE (part
[1][i
]), src_base
);
  /* We need to do the copy in the right order in case an address register
     of the source overlaps the destination.  */
6012 if (REG_P (part
[0][0]) && MEM_P (part
[1][0]))
6016 for (i
= 0; i
< nparts
; i
++)
6019 = reg_overlap_mentioned_p (part
[0][i
], XEXP (part
[1][0], 0));
6020 if (collisionparts
[i
])
6024 /* Collision in the middle part can be handled by reordering. */
6025 if (collisions
== 1 && nparts
== 3 && collisionparts
[1])
6027 std::swap (part
[0][1], part
[0][2]);
6028 std::swap (part
[1][1], part
[1][2]);
6030 else if (collisions
== 1
6032 && (collisionparts
[1] || collisionparts
[2]))
6034 if (collisionparts
[1])
6036 std::swap (part
[0][1], part
[0][2]);
6037 std::swap (part
[1][1], part
[1][2]);
6041 std::swap (part
[0][2], part
[0][3]);
6042 std::swap (part
[1][2], part
[1][3]);
6046 /* If there are more collisions, we can't handle it by reordering.
6047 Do an lea to the last part and use only one colliding move. */
6048 else if (collisions
> 1)
6054 base
= part
[0][nparts
- 1];
6056 /* Handle the case when the last part isn't valid for lea.
6057 Happens in 64-bit mode storing the 12-byte XFmode. */
6058 if (GET_MODE (base
) != Pmode
)
6059 base
= gen_rtx_REG (Pmode
, REGNO (base
));
6061 addr
= XEXP (part
[1][0], 0);
6062 if (TARGET_TLS_DIRECT_SEG_REFS
)
6064 struct ix86_address parts
;
6065 int ok
= ix86_decompose_address (addr
, &parts
);
6067 /* It is not valid to use %gs: or %fs: in lea. */
6068 gcc_assert (parts
.seg
== ADDR_SPACE_GENERIC
);
6070 emit_insn (gen_rtx_SET (base
, addr
));
6071 part
[1][0] = replace_equiv_address (part
[1][0], base
);
6072 for (i
= 1; i
< nparts
; i
++)
6074 tmp
= plus_constant (Pmode
, base
, UNITS_PER_WORD
* i
);
6075 part
[1][i
] = replace_equiv_address (part
[1][i
], tmp
);
6086 if (TARGET_128BIT_LONG_DOUBLE
&& mode
== XFmode
)
6087 emit_insn (gen_add2_insn (stack_pointer_rtx
, GEN_INT (-4)));
6088 emit_move_insn (part
[0][2], part
[1][2]);
6090 else if (nparts
== 4)
6092 emit_move_insn (part
[0][3], part
[1][3]);
6093 emit_move_insn (part
[0][2], part
[1][2]);
      /* In 64-bit mode we don't have a 32-bit push available.  If the
	 operand is a register, that is OK - we just use the larger
	 counterpart.  We also retype memory - this comes from an attempt
	 to avoid a REX prefix when moving the second half of a TFmode
	 value.  */
6102 if (GET_MODE (part
[1][1]) == SImode
)
6104 switch (GET_CODE (part
[1][1]))
6107 part
[1][1] = adjust_address (part
[1][1], DImode
, 0);
6111 part
[1][1] = gen_rtx_REG (DImode
, REGNO (part
[1][1]));
6118 if (GET_MODE (part
[1][0]) == SImode
)
6119 part
[1][0] = part
[1][1];
6122 emit_move_insn (part
[0][1], part
[1][1]);
6123 emit_move_insn (part
[0][0], part
[1][0]);
6127 /* Choose correct order to not overwrite the source before it is copied. */
6128 if ((REG_P (part
[0][0])
6129 && REG_P (part
[1][1])
6130 && (REGNO (part
[0][0]) == REGNO (part
[1][1])
6132 && REGNO (part
[0][0]) == REGNO (part
[1][2]))
6134 && REGNO (part
[0][0]) == REGNO (part
[1][3]))))
6136 && reg_overlap_mentioned_p (part
[0][0], XEXP (part
[1][0], 0))))
6138 for (i
= 0, j
= nparts
- 1; i
< nparts
; i
++, j
--)
6140 operands
[2 + i
] = part
[0][j
];
6141 operands
[6 + i
] = part
[1][j
];
6146 for (i
= 0; i
< nparts
; i
++)
6148 operands
[2 + i
] = part
[0][i
];
6149 operands
[6 + i
] = part
[1][i
];
6153 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
6154 if (optimize_insn_for_size_p ())
6156 for (j
= 0; j
< nparts
- 1; j
++)
6157 if (CONST_INT_P (operands
[6 + j
])
6158 && operands
[6 + j
] != const0_rtx
6159 && REG_P (operands
[2 + j
]))
6160 for (i
= j
; i
< nparts
- 1; i
++)
6161 if (CONST_INT_P (operands
[7 + i
])
6162 && INTVAL (operands
[7 + i
]) == INTVAL (operands
[6 + j
]))
6163 operands
[7 + i
] = operands
[2 + j
];
6166 for (i
= 0; i
< nparts
; i
++)
6167 emit_move_insn (operands
[2 + i
], operands
[6 + i
]);
/* Helper function of ix86_split_ashl used to generate an SImode/DImode
   left shift by a constant, either using a single shift or
   a sequence of add instructions.  */

static void
ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
{
  if (count == 1
      || (count * ix86_cost->add <= ix86_cost->shift_const
	  && !optimize_insn_for_size_p ()))
    {
      while (count-- > 0)
	emit_insn (gen_add2_insn (operand, operand));
    }
  else
    {
      rtx (*insn)(rtx, rtx, rtx);

      insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
      emit_insn (insn (operand, operand, GEN_INT (count)));
    }
}
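/* Worked example (illustrative, not from the original sources): for a small
   constant shift such as operand <<= 2, when 2 * ix86_cost->add does not
   exceed ix86_cost->shift_const and we are not optimizing for size, the
   helper above emits two self-additions
     add %reg, %reg
     add %reg, %reg
   instead of one shift; otherwise it falls back to a single ashl insn.  */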
6196 ix86_split_ashl (rtx
*operands
, rtx scratch
, machine_mode mode
)
6198 rtx (*gen_ashl3
)(rtx
, rtx
, rtx
);
6199 rtx (*gen_shld
)(rtx
, rtx
, rtx
);
6200 int half_width
= GET_MODE_BITSIZE (mode
) >> 1;
6201 machine_mode half_mode
;
6203 rtx low
[2], high
[2];
6206 if (CONST_INT_P (operands
[2]))
6208 split_double_mode (mode
, operands
, 2, low
, high
);
6209 count
= INTVAL (operands
[2]) & (GET_MODE_BITSIZE (mode
) - 1);
6211 if (count
>= half_width
)
6213 emit_move_insn (high
[0], low
[1]);
6214 ix86_expand_clear (low
[0]);
6216 if (count
> half_width
)
6217 ix86_expand_ashl_const (high
[0], count
- half_width
, mode
);
6221 gen_shld
= mode
== DImode
? gen_x86_shld
: gen_x86_64_shld
;
6223 if (!rtx_equal_p (operands
[0], operands
[1]))
6224 emit_move_insn (operands
[0], operands
[1]);
6226 emit_insn (gen_shld (high
[0], low
[0], GEN_INT (count
)));
6227 ix86_expand_ashl_const (low
[0], count
, mode
);
6232 split_double_mode (mode
, operands
, 1, low
, high
);
6233 half_mode
= mode
== DImode
? SImode
: DImode
;
6235 gen_ashl3
= mode
== DImode
? gen_ashlsi3
: gen_ashldi3
;
6237 if (operands
[1] == const1_rtx
)
	  /* Assuming we've chosen QImode-capable registers, then 1 << N
	     can be done with two 32/64-bit shifts, no branches, no cmoves.  */
6241 if (ANY_QI_REG_P (low
[0]) && ANY_QI_REG_P (high
[0]))
6243 rtx s
, d
, flags
= gen_rtx_REG (CCZmode
, FLAGS_REG
);
6245 ix86_expand_clear (low
[0]);
6246 ix86_expand_clear (high
[0]);
6247 emit_insn (gen_testqi_ccz_1 (operands
[2], GEN_INT (half_width
)));
6249 d
= gen_lowpart (QImode
, low
[0]);
6250 d
= gen_rtx_STRICT_LOW_PART (VOIDmode
, d
);
6251 s
= gen_rtx_EQ (QImode
, flags
, const0_rtx
);
6252 emit_insn (gen_rtx_SET (d
, s
));
6254 d
= gen_lowpart (QImode
, high
[0]);
6255 d
= gen_rtx_STRICT_LOW_PART (VOIDmode
, d
);
6256 s
= gen_rtx_NE (QImode
, flags
, const0_rtx
);
6257 emit_insn (gen_rtx_SET (d
, s
));
6260 /* Otherwise, we can get the same results by manually performing
6261 a bit extract operation on bit 5/6, and then performing the two
6262 shifts. The two methods of getting 0/1 into low/high are exactly
6263 the same size. Avoiding the shift in the bit extract case helps
6264 pentium4 a bit; no one else seems to care much either way. */
6267 rtx (*gen_lshr3
)(rtx
, rtx
, rtx
);
6268 rtx (*gen_and3
)(rtx
, rtx
, rtx
);
6269 rtx (*gen_xor3
)(rtx
, rtx
, rtx
);
6275 gen_lshr3
= gen_lshrsi3
;
6276 gen_and3
= gen_andsi3
;
6277 gen_xor3
= gen_xorsi3
;
6282 gen_lshr3
= gen_lshrdi3
;
6283 gen_and3
= gen_anddi3
;
6284 gen_xor3
= gen_xordi3
;
6288 if (TARGET_PARTIAL_REG_STALL
&& !optimize_insn_for_size_p ())
6289 x
= gen_rtx_ZERO_EXTEND (half_mode
, operands
[2]);
6291 x
= gen_lowpart (half_mode
, operands
[2]);
6292 emit_insn (gen_rtx_SET (high
[0], x
));
6294 emit_insn (gen_lshr3 (high
[0], high
[0], GEN_INT (bits
)));
6295 emit_insn (gen_and3 (high
[0], high
[0], const1_rtx
));
6296 emit_move_insn (low
[0], high
[0]);
6297 emit_insn (gen_xor3 (low
[0], low
[0], const1_rtx
));
6300 emit_insn (gen_ashl3 (low
[0], low
[0], operands
[2]));
6301 emit_insn (gen_ashl3 (high
[0], high
[0], operands
[2]));
6305 if (operands
[1] == constm1_rtx
)
6307 /* For -1 << N, we can avoid the shld instruction, because we
6308 know that we're shifting 0...31/63 ones into a -1. */
6309 emit_move_insn (low
[0], constm1_rtx
);
6310 if (optimize_insn_for_size_p ())
6311 emit_move_insn (high
[0], low
[0]);
6313 emit_move_insn (high
[0], constm1_rtx
);
6317 gen_shld
= mode
== DImode
? gen_x86_shld
: gen_x86_64_shld
;
6319 if (!rtx_equal_p (operands
[0], operands
[1]))
6320 emit_move_insn (operands
[0], operands
[1]);
6322 split_double_mode (mode
, operands
, 1, low
, high
);
6323 emit_insn (gen_shld (high
[0], low
[0], operands
[2]));
6326 emit_insn (gen_ashl3 (low
[0], low
[0], operands
[2]));
6328 if (TARGET_CMOVE
&& scratch
)
6330 ix86_expand_clear (scratch
);
6331 emit_insn (gen_x86_shift_adj_1
6332 (half_mode
, high
[0], low
[0], operands
[2], scratch
));
6335 emit_insn (gen_x86_shift_adj_2 (half_mode
, high
[0], low
[0], operands
[2]));
void
ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
{
  rtx (*gen_ashr3)(rtx, rtx, rtx)
    = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
  rtx (*gen_shrd)(rtx, rtx, rtx);
  int half_width = GET_MODE_BITSIZE (mode) >> 1;

  rtx low[2], high[2];
  int count;

  if (CONST_INT_P (operands[2]))
    {
      split_double_mode (mode, operands, 2, low, high);
      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);

      if (count == GET_MODE_BITSIZE (mode) - 1)
	{
	  emit_move_insn (high[0], high[1]);
	  emit_insn (gen_ashr3 (high[0], high[0],
				GEN_INT (half_width - 1)));
	  emit_move_insn (low[0], high[0]);
	}
      else if (count >= half_width)
	{
	  emit_move_insn (low[0], high[1]);
	  emit_move_insn (high[0], low[0]);
	  emit_insn (gen_ashr3 (high[0], high[0],
				GEN_INT (half_width - 1)));

	  if (count > half_width)
	    emit_insn (gen_ashr3 (low[0], low[0],
				  GEN_INT (count - half_width)));
	}
      else
	{
	  gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

	  if (!rtx_equal_p (operands[0], operands[1]))
	    emit_move_insn (operands[0], operands[1]);

	  emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
	  emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
	}
    }
  else
    {
      machine_mode half_mode;

      gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

      if (!rtx_equal_p (operands[0], operands[1]))
	emit_move_insn (operands[0], operands[1]);

      split_double_mode (mode, operands, 1, low, high);
      half_mode = mode == DImode ? SImode : DImode;

      emit_insn (gen_shrd (low[0], high[0], operands[2]));
      emit_insn (gen_ashr3 (high[0], high[0], operands[2]));

      if (TARGET_CMOVE && scratch)
	{
	  emit_move_insn (scratch, high[0]);
	  emit_insn (gen_ashr3 (scratch, scratch,
				GEN_INT (half_width - 1)));
	  emit_insn (gen_x86_shift_adj_1
		     (half_mode, low[0], high[0], operands[2], scratch));
	}
      else
	emit_insn (gen_x86_shift_adj_3
		   (half_mode, low[0], high[0], operands[2]));
    }
}
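/* Worked example (illustrative sketch; register choices are hypothetical and
   not from the original sources): splitting the constant DImode shift
   x >> 40 on a 32-bit target, with x in edx:eax (high:low), takes the
   count >= half_width path above and emits roughly
     movl %edx, %eax	; low  = old high
     movl %eax, %edx	; high = old high
     sarl $31,  %edx	; high = sign bits of old high
     sarl $8,   %eax	; low >>= 40 - 32
   while x >> 63 hits the count == bitsize - 1 branch and smears the sign bit
   into both halves.  */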
void
ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
{
  rtx (*gen_lshr3)(rtx, rtx, rtx)
    = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
  rtx (*gen_shrd)(rtx, rtx, rtx);
  int half_width = GET_MODE_BITSIZE (mode) >> 1;

  rtx low[2], high[2];
  int count;

  if (CONST_INT_P (operands[2]))
    {
      split_double_mode (mode, operands, 2, low, high);
      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);

      if (count >= half_width)
	{
	  emit_move_insn (low[0], high[1]);
	  ix86_expand_clear (high[0]);

	  if (count > half_width)
	    emit_insn (gen_lshr3 (low[0], low[0],
				  GEN_INT (count - half_width)));
	}
      else
	{
	  gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

	  if (!rtx_equal_p (operands[0], operands[1]))
	    emit_move_insn (operands[0], operands[1]);

	  emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
	  emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
	}
    }
  else
    {
      machine_mode half_mode;

      gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

      if (!rtx_equal_p (operands[0], operands[1]))
	emit_move_insn (operands[0], operands[1]);

      split_double_mode (mode, operands, 1, low, high);
      half_mode = mode == DImode ? SImode : DImode;

      emit_insn (gen_shrd (low[0], high[0], operands[2]));
      emit_insn (gen_lshr3 (high[0], high[0], operands[2]));

      if (TARGET_CMOVE && scratch)
	{
	  ix86_expand_clear (scratch);
	  emit_insn (gen_x86_shift_adj_1
		     (half_mode, low[0], high[0], operands[2], scratch));
	}
      else
	emit_insn (gen_x86_shift_adj_2
		   (half_mode, low[0], high[0], operands[2]));
    }
}
/* Expand move of V1TI mode register X to a new TI mode register.  */
static rtx
ix86_expand_v1ti_to_ti (rtx x)
{
  rtx result = gen_reg_rtx (TImode);
  if (TARGET_SSE2)
    {
      rtx temp = force_reg (V2DImode, gen_lowpart (V2DImode, x));
      rtx lo = gen_lowpart (DImode, result);
      emit_insn (gen_vec_extractv2didi (lo, temp, const0_rtx));
      rtx hi = gen_highpart (DImode, result);
      emit_insn (gen_vec_extractv2didi (hi, temp, const1_rtx));
    }
  else
    emit_move_insn (result, gen_lowpart (TImode, x));
  return result;
}
/* Expand move of TI mode register X to a new V1TI mode register.  */
static rtx
ix86_expand_ti_to_v1ti (rtx x)
{
  if (TARGET_SSE2)
    {
      rtx lo = gen_lowpart (DImode, x);
      rtx hi = gen_highpart (DImode, x);
      rtx tmp = gen_reg_rtx (V2DImode);
      emit_insn (gen_vec_concatv2di (tmp, lo, hi));
      return force_reg (V1TImode, gen_lowpart (V1TImode, tmp));
    }

  return force_reg (V1TImode, gen_lowpart (V1TImode, x));
}
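/* Illustrative note (a sketch of typical output, not a statement of the
   exact instructions chosen): with SSE2 the two helpers above move a 128-bit
   value between a GPR pair and an XMM register directly, e.g. the
   vec_concatv2di path may assemble the two DImode halves with movq plus
   punpcklqdq rather than spilling through the stack; without SSE2 both
   helpers fall back to plain lowpart moves.  */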
/* Expand V1TI mode shift (of rtx_code CODE) by constant.  */
void
ix86_expand_v1ti_shift (enum rtx_code code, rtx operands[])
{
  rtx op1 = force_reg (V1TImode, operands[1]);

  if (!CONST_INT_P (operands[2]))
    {
      rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
      rtx tmp2 = gen_reg_rtx (TImode);
      rtx (*shift) (rtx, rtx, rtx)
	    = (code == ASHIFT) ? gen_ashlti3 : gen_lshrti3;
      emit_insn (shift (tmp2, tmp1, operands[2]));
      rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
      emit_move_insn (operands[0], tmp3);
      return;
    }

  HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;

  if (bits == 0)
    {
      emit_move_insn (operands[0], op1);
      return;
    }

  if ((bits & 7) == 0)
    {
      rtx tmp = gen_reg_rtx (V1TImode);
      if (code == ASHIFT)
	emit_insn (gen_sse2_ashlv1ti3 (tmp, op1, GEN_INT (bits)));
      else
	emit_insn (gen_sse2_lshrv1ti3 (tmp, op1, GEN_INT (bits)));
      emit_move_insn (operands[0], tmp);
      return;
    }

  rtx tmp1 = gen_reg_rtx (V1TImode);
  if (code == ASHIFT)
    emit_insn (gen_sse2_ashlv1ti3 (tmp1, op1, GEN_INT (64)));
  else
    emit_insn (gen_sse2_lshrv1ti3 (tmp1, op1, GEN_INT (64)));

  /* tmp2 is operands[1] shifted by 64, in V2DImode.  */
  rtx tmp2 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));

  /* tmp3 will be the V2DImode result.  */
  rtx tmp3 = gen_reg_rtx (V2DImode);

  if (bits > 64)
    {
      if (code == ASHIFT)
	emit_insn (gen_ashlv2di3 (tmp3, tmp2, GEN_INT (bits - 64)));
      else
	emit_insn (gen_lshrv2di3 (tmp3, tmp2, GEN_INT (bits - 64)));
    }
  else
    {
      /* tmp4 is operands[1], in V2DImode.  */
      rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));

      rtx tmp5 = gen_reg_rtx (V2DImode);
      if (code == ASHIFT)
	emit_insn (gen_ashlv2di3 (tmp5, tmp4, GEN_INT (bits)));
      else
	emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));

      rtx tmp6 = gen_reg_rtx (V2DImode);
      if (code == ASHIFT)
	emit_insn (gen_lshrv2di3 (tmp6, tmp2, GEN_INT (64 - bits)));
      else
	emit_insn (gen_ashlv2di3 (tmp6, tmp2, GEN_INT (64 - bits)));

      emit_insn (gen_iorv2di3 (tmp3, tmp5, tmp6));
    }

  /* Convert the result back to V1TImode and store in operands[0].  */
  rtx tmp7 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
  emit_move_insn (operands[0], tmp7);
}
6591 /* Expand V1TI mode rotate (of rtx_code CODE) by constant. */
6593 ix86_expand_v1ti_rotate (enum rtx_code code
, rtx operands
[])
6595 rtx op1
= force_reg (V1TImode
, operands
[1]);
6597 if (!CONST_INT_P (operands
[2]))
6599 rtx tmp1
= ix86_expand_v1ti_to_ti (op1
);
6600 rtx tmp2
= gen_reg_rtx (TImode
);
6601 rtx (*rotate
) (rtx
, rtx
, rtx
)
6602 = (code
== ROTATE
) ? gen_rotlti3
: gen_rotrti3
;
6603 emit_insn (rotate (tmp2
, tmp1
, operands
[2]));
6604 rtx tmp3
= ix86_expand_ti_to_v1ti (tmp2
);
6605 emit_move_insn (operands
[0], tmp3
);
6609 HOST_WIDE_INT bits
= INTVAL (operands
[2]) & 127;
6613 emit_move_insn (operands
[0], op1
);
6617 if (code
== ROTATERT
)
6620 if ((bits
& 31) == 0)
6622 rtx tmp2
= gen_reg_rtx (V4SImode
);
6623 rtx tmp1
= force_reg (V4SImode
, gen_lowpart (V4SImode
, op1
));
6625 emit_insn (gen_sse2_pshufd (tmp2
, tmp1
, GEN_INT (0x93)));
6626 else if (bits
== 64)
6627 emit_insn (gen_sse2_pshufd (tmp2
, tmp1
, GEN_INT (0x4e)));
6629 emit_insn (gen_sse2_pshufd (tmp2
, tmp1
, GEN_INT (0x39)));
6630 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp2
));
6634 if ((bits
& 7) == 0)
6636 rtx tmp1
= gen_reg_rtx (V1TImode
);
6637 rtx tmp2
= gen_reg_rtx (V1TImode
);
6638 rtx tmp3
= gen_reg_rtx (V1TImode
);
6640 emit_insn (gen_sse2_ashlv1ti3 (tmp1
, op1
, GEN_INT (bits
)));
6641 emit_insn (gen_sse2_lshrv1ti3 (tmp2
, op1
, GEN_INT (128 - bits
)));
6642 emit_insn (gen_iorv1ti3 (tmp3
, tmp1
, tmp2
));
6643 emit_move_insn (operands
[0], tmp3
);
6647 rtx op1_v4si
= force_reg (V4SImode
, gen_lowpart (V4SImode
, op1
));
6656 hibits
= gen_reg_rtx (V4SImode
);
6657 emit_insn (gen_sse2_pshufd (hibits
, op1_v4si
, GEN_INT (0x93)));
6661 lobits
= gen_reg_rtx (V4SImode
);
6662 hibits
= gen_reg_rtx (V4SImode
);
6663 emit_insn (gen_sse2_pshufd (lobits
, op1_v4si
, GEN_INT (0x93)));
6664 emit_insn (gen_sse2_pshufd (hibits
, op1_v4si
, GEN_INT (0x4e)));
6668 lobits
= gen_reg_rtx (V4SImode
);
6669 hibits
= gen_reg_rtx (V4SImode
);
6670 emit_insn (gen_sse2_pshufd (lobits
, op1_v4si
, GEN_INT (0x4e)));
6671 emit_insn (gen_sse2_pshufd (hibits
, op1_v4si
, GEN_INT (0x39)));
6675 lobits
= gen_reg_rtx (V4SImode
);
6676 emit_insn (gen_sse2_pshufd (lobits
, op1_v4si
, GEN_INT (0x39)));
6681 rtx tmp1
= gen_reg_rtx (V4SImode
);
6682 rtx tmp2
= gen_reg_rtx (V4SImode
);
6683 rtx tmp3
= gen_reg_rtx (V4SImode
);
6685 emit_insn (gen_ashlv4si3 (tmp1
, lobits
, GEN_INT (bits
& 31)));
6686 emit_insn (gen_lshrv4si3 (tmp2
, hibits
, GEN_INT (32 - (bits
& 31))));
6687 emit_insn (gen_iorv4si3 (tmp3
, tmp1
, tmp2
));
6689 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp3
));
6692 /* Expand V1TI mode ashiftrt by constant. */
6694 ix86_expand_v1ti_ashiftrt (rtx operands
[])
6696 rtx op1
= force_reg (V1TImode
, operands
[1]);
6698 if (!CONST_INT_P (operands
[2]))
6700 rtx tmp1
= ix86_expand_v1ti_to_ti (op1
);
6701 rtx tmp2
= gen_reg_rtx (TImode
);
6702 emit_insn (gen_ashrti3 (tmp2
, tmp1
, operands
[2]));
6703 rtx tmp3
= ix86_expand_ti_to_v1ti (tmp2
);
6704 emit_move_insn (operands
[0], tmp3
);
6708 HOST_WIDE_INT bits
= INTVAL (operands
[2]) & 127;
6712 emit_move_insn (operands
[0], op1
);
6718 /* Two operations. */
6719 rtx tmp1
= force_reg(V4SImode
, gen_lowpart (V4SImode
, op1
));
6720 rtx tmp2
= gen_reg_rtx (V4SImode
);
6721 emit_insn (gen_sse2_pshufd (tmp2
, tmp1
, GEN_INT (0xff)));
6723 rtx tmp3
= gen_reg_rtx (V4SImode
);
6724 emit_insn (gen_ashrv4si3 (tmp3
, tmp2
, GEN_INT (31)));
6726 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp3
));
6732 /* Three operations. */
6733 rtx tmp1
= force_reg(V4SImode
, gen_lowpart (V4SImode
, op1
));
6734 rtx tmp2
= gen_reg_rtx (V4SImode
);
6735 emit_insn (gen_sse2_pshufd (tmp2
, tmp1
, GEN_INT (0xff)));
6737 rtx tmp3
= gen_reg_rtx (V4SImode
);
6738 emit_insn (gen_ashrv4si3 (tmp3
, tmp2
, GEN_INT (31)));
6740 rtx tmp4
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp1
));
6741 rtx tmp5
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp3
));
6742 rtx tmp6
= gen_reg_rtx (V2DImode
);
6743 emit_insn (gen_vec_interleave_highv2di (tmp6
, tmp4
, tmp5
));
6745 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp6
));
6751 /* Three operations. */
6752 rtx tmp1
= force_reg(V4SImode
, gen_lowpart (V4SImode
, op1
));
6753 rtx tmp2
= gen_reg_rtx (V4SImode
);
6754 emit_insn (gen_ashrv4si3 (tmp2
, tmp1
, GEN_INT (31)));
6756 rtx tmp3
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp1
));
6757 rtx tmp4
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp2
));
6758 rtx tmp5
= gen_reg_rtx (V2DImode
);
6759 emit_insn (gen_vec_interleave_highv2di (tmp5
, tmp3
, tmp4
));
6761 rtx tmp6
= force_reg(V4SImode
, gen_lowpart (V4SImode
, tmp5
));
6762 rtx tmp7
= gen_reg_rtx (V4SImode
);
6763 emit_insn (gen_sse2_pshufd (tmp7
, tmp6
, GEN_INT (0xfd)));
6765 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp7
));
6771 /* Three operations. */
6772 rtx tmp1
= force_reg (V4SImode
, gen_lowpart (V4SImode
, op1
));
6773 rtx tmp2
= gen_reg_rtx (V4SImode
);
6774 emit_insn (gen_ashrv4si3 (tmp2
, tmp1
, GEN_INT (bits
- 96)));
6776 rtx tmp3
= force_reg (V8HImode
, gen_lowpart (V8HImode
, tmp2
));
6777 rtx tmp4
= gen_reg_rtx (V8HImode
);
6778 emit_insn (gen_sse2_pshufhw (tmp4
, tmp3
, GEN_INT (0xfe)));
6780 rtx tmp5
= force_reg (V4SImode
, gen_lowpart (V4SImode
, tmp4
));
6781 rtx tmp6
= gen_reg_rtx (V4SImode
);
6782 emit_insn (gen_sse2_pshufd (tmp6
, tmp5
, GEN_INT (0xfe)));
6784 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp6
));
6788 if (TARGET_AVX2
|| TARGET_SSE4_1
)
6790 /* Three operations. */
6793 rtx tmp1
= force_reg (V4SImode
, gen_lowpart (V4SImode
, op1
));
6794 rtx tmp2
= gen_reg_rtx (V4SImode
);
6795 emit_insn (gen_ashrv4si3 (tmp2
, tmp1
, GEN_INT (31)));
6797 rtx tmp3
= gen_reg_rtx (V1TImode
);
6798 emit_insn (gen_sse2_lshrv1ti3 (tmp3
, op1
, GEN_INT (32)));
6802 rtx tmp4
= force_reg (V4SImode
, gen_lowpart (V4SImode
, tmp3
));
6803 rtx tmp5
= gen_reg_rtx (V4SImode
);
6804 emit_insn (gen_avx2_pblenddv4si (tmp5
, tmp2
, tmp4
,
6807 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp5
));
6811 rtx tmp4
= force_reg (V8HImode
, gen_lowpart (V8HImode
, tmp2
));
6812 rtx tmp5
= force_reg (V8HImode
, gen_lowpart (V8HImode
, tmp3
));
6813 rtx tmp6
= gen_reg_rtx (V8HImode
);
6814 emit_insn (gen_sse4_1_pblendw (tmp6
, tmp4
, tmp5
,
6817 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp6
));
6822 /* Three operations. */
6823 if (bits
== 8 || bits
== 16 || bits
== 24)
6825 rtx tmp1
= force_reg (V4SImode
, gen_lowpart (V4SImode
, op1
));
6826 rtx tmp2
= gen_reg_rtx (V4SImode
);
6827 emit_insn (gen_ashrv4si3 (tmp2
, tmp1
, GEN_INT (bits
)));
6829 rtx tmp3
= gen_reg_rtx (V1TImode
);
6830 emit_insn (gen_sse2_lshrv1ti3 (tmp3
, op1
, GEN_INT (bits
)));
6834 rtx tmp4
= force_reg (V4SImode
, gen_lowpart (V4SImode
, tmp3
));
6835 rtx tmp5
= gen_reg_rtx (V4SImode
);
6836 emit_insn (gen_avx2_pblenddv4si (tmp5
, tmp2
, tmp4
,
6839 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp5
));
6843 rtx tmp4
= force_reg (V8HImode
, gen_lowpart (V8HImode
, tmp2
));
6844 rtx tmp5
= force_reg (V8HImode
, gen_lowpart (V8HImode
, tmp3
));
6845 rtx tmp6
= gen_reg_rtx (V8HImode
);
6846 emit_insn (gen_sse4_1_pblendw (tmp6
, tmp4
, tmp5
,
6849 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp6
));
6857 /* Four operations. */
6858 rtx tmp1
= force_reg (V4SImode
, gen_lowpart (V4SImode
, op1
));
6859 rtx tmp2
= gen_reg_rtx (V4SImode
);
6860 emit_insn (gen_ashrv4si3 (tmp2
, tmp1
, GEN_INT (bits
- 96)));
6862 rtx tmp3
= gen_reg_rtx (V4SImode
);
6863 emit_insn (gen_ashrv4si3 (tmp3
, tmp1
, GEN_INT (31)));
6865 rtx tmp4
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp2
));
6866 rtx tmp5
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp3
));
6867 rtx tmp6
= gen_reg_rtx (V2DImode
);
6868 emit_insn (gen_vec_interleave_highv2di (tmp6
, tmp4
, tmp5
));
6870 rtx tmp7
= force_reg (V4SImode
, gen_lowpart (V4SImode
, tmp6
));
6871 rtx tmp8
= gen_reg_rtx (V4SImode
);
6872 emit_insn (gen_sse2_pshufd (tmp8
, tmp7
, GEN_INT (0xfd)));
6874 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp8
));
6878 if (TARGET_SSE4_1
&& (bits
== 48 || bits
== 80))
6880 /* Four operations. */
6881 rtx tmp1
= force_reg (V4SImode
, gen_lowpart (V4SImode
, op1
));
6882 rtx tmp2
= gen_reg_rtx (V4SImode
);
6883 emit_insn (gen_sse2_pshufd (tmp2
, tmp1
, GEN_INT (0xff)));
6885 rtx tmp3
= gen_reg_rtx (V4SImode
);
6886 emit_insn (gen_ashrv4si3 (tmp3
, tmp2
, GEN_INT (31)));
6888 rtx tmp4
= gen_reg_rtx (V1TImode
);
6889 emit_insn (gen_sse2_lshrv1ti3 (tmp4
, op1
, GEN_INT (bits
)));
6891 rtx tmp5
= force_reg (V8HImode
, gen_lowpart (V8HImode
, tmp3
));
6892 rtx tmp6
= force_reg (V8HImode
, gen_lowpart (V8HImode
, tmp4
));
6893 rtx tmp7
= gen_reg_rtx (V8HImode
);
6894 emit_insn (gen_sse4_1_pblendw (tmp7
, tmp5
, tmp6
,
6895 GEN_INT (bits
== 48 ? 0x1f : 0x07)));
6897 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp7
));
6901 if ((bits
& 7) == 0)
6903 /* Five operations. */
6904 rtx tmp1
= force_reg (V4SImode
, gen_lowpart (V4SImode
, op1
));
6905 rtx tmp2
= gen_reg_rtx (V4SImode
);
6906 emit_insn (gen_sse2_pshufd (tmp2
, tmp1
, GEN_INT (0xff)));
6908 rtx tmp3
= gen_reg_rtx (V4SImode
);
6909 emit_insn (gen_ashrv4si3 (tmp3
, tmp2
, GEN_INT (31)));
6911 rtx tmp4
= gen_reg_rtx (V1TImode
);
6912 emit_insn (gen_sse2_lshrv1ti3 (tmp4
, op1
, GEN_INT (bits
)));
6914 rtx tmp5
= force_reg (V1TImode
, gen_lowpart (V1TImode
, tmp3
));
6915 rtx tmp6
= gen_reg_rtx (V1TImode
);
6916 emit_insn (gen_sse2_ashlv1ti3 (tmp6
, tmp5
, GEN_INT (128 - bits
)));
6918 rtx tmp7
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp4
));
6919 rtx tmp8
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp6
));
6920 rtx tmp9
= gen_reg_rtx (V2DImode
);
6921 emit_insn (gen_iorv2di3 (tmp9
, tmp7
, tmp8
));
6923 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp9
));
6927 if (TARGET_AVX2
&& bits
< 32)
6929 /* Six operations. */
6930 rtx tmp1
= force_reg (V4SImode
, gen_lowpart (V4SImode
, op1
));
6931 rtx tmp2
= gen_reg_rtx (V4SImode
);
6932 emit_insn (gen_ashrv4si3 (tmp2
, tmp1
, GEN_INT (bits
)));
6934 rtx tmp3
= gen_reg_rtx (V1TImode
);
6935 emit_insn (gen_sse2_lshrv1ti3 (tmp3
, op1
, GEN_INT (64)));
6937 rtx tmp4
= force_reg (V2DImode
, gen_lowpart (V2DImode
, op1
));
6938 rtx tmp5
= gen_reg_rtx (V2DImode
);
6939 emit_insn (gen_lshrv2di3 (tmp5
, tmp4
, GEN_INT (bits
)));
6941 rtx tmp6
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp3
));
6942 rtx tmp7
= gen_reg_rtx (V2DImode
);
6943 emit_insn (gen_ashlv2di3 (tmp7
, tmp6
, GEN_INT (64 - bits
)));
6945 rtx tmp8
= gen_reg_rtx (V2DImode
);
6946 emit_insn (gen_iorv2di3 (tmp8
, tmp5
, tmp7
));
6948 rtx tmp9
= force_reg (V4SImode
, gen_lowpart (V4SImode
, tmp8
));
6949 rtx tmp10
= gen_reg_rtx (V4SImode
);
6950 emit_insn (gen_avx2_pblenddv4si (tmp10
, tmp2
, tmp9
, GEN_INT (7)));
6952 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp10
));
6956 if (TARGET_SSE4_1
&& bits
< 15)
6958 /* Six operations. */
6959 rtx tmp1
= force_reg (V4SImode
, gen_lowpart (V4SImode
, op1
));
6960 rtx tmp2
= gen_reg_rtx (V4SImode
);
6961 emit_insn (gen_ashrv4si3 (tmp2
, tmp1
, GEN_INT (bits
)));
6963 rtx tmp3
= gen_reg_rtx (V1TImode
);
6964 emit_insn (gen_sse2_lshrv1ti3 (tmp3
, op1
, GEN_INT (64)));
6966 rtx tmp4
= force_reg (V2DImode
, gen_lowpart (V2DImode
, op1
));
6967 rtx tmp5
= gen_reg_rtx (V2DImode
);
6968 emit_insn (gen_lshrv2di3 (tmp5
, tmp4
, GEN_INT (bits
)));
6970 rtx tmp6
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp3
));
6971 rtx tmp7
= gen_reg_rtx (V2DImode
);
6972 emit_insn (gen_ashlv2di3 (tmp7
, tmp6
, GEN_INT (64 - bits
)));
6974 rtx tmp8
= gen_reg_rtx (V2DImode
);
6975 emit_insn (gen_iorv2di3 (tmp8
, tmp5
, tmp7
));
6977 rtx tmp9
= force_reg (V8HImode
, gen_lowpart (V8HImode
, tmp2
));
6978 rtx tmp10
= force_reg (V8HImode
, gen_lowpart (V8HImode
, tmp8
));
6979 rtx tmp11
= gen_reg_rtx (V8HImode
);
6980 emit_insn (gen_sse4_1_pblendw (tmp11
, tmp9
, tmp10
, GEN_INT (0x3f)));
6982 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp11
));
6988 /* Eight operations. */
6989 rtx tmp1
= gen_reg_rtx (V1TImode
);
6990 emit_insn (gen_sse2_lshrv1ti3 (tmp1
, op1
, GEN_INT (64)));
6992 rtx tmp2
= force_reg (V2DImode
, gen_lowpart (V2DImode
, op1
));
6993 rtx tmp3
= gen_reg_rtx (V2DImode
);
6994 emit_insn (gen_lshrv2di3 (tmp3
, tmp2
, GEN_INT (1)));
6996 rtx tmp4
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp1
));
6997 rtx tmp5
= gen_reg_rtx (V2DImode
);
6998 emit_insn (gen_ashlv2di3 (tmp5
, tmp4
, GEN_INT (63)));
7000 rtx tmp6
= gen_reg_rtx (V2DImode
);
7001 emit_insn (gen_iorv2di3 (tmp6
, tmp3
, tmp5
));
7003 rtx tmp7
= gen_reg_rtx (V2DImode
);
7004 emit_insn (gen_lshrv2di3 (tmp7
, tmp2
, GEN_INT (63)));
7006 rtx tmp8
= force_reg (V4SImode
, gen_lowpart (V4SImode
, tmp7
));
7007 rtx tmp9
= gen_reg_rtx (V4SImode
);
7008 emit_insn (gen_sse2_pshufd (tmp9
, tmp8
, GEN_INT (0xbf)));
7010 rtx tmp10
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp9
));
7011 rtx tmp11
= gen_reg_rtx (V2DImode
);
7012 emit_insn (gen_ashlv2di3 (tmp11
, tmp10
, GEN_INT (31)));
7014 rtx tmp12
= gen_reg_rtx (V2DImode
);
7015 emit_insn (gen_iorv2di3 (tmp12
, tmp6
, tmp11
));
7017 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp12
));
7023 /* Eight operations. */
7024 rtx tmp1
= force_reg (V4SImode
, gen_lowpart (V4SImode
, op1
));
7025 rtx tmp2
= gen_reg_rtx (V4SImode
);
7026 emit_insn (gen_sse2_pshufd (tmp2
, tmp1
, GEN_INT (0xff)));
7028 rtx tmp3
= gen_reg_rtx (V4SImode
);
7029 emit_insn (gen_ashrv4si3 (tmp3
, tmp2
, GEN_INT (31)));
7031 rtx tmp4
= gen_reg_rtx (V1TImode
);
7032 emit_insn (gen_sse2_lshrv1ti3 (tmp4
, op1
, GEN_INT (64)));
7034 rtx tmp5
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp4
));
7035 rtx tmp6
= gen_reg_rtx (V2DImode
);
7036 emit_insn (gen_lshrv2di3 (tmp6
, tmp5
, GEN_INT (bits
- 64)));
7038 rtx tmp7
= force_reg (V1TImode
, gen_lowpart (V1TImode
, tmp3
));
7039 rtx tmp8
= gen_reg_rtx (V1TImode
);
7040 emit_insn (gen_sse2_ashlv1ti3 (tmp8
, tmp7
, GEN_INT (64)));
7042 rtx tmp9
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp3
));
7043 rtx tmp10
= gen_reg_rtx (V2DImode
);
7044 emit_insn (gen_ashlv2di3 (tmp10
, tmp9
, GEN_INT (128 - bits
)));
7046 rtx tmp11
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp8
));
7047 rtx tmp12
= gen_reg_rtx (V2DImode
);
7048 emit_insn (gen_iorv2di3 (tmp12
, tmp10
, tmp11
));
7050 rtx tmp13
= gen_reg_rtx (V2DImode
);
7051 emit_insn (gen_iorv2di3 (tmp13
, tmp6
, tmp12
));
7053 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp13
));
7057 /* Nine operations. */
7058 rtx tmp1
= force_reg (V4SImode
, gen_lowpart (V4SImode
, op1
));
7059 rtx tmp2
= gen_reg_rtx (V4SImode
);
7060 emit_insn (gen_sse2_pshufd (tmp2
, tmp1
, GEN_INT (0xff)));
7062 rtx tmp3
= gen_reg_rtx (V4SImode
);
7063 emit_insn (gen_ashrv4si3 (tmp3
, tmp2
, GEN_INT (31)));
7065 rtx tmp4
= gen_reg_rtx (V1TImode
);
7066 emit_insn (gen_sse2_lshrv1ti3 (tmp4
, op1
, GEN_INT (64)));
7068 rtx tmp5
= force_reg (V2DImode
, gen_lowpart (V2DImode
, op1
));
7069 rtx tmp6
= gen_reg_rtx (V2DImode
);
7070 emit_insn (gen_lshrv2di3 (tmp6
, tmp5
, GEN_INT (bits
)));
7072 rtx tmp7
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp4
));
7073 rtx tmp8
= gen_reg_rtx (V2DImode
);
7074 emit_insn (gen_ashlv2di3 (tmp8
, tmp7
, GEN_INT (64 - bits
)));
7076 rtx tmp9
= gen_reg_rtx (V2DImode
);
7077 emit_insn (gen_iorv2di3 (tmp9
, tmp6
, tmp8
));
7079 rtx tmp10
= force_reg (V1TImode
, gen_lowpart (V1TImode
, tmp3
));
7080 rtx tmp11
= gen_reg_rtx (V1TImode
);
7081 emit_insn (gen_sse2_ashlv1ti3 (tmp11
, tmp10
, GEN_INT (64)));
7083 rtx tmp12
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp11
));
7084 rtx tmp13
= gen_reg_rtx (V2DImode
);
7085 emit_insn (gen_ashlv2di3 (tmp13
, tmp12
, GEN_INT (64 - bits
)));
7087 rtx tmp14
= gen_reg_rtx (V2DImode
);
7088 emit_insn (gen_iorv2di3 (tmp14
, tmp9
, tmp13
));
7090 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp14
));
/* Return mode for the memcpy/memset loop counter.  Prefer SImode over
   DImode for constant loop counts.  */
static machine_mode
counter_mode (rtx count_exp)
{
  if (GET_MODE (count_exp) != VOIDmode)
    return GET_MODE (count_exp);
  if (!CONST_INT_P (count_exp))
    return Pmode;
  if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
    return DImode;
  return SImode;
}
/* When ISSETMEM is FALSE, output a simple loop to copy memory from SRCPTR
   to DESTPTR via chunks of MODE unrolled UNROLL times; the overall size is
   COUNT specified in bytes.  When ISSETMEM is TRUE, output the equivalent
   loop to set memory by VALUE (supposed to be in MODE).

   The size is rounded down to a whole number of chunks moved at once.
   SRCMEM and DESTMEM provide MEM rtx to feed proper aliasing info.  */
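/* Rough C-level sketch of the emitted loop for the copy case (names are
   descriptive only, not identifiers from the sources):

     size = count & ~(piece_size * unroll - 1);
     iter = 0;
     do
       {
	 copy (or set) piece_size * unroll bytes at destptr + iter;
	 iter += piece_size * unroll;
       }
     while (iter < size);

   afterwards DESTPTR (and SRCPTR) are advanced by ITER so the caller can
   handle the remaining tail bytes.  */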
7119 expand_set_or_cpymem_via_loop (rtx destmem
, rtx srcmem
,
7120 rtx destptr
, rtx srcptr
, rtx value
,
7121 rtx count
, machine_mode mode
, int unroll
,
7122 int expected_size
, bool issetmem
)
7124 rtx_code_label
*out_label
, *top_label
;
7126 machine_mode iter_mode
= counter_mode (count
);
7127 int piece_size_n
= GET_MODE_SIZE (mode
) * unroll
;
7128 rtx piece_size
= GEN_INT (piece_size_n
);
7129 rtx piece_size_mask
= GEN_INT (~((GET_MODE_SIZE (mode
) * unroll
) - 1));
7133 top_label
= gen_label_rtx ();
7134 out_label
= gen_label_rtx ();
7135 iter
= gen_reg_rtx (iter_mode
);
7137 size
= expand_simple_binop (iter_mode
, AND
, count
, piece_size_mask
,
7138 NULL
, 1, OPTAB_DIRECT
);
7139 /* Those two should combine. */
7140 if (piece_size
== const1_rtx
)
7142 emit_cmp_and_jump_insns (size
, const0_rtx
, EQ
, NULL_RTX
, iter_mode
,
7144 predict_jump (REG_BR_PROB_BASE
* 10 / 100);
7146 emit_move_insn (iter
, const0_rtx
);
7148 emit_label (top_label
);
7150 tmp
= convert_modes (Pmode
, iter_mode
, iter
, true);
7152 /* This assert could be relaxed - in this case we'll need to compute
7153 smallest power of two, containing in PIECE_SIZE_N and pass it to
7155 gcc_assert ((piece_size_n
& (piece_size_n
- 1)) == 0);
7156 destmem
= offset_address (destmem
, tmp
, piece_size_n
);
7157 destmem
= adjust_address (destmem
, mode
, 0);
7161 srcmem
= offset_address (srcmem
, copy_rtx (tmp
), piece_size_n
);
7162 srcmem
= adjust_address (srcmem
, mode
, 0);
7164 /* When unrolling for chips that reorder memory reads and writes,
7165 we can save registers by using single temporary.
7166 Also using 4 temporaries is overkill in 32bit mode. */
7167 if (!TARGET_64BIT
&& 0)
7169 for (i
= 0; i
< unroll
; i
++)
7173 destmem
= adjust_address (copy_rtx (destmem
), mode
,
7174 GET_MODE_SIZE (mode
));
7175 srcmem
= adjust_address (copy_rtx (srcmem
), mode
,
7176 GET_MODE_SIZE (mode
));
7178 emit_move_insn (destmem
, srcmem
);
7184 gcc_assert (unroll
<= 4);
7185 for (i
= 0; i
< unroll
; i
++)
7187 tmpreg
[i
] = gen_reg_rtx (mode
);
7189 srcmem
= adjust_address (copy_rtx (srcmem
), mode
,
7190 GET_MODE_SIZE (mode
));
7191 emit_move_insn (tmpreg
[i
], srcmem
);
7193 for (i
= 0; i
< unroll
; i
++)
7196 destmem
= adjust_address (copy_rtx (destmem
), mode
,
7197 GET_MODE_SIZE (mode
));
7198 emit_move_insn (destmem
, tmpreg
[i
]);
7203 for (i
= 0; i
< unroll
; i
++)
7206 destmem
= adjust_address (copy_rtx (destmem
), mode
,
7207 GET_MODE_SIZE (mode
));
7208 emit_move_insn (destmem
, value
);
7211 tmp
= expand_simple_binop (iter_mode
, PLUS
, iter
, piece_size
, iter
,
7212 true, OPTAB_LIB_WIDEN
);
7214 emit_move_insn (iter
, tmp
);
7216 emit_cmp_and_jump_insns (iter
, size
, LT
, NULL_RTX
, iter_mode
,
7218 if (expected_size
!= -1)
7220 expected_size
/= GET_MODE_SIZE (mode
) * unroll
;
7221 if (expected_size
== 0)
7223 else if (expected_size
> REG_BR_PROB_BASE
)
7224 predict_jump (REG_BR_PROB_BASE
- 1);
7226 predict_jump (REG_BR_PROB_BASE
- (REG_BR_PROB_BASE
+ expected_size
/ 2)
7230 predict_jump (REG_BR_PROB_BASE
* 80 / 100);
7231 iter
= ix86_zero_extend_to_Pmode (iter
);
7232 tmp
= expand_simple_binop (Pmode
, PLUS
, destptr
, iter
, destptr
,
7233 true, OPTAB_LIB_WIDEN
);
7235 emit_move_insn (destptr
, tmp
);
7238 tmp
= expand_simple_binop (Pmode
, PLUS
, srcptr
, iter
, srcptr
,
7239 true, OPTAB_LIB_WIDEN
);
7241 emit_move_insn (srcptr
, tmp
);
7243 emit_label (out_label
);
/* Divide COUNTREG by SCALE.  */
static rtx
scale_counter (rtx countreg, int scale)
{
  rtx sc;

  if (scale == 1)
    return countreg;
  if (CONST_INT_P (countreg))
    return GEN_INT (INTVAL (countreg) / scale);
  gcc_assert (REG_P (countreg));

  sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
			    GEN_INT (exact_log2 (scale)),
			    NULL, 1, OPTAB_DIRECT);
  return sc;
}
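/* Worked example (illustrative only): "rep movl" moves 4 bytes per
   iteration, so a constant byte count of 64 is scaled to GEN_INT (16),
   while a non-constant byte count is shifted right by
   exact_log2 (4) == 2 at run time.  */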
/* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
   When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
   When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
   For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
   ORIG_VALUE is the original value passed to memset to fill the memory with.
   Other arguments have same meaning as for previous function.  */
static void
expand_set_or_cpymem_via_rep (rtx destmem, rtx srcmem,
			      rtx destptr, rtx srcptr, rtx value, rtx orig_value,
			      rtx count,
			      machine_mode mode, bool issetmem)
{
  rtx countreg;
  rtx destexp;
  rtx srcexp;
  HOST_WIDE_INT rounded_count;

  /* If possible, it is shorter to use rep movs.
     TODO: Maybe it is better to move this logic to decide_alg.  */
  if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
      && !TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
      && (!issetmem || orig_value == const0_rtx))
    mode = SImode;

  if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
    destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);

  countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
							GET_MODE_SIZE (mode)));
  if (mode != QImode)
    {
      destexp = gen_rtx_ASHIFT (Pmode, countreg,
				GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
      destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
    }
  else
    destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
  if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
    {
      rounded_count
	= ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
      destmem = shallow_copy_rtx (destmem);
      set_mem_size (destmem, rounded_count);
    }
  else if (MEM_SIZE_KNOWN_P (destmem))
    clear_mem_size (destmem);

  if (issetmem)
    {
      value = force_reg (mode, gen_lowpart (mode, value));
      emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
    }
  else
    {
      if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
	srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
      if (mode != QImode)
	{
	  srcexp = gen_rtx_ASHIFT (Pmode, countreg,
				   GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
	  srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
	}
      else
	srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
      if (CONST_INT_P (count))
	{
	  rounded_count
	    = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
	  srcmem = shallow_copy_rtx (srcmem);
	  set_mem_size (srcmem, rounded_count);
	}
      else if (MEM_SIZE_KNOWN_P (srcmem))
	clear_mem_size (srcmem);
      emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
			      destexp, srcexp));
    }
}
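/* Illustrative note (an assumption about the generated RTL, not a quote of
   it): DESTEXP/SRCEXP describe the pointer values left behind by the "rep"
   insn, i.e. destptr + (countreg << log2 (piece size)) for the 4/8-byte
   variants and destptr + countreg for the byte variant, so the MEM size and
   aliasing info stay consistent with what the string insn actually writes.  */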
/* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
   DESTMEM.
   SRC is passed by pointer to be updated on return.
   Return value is updated DST.  */
static rtx
emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
	     HOST_WIDE_INT size_to_move)
{
  rtx dst = destmem, src = *srcmem, tempreg;
  enum insn_code code;
  machine_mode move_mode;
  int piece_size, i;

  /* Find the widest mode in which we could perform moves.
     Start with the biggest power of 2 less than SIZE_TO_MOVE and half
     it until move of such size is supported.  */
  piece_size = 1 << floor_log2 (size_to_move);
  while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
	 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
    {
      gcc_assert (piece_size > 1);
      piece_size >>= 1;
    }

  /* Find the corresponding vector mode with the same size as MOVE_MODE.
     MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.).  */
  if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
    {
      int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
      if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
	  || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
	{
	  move_mode = word_mode;
	  piece_size = GET_MODE_SIZE (move_mode);
	  code = optab_handler (mov_optab, move_mode);
	}
    }
  gcc_assert (code != CODE_FOR_nothing);

  dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
  src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);

  /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZES moves.  */
  gcc_assert (size_to_move % piece_size == 0);

  for (i = 0; i < size_to_move; i += piece_size)
    {
      /* We move from memory to memory, so we'll need to do it via
	 a temporary register.  */
      tempreg = gen_reg_rtx (move_mode);
      emit_insn (GEN_FCN (code) (tempreg, src));
      emit_insn (GEN_FCN (code) (dst, tempreg));

      emit_move_insn (destptr,
		      plus_constant (Pmode, copy_rtx (destptr), piece_size));
      emit_move_insn (srcptr,
		      plus_constant (Pmode, copy_rtx (srcptr), piece_size));

      dst = adjust_automodify_address_nv (dst, move_mode, destptr,
					  piece_size);
      src = adjust_automodify_address_nv (src, move_mode, srcptr,
					  piece_size);
    }

  /* Update DST and SRC rtx.  */
  *srcmem = src;
  return dst;
}
/* Helper function for the string operations below.  Test VARIABLE whether
   it is aligned to VALUE bytes.  If true, jump to the label.  */
static rtx_code_label *
ix86_expand_aligntest (rtx variable, int value, bool epilogue)
{
  rtx_code_label *label = gen_label_rtx ();
  rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
  if (GET_MODE (variable) == DImode)
    emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
  else
    emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
  emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
			   1, label);
  if (epilogue)
    predict_jump (REG_BR_PROB_BASE * 50 / 100);
  else
    predict_jump (REG_BR_PROB_BASE * 90 / 100);
  return label;
}
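/* Worked example (illustrative only): ix86_expand_aligntest (count, 4, true)
   emits "tmp = count & 4; if (tmp == 0) goto label;", so the code the caller
   emits after the call runs only when bit 2 of COUNT is set, and the caller
   later places LABEL right behind that code.  */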
/* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST.  */
expand_cpymem_epilogue (rtx destmem, rtx srcmem,
			rtx destptr, rtx srcptr, rtx count, int max_size)
  if (CONST_INT_P (count))
      HOST_WIDE_INT countval = INTVAL (count);
      HOST_WIDE_INT epilogue_size = countval % max_size;

      /* For now MAX_SIZE should be a power of 2.  This assert could be
	 relaxed, but it'll require a bit more complicated epilogue
	 ...  */
      gcc_assert ((max_size & (max_size - 1)) == 0);
      for (i = max_size; i >= 1; i >>= 1)
	if (epilogue_size & i)
	  destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);

  count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
			       count, 1, OPTAB_DIRECT);
  expand_set_or_cpymem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
				 count, QImode, 1, 4, false);

  /* When there are stringops, we can cheaply increase dest and src pointers.
     Otherwise we save code size by maintaining offset (zero is readily
     available from preceding rep operation) and using x86 addressing modes.  */
  if (TARGET_SINGLE_STRINGOP)
      rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
      src = change_address (srcmem, SImode, srcptr);
      dest = change_address (destmem, SImode, destptr);
      emit_insn (gen_strmov (destptr, dest, srcptr, src));
      LABEL_NUSES (label) = 1;

      rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
      src = change_address (srcmem, HImode, srcptr);
      dest = change_address (destmem, HImode, destptr);
      emit_insn (gen_strmov (destptr, dest, srcptr, src));
      LABEL_NUSES (label) = 1;

      rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
      src = change_address (srcmem, QImode, srcptr);
      dest = change_address (destmem, QImode, destptr);
      emit_insn (gen_strmov (destptr, dest, srcptr, src));
      LABEL_NUSES (label) = 1;

      rtx offset = force_reg (Pmode, const0_rtx);

      rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
      src = change_address (srcmem, SImode, srcptr);
      dest = change_address (destmem, SImode, destptr);
      emit_move_insn (dest, src);
      tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
				 true, OPTAB_LIB_WIDEN);
      emit_move_insn (offset, tmp);
      LABEL_NUSES (label) = 1;

      rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
      tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
      src = change_address (srcmem, HImode, tmp);
      tmp = gen_rtx_PLUS (Pmode, destptr, offset);
      dest = change_address (destmem, HImode, tmp);
      emit_move_insn (dest, src);
      tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
				 true, OPTAB_LIB_WIDEN);
      emit_move_insn (offset, tmp);
      LABEL_NUSES (label) = 1;

      rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
      tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
      src = change_address (srcmem, QImode, tmp);
      tmp = gen_rtx_PLUS (Pmode, destptr, offset);
      dest = change_address (destmem, QImode, tmp);
      emit_move_insn (dest, src);
      LABEL_NUSES (label) = 1;
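/* Worked example (illustrative only): with a constant COUNT of 23 and
   MAX_SIZE of 16, epilogue_size = 23 % 16 = 7 = 4 + 2 + 1, so the constant
   branch above emits one 4-byte, one 2-byte and one 1-byte tail copy; the
   non-constant branch builds the equivalent decision tree at run time with
   ix86_expand_aligntest on bits 2, 1 and 0 of COUNT.  */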
/* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
   with value PROMOTED_VAL.
   Return value is updated DST.  */
static rtx
emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
	     HOST_WIDE_INT size_to_move)
{
  rtx dst = destmem;
  enum insn_code code;
  machine_mode move_mode;
  int piece_size, i;

  /* Find the widest mode in which we could perform moves.
     Start with the biggest power of 2 less than SIZE_TO_MOVE and half
     it until move of such size is supported.  */
  move_mode = GET_MODE (promoted_val);
  if (move_mode == VOIDmode)
  if (size_to_move < GET_MODE_SIZE (move_mode))
    {
      unsigned int move_bits = size_to_move * BITS_PER_UNIT;
      move_mode = int_mode_for_size (move_bits, 0).require ();
      promoted_val = gen_lowpart (move_mode, promoted_val);
    }
  piece_size = GET_MODE_SIZE (move_mode);
  code = optab_handler (mov_optab, move_mode);
  gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);

  dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);

  /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZES moves.  */
  gcc_assert (size_to_move % piece_size == 0);

  for (i = 0; i < size_to_move; i += piece_size)
    {
      if (piece_size <= GET_MODE_SIZE (word_mode))
	{
	  emit_insn (gen_strset (destptr, dst, promoted_val));
	  dst = adjust_automodify_address_nv (dst, move_mode, destptr,
					      piece_size);
	  continue;
	}

      emit_insn (GEN_FCN (code) (dst, promoted_val));

      emit_move_insn (destptr,
		      plus_constant (Pmode, copy_rtx (destptr), piece_size));

      dst = adjust_automodify_address_nv (dst, move_mode, destptr,
					  piece_size);
    }

  /* Update DST rtx.  */
  return dst;
}
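/* Worked example (illustrative only): with PROMOTED_VAL in DImode and
   SIZE_TO_MOVE of 32, piece_size is 8 and the loop above emits four 8-byte
   stores; with SIZE_TO_MOVE of 4 the value is first narrowed to SImode so a
   single 4-byte store suffices.  */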
/* Output code to set at most count & (max_size - 1) bytes starting by DEST.  */
static void
expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
				 rtx count, int max_size)
{
  count = expand_simple_binop (counter_mode (count), AND, count,
			       GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
  expand_set_or_cpymem_via_loop (destmem, NULL, destptr, NULL,
				 gen_lowpart (QImode, value), count, QImode,
				 1, max_size / 2, true);
}
/* Output code to set at most count & (max_size - 1) bytes starting by DEST.  */
expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
			rtx count, int max_size)
  if (CONST_INT_P (count))
      HOST_WIDE_INT countval = INTVAL (count);
      HOST_WIDE_INT epilogue_size = countval % max_size;

      /* For now MAX_SIZE should be a power of 2.  This assert could be
	 relaxed, but it'll require a bit more complicated epilogue
	 ...  */
      gcc_assert ((max_size & (max_size - 1)) == 0);
      for (i = max_size; i >= 1; i >>= 1)
	if (epilogue_size & i)
	  {
	    if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
	      destmem = emit_memset (destmem, destptr, vec_value, i);
	    else
	      destmem = emit_memset (destmem, destptr, value, i);
	  }

  expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);

      rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
      dest = change_address (destmem, DImode, destptr);
      emit_insn (gen_strset (destptr, dest, value));
      dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
      emit_insn (gen_strset (destptr, dest, value));

      dest = change_address (destmem, SImode, destptr);
      emit_insn (gen_strset (destptr, dest, value));
      dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
      emit_insn (gen_strset (destptr, dest, value));
      dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
      emit_insn (gen_strset (destptr, dest, value));
      dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
      emit_insn (gen_strset (destptr, dest, value));
      LABEL_NUSES (label) = 1;

      rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
      dest = change_address (destmem, DImode, destptr);
      emit_insn (gen_strset (destptr, dest, value));

      dest = change_address (destmem, SImode, destptr);
      emit_insn (gen_strset (destptr, dest, value));
      dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
      emit_insn (gen_strset (destptr, dest, value));
      LABEL_NUSES (label) = 1;

      rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
      dest = change_address (destmem, SImode, destptr);
      emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
      LABEL_NUSES (label) = 1;

      rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
      dest = change_address (destmem, HImode, destptr);
      emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
      LABEL_NUSES (label) = 1;

      rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
      dest = change_address (destmem, QImode, destptr);
      emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
      LABEL_NUSES (label) = 1;
/* Adjust COUNTER by the VALUE.  */
static void
ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
{
  emit_insn (gen_add2_insn (countreg, GEN_INT (-value)));
}
/* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
   DESTMEM to align it to DESIRED_ALIGNMENT.  Original alignment is ALIGN.
   Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
   used.
   Return value is updated DESTMEM.  */
static rtx
expand_set_or_cpymem_prologue (rtx destmem, rtx srcmem,
			       rtx destptr, rtx srcptr, rtx value,
			       rtx vec_value, rtx count, int align,
			       int desired_alignment, bool issetmem)
{
  int i;
  for (i = 1; i < desired_alignment; i <<= 1)
    {
      if (align <= i)
	{
	  rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
	  if (issetmem)
	    {
	      if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
		destmem = emit_memset (destmem, destptr, vec_value, i);
	      else
		destmem = emit_memset (destmem, destptr, value, i);
	    }
	  else
	    destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
	  ix86_adjust_counter (count, i);
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	  set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
	}
    }
  return destmem;
}
/* Test if COUNT&SIZE is nonzero and if so, expand cpymem
   or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
   and jump to DONE_LABEL.  */
static void
expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem,
			       rtx destptr, rtx srcptr,
			       rtx value, rtx vec_value,
			       rtx count, int size,
			       rtx done_label, bool issetmem)
{
  rtx_code_label *label = ix86_expand_aligntest (count, size, false);
  machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();

  /* If we do not have vector value to copy, we must reduce size.  */
      if (GET_MODE (value) == VOIDmode && size > 8)
      else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
	mode = GET_MODE (value);
	mode = GET_MODE (vec_value), value = vec_value;

      /* Choose appropriate vector mode.  */
	mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
      else if (size >= 16)
	mode = TARGET_SSE ? V16QImode : DImode;
      srcmem = change_address (srcmem, mode, srcptr);

  destmem = change_address (destmem, mode, destptr);
  modesize = GEN_INT (GET_MODE_SIZE (mode));
  gcc_assert (GET_MODE_SIZE (mode) <= size);
  for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
      if (issetmem)
	emit_move_insn (destmem, gen_lowpart (mode, value));
	  emit_move_insn (destmem, srcmem);
	  srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));

  destmem = offset_address (destmem, count, 1);
  destmem = offset_address (destmem, GEN_INT (-2 * size),
			    GET_MODE_SIZE (mode));
      srcmem = offset_address (srcmem, count, 1);
      srcmem = offset_address (srcmem, GEN_INT (-2 * size),
			       GET_MODE_SIZE (mode));
  for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
      if (issetmem)
	emit_move_insn (destmem, gen_lowpart (mode, value));
	  emit_move_insn (destmem, srcmem);
	  srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));

  emit_jump_insn (gen_jump (done_label));

  emit_label (label);
  LABEL_NUSES (label) = 1;
/* Handle small memcpy (up to SIZE, which is supposed to be a small power of 2)
   and get ready for the main memcpy loop by copying the initial
   DESIRED_ALIGN-ALIGN bytes and the last SIZE bytes, adjusting
   DESTPTR/SRCPTR/COUNT in a way we can proceed with a loop copying SIZE bytes
   at once.  Do moves in MODE.
   DONE_LABEL is a label after the whole copying sequence.  The label is
   created on demand if *DONE_LABEL is NULL.
   MIN_SIZE is minimal size of block copied.  This value gets adjusted for new
   bounds after the initial copies.

   DESTMEM/SRCMEM are memory expressions pointing to the copied block,
   DESTPTR/SRCPTR are pointers to the block.  DYNAMIC_CHECK indicates whether
   we will dispatch to a library call for large blocks.

   In pseudocode we do:

   Assume that SIZE is 4.  Bigger sizes are handled analogously.

	copy 4 bytes from SRCPTR to DESTPTR
	copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4

	copy 1 byte from SRCPTR to DESTPTR

	copy 2 bytes from SRCPTR to DESTPTR
	copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2

   copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
   copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT - SIZE

   OLD_DESTPTR = DESTPTR;
   Align DESTPTR up to DESIRED_ALIGN
   SRCPTR += DESTPTR - OLD_DESTPTR
   COUNT -= DESTPTR - OLD_DESTPTR

   Round COUNT down to multiple of SIZE
   << optional caller supplied zero size guard is here >>
   << optional caller supplied dynamic check is here >>
   << caller supplied main copy loop is here >>
expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
							    rtx *destptr, rtx *srcptr,
							    rtx value, rtx vec_value,
							    rtx_code_label **done_label,
							    unsigned HOST_WIDE_INT *min_size,
  rtx_code_label *loop_label = NULL, *label;
  int prolog_size = 0;

  /* Choose proper value to copy.  */
  if (issetmem && VECTOR_MODE_P (mode))
    mode_value = vec_value;

  gcc_assert (GET_MODE_SIZE (mode) <= size);

  /* See if block is big or small, handle small blocks.  */
  if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT) size)
      loop_label = gen_label_rtx ();
      *done_label = gen_label_rtx ();
      emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
      /* Handle sizes > 3.  */
      for (; size2 > 2; size2 >>= 1)
	expand_small_cpymem_or_setmem (destmem, srcmem,
				       size2, *done_label, issetmem);
      /* Nothing to copy?  Jump to DONE_LABEL if so.  */
      emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
      /* Do a byte copy.  */
      destmem = change_address (destmem, QImode, *destptr);
	emit_move_insn (destmem, gen_lowpart (QImode, value));
	  srcmem = change_address (srcmem, QImode, *srcptr);
	  emit_move_insn (destmem, srcmem);
      /* Handle sizes 2 and 3.  */
      label = ix86_expand_aligntest (*count, 2, false);
      destmem = change_address (destmem, HImode, *destptr);
      destmem = offset_address (destmem, *count, 1);
      destmem = offset_address (destmem, GEN_INT (-2), 2);
	emit_move_insn (destmem, gen_lowpart (HImode, value));
	  srcmem = change_address (srcmem, HImode, *srcptr);
	  srcmem = offset_address (srcmem, *count, 1);
	  srcmem = offset_address (srcmem, GEN_INT (-2), 2);
	  emit_move_insn (destmem, srcmem);
      LABEL_NUSES (label) = 1;
      emit_jump_insn (gen_jump (*done_label));

  gcc_assert (*min_size >= (unsigned HOST_WIDE_INT) size
	      || UINTVAL (*count) >= (unsigned HOST_WIDE_INT) size);

  /* Start memcpy for COUNT >= SIZE.  */
      emit_label (loop_label);
      LABEL_NUSES (loop_label) = 1;

  /* Copy first desired_align bytes.  */
    srcmem = change_address (srcmem, mode, *srcptr);
  destmem = change_address (destmem, mode, *destptr);
  modesize = GEN_INT (GET_MODE_SIZE (mode));
  for (n = 0; prolog_size < desired_align - align; n++)
	emit_move_insn (destmem, mode_value);
	  emit_move_insn (destmem, srcmem);
	  srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
      prolog_size += GET_MODE_SIZE (mode);

  /* Copy last SIZE bytes.  */
  destmem = offset_address (destmem, *count, 1);
  destmem = offset_address (destmem,
			    GEN_INT (-size - prolog_size),
    emit_move_insn (destmem, mode_value);
      srcmem = offset_address (srcmem, *count, 1);
      srcmem = offset_address (srcmem,
			       GEN_INT (-size - prolog_size),
      emit_move_insn (destmem, srcmem);
  for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
      destmem = offset_address (destmem, modesize, 1);
	emit_move_insn (destmem, mode_value);
	  srcmem = offset_address (srcmem, modesize, 1);
	  emit_move_insn (destmem, srcmem);

  /* Align destination.  */
  if (desired_align > 1 && desired_align > align)
      rtx saveddest = *destptr;

      gcc_assert (desired_align <= size);
      /* Align destptr up, place it to new register.  */
      *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
				      GEN_INT (prolog_size),
				      NULL_RTX, 1, OPTAB_DIRECT);
      if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
	REG_POINTER (*destptr) = 1;
      *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
				      GEN_INT (-desired_align),
				      *destptr, 1, OPTAB_DIRECT);
      /* See how many bytes we skipped.  */
      saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
				       saveddest, 1, OPTAB_DIRECT);
      /* Adjust srcptr and count.  */
	*srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
				       saveddest, *srcptr, 1, OPTAB_DIRECT);
      *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
				    saveddest, *count, 1, OPTAB_DIRECT);
      /* We copied at most size + prolog_size.  */
      if (*min_size > (unsigned HOST_WIDE_INT) (size + prolog_size))
	*min_size
	  = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT) size);

      /* Our loops always round down the block size, but for dispatch to
	 library we need precise value.  */
	*count = expand_simple_binop (GET_MODE (*count), AND, *count,
				      GEN_INT (-size), *count, 1, OPTAB_DIRECT);

      gcc_assert (prolog_size == 0);
      /* Decrease count, so we won't end up copying last word twice.  */
      if (!CONST_INT_P (*count))
	*count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
				      constm1_rtx, *count, 1, OPTAB_DIRECT);
	*count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
				      (unsigned HOST_WIDE_INT) size));
      *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT) size);
/* This function is like the previous one, except here we know how many bytes
   need to be copied.  That allows us to update alignment not only of DST,
   which is returned, but also of SRC, which is passed as a pointer for that
   reason.  */
expand_set_or_cpymem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
					rtx srcreg, rtx value, rtx vec_value,
					int desired_align, int align_bytes,
  rtx orig_src = NULL;
  int copied_bytes = 0;

  gcc_assert (srcp != NULL);

  for (piece_size = 1;
       piece_size <= desired_align && copied_bytes < align_bytes;
    if (align_bytes & piece_size)
	if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
	  dst = emit_memset (dst, destreg, vec_value, piece_size);
	else
	  dst = emit_memset (dst, destreg, value, piece_size);
	dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
      copied_bytes += piece_size;

  if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
    set_mem_align (dst, desired_align * BITS_PER_UNIT);
  if (MEM_SIZE_KNOWN_P (orig_dst))
    set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);

  int src_align_bytes = get_mem_align_offset (src, desired_align
					      * BITS_PER_UNIT);
  if (src_align_bytes >= 0)
    src_align_bytes = desired_align - src_align_bytes;
  if (src_align_bytes >= 0)
      unsigned int src_align;
      for (src_align = desired_align; src_align >= 2; src_align >>= 1)
	  if ((src_align_bytes & (src_align - 1))
	      == (align_bytes & (src_align - 1)))
      if (src_align > (unsigned int) desired_align)
	src_align = desired_align;
      if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
	set_mem_align (src, src_align * BITS_PER_UNIT);
  if (MEM_SIZE_KNOWN_P (orig_src))
    set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
/* Return true if ALG can be used in current context.
   Assume we expand memset if MEMSET is true.  */
static bool
alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
{
  if (alg == no_stringop)
    return false;
  if (alg == vector_loop)
    return TARGET_SSE || TARGET_AVX;
  /* Algorithms using the rep prefix want at least edi and ecx;
     additionally, memset wants eax and memcpy wants esi.  Don't
     consider such algorithms if the user has appropriated those
     registers for their own purposes, or if we have a non-default
     address space, since some string insns cannot override the segment.  */
  if (alg == rep_prefix_1_byte
      || alg == rep_prefix_4_byte
      || alg == rep_prefix_8_byte)
    {
      if (have_as)
	return false;
      if (fixed_regs[CX_REG]
	  || fixed_regs[DI_REG]
	  || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
	return false;
    }
  return true;
}
/* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation.  */
static enum stringop_alg
decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
	    unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
	    bool memset, bool zero_memset, bool have_as,
	    int *dynamic_check, bool *noalign, bool recur)
{
  const struct stringop_algs *algs;
  bool optimize_for_speed;
  int max = -1;
  const struct processor_costs *cost;
  int i;
  bool any_alg_usable_p = false;

  *dynamic_check = -1;

  /* Even if the string operation call is cold, we still might spend a lot
     of time processing large blocks.  */
  if (optimize_function_for_size_p (cfun)
      || (optimize_insn_for_size_p ()
	  && (max_size < 256
	      || (expected_size != -1 && expected_size < 256))))
    optimize_for_speed = false;
  else
    optimize_for_speed = true;

  cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
  if (memset)
    algs = &cost->memset[TARGET_64BIT != 0];
  else
    algs = &cost->memcpy[TARGET_64BIT != 0];

  /* See maximal size for user defined algorithm.  */
  for (i = 0; i < MAX_STRINGOP_ALGS; i++)
    {
      enum stringop_alg candidate = algs->size[i].alg;
      bool usable = alg_usable_p (candidate, memset, have_as);
      any_alg_usable_p |= usable;

      if (candidate != libcall && candidate && usable)
	max = algs->size[i].max;
    }

  /* If expected size is not known but max size is small enough
     so inline version is a win, set expected size into
     the range.  */
  if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
      && expected_size == -1)
    expected_size = min_size / 2 + max_size / 2;

  /* If user specified the algorithm, honor it if possible.  */
  if (ix86_stringop_alg != no_stringop
      && alg_usable_p (ix86_stringop_alg, memset, have_as))
    return ix86_stringop_alg;
  /* rep; movq or rep; movl is the smallest variant.  */
  else if (!optimize_for_speed)
    {
      if (!count || (count & 3) || (memset && !zero_memset))
	return alg_usable_p (rep_prefix_1_byte, memset, have_as)
	       ? rep_prefix_1_byte : loop_1_byte;
      else
	return alg_usable_p (rep_prefix_4_byte, memset, have_as)
	       ? rep_prefix_4_byte : loop;
    }
  /* Very tiny blocks are best handled via the loop, REP is expensive to
     set up.  */
  else if (expected_size != -1 && expected_size < 4)
    return loop_1_byte;
  else if (expected_size != -1)
    {
      enum stringop_alg alg = libcall;
      bool alg_noalign = false;
      for (i = 0; i < MAX_STRINGOP_ALGS; i++)
	{
	  /* We get here if the algorithms that were not libcall-based
	     were rep-prefix based and we are unable to use rep prefixes
	     based on global register usage.  Break out of the loop and
	     use the heuristic below.  */
	  if (algs->size[i].max == 0)
	    break;
	  if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
	    {
	      enum stringop_alg candidate = algs->size[i].alg;

	      if (candidate != libcall
		  && alg_usable_p (candidate, memset, have_as))
		  alg_noalign = algs->size[i].noalign;
		  /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
		     last non-libcall inline algorithm.  */
		  if (TARGET_INLINE_ALL_STRINGOPS)
		      /* When the current size is best to be copied by a libcall,
			 but we are still forced to inline, run the heuristic below
			 that will pick code for medium sized blocks.  */
		      *noalign = alg_noalign;
	      else if (!any_alg_usable_p)
		break;
	      else if (alg_usable_p (candidate, memset, have_as)
		       && !(TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
			    && candidate == rep_prefix_1_byte
			    /* NB: If min_size != max_size, size is
			       unknown.  */
			    && min_size != max_size))
		  *noalign = algs->size[i].noalign;
	    }
	}
    }
  /* When asked to inline the call anyway, try to pick meaningful choice.
     We look for maximal size of block that is faster to copy by hand and
     take blocks of at most of that size guessing that average size will
     be roughly half of the block.

     If this turns out to be bad, we might simply specify the preferred
     choice in ix86_costs.  */
  if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
      && (algs->unknown_size == libcall
	  || !alg_usable_p (algs->unknown_size, memset, have_as)))
    {
      enum stringop_alg alg;
      HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;

      /* If there aren't any usable algorithms or if recursing already,
	 then recursing on smaller sizes or same size isn't going to
	 find anything.  Just return the simple byte-at-a-time copy loop.  */
      if (!any_alg_usable_p || recur)
	{
	  /* Pick something reasonable.  */
	  if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
	    *dynamic_check = 128;
	  return loop_1_byte;
	}
      alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
			zero_memset, have_as, dynamic_check, noalign, true);
      gcc_assert (*dynamic_check == -1);
      if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
	*dynamic_check = max;
      else
	gcc_assert (alg != libcall);
      return alg;
    }
  return (alg_usable_p (algs->unknown_size, memset, have_as)
	  ? algs->unknown_size : libcall);
}
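/* Illustrative note (an assumption about how the per-CPU cost tables are
   typically laid out, not a quote of them): a stringop_algs entry such as
   {libcall, {{256, unrolled_loop, false}, {-1, libcall, false}}} would make
   decide_alg pick unrolled_loop for expected sizes up to 256 bytes and fall
   back to a library call for larger or unknown sizes, unless
   -mstringop-strategy= or -minline-all-stringops overrides the choice.  */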
/* Decide on alignment.  We know that the operand is already aligned to ALIGN
   (ALIGN can be based on profile feedback and thus it is not 100% guaranteed).  */
static int
decide_alignment (int align,
		  enum stringop_alg alg,
		  int expected_size,
		  machine_mode move_mode)
{
  int desired_align = 0;

  gcc_assert (alg != no_stringop);

  if (move_mode == VOIDmode)
    return align;

  desired_align = GET_MODE_SIZE (move_mode);
  /* PentiumPro has special logic triggering for 8 byte aligned blocks,
     copying whole cacheline at once.  */
  if (TARGET_CPU_P (PENTIUMPRO)
      && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
    desired_align = 8;

  if (desired_align < align)
    desired_align = align;
  if (expected_size != -1 && expected_size < 4)
    desired_align = align;

  return desired_align;
}
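/* Worked example (illustrative only): for the vector_loop algorithm with a
   V16QImode move mode the desired alignment starts at 16 bytes; it is never
   lowered below the known ALIGN, and for an expected size below 4 bytes the
   alignment prologue is considered not worth it, so ALIGN is kept.  */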
/* Helper function for memcpy.  For QImode value 0xXY produce
   0xXYXYXYXY of the width specified by MODE.  This is essentially
   a * 0x10101010, but we can do slightly better than
   synth_mult by unwinding the sequence by hand on CPUs with
   ...  */
static rtx
promote_duplicated_reg (machine_mode mode, rtx val)
{
  machine_mode valmode = GET_MODE (val);
  rtx tmp;
  int nops = mode == DImode ? 3 : 2;

  gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
  if (val == const0_rtx)
    return copy_to_mode_reg (mode, CONST0_RTX (mode));
  if (CONST_INT_P (val))
    {
      HOST_WIDE_INT v = INTVAL (val) & 255;

      v |= v << 8;
      v |= v << 16;
      if (mode == DImode)
	v |= (v << 16) << 16;
      return copy_to_mode_reg (mode, gen_int_mode (v, mode));
    }

  if (valmode == VOIDmode)
    valmode = QImode;
  if (valmode != QImode)
    val = gen_lowpart (QImode, val);
  if (!TARGET_PARTIAL_REG_STALL)
    nops--;
  if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
      + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
      <= (ix86_cost->shift_const + ix86_cost->add) * nops
	 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
    {
      rtx reg = convert_modes (mode, QImode, val, true);
      tmp = promote_duplicated_reg (mode, const1_rtx);
      return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
				  OPTAB_DIRECT);
    }
  else
    {
      rtx reg = convert_modes (mode, QImode, val, true);

      if (!TARGET_PARTIAL_REG_STALL)
	emit_insn (gen_insv_1 (mode, reg, reg));
      else
	{
	  tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
				     NULL, 1, OPTAB_DIRECT);
	  reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1,
				     OPTAB_DIRECT);
	}
      tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
				 NULL, 1, OPTAB_DIRECT);
      reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
      if (mode == DImode)
	{
	  tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
				     NULL, 1, OPTAB_DIRECT);
	  reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
	}
      return reg;
    }
}
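/* Worked example (illustrative only): promoting the QImode value 0xAB to
   SImode yields 0xABABABAB, either directly for a CONST_INT
   (v |= v << 8 gives 0xABAB, v |= v << 16 gives 0xABABABAB) or, for a
   register, via the ASHIFT/IOR doubling sequence emitted above; DImode needs
   one more shift-by-32/IOR step.  */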
/* Duplicate value VAL using promote_duplicated_reg into maximal size that will
   be needed by main loop copying SIZE_NEEDED chunks and prologue getting
   alignment from ALIGN to DESIRED_ALIGN.  */
static rtx
promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
				int align)
{
  rtx promoted_val;

  if (TARGET_64BIT
      && (size_needed > 4 || (desired_align > align && desired_align > 4)))
    promoted_val = promote_duplicated_reg (DImode, val);
  else if (size_needed > 2 || (desired_align > align && desired_align > 2))
    promoted_val = promote_duplicated_reg (SImode, val);
  else if (size_needed > 1 || (desired_align > align && desired_align > 1))
    promoted_val = promote_duplicated_reg (HImode, val);
  else
    promoted_val = val;

  return promoted_val;
}
/* Copy the address to a Pmode register.  This is used for x32 to
   truncate DImode TLS address to a SImode register.  */
static rtx
ix86_copy_addr_to_reg (rtx addr)
{
  rtx reg;
  if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
    {
      reg = copy_addr_to_reg (addr);
      REG_POINTER (reg) = 1;
      return reg;
    }
  else
    {
      gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
      reg = copy_to_mode_reg (DImode, addr);
      REG_POINTER (reg) = 1;
      return gen_rtx_SUBREG (SImode, reg, 0);
    }
}
/* Expand string move (memcpy) or store (memset) operation.  Use i386 string
   operations when profitable.  The code depends upon architecture, block size
   and alignment, but always has one of the following overall structures:

   Aligned move sequence:

     1) Prologue guard: Conditional that jumps up to epilogues for small
	blocks that can be handled by epilogue alone.  This is faster
	but also needed for correctness, since prologue assumes the block
	is larger than the desired alignment.

	Optional dynamic check for size and libcall for large
	blocks is emitted here too, with -minline-stringops-dynamically.

     2) Prologue: copy first few bytes in order to get destination
	aligned to DESIRED_ALIGN.  It is emitted only when ALIGN is less
	than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
	copied.  We emit either a jump tree on power of two sized
	blocks, or a byte loop.

     3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
	with specified algorithm.

     4) Epilogue: code copying tail of the block that is too small to be
	handled by main body (or up to size guarded by prologue guard).

   Misaligned move sequence:

     1) misaligned move prologue/epilogue containing:
	a) Prologue handling small memory blocks and jumping to done_label
	   (skipped if blocks are known to be large enough)
	b) Single move copying first DESIRED_ALIGN-ALIGN bytes if alignment is
	   needed by single possibly misaligned move
	   (skipped if alignment is not needed)
	c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves

     2) Zero size guard dispatching to done_label, if needed

     3) dispatch to library call, if needed,

     4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
	with specified algorithm.  */
ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
			   rtx align_exp, rtx expected_align_exp,
			   rtx expected_size_exp, rtx min_size_exp,
			   rtx max_size_exp, rtx probable_max_size_exp,
			   bool issetmem)
  rtx_code_label *label = NULL;
  rtx_code_label *jump_around_label = NULL;
  HOST_WIDE_INT align = 1;
  unsigned HOST_WIDE_INT count = 0;
  HOST_WIDE_INT expected_size = -1;
  int size_needed = 0, epilogue_size_needed;
  int desired_align = 0, align_bytes = 0;
  enum stringop_alg alg;
  rtx promoted_val = NULL;
  rtx vec_promoted_val = NULL;
  bool force_loopy_epilogue = false;
  bool need_zero_guard = false;
  machine_mode move_mode = VOIDmode;
  machine_mode wider_mode;
  int unroll_factor = 1;
  /* TODO: Once value ranges are available, fill in proper data.  */
  unsigned HOST_WIDE_INT min_size = 0;
  unsigned HOST_WIDE_INT max_size = -1;
  unsigned HOST_WIDE_INT probable_max_size = -1;
  bool misaligned_prologue_used = false;

  if (CONST_INT_P (align_exp))
    align = INTVAL (align_exp);
  /* i386 can do misaligned access on reasonably increased cost.  */
  if (CONST_INT_P (expected_align_exp)
      && INTVAL (expected_align_exp) > align)
    align = INTVAL (expected_align_exp);
  /* ALIGN is the minimum of destination and source alignment, but we care here
     just about destination alignment.  */
      && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
    align = MEM_ALIGN (dst) / BITS_PER_UNIT;

  if (CONST_INT_P (count_exp))
      min_size = max_size = probable_max_size = count = expected_size
	= INTVAL (count_exp);
      /* When COUNT is 0, there is nothing to do.  */

	min_size = INTVAL (min_size_exp);
	max_size = INTVAL (max_size_exp);
      if (probable_max_size_exp)
	probable_max_size = INTVAL (probable_max_size_exp);
      if (CONST_INT_P (expected_size_exp))
	expected_size = INTVAL (expected_size_exp);

  /* Make sure we don't need to care about overflow later on.  */
  if (count > (HOST_WIDE_INT_1U << 30))
    return false;

  have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
  if (!issetmem)
    have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));

  /* Step 0: Decide on preferred algorithm, desired alignment and
     size of chunks to be copied by main loop.  */
  alg = decide_alg (count, expected_size, min_size, probable_max_size,
		    issetmem && val_exp == const0_rtx, have_as,
		    &dynamic_check, &noalign, false);

  if (dump_file)
    fprintf (dump_file, "Selected stringop expansion strategy: %s\n",
	     stringop_alg_names[alg]);

  gcc_assert (alg != no_stringop);

  /* For now vector-version of memset is generated only for memory zeroing, as
     creating of promoted vector value is very cheap in this case.  */
  if (issetmem && alg == vector_loop && val_exp != const0_rtx)
    alg = unrolled_loop;

  count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
  destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
  if (!issetmem)
    srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));

      move_mode = word_mode;
      need_zero_guard = true;
      need_zero_guard = true;
      need_zero_guard = true;
      unroll_factor = (TARGET_64BIT ? 4 : 2);
      need_zero_guard = true;
      /* Find the widest supported mode.  */
      move_mode = word_mode;
      while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode)
	     && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
	move_mode = wider_mode;

      if (TARGET_AVX256_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 128)
      if (TARGET_AVX512_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 256)

      /* Find the corresponding vector mode with the same size as MOVE_MODE.
	 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.).  */
      if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
	{
	  int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
	  if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
	      || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
	    move_mode = word_mode;
	}
      gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
    case rep_prefix_8_byte:
    case rep_prefix_4_byte:
    case rep_prefix_1_byte:

  size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
  epilogue_size_needed = size_needed;

  /* If we are going to call any library calls conditionally, make sure any
     pending stack adjustment happen before the first conditional branch,
     otherwise they will be emitted before the library call only and won't
     happen from the other branches.  */
  if (dynamic_check != -1)
    do_pending_stack_adjust ();

  desired_align = decide_alignment (align, alg, expected_size, move_mode);
  if (!TARGET_ALIGN_STRINGOPS || noalign)
    align = desired_align;

  /* Step 1: Prologue guard.  */

  /* Alignment code needs count to be in register.  */
  if (CONST_INT_P (count_exp) && desired_align > align)
      if (INTVAL (count_exp) > desired_align
	  && INTVAL (count_exp) > size_needed)
	    = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
	  if (align_bytes <= 0)
	    align_bytes = desired_align - align_bytes;
      if (align_bytes == 0)
	count_exp = force_reg (counter_mode (count_exp), count_exp);
  gcc_assert (desired_align >= 1 && align >= 1);

  /* Misaligned move sequences handle both prologue and epilogue at once.
     Default code generation results in a smaller code for large alignments
     and also avoids redundant job when sizes are known precisely.  */
  misaligned_prologue_used
    = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
       && MAX (desired_align, epilogue_size_needed) <= 32
       && desired_align <= epilogue_size_needed
       && ((desired_align > align && !align_bytes)
	   || (!count && epilogue_size_needed > 1)));

  /* Do the cheap promotion to allow better CSE across the
     main loop and epilogue (ie one load of the big constant in the
     For now the misaligned move sequences do not have fast path
     without broadcasting.  */
  if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
      if (alg == vector_loop)
	  gcc_assert (val_exp == const0_rtx);
	  vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
	  promoted_val = promote_duplicated_reg_to_size (val_exp,
							 GET_MODE_SIZE (word_mode),
							 desired_align, align);
	promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
						       desired_align, align);

  /* Misaligned move sequences handle both prologues and epilogues at once.
     Default code generation results in smaller code for large alignments and
     also avoids redundant job when sizes are known precisely.  */
  if (misaligned_prologue_used)
      /* Misaligned move prologue handled small blocks by itself.  */
      expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves
	(dst, src, &destreg, &srcreg,
	 move_mode, promoted_val, vec_promoted_val,
	 desired_align < align
	 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
	 desired_align, align, &min_size, dynamic_check, issetmem);
	src = change_address (src, BLKmode, srcreg);
      dst = change_address (dst, BLKmode, destreg);
      set_mem_align (dst, desired_align * BITS_PER_UNIT);
      epilogue_size_needed = 0;
	  && min_size < (unsigned HOST_WIDE_INT) size_needed)
	  /* It is possible that we copied enough so the main loop will not
	     execute.  */
	  gcc_assert (size_needed > 1);
	  if (jump_around_label == NULL_RTX)
	    jump_around_label = gen_label_rtx ();
	  emit_cmp_and_jump_insns (count_exp,
				   GEN_INT (size_needed),
				   LTU, 0, counter_mode (count_exp), 1, jump_around_label);
	  if (expected_size == -1
	      || expected_size < (desired_align - align) / 2 + size_needed)
	    predict_jump (REG_BR_PROB_BASE * 20 / 100);
	  else
	    predict_jump (REG_BR_PROB_BASE * 60 / 100);

  /* Ensure that alignment prologue won't copy past end of block.  */
  else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
      epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
      /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
	 Make sure it is power of 2.  */
      epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);

      /* To improve performance of small blocks, we jump around the VAL
	 promoting mode.  This means that if the promoted VAL is not constant,
	 we might not use it in the epilogue and have to use byte
	 loop variant.  */
      if (issetmem && epilogue_size_needed > 2 && !promoted_val)
	force_loopy_epilogue = true;
      if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
	  || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
	  /* If main algorithm works on QImode, no epilogue is needed.
	     For small sizes just don't align anything.  */
	  if (size_needed == 1)
	    desired_align = align;
	       && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
	  label = gen_label_rtx ();
	  emit_cmp_and_jump_insns (count_exp,
				   GEN_INT (epilogue_size_needed),
				   LTU, 0, counter_mode (count_exp), 1, label);
	  if (expected_size == -1 || expected_size < epilogue_size_needed)
	    predict_jump (REG_BR_PROB_BASE * 60 / 100);
	  else
	    predict_jump (REG_BR_PROB_BASE * 20 / 100);

  /* Emit code to decide on runtime whether library call or inline should be
     used.  */
  if (dynamic_check != -1)
      if (!issetmem && CONST_INT_P (count_exp))
	  if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT) dynamic_check)
	      emit_block_copy_via_libcall (dst, src, count_exp);
	      count_exp = const0_rtx;
	  rtx_code_label *hot_label = gen_label_rtx ();
	  if (jump_around_label == NULL_RTX)
	    jump_around_label = gen_label_rtx ();
	  emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
				   LEU, 0, counter_mode (count_exp),
	  predict_jump (REG_BR_PROB_BASE * 90 / 100);
	    set_storage_via_libcall (dst, count_exp, val_exp);
	    emit_block_copy_via_libcall (dst, src, count_exp);
	  emit_jump (jump_around_label);
	  emit_label (hot_label);

  /* Step 2: Alignment prologue.  */
  /* Do the expensive promotion once we branched off the small blocks.  */
  if (issetmem && !promoted_val)
    promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
						   desired_align, align);

  if (desired_align > align && !misaligned_prologue_used)
      if (align_bytes == 0)
	  /* Except for the first move in prologue, we no longer know
	     constant offset in aliasing info.  It doesn't seem worth
	     the pain to maintain it for the first move, so throw away
	     the info.  */
	  dst = change_address (dst, BLKmode, destreg);
	    src = change_address (src, BLKmode, srcreg);
	  dst = expand_set_or_cpymem_prologue (dst, src, destreg, srcreg,
					       promoted_val, vec_promoted_val,
					       count_exp, align, desired_align,
	  /* At most desired_align - align bytes are copied.  */
	  if (min_size < (unsigned)(desired_align - align))
	    min_size -= desired_align - align;
	  /* If we know how many bytes need to be stored before dst is
	     sufficiently aligned, maintain aliasing info accurately.  */
	  dst = expand_set_or_cpymem_constant_prologue (dst, &src, destreg,
	  count_exp = plus_constant (counter_mode (count_exp),
				     count_exp, -align_bytes);
	  count -= align_bytes;
	  min_size -= align_bytes;
	  max_size -= align_bytes;
	  && min_size < (unsigned HOST_WIDE_INT) size_needed
	  && (count < (unsigned HOST_WIDE_INT) size_needed
	      || (align_bytes == 0
		  && count < ((unsigned HOST_WIDE_INT) size_needed
			      + desired_align - align))))
	  /* It is possible that we copied enough so the main loop will not
	     execute.  */
	  gcc_assert (size_needed > 1);
	  if (label == NULL_RTX)
	    label = gen_label_rtx ();
	  emit_cmp_and_jump_insns (count_exp,
				   GEN_INT (size_needed),
				   LTU, 0, counter_mode (count_exp), 1, label);
	  if (expected_size == -1
	      || expected_size < (desired_align - align) / 2 + size_needed)
	    predict_jump (REG_BR_PROB_BASE * 20 / 100);
	  else
	    predict_jump (REG_BR_PROB_BASE * 60 / 100);

  if (label && size_needed == 1)
      LABEL_NUSES (label) = 1;
      epilogue_size_needed = 1;
	promoted_val = val_exp;
  else if (label == NULL_RTX && !misaligned_prologue_used)
    epilogue_size_needed = size_needed;

  /* Step 3: Main loop.  */
      expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg, promoted_val,
				     count_exp, move_mode, unroll_factor,
				     expected_size, issetmem);
      expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg,
				     vec_promoted_val, count_exp, move_mode,
				     unroll_factor, expected_size, issetmem);
    case rep_prefix_8_byte:
    case rep_prefix_4_byte:
    case rep_prefix_1_byte:
      expand_set_or_cpymem_via_rep (dst, src, destreg, srcreg, promoted_val,
				    val_exp, count_exp, move_mode, issetmem);

  /* Adjust properly the offset of src and dest memory for aliasing.  */
  if (CONST_INT_P (count_exp))
	src = adjust_automodify_address_nv (src, BLKmode, srcreg,
					    (count / size_needed) * size_needed);
      dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
					  (count / size_needed) * size_needed);
	src = change_address (src, BLKmode, srcreg);
      dst = change_address (dst, BLKmode, destreg);

  /* Step 4: Epilogue to copy the remaining bytes.  */
      /* When the main loop is done, COUNT_EXP might hold original count,
	 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
	 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
	 bytes.  Compensate if needed.  */
      if (size_needed < epilogue_size_needed)
	  tmp = expand_simple_binop (counter_mode (count_exp), AND, count_exp,
				     GEN_INT (size_needed - 1), count_exp, 1,
	  if (tmp != count_exp)
	    emit_move_insn (count_exp, tmp);
      LABEL_NUSES (label) = 1;

  if (count_exp != const0_rtx && epilogue_size_needed > 1)
      if (force_loopy_epilogue)
	expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
					 epilogue_size_needed);
	expand_setmem_epilogue (dst, destreg, promoted_val,
				vec_promoted_val, count_exp,
				epilogue_size_needed);
      expand_cpymem_epilogue (dst, src, destreg, srcreg, count_exp,
			      epilogue_size_needed);

  if (jump_around_label)
    emit_label (jump_around_label);
/* Expand cmpstrn or memcmp.  */
ix86_expand_cmpstrn_or_cmpmem (rtx result, rtx src1, rtx src2,
			       rtx length, rtx align, bool is_cmpstrn)
  /* Expand strncmp and memcmp only with -minline-all-stringops since
     "repz cmpsb" can be much slower than strncmp and memcmp functions
     implemented with vector instructions, see

     https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43052  */
  if (!TARGET_INLINE_ALL_STRINGOPS)
    return false;

  /* Can't use this if the user has appropriated ecx, esi or edi.  */
  if (fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG])
    return false;

      /* For strncmp, length is the maximum length, which can be larger
	 than actual string lengths.  We can expand the cmpstrn pattern
	 to "repz cmpsb" only if one of the strings is a constant so
	 that expand_builtin_strncmp() can write the length argument to
	 be the minimum of the const string length and the actual length
	 argument.  Otherwise, "repz cmpsb" may pass the 0 byte.  */
      tree t1 = MEM_EXPR (src1);
      tree t2 = MEM_EXPR (src2);
      if (!((t1 && TREE_CODE (t1) == MEM_REF
	     && TREE_CODE (TREE_OPERAND (t1, 0)) == ADDR_EXPR
	     && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t1, 0), 0))
	    || (t2 && TREE_CODE (t2) == MEM_REF
		&& TREE_CODE (TREE_OPERAND (t2, 0)) == ADDR_EXPR
		&& (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t2, 0), 0))

  rtx addr1 = copy_addr_to_reg (XEXP (src1, 0));
  rtx addr2 = copy_addr_to_reg (XEXP (src2, 0));
  if (addr1 != XEXP (src1, 0))
    src1 = replace_equiv_address_nv (src1, addr1);
  if (addr2 != XEXP (src2, 0))
    src2 = replace_equiv_address_nv (src2, addr2);

  /* NB: Make a copy of the data length to avoid changing the original
     data length by cmpstrnqi patterns.  */
  length = ix86_zero_extend_to_Pmode (length);
  rtx lengthreg = gen_reg_rtx (Pmode);
  emit_move_insn (lengthreg, length);

  /* If we are testing strict equality, we can use known alignment to
     good advantage.  This may be possible with combine, particularly
     once cc0 is dead.  */
  if (CONST_INT_P (length))
      if (length == const0_rtx)
	  emit_move_insn (result, const0_rtx);
      emit_insn (gen_cmpstrnqi_nz_1 (addr1, addr2, lengthreg, align,
      emit_insn (gen_cmp_1 (Pmode, lengthreg, lengthreg));
      emit_insn (gen_cmpstrnqi_1 (addr1, addr2, lengthreg, align,

  rtx out = gen_lowpart (QImode, result);
  emit_insn (gen_cmpintqi (out));
  emit_move_insn (result, gen_rtx_SIGN_EXTEND (SImode, out));
9103 /* Expand the appropriate insns for doing strlen if not just doing
9106 out = result, initialized with the start address
9107 align_rtx = alignment of the address.
9108 scratch = scratch register, initialized with the startaddress when
9109 not aligned, otherwise undefined
9111 This is just the body. It needs the initializations mentioned above and
9112 some address computing at the end. These things are done in i386.md. */
9115 ix86_expand_strlensi_unroll_1 (rtx out
, rtx src
, rtx align_rtx
)
9119 rtx_code_label
*align_2_label
= NULL
;
9120 rtx_code_label
*align_3_label
= NULL
;
9121 rtx_code_label
*align_4_label
= gen_label_rtx ();
9122 rtx_code_label
*end_0_label
= gen_label_rtx ();
9124 rtx tmpreg
= gen_reg_rtx (SImode
);
9125 rtx scratch
= gen_reg_rtx (SImode
);
9129 if (CONST_INT_P (align_rtx
))
9130 align
= INTVAL (align_rtx
);
9132 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
9134 /* Is there a known alignment and is it less than 4? */
9137 rtx scratch1
= gen_reg_rtx (Pmode
);
9138 emit_move_insn (scratch1
, out
);
9139 /* Is there a known alignment and is it not 2? */
9142 align_3_label
= gen_label_rtx (); /* Label when aligned to 3-byte */
9143 align_2_label
= gen_label_rtx (); /* Label when aligned to 2-byte */
9145 /* Leave just the 3 lower bits. */
9146 align_rtx
= expand_binop (Pmode
, and_optab
, scratch1
, GEN_INT (3),
9147 NULL_RTX
, 0, OPTAB_WIDEN
);
9149 emit_cmp_and_jump_insns (align_rtx
, const0_rtx
, EQ
, NULL
,
9150 Pmode
, 1, align_4_label
);
9151 emit_cmp_and_jump_insns (align_rtx
, const2_rtx
, EQ
, NULL
,
9152 Pmode
, 1, align_2_label
);
9153 emit_cmp_and_jump_insns (align_rtx
, const2_rtx
, GTU
, NULL
,
9154 Pmode
, 1, align_3_label
);
9158 /* Since the alignment is 2, we have to check 2 or 0 bytes;
9159 check if is aligned to 4 - byte. */
9161 align_rtx
= expand_binop (Pmode
, and_optab
, scratch1
, const2_rtx
,
9162 NULL_RTX
, 0, OPTAB_WIDEN
);
9164 emit_cmp_and_jump_insns (align_rtx
, const0_rtx
, EQ
, NULL
,
9165 Pmode
, 1, align_4_label
);
9168 mem
= change_address (src
, QImode
, out
);
9170 /* Now compare the bytes. */
9172 /* Compare the first n unaligned byte on a byte per byte basis. */
9173 emit_cmp_and_jump_insns (mem
, const0_rtx
, EQ
, NULL
,
9174 QImode
, 1, end_0_label
);
9176 /* Increment the address. */
9177 emit_insn (gen_add2_insn (out
, const1_rtx
));
9179 /* Not needed with an alignment of 2 */
9182 emit_label (align_2_label
);
9184 emit_cmp_and_jump_insns (mem
, const0_rtx
, EQ
, NULL
, QImode
, 1,
9187 emit_insn (gen_add2_insn (out
, const1_rtx
));
9189 emit_label (align_3_label
);
9192 emit_cmp_and_jump_insns (mem
, const0_rtx
, EQ
, NULL
, QImode
, 1,
9195 emit_insn (gen_add2_insn (out
, const1_rtx
));
9198 /* Generate loop to check 4 bytes at a time. It is not a good idea to
9199 align this loop. It gives only huge programs, but does not help to
9201 emit_label (align_4_label
);
9203 mem
= change_address (src
, SImode
, out
);
9204 emit_move_insn (scratch
, mem
);
9205 emit_insn (gen_add2_insn (out
, GEN_INT (4)));
9207 /* This formula yields a nonzero result iff one of the bytes is zero.
9208 This saves three branches inside loop and many cycles. */
9210 emit_insn (gen_addsi3 (tmpreg
, scratch
, GEN_INT (-0x01010101)));
9211 emit_insn (gen_one_cmplsi2 (scratch
, scratch
));
9212 emit_insn (gen_andsi3 (tmpreg
, tmpreg
, scratch
));
9213 emit_insn (gen_andsi3 (tmpreg
, tmpreg
,
9214 gen_int_mode (0x80808080, SImode
)));
9215 emit_cmp_and_jump_insns (tmpreg
, const0_rtx
, EQ
, 0, SImode
, 1,
9220 rtx reg
= gen_reg_rtx (SImode
);
9221 rtx reg2
= gen_reg_rtx (Pmode
);
9222 emit_move_insn (reg
, tmpreg
);
9223 emit_insn (gen_lshrsi3 (reg
, reg
, GEN_INT (16)));
9225 /* If zero is not in the first two bytes, move two bytes forward. */
9226 emit_insn (gen_testsi_ccno_1 (tmpreg
, GEN_INT (0x8080)));
9227 tmp
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
9228 tmp
= gen_rtx_EQ (VOIDmode
, tmp
, const0_rtx
);
9229 emit_insn (gen_rtx_SET (tmpreg
,
9230 gen_rtx_IF_THEN_ELSE (SImode
, tmp
,
9233 /* Emit lea manually to avoid clobbering of flags. */
9234 emit_insn (gen_rtx_SET (reg2
, plus_constant (Pmode
, out
, 2)));
9236 tmp
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
9237 tmp
= gen_rtx_EQ (VOIDmode
, tmp
, const0_rtx
);
9238 emit_insn (gen_rtx_SET (out
,
9239 gen_rtx_IF_THEN_ELSE (Pmode
, tmp
,
9245 rtx_code_label
*end_2_label
= gen_label_rtx ();
9246 /* Is zero in the first two bytes? */
9248 emit_insn (gen_testsi_ccno_1 (tmpreg
, GEN_INT (0x8080)));
9249 tmp
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
9250 tmp
= gen_rtx_NE (VOIDmode
, tmp
, const0_rtx
);
9251 tmp
= gen_rtx_IF_THEN_ELSE (VOIDmode
, tmp
,
9252 gen_rtx_LABEL_REF (VOIDmode
, end_2_label
),
9254 tmp
= emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp
));
9255 JUMP_LABEL (tmp
) = end_2_label
;
9257 /* Not in the first two. Move two bytes forward. */
9258 emit_insn (gen_lshrsi3 (tmpreg
, tmpreg
, GEN_INT (16)));
9259 emit_insn (gen_add2_insn (out
, const2_rtx
));
9261 emit_label (end_2_label
);
9265 /* Avoid branch in fixing the byte. */
9266 tmpreg
= gen_lowpart (QImode
, tmpreg
);
9267 emit_insn (gen_addqi3_cconly_overflow (tmpreg
, tmpreg
));
9268 tmp
= gen_rtx_REG (CCmode
, FLAGS_REG
);
9269 cmp
= gen_rtx_LTU (VOIDmode
, tmp
, const0_rtx
);
9270 emit_insn (gen_sub3_carry (Pmode
, out
, out
, GEN_INT (3), tmp
, cmp
));
9272 emit_label (end_0_label
);
9275 /* Expand strlen. */
9278 ix86_expand_strlen (rtx out
, rtx src
, rtx eoschar
, rtx align
)
9280 if (TARGET_UNROLL_STRLEN
9281 && TARGET_INLINE_ALL_STRINGOPS
9282 && eoschar
== const0_rtx
9285 /* The generic case of strlen expander is long. Avoid it's
9286 expanding unless TARGET_INLINE_ALL_STRINGOPS. */
9287 rtx addr
= force_reg (Pmode
, XEXP (src
, 0));
9288 /* Well it seems that some optimizer does not combine a call like
9289 foo(strlen(bar), strlen(bar));
9290 when the move and the subtraction is done here. It does calculate
9291 the length just once when these instructions are done inside of
9292 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
9293 often used and I use one fewer register for the lifetime of
9294 output_strlen_unroll() this is better. */
9296 emit_move_insn (out
, addr
);
9298 ix86_expand_strlensi_unroll_1 (out
, src
, align
);
9300 /* strlensi_unroll_1 returns the address of the zero at the end of
9301 the string, like memchr(), so compute the length by subtracting
9302 the start address. */
9303 emit_insn (gen_sub2_insn (out
, addr
));
9310 /* For given symbol (function) construct code to compute address of it's PLT
9311 entry in large x86-64 PIC model. */
9314 construct_plt_address (rtx symbol
)
9318 gcc_assert (GET_CODE (symbol
) == SYMBOL_REF
);
9319 gcc_assert (ix86_cmodel
== CM_LARGE_PIC
&& !TARGET_PECOFF
);
9320 gcc_assert (Pmode
== DImode
);
9322 tmp
= gen_reg_rtx (Pmode
);
9323 unspec
= gen_rtx_UNSPEC (Pmode
, gen_rtvec (1, symbol
), UNSPEC_PLTOFF
);
9325 emit_move_insn (tmp
, gen_rtx_CONST (Pmode
, unspec
));
9326 emit_insn (gen_add2_insn (tmp
, pic_offset_table_rtx
));
9330 /* Additional registers that are clobbered by SYSV calls. */
9332 static int const x86_64_ms_sysv_extra_clobbered_registers
9333 [NUM_X86_64_MS_CLOBBERED_REGS
] =
9337 XMM8_REG
, XMM9_REG
, XMM10_REG
, XMM11_REG
,
9338 XMM12_REG
, XMM13_REG
, XMM14_REG
, XMM15_REG
9342 ix86_expand_call (rtx retval
, rtx fnaddr
, rtx callarg1
,
9344 rtx pop
, bool sibcall
)
9347 rtx use
= NULL
, call
;
9348 unsigned int vec_len
= 0;
9351 if (GET_CODE (XEXP (fnaddr
, 0)) == SYMBOL_REF
)
9353 fndecl
= SYMBOL_REF_DECL (XEXP (fnaddr
, 0));
9355 && (lookup_attribute ("interrupt",
9356 TYPE_ATTRIBUTES (TREE_TYPE (fndecl
)))))
9357 error ("interrupt service routine cannot be called directly");
9362 if (pop
== const0_rtx
)
9364 gcc_assert (!TARGET_64BIT
|| !pop
);
9366 rtx addr
= XEXP (fnaddr
, 0);
9367 if (TARGET_MACHO
&& !TARGET_64BIT
)
9370 if (flag_pic
&& GET_CODE (XEXP (fnaddr
, 0)) == SYMBOL_REF
)
9371 fnaddr
= machopic_indirect_call_target (fnaddr
);
9376 /* Static functions and indirect calls don't need the pic register. Also,
9377 check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
9378 it an indirect call. */
9380 && GET_CODE (addr
) == SYMBOL_REF
9381 && ix86_call_use_plt_p (addr
))
9384 && (SYMBOL_REF_DECL (addr
) == NULL_TREE
9385 || !lookup_attribute ("noplt",
9386 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr
)))))
9389 || (ix86_cmodel
== CM_LARGE_PIC
9390 && DEFAULT_ABI
!= MS_ABI
))
9392 use_reg (&use
, gen_rtx_REG (Pmode
,
9393 REAL_PIC_OFFSET_TABLE_REGNUM
));
9394 if (ix86_use_pseudo_pic_reg ())
9395 emit_move_insn (gen_rtx_REG (Pmode
,
9396 REAL_PIC_OFFSET_TABLE_REGNUM
),
9397 pic_offset_table_rtx
);
9400 else if (!TARGET_PECOFF
&& !TARGET_MACHO
)
9403 && ix86_cmodel
== CM_LARGE_PIC
9404 && DEFAULT_ABI
!= MS_ABI
)
9406 fnaddr
= gen_rtx_UNSPEC (Pmode
, gen_rtvec (1, addr
),
9408 fnaddr
= gen_rtx_CONST (Pmode
, fnaddr
);
9409 fnaddr
= force_reg (Pmode
, fnaddr
);
9410 fnaddr
= gen_rtx_PLUS (Pmode
, pic_offset_table_rtx
, fnaddr
);
9412 else if (TARGET_64BIT
)
9414 fnaddr
= gen_rtx_UNSPEC (Pmode
,
9415 gen_rtvec (1, addr
),
9417 fnaddr
= gen_rtx_CONST (Pmode
, fnaddr
);
9421 fnaddr
= gen_rtx_UNSPEC (Pmode
, gen_rtvec (1, addr
),
9423 fnaddr
= gen_rtx_CONST (Pmode
, fnaddr
);
9424 fnaddr
= gen_rtx_PLUS (Pmode
, pic_offset_table_rtx
,
9427 fnaddr
= gen_const_mem (Pmode
, fnaddr
);
9428 /* Pmode may not be the same as word_mode for x32, which
9429 doesn't support indirect branch via 32-bit memory slot.
9430 Since x32 GOT slot is 64 bit with zero upper 32 bits,
9431 indirect branch via x32 GOT slot is OK. */
9432 if (GET_MODE (fnaddr
) != word_mode
)
9433 fnaddr
= gen_rtx_ZERO_EXTEND (word_mode
, fnaddr
);
9434 fnaddr
= gen_rtx_MEM (QImode
, fnaddr
);
9439 /* Skip setting up RAX register for -mskip-rax-setup when there are no
9440 parameters passed in vector registers. */
9442 && (INTVAL (callarg2
) > 0
9443 || (INTVAL (callarg2
) == 0
9444 && (TARGET_SSE
|| !flag_skip_rax_setup
))))
9446 rtx al
= gen_rtx_REG (QImode
, AX_REG
);
9447 emit_move_insn (al
, callarg2
);
9451 if (ix86_cmodel
== CM_LARGE_PIC
9454 && GET_CODE (XEXP (fnaddr
, 0)) == SYMBOL_REF
9455 && !local_symbolic_operand (XEXP (fnaddr
, 0), VOIDmode
))
9456 fnaddr
= gen_rtx_MEM (QImode
, construct_plt_address (XEXP (fnaddr
, 0)));
9457 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
9458 branch via x32 GOT slot is OK. */
9459 else if (!(TARGET_X32
9461 && GET_CODE (XEXP (fnaddr
, 0)) == ZERO_EXTEND
9462 && GOT_memory_operand (XEXP (XEXP (fnaddr
, 0), 0), Pmode
))
9464 ? !sibcall_insn_operand (XEXP (fnaddr
, 0), word_mode
)
9465 : !call_insn_operand (XEXP (fnaddr
, 0), word_mode
)))
9467 fnaddr
= convert_to_mode (word_mode
, XEXP (fnaddr
, 0), 1);
9468 fnaddr
= gen_rtx_MEM (QImode
, copy_to_mode_reg (word_mode
, fnaddr
));
9471 /* PR100665: Hwasan may tag code pointer which is not supported by LAM,
9472 mask off code pointers here.
9473 TODO: also need to handle indirect jump. */
9474 if (ix86_memtag_can_tag_addresses () && !fndecl
9475 && sanitize_flags_p (SANITIZE_HWADDRESS
))
9477 rtx untagged_addr
= ix86_memtag_untagged_pointer (XEXP (fnaddr
, 0),
9479 fnaddr
= gen_rtx_MEM (QImode
, untagged_addr
);
9482 call
= gen_rtx_CALL (VOIDmode
, fnaddr
, callarg1
);
9485 call
= gen_rtx_SET (retval
, call
);
9486 vec
[vec_len
++] = call
;
9490 pop
= gen_rtx_PLUS (Pmode
, stack_pointer_rtx
, pop
);
9491 pop
= gen_rtx_SET (stack_pointer_rtx
, pop
);
9492 vec
[vec_len
++] = pop
;
9495 if (cfun
->machine
->no_caller_saved_registers
9497 || (!TREE_THIS_VOLATILE (fndecl
)
9498 && !lookup_attribute ("no_caller_saved_registers",
9499 TYPE_ATTRIBUTES (TREE_TYPE (fndecl
))))))
9501 static const char ix86_call_used_regs
[] = CALL_USED_REGISTERS
;
9502 bool is_64bit_ms_abi
= (TARGET_64BIT
9503 && ix86_function_abi (fndecl
) == MS_ABI
);
9504 char c_mask
= CALL_USED_REGISTERS_MASK (is_64bit_ms_abi
);
9506 /* If there are no caller-saved registers, add all registers
9507 that are clobbered by the call which returns. */
9508 for (int i
= 0; i
< FIRST_PSEUDO_REGISTER
; i
++)
9510 && (ix86_call_used_regs
[i
] == 1
9511 || (ix86_call_used_regs
[i
] & c_mask
))
9512 && !STACK_REGNO_P (i
)
9513 && !MMX_REGNO_P (i
))
9515 gen_rtx_REG (GET_MODE (regno_reg_rtx
[i
]), i
));
9517 else if (TARGET_64BIT_MS_ABI
9518 && (!callarg2
|| INTVAL (callarg2
) != -2))
9522 for (i
= 0; i
< NUM_X86_64_MS_CLOBBERED_REGS
; i
++)
9524 int regno
= x86_64_ms_sysv_extra_clobbered_registers
[i
];
9525 machine_mode mode
= SSE_REGNO_P (regno
) ? TImode
: DImode
;
9527 clobber_reg (&use
, gen_rtx_REG (mode
, regno
));
9530 /* Set here, but it may get cleared later. */
9531 if (TARGET_CALL_MS2SYSV_XLOGUES
)
9536 /* Don't break hot-patched functions. */
9537 else if (ix86_function_ms_hook_prologue (current_function_decl
))
9540 /* TODO: Cases not yet examined. */
9541 else if (flag_split_stack
)
9542 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
9546 gcc_assert (!reload_completed
);
9547 cfun
->machine
->call_ms2sysv
= true;
9552 if (TARGET_MACHO
&& TARGET_64BIT
&& !sibcall
9553 && ((GET_CODE (addr
) == SYMBOL_REF
&& !SYMBOL_REF_LOCAL_P (addr
))
9554 || !fndecl
|| TREE_PUBLIC (fndecl
)))
9556 /* We allow public functions defined in a TU to bind locally for PIC
9557 code (the default) on 64bit Mach-O.
9558 If such functions are not inlined, we cannot tell at compile-time if
9559 they will be called via the lazy symbol resolver (this can depend on
9560 options given at link-time). Therefore, we must assume that the lazy
9561 resolver could be used which clobbers R11 and R10. */
9562 clobber_reg (&use
, gen_rtx_REG (DImode
, R11_REG
));
9563 clobber_reg (&use
, gen_rtx_REG (DImode
, R10_REG
));
9567 call
= gen_rtx_PARALLEL (VOIDmode
, gen_rtvec_v (vec_len
, vec
));
9568 rtx_insn
*call_insn
= emit_call_insn (call
);
9570 CALL_INSN_FUNCTION_USAGE (call_insn
) = use
;
9575 /* Split simple return with popping POPC bytes from stack to indirect
9576 branch with stack adjustment . */
9579 ix86_split_simple_return_pop_internal (rtx popc
)
9581 struct machine_function
*m
= cfun
->machine
;
9582 rtx ecx
= gen_rtx_REG (SImode
, CX_REG
);
9585 /* There is no "pascal" calling convention in any 64bit ABI. */
9586 gcc_assert (!TARGET_64BIT
);
9588 insn
= emit_insn (gen_pop (ecx
));
9589 m
->fs
.cfa_offset
-= UNITS_PER_WORD
;
9590 m
->fs
.sp_offset
-= UNITS_PER_WORD
;
9592 rtx x
= plus_constant (Pmode
, stack_pointer_rtx
, UNITS_PER_WORD
);
9593 x
= gen_rtx_SET (stack_pointer_rtx
, x
);
9594 add_reg_note (insn
, REG_CFA_ADJUST_CFA
, x
);
9595 add_reg_note (insn
, REG_CFA_REGISTER
, gen_rtx_SET (ecx
, pc_rtx
));
9596 RTX_FRAME_RELATED_P (insn
) = 1;
9598 x
= gen_rtx_PLUS (Pmode
, stack_pointer_rtx
, popc
);
9599 x
= gen_rtx_SET (stack_pointer_rtx
, x
);
9600 insn
= emit_insn (x
);
9601 add_reg_note (insn
, REG_CFA_ADJUST_CFA
, x
);
9602 RTX_FRAME_RELATED_P (insn
) = 1;
9604 /* Now return address is in ECX. */
9605 emit_jump_insn (gen_simple_return_indirect_internal (ecx
));
9608 /* Errors in the source file can cause expand_expr to return const0_rtx
9609 where we expect a vector. To avoid crashing, use one of the vector
9610 clear instructions. */
9613 safe_vector_operand (rtx x
, machine_mode mode
)
9615 if (x
== const0_rtx
)
9616 x
= CONST0_RTX (mode
);
9620 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
9623 ix86_expand_binop_builtin (enum insn_code icode
, tree exp
, rtx target
)
9626 tree arg0
= CALL_EXPR_ARG (exp
, 0);
9627 tree arg1
= CALL_EXPR_ARG (exp
, 1);
9628 rtx op0
= expand_normal (arg0
);
9629 rtx op1
= expand_normal (arg1
);
9630 machine_mode tmode
= insn_data
[icode
].operand
[0].mode
;
9631 machine_mode mode0
= insn_data
[icode
].operand
[1].mode
;
9632 machine_mode mode1
= insn_data
[icode
].operand
[2].mode
;
9634 if (VECTOR_MODE_P (mode0
))
9635 op0
= safe_vector_operand (op0
, mode0
);
9636 if (VECTOR_MODE_P (mode1
))
9637 op1
= safe_vector_operand (op1
, mode1
);
9639 if (optimize
|| !target
9640 || GET_MODE (target
) != tmode
9641 || !insn_data
[icode
].operand
[0].predicate (target
, tmode
))
9642 target
= gen_reg_rtx (tmode
);
9644 if (GET_MODE (op1
) == SImode
&& mode1
== TImode
)
9646 rtx x
= gen_reg_rtx (V4SImode
);
9647 emit_insn (gen_sse2_loadd (x
, op1
));
9648 op1
= gen_lowpart (TImode
, x
);
9651 if (!insn_data
[icode
].operand
[1].predicate (op0
, mode0
))
9652 op0
= copy_to_mode_reg (mode0
, op0
);
9653 if (!insn_data
[icode
].operand
[2].predicate (op1
, mode1
))
9654 op1
= copy_to_mode_reg (mode1
, op1
);
9656 pat
= GEN_FCN (icode
) (target
, op0
, op1
);
9665 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
9668 ix86_expand_multi_arg_builtin (enum insn_code icode
, tree exp
, rtx target
,
9669 enum ix86_builtin_func_type m_type
,
9670 enum rtx_code sub_code
)
9673 unsigned int i
, nargs
;
9674 bool comparison_p
= false;
9676 bool last_arg_constant
= false;
9680 machine_mode tmode
= insn_data
[icode
].operand
[0].mode
;
9684 case MULTI_ARG_4_DF2_DI_I
:
9685 case MULTI_ARG_4_DF2_DI_I1
:
9686 case MULTI_ARG_4_SF2_SI_I
:
9687 case MULTI_ARG_4_SF2_SI_I1
:
9689 last_arg_constant
= true;
9692 case MULTI_ARG_3_SF
:
9693 case MULTI_ARG_3_DF
:
9694 case MULTI_ARG_3_SF2
:
9695 case MULTI_ARG_3_DF2
:
9696 case MULTI_ARG_3_DI
:
9697 case MULTI_ARG_3_SI
:
9698 case MULTI_ARG_3_SI_DI
:
9699 case MULTI_ARG_3_HI
:
9700 case MULTI_ARG_3_HI_SI
:
9701 case MULTI_ARG_3_QI
:
9702 case MULTI_ARG_3_DI2
:
9703 case MULTI_ARG_3_SI2
:
9704 case MULTI_ARG_3_HI2
:
9705 case MULTI_ARG_3_QI2
:
9709 case MULTI_ARG_2_SF
:
9710 case MULTI_ARG_2_DF
:
9711 case MULTI_ARG_2_DI
:
9712 case MULTI_ARG_2_SI
:
9713 case MULTI_ARG_2_HI
:
9714 case MULTI_ARG_2_QI
:
9718 case MULTI_ARG_2_DI_IMM
:
9719 case MULTI_ARG_2_SI_IMM
:
9720 case MULTI_ARG_2_HI_IMM
:
9721 case MULTI_ARG_2_QI_IMM
:
9723 last_arg_constant
= true;
9726 case MULTI_ARG_1_SF
:
9727 case MULTI_ARG_1_DF
:
9728 case MULTI_ARG_1_SF2
:
9729 case MULTI_ARG_1_DF2
:
9730 case MULTI_ARG_1_DI
:
9731 case MULTI_ARG_1_SI
:
9732 case MULTI_ARG_1_HI
:
9733 case MULTI_ARG_1_QI
:
9734 case MULTI_ARG_1_SI_DI
:
9735 case MULTI_ARG_1_HI_DI
:
9736 case MULTI_ARG_1_HI_SI
:
9737 case MULTI_ARG_1_QI_DI
:
9738 case MULTI_ARG_1_QI_SI
:
9739 case MULTI_ARG_1_QI_HI
:
9743 case MULTI_ARG_2_DI_CMP
:
9744 case MULTI_ARG_2_SI_CMP
:
9745 case MULTI_ARG_2_HI_CMP
:
9746 case MULTI_ARG_2_QI_CMP
:
9748 comparison_p
= true;
9751 case MULTI_ARG_2_SF_TF
:
9752 case MULTI_ARG_2_DF_TF
:
9753 case MULTI_ARG_2_DI_TF
:
9754 case MULTI_ARG_2_SI_TF
:
9755 case MULTI_ARG_2_HI_TF
:
9756 case MULTI_ARG_2_QI_TF
:
9765 if (optimize
|| !target
9766 || GET_MODE (target
) != tmode
9767 || !insn_data
[icode
].operand
[0].predicate (target
, tmode
))
9768 target
= gen_reg_rtx (tmode
);
9769 else if (memory_operand (target
, tmode
))
9772 gcc_assert (nargs
<= ARRAY_SIZE (xops
));
9774 for (i
= 0; i
< nargs
; i
++)
9776 tree arg
= CALL_EXPR_ARG (exp
, i
);
9777 rtx op
= expand_normal (arg
);
9778 int adjust
= (comparison_p
) ? 1 : 0;
9779 machine_mode mode
= insn_data
[icode
].operand
[i
+adjust
+1].mode
;
9781 if (last_arg_constant
&& i
== nargs
- 1)
9783 if (!insn_data
[icode
].operand
[i
+ 1].predicate (op
, mode
))
9785 enum insn_code new_icode
= icode
;
9788 case CODE_FOR_xop_vpermil2v2df3
:
9789 case CODE_FOR_xop_vpermil2v4sf3
:
9790 case CODE_FOR_xop_vpermil2v4df3
:
9791 case CODE_FOR_xop_vpermil2v8sf3
:
9792 error ("the last argument must be a 2-bit immediate");
9793 return gen_reg_rtx (tmode
);
9794 case CODE_FOR_xop_rotlv2di3
:
9795 new_icode
= CODE_FOR_rotlv2di3
;
9797 case CODE_FOR_xop_rotlv4si3
:
9798 new_icode
= CODE_FOR_rotlv4si3
;
9800 case CODE_FOR_xop_rotlv8hi3
:
9801 new_icode
= CODE_FOR_rotlv8hi3
;
9803 case CODE_FOR_xop_rotlv16qi3
:
9804 new_icode
= CODE_FOR_rotlv16qi3
;
9806 if (CONST_INT_P (op
))
9808 int mask
= GET_MODE_UNIT_BITSIZE (tmode
) - 1;
9809 op
= GEN_INT (INTVAL (op
) & mask
);
9811 (insn_data
[icode
].operand
[i
+ 1].predicate (op
, mode
));
9817 && insn_data
[new_icode
].operand
[0].mode
== tmode
9818 && insn_data
[new_icode
].operand
[1].mode
== tmode
9819 && insn_data
[new_icode
].operand
[2].mode
== mode
9820 && insn_data
[new_icode
].operand
[0].predicate
9821 == insn_data
[icode
].operand
[0].predicate
9822 && insn_data
[new_icode
].operand
[1].predicate
9823 == insn_data
[icode
].operand
[1].predicate
);
9836 if (VECTOR_MODE_P (mode
))
9837 op
= safe_vector_operand (op
, mode
);
9839 /* If we aren't optimizing, only allow one memory operand to be
9841 if (memory_operand (op
, mode
))
9844 gcc_assert (GET_MODE (op
) == mode
|| GET_MODE (op
) == VOIDmode
);
9847 || !insn_data
[icode
].operand
[i
+adjust
+1].predicate (op
, mode
)
9849 op
= force_reg (mode
, op
);
9858 pat
= GEN_FCN (icode
) (target
, xops
[0]);
9863 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1],
9864 GEN_INT ((int)sub_code
));
9865 else if (! comparison_p
)
9866 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1]);
9869 rtx cmp_op
= gen_rtx_fmt_ee (sub_code
, GET_MODE (target
),
9872 pat
= GEN_FCN (icode
) (target
, cmp_op
, xops
[0], xops
[1]);
9877 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1], xops
[2]);
9881 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1], xops
[2], xops
[3]);
9895 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
9896 insns with vec_merge. */
9899 ix86_expand_unop_vec_merge_builtin (enum insn_code icode
, tree exp
,
9903 tree arg0
= CALL_EXPR_ARG (exp
, 0);
9904 rtx op1
, op0
= expand_normal (arg0
);
9905 machine_mode tmode
= insn_data
[icode
].operand
[0].mode
;
9906 machine_mode mode0
= insn_data
[icode
].operand
[1].mode
;
9908 if (optimize
|| !target
9909 || GET_MODE (target
) != tmode
9910 || !insn_data
[icode
].operand
[0].predicate (target
, tmode
))
9911 target
= gen_reg_rtx (tmode
);
9913 if (VECTOR_MODE_P (mode0
))
9914 op0
= safe_vector_operand (op0
, mode0
);
9916 if ((optimize
&& !register_operand (op0
, mode0
))
9917 || !insn_data
[icode
].operand
[1].predicate (op0
, mode0
))
9918 op0
= copy_to_mode_reg (mode0
, op0
);
9921 if (!insn_data
[icode
].operand
[2].predicate (op1
, mode0
))
9922 op1
= copy_to_mode_reg (mode0
, op1
);
9924 pat
= GEN_FCN (icode
) (target
, op0
, op1
);
9931 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
9934 ix86_expand_sse_compare (const struct builtin_description
*d
,
9935 tree exp
, rtx target
, bool swap
)
9938 tree arg0
= CALL_EXPR_ARG (exp
, 0);
9939 tree arg1
= CALL_EXPR_ARG (exp
, 1);
9940 rtx op0
= expand_normal (arg0
);
9941 rtx op1
= expand_normal (arg1
);
9943 machine_mode tmode
= insn_data
[d
->icode
].operand
[0].mode
;
9944 machine_mode mode0
= insn_data
[d
->icode
].operand
[1].mode
;
9945 machine_mode mode1
= insn_data
[d
->icode
].operand
[2].mode
;
9946 enum rtx_code comparison
= d
->comparison
;
9948 if (VECTOR_MODE_P (mode0
))
9949 op0
= safe_vector_operand (op0
, mode0
);
9950 if (VECTOR_MODE_P (mode1
))
9951 op1
= safe_vector_operand (op1
, mode1
);
9953 /* Swap operands if we have a comparison that isn't available in
9956 std::swap (op0
, op1
);
9958 if (optimize
|| !target
9959 || GET_MODE (target
) != tmode
9960 || !insn_data
[d
->icode
].operand
[0].predicate (target
, tmode
))
9961 target
= gen_reg_rtx (tmode
);
9963 if ((optimize
&& !register_operand (op0
, mode0
))
9964 || !insn_data
[d
->icode
].operand
[1].predicate (op0
, mode0
))
9965 op0
= copy_to_mode_reg (mode0
, op0
);
9966 if ((optimize
&& !register_operand (op1
, mode1
))
9967 || !insn_data
[d
->icode
].operand
[2].predicate (op1
, mode1
))
9968 op1
= copy_to_mode_reg (mode1
, op1
);
9970 op2
= gen_rtx_fmt_ee (comparison
, mode0
, op0
, op1
);
9971 pat
= GEN_FCN (d
->icode
) (target
, op0
, op1
, op2
);
9978 /* Subroutine of ix86_sse_comi and ix86_sse_comi_round to take care of
9979 * ordered EQ or unordered NE, generate PF jump. */
9982 ix86_ssecom_setcc (const enum rtx_code comparison
,
9983 bool check_unordered
, machine_mode mode
,
9984 rtx set_dst
, rtx target
)
9987 rtx_code_label
*label
= NULL
;
9989 /* NB: For ordered EQ or unordered NE, check ZF alone isn't sufficient
9990 with NAN operands. */
9991 if (check_unordered
)
9993 gcc_assert (comparison
== EQ
|| comparison
== NE
);
9995 rtx flag
= gen_rtx_REG (CCFPmode
, FLAGS_REG
);
9996 label
= gen_label_rtx ();
9997 rtx tmp
= gen_rtx_fmt_ee (UNORDERED
, VOIDmode
, flag
, const0_rtx
);
9998 tmp
= gen_rtx_IF_THEN_ELSE (VOIDmode
, tmp
,
9999 gen_rtx_LABEL_REF (VOIDmode
, label
),
10001 emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp
));
10004 /* NB: Set CCFPmode and check a different CCmode which is in subset
10006 if (GET_MODE (set_dst
) != mode
)
10008 gcc_assert (mode
== CCAmode
|| mode
== CCCmode
10009 || mode
== CCOmode
|| mode
== CCPmode
10010 || mode
== CCSmode
|| mode
== CCZmode
);
10011 set_dst
= gen_rtx_REG (mode
, FLAGS_REG
);
10014 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode
, target
),
10015 gen_rtx_fmt_ee (comparison
, QImode
,
10020 emit_label (label
);
10022 return SUBREG_REG (target
);
10025 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
10028 ix86_expand_sse_comi (const struct builtin_description
*d
, tree exp
,
10032 tree arg0
= CALL_EXPR_ARG (exp
, 0);
10033 tree arg1
= CALL_EXPR_ARG (exp
, 1);
10034 rtx op0
= expand_normal (arg0
);
10035 rtx op1
= expand_normal (arg1
);
10036 enum insn_code icode
= d
->icode
;
10037 const struct insn_data_d
*insn_p
= &insn_data
[icode
];
10038 machine_mode mode0
= insn_p
->operand
[0].mode
;
10039 machine_mode mode1
= insn_p
->operand
[1].mode
;
10041 if (VECTOR_MODE_P (mode0
))
10042 op0
= safe_vector_operand (op0
, mode0
);
10043 if (VECTOR_MODE_P (mode1
))
10044 op1
= safe_vector_operand (op1
, mode1
);
10046 enum rtx_code comparison
= d
->comparison
;
10047 rtx const_val
= const0_rtx
;
10049 bool check_unordered
= false;
10050 machine_mode mode
= CCFPmode
;
10051 switch (comparison
)
10053 case LE
: /* -> GE */
10054 case LT
: /* -> GT */
10055 std::swap (op0
, op1
);
10056 comparison
= swap_condition (comparison
);
10062 check_unordered
= true;
10066 check_unordered
= true;
10068 const_val
= const1_rtx
;
10071 gcc_unreachable ();
10074 target
= gen_reg_rtx (SImode
);
10075 emit_move_insn (target
, const_val
);
10076 target
= gen_rtx_SUBREG (QImode
, target
, 0);
10078 if ((optimize
&& !register_operand (op0
, mode0
))
10079 || !insn_p
->operand
[0].predicate (op0
, mode0
))
10080 op0
= copy_to_mode_reg (mode0
, op0
);
10081 if ((optimize
&& !register_operand (op1
, mode1
))
10082 || !insn_p
->operand
[1].predicate (op1
, mode1
))
10083 op1
= copy_to_mode_reg (mode1
, op1
);
10085 pat
= GEN_FCN (icode
) (op0
, op1
);
10089 set_dst
= SET_DEST (pat
);
10091 return ix86_ssecom_setcc (comparison
, check_unordered
, mode
,
10095 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
10098 ix86_expand_sse_round (const struct builtin_description
*d
, tree exp
,
10102 tree arg0
= CALL_EXPR_ARG (exp
, 0);
10103 rtx op1
, op0
= expand_normal (arg0
);
10104 machine_mode tmode
= insn_data
[d
->icode
].operand
[0].mode
;
10105 machine_mode mode0
= insn_data
[d
->icode
].operand
[1].mode
;
10107 if (optimize
|| target
== 0
10108 || GET_MODE (target
) != tmode
10109 || !insn_data
[d
->icode
].operand
[0].predicate (target
, tmode
))
10110 target
= gen_reg_rtx (tmode
);
10112 if (VECTOR_MODE_P (mode0
))
10113 op0
= safe_vector_operand (op0
, mode0
);
10115 if ((optimize
&& !register_operand (op0
, mode0
))
10116 || !insn_data
[d
->icode
].operand
[0].predicate (op0
, mode0
))
10117 op0
= copy_to_mode_reg (mode0
, op0
);
10119 op1
= GEN_INT (d
->comparison
);
10121 pat
= GEN_FCN (d
->icode
) (target
, op0
, op1
);
10129 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description
*d
,
10130 tree exp
, rtx target
)
10133 tree arg0
= CALL_EXPR_ARG (exp
, 0);
10134 tree arg1
= CALL_EXPR_ARG (exp
, 1);
10135 rtx op0
= expand_normal (arg0
);
10136 rtx op1
= expand_normal (arg1
);
10138 machine_mode tmode
= insn_data
[d
->icode
].operand
[0].mode
;
10139 machine_mode mode0
= insn_data
[d
->icode
].operand
[1].mode
;
10140 machine_mode mode1
= insn_data
[d
->icode
].operand
[2].mode
;
10142 if (optimize
|| target
== 0
10143 || GET_MODE (target
) != tmode
10144 || !insn_data
[d
->icode
].operand
[0].predicate (target
, tmode
))
10145 target
= gen_reg_rtx (tmode
);
10147 op0
= safe_vector_operand (op0
, mode0
);
10148 op1
= safe_vector_operand (op1
, mode1
);
10150 if ((optimize
&& !register_operand (op0
, mode0
))
10151 || !insn_data
[d
->icode
].operand
[0].predicate (op0
, mode0
))
10152 op0
= copy_to_mode_reg (mode0
, op0
);
10153 if ((optimize
&& !register_operand (op1
, mode1
))
10154 || !insn_data
[d
->icode
].operand
[1].predicate (op1
, mode1
))
10155 op1
= copy_to_mode_reg (mode1
, op1
);
10157 op2
= GEN_INT (d
->comparison
);
10159 pat
= GEN_FCN (d
->icode
) (target
, op0
, op1
, op2
);
10166 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
10169 ix86_expand_sse_ptest (const struct builtin_description
*d
, tree exp
,
10173 tree arg0
= CALL_EXPR_ARG (exp
, 0);
10174 tree arg1
= CALL_EXPR_ARG (exp
, 1);
10175 rtx op0
= expand_normal (arg0
);
10176 rtx op1
= expand_normal (arg1
);
10177 machine_mode mode0
= insn_data
[d
->icode
].operand
[0].mode
;
10178 machine_mode mode1
= insn_data
[d
->icode
].operand
[1].mode
;
10179 enum rtx_code comparison
= d
->comparison
;
10181 if (VECTOR_MODE_P (mode0
))
10182 op0
= safe_vector_operand (op0
, mode0
);
10183 if (VECTOR_MODE_P (mode1
))
10184 op1
= safe_vector_operand (op1
, mode1
);
10186 target
= gen_reg_rtx (SImode
);
10187 emit_move_insn (target
, const0_rtx
);
10188 target
= gen_rtx_SUBREG (QImode
, target
, 0);
10190 if ((optimize
&& !register_operand (op0
, mode0
))
10191 || !insn_data
[d
->icode
].operand
[0].predicate (op0
, mode0
))
10192 op0
= copy_to_mode_reg (mode0
, op0
);
10193 if ((optimize
&& !register_operand (op1
, mode1
))
10194 || !insn_data
[d
->icode
].operand
[1].predicate (op1
, mode1
))
10195 op1
= copy_to_mode_reg (mode1
, op1
);
10197 pat
= GEN_FCN (d
->icode
) (op0
, op1
);
10201 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode
, target
),
10202 gen_rtx_fmt_ee (comparison
, QImode
,
10206 return SUBREG_REG (target
);
10209 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
10212 ix86_expand_sse_pcmpestr (const struct builtin_description
*d
,
10213 tree exp
, rtx target
)
10216 tree arg0
= CALL_EXPR_ARG (exp
, 0);
10217 tree arg1
= CALL_EXPR_ARG (exp
, 1);
10218 tree arg2
= CALL_EXPR_ARG (exp
, 2);
10219 tree arg3
= CALL_EXPR_ARG (exp
, 3);
10220 tree arg4
= CALL_EXPR_ARG (exp
, 4);
10221 rtx scratch0
, scratch1
;
10222 rtx op0
= expand_normal (arg0
);
10223 rtx op1
= expand_normal (arg1
);
10224 rtx op2
= expand_normal (arg2
);
10225 rtx op3
= expand_normal (arg3
);
10226 rtx op4
= expand_normal (arg4
);
10227 machine_mode tmode0
, tmode1
, modev2
, modei3
, modev4
, modei5
, modeimm
;
10229 tmode0
= insn_data
[d
->icode
].operand
[0].mode
;
10230 tmode1
= insn_data
[d
->icode
].operand
[1].mode
;
10231 modev2
= insn_data
[d
->icode
].operand
[2].mode
;
10232 modei3
= insn_data
[d
->icode
].operand
[3].mode
;
10233 modev4
= insn_data
[d
->icode
].operand
[4].mode
;
10234 modei5
= insn_data
[d
->icode
].operand
[5].mode
;
10235 modeimm
= insn_data
[d
->icode
].operand
[6].mode
;
10237 if (VECTOR_MODE_P (modev2
))
10238 op0
= safe_vector_operand (op0
, modev2
);
10239 if (VECTOR_MODE_P (modev4
))
10240 op2
= safe_vector_operand (op2
, modev4
);
10242 if (!insn_data
[d
->icode
].operand
[2].predicate (op0
, modev2
))
10243 op0
= copy_to_mode_reg (modev2
, op0
);
10244 if (!insn_data
[d
->icode
].operand
[3].predicate (op1
, modei3
))
10245 op1
= copy_to_mode_reg (modei3
, op1
);
10246 if ((optimize
&& !register_operand (op2
, modev4
))
10247 || !insn_data
[d
->icode
].operand
[4].predicate (op2
, modev4
))
10248 op2
= copy_to_mode_reg (modev4
, op2
);
10249 if (!insn_data
[d
->icode
].operand
[5].predicate (op3
, modei5
))
10250 op3
= copy_to_mode_reg (modei5
, op3
);
10252 if (!insn_data
[d
->icode
].operand
[6].predicate (op4
, modeimm
))
10254 error ("the fifth argument must be an 8-bit immediate");
10258 if (d
->code
== IX86_BUILTIN_PCMPESTRI128
)
10260 if (optimize
|| !target
10261 || GET_MODE (target
) != tmode0
10262 || !insn_data
[d
->icode
].operand
[0].predicate (target
, tmode0
))
10263 target
= gen_reg_rtx (tmode0
);
10265 scratch1
= gen_reg_rtx (tmode1
);
10267 pat
= GEN_FCN (d
->icode
) (target
, scratch1
, op0
, op1
, op2
, op3
, op4
);
10269 else if (d
->code
== IX86_BUILTIN_PCMPESTRM128
)
10271 if (optimize
|| !target
10272 || GET_MODE (target
) != tmode1
10273 || !insn_data
[d
->icode
].operand
[1].predicate (target
, tmode1
))
10274 target
= gen_reg_rtx (tmode1
);
10276 scratch0
= gen_reg_rtx (tmode0
);
10278 pat
= GEN_FCN (d
->icode
) (scratch0
, target
, op0
, op1
, op2
, op3
, op4
);
10282 gcc_assert (d
->flag
);
10284 scratch0
= gen_reg_rtx (tmode0
);
10285 scratch1
= gen_reg_rtx (tmode1
);
10287 pat
= GEN_FCN (d
->icode
) (scratch0
, scratch1
, op0
, op1
, op2
, op3
, op4
);
10297 target
= gen_reg_rtx (SImode
);
10298 emit_move_insn (target
, const0_rtx
);
10299 target
= gen_rtx_SUBREG (QImode
, target
, 0);
10302 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode
, target
),
10303 gen_rtx_fmt_ee (EQ
, QImode
,
10304 gen_rtx_REG ((machine_mode
) d
->flag
,
10307 return SUBREG_REG (target
);
10314 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
10317 ix86_expand_sse_pcmpistr (const struct builtin_description
*d
,
10318 tree exp
, rtx target
)
10321 tree arg0
= CALL_EXPR_ARG (exp
, 0);
10322 tree arg1
= CALL_EXPR_ARG (exp
, 1);
10323 tree arg2
= CALL_EXPR_ARG (exp
, 2);
10324 rtx scratch0
, scratch1
;
10325 rtx op0
= expand_normal (arg0
);
10326 rtx op1
= expand_normal (arg1
);
10327 rtx op2
= expand_normal (arg2
);
10328 machine_mode tmode0
, tmode1
, modev2
, modev3
, modeimm
;
10330 tmode0
= insn_data
[d
->icode
].operand
[0].mode
;
10331 tmode1
= insn_data
[d
->icode
].operand
[1].mode
;
10332 modev2
= insn_data
[d
->icode
].operand
[2].mode
;
10333 modev3
= insn_data
[d
->icode
].operand
[3].mode
;
10334 modeimm
= insn_data
[d
->icode
].operand
[4].mode
;
10336 if (VECTOR_MODE_P (modev2
))
10337 op0
= safe_vector_operand (op0
, modev2
);
10338 if (VECTOR_MODE_P (modev3
))
10339 op1
= safe_vector_operand (op1
, modev3
);
10341 if (!insn_data
[d
->icode
].operand
[2].predicate (op0
, modev2
))
10342 op0
= copy_to_mode_reg (modev2
, op0
);
10343 if ((optimize
&& !register_operand (op1
, modev3
))
10344 || !insn_data
[d
->icode
].operand
[3].predicate (op1
, modev3
))
10345 op1
= copy_to_mode_reg (modev3
, op1
);
10347 if (!insn_data
[d
->icode
].operand
[4].predicate (op2
, modeimm
))
10349 error ("the third argument must be an 8-bit immediate");
10353 if (d
->code
== IX86_BUILTIN_PCMPISTRI128
)
10355 if (optimize
|| !target
10356 || GET_MODE (target
) != tmode0
10357 || !insn_data
[d
->icode
].operand
[0].predicate (target
, tmode0
))
10358 target
= gen_reg_rtx (tmode0
);
10360 scratch1
= gen_reg_rtx (tmode1
);
10362 pat
= GEN_FCN (d
->icode
) (target
, scratch1
, op0
, op1
, op2
);
10364 else if (d
->code
== IX86_BUILTIN_PCMPISTRM128
)
10366 if (optimize
|| !target
10367 || GET_MODE (target
) != tmode1
10368 || !insn_data
[d
->icode
].operand
[1].predicate (target
, tmode1
))
10369 target
= gen_reg_rtx (tmode1
);
10371 scratch0
= gen_reg_rtx (tmode0
);
10373 pat
= GEN_FCN (d
->icode
) (scratch0
, target
, op0
, op1
, op2
);
10377 gcc_assert (d
->flag
);
10379 scratch0
= gen_reg_rtx (tmode0
);
10380 scratch1
= gen_reg_rtx (tmode1
);
10382 pat
= GEN_FCN (d
->icode
) (scratch0
, scratch1
, op0
, op1
, op2
);
10392 target
= gen_reg_rtx (SImode
);
10393 emit_move_insn (target
, const0_rtx
);
10394 target
= gen_rtx_SUBREG (QImode
, target
, 0);
10397 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode
, target
),
10398 gen_rtx_fmt_ee (EQ
, QImode
,
10399 gen_rtx_REG ((machine_mode
) d
->flag
,
10402 return SUBREG_REG (target
);
10408 /* Fixup modeless constants to fit required mode. */
10411 fixup_modeless_constant (rtx x
, machine_mode mode
)
10413 if (GET_MODE (x
) == VOIDmode
)
10414 x
= convert_to_mode (mode
, x
, 1);
10418 /* Subroutine of ix86_expand_builtin to take care of insns with
10419 variable number of operands. */
10422 ix86_expand_args_builtin (const struct builtin_description
*d
,
10423 tree exp
, rtx target
)
10425 rtx pat
, real_target
;
10426 unsigned int i
, nargs
;
10427 unsigned int nargs_constant
= 0;
10428 unsigned int mask_pos
= 0;
10429 int num_memory
= 0;
10431 bool second_arg_count
= false;
10432 enum insn_code icode
= d
->icode
;
10433 const struct insn_data_d
*insn_p
= &insn_data
[icode
];
10434 machine_mode tmode
= insn_p
->operand
[0].mode
;
10435 machine_mode rmode
= VOIDmode
;
10437 enum rtx_code comparison
= d
->comparison
;
10439 switch ((enum ix86_builtin_func_type
) d
->flag
)
10441 case V2DF_FTYPE_V2DF_ROUND
:
10442 case V4DF_FTYPE_V4DF_ROUND
:
10443 case V8DF_FTYPE_V8DF_ROUND
:
10444 case V4SF_FTYPE_V4SF_ROUND
:
10445 case V8SF_FTYPE_V8SF_ROUND
:
10446 case V16SF_FTYPE_V16SF_ROUND
:
10447 case V8HF_FTYPE_V8HF_ROUND
:
10448 case V16HF_FTYPE_V16HF_ROUND
:
10449 case V32HF_FTYPE_V32HF_ROUND
:
10450 case V4SI_FTYPE_V4SF_ROUND
:
10451 case V8SI_FTYPE_V8SF_ROUND
:
10452 case V16SI_FTYPE_V16SF_ROUND
:
10453 return ix86_expand_sse_round (d
, exp
, target
);
10454 case V4SI_FTYPE_V2DF_V2DF_ROUND
:
10455 case V8SI_FTYPE_V4DF_V4DF_ROUND
:
10456 case V16SI_FTYPE_V8DF_V8DF_ROUND
:
10457 return ix86_expand_sse_round_vec_pack_sfix (d
, exp
, target
);
10458 case INT_FTYPE_V8SF_V8SF_PTEST
:
10459 case INT_FTYPE_V4DI_V4DI_PTEST
:
10460 case INT_FTYPE_V4DF_V4DF_PTEST
:
10461 case INT_FTYPE_V4SF_V4SF_PTEST
:
10462 case INT_FTYPE_V2DI_V2DI_PTEST
:
10463 case INT_FTYPE_V2DF_V2DF_PTEST
:
10464 return ix86_expand_sse_ptest (d
, exp
, target
);
10465 case FLOAT128_FTYPE_FLOAT128
:
10466 case FLOAT_FTYPE_FLOAT
:
10467 case FLOAT_FTYPE_BFLOAT16
:
10468 case INT_FTYPE_INT
:
10469 case UINT_FTYPE_UINT
:
10470 case UINT16_FTYPE_UINT16
:
10471 case UINT64_FTYPE_INT
:
10472 case UINT64_FTYPE_UINT64
:
10473 case INT64_FTYPE_INT64
:
10474 case INT64_FTYPE_V4SF
:
10475 case INT64_FTYPE_V2DF
:
10476 case INT_FTYPE_V16QI
:
10477 case INT_FTYPE_V8QI
:
10478 case INT_FTYPE_V8SF
:
10479 case INT_FTYPE_V4DF
:
10480 case INT_FTYPE_V4SF
:
10481 case INT_FTYPE_V2DF
:
10482 case INT_FTYPE_V32QI
:
10483 case V16QI_FTYPE_V16QI
:
10484 case V8SI_FTYPE_V8SF
:
10485 case V8SI_FTYPE_V4SI
:
10486 case V8HI_FTYPE_V8HI
:
10487 case V8HI_FTYPE_V16QI
:
10488 case V8QI_FTYPE_V8QI
:
10489 case V8SF_FTYPE_V8SF
:
10490 case V8SF_FTYPE_V8SI
:
10491 case V8SF_FTYPE_V4SF
:
10492 case V8SF_FTYPE_V8HI
:
10493 case V4SI_FTYPE_V4SI
:
10494 case V4SI_FTYPE_V16QI
:
10495 case V4SI_FTYPE_V4SF
:
10496 case V4SI_FTYPE_V8SI
:
10497 case V4SI_FTYPE_V8HI
:
10498 case V4SI_FTYPE_V4DF
:
10499 case V4SI_FTYPE_V2DF
:
10500 case V4HI_FTYPE_V4HI
:
10501 case V4DF_FTYPE_V4DF
:
10502 case V4DF_FTYPE_V4SI
:
10503 case V4DF_FTYPE_V4SF
:
10504 case V4DF_FTYPE_V2DF
:
10505 case V4SF_FTYPE_V4SF
:
10506 case V4SF_FTYPE_V4SI
:
10507 case V4SF_FTYPE_V8SF
:
10508 case V4SF_FTYPE_V4DF
:
10509 case V4SF_FTYPE_V8HI
:
10510 case V4SF_FTYPE_V2DF
:
10511 case V2DI_FTYPE_V2DI
:
10512 case V2DI_FTYPE_V16QI
:
10513 case V2DI_FTYPE_V8HI
:
10514 case V2DI_FTYPE_V4SI
:
10515 case V2DF_FTYPE_V2DF
:
10516 case V2DF_FTYPE_V4SI
:
10517 case V2DF_FTYPE_V4DF
:
10518 case V2DF_FTYPE_V4SF
:
10519 case V2DF_FTYPE_V2SI
:
10520 case V2SI_FTYPE_V2SI
:
10521 case V2SI_FTYPE_V4SF
:
10522 case V2SI_FTYPE_V2SF
:
10523 case V2SI_FTYPE_V2DF
:
10524 case V2SF_FTYPE_V2SF
:
10525 case V2SF_FTYPE_V2SI
:
10526 case V32QI_FTYPE_V32QI
:
10527 case V32QI_FTYPE_V16QI
:
10528 case V16HI_FTYPE_V16HI
:
10529 case V16HI_FTYPE_V8HI
:
10530 case V8SI_FTYPE_V8SI
:
10531 case V16HI_FTYPE_V16QI
:
10532 case V8SI_FTYPE_V16QI
:
10533 case V4DI_FTYPE_V16QI
:
10534 case V8SI_FTYPE_V8HI
:
10535 case V4DI_FTYPE_V8HI
:
10536 case V4DI_FTYPE_V4SI
:
10537 case V4DI_FTYPE_V2DI
:
10538 case UQI_FTYPE_UQI
:
10539 case UHI_FTYPE_UHI
:
10540 case USI_FTYPE_USI
:
10541 case USI_FTYPE_UQI
:
10542 case USI_FTYPE_UHI
:
10543 case UDI_FTYPE_UDI
:
10544 case UHI_FTYPE_V16QI
:
10545 case USI_FTYPE_V32QI
:
10546 case UDI_FTYPE_V64QI
:
10547 case V16QI_FTYPE_UHI
:
10548 case V32QI_FTYPE_USI
:
10549 case V64QI_FTYPE_UDI
:
10550 case V8HI_FTYPE_UQI
:
10551 case V16HI_FTYPE_UHI
:
10552 case V32HI_FTYPE_USI
:
10553 case V4SI_FTYPE_UQI
:
10554 case V8SI_FTYPE_UQI
:
10555 case V4SI_FTYPE_UHI
:
10556 case V8SI_FTYPE_UHI
:
10557 case UQI_FTYPE_V8HI
:
10558 case UHI_FTYPE_V16HI
:
10559 case USI_FTYPE_V32HI
:
10560 case UQI_FTYPE_V4SI
:
10561 case UQI_FTYPE_V8SI
:
10562 case UHI_FTYPE_V16SI
:
10563 case UQI_FTYPE_V2DI
:
10564 case UQI_FTYPE_V4DI
:
10565 case UQI_FTYPE_V8DI
:
10566 case V16SI_FTYPE_UHI
:
10567 case V2DI_FTYPE_UQI
:
10568 case V4DI_FTYPE_UQI
:
10569 case V16SI_FTYPE_INT
:
10570 case V16SF_FTYPE_V8SF
:
10571 case V16SI_FTYPE_V8SI
:
10572 case V16SF_FTYPE_V4SF
:
10573 case V16SI_FTYPE_V4SI
:
10574 case V16SI_FTYPE_V16SF
:
10575 case V16SI_FTYPE_V16SI
:
10576 case V64QI_FTYPE_V64QI
:
10577 case V32HI_FTYPE_V32HI
:
10578 case V16SF_FTYPE_V16SF
:
10579 case V8DI_FTYPE_UQI
:
10580 case V8DI_FTYPE_V8DI
:
10581 case V8DF_FTYPE_V4DF
:
10582 case V8DF_FTYPE_V2DF
:
10583 case V8DF_FTYPE_V8DF
:
10584 case V4DI_FTYPE_V4DI
:
10585 case V16BF_FTYPE_V16SF
:
10586 case V8BF_FTYPE_V8SF
:
10587 case V8BF_FTYPE_V4SF
:
10590 case V4SF_FTYPE_V4SF_VEC_MERGE
:
10591 case V2DF_FTYPE_V2DF_VEC_MERGE
:
10592 return ix86_expand_unop_vec_merge_builtin (icode
, exp
, target
);
10593 case FLOAT128_FTYPE_FLOAT128_FLOAT128
:
10594 case V16QI_FTYPE_V16QI_V16QI
:
10595 case V16QI_FTYPE_V8HI_V8HI
:
10596 case V16HF_FTYPE_V16HF_V16HF
:
10597 case V16SF_FTYPE_V16SF_V16SF
:
10598 case V8QI_FTYPE_V8QI_V8QI
:
10599 case V8QI_FTYPE_V4HI_V4HI
:
10600 case V8HI_FTYPE_V8HI_V8HI
:
10601 case V8HI_FTYPE_V16QI_V16QI
:
10602 case V8HI_FTYPE_V4SI_V4SI
:
10603 case V8HF_FTYPE_V8HF_V8HF
:
10604 case V8SF_FTYPE_V8SF_V8SF
:
10605 case V8SF_FTYPE_V8SF_V8SI
:
10606 case V8DF_FTYPE_V8DF_V8DF
:
10607 case V4SI_FTYPE_V4SI_V4SI
:
10608 case V4SI_FTYPE_V8HI_V8HI
:
10609 case V4SI_FTYPE_V2DF_V2DF
:
10610 case V4HI_FTYPE_V4HI_V4HI
:
10611 case V4HI_FTYPE_V8QI_V8QI
:
10612 case V4HI_FTYPE_V2SI_V2SI
:
10613 case V4DF_FTYPE_V4DF_V4DF
:
10614 case V4DF_FTYPE_V4DF_V4DI
:
10615 case V4SF_FTYPE_V4SF_V4SF
:
10616 case V4SF_FTYPE_V4SF_V4SI
:
10617 case V4SF_FTYPE_V4SF_V2SI
:
10618 case V4SF_FTYPE_V4SF_V2DF
:
10619 case V4SF_FTYPE_V4SF_UINT
:
10620 case V4SF_FTYPE_V4SF_DI
:
10621 case V4SF_FTYPE_V4SF_SI
:
10622 case V2DI_FTYPE_V2DI_V2DI
:
10623 case V2DI_FTYPE_V16QI_V16QI
:
10624 case V2DI_FTYPE_V4SI_V4SI
:
10625 case V2DI_FTYPE_V2DI_V16QI
:
10626 case V2SI_FTYPE_V2SI_V2SI
:
10627 case V2SI_FTYPE_V4HI_V4HI
:
10628 case V2SI_FTYPE_V2SF_V2SF
:
10629 case V2DF_FTYPE_V2DF_V2DF
:
10630 case V2DF_FTYPE_V2DF_V4SF
:
10631 case V2DF_FTYPE_V2DF_V2DI
:
10632 case V2DF_FTYPE_V2DF_DI
:
10633 case V2DF_FTYPE_V2DF_SI
:
10634 case V2DF_FTYPE_V2DF_UINT
:
10635 case V2SF_FTYPE_V2SF_V2SF
:
10636 case V1DI_FTYPE_V1DI_V1DI
:
10637 case V1DI_FTYPE_V8QI_V8QI
:
10638 case V1DI_FTYPE_V2SI_V2SI
:
10639 case V32QI_FTYPE_V16HI_V16HI
:
10640 case V16HI_FTYPE_V8SI_V8SI
:
10641 case V64QI_FTYPE_V64QI_V64QI
:
10642 case V32QI_FTYPE_V32QI_V32QI
:
10643 case V16HI_FTYPE_V32QI_V32QI
:
10644 case V16HI_FTYPE_V16HI_V16HI
:
10645 case V8SI_FTYPE_V4DF_V4DF
:
10646 case V8SI_FTYPE_V8SI_V8SI
:
10647 case V8SI_FTYPE_V16HI_V16HI
:
10648 case V4DI_FTYPE_V4DI_V4DI
:
10649 case V4DI_FTYPE_V8SI_V8SI
:
10650 case V4DI_FTYPE_V32QI_V32QI
:
10651 case V8DI_FTYPE_V64QI_V64QI
:
10652 if (comparison
== UNKNOWN
)
10653 return ix86_expand_binop_builtin (icode
, exp
, target
);
10656 case V4SF_FTYPE_V4SF_V4SF_SWAP
:
10657 case V2DF_FTYPE_V2DF_V2DF_SWAP
:
10658 gcc_assert (comparison
!= UNKNOWN
);
10662 case V16HI_FTYPE_V16HI_V8HI_COUNT
:
10663 case V16HI_FTYPE_V16HI_SI_COUNT
:
10664 case V8SI_FTYPE_V8SI_V4SI_COUNT
:
10665 case V8SI_FTYPE_V8SI_SI_COUNT
:
10666 case V4DI_FTYPE_V4DI_V2DI_COUNT
:
10667 case V4DI_FTYPE_V4DI_INT_COUNT
:
10668 case V8HI_FTYPE_V8HI_V8HI_COUNT
:
10669 case V8HI_FTYPE_V8HI_SI_COUNT
:
10670 case V4SI_FTYPE_V4SI_V4SI_COUNT
:
10671 case V4SI_FTYPE_V4SI_SI_COUNT
:
10672 case V4HI_FTYPE_V4HI_V4HI_COUNT
:
10673 case V4HI_FTYPE_V4HI_SI_COUNT
:
10674 case V2DI_FTYPE_V2DI_V2DI_COUNT
:
10675 case V2DI_FTYPE_V2DI_SI_COUNT
:
10676 case V2SI_FTYPE_V2SI_V2SI_COUNT
:
10677 case V2SI_FTYPE_V2SI_SI_COUNT
:
10678 case V1DI_FTYPE_V1DI_V1DI_COUNT
:
10679 case V1DI_FTYPE_V1DI_SI_COUNT
:
10681 second_arg_count
= true;
10683 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT
:
10684 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT
:
10685 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT
:
10686 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT
:
10687 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT
:
10688 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT
:
10689 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT
:
10690 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT
:
10691 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT
:
10692 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT
:
10693 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT
:
10694 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT
:
10695 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT
:
10696 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT
:
10697 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT
:
10698 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT
:
10699 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT
:
10700 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT
:
10702 second_arg_count
= true;
10704 case UINT64_FTYPE_UINT64_UINT64
:
10705 case UINT_FTYPE_UINT_UINT
:
10706 case UINT_FTYPE_UINT_USHORT
:
10707 case UINT_FTYPE_UINT_UCHAR
:
10708 case UINT16_FTYPE_UINT16_INT
:
10709 case UINT8_FTYPE_UINT8_INT
:
10710 case UQI_FTYPE_UQI_UQI
:
10711 case UHI_FTYPE_UHI_UHI
:
10712 case USI_FTYPE_USI_USI
:
10713 case UDI_FTYPE_UDI_UDI
:
10714 case V16SI_FTYPE_V8DF_V8DF
:
10715 case V32BF_FTYPE_V16SF_V16SF
:
10716 case V16BF_FTYPE_V8SF_V8SF
:
10717 case V8BF_FTYPE_V4SF_V4SF
:
10718 case V16BF_FTYPE_V16SF_UHI
:
10719 case V8BF_FTYPE_V8SF_UQI
:
10720 case V8BF_FTYPE_V4SF_UQI
:
10723 case V2DI_FTYPE_V2DI_INT_CONVERT
:
10726 nargs_constant
= 1;
10728 case V4DI_FTYPE_V4DI_INT_CONVERT
:
10731 nargs_constant
= 1;
10733 case V8DI_FTYPE_V8DI_INT_CONVERT
:
10736 nargs_constant
= 1;
10738 case V8HI_FTYPE_V8HI_INT
:
10739 case V8HI_FTYPE_V8SF_INT
:
10740 case V16HI_FTYPE_V16SF_INT
:
10741 case V8HI_FTYPE_V4SF_INT
:
10742 case V8SF_FTYPE_V8SF_INT
:
10743 case V4SF_FTYPE_V16SF_INT
:
10744 case V16SF_FTYPE_V16SF_INT
:
10745 case V4SI_FTYPE_V4SI_INT
:
10746 case V4SI_FTYPE_V8SI_INT
:
10747 case V4HI_FTYPE_V4HI_INT
:
10748 case V4DF_FTYPE_V4DF_INT
:
10749 case V4DF_FTYPE_V8DF_INT
:
10750 case V4SF_FTYPE_V4SF_INT
:
10751 case V4SF_FTYPE_V8SF_INT
:
10752 case V2DI_FTYPE_V2DI_INT
:
10753 case V2DF_FTYPE_V2DF_INT
:
10754 case V2DF_FTYPE_V4DF_INT
:
10755 case V16HI_FTYPE_V16HI_INT
:
10756 case V8SI_FTYPE_V8SI_INT
:
10757 case V16SI_FTYPE_V16SI_INT
:
10758 case V4SI_FTYPE_V16SI_INT
:
10759 case V4DI_FTYPE_V4DI_INT
:
10760 case V2DI_FTYPE_V4DI_INT
:
10761 case V4DI_FTYPE_V8DI_INT
:
10762 case UQI_FTYPE_UQI_UQI_CONST
:
10763 case UHI_FTYPE_UHI_UQI
:
10764 case USI_FTYPE_USI_UQI
:
10765 case UDI_FTYPE_UDI_UQI
:
10767 nargs_constant
= 1;
10769 case V16QI_FTYPE_V16QI_V16QI_V16QI
:
10770 case V8SF_FTYPE_V8SF_V8SF_V8SF
:
10771 case V4DF_FTYPE_V4DF_V4DF_V4DF
:
10772 case V4SF_FTYPE_V4SF_V4SF_V4SF
:
10773 case V2DF_FTYPE_V2DF_V2DF_V2DF
:
10774 case V32QI_FTYPE_V32QI_V32QI_V32QI
:
10775 case UHI_FTYPE_V16SI_V16SI_UHI
:
10776 case UQI_FTYPE_V8DI_V8DI_UQI
:
10777 case V16HI_FTYPE_V16SI_V16HI_UHI
:
10778 case V16QI_FTYPE_V16SI_V16QI_UHI
:
10779 case V16QI_FTYPE_V8DI_V16QI_UQI
:
10780 case V32HF_FTYPE_V32HF_V32HF_USI
:
10781 case V16SF_FTYPE_V16SF_V16SF_UHI
:
10782 case V16SF_FTYPE_V4SF_V16SF_UHI
:
10783 case V16SI_FTYPE_SI_V16SI_UHI
:
10784 case V16SI_FTYPE_V16HI_V16SI_UHI
:
10785 case V16SI_FTYPE_V16QI_V16SI_UHI
:
10786 case V8SF_FTYPE_V4SF_V8SF_UQI
:
10787 case V4DF_FTYPE_V2DF_V4DF_UQI
:
10788 case V8SI_FTYPE_V4SI_V8SI_UQI
:
10789 case V8SI_FTYPE_SI_V8SI_UQI
:
10790 case V4SI_FTYPE_V4SI_V4SI_UQI
:
10791 case V4SI_FTYPE_SI_V4SI_UQI
:
10792 case V4DI_FTYPE_V2DI_V4DI_UQI
:
10793 case V4DI_FTYPE_DI_V4DI_UQI
:
10794 case V2DI_FTYPE_V2DI_V2DI_UQI
:
10795 case V2DI_FTYPE_DI_V2DI_UQI
:
10796 case V64QI_FTYPE_V64QI_V64QI_UDI
:
10797 case V64QI_FTYPE_V16QI_V64QI_UDI
:
10798 case V64QI_FTYPE_QI_V64QI_UDI
:
10799 case V32QI_FTYPE_V32QI_V32QI_USI
:
10800 case V32QI_FTYPE_V16QI_V32QI_USI
:
10801 case V32QI_FTYPE_QI_V32QI_USI
:
10802 case V16QI_FTYPE_V16QI_V16QI_UHI
:
10803 case V16QI_FTYPE_QI_V16QI_UHI
:
10804 case V32HI_FTYPE_V8HI_V32HI_USI
:
10805 case V32HI_FTYPE_HI_V32HI_USI
:
10806 case V16HI_FTYPE_V8HI_V16HI_UHI
:
10807 case V16HI_FTYPE_HI_V16HI_UHI
:
10808 case V8HI_FTYPE_V8HI_V8HI_UQI
:
10809 case V8HI_FTYPE_HI_V8HI_UQI
:
10810 case V16HF_FTYPE_V16HF_V16HF_UHI
:
10811 case V8SF_FTYPE_V8HI_V8SF_UQI
:
10812 case V4SF_FTYPE_V8HI_V4SF_UQI
:
10813 case V8SI_FTYPE_V8HF_V8SI_UQI
:
10814 case V8SF_FTYPE_V8HF_V8SF_UQI
:
10815 case V8SI_FTYPE_V8SF_V8SI_UQI
:
10816 case V4SI_FTYPE_V4SF_V4SI_UQI
:
10817 case V4SI_FTYPE_V8HF_V4SI_UQI
:
10818 case V4SF_FTYPE_V8HF_V4SF_UQI
:
10819 case V4DI_FTYPE_V8HF_V4DI_UQI
:
10820 case V4DI_FTYPE_V4SF_V4DI_UQI
:
10821 case V2DI_FTYPE_V8HF_V2DI_UQI
:
10822 case V2DI_FTYPE_V4SF_V2DI_UQI
:
10823 case V8HF_FTYPE_V8HF_V8HF_UQI
:
10824 case V8HF_FTYPE_V8HF_V8HF_V8HF
:
10825 case V8HF_FTYPE_V8HI_V8HF_UQI
:
10826 case V8HF_FTYPE_V8SI_V8HF_UQI
:
10827 case V8HF_FTYPE_V8SF_V8HF_UQI
:
10828 case V8HF_FTYPE_V4SI_V8HF_UQI
:
10829 case V8HF_FTYPE_V4SF_V8HF_UQI
:
10830 case V8HF_FTYPE_V4DI_V8HF_UQI
:
10831 case V8HF_FTYPE_V4DF_V8HF_UQI
:
10832 case V8HF_FTYPE_V2DI_V8HF_UQI
:
10833 case V8HF_FTYPE_V2DF_V8HF_UQI
:
10834 case V4SF_FTYPE_V4DI_V4SF_UQI
:
10835 case V4SF_FTYPE_V2DI_V4SF_UQI
:
10836 case V4DF_FTYPE_V4DI_V4DF_UQI
:
10837 case V4DF_FTYPE_V8HF_V4DF_UQI
:
10838 case V2DF_FTYPE_V8HF_V2DF_UQI
:
10839 case V2DF_FTYPE_V2DI_V2DF_UQI
:
10840 case V16QI_FTYPE_V8HI_V16QI_UQI
:
10841 case V16QI_FTYPE_V16HI_V16QI_UHI
:
10842 case V16QI_FTYPE_V4SI_V16QI_UQI
:
10843 case V16QI_FTYPE_V8SI_V16QI_UQI
:
10844 case V8HI_FTYPE_V8HF_V8HI_UQI
:
10845 case V8HI_FTYPE_V4SI_V8HI_UQI
:
10846 case V8HI_FTYPE_V8SI_V8HI_UQI
:
10847 case V16QI_FTYPE_V2DI_V16QI_UQI
:
10848 case V16QI_FTYPE_V4DI_V16QI_UQI
:
10849 case V8HI_FTYPE_V2DI_V8HI_UQI
:
10850 case V8HI_FTYPE_V4DI_V8HI_UQI
:
10851 case V4SI_FTYPE_V2DI_V4SI_UQI
:
10852 case V4SI_FTYPE_V4DI_V4SI_UQI
:
10853 case V32QI_FTYPE_V32HI_V32QI_USI
:
10854 case UHI_FTYPE_V16QI_V16QI_UHI
:
10855 case USI_FTYPE_V32QI_V32QI_USI
:
10856 case UDI_FTYPE_V64QI_V64QI_UDI
:
10857 case UQI_FTYPE_V8HI_V8HI_UQI
:
10858 case UHI_FTYPE_V16HI_V16HI_UHI
:
10859 case USI_FTYPE_V32HI_V32HI_USI
:
10860 case UQI_FTYPE_V4SI_V4SI_UQI
:
10861 case UQI_FTYPE_V8SI_V8SI_UQI
:
10862 case UQI_FTYPE_V2DI_V2DI_UQI
:
10863 case UQI_FTYPE_V4DI_V4DI_UQI
:
10864 case V4SF_FTYPE_V2DF_V4SF_UQI
:
10865 case V4SF_FTYPE_V4DF_V4SF_UQI
:
10866 case V16SI_FTYPE_V16SI_V16SI_UHI
:
10867 case V16SI_FTYPE_V4SI_V16SI_UHI
:
10868 case V2DI_FTYPE_V4SI_V2DI_UQI
:
10869 case V2DI_FTYPE_V8HI_V2DI_UQI
:
10870 case V2DI_FTYPE_V16QI_V2DI_UQI
:
10871 case V4DI_FTYPE_V4DI_V4DI_UQI
:
10872 case V4DI_FTYPE_V4SI_V4DI_UQI
:
10873 case V4DI_FTYPE_V8HI_V4DI_UQI
:
10874 case V4DI_FTYPE_V16QI_V4DI_UQI
:
10875 case V4DI_FTYPE_V4DF_V4DI_UQI
:
10876 case V2DI_FTYPE_V2DF_V2DI_UQI
:
10877 case V4SI_FTYPE_V4DF_V4SI_UQI
:
10878 case V4SI_FTYPE_V2DF_V4SI_UQI
:
10879 case V4SI_FTYPE_V8HI_V4SI_UQI
:
10880 case V4SI_FTYPE_V16QI_V4SI_UQI
:
10881 case V4DI_FTYPE_V4DI_V4DI_V4DI
:
10882 case V8DF_FTYPE_V2DF_V8DF_UQI
:
10883 case V8DF_FTYPE_V4DF_V8DF_UQI
:
10884 case V8DF_FTYPE_V8DF_V8DF_UQI
:
10885 case V8SF_FTYPE_V8SF_V8SF_UQI
:
10886 case V8SF_FTYPE_V8SI_V8SF_UQI
:
10887 case V4DF_FTYPE_V4DF_V4DF_UQI
:
10888 case V4SF_FTYPE_V4SF_V4SF_UQI
:
10889 case V2DF_FTYPE_V2DF_V2DF_UQI
:
10890 case V2DF_FTYPE_V4SF_V2DF_UQI
:
10891 case V2DF_FTYPE_V4SI_V2DF_UQI
:
10892 case V4SF_FTYPE_V4SI_V4SF_UQI
:
10893 case V4DF_FTYPE_V4SF_V4DF_UQI
:
10894 case V4DF_FTYPE_V4SI_V4DF_UQI
:
10895 case V8SI_FTYPE_V8SI_V8SI_UQI
:
10896 case V8SI_FTYPE_V8HI_V8SI_UQI
:
10897 case V8SI_FTYPE_V16QI_V8SI_UQI
:
10898 case V8DF_FTYPE_V8SI_V8DF_UQI
:
10899 case V8DI_FTYPE_DI_V8DI_UQI
:
10900 case V16SF_FTYPE_V8SF_V16SF_UHI
:
10901 case V16SI_FTYPE_V8SI_V16SI_UHI
:
10902 case V16HF_FTYPE_V16HI_V16HF_UHI
:
10903 case V16HF_FTYPE_V16HF_V16HF_V16HF
:
10904 case V16HI_FTYPE_V16HF_V16HI_UHI
:
10905 case V16HI_FTYPE_V16HI_V16HI_UHI
:
10906 case V8HI_FTYPE_V16QI_V8HI_UQI
:
10907 case V16HI_FTYPE_V16QI_V16HI_UHI
:
10908 case V32HI_FTYPE_V32HI_V32HI_USI
:
10909 case V32HI_FTYPE_V32QI_V32HI_USI
:
10910 case V8DI_FTYPE_V16QI_V8DI_UQI
:
10911 case V8DI_FTYPE_V2DI_V8DI_UQI
:
10912 case V8DI_FTYPE_V4DI_V8DI_UQI
:
10913 case V8DI_FTYPE_V8DI_V8DI_UQI
:
10914 case V8DI_FTYPE_V8HI_V8DI_UQI
:
10915 case V8DI_FTYPE_V8SI_V8DI_UQI
:
10916 case V8HI_FTYPE_V8DI_V8HI_UQI
:
10917 case V8SI_FTYPE_V8DI_V8SI_UQI
:
10918 case V4SI_FTYPE_V4SI_V4SI_V4SI
:
10919 case V16SI_FTYPE_V16SI_V16SI_V16SI
:
10920 case V8DI_FTYPE_V8DI_V8DI_V8DI
:
10921 case V32HI_FTYPE_V32HI_V32HI_V32HI
:
10922 case V2DI_FTYPE_V2DI_V2DI_V2DI
:
10923 case V16HI_FTYPE_V16HI_V16HI_V16HI
:
10924 case V8SI_FTYPE_V8SI_V8SI_V8SI
:
10925 case V8HI_FTYPE_V8HI_V8HI_V8HI
:
10926 case V32BF_FTYPE_V16SF_V16SF_USI
:
10927 case V16BF_FTYPE_V8SF_V8SF_UHI
:
10928 case V8BF_FTYPE_V4SF_V4SF_UQI
:
10929 case V16BF_FTYPE_V16SF_V16BF_UHI
:
10930 case V8BF_FTYPE_V8SF_V8BF_UQI
:
10931 case V8BF_FTYPE_V4SF_V8BF_UQI
:
10932 case V16SF_FTYPE_V16SF_V32BF_V32BF
:
10933 case V8SF_FTYPE_V8SF_V16BF_V16BF
:
10934 case V4SF_FTYPE_V4SF_V8BF_V8BF
:
10937 case V32QI_FTYPE_V32QI_V32QI_INT
:
10938 case V16HI_FTYPE_V16HI_V16HI_INT
:
10939 case V16QI_FTYPE_V16QI_V16QI_INT
:
10940 case V4DI_FTYPE_V4DI_V4DI_INT
:
10941 case V8HI_FTYPE_V8HI_V8HI_INT
:
10942 case V8SI_FTYPE_V8SI_V8SI_INT
:
10943 case V8SI_FTYPE_V8SI_V4SI_INT
:
10944 case V8SF_FTYPE_V8SF_V8SF_INT
:
10945 case V8SF_FTYPE_V8SF_V4SF_INT
:
10946 case V4SI_FTYPE_V4SI_V4SI_INT
:
10947 case V4DF_FTYPE_V4DF_V4DF_INT
:
10948 case V16SF_FTYPE_V16SF_V16SF_INT
:
10949 case V16SF_FTYPE_V16SF_V4SF_INT
:
10950 case V16SI_FTYPE_V16SI_V4SI_INT
:
10951 case V4DF_FTYPE_V4DF_V2DF_INT
:
10952 case V4SF_FTYPE_V4SF_V4SF_INT
:
10953 case V2DI_FTYPE_V2DI_V2DI_INT
:
10954 case V4DI_FTYPE_V4DI_V2DI_INT
:
10955 case V2DF_FTYPE_V2DF_V2DF_INT
:
10956 case UQI_FTYPE_V8DI_V8UDI_INT
:
10957 case UQI_FTYPE_V8DF_V8DF_INT
:
10958 case UQI_FTYPE_V2DF_V2DF_INT
:
10959 case UQI_FTYPE_V4SF_V4SF_INT
:
10960 case UHI_FTYPE_V16SI_V16SI_INT
:
10961 case UHI_FTYPE_V16SF_V16SF_INT
:
10962 case V64QI_FTYPE_V64QI_V64QI_INT
:
10963 case V32HI_FTYPE_V32HI_V32HI_INT
:
10964 case V16SI_FTYPE_V16SI_V16SI_INT
:
10965 case V8DI_FTYPE_V8DI_V8DI_INT
:
10967 nargs_constant
= 1;
10969 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT
:
10972 nargs_constant
= 1;
10974 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT
:
10977 nargs_constant
= 1;
10979 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT
:
10982 nargs_constant
= 1;
10984 case V2DI_FTYPE_V2DI_UINT_UINT
:
10986 nargs_constant
= 2;
10988 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT
:
10991 nargs_constant
= 1;
10993 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT
:
10997 nargs_constant
= 1;
10999 case QI_FTYPE_V8DF_INT_UQI
:
11000 case QI_FTYPE_V4DF_INT_UQI
:
11001 case QI_FTYPE_V2DF_INT_UQI
:
11002 case HI_FTYPE_V16SF_INT_UHI
:
11003 case QI_FTYPE_V8SF_INT_UQI
:
11004 case QI_FTYPE_V4SF_INT_UQI
:
11005 case QI_FTYPE_V8HF_INT_UQI
:
11006 case HI_FTYPE_V16HF_INT_UHI
:
11007 case SI_FTYPE_V32HF_INT_USI
:
11008 case V4SI_FTYPE_V4SI_V4SI_UHI
:
11009 case V8SI_FTYPE_V8SI_V8SI_UHI
:
11012 nargs_constant
= 1;
11014 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT
:
11018 nargs_constant
= 1;
11020 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT
:
11024 nargs_constant
= 1;
11026 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI
:
11027 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI
:
11028 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI
:
11029 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI
:
11030 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI
:
11031 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI
:
11032 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI
:
11033 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI
:
11034 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI
:
11035 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI
:
11036 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI
:
11037 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI
:
11038 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI
:
11039 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI
:
11040 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI
:
11041 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI
:
11042 case V32HF_FTYPE_V32HF_V32HF_V32HF_USI
:
11043 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI
:
11044 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI
:
11045 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI
:
11046 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI
:
11047 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI
:
11048 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI
:
11049 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI
:
11050 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI
:
11051 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI
:
11052 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI
:
11053 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI
:
11054 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI
:
11055 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI
:
11056 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI
:
11057 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI
:
11058 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI
:
11059 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI
:
11060 case V16HF_FTYPE_V16HF_V16HF_V16HF_UQI
:
11061 case V16HF_FTYPE_V16HF_V16HF_V16HF_UHI
:
11062 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI
:
11063 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI
:
11064 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI
:
11065 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI
:
11066 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI
:
11067 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI
:
11068 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI
:
11069 case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI
:
11070 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI
:
11071 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI
:
11072 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI
:
11073 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI
:
11074 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI
:
11075 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI
:
11076 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI
:
11077 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI
:
11078 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI
:
11079 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI
:
11080 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI
:
11081 case V32BF_FTYPE_V16SF_V16SF_V32BF_USI
:
11082 case V16BF_FTYPE_V8SF_V8SF_V16BF_UHI
:
11083 case V8BF_FTYPE_V4SF_V4SF_V8BF_UQI
:
11086 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT
:
11087 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT
:
11088 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT
:
11089 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT
:
11090 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT
:
11092 nargs_constant
= 1;
11094 case UQI_FTYPE_V4DI_V4DI_INT_UQI
:
11095 case UQI_FTYPE_V8SI_V8SI_INT_UQI
:
11096 case QI_FTYPE_V4DF_V4DF_INT_UQI
:
11097 case QI_FTYPE_V8SF_V8SF_INT_UQI
:
11098 case UHI_FTYPE_V16HF_V16HF_INT_UHI
:
11099 case UQI_FTYPE_V2DI_V2DI_INT_UQI
:
11100 case UQI_FTYPE_V4SI_V4SI_INT_UQI
:
11101 case UQI_FTYPE_V2DF_V2DF_INT_UQI
:
11102 case UQI_FTYPE_V4SF_V4SF_INT_UQI
:
11103 case UQI_FTYPE_V8HF_V8HF_INT_UQI
:
11104 case UDI_FTYPE_V64QI_V64QI_INT_UDI
:
11105 case USI_FTYPE_V32QI_V32QI_INT_USI
:
11106 case UHI_FTYPE_V16QI_V16QI_INT_UHI
:
11107 case USI_FTYPE_V32HI_V32HI_INT_USI
:
11108 case USI_FTYPE_V32HF_V32HF_INT_USI
:
11109 case UHI_FTYPE_V16HI_V16HI_INT_UHI
:
11110 case UQI_FTYPE_V8HI_V8HI_INT_UQI
:
11113 nargs_constant
= 1;
11115 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT
:
11117 nargs_constant
= 2;
11119 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED
:
11120 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG
:
11121 case V16SF_FTYPE_V16SF_V32BF_V32BF_UHI
:
11122 case V8SF_FTYPE_V8SF_V16BF_V16BF_UQI
:
11123 case V4SF_FTYPE_V4SF_V8BF_V8BF_UQI
:
11126 case UQI_FTYPE_V8DI_V8DI_INT_UQI
:
11127 case UHI_FTYPE_V16SI_V16SI_INT_UHI
:
11130 nargs_constant
= 1;
11132 case V8SF_FTYPE_V8SF_INT_V8SF_UQI
:
11133 case V4SF_FTYPE_V4SF_INT_V4SF_UQI
:
11134 case V2DF_FTYPE_V4DF_INT_V2DF_UQI
:
11135 case V2DI_FTYPE_V4DI_INT_V2DI_UQI
:
11136 case V8SF_FTYPE_V16SF_INT_V8SF_UQI
:
11137 case V8SI_FTYPE_V16SI_INT_V8SI_UQI
:
11138 case V2DF_FTYPE_V8DF_INT_V2DF_UQI
:
11139 case V2DI_FTYPE_V8DI_INT_V2DI_UQI
:
11140 case V4SF_FTYPE_V8SF_INT_V4SF_UQI
:
11141 case V4SI_FTYPE_V8SI_INT_V4SI_UQI
:
11142 case V8HI_FTYPE_V8SF_INT_V8HI_UQI
:
11143 case V8HI_FTYPE_V4SF_INT_V8HI_UQI
:
11144 case V32HI_FTYPE_V32HI_INT_V32HI_USI
:
11145 case V16HI_FTYPE_V16HI_INT_V16HI_UHI
:
11146 case V8HI_FTYPE_V8HI_INT_V8HI_UQI
:
11147 case V4DI_FTYPE_V4DI_INT_V4DI_UQI
:
11148 case V2DI_FTYPE_V2DI_INT_V2DI_UQI
:
11149 case V8SI_FTYPE_V8SI_INT_V8SI_UQI
:
11150 case V4SI_FTYPE_V4SI_INT_V4SI_UQI
:
11151 case V4DF_FTYPE_V4DF_INT_V4DF_UQI
:
11152 case V2DF_FTYPE_V2DF_INT_V2DF_UQI
:
11153 case V8DF_FTYPE_V8DF_INT_V8DF_UQI
:
11154 case V16SF_FTYPE_V16SF_INT_V16SF_UHI
:
11155 case V16HI_FTYPE_V16SF_INT_V16HI_UHI
:
11156 case V16SI_FTYPE_V16SI_INT_V16SI_UHI
:
11157 case V16HF_FTYPE_V16HF_INT_V16HF_UHI
:
11158 case V8HF_FTYPE_V8HF_INT_V8HF_UQI
:
11159 case V4SI_FTYPE_V16SI_INT_V4SI_UQI
:
11160 case V4DI_FTYPE_V8DI_INT_V4DI_UQI
:
11161 case V4DF_FTYPE_V8DF_INT_V4DF_UQI
:
11162 case V4SF_FTYPE_V16SF_INT_V4SF_UQI
:
11163 case V8DI_FTYPE_V8DI_INT_V8DI_UQI
:
11166 nargs_constant
= 1;
11168 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI
:
11169 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI
:
11170 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI
:
11171 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI
:
11172 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI
:
11173 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI
:
11174 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI
:
11175 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI
:
11176 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI
:
11177 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI
:
11178 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI
:
11179 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI
:
11180 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI
:
11181 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI
:
11182 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI
:
11183 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI
:
11184 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI
:
11185 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI
:
11186 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI
:
11187 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI
:
11188 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI
:
11189 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI
:
11190 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI
:
11191 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI
:
11192 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI
:
11193 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI
:
11194 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI
:
11197 nargs_constant
= 1;
11199 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI
:
11200 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI
:
11201 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI
:
11202 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI
:
11203 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI
:
11204 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI
:
11205 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI
:
11206 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI
:
11207 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI
:
11208 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI
:
11211 nargs_constant
= 1;
11213 case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI
:
11214 case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI
:
11215 case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI
:
11216 case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT
:
11217 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT
:
11218 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT
:
11219 case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT
:
11220 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT
:
11221 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT
:
11222 case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT
:
11223 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT
:
11224 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT
:
11227 nargs_constant
= 2;
11231 gcc_unreachable ();
11234 gcc_assert (nargs
<= ARRAY_SIZE (xops
));
11236 if (comparison
!= UNKNOWN
)
11238 gcc_assert (nargs
== 2);
11239 return ix86_expand_sse_compare (d
, exp
, target
, swap
);
11242 if (rmode
== VOIDmode
|| rmode
== tmode
)
11246 || GET_MODE (target
) != tmode
11247 || !insn_p
->operand
[0].predicate (target
, tmode
))
11248 target
= gen_reg_rtx (tmode
);
11249 else if (memory_operand (target
, tmode
))
11251 real_target
= target
;
11255 real_target
= gen_reg_rtx (tmode
);
11256 target
= lowpart_subreg (rmode
, real_target
, tmode
);
11259 for (i
= 0; i
< nargs
; i
++)
11261 tree arg
= CALL_EXPR_ARG (exp
, i
);
11262 rtx op
= expand_normal (arg
);
11263 machine_mode mode
= insn_p
->operand
[i
+ 1].mode
;
11264 bool match
= insn_p
->operand
[i
+ 1].predicate (op
, mode
);
11266 if (second_arg_count
&& i
== 1)
/* SIMD shift insns take either an 8-bit immediate or
   register as count.  But builtin functions take int as
   count.  If count doesn't match, we put it in register.
   The instructions are using 64-bit count, if op is just
   32-bit, zero-extend it, as negative shift counts
   are undefined behavior and zero-extension is more
   efficient.  */
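/* In the code just below, a scalar-integer count is converted to the
   insn's count mode with convert_modes (zero-extending when widening);
   other counts are rewritten with lowpart_subreg, and a final
   copy_to_reg is used when the operand predicate is still not
   satisfied.  */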
11277 if (SCALAR_INT_MODE_P (GET_MODE (op
)))
11278 op
= convert_modes (mode
, GET_MODE (op
), op
, 1);
11280 op
= lowpart_subreg (mode
, op
, GET_MODE (op
));
11281 if (!insn_p
->operand
[i
+ 1].predicate (op
, mode
))
11282 op
= copy_to_reg (op
);
11285 else if ((mask_pos
&& (nargs
- i
- mask_pos
) == nargs_constant
) ||
11286 (!mask_pos
&& (nargs
- i
) <= nargs_constant
))
11291 case CODE_FOR_avx_vinsertf128v4di
:
11292 case CODE_FOR_avx_vextractf128v4di
:
error ("the last argument must be a 1-bit immediate");
11296 case CODE_FOR_avx512f_cmpv8di3_mask
:
11297 case CODE_FOR_avx512f_cmpv16si3_mask
:
11298 case CODE_FOR_avx512f_ucmpv8di3_mask
:
11299 case CODE_FOR_avx512f_ucmpv16si3_mask
:
11300 case CODE_FOR_avx512vl_cmpv4di3_mask
:
11301 case CODE_FOR_avx512vl_cmpv8si3_mask
:
11302 case CODE_FOR_avx512vl_ucmpv4di3_mask
:
11303 case CODE_FOR_avx512vl_ucmpv8si3_mask
:
11304 case CODE_FOR_avx512vl_cmpv2di3_mask
:
11305 case CODE_FOR_avx512vl_cmpv4si3_mask
:
11306 case CODE_FOR_avx512vl_ucmpv2di3_mask
:
11307 case CODE_FOR_avx512vl_ucmpv4si3_mask
:
11308 error ("the last argument must be a 3-bit immediate");
11311 case CODE_FOR_sse4_1_roundsd
:
11312 case CODE_FOR_sse4_1_roundss
:
11314 case CODE_FOR_sse4_1_roundpd
:
11315 case CODE_FOR_sse4_1_roundps
:
11316 case CODE_FOR_avx_roundpd256
:
11317 case CODE_FOR_avx_roundps256
:
11319 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix
:
11320 case CODE_FOR_sse4_1_roundps_sfix
:
11321 case CODE_FOR_avx_roundpd_vec_pack_sfix256
:
11322 case CODE_FOR_avx_roundps_sfix256
:
11324 case CODE_FOR_sse4_1_blendps
:
11325 case CODE_FOR_avx_blendpd256
:
11326 case CODE_FOR_avx_vpermilv4df
:
11327 case CODE_FOR_avx_vpermilv4df_mask
:
11328 case CODE_FOR_avx512f_getmantv8df_mask
:
11329 case CODE_FOR_avx512f_getmantv16sf_mask
:
11330 case CODE_FOR_avx512vl_getmantv16hf_mask
:
11331 case CODE_FOR_avx512vl_getmantv8sf_mask
:
11332 case CODE_FOR_avx512vl_getmantv4df_mask
:
11333 case CODE_FOR_avx512fp16_getmantv8hf_mask
:
11334 case CODE_FOR_avx512vl_getmantv4sf_mask
:
11335 case CODE_FOR_avx512vl_getmantv2df_mask
:
11336 case CODE_FOR_avx512dq_rangepv8df_mask_round
:
11337 case CODE_FOR_avx512dq_rangepv16sf_mask_round
:
11338 case CODE_FOR_avx512dq_rangepv4df_mask
:
11339 case CODE_FOR_avx512dq_rangepv8sf_mask
:
11340 case CODE_FOR_avx512dq_rangepv2df_mask
:
11341 case CODE_FOR_avx512dq_rangepv4sf_mask
:
11342 case CODE_FOR_avx_shufpd256_mask
:
11343 error ("the last argument must be a 4-bit immediate");
11346 case CODE_FOR_sha1rnds4
:
11347 case CODE_FOR_sse4_1_blendpd
:
11348 case CODE_FOR_avx_vpermilv2df
:
11349 case CODE_FOR_avx_vpermilv2df_mask
:
11350 case CODE_FOR_xop_vpermil2v2df3
:
11351 case CODE_FOR_xop_vpermil2v4sf3
:
11352 case CODE_FOR_xop_vpermil2v4df3
:
11353 case CODE_FOR_xop_vpermil2v8sf3
:
11354 case CODE_FOR_avx512f_vinsertf32x4_mask
:
11355 case CODE_FOR_avx512f_vinserti32x4_mask
:
11356 case CODE_FOR_avx512f_vextractf32x4_mask
:
11357 case CODE_FOR_avx512f_vextracti32x4_mask
:
11358 case CODE_FOR_sse2_shufpd
:
11359 case CODE_FOR_sse2_shufpd_mask
:
11360 case CODE_FOR_avx512dq_shuf_f64x2_mask
:
11361 case CODE_FOR_avx512dq_shuf_i64x2_mask
:
11362 case CODE_FOR_avx512vl_shuf_i32x4_mask
:
11363 case CODE_FOR_avx512vl_shuf_f32x4_mask
:
11364 error ("the last argument must be a 2-bit immediate");
11367 case CODE_FOR_avx_vextractf128v4df
:
11368 case CODE_FOR_avx_vextractf128v8sf
:
11369 case CODE_FOR_avx_vextractf128v8si
:
11370 case CODE_FOR_avx_vinsertf128v4df
:
11371 case CODE_FOR_avx_vinsertf128v8sf
:
11372 case CODE_FOR_avx_vinsertf128v8si
:
11373 case CODE_FOR_avx512f_vinsertf64x4_mask
:
11374 case CODE_FOR_avx512f_vinserti64x4_mask
:
11375 case CODE_FOR_avx512f_vextractf64x4_mask
:
11376 case CODE_FOR_avx512f_vextracti64x4_mask
:
11377 case CODE_FOR_avx512dq_vinsertf32x8_mask
:
11378 case CODE_FOR_avx512dq_vinserti32x8_mask
:
11379 case CODE_FOR_avx512vl_vinsertv4df
:
11380 case CODE_FOR_avx512vl_vinsertv4di
:
11381 case CODE_FOR_avx512vl_vinsertv8sf
:
11382 case CODE_FOR_avx512vl_vinsertv8si
:
11383 error ("the last argument must be a 1-bit immediate");
11386 case CODE_FOR_avx_vmcmpv2df3
:
11387 case CODE_FOR_avx_vmcmpv4sf3
:
11388 case CODE_FOR_avx_cmpv2df3
:
11389 case CODE_FOR_avx_cmpv4sf3
:
11390 case CODE_FOR_avx_cmpv4df3
:
11391 case CODE_FOR_avx_cmpv8sf3
:
11392 case CODE_FOR_avx512f_cmpv8df3_mask
:
11393 case CODE_FOR_avx512f_cmpv16sf3_mask
:
11394 case CODE_FOR_avx512f_vmcmpv2df3_mask
:
11395 case CODE_FOR_avx512f_vmcmpv4sf3_mask
:
11396 case CODE_FOR_avx512bw_cmpv32hf3_mask
:
11397 case CODE_FOR_avx512vl_cmpv16hf3_mask
:
11398 case CODE_FOR_avx512fp16_cmpv8hf3_mask
:
11399 error ("the last argument must be a 5-bit immediate");
11403 switch (nargs_constant
)
11406 if ((mask_pos
&& (nargs
- i
- mask_pos
) == nargs_constant
) ||
11407 (!mask_pos
&& (nargs
- i
) == nargs_constant
))
11409 error ("the next to last argument must be an 8-bit immediate");
11414 error ("the last argument must be an 8-bit immediate");
11417 gcc_unreachable ();
11424 if (VECTOR_MODE_P (mode
))
11425 op
= safe_vector_operand (op
, mode
);
/* If we aren't optimizing, only allow one memory operand to
   be generated.  */
11429 if (memory_operand (op
, mode
))
11432 op
= fixup_modeless_constant (op
, mode
);
11434 if (GET_MODE (op
) == mode
|| GET_MODE (op
) == VOIDmode
)
11436 if (optimize
|| !match
|| num_memory
> 1)
11437 op
= copy_to_mode_reg (mode
, op
);
11441 op
= copy_to_reg (op
);
11442 op
= lowpart_subreg (mode
, op
, GET_MODE (op
));
11452 pat
= GEN_FCN (icode
) (real_target
, xops
[0]);
11455 pat
= GEN_FCN (icode
) (real_target
, xops
[0], xops
[1]);
11458 pat
= GEN_FCN (icode
) (real_target
, xops
[0], xops
[1], xops
[2]);
11461 pat
= GEN_FCN (icode
) (real_target
, xops
[0], xops
[1],
11465 pat
= GEN_FCN (icode
) (real_target
, xops
[0], xops
[1],
11466 xops
[2], xops
[3], xops
[4]);
11469 pat
= GEN_FCN (icode
) (real_target
, xops
[0], xops
[1],
11470 xops
[2], xops
[3], xops
[4], xops
[5]);
11473 gcc_unreachable ();
/* Transform a pattern of the following layout:
     (set A
       (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
   into:
     (set A B)  */
11491 ix86_erase_embedded_rounding (rtx pat
)
11493 if (GET_CODE (pat
) == INSN
)
11494 pat
= PATTERN (pat
);
11496 gcc_assert (GET_CODE (pat
) == SET
);
11497 rtx src
= SET_SRC (pat
);
11498 gcc_assert (XVECLEN (src
, 0) == 2);
11499 rtx p0
= XVECEXP (src
, 0, 0);
11500 gcc_assert (GET_CODE (src
) == UNSPEC
11501 && XINT (src
, 1) == UNSPEC_EMBEDDED_ROUNDING
);
11502 rtx res
= gen_rtx_SET (SET_DEST (pat
), p0
);
/* Subroutine of ix86_expand_round_builtin to take care of comi insns
   with rounding.  */
11509 ix86_expand_sse_comi_round (const struct builtin_description
*d
,
11510 tree exp
, rtx target
)
11513 tree arg0
= CALL_EXPR_ARG (exp
, 0);
11514 tree arg1
= CALL_EXPR_ARG (exp
, 1);
11515 tree arg2
= CALL_EXPR_ARG (exp
, 2);
11516 tree arg3
= CALL_EXPR_ARG (exp
, 3);
11517 rtx op0
= expand_normal (arg0
);
11518 rtx op1
= expand_normal (arg1
);
11519 rtx op2
= expand_normal (arg2
);
11520 rtx op3
= expand_normal (arg3
);
11521 enum insn_code icode
= d
->icode
;
11522 const struct insn_data_d
*insn_p
= &insn_data
[icode
];
11523 machine_mode mode0
= insn_p
->operand
[0].mode
;
11524 machine_mode mode1
= insn_p
->operand
[1].mode
;
  /* See avxintrin.h for values.  */
  static const enum rtx_code comparisons[32] =
    {
      EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
      UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED,
      EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
      UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED
    };
  static const bool ordereds[32] =
    {
      true,  true,  true,  false, false, false, false, true,
      false, false, false, true,  true,  true,  true,  false,
      true,  true,  true,  false, false, false, false, true,
      false, false, false, true,  true,  true,  true,  false
    };
  static const bool non_signalings[32] =
    {
      true,  false, false, true,  true,  false, false, true,
      true,  false, false, true,  true,  false, false, true,
      false, true,  true,  false, false, true,  true,  false,
      false, true,  true,  false, false, true,  true,  false
    };
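  /* For example, predicate 0 (_CMP_EQ_OQ) maps to EQ, ordered and
     non-signaling, while predicate 4 (_CMP_NEQ_UQ) maps to NE,
     unordered and non-signaling.  Entries 16-31 repeat the comparison
     codes of 0-15 and differ only in the signaling behaviour recorded
     in non_signalings.  */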
11549 if (!CONST_INT_P (op2
))
error ("the third argument must be a comparison constant");
11554 if (INTVAL (op2
) < 0 || INTVAL (op2
) >= 32)
11556 error ("incorrect comparison mode");
11560 if (!insn_p
->operand
[2].predicate (op3
, SImode
))
11562 error ("incorrect rounding operand");
11566 if (VECTOR_MODE_P (mode0
))
11567 op0
= safe_vector_operand (op0
, mode0
);
11568 if (VECTOR_MODE_P (mode1
))
11569 op1
= safe_vector_operand (op1
, mode1
);
11571 enum rtx_code comparison
= comparisons
[INTVAL (op2
)];
11572 bool ordered
= ordereds
[INTVAL (op2
)];
11573 bool non_signaling
= non_signalings
[INTVAL (op2
)];
11574 rtx const_val
= const0_rtx
;
11576 bool check_unordered
= false;
11577 machine_mode mode
= CCFPmode
;
11578 switch (comparison
)
11583 /* NB: Use CCSmode/NE for _CMP_TRUE_UQ/_CMP_TRUE_US. */
11584 if (!non_signaling
)
11590 /* NB: Use CCPmode/NE for _CMP_ORD_Q/_CMP_ORD_S. */
11600 /* NB: Use CCSmode/EQ for _CMP_FALSE_OQ/_CMP_FALSE_OS. */
11607 /* NB: Use CCPmode/NE for _CMP_UNORD_Q/_CMP_UNORD_S. */
11608 if (!non_signaling
)
11615 case LE
: /* -> GE */
11616 case LT
: /* -> GT */
11617 case UNGE
: /* -> UNLE */
11618 case UNGT
: /* -> UNLT */
11619 std::swap (op0
, op1
);
11620 comparison
= swap_condition (comparison
);
11628 /* These are supported by CCFPmode. NB: Use ordered/signaling
11629 COMI or unordered/non-signaling UCOMI. Both set ZF, PF, CF
11630 with NAN operands. */
11631 if (ordered
== non_signaling
)
11632 ordered
= !ordered
;
11635 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
11636 _CMP_EQ_OQ/_CMP_EQ_OS. */
11637 check_unordered
= true;
11641 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
11642 _CMP_NEQ_UQ/_CMP_NEQ_US. */
11643 gcc_assert (!ordered
);
11644 check_unordered
= true;
11646 const_val
= const1_rtx
;
11649 gcc_unreachable ();
11652 target
= gen_reg_rtx (SImode
);
11653 emit_move_insn (target
, const_val
);
11654 target
= gen_rtx_SUBREG (QImode
, target
, 0);
11656 if ((optimize
&& !register_operand (op0
, mode0
))
11657 || !insn_p
->operand
[0].predicate (op0
, mode0
))
11658 op0
= copy_to_mode_reg (mode0
, op0
);
11659 if ((optimize
&& !register_operand (op1
, mode1
))
11660 || !insn_p
->operand
[1].predicate (op1
, mode1
))
11661 op1
= copy_to_mode_reg (mode1
, op1
);
  /* Two flavors of comparison are used here:
     1. COMI: ordered and signaling.
     2. UCOMI: unordered and non-signaling.  */
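  /* The unordered/non-signaling flavors are handled by switching to
     the UCOMI patterns selected just below; ordered/signaling
     comparisons keep the COMI pattern named by the builtin's icode.  */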
11668 icode
= (icode
== CODE_FOR_sse_comi_round
11669 ? CODE_FOR_sse_ucomi_round
11670 : CODE_FOR_sse2_ucomi_round
);
11672 pat
= GEN_FCN (icode
) (op0
, op1
, op3
);
11676 /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
11677 if (INTVAL (op3
) == NO_ROUND
)
11679 pat
= ix86_erase_embedded_rounding (pat
);
11683 set_dst
= SET_DEST (pat
);
11687 gcc_assert (GET_CODE (pat
) == SET
);
11688 set_dst
= SET_DEST (pat
);
11693 return ix86_ssecom_setcc (comparison
, check_unordered
, mode
,
11698 ix86_expand_round_builtin (const struct builtin_description
*d
,
11699 tree exp
, rtx target
)
11702 unsigned int i
, nargs
;
11704 enum insn_code icode
= d
->icode
;
11705 const struct insn_data_d
*insn_p
= &insn_data
[icode
];
11706 machine_mode tmode
= insn_p
->operand
[0].mode
;
11707 unsigned int nargs_constant
= 0;
11708 unsigned int redundant_embed_rnd
= 0;
11710 switch ((enum ix86_builtin_func_type
) d
->flag
)
11712 case UINT64_FTYPE_V2DF_INT
:
11713 case UINT64_FTYPE_V4SF_INT
:
11714 case UINT64_FTYPE_V8HF_INT
:
11715 case UINT_FTYPE_V2DF_INT
:
11716 case UINT_FTYPE_V4SF_INT
:
11717 case UINT_FTYPE_V8HF_INT
:
11718 case INT64_FTYPE_V2DF_INT
:
11719 case INT64_FTYPE_V4SF_INT
:
11720 case INT64_FTYPE_V8HF_INT
:
11721 case INT_FTYPE_V2DF_INT
:
11722 case INT_FTYPE_V4SF_INT
:
11723 case INT_FTYPE_V8HF_INT
:
11726 case V32HF_FTYPE_V32HF_V32HF_INT
:
11727 case V8HF_FTYPE_V8HF_V8HF_INT
:
11728 case V8HF_FTYPE_V8HF_INT_INT
:
11729 case V8HF_FTYPE_V8HF_UINT_INT
:
11730 case V8HF_FTYPE_V8HF_INT64_INT
:
11731 case V8HF_FTYPE_V8HF_UINT64_INT
:
11732 case V4SF_FTYPE_V4SF_UINT_INT
:
11733 case V4SF_FTYPE_V4SF_UINT64_INT
:
11734 case V2DF_FTYPE_V2DF_UINT64_INT
:
11735 case V4SF_FTYPE_V4SF_INT_INT
:
11736 case V4SF_FTYPE_V4SF_INT64_INT
:
11737 case V2DF_FTYPE_V2DF_INT64_INT
:
11738 case V4SF_FTYPE_V4SF_V4SF_INT
:
11739 case V2DF_FTYPE_V2DF_V2DF_INT
:
11740 case V4SF_FTYPE_V4SF_V2DF_INT
:
11741 case V2DF_FTYPE_V2DF_V4SF_INT
:
11744 case V8SF_FTYPE_V8DF_V8SF_QI_INT
:
11745 case V8DF_FTYPE_V8DF_V8DF_QI_INT
:
11746 case V32HI_FTYPE_V32HF_V32HI_USI_INT
:
11747 case V8SI_FTYPE_V8DF_V8SI_QI_INT
:
11748 case V8DI_FTYPE_V8HF_V8DI_UQI_INT
:
11749 case V8DI_FTYPE_V8DF_V8DI_QI_INT
:
11750 case V8SF_FTYPE_V8DI_V8SF_QI_INT
:
11751 case V8DF_FTYPE_V8DI_V8DF_QI_INT
:
11752 case V8DF_FTYPE_V8HF_V8DF_UQI_INT
:
11753 case V16SF_FTYPE_V16HF_V16SF_UHI_INT
:
11754 case V32HF_FTYPE_V32HI_V32HF_USI_INT
:
11755 case V32HF_FTYPE_V32HF_V32HF_USI_INT
:
11756 case V32HF_FTYPE_V32HF_V32HF_V32HF_INT
:
11757 case V16SF_FTYPE_V16SF_V16SF_HI_INT
:
11758 case V8DI_FTYPE_V8SF_V8DI_QI_INT
:
11759 case V16SF_FTYPE_V16SI_V16SF_HI_INT
:
11760 case V16SI_FTYPE_V16SF_V16SI_HI_INT
:
11761 case V16SI_FTYPE_V16HF_V16SI_UHI_INT
:
11762 case V16HF_FTYPE_V16SI_V16HF_UHI_INT
:
11763 case V8DF_FTYPE_V8SF_V8DF_QI_INT
:
11764 case V16SF_FTYPE_V16HI_V16SF_HI_INT
:
11765 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT
:
11766 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT
:
11767 case V8HF_FTYPE_V8DI_V8HF_UQI_INT
:
11768 case V8HF_FTYPE_V8DF_V8HF_UQI_INT
:
11769 case V16HF_FTYPE_V16SF_V16HF_UHI_INT
:
11770 case V8HF_FTYPE_V8HF_V8HF_V8HF_INT
:
11773 case V4SF_FTYPE_V4SF_V4SF_INT_INT
:
11774 case V2DF_FTYPE_V2DF_V2DF_INT_INT
:
11775 nargs_constant
= 2;
11778 case INT_FTYPE_V4SF_V4SF_INT_INT
:
11779 case INT_FTYPE_V2DF_V2DF_INT_INT
:
11780 return ix86_expand_sse_comi_round (d
, exp
, target
);
11781 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT
:
11782 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT
:
11783 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT
:
11784 case V4SF_FTYPE_V8HF_V4SF_V4SF_UQI_INT
:
11785 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT
:
11786 case V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT
:
11787 case V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT
:
11788 case V2DF_FTYPE_V8HF_V2DF_V2DF_UQI_INT
:
11789 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT
:
11790 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT
:
11791 case V2DF_FTYPE_V2DF_V4SF_V2DF_UQI_INT
:
11792 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT
:
11793 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT
:
11794 case V4SF_FTYPE_V4SF_V2DF_V4SF_UQI_INT
:
11795 case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT
:
11796 case V8HF_FTYPE_V2DF_V8HF_V8HF_UQI_INT
:
11797 case V8HF_FTYPE_V4SF_V8HF_V8HF_UQI_INT
:
11800 case V32HF_FTYPE_V32HF_INT_V32HF_USI_INT
:
11801 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT
:
11802 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT
:
11803 case V8DF_FTYPE_V8DF_INT_V8DF_UQI_INT
:
11804 case V16SF_FTYPE_V16SF_INT_V16SF_UHI_INT
:
11805 nargs_constant
= 4;
11808 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT
:
11809 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT
:
11810 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT
:
11811 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT
:
11812 case USI_FTYPE_V32HF_V32HF_INT_USI_INT
:
11813 case UQI_FTYPE_V8HF_V8HF_INT_UQI_INT
:
11814 nargs_constant
= 3;
11817 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT
:
11818 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT
:
11819 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT
:
11820 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT
:
11821 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT
:
11822 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT
:
11823 case V8HF_FTYPE_V8HF_V8HF_INT_V8HF_UQI_INT
:
11825 nargs_constant
= 4;
11827 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT
:
11828 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT
:
11829 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT
:
11830 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT
:
11832 nargs_constant
= 3;
11835 gcc_unreachable ();
11837 gcc_assert (nargs
<= ARRAY_SIZE (xops
));
11841 || GET_MODE (target
) != tmode
11842 || !insn_p
->operand
[0].predicate (target
, tmode
))
11843 target
= gen_reg_rtx (tmode
);
11845 for (i
= 0; i
< nargs
; i
++)
11847 tree arg
= CALL_EXPR_ARG (exp
, i
);
11848 rtx op
= expand_normal (arg
);
11849 machine_mode mode
= insn_p
->operand
[i
+ 1].mode
;
11850 bool match
= insn_p
->operand
[i
+ 1].predicate (op
, mode
);
11852 if (i
== nargs
- nargs_constant
)
11858 case CODE_FOR_avx512f_getmantv8df_mask_round
:
11859 case CODE_FOR_avx512f_getmantv16sf_mask_round
:
11860 case CODE_FOR_avx512bw_getmantv32hf_mask_round
:
11861 case CODE_FOR_avx512f_vgetmantv2df_round
:
11862 case CODE_FOR_avx512f_vgetmantv2df_mask_round
:
11863 case CODE_FOR_avx512f_vgetmantv4sf_round
:
11864 case CODE_FOR_avx512f_vgetmantv4sf_mask_round
:
11865 case CODE_FOR_avx512f_vgetmantv8hf_mask_round
:
11866 error ("the immediate argument must be a 4-bit immediate");
11868 case CODE_FOR_avx512f_cmpv8df3_mask_round
:
11869 case CODE_FOR_avx512f_cmpv16sf3_mask_round
:
11870 case CODE_FOR_avx512f_vmcmpv2df3_mask_round
:
11871 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round
:
11872 case CODE_FOR_avx512f_vmcmpv8hf3_mask_round
:
11873 case CODE_FOR_avx512bw_cmpv32hf3_mask_round
:
11874 error ("the immediate argument must be a 5-bit immediate");
11877 error ("the immediate argument must be an 8-bit immediate");
11882 else if (i
== nargs
-1)
11884 if (!insn_p
->operand
[nargs
].predicate (op
, SImode
))
11886 error ("incorrect rounding operand");
/* If there is no rounding, use the normal version of the pattern.  */
11891 if (INTVAL (op
) == NO_ROUND
)
/* Skip erasing the embedded rounding for the expanders below, which
   generate multiple insns.  In ix86_erase_embedded_rounding the
   pattern would be transformed into a single set, and emit_insn
   appends that set instead of inserting it into the chain, so the
   insns emitted inside the define_expand would be ignored.  */
11900 case CODE_FOR_avx512bw_fmaddc_v32hf_mask1_round
:
11901 case CODE_FOR_avx512bw_fcmaddc_v32hf_mask1_round
:
11902 case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask1_round
:
11903 case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask1_round
:
11904 case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask3_round
:
11905 case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask3_round
:
11906 redundant_embed_rnd
= 0;
11909 redundant_embed_rnd
= 1;
11916 if (VECTOR_MODE_P (mode
))
11917 op
= safe_vector_operand (op
, mode
);
11919 op
= fixup_modeless_constant (op
, mode
);
11921 if (GET_MODE (op
) == mode
|| GET_MODE (op
) == VOIDmode
)
11923 if (optimize
|| !match
)
11924 op
= copy_to_mode_reg (mode
, op
);
11928 op
= copy_to_reg (op
);
11929 op
= lowpart_subreg (mode
, op
, GET_MODE (op
));
11939 pat
= GEN_FCN (icode
) (target
, xops
[0]);
11942 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1]);
11945 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1], xops
[2]);
11948 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1],
11952 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1],
11953 xops
[2], xops
[3], xops
[4]);
11956 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1],
11957 xops
[2], xops
[3], xops
[4], xops
[5]);
11960 gcc_unreachable ();
11966 if (redundant_embed_rnd
)
11967 pat
= ix86_erase_embedded_rounding (pat
);
11973 /* Subroutine of ix86_expand_builtin to take care of special insns
11974 with variable number of operands. */
11977 ix86_expand_special_args_builtin (const struct builtin_description
*d
,
11978 tree exp
, rtx target
)
11982 unsigned int i
, nargs
, arg_adjust
, memory
;
11983 unsigned int constant
= 100;
11984 bool aligned_mem
= false;
11986 enum insn_code icode
= d
->icode
;
11987 const struct insn_data_d
*insn_p
= &insn_data
[icode
];
11988 machine_mode tmode
= insn_p
->operand
[0].mode
;
11989 enum { load
, store
} klass
;
11991 switch ((enum ix86_builtin_func_type
) d
->flag
)
11993 case VOID_FTYPE_VOID
:
11994 emit_insn (GEN_FCN (icode
) (target
));
11996 case VOID_FTYPE_UINT64
:
11997 case VOID_FTYPE_UNSIGNED
:
12003 case INT_FTYPE_VOID
:
12004 case USHORT_FTYPE_VOID
:
12005 case UINT64_FTYPE_VOID
:
12006 case UINT_FTYPE_VOID
:
12007 case UINT8_FTYPE_VOID
:
12008 case UNSIGNED_FTYPE_VOID
:
12013 case UINT64_FTYPE_PUNSIGNED
:
12014 case V2DI_FTYPE_PV2DI
:
12015 case V4DI_FTYPE_PV4DI
:
12016 case V32QI_FTYPE_PCCHAR
:
12017 case V16QI_FTYPE_PCCHAR
:
12018 case V8SF_FTYPE_PCV4SF
:
12019 case V8SF_FTYPE_PCFLOAT
:
12020 case V4SF_FTYPE_PCFLOAT
:
12021 case V4SF_FTYPE_PCFLOAT16
:
12022 case V4SF_FTYPE_PCBFLOAT16
:
12023 case V4SF_FTYPE_PCV8BF
:
12024 case V4SF_FTYPE_PCV8HF
:
12025 case V8SF_FTYPE_PCFLOAT16
:
12026 case V8SF_FTYPE_PCBFLOAT16
:
12027 case V8SF_FTYPE_PCV16HF
:
12028 case V8SF_FTYPE_PCV16BF
:
12029 case V4DF_FTYPE_PCV2DF
:
12030 case V4DF_FTYPE_PCDOUBLE
:
12031 case V2DF_FTYPE_PCDOUBLE
:
12032 case VOID_FTYPE_PVOID
:
12033 case V8DI_FTYPE_PV8DI
:
12039 case CODE_FOR_sse4_1_movntdqa
:
12040 case CODE_FOR_avx2_movntdqa
:
12041 case CODE_FOR_avx512f_movntdqa
:
12042 aligned_mem
= true;
12048 case VOID_FTYPE_PV2SF_V4SF
:
12049 case VOID_FTYPE_PV8DI_V8DI
:
12050 case VOID_FTYPE_PV4DI_V4DI
:
12051 case VOID_FTYPE_PV2DI_V2DI
:
12052 case VOID_FTYPE_PCHAR_V32QI
:
12053 case VOID_FTYPE_PCHAR_V16QI
:
12054 case VOID_FTYPE_PFLOAT_V16SF
:
12055 case VOID_FTYPE_PFLOAT_V8SF
:
12056 case VOID_FTYPE_PFLOAT_V4SF
:
12057 case VOID_FTYPE_PDOUBLE_V8DF
:
12058 case VOID_FTYPE_PDOUBLE_V4DF
:
12059 case VOID_FTYPE_PDOUBLE_V2DF
:
12060 case VOID_FTYPE_PLONGLONG_LONGLONG
:
12061 case VOID_FTYPE_PULONGLONG_ULONGLONG
:
12062 case VOID_FTYPE_PUNSIGNED_UNSIGNED
:
12063 case VOID_FTYPE_PINT_INT
:
12066 /* Reserve memory operand for target. */
12067 memory
= ARRAY_SIZE (xops
);
12070 /* These builtins and instructions require the memory
12071 to be properly aligned. */
12072 case CODE_FOR_avx_movntv4di
:
12073 case CODE_FOR_sse2_movntv2di
:
12074 case CODE_FOR_avx_movntv8sf
:
12075 case CODE_FOR_sse_movntv4sf
:
12076 case CODE_FOR_sse4a_vmmovntv4sf
:
12077 case CODE_FOR_avx_movntv4df
:
12078 case CODE_FOR_sse2_movntv2df
:
12079 case CODE_FOR_sse4a_vmmovntv2df
:
12080 case CODE_FOR_sse2_movntidi
:
12081 case CODE_FOR_sse_movntq
:
12082 case CODE_FOR_sse2_movntisi
:
12083 case CODE_FOR_avx512f_movntv16sf
:
12084 case CODE_FOR_avx512f_movntv8df
:
12085 case CODE_FOR_avx512f_movntv8di
:
12086 aligned_mem
= true;
12092 case VOID_FTYPE_PVOID_PCVOID
:
12098 case V4SF_FTYPE_V4SF_PCV2SF
:
12099 case V2DF_FTYPE_V2DF_PCDOUBLE
:
12104 case V8SF_FTYPE_PCV8SF_V8SI
:
12105 case V4DF_FTYPE_PCV4DF_V4DI
:
12106 case V4SF_FTYPE_PCV4SF_V4SI
:
12107 case V2DF_FTYPE_PCV2DF_V2DI
:
12108 case V8SI_FTYPE_PCV8SI_V8SI
:
12109 case V4DI_FTYPE_PCV4DI_V4DI
:
12110 case V4SI_FTYPE_PCV4SI_V4SI
:
12111 case V2DI_FTYPE_PCV2DI_V2DI
:
12112 case VOID_FTYPE_INT_INT64
:
12117 case VOID_FTYPE_PV8DF_V8DF_UQI
:
12118 case VOID_FTYPE_PV4DF_V4DF_UQI
:
12119 case VOID_FTYPE_PV2DF_V2DF_UQI
:
12120 case VOID_FTYPE_PV16SF_V16SF_UHI
:
12121 case VOID_FTYPE_PV8SF_V8SF_UQI
:
12122 case VOID_FTYPE_PV4SF_V4SF_UQI
:
12123 case VOID_FTYPE_PV8DI_V8DI_UQI
:
12124 case VOID_FTYPE_PV4DI_V4DI_UQI
:
12125 case VOID_FTYPE_PV2DI_V2DI_UQI
:
12126 case VOID_FTYPE_PV16SI_V16SI_UHI
:
12127 case VOID_FTYPE_PV8SI_V8SI_UQI
:
12128 case VOID_FTYPE_PV4SI_V4SI_UQI
:
12129 case VOID_FTYPE_PV64QI_V64QI_UDI
:
12130 case VOID_FTYPE_PV32HI_V32HI_USI
:
12131 case VOID_FTYPE_PV32QI_V32QI_USI
:
12132 case VOID_FTYPE_PV16QI_V16QI_UHI
:
12133 case VOID_FTYPE_PV16HI_V16HI_UHI
:
12134 case VOID_FTYPE_PV8HI_V8HI_UQI
:
12137 /* These builtins and instructions require the memory
12138 to be properly aligned. */
12139 case CODE_FOR_avx512f_storev16sf_mask
:
12140 case CODE_FOR_avx512f_storev16si_mask
:
12141 case CODE_FOR_avx512f_storev8df_mask
:
12142 case CODE_FOR_avx512f_storev8di_mask
:
12143 case CODE_FOR_avx512vl_storev8sf_mask
:
12144 case CODE_FOR_avx512vl_storev8si_mask
:
12145 case CODE_FOR_avx512vl_storev4df_mask
:
12146 case CODE_FOR_avx512vl_storev4di_mask
:
12147 case CODE_FOR_avx512vl_storev4sf_mask
:
12148 case CODE_FOR_avx512vl_storev4si_mask
:
12149 case CODE_FOR_avx512vl_storev2df_mask
:
12150 case CODE_FOR_avx512vl_storev2di_mask
:
12151 aligned_mem
= true;
12157 case VOID_FTYPE_PV8SF_V8SI_V8SF
:
12158 case VOID_FTYPE_PV4DF_V4DI_V4DF
:
12159 case VOID_FTYPE_PV4SF_V4SI_V4SF
:
12160 case VOID_FTYPE_PV2DF_V2DI_V2DF
:
12161 case VOID_FTYPE_PV8SI_V8SI_V8SI
:
12162 case VOID_FTYPE_PV4DI_V4DI_V4DI
:
12163 case VOID_FTYPE_PV4SI_V4SI_V4SI
:
12164 case VOID_FTYPE_PV2DI_V2DI_V2DI
:
12165 case VOID_FTYPE_PV8SI_V8DI_UQI
:
12166 case VOID_FTYPE_PV8HI_V8DI_UQI
:
12167 case VOID_FTYPE_PV16HI_V16SI_UHI
:
12168 case VOID_FTYPE_PUDI_V8DI_UQI
:
12169 case VOID_FTYPE_PV16QI_V16SI_UHI
:
12170 case VOID_FTYPE_PV4SI_V4DI_UQI
:
12171 case VOID_FTYPE_PUDI_V2DI_UQI
:
12172 case VOID_FTYPE_PUDI_V4DI_UQI
:
12173 case VOID_FTYPE_PUSI_V2DI_UQI
:
12174 case VOID_FTYPE_PV8HI_V8SI_UQI
:
12175 case VOID_FTYPE_PUDI_V4SI_UQI
:
12176 case VOID_FTYPE_PUSI_V4DI_UQI
:
12177 case VOID_FTYPE_PUHI_V2DI_UQI
:
12178 case VOID_FTYPE_PUDI_V8SI_UQI
:
12179 case VOID_FTYPE_PUSI_V4SI_UQI
:
12180 case VOID_FTYPE_PCHAR_V64QI_UDI
:
12181 case VOID_FTYPE_PCHAR_V32QI_USI
:
12182 case VOID_FTYPE_PCHAR_V16QI_UHI
:
12183 case VOID_FTYPE_PSHORT_V32HI_USI
:
12184 case VOID_FTYPE_PSHORT_V16HI_UHI
:
12185 case VOID_FTYPE_PSHORT_V8HI_UQI
:
12186 case VOID_FTYPE_PINT_V16SI_UHI
:
12187 case VOID_FTYPE_PINT_V8SI_UQI
:
12188 case VOID_FTYPE_PINT_V4SI_UQI
:
12189 case VOID_FTYPE_PINT64_V8DI_UQI
:
12190 case VOID_FTYPE_PINT64_V4DI_UQI
:
12191 case VOID_FTYPE_PINT64_V2DI_UQI
:
12192 case VOID_FTYPE_PDOUBLE_V8DF_UQI
:
12193 case VOID_FTYPE_PDOUBLE_V4DF_UQI
:
12194 case VOID_FTYPE_PDOUBLE_V2DF_UQI
:
12195 case VOID_FTYPE_PFLOAT_V16SF_UHI
:
12196 case VOID_FTYPE_PFLOAT_V8SF_UQI
:
12197 case VOID_FTYPE_PFLOAT_V4SF_UQI
:
12198 case VOID_FTYPE_PCFLOAT16_V8HF_UQI
:
12199 case VOID_FTYPE_PV32QI_V32HI_USI
:
12200 case VOID_FTYPE_PV16QI_V16HI_UHI
:
12201 case VOID_FTYPE_PUDI_V8HI_UQI
:
12204 /* Reserve memory operand for target. */
12205 memory
= ARRAY_SIZE (xops
);
12207 case V4SF_FTYPE_PCV4SF_V4SF_UQI
:
12208 case V8SF_FTYPE_PCV8SF_V8SF_UQI
:
12209 case V16SF_FTYPE_PCV16SF_V16SF_UHI
:
12210 case V4SI_FTYPE_PCV4SI_V4SI_UQI
:
12211 case V8SI_FTYPE_PCV8SI_V8SI_UQI
:
12212 case V16SI_FTYPE_PCV16SI_V16SI_UHI
:
12213 case V2DF_FTYPE_PCV2DF_V2DF_UQI
:
12214 case V4DF_FTYPE_PCV4DF_V4DF_UQI
:
12215 case V8DF_FTYPE_PCV8DF_V8DF_UQI
:
12216 case V2DI_FTYPE_PCV2DI_V2DI_UQI
:
12217 case V4DI_FTYPE_PCV4DI_V4DI_UQI
:
12218 case V8DI_FTYPE_PCV8DI_V8DI_UQI
:
12219 case V64QI_FTYPE_PCV64QI_V64QI_UDI
:
12220 case V32HI_FTYPE_PCV32HI_V32HI_USI
:
12221 case V32QI_FTYPE_PCV32QI_V32QI_USI
:
12222 case V16QI_FTYPE_PCV16QI_V16QI_UHI
:
12223 case V16HI_FTYPE_PCV16HI_V16HI_UHI
:
12224 case V8HI_FTYPE_PCV8HI_V8HI_UQI
:
12227 /* These builtins and instructions require the memory
12228 to be properly aligned. */
12229 case CODE_FOR_avx512f_loadv16sf_mask
:
12230 case CODE_FOR_avx512f_loadv16si_mask
:
12231 case CODE_FOR_avx512f_loadv8df_mask
:
12232 case CODE_FOR_avx512f_loadv8di_mask
:
12233 case CODE_FOR_avx512vl_loadv8sf_mask
:
12234 case CODE_FOR_avx512vl_loadv8si_mask
:
12235 case CODE_FOR_avx512vl_loadv4df_mask
:
12236 case CODE_FOR_avx512vl_loadv4di_mask
:
12237 case CODE_FOR_avx512vl_loadv4sf_mask
:
12238 case CODE_FOR_avx512vl_loadv4si_mask
:
12239 case CODE_FOR_avx512vl_loadv2df_mask
:
12240 case CODE_FOR_avx512vl_loadv2di_mask
:
12241 case CODE_FOR_avx512bw_loadv64qi_mask
:
12242 case CODE_FOR_avx512vl_loadv32qi_mask
:
12243 case CODE_FOR_avx512vl_loadv16qi_mask
:
12244 case CODE_FOR_avx512bw_loadv32hi_mask
:
12245 case CODE_FOR_avx512vl_loadv16hi_mask
:
12246 case CODE_FOR_avx512vl_loadv8hi_mask
:
12247 aligned_mem
= true;
12253 case V64QI_FTYPE_PCCHAR_V64QI_UDI
:
12254 case V32QI_FTYPE_PCCHAR_V32QI_USI
:
12255 case V16QI_FTYPE_PCCHAR_V16QI_UHI
:
12256 case V32HI_FTYPE_PCSHORT_V32HI_USI
:
12257 case V16HI_FTYPE_PCSHORT_V16HI_UHI
:
12258 case V8HI_FTYPE_PCSHORT_V8HI_UQI
:
12259 case V16SI_FTYPE_PCINT_V16SI_UHI
:
12260 case V8SI_FTYPE_PCINT_V8SI_UQI
:
12261 case V4SI_FTYPE_PCINT_V4SI_UQI
:
12262 case V8DI_FTYPE_PCINT64_V8DI_UQI
:
12263 case V4DI_FTYPE_PCINT64_V4DI_UQI
:
12264 case V2DI_FTYPE_PCINT64_V2DI_UQI
:
12265 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI
:
12266 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI
:
12267 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI
:
12268 case V16SF_FTYPE_PCFLOAT_V16SF_UHI
:
12269 case V8SF_FTYPE_PCFLOAT_V8SF_UQI
:
12270 case V4SF_FTYPE_PCFLOAT_V4SF_UQI
:
12271 case V8HF_FTYPE_PCFLOAT16_V8HF_UQI
:
12276 case INT_FTYPE_PINT_INT_INT_INT
:
12277 case LONGLONG_FTYPE_PLONGLONG_LONGLONG_LONGLONG_INT
:
12284 gcc_unreachable ();
12287 gcc_assert (nargs
<= ARRAY_SIZE (xops
));
12289 if (klass
== store
)
12291 arg
= CALL_EXPR_ARG (exp
, 0);
12292 op
= expand_normal (arg
);
12293 gcc_assert (target
== 0);
12296 op
= ix86_zero_extend_to_Pmode (op
);
12297 target
= gen_rtx_MEM (tmode
, op
);
/* target at this point has just BITS_PER_UNIT MEM_ALIGN
   on it.  Try to improve it using get_pointer_alignment,
   and if the special builtin is one that requires strict
   mode alignment, also from its GET_MODE_ALIGNMENT.
   Failure to do so could lead to ix86_legitimate_combined_insn
   rejecting all changes to such insns.  */
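/* For example, the non-temporal and masked full-vector stores flagged
   as aligned_mem above must end up with at least GET_MODE_ALIGNMENT on
   the MEM, which get_pointer_alignment alone may not establish for a
   plain pointer argument.  */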
12304 unsigned int align
= get_pointer_alignment (arg
);
12305 if (aligned_mem
&& align
< GET_MODE_ALIGNMENT (tmode
))
12306 align
= GET_MODE_ALIGNMENT (tmode
);
12307 if (MEM_ALIGN (target
) < align
)
12308 set_mem_align (target
, align
);
12311 target
= force_reg (tmode
, op
);
12319 || !register_operand (target
, tmode
)
12320 || GET_MODE (target
) != tmode
)
12321 target
= gen_reg_rtx (tmode
);
12324 for (i
= 0; i
< nargs
; i
++)
12326 machine_mode mode
= insn_p
->operand
[i
+ 1].mode
;
12328 arg
= CALL_EXPR_ARG (exp
, i
+ arg_adjust
);
12329 op
= expand_normal (arg
);
12333 /* This must be the memory operand. */
12334 op
= ix86_zero_extend_to_Pmode (op
);
12335 op
= gen_rtx_MEM (mode
, op
);
/* op at this point has just BITS_PER_UNIT MEM_ALIGN
   on it.  Try to improve it using get_pointer_alignment,
   and if the special builtin is one that requires strict
   mode alignment, also from its GET_MODE_ALIGNMENT.
   Failure to do so could lead to ix86_legitimate_combined_insn
   rejecting all changes to such insns.  */
12342 unsigned int align
= get_pointer_alignment (arg
);
12343 if (aligned_mem
&& align
< GET_MODE_ALIGNMENT (mode
))
12344 align
= GET_MODE_ALIGNMENT (mode
);
12345 if (MEM_ALIGN (op
) < align
)
12346 set_mem_align (op
, align
);
12348 else if (i
== constant
)
12350 /* This must be the constant. */
12351 if (!insn_p
->operand
[nargs
].predicate(op
, SImode
))
12353 error ("the fourth argument must be one of enum %qs", "_CMPCCX_ENUM");
12359 /* This must be register. */
12360 if (VECTOR_MODE_P (mode
))
12361 op
= safe_vector_operand (op
, mode
);
12363 op
= fixup_modeless_constant (op
, mode
);
/* NB: A 3-operand load implies a mask load or v{p}expand*,
   and the mask operand should be at the end.
   Keep an all-ones mask, which will be simplified by the expander.  */
12368 if (nargs
== 3 && i
== 2 && klass
== load
12369 && constm1_operand (op
, mode
)
12370 && insn_p
->operand
[i
].predicate (op
, mode
))
12372 else if (GET_MODE (op
) == mode
|| GET_MODE (op
) == VOIDmode
)
12373 op
= copy_to_mode_reg (mode
, op
);
12376 op
= copy_to_reg (op
);
12377 op
= lowpart_subreg (mode
, op
, GET_MODE (op
));
12387 pat
= GEN_FCN (icode
) (target
);
12390 pat
= GEN_FCN (icode
) (target
, xops
[0]);
12393 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1]);
12396 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1], xops
[2]);
12399 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1], xops
[2], xops
[3]);
12402 gcc_unreachable ();
12409 return klass
== store
? 0 : target
;
12412 /* Return the integer constant in ARG. Constrain it to be in the range
12413 of the subparts of VEC_TYPE; issue an error if not. */
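/* For example, for a V4SF vector type the selector must be in the
   range 0..3.  */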
12416 get_element_number (tree vec_type
, tree arg
)
12418 unsigned HOST_WIDE_INT elt
, max
= TYPE_VECTOR_SUBPARTS (vec_type
) - 1;
12420 if (!tree_fits_uhwi_p (arg
)
12421 || (elt
= tree_to_uhwi (arg
), elt
> max
))
12423 error ("selector must be an integer constant in the range "
/* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
   ix86_expand_vector_init.  We DO have language-level syntax for this, in
   the form of (type){ init-list }.  Except that since we can't place emms
   instructions from inside the compiler, we can't allow the use of MMX
   registers unless the user explicitly asks for it.  So we do *not* define
   vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md.  Instead
   we have builtins invoked by mmintrin.h that give us license to emit
   these sorts of instructions.  */
12441 ix86_expand_vec_init_builtin (tree type
, tree exp
, rtx target
)
12443 machine_mode tmode
= TYPE_MODE (type
);
12444 machine_mode inner_mode
= GET_MODE_INNER (tmode
);
12445 int i
, n_elt
= GET_MODE_NUNITS (tmode
);
12446 rtvec v
= rtvec_alloc (n_elt
);
12448 gcc_assert (VECTOR_MODE_P (tmode
));
12449 gcc_assert (call_expr_nargs (exp
) == n_elt
);
12451 for (i
= 0; i
< n_elt
; ++i
)
12453 rtx x
= expand_normal (CALL_EXPR_ARG (exp
, i
));
12454 RTVEC_ELT (v
, i
) = gen_lowpart (inner_mode
, x
);
12457 if (!target
|| !register_operand (target
, tmode
))
12458 target
= gen_reg_rtx (tmode
);
12460 ix86_expand_vector_init (true, target
, gen_rtx_PARALLEL (tmode
, v
));
12464 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
12465 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
12466 had a language-level syntax for referencing vector elements. */
12469 ix86_expand_vec_ext_builtin (tree exp
, rtx target
)
12471 machine_mode tmode
, mode0
;
12476 arg0
= CALL_EXPR_ARG (exp
, 0);
12477 arg1
= CALL_EXPR_ARG (exp
, 1);
12479 op0
= expand_normal (arg0
);
12480 elt
= get_element_number (TREE_TYPE (arg0
), arg1
);
12482 tmode
= TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0
)));
12483 mode0
= TYPE_MODE (TREE_TYPE (arg0
));
12484 gcc_assert (VECTOR_MODE_P (mode0
));
12486 op0
= force_reg (mode0
, op0
);
12488 if (optimize
|| !target
|| !register_operand (target
, tmode
))
12489 target
= gen_reg_rtx (tmode
);
12491 ix86_expand_vector_extract (true, target
, op0
, elt
);
12496 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
12497 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
12498 a language-level syntax for referencing vector elements. */
12501 ix86_expand_vec_set_builtin (tree exp
)
12503 machine_mode tmode
, mode1
;
12504 tree arg0
, arg1
, arg2
;
12506 rtx op0
, op1
, target
;
12508 arg0
= CALL_EXPR_ARG (exp
, 0);
12509 arg1
= CALL_EXPR_ARG (exp
, 1);
12510 arg2
= CALL_EXPR_ARG (exp
, 2);
12512 tmode
= TYPE_MODE (TREE_TYPE (arg0
));
12513 mode1
= TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0
)));
12514 gcc_assert (VECTOR_MODE_P (tmode
));
12516 op0
= expand_expr (arg0
, NULL_RTX
, tmode
, EXPAND_NORMAL
);
12517 op1
= expand_expr (arg1
, NULL_RTX
, mode1
, EXPAND_NORMAL
);
12518 elt
= get_element_number (TREE_TYPE (arg0
), arg2
);
12520 if (GET_MODE (op1
) != mode1
)
12521 op1
= convert_modes (mode1
, GET_MODE (op1
), op1
, true);
12523 op0
= force_reg (tmode
, op0
);
12524 op1
= force_reg (mode1
, op1
);
12526 /* OP0 is the source of these builtin functions and shouldn't be
12527 modified. Create a copy, use it and return it as target. */
12528 target
= gen_reg_rtx (tmode
);
12529 emit_move_insn (target
, op0
);
12530 ix86_expand_vector_set (true, target
, op1
, elt
);
/* Return true if the necessary isa options for this builtin exist,
   else return false.
   fcode = DECL_MD_FUNCTION_CODE (fndecl);  */
12539 ix86_check_builtin_isa_match (unsigned int fcode
,
12540 HOST_WIDE_INT
* pbisa
,
12541 HOST_WIDE_INT
* pbisa2
)
12543 HOST_WIDE_INT isa
= ix86_isa_flags
;
12544 HOST_WIDE_INT isa2
= ix86_isa_flags2
;
12545 HOST_WIDE_INT bisa
= ix86_builtins_isa
[fcode
].isa
;
12546 HOST_WIDE_INT bisa2
= ix86_builtins_isa
[fcode
].isa2
;
/* The general case is we require all the ISAs specified in bisa{,2}
   to be enabled.
   The exceptions are:
   OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
   OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
   OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
   (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL) or
     OPTION_MASK_ISA2_AVXVNNI
   (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL) or
     OPTION_MASK_ISA2_AVXIFMA
   (OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA2_AVX512BF16) or
     OPTION_MASK_ISA2_AVXNECONVERT
   where for each such pair it is sufficient if either of the ISAs is
   enabled, plus if it is ored with other options also those others.
   OPTION_MASK_ISA_MMX in bisa is satisfied also if TARGET_MMX_WITH_SSE.  */
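/* For example, a builtin that requires AVX512VNNI together with
   AVX512VL is also accepted when only AVXVNNI is enabled, and further
   below an MMX-only builtin (other than __builtin_ia32_maskmovq) has
   its MMX requirement rewritten to SSE2 when TARGET_MMX_WITH_SSE
   holds.  */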
12562 if (((bisa
& (OPTION_MASK_ISA_SSE
| OPTION_MASK_ISA_3DNOW_A
))
12563 == (OPTION_MASK_ISA_SSE
| OPTION_MASK_ISA_3DNOW_A
))
12564 && (isa
& (OPTION_MASK_ISA_SSE
| OPTION_MASK_ISA_3DNOW_A
)) != 0)
12565 isa
|= (OPTION_MASK_ISA_SSE
| OPTION_MASK_ISA_3DNOW_A
);
12567 if (((bisa
& (OPTION_MASK_ISA_SSE4_2
| OPTION_MASK_ISA_CRC32
))
12568 == (OPTION_MASK_ISA_SSE4_2
| OPTION_MASK_ISA_CRC32
))
12569 && (isa
& (OPTION_MASK_ISA_SSE4_2
| OPTION_MASK_ISA_CRC32
)) != 0)
12570 isa
|= (OPTION_MASK_ISA_SSE4_2
| OPTION_MASK_ISA_CRC32
);
12572 if (((bisa
& (OPTION_MASK_ISA_FMA
| OPTION_MASK_ISA_FMA4
))
12573 == (OPTION_MASK_ISA_FMA
| OPTION_MASK_ISA_FMA4
))
12574 && (isa
& (OPTION_MASK_ISA_FMA
| OPTION_MASK_ISA_FMA4
)) != 0)
12575 isa
|= (OPTION_MASK_ISA_FMA
| OPTION_MASK_ISA_FMA4
);
12577 if ((((bisa
& (OPTION_MASK_ISA_AVX512VNNI
| OPTION_MASK_ISA_AVX512VL
))
12578 == (OPTION_MASK_ISA_AVX512VNNI
| OPTION_MASK_ISA_AVX512VL
))
12579 || (bisa2
& OPTION_MASK_ISA2_AVXVNNI
) != 0)
12580 && (((isa
& (OPTION_MASK_ISA_AVX512VNNI
| OPTION_MASK_ISA_AVX512VL
))
12581 == (OPTION_MASK_ISA_AVX512VNNI
| OPTION_MASK_ISA_AVX512VL
))
12582 || (isa2
& OPTION_MASK_ISA2_AVXVNNI
) != 0))
12584 isa
|= OPTION_MASK_ISA_AVX512VNNI
| OPTION_MASK_ISA_AVX512VL
;
12585 isa2
|= OPTION_MASK_ISA2_AVXVNNI
;
12588 if ((((bisa
& (OPTION_MASK_ISA_AVX512IFMA
| OPTION_MASK_ISA_AVX512VL
))
12589 == (OPTION_MASK_ISA_AVX512IFMA
| OPTION_MASK_ISA_AVX512VL
))
12590 || (bisa2
& OPTION_MASK_ISA2_AVXIFMA
) != 0)
12591 && (((isa
& (OPTION_MASK_ISA_AVX512IFMA
| OPTION_MASK_ISA_AVX512VL
))
12592 == (OPTION_MASK_ISA_AVX512IFMA
| OPTION_MASK_ISA_AVX512VL
))
12593 || (isa2
& OPTION_MASK_ISA2_AVXIFMA
) != 0))
12595 isa
|= OPTION_MASK_ISA_AVX512IFMA
| OPTION_MASK_ISA_AVX512VL
;
12596 isa2
|= OPTION_MASK_ISA2_AVXIFMA
;
12599 if ((((bisa
& OPTION_MASK_ISA_AVX512VL
) != 0
12600 && (bisa2
& OPTION_MASK_ISA2_AVX512BF16
) != 0)
12601 && (bisa2
& OPTION_MASK_ISA2_AVXNECONVERT
) != 0)
12602 && (((isa
& OPTION_MASK_ISA_AVX512VL
) != 0
12603 && (isa2
& OPTION_MASK_ISA2_AVX512BF16
) != 0)
12604 || (isa2
& OPTION_MASK_ISA2_AVXNECONVERT
) != 0))
12606 isa
|= OPTION_MASK_ISA_AVX512VL
;
12607 isa2
|= OPTION_MASK_ISA2_AVXNECONVERT
| OPTION_MASK_ISA2_AVX512BF16
;
12610 if ((bisa
& OPTION_MASK_ISA_MMX
) && !TARGET_MMX
&& TARGET_MMX_WITH_SSE
12611 /* __builtin_ia32_maskmovq requires MMX registers. */
12612 && fcode
!= IX86_BUILTIN_MASKMOVQ
)
12614 bisa
&= ~OPTION_MASK_ISA_MMX
;
12615 bisa
|= OPTION_MASK_ISA_SSE2
;
12623 return (bisa
& isa
) == bisa
&& (bisa2
& isa2
) == bisa2
;
12626 /* Expand an expression EXP that calls a built-in function,
12627 with result going to TARGET if that's convenient
12628 (and in mode MODE if that's convenient).
12629 SUBTARGET may be used as the target for computing one of EXP's operands.
12630 IGNORE is nonzero if the value is to be ignored. */
12633 ix86_expand_builtin (tree exp
, rtx target
, rtx subtarget
,
12634 machine_mode mode
, int ignore
)
12637 enum insn_code icode
, icode2
;
12638 tree fndecl
= TREE_OPERAND (CALL_EXPR_FN (exp
), 0);
12639 tree arg0
, arg1
, arg2
, arg3
, arg4
;
12640 rtx op0
, op1
, op2
, op3
, op4
, pat
, pat2
, insn
;
12641 machine_mode mode0
, mode1
, mode2
, mode3
, mode4
;
12642 unsigned int fcode
= DECL_MD_FUNCTION_CODE (fndecl
);
12643 HOST_WIDE_INT bisa
, bisa2
;
12645 /* For CPU builtins that can be folded, fold first and expand the fold. */
12648 case IX86_BUILTIN_CPU_INIT
:
12650 /* Make it call __cpu_indicator_init in libgcc. */
12651 tree call_expr
, fndecl
, type
;
12652 type
= build_function_type_list (integer_type_node
, NULL_TREE
);
12653 fndecl
= build_fn_decl ("__cpu_indicator_init", type
);
12654 call_expr
= build_call_expr (fndecl
, 0);
12655 return expand_expr (call_expr
, target
, mode
, EXPAND_NORMAL
);
12657 case IX86_BUILTIN_CPU_IS
:
12658 case IX86_BUILTIN_CPU_SUPPORTS
:
12660 tree arg0
= CALL_EXPR_ARG (exp
, 0);
12661 tree fold_expr
= fold_builtin_cpu (fndecl
, &arg0
);
12662 gcc_assert (fold_expr
!= NULL_TREE
);
12663 return expand_expr (fold_expr
, target
, mode
, EXPAND_NORMAL
);
12667 if (!ix86_check_builtin_isa_match (fcode
, &bisa
, &bisa2
))
12669 bool add_abi_p
= bisa
& OPTION_MASK_ISA_64BIT
;
12670 if (TARGET_ABI_X32
)
12671 bisa
|= OPTION_MASK_ABI_X32
;
12673 bisa
|= OPTION_MASK_ABI_64
;
12674 char *opts
= ix86_target_string (bisa
, bisa2
, 0, 0, NULL
, NULL
,
12675 (enum fpmath_unit
) 0,
12676 (enum prefer_vector_width
) 0,
12677 PVW_NONE
, PVW_NONE
,
12680 error ("%qE needs unknown isa option", fndecl
);
12683 gcc_assert (opts
!= NULL
);
12684 error ("%qE needs isa option %s", fndecl
, opts
);
12687 return expand_call (exp
, target
, ignore
);
12692 case IX86_BUILTIN_MASKMOVQ
:
12693 case IX86_BUILTIN_MASKMOVDQU
:
12694 icode
= (fcode
== IX86_BUILTIN_MASKMOVQ
12695 ? CODE_FOR_mmx_maskmovq
12696 : CODE_FOR_sse2_maskmovdqu
);
12697 /* Note the arg order is different from the operand order. */
12698 arg1
= CALL_EXPR_ARG (exp
, 0);
12699 arg2
= CALL_EXPR_ARG (exp
, 1);
12700 arg0
= CALL_EXPR_ARG (exp
, 2);
12701 op0
= expand_normal (arg0
);
12702 op1
= expand_normal (arg1
);
12703 op2
= expand_normal (arg2
);
12704 mode0
= insn_data
[icode
].operand
[0].mode
;
12705 mode1
= insn_data
[icode
].operand
[1].mode
;
12706 mode2
= insn_data
[icode
].operand
[2].mode
;
12708 op0
= ix86_zero_extend_to_Pmode (op0
);
12709 op0
= gen_rtx_MEM (mode1
, op0
);
12711 if (!insn_data
[icode
].operand
[0].predicate (op0
, mode0
))
12712 op0
= copy_to_mode_reg (mode0
, op0
);
12713 if (!insn_data
[icode
].operand
[1].predicate (op1
, mode1
))
12714 op1
= copy_to_mode_reg (mode1
, op1
);
12715 if (!insn_data
[icode
].operand
[2].predicate (op2
, mode2
))
12716 op2
= copy_to_mode_reg (mode2
, op2
);
12717 pat
= GEN_FCN (icode
) (op0
, op1
, op2
);
12723 case IX86_BUILTIN_LDMXCSR
:
12724 op0
= expand_normal (CALL_EXPR_ARG (exp
, 0));
12725 target
= assign_386_stack_local (SImode
, SLOT_TEMP
);
12726 emit_move_insn (target
, op0
);
12727 emit_insn (gen_sse_ldmxcsr (target
));
12730 case IX86_BUILTIN_STMXCSR
:
12731 target
= assign_386_stack_local (SImode
, SLOT_TEMP
);
12732 emit_insn (gen_sse_stmxcsr (target
));
12733 return copy_to_mode_reg (SImode
, target
);
12735 case IX86_BUILTIN_CLFLUSH
:
12736 arg0
= CALL_EXPR_ARG (exp
, 0);
12737 op0
= expand_normal (arg0
);
12738 icode
= CODE_FOR_sse2_clflush
;
12739 if (!insn_data
[icode
].operand
[0].predicate (op0
, Pmode
))
12740 op0
= ix86_zero_extend_to_Pmode (op0
);
12742 emit_insn (gen_sse2_clflush (op0
));
12745 case IX86_BUILTIN_CLWB
:
12746 arg0
= CALL_EXPR_ARG (exp
, 0);
12747 op0
= expand_normal (arg0
);
12748 icode
= CODE_FOR_clwb
;
12749 if (!insn_data
[icode
].operand
[0].predicate (op0
, Pmode
))
12750 op0
= ix86_zero_extend_to_Pmode (op0
);
12752 emit_insn (gen_clwb (op0
));
12755 case IX86_BUILTIN_CLFLUSHOPT
:
12756 arg0
= CALL_EXPR_ARG (exp
, 0);
12757 op0
= expand_normal (arg0
);
12758 icode
= CODE_FOR_clflushopt
;
12759 if (!insn_data
[icode
].operand
[0].predicate (op0
, Pmode
))
12760 op0
= ix86_zero_extend_to_Pmode (op0
);
12762 emit_insn (gen_clflushopt (op0
));
12765 case IX86_BUILTIN_MONITOR
:
12766 case IX86_BUILTIN_MONITORX
:
12767 arg0
= CALL_EXPR_ARG (exp
, 0);
12768 arg1
= CALL_EXPR_ARG (exp
, 1);
12769 arg2
= CALL_EXPR_ARG (exp
, 2);
12770 op0
= expand_normal (arg0
);
12771 op1
= expand_normal (arg1
);
12772 op2
= expand_normal (arg2
);
12774 op0
= ix86_zero_extend_to_Pmode (op0
);
12776 op1
= copy_to_mode_reg (SImode
, op1
);
12778 op2
= copy_to_mode_reg (SImode
, op2
);
12780 emit_insn (fcode
== IX86_BUILTIN_MONITOR
12781 ? gen_sse3_monitor (Pmode
, op0
, op1
, op2
)
12782 : gen_monitorx (Pmode
, op0
, op1
, op2
));
12785 case IX86_BUILTIN_MWAIT
:
12786 arg0
= CALL_EXPR_ARG (exp
, 0);
12787 arg1
= CALL_EXPR_ARG (exp
, 1);
12788 op0
= expand_normal (arg0
);
12789 op1
= expand_normal (arg1
);
12791 op0
= copy_to_mode_reg (SImode
, op0
);
12793 op1
= copy_to_mode_reg (SImode
, op1
);
12794 emit_insn (gen_sse3_mwait (op0
, op1
));
12797 case IX86_BUILTIN_MWAITX
:
12798 arg0
= CALL_EXPR_ARG (exp
, 0);
12799 arg1
= CALL_EXPR_ARG (exp
, 1);
12800 arg2
= CALL_EXPR_ARG (exp
, 2);
12801 op0
= expand_normal (arg0
);
12802 op1
= expand_normal (arg1
);
12803 op2
= expand_normal (arg2
);
12805 op0
= copy_to_mode_reg (SImode
, op0
);
12807 op1
= copy_to_mode_reg (SImode
, op1
);
12809 op2
= copy_to_mode_reg (SImode
, op2
);
12810 emit_insn (gen_mwaitx (op0
, op1
, op2
));
12813 case IX86_BUILTIN_UMONITOR
:
12814 arg0
= CALL_EXPR_ARG (exp
, 0);
12815 op0
= expand_normal (arg0
);
12817 op0
= ix86_zero_extend_to_Pmode (op0
);
12818 emit_insn (gen_umonitor (Pmode
, op0
));
12821 case IX86_BUILTIN_UMWAIT
:
12822 case IX86_BUILTIN_TPAUSE
:
12823 arg0
= CALL_EXPR_ARG (exp
, 0);
12824 arg1
= CALL_EXPR_ARG (exp
, 1);
12825 op0
= expand_normal (arg0
);
12826 op1
= expand_normal (arg1
);
12829 op0
= copy_to_mode_reg (SImode
, op0
);
12831 op1
= force_reg (DImode
, op1
);
12835 op2
= expand_simple_binop (DImode
, LSHIFTRT
, op1
, GEN_INT (32),
12836 NULL
, 1, OPTAB_DIRECT
);
12839 case IX86_BUILTIN_UMWAIT
:
12840 icode
= CODE_FOR_umwait_rex64
;
12842 case IX86_BUILTIN_TPAUSE
:
12843 icode
= CODE_FOR_tpause_rex64
;
12846 gcc_unreachable ();
12849 op2
= gen_lowpart (SImode
, op2
);
12850 op1
= gen_lowpart (SImode
, op1
);
12851 pat
= GEN_FCN (icode
) (op0
, op1
, op2
);
12857 case IX86_BUILTIN_UMWAIT
:
12858 icode
= CODE_FOR_umwait
;
12860 case IX86_BUILTIN_TPAUSE
:
12861 icode
= CODE_FOR_tpause
;
12864 gcc_unreachable ();
12866 pat
= GEN_FCN (icode
) (op0
, op1
);
12875 || !register_operand (target
, QImode
))
12876 target
= gen_reg_rtx (QImode
);
12878 pat
= gen_rtx_EQ (QImode
, gen_rtx_REG (CCCmode
, FLAGS_REG
),
12880 emit_insn (gen_rtx_SET (target
, pat
));
12884 case IX86_BUILTIN_TESTUI
:
12885 emit_insn (gen_testui ());
12888 || !register_operand (target
, QImode
))
12889 target
= gen_reg_rtx (QImode
);
12891 pat
= gen_rtx_LTU (QImode
, gen_rtx_REG (CCCmode
, FLAGS_REG
),
12893 emit_insn (gen_rtx_SET (target
, pat
));
12897 case IX86_BUILTIN_CLZERO
:
12898 arg0
= CALL_EXPR_ARG (exp
, 0);
12899 op0
= expand_normal (arg0
);
12901 op0
= ix86_zero_extend_to_Pmode (op0
);
12902 emit_insn (gen_clzero (Pmode
, op0
));
12905 case IX86_BUILTIN_CLDEMOTE
:
12906 arg0
= CALL_EXPR_ARG (exp
, 0);
12907 op0
= expand_normal (arg0
);
12908 icode
= CODE_FOR_cldemote
;
12909 if (!insn_data
[icode
].operand
[0].predicate (op0
, Pmode
))
12910 op0
= ix86_zero_extend_to_Pmode (op0
);
12912 emit_insn (gen_cldemote (op0
));
12915 case IX86_BUILTIN_LOADIWKEY
:
12917 arg0
= CALL_EXPR_ARG (exp
, 0);
12918 arg1
= CALL_EXPR_ARG (exp
, 1);
12919 arg2
= CALL_EXPR_ARG (exp
, 2);
12920 arg3
= CALL_EXPR_ARG (exp
, 3);
12922 op0
= expand_normal (arg0
);
12923 op1
= expand_normal (arg1
);
12924 op2
= expand_normal (arg2
);
12925 op3
= expand_normal (arg3
);
12928 op0
= copy_to_mode_reg (V2DImode
, op0
);
12930 op1
= copy_to_mode_reg (V2DImode
, op1
);
12932 op2
= copy_to_mode_reg (V2DImode
, op2
);
12934 op3
= copy_to_mode_reg (SImode
, op3
);
12936 emit_insn (gen_loadiwkey (op0
, op1
, op2
, op3
));
    case IX86_BUILTIN_AESDEC128KLU8:
      icode = CODE_FOR_aesdec128klu8;
      goto aesdecenc_expand;

    case IX86_BUILTIN_AESDEC256KLU8:
      icode = CODE_FOR_aesdec256klu8;
      goto aesdecenc_expand;

    case IX86_BUILTIN_AESENC128KLU8:
      icode = CODE_FOR_aesenc128klu8;
      goto aesdecenc_expand;

    case IX86_BUILTIN_AESENC256KLU8:
      icode = CODE_FOR_aesenc256klu8;

    aesdecenc_expand:

      arg0 = CALL_EXPR_ARG (exp, 0); // __m128i *odata
      arg1 = CALL_EXPR_ARG (exp, 1); // __m128i idata
      arg2 = CALL_EXPR_ARG (exp, 2); // const void *p

      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);

      if (!address_operand (op0, V2DImode))
	{
	  op0 = convert_memory_address (Pmode, op0);
	  op0 = copy_addr_to_reg (op0);
	}
      op0 = gen_rtx_MEM (V2DImode, op0);

      if (!REG_P (op1))
	op1 = copy_to_mode_reg (V2DImode, op1);

      if (!address_operand (op2, VOIDmode))
	{
	  op2 = convert_memory_address (Pmode, op2);
	  op2 = copy_addr_to_reg (op2);
	}
      op2 = gen_rtx_MEM (BLKmode, op2);

      emit_insn (GEN_FCN (icode) (op1, op1, op2));

      if (target == 0)
	target = gen_reg_rtx (QImode);

      /* NB: For the aesenc/aesdec keylocker insns, ZF is set when a runtime
	 error occurs, and the output should then be cleared for safety.  */
      rtx_code_label *ok_label;
      rtx tmp;

      tmp = gen_rtx_REG (CCZmode, FLAGS_REG);
      pat = gen_rtx_EQ (QImode, tmp, const0_rtx);
      ok_label = gen_label_rtx ();
      emit_cmp_and_jump_insns (tmp, const0_rtx, NE, 0, GET_MODE (tmp),
			       0, ok_label);
      /* The runtime error rarely occurs, so predict the OK path as hot
	 and lay it out as the fallthrough block.  */
      predict_jump (REG_BR_PROB_BASE * 90 / 100);

      emit_insn (gen_rtx_SET (op1, const0_rtx));

      emit_label (ok_label);
      emit_insn (gen_rtx_SET (target, pat));
      emit_insn (gen_rtx_SET (op0, op1));

      return target;
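      /* Illustrative usage sketch (not part of this file): the Key Locker
	 intrinsics expanded above have the shape

	   __m128i out;
	   unsigned char status = _mm_aesenc128kl_u8 (&out, data, handle);

	 where the status is derived from ZF and the output is forced to
	 zero on the error path emitted above.  */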
13010 case IX86_BUILTIN_AESDECWIDE128KLU8
:
13011 icode
= CODE_FOR_aesdecwide128klu8
;
13012 goto wideaesdecenc_expand
;
13014 case IX86_BUILTIN_AESDECWIDE256KLU8
:
13015 icode
= CODE_FOR_aesdecwide256klu8
;
13016 goto wideaesdecenc_expand
;
13018 case IX86_BUILTIN_AESENCWIDE128KLU8
:
13019 icode
= CODE_FOR_aesencwide128klu8
;
13020 goto wideaesdecenc_expand
;
13022 case IX86_BUILTIN_AESENCWIDE256KLU8
:
13023 icode
= CODE_FOR_aesencwide256klu8
;
13025 wideaesdecenc_expand
:
13030 arg0
= CALL_EXPR_ARG (exp
, 0); // __m128i * odata
13031 arg1
= CALL_EXPR_ARG (exp
, 1); // const __m128i * idata
13032 arg2
= CALL_EXPR_ARG (exp
, 2); // const void *p
13034 op0
= expand_normal (arg0
);
13035 op1
= expand_normal (arg1
);
13036 op2
= expand_normal (arg2
);
13038 if (!address_operand (op2
, VOIDmode
))
13040 op2
= convert_memory_address (Pmode
, op2
);
13041 op2
= copy_addr_to_reg (op2
);
13043 op2
= gen_rtx_MEM (BLKmode
, op2
);
13045 for (i
= 0; i
< 8; i
++)
13047 xmm_regs
[i
] = gen_rtx_REG (V2DImode
, GET_SSE_REGNO (i
));
13049 op
= gen_rtx_MEM (V2DImode
,
13050 plus_constant (Pmode
, op1
, (i
* 16)));
13052 emit_move_insn (xmm_regs
[i
], op
);
13055 emit_insn (GEN_FCN (icode
) (op2
));
13058 target
= gen_reg_rtx (QImode
);
13060 tmp
= gen_rtx_REG (CCZmode
, FLAGS_REG
);
13061 pat
= gen_rtx_EQ (QImode
, tmp
, const0_rtx
);
13062 ok_label
= gen_label_rtx ();
13063 emit_cmp_and_jump_insns (tmp
, const0_rtx
, NE
, 0, GET_MODE (tmp
),
13065 predict_jump (REG_BR_PROB_BASE
* 90 / 100);
13067 for (i
= 0; i
< 8; i
++)
13068 emit_insn (gen_rtx_SET (xmm_regs
[i
], const0_rtx
));
13070 emit_label (ok_label
);
13071 emit_insn (gen_rtx_SET (target
, pat
));
13073 for (i
= 0; i
< 8; i
++)
13075 op
= gen_rtx_MEM (V2DImode
,
13076 plus_constant (Pmode
, op0
, (i
* 16)));
13077 emit_move_insn (op
, xmm_regs
[i
]);
    case IX86_BUILTIN_ENCODEKEY128U32:
      {
	rtx op, xmm_regs[7];

	arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
	arg1 = CALL_EXPR_ARG (exp, 1); // __m128i key
	arg2 = CALL_EXPR_ARG (exp, 2); // void *h

	op0 = expand_normal (arg0);
	op1 = expand_normal (arg1);
	op2 = expand_normal (arg2);

	if (!REG_P (op0))
	  op0 = copy_to_mode_reg (SImode, op0);

	op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
	emit_move_insn (op, op1);

	for (i = 0; i < 3; i++)
	  xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));

	if (target == 0)
	  target = gen_reg_rtx (SImode);

	emit_insn (gen_encodekey128u32 (target, op0));

	for (i = 0; i < 3; i++)
	  {
	    op = gen_rtx_MEM (V2DImode,
			      plus_constant (Pmode, op2, (i * 16)));
	    emit_move_insn (op, xmm_regs[i]);
	  }

	return target;
      }
    case IX86_BUILTIN_ENCODEKEY256U32:
      {
	rtx op, xmm_regs[7];

	arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
	arg1 = CALL_EXPR_ARG (exp, 1); // __m128i keylow
	arg2 = CALL_EXPR_ARG (exp, 2); // __m128i keyhi
	arg3 = CALL_EXPR_ARG (exp, 3); // void *h

	op0 = expand_normal (arg0);
	op1 = expand_normal (arg1);
	op2 = expand_normal (arg2);
	op3 = expand_normal (arg3);

	if (!REG_P (op0))
	  op0 = copy_to_mode_reg (SImode, op0);

	/* Force xmm0 and xmm1 to hold keylow and keyhi.  */
	op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
	emit_move_insn (op, op1);
	op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (1));
	emit_move_insn (op, op2);

	for (i = 0; i < 4; i++)
	  xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));

	if (target == 0)
	  target = gen_reg_rtx (SImode);

	emit_insn (gen_encodekey256u32 (target, op0));

	for (i = 0; i < 4; i++)
	  {
	    op = gen_rtx_MEM (V2DImode,
			      plus_constant (Pmode, op3, (i * 16)));
	    emit_move_insn (op, xmm_regs[i]);
	  }

	return target;
      }
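      /* Illustrative usage sketch (not part of this file), assuming the
	 usual keylockerintrin.h entry point:

	   unsigned char handle[64];
	   unsigned int type = _mm_encodekey256_u32 (0, keylo, keyhi, handle);

	 The loop above spills xmm0..xmm3 into that 64-byte handle buffer.  */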
    case IX86_BUILTIN_PREFETCH:

      arg0 = CALL_EXPR_ARG (exp, 0); // const void *
      arg1 = CALL_EXPR_ARG (exp, 1); // const int
      arg2 = CALL_EXPR_ARG (exp, 2); // const int
      arg3 = CALL_EXPR_ARG (exp, 3); // const int

      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      op3 = expand_normal (arg3);

      if (!CONST_INT_P (op1) || !CONST_INT_P (op2) || !CONST_INT_P (op3))
	{
	  error ("second, third and fourth arguments must be constant");
	  return const0_rtx;
	}

      if (INTVAL (op3) == 1)
	{
	  if (TARGET_64BIT && TARGET_PREFETCHI
	      && local_func_symbolic_operand (op0, GET_MODE (op0)))
	    emit_insn (gen_prefetchi (op0, op2));
	  else
	    {
	      warning (0, "instruction prefetch applies when in 64-bit mode"
			  " with RIP-relative addressing and"
			  " option %<-mprefetchi%>;"
			  " it stays a NOP otherwise");
	      emit_insn (gen_nop ());
	    }
	  return 0;
	}

      if (!address_operand (op0, VOIDmode))
	{
	  op0 = convert_memory_address (Pmode, op0);
	  op0 = copy_addr_to_reg (op0);
	}

      if (TARGET_3DNOW || TARGET_PREFETCH_SSE
	  || TARGET_PRFCHW || TARGET_PREFETCHWT1)
	emit_insn (gen_prefetch (op0, op1, op2));
      else if (!MEM_P (op0) && side_effects_p (op0))
	/* Don't do anything with direct references to volatile memory,
	   but generate code to handle other side effects.  */
	emit_insn (op0);

      return 0;
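      /* Illustrative usage sketch (not part of this file): the four
	 arguments are address, read/write hint, locality, and the
	 data-vs-instruction selector checked above; argument roles are
	 inferred from this expansion, e.g.

	   __builtin_ia32_prefetch (p, 0, 3, 0);   // data prefetch
	   __builtin_ia32_prefetch (f, 0, 3, 1);   // instruction prefetch  */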
    case IX86_BUILTIN_PREFETCHI:

      arg0 = CALL_EXPR_ARG (exp, 0); // const void *
      arg1 = CALL_EXPR_ARG (exp, 1); // const int

      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);

      if (!CONST_INT_P (op1))
	{
	  error ("second argument must be constant");
	  return const0_rtx;
	}

      /* GOT/PLT_PIC should not be available for instruction prefetch.
	 It must be a real instruction address.  */
      if (TARGET_64BIT
	  && local_func_symbolic_operand (op0, GET_MODE (op0)))
	emit_insn (gen_prefetchi (op0, op1));
      else
	{
	  /* Ignore the hint.  */
	  warning (0, "instruction prefetch applies when in 64-bit mode"
		      " with RIP-relative addressing and"
		      " option %<-mprefetchi%>;"
		      " it stays a NOP otherwise");
	  emit_insn (gen_nop ());
	}

      return 0;
    case IX86_BUILTIN_VEC_INIT_V2SI:
    case IX86_BUILTIN_VEC_INIT_V4HI:
    case IX86_BUILTIN_VEC_INIT_V8QI:
      return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);

    case IX86_BUILTIN_VEC_EXT_V2DF:
    case IX86_BUILTIN_VEC_EXT_V2DI:
    case IX86_BUILTIN_VEC_EXT_V4SF:
    case IX86_BUILTIN_VEC_EXT_V4SI:
    case IX86_BUILTIN_VEC_EXT_V8HI:
    case IX86_BUILTIN_VEC_EXT_V2SI:
    case IX86_BUILTIN_VEC_EXT_V4HI:
    case IX86_BUILTIN_VEC_EXT_V16QI:
      return ix86_expand_vec_ext_builtin (exp, target);

    case IX86_BUILTIN_VEC_SET_V2DI:
    case IX86_BUILTIN_VEC_SET_V4SF:
    case IX86_BUILTIN_VEC_SET_V4SI:
    case IX86_BUILTIN_VEC_SET_V8HI:
    case IX86_BUILTIN_VEC_SET_V4HI:
    case IX86_BUILTIN_VEC_SET_V16QI:
      return ix86_expand_vec_set_builtin (exp);

    case IX86_BUILTIN_NANQ:
    case IX86_BUILTIN_NANSQ:
      return expand_call (exp, target, ignore);
    case IX86_BUILTIN_RDPID:

      op0 = gen_reg_rtx (word_mode);

      if (TARGET_64BIT)
	{
	  insn = gen_rdpid_rex64 (op0);
	  op0 = convert_to_mode (SImode, op0, 1);
	}
      else
	insn = gen_rdpid (op0);

      emit_insn (insn);

      if (target == 0
	  || !register_operand (target, SImode))
	target = gen_reg_rtx (SImode);

      emit_move_insn (target, op0);
      return target;
13290 case IX86_BUILTIN_2INTERSECTD512
:
13291 case IX86_BUILTIN_2INTERSECTQ512
:
13292 case IX86_BUILTIN_2INTERSECTD256
:
13293 case IX86_BUILTIN_2INTERSECTQ256
:
13294 case IX86_BUILTIN_2INTERSECTD128
:
13295 case IX86_BUILTIN_2INTERSECTQ128
:
13296 arg0
= CALL_EXPR_ARG (exp
, 0);
13297 arg1
= CALL_EXPR_ARG (exp
, 1);
13298 arg2
= CALL_EXPR_ARG (exp
, 2);
13299 arg3
= CALL_EXPR_ARG (exp
, 3);
13300 op0
= expand_normal (arg0
);
13301 op1
= expand_normal (arg1
);
13302 op2
= expand_normal (arg2
);
13303 op3
= expand_normal (arg3
);
13305 if (!address_operand (op0
, VOIDmode
))
13307 op0
= convert_memory_address (Pmode
, op0
);
13308 op0
= copy_addr_to_reg (op0
);
13310 if (!address_operand (op1
, VOIDmode
))
13312 op1
= convert_memory_address (Pmode
, op1
);
13313 op1
= copy_addr_to_reg (op1
);
13318 case IX86_BUILTIN_2INTERSECTD512
:
13320 icode
= CODE_FOR_avx512vp2intersect_2intersectv16si
;
13322 case IX86_BUILTIN_2INTERSECTQ512
:
13324 icode
= CODE_FOR_avx512vp2intersect_2intersectv8di
;
13326 case IX86_BUILTIN_2INTERSECTD256
:
13328 icode
= CODE_FOR_avx512vp2intersect_2intersectv8si
;
13330 case IX86_BUILTIN_2INTERSECTQ256
:
13332 icode
= CODE_FOR_avx512vp2intersect_2intersectv4di
;
13334 case IX86_BUILTIN_2INTERSECTD128
:
13336 icode
= CODE_FOR_avx512vp2intersect_2intersectv4si
;
13338 case IX86_BUILTIN_2INTERSECTQ128
:
13340 icode
= CODE_FOR_avx512vp2intersect_2intersectv2di
;
13343 gcc_unreachable ();
13346 mode2
= insn_data
[icode
].operand
[1].mode
;
13347 mode3
= insn_data
[icode
].operand
[2].mode
;
13348 if (!insn_data
[icode
].operand
[1].predicate (op2
, mode2
))
13349 op2
= copy_to_mode_reg (mode2
, op2
);
13350 if (!insn_data
[icode
].operand
[2].predicate (op3
, mode3
))
13351 op3
= copy_to_mode_reg (mode3
, op3
);
13353 op4
= gen_reg_rtx (mode4
);
13354 emit_insn (GEN_FCN (icode
) (op4
, op2
, op3
));
13355 mode0
= mode4
== P2HImode
? HImode
: QImode
;
13356 emit_move_insn (gen_rtx_MEM (mode0
, op0
),
13357 gen_lowpart (mode0
, op4
));
13358 emit_move_insn (gen_rtx_MEM (mode0
, op1
),
13359 gen_highpart (mode0
, op4
));
13363 case IX86_BUILTIN_RDPMC
:
13364 case IX86_BUILTIN_RDTSC
:
13365 case IX86_BUILTIN_RDTSCP
:
13366 case IX86_BUILTIN_XGETBV
:
13368 op0
= gen_reg_rtx (DImode
);
13369 op1
= gen_reg_rtx (DImode
);
13371 if (fcode
== IX86_BUILTIN_RDPMC
)
13373 arg0
= CALL_EXPR_ARG (exp
, 0);
13374 op2
= expand_normal (arg0
);
13375 if (!register_operand (op2
, SImode
))
13376 op2
= copy_to_mode_reg (SImode
, op2
);
13378 insn
= (TARGET_64BIT
13379 ? gen_rdpmc_rex64 (op0
, op1
, op2
)
13380 : gen_rdpmc (op0
, op2
));
13383 else if (fcode
== IX86_BUILTIN_XGETBV
)
13385 arg0
= CALL_EXPR_ARG (exp
, 0);
13386 op2
= expand_normal (arg0
);
13387 if (!register_operand (op2
, SImode
))
13388 op2
= copy_to_mode_reg (SImode
, op2
);
13390 insn
= (TARGET_64BIT
13391 ? gen_xgetbv_rex64 (op0
, op1
, op2
)
13392 : gen_xgetbv (op0
, op2
));
13395 else if (fcode
== IX86_BUILTIN_RDTSC
)
13397 insn
= (TARGET_64BIT
13398 ? gen_rdtsc_rex64 (op0
, op1
)
13399 : gen_rdtsc (op0
));
13404 op2
= gen_reg_rtx (SImode
);
13406 insn
= (TARGET_64BIT
13407 ? gen_rdtscp_rex64 (op0
, op1
, op2
)
13408 : gen_rdtscp (op0
, op2
));
13411 arg0
= CALL_EXPR_ARG (exp
, 0);
13412 op4
= expand_normal (arg0
);
13413 if (!address_operand (op4
, VOIDmode
))
13415 op4
= convert_memory_address (Pmode
, op4
);
13416 op4
= copy_addr_to_reg (op4
);
13418 emit_move_insn (gen_rtx_MEM (SImode
, op4
), op2
);
13422 || !register_operand (target
, DImode
))
13423 target
= gen_reg_rtx (DImode
);
13427 op1
= expand_simple_binop (DImode
, ASHIFT
, op1
, GEN_INT (32),
13428 op1
, 1, OPTAB_DIRECT
);
13429 op0
= expand_simple_binop (DImode
, IOR
, op0
, op1
,
13430 op0
, 1, OPTAB_DIRECT
);
13433 emit_move_insn (target
, op0
);
13436 case IX86_BUILTIN_ENQCMD
:
13437 case IX86_BUILTIN_ENQCMDS
:
13438 case IX86_BUILTIN_MOVDIR64B
:
13440 arg0
= CALL_EXPR_ARG (exp
, 0);
13441 arg1
= CALL_EXPR_ARG (exp
, 1);
13442 op0
= expand_normal (arg0
);
13443 op1
= expand_normal (arg1
);
13445 op0
= ix86_zero_extend_to_Pmode (op0
);
13446 if (!address_operand (op1
, VOIDmode
))
13448 op1
= convert_memory_address (Pmode
, op1
);
13449 op1
= copy_addr_to_reg (op1
);
13451 op1
= gen_rtx_MEM (XImode
, op1
);
13453 if (fcode
== IX86_BUILTIN_MOVDIR64B
)
13455 emit_insn (gen_movdir64b (Pmode
, op0
, op1
));
13461 || !register_operand (target
, SImode
))
13462 target
= gen_reg_rtx (SImode
);
13464 emit_move_insn (target
, const0_rtx
);
13465 target
= gen_rtx_SUBREG (QImode
, target
, 0);
13467 int unspecv
= (fcode
== IX86_BUILTIN_ENQCMD
13469 : UNSPECV_ENQCMDS
);
13470 icode
= code_for_enqcmd (unspecv
, Pmode
);
13471 emit_insn (GEN_FCN (icode
) (op0
, op1
));
13474 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode
, target
),
13475 gen_rtx_fmt_ee (EQ
, QImode
,
13476 gen_rtx_REG (CCZmode
, FLAGS_REG
),
13478 return SUBREG_REG (target
);
13481 case IX86_BUILTIN_FXSAVE
:
13482 case IX86_BUILTIN_FXRSTOR
:
13483 case IX86_BUILTIN_FXSAVE64
:
13484 case IX86_BUILTIN_FXRSTOR64
:
13485 case IX86_BUILTIN_FNSTENV
:
13486 case IX86_BUILTIN_FLDENV
:
13490 case IX86_BUILTIN_FXSAVE
:
13491 icode
= CODE_FOR_fxsave
;
13493 case IX86_BUILTIN_FXRSTOR
:
13494 icode
= CODE_FOR_fxrstor
;
13496 case IX86_BUILTIN_FXSAVE64
:
13497 icode
= CODE_FOR_fxsave64
;
13499 case IX86_BUILTIN_FXRSTOR64
:
13500 icode
= CODE_FOR_fxrstor64
;
13502 case IX86_BUILTIN_FNSTENV
:
13503 icode
= CODE_FOR_fnstenv
;
13505 case IX86_BUILTIN_FLDENV
:
13506 icode
= CODE_FOR_fldenv
;
13509 gcc_unreachable ();
13512 arg0
= CALL_EXPR_ARG (exp
, 0);
13513 op0
= expand_normal (arg0
);
13515 if (!address_operand (op0
, VOIDmode
))
13517 op0
= convert_memory_address (Pmode
, op0
);
13518 op0
= copy_addr_to_reg (op0
);
13520 op0
= gen_rtx_MEM (mode0
, op0
);
13522 pat
= GEN_FCN (icode
) (op0
);
13527 case IX86_BUILTIN_XSETBV
:
13528 arg0
= CALL_EXPR_ARG (exp
, 0);
13529 arg1
= CALL_EXPR_ARG (exp
, 1);
13530 op0
= expand_normal (arg0
);
13531 op1
= expand_normal (arg1
);
13534 op0
= copy_to_mode_reg (SImode
, op0
);
13536 op1
= force_reg (DImode
, op1
);
13540 op2
= expand_simple_binop (DImode
, LSHIFTRT
, op1
, GEN_INT (32),
13541 NULL
, 1, OPTAB_DIRECT
);
13543 icode
= CODE_FOR_xsetbv_rex64
;
13545 op2
= gen_lowpart (SImode
, op2
);
13546 op1
= gen_lowpart (SImode
, op1
);
13547 pat
= GEN_FCN (icode
) (op0
, op1
, op2
);
13551 icode
= CODE_FOR_xsetbv
;
13553 pat
= GEN_FCN (icode
) (op0
, op1
);
13559 case IX86_BUILTIN_XSAVE
:
13560 case IX86_BUILTIN_XRSTOR
:
13561 case IX86_BUILTIN_XSAVE64
:
13562 case IX86_BUILTIN_XRSTOR64
:
13563 case IX86_BUILTIN_XSAVEOPT
:
13564 case IX86_BUILTIN_XSAVEOPT64
:
13565 case IX86_BUILTIN_XSAVES
:
13566 case IX86_BUILTIN_XRSTORS
:
13567 case IX86_BUILTIN_XSAVES64
:
13568 case IX86_BUILTIN_XRSTORS64
:
13569 case IX86_BUILTIN_XSAVEC
:
13570 case IX86_BUILTIN_XSAVEC64
:
13571 arg0
= CALL_EXPR_ARG (exp
, 0);
13572 arg1
= CALL_EXPR_ARG (exp
, 1);
13573 op0
= expand_normal (arg0
);
13574 op1
= expand_normal (arg1
);
13576 if (!address_operand (op0
, VOIDmode
))
13578 op0
= convert_memory_address (Pmode
, op0
);
13579 op0
= copy_addr_to_reg (op0
);
13581 op0
= gen_rtx_MEM (BLKmode
, op0
);
13583 op1
= force_reg (DImode
, op1
);
13587 op2
= expand_simple_binop (DImode
, LSHIFTRT
, op1
, GEN_INT (32),
13588 NULL
, 1, OPTAB_DIRECT
);
13591 case IX86_BUILTIN_XSAVE
:
13592 icode
= CODE_FOR_xsave_rex64
;
13594 case IX86_BUILTIN_XRSTOR
:
13595 icode
= CODE_FOR_xrstor_rex64
;
13597 case IX86_BUILTIN_XSAVE64
:
13598 icode
= CODE_FOR_xsave64
;
13600 case IX86_BUILTIN_XRSTOR64
:
13601 icode
= CODE_FOR_xrstor64
;
13603 case IX86_BUILTIN_XSAVEOPT
:
13604 icode
= CODE_FOR_xsaveopt_rex64
;
13606 case IX86_BUILTIN_XSAVEOPT64
:
13607 icode
= CODE_FOR_xsaveopt64
;
13609 case IX86_BUILTIN_XSAVES
:
13610 icode
= CODE_FOR_xsaves_rex64
;
13612 case IX86_BUILTIN_XRSTORS
:
13613 icode
= CODE_FOR_xrstors_rex64
;
13615 case IX86_BUILTIN_XSAVES64
:
13616 icode
= CODE_FOR_xsaves64
;
13618 case IX86_BUILTIN_XRSTORS64
:
13619 icode
= CODE_FOR_xrstors64
;
13621 case IX86_BUILTIN_XSAVEC
:
13622 icode
= CODE_FOR_xsavec_rex64
;
13624 case IX86_BUILTIN_XSAVEC64
:
13625 icode
= CODE_FOR_xsavec64
;
13628 gcc_unreachable ();
13631 op2
= gen_lowpart (SImode
, op2
);
13632 op1
= gen_lowpart (SImode
, op1
);
13633 pat
= GEN_FCN (icode
) (op0
, op1
, op2
);
13639 case IX86_BUILTIN_XSAVE
:
13640 icode
= CODE_FOR_xsave
;
13642 case IX86_BUILTIN_XRSTOR
:
13643 icode
= CODE_FOR_xrstor
;
13645 case IX86_BUILTIN_XSAVEOPT
:
13646 icode
= CODE_FOR_xsaveopt
;
13648 case IX86_BUILTIN_XSAVES
:
13649 icode
= CODE_FOR_xsaves
;
13651 case IX86_BUILTIN_XRSTORS
:
13652 icode
= CODE_FOR_xrstors
;
13654 case IX86_BUILTIN_XSAVEC
:
13655 icode
= CODE_FOR_xsavec
;
13658 gcc_unreachable ();
13660 pat
= GEN_FCN (icode
) (op0
, op1
);
13667 case IX86_BUILTIN_LLWPCB
:
13668 arg0
= CALL_EXPR_ARG (exp
, 0);
13669 op0
= expand_normal (arg0
);
13671 if (!register_operand (op0
, Pmode
))
13672 op0
= ix86_zero_extend_to_Pmode (op0
);
13673 emit_insn (gen_lwp_llwpcb (Pmode
, op0
));
13676 case IX86_BUILTIN_SLWPCB
:
13678 || !register_operand (target
, Pmode
))
13679 target
= gen_reg_rtx (Pmode
);
13680 emit_insn (gen_lwp_slwpcb (Pmode
, target
));
13683 case IX86_BUILTIN_LWPVAL32
:
13684 case IX86_BUILTIN_LWPVAL64
:
13685 case IX86_BUILTIN_LWPINS32
:
13686 case IX86_BUILTIN_LWPINS64
:
13687 mode
= ((fcode
== IX86_BUILTIN_LWPVAL32
13688 || fcode
== IX86_BUILTIN_LWPINS32
)
13689 ? SImode
: DImode
);
13691 if (fcode
== IX86_BUILTIN_LWPVAL32
13692 || fcode
== IX86_BUILTIN_LWPVAL64
)
13693 icode
= code_for_lwp_lwpval (mode
);
13695 icode
= code_for_lwp_lwpins (mode
);
13697 arg0
= CALL_EXPR_ARG (exp
, 0);
13698 arg1
= CALL_EXPR_ARG (exp
, 1);
13699 arg2
= CALL_EXPR_ARG (exp
, 2);
13700 op0
= expand_normal (arg0
);
13701 op1
= expand_normal (arg1
);
13702 op2
= expand_normal (arg2
);
13703 mode0
= insn_data
[icode
].operand
[0].mode
;
13705 if (!insn_data
[icode
].operand
[0].predicate (op0
, mode0
))
13706 op0
= copy_to_mode_reg (mode0
, op0
);
13707 if (!insn_data
[icode
].operand
[1].predicate (op1
, SImode
))
13708 op1
= copy_to_mode_reg (SImode
, op1
);
13710 if (!CONST_INT_P (op2
))
13712 error ("the last argument must be a 32-bit immediate");
13716 emit_insn (GEN_FCN (icode
) (op0
, op1
, op2
));
13718 if (fcode
== IX86_BUILTIN_LWPINS32
13719 || fcode
== IX86_BUILTIN_LWPINS64
)
13722 || !nonimmediate_operand (target
, QImode
))
13723 target
= gen_reg_rtx (QImode
);
13725 pat
= gen_rtx_EQ (QImode
, gen_rtx_REG (CCCmode
, FLAGS_REG
),
13727 emit_insn (gen_rtx_SET (target
, pat
));
    case IX86_BUILTIN_BEXTRI32:
    case IX86_BUILTIN_BEXTRI64:
      mode = (fcode == IX86_BUILTIN_BEXTRI32 ? SImode : DImode);

      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);

      if (!CONST_INT_P (op1))
	{
	  error ("last argument must be an immediate");
	  return const0_rtx;
	}
      else
	{
	  unsigned char lsb_index = UINTVAL (op1);
	  unsigned char length = UINTVAL (op1) >> 8;

	  unsigned char bitsize = GET_MODE_BITSIZE (mode);

	  icode = code_for_tbm_bextri (mode);

	  mode1 = insn_data[icode].operand[1].mode;
	  if (!insn_data[icode].operand[1].predicate (op0, mode1))
	    op0 = copy_to_mode_reg (mode1, op0);

	  mode0 = insn_data[icode].operand[0].mode;
	  if (target == 0
	      || !register_operand (target, mode0))
	    target = gen_reg_rtx (mode0);

	  if (length == 0 || lsb_index >= bitsize)
	    {
	      emit_move_insn (target, const0_rtx);
	      return target;
	    }

	  if (length + lsb_index > bitsize)
	    length = bitsize - lsb_index;

	  op1 = GEN_INT (length);
	  op2 = GEN_INT (lsb_index);

	  emit_insn (GEN_FCN (icode) (target, op0, op1, op2));
	  return target;
	}
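      /* Illustrative usage sketch (not part of this file): the control
	 word packs the start bit in bits 7:0 and the field length in
	 bits 15:8, as decoded above, e.g.

	   unsigned int f = __builtin_ia32_bextri_u32 (x, 0x0804);

	 extracts 8 bits of x starting at bit 4.  */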
    case IX86_BUILTIN_RDRAND16_STEP:
      mode = HImode;
      goto rdrand_step;

    case IX86_BUILTIN_RDRAND32_STEP:
      mode = SImode;
      goto rdrand_step;

    case IX86_BUILTIN_RDRAND64_STEP:
      mode = DImode;

    rdrand_step:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op1 = expand_normal (arg0);
      if (!address_operand (op1, VOIDmode))
	{
	  op1 = convert_memory_address (Pmode, op1);
	  op1 = copy_addr_to_reg (op1);
	}

      op0 = gen_reg_rtx (mode);
      emit_insn (gen_rdrand (mode, op0));

      emit_move_insn (gen_rtx_MEM (mode, op1), op0);

      op1 = force_reg (SImode, const1_rtx);

      /* Emit SImode conditional move.  */
      if (mode == HImode)
	{
	  if (TARGET_ZERO_EXTEND_WITH_AND
	      && optimize_function_for_speed_p (cfun))
	    {
	      op2 = force_reg (SImode, const0_rtx);

	      emit_insn (gen_movstricthi
			 (gen_lowpart (HImode, op2), op0));
	    }
	  else
	    {
	      op2 = gen_reg_rtx (SImode);

	      emit_insn (gen_zero_extendhisi2 (op2, op0));
	    }
	}
      else if (mode == SImode)
	op2 = op0;
      else
	op2 = gen_rtx_SUBREG (SImode, op0, 0);

      if (target == 0
	  || !register_operand (target, SImode))
	target = gen_reg_rtx (SImode);

      pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
			 const0_rtx);
      emit_insn (gen_rtx_SET (target,
			      gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
      return target;
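      /* Illustrative usage sketch (not part of this file): callers typically
	 retry until the hardware has entropy available, e.g.

	   unsigned int r;
	   while (!__builtin_ia32_rdrand32_step (&r))
	     ;   // a zero result means no random value was delivered

	 which matches the conditional move emitted above.  */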
13842 case IX86_BUILTIN_RDSEED16_STEP
:
13846 case IX86_BUILTIN_RDSEED32_STEP
:
13850 case IX86_BUILTIN_RDSEED64_STEP
:
13854 arg0
= CALL_EXPR_ARG (exp
, 0);
13855 op1
= expand_normal (arg0
);
13856 if (!address_operand (op1
, VOIDmode
))
13858 op1
= convert_memory_address (Pmode
, op1
);
13859 op1
= copy_addr_to_reg (op1
);
13862 op0
= gen_reg_rtx (mode
);
13863 emit_insn (gen_rdseed (mode
, op0
));
13865 emit_move_insn (gen_rtx_MEM (mode
, op1
), op0
);
13867 op2
= gen_reg_rtx (QImode
);
13869 pat
= gen_rtx_LTU (QImode
, gen_rtx_REG (CCCmode
, FLAGS_REG
),
13871 emit_insn (gen_rtx_SET (op2
, pat
));
13874 || !register_operand (target
, SImode
))
13875 target
= gen_reg_rtx (SImode
);
13877 emit_insn (gen_zero_extendqisi2 (target
, op2
));
13880 case IX86_BUILTIN_SBB32
:
13881 icode
= CODE_FOR_subborrowsi
;
13882 icode2
= CODE_FOR_subborrowsi_0
;
13888 case IX86_BUILTIN_SBB64
:
13889 icode
= CODE_FOR_subborrowdi
;
13890 icode2
= CODE_FOR_subborrowdi_0
;
13896 case IX86_BUILTIN_ADDCARRYX32
:
13897 icode
= CODE_FOR_addcarrysi
;
13898 icode2
= CODE_FOR_addcarrysi_0
;
13904 case IX86_BUILTIN_ADDCARRYX64
:
13905 icode
= CODE_FOR_addcarrydi
;
13906 icode2
= CODE_FOR_addcarrydi_0
;
    handlecarry:
      arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in.  */
      arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1.  */
      arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2.  */
      arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out.  */

      op1 = expand_normal (arg0);
      if (!integer_zerop (arg0))
	op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));

      op2 = expand_normal (arg1);
      if (!register_operand (op2, mode0))
	op2 = copy_to_mode_reg (mode0, op2);

      op3 = expand_normal (arg2);
      if (!register_operand (op3, mode0))
	op3 = copy_to_mode_reg (mode0, op3);

      op4 = expand_normal (arg3);
      if (!address_operand (op4, VOIDmode))
	{
	  op4 = convert_memory_address (Pmode, op4);
	  op4 = copy_addr_to_reg (op4);
	}

      op0 = gen_reg_rtx (mode0);
      if (integer_zerop (arg0))
	{
	  /* If arg0 is 0, optimize right away into add or sub
	     instruction that sets CCCmode flags.  */
	  op1 = gen_rtx_REG (mode2, FLAGS_REG);
	  emit_insn (GEN_FCN (icode2) (op0, op2, op3));
	}
      else
	{
	  /* Generate CF from input operand.  */
	  emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));

	  /* Generate instruction that consumes CF.  */
	  op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
	  pat = gen_rtx_LTU (mode1, op1, const0_rtx);
	  pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
	  emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
	}

      /* Return current CF value.  */
      if (target == 0)
	target = gen_reg_rtx (QImode);

      pat = gen_rtx_LTU (QImode, op1, const0_rtx);
      emit_insn (gen_rtx_SET (target, pat));

      /* Store the result.  */
      emit_move_insn (gen_rtx_MEM (mode0, op4), op0);

      return target;
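      /* Illustrative usage sketch (not part of this file): the carry output
	 chains into the next call for multi-word arithmetic, e.g.

	   unsigned int lo, hi;
	   unsigned char c = _addcarry_u32 (0, a0, b0, &lo);
	   c = _addcarry_u32 (c, a1, b1, &hi);   // c is the final carry

	 The integer_zerop (arg0) shortcut above handles the first call.  */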
    case IX86_BUILTIN_READ_FLAGS:
      emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));

      if (optimize
	  || target == NULL_RTX
	  || !nonimmediate_operand (target, word_mode)
	  || GET_MODE (target) != word_mode)
	target = gen_reg_rtx (word_mode);

      emit_insn (gen_pop (target));
      return target;

    case IX86_BUILTIN_WRITE_FLAGS:

      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      if (!general_no_elim_operand (op0, word_mode))
	op0 = copy_to_mode_reg (word_mode, op0);

      emit_insn (gen_push (op0));
      emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
      return 0;
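      /* Illustrative usage sketch (not part of this file): these builtins
	 back the __readeflags/__writeeflags helpers, e.g.

	   unsigned long long f = __readeflags ();
	   __writeeflags (f);

	 implemented above as a push/pop of the flags register.  */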
13994 case IX86_BUILTIN_KTESTC8
:
13995 icode
= CODE_FOR_ktestqi
;
13999 case IX86_BUILTIN_KTESTZ8
:
14000 icode
= CODE_FOR_ktestqi
;
14004 case IX86_BUILTIN_KTESTC16
:
14005 icode
= CODE_FOR_ktesthi
;
14009 case IX86_BUILTIN_KTESTZ16
:
14010 icode
= CODE_FOR_ktesthi
;
14014 case IX86_BUILTIN_KTESTC32
:
14015 icode
= CODE_FOR_ktestsi
;
14019 case IX86_BUILTIN_KTESTZ32
:
14020 icode
= CODE_FOR_ktestsi
;
14024 case IX86_BUILTIN_KTESTC64
:
14025 icode
= CODE_FOR_ktestdi
;
14029 case IX86_BUILTIN_KTESTZ64
:
14030 icode
= CODE_FOR_ktestdi
;
14034 case IX86_BUILTIN_KORTESTC8
:
14035 icode
= CODE_FOR_kortestqi
;
14039 case IX86_BUILTIN_KORTESTZ8
:
14040 icode
= CODE_FOR_kortestqi
;
14044 case IX86_BUILTIN_KORTESTC16
:
14045 icode
= CODE_FOR_kortesthi
;
14049 case IX86_BUILTIN_KORTESTZ16
:
14050 icode
= CODE_FOR_kortesthi
;
14054 case IX86_BUILTIN_KORTESTC32
:
14055 icode
= CODE_FOR_kortestsi
;
14059 case IX86_BUILTIN_KORTESTZ32
:
14060 icode
= CODE_FOR_kortestsi
;
14064 case IX86_BUILTIN_KORTESTC64
:
14065 icode
= CODE_FOR_kortestdi
;
14069 case IX86_BUILTIN_KORTESTZ64
:
14070 icode
= CODE_FOR_kortestdi
;
14074 arg0
= CALL_EXPR_ARG (exp
, 0); /* Mask reg src1. */
14075 arg1
= CALL_EXPR_ARG (exp
, 1); /* Mask reg src2. */
14076 op0
= expand_normal (arg0
);
14077 op1
= expand_normal (arg1
);
14079 mode0
= insn_data
[icode
].operand
[0].mode
;
14080 mode1
= insn_data
[icode
].operand
[1].mode
;
14082 if (GET_MODE (op0
) != VOIDmode
)
14083 op0
= force_reg (GET_MODE (op0
), op0
);
14085 op0
= gen_lowpart (mode0
, op0
);
14087 if (!insn_data
[icode
].operand
[0].predicate (op0
, mode0
))
14088 op0
= copy_to_mode_reg (mode0
, op0
);
14090 if (GET_MODE (op1
) != VOIDmode
)
14091 op1
= force_reg (GET_MODE (op1
), op1
);
14093 op1
= gen_lowpart (mode1
, op1
);
14095 if (!insn_data
[icode
].operand
[1].predicate (op1
, mode1
))
14096 op1
= copy_to_mode_reg (mode1
, op1
);
14098 target
= gen_reg_rtx (QImode
);
14100 /* Emit kortest. */
14101 emit_insn (GEN_FCN (icode
) (op0
, op1
));
14102 /* And use setcc to return result from flags. */
14103 ix86_expand_setcc (target
, EQ
,
14104 gen_rtx_REG (mode3
, FLAGS_REG
), const0_rtx
);
14107 case IX86_BUILTIN_GATHERSIV2DF
:
14108 icode
= CODE_FOR_avx2_gathersiv2df
;
14110 case IX86_BUILTIN_GATHERSIV4DF
:
14111 icode
= CODE_FOR_avx2_gathersiv4df
;
14113 case IX86_BUILTIN_GATHERDIV2DF
:
14114 icode
= CODE_FOR_avx2_gatherdiv2df
;
14116 case IX86_BUILTIN_GATHERDIV4DF
:
14117 icode
= CODE_FOR_avx2_gatherdiv4df
;
14119 case IX86_BUILTIN_GATHERSIV4SF
:
14120 icode
= CODE_FOR_avx2_gathersiv4sf
;
14122 case IX86_BUILTIN_GATHERSIV8SF
:
14123 icode
= CODE_FOR_avx2_gathersiv8sf
;
14125 case IX86_BUILTIN_GATHERDIV4SF
:
14126 icode
= CODE_FOR_avx2_gatherdiv4sf
;
14128 case IX86_BUILTIN_GATHERDIV8SF
:
14129 icode
= CODE_FOR_avx2_gatherdiv8sf
;
14131 case IX86_BUILTIN_GATHERSIV2DI
:
14132 icode
= CODE_FOR_avx2_gathersiv2di
;
14134 case IX86_BUILTIN_GATHERSIV4DI
:
14135 icode
= CODE_FOR_avx2_gathersiv4di
;
14137 case IX86_BUILTIN_GATHERDIV2DI
:
14138 icode
= CODE_FOR_avx2_gatherdiv2di
;
14140 case IX86_BUILTIN_GATHERDIV4DI
:
14141 icode
= CODE_FOR_avx2_gatherdiv4di
;
14143 case IX86_BUILTIN_GATHERSIV4SI
:
14144 icode
= CODE_FOR_avx2_gathersiv4si
;
14146 case IX86_BUILTIN_GATHERSIV8SI
:
14147 icode
= CODE_FOR_avx2_gathersiv8si
;
14149 case IX86_BUILTIN_GATHERDIV4SI
:
14150 icode
= CODE_FOR_avx2_gatherdiv4si
;
14152 case IX86_BUILTIN_GATHERDIV8SI
:
14153 icode
= CODE_FOR_avx2_gatherdiv8si
;
14155 case IX86_BUILTIN_GATHERALTSIV4DF
:
14156 icode
= CODE_FOR_avx2_gathersiv4df
;
14158 case IX86_BUILTIN_GATHERALTDIV8SF
:
14159 icode
= CODE_FOR_avx2_gatherdiv8sf
;
14161 case IX86_BUILTIN_GATHERALTSIV4DI
:
14162 icode
= CODE_FOR_avx2_gathersiv4di
;
14164 case IX86_BUILTIN_GATHERALTDIV8SI
:
14165 icode
= CODE_FOR_avx2_gatherdiv8si
;
14167 case IX86_BUILTIN_GATHER3SIV16SF
:
14168 icode
= CODE_FOR_avx512f_gathersiv16sf
;
14170 case IX86_BUILTIN_GATHER3SIV8DF
:
14171 icode
= CODE_FOR_avx512f_gathersiv8df
;
14173 case IX86_BUILTIN_GATHER3DIV16SF
:
14174 icode
= CODE_FOR_avx512f_gatherdiv16sf
;
14176 case IX86_BUILTIN_GATHER3DIV8DF
:
14177 icode
= CODE_FOR_avx512f_gatherdiv8df
;
14179 case IX86_BUILTIN_GATHER3SIV16SI
:
14180 icode
= CODE_FOR_avx512f_gathersiv16si
;
14182 case IX86_BUILTIN_GATHER3SIV8DI
:
14183 icode
= CODE_FOR_avx512f_gathersiv8di
;
14185 case IX86_BUILTIN_GATHER3DIV16SI
:
14186 icode
= CODE_FOR_avx512f_gatherdiv16si
;
14188 case IX86_BUILTIN_GATHER3DIV8DI
:
14189 icode
= CODE_FOR_avx512f_gatherdiv8di
;
14191 case IX86_BUILTIN_GATHER3ALTSIV8DF
:
14192 icode
= CODE_FOR_avx512f_gathersiv8df
;
14194 case IX86_BUILTIN_GATHER3ALTDIV16SF
:
14195 icode
= CODE_FOR_avx512f_gatherdiv16sf
;
14197 case IX86_BUILTIN_GATHER3ALTSIV8DI
:
14198 icode
= CODE_FOR_avx512f_gathersiv8di
;
14200 case IX86_BUILTIN_GATHER3ALTDIV16SI
:
14201 icode
= CODE_FOR_avx512f_gatherdiv16si
;
14203 case IX86_BUILTIN_GATHER3SIV2DF
:
14204 icode
= CODE_FOR_avx512vl_gathersiv2df
;
14206 case IX86_BUILTIN_GATHER3SIV4DF
:
14207 icode
= CODE_FOR_avx512vl_gathersiv4df
;
14209 case IX86_BUILTIN_GATHER3DIV2DF
:
14210 icode
= CODE_FOR_avx512vl_gatherdiv2df
;
14212 case IX86_BUILTIN_GATHER3DIV4DF
:
14213 icode
= CODE_FOR_avx512vl_gatherdiv4df
;
14215 case IX86_BUILTIN_GATHER3SIV4SF
:
14216 icode
= CODE_FOR_avx512vl_gathersiv4sf
;
14218 case IX86_BUILTIN_GATHER3SIV8SF
:
14219 icode
= CODE_FOR_avx512vl_gathersiv8sf
;
14221 case IX86_BUILTIN_GATHER3DIV4SF
:
14222 icode
= CODE_FOR_avx512vl_gatherdiv4sf
;
14224 case IX86_BUILTIN_GATHER3DIV8SF
:
14225 icode
= CODE_FOR_avx512vl_gatherdiv8sf
;
14227 case IX86_BUILTIN_GATHER3SIV2DI
:
14228 icode
= CODE_FOR_avx512vl_gathersiv2di
;
14230 case IX86_BUILTIN_GATHER3SIV4DI
:
14231 icode
= CODE_FOR_avx512vl_gathersiv4di
;
14233 case IX86_BUILTIN_GATHER3DIV2DI
:
14234 icode
= CODE_FOR_avx512vl_gatherdiv2di
;
14236 case IX86_BUILTIN_GATHER3DIV4DI
:
14237 icode
= CODE_FOR_avx512vl_gatherdiv4di
;
14239 case IX86_BUILTIN_GATHER3SIV4SI
:
14240 icode
= CODE_FOR_avx512vl_gathersiv4si
;
14242 case IX86_BUILTIN_GATHER3SIV8SI
:
14243 icode
= CODE_FOR_avx512vl_gathersiv8si
;
14245 case IX86_BUILTIN_GATHER3DIV4SI
:
14246 icode
= CODE_FOR_avx512vl_gatherdiv4si
;
14248 case IX86_BUILTIN_GATHER3DIV8SI
:
14249 icode
= CODE_FOR_avx512vl_gatherdiv8si
;
14251 case IX86_BUILTIN_GATHER3ALTSIV4DF
:
14252 icode
= CODE_FOR_avx512vl_gathersiv4df
;
14254 case IX86_BUILTIN_GATHER3ALTDIV8SF
:
14255 icode
= CODE_FOR_avx512vl_gatherdiv8sf
;
14257 case IX86_BUILTIN_GATHER3ALTSIV4DI
:
14258 icode
= CODE_FOR_avx512vl_gathersiv4di
;
14260 case IX86_BUILTIN_GATHER3ALTDIV8SI
:
14261 icode
= CODE_FOR_avx512vl_gatherdiv8si
;
14263 case IX86_BUILTIN_SCATTERSIV16SF
:
14264 icode
= CODE_FOR_avx512f_scattersiv16sf
;
14266 case IX86_BUILTIN_SCATTERSIV8DF
:
14267 icode
= CODE_FOR_avx512f_scattersiv8df
;
14269 case IX86_BUILTIN_SCATTERDIV16SF
:
14270 icode
= CODE_FOR_avx512f_scatterdiv16sf
;
14272 case IX86_BUILTIN_SCATTERDIV8DF
:
14273 icode
= CODE_FOR_avx512f_scatterdiv8df
;
14275 case IX86_BUILTIN_SCATTERSIV16SI
:
14276 icode
= CODE_FOR_avx512f_scattersiv16si
;
14278 case IX86_BUILTIN_SCATTERSIV8DI
:
14279 icode
= CODE_FOR_avx512f_scattersiv8di
;
14281 case IX86_BUILTIN_SCATTERDIV16SI
:
14282 icode
= CODE_FOR_avx512f_scatterdiv16si
;
14284 case IX86_BUILTIN_SCATTERDIV8DI
:
14285 icode
= CODE_FOR_avx512f_scatterdiv8di
;
14287 case IX86_BUILTIN_SCATTERSIV8SF
:
14288 icode
= CODE_FOR_avx512vl_scattersiv8sf
;
14290 case IX86_BUILTIN_SCATTERSIV4SF
:
14291 icode
= CODE_FOR_avx512vl_scattersiv4sf
;
14293 case IX86_BUILTIN_SCATTERSIV4DF
:
14294 icode
= CODE_FOR_avx512vl_scattersiv4df
;
14296 case IX86_BUILTIN_SCATTERSIV2DF
:
14297 icode
= CODE_FOR_avx512vl_scattersiv2df
;
14299 case IX86_BUILTIN_SCATTERDIV8SF
:
14300 icode
= CODE_FOR_avx512vl_scatterdiv8sf
;
14302 case IX86_BUILTIN_SCATTERDIV4SF
:
14303 icode
= CODE_FOR_avx512vl_scatterdiv4sf
;
14305 case IX86_BUILTIN_SCATTERDIV4DF
:
14306 icode
= CODE_FOR_avx512vl_scatterdiv4df
;
14308 case IX86_BUILTIN_SCATTERDIV2DF
:
14309 icode
= CODE_FOR_avx512vl_scatterdiv2df
;
14311 case IX86_BUILTIN_SCATTERSIV8SI
:
14312 icode
= CODE_FOR_avx512vl_scattersiv8si
;
14314 case IX86_BUILTIN_SCATTERSIV4SI
:
14315 icode
= CODE_FOR_avx512vl_scattersiv4si
;
14317 case IX86_BUILTIN_SCATTERSIV4DI
:
14318 icode
= CODE_FOR_avx512vl_scattersiv4di
;
14320 case IX86_BUILTIN_SCATTERSIV2DI
:
14321 icode
= CODE_FOR_avx512vl_scattersiv2di
;
14323 case IX86_BUILTIN_SCATTERDIV8SI
:
14324 icode
= CODE_FOR_avx512vl_scatterdiv8si
;
14326 case IX86_BUILTIN_SCATTERDIV4SI
:
14327 icode
= CODE_FOR_avx512vl_scatterdiv4si
;
14329 case IX86_BUILTIN_SCATTERDIV4DI
:
14330 icode
= CODE_FOR_avx512vl_scatterdiv4di
;
14332 case IX86_BUILTIN_SCATTERDIV2DI
:
14333 icode
= CODE_FOR_avx512vl_scatterdiv2di
;
14335 case IX86_BUILTIN_GATHERPFDPD
:
14336 icode
= CODE_FOR_avx512pf_gatherpfv8sidf
;
14337 goto vec_prefetch_gen
;
14338 case IX86_BUILTIN_SCATTERALTSIV8DF
:
14339 icode
= CODE_FOR_avx512f_scattersiv8df
;
14341 case IX86_BUILTIN_SCATTERALTDIV16SF
:
14342 icode
= CODE_FOR_avx512f_scatterdiv16sf
;
14344 case IX86_BUILTIN_SCATTERALTSIV8DI
:
14345 icode
= CODE_FOR_avx512f_scattersiv8di
;
14347 case IX86_BUILTIN_SCATTERALTDIV16SI
:
14348 icode
= CODE_FOR_avx512f_scatterdiv16si
;
14350 case IX86_BUILTIN_SCATTERALTSIV4DF
:
14351 icode
= CODE_FOR_avx512vl_scattersiv4df
;
14353 case IX86_BUILTIN_SCATTERALTDIV8SF
:
14354 icode
= CODE_FOR_avx512vl_scatterdiv8sf
;
14356 case IX86_BUILTIN_SCATTERALTSIV4DI
:
14357 icode
= CODE_FOR_avx512vl_scattersiv4di
;
14359 case IX86_BUILTIN_SCATTERALTDIV8SI
:
14360 icode
= CODE_FOR_avx512vl_scatterdiv8si
;
14362 case IX86_BUILTIN_SCATTERALTSIV2DF
:
14363 icode
= CODE_FOR_avx512vl_scattersiv2df
;
14365 case IX86_BUILTIN_SCATTERALTDIV4SF
:
14366 icode
= CODE_FOR_avx512vl_scatterdiv4sf
;
14368 case IX86_BUILTIN_SCATTERALTSIV2DI
:
14369 icode
= CODE_FOR_avx512vl_scattersiv2di
;
14371 case IX86_BUILTIN_SCATTERALTDIV4SI
:
14372 icode
= CODE_FOR_avx512vl_scatterdiv4si
;
14374 case IX86_BUILTIN_GATHERPFDPS
:
14375 icode
= CODE_FOR_avx512pf_gatherpfv16sisf
;
14376 goto vec_prefetch_gen
;
14377 case IX86_BUILTIN_GATHERPFQPD
:
14378 icode
= CODE_FOR_avx512pf_gatherpfv8didf
;
14379 goto vec_prefetch_gen
;
14380 case IX86_BUILTIN_GATHERPFQPS
:
14381 icode
= CODE_FOR_avx512pf_gatherpfv8disf
;
14382 goto vec_prefetch_gen
;
14383 case IX86_BUILTIN_SCATTERPFDPD
:
14384 icode
= CODE_FOR_avx512pf_scatterpfv8sidf
;
14385 goto vec_prefetch_gen
;
14386 case IX86_BUILTIN_SCATTERPFDPS
:
14387 icode
= CODE_FOR_avx512pf_scatterpfv16sisf
;
14388 goto vec_prefetch_gen
;
14389 case IX86_BUILTIN_SCATTERPFQPD
:
14390 icode
= CODE_FOR_avx512pf_scatterpfv8didf
;
14391 goto vec_prefetch_gen
;
14392 case IX86_BUILTIN_SCATTERPFQPS
:
14393 icode
= CODE_FOR_avx512pf_scatterpfv8disf
;
14394 goto vec_prefetch_gen
;
14398 rtx (*gen
) (rtx
, rtx
);
14400 arg0
= CALL_EXPR_ARG (exp
, 0);
14401 arg1
= CALL_EXPR_ARG (exp
, 1);
14402 arg2
= CALL_EXPR_ARG (exp
, 2);
14403 arg3
= CALL_EXPR_ARG (exp
, 3);
14404 arg4
= CALL_EXPR_ARG (exp
, 4);
14405 op0
= expand_normal (arg0
);
14406 op1
= expand_normal (arg1
);
14407 op2
= expand_normal (arg2
);
14408 op3
= expand_normal (arg3
);
14409 op4
= expand_normal (arg4
);
14410 /* Note the arg order is different from the operand order. */
14411 mode0
= insn_data
[icode
].operand
[1].mode
;
14412 mode2
= insn_data
[icode
].operand
[3].mode
;
14413 mode3
= insn_data
[icode
].operand
[4].mode
;
14414 mode4
= insn_data
[icode
].operand
[5].mode
;
14416 if (target
== NULL_RTX
14417 || GET_MODE (target
) != insn_data
[icode
].operand
[0].mode
14418 || !insn_data
[icode
].operand
[0].predicate (target
,
14419 GET_MODE (target
)))
14420 subtarget
= gen_reg_rtx (insn_data
[icode
].operand
[0].mode
);
14422 subtarget
= target
;
14426 case IX86_BUILTIN_GATHER3ALTSIV8DF
:
14427 case IX86_BUILTIN_GATHER3ALTSIV8DI
:
14428 half
= gen_reg_rtx (V8SImode
);
14429 if (!nonimmediate_operand (op2
, V16SImode
))
14430 op2
= copy_to_mode_reg (V16SImode
, op2
);
14431 emit_insn (gen_vec_extract_lo_v16si (half
, op2
));
14434 case IX86_BUILTIN_GATHER3ALTSIV4DF
:
14435 case IX86_BUILTIN_GATHER3ALTSIV4DI
:
14436 case IX86_BUILTIN_GATHERALTSIV4DF
:
14437 case IX86_BUILTIN_GATHERALTSIV4DI
:
14438 half
= gen_reg_rtx (V4SImode
);
14439 if (!nonimmediate_operand (op2
, V8SImode
))
14440 op2
= copy_to_mode_reg (V8SImode
, op2
);
14441 emit_insn (gen_vec_extract_lo_v8si (half
, op2
));
14444 case IX86_BUILTIN_GATHER3ALTDIV16SF
:
14445 case IX86_BUILTIN_GATHER3ALTDIV16SI
:
14446 half
= gen_reg_rtx (mode0
);
14447 if (mode0
== V8SFmode
)
14448 gen
= gen_vec_extract_lo_v16sf
;
14450 gen
= gen_vec_extract_lo_v16si
;
14451 if (!nonimmediate_operand (op0
, GET_MODE (op0
)))
14452 op0
= copy_to_mode_reg (GET_MODE (op0
), op0
);
14453 emit_insn (gen (half
, op0
));
14455 op3
= lowpart_subreg (QImode
, op3
, HImode
);
14457 case IX86_BUILTIN_GATHER3ALTDIV8SF
:
14458 case IX86_BUILTIN_GATHER3ALTDIV8SI
:
14459 case IX86_BUILTIN_GATHERALTDIV8SF
:
14460 case IX86_BUILTIN_GATHERALTDIV8SI
:
14461 half
= gen_reg_rtx (mode0
);
14462 if (mode0
== V4SFmode
)
14463 gen
= gen_vec_extract_lo_v8sf
;
14465 gen
= gen_vec_extract_lo_v8si
;
14466 if (!nonimmediate_operand (op0
, GET_MODE (op0
)))
14467 op0
= copy_to_mode_reg (GET_MODE (op0
), op0
);
14468 emit_insn (gen (half
, op0
));
14470 if (VECTOR_MODE_P (GET_MODE (op3
)))
14472 half
= gen_reg_rtx (mode0
);
14473 if (!nonimmediate_operand (op3
, GET_MODE (op3
)))
14474 op3
= copy_to_mode_reg (GET_MODE (op3
), op3
);
14475 emit_insn (gen (half
, op3
));
      /* Force memory operand only with base register here.  But we
	 don't want to do it on memory operand for other builtin
	 functions.  */
      op1 = ix86_zero_extend_to_Pmode (op1);
14488 if (!insn_data
[icode
].operand
[1].predicate (op0
, mode0
))
14489 op0
= copy_to_mode_reg (mode0
, op0
);
14490 if (!insn_data
[icode
].operand
[2].predicate (op1
, Pmode
))
14491 op1
= copy_to_mode_reg (Pmode
, op1
);
14492 if (!insn_data
[icode
].operand
[3].predicate (op2
, mode2
))
14493 op2
= copy_to_mode_reg (mode2
, op2
);
14495 op3
= fixup_modeless_constant (op3
, mode3
);
14497 if (GET_MODE (op3
) == mode3
|| GET_MODE (op3
) == VOIDmode
)
14499 if (!insn_data
[icode
].operand
[4].predicate (op3
, mode3
))
14500 op3
= copy_to_mode_reg (mode3
, op3
);
14504 op3
= copy_to_reg (op3
);
14505 op3
= lowpart_subreg (mode3
, op3
, GET_MODE (op3
));
14507 if (!insn_data
[icode
].operand
[5].predicate (op4
, mode4
))
14509 error ("the last argument must be scale 1, 2, 4, 8");
      /* Optimize.  If mask is known to have all high bits set,
	 replace op0 with pc_rtx to signal that the instruction
	 overwrites the whole destination and doesn't use its
	 previous contents.  */
14519 if (TREE_CODE (arg3
) == INTEGER_CST
)
14521 if (integer_all_onesp (arg3
))
14524 else if (TREE_CODE (arg3
) == VECTOR_CST
)
14526 unsigned int negative
= 0;
14527 for (i
= 0; i
< VECTOR_CST_NELTS (arg3
); ++i
)
14529 tree cst
= VECTOR_CST_ELT (arg3
, i
);
14530 if (TREE_CODE (cst
) == INTEGER_CST
14531 && tree_int_cst_sign_bit (cst
))
14533 else if (TREE_CODE (cst
) == REAL_CST
14534 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst
)))
14537 if (negative
== TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3
)))
14540 else if (TREE_CODE (arg3
) == SSA_NAME
14541 && TREE_CODE (TREE_TYPE (arg3
)) == VECTOR_TYPE
)
	  /* Recognize also when mask is like:
	       __v2df src = _mm_setzero_pd ();
	       __v2df mask = _mm_cmpeq_pd (src, src);
	     or
	       __v8sf src = _mm256_setzero_ps ();
	       __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
	     as that is a cheaper way to load all ones into
	     a register than having to load a constant from
	     memory.  */
	  gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
14553 if (is_gimple_call (def_stmt
))
14555 tree fndecl
= gimple_call_fndecl (def_stmt
);
14557 && fndecl_built_in_p (fndecl
, BUILT_IN_MD
))
14558 switch (DECL_MD_FUNCTION_CODE (fndecl
))
14560 case IX86_BUILTIN_CMPPD
:
14561 case IX86_BUILTIN_CMPPS
:
14562 case IX86_BUILTIN_CMPPD256
:
14563 case IX86_BUILTIN_CMPPS256
:
14564 if (!integer_zerop (gimple_call_arg (def_stmt
, 2)))
14567 case IX86_BUILTIN_CMPEQPD
:
14568 case IX86_BUILTIN_CMPEQPS
:
14569 if (initializer_zerop (gimple_call_arg (def_stmt
, 0))
14570 && initializer_zerop (gimple_call_arg (def_stmt
,
14581 pat
= GEN_FCN (icode
) (subtarget
, op0
, op1
, op2
, op3
, op4
);
14588 case IX86_BUILTIN_GATHER3DIV16SF
:
14589 if (target
== NULL_RTX
)
14590 target
= gen_reg_rtx (V8SFmode
);
14591 emit_insn (gen_vec_extract_lo_v16sf (target
, subtarget
));
14593 case IX86_BUILTIN_GATHER3DIV16SI
:
14594 if (target
== NULL_RTX
)
14595 target
= gen_reg_rtx (V8SImode
);
14596 emit_insn (gen_vec_extract_lo_v16si (target
, subtarget
));
14598 case IX86_BUILTIN_GATHER3DIV8SF
:
14599 case IX86_BUILTIN_GATHERDIV8SF
:
14600 if (target
== NULL_RTX
)
14601 target
= gen_reg_rtx (V4SFmode
);
14602 emit_insn (gen_vec_extract_lo_v8sf (target
, subtarget
));
14604 case IX86_BUILTIN_GATHER3DIV8SI
:
14605 case IX86_BUILTIN_GATHERDIV8SI
:
14606 if (target
== NULL_RTX
)
14607 target
= gen_reg_rtx (V4SImode
);
14608 emit_insn (gen_vec_extract_lo_v8si (target
, subtarget
));
14611 target
= subtarget
;
14617 arg0
= CALL_EXPR_ARG (exp
, 0);
14618 arg1
= CALL_EXPR_ARG (exp
, 1);
14619 arg2
= CALL_EXPR_ARG (exp
, 2);
14620 arg3
= CALL_EXPR_ARG (exp
, 3);
14621 arg4
= CALL_EXPR_ARG (exp
, 4);
14622 op0
= expand_normal (arg0
);
14623 op1
= expand_normal (arg1
);
14624 op2
= expand_normal (arg2
);
14625 op3
= expand_normal (arg3
);
14626 op4
= expand_normal (arg4
);
14627 mode1
= insn_data
[icode
].operand
[1].mode
;
14628 mode2
= insn_data
[icode
].operand
[2].mode
;
14629 mode3
= insn_data
[icode
].operand
[3].mode
;
14630 mode4
= insn_data
[icode
].operand
[4].mode
;
      /* The scatter instruction stores operand op3 to memory with
	 indices from op2 and scale from op4 under writemask op1.
	 If index operand op2 has more elements than source operand
	 op3, only its low half needs to be used, and vice versa.  */
14638 case IX86_BUILTIN_SCATTERALTSIV8DF
:
14639 case IX86_BUILTIN_SCATTERALTSIV8DI
:
14640 half
= gen_reg_rtx (V8SImode
);
14641 if (!nonimmediate_operand (op2
, V16SImode
))
14642 op2
= copy_to_mode_reg (V16SImode
, op2
);
14643 emit_insn (gen_vec_extract_lo_v16si (half
, op2
));
14646 case IX86_BUILTIN_SCATTERALTDIV16SF
:
14647 case IX86_BUILTIN_SCATTERALTDIV16SI
:
14648 half
= gen_reg_rtx (mode3
);
14649 if (mode3
== V8SFmode
)
14650 gen
= gen_vec_extract_lo_v16sf
;
14652 gen
= gen_vec_extract_lo_v16si
;
14653 if (!nonimmediate_operand (op3
, GET_MODE (op3
)))
14654 op3
= copy_to_mode_reg (GET_MODE (op3
), op3
);
14655 emit_insn (gen (half
, op3
));
14658 case IX86_BUILTIN_SCATTERALTSIV4DF
:
14659 case IX86_BUILTIN_SCATTERALTSIV4DI
:
14660 half
= gen_reg_rtx (V4SImode
);
14661 if (!nonimmediate_operand (op2
, V8SImode
))
14662 op2
= copy_to_mode_reg (V8SImode
, op2
);
14663 emit_insn (gen_vec_extract_lo_v8si (half
, op2
));
14666 case IX86_BUILTIN_SCATTERALTDIV8SF
:
14667 case IX86_BUILTIN_SCATTERALTDIV8SI
:
14668 half
= gen_reg_rtx (mode3
);
14669 if (mode3
== V4SFmode
)
14670 gen
= gen_vec_extract_lo_v8sf
;
14672 gen
= gen_vec_extract_lo_v8si
;
14673 if (!nonimmediate_operand (op3
, GET_MODE (op3
)))
14674 op3
= copy_to_mode_reg (GET_MODE (op3
), op3
);
14675 emit_insn (gen (half
, op3
));
14678 case IX86_BUILTIN_SCATTERALTSIV2DF
:
14679 case IX86_BUILTIN_SCATTERALTSIV2DI
:
14680 if (!nonimmediate_operand (op2
, V4SImode
))
14681 op2
= copy_to_mode_reg (V4SImode
, op2
);
14683 case IX86_BUILTIN_SCATTERALTDIV4SF
:
14684 case IX86_BUILTIN_SCATTERALTDIV4SI
:
14685 if (!nonimmediate_operand (op3
, GET_MODE (op3
)))
14686 op3
= copy_to_mode_reg (GET_MODE (op3
), op3
);
      /* Force memory operand only with base register here.  But we
	 don't want to do it on memory operand for other builtin
	 functions.  */
      op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
14697 if (!insn_data
[icode
].operand
[0].predicate (op0
, Pmode
))
14698 op0
= copy_to_mode_reg (Pmode
, op0
);
14700 op1
= fixup_modeless_constant (op1
, mode1
);
14702 if (GET_MODE (op1
) == mode1
|| GET_MODE (op1
) == VOIDmode
)
14704 if (!insn_data
[icode
].operand
[1].predicate (op1
, mode1
))
14705 op1
= copy_to_mode_reg (mode1
, op1
);
14709 op1
= copy_to_reg (op1
);
14710 op1
= lowpart_subreg (mode1
, op1
, GET_MODE (op1
));
14713 if (!insn_data
[icode
].operand
[2].predicate (op2
, mode2
))
14714 op2
= copy_to_mode_reg (mode2
, op2
);
14716 if (!insn_data
[icode
].operand
[3].predicate (op3
, mode3
))
14717 op3
= copy_to_mode_reg (mode3
, op3
);
14719 if (!insn_data
[icode
].operand
[4].predicate (op4
, mode4
))
14721 error ("the last argument must be scale 1, 2, 4, 8");
14725 pat
= GEN_FCN (icode
) (op0
, op1
, op2
, op3
, op4
);
14733 arg0
= CALL_EXPR_ARG (exp
, 0);
14734 arg1
= CALL_EXPR_ARG (exp
, 1);
14735 arg2
= CALL_EXPR_ARG (exp
, 2);
14736 arg3
= CALL_EXPR_ARG (exp
, 3);
14737 arg4
= CALL_EXPR_ARG (exp
, 4);
14738 op0
= expand_normal (arg0
);
14739 op1
= expand_normal (arg1
);
14740 op2
= expand_normal (arg2
);
14741 op3
= expand_normal (arg3
);
14742 op4
= expand_normal (arg4
);
14743 mode0
= insn_data
[icode
].operand
[0].mode
;
14744 mode1
= insn_data
[icode
].operand
[1].mode
;
14745 mode3
= insn_data
[icode
].operand
[3].mode
;
14746 mode4
= insn_data
[icode
].operand
[4].mode
;
14748 op0
= fixup_modeless_constant (op0
, mode0
);
14750 if (GET_MODE (op0
) == mode0
|| GET_MODE (op0
) == VOIDmode
)
14752 if (!insn_data
[icode
].operand
[0].predicate (op0
, mode0
))
14753 op0
= copy_to_mode_reg (mode0
, op0
);
14757 op0
= copy_to_reg (op0
);
14758 op0
= lowpart_subreg (mode0
, op0
, GET_MODE (op0
));
14761 if (!insn_data
[icode
].operand
[1].predicate (op1
, mode1
))
14762 op1
= copy_to_mode_reg (mode1
, op1
);
      /* Force memory operand only with base register here.  But we
	 don't want to do it on memory operand for other builtin
	 functions.  */
      op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
14769 if (!insn_data
[icode
].operand
[2].predicate (op2
, Pmode
))
14770 op2
= copy_to_mode_reg (Pmode
, op2
);
14772 if (!insn_data
[icode
].operand
[3].predicate (op3
, mode3
))
14774 error ("the forth argument must be scale 1, 2, 4, 8");
14778 if (!insn_data
[icode
].operand
[4].predicate (op4
, mode4
))
14780 error ("incorrect hint operand");
14784 pat
= GEN_FCN (icode
) (op0
, op1
, op2
, op3
, op4
);
14792 case IX86_BUILTIN_XABORT
:
14793 icode
= CODE_FOR_xabort
;
14794 arg0
= CALL_EXPR_ARG (exp
, 0);
14795 op0
= expand_normal (arg0
);
14796 mode0
= insn_data
[icode
].operand
[0].mode
;
14797 if (!insn_data
[icode
].operand
[0].predicate (op0
, mode0
))
14799 error ("the argument to %<xabort%> intrinsic must "
14800 "be an 8-bit immediate");
14803 emit_insn (gen_xabort (op0
));
14806 case IX86_BUILTIN_RDSSPD
:
14807 case IX86_BUILTIN_RDSSPQ
:
14808 mode
= (fcode
== IX86_BUILTIN_RDSSPD
? SImode
: DImode
);
14811 || !register_operand (target
, mode
))
14812 target
= gen_reg_rtx (mode
);
14814 op0
= force_reg (mode
, const0_rtx
);
14816 emit_insn (gen_rdssp (mode
, target
, op0
));
14819 case IX86_BUILTIN_INCSSPD
:
14820 case IX86_BUILTIN_INCSSPQ
:
14821 mode
= (fcode
== IX86_BUILTIN_INCSSPD
? SImode
: DImode
);
14823 arg0
= CALL_EXPR_ARG (exp
, 0);
14824 op0
= expand_normal (arg0
);
14826 op0
= force_reg (mode
, op0
);
14828 emit_insn (gen_incssp (mode
, op0
));
14831 case IX86_BUILTIN_HRESET
:
14832 icode
= CODE_FOR_hreset
;
14833 arg0
= CALL_EXPR_ARG (exp
, 0);
14834 op0
= expand_normal (arg0
);
14835 op0
= force_reg (SImode
, op0
);
14836 emit_insn (gen_hreset (op0
));
14839 case IX86_BUILTIN_RSTORSSP
:
14840 case IX86_BUILTIN_CLRSSBSY
:
14841 arg0
= CALL_EXPR_ARG (exp
, 0);
14842 op0
= expand_normal (arg0
);
14843 icode
= (fcode
== IX86_BUILTIN_RSTORSSP
14844 ? CODE_FOR_rstorssp
14845 : CODE_FOR_clrssbsy
);
14847 if (!address_operand (op0
, VOIDmode
))
14849 op0
= convert_memory_address (Pmode
, op0
);
14850 op0
= copy_addr_to_reg (op0
);
14852 emit_insn (GEN_FCN (icode
) (gen_rtx_MEM (DImode
, op0
)));
14855 case IX86_BUILTIN_WRSSD
:
14856 case IX86_BUILTIN_WRSSQ
:
14857 case IX86_BUILTIN_WRUSSD
:
14858 case IX86_BUILTIN_WRUSSQ
:
14859 mode
= ((fcode
== IX86_BUILTIN_WRSSD
14860 || fcode
== IX86_BUILTIN_WRUSSD
)
14861 ? SImode
: DImode
);
14863 arg0
= CALL_EXPR_ARG (exp
, 0);
14864 op0
= expand_normal (arg0
);
14865 arg1
= CALL_EXPR_ARG (exp
, 1);
14866 op1
= expand_normal (arg1
);
14868 op0
= force_reg (mode
, op0
);
14870 if (!address_operand (op1
, VOIDmode
))
14872 op1
= convert_memory_address (Pmode
, op1
);
14873 op1
= copy_addr_to_reg (op1
);
14875 op1
= gen_rtx_MEM (mode
, op1
);
14877 icode
= ((fcode
== IX86_BUILTIN_WRSSD
14878 || fcode
== IX86_BUILTIN_WRSSQ
)
14879 ? code_for_wrss (mode
)
14880 : code_for_wruss (mode
));
14881 emit_insn (GEN_FCN (icode
) (op0
, op1
));
14889 if (fcode
>= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
14890 && fcode
<= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST
)
14892 i
= fcode
- IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
;
14893 return ix86_expand_special_args_builtin (bdesc_special_args
+ i
, exp
,
14897 if (fcode
>= IX86_BUILTIN__BDESC_PURE_ARGS_FIRST
14898 && fcode
<= IX86_BUILTIN__BDESC_PURE_ARGS_LAST
)
14900 i
= fcode
- IX86_BUILTIN__BDESC_PURE_ARGS_FIRST
;
14901 return ix86_expand_special_args_builtin (bdesc_pure_args
+ i
, exp
,
14905 if (fcode
>= IX86_BUILTIN__BDESC_ARGS_FIRST
14906 && fcode
<= IX86_BUILTIN__BDESC_ARGS_LAST
)
14908 i
= fcode
- IX86_BUILTIN__BDESC_ARGS_FIRST
;
14909 rtx (*fcn
) (rtx
, rtx
, rtx
, rtx
) = NULL
;
14910 rtx (*fcn_mask
) (rtx
, rtx
, rtx
, rtx
, rtx
);
14911 rtx (*fcn_maskz
) (rtx
, rtx
, rtx
, rtx
, rtx
, rtx
);
14913 machine_mode mode
, wide_mode
, nar_mode
;
14915 nar_mode
= V4SFmode
;
14917 wide_mode
= V64SFmode
;
14918 fcn_mask
= gen_avx5124fmaddps_4fmaddps_mask
;
14919 fcn_maskz
= gen_avx5124fmaddps_4fmaddps_maskz
;
14923 case IX86_BUILTIN_4FMAPS
:
14924 fcn
= gen_avx5124fmaddps_4fmaddps
;
14928 case IX86_BUILTIN_4DPWSSD
:
14929 nar_mode
= V4SImode
;
14931 wide_mode
= V64SImode
;
14932 fcn
= gen_avx5124vnniw_vp4dpwssd
;
14936 case IX86_BUILTIN_4DPWSSDS
:
14937 nar_mode
= V4SImode
;
14939 wide_mode
= V64SImode
;
14940 fcn
= gen_avx5124vnniw_vp4dpwssds
;
14944 case IX86_BUILTIN_4FNMAPS
:
14945 fcn
= gen_avx5124fmaddps_4fnmaddps
;
14949 case IX86_BUILTIN_4FNMAPS_MASK
:
14950 fcn_mask
= gen_avx5124fmaddps_4fnmaddps_mask
;
14951 fcn_maskz
= gen_avx5124fmaddps_4fnmaddps_maskz
;
14954 case IX86_BUILTIN_4DPWSSD_MASK
:
14955 nar_mode
= V4SImode
;
14957 wide_mode
= V64SImode
;
14958 fcn_mask
= gen_avx5124vnniw_vp4dpwssd_mask
;
14959 fcn_maskz
= gen_avx5124vnniw_vp4dpwssd_maskz
;
14962 case IX86_BUILTIN_4DPWSSDS_MASK
:
14963 nar_mode
= V4SImode
;
14965 wide_mode
= V64SImode
;
14966 fcn_mask
= gen_avx5124vnniw_vp4dpwssds_mask
;
14967 fcn_maskz
= gen_avx5124vnniw_vp4dpwssds_maskz
;
14970 case IX86_BUILTIN_4FMAPS_MASK
:
14980 wide_reg
= gen_reg_rtx (wide_mode
);
14981 for (i
= 0; i
< 4; i
++)
14983 args
[i
] = CALL_EXPR_ARG (exp
, i
);
14984 ops
[i
] = expand_normal (args
[i
]);
14986 emit_move_insn (gen_rtx_SUBREG (mode
, wide_reg
, i
* 64),
14990 accum
= expand_normal (CALL_EXPR_ARG (exp
, 4));
14991 accum
= force_reg (mode
, accum
);
14993 addr
= expand_normal (CALL_EXPR_ARG (exp
, 5));
14994 addr
= force_reg (Pmode
, addr
);
14996 mem
= gen_rtx_MEM (nar_mode
, addr
);
14998 target
= gen_reg_rtx (mode
);
15000 emit_move_insn (target
, accum
);
15003 emit_insn (fcn (target
, accum
, wide_reg
, mem
));
15007 merge
= expand_normal (CALL_EXPR_ARG (exp
, 6));
15009 mask
= expand_normal (CALL_EXPR_ARG (exp
, 7));
15011 if (CONST_INT_P (mask
))
15012 mask
= fixup_modeless_constant (mask
, HImode
);
15014 mask
= force_reg (HImode
, mask
);
15016 if (GET_MODE (mask
) != HImode
)
15017 mask
= gen_rtx_SUBREG (HImode
, mask
, 0);
15019 /* If merge is 0 then we're about to emit z-masked variant. */
15020 if (const0_operand (merge
, mode
))
15021 emit_insn (fcn_maskz (target
, accum
, wide_reg
, mem
, merge
, mask
));
15022 /* If merge is the same as accum then emit merge-masked variant. */
15023 else if (CALL_EXPR_ARG (exp
, 6) == CALL_EXPR_ARG (exp
, 4))
15025 merge
= force_reg (mode
, merge
);
15026 emit_insn (fcn_mask (target
, wide_reg
, mem
, merge
, mask
));
      /* Merging with an unknown value can happen if we z-mask at -O0.  */
15031 target
= gen_reg_rtx (mode
);
15032 emit_move_insn (target
, merge
);
15033 emit_insn (fcn_mask (target
, wide_reg
, mem
, target
, mask
));
15039 case IX86_BUILTIN_4FNMASS
:
15040 fcn
= gen_avx5124fmaddps_4fnmaddss
;
15044 case IX86_BUILTIN_4FMASS
:
15045 fcn
= gen_avx5124fmaddps_4fmaddss
;
15049 case IX86_BUILTIN_4FNMASS_MASK
:
15050 fcn_mask
= gen_avx5124fmaddps_4fnmaddss_mask
;
15051 fcn_maskz
= gen_avx5124fmaddps_4fnmaddss_maskz
;
15054 case IX86_BUILTIN_4FMASS_MASK
:
15063 fcn_mask
= gen_avx5124fmaddps_4fmaddss_mask
;
15064 fcn_maskz
= gen_avx5124fmaddps_4fmaddss_maskz
;
15068 wide_reg
= gen_reg_rtx (V64SFmode
);
15069 for (i
= 0; i
< 4; i
++)
15072 args
[i
] = CALL_EXPR_ARG (exp
, i
);
15073 ops
[i
] = expand_normal (args
[i
]);
15075 tmp
= gen_reg_rtx (SFmode
);
15076 emit_move_insn (tmp
, gen_rtx_SUBREG (SFmode
, ops
[i
], 0));
15078 emit_move_insn (gen_rtx_SUBREG (V16SFmode
, wide_reg
, i
* 64),
15079 gen_rtx_SUBREG (V16SFmode
, tmp
, 0));
15082 accum
= expand_normal (CALL_EXPR_ARG (exp
, 4));
15083 accum
= force_reg (V4SFmode
, accum
);
15085 addr
= expand_normal (CALL_EXPR_ARG (exp
, 5));
15086 addr
= force_reg (Pmode
, addr
);
15088 mem
= gen_rtx_MEM (V4SFmode
, addr
);
15090 target
= gen_reg_rtx (V4SFmode
);
15092 emit_move_insn (target
, accum
);
15095 emit_insn (fcn (target
, accum
, wide_reg
, mem
));
15099 merge
= expand_normal (CALL_EXPR_ARG (exp
, 6));
15101 mask
= expand_normal (CALL_EXPR_ARG (exp
, 7));
15103 if (CONST_INT_P (mask
))
15104 mask
= fixup_modeless_constant (mask
, QImode
);
15106 mask
= force_reg (QImode
, mask
);
15108 if (GET_MODE (mask
) != QImode
)
15109 mask
= gen_rtx_SUBREG (QImode
, mask
, 0);
15111 /* If merge is 0 then we're about to emit z-masked variant. */
15112 if (const0_operand (merge
, mode
))
15113 emit_insn (fcn_maskz (target
, accum
, wide_reg
, mem
, merge
, mask
));
15114 /* If merge is the same as accum then emit merge-masked
15116 else if (CALL_EXPR_ARG (exp
, 6) == CALL_EXPR_ARG (exp
, 4))
15118 merge
= force_reg (mode
, merge
);
15119 emit_insn (fcn_mask (target
, wide_reg
, mem
, merge
, mask
));
15121 /* Merge with something unknown might happen if we z-mask
15125 target
= gen_reg_rtx (mode
);
15126 emit_move_insn (target
, merge
);
15127 emit_insn (fcn_mask (target
, wide_reg
, mem
, target
, mask
));
15132 case IX86_BUILTIN_RDPID
:
15133 return ix86_expand_special_args_builtin (bdesc_args
+ i
, exp
,
15135 case IX86_BUILTIN_FABSQ
:
15136 case IX86_BUILTIN_COPYSIGNQ
:
15138 /* Emit a normal call if SSE isn't available. */
15139 return expand_call (exp
, target
, ignore
);
15142 return ix86_expand_args_builtin (bdesc_args
+ i
, exp
, target
);
15146 if (fcode
>= IX86_BUILTIN__BDESC_COMI_FIRST
15147 && fcode
<= IX86_BUILTIN__BDESC_COMI_LAST
)
15149 i
= fcode
- IX86_BUILTIN__BDESC_COMI_FIRST
;
15150 return ix86_expand_sse_comi (bdesc_comi
+ i
, exp
, target
);
15153 if (fcode
>= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
15154 && fcode
<= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST
)
15156 i
= fcode
- IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
;
15157 return ix86_expand_round_builtin (bdesc_round_args
+ i
, exp
, target
);
15160 if (fcode
>= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
15161 && fcode
<= IX86_BUILTIN__BDESC_PCMPESTR_LAST
)
15163 i
= fcode
- IX86_BUILTIN__BDESC_PCMPESTR_FIRST
;
15164 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr
+ i
, exp
, target
);
15167 if (fcode
>= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
15168 && fcode
<= IX86_BUILTIN__BDESC_PCMPISTR_LAST
)
15170 i
= fcode
- IX86_BUILTIN__BDESC_PCMPISTR_FIRST
;
15171 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr
+ i
, exp
, target
);
15174 if (fcode
>= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
15175 && fcode
<= IX86_BUILTIN__BDESC_MULTI_ARG_LAST
)
15177 i
= fcode
- IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
;
15178 const struct builtin_description
*d
= bdesc_multi_arg
+ i
;
15179 return ix86_expand_multi_arg_builtin (d
->icode
, exp
, target
,
15180 (enum ix86_builtin_func_type
)
15181 d
->flag
, d
->comparison
);
15184 if (fcode
>= IX86_BUILTIN__BDESC_CET_FIRST
15185 && fcode
<= IX86_BUILTIN__BDESC_CET_LAST
)
15187 i
= fcode
- IX86_BUILTIN__BDESC_CET_FIRST
;
15188 return ix86_expand_special_args_builtin (bdesc_cet
+ i
, exp
,
15192 gcc_unreachable ();
15195 /* A subroutine of ix86_expand_vector_init_duplicate. Tries to
15196 fill target with val via vec_duplicate. */
15199 ix86_vector_duplicate_value (machine_mode mode
, rtx target
, rtx val
)
15204 /* Save/restore recog_data in case this is called from splitters
15205 or other routines where recog_data needs to stay valid across
15206 force_reg. See PR106577. */
15207 recog_data_d recog_data_save
= recog_data
;
15209 /* First attempt to recognize VAL as-is. */
15210 dup
= gen_vec_duplicate (mode
, val
);
15211 insn
= emit_insn (gen_rtx_SET (target
, dup
));
15212 if (recog_memoized (insn
) < 0)
15215 machine_mode innermode
= GET_MODE_INNER (mode
);
15218 /* If that fails, force VAL into a register. */
15221 reg
= force_reg (innermode
, val
);
15222 if (GET_MODE (reg
) != innermode
)
15223 reg
= gen_lowpart (innermode
, reg
);
15224 SET_SRC (PATTERN (insn
)) = gen_vec_duplicate (mode
, reg
);
15225 seq
= get_insns ();
15228 emit_insn_before (seq
, insn
);
15230 ok
= recog_memoized (insn
) >= 0;
15233 recog_data
= recog_data_save
;
/* Get a vector mode of the same size as the original but with elements
   twice as wide.  This is only guaranteed to apply to integral vectors.  */

static machine_mode
get_mode_wider_vector (machine_mode o)
{
  /* ??? Rely on the ordering that genmodes.cc gives to vectors.  */
  machine_mode n = GET_MODE_NEXT_MODE (o).require ();
  gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
  gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
  return n;
}

static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
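/* Illustrative use (a sketch, not taken verbatim from this file), relying
   on the genmodes.cc ordering noted above:

     machine_mode w = get_mode_wider_vector (V16QImode);
     // w is expected to be V8HImode: same 16-byte vector size,
     // half as many units, each unit twice as wide.
  */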
15253 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
15254 with all elements equal to VAR. Return true if successful. */
15257 ix86_expand_vector_init_duplicate (bool mmx_ok
, machine_mode mode
,
15258 rtx target
, rtx val
)
15282 return ix86_vector_duplicate_value (mode
, target
, val
);
15287 if (TARGET_SSE
|| TARGET_3DNOW_A
)
15291 val
= gen_lowpart (SImode
, val
);
15292 x
= gen_rtx_TRUNCATE (HImode
, val
);
15293 x
= gen_rtx_VEC_DUPLICATE (mode
, x
);
15294 emit_insn (gen_rtx_SET (target
, x
));
15304 val
= gen_lowpart (SImode
, val
);
15305 x
= gen_rtx_TRUNCATE (HImode
, val
);
15306 x
= gen_rtx_VEC_DUPLICATE (mode
, x
);
15307 emit_insn (gen_rtx_SET (target
, x
));
15322 return ix86_vector_duplicate_value (mode
, target
, val
);
15326 struct expand_vec_perm_d dperm
;
15330 memset (&dperm
, 0, sizeof (dperm
));
15331 dperm
.target
= target
;
15332 dperm
.vmode
= mode
;
15333 dperm
.nelt
= GET_MODE_NUNITS (mode
);
15334 dperm
.op0
= dperm
.op1
= gen_reg_rtx (mode
);
15335 dperm
.one_operand_p
= true;
15337 if (mode
== V8HFmode
|| mode
== V8BFmode
)
15339 tmp1
= force_reg (GET_MODE_INNER (mode
), val
);
15340 tmp2
= gen_reg_rtx (mode
);
15341 emit_insn (maybe_gen_vec_set_0 (mode
, tmp2
,
15342 CONST0_RTX (mode
), tmp1
));
15343 tmp1
= gen_lowpart (mode
, tmp2
);
15347 /* Extend to SImode using a paradoxical SUBREG. */
15348 tmp1
= gen_reg_rtx (SImode
);
15349 emit_move_insn (tmp1
, gen_lowpart (SImode
, val
));
15351 /* Insert the SImode value as
15352 low element of a V4SImode vector. */
15353 tmp2
= gen_reg_rtx (V4SImode
);
15354 emit_insn (gen_vec_setv4si_0 (tmp2
, CONST0_RTX (V4SImode
), tmp1
));
15355 tmp1
= gen_lowpart (mode
, tmp2
);
15358 emit_move_insn (dperm
.op0
, tmp1
);
15359 ok
= (expand_vec_perm_1 (&dperm
)
15360 || expand_vec_perm_broadcast_1 (&dperm
));
15368 return ix86_vector_duplicate_value (mode
, target
, val
);
15375 /* Replicate the value once into the next wider mode and recurse. */
15377 machine_mode smode
, wsmode
, wvmode
;
15380 smode
= GET_MODE_INNER (mode
);
15381 wvmode
= get_mode_wider_vector (mode
);
15382 wsmode
= GET_MODE_INNER (wvmode
);
15384 val
= convert_modes (wsmode
, smode
, val
, true);
15386 if (smode
== QImode
&& !TARGET_PARTIAL_REG_STALL
)
15387 emit_insn (gen_insv_1 (wsmode
, val
, val
));
15390 x
= expand_simple_binop (wsmode
, ASHIFT
, val
,
15391 GEN_INT (GET_MODE_BITSIZE (smode
)),
15392 NULL_RTX
, 1, OPTAB_LIB_WIDEN
);
15393 val
= expand_simple_binop (wsmode
, IOR
, val
, x
, x
, 1,
15397 x
= gen_reg_rtx (wvmode
);
15398 ok
= ix86_expand_vector_init_duplicate (mmx_ok
, wvmode
, x
, val
);
15400 emit_move_insn (target
, gen_lowpart (GET_MODE (target
), x
));
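/* A scalar sketch of the widening step above (illustrative only, not part
   of the generated code): the narrow value is replicated once into an
   element twice as wide, and the broadcast is then retried in the wider
   vector mode.

     // Pack two copies of an 8-bit value into 16 bits, then recurse
     // with the wider element.
     unsigned short
     widen_qi (unsigned char v)
     {
       unsigned short w = v;
       return (unsigned short) ((w << 8) | w);
     }
  */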
15409 return ix86_vector_duplicate_value (mode
, target
, val
);
15412 machine_mode hvmode
;
15425 hvmode
= V16QImode
;
15428 gcc_unreachable ();
15430 rtx x
= gen_reg_rtx (hvmode
);
15432 ok
= ix86_expand_vector_init_duplicate (false, hvmode
, x
, val
);
15435 x
= gen_rtx_VEC_CONCAT (mode
, x
, x
);
15436 emit_insn (gen_rtx_SET (target
, x
));
15444 if (TARGET_AVX512BW
)
15445 return ix86_vector_duplicate_value (mode
, target
, val
);
15448 machine_mode hvmode
;
15452 hvmode
= V16HImode
;
15455 hvmode
= V16HFmode
;
15458 hvmode
= V16BFmode
;
15461 hvmode
= V32QImode
;
15464 gcc_unreachable ();
15466 rtx x
= gen_reg_rtx (hvmode
);
15468 ok
= ix86_expand_vector_init_duplicate (false, hvmode
, x
, val
);
15471 x
= gen_rtx_VEC_CONCAT (mode
, x
, x
);
15472 emit_insn (gen_rtx_SET (target
, x
));
15481 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
15482 whose ONE_VAR element is VAR, and other elements are zero. Return true
15486 ix86_expand_vector_init_one_nonzero (bool mmx_ok
, machine_mode mode
,
15487 rtx target
, rtx var
, int one_var
)
15489 machine_mode vsimode
;
15492 bool use_vector_set
= false;
15493 rtx (*gen_vec_set_0
) (rtx
, rtx
, rtx
) = NULL
;
15498 /* For SSE4.1, we normally use vector set. But if the second
15499 element is zero and inter-unit moves are OK, we use movq
15501 use_vector_set
= (TARGET_64BIT
&& TARGET_SSE4_1
15502 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
15508 use_vector_set
= TARGET_SSE4_1
;
15511 use_vector_set
= TARGET_SSE2
;
15512 gen_vec_set_0
= TARGET_AVX512FP16
&& one_var
== 0
15513 ? gen_vec_setv8hi_0
: NULL
;
15516 use_vector_set
= TARGET_MMX_WITH_SSE
&& TARGET_SSE4_1
;
15519 use_vector_set
= TARGET_SSE
|| TARGET_3DNOW_A
;
15522 use_vector_set
= TARGET_SSE4_1
;
15525 use_vector_set
= TARGET_AVX
;
15528 use_vector_set
= TARGET_AVX
;
15529 gen_vec_set_0
= TARGET_AVX512FP16
&& one_var
== 0
15530 ? gen_vec_setv16hi_0
: NULL
;
15533 use_vector_set
= TARGET_AVX
;
15534 gen_vec_set_0
= gen_vec_setv8si_0
;
15537 use_vector_set
= TARGET_AVX
;
15538 gen_vec_set_0
= gen_vec_setv8sf_0
;
15541 use_vector_set
= TARGET_AVX
;
15542 gen_vec_set_0
= gen_vec_setv4df_0
;
15545 /* Use ix86_expand_vector_set in 64bit mode only. */
15546 use_vector_set
= TARGET_AVX
&& TARGET_64BIT
;
15547 gen_vec_set_0
= gen_vec_setv4di_0
;
15550 use_vector_set
= TARGET_AVX512F
&& one_var
== 0;
15551 gen_vec_set_0
= gen_vec_setv16si_0
;
15554 use_vector_set
= TARGET_AVX512F
&& one_var
== 0;
15555 gen_vec_set_0
= gen_vec_setv16sf_0
;
15558 use_vector_set
= TARGET_AVX512F
&& one_var
== 0;
15559 gen_vec_set_0
= gen_vec_setv8df_0
;
15562 /* Use ix86_expand_vector_set in 64bit mode only. */
15563 use_vector_set
= TARGET_AVX512F
&& TARGET_64BIT
&& one_var
== 0;
15564 gen_vec_set_0
= gen_vec_setv8di_0
;
15567 use_vector_set
= TARGET_AVX512FP16
&& one_var
== 0;
15568 gen_vec_set_0
= gen_vec_setv8hf_0
;
15571 use_vector_set
= TARGET_AVX512FP16
&& one_var
== 0;
15572 gen_vec_set_0
= gen_vec_setv16hf_0
;
15575 use_vector_set
= TARGET_AVX512FP16
&& one_var
== 0;
15576 gen_vec_set_0
= gen_vec_setv32hf_0
;
15579 use_vector_set
= TARGET_AVX512FP16
&& one_var
== 0;
15580 gen_vec_set_0
= gen_vec_setv8bf_0
;
15583 use_vector_set
= TARGET_AVX512FP16
&& one_var
== 0;
15584 gen_vec_set_0
= gen_vec_setv16bf_0
;
15587 use_vector_set
= TARGET_AVX512FP16
&& one_var
== 0;
15588 gen_vec_set_0
= gen_vec_setv32bf_0
;
15591 use_vector_set
= TARGET_AVX512FP16
&& one_var
== 0;
15592 gen_vec_set_0
= gen_vec_setv32hi_0
;
15597 if (use_vector_set
)
15599 if (gen_vec_set_0
&& one_var
== 0)
15601 var
= force_reg (GET_MODE_INNER (mode
), var
);
15602 emit_insn (gen_vec_set_0 (target
, CONST0_RTX (mode
), var
));
15605 emit_insn (gen_rtx_SET (target
, CONST0_RTX (mode
)));
15606 var
= force_reg (GET_MODE_INNER (mode
), var
);
15607 ix86_expand_vector_set (mmx_ok
, target
, var
, one_var
);
15623 var
= force_reg (GET_MODE_INNER (mode
), var
);
15624 x
= gen_rtx_VEC_CONCAT (mode
, var
, CONST0_RTX (GET_MODE_INNER (mode
)));
15625 emit_insn (gen_rtx_SET (target
, x
));
15630 if (!REG_P (target
) || REGNO (target
) < FIRST_PSEUDO_REGISTER
)
15631 new_target
= gen_reg_rtx (mode
);
15633 new_target
= target
;
15634 var
= force_reg (GET_MODE_INNER (mode
), var
);
15635 x
= gen_rtx_VEC_DUPLICATE (mode
, var
);
15636 x
= gen_rtx_VEC_MERGE (mode
, x
, CONST0_RTX (mode
), const1_rtx
);
15637 emit_insn (gen_rtx_SET (new_target
, x
));
15640 /* We need to shuffle the value to the correct position, so
15641 create a new pseudo to store the intermediate result. */
15643 /* With SSE2, we can use the integer shuffle insns. */
15644 if (mode
!= V4SFmode
&& TARGET_SSE2
)
15646 emit_insn (gen_sse2_pshufd_1 (new_target
, new_target
,
15648 GEN_INT (one_var
== 1 ? 0 : 1),
15649 GEN_INT (one_var
== 2 ? 0 : 1),
15650 GEN_INT (one_var
== 3 ? 0 : 1)));
15651 if (target
!= new_target
)
15652 emit_move_insn (target
, new_target
);
15656 /* Otherwise convert the intermediate result to V4SFmode and
15657 use the SSE1 shuffle instructions. */
15658 if (mode
!= V4SFmode
)
15660 tmp
= gen_reg_rtx (V4SFmode
);
15661 emit_move_insn (tmp
, gen_lowpart (V4SFmode
, new_target
));
15666 emit_insn (gen_sse_shufps_v4sf (tmp
, tmp
, tmp
,
15668 GEN_INT (one_var
== 1 ? 0 : 1),
15669 GEN_INT (one_var
== 2 ? 0+4 : 1+4),
15670 GEN_INT (one_var
== 3 ? 0+4 : 1+4)));
15672 if (mode
!= V4SFmode
)
15673 emit_move_insn (target
, gen_lowpart (V4SImode
, tmp
));
15674 else if (tmp
!= target
)
15675 emit_move_insn (target
, tmp
);
15677 else if (target
!= new_target
)
15678 emit_move_insn (target
, new_target
);
15683 vsimode
= V4SImode
;
15689 vsimode
= V2SImode
;
15695 /* Zero extend the variable element to SImode and recurse. */
15696 var
= convert_modes (SImode
, GET_MODE_INNER (mode
), var
, true);
15698 x
= gen_reg_rtx (vsimode
);
15699 if (!ix86_expand_vector_init_one_nonzero (mmx_ok
, vsimode
, x
,
15701 gcc_unreachable ();
15703 emit_move_insn (target
, gen_lowpart (mode
, x
));
15711 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
15712 consisting of the values in VALS. It is known that all elements
15713 except ONE_VAR are constants. Return true if successful. */
15716 ix86_expand_vector_init_one_var (bool mmx_ok
, machine_mode mode
,
15717 rtx target
, rtx vals
, int one_var
)
15719 rtx var
= XVECEXP (vals
, 0, one_var
);
15720 machine_mode wmode
;
15723 const_vec
= copy_rtx (vals
);
15724 XVECEXP (const_vec
, 0, one_var
) = CONST0_RTX (GET_MODE_INNER (mode
));
15725 const_vec
= gen_rtx_CONST_VECTOR (mode
, XVEC (const_vec
, 0));
15733 /* For the two element vectors, it's just as easy to use
15734 the general case. */
15738 /* Use ix86_expand_vector_set in 64bit mode only. */
15763 if (TARGET_MMX_WITH_SSE
&& TARGET_SSE4_1
)
15772 /* There's no way to set one QImode entry easily. Combine
15773 the variable value with its adjacent constant value, and
15774 promote to an HImode set. */
15775 x
= XVECEXP (vals
, 0, one_var
^ 1);
15778 var
= convert_modes (HImode
, QImode
, var
, true);
15779 var
= expand_simple_binop (HImode
, ASHIFT
, var
, GEN_INT (8),
15780 NULL_RTX
, 1, OPTAB_LIB_WIDEN
);
15781 x
= GEN_INT (INTVAL (x
) & 0xff);
15785 var
= convert_modes (HImode
, QImode
, var
, true);
15786 x
= gen_int_mode (UINTVAL (x
) << 8, HImode
);
15788 if (x
!= const0_rtx
)
15789 var
= expand_simple_binop (HImode
, IOR
, var
, x
, var
,
15790 1, OPTAB_LIB_WIDEN
);
15792 x
= gen_reg_rtx (wmode
);
15793 emit_move_insn (x
, gen_lowpart (wmode
, const_vec
));
15794 ix86_expand_vector_set (mmx_ok
, x
, var
, one_var
>> 1);
15796 emit_move_insn (target
, gen_lowpart (mode
, x
));
15803 emit_move_insn (target
, const_vec
);
15804 ix86_expand_vector_set (mmx_ok
, target
, var
, one_var
);
15808 /* A subroutine of ix86_expand_vector_init_general. Use vector
15809 concatenate to handle the most general case: all values variable,
15810 and none identical. */
15813 ix86_expand_vector_init_concat (machine_mode mode
,
15814 rtx target
, rtx
*ops
, int n
)
15816 machine_mode half_mode
= VOIDmode
;
15827 half_mode
= V16HFmode
;
15830 half_mode
= V16BFmode
;
15833 half_mode
= V8SImode
;
15836 half_mode
= V8SFmode
;
15839 half_mode
= V4DImode
;
15842 half_mode
= V4DFmode
;
15845 half_mode
= V8HFmode
;
15848 half_mode
= V8BFmode
;
15851 half_mode
= V4SImode
;
15854 half_mode
= V4SFmode
;
15857 half_mode
= V2DImode
;
15860 half_mode
= V2DFmode
;
15863 half_mode
= V2SImode
;
15866 half_mode
= V2SFmode
;
15869 half_mode
= DImode
;
15872 half_mode
= SImode
;
15875 half_mode
= DFmode
;
15878 half_mode
= SFmode
;
15881 gcc_unreachable ();
15884 if (!register_operand (ops
[1], half_mode
))
15885 ops
[1] = force_reg (half_mode
, ops
[1]);
15886 if (!register_operand (ops
[0], half_mode
))
15887 ops
[0] = force_reg (half_mode
, ops
[0]);
15888 emit_insn (gen_rtx_SET (target
, gen_rtx_VEC_CONCAT (mode
, ops
[0],
15896 half_mode
= V2DImode
;
15899 half_mode
= V2DFmode
;
15902 half_mode
= V2SImode
;
15905 half_mode
= V2SFmode
;
15908 gcc_unreachable ();
15916 half_mode
= V4DImode
;
15919 half_mode
= V4DFmode
;
15922 half_mode
= V4SImode
;
15925 half_mode
= V4SFmode
;
15928 gcc_unreachable ();
15936 half_mode
= V8SImode
;
15939 half_mode
= V8SFmode
;
15942 gcc_unreachable ();
15947 /* FIXME: We process inputs backward to help RA. PR 36222. */
15949 for (j
= 1; j
!= -1; j
--)
15951 half
[j
] = gen_reg_rtx (half_mode
);
15955 v
= gen_rtvec (2, ops
[i
-1], ops
[i
]);
15959 v
= gen_rtvec (4, ops
[i
-3], ops
[i
-2], ops
[i
-1], ops
[i
]);
15963 v
= gen_rtvec (8, ops
[i
-7], ops
[i
-6], ops
[i
-5], ops
[i
-4],
15964 ops
[i
-3], ops
[i
-2], ops
[i
-1], ops
[i
]);
15968 gcc_unreachable ();
15970 ix86_expand_vector_init (false, half
[j
],
15971 gen_rtx_PARALLEL (half_mode
, v
));
15974 ix86_expand_vector_init_concat (mode
, target
, half
, 2);
15978 gcc_unreachable ();
15982 /* A subroutine of ix86_expand_vector_init_general. Use vector
15983 interleave to handle the most general case: all values variable,
15984 and none identical. */
15987 ix86_expand_vector_init_interleave (machine_mode mode
,
15988 rtx target
, rtx
*ops
, int n
)
15990 machine_mode first_imode
, second_imode
, third_imode
, inner_mode
;
15993 rtx (*gen_load_even
) (rtx
, rtx
, rtx
);
15994 rtx (*gen_interleave_first_low
) (rtx
, rtx
, rtx
);
15995 rtx (*gen_interleave_second_low
) (rtx
, rtx
, rtx
);
16000 gen_load_even
= gen_vec_interleave_lowv8hf
;
16001 gen_interleave_first_low
= gen_vec_interleave_lowv4si
;
16002 gen_interleave_second_low
= gen_vec_interleave_lowv2di
;
16003 inner_mode
= HFmode
;
16004 first_imode
= V4SImode
;
16005 second_imode
= V2DImode
;
16006 third_imode
= VOIDmode
;
16009 gen_load_even
= gen_vec_interleave_lowv8bf
;
16010 gen_interleave_first_low
= gen_vec_interleave_lowv4si
;
16011 gen_interleave_second_low
= gen_vec_interleave_lowv2di
;
16012 inner_mode
= BFmode
;
16013 first_imode
= V4SImode
;
16014 second_imode
= V2DImode
;
16015 third_imode
= VOIDmode
;
16018 gen_load_even
= gen_vec_setv8hi
;
16019 gen_interleave_first_low
= gen_vec_interleave_lowv4si
;
16020 gen_interleave_second_low
= gen_vec_interleave_lowv2di
;
16021 inner_mode
= HImode
;
16022 first_imode
= V4SImode
;
16023 second_imode
= V2DImode
;
16024 third_imode
= VOIDmode
;
16027 gen_load_even
= gen_vec_setv16qi
;
16028 gen_interleave_first_low
= gen_vec_interleave_lowv8hi
;
16029 gen_interleave_second_low
= gen_vec_interleave_lowv4si
;
16030 inner_mode
= QImode
;
16031 first_imode
= V8HImode
;
16032 second_imode
= V4SImode
;
16033 third_imode
= V2DImode
;
16036 gcc_unreachable ();
16039 for (i
= 0; i
< n
; i
++)
16042 if (inner_mode
== HFmode
|| inner_mode
== BFmode
)
16045 /* Use vpuncklwd to pack 2 HFmode or BFmode. */
16046 machine_mode vec_mode
=
16047 (inner_mode
== HFmode
) ? V8HFmode
: V8BFmode
;
16048 op0
= gen_reg_rtx (vec_mode
);
16049 even
= lowpart_subreg (vec_mode
,
16050 force_reg (inner_mode
, op
), inner_mode
);
16051 odd
= lowpart_subreg (vec_mode
,
16052 force_reg (inner_mode
, ops
[i
+ i
+ 1]),
16054 emit_insn (gen_load_even (op0
, even
, odd
));
16058 /* Extend the odd element to SImode using a paradoxical SUBREG. */
16059 op0
= gen_reg_rtx (SImode
);
16060 emit_move_insn (op0
, gen_lowpart (SImode
, op
));
16062 /* Insert the SImode value as low element of V4SImode vector. */
16063 op1
= gen_reg_rtx (V4SImode
);
16064 op0
= gen_rtx_VEC_MERGE (V4SImode
,
16065 gen_rtx_VEC_DUPLICATE (V4SImode
,
16067 CONST0_RTX (V4SImode
),
16069 emit_insn (gen_rtx_SET (op1
, op0
));
16071 /* Cast the V4SImode vector back to a vector in original mode. */
16072 op0
= gen_reg_rtx (mode
);
16073 emit_move_insn (op0
, gen_lowpart (mode
, op1
));
16075 /* Load even elements into the second position. */
16076 emit_insn (gen_load_even (op0
,
16077 force_reg (inner_mode
,
16082 /* Cast vector to FIRST_IMODE vector. */
16083 ops
[i
] = gen_reg_rtx (first_imode
);
16084 emit_move_insn (ops
[i
], gen_lowpart (first_imode
, op0
));
16087 /* Interleave low FIRST_IMODE vectors. */
16088 for (i
= j
= 0; i
< n
; i
+= 2, j
++)
16090 op0
= gen_reg_rtx (first_imode
);
16091 emit_insn (gen_interleave_first_low (op0
, ops
[i
], ops
[i
+ 1]));
16093 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
16094 ops
[j
] = gen_reg_rtx (second_imode
);
16095 emit_move_insn (ops
[j
], gen_lowpart (second_imode
, op0
));
16098 /* Interleave low SECOND_IMODE vectors. */
16099 switch (second_imode
)
16102 for (i
= j
= 0; i
< n
/ 2; i
+= 2, j
++)
16104 op0
= gen_reg_rtx (second_imode
);
16105 emit_insn (gen_interleave_second_low (op0
, ops
[i
],
16108 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
16110 ops
[j
] = gen_reg_rtx (third_imode
);
16111 emit_move_insn (ops
[j
], gen_lowpart (third_imode
, op0
));
16113 second_imode
= V2DImode
;
16114 gen_interleave_second_low
= gen_vec_interleave_lowv2di
;
16118 op0
= gen_reg_rtx (second_imode
);
16119 emit_insn (gen_interleave_second_low (op0
, ops
[0],
16122 /* Cast the SECOND_IMODE vector back to a vector on original
16124 emit_insn (gen_rtx_SET (target
, gen_lowpart (mode
, op0
)));
16128 gcc_unreachable ();
16132 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
16133 all values variable, and none identical. */
16136 ix86_expand_vector_init_general (bool mmx_ok
, machine_mode mode
,
16137 rtx target
, rtx vals
)
16139 rtx ops
[64], op0
, op1
, op2
, op3
, op4
, op5
;
16140 machine_mode half_mode
= VOIDmode
;
16141 machine_mode quarter_mode
= VOIDmode
;
16148 if (!mmx_ok
&& !TARGET_SSE
)
16164 n
= GET_MODE_NUNITS (mode
);
16165 for (i
= 0; i
< n
; i
++)
16166 ops
[i
] = XVECEXP (vals
, 0, i
);
16167 ix86_expand_vector_init_concat (mode
, target
, ops
, n
);
16171 for (i
= 0; i
< 2; i
++)
16172 ops
[i
] = gen_lowpart (V2DImode
, XVECEXP (vals
, 0, i
));
16173 op0
= gen_reg_rtx (V4DImode
);
16174 ix86_expand_vector_init_concat (V4DImode
, op0
, ops
, 2);
16175 emit_move_insn (target
, gen_lowpart (GET_MODE (target
), op0
));
16179 for (i
= 0; i
< 4; i
++)
16180 ops
[i
] = gen_lowpart (V2DImode
, XVECEXP (vals
, 0, i
));
16181 ops
[4] = gen_reg_rtx (V4DImode
);
16182 ix86_expand_vector_init_concat (V4DImode
, ops
[4], ops
, 2);
16183 ops
[5] = gen_reg_rtx (V4DImode
);
16184 ix86_expand_vector_init_concat (V4DImode
, ops
[5], ops
+ 2, 2);
16185 op0
= gen_reg_rtx (V8DImode
);
16186 ix86_expand_vector_init_concat (V8DImode
, op0
, ops
+ 4, 2);
16187 emit_move_insn (target
, gen_lowpart (GET_MODE (target
), op0
));
16191 half_mode
= V16QImode
;
16195 half_mode
= V8HImode
;
16199 half_mode
= V8HFmode
;
16203 half_mode
= V8BFmode
;
16207 n
= GET_MODE_NUNITS (mode
);
16208 for (i
= 0; i
< n
; i
++)
16209 ops
[i
] = XVECEXP (vals
, 0, i
);
16210 op0
= gen_reg_rtx (half_mode
);
16211 op1
= gen_reg_rtx (half_mode
);
16212 ix86_expand_vector_init_interleave (half_mode
, op0
, ops
,
16214 ix86_expand_vector_init_interleave (half_mode
, op1
,
16215 &ops
[n
>> 1], n
>> 2);
16216 emit_insn (gen_rtx_SET (target
, gen_rtx_VEC_CONCAT (mode
, op0
, op1
)));
16220 quarter_mode
= V16QImode
;
16221 half_mode
= V32QImode
;
16225 quarter_mode
= V8HImode
;
16226 half_mode
= V16HImode
;
16230 quarter_mode
= V8HFmode
;
16231 half_mode
= V16HFmode
;
16235 quarter_mode
= V8BFmode
;
16236 half_mode
= V16BFmode
;
16240 n
= GET_MODE_NUNITS (mode
);
16241 for (i
= 0; i
< n
; i
++)
16242 ops
[i
] = XVECEXP (vals
, 0, i
);
16243 op0
= gen_reg_rtx (quarter_mode
);
16244 op1
= gen_reg_rtx (quarter_mode
);
16245 op2
= gen_reg_rtx (quarter_mode
);
16246 op3
= gen_reg_rtx (quarter_mode
);
16247 op4
= gen_reg_rtx (half_mode
);
16248 op5
= gen_reg_rtx (half_mode
);
16249 ix86_expand_vector_init_interleave (quarter_mode
, op0
, ops
,
16251 ix86_expand_vector_init_interleave (quarter_mode
, op1
,
16252 &ops
[n
>> 2], n
>> 3);
16253 ix86_expand_vector_init_interleave (quarter_mode
, op2
,
16254 &ops
[n
>> 1], n
>> 3);
16255 ix86_expand_vector_init_interleave (quarter_mode
, op3
,
16256 &ops
[(n
>> 1) | (n
>> 2)], n
>> 3);
16257 emit_insn (gen_rtx_SET (op4
, gen_rtx_VEC_CONCAT (half_mode
, op0
, op1
)));
16258 emit_insn (gen_rtx_SET (op5
, gen_rtx_VEC_CONCAT (half_mode
, op2
, op3
)));
16259 emit_insn (gen_rtx_SET (target
, gen_rtx_VEC_CONCAT (mode
, op4
, op5
)));
16263 if (!TARGET_SSE4_1
)
16271 /* Don't use ix86_expand_vector_init_interleave if we can't
16272 move from GPR to SSE register directly. */
16273 if (!TARGET_INTER_UNIT_MOVES_TO_VEC
)
16280 n
= GET_MODE_NUNITS (mode
);
16281 for (i
= 0; i
< n
; i
++)
16282 ops
[i
] = XVECEXP (vals
, 0, i
);
16283 ix86_expand_vector_init_interleave (mode
, target
, ops
, n
>> 1);
16294 gcc_unreachable ();
16298 int i
, j
, n_elts
, n_words
, n_elt_per_word
;
16299 machine_mode tmp_mode
, inner_mode
;
16300 rtx words
[4], shift
;
16302 tmp_mode
= (GET_MODE_SIZE (mode
) < UNITS_PER_WORD
) ? SImode
: word_mode
;
16304 inner_mode
= GET_MODE_INNER (mode
);
16305 n_elts
= GET_MODE_NUNITS (mode
);
16306 n_words
= GET_MODE_SIZE (mode
) / GET_MODE_SIZE (tmp_mode
);
16307 n_elt_per_word
= n_elts
/ n_words
;
16308 shift
= GEN_INT (GET_MODE_BITSIZE (inner_mode
));
16310 for (i
= 0; i
< n_words
; ++i
)
16312 rtx word
= NULL_RTX
;
16314 for (j
= 0; j
< n_elt_per_word
; ++j
)
16316 rtx elt
= XVECEXP (vals
, 0, (i
+1)*n_elt_per_word
- j
- 1);
16317 elt
= convert_modes (tmp_mode
, inner_mode
, elt
, true);
16323 word
= expand_simple_binop (tmp_mode
, ASHIFT
, word
, shift
,
16324 NULL_RTX
, 1, OPTAB_LIB_WIDEN
);
16325 word
= expand_simple_binop (tmp_mode
, IOR
, word
, elt
,
16326 NULL_RTX
, 1, OPTAB_LIB_WIDEN
);
16334 emit_move_insn (target
, gen_lowpart (mode
, words
[0]));
16335 else if (n_words
== 2)
16337 rtx tmp
= gen_reg_rtx (mode
);
16338 emit_clobber (tmp
);
16339 emit_move_insn (gen_lowpart (tmp_mode
, tmp
), words
[0]);
16340 emit_move_insn (gen_highpart (tmp_mode
, tmp
), words
[1]);
16341 emit_move_insn (target
, tmp
);
16343 else if (n_words
== 4)
16345 rtx tmp
= gen_reg_rtx (V4SImode
);
16346 gcc_assert (tmp_mode
== SImode
);
16347 vals
= gen_rtx_PARALLEL (V4SImode
, gen_rtvec_v (4, words
));
16348 ix86_expand_vector_init_general (false, V4SImode
, tmp
, vals
);
16349 emit_move_insn (target
, gen_lowpart (mode
, tmp
));
16352 gcc_unreachable ();
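/* A scalar sketch of the word-building loop above (illustrative only, not
   part of the compiled code): elements are packed into word-sized chunks
   by repeated shift-and-or, processing the highest element of each chunk
   first so that element 0 ends up in the least significant position.

     // Pack four 8-bit elements into one 32-bit word, elt[3] in the
     // top byte, matching the backward loop above.
     unsigned int
     pack_word (const unsigned char elt[4])
     {
       unsigned int word = 0;
       for (int j = 3; j >= 0; j--)
	 word = (word << 8) | elt[j];
       return word;
     }
  */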
16356 /* Initialize vector TARGET via VALS. Suppress the use of MMX
16357 instructions unless MMX_OK is true. */
16360 ix86_expand_vector_init (bool mmx_ok
, rtx target
, rtx vals
)
16362 machine_mode mode
= GET_MODE (target
);
16363 machine_mode inner_mode
= GET_MODE_INNER (mode
);
16364 int n_elts
= GET_MODE_NUNITS (mode
);
16365 int n_var
= 0, one_var
= -1;
16366 bool all_same
= true, all_const_zero
= true;
16370 /* Handle first initialization from vector elts. */
16371 if (n_elts
!= XVECLEN (vals
, 0))
16373 rtx subtarget
= target
;
16374 x
= XVECEXP (vals
, 0, 0);
16375 gcc_assert (GET_MODE_INNER (GET_MODE (x
)) == inner_mode
);
16376 if (GET_MODE_NUNITS (GET_MODE (x
)) * 2 == n_elts
)
16378 rtx ops
[2] = { XVECEXP (vals
, 0, 0), XVECEXP (vals
, 0, 1) };
16379 if (inner_mode
== QImode
16380 || inner_mode
== HImode
16381 || inner_mode
== TImode
16382 || inner_mode
== HFmode
16383 || inner_mode
== BFmode
)
16385 unsigned int n_bits
= n_elts
* GET_MODE_SIZE (inner_mode
);
16386 scalar_mode elt_mode
= inner_mode
== TImode
? DImode
: SImode
;
16387 n_bits
/= GET_MODE_SIZE (elt_mode
);
16388 mode
= mode_for_vector (elt_mode
, n_bits
).require ();
16389 inner_mode
= mode_for_vector (elt_mode
, n_bits
/ 2).require ();
16390 ops
[0] = gen_lowpart (inner_mode
, ops
[0]);
16391 ops
[1] = gen_lowpart (inner_mode
, ops
[1]);
16392 subtarget
= gen_reg_rtx (mode
);
16394 ix86_expand_vector_init_concat (mode
, subtarget
, ops
, 2);
16395 if (subtarget
!= target
)
16396 emit_move_insn (target
, gen_lowpart (GET_MODE (target
), subtarget
));
16399 gcc_unreachable ();
16402 for (i
= 0; i
< n_elts
; ++i
)
16404 x
= XVECEXP (vals
, 0, i
);
16405 if (!(CONST_SCALAR_INT_P (x
)
16406 || CONST_DOUBLE_P (x
)
16407 || CONST_FIXED_P (x
)))
16408 n_var
++, one_var
= i
;
16409 else if (x
!= CONST0_RTX (inner_mode
))
16410 all_const_zero
= false;
16411 if (i
> 0 && !rtx_equal_p (x
, XVECEXP (vals
, 0, 0)))
16415 /* Constants are best loaded from the constant pool. */
16418 emit_move_insn (target
, gen_rtx_CONST_VECTOR (mode
, XVEC (vals
, 0)));
16422 /* If all values are identical, broadcast the value. */
16424 && ix86_expand_vector_init_duplicate (mmx_ok
, mode
, target
,
16425 XVECEXP (vals
, 0, 0)))
16428 /* Values where only one field is non-constant are best loaded from
16429 the pool and overwritten via move later. */
16433 && ix86_expand_vector_init_one_nonzero (mmx_ok
, mode
, target
,
16434 XVECEXP (vals
, 0, one_var
),
16438 if (ix86_expand_vector_init_one_var (mmx_ok
, mode
, target
, vals
, one_var
))
16442 ix86_expand_vector_init_general (mmx_ok
, mode
, target
, vals
);
16446 V setg (V v, int idx, T val)
16448 V idxv = (V){idx, idx, idx, idx, idx, idx, idx, idx};
16449 V valv = (V){val, val, val, val, val, val, val, val};
16450 V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == idxv);
16451 v = (v & ~mask) | (valv & mask);
16455 ix86_expand_vector_set_var (rtx target
, rtx val
, rtx idx
)
16458 machine_mode mode
= GET_MODE (target
);
16459 machine_mode cmp_mode
= mode
;
16460 int n_elts
= GET_MODE_NUNITS (mode
);
16461 rtx valv
,idxv
,constv
,idx_tmp
;
16464 /* 512-bits vector byte/word broadcast and comparison only available
16465 under TARGET_AVX512BW, break 512-bits vector into two 256-bits vector
16466 when without TARGET_AVX512BW. */
16467 if ((mode
== V32HImode
|| mode
== V32HFmode
|| mode
== V32BFmode
16468 || mode
== V64QImode
)
16469 && !TARGET_AVX512BW
)
16471 gcc_assert (TARGET_AVX512F
);
16472 rtx vhi
, vlo
, idx_hi
;
16473 machine_mode half_mode
;
16474 rtx (*extract_hi
)(rtx
, rtx
);
16475 rtx (*extract_lo
)(rtx
, rtx
);
16477 if (mode
== V32HImode
)
16479 half_mode
= V16HImode
;
16480 extract_hi
= gen_vec_extract_hi_v32hi
;
16481 extract_lo
= gen_vec_extract_lo_v32hi
;
16483 else if (mode
== V32HFmode
)
16485 half_mode
= V16HFmode
;
16486 extract_hi
= gen_vec_extract_hi_v32hf
;
16487 extract_lo
= gen_vec_extract_lo_v32hf
;
16489 else if (mode
== V32BFmode
)
16491 half_mode
= V16BFmode
;
16492 extract_hi
= gen_vec_extract_hi_v32bf
;
16493 extract_lo
= gen_vec_extract_lo_v32bf
;
16497 half_mode
= V32QImode
;
16498 extract_hi
= gen_vec_extract_hi_v64qi
;
16499 extract_lo
= gen_vec_extract_lo_v64qi
;
16502 vhi
= gen_reg_rtx (half_mode
);
16503 vlo
= gen_reg_rtx (half_mode
);
16504 idx_hi
= gen_reg_rtx (GET_MODE (idx
));
16505 emit_insn (extract_hi (vhi
, target
));
16506 emit_insn (extract_lo (vlo
, target
));
16509 vec
[2] = GEN_INT (n_elts
/2);
16510 ix86_expand_binary_operator (MINUS
, GET_MODE (idx
), vec
);
16511 ix86_expand_vector_set_var (vhi
, val
, idx_hi
);
16512 ix86_expand_vector_set_var (vlo
, val
, idx
);
16513 emit_insn (gen_rtx_SET (target
, gen_rtx_VEC_CONCAT (mode
, vlo
, vhi
)));
16517 if (FLOAT_MODE_P (GET_MODE_INNER (mode
)))
16522 cmp_mode
= V2DImode
;
16525 cmp_mode
= V4DImode
;
16528 cmp_mode
= V8DImode
;
16531 cmp_mode
= V2SImode
;
16534 cmp_mode
= V4SImode
;
16537 cmp_mode
= V8SImode
;
16540 cmp_mode
= V16SImode
;
16543 cmp_mode
= V8HImode
;
16546 cmp_mode
= V16HImode
;
16549 cmp_mode
= V32HImode
;
16552 cmp_mode
= V8HImode
;
16555 cmp_mode
= V16HImode
;
16558 cmp_mode
= V32HImode
;
16561 gcc_unreachable ();
16565 for (int i
= 0; i
!= n_elts
; i
++)
16566 vec
[i
] = GEN_INT (i
);
16567 constv
= gen_rtx_CONST_VECTOR (cmp_mode
, gen_rtvec_v (n_elts
, vec
));
16568 valv
= gen_reg_rtx (mode
);
16569 idxv
= gen_reg_rtx (cmp_mode
);
16570 idx_tmp
= convert_to_mode (GET_MODE_INNER (cmp_mode
), idx
, 1);
16572 ok
= ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE
,
16575 ok
= ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE
,
16576 cmp_mode
, idxv
, idx_tmp
);
16581 vec
[3] = gen_rtx_EQ (mode
, idxv
, constv
);
16584 ok
= ix86_expand_int_vcond (vec
);
16589 ix86_expand_vector_set (bool mmx_ok
, rtx target
, rtx val
, int elt
)
16591 machine_mode mode
= GET_MODE (target
);
16592 machine_mode inner_mode
= GET_MODE_INNER (mode
);
16593 machine_mode half_mode
;
16594 bool use_vec_merge
= false;
16595 bool blendm_const
= false;
16597 static rtx (*gen_extract
[8][2]) (rtx
, rtx
)
16599 { gen_vec_extract_lo_v32qi
, gen_vec_extract_hi_v32qi
},
16600 { gen_vec_extract_lo_v16hi
, gen_vec_extract_hi_v16hi
},
16601 { gen_vec_extract_lo_v8si
, gen_vec_extract_hi_v8si
},
16602 { gen_vec_extract_lo_v4di
, gen_vec_extract_hi_v4di
},
16603 { gen_vec_extract_lo_v8sf
, gen_vec_extract_hi_v8sf
},
16604 { gen_vec_extract_lo_v4df
, gen_vec_extract_hi_v4df
},
16605 { gen_vec_extract_lo_v16hf
, gen_vec_extract_hi_v16hf
},
16606 { gen_vec_extract_lo_v16bf
, gen_vec_extract_hi_v16bf
}
16608 static rtx (*gen_insert
[8][2]) (rtx
, rtx
, rtx
)
16610 { gen_vec_set_lo_v32qi
, gen_vec_set_hi_v32qi
},
16611 { gen_vec_set_lo_v16hi
, gen_vec_set_hi_v16hi
},
16612 { gen_vec_set_lo_v8si
, gen_vec_set_hi_v8si
},
16613 { gen_vec_set_lo_v4di
, gen_vec_set_hi_v4di
},
16614 { gen_vec_set_lo_v8sf
, gen_vec_set_hi_v8sf
},
16615 { gen_vec_set_lo_v4df
, gen_vec_set_hi_v4df
},
16616 { gen_vec_set_lo_v16hf
, gen_vec_set_hi_v16hf
},
16617 { gen_vec_set_lo_v16bf
, gen_vec_set_hi_v16bf
},
16620 machine_mode mmode
= VOIDmode
;
16621 rtx (*gen_blendm
) (rtx
, rtx
, rtx
, rtx
);
16626 use_vec_merge
= TARGET_MMX_WITH_SSE
&& TARGET_SSE4_1
;
16634 tmp
= gen_reg_rtx (GET_MODE_INNER (mode
));
16635 ix86_expand_vector_extract (true, tmp
, target
, 1 - elt
);
16637 tmp
= gen_rtx_VEC_CONCAT (mode
, val
, tmp
);
16639 tmp
= gen_rtx_VEC_CONCAT (mode
, tmp
, val
);
16640 emit_insn (gen_rtx_SET (target
, tmp
));
16646 use_vec_merge
= TARGET_SSE4_1
&& TARGET_64BIT
;
16650 tmp
= gen_reg_rtx (GET_MODE_INNER (mode
));
16651 ix86_expand_vector_extract (false, tmp
, target
, 1 - elt
);
16653 tmp
= gen_rtx_VEC_CONCAT (mode
, val
, tmp
);
16655 tmp
= gen_rtx_VEC_CONCAT (mode
, tmp
, val
);
16656 emit_insn (gen_rtx_SET (target
, tmp
));
16660 /* NB: For ELT == 0, use standard scalar operation patterns which
16661 preserve the rest of the vector for combiner:
16664 (vec_duplicate:V2DF (reg:DF))
16674 /* For the two element vectors, we implement a VEC_CONCAT with
16675 the extraction of the other element. */
16677 tmp
= gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (1, GEN_INT (1 - elt
)));
16678 tmp
= gen_rtx_VEC_SELECT (inner_mode
, target
, tmp
);
16681 op0
= val
, op1
= tmp
;
16683 op0
= tmp
, op1
= val
;
16685 tmp
= gen_rtx_VEC_CONCAT (mode
, op0
, op1
);
16686 emit_insn (gen_rtx_SET (target
, tmp
));
16691 use_vec_merge
= TARGET_SSE4_1
;
16698 use_vec_merge
= true;
16702 /* tmp = target = A B C D */
16703 tmp
= copy_to_reg (target
);
16704 /* target = A A B B */
16705 emit_insn (gen_vec_interleave_lowv4sf (target
, target
, target
));
16706 /* target = X A B B */
16707 ix86_expand_vector_set (false, target
, val
, 0);
16708 /* target = A X C D */
16709 emit_insn (gen_sse_shufps_v4sf (target
, target
, tmp
,
16710 const1_rtx
, const0_rtx
,
16711 GEN_INT (2+4), GEN_INT (3+4)));
16715 /* tmp = target = A B C D */
16716 tmp
= copy_to_reg (target
);
16717 /* tmp = X B C D */
16718 ix86_expand_vector_set (false, tmp
, val
, 0);
16719 /* target = A B X D */
16720 emit_insn (gen_sse_shufps_v4sf (target
, target
, tmp
,
16721 const0_rtx
, const1_rtx
,
16722 GEN_INT (0+4), GEN_INT (3+4)));
16726 /* tmp = target = A B C D */
16727 tmp
= copy_to_reg (target
);
16728 /* tmp = X B C D */
16729 ix86_expand_vector_set (false, tmp
, val
, 0);
16730 /* target = A B X D */
16731 emit_insn (gen_sse_shufps_v4sf (target
, target
, tmp
,
16732 const0_rtx
, const1_rtx
,
16733 GEN_INT (2+4), GEN_INT (0+4)));
16737 gcc_unreachable ();
16742 use_vec_merge
= TARGET_SSE4_1
;
16746 /* Element 0 handled by vec_merge below. */
16749 use_vec_merge
= true;
16755 /* With SSE2, use integer shuffles to swap element 0 and ELT,
16756 store into element 0, then shuffle them back. */
16760 order
[0] = GEN_INT (elt
);
16761 order
[1] = const1_rtx
;
16762 order
[2] = const2_rtx
;
16763 order
[3] = GEN_INT (3);
16764 order
[elt
] = const0_rtx
;
16766 emit_insn (gen_sse2_pshufd_1 (target
, target
, order
[0],
16767 order
[1], order
[2], order
[3]));
16769 ix86_expand_vector_set (false, target
, val
, 0);
16771 emit_insn (gen_sse2_pshufd_1 (target
, target
, order
[0],
16772 order
[1], order
[2], order
[3]));
16776 /* For SSE1, we have to reuse the V4SF code. */
16777 rtx t
= gen_reg_rtx (V4SFmode
);
16778 emit_move_insn (t
, gen_lowpart (V4SFmode
, target
));
16779 ix86_expand_vector_set (false, t
, gen_lowpart (SFmode
, val
), elt
);
16780 emit_move_insn (target
, gen_lowpart (mode
, t
));
16788 use_vec_merge
= TARGET_SSE2
;
16791 use_vec_merge
= mmx_ok
&& (TARGET_SSE
|| TARGET_3DNOW_A
);
16796 use_vec_merge
= TARGET_SSE4_1
;
16800 use_vec_merge
= TARGET_MMX_WITH_SSE
&& TARGET_SSE4_1
;
16804 half_mode
= V16QImode
;
16811 /* For ELT == 0, vec_setv8hf_0 can save 1 vpbroadcastw. */
16812 if (TARGET_AVX2
&& elt
!= 0)
16815 gen_blendm
= ((mode
== E_V16HFmode
) ? gen_avx2_pblendph_1
16816 : gen_avx2_pblendbf_1
);
16817 blendm_const
= true;
16822 half_mode
= ((mode
== E_V16HFmode
) ? V8HFmode
: V8BFmode
);
16823 j
= ((mode
== E_V16HFmode
) ? 6 : 7);
16829 half_mode
= V8HImode
;
16835 half_mode
= V4SImode
;
16841 half_mode
= V2DImode
;
16847 half_mode
= V4SFmode
;
16853 half_mode
= V2DFmode
;
16859 /* Compute offset. */
16863 gcc_assert (i
<= 1);
16865 /* Extract the half. */
16866 tmp
= gen_reg_rtx (half_mode
);
16867 emit_insn (gen_extract
[j
][i
] (tmp
, target
));
16869 /* Put val in tmp at elt. */
16870 ix86_expand_vector_set (false, tmp
, val
, elt
);
16873 emit_insn (gen_insert
[j
][i
] (target
, target
, tmp
));
16877 if (TARGET_AVX512F
)
16880 gen_blendm
= gen_avx512f_blendmv8df
;
16885 if (TARGET_AVX512F
)
16888 gen_blendm
= gen_avx512f_blendmv8di
;
16893 if (TARGET_AVX512F
)
16896 gen_blendm
= gen_avx512f_blendmv16sf
;
16901 if (TARGET_AVX512F
)
16904 gen_blendm
= gen_avx512f_blendmv16si
;
16909 if (TARGET_AVX512BW
)
16912 gen_blendm
= gen_avx512bw_blendmv32hf
;
16916 if (TARGET_AVX512BW
)
16919 gen_blendm
= gen_avx512bw_blendmv32bf
;
16923 if (TARGET_AVX512BW
)
16926 gen_blendm
= gen_avx512bw_blendmv32hi
;
16928 else if (TARGET_AVX512F
)
16930 half_mode
= E_V8HImode
;
16937 if (TARGET_AVX512BW
)
16940 gen_blendm
= gen_avx512bw_blendmv64qi
;
16942 else if (TARGET_AVX512F
)
16944 half_mode
= E_V16QImode
;
16951 /* Compute offset. */
16955 gcc_assert (i
<= 3);
16958 /* Extract the quarter. */
16959 tmp
= gen_reg_rtx (V4SImode
);
16960 rtx tmp2
= gen_lowpart (V16SImode
, target
);
16961 rtx mask
= gen_reg_rtx (QImode
);
16963 emit_move_insn (mask
, constm1_rtx
);
16964 emit_insn (gen_avx512f_vextracti32x4_mask (tmp
, tmp2
, GEN_INT (i
),
16967 tmp2
= gen_reg_rtx (half_mode
);
16968 emit_move_insn (tmp2
, gen_lowpart (half_mode
, tmp
));
16971 /* Put val in tmp at elt. */
16972 ix86_expand_vector_set (false, tmp
, val
, elt
);
16975 tmp2
= gen_reg_rtx (V16SImode
);
16976 rtx tmp3
= gen_lowpart (V16SImode
, target
);
16977 mask
= gen_reg_rtx (HImode
);
16978 emit_move_insn (mask
, constm1_rtx
);
16979 tmp
= gen_lowpart (V4SImode
, tmp
);
16980 emit_insn (gen_avx512f_vinserti32x4_mask (tmp2
, tmp3
, tmp
, GEN_INT (i
),
16982 emit_move_insn (target
, gen_lowpart (mode
, tmp2
));
16990 if (mmode
!= VOIDmode
)
16992 tmp
= gen_reg_rtx (mode
);
16993 emit_insn (gen_rtx_SET (tmp
, gen_rtx_VEC_DUPLICATE (mode
, val
)));
16994 rtx merge_mask
= gen_int_mode (HOST_WIDE_INT_1U
<< elt
, mmode
);
16995 /* The avx512*_blendm<mode> expanders have different operand order
16996 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
16997 elements where the mask is set and second input operand otherwise,
16998 in {sse,avx}*_*blend* the first input operand is used for elements
16999 where the mask is clear and second input operand otherwise. */
17001 merge_mask
= force_reg (mmode
, merge_mask
);
17002 emit_insn (gen_blendm (target
, target
, tmp
, merge_mask
));
17004 else if (use_vec_merge
)
17007 tmp
= gen_rtx_VEC_DUPLICATE (mode
, val
);
17008 tmp
= gen_rtx_VEC_MERGE (mode
, tmp
, target
,
17009 GEN_INT (HOST_WIDE_INT_1U
<< elt
));
17010 emit_insn (gen_rtx_SET (target
, tmp
));
17014 rtx mem
= assign_stack_temp (mode
, GET_MODE_SIZE (mode
));
17016 emit_move_insn (mem
, target
);
17018 tmp
= adjust_address (mem
, inner_mode
, elt
* GET_MODE_SIZE (inner_mode
));
17019 emit_move_insn (tmp
, val
);
17021 emit_move_insn (target
, mem
);
17026 ix86_expand_vector_extract (bool mmx_ok
, rtx target
, rtx vec
, int elt
)
17028 machine_mode mode
= GET_MODE (vec
);
17029 machine_mode inner_mode
= GET_MODE_INNER (mode
);
17030 bool use_vec_extr
= false;
17036 use_vec_extr
= TARGET_MMX_WITH_SSE
&& TARGET_SSE4_1
;
17050 use_vec_extr
= true;
17054 use_vec_extr
= TARGET_SSE4_1
;
17066 tmp
= gen_reg_rtx (mode
);
17067 emit_insn (gen_sse_shufps_v4sf (tmp
, vec
, vec
,
17068 GEN_INT (elt
), GEN_INT (elt
),
17069 GEN_INT (elt
+4), GEN_INT (elt
+4)));
17073 tmp
= gen_reg_rtx (mode
);
17074 emit_insn (gen_vec_interleave_highv4sf (tmp
, vec
, vec
));
17078 gcc_unreachable ();
17081 use_vec_extr
= true;
17086 use_vec_extr
= TARGET_SSE4_1
;
17100 tmp
= gen_reg_rtx (mode
);
17101 emit_insn (gen_sse2_pshufd_1 (tmp
, vec
,
17102 GEN_INT (elt
), GEN_INT (elt
),
17103 GEN_INT (elt
), GEN_INT (elt
)));
17107 tmp
= gen_reg_rtx (mode
);
17108 emit_insn (gen_vec_interleave_highv4si (tmp
, vec
, vec
));
17112 gcc_unreachable ();
17115 use_vec_extr
= true;
17120 /* For SSE1, we have to reuse the V4SF code. */
17121 ix86_expand_vector_extract (false, gen_lowpart (SFmode
, target
),
17122 gen_lowpart (V4SFmode
, vec
), elt
);
17131 use_vec_extr
= TARGET_SSE2
;
17134 use_vec_extr
= mmx_ok
&& (TARGET_SSE
|| TARGET_3DNOW_A
);
17138 use_vec_extr
= TARGET_SSE4_1
;
17142 && (optimize_insn_for_size_p () || TARGET_INTER_UNIT_MOVES_FROM_VEC
))
17144 tmp
= gen_reg_rtx (SImode
);
17145 ix86_expand_vector_extract (false, tmp
, gen_lowpart (V4SImode
, vec
),
17147 emit_insn (gen_rtx_SET (target
, gen_lowpart (QImode
, tmp
)));
17152 use_vec_extr
= TARGET_SSE4_1
;
17158 tmp
= gen_reg_rtx (V4SFmode
);
17160 emit_insn (gen_vec_extract_lo_v8sf (tmp
, vec
));
17162 emit_insn (gen_vec_extract_hi_v8sf (tmp
, vec
));
17163 ix86_expand_vector_extract (false, target
, tmp
, elt
& 3);
17171 tmp
= gen_reg_rtx (V2DFmode
);
17173 emit_insn (gen_vec_extract_lo_v4df (tmp
, vec
));
17175 emit_insn (gen_vec_extract_hi_v4df (tmp
, vec
));
17176 ix86_expand_vector_extract (false, target
, tmp
, elt
& 1);
17184 tmp
= gen_reg_rtx (V16QImode
);
17186 emit_insn (gen_vec_extract_lo_v32qi (tmp
, vec
));
17188 emit_insn (gen_vec_extract_hi_v32qi (tmp
, vec
));
17189 ix86_expand_vector_extract (false, target
, tmp
, elt
& 15);
17197 tmp
= gen_reg_rtx (V8HImode
);
17199 emit_insn (gen_vec_extract_lo_v16hi (tmp
, vec
));
17201 emit_insn (gen_vec_extract_hi_v16hi (tmp
, vec
));
17202 ix86_expand_vector_extract (false, target
, tmp
, elt
& 7);
17210 tmp
= gen_reg_rtx (V4SImode
);
17212 emit_insn (gen_vec_extract_lo_v8si (tmp
, vec
));
17214 emit_insn (gen_vec_extract_hi_v8si (tmp
, vec
));
17215 ix86_expand_vector_extract (false, target
, tmp
, elt
& 3);
17223 tmp
= gen_reg_rtx (V2DImode
);
17225 emit_insn (gen_vec_extract_lo_v4di (tmp
, vec
));
17227 emit_insn (gen_vec_extract_hi_v4di (tmp
, vec
));
17228 ix86_expand_vector_extract (false, target
, tmp
, elt
& 1);
17234 if (TARGET_AVX512BW
)
17236 tmp
= gen_reg_rtx (V16HImode
);
17238 emit_insn (gen_vec_extract_lo_v32hi (tmp
, vec
));
17240 emit_insn (gen_vec_extract_hi_v32hi (tmp
, vec
));
17241 ix86_expand_vector_extract (false, target
, tmp
, elt
& 15);
17247 if (TARGET_AVX512BW
)
17249 tmp
= gen_reg_rtx (V32QImode
);
17251 emit_insn (gen_vec_extract_lo_v64qi (tmp
, vec
));
17253 emit_insn (gen_vec_extract_hi_v64qi (tmp
, vec
));
17254 ix86_expand_vector_extract (false, target
, tmp
, elt
& 31);
17260 tmp
= gen_reg_rtx (V8SFmode
);
17262 emit_insn (gen_vec_extract_lo_v16sf (tmp
, vec
));
17264 emit_insn (gen_vec_extract_hi_v16sf (tmp
, vec
));
17265 ix86_expand_vector_extract (false, target
, tmp
, elt
& 7);
17269 tmp
= gen_reg_rtx (V4DFmode
);
17271 emit_insn (gen_vec_extract_lo_v8df (tmp
, vec
));
17273 emit_insn (gen_vec_extract_hi_v8df (tmp
, vec
));
17274 ix86_expand_vector_extract (false, target
, tmp
, elt
& 3);
17278 tmp
= gen_reg_rtx (V8SImode
);
17280 emit_insn (gen_vec_extract_lo_v16si (tmp
, vec
));
17282 emit_insn (gen_vec_extract_hi_v16si (tmp
, vec
));
17283 ix86_expand_vector_extract (false, target
, tmp
, elt
& 7);
17287 tmp
= gen_reg_rtx (V4DImode
);
17289 emit_insn (gen_vec_extract_lo_v8di (tmp
, vec
));
17291 emit_insn (gen_vec_extract_hi_v8di (tmp
, vec
));
17292 ix86_expand_vector_extract (false, target
, tmp
, elt
& 3);
17297 if (TARGET_AVX512BW
)
17299 tmp
= (mode
== E_V32HFmode
17300 ? gen_reg_rtx (V16HFmode
)
17301 : gen_reg_rtx (V16BFmode
));
17303 emit_insn (maybe_gen_vec_extract_lo (mode
, tmp
, vec
));
17305 emit_insn (maybe_gen_vec_extract_hi (mode
, tmp
, vec
));
17306 ix86_expand_vector_extract (false, target
, tmp
, elt
& 15);
17315 tmp
= (mode
== E_V16HFmode
17316 ? gen_reg_rtx (V8HFmode
)
17317 : gen_reg_rtx (V8BFmode
));
17319 emit_insn (maybe_gen_vec_extract_lo (mode
, tmp
, vec
));
17321 emit_insn (maybe_gen_vec_extract_hi (mode
, tmp
, vec
));
17322 ix86_expand_vector_extract (false, target
, tmp
, elt
& 7);
17328 use_vec_extr
= TARGET_MMX_WITH_SSE
&& TARGET_SSE4_1
;
17329 /* ??? Could extract the appropriate HImode element and shift. */
17338 tmp
= gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (1, GEN_INT (elt
)));
17339 tmp
= gen_rtx_VEC_SELECT (inner_mode
, vec
, tmp
);
17341 /* Let the rtl optimizers know about the zero extension performed. */
17342 if (inner_mode
== QImode
|| inner_mode
== HImode
)
17344 rtx reg
= gen_reg_rtx (SImode
);
17345 tmp
= gen_rtx_ZERO_EXTEND (SImode
, tmp
);
17346 emit_move_insn (reg
, tmp
);
17347 tmp
= gen_lowpart (inner_mode
, reg
);
17348 SUBREG_PROMOTED_VAR_P (tmp
) = 1;
17349 SUBREG_PROMOTED_SET (tmp
, 1);
17352 emit_move_insn (target
, tmp
);
17356 rtx mem
= assign_stack_temp (mode
, GET_MODE_SIZE (mode
));
17358 emit_move_insn (mem
, vec
);
17360 tmp
= adjust_address (mem
, inner_mode
, elt
*GET_MODE_SIZE (inner_mode
));
17361 emit_move_insn (target
, tmp
);
17365 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
17366 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
17367 The upper bits of DEST are undefined, though they shouldn't cause
17368 exceptions (some bits from src or all zeros are ok). */
17371 emit_reduc_half (rtx dest
, rtx src
, int i
)
17374 switch (GET_MODE (src
))
17378 tem
= gen_sse_movhlps (dest
, src
, src
);
17380 tem
= gen_sse_shufps_v4sf (dest
, src
, src
, const1_rtx
, const1_rtx
,
17381 GEN_INT (1 + 4), GEN_INT (1 + 4));
17384 tem
= gen_vec_interleave_highv2df (dest
, src
, src
);
17387 d
= gen_reg_rtx (V1SImode
);
17388 tem
= gen_mmx_lshrv1si3 (d
, gen_lowpart (V1SImode
, src
),
17392 d
= gen_reg_rtx (V1DImode
);
17393 tem
= gen_mmx_lshrv1di3 (d
, gen_lowpart (V1DImode
, src
),
17401 d
= gen_reg_rtx (V1TImode
);
17402 tem
= gen_sse2_lshrv1ti3 (d
, gen_lowpart (V1TImode
, src
),
17407 tem
= gen_avx_vperm2f128v8sf3 (dest
, src
, src
, const1_rtx
);
17409 tem
= gen_avx_shufps256 (dest
, src
, src
,
17410 GEN_INT (i
== 128 ? 2 + (3 << 2) : 1));
17414 tem
= gen_avx_vperm2f128v4df3 (dest
, src
, src
, const1_rtx
);
17416 tem
= gen_avx_shufpd256 (dest
, src
, src
, const1_rtx
);
17425 if (GET_MODE (dest
) != V4DImode
)
17426 d
= gen_reg_rtx (V4DImode
);
17427 tem
= gen_avx2_permv2ti (d
, gen_lowpart (V4DImode
, src
),
17428 gen_lowpart (V4DImode
, src
),
17433 d
= gen_reg_rtx (V2TImode
);
17434 tem
= gen_avx2_lshrv2ti3 (d
, gen_lowpart (V2TImode
, src
),
17443 d
= gen_reg_rtx (V4TImode
);
17444 tem
= gen_avx512bw_lshrv4ti3 (d
, gen_lowpart (V4TImode
, src
),
17454 tem
= gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode
, dest
),
17455 gen_lowpart (V16SImode
, src
),
17456 gen_lowpart (V16SImode
, src
),
17457 GEN_INT (0x4 + (i
== 512 ? 4 : 0)),
17458 GEN_INT (0x5 + (i
== 512 ? 4 : 0)),
17459 GEN_INT (0x6 + (i
== 512 ? 4 : 0)),
17460 GEN_INT (0x7 + (i
== 512 ? 4 : 0)),
17461 GEN_INT (0xC), GEN_INT (0xD),
17462 GEN_INT (0xE), GEN_INT (0xF),
17463 GEN_INT (0x10), GEN_INT (0x11),
17464 GEN_INT (0x12), GEN_INT (0x13),
17465 GEN_INT (0x14), GEN_INT (0x15),
17466 GEN_INT (0x16), GEN_INT (0x17));
17468 tem
= gen_avx512f_pshufd_1 (gen_lowpart (V16SImode
, dest
),
17469 gen_lowpart (V16SImode
, src
),
17470 GEN_INT (i
== 128 ? 0x2 : 0x1),
17474 GEN_INT (i
== 128 ? 0x6 : 0x5),
17478 GEN_INT (i
== 128 ? 0xA : 0x9),
17482 GEN_INT (i
== 128 ? 0xE : 0xD),
17488 gcc_unreachable ();
17492 emit_move_insn (dest
, gen_lowpart (GET_MODE (dest
), d
));
/* Expand a vector reduction.  FN is the binary pattern to reduce;
   DEST is the destination; IN is the input vector.  */

void
ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
{
  rtx half, dst, vec = in;
  machine_mode mode = GET_MODE (in);
  int i;

  /* SSE4 has a special instruction for V8HImode UMIN reduction.  */
  if (TARGET_SSE4_1
      && mode == V8HImode
      && fn == gen_uminv8hi3)
    {
      emit_insn (gen_sse4_1_phminposuw (dest, in));
      return;
    }

  for (i = GET_MODE_BITSIZE (mode);
       i > GET_MODE_UNIT_BITSIZE (mode);
       i >>= 1)
    {
      half = gen_reg_rtx (mode);
      emit_reduc_half (half, vec, i);
      if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
	dst = dest;
      else
	dst = gen_reg_rtx (mode);
      emit_insn (fn (dst, half, vec));
      vec = dst;
    }
}
/* Output code to perform a conditional jump to LABEL, if C2 flag in
   FP status register is set.  */

void
ix86_emit_fp_unordered_jump (rtx label)
{
  rtx reg = gen_reg_rtx (HImode);
  rtx_insn *insn;
  rtx temp;

  emit_insn (gen_x86_fnstsw_1 (reg));

  if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
    {
      emit_insn (gen_x86_sahf_1 (reg));

      temp = gen_rtx_REG (CCmode, FLAGS_REG);
      temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
    }
  else
    {
      emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));

      temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
      temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
    }

  temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
			       gen_rtx_LABEL_REF (VOIDmode, label),
			       pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
  predict_jump (REG_BR_PROB_BASE * 10 / 100);
  JUMP_LABEL (insn) = label;
}
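/* Illustrative sketch (not part of the compiled code): the x87 status
   word stored by fnstsw keeps C2 in bit 10, so testing 0x04 against the
   high byte of REG, as above, checks exactly that flag.

     // Decide whether C2 is set, given an fnstsw result.
     int
     fp_c2_set (unsigned short status_word)
     {
       return (status_word >> 8) & 0x04;
     }
  */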
/* Output code to perform a sinh XFmode calculation.  */

void
ix86_emit_i387_sinh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx cst1, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = expm1 (|op1|) */
  emit_insn (gen_absxf2 (e2, op1));
  emit_insn (gen_expm1xf2 (e1, e2));

  /* e2 = e1 / (e1 + 1.0) + e1 */
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_addxf3 (e2, e1, cst1));
  emit_insn (gen_divxf3 (e2, e1, e2));
  emit_insn (gen_addxf3 (e2, e2, e1));

  /* flags = signbit (op1) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (flags) then e2 = -e2 */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  /* op0 = 0.5 * e2 */
  half = force_reg (XFmode, half);
  emit_insn (gen_mulxf3 (op0, e2, half));
}
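/* In scalar terms the sequence above computes (illustrative sketch,
   assuming <math.h> expm1/fabs/copysign):

     // sinh(x) = sign(x) * 0.5 * (e + e / (e + 1)), with e = expm1(|x|).
     double
     sinh_sketch (double x)
     {
       double e = expm1 (fabs (x));
       double r = 0.5 * (e + e / (e + 1.0));
       return copysign (r, x);
     }
  */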
/* Output code to perform a cosh XFmode calculation.  */

void
ix86_emit_i387_cosh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx cst1;

  /* e1 = exp (op1) */
  emit_insn (gen_expxf2 (e1, op1));

  /* e2 = e1 + 1.0 / e1 */
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_divxf3 (e2, cst1, e1));
  emit_insn (gen_addxf3 (e2, e1, e2));

  /* op0 = 0.5 * e2 */
  half = force_reg (XFmode, half);
  emit_insn (gen_mulxf3 (op0, e2, half));
}
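/* In scalar terms the sequence above computes (illustrative sketch,
   assuming <math.h> exp):

     // cosh(x) = 0.5 * (e + 1/e), with e = exp(x).
     double
     cosh_sketch (double x)
     {
       double e = exp (x);
       return 0.5 * (e + 1.0 / e);
     }
  */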
/* Output code to perform a tanh XFmode calculation.  */

void
ix86_emit_i387_tanh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx cst2, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = expm1 (-|2 * op1|) */
  emit_insn (gen_addxf3 (e2, op1, op1));
  emit_insn (gen_absxf2 (e2, e2));
  emit_insn (gen_negxf2 (e2, e2));
  emit_insn (gen_expm1xf2 (e1, e2));

  /* e2 = e1 / (e1 + 2.0) */
  cst2 = force_reg (XFmode, CONST2_RTX (XFmode));
  emit_insn (gen_addxf3 (e2, e1, cst2));
  emit_insn (gen_divxf3 (e2, e1, e2));

  /* flags = signbit (op1) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (!flags) then e2 = -e2 */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_NE (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  emit_move_insn (op0, e2);
}
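/* In scalar terms the sequence above computes (illustrative sketch,
   assuming <math.h> expm1/fabs/copysign):

     // tanh(x) = sign(x) * (-e / (e + 2)), with e = expm1(-2*|x|).
     double
     tanh_sketch (double x)
     {
       double e = expm1 (-fabs (x + x));
       return copysign (-e / (e + 2.0), x);
     }
  */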
/* Output code to perform an asinh XFmode calculation.  */

void
ix86_emit_i387_asinh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx cst1, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* e2 = sqrt (op1^2 + 1.0) + 1.0 */
  emit_insn (gen_mulxf3 (e1, op1, op1));
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_addxf3 (e2, e1, cst1));
  emit_insn (gen_sqrtxf2 (e2, e2));
  emit_insn (gen_addxf3 (e2, e2, cst1));

  /* e1 = e1 / e2 */
  emit_insn (gen_divxf3 (e1, e1, e2));

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = e1 + |op1| */
  emit_insn (gen_absxf2 (e2, op1));
  emit_insn (gen_addxf3 (e1, e1, e2));

  /* e2 = log1p (e1) */
  ix86_emit_i387_log1p (e2, e1);

  /* flags = signbit (op1) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (flags) then e2 = -e2 */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  emit_move_insn (op0, e2);
}
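/* In scalar terms the sequence above computes (illustrative sketch,
   assuming <math.h> sqrt/log1p/fabs/copysign):

     // asinh(x) = sign(x) * log1p (|x| + x*x / (sqrt(x*x + 1) + 1)).
     double
     asinh_sketch (double x)
     {
       double t = x * x;
       double r = log1p (fabs (x) + t / (sqrt (t + 1.0) + 1.0));
       return copysign (r, x);
     }
  */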
/* Output code to perform an acosh XFmode calculation.  */

void
ix86_emit_i387_acosh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode));

  /* e2 = sqrt (op1 + 1.0) */
  emit_insn (gen_addxf3 (e2, op1, cst1));
  emit_insn (gen_sqrtxf2 (e2, e2));

  /* e1 = sqrt (op1 - 1.0) */
  emit_insn (gen_subxf3 (e1, op1, cst1));
  emit_insn (gen_sqrtxf2 (e1, e1));

  /* e1 = e1 * e2 */
  emit_insn (gen_mulxf3 (e1, e1, e2));

  /* e1 = e1 + op1 */
  emit_insn (gen_addxf3 (e1, e1, op1));

  /* op0 = log (e1) */
  emit_insn (gen_logxf2 (op0, e1));
}
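/* In scalar terms the sequence above computes (illustrative sketch,
   assuming <math.h> sqrt/log):

     // acosh(x) = log (x + sqrt(x - 1) * sqrt(x + 1)), for x >= 1.
     double
     acosh_sketch (double x)
     {
       return log (x + sqrt (x - 1.0) * sqrt (x + 1.0));
     }
  */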
/* Output code to perform an atanh XFmode calculation.  */

void
ix86_emit_i387_atanh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx cst1, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e2 = |op1| */
  emit_insn (gen_absxf2 (e2, op1));

  /* e1 = -(e2 + e2) / (e2 + 1.0) */
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_addxf3 (e1, e2, cst1));
  emit_insn (gen_addxf3 (e2, e2, e2));
  emit_insn (gen_negxf2 (e2, e2));
  emit_insn (gen_divxf3 (e1, e2, e1));

  /* e2 = log1p (e1) */
  ix86_emit_i387_log1p (e2, e1);

  /* flags = signbit (op1) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (!flags) then e2 = -e2 */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_NE (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  /* op0 = 0.5 * e2 */
  half = force_reg (XFmode, half);
  emit_insn (gen_mulxf3 (op0, e2, half));
}
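/* In scalar terms the sequence above computes (illustrative sketch,
   assuming <math.h> log1p/fabs/copysign):

     // atanh(x) = sign(x) * -0.5 * log1p (-2*|x| / (|x| + 1)).
     double
     atanh_sketch (double x)
     {
       double a = fabs (x);
       double r = -0.5 * log1p (-(a + a) / (a + 1.0));
       return copysign (r, x);
     }
  */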
/* Output code to perform a log1p XFmode calculation.  */

void
ix86_emit_i387_log1p (rtx op0, rtx op1)
{
  rtx_code_label *label1 = gen_label_rtx ();
  rtx_code_label *label2 = gen_label_rtx ();

  rtx tmp = gen_reg_rtx (XFmode);
  rtx res = gen_reg_rtx (XFmode);
  rtx cst, cstln2, cst1;
  rtx_insn *insn;

  /* The emit_jump call emits pending stack adjust, make sure it is emitted
     before the conditional jump, otherwise the stack adjustment will be
     only conditional.  */
  do_pending_stack_adjust ();

  cst = const_double_from_real_value
    (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode);
  cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */

  emit_insn (gen_absxf2 (tmp, op1));

  cst = force_reg (XFmode, cst);
  ix86_expand_branch (GE, tmp, cst, label1);
  predict_jump (REG_BR_PROB_BASE * 10 / 100);
  insn = get_last_insn ();
  JUMP_LABEL (insn) = label1;

  emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2));
  emit_jump (label2);

  emit_label (label1);
  LABEL_NUSES (label1) = 1;

  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1)));
  emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2));

  emit_label (label2);
  LABEL_NUSES (label2) = 1;

  emit_move_insn (op0, res);
}
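/* In scalar terms the branch above chooses between the two i387 log
   primitives (illustrative sketch assuming <math.h> fabs/log/log1p; the
   threshold 1 - sqrt(1/2) matches the constant used above):

     // fyl2xp1 is only accurate for small arguments; fall back to
     // plain log (1 + x) once |x| crosses the threshold.
     double
     log1p_sketch (double x)
     {
       const double threshold = 0.29289321881345247561810596348408353;
       if (fabs (x) >= threshold)
	 return log (1.0 + x);
       return log1p (x);
     }
  */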
17861 /* Emit code for round calculation. */
void
ix86_emit_i387_round (rtx op0, rtx op1)
{
  machine_mode inmode = GET_MODE (op1);
  machine_mode outmode = GET_MODE (op0);
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx res = gen_reg_rtx (outmode);
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx (*floor_insn) (rtx, rtx);
  rtx (*neg_insn) (rtx, rtx);
  rtx_insn *insn;
  rtx tmp;

  switch (inmode)
    {
    case E_SFmode:
    case E_DFmode:
      tmp = gen_reg_rtx (XFmode);

      emit_insn (gen_rtx_SET (tmp, gen_rtx_FLOAT_EXTEND (XFmode, op1)));
      op1 = tmp;
      break;
    case E_XFmode:
      break;
    default:
      gcc_unreachable ();
    }

  switch (outmode)
    {
    case E_SFmode:
      floor_insn = gen_frndintxf2_floor;
      neg_insn = gen_negsf2;
      break;
    case E_DFmode:
      floor_insn = gen_frndintxf2_floor;
      neg_insn = gen_negdf2;
      break;
    case E_XFmode:
      floor_insn = gen_frndintxf2_floor;
      neg_insn = gen_negxf2;
      break;
    case E_HImode:
      floor_insn = gen_lfloorxfhi2;
      neg_insn = gen_neghi2;
      break;
    case E_SImode:
      floor_insn = gen_lfloorxfsi2;
      neg_insn = gen_negsi2;
      break;
    case E_DImode:
      floor_insn = gen_lfloorxfdi2;
      neg_insn = gen_negdi2;
      break;
    default:
      gcc_unreachable ();
    }

  /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */

  /* scratch = fxam(op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = fabs(op1) */
  emit_insn (gen_absxf2 (e1, op1));

  /* e2 = e1 + 0.5 */
  half = force_reg (XFmode, half);
  emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (XFmode, e1, half)));

  /* res = floor(e2) */
  if (outmode == SFmode || outmode == DFmode)
    {
      tmp = gen_reg_rtx (XFmode);

      emit_insn (floor_insn (tmp, e2));
      emit_insn (gen_rtx_SET (res,
			      gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp),
					      UNSPEC_TRUNC_NOOP)));
    }
  else
    emit_insn (floor_insn (res, e2));

  /* flags = signbit(a) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (flags) then res = -res */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (neg_insn (res, res));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  emit_move_insn (op0, res);
}
/* Output code to perform a Newton-Raphson approximation of a single precision
   floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm].  */
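/* One Newton-Raphson step for the reciprocal refines an estimate x0 of
   1/b as x1 = x0 * (2 - b * x0), which the sequence below evaluates as
   x1 = (x0 + x0) - (b * x0 * x0) so that it maps directly onto the
   multiplies, one add and one subtract around the hardware rcp
   estimate.  */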
void
ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
{
  rtx x0, x1, e0, e1;

  x0 = gen_reg_rtx (mode);
  e0 = gen_reg_rtx (mode);
  e1 = gen_reg_rtx (mode);
  x1 = gen_reg_rtx (mode);

  /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */

  b = force_reg (mode, b);

  /* x0 = rcp(b) estimate */
  if (mode == V16SFmode || mode == V8DFmode)
    {
      if (TARGET_AVX512ER)
	{
	  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
						      UNSPEC_RCP28)));
	  emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
	  return;
	}
      else
	emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
						    UNSPEC_RCP14)));
    }
  else
    emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
						UNSPEC_RCP)));

  emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));

  emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));

  emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));

  emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));

  emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
}
/* Output code to perform a Newton-Raphson approximation of a
   single precision floating point [reciprocal] square root.  */
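/* One Newton-Raphson step for the reciprocal square root refines an
   estimate x0 of 1/sqrt(a) as x1 = 0.5 * x0 * (3 - a * x0 * x0), which
   is evaluated below as -0.5 * x0 * (a * x0 * x0 - 3) so that the
   constants -3 and -0.5 can be kept in registers; multiplying the
   corrected estimate by a turns it into an approximation of sqrt(a)
   itself.  */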
void
ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
{
  rtx x0, e0, e1, e2, e3, mthree, mhalf;
  REAL_VALUE_TYPE r;
  int unspec;

  x0 = gen_reg_rtx (mode);
  e0 = gen_reg_rtx (mode);
  e1 = gen_reg_rtx (mode);
  e2 = gen_reg_rtx (mode);
  e3 = gen_reg_rtx (mode);

  if (TARGET_AVX512ER && mode == V16SFmode)
    {
      if (recip)
	/* res = rsqrt28(a) estimate */
	emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
						     UNSPEC_RSQRT28)));
      else
	{
	  /* x0 = rsqrt28(a) estimate */
	  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
						      UNSPEC_RSQRT28)));
	  /* res = rcp28(x0) estimate */
	  emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
						       UNSPEC_RCP28)));
	}
      return;
    }

  real_from_integer (&r, VOIDmode, -3, SIGNED);
  mthree = const_double_from_real_value (r, SFmode);

  real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
  mhalf = const_double_from_real_value (r, SFmode);
  unspec = UNSPEC_RSQRT;

  if (VECTOR_MODE_P (mode))
    {
      mthree = ix86_build_const_vector (mode, true, mthree);
      mhalf = ix86_build_const_vector (mode, true, mhalf);
      /* There is no 512-bit rsqrt.  There is however rsqrt14.  */
      if (GET_MODE_SIZE (mode) == 64)
	unspec = UNSPEC_RSQRT14;
    }

  /* sqrt(a)  = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
     rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */

  a = force_reg (mode, a);

  /* x0 = rsqrt(a) estimate */
  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
					      unspec)));

  /* If (a == 0.0) Filter out infinity to prevent NaN for sqrt(0.0).  */
  if (!recip)
    {
      rtx zero = force_reg (mode, CONST0_RTX (mode));
      rtx mask;

      /* Handle masked compare.  */
      if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
	{
	  mask = gen_reg_rtx (HImode);
	  /* Imm value 0x4 corresponds to not-equal comparison.  */
	  emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
	  emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
	}
      else
	{
	  mask = gen_reg_rtx (mode);
	  emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
	  emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
	}
    }

  mthree = force_reg (mode, mthree);

  emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));

  unsigned vector_size = GET_MODE_SIZE (mode);
  if (TARGET_FMA
      || (TARGET_AVX512F && vector_size == 64)
      || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
    emit_insn (gen_rtx_SET (e2,
			    gen_rtx_FMA (mode, e0, x0, mthree)));
  else
    {
      emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));

      emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
    }

  mhalf = force_reg (mode, mhalf);
  if (recip)
    /* e3 = -.5 * x0 */
    emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
  else
    /* e3 = -.5 * e0 */
    emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
  /* ret = e2 * e3 */
  emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
}
/* Expand fabs (OP0) and return a new rtx that holds the result.  The
   mask for masking out the sign-bit is stored in *SMASK, if that is
   non-null.  */
static rtx
ix86_expand_sse_fabs (rtx op0, rtx *smask)
{
  machine_mode vmode, mode = GET_MODE (op0);
  rtx xa, mask;

  xa = gen_reg_rtx (mode);
  if (mode == SFmode)
    vmode = V4SFmode;
  else if (mode == DFmode)
    vmode = V2DFmode;
  else
    vmode = mode;

  mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
  if (!VECTOR_MODE_P (mode))
    {
      /* We need to generate a scalar mode mask in this case.  */
      rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
      tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
      mask = gen_reg_rtx (mode);
      emit_insn (gen_rtx_SET (mask, tmp));
    }

  emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));

  if (smask)
    *smask = mask;

  return xa;
}
/* Expands a comparison of OP0 with OP1 using comparison code CODE,
   swapping the operands if SWAP_OPERANDS is true.  The expanded
   code is a forward jump to a newly created label in case the
   comparison is true.  The generated label rtx is returned.  */
static rtx_code_label *
ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
				  bool swap_operands)
{
  bool unordered_compare = ix86_unordered_fp_compare (code);
  rtx_code_label *label;
  rtx tmp, reg;

  if (swap_operands)
    std::swap (op0, op1);

  label = gen_label_rtx ();
  tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
  if (unordered_compare)
    tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
  reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
  emit_insn (gen_rtx_SET (reg, tmp));
  tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
			      gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
  tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  JUMP_LABEL (tmp) = label;

  return label;
}
/* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
   using comparison code CODE.  Operands are swapped for the comparison if
   SWAP_OPERANDS is true.  Returns a rtx for the generated mask.  */
static rtx
ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
			      bool swap_operands)
{
  rtx (*insn)(rtx, rtx, rtx, rtx);
  machine_mode mode = GET_MODE (op0);
  rtx mask = gen_reg_rtx (mode);

  if (swap_operands)
    std::swap (op0, op1);

  insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;

  emit_insn (insn (mask, op0, op1,
		   gen_rtx_fmt_ee (code, mode, op0, op1)));

  return mask;
}
/* Expand copysign from SIGN to the positive value ABS_VALUE
   storing in RESULT.  If MASK is non-null, it shall be a mask to mask out
   the sign bit.  */
static void
ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
{
  machine_mode mode = GET_MODE (sign);
  rtx sgn = gen_reg_rtx (mode);
  if (mask == NULL_RTX)
    {
      machine_mode vmode;

      if (mode == SFmode)
	vmode = V4SFmode;
      else if (mode == DFmode)
	vmode = V2DFmode;
      else
	vmode = mode;

      mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
      if (!VECTOR_MODE_P (mode))
	{
	  /* We need to generate a scalar mode mask in this case.  */
	  rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
	  tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
	  mask = gen_reg_rtx (mode);
	  emit_insn (gen_rtx_SET (mask, tmp));
	}
    }
  else
    mask = gen_rtx_NOT (mode, mask);

  emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
  emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
}
/* Expand SSE sequence for computing lround from OP1 storing
   into OP0.  */

void
ix86_expand_lround (rtx op0, rtx op1)
{
  /* C code for the stuff we're doing below:
	tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
	return (long)tmp;
   */
  machine_mode mode = GET_MODE (op1);
  const struct real_format *fmt;
  REAL_VALUE_TYPE pred_half, half_minus_pred_half;
  rtx adj;

  /* load nextafter (0.5, 0.0) */
  fmt = REAL_MODE_FORMAT (mode);
  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
  real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);

  /* adj = copysign (0.5, op1) */
  adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
  ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);

  /* adj = op1 + adj */
  adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);

  /* op0 = (imode)adj */
  expand_fix (op0, adj, 0);
}
/* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
   into OPERAND0.  */

void
ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
{
  /* C code for the stuff we're doing below (for do_floor):
	xi = (long)op1;
	xi -= (double)xi > op1 ? 1 : 0;
	return xi;
   */
  machine_mode fmode = GET_MODE (op1);
  machine_mode imode = GET_MODE (op0);
  rtx ireg, freg, tmp;
  rtx_code_label *label;

  /* reg = (long)op1 */
  ireg = gen_reg_rtx (imode);
  expand_fix (ireg, op1, 0);

  /* freg = (double)reg */
  freg = gen_reg_rtx (fmode);
  expand_float (freg, ireg, 0);

  /* ireg = (freg > op1) ? ireg - 1 : ireg */
  label = ix86_expand_sse_compare_and_jump (UNLE,
					    freg, op1, !do_floor);
  tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
			     ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
  emit_move_insn (ireg, tmp);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (op0, ireg);
}
/* Generate and return a rtx of mode MODE for 2**n where n is the number
   of bits of the mantissa of MODE, which must be one of DFmode or SFmode.  */
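/* The value 2**(p-1) (2**52 for DFmode, 2**23 for SFmode) is the smallest
   magnitude at which every representable value of the mode is already an
   integer; the expanders below exploit this by adding and then subtracting
   it, which forces the fraction bits to be rounded away, instead of doing
   an explicit float-to-int-to-float round trip.  */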
static rtx
ix86_gen_TWO52 (machine_mode mode)
{
  const struct real_format *fmt;
  REAL_VALUE_TYPE TWO52r;
  rtx TWO52;

  fmt = REAL_MODE_FORMAT (mode);
  real_2expN (&TWO52r, fmt->p - 1, mode);
  TWO52 = const_double_from_real_value (TWO52r, mode);
  TWO52 = force_reg (mode, TWO52);

  return TWO52;
}
/* Expand rint rounding OPERAND1 and storing the result in OPERAND0.  */
void
ix86_expand_rint (rtx operand0, rtx operand1)
{
  /* C code for the stuff we're doing below:
	xa = fabs (operand1);
	if (!isless (xa, 2**52))
	  return operand1;
	if (flag_rounding_math)
	  {
	    two52 = copysign (two52, operand1);
	    xa = operand1;
	  }
	xa = xa + two52 - two52;
	return copysign (xa, operand1);
   */
  machine_mode mode = GET_MODE (operand0);
  rtx res, xa, TWO52, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  if (flag_rounding_math)
    {
      ix86_sse_copysign_to_positive (TWO52, TWO52, res, mask);
      xa = res;
    }

  xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
  xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);

  /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
  if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
    xa = ix86_expand_sse_fabs (xa, NULL);

  ix86_sse_copysign_to_positive (res, xa, res, mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
/* Expand SSE2 sequence for computing floor or ceil
   from OPERAND1 storing into OPERAND0.  */
void
ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
{
  /* C code for the stuff we expand below.
	double xa = fabs (x), x2;
	if (!isless (xa, TWO52))
	  return x;
	x2 = (double)(long)x;
	if (HONOR_SIGNED_ZEROS (mode))
	  return copysign (x2, x);
	return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, xi, TWO52, tmp, one, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa = (double)(long)x */
  xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
  expand_fix (xi, res, 0);
  expand_float (xa, xi, 0);

  one = force_reg (mode, const_double_from_real_value (dconst1, mode));

  /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
  tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
  tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
			     xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
  if (HONOR_SIGNED_ZEROS (mode))
    {
      /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
      if (do_floor && flag_rounding_math)
	tmp = ix86_expand_sse_fabs (tmp, NULL);

      ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
    }
  emit_move_insn (res, tmp);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
/* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
   into OPERAND0 without relying on DImode truncation via cvttsd2siq
   that is only available on 64bit targets.  */
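/* Adding and subtracting TWO52 rounds to the *nearest* integer, not
   towards minus or plus infinity, so the sequence below compares the
   rounded value back against the original operand and subtracts (for
   floor) or adds (for ceil) one whenever the rounding went the wrong
   way.  */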
void
ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
{
  /* C code for the stuff we expand below.
	double xa = fabs (x), x2;
	if (!isless (xa, TWO52))
	  return x;
	xa = xa + TWO52 - TWO52;
	x2 = copysign (xa, x);
	if (HONOR_SIGNED_ZEROS (mode))
	  x2 = copysign (x2, x);
	return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, TWO52, tmp, one, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa = xa + TWO52 - TWO52; */
  xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
  xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);

  /* xa = copysign (xa, operand1) */
  ix86_sse_copysign_to_positive (xa, xa, res, mask);

  one = force_reg (mode, const_double_from_real_value (dconst1, mode));

  /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
  tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
  tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
			     xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
  if (HONOR_SIGNED_ZEROS (mode))
    {
      /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
      if (do_floor && flag_rounding_math)
	tmp = ix86_expand_sse_fabs (tmp, NULL);

      ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
    }
  emit_move_insn (res, tmp);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
/* Expand SSE sequence for computing trunc
   from OPERAND1 storing into OPERAND0.  */
void
ix86_expand_trunc (rtx operand0, rtx operand1)
{
  /* C code for SSE variant we expand below.
	double xa = fabs (x), x2;
	if (!isless (xa, TWO52))
	  return x;
	x2 = (double)(long)x;
	if (HONOR_SIGNED_ZEROS (mode))
	  return copysign (x2, x);
	return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, xi, TWO52, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa = (double)(long)x */
  xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
  expand_fix (xi, res, 0);
  expand_float (xa, xi, 0);

  if (HONOR_SIGNED_ZEROS (mode))
    ix86_sse_copysign_to_positive (xa, xa, res, mask);

  emit_move_insn (res, xa);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
/* Expand SSE sequence for computing trunc from OPERAND1 storing
   into OPERAND0 without relying on DImode truncation via cvttsd2siq
   that is only available on 64bit targets.  */
void
ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
{
  machine_mode mode = GET_MODE (operand0);
  rtx xa, xa2, TWO52, tmp, one, res, mask;
  rtx_code_label *label;

  /* C code for SSE variant we expand below.
	double xa = fabs (x), x2;
	if (!isless (xa, TWO52))
	  return x;
	xa2 = xa + TWO52 - TWO52;
	x2 = copysign (xa2, x);
	return x2;
   */

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa2 = xa + TWO52 - TWO52; */
  xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
  xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);

  one = force_reg (mode, const_double_from_real_value (dconst1, mode));

  /* Compensate: xa2 = xa2 - (xa2 > xa ? 1 : 0) */
  tmp = ix86_expand_sse_compare_mask (UNGT, xa2, xa, false);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
  tmp = expand_simple_binop (mode, MINUS,
			     xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
  /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
  if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
    tmp = ix86_expand_sse_fabs (tmp, NULL);

  /* res = copysign (xa2, operand1) */
  ix86_sse_copysign_to_positive (res, tmp, res, mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
/* Expand SSE sequence for computing round
   from OPERAND1 storing into OPERAND0.  */
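/* The "0.5" added below is really nextafter (0.5, 0.0), the largest
   representable value strictly below one half: adding a plain 0.5 to an
   operand such as the predecessor of 0.5 would round up to 1.0 in the
   addition and make the truncated result off by one, while the slightly
   smaller constant keeps such values rounding down as required.  */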
void
ix86_expand_round (rtx operand0, rtx operand1)
{
  /* C code for the stuff we're doing below:
	double xa = fabs (x);
	if (!isless (xa, TWO52))
	  return x;
	xa = (double)(long)(xa + nextafter (0.5, 0.0));
	return copysign (xa, x);
   */
  machine_mode mode = GET_MODE (operand0);
  rtx res, TWO52, xa, xi, half, mask;
  rtx_code_label *label;
  const struct real_format *fmt;
  REAL_VALUE_TYPE pred_half, half_minus_pred_half;

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  TWO52 = ix86_gen_TWO52 (mode);
  xa = ix86_expand_sse_fabs (res, &mask);
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* load nextafter (0.5, 0.0) */
  fmt = REAL_MODE_FORMAT (mode);
  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
  real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);

  /* xa = xa + 0.5 */
  half = force_reg (mode, const_double_from_real_value (pred_half, mode));
  xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);

  /* xa = (double)(int64_t)xa */
  xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
  expand_fix (xi, xa, 0);
  expand_float (xa, xi, 0);

  /* res = copysign (xa, operand1) */
  ix86_sse_copysign_to_positive (res, xa, res, mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
/* Expand SSE sequence for computing round from OPERAND1 storing
   into OPERAND0 without relying on DImode truncation via cvttsd2siq
   that is only available on 64bit targets.  */
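/* The TWO52 trick rounds to nearest with ties to even, while round ()
   must round halfway cases away from zero.  The sequence below therefore
   measures how far the rounded value drifted (dxa = xa2 - xa) and nudges
   xa2 down when the drift is above +0.5 and up when it is -0.5 or below,
   restoring round-half-away-from-zero behaviour on the absolute value.  */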
void
ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
{
  /* C code for the stuff we expand below.
	double xa = fabs (x), xa2, x2;
	if (!isless (xa, TWO52))
	  return x;
     Using the absolute value and copying back sign makes
     -0.0 -> -0.0 correct.
	xa2 = xa + TWO52 - TWO52;
	dxa = xa2 - xa;
	if (dxa <= -0.5)
	  xa2 += 1;
	else if (dxa > 0.5)
	  xa2 -= 1;
	x2 = copysign (xa2, x);
	return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa2 = xa + TWO52 - TWO52; */
  xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
  xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);

  /* dxa = xa2 - xa; */
  dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);

  /* generate 0.5, 1.0 and -0.5 */
  half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
  one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
  mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
			       0, OPTAB_DIRECT);

  /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
  tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
  xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);

  /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
  tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
  xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);

  /* res = copysign (xa2, operand1) */
  ix86_sse_copysign_to_positive (res, xa2, res, mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
/* Expand SSE sequence for computing round
   from OP1 storing into OP0 using sse4 round insn.  */
void
ix86_expand_round_sse4 (rtx op0, rtx op1)
{
  machine_mode mode = GET_MODE (op0);
  rtx e1, e2, res, half;
  const struct real_format *fmt;
  REAL_VALUE_TYPE pred_half, half_minus_pred_half;
  rtx (*gen_copysign) (rtx, rtx, rtx);
  rtx (*gen_round) (rtx, rtx, rtx);

  switch (mode)
    {
    case E_SFmode:
      gen_copysign = gen_copysignsf3;
      gen_round = gen_sse4_1_roundsf2;
      break;
    case E_DFmode:
      gen_copysign = gen_copysigndf3;
      gen_round = gen_sse4_1_rounddf2;
      break;
    default:
      gcc_unreachable ();
    }

  /* round (a) = trunc (a + copysign (0.5, a)) */

  /* load nextafter (0.5, 0.0) */
  fmt = REAL_MODE_FORMAT (mode);
  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
  real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
  half = const_double_from_real_value (pred_half, mode);

  /* e1 = copysign (0.5, op1) */
  e1 = gen_reg_rtx (mode);
  emit_insn (gen_copysign (e1, half, op1));

  /* e2 = op1 + e1 */
  e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);

  /* res = trunc (e2) */
  res = gen_reg_rtx (mode);
  emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));

  emit_move_insn (op0, res);
}
/* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
   insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
   insn every time.  */

static GTY(()) rtx_insn *vselect_insn;
/* Initialize vselect_insn.  */

static void
init_vselect_insn (void)
{
  unsigned i;
  rtx x;

  x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
  for (i = 0; i < MAX_VECT_LEN; ++i)
    XVECEXP (x, 0, i) = const0_rtx;
  x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
							const0_rtx), x);
  x = gen_rtx_SET (const0_rtx, x);
  start_sequence ();
  vselect_insn = emit_insn (x);
  end_sequence ();
}
/* Construct (set target (vec_select op0 (parallel perm))) and
   return true if that's a valid instruction in the active ISA.  */

static bool
expand_vselect (rtx target, rtx op0, const unsigned char *perm,
		unsigned nelt, bool testing_p)
{
  unsigned int i;
  rtx x, save_vconcat;
  int icode;

  if (vselect_insn == NULL_RTX)
    init_vselect_insn ();

  x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
  PUT_NUM_ELEM (XVEC (x, 0), nelt);
  for (i = 0; i < nelt; ++i)
    XVECEXP (x, 0, i) = GEN_INT (perm[i]);
  save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
  XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
  PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
  SET_DEST (PATTERN (vselect_insn)) = target;
  icode = recog_memoized (vselect_insn);

  if (icode >= 0 && !testing_p)
    emit_insn (copy_rtx (PATTERN (vselect_insn)));

  SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
  XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
  INSN_CODE (vselect_insn) = -1;

  return icode >= 0;
}
/* Similar, but generate a vec_concat from op0 and op1 as well.  */

static bool
expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
			const unsigned char *perm, unsigned nelt,
			bool testing_p)
{
  machine_mode v2mode;
  rtx x;
  bool ok;

  if (vselect_insn == NULL_RTX)
    init_vselect_insn ();

  if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
    return false;
  x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
  PUT_MODE (x, v2mode);
  XEXP (x, 0) = op0;
  XEXP (x, 1) = op1;
  ok = expand_vselect (target, x, perm, nelt, testing_p);
  XEXP (x, 0) = const0_rtx;
  XEXP (x, 1) = const0_rtx;
  return ok;
}
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   using movss or movsd.  */

static bool
expand_vec_perm_movs (struct expand_vec_perm_d *d)
{
  machine_mode vmode = d->vmode;
  unsigned i, nelt = d->nelt;
  rtx x;

  if (d->one_operand_p)
    return false;

  if (!(TARGET_SSE && vmode == V4SFmode)
      && !(TARGET_SSE && vmode == V4SImode)
      && !(TARGET_MMX_WITH_SSE && vmode == V2SFmode)
      && !(TARGET_SSE2 && vmode == V2DFmode)
      && !(TARGET_SSE2 && vmode == V2DImode))
    return false;

  /* Only the first element is changed.  */
  if (d->perm[0] != nelt && d->perm[0] != 0)
    return false;
  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != i + nelt - d->perm[0])
      return false;

  if (d->testing_p)
    return true;

  if (d->perm[0] == nelt)
    x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
  else
    x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));

  emit_insn (gen_rtx_SET (d->target, x));

  return true;
}
18934 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
18935 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
18938 expand_vec_perm_blend (struct expand_vec_perm_d
*d
)
18940 machine_mode mmode
, vmode
= d
->vmode
;
18941 unsigned i
, nelt
= d
->nelt
;
18942 unsigned HOST_WIDE_INT mask
;
18943 rtx target
, op0
, op1
, maskop
, x
;
18944 rtx rperm
[32], vperm
;
18946 if (d
->one_operand_p
)
18948 if (TARGET_AVX512F
&& GET_MODE_SIZE (vmode
) == 64
18949 && (TARGET_AVX512BW
18950 || GET_MODE_UNIT_SIZE (vmode
) >= 4))
18952 else if (TARGET_AVX2
&& GET_MODE_SIZE (vmode
) == 32)
18954 else if (TARGET_AVX
&& (vmode
== V4DFmode
|| vmode
== V8SFmode
))
18956 else if (TARGET_SSE4_1
&& (GET_MODE_SIZE (vmode
) == 16
18957 || GET_MODE_SIZE (vmode
) == 8
18958 || GET_MODE_SIZE (vmode
) == 4))
18963 /* This is a blend, not a permute. Elements must stay in their
18964 respective lanes. */
18965 for (i
= 0; i
< nelt
; ++i
)
18967 unsigned e
= d
->perm
[i
];
18968 if (!(e
== i
|| e
== i
+ nelt
))
18975 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
18976 decision should be extracted elsewhere, so that we only try that
18977 sequence once all budget==3 options have been tried. */
18978 target
= d
->target
;
18998 for (i
= 0; i
< nelt
; ++i
)
18999 mask
|= ((unsigned HOST_WIDE_INT
) (d
->perm
[i
] >= nelt
)) << i
;
19003 for (i
= 0; i
< 2; ++i
)
19004 mask
|= (d
->perm
[i
] >= 2 ? 15 : 0) << (i
* 4);
19009 for (i
= 0; i
< 2; ++i
)
19010 mask
|= (d
->perm
[i
] >= 2 ? 3 : 0) << (i
* 2);
19015 for (i
= 0; i
< 4; ++i
)
19016 mask
|= (d
->perm
[i
] >= 4 ? 3 : 0) << (i
* 2);
19021 /* See if bytes move in pairs so we can use pblendw with
19022 an immediate argument, rather than pblendvb with a vector
19024 for (i
= 0; i
< 16; i
+= 2)
19025 if (d
->perm
[i
] + 1 != d
->perm
[i
+ 1])
19028 for (i
= 0; i
< nelt
; ++i
)
19029 rperm
[i
] = (d
->perm
[i
] < nelt
? const0_rtx
: constm1_rtx
);
19032 vperm
= gen_rtx_CONST_VECTOR (vmode
, gen_rtvec_v (nelt
, rperm
));
19033 vperm
= force_reg (vmode
, vperm
);
19035 if (GET_MODE_SIZE (vmode
) == 4)
19036 emit_insn (gen_mmx_pblendvb_v4qi (target
, op0
, op1
, vperm
));
19037 else if (GET_MODE_SIZE (vmode
) == 8)
19038 emit_insn (gen_mmx_pblendvb_v8qi (target
, op0
, op1
, vperm
));
19039 else if (GET_MODE_SIZE (vmode
) == 16)
19040 emit_insn (gen_sse4_1_pblendvb (target
, op0
, op1
, vperm
));
19042 emit_insn (gen_avx2_pblendvb (target
, op0
, op1
, vperm
));
19043 if (target
!= d
->target
)
19044 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, target
));
19048 for (i
= 0; i
< 8; ++i
)
19049 mask
|= (d
->perm
[i
* 2] >= 16) << i
;
19054 target
= gen_reg_rtx (vmode
);
19055 op0
= gen_lowpart (vmode
, op0
);
19056 op1
= gen_lowpart (vmode
, op1
);
19060 for (i
= 0; i
< 8; i
+= 2)
19061 if (d
->perm
[i
] + 1 != d
->perm
[i
+ 1])
19064 for (i
= 0; i
< 4; ++i
)
19065 mask
|= (d
->perm
[i
* 2] >= 8) << i
;
19070 for (i
= 0; i
< 4; i
+= 2)
19071 if (d
->perm
[i
] + 1 != d
->perm
[i
+ 1])
19074 for (i
= 0; i
< 2; ++i
)
19075 mask
|= (d
->perm
[i
* 2] >= 4) << i
;
19080 /* See if bytes move in pairs. If not, vpblendvb must be used. */
19081 for (i
= 0; i
< 32; i
+= 2)
19082 if (d
->perm
[i
] + 1 != d
->perm
[i
+ 1])
19084 /* See if bytes move in quadruplets. If yes, vpblendd
19085 with immediate can be used. */
19086 for (i
= 0; i
< 32; i
+= 4)
19087 if (d
->perm
[i
] + 2 != d
->perm
[i
+ 2])
19091 /* See if bytes move the same in both lanes. If yes,
19092 vpblendw with immediate can be used. */
19093 for (i
= 0; i
< 16; i
+= 2)
19094 if (d
->perm
[i
] + 16 != d
->perm
[i
+ 16])
19097 /* Use vpblendw. */
19098 for (i
= 0; i
< 16; ++i
)
19099 mask
|= (d
->perm
[i
* 2] >= 32) << i
;
19104 /* Use vpblendd. */
19105 for (i
= 0; i
< 8; ++i
)
19106 mask
|= (d
->perm
[i
* 4] >= 32) << i
;
19111 /* See if words move in pairs. If yes, vpblendd can be used. */
19112 for (i
= 0; i
< 16; i
+= 2)
19113 if (d
->perm
[i
] + 1 != d
->perm
[i
+ 1])
19117 /* See if words move the same in both lanes. If not,
19118 vpblendvb must be used. */
19119 for (i
= 0; i
< 8; i
++)
19120 if (d
->perm
[i
] + 8 != d
->perm
[i
+ 8])
19122 /* Use vpblendvb. */
19123 for (i
= 0; i
< 32; ++i
)
19124 rperm
[i
] = (d
->perm
[i
/ 2] < 16 ? const0_rtx
: constm1_rtx
);
19128 target
= gen_reg_rtx (vmode
);
19129 op0
= gen_lowpart (vmode
, op0
);
19130 op1
= gen_lowpart (vmode
, op1
);
19131 goto finish_pblendvb
;
19134 /* Use vpblendw. */
19135 for (i
= 0; i
< 16; ++i
)
19136 mask
|= (d
->perm
[i
] >= 16) << i
;
19140 /* Use vpblendd. */
19141 for (i
= 0; i
< 8; ++i
)
19142 mask
|= (d
->perm
[i
* 2] >= 16) << i
;
19147 /* Use vpblendd. */
19148 for (i
= 0; i
< 4; ++i
)
19149 mask
|= (d
->perm
[i
] >= 4 ? 3 : 0) << (i
* 2);
19154 gcc_unreachable ();
19177 if (mmode
!= VOIDmode
)
19178 maskop
= force_reg (mmode
, gen_int_mode (mask
, mmode
));
19180 maskop
= GEN_INT (mask
);
19182 /* This matches five different patterns with the different modes. */
19183 x
= gen_rtx_VEC_MERGE (vmode
, op1
, op0
, maskop
);
19184 x
= gen_rtx_SET (target
, x
);
19186 if (target
!= d
->target
)
19187 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, target
));
19192 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
19193 in terms of the variable form of vpermilps.
19195 Note that we will have already failed the immediate input vpermilps,
19196 which requires that the high and low part shuffle be identical; the
19197 variable form doesn't require that. */
19200 expand_vec_perm_vpermil (struct expand_vec_perm_d
*d
)
19202 rtx rperm
[8], vperm
;
19205 if (!TARGET_AVX
|| d
->vmode
!= V8SFmode
|| !d
->one_operand_p
)
19208 /* We can only permute within the 128-bit lane. */
19209 for (i
= 0; i
< 8; ++i
)
19211 unsigned e
= d
->perm
[i
];
19212 if (i
< 4 ? e
>= 4 : e
< 4)
19219 for (i
= 0; i
< 8; ++i
)
19221 unsigned e
= d
->perm
[i
];
19223 /* Within each 128-bit lane, the elements of op0 are numbered
19224 from 0 and the elements of op1 are numbered from 4. */
19230 rperm
[i
] = GEN_INT (e
);
19233 vperm
= gen_rtx_CONST_VECTOR (V8SImode
, gen_rtvec_v (8, rperm
));
19234 vperm
= force_reg (V8SImode
, vperm
);
19235 emit_insn (gen_avx_vpermilvarv8sf3 (d
->target
, d
->op0
, vperm
));
19240 /* For V*[QHS]Imode permutations, check if the same permutation
19241 can't be performed in a 2x, 4x or 8x wider inner mode. */
19244 canonicalize_vector_int_perm (const struct expand_vec_perm_d
*d
,
19245 struct expand_vec_perm_d
*nd
)
19248 machine_mode mode
= VOIDmode
;
19252 case E_V8QImode
: mode
= V4HImode
; break;
19253 case E_V16QImode
: mode
= V8HImode
; break;
19254 case E_V32QImode
: mode
= V16HImode
; break;
19255 case E_V64QImode
: mode
= V32HImode
; break;
19256 case E_V4HImode
: mode
= V2SImode
; break;
19257 case E_V8HImode
: mode
= V4SImode
; break;
19258 case E_V16HImode
: mode
= V8SImode
; break;
19259 case E_V32HImode
: mode
= V16SImode
; break;
19260 case E_V4SImode
: mode
= V2DImode
; break;
19261 case E_V8SImode
: mode
= V4DImode
; break;
19262 case E_V16SImode
: mode
= V8DImode
; break;
19263 default: return false;
19265 for (i
= 0; i
< d
->nelt
; i
+= 2)
19266 if ((d
->perm
[i
] & 1) || d
->perm
[i
+ 1] != d
->perm
[i
] + 1)
19269 nd
->nelt
= d
->nelt
/ 2;
19270 for (i
= 0; i
< nd
->nelt
; i
++)
19271 nd
->perm
[i
] = d
->perm
[2 * i
] / 2;
19272 if (GET_MODE_INNER (mode
) != DImode
)
19273 canonicalize_vector_int_perm (nd
, nd
);
19276 nd
->one_operand_p
= d
->one_operand_p
;
19277 nd
->testing_p
= d
->testing_p
;
19278 if (d
->op0
== d
->op1
)
19279 nd
->op0
= nd
->op1
= gen_lowpart (nd
->vmode
, d
->op0
);
19282 nd
->op0
= gen_lowpart (nd
->vmode
, d
->op0
);
19283 nd
->op1
= gen_lowpart (nd
->vmode
, d
->op1
);
19286 nd
->target
= gen_raw_REG (nd
->vmode
, LAST_VIRTUAL_REGISTER
+ 1);
19288 nd
->target
= gen_reg_rtx (nd
->vmode
);
19293 /* Return true if permutation D can be performed as VMODE permutation
19297 valid_perm_using_mode_p (machine_mode vmode
, struct expand_vec_perm_d
*d
)
19299 unsigned int i
, j
, chunk
;
19301 if (GET_MODE_CLASS (vmode
) != MODE_VECTOR_INT
19302 || GET_MODE_CLASS (d
->vmode
) != MODE_VECTOR_INT
19303 || GET_MODE_SIZE (vmode
) != GET_MODE_SIZE (d
->vmode
))
19306 if (GET_MODE_NUNITS (vmode
) >= d
->nelt
)
19309 chunk
= d
->nelt
/ GET_MODE_NUNITS (vmode
);
19310 for (i
= 0; i
< d
->nelt
; i
+= chunk
)
19311 if (d
->perm
[i
] & (chunk
- 1))
19314 for (j
= 1; j
< chunk
; ++j
)
19315 if (d
->perm
[i
] + j
!= d
->perm
[i
+ j
])
19321 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
19322 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
19325 expand_vec_perm_pshufb (struct expand_vec_perm_d
*d
)
19327 unsigned i
, nelt
, eltsz
, mask
;
19328 unsigned char perm
[64];
19329 machine_mode vmode
;
19330 struct expand_vec_perm_d nd
;
19331 rtx rperm
[64], vperm
, target
, op0
, op1
;
19335 if (!d
->one_operand_p
)
19336 switch (GET_MODE_SIZE (d
->vmode
))
19360 if (valid_perm_using_mode_p (V2TImode
, d
))
19365 /* Use vperm2i128 insn. The pattern uses
19366 V4DImode instead of V2TImode. */
19367 target
= d
->target
;
19368 if (d
->vmode
!= V4DImode
)
19369 target
= gen_reg_rtx (V4DImode
);
19370 op0
= gen_lowpart (V4DImode
, d
->op0
);
19371 op1
= gen_lowpart (V4DImode
, d
->op1
);
19373 = GEN_INT ((d
->perm
[0] / (nelt
/ 2))
19374 | ((d
->perm
[nelt
/ 2] / (nelt
/ 2)) * 16));
19375 emit_insn (gen_avx2_permv2ti (target
, op0
, op1
, rperm
[0]));
19376 if (target
!= d
->target
)
19377 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, target
));
19386 switch (GET_MODE_SIZE (d
->vmode
))
19410 /* V4DImode should be already handled through
19411 expand_vselect by vpermq instruction. */
19412 gcc_assert (d
->vmode
!= V4DImode
);
19415 if (d
->vmode
== V8SImode
19416 || d
->vmode
== V16HImode
19417 || d
->vmode
== V32QImode
)
19419 /* First see if vpermq can be used for
19420 V8SImode/V16HImode/V32QImode. */
19421 if (valid_perm_using_mode_p (V4DImode
, d
))
19423 for (i
= 0; i
< 4; i
++)
19424 perm
[i
] = (d
->perm
[i
* nelt
/ 4] * 4 / nelt
) & 3;
19427 target
= gen_reg_rtx (V4DImode
);
19428 if (expand_vselect (target
, gen_lowpart (V4DImode
, d
->op0
),
19431 emit_move_insn (d
->target
,
19432 gen_lowpart (d
->vmode
, target
));
19438 /* Next see if vpermd can be used. */
19439 if (valid_perm_using_mode_p (V8SImode
, d
))
19442 /* Or if vpermps can be used. */
19443 else if (d
->vmode
== V8SFmode
)
19446 if (vmode
== V32QImode
)
19448 /* vpshufb only works intra lanes, it is not
19449 possible to shuffle bytes in between the lanes. */
19450 for (i
= 0; i
< nelt
; ++i
)
19451 if ((d
->perm
[i
] ^ i
) & (nelt
/ 2))
19457 if (!TARGET_AVX512BW
)
19460 /* If vpermq didn't work, vpshufb won't work either. */
19461 if (d
->vmode
== V8DFmode
|| d
->vmode
== V8DImode
)
19465 if (d
->vmode
== V16SImode
19466 || d
->vmode
== V32HImode
19467 || d
->vmode
== V64QImode
)
19469 /* First see if vpermq can be used for
19470 V16SImode/V32HImode/V64QImode. */
19471 if (valid_perm_using_mode_p (V8DImode
, d
))
19473 for (i
= 0; i
< 8; i
++)
19474 perm
[i
] = (d
->perm
[i
* nelt
/ 8] * 8 / nelt
) & 7;
19477 target
= gen_reg_rtx (V8DImode
);
19478 if (expand_vselect (target
, gen_lowpart (V8DImode
, d
->op0
),
19481 emit_move_insn (d
->target
,
19482 gen_lowpart (d
->vmode
, target
));
19488 /* Next see if vpermd can be used. */
19489 if (valid_perm_using_mode_p (V16SImode
, d
))
19492 /* Or if vpermps can be used. */
19493 else if (d
->vmode
== V16SFmode
)
19496 if (vmode
== V64QImode
)
19498 /* vpshufb only works intra lanes, it is not
19499 possible to shuffle bytes in between the lanes. */
19500 for (i
= 0; i
< nelt
; ++i
)
19501 if ((d
->perm
[i
] ^ i
) & (3 * nelt
/ 4))
19513 /* Try to avoid variable permutation instruction. */
19514 if (canonicalize_vector_int_perm (d
, &nd
) && expand_vec_perm_1 (&nd
))
19516 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, nd
.target
));
19520 if (vmode
== V8SImode
)
19521 for (i
= 0; i
< 8; ++i
)
19522 rperm
[i
] = GEN_INT ((d
->perm
[i
* nelt
/ 8] * 8 / nelt
) & 7);
19523 else if (vmode
== V16SImode
)
19524 for (i
= 0; i
< 16; ++i
)
19525 rperm
[i
] = GEN_INT ((d
->perm
[i
* nelt
/ 16] * 16 / nelt
) & 15);
19528 eltsz
= GET_MODE_UNIT_SIZE (d
->vmode
);
19529 if (!d
->one_operand_p
)
19530 mask
= 2 * nelt
- 1;
19531 else if (vmode
== V64QImode
)
19532 mask
= nelt
/ 4 - 1;
19533 else if (vmode
== V32QImode
)
19534 mask
= nelt
/ 2 - 1;
19538 for (i
= 0; i
< nelt
; ++i
)
19540 unsigned j
, e
= d
->perm
[i
] & mask
;
19541 for (j
= 0; j
< eltsz
; ++j
)
19542 rperm
[i
* eltsz
+ j
] = GEN_INT (e
* eltsz
+ j
);
19546 machine_mode vpmode
= vmode
;
19548 nelt
= GET_MODE_SIZE (vmode
);
19550 /* Emulate narrow modes with V16QI instructions. */
19553 rtx m128
= GEN_INT (-128);
19555 /* Remap elements from the second operand, as we have to
19556 account for inactive top elements from the first operand. */
19557 if (!d
->one_operand_p
)
19559 for (i
= 0; i
< nelt
; ++i
)
19561 unsigned ival
= UINTVAL (rperm
[i
]);
19563 rperm
[i
] = GEN_INT (ival
+ 16 - nelt
);
19567 /* Fill inactive elements in the top positions with zeros. */
19568 for (i
= nelt
; i
< 16; ++i
)
19571 vpmode
= V16QImode
;
19574 vperm
= gen_rtx_CONST_VECTOR (vpmode
,
19575 gen_rtvec_v (GET_MODE_NUNITS (vpmode
), rperm
));
19576 vperm
= force_reg (vpmode
, vperm
);
19578 if (vmode
== d
->vmode
)
19579 target
= d
->target
;
19581 target
= gen_reg_rtx (vmode
);
19583 op0
= gen_lowpart (vmode
, d
->op0
);
19585 if (d
->one_operand_p
)
19587 rtx (*gen
) (rtx
, rtx
, rtx
);
19589 if (vmode
== V4QImode
)
19590 gen
= gen_mmx_pshufbv4qi3
;
19591 else if (vmode
== V8QImode
)
19592 gen
= gen_mmx_pshufbv8qi3
;
19593 else if (vmode
== V16QImode
)
19594 gen
= gen_ssse3_pshufbv16qi3
;
19595 else if (vmode
== V32QImode
)
19596 gen
= gen_avx2_pshufbv32qi3
;
19597 else if (vmode
== V64QImode
)
19598 gen
= gen_avx512bw_pshufbv64qi3
;
19599 else if (vmode
== V8SFmode
)
19600 gen
= gen_avx2_permvarv8sf
;
19601 else if (vmode
== V8SImode
)
19602 gen
= gen_avx2_permvarv8si
;
19603 else if (vmode
== V16SFmode
)
19604 gen
= gen_avx512f_permvarv16sf
;
19605 else if (vmode
== V16SImode
)
19606 gen
= gen_avx512f_permvarv16si
;
19608 gcc_unreachable ();
19610 emit_insn (gen (target
, op0
, vperm
));
19614 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
);
19616 op1
= gen_lowpart (vmode
, d
->op1
);
19618 if (vmode
== V4QImode
)
19619 gen
= gen_mmx_ppermv32
;
19620 else if (vmode
== V8QImode
)
19621 gen
= gen_mmx_ppermv64
;
19622 else if (vmode
== V16QImode
)
19623 gen
= gen_xop_pperm
;
19625 gcc_unreachable ();
19627 emit_insn (gen (target
, op0
, op1
, vperm
));
19630 if (target
!= d
->target
)
19631 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, target
));
19636 /* Try to expand one-operand permutation with constant mask. */
19639 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d
*d
)
19641 machine_mode mode
= GET_MODE (d
->op0
);
19642 machine_mode maskmode
= mode
;
19643 unsigned inner_size
= GET_MODE_SIZE (GET_MODE_INNER (mode
));
19644 rtx (*gen
) (rtx
, rtx
, rtx
) = NULL
;
19645 rtx target
, op0
, mask
;
19648 if (!rtx_equal_p (d
->op0
, d
->op1
))
19651 if (!TARGET_AVX512F
)
19654 /* Accept VNxHImode and VNxQImode now. */
19655 if (!TARGET_AVX512VL
&& GET_MODE_SIZE (mode
) < 64)
19659 if (!TARGET_AVX512BW
&& inner_size
== 2)
19663 if (!TARGET_AVX512VBMI
&& inner_size
== 1)
19669 gen
= gen_avx512f_permvarv16si
;
19672 gen
= gen_avx512f_permvarv16sf
;
19673 maskmode
= V16SImode
;
19676 gen
= gen_avx512f_permvarv8di
;
19679 gen
= gen_avx512f_permvarv8df
;
19680 maskmode
= V8DImode
;
19683 gen
= gen_avx512bw_permvarv32hi
;
19686 gen
= gen_avx512vl_permvarv16hi
;
19689 gen
= gen_avx512vl_permvarv8hi
;
19692 gen
= gen_avx512bw_permvarv64qi
;
19695 gen
= gen_avx512vl_permvarv32qi
;
19698 gen
= gen_avx512vl_permvarv16qi
;
19708 target
= d
->target
;
19710 for (int i
= 0; i
< d
->nelt
; ++i
)
19711 vec
[i
] = GEN_INT (d
->perm
[i
]);
19712 mask
= gen_rtx_CONST_VECTOR (maskmode
, gen_rtvec_v (d
->nelt
, vec
));
19713 emit_insn (gen (target
, op0
, force_reg (maskmode
, mask
)));
19717 static bool expand_vec_perm_palignr (struct expand_vec_perm_d
*d
, bool);
19719 /* A subroutine of ix86_expand_vec_perm_const_1. Try to instantiate D
19720 in a single instruction. */
19723 expand_vec_perm_1 (struct expand_vec_perm_d
*d
)
19725 unsigned i
, nelt
= d
->nelt
;
19726 struct expand_vec_perm_d nd
;
19728 /* Check plain VEC_SELECT first, because AVX has instructions that could
19729 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
19730 input where SEL+CONCAT may not. */
19731 if (d
->one_operand_p
)
19733 int mask
= nelt
- 1;
19734 bool identity_perm
= true;
19735 bool broadcast_perm
= true;
19737 for (i
= 0; i
< nelt
; i
++)
19739 nd
.perm
[i
] = d
->perm
[i
] & mask
;
19740 if (nd
.perm
[i
] != i
)
19741 identity_perm
= false;
19743 broadcast_perm
= false;
19749 emit_move_insn (d
->target
, d
->op0
);
19752 else if (broadcast_perm
&& TARGET_AVX2
)
19754 /* Use vpbroadcast{b,w,d}. */
19755 rtx (*gen
) (rtx
, rtx
) = NULL
;
19759 if (TARGET_AVX512BW
)
19760 gen
= gen_avx512bw_vec_dupv64qi_1
;
19763 gen
= gen_avx2_pbroadcastv32qi_1
;
19766 if (TARGET_AVX512BW
)
19767 gen
= gen_avx512bw_vec_dupv32hi_1
;
19770 gen
= gen_avx2_pbroadcastv16hi_1
;
19773 if (TARGET_AVX512F
)
19774 gen
= gen_avx512f_vec_dupv16si_1
;
19777 gen
= gen_avx2_pbroadcastv8si_1
;
19780 gen
= gen_avx2_pbroadcastv16qi
;
19783 gen
= gen_avx2_pbroadcastv8hi
;
19786 if (TARGET_AVX512F
)
19787 gen
= gen_avx512f_vec_dupv16sf_1
;
19790 gen
= gen_avx2_vec_dupv8sf_1
;
19793 if (TARGET_AVX512F
)
19794 gen
= gen_avx512f_vec_dupv8df_1
;
19797 if (TARGET_AVX512F
)
19798 gen
= gen_avx512f_vec_dupv8di_1
;
19800 /* For other modes prefer other shuffles this function creates. */
19806 emit_insn (gen (d
->target
, d
->op0
));
19811 if (expand_vselect (d
->target
, d
->op0
, nd
.perm
, nelt
, d
->testing_p
))
19814 /* There are plenty of patterns in sse.md that are written for
19815 SEL+CONCAT and are not replicated for a single op. Perhaps
19816 that should be changed, to avoid the nastiness here. */
19818 /* Recognize interleave style patterns, which means incrementing
19819 every other permutation operand. */
19820 for (i
= 0; i
< nelt
; i
+= 2)
19822 nd
.perm
[i
] = d
->perm
[i
] & mask
;
19823 nd
.perm
[i
+ 1] = (d
->perm
[i
+ 1] & mask
) + nelt
;
19825 if (expand_vselect_vconcat (d
->target
, d
->op0
, d
->op0
, nd
.perm
, nelt
,
19829 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
19832 for (i
= 0; i
< nelt
; i
+= 4)
19834 nd
.perm
[i
+ 0] = d
->perm
[i
+ 0] & mask
;
19835 nd
.perm
[i
+ 1] = d
->perm
[i
+ 1] & mask
;
19836 nd
.perm
[i
+ 2] = (d
->perm
[i
+ 2] & mask
) + nelt
;
19837 nd
.perm
[i
+ 3] = (d
->perm
[i
+ 3] & mask
) + nelt
;
19840 if (expand_vselect_vconcat (d
->target
, d
->op0
, d
->op0
, nd
.perm
, nelt
,
19846 /* Try movss/movsd instructions. */
19847 if (expand_vec_perm_movs (d
))
19850 /* Finally, try the fully general two operand permute. */
19851 if (expand_vselect_vconcat (d
->target
, d
->op0
, d
->op1
, d
->perm
, nelt
,
19855 /* Recognize interleave style patterns with reversed operands. */
19856 if (!d
->one_operand_p
)
19858 for (i
= 0; i
< nelt
; ++i
)
19860 unsigned e
= d
->perm
[i
];
19868 if (expand_vselect_vconcat (d
->target
, d
->op1
, d
->op0
, nd
.perm
, nelt
,
19873 /* Try the SSE4.1 blend variable merge instructions. */
19874 if (expand_vec_perm_blend (d
))
19877 /* Try one of the AVX vpermil variable permutations. */
19878 if (expand_vec_perm_vpermil (d
))
19881 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
19882 vpshufb, vpermd, vpermps or vpermq variable permutation. */
19883 if (expand_vec_perm_pshufb (d
))
19886 /* Try the AVX2 vpalignr instruction. */
19887 if (expand_vec_perm_palignr (d
, true))
19890 /* Try the AVX512F vperm{w,b,s,d} instructions */
19891 if (ix86_expand_vec_one_operand_perm_avx512 (d
))
19894 /* Try the AVX512F vpermt2/vpermi2 instructions. */
19895 if (ix86_expand_vec_perm_vpermt2 (NULL_RTX
, NULL_RTX
, NULL_RTX
, NULL_RTX
, d
))
19898 /* See if we can get the same permutation in different vector integer
19900 if (canonicalize_vector_int_perm (d
, &nd
) && expand_vec_perm_1 (&nd
))
19903 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, nd
.target
));
19909 /* Canonicalize vec_perm index to make the first index
19910 always comes from the first vector. */
19912 ix86_vec_perm_index_canon (struct expand_vec_perm_d
*d
)
19914 unsigned nelt
= d
->nelt
;
19915 if (d
->perm
[0] < nelt
)
19918 for (unsigned i
= 0; i
!= nelt
; i
++)
19919 d
->perm
[i
] = (d
->perm
[i
] + nelt
) % (2 * nelt
);
19921 std::swap (d
->op0
, d
->op1
);
19925 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
19926 in terms of a pair of shufps+ shufps/pshufd instructions. */
19928 expand_vec_perm_shufps_shufps (struct expand_vec_perm_d
*d
)
19930 unsigned char perm1
[4];
19931 machine_mode vmode
= d
->vmode
;
19933 unsigned i
, j
, k
, count
= 0;
19935 if (d
->one_operand_p
19936 || (vmode
!= V4SImode
&& vmode
!= V4SFmode
))
19942 ix86_vec_perm_index_canon (d
);
19943 for (i
= 0; i
< 4; ++i
)
19944 count
+= d
->perm
[i
] > 3 ? 1 : 0;
19946 gcc_assert (count
& 3);
19948 rtx tmp
= gen_reg_rtx (vmode
);
19949 /* 2 from op0 and 2 from op1. */
19952 unsigned char perm2
[4];
19953 for (i
= 0, j
= 0, k
= 2; i
< 4; ++i
)
19954 if (d
->perm
[i
] & 4)
19956 perm1
[k
++] = d
->perm
[i
];
19961 perm1
[j
++] = d
->perm
[i
];
19966 ok
= expand_vselect_vconcat (tmp
, d
->op0
, d
->op1
,
19967 perm1
, d
->nelt
, false);
19969 if (vmode
== V4SImode
&& TARGET_SSE2
)
19971 ok
= expand_vselect (d
->target
, tmp
,
19972 perm2
, d
->nelt
, false);
19978 ok
= expand_vselect_vconcat (d
->target
, tmp
, tmp
,
19979 perm2
, d
->nelt
, false);
19983 /* 3 from one op and 1 from another. */
19986 unsigned pair_idx
= 8, lone_idx
= 8, shift
;
19988 /* Find the lone index. */
19989 for (i
= 0; i
< 4; ++i
)
19990 if ((d
->perm
[i
] > 3 && count
== 1)
19991 || (d
->perm
[i
] < 4 && count
== 3))
19994 /* When lone_idx is not 0, it must from second op(count == 1). */
19995 gcc_assert (count
== (lone_idx
? 1 : 3));
19997 /* Find the pair index that sits in the same half as the lone index. */
19998 shift
= lone_idx
& 2;
19999 pair_idx
= 1 - lone_idx
+ 2 * shift
;
20001 /* First permutate lone index and pair index into the same vector as
20002 [ lone, lone, pair, pair ]. */
20003 perm1
[1] = perm1
[0]
20004 = (count
== 3) ? d
->perm
[lone_idx
] : d
->perm
[lone_idx
] - 4;
20005 perm1
[3] = perm1
[2]
20006 = (count
== 3) ? d
->perm
[pair_idx
] : d
->perm
[pair_idx
] + 4;
20008 /* Alway put the vector contains lone indx at the first. */
20010 std::swap (d
->op0
, d
->op1
);
20013 ok
= expand_vselect_vconcat (tmp
, d
->op0
, d
->op1
,
20014 perm1
, d
->nelt
, false);
20017 /* Refine lone and pair index to original order. */
20018 perm1
[shift
] = lone_idx
<< 1;
20019 perm1
[shift
+ 1] = pair_idx
<< 1;
20021 /* Select the remaining 2 elements in another vector. */
20022 for (i
= 2 - shift
; i
< 4 - shift
; ++i
)
20023 perm1
[i
] = lone_idx
== 1 ? d
->perm
[i
] + 4 : d
->perm
[i
];
20025 /* Adjust to original selector. */
20027 std::swap (tmp
, d
->op1
);
20030 ok
= expand_vselect_vconcat (d
->target
, tmp
, d
->op1
,
20031 perm1
, d
->nelt
, false);
20039 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
20040 in terms of a pair of pshuflw + pshufhw instructions. */
20043 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d
*d
)
20045 unsigned char perm2
[MAX_VECT_LEN
];
20049 if (d
->vmode
!= V8HImode
|| !d
->one_operand_p
)
20052 /* The two permutations only operate in 64-bit lanes. */
20053 for (i
= 0; i
< 4; ++i
)
20054 if (d
->perm
[i
] >= 4)
20056 for (i
= 4; i
< 8; ++i
)
20057 if (d
->perm
[i
] < 4)
20063 /* Emit the pshuflw. */
20064 memcpy (perm2
, d
->perm
, 4);
20065 for (i
= 4; i
< 8; ++i
)
20067 ok
= expand_vselect (d
->target
, d
->op0
, perm2
, 8, d
->testing_p
);
20070 /* Emit the pshufhw. */
20071 memcpy (perm2
+ 4, d
->perm
+ 4, 4);
20072 for (i
= 0; i
< 4; ++i
)
20074 ok
= expand_vselect (d
->target
, d
->target
, perm2
, 8, d
->testing_p
);
20080 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
20081 the permutation using the SSSE3 palignr instruction. This succeeds
20082 when all of the elements in PERM fit within one vector and we merely
20083 need to shift them down so that a single vector permutation has a
20084 chance to succeed. If SINGLE_INSN_ONLY_P, succeed if only
20085 the vpalignr instruction itself can perform the requested permutation. */
20088 expand_vec_perm_palignr (struct expand_vec_perm_d
*d
, bool single_insn_only_p
)
20090 unsigned i
, nelt
= d
->nelt
;
20091 unsigned min
, max
, minswap
, maxswap
;
20092 bool in_order
, ok
, swap
= false;
20094 struct expand_vec_perm_d dcopy
;
20096 /* Even with AVX, palignr only operates on 128-bit vectors,
20097 in AVX2 palignr operates on both 128-bit lanes. */
20098 if ((!TARGET_SSSE3
|| GET_MODE_SIZE (d
->vmode
) != 16)
20099 && (!TARGET_AVX2
|| GET_MODE_SIZE (d
->vmode
) != 32))
20104 minswap
= 2 * nelt
;
20106 for (i
= 0; i
< nelt
; ++i
)
20108 unsigned e
= d
->perm
[i
];
20109 unsigned eswap
= d
->perm
[i
] ^ nelt
;
20110 if (GET_MODE_SIZE (d
->vmode
) == 32)
20112 e
= (e
& ((nelt
/ 2) - 1)) | ((e
& nelt
) >> 1);
20113 eswap
= e
^ (nelt
/ 2);
20119 if (eswap
< minswap
)
20121 if (eswap
> maxswap
)
20125 || max
- min
>= (GET_MODE_SIZE (d
->vmode
) == 32 ? nelt
/ 2 : nelt
))
20127 if (d
->one_operand_p
20129 || maxswap
- minswap
>= (GET_MODE_SIZE (d
->vmode
) == 32
20130 ? nelt
/ 2 : nelt
))
20137 /* Given that we have SSSE3, we know we'll be able to implement the
20138 single operand permutation after the palignr with pshufb for
20139 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
20141 if (d
->testing_p
&& GET_MODE_SIZE (d
->vmode
) == 16 && !single_insn_only_p
)
20147 dcopy
.op0
= d
->op1
;
20148 dcopy
.op1
= d
->op0
;
20149 for (i
= 0; i
< nelt
; ++i
)
20150 dcopy
.perm
[i
] ^= nelt
;
20154 for (i
= 0; i
< nelt
; ++i
)
20156 unsigned e
= dcopy
.perm
[i
];
20157 if (GET_MODE_SIZE (d
->vmode
) == 32
20159 && (e
& (nelt
/ 2 - 1)) < min
)
20160 e
= e
- min
- (nelt
/ 2);
20167 dcopy
.one_operand_p
= true;
20169 if (single_insn_only_p
&& !in_order
)
20172 /* For AVX2, test whether we can permute the result in one instruction. */
20177 dcopy
.op1
= dcopy
.op0
;
20178 return expand_vec_perm_1 (&dcopy
);
20181 shift
= GEN_INT (min
* GET_MODE_UNIT_BITSIZE (d
->vmode
));
20182 if (GET_MODE_SIZE (d
->vmode
) == 16)
20184 target
= gen_reg_rtx (V1TImode
);
20185 emit_insn (gen_ssse3_palignrv1ti (target
,
20186 gen_lowpart (V1TImode
, dcopy
.op1
),
20187 gen_lowpart (V1TImode
, dcopy
.op0
),
20192 target
= gen_reg_rtx (V2TImode
);
20193 emit_insn (gen_avx2_palignrv2ti (target
,
20194 gen_lowpart (V2TImode
, dcopy
.op1
),
20195 gen_lowpart (V2TImode
, dcopy
.op0
),
20199 dcopy
.op0
= dcopy
.op1
= gen_lowpart (d
->vmode
, target
);
20201 /* Test for the degenerate case where the alignment by itself
20202 produces the desired permutation. */
20205 emit_move_insn (d
->target
, dcopy
.op0
);
20209 ok
= expand_vec_perm_1 (&dcopy
);
20210 gcc_assert (ok
|| GET_MODE_SIZE (d
->vmode
) == 32);
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   the permutation using the SSE4_1 pblendv instruction.  Potentially
   reduces permutation from 2 pshufb and or to 1 pshufb and pblendv.  */

static bool
expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
{
  unsigned i, which, nelt = d->nelt;
  struct expand_vec_perm_d dcopy, dcopy1;
  machine_mode vmode = d->vmode;
  bool ok;

  /* Use the same checks as in expand_vec_perm_blend.  */
  if (d->one_operand_p)
    return false;
  if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
    ;
  else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
    ;
  else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 4
			     || GET_MODE_SIZE (vmode) == 8
			     || GET_MODE_SIZE (vmode) == 16))
    ;
  else
    return false;

  /* Figure out where permutation elements stay not in their
     respective lanes.  */
  for (i = 0, which = 0; i < nelt; ++i)
    {
      unsigned e = d->perm[i];
      if (e != i)
	which |= (e < nelt ? 1 : 2);
    }
  /* We can pblend the part where elements stay not in their
     respective lanes only when these elements are all in one
     half of a permutation.
     {0 1 8 3 4 5 9 7} is ok as 8, 9 are not in their respective
     lanes, but both 8 and 9 >= 8
     {0 1 8 3 4 5 2 7} is not ok as 2 and 8 are not in their
     respective lanes and 8 >= 8, but 2 is not.  */
  if (which != 1 && which != 2)
    return false;
  if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
    return true;

  /* First we apply one operand permutation to the part where
     elements stay not in their respective lanes.  */
  dcopy = *d;
  if (which == 2)
    dcopy.op0 = dcopy.op1 = d->op1;
  else
    dcopy.op0 = dcopy.op1 = d->op0;
  if (!d->testing_p)
    dcopy.target = gen_reg_rtx (vmode);
  dcopy.one_operand_p = true;

  for (i = 0; i < nelt; ++i)
    dcopy.perm[i] = d->perm[i] & (nelt - 1);

  ok = expand_vec_perm_1 (&dcopy);
  if (GET_MODE_SIZE (vmode) != 16 && !ok)
    return false;
  else
    gcc_assert (ok);
  if (d->testing_p)
    return true;

  /* Next we put permuted elements into their positions.  */
  dcopy1 = *d;
  if (which == 2)
    dcopy1.op1 = dcopy.target;
  else
    dcopy1.op0 = dcopy.target;

  for (i = 0; i < nelt; ++i)
    dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);

  ok = expand_vec_perm_blend (&dcopy1);
  gcc_assert (ok);

  return ok;
}
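/* Illustrative sketch (not part of GCC): the pblendv strategy above first
   shuffles the "wrong lane" elements within a single operand and then blends
   them into the other operand's identity.  The helper below only builds the
   two index vectors for the case where all out-of-place elements come from
   operand 1; the names are assumptions of this example.  */

static inline void
example_pblendv_split (const unsigned char *perm, unsigned nelt,
		       unsigned char *shuffle_sel, unsigned char *blend_sel)
{
  unsigned i;
  for (i = 0; i < nelt; ++i)
    {
      /* Single-operand shuffle: drop the operand-selection bit.  */
      shuffle_sel[i] = perm[i] & (nelt - 1);
      /* Blend: take lane i from the shuffled vector where the original
	 permutation wanted operand 1, otherwise keep operand 0's lane i.  */
      blend_sel[i] = (perm[i] >= nelt) ? (unsigned char) (nelt + i)
				       : (unsigned char) i;
    }
}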
static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);

/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   a two vector permutation into a single vector permutation by using
   an interleave operation to merge the vectors.  */

static bool
expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
{
20308 struct expand_vec_perm_d dremap
, dfinal
;
20309 unsigned i
, nelt
= d
->nelt
, nelt2
= nelt
/ 2;
20310 unsigned HOST_WIDE_INT contents
;
20311 unsigned char remap
[2 * MAX_VECT_LEN
];
20313 bool ok
, same_halves
= false;
20315 if (GET_MODE_SIZE (d
->vmode
) == 4
20316 || GET_MODE_SIZE (d
->vmode
) == 8
20317 || GET_MODE_SIZE (d
->vmode
) == 16)
20319 if (d
->one_operand_p
)
20322 else if (GET_MODE_SIZE (d
->vmode
) == 32)
      /* For 32-byte modes allow even d->one_operand_p.
	 The lack of cross-lane shuffling in some instructions
	 might prevent a single insn shuffle.  */
      dfinal = *d;
      dfinal.testing_p = true;
      /* If expand_vec_perm_interleave3 can expand this into
	 a 3 insn sequence, give up and let it be expanded as
	 a 3 insn sequence.  While that is one insn longer,
	 it doesn't need a memory operand and in the common
	 case that both the interleave low and interleave high
	 permutations with the same operands are adjacent it
	 needs 4 insns for both after CSE.  */
      if (expand_vec_perm_interleave3 (&dfinal))
	return false;
20344 /* Examine from whence the elements come. */
20346 for (i
= 0; i
< nelt
; ++i
)
20347 contents
|= HOST_WIDE_INT_1U
<< d
->perm
[i
];
20349 memset (remap
, 0xff, sizeof (remap
));
20352 if (GET_MODE_SIZE (d
->vmode
) == 4
20353 || GET_MODE_SIZE (d
->vmode
) == 8)
20355 unsigned HOST_WIDE_INT h1
, h2
, h3
, h4
;
20357 /* Split the two input vectors into 4 halves. */
20358 h1
= (HOST_WIDE_INT_1U
<< nelt2
) - 1;
20363 /* If the elements from the low halves use interleave low,
20364 and similarly for interleave high. */
20365 if ((contents
& (h1
| h3
)) == contents
)
20368 for (i
= 0; i
< nelt2
; ++i
)
20371 remap
[i
+ nelt
] = i
* 2 + 1;
20372 dremap
.perm
[i
* 2] = i
;
20373 dremap
.perm
[i
* 2 + 1] = i
+ nelt
;
20376 else if ((contents
& (h2
| h4
)) == contents
)
20379 for (i
= 0; i
< nelt2
; ++i
)
20381 remap
[i
+ nelt2
] = i
* 2;
20382 remap
[i
+ nelt
+ nelt2
] = i
* 2 + 1;
20383 dremap
.perm
[i
* 2] = i
+ nelt2
;
20384 dremap
.perm
[i
* 2 + 1] = i
+ nelt
+ nelt2
;
20390 else if (GET_MODE_SIZE (d
->vmode
) == 16)
20392 unsigned HOST_WIDE_INT h1
, h2
, h3
, h4
;
20394 /* Split the two input vectors into 4 halves. */
20395 h1
= (HOST_WIDE_INT_1U
<< nelt2
) - 1;
      /* If the elements from the low halves use interleave low, and similarly
	 for interleave high.  If the elements are from mis-matched halves, we
	 can use shufps for V4SF/V4SI or do a DImode shuffle.  */
20403 if ((contents
& (h1
| h3
)) == contents
)
20406 for (i
= 0; i
< nelt2
; ++i
)
20409 remap
[i
+ nelt
] = i
* 2 + 1;
20410 dremap
.perm
[i
* 2] = i
;
20411 dremap
.perm
[i
* 2 + 1] = i
+ nelt
;
20413 if (!TARGET_SSE2
&& d
->vmode
== V4SImode
)
20414 dremap
.vmode
= V4SFmode
;
20416 else if ((contents
& (h2
| h4
)) == contents
)
20419 for (i
= 0; i
< nelt2
; ++i
)
20421 remap
[i
+ nelt2
] = i
* 2;
20422 remap
[i
+ nelt
+ nelt2
] = i
* 2 + 1;
20423 dremap
.perm
[i
* 2] = i
+ nelt2
;
20424 dremap
.perm
[i
* 2 + 1] = i
+ nelt
+ nelt2
;
20426 if (!TARGET_SSE2
&& d
->vmode
== V4SImode
)
20427 dremap
.vmode
= V4SFmode
;
20429 else if ((contents
& (h1
| h4
)) == contents
)
20432 for (i
= 0; i
< nelt2
; ++i
)
20435 remap
[i
+ nelt
+ nelt2
] = i
+ nelt2
;
20436 dremap
.perm
[i
] = i
;
20437 dremap
.perm
[i
+ nelt2
] = i
+ nelt
+ nelt2
;
20442 dremap
.vmode
= V2DImode
;
20444 dremap
.perm
[0] = 0;
20445 dremap
.perm
[1] = 3;
20448 else if ((contents
& (h2
| h3
)) == contents
)
20451 for (i
= 0; i
< nelt2
; ++i
)
20453 remap
[i
+ nelt2
] = i
;
20454 remap
[i
+ nelt
] = i
+ nelt2
;
20455 dremap
.perm
[i
] = i
+ nelt2
;
20456 dremap
.perm
[i
+ nelt2
] = i
+ nelt
;
20461 dremap
.vmode
= V2DImode
;
20463 dremap
.perm
[0] = 1;
20464 dremap
.perm
[1] = 2;
20472 unsigned int nelt4
= nelt
/ 4, nzcnt
= 0;
20473 unsigned HOST_WIDE_INT q
[8];
20474 unsigned int nonzero_halves
[4];
20476 /* Split the two input vectors into 8 quarters. */
20477 q
[0] = (HOST_WIDE_INT_1U
<< nelt4
) - 1;
20478 for (i
= 1; i
< 8; ++i
)
20479 q
[i
] = q
[0] << (nelt4
* i
);
20480 for (i
= 0; i
< 4; ++i
)
20481 if (((q
[2 * i
] | q
[2 * i
+ 1]) & contents
) != 0)
20483 nonzero_halves
[nzcnt
] = i
;
20489 gcc_assert (d
->one_operand_p
);
20490 nonzero_halves
[1] = nonzero_halves
[0];
20491 same_halves
= true;
20493 else if (d
->one_operand_p
)
20495 gcc_assert (nonzero_halves
[0] == 0);
20496 gcc_assert (nonzero_halves
[1] == 1);
20501 if (d
->perm
[0] / nelt2
== nonzero_halves
[1])
20503 /* Attempt to increase the likelihood that dfinal
20504 shuffle will be intra-lane. */
20505 std::swap (nonzero_halves
[0], nonzero_halves
[1]);
20508 /* vperm2f128 or vperm2i128. */
20509 for (i
= 0; i
< nelt2
; ++i
)
20511 remap
[i
+ nonzero_halves
[1] * nelt2
] = i
+ nelt2
;
20512 remap
[i
+ nonzero_halves
[0] * nelt2
] = i
;
20513 dremap
.perm
[i
+ nelt2
] = i
+ nonzero_halves
[1] * nelt2
;
20514 dremap
.perm
[i
] = i
+ nonzero_halves
[0] * nelt2
;
20517 if (d
->vmode
!= V8SFmode
20518 && d
->vmode
!= V4DFmode
20519 && d
->vmode
!= V8SImode
)
20521 dremap
.vmode
= V8SImode
;
20523 for (i
= 0; i
< 4; ++i
)
20525 dremap
.perm
[i
] = i
+ nonzero_halves
[0] * 4;
20526 dremap
.perm
[i
+ 4] = i
+ nonzero_halves
[1] * 4;
20530 else if (d
->one_operand_p
)
20532 else if (TARGET_AVX2
20533 && (contents
& (q
[0] | q
[2] | q
[4] | q
[6])) == contents
)
20536 for (i
= 0; i
< nelt4
; ++i
)
20539 remap
[i
+ nelt
] = i
* 2 + 1;
20540 remap
[i
+ nelt2
] = i
* 2 + nelt2
;
20541 remap
[i
+ nelt
+ nelt2
] = i
* 2 + nelt2
+ 1;
20542 dremap
.perm
[i
* 2] = i
;
20543 dremap
.perm
[i
* 2 + 1] = i
+ nelt
;
20544 dremap
.perm
[i
* 2 + nelt2
] = i
+ nelt2
;
20545 dremap
.perm
[i
* 2 + nelt2
+ 1] = i
+ nelt
+ nelt2
;
20548 else if (TARGET_AVX2
20549 && (contents
& (q
[1] | q
[3] | q
[5] | q
[7])) == contents
)
20552 for (i
= 0; i
< nelt4
; ++i
)
20554 remap
[i
+ nelt4
] = i
* 2;
20555 remap
[i
+ nelt
+ nelt4
] = i
* 2 + 1;
20556 remap
[i
+ nelt2
+ nelt4
] = i
* 2 + nelt2
;
20557 remap
[i
+ nelt
+ nelt2
+ nelt4
] = i
* 2 + nelt2
+ 1;
20558 dremap
.perm
[i
* 2] = i
+ nelt4
;
20559 dremap
.perm
[i
* 2 + 1] = i
+ nelt
+ nelt4
;
20560 dremap
.perm
[i
* 2 + nelt2
] = i
+ nelt2
+ nelt4
;
20561 dremap
.perm
[i
* 2 + nelt2
+ 1] = i
+ nelt
+ nelt2
+ nelt4
;
  /* Use the remapping array set up above to move the elements from their
     swizzled locations into their final destinations.  */
20571 for (i
= 0; i
< nelt
; ++i
)
20573 unsigned e
= remap
[d
->perm
[i
]];
20574 gcc_assert (e
< nelt
);
20575 /* If same_halves is true, both halves of the remapped vector are the
20576 same. Avoid cross-lane accesses if possible. */
20577 if (same_halves
&& i
>= nelt2
)
20579 gcc_assert (e
< nelt2
);
20580 dfinal
.perm
[i
] = e
+ nelt2
;
20583 dfinal
.perm
[i
] = e
;
20587 dremap
.target
= gen_reg_rtx (dremap
.vmode
);
20588 dfinal
.op0
= gen_lowpart (dfinal
.vmode
, dremap
.target
);
20590 dfinal
.op1
= dfinal
.op0
;
20591 dfinal
.one_operand_p
= true;
  /* Test if the final remap can be done with a single insn.  For V4SFmode or
     V4SImode this *will* succeed.  For V8HImode or V16QImode it may not.  */
20596 ok
= expand_vec_perm_1 (&dfinal
);
20597 seq
= get_insns ();
20606 if (dremap
.vmode
!= dfinal
.vmode
)
20608 dremap
.op0
= gen_lowpart (dremap
.vmode
, dremap
.op0
);
20609 dremap
.op1
= gen_lowpart (dremap
.vmode
, dremap
.op1
);
20612 ok
= expand_vec_perm_1 (&dremap
);
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   a single vector cross-lane permutation into vpermq followed
   by any of the single insn permutations.  */

static bool
expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
{
20626 struct expand_vec_perm_d dremap
, dfinal
;
20627 unsigned i
, j
, nelt
= d
->nelt
, nelt2
= nelt
/ 2, nelt4
= nelt
/ 4;
20628 unsigned contents
[2];
20632 && (d
->vmode
== V32QImode
|| d
->vmode
== V16HImode
)
20633 && d
->one_operand_p
))
20638 for (i
= 0; i
< nelt2
; ++i
)
20640 contents
[0] |= 1u << (d
->perm
[i
] / nelt4
);
20641 contents
[1] |= 1u << (d
->perm
[i
+ nelt2
] / nelt4
);
20644 for (i
= 0; i
< 2; ++i
)
20646 unsigned int cnt
= 0;
20647 for (j
= 0; j
< 4; ++j
)
20648 if ((contents
[i
] & (1u << j
)) != 0 && ++cnt
> 2)
20656 dremap
.vmode
= V4DImode
;
20658 dremap
.target
= gen_reg_rtx (V4DImode
);
20659 dremap
.op0
= gen_lowpart (V4DImode
, d
->op0
);
20660 dremap
.op1
= dremap
.op0
;
20661 dremap
.one_operand_p
= true;
20662 for (i
= 0; i
< 2; ++i
)
20664 unsigned int cnt
= 0;
20665 for (j
= 0; j
< 4; ++j
)
20666 if ((contents
[i
] & (1u << j
)) != 0)
20667 dremap
.perm
[2 * i
+ cnt
++] = j
;
20668 for (; cnt
< 2; ++cnt
)
20669 dremap
.perm
[2 * i
+ cnt
] = 0;
20673 dfinal
.op0
= gen_lowpart (dfinal
.vmode
, dremap
.target
);
20674 dfinal
.op1
= dfinal
.op0
;
20675 dfinal
.one_operand_p
= true;
20676 for (i
= 0, j
= 0; i
< nelt
; ++i
)
20680 dfinal
.perm
[i
] = (d
->perm
[i
] & (nelt4
- 1)) | (j
? nelt2
: 0);
20681 if ((d
->perm
[i
] / nelt4
) == dremap
.perm
[j
])
20683 else if ((d
->perm
[i
] / nelt4
) == dremap
.perm
[j
+ 1])
20684 dfinal
.perm
[i
] |= nelt4
;
20686 gcc_unreachable ();
20689 ok
= expand_vec_perm_1 (&dremap
);
20692 ok
= expand_vec_perm_1 (&dfinal
);
static bool canonicalize_perm (struct expand_vec_perm_d *d);

/* A subroutine of ix86_expand_vec_perm_const_1.  Try to expand
   a vector permutation using two instructions, vperm2f128 or
   vperm2i128 followed by any single in-lane permutation.  */

static bool
expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
{
20707 struct expand_vec_perm_d dfirst
, dsecond
;
20708 unsigned i
, j
, nelt
= d
->nelt
, nelt2
= nelt
/ 2, perm
;
20712 || GET_MODE_SIZE (d
->vmode
) != 32
20713 || (d
->vmode
!= V8SFmode
&& d
->vmode
!= V4DFmode
&& !TARGET_AVX2
))
20717 dsecond
.one_operand_p
= false;
20718 dsecond
.testing_p
= true;
  /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
     immediate.  For perm < 16 the second permutation uses
     d->op0 as first operand, for perm >= 16 it uses d->op1
     as first operand.  The second operand is the result of
     vperm2[fi]128.  */
20725 for (perm
= 0; perm
< 32; perm
++)
20727 /* Ignore permutations which do not move anything cross-lane. */
20730 /* The second shuffle for e.g. V4DFmode has
20731 0123 and ABCD operands.
20732 Ignore AB23, as 23 is already in the second lane
20733 of the first operand. */
20734 if ((perm
& 0xc) == (1 << 2)) continue;
20735 /* And 01CD, as 01 is in the first lane of the first
20737 if ((perm
& 3) == 0) continue;
20738 /* And 4567, as then the vperm2[fi]128 doesn't change
20739 anything on the original 4567 second operand. */
20740 if ((perm
& 0xf) == ((3 << 2) | 2)) continue;
20744 /* The second shuffle for e.g. V4DFmode has
20745 4567 and ABCD operands.
20746 Ignore AB67, as 67 is already in the second lane
20747 of the first operand. */
20748 if ((perm
& 0xc) == (3 << 2)) continue;
20749 /* And 45CD, as 45 is in the first lane of the first
20751 if ((perm
& 3) == 2) continue;
20752 /* And 0123, as then the vperm2[fi]128 doesn't change
20753 anything on the original 0123 first operand. */
20754 if ((perm
& 0xf) == (1 << 2)) continue;
20757 for (i
= 0; i
< nelt
; i
++)
20759 j
= d
->perm
[i
] / nelt2
;
20760 if (j
== ((perm
>> (2 * (i
>= nelt2
))) & 3))
20761 dsecond
.perm
[i
] = nelt
+ (i
& nelt2
) + (d
->perm
[i
] & (nelt2
- 1));
20762 else if (j
== (unsigned) (i
>= nelt2
) + 2 * (perm
>= 16))
20763 dsecond
.perm
[i
] = d
->perm
[i
] & (nelt
- 1);
20771 ok
= expand_vec_perm_1 (&dsecond
);
20782 /* Found a usable second shuffle. dfirst will be
20783 vperm2f128 on d->op0 and d->op1. */
20784 dsecond
.testing_p
= false;
20786 dfirst
.target
= gen_reg_rtx (d
->vmode
);
20787 for (i
= 0; i
< nelt
; i
++)
20788 dfirst
.perm
[i
] = (i
& (nelt2
- 1))
20789 + ((perm
>> (2 * (i
>= nelt2
))) & 3) * nelt2
;
20791 canonicalize_perm (&dfirst
);
20792 ok
= expand_vec_perm_1 (&dfirst
);
20795 /* And dsecond is some single insn shuffle, taking
20796 d->op0 and result of vperm2f128 (if perm < 16) or
20797 d->op1 and result of vperm2f128 (otherwise). */
20799 dsecond
.op0
= dsecond
.op1
;
20800 dsecond
.op1
= dfirst
.target
;
20802 ok
= expand_vec_perm_1 (&dsecond
);
  /* For one operand, the only useful vperm2f128 permutation is 0x01
     aka lanes swap.  */
  if (d->one_operand_p)
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   a two vector permutation using 2 intra-lane interleave insns
   and cross-lane shuffle for 32-byte vectors.  */

static bool
expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
{
20825 rtx (*gen
) (rtx
, rtx
, rtx
);
20827 if (d
->one_operand_p
)
20829 if (TARGET_AVX2
&& GET_MODE_SIZE (d
->vmode
) == 32)
20831 else if (TARGET_AVX
&& (d
->vmode
== V8SFmode
|| d
->vmode
== V4DFmode
))
20837 if (d
->perm
[0] != 0 && d
->perm
[0] != nelt
/ 2)
20839 for (i
= 0; i
< nelt
; i
+= 2)
20840 if (d
->perm
[i
] != d
->perm
[0] + i
/ 2
20841 || d
->perm
[i
+ 1] != d
->perm
[0] + i
/ 2 + nelt
)
20851 gen
= gen_vec_interleave_highv32qi
;
20853 gen
= gen_vec_interleave_lowv32qi
;
20857 gen
= gen_vec_interleave_highv16hi
;
20859 gen
= gen_vec_interleave_lowv16hi
;
20863 gen
= gen_vec_interleave_highv8si
;
20865 gen
= gen_vec_interleave_lowv8si
;
20869 gen
= gen_vec_interleave_highv4di
;
20871 gen
= gen_vec_interleave_lowv4di
;
20875 gen
= gen_vec_interleave_highv8sf
;
20877 gen
= gen_vec_interleave_lowv8sf
;
20881 gen
= gen_vec_interleave_highv4df
;
20883 gen
= gen_vec_interleave_lowv4df
;
20886 gcc_unreachable ();
20889 emit_insn (gen (d
->target
, d
->op0
, d
->op1
));
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
   a single vector permutation using a single intra-lane vector
   permutation, vperm2f128 swapping the lanes and vblend* insn blending
   the non-swapped and swapped vectors together.  */

static bool
expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
{
20901 struct expand_vec_perm_d dfirst
, dsecond
;
20902 unsigned i
, j
, msk
, nelt
= d
->nelt
, nelt2
= nelt
/ 2;
20905 rtx (*blend
) (rtx
, rtx
, rtx
, rtx
) = NULL
;
20909 || (d
->vmode
!= V8SFmode
&& d
->vmode
!= V4DFmode
)
20910 || !d
->one_operand_p
)
20914 for (i
= 0; i
< nelt
; i
++)
20915 dfirst
.perm
[i
] = 0xff;
20916 for (i
= 0, msk
= 0; i
< nelt
; i
++)
20918 j
= (d
->perm
[i
] & nelt2
) ? i
| nelt2
: i
& ~nelt2
;
20919 if (dfirst
.perm
[j
] != 0xff && dfirst
.perm
[j
] != d
->perm
[i
])
20921 dfirst
.perm
[j
] = d
->perm
[i
];
20925 for (i
= 0; i
< nelt
; i
++)
20926 if (dfirst
.perm
[i
] == 0xff)
20927 dfirst
.perm
[i
] = i
;
20930 dfirst
.target
= gen_reg_rtx (dfirst
.vmode
);
20933 ok
= expand_vec_perm_1 (&dfirst
);
20934 seq
= get_insns ();
20946 dsecond
.op0
= dfirst
.target
;
20947 dsecond
.op1
= dfirst
.target
;
20948 dsecond
.one_operand_p
= true;
20949 dsecond
.target
= gen_reg_rtx (dsecond
.vmode
);
20950 for (i
= 0; i
< nelt
; i
++)
20951 dsecond
.perm
[i
] = i
^ nelt2
;
20953 ok
= expand_vec_perm_1 (&dsecond
);
20956 blend
= d
->vmode
== V8SFmode
? gen_avx_blendps256
: gen_avx_blendpd256
;
20957 emit_insn (blend (d
->target
, dfirst
.target
, dsecond
.target
, GEN_INT (msk
)));
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
   a two vector permutation using two single vector permutations and
   {,v}{,p}unpckl{ps,pd,bw,wd,dq}.  If two_insn, succeed only if one
   of dfirst or dsecond is an identity permutation.  */

static bool
expand_vec_perm_2perm_interleave (struct expand_vec_perm_d *d, bool two_insn)
{
20969 unsigned i
, nelt
= d
->nelt
, nelt2
= nelt
/ 2, lane
= nelt
;
20970 struct expand_vec_perm_d dfirst
, dsecond
, dfinal
;
20971 bool ident1
= true, ident2
= true;
20973 if (d
->one_operand_p
)
20976 if (GET_MODE_SIZE (d
->vmode
) == 16)
20980 if (d
->vmode
!= V4SFmode
&& d
->vmode
!= V2DFmode
&& !TARGET_SSE2
)
20983 else if (GET_MODE_SIZE (d
->vmode
) == 32)
20987 if (d
->vmode
!= V8SFmode
&& d
->vmode
!= V4DFmode
&& !TARGET_AVX2
)
20994 for (i
= 1; i
< nelt
; i
++)
20995 if ((d
->perm
[i
] >= nelt
) != ((d
->perm
[0] >= nelt
) ^ (i
& 1)))
21001 dfirst
.op1
= dfirst
.op0
;
21002 dfirst
.one_operand_p
= true;
21003 dsecond
.op0
= dsecond
.op1
;
21004 dsecond
.one_operand_p
= true;
21006 for (i
= 0; i
< nelt
; i
++)
21007 if (d
->perm
[i
] >= nelt
)
21009 dsecond
.perm
[i
/ 2 + (i
>= lane
? lane
/ 2 : 0)] = d
->perm
[i
] - nelt
;
21010 if (d
->perm
[i
] - nelt
!= i
/ 2 + (i
>= lane
? lane
/ 2 : 0))
21012 dsecond
.perm
[i
/ 2 + (i
>= lane
? lane
: lane
/ 2)]
21013 = d
->perm
[i
] - nelt
;
21017 dfirst
.perm
[i
/ 2 + (i
>= lane
? lane
/ 2 : 0)] = d
->perm
[i
];
21018 if (d
->perm
[i
] != i
/ 2 + (i
>= lane
? lane
/ 2 : 0))
21020 dfirst
.perm
[i
/ 2 + (i
>= lane
? lane
: lane
/ 2)] = d
->perm
[i
];
21023 if (two_insn
&& !ident1
&& !ident2
)
21029 dfinal
.op0
= dfirst
.target
= gen_reg_rtx (d
->vmode
);
21031 dfinal
.op1
= dsecond
.target
= gen_reg_rtx (d
->vmode
);
21032 if (d
->perm
[0] >= nelt
)
21033 std::swap (dfinal
.op0
, dfinal
.op1
);
21037 rtx_insn
*seq1
= NULL
, *seq2
= NULL
;
21042 ok
= expand_vec_perm_1 (&dfirst
);
21043 seq1
= get_insns ();
21053 ok
= expand_vec_perm_1 (&dsecond
);
21054 seq2
= get_insns ();
21064 for (i
= 0; i
< nelt
; i
++)
21066 dfinal
.perm
[i
] = i
/ 2;
21068 dfinal
.perm
[i
] += lane
/ 2;
21070 dfinal
.perm
[i
] += nelt
;
21074 ok
= expand_vselect_vconcat (dfinal
.target
, dfinal
.op0
, dfinal
.op1
,
21075 dfinal
.perm
, dfinal
.nelt
, false);
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   the permutation using two single vector permutations and the SSE4_1 pblendv
   instruction.  If two_insn, succeed only if one of dfirst or dsecond is an
   identity permutation.  */

static bool
expand_vec_perm_2perm_pblendv (struct expand_vec_perm_d *d, bool two_insn)
{
21088 unsigned i
, nelt
= d
->nelt
;
21089 struct expand_vec_perm_d dfirst
, dsecond
, dfinal
;
21090 machine_mode vmode
= d
->vmode
;
21091 bool ident1
= true, ident2
= true;
21093 /* Use the same checks as in expand_vec_perm_blend. */
21094 if (d
->one_operand_p
)
21096 if (TARGET_AVX2
&& GET_MODE_SIZE (vmode
) == 32)
21098 else if (TARGET_AVX
&& (vmode
== V4DFmode
|| vmode
== V8SFmode
))
21100 else if (TARGET_SSE4_1
&& (GET_MODE_SIZE (vmode
) == 16
21101 || GET_MODE_SIZE (vmode
) == 8
21102 || GET_MODE_SIZE (vmode
) == 4))
21110 dfirst
.op1
= dfirst
.op0
;
21111 dfirst
.one_operand_p
= true;
21112 dsecond
.op0
= dsecond
.op1
;
21113 dsecond
.one_operand_p
= true;
21115 for (i
= 0; i
< nelt
; ++i
)
21116 if (d
->perm
[i
] >= nelt
)
21118 dfirst
.perm
[i
] = 0xff;
21119 dsecond
.perm
[i
] = d
->perm
[i
] - nelt
;
21120 if (d
->perm
[i
] != i
+ nelt
)
21125 dsecond
.perm
[i
] = 0xff;
21126 dfirst
.perm
[i
] = d
->perm
[i
];
21127 if (d
->perm
[i
] != i
)
21131 if (two_insn
&& !ident1
&& !ident2
)
21134 /* For now. Ideally treat 0xff as a wildcard. */
21135 for (i
= 0; i
< nelt
; ++i
)
21136 if (dfirst
.perm
[i
] == 0xff)
21138 if (GET_MODE_SIZE (vmode
) == 32
21139 && dfirst
.perm
[i
^ (nelt
/ 2)] != 0xff)
21140 dfirst
.perm
[i
] = dfirst
.perm
[i
^ (nelt
/ 2)] ^ (nelt
/ 2);
21142 dfirst
.perm
[i
] = i
;
21146 if (GET_MODE_SIZE (vmode
) == 32
21147 && dsecond
.perm
[i
^ (nelt
/ 2)] != 0xff)
21148 dsecond
.perm
[i
] = dsecond
.perm
[i
^ (nelt
/ 2)] ^ (nelt
/ 2);
21150 dsecond
.perm
[i
] = i
;
21156 dfinal
.op0
= dfirst
.target
= gen_reg_rtx (d
->vmode
);
21158 dfinal
.op1
= dsecond
.target
= gen_reg_rtx (d
->vmode
);
21162 rtx_insn
*seq1
= NULL
, *seq2
= NULL
;
21167 ok
= expand_vec_perm_1 (&dfirst
);
21168 seq1
= get_insns ();
21178 ok
= expand_vec_perm_1 (&dsecond
);
21179 seq2
= get_insns ();
21189 for (i
= 0; i
< nelt
; ++i
)
21190 dfinal
.perm
[i
] = (d
->perm
[i
] >= nelt
? i
+ nelt
: i
);
21194 ok
= expand_vec_perm_blend (&dfinal
);
/* A subroutine of ix86_expand_vec_perm_const_1.  Implement a V4DF
   permutation using two vperm2f128, followed by a vshufpd insn blending
   the two vectors together.  */

static bool
expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
{
21206 struct expand_vec_perm_d dfirst
, dsecond
, dthird
;
21209 if (!TARGET_AVX
|| (d
->vmode
!= V4DFmode
))
21219 dfirst
.perm
[0] = (d
->perm
[0] & ~1);
21220 dfirst
.perm
[1] = (d
->perm
[0] & ~1) + 1;
21221 dfirst
.perm
[2] = (d
->perm
[2] & ~1);
21222 dfirst
.perm
[3] = (d
->perm
[2] & ~1) + 1;
21223 dsecond
.perm
[0] = (d
->perm
[1] & ~1);
21224 dsecond
.perm
[1] = (d
->perm
[1] & ~1) + 1;
21225 dsecond
.perm
[2] = (d
->perm
[3] & ~1);
21226 dsecond
.perm
[3] = (d
->perm
[3] & ~1) + 1;
21227 dthird
.perm
[0] = (d
->perm
[0] % 2);
21228 dthird
.perm
[1] = (d
->perm
[1] % 2) + 4;
21229 dthird
.perm
[2] = (d
->perm
[2] % 2) + 2;
21230 dthird
.perm
[3] = (d
->perm
[3] % 2) + 6;
21232 dfirst
.target
= gen_reg_rtx (dfirst
.vmode
);
21233 dsecond
.target
= gen_reg_rtx (dsecond
.vmode
);
21234 dthird
.op0
= dfirst
.target
;
21235 dthird
.op1
= dsecond
.target
;
21236 dthird
.one_operand_p
= false;
21238 canonicalize_perm (&dfirst
);
21239 canonicalize_perm (&dsecond
);
21241 ok
= expand_vec_perm_1 (&dfirst
)
21242 && expand_vec_perm_1 (&dsecond
)
21243 && expand_vec_perm_1 (&dthird
);
static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *);

/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
   a two vector permutation using two intra-lane vector
   permutations, vperm2f128 swapping the lanes and vblend* insn blending
   the non-swapped and swapped vectors together.  */

static bool
expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d *d)
{
21260 struct expand_vec_perm_d dfirst
, dsecond
, dthird
;
21261 unsigned i
, j
, msk
, nelt
= d
->nelt
, nelt2
= nelt
/ 2, which1
= 0, which2
= 0;
21262 rtx_insn
*seq1
, *seq2
;
21264 rtx (*blend
) (rtx
, rtx
, rtx
, rtx
) = NULL
;
21268 || (d
->vmode
!= V8SFmode
&& d
->vmode
!= V4DFmode
)
21269 || d
->one_operand_p
)
21274 for (i
= 0; i
< nelt
; i
++)
21276 dfirst
.perm
[i
] = 0xff;
21277 dsecond
.perm
[i
] = 0xff;
21279 for (i
= 0, msk
= 0; i
< nelt
; i
++)
21281 j
= (d
->perm
[i
] & nelt2
) ? i
| nelt2
: i
& ~nelt2
;
21284 dfirst
.perm
[j
] = d
->perm
[i
];
21285 which1
|= (d
->perm
[i
] < nelt
? 1 : 2);
21289 dsecond
.perm
[j
] = d
->perm
[i
];
21290 which2
|= (d
->perm
[i
] < nelt
? 1 : 2);
21294 if (msk
== 0 || msk
== (1U << nelt
) - 1)
21299 dfirst
.target
= gen_reg_rtx (dfirst
.vmode
);
21300 dsecond
.target
= gen_reg_rtx (dsecond
.vmode
);
21303 for (i
= 0; i
< nelt
; i
++)
21305 if (dfirst
.perm
[i
] == 0xff)
21306 dfirst
.perm
[i
] = (which1
== 2 ? i
+ nelt
: i
);
21307 if (dsecond
.perm
[i
] == 0xff)
21308 dsecond
.perm
[i
] = (which2
== 2 ? i
+ nelt
: i
);
21310 canonicalize_perm (&dfirst
);
21312 ok
= ix86_expand_vec_perm_const_1 (&dfirst
);
21313 seq1
= get_insns ();
21319 canonicalize_perm (&dsecond
);
21321 ok
= ix86_expand_vec_perm_const_1 (&dsecond
);
21322 seq2
= get_insns ();
21335 dthird
.op0
= dsecond
.target
;
21336 dthird
.op1
= dsecond
.target
;
21337 dthird
.one_operand_p
= true;
21338 dthird
.target
= gen_reg_rtx (dthird
.vmode
);
21339 for (i
= 0; i
< nelt
; i
++)
21340 dthird
.perm
[i
] = i
^ nelt2
;
21342 ok
= expand_vec_perm_1 (&dthird
);
21345 blend
= d
->vmode
== V8SFmode
? gen_avx_blendps256
: gen_avx_blendpd256
;
21346 emit_insn (blend (d
->target
, dfirst
.target
, dthird
.target
, GEN_INT (msk
)));
/* A subroutine of expand_vec_perm_even_odd_1.  Implement the double-word
   permutation with two pshufb insns and an ior.  We should have already
   failed all two instruction sequences.  */

static bool
expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
{
21357 rtx rperm
[2][16], vperm
, l
, h
, op
, m128
;
21358 unsigned int i
, nelt
, eltsz
;
21360 rtx (*gen
) (rtx
, rtx
, rtx
);
21362 if (!TARGET_SSSE3
|| (GET_MODE_SIZE (d
->vmode
) != 16
21363 && GET_MODE_SIZE (d
->vmode
) != 8
21364 && GET_MODE_SIZE (d
->vmode
) != 4))
21366 gcc_assert (!d
->one_operand_p
);
21371 switch (GET_MODE_SIZE (d
->vmode
))
21375 gen
= gen_mmx_pshufbv4qi3
;
21379 gen
= gen_mmx_pshufbv8qi3
;
21383 gen
= gen_ssse3_pshufbv16qi3
;
21386 gcc_unreachable ();
21390 eltsz
= GET_MODE_UNIT_SIZE (d
->vmode
);
  /* Generate two permutation masks.  If the required element is within
     the given vector it is shuffled into the proper lane.  If the required
     element is in the other vector, force a zero into the lane by setting
     bit 7 in the permutation mask.  */
21396 m128
= GEN_INT (-128);
21397 for (i
= 0; i
< nelt
; ++i
)
21399 unsigned j
, k
, e
= d
->perm
[i
];
21400 unsigned which
= (e
>= nelt
);
21404 for (j
= 0; j
< eltsz
; ++j
)
21406 rperm
[which
][i
*eltsz
+ j
] = GEN_INT (e
*eltsz
+ j
);
21407 rperm
[1-which
][i
*eltsz
+ j
] = m128
;
21410 for (k
= i
*eltsz
+ j
; k
< 16; ++k
)
21411 rperm
[0][k
] = rperm
[1][k
] = m128
;
21414 vperm
= gen_rtx_CONST_VECTOR (V16QImode
, gen_rtvec_v (16, rperm
[0]));
21415 vperm
= force_reg (V16QImode
, vperm
);
21417 l
= gen_reg_rtx (mode
);
21418 op
= gen_lowpart (mode
, d
->op0
);
21419 emit_insn (gen (l
, op
, vperm
));
21421 vperm
= gen_rtx_CONST_VECTOR (V16QImode
, gen_rtvec_v (16, rperm
[1]));
21422 vperm
= force_reg (V16QImode
, vperm
);
21424 h
= gen_reg_rtx (mode
);
21425 op
= gen_lowpart (mode
, d
->op1
);
21426 emit_insn (gen (h
, op
, vperm
));
21429 if (d
->vmode
!= mode
)
21430 op
= gen_reg_rtx (mode
);
21431 ix86_emit_vec_binop (IOR
, mode
, op
, l
, h
);
  if (op != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}
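/* Illustrative sketch (not part of GCC): the two-pshufb expansion above
   builds one byte selector per operand; a byte that should come from the
   other operand gets bit 7 set (-128), which makes pshufb write zero there,
   so OR-ing the two shuffled vectors yields the result.  The helper below
   models this for byte elements only; the name is an assumption of this
   example, not the compiler's API.  */

static inline void
example_pshufb2_masks (const unsigned char *perm, unsigned nelt,
		       signed char *mask_op0, signed char *mask_op1)
{
  unsigned i;
  for (i = 0; i < nelt; ++i)
    {
      bool from_op1 = perm[i] >= nelt;
      unsigned e = perm[i] & (nelt - 1);	/* Index within its operand.  */
      mask_op0[i] = from_op1 ? -128 : (signed char) e;
      mask_op1[i] = from_op1 ? (signed char) e : -128;
    }
}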
/* Implement arbitrary permutation of one V32QImode or V16HImode operand
   with two vpshufb insns, vpermq and vpor.  We should have already failed
   all two or three instruction sequences.  */

static bool
expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
{
21445 rtx rperm
[2][32], vperm
, l
, h
, hp
, op
, m128
;
21446 unsigned int i
, nelt
, eltsz
;
21449 || !d
->one_operand_p
21450 || (d
->vmode
!= V32QImode
&& d
->vmode
!= V16HImode
))
21457 eltsz
= GET_MODE_UNIT_SIZE (d
->vmode
);
  /* Generate two permutation masks.  If the required element is within
     the same lane, it is shuffled in.  If the required element is from the
     other lane, force a zero by setting bit 7 in the permutation mask.
     In the other mask the mask has non-negative elements if the element
     is requested from the other lane, but also moved to the other lane,
     so that the result of vpshufb can have the two V2TImode halves
     swapped.  */
21466 m128
= GEN_INT (-128);
21467 for (i
= 0; i
< nelt
; ++i
)
21469 unsigned j
, e
= d
->perm
[i
] & (nelt
/ 2 - 1);
21470 unsigned which
= ((d
->perm
[i
] ^ i
) & (nelt
/ 2)) * eltsz
;
21472 for (j
= 0; j
< eltsz
; ++j
)
21474 rperm
[!!which
][(i
* eltsz
+ j
) ^ which
] = GEN_INT (e
* eltsz
+ j
);
21475 rperm
[!which
][(i
* eltsz
+ j
) ^ (which
^ 16)] = m128
;
21479 vperm
= gen_rtx_CONST_VECTOR (V32QImode
, gen_rtvec_v (32, rperm
[1]));
21480 vperm
= force_reg (V32QImode
, vperm
);
21482 h
= gen_reg_rtx (V32QImode
);
21483 op
= gen_lowpart (V32QImode
, d
->op0
);
21484 emit_insn (gen_avx2_pshufbv32qi3 (h
, op
, vperm
));
  /* Swap the 128-bit lanes of h into hp.  */
21487 hp
= gen_reg_rtx (V4DImode
);
21488 op
= gen_lowpart (V4DImode
, h
);
21489 emit_insn (gen_avx2_permv4di_1 (hp
, op
, const2_rtx
, GEN_INT (3), const0_rtx
,
21492 vperm
= gen_rtx_CONST_VECTOR (V32QImode
, gen_rtvec_v (32, rperm
[0]));
21493 vperm
= force_reg (V32QImode
, vperm
);
21495 l
= gen_reg_rtx (V32QImode
);
21496 op
= gen_lowpart (V32QImode
, d
->op0
);
21497 emit_insn (gen_avx2_pshufbv32qi3 (l
, op
, vperm
));
21500 if (d
->vmode
!= V32QImode
)
21501 op
= gen_reg_rtx (V32QImode
);
21502 emit_insn (gen_iorv32qi3 (op
, l
, gen_lowpart (V32QImode
, hp
)));
21503 if (op
!= d
->target
)
21504 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, op
));
/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   and extract-odd permutations of two V32QImode or V16HImode operands
   with two vpshufb insns, vpor and vpermq.  We should have already
   failed all two or three instruction sequences.  */

static bool
expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
{
21517 rtx rperm
[2][32], vperm
, l
, h
, ior
, op
, m128
;
21518 unsigned int i
, nelt
, eltsz
;
21521 || d
->one_operand_p
21522 || (d
->vmode
!= V32QImode
&& d
->vmode
!= V16HImode
))
21525 for (i
= 0; i
< d
->nelt
; ++i
)
21526 if ((d
->perm
[i
] ^ (i
* 2)) & (3 * d
->nelt
/ 2))
21533 eltsz
= GET_MODE_UNIT_SIZE (d
->vmode
);
21535 /* Generate two permutation masks. In the first permutation mask
21536 the first quarter will contain indexes for the first half
21537 of the op0, the second quarter will contain bit 7 set, third quarter
21538 will contain indexes for the second half of the op0 and the
21539 last quarter bit 7 set. In the second permutation mask
21540 the first quarter will contain bit 7 set, the second quarter
21541 indexes for the first half of the op1, the third quarter bit 7 set
21542 and last quarter indexes for the second half of the op1.
21543 I.e. the first mask e.g. for V32QImode extract even will be:
21544 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
21545 (all values masked with 0xf except for -128) and second mask
21546 for extract even will be
21547 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
21548 m128
= GEN_INT (-128);
21549 for (i
= 0; i
< nelt
; ++i
)
21551 unsigned j
, e
= d
->perm
[i
] & (nelt
/ 2 - 1);
21552 unsigned which
= d
->perm
[i
] >= nelt
;
21553 unsigned xorv
= (i
>= nelt
/ 4 && i
< 3 * nelt
/ 4) ? 24 : 0;
21555 for (j
= 0; j
< eltsz
; ++j
)
21557 rperm
[which
][(i
* eltsz
+ j
) ^ xorv
] = GEN_INT (e
* eltsz
+ j
);
21558 rperm
[1 - which
][(i
* eltsz
+ j
) ^ xorv
] = m128
;
21562 vperm
= gen_rtx_CONST_VECTOR (V32QImode
, gen_rtvec_v (32, rperm
[0]));
21563 vperm
= force_reg (V32QImode
, vperm
);
21565 l
= gen_reg_rtx (V32QImode
);
21566 op
= gen_lowpart (V32QImode
, d
->op0
);
21567 emit_insn (gen_avx2_pshufbv32qi3 (l
, op
, vperm
));
21569 vperm
= gen_rtx_CONST_VECTOR (V32QImode
, gen_rtvec_v (32, rperm
[1]));
21570 vperm
= force_reg (V32QImode
, vperm
);
21572 h
= gen_reg_rtx (V32QImode
);
21573 op
= gen_lowpart (V32QImode
, d
->op1
);
21574 emit_insn (gen_avx2_pshufbv32qi3 (h
, op
, vperm
));
21576 ior
= gen_reg_rtx (V32QImode
);
21577 emit_insn (gen_iorv32qi3 (ior
, l
, h
));
21579 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
21580 op
= gen_reg_rtx (V4DImode
);
21581 ior
= gen_lowpart (V4DImode
, ior
);
21582 emit_insn (gen_avx2_permv4di_1 (op
, ior
, const0_rtx
, const2_rtx
,
21583 const1_rtx
, GEN_INT (3)));
21584 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, op
));
/* Implement permutation with pslldq + psrldq + por when pshufb is not
   available.  */

static bool
expand_vec_perm_pslldq_psrldq_por (struct expand_vec_perm_d *d, bool pandn)
{
21594 unsigned i
, nelt
= d
->nelt
;
21595 unsigned start1
, end1
= -1;
21596 machine_mode vmode
= d
->vmode
, imode
;
21598 bool clear_op0
, clear_op1
;
21599 unsigned inner_size
;
21600 rtx op0
, op1
, dop1
;
21601 rtx (*gen_vec_shr
) (rtx
, rtx
, rtx
);
21602 rtx (*gen_vec_shl
) (rtx
, rtx
, rtx
);
21604 /* pshufd can be used for V4SI/V2DI under TARGET_SSE2. */
21605 if (!TARGET_SSE2
|| (vmode
!= E_V16QImode
&& vmode
!= E_V8HImode
))
21608 start1
= d
->perm
[0];
21609 for (i
= 1; i
< nelt
; i
++)
21611 if (d
->perm
[i
] != d
->perm
[i
-1] + 1
21612 || d
->perm
[i
] == nelt
)
21616 start2
= d
->perm
[i
];
21617 end1
= d
->perm
[i
-1];
21624 clear_op0
= end1
!= nelt
- 1;
21625 clear_op1
= start2
% nelt
!= 0;
21626 /* pandn/pand is needed to clear upper/lower bits of op0/op1. */
21627 if (!pandn
&& (clear_op0
|| clear_op1
))
21633 gen_vec_shr
= vmode
== E_V16QImode
? gen_vec_shr_v16qi
: gen_vec_shr_v8hi
;
21634 gen_vec_shl
= vmode
== E_V16QImode
? gen_vec_shl_v16qi
: gen_vec_shl_v8hi
;
21635 imode
= GET_MODE_INNER (vmode
);
21636 inner_size
= GET_MODE_BITSIZE (imode
);
21637 op0
= gen_reg_rtx (vmode
);
21638 op1
= gen_reg_rtx (vmode
);
21641 emit_insn (gen_vec_shr (op0
, d
->op0
, GEN_INT (start1
* inner_size
)));
21643 emit_move_insn (op0
, d
->op0
);
21646 if (d
->one_operand_p
)
21649 int shl_offset
= end1
- start1
+ 1 - start2
% nelt
;
21651 emit_insn (gen_vec_shl (op1
, dop1
, GEN_INT (shl_offset
* inner_size
)));
21653 emit_move_insn (op1
, dop1
);
21655 /* Clear lower/upper bits for op0/op1. */
21656 if (clear_op0
|| clear_op1
)
21661 for (i
= 0; i
!= nelt
; i
++)
21663 if (i
< (end1
- start1
+ 1))
21664 vec
[i
] = gen_int_mode ((HOST_WIDE_INT_1U
<< inner_size
) - 1, imode
);
21666 vec
[i
] = CONST0_RTX (imode
);
21668 const_vec
= gen_rtx_CONST_VECTOR (vmode
, gen_rtvec_v (nelt
, vec
));
21669 const_vec
= validize_mem (force_const_mem (vmode
, const_vec
));
21670 clear
= force_reg (vmode
, const_vec
);
21673 emit_move_insn (op0
, gen_rtx_AND (vmode
, op0
, clear
));
21675 emit_move_insn (op1
, gen_rtx_AND (vmode
,
21676 gen_rtx_NOT (vmode
, clear
),
21680 emit_move_insn (d
->target
, gen_rtx_IOR (vmode
, op0
, op1
));
/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   and extract-odd permutations of two V8QI, V8HI, V16QI, V16HI or V32QI
   operands with two "and" and "pack" or two "shift" and "pack" insns.
   We should have already failed all two instruction sequences.  */

static bool
expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
{
21692 rtx op
, dop0
, dop1
, t
;
21693 unsigned i
, odd
, c
, s
, nelt
= d
->nelt
;
21694 bool end_perm
= false;
21695 machine_mode half_mode
;
21696 rtx (*gen_and
) (rtx
, rtx
, rtx
);
21697 rtx (*gen_pack
) (rtx
, rtx
, rtx
);
21698 rtx (*gen_shift
) (rtx
, rtx
, rtx
);
21700 if (d
->one_operand_p
)
21706 /* Required for "pack". */
21707 if (!TARGET_SSE4_1
)
21711 half_mode
= V2SImode
;
21712 gen_and
= gen_andv2si3
;
21713 gen_pack
= gen_mmx_packusdw
;
21714 gen_shift
= gen_lshrv2si3
;
21717 /* Required for "pack". */
21718 if (!TARGET_SSE4_1
)
21722 half_mode
= V4SImode
;
21723 gen_and
= gen_andv4si3
;
21724 gen_pack
= gen_sse4_1_packusdw
;
21725 gen_shift
= gen_lshrv4si3
;
21728 /* No check as all instructions are SSE2. */
21731 half_mode
= V4HImode
;
21732 gen_and
= gen_andv4hi3
;
21733 gen_pack
= gen_mmx_packuswb
;
21734 gen_shift
= gen_lshrv4hi3
;
21737 /* No check as all instructions are SSE2. */
21740 half_mode
= V8HImode
;
21741 gen_and
= gen_andv8hi3
;
21742 gen_pack
= gen_sse2_packuswb
;
21743 gen_shift
= gen_lshrv8hi3
;
21750 half_mode
= V8SImode
;
21751 gen_and
= gen_andv8si3
;
21752 gen_pack
= gen_avx2_packusdw
;
21753 gen_shift
= gen_lshrv8si3
;
21761 half_mode
= V16HImode
;
21762 gen_and
= gen_andv16hi3
;
21763 gen_pack
= gen_avx2_packuswb
;
21764 gen_shift
= gen_lshrv16hi3
;
21768 /* Only V4HI, V8QI, V8HI, V16QI, V16HI and V32QI modes
21769 are more profitable than general shuffles. */
21773 /* Check that permutation is even or odd. */
21778 for (i
= 1; i
< nelt
; ++i
)
21779 if (d
->perm
[i
] != 2 * i
+ odd
)
21785 dop0
= gen_reg_rtx (half_mode
);
21786 dop1
= gen_reg_rtx (half_mode
);
21789 t
= gen_const_vec_duplicate (half_mode
, GEN_INT (c
));
21790 t
= force_reg (half_mode
, t
);
21791 emit_insn (gen_and (dop0
, t
, gen_lowpart (half_mode
, d
->op0
)));
21792 emit_insn (gen_and (dop1
, t
, gen_lowpart (half_mode
, d
->op1
)));
21796 emit_insn (gen_shift (dop0
,
21797 gen_lowpart (half_mode
, d
->op0
),
21799 emit_insn (gen_shift (dop1
,
21800 gen_lowpart (half_mode
, d
->op1
),
21803 /* In AVX2 for 256 bit case we need to permute pack result. */
21804 if (TARGET_AVX2
&& end_perm
)
21806 op
= gen_reg_rtx (d
->vmode
);
21807 t
= gen_reg_rtx (V4DImode
);
21808 emit_insn (gen_pack (op
, dop0
, dop1
));
21809 emit_insn (gen_avx2_permv4di_1 (t
,
21810 gen_lowpart (V4DImode
, op
),
21815 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, t
));
  else
    emit_insn (gen_pack (d->target, dop0, dop1));

  return true;
}
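/* Illustrative sketch (not part of GCC): in the pack-based even/odd
   extraction above, each double-wide element either keeps only its low half
   (even: AND with the low-half mask) or is shifted right by the element
   width (odd), after which an unsigned-saturating pack of the two operands
   produces the narrower result; the 256-bit AVX2 case additionally permutes
   the packed quadwords.  The scalar model below packs 16-bit elements out of
   32-bit ones and is only an example, not the compiler's API.  */

static inline void
example_even_odd_pack_scalar (const unsigned int *op0, const unsigned int *op1,
			      unsigned n, bool odd, unsigned short *out)
{
  unsigned i;
  for (i = 0; i < n; ++i)
    {
      unsigned int lo = odd ? (op0[i] >> 16) : (op0[i] & 0xffff);
      unsigned int hi = odd ? (op1[i] >> 16) : (op1[i] & 0xffff);
      out[i] = (unsigned short) lo;	/* Low half of the result from op0.  */
      out[n + i] = (unsigned short) hi;	/* High half from op1.  */
    }
}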
/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   and extract-odd permutations of two V64QI operands
   with two "shifts", two "truncs" and one "concat" insn for "odd"
   and two "truncs" and one "concat" insn for "even".
   We should have already failed all two instruction sequences.  */

static bool
expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
{
21832 rtx t1
, t2
, t3
, t4
;
21833 unsigned i
, odd
, nelt
= d
->nelt
;
21835 if (!TARGET_AVX512BW
21836 || d
->one_operand_p
21837 || d
->vmode
!= V64QImode
)
21840 /* Check that permutation is even or odd. */
21845 for (i
= 1; i
< nelt
; ++i
)
21846 if (d
->perm
[i
] != 2 * i
+ odd
)
21855 t1
= gen_reg_rtx (V32HImode
);
21856 t2
= gen_reg_rtx (V32HImode
);
21857 emit_insn (gen_lshrv32hi3 (t1
,
21858 gen_lowpart (V32HImode
, d
->op0
),
21860 emit_insn (gen_lshrv32hi3 (t2
,
21861 gen_lowpart (V32HImode
, d
->op1
),
21866 t1
= gen_lowpart (V32HImode
, d
->op0
);
21867 t2
= gen_lowpart (V32HImode
, d
->op1
);
21870 t3
= gen_reg_rtx (V32QImode
);
21871 t4
= gen_reg_rtx (V32QImode
);
21872 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3
, t1
));
21873 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4
, t2
));
21874 emit_insn (gen_avx_vec_concatv64qi (d
->target
, t3
, t4
));
/* A subroutine of ix86_expand_vec_perm_const_1.  Implement extract-even
   and extract-odd permutations.  */

static bool
expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
{
21885 rtx t1
, t2
, t3
, t4
, t5
;
21892 t1
= gen_reg_rtx (V4DFmode
);
21893 t2
= gen_reg_rtx (V4DFmode
);
21895 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
21896 emit_insn (gen_avx_vperm2f128v4df3 (t1
, d
->op0
, d
->op1
, GEN_INT (0x20)));
21897 emit_insn (gen_avx_vperm2f128v4df3 (t2
, d
->op0
, d
->op1
, GEN_INT (0x31)));
21899 /* Now an unpck[lh]pd will produce the result required. */
21901 t3
= gen_avx_unpckhpd256 (d
->target
, t1
, t2
);
21903 t3
= gen_avx_unpcklpd256 (d
->target
, t1
, t2
);
21909 int mask
= odd
? 0xdd : 0x88;
21913 t1
= gen_reg_rtx (V8SFmode
);
21914 t2
= gen_reg_rtx (V8SFmode
);
21915 t3
= gen_reg_rtx (V8SFmode
);
21917 /* Shuffle within the 128-bit lanes to produce:
21918 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
21919 emit_insn (gen_avx_shufps256 (t1
, d
->op0
, d
->op1
,
21922 /* Shuffle the lanes around to produce:
21923 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
21924 emit_insn (gen_avx_vperm2f128v8sf3 (t2
, t1
, t1
,
21927 /* Shuffle within the 128-bit lanes to produce:
21928 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
21929 emit_insn (gen_avx_shufps256 (t3
, t1
, t2
, GEN_INT (0x44)));
21931 /* Shuffle within the 128-bit lanes to produce:
21932 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
21933 emit_insn (gen_avx_shufps256 (t2
, t1
, t2
, GEN_INT (0xee)));
21935 /* Shuffle the lanes around to produce:
21936 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
21937 emit_insn (gen_avx_vperm2f128v8sf3 (d
->target
, t3
, t2
,
21948 /* These are always directly implementable by expand_vec_perm_1. */
21949 gcc_unreachable ();
21952 gcc_assert (TARGET_MMX_WITH_SSE
);
21953 /* We have no suitable instructions. */
21959 if (TARGET_SSSE3
&& !TARGET_SLOW_PSHUFB
)
21960 return expand_vec_perm_pshufb2 (d
);
21965 /* We need 2*log2(N)-1 operations to achieve odd/even
21966 with interleave. */
21967 t1
= gen_reg_rtx (V4QImode
);
21968 emit_insn (gen_mmx_punpckhbw_low (t1
, d
->op0
, d
->op1
));
21969 emit_insn (gen_mmx_punpcklbw_low (d
->target
, d
->op0
, d
->op1
));
21971 t2
= gen_mmx_punpckhbw_low (d
->target
, d
->target
, t1
);
21973 t2
= gen_mmx_punpcklbw_low (d
->target
, d
->target
, t1
);
21980 return expand_vec_perm_even_odd_pack (d
);
21981 else if (TARGET_SSSE3
&& !TARGET_SLOW_PSHUFB
)
21982 return expand_vec_perm_pshufb2 (d
);
21987 /* We need 2*log2(N)-1 operations to achieve odd/even
21988 with interleave. */
21989 t1
= gen_reg_rtx (V4HImode
);
21990 emit_insn (gen_mmx_punpckhwd (t1
, d
->op0
, d
->op1
));
21991 emit_insn (gen_mmx_punpcklwd (d
->target
, d
->op0
, d
->op1
));
21993 t2
= gen_mmx_punpckhwd (d
->target
, d
->target
, t1
);
21995 t2
= gen_mmx_punpcklwd (d
->target
, d
->target
, t1
);
22002 return expand_vec_perm_even_odd_pack (d
);
22003 else if (TARGET_SSSE3
&& !TARGET_SLOW_PSHUFB
)
22004 return expand_vec_perm_pshufb2 (d
);
22009 /* We need 2*log2(N)-1 operations to achieve odd/even
22010 with interleave. */
22011 t1
= gen_reg_rtx (V8HImode
);
22012 t2
= gen_reg_rtx (V8HImode
);
22013 emit_insn (gen_vec_interleave_highv8hi (t1
, d
->op0
, d
->op1
));
22014 emit_insn (gen_vec_interleave_lowv8hi (d
->target
, d
->op0
, d
->op1
));
22015 emit_insn (gen_vec_interleave_highv8hi (t2
, d
->target
, t1
));
22016 emit_insn (gen_vec_interleave_lowv8hi (d
->target
, d
->target
, t1
));
22018 t3
= gen_vec_interleave_highv8hi (d
->target
, d
->target
, t2
);
22020 t3
= gen_vec_interleave_lowv8hi (d
->target
, d
->target
, t2
);
22027 return expand_vec_perm_even_odd_pack (d
);
22031 return expand_vec_perm_even_odd_pack (d
);
22034 return expand_vec_perm_even_odd_trunc (d
);
22039 struct expand_vec_perm_d d_copy
= *d
;
22040 d_copy
.vmode
= V4DFmode
;
22042 d_copy
.target
= gen_raw_REG (V4DFmode
, LAST_VIRTUAL_REGISTER
+ 1);
22044 d_copy
.target
= gen_reg_rtx (V4DFmode
);
22045 d_copy
.op0
= gen_lowpart (V4DFmode
, d
->op0
);
22046 d_copy
.op1
= gen_lowpart (V4DFmode
, d
->op1
);
22047 if (expand_vec_perm_even_odd_1 (&d_copy
, odd
))
22050 emit_move_insn (d
->target
,
22051 gen_lowpart (V4DImode
, d_copy
.target
));
22060 t1
= gen_reg_rtx (V4DImode
);
22061 t2
= gen_reg_rtx (V4DImode
);
22063 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
22064 emit_insn (gen_avx2_permv2ti (t1
, d
->op0
, d
->op1
, GEN_INT (0x20)));
22065 emit_insn (gen_avx2_permv2ti (t2
, d
->op0
, d
->op1
, GEN_INT (0x31)));
22067 /* Now an vpunpck[lh]qdq will produce the result required. */
22069 t3
= gen_avx2_interleave_highv4di (d
->target
, t1
, t2
);
22071 t3
= gen_avx2_interleave_lowv4di (d
->target
, t1
, t2
);
22078 struct expand_vec_perm_d d_copy
= *d
;
22079 d_copy
.vmode
= V8SFmode
;
22081 d_copy
.target
= gen_raw_REG (V8SFmode
, LAST_VIRTUAL_REGISTER
+ 1);
22083 d_copy
.target
= gen_reg_rtx (V8SFmode
);
22084 d_copy
.op0
= gen_lowpart (V8SFmode
, d
->op0
);
22085 d_copy
.op1
= gen_lowpart (V8SFmode
, d
->op1
);
22086 if (expand_vec_perm_even_odd_1 (&d_copy
, odd
))
22089 emit_move_insn (d
->target
,
22090 gen_lowpart (V8SImode
, d_copy
.target
));
22099 t1
= gen_reg_rtx (V8SImode
);
22100 t2
= gen_reg_rtx (V8SImode
);
22101 t3
= gen_reg_rtx (V4DImode
);
22102 t4
= gen_reg_rtx (V4DImode
);
22103 t5
= gen_reg_rtx (V4DImode
);
22105 /* Shuffle the lanes around into
22106 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
22107 emit_insn (gen_avx2_permv2ti (t3
, gen_lowpart (V4DImode
, d
->op0
),
22108 gen_lowpart (V4DImode
, d
->op1
),
22110 emit_insn (gen_avx2_permv2ti (t4
, gen_lowpart (V4DImode
, d
->op0
),
22111 gen_lowpart (V4DImode
, d
->op1
),
22114 /* Swap the 2nd and 3rd position in each lane into
22115 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
22116 emit_insn (gen_avx2_pshufdv3 (t1
, gen_lowpart (V8SImode
, t3
),
22117 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
22118 emit_insn (gen_avx2_pshufdv3 (t2
, gen_lowpart (V8SImode
, t4
),
22119 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
22121 /* Now an vpunpck[lh]qdq will produce
22122 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
22124 t3
= gen_avx2_interleave_highv4di (t5
, gen_lowpart (V4DImode
, t1
),
22125 gen_lowpart (V4DImode
, t2
));
22127 t3
= gen_avx2_interleave_lowv4di (t5
, gen_lowpart (V4DImode
, t1
),
22128 gen_lowpart (V4DImode
, t2
));
22130 emit_move_insn (d
->target
, gen_lowpart (V8SImode
, t5
));
    default:
      gcc_unreachable ();
    }

  return true;
}
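/* Summary of the 256-bit integer paths above: V4DI even/odd gathers the low
   lanes and the high lanes with two vperm2ti shuffles and then interleaves
   them with vpunpcklqdq or vpunpckhqdq; V8SI additionally swaps the 2nd and
   3rd dword in each lane with pshufd first, so that the final quadword
   interleave yields { 0 2 4 6 8 a c e } or { 1 3 5 7 9 b d f }.  */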
/* A subroutine of ix86_expand_vec_perm_const_1.  Pattern match
   extract-even and extract-odd permutations.  */

static bool
expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
{
22146 unsigned i
, odd
, nelt
= d
->nelt
;
22149 if (odd
!= 0 && odd
!= 1)
22152 for (i
= 1; i
< nelt
; ++i
)
22153 if (d
->perm
[i
] != 2 * i
+ odd
)
22156 if (d
->vmode
== E_V32HImode
22158 && !TARGET_AVX512BW
)
22161 return expand_vec_perm_even_odd_1 (d
, odd
);
/* A subroutine of ix86_expand_vec_perm_const_1.  Implement broadcast
   permutations.  We assume that expand_vec_perm_1 has already failed.  */

static bool
expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
{
22170 unsigned elt
= d
->perm
[0], nelt2
= d
->nelt
/ 2;
22171 machine_mode vmode
= d
->vmode
;
22172 rtx (*gen
) (rtx
, rtx
, rtx
);
22173 unsigned char perm2
[4];
22174 rtx op0
= d
->op0
, dest
;
22181 /* These are special-cased in sse.md so that we can optionally
22182 use the vbroadcast instruction. They expand to two insns
22183 if the input happens to be in a register. */
22184 gcc_unreachable ();
22194 /* These are always implementable using standard shuffle patterns. */
22195 gcc_unreachable ();
22198 /* This can be implemented via interleave and pshuflw. */
22204 gen
= gen_mmx_punpckhbw_low
;
22208 gen
= gen_mmx_punpcklbw_low
;
22210 dest
= gen_reg_rtx (vmode
);
22211 emit_insn (gen (dest
, op0
, op0
));
22212 vmode
= get_mode_wider_vector (vmode
);
22213 op0
= gen_lowpart (vmode
, dest
);
22215 memset (perm2
, elt
, 2);
22216 dest
= gen_reg_rtx (vmode
);
22217 ok
= expand_vselect (dest
, op0
, perm2
, 2, d
->testing_p
);
22220 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, dest
));
22224 /* This can be implemented via interleave. We save one insn by
22225 stopping once we have promoted to V2SImode and then use pshufd. */
22232 gen
= vmode
== V8QImode
? gen_mmx_punpckhbw
22233 : gen_mmx_punpckhwd
;
22237 gen
= vmode
== V8QImode
? gen_mmx_punpcklbw
22238 : gen_mmx_punpcklwd
;
22241 dest
= gen_reg_rtx (vmode
);
22242 emit_insn (gen (dest
, op0
, op0
));
22243 vmode
= get_mode_wider_vector (vmode
);
22244 op0
= gen_lowpart (vmode
, dest
);
22246 while (vmode
!= V2SImode
);
22248 memset (perm2
, elt
, 2);
22249 dest
= gen_reg_rtx (vmode
);
22250 ok
= expand_vselect (dest
, op0
, perm2
, 2, d
->testing_p
);
22253 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, dest
));
22258 /* These can be implemented via interleave. We save one insn by
22259 stopping once we have promoted to V4SImode and then use pshufd. */
22266 gen
= vmode
== V16QImode
? gen_vec_interleave_highv16qi
22267 : gen_vec_interleave_highv8hi
;
22271 gen
= vmode
== V16QImode
? gen_vec_interleave_lowv16qi
22272 : gen_vec_interleave_lowv8hi
;
22275 dest
= gen_reg_rtx (vmode
);
22276 emit_insn (gen (dest
, op0
, op0
));
22277 vmode
= get_mode_wider_vector (vmode
);
22278 op0
= gen_lowpart (vmode
, dest
);
22280 while (vmode
!= V4SImode
);
22282 memset (perm2
, elt
, 4);
22283 dest
= gen_reg_rtx (vmode
);
22284 ok
= expand_vselect (dest
, op0
, perm2
, 4, d
->testing_p
);
22287 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, dest
));
22292 /* This can be implemented via interleave and pshufd. */
22296 rtx (*maybe_gen
) (machine_mode
, int, rtx
, rtx
, rtx
);
22299 maybe_gen
= maybe_gen_vec_interleave_high
;
22303 maybe_gen
= maybe_gen_vec_interleave_low
;
22306 dest
= gen_reg_rtx (vmode
);
22307 emit_insn (maybe_gen (vmode
, 1, dest
, op0
, op0
));
22310 op0
= gen_lowpart (vmode
, dest
);
22312 memset (perm2
, elt
, 4);
22313 dest
= gen_reg_rtx (vmode
);
22314 ok
= expand_vselect (dest
, op0
, perm2
, 4, d
->testing_p
);
22317 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, dest
));
22324 /* For AVX2 broadcasts of the first element vpbroadcast* or
22325 vpermq should be used by expand_vec_perm_1. */
22326 gcc_assert (!TARGET_AVX2
|| d
->perm
[0]);
22330 gcc_assert (!TARGET_AVX512BW
|| d
->perm
[0]);
22334 gcc_assert (!TARGET_AVX512BW
);
22338 gcc_unreachable ();
/* A subroutine of ix86_expand_vec_perm_const_1.  Pattern match
   broadcast permutations.  */

static bool
expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
{
22348 unsigned i
, elt
, nelt
= d
->nelt
;
22350 if (!d
->one_operand_p
)
22354 for (i
= 1; i
< nelt
; ++i
)
22355 if (d
->perm
[i
] != elt
)
22358 return expand_vec_perm_broadcast_1 (d
);
/* Implement arbitrary permutations of two V64QImode operands
   with 2 vperm[it]2w, 2 vpshufb and one vpor instruction.  */

static bool
expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
{
  if (!TARGET_AVX512BW
      || d->vmode != V64QImode)
    return false;
22372 struct expand_vec_perm_d ds
[2];
22373 rtx rperm
[128], vperm
, target0
, target1
;
22374 unsigned int i
, nelt
;
22375 machine_mode vmode
;
22380 for (i
= 0; i
< 2; i
++)
22383 ds
[i
].vmode
= V32HImode
;
22385 ds
[i
].target
= gen_reg_rtx (V32HImode
);
22386 ds
[i
].op0
= gen_lowpart (V32HImode
, d
->op0
);
22387 ds
[i
].op1
= gen_lowpart (V32HImode
, d
->op1
);
  /* Prepare permutations such that the first one takes care of
     putting the even bytes into the right positions or one higher
     positions (ds[0]) and the second one takes care of
     putting the odd bytes into the right positions or one below
     positions (ds[1]).  */
22396 for (i
= 0; i
< nelt
; i
++)
22398 ds
[i
& 1].perm
[i
/ 2] = d
->perm
[i
] / 2;
22401 rperm
[i
] = constm1_rtx
;
22402 rperm
[i
+ 64] = GEN_INT ((i
& 14) + (d
->perm
[i
] & 1));
22406 rperm
[i
] = GEN_INT ((i
& 14) + (d
->perm
[i
] & 1));
22407 rperm
[i
+ 64] = constm1_rtx
;
22411 bool ok
= expand_vec_perm_1 (&ds
[0]);
22413 ds
[0].target
= gen_lowpart (V64QImode
, ds
[0].target
);
22415 ok
= expand_vec_perm_1 (&ds
[1]);
22417 ds
[1].target
= gen_lowpart (V64QImode
, ds
[1].target
);
22419 vperm
= gen_rtx_CONST_VECTOR (V64QImode
, gen_rtvec_v (64, rperm
));
22420 vperm
= force_reg (vmode
, vperm
);
22421 target0
= gen_reg_rtx (V64QImode
);
22422 emit_insn (gen_avx512bw_pshufbv64qi3 (target0
, ds
[0].target
, vperm
));
22424 vperm
= gen_rtx_CONST_VECTOR (V64QImode
, gen_rtvec_v (64, rperm
+ 64));
22425 vperm
= force_reg (vmode
, vperm
);
22426 target1
= gen_reg_rtx (V64QImode
);
22427 emit_insn (gen_avx512bw_pshufbv64qi3 (target1
, ds
[1].target
, vperm
));
22429 emit_insn (gen_iorv64qi3 (d
->target
, target0
, target1
));
/* Implement arbitrary permutation of two V32QImode or V16HImode operands
   with 4 vpshufb insns, 2 vpermq and 3 vpor.  We should have already failed
   all the shorter instruction sequences.  */

static bool
expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
{
22440 rtx rperm
[4][32], vperm
, l
[2], h
[2], op
, m128
;
22441 unsigned int i
, nelt
, eltsz
;
22445 || d
->one_operand_p
22446 || (d
->vmode
!= V32QImode
&& d
->vmode
!= V16HImode
))
22453 eltsz
= GET_MODE_UNIT_SIZE (d
->vmode
);
  /* Generate 4 permutation masks.  If the required element is within
     the same lane, it is shuffled in.  If the required element is from the
     other lane, force a zero by setting bit 7 in the permutation mask.
     In the other mask the mask has non-negative elements if the element
     is requested from the other lane, but also moved to the other lane,
     so that the result of vpshufb can have the two V2TImode halves
     swapped.  */
22462 m128
= GEN_INT (-128);
22463 for (i
= 0; i
< 32; ++i
)
22465 rperm
[0][i
] = m128
;
22466 rperm
[1][i
] = m128
;
22467 rperm
[2][i
] = m128
;
22468 rperm
[3][i
] = m128
;
22474 for (i
= 0; i
< nelt
; ++i
)
22476 unsigned j
, e
= d
->perm
[i
] & (nelt
/ 2 - 1);
22477 unsigned xlane
= ((d
->perm
[i
] ^ i
) & (nelt
/ 2)) * eltsz
;
22478 unsigned int which
= ((d
->perm
[i
] & nelt
) ? 2 : 0) + (xlane
? 1 : 0);
22480 for (j
= 0; j
< eltsz
; ++j
)
22481 rperm
[which
][(i
* eltsz
+ j
) ^ xlane
] = GEN_INT (e
* eltsz
+ j
);
22482 used
[which
] = true;
22485 for (i
= 0; i
< 2; ++i
)
22487 if (!used
[2 * i
+ 1])
22492 vperm
= gen_rtx_CONST_VECTOR (V32QImode
,
22493 gen_rtvec_v (32, rperm
[2 * i
+ 1]));
22494 vperm
= force_reg (V32QImode
, vperm
);
22495 h
[i
] = gen_reg_rtx (V32QImode
);
22496 op
= gen_lowpart (V32QImode
, i
? d
->op1
: d
->op0
);
22497 emit_insn (gen_avx2_pshufbv32qi3 (h
[i
], op
, vperm
));
  /* Swap the 128-bit lanes of h[X].  */
22501 for (i
= 0; i
< 2; ++i
)
22503 if (h
[i
] == NULL_RTX
)
22505 op
= gen_reg_rtx (V4DImode
);
22506 emit_insn (gen_avx2_permv4di_1 (op
, gen_lowpart (V4DImode
, h
[i
]),
22507 const2_rtx
, GEN_INT (3), const0_rtx
,
22509 h
[i
] = gen_lowpart (V32QImode
, op
);
22512 for (i
= 0; i
< 2; ++i
)
22519 vperm
= gen_rtx_CONST_VECTOR (V32QImode
, gen_rtvec_v (32, rperm
[2 * i
]));
22520 vperm
= force_reg (V32QImode
, vperm
);
22521 l
[i
] = gen_reg_rtx (V32QImode
);
22522 op
= gen_lowpart (V32QImode
, i
? d
->op1
: d
->op0
);
22523 emit_insn (gen_avx2_pshufbv32qi3 (l
[i
], op
, vperm
));
22526 for (i
= 0; i
< 2; ++i
)
22530 op
= gen_reg_rtx (V32QImode
);
22531 emit_insn (gen_iorv32qi3 (op
, l
[i
], h
[i
]));
22538 gcc_assert (l
[0] && l
[1]);
22540 if (d
->vmode
!= V32QImode
)
22541 op
= gen_reg_rtx (V32QImode
);
22542 emit_insn (gen_iorv32qi3 (op
, l
[0], l
[1]));
22543 if (op
!= d
->target
)
22544 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, op
));
/* The guts of ix86_vectorize_vec_perm_const.  With all of the interface bits
   taken care of, perform the expansion in D and return true on success.  */

static bool
ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
{
22554 /* Try a single instruction expansion. */
22555 if (expand_vec_perm_1 (d
))
22558 /* Try sequences of two instructions. */
22560 if (expand_vec_perm_pshuflw_pshufhw (d
))
22563 if (expand_vec_perm_palignr (d
, false))
22566 if (expand_vec_perm_interleave2 (d
))
22569 if (expand_vec_perm_broadcast (d
))
22572 if (expand_vec_perm_vpermq_perm_1 (d
))
22575 if (expand_vec_perm_vperm2f128 (d
))
22578 if (expand_vec_perm_pblendv (d
))
22581 if (expand_vec_perm_2perm_interleave (d
, true))
22584 if (expand_vec_perm_2perm_pblendv (d
, true))
22587 if (expand_vec_perm_shufps_shufps (d
))
22590 /* Try sequences of three instructions. */
22592 if (expand_vec_perm_even_odd_pack (d
))
22595 if (expand_vec_perm_2vperm2f128_vshuf (d
))
22598 if (expand_vec_perm_pshufb2 (d
))
22601 if (expand_vec_perm_pslldq_psrldq_por (d
, false))
22604 if (expand_vec_perm_interleave3 (d
))
22607 if (expand_vec_perm_vperm2f128_vblend (d
))
22610 if (expand_vec_perm_2perm_interleave (d
, false))
22613 if (expand_vec_perm_2perm_pblendv (d
, false))
22616 /* Try sequences of four instructions. */
22618 if (expand_vec_perm_even_odd_trunc (d
))
22620 if (expand_vec_perm_vpshufb2_vpermq (d
))
22623 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d
))
22626 if (expand_vec_perm_vpermt2_vpshub2 (d
))
  /* ??? Look for narrow permutations whose element orderings would
     allow the promotion to a wider mode.  */

  /* ??? Look for sequences of interleave or a wider permute that place
     the data into the correct lanes for a half-vector shuffle like
     pshuf[lh]w or vpermilps.  */

  /* ??? Look for sequences of interleave that produce the desired results.
     The combinatorics of punpck[lh] get pretty ugly... */
22639 if (expand_vec_perm_even_odd (d
))
22642 /* Generate four or five instructions. */
22643 if (expand_vec_perm_pslldq_psrldq_por (d
, true))
22646 /* Even longer sequences. */
22647 if (expand_vec_perm_vpshufb4_vpermq2 (d
))
22650 /* See if we can get the same permutation in different vector integer
22652 struct expand_vec_perm_d nd
;
22653 if (canonicalize_vector_int_perm (d
, &nd
) && expand_vec_perm_1 (&nd
))
22656 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, nd
.target
));
22660 /* Even longer, including recursion to ix86_expand_vec_perm_const_1. */
22661 if (expand_vec_perm2_vperm2f128_vblend (d
))
/* If a permutation only uses one operand, make it clear.  Returns true
   if the permutation references both operands.  */

static bool
canonicalize_perm (struct expand_vec_perm_d *d)
{
  int i, which, nelt = d->nelt;

  for (i = which = 0; i < nelt; ++i)
    which |= (d->perm[i] < nelt ? 1 : 2);

  d->one_operand_p = true;
  switch (which)
    {
    default:
      gcc_unreachable ();

    case 3:
      if (!rtx_equal_p (d->op0, d->op1))
        {
          d->one_operand_p = false;
          break;
        }
      /* The elements of PERM do not suggest that only the first operand
         is used, but both operands are identical.  Allow easier matching
         of the permutation by folding the permutation into the single
         operand.  */
      /* FALLTHRU */

    case 2:
      for (i = 0; i < nelt; ++i)
        d->perm[i] &= nelt - 1;
      d->op0 = d->op1;
      break;

    case 1:
      d->op1 = d->op0;
      break;
    }

  return (which == 3);
}

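/* For example, with nelt == 4 and identical operands, a selector such as
   { 0, 5, 2, 7 } references both inputs (which == 3), but because op0 and
   op1 are equal it is folded to { 0, 1, 2, 3 } by masking each index with
   nelt - 1; canonicalize_perm still returns true to report that the
   original selector named both operands.  */
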
/* Implement TARGET_VECTORIZE_VEC_PERM_CONST.  */

static bool
ix86_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
                               rtx target, rtx op0, rtx op1,
                               const vec_perm_indices &sel)
{
  if (vmode != op_mode)
    return false;

  struct expand_vec_perm_d d;
  unsigned char perm[MAX_VECT_LEN];
  unsigned int i, nelt, which;
  bool two_args;

  /* For HF mode vector, convert it to HI using subreg.  */
  if (GET_MODE_INNER (vmode) == HFmode)
    {
      machine_mode orig_mode = vmode;
      vmode = mode_for_vector (HImode,
                               GET_MODE_NUNITS (vmode)).require ();
      if (target)
        target = lowpart_subreg (vmode, target, orig_mode);
      if (op0)
        op0 = lowpart_subreg (vmode, op0, orig_mode);
      if (op1)
        op1 = lowpart_subreg (vmode, op1, orig_mode);
    }

  d.target = target;
  d.op0 = op0;
  d.op1 = op1;

  d.vmode = vmode;
  gcc_assert (VECTOR_MODE_P (d.vmode));
  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
  d.testing_p = !target;

  gcc_assert (sel.length () == nelt);
  gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
  /* Given sufficient ISA support we can just return true here
     for selected vector modes.  */
  switch (d.vmode)
    {
    case E_V16SFmode:
    case E_V16SImode:
    case E_V8DImode:
    case E_V8DFmode:
      if (!TARGET_AVX512F)
        return false;
      /* All implementable with a single vperm[it]2 insn.  */
      if (d.testing_p)
        return true;
      break;
    case E_V32HImode:
      if (!TARGET_AVX512F)
        return false;
      if (d.testing_p && TARGET_AVX512BW)
        /* All implementable with a single vperm[it]2 insn.  */
        return true;
      break;
    case E_V64QImode:
      if (!TARGET_AVX512F)
        return false;
      if (d.testing_p && TARGET_AVX512BW)
        /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn.  */
        return true;
      break;
    case E_V8SImode:
    case E_V8SFmode:
    case E_V4DFmode:
    case E_V4DImode:
      if (!TARGET_AVX)
        return false;
      if (d.testing_p && TARGET_AVX512VL)
        /* All implementable with a single vperm[it]2 insn.  */
        return true;
      break;
    case E_V16HImode:
      if (!TARGET_SSE2)
        return false;
      if (d.testing_p && TARGET_AVX2)
        /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns.  */
        return true;
      break;
    case E_V32QImode:
      if (!TARGET_SSE2)
        return false;
      if (d.testing_p && TARGET_AVX2)
        /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns.  */
        return true;
      break;
    case E_V8HImode:
    case E_V16QImode:
      if (!TARGET_SSE2)
        return false;
      /* Fall through.  */
    case E_V4SImode:
    case E_V4SFmode:
      if (!TARGET_SSE)
        return false;
      /* All implementable with a single vpperm insn.  */
      if (d.testing_p && TARGET_XOP)
        return true;
      /* All implementable with 2 pshufb + 1 ior.  */
      if (d.testing_p && TARGET_SSSE3)
        return true;
      break;
    case E_V2SFmode:
    case E_V2SImode:
    case E_V4HImode:
    case E_V8QImode:
      if (!TARGET_MMX_WITH_SSE)
        return false;
      break;
    case E_V2HImode:
      if (!TARGET_SSE2)
        return false;
      /* All implementable with *punpckwd.  */
      if (d.testing_p)
        return true;
      break;
    case E_V2DImode:
    case E_V2DFmode:
      if (!TARGET_SSE)
        return false;
      /* All implementable with shufpd or unpck[lh]pd.  */
      if (d.testing_p)
        return true;
      break;
    default:
      return false;
    }
  for (i = which = 0; i < nelt; ++i)
    {
      unsigned char e = sel[i];
      gcc_assert (e < 2 * nelt);
      d.perm[i] = e;
      perm[i] = e;
      which |= (e < nelt ? 1 : 2);
    }

  if (d.testing_p)
    {
      /* For all elements from second vector, fold the elements to first.  */
      if (which == 2)
        for (i = 0; i < nelt; ++i)
          d.perm[i] -= nelt;

      /* Check whether the mask can be applied to the vector type.  */
      d.one_operand_p = (which != 3);

      /* Implementable with shufps, pshufd or pshuflw.  */
      if (d.one_operand_p
          && (d.vmode == V4SFmode || d.vmode == V2SFmode
              || d.vmode == V4SImode || d.vmode == V2SImode
              || d.vmode == V4HImode || d.vmode == V2HImode))
        return true;

      /* Otherwise we have to go through the motions and see if we can
         figure out how to generate the requested permutation.  */
      d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
      d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
      if (!d.one_operand_p)
        d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);

      start_sequence ();
      bool ret = ix86_expand_vec_perm_const_1 (&d);
      end_sequence ();

      return ret;
    }
  two_args = canonicalize_perm (&d);

  /* If one of the operands is a zero vector, try to match pmovzx.  */
  if (two_args && (d.op0 == CONST0_RTX (vmode) || d.op1 == CONST0_RTX (vmode)))
    {
      struct expand_vec_perm_d dzero = d;
      if (d.op0 == CONST0_RTX (vmode))
        {
          d.op1 = dzero.op1 = force_reg (vmode, d.op1);
          std::swap (dzero.op0, dzero.op1);
          for (i = 0; i < nelt; ++i)
            dzero.perm[i] ^= nelt;
        }
      else
        d.op0 = dzero.op0 = force_reg (vmode, d.op0);

      if (expand_vselect_vconcat (dzero.target, dzero.op0, dzero.op1,
                                  dzero.perm, nelt, dzero.testing_p))
        return true;
    }
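
  /* As an example of the zero-vector case above: for V4SImode with op1 equal
     to the zero vector, a selector of { 0, 4, 1, 5 } picks
     { op0[0], 0, op0[1], 0 } out of the concatenation, i.e. exactly the
     zero extension performed by pmovzxdq.  */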
  /* Force operands into registers.  */
  rtx nop0 = force_reg (vmode, d.op0);
  if (d.op0 == d.op1)
    d.op1 = nop0;
  d.op0 = nop0;
  d.op1 = force_reg (vmode, d.op1);

  if (ix86_expand_vec_perm_const_1 (&d))
    return true;

  /* If the selector says both arguments are needed, but the operands are the
     same, the above tried to expand with one_operand_p and flattened selector.
     If that didn't work, retry without one_operand_p; we succeeded with that
     during testing.  */
  if (two_args && d.one_operand_p)
    {
      d.one_operand_p = false;
      memcpy (d.perm, perm, sizeof (perm));
      return ix86_expand_vec_perm_const_1 (&d);
    }

  return false;
}

void
ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
{
  struct expand_vec_perm_d d;
  unsigned i, nelt;

  d.target = targ;
  d.op0 = op0;
  d.op1 = op1;
  d.vmode = GET_MODE (targ);
  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
  d.one_operand_p = false;
  d.testing_p = false;

  for (i = 0; i < nelt; ++i)
    d.perm[i] = i * 2 + odd;

  /* We'll either be able to implement the permutation directly...  */
  if (expand_vec_perm_1 (&d))
    return;

  /* ... or we use the special-case patterns.  */
  expand_vec_perm_even_odd_1 (&d, odd);
}

void
ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
{
  struct expand_vec_perm_d d;
  unsigned i, nelt, base;
  bool ok;

  d.target = targ;
  d.op0 = op0;
  d.op1 = op1;
  d.vmode = GET_MODE (targ);
  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
  d.one_operand_p = false;
  d.testing_p = false;

  base = high_p ? nelt / 2 : 0;
  for (i = 0; i < nelt / 2; ++i)
    {
      d.perm[i * 2] = i + base;
      d.perm[i * 2 + 1] = i + base + nelt;
    }

  /* Note that for AVX this isn't one instruction.  */
  ok = ix86_expand_vec_perm_const_1 (&d);
  gcc_assert (ok);
}

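/* For reference: with nelt == 4 the loop in ix86_expand_vec_interleave
   builds the selector { 0, 4, 1, 5 } when HIGH_P is false and
   { 2, 6, 3, 7 } when HIGH_P is true, i.e. the low or high interleave of
   the two operands.  */
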
/* This function is similar to ix86_expand_vecop_qihi,
   but optimized under AVX512BW by using vpmovwb.
   For example, optimize vector MUL generation like

                vpmovzxbw ymm2, xmm0
                vpmovzxbw ymm3, xmm1
                vpmullw   ymm4, ymm2, ymm3
                vpmovwb   xmm0, ymm4

   which takes fewer instructions than ix86_expand_vecop_qihi.
   Return true if success.  */

static bool
ix86_expand_vecop_qihi2 (enum rtx_code code, rtx dest, rtx op1, rtx op2)
{
  machine_mode himode, qimode = GET_MODE (dest);
  rtx hop1, hop2, hdest;
  rtx (*gen_extend) (rtx, rtx);
  rtx (*gen_truncate) (rtx, rtx);
  bool uns_p = (code == ASHIFTRT) ? false : true;

  /* There's no V64HImode multiplication instruction.  */
  if (qimode == E_V64QImode)
    return false;

  /* vpmovwb only available under AVX512BW.  */
  if (!TARGET_AVX512BW)
    return false;
  if ((qimode == V8QImode || qimode == V16QImode)
      && !TARGET_AVX512VL)
    return false;
  /* Do not generate zmm instructions when 128/256 bit vector width
     is preferred.  */
  if (qimode == V32QImode
      && (TARGET_PREFER_AVX128 || TARGET_PREFER_AVX256))
    return false;

  switch (qimode)
    {
    case E_V8QImode:
      himode = V8HImode;
      gen_extend = uns_p ? gen_zero_extendv8qiv8hi2 : gen_extendv8qiv8hi2;
      gen_truncate = gen_truncv8hiv8qi2;
      break;
    case E_V16QImode:
      himode = V16HImode;
      gen_extend = uns_p ? gen_zero_extendv16qiv16hi2 : gen_extendv16qiv16hi2;
      gen_truncate = gen_truncv16hiv16qi2;
      break;
    case E_V32QImode:
      himode = V32HImode;
      gen_extend = uns_p ? gen_zero_extendv32qiv32hi2 : gen_extendv32qiv32hi2;
      gen_truncate = gen_truncv32hiv32qi2;
      break;
    default:
      gcc_unreachable ();
    }

  hop1 = gen_reg_rtx (himode);
  hop2 = gen_reg_rtx (himode);
  hdest = gen_reg_rtx (himode);
  emit_insn (gen_extend (hop1, op1));
  emit_insn (gen_extend (hop2, op2));
  emit_insn (gen_rtx_SET (hdest, simplify_gen_binary (code, himode,
                                                      hop1, hop2)));
  emit_insn (gen_truncate (dest, hdest));
  return true;
}

/* Expand a vector operation shift by constant for a V*QImode in terms of the
   same operation on V*HImode.  Return true if success.  */
static bool
ix86_expand_vec_shift_qihi_constant (enum rtx_code code,
                                     rtx dest, rtx op1, rtx op2)
{
  machine_mode qimode, himode;
  HOST_WIDE_INT and_constant, xor_constant;
  HOST_WIDE_INT shift_amount;
  rtx vec_const_and, vec_const_xor;
  rtx tmp, op1_subreg;
  rtx (*gen_shift) (rtx, rtx, rtx);
  rtx (*gen_and) (rtx, rtx, rtx);
  rtx (*gen_xor) (rtx, rtx, rtx);
  rtx (*gen_sub) (rtx, rtx, rtx);

  /* Only optimize shift by constant.  */
  if (!CONST_INT_P (op2))
    return false;

  qimode = GET_MODE (dest);
  shift_amount = INTVAL (op2);
  /* Do nothing when the shift amount is 8 or more.  */
  if (shift_amount > 7)
    return false;

  gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
  /* Record sign bit.  */
  xor_constant = 1 << (8 - shift_amount - 1);

  /* Zero upper/lower bits shift from left/right element.  */
  and_constant
    = (code == ASHIFT ? 256 - (1 << shift_amount)
       : (1 << (8 - shift_amount)) - 1);

  switch (qimode)
    {
    case E_V16QImode:
      himode = V8HImode;
      gen_shift =
        ((code == ASHIFT)
         ? gen_ashlv8hi3
         : (code == ASHIFTRT) ? gen_ashrv8hi3 : gen_lshrv8hi3);
      gen_and = gen_andv16qi3;
      gen_xor = gen_xorv16qi3;
      gen_sub = gen_subv16qi3;
      break;
    case E_V32QImode:
      himode = V16HImode;
      gen_shift =
        ((code == ASHIFT)
         ? gen_ashlv16hi3
         : (code == ASHIFTRT) ? gen_ashrv16hi3 : gen_lshrv16hi3);
      gen_and = gen_andv32qi3;
      gen_xor = gen_xorv32qi3;
      gen_sub = gen_subv32qi3;
      break;
    case E_V64QImode:
      himode = V32HImode;
      gen_shift =
        ((code == ASHIFT)
         ? gen_ashlv32hi3
         : (code == ASHIFTRT) ? gen_ashrv32hi3 : gen_lshrv32hi3);
      gen_and = gen_andv64qi3;
      gen_xor = gen_xorv64qi3;
      gen_sub = gen_subv64qi3;
      break;
    default:
      gcc_unreachable ();
    }

  tmp = gen_reg_rtx (himode);
  vec_const_and = gen_reg_rtx (qimode);
  op1_subreg = lowpart_subreg (himode, op1, qimode);

  /* For ASHIFT and LSHIFTRT, perform operation like
     vpsllw/vpsrlw $shift_amount, %op1, %dest.
     vpand %vec_const_and, %dest.  */
  emit_insn (gen_shift (tmp, op1_subreg, op2));
  emit_move_insn (dest, simplify_gen_subreg (qimode, tmp, himode, 0));
  emit_move_insn (vec_const_and,
                  ix86_build_const_vector (qimode, true,
                                           gen_int_mode (and_constant,
                                                         QImode)));
  emit_insn (gen_and (dest, dest, vec_const_and));

  /* For ASHIFTRT, perform extra operation like
     vpxor %vec_const_xor, %dest, %dest
     vpsubb %vec_const_xor, %dest, %dest  */
  if (code == ASHIFTRT)
    {
      vec_const_xor = gen_reg_rtx (qimode);
      emit_move_insn (vec_const_xor,
                      ix86_build_const_vector (qimode, true,
                                               gen_int_mode (xor_constant,
                                                             QImode)));
      emit_insn (gen_xor (dest, dest, vec_const_xor));
      emit_insn (gen_sub (dest, dest, vec_const_xor));
    }

  return true;
}

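/* Worked example for ix86_expand_vec_shift_qihi_constant: a logical right
   shift of a V16QImode vector by 3 is emitted as a V8HImode vpsrlw by 3
   followed by vpand with and_constant == (1 << (8 - 3)) - 1 == 0x1f in every
   byte, which clears the bits that leaked in from the neighbouring byte.
   For an arithmetic right shift the masked result is then sign-extended per
   byte using xor_constant == 1 << (8 - 3 - 1) == 0x10:
   (x ^ 0x10) - 0x10 copies bit 4 into bits 5..7.  */
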
/* Expand a vector operation CODE for a V*QImode in terms of the
   same operation on V*HImode.  */

void
ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
{
  machine_mode qimode = GET_MODE (dest);
  machine_mode himode;
  rtx (*gen_il) (rtx, rtx, rtx);
  rtx (*gen_ih) (rtx, rtx, rtx);
  rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
  struct expand_vec_perm_d d;
  bool ok, full_interleave;
  bool uns_p = false;
  int i;

  if (CONST_INT_P (op2)
      && (code == ASHIFT || code == LSHIFTRT || code == ASHIFTRT)
      && ix86_expand_vec_shift_qihi_constant (code, dest, op1, op2))
    return;

  if (TARGET_AVX512BW
      && VECTOR_MODE_P (GET_MODE (op2))
      && ix86_expand_vecop_qihi2 (code, dest, op1, op2))
    return;

  switch (qimode)
    {
    case E_V16QImode:
      himode = V8HImode;
      gen_il = gen_vec_interleave_lowv16qi;
      gen_ih = gen_vec_interleave_highv16qi;
      break;
    case E_V32QImode:
      himode = V16HImode;
      gen_il = gen_avx2_interleave_lowv32qi;
      gen_ih = gen_avx2_interleave_highv32qi;
      break;
    case E_V64QImode:
      himode = V32HImode;
      gen_il = gen_avx512bw_interleave_lowv64qi;
      gen_ih = gen_avx512bw_interleave_highv64qi;
      break;
    default:
      gcc_unreachable ();
    }

  switch (code)
    {
    case MULT:
      /* Unpack data such that we've got a source byte in each low byte of
         each word.  We don't care what goes into the high byte of each word.
         Rather than trying to get zero in there, most convenient is to let
         it be a copy of the low byte.  */
      op2_l = gen_reg_rtx (qimode);
      op2_h = gen_reg_rtx (qimode);
      emit_insn (gen_il (op2_l, op2, op2));
      emit_insn (gen_ih (op2_h, op2, op2));

      op1_l = gen_reg_rtx (qimode);
      op1_h = gen_reg_rtx (qimode);
      emit_insn (gen_il (op1_l, op1, op1));
      emit_insn (gen_ih (op1_h, op1, op1));
      full_interleave = qimode == V16QImode;
      break;

    case ASHIFT:
    case LSHIFTRT:
      uns_p = true;
      /* FALLTHRU */
    case ASHIFTRT:
      op1_l = gen_reg_rtx (himode);
      op1_h = gen_reg_rtx (himode);
      ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
      ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
      /* vashr/vlshr/vashl  */
      if (GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT)
        {
          rtx tmp = force_reg (qimode, op2);
          op2_l = gen_reg_rtx (himode);
          op2_h = gen_reg_rtx (himode);
          ix86_expand_sse_unpack (op2_l, tmp, uns_p, false);
          ix86_expand_sse_unpack (op2_h, tmp, uns_p, true);
        }
      else
        op2_l = op2_h = op2;

      full_interleave = true;
      break;
    default:
      gcc_unreachable ();
    }

  /* Perform vashr/vlshr/vashl.  */
  if (code != MULT
      && GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT)
    {
      res_l = gen_reg_rtx (himode);
      res_h = gen_reg_rtx (himode);
      emit_insn (gen_rtx_SET (res_l,
                              simplify_gen_binary (code, himode,
                                                   op1_l, op2_l)));
      emit_insn (gen_rtx_SET (res_h,
                              simplify_gen_binary (code, himode,
                                                   op1_h, op2_h)));
    }
  /* Perform mult/ashr/lshr/ashl.  */
  else
    {
      res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
                                   1, OPTAB_DIRECT);
      res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
                                   1, OPTAB_DIRECT);
    }

  gcc_assert (res_l && res_h);

  /* Merge the data back into the right place.  */
  d.target = dest;
  d.op0 = gen_lowpart (qimode, res_l);
  d.op1 = gen_lowpart (qimode, res_h);
  d.vmode = qimode;
  d.nelt = GET_MODE_NUNITS (qimode);
  d.one_operand_p = false;
  d.testing_p = false;

  if (full_interleave)
    {
      /* For SSE2, we used a full interleave, so the desired
         results are in the even elements.  */
      for (i = 0; i < d.nelt; ++i)
        d.perm[i] = i * 2;
    }
  else
    {
      /* For AVX, the interleave used above was not cross-lane.  So the
         extraction is evens but with the second and third quarter swapped.
         Happily, that is even one insn shorter than even extraction.
         For AVX512BW we have 4 lanes.  We extract evens from within a lane,
         always first from the first and then from the second source operand,
         the index bits above the low 4 bits remains the same.
         Thus, for d.nelt == 32 we want permutation
         0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
         and for d.nelt == 64 we want permutation
         0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
         32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126.  */
      for (i = 0; i < d.nelt; ++i)
        d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
    }

  ok = ix86_expand_vec_perm_const_1 (&d);
  gcc_assert (ok);

  set_unique_reg_note (get_last_insn (), REG_EQUAL,
                       gen_rtx_fmt_ee (code, qimode, op1, op2));
}

/* Helper function of ix86_expand_mul_widen_evenodd.  Return true
   if op is CONST_VECTOR with all odd elements equal to their
   preceding element.  */

static bool
const_vector_equal_evenodd_p (rtx op)
{
  machine_mode mode = GET_MODE (op);
  int i, nunits = GET_MODE_NUNITS (mode);
  if (GET_CODE (op) != CONST_VECTOR
      || nunits != CONST_VECTOR_NUNITS (op))
    return false;
  for (i = 0; i < nunits; i += 2)
    if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
      return false;
  return true;
}

void
ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
                               bool uns_p, bool odd_p)
{
  machine_mode mode = GET_MODE (op1);
  machine_mode wmode = GET_MODE (dest);
  rtx x;
  rtx orig_op1 = op1, orig_op2 = op2;

  if (!nonimmediate_operand (op1, mode))
    op1 = force_reg (mode, op1);
  if (!nonimmediate_operand (op2, mode))
    op2 = force_reg (mode, op2);

  /* We only play even/odd games with vectors of SImode.  */
  gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);

  /* If we're looking for the odd results, shift those members down to
     the even slots.  For some cpus this is faster than a PSHUFD.  */
  if (odd_p)
    {
      /* For XOP use vpmacsdqh, but only for smult, as it is only
         signed.  */
      if (TARGET_XOP && mode == V4SImode && !uns_p)
        {
          x = force_reg (wmode, CONST0_RTX (wmode));
          emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
          return;
        }

      x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
      if (!const_vector_equal_evenodd_p (orig_op1))
        op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
                            x, NULL, 1, OPTAB_DIRECT);
      if (!const_vector_equal_evenodd_p (orig_op2))
        op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
                            x, NULL, 1, OPTAB_DIRECT);
      op1 = gen_lowpart (mode, op1);
      op2 = gen_lowpart (mode, op2);
    }

  if (mode == V16SImode)
    {
      if (uns_p)
        x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
      else
        x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
    }
  else if (mode == V8SImode)
    {
      if (uns_p)
        x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
      else
        x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
    }
  else if (uns_p)
    x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
  else if (TARGET_SSE4_1)
    x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
  else
    {
      rtx s1, s2, t0, t1, t2;

      /* The easiest way to implement this without PMULDQ is to go through
         the motions as if we are performing a full 64-bit multiply.  With
         the exception that we need to do less shuffling of the elements.  */

      /* Compute the sign-extension, aka highparts, of the two operands.  */
      s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
                                op1, pc_rtx, pc_rtx);
      s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
                                op2, pc_rtx, pc_rtx);

      /* Multiply LO(A) * HI(B), and vice-versa.  */
      t1 = gen_reg_rtx (wmode);
      t2 = gen_reg_rtx (wmode);
      emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
      emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));

      /* Multiply LO(A) * LO(B).  */
      t0 = gen_reg_rtx (wmode);
      emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));

      /* Combine and shift the highparts into place.  */
      t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
      t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
                         1, OPTAB_DIRECT);

      /* Combine high and low parts.  */
      force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
      return;
    }

  emit_insn (x);
}

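/* The PMULDQ-less fallback in ix86_expand_mul_widen_evenodd relies on the
   identity (all arithmetic modulo 2^64): for 32-bit signed A and B with
   unsigned bit patterns AU and BU,
     A * B = AU * BU - 2^32 * (BU * [A < 0] + AU * [B < 0]).
   The comparison results s1/s2 are all-ones (value 2^32 - 1) exactly when
   the corresponding operand is negative, so (s1 * BU + s2 * AU) << 32
   contributes the two correction terms with the correct (negative) sign.  */
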
void
ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
                            bool uns_p, bool high_p)
{
  machine_mode wmode = GET_MODE (dest);
  machine_mode mode = GET_MODE (op1);
  rtx t1, t2, t3, t4, mask;

  switch (mode)
    {
    case E_V4SImode:
      t1 = gen_reg_rtx (mode);
      t2 = gen_reg_rtx (mode);
      if (TARGET_XOP && !uns_p)
        {
          /* With XOP, we have pmacsdqh, aka mul_widen_odd.  In this case,
             shuffle the elements once so that all elements are in the right
             place for immediate use: { A C B D }.  */
          emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
                                        const1_rtx, GEN_INT (3)));
          emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
                                        const1_rtx, GEN_INT (3)));
        }
      else
        {
          /* Put the elements into place for the multiply.  */
          ix86_expand_vec_interleave (t1, op1, op1, high_p);
          ix86_expand_vec_interleave (t2, op2, op2, high_p);
          high_p = false;
        }

      ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
      break;

    case E_V8SImode:
      /* Shuffle the elements between the lanes.  After this we
         have { A B E F | C D G H } for each operand.  */
      t1 = gen_reg_rtx (V4DImode);
      t2 = gen_reg_rtx (V4DImode);
      emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
                                      const0_rtx, const2_rtx,
                                      const1_rtx, GEN_INT (3)));
      emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
                                      const0_rtx, const2_rtx,
                                      const1_rtx, GEN_INT (3)));

      /* Shuffle the elements within the lanes.  After this we
         have { A A B B | C C D D } or { E E F F | G G H H }.  */
      t3 = gen_reg_rtx (V8SImode);
      t4 = gen_reg_rtx (V8SImode);
      mask = GEN_INT (high_p
                      ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
                      : 0 + (0 << 2) + (1 << 4) + (1 << 6));
      emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
      emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));

      ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
      break;

    case E_V8HImode:
    case E_V16HImode:
      t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
                         uns_p, OPTAB_DIRECT);
      t2 = expand_binop (mode,
                         uns_p ? umul_highpart_optab : smul_highpart_optab,
                         op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
      gcc_assert (t1 && t2);

      t3 = gen_reg_rtx (mode);
      ix86_expand_vec_interleave (t3, t1, t2, high_p);
      emit_move_insn (dest, gen_lowpart (wmode, t3));
      break;

    case E_V16QImode:
    case E_V32QImode:
    case E_V32HImode:
    case E_V64QImode:
      t1 = gen_reg_rtx (wmode);
      t2 = gen_reg_rtx (wmode);
      ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
      ix86_expand_sse_unpack (t2, op2, uns_p, high_p);

      emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
      break;

    default:
      gcc_unreachable ();
    }
}

void
ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
{
  rtx res_1, res_2, res_3, res_4;

  res_1 = gen_reg_rtx (V4SImode);
  res_2 = gen_reg_rtx (V4SImode);
  res_3 = gen_reg_rtx (V2DImode);
  res_4 = gen_reg_rtx (V2DImode);
  ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
  ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);

  /* Move the results in element 2 down to element 1; we don't care
     what goes in elements 2 and 3.  Then we can merge the parts
     back together with an interleave.

     Note that two other sequences were tried:
     (1) Use interleaves at the start instead of psrldq, which allows
     us to use a single shufps to merge things back at the end.
     (2) Use shufps here to combine the two vectors, then pshufd to
     put the elements in the correct order.
     In both cases the cost of the reformatting stall was too high
     and the overall sequence slower.  */

  emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
                                const0_rtx, const2_rtx,
                                const0_rtx, const0_rtx));
  emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
                                const0_rtx, const2_rtx,
                                const0_rtx, const0_rtx));
  res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));

  set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
}

void
ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
{
  machine_mode mode = GET_MODE (op0);
  rtx t1, t2, t3, t4, t5, t6;

  if (TARGET_AVX512DQ && mode == V8DImode)
    emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
  else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
    emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
  else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
    emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
  else if (TARGET_XOP && mode == V2DImode)
    {
      /* op1: A,B,C,D, op2: E,F,G,H */
      op1 = gen_lowpart (V4SImode, op1);
      op2 = gen_lowpart (V4SImode, op2);

      t1 = gen_reg_rtx (V4SImode);
      t2 = gen_reg_rtx (V4SImode);
      t3 = gen_reg_rtx (V2DImode);
      t4 = gen_reg_rtx (V2DImode);

      /* t1: B,A,D,C */
      emit_insn (gen_sse2_pshufd_1 (t1, op1,
                                    GEN_INT (1),
                                    GEN_INT (0),
                                    GEN_INT (3),
                                    GEN_INT (2)));

      /* t2: (B*E),(A*F),(D*G),(C*H) */
      emit_insn (gen_mulv4si3 (t2, t1, op2));

      /* t3: (B*E)+(A*F), (D*G)+(C*H) */
      emit_insn (gen_xop_phadddq (t3, t2));

      /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
      emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));

      /* Multiply lower parts and add all.  */
      t5 = gen_reg_rtx (V2DImode);
      emit_insn (gen_vec_widen_umult_even_v4si (t5,
                                                gen_lowpart (V4SImode, op1),
                                                gen_lowpart (V4SImode, op2)));
      force_expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
    }
  else
    {
      machine_mode nmode;
      rtx (*umul) (rtx, rtx, rtx);

      if (mode == V2DImode)
        {
          umul = gen_vec_widen_umult_even_v4si;
          nmode = V4SImode;
        }
      else if (mode == V4DImode)
        {
          umul = gen_vec_widen_umult_even_v8si;
          nmode = V8SImode;
        }
      else if (mode == V8DImode)
        {
          umul = gen_vec_widen_umult_even_v16si;
          nmode = V16SImode;
        }
      else
        gcc_unreachable ();

      /* Multiply low parts.  */
      t1 = gen_reg_rtx (mode);
      emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));

      /* Shift input vectors right 32 bits so we can multiply high parts.  */
      t6 = GEN_INT (32);
      t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
      t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);

      /* Multiply high parts by low parts.  */
      t4 = gen_reg_rtx (mode);
      t5 = gen_reg_rtx (mode);
      emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
      emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));

      /* Combine and shift the highparts back.  */
      t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
      t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);

      /* Combine high and low parts.  */
      force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
    }

  set_unique_reg_note (get_last_insn (), REG_EQUAL,
                       gen_rtx_MULT (mode, op1, op2));
}

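/* The generic (non-XOP, non-AVX512DQ) path in ix86_expand_sse2_mulvxdi3 uses
   the usual double-word decomposition: with lo/hi denoting the 32-bit halves
   and all arithmetic modulo 2^64,
     A * B = lo(A) * lo(B) + ((hi(A) * lo(B) + lo(A) * hi(B)) << 32),
   where each partial product is an unsigned 32x32->64 widening multiply; the
   hi(A) * hi(B) term only affects bits above bit 63 and is dropped.  */
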
/* Return 1 if control transfer instruction INSN
   should be encoded with notrack prefix.  */

bool
ix86_notrack_prefixed_insn_p (rtx_insn *insn)
{
  if (!insn || !((flag_cf_protection & CF_BRANCH)))
    return false;

  if (CALL_P (insn))
    {
      rtx call = get_call_rtx_from (insn);
      gcc_assert (call != NULL_RTX);
      rtx addr = XEXP (call, 0);

      /* Do not emit 'notrack' if it's not an indirect call.  */
      if (MEM_P (addr)
          && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
        return false;
      else
        return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
    }

  if (JUMP_P (insn) && !flag_cet_switch)
    {
      rtx target = JUMP_LABEL (insn);
      if (target == NULL_RTX || ANY_RETURN_P (target))
        return false;

      /* Check the jump is a switch table.  */
      rtx_insn *label = as_a<rtx_insn *> (target);
      rtx_insn *table = next_insn (label);
      if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
        return false;
      else
        return true;
    }
  return false;
}

/* Calculate integer abs() using only SSE2 instructions.  */

void
ix86_expand_sse2_abs (rtx target, rtx input)
{
  machine_mode mode = GET_MODE (target);
  rtx tmp0, tmp1, x;

  switch (mode)
    {
    case E_V2DImode:
    case E_V4DImode:
      /* For 64-bit signed integer X, with SSE4.2 use
         pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
         Otherwise handle it similarly to V4SImode, except use 64 as W instead
         of 32 and use logical instead of arithmetic right shift (which is
         unimplemented) and subtract.  */
      if (TARGET_SSE4_2)
        {
          tmp0 = gen_reg_rtx (mode);
          tmp1 = gen_reg_rtx (mode);
          emit_move_insn (tmp1, CONST0_RTX (mode));
          if (mode == E_V2DImode)
            emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input));
          else
            emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input));
        }
      else
        {
          tmp0 = expand_simple_binop (mode, LSHIFTRT, input,
                                      GEN_INT (GET_MODE_UNIT_BITSIZE (mode)
                                               - 1), NULL, 0, OPTAB_DIRECT);
          tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false);
        }

      tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
                                  NULL, 0, OPTAB_DIRECT);
      x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
                               target, 0, OPTAB_DIRECT);
      break;

    case E_V4SImode:
      /* For 32-bit signed integer X, the best way to calculate the absolute
         value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)).  */
      tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
                                  GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
                                  NULL, 0, OPTAB_DIRECT);
      tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
                                  NULL, 0, OPTAB_DIRECT);
      x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
                               target, 0, OPTAB_DIRECT);
      break;

    case E_V8HImode:
      /* For 16-bit signed integer X, the best way to calculate the absolute
         value of X is max (X, -X), as SSE2 provides the PMAXSW insn.  */
      tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);

      x = expand_simple_binop (mode, SMAX, tmp0, input,
                               target, 0, OPTAB_DIRECT);
      break;

    case E_V16QImode:
      /* For 8-bit signed integer X, the best way to calculate the absolute
         value of X is min ((unsigned char) X, (unsigned char) (-X)),
         as SSE2 provides the PMINUB insn.  */
      tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);

      x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
                               target, 0, OPTAB_DIRECT);
      break;

    default:
      gcc_unreachable ();
    }

  if (x != target)
    emit_move_insn (target, x);
}

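/* Worked V4SImode example for ix86_expand_sse2_abs: for X == -5,
   tmp0 = X >> 31 is all-ones (-1), tmp1 = tmp0 ^ X == 4, and
   tmp1 - tmp0 == 4 - (-1) == 5, the absolute value.  For non-negative X,
   tmp0 is zero and X is returned unchanged.  */
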
/* Expand an extract from a vector register through pextr insn.
   Return true if successful.  */

bool
ix86_expand_pextr (rtx *operands)
{
  rtx dst = operands[0];
  rtx src = operands[1];

  unsigned int size = INTVAL (operands[2]);
  unsigned int pos = INTVAL (operands[3]);

  if (SUBREG_P (dst))
    {
      /* Reject non-lowpart subregs.  */
      if (SUBREG_BYTE (dst) > 0)
        return false;
      dst = SUBREG_REG (dst);
    }

  if (SUBREG_P (src))
    {
      pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
      src = SUBREG_REG (src);
    }

  switch (GET_MODE (src))
    {
    case E_V16QImode:
    case E_V8HImode:
    case E_V4SImode:
    case E_V2DImode:
    case E_V1TImode:
      {
        machine_mode srcmode, dstmode;
        rtx d, pat;

        if (!int_mode_for_size (size, 0).exists (&dstmode))
          return false;

        switch (dstmode)
          {
          case E_QImode:
            if (!TARGET_SSE4_1)
              return false;
            srcmode = V16QImode;
            break;

          case E_HImode:
            if (!TARGET_SSE2)
              return false;
            srcmode = V8HImode;
            break;

          case E_SImode:
            if (!TARGET_SSE4_1)
              return false;
            srcmode = V4SImode;
            break;

          case E_DImode:
            gcc_assert (TARGET_64BIT);
            if (!TARGET_SSE4_1)
              return false;
            srcmode = V2DImode;
            break;

          default:
            return false;
          }

        /* Reject extractions from misaligned positions.  */
        if (pos & (size - 1))
          return false;

        if (GET_MODE (dst) == dstmode)
          d = dst;
        else
          d = gen_reg_rtx (dstmode);

        /* Construct insn pattern.  */
        pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
        pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);

        /* Let the rtl optimizers know about the zero extension performed.  */
        if (dstmode == QImode || dstmode == HImode)
          {
            pat = gen_rtx_ZERO_EXTEND (SImode, pat);
            d = gen_lowpart (SImode, d);
          }

        emit_insn (gen_rtx_SET (d, pat));

        if (d != dst)
          emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
        return true;
      }

    default:
      return false;
    }
}

/* Expand an insert into a vector register through pinsr insn.
   Return true if successful.  */

bool
ix86_expand_pinsr (rtx *operands)
{
  rtx dst = operands[0];
  rtx src = operands[3];

  unsigned int size = INTVAL (operands[1]);
  unsigned int pos = INTVAL (operands[2]);

  if (SUBREG_P (dst))
    {
      pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
      dst = SUBREG_REG (dst);
    }

  switch (GET_MODE (dst))
    {
    case E_V16QImode:
    case E_V8HImode:
    case E_V4SImode:
    case E_V2DImode:
    case E_V1TImode:
      {
        machine_mode srcmode, dstmode;
        rtx (*pinsr)(rtx, rtx, rtx, rtx);
        rtx d;

        if (!int_mode_for_size (size, 0).exists (&srcmode))
          return false;

        switch (srcmode)
          {
          case E_QImode:
            if (!TARGET_SSE4_1)
              return false;
            dstmode = V16QImode;
            pinsr = gen_sse4_1_pinsrb;
            break;

          case E_HImode:
            if (!TARGET_SSE2)
              return false;
            dstmode = V8HImode;
            pinsr = gen_sse2_pinsrw;
            break;

          case E_SImode:
            if (!TARGET_SSE4_1)
              return false;
            dstmode = V4SImode;
            pinsr = gen_sse4_1_pinsrd;
            break;

          case E_DImode:
            gcc_assert (TARGET_64BIT);
            if (!TARGET_SSE4_1)
              return false;
            dstmode = V2DImode;
            pinsr = gen_sse4_1_pinsrq;
            break;

          default:
            return false;
          }

        /* Reject insertions to misaligned positions.  */
        if (pos & (size - 1))
          return false;

        if (SUBREG_P (src))
          {
            unsigned int srcpos = SUBREG_BYTE (src);

            if (srcpos > 0)
              {
                rtx extr_ops[4];

                extr_ops[0] = gen_reg_rtx (srcmode);
                extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
                extr_ops[2] = GEN_INT (size);
                extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);

                if (!ix86_expand_pextr (extr_ops))
                  return false;

                src = extr_ops[0];
              }
            else
              src = gen_lowpart (srcmode, SUBREG_REG (src));
          }

        if (GET_MODE (dst) == dstmode)
          d = dst;
        else
          d = gen_reg_rtx (dstmode);

        emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
                          gen_lowpart (srcmode, src),
                          GEN_INT (1 << (pos / size))));

        if (d != dst)
          emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
        return true;
      }

    default:
      return false;
    }
}

/* All CPUs prefer to avoid cross-lane operations so perform reductions
   upper against lower halves up to SSE reg size.  */

static machine_mode
ix86_split_reduction (machine_mode mode)
{
  /* Reduce lowpart against highpart until we reach SSE reg width to
     avoid cross-lane operations.  */

/* Generate call to __divmoddi4.  */

void
ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
                            rtx op0, rtx op1,
                            rtx *quot_p, rtx *rem_p)
{
  rtx rem = assign_386_stack_local (mode, SLOT_TEMP);

  rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
                                      mode, op0, mode, op1, mode,
                                      XEXP (rem, 0), Pmode);
  *quot_p = quot;
  *rem_p = rem;
}

void
ix86_expand_atomic_fetch_op_loop (rtx target, rtx mem, rtx val,
                                  enum rtx_code code, bool after,
                                  bool doubleword)
{
  rtx old_reg, new_reg, old_mem, success;
  machine_mode mode = GET_MODE (target);
  rtx_code_label *loop_label = NULL;

  old_reg = gen_reg_rtx (mode);
  new_reg = old_reg;
  old_mem = copy_to_reg (mem);
  loop_label = gen_label_rtx ();
  emit_label (loop_label);
  emit_move_insn (old_reg, old_mem);

  /* return value for atomic_fetch_op.  */
  if (!after)
    emit_move_insn (target, old_reg);

  if (code == NOT)
    {
      new_reg = expand_simple_binop (mode, AND, new_reg, val, NULL_RTX,
                                     true, OPTAB_LIB_WIDEN);
      new_reg = expand_simple_unop (mode, code, new_reg, NULL_RTX, true);
    }
  else
    new_reg = expand_simple_binop (mode, code, new_reg, val, NULL_RTX,
                                   true, OPTAB_LIB_WIDEN);

  /* return value for atomic_op_fetch.  */
  if (after)
    emit_move_insn (target, new_reg);

  success = NULL_RTX;

  ix86_expand_cmpxchg_loop (&success, old_mem, mem, old_reg, new_reg,
                            gen_int_mode (MEMMODEL_SYNC_SEQ_CST,
                                          SImode),
                            doubleword, loop_label);
}

/* Relax cmpxchg instruction, param loop_label indicates whether
   the instruction should be relaxed with a pause loop.  If not,
   it will be relaxed to an atomic load + compare, and skip
   cmpxchg instruction if mem != exp_input.  */

void
ix86_expand_cmpxchg_loop (rtx *ptarget_bool, rtx target_val,
                          rtx mem, rtx exp_input, rtx new_input,
                          rtx mem_model, bool doubleword,
                          rtx_code_label *loop_label)
{
  rtx_code_label *cmp_label = NULL;
  rtx_code_label *done_label = NULL;
  rtx target_bool = NULL_RTX, new_mem = NULL_RTX;
  rtx (*gen) (rtx, rtx, rtx, rtx, rtx) = NULL;
  rtx (*gendw) (rtx, rtx, rtx, rtx, rtx, rtx) = NULL;
  machine_mode mode = GET_MODE (target_val), hmode = mode;

  if (*ptarget_bool == NULL)
    target_bool = gen_reg_rtx (QImode);
  else
    target_bool = *ptarget_bool;

  cmp_label = gen_label_rtx ();
  done_label = gen_label_rtx ();

  new_mem = gen_reg_rtx (mode);
  /* Load memory first.  */
  expand_atomic_load (new_mem, mem, MEMMODEL_SEQ_CST);

  switch (mode)
    {
    case E_TImode:
      gendw = gen_atomic_compare_and_swapti_doubleword;
      hmode = DImode;
      break;
    case E_DImode:
      if (doubleword)
        {
          gendw = gen_atomic_compare_and_swapdi_doubleword;
          hmode = SImode;
        }
      else
        gen = gen_atomic_compare_and_swapdi_1;
      break;
    case E_SImode:
      gen = gen_atomic_compare_and_swapsi_1;
      break;
    case E_HImode:
      gen = gen_atomic_compare_and_swaphi_1;
      break;
    case E_QImode:
      gen = gen_atomic_compare_and_swapqi_1;
      break;
    default:
      gcc_unreachable ();
    }

  /* Compare mem value with expected value.  */
  if (doubleword)
    {
      rtx low_new_mem = gen_lowpart (hmode, new_mem);
      rtx low_exp_input = gen_lowpart (hmode, exp_input);
      rtx high_new_mem = gen_highpart (hmode, new_mem);
      rtx high_exp_input = gen_highpart (hmode, exp_input);
      emit_cmp_and_jump_insns (low_new_mem, low_exp_input, NE, NULL_RTX,
                               hmode, 1, cmp_label,
                               profile_probability::guessed_never ());
      emit_cmp_and_jump_insns (high_new_mem, high_exp_input, NE, NULL_RTX,
                               hmode, 1, cmp_label,
                               profile_probability::guessed_never ());
    }
  else
    emit_cmp_and_jump_insns (new_mem, exp_input, NE, NULL_RTX,
                             GET_MODE (exp_input), 1, cmp_label,
                             profile_probability::guessed_never ());

  /* Directly emits cmpxchg here.  */
  if (doubleword)
    emit_insn (gendw (target_val, mem, exp_input,
                      gen_lowpart (hmode, new_input),
                      gen_highpart (hmode, new_input),
                      mem_model));
  else
    emit_insn (gen (target_val, mem, exp_input, new_input, mem_model));

  if (!loop_label)
    {
      emit_jump_insn (gen_jump (done_label));
      emit_barrier ();
      emit_label (cmp_label);
      emit_move_insn (target_val, new_mem);
      emit_label (done_label);
      ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
                         const0_rtx);
    }
  else
    {
      ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
                         const0_rtx);
      emit_cmp_and_jump_insns (target_bool, const0_rtx, EQ, const0_rtx,
                               GET_MODE (target_bool), 1, loop_label,
                               profile_probability::guessed_never ());
      emit_jump_insn (gen_jump (done_label));
      emit_barrier ();

      /* If mem is not expected, pause and loop back.  */
      emit_label (cmp_label);
      emit_move_insn (target_val, new_mem);
      emit_insn (gen_pause ());
      emit_jump_insn (gen_jump (loop_label));
      emit_barrier ();
      emit_label (done_label);
    }

  *ptarget_bool = target_bool;
}

/* Convert a BFmode VAL to SFmode without signaling sNaNs.
   This is done by returning SF SUBREG of ((HI SUBREG) (VAL)) << 16.  */

rtx
ix86_expand_fast_convert_bf_to_sf (rtx val)
{
  rtx op = gen_lowpart (HImode, val), ret;
  if (CONST_INT_P (op))
    {
      ret = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
                                            val, BFmode);
      if (ret)
        return ret;
      /* FLOAT_EXTEND simplification will fail if VAL is a sNaN.  */
      ret = gen_reg_rtx (SImode);
      emit_move_insn (ret, GEN_INT (INTVAL (op) & 0xffff));
      emit_insn (gen_ashlsi3 (ret, ret, GEN_INT (16)));
      return gen_lowpart (SFmode, ret);
    }

  ret = gen_reg_rtx (SFmode);
  emit_insn (gen_extendbfsf2_1 (ret, force_reg (BFmode, val)));
  return ret;
}

#include "gt-i386-expand.h"