/* Copyright (C) 1988-2023 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#define IN_TARGET_CODE 1

#include "coretypes.h"
#include "stringpool.h"
#include "diagnostic.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "insn-attr.h"
#include "common/common-target.h"
#include "langhooks.h"
#include "tm-constrs.h"
#include "sched-int.h"
#include "tree-pass.h"
#include "pass_manager.h"
#include "target-globals.h"
#include "gimple-iterator.h"
#include "tree-vectorizer.h"
#include "shrink-wrap.h"
#include "tree-iterator.h"
#include "case-cfn-macros.h"
#include "fold-const-call.h"
#include "tree-ssanames.h"
#include "selftest-rtl.h"
#include "print-rtl.h"
#include "symbol-summary.h"
#include "ipa-fnsummary.h"
#include "wide-int-bitmask.h"
#include "tree-vector-builder.h"
#include "dwarf2out.h"
#include "i386-options.h"
#include "i386-builtins.h"
#include "i386-expand.h"
/* Split one or more double-mode RTL references into pairs of half-mode
   references.  The RTL can be REG, offsettable MEM, integer constant, or
   CONST_DOUBLE.  "operands" is a pointer to an array of double-mode RTLs to
   split and "num" is its length.  lo_half and hi_half are output arrays
   that parallel "operands".  */
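/* For example, a DImode pseudo is split into two SImode halves taken at
   byte offsets 0 and GET_MODE_SIZE (SImode), and a TImode reference into
   two DImode halves; offsettable MEMs are split with adjust_address so
   that volatile memory references are handled as well.  */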
split_double_mode (machine_mode mode, rtx operands[],
                   int num, rtx lo_half[], rtx hi_half[])
  machine_mode half_mode;
  rtx mem_op = NULL_RTX;

  byte = GET_MODE_SIZE (half_mode);

      rtx op = operands[num];

      /* simplify_subreg refuses to split volatile memory addresses,
         but we still have to handle it.  */
      if (mem_op && rtx_equal_p (op, mem_op))
          lo_half[num] = lo_half[mem_num];
          hi_half[num] = hi_half[mem_num];
          lo_half[num] = adjust_address (op, half_mode, 0);
          hi_half[num] = adjust_address (op, half_mode, byte);
          lo_half[num] = simplify_gen_subreg (half_mode, op,
                                              GET_MODE (op) == VOIDmode
                                              ? mode : GET_MODE (op), 0);
          rtx tmp = simplify_gen_subreg (half_mode, op,
                                         GET_MODE (op) == VOIDmode
                                         ? mode : GET_MODE (op), byte);
          /* simplify_gen_subreg will return NULL RTX for the
             high half of the paradoxical subreg.  */
          hi_half[num] = tmp ? tmp : gen_reg_rtx (half_mode);
/* Emit the double word assignment DST = { LO, HI }.  */

split_double_concat (machine_mode mode, rtx dst, rtx lo, rtx hi)
  int deleted_move_count = 0;
  split_double_mode (mode, &dst, 1, &dlo, &dhi);
  /* Constraints ensure that if both lo and hi are MEMs, then
     dst has early-clobber and thus addresses of MEMs don't use
     dlo/dhi registers.  Otherwise if at least one of lo and hi are MEMs,
     dlo/dhi are registers.  */
      && rtx_equal_p (dlo, hi)
      && reg_overlap_mentioned_p (dhi, lo))
      /* If dlo is same as hi and lo's address uses dhi register,
         code below would first emit_move_insn (dhi, hi)
         and then emit_move_insn (dlo, lo).  But the former
         would invalidate lo's address.  Load into dhi first.  */
      emit_move_insn (dhi, lo);
           && !rtx_equal_p (dlo, lo)
           && reg_overlap_mentioned_p (dlo, hi))
      /* In this case, code below would first emit_move_insn (dlo, lo)
         and then emit_move_insn (dhi, hi).  But the former would
         invalidate hi's address.  */
      if (rtx_equal_p (dhi, lo))
          /* We can't load into dhi first, so load into dlo
             first and we'll swap.  */
          emit_move_insn (dlo, hi);
          /* Load into dhi first.  */
          emit_move_insn (dhi, hi);
  if (!rtx_equal_p (dlo, hi))
      if (!rtx_equal_p (dlo, lo))
        emit_move_insn (dlo, lo);
        deleted_move_count++;
      if (!rtx_equal_p (dhi, hi))
        emit_move_insn (dhi, hi);
        deleted_move_count++;
  else if (!rtx_equal_p (lo, dhi))
      if (!rtx_equal_p (dhi, hi))
        emit_move_insn (dhi, hi);
        deleted_move_count++;
      if (!rtx_equal_p (dlo, lo))
        emit_move_insn (dlo, lo);
        deleted_move_count++;
  else if (mode == TImode)
    emit_insn (gen_swapdi (dlo, dhi));
    emit_insn (gen_swapsi (dlo, dhi));
  if (deleted_move_count == 2)
    emit_note (NOTE_INSN_DELETED);
/* Generate either "mov $0, reg" or "xor reg, reg", as appropriate.  */

ix86_expand_clear (rtx dest)
  /* We play register width games, which are only valid after reload.  */
  gcc_assert (reload_completed);

  /* Avoid HImode and its attendant prefix byte.  */
  if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
    dest = gen_rtx_REG (SImode, REGNO (dest));
  tmp = gen_rtx_SET (dest, const0_rtx);
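  /* When xor is used instead of the plain move it clobbers the flags,
     hence the PARALLEL with a flags-register clobber below.  xor is
     preferred when optimizing for size: "xor %reg, %reg" encodes in
     2 bytes while "mov $0, %reg" needs 5 bytes for a 32-bit register.  */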
  if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
      rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
/* Return true if V can be broadcasted from an integer of WIDTH bits
   which is returned in VAL_BROADCAST.  Otherwise, return false.  */
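/* For example, v = 0x1212121212121212 can be broadcast from the 8-bit
   value 0x12: every WIDTH-bit chunk of V must compare equal to the
   lowest chunk, which is then sign-extended into VAL_BROADCAST.  */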
ix86_broadcast (HOST_WIDE_INT v, unsigned int width,
                HOST_WIDE_INT &val_broadcast)
  wide_int val = wi::uhwi (v, HOST_BITS_PER_WIDE_INT);
  val_broadcast = wi::extract_uhwi (val, 0, width);
  for (unsigned int i = width; i < HOST_BITS_PER_WIDE_INT; i += width)
      HOST_WIDE_INT each = wi::extract_uhwi (val, i, width);
      if (val_broadcast != each)
  val_broadcast = sext_hwi (val_broadcast, width);
/* Convert the CONST_WIDE_INT operand OP to broadcast in MODE.  */

ix86_convert_const_wide_int_to_broadcast (machine_mode mode, rtx op)
  /* Don't use integer vector broadcast if we can't move from GPR to SSE
     register directly.  */
  if (!TARGET_INTER_UNIT_MOVES_TO_VEC)

  /* Convert CONST_WIDE_INT to a non-standard SSE constant integer
     broadcast only if vector broadcast is available.  */
      || !CONST_WIDE_INT_P (op)
      || standard_sse_constant_p (op, mode)
      || (CONST_WIDE_INT_NUNITS (op) * HOST_BITS_PER_WIDE_INT
          != GET_MODE_BITSIZE (mode)))

  HOST_WIDE_INT val = CONST_WIDE_INT_ELT (op, 0);
  HOST_WIDE_INT val_broadcast;
  scalar_int_mode broadcast_mode;
      && ix86_broadcast (val, GET_MODE_BITSIZE (QImode),
    broadcast_mode = QImode;
           && ix86_broadcast (val, GET_MODE_BITSIZE (HImode),
    broadcast_mode = HImode;
  else if (ix86_broadcast (val, GET_MODE_BITSIZE (SImode),
    broadcast_mode = SImode;
  else if (TARGET_64BIT
           && ix86_broadcast (val, GET_MODE_BITSIZE (DImode),
    broadcast_mode = DImode;

  /* Check if OP can be broadcasted from VAL.  */
  for (int i = 1; i < CONST_WIDE_INT_NUNITS (op); i++)
    if (val != CONST_WIDE_INT_ELT (op, i))

  unsigned int nunits = (GET_MODE_SIZE (mode)
                         / GET_MODE_SIZE (broadcast_mode));
  machine_mode vector_mode;
  if (!mode_for_vector (broadcast_mode, nunits).exists (&vector_mode))
  rtx target = gen_reg_rtx (vector_mode);
  bool ok = ix86_expand_vector_init_duplicate (false, vector_mode,
                                               GEN_INT (val_broadcast));
  target = lowpart_subreg (mode, target, vector_mode);
ix86_expand_move (machine_mode mode, rtx operands[])
  rtx tmp, addend = NULL_RTX;
  enum tls_model model;

  /* Avoid complex sets of likely spilled hard registers before reload.  */
  if (!ix86_hardreg_mov_ok (op0, op1))
      tmp = gen_reg_rtx (mode);
      ix86_expand_move (mode, operands);

  switch (GET_CODE (op1))
      if (GET_CODE (tmp) != PLUS
          || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
      addend = XEXP (tmp, 1);

      model = SYMBOL_REF_TLS_MODEL (op1);
        op1 = legitimize_tls_address (op1, model, true);
      else if (ix86_force_load_from_GOT_p (op1))
          /* Load the external function address via GOT slot to avoid PLT.  */
          op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
          op1 = gen_rtx_CONST (Pmode, op1);
          op1 = gen_const_mem (Pmode, op1);
          set_mem_alias_set (op1, ix86_GOT_alias_set ());
          tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
          op1 = force_operand (op1, NULL_RTX);
          op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
                                     op0, 1, OPTAB_DIRECT);
            op1 = force_operand (op1, op0);
        op1 = convert_to_mode (mode, op1, 1);

  if ((flag_pic || MACHOPIC_INDIRECT)
      && symbolic_operand (op1, mode))
      if (TARGET_MACHO && !TARGET_64BIT)
          if (MACHOPIC_INDIRECT)
              rtx temp = (op0 && REG_P (op0) && mode == Pmode)
                         ? op0 : gen_reg_rtx (Pmode);
              op1 = machopic_indirect_data_reference (op1, temp);
            op1 = machopic_legitimize_pic_address (op1, mode,
                                                   temp == op1 ? 0 : temp);
          if (op0 != op1 && GET_CODE (op0) != MEM)
              rtx insn = gen_rtx_SET (op0, op1);
          if (GET_CODE (op0) == MEM)
            op1 = force_reg (Pmode, op1);
              if (GET_CODE (temp) != REG)
                temp = gen_reg_rtx (Pmode);
              temp = legitimize_pic_address (op1, temp);
            op1 = force_reg (mode, op1);
          else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
              rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
              op1 = legitimize_pic_address (op1, reg);
                op1 = convert_to_mode (mode, op1, 1);

          && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
              || !push_operand (op0, mode))
        op1 = force_reg (mode, op1);
      if (push_operand (op0, mode)
          && ! general_no_elim_operand (op1, mode))
        op1 = copy_to_mode_reg (mode, op1);

      /* Force large constants in 64bit compilation into register
         to get them CSEed.  */
      if (can_create_pseudo_p ()
          && (mode == DImode) && TARGET_64BIT
          && immediate_operand (op1, mode)
          && !x86_64_zext_immediate_operand (op1, VOIDmode)
          && !register_operand (op0, mode)
        op1 = copy_to_mode_reg (mode, op1);

      if (can_create_pseudo_p ())
          if (CONST_DOUBLE_P (op1))
              /* If we are loading a floating point constant to a
                 register, force the value to memory now, since we'll
                 get better code out the back end.  */
              op1 = validize_mem (force_const_mem (mode, op1));
              if (!register_operand (op0, mode))
                  rtx temp = gen_reg_rtx (mode);
                  emit_insn (gen_rtx_SET (temp, op1));
                  emit_move_insn (op0, temp);
          else if (GET_MODE_SIZE (mode) >= 16)
              rtx tmp = ix86_convert_const_wide_int_to_broadcast
                (GET_MODE (op0), op1);

  emit_insn (gen_rtx_SET (op0, op1));
/* OP is a memref of CONST_VECTOR, return scalar constant mem
   if CONST_VECTOR is a vec_duplicate, else return NULL.  */

ix86_broadcast_from_constant (machine_mode mode, rtx op)
  int nunits = GET_MODE_NUNITS (mode);

  /* Don't use integer vector broadcast if we can't move from GPR to SSE
     register directly.  */
  if (!TARGET_INTER_UNIT_MOVES_TO_VEC
      && INTEGRAL_MODE_P (mode))

  /* Convert CONST_VECTOR to a non-standard SSE constant integer
     broadcast only if vector broadcast is available.  */
            && (GET_MODE_INNER (mode) == SImode
                || GET_MODE_INNER (mode) == DImode))
        || FLOAT_MODE_P (mode))
      || standard_sse_constant_p (op, mode))

  /* Don't broadcast from a 64-bit integer constant in 32-bit mode.
     We can still put 64-bit integer constant in memory when
     avx512 embed broadcast is available.  */
  if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT
          || (GET_MODE_SIZE (mode) < 64 && !TARGET_AVX512VL)))

  if (GET_MODE_INNER (mode) == TImode)

  rtx constant = get_pool_constant (XEXP (op, 0));
  if (GET_CODE (constant) != CONST_VECTOR)
      /* There could be some rtx like
         (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
         but with "*.LC1" referring to a V2DI constant vector.  */
      if (GET_MODE (constant) != mode)
          constant = simplify_subreg (mode, constant, GET_MODE (constant),
          if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR)

  rtx first = XVECEXP (constant, 0, 0);
  for (int i = 1; i < nunits; ++i)
      rtx tmp = XVECEXP (constant, 0, i);
      /* Vector duplicate value.  */
      if (!rtx_equal_p (tmp, first))
ix86_expand_vector_move (machine_mode mode, rtx operands[])
  rtx op0 = operands[0], op1 = operands[1];
  /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
     psABI since the biggest alignment is 4 bytes for IA MCU psABI.  */
  unsigned int align = (TARGET_IAMCU
                        ? GET_MODE_BITSIZE (mode)
                        : GET_MODE_ALIGNMENT (mode));

  if (push_operand (op0, VOIDmode))
    op0 = emit_move_resolve_push (mode, op0);

  /* Force constants other than zero into memory.  We do not know how
     the instructions used to build constants modify the upper 64 bits
     of the register, once we have that information we may be able
     to handle some of them more efficiently.  */
  if (can_create_pseudo_p ()
           && CONSTANT_P (SUBREG_REG (op1))))
      && ((register_operand (op0, mode)
           && !standard_sse_constant_p (op1, mode))
          /* ix86_expand_vector_move_misalign() does not like constants.  */
          || (SSE_REG_MODE_P (mode)
              && MEM_ALIGN (op0) < align)))
          machine_mode imode = GET_MODE (SUBREG_REG (op1));
          rtx r = force_const_mem (imode, SUBREG_REG (op1));
            r = validize_mem (r);
            r = force_reg (imode, SUBREG_REG (op1));
          op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
              machine_mode mode = GET_MODE (op0);
              rtx tmp = ix86_convert_const_wide_int_to_broadcast
            op1 = validize_mem (force_const_mem (mode, op1));

  if (can_create_pseudo_p ()
      && GET_MODE_SIZE (mode) >= 16
      && VECTOR_MODE_P (mode)
          && SYMBOL_REF_P (XEXP (op1, 0))
          && CONSTANT_POOL_ADDRESS_P (XEXP (op1, 0))))
      rtx first = ix86_broadcast_from_constant (mode, op1);
      if (first != nullptr)
          /* Broadcast to XMM/YMM/ZMM register from an integer
             constant or scalar mem.  */
          op1 = gen_reg_rtx (mode);
          if (FLOAT_MODE_P (mode)
              || (!TARGET_64BIT && GET_MODE_INNER (mode) == DImode))
            first = force_const_mem (GET_MODE_INNER (mode), first);
          bool ok = ix86_expand_vector_init_duplicate (false, mode,
              emit_move_insn (op0, op1);

  /* We need to check memory alignment for SSE mode since attribute
     can make operands unaligned.  */
  if (can_create_pseudo_p ()
      && SSE_REG_MODE_P (mode)
      && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
          || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
      /* ix86_expand_vector_move_misalign() does not like both
         arguments in memory.  */
      if (!register_operand (op0, mode)
          && !register_operand (op1, mode))
          rtx scratch = gen_reg_rtx (mode);
          emit_move_insn (scratch, op1);
      tmp[0] = op0; tmp[1] = op1;
      ix86_expand_vector_move_misalign (mode, tmp);

  /* Special case TImode to V1TImode conversions, via V2DI.  */
      && GET_MODE (SUBREG_REG (op1)) == TImode
      && TARGET_64BIT && TARGET_SSE
      && can_create_pseudo_p ())
      rtx tmp = gen_reg_rtx (V2DImode);
      rtx lo = gen_reg_rtx (DImode);
      rtx hi = gen_reg_rtx (DImode);
      emit_move_insn (lo, gen_lowpart (DImode, SUBREG_REG (op1)));
      emit_move_insn (hi, gen_highpart (DImode, SUBREG_REG (op1)));
      emit_insn (gen_vec_concatv2di (tmp, lo, hi));
      emit_move_insn (op0, gen_lowpart (V1TImode, tmp));

  /* If operand0 is a hard register, make operand1 a pseudo.  */
  if (can_create_pseudo_p ()
      && !ix86_hardreg_mov_ok (op0, op1))
      rtx tmp = gen_reg_rtx (GET_MODE (op0));
      emit_move_insn (tmp, op1);
      emit_move_insn (op0, tmp);

  /* Make operand1 a register if it isn't already.  */
  if (can_create_pseudo_p ()
      && !register_operand (op0, mode)
      && !register_operand (op1, mode))
      rtx tmp = gen_reg_rtx (GET_MODE (op0));
      emit_move_insn (tmp, op1);
      emit_move_insn (op0, tmp);

  emit_insn (gen_rtx_SET (op0, op1));
/* Split 32-byte AVX unaligned load and store if needed.  */
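/* When the split is requested by TARGET_AVX256_SPLIT_UNALIGNED_LOAD or
   TARGET_AVX256_SPLIT_UNALIGNED_STORE, a misaligned 32-byte load is
   rebuilt from two 16-byte loads combined with a VEC_CONCAT and a
   misaligned store is emitted as two vextractf128 halves; otherwise a
   single unaligned 32-byte move is kept.  */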
ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
  rtx (*extract) (rtx, rtx, rtx);

  if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
      || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
      emit_insn (gen_rtx_SET (op0, op1));

  rtx orig_op0 = NULL_RTX;
  mode = GET_MODE (op0);
  switch (GET_MODE_CLASS (mode))
    case MODE_VECTOR_INT:
      if (mode != V32QImode)
            op0 = gen_reg_rtx (V32QImode);
            op0 = gen_lowpart (V32QImode, op0);
          op1 = gen_lowpart (V32QImode, op1);
    case MODE_VECTOR_FLOAT:
      extract = gen_avx_vextractf128v32qi;
      extract = gen_avx_vextractf128v16bf;
      extract = gen_avx_vextractf128v16hf;
      extract = gen_avx_vextractf128v8sf;
      extract = gen_avx_vextractf128v4df;

      rtx r = gen_reg_rtx (mode);
      m = adjust_address (op1, mode, 0);
      emit_move_insn (r, m);
      m = adjust_address (op1, mode, 16);
      r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
      emit_move_insn (op0, r);
  else if (MEM_P (op0))
      m = adjust_address (op0, mode, 0);
      emit_insn (extract (m, op1, const0_rtx));
      m = adjust_address (op0, mode, 16);
      emit_insn (extract (m, copy_rtx (op1), const1_rtx));

    emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
/* Implement the movmisalign patterns for SSE.  Non-SSE modes go
   straight to ix86_expand_vector_move.  */
/* Code generation for scalar reg-reg moves of single and double precision data:
     if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
     if (x86_sse_partial_reg_dependency == true)

   Code generation for scalar loads of double precision data:
     if (x86_sse_split_regs == true)
       movlpd mem, reg      (gas syntax)

   Code generation for unaligned packed loads of single precision data
   (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
     if (x86_sse_unaligned_move_optimal)
     if (x86_sse_partial_reg_dependency == true)

   Code generation for unaligned packed loads of double precision data
   (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
     if (x86_sse_unaligned_move_optimal)
     if (x86_sse_split_regs == true)
ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
  /* Use unaligned load/store for AVX512 or when optimizing for size.  */
  if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
      emit_insn (gen_rtx_SET (op0, op1));

      if (GET_MODE_SIZE (mode) == 32)
        ix86_avx256_split_vector_move_misalign (op0, op1);
        /* Always use 128-bit mov<mode>_internal pattern for AVX.  */
        emit_insn (gen_rtx_SET (op0, op1));

  if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
      || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
      emit_insn (gen_rtx_SET (op0, op1));

  /* ??? If we have typed data, then it would appear that using
     movdqu is the only way to get unaligned data loaded with
     integer type.  */
  if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
      emit_insn (gen_rtx_SET (op0, op1));

      if (TARGET_SSE2 && mode == V2DFmode)
          /* When SSE registers are split into halves, we can avoid
             writing to the top half twice.  */
          if (TARGET_SSE_SPLIT_REGS)
              /* ??? Not sure about the best option for the Intel chips.
                 The following would seem to satisfy; the register is
                 entirely cleared, breaking the dependency chain.  We
                 then store to the upper half, with a dependency depth
                 of one.  A rumor has it that Intel recommends two movsd
                 followed by an unpacklpd, but this is unconfirmed.  And
                 given that the dependency depth of the unpacklpd would
                 still be one, I'm not sure why this would be better.  */
              zero = CONST0_RTX (V2DFmode);

          m = adjust_address (op1, DFmode, 0);
          emit_insn (gen_sse2_loadlpd (op0, zero, m));
          m = adjust_address (op1, DFmode, 8);
          emit_insn (gen_sse2_loadhpd (op0, op0, m));

          if (mode != V4SFmode)
            t = gen_reg_rtx (V4SFmode);

          if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
            emit_move_insn (t, CONST0_RTX (V4SFmode));

          m = adjust_address (op1, V2SFmode, 0);
          emit_insn (gen_sse_loadlps (t, t, m));
          m = adjust_address (op1, V2SFmode, 8);
          emit_insn (gen_sse_loadhps (t, t, m));
          if (mode != V4SFmode)
            emit_move_insn (op0, gen_lowpart (mode, t));
  else if (MEM_P (op0))
      if (TARGET_SSE2 && mode == V2DFmode)
          m = adjust_address (op0, DFmode, 0);
          emit_insn (gen_sse2_storelpd (m, op1));
          m = adjust_address (op0, DFmode, 8);
          emit_insn (gen_sse2_storehpd (m, op1));

          if (mode != V4SFmode)
            op1 = gen_lowpart (V4SFmode, op1);

          m = adjust_address (op0, V2SFmode, 0);
          emit_insn (gen_sse_storelps (m, op1));
          m = adjust_address (op0, V2SFmode, 8);
          emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
/* Move bits 64:95 to bits 32:63.  */

ix86_move_vector_high_sse_to_mmx (rtx op)
  rtx mask = gen_rtx_PARALLEL (VOIDmode,
                               gen_rtvec (4, GEN_INT (0), GEN_INT (2),
                                          GEN_INT (0), GEN_INT (0)));
  rtx dest = lowpart_subreg (V4SImode, op, GET_MODE (op));
  op = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
  rtx insn = gen_rtx_SET (dest, op);
/* Split MMX pack with signed/unsigned saturation with SSE/SSE2.  */

ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
  rtx op0 = operands[0];
  rtx op1 = operands[1];
  rtx op2 = operands[2];

  machine_mode dmode = GET_MODE (op0);
  machine_mode smode = GET_MODE (op1);
  machine_mode inner_dmode = GET_MODE_INNER (dmode);
  machine_mode inner_smode = GET_MODE_INNER (smode);

  /* Get the corresponding SSE mode for destination.  */
  int nunits = 16 / GET_MODE_SIZE (inner_dmode);
  machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode),
  machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode),
                                                 nunits / 2).require ();

  /* Get the corresponding SSE mode for source.  */
  nunits = 16 / GET_MODE_SIZE (inner_smode);
  machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode),

  /* Generate SSE pack with signed/unsigned saturation.  */
  rtx dest = lowpart_subreg (sse_dmode, op0, GET_MODE (op0));
  op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
  op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));

  op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
  op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
  rtx insn = gen_rtx_SET (dest, gen_rtx_VEC_CONCAT (sse_dmode,

  ix86_move_vector_high_sse_to_mmx (op0);
/* Split MMX punpcklXX/punpckhXX with SSE punpcklXX.  */

ix86_split_mmx_punpck (rtx operands[], bool high_p)
  rtx op0 = operands[0];
  rtx op1 = operands[1];
  rtx op2 = operands[2];
  machine_mode mode = GET_MODE (op0);
  /* The corresponding SSE mode.  */
  machine_mode sse_mode, double_sse_mode;

      sse_mode = V16QImode;
      double_sse_mode = V32QImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
                               GEN_INT (0), GEN_INT (16),
                               GEN_INT (1), GEN_INT (17),
                               GEN_INT (2), GEN_INT (18),
                               GEN_INT (3), GEN_INT (19),
                               GEN_INT (4), GEN_INT (20),
                               GEN_INT (5), GEN_INT (21),
                               GEN_INT (6), GEN_INT (22),
                               GEN_INT (7), GEN_INT (23)));

      sse_mode = V8HImode;
      double_sse_mode = V16HImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
                               GEN_INT (0), GEN_INT (8),
                               GEN_INT (1), GEN_INT (9),
                               GEN_INT (2), GEN_INT (10),
                               GEN_INT (3), GEN_INT (11)));

      sse_mode = V4SImode;
      double_sse_mode = V8SImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
                               GEN_INT (0), GEN_INT (4),
                               GEN_INT (1), GEN_INT (5)));

      sse_mode = V4SFmode;
      double_sse_mode = V8SFmode;
      mask = gen_rtx_PARALLEL (VOIDmode,
                               GEN_INT (0), GEN_INT (4),
                               GEN_INT (1), GEN_INT (5)));

  /* Generate SSE punpcklXX.  */
  rtx dest = lowpart_subreg (sse_mode, op0, GET_MODE (op0));
  op1 = lowpart_subreg (sse_mode, op1, GET_MODE (op1));
  op2 = lowpart_subreg (sse_mode, op2, GET_MODE (op2));

  op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2);
  op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask);
  rtx insn = gen_rtx_SET (dest, op2);

  /* Move high bits to low bits.  */
      if (sse_mode == V4SFmode)
          mask = gen_rtx_PARALLEL (VOIDmode,
                                   gen_rtvec (4, GEN_INT (2), GEN_INT (3),
                                              GEN_INT (4), GEN_INT (5)));
          op2 = gen_rtx_VEC_CONCAT (V8SFmode, dest, dest);
          op1 = gen_rtx_VEC_SELECT (V4SFmode, op2, mask);

          int sz = GET_MODE_SIZE (mode);

            mask = gen_rtx_PARALLEL (VOIDmode,
                                     gen_rtvec (4, GEN_INT (1), GEN_INT (0),
                                                GEN_INT (0), GEN_INT (1)));
            mask = gen_rtx_PARALLEL (VOIDmode,
                                     gen_rtvec (4, GEN_INT (2), GEN_INT (3),
                                                GEN_INT (0), GEN_INT (1)));

          dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
          op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);

      insn = gen_rtx_SET (dest, op1);
/* Helper function of ix86_fixup_binary_operands to canonicalize
   operand order.  Returns true if the operands should be swapped.  */

ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
  rtx dst = operands[0];
  rtx src1 = operands[1];
  rtx src2 = operands[2];

  /* If the operation is not commutative, we can't do anything.  */
  if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
      && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)

  /* Highest priority is that src1 should match dst.  */
  if (rtx_equal_p (dst, src1))
  if (rtx_equal_p (dst, src2))

  /* Next highest priority is that immediate constants come second.  */
  if (immediate_operand (src2, mode))
  if (immediate_operand (src1, mode))

  /* Lowest priority is that memory references should come second.  */
/* Fix up OPERANDS to satisfy ix86_binary_operator_ok.  Return the
   destination to use for the operation.  If different from the true
   destination in operands[0], a copy operation will be required.  */

ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
  rtx dst = operands[0];
  rtx src1 = operands[1];
  rtx src2 = operands[2];

  /* Canonicalize operand order.  */
  if (ix86_swap_binary_operands_p (code, mode, operands))
      /* It is invalid to swap operands of different modes.  */
      gcc_assert (GET_MODE (src1) == GET_MODE (src2));

      std::swap (src1, src2);

  /* Both source operands cannot be in memory.  */
  if (MEM_P (src1) && MEM_P (src2))
      /* Optimization: Only read from memory once.  */
      if (rtx_equal_p (src1, src2))
          src2 = force_reg (mode, src2);
      else if (rtx_equal_p (dst, src1))
        src2 = force_reg (mode, src2);
        src1 = force_reg (mode, src1);

  /* If the destination is memory, and we do not have matching source
     operands, do things in registers.  */
  if (MEM_P (dst) && !rtx_equal_p (dst, src1))
    dst = gen_reg_rtx (mode);

  /* Source 1 cannot be a constant.  */
  if (CONSTANT_P (src1))
    src1 = force_reg (mode, src1);

  /* Source 1 cannot be a non-matching memory.  */
  if (MEM_P (src1) && !rtx_equal_p (dst, src1))
    src1 = force_reg (mode, src1);

  /* Improve address combine.  */
      && GET_MODE_CLASS (mode) == MODE_INT
    src2 = force_reg (mode, src2);

/* Similarly, but assume that the destination has already been
   fixed up.  */

ix86_fixup_binary_operands_no_copy (enum rtx_code code,
                                    machine_mode mode, rtx operands[])
  rtx dst = ix86_fixup_binary_operands (code, mode, operands);
  gcc_assert (dst == operands[0]);
/* Attempt to expand a binary operator.  Make the expansion closer to the
   actual machine, than just general_operand, which will allow 3 separate
   memory references (one output, two input) in a single insn.  */

ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
  rtx src1, src2, dst, op, clob;

  dst = ix86_fixup_binary_operands (code, mode, operands);

  /* Emit the instruction.  */

  op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));

  if (reload_completed
      && !rtx_equal_p (dst, src1))
      /* This is going to be an LEA; avoid splitting it later.  */
      clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));

  /* Fix up the destination if needed.  */
  if (dst != operands[0])
    emit_move_insn (operands[0], dst);
/* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
   the given OPERANDS.  */

ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
  rtx op1 = NULL_RTX, op2 = NULL_RTX;
  if (SUBREG_P (operands[1]))
  else if (SUBREG_P (operands[2]))

  /* Optimize (__m128i) d | (__m128i) e and similar code
     when d and e are float vectors into float vector logical
     insn.  In C/C++ without using intrinsics there is no other way
     to express vector logical operation on float vectors than
     to cast them temporarily to integer vectors.  */
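  /* Besides matching the programmer's intent, using the float-domain
     logical instructions (andps/orps/xorps) here can avoid the
     bypass-delay penalty that some processors charge when a value
     produced in the floating-point SSE domain is consumed by an
     integer-domain instruction.  */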
      && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
      && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
      && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
      && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
      && SUBREG_BYTE (op1) == 0
      && (GET_CODE (op2) == CONST_VECTOR
          || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
              && SUBREG_BYTE (op2) == 0))
      && can_create_pseudo_p ())
      switch (GET_MODE (SUBREG_REG (op1)))
          dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
          if (GET_CODE (op2) == CONST_VECTOR)
              op2 = gen_lowpart (GET_MODE (dst), op2);
              op2 = force_reg (GET_MODE (dst), op2);
              op2 = SUBREG_REG (operands[2]);
              if (!vector_operand (op2, GET_MODE (dst)))
                op2 = force_reg (GET_MODE (dst), op2);
          op1 = SUBREG_REG (op1);
          if (!vector_operand (op1, GET_MODE (dst)))
            op1 = force_reg (GET_MODE (dst), op1);
          emit_insn (gen_rtx_SET (dst,
                                  gen_rtx_fmt_ee (code, GET_MODE (dst),
          emit_move_insn (operands[0], gen_lowpart (mode, dst));

  if (!vector_operand (operands[1], mode))
    operands[1] = force_reg (mode, operands[1]);
  if (!vector_operand (operands[2], mode))
    operands[2] = force_reg (mode, operands[2]);
  ix86_fixup_binary_operands_no_copy (code, mode, operands);
  emit_insn (gen_rtx_SET (operands[0],
                          gen_rtx_fmt_ee (code, mode, operands[1],
/* Return TRUE or FALSE depending on whether the binary operator meets the
   appropriate constraints.  */

ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
  rtx dst = operands[0];
  rtx src1 = operands[1];
  rtx src2 = operands[2];

  /* Both source operands cannot be in memory.  */
  if ((MEM_P (src1) || bcst_mem_operand (src1, mode))
      && (MEM_P (src2) || bcst_mem_operand (src2, mode)))

  /* Canonicalize operand order for commutative operators.  */
  if (ix86_swap_binary_operands_p (code, mode, operands))
    std::swap (src1, src2);

  /* If the destination is memory, we must have a matching source operand.  */
  if (MEM_P (dst) && !rtx_equal_p (dst, src1))

  /* Source 1 cannot be a constant.  */
  if (CONSTANT_P (src1))

  /* Source 1 cannot be a non-matching memory.  */
  if (MEM_P (src1) && !rtx_equal_p (dst, src1))
    /* Support "andhi/andsi/anddi" as a zero-extending move.  */
               || (TARGET_64BIT && mode == DImode))
           && satisfies_constraint_L (src2));
/* Attempt to expand a unary operator.  Make the expansion closer to the
   actual machine, than just general_operand, which will allow 2 separate
   memory references (one output, one input) in a single insn.  */

ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
  bool matching_memory = false;
  rtx src, dst, op, clob;

  /* If the destination is memory, and we do not have matching source
     operands, do things in registers.  */
      if (rtx_equal_p (dst, src))
        matching_memory = true;
        dst = gen_reg_rtx (mode);

  /* When source operand is memory, destination must match.  */
  if (MEM_P (src) && !matching_memory)
    src = force_reg (mode, src);

  /* Emit the instruction.  */

  op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));

      clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));

  /* Fix up the destination if needed.  */
  if (dst != operands[0])
    emit_move_insn (operands[0], dst);
/* Predict just emitted jump instruction to be taken with probability PROB.  */

predict_jump (int prob)
  rtx_insn *insn = get_last_insn ();
  gcc_assert (JUMP_P (insn));
  add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));

/* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
   divisor are within the range [0-255].  */
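/* At run time the split computes (dividend | divisor) and tests it
   against ~0xff: if no bit above the low 8 is set, both values fit in
   8 bits and a single HImode-by-QImode divide leaves the quotient in
   AL and the remainder in AH; otherwise control falls through to the
   original full-width divide.  */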
ix86_split_idivmod (machine_mode mode, rtx operands[],
  rtx_code_label *end_label, *qimode_label;
  rtx scratch, tmp0, tmp1, tmp2;
  rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);

  operands[2] = force_reg (mode, operands[2]);
  operands[3] = force_reg (mode, operands[3]);

  if (GET_MODE (operands[0]) == SImode)
      if (GET_MODE (operands[1]) == SImode)
        gen_divmod4_1 = unsigned_p ? gen_udivmodsi4_1 : gen_divmodsi4_1;
          = unsigned_p ? gen_udivmodsi4_zext_2 : gen_divmodsi4_zext_2;
          = unsigned_p ? gen_udivmodsi4_zext_1 : gen_divmodsi4_zext_1;
    gen_divmod4_1 = unsigned_p ? gen_udivmoddi4_1 : gen_divmoddi4_1;

  end_label = gen_label_rtx ();
  qimode_label = gen_label_rtx ();

  scratch = gen_reg_rtx (mode);

  /* Use 8bit unsigned divmod if dividend and divisor are within
     the range [0-255].  */
  emit_move_insn (scratch, operands[2]);
  scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
                                 scratch, 1, OPTAB_DIRECT);
  emit_insn (gen_test_ccno_1 (mode, scratch, GEN_INT (-0x100)));
  tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
  tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
  tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
                               gen_rtx_LABEL_REF (VOIDmode, qimode_label),
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = qimode_label;

  /* Generate original signed/unsigned divmod.  */
  emit_insn (gen_divmod4_1 (operands[0], operands[1],
                            operands[2], operands[3]));

  /* Branch to the end.  */
  emit_jump_insn (gen_jump (end_label));

  /* Generate 8bit unsigned divide.  */
  emit_label (qimode_label);
  /* Don't use operands[0] for result of 8bit divide since not all
     registers support QImode ZERO_EXTRACT.  */
  tmp0 = lowpart_subreg (HImode, scratch, mode);
  tmp1 = lowpart_subreg (HImode, operands[2], mode);
  tmp2 = lowpart_subreg (QImode, operands[3], mode);
  emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));

      div = gen_rtx_UDIV (mode, operands[2], operands[3]);
      mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
      div = gen_rtx_DIV (mode, operands[2], operands[3]);
      mod = gen_rtx_MOD (mode, operands[2], operands[3]);

  if (GET_MODE (operands[0]) != SImode)
    div = gen_rtx_ZERO_EXTEND (DImode, div);
  if (GET_MODE (operands[1]) != SImode)
    mod = gen_rtx_ZERO_EXTEND (DImode, mod);

  /* Extract remainder from AH.  */
  scratch = gen_lowpart (GET_MODE (operands[1]), scratch);
  tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]), scratch,
                               GEN_INT (8), GEN_INT (8));
  insn = emit_move_insn (operands[1], tmp1);
  set_unique_reg_note (insn, REG_EQUAL, mod);

  /* Zero extend quotient from AL.  */
  tmp1 = gen_lowpart (QImode, tmp0);
  insn = emit_insn (gen_extend_insn
                    GET_MODE (operands[0]), QImode, 1));
  set_unique_reg_note (insn, REG_EQUAL, div);

  emit_label (end_label);
/* Emit x86 binary operand CODE in mode MODE, where the first operand
   matches destination.  RTX includes clobber of FLAGS_REG.  */

ix86_emit_binop (enum rtx_code code, machine_mode mode,
  op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
  clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));

  emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
/* Return true if regno1 def is nearest to the insn.  */

find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
  rtx_insn *prev = insn;
  rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));

  while (prev && prev != start)
      if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
          prev = PREV_INSN (prev);
      if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
      else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
      prev = PREV_INSN (prev);

  /* None of the regs is defined in the bb.  */

/* INSN_UID of the last insn emitted by zero store peephole2s.  */
int ix86_last_zero_store_uid;
/* Split lea instructions into a sequence of instructions
   which are executed on ALU to avoid AGU stalls.
   It is assumed that it is allowed to clobber flags register.  */
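/* For example, a scaled lea such as "lea disp(base,index,scale), dst"
   is rebuilt below as a move of the index into dst, a shift by the
   scale (emitted as a MULT), an add of the base and finally an add of
   the displacement, so the address computation executes on ALU ports.  */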
ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
  unsigned int regno0, regno1, regno2;
  struct ix86_address parts;

  ok = ix86_decompose_address (operands[1], &parts);

  target = gen_lowpart (mode, operands[0]);

  regno0 = true_regnum (target);
  regno1 = INVALID_REGNUM;
  regno2 = INVALID_REGNUM;

      parts.base = gen_lowpart (mode, parts.base);
      regno1 = true_regnum (parts.base);
      parts.index = gen_lowpart (mode, parts.index);
      regno2 = true_regnum (parts.index);
    parts.disp = gen_lowpart (mode, parts.disp);

  if (parts.scale > 1)
      /* Case r1 = r1 + ...  */
      if (regno1 == regno0)
          /* If we have a case r1 = r1 + C * r2 then we
             should use multiplication which is very
             expensive.  Assume cost model is wrong if we
             have such case here.  */
          gcc_assert (regno2 != regno0);

          for (adds = parts.scale; adds > 0; adds--)
            ix86_emit_binop (PLUS, mode, target, parts.index);

          /* r1 = r2 + r3 * C case.  Need to move r3 into r1.  */
          if (regno0 != regno2)
            emit_insn (gen_rtx_SET (target, parts.index));

          /* Use shift for scaling, but emit it as MULT instead
             to avoid it being immediately peephole2 optimized back
             into lea.  */
          ix86_emit_binop (MULT, mode, target, GEN_INT (parts.scale));

            ix86_emit_binop (PLUS, mode, target, parts.base);

          if (parts.disp && parts.disp != const0_rtx)
            ix86_emit_binop (PLUS, mode, target, parts.disp);
  else if (!parts.base && !parts.index)
      gcc_assert(parts.disp);
      emit_insn (gen_rtx_SET (target, parts.disp));

          if (regno0 != regno2)
            emit_insn (gen_rtx_SET (target, parts.index));
      else if (!parts.index)
          if (regno0 != regno1)
            emit_insn (gen_rtx_SET (target, parts.base));
          if (regno0 == regno1)
          else if (regno0 == regno2)
              /* Find better operand for SET instruction, depending
                 on which definition is farther from the insn.  */
              if (find_nearest_reg_def (insn, regno1, regno2))
                tmp = parts.index, tmp1 = parts.base;
                tmp = parts.base, tmp1 = parts.index;

              emit_insn (gen_rtx_SET (target, tmp));

              if (parts.disp && parts.disp != const0_rtx)
                ix86_emit_binop (PLUS, mode, target, parts.disp);

              ix86_emit_binop (PLUS, mode, target, tmp1);
              ix86_emit_binop (PLUS, mode, target, tmp);

          if (parts.disp && parts.disp != const0_rtx)
            ix86_emit_binop (PLUS, mode, target, parts.disp);
/* Post-reload splitter for converting an SF or DFmode value in an
   SSE register into an unsigned SImode.  */
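/* Values below 2^31 convert directly with the signed cvttps2dq /
   cvttpd2dq instructions.  For values of at least 2^31, detected by the
   LE comparison below, 2^31 is subtracted before the signed conversion
   and the missing sign bit, obtained by shifting the comparison mask
   left by 31, is xored back into the integer result.  */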
ix86_split_convert_uns_si_sse (rtx operands[])
  machine_mode vecmode;
  rtx value, large, zero_or_two31, input, two31, x;

  large = operands[1];
  zero_or_two31 = operands[2];
  input = operands[3];
  two31 = operands[4];
  vecmode = GET_MODE (large);
  value = gen_rtx_REG (vecmode, REGNO (operands[0]));

  /* Load up the value into the low element.  We must ensure that the other
     elements are valid floats -- zero is the easiest such value.  */
      if (vecmode == V4SFmode)
        emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
        emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));

      input = gen_rtx_REG (vecmode, REGNO (input));
      emit_move_insn (value, CONST0_RTX (vecmode));
      if (vecmode == V4SFmode)
        emit_insn (gen_sse_movss_v4sf (value, value, input));
        emit_insn (gen_sse2_movsd_v2df (value, value, input));

  emit_move_insn (large, two31);
  emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);

  x = gen_rtx_fmt_ee (LE, vecmode, large, value);
  emit_insn (gen_rtx_SET (large, x));

  x = gen_rtx_AND (vecmode, zero_or_two31, large);
  emit_insn (gen_rtx_SET (zero_or_two31, x));

  x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
  emit_insn (gen_rtx_SET (value, x));

  large = gen_rtx_REG (V4SImode, REGNO (large));
  emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));

  x = gen_rtx_REG (V4SImode, REGNO (value));
  if (vecmode == V4SFmode)
    emit_insn (gen_fix_truncv4sfv4si2 (x, value));
    emit_insn (gen_sse2_cvttpd2dq (x, value));

  emit_insn (gen_xorv4si3 (value, value, large));
static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
                                                 machine_mode mode, rtx target,
                                                 rtx var, int one_var);

/* Convert an unsigned DImode value into a DFmode, using only SSE.
   Expects the 64-bit DImode to be supplied in a pair of integral
   registers.  Requires SSE2; will use SSE3 if available.  For x86_32,
   -mfpmath=sse, !optimize_size only.  */
ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
  REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
  rtx int_xmm, fp_xmm;
  rtx biases, exponents;

  int_xmm = gen_reg_rtx (V4SImode);
  if (TARGET_INTER_UNIT_MOVES_TO_VEC)
    emit_insn (gen_movdi_to_sse (int_xmm, input));
  else if (TARGET_SSE_SPLIT_REGS)
      emit_clobber (int_xmm);
      emit_move_insn (gen_lowpart (DImode, int_xmm), input);

      x = gen_reg_rtx (V2DImode);
      ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
      emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));

  x = gen_rtx_CONST_VECTOR (V4SImode,
                            gen_rtvec (4, GEN_INT (0x43300000UL),
                                       GEN_INT (0x45300000UL),
                                       const0_rtx, const0_rtx));
  exponents = validize_mem (force_const_mem (V4SImode, x));

  /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
  emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));

  /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
     yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
     Similarly (0x45300000UL ## fp_value_hi_xmm) yields
     (0x1.0p84 + double(fp_value_hi_xmm)).
     Note these exponents differ by 32.  */
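  /* Worked example: for input 2^32 + 5 the low word is 5 and the high
     word is 1, so the two packed doubles read back as 2^52 + 5 and
     2^84 + 2^32; after the bias subtraction below they become 5 and
     2^32, and their sum is exactly the original unsigned value.  */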
  fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));

  /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
     in [0,2**32-1] and [0]+[2**32,2**64-1] respectively.  */
  real_ldexp (&bias_lo_rvt, &dconst1, 52);
  real_ldexp (&bias_hi_rvt, &dconst1, 84);
  biases = const_double_from_real_value (bias_lo_rvt, DFmode);
  x = const_double_from_real_value (bias_hi_rvt, DFmode);
  biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
  biases = validize_mem (force_const_mem (V2DFmode, biases));
  emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));

  /* Add the upper and lower DFmode values together.  */
    emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
      x = copy_to_mode_reg (V2DFmode, fp_xmm);
      emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
      emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));

  ix86_expand_vector_extract (false, target, fp_xmm, 0);
/* Not used, but eases macroization of patterns.  */

ix86_expand_convert_uns_sixf_sse (rtx, rtx)

static rtx ix86_expand_sse_fabs (rtx op0, rtx *smask);

/* Convert an unsigned SImode value into a DFmode.  Only currently used
   for SSE, but applicable anywhere.  */
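/* The trick: adding INT_MIN makes the value wrap to input - 2^31, which
   lies in the signed SImode range and therefore converts exactly with
   the ordinary signed int-to-double conversion; adding 2^31.0 back
   afterwards restores the unsigned value.  */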
ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
  REAL_VALUE_TYPE TWO31r;

  x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
                           NULL, 1, OPTAB_DIRECT);

  fp = gen_reg_rtx (DFmode);
  emit_insn (gen_floatsidf2 (fp, x));

  real_ldexp (&TWO31r, &dconst1, 31);
  x = const_double_from_real_value (TWO31r, DFmode);

  x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);

  /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
  if (HONOR_SIGNED_ZEROS (DFmode) && flag_rounding_math)
    x = ix86_expand_sse_fabs (x, NULL);

    emit_move_insn (target, x);
/* Convert a signed DImode value into a DFmode.  Only used for SSE in
   32-bit mode; otherwise we have a direct convert instruction.  */

ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
  REAL_VALUE_TYPE TWO32r;
  rtx fp_lo, fp_hi, x;

  fp_lo = gen_reg_rtx (DFmode);
  fp_hi = gen_reg_rtx (DFmode);

  emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));

  real_ldexp (&TWO32r, &dconst1, 32);
  x = const_double_from_real_value (TWO32r, DFmode);
  fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);

  ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));

  x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,

    emit_move_insn (target, x);
/* Convert an unsigned SImode value into a SFmode, using only SSE.
   For x86_32, -mfpmath=sse, !optimize_size only.  */

ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
  REAL_VALUE_TYPE ONE16r;
  rtx fp_hi, fp_lo, int_hi, int_lo, x;

  real_ldexp (&ONE16r, &dconst1, 16);
  x = const_double_from_real_value (ONE16r, SFmode);
  int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
                                NULL, 0, OPTAB_DIRECT);
  int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
                                NULL, 0, OPTAB_DIRECT);
  fp_hi = gen_reg_rtx (SFmode);
  fp_lo = gen_reg_rtx (SFmode);
  emit_insn (gen_floatsisf2 (fp_hi, int_hi));
  emit_insn (gen_floatsisf2 (fp_lo, int_lo));

      x = validize_mem (force_const_mem (SFmode, x));
      fp_hi = gen_rtx_FMA (SFmode, fp_hi, x, fp_lo);
      emit_move_insn (target, fp_hi);

      fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
      fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
      if (!rtx_equal_p (target, fp_hi))
        emit_move_insn (target, fp_hi);
/* floatunsv{4,8}siv{4,8}sf2 expander.  Expand code to convert
   a vector of unsigned ints VAL to vector of floats TARGET.  */
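/* The conversion works on 16-bit halves: (val & 0xffff) and (val >> 16)
   are each converted with the signed vector int-to-float conversion and
   recombined as lo + hi * 2^16, using an FMA when one is available.  */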
ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
  REAL_VALUE_TYPE TWO16r;
  machine_mode intmode = GET_MODE (val);
  machine_mode fltmode = GET_MODE (target);
  rtx (*cvt) (rtx, rtx);

  if (intmode == V4SImode)
    cvt = gen_floatv4siv4sf2;
    cvt = gen_floatv8siv8sf2;
  tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
  tmp[0] = force_reg (intmode, tmp[0]);
  tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
  tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
                                NULL_RTX, 1, OPTAB_DIRECT);
  tmp[3] = gen_reg_rtx (fltmode);
  emit_insn (cvt (tmp[3], tmp[1]));
  tmp[4] = gen_reg_rtx (fltmode);
  emit_insn (cvt (tmp[4], tmp[2]));
  real_ldexp (&TWO16r, &dconst1, 16);
  tmp[5] = const_double_from_real_value (TWO16r, SFmode);
  tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));

      tmp[6] = gen_rtx_FMA (fltmode, tmp[4], tmp[5], tmp[3]);
      emit_move_insn (target, tmp[6]);

      tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5],
                                    NULL_RTX, 1, OPTAB_DIRECT);
      tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6],
                                    target, 1, OPTAB_DIRECT);
      if (tmp[7] != target)
        emit_move_insn (target, tmp[7]);
/* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
   pattern can be used on it instead of fixuns_trunc*.
   This is done by doing just signed conversion if < 0x1p31, and otherwise by
   subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards.  */

ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
  REAL_VALUE_TYPE TWO31r;
  machine_mode mode = GET_MODE (val);
  machine_mode scalarmode = GET_MODE_INNER (mode);
  machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
  rtx (*cmp) (rtx, rtx, rtx, rtx);

  for (i = 0; i < 3; i++)
    tmp[i] = gen_reg_rtx (mode);
  real_ldexp (&TWO31r, &dconst1, 31);
  two31r = const_double_from_real_value (TWO31r, scalarmode);
  two31r = ix86_build_const_vector (mode, 1, two31r);
  two31r = force_reg (mode, two31r);

    case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
    case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
    case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
    case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
    default: gcc_unreachable ();

  tmp[3] = gen_rtx_LE (mode, two31r, val);
  emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
  tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
  if (intmode == V4SImode || TARGET_AVX2)
    *xorp = expand_simple_binop (intmode, ASHIFT,
                                 gen_lowpart (intmode, tmp[0]),
                                 GEN_INT (31), NULL_RTX, 0,

      rtx two31 = gen_int_mode (HOST_WIDE_INT_1U << 31, SImode);
      two31 = ix86_build_const_vector (intmode, 1, two31);
      *xorp = expand_simple_binop (intmode, AND,
                                   gen_lowpart (intmode, tmp[0]),

  return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
/* Generate code for floating point ABS or NEG.  */

ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
  bool use_sse = false;
  bool vector_mode = VECTOR_MODE_P (mode);
  machine_mode vmode = mode;

  if (vector_mode || mode == TFmode || mode == HFmode)
  else if (TARGET_SSE_MATH)
      use_sse = SSE_FLOAT_MODE_P (mode);
      else if (mode == DFmode)

  set = gen_rtx_fmt_e (code, mode, src);
  set = gen_rtx_SET (dst, set);

      rtx mask, use, clob;

      /* NEG and ABS performed with SSE use bitwise mask operations.
         Create the appropriate mask now.  */
      mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
      use = gen_rtx_USE (VOIDmode, mask);
      if (vector_mode || mode == TFmode)
        par = gen_rtvec (2, set, use);
          clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
          par = gen_rtvec (3, set, use, clob);

      /* Changing of sign for FP values is doable using integer unit too.  */
      clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      par = gen_rtvec (2, set, clob);

  emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
/* Deconstruct a floating point ABS or NEG operation
   with integer registers into integer operations.  */

ix86_split_fp_absneg_operator (enum rtx_code code, machine_mode mode,
  enum rtx_code absneg_op;

  gcc_assert (operands_match_p (operands[0], operands[1]));

      dst = gen_lowpart (SImode, operands[0]);

          set = gen_int_mode (0x7fffffff, SImode);
          set = gen_int_mode (0x80000000, SImode);
      set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);

          dst = gen_lowpart (DImode, operands[0]);
          dst = gen_rtx_ZERO_EXTRACT (DImode, dst, const1_rtx, GEN_INT (63));
          set = gen_rtx_NOT (DImode, dst);

          dst = gen_highpart (SImode, operands[0]);

              set = gen_int_mode (0x7fffffff, SImode);
              set = gen_int_mode (0x80000000, SImode);
          set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);

      dst = gen_rtx_REG (SImode,
                         REGNO (operands[0]) + (TARGET_64BIT ? 1 : 2));
          set = GEN_INT (0x7fff);
          set = GEN_INT (0x8000);
      set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);

  set = gen_rtx_SET (dst, set);

  rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
  rtvec par = gen_rtvec (2, set, clob);

  emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
/* Expand a copysign operation.  Special case operand 0 being a constant.  */
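/* In the general case the expansion computes
     dest = (src1 & ~signmask) | (src2 & signmask)
   i.e. the magnitude bits of the first source operand combined with the
   sign bit of the second, using only vector bitwise operations; when
   src1 is the constant 0.0 a single AND with the sign mask suffices.  */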
ix86_expand_copysign (rtx operands[])
  machine_mode mode, vmode;
  rtx dest, vdest, op0, op1, mask, op2, op3;

  mode = GET_MODE (operands[0]);

  else if (mode == SFmode)
  else if (mode == DFmode)
  else if (mode == TFmode)

  if (rtx_equal_p (operands[1], operands[2]))
      emit_move_insn (operands[0], operands[1]);

  vdest = lowpart_subreg (vmode, dest, mode);
  if (vdest == NULL_RTX)
    vdest = gen_reg_rtx (vmode);

  op1 = lowpart_subreg (vmode, force_reg (mode, operands[2]), mode);
  mask = ix86_build_signbit_mask (vmode, 0, 0);

  if (CONST_DOUBLE_P (operands[1]))
      op0 = simplify_unary_operation (ABS, mode, operands[1], mode);
      /* Optimize for 0, simplify b = copysignf (0.0f, a) to b = mask & a.  */
      if (op0 == CONST0_RTX (mode))
          emit_move_insn (vdest, gen_rtx_AND (vmode, mask, op1));
            emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));

      if (GET_MODE_SIZE (mode) < 16)
        op0 = ix86_build_const_vector (vmode, false, op0);
      op0 = force_reg (vmode, op0);
    op0 = lowpart_subreg (vmode, force_reg (mode, operands[1]), mode);

  op2 = gen_reg_rtx (vmode);
  op3 = gen_reg_rtx (vmode);
  emit_move_insn (op2, gen_rtx_AND (vmode,
                                    gen_rtx_NOT (vmode, mask),
  emit_move_insn (op3, gen_rtx_AND (vmode, mask, op1));
  emit_move_insn (vdest, gen_rtx_IOR (vmode, op2, op3));

    emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
/* Expand an xorsign operation.  */
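/* xorsign (a, b) is a with its sign flipped whenever b is negative:
     dest = src1 ^ (src2 & signmask)
   which the expansion below builds from one vector AND and one XOR.  */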
ix86_expand_xorsign (rtx operands[])
  machine_mode mode, vmode;
  rtx dest, vdest, op0, op1, mask, x, temp;

  mode = GET_MODE (dest);

  else if (mode == SFmode)
  else if (mode == DFmode)

  temp = gen_reg_rtx (vmode);
  mask = ix86_build_signbit_mask (vmode, 0, 0);

  op1 = lowpart_subreg (vmode, force_reg (mode, op1), mode);
  x = gen_rtx_AND (vmode, op1, mask);
  emit_insn (gen_rtx_SET (temp, x));

  op0 = lowpart_subreg (vmode, force_reg (mode, op0), mode);
  x = gen_rtx_XOR (vmode, temp, op0);

  vdest = lowpart_subreg (vmode, dest, mode);
  if (vdest == NULL_RTX)
    vdest = gen_reg_rtx (vmode);

  emit_insn (gen_rtx_SET (vdest, x));

    emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
2345 static rtx
ix86_expand_compare (enum rtx_code code
, rtx op0
, rtx op1
);
2348 ix86_expand_branch (enum rtx_code code
, rtx op0
, rtx op1
, rtx label
)
2350 machine_mode mode
= GET_MODE (op0
);
  /* Handle special case - vector comparison with boolean result, transform
     it using ptest instruction.  */
  if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
    {
      rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
      machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;

      gcc_assert (code == EQ || code == NE);

      op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
      op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);

      /* Generate XOR since we can't check that one operand is zero vector.  */
      tmp = gen_reg_rtx (mode);
      emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
      tmp = gen_lowpart (p_mode, tmp);
      emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
			      gen_rtx_UNSPEC (CCmode,
					      gen_rtvec (2, tmp, tmp),
					      UNSPEC_PTEST)));
      tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
				  gen_rtx_LABEL_REF (VOIDmode, label),
				  pc_rtx);
      emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      return;
    }
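  /* In other words, for an EQ/NE branch on vector operands the sequence
     emitted above is approximately
       tmp = op0 ^ op1;  ptest tmp, tmp;  j{e,ne} label;
     relying on ptest setting ZF when the AND of its two (here identical)
     operands is all zeros, i.e. when op0 == op1.  */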
      tmp = ix86_expand_compare (code, op0, op1);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
				  gen_rtx_LABEL_REF (VOIDmode, label),
				  pc_rtx);
      emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      return;
      /* DI and TI mode equality/inequality comparisons may be performed
	 on SSE registers.  Avoid splitting them, except when optimizing
	 for size.  */
      if ((code == EQ || code == NE)
	  && !optimize_insn_for_size_p ())

      /* Expand DImode branch into multiple compare+branch.  */
      {
	rtx_code_label *label2;
	enum rtx_code code1, code2, code3;
	machine_mode submode;

	if (CONSTANT_P (op0) && !CONSTANT_P (op1))
	  {
	    std::swap (op0, op1);
	    code = swap_condition (code);
	  }

	split_double_mode (mode, &op0, 1, lo+0, hi+0);
	split_double_mode (mode, &op1, 1, lo+1, hi+1);

	submode = mode == DImode ? SImode : DImode;

	/* If we are doing less-than or greater-or-equal-than,
	   op1 is a constant and the low word is zero, then we can just
	   examine the high word.  Similarly for low word -1 and
	   less-or-equal-than or greater-than.  */
	if (CONST_INT_P (hi[1]))
	  switch (code)
	    {
	    case LT: case LTU: case GE: case GEU:
	      if (lo[1] == const0_rtx)
		ix86_expand_branch (code, hi[0], hi[1], label);

	    case LE: case LEU: case GT: case GTU:
	      if (lo[1] == constm1_rtx)
		ix86_expand_branch (code, hi[0], hi[1], label);
	    }

	/* Emulate comparisons that do not depend on Zero flag with
	   double-word subtraction.  Note that only Overflow, Sign
	   and Carry flags are valid, so swap arguments and condition
	   of comparisons that would otherwise test Zero flag.  */

	switch (code)
	  {
	  case LE: case LEU: case GT: case GTU:
	    std::swap (lo[0], lo[1]);
	    std::swap (hi[0], hi[1]);
	    code = swap_condition (code);

	  case LT: case LTU: case GE: case GEU:
	    {
	      bool uns = (code == LTU || code == GEU);
	      rtx (*sbb_insn) (machine_mode, rtx, rtx, rtx)
		= uns ? gen_sub3_carry_ccc : gen_sub3_carry_ccgz;

	      if (!nonimmediate_operand (lo[0], submode))
		lo[0] = force_reg (submode, lo[0]);
	      if (!x86_64_general_operand (lo[1], submode))
		lo[1] = force_reg (submode, lo[1]);

	      if (!register_operand (hi[0], submode))
		hi[0] = force_reg (submode, hi[0]);
	      if ((uns && !nonimmediate_operand (hi[1], submode))
		  || (!uns && !x86_64_general_operand (hi[1], submode)))
		hi[1] = force_reg (submode, hi[1]);

	      emit_insn (gen_cmp_1 (submode, lo[0], lo[1]));

	      tmp = gen_rtx_SCRATCH (submode);
	      emit_insn (sbb_insn (submode, tmp, hi[0], hi[1]));

	      tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
	      ix86_expand_branch (code, tmp, const0_rtx, label);
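	      /* Roughly, the sequence emitted here is a compare of the low
		 words followed by a subtract-with-borrow of the high words
		 whose result is discarded (the SCRATCH); the borrow chain
		 leaves the Carry (or Sign/Overflow) flags describing the
		 full double-word comparison, which the final branch tests
		 against zero.  */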
	/* Otherwise, we need two or three jumps.  */

	label2 = gen_label_rtx ();

	code2 = swap_condition (code);
	code3 = unsigned_condition (code);

	switch (code)
	  {
	  case LT: case GT: case LTU: case GTU:

	  case LE:   code1 = LT;  code2 = GT;  break;
	  case GE:   code1 = GT;  code2 = LT;  break;
	  case LEU:  code1 = LTU; code2 = GTU; break;
	  case GEU:  code1 = GTU; code2 = LTU; break;

	  case EQ:   code1 = UNKNOWN; code2 = NE; break;
	  case NE:   code2 = UNKNOWN; break;
	  }

	/*
	 * if (hi(a) < hi(b)) goto true;
	 * if (hi(a) > hi(b)) goto false;
	 * if (lo(a) < lo(b)) goto true;
	 */

	if (code1 != UNKNOWN)
	  ix86_expand_branch (code1, hi[0], hi[1], label);
	if (code2 != UNKNOWN)
	  ix86_expand_branch (code2, hi[0], hi[1], label2);

	ix86_expand_branch (code3, lo[0], lo[1], label);

	if (code2 != UNKNOWN)
	  emit_label (label2);

      gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
/* Figure out whether to use unordered fp comparisons.  */

bool
ix86_unordered_fp_compare (enum rtx_code code)
{
  if (!TARGET_IEEE_FP)
    return false;

/* Return a comparison we can do that is equivalent to
   swap_condition (code), apart possibly from orderedness.
   But, never change orderedness if TARGET_IEEE_FP, returning
   UNKNOWN in that case if necessary.  */

static enum rtx_code
ix86_fp_swap_condition (enum rtx_code code)
{
  switch (code)
    {
    case GT:			/* GTU - CF=0 & ZF=0 */
      return TARGET_IEEE_FP ? UNKNOWN : UNLT;
    case GE:			/* GEU - CF=0 */
      return TARGET_IEEE_FP ? UNKNOWN : UNLE;
    case UNLT:			/* LTU - CF=1 */
      return TARGET_IEEE_FP ? UNKNOWN : GT;
    case UNLE:			/* LEU - CF=1 | ZF=1 */
      return TARGET_IEEE_FP ? UNKNOWN : GE;
    default:
      return swap_condition (code);
    }
}
/* Return cost of comparison CODE using the best strategy for performance.
   All following functions use the number of instructions as a cost metric.
   In future this should be tweaked to compute bytes for optimize_size and
   take into account performance of various instructions on various CPUs.  */

static int
ix86_fp_comparison_cost (enum rtx_code code)
{
  /* The cost of code using bit-twiddling on %ah.  */
      arith_cost = TARGET_IEEE_FP ? 5 : 4;
      arith_cost = TARGET_IEEE_FP ? 6 : 4;

  switch (ix86_fp_comparison_strategy (code))
    {
    case IX86_FPCMP_COMI:
      return arith_cost > 4 ? 3 : 2;
    case IX86_FPCMP_SAHF:
      return arith_cost > 4 ? 4 : 3;
/* Swap, force into registers, or otherwise massage the two operands
   to a fp comparison.  The operands are updated in place; the new
   comparison code is returned.  */

static enum rtx_code
ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
{
  bool unordered_compare = ix86_unordered_fp_compare (code);
  rtx op0 = *pop0, op1 = *pop1;
  machine_mode op_mode = GET_MODE (op0);
  bool is_sse = SSE_FLOAT_MODE_SSEMATH_OR_HF_P (op_mode);

  if (op_mode == BFmode)
    {
      rtx op = gen_lowpart (HImode, op0);
      if (CONST_INT_P (op))
	op = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,

      rtx t1 = gen_reg_rtx (SImode);
      emit_insn (gen_zero_extendhisi2 (t1, op));
      emit_insn (gen_ashlsi3 (t1, t1, GEN_INT (16)));
      op = gen_lowpart (SFmode, t1);

      op = gen_lowpart (HImode, op1);
      if (CONST_INT_P (op))
	op = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,

      rtx t1 = gen_reg_rtx (SImode);
      emit_insn (gen_zero_extendhisi2 (t1, op));
      emit_insn (gen_ashlsi3 (t1, t1, GEN_INT (16)));
      op = gen_lowpart (SFmode, t1);

      return ix86_prepare_fp_compare_args (code, pop0, pop1);
    }

  /* All of the unordered compare instructions only work on registers.
     The same is true of the fcomi compare instructions.  The XFmode
     compare instructions require registers except when comparing
     against zero or when converting operand 1 from fixed point to
     floating point.  */

      && (unordered_compare
	  || (op_mode == XFmode
	      && ! (standard_80387_constant_p (op0) == 1
		    || standard_80387_constant_p (op1) == 1)
	      && GET_CODE (op1) != FLOAT)
	  || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
    {
      op0 = force_reg (op_mode, op0);
      op1 = force_reg (op_mode, op1);
    }

      /* %%% We only allow op1 in memory; op0 must be st(0).  So swap
	 things around if they appear profitable, otherwise force op0
	 into a register.  */

      if (standard_80387_constant_p (op0) == 0
	  && ! (standard_80387_constant_p (op1) == 0
	{
	  enum rtx_code new_code = ix86_fp_swap_condition (code);
	  if (new_code != UNKNOWN)
	    std::swap (op0, op1);
	}

      op0 = force_reg (op_mode, op0);

      if (CONSTANT_P (op1))
	{
	  int tmp = standard_80387_constant_p (op1);
	  op1 = validize_mem (force_const_mem (op_mode, op1));
	  op1 = force_reg (op_mode, op1);
	}

      op1 = force_reg (op_mode, op1);

  /* Try to rearrange the comparison to make it cheaper.  */
  if (ix86_fp_comparison_cost (code)
      > ix86_fp_comparison_cost (swap_condition (code))
      && (REG_P (op1) || can_create_pseudo_p ()))
    {
      std::swap (op0, op1);
      code = swap_condition (code);
      op0 = force_reg (op_mode, op0);
    }
/* Generate insn patterns to do a floating point compare of OPERANDS.  */

static rtx
ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)
{
  bool unordered_compare = ix86_unordered_fp_compare (code);
  machine_mode cmp_mode;

  code = ix86_prepare_fp_compare_args (code, &op0, &op1);

  tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
  if (unordered_compare)
    tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);

  /* Do fcomi/sahf based test when profitable.  */
  switch (ix86_fp_comparison_strategy (code))
    {
    case IX86_FPCMP_COMI:
      cmp_mode = CCFPmode;
      emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
      break;

    case IX86_FPCMP_SAHF:
      cmp_mode = CCFPmode;
      tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
      scratch = gen_reg_rtx (HImode);
      emit_insn (gen_rtx_SET (scratch, tmp));
      emit_insn (gen_x86_sahf_1 (scratch));
      break;

    case IX86_FPCMP_ARITH:
      cmp_mode = CCNOmode;
      tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
      scratch = gen_reg_rtx (HImode);
      emit_insn (gen_rtx_SET (scratch, tmp));

      /* In the unordered case, we have to check C2 for NaN's, which
	 doesn't happen to work out to anything nice combination-wise.
	 So do some bit twiddling on the value we've got in AH to come
	 up with an appropriate set of condition codes.  */

	  if (code == GT || !TARGET_IEEE_FP)
	    emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));

	  emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	  emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
	  emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));

	  if (code == LT && TARGET_IEEE_FP)
	    emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));

	  emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
	  emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));

	  if (code == GE || !TARGET_IEEE_FP)
	    emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));

	  emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	  emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));

	  if (code == LE && TARGET_IEEE_FP)
	    emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));

	  emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
	  emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
	  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));

	  if (code == EQ && TARGET_IEEE_FP)
	    emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));

	  emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
	  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));

	  if (code == NE && TARGET_IEEE_FP)
	    emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));

	  emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
	  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));

	  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
	  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
    }

  /* Return the test that should be put into the flags user, i.e.
     the bcc, scc, or cmov instruction.  */
  return gen_rtx_fmt_ee (code, VOIDmode,
			 gen_rtx_REG (cmp_mode, FLAGS_REG),
			 const0_rtx);
}
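/* A note on the magic constants used above: after an FNSTSW the FPU
   condition bits land in %ah as C0 = 0x01, C2 = 0x04 and C3 = 0x40, so
   for example 0x45 tests C0|C2|C3 while 0x40 tests just C3 (the "equal"
   bit); the unordered (NaN) result is signalled via C2.  */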
/* Generate insn patterns to do an integer compare of OPERANDS.  */

static rtx
ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
{
  machine_mode cmpmode;

  /* Swap operands to emit carry flag comparison.  */
  if ((code == GTU || code == LEU)
      && nonimmediate_operand (op1, VOIDmode))
    {
      std::swap (op0, op1);
      code = swap_condition (code);
    }

  cmpmode = SELECT_CC_MODE (code, op0, op1);
  flags = gen_rtx_REG (cmpmode, FLAGS_REG);

  /* This is very simple, but making the interface the same as in the
     FP case makes the rest of the code easier.  */
  tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
  emit_insn (gen_rtx_SET (flags, tmp));

  /* Return the test that should be put into the flags user, i.e.
     the bcc, scc, or cmov instruction.  */
  return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
}
static rtx
ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
{
  if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
    ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
  else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
    {
      gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
      ret = ix86_expand_fp_compare (code, op0, op1);
    }
  else
    ret = ix86_expand_int_compare (code, op0, op1);

  return ret;
}

void
ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
{
  gcc_assert (GET_MODE (dest) == QImode);

  ret = ix86_expand_compare (code, op0, op1);
  PUT_MODE (ret, QImode);
  emit_insn (gen_rtx_SET (dest, ret));
}
/* Expand floating point op0 <=> op1, i.e.
   dest = op0 == op1 ? 0 : op0 < op1 ? -1 : op0 > op1 ? 1 : 2.  */

void
ix86_expand_fp_spaceship (rtx dest, rtx op0, rtx op1)
{
  gcc_checking_assert (ix86_fp_comparison_strategy (GT) != IX86_FPCMP_ARITH);
  rtx gt = ix86_expand_fp_compare (GT, op0, op1);
  rtx l0 = gen_label_rtx ();
  rtx l1 = gen_label_rtx ();
  rtx l2 = TARGET_IEEE_FP ? gen_label_rtx () : NULL_RTX;
  rtx lend = gen_label_rtx ();

  rtx un = gen_rtx_fmt_ee (UNORDERED, VOIDmode,
			   gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx);
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, un,
			      gen_rtx_LABEL_REF (VOIDmode, l2), pc_rtx);
  jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  add_reg_br_prob_note (jmp, profile_probability::very_unlikely ());

  rtx eq = gen_rtx_fmt_ee (UNEQ, VOIDmode,
			   gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx);
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, eq,
			      gen_rtx_LABEL_REF (VOIDmode, l0), pc_rtx);
  jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  add_reg_br_prob_note (jmp, profile_probability::unlikely ());

  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, gt,
			      gen_rtx_LABEL_REF (VOIDmode, l1), pc_rtx);
  jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  add_reg_br_prob_note (jmp, profile_probability::even ());

  emit_move_insn (dest, constm1_rtx);

  emit_move_insn (dest, const0_rtx);

  emit_move_insn (dest, const1_rtx);

  emit_move_insn (dest, const2_rtx);
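  /* For illustration, the control flow generated above is roughly:
       if (unordered)  goto l2;	  dest = 2   (only with TARGET_IEEE_FP)
       if (op0 == op1) goto l0;	  dest = 0
       if (op0 >  op1) goto l1;	  dest = 1
       dest = -1;		  op0 < op1
     which matches the 0/-1/1/2 encoding stated in the function comment.  */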
/* Expand comparison setting or clearing carry flag.  Return true when
   successful and set pop for the operation.  */

static bool
ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
{
  machine_mode mode
    = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);

  /* Do not handle double-mode compares that go through special path.  */
  if (mode == (TARGET_64BIT ? TImode : DImode))

  if (SCALAR_FLOAT_MODE_P (mode))
    {
      rtx_insn *compare_seq;

      gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));

      /* Shortcut: the following common codes never translate
	 into carry flag compares.  */
      if (code == EQ || code == NE || code == UNEQ || code == LTGT
	  || code == ORDERED || code == UNORDERED)

      /* These comparisons require zero flag; swap operands so they won't.  */
      if ((code == GT || code == UNLE || code == LE || code == UNGT)
	{
	  std::swap (op0, op1);
	  code = swap_condition (code);
	}

      /* Try to expand the comparison and verify that we end up with
	 a carry flag based comparison.  This fails to be true only when
	 we decide to expand the comparison using arithmetic, which is
	 not a common scenario.  */
      compare_op = ix86_expand_fp_compare (code, op0, op1);
      compare_seq = get_insns ();

      if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
	code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
      else
	code = GET_CODE (compare_op);

      if (code != LTU && code != GEU)

      emit_insn (compare_seq);
    }

  if (!INTEGRAL_MODE_P (mode))

  /* Convert a==0 into (unsigned)a<1.  */
  if (op1 != const0_rtx)
  code = (code == EQ ? LTU : GEU);

  /* Convert a>b into b<a or a>=b-1.  */
  if (CONST_INT_P (op1))
    {
      op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
      /* Bail out on overflow.  We still can swap operands but that
	 would force loading of the constant into a register.  */
      if (op1 == const0_rtx
	  || !x86_64_immediate_operand (op1, GET_MODE (op1)))
      code = (code == GTU ? GEU : LTU);
    }

      std::swap (op0, op1);
      code = (code == GTU ? LTU : GEU);

  /* Convert a>=0 into (unsigned)a<0x80000000.  */
  if (mode == DImode || op1 != const0_rtx)
  op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
  code = (code == LT ? GEU : LTU);

  if (mode == DImode || op1 != constm1_rtx)
  op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
  code = (code == LE ? GEU : LTU);

  /* Swapping operands may cause constant to appear as first operand.  */
  if (!nonimmediate_operand (op0, VOIDmode))
    {
      if (!can_create_pseudo_p ())
      op0 = force_reg (mode, op0);
    }

  *pop = ix86_expand_compare (code, op0, op1);
  gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
/* Expand conditional increment or decrement using adc/sbb instructions.
   The default case using setcc followed by the conditional move can be
   done by generic code.  */

bool
ix86_expand_int_addcc (rtx operands[])
{
  enum rtx_code code = GET_CODE (operands[1]);
  rtx (*insn) (machine_mode, rtx, rtx, rtx, rtx, rtx);
  rtx val = const0_rtx;
  rtx op0 = XEXP (operands[1], 0);
  rtx op1 = XEXP (operands[1], 1);

  if (operands[3] != const1_rtx
      && operands[3] != constm1_rtx)
    return false;
  if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
    return false;
  code = GET_CODE (compare_op);

  flags = XEXP (compare_op, 0);

  if (GET_MODE (flags) == CCFPmode)
    {
      code = ix86_fp_compare_code_to_integer (code);

      PUT_CODE (compare_op,
		reverse_condition_maybe_unordered
		  (GET_CODE (compare_op)));
    }

      PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));

  mode = GET_MODE (operands[0]);

  /* Construct either adc or sbb insn.  */
  if ((code == LTU) == (operands[3] == constm1_rtx))
    insn = gen_sub3_carry;
  else
    insn = gen_add3_carry;

  emit_insn (insn (mode, operands[0], operands[2], val, flags, compare_op));
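  /* Sketch of the intent (not emitted literally): for
       x = (a < b) ? x + 1 : x
     the comparison is arranged to set the carry flag and the whole
     conditional increment becomes
       cmp a, b
       adc $0, x
     with the sbb form used instead when the adjustment is -1 or the
     condition is reversed.  */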
bool
ix86_expand_int_movcc (rtx operands[])
{
  enum rtx_code code = GET_CODE (operands[1]), compare_code;
  rtx_insn *compare_seq;
  machine_mode mode = GET_MODE (operands[0]);
  bool sign_bit_compare_p = false;
  bool negate_cc_compare_p = false;
  rtx op0 = XEXP (operands[1], 0);
  rtx op1 = XEXP (operands[1], 1);
  rtx op2 = operands[2];
  rtx op3 = operands[3];

  if (GET_MODE (op0) == TImode
      || (GET_MODE (op0) == DImode

  if (GET_MODE (op0) == BFmode
      && !ix86_fp_comparison_operator (operands[1], VOIDmode))

  compare_op = ix86_expand_compare (code, op0, op1);
  compare_seq = get_insns ();

  compare_code = GET_CODE (compare_op);

  if ((op1 == const0_rtx && (code == GE || code == LT))
      || (op1 == constm1_rtx && (code == GT || code == LE)))
    sign_bit_compare_p = true;

  /* op0 == op1 ? op0 : op3 is equivalent to op0 == op1 ? op1 : op3,
     but if op1 is a constant, the latter form allows more optimizations,
     either through the last 2 ops being constant handling, or the one
     constant and one variable cases.  On the other side, for cmov the
     former might be better as we don't need to load the constant into
     another register.  */
  if (code == EQ && CONST_INT_P (op1) && rtx_equal_p (op0, op2))
  /* Similarly for op0 != op1 ? op2 : op0 and op0 != op1 ? op2 : op1.  */
  else if (code == NE && CONST_INT_P (op1) && rtx_equal_p (op0, op3))

  /* Don't attempt mode expansion here -- if we had to expand 5 or 6
     HImode insns, we'd be swallowed in word prefix ops.  */

  if ((mode != HImode || TARGET_FAST_PREFIX)
      && (mode != (TARGET_64BIT ? TImode : DImode))
      && CONST_INT_P (op2)
      && CONST_INT_P (op3))
    {
      rtx out = operands[0];
      HOST_WIDE_INT ct = INTVAL (op2);
      HOST_WIDE_INT cf = INTVAL (op3);
3278 || (TARGET_64BIT
&& mode
== DImode
))
3279 && (GET_MODE (op0
) == SImode
3280 || (TARGET_64BIT
&& GET_MODE (op0
) == DImode
)))
3282 /* Special case x != 0 ? -1 : y. */
3283 if (code
== NE
&& op1
== const0_rtx
&& ct
== -1)
3285 negate_cc_compare_p
= true;
3289 else if (code
== EQ
&& op1
== const0_rtx
&& cf
== -1)
3290 negate_cc_compare_p
= true;
3294 /* Sign bit compares are better done using shifts than we do by using
3296 if (sign_bit_compare_p
3297 || negate_cc_compare_p
3298 || ix86_expand_carry_flag_compare (code
, op0
, op1
, &compare_op
))
3300 /* Detect overlap between destination and compare sources. */
3303 if (negate_cc_compare_p
)
3305 if (GET_MODE (op0
) == DImode
)
3306 emit_insn (gen_x86_negdi_ccc (gen_reg_rtx (DImode
), op0
));
3308 emit_insn (gen_x86_negsi_ccc (gen_reg_rtx (SImode
),
3309 gen_lowpart (SImode
, op0
)));
3311 tmp
= gen_reg_rtx (mode
);
3313 emit_insn (gen_x86_movdicc_0_m1_neg (tmp
));
3315 emit_insn (gen_x86_movsicc_0_m1_neg (gen_lowpart (SImode
,
3318 else if (!sign_bit_compare_p
)
3323 compare_code
= GET_CODE (compare_op
);
3325 flags
= XEXP (compare_op
, 0);
3327 if (GET_MODE (flags
) == CCFPmode
)
3331 = ix86_fp_compare_code_to_integer (compare_code
);
3334 /* To simplify rest of code, restrict to the GEU case. */
3335 if (compare_code
== LTU
)
3338 compare_code
= reverse_condition (compare_code
);
3339 code
= reverse_condition (code
);
3344 PUT_CODE (compare_op
,
3345 reverse_condition_maybe_unordered
3346 (GET_CODE (compare_op
)));
3348 PUT_CODE (compare_op
,
3349 reverse_condition (GET_CODE (compare_op
)));
3353 if (reg_overlap_mentioned_p (out
, compare_op
))
3354 tmp
= gen_reg_rtx (mode
);
3357 emit_insn (gen_x86_movdicc_0_m1 (tmp
, flags
, compare_op
));
3359 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode
, tmp
),
3360 flags
, compare_op
));
3364 if (code
== GT
|| code
== GE
)
3365 code
= reverse_condition (code
);
3371 tmp
= emit_store_flag (tmp
, code
, op0
, op1
, VOIDmode
, 0, -1);
3384 tmp
= expand_simple_binop (mode
, PLUS
,
3386 copy_rtx (tmp
), 1, OPTAB_DIRECT
);
3397 tmp
= expand_simple_binop (mode
, IOR
,
3399 copy_rtx (tmp
), 1, OPTAB_DIRECT
);
3401 else if (diff
== -1 && ct
)
3411 tmp
= expand_simple_unop (mode
, NOT
, tmp
, copy_rtx (tmp
), 1);
3413 tmp
= expand_simple_binop (mode
, PLUS
,
3414 copy_rtx (tmp
), GEN_INT (cf
),
3415 copy_rtx (tmp
), 1, OPTAB_DIRECT
);
3423 * andl cf - ct, dest
3433 tmp
= expand_simple_unop (mode
, NOT
, tmp
, copy_rtx (tmp
), 1);
3436 tmp
= expand_simple_binop (mode
, AND
,
3438 gen_int_mode (cf
- ct
, mode
),
3439 copy_rtx (tmp
), 1, OPTAB_DIRECT
);
3441 tmp
= expand_simple_binop (mode
, PLUS
,
3442 copy_rtx (tmp
), GEN_INT (ct
),
3443 copy_rtx (tmp
), 1, OPTAB_DIRECT
);
3446 if (!rtx_equal_p (tmp
, out
))
3447 emit_move_insn (copy_rtx (out
), copy_rtx (tmp
));
3454 machine_mode cmp_mode
= GET_MODE (op0
);
3455 enum rtx_code new_code
;
3457 if (SCALAR_FLOAT_MODE_P (cmp_mode
))
3459 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode
));
3461 /* We may be reversing a non-trapping
3462 comparison to a trapping comparison. */
3463 if (HONOR_NANS (cmp_mode
) && flag_trapping_math
3464 && code
!= EQ
&& code
!= NE
3465 && code
!= ORDERED
&& code
!= UNORDERED
)
3468 new_code
= reverse_condition_maybe_unordered (code
);
3471 new_code
= ix86_reverse_condition (code
, cmp_mode
);
3472 if (new_code
!= UNKNOWN
)
3480 compare_code
= UNKNOWN
;
3481 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
3482 && CONST_INT_P (op1
))
3484 if (op1
== const0_rtx
3485 && (code
== LT
|| code
== GE
))
3486 compare_code
= code
;
3487 else if (op1
== constm1_rtx
)
3491 else if (code
== GT
)
3496 /* Optimize dest = (op0 < 0) ? -1 : cf. */
3497 if (compare_code
!= UNKNOWN
3498 && GET_MODE (op0
) == GET_MODE (out
)
3499 && (cf
== -1 || ct
== -1))
3501 /* If lea code below could be used, only optimize
3502 if it results in a 2 insn sequence. */
3504 if (! (diff
== 1 || diff
== 2 || diff
== 4 || diff
== 8
3505 || diff
== 3 || diff
== 5 || diff
== 9)
3506 || (compare_code
== LT
&& ct
== -1)
3507 || (compare_code
== GE
&& cf
== -1))
3510 * notl op1 (if necessary)
3518 code
= reverse_condition (code
);
3521 out
= emit_store_flag (out
, code
, op0
, op1
, VOIDmode
, 0, -1);
3523 out
= expand_simple_binop (mode
, IOR
,
3525 out
, 1, OPTAB_DIRECT
);
3526 if (out
!= operands
[0])
3527 emit_move_insn (operands
[0], out
);
3534 if ((diff
== 1 || diff
== 2 || diff
== 4 || diff
== 8
3535 || diff
== 3 || diff
== 5 || diff
== 9)
3536 && ((mode
!= QImode
&& mode
!= HImode
) || !TARGET_PARTIAL_REG_STALL
)
3538 || x86_64_immediate_operand (GEN_INT (cf
), VOIDmode
)))
3544 * lea cf(dest*(ct-cf)),dest
3548 * This also catches the degenerate setcc-only case.
3554 out
= emit_store_flag (out
, code
, op0
, op1
, VOIDmode
, 0, 1);
3557 /* On x86_64 the lea instruction operates on Pmode, so we need
3558 to get arithmetics done in proper mode to match. */
3560 tmp
= copy_rtx (out
);
3564 out1
= copy_rtx (out
);
3565 tmp
= gen_rtx_MULT (mode
, out1
, GEN_INT (diff
& ~1));
3569 tmp
= gen_rtx_PLUS (mode
, tmp
, out1
);
3575 tmp
= plus_constant (mode
, tmp
, cf
);
3578 if (!rtx_equal_p (tmp
, out
))
3581 out
= force_operand (tmp
, copy_rtx (out
));
3583 emit_insn (gen_rtx_SET (copy_rtx (out
), copy_rtx (tmp
)));
3585 if (!rtx_equal_p (out
, operands
[0]))
3586 emit_move_insn (operands
[0], copy_rtx (out
));
3592 * General case: Jumpful:
3593 * xorl dest,dest cmpl op1, op2
3594 * cmpl op1, op2 movl ct, dest
3596 * decl dest movl cf, dest
3597 * andl (cf-ct),dest 1:
3602 * This is reasonably steep, but branch mispredict costs are
3603 * high on modern cpus, so consider failing only if optimizing
3607 if ((!TARGET_CMOVE
|| (mode
== QImode
&& TARGET_PARTIAL_REG_STALL
))
3608 && BRANCH_COST (optimize_insn_for_speed_p (),
3613 machine_mode cmp_mode
= GET_MODE (op0
);
3614 enum rtx_code new_code
;
3616 if (SCALAR_FLOAT_MODE_P (cmp_mode
))
3618 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode
));
3620 /* We may be reversing a non-trapping
3621 comparison to a trapping comparison. */
3622 if (HONOR_NANS (cmp_mode
) && flag_trapping_math
3623 && code
!= EQ
&& code
!= NE
3624 && code
!= ORDERED
&& code
!= UNORDERED
)
3627 new_code
= reverse_condition_maybe_unordered (code
);
3632 new_code
= ix86_reverse_condition (code
, cmp_mode
);
3633 if (compare_code
!= UNKNOWN
&& new_code
!= UNKNOWN
)
3634 compare_code
= reverse_condition (compare_code
);
3637 if (new_code
!= UNKNOWN
)
3645 if (compare_code
!= UNKNOWN
)
3647 /* notl op1 (if needed)
3652 For x < 0 (resp. x <= -1) there will be no notl,
3653 so if possible swap the constants to get rid of the
3655 True/false will be -1/0 while code below (store flag
3656 followed by decrement) is 0/-1, so the constants need
3657 to be exchanged once more. */
3659 if (compare_code
== GE
|| !cf
)
3661 code
= reverse_condition (code
);
3667 out
= emit_store_flag (out
, code
, op0
, op1
, VOIDmode
, 0, -1);
3671 out
= emit_store_flag (out
, code
, op0
, op1
, VOIDmode
, 0, 1);
3673 out
= expand_simple_binop (mode
, PLUS
, copy_rtx (out
),
3675 copy_rtx (out
), 1, OPTAB_DIRECT
);
3678 out
= expand_simple_binop (mode
, AND
, copy_rtx (out
),
3679 gen_int_mode (cf
- ct
, mode
),
3680 copy_rtx (out
), 1, OPTAB_DIRECT
);
3682 out
= expand_simple_binop (mode
, PLUS
, copy_rtx (out
), GEN_INT (ct
),
3683 copy_rtx (out
), 1, OPTAB_DIRECT
);
3684 if (!rtx_equal_p (out
, operands
[0]))
3685 emit_move_insn (operands
[0], copy_rtx (out
));
3691 if (!TARGET_CMOVE
|| (mode
== QImode
&& TARGET_PARTIAL_REG_STALL
))
3693 /* Try a few things more with specific constants and a variable. */
3696 rtx var
, orig_out
, out
, tmp
;
3698 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
3704 /* If one of the two operands is an interesting constant, load a
3705 constant with the above and mask it in with a logical operation. */
3707 if (CONST_INT_P (operands
[2]))
3710 if (INTVAL (operands
[2]) == 0 && operands
[3] != constm1_rtx
)
3711 operands
[3] = constm1_rtx
, op
= and_optab
;
3712 else if (INTVAL (operands
[2]) == -1 && operands
[3] != const0_rtx
)
3713 operands
[3] = const0_rtx
, op
= ior_optab
;
3717 else if (CONST_INT_P (operands
[3]))
3720 if (INTVAL (operands
[3]) == 0 && operands
[2] != constm1_rtx
)
3722 /* For smin (x, 0), expand as "x < 0 ? x : 0" instead of
3723 "x <= 0 ? x : 0" to enable sign_bit_compare_p. */
3724 if (code
== LE
&& op1
== const0_rtx
&& rtx_equal_p (op0
, var
))
3725 operands
[1] = simplify_gen_relational (LT
, VOIDmode
,
3729 operands
[2] = constm1_rtx
;
3732 else if (INTVAL (operands
[3]) == -1 && operands
[3] != const0_rtx
)
3733 operands
[2] = const0_rtx
, op
= ior_optab
;
3740 orig_out
= operands
[0];
3741 tmp
= gen_reg_rtx (mode
);
3744 /* Recurse to get the constant loaded. */
3745 if (!ix86_expand_int_movcc (operands
))
3748 /* Mask in the interesting variable. */
3749 out
= expand_binop (mode
, op
, var
, tmp
, orig_out
, 0,
3751 if (!rtx_equal_p (out
, orig_out
))
3752 emit_move_insn (copy_rtx (orig_out
), copy_rtx (out
));
3758 * For comparison with above,
3768 if (! nonimmediate_operand (operands
[2], mode
))
3769 operands
[2] = force_reg (mode
, operands
[2]);
3770 if (! nonimmediate_operand (operands
[3], mode
))
3771 operands
[3] = force_reg (mode
, operands
[3]);
3773 if (! register_operand (operands
[2], VOIDmode
)
3775 || ! register_operand (operands
[3], VOIDmode
)))
3776 operands
[2] = force_reg (mode
, operands
[2]);
3779 && ! register_operand (operands
[3], VOIDmode
))
3780 operands
[3] = force_reg (mode
, operands
[3]);
3782 emit_insn (compare_seq
);
3783 emit_insn (gen_rtx_SET (operands
[0],
3784 gen_rtx_IF_THEN_ELSE (mode
,
3785 compare_op
, operands
[2],
/* Detect conditional moves that exactly match min/max operational
   semantics.  Note that this is IEEE safe, as long as we don't
   interchange the operands.

   Returns FALSE if this conditional move doesn't match a MIN/MAX,
   and TRUE if the operation is successful and instructions are emitted.  */

static bool
ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
			   rtx cmp_op1, rtx if_true, rtx if_false)
{
  else if (code == UNGE)
    std::swap (if_true, if_false);

  if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
  else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))

  mode = GET_MODE (dest);

  /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
     but MODE may be a vector mode and thus not appropriate.  */
  if (!flag_finite_math_only || flag_signed_zeros)
    {
      int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;

      if_true = force_reg (mode, if_true);
      v = gen_rtvec (2, if_true, if_false);
      tmp = gen_rtx_UNSPEC (mode, v, u);
    }
  else
    {
      code = is_min ? SMIN : SMAX;
      if (MEM_P (if_true) && MEM_P (if_false))
	if_true = force_reg (mode, if_true);
      tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
    }

  emit_insn (gen_rtx_SET (dest, tmp));
  return true;
}
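/* Background note on the choice above: the SSE min/max instructions are
   not symmetric - MINPS/MINSS, roughly speaking, return the second operand
   when the operands are unordered or compare equal - so the operand order
   established by the LT/UNGE canonicalization must be preserved (hence the
   early bail-outs rather than swapping) whenever NaNs or signed zeros
   matter.  */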
/* Return true if MODE is valid for vector compare to mask register,
   same result for conditional vector move with mask register.  */

static bool
ix86_valid_mask_cmp_mode (machine_mode mode)
{
  /* XOP has its own vector conditional movement.  */
  if (TARGET_XOP && !TARGET_AVX512F)
    return false;

  /* HFmode only supports vcmpsh whose dest is mask register.  */
  if (TARGET_AVX512FP16 && mode == HFmode)
    return true;

  /* AVX512F is needed for mask operation.  */
  if (!(TARGET_AVX512F && VECTOR_MODE_P (mode)))
    return false;

  /* AVX512BW is needed for vector QI/HImode,
     AVX512VL is needed for 128/256-bit vector.  */
  machine_mode inner_mode = GET_MODE_INNER (mode);
  int vector_size = GET_MODE_SIZE (mode);
  if ((inner_mode == QImode || inner_mode == HImode) && !TARGET_AVX512BW)
    return false;

  return vector_size == 64 || TARGET_AVX512VL;
}
/* Return true if integer mask comparison should be used.  */

static bool
ix86_use_mask_cmp_p (machine_mode mode, machine_mode cmp_mode,
		     rtx op_true, rtx op_false)
{
  int vector_size = GET_MODE_SIZE (mode);

  if (cmp_mode == HFmode)
  else if (vector_size < 16)
  else if (vector_size == 64)
  else if (GET_MODE_INNER (cmp_mode) == HFmode)

  /* When op_true is NULL, op_false must be NULL, or vice versa.  */
  gcc_assert (!op_true == !op_false);

  /* When op_true/op_false is NULL or cmp_mode is not valid mask cmp mode,
     vector dest is required.  */
  if (!op_true || !ix86_valid_mask_cmp_mode (cmp_mode))
    return false;

  /* Exclude those that could be optimized in ix86_expand_sse_movcc.  */
  if (op_false == CONST0_RTX (mode)
      || op_true == CONST0_RTX (mode)
      || (INTEGRAL_MODE_P (mode)
	  && (op_true == CONSTM1_RTX (mode)
	      || op_false == CONSTM1_RTX (mode))))
    return false;

  return true;
}
3909 ix86_expand_sse_cmp (rtx dest
, enum rtx_code code
, rtx cmp_op0
, rtx cmp_op1
,
3910 rtx op_true
, rtx op_false
)
3912 machine_mode mode
= GET_MODE (dest
);
3913 machine_mode cmp_ops_mode
= GET_MODE (cmp_op0
);
3915 /* In general case result of comparison can differ from operands' type. */
3916 machine_mode cmp_mode
;
3918 /* In AVX512F the result of comparison is an integer mask. */
3919 bool maskcmp
= false;
3922 if (ix86_use_mask_cmp_p (mode
, cmp_ops_mode
, op_true
, op_false
))
3924 unsigned int nbits
= GET_MODE_NUNITS (cmp_ops_mode
);
3926 cmp_mode
= nbits
> 8 ? int_mode_for_size (nbits
, 0).require () : E_QImode
;
3929 cmp_mode
= cmp_ops_mode
;
3931 cmp_op0
= force_reg (cmp_ops_mode
, cmp_op0
);
3933 bool (*op1_predicate
)(rtx
, machine_mode
)
3934 = VECTOR_MODE_P (cmp_ops_mode
) ? vector_operand
: nonimmediate_operand
;
3936 if (!op1_predicate (cmp_op1
, cmp_ops_mode
))
3937 cmp_op1
= force_reg (cmp_ops_mode
, cmp_op1
);
3940 || (maskcmp
&& cmp_mode
!= mode
)
3941 || (op_true
&& reg_overlap_mentioned_p (dest
, op_true
))
3942 || (op_false
&& reg_overlap_mentioned_p (dest
, op_false
)))
3943 dest
= gen_reg_rtx (maskcmp
? cmp_mode
: mode
);
3947 bool ok
= ix86_expand_mask_vec_cmp (dest
, code
, cmp_op0
, cmp_op1
);
3952 x
= gen_rtx_fmt_ee (code
, cmp_mode
, cmp_op0
, cmp_op1
);
3954 if (cmp_mode
!= mode
)
3956 x
= force_reg (cmp_ops_mode
, x
);
3957 convert_move (dest
, x
, false);
3960 emit_insn (gen_rtx_SET (dest
, x
));
/* Emit x86 binary operand CODE in mode MODE for SSE vector
   instructions that can be performed using GP registers.  */

static void
ix86_emit_vec_binop (enum rtx_code code, machine_mode mode,
		     rtx dst, rtx src1, rtx src2)
{
  tmp = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));

  if (GET_MODE_SIZE (mode) <= GET_MODE_SIZE (SImode)
      && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
    {
      rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
    }
3986 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
3987 operations. This is used for both scalar and vector conditional moves. */
3990 ix86_expand_sse_movcc (rtx dest
, rtx cmp
, rtx op_true
, rtx op_false
)
3992 machine_mode mode
= GET_MODE (dest
);
3993 machine_mode cmpmode
= GET_MODE (cmp
);
3996 /* Simplify trivial VEC_COND_EXPR to avoid ICE in pr97506. */
3997 if (rtx_equal_p (op_true
, op_false
))
3999 emit_move_insn (dest
, op_true
);
4003 /* If we have an integer mask and FP value then we need
4004 to cast mask to FP mode. */
4005 if (mode
!= cmpmode
&& VECTOR_MODE_P (cmpmode
))
4007 cmp
= force_reg (cmpmode
, cmp
);
4008 cmp
= gen_rtx_SUBREG (mode
, cmp
, 0);
4011 /* In AVX512F the result of comparison is an integer mask. */
4013 && GET_MODE_CLASS (cmpmode
) == MODE_INT
)
4015 gcc_assert (ix86_valid_mask_cmp_mode (mode
));
4016 /* Using scalar/vector move with mask register. */
4017 cmp
= force_reg (cmpmode
, cmp
);
4018 /* Optimize for mask zero. */
4019 op_true
= (op_true
!= CONST0_RTX (mode
)
4020 ? force_reg (mode
, op_true
) : op_true
);
4021 op_false
= (op_false
!= CONST0_RTX (mode
)
4022 ? force_reg (mode
, op_false
) : op_false
);
4023 if (op_true
== CONST0_RTX (mode
))
4025 if (cmpmode
== E_DImode
&& !TARGET_64BIT
)
4027 x
= gen_reg_rtx (cmpmode
);
4028 emit_insn (gen_knotdi (x
, cmp
));
4031 x
= expand_simple_unop (cmpmode
, NOT
, cmp
, NULL
, 1);
4033 /* Reverse op_true op_false. */
4034 std::swap (op_true
, op_false
);
4038 emit_insn (gen_movhf_mask (dest
, op_true
, op_false
, cmp
));
4040 emit_insn (gen_rtx_SET (dest
,
4041 gen_rtx_VEC_MERGE (mode
,
4042 op_true
, op_false
, cmp
)));
4046 if (vector_all_ones_operand (op_true
, mode
)
4047 && op_false
== CONST0_RTX (mode
))
4049 emit_move_insn (dest
, cmp
);
4052 else if (op_false
== CONST0_RTX (mode
))
4054 x
= expand_simple_binop (mode
, AND
, cmp
, op_true
,
4055 dest
, 1, OPTAB_DIRECT
);
4057 emit_move_insn (dest
, x
);
4060 else if (op_true
== CONST0_RTX (mode
))
4062 op_false
= force_reg (mode
, op_false
);
4063 x
= gen_rtx_NOT (mode
, cmp
);
4064 ix86_emit_vec_binop (AND
, mode
, dest
, x
, op_false
);
4067 else if (vector_all_ones_operand (op_true
, mode
))
4069 x
= expand_simple_binop (mode
, IOR
, cmp
, op_false
,
4070 dest
, 1, OPTAB_DIRECT
);
4072 emit_move_insn (dest
, x
);
4078 op_true
= force_reg (mode
, op_true
);
4080 if (GET_MODE_SIZE (mode
) < 16
4081 || !nonimmediate_operand (op_false
, mode
))
4082 op_false
= force_reg (mode
, op_false
);
4084 emit_insn (gen_rtx_SET (dest
,
4085 gen_rtx_IF_THEN_ELSE (mode
, cmp
,
4086 op_true
, op_false
)));
4090 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
) = NULL
;
4091 machine_mode blend_mode
= mode
;
4093 if (GET_MODE_SIZE (mode
) < 16
4094 || !vector_operand (op_true
, mode
))
4095 op_true
= force_reg (mode
, op_true
);
4097 op_false
= force_reg (mode
, op_false
);
4103 gen
= gen_mmx_blendvps
;
4107 gen
= gen_sse4_1_blendvps
;
4111 gen
= gen_sse4_1_blendvpd
;
4115 gen
= gen_sse4_1_blendvss
;
4119 gen
= gen_sse4_1_blendvsd
;
4126 gen
= gen_mmx_pblendvb_v8qi
;
4127 blend_mode
= V8QImode
;
4134 gen
= gen_mmx_pblendvb_v4qi
;
4135 blend_mode
= V4QImode
;
4140 gen
= gen_mmx_pblendvb_v2qi
;
4151 gen
= gen_sse4_1_pblendvb
;
4152 blend_mode
= V16QImode
;
4157 gen
= gen_avx_blendvps256
;
4161 gen
= gen_avx_blendvpd256
;
4171 gen
= gen_avx2_pblendvb
;
4172 blend_mode
= V32QImode
;
4177 gen
= gen_avx512bw_blendmv64qi
;
4180 gen
= gen_avx512bw_blendmv32hi
;
4183 gen
= gen_avx512bw_blendmv32hf
;
4186 gen
= gen_avx512bw_blendmv32bf
;
4189 gen
= gen_avx512f_blendmv16si
;
4192 gen
= gen_avx512f_blendmv8di
;
4195 gen
= gen_avx512f_blendmv8df
;
4198 gen
= gen_avx512f_blendmv16sf
;
4207 if (blend_mode
== mode
)
4211 x
= gen_reg_rtx (blend_mode
);
4212 op_false
= gen_lowpart (blend_mode
, op_false
);
4213 op_true
= gen_lowpart (blend_mode
, op_true
);
4214 cmp
= gen_lowpart (blend_mode
, cmp
);
4217 emit_insn (gen (x
, op_false
, op_true
, cmp
));
4220 emit_move_insn (dest
, gen_lowpart (mode
, x
));
      t2 = expand_simple_binop (mode, AND, op_true, cmp,
				NULL, 1, OPTAB_DIRECT);

      t3 = gen_reg_rtx (mode);
      x = gen_rtx_NOT (mode, cmp);
      ix86_emit_vec_binop (AND, mode, t3, x, op_false);

      x = expand_simple_binop (mode, IOR, t3, t2,
			       dest, 1, OPTAB_DIRECT);

      emit_move_insn (dest, x);
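      /* I.e. the generic fallback computes, in vector terms,
	   dest = (cmp & op_true) | (~cmp & op_false)
	 which relies on the comparison result being an all-ones /
	 all-zeros mask per element.  */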
/* Swap, force into registers, or otherwise massage the two operands
   to an sse comparison with a mask result.  Thus we differ a bit from
   ix86_prepare_fp_compare_args which expects to produce a flags result.

   The DEST operand exists to help determine whether to commute
   commutative operators.  The POP0/POP1 operands are updated in place.
   The new comparison code is returned, or UNKNOWN if not implementable.  */

static enum rtx_code
ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
				  rtx *pop0, rtx *pop1)
{
      /* AVX supports all the needed comparisons.  */

      /* We have no LTGT as an operator.  We could implement it with
	 NE & ORDERED, but this requires an extra temporary.  It's
	 not clear that it's worth it.  */

      /* These are supported directly.  */

      /* AVX has 3 operand comparisons, no need to swap anything.  */

      /* For commutative operators, try to canonicalize the destination
	 operand to be first in the comparison - this helps reload to
	 avoid extra moves.  */
      if (!dest || !rtx_equal_p (dest, *pop1))

      /* These are not supported directly before AVX, and furthermore
	 ix86_expand_sse_fp_minmax only optimizes LT/UNGE.  Swap the
	 comparison operands to transform into something that is
	 supported.  */
      std::swap (*pop0, *pop1);
      code = swap_condition (code);
4304 /* Expand a floating-point conditional move. Return true if successful. */
4307 ix86_expand_fp_movcc (rtx operands
[])
4309 machine_mode mode
= GET_MODE (operands
[0]);
4310 enum rtx_code code
= GET_CODE (operands
[1]);
4311 rtx tmp
, compare_op
;
4312 rtx op0
= XEXP (operands
[1], 0);
4313 rtx op1
= XEXP (operands
[1], 1);
4315 if (GET_MODE (op0
) == BFmode
4316 && !ix86_fp_comparison_operator (operands
[1], VOIDmode
))
4319 if (SSE_FLOAT_MODE_SSEMATH_OR_HF_P (mode
))
4323 /* Since we've no cmove for sse registers, don't force bad register
4324 allocation just to gain access to it. Deny movcc when the
4325 comparison mode doesn't match the move mode. */
4326 cmode
= GET_MODE (op0
);
4327 if (cmode
== VOIDmode
)
4328 cmode
= GET_MODE (op1
);
4332 code
= ix86_prepare_sse_fp_compare_args (operands
[0], code
, &op0
, &op1
);
4333 if (code
== UNKNOWN
)
4336 if (ix86_expand_sse_fp_minmax (operands
[0], code
, op0
, op1
,
4337 operands
[2], operands
[3]))
4340 tmp
= ix86_expand_sse_cmp (operands
[0], code
, op0
, op1
,
4341 operands
[2], operands
[3]);
4342 ix86_expand_sse_movcc (operands
[0], tmp
, operands
[2], operands
[3]);
4346 if (GET_MODE (op0
) == TImode
4347 || (GET_MODE (op0
) == DImode
4351 /* The floating point conditional move instructions don't directly
4352 support conditions resulting from a signed integer comparison. */
4354 compare_op
= ix86_expand_compare (code
, op0
, op1
);
4355 if (!fcmov_comparison_operator (compare_op
, VOIDmode
))
4357 tmp
= gen_reg_rtx (QImode
);
4358 ix86_expand_setcc (tmp
, code
, op0
, op1
);
4360 compare_op
= ix86_expand_compare (NE
, tmp
, const0_rtx
);
4363 emit_insn (gen_rtx_SET (operands
[0],
4364 gen_rtx_IF_THEN_ELSE (mode
, compare_op
,
4365 operands
[2], operands
[3])));
4370 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
4373 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code
)
4398 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
4401 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code
)
4438 /* Return immediate value to be used in UNSPEC_PCMP
4439 for comparison CODE in MODE. */
4442 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code
, machine_mode mode
)
4444 if (FLOAT_MODE_P (mode
))
4445 return ix86_fp_cmp_code_to_pcmp_immediate (code
);
4446 return ix86_int_cmp_code_to_pcmp_immediate (code
);
4449 /* Expand AVX-512 vector comparison. */
4452 ix86_expand_mask_vec_cmp (rtx dest
, enum rtx_code code
, rtx cmp_op0
, rtx cmp_op1
)
4454 machine_mode mask_mode
= GET_MODE (dest
);
4455 machine_mode cmp_mode
= GET_MODE (cmp_op0
);
4456 rtx imm
= GEN_INT (ix86_cmp_code_to_pcmp_immediate (code
, cmp_mode
));
4466 unspec_code
= UNSPEC_UNSIGNED_PCMP
;
4470 unspec_code
= UNSPEC_PCMP
;
4473 unspec
= gen_rtx_UNSPEC (mask_mode
, gen_rtvec (3, cmp_op0
, cmp_op1
, imm
),
4475 emit_insn (gen_rtx_SET (dest
, unspec
));
4480 /* Expand fp vector comparison. */
4483 ix86_expand_fp_vec_cmp (rtx operands
[])
4485 enum rtx_code code
= GET_CODE (operands
[1]);
4488 code
= ix86_prepare_sse_fp_compare_args (operands
[0], code
,
4489 &operands
[2], &operands
[3]);
4490 if (code
== UNKNOWN
)
4493 switch (GET_CODE (operands
[1]))
4496 temp
= ix86_expand_sse_cmp (operands
[0], ORDERED
, operands
[2],
4497 operands
[3], NULL
, NULL
);
4498 cmp
= ix86_expand_sse_cmp (operands
[0], NE
, operands
[2],
4499 operands
[3], NULL
, NULL
);
4503 temp
= ix86_expand_sse_cmp (operands
[0], UNORDERED
, operands
[2],
4504 operands
[3], NULL
, NULL
);
4505 cmp
= ix86_expand_sse_cmp (operands
[0], EQ
, operands
[2],
4506 operands
[3], NULL
, NULL
);
4512 cmp
= expand_simple_binop (GET_MODE (cmp
), code
, temp
, cmp
, cmp
, 1,
4516 cmp
= ix86_expand_sse_cmp (operands
[0], code
, operands
[2], operands
[3],
4519 if (operands
[0] != cmp
)
4520 emit_move_insn (operands
[0], cmp
);
4526 ix86_expand_int_sse_cmp (rtx dest
, enum rtx_code code
, rtx cop0
, rtx cop1
,
4527 rtx op_true
, rtx op_false
, bool *negate
)
4529 machine_mode data_mode
= GET_MODE (dest
);
4530 machine_mode mode
= GET_MODE (cop0
);
4535 /* XOP supports all of the comparisons on all 128-bit vector int types. */
4537 && GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
4538 && GET_MODE_SIZE (mode
) <= 16)
  /* AVX512F supports all of the comparisons
     on all 128/256/512-bit vector int types.  */
4542 else if (ix86_use_mask_cmp_p (data_mode
, mode
, op_true
, op_false
))
4546 /* Canonicalize the comparison to EQ, GT, GTU. */
4556 /* x <= cst can be handled as x < cst + 1 unless there is
4557 wrap around in cst + 1. */
4558 if (GET_CODE (cop1
) == CONST_VECTOR
4559 && GET_MODE_INNER (mode
) != TImode
)
4561 unsigned int n_elts
= GET_MODE_NUNITS (mode
), i
;
4562 machine_mode eltmode
= GET_MODE_INNER (mode
);
4563 for (i
= 0; i
< n_elts
; ++i
)
4565 rtx elt
= CONST_VECTOR_ELT (cop1
, i
);
4566 if (!CONST_INT_P (elt
))
4570 /* For LE punt if some element is signed maximum. */
4571 if ((INTVAL (elt
) & (GET_MODE_MASK (eltmode
) >> 1))
4572 == (GET_MODE_MASK (eltmode
) >> 1))
4575 /* For LEU punt if some element is unsigned maximum. */
4576 else if (elt
== constm1_rtx
)
4581 rtvec v
= rtvec_alloc (n_elts
);
4582 for (i
= 0; i
< n_elts
; ++i
)
4584 = gen_int_mode (INTVAL (CONST_VECTOR_ELT (cop1
, i
)) + 1,
4586 cop1
= gen_rtx_CONST_VECTOR (mode
, v
);
4587 std::swap (cop0
, cop1
);
4588 code
= code
== LE
? GT
: GTU
;
4594 code
= reverse_condition (code
);
4600 /* x >= cst can be handled as x > cst - 1 unless there is
4601 wrap around in cst - 1. */
4602 if (GET_CODE (cop1
) == CONST_VECTOR
4603 && GET_MODE_INNER (mode
) != TImode
)
4605 unsigned int n_elts
= GET_MODE_NUNITS (mode
), i
;
4606 machine_mode eltmode
= GET_MODE_INNER (mode
);
4607 for (i
= 0; i
< n_elts
; ++i
)
4609 rtx elt
= CONST_VECTOR_ELT (cop1
, i
);
4610 if (!CONST_INT_P (elt
))
4614 /* For GE punt if some element is signed minimum. */
4615 if (INTVAL (elt
) < 0
4616 && ((INTVAL (elt
) & (GET_MODE_MASK (eltmode
) >> 1))
4620 /* For GEU punt if some element is zero. */
4621 else if (elt
== const0_rtx
)
4626 rtvec v
= rtvec_alloc (n_elts
);
4627 for (i
= 0; i
< n_elts
; ++i
)
4629 = gen_int_mode (INTVAL (CONST_VECTOR_ELT (cop1
, i
)) - 1,
4631 cop1
= gen_rtx_CONST_VECTOR (mode
, v
);
4632 code
= code
== GE
? GT
: GTU
;
4636 code
= reverse_condition (code
);
4642 std::swap (cop0
, cop1
);
4643 code
= swap_condition (code
);
4650 /* Only SSE4.1/SSE4.2 supports V2DImode. */
4651 if (mode
== V2DImode
)
4656 /* SSE4.1 supports EQ. */
4663 /* SSE4.2 supports GT/GTU. */
4673 if (GET_CODE (cop0
) == CONST_VECTOR
)
4674 cop0
= force_reg (mode
, cop0
);
4675 else if (GET_CODE (cop1
) == CONST_VECTOR
)
4676 cop1
= force_reg (mode
, cop1
);
4678 rtx optrue
= op_true
? op_true
: CONSTM1_RTX (data_mode
);
4679 rtx opfalse
= op_false
? op_false
: CONST0_RTX (data_mode
);
4681 std::swap (optrue
, opfalse
);
4683 /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when
4684 not using integer masks into min (x, y) == x ? -1 : 0 (i.e.
4685 min (x, y) == x). While we add one instruction (the minimum),
4686 we remove the need for two instructions in the negation, as the
4687 result is done this way.
4688 When using masks, do it for SI/DImode element types, as it is shorter
4689 than the two subtractions. */
4691 && GET_MODE_SIZE (mode
) != 64
4692 && vector_all_ones_operand (opfalse
, data_mode
)
4693 && optrue
== CONST0_RTX (data_mode
))
4695 && GET_MODE_SIZE (GET_MODE_INNER (mode
)) >= 4
4696 /* Don't do it if not using integer masks and we'd end up with
4697 the right values in the registers though. */
4698 && (GET_MODE_SIZE (mode
) == 64
4699 || !vector_all_ones_operand (optrue
, data_mode
)
4700 || opfalse
!= CONST0_RTX (data_mode
))))
4702 rtx (*gen
) (rtx
, rtx
, rtx
) = NULL
;
4707 gen
= (code
== GTU
) ? gen_uminv16si3
: gen_sminv16si3
;
4710 gen
= (code
== GTU
) ? gen_uminv8di3
: gen_sminv8di3
;
4711 cop0
= force_reg (mode
, cop0
);
4712 cop1
= force_reg (mode
, cop1
);
4716 gen
= (code
== GTU
) ? gen_uminv32qi3
: gen_sminv32qi3
;
4720 gen
= (code
== GTU
) ? gen_uminv16hi3
: gen_sminv16hi3
;
4724 gen
= (code
== GTU
) ? gen_uminv8si3
: gen_sminv8si3
;
4727 if (TARGET_AVX512VL
)
4729 gen
= (code
== GTU
) ? gen_uminv4di3
: gen_sminv4di3
;
4730 cop0
= force_reg (mode
, cop0
);
4731 cop1
= force_reg (mode
, cop1
);
4735 if (code
== GTU
&& TARGET_SSE2
)
4736 gen
= gen_uminv16qi3
;
4737 else if (code
== GT
&& TARGET_SSE4_1
)
4738 gen
= gen_sminv16qi3
;
4741 if (code
== GTU
&& TARGET_SSE2
)
4742 gen
= gen_uminv8qi3
;
4743 else if (code
== GT
&& TARGET_SSE4_1
)
4744 gen
= gen_sminv8qi3
;
4747 if (code
== GTU
&& TARGET_SSE2
)
4748 gen
= gen_uminv4qi3
;
4749 else if (code
== GT
&& TARGET_SSE4_1
)
4750 gen
= gen_sminv4qi3
;
4753 if (code
== GTU
&& TARGET_SSE2
)
4754 gen
= gen_uminv2qi3
;
4755 else if (code
== GT
&& TARGET_SSE4_1
)
4756 gen
= gen_sminv2qi3
;
4759 if (code
== GTU
&& TARGET_SSE4_1
)
4760 gen
= gen_uminv8hi3
;
4761 else if (code
== GT
&& TARGET_SSE2
)
4762 gen
= gen_sminv8hi3
;
4765 if (code
== GTU
&& TARGET_SSE4_1
)
4766 gen
= gen_uminv4hi3
;
4767 else if (code
== GT
&& TARGET_SSE2
)
4768 gen
= gen_sminv4hi3
;
4771 if (code
== GTU
&& TARGET_SSE4_1
)
4772 gen
= gen_uminv2hi3
;
4773 else if (code
== GT
&& TARGET_SSE2
)
4774 gen
= gen_sminv2hi3
;
4778 gen
= (code
== GTU
) ? gen_uminv4si3
: gen_sminv4si3
;
4782 gen
= (code
== GTU
) ? gen_uminv2si3
: gen_sminv2si3
;
4785 if (TARGET_AVX512VL
)
4787 gen
= (code
== GTU
) ? gen_uminv2di3
: gen_sminv2di3
;
4788 cop0
= force_reg (mode
, cop0
);
4789 cop1
= force_reg (mode
, cop1
);
4798 rtx tem
= gen_reg_rtx (mode
);
4799 if (!vector_operand (cop0
, mode
))
4800 cop0
= force_reg (mode
, cop0
);
4801 if (!vector_operand (cop1
, mode
))
4802 cop1
= force_reg (mode
, cop1
);
4804 emit_insn (gen (tem
, cop0
, cop1
));
      /* Unsigned parallel compare is not supported by the hardware.
	 Play some tricks to turn this into a signed comparison
	 against 0.  */

	  cop0 = force_reg (mode, cop0);

	  /* Subtract (-(INT MAX) - 1) from both operands to make
	     them signed.  */
	  mask = ix86_build_signbit_mask (mode, true, false);
	  t1 = gen_reg_rtx (mode);
	  emit_insn (gen_sub3_insn (t1, cop0, mask));

	  t2 = gen_reg_rtx (mode);
	  emit_insn (gen_sub3_insn (t2, cop1, mask));

	  /* Perform a parallel unsigned saturating subtraction.  */
	  x = gen_reg_rtx (mode);
	  emit_insn (gen_rtx_SET
		     (x, gen_rtx_US_MINUS (mode, cop0, cop1)));

	  cop1 = CONST0_RTX (mode);
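	  /* The saturating-subtraction trick: for unsigned operands,
	     cop0 > cop1 exactly when the unsigned saturating difference
	     cop0 - cop1 is nonzero, so the GTU test is rewritten here as
	     a comparison of that difference against the zero vector.  */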
4872 std::swap (op_true
, op_false
);
4874 if (GET_CODE (cop1
) == CONST_VECTOR
)
4875 cop1
= force_reg (mode
, cop1
);
4877 /* Allow the comparison to be done in one mode, but the movcc to
4878 happen in another mode. */
4879 if (data_mode
== mode
)
4880 x
= ix86_expand_sse_cmp (dest
, code
, cop0
, cop1
, op_true
, op_false
);
4883 gcc_assert (GET_MODE_SIZE (data_mode
) == GET_MODE_SIZE (mode
));
4884 x
= ix86_expand_sse_cmp (gen_reg_rtx (mode
), code
, cop0
, cop1
,
4886 if (GET_MODE (x
) == mode
)
4887 x
= gen_lowpart (data_mode
, x
);
4893 /* Expand integer vector comparison. */
4896 ix86_expand_int_vec_cmp (rtx operands
[])
4898 rtx_code code
= GET_CODE (operands
[1]);
4899 bool negate
= false;
4900 rtx cmp
= ix86_expand_int_sse_cmp (operands
[0], code
, operands
[2],
4901 operands
[3], NULL
, NULL
, &negate
);
4907 cmp
= ix86_expand_int_sse_cmp (operands
[0], EQ
, cmp
,
4908 CONST0_RTX (GET_MODE (cmp
)),
4909 NULL
, NULL
, &negate
);
4911 gcc_assert (!negate
);
4913 if (operands
[0] != cmp
)
4914 emit_move_insn (operands
[0], cmp
);
4919 /* Expand a floating-point vector conditional move; a vcond operation
4920 rather than a movcc operation. */
4923 ix86_expand_fp_vcond (rtx operands
[])
4925 enum rtx_code code
= GET_CODE (operands
[3]);
4928 code
= ix86_prepare_sse_fp_compare_args (operands
[0], code
,
4929 &operands
[4], &operands
[5]);
4930 if (code
== UNKNOWN
)
4933 switch (GET_CODE (operands
[3]))
4936 temp
= ix86_expand_sse_cmp (operands
[0], ORDERED
, operands
[4],
4937 operands
[5], operands
[0], operands
[0]);
4938 cmp
= ix86_expand_sse_cmp (operands
[0], NE
, operands
[4],
4939 operands
[5], operands
[1], operands
[2]);
4943 temp
= ix86_expand_sse_cmp (operands
[0], UNORDERED
, operands
[4],
4944 operands
[5], operands
[0], operands
[0]);
4945 cmp
= ix86_expand_sse_cmp (operands
[0], EQ
, operands
[4],
4946 operands
[5], operands
[1], operands
[2]);
4952 cmp
= expand_simple_binop (GET_MODE (cmp
), code
, temp
, cmp
, cmp
, 1,
4954 ix86_expand_sse_movcc (operands
[0], cmp
, operands
[1], operands
[2]);
4958 if (ix86_expand_sse_fp_minmax (operands
[0], code
, operands
[4],
4959 operands
[5], operands
[1], operands
[2]))
4962 cmp
= ix86_expand_sse_cmp (operands
[0], code
, operands
[4], operands
[5],
4963 operands
[1], operands
[2]);
4964 ix86_expand_sse_movcc (operands
[0], cmp
, operands
[1], operands
[2]);
4968 /* Expand a signed/unsigned integral vector conditional move. */
4971 ix86_expand_int_vcond (rtx operands
[])
4973 machine_mode data_mode
= GET_MODE (operands
[0]);
4974 machine_mode mode
= GET_MODE (operands
[4]);
4975 enum rtx_code code
= GET_CODE (operands
[3]);
4976 bool negate
= false;
4982 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
4983 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
4984 if ((code
== LT
|| code
== GE
)
4985 && data_mode
== mode
4986 && cop1
== CONST0_RTX (mode
)
4987 && operands
[1 + (code
== LT
)] == CONST0_RTX (data_mode
)
4988 && GET_MODE_UNIT_SIZE (data_mode
) > 1
4989 && GET_MODE_UNIT_SIZE (data_mode
) <= 8
4990 && (GET_MODE_SIZE (data_mode
) == 16
4991 || (TARGET_AVX2
&& GET_MODE_SIZE (data_mode
) == 32)))
4993 rtx negop
= operands
[2 - (code
== LT
)];
4994 int shift
= GET_MODE_UNIT_BITSIZE (data_mode
) - 1;
4995 if (negop
== CONST1_RTX (data_mode
))
4997 rtx res
= expand_simple_binop (mode
, LSHIFTRT
, cop0
, GEN_INT (shift
),
4998 operands
[0], 1, OPTAB_DIRECT
);
4999 if (res
!= operands
[0])
5000 emit_move_insn (operands
[0], res
);
5003 else if (GET_MODE_INNER (data_mode
) != DImode
5004 && vector_all_ones_operand (negop
, data_mode
))
5006 rtx res
= expand_simple_binop (mode
, ASHIFTRT
, cop0
, GEN_INT (shift
),
5007 operands
[0], 0, OPTAB_DIRECT
);
5008 if (res
!= operands
[0])
5009 emit_move_insn (operands
[0], res
);
5014 if (!nonimmediate_operand (cop1
, mode
))
5015 cop1
= force_reg (mode
, cop1
);
5016 if (!general_operand (operands
[1], data_mode
))
5017 operands
[1] = force_reg (data_mode
, operands
[1]);
5018 if (!general_operand (operands
[2], data_mode
))
5019 operands
[2] = force_reg (data_mode
, operands
[2]);
5021 x
= ix86_expand_int_sse_cmp (operands
[0], code
, cop0
, cop1
,
5022 operands
[1], operands
[2], &negate
);
5027 ix86_expand_sse_movcc (operands
[0], x
, operands
[1+negate
],
5028 operands
[2-negate
]);
5033 ix86_expand_vec_perm_vpermt2 (rtx target
, rtx mask
, rtx op0
, rtx op1
,
5034 struct expand_vec_perm_d
*d
)
5036 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
5037 expander, so args are either in d, or in op0, op1 etc. */
5038 machine_mode mode
= GET_MODE (d
? d
->op0
: op0
);
5039 machine_mode maskmode
= mode
;
5040 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
) = NULL
;
5045 if (TARGET_AVX512VL
&& TARGET_AVX512VBMI
)
5046 gen
= gen_avx512vl_vpermt2varv16qi3
;
5049 if (TARGET_AVX512VL
&& TARGET_AVX512VBMI
)
5050 gen
= gen_avx512vl_vpermt2varv32qi3
;
5053 if (TARGET_AVX512VBMI
)
5054 gen
= gen_avx512bw_vpermt2varv64qi3
;
5057 if (TARGET_AVX512VL
&& TARGET_AVX512BW
)
5058 gen
= gen_avx512vl_vpermt2varv8hi3
;
5061 if (TARGET_AVX512VL
&& TARGET_AVX512BW
)
5062 gen
= gen_avx512vl_vpermt2varv16hi3
;
5065 if (TARGET_AVX512BW
)
5066 gen
= gen_avx512bw_vpermt2varv32hi3
;
5069 if (TARGET_AVX512VL
)
5070 gen
= gen_avx512vl_vpermt2varv4si3
;
5073 if (TARGET_AVX512VL
)
5074 gen
= gen_avx512vl_vpermt2varv8si3
;
5078 gen
= gen_avx512f_vpermt2varv16si3
;
5081 if (TARGET_AVX512VL
)
5083 gen
= gen_avx512vl_vpermt2varv4sf3
;
5084 maskmode
= V4SImode
;
5088 if (TARGET_AVX512VL
)
5090 gen
= gen_avx512vl_vpermt2varv8sf3
;
5091 maskmode
= V8SImode
;
5097 gen
= gen_avx512f_vpermt2varv16sf3
;
5098 maskmode
= V16SImode
;
5102 if (TARGET_AVX512VL
)
5103 gen
= gen_avx512vl_vpermt2varv2di3
;
5106 if (TARGET_AVX512VL
)
5107 gen
= gen_avx512vl_vpermt2varv4di3
;
5111 gen
= gen_avx512f_vpermt2varv8di3
;
5114 if (TARGET_AVX512VL
)
5116 gen
= gen_avx512vl_vpermt2varv2df3
;
5117 maskmode
= V2DImode
;
5121 if (TARGET_AVX512VL
)
5123 gen
= gen_avx512vl_vpermt2varv4df3
;
5124 maskmode
= V4DImode
;
5130 gen
= gen_avx512f_vpermt2varv8df3
;
5131 maskmode
= V8DImode
;
  /* ix86_expand_vec_perm_vpermt2 is called from both the const and the
     non-const expander, so the arguments are either in D, or in OP0,
     OP1 etc.  */
      for (int i = 0; i < d->nelt; ++i)
	vec[i] = GEN_INT (d->perm[i]);
      mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));

  emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
/* Expand a variable vector permutation.  */

ix86_expand_vec_perm (rtx operands[])
  rtx target = operands[0];
  rtx op0 = operands[1];
  rtx op1 = operands[2];
  rtx mask = operands[3];
  rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
  machine_mode mode = GET_MODE (op0);
  machine_mode maskmode = GET_MODE (mask);
  bool one_operand_shuffle = rtx_equal_p (op0, op1);

  /* Number of elements in the vector.  */
  w = GET_MODE_NUNITS (mode);
  e = GET_MODE_UNIT_SIZE (mode);
  gcc_assert (w <= 64);
  /* For an HF mode vector, convert it to HI using a subreg.  */
  if (GET_MODE_INNER (mode) == HFmode)
      machine_mode orig_mode = mode;
      mode = mode_for_vector (HImode, w).require ();
      target = lowpart_subreg (mode, target, orig_mode);
      op0 = lowpart_subreg (mode, op0, orig_mode);
      op1 = lowpart_subreg (mode, op1, orig_mode);
5188 if (TARGET_AVX512F
&& one_operand_shuffle
)
5190 rtx (*gen
) (rtx
, rtx
, rtx
) = NULL
;
5194 gen
=gen_avx512f_permvarv16si
;
5197 gen
= gen_avx512f_permvarv16sf
;
5200 gen
= gen_avx512f_permvarv8di
;
5203 gen
= gen_avx512f_permvarv8df
;
5210 emit_insn (gen (target
, op0
, mask
));
5215 if (ix86_expand_vec_perm_vpermt2 (target
, mask
, op0
, op1
, NULL
))
5220 if (mode
== V4DImode
|| mode
== V4DFmode
|| mode
== V16HImode
)
      /* Unfortunately, the VPERMQ and VPERMPD instructions only support
	 a constant shuffle operand.  With a tiny bit of effort we can
	 use VPERMD instead.  A re-interpretation stall for V4DFmode is
	 unfortunate but there's no avoiding it.
	 Similarly, for V16HImode we don't have instructions for variable
	 shuffling, while for V32QImode we can, after preparing suitable
	 masks, use vpshufb; vpshufb; vpermq; vpor.  */
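      /* Illustrative sketch of the mask rewrite performed below (editorial
	 note, not emitted code): each V4DI lane index d becomes the pair of
	 V8SI lane indices { 2*d, 2*d + 1 }, so that VPERMD moves both
	 32-bit halves of the selected 64-bit element:

	     for (i = 0; i < 4; i++)
	       {
		 newmask[2*i]     = 2 * oldmask[i];
		 newmask[2*i + 1] = 2 * oldmask[i] + 1;
	       }
       */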
5230 if (mode
== V16HImode
)
5232 maskmode
= mode
= V32QImode
;
5238 maskmode
= mode
= V8SImode
;
5242 t1
= gen_reg_rtx (maskmode
);
5244 /* Replicate the low bits of the V4DImode mask into V8SImode:
5246 t1 = { A A B B C C D D }. */
5247 for (i
= 0; i
< w
/ 2; ++i
)
5248 vec
[i
*2 + 1] = vec
[i
*2] = GEN_INT (i
* 2);
5249 vt
= gen_rtx_CONST_VECTOR (maskmode
, gen_rtvec_v (w
, vec
));
5250 vt
= force_reg (maskmode
, vt
);
5251 mask
= gen_lowpart (maskmode
, mask
);
5252 if (maskmode
== V8SImode
)
5253 emit_insn (gen_avx2_permvarv8si (t1
, mask
, vt
));
5255 emit_insn (gen_avx2_pshufbv32qi3 (t1
, mask
, vt
));
      /* Multiply the shuffle indices by two.  */
5258 t1
= expand_simple_binop (maskmode
, PLUS
, t1
, t1
, t1
, 1,
      /* Add one to the odd shuffle indices:
	 t1 = { A*2, A*2+1, B*2, B*2+1, ... }.  */
5263 for (i
= 0; i
< w
/ 2; ++i
)
5265 vec
[i
* 2] = const0_rtx
;
5266 vec
[i
* 2 + 1] = const1_rtx
;
5268 vt
= gen_rtx_CONST_VECTOR (maskmode
, gen_rtvec_v (w
, vec
));
5269 vt
= validize_mem (force_const_mem (maskmode
, vt
));
5270 t1
= expand_simple_binop (maskmode
, PLUS
, t1
, vt
, t1
, 1,
5273 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
5274 operands
[3] = mask
= t1
;
5275 target
= gen_reg_rtx (mode
);
5276 op0
= gen_lowpart (mode
, op0
);
5277 op1
= gen_lowpart (mode
, op1
);
5283 /* The VPERMD and VPERMPS instructions already properly ignore
5284 the high bits of the shuffle elements. No need for us to
5285 perform an AND ourselves. */
5286 if (one_operand_shuffle
)
5288 emit_insn (gen_avx2_permvarv8si (target
, op0
, mask
));
5289 if (target
!= operands
[0])
5290 emit_move_insn (operands
[0],
5291 gen_lowpart (GET_MODE (operands
[0]), target
));
5295 t1
= gen_reg_rtx (V8SImode
);
5296 t2
= gen_reg_rtx (V8SImode
);
5297 emit_insn (gen_avx2_permvarv8si (t1
, op0
, mask
));
5298 emit_insn (gen_avx2_permvarv8si (t2
, op1
, mask
));
5304 mask
= gen_lowpart (V8SImode
, mask
);
5305 if (one_operand_shuffle
)
5306 emit_insn (gen_avx2_permvarv8sf (target
, op0
, mask
));
5309 t1
= gen_reg_rtx (V8SFmode
);
5310 t2
= gen_reg_rtx (V8SFmode
);
5311 emit_insn (gen_avx2_permvarv8sf (t1
, op0
, mask
));
5312 emit_insn (gen_avx2_permvarv8sf (t2
, op1
, mask
));
5318 /* By combining the two 128-bit input vectors into one 256-bit
5319 input vector, we can use VPERMD and VPERMPS for the full
5320 two-operand shuffle. */
5321 t1
= gen_reg_rtx (V8SImode
);
5322 t2
= gen_reg_rtx (V8SImode
);
5323 emit_insn (gen_avx_vec_concatv8si (t1
, op0
, op1
));
5324 emit_insn (gen_avx_vec_concatv8si (t2
, mask
, mask
));
5325 emit_insn (gen_avx2_permvarv8si (t1
, t1
, t2
));
5326 emit_insn (gen_avx_vextractf128v8si (target
, t1
, const0_rtx
));
5330 t1
= gen_reg_rtx (V8SFmode
);
5331 t2
= gen_reg_rtx (V8SImode
);
5332 mask
= gen_lowpart (V4SImode
, mask
);
5333 emit_insn (gen_avx_vec_concatv8sf (t1
, op0
, op1
));
5334 emit_insn (gen_avx_vec_concatv8si (t2
, mask
, mask
));
5335 emit_insn (gen_avx2_permvarv8sf (t1
, t1
, t2
));
5336 emit_insn (gen_avx_vextractf128v8sf (target
, t1
, const0_rtx
));
5340 t1
= gen_reg_rtx (V32QImode
);
5341 t2
= gen_reg_rtx (V32QImode
);
5342 t3
= gen_reg_rtx (V32QImode
);
5343 vt2
= GEN_INT (-128);
5344 vt
= gen_const_vec_duplicate (V32QImode
, vt2
);
5345 vt
= force_reg (V32QImode
, vt
);
5346 for (i
= 0; i
< 32; i
++)
5347 vec
[i
] = i
< 16 ? vt2
: const0_rtx
;
5348 vt2
= gen_rtx_CONST_VECTOR (V32QImode
, gen_rtvec_v (32, vec
));
5349 vt2
= force_reg (V32QImode
, vt2
);
5350 /* From mask create two adjusted masks, which contain the same
5351 bits as mask in the low 7 bits of each vector element.
5352 The first mask will have the most significant bit clear
5353 if it requests element from the same 128-bit lane
5354 and MSB set if it requests element from the other 128-bit lane.
5355 The second mask will have the opposite values of the MSB,
5356 and additionally will have its 128-bit lanes swapped.
5357 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
5358 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
5359 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
5360 stands for other 12 bytes. */
5361 /* The bit whether element is from the same lane or the other
5362 lane is bit 4, so shift it up by 3 to the MSB position. */
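      /* Editorial sketch of the overall idea (illustrative only): within a
	 128-bit lane VPSHUFB can only fetch bytes from that lane, and it
	 writes a zero whenever the control byte has its MSB set.  So the
	 full cross-lane shuffle is assembled as roughly

	     result = vpshufb (x, ctrl_same_lane)
		    | vpshufb (swap_128bit_lanes (x), ctrl_other_lane);

	 where exactly one of the two control bytes for each position has
	 its MSB clear, selected by bit 4 of the original mask element.  */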
5363 t5
= gen_reg_rtx (V4DImode
);
5364 emit_insn (gen_ashlv4di3 (t5
, gen_lowpart (V4DImode
, mask
),
5366 /* Clear MSB bits from the mask just in case it had them set. */
5367 emit_insn (gen_avx2_andnotv32qi3 (t2
, vt
, mask
));
5368 /* After this t1 will have MSB set for elements from other lane. */
5369 emit_insn (gen_xorv32qi3 (t1
, gen_lowpart (V32QImode
, t5
), vt2
));
5370 /* Clear bits other than MSB. */
5371 emit_insn (gen_andv32qi3 (t1
, t1
, vt
));
5372 /* Or in the lower bits from mask into t3. */
5373 emit_insn (gen_iorv32qi3 (t3
, t1
, t2
));
      /* And invert MSB bits in t1, so MSB is set for elements from the same
	 128-bit lane.  */
5376 emit_insn (gen_xorv32qi3 (t1
, t1
, vt
));
5377 /* Swap 128-bit lanes in t3. */
5378 t6
= gen_reg_rtx (V4DImode
);
5379 emit_insn (gen_avx2_permv4di_1 (t6
, gen_lowpart (V4DImode
, t3
),
5380 const2_rtx
, GEN_INT (3),
5381 const0_rtx
, const1_rtx
));
5382 /* And or in the lower bits from mask into t1. */
5383 emit_insn (gen_iorv32qi3 (t1
, t1
, t2
));
5384 if (one_operand_shuffle
)
5386 /* Each of these shuffles will put 0s in places where
5387 element from the other 128-bit lane is needed, otherwise
5388 will shuffle in the requested value. */
5389 emit_insn (gen_avx2_pshufbv32qi3 (t3
, op0
,
5390 gen_lowpart (V32QImode
, t6
)));
5391 emit_insn (gen_avx2_pshufbv32qi3 (t1
, op0
, t1
));
5392 /* For t3 the 128-bit lanes are swapped again. */
5393 t7
= gen_reg_rtx (V4DImode
);
5394 emit_insn (gen_avx2_permv4di_1 (t7
, gen_lowpart (V4DImode
, t3
),
5395 const2_rtx
, GEN_INT (3),
5396 const0_rtx
, const1_rtx
));
5397 /* And oring both together leads to the result. */
5398 emit_insn (gen_iorv32qi3 (target
, t1
,
5399 gen_lowpart (V32QImode
, t7
)));
5400 if (target
!= operands
[0])
5401 emit_move_insn (operands
[0],
5402 gen_lowpart (GET_MODE (operands
[0]), target
));
5406 t4
= gen_reg_rtx (V32QImode
);
	  /* Similar to the one_operand_shuffle code above, just repeated
	     twice, once for each operand; the merge_two code below will
	     combine the two results.  */
5410 emit_insn (gen_avx2_pshufbv32qi3 (t4
, op0
,
5411 gen_lowpart (V32QImode
, t6
)));
5412 emit_insn (gen_avx2_pshufbv32qi3 (t3
, op1
,
5413 gen_lowpart (V32QImode
, t6
)));
5414 emit_insn (gen_avx2_pshufbv32qi3 (t2
, op0
, t1
));
5415 emit_insn (gen_avx2_pshufbv32qi3 (t1
, op1
, t1
));
5416 t7
= gen_reg_rtx (V4DImode
);
5417 emit_insn (gen_avx2_permv4di_1 (t7
, gen_lowpart (V4DImode
, t4
),
5418 const2_rtx
, GEN_INT (3),
5419 const0_rtx
, const1_rtx
));
5420 t8
= gen_reg_rtx (V4DImode
);
5421 emit_insn (gen_avx2_permv4di_1 (t8
, gen_lowpart (V4DImode
, t3
),
5422 const2_rtx
, GEN_INT (3),
5423 const0_rtx
, const1_rtx
));
5424 emit_insn (gen_iorv32qi3 (t4
, t2
, gen_lowpart (V32QImode
, t7
)));
5425 emit_insn (gen_iorv32qi3 (t3
, t1
, gen_lowpart (V32QImode
, t8
)));
5431 gcc_assert (GET_MODE_SIZE (mode
) <= 16);
5438 /* The XOP VPPERM insn supports three inputs. By ignoring the
5439 one_operand_shuffle special case, we avoid creating another
5440 set of constant vectors in memory. */
5441 one_operand_shuffle
= false;
5443 /* mask = mask & {2*w-1, ...} */
5444 vt
= GEN_INT (2*w
- 1);
5448 /* mask = mask & {w-1, ...} */
5449 vt
= GEN_INT (w
- 1);
5452 vt
= gen_const_vec_duplicate (maskmode
, vt
);
5453 mask
= expand_simple_binop (maskmode
, AND
, mask
, vt
,
5454 NULL_RTX
, 0, OPTAB_DIRECT
);
5456 /* For non-QImode operations, convert the word permutation control
5457 into a byte permutation control. */
5458 if (mode
!= V16QImode
)
5460 mask
= expand_simple_binop (maskmode
, ASHIFT
, mask
,
5461 GEN_INT (exact_log2 (e
)),
5462 NULL_RTX
, 0, OPTAB_DIRECT
);
5464 /* Convert mask to vector of chars. */
5465 mask
= force_reg (V16QImode
, gen_lowpart (V16QImode
, mask
));
5467 /* Replicate each of the input bytes into byte positions:
5468 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
5469 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
5470 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
5471 for (i
= 0; i
< 16; ++i
)
5472 vec
[i
] = GEN_INT (i
/e
* e
);
5473 vt
= gen_rtx_CONST_VECTOR (V16QImode
, gen_rtvec_v (16, vec
));
5474 vt
= validize_mem (force_const_mem (V16QImode
, vt
));
5476 emit_insn (gen_xop_pperm (mask
, mask
, mask
, vt
));
5478 emit_insn (gen_ssse3_pshufbv16qi3 (mask
, mask
, vt
));
5480 /* Convert it into the byte positions by doing
5481 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
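  /* Editorial scalar sketch of the control conversion (assumes element size
     E bytes; illustrative only):

	 for (j = 0; j < 16; j++)
	   bytemask[j] = wordmask[j / E] * E + (j % E);

     which is exactly what the shift by log2(E), the replicating
     pshufb/pperm and the final byte-wise add below compute in-register.  */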
5482 for (i
= 0; i
< 16; ++i
)
5483 vec
[i
] = GEN_INT (i
% e
);
5484 vt
= gen_rtx_CONST_VECTOR (V16QImode
, gen_rtvec_v (16, vec
));
5485 vt
= validize_mem (force_const_mem (V16QImode
, vt
));
5486 emit_insn (gen_addv16qi3 (mask
, mask
, vt
));
5489 /* The actual shuffle operations all operate on V16QImode. */
5490 op0
= gen_lowpart (V16QImode
, op0
);
5491 op1
= gen_lowpart (V16QImode
, op1
);
5495 if (GET_MODE (target
) != V16QImode
)
5496 target
= gen_reg_rtx (V16QImode
);
5497 emit_insn (gen_xop_pperm (target
, op0
, op1
, mask
));
5498 if (target
!= operands
[0])
5499 emit_move_insn (operands
[0],
5500 gen_lowpart (GET_MODE (operands
[0]), target
));
5502 else if (one_operand_shuffle
)
5504 if (GET_MODE (target
) != V16QImode
)
5505 target
= gen_reg_rtx (V16QImode
);
5506 emit_insn (gen_ssse3_pshufbv16qi3 (target
, op0
, mask
));
5507 if (target
!= operands
[0])
5508 emit_move_insn (operands
[0],
5509 gen_lowpart (GET_MODE (operands
[0]), target
));
5516 /* Shuffle the two input vectors independently. */
5517 t1
= gen_reg_rtx (V16QImode
);
5518 t2
= gen_reg_rtx (V16QImode
);
5519 emit_insn (gen_ssse3_pshufbv16qi3 (t1
, op0
, mask
));
5520 emit_insn (gen_ssse3_pshufbv16qi3 (t2
, op1
, mask
));
5523 /* Then merge them together. The key is whether any given control
5524 element contained a bit set that indicates the second word. */
5527 if (maskmode
== V2DImode
&& !TARGET_SSE4_1
)
	  /* Without SSE4.1, we don't have V2DImode EQ.  Perform one
	     more shuffle to convert the V2DI input mask into a V4SI
	     input mask, at which point the masking that feeds
	     ix86_expand_int_vcond will work as desired.  */
5533 rtx t3
= gen_reg_rtx (V4SImode
);
5534 emit_insn (gen_sse2_pshufd_1 (t3
, gen_lowpart (V4SImode
, mask
),
5535 const0_rtx
, const0_rtx
,
5536 const2_rtx
, const2_rtx
));
5538 maskmode
= V4SImode
;
5542 vt
= gen_const_vec_duplicate (maskmode
, vt
);
5543 vt
= force_reg (maskmode
, vt
);
5544 mask
= expand_simple_binop (maskmode
, AND
, mask
, vt
,
5545 NULL_RTX
, 0, OPTAB_DIRECT
);
5547 if (GET_MODE (target
) != mode
)
5548 target
= gen_reg_rtx (mode
);
5550 xops
[1] = gen_lowpart (mode
, t2
);
5551 xops
[2] = gen_lowpart (mode
, t1
);
5552 xops
[3] = gen_rtx_EQ (maskmode
, mask
, vt
);
5555 ok
= ix86_expand_int_vcond (xops
);
5557 if (target
!= operands
[0])
5558 emit_move_insn (operands
[0],
5559 gen_lowpart (GET_MODE (operands
[0]), target
));
5563 /* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is
5564 true if we should do zero extension, else sign extension. HIGH_P is
5565 true if we want the N/2 high elements, else the low elements. */
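/* Editorial sketch of the intended semantics (illustrative; "wide" stands
   for the element type of twice the width):

       for (i = 0; i < N/2; i++)
	 dest[i] = unsigned_p
		   ? (wide) (unsigned) src[i + (high_p ? N/2 : 0)]
		   : (wide) src[i + (high_p ? N/2 : 0)];

   With SSE4.1/AVX2/AVX-512 this maps onto the pmovsx/pmovzx family; the
   fallback path below interleaves the source with zeros (zero extension)
   or with a computed sign mask (sign extension).  */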
5568 ix86_expand_sse_unpack (rtx dest
, rtx src
, bool unsigned_p
, bool high_p
)
5570 machine_mode imode
= GET_MODE (src
);
5575 rtx (*unpack
)(rtx
, rtx
);
5576 rtx (*extract
)(rtx
, rtx
) = NULL
;
5577 machine_mode halfmode
= BLKmode
;
5583 unpack
= gen_avx512bw_zero_extendv32qiv32hi2
;
5585 unpack
= gen_avx512bw_sign_extendv32qiv32hi2
;
5586 halfmode
= V32QImode
;
5588 = high_p
? gen_vec_extract_hi_v64qi
: gen_vec_extract_lo_v64qi
;
5592 unpack
= gen_avx2_zero_extendv16qiv16hi2
;
5594 unpack
= gen_avx2_sign_extendv16qiv16hi2
;
5595 halfmode
= V16QImode
;
5597 = high_p
? gen_vec_extract_hi_v32qi
: gen_vec_extract_lo_v32qi
;
5601 unpack
= gen_avx512f_zero_extendv16hiv16si2
;
5603 unpack
= gen_avx512f_sign_extendv16hiv16si2
;
5604 halfmode
= V16HImode
;
5606 = high_p
? gen_vec_extract_hi_v32hi
: gen_vec_extract_lo_v32hi
;
5610 unpack
= gen_avx2_zero_extendv8hiv8si2
;
5612 unpack
= gen_avx2_sign_extendv8hiv8si2
;
5613 halfmode
= V8HImode
;
5615 = high_p
? gen_vec_extract_hi_v16hi
: gen_vec_extract_lo_v16hi
;
5619 unpack
= gen_avx512f_zero_extendv8siv8di2
;
5621 unpack
= gen_avx512f_sign_extendv8siv8di2
;
5622 halfmode
= V8SImode
;
5624 = high_p
? gen_vec_extract_hi_v16si
: gen_vec_extract_lo_v16si
;
5628 unpack
= gen_avx2_zero_extendv4siv4di2
;
5630 unpack
= gen_avx2_sign_extendv4siv4di2
;
5631 halfmode
= V4SImode
;
5633 = high_p
? gen_vec_extract_hi_v8si
: gen_vec_extract_lo_v8si
;
5637 unpack
= gen_sse4_1_zero_extendv8qiv8hi2
;
5639 unpack
= gen_sse4_1_sign_extendv8qiv8hi2
;
5643 unpack
= gen_sse4_1_zero_extendv4hiv4si2
;
5645 unpack
= gen_sse4_1_sign_extendv4hiv4si2
;
5649 unpack
= gen_sse4_1_zero_extendv2siv2di2
;
5651 unpack
= gen_sse4_1_sign_extendv2siv2di2
;
5655 unpack
= gen_sse4_1_zero_extendv4qiv4hi2
;
5657 unpack
= gen_sse4_1_sign_extendv4qiv4hi2
;
5661 unpack
= gen_sse4_1_zero_extendv2hiv2si2
;
5663 unpack
= gen_sse4_1_sign_extendv2hiv2si2
;
5667 unpack
= gen_sse4_1_zero_extendv2qiv2hi2
;
5669 unpack
= gen_sse4_1_sign_extendv2qiv2hi2
;
5675 if (GET_MODE_SIZE (imode
) >= 32)
5677 tmp
= gen_reg_rtx (halfmode
);
5678 emit_insn (extract (tmp
, src
));
5682 switch (GET_MODE_SIZE (imode
))
5685 /* Shift higher 8 bytes to lower 8 bytes. */
5686 tmp
= gen_reg_rtx (V1TImode
);
5687 emit_insn (gen_sse2_lshrv1ti3 (tmp
, gen_lowpart (V1TImode
, src
),
5691 /* Shift higher 4 bytes to lower 4 bytes. */
5692 tmp
= gen_reg_rtx (V1DImode
);
5693 emit_insn (gen_mmx_lshrv1di3 (tmp
, gen_lowpart (V1DImode
, src
),
5697 /* Shift higher 2 bytes to lower 2 bytes. */
5698 tmp
= gen_reg_rtx (V1SImode
);
5699 emit_insn (gen_mmx_lshrv1si3 (tmp
, gen_lowpart (V1SImode
, src
),
5706 tmp
= gen_lowpart (imode
, tmp
);
5711 emit_insn (unpack (dest
, tmp
));
5715 rtx (*unpack
)(rtx
, rtx
, rtx
);
5721 unpack
= gen_vec_interleave_highv16qi
;
5723 unpack
= gen_vec_interleave_lowv16qi
;
5727 unpack
= gen_vec_interleave_highv8hi
;
5729 unpack
= gen_vec_interleave_lowv8hi
;
5733 unpack
= gen_vec_interleave_highv4si
;
5735 unpack
= gen_vec_interleave_lowv4si
;
5739 unpack
= gen_mmx_punpckhbw
;
5741 unpack
= gen_mmx_punpcklbw
;
5745 unpack
= gen_mmx_punpckhwd
;
5747 unpack
= gen_mmx_punpcklwd
;
5751 unpack
= gen_mmx_punpckhbw_low
;
5753 unpack
= gen_mmx_punpcklbw_low
;
5760 tmp
= force_reg (imode
, CONST0_RTX (imode
));
5762 tmp
= ix86_expand_sse_cmp (gen_reg_rtx (imode
), GT
, CONST0_RTX (imode
),
5763 src
, pc_rtx
, pc_rtx
);
5765 rtx tmp2
= gen_reg_rtx (imode
);
5766 emit_insn (unpack (tmp2
, src
, tmp
));
5767 emit_move_insn (dest
, gen_lowpart (GET_MODE (dest
), tmp2
));
/* Return true if MEM is a constant-pool reference whose constant is a
   CONST_VECTOR permutation index; if so, store the index in PERM.  */
5774 ix86_extract_perm_from_pool_constant (int* perm
, rtx mem
)
5776 machine_mode mode
= GET_MODE (mem
);
5777 int nelt
= GET_MODE_NUNITS (mode
);
5779 if (!INTEGRAL_MODE_P (mode
))
5782 /* Needs to be constant pool. */
5784 || !SYMBOL_REF_P (XEXP (mem
, 0))
5785 || !CONSTANT_POOL_ADDRESS_P (XEXP (mem
, 0)))
5788 rtx constant
= get_pool_constant (XEXP (mem
, 0));
5790 if (GET_CODE (constant
) != CONST_VECTOR
)
  /* There could be some rtx like
     (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
     but with "*.LC1" referring to a V2DI constant vector.  */
5796 if (GET_MODE (constant
) != mode
)
5798 constant
= simplify_subreg (mode
, constant
, GET_MODE (constant
), 0);
5800 if (constant
== nullptr || GET_CODE (constant
) != CONST_VECTOR
)
5804 for (int i
= 0; i
!= nelt
; i
++)
5805 perm
[i
] = UINTVAL (XVECEXP (constant
, 0, i
));
/* Split operands 0 and 1 into half-mode parts.  Similar to split_double_mode,
   but works for floating point parameters and non-offsettable memories.
   For pushes, it returns just stack offsets; the values will be saved
   in the right order.  Up to four parts may be generated.  */
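/* Editorial example (not from the sources): on ia32 a DFmode constant such
   as 1.0 (bit pattern 0x3ff0000000000000) would be split into two SImode
   parts,

       parts[0] = 0x00000000   low 32 bits
       parts[1] = 0x3ff00000   high 32 bits

   which is what the real_to_target / split_double_mode paths below produce;
   XFmode and TFmode simply add a third (and fourth) part.  */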
5816 ix86_split_to_parts (rtx operand
, rtx
*parts
, machine_mode mode
)
5821 size
= mode
==XFmode
? 3 : GET_MODE_SIZE (mode
) / 4;
5823 size
= (GET_MODE_SIZE (mode
) + 4) / 8;
5825 gcc_assert (!REG_P (operand
) || !MMX_REGNO_P (REGNO (operand
)));
5826 gcc_assert (size
>= 2 && size
<= 4);
  /* Optimize constant pool references to immediates.  This is used by fp
     moves that force all constants to memory to allow combining.  */
5830 if (MEM_P (operand
) && MEM_READONLY_P (operand
))
5831 operand
= avoid_constant_pool_reference (operand
);
5833 if (MEM_P (operand
) && !offsettable_memref_p (operand
))
      /* The only non-offsettable memories we handle are pushes.  */
5836 int ok
= push_operand (operand
, VOIDmode
);
5840 operand
= copy_rtx (operand
);
5841 PUT_MODE (operand
, word_mode
);
5842 parts
[0] = parts
[1] = parts
[2] = parts
[3] = operand
;
5846 if (GET_CODE (operand
) == CONST_VECTOR
)
5848 scalar_int_mode imode
= int_mode_for_mode (mode
).require ();
5849 /* Caution: if we looked through a constant pool memory above,
5850 the operand may actually have a different mode now. That's
5851 ok, since we want to pun this all the way back to an integer. */
5852 operand
= simplify_subreg (imode
, operand
, GET_MODE (operand
), 0);
5853 gcc_assert (operand
!= NULL
);
5860 split_double_mode (mode
, &operand
, 1, &parts
[0], &parts
[1]);
5865 if (REG_P (operand
))
5867 gcc_assert (reload_completed
);
5868 for (i
= 0; i
< size
; i
++)
5869 parts
[i
] = gen_rtx_REG (SImode
, REGNO (operand
) + i
);
5871 else if (offsettable_memref_p (operand
))
5873 operand
= adjust_address (operand
, SImode
, 0);
5875 for (i
= 1; i
< size
; i
++)
5876 parts
[i
] = adjust_address (operand
, SImode
, 4 * i
);
5878 else if (CONST_DOUBLE_P (operand
))
5880 const REAL_VALUE_TYPE
*r
;
5883 r
= CONST_DOUBLE_REAL_VALUE (operand
);
5887 real_to_target (l
, r
, mode
);
5888 parts
[3] = gen_int_mode (l
[3], SImode
);
5889 parts
[2] = gen_int_mode (l
[2], SImode
);
5892 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
5893 long double may not be 80-bit. */
5894 real_to_target (l
, r
, mode
);
5895 parts
[2] = gen_int_mode (l
[2], SImode
);
5898 REAL_VALUE_TO_TARGET_DOUBLE (*r
, l
);
5903 parts
[1] = gen_int_mode (l
[1], SImode
);
5904 parts
[0] = gen_int_mode (l
[0], SImode
);
5913 split_double_mode (mode
, &operand
, 1, &parts
[0], &parts
[1]);
5914 if (mode
== XFmode
|| mode
== TFmode
)
5916 machine_mode upper_mode
= mode
==XFmode
? SImode
: DImode
;
5917 if (REG_P (operand
))
5919 gcc_assert (reload_completed
);
5920 parts
[0] = gen_rtx_REG (DImode
, REGNO (operand
) + 0);
5921 parts
[1] = gen_rtx_REG (upper_mode
, REGNO (operand
) + 1);
5923 else if (offsettable_memref_p (operand
))
5925 operand
= adjust_address (operand
, DImode
, 0);
5927 parts
[1] = adjust_address (operand
, upper_mode
, 8);
5929 else if (CONST_DOUBLE_P (operand
))
5933 real_to_target (l
, CONST_DOUBLE_REAL_VALUE (operand
), mode
);
5935 /* real_to_target puts 32-bit pieces in each long. */
5936 parts
[0] = gen_int_mode ((l
[0] & HOST_WIDE_INT_C (0xffffffff))
5937 | ((l
[1] & HOST_WIDE_INT_C (0xffffffff))
5940 if (upper_mode
== SImode
)
5941 parts
[1] = gen_int_mode (l
[2], SImode
);
5944 = gen_int_mode ((l
[2] & HOST_WIDE_INT_C (0xffffffff))
5945 | ((l
[3] & HOST_WIDE_INT_C (0xffffffff))
/* Emit insns to perform a move or push of DI, DF, XF, and TF values.
   Return false when normal moves are needed; true when all required
   insns have been emitted.  Operands 2-4 contain the input values
   in the correct order; operands 5-7 contain the output values.  */
5962 ix86_split_long_move (rtx operands
[])
5968 machine_mode mode
= GET_MODE (operands
[0]);
5969 bool collisionparts
[4];
  /* The DFmode expanders may ask us to move a double.
     For a 64-bit target this is a single move.  By hiding that fact
     here we simplify the i386.md splitters.  */
5974 if (TARGET_64BIT
&& GET_MODE_SIZE (GET_MODE (operands
[0])) == 8)
      /* Optimize constant pool references to immediates.  This is used by
	 fp moves that force all constants to memory to allow combining.  */
5979 if (MEM_P (operands
[1])
5980 && GET_CODE (XEXP (operands
[1], 0)) == SYMBOL_REF
5981 && CONSTANT_POOL_ADDRESS_P (XEXP (operands
[1], 0)))
5982 operands
[1] = get_pool_constant (XEXP (operands
[1], 0));
5983 if (push_operand (operands
[0], VOIDmode
))
5985 operands
[0] = copy_rtx (operands
[0]);
5986 PUT_MODE (operands
[0], word_mode
);
5989 operands
[0] = gen_lowpart (DImode
, operands
[0]);
5990 operands
[1] = gen_lowpart (DImode
, operands
[1]);
5991 emit_move_insn (operands
[0], operands
[1]);
  /* The only non-offsettable memory we handle is a push.  */
5996 if (push_operand (operands
[0], VOIDmode
))
5999 gcc_assert (!MEM_P (operands
[0])
6000 || offsettable_memref_p (operands
[0]));
6002 nparts
= ix86_split_to_parts (operands
[1], part
[1], GET_MODE (operands
[0]));
6003 ix86_split_to_parts (operands
[0], part
[0], GET_MODE (operands
[0]));
  /* When emitting a push, take care of source operands on the stack.  */
6006 if (push
&& MEM_P (operands
[1])
6007 && reg_overlap_mentioned_p (stack_pointer_rtx
, operands
[1]))
6009 rtx src_base
= XEXP (part
[1][nparts
- 1], 0);
6011 /* Compensate for the stack decrement by 4. */
6012 if (!TARGET_64BIT
&& nparts
== 3
6013 && mode
== XFmode
&& TARGET_128BIT_LONG_DOUBLE
)
6014 src_base
= plus_constant (Pmode
, src_base
, 4);
      /* src_base refers to the stack pointer and is
	 automatically decreased by the emitted push insns.  */
6018 for (i
= 0; i
< nparts
; i
++)
6019 part
[1][i
] = change_address (part
[1][i
],
6020 GET_MODE (part
[1][i
]), src_base
);
  /* We need to do the copy in the right order in case an address register
     of the source overlaps the destination.  */
6025 if (REG_P (part
[0][0]) && MEM_P (part
[1][0]))
6029 for (i
= 0; i
< nparts
; i
++)
6032 = reg_overlap_mentioned_p (part
[0][i
], XEXP (part
[1][0], 0));
6033 if (collisionparts
[i
])
6037 /* Collision in the middle part can be handled by reordering. */
6038 if (collisions
== 1 && nparts
== 3 && collisionparts
[1])
6040 std::swap (part
[0][1], part
[0][2]);
6041 std::swap (part
[1][1], part
[1][2]);
6043 else if (collisions
== 1
6045 && (collisionparts
[1] || collisionparts
[2]))
6047 if (collisionparts
[1])
6049 std::swap (part
[0][1], part
[0][2]);
6050 std::swap (part
[1][1], part
[1][2]);
6054 std::swap (part
[0][2], part
[0][3]);
6055 std::swap (part
[1][2], part
[1][3]);
      /* If there are more collisions, we can't handle them by reordering.
	 Do an lea to the last part and use only one colliding move.  */
6061 else if (collisions
> 1)
6067 base
= part
[0][nparts
- 1];
	  /* Handle the case when the last part isn't valid for lea.
	     This happens in 64-bit mode when storing the 12-byte XFmode.  */
6071 if (GET_MODE (base
) != Pmode
)
6072 base
= gen_rtx_REG (Pmode
, REGNO (base
));
6074 addr
= XEXP (part
[1][0], 0);
6075 if (TARGET_TLS_DIRECT_SEG_REFS
)
6077 struct ix86_address parts
;
6078 int ok
= ix86_decompose_address (addr
, &parts
);
6080 /* It is not valid to use %gs: or %fs: in lea. */
6081 gcc_assert (parts
.seg
== ADDR_SPACE_GENERIC
);
6083 emit_insn (gen_rtx_SET (base
, addr
));
6084 part
[1][0] = replace_equiv_address (part
[1][0], base
);
6085 for (i
= 1; i
< nparts
; i
++)
6087 tmp
= plus_constant (Pmode
, base
, UNITS_PER_WORD
* i
);
6088 part
[1][i
] = replace_equiv_address (part
[1][i
], tmp
);
6099 if (TARGET_128BIT_LONG_DOUBLE
&& mode
== XFmode
)
6100 emit_insn (gen_add2_insn (stack_pointer_rtx
, GEN_INT (-4)));
6101 emit_move_insn (part
[0][2], part
[1][2]);
6103 else if (nparts
== 4)
6105 emit_move_insn (part
[0][3], part
[1][3]);
6106 emit_move_insn (part
[0][2], part
[1][2]);
	  /* In 64-bit mode we don't have a 32-bit push available.  If this
	     is a register, that is fine - we just use the larger counterpart.
	     We also retype memories - these come from the attempt to avoid a
	     REX prefix when moving the second half of a TFmode value.  */
6115 if (GET_MODE (part
[1][1]) == SImode
)
6117 switch (GET_CODE (part
[1][1]))
6120 part
[1][1] = adjust_address (part
[1][1], DImode
, 0);
6124 part
[1][1] = gen_rtx_REG (DImode
, REGNO (part
[1][1]));
6131 if (GET_MODE (part
[1][0]) == SImode
)
6132 part
[1][0] = part
[1][1];
6135 emit_move_insn (part
[0][1], part
[1][1]);
6136 emit_move_insn (part
[0][0], part
[1][0]);
  /* Choose the correct order so as not to overwrite the source before it is copied.  */
6141 if ((REG_P (part
[0][0])
6142 && REG_P (part
[1][1])
6143 && (REGNO (part
[0][0]) == REGNO (part
[1][1])
6145 && REGNO (part
[0][0]) == REGNO (part
[1][2]))
6147 && REGNO (part
[0][0]) == REGNO (part
[1][3]))))
6149 && reg_overlap_mentioned_p (part
[0][0], XEXP (part
[1][0], 0))))
6151 for (i
= 0, j
= nparts
- 1; i
< nparts
; i
++, j
--)
6153 operands
[2 + i
] = part
[0][j
];
6154 operands
[6 + i
] = part
[1][j
];
6159 for (i
= 0; i
< nparts
; i
++)
6161 operands
[2 + i
] = part
[0][i
];
6162 operands
[6 + i
] = part
[1][i
];
6166 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
6167 if (optimize_insn_for_size_p ())
6169 for (j
= 0; j
< nparts
- 1; j
++)
6170 if (CONST_INT_P (operands
[6 + j
])
6171 && operands
[6 + j
] != const0_rtx
6172 && REG_P (operands
[2 + j
]))
6173 for (i
= j
; i
< nparts
- 1; i
++)
6174 if (CONST_INT_P (operands
[7 + i
])
6175 && INTVAL (operands
[7 + i
]) == INTVAL (operands
[6 + j
]))
6176 operands
[7 + i
] = operands
[2 + j
];
6179 for (i
= 0; i
< nparts
; i
++)
6180 emit_move_insn (operands
[2 + i
], operands
[6 + i
]);
6185 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
6186 left shift by a constant, either using a single shift or
6187 a sequence of add instructions. */
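/* Editorial sketch (illustrative): a left shift by a small constant can be
   replaced by successive doublings, e.g.

       x << 2   ==   { x += x; x += x; }     two "add reg, reg" insns

   The add sequence is used when the adds are estimated to be no more costly
   than the constant shift and we are not optimizing for size.  */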
6190 ix86_expand_ashl_const (rtx operand
, int count
, machine_mode mode
)
6193 || (count
* ix86_cost
->add
<= ix86_cost
->shift_const
6194 && !optimize_insn_for_size_p ()))
6197 emit_insn (gen_add2_insn (operand
, operand
));
6201 rtx (*insn
)(rtx
, rtx
, rtx
);
6203 insn
= mode
== DImode
? gen_ashlsi3
: gen_ashldi3
;
6204 emit_insn (insn (operand
, operand
, GEN_INT (count
)));
6209 ix86_split_ashl (rtx
*operands
, rtx scratch
, machine_mode mode
)
6211 rtx (*gen_ashl3
)(rtx
, rtx
, rtx
);
6212 rtx (*gen_shld
)(rtx
, rtx
, rtx
);
6213 int half_width
= GET_MODE_BITSIZE (mode
) >> 1;
6214 machine_mode half_mode
;
6216 rtx low
[2], high
[2];
6219 if (CONST_INT_P (operands
[2]))
6221 split_double_mode (mode
, operands
, 2, low
, high
);
6222 count
= INTVAL (operands
[2]) & (GET_MODE_BITSIZE (mode
) - 1);
6224 if (count
>= half_width
)
6226 emit_move_insn (high
[0], low
[1]);
6227 ix86_expand_clear (low
[0]);
6229 if (count
> half_width
)
6230 ix86_expand_ashl_const (high
[0], count
- half_width
, mode
);
6234 gen_shld
= mode
== DImode
? gen_x86_shld
: gen_x86_64_shld
;
6236 if (!rtx_equal_p (operands
[0], operands
[1]))
6237 emit_move_insn (operands
[0], operands
[1]);
6239 emit_insn (gen_shld (high
[0], low
[0], GEN_INT (count
)));
6240 ix86_expand_ashl_const (low
[0], count
, mode
);
6245 split_double_mode (mode
, operands
, 1, low
, high
);
6246 half_mode
= mode
== DImode
? SImode
: DImode
;
6248 gen_ashl3
= mode
== DImode
? gen_ashlsi3
: gen_ashldi3
;
6250 if (operands
[1] == const1_rtx
)
      /* Assuming we've chosen QImode-capable registers, 1 << N
	 can be done with two 32/64-bit shifts, no branches, no cmoves.  */
6254 if (ANY_QI_REG_P (low
[0]) && ANY_QI_REG_P (high
[0]))
6256 rtx s
, d
, flags
= gen_rtx_REG (CCZmode
, FLAGS_REG
);
6258 ix86_expand_clear (low
[0]);
6259 ix86_expand_clear (high
[0]);
6260 emit_insn (gen_testqi_ccz_1 (operands
[2], GEN_INT (half_width
)));
6262 d
= gen_lowpart (QImode
, low
[0]);
6263 d
= gen_rtx_STRICT_LOW_PART (VOIDmode
, d
);
6264 s
= gen_rtx_EQ (QImode
, flags
, const0_rtx
);
6265 emit_insn (gen_rtx_SET (d
, s
));
6267 d
= gen_lowpart (QImode
, high
[0]);
6268 d
= gen_rtx_STRICT_LOW_PART (VOIDmode
, d
);
6269 s
= gen_rtx_NE (QImode
, flags
, const0_rtx
);
6270 emit_insn (gen_rtx_SET (d
, s
));
6273 /* Otherwise, we can get the same results by manually performing
6274 a bit extract operation on bit 5/6, and then performing the two
6275 shifts. The two methods of getting 0/1 into low/high are exactly
6276 the same size. Avoiding the shift in the bit extract case helps
6277 pentium4 a bit; no one else seems to care much either way. */
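	  /* Editorial scalar sketch of the bit-extract variant (illustrative,
	     for the 64-bit result built from 32-bit halves, 0 <= n < 64):

		 bit  = (n >> 5) & 1;		   does the set bit land in
		 high = bit << (n & 31);	   the high half?
		 low  = (bit ^ 1) << (n & 31);

	     i.e. exactly one half receives the 1 before both halves are
	     shifted; for DImode halves the tested bit is bit 6 instead.  */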
6280 rtx (*gen_lshr3
)(rtx
, rtx
, rtx
);
6281 rtx (*gen_and3
)(rtx
, rtx
, rtx
);
6282 rtx (*gen_xor3
)(rtx
, rtx
, rtx
);
6288 gen_lshr3
= gen_lshrsi3
;
6289 gen_and3
= gen_andsi3
;
6290 gen_xor3
= gen_xorsi3
;
6295 gen_lshr3
= gen_lshrdi3
;
6296 gen_and3
= gen_anddi3
;
6297 gen_xor3
= gen_xordi3
;
6301 if (TARGET_PARTIAL_REG_STALL
&& !optimize_insn_for_size_p ())
6302 x
= gen_rtx_ZERO_EXTEND (half_mode
, operands
[2]);
6304 x
= gen_lowpart (half_mode
, operands
[2]);
6305 emit_insn (gen_rtx_SET (high
[0], x
));
6307 emit_insn (gen_lshr3 (high
[0], high
[0], GEN_INT (bits
)));
6308 emit_insn (gen_and3 (high
[0], high
[0], const1_rtx
));
6309 emit_move_insn (low
[0], high
[0]);
6310 emit_insn (gen_xor3 (low
[0], low
[0], const1_rtx
));
6313 emit_insn (gen_ashl3 (low
[0], low
[0], operands
[2]));
6314 emit_insn (gen_ashl3 (high
[0], high
[0], operands
[2]));
6318 if (operands
[1] == constm1_rtx
)
6320 /* For -1 << N, we can avoid the shld instruction, because we
6321 know that we're shifting 0...31/63 ones into a -1. */
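      /* Editorial scalar sketch (illustrative, 64-bit value from 32-bit
	 halves, 0 <= n < 64):

	     high = -1;				   stays all-ones while n < 32
	     low  = -1 << (n & 31);
	     if (n & 32) { high = low; low = 0; }   usual shift adjustment

	 so no shld is needed for the high half.  */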
6322 emit_move_insn (low
[0], constm1_rtx
);
6323 if (optimize_insn_for_size_p ())
6324 emit_move_insn (high
[0], low
[0]);
6326 emit_move_insn (high
[0], constm1_rtx
);
6330 gen_shld
= mode
== DImode
? gen_x86_shld
: gen_x86_64_shld
;
6332 if (!rtx_equal_p (operands
[0], operands
[1]))
6333 emit_move_insn (operands
[0], operands
[1]);
6335 split_double_mode (mode
, operands
, 1, low
, high
);
6336 emit_insn (gen_shld (high
[0], low
[0], operands
[2]));
6339 emit_insn (gen_ashl3 (low
[0], low
[0], operands
[2]));
6341 if (TARGET_CMOVE
&& scratch
)
6343 ix86_expand_clear (scratch
);
6344 emit_insn (gen_x86_shift_adj_1
6345 (half_mode
, high
[0], low
[0], operands
[2], scratch
));
6348 emit_insn (gen_x86_shift_adj_2 (half_mode
, high
[0], low
[0], operands
[2]));
6352 ix86_split_ashr (rtx
*operands
, rtx scratch
, machine_mode mode
)
6354 rtx (*gen_ashr3
)(rtx
, rtx
, rtx
)
6355 = mode
== DImode
? gen_ashrsi3
: gen_ashrdi3
;
6356 rtx (*gen_shrd
)(rtx
, rtx
, rtx
);
6357 int half_width
= GET_MODE_BITSIZE (mode
) >> 1;
6359 rtx low
[2], high
[2];
6362 if (CONST_INT_P (operands
[2]))
6364 split_double_mode (mode
, operands
, 2, low
, high
);
6365 count
= INTVAL (operands
[2]) & (GET_MODE_BITSIZE (mode
) - 1);
6367 if (count
== GET_MODE_BITSIZE (mode
) - 1)
6369 emit_move_insn (high
[0], high
[1]);
6370 emit_insn (gen_ashr3 (high
[0], high
[0],
6371 GEN_INT (half_width
- 1)));
6372 emit_move_insn (low
[0], high
[0]);
6375 else if (count
>= half_width
)
6377 emit_move_insn (low
[0], high
[1]);
6378 emit_move_insn (high
[0], low
[0]);
6379 emit_insn (gen_ashr3 (high
[0], high
[0],
6380 GEN_INT (half_width
- 1)));
6382 if (count
> half_width
)
6383 emit_insn (gen_ashr3 (low
[0], low
[0],
6384 GEN_INT (count
- half_width
)));
6388 gen_shrd
= mode
== DImode
? gen_x86_shrd
: gen_x86_64_shrd
;
6390 if (!rtx_equal_p (operands
[0], operands
[1]))
6391 emit_move_insn (operands
[0], operands
[1]);
6393 emit_insn (gen_shrd (low
[0], high
[0], GEN_INT (count
)));
6394 emit_insn (gen_ashr3 (high
[0], high
[0], GEN_INT (count
)));
6399 machine_mode half_mode
;
6401 gen_shrd
= mode
== DImode
? gen_x86_shrd
: gen_x86_64_shrd
;
6403 if (!rtx_equal_p (operands
[0], operands
[1]))
6404 emit_move_insn (operands
[0], operands
[1]);
6406 split_double_mode (mode
, operands
, 1, low
, high
);
6407 half_mode
= mode
== DImode
? SImode
: DImode
;
6409 emit_insn (gen_shrd (low
[0], high
[0], operands
[2]));
6410 emit_insn (gen_ashr3 (high
[0], high
[0], operands
[2]));
6412 if (TARGET_CMOVE
&& scratch
)
6414 emit_move_insn (scratch
, high
[0]);
6415 emit_insn (gen_ashr3 (scratch
, scratch
,
6416 GEN_INT (half_width
- 1)));
6417 emit_insn (gen_x86_shift_adj_1
6418 (half_mode
, low
[0], high
[0], operands
[2], scratch
));
6421 emit_insn (gen_x86_shift_adj_3
6422 (half_mode
, low
[0], high
[0], operands
[2]));
6427 ix86_split_lshr (rtx
*operands
, rtx scratch
, machine_mode mode
)
6429 rtx (*gen_lshr3
)(rtx
, rtx
, rtx
)
6430 = mode
== DImode
? gen_lshrsi3
: gen_lshrdi3
;
6431 rtx (*gen_shrd
)(rtx
, rtx
, rtx
);
6432 int half_width
= GET_MODE_BITSIZE (mode
) >> 1;
6434 rtx low
[2], high
[2];
6437 if (CONST_INT_P (operands
[2]))
6439 split_double_mode (mode
, operands
, 2, low
, high
);
6440 count
= INTVAL (operands
[2]) & (GET_MODE_BITSIZE (mode
) - 1);
6442 if (count
>= half_width
)
6444 emit_move_insn (low
[0], high
[1]);
6445 ix86_expand_clear (high
[0]);
6447 if (count
> half_width
)
6448 emit_insn (gen_lshr3 (low
[0], low
[0],
6449 GEN_INT (count
- half_width
)));
6453 gen_shrd
= mode
== DImode
? gen_x86_shrd
: gen_x86_64_shrd
;
6455 if (!rtx_equal_p (operands
[0], operands
[1]))
6456 emit_move_insn (operands
[0], operands
[1]);
6458 emit_insn (gen_shrd (low
[0], high
[0], GEN_INT (count
)));
6459 emit_insn (gen_lshr3 (high
[0], high
[0], GEN_INT (count
)));
6464 machine_mode half_mode
;
6466 gen_shrd
= mode
== DImode
? gen_x86_shrd
: gen_x86_64_shrd
;
6468 if (!rtx_equal_p (operands
[0], operands
[1]))
6469 emit_move_insn (operands
[0], operands
[1]);
6471 split_double_mode (mode
, operands
, 1, low
, high
);
6472 half_mode
= mode
== DImode
? SImode
: DImode
;
6474 emit_insn (gen_shrd (low
[0], high
[0], operands
[2]));
6475 emit_insn (gen_lshr3 (high
[0], high
[0], operands
[2]));
6477 if (TARGET_CMOVE
&& scratch
)
6479 ix86_expand_clear (scratch
);
6480 emit_insn (gen_x86_shift_adj_1
6481 (half_mode
, low
[0], high
[0], operands
[2], scratch
));
6484 emit_insn (gen_x86_shift_adj_2
6485 (half_mode
, low
[0], high
[0], operands
[2]));
6489 /* Expand move of V1TI mode register X to a new TI mode register. */
6491 ix86_expand_v1ti_to_ti (rtx x
)
6493 rtx result
= gen_reg_rtx (TImode
);
6496 rtx temp
= force_reg (V2DImode
, gen_lowpart (V2DImode
, x
));
6497 rtx lo
= gen_lowpart (DImode
, result
);
6498 emit_insn (gen_vec_extractv2didi (lo
, temp
, const0_rtx
));
6499 rtx hi
= gen_highpart (DImode
, result
);
6500 emit_insn (gen_vec_extractv2didi (hi
, temp
, const1_rtx
));
6503 emit_move_insn (result
, gen_lowpart (TImode
, x
));
6507 /* Expand move of TI mode register X to a new V1TI mode register. */
6509 ix86_expand_ti_to_v1ti (rtx x
)
6513 rtx lo
= gen_lowpart (DImode
, x
);
6514 rtx hi
= gen_highpart (DImode
, x
);
6515 rtx tmp
= gen_reg_rtx (V2DImode
);
6516 emit_insn (gen_vec_concatv2di (tmp
, lo
, hi
));
6517 return force_reg (V1TImode
, gen_lowpart (V1TImode
, tmp
));
6520 return force_reg (V1TImode
, gen_lowpart (V1TImode
, x
));
6523 /* Expand V1TI mode shift (of rtx_code CODE) by constant. */
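/* Editorial sketch of the general constant case handled below (illustrative,
   for a left shift with 0 < BITS < 64 on the value hi:lo):

       hi = (hi << BITS) | (lo >> (64 - BITS));
       lo =  lo << BITS;

   psllq/psrlq provide the per-lane 64-bit shifts and a byte-wise pslldq by
   8 provides the lane holding the bits that cross the 64-bit boundary;
   counts that are multiples of 8 use a single pslldq/psrldq instead.  */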
6525 ix86_expand_v1ti_shift (enum rtx_code code
, rtx operands
[])
6527 rtx op1
= force_reg (V1TImode
, operands
[1]);
6529 if (!CONST_INT_P (operands
[2]))
6531 rtx tmp1
= ix86_expand_v1ti_to_ti (op1
);
6532 rtx tmp2
= gen_reg_rtx (TImode
);
6533 rtx (*shift
) (rtx
, rtx
, rtx
)
6534 = (code
== ASHIFT
) ? gen_ashlti3
: gen_lshrti3
;
6535 emit_insn (shift (tmp2
, tmp1
, operands
[2]));
6536 rtx tmp3
= ix86_expand_ti_to_v1ti (tmp2
);
6537 emit_move_insn (operands
[0], tmp3
);
6541 HOST_WIDE_INT bits
= INTVAL (operands
[2]) & 127;
6545 emit_move_insn (operands
[0], op1
);
6549 if ((bits
& 7) == 0)
6551 rtx tmp
= gen_reg_rtx (V1TImode
);
6553 emit_insn (gen_sse2_ashlv1ti3 (tmp
, op1
, GEN_INT (bits
)));
6555 emit_insn (gen_sse2_lshrv1ti3 (tmp
, op1
, GEN_INT (bits
)));
6556 emit_move_insn (operands
[0], tmp
);
6560 rtx tmp1
= gen_reg_rtx (V1TImode
);
6562 emit_insn (gen_sse2_ashlv1ti3 (tmp1
, op1
, GEN_INT (64)));
6564 emit_insn (gen_sse2_lshrv1ti3 (tmp1
, op1
, GEN_INT (64)));
6566 /* tmp2 is operands[1] shifted by 64, in V2DImode. */
6567 rtx tmp2
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp1
));
6569 /* tmp3 will be the V2DImode result. */
6570 rtx tmp3
= gen_reg_rtx (V2DImode
);
6575 emit_insn (gen_ashlv2di3 (tmp3
, tmp2
, GEN_INT (bits
- 64)));
6577 emit_insn (gen_lshrv2di3 (tmp3
, tmp2
, GEN_INT (bits
- 64)));
6581 /* tmp4 is operands[1], in V2DImode. */
6582 rtx tmp4
= force_reg (V2DImode
, gen_lowpart (V2DImode
, op1
));
6584 rtx tmp5
= gen_reg_rtx (V2DImode
);
6586 emit_insn (gen_ashlv2di3 (tmp5
, tmp4
, GEN_INT (bits
)));
6588 emit_insn (gen_lshrv2di3 (tmp5
, tmp4
, GEN_INT (bits
)));
6590 rtx tmp6
= gen_reg_rtx (V2DImode
);
6592 emit_insn (gen_lshrv2di3 (tmp6
, tmp2
, GEN_INT (64 - bits
)));
6594 emit_insn (gen_ashlv2di3 (tmp6
, tmp2
, GEN_INT (64 - bits
)));
6596 emit_insn (gen_iorv2di3 (tmp3
, tmp5
, tmp6
));
6599 /* Convert the result back to V1TImode and store in operands[0]. */
6600 rtx tmp7
= force_reg (V1TImode
, gen_lowpart (V1TImode
, tmp3
));
6601 emit_move_insn (operands
[0], tmp7
);
6604 /* Expand V1TI mode rotate (of rtx_code CODE) by constant. */
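/* Editorial note (illustrative): when the rotate count is a multiple of 32
   the whole operation is just a permutation of the four 32-bit words, so a
   single pshufd suffices; e.g. rotating { w3 w2 w1 w0 } left by 32 yields
   { w2 w1 w0 w3 }, i.e. pshufd control 0x93.  */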
6606 ix86_expand_v1ti_rotate (enum rtx_code code
, rtx operands
[])
6608 rtx op1
= force_reg (V1TImode
, operands
[1]);
6610 if (!CONST_INT_P (operands
[2]))
6612 rtx tmp1
= ix86_expand_v1ti_to_ti (op1
);
6613 rtx tmp2
= gen_reg_rtx (TImode
);
6614 rtx (*rotate
) (rtx
, rtx
, rtx
)
6615 = (code
== ROTATE
) ? gen_rotlti3
: gen_rotrti3
;
6616 emit_insn (rotate (tmp2
, tmp1
, operands
[2]));
6617 rtx tmp3
= ix86_expand_ti_to_v1ti (tmp2
);
6618 emit_move_insn (operands
[0], tmp3
);
6622 HOST_WIDE_INT bits
= INTVAL (operands
[2]) & 127;
6626 emit_move_insn (operands
[0], op1
);
6630 if (code
== ROTATERT
)
6633 if ((bits
& 31) == 0)
6635 rtx tmp2
= gen_reg_rtx (V4SImode
);
6636 rtx tmp1
= force_reg (V4SImode
, gen_lowpart (V4SImode
, op1
));
6638 emit_insn (gen_sse2_pshufd (tmp2
, tmp1
, GEN_INT (0x93)));
6639 else if (bits
== 64)
6640 emit_insn (gen_sse2_pshufd (tmp2
, tmp1
, GEN_INT (0x4e)));
6642 emit_insn (gen_sse2_pshufd (tmp2
, tmp1
, GEN_INT (0x39)));
6643 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp2
));
6647 if ((bits
& 7) == 0)
6649 rtx tmp1
= gen_reg_rtx (V1TImode
);
6650 rtx tmp2
= gen_reg_rtx (V1TImode
);
6651 rtx tmp3
= gen_reg_rtx (V1TImode
);
6653 emit_insn (gen_sse2_ashlv1ti3 (tmp1
, op1
, GEN_INT (bits
)));
6654 emit_insn (gen_sse2_lshrv1ti3 (tmp2
, op1
, GEN_INT (128 - bits
)));
6655 emit_insn (gen_iorv1ti3 (tmp3
, tmp1
, tmp2
));
6656 emit_move_insn (operands
[0], tmp3
);
6660 rtx op1_v4si
= force_reg (V4SImode
, gen_lowpart (V4SImode
, op1
));
6669 hibits
= gen_reg_rtx (V4SImode
);
6670 emit_insn (gen_sse2_pshufd (hibits
, op1_v4si
, GEN_INT (0x93)));
6674 lobits
= gen_reg_rtx (V4SImode
);
6675 hibits
= gen_reg_rtx (V4SImode
);
6676 emit_insn (gen_sse2_pshufd (lobits
, op1_v4si
, GEN_INT (0x93)));
6677 emit_insn (gen_sse2_pshufd (hibits
, op1_v4si
, GEN_INT (0x4e)));
6681 lobits
= gen_reg_rtx (V4SImode
);
6682 hibits
= gen_reg_rtx (V4SImode
);
6683 emit_insn (gen_sse2_pshufd (lobits
, op1_v4si
, GEN_INT (0x4e)));
6684 emit_insn (gen_sse2_pshufd (hibits
, op1_v4si
, GEN_INT (0x39)));
6688 lobits
= gen_reg_rtx (V4SImode
);
6689 emit_insn (gen_sse2_pshufd (lobits
, op1_v4si
, GEN_INT (0x39)));
6694 rtx tmp1
= gen_reg_rtx (V4SImode
);
6695 rtx tmp2
= gen_reg_rtx (V4SImode
);
6696 rtx tmp3
= gen_reg_rtx (V4SImode
);
6698 emit_insn (gen_ashlv4si3 (tmp1
, lobits
, GEN_INT (bits
& 31)));
6699 emit_insn (gen_lshrv4si3 (tmp2
, hibits
, GEN_INT (32 - (bits
& 31))));
6700 emit_insn (gen_iorv4si3 (tmp3
, tmp1
, tmp2
));
6702 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp3
));
6705 /* Expand V1TI mode ashiftrt by constant. */
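/* Editorial sketch (illustrative): SSE2 has no 128-bit (or 64-bit)
   arithmetic right shift, so the expansions below combine a logical
   128-bit shift with a splatted sign mask,

       sign   = (int32_t) top_word >> 31;		 psrad by 31
       result = (x >>logical BITS) | (sign_mask << (128 - BITS));

   the many special-cased counts are just cheaper ways of producing the
   same value.  */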
6707 ix86_expand_v1ti_ashiftrt (rtx operands
[])
6709 rtx op1
= force_reg (V1TImode
, operands
[1]);
6711 if (!CONST_INT_P (operands
[2]))
6713 rtx tmp1
= ix86_expand_v1ti_to_ti (op1
);
6714 rtx tmp2
= gen_reg_rtx (TImode
);
6715 emit_insn (gen_ashrti3 (tmp2
, tmp1
, operands
[2]));
6716 rtx tmp3
= ix86_expand_ti_to_v1ti (tmp2
);
6717 emit_move_insn (operands
[0], tmp3
);
6721 HOST_WIDE_INT bits
= INTVAL (operands
[2]) & 127;
6725 emit_move_insn (operands
[0], op1
);
6731 /* Two operations. */
6732 rtx tmp1
= force_reg(V4SImode
, gen_lowpart (V4SImode
, op1
));
6733 rtx tmp2
= gen_reg_rtx (V4SImode
);
6734 emit_insn (gen_sse2_pshufd (tmp2
, tmp1
, GEN_INT (0xff)));
6736 rtx tmp3
= gen_reg_rtx (V4SImode
);
6737 emit_insn (gen_ashrv4si3 (tmp3
, tmp2
, GEN_INT (31)));
6739 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp3
));
6745 /* Three operations. */
6746 rtx tmp1
= force_reg(V4SImode
, gen_lowpart (V4SImode
, op1
));
6747 rtx tmp2
= gen_reg_rtx (V4SImode
);
6748 emit_insn (gen_sse2_pshufd (tmp2
, tmp1
, GEN_INT (0xff)));
6750 rtx tmp3
= gen_reg_rtx (V4SImode
);
6751 emit_insn (gen_ashrv4si3 (tmp3
, tmp2
, GEN_INT (31)));
6753 rtx tmp4
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp1
));
6754 rtx tmp5
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp3
));
6755 rtx tmp6
= gen_reg_rtx (V2DImode
);
6756 emit_insn (gen_vec_interleave_highv2di (tmp6
, tmp4
, tmp5
));
6758 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp6
));
6764 /* Three operations. */
6765 rtx tmp1
= force_reg(V4SImode
, gen_lowpart (V4SImode
, op1
));
6766 rtx tmp2
= gen_reg_rtx (V4SImode
);
6767 emit_insn (gen_ashrv4si3 (tmp2
, tmp1
, GEN_INT (31)));
6769 rtx tmp3
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp1
));
6770 rtx tmp4
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp2
));
6771 rtx tmp5
= gen_reg_rtx (V2DImode
);
6772 emit_insn (gen_vec_interleave_highv2di (tmp5
, tmp3
, tmp4
));
6774 rtx tmp6
= force_reg(V4SImode
, gen_lowpart (V4SImode
, tmp5
));
6775 rtx tmp7
= gen_reg_rtx (V4SImode
);
6776 emit_insn (gen_sse2_pshufd (tmp7
, tmp6
, GEN_INT (0xfd)));
6778 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp7
));
6784 /* Three operations. */
6785 rtx tmp1
= force_reg (V4SImode
, gen_lowpart (V4SImode
, op1
));
6786 rtx tmp2
= gen_reg_rtx (V4SImode
);
6787 emit_insn (gen_ashrv4si3 (tmp2
, tmp1
, GEN_INT (bits
- 96)));
6789 rtx tmp3
= force_reg (V8HImode
, gen_lowpart (V8HImode
, tmp2
));
6790 rtx tmp4
= gen_reg_rtx (V8HImode
);
6791 emit_insn (gen_sse2_pshufhw (tmp4
, tmp3
, GEN_INT (0xfe)));
6793 rtx tmp5
= force_reg (V4SImode
, gen_lowpart (V4SImode
, tmp4
));
6794 rtx tmp6
= gen_reg_rtx (V4SImode
);
6795 emit_insn (gen_sse2_pshufd (tmp6
, tmp5
, GEN_INT (0xfe)));
6797 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp6
));
6801 if (TARGET_AVX2
|| TARGET_SSE4_1
)
6803 /* Three operations. */
6806 rtx tmp1
= force_reg (V4SImode
, gen_lowpart (V4SImode
, op1
));
6807 rtx tmp2
= gen_reg_rtx (V4SImode
);
6808 emit_insn (gen_ashrv4si3 (tmp2
, tmp1
, GEN_INT (31)));
6810 rtx tmp3
= gen_reg_rtx (V1TImode
);
6811 emit_insn (gen_sse2_lshrv1ti3 (tmp3
, op1
, GEN_INT (32)));
6815 rtx tmp4
= force_reg (V4SImode
, gen_lowpart (V4SImode
, tmp3
));
6816 rtx tmp5
= gen_reg_rtx (V4SImode
);
6817 emit_insn (gen_avx2_pblenddv4si (tmp5
, tmp2
, tmp4
,
6820 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp5
));
6824 rtx tmp4
= force_reg (V8HImode
, gen_lowpart (V8HImode
, tmp2
));
6825 rtx tmp5
= force_reg (V8HImode
, gen_lowpart (V8HImode
, tmp3
));
6826 rtx tmp6
= gen_reg_rtx (V8HImode
);
6827 emit_insn (gen_sse4_1_pblendw (tmp6
, tmp4
, tmp5
,
6830 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp6
));
6835 /* Three operations. */
6836 if (bits
== 8 || bits
== 16 || bits
== 24)
6838 rtx tmp1
= force_reg (V4SImode
, gen_lowpart (V4SImode
, op1
));
6839 rtx tmp2
= gen_reg_rtx (V4SImode
);
6840 emit_insn (gen_ashrv4si3 (tmp2
, tmp1
, GEN_INT (bits
)));
6842 rtx tmp3
= gen_reg_rtx (V1TImode
);
6843 emit_insn (gen_sse2_lshrv1ti3 (tmp3
, op1
, GEN_INT (bits
)));
6847 rtx tmp4
= force_reg (V4SImode
, gen_lowpart (V4SImode
, tmp3
));
6848 rtx tmp5
= gen_reg_rtx (V4SImode
);
6849 emit_insn (gen_avx2_pblenddv4si (tmp5
, tmp2
, tmp4
,
6852 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp5
));
6856 rtx tmp4
= force_reg (V8HImode
, gen_lowpart (V8HImode
, tmp2
));
6857 rtx tmp5
= force_reg (V8HImode
, gen_lowpart (V8HImode
, tmp3
));
6858 rtx tmp6
= gen_reg_rtx (V8HImode
);
6859 emit_insn (gen_sse4_1_pblendw (tmp6
, tmp4
, tmp5
,
6862 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp6
));
6870 /* Four operations. */
6871 rtx tmp1
= force_reg (V4SImode
, gen_lowpart (V4SImode
, op1
));
6872 rtx tmp2
= gen_reg_rtx (V4SImode
);
6873 emit_insn (gen_ashrv4si3 (tmp2
, tmp1
, GEN_INT (bits
- 96)));
6875 rtx tmp3
= gen_reg_rtx (V4SImode
);
6876 emit_insn (gen_ashrv4si3 (tmp3
, tmp1
, GEN_INT (31)));
6878 rtx tmp4
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp2
));
6879 rtx tmp5
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp3
));
6880 rtx tmp6
= gen_reg_rtx (V2DImode
);
6881 emit_insn (gen_vec_interleave_highv2di (tmp6
, tmp4
, tmp5
));
6883 rtx tmp7
= force_reg (V4SImode
, gen_lowpart (V4SImode
, tmp6
));
6884 rtx tmp8
= gen_reg_rtx (V4SImode
);
6885 emit_insn (gen_sse2_pshufd (tmp8
, tmp7
, GEN_INT (0xfd)));
6887 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp8
));
6891 if (TARGET_SSE4_1
&& (bits
== 48 || bits
== 80))
6893 /* Four operations. */
6894 rtx tmp1
= force_reg (V4SImode
, gen_lowpart (V4SImode
, op1
));
6895 rtx tmp2
= gen_reg_rtx (V4SImode
);
6896 emit_insn (gen_sse2_pshufd (tmp2
, tmp1
, GEN_INT (0xff)));
6898 rtx tmp3
= gen_reg_rtx (V4SImode
);
6899 emit_insn (gen_ashrv4si3 (tmp3
, tmp2
, GEN_INT (31)));
6901 rtx tmp4
= gen_reg_rtx (V1TImode
);
6902 emit_insn (gen_sse2_lshrv1ti3 (tmp4
, op1
, GEN_INT (bits
)));
6904 rtx tmp5
= force_reg (V8HImode
, gen_lowpart (V8HImode
, tmp3
));
6905 rtx tmp6
= force_reg (V8HImode
, gen_lowpart (V8HImode
, tmp4
));
6906 rtx tmp7
= gen_reg_rtx (V8HImode
);
6907 emit_insn (gen_sse4_1_pblendw (tmp7
, tmp5
, tmp6
,
6908 GEN_INT (bits
== 48 ? 0x1f : 0x07)));
6910 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp7
));
6914 if ((bits
& 7) == 0)
6916 /* Five operations. */
6917 rtx tmp1
= force_reg (V4SImode
, gen_lowpart (V4SImode
, op1
));
6918 rtx tmp2
= gen_reg_rtx (V4SImode
);
6919 emit_insn (gen_sse2_pshufd (tmp2
, tmp1
, GEN_INT (0xff)));
6921 rtx tmp3
= gen_reg_rtx (V4SImode
);
6922 emit_insn (gen_ashrv4si3 (tmp3
, tmp2
, GEN_INT (31)));
6924 rtx tmp4
= gen_reg_rtx (V1TImode
);
6925 emit_insn (gen_sse2_lshrv1ti3 (tmp4
, op1
, GEN_INT (bits
)));
6927 rtx tmp5
= force_reg (V1TImode
, gen_lowpart (V1TImode
, tmp3
));
6928 rtx tmp6
= gen_reg_rtx (V1TImode
);
6929 emit_insn (gen_sse2_ashlv1ti3 (tmp6
, tmp5
, GEN_INT (128 - bits
)));
6931 rtx tmp7
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp4
));
6932 rtx tmp8
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp6
));
6933 rtx tmp9
= gen_reg_rtx (V2DImode
);
6934 emit_insn (gen_iorv2di3 (tmp9
, tmp7
, tmp8
));
6936 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp9
));
6940 if (TARGET_AVX2
&& bits
< 32)
6942 /* Six operations. */
6943 rtx tmp1
= force_reg (V4SImode
, gen_lowpart (V4SImode
, op1
));
6944 rtx tmp2
= gen_reg_rtx (V4SImode
);
6945 emit_insn (gen_ashrv4si3 (tmp2
, tmp1
, GEN_INT (bits
)));
6947 rtx tmp3
= gen_reg_rtx (V1TImode
);
6948 emit_insn (gen_sse2_lshrv1ti3 (tmp3
, op1
, GEN_INT (64)));
6950 rtx tmp4
= force_reg (V2DImode
, gen_lowpart (V2DImode
, op1
));
6951 rtx tmp5
= gen_reg_rtx (V2DImode
);
6952 emit_insn (gen_lshrv2di3 (tmp5
, tmp4
, GEN_INT (bits
)));
6954 rtx tmp6
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp3
));
6955 rtx tmp7
= gen_reg_rtx (V2DImode
);
6956 emit_insn (gen_ashlv2di3 (tmp7
, tmp6
, GEN_INT (64 - bits
)));
6958 rtx tmp8
= gen_reg_rtx (V2DImode
);
6959 emit_insn (gen_iorv2di3 (tmp8
, tmp5
, tmp7
));
6961 rtx tmp9
= force_reg (V4SImode
, gen_lowpart (V4SImode
, tmp8
));
6962 rtx tmp10
= gen_reg_rtx (V4SImode
);
6963 emit_insn (gen_avx2_pblenddv4si (tmp10
, tmp2
, tmp9
, GEN_INT (7)));
6965 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp10
));
6969 if (TARGET_SSE4_1
&& bits
< 15)
6971 /* Six operations. */
6972 rtx tmp1
= force_reg (V4SImode
, gen_lowpart (V4SImode
, op1
));
6973 rtx tmp2
= gen_reg_rtx (V4SImode
);
6974 emit_insn (gen_ashrv4si3 (tmp2
, tmp1
, GEN_INT (bits
)));
6976 rtx tmp3
= gen_reg_rtx (V1TImode
);
6977 emit_insn (gen_sse2_lshrv1ti3 (tmp3
, op1
, GEN_INT (64)));
6979 rtx tmp4
= force_reg (V2DImode
, gen_lowpart (V2DImode
, op1
));
6980 rtx tmp5
= gen_reg_rtx (V2DImode
);
6981 emit_insn (gen_lshrv2di3 (tmp5
, tmp4
, GEN_INT (bits
)));
6983 rtx tmp6
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp3
));
6984 rtx tmp7
= gen_reg_rtx (V2DImode
);
6985 emit_insn (gen_ashlv2di3 (tmp7
, tmp6
, GEN_INT (64 - bits
)));
6987 rtx tmp8
= gen_reg_rtx (V2DImode
);
6988 emit_insn (gen_iorv2di3 (tmp8
, tmp5
, tmp7
));
6990 rtx tmp9
= force_reg (V8HImode
, gen_lowpart (V8HImode
, tmp2
));
6991 rtx tmp10
= force_reg (V8HImode
, gen_lowpart (V8HImode
, tmp8
));
6992 rtx tmp11
= gen_reg_rtx (V8HImode
);
6993 emit_insn (gen_sse4_1_pblendw (tmp11
, tmp9
, tmp10
, GEN_INT (0x3f)));
6995 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp11
));
7001 /* Eight operations. */
7002 rtx tmp1
= gen_reg_rtx (V1TImode
);
7003 emit_insn (gen_sse2_lshrv1ti3 (tmp1
, op1
, GEN_INT (64)));
7005 rtx tmp2
= force_reg (V2DImode
, gen_lowpart (V2DImode
, op1
));
7006 rtx tmp3
= gen_reg_rtx (V2DImode
);
7007 emit_insn (gen_lshrv2di3 (tmp3
, tmp2
, GEN_INT (1)));
7009 rtx tmp4
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp1
));
7010 rtx tmp5
= gen_reg_rtx (V2DImode
);
7011 emit_insn (gen_ashlv2di3 (tmp5
, tmp4
, GEN_INT (63)));
7013 rtx tmp6
= gen_reg_rtx (V2DImode
);
7014 emit_insn (gen_iorv2di3 (tmp6
, tmp3
, tmp5
));
7016 rtx tmp7
= gen_reg_rtx (V2DImode
);
7017 emit_insn (gen_lshrv2di3 (tmp7
, tmp2
, GEN_INT (63)));
7019 rtx tmp8
= force_reg (V4SImode
, gen_lowpart (V4SImode
, tmp7
));
7020 rtx tmp9
= gen_reg_rtx (V4SImode
);
7021 emit_insn (gen_sse2_pshufd (tmp9
, tmp8
, GEN_INT (0xbf)));
7023 rtx tmp10
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp9
));
7024 rtx tmp11
= gen_reg_rtx (V2DImode
);
7025 emit_insn (gen_ashlv2di3 (tmp11
, tmp10
, GEN_INT (31)));
7027 rtx tmp12
= gen_reg_rtx (V2DImode
);
7028 emit_insn (gen_iorv2di3 (tmp12
, tmp6
, tmp11
));
7030 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp12
));
7036 /* Eight operations. */
7037 rtx tmp1
= force_reg (V4SImode
, gen_lowpart (V4SImode
, op1
));
7038 rtx tmp2
= gen_reg_rtx (V4SImode
);
7039 emit_insn (gen_sse2_pshufd (tmp2
, tmp1
, GEN_INT (0xff)));
7041 rtx tmp3
= gen_reg_rtx (V4SImode
);
7042 emit_insn (gen_ashrv4si3 (tmp3
, tmp2
, GEN_INT (31)));
7044 rtx tmp4
= gen_reg_rtx (V1TImode
);
7045 emit_insn (gen_sse2_lshrv1ti3 (tmp4
, op1
, GEN_INT (64)));
7047 rtx tmp5
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp4
));
7048 rtx tmp6
= gen_reg_rtx (V2DImode
);
7049 emit_insn (gen_lshrv2di3 (tmp6
, tmp5
, GEN_INT (bits
- 64)));
7051 rtx tmp7
= force_reg (V1TImode
, gen_lowpart (V1TImode
, tmp3
));
7052 rtx tmp8
= gen_reg_rtx (V1TImode
);
7053 emit_insn (gen_sse2_ashlv1ti3 (tmp8
, tmp7
, GEN_INT (64)));
7055 rtx tmp9
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp3
));
7056 rtx tmp10
= gen_reg_rtx (V2DImode
);
7057 emit_insn (gen_ashlv2di3 (tmp10
, tmp9
, GEN_INT (128 - bits
)));
7059 rtx tmp11
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp8
));
7060 rtx tmp12
= gen_reg_rtx (V2DImode
);
7061 emit_insn (gen_iorv2di3 (tmp12
, tmp10
, tmp11
));
7063 rtx tmp13
= gen_reg_rtx (V2DImode
);
7064 emit_insn (gen_iorv2di3 (tmp13
, tmp6
, tmp12
));
7066 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp13
));
7070 /* Nine operations. */
7071 rtx tmp1
= force_reg (V4SImode
, gen_lowpart (V4SImode
, op1
));
7072 rtx tmp2
= gen_reg_rtx (V4SImode
);
7073 emit_insn (gen_sse2_pshufd (tmp2
, tmp1
, GEN_INT (0xff)));
7075 rtx tmp3
= gen_reg_rtx (V4SImode
);
7076 emit_insn (gen_ashrv4si3 (tmp3
, tmp2
, GEN_INT (31)));
7078 rtx tmp4
= gen_reg_rtx (V1TImode
);
7079 emit_insn (gen_sse2_lshrv1ti3 (tmp4
, op1
, GEN_INT (64)));
7081 rtx tmp5
= force_reg (V2DImode
, gen_lowpart (V2DImode
, op1
));
7082 rtx tmp6
= gen_reg_rtx (V2DImode
);
7083 emit_insn (gen_lshrv2di3 (tmp6
, tmp5
, GEN_INT (bits
)));
7085 rtx tmp7
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp4
));
7086 rtx tmp8
= gen_reg_rtx (V2DImode
);
7087 emit_insn (gen_ashlv2di3 (tmp8
, tmp7
, GEN_INT (64 - bits
)));
7089 rtx tmp9
= gen_reg_rtx (V2DImode
);
7090 emit_insn (gen_iorv2di3 (tmp9
, tmp6
, tmp8
));
7092 rtx tmp10
= force_reg (V1TImode
, gen_lowpart (V1TImode
, tmp3
));
7093 rtx tmp11
= gen_reg_rtx (V1TImode
);
7094 emit_insn (gen_sse2_ashlv1ti3 (tmp11
, tmp10
, GEN_INT (64)));
7096 rtx tmp12
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp11
));
7097 rtx tmp13
= gen_reg_rtx (V2DImode
);
7098 emit_insn (gen_ashlv2di3 (tmp13
, tmp12
, GEN_INT (64 - bits
)));
7100 rtx tmp14
= gen_reg_rtx (V2DImode
);
7101 emit_insn (gen_iorv2di3 (tmp14
, tmp9
, tmp13
));
7103 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp14
));
7107 /* Replace all occurrences of REG FROM with REG TO in X, including
7108 occurrences with different modes. */
7111 ix86_replace_reg_with_reg (rtx x
, rtx from
, rtx to
)
7113 gcc_checking_assert (REG_P (from
)
7115 && GET_MODE (from
) == GET_MODE (to
));
7116 if (!reg_overlap_mentioned_p (from
, x
))
7118 rtx ret
= copy_rtx (x
);
7119 subrtx_ptr_iterator::array_type array
;
7120 FOR_EACH_SUBRTX_PTR (iter
, array
, &ret
, NONCONST
)
7124 if (REG_P (x
) && REGNO (x
) == REGNO (from
))
7130 gcc_checking_assert (REG_NREGS (x
) == 1);
7131 *loc
= gen_rtx_REG (GET_MODE (x
), REGNO (to
));
/* Return the mode for the memcpy/memset loop counter.  Prefer SImode over
   DImode for constant loop counts.  */

counter_mode (rtx count_exp)
  if (GET_MODE (count_exp) != VOIDmode)
    return GET_MODE (count_exp);
  if (!CONST_INT_P (count_exp))
    return Pmode;
  if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
    return DImode;
  return SImode;
/* When ISSETMEM is FALSE, output a simple loop to copy memory pointed to by
   SRCPTR to DESTPTR via chunks of MODE unrolled UNROLL times; the overall
   size is COUNT specified in bytes.  When ISSETMEM is TRUE, output the
   equivalent loop to set memory to VALUE (supposed to be in MODE).

   The size is rounded down to a whole number of chunks moved at once.
   SRCMEM and DESTMEM provide MEM rtx to feed proper aliasing info.  */
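/* Rough C model of the emitted loop (editorial, illustrative only; the real
   code below works on RTL and preserves the aliasing info of the MEMs):

       size = count & ~(piece - 1);	piece = GET_MODE_SIZE (mode) * unroll
       for (iter = 0; iter < size; iter += piece)
	 for (i = 0; i < unroll; i++)
	   if (issetmem)
	     chunk_at (dest, iter + i * chunk_size) = value;
	   else
	     chunk_at (dest, iter + i * chunk_size)
	       = chunk_at (src, iter + i * chunk_size);
       dest += size;
       if (!issetmem)
	 src += size;

   where chunk_at () stands for a MODE-sized access at the given offset and
   chunk_size for GET_MODE_SIZE (mode).  */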
7163 expand_set_or_cpymem_via_loop (rtx destmem
, rtx srcmem
,
7164 rtx destptr
, rtx srcptr
, rtx value
,
7165 rtx count
, machine_mode mode
, int unroll
,
7166 int expected_size
, bool issetmem
)
7168 rtx_code_label
*out_label
, *top_label
;
7170 machine_mode iter_mode
= counter_mode (count
);
7171 int piece_size_n
= GET_MODE_SIZE (mode
) * unroll
;
7172 rtx piece_size
= GEN_INT (piece_size_n
);
7173 rtx piece_size_mask
= GEN_INT (~((GET_MODE_SIZE (mode
) * unroll
) - 1));
7177 top_label
= gen_label_rtx ();
7178 out_label
= gen_label_rtx ();
7179 iter
= gen_reg_rtx (iter_mode
);
7181 size
= expand_simple_binop (iter_mode
, AND
, count
, piece_size_mask
,
7182 NULL
, 1, OPTAB_DIRECT
);
7183 /* Those two should combine. */
7184 if (piece_size
== const1_rtx
)
7186 emit_cmp_and_jump_insns (size
, const0_rtx
, EQ
, NULL_RTX
, iter_mode
,
7188 predict_jump (REG_BR_PROB_BASE
* 10 / 100);
7190 emit_move_insn (iter
, const0_rtx
);
7192 emit_label (top_label
);
7194 tmp
= convert_modes (Pmode
, iter_mode
, iter
, true);
7196 /* This assert could be relaxed - in this case we'll need to compute
7197 smallest power of two, containing in PIECE_SIZE_N and pass it to
7199 gcc_assert ((piece_size_n
& (piece_size_n
- 1)) == 0);
7200 destmem
= offset_address (destmem
, tmp
, piece_size_n
);
7201 destmem
= adjust_address (destmem
, mode
, 0);
7205 srcmem
= offset_address (srcmem
, copy_rtx (tmp
), piece_size_n
);
7206 srcmem
= adjust_address (srcmem
, mode
, 0);
7208 /* When unrolling for chips that reorder memory reads and writes,
7209 we can save registers by using single temporary.
7210 Also using 4 temporaries is overkill in 32bit mode. */
7211 if (!TARGET_64BIT
&& 0)
7213 for (i
= 0; i
< unroll
; i
++)
7217 destmem
= adjust_address (copy_rtx (destmem
), mode
,
7218 GET_MODE_SIZE (mode
));
7219 srcmem
= adjust_address (copy_rtx (srcmem
), mode
,
7220 GET_MODE_SIZE (mode
));
7222 emit_move_insn (destmem
, srcmem
);
7228 gcc_assert (unroll
<= 4);
7229 for (i
= 0; i
< unroll
; i
++)
7231 tmpreg
[i
] = gen_reg_rtx (mode
);
7233 srcmem
= adjust_address (copy_rtx (srcmem
), mode
,
7234 GET_MODE_SIZE (mode
));
7235 emit_move_insn (tmpreg
[i
], srcmem
);
7237 for (i
= 0; i
< unroll
; i
++)
7240 destmem
= adjust_address (copy_rtx (destmem
), mode
,
7241 GET_MODE_SIZE (mode
));
7242 emit_move_insn (destmem
, tmpreg
[i
]);
7247 for (i
= 0; i
< unroll
; i
++)
7250 destmem
= adjust_address (copy_rtx (destmem
), mode
,
7251 GET_MODE_SIZE (mode
));
7252 emit_move_insn (destmem
, value
);
7255 tmp
= expand_simple_binop (iter_mode
, PLUS
, iter
, piece_size
, iter
,
7256 true, OPTAB_LIB_WIDEN
);
7258 emit_move_insn (iter
, tmp
);
7260 emit_cmp_and_jump_insns (iter
, size
, LT
, NULL_RTX
, iter_mode
,
7262 if (expected_size
!= -1)
7264 expected_size
/= GET_MODE_SIZE (mode
) * unroll
;
7265 if (expected_size
== 0)
7267 else if (expected_size
> REG_BR_PROB_BASE
)
7268 predict_jump (REG_BR_PROB_BASE
- 1);
7270 predict_jump (REG_BR_PROB_BASE
- (REG_BR_PROB_BASE
+ expected_size
/ 2)
7274 predict_jump (REG_BR_PROB_BASE
* 80 / 100);
7275 iter
= ix86_zero_extend_to_Pmode (iter
);
7276 tmp
= expand_simple_binop (Pmode
, PLUS
, destptr
, iter
, destptr
,
7277 true, OPTAB_LIB_WIDEN
);
7279 emit_move_insn (destptr
, tmp
);
7282 tmp
= expand_simple_binop (Pmode
, PLUS
, srcptr
, iter
, srcptr
,
7283 true, OPTAB_LIB_WIDEN
);
7285 emit_move_insn (srcptr
, tmp
);
7287 emit_label (out_label
);
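/* Illustrative sketch, not part of the original source: for a copy with
   MODE == SImode and UNROLL == 4 the loop emitted above has roughly the
   shape

       size = count & ~15;  iter = 0;
     top:
       dest[iter .. iter+15] = src[iter .. iter+15];   // four SImode moves
       iter += 16;
       if (iter < size) goto top;
       destptr += iter;  srcptr += iter;

   the epilogue emitted elsewhere then handles the remaining
   count & 15 bytes.  */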
/* Divide COUNTREG by SCALE.  */
static rtx
scale_counter (rtx countreg, int scale)
{
  rtx sc;

  if (scale == 1)
    return countreg;
  if (CONST_INT_P (countreg))
    return GEN_INT (INTVAL (countreg) / scale);
  gcc_assert (REG_P (countreg));

  sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
			    GEN_INT (exact_log2 (scale)),
			    NULL, 1, OPTAB_DIRECT);
  return sc;
}
/* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
   When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
   When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
   For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
   ORIG_VALUE is the original value passed to memset to fill the memory with.
   Other arguments have the same meaning as for the previous function.  */

static void
expand_set_or_cpymem_via_rep (rtx destmem, rtx srcmem,
			      rtx destptr, rtx srcptr, rtx value, rtx orig_value,
			      rtx count,
			      machine_mode mode, bool issetmem)
{
  rtx destexp;
  rtx srcexp;
  rtx countreg;
  HOST_WIDE_INT rounded_count;

  /* If possible, it is shorter to use rep movs.
     TODO: Maybe it is better to move this logic to decide_alg.  */
  if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
      && !TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
      && (!issetmem || orig_value == const0_rtx))
    mode = SImode;

  if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
    destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);

  countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
						       GET_MODE_SIZE (mode)));
  if (mode != QImode)
    {
      destexp = gen_rtx_ASHIFT (Pmode, countreg,
				GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
      destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
    }
  else
    destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
  if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
    {
      rounded_count
	= ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
      destmem = shallow_copy_rtx (destmem);
      set_mem_size (destmem, rounded_count);
    }
  else if (MEM_SIZE_KNOWN_P (destmem))
    clear_mem_size (destmem);

  if (issetmem)
    {
      value = force_reg (mode, gen_lowpart (mode, value));
      emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
    }
  else
    {
      if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
	srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
      if (mode != QImode)
	{
	  srcexp = gen_rtx_ASHIFT (Pmode, countreg,
				   GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
	  srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
	}
      else
	srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
      if (CONST_INT_P (count))
	{
	  rounded_count
	    = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
	  srcmem = shallow_copy_rtx (srcmem);
	  set_mem_size (srcmem, rounded_count);
	}
      else if (MEM_SIZE_KNOWN_P (srcmem))
	clear_mem_size (srcmem);

      emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
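/* Illustrative note, not part of the original source: for a 32-byte copy
   taken through this path with SImode the emitted instructions degenerate
   to roughly
       mov ecx, 8        ; count scaled by GET_MODE_SIZE (SImode)
       rep movsd
   with DESTEXP/SRCEXP describing the final pointer values so that the
   rep pattern's RTL is self-contained.  */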
/* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
   DESTMEM.
   SRC is passed by pointer to be updated on return.
   Return value is updated DST.  */
static rtx
emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
	     HOST_WIDE_INT size_to_move)
{
  rtx dst = destmem, src = *srcmem, tempreg;
  enum insn_code code;
  machine_mode move_mode;
  int piece_size, i;

  /* Find the widest mode in which we could perform moves.
     Start with the biggest power of 2 less than SIZE_TO_MOVE and half
     it until move of such size is supported.  */
  piece_size = 1 << floor_log2 (size_to_move);
  while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
	 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
    {
      gcc_assert (piece_size > 1);
      piece_size >>= 1;
    }

  /* Find the corresponding vector mode with the same size as MOVE_MODE.
     MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.).  */
  if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
    {
      int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
      if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
	  || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
	{
	  move_mode = word_mode;
	  piece_size = GET_MODE_SIZE (move_mode);
	  code = optab_handler (mov_optab, move_mode);
	}
    }
  gcc_assert (code != CODE_FOR_nothing);

  dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
  src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);

  /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZES moves.  */
  gcc_assert (size_to_move % piece_size == 0);

  for (i = 0; i < size_to_move; i += piece_size)
    {
      /* We move from memory to memory, so we'll need to do it via
	 a temporary register.  */
      tempreg = gen_reg_rtx (move_mode);
      emit_insn (GEN_FCN (code) (tempreg, src));
      emit_insn (GEN_FCN (code) (dst, tempreg));

      emit_move_insn (destptr,
		      plus_constant (Pmode, copy_rtx (destptr), piece_size));
      emit_move_insn (srcptr,
		      plus_constant (Pmode, copy_rtx (srcptr), piece_size));

      dst = adjust_automodify_address_nv (dst, move_mode, destptr,
					  piece_size);
      src = adjust_automodify_address_nv (src, move_mode, srcptr,
					  piece_size);
    }

  /* Update DST and SRC rtx.  */
  *srcmem = src;
  return dst;
}
/* Helper function for the string operations below.  Test whether VARIABLE
   has the VALUE bit clear (i.e. is aligned with respect to VALUE bytes).
   If so, jump to the returned label.  */
static rtx_code_label *
ix86_expand_aligntest (rtx variable, int value, bool epilogue)
{
  rtx_code_label *label = gen_label_rtx ();
  rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
  if (GET_MODE (variable) == DImode)
    emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
  else
    emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
  emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
			   1, label);
  if (epilogue)
    predict_jump (REG_BR_PROB_BASE * 50 / 100);
  else
    predict_jump (REG_BR_PROB_BASE * 90 / 100);
  return label;
}
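/* Usage note (illustrative, not from the original source): the epilogues
   below typically call this as
       rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
       ... emit a 4-byte move for the case where bit 2 of COUNT is set ...
       emit_label (label);
   so each power-of-two tail size is handled by one conditional block.  */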
/* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST.  */
static void
expand_cpymem_epilogue (rtx destmem, rtx srcmem,
			rtx destptr, rtx srcptr, rtx count, int max_size)
{
  rtx src, dest;
  if (CONST_INT_P (count))
    {
      HOST_WIDE_INT countval = INTVAL (count);
      HOST_WIDE_INT epilogue_size = countval % max_size;
      int i;

      /* For now MAX_SIZE should be a power of 2.  This assert could be
	 relaxed, but it'll require a bit more complicated epilogue
	 expanding.  */
      gcc_assert ((max_size & (max_size - 1)) == 0);
      for (i = max_size; i >= 1; i >>= 1)
	{
	  if (epilogue_size & i)
	    destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
	}
      return;
    }
  if (max_size > 8)
    {
      count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
				   count, 1, OPTAB_DIRECT);
      expand_set_or_cpymem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
				     count, QImode, 1, 4, false);
      return;
    }

  /* When there are stringops, we can cheaply increase dest and src pointers.
     Otherwise we save code size by maintaining offset (zero is readily
     available from preceding rep operation) and using x86 addressing modes.
   */
  if (TARGET_SINGLE_STRINGOP)
    {
      if (max_size > 4)
	{
	  rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
	  src = change_address (srcmem, SImode, srcptr);
	  dest = change_address (destmem, SImode, destptr);
	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	}
      if (max_size > 2)
	{
	  rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
	  src = change_address (srcmem, HImode, srcptr);
	  dest = change_address (destmem, HImode, destptr);
	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	}
      if (max_size > 1)
	{
	  rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
	  src = change_address (srcmem, QImode, srcptr);
	  dest = change_address (destmem, QImode, destptr);
	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	}
    }
  else
    {
      rtx offset = force_reg (Pmode, const0_rtx);
      rtx tmp;

      if (max_size > 4)
	{
	  rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
	  src = change_address (srcmem, SImode, srcptr);
	  dest = change_address (destmem, SImode, destptr);
	  emit_move_insn (dest, src);
	  tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
				     true, OPTAB_LIB_WIDEN);
	  if (tmp != offset)
	    emit_move_insn (offset, tmp);
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	}
      if (max_size > 2)
	{
	  rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
	  tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
	  src = change_address (srcmem, HImode, tmp);
	  tmp = gen_rtx_PLUS (Pmode, destptr, offset);
	  dest = change_address (destmem, HImode, tmp);
	  emit_move_insn (dest, src);
	  tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
				     true, OPTAB_LIB_WIDEN);
	  if (tmp != offset)
	    emit_move_insn (offset, tmp);
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	}
      if (max_size > 1)
	{
	  rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
	  tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
	  src = change_address (srcmem, QImode, tmp);
	  tmp = gen_rtx_PLUS (Pmode, destptr, offset);
	  dest = change_address (destmem, QImode, tmp);
	  emit_move_insn (dest, src);
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	}
    }
}
/* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
   with value PROMOTED_VAL.
   Return value is updated DST (destmem).  */
static rtx
emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
	     HOST_WIDE_INT size_to_move)
{
  rtx dst = destmem;
  enum insn_code code;
  machine_mode move_mode;
  int piece_size, i;

  /* Find the widest mode in which we could perform moves.
     Start with the biggest power of 2 less than SIZE_TO_MOVE and half
     it until move of such size is supported.  */
  move_mode = GET_MODE (promoted_val);
  if (move_mode == VOIDmode)
    move_mode = QImode;
  if (size_to_move < GET_MODE_SIZE (move_mode))
    {
      unsigned int move_bits = size_to_move * BITS_PER_UNIT;
      move_mode = int_mode_for_size (move_bits, 0).require ();
      promoted_val = gen_lowpart (move_mode, promoted_val);
    }
  piece_size = GET_MODE_SIZE (move_mode);
  code = optab_handler (mov_optab, move_mode);
  gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);

  dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);

  /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZES moves.  */
  gcc_assert (size_to_move % piece_size == 0);

  for (i = 0; i < size_to_move; i += piece_size)
    {
      if (piece_size <= GET_MODE_SIZE (word_mode))
	{
	  emit_insn (gen_strset (destptr, dst, promoted_val));
	  dst = adjust_automodify_address_nv (dst, move_mode, destptr,
					      piece_size);
	  continue;
	}

      emit_insn (GEN_FCN (code) (dst, promoted_val));

      emit_move_insn (destptr,
		      plus_constant (Pmode, copy_rtx (destptr), piece_size));

      dst = adjust_automodify_address_nv (dst, move_mode, destptr,
					  piece_size);
    }

  /* Update DST rtx.  */
  return dst;
}
/* Output code to set at most count & (max_size - 1) bytes starting by DEST.  */
static void
expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
				 rtx count, int max_size)
{
  count = expand_simple_binop (counter_mode (count), AND, count,
			       GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
  expand_set_or_cpymem_via_loop (destmem, NULL, destptr, NULL,
				 gen_lowpart (QImode, value), count, QImode,
				 1, max_size / 2, true);
}
/* Output code to set at most count & (max_size - 1) bytes starting by DEST.  */
static void
expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
			rtx count, int max_size)
{
  rtx dest;

  if (CONST_INT_P (count))
    {
      HOST_WIDE_INT countval = INTVAL (count);
      HOST_WIDE_INT epilogue_size = countval % max_size;
      int i;

      /* For now MAX_SIZE should be a power of 2.  This assert could be
	 relaxed, but it'll require a bit more complicated epilogue
	 expanding.  */
      gcc_assert ((max_size & (max_size - 1)) == 0);
      for (i = max_size; i >= 1; i >>= 1)
	{
	  if (epilogue_size & i)
	    {
	      if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
		destmem = emit_memset (destmem, destptr, vec_value, i);
	      else
		destmem = emit_memset (destmem, destptr, value, i);
	    }
	}
      return;
    }
  if (max_size > 32)
    {
      expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
      return;
    }
  if (max_size > 16)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
      if (TARGET_64BIT)
	{
	  dest = change_address (destmem, DImode, destptr);
	  emit_insn (gen_strset (destptr, dest, value));
	  dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
	  emit_insn (gen_strset (destptr, dest, value));
	}
      else
	{
	  dest = change_address (destmem, SImode, destptr);
	  emit_insn (gen_strset (destptr, dest, value));
	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
	  emit_insn (gen_strset (destptr, dest, value));
	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
	  emit_insn (gen_strset (destptr, dest, value));
	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
	  emit_insn (gen_strset (destptr, dest, value));
	}
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 8)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
      if (TARGET_64BIT)
	{
	  dest = change_address (destmem, DImode, destptr);
	  emit_insn (gen_strset (destptr, dest, value));
	}
      else
	{
	  dest = change_address (destmem, SImode, destptr);
	  emit_insn (gen_strset (destptr, dest, value));
	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
	  emit_insn (gen_strset (destptr, dest, value));
	}
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 4)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
      dest = change_address (destmem, SImode, destptr);
      emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 2)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
      dest = change_address (destmem, HImode, destptr);
      emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 1)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
      dest = change_address (destmem, QImode, destptr);
      emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
      emit_label (label);
      LABEL_NUSES (label) = 1;
/* Adjust COUNTER by the VALUE.  */
static void
ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
{
  emit_insn (gen_add2_insn (countreg, GEN_INT (-value)));
}
/* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
   DESTMEM to align it to DESIRED_ALIGNMENT.  Original alignment is ALIGN.
   Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE
   are used to fill the memory.
   Return value is updated DESTMEM.  */
static rtx
expand_set_or_cpymem_prologue (rtx destmem, rtx srcmem,
			       rtx destptr, rtx srcptr, rtx value,
			       rtx vec_value, rtx count, int align,
			       int desired_alignment, bool issetmem)
{
  int i;
  for (i = 1; i < desired_alignment; i <<= 1)
    {
      if (align <= i)
	{
	  rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
	  if (issetmem)
	    {
	      if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
		destmem = emit_memset (destmem, destptr, vec_value, i);
	      else
		destmem = emit_memset (destmem, destptr, value, i);
	    }
	  else
	    destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
	  ix86_adjust_counter (count, i);
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	  set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
	}
    }
  return destmem;
}
/* Test if COUNT&SIZE is nonzero and if so, expand a cpymem
   or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
   and jump to DONE_LABEL.  */
static void
expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem,
			       rtx destptr, rtx srcptr,
			       rtx value, rtx vec_value,
			       rtx count, int size,
			       rtx done_label, bool issetmem)
{
  rtx_code_label *label = ix86_expand_aligntest (count, size, false);
  machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
  rtx modesize;
  int n;

  /* If we do not have vector value to copy, we must reduce size.  */
  if (issetmem)
    {
      if (!vec_value)
	{
	  if (GET_MODE (value) == VOIDmode && size > 8)
	    mode = Pmode;
	  else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
	    mode = GET_MODE (value);
	}
      else
	mode = GET_MODE (vec_value), value = vec_value;
    }
  else
    {
      /* Choose appropriate vector mode.  */
      if (size >= 32)
	mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
      else if (size >= 16)
	mode = TARGET_SSE ? V16QImode : DImode;
      srcmem = change_address (srcmem, mode, srcptr);
    }
  destmem = change_address (destmem, mode, destptr);
  modesize = GEN_INT (GET_MODE_SIZE (mode));
  gcc_assert (GET_MODE_SIZE (mode) <= size);
  for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
    {
      if (issetmem)
	emit_move_insn (destmem, gen_lowpart (mode, value));
      else
	{
	  emit_move_insn (destmem, srcmem);
	  srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
	}
      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
    }

  destmem = offset_address (destmem, count, 1);
  destmem = offset_address (destmem, GEN_INT (-2 * size),
			    GET_MODE_SIZE (mode));
  if (!issetmem)
    {
      srcmem = offset_address (srcmem, count, 1);
      srcmem = offset_address (srcmem, GEN_INT (-2 * size),
			       GET_MODE_SIZE (mode));
    }
  for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
    {
      if (issetmem)
	emit_move_insn (destmem, gen_lowpart (mode, value));
      else
	{
	  emit_move_insn (destmem, srcmem);
	  srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
	}
      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
    }
  emit_jump_insn (gen_jump (done_label));
  emit_barrier ();

  emit_label (label);
  LABEL_NUSES (label) = 1;
}
/* Handle small memcpy (up to SIZE, which is supposed to be a small power
   of 2), and get ready for the main memcpy loop by copying the initial
   DESIRED_ALIGN-ALIGN bytes and the last SIZE bytes, adjusting
   DESTPTR/SRCPTR/COUNT in a way that lets us proceed with a loop copying
   SIZE bytes at once.  Do moves in MODE.
   DONE_LABEL is a label after the whole copying sequence.  The label is
   created on demand if *DONE_LABEL is NULL.
   MIN_SIZE is the minimal size of the copied block.  This value gets
   adjusted for new bounds after the initial copies.

   DESTMEM/SRCMEM are memory expressions pointing to the copied block,
   DESTPTR/SRCPTR are pointers to the block.  DYNAMIC_CHECK indicates
   whether we will dispatch to a library call for large blocks.

   In pseudocode we do:

   if (COUNT < SIZE)
     {
       Assume that SIZE is 4.  Bigger sizes are handled analogously.
       if (COUNT & 4)
	 {
	   copy 4 bytes from SRCPTR to DESTPTR
	   copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
	   goto done_label
	 }
       if (!COUNT)
	 goto done_label;
       copy 1 byte from SRCPTR to DESTPTR
       if (COUNT & 2)
	 {
	   copy 2 bytes from SRCPTR to DESTPTR
	   copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
	 }
     }
   else
     {
       copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
       copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT - SIZE

       OLD_DESTPTR = DESTPTR;
       Align DESTPTR up to DESIRED_ALIGN
       SRCPTR += DESTPTR - OLD_DESTPTR
       COUNT -= DESTPTR - OLD_DESTPTR
       if (DYNAMIC_CHECK)
	 Round COUNT down to multiple of SIZE
       << optional caller supplied zero size guard is here >>
       << optional caller supplied dynamic check is here >>
       << caller supplied main copy loop is here >>
     }
   done_label:
  */
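/* Concrete illustration, not part of the original comment: with SIZE == 4,
   COUNT == 7 takes the small-block path above; the "COUNT & 4" test fires,
   one 4-byte move copies bytes 0-3 and a second 4-byte move copies bytes
   3-6 (SRCPTR + COUNT - 4), so the overlapping pair covers all 7 bytes.  */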
static void
expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
							    rtx *destptr, rtx *srcptr,
							    machine_mode mode,
							    rtx value, rtx vec_value,
							    rtx *count,
							    rtx_code_label **done_label,
							    int size,
							    int desired_align,
							    int align,
							    unsigned HOST_WIDE_INT *min_size,
							    bool dynamic_check,
							    bool issetmem)
{
  rtx_code_label *loop_label = NULL, *label;
  int n;
  rtx modesize;
  int prolog_size = 0;
  rtx mode_value;

  /* Choose proper value to copy.  */
  if (issetmem && VECTOR_MODE_P (mode))
    mode_value = vec_value;
  else
    mode_value = value;
  gcc_assert (GET_MODE_SIZE (mode) <= size);

  /* See if block is big or small, handle small blocks.  */
  if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
    {
      int size2 = size;
      loop_label = gen_label_rtx ();

      if (!*done_label)
	*done_label = gen_label_rtx ();

      emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
			       1, loop_label);
      size2 >>= 1;

      /* Handle sizes > 3.  */
      for (;size2 > 2; size2 >>= 1)
	expand_small_cpymem_or_setmem (destmem, srcmem,
				       *destptr, *srcptr,
				       value, vec_value,
				       *count,
				       size2, *done_label, issetmem);
      /* Nothing to copy?  Jump to DONE_LABEL if so.  */
      emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
			       1, *done_label);

      /* Do a byte copy.  */
      destmem = change_address (destmem, QImode, *destptr);
      if (issetmem)
	emit_move_insn (destmem, gen_lowpart (QImode, value));
      else
	{
	  srcmem = change_address (srcmem, QImode, *srcptr);
	  emit_move_insn (destmem, srcmem);
	}

      /* Handle sizes 2 and 3.  */
      label = ix86_expand_aligntest (*count, 2, false);
      destmem = change_address (destmem, HImode, *destptr);
      destmem = offset_address (destmem, *count, 1);
      destmem = offset_address (destmem, GEN_INT (-2), 2);
      if (issetmem)
	emit_move_insn (destmem, gen_lowpart (HImode, value));
      else
	{
	  srcmem = change_address (srcmem, HImode, *srcptr);
	  srcmem = offset_address (srcmem, *count, 1);
	  srcmem = offset_address (srcmem, GEN_INT (-2), 2);
	  emit_move_insn (destmem, srcmem);
	}

      emit_label (label);
      LABEL_NUSES (label) = 1;
      emit_jump_insn (gen_jump (*done_label));
      emit_barrier ();
    }
  else
    gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
		|| UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);

  /* Start memcpy for COUNT >= SIZE.  */
  if (loop_label)
    {
      emit_label (loop_label);
      LABEL_NUSES (loop_label) = 1;
    }

  /* Copy first desired_align bytes.  */
  if (!issetmem)
    srcmem = change_address (srcmem, mode, *srcptr);
  destmem = change_address (destmem, mode, *destptr);
  modesize = GEN_INT (GET_MODE_SIZE (mode));
  for (n = 0; prolog_size < desired_align - align; n++)
    {
      if (issetmem)
	emit_move_insn (destmem, mode_value);
      else
	{
	  emit_move_insn (destmem, srcmem);
	  srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
	}
      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
      prolog_size += GET_MODE_SIZE (mode);
    }

  /* Copy last SIZE bytes.  */
  destmem = offset_address (destmem, *count, 1);
  destmem = offset_address (destmem,
			    GEN_INT (-size - prolog_size),
			    1);
  if (issetmem)
    emit_move_insn (destmem, mode_value);
  else
    {
      srcmem = offset_address (srcmem, *count, 1);
      srcmem = offset_address (srcmem,
			       GEN_INT (-size - prolog_size),
			       1);
      emit_move_insn (destmem, srcmem);
    }
  for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
    {
      destmem = offset_address (destmem, modesize, 1);
      if (issetmem)
	emit_move_insn (destmem, mode_value);
      else
	{
	  srcmem = offset_address (srcmem, modesize, 1);
	  emit_move_insn (destmem, srcmem);
	}
    }

  /* Align destination.  */
  if (desired_align > 1 && desired_align > align)
    {
      rtx saveddest = *destptr;

      gcc_assert (desired_align <= size);
      /* Align destptr up, place it to new register.  */
      *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
				      GEN_INT (prolog_size),
				      NULL_RTX, 1, OPTAB_DIRECT);
      if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
	REG_POINTER (*destptr) = 1;
      *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
				      GEN_INT (-desired_align),
				      *destptr, 1, OPTAB_DIRECT);
      /* See how many bytes we skipped.  */
      saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
				       *destptr,
				       saveddest, 1, OPTAB_DIRECT);
      /* Adjust srcptr and count.  */
      if (!issetmem)
	*srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
				       saveddest, *srcptr, 1, OPTAB_DIRECT);
      *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
				    saveddest, *count, 1, OPTAB_DIRECT);
      /* We copied at most size + prolog_size.  */
      if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
	*min_size
	  = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
      else
	*min_size = 0;

      /* Our loops always round down the block size, but for dispatch to
	 library we need precise value.  */
      if (dynamic_check)
	*count = expand_simple_binop (GET_MODE (*count), AND, *count,
				      GEN_INT (-size), *count, 1, OPTAB_DIRECT);
    }
  else
    {
      gcc_assert (prolog_size == 0);
      /* Decrease count, so we won't end up copying last word twice.  */
      if (!CONST_INT_P (*count))
	*count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
				      constm1_rtx, *count, 1, OPTAB_DIRECT);
      else
	*count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
				      (unsigned HOST_WIDE_INT)size));
      if (*min_size)
	*min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
    }
}
/* This function is like the previous one, except here we know how many bytes
   need to be copied.  That allows us to update alignment not only of DST,
   which is returned, but also of SRC, which is passed as a pointer for that
   reason.  */
static rtx
expand_set_or_cpymem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
					rtx srcreg, rtx value, rtx vec_value,
					int desired_align, int align_bytes,
					bool issetmem)
{
  rtx src = NULL;
  rtx orig_dst = dst;
  rtx orig_src = NULL;
  int piece_size = 1;
  int copied_bytes = 0;

  if (!issetmem)
    {
      gcc_assert (srcp != NULL);
      src = *srcp;
      orig_src = src;
    }

  for (piece_size = 1;
       piece_size <= desired_align && copied_bytes < align_bytes;
       piece_size <<= 1)
    {
      if (align_bytes & piece_size)
	{
	  if (issetmem)
	    {
	      if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
		dst = emit_memset (dst, destreg, vec_value, piece_size);
	      else
		dst = emit_memset (dst, destreg, value, piece_size);
	    }
	  else
	    dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
	  copied_bytes += piece_size;
	}
    }
  if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
    set_mem_align (dst, desired_align * BITS_PER_UNIT);
  if (MEM_SIZE_KNOWN_P (orig_dst))
    set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);

  if (!issetmem)
    {
      int src_align_bytes = get_mem_align_offset (src, desired_align
						       * BITS_PER_UNIT);
      if (src_align_bytes >= 0)
	src_align_bytes = desired_align - src_align_bytes;
      if (src_align_bytes >= 0)
	{
	  unsigned int src_align;
	  for (src_align = desired_align; src_align >= 2; src_align >>= 1)
	    {
	      if ((src_align_bytes & (src_align - 1))
		  == (align_bytes & (src_align - 1)))
		break;
	    }
	  if (src_align > (unsigned int) desired_align)
	    src_align = desired_align;
	  if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
	    set_mem_align (src, src_align * BITS_PER_UNIT);
	}
      if (MEM_SIZE_KNOWN_P (orig_src))
	set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
      *srcp = src;
    }

  return dst;
}
/* Return true if ALG can be used in current context.
   Assume we expand memset if MEMSET is true.  */
static bool
alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
{
  if (alg == no_stringop)
    return false;
  if (alg == vector_loop)
    return TARGET_SSE || TARGET_AVX;
  /* Algorithms using the rep prefix want at least edi and ecx;
     additionally, memset wants eax and memcpy wants esi.  Don't
     consider such algorithms if the user has appropriated those
     registers for their own purposes, or if we have a non-default
     address space, since some string insns cannot override the segment.  */
  if (alg == rep_prefix_1_byte
      || alg == rep_prefix_4_byte
      || alg == rep_prefix_8_byte)
    {
      if (have_as)
	return false;
      if (fixed_regs[CX_REG]
	  || fixed_regs[DI_REG]
	  || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
	return false;
    }
  return true;
}
/* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation.  */
static enum stringop_alg
decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
	    unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
	    bool memset, bool zero_memset, bool have_as,
	    int *dynamic_check, bool *noalign, bool recur)
{
  const struct stringop_algs *algs;
  bool optimize_for_speed;
  int max = 0;
  const struct processor_costs *cost;
  int i;
  bool any_alg_usable_p = false;

  *noalign = false;
  *dynamic_check = -1;

  /* Even if the string operation call is cold, we still might spend a lot
     of time processing large blocks.  */
  if (optimize_function_for_size_p (cfun)
      || (optimize_insn_for_size_p ()
	  && (max_size < 256
	      || (expected_size != -1 && expected_size < 256))))
    optimize_for_speed = false;
  else
    optimize_for_speed = true;

  cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
  if (memset)
    algs = &cost->memset[TARGET_64BIT != 0];
  else
    algs = &cost->memcpy[TARGET_64BIT != 0];

  /* See maximal size for user defined algorithm.  */
  for (i = 0; i < MAX_STRINGOP_ALGS; i++)
    {
      enum stringop_alg candidate = algs->size[i].alg;
      bool usable = alg_usable_p (candidate, memset, have_as);
      any_alg_usable_p |= usable;

      if (candidate != libcall && candidate && usable)
	max = algs->size[i].max;
    }

  /* If expected size is not known but max size is small enough
     so inline version is a win, set expected size into
     the range.  */
  if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
      && expected_size == -1)
    expected_size = min_size / 2 + max_size / 2;

  /* If user specified the algorithm, honor it if possible.  */
  if (ix86_stringop_alg != no_stringop
      && alg_usable_p (ix86_stringop_alg, memset, have_as))
    return ix86_stringop_alg;
  /* rep; movq or rep; movl is the smallest variant.  */
  else if (!optimize_for_speed)
    {
      *noalign = true;
      if (!count || (count & 3) || (memset && !zero_memset))
	return alg_usable_p (rep_prefix_1_byte, memset, have_as)
	       ? rep_prefix_1_byte : loop_1_byte;
      else
	return alg_usable_p (rep_prefix_4_byte, memset, have_as)
	       ? rep_prefix_4_byte : loop;
    }
  /* Very tiny blocks are best handled via the loop, REP is expensive to
     set up.  */
  else if (expected_size != -1 && expected_size < 4)
    return loop_1_byte;
  else if (expected_size != -1)
    {
      enum stringop_alg alg = libcall;
      bool alg_noalign = false;
      for (i = 0; i < MAX_STRINGOP_ALGS; i++)
	{
	  /* We get here if the algorithms that were not libcall-based
	     were rep-prefix based and we are unable to use rep prefixes
	     based on global register usage.  Break out of the loop and
	     use the heuristic below.  */
	  if (algs->size[i].max == 0)
	    break;
	  if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
	    {
	      enum stringop_alg candidate = algs->size[i].alg;

	      if (candidate != libcall
		  && alg_usable_p (candidate, memset, have_as))
		{
		  alg = candidate;
		  alg_noalign = algs->size[i].noalign;
		}
	      /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
		 last non-libcall inline algorithm.  */
	      if (TARGET_INLINE_ALL_STRINGOPS)
		{
		  /* When the current size is best to be copied by a libcall,
		     but we are still forced to inline, run the heuristic below
		     that will pick code for medium sized blocks.  */
		  if (alg != libcall)
		    {
		      *noalign = alg_noalign;
		      return alg;
		    }
		  else if (!any_alg_usable_p)
		    break;
		}
	      else if (alg_usable_p (candidate, memset, have_as)
		       && !(TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
			    && candidate == rep_prefix_1_byte
			    /* NB: If min_size != max_size, size is
			       unknown.  */
			    && min_size != max_size))
		{
		  *noalign = algs->size[i].noalign;
		  return candidate;
		}
	    }
	}
    }
  /* When asked to inline the call anyway, try to pick meaningful choice.
     We look for maximal size of block that is faster to copy by hand and
     take blocks of at most of that size guessing that average size will
     be roughly half of the block.

     If this turns out to be bad, we might simply specify the preferred
     choice in ix86_costs.  */
  if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
      && (algs->unknown_size == libcall
	  || !alg_usable_p (algs->unknown_size, memset, have_as)))
    {
      enum stringop_alg alg;
      HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;

      /* If there aren't any usable algorithms or if recursing already,
	 then recursing on smaller sizes or same size isn't going to
	 find anything.  Just return the simple byte-at-a-time copy loop.  */
      if (!any_alg_usable_p || recur)
	{
	  /* Pick something reasonable.  */
	  if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
	    *dynamic_check = 128;
	  return loop_1_byte;
	}
      alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
			zero_memset, have_as, dynamic_check, noalign, true);
      gcc_assert (*dynamic_check == -1);
      if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
	*dynamic_check = max;
      else
	gcc_assert (alg != libcall);
      return alg;
    }
  return (alg_usable_p (algs->unknown_size, memset, have_as)
	  ? algs->unknown_size : libcall);
}
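/* Illustrative note, not from the original source: a per-CPU cost table
   consulted by the function above holds entries roughly of the form
       {{24, loop, false}, {8192, rep_prefix_4_byte, false}, {-1, libcall, false}}
   which would pick "loop" for expected sizes up to 24 bytes, "rep movsl"
   up to 8 KiB, and fall back to the library call beyond that, subject to
   the register-availability checks in alg_usable_p.  */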
/* Decide on alignment.  We know that the operand is already aligned to ALIGN
   (ALIGN can be based on profile feedback and thus it is not 100% guaranteed).  */
static int
decide_alignment (int align,
		  enum stringop_alg alg,
		  int expected_size,
		  machine_mode move_mode)
{
  int desired_align = 0;

  gcc_assert (alg != no_stringop);

  if (alg == libcall)
    return 0;
  if (move_mode == VOIDmode)
    return 0;

  desired_align = GET_MODE_SIZE (move_mode);
  /* PentiumPro has special logic triggering for 8 byte aligned blocks.
     copying whole cacheline at once.  */
  if (TARGET_CPU_P (PENTIUMPRO)
      && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
    desired_align = 8;

  if (optimize_size)
    desired_align = 1;
  if (desired_align < align)
    desired_align = align;
  if (expected_size != -1 && expected_size < 4)
    desired_align = align;

  return desired_align;
}
/* Helper function for memcpy.  For QImode value 0xXY produce
   0xXYXYXYXY of the width specified by MODE.  This is essentially
   a * 0x01010101, but we can do slightly better than
   synth_mult by unwinding the sequence by hand on CPUs with
   slow multiply.  */
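/* Worked example (illustrative, not part of the original source): for
   val = 0x41 and MODE == SImode the shift/ior sequence below computes
       reg = 0x41;  reg |= reg << 8;   ->  0x4141
       reg |= reg << 16;               ->  0x41414141
   which equals 0x41 * 0x01010101; DImode adds one more step with << 32.  */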
static rtx
promote_duplicated_reg (machine_mode mode, rtx val)
{
  machine_mode valmode = GET_MODE (val);
  rtx tmp;
  int nops = mode == DImode ? 3 : 2;

  gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
  if (val == const0_rtx)
    return copy_to_mode_reg (mode, CONST0_RTX (mode));
  if (CONST_INT_P (val))
    {
      HOST_WIDE_INT v = INTVAL (val) & 255;

      v |= v << 8;
      v |= v << 16;
      if (mode == DImode)
	v |= (v << 16) << 16;
      return copy_to_mode_reg (mode, gen_int_mode (v, mode));
    }

  if (valmode == VOIDmode)
    valmode = QImode;
  if (valmode != QImode)
    val = gen_lowpart (QImode, val);

  if (!TARGET_PARTIAL_REG_STALL)
    nops--;
  if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
      + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
      <= (ix86_cost->shift_const + ix86_cost->add) * nops
	 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
    {
      rtx reg = convert_modes (mode, QImode, val, true);
      tmp = promote_duplicated_reg (mode, const1_rtx);
      return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
				  OPTAB_DIRECT);
    }
  else
    {
      rtx reg = convert_modes (mode, QImode, val, true);

      if (!TARGET_PARTIAL_REG_STALL)
	emit_insn (gen_insv_1 (mode, reg, reg));
      else
	{
	  tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
				     NULL, 1, OPTAB_DIRECT);
	  reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1,
				     OPTAB_DIRECT);
	}
      tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
				 NULL, 1, OPTAB_DIRECT);
      reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
      if (mode == DImode)
	{
	  tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
				     NULL, 1, OPTAB_DIRECT);
	  reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
	}
      return reg;
    }
}
/* Duplicate value VAL using promote_duplicated_reg into maximal size that will
   be needed by main loop copying SIZE_NEEDED chunks and prologue getting
   alignment from ALIGN to DESIRED_ALIGN.  */
static rtx
promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
				int align)
{
  rtx promoted_val;

  if (TARGET_64BIT
      && (size_needed > 4 || (desired_align > align && desired_align > 4)))
    promoted_val = promote_duplicated_reg (DImode, val);
  else if (size_needed > 2 || (desired_align > align && desired_align > 2))
    promoted_val = promote_duplicated_reg (SImode, val);
  else if (size_needed > 1 || (desired_align > align && desired_align > 1))
    promoted_val = promote_duplicated_reg (HImode, val);
  else
    promoted_val = val;

  return promoted_val;
}
/* Copy the address to a Pmode register.  This is used for x32 to
   truncate DImode TLS address to a SImode register.  */

static rtx
ix86_copy_addr_to_reg (rtx addr)
{
  rtx reg;
  if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
    {
      reg = copy_addr_to_reg (addr);
      REG_POINTER (reg) = 1;
      return reg;
    }
  else
    {
      gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
      reg = copy_to_mode_reg (DImode, addr);
      REG_POINTER (reg) = 1;
      return gen_rtx_SUBREG (SImode, reg, 0);
    }
}
/* Expand string move (memcpy) or store (memset) operation.  Use i386 string
   operations when profitable.  The code depends upon architecture, block size
   and alignment, but always has one of the following overall structures:

   Aligned move sequence:

     1) Prologue guard: Conditional that jumps up to epilogues for small
	blocks that can be handled by epilogue alone.  This is faster
	but also needed for correctness, since prologue assumes the block
	is larger than the desired alignment.

	Optional dynamic check for size and libcall for large
	blocks is emitted here too, with -minline-stringops-dynamically.

     2) Prologue: copy first few bytes in order to get destination
	aligned to DESIRED_ALIGN.  It is emitted only when ALIGN is less
	than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
	copied.  We emit either a jump tree on power of two sized
	blocks, or a byte loop.

     3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
	with specified algorithm.

     4) Epilogue: code copying tail of the block that is too small to be
	handled by main body (or up to size guarded by prologue guard).

   Misaligned move sequence:

     1) Misaligned move prologue/epilogue containing:
	a) Prologue handling small memory blocks and jumping to done_label
	   (skipped if blocks are known to be large enough)
	b) Single move copying first DESIRED_ALIGN-ALIGN bytes if alignment is
	   needed by single possibly misaligned move
	   (skipped if alignment is not needed)
	c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves

     2) Zero size guard dispatching to done_label, if needed

     3) Dispatch to library call, if needed

     4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
	with specified algorithm.  */
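/* Illustrative note, not part of the original comment: for a memset of an
   unknown size expanded with the unrolled_loop algorithm, the emitted
   skeleton is roughly
       if (count < size_needed) goto epilogue;
       <align destination, adjust count>
       <main loop storing size_needed bytes per iteration>
     epilogue:
       <jump tree handling the remaining count & (size_needed - 1) bytes>
   matching steps 1-4 of the aligned sequence described above.  */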
8579 ix86_expand_set_or_cpymem (rtx dst
, rtx src
, rtx count_exp
, rtx val_exp
,
8580 rtx align_exp
, rtx expected_align_exp
,
8581 rtx expected_size_exp
, rtx min_size_exp
,
8582 rtx max_size_exp
, rtx probable_max_size_exp
,
8587 rtx_code_label
*label
= NULL
;
8589 rtx_code_label
*jump_around_label
= NULL
;
8590 HOST_WIDE_INT align
= 1;
8591 unsigned HOST_WIDE_INT count
= 0;
8592 HOST_WIDE_INT expected_size
= -1;
8593 int size_needed
= 0, epilogue_size_needed
;
8594 int desired_align
= 0, align_bytes
= 0;
8595 enum stringop_alg alg
;
8596 rtx promoted_val
= NULL
;
8597 rtx vec_promoted_val
= NULL
;
8598 bool force_loopy_epilogue
= false;
8600 bool need_zero_guard
= false;
8602 machine_mode move_mode
= VOIDmode
;
8603 machine_mode wider_mode
;
8604 int unroll_factor
= 1;
8605 /* TODO: Once value ranges are available, fill in proper data. */
8606 unsigned HOST_WIDE_INT min_size
= 0;
8607 unsigned HOST_WIDE_INT max_size
= -1;
8608 unsigned HOST_WIDE_INT probable_max_size
= -1;
8609 bool misaligned_prologue_used
= false;
8612 if (CONST_INT_P (align_exp
))
8613 align
= INTVAL (align_exp
);
  /* i386 can do misaligned access at a reasonably increased cost.  */
8615 if (CONST_INT_P (expected_align_exp
)
8616 && INTVAL (expected_align_exp
) > align
)
8617 align
= INTVAL (expected_align_exp
);
8618 /* ALIGN is the minimum of destination and source alignment, but we care here
8619 just about destination alignment. */
8621 && MEM_ALIGN (dst
) > (unsigned HOST_WIDE_INT
) align
* BITS_PER_UNIT
)
8622 align
= MEM_ALIGN (dst
) / BITS_PER_UNIT
;
8624 if (CONST_INT_P (count_exp
))
8626 min_size
= max_size
= probable_max_size
= count
= expected_size
8627 = INTVAL (count_exp
);
8628 /* When COUNT is 0, there is nothing to do. */
8635 min_size
= INTVAL (min_size_exp
);
8637 max_size
= INTVAL (max_size_exp
);
8638 if (probable_max_size_exp
)
8639 probable_max_size
= INTVAL (probable_max_size_exp
);
8640 if (CONST_INT_P (expected_size_exp
))
8641 expected_size
= INTVAL (expected_size_exp
);
8644 /* Make sure we don't need to care about overflow later on. */
8645 if (count
> (HOST_WIDE_INT_1U
<< 30))
8648 have_as
= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst
));
8650 have_as
|= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src
));
8652 /* Step 0: Decide on preferred algorithm, desired alignment and
8653 size of chunks to be copied by main loop. */
8654 alg
= decide_alg (count
, expected_size
, min_size
, probable_max_size
,
8656 issetmem
&& val_exp
== const0_rtx
, have_as
,
8657 &dynamic_check
, &noalign
, false);
8660 fprintf (dump_file
, "Selected stringop expansion strategy: %s\n",
8661 stringop_alg_names
[alg
]);
8665 gcc_assert (alg
!= no_stringop
);
8667 /* For now vector-version of memset is generated only for memory zeroing, as
8668 creating of promoted vector value is very cheap in this case. */
8669 if (issetmem
&& alg
== vector_loop
&& val_exp
!= const0_rtx
)
8670 alg
= unrolled_loop
;
8673 count_exp
= copy_to_mode_reg (GET_MODE (count_exp
), count_exp
);
8674 destreg
= ix86_copy_addr_to_reg (XEXP (dst
, 0));
8676 srcreg
= ix86_copy_addr_to_reg (XEXP (src
, 0));
8679 move_mode
= word_mode
;
8687 need_zero_guard
= true;
8691 need_zero_guard
= true;
8694 need_zero_guard
= true;
8695 unroll_factor
= (TARGET_64BIT
? 4 : 2);
8698 need_zero_guard
= true;
8700 /* Find the widest supported mode. */
8701 move_mode
= word_mode
;
8702 while (GET_MODE_WIDER_MODE (move_mode
).exists (&wider_mode
)
8703 && optab_handler (mov_optab
, wider_mode
) != CODE_FOR_nothing
)
8704 move_mode
= wider_mode
;
8706 if (TARGET_AVX256_SPLIT_REGS
&& GET_MODE_BITSIZE (move_mode
) > 128)
8708 if (TARGET_AVX512_SPLIT_REGS
&& GET_MODE_BITSIZE (move_mode
) > 256)
8711 /* Find the corresponding vector mode with the same size as MOVE_MODE.
8712 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
8713 if (GET_MODE_SIZE (move_mode
) > GET_MODE_SIZE (word_mode
))
8715 int nunits
= GET_MODE_SIZE (move_mode
) / GET_MODE_SIZE (word_mode
);
8716 if (!mode_for_vector (word_mode
, nunits
).exists (&move_mode
)
8717 || optab_handler (mov_optab
, move_mode
) == CODE_FOR_nothing
)
8718 move_mode
= word_mode
;
8720 gcc_assert (optab_handler (mov_optab
, move_mode
) != CODE_FOR_nothing
);
8722 case rep_prefix_8_byte
:
8725 case rep_prefix_4_byte
:
8728 case rep_prefix_1_byte
:
8732 size_needed
= GET_MODE_SIZE (move_mode
) * unroll_factor
;
8733 epilogue_size_needed
= size_needed
;
8735 /* If we are going to call any library calls conditionally, make sure any
8736 pending stack adjustment happen before the first conditional branch,
8737 otherwise they will be emitted before the library call only and won't
8738 happen from the other branches. */
8739 if (dynamic_check
!= -1)
8740 do_pending_stack_adjust ();
8742 desired_align
= decide_alignment (align
, alg
, expected_size
, move_mode
);
8743 if (!TARGET_ALIGN_STRINGOPS
|| noalign
)
8744 align
= desired_align
;
8746 /* Step 1: Prologue guard. */
8748 /* Alignment code needs count to be in register. */
8749 if (CONST_INT_P (count_exp
) && desired_align
> align
)
8751 if (INTVAL (count_exp
) > desired_align
8752 && INTVAL (count_exp
) > size_needed
)
8755 = get_mem_align_offset (dst
, desired_align
* BITS_PER_UNIT
);
8756 if (align_bytes
<= 0)
8759 align_bytes
= desired_align
- align_bytes
;
8761 if (align_bytes
== 0)
8762 count_exp
= force_reg (counter_mode (count_exp
), count_exp
);
8764 gcc_assert (desired_align
>= 1 && align
>= 1);
  /* Misaligned move sequences handle both prologue and epilogue at once.
     Default code generation results in smaller code for large alignments
     and also avoids redundant work when sizes are known precisely.  */
8769 misaligned_prologue_used
8770 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
8771 && MAX (desired_align
, epilogue_size_needed
) <= 32
8772 && desired_align
<= epilogue_size_needed
8773 && ((desired_align
> align
&& !align_bytes
)
8774 || (!count
&& epilogue_size_needed
> 1)));
  /* Do the cheap promotion to allow better CSE across the
     main loop and epilogue (ie one load of the big constant in the
     front of all code.)
     For now the misaligned move sequences do not have fast path
     without broadcasting.  */
8781 if (issetmem
&& ((CONST_INT_P (val_exp
) || misaligned_prologue_used
)))
8783 if (alg
== vector_loop
)
8785 gcc_assert (val_exp
== const0_rtx
);
8786 vec_promoted_val
= promote_duplicated_reg (move_mode
, val_exp
);
8787 promoted_val
= promote_duplicated_reg_to_size (val_exp
,
8788 GET_MODE_SIZE (word_mode
),
8789 desired_align
, align
);
8793 promoted_val
= promote_duplicated_reg_to_size (val_exp
, size_needed
,
8794 desired_align
, align
);
  /* Misaligned move sequences handle both prologues and epilogues at once.
     Default code generation results in smaller code for large alignments
     and also avoids redundant work when sizes are known precisely.  */
8800 if (misaligned_prologue_used
)
      /* Misaligned move prologue handles small blocks by itself.  */
8803 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves
8804 (dst
, src
, &destreg
, &srcreg
,
8805 move_mode
, promoted_val
, vec_promoted_val
,
8808 desired_align
< align
8809 ? MAX (desired_align
, epilogue_size_needed
) : epilogue_size_needed
,
8810 desired_align
, align
, &min_size
, dynamic_check
, issetmem
);
8812 src
= change_address (src
, BLKmode
, srcreg
);
8813 dst
= change_address (dst
, BLKmode
, destreg
);
8814 set_mem_align (dst
, desired_align
* BITS_PER_UNIT
);
8815 epilogue_size_needed
= 0;
8817 && min_size
< (unsigned HOST_WIDE_INT
) size_needed
)
8819 /* It is possible that we copied enough so the main loop will not
8821 gcc_assert (size_needed
> 1);
8822 if (jump_around_label
== NULL_RTX
)
8823 jump_around_label
= gen_label_rtx ();
8824 emit_cmp_and_jump_insns (count_exp
,
8825 GEN_INT (size_needed
),
8826 LTU
, 0, counter_mode (count_exp
), 1, jump_around_label
);
8827 if (expected_size
== -1
8828 || expected_size
< (desired_align
- align
) / 2 + size_needed
)
8829 predict_jump (REG_BR_PROB_BASE
* 20 / 100);
8831 predict_jump (REG_BR_PROB_BASE
* 60 / 100);
8834 /* Ensure that alignment prologue won't copy past end of block. */
8835 else if (size_needed
> 1 || (desired_align
> 1 && desired_align
> align
))
8837 epilogue_size_needed
= MAX (size_needed
- 1, desired_align
- align
);
8838 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
8839 Make sure it is power of 2. */
8840 epilogue_size_needed
= 1 << (floor_log2 (epilogue_size_needed
) + 1);
      /* To improve performance of small blocks, we jump around the VAL
	 promoting mode.  This means that if the promoted VAL is not constant,
	 we might not use it in the epilogue and have to use byte loop instead.  */
8846 if (issetmem
&& epilogue_size_needed
> 2 && !promoted_val
)
8847 force_loopy_epilogue
= true;
8848 if ((count
&& count
< (unsigned HOST_WIDE_INT
) epilogue_size_needed
)
8849 || max_size
< (unsigned HOST_WIDE_INT
) epilogue_size_needed
)
8851 /* If main algorithm works on QImode, no epilogue is needed.
8852 For small sizes just don't align anything. */
8853 if (size_needed
== 1)
8854 desired_align
= align
;
8859 && min_size
< (unsigned HOST_WIDE_INT
) epilogue_size_needed
)
8861 label
= gen_label_rtx ();
8862 emit_cmp_and_jump_insns (count_exp
,
8863 GEN_INT (epilogue_size_needed
),
8864 LTU
, 0, counter_mode (count_exp
), 1, label
);
8865 if (expected_size
== -1 || expected_size
< epilogue_size_needed
)
8866 predict_jump (REG_BR_PROB_BASE
* 60 / 100);
8868 predict_jump (REG_BR_PROB_BASE
* 20 / 100);
8872 /* Emit code to decide on runtime whether library call or inline should be
8874 if (dynamic_check
!= -1)
8876 if (!issetmem
&& CONST_INT_P (count_exp
))
8878 if (UINTVAL (count_exp
) >= (unsigned HOST_WIDE_INT
)dynamic_check
)
8880 emit_block_copy_via_libcall (dst
, src
, count_exp
);
8881 count_exp
= const0_rtx
;
8887 rtx_code_label
*hot_label
= gen_label_rtx ();
8888 if (jump_around_label
== NULL_RTX
)
8889 jump_around_label
= gen_label_rtx ();
8890 emit_cmp_and_jump_insns (count_exp
, GEN_INT (dynamic_check
- 1),
8891 LEU
, 0, counter_mode (count_exp
),
8893 predict_jump (REG_BR_PROB_BASE
* 90 / 100);
8895 set_storage_via_libcall (dst
, count_exp
, val_exp
);
8897 emit_block_copy_via_libcall (dst
, src
, count_exp
);
8898 emit_jump (jump_around_label
);
8899 emit_label (hot_label
);
8903 /* Step 2: Alignment prologue. */
8904 /* Do the expensive promotion once we branched off the small blocks. */
8905 if (issetmem
&& !promoted_val
)
8906 promoted_val
= promote_duplicated_reg_to_size (val_exp
, size_needed
,
8907 desired_align
, align
);
8909 if (desired_align
> align
&& !misaligned_prologue_used
)
8911 if (align_bytes
== 0)
	  /* Except for the first move in prologue, we no longer know
	     constant offset in aliasing info.  It doesn't seem worth
	     the pain to maintain it for the first move, so throw away
	     the info early.  */
8917 dst
= change_address (dst
, BLKmode
, destreg
);
8919 src
= change_address (src
, BLKmode
, srcreg
);
8920 dst
= expand_set_or_cpymem_prologue (dst
, src
, destreg
, srcreg
,
8921 promoted_val
, vec_promoted_val
,
8922 count_exp
, align
, desired_align
,
8924 /* At most desired_align - align bytes are copied. */
8925 if (min_size
< (unsigned)(desired_align
- align
))
8928 min_size
-= desired_align
- align
;
8932 /* If we know how many bytes need to be stored before dst is
8933 sufficiently aligned, maintain aliasing info accurately. */
8934 dst
= expand_set_or_cpymem_constant_prologue (dst
, &src
, destreg
,
8942 count_exp
= plus_constant (counter_mode (count_exp
),
8943 count_exp
, -align_bytes
);
8944 count
-= align_bytes
;
8945 min_size
-= align_bytes
;
8946 max_size
-= align_bytes
;
8949 && min_size
< (unsigned HOST_WIDE_INT
) size_needed
8950 && (count
< (unsigned HOST_WIDE_INT
) size_needed
8951 || (align_bytes
== 0
8952 && count
< ((unsigned HOST_WIDE_INT
) size_needed
8953 + desired_align
- align
))))
8955 /* It is possible that we copied enough so the main loop will not
8957 gcc_assert (size_needed
> 1);
8958 if (label
== NULL_RTX
)
8959 label
= gen_label_rtx ();
8960 emit_cmp_and_jump_insns (count_exp
,
8961 GEN_INT (size_needed
),
8962 LTU
, 0, counter_mode (count_exp
), 1, label
);
8963 if (expected_size
== -1
8964 || expected_size
< (desired_align
- align
) / 2 + size_needed
)
8965 predict_jump (REG_BR_PROB_BASE
* 20 / 100);
8967 predict_jump (REG_BR_PROB_BASE
* 60 / 100);
8970 if (label
&& size_needed
== 1)
8973 LABEL_NUSES (label
) = 1;
8975 epilogue_size_needed
= 1;
8977 promoted_val
= val_exp
;
8979 else if (label
== NULL_RTX
&& !misaligned_prologue_used
)
8980 epilogue_size_needed
= size_needed
;
8982 /* Step 3: Main loop. */
8993 expand_set_or_cpymem_via_loop (dst
, src
, destreg
, srcreg
, promoted_val
,
8994 count_exp
, move_mode
, unroll_factor
,
8995 expected_size
, issetmem
);
8998 expand_set_or_cpymem_via_loop (dst
, src
, destreg
, srcreg
,
8999 vec_promoted_val
, count_exp
, move_mode
,
9000 unroll_factor
, expected_size
, issetmem
);
9002 case rep_prefix_8_byte
:
9003 case rep_prefix_4_byte
:
9004 case rep_prefix_1_byte
:
9005 expand_set_or_cpymem_via_rep (dst
, src
, destreg
, srcreg
, promoted_val
,
9006 val_exp
, count_exp
, move_mode
, issetmem
);
9009 /* Adjust properly the offset of src and dest memory for aliasing. */
9010 if (CONST_INT_P (count_exp
))
9013 src
= adjust_automodify_address_nv (src
, BLKmode
, srcreg
,
9014 (count
/ size_needed
) * size_needed
);
9015 dst
= adjust_automodify_address_nv (dst
, BLKmode
, destreg
,
9016 (count
/ size_needed
) * size_needed
);
9021 src
= change_address (src
, BLKmode
, srcreg
);
9022 dst
= change_address (dst
, BLKmode
, destreg
);
9025 /* Step 4: Epilogue to copy the remaining bytes. */
9029 /* When the main loop is done, COUNT_EXP might hold original count,
9030 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
9031 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
9032 bytes. Compensate if needed. */
9034 if (size_needed
< epilogue_size_needed
)
9036 tmp
= expand_simple_binop (counter_mode (count_exp
), AND
, count_exp
,
9037 GEN_INT (size_needed
- 1), count_exp
, 1,
9039 if (tmp
!= count_exp
)
9040 emit_move_insn (count_exp
, tmp
);
9043 LABEL_NUSES (label
) = 1;
9046 if (count_exp
!= const0_rtx
&& epilogue_size_needed
> 1)
9048 if (force_loopy_epilogue
)
9049 expand_setmem_epilogue_via_loop (dst
, destreg
, val_exp
, count_exp
,
9050 epilogue_size_needed
);
9054 expand_setmem_epilogue (dst
, destreg
, promoted_val
,
9055 vec_promoted_val
, count_exp
,
9056 epilogue_size_needed
);
9058 expand_cpymem_epilogue (dst
, src
, destreg
, srcreg
, count_exp
,
9059 epilogue_size_needed
);
9062 if (jump_around_label
)
9063 emit_label (jump_around_label
);
/* Expand cmpstrn or memcmp.  */

bool
ix86_expand_cmpstrn_or_cmpmem (rtx result, rtx src1, rtx src2,
			       rtx length, rtx align, bool is_cmpstrn)
{
  /* Expand strncmp and memcmp only with -minline-all-stringops since
     "repz cmpsb" can be much slower than strncmp and memcmp functions
     implemented with vector instructions, see

     https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43052
   */
  if (!TARGET_INLINE_ALL_STRINGOPS)
    return false;

  /* Can't use this if the user has appropriated ecx, esi or edi.  */
  if (fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG])
    return false;

  if (is_cmpstrn)
    {
      /* For strncmp, length is the maximum length, which can be larger
	 than actual string lengths.  We can expand the cmpstrn pattern
	 to "repz cmpsb" only if one of the strings is a constant so
	 that expand_builtin_strncmp() can write the length argument to
	 be the minimum of the const string length and the actual length
	 argument.  Otherwise, "repz cmpsb" may pass the 0 byte.  */
      tree t1 = MEM_EXPR (src1);
      tree t2 = MEM_EXPR (src2);
      if (!((t1 && TREE_CODE (t1) == MEM_REF
	     && TREE_CODE (TREE_OPERAND (t1, 0)) == ADDR_EXPR
	     && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t1, 0), 0))
		 == STRING_CST))
	    || (t2 && TREE_CODE (t2) == MEM_REF
		&& TREE_CODE (TREE_OPERAND (t2, 0)) == ADDR_EXPR
		&& (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t2, 0), 0))
		    == STRING_CST))))
	return false;
    }

  rtx addr1 = copy_addr_to_reg (XEXP (src1, 0));
  rtx addr2 = copy_addr_to_reg (XEXP (src2, 0));
  if (addr1 != XEXP (src1, 0))
    src1 = replace_equiv_address_nv (src1, addr1);
  if (addr2 != XEXP (src2, 0))
    src2 = replace_equiv_address_nv (src2, addr2);

  /* NB: Make a copy of the data length to avoid changing the original
     data length by cmpstrnqi patterns.  */
  length = ix86_zero_extend_to_Pmode (length);
  rtx lengthreg = gen_reg_rtx (Pmode);
  emit_move_insn (lengthreg, length);

  /* If we are testing strict equality, we can use known alignment to
     good advantage.  This may be possible with combine, particularly
     once cc0 is dead.  */
  if (CONST_INT_P (length))
    {
      if (length == const0_rtx)
	{
	  emit_move_insn (result, const0_rtx);
	  return true;
	}
      emit_insn (gen_cmpstrnqi_nz_1 (addr1, addr2, lengthreg, align,
				     src1, src2));
    }
  else
    {
      emit_insn (gen_cmp_1 (Pmode, lengthreg, lengthreg));
      emit_insn (gen_cmpstrnqi_1 (addr1, addr2, lengthreg, align,
				  src1, src2));
    }

  rtx out = gen_lowpart (QImode, result);
  emit_insn (gen_cmpintqi (out));
  emit_move_insn (result, gen_rtx_SIGN_EXTEND (SImode, out));

  return true;
}
/* Expand the appropriate insns for doing strlen if not just doing
   repnz; scasb

   out = result, initialized with the start address
   align_rtx = alignment of the address.
   scratch = scratch register, initialized with the startaddress when
	not aligned, otherwise undefined

   This is just the body.  It needs the initializations mentioned above and
   some address computing at the end.  These things are done in i386.md.  */

static void
ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
{
  int align;
  rtx tmp;
  rtx_code_label *align_2_label = NULL;
  rtx_code_label *align_3_label = NULL;
  rtx_code_label *align_4_label = gen_label_rtx ();
  rtx_code_label *end_0_label = gen_label_rtx ();
  rtx mem;
  rtx tmpreg = gen_reg_rtx (SImode);
  rtx scratch = gen_reg_rtx (SImode);
  rtx cmp;

  align = 0;
  if (CONST_INT_P (align_rtx))
    align = INTVAL (align_rtx);

  /* Loop to check 1..3 bytes for null to get an aligned pointer.  */

  /* Is there a known alignment and is it less than 4?  */
  if (align < 4)
    {
      rtx scratch1 = gen_reg_rtx (Pmode);
      emit_move_insn (scratch1, out);
      /* Is there a known alignment and is it not 2? */
      if (align != 2)
	{
	  align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
	  align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */

	  /* Leave just the 3 lower bits.  */
	  align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
				    NULL_RTX, 0, OPTAB_WIDEN);

	  emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
				   Pmode, 1, align_4_label);
	  emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
				   Pmode, 1, align_2_label);
	  emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
				   Pmode, 1, align_3_label);
	}
      else
	{
	  /* Since the alignment is 2, we have to check 2 or 0 bytes;
	     check if is aligned to 4 - byte.  */

	  align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
				    NULL_RTX, 0, OPTAB_WIDEN);

	  emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
				   Pmode, 1, align_4_label);
	}

      mem = change_address (src, QImode, out);

      /* Now compare the bytes.  */

      /* Compare the first n unaligned byte on a byte per byte basis.  */
      emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
			       QImode, 1, end_0_label);

      /* Increment the address.  */
      emit_insn (gen_add2_insn (out, const1_rtx));

      /* Not needed with an alignment of 2 */
      if (align != 2)
	{
	  emit_label (align_2_label);

	  emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
				   end_0_label);

	  emit_insn (gen_add2_insn (out, const1_rtx));

	  emit_label (align_3_label);
	}

      emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
			       end_0_label);

      emit_insn (gen_add2_insn (out, const1_rtx));
    }

  /* Generate loop to check 4 bytes at a time.  It is not a good idea to
     align this loop.  It gives only huge programs, but does not help to
     speed up.  */
  emit_label (align_4_label);

  mem = change_address (src, SImode, out);
  emit_move_insn (scratch, mem);
  emit_insn (gen_add2_insn (out, GEN_INT (4)));

  /* This formula yields a nonzero result iff one of the bytes is zero.
     This saves three branches inside loop and many cycles.  */
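  /* A scalar sketch of what the insns below compute (illustrative only):

	 has_zero = (x - 0x01010101) & ~x & 0x80808080;

     the lowest zero byte of X borrows down to 0xff and also has its sign
     bit set in ~X, so HAS_ZERO is nonzero exactly when some byte of X is
     zero.  */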
  emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
  emit_insn (gen_one_cmplsi2 (scratch, scratch));
  emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
  emit_insn (gen_andsi3 (tmpreg, tmpreg,
			 gen_int_mode (0x80808080, SImode)));
  emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
			   align_4_label);

  if (TARGET_CMOVE)
    {
      rtx reg = gen_reg_rtx (SImode);
      rtx reg2 = gen_reg_rtx (Pmode);
      emit_move_insn (reg, tmpreg);
      emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));

      /* If zero is not in the first two bytes, move two bytes forward.  */
      emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
      tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
      tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
      emit_insn (gen_rtx_SET (tmpreg,
			      gen_rtx_IF_THEN_ELSE (SImode, tmp,
						    reg, tmpreg)));
      /* Emit lea manually to avoid clobbering of flags.  */
      emit_insn (gen_rtx_SET (reg2, plus_constant (Pmode, out, 2)));

      tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
      tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
      emit_insn (gen_rtx_SET (out,
			      gen_rtx_IF_THEN_ELSE (Pmode, tmp,
						    reg2, out)));
    }
  else
    {
      rtx_code_label *end_2_label = gen_label_rtx ();
      /* Is zero in the first two bytes? */

      emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
      tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
      tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
				  gen_rtx_LABEL_REF (VOIDmode, end_2_label),
				  pc_rtx);
      tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      JUMP_LABEL (tmp) = end_2_label;

      /* Not in the first two.  Move two bytes forward.  */
      emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
      emit_insn (gen_add2_insn (out, const2_rtx));

      emit_label (end_2_label);
    }

  /* Avoid branch in fixing the byte.  */
  tmpreg = gen_lowpart (QImode, tmpreg);
  emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
  tmp = gen_rtx_REG (CCmode, FLAGS_REG);
  cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
  emit_insn (gen_sub3_carry (Pmode, out, out, GEN_INT (3), tmp, cmp));

  emit_label (end_0_label);
}
/* Expand strlen.  */

bool
ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
{
  if (TARGET_UNROLL_STRLEN
      && TARGET_INLINE_ALL_STRINGOPS
      && eoschar == const0_rtx
      && optimize > 1)
    {
      /* The generic case of strlen expander is long.  Avoid its
	 expanding unless TARGET_INLINE_ALL_STRINGOPS.  */
      rtx addr = force_reg (Pmode, XEXP (src, 0));
      /* Well it seems that some optimizer does not combine a call like
	 foo(strlen(bar), strlen(bar));
	 when the move and the subtraction is done here.  It does calculate
	 the length just once when these instructions are done inside of
	 output_strlen_unroll().  But I think since &bar[strlen(bar)] is
	 often used and I use one fewer register for the lifetime of
	 output_strlen_unroll() this is better.  */

      emit_move_insn (out, addr);

      ix86_expand_strlensi_unroll_1 (out, src, align);

      /* strlensi_unroll_1 returns the address of the zero at the end of
	 the string, like memchr(), so compute the length by subtracting
	 the start address.  */
      emit_insn (gen_sub2_insn (out, addr));
      return true;
    }
  else
    return false;
}
/* For given symbol (function) construct code to compute address of its PLT
   entry in large x86-64 PIC model.  */

static rtx
construct_plt_address (rtx symbol)
{
  rtx tmp, unspec;

  gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
  gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
  gcc_assert (Pmode == DImode);

  tmp = gen_reg_rtx (Pmode);
  unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);

  emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
  emit_insn (gen_add2_insn (tmp, pic_offset_table_rtx));
  return tmp;
}
/* Additional registers that are clobbered by SYSV calls.  */

static int const x86_64_ms_sysv_extra_clobbered_registers
		 [NUM_X86_64_MS_CLOBBERED_REGS] =
{
  SI_REG, DI_REG,
  XMM6_REG, XMM7_REG,
  XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
  XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
};

rtx_insn *
ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
		  rtx callarg2, rtx pop, bool sibcall)
{
  rtx vec[3];
  rtx use = NULL, call;
  unsigned int vec_len = 0;
  tree fndecl;
  if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
    {
      fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
      if (fndecl
	  && (lookup_attribute ("interrupt",
				TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
	error ("interrupt service routine cannot be called directly");
    }
  else
    fndecl = NULL_TREE;

  if (pop == const0_rtx)
    pop = NULL;
  gcc_assert (!TARGET_64BIT || !pop);

  rtx addr = XEXP (fnaddr, 0);
  if (TARGET_MACHO && !TARGET_64BIT)
    {
#if TARGET_MACHO
      if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
	fnaddr = machopic_indirect_call_target (fnaddr);
#endif
    }
  else
    {
      /* Static functions and indirect calls don't need the pic register.  Also,
	 check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
	 it an indirect call.  */
      if (flag_pic
	  && GET_CODE (addr) == SYMBOL_REF
	  && ix86_call_use_plt_p (addr))
	{
	  if (flag_plt
	      && (SYMBOL_REF_DECL (addr) == NULL_TREE
		  || !lookup_attribute ("noplt",
					DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
	    {
	      if (!TARGET_64BIT
		  || (ix86_cmodel == CM_LARGE_PIC
		      && DEFAULT_ABI != MS_ABI))
		{
		  use_reg (&use, gen_rtx_REG (Pmode,
					      REAL_PIC_OFFSET_TABLE_REGNUM));
		  if (ix86_use_pseudo_pic_reg ())
		    emit_move_insn (gen_rtx_REG (Pmode,
						 REAL_PIC_OFFSET_TABLE_REGNUM),
				    pic_offset_table_rtx);
		}
	    }
	  else if (!TARGET_PECOFF && !TARGET_MACHO)
	    {
	      if (TARGET_64BIT
		  && ix86_cmodel == CM_LARGE_PIC
		  && DEFAULT_ABI != MS_ABI)
		{
		  fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
					   UNSPEC_GOT);
		  fnaddr = gen_rtx_CONST (Pmode, fnaddr);
		  fnaddr = force_reg (Pmode, fnaddr);
		  fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, fnaddr);
		}
	      else if (TARGET_64BIT)
		{
		  fnaddr = gen_rtx_UNSPEC (Pmode,
					   gen_rtvec (1, addr),
					   UNSPEC_GOTPCREL);
		  fnaddr = gen_rtx_CONST (Pmode, fnaddr);
		}
	      else
		{
		  fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
					   UNSPEC_GOT);
		  fnaddr = gen_rtx_CONST (Pmode, fnaddr);
		  fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
					 fnaddr);
		}
	      fnaddr = gen_const_mem (Pmode, fnaddr);
	      /* Pmode may not be the same as word_mode for x32, which
		 doesn't support indirect branch via 32-bit memory slot.
		 Since x32 GOT slot is 64 bit with zero upper 32 bits,
		 indirect branch via x32 GOT slot is OK.  */
	      if (GET_MODE (fnaddr) != word_mode)
		fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
	      fnaddr = gen_rtx_MEM (QImode, fnaddr);
	    }
	}
    }

  /* Skip setting up RAX register for -mskip-rax-setup when there are no
     parameters passed in vector registers.  */
  if (TARGET_64BIT
      && (INTVAL (callarg2) > 0
	  || (INTVAL (callarg2) == 0
	      && (TARGET_SSE || !flag_skip_rax_setup))))
    {
      rtx al = gen_rtx_REG (QImode, AX_REG);
      emit_move_insn (al, callarg2);
      use_reg (&use, al);
    }

  if (ix86_cmodel == CM_LARGE_PIC
      && !TARGET_PECOFF
      && MEM_P (fnaddr)
      && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
      && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
    fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
  /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
     branch via x32 GOT slot is OK.  */
  else if (!(TARGET_X32
	     && MEM_P (fnaddr)
	     && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
	     && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
	   && (sibcall
	       ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
	       : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
    {
      fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
      fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
    }

  /* PR100665: Hwasan may tag code pointer which is not supported by LAM,
     mask off code pointers here.
     TODO: also need to handle indirect jump.  */
  if (ix86_memtag_can_tag_addresses () && !fndecl
      && sanitize_flags_p (SANITIZE_HWADDRESS))
    {
      rtx untagged_addr = ix86_memtag_untagged_pointer (XEXP (fnaddr, 0),
							NULL_RTX);
      fnaddr = gen_rtx_MEM (QImode, untagged_addr);
    }

  call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
  if (retval)
    call = gen_rtx_SET (retval, call);
  vec[vec_len++] = call;

  if (pop)
    {
      pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
      pop = gen_rtx_SET (stack_pointer_rtx, pop);
      vec[vec_len++] = pop;
    }

  if (cfun->machine->no_caller_saved_registers
      && (!fndecl
	  || (!TREE_THIS_VOLATILE (fndecl)
	      && !lookup_attribute ("no_caller_saved_registers",
				    TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
    {
      static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
      bool is_64bit_ms_abi = (TARGET_64BIT
			      && ix86_function_abi (fndecl) == MS_ABI);
      char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);

      /* If there are no caller-saved registers, add all registers
	 that are clobbered by the call which returns.  */
      for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
	if (!fixed_regs[i]
	    && (ix86_call_used_regs[i] == 1
		|| (ix86_call_used_regs[i] & c_mask))
	    && !STACK_REGNO_P (i)
	    && !MMX_REGNO_P (i))
	  clobber_reg (&use,
		       gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
    }
  else if (TARGET_64BIT_MS_ABI
	   && (!callarg2 || INTVAL (callarg2) != -2))
    {
      unsigned int i;

      for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
	{
	  int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
	  machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;

	  clobber_reg (&use, gen_rtx_REG (mode, regno));
	}

      /* Set here, but it may get cleared later.  */
      if (TARGET_CALL_MS2SYSV_XLOGUES)
	{
	  if (!TARGET_SSE)
	    ;

	  /* Don't break hot-patched functions.  */
	  else if (ix86_function_ms_hook_prologue (current_function_decl))
	    ;

	  /* TODO: Cases not yet examined.  */
	  else if (flag_split_stack)
	    warn_once_call_ms2sysv_xlogues ("-fsplit-stack");

	  else
	    {
	      gcc_assert (!reload_completed);
	      cfun->machine->call_ms2sysv = true;
	    }
	}
    }

  if (TARGET_MACHO && TARGET_64BIT && !sibcall
      && ((GET_CODE (addr) == SYMBOL_REF && !SYMBOL_REF_LOCAL_P (addr))
	  || !fndecl || TREE_PUBLIC (fndecl)))
    {
      /* We allow public functions defined in a TU to bind locally for PIC
	 code (the default) on 64bit Mach-O.
	 If such functions are not inlined, we cannot tell at compile-time if
	 they will be called via the lazy symbol resolver (this can depend on
	 options given at link-time).  Therefore, we must assume that the lazy
	 resolver could be used which clobbers R11 and R10.  */
      clobber_reg (&use, gen_rtx_REG (DImode, R11_REG));
      clobber_reg (&use, gen_rtx_REG (DImode, R10_REG));
    }

  if (vec_len > 1)
    call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
  rtx_insn *call_insn = emit_call_insn (call);
  if (use)
    CALL_INSN_FUNCTION_USAGE (call_insn) = use;

  return call_insn;
}
/* Split simple return with popping POPC bytes from stack to indirect
   branch with stack adjustment.  */
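/* Roughly, the emitted sequence is (sketch, not the exact RTL):

     pop    %ecx          ; return address -> ECX
     add    $POPC, %esp   ; drop the callee-popped argument bytes
     jmp    *%ecx         ; return

   with CFA notes attached so the unwinder still finds the return
   address.  */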
void
ix86_split_simple_return_pop_internal (rtx popc)
{
  struct machine_function *m = cfun->machine;
  rtx ecx = gen_rtx_REG (SImode, CX_REG);
  rtx_insn *insn;

  /* There is no "pascal" calling convention in any 64bit ABI.  */
  gcc_assert (!TARGET_64BIT);

  insn = emit_insn (gen_pop (ecx));
  m->fs.cfa_offset -= UNITS_PER_WORD;
  m->fs.sp_offset -= UNITS_PER_WORD;

  rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
  x = gen_rtx_SET (stack_pointer_rtx, x);
  add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
  add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
  RTX_FRAME_RELATED_P (insn) = 1;

  x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc);
  x = gen_rtx_SET (stack_pointer_rtx, x);
  insn = emit_insn (x);
  add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
  RTX_FRAME_RELATED_P (insn) = 1;

  /* Now return address is in ECX.  */
  emit_jump_insn (gen_simple_return_indirect_internal (ecx));
}
/* Errors in the source file can cause expand_expr to return const0_rtx
   where we expect a vector.  To avoid crashing, use one of the vector
   clear instructions.  */

static rtx
safe_vector_operand (rtx x, machine_mode mode)
{
  if (x == const0_rtx)
    x = CONST0_RTX (mode);
  return x;
}
9667 ix86_expand_binop_builtin (enum insn_code icode
, tree exp
, rtx target
)
9670 tree arg0
= CALL_EXPR_ARG (exp
, 0);
9671 tree arg1
= CALL_EXPR_ARG (exp
, 1);
9672 rtx op0
= expand_normal (arg0
);
9673 rtx op1
= expand_normal (arg1
);
9674 machine_mode tmode
= insn_data
[icode
].operand
[0].mode
;
9675 machine_mode mode0
= insn_data
[icode
].operand
[1].mode
;
9676 machine_mode mode1
= insn_data
[icode
].operand
[2].mode
;
9678 if (VECTOR_MODE_P (mode0
))
9679 op0
= safe_vector_operand (op0
, mode0
);
9680 if (VECTOR_MODE_P (mode1
))
9681 op1
= safe_vector_operand (op1
, mode1
);
9683 if (optimize
|| !target
9684 || GET_MODE (target
) != tmode
9685 || !insn_data
[icode
].operand
[0].predicate (target
, tmode
))
9686 target
= gen_reg_rtx (tmode
);
9688 if (GET_MODE (op1
) == SImode
&& mode1
== TImode
)
9690 rtx x
= gen_reg_rtx (V4SImode
);
9691 emit_insn (gen_sse2_loadd (x
, op1
));
9692 op1
= gen_lowpart (TImode
, x
);
9695 if (!insn_data
[icode
].operand
[1].predicate (op0
, mode0
))
9696 op0
= copy_to_mode_reg (mode0
, op0
);
9697 if (!insn_data
[icode
].operand
[2].predicate (op1
, mode1
))
9698 op1
= copy_to_mode_reg (mode1
, op1
);
9700 pat
= GEN_FCN (icode
) (target
, op0
, op1
);
/* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns.  */
9712 ix86_expand_multi_arg_builtin (enum insn_code icode
, tree exp
, rtx target
,
9713 enum ix86_builtin_func_type m_type
,
9714 enum rtx_code sub_code
)
9717 unsigned int i
, nargs
;
9718 bool comparison_p
= false;
9720 bool last_arg_constant
= false;
9724 machine_mode tmode
= insn_data
[icode
].operand
[0].mode
;
9728 case MULTI_ARG_4_DF2_DI_I
:
9729 case MULTI_ARG_4_DF2_DI_I1
:
9730 case MULTI_ARG_4_SF2_SI_I
:
9731 case MULTI_ARG_4_SF2_SI_I1
:
9733 last_arg_constant
= true;
9736 case MULTI_ARG_3_SF
:
9737 case MULTI_ARG_3_DF
:
9738 case MULTI_ARG_3_SF2
:
9739 case MULTI_ARG_3_DF2
:
9740 case MULTI_ARG_3_DI
:
9741 case MULTI_ARG_3_SI
:
9742 case MULTI_ARG_3_SI_DI
:
9743 case MULTI_ARG_3_HI
:
9744 case MULTI_ARG_3_HI_SI
:
9745 case MULTI_ARG_3_QI
:
9746 case MULTI_ARG_3_DI2
:
9747 case MULTI_ARG_3_SI2
:
9748 case MULTI_ARG_3_HI2
:
9749 case MULTI_ARG_3_QI2
:
9753 case MULTI_ARG_2_SF
:
9754 case MULTI_ARG_2_DF
:
9755 case MULTI_ARG_2_DI
:
9756 case MULTI_ARG_2_SI
:
9757 case MULTI_ARG_2_HI
:
9758 case MULTI_ARG_2_QI
:
9762 case MULTI_ARG_2_DI_IMM
:
9763 case MULTI_ARG_2_SI_IMM
:
9764 case MULTI_ARG_2_HI_IMM
:
9765 case MULTI_ARG_2_QI_IMM
:
9767 last_arg_constant
= true;
9770 case MULTI_ARG_1_SF
:
9771 case MULTI_ARG_1_DF
:
9772 case MULTI_ARG_1_SF2
:
9773 case MULTI_ARG_1_DF2
:
9774 case MULTI_ARG_1_DI
:
9775 case MULTI_ARG_1_SI
:
9776 case MULTI_ARG_1_HI
:
9777 case MULTI_ARG_1_QI
:
9778 case MULTI_ARG_1_SI_DI
:
9779 case MULTI_ARG_1_HI_DI
:
9780 case MULTI_ARG_1_HI_SI
:
9781 case MULTI_ARG_1_QI_DI
:
9782 case MULTI_ARG_1_QI_SI
:
9783 case MULTI_ARG_1_QI_HI
:
9787 case MULTI_ARG_2_DI_CMP
:
9788 case MULTI_ARG_2_SI_CMP
:
9789 case MULTI_ARG_2_HI_CMP
:
9790 case MULTI_ARG_2_QI_CMP
:
9792 comparison_p
= true;
9795 case MULTI_ARG_2_SF_TF
:
9796 case MULTI_ARG_2_DF_TF
:
9797 case MULTI_ARG_2_DI_TF
:
9798 case MULTI_ARG_2_SI_TF
:
9799 case MULTI_ARG_2_HI_TF
:
9800 case MULTI_ARG_2_QI_TF
:
9809 if (optimize
|| !target
9810 || GET_MODE (target
) != tmode
9811 || !insn_data
[icode
].operand
[0].predicate (target
, tmode
))
9812 target
= gen_reg_rtx (tmode
);
9813 else if (memory_operand (target
, tmode
))
9816 gcc_assert (nargs
<= ARRAY_SIZE (xops
));
9818 for (i
= 0; i
< nargs
; i
++)
9820 tree arg
= CALL_EXPR_ARG (exp
, i
);
9821 rtx op
= expand_normal (arg
);
9822 int adjust
= (comparison_p
) ? 1 : 0;
9823 machine_mode mode
= insn_data
[icode
].operand
[i
+adjust
+1].mode
;
9825 if (last_arg_constant
&& i
== nargs
- 1)
9827 if (!insn_data
[icode
].operand
[i
+ 1].predicate (op
, mode
))
9829 enum insn_code new_icode
= icode
;
9832 case CODE_FOR_xop_vpermil2v2df3
:
9833 case CODE_FOR_xop_vpermil2v4sf3
:
9834 case CODE_FOR_xop_vpermil2v4df3
:
9835 case CODE_FOR_xop_vpermil2v8sf3
:
9836 error ("the last argument must be a 2-bit immediate");
9837 return gen_reg_rtx (tmode
);
9838 case CODE_FOR_xop_rotlv2di3
:
9839 new_icode
= CODE_FOR_rotlv2di3
;
9841 case CODE_FOR_xop_rotlv4si3
:
9842 new_icode
= CODE_FOR_rotlv4si3
;
9844 case CODE_FOR_xop_rotlv8hi3
:
9845 new_icode
= CODE_FOR_rotlv8hi3
;
9847 case CODE_FOR_xop_rotlv16qi3
:
9848 new_icode
= CODE_FOR_rotlv16qi3
;
9850 if (CONST_INT_P (op
))
9852 int mask
= GET_MODE_UNIT_BITSIZE (tmode
) - 1;
9853 op
= GEN_INT (INTVAL (op
) & mask
);
9855 (insn_data
[icode
].operand
[i
+ 1].predicate (op
, mode
));
9861 && insn_data
[new_icode
].operand
[0].mode
== tmode
9862 && insn_data
[new_icode
].operand
[1].mode
== tmode
9863 && insn_data
[new_icode
].operand
[2].mode
== mode
9864 && insn_data
[new_icode
].operand
[0].predicate
9865 == insn_data
[icode
].operand
[0].predicate
9866 && insn_data
[new_icode
].operand
[1].predicate
9867 == insn_data
[icode
].operand
[1].predicate
);
9880 if (VECTOR_MODE_P (mode
))
9881 op
= safe_vector_operand (op
, mode
);
9883 /* If we aren't optimizing, only allow one memory operand to be
9885 if (memory_operand (op
, mode
))
9888 gcc_assert (GET_MODE (op
) == mode
|| GET_MODE (op
) == VOIDmode
);
9891 || !insn_data
[icode
].operand
[i
+adjust
+1].predicate (op
, mode
)
9893 op
= force_reg (mode
, op
);
9902 pat
= GEN_FCN (icode
) (target
, xops
[0]);
9907 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1],
9908 GEN_INT ((int)sub_code
));
9909 else if (! comparison_p
)
9910 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1]);
9913 rtx cmp_op
= gen_rtx_fmt_ee (sub_code
, GET_MODE (target
),
9916 pat
= GEN_FCN (icode
) (target
, cmp_op
, xops
[0], xops
[1]);
9921 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1], xops
[2]);
9925 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1], xops
[2], xops
[3]);
9939 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
9940 insns with vec_merge. */
9943 ix86_expand_unop_vec_merge_builtin (enum insn_code icode
, tree exp
,
9947 tree arg0
= CALL_EXPR_ARG (exp
, 0);
9948 rtx op1
, op0
= expand_normal (arg0
);
9949 machine_mode tmode
= insn_data
[icode
].operand
[0].mode
;
9950 machine_mode mode0
= insn_data
[icode
].operand
[1].mode
;
9952 if (optimize
|| !target
9953 || GET_MODE (target
) != tmode
9954 || !insn_data
[icode
].operand
[0].predicate (target
, tmode
))
9955 target
= gen_reg_rtx (tmode
);
9957 if (VECTOR_MODE_P (mode0
))
9958 op0
= safe_vector_operand (op0
, mode0
);
9960 if ((optimize
&& !register_operand (op0
, mode0
))
9961 || !insn_data
[icode
].operand
[1].predicate (op0
, mode0
))
9962 op0
= copy_to_mode_reg (mode0
, op0
);
9965 if (!insn_data
[icode
].operand
[2].predicate (op1
, mode0
))
9966 op1
= copy_to_mode_reg (mode0
, op1
);
9968 pat
= GEN_FCN (icode
) (target
, op0
, op1
);
9975 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
9978 ix86_expand_sse_compare (const struct builtin_description
*d
,
9979 tree exp
, rtx target
, bool swap
)
9982 tree arg0
= CALL_EXPR_ARG (exp
, 0);
9983 tree arg1
= CALL_EXPR_ARG (exp
, 1);
9984 rtx op0
= expand_normal (arg0
);
9985 rtx op1
= expand_normal (arg1
);
9987 machine_mode tmode
= insn_data
[d
->icode
].operand
[0].mode
;
9988 machine_mode mode0
= insn_data
[d
->icode
].operand
[1].mode
;
9989 machine_mode mode1
= insn_data
[d
->icode
].operand
[2].mode
;
9990 enum rtx_code comparison
= d
->comparison
;
9992 if (VECTOR_MODE_P (mode0
))
9993 op0
= safe_vector_operand (op0
, mode0
);
9994 if (VECTOR_MODE_P (mode1
))
9995 op1
= safe_vector_operand (op1
, mode1
);
9997 /* Swap operands if we have a comparison that isn't available in
10000 std::swap (op0
, op1
);
10002 if (optimize
|| !target
10003 || GET_MODE (target
) != tmode
10004 || !insn_data
[d
->icode
].operand
[0].predicate (target
, tmode
))
10005 target
= gen_reg_rtx (tmode
);
10007 if ((optimize
&& !register_operand (op0
, mode0
))
10008 || !insn_data
[d
->icode
].operand
[1].predicate (op0
, mode0
))
10009 op0
= copy_to_mode_reg (mode0
, op0
);
10010 if ((optimize
&& !register_operand (op1
, mode1
))
10011 || !insn_data
[d
->icode
].operand
[2].predicate (op1
, mode1
))
10012 op1
= copy_to_mode_reg (mode1
, op1
);
10014 op2
= gen_rtx_fmt_ee (comparison
, mode0
, op0
, op1
);
10015 pat
= GEN_FCN (d
->icode
) (target
, op0
, op1
, op2
);
/* Subroutine of ix86_sse_comi and ix86_sse_comi_round to take care of
 * ordered EQ or unordered NE, generate PF jump.  */

static rtx
ix86_ssecom_setcc (const enum rtx_code comparison,
		   bool check_unordered, machine_mode mode,
		   rtx set_dst, rtx target)
{
  rtx_code_label *label = NULL;

  /* NB: For ordered EQ or unordered NE, check ZF alone isn't sufficient
     with NAN operands.  */
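  /* For reference (sketch, not from this file): comis/ucomis set the
     flags as
	unordered   ZF=1 PF=1 CF=1
	greater     ZF=0 PF=0 CF=0
	less	    ZF=0 PF=0 CF=1
	equal	    ZF=1 PF=0 CF=0
     so ZF=1 cannot distinguish "equal" from "unordered"; the PF jump
     emitted below filters out the unordered case first.  */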
  if (check_unordered)
    {
      gcc_assert (comparison == EQ || comparison == NE);

      rtx flag = gen_rtx_REG (CCFPmode, FLAGS_REG);
      label = gen_label_rtx ();
      rtx tmp = gen_rtx_fmt_ee (UNORDERED, VOIDmode, flag, const0_rtx);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
				  gen_rtx_LABEL_REF (VOIDmode, label),
				  pc_rtx);
      emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
    }

  /* NB: Set CCFPmode and check a different CCmode which is in subset
     of CCFPmode.  */
  if (GET_MODE (set_dst) != mode)
    {
      gcc_assert (mode == CCAmode || mode == CCCmode
		  || mode == CCOmode || mode == CCPmode
		  || mode == CCSmode || mode == CCZmode);
      set_dst = gen_rtx_REG (mode, FLAGS_REG);
    }

  emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
			  gen_rtx_fmt_ee (comparison, QImode,
					  set_dst,
					  const0_rtx)));

  if (label)
    emit_label (label);

  return SUBREG_REG (target);
}
/* Subroutine of ix86_expand_builtin to take care of comi insns.  */
10072 ix86_expand_sse_comi (const struct builtin_description
*d
, tree exp
,
10076 tree arg0
= CALL_EXPR_ARG (exp
, 0);
10077 tree arg1
= CALL_EXPR_ARG (exp
, 1);
10078 rtx op0
= expand_normal (arg0
);
10079 rtx op1
= expand_normal (arg1
);
10080 enum insn_code icode
= d
->icode
;
10081 const struct insn_data_d
*insn_p
= &insn_data
[icode
];
10082 machine_mode mode0
= insn_p
->operand
[0].mode
;
10083 machine_mode mode1
= insn_p
->operand
[1].mode
;
10085 if (VECTOR_MODE_P (mode0
))
10086 op0
= safe_vector_operand (op0
, mode0
);
10087 if (VECTOR_MODE_P (mode1
))
10088 op1
= safe_vector_operand (op1
, mode1
);
10090 enum rtx_code comparison
= d
->comparison
;
10091 rtx const_val
= const0_rtx
;
10093 bool check_unordered
= false;
10094 machine_mode mode
= CCFPmode
;
10095 switch (comparison
)
10097 case LE
: /* -> GE */
10098 case LT
: /* -> GT */
10099 std::swap (op0
, op1
);
10100 comparison
= swap_condition (comparison
);
10106 check_unordered
= true;
10110 check_unordered
= true;
10112 const_val
= const1_rtx
;
10115 gcc_unreachable ();
10118 target
= gen_reg_rtx (SImode
);
10119 emit_move_insn (target
, const_val
);
10120 target
= gen_rtx_SUBREG (QImode
, target
, 0);
10122 if ((optimize
&& !register_operand (op0
, mode0
))
10123 || !insn_p
->operand
[0].predicate (op0
, mode0
))
10124 op0
= copy_to_mode_reg (mode0
, op0
);
10125 if ((optimize
&& !register_operand (op1
, mode1
))
10126 || !insn_p
->operand
[1].predicate (op1
, mode1
))
10127 op1
= copy_to_mode_reg (mode1
, op1
);
10129 pat
= GEN_FCN (icode
) (op0
, op1
);
10133 set_dst
= SET_DEST (pat
);
10135 return ix86_ssecom_setcc (comparison
, check_unordered
, mode
,
10139 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
10142 ix86_expand_sse_round (const struct builtin_description
*d
, tree exp
,
10146 tree arg0
= CALL_EXPR_ARG (exp
, 0);
10147 rtx op1
, op0
= expand_normal (arg0
);
10148 machine_mode tmode
= insn_data
[d
->icode
].operand
[0].mode
;
10149 machine_mode mode0
= insn_data
[d
->icode
].operand
[1].mode
;
10151 if (optimize
|| target
== 0
10152 || GET_MODE (target
) != tmode
10153 || !insn_data
[d
->icode
].operand
[0].predicate (target
, tmode
))
10154 target
= gen_reg_rtx (tmode
);
10156 if (VECTOR_MODE_P (mode0
))
10157 op0
= safe_vector_operand (op0
, mode0
);
10159 if ((optimize
&& !register_operand (op0
, mode0
))
10160 || !insn_data
[d
->icode
].operand
[0].predicate (op0
, mode0
))
10161 op0
= copy_to_mode_reg (mode0
, op0
);
10163 op1
= GEN_INT (d
->comparison
);
10165 pat
= GEN_FCN (d
->icode
) (target
, op0
, op1
);
10173 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description
*d
,
10174 tree exp
, rtx target
)
10177 tree arg0
= CALL_EXPR_ARG (exp
, 0);
10178 tree arg1
= CALL_EXPR_ARG (exp
, 1);
10179 rtx op0
= expand_normal (arg0
);
10180 rtx op1
= expand_normal (arg1
);
10182 machine_mode tmode
= insn_data
[d
->icode
].operand
[0].mode
;
10183 machine_mode mode0
= insn_data
[d
->icode
].operand
[1].mode
;
10184 machine_mode mode1
= insn_data
[d
->icode
].operand
[2].mode
;
10186 if (optimize
|| target
== 0
10187 || GET_MODE (target
) != tmode
10188 || !insn_data
[d
->icode
].operand
[0].predicate (target
, tmode
))
10189 target
= gen_reg_rtx (tmode
);
10191 op0
= safe_vector_operand (op0
, mode0
);
10192 op1
= safe_vector_operand (op1
, mode1
);
10194 if ((optimize
&& !register_operand (op0
, mode0
))
10195 || !insn_data
[d
->icode
].operand
[0].predicate (op0
, mode0
))
10196 op0
= copy_to_mode_reg (mode0
, op0
);
10197 if ((optimize
&& !register_operand (op1
, mode1
))
10198 || !insn_data
[d
->icode
].operand
[1].predicate (op1
, mode1
))
10199 op1
= copy_to_mode_reg (mode1
, op1
);
10201 op2
= GEN_INT (d
->comparison
);
10203 pat
= GEN_FCN (d
->icode
) (target
, op0
, op1
, op2
);
/* Subroutine of ix86_expand_builtin to take care of ptest insns.  */
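/* As a reminder (not taken from the pattern itself): ptest sets ZF when
   the AND of its two operands is all zeros and CF when the AND-NOT is all
   zeros, so the testz/testc/testnzc builtins expanded below just read one
   bit of FLAGS back through the comparison code stored in D->comparison.  */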
static rtx
ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
		       rtx target)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  machine_mode mode0 = insn_data[d->icode].operand[0].mode;
  machine_mode mode1 = insn_data[d->icode].operand[1].mode;
  enum rtx_code comparison = d->comparison;

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);
  if (VECTOR_MODE_P (mode1))
    op1 = safe_vector_operand (op1, mode1);

  target = gen_reg_rtx (SImode);
  emit_move_insn (target, const0_rtx);
  target = gen_rtx_SUBREG (QImode, target, 0);

  if ((optimize && !register_operand (op0, mode0))
      || !insn_data[d->icode].operand[0].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
  if ((optimize && !register_operand (op1, mode1))
      || !insn_data[d->icode].operand[1].predicate (op1, mode1))
    op1 = copy_to_mode_reg (mode1, op1);

  pat = GEN_FCN (d->icode) (op0, op1);
  if (! pat)
    return 0;
  emit_insn (pat);
  emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
			  gen_rtx_fmt_ee (comparison, QImode,
					  gen_rtx_REG (CCmode, FLAGS_REG),
					  const0_rtx)));

  return SUBREG_REG (target);
}
/* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns.  */
10256 ix86_expand_sse_pcmpestr (const struct builtin_description
*d
,
10257 tree exp
, rtx target
)
10260 tree arg0
= CALL_EXPR_ARG (exp
, 0);
10261 tree arg1
= CALL_EXPR_ARG (exp
, 1);
10262 tree arg2
= CALL_EXPR_ARG (exp
, 2);
10263 tree arg3
= CALL_EXPR_ARG (exp
, 3);
10264 tree arg4
= CALL_EXPR_ARG (exp
, 4);
10265 rtx scratch0
, scratch1
;
10266 rtx op0
= expand_normal (arg0
);
10267 rtx op1
= expand_normal (arg1
);
10268 rtx op2
= expand_normal (arg2
);
10269 rtx op3
= expand_normal (arg3
);
10270 rtx op4
= expand_normal (arg4
);
10271 machine_mode tmode0
, tmode1
, modev2
, modei3
, modev4
, modei5
, modeimm
;
10273 tmode0
= insn_data
[d
->icode
].operand
[0].mode
;
10274 tmode1
= insn_data
[d
->icode
].operand
[1].mode
;
10275 modev2
= insn_data
[d
->icode
].operand
[2].mode
;
10276 modei3
= insn_data
[d
->icode
].operand
[3].mode
;
10277 modev4
= insn_data
[d
->icode
].operand
[4].mode
;
10278 modei5
= insn_data
[d
->icode
].operand
[5].mode
;
10279 modeimm
= insn_data
[d
->icode
].operand
[6].mode
;
10281 if (VECTOR_MODE_P (modev2
))
10282 op0
= safe_vector_operand (op0
, modev2
);
10283 if (VECTOR_MODE_P (modev4
))
10284 op2
= safe_vector_operand (op2
, modev4
);
10286 if (!insn_data
[d
->icode
].operand
[2].predicate (op0
, modev2
))
10287 op0
= copy_to_mode_reg (modev2
, op0
);
10288 if (!insn_data
[d
->icode
].operand
[3].predicate (op1
, modei3
))
10289 op1
= copy_to_mode_reg (modei3
, op1
);
10290 if ((optimize
&& !register_operand (op2
, modev4
))
10291 || !insn_data
[d
->icode
].operand
[4].predicate (op2
, modev4
))
10292 op2
= copy_to_mode_reg (modev4
, op2
);
10293 if (!insn_data
[d
->icode
].operand
[5].predicate (op3
, modei5
))
10294 op3
= copy_to_mode_reg (modei5
, op3
);
10296 if (!insn_data
[d
->icode
].operand
[6].predicate (op4
, modeimm
))
10298 error ("the fifth argument must be an 8-bit immediate");
10302 if (d
->code
== IX86_BUILTIN_PCMPESTRI128
)
10304 if (optimize
|| !target
10305 || GET_MODE (target
) != tmode0
10306 || !insn_data
[d
->icode
].operand
[0].predicate (target
, tmode0
))
10307 target
= gen_reg_rtx (tmode0
);
10309 scratch1
= gen_reg_rtx (tmode1
);
10311 pat
= GEN_FCN (d
->icode
) (target
, scratch1
, op0
, op1
, op2
, op3
, op4
);
10313 else if (d
->code
== IX86_BUILTIN_PCMPESTRM128
)
10315 if (optimize
|| !target
10316 || GET_MODE (target
) != tmode1
10317 || !insn_data
[d
->icode
].operand
[1].predicate (target
, tmode1
))
10318 target
= gen_reg_rtx (tmode1
);
10320 scratch0
= gen_reg_rtx (tmode0
);
10322 pat
= GEN_FCN (d
->icode
) (scratch0
, target
, op0
, op1
, op2
, op3
, op4
);
10326 gcc_assert (d
->flag
);
10328 scratch0
= gen_reg_rtx (tmode0
);
10329 scratch1
= gen_reg_rtx (tmode1
);
10331 pat
= GEN_FCN (d
->icode
) (scratch0
, scratch1
, op0
, op1
, op2
, op3
, op4
);
10341 target
= gen_reg_rtx (SImode
);
10342 emit_move_insn (target
, const0_rtx
);
10343 target
= gen_rtx_SUBREG (QImode
, target
, 0);
10346 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode
, target
),
10347 gen_rtx_fmt_ee (EQ
, QImode
,
10348 gen_rtx_REG ((machine_mode
) d
->flag
,
10351 return SUBREG_REG (target
);
10358 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
10361 ix86_expand_sse_pcmpistr (const struct builtin_description
*d
,
10362 tree exp
, rtx target
)
10365 tree arg0
= CALL_EXPR_ARG (exp
, 0);
10366 tree arg1
= CALL_EXPR_ARG (exp
, 1);
10367 tree arg2
= CALL_EXPR_ARG (exp
, 2);
10368 rtx scratch0
, scratch1
;
10369 rtx op0
= expand_normal (arg0
);
10370 rtx op1
= expand_normal (arg1
);
10371 rtx op2
= expand_normal (arg2
);
10372 machine_mode tmode0
, tmode1
, modev2
, modev3
, modeimm
;
10374 tmode0
= insn_data
[d
->icode
].operand
[0].mode
;
10375 tmode1
= insn_data
[d
->icode
].operand
[1].mode
;
10376 modev2
= insn_data
[d
->icode
].operand
[2].mode
;
10377 modev3
= insn_data
[d
->icode
].operand
[3].mode
;
10378 modeimm
= insn_data
[d
->icode
].operand
[4].mode
;
10380 if (VECTOR_MODE_P (modev2
))
10381 op0
= safe_vector_operand (op0
, modev2
);
10382 if (VECTOR_MODE_P (modev3
))
10383 op1
= safe_vector_operand (op1
, modev3
);
10385 if (!insn_data
[d
->icode
].operand
[2].predicate (op0
, modev2
))
10386 op0
= copy_to_mode_reg (modev2
, op0
);
10387 if ((optimize
&& !register_operand (op1
, modev3
))
10388 || !insn_data
[d
->icode
].operand
[3].predicate (op1
, modev3
))
10389 op1
= copy_to_mode_reg (modev3
, op1
);
10391 if (!insn_data
[d
->icode
].operand
[4].predicate (op2
, modeimm
))
10393 error ("the third argument must be an 8-bit immediate");
10397 if (d
->code
== IX86_BUILTIN_PCMPISTRI128
)
10399 if (optimize
|| !target
10400 || GET_MODE (target
) != tmode0
10401 || !insn_data
[d
->icode
].operand
[0].predicate (target
, tmode0
))
10402 target
= gen_reg_rtx (tmode0
);
10404 scratch1
= gen_reg_rtx (tmode1
);
10406 pat
= GEN_FCN (d
->icode
) (target
, scratch1
, op0
, op1
, op2
);
10408 else if (d
->code
== IX86_BUILTIN_PCMPISTRM128
)
10410 if (optimize
|| !target
10411 || GET_MODE (target
) != tmode1
10412 || !insn_data
[d
->icode
].operand
[1].predicate (target
, tmode1
))
10413 target
= gen_reg_rtx (tmode1
);
10415 scratch0
= gen_reg_rtx (tmode0
);
10417 pat
= GEN_FCN (d
->icode
) (scratch0
, target
, op0
, op1
, op2
);
10421 gcc_assert (d
->flag
);
10423 scratch0
= gen_reg_rtx (tmode0
);
10424 scratch1
= gen_reg_rtx (tmode1
);
10426 pat
= GEN_FCN (d
->icode
) (scratch0
, scratch1
, op0
, op1
, op2
);
10436 target
= gen_reg_rtx (SImode
);
10437 emit_move_insn (target
, const0_rtx
);
10438 target
= gen_rtx_SUBREG (QImode
, target
, 0);
10441 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode
, target
),
10442 gen_rtx_fmt_ee (EQ
, QImode
,
10443 gen_rtx_REG ((machine_mode
) d
->flag
,
10446 return SUBREG_REG (target
);
/* Fixup modeless constants to fit required mode.  */

static rtx
fixup_modeless_constant (rtx x, machine_mode mode)
{
  if (GET_MODE (x) == VOIDmode)
    x = convert_to_mode (mode, x, 1);
  return x;
}
/* Subroutine of ix86_expand_builtin to take care of insns with
   variable number of operands.  */
10466 ix86_expand_args_builtin (const struct builtin_description
*d
,
10467 tree exp
, rtx target
)
10469 rtx pat
, real_target
;
10470 unsigned int i
, nargs
;
10471 unsigned int nargs_constant
= 0;
10472 unsigned int mask_pos
= 0;
10473 int num_memory
= 0;
10475 bool second_arg_count
= false;
10476 enum insn_code icode
= d
->icode
;
10477 const struct insn_data_d
*insn_p
= &insn_data
[icode
];
10478 machine_mode tmode
= insn_p
->operand
[0].mode
;
10479 machine_mode rmode
= VOIDmode
;
10481 enum rtx_code comparison
= d
->comparison
;
10483 switch ((enum ix86_builtin_func_type
) d
->flag
)
10485 case V2DF_FTYPE_V2DF_ROUND
:
10486 case V4DF_FTYPE_V4DF_ROUND
:
10487 case V8DF_FTYPE_V8DF_ROUND
:
10488 case V4SF_FTYPE_V4SF_ROUND
:
10489 case V8SF_FTYPE_V8SF_ROUND
:
10490 case V16SF_FTYPE_V16SF_ROUND
:
10491 case V8HF_FTYPE_V8HF_ROUND
:
10492 case V16HF_FTYPE_V16HF_ROUND
:
10493 case V32HF_FTYPE_V32HF_ROUND
:
10494 case V4SI_FTYPE_V4SF_ROUND
:
10495 case V8SI_FTYPE_V8SF_ROUND
:
10496 case V16SI_FTYPE_V16SF_ROUND
:
10497 return ix86_expand_sse_round (d
, exp
, target
);
10498 case V4SI_FTYPE_V2DF_V2DF_ROUND
:
10499 case V8SI_FTYPE_V4DF_V4DF_ROUND
:
10500 case V16SI_FTYPE_V8DF_V8DF_ROUND
:
10501 return ix86_expand_sse_round_vec_pack_sfix (d
, exp
, target
);
10502 case INT_FTYPE_V8SF_V8SF_PTEST
:
10503 case INT_FTYPE_V4DI_V4DI_PTEST
:
10504 case INT_FTYPE_V4DF_V4DF_PTEST
:
10505 case INT_FTYPE_V4SF_V4SF_PTEST
:
10506 case INT_FTYPE_V2DI_V2DI_PTEST
:
10507 case INT_FTYPE_V2DF_V2DF_PTEST
:
10508 return ix86_expand_sse_ptest (d
, exp
, target
);
10509 case FLOAT128_FTYPE_FLOAT128
:
10510 case FLOAT_FTYPE_FLOAT
:
10511 case FLOAT_FTYPE_BFLOAT16
:
10512 case INT_FTYPE_INT
:
10513 case UINT_FTYPE_UINT
:
10514 case UINT16_FTYPE_UINT16
:
10515 case UINT64_FTYPE_INT
:
10516 case UINT64_FTYPE_UINT64
:
10517 case INT64_FTYPE_INT64
:
10518 case INT64_FTYPE_V4SF
:
10519 case INT64_FTYPE_V2DF
:
10520 case INT_FTYPE_V16QI
:
10521 case INT_FTYPE_V8QI
:
10522 case INT_FTYPE_V8SF
:
10523 case INT_FTYPE_V4DF
:
10524 case INT_FTYPE_V4SF
:
10525 case INT_FTYPE_V2DF
:
10526 case INT_FTYPE_V32QI
:
10527 case V16QI_FTYPE_V16QI
:
10528 case V8SI_FTYPE_V8SF
:
10529 case V8SI_FTYPE_V4SI
:
10530 case V8HI_FTYPE_V8HI
:
10531 case V8HI_FTYPE_V16QI
:
10532 case V8QI_FTYPE_V8QI
:
10533 case V8SF_FTYPE_V8SF
:
10534 case V8SF_FTYPE_V8SI
:
10535 case V8SF_FTYPE_V4SF
:
10536 case V8SF_FTYPE_V8HI
:
10537 case V4SI_FTYPE_V4SI
:
10538 case V4SI_FTYPE_V16QI
:
10539 case V4SI_FTYPE_V4SF
:
10540 case V4SI_FTYPE_V8SI
:
10541 case V4SI_FTYPE_V8HI
:
10542 case V4SI_FTYPE_V4DF
:
10543 case V4SI_FTYPE_V2DF
:
10544 case V4HI_FTYPE_V4HI
:
10545 case V4DF_FTYPE_V4DF
:
10546 case V4DF_FTYPE_V4SI
:
10547 case V4DF_FTYPE_V4SF
:
10548 case V4DF_FTYPE_V2DF
:
10549 case V4SF_FTYPE_V4SF
:
10550 case V4SF_FTYPE_V4SI
:
10551 case V4SF_FTYPE_V8SF
:
10552 case V4SF_FTYPE_V4DF
:
10553 case V4SF_FTYPE_V8HI
:
10554 case V4SF_FTYPE_V2DF
:
10555 case V2DI_FTYPE_V2DI
:
10556 case V2DI_FTYPE_V16QI
:
10557 case V2DI_FTYPE_V8HI
:
10558 case V2DI_FTYPE_V4SI
:
10559 case V2DF_FTYPE_V2DF
:
10560 case V2DF_FTYPE_V4SI
:
10561 case V2DF_FTYPE_V4DF
:
10562 case V2DF_FTYPE_V4SF
:
10563 case V2DF_FTYPE_V2SI
:
10564 case V2SI_FTYPE_V2SI
:
10565 case V2SI_FTYPE_V4SF
:
10566 case V2SI_FTYPE_V2SF
:
10567 case V2SI_FTYPE_V2DF
:
10568 case V2SF_FTYPE_V2SF
:
10569 case V2SF_FTYPE_V2SI
:
10570 case V32QI_FTYPE_V32QI
:
10571 case V32QI_FTYPE_V16QI
:
10572 case V16HI_FTYPE_V16HI
:
10573 case V16HI_FTYPE_V8HI
:
10574 case V8SI_FTYPE_V8SI
:
10575 case V16HI_FTYPE_V16QI
:
10576 case V8SI_FTYPE_V16QI
:
10577 case V4DI_FTYPE_V16QI
:
10578 case V8SI_FTYPE_V8HI
:
10579 case V4DI_FTYPE_V8HI
:
10580 case V4DI_FTYPE_V4SI
:
10581 case V4DI_FTYPE_V2DI
:
10582 case UQI_FTYPE_UQI
:
10583 case UHI_FTYPE_UHI
:
10584 case USI_FTYPE_USI
:
10585 case USI_FTYPE_UQI
:
10586 case USI_FTYPE_UHI
:
10587 case UDI_FTYPE_UDI
:
10588 case UHI_FTYPE_V16QI
:
10589 case USI_FTYPE_V32QI
:
10590 case UDI_FTYPE_V64QI
:
10591 case V16QI_FTYPE_UHI
:
10592 case V32QI_FTYPE_USI
:
10593 case V64QI_FTYPE_UDI
:
10594 case V8HI_FTYPE_UQI
:
10595 case V16HI_FTYPE_UHI
:
10596 case V32HI_FTYPE_USI
:
10597 case V4SI_FTYPE_UQI
:
10598 case V8SI_FTYPE_UQI
:
10599 case V4SI_FTYPE_UHI
:
10600 case V8SI_FTYPE_UHI
:
10601 case UQI_FTYPE_V8HI
:
10602 case UHI_FTYPE_V16HI
:
10603 case USI_FTYPE_V32HI
:
10604 case UQI_FTYPE_V4SI
:
10605 case UQI_FTYPE_V8SI
:
10606 case UHI_FTYPE_V16SI
:
10607 case UQI_FTYPE_V2DI
:
10608 case UQI_FTYPE_V4DI
:
10609 case UQI_FTYPE_V8DI
:
10610 case V16SI_FTYPE_UHI
:
10611 case V2DI_FTYPE_UQI
:
10612 case V4DI_FTYPE_UQI
:
10613 case V16SI_FTYPE_INT
:
10614 case V16SF_FTYPE_V8SF
:
10615 case V16SI_FTYPE_V8SI
:
10616 case V16SF_FTYPE_V4SF
:
10617 case V16SI_FTYPE_V4SI
:
10618 case V16SI_FTYPE_V16SF
:
10619 case V16SI_FTYPE_V16SI
:
10620 case V64QI_FTYPE_V64QI
:
10621 case V32HI_FTYPE_V32HI
:
10622 case V16SF_FTYPE_V16SF
:
10623 case V8DI_FTYPE_UQI
:
10624 case V8DI_FTYPE_V8DI
:
10625 case V8DF_FTYPE_V4DF
:
10626 case V8DF_FTYPE_V2DF
:
10627 case V8DF_FTYPE_V8DF
:
10628 case V4DI_FTYPE_V4DI
:
10629 case V16BF_FTYPE_V16SF
:
10630 case V8BF_FTYPE_V8SF
:
10631 case V8BF_FTYPE_V4SF
:
10634 case V4SF_FTYPE_V4SF_VEC_MERGE
:
10635 case V2DF_FTYPE_V2DF_VEC_MERGE
:
10636 return ix86_expand_unop_vec_merge_builtin (icode
, exp
, target
);
10637 case FLOAT128_FTYPE_FLOAT128_FLOAT128
:
10638 case V16QI_FTYPE_V16QI_V16QI
:
10639 case V16QI_FTYPE_V8HI_V8HI
:
10640 case V16HF_FTYPE_V16HF_V16HF
:
10641 case V16SF_FTYPE_V16SF_V16SF
:
10642 case V8QI_FTYPE_V8QI_V8QI
:
10643 case V8QI_FTYPE_V4HI_V4HI
:
10644 case V8HI_FTYPE_V8HI_V8HI
:
10645 case V8HI_FTYPE_V16QI_V16QI
:
10646 case V8HI_FTYPE_V4SI_V4SI
:
10647 case V8HF_FTYPE_V8HF_V8HF
:
10648 case V8SF_FTYPE_V8SF_V8SF
:
10649 case V8SF_FTYPE_V8SF_V8SI
:
10650 case V8DF_FTYPE_V8DF_V8DF
:
10651 case V4SI_FTYPE_V4SI_V4SI
:
10652 case V4SI_FTYPE_V8HI_V8HI
:
10653 case V4SI_FTYPE_V2DF_V2DF
:
10654 case V4HI_FTYPE_V4HI_V4HI
:
10655 case V4HI_FTYPE_V8QI_V8QI
:
10656 case V4HI_FTYPE_V2SI_V2SI
:
10657 case V4DF_FTYPE_V4DF_V4DF
:
10658 case V4DF_FTYPE_V4DF_V4DI
:
10659 case V4SF_FTYPE_V4SF_V4SF
:
10660 case V4SF_FTYPE_V4SF_V4SI
:
10661 case V4SF_FTYPE_V4SF_V2SI
:
10662 case V4SF_FTYPE_V4SF_V2DF
:
10663 case V4SF_FTYPE_V4SF_UINT
:
10664 case V4SF_FTYPE_V4SF_DI
:
10665 case V4SF_FTYPE_V4SF_SI
:
10666 case V2DI_FTYPE_V2DI_V2DI
:
10667 case V2DI_FTYPE_V16QI_V16QI
:
10668 case V2DI_FTYPE_V4SI_V4SI
:
10669 case V2DI_FTYPE_V2DI_V16QI
:
10670 case V2SI_FTYPE_V2SI_V2SI
:
10671 case V2SI_FTYPE_V4HI_V4HI
:
10672 case V2SI_FTYPE_V2SF_V2SF
:
10673 case V2DF_FTYPE_V2DF_V2DF
:
10674 case V2DF_FTYPE_V2DF_V4SF
:
10675 case V2DF_FTYPE_V2DF_V2DI
:
10676 case V2DF_FTYPE_V2DF_DI
:
10677 case V2DF_FTYPE_V2DF_SI
:
10678 case V2DF_FTYPE_V2DF_UINT
:
10679 case V2SF_FTYPE_V2SF_V2SF
:
10680 case V1DI_FTYPE_V1DI_V1DI
:
10681 case V1DI_FTYPE_V8QI_V8QI
:
10682 case V1DI_FTYPE_V2SI_V2SI
:
10683 case V32QI_FTYPE_V16HI_V16HI
:
10684 case V16HI_FTYPE_V8SI_V8SI
:
10685 case V64QI_FTYPE_V64QI_V64QI
:
10686 case V32QI_FTYPE_V32QI_V32QI
:
10687 case V16HI_FTYPE_V32QI_V32QI
:
10688 case V16HI_FTYPE_V16HI_V16HI
:
10689 case V8SI_FTYPE_V4DF_V4DF
:
10690 case V8SI_FTYPE_V8SI_V8SI
:
10691 case V8SI_FTYPE_V16HI_V16HI
:
10692 case V4DI_FTYPE_V4DI_V4DI
:
10693 case V4DI_FTYPE_V8SI_V8SI
:
10694 case V4DI_FTYPE_V32QI_V32QI
:
10695 case V8DI_FTYPE_V64QI_V64QI
:
10696 if (comparison
== UNKNOWN
)
10697 return ix86_expand_binop_builtin (icode
, exp
, target
);
10700 case V4SF_FTYPE_V4SF_V4SF_SWAP
:
10701 case V2DF_FTYPE_V2DF_V2DF_SWAP
:
10702 gcc_assert (comparison
!= UNKNOWN
);
10706 case V16HI_FTYPE_V16HI_V8HI_COUNT
:
10707 case V16HI_FTYPE_V16HI_SI_COUNT
:
10708 case V8SI_FTYPE_V8SI_V4SI_COUNT
:
10709 case V8SI_FTYPE_V8SI_SI_COUNT
:
10710 case V4DI_FTYPE_V4DI_V2DI_COUNT
:
10711 case V4DI_FTYPE_V4DI_INT_COUNT
:
10712 case V8HI_FTYPE_V8HI_V8HI_COUNT
:
10713 case V8HI_FTYPE_V8HI_SI_COUNT
:
10714 case V4SI_FTYPE_V4SI_V4SI_COUNT
:
10715 case V4SI_FTYPE_V4SI_SI_COUNT
:
10716 case V4HI_FTYPE_V4HI_V4HI_COUNT
:
10717 case V4HI_FTYPE_V4HI_SI_COUNT
:
10718 case V2DI_FTYPE_V2DI_V2DI_COUNT
:
10719 case V2DI_FTYPE_V2DI_SI_COUNT
:
10720 case V2SI_FTYPE_V2SI_V2SI_COUNT
:
10721 case V2SI_FTYPE_V2SI_SI_COUNT
:
10722 case V1DI_FTYPE_V1DI_V1DI_COUNT
:
10723 case V1DI_FTYPE_V1DI_SI_COUNT
:
10725 second_arg_count
= true;
10727 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT
:
10728 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT
:
10729 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT
:
10730 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT
:
10731 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT
:
10732 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT
:
10733 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT
:
10734 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT
:
10735 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT
:
10736 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT
:
10737 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT
:
10738 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT
:
10739 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT
:
10740 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT
:
10741 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT
:
10742 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT
:
10743 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT
:
10744 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT
:
10746 second_arg_count
= true;
10748 case UINT64_FTYPE_UINT64_UINT64
:
10749 case UINT_FTYPE_UINT_UINT
:
10750 case UINT_FTYPE_UINT_USHORT
:
10751 case UINT_FTYPE_UINT_UCHAR
:
10752 case UINT16_FTYPE_UINT16_INT
:
10753 case UINT8_FTYPE_UINT8_INT
:
10754 case UQI_FTYPE_UQI_UQI
:
10755 case UHI_FTYPE_UHI_UHI
:
10756 case USI_FTYPE_USI_USI
:
10757 case UDI_FTYPE_UDI_UDI
:
10758 case V16SI_FTYPE_V8DF_V8DF
:
10759 case V32BF_FTYPE_V16SF_V16SF
:
10760 case V16BF_FTYPE_V8SF_V8SF
:
10761 case V8BF_FTYPE_V4SF_V4SF
:
10762 case V16BF_FTYPE_V16SF_UHI
:
10763 case V8BF_FTYPE_V8SF_UQI
:
10764 case V8BF_FTYPE_V4SF_UQI
:
10767 case V2DI_FTYPE_V2DI_INT_CONVERT
:
10770 nargs_constant
= 1;
10772 case V4DI_FTYPE_V4DI_INT_CONVERT
:
10775 nargs_constant
= 1;
10777 case V8DI_FTYPE_V8DI_INT_CONVERT
:
10780 nargs_constant
= 1;
10782 case V8HI_FTYPE_V8HI_INT
:
10783 case V8HI_FTYPE_V8SF_INT
:
10784 case V16HI_FTYPE_V16SF_INT
:
10785 case V8HI_FTYPE_V4SF_INT
:
10786 case V8SF_FTYPE_V8SF_INT
:
10787 case V4SF_FTYPE_V16SF_INT
:
10788 case V16SF_FTYPE_V16SF_INT
:
10789 case V4SI_FTYPE_V4SI_INT
:
10790 case V4SI_FTYPE_V8SI_INT
:
10791 case V4HI_FTYPE_V4HI_INT
:
10792 case V4DF_FTYPE_V4DF_INT
:
10793 case V4DF_FTYPE_V8DF_INT
:
10794 case V4SF_FTYPE_V4SF_INT
:
10795 case V4SF_FTYPE_V8SF_INT
:
10796 case V2DI_FTYPE_V2DI_INT
:
10797 case V2DF_FTYPE_V2DF_INT
:
10798 case V2DF_FTYPE_V4DF_INT
:
10799 case V16HI_FTYPE_V16HI_INT
:
10800 case V8SI_FTYPE_V8SI_INT
:
10801 case V16SI_FTYPE_V16SI_INT
:
10802 case V4SI_FTYPE_V16SI_INT
:
10803 case V4DI_FTYPE_V4DI_INT
:
10804 case V2DI_FTYPE_V4DI_INT
:
10805 case V4DI_FTYPE_V8DI_INT
:
10806 case UQI_FTYPE_UQI_UQI_CONST
:
10807 case UHI_FTYPE_UHI_UQI
:
10808 case USI_FTYPE_USI_UQI
:
10809 case UDI_FTYPE_UDI_UQI
:
10811 nargs_constant
= 1;
10813 case V16QI_FTYPE_V16QI_V16QI_V16QI
:
10814 case V8SF_FTYPE_V8SF_V8SF_V8SF
:
10815 case V4DF_FTYPE_V4DF_V4DF_V4DF
:
10816 case V4SF_FTYPE_V4SF_V4SF_V4SF
:
10817 case V2DF_FTYPE_V2DF_V2DF_V2DF
:
10818 case V32QI_FTYPE_V32QI_V32QI_V32QI
:
10819 case UHI_FTYPE_V16SI_V16SI_UHI
:
10820 case UQI_FTYPE_V8DI_V8DI_UQI
:
10821 case V16HI_FTYPE_V16SI_V16HI_UHI
:
10822 case V16QI_FTYPE_V16SI_V16QI_UHI
:
10823 case V16QI_FTYPE_V8DI_V16QI_UQI
:
10824 case V32HF_FTYPE_V32HF_V32HF_USI
:
10825 case V16SF_FTYPE_V16SF_V16SF_UHI
:
10826 case V16SF_FTYPE_V4SF_V16SF_UHI
:
10827 case V16SI_FTYPE_SI_V16SI_UHI
:
10828 case V16SI_FTYPE_V16HI_V16SI_UHI
:
10829 case V16SI_FTYPE_V16QI_V16SI_UHI
:
10830 case V8SF_FTYPE_V4SF_V8SF_UQI
:
10831 case V4DF_FTYPE_V2DF_V4DF_UQI
:
10832 case V8SI_FTYPE_V4SI_V8SI_UQI
:
10833 case V8SI_FTYPE_SI_V8SI_UQI
:
10834 case V4SI_FTYPE_V4SI_V4SI_UQI
:
10835 case V4SI_FTYPE_SI_V4SI_UQI
:
10836 case V4DI_FTYPE_V2DI_V4DI_UQI
:
10837 case V4DI_FTYPE_DI_V4DI_UQI
:
10838 case V2DI_FTYPE_V2DI_V2DI_UQI
:
10839 case V2DI_FTYPE_DI_V2DI_UQI
:
10840 case V64QI_FTYPE_V64QI_V64QI_UDI
:
10841 case V64QI_FTYPE_V16QI_V64QI_UDI
:
10842 case V64QI_FTYPE_QI_V64QI_UDI
:
10843 case V32QI_FTYPE_V32QI_V32QI_USI
:
10844 case V32QI_FTYPE_V16QI_V32QI_USI
:
10845 case V32QI_FTYPE_QI_V32QI_USI
:
10846 case V16QI_FTYPE_V16QI_V16QI_UHI
:
10847 case V16QI_FTYPE_QI_V16QI_UHI
:
10848 case V32HI_FTYPE_V8HI_V32HI_USI
:
10849 case V32HI_FTYPE_HI_V32HI_USI
:
10850 case V16HI_FTYPE_V8HI_V16HI_UHI
:
10851 case V16HI_FTYPE_HI_V16HI_UHI
:
10852 case V8HI_FTYPE_V8HI_V8HI_UQI
:
10853 case V8HI_FTYPE_HI_V8HI_UQI
:
10854 case V16HF_FTYPE_V16HF_V16HF_UHI
:
10855 case V8SF_FTYPE_V8HI_V8SF_UQI
:
10856 case V4SF_FTYPE_V8HI_V4SF_UQI
:
10857 case V8SI_FTYPE_V8HF_V8SI_UQI
:
10858 case V8SF_FTYPE_V8HF_V8SF_UQI
:
10859 case V8SI_FTYPE_V8SF_V8SI_UQI
:
10860 case V4SI_FTYPE_V4SF_V4SI_UQI
:
10861 case V4SI_FTYPE_V8HF_V4SI_UQI
:
10862 case V4SF_FTYPE_V8HF_V4SF_UQI
:
10863 case V4DI_FTYPE_V8HF_V4DI_UQI
:
10864 case V4DI_FTYPE_V4SF_V4DI_UQI
:
10865 case V2DI_FTYPE_V8HF_V2DI_UQI
:
10866 case V2DI_FTYPE_V4SF_V2DI_UQI
:
10867 case V8HF_FTYPE_V8HF_V8HF_UQI
:
10868 case V8HF_FTYPE_V8HF_V8HF_V8HF
:
10869 case V8HF_FTYPE_V8HI_V8HF_UQI
:
10870 case V8HF_FTYPE_V8SI_V8HF_UQI
:
10871 case V8HF_FTYPE_V8SF_V8HF_UQI
:
10872 case V8HF_FTYPE_V4SI_V8HF_UQI
:
10873 case V8HF_FTYPE_V4SF_V8HF_UQI
:
10874 case V8HF_FTYPE_V4DI_V8HF_UQI
:
10875 case V8HF_FTYPE_V4DF_V8HF_UQI
:
10876 case V8HF_FTYPE_V2DI_V8HF_UQI
:
10877 case V8HF_FTYPE_V2DF_V8HF_UQI
:
10878 case V4SF_FTYPE_V4DI_V4SF_UQI
:
10879 case V4SF_FTYPE_V2DI_V4SF_UQI
:
10880 case V4DF_FTYPE_V4DI_V4DF_UQI
:
10881 case V4DF_FTYPE_V8HF_V4DF_UQI
:
10882 case V2DF_FTYPE_V8HF_V2DF_UQI
:
10883 case V2DF_FTYPE_V2DI_V2DF_UQI
:
10884 case V16QI_FTYPE_V8HI_V16QI_UQI
:
10885 case V16QI_FTYPE_V16HI_V16QI_UHI
:
10886 case V16QI_FTYPE_V4SI_V16QI_UQI
:
10887 case V16QI_FTYPE_V8SI_V16QI_UQI
:
10888 case V8HI_FTYPE_V8HF_V8HI_UQI
:
10889 case V8HI_FTYPE_V4SI_V8HI_UQI
:
10890 case V8HI_FTYPE_V8SI_V8HI_UQI
:
10891 case V16QI_FTYPE_V2DI_V16QI_UQI
:
10892 case V16QI_FTYPE_V4DI_V16QI_UQI
:
10893 case V8HI_FTYPE_V2DI_V8HI_UQI
:
10894 case V8HI_FTYPE_V4DI_V8HI_UQI
:
10895 case V4SI_FTYPE_V2DI_V4SI_UQI
:
10896 case V4SI_FTYPE_V4DI_V4SI_UQI
:
10897 case V32QI_FTYPE_V32HI_V32QI_USI
:
10898 case UHI_FTYPE_V16QI_V16QI_UHI
:
10899 case USI_FTYPE_V32QI_V32QI_USI
:
10900 case UDI_FTYPE_V64QI_V64QI_UDI
:
10901 case UQI_FTYPE_V8HI_V8HI_UQI
:
10902 case UHI_FTYPE_V16HI_V16HI_UHI
:
10903 case USI_FTYPE_V32HI_V32HI_USI
:
10904 case UQI_FTYPE_V4SI_V4SI_UQI
:
10905 case UQI_FTYPE_V8SI_V8SI_UQI
:
10906 case UQI_FTYPE_V2DI_V2DI_UQI
:
10907 case UQI_FTYPE_V4DI_V4DI_UQI
:
10908 case V4SF_FTYPE_V2DF_V4SF_UQI
:
10909 case V4SF_FTYPE_V4DF_V4SF_UQI
:
10910 case V16SI_FTYPE_V16SI_V16SI_UHI
:
10911 case V16SI_FTYPE_V4SI_V16SI_UHI
:
10912 case V2DI_FTYPE_V4SI_V2DI_UQI
:
10913 case V2DI_FTYPE_V8HI_V2DI_UQI
:
10914 case V2DI_FTYPE_V16QI_V2DI_UQI
:
10915 case V4DI_FTYPE_V4DI_V4DI_UQI
:
10916 case V4DI_FTYPE_V4SI_V4DI_UQI
:
10917 case V4DI_FTYPE_V8HI_V4DI_UQI
:
10918 case V4DI_FTYPE_V16QI_V4DI_UQI
:
10919 case V4DI_FTYPE_V4DF_V4DI_UQI
:
10920 case V2DI_FTYPE_V2DF_V2DI_UQI
:
10921 case V4SI_FTYPE_V4DF_V4SI_UQI
:
10922 case V4SI_FTYPE_V2DF_V4SI_UQI
:
10923 case V4SI_FTYPE_V8HI_V4SI_UQI
:
10924 case V4SI_FTYPE_V16QI_V4SI_UQI
:
10925 case V4DI_FTYPE_V4DI_V4DI_V4DI
:
10926 case V8DF_FTYPE_V2DF_V8DF_UQI
:
10927 case V8DF_FTYPE_V4DF_V8DF_UQI
:
10928 case V8DF_FTYPE_V8DF_V8DF_UQI
:
10929 case V8SF_FTYPE_V8SF_V8SF_UQI
:
10930 case V8SF_FTYPE_V8SI_V8SF_UQI
:
10931 case V4DF_FTYPE_V4DF_V4DF_UQI
:
10932 case V4SF_FTYPE_V4SF_V4SF_UQI
:
10933 case V2DF_FTYPE_V2DF_V2DF_UQI
:
10934 case V2DF_FTYPE_V4SF_V2DF_UQI
:
10935 case V2DF_FTYPE_V4SI_V2DF_UQI
:
10936 case V4SF_FTYPE_V4SI_V4SF_UQI
:
10937 case V4DF_FTYPE_V4SF_V4DF_UQI
:
10938 case V4DF_FTYPE_V4SI_V4DF_UQI
:
10939 case V8SI_FTYPE_V8SI_V8SI_UQI
:
10940 case V8SI_FTYPE_V8HI_V8SI_UQI
:
10941 case V8SI_FTYPE_V16QI_V8SI_UQI
:
10942 case V8DF_FTYPE_V8SI_V8DF_UQI
:
10943 case V8DI_FTYPE_DI_V8DI_UQI
:
10944 case V16SF_FTYPE_V8SF_V16SF_UHI
:
10945 case V16SI_FTYPE_V8SI_V16SI_UHI
:
10946 case V16HF_FTYPE_V16HI_V16HF_UHI
:
10947 case V16HF_FTYPE_V16HF_V16HF_V16HF
:
10948 case V16HI_FTYPE_V16HF_V16HI_UHI
:
10949 case V16HI_FTYPE_V16HI_V16HI_UHI
:
10950 case V8HI_FTYPE_V16QI_V8HI_UQI
:
10951 case V16HI_FTYPE_V16QI_V16HI_UHI
:
10952 case V32HI_FTYPE_V32HI_V32HI_USI
:
10953 case V32HI_FTYPE_V32QI_V32HI_USI
:
10954 case V8DI_FTYPE_V16QI_V8DI_UQI
:
10955 case V8DI_FTYPE_V2DI_V8DI_UQI
:
10956 case V8DI_FTYPE_V4DI_V8DI_UQI
:
10957 case V8DI_FTYPE_V8DI_V8DI_UQI
:
10958 case V8DI_FTYPE_V8HI_V8DI_UQI
:
10959 case V8DI_FTYPE_V8SI_V8DI_UQI
:
10960 case V8HI_FTYPE_V8DI_V8HI_UQI
:
10961 case V8SI_FTYPE_V8DI_V8SI_UQI
:
10962 case V4SI_FTYPE_V4SI_V4SI_V4SI
:
10963 case V16SI_FTYPE_V16SI_V16SI_V16SI
:
10964 case V8DI_FTYPE_V8DI_V8DI_V8DI
:
10965 case V32HI_FTYPE_V32HI_V32HI_V32HI
:
10966 case V2DI_FTYPE_V2DI_V2DI_V2DI
:
10967 case V16HI_FTYPE_V16HI_V16HI_V16HI
:
10968 case V8SI_FTYPE_V8SI_V8SI_V8SI
:
10969 case V8HI_FTYPE_V8HI_V8HI_V8HI
:
10970 case V32BF_FTYPE_V16SF_V16SF_USI
:
10971 case V16BF_FTYPE_V8SF_V8SF_UHI
:
10972 case V8BF_FTYPE_V4SF_V4SF_UQI
:
10973 case V16BF_FTYPE_V16SF_V16BF_UHI
:
10974 case V8BF_FTYPE_V8SF_V8BF_UQI
:
10975 case V8BF_FTYPE_V4SF_V8BF_UQI
:
10976 case V16SF_FTYPE_V16SF_V32BF_V32BF
:
10977 case V8SF_FTYPE_V8SF_V16BF_V16BF
:
10978 case V4SF_FTYPE_V4SF_V8BF_V8BF
:
10981 case V32QI_FTYPE_V32QI_V32QI_INT
:
10982 case V16HI_FTYPE_V16HI_V16HI_INT
:
10983 case V16QI_FTYPE_V16QI_V16QI_INT
:
10984 case V4DI_FTYPE_V4DI_V4DI_INT
:
10985 case V8HI_FTYPE_V8HI_V8HI_INT
:
10986 case V8SI_FTYPE_V8SI_V8SI_INT
:
10987 case V8SI_FTYPE_V8SI_V4SI_INT
:
10988 case V8SF_FTYPE_V8SF_V8SF_INT
:
10989 case V8SF_FTYPE_V8SF_V4SF_INT
:
10990 case V4SI_FTYPE_V4SI_V4SI_INT
:
10991 case V4DF_FTYPE_V4DF_V4DF_INT
:
10992 case V16SF_FTYPE_V16SF_V16SF_INT
:
10993 case V16SF_FTYPE_V16SF_V4SF_INT
:
10994 case V16SI_FTYPE_V16SI_V4SI_INT
:
10995 case V4DF_FTYPE_V4DF_V2DF_INT
:
10996 case V4SF_FTYPE_V4SF_V4SF_INT
:
10997 case V2DI_FTYPE_V2DI_V2DI_INT
:
10998 case V4DI_FTYPE_V4DI_V2DI_INT
:
10999 case V2DF_FTYPE_V2DF_V2DF_INT
:
11000 case UQI_FTYPE_V8DI_V8UDI_INT
:
11001 case UQI_FTYPE_V8DF_V8DF_INT
:
11002 case UQI_FTYPE_V2DF_V2DF_INT
:
11003 case UQI_FTYPE_V4SF_V4SF_INT
:
11004 case UHI_FTYPE_V16SI_V16SI_INT
:
11005 case UHI_FTYPE_V16SF_V16SF_INT
:
11006 case V64QI_FTYPE_V64QI_V64QI_INT
:
11007 case V32HI_FTYPE_V32HI_V32HI_INT
:
11008 case V16SI_FTYPE_V16SI_V16SI_INT
:
11009 case V8DI_FTYPE_V8DI_V8DI_INT
:
11011 nargs_constant
= 1;
11013 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT
:
11016 nargs_constant
= 1;
11018 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT
:
11021 nargs_constant
= 1;
11023 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT
:
11026 nargs_constant
= 1;
11028 case V2DI_FTYPE_V2DI_UINT_UINT
:
11030 nargs_constant
= 2;
11032 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT
:
11035 nargs_constant
= 1;
11037 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT
:
11041 nargs_constant
= 1;
11043 case QI_FTYPE_V8DF_INT_UQI
:
11044 case QI_FTYPE_V4DF_INT_UQI
:
11045 case QI_FTYPE_V2DF_INT_UQI
:
11046 case HI_FTYPE_V16SF_INT_UHI
:
11047 case QI_FTYPE_V8SF_INT_UQI
:
11048 case QI_FTYPE_V4SF_INT_UQI
:
11049 case QI_FTYPE_V8HF_INT_UQI
:
11050 case HI_FTYPE_V16HF_INT_UHI
:
11051 case SI_FTYPE_V32HF_INT_USI
:
11052 case V4SI_FTYPE_V4SI_V4SI_UHI
:
11053 case V8SI_FTYPE_V8SI_V8SI_UHI
:
11056 nargs_constant
= 1;
11058 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT
:
11062 nargs_constant
= 1;
11064 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT
:
11068 nargs_constant
= 1;
11070 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI
:
11071 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI
:
11072 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI
:
11073 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI
:
11074 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI
:
11075 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI
:
11076 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI
:
11077 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI
:
11078 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI
:
11079 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI
:
11080 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI
:
11081 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI
:
11082 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI
:
11083 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI
:
11084 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI
:
11085 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI
:
11086 case V32HF_FTYPE_V32HF_V32HF_V32HF_USI
:
11087 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI
:
11088 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI
:
11089 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI
:
11090 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI
:
11091 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI
:
11092 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI
:
11093 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI
:
11094 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI
:
11095 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI
:
11096 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI
:
11097 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI
:
11098 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI
:
11099 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI
:
11100 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI
:
11101 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI
:
11102 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI
:
11103 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI
:
11104 case V16HF_FTYPE_V16HF_V16HF_V16HF_UQI
:
11105 case V16HF_FTYPE_V16HF_V16HF_V16HF_UHI
:
11106 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI
:
11107 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI
:
11108 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI
:
11109 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI
:
11110 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI
:
11111 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI
:
11112 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI
:
11113 case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI
:
11114 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI
:
11115 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI
:
11116 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI
:
11117 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI
:
11118 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI
:
11119 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI
:
11120 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI
:
11121 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI
:
11122 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI
:
11123 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI
:
11124 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI
:
11125 case V32BF_FTYPE_V16SF_V16SF_V32BF_USI
:
11126 case V16BF_FTYPE_V8SF_V8SF_V16BF_UHI
:
11127 case V8BF_FTYPE_V4SF_V4SF_V8BF_UQI
:
11130 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT
:
11131 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT
:
11132 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT
:
11133 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT
:
11134 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT
:
11136 nargs_constant
= 1;
11138 case UQI_FTYPE_V4DI_V4DI_INT_UQI
:
11139 case UQI_FTYPE_V8SI_V8SI_INT_UQI
:
11140 case QI_FTYPE_V4DF_V4DF_INT_UQI
:
11141 case QI_FTYPE_V8SF_V8SF_INT_UQI
:
11142 case UHI_FTYPE_V16HF_V16HF_INT_UHI
:
11143 case UQI_FTYPE_V2DI_V2DI_INT_UQI
:
11144 case UQI_FTYPE_V4SI_V4SI_INT_UQI
:
11145 case UQI_FTYPE_V2DF_V2DF_INT_UQI
:
11146 case UQI_FTYPE_V4SF_V4SF_INT_UQI
:
11147 case UQI_FTYPE_V8HF_V8HF_INT_UQI
:
11148 case UDI_FTYPE_V64QI_V64QI_INT_UDI
:
11149 case USI_FTYPE_V32QI_V32QI_INT_USI
:
11150 case UHI_FTYPE_V16QI_V16QI_INT_UHI
:
11151 case USI_FTYPE_V32HI_V32HI_INT_USI
:
11152 case USI_FTYPE_V32HF_V32HF_INT_USI
:
11153 case UHI_FTYPE_V16HI_V16HI_INT_UHI
:
11154 case UQI_FTYPE_V8HI_V8HI_INT_UQI
:
11157 nargs_constant
= 1;
11159 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT
:
11161 nargs_constant
= 2;
11163 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED
:
11164 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG
:
11165 case V16SF_FTYPE_V16SF_V32BF_V32BF_UHI
:
11166 case V8SF_FTYPE_V8SF_V16BF_V16BF_UQI
:
11167 case V4SF_FTYPE_V4SF_V8BF_V8BF_UQI
:
11170 case UQI_FTYPE_V8DI_V8DI_INT_UQI
:
11171 case UHI_FTYPE_V16SI_V16SI_INT_UHI
:
11174 nargs_constant
= 1;
11176 case V8SF_FTYPE_V8SF_INT_V8SF_UQI
:
11177 case V4SF_FTYPE_V4SF_INT_V4SF_UQI
:
11178 case V2DF_FTYPE_V4DF_INT_V2DF_UQI
:
11179 case V2DI_FTYPE_V4DI_INT_V2DI_UQI
:
11180 case V8SF_FTYPE_V16SF_INT_V8SF_UQI
:
11181 case V8SI_FTYPE_V16SI_INT_V8SI_UQI
:
11182 case V2DF_FTYPE_V8DF_INT_V2DF_UQI
:
11183 case V2DI_FTYPE_V8DI_INT_V2DI_UQI
:
11184 case V4SF_FTYPE_V8SF_INT_V4SF_UQI
:
11185 case V4SI_FTYPE_V8SI_INT_V4SI_UQI
:
11186 case V8HI_FTYPE_V8SF_INT_V8HI_UQI
:
11187 case V8HI_FTYPE_V4SF_INT_V8HI_UQI
:
11188 case V32HI_FTYPE_V32HI_INT_V32HI_USI
:
11189 case V16HI_FTYPE_V16HI_INT_V16HI_UHI
:
11190 case V8HI_FTYPE_V8HI_INT_V8HI_UQI
:
11191 case V4DI_FTYPE_V4DI_INT_V4DI_UQI
:
11192 case V2DI_FTYPE_V2DI_INT_V2DI_UQI
:
11193 case V8SI_FTYPE_V8SI_INT_V8SI_UQI
:
11194 case V4SI_FTYPE_V4SI_INT_V4SI_UQI
:
11195 case V4DF_FTYPE_V4DF_INT_V4DF_UQI
:
11196 case V2DF_FTYPE_V2DF_INT_V2DF_UQI
:
11197 case V8DF_FTYPE_V8DF_INT_V8DF_UQI
:
11198 case V16SF_FTYPE_V16SF_INT_V16SF_UHI
:
11199 case V16HI_FTYPE_V16SF_INT_V16HI_UHI
:
11200 case V16SI_FTYPE_V16SI_INT_V16SI_UHI
:
11201 case V16HF_FTYPE_V16HF_INT_V16HF_UHI
:
11202 case V8HF_FTYPE_V8HF_INT_V8HF_UQI
:
11203 case V4SI_FTYPE_V16SI_INT_V4SI_UQI
:
11204 case V4DI_FTYPE_V8DI_INT_V4DI_UQI
:
11205 case V4DF_FTYPE_V8DF_INT_V4DF_UQI
:
11206 case V4SF_FTYPE_V16SF_INT_V4SF_UQI
:
11207 case V8DI_FTYPE_V8DI_INT_V8DI_UQI
:
11210 nargs_constant
= 1;
11212 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI
:
11213 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI
:
11214 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI
:
11215 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI
:
11216 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI
:
11217 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI
:
11218 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI
:
11219 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI
:
11220 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI
:
11221 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI
:
11222 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI
:
11223 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI
:
11224 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI
:
11225 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI
:
11226 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI
:
11227 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI
:
11228 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI
:
11229 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI
:
11230 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI
:
11231 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI
:
11232 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI
:
11233 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI
:
11234 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI
:
11235 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI
:
11236 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI
:
11237 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI
:
11238 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI
:
11241 nargs_constant
= 1;
11243 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI
:
11244 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI
:
11245 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI
:
11246 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI
:
11247 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI
:
11248 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI
:
11249 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI
:
11250 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI
:
11251 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI
:
11252 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI
:
11255 nargs_constant
= 1;
11257 case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI
:
11258 case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI
:
11259 case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI
:
11260 case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT
:
11261 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT
:
11262 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT
:
11263 case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT
:
11264 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT
:
11265 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT
:
11266 case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT
:
11267 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT
:
11268 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT
:
11271 nargs_constant
= 2;
11275 gcc_unreachable ();
11278 gcc_assert (nargs
<= ARRAY_SIZE (xops
));
11280 if (comparison
!= UNKNOWN
)
11282 gcc_assert (nargs
== 2);
11283 return ix86_expand_sse_compare (d
, exp
, target
, swap
);
11286 if (rmode
== VOIDmode
|| rmode
== tmode
)
11290 || GET_MODE (target
) != tmode
11291 || !insn_p
->operand
[0].predicate (target
, tmode
))
11292 target
= gen_reg_rtx (tmode
);
11293 else if (memory_operand (target
, tmode
))
11295 real_target
= target
;
11299 real_target
= gen_reg_rtx (tmode
);
11300 target
= lowpart_subreg (rmode
, real_target
, tmode
);
11303 for (i
= 0; i
< nargs
; i
++)
11305 tree arg
= CALL_EXPR_ARG (exp
, i
);
11306 rtx op
= expand_normal (arg
);
11307 machine_mode mode
= insn_p
->operand
[i
+ 1].mode
;
11308 bool match
= insn_p
->operand
[i
+ 1].predicate (op
, mode
);
11310 if (second_arg_count
&& i
== 1)
	  /* SIMD shift insns take either an 8-bit immediate or a
	     register as the count.  But the builtin functions take an
	     int as the count.  If the count doesn't match, we put it in
	     a register.  The instructions use a 64-bit count; if op is
	     only 32-bit, zero-extend it, as negative shift counts are
	     undefined behavior and zero-extension is more efficient.  */
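	  /* Illustrative sketch, not taken from the original sources: a call
	     such as

	       __m128i y = _mm_slli_epi32 (x, n);   (n is a plain int)

	     reaches this point with OP holding the 32-bit count N, which is
	     zero-extended to the count mode the insn pattern expects and,
	     if the predicate still rejects it, copied into a register.  */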
11321 if (SCALAR_INT_MODE_P (GET_MODE (op
)))
11322 op
= convert_modes (mode
, GET_MODE (op
), op
, 1);
11324 op
= lowpart_subreg (mode
, op
, GET_MODE (op
));
11325 if (!insn_p
->operand
[i
+ 1].predicate (op
, mode
))
11326 op
= copy_to_reg (op
);
11329 else if ((mask_pos
&& (nargs
- i
- mask_pos
) == nargs_constant
) ||
11330 (!mask_pos
&& (nargs
- i
) <= nargs_constant
))
11335 case CODE_FOR_avx_vinsertf128v4di
:
11336 case CODE_FOR_avx_vextractf128v4di
:
	      error ("the last argument must be a 1-bit immediate");
11340 case CODE_FOR_avx512f_cmpv8di3_mask
:
11341 case CODE_FOR_avx512f_cmpv16si3_mask
:
11342 case CODE_FOR_avx512f_ucmpv8di3_mask
:
11343 case CODE_FOR_avx512f_ucmpv16si3_mask
:
11344 case CODE_FOR_avx512vl_cmpv4di3_mask
:
11345 case CODE_FOR_avx512vl_cmpv8si3_mask
:
11346 case CODE_FOR_avx512vl_ucmpv4di3_mask
:
11347 case CODE_FOR_avx512vl_ucmpv8si3_mask
:
11348 case CODE_FOR_avx512vl_cmpv2di3_mask
:
11349 case CODE_FOR_avx512vl_cmpv4si3_mask
:
11350 case CODE_FOR_avx512vl_ucmpv2di3_mask
:
11351 case CODE_FOR_avx512vl_ucmpv4si3_mask
:
11352 error ("the last argument must be a 3-bit immediate");
11355 case CODE_FOR_sse4_1_roundsd
:
11356 case CODE_FOR_sse4_1_roundss
:
11358 case CODE_FOR_sse4_1_roundpd
:
11359 case CODE_FOR_sse4_1_roundps
:
11360 case CODE_FOR_avx_roundpd256
:
11361 case CODE_FOR_avx_roundps256
:
11363 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix
:
11364 case CODE_FOR_sse4_1_roundps_sfix
:
11365 case CODE_FOR_avx_roundpd_vec_pack_sfix256
:
11366 case CODE_FOR_avx_roundps_sfix256
:
11368 case CODE_FOR_sse4_1_blendps
:
11369 case CODE_FOR_avx_blendpd256
:
11370 case CODE_FOR_avx_vpermilv4df
:
11371 case CODE_FOR_avx_vpermilv4df_mask
:
11372 case CODE_FOR_avx512f_getmantv8df_mask
:
11373 case CODE_FOR_avx512f_getmantv16sf_mask
:
11374 case CODE_FOR_avx512vl_getmantv16hf_mask
:
11375 case CODE_FOR_avx512vl_getmantv8sf_mask
:
11376 case CODE_FOR_avx512vl_getmantv4df_mask
:
11377 case CODE_FOR_avx512fp16_getmantv8hf_mask
:
11378 case CODE_FOR_avx512vl_getmantv4sf_mask
:
11379 case CODE_FOR_avx512vl_getmantv2df_mask
:
11380 case CODE_FOR_avx512dq_rangepv8df_mask_round
:
11381 case CODE_FOR_avx512dq_rangepv16sf_mask_round
:
11382 case CODE_FOR_avx512dq_rangepv4df_mask
:
11383 case CODE_FOR_avx512dq_rangepv8sf_mask
:
11384 case CODE_FOR_avx512dq_rangepv2df_mask
:
11385 case CODE_FOR_avx512dq_rangepv4sf_mask
:
11386 case CODE_FOR_avx_shufpd256_mask
:
11387 error ("the last argument must be a 4-bit immediate");
11390 case CODE_FOR_sha1rnds4
:
11391 case CODE_FOR_sse4_1_blendpd
:
11392 case CODE_FOR_avx_vpermilv2df
:
11393 case CODE_FOR_avx_vpermilv2df_mask
:
11394 case CODE_FOR_xop_vpermil2v2df3
:
11395 case CODE_FOR_xop_vpermil2v4sf3
:
11396 case CODE_FOR_xop_vpermil2v4df3
:
11397 case CODE_FOR_xop_vpermil2v8sf3
:
11398 case CODE_FOR_avx512f_vinsertf32x4_mask
:
11399 case CODE_FOR_avx512f_vinserti32x4_mask
:
11400 case CODE_FOR_avx512f_vextractf32x4_mask
:
11401 case CODE_FOR_avx512f_vextracti32x4_mask
:
11402 case CODE_FOR_sse2_shufpd
:
11403 case CODE_FOR_sse2_shufpd_mask
:
11404 case CODE_FOR_avx512dq_shuf_f64x2_mask
:
11405 case CODE_FOR_avx512dq_shuf_i64x2_mask
:
11406 case CODE_FOR_avx512vl_shuf_i32x4_mask
:
11407 case CODE_FOR_avx512vl_shuf_f32x4_mask
:
11408 error ("the last argument must be a 2-bit immediate");
11411 case CODE_FOR_avx_vextractf128v4df
:
11412 case CODE_FOR_avx_vextractf128v8sf
:
11413 case CODE_FOR_avx_vextractf128v8si
:
11414 case CODE_FOR_avx_vinsertf128v4df
:
11415 case CODE_FOR_avx_vinsertf128v8sf
:
11416 case CODE_FOR_avx_vinsertf128v8si
:
11417 case CODE_FOR_avx512f_vinsertf64x4_mask
:
11418 case CODE_FOR_avx512f_vinserti64x4_mask
:
11419 case CODE_FOR_avx512f_vextractf64x4_mask
:
11420 case CODE_FOR_avx512f_vextracti64x4_mask
:
11421 case CODE_FOR_avx512dq_vinsertf32x8_mask
:
11422 case CODE_FOR_avx512dq_vinserti32x8_mask
:
11423 case CODE_FOR_avx512vl_vinsertv4df
:
11424 case CODE_FOR_avx512vl_vinsertv4di
:
11425 case CODE_FOR_avx512vl_vinsertv8sf
:
11426 case CODE_FOR_avx512vl_vinsertv8si
:
11427 error ("the last argument must be a 1-bit immediate");
11430 case CODE_FOR_avx_vmcmpv2df3
:
11431 case CODE_FOR_avx_vmcmpv4sf3
:
11432 case CODE_FOR_avx_cmpv2df3
:
11433 case CODE_FOR_avx_cmpv4sf3
:
11434 case CODE_FOR_avx_cmpv4df3
:
11435 case CODE_FOR_avx_cmpv8sf3
:
11436 case CODE_FOR_avx512f_cmpv8df3_mask
:
11437 case CODE_FOR_avx512f_cmpv16sf3_mask
:
11438 case CODE_FOR_avx512f_vmcmpv2df3_mask
:
11439 case CODE_FOR_avx512f_vmcmpv4sf3_mask
:
11440 case CODE_FOR_avx512bw_cmpv32hf3_mask
:
11441 case CODE_FOR_avx512vl_cmpv16hf3_mask
:
11442 case CODE_FOR_avx512fp16_cmpv8hf3_mask
:
11443 error ("the last argument must be a 5-bit immediate");
11447 switch (nargs_constant
)
11450 if ((mask_pos
&& (nargs
- i
- mask_pos
) == nargs_constant
) ||
11451 (!mask_pos
&& (nargs
- i
) == nargs_constant
))
11453 error ("the next to last argument must be an 8-bit immediate");
11458 error ("the last argument must be an 8-bit immediate");
11461 gcc_unreachable ();
11468 if (VECTOR_MODE_P (mode
))
11469 op
= safe_vector_operand (op
, mode
);
	  /* If we aren't optimizing, only allow one memory operand to be
	     generated.  */
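	  /* Hedged example of the constraint being enforced: most SSE
	     arithmetic patterns accept at most one memory operand, e.g.
	       (plus:V4SF (reg:V4SF x) (mem:V4SF a))
	     is encodable, while
	       (plus:V4SF (mem:V4SF a) (mem:V4SF b))
	     is not, so additional memory operands are forced into
	     registers.  */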
11473 if (memory_operand (op
, mode
))
11476 op
= fixup_modeless_constant (op
, mode
);
11478 if (GET_MODE (op
) == mode
|| GET_MODE (op
) == VOIDmode
)
11480 if (optimize
|| !match
|| num_memory
> 1)
11481 op
= copy_to_mode_reg (mode
, op
);
11485 op
= copy_to_reg (op
);
11486 op
= lowpart_subreg (mode
, op
, GET_MODE (op
));
11496 pat
= GEN_FCN (icode
) (real_target
, xops
[0]);
11499 pat
= GEN_FCN (icode
) (real_target
, xops
[0], xops
[1]);
11502 pat
= GEN_FCN (icode
) (real_target
, xops
[0], xops
[1], xops
[2]);
11505 pat
= GEN_FCN (icode
) (real_target
, xops
[0], xops
[1],
11509 pat
= GEN_FCN (icode
) (real_target
, xops
[0], xops
[1],
11510 xops
[2], xops
[3], xops
[4]);
11513 pat
= GEN_FCN (icode
) (real_target
, xops
[0], xops
[1],
11514 xops
[2], xops
[3], xops
[4], xops
[5]);
11517 gcc_unreachable ();
/* Transform a pattern of the following layout:
     (set A
       (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
   into:
     (set A B).  */
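/* Illustrative example with assumed operand values (not taken from the
   sources): for a rounding operand of _MM_FROUND_CUR_DIRECTION, a pattern
   like

     (set (reg:V2DF 100)
	  (unspec:V2DF [(plus:V2DF (reg:V2DF 101) (reg:V2DF 102))
			(const_int 4)] UNSPEC_EMBEDDED_ROUNDING))

   is rewritten into the plain

     (set (reg:V2DF 100) (plus:V2DF (reg:V2DF 101) (reg:V2DF 102)))

   so that the ordinary, non-rounding insn pattern can match.  */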
ix86_erase_embedded_rounding (rtx pat)
{
  if (GET_CODE (pat) == INSN)
    pat = PATTERN (pat);

  gcc_assert (GET_CODE (pat) == SET);
  rtx src = SET_SRC (pat);
  gcc_assert (XVECLEN (src, 0) == 2);
  rtx p0 = XVECEXP (src, 0, 0);
  gcc_assert (GET_CODE (src) == UNSPEC
	      && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
  rtx res = gen_rtx_SET (SET_DEST (pat), p0);
  return res;
}
/* Subroutine of ix86_expand_round_builtin to take care of comi insns
   with embedded rounding.  */
ix86_expand_sse_comi_round (const struct builtin_description *d,
			    tree exp, rtx target)
{
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  tree arg2 = CALL_EXPR_ARG (exp, 2);
  tree arg3 = CALL_EXPR_ARG (exp, 3);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  rtx op2 = expand_normal (arg2);
  rtx op3 = expand_normal (arg3);
  enum insn_code icode = d->icode;
  const struct insn_data_d *insn_p = &insn_data[icode];
  machine_mode mode0 = insn_p->operand[0].mode;
  machine_mode mode1 = insn_p->operand[1].mode;
11570 /* See avxintrin.h for values. */
  static const enum rtx_code comparisons[32] =
    {
      EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
      UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED,
      EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
      UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED
    };
  static const bool ordereds[32] =
    {
      true, true, true, false, false, false, false, true,
      false, false, false, true, true, true, true, false,
      true, true, true, false, false, false, false, true,
      false, false, false, true, true, true, true, false
    };
  static const bool non_signalings[32] =
    {
      true, false, false, true, true, false, false, true,
      true, false, false, true, true, false, false, true,
      false, true, true, false, false, true, true, false,
      false, true, true, false, false, true, true, false
    };
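  /* Worked example, derived from the avxintrin.h predicate values and given
     here only as an illustration: predicate 0 (_CMP_EQ_OQ) selects
     comparisons[0] = EQ with ordereds[0] = true and non_signalings[0] = true,
     while predicate 1 (_CMP_LT_OS) selects LT, ordered and signaling.  */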
11593 if (!CONST_INT_P (op2
))
      error ("the third argument must be a comparison constant");
11598 if (INTVAL (op2
) < 0 || INTVAL (op2
) >= 32)
11600 error ("incorrect comparison mode");
11604 if (!insn_p
->operand
[2].predicate (op3
, SImode
))
11606 error ("incorrect rounding operand");
11610 if (VECTOR_MODE_P (mode0
))
11611 op0
= safe_vector_operand (op0
, mode0
);
11612 if (VECTOR_MODE_P (mode1
))
11613 op1
= safe_vector_operand (op1
, mode1
);
11615 enum rtx_code comparison
= comparisons
[INTVAL (op2
)];
11616 bool ordered
= ordereds
[INTVAL (op2
)];
11617 bool non_signaling
= non_signalings
[INTVAL (op2
)];
11618 rtx const_val
= const0_rtx
;
11620 bool check_unordered
= false;
11621 machine_mode mode
= CCFPmode
;
11622 switch (comparison
)
11627 /* NB: Use CCSmode/NE for _CMP_TRUE_UQ/_CMP_TRUE_US. */
11628 if (!non_signaling
)
11634 /* NB: Use CCPmode/NE for _CMP_ORD_Q/_CMP_ORD_S. */
11644 /* NB: Use CCSmode/EQ for _CMP_FALSE_OQ/_CMP_FALSE_OS. */
11651 /* NB: Use CCPmode/NE for _CMP_UNORD_Q/_CMP_UNORD_S. */
11652 if (!non_signaling
)
11659 case LE
: /* -> GE */
11660 case LT
: /* -> GT */
11661 case UNGE
: /* -> UNLE */
11662 case UNGT
: /* -> UNLT */
11663 std::swap (op0
, op1
);
11664 comparison
= swap_condition (comparison
);
11672 /* These are supported by CCFPmode. NB: Use ordered/signaling
11673 COMI or unordered/non-signaling UCOMI. Both set ZF, PF, CF
11674 with NAN operands. */
11675 if (ordered
== non_signaling
)
11676 ordered
= !ordered
;
11679 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
11680 _CMP_EQ_OQ/_CMP_EQ_OS. */
11681 check_unordered
= true;
11685 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
11686 _CMP_NEQ_UQ/_CMP_NEQ_US. */
11687 gcc_assert (!ordered
);
11688 check_unordered
= true;
11690 const_val
= const1_rtx
;
11693 gcc_unreachable ();
11696 target
= gen_reg_rtx (SImode
);
11697 emit_move_insn (target
, const_val
);
11698 target
= gen_rtx_SUBREG (QImode
, target
, 0);
11700 if ((optimize
&& !register_operand (op0
, mode0
))
11701 || !insn_p
->operand
[0].predicate (op0
, mode0
))
11702 op0
= copy_to_mode_reg (mode0
, op0
);
11703 if ((optimize
&& !register_operand (op1
, mode1
))
11704 || !insn_p
->operand
[1].predicate (op1
, mode1
))
11705 op1
= copy_to_mode_reg (mode1
, op1
);
  /* 1. COMI: ordered and signaling.
     2. UCOMI: unordered and non-signaling.  */
11712 icode
= (icode
== CODE_FOR_sse_comi_round
11713 ? CODE_FOR_sse_ucomi_round
11714 : CODE_FOR_sse2_ucomi_round
);
11716 pat
= GEN_FCN (icode
) (op0
, op1
, op3
);
11720 /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
11721 if (INTVAL (op3
) == NO_ROUND
)
11723 pat
= ix86_erase_embedded_rounding (pat
);
11727 set_dst
= SET_DEST (pat
);
11731 gcc_assert (GET_CODE (pat
) == SET
);
11732 set_dst
= SET_DEST (pat
);
11737 return ix86_ssecom_setcc (comparison
, check_unordered
, mode
,
11742 ix86_expand_round_builtin (const struct builtin_description
*d
,
11743 tree exp
, rtx target
)
11746 unsigned int i
, nargs
;
11748 enum insn_code icode
= d
->icode
;
11749 const struct insn_data_d
*insn_p
= &insn_data
[icode
];
11750 machine_mode tmode
= insn_p
->operand
[0].mode
;
11751 unsigned int nargs_constant
= 0;
11752 unsigned int redundant_embed_rnd
= 0;
11754 switch ((enum ix86_builtin_func_type
) d
->flag
)
11756 case UINT64_FTYPE_V2DF_INT
:
11757 case UINT64_FTYPE_V4SF_INT
:
11758 case UINT64_FTYPE_V8HF_INT
:
11759 case UINT_FTYPE_V2DF_INT
:
11760 case UINT_FTYPE_V4SF_INT
:
11761 case UINT_FTYPE_V8HF_INT
:
11762 case INT64_FTYPE_V2DF_INT
:
11763 case INT64_FTYPE_V4SF_INT
:
11764 case INT64_FTYPE_V8HF_INT
:
11765 case INT_FTYPE_V2DF_INT
:
11766 case INT_FTYPE_V4SF_INT
:
11767 case INT_FTYPE_V8HF_INT
:
11770 case V32HF_FTYPE_V32HF_V32HF_INT
:
11771 case V8HF_FTYPE_V8HF_V8HF_INT
:
11772 case V8HF_FTYPE_V8HF_INT_INT
:
11773 case V8HF_FTYPE_V8HF_UINT_INT
:
11774 case V8HF_FTYPE_V8HF_INT64_INT
:
11775 case V8HF_FTYPE_V8HF_UINT64_INT
:
11776 case V4SF_FTYPE_V4SF_UINT_INT
:
11777 case V4SF_FTYPE_V4SF_UINT64_INT
:
11778 case V2DF_FTYPE_V2DF_UINT64_INT
:
11779 case V4SF_FTYPE_V4SF_INT_INT
:
11780 case V4SF_FTYPE_V4SF_INT64_INT
:
11781 case V2DF_FTYPE_V2DF_INT64_INT
:
11782 case V4SF_FTYPE_V4SF_V4SF_INT
:
11783 case V2DF_FTYPE_V2DF_V2DF_INT
:
11784 case V4SF_FTYPE_V4SF_V2DF_INT
:
11785 case V2DF_FTYPE_V2DF_V4SF_INT
:
11788 case V8SF_FTYPE_V8DF_V8SF_QI_INT
:
11789 case V8DF_FTYPE_V8DF_V8DF_QI_INT
:
11790 case V32HI_FTYPE_V32HF_V32HI_USI_INT
:
11791 case V8SI_FTYPE_V8DF_V8SI_QI_INT
:
11792 case V8DI_FTYPE_V8HF_V8DI_UQI_INT
:
11793 case V8DI_FTYPE_V8DF_V8DI_QI_INT
:
11794 case V8SF_FTYPE_V8DI_V8SF_QI_INT
:
11795 case V8DF_FTYPE_V8DI_V8DF_QI_INT
:
11796 case V8DF_FTYPE_V8HF_V8DF_UQI_INT
:
11797 case V16SF_FTYPE_V16HF_V16SF_UHI_INT
:
11798 case V32HF_FTYPE_V32HI_V32HF_USI_INT
:
11799 case V32HF_FTYPE_V32HF_V32HF_USI_INT
:
11800 case V32HF_FTYPE_V32HF_V32HF_V32HF_INT
:
11801 case V16SF_FTYPE_V16SF_V16SF_HI_INT
:
11802 case V8DI_FTYPE_V8SF_V8DI_QI_INT
:
11803 case V16SF_FTYPE_V16SI_V16SF_HI_INT
:
11804 case V16SI_FTYPE_V16SF_V16SI_HI_INT
:
11805 case V16SI_FTYPE_V16HF_V16SI_UHI_INT
:
11806 case V16HF_FTYPE_V16SI_V16HF_UHI_INT
:
11807 case V8DF_FTYPE_V8SF_V8DF_QI_INT
:
11808 case V16SF_FTYPE_V16HI_V16SF_HI_INT
:
11809 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT
:
11810 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT
:
11811 case V8HF_FTYPE_V8DI_V8HF_UQI_INT
:
11812 case V8HF_FTYPE_V8DF_V8HF_UQI_INT
:
11813 case V16HF_FTYPE_V16SF_V16HF_UHI_INT
:
11814 case V8HF_FTYPE_V8HF_V8HF_V8HF_INT
:
11817 case V4SF_FTYPE_V4SF_V4SF_INT_INT
:
11818 case V2DF_FTYPE_V2DF_V2DF_INT_INT
:
11819 nargs_constant
= 2;
11822 case INT_FTYPE_V4SF_V4SF_INT_INT
:
11823 case INT_FTYPE_V2DF_V2DF_INT_INT
:
11824 return ix86_expand_sse_comi_round (d
, exp
, target
);
11825 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT
:
11826 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT
:
11827 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT
:
11828 case V4SF_FTYPE_V8HF_V4SF_V4SF_UQI_INT
:
11829 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT
:
11830 case V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT
:
11831 case V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT
:
11832 case V2DF_FTYPE_V8HF_V2DF_V2DF_UQI_INT
:
11833 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT
:
11834 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT
:
11835 case V2DF_FTYPE_V2DF_V4SF_V2DF_UQI_INT
:
11836 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT
:
11837 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT
:
11838 case V4SF_FTYPE_V4SF_V2DF_V4SF_UQI_INT
:
11839 case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT
:
11840 case V8HF_FTYPE_V2DF_V8HF_V8HF_UQI_INT
:
11841 case V8HF_FTYPE_V4SF_V8HF_V8HF_UQI_INT
:
11844 case V32HF_FTYPE_V32HF_INT_V32HF_USI_INT
:
11845 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT
:
11846 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT
:
11847 case V8DF_FTYPE_V8DF_INT_V8DF_UQI_INT
:
11848 case V16SF_FTYPE_V16SF_INT_V16SF_UHI_INT
:
11849 nargs_constant
= 4;
11852 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT
:
11853 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT
:
11854 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT
:
11855 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT
:
11856 case USI_FTYPE_V32HF_V32HF_INT_USI_INT
:
11857 case UQI_FTYPE_V8HF_V8HF_INT_UQI_INT
:
11858 nargs_constant
= 3;
11861 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT
:
11862 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT
:
11863 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT
:
11864 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT
:
11865 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT
:
11866 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT
:
11867 case V8HF_FTYPE_V8HF_V8HF_INT_V8HF_UQI_INT
:
11869 nargs_constant
= 4;
11871 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT
:
11872 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT
:
11873 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT
:
11874 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT
:
11876 nargs_constant
= 3;
11879 gcc_unreachable ();
11881 gcc_assert (nargs
<= ARRAY_SIZE (xops
));
11885 || GET_MODE (target
) != tmode
11886 || !insn_p
->operand
[0].predicate (target
, tmode
))
11887 target
= gen_reg_rtx (tmode
);
11889 for (i
= 0; i
< nargs
; i
++)
11891 tree arg
= CALL_EXPR_ARG (exp
, i
);
11892 rtx op
= expand_normal (arg
);
11893 machine_mode mode
= insn_p
->operand
[i
+ 1].mode
;
11894 bool match
= insn_p
->operand
[i
+ 1].predicate (op
, mode
);
11896 if (i
== nargs
- nargs_constant
)
11902 case CODE_FOR_avx512f_getmantv8df_mask_round
:
11903 case CODE_FOR_avx512f_getmantv16sf_mask_round
:
11904 case CODE_FOR_avx512bw_getmantv32hf_mask_round
:
11905 case CODE_FOR_avx512f_vgetmantv2df_round
:
11906 case CODE_FOR_avx512f_vgetmantv2df_mask_round
:
11907 case CODE_FOR_avx512f_vgetmantv4sf_round
:
11908 case CODE_FOR_avx512f_vgetmantv4sf_mask_round
:
11909 case CODE_FOR_avx512f_vgetmantv8hf_mask_round
:
11910 error ("the immediate argument must be a 4-bit immediate");
11912 case CODE_FOR_avx512f_cmpv8df3_mask_round
:
11913 case CODE_FOR_avx512f_cmpv16sf3_mask_round
:
11914 case CODE_FOR_avx512f_vmcmpv2df3_mask_round
:
11915 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round
:
11916 case CODE_FOR_avx512f_vmcmpv8hf3_mask_round
:
11917 case CODE_FOR_avx512bw_cmpv32hf3_mask_round
:
11918 error ("the immediate argument must be a 5-bit immediate");
11921 error ("the immediate argument must be an 8-bit immediate");
11926 else if (i
== nargs
-1)
11928 if (!insn_p
->operand
[nargs
].predicate (op
, SImode
))
11930 error ("incorrect rounding operand");
	  /* If there is no rounding, use the normal version of the
	     pattern.  */
11935 if (INTVAL (op
) == NO_ROUND
)
	      /* Skip erasing the embedded rounding for the expanders below,
		 which generate multiple insns.  In ix86_erase_embedded_rounding
		 the pattern would be transformed into a single set, and
		 emit_insn appends that set instead of inserting it into the
		 chain, so the insns emitted inside the define_expand would
		 be ignored.  */
11944 case CODE_FOR_avx512bw_fmaddc_v32hf_mask1_round
:
11945 case CODE_FOR_avx512bw_fcmaddc_v32hf_mask1_round
:
11946 case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask1_round
:
11947 case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask1_round
:
11948 case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask3_round
:
11949 case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask3_round
:
11950 redundant_embed_rnd
= 0;
11953 redundant_embed_rnd
= 1;
11960 if (VECTOR_MODE_P (mode
))
11961 op
= safe_vector_operand (op
, mode
);
11963 op
= fixup_modeless_constant (op
, mode
);
11965 if (GET_MODE (op
) == mode
|| GET_MODE (op
) == VOIDmode
)
11967 if (optimize
|| !match
)
11968 op
= copy_to_mode_reg (mode
, op
);
11972 op
= copy_to_reg (op
);
11973 op
= lowpart_subreg (mode
, op
, GET_MODE (op
));
11983 pat
= GEN_FCN (icode
) (target
, xops
[0]);
11986 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1]);
11989 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1], xops
[2]);
11992 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1],
11996 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1],
11997 xops
[2], xops
[3], xops
[4]);
12000 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1],
12001 xops
[2], xops
[3], xops
[4], xops
[5]);
12004 gcc_unreachable ();
12010 if (redundant_embed_rnd
)
12011 pat
= ix86_erase_embedded_rounding (pat
);
12017 /* Subroutine of ix86_expand_builtin to take care of special insns
12018 with variable number of operands. */
12021 ix86_expand_special_args_builtin (const struct builtin_description
*d
,
12022 tree exp
, rtx target
)
12026 unsigned int i
, nargs
, arg_adjust
, memory
;
12027 unsigned int constant
= 100;
12028 bool aligned_mem
= false;
12030 enum insn_code icode
= d
->icode
;
12031 const struct insn_data_d
*insn_p
= &insn_data
[icode
];
12032 machine_mode tmode
= insn_p
->operand
[0].mode
;
12033 enum { load
, store
} klass
;
12035 switch ((enum ix86_builtin_func_type
) d
->flag
)
12037 case VOID_FTYPE_VOID
:
12038 emit_insn (GEN_FCN (icode
) (target
));
12040 case VOID_FTYPE_UINT64
:
12041 case VOID_FTYPE_UNSIGNED
:
12047 case INT_FTYPE_VOID
:
12048 case USHORT_FTYPE_VOID
:
12049 case UINT64_FTYPE_VOID
:
12050 case UINT_FTYPE_VOID
:
12051 case UINT8_FTYPE_VOID
:
12052 case UNSIGNED_FTYPE_VOID
:
12057 case UINT64_FTYPE_PUNSIGNED
:
12058 case V2DI_FTYPE_PV2DI
:
12059 case V4DI_FTYPE_PV4DI
:
12060 case V32QI_FTYPE_PCCHAR
:
12061 case V16QI_FTYPE_PCCHAR
:
12062 case V8SF_FTYPE_PCV4SF
:
12063 case V8SF_FTYPE_PCFLOAT
:
12064 case V4SF_FTYPE_PCFLOAT
:
12065 case V4SF_FTYPE_PCFLOAT16
:
12066 case V4SF_FTYPE_PCBFLOAT16
:
12067 case V4SF_FTYPE_PCV8BF
:
12068 case V4SF_FTYPE_PCV8HF
:
12069 case V8SF_FTYPE_PCFLOAT16
:
12070 case V8SF_FTYPE_PCBFLOAT16
:
12071 case V8SF_FTYPE_PCV16HF
:
12072 case V8SF_FTYPE_PCV16BF
:
12073 case V4DF_FTYPE_PCV2DF
:
12074 case V4DF_FTYPE_PCDOUBLE
:
12075 case V2DF_FTYPE_PCDOUBLE
:
12076 case VOID_FTYPE_PVOID
:
12077 case V8DI_FTYPE_PV8DI
:
12083 case CODE_FOR_sse4_1_movntdqa
:
12084 case CODE_FOR_avx2_movntdqa
:
12085 case CODE_FOR_avx512f_movntdqa
:
12086 aligned_mem
= true;
12092 case VOID_FTYPE_PV2SF_V4SF
:
12093 case VOID_FTYPE_PV8DI_V8DI
:
12094 case VOID_FTYPE_PV4DI_V4DI
:
12095 case VOID_FTYPE_PV2DI_V2DI
:
12096 case VOID_FTYPE_PCHAR_V32QI
:
12097 case VOID_FTYPE_PCHAR_V16QI
:
12098 case VOID_FTYPE_PFLOAT_V16SF
:
12099 case VOID_FTYPE_PFLOAT_V8SF
:
12100 case VOID_FTYPE_PFLOAT_V4SF
:
12101 case VOID_FTYPE_PDOUBLE_V8DF
:
12102 case VOID_FTYPE_PDOUBLE_V4DF
:
12103 case VOID_FTYPE_PDOUBLE_V2DF
:
12104 case VOID_FTYPE_PLONGLONG_LONGLONG
:
12105 case VOID_FTYPE_PULONGLONG_ULONGLONG
:
12106 case VOID_FTYPE_PUNSIGNED_UNSIGNED
:
12107 case VOID_FTYPE_PINT_INT
:
12110 /* Reserve memory operand for target. */
12111 memory
= ARRAY_SIZE (xops
);
12114 /* These builtins and instructions require the memory
12115 to be properly aligned. */
12116 case CODE_FOR_avx_movntv4di
:
12117 case CODE_FOR_sse2_movntv2di
:
12118 case CODE_FOR_avx_movntv8sf
:
12119 case CODE_FOR_sse_movntv4sf
:
12120 case CODE_FOR_sse4a_vmmovntv4sf
:
12121 case CODE_FOR_avx_movntv4df
:
12122 case CODE_FOR_sse2_movntv2df
:
12123 case CODE_FOR_sse4a_vmmovntv2df
:
12124 case CODE_FOR_sse2_movntidi
:
12125 case CODE_FOR_sse_movntq
:
12126 case CODE_FOR_sse2_movntisi
:
12127 case CODE_FOR_avx512f_movntv16sf
:
12128 case CODE_FOR_avx512f_movntv8df
:
12129 case CODE_FOR_avx512f_movntv8di
:
12130 aligned_mem
= true;
12136 case VOID_FTYPE_PVOID_PCVOID
:
12142 case V4SF_FTYPE_V4SF_PCV2SF
:
12143 case V2DF_FTYPE_V2DF_PCDOUBLE
:
12148 case V8SF_FTYPE_PCV8SF_V8SI
:
12149 case V4DF_FTYPE_PCV4DF_V4DI
:
12150 case V4SF_FTYPE_PCV4SF_V4SI
:
12151 case V2DF_FTYPE_PCV2DF_V2DI
:
12152 case V8SI_FTYPE_PCV8SI_V8SI
:
12153 case V4DI_FTYPE_PCV4DI_V4DI
:
12154 case V4SI_FTYPE_PCV4SI_V4SI
:
12155 case V2DI_FTYPE_PCV2DI_V2DI
:
12156 case VOID_FTYPE_INT_INT64
:
12161 case VOID_FTYPE_PV8DF_V8DF_UQI
:
12162 case VOID_FTYPE_PV4DF_V4DF_UQI
:
12163 case VOID_FTYPE_PV2DF_V2DF_UQI
:
12164 case VOID_FTYPE_PV16SF_V16SF_UHI
:
12165 case VOID_FTYPE_PV8SF_V8SF_UQI
:
12166 case VOID_FTYPE_PV4SF_V4SF_UQI
:
12167 case VOID_FTYPE_PV8DI_V8DI_UQI
:
12168 case VOID_FTYPE_PV4DI_V4DI_UQI
:
12169 case VOID_FTYPE_PV2DI_V2DI_UQI
:
12170 case VOID_FTYPE_PV16SI_V16SI_UHI
:
12171 case VOID_FTYPE_PV8SI_V8SI_UQI
:
12172 case VOID_FTYPE_PV4SI_V4SI_UQI
:
12173 case VOID_FTYPE_PV64QI_V64QI_UDI
:
12174 case VOID_FTYPE_PV32HI_V32HI_USI
:
12175 case VOID_FTYPE_PV32QI_V32QI_USI
:
12176 case VOID_FTYPE_PV16QI_V16QI_UHI
:
12177 case VOID_FTYPE_PV16HI_V16HI_UHI
:
12178 case VOID_FTYPE_PV8HI_V8HI_UQI
:
12181 /* These builtins and instructions require the memory
12182 to be properly aligned. */
12183 case CODE_FOR_avx512f_storev16sf_mask
:
12184 case CODE_FOR_avx512f_storev16si_mask
:
12185 case CODE_FOR_avx512f_storev8df_mask
:
12186 case CODE_FOR_avx512f_storev8di_mask
:
12187 case CODE_FOR_avx512vl_storev8sf_mask
:
12188 case CODE_FOR_avx512vl_storev8si_mask
:
12189 case CODE_FOR_avx512vl_storev4df_mask
:
12190 case CODE_FOR_avx512vl_storev4di_mask
:
12191 case CODE_FOR_avx512vl_storev4sf_mask
:
12192 case CODE_FOR_avx512vl_storev4si_mask
:
12193 case CODE_FOR_avx512vl_storev2df_mask
:
12194 case CODE_FOR_avx512vl_storev2di_mask
:
12195 aligned_mem
= true;
12201 case VOID_FTYPE_PV8SF_V8SI_V8SF
:
12202 case VOID_FTYPE_PV4DF_V4DI_V4DF
:
12203 case VOID_FTYPE_PV4SF_V4SI_V4SF
:
12204 case VOID_FTYPE_PV2DF_V2DI_V2DF
:
12205 case VOID_FTYPE_PV8SI_V8SI_V8SI
:
12206 case VOID_FTYPE_PV4DI_V4DI_V4DI
:
12207 case VOID_FTYPE_PV4SI_V4SI_V4SI
:
12208 case VOID_FTYPE_PV2DI_V2DI_V2DI
:
12209 case VOID_FTYPE_PV8SI_V8DI_UQI
:
12210 case VOID_FTYPE_PV8HI_V8DI_UQI
:
12211 case VOID_FTYPE_PV16HI_V16SI_UHI
:
12212 case VOID_FTYPE_PUDI_V8DI_UQI
:
12213 case VOID_FTYPE_PV16QI_V16SI_UHI
:
12214 case VOID_FTYPE_PV4SI_V4DI_UQI
:
12215 case VOID_FTYPE_PUDI_V2DI_UQI
:
12216 case VOID_FTYPE_PUDI_V4DI_UQI
:
12217 case VOID_FTYPE_PUSI_V2DI_UQI
:
12218 case VOID_FTYPE_PV8HI_V8SI_UQI
:
12219 case VOID_FTYPE_PUDI_V4SI_UQI
:
12220 case VOID_FTYPE_PUSI_V4DI_UQI
:
12221 case VOID_FTYPE_PUHI_V2DI_UQI
:
12222 case VOID_FTYPE_PUDI_V8SI_UQI
:
12223 case VOID_FTYPE_PUSI_V4SI_UQI
:
12224 case VOID_FTYPE_PCHAR_V64QI_UDI
:
12225 case VOID_FTYPE_PCHAR_V32QI_USI
:
12226 case VOID_FTYPE_PCHAR_V16QI_UHI
:
12227 case VOID_FTYPE_PSHORT_V32HI_USI
:
12228 case VOID_FTYPE_PSHORT_V16HI_UHI
:
12229 case VOID_FTYPE_PSHORT_V8HI_UQI
:
12230 case VOID_FTYPE_PINT_V16SI_UHI
:
12231 case VOID_FTYPE_PINT_V8SI_UQI
:
12232 case VOID_FTYPE_PINT_V4SI_UQI
:
12233 case VOID_FTYPE_PINT64_V8DI_UQI
:
12234 case VOID_FTYPE_PINT64_V4DI_UQI
:
12235 case VOID_FTYPE_PINT64_V2DI_UQI
:
12236 case VOID_FTYPE_PDOUBLE_V8DF_UQI
:
12237 case VOID_FTYPE_PDOUBLE_V4DF_UQI
:
12238 case VOID_FTYPE_PDOUBLE_V2DF_UQI
:
12239 case VOID_FTYPE_PFLOAT_V16SF_UHI
:
12240 case VOID_FTYPE_PFLOAT_V8SF_UQI
:
12241 case VOID_FTYPE_PFLOAT_V4SF_UQI
:
12242 case VOID_FTYPE_PCFLOAT16_V8HF_UQI
:
12243 case VOID_FTYPE_PV32QI_V32HI_USI
:
12244 case VOID_FTYPE_PV16QI_V16HI_UHI
:
12245 case VOID_FTYPE_PUDI_V8HI_UQI
:
12248 /* Reserve memory operand for target. */
12249 memory
= ARRAY_SIZE (xops
);
12251 case V4SF_FTYPE_PCV4SF_V4SF_UQI
:
12252 case V8SF_FTYPE_PCV8SF_V8SF_UQI
:
12253 case V16SF_FTYPE_PCV16SF_V16SF_UHI
:
12254 case V4SI_FTYPE_PCV4SI_V4SI_UQI
:
12255 case V8SI_FTYPE_PCV8SI_V8SI_UQI
:
12256 case V16SI_FTYPE_PCV16SI_V16SI_UHI
:
12257 case V2DF_FTYPE_PCV2DF_V2DF_UQI
:
12258 case V4DF_FTYPE_PCV4DF_V4DF_UQI
:
12259 case V8DF_FTYPE_PCV8DF_V8DF_UQI
:
12260 case V2DI_FTYPE_PCV2DI_V2DI_UQI
:
12261 case V4DI_FTYPE_PCV4DI_V4DI_UQI
:
12262 case V8DI_FTYPE_PCV8DI_V8DI_UQI
:
12263 case V64QI_FTYPE_PCV64QI_V64QI_UDI
:
12264 case V32HI_FTYPE_PCV32HI_V32HI_USI
:
12265 case V32QI_FTYPE_PCV32QI_V32QI_USI
:
12266 case V16QI_FTYPE_PCV16QI_V16QI_UHI
:
12267 case V16HI_FTYPE_PCV16HI_V16HI_UHI
:
12268 case V8HI_FTYPE_PCV8HI_V8HI_UQI
:
12271 /* These builtins and instructions require the memory
12272 to be properly aligned. */
12273 case CODE_FOR_avx512f_loadv16sf_mask
:
12274 case CODE_FOR_avx512f_loadv16si_mask
:
12275 case CODE_FOR_avx512f_loadv8df_mask
:
12276 case CODE_FOR_avx512f_loadv8di_mask
:
12277 case CODE_FOR_avx512vl_loadv8sf_mask
:
12278 case CODE_FOR_avx512vl_loadv8si_mask
:
12279 case CODE_FOR_avx512vl_loadv4df_mask
:
12280 case CODE_FOR_avx512vl_loadv4di_mask
:
12281 case CODE_FOR_avx512vl_loadv4sf_mask
:
12282 case CODE_FOR_avx512vl_loadv4si_mask
:
12283 case CODE_FOR_avx512vl_loadv2df_mask
:
12284 case CODE_FOR_avx512vl_loadv2di_mask
:
12285 case CODE_FOR_avx512bw_loadv64qi_mask
:
12286 case CODE_FOR_avx512vl_loadv32qi_mask
:
12287 case CODE_FOR_avx512vl_loadv16qi_mask
:
12288 case CODE_FOR_avx512bw_loadv32hi_mask
:
12289 case CODE_FOR_avx512vl_loadv16hi_mask
:
12290 case CODE_FOR_avx512vl_loadv8hi_mask
:
12291 aligned_mem
= true;
12297 case V64QI_FTYPE_PCCHAR_V64QI_UDI
:
12298 case V32QI_FTYPE_PCCHAR_V32QI_USI
:
12299 case V16QI_FTYPE_PCCHAR_V16QI_UHI
:
12300 case V32HI_FTYPE_PCSHORT_V32HI_USI
:
12301 case V16HI_FTYPE_PCSHORT_V16HI_UHI
:
12302 case V8HI_FTYPE_PCSHORT_V8HI_UQI
:
12303 case V16SI_FTYPE_PCINT_V16SI_UHI
:
12304 case V8SI_FTYPE_PCINT_V8SI_UQI
:
12305 case V4SI_FTYPE_PCINT_V4SI_UQI
:
12306 case V8DI_FTYPE_PCINT64_V8DI_UQI
:
12307 case V4DI_FTYPE_PCINT64_V4DI_UQI
:
12308 case V2DI_FTYPE_PCINT64_V2DI_UQI
:
12309 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI
:
12310 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI
:
12311 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI
:
12312 case V16SF_FTYPE_PCFLOAT_V16SF_UHI
:
12313 case V8SF_FTYPE_PCFLOAT_V8SF_UQI
:
12314 case V4SF_FTYPE_PCFLOAT_V4SF_UQI
:
12315 case V8HF_FTYPE_PCFLOAT16_V8HF_UQI
:
12320 case INT_FTYPE_PINT_INT_INT_INT
:
12321 case LONGLONG_FTYPE_PLONGLONG_LONGLONG_LONGLONG_INT
:
12328 gcc_unreachable ();
12331 gcc_assert (nargs
<= ARRAY_SIZE (xops
));
12333 if (klass
== store
)
12335 arg
= CALL_EXPR_ARG (exp
, 0);
12336 op
= expand_normal (arg
);
12337 gcc_assert (target
== 0);
12340 op
= ix86_zero_extend_to_Pmode (op
);
12341 target
= gen_rtx_MEM (tmode
, op
);
      /* target at this point has just BITS_PER_UNIT MEM_ALIGN
	 on it.  Try to improve it using get_pointer_alignment,
	 and if the special builtin is one that requires strict
	 mode alignment, also from its GET_MODE_ALIGNMENT.
	 Failure to do so could lead to ix86_legitimate_combined_insn
	 rejecting all changes to such insns.  */
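      /* Hypothetical example: for a non-temporal store builtin such as
	 __builtin_ia32_movntps256, the pointer argument is typically known
	 to be 32-byte aligned, so MEM_ALIGN on the freshly built MEM is
	 raised from BITS_PER_UNIT to GET_MODE_ALIGNMENT (V8SFmode).  */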
12348 unsigned int align
= get_pointer_alignment (arg
);
12349 if (aligned_mem
&& align
< GET_MODE_ALIGNMENT (tmode
))
12350 align
= GET_MODE_ALIGNMENT (tmode
);
12351 if (MEM_ALIGN (target
) < align
)
12352 set_mem_align (target
, align
);
12355 target
= force_reg (tmode
, op
);
12363 || !register_operand (target
, tmode
)
12364 || GET_MODE (target
) != tmode
)
12365 target
= gen_reg_rtx (tmode
);
12368 for (i
= 0; i
< nargs
; i
++)
12370 machine_mode mode
= insn_p
->operand
[i
+ 1].mode
;
12372 arg
= CALL_EXPR_ARG (exp
, i
+ arg_adjust
);
12373 op
= expand_normal (arg
);
12377 /* This must be the memory operand. */
12378 op
= ix86_zero_extend_to_Pmode (op
);
12379 op
= gen_rtx_MEM (mode
, op
);
	  /* op at this point has just BITS_PER_UNIT MEM_ALIGN
	     on it.  Try to improve it using get_pointer_alignment,
	     and if the special builtin is one that requires strict
	     mode alignment, also from its GET_MODE_ALIGNMENT.
	     Failure to do so could lead to ix86_legitimate_combined_insn
	     rejecting all changes to such insns.  */
12386 unsigned int align
= get_pointer_alignment (arg
);
12387 if (aligned_mem
&& align
< GET_MODE_ALIGNMENT (mode
))
12388 align
= GET_MODE_ALIGNMENT (mode
);
12389 if (MEM_ALIGN (op
) < align
)
12390 set_mem_align (op
, align
);
12392 else if (i
== constant
)
12394 /* This must be the constant. */
12395 if (!insn_p
->operand
[nargs
].predicate(op
, SImode
))
12397 error ("the fourth argument must be one of enum %qs", "_CMPCCX_ENUM");
	  /* This must be a register.  */
12404 if (VECTOR_MODE_P (mode
))
12405 op
= safe_vector_operand (op
, mode
);
12407 op
= fixup_modeless_constant (op
, mode
);
	  /* NB: a 3-operand load implies that it is a mask load or
	     v{p}expand*, and that the mask operand should be last.
	     Keep an all-ones mask, which will be simplified away by the
	     expander.  */
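	  /* Illustrative case, with the intrinsic spelling assumed rather
	     than quoted from this file: a call like
	     _mm256_maskz_loadu_epi32 ((__mmask8) -1, p) arrives here with a
	     constant all-ones mask; leaving that operand alone lets the
	     expander simplify the masked load into an ordinary load.  */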
12412 if (nargs
== 3 && i
== 2 && klass
== load
12413 && constm1_operand (op
, mode
)
12414 && insn_p
->operand
[i
].predicate (op
, mode
))
12416 else if (GET_MODE (op
) == mode
|| GET_MODE (op
) == VOIDmode
)
12417 op
= copy_to_mode_reg (mode
, op
);
12420 op
= copy_to_reg (op
);
12421 op
= lowpart_subreg (mode
, op
, GET_MODE (op
));
12431 pat
= GEN_FCN (icode
) (target
);
12434 pat
= GEN_FCN (icode
) (target
, xops
[0]);
12437 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1]);
12440 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1], xops
[2]);
12443 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1], xops
[2], xops
[3]);
12446 gcc_unreachable ();
12453 return klass
== store
? 0 : target
;
12456 /* Return the integer constant in ARG. Constrain it to be in the range
12457 of the subparts of VEC_TYPE; issue an error if not. */
get_element_number (tree vec_type, tree arg)
{
  unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;

  if (!tree_fits_uhwi_p (arg)
      || (elt = tree_to_uhwi (arg), elt > max))
    error ("selector must be an integer constant in the range "
/* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
   ix86_expand_vector_init.  We DO have language-level syntax for this, in
   the form of (type){ init-list }.  Except that since we can't place emms
   instructions from inside the compiler, we can't allow the use of MMX
   registers unless the user explicitly asks for it.  So we do *not* define
   vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md.  Instead
   we have builtins invoked by mmintrin.h that give us license to emit
   these sorts of instructions.  */
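/* Illustrative note, with the builtin name assumed from mmintrin.h rather
   than restated here: _mm_set_pi32 (a, b) expands through
   __builtin_ia32_vec_init_v2si, which is routed to
   ix86_expand_vec_init_builtin below instead of to generic vec_init
   patterns.  */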
ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
{
  machine_mode tmode = TYPE_MODE (type);
  machine_mode inner_mode = GET_MODE_INNER (tmode);
  int i, n_elt = GET_MODE_NUNITS (tmode);
  rtvec v = rtvec_alloc (n_elt);

  gcc_assert (VECTOR_MODE_P (tmode));
  gcc_assert (call_expr_nargs (exp) == n_elt);

  for (i = 0; i < n_elt; ++i)
    {
      rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
      RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
    }

  if (!target || !register_operand (target, tmode))
    target = gen_reg_rtx (tmode);

  ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
  return target;
}
12508 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
12509 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
12510 had a language-level syntax for referencing vector elements. */
ix86_expand_vec_ext_builtin (tree exp, rtx target)
{
  machine_mode tmode, mode0;
  tree arg0, arg1;
  unsigned HOST_WIDE_INT elt;
  rtx op0;

  arg0 = CALL_EXPR_ARG (exp, 0);
  arg1 = CALL_EXPR_ARG (exp, 1);

  op0 = expand_normal (arg0);
  elt = get_element_number (TREE_TYPE (arg0), arg1);

  tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
  mode0 = TYPE_MODE (TREE_TYPE (arg0));
  gcc_assert (VECTOR_MODE_P (mode0));

  op0 = force_reg (mode0, op0);

  if (optimize || !target || !register_operand (target, tmode))
    target = gen_reg_rtx (tmode);

  ix86_expand_vector_extract (true, target, op0, elt);
  return target;
}
12540 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
12541 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
12542 a language-level syntax for referencing vector elements. */
ix86_expand_vec_set_builtin (tree exp)
{
  machine_mode tmode, mode1;
  tree arg0, arg1, arg2;
  unsigned HOST_WIDE_INT elt;
  rtx op0, op1, target;

  arg0 = CALL_EXPR_ARG (exp, 0);
  arg1 = CALL_EXPR_ARG (exp, 1);
  arg2 = CALL_EXPR_ARG (exp, 2);

  tmode = TYPE_MODE (TREE_TYPE (arg0));
  mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
  gcc_assert (VECTOR_MODE_P (tmode));

  op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
  op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
  elt = get_element_number (TREE_TYPE (arg0), arg2);

  if (GET_MODE (op1) != mode1)
    op1 = convert_modes (mode1, GET_MODE (op1), op1, true);

  op0 = force_reg (tmode, op0);
  op1 = force_reg (mode1, op1);

  /* OP0 is the source of these builtin functions and shouldn't be
     modified.  Create a copy, use it and return it as target.  */
  target = gen_reg_rtx (tmode);
  emit_move_insn (target, op0);
  ix86_expand_vector_set (true, target, op1, elt);
  return target;
}
/* Return true if the necessary ISA options for this builtin exist,
   else false.
   fcode = DECL_MD_FUNCTION_CODE (fndecl);  */
ix86_check_builtin_isa_match (unsigned int fcode,
			      HOST_WIDE_INT *pbisa,
			      HOST_WIDE_INT *pbisa2)
{
  HOST_WIDE_INT isa = ix86_isa_flags;
  HOST_WIDE_INT isa2 = ix86_isa_flags2;
  HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
  HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
  HOST_WIDE_INT tmp_isa = isa, tmp_isa2 = isa2;
  /* The general case is we require all the ISAs specified in bisa{,2}
     to be enabled.
     The exceptions are:
     OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
     OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
     OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
     (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL) or
       OPTION_MASK_ISA2_AVXVNNI
     (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL) or
       OPTION_MASK_ISA2_AVXIFMA
     (OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA2_AVX512BF16) or
       OPTION_MASK_ISA2_AVXNECONVERT
     where for each such pair it is sufficient if either of the ISAs is
     enabled, plus if it is ored with other options also those others.
     OPTION_MASK_ISA_MMX in bisa is satisfied also if TARGET_MMX_WITH_SSE.  */
#define SHARE_BUILTIN(A1, A2, B1, B2) \
  if ((((bisa & (A1)) == (A1) && (bisa2 & (A2)) == (A2)) \
       && ((bisa & (B1)) == (B1) && (bisa2 & (B2)) == (B2))) \
      && (((isa & (A1)) == (A1) && (isa2 & (A2)) == (A2)) \
	  || ((isa & (B1)) == (B1) && (isa2 & (B2)) == (B2)))) \
    { \
      tmp_isa |= (A1) | (B1); \
      tmp_isa2 |= (A2) | (B2); \
    }

  SHARE_BUILTIN (OPTION_MASK_ISA_SSE, 0, OPTION_MASK_ISA_3DNOW_A, 0);
  SHARE_BUILTIN (OPTION_MASK_ISA_SSE4_2, 0, OPTION_MASK_ISA_CRC32, 0);
  SHARE_BUILTIN (OPTION_MASK_ISA_FMA, 0, OPTION_MASK_ISA_FMA4, 0);
  SHARE_BUILTIN (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, 0,
		 OPTION_MASK_ISA2_AVXVNNI);
  SHARE_BUILTIN (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL, 0, 0,
		 OPTION_MASK_ISA2_AVXIFMA);
  SHARE_BUILTIN (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512BF16, 0,
		 OPTION_MASK_ISA2_AVXNECONVERT);
  SHARE_BUILTIN (OPTION_MASK_ISA_AES, 0, 0, OPTION_MASK_ISA2_VAES);
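  /* Worked example, for illustration only: a builtin whose isa mask is
     OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL remains usable
     when only -mavxvnni is enabled; the corresponding SHARE_BUILTIN
     invocation above then ORs the AVX512VNNI and AVX512VL bits into
     tmp_isa, so the final availability check of this function treats those
     ISA bits as enabled.  */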
  if ((bisa & OPTION_MASK_ISA_MMX) && !TARGET_MMX && TARGET_MMX_WITH_SSE
      /* __builtin_ia32_maskmovq requires MMX registers.  */
      && fcode != IX86_BUILTIN_MASKMOVQ)
    {
      bisa &= ~OPTION_MASK_ISA_MMX;
      bisa |= OPTION_MASK_ISA_SSE2;
    }

  return (bisa & isa) == bisa && (bisa2 & isa2) == bisa2;
12647 /* Expand an expression EXP that calls a built-in function,
12648 with result going to TARGET if that's convenient
12649 (and in mode MODE if that's convenient).
12650 SUBTARGET may be used as the target for computing one of EXP's operands.
12651 IGNORE is nonzero if the value is to be ignored. */
12654 ix86_expand_builtin (tree exp
, rtx target
, rtx subtarget
,
12655 machine_mode mode
, int ignore
)
12658 enum insn_code icode
, icode2
;
12659 tree fndecl
= TREE_OPERAND (CALL_EXPR_FN (exp
), 0);
12660 tree arg0
, arg1
, arg2
, arg3
, arg4
;
12661 rtx op0
, op1
, op2
, op3
, op4
, pat
, pat2
, insn
;
12662 machine_mode mode0
, mode1
, mode2
, mode3
, mode4
;
12663 unsigned int fcode
= DECL_MD_FUNCTION_CODE (fndecl
);
12664 HOST_WIDE_INT bisa
, bisa2
;
12666 /* For CPU builtins that can be folded, fold first and expand the fold. */
12669 case IX86_BUILTIN_CPU_INIT
:
12671 /* Make it call __cpu_indicator_init in libgcc. */
12672 tree call_expr
, fndecl
, type
;
12673 type
= build_function_type_list (integer_type_node
, NULL_TREE
);
12674 fndecl
= build_fn_decl ("__cpu_indicator_init", type
);
12675 call_expr
= build_call_expr (fndecl
, 0);
12676 return expand_expr (call_expr
, target
, mode
, EXPAND_NORMAL
);
12678 case IX86_BUILTIN_CPU_IS
:
12679 case IX86_BUILTIN_CPU_SUPPORTS
:
12681 tree arg0
= CALL_EXPR_ARG (exp
, 0);
12682 tree fold_expr
= fold_builtin_cpu (fndecl
, &arg0
);
12683 gcc_assert (fold_expr
!= NULL_TREE
);
12684 return expand_expr (fold_expr
, target
, mode
, EXPAND_NORMAL
);
12688 if (!ix86_check_builtin_isa_match (fcode
, &bisa
, &bisa2
))
12690 bool add_abi_p
= bisa
& OPTION_MASK_ISA_64BIT
;
12691 if (TARGET_ABI_X32
)
12692 bisa
|= OPTION_MASK_ABI_X32
;
12694 bisa
|= OPTION_MASK_ABI_64
;
12695 char *opts
= ix86_target_string (bisa
, bisa2
, 0, 0, NULL
, NULL
,
12696 (enum fpmath_unit
) 0,
12697 (enum prefer_vector_width
) 0,
12698 PVW_NONE
, PVW_NONE
,
12701 error ("%qE needs unknown isa option", fndecl
);
12704 gcc_assert (opts
!= NULL
);
12705 error ("%qE needs isa option %s", fndecl
, opts
);
12708 return expand_call (exp
, target
, ignore
);
12713 case IX86_BUILTIN_MASKMOVQ
:
12714 case IX86_BUILTIN_MASKMOVDQU
:
12715 icode
= (fcode
== IX86_BUILTIN_MASKMOVQ
12716 ? CODE_FOR_mmx_maskmovq
12717 : CODE_FOR_sse2_maskmovdqu
);
12718 /* Note the arg order is different from the operand order. */
12719 arg1
= CALL_EXPR_ARG (exp
, 0);
12720 arg2
= CALL_EXPR_ARG (exp
, 1);
12721 arg0
= CALL_EXPR_ARG (exp
, 2);
12722 op0
= expand_normal (arg0
);
12723 op1
= expand_normal (arg1
);
12724 op2
= expand_normal (arg2
);
12725 mode0
= insn_data
[icode
].operand
[0].mode
;
12726 mode1
= insn_data
[icode
].operand
[1].mode
;
12727 mode2
= insn_data
[icode
].operand
[2].mode
;
12729 op0
= ix86_zero_extend_to_Pmode (op0
);
12730 op0
= gen_rtx_MEM (mode1
, op0
);
12732 if (!insn_data
[icode
].operand
[0].predicate (op0
, mode0
))
12733 op0
= copy_to_mode_reg (mode0
, op0
);
12734 if (!insn_data
[icode
].operand
[1].predicate (op1
, mode1
))
12735 op1
= copy_to_mode_reg (mode1
, op1
);
12736 if (!insn_data
[icode
].operand
[2].predicate (op2
, mode2
))
12737 op2
= copy_to_mode_reg (mode2
, op2
);
12738 pat
= GEN_FCN (icode
) (op0
, op1
, op2
);
12744 case IX86_BUILTIN_LDMXCSR
:
12745 op0
= expand_normal (CALL_EXPR_ARG (exp
, 0));
12746 target
= assign_386_stack_local (SImode
, SLOT_TEMP
);
12747 emit_move_insn (target
, op0
);
12748 emit_insn (gen_sse_ldmxcsr (target
));
12751 case IX86_BUILTIN_STMXCSR
:
12752 target
= assign_386_stack_local (SImode
, SLOT_TEMP
);
12753 emit_insn (gen_sse_stmxcsr (target
));
12754 return copy_to_mode_reg (SImode
, target
);
12756 case IX86_BUILTIN_CLFLUSH
:
12757 arg0
= CALL_EXPR_ARG (exp
, 0);
12758 op0
= expand_normal (arg0
);
12759 icode
= CODE_FOR_sse2_clflush
;
12760 if (!insn_data
[icode
].operand
[0].predicate (op0
, Pmode
))
12761 op0
= ix86_zero_extend_to_Pmode (op0
);
12763 emit_insn (gen_sse2_clflush (op0
));
12766 case IX86_BUILTIN_CLWB
:
12767 arg0
= CALL_EXPR_ARG (exp
, 0);
12768 op0
= expand_normal (arg0
);
12769 icode
= CODE_FOR_clwb
;
12770 if (!insn_data
[icode
].operand
[0].predicate (op0
, Pmode
))
12771 op0
= ix86_zero_extend_to_Pmode (op0
);
12773 emit_insn (gen_clwb (op0
));
12776 case IX86_BUILTIN_CLFLUSHOPT
:
12777 arg0
= CALL_EXPR_ARG (exp
, 0);
12778 op0
= expand_normal (arg0
);
12779 icode
= CODE_FOR_clflushopt
;
12780 if (!insn_data
[icode
].operand
[0].predicate (op0
, Pmode
))
12781 op0
= ix86_zero_extend_to_Pmode (op0
);
12783 emit_insn (gen_clflushopt (op0
));
12786 case IX86_BUILTIN_MONITOR
:
12787 case IX86_BUILTIN_MONITORX
:
12788 arg0
= CALL_EXPR_ARG (exp
, 0);
12789 arg1
= CALL_EXPR_ARG (exp
, 1);
12790 arg2
= CALL_EXPR_ARG (exp
, 2);
12791 op0
= expand_normal (arg0
);
12792 op1
= expand_normal (arg1
);
12793 op2
= expand_normal (arg2
);
12795 op0
= ix86_zero_extend_to_Pmode (op0
);
12797 op1
= copy_to_mode_reg (SImode
, op1
);
12799 op2
= copy_to_mode_reg (SImode
, op2
);
12801 emit_insn (fcode
== IX86_BUILTIN_MONITOR
12802 ? gen_sse3_monitor (Pmode
, op0
, op1
, op2
)
12803 : gen_monitorx (Pmode
, op0
, op1
, op2
));
12806 case IX86_BUILTIN_MWAIT
:
12807 arg0
= CALL_EXPR_ARG (exp
, 0);
12808 arg1
= CALL_EXPR_ARG (exp
, 1);
12809 op0
= expand_normal (arg0
);
12810 op1
= expand_normal (arg1
);
12812 op0
= copy_to_mode_reg (SImode
, op0
);
12814 op1
= copy_to_mode_reg (SImode
, op1
);
12815 emit_insn (gen_sse3_mwait (op0
, op1
));
12818 case IX86_BUILTIN_MWAITX
:
12819 arg0
= CALL_EXPR_ARG (exp
, 0);
12820 arg1
= CALL_EXPR_ARG (exp
, 1);
12821 arg2
= CALL_EXPR_ARG (exp
, 2);
12822 op0
= expand_normal (arg0
);
12823 op1
= expand_normal (arg1
);
12824 op2
= expand_normal (arg2
);
12826 op0
= copy_to_mode_reg (SImode
, op0
);
12828 op1
= copy_to_mode_reg (SImode
, op1
);
12830 op2
= copy_to_mode_reg (SImode
, op2
);
12831 emit_insn (gen_mwaitx (op0
, op1
, op2
));
12834 case IX86_BUILTIN_UMONITOR
:
12835 arg0
= CALL_EXPR_ARG (exp
, 0);
12836 op0
= expand_normal (arg0
);
12838 op0
= ix86_zero_extend_to_Pmode (op0
);
12839 emit_insn (gen_umonitor (Pmode
, op0
));
12842 case IX86_BUILTIN_UMWAIT
:
12843 case IX86_BUILTIN_TPAUSE
:
12844 arg0
= CALL_EXPR_ARG (exp
, 0);
12845 arg1
= CALL_EXPR_ARG (exp
, 1);
12846 op0
= expand_normal (arg0
);
12847 op1
= expand_normal (arg1
);
12850 op0
= copy_to_mode_reg (SImode
, op0
);
12852 op1
= force_reg (DImode
, op1
);
12856 op2
= expand_simple_binop (DImode
, LSHIFTRT
, op1
, GEN_INT (32),
12857 NULL
, 1, OPTAB_DIRECT
);
12860 case IX86_BUILTIN_UMWAIT
:
12861 icode
= CODE_FOR_umwait_rex64
;
12863 case IX86_BUILTIN_TPAUSE
:
12864 icode
= CODE_FOR_tpause_rex64
;
12867 gcc_unreachable ();
12870 op2
= gen_lowpart (SImode
, op2
);
12871 op1
= gen_lowpart (SImode
, op1
);
12872 pat
= GEN_FCN (icode
) (op0
, op1
, op2
);
12878 case IX86_BUILTIN_UMWAIT
:
12879 icode
= CODE_FOR_umwait
;
12881 case IX86_BUILTIN_TPAUSE
:
12882 icode
= CODE_FOR_tpause
;
12885 gcc_unreachable ();
12887 pat
= GEN_FCN (icode
) (op0
, op1
);
12896 || !register_operand (target
, QImode
))
12897 target
= gen_reg_rtx (QImode
);
12899 pat
= gen_rtx_EQ (QImode
, gen_rtx_REG (CCCmode
, FLAGS_REG
),
12901 emit_insn (gen_rtx_SET (target
, pat
));
12905 case IX86_BUILTIN_TESTUI
:
12906 emit_insn (gen_testui ());
12909 || !register_operand (target
, QImode
))
12910 target
= gen_reg_rtx (QImode
);
12912 pat
= gen_rtx_LTU (QImode
, gen_rtx_REG (CCCmode
, FLAGS_REG
),
12914 emit_insn (gen_rtx_SET (target
, pat
));
12918 case IX86_BUILTIN_CLZERO
:
12919 arg0
= CALL_EXPR_ARG (exp
, 0);
12920 op0
= expand_normal (arg0
);
12922 op0
= ix86_zero_extend_to_Pmode (op0
);
12923 emit_insn (gen_clzero (Pmode
, op0
));
12926 case IX86_BUILTIN_CLDEMOTE
:
12927 arg0
= CALL_EXPR_ARG (exp
, 0);
12928 op0
= expand_normal (arg0
);
12929 icode
= CODE_FOR_cldemote
;
12930 if (!insn_data
[icode
].operand
[0].predicate (op0
, Pmode
))
12931 op0
= ix86_zero_extend_to_Pmode (op0
);
12933 emit_insn (gen_cldemote (op0
));
12936 case IX86_BUILTIN_LOADIWKEY
:
12938 arg0
= CALL_EXPR_ARG (exp
, 0);
12939 arg1
= CALL_EXPR_ARG (exp
, 1);
12940 arg2
= CALL_EXPR_ARG (exp
, 2);
12941 arg3
= CALL_EXPR_ARG (exp
, 3);
12943 op0
= expand_normal (arg0
);
12944 op1
= expand_normal (arg1
);
12945 op2
= expand_normal (arg2
);
12946 op3
= expand_normal (arg3
);
12949 op0
= copy_to_mode_reg (V2DImode
, op0
);
12951 op1
= copy_to_mode_reg (V2DImode
, op1
);
12953 op2
= copy_to_mode_reg (V2DImode
, op2
);
12955 op3
= copy_to_mode_reg (SImode
, op3
);
12957 emit_insn (gen_loadiwkey (op0
, op1
, op2
, op3
));
    case IX86_BUILTIN_AESDEC128KLU8:
      icode = CODE_FOR_aesdec128klu8;
      goto aesdecenc_expand;

    case IX86_BUILTIN_AESDEC256KLU8:
      icode = CODE_FOR_aesdec256klu8;
      goto aesdecenc_expand;

    case IX86_BUILTIN_AESENC128KLU8:
      icode = CODE_FOR_aesenc128klu8;
      goto aesdecenc_expand;

    case IX86_BUILTIN_AESENC256KLU8:
      icode = CODE_FOR_aesenc256klu8;

    aesdecenc_expand:

      arg0 = CALL_EXPR_ARG (exp, 0); // __m128i *odata
      arg1 = CALL_EXPR_ARG (exp, 1); // __m128i idata
      arg2 = CALL_EXPR_ARG (exp, 2); // const void *p

      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);

      if (!address_operand (op0, V2DImode))
        {
          op0 = convert_memory_address (Pmode, op0);
          op0 = copy_addr_to_reg (op0);
        }
      op0 = gen_rtx_MEM (V2DImode, op0);

      if (!REG_P (op1))
        op1 = copy_to_mode_reg (V2DImode, op1);

      if (!address_operand (op2, VOIDmode))
        {
          op2 = convert_memory_address (Pmode, op2);
          op2 = copy_addr_to_reg (op2);
        }
      op2 = gen_rtx_MEM (BLKmode, op2);

      emit_insn (GEN_FCN (icode) (op1, op1, op2));

      if (target == 0)
        target = gen_reg_rtx (QImode);

      /* NB: For the aesenc/aesdec Key Locker insns, ZF is set when a
         runtime error occurs.  The output should then be cleared for
         safety.  */
      rtx_code_label *ok_label;
      rtx tmp;

      tmp = gen_rtx_REG (CCZmode, FLAGS_REG);
      pat = gen_rtx_EQ (QImode, tmp, const0_rtx);
      ok_label = gen_label_rtx ();
      emit_cmp_and_jump_insns (tmp, const0_rtx, NE, 0, GET_MODE (tmp),
                               0, ok_label);
      /* The runtime error seldom occurs, so predict the OK path as the
         hot one and lay it out as the fallthrough block.  */
      predict_jump (REG_BR_PROB_BASE * 90 / 100);

      emit_insn (gen_rtx_SET (op1, const0_rtx));

      emit_label (ok_label);
      emit_insn (gen_rtx_SET (target, pat));
      emit_insn (gen_rtx_SET (op0, op1));

      return target;
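      /* Illustrative note, not part of the original source: a typical use
         of these Key Locker builtins is something like

             __m128i out;
             unsigned char ok = _mm_aesdec128kl_u8 (&out, in, handle);

         so the expansion above writes the result through the pointer
         operand and returns a QImode flag derived from ZF, clearing the
         output if the instruction reports a runtime error.  */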
13031 case IX86_BUILTIN_AESDECWIDE128KLU8
:
13032 icode
= CODE_FOR_aesdecwide128klu8
;
13033 goto wideaesdecenc_expand
;
13035 case IX86_BUILTIN_AESDECWIDE256KLU8
:
13036 icode
= CODE_FOR_aesdecwide256klu8
;
13037 goto wideaesdecenc_expand
;
13039 case IX86_BUILTIN_AESENCWIDE128KLU8
:
13040 icode
= CODE_FOR_aesencwide128klu8
;
13041 goto wideaesdecenc_expand
;
13043 case IX86_BUILTIN_AESENCWIDE256KLU8
:
13044 icode
= CODE_FOR_aesencwide256klu8
;
13046 wideaesdecenc_expand
:
13051 arg0
= CALL_EXPR_ARG (exp
, 0); // __m128i * odata
13052 arg1
= CALL_EXPR_ARG (exp
, 1); // const __m128i * idata
13053 arg2
= CALL_EXPR_ARG (exp
, 2); // const void *p
13055 op0
= expand_normal (arg0
);
13056 op1
= expand_normal (arg1
);
13057 op2
= expand_normal (arg2
);
13059 if (!address_operand (op2
, VOIDmode
))
13061 op2
= convert_memory_address (Pmode
, op2
);
13062 op2
= copy_addr_to_reg (op2
);
13064 op2
= gen_rtx_MEM (BLKmode
, op2
);
13066 for (i
= 0; i
< 8; i
++)
13068 xmm_regs
[i
] = gen_rtx_REG (V2DImode
, GET_SSE_REGNO (i
));
13070 op
= gen_rtx_MEM (V2DImode
,
13071 plus_constant (Pmode
, op1
, (i
* 16)));
13073 emit_move_insn (xmm_regs
[i
], op
);
13076 emit_insn (GEN_FCN (icode
) (op2
));
13079 target
= gen_reg_rtx (QImode
);
13081 tmp
= gen_rtx_REG (CCZmode
, FLAGS_REG
);
13082 pat
= gen_rtx_EQ (QImode
, tmp
, const0_rtx
);
13083 ok_label
= gen_label_rtx ();
13084 emit_cmp_and_jump_insns (tmp
, const0_rtx
, NE
, 0, GET_MODE (tmp
),
13086 predict_jump (REG_BR_PROB_BASE
* 90 / 100);
13088 for (i
= 0; i
< 8; i
++)
13089 emit_insn (gen_rtx_SET (xmm_regs
[i
], const0_rtx
));
13091 emit_label (ok_label
);
13092 emit_insn (gen_rtx_SET (target
, pat
));
13094 for (i
= 0; i
< 8; i
++)
13096 op
= gen_rtx_MEM (V2DImode
,
13097 plus_constant (Pmode
, op0
, (i
* 16)));
13098 emit_move_insn (op
, xmm_regs
[i
]);
13103 case IX86_BUILTIN_ENCODEKEY128U32
:
13105 rtx op
, xmm_regs
[7];
13107 arg0
= CALL_EXPR_ARG (exp
, 0); // unsigned int htype
13108 arg1
= CALL_EXPR_ARG (exp
, 1); // __m128i key
13109 arg2
= CALL_EXPR_ARG (exp
, 2); // void *h
13111 op0
= expand_normal (arg0
);
13112 op1
= expand_normal (arg1
);
13113 op2
= expand_normal (arg2
);
13116 op0
= copy_to_mode_reg (SImode
, op0
);
13118 op
= gen_rtx_REG (V2DImode
, GET_SSE_REGNO (0));
13119 emit_move_insn (op
, op1
);
13121 for (i
= 0; i
< 3; i
++)
13122 xmm_regs
[i
] = gen_rtx_REG (V2DImode
, GET_SSE_REGNO (i
));
13125 target
= gen_reg_rtx (SImode
);
13127 emit_insn (gen_encodekey128u32 (target
, op0
));
13129 for (i
= 0; i
< 3; i
++)
13131 op
= gen_rtx_MEM (V2DImode
,
13132 plus_constant (Pmode
, op2
, (i
* 16)));
13133 emit_move_insn (op
, xmm_regs
[i
]);
13138 case IX86_BUILTIN_ENCODEKEY256U32
:
13140 rtx op
, xmm_regs
[7];
13142 arg0
= CALL_EXPR_ARG (exp
, 0); // unsigned int htype
13143 arg1
= CALL_EXPR_ARG (exp
, 1); // __m128i keylow
13144 arg2
= CALL_EXPR_ARG (exp
, 2); // __m128i keyhi
13145 arg3
= CALL_EXPR_ARG (exp
, 3); // void *h
13147 op0
= expand_normal (arg0
);
13148 op1
= expand_normal (arg1
);
13149 op2
= expand_normal (arg2
);
13150 op3
= expand_normal (arg3
);
13153 op0
= copy_to_mode_reg (SImode
, op0
);
13155 /* Force to use xmm0, xmm1 for keylow, keyhi*/
13156 op
= gen_rtx_REG (V2DImode
, GET_SSE_REGNO (0));
13157 emit_move_insn (op
, op1
);
13158 op
= gen_rtx_REG (V2DImode
, GET_SSE_REGNO (1));
13159 emit_move_insn (op
, op2
);
13161 for (i
= 0; i
< 4; i
++)
13162 xmm_regs
[i
] = gen_rtx_REG (V2DImode
, GET_SSE_REGNO (i
));
13165 target
= gen_reg_rtx (SImode
);
13167 emit_insn (gen_encodekey256u32 (target
, op0
));
13169 for (i
= 0; i
< 4; i
++)
13171 op
= gen_rtx_MEM (V2DImode
,
13172 plus_constant (Pmode
, op3
, (i
* 16)));
13173 emit_move_insn (op
, xmm_regs
[i
]);
13179 case IX86_BUILTIN_PREFETCH
:
13181 arg0
= CALL_EXPR_ARG (exp
, 0); // const void *
13182 arg1
= CALL_EXPR_ARG (exp
, 1); // const int
13183 arg2
= CALL_EXPR_ARG (exp
, 2); // const int
13184 arg3
= CALL_EXPR_ARG (exp
, 3); // const int
13186 op0
= expand_normal (arg0
);
13187 op1
= expand_normal (arg1
);
13188 op2
= expand_normal (arg2
);
13189 op3
= expand_normal (arg3
);
13191 if (!CONST_INT_P (op1
) || !CONST_INT_P (op2
) || !CONST_INT_P (op3
))
13193 error ("second, third and fourth argument must be a const");
13197 if (INTVAL (op3
) == 1)
13199 if (INTVAL (op2
) < 2 || INTVAL (op2
) > 3)
13201 error ("invalid third argument");
13205 if (TARGET_64BIT
&& TARGET_PREFETCHI
13206 && local_func_symbolic_operand (op0
, GET_MODE (op0
)))
13207 emit_insn (gen_prefetchi (op0
, op2
));
13210 warning (0, "instruction prefetch applies when in 64-bit mode"
13211 " with RIP-relative addressing and"
13212 " option %<-mprefetchi%>;"
13213 " they stay NOPs otherwise");
13214 emit_insn (gen_nop ());
13219 if (!address_operand (op0
, VOIDmode
))
13221 op0
= convert_memory_address (Pmode
, op0
);
13222 op0
= copy_addr_to_reg (op0
);
13225 if (INTVAL (op2
) < 0 || INTVAL (op2
) > 3)
13227 warning (0, "invalid third argument to %<__builtin_ia32_prefetch%>; using zero");
13231 if (TARGET_3DNOW
|| TARGET_PREFETCH_SSE
13232 || TARGET_PRFCHW
|| TARGET_PREFETCHWT1
)
13233 emit_insn (gen_prefetch (op0
, op1
, op2
));
13234 else if (!MEM_P (op0
) && side_effects_p (op0
))
13235 /* Don't do anything with direct references to volatile memory,
13236 but generate code to handle other side effects. */
    case IX86_BUILTIN_PREFETCHI:
      {
        arg0 = CALL_EXPR_ARG (exp, 0); // const void *
        arg1 = CALL_EXPR_ARG (exp, 1); // const int

        op0 = expand_normal (arg0);
        op1 = expand_normal (arg1);

        if (!CONST_INT_P (op1))
          {
            error ("second argument must be a const");
            return const0_rtx;
          }

        /* GOT/PLT_PIC should not be available for instruction prefetch.
           It must be real instruction address.  */
        if (TARGET_64BIT
            && local_func_symbolic_operand (op0, GET_MODE (op0)))
          emit_insn (gen_prefetchi (op0, op1));
        else
          {
            /* Ignore the hint.  */
            warning (0, "instruction prefetch applies when in 64-bit mode"
                        " with RIP-relative addressing and"
                        " option %<-mprefetchi%>;"
                        " they stay NOPs otherwise");
            emit_insn (gen_nop ());
          }

        return 0;
      }
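      /* Illustrative note, not part of the original source: the instruction
         prefetch builtin is meant to be used on code addresses, e.g.
         something like

             __builtin_ia32_prefetchi (some_function, hint);

         and, as the warning above says, it only becomes a real prefetch
         instruction for 64-bit RIP-relative addresses with -mprefetchi;
         otherwise a NOP is emitted.  */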
13275 case IX86_BUILTIN_VEC_INIT_V2SI
:
13276 case IX86_BUILTIN_VEC_INIT_V4HI
:
13277 case IX86_BUILTIN_VEC_INIT_V8QI
:
13278 return ix86_expand_vec_init_builtin (TREE_TYPE (exp
), exp
, target
);
13280 case IX86_BUILTIN_VEC_EXT_V2DF
:
13281 case IX86_BUILTIN_VEC_EXT_V2DI
:
13282 case IX86_BUILTIN_VEC_EXT_V4SF
:
13283 case IX86_BUILTIN_VEC_EXT_V4SI
:
13284 case IX86_BUILTIN_VEC_EXT_V8HI
:
13285 case IX86_BUILTIN_VEC_EXT_V2SI
:
13286 case IX86_BUILTIN_VEC_EXT_V4HI
:
13287 case IX86_BUILTIN_VEC_EXT_V16QI
:
13288 return ix86_expand_vec_ext_builtin (exp
, target
);
13290 case IX86_BUILTIN_VEC_SET_V2DI
:
13291 case IX86_BUILTIN_VEC_SET_V4SF
:
13292 case IX86_BUILTIN_VEC_SET_V4SI
:
13293 case IX86_BUILTIN_VEC_SET_V8HI
:
13294 case IX86_BUILTIN_VEC_SET_V4HI
:
13295 case IX86_BUILTIN_VEC_SET_V16QI
:
13296 return ix86_expand_vec_set_builtin (exp
);
13298 case IX86_BUILTIN_NANQ
:
13299 case IX86_BUILTIN_NANSQ
:
13300 return expand_call (exp
, target
, ignore
);
13302 case IX86_BUILTIN_RDPID
:
13304 op0
= gen_reg_rtx (word_mode
);
13308 insn
= gen_rdpid_rex64 (op0
);
13309 op0
= convert_to_mode (SImode
, op0
, 1);
13312 insn
= gen_rdpid (op0
);
13317 || !register_operand (target
, SImode
))
13318 target
= gen_reg_rtx (SImode
);
13320 emit_move_insn (target
, op0
);
13323 case IX86_BUILTIN_2INTERSECTD512
:
13324 case IX86_BUILTIN_2INTERSECTQ512
:
13325 case IX86_BUILTIN_2INTERSECTD256
:
13326 case IX86_BUILTIN_2INTERSECTQ256
:
13327 case IX86_BUILTIN_2INTERSECTD128
:
13328 case IX86_BUILTIN_2INTERSECTQ128
:
13329 arg0
= CALL_EXPR_ARG (exp
, 0);
13330 arg1
= CALL_EXPR_ARG (exp
, 1);
13331 arg2
= CALL_EXPR_ARG (exp
, 2);
13332 arg3
= CALL_EXPR_ARG (exp
, 3);
13333 op0
= expand_normal (arg0
);
13334 op1
= expand_normal (arg1
);
13335 op2
= expand_normal (arg2
);
13336 op3
= expand_normal (arg3
);
13338 if (!address_operand (op0
, VOIDmode
))
13340 op0
= convert_memory_address (Pmode
, op0
);
13341 op0
= copy_addr_to_reg (op0
);
13343 if (!address_operand (op1
, VOIDmode
))
13345 op1
= convert_memory_address (Pmode
, op1
);
13346 op1
= copy_addr_to_reg (op1
);
13351 case IX86_BUILTIN_2INTERSECTD512
:
13353 icode
= CODE_FOR_avx512vp2intersect_2intersectv16si
;
13355 case IX86_BUILTIN_2INTERSECTQ512
:
13357 icode
= CODE_FOR_avx512vp2intersect_2intersectv8di
;
13359 case IX86_BUILTIN_2INTERSECTD256
:
13361 icode
= CODE_FOR_avx512vp2intersect_2intersectv8si
;
13363 case IX86_BUILTIN_2INTERSECTQ256
:
13365 icode
= CODE_FOR_avx512vp2intersect_2intersectv4di
;
13367 case IX86_BUILTIN_2INTERSECTD128
:
13369 icode
= CODE_FOR_avx512vp2intersect_2intersectv4si
;
13371 case IX86_BUILTIN_2INTERSECTQ128
:
13373 icode
= CODE_FOR_avx512vp2intersect_2intersectv2di
;
13376 gcc_unreachable ();
13379 mode2
= insn_data
[icode
].operand
[1].mode
;
13380 mode3
= insn_data
[icode
].operand
[2].mode
;
13381 if (!insn_data
[icode
].operand
[1].predicate (op2
, mode2
))
13382 op2
= copy_to_mode_reg (mode2
, op2
);
13383 if (!insn_data
[icode
].operand
[2].predicate (op3
, mode3
))
13384 op3
= copy_to_mode_reg (mode3
, op3
);
13386 op4
= gen_reg_rtx (mode4
);
13387 emit_insn (GEN_FCN (icode
) (op4
, op2
, op3
));
13388 mode0
= mode4
== P2HImode
? HImode
: QImode
;
13389 emit_move_insn (gen_rtx_MEM (mode0
, op0
),
13390 gen_lowpart (mode0
, op4
));
13391 emit_move_insn (gen_rtx_MEM (mode0
, op1
),
13392 gen_highpart (mode0
, op4
));
13396 case IX86_BUILTIN_RDPMC
:
13397 case IX86_BUILTIN_RDTSC
:
13398 case IX86_BUILTIN_RDTSCP
:
13399 case IX86_BUILTIN_XGETBV
:
13401 op0
= gen_reg_rtx (DImode
);
13402 op1
= gen_reg_rtx (DImode
);
13404 if (fcode
== IX86_BUILTIN_RDPMC
)
13406 arg0
= CALL_EXPR_ARG (exp
, 0);
13407 op2
= expand_normal (arg0
);
13408 if (!register_operand (op2
, SImode
))
13409 op2
= copy_to_mode_reg (SImode
, op2
);
13411 insn
= (TARGET_64BIT
13412 ? gen_rdpmc_rex64 (op0
, op1
, op2
)
13413 : gen_rdpmc (op0
, op2
));
13416 else if (fcode
== IX86_BUILTIN_XGETBV
)
13418 arg0
= CALL_EXPR_ARG (exp
, 0);
13419 op2
= expand_normal (arg0
);
13420 if (!register_operand (op2
, SImode
))
13421 op2
= copy_to_mode_reg (SImode
, op2
);
13423 insn
= (TARGET_64BIT
13424 ? gen_xgetbv_rex64 (op0
, op1
, op2
)
13425 : gen_xgetbv (op0
, op2
));
13428 else if (fcode
== IX86_BUILTIN_RDTSC
)
13430 insn
= (TARGET_64BIT
13431 ? gen_rdtsc_rex64 (op0
, op1
)
13432 : gen_rdtsc (op0
));
13437 op2
= gen_reg_rtx (SImode
);
13439 insn
= (TARGET_64BIT
13440 ? gen_rdtscp_rex64 (op0
, op1
, op2
)
13441 : gen_rdtscp (op0
, op2
));
13444 arg0
= CALL_EXPR_ARG (exp
, 0);
13445 op4
= expand_normal (arg0
);
13446 if (!address_operand (op4
, VOIDmode
))
13448 op4
= convert_memory_address (Pmode
, op4
);
13449 op4
= copy_addr_to_reg (op4
);
13451 emit_move_insn (gen_rtx_MEM (SImode
, op4
), op2
);
13455 || !register_operand (target
, DImode
))
13456 target
= gen_reg_rtx (DImode
);
13460 op1
= expand_simple_binop (DImode
, ASHIFT
, op1
, GEN_INT (32),
13461 op1
, 1, OPTAB_DIRECT
);
13462 op0
= expand_simple_binop (DImode
, IOR
, op0
, op1
,
13463 op0
, 1, OPTAB_DIRECT
);
13466 emit_move_insn (target
, op0
);
13469 case IX86_BUILTIN_ENQCMD
:
13470 case IX86_BUILTIN_ENQCMDS
:
13471 case IX86_BUILTIN_MOVDIR64B
:
13473 arg0
= CALL_EXPR_ARG (exp
, 0);
13474 arg1
= CALL_EXPR_ARG (exp
, 1);
13475 op0
= expand_normal (arg0
);
13476 op1
= expand_normal (arg1
);
13478 op0
= ix86_zero_extend_to_Pmode (op0
);
13479 if (!address_operand (op1
, VOIDmode
))
13481 op1
= convert_memory_address (Pmode
, op1
);
13482 op1
= copy_addr_to_reg (op1
);
13484 op1
= gen_rtx_MEM (XImode
, op1
);
13486 if (fcode
== IX86_BUILTIN_MOVDIR64B
)
13488 emit_insn (gen_movdir64b (Pmode
, op0
, op1
));
13494 || !register_operand (target
, SImode
))
13495 target
= gen_reg_rtx (SImode
);
13497 emit_move_insn (target
, const0_rtx
);
13498 target
= gen_rtx_SUBREG (QImode
, target
, 0);
13500 int unspecv
= (fcode
== IX86_BUILTIN_ENQCMD
13502 : UNSPECV_ENQCMDS
);
13503 icode
= code_for_enqcmd (unspecv
, Pmode
);
13504 emit_insn (GEN_FCN (icode
) (op0
, op1
));
13507 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode
, target
),
13508 gen_rtx_fmt_ee (EQ
, QImode
,
13509 gen_rtx_REG (CCZmode
, FLAGS_REG
),
13511 return SUBREG_REG (target
);
13514 case IX86_BUILTIN_FXSAVE
:
13515 case IX86_BUILTIN_FXRSTOR
:
13516 case IX86_BUILTIN_FXSAVE64
:
13517 case IX86_BUILTIN_FXRSTOR64
:
13518 case IX86_BUILTIN_FNSTENV
:
13519 case IX86_BUILTIN_FLDENV
:
13523 case IX86_BUILTIN_FXSAVE
:
13524 icode
= CODE_FOR_fxsave
;
13526 case IX86_BUILTIN_FXRSTOR
:
13527 icode
= CODE_FOR_fxrstor
;
13529 case IX86_BUILTIN_FXSAVE64
:
13530 icode
= CODE_FOR_fxsave64
;
13532 case IX86_BUILTIN_FXRSTOR64
:
13533 icode
= CODE_FOR_fxrstor64
;
13535 case IX86_BUILTIN_FNSTENV
:
13536 icode
= CODE_FOR_fnstenv
;
13538 case IX86_BUILTIN_FLDENV
:
13539 icode
= CODE_FOR_fldenv
;
13542 gcc_unreachable ();
13545 arg0
= CALL_EXPR_ARG (exp
, 0);
13546 op0
= expand_normal (arg0
);
13548 if (!address_operand (op0
, VOIDmode
))
13550 op0
= convert_memory_address (Pmode
, op0
);
13551 op0
= copy_addr_to_reg (op0
);
13553 op0
= gen_rtx_MEM (mode0
, op0
);
13555 pat
= GEN_FCN (icode
) (op0
);
13560 case IX86_BUILTIN_XSETBV
:
13561 arg0
= CALL_EXPR_ARG (exp
, 0);
13562 arg1
= CALL_EXPR_ARG (exp
, 1);
13563 op0
= expand_normal (arg0
);
13564 op1
= expand_normal (arg1
);
13567 op0
= copy_to_mode_reg (SImode
, op0
);
13569 op1
= force_reg (DImode
, op1
);
13573 op2
= expand_simple_binop (DImode
, LSHIFTRT
, op1
, GEN_INT (32),
13574 NULL
, 1, OPTAB_DIRECT
);
13576 icode
= CODE_FOR_xsetbv_rex64
;
13578 op2
= gen_lowpart (SImode
, op2
);
13579 op1
= gen_lowpart (SImode
, op1
);
13580 pat
= GEN_FCN (icode
) (op0
, op1
, op2
);
13584 icode
= CODE_FOR_xsetbv
;
13586 pat
= GEN_FCN (icode
) (op0
, op1
);
13592 case IX86_BUILTIN_XSAVE
:
13593 case IX86_BUILTIN_XRSTOR
:
13594 case IX86_BUILTIN_XSAVE64
:
13595 case IX86_BUILTIN_XRSTOR64
:
13596 case IX86_BUILTIN_XSAVEOPT
:
13597 case IX86_BUILTIN_XSAVEOPT64
:
13598 case IX86_BUILTIN_XSAVES
:
13599 case IX86_BUILTIN_XRSTORS
:
13600 case IX86_BUILTIN_XSAVES64
:
13601 case IX86_BUILTIN_XRSTORS64
:
13602 case IX86_BUILTIN_XSAVEC
:
13603 case IX86_BUILTIN_XSAVEC64
:
13604 arg0
= CALL_EXPR_ARG (exp
, 0);
13605 arg1
= CALL_EXPR_ARG (exp
, 1);
13606 op0
= expand_normal (arg0
);
13607 op1
= expand_normal (arg1
);
13609 if (!address_operand (op0
, VOIDmode
))
13611 op0
= convert_memory_address (Pmode
, op0
);
13612 op0
= copy_addr_to_reg (op0
);
13614 op0
= gen_rtx_MEM (BLKmode
, op0
);
13616 op1
= force_reg (DImode
, op1
);
13620 op2
= expand_simple_binop (DImode
, LSHIFTRT
, op1
, GEN_INT (32),
13621 NULL
, 1, OPTAB_DIRECT
);
13624 case IX86_BUILTIN_XSAVE
:
13625 icode
= CODE_FOR_xsave_rex64
;
13627 case IX86_BUILTIN_XRSTOR
:
13628 icode
= CODE_FOR_xrstor_rex64
;
13630 case IX86_BUILTIN_XSAVE64
:
13631 icode
= CODE_FOR_xsave64
;
13633 case IX86_BUILTIN_XRSTOR64
:
13634 icode
= CODE_FOR_xrstor64
;
13636 case IX86_BUILTIN_XSAVEOPT
:
13637 icode
= CODE_FOR_xsaveopt_rex64
;
13639 case IX86_BUILTIN_XSAVEOPT64
:
13640 icode
= CODE_FOR_xsaveopt64
;
13642 case IX86_BUILTIN_XSAVES
:
13643 icode
= CODE_FOR_xsaves_rex64
;
13645 case IX86_BUILTIN_XRSTORS
:
13646 icode
= CODE_FOR_xrstors_rex64
;
13648 case IX86_BUILTIN_XSAVES64
:
13649 icode
= CODE_FOR_xsaves64
;
13651 case IX86_BUILTIN_XRSTORS64
:
13652 icode
= CODE_FOR_xrstors64
;
13654 case IX86_BUILTIN_XSAVEC
:
13655 icode
= CODE_FOR_xsavec_rex64
;
13657 case IX86_BUILTIN_XSAVEC64
:
13658 icode
= CODE_FOR_xsavec64
;
13661 gcc_unreachable ();
13664 op2
= gen_lowpart (SImode
, op2
);
13665 op1
= gen_lowpart (SImode
, op1
);
13666 pat
= GEN_FCN (icode
) (op0
, op1
, op2
);
13672 case IX86_BUILTIN_XSAVE
:
13673 icode
= CODE_FOR_xsave
;
13675 case IX86_BUILTIN_XRSTOR
:
13676 icode
= CODE_FOR_xrstor
;
13678 case IX86_BUILTIN_XSAVEOPT
:
13679 icode
= CODE_FOR_xsaveopt
;
13681 case IX86_BUILTIN_XSAVES
:
13682 icode
= CODE_FOR_xsaves
;
13684 case IX86_BUILTIN_XRSTORS
:
13685 icode
= CODE_FOR_xrstors
;
13687 case IX86_BUILTIN_XSAVEC
:
13688 icode
= CODE_FOR_xsavec
;
13691 gcc_unreachable ();
13693 pat
= GEN_FCN (icode
) (op0
, op1
);
13700 case IX86_BUILTIN_LLWPCB
:
13701 arg0
= CALL_EXPR_ARG (exp
, 0);
13702 op0
= expand_normal (arg0
);
13704 if (!register_operand (op0
, Pmode
))
13705 op0
= ix86_zero_extend_to_Pmode (op0
);
13706 emit_insn (gen_lwp_llwpcb (Pmode
, op0
));
13709 case IX86_BUILTIN_SLWPCB
:
13711 || !register_operand (target
, Pmode
))
13712 target
= gen_reg_rtx (Pmode
);
13713 emit_insn (gen_lwp_slwpcb (Pmode
, target
));
13716 case IX86_BUILTIN_LWPVAL32
:
13717 case IX86_BUILTIN_LWPVAL64
:
13718 case IX86_BUILTIN_LWPINS32
:
13719 case IX86_BUILTIN_LWPINS64
:
13720 mode
= ((fcode
== IX86_BUILTIN_LWPVAL32
13721 || fcode
== IX86_BUILTIN_LWPINS32
)
13722 ? SImode
: DImode
);
13724 if (fcode
== IX86_BUILTIN_LWPVAL32
13725 || fcode
== IX86_BUILTIN_LWPVAL64
)
13726 icode
= code_for_lwp_lwpval (mode
);
13728 icode
= code_for_lwp_lwpins (mode
);
13730 arg0
= CALL_EXPR_ARG (exp
, 0);
13731 arg1
= CALL_EXPR_ARG (exp
, 1);
13732 arg2
= CALL_EXPR_ARG (exp
, 2);
13733 op0
= expand_normal (arg0
);
13734 op1
= expand_normal (arg1
);
13735 op2
= expand_normal (arg2
);
13736 mode0
= insn_data
[icode
].operand
[0].mode
;
13738 if (!insn_data
[icode
].operand
[0].predicate (op0
, mode0
))
13739 op0
= copy_to_mode_reg (mode0
, op0
);
13740 if (!insn_data
[icode
].operand
[1].predicate (op1
, SImode
))
13741 op1
= copy_to_mode_reg (SImode
, op1
);
13743 if (!CONST_INT_P (op2
))
13745 error ("the last argument must be a 32-bit immediate");
13749 emit_insn (GEN_FCN (icode
) (op0
, op1
, op2
));
13751 if (fcode
== IX86_BUILTIN_LWPINS32
13752 || fcode
== IX86_BUILTIN_LWPINS64
)
13755 || !nonimmediate_operand (target
, QImode
))
13756 target
= gen_reg_rtx (QImode
);
13758 pat
= gen_rtx_EQ (QImode
, gen_rtx_REG (CCCmode
, FLAGS_REG
),
13760 emit_insn (gen_rtx_SET (target
, pat
));
13767 case IX86_BUILTIN_BEXTRI32
:
13768 case IX86_BUILTIN_BEXTRI64
:
13769 mode
= (fcode
== IX86_BUILTIN_BEXTRI32
? SImode
: DImode
);
13771 arg0
= CALL_EXPR_ARG (exp
, 0);
13772 arg1
= CALL_EXPR_ARG (exp
, 1);
13773 op0
= expand_normal (arg0
);
13774 op1
= expand_normal (arg1
);
13776 if (!CONST_INT_P (op1
))
13778 error ("last argument must be an immediate");
13783 unsigned char lsb_index
= UINTVAL (op1
);
13784 unsigned char length
= UINTVAL (op1
) >> 8;
13786 unsigned char bitsize
= GET_MODE_BITSIZE (mode
);
13788 icode
= code_for_tbm_bextri (mode
);
13790 mode1
= insn_data
[icode
].operand
[1].mode
;
13791 if (!insn_data
[icode
].operand
[1].predicate (op0
, mode1
))
13792 op0
= copy_to_mode_reg (mode1
, op0
);
13794 mode0
= insn_data
[icode
].operand
[0].mode
;
13796 || !register_operand (target
, mode0
))
13797 target
= gen_reg_rtx (mode0
);
13799 if (length
== 0 || lsb_index
>= bitsize
)
13801 emit_move_insn (target
, const0_rtx
);
13805 if (length
+ lsb_index
> bitsize
)
13806 length
= bitsize
- lsb_index
;
13808 op1
= GEN_INT (length
);
13809 op2
= GEN_INT (lsb_index
);
13811 emit_insn (GEN_FCN (icode
) (target
, op0
, op1
, op2
));
    case IX86_BUILTIN_RDRAND16_STEP:
      mode = HImode;
      goto rdrand_step;

    case IX86_BUILTIN_RDRAND32_STEP:
      mode = SImode;
      goto rdrand_step;

    case IX86_BUILTIN_RDRAND64_STEP:
      mode = DImode;

    rdrand_step:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op1 = expand_normal (arg0);
      if (!address_operand (op1, VOIDmode))
        {
          op1 = convert_memory_address (Pmode, op1);
          op1 = copy_addr_to_reg (op1);
        }

      op0 = gen_reg_rtx (mode);
      emit_insn (gen_rdrand (mode, op0));

      emit_move_insn (gen_rtx_MEM (mode, op1), op0);

      op1 = force_reg (SImode, const1_rtx);

      /* Emit SImode conditional move.  */
      if (mode == HImode)
        {
          if (TARGET_ZERO_EXTEND_WITH_AND
              && optimize_function_for_speed_p (cfun))
            {
              op2 = force_reg (SImode, const0_rtx);

              emit_insn (gen_movstricthi
                         (gen_lowpart (HImode, op2), op0));
            }
          else
            {
              op2 = gen_reg_rtx (SImode);

              emit_insn (gen_zero_extendhisi2 (op2, op0));
            }
        }
      else if (mode == SImode)
        op2 = op0;
      else
        op2 = gen_rtx_SUBREG (SImode, op0, 0);

      if (target == 0
          || !register_operand (target, SImode))
        target = gen_reg_rtx (SImode);

      pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
                         const0_rtx);
      emit_insn (gen_rtx_SET (target,
                              gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
      return target;
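      /* Illustrative note, not part of the original source: the *_STEP
         builtins are used as

             unsigned int val;
             if (__builtin_ia32_rdrand32_step (&val))
               ... use val ...

         The expansion stores the hardware value through the pointer
         argument and turns the carry flag into the 0/1 return value with
         the conditional move emitted above.  */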
13875 case IX86_BUILTIN_RDSEED16_STEP
:
13879 case IX86_BUILTIN_RDSEED32_STEP
:
13883 case IX86_BUILTIN_RDSEED64_STEP
:
13887 arg0
= CALL_EXPR_ARG (exp
, 0);
13888 op1
= expand_normal (arg0
);
13889 if (!address_operand (op1
, VOIDmode
))
13891 op1
= convert_memory_address (Pmode
, op1
);
13892 op1
= copy_addr_to_reg (op1
);
13895 op0
= gen_reg_rtx (mode
);
13896 emit_insn (gen_rdseed (mode
, op0
));
13898 emit_move_insn (gen_rtx_MEM (mode
, op1
), op0
);
13900 op2
= gen_reg_rtx (QImode
);
13902 pat
= gen_rtx_LTU (QImode
, gen_rtx_REG (CCCmode
, FLAGS_REG
),
13904 emit_insn (gen_rtx_SET (op2
, pat
));
13907 || !register_operand (target
, SImode
))
13908 target
= gen_reg_rtx (SImode
);
13910 emit_insn (gen_zero_extendqisi2 (target
, op2
));
13913 case IX86_BUILTIN_SBB32
:
13914 icode
= CODE_FOR_subborrowsi
;
13915 icode2
= CODE_FOR_subborrowsi_0
;
13921 case IX86_BUILTIN_SBB64
:
13922 icode
= CODE_FOR_subborrowdi
;
13923 icode2
= CODE_FOR_subborrowdi_0
;
13929 case IX86_BUILTIN_ADDCARRYX32
:
13930 icode
= CODE_FOR_addcarrysi
;
13931 icode2
= CODE_FOR_addcarrysi_0
;
13937 case IX86_BUILTIN_ADDCARRYX64
:
13938 icode
= CODE_FOR_addcarrydi
;
13939 icode2
= CODE_FOR_addcarrydi_0
;
13945 arg0
= CALL_EXPR_ARG (exp
, 0); /* unsigned char c_in. */
13946 arg1
= CALL_EXPR_ARG (exp
, 1); /* unsigned int src1. */
13947 arg2
= CALL_EXPR_ARG (exp
, 2); /* unsigned int src2. */
13948 arg3
= CALL_EXPR_ARG (exp
, 3); /* unsigned int *sum_out. */
13950 op1
= expand_normal (arg0
);
13951 if (!integer_zerop (arg0
))
13952 op1
= copy_to_mode_reg (QImode
, convert_to_mode (QImode
, op1
, 1));
13954 op2
= expand_normal (arg1
);
13955 if (!register_operand (op2
, mode0
))
13956 op2
= copy_to_mode_reg (mode0
, op2
);
13958 op3
= expand_normal (arg2
);
13959 if (!register_operand (op3
, mode0
))
13960 op3
= copy_to_mode_reg (mode0
, op3
);
13962 op4
= expand_normal (arg3
);
13963 if (!address_operand (op4
, VOIDmode
))
13965 op4
= convert_memory_address (Pmode
, op4
);
13966 op4
= copy_addr_to_reg (op4
);
13969 op0
= gen_reg_rtx (mode0
);
13970 if (integer_zerop (arg0
))
13972 /* If arg0 is 0, optimize right away into add or sub
13973 instruction that sets CCCmode flags. */
13974 op1
= gen_rtx_REG (mode2
, FLAGS_REG
);
13975 emit_insn (GEN_FCN (icode2
) (op0
, op2
, op3
));
13979 /* Generate CF from input operand. */
13980 emit_insn (gen_addqi3_cconly_overflow (op1
, constm1_rtx
));
13982 /* Generate instruction that consumes CF. */
13983 op1
= gen_rtx_REG (CCCmode
, FLAGS_REG
);
13984 pat
= gen_rtx_LTU (mode1
, op1
, const0_rtx
);
13985 pat2
= gen_rtx_LTU (mode0
, op1
, const0_rtx
);
13986 emit_insn (GEN_FCN (icode
) (op0
, op2
, op3
, op1
, pat
, pat2
));
13989 /* Return current CF value. */
13991 target
= gen_reg_rtx (QImode
);
13993 pat
= gen_rtx_LTU (QImode
, op1
, const0_rtx
);
13994 emit_insn (gen_rtx_SET (target
, pat
));
13996 /* Store the result. */
13997 emit_move_insn (gen_rtx_MEM (mode0
, op4
), op0
);
14001 case IX86_BUILTIN_READ_FLAGS
:
14005 emit_insn (gen_push (gen_rtx_REG (word_mode
, FLAGS_REG
)));
14008 || target
== NULL_RTX
14009 || !nonimmediate_operand (target
, word_mode
)
14010 || GET_MODE (target
) != word_mode
)
14011 target
= gen_reg_rtx (word_mode
);
14013 emit_insn (gen_pop (target
));
14016 case IX86_BUILTIN_WRITE_FLAGS
:
14018 arg0
= CALL_EXPR_ARG (exp
, 0);
14019 op0
= expand_normal (arg0
);
14020 if (!general_no_elim_operand (op0
, word_mode
))
14021 op0
= copy_to_mode_reg (word_mode
, op0
);
14023 emit_insn (gen_push (op0
));
14024 emit_insn (gen_pop (gen_rtx_REG (word_mode
, FLAGS_REG
)));
14027 case IX86_BUILTIN_KTESTC8
:
14028 icode
= CODE_FOR_ktestqi
;
14032 case IX86_BUILTIN_KTESTZ8
:
14033 icode
= CODE_FOR_ktestqi
;
14037 case IX86_BUILTIN_KTESTC16
:
14038 icode
= CODE_FOR_ktesthi
;
14042 case IX86_BUILTIN_KTESTZ16
:
14043 icode
= CODE_FOR_ktesthi
;
14047 case IX86_BUILTIN_KTESTC32
:
14048 icode
= CODE_FOR_ktestsi
;
14052 case IX86_BUILTIN_KTESTZ32
:
14053 icode
= CODE_FOR_ktestsi
;
14057 case IX86_BUILTIN_KTESTC64
:
14058 icode
= CODE_FOR_ktestdi
;
14062 case IX86_BUILTIN_KTESTZ64
:
14063 icode
= CODE_FOR_ktestdi
;
14067 case IX86_BUILTIN_KORTESTC8
:
14068 icode
= CODE_FOR_kortestqi
;
14072 case IX86_BUILTIN_KORTESTZ8
:
14073 icode
= CODE_FOR_kortestqi
;
14077 case IX86_BUILTIN_KORTESTC16
:
14078 icode
= CODE_FOR_kortesthi
;
14082 case IX86_BUILTIN_KORTESTZ16
:
14083 icode
= CODE_FOR_kortesthi
;
14087 case IX86_BUILTIN_KORTESTC32
:
14088 icode
= CODE_FOR_kortestsi
;
14092 case IX86_BUILTIN_KORTESTZ32
:
14093 icode
= CODE_FOR_kortestsi
;
14097 case IX86_BUILTIN_KORTESTC64
:
14098 icode
= CODE_FOR_kortestdi
;
14102 case IX86_BUILTIN_KORTESTZ64
:
14103 icode
= CODE_FOR_kortestdi
;
14107 arg0
= CALL_EXPR_ARG (exp
, 0); /* Mask reg src1. */
14108 arg1
= CALL_EXPR_ARG (exp
, 1); /* Mask reg src2. */
14109 op0
= expand_normal (arg0
);
14110 op1
= expand_normal (arg1
);
14112 mode0
= insn_data
[icode
].operand
[0].mode
;
14113 mode1
= insn_data
[icode
].operand
[1].mode
;
14115 if (GET_MODE (op0
) != VOIDmode
)
14116 op0
= force_reg (GET_MODE (op0
), op0
);
14118 op0
= gen_lowpart (mode0
, op0
);
14120 if (!insn_data
[icode
].operand
[0].predicate (op0
, mode0
))
14121 op0
= copy_to_mode_reg (mode0
, op0
);
14123 if (GET_MODE (op1
) != VOIDmode
)
14124 op1
= force_reg (GET_MODE (op1
), op1
);
14126 op1
= gen_lowpart (mode1
, op1
);
14128 if (!insn_data
[icode
].operand
[1].predicate (op1
, mode1
))
14129 op1
= copy_to_mode_reg (mode1
, op1
);
14131 target
= gen_reg_rtx (QImode
);
14133 /* Emit kortest. */
14134 emit_insn (GEN_FCN (icode
) (op0
, op1
));
14135 /* And use setcc to return result from flags. */
14136 ix86_expand_setcc (target
, EQ
,
14137 gen_rtx_REG (mode3
, FLAGS_REG
), const0_rtx
);
14140 case IX86_BUILTIN_GATHERSIV2DF
:
14141 icode
= CODE_FOR_avx2_gathersiv2df
;
14143 case IX86_BUILTIN_GATHERSIV4DF
:
14144 icode
= CODE_FOR_avx2_gathersiv4df
;
14146 case IX86_BUILTIN_GATHERDIV2DF
:
14147 icode
= CODE_FOR_avx2_gatherdiv2df
;
14149 case IX86_BUILTIN_GATHERDIV4DF
:
14150 icode
= CODE_FOR_avx2_gatherdiv4df
;
14152 case IX86_BUILTIN_GATHERSIV4SF
:
14153 icode
= CODE_FOR_avx2_gathersiv4sf
;
14155 case IX86_BUILTIN_GATHERSIV8SF
:
14156 icode
= CODE_FOR_avx2_gathersiv8sf
;
14158 case IX86_BUILTIN_GATHERDIV4SF
:
14159 icode
= CODE_FOR_avx2_gatherdiv4sf
;
14161 case IX86_BUILTIN_GATHERDIV8SF
:
14162 icode
= CODE_FOR_avx2_gatherdiv8sf
;
14164 case IX86_BUILTIN_GATHERSIV2DI
:
14165 icode
= CODE_FOR_avx2_gathersiv2di
;
14167 case IX86_BUILTIN_GATHERSIV4DI
:
14168 icode
= CODE_FOR_avx2_gathersiv4di
;
14170 case IX86_BUILTIN_GATHERDIV2DI
:
14171 icode
= CODE_FOR_avx2_gatherdiv2di
;
14173 case IX86_BUILTIN_GATHERDIV4DI
:
14174 icode
= CODE_FOR_avx2_gatherdiv4di
;
14176 case IX86_BUILTIN_GATHERSIV4SI
:
14177 icode
= CODE_FOR_avx2_gathersiv4si
;
14179 case IX86_BUILTIN_GATHERSIV8SI
:
14180 icode
= CODE_FOR_avx2_gathersiv8si
;
14182 case IX86_BUILTIN_GATHERDIV4SI
:
14183 icode
= CODE_FOR_avx2_gatherdiv4si
;
14185 case IX86_BUILTIN_GATHERDIV8SI
:
14186 icode
= CODE_FOR_avx2_gatherdiv8si
;
14188 case IX86_BUILTIN_GATHERALTSIV4DF
:
14189 icode
= CODE_FOR_avx2_gathersiv4df
;
14191 case IX86_BUILTIN_GATHERALTDIV8SF
:
14192 icode
= CODE_FOR_avx2_gatherdiv8sf
;
14194 case IX86_BUILTIN_GATHERALTSIV4DI
:
14195 icode
= CODE_FOR_avx2_gathersiv4di
;
14197 case IX86_BUILTIN_GATHERALTDIV8SI
:
14198 icode
= CODE_FOR_avx2_gatherdiv8si
;
14200 case IX86_BUILTIN_GATHER3SIV16SF
:
14201 icode
= CODE_FOR_avx512f_gathersiv16sf
;
14203 case IX86_BUILTIN_GATHER3SIV8DF
:
14204 icode
= CODE_FOR_avx512f_gathersiv8df
;
14206 case IX86_BUILTIN_GATHER3DIV16SF
:
14207 icode
= CODE_FOR_avx512f_gatherdiv16sf
;
14209 case IX86_BUILTIN_GATHER3DIV8DF
:
14210 icode
= CODE_FOR_avx512f_gatherdiv8df
;
14212 case IX86_BUILTIN_GATHER3SIV16SI
:
14213 icode
= CODE_FOR_avx512f_gathersiv16si
;
14215 case IX86_BUILTIN_GATHER3SIV8DI
:
14216 icode
= CODE_FOR_avx512f_gathersiv8di
;
14218 case IX86_BUILTIN_GATHER3DIV16SI
:
14219 icode
= CODE_FOR_avx512f_gatherdiv16si
;
14221 case IX86_BUILTIN_GATHER3DIV8DI
:
14222 icode
= CODE_FOR_avx512f_gatherdiv8di
;
14224 case IX86_BUILTIN_GATHER3ALTSIV8DF
:
14225 icode
= CODE_FOR_avx512f_gathersiv8df
;
14227 case IX86_BUILTIN_GATHER3ALTDIV16SF
:
14228 icode
= CODE_FOR_avx512f_gatherdiv16sf
;
14230 case IX86_BUILTIN_GATHER3ALTSIV8DI
:
14231 icode
= CODE_FOR_avx512f_gathersiv8di
;
14233 case IX86_BUILTIN_GATHER3ALTDIV16SI
:
14234 icode
= CODE_FOR_avx512f_gatherdiv16si
;
14236 case IX86_BUILTIN_GATHER3SIV2DF
:
14237 icode
= CODE_FOR_avx512vl_gathersiv2df
;
14239 case IX86_BUILTIN_GATHER3SIV4DF
:
14240 icode
= CODE_FOR_avx512vl_gathersiv4df
;
14242 case IX86_BUILTIN_GATHER3DIV2DF
:
14243 icode
= CODE_FOR_avx512vl_gatherdiv2df
;
14245 case IX86_BUILTIN_GATHER3DIV4DF
:
14246 icode
= CODE_FOR_avx512vl_gatherdiv4df
;
14248 case IX86_BUILTIN_GATHER3SIV4SF
:
14249 icode
= CODE_FOR_avx512vl_gathersiv4sf
;
14251 case IX86_BUILTIN_GATHER3SIV8SF
:
14252 icode
= CODE_FOR_avx512vl_gathersiv8sf
;
14254 case IX86_BUILTIN_GATHER3DIV4SF
:
14255 icode
= CODE_FOR_avx512vl_gatherdiv4sf
;
14257 case IX86_BUILTIN_GATHER3DIV8SF
:
14258 icode
= CODE_FOR_avx512vl_gatherdiv8sf
;
14260 case IX86_BUILTIN_GATHER3SIV2DI
:
14261 icode
= CODE_FOR_avx512vl_gathersiv2di
;
14263 case IX86_BUILTIN_GATHER3SIV4DI
:
14264 icode
= CODE_FOR_avx512vl_gathersiv4di
;
14266 case IX86_BUILTIN_GATHER3DIV2DI
:
14267 icode
= CODE_FOR_avx512vl_gatherdiv2di
;
14269 case IX86_BUILTIN_GATHER3DIV4DI
:
14270 icode
= CODE_FOR_avx512vl_gatherdiv4di
;
14272 case IX86_BUILTIN_GATHER3SIV4SI
:
14273 icode
= CODE_FOR_avx512vl_gathersiv4si
;
14275 case IX86_BUILTIN_GATHER3SIV8SI
:
14276 icode
= CODE_FOR_avx512vl_gathersiv8si
;
14278 case IX86_BUILTIN_GATHER3DIV4SI
:
14279 icode
= CODE_FOR_avx512vl_gatherdiv4si
;
14281 case IX86_BUILTIN_GATHER3DIV8SI
:
14282 icode
= CODE_FOR_avx512vl_gatherdiv8si
;
14284 case IX86_BUILTIN_GATHER3ALTSIV4DF
:
14285 icode
= CODE_FOR_avx512vl_gathersiv4df
;
14287 case IX86_BUILTIN_GATHER3ALTDIV8SF
:
14288 icode
= CODE_FOR_avx512vl_gatherdiv8sf
;
14290 case IX86_BUILTIN_GATHER3ALTSIV4DI
:
14291 icode
= CODE_FOR_avx512vl_gathersiv4di
;
14293 case IX86_BUILTIN_GATHER3ALTDIV8SI
:
14294 icode
= CODE_FOR_avx512vl_gatherdiv8si
;
14296 case IX86_BUILTIN_SCATTERSIV16SF
:
14297 icode
= CODE_FOR_avx512f_scattersiv16sf
;
14299 case IX86_BUILTIN_SCATTERSIV8DF
:
14300 icode
= CODE_FOR_avx512f_scattersiv8df
;
14302 case IX86_BUILTIN_SCATTERDIV16SF
:
14303 icode
= CODE_FOR_avx512f_scatterdiv16sf
;
14305 case IX86_BUILTIN_SCATTERDIV8DF
:
14306 icode
= CODE_FOR_avx512f_scatterdiv8df
;
14308 case IX86_BUILTIN_SCATTERSIV16SI
:
14309 icode
= CODE_FOR_avx512f_scattersiv16si
;
14311 case IX86_BUILTIN_SCATTERSIV8DI
:
14312 icode
= CODE_FOR_avx512f_scattersiv8di
;
14314 case IX86_BUILTIN_SCATTERDIV16SI
:
14315 icode
= CODE_FOR_avx512f_scatterdiv16si
;
14317 case IX86_BUILTIN_SCATTERDIV8DI
:
14318 icode
= CODE_FOR_avx512f_scatterdiv8di
;
14320 case IX86_BUILTIN_SCATTERSIV8SF
:
14321 icode
= CODE_FOR_avx512vl_scattersiv8sf
;
14323 case IX86_BUILTIN_SCATTERSIV4SF
:
14324 icode
= CODE_FOR_avx512vl_scattersiv4sf
;
14326 case IX86_BUILTIN_SCATTERSIV4DF
:
14327 icode
= CODE_FOR_avx512vl_scattersiv4df
;
14329 case IX86_BUILTIN_SCATTERSIV2DF
:
14330 icode
= CODE_FOR_avx512vl_scattersiv2df
;
14332 case IX86_BUILTIN_SCATTERDIV8SF
:
14333 icode
= CODE_FOR_avx512vl_scatterdiv8sf
;
14335 case IX86_BUILTIN_SCATTERDIV4SF
:
14336 icode
= CODE_FOR_avx512vl_scatterdiv4sf
;
14338 case IX86_BUILTIN_SCATTERDIV4DF
:
14339 icode
= CODE_FOR_avx512vl_scatterdiv4df
;
14341 case IX86_BUILTIN_SCATTERDIV2DF
:
14342 icode
= CODE_FOR_avx512vl_scatterdiv2df
;
14344 case IX86_BUILTIN_SCATTERSIV8SI
:
14345 icode
= CODE_FOR_avx512vl_scattersiv8si
;
14347 case IX86_BUILTIN_SCATTERSIV4SI
:
14348 icode
= CODE_FOR_avx512vl_scattersiv4si
;
14350 case IX86_BUILTIN_SCATTERSIV4DI
:
14351 icode
= CODE_FOR_avx512vl_scattersiv4di
;
14353 case IX86_BUILTIN_SCATTERSIV2DI
:
14354 icode
= CODE_FOR_avx512vl_scattersiv2di
;
14356 case IX86_BUILTIN_SCATTERDIV8SI
:
14357 icode
= CODE_FOR_avx512vl_scatterdiv8si
;
14359 case IX86_BUILTIN_SCATTERDIV4SI
:
14360 icode
= CODE_FOR_avx512vl_scatterdiv4si
;
14362 case IX86_BUILTIN_SCATTERDIV4DI
:
14363 icode
= CODE_FOR_avx512vl_scatterdiv4di
;
14365 case IX86_BUILTIN_SCATTERDIV2DI
:
14366 icode
= CODE_FOR_avx512vl_scatterdiv2di
;
14368 case IX86_BUILTIN_GATHERPFDPD
:
14369 icode
= CODE_FOR_avx512pf_gatherpfv8sidf
;
14370 goto vec_prefetch_gen
;
14371 case IX86_BUILTIN_SCATTERALTSIV8DF
:
14372 icode
= CODE_FOR_avx512f_scattersiv8df
;
14374 case IX86_BUILTIN_SCATTERALTDIV16SF
:
14375 icode
= CODE_FOR_avx512f_scatterdiv16sf
;
14377 case IX86_BUILTIN_SCATTERALTSIV8DI
:
14378 icode
= CODE_FOR_avx512f_scattersiv8di
;
14380 case IX86_BUILTIN_SCATTERALTDIV16SI
:
14381 icode
= CODE_FOR_avx512f_scatterdiv16si
;
14383 case IX86_BUILTIN_SCATTERALTSIV4DF
:
14384 icode
= CODE_FOR_avx512vl_scattersiv4df
;
14386 case IX86_BUILTIN_SCATTERALTDIV8SF
:
14387 icode
= CODE_FOR_avx512vl_scatterdiv8sf
;
14389 case IX86_BUILTIN_SCATTERALTSIV4DI
:
14390 icode
= CODE_FOR_avx512vl_scattersiv4di
;
14392 case IX86_BUILTIN_SCATTERALTDIV8SI
:
14393 icode
= CODE_FOR_avx512vl_scatterdiv8si
;
14395 case IX86_BUILTIN_SCATTERALTSIV2DF
:
14396 icode
= CODE_FOR_avx512vl_scattersiv2df
;
14398 case IX86_BUILTIN_SCATTERALTDIV4SF
:
14399 icode
= CODE_FOR_avx512vl_scatterdiv4sf
;
14401 case IX86_BUILTIN_SCATTERALTSIV2DI
:
14402 icode
= CODE_FOR_avx512vl_scattersiv2di
;
14404 case IX86_BUILTIN_SCATTERALTDIV4SI
:
14405 icode
= CODE_FOR_avx512vl_scatterdiv4si
;
14407 case IX86_BUILTIN_GATHERPFDPS
:
14408 icode
= CODE_FOR_avx512pf_gatherpfv16sisf
;
14409 goto vec_prefetch_gen
;
14410 case IX86_BUILTIN_GATHERPFQPD
:
14411 icode
= CODE_FOR_avx512pf_gatherpfv8didf
;
14412 goto vec_prefetch_gen
;
14413 case IX86_BUILTIN_GATHERPFQPS
:
14414 icode
= CODE_FOR_avx512pf_gatherpfv8disf
;
14415 goto vec_prefetch_gen
;
14416 case IX86_BUILTIN_SCATTERPFDPD
:
14417 icode
= CODE_FOR_avx512pf_scatterpfv8sidf
;
14418 goto vec_prefetch_gen
;
14419 case IX86_BUILTIN_SCATTERPFDPS
:
14420 icode
= CODE_FOR_avx512pf_scatterpfv16sisf
;
14421 goto vec_prefetch_gen
;
14422 case IX86_BUILTIN_SCATTERPFQPD
:
14423 icode
= CODE_FOR_avx512pf_scatterpfv8didf
;
14424 goto vec_prefetch_gen
;
14425 case IX86_BUILTIN_SCATTERPFQPS
:
14426 icode
= CODE_FOR_avx512pf_scatterpfv8disf
;
14427 goto vec_prefetch_gen
;
14431 rtx (*gen
) (rtx
, rtx
);
14433 arg0
= CALL_EXPR_ARG (exp
, 0);
14434 arg1
= CALL_EXPR_ARG (exp
, 1);
14435 arg2
= CALL_EXPR_ARG (exp
, 2);
14436 arg3
= CALL_EXPR_ARG (exp
, 3);
14437 arg4
= CALL_EXPR_ARG (exp
, 4);
14438 op0
= expand_normal (arg0
);
14439 op1
= expand_normal (arg1
);
14440 op2
= expand_normal (arg2
);
14441 op3
= expand_normal (arg3
);
14442 op4
= expand_normal (arg4
);
14443 /* Note the arg order is different from the operand order. */
14444 mode0
= insn_data
[icode
].operand
[1].mode
;
14445 mode2
= insn_data
[icode
].operand
[3].mode
;
14446 mode3
= insn_data
[icode
].operand
[4].mode
;
14447 mode4
= insn_data
[icode
].operand
[5].mode
;
14449 if (target
== NULL_RTX
14450 || GET_MODE (target
) != insn_data
[icode
].operand
[0].mode
14451 || !insn_data
[icode
].operand
[0].predicate (target
,
14452 GET_MODE (target
)))
14453 subtarget
= gen_reg_rtx (insn_data
[icode
].operand
[0].mode
);
14455 subtarget
= target
;
14459 case IX86_BUILTIN_GATHER3ALTSIV8DF
:
14460 case IX86_BUILTIN_GATHER3ALTSIV8DI
:
14461 half
= gen_reg_rtx (V8SImode
);
14462 if (!nonimmediate_operand (op2
, V16SImode
))
14463 op2
= copy_to_mode_reg (V16SImode
, op2
);
14464 emit_insn (gen_vec_extract_lo_v16si (half
, op2
));
14467 case IX86_BUILTIN_GATHER3ALTSIV4DF
:
14468 case IX86_BUILTIN_GATHER3ALTSIV4DI
:
14469 case IX86_BUILTIN_GATHERALTSIV4DF
:
14470 case IX86_BUILTIN_GATHERALTSIV4DI
:
14471 half
= gen_reg_rtx (V4SImode
);
14472 if (!nonimmediate_operand (op2
, V8SImode
))
14473 op2
= copy_to_mode_reg (V8SImode
, op2
);
14474 emit_insn (gen_vec_extract_lo_v8si (half
, op2
));
14477 case IX86_BUILTIN_GATHER3ALTDIV16SF
:
14478 case IX86_BUILTIN_GATHER3ALTDIV16SI
:
14479 half
= gen_reg_rtx (mode0
);
14480 if (mode0
== V8SFmode
)
14481 gen
= gen_vec_extract_lo_v16sf
;
14483 gen
= gen_vec_extract_lo_v16si
;
14484 if (!nonimmediate_operand (op0
, GET_MODE (op0
)))
14485 op0
= copy_to_mode_reg (GET_MODE (op0
), op0
);
14486 emit_insn (gen (half
, op0
));
14488 op3
= lowpart_subreg (QImode
, op3
, HImode
);
14490 case IX86_BUILTIN_GATHER3ALTDIV8SF
:
14491 case IX86_BUILTIN_GATHER3ALTDIV8SI
:
14492 case IX86_BUILTIN_GATHERALTDIV8SF
:
14493 case IX86_BUILTIN_GATHERALTDIV8SI
:
14494 half
= gen_reg_rtx (mode0
);
14495 if (mode0
== V4SFmode
)
14496 gen
= gen_vec_extract_lo_v8sf
;
14498 gen
= gen_vec_extract_lo_v8si
;
14499 if (!nonimmediate_operand (op0
, GET_MODE (op0
)))
14500 op0
= copy_to_mode_reg (GET_MODE (op0
), op0
);
14501 emit_insn (gen (half
, op0
));
14503 if (VECTOR_MODE_P (GET_MODE (op3
)))
14505 half
= gen_reg_rtx (mode0
);
14506 if (!nonimmediate_operand (op3
, GET_MODE (op3
)))
14507 op3
= copy_to_mode_reg (GET_MODE (op3
), op3
);
14508 emit_insn (gen (half
, op3
));
14516 /* Force memory operand only with base register here. But we
14517 don't want to do it on memory operand for other builtin
14519 op1
= ix86_zero_extend_to_Pmode (op1
);
14521 if (!insn_data
[icode
].operand
[1].predicate (op0
, mode0
))
14522 op0
= copy_to_mode_reg (mode0
, op0
);
14523 if (!insn_data
[icode
].operand
[2].predicate (op1
, Pmode
))
14524 op1
= copy_to_mode_reg (Pmode
, op1
);
14525 if (!insn_data
[icode
].operand
[3].predicate (op2
, mode2
))
14526 op2
= copy_to_mode_reg (mode2
, op2
);
14528 op3
= fixup_modeless_constant (op3
, mode3
);
14530 if (GET_MODE (op3
) == mode3
|| GET_MODE (op3
) == VOIDmode
)
14532 if (!insn_data
[icode
].operand
[4].predicate (op3
, mode3
))
14533 op3
= copy_to_mode_reg (mode3
, op3
);
14537 op3
= copy_to_reg (op3
);
14538 op3
= lowpart_subreg (mode3
, op3
, GET_MODE (op3
));
14540 if (!insn_data
[icode
].operand
[5].predicate (op4
, mode4
))
14542 error ("the last argument must be scale 1, 2, 4, 8");
14546 /* Optimize. If mask is known to have all high bits set,
14547 replace op0 with pc_rtx to signal that the instruction
14548 overwrites the whole destination and doesn't use its
14549 previous contents. */
14552 if (TREE_CODE (arg3
) == INTEGER_CST
)
14554 if (integer_all_onesp (arg3
))
14557 else if (TREE_CODE (arg3
) == VECTOR_CST
)
14559 unsigned int negative
= 0;
14560 for (i
= 0; i
< VECTOR_CST_NELTS (arg3
); ++i
)
14562 tree cst
= VECTOR_CST_ELT (arg3
, i
);
14563 if (TREE_CODE (cst
) == INTEGER_CST
14564 && tree_int_cst_sign_bit (cst
))
14566 else if (TREE_CODE (cst
) == REAL_CST
14567 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst
)))
14570 if (negative
== TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3
)))
14573 else if (TREE_CODE (arg3
) == SSA_NAME
14574 && TREE_CODE (TREE_TYPE (arg3
)) == VECTOR_TYPE
)
14576 /* Recognize also when mask is like:
14577 __v2df src = _mm_setzero_pd ();
14578 __v2df mask = _mm_cmpeq_pd (src, src);
14580 __v8sf src = _mm256_setzero_ps ();
14581 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
14582 as that is a cheaper way to load all ones into
14583 a register than having to load a constant from
14585 gimple
*def_stmt
= SSA_NAME_DEF_STMT (arg3
);
14586 if (is_gimple_call (def_stmt
))
14588 tree fndecl
= gimple_call_fndecl (def_stmt
);
14590 && fndecl_built_in_p (fndecl
, BUILT_IN_MD
))
14591 switch (DECL_MD_FUNCTION_CODE (fndecl
))
14593 case IX86_BUILTIN_CMPPD
:
14594 case IX86_BUILTIN_CMPPS
:
14595 case IX86_BUILTIN_CMPPD256
:
14596 case IX86_BUILTIN_CMPPS256
:
14597 if (!integer_zerop (gimple_call_arg (def_stmt
, 2)))
14600 case IX86_BUILTIN_CMPEQPD
:
14601 case IX86_BUILTIN_CMPEQPS
:
14602 if (initializer_zerop (gimple_call_arg (def_stmt
, 0))
14603 && initializer_zerop (gimple_call_arg (def_stmt
,
14614 pat
= GEN_FCN (icode
) (subtarget
, op0
, op1
, op2
, op3
, op4
);
14621 case IX86_BUILTIN_GATHER3DIV16SF
:
14622 if (target
== NULL_RTX
)
14623 target
= gen_reg_rtx (V8SFmode
);
14624 emit_insn (gen_vec_extract_lo_v16sf (target
, subtarget
));
14626 case IX86_BUILTIN_GATHER3DIV16SI
:
14627 if (target
== NULL_RTX
)
14628 target
= gen_reg_rtx (V8SImode
);
14629 emit_insn (gen_vec_extract_lo_v16si (target
, subtarget
));
14631 case IX86_BUILTIN_GATHER3DIV8SF
:
14632 case IX86_BUILTIN_GATHERDIV8SF
:
14633 if (target
== NULL_RTX
)
14634 target
= gen_reg_rtx (V4SFmode
);
14635 emit_insn (gen_vec_extract_lo_v8sf (target
, subtarget
));
14637 case IX86_BUILTIN_GATHER3DIV8SI
:
14638 case IX86_BUILTIN_GATHERDIV8SI
:
14639 if (target
== NULL_RTX
)
14640 target
= gen_reg_rtx (V4SImode
);
14641 emit_insn (gen_vec_extract_lo_v8si (target
, subtarget
));
14644 target
= subtarget
;
14650 arg0
= CALL_EXPR_ARG (exp
, 0);
14651 arg1
= CALL_EXPR_ARG (exp
, 1);
14652 arg2
= CALL_EXPR_ARG (exp
, 2);
14653 arg3
= CALL_EXPR_ARG (exp
, 3);
14654 arg4
= CALL_EXPR_ARG (exp
, 4);
14655 op0
= expand_normal (arg0
);
14656 op1
= expand_normal (arg1
);
14657 op2
= expand_normal (arg2
);
14658 op3
= expand_normal (arg3
);
14659 op4
= expand_normal (arg4
);
14660 mode1
= insn_data
[icode
].operand
[1].mode
;
14661 mode2
= insn_data
[icode
].operand
[2].mode
;
14662 mode3
= insn_data
[icode
].operand
[3].mode
;
14663 mode4
= insn_data
[icode
].operand
[4].mode
;
      /* The scatter instruction stores operand op3 to memory with
         indices from op2 and scale from op4 under writemask op1.
         If index operand op2 has more elements than source operand
         op3, only its low half is used, and vice versa.  */
14671 case IX86_BUILTIN_SCATTERALTSIV8DF
:
14672 case IX86_BUILTIN_SCATTERALTSIV8DI
:
14673 half
= gen_reg_rtx (V8SImode
);
14674 if (!nonimmediate_operand (op2
, V16SImode
))
14675 op2
= copy_to_mode_reg (V16SImode
, op2
);
14676 emit_insn (gen_vec_extract_lo_v16si (half
, op2
));
14679 case IX86_BUILTIN_SCATTERALTDIV16SF
:
14680 case IX86_BUILTIN_SCATTERALTDIV16SI
:
14681 half
= gen_reg_rtx (mode3
);
14682 if (mode3
== V8SFmode
)
14683 gen
= gen_vec_extract_lo_v16sf
;
14685 gen
= gen_vec_extract_lo_v16si
;
14686 if (!nonimmediate_operand (op3
, GET_MODE (op3
)))
14687 op3
= copy_to_mode_reg (GET_MODE (op3
), op3
);
14688 emit_insn (gen (half
, op3
));
14691 case IX86_BUILTIN_SCATTERALTSIV4DF
:
14692 case IX86_BUILTIN_SCATTERALTSIV4DI
:
14693 half
= gen_reg_rtx (V4SImode
);
14694 if (!nonimmediate_operand (op2
, V8SImode
))
14695 op2
= copy_to_mode_reg (V8SImode
, op2
);
14696 emit_insn (gen_vec_extract_lo_v8si (half
, op2
));
14699 case IX86_BUILTIN_SCATTERALTDIV8SF
:
14700 case IX86_BUILTIN_SCATTERALTDIV8SI
:
14701 half
= gen_reg_rtx (mode3
);
14702 if (mode3
== V4SFmode
)
14703 gen
= gen_vec_extract_lo_v8sf
;
14705 gen
= gen_vec_extract_lo_v8si
;
14706 if (!nonimmediate_operand (op3
, GET_MODE (op3
)))
14707 op3
= copy_to_mode_reg (GET_MODE (op3
), op3
);
14708 emit_insn (gen (half
, op3
));
14711 case IX86_BUILTIN_SCATTERALTSIV2DF
:
14712 case IX86_BUILTIN_SCATTERALTSIV2DI
:
14713 if (!nonimmediate_operand (op2
, V4SImode
))
14714 op2
= copy_to_mode_reg (V4SImode
, op2
);
14716 case IX86_BUILTIN_SCATTERALTDIV4SF
:
14717 case IX86_BUILTIN_SCATTERALTDIV4SI
:
14718 if (!nonimmediate_operand (op3
, GET_MODE (op3
)))
14719 op3
= copy_to_mode_reg (GET_MODE (op3
), op3
);
14725 /* Force memory operand only with base register here. But we
14726 don't want to do it on memory operand for other builtin
14728 op0
= force_reg (Pmode
, convert_to_mode (Pmode
, op0
, 1));
14730 if (!insn_data
[icode
].operand
[0].predicate (op0
, Pmode
))
14731 op0
= copy_to_mode_reg (Pmode
, op0
);
14733 op1
= fixup_modeless_constant (op1
, mode1
);
14735 if (GET_MODE (op1
) == mode1
|| GET_MODE (op1
) == VOIDmode
)
14737 if (!insn_data
[icode
].operand
[1].predicate (op1
, mode1
))
14738 op1
= copy_to_mode_reg (mode1
, op1
);
14742 op1
= copy_to_reg (op1
);
14743 op1
= lowpart_subreg (mode1
, op1
, GET_MODE (op1
));
14746 if (!insn_data
[icode
].operand
[2].predicate (op2
, mode2
))
14747 op2
= copy_to_mode_reg (mode2
, op2
);
14749 if (!insn_data
[icode
].operand
[3].predicate (op3
, mode3
))
14750 op3
= copy_to_mode_reg (mode3
, op3
);
14752 if (!insn_data
[icode
].operand
[4].predicate (op4
, mode4
))
14754 error ("the last argument must be scale 1, 2, 4, 8");
14758 pat
= GEN_FCN (icode
) (op0
, op1
, op2
, op3
, op4
);
14766 arg0
= CALL_EXPR_ARG (exp
, 0);
14767 arg1
= CALL_EXPR_ARG (exp
, 1);
14768 arg2
= CALL_EXPR_ARG (exp
, 2);
14769 arg3
= CALL_EXPR_ARG (exp
, 3);
14770 arg4
= CALL_EXPR_ARG (exp
, 4);
14771 op0
= expand_normal (arg0
);
14772 op1
= expand_normal (arg1
);
14773 op2
= expand_normal (arg2
);
14774 op3
= expand_normal (arg3
);
14775 op4
= expand_normal (arg4
);
14776 mode0
= insn_data
[icode
].operand
[0].mode
;
14777 mode1
= insn_data
[icode
].operand
[1].mode
;
14778 mode3
= insn_data
[icode
].operand
[3].mode
;
14779 mode4
= insn_data
[icode
].operand
[4].mode
;
14781 op0
= fixup_modeless_constant (op0
, mode0
);
14783 if (GET_MODE (op0
) == mode0
|| GET_MODE (op0
) == VOIDmode
)
14785 if (!insn_data
[icode
].operand
[0].predicate (op0
, mode0
))
14786 op0
= copy_to_mode_reg (mode0
, op0
);
14790 op0
= copy_to_reg (op0
);
14791 op0
= lowpart_subreg (mode0
, op0
, GET_MODE (op0
));
14794 if (!insn_data
[icode
].operand
[1].predicate (op1
, mode1
))
14795 op1
= copy_to_mode_reg (mode1
, op1
);
14797 /* Force memory operand only with base register here. But we
14798 don't want to do it on memory operand for other builtin
14800 op2
= force_reg (Pmode
, convert_to_mode (Pmode
, op2
, 1));
14802 if (!insn_data
[icode
].operand
[2].predicate (op2
, Pmode
))
14803 op2
= copy_to_mode_reg (Pmode
, op2
);
      if (!insn_data[icode].operand[3].predicate (op3, mode3))
        {
          error ("the fourth argument must be scale 1, 2, 4, 8");
          return const0_rtx;
        }

      if (!insn_data[icode].operand[4].predicate (op4, mode4))
        {
          error ("incorrect hint operand");
          return const0_rtx;
        }
14817 pat
= GEN_FCN (icode
) (op0
, op1
, op2
, op3
, op4
);
    case IX86_BUILTIN_XABORT:
      icode = CODE_FOR_xabort;
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      mode0 = insn_data[icode].operand[0].mode;
      if (!insn_data[icode].operand[0].predicate (op0, mode0))
        {
          error ("the argument to %<xabort%> intrinsic must "
                 "be an 8-bit immediate");
          return const0_rtx;
        }
      emit_insn (gen_xabort (op0));
      return 0;

    case IX86_BUILTIN_RDSSPD:
    case IX86_BUILTIN_RDSSPQ:
      mode = (fcode == IX86_BUILTIN_RDSSPD ? SImode : DImode);

      if (target == 0
          || !register_operand (target, mode))
        target = gen_reg_rtx (mode);

      op0 = force_reg (mode, const0_rtx);

      emit_insn (gen_rdssp (mode, target, op0));
      return target;

    case IX86_BUILTIN_INCSSPD:
    case IX86_BUILTIN_INCSSPQ:
      mode = (fcode == IX86_BUILTIN_INCSSPD ? SImode : DImode);

      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);

      op0 = force_reg (mode, op0);

      emit_insn (gen_incssp (mode, op0));
      return 0;

    case IX86_BUILTIN_HRESET:
      icode = CODE_FOR_hreset;
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      op0 = force_reg (SImode, op0);
      emit_insn (gen_hreset (op0));
      return 0;
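      /* Illustrative note, not part of the original source: the CET
         shadow-stack builtins expanded above are used as, e.g.,

             unsigned long long ssp = __builtin_ia32_rdsspq ();
             __builtin_ia32_incsspq (n);

         RDSSP starts from a forced zero input so that the result stays 0
         when shadow stacks are not enabled.  */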
14872 case IX86_BUILTIN_RSTORSSP
:
14873 case IX86_BUILTIN_CLRSSBSY
:
14874 arg0
= CALL_EXPR_ARG (exp
, 0);
14875 op0
= expand_normal (arg0
);
14876 icode
= (fcode
== IX86_BUILTIN_RSTORSSP
14877 ? CODE_FOR_rstorssp
14878 : CODE_FOR_clrssbsy
);
14880 if (!address_operand (op0
, VOIDmode
))
14882 op0
= convert_memory_address (Pmode
, op0
);
14883 op0
= copy_addr_to_reg (op0
);
14885 emit_insn (GEN_FCN (icode
) (gen_rtx_MEM (DImode
, op0
)));
14888 case IX86_BUILTIN_WRSSD
:
14889 case IX86_BUILTIN_WRSSQ
:
14890 case IX86_BUILTIN_WRUSSD
:
14891 case IX86_BUILTIN_WRUSSQ
:
14892 mode
= ((fcode
== IX86_BUILTIN_WRSSD
14893 || fcode
== IX86_BUILTIN_WRUSSD
)
14894 ? SImode
: DImode
);
14896 arg0
= CALL_EXPR_ARG (exp
, 0);
14897 op0
= expand_normal (arg0
);
14898 arg1
= CALL_EXPR_ARG (exp
, 1);
14899 op1
= expand_normal (arg1
);
14901 op0
= force_reg (mode
, op0
);
14903 if (!address_operand (op1
, VOIDmode
))
14905 op1
= convert_memory_address (Pmode
, op1
);
14906 op1
= copy_addr_to_reg (op1
);
14908 op1
= gen_rtx_MEM (mode
, op1
);
14910 icode
= ((fcode
== IX86_BUILTIN_WRSSD
14911 || fcode
== IX86_BUILTIN_WRSSQ
)
14912 ? code_for_wrss (mode
)
14913 : code_for_wruss (mode
));
14914 emit_insn (GEN_FCN (icode
) (op0
, op1
));
14922 if (fcode
>= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
14923 && fcode
<= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST
)
14925 i
= fcode
- IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
;
14926 return ix86_expand_special_args_builtin (bdesc_special_args
+ i
, exp
,
14930 if (fcode
>= IX86_BUILTIN__BDESC_PURE_ARGS_FIRST
14931 && fcode
<= IX86_BUILTIN__BDESC_PURE_ARGS_LAST
)
14933 i
= fcode
- IX86_BUILTIN__BDESC_PURE_ARGS_FIRST
;
14934 return ix86_expand_special_args_builtin (bdesc_pure_args
+ i
, exp
,
14938 if (fcode
>= IX86_BUILTIN__BDESC_ARGS_FIRST
14939 && fcode
<= IX86_BUILTIN__BDESC_ARGS_LAST
)
14941 i
= fcode
- IX86_BUILTIN__BDESC_ARGS_FIRST
;
14942 rtx (*fcn
) (rtx
, rtx
, rtx
, rtx
) = NULL
;
14943 rtx (*fcn_mask
) (rtx
, rtx
, rtx
, rtx
, rtx
);
14944 rtx (*fcn_maskz
) (rtx
, rtx
, rtx
, rtx
, rtx
, rtx
);
14946 machine_mode mode
, wide_mode
, nar_mode
;
14948 nar_mode
= V4SFmode
;
14950 wide_mode
= V64SFmode
;
14951 fcn_mask
= gen_avx5124fmaddps_4fmaddps_mask
;
14952 fcn_maskz
= gen_avx5124fmaddps_4fmaddps_maskz
;
14956 case IX86_BUILTIN_4FMAPS
:
14957 fcn
= gen_avx5124fmaddps_4fmaddps
;
14961 case IX86_BUILTIN_4DPWSSD
:
14962 nar_mode
= V4SImode
;
14964 wide_mode
= V64SImode
;
14965 fcn
= gen_avx5124vnniw_vp4dpwssd
;
14969 case IX86_BUILTIN_4DPWSSDS
:
14970 nar_mode
= V4SImode
;
14972 wide_mode
= V64SImode
;
14973 fcn
= gen_avx5124vnniw_vp4dpwssds
;
14977 case IX86_BUILTIN_4FNMAPS
:
14978 fcn
= gen_avx5124fmaddps_4fnmaddps
;
14982 case IX86_BUILTIN_4FNMAPS_MASK
:
14983 fcn_mask
= gen_avx5124fmaddps_4fnmaddps_mask
;
14984 fcn_maskz
= gen_avx5124fmaddps_4fnmaddps_maskz
;
14987 case IX86_BUILTIN_4DPWSSD_MASK
:
14988 nar_mode
= V4SImode
;
14990 wide_mode
= V64SImode
;
14991 fcn_mask
= gen_avx5124vnniw_vp4dpwssd_mask
;
14992 fcn_maskz
= gen_avx5124vnniw_vp4dpwssd_maskz
;
14995 case IX86_BUILTIN_4DPWSSDS_MASK
:
14996 nar_mode
= V4SImode
;
14998 wide_mode
= V64SImode
;
14999 fcn_mask
= gen_avx5124vnniw_vp4dpwssds_mask
;
15000 fcn_maskz
= gen_avx5124vnniw_vp4dpwssds_maskz
;
15003 case IX86_BUILTIN_4FMAPS_MASK
:
15013 wide_reg
= gen_reg_rtx (wide_mode
);
15014 for (i
= 0; i
< 4; i
++)
15016 args
[i
] = CALL_EXPR_ARG (exp
, i
);
15017 ops
[i
] = expand_normal (args
[i
]);
15019 emit_move_insn (gen_rtx_SUBREG (mode
, wide_reg
, i
* 64),
15023 accum
= expand_normal (CALL_EXPR_ARG (exp
, 4));
15024 accum
= force_reg (mode
, accum
);
15026 addr
= expand_normal (CALL_EXPR_ARG (exp
, 5));
15027 addr
= force_reg (Pmode
, addr
);
15029 mem
= gen_rtx_MEM (nar_mode
, addr
);
15031 target
= gen_reg_rtx (mode
);
15033 emit_move_insn (target
, accum
);
15036 emit_insn (fcn (target
, accum
, wide_reg
, mem
));
15040 merge
= expand_normal (CALL_EXPR_ARG (exp
, 6));
15042 mask
= expand_normal (CALL_EXPR_ARG (exp
, 7));
15044 if (CONST_INT_P (mask
))
15045 mask
= fixup_modeless_constant (mask
, HImode
);
15047 mask
= force_reg (HImode
, mask
);
15049 if (GET_MODE (mask
) != HImode
)
15050 mask
= gen_rtx_SUBREG (HImode
, mask
, 0);
15052 /* If merge is 0 then we're about to emit z-masked variant. */
15053 if (const0_operand (merge
, mode
))
15054 emit_insn (fcn_maskz (target
, accum
, wide_reg
, mem
, merge
, mask
));
15055 /* If merge is the same as accum then emit merge-masked variant. */
15056 else if (CALL_EXPR_ARG (exp
, 6) == CALL_EXPR_ARG (exp
, 4))
15058 merge
= force_reg (mode
, merge
);
15059 emit_insn (fcn_mask (target
, wide_reg
, mem
, merge
, mask
));
15061 /* Merge with something unknown might happen if we z-mask w/ -O0. */
15064 target
= gen_reg_rtx (mode
);
15065 emit_move_insn (target
, merge
);
15066 emit_insn (fcn_mask (target
, wide_reg
, mem
, target
, mask
));
15072 case IX86_BUILTIN_4FNMASS
:
15073 fcn
= gen_avx5124fmaddps_4fnmaddss
;
15077 case IX86_BUILTIN_4FMASS
:
15078 fcn
= gen_avx5124fmaddps_4fmaddss
;
15082 case IX86_BUILTIN_4FNMASS_MASK
:
15083 fcn_mask
= gen_avx5124fmaddps_4fnmaddss_mask
;
15084 fcn_maskz
= gen_avx5124fmaddps_4fnmaddss_maskz
;
15087 case IX86_BUILTIN_4FMASS_MASK
:
15096 fcn_mask
= gen_avx5124fmaddps_4fmaddss_mask
;
15097 fcn_maskz
= gen_avx5124fmaddps_4fmaddss_maskz
;
15101 wide_reg
= gen_reg_rtx (V64SFmode
);
15102 for (i
= 0; i
< 4; i
++)
15105 args
[i
] = CALL_EXPR_ARG (exp
, i
);
15106 ops
[i
] = expand_normal (args
[i
]);
15108 tmp
= gen_reg_rtx (SFmode
);
15109 emit_move_insn (tmp
, gen_rtx_SUBREG (SFmode
, ops
[i
], 0));
15111 emit_move_insn (gen_rtx_SUBREG (V16SFmode
, wide_reg
, i
* 64),
15112 gen_rtx_SUBREG (V16SFmode
, tmp
, 0));
15115 accum
= expand_normal (CALL_EXPR_ARG (exp
, 4));
15116 accum
= force_reg (V4SFmode
, accum
);
15118 addr
= expand_normal (CALL_EXPR_ARG (exp
, 5));
15119 addr
= force_reg (Pmode
, addr
);
15121 mem
= gen_rtx_MEM (V4SFmode
, addr
);
15123 target
= gen_reg_rtx (V4SFmode
);
15125 emit_move_insn (target
, accum
);
15128 emit_insn (fcn (target
, accum
, wide_reg
, mem
));
15132 merge
= expand_normal (CALL_EXPR_ARG (exp
, 6));
15134 mask
= expand_normal (CALL_EXPR_ARG (exp
, 7));
15136 if (CONST_INT_P (mask
))
15137 mask
= fixup_modeless_constant (mask
, QImode
);
15139 mask
= force_reg (QImode
, mask
);
15141 if (GET_MODE (mask
) != QImode
)
15142 mask
= gen_rtx_SUBREG (QImode
, mask
, 0);
15144 /* If merge is 0 then we're about to emit z-masked variant. */
15145 if (const0_operand (merge
, mode
))
15146 emit_insn (fcn_maskz (target
, accum
, wide_reg
, mem
, merge
, mask
));
15147 /* If merge is the same as accum then emit merge-masked
15149 else if (CALL_EXPR_ARG (exp
, 6) == CALL_EXPR_ARG (exp
, 4))
15151 merge
= force_reg (mode
, merge
);
15152 emit_insn (fcn_mask (target
, wide_reg
, mem
, merge
, mask
));
15154 /* Merge with something unknown might happen if we z-mask
15158 target
= gen_reg_rtx (mode
);
15159 emit_move_insn (target
, merge
);
15160 emit_insn (fcn_mask (target
, wide_reg
, mem
, target
, mask
));
15165 case IX86_BUILTIN_RDPID
:
15166 return ix86_expand_special_args_builtin (bdesc_args
+ i
, exp
,
15168 case IX86_BUILTIN_FABSQ
:
15169 case IX86_BUILTIN_COPYSIGNQ
:
15171 /* Emit a normal call if SSE isn't available. */
15172 return expand_call (exp
, target
, ignore
);
15175 return ix86_expand_args_builtin (bdesc_args
+ i
, exp
, target
);
  if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
      && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
    {
      i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
      return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
    }

  if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
      && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
    {
      i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
      return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
    }

  if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
      && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
    {
      i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
      return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
    }

  if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
      && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
    {
      i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
      return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
    }

  if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
      && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
    {
      i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
      const struct builtin_description *d = bdesc_multi_arg + i;
      return ix86_expand_multi_arg_builtin (d->icode, exp, target,
					    (enum ix86_builtin_func_type)
					    d->flag, d->comparison);
    }

  if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
      && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
    {
      i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
      return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
					       target);
    }

  gcc_unreachable ();
}
/* A subroutine of ix86_expand_vector_init_duplicate.  Tries to
   fill target with val via vec_duplicate.  */

static bool
ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
{
  bool ok;
  rtx_insn *insn;
  rtx dup;

  /* Save/restore recog_data in case this is called from splitters
     or other routines where recog_data needs to stay valid across
     force_reg.  See PR106577.  */
  recog_data_d recog_data_save = recog_data;

  /* First attempt to recognize VAL as-is.  */
  dup = gen_vec_duplicate (mode, val);
  insn = emit_insn (gen_rtx_SET (target, dup));
  if (recog_memoized (insn) < 0)
    {
      rtx_insn *seq;
      machine_mode innermode = GET_MODE_INNER (mode);
      rtx reg;

      /* If that fails, force VAL into a register.  */
      start_sequence ();
      reg = force_reg (innermode, val);
      if (GET_MODE (reg) != innermode)
	reg = gen_lowpart (innermode, reg);
      SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
      seq = get_insns ();
      end_sequence ();
      if (seq)
	emit_insn_before (seq, insn);

      ok = recog_memoized (insn) >= 0;
      gcc_assert (ok);
    }

  /* Restore recog_data.  */
  recog_data = recog_data_save;

  return true;
}
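/* Illustration (added commentary, not part of the original code): for
   V4SImode the insn emitted above has roughly the shape

     (set (reg:V4SI target)
	  (vec_duplicate:V4SI (reg:SI val)))

   and recog_memoized only succeeds once the duplicated operand satisfies
   the predicates of a matching broadcast pattern, which is why VAL may
   first need to be forced into a register of the inner mode.  */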
/* Get a vector mode of the same size as the original but with elements
   twice as wide.  This is only guaranteed to apply to integral vectors.  */

static machine_mode
get_mode_wider_vector (machine_mode o)
{
  /* ??? Rely on the ordering that genmodes.cc gives to vectors.  */
  machine_mode n = GET_MODE_NEXT_MODE (o).require ();
  gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
  gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
  return n;
}
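/* Example (illustrative only): with the mode ordering produced by
   genmodes.cc, get_mode_wider_vector (V16QImode) yields V8HImode --
   the same 16-byte size, half as many units, each twice as wide; the
   asserts above guarantee exactly that relationship.  */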
static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
/* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
   with all elements equal to VAR.  Return true if successful.  */

static bool
ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
				   rtx target, rtx val)
15315 return ix86_vector_duplicate_value (mode
, target
, val
);
15320 if (TARGET_SSE
|| TARGET_3DNOW_A
)
15324 val
= gen_lowpart (SImode
, val
);
15325 x
= gen_rtx_TRUNCATE (HImode
, val
);
15326 x
= gen_rtx_VEC_DUPLICATE (mode
, x
);
15327 emit_insn (gen_rtx_SET (target
, x
));
15337 val
= gen_lowpart (SImode
, val
);
15338 x
= gen_rtx_TRUNCATE (HImode
, val
);
15339 x
= gen_rtx_VEC_DUPLICATE (mode
, x
);
15340 emit_insn (gen_rtx_SET (target
, x
));
15355 return ix86_vector_duplicate_value (mode
, target
, val
);
15359 struct expand_vec_perm_d dperm
;
15363 memset (&dperm
, 0, sizeof (dperm
));
15364 dperm
.target
= target
;
15365 dperm
.vmode
= mode
;
15366 dperm
.nelt
= GET_MODE_NUNITS (mode
);
15367 dperm
.op0
= dperm
.op1
= gen_reg_rtx (mode
);
15368 dperm
.one_operand_p
= true;
15370 if (mode
== V8HFmode
|| mode
== V8BFmode
)
15372 tmp1
= force_reg (GET_MODE_INNER (mode
), val
);
15373 tmp2
= gen_reg_rtx (mode
);
15374 emit_insn (maybe_gen_vec_set_0 (mode
, tmp2
,
15375 CONST0_RTX (mode
), tmp1
));
15376 tmp1
= gen_lowpart (mode
, tmp2
);
15380 /* Extend to SImode using a paradoxical SUBREG. */
15381 tmp1
= gen_reg_rtx (SImode
);
15382 emit_move_insn (tmp1
, gen_lowpart (SImode
, val
));
15384 /* Insert the SImode value as
15385 low element of a V4SImode vector. */
15386 tmp2
= gen_reg_rtx (V4SImode
);
15387 emit_insn (gen_vec_setv4si_0 (tmp2
, CONST0_RTX (V4SImode
), tmp1
));
15388 tmp1
= gen_lowpart (mode
, tmp2
);
15391 emit_move_insn (dperm
.op0
, tmp1
);
15392 ok
= (expand_vec_perm_1 (&dperm
)
15393 || expand_vec_perm_broadcast_1 (&dperm
));
15401 return ix86_vector_duplicate_value (mode
, target
, val
);
15408 /* Replicate the value once into the next wider mode and recurse. */
15410 machine_mode smode
, wsmode
, wvmode
;
15413 smode
= GET_MODE_INNER (mode
);
15414 wvmode
= get_mode_wider_vector (mode
);
15415 wsmode
= GET_MODE_INNER (wvmode
);
15417 val
= convert_modes (wsmode
, smode
, val
, true);
15419 if (smode
== QImode
&& !TARGET_PARTIAL_REG_STALL
)
15420 emit_insn (gen_insv_1 (wsmode
, val
, val
));
15423 x
= expand_simple_binop (wsmode
, ASHIFT
, val
,
15424 GEN_INT (GET_MODE_BITSIZE (smode
)),
15425 NULL_RTX
, 1, OPTAB_LIB_WIDEN
);
15426 val
= expand_simple_binop (wsmode
, IOR
, val
, x
, x
, 1,
15430 x
= gen_reg_rtx (wvmode
);
15431 ok
= ix86_expand_vector_init_duplicate (mmx_ok
, wvmode
, x
, val
);
15433 emit_move_insn (target
, gen_lowpart (GET_MODE (target
), x
));
15442 return ix86_vector_duplicate_value (mode
, target
, val
);
15445 machine_mode hvmode
;
15458 hvmode
= V16QImode
;
15461 gcc_unreachable ();
15463 rtx x
= gen_reg_rtx (hvmode
);
15465 ok
= ix86_expand_vector_init_duplicate (false, hvmode
, x
, val
);
15468 x
= gen_rtx_VEC_CONCAT (mode
, x
, x
);
15469 emit_insn (gen_rtx_SET (target
, x
));
15477 if (TARGET_AVX512BW
)
15478 return ix86_vector_duplicate_value (mode
, target
, val
);
15481 machine_mode hvmode
;
15485 hvmode
= V16HImode
;
15488 hvmode
= V16HFmode
;
15491 hvmode
= V16BFmode
;
15494 hvmode
= V32QImode
;
15497 gcc_unreachable ();
15499 rtx x
= gen_reg_rtx (hvmode
);
15501 ok
= ix86_expand_vector_init_duplicate (false, hvmode
, x
, val
);
15504 x
= gen_rtx_VEC_CONCAT (mode
, x
, x
);
15505 emit_insn (gen_rtx_SET (target
, x
));
/* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
   whose ONE_VAR element is VAR, and other elements are zero.  Return true
   if successful.  */

static bool
ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
				     rtx target, rtx var, int one_var)
{
  machine_mode vsimode;
  rtx new_target;
  rtx x, tmp;
  bool use_vector_set = false;
  rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;
15531 /* For SSE4.1, we normally use vector set. But if the second
15532 element is zero and inter-unit moves are OK, we use movq
15534 use_vector_set
= (TARGET_64BIT
&& TARGET_SSE4_1
15535 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
15541 use_vector_set
= TARGET_SSE4_1
;
15544 use_vector_set
= TARGET_SSE2
;
15545 gen_vec_set_0
= TARGET_AVX512FP16
&& one_var
== 0
15546 ? gen_vec_setv8hi_0
: NULL
;
15549 use_vector_set
= TARGET_MMX_WITH_SSE
&& TARGET_SSE4_1
;
15552 use_vector_set
= TARGET_SSE
|| TARGET_3DNOW_A
;
15555 use_vector_set
= TARGET_SSE4_1
;
15558 use_vector_set
= TARGET_AVX
;
15561 use_vector_set
= TARGET_AVX
;
15562 gen_vec_set_0
= TARGET_AVX512FP16
&& one_var
== 0
15563 ? gen_vec_setv16hi_0
: NULL
;
15566 use_vector_set
= TARGET_AVX
;
15567 gen_vec_set_0
= gen_vec_setv8si_0
;
15570 use_vector_set
= TARGET_AVX
;
15571 gen_vec_set_0
= gen_vec_setv8sf_0
;
15574 use_vector_set
= TARGET_AVX
;
15575 gen_vec_set_0
= gen_vec_setv4df_0
;
15578 /* Use ix86_expand_vector_set in 64bit mode only. */
15579 use_vector_set
= TARGET_AVX
&& TARGET_64BIT
;
15580 gen_vec_set_0
= gen_vec_setv4di_0
;
15583 use_vector_set
= TARGET_AVX512F
&& one_var
== 0;
15584 gen_vec_set_0
= gen_vec_setv16si_0
;
15587 use_vector_set
= TARGET_AVX512F
&& one_var
== 0;
15588 gen_vec_set_0
= gen_vec_setv16sf_0
;
15591 use_vector_set
= TARGET_AVX512F
&& one_var
== 0;
15592 gen_vec_set_0
= gen_vec_setv8df_0
;
15595 /* Use ix86_expand_vector_set in 64bit mode only. */
15596 use_vector_set
= TARGET_AVX512F
&& TARGET_64BIT
&& one_var
== 0;
15597 gen_vec_set_0
= gen_vec_setv8di_0
;
15600 use_vector_set
= TARGET_AVX512FP16
&& one_var
== 0;
15601 gen_vec_set_0
= gen_vec_setv8hf_0
;
15604 use_vector_set
= TARGET_AVX512FP16
&& one_var
== 0;
15605 gen_vec_set_0
= gen_vec_setv16hf_0
;
15608 use_vector_set
= TARGET_AVX512FP16
&& one_var
== 0;
15609 gen_vec_set_0
= gen_vec_setv32hf_0
;
15612 use_vector_set
= TARGET_AVX512FP16
&& one_var
== 0;
15613 gen_vec_set_0
= gen_vec_setv8bf_0
;
15616 use_vector_set
= TARGET_AVX512FP16
&& one_var
== 0;
15617 gen_vec_set_0
= gen_vec_setv16bf_0
;
15620 use_vector_set
= TARGET_AVX512FP16
&& one_var
== 0;
15621 gen_vec_set_0
= gen_vec_setv32bf_0
;
15624 use_vector_set
= TARGET_AVX512FP16
&& one_var
== 0;
15625 gen_vec_set_0
= gen_vec_setv32hi_0
;
15630 if (use_vector_set
)
15632 if (gen_vec_set_0
&& one_var
== 0)
15634 var
= force_reg (GET_MODE_INNER (mode
), var
);
15635 emit_insn (gen_vec_set_0 (target
, CONST0_RTX (mode
), var
));
15638 emit_insn (gen_rtx_SET (target
, CONST0_RTX (mode
)));
15639 var
= force_reg (GET_MODE_INNER (mode
), var
);
15640 ix86_expand_vector_set (mmx_ok
, target
, var
, one_var
);
15656 var
= force_reg (GET_MODE_INNER (mode
), var
);
15657 x
= gen_rtx_VEC_CONCAT (mode
, var
, CONST0_RTX (GET_MODE_INNER (mode
)));
15658 emit_insn (gen_rtx_SET (target
, x
));
15663 if (!REG_P (target
) || REGNO (target
) < FIRST_PSEUDO_REGISTER
)
15664 new_target
= gen_reg_rtx (mode
);
15666 new_target
= target
;
15667 var
= force_reg (GET_MODE_INNER (mode
), var
);
15668 x
= gen_rtx_VEC_DUPLICATE (mode
, var
);
15669 x
= gen_rtx_VEC_MERGE (mode
, x
, CONST0_RTX (mode
), const1_rtx
);
15670 emit_insn (gen_rtx_SET (new_target
, x
));
15673 /* We need to shuffle the value to the correct position, so
15674 create a new pseudo to store the intermediate result. */
15676 /* With SSE2, we can use the integer shuffle insns. */
15677 if (mode
!= V4SFmode
&& TARGET_SSE2
)
15679 emit_insn (gen_sse2_pshufd_1 (new_target
, new_target
,
15681 GEN_INT (one_var
== 1 ? 0 : 1),
15682 GEN_INT (one_var
== 2 ? 0 : 1),
15683 GEN_INT (one_var
== 3 ? 0 : 1)));
15684 if (target
!= new_target
)
15685 emit_move_insn (target
, new_target
);
15689 /* Otherwise convert the intermediate result to V4SFmode and
15690 use the SSE1 shuffle instructions. */
15691 if (mode
!= V4SFmode
)
15693 tmp
= gen_reg_rtx (V4SFmode
);
15694 emit_move_insn (tmp
, gen_lowpart (V4SFmode
, new_target
));
15699 emit_insn (gen_sse_shufps_v4sf (tmp
, tmp
, tmp
,
15701 GEN_INT (one_var
== 1 ? 0 : 1),
15702 GEN_INT (one_var
== 2 ? 0+4 : 1+4),
15703 GEN_INT (one_var
== 3 ? 0+4 : 1+4)));
15705 if (mode
!= V4SFmode
)
15706 emit_move_insn (target
, gen_lowpart (V4SImode
, tmp
));
15707 else if (tmp
!= target
)
15708 emit_move_insn (target
, tmp
);
15710 else if (target
!= new_target
)
15711 emit_move_insn (target
, new_target
);
15716 vsimode
= V4SImode
;
15722 vsimode
= V2SImode
;
15728 /* Zero extend the variable element to SImode and recurse. */
15729 var
= convert_modes (SImode
, GET_MODE_INNER (mode
), var
, true);
15731 x
= gen_reg_rtx (vsimode
);
15732 if (!ix86_expand_vector_init_one_nonzero (mmx_ok
, vsimode
, x
,
15734 gcc_unreachable ();
15736 emit_move_insn (target
, gen_lowpart (mode
, x
));
/* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
   consisting of the values in VALS.  It is known that all elements
   except ONE_VAR are constants.  Return true if successful.  */

static bool
ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
				 rtx target, rtx vals, int one_var)
{
  rtx var = XVECEXP (vals, 0, one_var);
  machine_mode wmode;
  rtx const_vec, x;

  const_vec = copy_rtx (vals);
  XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
  const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
15766 /* For the two element vectors, it's just as easy to use
15767 the general case. */
15771 /* Use ix86_expand_vector_set in 64bit mode only. */
15796 if (TARGET_MMX_WITH_SSE
&& TARGET_SSE4_1
)
15805 /* There's no way to set one QImode entry easily. Combine
15806 the variable value with its adjacent constant value, and
15807 promote to an HImode set. */
15808 x
= XVECEXP (vals
, 0, one_var
^ 1);
15811 var
= convert_modes (HImode
, QImode
, var
, true);
15812 var
= expand_simple_binop (HImode
, ASHIFT
, var
, GEN_INT (8),
15813 NULL_RTX
, 1, OPTAB_LIB_WIDEN
);
15814 x
= GEN_INT (INTVAL (x
) & 0xff);
15818 var
= convert_modes (HImode
, QImode
, var
, true);
15819 x
= gen_int_mode (UINTVAL (x
) << 8, HImode
);
15821 if (x
!= const0_rtx
)
15822 var
= expand_simple_binop (HImode
, IOR
, var
, x
, var
,
15823 1, OPTAB_LIB_WIDEN
);
15825 x
= gen_reg_rtx (wmode
);
15826 emit_move_insn (x
, gen_lowpart (wmode
, const_vec
));
15827 ix86_expand_vector_set (mmx_ok
, x
, var
, one_var
>> 1);
15829 emit_move_insn (target
, gen_lowpart (mode
, x
));
15836 emit_move_insn (target
, const_vec
);
15837 ix86_expand_vector_set (mmx_ok
, target
, var
, one_var
);
/* A subroutine of ix86_expand_vector_init_general.  Use vector
   concatenate to handle the most general case: all values variable,
   and none identical.  */

static void
ix86_expand_vector_init_concat (machine_mode mode,
				rtx target, rtx *ops, int n)
{
  machine_mode half_mode = VOIDmode;
15860 half_mode
= V16HFmode
;
15863 half_mode
= V16BFmode
;
15866 half_mode
= V8SImode
;
15869 half_mode
= V8SFmode
;
15872 half_mode
= V4DImode
;
15875 half_mode
= V4DFmode
;
15878 half_mode
= V8HFmode
;
15881 half_mode
= V8BFmode
;
15884 half_mode
= V4SImode
;
15887 half_mode
= V4SFmode
;
15890 half_mode
= V2DImode
;
15893 half_mode
= V2DFmode
;
15896 half_mode
= V2SImode
;
15899 half_mode
= V2SFmode
;
15902 half_mode
= DImode
;
15905 half_mode
= SImode
;
15908 half_mode
= DFmode
;
15911 half_mode
= SFmode
;
15914 gcc_unreachable ();
15917 if (!register_operand (ops
[1], half_mode
))
15918 ops
[1] = force_reg (half_mode
, ops
[1]);
15919 if (!register_operand (ops
[0], half_mode
))
15920 ops
[0] = force_reg (half_mode
, ops
[0]);
15921 emit_insn (gen_rtx_SET (target
, gen_rtx_VEC_CONCAT (mode
, ops
[0],
15929 half_mode
= V2DImode
;
15932 half_mode
= V2DFmode
;
15935 half_mode
= V2SImode
;
15938 half_mode
= V2SFmode
;
15941 gcc_unreachable ();
15949 half_mode
= V4DImode
;
15952 half_mode
= V4DFmode
;
15955 half_mode
= V4SImode
;
15958 half_mode
= V4SFmode
;
15961 gcc_unreachable ();
15969 half_mode
= V8SImode
;
15972 half_mode
= V8SFmode
;
15975 gcc_unreachable ();
15980 /* FIXME: We process inputs backward to help RA. PR 36222. */
15982 for (j
= 1; j
!= -1; j
--)
15984 half
[j
] = gen_reg_rtx (half_mode
);
15988 v
= gen_rtvec (2, ops
[i
-1], ops
[i
]);
15992 v
= gen_rtvec (4, ops
[i
-3], ops
[i
-2], ops
[i
-1], ops
[i
]);
15996 v
= gen_rtvec (8, ops
[i
-7], ops
[i
-6], ops
[i
-5], ops
[i
-4],
15997 ops
[i
-3], ops
[i
-2], ops
[i
-1], ops
[i
]);
16001 gcc_unreachable ();
16003 ix86_expand_vector_init (false, half
[j
],
16004 gen_rtx_PARALLEL (half_mode
, v
));
16007 ix86_expand_vector_init_concat (mode
, target
, half
, 2);
16011 gcc_unreachable ();
/* A subroutine of ix86_expand_vector_init_general.  Use vector
   interleave to handle the most general case: all values variable,
   and none identical.  */

static void
ix86_expand_vector_init_interleave (machine_mode mode,
				    rtx target, rtx *ops, int n)
{
  machine_mode first_imode, second_imode, third_imode, inner_mode;
16026 rtx (*gen_load_even
) (rtx
, rtx
, rtx
);
16027 rtx (*gen_interleave_first_low
) (rtx
, rtx
, rtx
);
16028 rtx (*gen_interleave_second_low
) (rtx
, rtx
, rtx
);
16033 gen_load_even
= gen_vec_interleave_lowv8hf
;
16034 gen_interleave_first_low
= gen_vec_interleave_lowv4si
;
16035 gen_interleave_second_low
= gen_vec_interleave_lowv2di
;
16036 inner_mode
= HFmode
;
16037 first_imode
= V4SImode
;
16038 second_imode
= V2DImode
;
16039 third_imode
= VOIDmode
;
16042 gen_load_even
= gen_vec_interleave_lowv8bf
;
16043 gen_interleave_first_low
= gen_vec_interleave_lowv4si
;
16044 gen_interleave_second_low
= gen_vec_interleave_lowv2di
;
16045 inner_mode
= BFmode
;
16046 first_imode
= V4SImode
;
16047 second_imode
= V2DImode
;
16048 third_imode
= VOIDmode
;
16051 gen_load_even
= gen_vec_setv8hi
;
16052 gen_interleave_first_low
= gen_vec_interleave_lowv4si
;
16053 gen_interleave_second_low
= gen_vec_interleave_lowv2di
;
16054 inner_mode
= HImode
;
16055 first_imode
= V4SImode
;
16056 second_imode
= V2DImode
;
16057 third_imode
= VOIDmode
;
16060 gen_load_even
= gen_vec_setv16qi
;
16061 gen_interleave_first_low
= gen_vec_interleave_lowv8hi
;
16062 gen_interleave_second_low
= gen_vec_interleave_lowv4si
;
16063 inner_mode
= QImode
;
16064 first_imode
= V8HImode
;
16065 second_imode
= V4SImode
;
16066 third_imode
= V2DImode
;
16069 gcc_unreachable ();
16072 for (i
= 0; i
< n
; i
++)
16075 if (inner_mode
== HFmode
|| inner_mode
== BFmode
)
	  /* Use vpunpcklwd to pack two HFmode or BFmode values.  */
16079 machine_mode vec_mode
=
16080 (inner_mode
== HFmode
) ? V8HFmode
: V8BFmode
;
16081 op0
= gen_reg_rtx (vec_mode
);
16082 even
= lowpart_subreg (vec_mode
,
16083 force_reg (inner_mode
, op
), inner_mode
);
16084 odd
= lowpart_subreg (vec_mode
,
16085 force_reg (inner_mode
, ops
[i
+ i
+ 1]),
16087 emit_insn (gen_load_even (op0
, even
, odd
));
	  /* Extend the odd element to SImode using a paradoxical SUBREG.  */
16092 op0
= gen_reg_rtx (SImode
);
16093 emit_move_insn (op0
, gen_lowpart (SImode
, op
));
16095 /* Insert the SImode value as low element of V4SImode vector. */
16096 op1
= gen_reg_rtx (V4SImode
);
16097 op0
= gen_rtx_VEC_MERGE (V4SImode
,
16098 gen_rtx_VEC_DUPLICATE (V4SImode
,
16100 CONST0_RTX (V4SImode
),
16102 emit_insn (gen_rtx_SET (op1
, op0
));
	  /* Cast the V4SImode vector back to a vector in the original mode.  */
16105 op0
= gen_reg_rtx (mode
);
16106 emit_move_insn (op0
, gen_lowpart (mode
, op1
));
16108 /* Load even elements into the second position. */
16109 emit_insn (gen_load_even (op0
,
16110 force_reg (inner_mode
,
16115 /* Cast vector to FIRST_IMODE vector. */
16116 ops
[i
] = gen_reg_rtx (first_imode
);
16117 emit_move_insn (ops
[i
], gen_lowpart (first_imode
, op0
));
16120 /* Interleave low FIRST_IMODE vectors. */
16121 for (i
= j
= 0; i
< n
; i
+= 2, j
++)
16123 op0
= gen_reg_rtx (first_imode
);
16124 emit_insn (gen_interleave_first_low (op0
, ops
[i
], ops
[i
+ 1]));
16126 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
16127 ops
[j
] = gen_reg_rtx (second_imode
);
16128 emit_move_insn (ops
[j
], gen_lowpart (second_imode
, op0
));
16131 /* Interleave low SECOND_IMODE vectors. */
16132 switch (second_imode
)
16135 for (i
= j
= 0; i
< n
/ 2; i
+= 2, j
++)
16137 op0
= gen_reg_rtx (second_imode
);
16138 emit_insn (gen_interleave_second_low (op0
, ops
[i
],
16141 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
16143 ops
[j
] = gen_reg_rtx (third_imode
);
16144 emit_move_insn (ops
[j
], gen_lowpart (third_imode
, op0
));
16146 second_imode
= V2DImode
;
16147 gen_interleave_second_low
= gen_vec_interleave_lowv2di
;
16151 op0
= gen_reg_rtx (second_imode
);
16152 emit_insn (gen_interleave_second_low (op0
, ops
[0],
16155 /* Cast the SECOND_IMODE vector back to a vector on original
16157 emit_insn (gen_rtx_SET (target
, gen_lowpart (mode
, op0
)));
16161 gcc_unreachable ();
/* A subroutine of ix86_expand_vector_init.  Handle the most general case:
   all values variable, and none identical.  */

static void
ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
				 rtx target, rtx vals)
{
  rtx ops[64], op0, op1, op2, op3, op4, op5;
  machine_mode half_mode = VOIDmode;
  machine_mode quarter_mode = VOIDmode;
16181 if (!mmx_ok
&& !TARGET_SSE
)
16197 n
= GET_MODE_NUNITS (mode
);
16198 for (i
= 0; i
< n
; i
++)
16199 ops
[i
] = XVECEXP (vals
, 0, i
);
16200 ix86_expand_vector_init_concat (mode
, target
, ops
, n
);
16204 for (i
= 0; i
< 2; i
++)
16205 ops
[i
] = gen_lowpart (V2DImode
, XVECEXP (vals
, 0, i
));
16206 op0
= gen_reg_rtx (V4DImode
);
16207 ix86_expand_vector_init_concat (V4DImode
, op0
, ops
, 2);
16208 emit_move_insn (target
, gen_lowpart (GET_MODE (target
), op0
));
16212 for (i
= 0; i
< 4; i
++)
16213 ops
[i
] = gen_lowpart (V2DImode
, XVECEXP (vals
, 0, i
));
16214 ops
[4] = gen_reg_rtx (V4DImode
);
16215 ix86_expand_vector_init_concat (V4DImode
, ops
[4], ops
, 2);
16216 ops
[5] = gen_reg_rtx (V4DImode
);
16217 ix86_expand_vector_init_concat (V4DImode
, ops
[5], ops
+ 2, 2);
16218 op0
= gen_reg_rtx (V8DImode
);
16219 ix86_expand_vector_init_concat (V8DImode
, op0
, ops
+ 4, 2);
16220 emit_move_insn (target
, gen_lowpart (GET_MODE (target
), op0
));
16224 half_mode
= V16QImode
;
16228 half_mode
= V8HImode
;
16232 half_mode
= V8HFmode
;
16236 half_mode
= V8BFmode
;
16240 n
= GET_MODE_NUNITS (mode
);
16241 for (i
= 0; i
< n
; i
++)
16242 ops
[i
] = XVECEXP (vals
, 0, i
);
16243 op0
= gen_reg_rtx (half_mode
);
16244 op1
= gen_reg_rtx (half_mode
);
16245 ix86_expand_vector_init_interleave (half_mode
, op0
, ops
,
16247 ix86_expand_vector_init_interleave (half_mode
, op1
,
16248 &ops
[n
>> 1], n
>> 2);
16249 emit_insn (gen_rtx_SET (target
, gen_rtx_VEC_CONCAT (mode
, op0
, op1
)));
16253 quarter_mode
= V16QImode
;
16254 half_mode
= V32QImode
;
16258 quarter_mode
= V8HImode
;
16259 half_mode
= V16HImode
;
16263 quarter_mode
= V8HFmode
;
16264 half_mode
= V16HFmode
;
16268 quarter_mode
= V8BFmode
;
16269 half_mode
= V16BFmode
;
16273 n
= GET_MODE_NUNITS (mode
);
16274 for (i
= 0; i
< n
; i
++)
16275 ops
[i
] = XVECEXP (vals
, 0, i
);
16276 op0
= gen_reg_rtx (quarter_mode
);
16277 op1
= gen_reg_rtx (quarter_mode
);
16278 op2
= gen_reg_rtx (quarter_mode
);
16279 op3
= gen_reg_rtx (quarter_mode
);
16280 op4
= gen_reg_rtx (half_mode
);
16281 op5
= gen_reg_rtx (half_mode
);
16282 ix86_expand_vector_init_interleave (quarter_mode
, op0
, ops
,
16284 ix86_expand_vector_init_interleave (quarter_mode
, op1
,
16285 &ops
[n
>> 2], n
>> 3);
16286 ix86_expand_vector_init_interleave (quarter_mode
, op2
,
16287 &ops
[n
>> 1], n
>> 3);
16288 ix86_expand_vector_init_interleave (quarter_mode
, op3
,
16289 &ops
[(n
>> 1) | (n
>> 2)], n
>> 3);
16290 emit_insn (gen_rtx_SET (op4
, gen_rtx_VEC_CONCAT (half_mode
, op0
, op1
)));
16291 emit_insn (gen_rtx_SET (op5
, gen_rtx_VEC_CONCAT (half_mode
, op2
, op3
)));
16292 emit_insn (gen_rtx_SET (target
, gen_rtx_VEC_CONCAT (mode
, op4
, op5
)));
16296 if (!TARGET_SSE4_1
)
16304 /* Don't use ix86_expand_vector_init_interleave if we can't
16305 move from GPR to SSE register directly. */
16306 if (!TARGET_INTER_UNIT_MOVES_TO_VEC
)
16313 n
= GET_MODE_NUNITS (mode
);
16314 for (i
= 0; i
< n
; i
++)
16315 ops
[i
] = XVECEXP (vals
, 0, i
);
16316 ix86_expand_vector_init_interleave (mode
, target
, ops
, n
>> 1);
16327 gcc_unreachable ();
16331 int i
, j
, n_elts
, n_words
, n_elt_per_word
;
16332 machine_mode tmp_mode
, inner_mode
;
16333 rtx words
[4], shift
;
16335 tmp_mode
= (GET_MODE_SIZE (mode
) < UNITS_PER_WORD
) ? SImode
: word_mode
;
16337 inner_mode
= GET_MODE_INNER (mode
);
16338 n_elts
= GET_MODE_NUNITS (mode
);
16339 n_words
= GET_MODE_SIZE (mode
) / GET_MODE_SIZE (tmp_mode
);
16340 n_elt_per_word
= n_elts
/ n_words
;
16341 shift
= GEN_INT (GET_MODE_BITSIZE (inner_mode
));
16343 for (i
= 0; i
< n_words
; ++i
)
16345 rtx word
= NULL_RTX
;
16347 for (j
= 0; j
< n_elt_per_word
; ++j
)
16349 rtx elt
= XVECEXP (vals
, 0, (i
+1)*n_elt_per_word
- j
- 1);
16350 elt
= convert_modes (tmp_mode
, inner_mode
, elt
, true);
16356 word
= expand_simple_binop (tmp_mode
, ASHIFT
, word
, shift
,
16357 NULL_RTX
, 1, OPTAB_LIB_WIDEN
);
16358 word
= expand_simple_binop (tmp_mode
, IOR
, word
, elt
,
16359 NULL_RTX
, 1, OPTAB_LIB_WIDEN
);
16367 emit_move_insn (target
, gen_lowpart (mode
, words
[0]));
16368 else if (n_words
== 2)
16370 rtx tmp
= gen_reg_rtx (mode
);
16371 emit_clobber (tmp
);
16372 emit_move_insn (gen_lowpart (tmp_mode
, tmp
), words
[0]);
16373 emit_move_insn (gen_highpart (tmp_mode
, tmp
), words
[1]);
16374 emit_move_insn (target
, tmp
);
16376 else if (n_words
== 4)
16378 rtx tmp
= gen_reg_rtx (V4SImode
);
16379 gcc_assert (tmp_mode
== SImode
);
16380 vals
= gen_rtx_PARALLEL (V4SImode
, gen_rtvec_v (4, words
));
16381 ix86_expand_vector_init_general (false, V4SImode
, tmp
, vals
);
16382 emit_move_insn (target
, gen_lowpart (mode
, tmp
));
16385 gcc_unreachable ();
/* Initialize vector TARGET via VALS.  Suppress the use of MMX
   instructions unless MMX_OK is true.  */

void
ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
{
  machine_mode mode = GET_MODE (target);
  machine_mode inner_mode = GET_MODE_INNER (mode);
  int n_elts = GET_MODE_NUNITS (mode);
  int n_var = 0, one_var = -1;
  bool all_same = true, all_const_zero = true;
  rtx x;
  int i;
16403 /* Handle first initialization from vector elts. */
16404 if (n_elts
!= XVECLEN (vals
, 0))
16406 rtx subtarget
= target
;
16407 x
= XVECEXP (vals
, 0, 0);
16408 gcc_assert (GET_MODE_INNER (GET_MODE (x
)) == inner_mode
);
16409 if (GET_MODE_NUNITS (GET_MODE (x
)) * 2 == n_elts
)
16411 rtx ops
[2] = { XVECEXP (vals
, 0, 0), XVECEXP (vals
, 0, 1) };
16412 if (inner_mode
== QImode
16413 || inner_mode
== HImode
16414 || inner_mode
== TImode
16415 || inner_mode
== HFmode
16416 || inner_mode
== BFmode
)
16418 unsigned int n_bits
= n_elts
* GET_MODE_SIZE (inner_mode
);
16419 scalar_mode elt_mode
= inner_mode
== TImode
? DImode
: SImode
;
16420 n_bits
/= GET_MODE_SIZE (elt_mode
);
16421 mode
= mode_for_vector (elt_mode
, n_bits
).require ();
16422 inner_mode
= mode_for_vector (elt_mode
, n_bits
/ 2).require ();
16423 ops
[0] = gen_lowpart (inner_mode
, ops
[0]);
16424 ops
[1] = gen_lowpart (inner_mode
, ops
[1]);
16425 subtarget
= gen_reg_rtx (mode
);
16427 ix86_expand_vector_init_concat (mode
, subtarget
, ops
, 2);
16428 if (subtarget
!= target
)
16429 emit_move_insn (target
, gen_lowpart (GET_MODE (target
), subtarget
));
16432 gcc_unreachable ();
16435 for (i
= 0; i
< n_elts
; ++i
)
16437 x
= XVECEXP (vals
, 0, i
);
16438 if (!(CONST_SCALAR_INT_P (x
)
16439 || CONST_DOUBLE_P (x
)
16440 || CONST_FIXED_P (x
)))
16441 n_var
++, one_var
= i
;
16442 else if (x
!= CONST0_RTX (inner_mode
))
16443 all_const_zero
= false;
16444 if (i
> 0 && !rtx_equal_p (x
, XVECEXP (vals
, 0, 0)))
16448 /* Constants are best loaded from the constant pool. */
16451 emit_move_insn (target
, gen_rtx_CONST_VECTOR (mode
, XVEC (vals
, 0)));
16455 /* If all values are identical, broadcast the value. */
16457 && ix86_expand_vector_init_duplicate (mmx_ok
, mode
, target
,
16458 XVECEXP (vals
, 0, 0)))
16461 /* Values where only one field is non-constant are best loaded from
16462 the pool and overwritten via move later. */
16466 && ix86_expand_vector_init_one_nonzero (mmx_ok
, mode
, target
,
16467 XVECEXP (vals
, 0, one_var
),
16471 if (ix86_expand_vector_init_one_var (mmx_ok
, mode
, target
, vals
, one_var
))
16475 ix86_expand_vector_init_general (mmx_ok
, mode
, target
, vals
);
/* Expand an element insert with a variable index, i.e. the equivalent of

     V setg (V v, int idx, T val)
     {
       V idxv = (V){idx, idx, idx, idx, idx, idx, idx, idx};
       V valv = (V){val, val, val, val, val, val, val, val};
       V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == idxv);
       v = (v & ~mask) | (valv & mask);
       return v;
     }  */

void
ix86_expand_vector_set_var (rtx target, rtx val, rtx idx)
{
  machine_mode mode = GET_MODE (target);
  machine_mode cmp_mode = mode;
  int n_elts = GET_MODE_NUNITS (mode);
  rtx valv, idxv, constv, idx_tmp;
;
  /* 512-bit vector byte/word broadcast and comparison are only available
     with TARGET_AVX512BW; without it, split the 512-bit vector into two
     256-bit halves.  */
16500 if ((mode
== V32HImode
|| mode
== V32HFmode
|| mode
== V32BFmode
16501 || mode
== V64QImode
)
16502 && !TARGET_AVX512BW
)
16504 gcc_assert (TARGET_AVX512F
);
16505 rtx vhi
, vlo
, idx_hi
;
16506 machine_mode half_mode
;
16507 rtx (*extract_hi
)(rtx
, rtx
);
16508 rtx (*extract_lo
)(rtx
, rtx
);
16510 if (mode
== V32HImode
)
16512 half_mode
= V16HImode
;
16513 extract_hi
= gen_vec_extract_hi_v32hi
;
16514 extract_lo
= gen_vec_extract_lo_v32hi
;
16516 else if (mode
== V32HFmode
)
16518 half_mode
= V16HFmode
;
16519 extract_hi
= gen_vec_extract_hi_v32hf
;
16520 extract_lo
= gen_vec_extract_lo_v32hf
;
16522 else if (mode
== V32BFmode
)
16524 half_mode
= V16BFmode
;
16525 extract_hi
= gen_vec_extract_hi_v32bf
;
16526 extract_lo
= gen_vec_extract_lo_v32bf
;
16530 half_mode
= V32QImode
;
16531 extract_hi
= gen_vec_extract_hi_v64qi
;
16532 extract_lo
= gen_vec_extract_lo_v64qi
;
16535 vhi
= gen_reg_rtx (half_mode
);
16536 vlo
= gen_reg_rtx (half_mode
);
16537 idx_hi
= gen_reg_rtx (GET_MODE (idx
));
16538 emit_insn (extract_hi (vhi
, target
));
16539 emit_insn (extract_lo (vlo
, target
));
16542 vec
[2] = GEN_INT (n_elts
/2);
16543 ix86_expand_binary_operator (MINUS
, GET_MODE (idx
), vec
);
16544 ix86_expand_vector_set_var (vhi
, val
, idx_hi
);
16545 ix86_expand_vector_set_var (vlo
, val
, idx
);
16546 emit_insn (gen_rtx_SET (target
, gen_rtx_VEC_CONCAT (mode
, vlo
, vhi
)));
16550 if (FLOAT_MODE_P (GET_MODE_INNER (mode
)))
16555 cmp_mode
= V2DImode
;
16558 cmp_mode
= V4DImode
;
16561 cmp_mode
= V8DImode
;
16564 cmp_mode
= V2SImode
;
16567 cmp_mode
= V4SImode
;
16570 cmp_mode
= V8SImode
;
16573 cmp_mode
= V16SImode
;
16576 cmp_mode
= V8HImode
;
16579 cmp_mode
= V16HImode
;
16582 cmp_mode
= V32HImode
;
16585 cmp_mode
= V8HImode
;
16588 cmp_mode
= V16HImode
;
16591 cmp_mode
= V32HImode
;
16594 gcc_unreachable ();
16598 for (int i
= 0; i
!= n_elts
; i
++)
16599 vec
[i
] = GEN_INT (i
);
16600 constv
= gen_rtx_CONST_VECTOR (cmp_mode
, gen_rtvec_v (n_elts
, vec
));
16601 valv
= gen_reg_rtx (mode
);
16602 idxv
= gen_reg_rtx (cmp_mode
);
16603 idx_tmp
= convert_to_mode (GET_MODE_INNER (cmp_mode
), idx
, 1);
16605 ok
= ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE
,
16608 ok
= ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE
,
16609 cmp_mode
, idxv
, idx_tmp
);
16614 vec
[3] = gen_rtx_EQ (mode
, idxv
, constv
);
16617 ok
= ix86_expand_int_vcond (vec
);
void
ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
{
  machine_mode mode = GET_MODE (target);
  machine_mode inner_mode = GET_MODE_INNER (mode);
  machine_mode half_mode;
  bool use_vec_merge = false;
  bool blendm_const = false;
  rtx tmp;
16630 static rtx (*gen_extract
[8][2]) (rtx
, rtx
)
16632 { gen_vec_extract_lo_v32qi
, gen_vec_extract_hi_v32qi
},
16633 { gen_vec_extract_lo_v16hi
, gen_vec_extract_hi_v16hi
},
16634 { gen_vec_extract_lo_v8si
, gen_vec_extract_hi_v8si
},
16635 { gen_vec_extract_lo_v4di
, gen_vec_extract_hi_v4di
},
16636 { gen_vec_extract_lo_v8sf
, gen_vec_extract_hi_v8sf
},
16637 { gen_vec_extract_lo_v4df
, gen_vec_extract_hi_v4df
},
16638 { gen_vec_extract_lo_v16hf
, gen_vec_extract_hi_v16hf
},
16639 { gen_vec_extract_lo_v16bf
, gen_vec_extract_hi_v16bf
}
16641 static rtx (*gen_insert
[8][2]) (rtx
, rtx
, rtx
)
16643 { gen_vec_set_lo_v32qi
, gen_vec_set_hi_v32qi
},
16644 { gen_vec_set_lo_v16hi
, gen_vec_set_hi_v16hi
},
16645 { gen_vec_set_lo_v8si
, gen_vec_set_hi_v8si
},
16646 { gen_vec_set_lo_v4di
, gen_vec_set_hi_v4di
},
16647 { gen_vec_set_lo_v8sf
, gen_vec_set_hi_v8sf
},
16648 { gen_vec_set_lo_v4df
, gen_vec_set_hi_v4df
},
16649 { gen_vec_set_lo_v16hf
, gen_vec_set_hi_v16hf
},
16650 { gen_vec_set_lo_v16bf
, gen_vec_set_hi_v16bf
},
16653 machine_mode mmode
= VOIDmode
;
16654 rtx (*gen_blendm
) (rtx
, rtx
, rtx
, rtx
);
16659 use_vec_merge
= TARGET_MMX_WITH_SSE
&& TARGET_SSE4_1
;
16667 tmp
= gen_reg_rtx (GET_MODE_INNER (mode
));
16668 ix86_expand_vector_extract (true, tmp
, target
, 1 - elt
);
16670 tmp
= gen_rtx_VEC_CONCAT (mode
, val
, tmp
);
16672 tmp
= gen_rtx_VEC_CONCAT (mode
, tmp
, val
);
16673 emit_insn (gen_rtx_SET (target
, tmp
));
16679 use_vec_merge
= TARGET_SSE4_1
&& TARGET_64BIT
;
16683 tmp
= gen_reg_rtx (GET_MODE_INNER (mode
));
16684 ix86_expand_vector_extract (false, tmp
, target
, 1 - elt
);
16686 tmp
= gen_rtx_VEC_CONCAT (mode
, val
, tmp
);
16688 tmp
= gen_rtx_VEC_CONCAT (mode
, tmp
, val
);
16689 emit_insn (gen_rtx_SET (target
, tmp
));
16693 /* NB: For ELT == 0, use standard scalar operation patterns which
16694 preserve the rest of the vector for combiner:
16697 (vec_duplicate:V2DF (reg:DF))
16707 /* For the two element vectors, we implement a VEC_CONCAT with
16708 the extraction of the other element. */
16710 tmp
= gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (1, GEN_INT (1 - elt
)));
16711 tmp
= gen_rtx_VEC_SELECT (inner_mode
, target
, tmp
);
16714 op0
= val
, op1
= tmp
;
16716 op0
= tmp
, op1
= val
;
16718 tmp
= gen_rtx_VEC_CONCAT (mode
, op0
, op1
);
16719 emit_insn (gen_rtx_SET (target
, tmp
));
16724 use_vec_merge
= TARGET_SSE4_1
;
16731 use_vec_merge
= true;
16735 /* tmp = target = A B C D */
16736 tmp
= copy_to_reg (target
);
16737 /* target = A A B B */
16738 emit_insn (gen_vec_interleave_lowv4sf (target
, target
, target
));
16739 /* target = X A B B */
16740 ix86_expand_vector_set (false, target
, val
, 0);
16741 /* target = A X C D */
16742 emit_insn (gen_sse_shufps_v4sf (target
, target
, tmp
,
16743 const1_rtx
, const0_rtx
,
16744 GEN_INT (2+4), GEN_INT (3+4)));
16748 /* tmp = target = A B C D */
16749 tmp
= copy_to_reg (target
);
16750 /* tmp = X B C D */
16751 ix86_expand_vector_set (false, tmp
, val
, 0);
16752 /* target = A B X D */
16753 emit_insn (gen_sse_shufps_v4sf (target
, target
, tmp
,
16754 const0_rtx
, const1_rtx
,
16755 GEN_INT (0+4), GEN_INT (3+4)));
16759 /* tmp = target = A B C D */
16760 tmp
= copy_to_reg (target
);
16761 /* tmp = X B C D */
16762 ix86_expand_vector_set (false, tmp
, val
, 0);
16763 /* target = A B X D */
16764 emit_insn (gen_sse_shufps_v4sf (target
, target
, tmp
,
16765 const0_rtx
, const1_rtx
,
16766 GEN_INT (2+4), GEN_INT (0+4)));
16770 gcc_unreachable ();
16775 use_vec_merge
= TARGET_SSE4_1
;
16779 /* Element 0 handled by vec_merge below. */
16782 use_vec_merge
= true;
16788 /* With SSE2, use integer shuffles to swap element 0 and ELT,
16789 store into element 0, then shuffle them back. */
16793 order
[0] = GEN_INT (elt
);
16794 order
[1] = const1_rtx
;
16795 order
[2] = const2_rtx
;
16796 order
[3] = GEN_INT (3);
16797 order
[elt
] = const0_rtx
;
16799 emit_insn (gen_sse2_pshufd_1 (target
, target
, order
[0],
16800 order
[1], order
[2], order
[3]));
16802 ix86_expand_vector_set (false, target
, val
, 0);
16804 emit_insn (gen_sse2_pshufd_1 (target
, target
, order
[0],
16805 order
[1], order
[2], order
[3]));
16809 /* For SSE1, we have to reuse the V4SF code. */
16810 rtx t
= gen_reg_rtx (V4SFmode
);
16811 emit_move_insn (t
, gen_lowpart (V4SFmode
, target
));
16812 ix86_expand_vector_set (false, t
, gen_lowpart (SFmode
, val
), elt
);
16813 emit_move_insn (target
, gen_lowpart (mode
, t
));
16821 use_vec_merge
= TARGET_SSE2
;
16824 use_vec_merge
= mmx_ok
&& (TARGET_SSE
|| TARGET_3DNOW_A
);
16829 use_vec_merge
= TARGET_SSE4_1
;
16833 use_vec_merge
= TARGET_MMX_WITH_SSE
&& TARGET_SSE4_1
;
16837 half_mode
= V16QImode
;
16844 /* For ELT == 0, vec_setv8hf_0 can save 1 vpbroadcastw. */
16845 if (TARGET_AVX2
&& elt
!= 0)
16848 gen_blendm
= ((mode
== E_V16HFmode
) ? gen_avx2_pblendph_1
16849 : gen_avx2_pblendbf_1
);
16850 blendm_const
= true;
16855 half_mode
= ((mode
== E_V16HFmode
) ? V8HFmode
: V8BFmode
);
16856 j
= ((mode
== E_V16HFmode
) ? 6 : 7);
16862 half_mode
= V8HImode
;
16868 half_mode
= V4SImode
;
16874 half_mode
= V2DImode
;
16880 half_mode
= V4SFmode
;
16886 half_mode
= V2DFmode
;
16892 /* Compute offset. */
16896 gcc_assert (i
<= 1);
16898 /* Extract the half. */
16899 tmp
= gen_reg_rtx (half_mode
);
16900 emit_insn (gen_extract
[j
][i
] (tmp
, target
));
16902 /* Put val in tmp at elt. */
16903 ix86_expand_vector_set (false, tmp
, val
, elt
);
16906 emit_insn (gen_insert
[j
][i
] (target
, target
, tmp
));
16910 if (TARGET_AVX512F
)
16913 gen_blendm
= gen_avx512f_blendmv8df
;
16918 if (TARGET_AVX512F
)
16921 gen_blendm
= gen_avx512f_blendmv8di
;
16926 if (TARGET_AVX512F
)
16929 gen_blendm
= gen_avx512f_blendmv16sf
;
16934 if (TARGET_AVX512F
)
16937 gen_blendm
= gen_avx512f_blendmv16si
;
16942 if (TARGET_AVX512BW
)
16945 gen_blendm
= gen_avx512bw_blendmv32hf
;
16949 if (TARGET_AVX512BW
)
16952 gen_blendm
= gen_avx512bw_blendmv32bf
;
16956 if (TARGET_AVX512BW
)
16959 gen_blendm
= gen_avx512bw_blendmv32hi
;
16961 else if (TARGET_AVX512F
)
16963 half_mode
= E_V8HImode
;
16970 if (TARGET_AVX512BW
)
16973 gen_blendm
= gen_avx512bw_blendmv64qi
;
16975 else if (TARGET_AVX512F
)
16977 half_mode
= E_V16QImode
;
16984 /* Compute offset. */
16988 gcc_assert (i
<= 3);
16991 /* Extract the quarter. */
16992 tmp
= gen_reg_rtx (V4SImode
);
16993 rtx tmp2
= gen_lowpart (V16SImode
, target
);
16994 rtx mask
= gen_reg_rtx (QImode
);
16996 emit_move_insn (mask
, constm1_rtx
);
16997 emit_insn (gen_avx512f_vextracti32x4_mask (tmp
, tmp2
, GEN_INT (i
),
17000 tmp2
= gen_reg_rtx (half_mode
);
17001 emit_move_insn (tmp2
, gen_lowpart (half_mode
, tmp
));
17004 /* Put val in tmp at elt. */
17005 ix86_expand_vector_set (false, tmp
, val
, elt
);
17008 tmp2
= gen_reg_rtx (V16SImode
);
17009 rtx tmp3
= gen_lowpart (V16SImode
, target
);
17010 mask
= gen_reg_rtx (HImode
);
17011 emit_move_insn (mask
, constm1_rtx
);
17012 tmp
= gen_lowpart (V4SImode
, tmp
);
17013 emit_insn (gen_avx512f_vinserti32x4_mask (tmp2
, tmp3
, tmp
, GEN_INT (i
),
17015 emit_move_insn (target
, gen_lowpart (mode
, tmp2
));
17023 if (mmode
!= VOIDmode
)
17025 tmp
= gen_reg_rtx (mode
);
17026 emit_insn (gen_rtx_SET (tmp
, gen_rtx_VEC_DUPLICATE (mode
, val
)));
17027 rtx merge_mask
= gen_int_mode (HOST_WIDE_INT_1U
<< elt
, mmode
);
17028 /* The avx512*_blendm<mode> expanders have different operand order
17029 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
17030 elements where the mask is set and second input operand otherwise,
17031 in {sse,avx}*_*blend* the first input operand is used for elements
17032 where the mask is clear and second input operand otherwise. */
17034 merge_mask
= force_reg (mmode
, merge_mask
);
17035 emit_insn (gen_blendm (target
, target
, tmp
, merge_mask
));
17037 else if (use_vec_merge
)
17040 tmp
= gen_rtx_VEC_DUPLICATE (mode
, val
);
17041 tmp
= gen_rtx_VEC_MERGE (mode
, tmp
, target
,
17042 GEN_INT (HOST_WIDE_INT_1U
<< elt
));
17043 emit_insn (gen_rtx_SET (target
, tmp
));
17047 rtx mem
= assign_stack_temp (mode
, GET_MODE_SIZE (mode
));
17049 emit_move_insn (mem
, target
);
17051 tmp
= adjust_address (mem
, inner_mode
, elt
* GET_MODE_SIZE (inner_mode
));
17052 emit_move_insn (tmp
, val
);
17054 emit_move_insn (target
, mem
);
void
ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
{
  machine_mode mode = GET_MODE (vec);
  machine_mode inner_mode = GET_MODE_INNER (mode);
  bool use_vec_extr = false;
  rtx tmp;
17069 use_vec_extr
= TARGET_MMX_WITH_SSE
&& TARGET_SSE4_1
;
17083 use_vec_extr
= true;
17087 use_vec_extr
= TARGET_SSE4_1
;
17099 tmp
= gen_reg_rtx (mode
);
17100 emit_insn (gen_sse_shufps_v4sf (tmp
, vec
, vec
,
17101 GEN_INT (elt
), GEN_INT (elt
),
17102 GEN_INT (elt
+4), GEN_INT (elt
+4)));
17106 tmp
= gen_reg_rtx (mode
);
17107 emit_insn (gen_vec_interleave_highv4sf (tmp
, vec
, vec
));
17111 gcc_unreachable ();
17114 use_vec_extr
= true;
17119 use_vec_extr
= TARGET_SSE4_1
;
17133 tmp
= gen_reg_rtx (mode
);
17134 emit_insn (gen_sse2_pshufd_1 (tmp
, vec
,
17135 GEN_INT (elt
), GEN_INT (elt
),
17136 GEN_INT (elt
), GEN_INT (elt
)));
17140 tmp
= gen_reg_rtx (mode
);
17141 emit_insn (gen_vec_interleave_highv4si (tmp
, vec
, vec
));
17145 gcc_unreachable ();
17148 use_vec_extr
= true;
17153 /* For SSE1, we have to reuse the V4SF code. */
17154 ix86_expand_vector_extract (false, gen_lowpart (SFmode
, target
),
17155 gen_lowpart (V4SFmode
, vec
), elt
);
17164 use_vec_extr
= TARGET_SSE2
;
17167 use_vec_extr
= mmx_ok
&& (TARGET_SSE
|| TARGET_3DNOW_A
);
17171 use_vec_extr
= TARGET_SSE4_1
;
17175 && (optimize_insn_for_size_p () || TARGET_INTER_UNIT_MOVES_FROM_VEC
))
17177 tmp
= gen_reg_rtx (SImode
);
17178 ix86_expand_vector_extract (false, tmp
, gen_lowpart (V4SImode
, vec
),
17180 emit_insn (gen_rtx_SET (target
, gen_lowpart (QImode
, tmp
)));
17185 use_vec_extr
= TARGET_SSE4_1
;
17191 tmp
= gen_reg_rtx (V4SFmode
);
17193 emit_insn (gen_vec_extract_lo_v8sf (tmp
, vec
));
17195 emit_insn (gen_vec_extract_hi_v8sf (tmp
, vec
));
17196 ix86_expand_vector_extract (false, target
, tmp
, elt
& 3);
17204 tmp
= gen_reg_rtx (V2DFmode
);
17206 emit_insn (gen_vec_extract_lo_v4df (tmp
, vec
));
17208 emit_insn (gen_vec_extract_hi_v4df (tmp
, vec
));
17209 ix86_expand_vector_extract (false, target
, tmp
, elt
& 1);
17217 tmp
= gen_reg_rtx (V16QImode
);
17219 emit_insn (gen_vec_extract_lo_v32qi (tmp
, vec
));
17221 emit_insn (gen_vec_extract_hi_v32qi (tmp
, vec
));
17222 ix86_expand_vector_extract (false, target
, tmp
, elt
& 15);
17230 tmp
= gen_reg_rtx (V8HImode
);
17232 emit_insn (gen_vec_extract_lo_v16hi (tmp
, vec
));
17234 emit_insn (gen_vec_extract_hi_v16hi (tmp
, vec
));
17235 ix86_expand_vector_extract (false, target
, tmp
, elt
& 7);
17243 tmp
= gen_reg_rtx (V4SImode
);
17245 emit_insn (gen_vec_extract_lo_v8si (tmp
, vec
));
17247 emit_insn (gen_vec_extract_hi_v8si (tmp
, vec
));
17248 ix86_expand_vector_extract (false, target
, tmp
, elt
& 3);
17256 tmp
= gen_reg_rtx (V2DImode
);
17258 emit_insn (gen_vec_extract_lo_v4di (tmp
, vec
));
17260 emit_insn (gen_vec_extract_hi_v4di (tmp
, vec
));
17261 ix86_expand_vector_extract (false, target
, tmp
, elt
& 1);
17267 if (TARGET_AVX512BW
)
17269 tmp
= gen_reg_rtx (V16HImode
);
17271 emit_insn (gen_vec_extract_lo_v32hi (tmp
, vec
));
17273 emit_insn (gen_vec_extract_hi_v32hi (tmp
, vec
));
17274 ix86_expand_vector_extract (false, target
, tmp
, elt
& 15);
17280 if (TARGET_AVX512BW
)
17282 tmp
= gen_reg_rtx (V32QImode
);
17284 emit_insn (gen_vec_extract_lo_v64qi (tmp
, vec
));
17286 emit_insn (gen_vec_extract_hi_v64qi (tmp
, vec
));
17287 ix86_expand_vector_extract (false, target
, tmp
, elt
& 31);
17293 tmp
= gen_reg_rtx (V8SFmode
);
17295 emit_insn (gen_vec_extract_lo_v16sf (tmp
, vec
));
17297 emit_insn (gen_vec_extract_hi_v16sf (tmp
, vec
));
17298 ix86_expand_vector_extract (false, target
, tmp
, elt
& 7);
17302 tmp
= gen_reg_rtx (V4DFmode
);
17304 emit_insn (gen_vec_extract_lo_v8df (tmp
, vec
));
17306 emit_insn (gen_vec_extract_hi_v8df (tmp
, vec
));
17307 ix86_expand_vector_extract (false, target
, tmp
, elt
& 3);
17311 tmp
= gen_reg_rtx (V8SImode
);
17313 emit_insn (gen_vec_extract_lo_v16si (tmp
, vec
));
17315 emit_insn (gen_vec_extract_hi_v16si (tmp
, vec
));
17316 ix86_expand_vector_extract (false, target
, tmp
, elt
& 7);
17320 tmp
= gen_reg_rtx (V4DImode
);
17322 emit_insn (gen_vec_extract_lo_v8di (tmp
, vec
));
17324 emit_insn (gen_vec_extract_hi_v8di (tmp
, vec
));
17325 ix86_expand_vector_extract (false, target
, tmp
, elt
& 3);
17330 if (TARGET_AVX512BW
)
17332 tmp
= (mode
== E_V32HFmode
17333 ? gen_reg_rtx (V16HFmode
)
17334 : gen_reg_rtx (V16BFmode
));
17336 emit_insn (maybe_gen_vec_extract_lo (mode
, tmp
, vec
));
17338 emit_insn (maybe_gen_vec_extract_hi (mode
, tmp
, vec
));
17339 ix86_expand_vector_extract (false, target
, tmp
, elt
& 15);
17348 tmp
= (mode
== E_V16HFmode
17349 ? gen_reg_rtx (V8HFmode
)
17350 : gen_reg_rtx (V8BFmode
));
17352 emit_insn (maybe_gen_vec_extract_lo (mode
, tmp
, vec
));
17354 emit_insn (maybe_gen_vec_extract_hi (mode
, tmp
, vec
));
17355 ix86_expand_vector_extract (false, target
, tmp
, elt
& 7);
17361 use_vec_extr
= TARGET_MMX_WITH_SSE
&& TARGET_SSE4_1
;
17362 /* ??? Could extract the appropriate HImode element and shift. */
17371 tmp
= gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (1, GEN_INT (elt
)));
17372 tmp
= gen_rtx_VEC_SELECT (inner_mode
, vec
, tmp
);
17374 /* Let the rtl optimizers know about the zero extension performed. */
17375 if (inner_mode
== QImode
|| inner_mode
== HImode
)
17377 rtx reg
= gen_reg_rtx (SImode
);
17378 tmp
= gen_rtx_ZERO_EXTEND (SImode
, tmp
);
17379 emit_move_insn (reg
, tmp
);
17380 tmp
= gen_lowpart (inner_mode
, reg
);
17381 SUBREG_PROMOTED_VAR_P (tmp
) = 1;
17382 SUBREG_PROMOTED_SET (tmp
, 1);
17385 emit_move_insn (target
, tmp
);
17389 rtx mem
= assign_stack_temp (mode
, GET_MODE_SIZE (mode
));
17391 emit_move_insn (mem
, vec
);
17393 tmp
= adjust_address (mem
, inner_mode
, elt
*GET_MODE_SIZE (inner_mode
));
17394 emit_move_insn (target
, tmp
);
/* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
   to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
   The upper bits of DEST are undefined, though they shouldn't cause
   exceptions (some bits from src or all zeros are ok).  */

static void
emit_reduc_half (rtx dest, rtx src, int i)
{
  rtx tem, d = dest;
  switch (GET_MODE (src))
    {
17411 tem
= gen_sse_movhlps (dest
, src
, src
);
17413 tem
= gen_sse_shufps_v4sf (dest
, src
, src
, const1_rtx
, const1_rtx
,
17414 GEN_INT (1 + 4), GEN_INT (1 + 4));
17417 tem
= gen_vec_interleave_highv2df (dest
, src
, src
);
17420 d
= gen_reg_rtx (V1SImode
);
17421 tem
= gen_mmx_lshrv1si3 (d
, gen_lowpart (V1SImode
, src
),
17425 d
= gen_reg_rtx (V1DImode
);
17426 tem
= gen_mmx_lshrv1di3 (d
, gen_lowpart (V1DImode
, src
),
17434 d
= gen_reg_rtx (V1TImode
);
17435 tem
= gen_sse2_lshrv1ti3 (d
, gen_lowpart (V1TImode
, src
),
17440 tem
= gen_avx_vperm2f128v8sf3 (dest
, src
, src
, const1_rtx
);
17442 tem
= gen_avx_shufps256 (dest
, src
, src
,
17443 GEN_INT (i
== 128 ? 2 + (3 << 2) : 1));
17447 tem
= gen_avx_vperm2f128v4df3 (dest
, src
, src
, const1_rtx
);
17449 tem
= gen_avx_shufpd256 (dest
, src
, src
, const1_rtx
);
17458 if (GET_MODE (dest
) != V4DImode
)
17459 d
= gen_reg_rtx (V4DImode
);
17460 tem
= gen_avx2_permv2ti (d
, gen_lowpart (V4DImode
, src
),
17461 gen_lowpart (V4DImode
, src
),
17466 d
= gen_reg_rtx (V2TImode
);
17467 tem
= gen_avx2_lshrv2ti3 (d
, gen_lowpart (V2TImode
, src
),
17476 d
= gen_reg_rtx (V4TImode
);
17477 tem
= gen_avx512bw_lshrv4ti3 (d
, gen_lowpart (V4TImode
, src
),
17487 tem
= gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode
, dest
),
17488 gen_lowpart (V16SImode
, src
),
17489 gen_lowpart (V16SImode
, src
),
17490 GEN_INT (0x4 + (i
== 512 ? 4 : 0)),
17491 GEN_INT (0x5 + (i
== 512 ? 4 : 0)),
17492 GEN_INT (0x6 + (i
== 512 ? 4 : 0)),
17493 GEN_INT (0x7 + (i
== 512 ? 4 : 0)),
17494 GEN_INT (0xC), GEN_INT (0xD),
17495 GEN_INT (0xE), GEN_INT (0xF),
17496 GEN_INT (0x10), GEN_INT (0x11),
17497 GEN_INT (0x12), GEN_INT (0x13),
17498 GEN_INT (0x14), GEN_INT (0x15),
17499 GEN_INT (0x16), GEN_INT (0x17));
17501 tem
= gen_avx512f_pshufd_1 (gen_lowpart (V16SImode
, dest
),
17502 gen_lowpart (V16SImode
, src
),
17503 GEN_INT (i
== 128 ? 0x2 : 0x1),
17507 GEN_INT (i
== 128 ? 0x6 : 0x5),
17511 GEN_INT (i
== 128 ? 0xA : 0x9),
17515 GEN_INT (i
== 128 ? 0xE : 0xD),
17521 gcc_unreachable ();
17525 emit_move_insn (dest
, gen_lowpart (GET_MODE (dest
), d
));
/* Expand a vector reduction.  FN is the binary pattern to reduce;
   DEST is the destination; IN is the input vector.  */

void
ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
{
  rtx half, dst, vec = in;
  machine_mode mode = GET_MODE (in);
  int i;

  /* SSE4 has a special instruction for V8HImode UMIN reduction.  */
  if (TARGET_SSE4_1
      && mode == V8HImode
      && fn == gen_uminv8hi3)
    {
      emit_insn (gen_sse4_1_phminposuw (dest, in));
      return;
    }

  for (i = GET_MODE_BITSIZE (mode);
       i > GET_MODE_UNIT_BITSIZE (mode);
       i >>= 1)
    {
      half = gen_reg_rtx (mode);
      emit_reduc_half (half, vec, i);
      if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
	dst = dest;
      else
	dst = gen_reg_rtx (mode);
      emit_insn (fn (dst, half, vec));
      vec = dst;
    }
}
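/* A rough scalar model of the loop above (added for illustration, not
   GCC code): reducing eight lanes with a binary operation OP by
   repeatedly folding the upper half onto the lower half:

     lanes 0..3 = OP (lanes 0..3, lanes 4..7);   // first emit_reduc_half
     lanes 0..1 = OP (lanes 0..1, lanes 2..3);   // half the width again
     lane  0    = OP (lane 0, lane 1);           // result in element 0

   Each iteration halves the number of live lanes until only the unit
   size remains, at which point DST receives the final value.  */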
/* Output code to perform a conditional jump to LABEL, if C2 flag in
   FP status register is set.  */

void
ix86_emit_fp_unordered_jump (rtx label)
{
  rtx reg = gen_reg_rtx (HImode);
  rtx_insn *insn;
  rtx temp;

  emit_insn (gen_x86_fnstsw_1 (reg));

  if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
    {
      emit_insn (gen_x86_sahf_1 (reg));

      temp = gen_rtx_REG (CCmode, FLAGS_REG);
      temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
    }
  else
    {
      emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));

      temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
      temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
    }

  temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
			       gen_rtx_LABEL_REF (VOIDmode, label),
			       pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
  predict_jump (REG_BR_PROB_BASE * 10 / 100);
  JUMP_LABEL (insn) = label;
}
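/* Note (added commentary): the non-SAHF path above tests bit 0x04 of the
   high byte of the FPU status word stored by fnstsw, i.e. the C2
   condition flag that signals an unordered result; the GEN_INT (0x02)
   tests in the fxam-based helpers below examine C1 instead, which
   carries the sign of the examined operand.  */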
/* Output code to perform a sinh XFmode calculation.  */

void
ix86_emit_i387_sinh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx cst1, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = expm1 (|op1|) */
  emit_insn (gen_absxf2 (e2, op1));
  emit_insn (gen_expm1xf2 (e1, e2));

  /* e2 = e1 / (e1 + 1.0) + e1 */
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_addxf3 (e2, e1, cst1));
  emit_insn (gen_divxf3 (e2, e1, e2));
  emit_insn (gen_addxf3 (e2, e2, e1));

  /* flags = signbit (op1) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (flags) then e2 = -e2 */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  /* op0 = 0.5 * e2 */
  half = force_reg (XFmode, half);
  emit_insn (gen_mulxf3 (op0, e2, half));
}
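/* Added note on the identity used above (illustrative, not original
   commentary): with e1 = expm1 (|x|) = e^|x| - 1,

     sinh (|x|) = (e^|x| - e^-|x|) / 2 = (e1 + e1 / (e1 + 1)) * 0.5,

   and the fxam/jump sequence restores the sign of x afterwards; going
   through expm1 avoids catastrophic cancellation for small |x|.  */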
/* Output code to perform a cosh XFmode calculation.  */

void
ix86_emit_i387_cosh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx cst1;

  /* e1 = exp (op1) */
  emit_insn (gen_expxf2 (e1, op1));

  /* e2 = e1 + 1.0 / e1 */
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_divxf3 (e2, cst1, e1));
  emit_insn (gen_addxf3 (e2, e1, e2));

  /* op0 = 0.5 * e2 */
  half = force_reg (XFmode, half);
  emit_insn (gen_mulxf3 (op0, e2, half));
}
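/* Added note (illustrative): with e = exp (x), the sequence above computes
   cosh (x) = (e + 1/e) * 0.5 directly; no sign handling is needed because
   cosh is an even function.  */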
/* Output code to perform a tanh XFmode calculation.  */

void
ix86_emit_i387_tanh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx cst2, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = expm1 (-|2 * op1|) */
  emit_insn (gen_addxf3 (e2, op1, op1));
  emit_insn (gen_absxf2 (e2, e2));
  emit_insn (gen_negxf2 (e2, e2));
  emit_insn (gen_expm1xf2 (e1, e2));

  /* e2 = e1 / (e1 + 2.0) */
  cst2 = force_reg (XFmode, CONST2_RTX (XFmode));
  emit_insn (gen_addxf3 (e2, e1, cst2));
  emit_insn (gen_divxf3 (e2, e1, e2));

  /* flags = signbit (op1) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (!flags) then e2 = -e2 */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_NE (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  emit_move_insn (op0, e2);
}
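/* Added note (illustrative): with e1 = expm1 (-2 * |x|),

     tanh (|x|) = (1 - e^-2|x|) / (1 + e^-2|x|) = -e1 / (e1 + 2),

   which the code computes as e1 / (e1 + 2) followed by the conditional
   negation keyed off the fxam sign bit.  */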
/* Output code to perform an asinh XFmode calculation.  */

void
ix86_emit_i387_asinh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx cst1, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* e2 = sqrt (op1^2 + 1.0) + 1.0 */
  emit_insn (gen_mulxf3 (e1, op1, op1));
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_addxf3 (e2, e1, cst1));
  emit_insn (gen_sqrtxf2 (e2, e2));
  emit_insn (gen_addxf3 (e2, e2, cst1));

  emit_insn (gen_divxf3 (e1, e1, e2));

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = e1 + |op1| */
  emit_insn (gen_absxf2 (e2, op1));
  emit_insn (gen_addxf3 (e1, e1, e2));

  /* e2 = log1p (e1) */
  ix86_emit_i387_log1p (e2, e1);

  /* flags = signbit (op1) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (flags) then e2 = -e2 */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  emit_move_insn (op0, e2);
}
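/* Added note (illustrative): writing t = |x|, the sequence above uses

     asinh (t) = log1p (t + t*t / (1 + sqrt (t*t + 1))),

   i.e. e1 = x*x, e2 = sqrt (x*x + 1) + 1, then e1 = e1/e2 + t and log1p;
   the sign of x is reapplied at the end.  */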
/* Output code to perform an acosh XFmode calculation.  */

void
ix86_emit_i387_acosh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode));

  /* e2 = sqrt (op1 + 1.0) */
  emit_insn (gen_addxf3 (e2, op1, cst1));
  emit_insn (gen_sqrtxf2 (e2, e2));

  /* e1 = sqrt (op1 - 1.0) */
  emit_insn (gen_subxf3 (e1, op1, cst1));
  emit_insn (gen_sqrtxf2 (e1, e1));

  emit_insn (gen_mulxf3 (e1, e1, e2));

  /* e1 = e1 + op1 */
  emit_insn (gen_addxf3 (e1, e1, op1));

  /* op0 = log (e1) */
  emit_insn (gen_logxf2 (op0, e1));
}
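/* Added note (illustrative): the sequence above evaluates

     acosh (x) = log (x + sqrt (x - 1) * sqrt (x + 1)),

   which is algebraically log (x + sqrt (x*x - 1)) but keeps the two
   square roots separate for better behaviour when x is close to 1.  */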
/* Output code to perform an atanh XFmode calculation.  */

void
ix86_emit_i387_atanh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx cst1, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e2 = |op1| */
  emit_insn (gen_absxf2 (e2, op1));

  /* e1 = -(e2 + e2) / (e2 + 1.0) */
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_addxf3 (e1, e2, cst1));
  emit_insn (gen_addxf3 (e2, e2, e2));
  emit_insn (gen_negxf2 (e2, e2));
  emit_insn (gen_divxf3 (e1, e2, e1));

  /* e2 = log1p (e1) */
  ix86_emit_i387_log1p (e2, e1);

  /* flags = signbit (op1) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (!flags) then e2 = -e2 */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_NE (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  /* op0 = 0.5 * e2 */
  half = force_reg (XFmode, half);
  emit_insn (gen_mulxf3 (op0, e2, half));
}
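/* Added note (illustrative): with t = |x| and e1 = -2*t / (t + 1),

     log1p (e1) = log ((1 - t) / (1 + t)) = -2 * atanh (t),

   so the final 0.5 * e2, after the sign fixup, yields atanh (x).  */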
/* Output code to perform a log1p XFmode calculation.  */

void
ix86_emit_i387_log1p (rtx op0, rtx op1)
{
  rtx_code_label *label1 = gen_label_rtx ();
  rtx_code_label *label2 = gen_label_rtx ();

  rtx tmp = gen_reg_rtx (XFmode);
  rtx res = gen_reg_rtx (XFmode);
  rtx cst, cstln2, cst1;
  rtx_insn *insn;

  /* The emit_jump call emits a pending stack adjust; make sure it is
     emitted before the conditional jump, otherwise the stack adjustment
     will only be conditional.  */
  do_pending_stack_adjust ();

  cst = const_double_from_real_value
    (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode);
  cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */

  emit_insn (gen_absxf2 (tmp, op1));

  cst = force_reg (XFmode, cst);
  ix86_expand_branch (GE, tmp, cst, label1);
  predict_jump (REG_BR_PROB_BASE * 10 / 100);
  insn = get_last_insn ();
  JUMP_LABEL (insn) = label1;

  emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2));
  emit_jump (label2);

  emit_label (label1);
  LABEL_NUSES (label1) = 1;

  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1)));
  emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2));

  emit_label (label2);
  LABEL_NUSES (label2) = 1;

  emit_move_insn (op0, res);
}
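/* Added note (illustrative): fyl2xp1 is only specified for arguments with
   |x| below about 1 - sqrt(2)/2 ~= 0.2928932..., which is why the code
   branches on that threshold: small arguments use fyl2xp1 on x directly,
   larger ones fall back to fyl2x on 1 + x.  Both are scaled by ln 2
   (fldln2) so the base-2 logarithm becomes a natural logarithm.  */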
/* Emit code for round calculation.  */

void
ix86_emit_i387_round (rtx op0, rtx op1)
{
  machine_mode inmode = GET_MODE (op1);
  machine_mode outmode = GET_MODE (op0);
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx res = gen_reg_rtx (outmode);
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx (*floor_insn) (rtx, rtx);
  rtx (*neg_insn) (rtx, rtx);
  rtx_insn *insn;
  rtx tmp;

  switch (inmode)
    {
    case E_SFmode:
    case E_DFmode:
      tmp = gen_reg_rtx (XFmode);

      emit_insn (gen_rtx_SET (tmp, gen_rtx_FLOAT_EXTEND (XFmode, op1)));
      op1 = tmp;
      break;

    case E_XFmode:
      break;

    default:
      gcc_unreachable ();
    }

  switch (outmode)
    {
    case E_SFmode:
      floor_insn = gen_frndintxf2_floor;
      neg_insn = gen_negsf2;
      break;

    case E_DFmode:
      floor_insn = gen_frndintxf2_floor;
      neg_insn = gen_negdf2;
      break;

    case E_XFmode:
      floor_insn = gen_frndintxf2_floor;
      neg_insn = gen_negxf2;
      break;

    case E_HImode:
      floor_insn = gen_lfloorxfhi2;
      neg_insn = gen_neghi2;
      break;

    case E_SImode:
      floor_insn = gen_lfloorxfsi2;
      neg_insn = gen_negsi2;
      break;

    case E_DImode:
      floor_insn = gen_lfloorxfdi2;
      neg_insn = gen_negdi2;
      break;

    default:
      gcc_unreachable ();
    }

  /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */

  /* scratch = fxam(op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = fabs(op1) */
  emit_insn (gen_absxf2 (e1, op1));

  /* e2 = e1 + 0.5 */
  half = force_reg (XFmode, half);
  emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (XFmode, e1, half)));
17969 /* res = floor(e2) */
17975 tmp
= gen_reg_rtx (XFmode
);
17977 emit_insn (floor_insn (tmp
, e2
));
17978 emit_insn (gen_rtx_SET (res
,
17979 gen_rtx_UNSPEC (outmode
, gen_rtvec (1, tmp
),
17980 UNSPEC_TRUNC_NOOP
)));
17984 emit_insn (floor_insn (res
, e2
));
17987 /* flags = signbit(a) */
17988 emit_insn (gen_testqi_ext_1_ccno (scratch
, GEN_INT (0x02)));
17990 /* if (flags) then res = -res */
17991 tmp
= gen_rtx_IF_THEN_ELSE (VOIDmode
,
17992 gen_rtx_EQ (VOIDmode
, flags
, const0_rtx
),
17993 gen_rtx_LABEL_REF (VOIDmode
, jump_label
),
17995 insn
= emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp
));
17996 predict_jump (REG_BR_PROB_BASE
* 50 / 100);
17997 JUMP_LABEL (insn
) = jump_label
;
17999 emit_insn (neg_insn (res
, res
));
18001 emit_label (jump_label
);
18002 LABEL_NUSES (jump_label
) = 1;
18004 emit_move_insn (op0
, res
);
/* Output code to perform a Newton-Raphson approximation of a single precision
   floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm].  */

void
ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
{
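  /* Illustrative note (our explanation, not part of the original comments):
     with x0 ~= 1/b from the hardware reciprocal estimate, one Newton-Raphson
     step refines it as

	x1 = x0 * (2 - b * x0) = (x0 + x0) - (b * x0 * x0);

     and the quotient is then approximated by a * x1, which is the
     e0/e1/x1 sequence emitted below.  */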
  rtx x0, x1, e0, e1;

  x0 = gen_reg_rtx (mode);
  e0 = gen_reg_rtx (mode);
  e1 = gen_reg_rtx (mode);
  x1 = gen_reg_rtx (mode);

  /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */

  b = force_reg (mode, b);

  /* x0 = rcp(b) estimate */
  if (mode == V16SFmode || mode == V8DFmode)
    {
      if (TARGET_AVX512ER)
	{
	  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
						      UNSPEC_RCP28)));
	  /* res = a * x0 */
	  emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
	  return;
	}
      else
	emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
						    UNSPEC_RCP14)));
    }
  else
    emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
						UNSPEC_RCP)));

  /* e0 = x0 * b */
  emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));

  /* e0 = x0 * e0 */
  emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));

  /* e1 = x0 + x0 */
  emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));

  /* x1 = e1 - e0 */
  emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));

  /* res = a * x1 */
  emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
}
/* Output code to perform a Newton-Raphson approximation of a
   single precision floating point [reciprocal] square root.  */

void
ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
{
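  /* Illustrative note (our explanation, not part of the original comments):
     with x0 ~= 1/sqrt(a) from the hardware estimate, one Newton-Raphson step
     can be written as

	x1 = 0.5 * x0 * (3 - a * x0 * x0)
	   = -0.5 * x0 * (a * x0 * x0 - 3);

     multiplying by a first (e0 = a * x0) instead yields sqrt(a), which is how
     the recip flag below selects between the two results.  */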
  rtx x0, e0, e1, e2, e3, mthree, mhalf;
  REAL_VALUE_TYPE r;
  int unspec;

  x0 = gen_reg_rtx (mode);
  e0 = gen_reg_rtx (mode);
  e1 = gen_reg_rtx (mode);
  e2 = gen_reg_rtx (mode);
  e3 = gen_reg_rtx (mode);

  if (TARGET_AVX512ER && mode == V16SFmode)
    {
      if (recip)
	/* res = rsqrt28(a) estimate */
	emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
						     UNSPEC_RSQRT28)));
      else
	{
	  /* x0 = rsqrt28(a) estimate */
	  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
						      UNSPEC_RSQRT28)));
	  /* res = rcp28(x0) estimate */
	  emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
						       UNSPEC_RCP28)));
	}
      return;
    }

  real_from_integer (&r, VOIDmode, -3, SIGNED);
  mthree = const_double_from_real_value (r, SFmode);

  real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
  mhalf = const_double_from_real_value (r, SFmode);
  unspec = UNSPEC_RSQRT;

  if (VECTOR_MODE_P (mode))
    {
      mthree = ix86_build_const_vector (mode, true, mthree);
      mhalf = ix86_build_const_vector (mode, true, mhalf);
      /* There is no 512-bit rsqrt.  There is however rsqrt14.  */
      if (GET_MODE_SIZE (mode) == 64)
	unspec = UNSPEC_RSQRT14;
    }

  /* sqrt(a)  = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
     rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */

  a = force_reg (mode, a);

  /* x0 = rsqrt(a) estimate */
  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
					      unspec)));

  /* If (a == 0.0) Filter out infinity to prevent NaN for sqrt(0.0).  */
  if (!recip)
    {
      rtx zero = force_reg (mode, CONST0_RTX (mode));
      rtx mask;

      /* Handle masked compare.  */
      if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
	{
	  mask = gen_reg_rtx (HImode);
	  /* Imm value 0x4 corresponds to not-equal comparison.  */
	  emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
	  emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
	}
      else
	{
	  mask = gen_reg_rtx (mode);
	  emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
	  emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
	}
    }

  mthree = force_reg (mode, mthree);

  /* e0 = x0 * a */
  emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));

  unsigned vector_size = GET_MODE_SIZE (mode);
  if (TARGET_FMA
      || (TARGET_AVX512F && vector_size == 64)
      || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
    emit_insn (gen_rtx_SET (e2,
			    gen_rtx_FMA (mode, e0, x0, mthree)));
  else
    {
      /* e1 = e0 * x0 */
      emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));

      /* e2 = e1 - 3.0 */
      emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
    }

  mhalf = force_reg (mode, mhalf);

  if (recip)
    /* e3 = -.5 * x0 */
    emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
  else
    /* e3 = -.5 * e0 */
    emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));

  /* ret = e2 * e3 */
  emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
}
/* Expand fabs (OP0) and return a new rtx that holds the result.  The
   mask for masking out the sign-bit is stored in *SMASK, if that is
   non-null.  */
static rtx
ix86_expand_sse_fabs (rtx op0, rtx *smask)
{
  machine_mode vmode, mode = GET_MODE (op0);
  rtx xa, mask;

  xa = gen_reg_rtx (mode);
  if (mode == SFmode)
    vmode = V4SFmode;
  else if (mode == DFmode)
    vmode = V2DFmode;
  else
    vmode = mode;

  mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
  if (!VECTOR_MODE_P (mode))
    {
      /* We need to generate a scalar mode mask in this case.  */
      rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
      tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
      mask = gen_reg_rtx (mode);
      emit_insn (gen_rtx_SET (mask, tmp));
    }
  emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));

  if (smask)
    *smask = mask;

  return xa;
}
/* Expands a comparison of OP0 with OP1 using comparison code CODE,
   swapping the operands if SWAP_OPERANDS is true.  The expanded
   code is a forward jump to a newly created label in case the
   comparison is true.  The generated label rtx is returned.  */
static rtx_code_label *
ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
				  bool swap_operands)
{
  bool unordered_compare = ix86_unordered_fp_compare (code);
  rtx_code_label *label;
  rtx tmp, reg;

  if (swap_operands)
    std::swap (op0, op1);

  label = gen_label_rtx ();
  tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
  if (unordered_compare)
    tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
  reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
  emit_insn (gen_rtx_SET (reg, tmp));
  tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
			      gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
  tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  JUMP_LABEL (tmp) = label;

  return label;
}
/* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
   using comparison code CODE.  Operands are swapped for the comparison if
   SWAP_OPERANDS is true.  Returns a rtx for the generated mask.  */
static rtx
ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
			      bool swap_operands)
{
  rtx (*insn)(rtx, rtx, rtx, rtx);
  machine_mode mode = GET_MODE (op0);
  rtx mask = gen_reg_rtx (mode);

  if (swap_operands)
    std::swap (op0, op1);

  insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;

  emit_insn (insn (mask, op0, op1,
		   gen_rtx_fmt_ee (code, mode, op0, op1)));

  return mask;
}
/* Expand copysign from SIGN to the positive value ABS_VALUE
   storing in RESULT.  If MASK is non-null, it shall be a mask to mask out
   the sign-bit.  */
static void
ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
{
  machine_mode mode = GET_MODE (sign);
  rtx sgn = gen_reg_rtx (mode);
  if (mask == NULL_RTX)
    {
      machine_mode vmode;

      if (mode == SFmode)
	vmode = V4SFmode;
      else if (mode == DFmode)
	vmode = V2DFmode;
      else
	vmode = mode;

      mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
      if (!VECTOR_MODE_P (mode))
	{
	  /* We need to generate a scalar mode mask in this case.  */
	  rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
	  tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
	  mask = gen_reg_rtx (mode);
	  emit_insn (gen_rtx_SET (mask, tmp));
	}
    }
  else
    mask = gen_rtx_NOT (mode, mask);

  emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
  emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
}
/* Expand SSE sequence for computing lround from OP1 storing
   into OP0.  */
void
ix86_expand_lround (rtx op0, rtx op1)
{
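  /* An illustrative note (our explanation, not from the original comments):
     pred_half below is nextafter (0.5, 0.0), the largest value strictly less
     than 0.5.  Using it instead of 0.5 keeps an input that is itself just
     below 0.5 from summing to exactly 1.0 under round-to-nearest and being
     rounded the wrong way.  */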
  /* C code for the stuff we're doing below:
	tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
	op0 = (long)tmp;
   */
  machine_mode mode = GET_MODE (op1);
  const struct real_format *fmt;
  REAL_VALUE_TYPE pred_half, half_minus_pred_half;
  rtx adj;

  /* load nextafter (0.5, 0.0) */
  fmt = REAL_MODE_FORMAT (mode);
  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
  real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);

  /* adj = copysign (0.5, op1) */
  adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
  ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);

  /* adj = op1 + adj */
  adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);

  /* op0 = (imode)adj */
  expand_fix (op0, adj, 0);
}
/* Expand SSE2 sequence for computing lfloor or lceil
   from OPERAND1 storing into OPERAND0.  */
void
ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
{
  /* C code for the stuff we're doing below (for do_floor):
	xi = (long)op1;
	xi -= (double)xi > op1 ? 1 : 0;
	return xi;
   */
  machine_mode fmode = GET_MODE (op1);
  machine_mode imode = GET_MODE (op0);
  rtx ireg, freg, tmp;
  rtx_code_label *label;

  /* reg = (long)op1 */
  ireg = gen_reg_rtx (imode);
  expand_fix (ireg, op1, 0);

  /* freg = (double)reg */
  freg = gen_reg_rtx (fmode);
  expand_float (freg, ireg, 0);

  /* ireg = (freg > op1) ? ireg - 1 : ireg */
  label = ix86_expand_sse_compare_and_jump (UNLE,
					    freg, op1, !do_floor);
  tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
			     ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
  emit_move_insn (ireg, tmp);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (op0, ireg);
}
/* Generate and return a rtx of mode MODE for 2**n where n is the number
   of bits of the mantissa of MODE, which must be one of DFmode or SFmode.  */
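/* An illustrative note (our explanation, not from the original comments):
   for DFmode this constant is 2**52.  For any |x| < 2**52 the expression
   (x + TWO52) - TWO52, evaluated in the current rounding mode, leaves x
   rounded to an integer, which is the trick the rint/floor/ceil/trunc
   expanders below rely on.  */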
static rtx
ix86_gen_TWO52 (machine_mode mode)
{
  const struct real_format *fmt;
  REAL_VALUE_TYPE TWO52r;
  rtx TWO52;

  fmt = REAL_MODE_FORMAT (mode);
  real_2expN (&TWO52r, fmt->p - 1, mode);
  TWO52 = const_double_from_real_value (TWO52r, mode);
  TWO52 = force_reg (mode, TWO52);

  return TWO52;
}
/* Expand rint rounding OPERAND1 and storing the result in OPERAND0.  */
void
ix86_expand_rint (rtx operand0, rtx operand1)
{
  /* C code for the stuff we're doing below:
	xa = fabs (operand1);
	if (!isless (xa, 2**52))
	  return operand1;
	if (flag_rounding_math)
	  {
	    two52 = copysign (two52, operand1);
	    xa = operand1;
	  }
	xa = xa + two52 - two52;
	return copysign (xa, operand1);
   */
  machine_mode mode = GET_MODE (operand0);
  rtx res, xa, TWO52, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  if (flag_rounding_math)
    {
      ix86_sse_copysign_to_positive (TWO52, TWO52, res, mask);
      xa = res;
    }

  xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
  xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);

  /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
  if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
    xa = ix86_expand_sse_fabs (xa, NULL);

  ix86_sse_copysign_to_positive (res, xa, res, mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
18433 /* Expand SSE2 sequence for computing floor or ceil
18434 from OPERAND1 storing into OPERAND0. */
18436 ix86_expand_floorceil (rtx operand0
, rtx operand1
, bool do_floor
)
18438 /* C code for the stuff we expand below.
18439 double xa = fabs (x), x2;
18440 if (!isless (xa, TWO52))
18442 x2 = (double)(long)x;
18451 if (HONOR_SIGNED_ZEROS (mode))
18452 return copysign (x2, x);
18455 machine_mode mode
= GET_MODE (operand0
);
18456 rtx xa
, xi
, TWO52
, tmp
, one
, res
, mask
;
18457 rtx_code_label
*label
;
18459 TWO52
= ix86_gen_TWO52 (mode
);
18461 /* Temporary for holding the result, initialized to the input
18462 operand to ease control flow. */
18463 res
= copy_to_reg (operand1
);
18465 /* xa = abs (operand1) */
18466 xa
= ix86_expand_sse_fabs (res
, &mask
);
18468 /* if (!isless (xa, TWO52)) goto label; */
18469 label
= ix86_expand_sse_compare_and_jump (UNLE
, TWO52
, xa
, false);
18471 /* xa = (double)(long)x */
18472 xi
= gen_reg_rtx (int_mode_for_mode (mode
).require ());
18473 expand_fix (xi
, res
, 0);
18474 expand_float (xa
, xi
, 0);
18477 one
= force_reg (mode
, const_double_from_real_value (dconst1
, mode
));
18479 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
18480 tmp
= ix86_expand_sse_compare_mask (UNGT
, xa
, res
, !do_floor
);
18481 emit_insn (gen_rtx_SET (tmp
, gen_rtx_AND (mode
, one
, tmp
)));
18482 tmp
= expand_simple_binop (mode
, do_floor
? MINUS
: PLUS
,
18483 xa
, tmp
, NULL_RTX
, 0, OPTAB_DIRECT
);
18484 if (HONOR_SIGNED_ZEROS (mode
))
18486 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
18487 if (do_floor
&& flag_rounding_math
)
18488 tmp
= ix86_expand_sse_fabs (tmp
, NULL
);
18490 ix86_sse_copysign_to_positive (tmp
, tmp
, res
, mask
);
18492 emit_move_insn (res
, tmp
);
18494 emit_label (label
);
18495 LABEL_NUSES (label
) = 1;
18497 emit_move_insn (operand0
, res
);
18500 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
18501 into OPERAND0 without relying on DImode truncation via cvttsd2siq
18502 that is only available on 64bit targets. */
18504 ix86_expand_floorceildf_32 (rtx operand0
, rtx operand1
, bool do_floor
)
18506 /* C code for the stuff we expand below.
18507 double xa = fabs (x), x2;
18508 if (!isless (xa, TWO52))
18510 xa = xa + TWO52 - TWO52;
18511 x2 = copysign (xa, x);
18520 if (HONOR_SIGNED_ZEROS (mode))
18521 x2 = copysign (x2, x);
18524 machine_mode mode
= GET_MODE (operand0
);
18525 rtx xa
, TWO52
, tmp
, one
, res
, mask
;
18526 rtx_code_label
*label
;
18528 TWO52
= ix86_gen_TWO52 (mode
);
18530 /* Temporary for holding the result, initialized to the input
18531 operand to ease control flow. */
18532 res
= copy_to_reg (operand1
);
18534 /* xa = abs (operand1) */
18535 xa
= ix86_expand_sse_fabs (res
, &mask
);
18537 /* if (!isless (xa, TWO52)) goto label; */
18538 label
= ix86_expand_sse_compare_and_jump (UNLE
, TWO52
, xa
, false);
18540 /* xa = xa + TWO52 - TWO52; */
18541 xa
= expand_simple_binop (mode
, PLUS
, xa
, TWO52
, NULL_RTX
, 0, OPTAB_DIRECT
);
18542 xa
= expand_simple_binop (mode
, MINUS
, xa
, TWO52
, xa
, 0, OPTAB_DIRECT
);
18544 /* xa = copysign (xa, operand1) */
18545 ix86_sse_copysign_to_positive (xa
, xa
, res
, mask
);
18548 one
= force_reg (mode
, const_double_from_real_value (dconst1
, mode
));
18550 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
18551 tmp
= ix86_expand_sse_compare_mask (UNGT
, xa
, res
, !do_floor
);
18552 emit_insn (gen_rtx_SET (tmp
, gen_rtx_AND (mode
, one
, tmp
)));
18553 tmp
= expand_simple_binop (mode
, do_floor
? MINUS
: PLUS
,
18554 xa
, tmp
, NULL_RTX
, 0, OPTAB_DIRECT
);
18555 if (HONOR_SIGNED_ZEROS (mode
))
18557 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
18558 if (do_floor
&& flag_rounding_math
)
18559 tmp
= ix86_expand_sse_fabs (tmp
, NULL
);
18561 ix86_sse_copysign_to_positive (tmp
, tmp
, res
, mask
);
18563 emit_move_insn (res
, tmp
);
18565 emit_label (label
);
18566 LABEL_NUSES (label
) = 1;
18568 emit_move_insn (operand0
, res
);
18571 /* Expand SSE sequence for computing trunc
18572 from OPERAND1 storing into OPERAND0. */
18574 ix86_expand_trunc (rtx operand0
, rtx operand1
)
18576 /* C code for SSE variant we expand below.
18577 double xa = fabs (x), x2;
18578 if (!isless (xa, TWO52))
18580 x2 = (double)(long)x;
18581 if (HONOR_SIGNED_ZEROS (mode))
18582 return copysign (x2, x);
18585 machine_mode mode
= GET_MODE (operand0
);
18586 rtx xa
, xi
, TWO52
, res
, mask
;
18587 rtx_code_label
*label
;
18589 TWO52
= ix86_gen_TWO52 (mode
);
18591 /* Temporary for holding the result, initialized to the input
18592 operand to ease control flow. */
18593 res
= copy_to_reg (operand1
);
18595 /* xa = abs (operand1) */
18596 xa
= ix86_expand_sse_fabs (res
, &mask
);
18598 /* if (!isless (xa, TWO52)) goto label; */
18599 label
= ix86_expand_sse_compare_and_jump (UNLE
, TWO52
, xa
, false);
18601 /* xa = (double)(long)x */
18602 xi
= gen_reg_rtx (int_mode_for_mode (mode
).require ());
18603 expand_fix (xi
, res
, 0);
18604 expand_float (xa
, xi
, 0);
18606 if (HONOR_SIGNED_ZEROS (mode
))
18607 ix86_sse_copysign_to_positive (xa
, xa
, res
, mask
);
18609 emit_move_insn (res
, xa
);
18611 emit_label (label
);
18612 LABEL_NUSES (label
) = 1;
18614 emit_move_insn (operand0
, res
);
18617 /* Expand SSE sequence for computing trunc from OPERAND1 storing
18618 into OPERAND0 without relying on DImode truncation via cvttsd2siq
18619 that is only available on 64bit targets. */
18621 ix86_expand_truncdf_32 (rtx operand0
, rtx operand1
)
18623 machine_mode mode
= GET_MODE (operand0
);
18624 rtx xa
, xa2
, TWO52
, tmp
, one
, res
, mask
;
18625 rtx_code_label
*label
;
18627 /* C code for SSE variant we expand below.
18628 double xa = fabs (x), x2;
18629 if (!isless (xa, TWO52))
18631 xa2 = xa + TWO52 - TWO52;
18635 x2 = copysign (xa2, x);
18639 TWO52
= ix86_gen_TWO52 (mode
);
18641 /* Temporary for holding the result, initialized to the input
18642 operand to ease control flow. */
18643 res = copy_to_reg (operand1);
18645 /* xa = abs (operand1) */
18646 xa
= ix86_expand_sse_fabs (res
, &mask
);
18648 /* if (!isless (xa, TWO52)) goto label; */
18649 label
= ix86_expand_sse_compare_and_jump (UNLE
, TWO52
, xa
, false);
18651 /* xa2 = xa + TWO52 - TWO52; */
18652 xa2
= expand_simple_binop (mode
, PLUS
, xa
, TWO52
, NULL_RTX
, 0, OPTAB_DIRECT
);
18653 xa2
= expand_simple_binop (mode
, MINUS
, xa2
, TWO52
, xa2
, 0, OPTAB_DIRECT
);
18656 one
= force_reg (mode
, const_double_from_real_value (dconst1
, mode
));
18658 /* Compensate: xa2 = xa2 - (xa2 > xa ? 1 : 0) */
18659 tmp
= ix86_expand_sse_compare_mask (UNGT
, xa2
, xa
, false);
18660 emit_insn (gen_rtx_SET (tmp
, gen_rtx_AND (mode
, one
, tmp
)));
18661 tmp
= expand_simple_binop (mode
, MINUS
,
18662 xa2
, tmp
, NULL_RTX
, 0, OPTAB_DIRECT
);
18663 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
18664 if (HONOR_SIGNED_ZEROS (mode
) && flag_rounding_math
)
18665 tmp
= ix86_expand_sse_fabs (tmp
, NULL
);
18667 /* res = copysign (xa2, operand1) */
18668 ix86_sse_copysign_to_positive (res
, tmp
, res
, mask
);
18670 emit_label (label
);
18671 LABEL_NUSES (label
) = 1;
18673 emit_move_insn (operand0
, res
);
18676 /* Expand SSE sequence for computing round
18677 from OPERAND1 storing into OPERAND0. */
18679 ix86_expand_round (rtx operand0
, rtx operand1
)
18681 /* C code for the stuff we're doing below:
18682 double xa = fabs (x);
18683 if (!isless (xa, TWO52))
18685 xa = (double)(long)(xa + nextafter (0.5, 0.0));
18686 return copysign (xa, x);
18688 machine_mode mode
= GET_MODE (operand0
);
18689 rtx res
, TWO52
, xa
, xi
, half
, mask
;
18690 rtx_code_label
*label
;
18691 const struct real_format
*fmt
;
18692 REAL_VALUE_TYPE pred_half
, half_minus_pred_half
;
18694 /* Temporary for holding the result, initialized to the input
18695 operand to ease control flow. */
18696 res
= copy_to_reg (operand1
);
18698 TWO52
= ix86_gen_TWO52 (mode
);
18699 xa
= ix86_expand_sse_fabs (res
, &mask
);
18700 label
= ix86_expand_sse_compare_and_jump (UNLE
, TWO52
, xa
, false);
18702 /* load nextafter (0.5, 0.0) */
18703 fmt
= REAL_MODE_FORMAT (mode
);
18704 real_2expN (&half_minus_pred_half
, -(fmt
->p
) - 1, mode
);
18705 real_arithmetic (&pred_half
, MINUS_EXPR
, &dconsthalf
, &half_minus_pred_half
);
18707 /* xa = xa + 0.5 */
18708 half
= force_reg (mode
, const_double_from_real_value (pred_half
, mode
));
18709 xa
= expand_simple_binop (mode
, PLUS
, xa
, half
, NULL_RTX
, 0, OPTAB_DIRECT
);
18711 /* xa = (double)(int64_t)xa */
18712 xi
= gen_reg_rtx (int_mode_for_mode (mode
).require ());
18713 expand_fix (xi
, xa
, 0);
18714 expand_float (xa
, xi
, 0);
18716 /* res = copysign (xa, operand1) */
18717 ix86_sse_copysign_to_positive (res
, xa
, res
, mask
);
18719 emit_label (label
);
18720 LABEL_NUSES (label
) = 1;
18722 emit_move_insn (operand0
, res
);
18725 /* Expand SSE sequence for computing round from OPERAND1 storing
18726 into OPERAND0 without relying on DImode truncation via cvttsd2siq
18727 that is only available on 64bit targets. */
18729 ix86_expand_rounddf_32 (rtx operand0
, rtx operand1
)
18731 /* C code for the stuff we expand below.
18732 double xa = fabs (x), xa2, x2;
18733 if (!isless (xa, TWO52))
18735 Using the absolute value and copying back sign makes
18736 -0.0 -> -0.0 correct.
18737 xa2 = xa + TWO52 - TWO52;
18742 else if (dxa > 0.5)
18744 x2 = copysign (xa2, x);
18747 machine_mode mode
= GET_MODE (operand0
);
18748 rtx xa
, xa2
, dxa
, TWO52
, tmp
, half
, mhalf
, one
, res
, mask
;
18749 rtx_code_label
*label
;
18751 TWO52
= ix86_gen_TWO52 (mode
);
18753 /* Temporary for holding the result, initialized to the input
18754 operand to ease control flow. */
18755 res
= copy_to_reg (operand1
);
18757 /* xa = abs (operand1) */
18758 xa
= ix86_expand_sse_fabs (res
, &mask
);
18760 /* if (!isless (xa, TWO52)) goto label; */
18761 label
= ix86_expand_sse_compare_and_jump (UNLE
, TWO52
, xa
, false);
18763 /* xa2 = xa + TWO52 - TWO52; */
18764 xa2
= expand_simple_binop (mode
, PLUS
, xa
, TWO52
, NULL_RTX
, 0, OPTAB_DIRECT
);
18765 xa2
= expand_simple_binop (mode
, MINUS
, xa2
, TWO52
, xa2
, 0, OPTAB_DIRECT
);
18767 /* dxa = xa2 - xa; */
18768 dxa
= expand_simple_binop (mode
, MINUS
, xa2
, xa
, NULL_RTX
, 0, OPTAB_DIRECT
);
18770 /* generate 0.5, 1.0 and -0.5 */
18771 half
= force_reg (mode
, const_double_from_real_value (dconsthalf
, mode
));
18772 one
= expand_simple_binop (mode
, PLUS
, half
, half
, NULL_RTX
, 0, OPTAB_DIRECT
);
18773 mhalf
= expand_simple_binop (mode
, MINUS
, half
, one
, NULL_RTX
,
18777 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
18778 tmp
= ix86_expand_sse_compare_mask (UNGT
, dxa
, half
, false);
18779 emit_insn (gen_rtx_SET (tmp
, gen_rtx_AND (mode
, tmp
, one
)));
18780 xa2
= expand_simple_binop (mode
, MINUS
, xa2
, tmp
, NULL_RTX
, 0, OPTAB_DIRECT
);
18781 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
18782 tmp
= ix86_expand_sse_compare_mask (UNGE
, mhalf
, dxa
, false);
18783 emit_insn (gen_rtx_SET (tmp
, gen_rtx_AND (mode
, tmp
, one
)));
18784 xa2
= expand_simple_binop (mode
, PLUS
, xa2
, tmp
, NULL_RTX
, 0, OPTAB_DIRECT
);
18786 /* res = copysign (xa2, operand1) */
18787 ix86_sse_copysign_to_positive (res
, xa2
, res
, mask
);
18789 emit_label (label
);
18790 LABEL_NUSES (label
) = 1;
18792 emit_move_insn (operand0
, res
);
18795 /* Expand SSE sequence for computing round
18796 from OP1 storing into OP0 using sse4 round insn. */
18798 ix86_expand_round_sse4 (rtx op0
, rtx op1
)
18800 machine_mode mode
= GET_MODE (op0
);
18801 rtx e1
, e2
, res
, half
;
18802 const struct real_format
*fmt
;
18803 REAL_VALUE_TYPE pred_half
, half_minus_pred_half
;
18804 rtx (*gen_copysign
) (rtx
, rtx
, rtx
);
18805 rtx (*gen_round
) (rtx
, rtx
, rtx
);
18810 gen_copysign
= gen_copysignsf3
;
18811 gen_round
= gen_sse4_1_roundsf2
;
18814 gen_copysign
= gen_copysigndf3
;
18815 gen_round
= gen_sse4_1_rounddf2
;
18818 gcc_unreachable ();
18821 /* round (a) = trunc (a + copysign (0.5, a)) */
18823 /* load nextafter (0.5, 0.0) */
18824 fmt
= REAL_MODE_FORMAT (mode
);
18825 real_2expN (&half_minus_pred_half
, -(fmt
->p
) - 1, mode
);
18826 real_arithmetic (&pred_half
, MINUS_EXPR
, &dconsthalf
, &half_minus_pred_half
);
18827 half
= const_double_from_real_value (pred_half
, mode
);
18829 /* e1 = copysign (0.5, op1) */
18830 e1
= gen_reg_rtx (mode
);
18831 emit_insn (gen_copysign (e1
, half
, op1
));
18833 /* e2 = op1 + e1 */
18834 e2
= expand_simple_binop (mode
, PLUS
, op1
, e1
, NULL_RTX
, 0, OPTAB_DIRECT
);
18836 /* res = trunc (e2) */
18837 res
= gen_reg_rtx (mode
);
18838 emit_insn (gen_round (res
, e2
, GEN_INT (ROUND_TRUNC
)));
18840 emit_move_insn (op0
, res
);
18843 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
18844 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
18845 insn every time. */
18847 static GTY(()) rtx_insn
*vselect_insn
;
18849 /* Initialize vselect_insn. */
18852 init_vselect_insn (void)
18857 x
= gen_rtx_PARALLEL (VOIDmode
, rtvec_alloc (MAX_VECT_LEN
));
18858 for (i
= 0; i
< MAX_VECT_LEN
; ++i
)
18859 XVECEXP (x
, 0, i
) = const0_rtx
;
18860 x
= gen_rtx_VEC_SELECT (V2DFmode
, gen_rtx_VEC_CONCAT (V4DFmode
, const0_rtx
,
18862 x
= gen_rtx_SET (const0_rtx
, x
);
18864 vselect_insn
= emit_insn (x
);
18868 /* Construct (set target (vec_select op0 (parallel perm))) and
18869 return true if that's a valid instruction in the active ISA. */
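/* For instance (an illustrative example, not taken from this file): for a
   V4SImode TARGET and OP0 with PERM = { 2, 3, 0, 1 }, the pattern built
   below is

     (set (reg:V4SI target)
	  (vec_select:V4SI (reg:V4SI op0)
			   (parallel [(const_int 2) (const_int 3)
				      (const_int 0) (const_int 1)])))

   which recog_memoized typically matches as a pshufd with immediate 0x4e.  */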
18872 expand_vselect (rtx target
, rtx op0
, const unsigned char *perm
,
18873 unsigned nelt
, bool testing_p
)
18876 rtx x
, save_vconcat
;
18879 if (vselect_insn
== NULL_RTX
)
18880 init_vselect_insn ();
18882 x
= XEXP (SET_SRC (PATTERN (vselect_insn
)), 1);
18883 PUT_NUM_ELEM (XVEC (x
, 0), nelt
);
18884 for (i
= 0; i
< nelt
; ++i
)
18885 XVECEXP (x
, 0, i
) = GEN_INT (perm
[i
]);
18886 save_vconcat
= XEXP (SET_SRC (PATTERN (vselect_insn
)), 0);
18887 XEXP (SET_SRC (PATTERN (vselect_insn
)), 0) = op0
;
18888 PUT_MODE (SET_SRC (PATTERN (vselect_insn
)), GET_MODE (target
));
18889 SET_DEST (PATTERN (vselect_insn
)) = target
;
18890 icode
= recog_memoized (vselect_insn
);
18892 if (icode
>= 0 && !testing_p
)
18893 emit_insn (copy_rtx (PATTERN (vselect_insn
)));
18895 SET_DEST (PATTERN (vselect_insn
)) = const0_rtx
;
18896 XEXP (SET_SRC (PATTERN (vselect_insn
)), 0) = save_vconcat
;
18897 INSN_CODE (vselect_insn
) = -1;
18902 /* Similar, but generate a vec_concat from op0 and op1 as well. */
18905 expand_vselect_vconcat (rtx target
, rtx op0
, rtx op1
,
18906 const unsigned char *perm
, unsigned nelt
,
18909 machine_mode v2mode
;
18913 if (vselect_insn
== NULL_RTX
)
18914 init_vselect_insn ();
18916 if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0
)).exists (&v2mode
))
18918 x
= XEXP (SET_SRC (PATTERN (vselect_insn
)), 0);
18919 PUT_MODE (x
, v2mode
);
18922 ok
= expand_vselect (target
, x
, perm
, nelt
, testing_p
);
18923 XEXP (x
, 0) = const0_rtx
;
18924 XEXP (x
, 1) = const0_rtx
;
18928 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
18929 using movss or movsd. */
18931 expand_vec_perm_movs (struct expand_vec_perm_d
*d
)
18933 machine_mode vmode
= d
->vmode
;
18934 unsigned i
, nelt
= d
->nelt
;
18937 if (d
->one_operand_p
)
18940 if (!(TARGET_SSE
&& (vmode
== V4SFmode
|| vmode
== V4SImode
))
18941 && !(TARGET_MMX_WITH_SSE
&& (vmode
== V2SFmode
|| vmode
== V2SImode
))
18942 && !(TARGET_SSE2
&& (vmode
== V2DFmode
|| vmode
== V2DImode
)))
18945 /* Only the first element is changed. */
18946 if (d
->perm
[0] != nelt
&& d
->perm
[0] != 0)
18948 for (i
= 1; i
< nelt
; ++i
)
18949 if (d
->perm
[i
] != i
+ nelt
- d
->perm
[0])
18955 if (d
->perm
[0] == nelt
)
18956 x
= gen_rtx_VEC_MERGE (vmode
, d
->op1
, d
->op0
, GEN_INT (1));
18958 x
= gen_rtx_VEC_MERGE (vmode
, d
->op0
, d
->op1
, GEN_INT (1));
18960 emit_insn (gen_rtx_SET (d
->target
, x
));
18965 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
18968 expand_vec_perm_insertps (struct expand_vec_perm_d
*d
)
18970 machine_mode vmode
= d
->vmode
;
18971 unsigned i
, cnt_s
, nelt
= d
->nelt
;
18975 if (d
->one_operand_p
)
18978 if (!(TARGET_SSE4_1
18979 && (vmode
== V4SFmode
|| vmode
== V4SImode
18980 || (TARGET_MMX_WITH_SSE
18981 && (vmode
== V2SFmode
|| vmode
== V2SImode
)))))
18984 for (i
= 0; i
< nelt
; ++i
)
18986 if (d
->perm
[i
] == i
)
18998 for (i
= 0; i
< nelt
; ++i
)
19000 if (d
->perm
[i
] == i
+ nelt
)
19014 gcc_assert (cnt_d
!= -1);
19016 cnt_s
= d
->perm
[cnt_d
];
19028 gcc_assert (cnt_s
< nelt
);
19030 rtx x
= gen_sse4_1_insertps (vmode
, d
->target
, dst
, src
,
19031 GEN_INT (cnt_s
<< 6 | cnt_d
<< 4));
19037 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
19038 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
19041 expand_vec_perm_blend (struct expand_vec_perm_d
*d
)
19043 machine_mode mmode
, vmode
= d
->vmode
;
19044 unsigned i
, nelt
= d
->nelt
;
19045 unsigned HOST_WIDE_INT mask
;
19046 rtx target
, op0
, op1
, maskop
, x
;
19047 rtx rperm
[32], vperm
;
19049 if (d
->one_operand_p
)
19051 if (TARGET_AVX512F
&& GET_MODE_SIZE (vmode
) == 64
19052 && (TARGET_AVX512BW
19053 || GET_MODE_UNIT_SIZE (vmode
) >= 4))
19055 else if (TARGET_AVX2
&& GET_MODE_SIZE (vmode
) == 32)
19057 else if (TARGET_AVX
&& (vmode
== V4DFmode
|| vmode
== V8SFmode
))
19059 else if (TARGET_SSE4_1
19060 && (GET_MODE_SIZE (vmode
) == 16
19061 || (TARGET_MMX_WITH_SSE
&& GET_MODE_SIZE (vmode
) == 8)
19062 || GET_MODE_SIZE (vmode
) == 4))
19067 /* This is a blend, not a permute. Elements must stay in their
19068 respective lanes. */
19069 for (i
= 0; i
< nelt
; ++i
)
19071 unsigned e
= d
->perm
[i
];
19072 if (!(e
== i
|| e
== i
+ nelt
))
19079 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
19080 decision should be extracted elsewhere, so that we only try that
19081 sequence once all budget==3 options have been tried. */
19082 target
= d
->target
;
19104 for (i
= 0; i
< nelt
; ++i
)
19105 mask
|= ((unsigned HOST_WIDE_INT
) (d
->perm
[i
] >= nelt
)) << i
;
19109 for (i
= 0; i
< 2; ++i
)
19110 mask
|= (d
->perm
[i
] >= 2 ? 15 : 0) << (i
* 4);
19115 for (i
= 0; i
< 2; ++i
)
19116 mask
|= (d
->perm
[i
] >= 2 ? 3 : 0) << (i
* 2);
19123 /* Use vpblendd instead of vpblendw. */
19124 for (i
= 0; i
< nelt
; ++i
)
19125 mask
|= ((unsigned HOST_WIDE_INT
) (d
->perm
[i
] >= nelt
)) << i
;
19130 for (i
= 0; i
< 4; ++i
)
19131 mask
|= (d
->perm
[i
] >= 4 ? 3 : 0) << (i
* 2);
19137 /* See if bytes move in pairs so we can use pblendw with
19138 an immediate argument, rather than pblendvb with a vector
19140 for (i
= 0; i
< 16; i
+= 2)
19141 if (d
->perm
[i
] + 1 != d
->perm
[i
+ 1])
19144 for (i
= 0; i
< nelt
; ++i
)
19145 rperm
[i
] = (d
->perm
[i
] < nelt
? const0_rtx
: constm1_rtx
);
19148 vperm
= gen_rtx_CONST_VECTOR (vmode
, gen_rtvec_v (nelt
, rperm
));
19149 vperm
= force_reg (vmode
, vperm
);
19151 if (GET_MODE_SIZE (vmode
) == 4)
19152 emit_insn (gen_mmx_pblendvb_v4qi (target
, op0
, op1
, vperm
));
19153 else if (GET_MODE_SIZE (vmode
) == 8)
19154 emit_insn (gen_mmx_pblendvb_v8qi (target
, op0
, op1
, vperm
));
19155 else if (GET_MODE_SIZE (vmode
) == 16)
19156 emit_insn (gen_sse4_1_pblendvb (target
, op0
, op1
, vperm
));
19158 emit_insn (gen_avx2_pblendvb (target
, op0
, op1
, vperm
));
19159 if (target
!= d
->target
)
19160 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, target
));
19164 for (i
= 0; i
< 8; ++i
)
19165 mask
|= (d
->perm
[i
* 2] >= 16) << i
;
19170 target
= gen_reg_rtx (vmode
);
19171 op0
= gen_lowpart (vmode
, op0
);
19172 op1
= gen_lowpart (vmode
, op1
);
19176 for (i
= 0; i
< 8; i
+= 2)
19177 if (d
->perm
[i
] + 1 != d
->perm
[i
+ 1])
19180 for (i
= 0; i
< 4; ++i
)
19181 mask
|= (d
->perm
[i
* 2] >= 8) << i
;
19186 for (i
= 0; i
< 4; i
+= 2)
19187 if (d
->perm
[i
] + 1 != d
->perm
[i
+ 1])
19190 for (i
= 0; i
< 2; ++i
)
19191 mask
|= (d
->perm
[i
* 2] >= 4) << i
;
19196 /* See if bytes move in pairs. If not, vpblendvb must be used. */
19197 for (i
= 0; i
< 32; i
+= 2)
19198 if (d
->perm
[i
] + 1 != d
->perm
[i
+ 1])
19200 /* See if bytes move in quadruplets. If yes, vpblendd
19201 with immediate can be used. */
19202 for (i
= 0; i
< 32; i
+= 4)
19203 if (d
->perm
[i
] + 2 != d
->perm
[i
+ 2])
19207 /* See if bytes move the same in both lanes. If yes,
19208 vpblendw with immediate can be used. */
19209 for (i
= 0; i
< 16; i
+= 2)
19210 if (d
->perm
[i
] + 16 != d
->perm
[i
+ 16])
19213 /* Use vpblendw. */
19214 for (i
= 0; i
< 16; ++i
)
19215 mask
|= (d
->perm
[i
* 2] >= 32) << i
;
19220 /* Use vpblendd. */
19221 for (i
= 0; i
< 8; ++i
)
19222 mask
|= (d
->perm
[i
* 4] >= 32) << i
;
19227 /* See if words move in pairs. If yes, vpblendd can be used. */
19228 for (i
= 0; i
< 16; i
+= 2)
19229 if (d
->perm
[i
] + 1 != d
->perm
[i
+ 1])
19233 /* See if words move the same in both lanes. If not,
19234 vpblendvb must be used. */
19235 for (i
= 0; i
< 8; i
++)
19236 if (d
->perm
[i
] + 8 != d
->perm
[i
+ 8])
19238 /* Use vpblendvb. */
19239 for (i
= 0; i
< 32; ++i
)
19240 rperm
[i
] = (d
->perm
[i
/ 2] < 16 ? const0_rtx
: constm1_rtx
);
19244 target
= gen_reg_rtx (vmode
);
19245 op0
= gen_lowpart (vmode
, op0
);
19246 op1
= gen_lowpart (vmode
, op1
);
19247 goto finish_pblendvb
;
19250 /* Use vpblendw. */
19251 for (i
= 0; i
< 16; ++i
)
19252 mask
|= (d
->perm
[i
] >= 16) << i
;
19256 /* Use vpblendd. */
19257 for (i
= 0; i
< 8; ++i
)
19258 mask
|= (d
->perm
[i
* 2] >= 16) << i
;
19263 /* Use vpblendd. */
19264 for (i
= 0; i
< 4; ++i
)
19265 mask
|= (d
->perm
[i
] >= 4 ? 3 : 0) << (i
* 2);
19270 gcc_unreachable ();
19293 if (mmode
!= VOIDmode
)
19294 maskop
= force_reg (mmode
, gen_int_mode (mask
, mmode
));
19296 maskop
= GEN_INT (mask
);
19298 /* This matches five different patterns with the different modes. */
19299 x
= gen_rtx_VEC_MERGE (vmode
, op1
, op0
, maskop
);
19300 x
= gen_rtx_SET (target
, x
);
19302 if (target
!= d
->target
)
19303 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, target
));
19308 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
19309 in terms of the variable form of vpermilps.
19311 Note that we will have already failed the immediate input vpermilps,
19312 which requires that the high and low part shuffle be identical; the
19313 variable form doesn't require that. */
19316 expand_vec_perm_vpermil (struct expand_vec_perm_d
*d
)
19318 rtx rperm
[8], vperm
;
19321 if (!TARGET_AVX
|| d
->vmode
!= V8SFmode
|| !d
->one_operand_p
)
19324 /* We can only permute within the 128-bit lane. */
19325 for (i
= 0; i
< 8; ++i
)
19327 unsigned e
= d
->perm
[i
];
19328 if (i
< 4 ? e
>= 4 : e
< 4)
19335 for (i
= 0; i
< 8; ++i
)
19337 unsigned e
= d
->perm
[i
];
19339 /* Within each 128-bit lane, the elements of op0 are numbered
19340 from 0 and the elements of op1 are numbered from 4. */
19346 rperm
[i
] = GEN_INT (e
);
19349 vperm
= gen_rtx_CONST_VECTOR (V8SImode
, gen_rtvec_v (8, rperm
));
19350 vperm
= force_reg (V8SImode
, vperm
);
19351 emit_insn (gen_avx_vpermilvarv8sf3 (d
->target
, d
->op0
, vperm
));
19356 /* For V*[QHS]Imode permutations, check if the same permutation
19357 can't be performed in a 2x, 4x or 8x wider inner mode. */
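/* For example (our illustration): a V8HImode permutation
   { 2, 3, 0, 1, 6, 7, 4, 5 } moves elements only in even/odd pairs, so it
   can be rewritten as the V4SImode permutation { 1, 0, 3, 2 }, which in turn
   may have a cheaper single-instruction expansion.  */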
19360 canonicalize_vector_int_perm (const struct expand_vec_perm_d
*d
,
19361 struct expand_vec_perm_d
*nd
)
19364 machine_mode mode
= VOIDmode
;
19368 case E_V8QImode
: mode
= V4HImode
; break;
19369 case E_V16QImode
: mode
= V8HImode
; break;
19370 case E_V32QImode
: mode
= V16HImode
; break;
19371 case E_V64QImode
: mode
= V32HImode
; break;
19372 case E_V4HImode
: mode
= V2SImode
; break;
19373 case E_V8HImode
: mode
= V4SImode
; break;
19374 case E_V16HImode
: mode
= V8SImode
; break;
19375 case E_V32HImode
: mode
= V16SImode
; break;
19376 case E_V4SImode
: mode
= V2DImode
; break;
19377 case E_V8SImode
: mode
= V4DImode
; break;
19378 case E_V16SImode
: mode
= V8DImode
; break;
19379 default: return false;
19381 for (i
= 0; i
< d
->nelt
; i
+= 2)
19382 if ((d
->perm
[i
] & 1) || d
->perm
[i
+ 1] != d
->perm
[i
] + 1)
19385 nd
->nelt
= d
->nelt
/ 2;
19386 for (i
= 0; i
< nd
->nelt
; i
++)
19387 nd
->perm
[i
] = d
->perm
[2 * i
] / 2;
19388 if (GET_MODE_INNER (mode
) != DImode
)
19389 canonicalize_vector_int_perm (nd
, nd
);
19392 nd
->one_operand_p
= d
->one_operand_p
;
19393 nd
->testing_p
= d
->testing_p
;
19394 if (d
->op0
== d
->op1
)
19395 nd
->op0
= nd
->op1
= gen_lowpart (nd
->vmode
, d
->op0
);
19398 nd
->op0
= gen_lowpart (nd
->vmode
, d
->op0
);
19399 nd
->op1
= gen_lowpart (nd
->vmode
, d
->op1
);
19402 nd
->target
= gen_raw_REG (nd
->vmode
, LAST_VIRTUAL_REGISTER
+ 1);
19404 nd
->target
= gen_reg_rtx (nd
->vmode
);
19409 /* Return true if permutation D can be performed as VMODE permutation
19413 valid_perm_using_mode_p (machine_mode vmode
, struct expand_vec_perm_d
*d
)
19415 unsigned int i
, j
, chunk
;
19417 if (GET_MODE_CLASS (vmode
) != MODE_VECTOR_INT
19418 || GET_MODE_CLASS (d
->vmode
) != MODE_VECTOR_INT
19419 || GET_MODE_SIZE (vmode
) != GET_MODE_SIZE (d
->vmode
))
19422 if (GET_MODE_NUNITS (vmode
) >= d
->nelt
)
19425 chunk
= d
->nelt
/ GET_MODE_NUNITS (vmode
);
19426 for (i
= 0; i
< d
->nelt
; i
+= chunk
)
19427 if (d
->perm
[i
] & (chunk
- 1))
19430 for (j
= 1; j
< chunk
; ++j
)
19431 if (d
->perm
[i
] + j
!= d
->perm
[i
+ j
])
19437 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
19438 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
19441 expand_vec_perm_pshufb (struct expand_vec_perm_d
*d
)
19443 unsigned i
, nelt
, eltsz
, mask
;
19444 unsigned char perm
[64];
19445 machine_mode vmode
;
19446 struct expand_vec_perm_d nd
;
19447 rtx rperm
[64], vperm
, target
, op0
, op1
;
19451 if (!d
->one_operand_p
)
19452 switch (GET_MODE_SIZE (d
->vmode
))
19476 if (valid_perm_using_mode_p (V2TImode
, d
))
19481 /* Use vperm2i128 insn. The pattern uses
19482 V4DImode instead of V2TImode. */
19483 target
= d
->target
;
19484 if (d
->vmode
!= V4DImode
)
19485 target
= gen_reg_rtx (V4DImode
);
19486 op0
= gen_lowpart (V4DImode
, d
->op0
);
19487 op1
= gen_lowpart (V4DImode
, d
->op1
);
19489 = GEN_INT ((d
->perm
[0] / (nelt
/ 2))
19490 | ((d
->perm
[nelt
/ 2] / (nelt
/ 2)) * 16));
19491 emit_insn (gen_avx2_permv2ti (target
, op0
, op1
, rperm
[0]));
19492 if (target
!= d
->target
)
19493 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, target
));
19502 switch (GET_MODE_SIZE (d
->vmode
))
19526 /* V4DImode should be already handled through
19527 expand_vselect by vpermq instruction. */
19528 gcc_assert (d
->vmode
!= V4DImode
);
19531 if (d
->vmode
== V8SImode
19532 || d
->vmode
== V16HImode
19533 || d
->vmode
== V32QImode
)
19535 /* First see if vpermq can be used for
19536 V8SImode/V16HImode/V32QImode. */
19537 if (valid_perm_using_mode_p (V4DImode
, d
))
19539 for (i
= 0; i
< 4; i
++)
19540 perm
[i
] = (d
->perm
[i
* nelt
/ 4] * 4 / nelt
) & 3;
19543 target
= gen_reg_rtx (V4DImode
);
19544 if (expand_vselect (target
, gen_lowpart (V4DImode
, d
->op0
),
19547 emit_move_insn (d
->target
,
19548 gen_lowpart (d
->vmode
, target
));
19554 /* Next see if vpermd can be used. */
19555 if (valid_perm_using_mode_p (V8SImode
, d
))
19558 /* Or if vpermps can be used. */
19559 else if (d
->vmode
== V8SFmode
)
19562 if (vmode
== V32QImode
)
19564 /* vpshufb only works intra lanes, it is not
19565 possible to shuffle bytes in between the lanes. */
19566 for (i
= 0; i
< nelt
; ++i
)
19567 if ((d
->perm
[i
] ^ i
) & (nelt
/ 2))
19573 if (!TARGET_AVX512BW
)
19576 /* If vpermq didn't work, vpshufb won't work either. */
19577 if (d
->vmode
== V8DFmode
|| d
->vmode
== V8DImode
)
19581 if (d
->vmode
== V16SImode
19582 || d
->vmode
== V32HImode
19583 || d
->vmode
== V64QImode
)
19585 /* First see if vpermq can be used for
19586 V16SImode/V32HImode/V64QImode. */
19587 if (valid_perm_using_mode_p (V8DImode
, d
))
19589 for (i
= 0; i
< 8; i
++)
19590 perm
[i
] = (d
->perm
[i
* nelt
/ 8] * 8 / nelt
) & 7;
19593 target
= gen_reg_rtx (V8DImode
);
19594 if (expand_vselect (target
, gen_lowpart (V8DImode
, d
->op0
),
19597 emit_move_insn (d
->target
,
19598 gen_lowpart (d
->vmode
, target
));
19604 /* Next see if vpermd can be used. */
19605 if (valid_perm_using_mode_p (V16SImode
, d
))
19608 /* Or if vpermps can be used. */
19609 else if (d
->vmode
== V16SFmode
)
19612 if (vmode
== V64QImode
)
19614 /* vpshufb only works intra lanes, it is not
19615 possible to shuffle bytes in between the lanes. */
19616 for (i
= 0; i
< nelt
; ++i
)
19617 if ((d
->perm
[i
] ^ i
) & (3 * nelt
/ 4))
19629 /* Try to avoid variable permutation instruction. */
19630 if (canonicalize_vector_int_perm (d
, &nd
) && expand_vec_perm_1 (&nd
))
19632 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, nd
.target
));
19636 if (vmode
== V8SImode
)
19637 for (i
= 0; i
< 8; ++i
)
19638 rperm
[i
] = GEN_INT ((d
->perm
[i
* nelt
/ 8] * 8 / nelt
) & 7);
19639 else if (vmode
== V16SImode
)
19640 for (i
= 0; i
< 16; ++i
)
19641 rperm
[i
] = GEN_INT ((d
->perm
[i
* nelt
/ 16] * 16 / nelt
) & 15);
19644 eltsz
= GET_MODE_UNIT_SIZE (d
->vmode
);
19645 if (!d
->one_operand_p
)
19646 mask
= 2 * nelt
- 1;
19647 else if (vmode
== V64QImode
)
19648 mask
= nelt
/ 4 - 1;
19649 else if (vmode
== V32QImode
)
19650 mask
= nelt
/ 2 - 1;
19654 for (i
= 0; i
< nelt
; ++i
)
19656 unsigned j
, e
= d
->perm
[i
] & mask
;
19657 for (j
= 0; j
< eltsz
; ++j
)
19658 rperm
[i
* eltsz
+ j
] = GEN_INT (e
* eltsz
+ j
);
19662 machine_mode vpmode
= vmode
;
19664 nelt
= GET_MODE_SIZE (vmode
);
19666 /* Emulate narrow modes with V16QI instructions. */
19669 rtx m128
= GEN_INT (-128);
19671 /* Remap elements from the second operand, as we have to
19672 account for inactive top elements from the first operand. */
19673 if (!d
->one_operand_p
)
19675 for (i
= 0; i
< nelt
; ++i
)
19677 unsigned ival
= UINTVAL (rperm
[i
]);
19679 rperm
[i
] = GEN_INT (ival
+ 16 - nelt
);
19683 /* Fill inactive elements in the top positions with zeros. */
19684 for (i
= nelt
; i
< 16; ++i
)
19687 vpmode
= V16QImode
;
19690 vperm
= gen_rtx_CONST_VECTOR (vpmode
,
19691 gen_rtvec_v (GET_MODE_NUNITS (vpmode
), rperm
));
19692 vperm
= force_reg (vpmode
, vperm
);
19694 if (vmode
== d
->vmode
)
19695 target
= d
->target
;
19697 target
= gen_reg_rtx (vmode
);
19699 op0
= gen_lowpart (vmode
, d
->op0
);
19701 if (d
->one_operand_p
)
19703 rtx (*gen
) (rtx
, rtx
, rtx
);
19705 if (vmode
== V4QImode
)
19706 gen
= gen_mmx_pshufbv4qi3
;
19707 else if (vmode
== V8QImode
)
19708 gen
= gen_mmx_pshufbv8qi3
;
19709 else if (vmode
== V16QImode
)
19710 gen
= gen_ssse3_pshufbv16qi3
;
19711 else if (vmode
== V32QImode
)
19712 gen
= gen_avx2_pshufbv32qi3
;
19713 else if (vmode
== V64QImode
)
19714 gen
= gen_avx512bw_pshufbv64qi3
;
19715 else if (vmode
== V8SFmode
)
19716 gen
= gen_avx2_permvarv8sf
;
19717 else if (vmode
== V8SImode
)
19718 gen
= gen_avx2_permvarv8si
;
19719 else if (vmode
== V16SFmode
)
19720 gen
= gen_avx512f_permvarv16sf
;
19721 else if (vmode
== V16SImode
)
19722 gen
= gen_avx512f_permvarv16si
;
19724 gcc_unreachable ();
19726 emit_insn (gen (target
, op0
, vperm
));
19730 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
);
19732 op1
= gen_lowpart (vmode
, d
->op1
);
19734 if (vmode
== V4QImode
)
19735 gen
= gen_mmx_ppermv32
;
19736 else if (vmode
== V8QImode
)
19737 gen
= gen_mmx_ppermv64
;
19738 else if (vmode
== V16QImode
)
19739 gen
= gen_xop_pperm
;
19741 gcc_unreachable ();
19743 emit_insn (gen (target
, op0
, op1
, vperm
));
19746 if (target
!= d
->target
)
19747 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, target
));
19752 /* Try to expand one-operand permutation with constant mask. */
19755 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d
*d
)
19757 machine_mode mode
= GET_MODE (d
->op0
);
19758 machine_mode maskmode
= mode
;
19759 unsigned inner_size
= GET_MODE_SIZE (GET_MODE_INNER (mode
));
19760 rtx (*gen
) (rtx
, rtx
, rtx
) = NULL
;
19761 rtx target
, op0
, mask
;
19764 if (!rtx_equal_p (d
->op0
, d
->op1
))
19767 if (!TARGET_AVX512F
)
19770 /* Accept VNxHImode and VNxQImode now. */
19771 if (!TARGET_AVX512VL
&& GET_MODE_SIZE (mode
) < 64)
19775 if (!TARGET_AVX512BW
&& inner_size
== 2)
19779 if (!TARGET_AVX512VBMI
&& inner_size
== 1)
19785 gen
= gen_avx512f_permvarv16si
;
19788 gen
= gen_avx512f_permvarv16sf
;
19789 maskmode
= V16SImode
;
19792 gen
= gen_avx512f_permvarv8di
;
19795 gen
= gen_avx512f_permvarv8df
;
19796 maskmode
= V8DImode
;
19799 gen
= gen_avx512bw_permvarv32hi
;
19802 gen
= gen_avx512vl_permvarv16hi
;
19805 gen
= gen_avx512vl_permvarv8hi
;
19808 gen
= gen_avx512bw_permvarv64qi
;
19811 gen
= gen_avx512vl_permvarv32qi
;
19814 gen
= gen_avx512vl_permvarv16qi
;
19824 target
= d
->target
;
19826 for (int i
= 0; i
< d
->nelt
; ++i
)
19827 vec
[i
] = GEN_INT (d
->perm
[i
]);
19828 mask
= gen_rtx_CONST_VECTOR (maskmode
, gen_rtvec_v (d
->nelt
, vec
));
19829 emit_insn (gen (target
, op0
, force_reg (maskmode
, mask
)));
19833 static bool expand_vec_perm_palignr (struct expand_vec_perm_d
*d
, bool);
19835 /* A subroutine of ix86_expand_vec_perm_const_1. Try to instantiate D
19836 in a single instruction. */
19839 expand_vec_perm_1 (struct expand_vec_perm_d
*d
)
19841 unsigned i
, nelt
= d
->nelt
;
19842 struct expand_vec_perm_d nd
;
19844 /* Check plain VEC_SELECT first, because AVX has instructions that could
19845 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
19846 input where SEL+CONCAT may not. */
19847 if (d
->one_operand_p
)
19849 int mask
= nelt
- 1;
19850 bool identity_perm
= true;
19851 bool broadcast_perm
= true;
19853 for (i
= 0; i
< nelt
; i
++)
19855 nd
.perm
[i
] = d
->perm
[i
] & mask
;
19856 if (nd
.perm
[i
] != i
)
19857 identity_perm
= false;
19859 broadcast_perm
= false;
19865 emit_move_insn (d
->target
, d
->op0
);
19868 else if (broadcast_perm
&& TARGET_AVX2
)
19870 /* Use vpbroadcast{b,w,d}. */
19871 rtx (*gen
) (rtx
, rtx
) = NULL
;
19875 if (TARGET_AVX512BW
)
19876 gen
= gen_avx512bw_vec_dupv64qi_1
;
19879 gen
= gen_avx2_pbroadcastv32qi_1
;
19882 if (TARGET_AVX512BW
)
19883 gen
= gen_avx512bw_vec_dupv32hi_1
;
19886 gen
= gen_avx2_pbroadcastv16hi_1
;
19889 if (TARGET_AVX512F
)
19890 gen
= gen_avx512f_vec_dupv16si_1
;
19893 gen
= gen_avx2_pbroadcastv8si_1
;
19896 gen
= gen_avx2_pbroadcastv16qi
;
19899 gen
= gen_avx2_pbroadcastv8hi
;
19902 if (TARGET_AVX512F
)
19903 gen
= gen_avx512f_vec_dupv16sf_1
;
19906 gen
= gen_avx2_vec_dupv8sf_1
;
19909 if (TARGET_AVX512F
)
19910 gen
= gen_avx512f_vec_dupv8df_1
;
19913 if (TARGET_AVX512F
)
19914 gen
= gen_avx512f_vec_dupv8di_1
;
19916 /* For other modes prefer other shuffles this function creates. */
19922 emit_insn (gen (d
->target
, d
->op0
));
19927 if (expand_vselect (d
->target
, d
->op0
, nd
.perm
, nelt
, d
->testing_p
))
19930 /* There are plenty of patterns in sse.md that are written for
19931 SEL+CONCAT and are not replicated for a single op. Perhaps
19932 that should be changed, to avoid the nastiness here. */
19934 /* Recognize interleave style patterns, which means incrementing
19935 every other permutation operand. */
19936 for (i
= 0; i
< nelt
; i
+= 2)
19938 nd
.perm
[i
] = d
->perm
[i
] & mask
;
19939 nd
.perm
[i
+ 1] = (d
->perm
[i
+ 1] & mask
) + nelt
;
19941 if (expand_vselect_vconcat (d
->target
, d
->op0
, d
->op0
, nd
.perm
, nelt
,
19945 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
19948 for (i
= 0; i
< nelt
; i
+= 4)
19950 nd
.perm
[i
+ 0] = d
->perm
[i
+ 0] & mask
;
19951 nd
.perm
[i
+ 1] = d
->perm
[i
+ 1] & mask
;
19952 nd
.perm
[i
+ 2] = (d
->perm
[i
+ 2] & mask
) + nelt
;
19953 nd
.perm
[i
+ 3] = (d
->perm
[i
+ 3] & mask
) + nelt
;
19956 if (expand_vselect_vconcat (d
->target
, d
->op0
, d
->op0
, nd
.perm
, nelt
,
19962 /* Try the SSE4.1 blend variable merge instructions. */
19963 if (expand_vec_perm_blend (d
))
19966 /* Try movss/movsd instructions. */
19967 if (expand_vec_perm_movs (d
))
19970 /* Try the SSE4.1 insertps instruction. */
19971 if (expand_vec_perm_insertps (d
))
19974 /* Try the fully general two operand permute. */
19975 if (expand_vselect_vconcat (d
->target
, d
->op0
, d
->op1
, d
->perm
, nelt
,
19979 /* Recognize interleave style patterns with reversed operands. */
19980 if (!d
->one_operand_p
)
19982 for (i
= 0; i
< nelt
; ++i
)
19984 unsigned e
= d
->perm
[i
];
19992 if (expand_vselect_vconcat (d
->target
, d
->op1
, d
->op0
, nd
.perm
, nelt
,
19997 /* Try one of the AVX vpermil variable permutations. */
19998 if (expand_vec_perm_vpermil (d
))
20001 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
20002 vpshufb, vpermd, vpermps or vpermq variable permutation. */
20003 if (expand_vec_perm_pshufb (d
))
20006 /* Try the AVX2 vpalignr instruction. */
20007 if (expand_vec_perm_palignr (d
, true))
20010 /* Try the AVX512F vperm{w,b,s,d} instructions */
20011 if (ix86_expand_vec_one_operand_perm_avx512 (d
))
20014 /* Try the AVX512F vpermt2/vpermi2 instructions. */
20015 if (ix86_expand_vec_perm_vpermt2 (NULL_RTX
, NULL_RTX
, NULL_RTX
, NULL_RTX
, d
))
20018 /* See if we can get the same permutation in different vector integer
20020 if (canonicalize_vector_int_perm (d
, &nd
) && expand_vec_perm_1 (&nd
))
20023 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, nd
.target
));
20029 /* Canonicalize the vec_perm index so that the first index
20030 always comes from the first vector. */
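/* For example (our illustration): with nelt == 4, the two-operand selector
   { 5, 6, 1, 2 } is rewritten as { 1, 2, 5, 6 } with op0 and op1 swapped, so
   later matchers only have to recognize the canonical orientation.  */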
20032 ix86_vec_perm_index_canon (struct expand_vec_perm_d
*d
)
20034 unsigned nelt
= d
->nelt
;
20035 if (d
->perm
[0] < nelt
)
20038 for (unsigned i
= 0; i
!= nelt
; i
++)
20039 d
->perm
[i
] = (d
->perm
[i
] + nelt
) % (2 * nelt
);
20041 std::swap (d
->op0
, d
->op1
);
20045 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
20046 in terms of a pair of shufps+ shufps/pshufd instructions. */
20048 expand_vec_perm_shufps_shufps (struct expand_vec_perm_d
*d
)
20050 unsigned char perm1
[4];
20051 machine_mode vmode
= d
->vmode
;
20053 unsigned i
, j
, k
, count
= 0;
20055 if (d
->one_operand_p
20056 || (vmode
!= V4SImode
&& vmode
!= V4SFmode
))
20062 ix86_vec_perm_index_canon (d
);
20063 for (i
= 0; i
< 4; ++i
)
20064 count
+= d
->perm
[i
] > 3 ? 1 : 0;
20066 gcc_assert (count
& 3);
20068 rtx tmp
= gen_reg_rtx (vmode
);
20069 /* 2 from op0 and 2 from op1. */
20072 unsigned char perm2
[4];
20073 for (i
= 0, j
= 0, k
= 2; i
< 4; ++i
)
20074 if (d
->perm
[i
] & 4)
20076 perm1
[k
++] = d
->perm
[i
];
20081 perm1
[j
++] = d
->perm
[i
];
20086 ok
= expand_vselect_vconcat (tmp
, d
->op0
, d
->op1
,
20087 perm1
, d
->nelt
, false);
20089 if (vmode
== V4SImode
&& TARGET_SSE2
)
20091 ok
= expand_vselect (d
->target
, tmp
,
20092 perm2
, d
->nelt
, false);
20098 ok
= expand_vselect_vconcat (d
->target
, tmp
, tmp
,
20099 perm2
, d
->nelt
, false);
20103 /* 3 from one op and 1 from another. */
20106 unsigned pair_idx
= 8, lone_idx
= 8, shift
;
20108 /* Find the lone index. */
20109 for (i
= 0; i
< 4; ++i
)
20110 if ((d
->perm
[i
] > 3 && count
== 1)
20111 || (d
->perm
[i
] < 4 && count
== 3))
20114 /* When lone_idx is not 0, it must come from the second op (count == 1). */
20115 gcc_assert (count
== (lone_idx
? 1 : 3));
20117 /* Find the pair index that sits in the same half as the lone index. */
20118 shift
= lone_idx
& 2;
20119 pair_idx
= 1 - lone_idx
+ 2 * shift
;
20121 /* First permute the lone index and the pair index into the same vector as
20122 [ lone, lone, pair, pair ]. */
20123 perm1
[1] = perm1
[0]
20124 = (count
== 3) ? d
->perm
[lone_idx
] : d
->perm
[lone_idx
] - 4;
20125 perm1
[3] = perm1
[2]
20126 = (count
== 3) ? d
->perm
[pair_idx
] : d
->perm
[pair_idx
] + 4;
20128 /* Always put the vector that contains the lone index first. */
20130 std::swap (d
->op0
, d
->op1
);
20133 ok
= expand_vselect_vconcat (tmp
, d
->op0
, d
->op1
,
20134 perm1
, d
->nelt
, false);
20137 /* Refine lone and pair index to original order. */
20138 perm1
[shift
] = lone_idx
<< 1;
20139 perm1
[shift
+ 1] = pair_idx
<< 1;
20141 /* Select the remaining 2 elements in another vector. */
20142 for (i
= 2 - shift
; i
< 4 - shift
; ++i
)
20143 perm1
[i
] = lone_idx
== 1 ? d
->perm
[i
] + 4 : d
->perm
[i
];
20145 /* Adjust to original selector. */
20147 std::swap (tmp
, d
->op1
);
20150 ok
= expand_vselect_vconcat (d
->target
, tmp
, d
->op1
,
20151 perm1
, d
->nelt
, false);
20159 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
20160 in terms of a pair of pshuflw + pshufhw instructions. */
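/* For example (an illustration, not from the original comments): the V8HImode
   permutation { 1, 0, 3, 2, 5, 4, 7, 6 } swaps adjacent elements within each
   64-bit half, so it can be emitted as pshuflw with immediate 0xb1 followed
   by pshufhw with the same immediate.  */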
20163 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d
*d
)
20165 unsigned char perm2
[MAX_VECT_LEN
];
20169 if (d
->vmode
!= V8HImode
|| !d
->one_operand_p
)
20172 /* The two permutations only operate in 64-bit lanes. */
20173 for (i
= 0; i
< 4; ++i
)
20174 if (d
->perm
[i
] >= 4)
20176 for (i
= 4; i
< 8; ++i
)
20177 if (d
->perm
[i
] < 4)
20183 /* Emit the pshuflw. */
20184 memcpy (perm2
, d
->perm
, 4);
20185 for (i
= 4; i
< 8; ++i
)
20187 ok
= expand_vselect (d
->target
, d
->op0
, perm2
, 8, d
->testing_p
);
20190 /* Emit the pshufhw. */
20191 memcpy (perm2
+ 4, d
->perm
+ 4, 4);
20192 for (i
= 0; i
< 4; ++i
)
20194 ok
= expand_vselect (d
->target
, d
->target
, perm2
, 8, d
->testing_p
);
20200 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
20201 the permutation using the SSSE3 palignr instruction. This succeeds
20202 when all of the elements in PERM fit within one vector and we merely
20203 need to shift them down so that a single vector permutation has a
20204 chance to succeed. If SINGLE_INSN_ONLY_P, succeed if only
20205 the vpalignr instruction itself can perform the requested permutation. */
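/* For example (our illustration): a two-operand V16QImode permutation that
   selects bytes 1..16 of the concatenated inputs has min == 1; after a
   palignr by one byte the remaining single-operand permutation is the
   identity, so the degenerate case handled below applies.  */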
static bool
expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
{
  unsigned i, nelt = d->nelt;
  unsigned min, max, minswap, maxswap;
  bool in_order, ok, swap = false;
  rtx shift, target;
  struct expand_vec_perm_d dcopy;

  /* Even with AVX, palignr only operates on 128-bit vectors,
     in AVX2 palignr operates on both 128-bit lanes.  */
  if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
      && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
    return false;

  min = 2 * nelt;
  max = 0;
  minswap = 2 * nelt;
  maxswap = 0;
  for (i = 0; i < nelt; ++i)
    {
      unsigned e = d->perm[i];
      unsigned eswap = d->perm[i] ^ nelt;
      if (GET_MODE_SIZE (d->vmode) == 32)
	{
	  e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
	  eswap = e ^ (nelt / 2);
	}
      if (e < min)
	min = e;
      if (e > max)
	max = e;
      if (eswap < minswap)
	minswap = eswap;
      if (eswap > maxswap)
	maxswap = eswap;
    }
  if (min == 0
      || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
    {
      if (d->one_operand_p
	  || minswap == 0
	  || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
				   ? nelt / 2 : nelt))
	return false;
      swap = true;
      min = minswap;
      max = maxswap;
    }

  /* Given that we have SSSE3, we know we'll be able to implement the
     single operand permutation after the palignr with pshufb for
     128-bit vectors.  If SINGLE_INSN_ONLY_P, in_order has to be computed
     and checked below.  */
  if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
    return true;

  dcopy = *d;
  if (swap)
    {
      dcopy.op0 = d->op1;
      dcopy.op1 = d->op0;
      for (i = 0; i < nelt; ++i)
	dcopy.perm[i] ^= nelt;
    }

  in_order = true;
  for (i = 0; i < nelt; ++i)
    {
      unsigned e = dcopy.perm[i];
      if (GET_MODE_SIZE (d->vmode) == 32
	  && e >= nelt
	  && (e & (nelt / 2 - 1)) < min)
	e = e - min - (nelt / 2);
      else
	e = e - min;
      if (e != i)
	in_order = false;
      dcopy.perm[i] = e;
    }
  dcopy.one_operand_p = true;

  if (single_insn_only_p && !in_order)
    return false;

  /* For AVX2, test whether we can permute the result in one instruction.  */
  if (d->testing_p)
    {
      if (in_order)
	return true;
      dcopy.op1 = dcopy.op0;
      return expand_vec_perm_1 (&dcopy);
    }

  shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
  if (GET_MODE_SIZE (d->vmode) == 16)
    {
      target = gen_reg_rtx (V1TImode);
      emit_insn (gen_ssse3_palignrv1ti (target,
					gen_lowpart (V1TImode, dcopy.op1),
					gen_lowpart (V1TImode, dcopy.op0),
					shift));
    }
  else
    {
      target = gen_reg_rtx (V2TImode);
      emit_insn (gen_avx2_palignrv2ti (target,
				       gen_lowpart (V2TImode, dcopy.op1),
				       gen_lowpart (V2TImode, dcopy.op0),
				       shift));
    }

  dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);

  /* Test for the degenerate case where the alignment by itself
     produces the desired permutation.  */
  if (in_order)
    {
      emit_move_insn (d->target, dcopy.op0);
      return true;
    }

  ok = expand_vec_perm_1 (&dcopy);
  gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);

  return ok;
}
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   the permutation using the SSE4_1 pblendv instruction.  Potentially
   reduces the permutation from 2 pshufb and or to 1 pshufb and pblendv.  */
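/* pblendv picks each element of the result from one of its two source
   operands according to the sign bit of the corresponding mask element,
   so it can merge an unchanged operand with a shuffled one in a single
   insn.  */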
static bool
expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
{
  unsigned i, which, nelt = d->nelt;
  struct expand_vec_perm_d dcopy, dcopy1;
  machine_mode vmode = d->vmode;
  bool ok;

  /* Use the same checks as in expand_vec_perm_blend.  */
  if (d->one_operand_p)
    return false;
  if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
    ;
  else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
    ;
  else if (TARGET_SSE4_1
	   && (GET_MODE_SIZE (vmode) == 16
	       || (TARGET_MMX_WITH_SSE && GET_MODE_SIZE (vmode) == 8)
	       || GET_MODE_SIZE (vmode) == 4))
    ;
  else
    return false;

  /* Figure out where permutation elements stay not in their
     respective lanes.  */
  for (i = 0, which = 0; i < nelt; ++i)
    {
      unsigned e = d->perm[i];
      if (e != i)
	which |= (e < nelt ? 1 : 2);
    }
  /* We can pblend the part where elements stay not in their
     respective lanes only when these elements are all in one
     half of a permutation.
     {0 1 8 3 4 5 9 7} is ok as 8, 9 are not at their respective
     lanes, but both 8 and 9 >= 8
     {0 1 8 3 4 5 2 7} is not ok as 2 and 8 are not at their
     respective lanes and 8 >= 8, but 2 is not.  */
  if (which != 1 && which != 2)
    return false;

  if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
    return true;

  /* First we apply one operand permutation to the part where
     elements stay not in their respective lanes.  */
  dcopy = *d;
  if (which == 2)
    dcopy.op0 = dcopy.op1 = d->op1;
  else
    dcopy.op0 = dcopy.op1 = d->op0;
  if (!d->testing_p)
    dcopy.target = gen_reg_rtx (vmode);
  dcopy.one_operand_p = true;

  for (i = 0; i < nelt; ++i)
    dcopy.perm[i] = d->perm[i] & (nelt - 1);

  ok = expand_vec_perm_1 (&dcopy);
  if (GET_MODE_SIZE (vmode) != 16 && !ok)
    return false;

  /* Next we put permuted elements into their positions.  */
  dcopy1 = *d;
  if (which == 2)
    dcopy1.op1 = dcopy.target;
  else
    dcopy1.op0 = dcopy.target;

  for (i = 0; i < nelt; ++i)
    dcopy1.perm[i] = (d->perm[i] >= nelt ? (nelt + i) : i);

  ok = expand_vec_perm_blend (&dcopy1);
  return ok;
}
static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   a two vector permutation into a single vector permutation by using
   an interleave operation to merge the vectors.  */
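/* The punpck family of interleave instructions merges corresponding halves
   of two vectors, e.g. punpcklwd on { a0 a1 a2 a3 ... } and
   { b0 b1 b2 b3 ... } yields { a0 b0 a1 b1 ... }; once all of the needed
   elements live in a single register, a one-operand shuffle can finish
   the job.  */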
static bool
expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dremap, dfinal;
  unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
  unsigned HOST_WIDE_INT contents;
  unsigned char remap[2 * MAX_VECT_LEN];
  rtx_insn *seq;
  bool ok, same_halves = false;

  if (GET_MODE_SIZE (d->vmode) == 4
      || GET_MODE_SIZE (d->vmode) == 8
      || GET_MODE_SIZE (d->vmode) == 16)
    {
      if (d->one_operand_p)
	return false;
    }
  else if (GET_MODE_SIZE (d->vmode) == 32)
    {
      if (!TARGET_AVX)
	return false;
      /* For 32-byte modes allow even d->one_operand_p.
	 The lack of cross-lane shuffling in some instructions
	 might prevent a single insn shuffle.  */
      dfinal = *d;
      dfinal.testing_p = true;
      /* If expand_vec_perm_interleave3 can expand this into
	 a 3 insn sequence, give up and let it be expanded as
	 3 insn sequence.  While that is one insn longer,
	 it doesn't need a memory operand and in the common
	 case that both interleave low and high permutations
	 with the same operands are adjacent needs 4 insns
	 for both after CSE.  */
      if (expand_vec_perm_interleave3 (&dfinal))
	return false;
    }
  else
    return false;

  /* Examine from whence the elements come.  */
  contents = 0;
  for (i = 0; i < nelt; ++i)
    contents |= HOST_WIDE_INT_1U << d->perm[i];

  memset (remap, 0xff, sizeof (remap));
  dremap = *d;

  if (GET_MODE_SIZE (d->vmode) == 4
      || GET_MODE_SIZE (d->vmode) == 8)
    {
      unsigned HOST_WIDE_INT h1, h2, h3, h4;

      /* Split the two input vectors into 4 halves.  */
      h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
      h2 = h1 << nelt2;
      h3 = h2 << nelt2;
      h4 = h3 << nelt2;

      /* If the elements from the low halves use interleave low,
	 and similarly for interleave high.  */
      if ((contents & (h1 | h3)) == contents)
	{
	  /* punpckl* */
	  for (i = 0; i < nelt2; ++i)
	    {
	      remap[i] = i * 2;
	      remap[i + nelt] = i * 2 + 1;
	      dremap.perm[i * 2] = i;
	      dremap.perm[i * 2 + 1] = i + nelt;
	    }
	}
      else if ((contents & (h2 | h4)) == contents)
	{
	  /* punpckh* */
	  for (i = 0; i < nelt2; ++i)
	    {
	      remap[i + nelt2] = i * 2;
	      remap[i + nelt + nelt2] = i * 2 + 1;
	      dremap.perm[i * 2] = i + nelt2;
	      dremap.perm[i * 2 + 1] = i + nelt + nelt2;
	    }
	}
      else
	return false;
    }
  else if (GET_MODE_SIZE (d->vmode) == 16)
    {
      unsigned HOST_WIDE_INT h1, h2, h3, h4;

      /* Split the two input vectors into 4 halves.  */
      h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
      h2 = h1 << nelt2;
      h3 = h2 << nelt2;
      h4 = h3 << nelt2;

      /* If the elements from the low halves use interleave low, and similarly
	 for interleave high.  If the elements are from mis-matched halves, we
	 can use shufps for V4SF/V4SI or do a DImode shuffle.  */
      if ((contents & (h1 | h3)) == contents)
	{
	  /* punpckl* */
	  for (i = 0; i < nelt2; ++i)
	    {
	      remap[i] = i * 2;
	      remap[i + nelt] = i * 2 + 1;
	      dremap.perm[i * 2] = i;
	      dremap.perm[i * 2 + 1] = i + nelt;
	    }
	  if (!TARGET_SSE2 && d->vmode == V4SImode)
	    dremap.vmode = V4SFmode;
	}
      else if ((contents & (h2 | h4)) == contents)
	{
	  /* punpckh* */
	  for (i = 0; i < nelt2; ++i)
	    {
	      remap[i + nelt2] = i * 2;
	      remap[i + nelt + nelt2] = i * 2 + 1;
	      dremap.perm[i * 2] = i + nelt2;
	      dremap.perm[i * 2 + 1] = i + nelt + nelt2;
	    }
	  if (!TARGET_SSE2 && d->vmode == V4SImode)
	    dremap.vmode = V4SFmode;
	}
      else if ((contents & (h1 | h4)) == contents)
	{
	  /* shufps */
	  for (i = 0; i < nelt2; ++i)
	    {
	      remap[i] = i;
	      remap[i + nelt + nelt2] = i + nelt2;
	      dremap.perm[i] = i;
	      dremap.perm[i + nelt2] = i + nelt + nelt2;
	    }
	  if (nelt != 4)
	    {
	      /* shufpd */
	      dremap.vmode = V2DImode;
	      dremap.nelt = 2;
	      dremap.perm[0] = 0;
	      dremap.perm[1] = 3;
	    }
	}
      else if ((contents & (h2 | h3)) == contents)
	{
	  /* shufps */
	  for (i = 0; i < nelt2; ++i)
	    {
	      remap[i + nelt2] = i;
	      remap[i + nelt] = i + nelt2;
	      dremap.perm[i] = i + nelt2;
	      dremap.perm[i + nelt2] = i + nelt;
	    }
	  if (nelt != 4)
	    {
	      /* shufpd */
	      dremap.vmode = V2DImode;
	      dremap.nelt = 2;
	      dremap.perm[0] = 1;
	      dremap.perm[1] = 2;
	    }
	}
      else
	return false;
    }
  else
    {
      unsigned int nelt4 = nelt / 4, nzcnt = 0;
      unsigned HOST_WIDE_INT q[8];
      unsigned int nonzero_halves[4];

      /* Split the two input vectors into 8 quarters.  */
      q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
      for (i = 1; i < 8; ++i)
	q[i] = q[0] << (nelt4 * i);
      for (i = 0; i < 4; ++i)
	if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
	  {
	    nonzero_halves[nzcnt] = i;
	    ++nzcnt;
	  }

      if (nzcnt == 1)
	{
	  gcc_assert (d->one_operand_p);
	  nonzero_halves[1] = nonzero_halves[0];
	  same_halves = true;
	}
      else if (d->one_operand_p)
	{
	  gcc_assert (nonzero_halves[0] == 0);
	  gcc_assert (nonzero_halves[1] == 1);
	}

      if (nzcnt <= 2)
	{
	  if (d->perm[0] / nelt2 == nonzero_halves[1])
	    {
	      /* Attempt to increase the likelihood that dfinal
		 shuffle will be intra-lane.  */
	      std::swap (nonzero_halves[0], nonzero_halves[1]);
	    }

	  /* vperm2f128 or vperm2i128.  */
	  for (i = 0; i < nelt2; ++i)
	    {
	      remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
	      remap[i + nonzero_halves[0] * nelt2] = i;
	      dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
	      dremap.perm[i] = i + nonzero_halves[0] * nelt2;
	    }

	  if (d->vmode != V8SFmode
	      && d->vmode != V4DFmode
	      && d->vmode != V8SImode)
	    {
	      dremap.vmode = V8SImode;
	      dremap.nelt = 8;
	      for (i = 0; i < 4; ++i)
		{
		  dremap.perm[i] = i + nonzero_halves[0] * 4;
		  dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
		}
	    }
	}
      else if (d->one_operand_p)
	return false;
      else if (TARGET_AVX2
	       && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
	{
	  /* vpunpckl* */
	  for (i = 0; i < nelt4; ++i)
	    {
	      remap[i] = i * 2;
	      remap[i + nelt] = i * 2 + 1;
	      remap[i + nelt2] = i * 2 + nelt2;
	      remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
	      dremap.perm[i * 2] = i;
	      dremap.perm[i * 2 + 1] = i + nelt;
	      dremap.perm[i * 2 + nelt2] = i + nelt2;
	      dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
	    }
	}
      else if (TARGET_AVX2
	       && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
	{
	  /* vpunpckh* */
	  for (i = 0; i < nelt4; ++i)
	    {
	      remap[i + nelt4] = i * 2;
	      remap[i + nelt + nelt4] = i * 2 + 1;
	      remap[i + nelt2 + nelt4] = i * 2 + nelt2;
	      remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
	      dremap.perm[i * 2] = i + nelt4;
	      dremap.perm[i * 2 + 1] = i + nelt + nelt4;
	      dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
	      dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
	    }
	}
      else
	return false;
    }

  /* Use the remapping array set up above to move the elements from their
     swizzled locations into their final destinations.  */
  dfinal = *d;
  for (i = 0; i < nelt; ++i)
    {
      unsigned e = remap[d->perm[i]];
      gcc_assert (e < nelt);
      /* If same_halves is true, both halves of the remapped vector are the
	 same.  Avoid cross-lane accesses if possible.  */
      if (same_halves && i >= nelt2)
	{
	  gcc_assert (e < nelt2);
	  dfinal.perm[i] = e + nelt2;
	}
      else
	dfinal.perm[i] = e;
    }
  if (!d->testing_p)
    {
      dremap.target = gen_reg_rtx (dremap.vmode);
      dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
    }
  dfinal.op1 = dfinal.op0;
  dfinal.one_operand_p = true;

  /* Test if the final remap can be done with a single insn.  For V4SFmode or
     V4SImode this *will* succeed.  For V8HImode or V16QImode it may not.  */
  start_sequence ();
  ok = expand_vec_perm_1 (&dfinal);
  seq = get_insns ();
  end_sequence ();

  if (!ok)
    return false;

  if (d->testing_p)
    return true;

  if (dremap.vmode != dfinal.vmode)
    {
      dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
      dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
    }

  ok = expand_vec_perm_1 (&dremap);
  gcc_assert (ok);

  emit_insn (seq);
  return true;
}
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   a single vector cross-lane permutation into vpermq followed
   by any of the single insn permutations.  */
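/* vpermq reorders the four 64-bit quarters of a 256-bit register with an
   immediate selector, which is enough to gather the (at most two) quarters
   each half of the result draws from into that half; the remaining
   reordering is then intra-lane.  */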
static bool
expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dremap, dfinal;
  unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
  unsigned contents[2];
  bool ok;

  if (!(TARGET_AVX2
	&& (d->vmode == V32QImode || d->vmode == V16HImode)
	&& d->one_operand_p))
    return false;

  contents[0] = 0;
  contents[1] = 0;
  for (i = 0; i < nelt2; ++i)
    {
      contents[0] |= 1u << (d->perm[i] / nelt4);
      contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
    }

  for (i = 0; i < 2; ++i)
    {
      unsigned int cnt = 0;
      for (j = 0; j < 4; ++j)
	if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
	  return false;
    }

  if (d->testing_p)
    return true;

  dremap = *d;
  dremap.vmode = V4DImode;
  dremap.nelt = 4;
  dremap.target = gen_reg_rtx (V4DImode);
  dremap.op0 = gen_lowpart (V4DImode, d->op0);
  dremap.op1 = dremap.op0;
  dremap.one_operand_p = true;
  for (i = 0; i < 2; ++i)
    {
      unsigned int cnt = 0;
      for (j = 0; j < 4; ++j)
	if ((contents[i] & (1u << j)) != 0)
	  dremap.perm[2 * i + cnt++] = j;
      for (; cnt < 2; ++cnt)
	dremap.perm[2 * i + cnt] = 0;
    }

  dfinal = *d;
  dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
  dfinal.op1 = dfinal.op0;
  dfinal.one_operand_p = true;
  for (i = 0, j = 0; i < nelt; ++i)
    {
      if (i == nelt2)
	j = 2;
      dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
      if ((d->perm[i] / nelt4) == dremap.perm[j])
	;
      else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
	dfinal.perm[i] |= nelt4;
      else
	gcc_unreachable ();
    }

  ok = expand_vec_perm_1 (&dremap);
  gcc_assert (ok);

  ok = expand_vec_perm_1 (&dfinal);
  gcc_assert (ok);

  return true;
}
static bool canonicalize_perm (struct expand_vec_perm_d *d);
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to expand
   a vector permutation using two instructions, vperm2f128 resp.
   vperm2i128 followed by any single in-lane permutation.  */
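/* vperm2f128/vperm2i128 builds a 256-bit result from any two of the four
   128-bit lanes of its two source operands, selected by a 2x2-bit
   immediate, so it is a natural first step for permutations that have to
   move data across the 128-bit boundary.  */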
static bool
expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dfirst, dsecond;
  unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
  bool ok;

  if (!TARGET_AVX
      || GET_MODE_SIZE (d->vmode) != 32
      || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
    return false;

  dsecond = *d;
  dsecond.one_operand_p = false;
  dsecond.testing_p = true;

  /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
     immediate.  For perm < 16 the second permutation uses
     d->op0 as first operand, for perm >= 16 it uses d->op1
     as first operand.  The second operand is the result of
     vperm2[fi]128.  */
  for (perm = 0; perm < 32; perm++)
    {
      /* Ignore permutations which do not move anything cross-lane.  */
      if (perm < 16)
	{
	  /* The second shuffle for e.g. V4DFmode has
	     0123 and ABCD operands.
	     Ignore AB23, as 23 is already in the second lane
	     of the first operand.  */
	  if ((perm & 0xc) == (1 << 2)) continue;
	  /* And 01CD, as 01 is in the first lane of the first
	     operand.  */
	  if ((perm & 3) == 0) continue;
	  /* And 4567, as then the vperm2[fi]128 doesn't change
	     anything on the original 4567 second operand.  */
	  if ((perm & 0xf) == ((3 << 2) | 2)) continue;
	}
      else
	{
	  /* The second shuffle for e.g. V4DFmode has
	     4567 and ABCD operands.
	     Ignore AB67, as 67 is already in the second lane
	     of the first operand.  */
	  if ((perm & 0xc) == (3 << 2)) continue;
	  /* And 45CD, as 45 is in the first lane of the first
	     operand.  */
	  if ((perm & 3) == 2) continue;
	  /* And 0123, as then the vperm2[fi]128 doesn't change
	     anything on the original 0123 first operand.  */
	  if ((perm & 0xf) == (1 << 2)) continue;
	}

      for (i = 0; i < nelt; i++)
	{
	  j = d->perm[i] / nelt2;
	  if (j == ((perm >> (2 * (i >= nelt2))) & 3))
	    dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
	  else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
	    dsecond.perm[i] = d->perm[i] & (nelt - 1);
	  else
	    break;
	}

      if (i == nelt)
	{
	  start_sequence ();
	  ok = expand_vec_perm_1 (&dsecond);
	  end_sequence ();
	}
      else
	ok = false;

      if (ok)
	{
	  if (d->testing_p)
	    return true;

	  /* Found a usable second shuffle.  dfirst will be
	     vperm2f128 on d->op0 and d->op1.  */
	  dsecond.testing_p = false;
	  dfirst = *d;
	  dfirst.target = gen_reg_rtx (d->vmode);
	  for (i = 0; i < nelt; i++)
	    dfirst.perm[i] = (i & (nelt2 - 1))
			     + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;

	  canonicalize_perm (&dfirst);
	  ok = expand_vec_perm_1 (&dfirst);
	  gcc_assert (ok);

	  /* And dsecond is some single insn shuffle, taking
	     d->op0 and result of vperm2f128 (if perm < 16) or
	     d->op1 and result of vperm2f128 (otherwise).  */
	  if (perm >= 16)
	    dsecond.op0 = dsecond.op1;
	  dsecond.op1 = dfirst.target;

	  ok = expand_vec_perm_1 (&dsecond);
	  gcc_assert (ok);

	  return true;
	}

      /* For one operand, the only useful vperm2f128 permutation is 0x01
	 aka lanes swap.  */
      if (d->one_operand_p)
	return false;
    }

  return false;
}
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   a two vector permutation using 2 intra-lane interleave insns
   and cross-lane shuffle for 32-byte vectors.  */

static bool
expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
{
  unsigned i, nelt;
  rtx (*gen) (rtx, rtx, rtx);

  if (d->one_operand_p)
    return false;
  if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
    ;
  else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
    ;
  else
    return false;

  nelt = d->nelt;
  if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
    return false;
  for (i = 0; i < nelt; i += 2)
    if (d->perm[i] != d->perm[0] + i / 2
	|| d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
      return false;

  if (d->testing_p)
    return true;

  switch (d->vmode)
    {
    case E_V32QImode:
      if (d->perm[0])
	gen = gen_vec_interleave_highv32qi;
      else
	gen = gen_vec_interleave_lowv32qi;
      break;
    case E_V16HImode:
      if (d->perm[0])
	gen = gen_vec_interleave_highv16hi;
      else
	gen = gen_vec_interleave_lowv16hi;
      break;
    case E_V8SImode:
      if (d->perm[0])
	gen = gen_vec_interleave_highv8si;
      else
	gen = gen_vec_interleave_lowv8si;
      break;
    case E_V4DImode:
      if (d->perm[0])
	gen = gen_vec_interleave_highv4di;
      else
	gen = gen_vec_interleave_lowv4di;
      break;
    case E_V8SFmode:
      if (d->perm[0])
	gen = gen_vec_interleave_highv8sf;
      else
	gen = gen_vec_interleave_lowv8sf;
      break;
    case E_V4DFmode:
      if (d->perm[0])
	gen = gen_vec_interleave_highv4df;
      else
	gen = gen_vec_interleave_lowv4df;
      break;
    default:
      gcc_unreachable ();
    }

  emit_insn (gen (d->target, d->op0, d->op1));
  return true;
}
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
   a single vector permutation using a single intra-lane vector
   permutation, vperm2f128 swapping the lanes and vblend* insn blending
   the non-swapped and swapped vectors together.  */

static bool
expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dfirst, dsecond;
  unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
  rtx_insn *seq;
  bool ok;
  rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;

  if (!TARGET_AVX
      || (d->vmode != V8SFmode && d->vmode != V4DFmode)
      || !d->one_operand_p)
    return false;

  dfirst = *d;
  for (i = 0; i < nelt; i++)
    dfirst.perm[i] = 0xff;
  for (i = 0, msk = 0; i < nelt; i++)
    {
      j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
      if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
	return false;
      dfirst.perm[j] = d->perm[i];
      if (j != i)
	msk |= (1 << i);
    }
  for (i = 0; i < nelt; i++)
    if (dfirst.perm[i] == 0xff)
      dfirst.perm[i] = i;

  if (!d->testing_p)
    dfirst.target = gen_reg_rtx (dfirst.vmode);

  start_sequence ();
  ok = expand_vec_perm_1 (&dfirst);
  seq = get_insns ();
  end_sequence ();

  if (!ok)
    return false;

  if (d->testing_p)
    return true;

  emit_insn (seq);

  dsecond = *d;
  dsecond.op0 = dfirst.target;
  dsecond.op1 = dfirst.target;
  dsecond.one_operand_p = true;
  dsecond.target = gen_reg_rtx (dsecond.vmode);
  for (i = 0; i < nelt; i++)
    dsecond.perm[i] = i ^ nelt2;

  ok = expand_vec_perm_1 (&dsecond);
  gcc_assert (ok);

  blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
  emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
  return true;
}
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
   a two vector permutation using two single vector permutations and
   {,v}{,p}unpckl{ps,pd,bw,wd,dq}.  If two_insn, succeed only if one
   of dfirst or dsecond is the identity permutation.  */

static bool
expand_vec_perm_2perm_interleave (struct expand_vec_perm_d *d, bool two_insn)
{
  unsigned i, nelt = d->nelt, nelt2 = nelt / 2, lane = nelt;
  struct expand_vec_perm_d dfirst, dsecond, dfinal;
  bool ident1 = true, ident2 = true;

  if (d->one_operand_p)
    return false;

  if (GET_MODE_SIZE (d->vmode) == 16)
    {
      if (!TARGET_SSE)
	return false;
      if (d->vmode != V4SFmode && d->vmode != V2DFmode && !TARGET_SSE2)
	return false;
    }
  else if (GET_MODE_SIZE (d->vmode) == 32)
    {
      if (!TARGET_AVX)
	return false;
      if (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2)
	return false;
      lane = nelt2;
    }
  else
    return false;

  for (i = 1; i < nelt; i++)
    if ((d->perm[i] >= nelt) != ((d->perm[0] >= nelt) ^ (i & 1)))
      return false;

  dfirst = *d;
  dsecond = *d;
  dfinal = *d;
  dfirst.op1 = dfirst.op0;
  dfirst.one_operand_p = true;
  dsecond.op0 = dsecond.op1;
  dsecond.one_operand_p = true;

  for (i = 0; i < nelt; i++)
    if (d->perm[i] >= nelt)
      {
	dsecond.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i] - nelt;
	if (d->perm[i] - nelt != i / 2 + (i >= lane ? lane / 2 : 0))
	  ident2 = false;
	dsecond.perm[i / 2 + (i >= lane ? lane : lane / 2)]
	  = d->perm[i] - nelt;
      }
    else
      {
	dfirst.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i];
	if (d->perm[i] != i / 2 + (i >= lane ? lane / 2 : 0))
	  ident1 = false;
	dfirst.perm[i / 2 + (i >= lane ? lane : lane / 2)] = d->perm[i];
      }

  if (two_insn && !ident1 && !ident2)
    return false;

  if (!d->testing_p)
    {
      if (!ident1)
	dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
      if (!ident2)
	dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
      if (d->perm[0] >= nelt)
	std::swap (dfinal.op0, dfinal.op1);
    }

  bool ok;
  rtx_insn *seq1 = NULL, *seq2 = NULL;

  if (!ident1)
    {
      start_sequence ();
      ok = expand_vec_perm_1 (&dfirst);
      seq1 = get_insns ();
      end_sequence ();

      if (!ok)
	return false;
    }

  if (!ident2)
    {
      start_sequence ();
      ok = expand_vec_perm_1 (&dsecond);
      seq2 = get_insns ();
      end_sequence ();

      if (!ok)
	return false;
    }

  if (d->testing_p)
    return true;

  for (i = 0; i < nelt; i++)
    {
      dfinal.perm[i] = i / 2;
      if (i >= lane)
	dfinal.perm[i] += lane / 2;
      if ((i & 1) != 0)
	dfinal.perm[i] += nelt;
    }
  emit_insn (seq1);
  emit_insn (seq2);
  ok = expand_vselect_vconcat (dfinal.target, dfinal.op0, dfinal.op1,
			       dfinal.perm, dfinal.nelt, false);
  gcc_assert (ok);
  return true;
}
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   the permutation using two single vector permutations and the SSE4_1
   pblendv instruction.  If two_insn, succeed only if one of dfirst or
   dsecond is the identity permutation.  */

static bool
expand_vec_perm_2perm_pblendv (struct expand_vec_perm_d *d, bool two_insn)
{
  unsigned i, nelt = d->nelt;
  struct expand_vec_perm_d dfirst, dsecond, dfinal;
  machine_mode vmode = d->vmode;
  bool ident1 = true, ident2 = true;

  /* Use the same checks as in expand_vec_perm_blend.  */
  if (d->one_operand_p)
    return false;
  if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
    ;
  else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
    ;
  else if (TARGET_SSE4_1
	   && (GET_MODE_SIZE (vmode) == 16
	       || (TARGET_MMX_WITH_SSE && GET_MODE_SIZE (vmode) == 8)
	       || GET_MODE_SIZE (vmode) == 4))
    ;
  else
    return false;

  dfirst = *d;
  dsecond = *d;
  dfinal = *d;
  dfirst.op1 = dfirst.op0;
  dfirst.one_operand_p = true;
  dsecond.op0 = dsecond.op1;
  dsecond.one_operand_p = true;

  for (i = 0; i < nelt; ++i)
    if (d->perm[i] >= nelt)
      {
	dfirst.perm[i] = 0xff;
	dsecond.perm[i] = d->perm[i] - nelt;
	if (d->perm[i] != i + nelt)
	  ident2 = false;
      }
    else
      {
	dsecond.perm[i] = 0xff;
	dfirst.perm[i] = d->perm[i];
	if (d->perm[i] != i)
	  ident1 = false;
      }

  if (two_insn && !ident1 && !ident2)
    return false;

  /* For now.  Ideally treat 0xff as a wildcard.  */
  for (i = 0; i < nelt; ++i)
    if (dfirst.perm[i] == 0xff)
      {
	if (GET_MODE_SIZE (vmode) == 32
	    && dfirst.perm[i ^ (nelt / 2)] != 0xff)
	  dfirst.perm[i] = dfirst.perm[i ^ (nelt / 2)] ^ (nelt / 2);
	else
	  dfirst.perm[i] = i;
      }
    else
      {
	if (GET_MODE_SIZE (vmode) == 32
	    && dsecond.perm[i ^ (nelt / 2)] != 0xff)
	  dsecond.perm[i] = dsecond.perm[i ^ (nelt / 2)] ^ (nelt / 2);
	else
	  dsecond.perm[i] = i;
      }

  if (!d->testing_p)
    {
      if (!ident1)
	dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
      if (!ident2)
	dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
    }

  bool ok;
  rtx_insn *seq1 = NULL, *seq2 = NULL;

  if (!ident1)
    {
      start_sequence ();
      ok = expand_vec_perm_1 (&dfirst);
      seq1 = get_insns ();
      end_sequence ();

      if (!ok)
	return false;
    }

  if (!ident2)
    {
      start_sequence ();
      ok = expand_vec_perm_1 (&dsecond);
      seq2 = get_insns ();
      end_sequence ();

      if (!ok)
	return false;
    }

  if (d->testing_p)
    return true;

  for (i = 0; i < nelt; ++i)
    dfinal.perm[i] = (d->perm[i] >= nelt ? i + nelt : i);

  emit_insn (seq1);
  emit_insn (seq2);
  ok = expand_vec_perm_blend (&dfinal);
  gcc_assert (ok);
  return true;
}
/* A subroutine of ix86_expand_vec_perm_const_1.  Implement a V4DF
   permutation using two vperm2f128, followed by a vshufpd insn blending
   the two vectors together.  */

static bool
expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dfirst, dsecond, dthird;
  bool ok;

  if (!TARGET_AVX || (d->vmode != V4DFmode))
    return false;

  if (d->testing_p)
    return true;

  dfirst = *d;
  dsecond = *d;
  dthird = *d;

  dfirst.perm[0] = (d->perm[0] & ~1);
  dfirst.perm[1] = (d->perm[0] & ~1) + 1;
  dfirst.perm[2] = (d->perm[2] & ~1);
  dfirst.perm[3] = (d->perm[2] & ~1) + 1;
  dsecond.perm[0] = (d->perm[1] & ~1);
  dsecond.perm[1] = (d->perm[1] & ~1) + 1;
  dsecond.perm[2] = (d->perm[3] & ~1);
  dsecond.perm[3] = (d->perm[3] & ~1) + 1;
  dthird.perm[0] = (d->perm[0] % 2);
  dthird.perm[1] = (d->perm[1] % 2) + 4;
  dthird.perm[2] = (d->perm[2] % 2) + 2;
  dthird.perm[3] = (d->perm[3] % 2) + 6;

  dfirst.target = gen_reg_rtx (dfirst.vmode);
  dsecond.target = gen_reg_rtx (dsecond.vmode);
  dthird.op0 = dfirst.target;
  dthird.op1 = dsecond.target;
  dthird.one_operand_p = false;

  canonicalize_perm (&dfirst);
  canonicalize_perm (&dsecond);

  ok = expand_vec_perm_1 (&dfirst)
       && expand_vec_perm_1 (&dsecond)
       && expand_vec_perm_1 (&dthird);

  gcc_assert (ok);

  return true;
}
static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *);

/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
   a two vector permutation using two intra-lane vector
   permutations, vperm2f128 swapping the lanes and vblend* insn blending
   the non-swapped and swapped vectors together.  */

static bool
expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dfirst, dsecond, dthird;
  unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2, which1 = 0, which2 = 0;
  rtx_insn *seq1, *seq2;
  bool ok;
  rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;

  if (!TARGET_AVX
      || (d->vmode != V8SFmode && d->vmode != V4DFmode)
      || d->one_operand_p)
    return false;

  dfirst = *d;
  dsecond = *d;
  for (i = 0; i < nelt; i++)
    {
      dfirst.perm[i] = 0xff;
      dsecond.perm[i] = 0xff;
    }
  for (i = 0, msk = 0; i < nelt; i++)
    {
      j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
      if (j == i)
	{
	  dfirst.perm[j] = d->perm[i];
	  which1 |= (d->perm[i] < nelt ? 1 : 2);
	}
      else
	{
	  dsecond.perm[j] = d->perm[i];
	  which2 |= (d->perm[i] < nelt ? 1 : 2);
	  msk |= (1U << i);
	}
    }
  if (msk == 0 || msk == (1U << nelt) - 1)
    return false;

  if (!d->testing_p)
    {
      dfirst.target = gen_reg_rtx (dfirst.vmode);
      dsecond.target = gen_reg_rtx (dsecond.vmode);
    }

  for (i = 0; i < nelt; i++)
    {
      if (dfirst.perm[i] == 0xff)
	dfirst.perm[i] = (which1 == 2 ? i + nelt : i);
      if (dsecond.perm[i] == 0xff)
	dsecond.perm[i] = (which2 == 2 ? i + nelt : i);
    }
  canonicalize_perm (&dfirst);
  start_sequence ();
  ok = ix86_expand_vec_perm_const_1 (&dfirst);
  seq1 = get_insns ();
  end_sequence ();

  if (!ok)
    return false;

  canonicalize_perm (&dsecond);
  start_sequence ();
  ok = ix86_expand_vec_perm_const_1 (&dsecond);
  seq2 = get_insns ();
  end_sequence ();

  if (!ok)
    return false;

  if (d->testing_p)
    return true;

  emit_insn (seq1);
  emit_insn (seq2);

  dthird = *d;
  dthird.op0 = dsecond.target;
  dthird.op1 = dsecond.target;
  dthird.one_operand_p = true;
  dthird.target = gen_reg_rtx (dthird.vmode);
  for (i = 0; i < nelt; i++)
    dthird.perm[i] = i ^ nelt2;

  ok = expand_vec_perm_1 (&dthird);
  gcc_assert (ok);

  blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
  emit_insn (blend (d->target, dfirst.target, dthird.target, GEN_INT (msk)));
  return true;
}
/* A subroutine of expand_vec_perm_even_odd_1.  Implement the double-word
   permutation with two pshufb insns and an ior.  We should have already
   failed all two instruction sequences.  */
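/* pshufb selects each result byte by the low bits of the corresponding
   control byte and writes zero wherever the control byte has bit 7 set;
   two masked pshufb results can therefore be combined with a plain por.  */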
static bool
expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
{
  rtx rperm[2][16], vperm, l, h, op, m128;
  unsigned int i, nelt, eltsz;
  machine_mode mode;
  rtx (*gen) (rtx, rtx, rtx);

  if (!TARGET_SSSE3 || (GET_MODE_SIZE (d->vmode) != 16
			&& GET_MODE_SIZE (d->vmode) != 8
			&& GET_MODE_SIZE (d->vmode) != 4))
    return false;
  gcc_assert (!d->one_operand_p);

  if (d->testing_p)
    return true;

  switch (GET_MODE_SIZE (d->vmode))
    {
    case 4:
      mode = V4QImode;
      gen = gen_mmx_pshufbv4qi3;
      break;
    case 8:
      mode = V8QImode;
      gen = gen_mmx_pshufbv8qi3;
      break;
    case 16:
      mode = V16QImode;
      gen = gen_ssse3_pshufbv16qi3;
      break;
    default:
      gcc_unreachable ();
    }

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate two permutation masks.  If the required element is within
     the given vector it is shuffled into the proper lane.  If the required
     element is in the other vector, force a zero into the lane by setting
     bit 7 in the permutation mask.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < nelt; ++i)
    {
      unsigned j, k, e = d->perm[i];
      unsigned which = (e >= nelt);
      if (e >= nelt)
	e -= nelt;

      for (j = 0; j < eltsz; ++j)
	{
	  rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
	  rperm[1-which][i*eltsz + j] = m128;
	}

      for (k = i*eltsz + j; k < 16; ++k)
	rperm[0][k] = rperm[1][k] = m128;
    }

  vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
  vperm = force_reg (V16QImode, vperm);

  l = gen_reg_rtx (mode);
  op = gen_lowpart (mode, d->op0);
  emit_insn (gen (l, op, vperm));

  vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
  vperm = force_reg (V16QImode, vperm);

  h = gen_reg_rtx (mode);
  op = gen_lowpart (mode, d->op1);
  emit_insn (gen (h, op, vperm));

  op = d->target;
  if (d->vmode != mode)
    op = gen_reg_rtx (mode);
  ix86_emit_vec_binop (IOR, mode, op, l, h);
  if (op != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}
/* Implement arbitrary permutation of one V32QImode or V16HImode operand
   with two vpshufb insns, vpermq and vpor.  We should have already failed
   all two or three instruction sequences.  */

static bool
expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
{
  rtx rperm[2][32], vperm, l, h, hp, op, m128;
  unsigned int i, nelt, eltsz;

  if (!TARGET_AVX2
      || !d->one_operand_p
      || (d->vmode != V32QImode && d->vmode != V16HImode))
    return false;

  if (d->testing_p)
    return true;

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate two permutation masks.  If the required element is within
     the same lane, it is shuffled in.  If the required element is from the
     other lane, force a zero by setting bit 7 in the permutation mask.
     In the other mask the mask has non-negative elements if the element
     is requested from the other lane, but also moved to the other lane,
     so that the result of vpshufb can have the two V2TImode halves
     swapped.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < nelt; ++i)
    {
      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
      unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;

      for (j = 0; j < eltsz; ++j)
	{
	  rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
	  rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
	}
    }

  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
  vperm = force_reg (V32QImode, vperm);

  h = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op0);
  emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));

  /* Swap the 128-bit lanes of h into hp.  */
  hp = gen_reg_rtx (V4DImode);
  op = gen_lowpart (V4DImode, h);
  emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
				  const1_rtx));

  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
  vperm = force_reg (V32QImode, vperm);

  l = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op0);
  emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));

  op = d->target;
  if (d->vmode != V32QImode)
    op = gen_reg_rtx (V32QImode);
  emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
  if (op != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}
/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   and extract-odd permutations of two V32QImode or V16HImode operands
   with two vpshufb insns, vpor and vpermq.  We should have already
   failed all two or three instruction sequences.  */

static bool
expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
{
  rtx rperm[2][32], vperm, l, h, ior, op, m128;
  unsigned int i, nelt, eltsz;

  if (!TARGET_AVX2
      || d->one_operand_p
      || (d->vmode != V32QImode && d->vmode != V16HImode))
    return false;

  for (i = 0; i < d->nelt; ++i)
    if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
      return false;

  if (d->testing_p)
    return true;

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate two permutation masks.  In the first permutation mask
     the first quarter will contain indexes for the first half
     of the op0, the second quarter will contain bit 7 set, third quarter
     will contain indexes for the second half of the op0 and the
     last quarter bit 7 set.  In the second permutation mask
     the first quarter will contain bit 7 set, the second quarter
     indexes for the first half of the op1, the third quarter bit 7 set
     and last quarter indexes for the second half of the op1.
     I.e. the first mask e.g. for V32QImode extract even will be:
     0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
     (all values masked with 0xf except for -128) and second mask
     for extract even will be
     -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < nelt; ++i)
    {
      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
      unsigned which = d->perm[i] >= nelt;
      unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;

      for (j = 0; j < eltsz; ++j)
	{
	  rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
	  rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
	}
    }

  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
  vperm = force_reg (V32QImode, vperm);

  l = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op0);
  emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));

  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
  vperm = force_reg (V32QImode, vperm);

  h = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op1);
  emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));

  ior = gen_reg_rtx (V32QImode);
  emit_insn (gen_iorv32qi3 (ior, l, h));

  /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation.  */
  op = gen_reg_rtx (V4DImode);
  ior = gen_lowpart (V4DImode, ior);
  emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
				  const1_rtx, GEN_INT (3)));
  emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}
/* Implement permutation with pslldq + psrldq + por when pshufb is not
   available.  */

static bool
expand_vec_perm_pslldq_psrldq_por (struct expand_vec_perm_d *d, bool pandn)
{
  unsigned i, nelt = d->nelt;
  unsigned start1, end1 = -1;
  unsigned start2 = -1;
  machine_mode vmode = d->vmode, imode;
  bool clear_op0, clear_op1;
  unsigned inner_size;
  rtx op0, op1, dop1;
  rtx (*gen_vec_shr) (rtx, rtx, rtx);
  rtx (*gen_vec_shl) (rtx, rtx, rtx);

  /* pshufd can be used for V4SI/V2DI under TARGET_SSE2.  */
  if (!TARGET_SSE2 || (vmode != E_V16QImode && vmode != E_V8HImode))
    return false;

  start1 = d->perm[0];
  for (i = 1; i < nelt; i++)
    {
      if (d->perm[i] != d->perm[i-1] + 1
	  || d->perm[i] == nelt)
	{
	  if (end1 == (unsigned) -1)
	    {
	      start2 = d->perm[i];
	      end1 = d->perm[i-1];
	    }
	  else
	    return false;
	}
    }

  clear_op0 = end1 != nelt - 1;
  clear_op1 = start2 % nelt != 0;
  /* pandn/pand is needed to clear upper/lower bits of op0/op1.  */
  if (!pandn && (clear_op0 || clear_op1))
    return false;

  if (d->testing_p)
    return true;

  gen_vec_shr = vmode == E_V16QImode ? gen_vec_shr_v16qi : gen_vec_shr_v8hi;
  gen_vec_shl = vmode == E_V16QImode ? gen_vec_shl_v16qi : gen_vec_shl_v8hi;
  imode = GET_MODE_INNER (vmode);
  inner_size = GET_MODE_BITSIZE (imode);
  op0 = gen_reg_rtx (vmode);
  op1 = gen_reg_rtx (vmode);

  if (start1)
    emit_insn (gen_vec_shr (op0, d->op0, GEN_INT (start1 * inner_size)));
  else
    emit_move_insn (op0, d->op0);

  if (d->one_operand_p)
    dop1 = d->op0;
  else
    dop1 = d->op1;

  int shl_offset = end1 - start1 + 1 - start2 % nelt;
  if (shl_offset)
    emit_insn (gen_vec_shl (op1, dop1, GEN_INT (shl_offset * inner_size)));
  else
    emit_move_insn (op1, dop1);

  /* Clear lower/upper bits for op0/op1.  */
  if (clear_op0 || clear_op1)
    {
      rtx vec[16];
      rtx const_vec, clear;
      for (i = 0; i != nelt; i++)
	{
	  if (i < (end1 - start1 + 1))
	    vec[i] = gen_int_mode ((HOST_WIDE_INT_1U << inner_size) - 1, imode);
	  else
	    vec[i] = CONST0_RTX (imode);
	}
      const_vec = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, vec));
      const_vec = validize_mem (force_const_mem (vmode, const_vec));
      clear = force_reg (vmode, const_vec);

      if (clear_op0)
	emit_move_insn (op0, gen_rtx_AND (vmode, op0, clear));
      if (clear_op1)
	emit_move_insn (op1, gen_rtx_AND (vmode,
					  gen_rtx_NOT (vmode, clear),
					  op1));
    }

  emit_move_insn (d->target, gen_rtx_IOR (vmode, op0, op1));
  return true;
}
/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   and extract-odd permutations of two V8QI, V8HI, V16QI, V16HI or V32QI
   operands with two "and" and "pack" or two "shift" and "pack" insns.
   We should have already failed all two instruction sequences.  */
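/* For example, to extract the even 16-bit elements of two V8HI operands,
   each operand is viewed as V4SI, masked with 0xffff in every dword so
   that only the even word survives, and the two results are then narrowed
   and concatenated with packusdw; the odd elements are obtained the same
   way after shifting every dword right by 16 instead of masking.  */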
static bool
expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
{
  rtx op, dop0, dop1, t;
  unsigned i, odd, c, s, nelt = d->nelt;
  bool end_perm = false;
  machine_mode half_mode;
  rtx (*gen_and) (rtx, rtx, rtx);
  rtx (*gen_pack) (rtx, rtx, rtx);
  rtx (*gen_shift) (rtx, rtx, rtx);

  if (d->one_operand_p)
    return false;

  switch (d->vmode)
    {
    case E_V4HImode:
      /* Required for "pack".  */
      if (!TARGET_SSE4_1)
	return false;
      c = 0xffff;
      s = 16;
      half_mode = V2SImode;
      gen_and = gen_andv2si3;
      gen_pack = gen_mmx_packusdw;
      gen_shift = gen_lshrv2si3;
      break;
    case E_V8HImode:
      /* Required for "pack".  */
      if (!TARGET_SSE4_1)
	return false;
      c = 0xffff;
      s = 16;
      half_mode = V4SImode;
      gen_and = gen_andv4si3;
      gen_pack = gen_sse4_1_packusdw;
      gen_shift = gen_lshrv4si3;
      break;
    case E_V8QImode:
      /* No check as all instructions are SSE2.  */
      c = 0xff;
      s = 8;
      half_mode = V4HImode;
      gen_and = gen_andv4hi3;
      gen_pack = gen_mmx_packuswb;
      gen_shift = gen_lshrv4hi3;
      break;
    case E_V16QImode:
      /* No check as all instructions are SSE2.  */
      c = 0xff;
      s = 8;
      half_mode = V8HImode;
      gen_and = gen_andv8hi3;
      gen_pack = gen_sse2_packuswb;
      gen_shift = gen_lshrv8hi3;
      break;
    case E_V16HImode:
      if (!TARGET_AVX2)
	return false;
      c = 0xffff;
      s = 16;
      half_mode = V8SImode;
      gen_and = gen_andv8si3;
      gen_pack = gen_avx2_packusdw;
      gen_shift = gen_lshrv8si3;
      end_perm = true;
      break;
    case E_V32QImode:
      if (!TARGET_AVX2)
	return false;
      c = 0xff;
      s = 8;
      half_mode = V16HImode;
      gen_and = gen_andv16hi3;
      gen_pack = gen_avx2_packuswb;
      gen_shift = gen_lshrv16hi3;
      end_perm = true;
      break;
    default:
      /* Only V4HI, V8QI, V8HI, V16QI, V16HI and V32QI modes
	 are more profitable than general shuffles.  */
      return false;
    }

  /* Check that permutation is even or odd.  */
  odd = d->perm[0];
  if (odd > 1)
    return false;

  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != 2 * i + odd)
      return false;

  if (d->testing_p)
    return true;

  dop0 = gen_reg_rtx (half_mode);
  dop1 = gen_reg_rtx (half_mode);
  if (odd == 0)
    {
      t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
      t = force_reg (half_mode, t);
      emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
      emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
    }
  else
    {
      emit_insn (gen_shift (dop0,
			    gen_lowpart (half_mode, d->op0),
			    GEN_INT (s)));
      emit_insn (gen_shift (dop1,
			    gen_lowpart (half_mode, d->op1),
			    GEN_INT (s)));
    }
  /* In AVX2 for 256 bit case we need to permute pack result.  */
  if (TARGET_AVX2 && end_perm)
    {
      op = gen_reg_rtx (d->vmode);
      t = gen_reg_rtx (V4DImode);
      emit_insn (gen_pack (op, dop0, dop1));
      emit_insn (gen_avx2_permv4di_1 (t,
				      gen_lowpart (V4DImode, op),
				      const0_rtx,
				      const2_rtx,
				      const1_rtx,
				      GEN_INT (3)));
      emit_move_insn (d->target, gen_lowpart (d->vmode, t));
    }
  else
    emit_insn (gen_pack (d->target, dop0, dop1));

  return true;
}
/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   and extract-odd permutations of two V64QI operands
   with two "shifts", two "truncs" and one "concat" insn for "odd"
   and two "truncs" and one "concat" insn for "even."
   Have already failed all two instruction sequences.  */
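/* vpmovwb (the V32HI->V32QI truncation used below) keeps the low byte of
   every word, so truncating the operands directly yields the even bytes,
   while shifting each word right by 8 first yields the odd bytes; the two
   32-byte halves are then glued back together with vec_concat.  */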
static bool
expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
{
  rtx t1, t2, t3, t4;
  unsigned i, odd, nelt = d->nelt;

  if (!TARGET_AVX512BW
      || d->one_operand_p
      || d->vmode != V64QImode)
    return false;

  /* Check that permutation is even or odd.  */
  odd = d->perm[0];
  if (odd > 1)
    return false;

  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != 2 * i + odd)
      return false;

  if (d->testing_p)
    return true;

  if (odd)
    {
      t1 = gen_reg_rtx (V32HImode);
      t2 = gen_reg_rtx (V32HImode);
      emit_insn (gen_lshrv32hi3 (t1,
				 gen_lowpart (V32HImode, d->op0),
				 GEN_INT (8)));
      emit_insn (gen_lshrv32hi3 (t2,
				 gen_lowpart (V32HImode, d->op1),
				 GEN_INT (8)));
    }
  else
    {
      t1 = gen_lowpart (V32HImode, d->op0);
      t2 = gen_lowpart (V32HImode, d->op1);
    }

  t3 = gen_reg_rtx (V32QImode);
  t4 = gen_reg_rtx (V32QImode);
  emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
  emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
  emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));

  return true;
}
/* A subroutine of ix86_expand_vec_perm_const_1.  Implement extract-even
   and extract-odd permutations.  */

static bool
expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
{
  rtx t1, t2, t3, t4, t5;

  switch (d->vmode)
    {
    case E_V4DFmode:
      if (d->testing_p)
	break;
      t1 = gen_reg_rtx (V4DFmode);
      t2 = gen_reg_rtx (V4DFmode);

      /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
      emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
      emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));

      /* Now an unpck[lh]pd will produce the result required.  */
      if (odd)
	t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
      else
	t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
      emit_insn (t3);
      break;

    case E_V8SFmode:
      {
	int mask = odd ? 0xdd : 0x88;

	if (d->testing_p)
	  break;
	t1 = gen_reg_rtx (V8SFmode);
	t2 = gen_reg_rtx (V8SFmode);
	t3 = gen_reg_rtx (V8SFmode);

	/* Shuffle within the 128-bit lanes to produce:
	   { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }.  */
	emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
				      GEN_INT (mask)));

	/* Shuffle the lanes around to produce:
	   { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }.  */
	emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
					    GEN_INT (0x3)));

	/* Shuffle within the 128-bit lanes to produce:
	   { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }.  */
	emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));

	/* Shuffle within the 128-bit lanes to produce:
	   { 8 a c e c e 8 a } | { 9 b d f d f 9 b }.  */
	emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));

	/* Shuffle the lanes around to produce:
	   { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }.  */
	emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
					    GEN_INT (0x20)));
      }
      break;

    case E_V2DFmode:
    case E_V4SFmode:
    case E_V2DImode:
    case E_V2SImode:
    case E_V4SImode:
      /* These are always directly implementable by expand_vec_perm_1.  */
      gcc_unreachable ();

    case E_V2SFmode:
      gcc_assert (TARGET_MMX_WITH_SSE);
      /* We have no suitable instructions.  */
      if (d->testing_p)
	return false;
      break;

    case E_V4QImode:
      if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
	return expand_vec_perm_pshufb2 (d);
      else
	{
	  if (d->testing_p)
	    break;
	  /* We need 2*log2(N)-1 operations to achieve odd/even
	     with interleave.  */
	  t1 = gen_reg_rtx (V4QImode);
	  emit_insn (gen_mmx_punpckhbw_low (t1, d->op0, d->op1));
	  emit_insn (gen_mmx_punpcklbw_low (d->target, d->op0, d->op1));
	  if (odd)
	    t2 = gen_mmx_punpckhbw_low (d->target, d->target, t1);
	  else
	    t2 = gen_mmx_punpcklbw_low (d->target, d->target, t1);
	  emit_insn (t2);
	}
      break;

    case E_V4HImode:
      if (TARGET_SSE4_1)
	return expand_vec_perm_even_odd_pack (d);
      else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
	return expand_vec_perm_pshufb2 (d);
      else
	{
	  if (d->testing_p)
	    break;
	  /* We need 2*log2(N)-1 operations to achieve odd/even
	     with interleave.  */
	  t1 = gen_reg_rtx (V4HImode);
	  emit_insn (gen_mmx_punpckhwd (t1, d->op0, d->op1));
	  emit_insn (gen_mmx_punpcklwd (d->target, d->op0, d->op1));
	  if (odd)
	    t2 = gen_mmx_punpckhwd (d->target, d->target, t1);
	  else
	    t2 = gen_mmx_punpcklwd (d->target, d->target, t1);
	  emit_insn (t2);
	}
      break;

    case E_V8HImode:
      if (TARGET_SSE4_1)
	return expand_vec_perm_even_odd_pack (d);
      else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
	return expand_vec_perm_pshufb2 (d);
      else
	{
	  if (d->testing_p)
	    break;
	  /* We need 2*log2(N)-1 operations to achieve odd/even
	     with interleave.  */
	  t1 = gen_reg_rtx (V8HImode);
	  t2 = gen_reg_rtx (V8HImode);
	  emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
	  emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
	  emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
	  emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
	  if (odd)
	    t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
	  else
	    t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
	  emit_insn (t3);
	}
      break;

    case E_V8QImode:
    case E_V16QImode:
      return expand_vec_perm_even_odd_pack (d);

    case E_V16HImode:
    case E_V32QImode:
      return expand_vec_perm_even_odd_pack (d);

    case E_V64QImode:
      return expand_vec_perm_even_odd_trunc (d);

    case E_V4DImode:
      if (!TARGET_AVX2)
	{
	  struct expand_vec_perm_d d_copy = *d;
	  d_copy.vmode = V4DFmode;
	  if (d->testing_p)
	    d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
	  else
	    d_copy.target = gen_reg_rtx (V4DFmode);
	  d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
	  d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
	  if (expand_vec_perm_even_odd_1 (&d_copy, odd))
	    {
	      if (!d->testing_p)
		emit_move_insn (d->target,
				gen_lowpart (V4DImode, d_copy.target));
	      return true;
	    }
	  return false;
	}

      if (d->testing_p)
	break;

      t1 = gen_reg_rtx (V4DImode);
      t2 = gen_reg_rtx (V4DImode);

      /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
      emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
      emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));

      /* Now an vpunpck[lh]qdq will produce the result required.  */
      if (odd)
	t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
      else
	t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
      emit_insn (t3);
      break;

    case E_V8SImode:
      if (!TARGET_AVX2)
	{
	  struct expand_vec_perm_d d_copy = *d;
	  d_copy.vmode = V8SFmode;
	  if (d->testing_p)
	    d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
	  else
	    d_copy.target = gen_reg_rtx (V8SFmode);
	  d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
	  d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
	  if (expand_vec_perm_even_odd_1 (&d_copy, odd))
	    {
	      if (!d->testing_p)
		emit_move_insn (d->target,
				gen_lowpart (V8SImode, d_copy.target));
	      return true;
	    }
	  return false;
	}

      if (d->testing_p)
	break;

      t1 = gen_reg_rtx (V8SImode);
      t2 = gen_reg_rtx (V8SImode);
      t3 = gen_reg_rtx (V4DImode);
      t4 = gen_reg_rtx (V4DImode);
      t5 = gen_reg_rtx (V4DImode);

      /* Shuffle the lanes around into
	 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }.  */
      emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
				    gen_lowpart (V4DImode, d->op1),
				    GEN_INT (0x20)));
      emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
				    gen_lowpart (V4DImode, d->op1),
				    GEN_INT (0x31)));

      /* Swap the 2nd and 3rd position in each lane into
	 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }.  */
      emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
				    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
      emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
				    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));

      /* Now an vpunpck[lh]qdq will produce
	 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }.  */
      if (odd)
	t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
					   gen_lowpart (V4DImode, t2));
      else
	t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
					  gen_lowpart (V4DImode, t2));
      emit_insn (t3);
      emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
      break;

    default:
      gcc_unreachable ();
    }

  return true;
}
/* A subroutine of ix86_expand_vec_perm_const_1.  Pattern match
   extract-even and extract-odd permutations.  */

static bool
expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
{
  unsigned i, odd, nelt = d->nelt;

  odd = d->perm[0];
  if (odd != 0 && odd != 1)
    return false;

  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != 2 * i + odd)
      return false;

  if (d->vmode == E_V32HImode
      && d->testing_p
      && !TARGET_AVX512BW)
    return false;

  return expand_vec_perm_even_odd_1 (d, odd);
}
/* A subroutine of ix86_expand_vec_perm_const_1.  Implement broadcast
   permutations.  We assume that expand_vec_perm_1 has already failed.  */

static bool
expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
{
  unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
  machine_mode vmode = d->vmode;
  rtx (*gen) (rtx, rtx, rtx);
  unsigned char perm2[4];
  rtx op0 = d->op0, dest;
  bool ok;

  switch (vmode)
    {
    case E_V4DFmode:
    case E_V8SFmode:
      /* These are special-cased in sse.md so that we can optionally
	 use the vbroadcast instruction.  They expand to two insns
	 if the input happens to be in a register.  */
      gcc_unreachable ();

    case E_V2DFmode:
    case E_V2SFmode:
    case E_V4SFmode:
    case E_V2DImode:
    case E_V2SImode:
    case E_V4SImode:
      /* These are always implementable using standard shuffle patterns.  */
      gcc_unreachable ();

    case E_V4QImode:
      /* This can be implemented via interleave and pshuflw.  */
      if (d->testing_p)
	return true;

      if (elt >= nelt2)
	{
	  gen = gen_mmx_punpckhbw_low;
	  elt -= nelt2;
	}
      else
	gen = gen_mmx_punpcklbw_low;

      dest = gen_reg_rtx (vmode);
      emit_insn (gen (dest, op0, op0));
      vmode = get_mode_wider_vector (vmode);
      op0 = gen_lowpart (vmode, dest);

      memset (perm2, elt, 2);
      dest = gen_reg_rtx (vmode);
      ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
      gcc_assert (ok);

      emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
      return true;

    case E_V8QImode:
    case E_V4HImode:
      /* This can be implemented via interleave.  We save one insn by
	 stopping once we have promoted to V2SImode and then use pshufd.  */
      if (d->testing_p)
	return true;
      do
	{
	  if (elt >= nelt2)
	    {
	      gen = vmode == V8QImode ? gen_mmx_punpckhbw
				      : gen_mmx_punpckhwd;
	      elt -= nelt2;
	    }
	  else
	    gen = vmode == V8QImode ? gen_mmx_punpcklbw
				    : gen_mmx_punpcklwd;
	  nelt2 /= 2;

	  dest = gen_reg_rtx (vmode);
	  emit_insn (gen (dest, op0, op0));
	  vmode = get_mode_wider_vector (vmode);
	  op0 = gen_lowpart (vmode, dest);
	}
      while (vmode != V2SImode);

      memset (perm2, elt, 2);
      dest = gen_reg_rtx (vmode);
      ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
      gcc_assert (ok);

      emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
      return true;

    case E_V8HImode:
    case E_V16QImode:
      /* These can be implemented via interleave.  We save one insn by
	 stopping once we have promoted to V4SImode and then use pshufd.  */
      if (d->testing_p)
	return true;
      do
	{
	  if (elt >= nelt2)
	    {
	      gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
				       : gen_vec_interleave_highv8hi;
	      elt -= nelt2;
	    }
	  else
	    gen = vmode == V16QImode ? gen_vec_interleave_lowv16qi
				     : gen_vec_interleave_lowv8hi;
	  nelt2 /= 2;

	  dest = gen_reg_rtx (vmode);
	  emit_insn (gen (dest, op0, op0));
	  vmode = get_mode_wider_vector (vmode);
	  op0 = gen_lowpart (vmode, dest);
	}
      while (vmode != V4SImode);

      memset (perm2, elt, 4);
      dest = gen_reg_rtx (vmode);
      ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
      gcc_assert (ok);

      emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
      return true;

    case E_V8HFmode:
    case E_V8BFmode:
      /* This can be implemented via interleave and pshufd.  */
      if (d->testing_p)
	return true;

      rtx (*maybe_gen) (machine_mode, int, rtx, rtx, rtx);
      if (elt >= nelt2)
	{
	  maybe_gen = maybe_gen_vec_interleave_high;
	  elt -= nelt2;
	}
      else
	maybe_gen = maybe_gen_vec_interleave_low;
      nelt2 /= 2;

      dest = gen_reg_rtx (vmode);
      emit_insn (maybe_gen (vmode, 1, dest, op0, op0));

      vmode = V4SImode;
      op0 = gen_lowpart (vmode, dest);

      memset (perm2, elt, 4);
      dest = gen_reg_rtx (vmode);
      ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
      gcc_assert (ok);

      emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
      return true;

    case E_V32QImode:
    case E_V16HImode:
    case E_V8SImode:
    case E_V4DImode:
      /* For AVX2 broadcasts of the first element vpbroadcast* or
	 vpermq should be used by expand_vec_perm_1.  */
      gcc_assert (!TARGET_AVX2 || d->perm[0]);
      return false;

    case E_V64QImode:
      gcc_assert (!TARGET_AVX512BW || d->perm[0]);
      return false;

    case E_V32HImode:
      gcc_assert (!TARGET_AVX512BW);
      return false;

    default:
      gcc_unreachable ();
    }
}
/* A subroutine of ix86_expand_vec_perm_const_1.  Pattern match
   broadcast permutations.  */

static bool
expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
{
  unsigned i, elt, nelt = d->nelt;

  if (!d->one_operand_p)
    return false;

  elt = d->perm[0];
  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != elt)
      return false;

  return expand_vec_perm_broadcast_1 (d);
}
/* Implement arbitrary permutations of two V64QImode operands
   with 2 vperm[it]2w, 2 vpshufb and one vpor instruction.  */
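/* The idea is to let the word-sized two-source permute (vperm[it]2w) do
   the cross-lane movement, once for the even destination bytes and once
   for the odd ones, then use vpshufb only to pick the right byte within
   each word and zero the other positions, so that a final vpor merges the
   two halves.  */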
static bool
expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
{
  if (!TARGET_AVX512BW
      || !(d->vmode == V64QImode))
    return false;

  if (d->testing_p)
    return true;

  struct expand_vec_perm_d ds[2];
  rtx rperm[128], vperm, target0, target1;
  unsigned int i, nelt;
  machine_mode vmode;

  nelt = d->nelt;
  vmode = V64QImode;

  for (i = 0; i < 2; i++)
    {
      ds[i] = *d;
      ds[i].vmode = V32HImode;
      ds[i].nelt = 32;
      ds[i].target = gen_reg_rtx (V32HImode);
      ds[i].op0 = gen_lowpart (V32HImode, d->op0);
      ds[i].op1 = gen_lowpart (V32HImode, d->op1);
    }

  /* Prepare permutations such that the first one takes care of
     putting the even bytes into the right positions or one higher
     positions (ds[0]) and the second one takes care of
     putting the odd bytes into the right positions or one below
     (ds[1]).  */

  for (i = 0; i < nelt; i++)
    {
      ds[i & 1].perm[i / 2] = d->perm[i] / 2;
      if (i & 1)
	{
	  rperm[i] = constm1_rtx;
	  rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
	}
      else
	{
	  rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
	  rperm[i + 64] = constm1_rtx;
	}
    }

  bool ok = expand_vec_perm_1 (&ds[0]);
  gcc_assert (ok);
  ds[0].target = gen_lowpart (V64QImode, ds[0].target);

  ok = expand_vec_perm_1 (&ds[1]);
  gcc_assert (ok);
  ds[1].target = gen_lowpart (V64QImode, ds[1].target);

  vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
  vperm = force_reg (vmode, vperm);
  target0 = gen_reg_rtx (V64QImode);
  emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));

  vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
  vperm = force_reg (vmode, vperm);
  target1 = gen_reg_rtx (V64QImode);
  emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));

  emit_insn (gen_iorv64qi3 (d->target, target0, target1));
  return true;
}
/* Implement arbitrary permutation of two V32QImode and V16HImode operands
   with 4 vpshufb insns, 2 vpermq and 3 vpor.  We should have already failed
   all the shorter instruction sequences.  */
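/* Each operand needs one vpshufb for the bytes that stay in their own
   128-bit lane and one vpshufb (followed by a lane-swapping vpermq) for
   the bytes that must cross lanes; the partial results are then merged
   with vpor.  */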
static bool
expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
{
  rtx rperm[4][32], vperm, l[2], h[2], op, m128;
  unsigned int i, nelt, eltsz;
  bool used[4];

  if (!TARGET_AVX2
      || d->one_operand_p
      || (d->vmode != V32QImode && d->vmode != V16HImode))
    return false;

  if (d->testing_p)
    return true;

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate 4 permutation masks.  If the required element is within
     the same lane, it is shuffled in.  If the required element is from the
     other lane, force a zero by setting bit 7 in the permutation mask.
     In the other mask the mask has non-negative elements if the element
     is requested from the other lane, but also moved to the other lane,
     so that the result of vpshufb can have the two V2TImode halves
     swapped.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < 32; ++i)
    {
      rperm[0][i] = m128;
      rperm[1][i] = m128;
      rperm[2][i] = m128;
      rperm[3][i] = m128;
    }
  used[0] = false;
  used[1] = false;
  used[2] = false;
  used[3] = false;
  for (i = 0; i < nelt; ++i)
    {
      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
      unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
      unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);

      for (j = 0; j < eltsz; ++j)
	rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
      used[which] = true;
    }

  for (i = 0; i < 2; ++i)
    {
      if (!used[2 * i + 1])
	{
	  h[i] = NULL_RTX;
	  continue;
	}
      vperm = gen_rtx_CONST_VECTOR (V32QImode,
				    gen_rtvec_v (32, rperm[2 * i + 1]));
      vperm = force_reg (V32QImode, vperm);
      h[i] = gen_reg_rtx (V32QImode);
      op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
      emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
    }

  /* Swap the 128-bit lanes of h[X].  */
  for (i = 0; i < 2; ++i)
    {
      if (h[i] == NULL_RTX)
	continue;
      op = gen_reg_rtx (V4DImode);
      emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
				      const2_rtx, GEN_INT (3), const0_rtx,
				      const1_rtx));
      h[i] = gen_lowpart (V32QImode, op);
    }

  for (i = 0; i < 2; ++i)
    {
      if (!used[2 * i])
	{
	  l[i] = NULL_RTX;
	  continue;
	}
      vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
      vperm = force_reg (V32QImode, vperm);
      l[i] = gen_reg_rtx (V32QImode);
      op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
      emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
    }

  for (i = 0; i < 2; ++i)
    {
      if (h[i] && l[i])
	{
	  op = gen_reg_rtx (V32QImode);
	  emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
	  l[i] = op;
	}
      else if (h[i])
	l[i] = h[i];
    }

  gcc_assert (l[0] && l[1]);
  op = d->target;
  if (d->vmode != V32QImode)
    op = gen_reg_rtx (V32QImode);
  emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
  if (op != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}
/* The guts of ix86_vectorize_vec_perm_const.  With all of the interface bits
   taken care of, perform the expansion in D and return true on success.  */
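/* The strategy below is simply to try the available expanders roughly in
   order of increasing sequence length: single-insn forms first, then two,
   three, four and longer combinations, returning as soon as one of them
   can implement the requested permutation.  */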
static bool
ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
{
  /* Try a single instruction expansion.  */
  if (expand_vec_perm_1 (d))
    return true;

  /* Try sequences of two instructions.  */

  if (expand_vec_perm_pshuflw_pshufhw (d))
    return true;

  if (expand_vec_perm_palignr (d, false))
    return true;

  if (expand_vec_perm_interleave2 (d))
    return true;

  if (expand_vec_perm_broadcast (d))
    return true;

  if (expand_vec_perm_vpermq_perm_1 (d))
    return true;

  if (expand_vec_perm_vperm2f128 (d))
    return true;

  if (expand_vec_perm_pblendv (d))
    return true;

  if (expand_vec_perm_2perm_interleave (d, true))
    return true;

  if (expand_vec_perm_2perm_pblendv (d, true))
    return true;

  if (expand_vec_perm_shufps_shufps (d))
    return true;

  /* Try sequences of three instructions.  */

  if (expand_vec_perm_even_odd_pack (d))
    return true;

  if (expand_vec_perm_2vperm2f128_vshuf (d))
    return true;

  if (expand_vec_perm_pshufb2 (d))
    return true;

  if (expand_vec_perm_pslldq_psrldq_por (d, false))
    return true;

  if (expand_vec_perm_interleave3 (d))
    return true;

  if (expand_vec_perm_vperm2f128_vblend (d))
    return true;

  if (expand_vec_perm_2perm_interleave (d, false))
    return true;

  if (expand_vec_perm_2perm_pblendv (d, false))
    return true;

  /* Try sequences of four instructions.  */

  if (expand_vec_perm_even_odd_trunc (d))
    return true;
  if (expand_vec_perm_vpshufb2_vpermq (d))
    return true;

  if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
    return true;

  if (expand_vec_perm_vpermt2_vpshub2 (d))
    return true;

  /* ??? Look for narrow permutations whose element orderings would
     allow the promotion to a wider mode.  */

  /* ??? Look for sequences of interleave or a wider permute that place
     the data into the correct lanes for a half-vector shuffle like
     pshuf[lh]w or vpermilps.  */

  /* ??? Look for sequences of interleave that produce the desired results.
     The combinatorics of punpck[lh] get pretty ugly... */

  if (expand_vec_perm_even_odd (d))
    return true;

  /* Generate four or five instructions.  */
  if (expand_vec_perm_pslldq_psrldq_por (d, true))
    return true;

  /* Even longer sequences.  */
  if (expand_vec_perm_vpshufb4_vpermq2 (d))
    return true;

  /* See if we can get the same permutation in a different vector integer
     mode.  */
  struct expand_vec_perm_d nd;
  if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
    {
      if (!d->testing_p)
	emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
      return true;
    }

  /* Even longer, including recursion to ix86_expand_vec_perm_const_1.  */
  if (expand_vec_perm2_vperm2f128_vblend (d))
    return true;

  return false;
}
/* If a permutation only uses one operand, make it clear.  Returns true
   if the permutation references both operands.  */
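/* For example, with nelt == 4 and identical operands, the selector
   { 0, 5, 2, 7 } references both operand slots but is folded below to
   { 0, 1, 2, 3 } on the single input; the function still returns true
   because the original selector named both operands.  */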
static bool
canonicalize_perm (struct expand_vec_perm_d *d)
{
  int i, which, nelt = d->nelt;

  for (i = which = 0; i < nelt; ++i)
    which |= (d->perm[i] < nelt ? 1 : 2);

  d->one_operand_p = true;
  switch (which)
    {
    default:
      gcc_unreachable ();

    case 3:
      if (!rtx_equal_p (d->op0, d->op1))
        {
          d->one_operand_p = false;
          break;
        }
      /* The elements of PERM do not suggest that only the first operand
         is used, but both operands are identical.  Allow easier matching
         of the permutation by folding the permutation into the single
         input vector.  */
      /* FALLTHRU */

    case 2:
      for (i = 0; i < nelt; ++i)
        d->perm[i] &= nelt - 1;
      d->op0 = d->op1;
      break;

    case 1:
      d->op1 = d->op0;
      break;
    }

  return (which == 3);
}
/* Implement TARGET_VECTORIZE_VEC_PERM_CONST.  */

static bool
ix86_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
                               rtx target, rtx op0, rtx op1,
                               const vec_perm_indices &sel)
{
  if (vmode != op_mode)
    return false;

  struct expand_vec_perm_d d;
  unsigned char perm[MAX_VECT_LEN];
  unsigned int i, nelt, which;
  bool two_args;

  /* For HF mode vector, convert it to HI using subreg.  */
  if (GET_MODE_INNER (vmode) == HFmode)
    {
      machine_mode orig_mode = vmode;
      vmode = mode_for_vector (HImode,
                               GET_MODE_NUNITS (vmode)).require ();
      if (target)
        target = lowpart_subreg (vmode, target, orig_mode);
      if (op0)
        op0 = lowpart_subreg (vmode, op0, orig_mode);
      if (op1)
        op1 = lowpart_subreg (vmode, op1, orig_mode);
    }

  d.target = target;
  d.op0 = op0;
  d.op1 = op1;

  d.vmode = vmode;
  gcc_assert (VECTOR_MODE_P (d.vmode));
  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
  d.testing_p = !target;

  gcc_assert (sel.length () == nelt);
  gcc_checking_assert (sizeof (d.perm) == sizeof (perm));

  /* Given sufficient ISA support we can just return true here
     for selected vector modes.  */
  switch (d.vmode)
    {
    case E_V16SFmode:
    case E_V16SImode:
    case E_V8DImode:
    case E_V8DFmode:
      if (!TARGET_AVX512F)
        return false;
      /* All implementable with a single vperm[it]2 insn.  */
      if (d.testing_p)
        return true;
      break;
    case E_V32HImode:
      if (!TARGET_AVX512F)
        return false;
      if (d.testing_p && TARGET_AVX512BW)
        /* All implementable with a single vperm[it]2 insn.  */
        return true;
      break;
    case E_V64QImode:
      if (!TARGET_AVX512F)
        return false;
      if (d.testing_p && TARGET_AVX512BW)
        /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn.  */
        return true;
      break;
    case E_V8SImode:
    case E_V8SFmode:
    case E_V4DFmode:
    case E_V4DImode:
      if (d.testing_p && TARGET_AVX512VL)
        /* All implementable with a single vperm[it]2 insn.  */
        return true;
      break;
    case E_V16HImode:
      if (d.testing_p && TARGET_AVX2)
        /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns.  */
        return true;
      break;
    case E_V32QImode:
      if (d.testing_p && TARGET_AVX2)
        /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns.  */
        return true;
      break;
    case E_V8HImode:
    case E_V16QImode:
      /* Fall through.  */
    case E_V4SImode:
    case E_V4SFmode:
      /* All implementable with a single vpperm insn.  */
      if (d.testing_p && TARGET_XOP)
        return true;
      /* All implementable with 2 pshufb + 1 ior.  */
      if (d.testing_p && TARGET_SSSE3)
        return true;
      break;
    case E_V2SFmode:
    case E_V2SImode:
    case E_V4HImode:
    case E_V8QImode:
      if (!TARGET_MMX_WITH_SSE)
        return false;
      break;
    case E_V2HImode:
      /* All implementable with *punpckwd.  */
      if (d.testing_p)
        return true;
      break;
    case E_V4QImode:
      break;
    case E_V2DImode:
    case E_V2DFmode:
      /* All implementable with shufpd or unpck[lh]pd.  */
      if (d.testing_p)
        return true;
      break;
    default:
      return false;
    }

  for (i = which = 0; i < nelt; ++i)
    {
      unsigned char e = sel[i];
      gcc_assert (e < 2 * nelt);
      d.perm[i] = e;
      perm[i] = e;
      which |= (e < nelt ? 1 : 2);
    }

  if (d.testing_p)
    {
      /* For all elements from second vector, fold the elements to first.  */
      if (which == 2)
        for (i = 0; i < nelt; ++i)
          d.perm[i] -= nelt;

      /* Check whether the mask can be applied to the vector type.  */
      d.one_operand_p = (which != 3);

      /* Implementable with shufps, pshufd or pshuflw.  */
      if (d.one_operand_p
          && (d.vmode == V4SFmode || d.vmode == V2SFmode
              || d.vmode == V4SImode || d.vmode == V2SImode
              || d.vmode == V4HImode || d.vmode == V2HImode))
        return true;

      /* Otherwise we have to go through the motions and see if we can
         figure out how to generate the requested permutation.  */
      d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
      d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
      if (!d.one_operand_p)
        d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);

      start_sequence ();
      bool ret = ix86_expand_vec_perm_const_1 (&d);
      end_sequence ();

      return ret;
    }

  two_args = canonicalize_perm (&d);

  /* If one of the operands is a zero vector, try to match pmovzx.  */
  if (two_args && (d.op0 == CONST0_RTX (vmode) || d.op1 == CONST0_RTX (vmode)))
    {
      struct expand_vec_perm_d dzero = d;
      if (d.op0 == CONST0_RTX (vmode))
        {
          d.op1 = dzero.op1 = force_reg (vmode, d.op1);
          std::swap (dzero.op0, dzero.op1);
          for (i = 0; i < nelt; ++i)
            dzero.perm[i] ^= nelt;
        }
      else
        d.op0 = dzero.op0 = force_reg (vmode, d.op0);

      if (expand_vselect_vconcat (dzero.target, dzero.op0, dzero.op1,
                                  dzero.perm, nelt, dzero.testing_p))
        return true;
    }

  /* Force operands into registers.  */
  rtx nop0 = force_reg (vmode, d.op0);
  if (d.op0 == d.op1)
    d.op1 = nop0;
  d.op0 = nop0;
  d.op1 = force_reg (vmode, d.op1);

  if (ix86_expand_vec_perm_const_1 (&d))
    return true;

  /* If the selector says both arguments are needed, but the operands are the
     same, the above tried to expand with one_operand_p and flattened selector.
     If that didn't work, retry without one_operand_p; we succeeded with that
     during testing.  */
  if (two_args && d.one_operand_p)
    {
      d.one_operand_p = false;
      memcpy (d.perm, perm, sizeof (perm));
      return ix86_expand_vec_perm_const_1 (&d);
    }

  return false;
}
void
ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
{
  struct expand_vec_perm_d d;
  unsigned i, nelt;

  d.target = targ;
  d.op0 = op0;
  d.op1 = op1;
  d.vmode = GET_MODE (targ);
  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
  d.one_operand_p = false;
  d.testing_p = false;

  for (i = 0; i < nelt; ++i)
    d.perm[i] = i * 2 + odd;

  /* We'll either be able to implement the permutation directly...  */
  if (expand_vec_perm_1 (&d))
    return;

  /* ... or we use the special-case patterns.  */
  expand_vec_perm_even_odd_1 (&d, odd);
}
void
ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
{
  struct expand_vec_perm_d d;
  unsigned i, nelt, base;
  bool ok;

  d.target = targ;
  d.op0 = op0;
  d.op1 = op1;
  d.vmode = GET_MODE (targ);
  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
  d.one_operand_p = false;
  d.testing_p = false;

  base = high_p ? nelt / 2 : 0;
  for (i = 0; i < nelt / 2; ++i)
    {
      d.perm[i * 2] = i + base;
      d.perm[i * 2 + 1] = i + base + nelt;
    }

  /* Note that for AVX this isn't one instruction.  */
  ok = ix86_expand_vec_perm_const_1 (&d);
  gcc_assert (ok);
}
/* This function is similar to ix86_expand_vecop_qihi,
   but optimized under AVX512BW by using vpmovwb.
   For example, optimize vector MUL generation like

   vpmovzxbw ymm2, xmm0
   vpmovzxbw ymm3, xmm1
   vpmullw   ymm4, ymm2, ymm3
   vpmovwb   xmm0, ymm4

   it takes fewer instructions than ix86_expand_vecop_qihi.
   Return true if successful.  */
static bool
ix86_expand_vecop_qihi2 (enum rtx_code code, rtx dest, rtx op1, rtx op2)
{
  machine_mode himode, qimode = GET_MODE (dest);
  rtx hop1, hop2, hdest;
  rtx (*gen_truncate) (rtx, rtx);
  bool uns_p = (code == ASHIFTRT) ? false : true;

  /* There are no V64HImode instructions.  */
  if (qimode == V64QImode)
    return false;

  /* vpmovwb only available under AVX512BW.  */
  if (!TARGET_AVX512BW)
    return false;
  if ((qimode == V8QImode || qimode == V16QImode)
      && !TARGET_AVX512VL)
    return false;
  /* Do not generate ymm/zmm instructions when
     target prefers 128/256 bit vector width.  */
  if ((qimode == V16QImode && TARGET_PREFER_AVX128)
      || (qimode == V32QImode && TARGET_PREFER_AVX256))
    return false;

  switch (qimode)
    {
    case E_V8QImode:
      himode = V8HImode;
      gen_truncate = gen_truncv8hiv8qi2;
      break;
    case E_V16QImode:
      himode = V16HImode;
      gen_truncate = gen_truncv16hiv16qi2;
      break;
    case E_V32QImode:
      himode = V32HImode;
      gen_truncate = gen_truncv32hiv32qi2;
      break;
    default:
      gcc_unreachable ();
    }

  hop1 = gen_reg_rtx (himode);
  hop2 = gen_reg_rtx (himode);
  hdest = gen_reg_rtx (himode);
  emit_insn (gen_extend_insn (hop1, op1, himode, qimode, uns_p));
  emit_insn (gen_extend_insn (hop2, op2, himode, qimode, uns_p));
  emit_insn (gen_rtx_SET (hdest, simplify_gen_binary (code, himode,
                                                      hop1, hop2)));
  emit_insn (gen_truncate (dest, hdest));
  return true;
}
/* Expand a vector operation shift by constant for a V*QImode in terms of the
   same operation on V*HImode.  Return true if successful.  */
static bool
ix86_expand_vec_shift_qihi_constant (enum rtx_code code,
                                     rtx dest, rtx op1, rtx op2)
{
  machine_mode qimode, himode;
  HOST_WIDE_INT and_constant, xor_constant;
  HOST_WIDE_INT shift_amount;
  rtx vec_const_and, vec_const_xor;
  rtx tmp, op1_subreg;
  rtx (*gen_shift) (rtx, rtx, rtx);
  rtx (*gen_and) (rtx, rtx, rtx);
  rtx (*gen_xor) (rtx, rtx, rtx);
  rtx (*gen_sub) (rtx, rtx, rtx);

  /* Only optimize shift by constant.  */
  if (!CONST_INT_P (op2))
    return false;

  qimode = GET_MODE (dest);
  shift_amount = INTVAL (op2);
  /* Do nothing when the shift amount is greater than or equal to 8.  */
  if (shift_amount > 7)
    return false;

  gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
  /* Record sign bit.  */
  xor_constant = 1 << (8 - shift_amount - 1);

  /* Zero upper/lower bits shift from left/right element.  */
  and_constant
    = (code == ASHIFT ? 256 - (1 << shift_amount)
       : (1 << (8 - shift_amount)) - 1);
  switch (qimode)
    {
    case E_V16QImode:
      himode = V8HImode;
      gen_shift =
        ((code == ASHIFT)
         ? gen_ashlv8hi3
         : (code == ASHIFTRT) ? gen_ashrv8hi3 : gen_lshrv8hi3);
      gen_and = gen_andv16qi3;
      gen_xor = gen_xorv16qi3;
      gen_sub = gen_subv16qi3;
      break;
    case E_V32QImode:
      himode = V16HImode;
      gen_shift =
        ((code == ASHIFT)
         ? gen_ashlv16hi3
         : (code == ASHIFTRT) ? gen_ashrv16hi3 : gen_lshrv16hi3);
      gen_and = gen_andv32qi3;
      gen_xor = gen_xorv32qi3;
      gen_sub = gen_subv32qi3;
      break;
    case E_V64QImode:
      himode = V32HImode;
      gen_shift =
        ((code == ASHIFT)
         ? gen_ashlv32hi3
         : (code == ASHIFTRT) ? gen_ashrv32hi3 : gen_lshrv32hi3);
      gen_and = gen_andv64qi3;
      gen_xor = gen_xorv64qi3;
      gen_sub = gen_subv64qi3;
      break;
    default:
      gcc_unreachable ();
    }

  tmp = gen_reg_rtx (himode);
  vec_const_and = gen_reg_rtx (qimode);
  op1_subreg = lowpart_subreg (himode, op1, qimode);

  /* For ASHIFT and LSHIFTRT, perform operation like
     vpsllw/vpsrlw $shift_amount, %op1, %dest.
     vpand %vec_const_and, %dest.  */
  emit_insn (gen_shift (tmp, op1_subreg, op2));
  emit_move_insn (dest, simplify_gen_subreg (qimode, tmp, himode, 0));
  emit_move_insn (vec_const_and,
                  ix86_build_const_vector (qimode, true,
                                           gen_int_mode (and_constant, QImode)));
  emit_insn (gen_and (dest, dest, vec_const_and));
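  /* The xor/sub pair emitted below is the usual (x ^ m) - m sign-extension
     trick: with m == xor_constant == 1 << (7 - shift_amount), flipping the
     byte's copied sign bit and then subtracting m propagates that sign bit
     through the positions the logical shift filled with zeros.  */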
  /* For ASHIFTRT, perform extra operation like
     vpxor %vec_const_xor, %dest, %dest
     vpsubb %vec_const_xor, %dest, %dest  */
  if (code == ASHIFTRT)
    {
      vec_const_xor = gen_reg_rtx (qimode);
      emit_move_insn (vec_const_xor,
                      ix86_build_const_vector (qimode, true,
                                               gen_int_mode (xor_constant, QImode)));
      emit_insn (gen_xor (dest, dest, vec_const_xor));
      emit_insn (gen_sub (dest, dest, vec_const_xor));
    }

  return true;
}
/* Expand a vector operation CODE for a partial (4- or 8-byte) V*QImode
   vector in terms of the same operation on V8HImode.  */

void
ix86_expand_vecop_qihi_partial (enum rtx_code code, rtx dest, rtx op1, rtx op2)
{
  machine_mode qimode = GET_MODE (dest);
  rtx qop1, qop2, hop1, hop2, qdest, hres;
  bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
  bool uns_p = true;

  switch (qimode)
    {
    case E_V4QImode:
    case E_V8QImode:
      break;
    default:
      gcc_unreachable ();
    }

  qop1 = lowpart_subreg (V16QImode, force_reg (qimode, op1), qimode);

  if (op2vec)
    qop2 = lowpart_subreg (V16QImode, force_reg (qimode, op2), qimode);
  else
    qop2 = op2;

  switch (code)
    {
    case MULT:
      gcc_assert (op2vec);
      /* Unpack data such that we've got a source byte in each low byte of
         each word.  We don't care what goes into the high byte of each word.
         Rather than trying to get zero in there, most convenient is to let
         it be a copy of the low byte.  */
      hop1 = copy_to_reg (qop1);
      hop2 = copy_to_reg (qop2);
      emit_insn (gen_vec_interleave_lowv16qi (hop1, hop1, hop1));
      emit_insn (gen_vec_interleave_lowv16qi (hop2, hop2, hop2));
      break;

    case ASHIFTRT:
      uns_p = false;
      /* FALLTHRU */
    case ASHIFT:
    case LSHIFTRT:
      hop1 = gen_reg_rtx (V8HImode);
      ix86_expand_sse_unpack (hop1, qop1, uns_p, false);
      /* vashr/vlshr/vashl  */
      if (op2vec)
        {
          hop2 = gen_reg_rtx (V8HImode);
          ix86_expand_sse_unpack (hop2, qop2, uns_p, false);
        }
      else
        hop2 = qop2;
      break;

    default:
      gcc_unreachable ();
    }

  if (code != MULT && op2vec)
    {
      /* Expand vashr/vlshr/vashl.  */
      hres = gen_reg_rtx (V8HImode);
      emit_insn (gen_rtx_SET (hres,
                              simplify_gen_binary (code, V8HImode,
                                                   hop1, hop2)));
    }
  else
    /* Expand mult/ashr/lshr/ashl.  */
    hres = expand_simple_binop (V8HImode, code, hop1, hop2,
                                NULL_RTX, 1, OPTAB_DIRECT);

  if (TARGET_AVX512BW && TARGET_AVX512VL)
    {
      if (qimode == V8QImode)
        qdest = dest;
      else
        qdest = gen_reg_rtx (V8QImode);

      emit_insn (gen_truncv8hiv8qi2 (qdest, hres));
    }
  else
    {
      struct expand_vec_perm_d d;
      rtx qres = gen_lowpart (V16QImode, hres);
      bool ok;
      int i;

      qdest = gen_reg_rtx (V16QImode);

      /* Merge the data back into the right place.  */
      d.target = qdest;
      d.op0 = qres;
      d.op1 = qres;
      d.vmode = V16QImode;
      d.nelt = 16;
      d.one_operand_p = false;
      d.testing_p = false;

      for (i = 0; i < d.nelt; ++i)
        d.perm[i] = i * 2;

      ok = ix86_expand_vec_perm_const_1 (&d);
      gcc_assert (ok);
    }

  if (qdest != dest)
    emit_move_insn (dest, gen_lowpart (qimode, qdest));
}
/* Expand a vector operation CODE for a V*QImode in terms of the
   same operation on V*HImode.  */

void
ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
{
  machine_mode qimode = GET_MODE (dest);
  machine_mode himode;
  rtx (*gen_il) (rtx, rtx, rtx);
  rtx (*gen_ih) (rtx, rtx, rtx);
  rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
  bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
  struct expand_vec_perm_d d;
  bool full_interleave = true;
  bool uns_p = true;
  bool ok;
  int i;

  if (CONST_INT_P (op2)
      && (code == ASHIFT || code == LSHIFTRT || code == ASHIFTRT)
      && ix86_expand_vec_shift_qihi_constant (code, dest, op1, op2))
    return;

  if (TARGET_AVX512BW
      && VECTOR_MODE_P (GET_MODE (op2))
      && ix86_expand_vecop_qihi2 (code, dest, op1, op2))
    return;

  switch (qimode)
    {
    case E_V16QImode:
      himode = V8HImode;
      break;
    case E_V32QImode:
      himode = V16HImode;
      break;
    case E_V64QImode:
      himode = V32HImode;
      break;
    default:
      gcc_unreachable ();
    }

  switch (code)
    {
    case MULT:
      gcc_assert (op2vec);
      /* Unpack data such that we've got a source byte in each low byte of
         each word.  We don't care what goes into the high byte of each word.
         Rather than trying to get zero in there, most convenient is to let
         it be a copy of the low byte.  */
      switch (qimode)
        {
        case E_V16QImode:
          gen_il = gen_vec_interleave_lowv16qi;
          gen_ih = gen_vec_interleave_highv16qi;
          break;
        case E_V32QImode:
          gen_il = gen_avx2_interleave_lowv32qi;
          gen_ih = gen_avx2_interleave_highv32qi;
          full_interleave = false;
          break;
        case E_V64QImode:
          gen_il = gen_avx512bw_interleave_lowv64qi;
          gen_ih = gen_avx512bw_interleave_highv64qi;
          full_interleave = false;
          break;
        default:
          gcc_unreachable ();
        }

      op2_l = gen_reg_rtx (qimode);
      op2_h = gen_reg_rtx (qimode);
      emit_insn (gen_il (op2_l, op2, op2));
      emit_insn (gen_ih (op2_h, op2, op2));

      op1_l = gen_reg_rtx (qimode);
      op1_h = gen_reg_rtx (qimode);
      emit_insn (gen_il (op1_l, op1, op1));
      emit_insn (gen_ih (op1_h, op1, op1));
      break;

    case ASHIFTRT:
      uns_p = false;
      /* FALLTHRU */
    case ASHIFT:
    case LSHIFTRT:
      op1_l = gen_reg_rtx (himode);
      op1_h = gen_reg_rtx (himode);
      ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
      ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
      /* vashr/vlshr/vashl  */
      if (op2vec)
        {
          rtx tmp = force_reg (qimode, op2);
          op2_l = gen_reg_rtx (himode);
          op2_h = gen_reg_rtx (himode);
          ix86_expand_sse_unpack (op2_l, tmp, uns_p, false);
          ix86_expand_sse_unpack (op2_h, tmp, uns_p, true);
        }
      else
        op2_l = op2_h = op2;
      break;

    default:
      gcc_unreachable ();
    }

  if (code != MULT && op2vec)
    {
      /* Expand vashr/vlshr/vashl.  */
      res_l = gen_reg_rtx (himode);
      res_h = gen_reg_rtx (himode);
      emit_insn (gen_rtx_SET (res_l,
                              simplify_gen_binary (code, himode,
                                                   op1_l, op2_l)));
      emit_insn (gen_rtx_SET (res_h,
                              simplify_gen_binary (code, himode,
                                                   op1_h, op2_h)));
    }
  else
    {
      /* Expand mult/ashr/lshr/ashl.  */
      res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
                                   1, OPTAB_DIRECT);
      res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
                                   1, OPTAB_DIRECT);
    }

  gcc_assert (res_l && res_h);

  /* Merge the data back into the right place.  */
  d.target = dest;
  d.op0 = gen_lowpart (qimode, res_l);
  d.op1 = gen_lowpart (qimode, res_h);
  d.vmode = qimode;
  d.nelt = GET_MODE_NUNITS (qimode);
  d.one_operand_p = false;
  d.testing_p = false;

  if (full_interleave)
    {
      /* We used the full interleave, the desired
         results are in the even elements.  */
      for (i = 0; i < d.nelt; ++i)
        d.perm[i] = i * 2;
    }
  else
    {
      /* For AVX, the interleave used above was not cross-lane.  So the
         extraction is evens but with the second and third quarter swapped.
         Happily, that is even one insn shorter than even extraction.
         For AVX512BW we have 4 lanes.  We extract evens from within a lane,
         always first from the first and then from the second source operand,
         the index bits above the low 4 bits remain the same.
         Thus, for d.nelt == 32 we want permutation
         0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
         and for d.nelt == 64 we want permutation
         0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
         32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126.  */
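      /* For instance, with d.nelt == 32 the formula below maps i == 9 to
         ((9 * 2) & 14) + 32 + 0 == 34, and i == 16 to 0 + 0 + 16 == 16,
         matching the table above.  */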
      for (i = 0; i < d.nelt; ++i)
        d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
    }

  ok = ix86_expand_vec_perm_const_1 (&d);
  gcc_assert (ok);
}
/* Helper function of ix86_expand_mul_widen_evenodd.  Return true
   if op is CONST_VECTOR with all odd elements equal to their
   preceding element.  */

static bool
const_vector_equal_evenodd_p (rtx op)
{
  machine_mode mode = GET_MODE (op);
  int i, nunits = GET_MODE_NUNITS (mode);
  if (GET_CODE (op) != CONST_VECTOR
      || nunits != CONST_VECTOR_NUNITS (op))
    return false;
  for (i = 0; i < nunits; i += 2)
    if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
      return false;
  return true;
}
void
ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
                               bool uns_p, bool odd_p)
{
  machine_mode mode = GET_MODE (op1);
  machine_mode wmode = GET_MODE (dest);
  rtx x;
  rtx orig_op1 = op1, orig_op2 = op2;

  if (!nonimmediate_operand (op1, mode))
    op1 = force_reg (mode, op1);
  if (!nonimmediate_operand (op2, mode))
    op2 = force_reg (mode, op2);

  /* We only play even/odd games with vectors of SImode.  */
  gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);

  /* If we're looking for the odd results, shift those members down to
     the even slots.  For some cpus this is faster than a PSHUFD.  */
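  /* The widening-multiply patterns used below only read the even SImode
     elements, so a logical right shift of each wide element by 32 bits is
     enough to move the odd elements into the even slots.  */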
  if (odd_p)
    {
      /* For XOP use vpmacsdqh, but only for smult, as it is only
         signed.  */
      if (TARGET_XOP && mode == V4SImode && !uns_p)
        {
          x = force_reg (wmode, CONST0_RTX (wmode));
          emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
          return;
        }

      x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
      if (!const_vector_equal_evenodd_p (orig_op1))
        op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
                            x, NULL, 1, OPTAB_DIRECT);
      if (!const_vector_equal_evenodd_p (orig_op2))
        op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
                            x, NULL, 1, OPTAB_DIRECT);
      op1 = gen_lowpart (mode, op1);
      op2 = gen_lowpart (mode, op2);
    }

  if (mode == V16SImode)
    {
      if (uns_p)
        x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
      else
        x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
    }
  else if (mode == V8SImode)
    {
      if (uns_p)
        x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
      else
        x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
    }
  else if (uns_p)
    x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
  else if (TARGET_SSE4_1)
    x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
  else
    {
      rtx s1, s2, t0, t1, t2;

      /* The easiest way to implement this without PMULDQ is to go through
         the motions as if we are performing a full 64-bit multiply.  With
         the exception that we need to do less shuffling of the elements.  */

      /* Compute the sign-extension, aka highparts, of the two operands.  */
      s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
                                op1, pc_rtx, pc_rtx);
      s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
                                op2, pc_rtx, pc_rtx);

      /* Multiply LO(A) * HI(B), and vice-versa.  */
      t1 = gen_reg_rtx (wmode);
      t2 = gen_reg_rtx (wmode);
      emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
      emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));

      /* Multiply LO(A) * LO(B).  */
      t0 = gen_reg_rtx (wmode);
      emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));

      /* Combine and shift the highparts into place.  */
      t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
      t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
                         1, OPTAB_DIRECT);

      /* Combine high and low parts.  */
      force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
      return;
    }
  emit_insn (x);
}
void
ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
                            bool uns_p, bool high_p)
{
  machine_mode wmode = GET_MODE (dest);
  machine_mode mode = GET_MODE (op1);
  rtx t1, t2, t3, t4, mask;

  switch (mode)
    {
    case E_V4SImode:
      t1 = gen_reg_rtx (mode);
      t2 = gen_reg_rtx (mode);
      if (TARGET_XOP && !uns_p)
        {
          /* With XOP, we have pmacsdqh, aka mul_widen_odd.  In this case,
             shuffle the elements once so that all elements are in the right
             place for immediate use: { A C B D }.  */
          emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
                                        const1_rtx, GEN_INT (3)));
          emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
                                        const1_rtx, GEN_INT (3)));
        }
      else
        {
          /* Put the elements into place for the multiply.  */
          ix86_expand_vec_interleave (t1, op1, op1, high_p);
          ix86_expand_vec_interleave (t2, op2, op2, high_p);
          high_p = false;
        }
      ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
      break;

    case E_V8SImode:
      /* Shuffle the elements between the lanes.  After this we
         have { A B E F | C D G H } for each operand.  */
      t1 = gen_reg_rtx (V4DImode);
      t2 = gen_reg_rtx (V4DImode);
      emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
                                      const0_rtx, const2_rtx,
                                      const1_rtx, GEN_INT (3)));
      emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
                                      const0_rtx, const2_rtx,
                                      const1_rtx, GEN_INT (3)));

      /* Shuffle the elements within the lanes.  After this we
         have { A A B B | C C D D } or { E E F F | G G H H }.  */
      t3 = gen_reg_rtx (V8SImode);
      t4 = gen_reg_rtx (V8SImode);
      mask = GEN_INT (high_p
                      ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
                      : 0 + (0 << 2) + (1 << 4) + (1 << 6));
      emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
      emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));

      ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
      break;

    case E_V8HImode:
    case E_V16HImode:
      t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
                         uns_p, OPTAB_DIRECT);
      t2 = expand_binop (mode,
                         uns_p ? umul_highpart_optab : smul_highpart_optab,
                         op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
      gcc_assert (t1 && t2);

      t3 = gen_reg_rtx (mode);
      ix86_expand_vec_interleave (t3, t1, t2, high_p);
      emit_move_insn (dest, gen_lowpart (wmode, t3));
      break;

    case E_V16QImode:
    case E_V32QImode:
    case E_V32HImode:
    case E_V64QImode:
      t1 = gen_reg_rtx (wmode);
      t2 = gen_reg_rtx (wmode);
      ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
      ix86_expand_sse_unpack (t2, op2, uns_p, high_p);

      emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
      break;

    default:
      gcc_unreachable ();
    }
}
void
ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
{
  rtx res_1, res_2, res_3, res_4;

  res_1 = gen_reg_rtx (V4SImode);
  res_2 = gen_reg_rtx (V4SImode);
  res_3 = gen_reg_rtx (V2DImode);
  res_4 = gen_reg_rtx (V2DImode);
  ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
  ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);

  /* Move the results in element 2 down to element 1; we don't care
     what goes in elements 2 and 3.  Then we can merge the parts
     back together with an interleave.

     Note that two other sequences were tried:
     (1) Use interleaves at the start instead of psrldq, which allows
     us to use a single shufps to merge things back at the end.
     (2) Use shufps here to combine the two vectors, then pshufd to
     put the elements in the correct order.
     In both cases the cost of the reformatting stall was too high
     and the overall sequence slower.  */

  emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
                                const0_rtx, const2_rtx,
                                const0_rtx, const0_rtx));
  emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
                                const0_rtx, const2_rtx,
                                const0_rtx, const0_rtx));
  res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));

  set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
}
void
ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
{
  machine_mode mode = GET_MODE (op0);
  rtx t1, t2, t3, t4, t5, t6;

  if (TARGET_AVX512DQ && mode == V8DImode)
    emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
  else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
    emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
  else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
    emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
  else if (TARGET_XOP && mode == V2DImode)
    {
      /* op1: A,B,C,D, op2: E,F,G,H */
      op1 = gen_lowpart (V4SImode, op1);
      op2 = gen_lowpart (V4SImode, op2);

      t1 = gen_reg_rtx (V4SImode);
      t2 = gen_reg_rtx (V4SImode);
      t3 = gen_reg_rtx (V2DImode);
      t4 = gen_reg_rtx (V2DImode);

      /* t1: B,A,D,C */
      emit_insn (gen_sse2_pshufd_1 (t1, op1,
                                    GEN_INT (1),
                                    GEN_INT (0),
                                    GEN_INT (3),
                                    GEN_INT (2)));

      /* t2: (B*E),(A*F),(D*G),(C*H) */
      emit_insn (gen_mulv4si3 (t2, t1, op2));

      /* t3: (B*E)+(A*F), (D*G)+(C*H) */
      emit_insn (gen_xop_phadddq (t3, t2));

      /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
      emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));

      /* Multiply lower parts and add all */
      t5 = gen_reg_rtx (V2DImode);
      emit_insn (gen_vec_widen_umult_even_v4si (t5,
                                        gen_lowpart (V4SImode, op1),
                                        gen_lowpart (V4SImode, op2)));
      force_expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
    }
  else
    {
      machine_mode nmode;
      rtx (*umul) (rtx, rtx, rtx);

      if (mode == V2DImode)
        {
          umul = gen_vec_widen_umult_even_v4si;
          nmode = V4SImode;
        }
      else if (mode == V4DImode)
        {
          umul = gen_vec_widen_umult_even_v8si;
          nmode = V8SImode;
        }
      else if (mode == V8DImode)
        {
          umul = gen_vec_widen_umult_even_v16si;
          nmode = V16SImode;
        }
      else
        gcc_unreachable ();
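      /* The identity used below, with all products taken modulo 2^64:
         a * b == lo(a)*lo(b) + ((hi(a)*lo(b) + lo(a)*hi(b)) << 32).  */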
      /* Multiply low parts.  */
      t1 = gen_reg_rtx (mode);
      emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));

      /* Shift input vectors right 32 bits so we can multiply high parts.  */
      t6 = GEN_INT (32);
      t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
      t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);

      /* Multiply high parts by low parts.  */
      t4 = gen_reg_rtx (mode);
      t5 = gen_reg_rtx (mode);
      emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
      emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));

      /* Combine and shift the highparts back.  */
      t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
      t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);

      /* Combine high and low parts.  */
      force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
    }

  set_unique_reg_note (get_last_insn (), REG_EQUAL,
                       gen_rtx_MULT (mode, op1, op2));
}
/* Return 1 if control transfer instruction INSN
   should be encoded with notrack prefix.  */

bool
ix86_notrack_prefixed_insn_p (rtx_insn *insn)
{
  if (!insn || !((flag_cf_protection & CF_BRANCH)))
    return false;

  if (CALL_P (insn))
    {
      rtx call = get_call_rtx_from (insn);
      gcc_assert (call != NULL_RTX);
      rtx addr = XEXP (call, 0);

      /* Do not emit 'notrack' if it's not an indirect call.  */
      if (MEM_P (addr)
          && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
        return false;
      else
        return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
    }

  if (JUMP_P (insn) && !flag_cet_switch)
    {
      rtx target = JUMP_LABEL (insn);
      if (target == NULL_RTX || ANY_RETURN_P (target))
        return false;

      /* Check the jump is a switch table.  */
      rtx_insn *label = as_a<rtx_insn *> (target);
      rtx_insn *table = next_insn (label);
      if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
        return false;
      else
        return true;
    }

  return false;
}
/* Calculate integer abs() using only SSE2 instructions.  */

void
ix86_expand_sse2_abs (rtx target, rtx input)
{
  machine_mode mode = GET_MODE (target);
  rtx tmp0, tmp1, x;

  switch (mode)
    {
    case E_V2DImode:
    case E_V4DImode:
      /* For 64-bit signed integer X, with SSE4.2 use
         pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
         Otherwise handle it similarly to V4SImode, except use 64 as W instead
         of 32 and use logical instead of arithmetic right shift (which is
         unimplemented) and subtract.  */
      if (TARGET_SSE4_2)
        {
          tmp0 = gen_reg_rtx (mode);
          tmp1 = gen_reg_rtx (mode);
          emit_move_insn (tmp1, CONST0_RTX (mode));
          if (mode == E_V2DImode)
            emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input));
          else
            emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input));
        }
      else
        {
          tmp0 = expand_simple_binop (mode, LSHIFTRT, input,
                                      GEN_INT (GET_MODE_UNIT_BITSIZE (mode)
                                               - 1), NULL, 0, OPTAB_DIRECT);
          tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false);
        }

      tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
                                  NULL, 0, OPTAB_DIRECT);
      x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
                               target, 0, OPTAB_DIRECT);
      break;

    case E_V4SImode:
      /* For 32-bit signed integer X, the best way to calculate the absolute
         value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)).  */
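      /* For example, X == -5: X >> 31 == -1, (-5 ^ -1) == 4,
         and 4 - (-1) == 5.  */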
      tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
                                  GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
                                  NULL, 0, OPTAB_DIRECT);
      tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
                                  NULL, 0, OPTAB_DIRECT);
      x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
                               target, 0, OPTAB_DIRECT);
      break;

    case E_V8HImode:
      /* For 16-bit signed integer X, the best way to calculate the absolute
         value of X is max (X, -X), as SSE2 provides the PMAXSW insn.  */
      tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);

      x = expand_simple_binop (mode, SMAX, tmp0, input,
                               target, 0, OPTAB_DIRECT);
      break;

    case E_V16QImode:
      /* For 8-bit signed integer X, the best way to calculate the absolute
         value of X is min ((unsigned char) X, (unsigned char) (-X)),
         as SSE2 provides the PMINUB insn.  */
      tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);

      x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
                               target, 0, OPTAB_DIRECT);
      break;

    default:
      gcc_unreachable ();
    }

  if (x != target)
    emit_move_insn (target, x);
}
/* Expand an extract from a vector register through pextr insn.
   Return true if successful.  */

bool
ix86_expand_pextr (rtx *operands)
{
  rtx dst = operands[0];
  rtx src = operands[1];

  unsigned int size = INTVAL (operands[2]);
  unsigned int pos = INTVAL (operands[3]);

  if (SUBREG_P (dst))
    {
      /* Reject non-lowpart subregs.  */
      if (SUBREG_BYTE (dst) > 0)
        return false;
      dst = SUBREG_REG (dst);
    }

  if (SUBREG_P (src))
    {
      pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
      src = SUBREG_REG (src);
    }

  switch (GET_MODE (src))
    {
    case E_V16QImode:
    case E_V8HImode:
    case E_V4SImode:
    case E_V2DImode:
    case E_V1TImode:
      {
        machine_mode srcmode, dstmode;
        rtx d, pat;

        if (!int_mode_for_size (size, 0).exists (&dstmode))
          return false;

        switch (dstmode)
          {
          case E_QImode:
            if (!TARGET_SSE4_1)
              return false;
            srcmode = V16QImode;
            break;

          case E_HImode:
            if (!TARGET_SSE2)
              return false;
            srcmode = V8HImode;
            break;

          case E_SImode:
            if (!TARGET_SSE4_1)
              return false;
            srcmode = V4SImode;
            break;

          case E_DImode:
            gcc_assert (TARGET_64BIT);
            if (!TARGET_SSE4_1)
              return false;
            srcmode = V2DImode;
            break;

          default:
            return false;
          }

        /* Reject extractions from misaligned positions.  */
        if (pos & (size-1))
          return false;

        if (GET_MODE (dst) == dstmode)
          d = dst;
        else
          d = gen_reg_rtx (dstmode);

        /* Construct insn pattern.  */
        pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
        pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);

        /* Let the rtl optimizers know about the zero extension performed.  */
        if (dstmode == QImode || dstmode == HImode)
          {
            pat = gen_rtx_ZERO_EXTEND (SImode, pat);
            d = gen_lowpart (SImode, d);
          }

        emit_insn (gen_rtx_SET (d, pat));

        if (d != dst)
          emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
        return true;
      }

    default:
      return false;
    }
}
/* Expand an insert into a vector register through pinsr insn.
   Return true if successful.  */

bool
ix86_expand_pinsr (rtx *operands)
{
  rtx dst = operands[0];
  rtx src = operands[3];

  unsigned int size = INTVAL (operands[1]);
  unsigned int pos = INTVAL (operands[2]);

  if (SUBREG_P (dst))
    {
      pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
      dst = SUBREG_REG (dst);
    }

  switch (GET_MODE (dst))
    {
    case E_V16QImode:
    case E_V8HImode:
    case E_V4SImode:
    case E_V2DImode:
    case E_V1TImode:
      {
        machine_mode srcmode, dstmode;
        rtx (*pinsr)(rtx, rtx, rtx, rtx);
        rtx d;

        if (!int_mode_for_size (size, 0).exists (&srcmode))
          return false;

        switch (srcmode)
          {
          case E_QImode:
            if (!TARGET_SSE4_1)
              return false;
            dstmode = V16QImode;
            pinsr = gen_sse4_1_pinsrb;
            break;

          case E_HImode:
            if (!TARGET_SSE2)
              return false;
            dstmode = V8HImode;
            pinsr = gen_sse2_pinsrw;
            break;

          case E_SImode:
            if (!TARGET_SSE4_1)
              return false;
            dstmode = V4SImode;
            pinsr = gen_sse4_1_pinsrd;
            break;

          case E_DImode:
            gcc_assert (TARGET_64BIT);
            if (!TARGET_SSE4_1)
              return false;
            dstmode = V2DImode;
            pinsr = gen_sse4_1_pinsrq;
            break;

          default:
            return false;
          }

        /* Reject insertions to misaligned positions.  */
        if (pos & (size-1))
          return false;

        if (SUBREG_P (src))
          {
            unsigned int srcpos = SUBREG_BYTE (src);

            if (srcpos > 0)
              {
                rtx extr_ops[4];

                extr_ops[0] = gen_reg_rtx (srcmode);
                extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
                extr_ops[2] = GEN_INT (size);
                extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);

                if (!ix86_expand_pextr (extr_ops))
                  return false;

                src = extr_ops[0];
              }
            else
              src = gen_lowpart (srcmode, SUBREG_REG (src));
          }

        if (GET_MODE (dst) == dstmode)
          d = dst;
        else
          d = gen_reg_rtx (dstmode);

        emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
                          gen_lowpart (srcmode, src),
                          GEN_INT (1 << (pos / size))));

        if (d != dst)
          emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
        return true;
      }

    default:
      return false;
    }
}
/* All CPUs prefer to avoid cross-lane operations so perform reductions
   upper against lower halves up to SSE reg size.  */

machine_mode
ix86_split_reduction (machine_mode mode)
{
  /* Reduce lowpart against highpart until we reach SSE reg width to
     avoid cross-lane operations.  */
  switch (mode)
    {
    case E_V8DImode:
    case E_V4DImode:
      return V2DImode;
    case E_V16SImode:
    case E_V8SImode:
      return V4SImode;
    case E_V16SFmode:
    case E_V8SFmode:
      return V4SFmode;
    case E_V8DFmode:
    case E_V4DFmode:
      return V2DFmode;
    default:
      return mode;
    }
}
/* Generate call to __divmoddi4.  */

void
ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
                            rtx op0, rtx op1,
                            rtx *quot_p, rtx *rem_p)
{
  rtx rem = assign_386_stack_local (mode, SLOT_TEMP);

  rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
                                      mode, op0, mode, op1, mode,
                                      XEXP (rem, 0), Pmode);
  *quot_p = quot;
  *rem_p = rem;
}
void
ix86_expand_atomic_fetch_op_loop (rtx target, rtx mem, rtx val,
                                  enum rtx_code code, bool after,
                                  bool doubleword)
{
  rtx old_reg, new_reg, old_mem, success;
  machine_mode mode = GET_MODE (target);
  rtx_code_label *loop_label = NULL;

  old_reg = gen_reg_rtx (mode);
  new_reg = old_reg;
  old_mem = copy_to_reg (mem);
  loop_label = gen_label_rtx ();
  emit_label (loop_label);
  emit_move_insn (old_reg, old_mem);

  /* return value for atomic_fetch_op.  */
  if (!after)
    emit_move_insn (target, old_reg);

  if (code == NOT)
    {
      new_reg = expand_simple_binop (mode, AND, new_reg, val, NULL_RTX,
                                     true, OPTAB_LIB_WIDEN);
      new_reg = expand_simple_unop (mode, code, new_reg, NULL_RTX, true);
    }
  else
    new_reg = expand_simple_binop (mode, code, new_reg, val, NULL_RTX,
                                   true, OPTAB_LIB_WIDEN);

  /* return value for atomic_op_fetch.  */
  if (after)
    emit_move_insn (target, new_reg);

  success = NULL_RTX;

  ix86_expand_cmpxchg_loop (&success, old_mem, mem, old_reg, new_reg,
                            gen_int_mode (MEMMODEL_SYNC_SEQ_CST,
                                          SImode),
                            doubleword, loop_label);
}
/* Relax cmpxchg instruction; param loop_label indicates whether
   the instruction should be relaxed with a pause loop.  If not,
   it will be relaxed to an atomic load + compare, and the cmpxchg
   instruction is skipped if mem != exp_input.  */

void
ix86_expand_cmpxchg_loop (rtx *ptarget_bool, rtx target_val,
                          rtx mem, rtx exp_input, rtx new_input,
                          rtx mem_model, bool doubleword,
                          rtx_code_label *loop_label)
{
  rtx_code_label *cmp_label = NULL;
  rtx_code_label *done_label = NULL;
  rtx target_bool = NULL_RTX, new_mem = NULL_RTX;
  rtx (*gen) (rtx, rtx, rtx, rtx, rtx) = NULL;
  rtx (*gendw) (rtx, rtx, rtx, rtx, rtx, rtx) = NULL;
  machine_mode mode = GET_MODE (target_val), hmode = mode;

  if (*ptarget_bool == NULL)
    target_bool = gen_reg_rtx (QImode);
  else
    target_bool = *ptarget_bool;

  cmp_label = gen_label_rtx ();
  done_label = gen_label_rtx ();

  new_mem = gen_reg_rtx (mode);
  /* Load memory first.  */
  expand_atomic_load (new_mem, mem, MEMMODEL_SEQ_CST);

  switch (mode)
    {
    case E_TImode:
      gendw = gen_atomic_compare_and_swapti_doubleword;
      hmode = DImode;
      break;
    case E_DImode:
      if (doubleword)
        {
          gendw = gen_atomic_compare_and_swapdi_doubleword;
          hmode = SImode;
        }
      else
        gen = gen_atomic_compare_and_swapdi_1;
      break;
    case E_SImode:
      gen = gen_atomic_compare_and_swapsi_1;
      break;
    case E_HImode:
      gen = gen_atomic_compare_and_swaphi_1;
      break;
    case E_QImode:
      gen = gen_atomic_compare_and_swapqi_1;
      break;
    default:
      gcc_unreachable ();
    }

  /* Compare mem value with expected value.  */
  if (doubleword)
    {
      rtx low_new_mem = gen_lowpart (hmode, new_mem);
      rtx low_exp_input = gen_lowpart (hmode, exp_input);
      rtx high_new_mem = gen_highpart (hmode, new_mem);
      rtx high_exp_input = gen_highpart (hmode, exp_input);
      emit_cmp_and_jump_insns (low_new_mem, low_exp_input, NE, NULL_RTX,
                               hmode, 1, cmp_label,
                               profile_probability::guessed_never ());
      emit_cmp_and_jump_insns (high_new_mem, high_exp_input, NE, NULL_RTX,
                               hmode, 1, cmp_label,
                               profile_probability::guessed_never ());
    }
  else
    emit_cmp_and_jump_insns (new_mem, exp_input, NE, NULL_RTX,
                             GET_MODE (exp_input), 1, cmp_label,
                             profile_probability::guessed_never ());

  /* Directly emits cmpxchg here.  */
  if (doubleword)
    emit_insn (gendw (target_val, mem, exp_input,
                      gen_lowpart (hmode, new_input),
                      gen_highpart (hmode, new_input),
                      mem_model));
  else
    emit_insn (gen (target_val, mem, exp_input, new_input, mem_model));

  if (!loop_label)
    {
      emit_jump_insn (gen_jump (done_label));
      emit_barrier ();
      emit_label (cmp_label);
      emit_move_insn (target_val, new_mem);
      emit_label (done_label);
      ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
                         const0_rtx);
    }
  else
    {
      ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
                         const0_rtx);
      emit_cmp_and_jump_insns (target_bool, const0_rtx, EQ, const0_rtx,
                               GET_MODE (target_bool), 1, loop_label,
                               profile_probability::guessed_never ());
      emit_jump_insn (gen_jump (done_label));
      emit_barrier ();

      /* If mem is not expected, pause and loop back.  */
      emit_label (cmp_label);
      emit_move_insn (target_val, new_mem);
      emit_insn (gen_pause ());
      emit_jump_insn (gen_jump (loop_label));
      emit_barrier ();
      emit_label (done_label);
    }

  *ptarget_bool = target_bool;
}
/* Convert a BFmode VAL to SFmode without signaling sNaNs.
   This is done by returning SF SUBREG of ((HI SUBREG) (VAL)) << 16.  */
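/* A BFmode value holds the upper 16 bits of the corresponding IEEE binary32
   encoding, so placing those bits in the high half of a 32-bit word and
   reinterpreting the bits as SFmode reproduces the same value exactly.  */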
rtx
ix86_expand_fast_convert_bf_to_sf (rtx val)
{
  rtx op = gen_lowpart (HImode, val), ret;
  if (CONST_INT_P (op))
    {
      ret = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
                                            val, BFmode);
      if (ret)
        return ret;
      /* FLOAT_EXTEND simplification will fail if VAL is a sNaN.  */
      ret = gen_reg_rtx (SImode);
      emit_move_insn (ret, GEN_INT (INTVAL (op) & 0xffff));
      emit_insn (gen_ashlsi3 (ret, ret, GEN_INT (16)));
      return gen_lowpart (SFmode, ret);
    }

  ret = gen_reg_rtx (SFmode);
  emit_insn (gen_extendbfsf2_1 (ret, force_reg (BFmode, val)));
  return ret;
}

#include "gt-i386-expand.h"