/* Copyright (C) 1988-2019 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#define IN_TARGET_CODE 1

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "stringpool.h"
#include "diagnostic.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "insn-attr.h"
#include "common/common-target.h"
#include "langhooks.h"
#include "tm-constrs.h"
#include "sched-int.h"
#include "tree-pass.h"
#include "pass_manager.h"
#include "target-globals.h"
#include "gimple-iterator.h"
#include "tree-vectorizer.h"
#include "shrink-wrap.h"
#include "tree-iterator.h"
#include "case-cfn-macros.h"
#include "fold-const-call.h"
#include "tree-ssanames.h"
#include "selftest-rtl.h"
#include "print-rtl.h"
#include "symbol-summary.h"
#include "ipa-fnsummary.h"
#include "wide-int-bitmask.h"
#include "tree-vector-builder.h"
#include "dwarf2out.h"
#include "i386-options.h"
#include "i386-builtins.h"
#include "i386-expand.h"
/* Split one or more double-mode RTL references into pairs of half-mode
   references.  The RTL can be REG, offsettable MEM, integer constant, or
   CONST_DOUBLE.  "operands" is a pointer to an array of double-mode RTLs to
   split and "num" is its length.  lo_half and hi_half are output arrays
   that parallel "operands".  */

void
split_double_mode (machine_mode mode, rtx operands[],
		   int num, rtx lo_half[], rtx hi_half[])
{
  machine_mode half_mode;
  unsigned int byte;

  switch (mode)
    {
    case E_TImode:
      half_mode = DImode;
      break;
    case E_DImode:
      half_mode = SImode;
      break;
    default:
      gcc_unreachable ();
    }

  byte = GET_MODE_SIZE (half_mode);

  while (num--)
    {
      rtx op = operands[num];

      /* simplify_subreg refuses to split volatile memory addresses,
	 but we still have to handle it.  */
      if (MEM_P (op))
	{
	  lo_half[num] = adjust_address (op, half_mode, 0);
	  hi_half[num] = adjust_address (op, half_mode, byte);
	}
      else
	{
	  lo_half[num] = simplify_gen_subreg (half_mode, op,
					      GET_MODE (op) == VOIDmode
					      ? mode : GET_MODE (op), 0);
	  hi_half[num] = simplify_gen_subreg (half_mode, op,
					      GET_MODE (op) == VOIDmode
					      ? mode : GET_MODE (op), byte);
	}
    }
}
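
/* Editor's note: an illustrative example, not part of the original
   sources.  Splitting the single DImode operand (reg:DI ax) on a
   32-bit target yields lo_half[0] = (subreg:SI (reg:DI ax) 0) and
   hi_half[0] = (subreg:SI (reg:DI ax) 4), since
   GET_MODE_SIZE (SImode) == 4.  */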
/* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
   for the target.  */

void
ix86_expand_clear (rtx dest)
{
  rtx tmp;

  /* We play register width games, which are only valid after reload.  */
  gcc_assert (reload_completed);

  /* Avoid HImode and its attendant prefix byte.  */
  if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
    dest = gen_rtx_REG (SImode, REGNO (dest));
  tmp = gen_rtx_SET (dest, const0_rtx);

  if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
    {
      rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
    }

  emit_insn (tmp);
}
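
/* Editor's note: an illustrative example, not part of the original
   sources.  On most tunings the PARALLEL built above becomes
   "xor %eax, %eax", which is shorter than "mov $0, %eax" but clobbers
   the flags; the explicit CLOBBER of FLAGS_REG records that fact for
   the rest of the RTL pipeline.  */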
void
ix86_expand_move (machine_mode mode, rtx operands[])
{
  rtx op0, op1;
  rtx tmp, addend = NULL_RTX;
  enum tls_model model;

  op0 = operands[0];
  op1 = operands[1];

  switch (GET_CODE (op1))
    {
    case CONST:
      tmp = XEXP (op1, 0);

      if (GET_CODE (tmp) != PLUS
	  || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
	break;

      op1 = XEXP (tmp, 0);
      addend = XEXP (tmp, 1);
      /* FALLTHRU */

    case SYMBOL_REF:
      model = SYMBOL_REF_TLS_MODEL (op1);

      if (model)
	op1 = legitimize_tls_address (op1, model, true);
      else if (ix86_force_load_from_GOT_p (op1))
	{
	  /* Load the external function address via GOT slot to avoid PLT.  */
	  op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
				(TARGET_64BIT
				 ? UNSPEC_GOTPCREL
				 : UNSPEC_GOT));
	  op1 = gen_rtx_CONST (Pmode, op1);
	  op1 = gen_const_mem (Pmode, op1);
	  set_mem_alias_set (op1, ix86_GOT_alias_set ());
	}
      else
	{
	  tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
	  if (tmp)
	    {
	      op1 = tmp;
	      if (!addend)
		break;
	    }
	  else
	    {
	      op1 = operands[1];
	      break;
	    }
	}

      if (addend)
	{
	  op1 = force_operand (op1, NULL_RTX);
	  op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
				     op0, 1, OPTAB_DIRECT);
	}
      else
	op1 = force_operand (op1, op0);

      if (op1 == op0)
	return;

      op1 = convert_to_mode (mode, op1, 1);

    default:
      break;
    }

  if ((flag_pic || MACHOPIC_INDIRECT)
      && symbolic_operand (op1, mode))
    {
      if (TARGET_MACHO && !TARGET_64BIT)
	{
#if TARGET_MACHO
	  if (MACHOPIC_INDIRECT)
	    {
	      rtx temp = (op0 && REG_P (op0) && mode == Pmode)
			 ? op0 : gen_reg_rtx (Pmode);
	      op1 = machopic_indirect_data_reference (op1, temp);
	      if (MACHOPIC_PURE)
		op1 = machopic_legitimize_pic_address (op1, mode,
						       temp == op1 ? 0 : temp);
	    }
	  if (op0 != op1 && GET_CODE (op0) != MEM)
	    {
	      rtx insn = gen_rtx_SET (op0, op1);
	      emit_insn (insn);
	      return;
	    }
	  if (GET_CODE (op0) == MEM)
	    op1 = force_reg (Pmode, op1);
	  else
	    {
	      rtx temp = op0;
	      if (GET_CODE (temp) != REG)
		temp = gen_reg_rtx (Pmode);
	      temp = legitimize_pic_address (op1, temp);
	      if (temp == op0)
		return;
	      op1 = temp;
	    }
#endif
	}
      else
	{
	  if (MEM_P (op0))
	    op1 = force_reg (mode, op1);
	  else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
	    {
	      rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
	      op1 = legitimize_pic_address (op1, reg);
	      if (op0 == op1)
		return;
	      op1 = convert_to_mode (mode, op1, 1);
	    }
	}
    }
  else
    {
      if (MEM_P (op0)
	  && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
	      || !push_operand (op0, mode))
	  && MEM_P (op1))
	op1 = force_reg (mode, op1);

      if (push_operand (op0, mode)
	  && ! general_no_elim_operand (op1, mode))
	op1 = copy_to_mode_reg (mode, op1);

      /* Force large constants in 64bit compilation into register
	 to get them CSEed.  */
      if (can_create_pseudo_p ()
	  && (mode == DImode) && TARGET_64BIT
	  && immediate_operand (op1, mode)
	  && !x86_64_zext_immediate_operand (op1, VOIDmode)
	  && !register_operand (op0, mode)
	  && optimize)
	op1 = copy_to_mode_reg (mode, op1);

      if (can_create_pseudo_p ()
	  && CONST_DOUBLE_P (op1))
	{
	  /* If we are loading a floating point constant to a register,
	     force the value to memory now, since we'll get better code
	     out the back end.  */

	  op1 = validize_mem (force_const_mem (mode, op1));
	  if (!register_operand (op0, mode))
	    {
	      rtx temp = gen_reg_rtx (mode);
	      emit_insn (gen_rtx_SET (temp, op1));
	      emit_move_insn (op0, temp);
	      return;
	    }
	}
    }

  emit_insn (gen_rtx_SET (op0, op1));
}
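
/* Editor's note: an illustrative example, not part of the original
   sources.  With -fpic on x86-64, moving the address of an external
   function f through the GOT path above yields RTL along the lines of
     (set (reg:DI tmp)
	  (mem:DI (const:DI (unspec:DI [(symbol_ref:DI "f")]
				       UNSPEC_GOTPCREL))))
   i.e. "movq f@GOTPCREL(%rip), %reg", which avoids a PLT entry.  */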
void
ix86_expand_vector_move (machine_mode mode, rtx operands[])
{
  rtx op0 = operands[0], op1 = operands[1];
  /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
     psABI since the biggest alignment is 4 bytes for IA MCU psABI.  */
  unsigned int align = (TARGET_IAMCU
			? GET_MODE_BITSIZE (mode)
			: GET_MODE_ALIGNMENT (mode));

  if (push_operand (op0, VOIDmode))
    op0 = emit_move_resolve_push (mode, op0);

  /* Force constants other than zero into memory.  We do not know how
     the instructions used to build constants modify the upper 64 bits
     of the register, once we have that information we may be able
     to handle some of them more efficiently.  */
  if (can_create_pseudo_p ()
      && (CONSTANT_P (op1)
	  || (SUBREG_P (op1)
	      && CONSTANT_P (SUBREG_REG (op1))))
      && ((register_operand (op0, mode)
	   && !standard_sse_constant_p (op1, mode))
	  /* ix86_expand_vector_move_misalign() does not like constants.  */
	  || (SSE_REG_MODE_P (mode)
	      && MEM_P (op0)
	      && MEM_ALIGN (op0) < align)))
    {
      if (SUBREG_P (op1))
	{
	  machine_mode imode = GET_MODE (SUBREG_REG (op1));
	  rtx r = force_const_mem (imode, SUBREG_REG (op1));
	  if (r)
	    r = validize_mem (r);
	  else
	    r = force_reg (imode, SUBREG_REG (op1));
	  op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
	}
      else
	op1 = validize_mem (force_const_mem (mode, op1));
    }

  /* We need to check memory alignment for SSE mode since attribute
     can make operands unaligned.  */
  if (can_create_pseudo_p ()
      && SSE_REG_MODE_P (mode)
      && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
	  || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
    {
      rtx tmp[2];

      /* ix86_expand_vector_move_misalign() does not like both
	 arguments in memory.  */
      if (!register_operand (op0, mode)
	  && !register_operand (op1, mode))
	op1 = force_reg (mode, op1);

      tmp[0] = op0; tmp[1] = op1;
      ix86_expand_vector_move_misalign (mode, tmp);
      return;
    }

  /* Make operand1 a register if it isn't already.  */
  if (can_create_pseudo_p ()
      && !register_operand (op0, mode)
      && !register_operand (op1, mode))
    {
      emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
      return;
    }

  emit_insn (gen_rtx_SET (op0, op1));
}
/* Split 32-byte AVX unaligned load and store if needed.  */

static void
ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
{
  rtx m;
  rtx (*extract) (rtx, rtx, rtx);
  machine_mode mode;

  if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
      || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  rtx orig_op0 = NULL_RTX;
  mode = GET_MODE (op0);
  switch (GET_MODE_CLASS (mode))
    {
    case MODE_VECTOR_INT:
    case MODE_INT:
      if (mode != V32QImode)
	{
	  if (!MEM_P (op0))
	    {
	      orig_op0 = op0;
	      op0 = gen_reg_rtx (V32QImode);
	    }
	  else
	    op0 = gen_lowpart (V32QImode, op0);
	  op1 = gen_lowpart (V32QImode, op1);
	  mode = V32QImode;
	}
      break;
    case MODE_VECTOR_FLOAT:
      break;
    default:
      gcc_unreachable ();
    }

  switch (mode)
    {
    default:
      gcc_unreachable ();
    case E_V32QImode:
      extract = gen_avx_vextractf128v32qi;
      mode = V16QImode;
      break;
    case E_V8SFmode:
      extract = gen_avx_vextractf128v8sf;
      mode = V4SFmode;
      break;
    case E_V4DFmode:
      extract = gen_avx_vextractf128v4df;
      mode = V2DFmode;
      break;
    }

  if (MEM_P (op1))
    {
      rtx r = gen_reg_rtx (mode);
      m = adjust_address (op1, mode, 0);
      emit_move_insn (r, m);
      m = adjust_address (op1, mode, 16);
      r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
      emit_move_insn (op0, r);
    }
  else if (MEM_P (op0))
    {
      m = adjust_address (op0, mode, 0);
      emit_insn (extract (m, op1, const0_rtx));
      m = adjust_address (op0, mode, 16);
      emit_insn (extract (m, copy_rtx (op1), const1_rtx));
    }
  else
    gcc_unreachable ();

  if (orig_op0)
    emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
}
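
/* Editor's note: an illustrative example, not part of the original
   sources.  For an unaligned 32-byte load with this split tuning
   enabled, the code above emits a 16-byte load into a register
   followed by a VEC_CONCAT with the upper 16 bytes, roughly
   "vmovdqu mem, %xmm0; vinsertf128 $1, mem+16, %ymm0, %ymm0";
   the store side uses two vextractf128 stores instead.  */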
/* Implement the movmisalign patterns for SSE.  Non-SSE modes go
   straight to ix86_expand_vector_move.  */
/* Code generation for scalar reg-reg moves of single and double precision data:
     if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
       movaps reg, reg
     else
       movss reg, reg
     if (x86_sse_partial_reg_dependency == true)
       movapd reg, reg
     else
       movsd reg, reg

   Code generation for scalar loads of double precision data:
     if (x86_sse_split_regs == true)
       movlpd mem, reg      (gas syntax)
     else
       movsd mem, reg

   Code generation for unaligned packed loads of single precision data
   (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
     if (x86_sse_unaligned_move_optimal)
       movups mem, reg

     if (x86_sse_partial_reg_dependency == true)
       {
	 xorps  reg, reg
	 movlps mem, reg
	 movhps mem+8, reg
       }
     else
       {
	 movlps mem, reg
	 movhps mem+8, reg
       }

   Code generation for unaligned packed loads of double precision data
   (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
     if (x86_sse_unaligned_move_optimal)
       movupd mem, reg

     if (x86_sse_split_regs == true)
       {
	 movlpd mem, reg
	 movhpd mem+8, reg
       }
     else
       {
	 movsd  mem, reg
	 unpcklpd reg, reg
       }  */

void
ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
{
  rtx op0, op1, m;

  op0 = operands[0];
  op1 = operands[1];

  /* Use unaligned load/store for AVX512 or when optimizing for size.  */
  if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  if (TARGET_AVX)
    {
      if (GET_MODE_SIZE (mode) == 32)
	ix86_avx256_split_vector_move_misalign (op0, op1);
      else
	/* Always use 128-bit mov<mode>_internal pattern for AVX.  */
	emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
      || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  /* ??? If we have typed data, then it would appear that using
     movdqu is the only way to get unaligned data loaded with
     integer type.  */
  if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  if (MEM_P (op1))
    {
      if (TARGET_SSE2 && mode == V2DFmode)
	{
	  rtx zero;

	  /* When SSE registers are split into halves, we can avoid
	     writing to the top half twice.  */
	  if (TARGET_SSE_SPLIT_REGS)
	    {
	      emit_clobber (op0);
	      zero = op0;
	    }
	  else
	    {
	      /* ??? Not sure about the best option for the Intel chips.
		 The following would seem to satisfy; the register is
		 entirely cleared, breaking the dependency chain.  We
		 then store to the upper half, with a dependency depth
		 of one.  A rumor has it that Intel recommends two movsd
		 followed by an unpacklpd, but this is unconfirmed.  And
		 given that the dependency depth of the unpacklpd would
		 still be one, I'm not sure why this would be better.  */
	      zero = CONST0_RTX (V2DFmode);
	    }

	  m = adjust_address (op1, DFmode, 0);
	  emit_insn (gen_sse2_loadlpd (op0, zero, m));
	  m = adjust_address (op1, DFmode, 8);
	  emit_insn (gen_sse2_loadhpd (op0, op0, m));
	}
      else
	{
	  rtx t;

	  if (mode != V4SFmode)
	    t = gen_reg_rtx (V4SFmode);
	  else
	    t = op0;

	  if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
	    emit_move_insn (t, CONST0_RTX (V4SFmode));
	  else
	    emit_clobber (t);

	  m = adjust_address (op1, V2SFmode, 0);
	  emit_insn (gen_sse_loadlps (t, t, m));
	  m = adjust_address (op1, V2SFmode, 8);
	  emit_insn (gen_sse_loadhps (t, t, m));
	  if (mode != V4SFmode)
	    emit_move_insn (op0, gen_lowpart (mode, t));
	}
    }
  else if (MEM_P (op0))
    {
      if (TARGET_SSE2 && mode == V2DFmode)
	{
	  m = adjust_address (op0, DFmode, 0);
	  emit_insn (gen_sse2_storelpd (m, op1));
	  m = adjust_address (op0, DFmode, 8);
	  emit_insn (gen_sse2_storehpd (m, op1));
	}
      else
	{
	  if (mode != V4SFmode)
	    op1 = gen_lowpart (V4SFmode, op1);

	  m = adjust_address (op0, V2SFmode, 0);
	  emit_insn (gen_sse_storelps (m, op1));
	  m = adjust_address (op0, V2SFmode, 8);
	  emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
	}
    }
  else
    gcc_unreachable ();
}
/* Move bits 64:95 to bits 32:63.  */

static void
ix86_move_vector_high_sse_to_mmx (rtx op)
{
  rtx mask = gen_rtx_PARALLEL (VOIDmode,
			       gen_rtvec (4, GEN_INT (0), GEN_INT (2),
					  GEN_INT (0), GEN_INT (0)));
  rtx dest = lowpart_subreg (V4SImode, op, GET_MODE (op));
  op = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
  rtx insn = gen_rtx_SET (dest, op);
  emit_insn (insn);
}
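
/* Editor's note: an illustrative example, not part of the original
   sources.  The VEC_SELECT above keeps element 0 and moves element 2
   into lane 1; the element selection (0, 2, 0, 0) corresponds to a
   "pshufd" with an immediate of 0x08 (binary 00 00 10 00).  */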
/* Split MMX pack with signed/unsigned saturation with SSE/SSE2.  */

void
ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
{
  rtx op0 = operands[0];
  rtx op1 = operands[1];
  rtx op2 = operands[2];

  machine_mode dmode = GET_MODE (op0);
  machine_mode smode = GET_MODE (op1);
  machine_mode inner_dmode = GET_MODE_INNER (dmode);
  machine_mode inner_smode = GET_MODE_INNER (smode);

  /* Get the corresponding SSE mode for destination.  */
  int nunits = 16 / GET_MODE_SIZE (inner_dmode);
  machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode),
					    nunits).require ();
  machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode),
						 nunits / 2).require ();

  /* Get the corresponding SSE mode for source.  */
  nunits = 16 / GET_MODE_SIZE (inner_smode);
  machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode),
					    nunits).require ();

  /* Generate SSE pack with signed/unsigned saturation.  */
  rtx dest = lowpart_subreg (sse_dmode, op0, GET_MODE (op0));
  op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
  op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));

  op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
  op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
  rtx insn = gen_rtx_SET (dest, gen_rtx_VEC_CONCAT (sse_dmode,
						    op1, op2));
  emit_insn (insn);

  ix86_move_vector_high_sse_to_mmx (op0);
}
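
/* Editor's note: an illustrative example, not part of the original
   sources.  For an MMX packsswb, op1 and op2 are V4HI widened here to
   V8HI subregs, and the V16QI result is built as
   (vec_concat (ss_truncate op1) (ss_truncate op2)).  The 4 meaningful
   bytes from each source then sit in bits 0:31 and 64:95, and the
   final ix86_move_vector_high_sse_to_mmx call moves bits 64:95 down
   to 32:63, leaving the packed MMX value in the low 64 bits.  */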
/* Split MMX punpcklXX/punpckhXX with SSE punpcklXX.  */

void
ix86_split_mmx_punpck (rtx operands[], bool high_p)
{
  rtx op0 = operands[0];
  rtx op1 = operands[1];
  rtx op2 = operands[2];
  machine_mode mode = GET_MODE (op0);
  rtx mask;
  /* The corresponding SSE mode.  */
  machine_mode sse_mode, double_sse_mode;

  switch (mode)
    {
    case E_V8QImode:
      sse_mode = V16QImode;
      double_sse_mode = V32QImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
			       gen_rtvec (16,
					  GEN_INT (0), GEN_INT (16),
					  GEN_INT (1), GEN_INT (17),
					  GEN_INT (2), GEN_INT (18),
					  GEN_INT (3), GEN_INT (19),
					  GEN_INT (4), GEN_INT (20),
					  GEN_INT (5), GEN_INT (21),
					  GEN_INT (6), GEN_INT (22),
					  GEN_INT (7), GEN_INT (23)));
      break;

    case E_V4HImode:
      sse_mode = V8HImode;
      double_sse_mode = V16HImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
			       gen_rtvec (8,
					  GEN_INT (0), GEN_INT (8),
					  GEN_INT (1), GEN_INT (9),
					  GEN_INT (2), GEN_INT (10),
					  GEN_INT (3), GEN_INT (11)));
      break;

    case E_V2SImode:
      sse_mode = V4SImode;
      double_sse_mode = V8SImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
			       gen_rtvec (4,
					  GEN_INT (0), GEN_INT (4),
					  GEN_INT (1), GEN_INT (5)));
      break;

    default:
      gcc_unreachable ();
    }

  /* Generate SSE punpcklXX.  */
  rtx dest = lowpart_subreg (sse_mode, op0, GET_MODE (op0));
  op1 = lowpart_subreg (sse_mode, op1, GET_MODE (op1));
  op2 = lowpart_subreg (sse_mode, op2, GET_MODE (op2));

  op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2);
  op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask);
  rtx insn = gen_rtx_SET (dest, op2);
  emit_insn (insn);

  if (high_p)
    {
      /* Move bits 64:127 to bits 0:63.  */
      mask = gen_rtx_PARALLEL (VOIDmode,
			       gen_rtvec (4, GEN_INT (2), GEN_INT (3),
					  GEN_INT (0), GEN_INT (0)));
      dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
      op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
      insn = gen_rtx_SET (dest, op1);
      emit_insn (insn);
    }
}
/* Helper function of ix86_fixup_binary_operands to canonicalize
   operand order.  Returns true if the operands should be swapped.  */

static bool
ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
			     rtx operands[])
{
  rtx dst = operands[0];
  rtx src1 = operands[1];
  rtx src2 = operands[2];

  /* If the operation is not commutative, we can't do anything.  */
  if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
      && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
    return false;

  /* Highest priority is that src1 should match dst.  */
  if (rtx_equal_p (dst, src1))
    return false;
  if (rtx_equal_p (dst, src2))
    return true;

  /* Next highest priority is that immediate constants come second.  */
  if (immediate_operand (src2, mode))
    return false;
  if (immediate_operand (src1, mode))
    return true;

  /* Lowest priority is that memory references should come second.  */
  if (MEM_P (src2))
    return false;
  if (MEM_P (src1))
    return true;

  return false;
}
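
/* Editor's note: an illustrative example, not part of the original
   sources.  For "dst = (plus (mem) (reg))" with dst equal to the reg,
   the dst/src2 test above returns true, so the expander canonicalizes
   to "dst = (plus (reg) (mem))" and the add can use dst as its first,
   matching operand.  */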
/* Fix up OPERANDS to satisfy ix86_binary_operator_ok.  Return the
   destination to use for the operation.  If different from the true
   destination in operands[0], a copy operation will be required.  */

rtx
ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
			    rtx operands[])
{
  rtx dst = operands[0];
  rtx src1 = operands[1];
  rtx src2 = operands[2];

  /* Canonicalize operand order.  */
  if (ix86_swap_binary_operands_p (code, mode, operands))
    {
      /* It is invalid to swap operands of different modes.  */
      gcc_assert (GET_MODE (src1) == GET_MODE (src2));

      std::swap (src1, src2);
    }

  /* Both source operands cannot be in memory.  */
  if (MEM_P (src1) && MEM_P (src2))
    {
      /* Optimization: Only read from memory once.  */
      if (rtx_equal_p (src1, src2))
	{
	  src2 = force_reg (mode, src2);
	  src1 = src2;
	}
      else if (rtx_equal_p (dst, src1))
	src2 = force_reg (mode, src2);
      else
	src1 = force_reg (mode, src1);
    }

  /* If the destination is memory, and we do not have matching source
     operands, do things in registers.  */
  if (MEM_P (dst) && !rtx_equal_p (dst, src1))
    dst = gen_reg_rtx (mode);

  /* Source 1 cannot be a constant.  */
  if (CONSTANT_P (src1))
    src1 = force_reg (mode, src1);

  /* Source 1 cannot be a non-matching memory.  */
  if (MEM_P (src1) && !rtx_equal_p (dst, src1))
    src1 = force_reg (mode, src1);

  /* Improve address combine.  */
  if (code == PLUS
      && GET_MODE_CLASS (mode) == MODE_INT
      && MEM_P (src2))
    src2 = force_reg (mode, src2);

  operands[1] = src1;
  operands[2] = src2;
  return dst;
}
/* Similarly, but assume that the destination has already been
   set up properly.  */

void
ix86_fixup_binary_operands_no_copy (enum rtx_code code,
				    machine_mode mode, rtx operands[])
{
  rtx dst = ix86_fixup_binary_operands (code, mode, operands);
  gcc_assert (dst == operands[0]);
}
/* Attempt to expand a binary operator.  Make the expansion closer to the
   actual machine, than just general_operand, which will allow 3 separate
   memory references (one output, two input) in a single insn.  */

void
ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
			     rtx operands[])
{
  rtx src1, src2, dst, op, clob;

  dst = ix86_fixup_binary_operands (code, mode, operands);
  src1 = operands[1];
  src2 = operands[2];

  /* Emit the instruction.  */

  op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));

  if (reload_completed
      && code == PLUS
      && !rtx_equal_p (dst, src1))
    {
      /* This is going to be an LEA; avoid splitting it later.  */
      emit_insn (op);
    }
  else
    {
      clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
    }

  /* Fix up the destination if needed.  */
  if (dst != operands[0])
    emit_move_insn (operands[0], dst);
}
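
/* Editor's note: an illustrative example, not part of the original
   sources.  For a simple SImode addition the PARALLEL built above is
     (parallel [(set (reg:SI dst) (plus:SI (reg:SI dst) (reg:SI src)))
		(clobber (reg:CC flags))])
   which matches the add patterns in i386.md; the LEA special case
   omits the flags clobber because lea does not modify the flags.  */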
/* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
   the given OPERANDS.  */

void
ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
				     rtx operands[])
{
  rtx op1 = NULL_RTX, op2 = NULL_RTX;
  if (SUBREG_P (operands[1]))
    {
      op1 = operands[1];
      op2 = operands[2];
    }
  else if (SUBREG_P (operands[2]))
    {
      op1 = operands[2];
      op2 = operands[1];
    }
  /* Optimize (__m128i) d | (__m128i) e and similar code
     when d and e are float vectors into float vector logical
     insn.  In C/C++ without using intrinsics there is no other way
     to express vector logical operation on float vectors than
     to cast them temporarily to integer vectors.  */
  if (op1
      && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
      && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
      && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
      && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
      && SUBREG_BYTE (op1) == 0
      && (GET_CODE (op2) == CONST_VECTOR
	  || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
	      && SUBREG_BYTE (op2) == 0))
      && can_create_pseudo_p ())
    {
      rtx dst;
      switch (GET_MODE (SUBREG_REG (op1)))
	{
	case E_V4SFmode:
	case E_V8SFmode:
	case E_V16SFmode:
	case E_V2DFmode:
	case E_V4DFmode:
	case E_V8DFmode:
	  dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
	  if (GET_CODE (op2) == CONST_VECTOR)
	    {
	      op2 = gen_lowpart (GET_MODE (dst), op2);
	      op2 = force_reg (GET_MODE (dst), op2);
	    }
	  else
	    {
	      op1 = operands[1];
	      op2 = SUBREG_REG (operands[2]);
	      if (!vector_operand (op2, GET_MODE (dst)))
		op2 = force_reg (GET_MODE (dst), op2);
	    }
	  op1 = SUBREG_REG (op1);
	  if (!vector_operand (op1, GET_MODE (dst)))
	    op1 = force_reg (GET_MODE (dst), op1);
	  emit_insn (gen_rtx_SET (dst,
				  gen_rtx_fmt_ee (code, GET_MODE (dst),
						  op1, op2)));
	  emit_move_insn (operands[0], gen_lowpart (mode, dst));
	  return;
	default:
	  break;
	}
    }
  if (!vector_operand (operands[1], mode))
    operands[1] = force_reg (mode, operands[1]);
  if (!vector_operand (operands[2], mode))
    operands[2] = force_reg (mode, operands[2]);
  ix86_fixup_binary_operands_no_copy (code, mode, operands);
  emit_insn (gen_rtx_SET (operands[0],
			  gen_rtx_fmt_ee (code, mode, operands[1],
					  operands[2])));
}
/* Return TRUE or FALSE depending on whether the binary operator meets the
   appropriate constraints.  */

bool
ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
			 rtx operands[3])
{
  rtx dst = operands[0];
  rtx src1 = operands[1];
  rtx src2 = operands[2];

  /* Both source operands cannot be in memory.  */
  if (MEM_P (src1) && MEM_P (src2))
    return false;

  /* Canonicalize operand order for commutative operators.  */
  if (ix86_swap_binary_operands_p (code, mode, operands))
    std::swap (src1, src2);

  /* If the destination is memory, we must have a matching source operand.  */
  if (MEM_P (dst) && !rtx_equal_p (dst, src1))
    return false;

  /* Source 1 cannot be a constant.  */
  if (CONSTANT_P (src1))
    return false;

  /* Source 1 cannot be a non-matching memory.  */
  if (MEM_P (src1) && !rtx_equal_p (dst, src1))
    /* Support "andhi/andsi/anddi" as a zero-extending move.  */
    return (code == AND
	    && (mode == HImode
		|| mode == SImode
		|| (TARGET_64BIT && mode == DImode))
	    && satisfies_constraint_L (src2));

  return true;
}
/* Attempt to expand a unary operator.  Make the expansion closer to the
   actual machine, than just general_operand, which will allow 2 separate
   memory references (one output, one input) in a single insn.  */

void
ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
			    rtx operands[])
{
  bool matching_memory = false;
  rtx src, dst, op, clob;

  dst = operands[0];
  src = operands[1];

  /* If the destination is memory, and we do not have matching source
     operands, do things in registers.  */
  if (MEM_P (dst))
    {
      if (rtx_equal_p (dst, src))
	matching_memory = true;
      else
	dst = gen_reg_rtx (mode);
    }

  /* When source operand is memory, destination must match.  */
  if (MEM_P (src) && !matching_memory)
    src = force_reg (mode, src);

  /* Emit the instruction.  */

  op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));

  if (code == NOT)
    emit_insn (op);
  else
    {
      clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
    }

  /* Fix up the destination if needed.  */
  if (dst != operands[0])
    emit_move_insn (operands[0], dst);
}
/* Predict just emitted jump instruction to be taken with probability PROB.  */

static void
predict_jump (int prob)
{
  rtx_insn *insn = get_last_insn ();
  gcc_assert (JUMP_P (insn));
  add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
}
/* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
   divisor are within the range [0-255].  */

void
ix86_split_idivmod (machine_mode mode, rtx operands[],
		    bool unsigned_p)
{
  rtx_code_label *end_label, *qimode_label;
  rtx div, mod;
  rtx_insn *insn;
  rtx scratch, tmp0, tmp1, tmp2;
  rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
  rtx (*gen_zero_extend) (rtx, rtx);
  rtx (*gen_test_ccno_1) (rtx, rtx);

  switch (mode)
    {
    case E_SImode:
      if (GET_MODE (operands[0]) == SImode)
	{
	  if (GET_MODE (operands[1]) == SImode)
	    gen_divmod4_1 = unsigned_p ? gen_udivmodsi4_1 : gen_divmodsi4_1;
	  else
	    gen_divmod4_1
	      = unsigned_p ? gen_udivmodsi4_zext_2 : gen_divmodsi4_zext_2;
	  gen_zero_extend = gen_zero_extendqisi2;
	}
      else
	{
	  gen_divmod4_1
	    = unsigned_p ? gen_udivmodsi4_zext_1 : gen_divmodsi4_zext_1;
	  gen_zero_extend = gen_zero_extendqidi2;
	}
      gen_test_ccno_1 = gen_testsi_ccno_1;
      break;
    case E_DImode:
      gen_divmod4_1 = unsigned_p ? gen_udivmoddi4_1 : gen_divmoddi4_1;
      gen_test_ccno_1 = gen_testdi_ccno_1;
      gen_zero_extend = gen_zero_extendqidi2;
      break;
    default:
      gcc_unreachable ();
    }

  end_label = gen_label_rtx ();
  qimode_label = gen_label_rtx ();

  scratch = gen_reg_rtx (mode);

  /* Use 8bit unsigned divmod if dividend and divisor are within
     the range [0-255].  */
  emit_move_insn (scratch, operands[2]);
  scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
				 scratch, 1, OPTAB_DIRECT);
  emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
  tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
  tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
  tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
			       gen_rtx_LABEL_REF (VOIDmode, qimode_label),
			       pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = qimode_label;

  /* Generate original signed/unsigned divmod.  */
  div = gen_divmod4_1 (operands[0], operands[1],
		       operands[2], operands[3]);
  emit_insn (div);

  /* Branch to the end.  */
  emit_jump_insn (gen_jump (end_label));
  emit_barrier ();

  /* Generate 8bit unsigned divide.  */
  emit_label (qimode_label);
  /* Don't use operands[0] for result of 8bit divide since not all
     registers support QImode ZERO_EXTRACT.  */
  tmp0 = lowpart_subreg (HImode, scratch, mode);
  tmp1 = lowpart_subreg (HImode, operands[2], mode);
  tmp2 = lowpart_subreg (QImode, operands[3], mode);
  emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));

  if (unsigned_p)
    {
      div = gen_rtx_UDIV (mode, operands[2], operands[3]);
      mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
    }
  else
    {
      div = gen_rtx_DIV (mode, operands[2], operands[3]);
      mod = gen_rtx_MOD (mode, operands[2], operands[3]);
    }
  if (GET_MODE (operands[0]) != SImode)
    div = gen_rtx_ZERO_EXTEND (DImode, div);
  if (GET_MODE (operands[1]) != SImode)
    mod = gen_rtx_ZERO_EXTEND (DImode, mod);

  /* Extract remainder from AH.  */
  tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]),
			       tmp0, GEN_INT (8), GEN_INT (8));
  if (REG_P (operands[1]))
    insn = emit_move_insn (operands[1], tmp1);
  else
    {
      /* Need a new scratch register since the old one has result
	 of 8bit divide.  */
      scratch = gen_reg_rtx (GET_MODE (operands[1]));
      emit_move_insn (scratch, tmp1);
      insn = emit_move_insn (operands[1], scratch);
    }
  set_unique_reg_note (insn, REG_EQUAL, mod);

  /* Zero extend quotient from AL.  */
  tmp1 = gen_lowpart (QImode, tmp0);
  insn = emit_insn (gen_zero_extend (operands[0], tmp1));
  set_unique_reg_note (insn, REG_EQUAL, div);

  emit_label (end_label);
}
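
/* Editor's note: an illustrative sketch, not part of the original
   sources.  For a 32-bit unsigned division the sequence emitted above
   is roughly
       mov    divisor, scratch
       or     dividend, scratch     # scratch = dividend | divisor
       test   $0xffffff00, scratch  # any bit above 7 set?
       je     .Lqimode
       ...full 32-bit div...        # slow path
       jmp    .Lend
   .Lqimode:
       ...8-bit divb...             # quotient in AL, remainder in AH
   .Lend:
   where the 8-bit divide is considerably cheaper on most CPUs.  */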
/* Emit x86 binary operand CODE in mode MODE, where the first operand
   matches destination.  RTX includes clobber of FLAGS_REG.  */

void
ix86_emit_binop (enum rtx_code code, machine_mode mode,
		 rtx dst, rtx src)
{
  rtx op, clob;

  op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
  clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));

  emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
}
/* Return true if regno1 def is nearest to the insn.  */

static bool
find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
{
  rtx_insn *prev = insn;
  rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));

  if (insn == start)
    return false;
  while (prev && prev != start)
    {
      if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
	{
	  prev = PREV_INSN (prev);
	  continue;
	}
      if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
	return true;
      else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
	return false;
      prev = PREV_INSN (prev);
    }

  /* None of the regs is defined in the bb.  */
  return false;
}
/* Split lea instructions into a sequence of instructions
   which are executed on ALU to avoid AGU stalls.
   It is assumed that it is allowed to clobber flags register
   at lea position.  */

void
ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
{
  unsigned int regno0, regno1, regno2;
  struct ix86_address parts;
  rtx target, tmp;
  int ok, adds;

  ok = ix86_decompose_address (operands[1], &parts);
  gcc_assert (ok);

  target = gen_lowpart (mode, operands[0]);

  regno0 = true_regnum (target);
  regno1 = INVALID_REGNUM;
  regno2 = INVALID_REGNUM;

  if (parts.base)
    {
      parts.base = gen_lowpart (mode, parts.base);
      regno1 = true_regnum (parts.base);
    }

  if (parts.index)
    {
      parts.index = gen_lowpart (mode, parts.index);
      regno2 = true_regnum (parts.index);
    }

  if (parts.disp)
    parts.disp = gen_lowpart (mode, parts.disp);

  if (parts.scale > 1)
    {
      /* Case r1 = r1 + ...  */
      if (regno1 == regno0)
	{
	  /* If we have a case r1 = r1 + C * r2 then we
	     should use multiplication which is very
	     expensive.  Assume cost model is wrong if we
	     have such case here.  */
	  gcc_assert (regno2 != regno0);

	  for (adds = parts.scale; adds > 0; adds--)
	    ix86_emit_binop (PLUS, mode, target, parts.index);
	}
      else
	{
	  /* r1 = r2 + r3 * C case.  Need to move r3 into r1.  */
	  if (regno0 != regno2)
	    emit_insn (gen_rtx_SET (target, parts.index));

	  /* Use shift for scaling.  */
	  ix86_emit_binop (ASHIFT, mode, target,
			   GEN_INT (exact_log2 (parts.scale)));

	  if (parts.base)
	    ix86_emit_binop (PLUS, mode, target, parts.base);

	  if (parts.disp && parts.disp != const0_rtx)
	    ix86_emit_binop (PLUS, mode, target, parts.disp);
	}
    }
  else if (!parts.base && !parts.index)
    {
      gcc_assert (parts.disp);
      emit_insn (gen_rtx_SET (target, parts.disp));
    }
  else
    {
      if (!parts.base)
	{
	  if (regno0 != regno2)
	    emit_insn (gen_rtx_SET (target, parts.index));
	}
      else if (!parts.index)
	{
	  if (regno0 != regno1)
	    emit_insn (gen_rtx_SET (target, parts.base));
	}
      else
	{
	  rtx tmp1;

	  if (regno0 == regno1)
	    tmp = parts.index;
	  else if (regno0 == regno2)
	    tmp = parts.base;
	  else
	    {
	      /* Find better operand for SET instruction, depending
		 on which definition is farther from the insn.  */
	      if (find_nearest_reg_def (insn, regno1, regno2))
		tmp = parts.index, tmp1 = parts.base;
	      else
		tmp = parts.base, tmp1 = parts.index;

	      emit_insn (gen_rtx_SET (target, tmp));

	      if (parts.disp && parts.disp != const0_rtx)
		ix86_emit_binop (PLUS, mode, target, parts.disp);

	      ix86_emit_binop (PLUS, mode, target, tmp1);
	      return;
	    }

	  ix86_emit_binop (PLUS, mode, target, tmp);
	}

      if (parts.disp && parts.disp != const0_rtx)
	ix86_emit_binop (PLUS, mode, target, parts.disp);
    }
}
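
/* Editor's note: an illustrative example, not part of the original
   sources.  On a tuning where AGU stalls matter, the split above
   turns "lea 4(%rbx,%rax,2), %rcx" into ALU instructions such as
       mov %rax, %rcx
       shl $1, %rcx
       add %rbx, %rcx
       add $4, %rcx
   trading one AGU operation for a short dependent ALU chain.  */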
/* Post-reload splitter for converting an SF or DFmode value in an
   SSE register into an unsigned SImode.  */

void
ix86_split_convert_uns_si_sse (rtx operands[])
{
  machine_mode vecmode;
  rtx value, large, zero_or_two31, input, two31, x;

  large = operands[1];
  zero_or_two31 = operands[2];
  input = operands[3];
  two31 = operands[4];
  vecmode = GET_MODE (large);
  value = gen_rtx_REG (vecmode, REGNO (operands[0]));

  /* Load up the value into the low element.  We must ensure that the other
     elements are valid floats -- zero is the easiest such value.  */
  if (MEM_P (input))
    {
      if (vecmode == V4SFmode)
	emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
      else
	emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
    }
  else
    {
      input = gen_rtx_REG (vecmode, REGNO (input));
      emit_move_insn (value, CONST0_RTX (vecmode));
      if (vecmode == V4SFmode)
	emit_insn (gen_sse_movss (value, value, input));
      else
	emit_insn (gen_sse2_movsd (value, value, input));
    }

  emit_move_insn (large, two31);
  emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);

  x = gen_rtx_fmt_ee (LE, vecmode, large, value);
  emit_insn (gen_rtx_SET (large, x));

  x = gen_rtx_AND (vecmode, zero_or_two31, large);
  emit_insn (gen_rtx_SET (zero_or_two31, x));

  x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
  emit_insn (gen_rtx_SET (value, x));

  large = gen_rtx_REG (V4SImode, REGNO (large));
  emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));

  x = gen_rtx_REG (V4SImode, REGNO (value));
  if (vecmode == V4SFmode)
    emit_insn (gen_fix_truncv4sfv4si2 (x, value));
  else
    emit_insn (gen_sse2_cvttpd2dq (x, value));
  value = x;

  emit_insn (gen_xorv4si3 (value, value, large));
}
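
/* Editor's note: a worked sketch of the math above, not part of the
   original sources.  For an input v in [0, 2**32):
     - if v < 2**31, nothing is subtracted and the plain signed
       truncating conversion is already correct;
     - if v >= 2**31, 2**31 is subtracted first, the signed conversion
       yields v - 2**31, and xoring with the 0x80000000 mask produced
       by shifting the comparison result left by 31 adds 2**31 back.  */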
static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
						 machine_mode mode, rtx target,
						 rtx var, int one_var);
/* Convert an unsigned DImode value into a DFmode, using only SSE.
   Expects the 64-bit DImode to be supplied in a pair of integral
   registers.  Requires SSE2; will use SSE3 if available.  For x86_32,
   -mfpmath=sse, !optimize_size only.  */

void
ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
  rtx int_xmm, fp_xmm;
  rtx biases, exponents;
  rtx x;

  int_xmm = gen_reg_rtx (V4SImode);
  if (TARGET_INTER_UNIT_MOVES_TO_VEC)
    emit_insn (gen_movdi_to_sse (int_xmm, input));
  else if (TARGET_SSE_SPLIT_REGS)
    {
      emit_clobber (int_xmm);
      emit_move_insn (gen_lowpart (DImode, int_xmm), input);
    }
  else
    {
      x = gen_reg_rtx (V2DImode);
      ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
      emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
    }

  x = gen_rtx_CONST_VECTOR (V4SImode,
			    gen_rtvec (4, GEN_INT (0x43300000UL),
				       GEN_INT (0x45300000UL),
				       const0_rtx, const0_rtx));
  exponents = validize_mem (force_const_mem (V4SImode, x));

  /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo }  */
  emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));

  /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
     yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
     Similarly (0x45300000UL ## fp_value_hi_xmm) yields
     (0x1.0p84 + double(fp_value_hi_xmm)).
     Note these exponents differ by 32.  */

  fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));

  /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
     in [0,2**32-1] and [0]+[2**32,2**64-1] respectively.  */
  real_ldexp (&bias_lo_rvt, &dconst1, 52);
  real_ldexp (&bias_hi_rvt, &dconst1, 84);
  biases = const_double_from_real_value (bias_lo_rvt, DFmode);
  x = const_double_from_real_value (bias_hi_rvt, DFmode);
  biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
  biases = validize_mem (force_const_mem (V2DFmode, biases));
  emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));

  /* Add the upper and lower DFmode values together.  */
  if (TARGET_SSE3)
    emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
  else
    {
      x = copy_to_mode_reg (V2DFmode, fp_xmm);
      emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
      emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
    }

  ix86_expand_vector_extract (false, target, fp_xmm, 0);
}
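
/* Editor's note: a worked example of the bias trick, not part of the
   original sources.  Take the unsigned 64-bit input x = hi * 2**32 + lo.
   Gluing the exponent word 0x43300000 onto lo forms the double
   0x1.0p52 + lo exactly (lo occupies the low 32 mantissa bits), and
   0x45300000 onto hi forms 0x1.0p84 + hi * 2**32.  Subtracting the two
   biases and summing the halves gives lo + hi * 2**32 == x, with
   rounding happening only in the final addition.  */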
/* Not used, but eases macroization of patterns.  */
void
ix86_expand_convert_uns_sixf_sse (rtx, rtx)
{
  gcc_unreachable ();
}
/* Convert an unsigned SImode value into a DFmode.  Only currently used
   for SSE, but applicable anywhere.  */

void
ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE TWO31r;
  rtx x, fp;

  x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
			   NULL, 1, OPTAB_DIRECT);

  fp = gen_reg_rtx (DFmode);
  emit_insn (gen_floatsidf2 (fp, x));

  real_ldexp (&TWO31r, &dconst1, 31);
  x = const_double_from_real_value (TWO31r, DFmode);

  x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
  if (x != target)
    emit_move_insn (target, x);
}
/* Convert a signed DImode value into a DFmode.  Only used for SSE in
   32-bit mode; otherwise we have a direct convert instruction.  */

void
ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE TWO32r;
  rtx fp_lo, fp_hi, x;

  fp_lo = gen_reg_rtx (DFmode);
  fp_hi = gen_reg_rtx (DFmode);

  emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));

  real_ldexp (&TWO32r, &dconst1, 32);
  x = const_double_from_real_value (TWO32r, DFmode);
  fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);

  ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));

  x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
			   0, OPTAB_DIRECT);
  if (x != target)
    emit_move_insn (target, x);
}
/* Convert an unsigned SImode value into a SFmode, using only SSE.
   For x86_32, -mfpmath=sse, !optimize_size only.  */

void
ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE ONE16r;
  rtx fp_hi, fp_lo, int_hi, int_lo, x;

  real_ldexp (&ONE16r, &dconst1, 16);
  x = const_double_from_real_value (ONE16r, SFmode);
  int_lo = expand_simple_binop (SImode, AND, input, GEN_INT (0xffff),
				NULL, 0, OPTAB_DIRECT);
  int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT (16),
				NULL, 0, OPTAB_DIRECT);
  fp_hi = gen_reg_rtx (SFmode);
  fp_lo = gen_reg_rtx (SFmode);
  emit_insn (gen_floatsisf2 (fp_hi, int_hi));
  emit_insn (gen_floatsisf2 (fp_lo, int_lo));
  fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
			       0, OPTAB_DIRECT);
  fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
			       0, OPTAB_DIRECT);
  if (!rtx_equal_p (target, fp_hi))
    emit_move_insn (target, fp_hi);
}
/* floatunsv{4,8}siv{4,8}sf2 expander.  Expand code to convert
   a vector of unsigned ints VAL to vector of floats TARGET.  */

void
ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
{
  rtx tmp[8];
  REAL_VALUE_TYPE TWO16r;
  machine_mode intmode = GET_MODE (val);
  machine_mode fltmode = GET_MODE (target);
  rtx (*cvt) (rtx, rtx);

  if (intmode == V4SImode)
    cvt = gen_floatv4siv4sf2;
  else
    cvt = gen_floatv8siv8sf2;
  tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
  tmp[0] = force_reg (intmode, tmp[0]);
  tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
				OPTAB_DIRECT);
  tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
				NULL_RTX, 1, OPTAB_DIRECT);
  tmp[3] = gen_reg_rtx (fltmode);
  emit_insn (cvt (tmp[3], tmp[1]));
  tmp[4] = gen_reg_rtx (fltmode);
  emit_insn (cvt (tmp[4], tmp[2]));
  real_ldexp (&TWO16r, &dconst1, 16);
  tmp[5] = const_double_from_real_value (TWO16r, SFmode);
  tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
  tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
				OPTAB_DIRECT);
  tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
				OPTAB_DIRECT);
  if (tmp[7] != target)
    emit_move_insn (target, tmp[7]);
}
/* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
   pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
   This is done by doing just signed conversion if < 0x1p31, and otherwise by
   subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards.  */

rtx
ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
{
  REAL_VALUE_TYPE TWO31r;
  rtx two31r, tmp[4];
  machine_mode mode = GET_MODE (val);
  machine_mode scalarmode = GET_MODE_INNER (mode);
  machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
  rtx (*cmp) (rtx, rtx, rtx, rtx);
  int i;

  for (i = 0; i < 3; i++)
    tmp[i] = gen_reg_rtx (mode);
  real_ldexp (&TWO31r, &dconst1, 31);
  two31r = const_double_from_real_value (TWO31r, scalarmode);
  two31r = ix86_build_const_vector (mode, 1, two31r);
  two31r = force_reg (mode, two31r);
  switch (mode)
    {
    case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
    case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
    case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
    case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
    default: gcc_unreachable ();
    }
  tmp[3] = gen_rtx_LE (mode, two31r, val);
  emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
  tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
				0, OPTAB_DIRECT);
  if (intmode == V4SImode || TARGET_AVX2)
    *xorp = expand_simple_binop (intmode, ASHIFT,
				 gen_lowpart (intmode, tmp[0]),
				 GEN_INT (31), NULL_RTX, 0,
				 OPTAB_DIRECT);
  else
    {
      rtx two31 = GEN_INT (HOST_WIDE_INT_1U << 31);
      two31 = ix86_build_const_vector (intmode, 1, two31);
      *xorp = expand_simple_binop (intmode, AND,
				   gen_lowpart (intmode, tmp[0]),
				   two31, NULL_RTX, 0,
				   OPTAB_DIRECT);
    }
  return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
			      0, OPTAB_DIRECT);
}
/* Generate code for floating point ABS or NEG.  */

void
ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
				rtx operands[])
{
  rtx set, dst, src;
  bool use_sse = false;
  bool vector_mode = VECTOR_MODE_P (mode);
  machine_mode vmode = mode;
  rtvec par;

  if (vector_mode)
    use_sse = true;
  else if (mode == TFmode)
    use_sse = true;
  else if (TARGET_SSE_MATH)
    {
      use_sse = SSE_FLOAT_MODE_P (mode);
      if (mode == SFmode)
	vmode = V4SFmode;
      else if (mode == DFmode)
	vmode = V2DFmode;
    }

  dst = operands[0];
  src = operands[1];

  set = gen_rtx_fmt_e (code, mode, src);
  set = gen_rtx_SET (dst, set);

  if (use_sse)
    {
      rtx mask, use, clob;

      /* NEG and ABS performed with SSE use bitwise mask operations.
	 Create the appropriate mask now.  */
      mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
      use = gen_rtx_USE (VOIDmode, mask);
      if (vector_mode)
	par = gen_rtvec (2, set, use);
      else
	{
	  clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
	  par = gen_rtvec (3, set, use, clob);
	}
    }
  else
    {
      rtx clob;

      /* Changing of sign for FP values is doable using integer unit too.  */
      clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      par = gen_rtvec (2, set, clob);
    }

  emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
}
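
/* Editor's note: an illustrative example, not part of the original
   sources.  For DFmode with SSE math, ABS ends up as an andpd with a
   mask clearing only the sign bit of each element, and NEG as an
   xorpd with a mask containing only the sign bits; the USE of the
   mask above keeps that constant alive until the splitters run.  */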
/* Deconstruct a floating point ABS or NEG operation
   with integer registers into integer operations.  */

void
ix86_split_fp_absneg_operator (enum rtx_code code, machine_mode mode,
			       rtx operands[])
{
  enum rtx_code absneg_op;
  rtx dst, set;

  gcc_assert (operands_match_p (operands[0], operands[1]));

  switch (mode)
    {
    case E_SFmode:
      dst = gen_lowpart (SImode, operands[0]);

      if (code == ABS)
	{
	  set = gen_int_mode (0x7fffffff, SImode);
	  absneg_op = AND;
	}
      else
	{
	  set = gen_int_mode (0x80000000, SImode);
	  absneg_op = XOR;
	}
      set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
      break;

    case E_DFmode:
      if (TARGET_64BIT)
	{
	  dst = gen_lowpart (DImode, operands[0]);
	  dst = gen_rtx_ZERO_EXTRACT (DImode, dst, const1_rtx, GEN_INT (63));

	  if (code == ABS)
	    set = const0_rtx;
	  else
	    set = gen_rtx_NOT (DImode, dst);
	}
      else
	{
	  dst = gen_highpart (SImode, operands[0]);

	  if (code == ABS)
	    {
	      set = gen_int_mode (0x7fffffff, SImode);
	      absneg_op = AND;
	    }
	  else
	    {
	      set = gen_int_mode (0x80000000, SImode);
	      absneg_op = XOR;
	    }
	  set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
	}
      break;

    case E_XFmode:
      dst = gen_rtx_REG (SImode,
			 REGNO (operands[0]) + (TARGET_64BIT ? 1 : 2));
      if (code == ABS)
	{
	  set = GEN_INT (0x7fff);
	  absneg_op = AND;
	}
      else
	{
	  set = GEN_INT (0x8000);
	  absneg_op = XOR;
	}
      set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
      break;

    default:
      gcc_unreachable ();
    }

  set = gen_rtx_SET (dst, set);

  rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
  rtvec par = gen_rtvec (2, set, clob);

  emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
}
/* Expand a copysign operation.  Special case operand 0 being a constant.  */

void
ix86_expand_copysign (rtx operands[])
{
  machine_mode mode, vmode;
  rtx dest, op0, op1, mask;

  dest = operands[0];
  op0 = operands[1];
  op1 = operands[2];

  mode = GET_MODE (dest);

  if (mode == SFmode)
    vmode = V4SFmode;
  else if (mode == DFmode)
    vmode = V2DFmode;
  else if (mode == TFmode)
    vmode = mode;
  else
    gcc_unreachable ();

  mask = ix86_build_signbit_mask (vmode, 0, 0);

  if (CONST_DOUBLE_P (op0))
    {
      if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
	op0 = simplify_unary_operation (ABS, mode, op0, mode);

      if (mode == SFmode || mode == DFmode)
	{
	  if (op0 == CONST0_RTX (mode))
	    op0 = CONST0_RTX (vmode);
	  else
	    {
	      rtx v = ix86_build_const_vector (vmode, false, op0);

	      op0 = force_reg (vmode, v);
	    }
	}
      else if (op0 != CONST0_RTX (mode))
	op0 = force_reg (mode, op0);

      emit_insn (gen_copysign3_const (mode, dest, op0, op1, mask));
    }
  else
    {
      rtx nmask = ix86_build_signbit_mask (vmode, 0, 1);

      emit_insn (gen_copysign3_var
		 (mode, dest, NULL_RTX, op0, op1, nmask, mask));
    }
}
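
/* Editor's note: an illustrative example, not part of the original
   sources.  Bitwise, copysign (x, y) is computed as
     (x & ~SIGNMASK) | (y & SIGNMASK)
   where SIGNMASK has only the sign bit set in each element; the
   splitters below realize exactly this and/andnot/or structure.  */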
/* Deconstruct a copysign operation into bit masks.  Operand 0 is known to
   be a constant, and so has already been expanded into a vector constant.  */

void
ix86_split_copysign_const (rtx operands[])
{
  machine_mode mode, vmode;
  rtx dest, op0, mask, x;

  dest = operands[0];
  op0 = operands[1];
  mask = operands[3];

  mode = GET_MODE (dest);
  vmode = GET_MODE (mask);

  dest = lowpart_subreg (vmode, dest, mode);
  x = gen_rtx_AND (vmode, dest, mask);
  emit_insn (gen_rtx_SET (dest, x));

  if (op0 != CONST0_RTX (vmode))
    {
      x = gen_rtx_IOR (vmode, dest, op0);
      emit_insn (gen_rtx_SET (dest, x));
    }
}
/* Deconstruct a copysign operation into bit masks.  Operand 0 is variable,
   so we have to do two masks.  */

void
ix86_split_copysign_var (rtx operands[])
{
  machine_mode mode, vmode;
  rtx dest, scratch, op0, op1, mask, nmask, x;

  dest = operands[0];
  scratch = operands[1];
  op0 = operands[2];
  op1 = operands[3];
  nmask = operands[4];
  mask = operands[5];

  mode = GET_MODE (dest);
  vmode = GET_MODE (mask);

  if (rtx_equal_p (op0, op1))
    {
      /* Shouldn't happen often (it's useless, obviously), but when it does
	 we'd generate incorrect code if we continue below.  */
      emit_move_insn (dest, op0);
      return;
    }

  if (REG_P (mask) && REGNO (dest) == REGNO (mask))	/* alternative 0 */
    {
      gcc_assert (REGNO (op1) == REGNO (scratch));

      x = gen_rtx_AND (vmode, scratch, mask);
      emit_insn (gen_rtx_SET (scratch, x));

      dest = mask;
      op0 = lowpart_subreg (vmode, op0, mode);
      x = gen_rtx_NOT (vmode, dest);
      x = gen_rtx_AND (vmode, x, op0);
      emit_insn (gen_rtx_SET (dest, x));
    }
  else
    {
      if (REGNO (op1) == REGNO (scratch))		/* alternative 1,3 */
	{
	  x = gen_rtx_AND (vmode, scratch, mask);
	}
      else						/* alternative 2,4 */
	{
	  gcc_assert (REGNO (mask) == REGNO (scratch));
	  op1 = lowpart_subreg (vmode, op1, mode);
	  x = gen_rtx_AND (vmode, scratch, op1);
	}
      emit_insn (gen_rtx_SET (scratch, x));

      if (REGNO (op0) == REGNO (dest))			/* alternative 1,2 */
	{
	  dest = lowpart_subreg (vmode, op0, mode);
	  x = gen_rtx_AND (vmode, dest, nmask);
	}
      else						/* alternative 3,4 */
	{
	  gcc_assert (REGNO (nmask) == REGNO (dest));
	  dest = nmask;
	  op0 = lowpart_subreg (vmode, op0, mode);
	  x = gen_rtx_AND (vmode, dest, op0);
	}
      emit_insn (gen_rtx_SET (dest, x));
    }

  x = gen_rtx_IOR (vmode, dest, scratch);
  emit_insn (gen_rtx_SET (dest, x));
}
/* Expand an xorsign operation.  */

void
ix86_expand_xorsign (rtx operands[])
{
  machine_mode mode, vmode;
  rtx dest, op0, op1, mask;

  dest = operands[0];
  op0 = operands[1];
  op1 = operands[2];

  mode = GET_MODE (dest);

  if (mode == SFmode)
    vmode = V4SFmode;
  else if (mode == DFmode)
    vmode = V2DFmode;
  else
    gcc_unreachable ();

  mask = ix86_build_signbit_mask (vmode, 0, 0);

  emit_insn (gen_xorsign3_1 (mode, dest, op0, op1, mask));
}
/* Deconstruct an xorsign operation into bit masks.  */

void
ix86_split_xorsign (rtx operands[])
{
  machine_mode mode, vmode;
  rtx dest, op0, mask, x;

  dest = operands[0];
  op0 = operands[1];
  mask = operands[3];

  mode = GET_MODE (dest);
  vmode = GET_MODE (mask);

  dest = lowpart_subreg (vmode, dest, mode);
  x = gen_rtx_AND (vmode, dest, mask);
  emit_insn (gen_rtx_SET (dest, x));

  op0 = lowpart_subreg (vmode, op0, mode);
  x = gen_rtx_XOR (vmode, dest, op0);
  emit_insn (gen_rtx_SET (dest, x));
}
static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1);
void
ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
{
  machine_mode mode = GET_MODE (op0);
  rtx tmp;

  /* Handle special case - vector comparison with boolean result, transform
     it using ptest instruction.  */
  if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
    {
      rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
      machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;

      gcc_assert (code == EQ || code == NE);
      /* Generate XOR since we can't check that one operand is zero vector.  */
      tmp = gen_reg_rtx (mode);
      emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
      tmp = gen_lowpart (p_mode, tmp);
      emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
			      gen_rtx_UNSPEC (CCmode,
					      gen_rtvec (2, tmp, tmp),
					      UNSPEC_PTEST)));
      tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
				  gen_rtx_LABEL_REF (VOIDmode, label),
				  pc_rtx);
      emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      return;
    }

  switch (mode)
    {
    case E_SFmode:
    case E_DFmode:
    case E_XFmode:
    case E_QImode:
    case E_HImode:
    case E_SImode:
      simple:
      tmp = ix86_expand_compare (code, op0, op1);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
				  gen_rtx_LABEL_REF (VOIDmode, label),
				  pc_rtx);
      emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      return;

    case E_DImode:
      if (TARGET_64BIT)
	goto simple;
      /* For 32-bit target DI comparison may be performed on
	 SSE registers.  To allow this we should avoid split
	 to SI mode which is achieved by doing xor in DI mode
	 and then comparing with zero (which is recognized by
	 STV pass).  We don't compare using xor when optimizing
	 for size.  */
      if (!optimize_insn_for_size_p ()
	  && TARGET_STV
	  && (code == EQ || code == NE))
	{
	  op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
	  op1 = const0_rtx;
	}
      /* FALLTHRU */
    case E_TImode:
      /* Expand DImode branch into multiple compare+branch.  */
      {
	rtx lo[2], hi[2];
	rtx_code_label *label2;
	enum rtx_code code1, code2, code3;
	machine_mode submode;

	if (CONSTANT_P (op0) && !CONSTANT_P (op1))
	  {
	    std::swap (op0, op1);
	    code = swap_condition (code);
	  }

	split_double_mode (mode, &op0, 1, lo+0, hi+0);
	split_double_mode (mode, &op1, 1, lo+1, hi+1);

	submode = mode == DImode ? SImode : DImode;

	/* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
	   avoid two branches.  This costs one extra insn, so disable when
	   optimizing for size.  */

	if ((code == EQ || code == NE)
	    && (!optimize_insn_for_size_p ()
		|| hi[1] == const0_rtx || lo[1] == const0_rtx))
	  {
	    rtx xor0, xor1;

	    xor1 = hi[0];
	    if (hi[1] != const0_rtx)
	      xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
				   NULL_RTX, 0, OPTAB_WIDEN);

	    xor0 = lo[0];
	    if (lo[1] != const0_rtx)
	      xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
				   NULL_RTX, 0, OPTAB_WIDEN);

	    tmp = expand_binop (submode, ior_optab, xor1, xor0,
				NULL_RTX, 0, OPTAB_WIDEN);

	    ix86_expand_branch (code, tmp, const0_rtx, label);
	    return;
	  }

	/* Otherwise, if we are doing less-than or greater-or-equal-than,
	   op1 is a constant and the low word is zero, then we can just
	   examine the high word.  Similarly for low word -1 and
	   less-or-equal-than or greater-than.  */

	if (CONST_INT_P (hi[1]))
	  switch (code)
	    {
	    case LT: case LTU: case GE: case GEU:
	      if (lo[1] == const0_rtx)
		{
		  ix86_expand_branch (code, hi[0], hi[1], label);
		  return;
		}
	      break;
	    case LE: case LEU: case GT: case GTU:
	      if (lo[1] == constm1_rtx)
		{
		  ix86_expand_branch (code, hi[0], hi[1], label);
		  return;
		}
	      break;
	    default:
	      break;
	    }

	/* Emulate comparisons that do not depend on Zero flag with
	   double-word subtraction.  Note that only Overflow, Sign
	   and Carry flags are valid, so swap arguments and condition
	   of comparisons that would otherwise test Zero flag.  */

	switch (code)
	  {
	  case LE: case LEU: case GT: case GTU:
	    std::swap (lo[0], lo[1]);
	    std::swap (hi[0], hi[1]);
	    code = swap_condition (code);
	    /* FALLTHRU */

	  case LT: case LTU: case GE: case GEU:
	    {
	      bool uns = (code == LTU || code == GEU);
	      rtx (*sbb_insn) (machine_mode, rtx, rtx, rtx)
		= uns ? gen_sub3_carry_ccc : gen_sub3_carry_ccgz;

	      if (!nonimmediate_operand (lo[0], submode))
		lo[0] = force_reg (submode, lo[0]);
	      if (!x86_64_general_operand (lo[1], submode))
		lo[1] = force_reg (submode, lo[1]);

	      if (!register_operand (hi[0], submode))
		hi[0] = force_reg (submode, hi[0]);
	      if ((uns && !nonimmediate_operand (hi[1], submode))
		  || (!uns && !x86_64_general_operand (hi[1], submode)))
		hi[1] = force_reg (submode, hi[1]);

	      emit_insn (gen_cmp_1 (submode, lo[0], lo[1]));

	      tmp = gen_rtx_SCRATCH (submode);
	      emit_insn (sbb_insn (submode, tmp, hi[0], hi[1]));

	      tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
	      ix86_expand_branch (code, tmp, const0_rtx, label);
	      return;
	    }

	  default:
	    break;
	  }

	/* Otherwise, we need two or three jumps.  */

	label2 = gen_label_rtx ();

	code1 = code;
	code2 = swap_condition (code);
	code3 = unsigned_condition (code);

	switch (code)
	  {
	  case LT: case GT: case LTU: case GTU:
	    break;

	  case LE:   code1 = LT;  code2 = GT;  break;
	  case GE:   code1 = GT;  code2 = LT;  break;
	  case LEU:  code1 = LTU; code2 = GTU; break;
	  case GEU:  code1 = GTU; code2 = LTU; break;

	  case EQ:   code1 = UNKNOWN; code2 = NE;  break;
	  case NE:   code2 = UNKNOWN; break;

	  default:
	    gcc_unreachable ();
	  }

	/*
	 * a < b =>
	 *    if (hi(a) < hi(b)) goto true;
	 *    if (hi(a) > hi(b)) goto false;
	 *    if (lo(a) < lo(b)) goto true;
	 *  false:
	 */

	if (code1 != UNKNOWN)
	  ix86_expand_branch (code1, hi[0], hi[1], label);
	if (code2 != UNKNOWN)
	  ix86_expand_branch (code2, hi[0], hi[1], label2);

	ix86_expand_branch (code3, lo[0], lo[1], label);

	if (code2 != UNKNOWN)
	  emit_label (label2);
	return;
      }

    default:
      gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
      goto simple;
    }
}
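
/* Editor's note: an illustrative example, not part of the original
   sources.  On a 32-bit target, "if (a < b)" for signed 64-bit a and b
   takes the double-word subtraction path above and becomes roughly
       cmp lo(b), lo(a)     # sets carry from the low-word subtract
       sbb hi(b), hi(a)     # flags now reflect the full 64-bit compare
       jl  label
   relying only on the Overflow, Sign and Carry flags, never ZF.  */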
/* Figure out whether to use unordered fp comparisons.  */

static bool
ix86_unordered_fp_compare (enum rtx_code code)
{
  if (!TARGET_IEEE_FP)
    return false;

  switch (code)
    {
    /* GT, GE, LT and LE can use an ordered compare.  */
    case GT:
    case GE:
    case LT:
    case LE:
      return false;

    /* Everything else (EQ, NE, the UN* codes, LTGT, ORDERED and
       UNORDERED) must not trap on quiet NaNs, so it needs an
       unordered compare.  */
    default:
      return true;
    }
}
/* Return a comparison we can do and that it is equivalent to
   swap_condition (code) apart possibly from orderedness.
   But, never change orderedness if TARGET_IEEE_FP, returning
   UNKNOWN in that case if necessary.  */

static enum rtx_code
ix86_fp_swap_condition (enum rtx_code code)
{
  switch (code)
    {
    case GT:			/* GTU - CF=0 & ZF=0 */
      return TARGET_IEEE_FP ? UNKNOWN : UNLT;
    case GE:			/* GEU - CF=0 */
      return TARGET_IEEE_FP ? UNKNOWN : UNLE;
    case UNLT:			/* LTU - CF=1 */
      return TARGET_IEEE_FP ? UNKNOWN : GT;
    case UNLE:			/* LEU - CF=1 | ZF=1 */
      return TARGET_IEEE_FP ? UNKNOWN : GE;
    default:
      return swap_condition (code);
    }
}
/* Return cost of comparison CODE using the best strategy for performance.
   All following functions use the number of instructions as the cost
   metric.  In future this should be tweaked to compute bytes for
   optimize_size and take into account performance of various instructions
   on various CPUs.  */

static int
ix86_fp_comparison_cost (enum rtx_code code)
{
  int arith_cost;

  /* The cost of code using bit-twiddling on %ah.  */
  switch (code)
    {
    case UNLE:
    case UNLT:
    case LTGT:
    case GT:
    case GE:
    case UNORDERED:
    case ORDERED:
    case UNEQ:
      arith_cost = 4;
      break;
    case LT:
    case NE:
    case EQ:
    case UNGE:
      arith_cost = TARGET_IEEE_FP ? 5 : 4;
      break;
    case LE:
    case UNGT:
      arith_cost = TARGET_IEEE_FP ? 6 : 4;
      break;
    default:
      gcc_unreachable ();
    }

  switch (ix86_fp_comparison_strategy (code))
    {
    case IX86_FPCMP_COMI:
      return arith_cost > 4 ? 3 : 2;
    case IX86_FPCMP_SAHF:
      return arith_cost > 4 ? 4 : 3;
    default:
      return arith_cost;
    }
}
/* Swap, force into registers, or otherwise massage the two operands
   to a fp comparison.  The operands are updated in place; the new
   comparison code is returned.  */

static enum rtx_code
ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
{
  bool unordered_compare = ix86_unordered_fp_compare (code);
  rtx op0 = *pop0, op1 = *pop1;
  machine_mode op_mode = GET_MODE (op0);
  bool is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);

  /* All of the unordered compare instructions only work on registers.
     The same is true of the fcomi compare instructions.  The XFmode
     compare instructions require registers except when comparing
     against zero or when converting operand 1 from fixed point to
     floating point.  */

  if (!is_sse
      && (unordered_compare
	  || (op_mode == XFmode
	      && ! (standard_80387_constant_p (op0) == 1
		    || standard_80387_constant_p (op1) == 1)
	      && GET_CODE (op1) != FLOAT)
	  || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
    {
      op0 = force_reg (op_mode, op0);
      op1 = force_reg (op_mode, op1);
    }
  else
    {
      /* %%% We only allow op1 in memory; op0 must be st(0).  So swap
	 things around if they appear profitable, otherwise force op0
	 into a register.  */

      if (standard_80387_constant_p (op0) == 0
	  || (MEM_P (op0)
	      && ! (standard_80387_constant_p (op1) == 0
		    || MEM_P (op1))))
	{
	  enum rtx_code new_code = ix86_fp_swap_condition (code);
	  if (new_code != UNKNOWN)
	    {
	      std::swap (op0, op1);
	      code = new_code;
	    }
	}

      if (!REG_P (op0))
	op0 = force_reg (op_mode, op0);

      if (CONSTANT_P (op1))
	{
	  int tmp = standard_80387_constant_p (op1);
	  if (tmp == 0)
	    op1 = validize_mem (force_const_mem (op_mode, op1));
	  else if (tmp == 1)
	    {
	      if (TARGET_CMOVE)
		op1 = force_reg (op_mode, op1);
	    }
	  else
	    op1 = force_reg (op_mode, op1);
	}
    }

  /* Try to rearrange the comparison to make it cheaper.  */
  if (ix86_fp_comparison_cost (code)
      > ix86_fp_comparison_cost (swap_condition (code))
      && (REG_P (op1) || can_create_pseudo_p ()))
    {
      std::swap (op0, op1);
      code = swap_condition (code);
      if (!REG_P (op0))
	op0 = force_reg (op_mode, op0);
    }

  *pop0 = op0;
  *pop1 = op1;
  return code;
}
/* Generate insn patterns to do a floating point compare of OPERANDS.  */

static rtx
ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)
{
  bool unordered_compare = ix86_unordered_fp_compare (code);
  machine_mode cmp_mode;
  rtx tmp, scratch;

  code = ix86_prepare_fp_compare_args (code, &op0, &op1);

  tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
  if (unordered_compare)
    tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);

  /* Do fcomi/sahf based test when profitable.  */
  switch (ix86_fp_comparison_strategy (code))
    {
    case IX86_FPCMP_COMI:
      cmp_mode = CCFPmode;
      emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
      break;

    case IX86_FPCMP_SAHF:
      cmp_mode = CCFPmode;
      tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
      scratch = gen_reg_rtx (HImode);
      emit_insn (gen_rtx_SET (scratch, tmp));
      emit_insn (gen_x86_sahf_1 (scratch));
      break;

    case IX86_FPCMP_ARITH:
      cmp_mode = CCNOmode;
      tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
      scratch = gen_reg_rtx (HImode);
      emit_insn (gen_rtx_SET (scratch, tmp));

      /* In the unordered case, we have to check C2 for NaN's, which
	 doesn't happen to work out to anything nice combination-wise.
	 So do some bit twiddling on the value we've got in AH to come
	 up with an appropriate set of condition codes.  */
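
      /* For reference: after fnstsw the x87 condition flags sit in %ah
	 as C0 = 0x01, C2 = 0x04 and C3 = 0x40, so e.g. the 0x45 masks
	 below test C0|C2|C3 at once (informal note).  */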
      switch (code)
	{
	case GT:
	case UNGT:
	  if (code == GT || !TARGET_IEEE_FP)
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
	      code = EQ;
	    }
	  else
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
	      cmp_mode = CCmode;
	      code = GEU;
	    }
	  break;
	case LT:
	case UNLT:
	  if (code == LT && TARGET_IEEE_FP)
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
	      cmp_mode = CCmode;
	      code = EQ;
	    }
	  else
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
	      code = NE;
	    }
	  break;
	case GE:
	case UNGE:
	  if (code == GE || !TARGET_IEEE_FP)
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
	      code = EQ;
	    }
	  else
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
	      code = NE;
	    }
	  break;
	case LE:
	case UNLE:
	  if (code == LE && TARGET_IEEE_FP)
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
	      cmp_mode = CCmode;
	      code = LTU;
	    }
	  else
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
	      code = NE;
	    }
	  break;
	case EQ:
	case UNEQ:
	  if (code == EQ && TARGET_IEEE_FP)
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
	      cmp_mode = CCmode;
	      code = EQ;
	    }
	  else
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
	      code = NE;
	    }
	  break;
	case NE:
	case LTGT:
	  if (code == NE && TARGET_IEEE_FP)
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
					     GEN_INT (0x40)));
	      code = NE;
	    }
	  else
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
	      code = EQ;
	    }
	  break;

	case UNORDERED:
	  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
	  code = NE;
	  break;
	case ORDERED:
	  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
	  code = EQ;
	  break;

	default:
	  gcc_unreachable ();
	}
      break;

    default:
      gcc_unreachable ();
    }

  /* Return the test that should be put into the flags user, i.e.
     the bcc, scc, or cmov instruction.  */
  return gen_rtx_fmt_ee (code, VOIDmode,
			 gen_rtx_REG (cmp_mode, FLAGS_REG),
			 const0_rtx);
}
/* Generate insn patterns to do an integer compare of OPERANDS.  */

static rtx
ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
{
  machine_mode cmpmode;
  rtx tmp, flags;

  cmpmode = SELECT_CC_MODE (code, op0, op1);
  flags = gen_rtx_REG (cmpmode, FLAGS_REG);

  /* This is very simple, but making the interface the same as in the
     FP case makes the rest of the code easier.  */
  tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
  emit_insn (gen_rtx_SET (flags, tmp));

  /* Return the test that should be put into the flags user, i.e.
     the bcc, scc, or cmov instruction.  */
  return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
}
rtx
ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
{
  rtx ret;

  if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
    ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);

  else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
    {
      gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
      ret = ix86_expand_fp_compare (code, op0, op1);
    }
  else
    ret = ix86_expand_int_compare (code, op0, op1);

  return ret;
}
void
ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
{
  rtx ret;

  gcc_assert (GET_MODE (dest) == QImode);

  ret = ix86_expand_compare (code, op0, op1);
  PUT_MODE (ret, QImode);
  emit_insn (gen_rtx_SET (dest, ret));
}
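
/* As an illustration, "dest = (a < b)" for signed integers ends up as
   roughly "cmpl b, a ; setl dest" once a QImode store-flag pattern
   matches the flags test built above (informal sketch, not the literal
   RTL emitted here).  */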
/* Expand comparison setting or clearing carry flag.  Return true when
   successful and set pop for the operation.  */
static bool
ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
{
  machine_mode mode
    = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);

  /* Do not handle double-mode compares that go through special path.  */
  if (mode == (TARGET_64BIT ? TImode : DImode))
    return false;

  if (SCALAR_FLOAT_MODE_P (mode))
    {
      rtx compare_op;
      rtx_insn *compare_seq;

      gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));

      /* Shortcut:  following common codes never translate
	 into carry flag compares.  */
      if (code == EQ || code == NE || code == UNEQ || code == LTGT
	  || code == ORDERED || code == UNORDERED)
	return false;

      /* These comparisons require zero flag; swap operands so they won't.  */
      if ((code == GT || code == UNLE || code == LE || code == UNGT)
	  && !TARGET_IEEE_FP)
	{
	  std::swap (op0, op1);
	  code = swap_condition (code);
	}

      /* Try to expand the comparison and verify that we end up with a
	 carry flag based comparison.  This fails to be true only when
	 we decide to expand the comparison using arithmetic, which is
	 not a too common scenario.  */
      start_sequence ();
      compare_op = ix86_expand_fp_compare (code, op0, op1);
      compare_seq = get_insns ();
      end_sequence ();

      if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
	code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
      else
	code = GET_CODE (compare_op);

      if (code != LTU && code != GEU)
	return false;

      emit_insn (compare_seq);
      *pop = compare_op;
      return true;
    }

  if (!INTEGRAL_MODE_P (mode))
    return false;

  switch (code)
    {
    case LTU:
    case GEU:
      break;

    /* Convert a==0 into (unsigned)a<1.  */
    case EQ:
    case NE:
      if (op1 != const0_rtx)
	return false;
      op1 = const1_rtx;
      code = (code == EQ ? LTU : GEU);
      break;

    /* Convert a>b into b<a or a>=b-1.  */
    case GTU:
    case LEU:
      if (CONST_INT_P (op1))
	{
	  op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
	  /* Bail out on overflow.  We still can swap operands but that
	     would force loading of the constant into register.  */
	  if (op1 == const0_rtx
	      || !x86_64_immediate_operand (op1, GET_MODE (op1)))
	    return false;
	  code = (code == GTU ? GEU : LTU);
	}
      else
	{
	  std::swap (op0, op1);
	  code = (code == GTU ? LTU : GEU);
	}
      break;

    /* Convert a>=0 into (unsigned)a<0x80000000.  */
    case LT:
    case GE:
      if (mode == DImode || op1 != const0_rtx)
	return false;
      op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
      code = (code == LT ? GEU : LTU);
      break;
    case LE:
    case GT:
      if (mode == DImode || op1 != constm1_rtx)
	return false;
      op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
      code = (code == LE ? GEU : LTU);
      break;

    default:
      return false;
    }
  /* Swapping operands may cause constant to appear as first operand.  */
  if (!nonimmediate_operand (op0, VOIDmode))
    {
      if (!can_create_pseudo_p ())
	return false;
      op0 = force_reg (mode, op0);
    }
  *pop = ix86_expand_compare (code, op0, op1);
  gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
  return true;
}
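
/* Informally: the rewrites above turn e.g. "x == 0" into the carry-style
   "(unsigned) x < 1", and an unsigned "x > 5" into "x >= 6", so that a
   plain CMP leaves the whole answer in the carry flag for sbb/adc to
   consume (illustrative summary).  */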
/* Expand conditional increment or decrement using adc/sbb instructions.
   The default case using setcc followed by the conditional move can be
   done by generic code.  */
bool
ix86_expand_int_addcc (rtx operands[])
{
  enum rtx_code code = GET_CODE (operands[1]);
  rtx flags;
  rtx (*insn) (machine_mode, rtx, rtx, rtx, rtx, rtx);
  rtx compare_op;
  rtx val = const0_rtx;
  bool fpcmp = false;
  machine_mode mode;
  rtx op0 = XEXP (operands[1], 0);
  rtx op1 = XEXP (operands[1], 1);

  if (operands[3] != const1_rtx
      && operands[3] != constm1_rtx)
    return false;
  if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
    return false;
  code = GET_CODE (compare_op);

  flags = XEXP (compare_op, 0);

  if (GET_MODE (flags) == CCFPmode)
    {
      fpcmp = true;
      code = ix86_fp_compare_code_to_integer (code);
    }

  if (code != LTU)
    {
      val = constm1_rtx;
      if (fpcmp)
	PUT_CODE (compare_op,
		  reverse_condition_maybe_unordered
		    (GET_CODE (compare_op)));
      else
	PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
    }

  mode = GET_MODE (operands[0]);

  /* Construct either adc or sbb insn.  */
  if ((code == LTU) == (operands[3] == constm1_rtx))
    insn = gen_sub3_carry;
  else
    insn = gen_add3_carry;

  emit_insn (insn (mode, operands[0], operands[2], val, flags, compare_op));

  return true;
}
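
/* E.g. "x = a + (b < c)" with an unsigned comparison can come out as
   roughly "cmpl c, b ; adcl $0, x" -- the compare computes the carry
   and the adc folds it into the addition (illustrative only).  */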
bool
ix86_expand_int_movcc (rtx operands[])
{
  enum rtx_code code = GET_CODE (operands[1]), compare_code;
  rtx_insn *compare_seq;
  rtx compare_op;
  machine_mode mode = GET_MODE (operands[0]);
  bool sign_bit_compare_p = false;
  rtx op0 = XEXP (operands[1], 0);
  rtx op1 = XEXP (operands[1], 1);

  if (GET_MODE (op0) == TImode
      || (GET_MODE (op0) == DImode
	  && !TARGET_64BIT))
    return false;

  start_sequence ();
  compare_op = ix86_expand_compare (code, op0, op1);
  compare_seq = get_insns ();
  end_sequence ();

  compare_code = GET_CODE (compare_op);

  if ((op1 == const0_rtx && (code == GE || code == LT))
      || (op1 == constm1_rtx && (code == GT || code == LE)))
    sign_bit_compare_p = true;

  /* Don't attempt mode expansion here -- if we had to expand 5 or 6
     HImode insns, we'd be swallowed in word prefix ops.  */

  if ((mode != HImode || TARGET_FAST_PREFIX)
      && (mode != (TARGET_64BIT ? TImode : DImode))
      && CONST_INT_P (operands[2])
      && CONST_INT_P (operands[3]))
    {
      rtx out = operands[0];
      HOST_WIDE_INT ct = INTVAL (operands[2]);
      HOST_WIDE_INT cf = INTVAL (operands[3]);
      HOST_WIDE_INT diff;

      diff = ct - cf;
      /* Sign bit compares are better done using shifts than by using
	 sbb.  */
      if (sign_bit_compare_p
	  || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
	{
	  /* Detect overlap between destination and compare sources.  */
	  rtx tmp = out;

	  if (!sign_bit_compare_p)
	    {
	      rtx flags;
	      bool fpcmp = false;

	      compare_code = GET_CODE (compare_op);

	      flags = XEXP (compare_op, 0);

	      if (GET_MODE (flags) == CCFPmode)
		{
		  fpcmp = true;
		  compare_code
		    = ix86_fp_compare_code_to_integer (compare_code);
		}

	      /* To simplify rest of code, restrict to the GEU case.  */
	      if (compare_code == LTU)
		{
		  std::swap (ct, cf);
		  compare_code = reverse_condition (compare_code);
		  code = reverse_condition (code);
		}
	      else
		{
		  if (fpcmp)
		    PUT_CODE (compare_op,
			      reverse_condition_maybe_unordered
			        (GET_CODE (compare_op)));
		  else
		    PUT_CODE (compare_op,
			      reverse_condition (GET_CODE (compare_op)));
		}
	      diff = ct - cf;

	      if (reg_overlap_mentioned_p (out, op0)
		  || reg_overlap_mentioned_p (out, op1))
		tmp = gen_reg_rtx (mode);

	      if (mode == DImode)
		emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
	      else
		emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
						 flags, compare_op));
	    }
	  else
	    {
	      if (code == GT || code == GE)
		code = reverse_condition (code);
	      else
		{
		  std::swap (ct, cf);
		  diff = ct - cf;
		}
	      tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
	    }

	  if (diff == 1)
	    {
	      /*
	       * cmpl op0,op1
	       * sbbl dest,dest
	       * [addl dest, ct]
	       *
	       * Size 5 - 8.
	       */
	      if (ct)
		tmp = expand_simple_binop (mode, PLUS,
					   tmp, GEN_INT (ct),
					   copy_rtx (tmp), 1, OPTAB_DIRECT);
	    }
	  else if (cf == -1)
	    {
	      /*
	       * cmpl op0,op1
	       * sbbl dest,dest
	       * orl $ct, dest
	       *
	       * Size 8.
	       */
	      tmp = expand_simple_binop (mode, IOR,
					 tmp, GEN_INT (ct),
					 copy_rtx (tmp), 1, OPTAB_DIRECT);
	    }
	  else if (diff == -1 && ct)
	    {
	      /*
	       * cmpl op0,op1
	       * sbbl dest,dest
	       * notl dest
	       * [addl dest, cf]
	       *
	       * Size 8 - 11.
	       */
	      tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
	      if (cf)
		tmp = expand_simple_binop (mode, PLUS,
					   copy_rtx (tmp), GEN_INT (cf),
					   copy_rtx (tmp), 1, OPTAB_DIRECT);
	    }
	  else
	    {
	      /*
	       * cmpl op0,op1
	       * sbbl dest,dest
	       * [notl dest]
	       * andl cf - ct, dest
	       * [addl dest, ct]
	       *
	       * Size 8 - 11.
	       */

	      if (cf == 0)
		{
		  cf = ct;
		  ct = 0;
		  tmp = expand_simple_unop (mode, NOT, tmp,
					    copy_rtx (tmp), 1);
		}

	      tmp = expand_simple_binop (mode, AND,
					 copy_rtx (tmp),
					 gen_int_mode (cf - ct, mode),
					 copy_rtx (tmp), 1, OPTAB_DIRECT);
	      if (ct)
		tmp = expand_simple_binop (mode, PLUS,
					   copy_rtx (tmp), GEN_INT (ct),
					   copy_rtx (tmp), 1, OPTAB_DIRECT);
	    }

	  if (!rtx_equal_p (tmp, out))
	    emit_move_insn (copy_rtx (out), copy_rtx (tmp));

	  return true;
	}

      if (diff < 0)
	{
	  machine_mode cmp_mode = GET_MODE (op0);
	  enum rtx_code new_code;

	  if (SCALAR_FLOAT_MODE_P (cmp_mode))
	    {
	      gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));

	      /* We may be reversing unordered compare to normal compare, that
		 is not valid in general (we may convert non-trapping condition
		 to trapping one), however on i386 we currently emit all
		 comparisons unordered.  */
	      new_code = reverse_condition_maybe_unordered (code);
	    }
	  else
	    new_code = ix86_reverse_condition (code, cmp_mode);
	  if (new_code != UNKNOWN)
	    {
	      std::swap (ct, cf);
	      diff = -diff;
	      code = new_code;
	    }
	}

      compare_code = UNKNOWN;
      if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
	  && CONST_INT_P (op1))
	{
	  if (op1 == const0_rtx
	      && (code == LT || code == GE))
	    compare_code = code;
	  else if (op1 == constm1_rtx)
	    {
	      if (code == LE)
		compare_code = LT;
	      else if (code == GT)
		compare_code = GE;
	    }
	}

      /* Optimize dest = (op0 < 0) ? -1 : cf.  */
      if (compare_code != UNKNOWN
	  && GET_MODE (op0) == GET_MODE (out)
	  && (cf == -1 || ct == -1))
	{
	  /* If lea code below could be used, only optimize
	     if it results in a 2 insn sequence.  */

	  if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
		 || diff == 3 || diff == 5 || diff == 9)
	      || (compare_code == LT && ct == -1)
	      || (compare_code == GE && cf == -1))
	    {
	      /*
	       * notl op1	(if necessary)
	       * sarl $31, op1
	       * orl cf, op1
	       */
	      if (ct != -1)
		{
		  cf = ct;
		  ct = -1;
		  code = reverse_condition (code);
		}

	      out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);

	      out = expand_simple_binop (mode, IOR,
					 out, GEN_INT (cf),
					 out, 1, OPTAB_DIRECT);
	      if (out != operands[0])
		emit_move_insn (operands[0], out);

	      return true;
	    }
	}

      if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
	   || diff == 3 || diff == 5 || diff == 9)
	  && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
	  && (mode != DImode
	      || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
	{
	  /*
	   * xorl dest,dest
	   * cmpl op1,op2
	   * setcc dest
	   * lea cf(dest*(ct-cf)),dest
	   *
	   * Size 14.
	   *
	   * This also catches the degenerate setcc-only case.
	   */

	  rtx tmp;
	  int nops;

	  out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);

	  nops = 0;
	  /* On x86_64 the lea instruction operates on Pmode, so we need
	     to get arithmetics done in proper mode to match.  */
	  if (diff == 1)
	    tmp = copy_rtx (out);
	  else
	    {
	      rtx out1;
	      out1 = copy_rtx (out);
	      tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
	      nops++;
	      if (diff & 1)
		{
		  tmp = gen_rtx_PLUS (mode, tmp, out1);
		  nops++;
		}
	    }
	  if (cf != 0)
	    {
	      tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
	      nops++;
	    }
	  if (!rtx_equal_p (tmp, out))
	    {
	      if (nops == 1)
		out = force_operand (tmp, copy_rtx (out));
	      else
		emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
	    }
	  if (!rtx_equal_p (out, operands[0]))
	    emit_move_insn (operands[0], copy_rtx (out));

	  return true;
	}

      /*
       * General case:			Jumpful:
       *   xorl dest,dest		cmpl op1, op2
       *   cmpl op1, op2		movl ct, dest
       *   setcc dest			jcc 1f
       *   decl dest			movl cf, dest
       *   andl (cf-ct),dest		1:
       *   addl ct,dest
       *
       * Size 20.			Size 14.
       *
       * This is reasonably steep, but branch mispredict costs are
       * high on modern cpus, so consider failing only if optimizing
       * for space.
       */

      if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
	  && BRANCH_COST (optimize_insn_for_speed_p (),
			  false) >= 2)
	{
	  if (cf == 0)
	    {
	      machine_mode cmp_mode = GET_MODE (op0);
	      enum rtx_code new_code;

	      if (SCALAR_FLOAT_MODE_P (cmp_mode))
		{
		  gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));

		  /* We may be reversing unordered compare to normal compare,
		     that is not valid in general (we may convert non-trapping
		     condition to trapping one), however on i386 we currently
		     emit all comparisons unordered.  */
		  new_code = reverse_condition_maybe_unordered (code);
		}
	      else
		{
		  new_code = ix86_reverse_condition (code, cmp_mode);
		  if (compare_code != UNKNOWN && new_code != UNKNOWN)
		    compare_code = reverse_condition (compare_code);
		}

	      if (new_code != UNKNOWN)
		{
		  cf = ct;
		  ct = 0;
		  code = new_code;
		}
	    }

	  if (compare_code != UNKNOWN)
	    {
	      /* notl op1	(if needed)
		 sarl $31, op1
		 andl (cf-ct), op1
		 addl ct, op1

		 For x < 0 (resp. x <= -1) there will be no notl,
		 so if possible swap the constants to get rid of the
		 complement.
		 True/false will be -1/0 while code below (store flag
		 followed by decrement) is 0/-1, so the constants need
		 to be exchanged once more.  */

	      if (compare_code == GE || !cf)
		{
		  code = reverse_condition (code);
		  compare_code = LT;
		}
	      else
		std::swap (ct, cf);

	      out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
	    }
	  else
	    {
	      out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);

	      out = expand_simple_binop (mode, PLUS, copy_rtx (out),
					 constm1_rtx,
					 copy_rtx (out), 1, OPTAB_DIRECT);
	    }

	  out = expand_simple_binop (mode, AND, copy_rtx (out),
				     gen_int_mode (cf - ct, mode),
				     copy_rtx (out), 1, OPTAB_DIRECT);
	  if (ct)
	    out = expand_simple_binop (mode, PLUS, copy_rtx (out),
				       GEN_INT (ct),
				       copy_rtx (out), 1, OPTAB_DIRECT);
	  if (!rtx_equal_p (out, operands[0]))
	    emit_move_insn (operands[0], copy_rtx (out));

	  return true;
	}
    }

  if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
    {
      /* Try a few things more with specific constants and a variable.  */

      optab op;
      rtx var, orig_out, out, tmp;

      if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
	return false;

      /* If one of the two operands is an interesting constant, load a
	 constant with the above and mask it in with a logical operation.  */

      if (CONST_INT_P (operands[2]))
	{
	  var = operands[3];
	  if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
	    operands[3] = constm1_rtx, op = and_optab;
	  else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
	    operands[3] = const0_rtx, op = ior_optab;
	  else
	    return false;
	}
      else if (CONST_INT_P (operands[3]))
	{
	  var = operands[2];
	  if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
	    operands[2] = constm1_rtx, op = and_optab;
	  else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
	    operands[2] = const0_rtx, op = ior_optab;
	  else
	    return false;
	}
      else
	return false;

      orig_out = operands[0];
      tmp = gen_reg_rtx (mode);
      operands[0] = tmp;

      /* Recurse to get the constant loaded.  */
      if (!ix86_expand_int_movcc (operands))
	return false;

      /* Mask in the interesting variable.  */
      out = expand_binop (mode, op, var, tmp, orig_out, 0,
			  OPTAB_WIDEN);
      if (!rtx_equal_p (out, orig_out))
	emit_move_insn (copy_rtx (orig_out), copy_rtx (out));

      return true;
    }

  /*
   * For comparison with above,
   *
   * movl cf,dest
   * movl ct,tmp
   * cmpl op1,op2
   * cmovcc tmp,dest
   *
   * Size 15.
   */

  if (! nonimmediate_operand (operands[2], mode))
    operands[2] = force_reg (mode, operands[2]);
  if (! nonimmediate_operand (operands[3], mode))
    operands[3] = force_reg (mode, operands[3]);

  if (! register_operand (operands[2], VOIDmode)
      && (mode == QImode
	  || ! register_operand (operands[3], VOIDmode)))
    operands[2] = force_reg (mode, operands[2]);

  if (mode == QImode
      && ! register_operand (operands[3], VOIDmode))
    operands[3] = force_reg (mode, operands[3]);

  emit_insn (compare_seq);
  emit_insn (gen_rtx_SET (operands[0],
			  gen_rtx_IF_THEN_ELSE (mode,
						compare_op, operands[2],
						operands[3])));
  return true;
}
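
/* A worked example of the branchless constant path above: for an
   unsigned "dest = (a < b) ? 3 : 5" the sequence is roughly

	cmpl	b, a
	sbbl	%eax, %eax	; %eax = (a < b) ? -1 : 0
	andl	$-2, %eax	; mask with ct - cf = -2
	addl	$5, %eax	; add cf

   which yields 3 when a < b and 5 otherwise (illustrative; the actual
   insns depend on the constants and mode).  */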
/* Detect conditional moves that exactly match min/max operational
   semantics.  Note that this is IEEE safe, as long as we don't
   interchange the operands.

   Returns FALSE if this conditional move doesn't match a MIN/MAX,
   and TRUE if the operation is successful and instructions are emitted.  */

static bool
ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
			   rtx cmp_op1, rtx if_true, rtx if_false)
{
  machine_mode mode;
  bool is_min;
  rtx tmp;

  if (code == LT)
    ;
  else if (code == UNGE)
    std::swap (if_true, if_false);
  else
    return false;

  if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
    is_min = true;
  else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
    is_min = false;
  else
    return false;

  mode = GET_MODE (dest);

  /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
     but MODE may be a vector mode and thus not appropriate.  */
  if (!flag_finite_math_only || flag_signed_zeros)
    {
      int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
      rtvec v;

      if_true = force_reg (mode, if_true);
      v = gen_rtvec (2, if_true, if_false);
      tmp = gen_rtx_UNSPEC (mode, v, u);
    }
  else
    {
      code = is_min ? SMIN : SMAX;
      if (MEM_P (if_true) && MEM_P (if_false))
	if_true = force_reg (mode, if_true);
      tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
    }

  emit_insn (gen_rtx_SET (dest, tmp));
  return true;
}
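
/* Note: the SSE min/max instructions return the second source operand
   when the comparison is unordered (and on -0.0/+0.0 ties), so operand
   order carries the NaN and signed-zero semantics -- hence the UNSPEC
   form above when those must be honored (informal note).  */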
/* Expand an SSE comparison.  Return the register with the result.  */

static rtx
ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
		     rtx op_true, rtx op_false)
{
  machine_mode mode = GET_MODE (dest);
  machine_mode cmp_ops_mode = GET_MODE (cmp_op0);

  /* In general case result of comparison can differ from operands' type.  */
  machine_mode cmp_mode;

  /* In AVX512F the result of comparison is an integer mask.  */
  bool maskcmp = false;
  rtx x;

  if (GET_MODE_SIZE (cmp_ops_mode) == 64)
    {
      unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
      cmp_mode = int_mode_for_size (nbits, 0).require ();
      maskcmp = true;
    }
  else
    cmp_mode = cmp_ops_mode;

  cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);

  int (*op1_predicate)(rtx, machine_mode)
    = VECTOR_MODE_P (cmp_ops_mode) ? vector_operand : nonimmediate_operand;

  if (!op1_predicate (cmp_op1, cmp_ops_mode))
    cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);

  if (optimize
      || (maskcmp && cmp_mode != mode)
      || (op_true && reg_overlap_mentioned_p (dest, op_true))
      || (op_false && reg_overlap_mentioned_p (dest, op_false)))
    dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);

  /* Compare patterns for int modes are unspec in AVX512F only.  */
  if (maskcmp && (code == GT || code == EQ))
    {
      rtx (*gen)(rtx, rtx, rtx);

      switch (cmp_ops_mode)
	{
	case E_V64QImode:
	  gcc_assert (TARGET_AVX512BW);
	  gen = code == GT ? gen_avx512bw_gtv64qi3 : gen_avx512bw_eqv64qi3_1;
	  break;
	case E_V32HImode:
	  gcc_assert (TARGET_AVX512BW);
	  gen = code == GT ? gen_avx512bw_gtv32hi3 : gen_avx512bw_eqv32hi3_1;
	  break;
	case E_V16SImode:
	  gen = code == GT ? gen_avx512f_gtv16si3 : gen_avx512f_eqv16si3_1;
	  break;
	case E_V8DImode:
	  gen = code == GT ? gen_avx512f_gtv8di3 : gen_avx512f_eqv8di3_1;
	  break;
	default:
	  gen = NULL;
	  break;
	}

      if (gen)
	{
	  emit_insn (gen (dest, cmp_op0, cmp_op1));
	  return dest;
	}
    }
  x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);

  if (cmp_mode != mode && !maskcmp)
    {
      x = force_reg (cmp_ops_mode, x);
      convert_move (dest, x, false);
    }
  else
    emit_insn (gen_rtx_SET (dest, x));

  return dest;
}
/* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
   operations.  This is used for both scalar and vector conditional moves.  */

void
ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
{
  machine_mode mode = GET_MODE (dest);
  machine_mode cmpmode = GET_MODE (cmp);

  /* In AVX512F the result of comparison is an integer mask.  */
  bool maskcmp = (mode != cmpmode && TARGET_AVX512F);

  rtx t2, t3, x;

  /* If we have an integer mask and FP value then we need
     to cast mask to FP mode.  */
  if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
    {
      cmp = force_reg (cmpmode, cmp);
      cmp = gen_rtx_SUBREG (mode, cmp, 0);
    }

  if (maskcmp)
    {
      rtx (*gen) (rtx, rtx) = NULL;
      if ((op_true == CONST0_RTX (mode)
	   && vector_all_ones_operand (op_false, mode))
	  || (op_false == CONST0_RTX (mode)
	      && vector_all_ones_operand (op_true, mode)))
	switch (mode)
	  {
	  case E_V64QImode:
	    if (TARGET_AVX512BW)
	      gen = gen_avx512bw_cvtmask2bv64qi;
	    break;
	  case E_V32QImode:
	    if (TARGET_AVX512VL && TARGET_AVX512BW)
	      gen = gen_avx512vl_cvtmask2bv32qi;
	    break;
	  case E_V16QImode:
	    if (TARGET_AVX512VL && TARGET_AVX512BW)
	      gen = gen_avx512vl_cvtmask2bv16qi;
	    break;
	  case E_V32HImode:
	    if (TARGET_AVX512BW)
	      gen = gen_avx512bw_cvtmask2wv32hi;
	    break;
	  case E_V16HImode:
	    if (TARGET_AVX512VL && TARGET_AVX512BW)
	      gen = gen_avx512vl_cvtmask2wv16hi;
	    break;
	  case E_V8HImode:
	    if (TARGET_AVX512VL && TARGET_AVX512BW)
	      gen = gen_avx512vl_cvtmask2wv8hi;
	    break;
	  case E_V16SImode:
	    if (TARGET_AVX512DQ)
	      gen = gen_avx512f_cvtmask2dv16si;
	    break;
	  case E_V8SImode:
	    if (TARGET_AVX512VL && TARGET_AVX512DQ)
	      gen = gen_avx512vl_cvtmask2dv8si;
	    break;
	  case E_V4SImode:
	    if (TARGET_AVX512VL && TARGET_AVX512DQ)
	      gen = gen_avx512vl_cvtmask2dv4si;
	    break;
	  case E_V8DImode:
	    if (TARGET_AVX512DQ)
	      gen = gen_avx512f_cvtmask2qv8di;
	    break;
	  case E_V4DImode:
	    if (TARGET_AVX512VL && TARGET_AVX512DQ)
	      gen = gen_avx512vl_cvtmask2qv4di;
	    break;
	  case E_V2DImode:
	    if (TARGET_AVX512VL && TARGET_AVX512DQ)
	      gen = gen_avx512vl_cvtmask2qv2di;
	    break;
	  default:
	    break;
	  }
      if (gen && SCALAR_INT_MODE_P (cmpmode))
	{
	  cmp = force_reg (cmpmode, cmp);
	  if (op_true == CONST0_RTX (mode))
	    {
	      rtx (*gen_not) (rtx, rtx);
	      switch (cmpmode)
		{
		case E_QImode: gen_not = gen_knotqi; break;
		case E_HImode: gen_not = gen_knothi; break;
		case E_SImode: gen_not = gen_knotsi; break;
		case E_DImode: gen_not = gen_knotdi; break;
		default: gcc_unreachable ();
		}
	      rtx n = gen_reg_rtx (cmpmode);
	      emit_insn (gen_not (n, cmp));
	      cmp = n;
	    }
	  emit_insn (gen (dest, cmp));
	  return;
	}
    }
  else if (vector_all_ones_operand (op_true, mode)
	   && op_false == CONST0_RTX (mode))
    {
      emit_insn (gen_rtx_SET (dest, cmp));
      return;
    }
  else if (op_false == CONST0_RTX (mode))
    {
      op_true = force_reg (mode, op_true);
      x = gen_rtx_AND (mode, cmp, op_true);
      emit_insn (gen_rtx_SET (dest, x));
      return;
    }
  else if (op_true == CONST0_RTX (mode))
    {
      op_false = force_reg (mode, op_false);
      x = gen_rtx_NOT (mode, cmp);
      x = gen_rtx_AND (mode, x, op_false);
      emit_insn (gen_rtx_SET (dest, x));
      return;
    }
  else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
    {
      op_false = force_reg (mode, op_false);
      x = gen_rtx_IOR (mode, cmp, op_false);
      emit_insn (gen_rtx_SET (dest, x));
      return;
    }
  else if (TARGET_XOP)
    {
      op_true = force_reg (mode, op_true);

      if (!nonimmediate_operand (op_false, mode))
	op_false = force_reg (mode, op_false);

      emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
							  op_true,
							  op_false)));
      return;
    }

  rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
  rtx d = dest;

  if (!vector_operand (op_true, mode))
    op_true = force_reg (mode, op_true);

  op_false = force_reg (mode, op_false);

  switch (mode)
    {
    case E_V4SFmode:
      if (TARGET_SSE4_1)
	gen = gen_sse4_1_blendvps;
      break;
    case E_V2DFmode:
      if (TARGET_SSE4_1)
	gen = gen_sse4_1_blendvpd;
      break;
    case E_SFmode:
      if (TARGET_SSE4_1)
	{
	  gen = gen_sse4_1_blendvss;
	  op_true = force_reg (mode, op_true);
	}
      break;
    case E_DFmode:
      if (TARGET_SSE4_1)
	{
	  gen = gen_sse4_1_blendvsd;
	  op_true = force_reg (mode, op_true);
	}
      break;
    case E_V16QImode:
    case E_V8HImode:
    case E_V4SImode:
    case E_V2DImode:
      if (TARGET_SSE4_1)
	{
	  gen = gen_sse4_1_pblendvb;
	  if (mode != V16QImode)
	    d = gen_reg_rtx (V16QImode);
	  op_false = gen_lowpart (V16QImode, op_false);
	  op_true = gen_lowpart (V16QImode, op_true);
	  cmp = gen_lowpart (V16QImode, cmp);
	}
      break;
    case E_V8SFmode:
      if (TARGET_AVX)
	gen = gen_avx_blendvps256;
      break;
    case E_V4DFmode:
      if (TARGET_AVX)
	gen = gen_avx_blendvpd256;
      break;
    case E_V32QImode:
    case E_V16HImode:
    case E_V8SImode:
    case E_V4DImode:
      if (TARGET_AVX2)
	{
	  gen = gen_avx2_pblendvb;
	  if (mode != V32QImode)
	    d = gen_reg_rtx (V32QImode);
	  op_false = gen_lowpart (V32QImode, op_false);
	  op_true = gen_lowpart (V32QImode, op_true);
	  cmp = gen_lowpart (V32QImode, cmp);
	}
      break;

    case E_V64QImode:
      gen = gen_avx512bw_blendmv64qi;
      break;
    case E_V32HImode:
      gen = gen_avx512bw_blendmv32hi;
      break;
    case E_V16SImode:
      gen = gen_avx512f_blendmv16si;
      break;
    case E_V8DImode:
      gen = gen_avx512f_blendmv8di;
      break;
    case E_V8DFmode:
      gen = gen_avx512f_blendmv8df;
      break;
    case E_V16SFmode:
      gen = gen_avx512f_blendmv16sf;
      break;

    default:
      break;
    }

  if (gen != NULL)
    {
      emit_insn (gen (d, op_false, op_true, cmp));
      if (d != dest)
	emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
    }
  else
    {
      op_true = force_reg (mode, op_true);

      t2 = gen_reg_rtx (mode);
      if (optimize)
	t3 = gen_reg_rtx (mode);
      else
	t3 = dest;

      x = gen_rtx_AND (mode, op_true, cmp);
      emit_insn (gen_rtx_SET (t2, x));

      x = gen_rtx_NOT (mode, cmp);
      x = gen_rtx_AND (mode, x, op_false);
      emit_insn (gen_rtx_SET (t3, x));

      x = gen_rtx_IOR (mode, t3, t2);
      emit_insn (gen_rtx_SET (dest, x));
    }
}
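
/* The fallback above is the classic SSE select idiom; the AND/NOT-AND/IOR
   RTL maps roughly to

	pand	cmp, t2		; t2 = op_true & mask
	pandn	op_false, t3	; t3 = ~mask & op_false
	por	t2, t3		; dest = t2 | t3

   (informal mapping, for illustration only).  */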
/* Swap, force into registers, or otherwise massage the two operands
   to an sse comparison with a mask result.  Thus we differ a bit from
   ix86_prepare_fp_compare_args which expects to produce a flags result.

   The DEST operand exists to help determine whether to commute commutative
   operators.  The POP0/POP1 operands are updated in place.  The new
   comparison code is returned, or UNKNOWN if not implementable.  */

static enum rtx_code
ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
				  rtx *pop0, rtx *pop1)
{
  switch (code)
    {
    case LTGT:
    case UNEQ:
      /* AVX supports all the needed comparisons.  */
      if (TARGET_AVX)
	break;
      /* We have no LTGT as an operator.  We could implement it with
	 NE & ORDERED, but this requires an extra temporary.  It's
	 not clear that it's worth it.  */
      return UNKNOWN;

    case LT:
    case LE:
    case UNGT:
    case UNGE:
      /* These are supported directly.  */
      break;

    case EQ:
    case NE:
    case UNORDERED:
    case ORDERED:
      /* AVX has 3 operand comparisons, no need to swap anything.  */
      if (TARGET_AVX)
	break;
      /* For commutative operators, try to canonicalize the destination
	 operand to be first in the comparison - this helps reload to
	 avoid extra moves.  */
      if (!dest || !rtx_equal_p (dest, *pop1))
	break;
      /* FALLTHRU */

    case GE:
    case GT:
    case UNLE:
    case UNLT:
      /* These are not supported directly before AVX, and furthermore
	 ix86_expand_sse_fp_minmax only optimizes LT/UNGE.  Swap the
	 comparison operands to transform into something that is
	 supported.  */
      std::swap (*pop0, *pop1);
      code = swap_condition (code);
      break;

    default:
      gcc_unreachable ();
    }

  return code;
}
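
/* E.g. without AVX a V4SF "a > b" cannot be encoded in cmpps directly
   (pre-AVX the immediate only covers EQ/LT/LE/UNORD and their
   negations), so the code above swaps it into "b < a" (informal
   example).  */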
/* Expand a floating-point conditional move.  Return true if successful.  */

bool
ix86_expand_fp_movcc (rtx operands[])
{
  machine_mode mode = GET_MODE (operands[0]);
  enum rtx_code code = GET_CODE (operands[1]);
  rtx tmp, compare_op;
  rtx op0 = XEXP (operands[1], 0);
  rtx op1 = XEXP (operands[1], 1);

  if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
    {
      machine_mode cmode;

      /* Since we've no cmove for sse registers, don't force bad register
	 allocation just to gain access to it.  Deny movcc when the
	 comparison mode doesn't match the move mode.  */
      cmode = GET_MODE (op0);
      if (cmode == VOIDmode)
	cmode = GET_MODE (op1);
      if (cmode != mode)
	return false;

      code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
      if (code == UNKNOWN)
	return false;

      if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
				     operands[2], operands[3]))
	return true;

      tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
				 operands[2], operands[3]);
      ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
      return true;
    }

  if (GET_MODE (op0) == TImode
      || (GET_MODE (op0) == DImode
	  && !TARGET_64BIT))
    return false;

  /* The floating point conditional move instructions don't directly
     support conditions resulting from a signed integer comparison.  */

  compare_op = ix86_expand_compare (code, op0, op1);
  if (!fcmov_comparison_operator (compare_op, VOIDmode))
    {
      tmp = gen_reg_rtx (QImode);
      ix86_expand_setcc (tmp, code, op0, op1);

      compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
    }

  emit_insn (gen_rtx_SET (operands[0],
			  gen_rtx_IF_THEN_ELSE (mode, compare_op,
						operands[2], operands[3])));

  return true;
}
/* Helper for ix86_cmp_code_to_pcmp_immediate for int modes.  */

static int
ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)

/* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes.  */

static int
ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
/* Return immediate value to be used in UNSPEC_PCMP
   for comparison CODE in MODE.  */

static int
ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
{
  if (FLOAT_MODE_P (mode))
    return ix86_fp_cmp_code_to_pcmp_immediate (code);
  return ix86_int_cmp_code_to_pcmp_immediate (code);
}

/* Expand AVX-512 vector comparison.  */

bool
ix86_expand_mask_vec_cmp (rtx operands[])
{
  machine_mode mask_mode = GET_MODE (operands[0]);
  machine_mode cmp_mode = GET_MODE (operands[2]);
  enum rtx_code code = GET_CODE (operands[1]);
  rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
  int unspec_code;
  rtx unspec;

  switch (code)
    {
    case LEU:
    case GTU:
    case GEU:
    case LTU:
      unspec_code = UNSPEC_UNSIGNED_PCMP;
      break;

    default:
      unspec_code = UNSPEC_PCMP;
    }

  unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, operands[2],
						 operands[3], imm),
			   unspec_code);
  emit_insn (gen_rtx_SET (operands[0], unspec));

  return true;
}
/* Expand fp vector comparison.  */

bool
ix86_expand_fp_vec_cmp (rtx operands[])
{
  enum rtx_code code = GET_CODE (operands[1]);
  rtx cmp;

  code = ix86_prepare_sse_fp_compare_args (operands[0], code,
					   &operands[2], &operands[3]);
  if (code == UNKNOWN)
    {
      rtx temp;
      switch (GET_CODE (operands[1]))
	{
	case LTGT:
	  temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
				      operands[3], NULL, NULL);
	  cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
				     operands[3], NULL, NULL);
	  code = AND;
	  break;
	case UNEQ:
	  temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
				      operands[3], NULL, NULL);
	  cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
				     operands[3], NULL, NULL);
	  code = IOR;
	  break;
	default:
	  gcc_unreachable ();
	}
      cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
				 OPTAB_DIRECT);
    }
  else
    cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
			       operands[1], operands[2]);

  if (operands[0] != cmp)
    emit_move_insn (operands[0], cmp);

  return true;
}
static rtx
ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
			 rtx op_true, rtx op_false, bool *negate)
{
  machine_mode data_mode = GET_MODE (dest);
  machine_mode mode = GET_MODE (cop0);
  rtx x;

  *negate = false;

  /* XOP supports all of the comparisons on all 128-bit vector int types.  */
  if (TARGET_XOP
      && (mode == V16QImode || mode == V8HImode
	  || mode == V4SImode || mode == V2DImode))
    ;
  else
    {
      /* Canonicalize the comparison to EQ, GT, GTU.  */
      switch (code)
	{
	case EQ:
	case GT:
	case GTU:
	  break;

	case NE:
	case LE:
	case LEU:
	  code = reverse_condition (code);
	  *negate = true;
	  break;

	case GE:
	case GEU:
	  code = reverse_condition (code);
	  *negate = true;
	  /* FALLTHRU */

	case LT:
	case LTU:
	  std::swap (cop0, cop1);
	  code = swap_condition (code);
	  break;

	default:
	  gcc_unreachable ();
	}

      /* Only SSE4.1/SSE4.2 supports V2DImode.  */
      if (mode == V2DImode)
	{
	  switch (code)
	    {
	    case EQ:
	      /* SSE4.1 supports EQ.  */
	      if (!TARGET_SSE4_1)
		return NULL;
	      break;

	    case GT:
	    case GTU:
	      /* SSE4.2 supports GT/GTU.  */
	      if (!TARGET_SSE4_2)
		return NULL;
	      break;

	    default:
	      gcc_unreachable ();
	    }
	}

      rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode);
      rtx opfalse = op_false ? op_false : CONST0_RTX (data_mode);
      if (*negate)
	std::swap (optrue, opfalse);

      /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when
	 not using integer masks into min (x, y) == x ? -1 : 0 (i.e.
	 min (x, y) == x).  While we add one instruction (the minimum),
	 we remove the need for two instructions in the negation, as the
	 result is done this way.
	 When using masks, do it for SI/DImode element types, as it is shorter
	 than the two subtractions.  */
      if ((code == GT
	   && GET_MODE_SIZE (mode) != 64
	   && vector_all_ones_operand (opfalse, data_mode)
	   && optrue == CONST0_RTX (data_mode))
	  || (code == GTU
	      && GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4
	      /* Don't do it if not using integer masks and we'd end up with
		 the right values in the registers though.  */
	      && (GET_MODE_SIZE (mode) == 64
		  || !vector_all_ones_operand (optrue, data_mode)
		  || opfalse != CONST0_RTX (data_mode))))
	{
	  rtx (*gen) (rtx, rtx, rtx) = NULL;

	  switch (mode)
	    {
	    case E_V16SImode:
	      gen = (code == GTU) ? gen_uminv16si3 : gen_sminv16si3;
	      break;
	    case E_V8DImode:
	      gen = (code == GTU) ? gen_uminv8di3 : gen_sminv8di3;
	      cop0 = force_reg (mode, cop0);
	      cop1 = force_reg (mode, cop1);
	      break;
	    case E_V32QImode:
	      if (TARGET_AVX2)
		gen = (code == GTU) ? gen_uminv32qi3 : gen_sminv32qi3;
	      break;
	    case E_V16HImode:
	      if (TARGET_AVX2)
		gen = (code == GTU) ? gen_uminv16hi3 : gen_sminv16hi3;
	      break;
	    case E_V8SImode:
	      if (TARGET_AVX2)
		gen = (code == GTU) ? gen_uminv8si3 : gen_sminv8si3;
	      break;
	    case E_V4DImode:
	      if (TARGET_AVX512VL)
		{
		  gen = (code == GTU) ? gen_uminv4di3 : gen_sminv4di3;
		  cop0 = force_reg (mode, cop0);
		  cop1 = force_reg (mode, cop1);
		}
	      break;
	    case E_V16QImode:
	      if (code == GTU && TARGET_SSE2)
		gen = gen_uminv16qi3;
	      else if (code == GT && TARGET_SSE4_1)
		gen = gen_sminv16qi3;
	      break;
	    case E_V8HImode:
	      if (code == GTU && TARGET_SSE4_1)
		gen = gen_uminv8hi3;
	      else if (code == GT && TARGET_SSE2)
		gen = gen_sminv8hi3;
	      break;
	    case E_V4SImode:
	      if (TARGET_SSE4_1)
		gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3;
	      break;
	    case E_V2DImode:
	      if (TARGET_AVX512VL)
		{
		  gen = (code == GTU) ? gen_uminv2di3 : gen_sminv2di3;
		  cop0 = force_reg (mode, cop0);
		  cop1 = force_reg (mode, cop1);
		}
	      break;
	    default:
	      break;
	    }

	  if (gen)
	    {
	      rtx tem = gen_reg_rtx (mode);
	      if (!vector_operand (cop0, mode))
		cop0 = force_reg (mode, cop0);
	      if (!vector_operand (cop1, mode))
		cop1 = force_reg (mode, cop1);
	      *negate = !*negate;
	      emit_insn (gen (tem, cop0, cop1));
	      cop1 = tem;
	      code = EQ;
	    }
	}

      /* Unsigned parallel compare is not supported by the hardware.
	 Play some tricks to turn this into a signed comparison
	 against 0.  */
      if (code == GTU)
	{
	  cop0 = force_reg (mode, cop0);

	  switch (mode)
	    {
	    case E_V16SImode:
	    case E_V8DImode:
	    case E_V8SImode:
	    case E_V4DImode:
	    case E_V4SImode:
	    case E_V2DImode:
	      {
		rtx t1, t2, mask;

		/* Subtract (-(INT MAX) - 1) from both operands to make
		   them signed.  */
		mask = ix86_build_signbit_mask (mode, true, false);
		t1 = gen_reg_rtx (mode);
		emit_insn (gen_sub3_insn (t1, cop0, mask));

		t2 = gen_reg_rtx (mode);
		emit_insn (gen_sub3_insn (t2, cop1, mask));

		cop0 = t1;
		cop1 = t2;
		code = GT;
	      }
	      break;

	    case E_V64QImode:
	    case E_V32HImode:
	    case E_V32QImode:
	    case E_V16HImode:
	    case E_V16QImode:
	    case E_V8HImode:
	      /* Perform a parallel unsigned saturating subtraction.  */
	      x = gen_reg_rtx (mode);
	      emit_insn (gen_rtx_SET
			 (x, gen_rtx_US_MINUS (mode, cop0, cop1)));
	      cop0 = x;
	      cop1 = CONST0_RTX (mode);
	      code = EQ;
	      *negate = !*negate;
	      break;

	    default:
	      gcc_unreachable ();
	    }
	}
    }

  if (*negate)
    std::swap (op_true, op_false);

  /* Allow the comparison to be done in one mode, but the movcc to
     happen in another mode.  */
  if (data_mode == mode)
    {
      x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
			       op_true, op_false);
    }
  else
    {
      gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
      x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
			       op_true, op_false);
      if (GET_MODE (x) == mode)
	x = gen_lowpart (data_mode, x);
    }

  return x;
}
/* Expand integer vector comparison.  */

bool
ix86_expand_int_vec_cmp (rtx operands[])
{
  rtx_code code = GET_CODE (operands[1]);
  bool negate = false;
  rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
				     operands[3], NULL, NULL, &negate);

  if (!cmp)
    return false;

  if (negate)
    cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
				   CONST0_RTX (GET_MODE (cmp)),
				   NULL, NULL, &negate);

  gcc_assert (!negate);

  if (operands[0] != cmp)
    emit_move_insn (operands[0], cmp);

  return true;
}
/* Expand a floating-point vector conditional move; a vcond operation
   rather than a movcc operation.  */

bool
ix86_expand_fp_vcond (rtx operands[])
{
  enum rtx_code code = GET_CODE (operands[3]);
  rtx cmp;

  code = ix86_prepare_sse_fp_compare_args (operands[0], code,
					   &operands[4], &operands[5]);
  if (code == UNKNOWN)
    {
      rtx temp;
      switch (GET_CODE (operands[3]))
	{
	case LTGT:
	  temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
				      operands[5], operands[0], operands[0]);
	  cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
				     operands[5], operands[1], operands[2]);
	  code = AND;
	  break;
	case UNEQ:
	  temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
				      operands[5], operands[0], operands[0]);
	  cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
				     operands[5], operands[1], operands[2]);
	  code = IOR;
	  break;
	default:
	  gcc_unreachable ();
	}
      cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
				 OPTAB_DIRECT);
      ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
      return true;
    }

  if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
				 operands[5], operands[1], operands[2]))
    return true;

  cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
			     operands[1], operands[2]);
  ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
  return true;
}
/* Expand a signed/unsigned integral vector conditional move.  */

bool
ix86_expand_int_vcond (rtx operands[])
{
  machine_mode data_mode = GET_MODE (operands[0]);
  machine_mode mode = GET_MODE (operands[4]);
  enum rtx_code code = GET_CODE (operands[3]);
  bool negate = false;
  rtx x, cop0, cop1;

  cop0 = operands[4];
  cop1 = operands[5];

  /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
     and x < 0 ? 1 : 0 into (unsigned) x >> 31.  */
  if ((code == LT || code == GE)
      && data_mode == mode
      && cop1 == CONST0_RTX (mode)
      && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
      && GET_MODE_UNIT_SIZE (data_mode) > 1
      && GET_MODE_UNIT_SIZE (data_mode) <= 8
      && (GET_MODE_SIZE (data_mode) == 16
	  || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
    {
      rtx negop = operands[2 - (code == LT)];
      int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
      if (negop == CONST1_RTX (data_mode))
	{
	  rtx res = expand_simple_binop (mode, LSHIFTRT, cop0,
					 GEN_INT (shift),
					 operands[0], 1, OPTAB_DIRECT);
	  if (res != operands[0])
	    emit_move_insn (operands[0], res);
	  return true;
	}
      else if (GET_MODE_INNER (data_mode) != DImode
	       && vector_all_ones_operand (negop, data_mode))
	{
	  rtx res = expand_simple_binop (mode, ASHIFTRT, cop0,
					 GEN_INT (shift),
					 operands[0], 0, OPTAB_DIRECT);
	  if (res != operands[0])
	    emit_move_insn (operands[0], res);
	  return true;
	}
    }

  if (!nonimmediate_operand (cop1, mode))
    cop1 = force_reg (mode, cop1);
  if (!general_operand (operands[1], data_mode))
    operands[1] = force_reg (data_mode, operands[1]);
  if (!general_operand (operands[2], data_mode))
    operands[2] = force_reg (data_mode, operands[2]);

  x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
			       operands[1], operands[2], &negate);

  if (!x)
    return false;

  ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
			 operands[2-negate]);
  return true;
}
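
/* E.g. a V4SI "x < 0 ? -1 : 0" collapses above into "psrad $31, x", and
   "x < 0 ? 1 : 0" into "psrld $31, x" (illustrative mapping of the
   ASHIFTRT/LSHIFTRT expansions to instructions).  */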
static bool
ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
			      struct expand_vec_perm_d *d)
{
  /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
     expander, so args are either in d, or in op0, op1 etc.  */
  machine_mode mode = GET_MODE (d ? d->op0 : op0);
  machine_mode maskmode = mode;
  rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;

  switch (mode)
    {
    case E_V8HImode:
      if (TARGET_AVX512VL && TARGET_AVX512BW)
	gen = gen_avx512vl_vpermt2varv8hi3;
      break;
    case E_V16HImode:
      if (TARGET_AVX512VL && TARGET_AVX512BW)
	gen = gen_avx512vl_vpermt2varv16hi3;
      break;
    case E_V64QImode:
      if (TARGET_AVX512VBMI)
	gen = gen_avx512bw_vpermt2varv64qi3;
      break;
    case E_V32HImode:
      if (TARGET_AVX512BW)
	gen = gen_avx512bw_vpermt2varv32hi3;
      break;
    case E_V4SImode:
      if (TARGET_AVX512VL)
	gen = gen_avx512vl_vpermt2varv4si3;
      break;
    case E_V8SImode:
      if (TARGET_AVX512VL)
	gen = gen_avx512vl_vpermt2varv8si3;
      break;
    case E_V16SImode:
      if (TARGET_AVX512F)
	gen = gen_avx512f_vpermt2varv16si3;
      break;
    case E_V4SFmode:
      if (TARGET_AVX512VL)
	{
	  gen = gen_avx512vl_vpermt2varv4sf3;
	  maskmode = V4SImode;
	}
      break;
    case E_V8SFmode:
      if (TARGET_AVX512VL)
	{
	  gen = gen_avx512vl_vpermt2varv8sf3;
	  maskmode = V8SImode;
	}
      break;
    case E_V16SFmode:
      if (TARGET_AVX512F)
	{
	  gen = gen_avx512f_vpermt2varv16sf3;
	  maskmode = V16SImode;
	}
      break;
    case E_V2DImode:
      if (TARGET_AVX512VL)
	gen = gen_avx512vl_vpermt2varv2di3;
      break;
    case E_V4DImode:
      if (TARGET_AVX512VL)
	gen = gen_avx512vl_vpermt2varv4di3;
      break;
    case E_V8DImode:
      if (TARGET_AVX512F)
	gen = gen_avx512f_vpermt2varv8di3;
      break;
    case E_V2DFmode:
      if (TARGET_AVX512VL)
	{
	  gen = gen_avx512vl_vpermt2varv2df3;
	  maskmode = V2DImode;
	}
      break;
    case E_V4DFmode:
      if (TARGET_AVX512VL)
	{
	  gen = gen_avx512vl_vpermt2varv4df3;
	  maskmode = V4DImode;
	}
      break;
    case E_V8DFmode:
      if (TARGET_AVX512F)
	{
	  gen = gen_avx512f_vpermt2varv8df3;
	  maskmode = V8DImode;
	}
      break;
    default:
      break;
    }

  if (gen == NULL)
    return false;

  /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
     expander, so args are either in d, or in op0, op1 etc.  */
  if (d)
    {
      rtx vec[64];
      target = d->target;
      op0 = d->op0;
      op1 = d->op1;
      for (int i = 0; i < d->nelt; ++i)
	vec[i] = GEN_INT (d->perm[i]);
      mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
    }

  emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
  return true;
}
/* Expand a variable vector permutation.  */

void
ix86_expand_vec_perm (rtx operands[])
{
  rtx target = operands[0];
  rtx op0 = operands[1];
  rtx op1 = operands[2];
  rtx mask = operands[3];
  rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
  machine_mode mode = GET_MODE (op0);
  machine_mode maskmode = GET_MODE (mask);
  int w, e, i;
  bool one_operand_shuffle = rtx_equal_p (op0, op1);

  /* Number of elements in the vector.  */
  w = GET_MODE_NUNITS (mode);
  e = GET_MODE_UNIT_SIZE (mode);
  gcc_assert (w <= 64);

  if (TARGET_AVX512F && one_operand_shuffle)
    {
      rtx (*gen) (rtx, rtx, rtx) = NULL;
      switch (mode)
	{
	case E_V16SImode:
	  gen = gen_avx512f_permvarv16si;
	  break;
	case E_V16SFmode:
	  gen = gen_avx512f_permvarv16sf;
	  break;
	case E_V8DImode:
	  gen = gen_avx512f_permvarv8di;
	  break;
	case E_V8DFmode:
	  gen = gen_avx512f_permvarv8df;
	  break;
	default:
	  break;
	}
      if (gen != NULL)
	{
	  emit_insn (gen (target, op0, mask));
	  return;
	}
    }

  if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
    return;

  if (TARGET_AVX2)
    {
      if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
	{
	  /* Unfortunately, the VPERMQ and VPERMPD instructions only support
	     a constant shuffle operand.  With a tiny bit of effort we can
	     use VPERMD instead.  A re-interpretation stall for V4DFmode is
	     unfortunate but there's no avoiding it.
	     Similarly for V16HImode we don't have instructions for variable
	     shuffling, while for V32QImode we can use, after preparing
	     suitable masks, vpshufb; vpshufb; vpermq; vpor.  */

	  if (mode == V16HImode)
	    {
	      maskmode = mode = V32QImode;
	      w = 32;
	      e = 1;
	    }
	  else
	    {
	      maskmode = mode = V8SImode;
	      w = 8;
	      e = 4;
	    }
	  t1 = gen_reg_rtx (maskmode);

	  /* Replicate the low bits of the V4DImode mask into V8SImode:
	       mask = { A B C D }
	       t1 = { A A B B C C D D }.  */
	  for (i = 0; i < w / 2; ++i)
	    vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
	  vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
	  vt = force_reg (maskmode, vt);
	  mask = gen_lowpart (maskmode, mask);
	  if (maskmode == V8SImode)
	    emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
	  else
	    emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));

	  /* Multiply the shuffle indices by two.  */
	  t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
				    OPTAB_DIRECT);

	  /* Add one to the odd shuffle indices:
		t1 = { A*2, A*2+1, B*2, B*2+1, ... }.  */
	  for (i = 0; i < w / 2; ++i)
	    {
	      vec[i * 2] = const0_rtx;
	      vec[i * 2 + 1] = const1_rtx;
	    }
	  vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
	  vt = validize_mem (force_const_mem (maskmode, vt));
	  t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
				    OPTAB_DIRECT);

	  /* Continue as if V8SImode (resp. V32QImode) was used initially.  */
	  operands[3] = mask = t1;
	  target = gen_reg_rtx (mode);
	  op0 = gen_lowpart (mode, op0);
	  op1 = gen_lowpart (mode, op1);
	}

      switch (mode)
	{
	case E_V8SImode:
	  /* The VPERMD and VPERMPS instructions already properly ignore
	     the high bits of the shuffle elements.  No need for us to
	     perform an AND ourselves.  */
	  if (one_operand_shuffle)
	    {
	      emit_insn (gen_avx2_permvarv8si (target, op0, mask));
	      if (target != operands[0])
		emit_move_insn (operands[0],
				gen_lowpart (GET_MODE (operands[0]), target));
	    }
	  else
	    {
	      t1 = gen_reg_rtx (V8SImode);
	      t2 = gen_reg_rtx (V8SImode);
	      emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
	      emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
	      goto merge_two;
	    }
	  return;

	case E_V8SFmode:
	  mask = gen_lowpart (V8SImode, mask);
	  if (one_operand_shuffle)
	    emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
	  else
	    {
	      t1 = gen_reg_rtx (V8SFmode);
	      t2 = gen_reg_rtx (V8SFmode);
	      emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
	      emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
	      goto merge_two;
	    }
	  return;

	case E_V4SImode:
	  /* By combining the two 128-bit input vectors into one 256-bit
	     input vector, we can use VPERMD and VPERMPS for the full
	     two-operand shuffle.  */
	  t1 = gen_reg_rtx (V8SImode);
	  t2 = gen_reg_rtx (V8SImode);
	  emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
	  emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
	  emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
	  emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
	  return;

	case E_V4SFmode:
	  t1 = gen_reg_rtx (V8SFmode);
	  t2 = gen_reg_rtx (V8SImode);
	  mask = gen_lowpart (V4SImode, mask);
	  emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
	  emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
	  emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
	  emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
	  return;

	case E_V32QImode:
	  t1 = gen_reg_rtx (V32QImode);
	  t2 = gen_reg_rtx (V32QImode);
	  t3 = gen_reg_rtx (V32QImode);
	  vt2 = GEN_INT (-128);
	  vt = gen_const_vec_duplicate (V32QImode, vt2);
	  vt = force_reg (V32QImode, vt);
	  for (i = 0; i < 32; i++)
	    vec[i] = i < 16 ? vt2 : const0_rtx;
	  vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
	  vt2 = force_reg (V32QImode, vt2);
	  /* From mask create two adjusted masks, which contain the same
	     bits as mask in the low 7 bits of each vector element.
	     The first mask will have the most significant bit clear
	     if it requests element from the same 128-bit lane
	     and MSB set if it requests element from the other 128-bit lane.
	     The second mask will have the opposite values of the MSB,
	     and additionally will have its 128-bit lanes swapped.
	     E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
	     t1   { 07 92 9e 09 ... | 17 19 85 1f ... } and
	     t3   { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
	     stands for other 12 bytes.  */
	  /* The bit whether element is from the same lane or the other
	     lane is bit 4, so shift it up by 3 to the MSB position.  */
	  t5 = gen_reg_rtx (V4DImode);
	  emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
				    GEN_INT (3)));
	  /* Clear MSB bits from the mask just in case it had them set.  */
	  emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
	  /* After this t1 will have MSB set for elements from other lane.  */
	  emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
	  /* Clear bits other than MSB.  */
	  emit_insn (gen_andv32qi3 (t1, t1, vt));
	  /* Or in the lower bits from mask into t3.  */
	  emit_insn (gen_iorv32qi3 (t3, t1, t2));
	  /* And invert MSB bits in t1, so MSB is set for elements from the
	     same lane.  */
	  emit_insn (gen_xorv32qi3 (t1, t1, vt));
	  /* Swap 128-bit lanes in t3.  */
	  t6 = gen_reg_rtx (V4DImode);
	  emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
					  const2_rtx, GEN_INT (3),
					  const0_rtx, const1_rtx));
	  /* And or in the lower bits from mask into t1.  */
	  emit_insn (gen_iorv32qi3 (t1, t1, t2));
	  if (one_operand_shuffle)
	    {
	      /* Each of these shuffles will put 0s in places where
		 element from the other 128-bit lane is needed, otherwise
		 will shuffle in the requested value.  */
	      emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
						gen_lowpart (V32QImode, t6)));
	      emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
	      /* For t3 the 128-bit lanes are swapped again.  */
	      t7 = gen_reg_rtx (V4DImode);
	      emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
					      const2_rtx, GEN_INT (3),
					      const0_rtx, const1_rtx));
	      /* And oring both together leads to the result.  */
	      emit_insn (gen_iorv32qi3 (target, t1,
					gen_lowpart (V32QImode, t7)));
	      if (target != operands[0])
		emit_move_insn (operands[0],
				gen_lowpart (GET_MODE (operands[0]), target));
	      return;
	    }

	  t4 = gen_reg_rtx (V32QImode);
	  /* Similarly to the above one_operand_shuffle code,
	     just repeated twice for each operand.  The merge_two:
	     code will merge the two results together.  */
	  emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
					    gen_lowpart (V32QImode, t6)));
	  emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
					    gen_lowpart (V32QImode, t6)));
	  emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
	  emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
	  t7 = gen_reg_rtx (V4DImode);
	  emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
					  const2_rtx, GEN_INT (3),
					  const0_rtx, const1_rtx));
	  t8 = gen_reg_rtx (V4DImode);
	  emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
					  const2_rtx, GEN_INT (3),
					  const0_rtx, const1_rtx));
	  emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
	  emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
	  t1 = t4;
	  t2 = t3;
	  goto merge_two;

	default:
	  gcc_assert (GET_MODE_SIZE (mode) <= 16);
	  break;
	}
    }

  if (TARGET_XOP)
    {
      /* The XOP VPPERM insn supports three inputs.  By ignoring the
	 one_operand_shuffle special case, we avoid creating another
	 set of constant vectors in memory.  */
      one_operand_shuffle = false;

      /* mask = mask & {2*w-1, ...} */
      vt = GEN_INT (2*w - 1);
    }
  else
    {
      /* mask = mask & {w-1, ...} */
      vt = GEN_INT (w - 1);
    }

  vt = gen_const_vec_duplicate (maskmode, vt);
  mask = expand_simple_binop (maskmode, AND, mask, vt,
			      NULL_RTX, 0, OPTAB_DIRECT);

  /* For non-QImode operations, convert the word permutation control
     into a byte permutation control.  */
  if (mode != V16QImode)
    {
      mask = expand_simple_binop (maskmode, ASHIFT, mask,
				  GEN_INT (exact_log2 (e)),
				  NULL_RTX, 0, OPTAB_DIRECT);

      /* Convert mask to vector of chars.  */
      mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));

      /* Replicate each of the input bytes into byte positions:
	 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
	 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
	 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}.  */
      for (i = 0; i < 16; ++i)
	vec[i] = GEN_INT (i/e * e);
      vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
      vt = validize_mem (force_const_mem (V16QImode, vt));
      if (TARGET_XOP)
	emit_insn (gen_xop_pperm (mask, mask, mask, vt));
      else
	emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));

      /* Convert it into the byte positions by doing
	 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...}  */
      for (i = 0; i < 16; ++i)
	vec[i] = GEN_INT (i % e);
      vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
      vt = validize_mem (force_const_mem (V16QImode, vt));
      emit_insn (gen_addv16qi3 (mask, mask, vt));
    }

  /* The actual shuffle operations all operate on V16QImode.  */
  op0 = gen_lowpart (V16QImode, op0);
  op1 = gen_lowpart (V16QImode, op1);

  if (TARGET_XOP)
    {
      if (GET_MODE (target) != V16QImode)
	target = gen_reg_rtx (V16QImode);
      emit_insn (gen_xop_pperm (target, op0, op1, mask));
      if (target != operands[0])
	emit_move_insn (operands[0],
			gen_lowpart (GET_MODE (operands[0]), target));
    }
  else if (one_operand_shuffle)
    {
      if (GET_MODE (target) != V16QImode)
	target = gen_reg_rtx (V16QImode);
      emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
      if (target != operands[0])
	emit_move_insn (operands[0],
			gen_lowpart (GET_MODE (operands[0]), target));
    }
  else
    {
      rtx xops[6];
      bool ok;

      /* Shuffle the two input vectors independently.  */
      t1 = gen_reg_rtx (V16QImode);
      t2 = gen_reg_rtx (V16QImode);
      emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
      emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));

 merge_two:
      /* Then merge them together.  The key is whether any given control
	 element contained a bit set that indicates the second word.  */
      mask = operands[3];
      vt = GEN_INT (w);
      if (maskmode == V2DImode && !TARGET_SSE4_1)
	{
	  /* Without SSE4.1, we don't have V2DImode EQ.  Perform one
	     more shuffle to convert the V2DI input mask into a V4SI
	     input mask.  At which point the masking that
	     expand_int_vcond performs will work as desired.  */
	  rtx t3 = gen_reg_rtx (V4SImode);
	  emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
					const0_rtx, const0_rtx,
					const2_rtx, const2_rtx));
	  mask = t3;
	  maskmode = V4SImode;
	  e = w = 4;
	}

      vt = gen_const_vec_duplicate (maskmode, vt);
      vt = force_reg (maskmode, vt);
      mask = expand_simple_binop (maskmode, AND, mask, vt,
				  NULL_RTX, 0, OPTAB_DIRECT);

      if (GET_MODE (target) != mode)
	target = gen_reg_rtx (mode);
      xops[0] = target;
      xops[1] = gen_lowpart (mode, t2);
      xops[2] = gen_lowpart (mode, t1);
      xops[3] = gen_rtx_EQ (maskmode, mask, vt);
      xops[4] = mask;
      xops[5] = vt;
      ok = ix86_expand_int_vcond (xops);
      gcc_assert (ok);
      if (target != operands[0])
	emit_move_insn (operands[0],
			gen_lowpart (GET_MODE (operands[0]), target));
    }
}
/* Unpack OP[1] into the next wider integer vector type.  UNSIGNED_P is
   true if we should do zero extension, else sign extension.  HIGH_P is
   true if we want the N/2 high elements, else the low elements.  */

void
ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
{
  machine_mode imode = GET_MODE (src);
  rtx tmp;

  if (TARGET_SSE4_1)
    {
      rtx (*unpack)(rtx, rtx);
      rtx (*extract)(rtx, rtx) = NULL;
      machine_mode halfmode = BLKmode;

      switch (imode)
	{
	case E_V64QImode:
	  if (unsigned_p)
	    unpack = gen_avx512bw_zero_extendv32qiv32hi2;
	  else
	    unpack = gen_avx512bw_sign_extendv32qiv32hi2;
	  halfmode = V32QImode;
	  extract
	    = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
	  break;
	case E_V32QImode:
	  if (unsigned_p)
	    unpack = gen_avx2_zero_extendv16qiv16hi2;
	  else
	    unpack = gen_avx2_sign_extendv16qiv16hi2;
	  halfmode = V16QImode;
	  extract
	    = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
	  break;
	case E_V32HImode:
	  if (unsigned_p)
	    unpack = gen_avx512f_zero_extendv16hiv16si2;
	  else
	    unpack = gen_avx512f_sign_extendv16hiv16si2;
	  halfmode = V16HImode;
	  extract
	    = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
	  break;
	case E_V16HImode:
	  if (unsigned_p)
	    unpack = gen_avx2_zero_extendv8hiv8si2;
	  else
	    unpack = gen_avx2_sign_extendv8hiv8si2;
	  halfmode = V8HImode;
	  extract
	    = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
	  break;
	case E_V16SImode:
	  if (unsigned_p)
	    unpack = gen_avx512f_zero_extendv8siv8di2;
	  else
	    unpack = gen_avx512f_sign_extendv8siv8di2;
	  halfmode = V8SImode;
	  extract
	    = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
	  break;
	case E_V8SImode:
	  if (unsigned_p)
	    unpack = gen_avx2_zero_extendv4siv4di2;
	  else
	    unpack = gen_avx2_sign_extendv4siv4di2;
	  halfmode = V4SImode;
	  extract
	    = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
	  break;
	case E_V16QImode:
	  if (unsigned_p)
	    unpack = gen_sse4_1_zero_extendv8qiv8hi2;
	  else
	    unpack = gen_sse4_1_sign_extendv8qiv8hi2;
	  break;
	case E_V8HImode:
	  if (unsigned_p)
	    unpack = gen_sse4_1_zero_extendv4hiv4si2;
	  else
	    unpack = gen_sse4_1_sign_extendv4hiv4si2;
	  break;
	case E_V4SImode:
	  if (unsigned_p)
	    unpack = gen_sse4_1_zero_extendv2siv2di2;
	  else
	    unpack = gen_sse4_1_sign_extendv2siv2di2;
	  break;
	default:
	  gcc_unreachable ();
	}

      if (GET_MODE_SIZE (imode) >= 32)
	{
	  tmp = gen_reg_rtx (halfmode);
	  emit_insn (extract (tmp, src));
	}
      else if (high_p)
	{
	  /* Shift higher 8 bytes to lower 8 bytes.  */
	  tmp = gen_reg_rtx (V1TImode);
	  emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
					 GEN_INT (64)));
	  tmp = gen_lowpart (imode, tmp);
	}
      else
	tmp = src;

      emit_insn (unpack (dest, tmp));
    }
  else
    {
      rtx (*unpack)(rtx, rtx, rtx);

      switch (imode)
	{
	case E_V16QImode:
	  if (high_p)
	    unpack = gen_vec_interleave_highv16qi;
	  else
	    unpack = gen_vec_interleave_lowv16qi;
	  break;
	case E_V8HImode:
	  if (high_p)
	    unpack = gen_vec_interleave_highv8hi;
	  else
	    unpack = gen_vec_interleave_lowv8hi;
	  break;
	case E_V4SImode:
	  if (high_p)
	    unpack = gen_vec_interleave_highv4si;
	  else
	    unpack = gen_vec_interleave_lowv4si;
	  break;
	default:
	  gcc_unreachable ();
	}

      if (unsigned_p)
	tmp = force_reg (imode, CONST0_RTX (imode));
      else
	tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
				   src, pc_rtx, pc_rtx);

      rtx tmp2 = gen_reg_rtx (imode);
      emit_insn (unpack (tmp2, src, tmp));
      emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
    }
}
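/* Editorial sketch (not part of GCC): for the V4SI -> V2DI case, the
   transformation above corresponds to this plain C, where the helper name
   is invented for illustration:

     #include <stdint.h>

     void unpack_v4si_v2di (int64_t d[2], const int32_t s[4],
			    int unsigned_p, int high_p)
     {
       const int32_t *half = s + (high_p ? 2 : 0);
       for (int i = 0; i < 2; i++)
	 d[i] = unsigned_p ? (int64_t) (uint32_t) half[i]
			   : (int64_t) half[i];
     }

   With SSE4.1 the extension is a single pmovsx/pmovzx; without it the same
   result is built by interleaving with zeros (unsigned) or with a copy of
   the sign bits produced by a greater-than-zero compare (signed).  */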
/* Split operands 0 and 1 into half-mode parts.  Similar to split_double_mode,
   but works for floating point parameters and non-offsettable memories.
   For pushes, it returns just stack offsets; the values will be saved
   in the right order.  At most four parts are generated.  */

static int
ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
{
  int size;

  if (!TARGET_64BIT)
    size = mode == XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
  else
    size = (GET_MODE_SIZE (mode) + 4) / 8;

  gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
  gcc_assert (size >= 2 && size <= 4);

  /* Optimize constant pool reference to immediates.  This is used by fp
     moves, that force all constants to memory to allow combining.  */
  if (MEM_P (operand) && MEM_READONLY_P (operand))
    operand = avoid_constant_pool_reference (operand);

  if (MEM_P (operand) && !offsettable_memref_p (operand))
    {
      /* The only non-offsettable memories we handle are pushes.  */
      int ok = push_operand (operand, VOIDmode);

      gcc_assert (ok);

      operand = copy_rtx (operand);
      PUT_MODE (operand, word_mode);
      parts[0] = parts[1] = parts[2] = parts[3] = operand;
      return size;
    }

  if (GET_CODE (operand) == CONST_VECTOR)
    {
      scalar_int_mode imode = int_mode_for_mode (mode).require ();
      /* Caution: if we looked through a constant pool memory above,
	 the operand may actually have a different mode now.  That's
	 ok, since we want to pun this all the way back to an integer.  */
      operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
      gcc_assert (operand != NULL);
      mode = imode;
    }

  if (!TARGET_64BIT)
    {
      if (mode == DImode)
	split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
      else
	{
	  int i;

	  if (REG_P (operand))
	    {
	      gcc_assert (reload_completed);
	      for (i = 0; i < size; i++)
		parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
	    }
	  else if (offsettable_memref_p (operand))
	    {
	      operand = adjust_address (operand, SImode, 0);
	      parts[0] = operand;
	      for (i = 1; i < size; i++)
		parts[i] = adjust_address (operand, SImode, 4 * i);
	    }
	  else if (CONST_DOUBLE_P (operand))
	    {
	      const REAL_VALUE_TYPE *r;
	      long l[4];

	      r = CONST_DOUBLE_REAL_VALUE (operand);
	      switch (mode)
		{
		case E_TFmode:
		  real_to_target (l, r, mode);
		  parts[3] = gen_int_mode (l[3], SImode);
		  parts[2] = gen_int_mode (l[2], SImode);
		  break;
		case E_XFmode:
		  /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
		     long double may not be 80-bit.  */
		  real_to_target (l, r, mode);
		  parts[2] = gen_int_mode (l[2], SImode);
		  break;
		case E_DFmode:
		  REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
		  break;
		default:
		  gcc_unreachable ();
		}
	      parts[1] = gen_int_mode (l[1], SImode);
	      parts[0] = gen_int_mode (l[0], SImode);
	    }
	  else
	    gcc_unreachable ();
	}
    }
  else
    {
      if (mode == TImode)
	split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
      if (mode == XFmode || mode == TFmode)
	{
	  machine_mode upper_mode = mode == XFmode ? SImode : DImode;
	  if (REG_P (operand))
	    {
	      gcc_assert (reload_completed);
	      parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
	      parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
	    }
	  else if (offsettable_memref_p (operand))
	    {
	      operand = adjust_address (operand, DImode, 0);
	      parts[0] = operand;
	      parts[1] = adjust_address (operand, upper_mode, 8);
	    }
	  else if (CONST_DOUBLE_P (operand))
	    {
	      long l[4];

	      real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);

	      /* real_to_target puts 32-bit pieces in each long.  */
	      parts[0]
		= gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
				| ((l[1] & HOST_WIDE_INT_C (0xffffffff))
				   << 32), DImode);

	      if (upper_mode == SImode)
		parts[1] = gen_int_mode (l[2], SImode);
	      else
		parts[1]
		  = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
				  | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
				     << 32), DImode);
	    }
	  else
	    gcc_unreachable ();
	}
    }

  return size;
}
/* Emit insns to perform a move or push of DI, DF, XF, and TF values.
   Return false when normal moves are needed; true when all required
   insns have been emitted.  Operands 2-5 contain the input values
   in the correct order; operands 6-9 contain the output values.  */

void
ix86_split_long_move (rtx operands[])
{
  rtx part[2][4];
  int nparts, i, j;
  int push = 0;
  int collisions = 0;
  machine_mode mode = GET_MODE (operands[0]);
  bool collisionparts[4];

  /* The DFmode expanders may ask us to move double.
     For a 64bit target this is a single move.  By hiding the fact
     here we simplify i386.md splitters.  */
  if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
    {
      /* Optimize constant pool reference to immediates.  This is used by
	 fp moves, that force all constants to memory to allow combining.  */

      if (MEM_P (operands[1])
	  && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
	  && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
	operands[1] = get_pool_constant (XEXP (operands[1], 0));
      if (push_operand (operands[0], VOIDmode))
	{
	  operands[0] = copy_rtx (operands[0]);
	  PUT_MODE (operands[0], word_mode);
	}
      else
	operands[0] = gen_lowpart (DImode, operands[0]);
      operands[1] = gen_lowpart (DImode, operands[1]);
      emit_move_insn (operands[0], operands[1]);
      return;
    }

  /* The only non-offsettable memory we handle is push.  */
  if (push_operand (operands[0], VOIDmode))
    push = 1;
  else
    gcc_assert (!MEM_P (operands[0])
		|| offsettable_memref_p (operands[0]));

  nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
  ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));

  /* When emitting push, take care for source operands on the stack.  */
  if (push && MEM_P (operands[1])
      && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
    {
      rtx src_base = XEXP (part[1][nparts - 1], 0);

      /* Compensate for the stack decrement by 4.  */
      if (!TARGET_64BIT && nparts == 3
	  && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
	src_base = plus_constant (Pmode, src_base, 4);

      /* src_base refers to the stack pointer and is
	 automatically decreased by emitted push.  */
      for (i = 0; i < nparts; i++)
	part[1][i] = change_address (part[1][i],
				     GET_MODE (part[1][i]), src_base);
    }

  /* We need to do copy in the right order in case an address register
     of the source overlaps the destination.  */
  if (REG_P (part[0][0]) && MEM_P (part[1][0]))
    {
      rtx tmp;

      for (i = 0; i < nparts; i++)
	{
	  collisionparts[i]
	    = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
	  if (collisionparts[i])
	    collisions++;
	}

      /* Collision in the middle part can be handled by reordering.  */
      if (collisions == 1 && nparts == 3 && collisionparts[1])
	{
	  std::swap (part[0][1], part[0][2]);
	  std::swap (part[1][1], part[1][2]);
	}
      else if (collisions == 1
	       && nparts == 4
	       && (collisionparts[1] || collisionparts[2]))
	{
	  if (collisionparts[1])
	    {
	      std::swap (part[0][1], part[0][2]);
	      std::swap (part[1][1], part[1][2]);
	    }
	  else
	    {
	      std::swap (part[0][2], part[0][3]);
	      std::swap (part[1][2], part[1][3]);
	    }
	}

      /* If there are more collisions, we can't handle it by reordering.
	 Do an lea to the last part and use only one colliding move.  */
      else if (collisions > 1)
	{
	  rtx base, addr;

	  collisions = 1;

	  base = part[0][nparts - 1];

	  /* Handle the case when the last part isn't valid for lea.
	     Happens in 64-bit mode storing the 12-byte XFmode.  */
	  if (GET_MODE (base) != Pmode)
	    base = gen_rtx_REG (Pmode, REGNO (base));

	  addr = XEXP (part[1][0], 0);
	  if (TARGET_TLS_DIRECT_SEG_REFS)
	    {
	      struct ix86_address parts;
	      int ok = ix86_decompose_address (addr, &parts);
	      gcc_assert (ok);
	      /* It is not valid to use %gs: or %fs: in lea.  */
	      gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
	    }
	  emit_insn (gen_rtx_SET (base, addr));
	  part[1][0] = replace_equiv_address (part[1][0], base);
	  for (i = 1; i < nparts; i++)
	    {
	      tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
	      part[1][i] = replace_equiv_address (part[1][i], tmp);
	    }
	}
    }

  if (push)
    {
      if (!TARGET_64BIT)
	{
	  if (nparts == 3)
	    {
	      if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
		emit_insn (gen_add2_insn (stack_pointer_rtx, GEN_INT (-4)));
	      emit_move_insn (part[0][2], part[1][2]);
	    }
	  else if (nparts == 4)
	    {
	      emit_move_insn (part[0][3], part[1][3]);
	      emit_move_insn (part[0][2], part[1][2]);
	    }
	}
      else
	{
	  /* In 64bit mode we don't have 32bit push available.  In case this is
	     a register, it is OK - we will just use the larger counterpart.
	     We also retype memory - these come from an attempt to avoid REX
	     prefix on moving of the second half of a TFmode value.  */
	  if (GET_MODE (part[1][1]) == SImode)
	    {
	      switch (GET_CODE (part[1][1]))
		{
		case MEM:
		  part[1][1] = adjust_address (part[1][1], DImode, 0);
		  break;

		case REG:
		  part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
		  break;

		default:
		  gcc_unreachable ();
		}

	      if (GET_MODE (part[1][0]) == SImode)
		part[1][0] = part[1][1];
	    }
	}
      emit_move_insn (part[0][1], part[1][1]);
      emit_move_insn (part[0][0], part[1][0]);
      return;
    }

  /* Choose correct order to not overwrite the source before it is copied.  */
  if ((REG_P (part[0][0])
       && REG_P (part[1][1])
       && (REGNO (part[0][0]) == REGNO (part[1][1])
	   || (nparts == 3
	       && REGNO (part[0][0]) == REGNO (part[1][2]))
	   || (nparts == 4
	       && REGNO (part[0][0]) == REGNO (part[1][3]))))
      || (collisions > 0
	  && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
    {
      for (i = 0, j = nparts - 1; i < nparts; i++, j--)
	{
	  operands[2 + i] = part[0][j];
	  operands[6 + i] = part[1][j];
	}
    }
  else
    {
      for (i = 0; i < nparts; i++)
	{
	  operands[2 + i] = part[0][i];
	  operands[6 + i] = part[1][i];
	}
    }

  /* If optimizing for size, attempt to locally unCSE nonzero constants.  */
  if (optimize_insn_for_size_p ())
    {
      for (j = 0; j < nparts - 1; j++)
	if (CONST_INT_P (operands[6 + j])
	    && operands[6 + j] != const0_rtx
	    && REG_P (operands[2 + j]))
	  for (i = j; i < nparts - 1; i++)
	    if (CONST_INT_P (operands[7 + i])
		&& INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
	      operands[7 + i] = operands[2 + j];
    }

  for (i = 0; i < nparts; i++)
    emit_move_insn (operands[2 + i], operands[6 + i]);
}
/* Helper function of ix86_split_ashl used to generate an SImode/DImode
   left shift by a constant, either using a single shift or
   a sequence of add instructions.  */

static void
ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
{
  if (count == 1
      || (count * ix86_cost->add <= ix86_cost->shift_const
	  && !optimize_insn_for_size_p ()))
    {
      while (count-- > 0)
	emit_insn (gen_add2_insn (operand, operand));
    }
  else
    {
      rtx (*insn)(rtx, rtx, rtx);

      insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
      emit_insn (insn (operand, operand, GEN_INT (count)));
    }
}
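/* Editorial sketch (not part of GCC) of the cost tradeoff above: a left
   shift by a small constant can be replaced by repeated addition, e.g. for
   a shift by 2:

     unsigned lshift2_via_adds (unsigned x)
     {
       x += x;	// x << 1
       x += x;	// x << 2
       return x;
     }

   which is what the add sequence emits whenever COUNT times the add cost is
   no larger than the cost of one shift-by-constant and we are not
   optimizing for size.  */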
void
ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
{
  rtx (*gen_ashl3)(rtx, rtx, rtx);
  rtx (*gen_shld)(rtx, rtx, rtx);
  int half_width = GET_MODE_BITSIZE (mode) >> 1;
  machine_mode half_mode;

  rtx low[2], high[2];
  int count;

  if (CONST_INT_P (operands[2]))
    {
      split_double_mode (mode, operands, 2, low, high);
      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);

      if (count >= half_width)
	{
	  emit_move_insn (high[0], low[1]);
	  emit_move_insn (low[0], const0_rtx);

	  if (count > half_width)
	    ix86_expand_ashl_const (high[0], count - half_width, mode);
	}
      else
	{
	  gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;

	  if (!rtx_equal_p (operands[0], operands[1]))
	    emit_move_insn (operands[0], operands[1]);

	  emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
	  ix86_expand_ashl_const (low[0], count, mode);
	}
      return;
    }

  split_double_mode (mode, operands, 1, low, high);
  half_mode = mode == DImode ? SImode : DImode;

  gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;

  if (operands[1] == const1_rtx)
    {
      /* Assuming we've chosen a QImode capable registers, then 1 << N
	 can be done with two 32/64-bit shifts, no branches, no cmoves.  */
      if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
	{
	  rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);

	  ix86_expand_clear (low[0]);
	  ix86_expand_clear (high[0]);
	  emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));

	  d = gen_lowpart (QImode, low[0]);
	  d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
	  s = gen_rtx_EQ (QImode, flags, const0_rtx);
	  emit_insn (gen_rtx_SET (d, s));

	  d = gen_lowpart (QImode, high[0]);
	  d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
	  s = gen_rtx_NE (QImode, flags, const0_rtx);
	  emit_insn (gen_rtx_SET (d, s));
	}

      /* Otherwise, we can get the same results by manually performing
	 a bit extract operation on bit 5/6, and then performing the two
	 shifts.  The two methods of getting 0/1 into low/high are exactly
	 the same size.  Avoiding the shift in the bit extract case helps
	 pentium4 a bit; no one else seems to care much either way.  */
      else
	{
	  rtx (*gen_lshr3)(rtx, rtx, rtx);
	  rtx (*gen_and3)(rtx, rtx, rtx);
	  rtx (*gen_xor3)(rtx, rtx, rtx);
	  HOST_WIDE_INT bits;
	  rtx x;

	  if (mode == DImode)
	    {
	      gen_lshr3 = gen_lshrsi3;
	      gen_and3 = gen_andsi3;
	      gen_xor3 = gen_xorsi3;
	      bits = 5;
	    }
	  else
	    {
	      gen_lshr3 = gen_lshrdi3;
	      gen_and3 = gen_anddi3;
	      gen_xor3 = gen_xordi3;
	      bits = 6;
	    }

	  if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
	    x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
	  else
	    x = gen_lowpart (half_mode, operands[2]);
	  emit_insn (gen_rtx_SET (high[0], x));

	  emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
	  emit_insn (gen_and3 (high[0], high[0], const1_rtx));
	  emit_move_insn (low[0], high[0]);
	  emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
	}

      emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
      emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
      return;
    }

  if (operands[1] == constm1_rtx)
    {
      /* For -1 << N, we can avoid the shld instruction, because we
	 know that we're shifting 0...31/63 ones into a -1.  */
      emit_move_insn (low[0], constm1_rtx);
      if (optimize_insn_for_size_p ())
	emit_move_insn (high[0], low[0]);
      else
	emit_move_insn (high[0], constm1_rtx);
    }
  else
    {
      gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;

      if (!rtx_equal_p (operands[0], operands[1]))
	emit_move_insn (operands[0], operands[1]);

      split_double_mode (mode, operands, 1, low, high);
      emit_insn (gen_shld (high[0], low[0], operands[2]));
    }

  emit_insn (gen_ashl3 (low[0], low[0], operands[2]));

  if (TARGET_CMOVE && scratch)
    {
      ix86_expand_clear (scratch);
      emit_insn (gen_x86_shift_adj_1
		 (half_mode, high[0], low[0], operands[2], scratch));
    }
  else
    emit_insn (gen_x86_shift_adj_2 (half_mode, high[0], low[0], operands[2]));
}
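/* Editorial sketch (not part of GCC) of the double-word left shift being
   synthesized above, for a DImode value split into two 32-bit halves:

     #include <stdint.h>

     void ashl64 (uint32_t *lo, uint32_t *hi, unsigned count)  // 0..63
     {
       if (count >= 32)
	 {
	   *hi = *lo << (count - 32);
	   *lo = 0;
	 }
       else if (count)
	 {
	   *hi = (*hi << count) | (*lo >> (32 - count));	// shld
	   *lo <<= count;
	 }
     }

   For a variable count the hardware shift masks the amount to 0..31, so the
   emitted code performs both shifts unconditionally and then fixes up the
   count >= 32 case with a cmov (x86_shift_adj_1) or a branch
   (x86_shift_adj_2).  */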
void
ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
{
  rtx (*gen_ashr3)(rtx, rtx, rtx)
    = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
  rtx (*gen_shrd)(rtx, rtx, rtx);
  int half_width = GET_MODE_BITSIZE (mode) >> 1;

  rtx low[2], high[2];
  int count;

  if (CONST_INT_P (operands[2]))
    {
      split_double_mode (mode, operands, 2, low, high);
      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);

      if (count == GET_MODE_BITSIZE (mode) - 1)
	{
	  emit_move_insn (high[0], high[1]);
	  emit_insn (gen_ashr3 (high[0], high[0],
				GEN_INT (half_width - 1)));
	  emit_move_insn (low[0], high[0]);
	}
      else if (count >= half_width)
	{
	  emit_move_insn (low[0], high[1]);
	  emit_move_insn (high[0], low[0]);
	  emit_insn (gen_ashr3 (high[0], high[0],
				GEN_INT (half_width - 1)));

	  if (count > half_width)
	    emit_insn (gen_ashr3 (low[0], low[0],
				  GEN_INT (count - half_width)));
	}
      else
	{
	  gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

	  if (!rtx_equal_p (operands[0], operands[1]))
	    emit_move_insn (operands[0], operands[1]);

	  emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
	  emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
	}
    }
  else
    {
      machine_mode half_mode;

      gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

      if (!rtx_equal_p (operands[0], operands[1]))
	emit_move_insn (operands[0], operands[1]);

      split_double_mode (mode, operands, 1, low, high);
      half_mode = mode == DImode ? SImode : DImode;

      emit_insn (gen_shrd (low[0], high[0], operands[2]));
      emit_insn (gen_ashr3 (high[0], high[0], operands[2]));

      if (TARGET_CMOVE && scratch)
	{
	  emit_move_insn (scratch, high[0]);
	  emit_insn (gen_ashr3 (scratch, scratch,
				GEN_INT (half_width - 1)));
	  emit_insn (gen_x86_shift_adj_1
		     (half_mode, low[0], high[0], operands[2], scratch));
	}
      else
	emit_insn (gen_x86_shift_adj_3
		   (half_mode, low[0], high[0], operands[2]));
    }
}
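/* Editorial sketch (not part of GCC), the analogue for the arithmetic
   right shift split above:

     #include <stdint.h>

     void ashr64 (uint32_t *lo, int32_t *hi, unsigned count)  // 0..63
     {
       if (count >= 32)
	 {
	   *lo = (uint32_t) (*hi >> (count - 32));
	   *hi >>= 31;			// replicate the sign bit
	 }
       else if (count)
	 {
	   *lo = (*lo >> count) | ((uint32_t) *hi << (32 - count)); // shrd
	   *hi >>= count;
	 }
     }

   The count == bitsize - 1 special case above is just the sign-broadcast:
   both halves become copies of the sign bit.  */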
void
ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
{
  rtx (*gen_lshr3)(rtx, rtx, rtx)
    = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
  rtx (*gen_shrd)(rtx, rtx, rtx);
  int half_width = GET_MODE_BITSIZE (mode) >> 1;

  rtx low[2], high[2];
  int count;

  if (CONST_INT_P (operands[2]))
    {
      split_double_mode (mode, operands, 2, low, high);
      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);

      if (count >= half_width)
	{
	  emit_move_insn (low[0], high[1]);
	  ix86_expand_clear (high[0]);

	  if (count > half_width)
	    emit_insn (gen_lshr3 (low[0], low[0],
				  GEN_INT (count - half_width)));
	}
      else
	{
	  gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

	  if (!rtx_equal_p (operands[0], operands[1]))
	    emit_move_insn (operands[0], operands[1]);

	  emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
	  emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
	}
    }
  else
    {
      machine_mode half_mode;

      gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

      if (!rtx_equal_p (operands[0], operands[1]))
	emit_move_insn (operands[0], operands[1]);

      split_double_mode (mode, operands, 1, low, high);
      half_mode = mode == DImode ? SImode : DImode;

      emit_insn (gen_shrd (low[0], high[0], operands[2]));
      emit_insn (gen_lshr3 (high[0], high[0], operands[2]));

      if (TARGET_CMOVE && scratch)
	{
	  ix86_expand_clear (scratch);
	  emit_insn (gen_x86_shift_adj_1
		     (half_mode, low[0], high[0], operands[2], scratch));
	}
      else
	emit_insn (gen_x86_shift_adj_2
		   (half_mode, low[0], high[0], operands[2]));
    }
}
/* Return mode for the memcpy/memset loop counter.  Prefer SImode over
   DImode for constant loop counts.  */

static machine_mode
counter_mode (rtx count_exp)
{
  if (GET_MODE (count_exp) != VOIDmode)
    return GET_MODE (count_exp);
  if (!CONST_INT_P (count_exp))
    return Pmode;
  if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
    return DImode;
  return SImode;
}
/* When ISSETMEM is FALSE, output a simple loop to move memory pointed to by
   SRCPTR to DESTPTR via chunks of MODE unrolled UNROLL times, overall size is
   COUNT specified in bytes.  When ISSETMEM is TRUE, output the equivalent loop
   to set memory by VALUE (supposed to be in MODE).

   The size is rounded down to a whole number of chunks moved at once.
   SRCMEM and DESTMEM provide MEM rtx to feed proper aliasing info.  */

static void
expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
			       rtx destptr, rtx srcptr, rtx value,
			       rtx count, machine_mode mode, int unroll,
			       int expected_size, bool issetmem)
{
  rtx_code_label *out_label, *top_label;
  rtx iter, tmp;
  machine_mode iter_mode = counter_mode (count);
  int piece_size_n = GET_MODE_SIZE (mode) * unroll;
  rtx piece_size = GEN_INT (piece_size_n);
  rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
  rtx size;
  int i;

  top_label = gen_label_rtx ();
  out_label = gen_label_rtx ();
  iter = gen_reg_rtx (iter_mode);

  size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
			      NULL, 1, OPTAB_DIRECT);
  /* Those two should combine.  */
  if (piece_size == const1_rtx)
    {
      emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
			       true, out_label);
      predict_jump (REG_BR_PROB_BASE * 10 / 100);
    }
  emit_move_insn (iter, const0_rtx);

  emit_label (top_label);

  tmp = convert_modes (Pmode, iter_mode, iter, true);

  /* This assert could be relaxed - in this case we'll need to compute
     smallest power of two, containing in PIECE_SIZE_N and pass it to
     offset_address.  */
  gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
  destmem = offset_address (destmem, tmp, piece_size_n);
  destmem = adjust_address (destmem, mode, 0);

  if (!issetmem)
    {
      srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
      srcmem = adjust_address (srcmem, mode, 0);

      /* When unrolling for chips that reorder memory reads and writes,
	 we can save registers by using single temporary.
	 Also using 4 temporaries is overkill in 32bit mode.  */
      if (!TARGET_64BIT && 0)
	{
	  for (i = 0; i < unroll; i++)
	    {
	      if (i)
		{
		  destmem = adjust_address (copy_rtx (destmem), mode,
					    GET_MODE_SIZE (mode));
		  srcmem = adjust_address (copy_rtx (srcmem), mode,
					   GET_MODE_SIZE (mode));
		}
	      emit_move_insn (destmem, srcmem);
	    }
	}
      else
	{
	  rtx tmpreg[4];
	  gcc_assert (unroll <= 4);
	  for (i = 0; i < unroll; i++)
	    {
	      tmpreg[i] = gen_reg_rtx (mode);
	      if (i)
		srcmem = adjust_address (copy_rtx (srcmem), mode,
					 GET_MODE_SIZE (mode));
	      emit_move_insn (tmpreg[i], srcmem);
	    }
	  for (i = 0; i < unroll; i++)
	    {
	      if (i)
		destmem = adjust_address (copy_rtx (destmem), mode,
					  GET_MODE_SIZE (mode));
	      emit_move_insn (destmem, tmpreg[i]);
	    }
	}
    }
  else
    for (i = 0; i < unroll; i++)
      {
	if (i)
	  destmem = adjust_address (copy_rtx (destmem), mode,
				    GET_MODE_SIZE (mode));
	emit_move_insn (destmem, value);
      }

  tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
			     true, OPTAB_LIB_WIDEN);
  if (tmp != iter)
    emit_move_insn (iter, tmp);

  emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
			   true, top_label);
  if (expected_size != -1)
    {
      expected_size /= GET_MODE_SIZE (mode) * unroll;
      if (expected_size == 0)
	predict_jump (0);
      else if (expected_size > REG_BR_PROB_BASE)
	predict_jump (REG_BR_PROB_BASE - 1);
      else
	predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2)
		      / expected_size);
    }
  else
    predict_jump (REG_BR_PROB_BASE * 80 / 100);
  iter = ix86_zero_extend_to_Pmode (iter);
  tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
			     true, OPTAB_LIB_WIDEN);
  if (tmp != destptr)
    emit_move_insn (destptr, tmp);
  if (!issetmem)
    {
      tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
				 true, OPTAB_LIB_WIDEN);
      if (tmp != srcptr)
	emit_move_insn (srcptr, tmp);
    }
  emit_label (out_label);
}
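/* Editorial sketch (not part of GCC) of the loop shape emitted above, here
   for a copy with a 4-byte MODE and UNROLL of 2 (an 8-byte chunk per
   iteration):

     #include <stddef.h>
     #include <string.h>

     void copy_loop (char *dest, const char *src, size_t count)
     {
       size_t size = count & ~(size_t) 7;	// count & piece_size_mask
       for (size_t iter = 0; iter < size; iter += 8)
	 {
	   memcpy (dest + iter, src + iter, 4);		// unrolled move 1
	   memcpy (dest + iter + 4, src + iter + 4, 4);	// unrolled move 2
	 }
       // DESTPTR/SRCPTR are then advanced by SIZE; the remaining
       // count % 8 bytes are left for the epilogue.
     }
*/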
/* Divide COUNTREG by SCALE.  */
static rtx
scale_counter (rtx countreg, int scale)
{
  rtx sc;

  if (scale == 1)
    return countreg;
  if (CONST_INT_P (countreg))
    return GEN_INT (INTVAL (countreg) / scale);
  gcc_assert (REG_P (countreg));

  sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
			    GEN_INT (exact_log2 (scale)),
			    NULL, 1, OPTAB_DIRECT);
  return sc;
}
/* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
   When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
   When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
   For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
   ORIG_VALUE is the original value passed to memset to fill the memory with.
   Other arguments have the same meaning as for the previous function.  */

static void
expand_set_or_cpymem_via_rep (rtx destmem, rtx srcmem,
			      rtx destptr, rtx srcptr, rtx value, rtx orig_value,
			      rtx count,
			      machine_mode mode, bool issetmem)
{
  rtx destexp;
  rtx srcexp;
  rtx countreg;
  HOST_WIDE_INT rounded_count;

  /* If possible, it is shorter to use rep movs.
     TODO: Maybe it is better to move this logic to decide_alg.  */
  if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
      && (!issetmem || orig_value == const0_rtx))
    mode = SImode;

  if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
    destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);

  countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
						       GET_MODE_SIZE (mode)));
  if (mode != QImode)
    {
      destexp = gen_rtx_ASHIFT (Pmode, countreg,
				GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
      destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
    }
  else
    destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
  if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
    {
      rounded_count
	= ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
      destmem = shallow_copy_rtx (destmem);
      set_mem_size (destmem, rounded_count);
    }
  else if (MEM_SIZE_KNOWN_P (destmem))
    clear_mem_size (destmem);

  if (issetmem)
    {
      value = force_reg (mode, gen_lowpart (mode, value));
      emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
    }
  else
    {
      if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
	srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
      if (mode != QImode)
	{
	  srcexp = gen_rtx_ASHIFT (Pmode, countreg,
				   GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
	  srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
	}
      else
	srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
      if (CONST_INT_P (count))
	{
	  rounded_count
	    = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
	  srcmem = shallow_copy_rtx (srcmem);
	  set_mem_size (srcmem, rounded_count);
	}
      else if (MEM_SIZE_KNOWN_P (srcmem))
	clear_mem_size (srcmem);
      emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
			      destexp, srcexp));
    }
}
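/* Editorial sketch (not part of GCC) of what the rep-prefixed expansion
   computes for a 4-byte MODE, in plain C:

     #include <stddef.h>
     #include <string.h>

     void rep_movsd_equiv (char **destp, const char **srcp, size_t count)
     {
       size_t n = count / 4;		// scale_counter
       memcpy (*destp, *srcp, n * 4);	// body of "rep movsd"
       *destp += n * 4;			// DESTEXP: final dest pointer
       *srcp += n * 4;			// SRCEXP: final src pointer
     }

   DESTEXP/SRCEXP exist so the RTL pattern can describe the final values the
   string instruction leaves in the pointer registers.  */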
/* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
   DESTMEM.
   SRC is passed by pointer to be updated on return.
   Return value is updated DST.  */
static rtx
emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
	     HOST_WIDE_INT size_to_move)
{
  rtx dst = destmem, src = *srcmem, adjust, tempreg;
  enum insn_code code;
  machine_mode move_mode;
  int piece_size, i;

  /* Find the widest mode in which we could perform moves.
     Start with the biggest power of 2 less than SIZE_TO_MOVE and halve
     it until a move of such size is supported.  */
  piece_size = 1 << floor_log2 (size_to_move);
  while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
	 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
    {
      gcc_assert (piece_size > 1);
      piece_size >>= 1;
    }

  /* Find the corresponding vector mode with the same size as MOVE_MODE.
     MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.).  */
  if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
    {
      int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
      if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
	  || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
	{
	  move_mode = word_mode;
	  piece_size = GET_MODE_SIZE (move_mode);
	  code = optab_handler (mov_optab, move_mode);
	}
    }
  gcc_assert (code != CODE_FOR_nothing);

  dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
  src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);

  /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZE moves.  */
  gcc_assert (size_to_move % piece_size == 0);
  adjust = GEN_INT (piece_size);
  for (i = 0; i < size_to_move; i += piece_size)
    {
      /* We move from memory to memory, so we'll need to do it via
	 a temporary register.  */
      tempreg = gen_reg_rtx (move_mode);
      emit_insn (GEN_FCN (code) (tempreg, src));
      emit_insn (GEN_FCN (code) (dst, tempreg));

      emit_move_insn (destptr,
		      gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
      emit_move_insn (srcptr,
		      gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));

      dst = adjust_automodify_address_nv (dst, move_mode, destptr,
					  piece_size);
      src = adjust_automodify_address_nv (src, move_mode, srcptr,
					  piece_size);
    }

  /* Update DST and SRC rtx.  */
  *srcmem = src;
  return dst;
}
/* Helper function for the string operations below.  Test VARIABLE whether
   it is aligned to VALUE bytes.  If true, jump to the label.  */
static rtx_code_label *
ix86_expand_aligntest (rtx variable, int value, bool epilogue)
{
  rtx_code_label *label = gen_label_rtx ();
  rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
  if (GET_MODE (variable) == DImode)
    emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
  else
    emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
  emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
			   1, label);
  if (epilogue)
    predict_jump (REG_BR_PROB_BASE * 50 / 100);
  else
    predict_jump (REG_BR_PROB_BASE * 90 / 100);
  return label;
}
/* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST.  */
static void
expand_cpymem_epilogue (rtx destmem, rtx srcmem,
			rtx destptr, rtx srcptr, rtx count, int max_size)
{
  rtx src, dest;
  if (CONST_INT_P (count))
    {
      HOST_WIDE_INT countval = INTVAL (count);
      HOST_WIDE_INT epilogue_size = countval % max_size;
      int i;

      /* For now MAX_SIZE should be a power of 2.  This assert could be
	 relaxed, but it'll require a bit more complicated epilogue
	 expanding.  */
      gcc_assert ((max_size & (max_size - 1)) == 0);
      for (i = max_size; i >= 1; i >>= 1)
	{
	  if (epilogue_size & i)
	    destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
	}
      return;
    }
  if (max_size > 8)
    {
      count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
				   count, 1, OPTAB_DIRECT);
      expand_set_or_cpymem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
				     count, QImode, 1, 4, false);
      return;
    }

  /* When there are stringops, we can cheaply increase dest and src pointers.
     Otherwise we save code size by maintaining offset (zero is readily
     available from preceding rep operation) and using x86 addressing modes.
   */
  if (TARGET_SINGLE_STRINGOP)
    {
      if (max_size > 4)
	{
	  rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
	  src = change_address (srcmem, SImode, srcptr);
	  dest = change_address (destmem, SImode, destptr);
	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	}
      if (max_size > 2)
	{
	  rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
	  src = change_address (srcmem, HImode, srcptr);
	  dest = change_address (destmem, HImode, destptr);
	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	}
      if (max_size > 1)
	{
	  rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
	  src = change_address (srcmem, QImode, srcptr);
	  dest = change_address (destmem, QImode, destptr);
	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	}
    }
  else
    {
      rtx offset = force_reg (Pmode, const0_rtx);
      rtx tmp;

      if (max_size > 4)
	{
	  rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
	  src = change_address (srcmem, SImode, srcptr);
	  dest = change_address (destmem, SImode, destptr);
	  emit_move_insn (dest, src);
	  tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
				     true, OPTAB_LIB_WIDEN);
	  if (tmp != offset)
	    emit_move_insn (offset, tmp);
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	}
      if (max_size > 2)
	{
	  rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
	  tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
	  src = change_address (srcmem, HImode, tmp);
	  tmp = gen_rtx_PLUS (Pmode, destptr, offset);
	  dest = change_address (destmem, HImode, tmp);
	  emit_move_insn (dest, src);
	  tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
				     true, OPTAB_LIB_WIDEN);
	  if (tmp != offset)
	    emit_move_insn (offset, tmp);
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	}
      if (max_size > 1)
	{
	  rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
	  tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
	  src = change_address (srcmem, QImode, tmp);
	  tmp = gen_rtx_PLUS (Pmode, destptr, offset);
	  dest = change_address (destmem, QImode, tmp);
	  emit_move_insn (dest, src);
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	}
    }
}
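/* Editorial sketch (not part of GCC) of the constant-count epilogue
   decomposition above: the remainder is handled by one move per set bit,
   from the largest power of two down:

     #include <string.h>

     void copy_epilogue (char *dest, const char *src,
			 unsigned remainder)	// remainder < max_size
     {
       for (unsigned i = 8; i >= 1; i >>= 1)	// max_size == 8 here
	 if (remainder & i)
	   {
	     memcpy (dest, src, i);
	     dest += i;
	     src += i;
	   }
     }

   e.g. a remainder of 7 becomes one 4-byte, one 2-byte and one 1-byte
   move.  */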
/* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
   with value PROMOTED_VAL.
   Return value is updated DST.  */
static rtx
emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
	     HOST_WIDE_INT size_to_move)
{
  rtx dst = destmem, adjust;
  enum insn_code code;
  machine_mode move_mode;
  int piece_size, i;

  /* Find the widest mode in which we could perform moves.
     Start with the biggest power of 2 less than SIZE_TO_MOVE and halve
     it until a move of such size is supported.  */
  move_mode = GET_MODE (promoted_val);
  if (move_mode == VOIDmode)
    move_mode = QImode;
  if (size_to_move < GET_MODE_SIZE (move_mode))
    {
      unsigned int move_bits = size_to_move * BITS_PER_UNIT;
      move_mode = int_mode_for_size (move_bits, 0).require ();
      promoted_val = gen_lowpart (move_mode, promoted_val);
    }
  piece_size = GET_MODE_SIZE (move_mode);
  code = optab_handler (mov_optab, move_mode);
  gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);

  dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);

  /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZE moves.  */
  gcc_assert (size_to_move % piece_size == 0);
  adjust = GEN_INT (piece_size);
  for (i = 0; i < size_to_move; i += piece_size)
    {
      if (piece_size <= GET_MODE_SIZE (word_mode))
	{
	  emit_insn (gen_strset (destptr, dst, promoted_val));
	  dst = adjust_automodify_address_nv (dst, move_mode, destptr,
					      piece_size);
	  continue;
	}

      emit_insn (GEN_FCN (code) (dst, promoted_val));

      emit_move_insn (destptr,
		      gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));

      dst = adjust_automodify_address_nv (dst, move_mode, destptr,
					  piece_size);
    }

  /* Update DST rtx.  */
  return dst;
}
/* Output code to set at most count & (max_size - 1) bytes starting by DEST.  */
static void
expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
				 rtx count, int max_size)
{
  count = expand_simple_binop (counter_mode (count), AND, count,
			       GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
  expand_set_or_cpymem_via_loop (destmem, NULL, destptr, NULL,
				 gen_lowpart (QImode, value), count, QImode,
				 1, max_size / 2, true);
}
/* Output code to set at most count & (max_size - 1) bytes starting by DEST.  */
static void
expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
			rtx count, int max_size)
{
  rtx dest;

  if (CONST_INT_P (count))
    {
      HOST_WIDE_INT countval = INTVAL (count);
      HOST_WIDE_INT epilogue_size = countval % max_size;
      int i;

      /* For now MAX_SIZE should be a power of 2.  This assert could be
	 relaxed, but it'll require a bit more complicated epilogue
	 expanding.  */
      gcc_assert ((max_size & (max_size - 1)) == 0);
      for (i = max_size; i >= 1; i >>= 1)
	{
	  if (epilogue_size & i)
	    {
	      if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
		destmem = emit_memset (destmem, destptr, vec_value, i);
	      else
		destmem = emit_memset (destmem, destptr, value, i);
	    }
	}
      return;
    }
  if (max_size > 32)
    {
      expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
      return;
    }
  if (max_size > 16)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
      if (TARGET_64BIT)
	{
	  dest = change_address (destmem, DImode, destptr);
	  emit_insn (gen_strset (destptr, dest, value));
	  dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
	  emit_insn (gen_strset (destptr, dest, value));
	}
      else
	{
	  dest = change_address (destmem, SImode, destptr);
	  emit_insn (gen_strset (destptr, dest, value));
	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
	  emit_insn (gen_strset (destptr, dest, value));
	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
	  emit_insn (gen_strset (destptr, dest, value));
	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
	  emit_insn (gen_strset (destptr, dest, value));
	}
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 8)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
      if (TARGET_64BIT)
	{
	  dest = change_address (destmem, DImode, destptr);
	  emit_insn (gen_strset (destptr, dest, value));
	}
      else
	{
	  dest = change_address (destmem, SImode, destptr);
	  emit_insn (gen_strset (destptr, dest, value));
	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
	  emit_insn (gen_strset (destptr, dest, value));
	}
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 4)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
      dest = change_address (destmem, SImode, destptr);
      emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 2)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
      dest = change_address (destmem, HImode, destptr);
      emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 1)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
      dest = change_address (destmem, QImode, destptr);
      emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
}
/* Adjust COUNTER by the VALUE.  */
static void
ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
{
  emit_insn (gen_add2_insn (countreg, GEN_INT (-value)));
}
/* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
   DESTMEM to align it to DESIRED_ALIGNMENT.  Original alignment is ALIGN.
   Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
   ignored.
   Return value is updated DESTMEM.  */

static rtx
expand_set_or_cpymem_prologue (rtx destmem, rtx srcmem,
			       rtx destptr, rtx srcptr, rtx value,
			       rtx vec_value, rtx count, int align,
			       int desired_alignment, bool issetmem)
{
  int i;
  for (i = 1; i < desired_alignment; i <<= 1)
    {
      if (align <= i)
	{
	  rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
	  if (issetmem)
	    {
	      if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
		destmem = emit_memset (destmem, destptr, vec_value, i);
	      else
		destmem = emit_memset (destmem, destptr, value, i);
	    }
	  else
	    destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
	  ix86_adjust_counter (count, i);
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	  set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
	}
    }
  return destmem;
}
/* Test if COUNT&SIZE is nonzero and if so, expand a cpymem
   or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
   and jump to DONE_LABEL.  */
static void
expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem,
			       rtx destptr, rtx srcptr,
			       rtx value, rtx vec_value,
			       rtx count, int size,
			       rtx done_label, bool issetmem)
{
  rtx_code_label *label = ix86_expand_aligntest (count, size, false);
  machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
  rtx modesize;
  int n;

  /* If we do not have vector value to copy, we must reduce size.  */
  if (issetmem)
    {
      if (!vec_value)
	{
	  if (GET_MODE (value) == VOIDmode && size > 8)
	    mode = Pmode;
	  else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
	    mode = GET_MODE (value);
	}
      else
	mode = GET_MODE (vec_value), value = vec_value;
    }
  else
    {
      /* Choose appropriate vector mode.  */
      if (size >= 32)
	mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
      else if (size >= 16)
	mode = TARGET_SSE ? V16QImode : DImode;
      srcmem = change_address (srcmem, mode, srcptr);
    }
  destmem = change_address (destmem, mode, destptr);
  modesize = GEN_INT (GET_MODE_SIZE (mode));
  gcc_assert (GET_MODE_SIZE (mode) <= size);
  for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
    {
      if (issetmem)
	emit_move_insn (destmem, gen_lowpart (mode, value));
      else
	{
	  emit_move_insn (destmem, srcmem);
	  srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
	}
      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
    }

  destmem = offset_address (destmem, count, 1);
  destmem = offset_address (destmem, GEN_INT (-2 * size),
			    GET_MODE_SIZE (mode));
  if (!issetmem)
    {
      srcmem = offset_address (srcmem, count, 1);
      srcmem = offset_address (srcmem, GEN_INT (-2 * size),
			       GET_MODE_SIZE (mode));
    }
  for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
    {
      if (issetmem)
	emit_move_insn (destmem, gen_lowpart (mode, value));
      else
	{
	  emit_move_insn (destmem, srcmem);
	  srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
	}
      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
    }
  emit_jump_insn (gen_jump (done_label));
  emit_barrier ();

  emit_label (label);
  LABEL_NUSES (label) = 1;
}
/* Handle small memcpy (up to SIZE, which is supposed to be a small power
   of 2), and get ready for the main memcpy loop by copying the initial
   DESIRED_ALIGN-ALIGN bytes and last SIZE bytes, adjusting
   DESTPTR/SRCPTR/COUNT in a way we can proceed with a loop copying SIZE
   bytes at once.  Do moves in MODE.
   DONE_LABEL is a label after the whole copying sequence.  The label is
   created on demand if *DONE_LABEL is NULL.
   MIN_SIZE is minimal size of block copied.  This value gets adjusted for
   new bounds after the initial copies.

   DESTMEM/SRCMEM are memory expressions pointing to the copied block,
   DESTPTR/SRCPTR are pointers to the block.  DYNAMIC_CHECK indicates whether
   we will dispatch to a library call for large blocks.

   In pseudocode we do:

   if (COUNT < SIZE)
     {
       Assume that SIZE is 4.  Bigger sizes are handled analogously.
       if (COUNT & 4)
	 {
	   copy 4 bytes from SRCPTR to DESTPTR
	   copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
	   goto done_label
	 }
       if (!COUNT)
	 goto done_label;
       copy 1 byte from SRCPTR to DESTPTR
       if (COUNT & 2)
	 {
	   copy 2 bytes from SRCPTR to DESTPTR
	   copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
	 }
     }
   else
     {
       copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
       copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT - SIZE

       OLD_DESTPTR = DESTPTR;
       Align DESTPTR up to DESIRED_ALIGN
       SRCPTR += DESTPTR - OLD_DESTPTR
       COUNT -= DESTPTR - OLD_DESTPTR
       unless DYNAMIC_CHECK
	 Round COUNT down to multiple of SIZE
       << optional caller supplied zero size guard is here >>
       << optional caller supplied dynamic check is here >>
       << caller supplied main copy loop is here >>
     }
   done_label:
  */
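/* Editorial sketch (not part of GCC) of the overlapping head/tail trick in
   the pseudocode above, implemented by the function below, for SIZE == 4:
   the first and last SIZE bytes are copied unconditionally (the two copies
   may overlap), which covers any count in SIZE..2*SIZE-1 without branching
   on the exact length:

     #include <string.h>
     #include <stddef.h>

     void copy_head_tail (char *dest, const char *src, size_t count)
     {
       // Valid for 4 <= count <= 7; bigger counts take the main loop.
       memcpy (dest, src, 4);
       memcpy (dest + count - 4, src + count - 4, 4);
     }
*/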
static void
expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
							    rtx *destptr, rtx *srcptr,
							    machine_mode mode,
							    rtx value, rtx vec_value,
							    rtx *count,
							    rtx_code_label **done_label,
							    int size,
							    int desired_align,
							    int align,
							    unsigned HOST_WIDE_INT *min_size,
							    bool dynamic_check,
							    bool issetmem)
{
  rtx_code_label *loop_label = NULL, *label;
  int n;
  rtx modesize;
  int prolog_size = 0;
  rtx mode_value;

  /* Chose proper value to copy.  */
  if (issetmem && VECTOR_MODE_P (mode))
    mode_value = vec_value;
  else
    mode_value = value;
  gcc_assert (GET_MODE_SIZE (mode) <= size);

  /* See if block is big or small, handle small blocks.  */
  if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT) size)
    {
      int size2 = size;
      loop_label = gen_label_rtx ();

      if (!*done_label)
	*done_label = gen_label_rtx ();

      emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
			       1, loop_label);
      size2 >>= 1;

      /* Handle sizes > 3.  */
      for (;size2 > 2; size2 >>= 1)
	expand_small_cpymem_or_setmem (destmem, srcmem,
				       *destptr, *srcptr,
				       value, vec_value,
				       *count,
				       size2, *done_label, issetmem);
      /* Nothing to copy?  Jump to DONE_LABEL if so.  */
      emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
			       1, *done_label);

      /* Do a byte copy.  */
      destmem = change_address (destmem, QImode, *destptr);
      if (issetmem)
	emit_move_insn (destmem, gen_lowpart (QImode, value));
      else
	{
	  srcmem = change_address (srcmem, QImode, *srcptr);
	  emit_move_insn (destmem, srcmem);
	}

      /* Handle sizes 2 and 3.  */
      label = ix86_expand_aligntest (*count, 2, false);
      destmem = change_address (destmem, HImode, *destptr);
      destmem = offset_address (destmem, *count, 1);
      destmem = offset_address (destmem, GEN_INT (-2), 2);
      if (issetmem)
	emit_move_insn (destmem, gen_lowpart (HImode, value));
      else
	{
	  srcmem = change_address (srcmem, HImode, *srcptr);
	  srcmem = offset_address (srcmem, *count, 1);
	  srcmem = offset_address (srcmem, GEN_INT (-2), 2);
	  emit_move_insn (destmem, srcmem);
	}

      emit_label (label);
      LABEL_NUSES (label) = 1;
      emit_jump_insn (gen_jump (*done_label));
      emit_barrier ();
    }
  else
    gcc_assert (*min_size >= (unsigned HOST_WIDE_INT) size
		|| UINTVAL (*count) >= (unsigned HOST_WIDE_INT) size);

  /* Start memcpy for COUNT >= SIZE.  */
  if (loop_label)
    {
      emit_label (loop_label);
      LABEL_NUSES (loop_label) = 1;
    }

  /* Copy first desired_align bytes.  */
  if (!issetmem)
    srcmem = change_address (srcmem, mode, *srcptr);
  destmem = change_address (destmem, mode, *destptr);
  modesize = GEN_INT (GET_MODE_SIZE (mode));
  for (n = 0; prolog_size < desired_align - align; n++)
    {
      if (issetmem)
	emit_move_insn (destmem, mode_value);
      else
	{
	  emit_move_insn (destmem, srcmem);
	  srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
	}
      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
      prolog_size += GET_MODE_SIZE (mode);
    }

  /* Copy last SIZE bytes.  */
  destmem = offset_address (destmem, *count, 1);
  destmem = offset_address (destmem,
			    GEN_INT (-size - prolog_size),
			    1);
  if (issetmem)
    emit_move_insn (destmem, mode_value);
  else
    {
      srcmem = offset_address (srcmem, *count, 1);
      srcmem = offset_address (srcmem,
			       GEN_INT (-size - prolog_size),
			       1);
      emit_move_insn (destmem, srcmem);
    }
  for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
    {
      destmem = offset_address (destmem, modesize, 1);
      if (issetmem)
	emit_move_insn (destmem, mode_value);
      else
	{
	  srcmem = offset_address (srcmem, modesize, 1);
	  emit_move_insn (destmem, srcmem);
	}
    }

  /* Align destination.  */
  if (desired_align > 1 && desired_align > align)
    {
      rtx saveddest = *destptr;

      gcc_assert (desired_align <= size);
      /* Align destptr up, place it to new register.  */
      *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
				      GEN_INT (prolog_size),
				      NULL_RTX, 1, OPTAB_DIRECT);
      if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
	REG_POINTER (*destptr) = 1;
      *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
				      GEN_INT (-desired_align),
				      *destptr, 1, OPTAB_DIRECT);
      /* See how many bytes we skipped.  */
      saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
				       *destptr,
				       saveddest, 1, OPTAB_DIRECT);
      /* Adjust srcptr and count.  */
      if (!issetmem)
	*srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
				       saveddest, *srcptr, 1, OPTAB_DIRECT);
      *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
				    saveddest, *count, 1, OPTAB_DIRECT);
      /* We copied at most size + prolog_size.  */
      if (*min_size > (unsigned HOST_WIDE_INT) (size + prolog_size))
	*min_size
	  = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT) size);
      else
	*min_size = 0;

      /* Our loops always round down the block size, but for dispatch to
	 library we need precise value.  */
      if (dynamic_check)
	*count = expand_simple_binop (GET_MODE (*count), AND, *count,
				      GEN_INT (-size), *count, 1, OPTAB_DIRECT);
    }
  else
    {
      gcc_assert (prolog_size == 0);
      /* Decrease count, so we won't end up copying last word twice.  */
      if (!CONST_INT_P (*count))
	*count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
				      constm1_rtx, *count, 1, OPTAB_DIRECT);
      else
	*count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
				      (unsigned HOST_WIDE_INT) size));
      if (*min_size)
	*min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT) size);
    }
}
/* This function is like the previous one, except here we know how many bytes
   need to be copied.  That allows us to update alignment not only of DST, which
   is returned, but also of SRC, which is passed as a pointer for that
   reason.  */
static rtx
expand_set_or_cpymem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
					rtx srcreg, rtx value, rtx vec_value,
					int desired_align, int align_bytes,
					bool issetmem)
{
  rtx src = NULL;
  rtx orig_dst = dst;
  rtx orig_src = NULL;
  int piece_size = 1;
  int copied_bytes = 0;

  if (!issetmem)
    {
      gcc_assert (srcp != NULL);
      src = *srcp;
      orig_src = src;
    }

  for (piece_size = 1;
       piece_size <= desired_align && copied_bytes < align_bytes;
       piece_size <<= 1)
    {
      if (align_bytes & piece_size)
	{
	  if (issetmem)
	    {
	      if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
		dst = emit_memset (dst, destreg, vec_value, piece_size);
	      else
		dst = emit_memset (dst, destreg, value, piece_size);
	    }
	  else
	    dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
	  copied_bytes += piece_size;
	}
    }
  if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
    set_mem_align (dst, desired_align * BITS_PER_UNIT);
  if (MEM_SIZE_KNOWN_P (orig_dst))
    set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);

  if (!issetmem)
    {
      int src_align_bytes = get_mem_align_offset (src, desired_align
						  * BITS_PER_UNIT);
      if (src_align_bytes >= 0)
	src_align_bytes = desired_align - src_align_bytes;
      if (src_align_bytes >= 0)
	{
	  unsigned int src_align;
	  for (src_align = desired_align; src_align >= 2; src_align >>= 1)
	    {
	      if ((src_align_bytes & (src_align - 1))
		  == (align_bytes & (src_align - 1)))
		break;
	    }
	  if (src_align > (unsigned int) desired_align)
	    src_align = desired_align;
	  if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
	    set_mem_align (src, src_align * BITS_PER_UNIT);
	}
      if (MEM_SIZE_KNOWN_P (orig_src))
	set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
      *srcp = src;
    }

  return dst;
}
/* Return true if ALG can be used in current context.
   Assume we expand memset if MEMSET is true.  */
static bool
alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
{
  if (alg == no_stringop)
    return false;
  if (alg == vector_loop)
    return TARGET_SSE || TARGET_AVX;
  /* Algorithms using the rep prefix want at least edi and ecx;
     additionally, memset wants eax and memcpy wants esi.  Don't
     consider such algorithms if the user has appropriated those
     registers for their own purposes, or if we have a non-default
     address space, since some string insns cannot override the segment.  */
  if (alg == rep_prefix_1_byte
      || alg == rep_prefix_4_byte
      || alg == rep_prefix_8_byte)
    {
      if (have_as)
	return false;
      if (fixed_regs[CX_REG]
	  || fixed_regs[DI_REG]
	  || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
	return false;
    }
  return true;
}
/* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation.  */
static enum stringop_alg
decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
	    unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
	    bool memset, bool zero_memset, bool have_as,
	    int *dynamic_check, bool *noalign, bool recur)
{
  const struct stringop_algs *algs;
  bool optimize_for_speed;
  int max = 0;
  const struct processor_costs *cost;
  int i;
  bool any_alg_usable_p = false;

  *noalign = false;
  *dynamic_check = -1;

  /* Even if the string operation call is cold, we still might spend a lot
     of time processing large blocks.  */
  if (optimize_function_for_size_p (cfun)
      || (optimize_insn_for_size_p ()
	  && (max_size < 256
	      || (expected_size != -1 && expected_size < 256))))
    optimize_for_speed = false;
  else
    optimize_for_speed = true;

  cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
  if (memset)
    algs = &cost->memset[TARGET_64BIT != 0];
  else
    algs = &cost->memcpy[TARGET_64BIT != 0];

  /* See maximal size for user defined algorithm.  */
  for (i = 0; i < MAX_STRINGOP_ALGS; i++)
    {
      enum stringop_alg candidate = algs->size[i].alg;
      bool usable = alg_usable_p (candidate, memset, have_as);
      any_alg_usable_p |= usable;

      if (candidate != libcall && candidate && usable)
	max = algs->size[i].max;
    }

  /* If expected size is not known but max size is small enough
     so inline version is a win, set expected size into
     the range.  */
  if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
      && expected_size == -1)
    expected_size = min_size / 2 + max_size / 2;

  /* If user specified the algorithm, honor it if possible.  */
  if (ix86_stringop_alg != no_stringop
      && alg_usable_p (ix86_stringop_alg, memset, have_as))
    return ix86_stringop_alg;
  /* rep; movq or rep; movl is the smallest variant.  */
  else if (!optimize_for_speed)
    {
      *noalign = true;
      if (!count || (count & 3) || (memset && !zero_memset))
	return alg_usable_p (rep_prefix_1_byte, memset, have_as)
	       ? rep_prefix_1_byte : loop_1_byte;
      else
	return alg_usable_p (rep_prefix_4_byte, memset, have_as)
	       ? rep_prefix_4_byte : loop;
    }
  /* Very tiny blocks are best handled via the loop, REP is expensive to
     set up.  */
  else if (expected_size != -1 && expected_size < 4)
    return loop_1_byte;
  else if (expected_size != -1)
    {
      enum stringop_alg alg = libcall;
      bool alg_noalign = false;
      for (i = 0; i < MAX_STRINGOP_ALGS; i++)
	{
	  /* We get here if the algorithms that were not libcall-based
	     were rep-prefix based and we are unable to use rep prefixes
	     based on global register usage.  Break out of the loop and
	     use the heuristic below.  */
	  if (algs->size[i].max == 0)
	    break;
	  if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
	    {
	      enum stringop_alg candidate = algs->size[i].alg;

	      if (candidate != libcall
		  && alg_usable_p (candidate, memset, have_as))
		{
		  alg = candidate;
		  alg_noalign = algs->size[i].noalign;
		}
	      /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
		 last non-libcall inline algorithm.  */
	      if (TARGET_INLINE_ALL_STRINGOPS)
		{
		  /* When the current size is best to be copied by a libcall,
		     but we are still forced to inline, run the heuristic below
		     that will pick code for medium sized blocks.  */
		  if (alg != libcall)
		    {
		      *noalign = alg_noalign;
		      return alg;
		    }
		  else if (!any_alg_usable_p)
		    break;
		}
	      else if (alg_usable_p (candidate, memset, have_as))
		{
		  *noalign = algs->size[i].noalign;
		  return candidate;
		}
	    }
	}
    }
  /* When asked to inline the call anyway, try to pick meaningful choice.
     We look for maximal size of block that is faster to copy by hand and
     take blocks of at most of that size guessing that average size will
     be roughly half of the block.

     If this turns out to be bad, we might simply specify the preferred
     choice in ix86_costs.  */
  if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
      && (algs->unknown_size == libcall
	  || !alg_usable_p (algs->unknown_size, memset, have_as)))
    {
      enum stringop_alg alg;
      HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;

      /* If there aren't any usable algorithms or if recursing already,
	 then recursing on smaller sizes or same size isn't going to
	 find anything.  Just return the simple byte-at-a-time copy loop.  */
      if (!any_alg_usable_p || recur)
	{
	  /* Pick something reasonable.  */
	  if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
	    *dynamic_check = 128;
	  return loop_1_byte;
	}
      alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
			zero_memset, have_as, dynamic_check, noalign, true);
      gcc_assert (*dynamic_check == -1);
      if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
	*dynamic_check = max;
      else
	gcc_assert (alg != libcall);
      return alg;
    }
  return (alg_usable_p (algs->unknown_size, memset, have_as)
	  ? algs->unknown_size : libcall);
}
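/* Editorial sketch (not part of GCC) of how a per-CPU stringop_algs size
   table is consulted above; the bucket limits and algorithm codes here are
   invented for the example:

     struct bucket { int max; int alg; };

     static int pick_alg (const struct bucket *buckets, int n,
			  long expected_size, int fallback)
     {
       for (int i = 0; i < n; i++)
	 if (buckets[i].max >= expected_size || buckets[i].max == -1)
	   return buckets[i].alg;	// first bucket that fits
       return fallback;			// the unknown_size algorithm
     }

   The real code additionally filters each candidate through alg_usable_p
   and handles the forced-inline cases.  */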
/* Decide on alignment.  We know that the operand is already aligned to ALIGN
   (ALIGN can be based on profile feedback and thus it is not 100% guaranteed).  */
static int
decide_alignment (int align,
		  enum stringop_alg alg,
		  int expected_size,
		  machine_mode move_mode)
{
  int desired_align = 0;

  gcc_assert (alg != no_stringop);

  if (alg == libcall)
    return 0;
  if (move_mode == VOIDmode)
    return 0;

  desired_align = GET_MODE_SIZE (move_mode);
  /* PentiumPro has special logic triggering for 8 byte aligned blocks,
     copying whole cacheline at once.  */
  if (TARGET_PENTIUMPRO
      && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
    desired_align = 8;

  if (optimize_size)
    desired_align = 1;
  if (desired_align < align)
    desired_align = align;
  if (expected_size != -1 && expected_size < 4)
    desired_align = align;

  return desired_align;
}
/* Helper function for memcpy.  For QImode value 0xXY produce
   0xXYXYXYXY of the width specified by MODE.  This is essentially
   a * 0x10101010, but we can do slightly better than
   synth_mult by unwinding the sequence by hand on CPUs with
   slow multiply.  */

static rtx
promote_duplicated_reg (machine_mode mode, rtx val)
{
  machine_mode valmode = GET_MODE (val);
  rtx tmp;
  int nops = mode == DImode ? 3 : 2;

  gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
  if (val == const0_rtx)
    return copy_to_mode_reg (mode, CONST0_RTX (mode));
  if (CONST_INT_P (val))
    {
      HOST_WIDE_INT v = INTVAL (val) & 255;

      v |= v << 8;
      v |= v << 16;
      if (mode == DImode)
	v |= (v << 16) << 16;
      return copy_to_mode_reg (mode, gen_int_mode (v, mode));
    }

  if (valmode == VOIDmode)
    valmode = QImode;
  if (valmode != QImode)
    val = gen_lowpart (QImode, val);
  if (mode == QImode)
    return val;

  if (!TARGET_PARTIAL_REG_STALL)
    nops--;
  if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
      + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
      <= (ix86_cost->shift_const + ix86_cost->add) * nops
	 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
    {
      rtx reg = convert_modes (mode, QImode, val, true);
      tmp = promote_duplicated_reg (mode, const1_rtx);
      return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
				  OPTAB_DIRECT);
    }
  else
    {
      rtx reg = convert_modes (mode, QImode, val, true);

      if (!TARGET_PARTIAL_REG_STALL)
	if (mode == SImode)
	  emit_insn (gen_insvsi_1 (reg, reg));
	else
	  emit_insn (gen_insvdi_1 (reg, reg));
      else
	{
	  tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
				     NULL, 1, OPTAB_DIRECT);
	  reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1,
				     OPTAB_DIRECT);
	}
      tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
				 NULL, 1, OPTAB_DIRECT);
      reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
      if (mode == SImode)
	return reg;
      tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
				 NULL, 1, OPTAB_DIRECT);
      reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
      return reg;
    }
}
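
/* Illustrative sketch (not part of the original sources): in plain C the
   SImode shift-and-IOR broadcast emitted above is

       unsigned int v = byte;
       v |= v << 8;     /+ 0x000000XY -> 0x0000XYXY +/
       v |= v << 16;    /+ 0x0000XYXY -> 0xXYXYXYXY +/

   and the DImode case adds one more "v |= v << 32" step, matching the
   three-shift sequence in the non-multiply path.  */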
/* Duplicate value VAL using promote_duplicated_reg into maximal size that will
   be needed by main loop copying SIZE_NEEDED chunks and prologue getting
   alignment from ALIGN to DESIRED_ALIGN.  */

static rtx
promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
				int align)
{
  rtx promoted_val;

  if (TARGET_64BIT
      && (size_needed > 4 || (desired_align > align && desired_align > 4)))
    promoted_val = promote_duplicated_reg (DImode, val);
  else if (size_needed > 2 || (desired_align > align && desired_align > 2))
    promoted_val = promote_duplicated_reg (SImode, val);
  else if (size_needed > 1 || (desired_align > align && desired_align > 1))
    promoted_val = promote_duplicated_reg (HImode, val);
  else
    promoted_val = NULL;

  return promoted_val;
}
/* Copy the address to a Pmode register.  This is used for x32 to
   truncate a DImode TLS address to a SImode register.  */

static rtx
ix86_copy_addr_to_reg (rtx addr)
{
  rtx reg;
  if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
    {
      reg = copy_addr_to_reg (addr);
      REG_POINTER (reg) = 1;
      return reg;
    }
  else
    {
      gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
      reg = copy_to_mode_reg (DImode, addr);
      REG_POINTER (reg) = 1;
      return gen_rtx_SUBREG (SImode, reg, 0);
    }
}
/* Expand string move (memcpy) or store (memset) operation.  Use i386 string
   operations when profitable.  The code depends upon architecture, block size
   and alignment, but always has one of the following overall structures:

   Aligned move sequence:

     1) Prologue guard: Conditional that jumps up to epilogues for small
	blocks that can be handled by epilogue alone.  This is faster
	but also needed for correctness, since the prologue assumes the
	block is larger than the desired alignment.

	Optional dynamic check for size and libcall for large
	blocks is emitted here too, with -minline-stringops-dynamically.

     2) Prologue: copy first few bytes in order to get destination
	aligned to DESIRED_ALIGN.  It is emitted only when ALIGN is less
	than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
	copied.  We emit either a jump tree on power of two sized
	blocks, or a byte loop.

     3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
	with specified algorithm.

     4) Epilogue: code copying tail of the block that is too small to be
	handled by main body (or up to size guarded by prologue guard).

   Misaligned move sequence:

     1) Misaligned move prologue/epilogue containing:
	a) Prologue handling small memory blocks and jumping to done_label
	   (skipped if blocks are known to be large enough)
	b) Single move copying first DESIRED_ALIGN-ALIGN bytes if alignment is
	   needed by single possibly misaligned move
	   (skipped if alignment is not needed)
	c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves

     2) Zero size guard dispatching to done_label, if needed

     3) Dispatch to library call, if needed

     4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
	with specified algorithm.  */
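
/* As a plain-C illustration (not from the original sources), the aligned
   sequence above has roughly this shape for a copy of N bytes:

       if (n < epilogue_size_needed)          /+ 1) prologue guard +/
	 goto epilogue;
       while (dst % desired_align != 0)       /+ 2) prologue +/
	 copy one byte;
       while (n >= size_needed)               /+ 3) main body +/
	 copy one SIZE_NEEDED chunk;
     epilogue:                                /+ 4) epilogue +/
       copy the remaining tail;
*/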
bool
ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
			   rtx align_exp, rtx expected_align_exp,
			   rtx expected_size_exp, rtx min_size_exp,
			   rtx max_size_exp, rtx probable_max_size_exp,
			   bool issetmem)
{
  rtx destreg;
  rtx srcreg = NULL;
  rtx_code_label *label = NULL;
  rtx tmp;
  rtx_code_label *jump_around_label = NULL;
  HOST_WIDE_INT align = 1;
  unsigned HOST_WIDE_INT count = 0;
  HOST_WIDE_INT expected_size = -1;
  int size_needed = 0, epilogue_size_needed;
  int desired_align = 0, align_bytes = 0;
  enum stringop_alg alg;
  rtx promoted_val = NULL;
  rtx vec_promoted_val = NULL;
  bool force_loopy_epilogue = false;
  int dynamic_check;
  bool need_zero_guard = false;
  bool noalign;
  machine_mode move_mode = VOIDmode;
  machine_mode wider_mode;
  int unroll_factor = 1;
  /* TODO: Once value ranges are available, fill in proper data.  */
  unsigned HOST_WIDE_INT min_size = 0;
  unsigned HOST_WIDE_INT max_size = -1;
  unsigned HOST_WIDE_INT probable_max_size = -1;
  bool misaligned_prologue_used = false;
  bool have_as;

  if (CONST_INT_P (align_exp))
    align = INTVAL (align_exp);
  /* i386 can do misaligned access at reasonably increased cost.  */
  if (CONST_INT_P (expected_align_exp)
      && INTVAL (expected_align_exp) > align)
    align = INTVAL (expected_align_exp);
  /* ALIGN is the minimum of destination and source alignment, but we care here
     just about destination alignment.  */
  else if (MEM_P (dst)
	   && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
    align = MEM_ALIGN (dst) / BITS_PER_UNIT;

  if (CONST_INT_P (count_exp))
    {
      min_size = max_size = probable_max_size = count = expected_size
	= INTVAL (count_exp);
      /* When COUNT is 0, there is nothing to do.  */
      if (!count)
	return true;
    }
  else
    {
      if (min_size_exp)
	min_size = INTVAL (min_size_exp);
      if (max_size_exp)
	max_size = INTVAL (max_size_exp);
      if (probable_max_size_exp)
	probable_max_size = INTVAL (probable_max_size_exp);
      if (CONST_INT_P (expected_size_exp))
	expected_size = INTVAL (expected_size_exp);
    }

  /* Make sure we don't need to care about overflow later on.  */
  if (count > (HOST_WIDE_INT_1U << 30))
    return false;

  have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
  if (!issetmem)
    have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));

  /* Step 0: Decide on preferred algorithm, desired alignment and
     size of chunks to be copied by main loop.  */
  alg = decide_alg (count, expected_size, min_size, probable_max_size,
		    issetmem,
		    issetmem && val_exp == const0_rtx, have_as,
		    &dynamic_check, &noalign, false);

  if (dump_file)
    fprintf (dump_file, "Selected stringop expansion strategy: %s\n",
	     stringop_alg_names[alg]);

  if (alg == libcall)
    return false;
  gcc_assert (alg != no_stringop);

  /* For now vector-version of memset is generated only for memory zeroing, as
     creating of promoted vector value is very cheap in this case.  */
  if (issetmem && alg == vector_loop && val_exp != const0_rtx)
    alg = unrolled_loop;

  if (!count)
    count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
  destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
  if (!issetmem)
    srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));

  unroll_factor = 1;
  move_mode = word_mode;
  switch (alg)
    {
    case libcall:
    case no_stringop:
    case last_alg:
      gcc_unreachable ();
    case loop_1_byte:
      need_zero_guard = true;
      move_mode = QImode;
      break;
    case loop:
      need_zero_guard = true;
      break;
    case unrolled_loop:
      need_zero_guard = true;
      unroll_factor = (TARGET_64BIT ? 4 : 2);
      break;
    case vector_loop:
      need_zero_guard = true;
      unroll_factor = 4;
      /* Find the widest supported mode.  */
      move_mode = word_mode;
      while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode)
	     && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
	move_mode = wider_mode;

      if (TARGET_AVX128_OPTIMAL && GET_MODE_BITSIZE (move_mode) > 128)
	move_mode = TImode;

      /* Find the corresponding vector mode with the same size as MOVE_MODE.
	 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.).  */
      if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
	{
	  int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
	  if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
	      || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
	    move_mode = word_mode;
	}
      gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
      break;
    case rep_prefix_8_byte:
      move_mode = DImode;
      break;
    case rep_prefix_4_byte:
      move_mode = SImode;
      break;
    case rep_prefix_1_byte:
      move_mode = QImode;
      break;
    }
  size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
  epilogue_size_needed = size_needed;

  /* If we are going to call any library calls conditionally, make sure any
     pending stack adjustment happen before the first conditional branch,
     otherwise they will be emitted before the library call only and won't
     happen from the other branches.  */
  if (dynamic_check != -1)
    do_pending_stack_adjust ();

  desired_align = decide_alignment (align, alg, expected_size, move_mode);
  if (!TARGET_ALIGN_STRINGOPS || noalign)
    align = desired_align;

  /* Step 1: Prologue guard.  */

  /* Alignment code needs count to be in register.  */
  if (CONST_INT_P (count_exp) && desired_align > align)
    {
      if (INTVAL (count_exp) > desired_align
	  && INTVAL (count_exp) > size_needed)
	{
	  align_bytes
	    = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
	  if (align_bytes <= 0)
	    align_bytes = 0;
	  else
	    align_bytes = desired_align - align_bytes;
	}
      if (align_bytes == 0)
	count_exp = force_reg (counter_mode (count_exp), count_exp);
    }
  gcc_assert (desired_align >= 1 && align >= 1);

  /* Misaligned move sequences handle both prologue and epilogue at once.
     Default code generation results in a smaller code for large alignments
     and also avoids redundant job when sizes are known precisely.  */
  misaligned_prologue_used
    = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
       && MAX (desired_align, epilogue_size_needed) <= 32
       && desired_align <= epilogue_size_needed
       && ((desired_align > align && !align_bytes)
	   || (!count && epilogue_size_needed > 1)));

  /* Do the cheap promotion to allow better CSE across the
     main loop and epilogue (ie one load of the big constant in the
     front of all code.
     For now the misaligned move sequences do not have fast path
     without broadcasting.  */
  if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
    {
      if (alg == vector_loop)
	{
	  gcc_assert (val_exp == const0_rtx);
	  vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
	  promoted_val = promote_duplicated_reg_to_size (val_exp,
							 GET_MODE_SIZE (word_mode),
							 desired_align, align);
	}
      else
	promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
						       desired_align, align);
    }

  /* Misaligned move sequences handle both prologues and epilogues at once.
     Default code generation results in smaller code for large alignments and
     also avoids redundant job when sizes are known precisely.  */
  if (misaligned_prologue_used)
    {
      /* Misaligned move prologue handles small blocks by itself.  */
      expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves
	   (dst, src, &destreg, &srcreg,
	    move_mode, promoted_val, vec_promoted_val,
	    &count_exp, &jump_around_label,
	    desired_align < align
	    ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
	    desired_align, align, &min_size, dynamic_check, issetmem);
      if (!issetmem)
	src = change_address (src, BLKmode, srcreg);
      dst = change_address (dst, BLKmode, destreg);
      set_mem_align (dst, desired_align * BITS_PER_UNIT);
      epilogue_size_needed = 0;
      if (need_zero_guard
	  && min_size < (unsigned HOST_WIDE_INT) size_needed)
	{
	  /* It is possible that we copied enough so the main loop will not
	     execute.  */
	  gcc_assert (size_needed > 1);
	  if (jump_around_label == NULL_RTX)
	    jump_around_label = gen_label_rtx ();
	  emit_cmp_and_jump_insns (count_exp,
				   GEN_INT (size_needed),
				   LTU, 0, counter_mode (count_exp), 1, jump_around_label);
	  if (expected_size == -1
	      || expected_size < (desired_align - align) / 2 + size_needed)
	    predict_jump (REG_BR_PROB_BASE * 20 / 100);
	  else
	    predict_jump (REG_BR_PROB_BASE * 60 / 100);
	}
    }
  /* Ensure that alignment prologue won't copy past end of block.  */
  else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
    {
      epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
      /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
	 Make sure it is power of 2.  */
      epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);

      /* To improve performance of small blocks, we jump around the VAL
	 promoting mode.  This means that if the promoted VAL is not constant,
	 we might not use it in the epilogue and have to use byte
	 loop variant.  */
      if (issetmem && epilogue_size_needed > 2 && !promoted_val)
	force_loopy_epilogue = true;
      if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
	  || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
	{
	  /* If main algorithm works on QImode, no epilogue is needed.
	     For small sizes just don't align anything.  */
	  if (size_needed == 1)
	    desired_align = align;
	  else
	    goto epilogue;
	}
      else if (!count
	       && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
	{
	  label = gen_label_rtx ();
	  emit_cmp_and_jump_insns (count_exp,
				   GEN_INT (epilogue_size_needed),
				   LTU, 0, counter_mode (count_exp), 1, label);
	  if (expected_size == -1 || expected_size < epilogue_size_needed)
	    predict_jump (REG_BR_PROB_BASE * 60 / 100);
	  else
	    predict_jump (REG_BR_PROB_BASE * 20 / 100);
	}
    }

  /* Emit code to decide on runtime whether library call or inline should be
     used.  */
  if (dynamic_check != -1)
    {
      if (!issetmem && CONST_INT_P (count_exp))
	{
	  if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
	    {
	      emit_block_copy_via_libcall (dst, src, count_exp);
	      count_exp = const0_rtx;
	      goto epilogue;
	    }
	}
      else
	{
	  rtx_code_label *hot_label = gen_label_rtx ();
	  if (jump_around_label == NULL_RTX)
	    jump_around_label = gen_label_rtx ();
	  emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
				   LEU, 0, counter_mode (count_exp),
				   1, hot_label);
	  predict_jump (REG_BR_PROB_BASE * 90 / 100);
	  if (issetmem)
	    set_storage_via_libcall (dst, count_exp, val_exp);
	  else
	    emit_block_copy_via_libcall (dst, src, count_exp);
	  emit_jump (jump_around_label);
	  emit_label (hot_label);
	}
    }

  /* Step 2: Alignment prologue.  */
  /* Do the expensive promotion once we branched off the small blocks.  */
  if (issetmem && !promoted_val)
    promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
						   desired_align, align);

  if (desired_align > align && !misaligned_prologue_used)
    {
      if (align_bytes == 0)
	{
	  /* Except for the first move in prologue, we no longer know
	     constant offset in aliasing info.  It doesn't seem worth
	     the pain to maintain it for the first move, so throw away
	     the info early.  */
	  dst = change_address (dst, BLKmode, destreg);
	  if (!issetmem)
	    src = change_address (src, BLKmode, srcreg);
	  dst = expand_set_or_cpymem_prologue (dst, src, destreg, srcreg,
					       promoted_val, vec_promoted_val,
					       count_exp, align, desired_align,
					       issetmem);
	  /* At most desired_align - align bytes are copied.  */
	  if (min_size < (unsigned)(desired_align - align))
	    min_size = 0;
	  else
	    min_size -= desired_align - align;
	}
      else
	{
	  /* If we know how many bytes need to be stored before dst is
	     sufficiently aligned, maintain aliasing info accurately.  */
	  dst = expand_set_or_cpymem_constant_prologue (dst, &src, destreg,
							srcreg,
							promoted_val,
							vec_promoted_val,
							desired_align,
							align_bytes,
							issetmem);

	  count_exp = plus_constant (counter_mode (count_exp),
				     count_exp, -align_bytes);
	  count -= align_bytes;
	  min_size -= align_bytes;
	  max_size -= align_bytes;
	}
      if (need_zero_guard
	  && min_size < (unsigned HOST_WIDE_INT) size_needed
	  && (count < (unsigned HOST_WIDE_INT) size_needed
	      || (align_bytes == 0
		  && count < ((unsigned HOST_WIDE_INT) size_needed
			      + desired_align - align))))
	{
	  /* It is possible that we copied enough so the main loop will not
	     execute.  */
	  gcc_assert (size_needed > 1);
	  if (label == NULL_RTX)
	    label = gen_label_rtx ();
	  emit_cmp_and_jump_insns (count_exp,
				   GEN_INT (size_needed),
				   LTU, 0, counter_mode (count_exp), 1, label);
	  if (expected_size == -1
	      || expected_size < (desired_align - align) / 2 + size_needed)
	    predict_jump (REG_BR_PROB_BASE * 20 / 100);
	  else
	    predict_jump (REG_BR_PROB_BASE * 60 / 100);
	}
    }
  if (label && size_needed == 1)
    {
      emit_label (label);
      LABEL_NUSES (label) = 1;
      label = NULL;
      epilogue_size_needed = 1;
      if (issetmem)
	promoted_val = val_exp;
    }
  else if (label == NULL_RTX && !misaligned_prologue_used)
    epilogue_size_needed = size_needed;

  /* Step 3: Main loop.  */

  switch (alg)
    {
    case libcall:
    case no_stringop:
    case last_alg:
      gcc_unreachable ();
    case loop_1_byte:
    case loop:
    case unrolled_loop:
      expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg, promoted_val,
				     count_exp, move_mode, unroll_factor,
				     expected_size, issetmem);
      break;
    case vector_loop:
      expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg,
				     vec_promoted_val, count_exp, move_mode,
				     unroll_factor, expected_size, issetmem);
      break;
    case rep_prefix_8_byte:
    case rep_prefix_4_byte:
    case rep_prefix_1_byte:
      expand_set_or_cpymem_via_rep (dst, src, destreg, srcreg, promoted_val,
				    val_exp, count_exp, move_mode, issetmem);
      break;
    }
  /* Adjust properly the offset of src and dest memory for aliasing.  */
  if (CONST_INT_P (count_exp))
    {
      if (!issetmem)
	src = adjust_automodify_address_nv (src, BLKmode, srcreg,
					    (count / size_needed) * size_needed);
      dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
					  (count / size_needed) * size_needed);
    }
  else
    {
      if (!issetmem)
	src = change_address (src, BLKmode, srcreg);
      dst = change_address (dst, BLKmode, destreg);
    }

  /* Step 4: Epilogue to copy the remaining bytes.  */
 epilogue:
  if (label)
    {
      /* When the main loop is done, COUNT_EXP might hold original count,
	 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
	 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
	 bytes.  Compensate if needed.  */

      if (size_needed < epilogue_size_needed)
	{
	  tmp = expand_simple_binop (counter_mode (count_exp), AND, count_exp,
				     GEN_INT (size_needed - 1), count_exp, 1,
				     OPTAB_DIRECT);
	  if (tmp != count_exp)
	    emit_move_insn (count_exp, tmp);
	}
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }

  if (count_exp != const0_rtx && epilogue_size_needed > 1)
    {
      if (force_loopy_epilogue)
	expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
					 epilogue_size_needed);
      else
	{
	  if (issetmem)
	    expand_setmem_epilogue (dst, destreg, promoted_val,
				    vec_promoted_val, count_exp,
				    epilogue_size_needed);
	  else
	    expand_cpymem_epilogue (dst, src, destreg, srcreg, count_exp,
				    epilogue_size_needed);
	}
    }
  if (jump_around_label)
    emit_label (jump_around_label);
  return true;
}
/* Expand the appropriate insns for doing strlen if not just doing
   repnz; scasb

   out = result, initialized with the start address
   align_rtx = alignment of the address.
   scratch = scratch register, initialized with the startaddress when
	not aligned, otherwise undefined

   This is just the body.  It needs the initializations mentioned above and
   some address computing at the end.  These things are done in i386.md.  */

static void
ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
{
  int align;
  rtx tmp;
  rtx_code_label *align_2_label = NULL;
  rtx_code_label *align_3_label = NULL;
  rtx_code_label *align_4_label = gen_label_rtx ();
  rtx_code_label *end_0_label = gen_label_rtx ();
  rtx mem;
  rtx tmpreg = gen_reg_rtx (SImode);
  rtx scratch = gen_reg_rtx (SImode);
  rtx cmp;

  align = 0;
  if (CONST_INT_P (align_rtx))
    align = INTVAL (align_rtx);

  /* Loop to check 1..3 bytes for null to get an aligned pointer.  */

  /* Is there a known alignment and is it less than 4?  */
  if (align < 4)
    {
      rtx scratch1 = gen_reg_rtx (Pmode);
      emit_move_insn (scratch1, out);
      /* Is there a known alignment and is it not 2? */
      if (align != 2)
	{
	  align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
	  align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */

	  /* Leave just the 3 lower bits.  */
	  align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
				    NULL_RTX, 0, OPTAB_WIDEN);

	  emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
				   Pmode, 1, align_4_label);
	  emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
				   Pmode, 1, align_2_label);
	  emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
				   Pmode, 1, align_3_label);
	}
      else
	{
	  /* Since the alignment is 2, we have to check 2 or 0 bytes;
	     check if it is aligned to 4 bytes.  */

	  align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
				    NULL_RTX, 0, OPTAB_WIDEN);

	  emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
				   Pmode, 1, align_4_label);
	}

      mem = change_address (src, QImode, out);

      /* Now compare the bytes.  */

      /* Compare the first n unaligned bytes on a byte per byte basis.  */
      emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
			       QImode, 1, end_0_label);

      /* Increment the address.  */
      emit_insn (gen_add2_insn (out, const1_rtx));

      /* Not needed with an alignment of 2 */
      if (align != 2)
	{
	  emit_label (align_2_label);

	  emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
				   end_0_label);

	  emit_insn (gen_add2_insn (out, const1_rtx));

	  emit_label (align_3_label);
	}

      emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
			       end_0_label);

      emit_insn (gen_add2_insn (out, const1_rtx));
    }

  /* Generate loop to check 4 bytes at a time.  It is not a good idea to
     align this loop; it only enlarges the program and does not speed
     it up.  */
  emit_label (align_4_label);

  mem = change_address (src, SImode, out);
  emit_move_insn (scratch, mem);
  emit_insn (gen_add2_insn (out, GEN_INT (4)));

  /* This formula yields a nonzero result iff one of the bytes is zero.
     This saves three branches inside loop and many cycles.  */

  emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
  emit_insn (gen_one_cmplsi2 (scratch, scratch));
  emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
  emit_insn (gen_andsi3 (tmpreg, tmpreg,
			 gen_int_mode (0x80808080, SImode)));
  emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
			   align_4_label);

  if (TARGET_CMOVE)
    {
      rtx reg = gen_reg_rtx (SImode);
      rtx reg2 = gen_reg_rtx (Pmode);
      emit_move_insn (reg, tmpreg);
      emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));

      /* If zero is not in the first two bytes, move two bytes forward.  */
      emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
      tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
      tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
      emit_insn (gen_rtx_SET (tmpreg,
			      gen_rtx_IF_THEN_ELSE (SImode, tmp,
						    reg,
						    tmpreg)));
      /* Emit lea manually to avoid clobbering of flags.  */
      emit_insn (gen_rtx_SET (reg2, gen_rtx_PLUS (Pmode, out, const2_rtx)));

      tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
      tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
      emit_insn (gen_rtx_SET (out,
			      gen_rtx_IF_THEN_ELSE (Pmode, tmp,
						    reg2,
						    out)));
    }
  else
    {
      rtx_code_label *end_2_label = gen_label_rtx ();
      /* Is zero in the first two bytes? */

      emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
      tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
      tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
				  gen_rtx_LABEL_REF (VOIDmode, end_2_label),
				  pc_rtx);
      tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      JUMP_LABEL (tmp) = end_2_label;

      /* Not in the first two.  Move two bytes forward.  */
      emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
      emit_insn (gen_add2_insn (out, const2_rtx));

      emit_label (end_2_label);
    }

  /* Avoid branch in fixing the byte.  */
  tmpreg = gen_lowpart (QImode, tmpreg);
  emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
  tmp = gen_rtx_REG (CCmode, FLAGS_REG);
  cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
  emit_insn (gen_sub3_carry (Pmode, out, out, GEN_INT (3), tmp, cmp));

  emit_label (end_0_label);
}
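
/* Illustration (not from the original sources): the zero-byte test emitted
   above is the classic carry-propagation trick; in plain C it is

       has_zero_byte = (x - 0x01010101) & ~x & 0x80808080;

   Subtracting 1 from every byte borrows into a byte's top bit when that
   byte was zero, ANDing with ~x discards bytes whose top bit was already
   set, and the 0x80808080 mask keeps just the per-byte flags.  The result
   is nonzero exactly when some byte of X is zero.  */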
/* Expand strlen.  */

bool
ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
{
  if (TARGET_UNROLL_STRLEN
      && TARGET_INLINE_ALL_STRINGOPS
      && eoschar == const0_rtx
      && optimize > 1)
    {
      /* The generic case of strlen expander is long.  Avoid expanding it
	 unless TARGET_INLINE_ALL_STRINGOPS.  */
      rtx addr = force_reg (Pmode, XEXP (src, 0));
      /* Well it seems that some optimizer does not combine a call like
	 foo(strlen(bar), strlen(bar));
	 when the move and the subtraction are done here.  It does calculate
	 the length just once when these instructions are done inside of
	 output_strlen_unroll().  But I think since &bar[strlen(bar)] is
	 often used and I use one fewer register for the lifetime of
	 output_strlen_unroll() this is better.  */

      emit_move_insn (out, addr);

      ix86_expand_strlensi_unroll_1 (out, src, align);

      /* strlensi_unroll_1 returns the address of the zero at the end of
	 the string, like memchr(), so compute the length by subtracting
	 the start address.  */
      emit_insn (gen_sub2_insn (out, addr));
      return true;
    }
  else
    return false;
}
/* For given symbol (function) construct code to compute address of its PLT
   entry in large x86-64 PIC model.  */

static rtx
construct_plt_address (rtx symbol)
{
  rtx tmp, unspec;

  gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
  gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
  gcc_assert (Pmode == DImode);

  tmp = gen_reg_rtx (Pmode);
  unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);

  emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
  emit_insn (gen_add2_insn (tmp, pic_offset_table_rtx));
  return tmp;
}
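
/* Illustration (not from the original sources): for a symbol foo this
   emits, roughly,

       movabs $foo@PLTOFF, %reg
       add    <PIC base register>, %reg

   i.e. the 64-bit PLT offset is materialized as an immediate and added to
   the PIC base, since in the large code model no single displacement can
   be assumed to reach the PLT.  */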
/* Additional registers that are clobbered by SYSV calls.  */

static int const x86_64_ms_sysv_extra_clobbered_registers
		 [NUM_X86_64_MS_CLOBBERED_REGS] =
{
  SI_REG, DI_REG,
  XMM6_REG, XMM7_REG,
  XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
  XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
};

rtx_insn *
ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
		  rtx callarg2,
		  rtx pop, bool sibcall)
{
  rtx vec[3];
  rtx use = NULL, call;
  unsigned int vec_len = 0;
  tree fndecl;

  if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
    {
      fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
      if (fndecl
	  && (lookup_attribute ("interrupt",
				TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
	error ("interrupt service routine cannot be called directly");
    }
  else
    fndecl = NULL_TREE;

  if (pop == const0_rtx)
    pop = NULL;
  gcc_assert (!TARGET_64BIT || !pop);

  if (TARGET_MACHO && !TARGET_64BIT)
    {
#if TARGET_MACHO
      if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
	fnaddr = machopic_indirect_call_target (fnaddr);
#endif
    }
  else
    {
      /* Static functions and indirect calls don't need the pic register.  Also,
	 check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
	 it an indirect call.  */
      rtx addr = XEXP (fnaddr, 0);
      if (flag_pic
	  && GET_CODE (addr) == SYMBOL_REF
	  && !SYMBOL_REF_LOCAL_P (addr))
	{
	  if (flag_plt
	      && (SYMBOL_REF_DECL (addr) == NULL_TREE
		  || !lookup_attribute ("noplt",
					DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
	    {
	      if (!TARGET_64BIT
		  || (ix86_cmodel == CM_LARGE_PIC
		      && DEFAULT_ABI != MS_ABI))
		{
		  use_reg (&use, gen_rtx_REG (Pmode,
					      REAL_PIC_OFFSET_TABLE_REGNUM));
		  if (ix86_use_pseudo_pic_reg ())
		    emit_move_insn (gen_rtx_REG (Pmode,
						 REAL_PIC_OFFSET_TABLE_REGNUM),
				    pic_offset_table_rtx);
		}
	    }
	  else if (!TARGET_PECOFF && !TARGET_MACHO)
	    {
	      if (TARGET_64BIT)
		{
		  fnaddr = gen_rtx_UNSPEC (Pmode,
					   gen_rtvec (1, addr),
					   UNSPEC_GOTPCREL);
		  fnaddr = gen_rtx_CONST (Pmode, fnaddr);
		}
	      else
		{
		  fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
					   UNSPEC_GOT);
		  fnaddr = gen_rtx_CONST (Pmode, fnaddr);
		  fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
					 fnaddr);
		}
	      fnaddr = gen_const_mem (Pmode, fnaddr);
	      /* Pmode may not be the same as word_mode for x32, which
		 doesn't support indirect branch via 32-bit memory slot.
		 Since x32 GOT slot is 64 bit with zero upper 32 bits,
		 indirect branch via x32 GOT slot is OK.  */
	      if (GET_MODE (fnaddr) != word_mode)
		fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
	      fnaddr = gen_rtx_MEM (QImode, fnaddr);
	    }
	}
    }

  /* Skip setting up RAX register for -mskip-rax-setup when there are no
     parameters passed in vector registers.  */
  if (TARGET_64BIT && INTVAL (callarg2) >= 0
      && (INTVAL (callarg2) > 0
	  || (INTVAL (callarg2) == 0
	      && (TARGET_SSE || !flag_skip_rax_setup))))
    {
      rtx al = gen_rtx_REG (QImode, AX_REG);
      emit_move_insn (al, callarg2);
      use_reg (&use, al);
    }

  if (ix86_cmodel == CM_LARGE_PIC
      && !TARGET_PECOFF
      && MEM_P (fnaddr)
      && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
      && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
    fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
  /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
     branch via x32 GOT slot is OK.  */
  else if (!(TARGET_X32
	     && MEM_P (fnaddr)
	     && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
	     && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
	   && (sibcall
	       ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
	       : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
    {
      fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
      fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
    }

  call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);

  if (retval)
    call = gen_rtx_SET (retval, call);
  vec[vec_len++] = call;

  if (pop)
    {
      pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
      pop = gen_rtx_SET (stack_pointer_rtx, pop);
      vec[vec_len++] = pop;
    }

  if (cfun->machine->no_caller_saved_registers
      && (!fndecl
	  || (!TREE_THIS_VOLATILE (fndecl)
	      && !lookup_attribute ("no_caller_saved_registers",
				    TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
    {
      static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
      bool is_64bit_ms_abi = (TARGET_64BIT
			      && ix86_function_abi (fndecl) == MS_ABI);
      char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);

      /* If there are no caller-saved registers, add all registers
	 that are clobbered by the call which returns.  */
      for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
	if (!fixed_regs[i]
	    && (ix86_call_used_regs[i] == 1
		|| (ix86_call_used_regs[i] & c_mask))
	    && !STACK_REGNO_P (i)
	    && !MMX_REGNO_P (i))
	  clobber_reg (&use,
		       gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
    }
  else if (TARGET_64BIT_MS_ABI
	   && (!callarg2 || INTVAL (callarg2) != -2))
    {
      unsigned i;

      for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
	{
	  int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
	  machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;

	  clobber_reg (&use, gen_rtx_REG (mode, regno));
	}

      /* Set here, but it may get cleared later.  */
      if (TARGET_CALL_MS2SYSV_XLOGUES)
	{
	  if (!TARGET_SSE)
	    ;

	  /* Don't break hot-patched functions.  */
	  else if (ix86_function_ms_hook_prologue (current_function_decl))
	    ;

	  /* TODO: Cases not yet examined.  */
	  else if (flag_split_stack)
	    warn_once_call_ms2sysv_xlogues ("-fsplit-stack");

	  else
	    {
	      gcc_assert (!reload_completed);
	      cfun->machine->call_ms2sysv = true;
	    }
	}
    }

  if (vec_len > 1)
    call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
  rtx_insn *call_insn = emit_call_insn (call);
  if (use)
    CALL_INSN_FUNCTION_USAGE (call_insn) = use;

  return call_insn;
}
/* Split simple return with popping POPC bytes from stack to indirect
   branch with stack adjustment.  */

void
ix86_split_simple_return_pop_internal (rtx popc)
{
  struct machine_function *m = cfun->machine;
  rtx ecx = gen_rtx_REG (SImode, CX_REG);
  rtx_insn *insn;

  /* There is no "pascal" calling convention in any 64bit ABI.  */
  gcc_assert (!TARGET_64BIT);

  insn = emit_insn (gen_pop (ecx));
  m->fs.cfa_offset -= UNITS_PER_WORD;
  m->fs.sp_offset -= UNITS_PER_WORD;

  rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
  x = gen_rtx_SET (stack_pointer_rtx, x);
  add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
  add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
  RTX_FRAME_RELATED_P (insn) = 1;

  x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc);
  x = gen_rtx_SET (stack_pointer_rtx, x);
  insn = emit_insn (x);
  add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
  RTX_FRAME_RELATED_P (insn) = 1;

  /* Now return address is in ECX.  */
  emit_jump_insn (gen_simple_return_indirect_internal (ecx));
}
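
/* Illustration (not from the original sources): the split turns

       ret $N

   into roughly

       pop  %ecx        ; load return address, CFA adjusted by a word
       add  $N, %esp    ; pop the N bytes of callee-popped arguments
       jmp  *%ecx

   which is why the function asserts !TARGET_64BIT: only 32-bit calling
   conventions pop callee arguments this way.  */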
/* Errors in the source file can cause expand_expr to return const0_rtx
   where we expect a vector.  To avoid crashing, use one of the vector
   clear instructions.  */

static rtx
safe_vector_operand (rtx x, machine_mode mode)
{
  if (x == const0_rtx)
    x = CONST0_RTX (mode);
  return x;
}
/* Subroutine of ix86_expand_builtin to take care of binop insns.  */

static rtx
ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  machine_mode tmode = insn_data[icode].operand[0].mode;
  machine_mode mode0 = insn_data[icode].operand[1].mode;
  machine_mode mode1 = insn_data[icode].operand[2].mode;

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);
  if (VECTOR_MODE_P (mode1))
    op1 = safe_vector_operand (op1, mode1);

  if (optimize || !target
      || GET_MODE (target) != tmode
      || !insn_data[icode].operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);

  if (GET_MODE (op1) == SImode && mode1 == TImode)
    {
      rtx x = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_loadd (x, op1));
      op1 = gen_lowpart (TImode, x);
    }

  if (!insn_data[icode].operand[1].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
  if (!insn_data[icode].operand[2].predicate (op1, mode1))
    op1 = copy_to_mode_reg (mode1, op1);

  pat = GEN_FCN (icode) (target, op0, op1);
  if (! pat)
    return 0;

  emit_insn (pat);

  return target;
}
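
/* For instance (an illustration, not from the original sources), a call in
   user code such as

       __v8hi c = __builtin_ia32_paddw128 (a, b);

   reaches this expander via ix86_expand_args_builtin's binop path with
   ICODE naming the two-input add pattern; each operand is run through the
   pattern's predicate and copied into a register only when needed.  */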
/* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns.  */

static rtx
ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
			       enum ix86_builtin_func_type m_type,
			       enum rtx_code sub_code)
{
  rtx pat;
  unsigned int i, nargs;
  bool comparison_p = false;
  bool tf_p = false;
  bool last_arg_constant = false;
  int num_memory = 0;
  struct
    {
      rtx op;
      machine_mode mode;
    } args[4];

  machine_mode tmode = insn_data[icode].operand[0].mode;

  switch (m_type)
    {
    case MULTI_ARG_4_DF2_DI_I:
    case MULTI_ARG_4_DF2_DI_I1:
    case MULTI_ARG_4_SF2_SI_I:
    case MULTI_ARG_4_SF2_SI_I1:
      nargs = 4;
      last_arg_constant = true;
      break;

    case MULTI_ARG_3_SF:
    case MULTI_ARG_3_DF:
    case MULTI_ARG_3_SF2:
    case MULTI_ARG_3_DF2:
    case MULTI_ARG_3_DI:
    case MULTI_ARG_3_SI:
    case MULTI_ARG_3_SI_DI:
    case MULTI_ARG_3_HI:
    case MULTI_ARG_3_HI_SI:
    case MULTI_ARG_3_QI:
    case MULTI_ARG_3_DI2:
    case MULTI_ARG_3_SI2:
    case MULTI_ARG_3_HI2:
    case MULTI_ARG_3_QI2:
      nargs = 3;
      break;

    case MULTI_ARG_2_SF:
    case MULTI_ARG_2_DF:
    case MULTI_ARG_2_DI:
    case MULTI_ARG_2_SI:
    case MULTI_ARG_2_HI:
    case MULTI_ARG_2_QI:
      nargs = 2;
      break;

    case MULTI_ARG_2_DI_IMM:
    case MULTI_ARG_2_SI_IMM:
    case MULTI_ARG_2_HI_IMM:
    case MULTI_ARG_2_QI_IMM:
      nargs = 2;
      last_arg_constant = true;
      break;

    case MULTI_ARG_1_SF:
    case MULTI_ARG_1_DF:
    case MULTI_ARG_1_SF2:
    case MULTI_ARG_1_DF2:
    case MULTI_ARG_1_DI:
    case MULTI_ARG_1_SI:
    case MULTI_ARG_1_HI:
    case MULTI_ARG_1_QI:
    case MULTI_ARG_1_SI_DI:
    case MULTI_ARG_1_HI_DI:
    case MULTI_ARG_1_HI_SI:
    case MULTI_ARG_1_QI_DI:
    case MULTI_ARG_1_QI_SI:
    case MULTI_ARG_1_QI_HI:
      nargs = 1;
      break;

    case MULTI_ARG_2_DI_CMP:
    case MULTI_ARG_2_SI_CMP:
    case MULTI_ARG_2_HI_CMP:
    case MULTI_ARG_2_QI_CMP:
      nargs = 2;
      comparison_p = true;
      break;

    case MULTI_ARG_2_SF_TF:
    case MULTI_ARG_2_DF_TF:
    case MULTI_ARG_2_DI_TF:
    case MULTI_ARG_2_SI_TF:
    case MULTI_ARG_2_HI_TF:
    case MULTI_ARG_2_QI_TF:
      nargs = 2;
      tf_p = true;
      break;

    default:
      gcc_unreachable ();
    }

  if (optimize || !target
      || GET_MODE (target) != tmode
      || !insn_data[icode].operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);
  else if (memory_operand (target, tmode))
    num_memory++;

  gcc_assert (nargs <= 4);

  for (i = 0; i < nargs; i++)
    {
      tree arg = CALL_EXPR_ARG (exp, i);
      rtx op = expand_normal (arg);
      int adjust = (comparison_p) ? 1 : 0;
      machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;

      if (last_arg_constant && i == nargs - 1)
	{
	  if (!insn_data[icode].operand[i + 1].predicate (op, mode))
	    {
	      enum insn_code new_icode = icode;
	      switch (icode)
		{
		case CODE_FOR_xop_vpermil2v2df3:
		case CODE_FOR_xop_vpermil2v4sf3:
		case CODE_FOR_xop_vpermil2v4df3:
		case CODE_FOR_xop_vpermil2v8sf3:
		  error ("the last argument must be a 2-bit immediate");
		  return gen_reg_rtx (tmode);
		case CODE_FOR_xop_rotlv2di3:
		  new_icode = CODE_FOR_rotlv2di3;
		  goto xop_rotl;
		case CODE_FOR_xop_rotlv4si3:
		  new_icode = CODE_FOR_rotlv4si3;
		  goto xop_rotl;
		case CODE_FOR_xop_rotlv8hi3:
		  new_icode = CODE_FOR_rotlv8hi3;
		  goto xop_rotl;
		case CODE_FOR_xop_rotlv16qi3:
		  new_icode = CODE_FOR_rotlv16qi3;
		xop_rotl:
		  if (CONST_INT_P (op))
		    {
		      int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
		      op = GEN_INT (INTVAL (op) & mask);
		      gcc_checking_assert
			(insn_data[icode].operand[i + 1].predicate (op, mode));
		    }
		  else
		    {
		      gcc_checking_assert
			(nargs == 2
			 && insn_data[new_icode].operand[0].mode == tmode
			 && insn_data[new_icode].operand[1].mode == tmode
			 && insn_data[new_icode].operand[2].mode == mode
			 && insn_data[new_icode].operand[0].predicate
			    == insn_data[icode].operand[0].predicate
			 && insn_data[new_icode].operand[1].predicate
			    == insn_data[icode].operand[1].predicate);
		      icode = new_icode;
		      goto non_constant;
		    }
		  break;
		default:
		  gcc_unreachable ();
		}
	    }
	}
      else
	{
	non_constant:
	  if (VECTOR_MODE_P (mode))
	    op = safe_vector_operand (op, mode);

	  /* If we aren't optimizing, only allow one memory operand to be
	     generated.  */
	  if (memory_operand (op, mode))
	    num_memory++;

	  gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);

	  if (optimize
	      || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
	      || num_memory > 1)
	    op = force_reg (mode, op);
	}

      args[i].op = op;
      args[i].mode = mode;
    }

  switch (nargs)
    {
    case 1:
      pat = GEN_FCN (icode) (target, args[0].op);
      break;

    case 2:
      if (tf_p)
	pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
			       GEN_INT ((int)sub_code));
      else if (! comparison_p)
	pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
      else
	{
	  rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
				       args[0].op,
				       args[1].op);

	  pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
	}
      break;

    case 3:
      pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
      break;

    case 4:
      pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
      break;

    default:
      gcc_unreachable ();
    }

  if (! pat)
    return 0;

  emit_insn (pat);
  return target;
}
/* Subroutine of ix86_expand_args_builtin to take care of scalar unop
   insns with vec_merge.  */

static rtx
ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
				    rtx target)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  rtx op1, op0 = expand_normal (arg0);
  machine_mode tmode = insn_data[icode].operand[0].mode;
  machine_mode mode0 = insn_data[icode].operand[1].mode;

  if (optimize || !target
      || GET_MODE (target) != tmode
      || !insn_data[icode].operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);

  if ((optimize && !register_operand (op0, mode0))
      || !insn_data[icode].operand[1].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);

  op1 = op0;
  if (!insn_data[icode].operand[2].predicate (op1, mode0))
    op1 = copy_to_mode_reg (mode0, op1);

  pat = GEN_FCN (icode) (target, op0, op1);
  if (! pat)
    return 0;
  emit_insn (pat);
  return target;
}
/* Subroutine of ix86_expand_builtin to take care of comparison insns.  */

static rtx
ix86_expand_sse_compare (const struct builtin_description *d,
			 tree exp, rtx target, bool swap)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  rtx op2;
  machine_mode tmode = insn_data[d->icode].operand[0].mode;
  machine_mode mode0 = insn_data[d->icode].operand[1].mode;
  machine_mode mode1 = insn_data[d->icode].operand[2].mode;
  enum rtx_code comparison = d->comparison;

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);
  if (VECTOR_MODE_P (mode1))
    op1 = safe_vector_operand (op1, mode1);

  /* Swap operands if we have a comparison that isn't available in
     hardware.  */
  if (swap)
    std::swap (op0, op1);

  if (optimize || !target
      || GET_MODE (target) != tmode
      || !insn_data[d->icode].operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);

  if ((optimize && !register_operand (op0, mode0))
      || !insn_data[d->icode].operand[1].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
  if ((optimize && !register_operand (op1, mode1))
      || !insn_data[d->icode].operand[2].predicate (op1, mode1))
    op1 = copy_to_mode_reg (mode1, op1);

  op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
  pat = GEN_FCN (d->icode) (target, op0, op1, op2);
  if (! pat)
    return 0;
  emit_insn (pat);
  return target;
}
/* Subroutine of ix86_expand_builtin to take care of comi insns.  */

static rtx
ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
		      rtx target)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  machine_mode mode0 = insn_data[d->icode].operand[0].mode;
  machine_mode mode1 = insn_data[d->icode].operand[1].mode;
  enum rtx_code comparison = d->comparison;

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);
  if (VECTOR_MODE_P (mode1))
    op1 = safe_vector_operand (op1, mode1);

  /* Swap operands if we have a comparison that isn't available in
     hardware.  */
  if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
    std::swap (op0, op1);

  target = gen_reg_rtx (SImode);
  emit_move_insn (target, const0_rtx);
  target = gen_rtx_SUBREG (QImode, target, 0);

  if ((optimize && !register_operand (op0, mode0))
      || !insn_data[d->icode].operand[0].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
  if ((optimize && !register_operand (op1, mode1))
      || !insn_data[d->icode].operand[1].predicate (op1, mode1))
    op1 = copy_to_mode_reg (mode1, op1);

  pat = GEN_FCN (d->icode) (op0, op1);
  if (! pat)
    return 0;
  emit_insn (pat);
  emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
			  gen_rtx_fmt_ee (comparison, QImode,
					  SET_DEST (pat),
					  const0_rtx)));

  return SUBREG_REG (target);
}
/* Subroutines of ix86_expand_args_builtin to take care of round insns.  */

static rtx
ix86_expand_sse_round (const struct builtin_description *d, tree exp,
		       rtx target)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  rtx op1, op0 = expand_normal (arg0);
  machine_mode tmode = insn_data[d->icode].operand[0].mode;
  machine_mode mode0 = insn_data[d->icode].operand[1].mode;

  if (optimize || target == 0
      || GET_MODE (target) != tmode
      || !insn_data[d->icode].operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);

  if ((optimize && !register_operand (op0, mode0))
      || !insn_data[d->icode].operand[0].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);

  op1 = GEN_INT (d->comparison);

  pat = GEN_FCN (d->icode) (target, op0, op1);
  if (! pat)
    return 0;
  emit_insn (pat);
  return target;
}
static rtx
ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
				     tree exp, rtx target)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  rtx op2;
  machine_mode tmode = insn_data[d->icode].operand[0].mode;
  machine_mode mode0 = insn_data[d->icode].operand[1].mode;
  machine_mode mode1 = insn_data[d->icode].operand[2].mode;

  if (optimize || target == 0
      || GET_MODE (target) != tmode
      || !insn_data[d->icode].operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);

  op0 = safe_vector_operand (op0, mode0);
  op1 = safe_vector_operand (op1, mode1);

  if ((optimize && !register_operand (op0, mode0))
      || !insn_data[d->icode].operand[0].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
  if ((optimize && !register_operand (op1, mode1))
      || !insn_data[d->icode].operand[1].predicate (op1, mode1))
    op1 = copy_to_mode_reg (mode1, op1);

  op2 = GEN_INT (d->comparison);

  pat = GEN_FCN (d->icode) (target, op0, op1, op2);
  if (! pat)
    return 0;
  emit_insn (pat);
  return target;
}
/* Subroutine of ix86_expand_builtin to take care of ptest insns.  */

static rtx
ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
		       rtx target)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  machine_mode mode0 = insn_data[d->icode].operand[0].mode;
  machine_mode mode1 = insn_data[d->icode].operand[1].mode;
  enum rtx_code comparison = d->comparison;

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);
  if (VECTOR_MODE_P (mode1))
    op1 = safe_vector_operand (op1, mode1);

  target = gen_reg_rtx (SImode);
  emit_move_insn (target, const0_rtx);
  target = gen_rtx_SUBREG (QImode, target, 0);

  if ((optimize && !register_operand (op0, mode0))
      || !insn_data[d->icode].operand[0].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
  if ((optimize && !register_operand (op1, mode1))
      || !insn_data[d->icode].operand[1].predicate (op1, mode1))
    op1 = copy_to_mode_reg (mode1, op1);

  pat = GEN_FCN (d->icode) (op0, op1);
  if (! pat)
    return 0;
  emit_insn (pat);
  emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
			  gen_rtx_fmt_ee (comparison, QImode,
					  SET_DEST (pat),
					  const0_rtx)));

  return SUBREG_REG (target);
}
/* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns.  */

static rtx
ix86_expand_sse_pcmpestr (const struct builtin_description *d,
			  tree exp, rtx target)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  tree arg2 = CALL_EXPR_ARG (exp, 2);
  tree arg3 = CALL_EXPR_ARG (exp, 3);
  tree arg4 = CALL_EXPR_ARG (exp, 4);
  rtx scratch0, scratch1;
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  rtx op2 = expand_normal (arg2);
  rtx op3 = expand_normal (arg3);
  rtx op4 = expand_normal (arg4);
  machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;

  tmode0 = insn_data[d->icode].operand[0].mode;
  tmode1 = insn_data[d->icode].operand[1].mode;
  modev2 = insn_data[d->icode].operand[2].mode;
  modei3 = insn_data[d->icode].operand[3].mode;
  modev4 = insn_data[d->icode].operand[4].mode;
  modei5 = insn_data[d->icode].operand[5].mode;
  modeimm = insn_data[d->icode].operand[6].mode;

  if (VECTOR_MODE_P (modev2))
    op0 = safe_vector_operand (op0, modev2);
  if (VECTOR_MODE_P (modev4))
    op2 = safe_vector_operand (op2, modev4);

  if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
    op0 = copy_to_mode_reg (modev2, op0);
  if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
    op1 = copy_to_mode_reg (modei3, op1);
  if ((optimize && !register_operand (op2, modev4))
      || !insn_data[d->icode].operand[4].predicate (op2, modev4))
    op2 = copy_to_mode_reg (modev4, op2);
  if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
    op3 = copy_to_mode_reg (modei5, op3);

  if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
    {
      error ("the fifth argument must be an 8-bit immediate");
      return const0_rtx;
    }

  if (d->code == IX86_BUILTIN_PCMPESTRI128)
    {
      if (optimize || !target
	  || GET_MODE (target) != tmode0
	  || !insn_data[d->icode].operand[0].predicate (target, tmode0))
	target = gen_reg_rtx (tmode0);

      scratch1 = gen_reg_rtx (tmode1);

      pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
    }
  else if (d->code == IX86_BUILTIN_PCMPESTRM128)
    {
      if (optimize || !target
	  || GET_MODE (target) != tmode1
	  || !insn_data[d->icode].operand[1].predicate (target, tmode1))
	target = gen_reg_rtx (tmode1);

      scratch0 = gen_reg_rtx (tmode0);

      pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
    }
  else
    {
      gcc_assert (d->flag);

      scratch0 = gen_reg_rtx (tmode0);
      scratch1 = gen_reg_rtx (tmode1);

      pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
    }

  if (! pat)
    return 0;

  emit_insn (pat);

  if (d->flag)
    {
      target = gen_reg_rtx (SImode);
      emit_move_insn (target, const0_rtx);
      target = gen_rtx_SUBREG (QImode, target, 0);

      emit_insn
	(gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
		      gen_rtx_fmt_ee (EQ, QImode,
				      gen_rtx_REG ((machine_mode) d->flag,
						   FLAGS_REG),
				      const0_rtx)));
      return SUBREG_REG (target);
    }
  else
    return target;
}
/* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns.  */

static rtx
ix86_expand_sse_pcmpistr (const struct builtin_description *d,
			  tree exp, rtx target)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  tree arg2 = CALL_EXPR_ARG (exp, 2);
  rtx scratch0, scratch1;
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  rtx op2 = expand_normal (arg2);
  machine_mode tmode0, tmode1, modev2, modev3, modeimm;

  tmode0 = insn_data[d->icode].operand[0].mode;
  tmode1 = insn_data[d->icode].operand[1].mode;
  modev2 = insn_data[d->icode].operand[2].mode;
  modev3 = insn_data[d->icode].operand[3].mode;
  modeimm = insn_data[d->icode].operand[4].mode;

  if (VECTOR_MODE_P (modev2))
    op0 = safe_vector_operand (op0, modev2);
  if (VECTOR_MODE_P (modev3))
    op1 = safe_vector_operand (op1, modev3);

  if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
    op0 = copy_to_mode_reg (modev2, op0);
  if ((optimize && !register_operand (op1, modev3))
      || !insn_data[d->icode].operand[3].predicate (op1, modev3))
    op1 = copy_to_mode_reg (modev3, op1);

  if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
    {
      error ("the third argument must be an 8-bit immediate");
      return const0_rtx;
    }

  if (d->code == IX86_BUILTIN_PCMPISTRI128)
    {
      if (optimize || !target
	  || GET_MODE (target) != tmode0
	  || !insn_data[d->icode].operand[0].predicate (target, tmode0))
	target = gen_reg_rtx (tmode0);

      scratch1 = gen_reg_rtx (tmode1);

      pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
    }
  else if (d->code == IX86_BUILTIN_PCMPISTRM128)
    {
      if (optimize || !target
	  || GET_MODE (target) != tmode1
	  || !insn_data[d->icode].operand[1].predicate (target, tmode1))
	target = gen_reg_rtx (tmode1);

      scratch0 = gen_reg_rtx (tmode0);

      pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
    }
  else
    {
      gcc_assert (d->flag);

      scratch0 = gen_reg_rtx (tmode0);
      scratch1 = gen_reg_rtx (tmode1);

      pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
    }

  if (! pat)
    return 0;

  emit_insn (pat);

  if (d->flag)
    {
      target = gen_reg_rtx (SImode);
      emit_move_insn (target, const0_rtx);
      target = gen_rtx_SUBREG (QImode, target, 0);

      emit_insn
	(gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
		      gen_rtx_fmt_ee (EQ, QImode,
				      gen_rtx_REG ((machine_mode) d->flag,
						   FLAGS_REG),
				      const0_rtx)));
      return SUBREG_REG (target);
    }
  else
    return target;
}
/* Fixup modeless constants to fit required mode.  */

static rtx
fixup_modeless_constant (rtx x, machine_mode mode)
{
  if (GET_MODE (x) == VOIDmode)
    x = convert_to_mode (mode, x, 1);
  return x;
}
/* Subroutine of ix86_expand_builtin to take care of insns with
   variable number of operands.  */

static rtx
ix86_expand_args_builtin (const struct builtin_description *d,
			  tree exp, rtx target)
{
  rtx pat, real_target;
  unsigned int i, nargs;
  unsigned int nargs_constant = 0;
  unsigned int mask_pos = 0;
  int num_memory = 0;
  struct
    {
      rtx op;
      machine_mode mode;
    } args[6];
  bool second_arg_count = false;
  enum insn_code icode = d->icode;
  const struct insn_data_d *insn_p = &insn_data[icode];
  machine_mode tmode = insn_p->operand[0].mode;
  machine_mode rmode = VOIDmode;
  bool swap = false;
  enum rtx_code comparison = d->comparison;

  switch ((enum ix86_builtin_func_type) d->flag)
    {
    case V2DF_FTYPE_V2DF_ROUND:
    case V4DF_FTYPE_V4DF_ROUND:
    case V8DF_FTYPE_V8DF_ROUND:
    case V4SF_FTYPE_V4SF_ROUND:
    case V8SF_FTYPE_V8SF_ROUND:
    case V16SF_FTYPE_V16SF_ROUND:
    case V4SI_FTYPE_V4SF_ROUND:
    case V8SI_FTYPE_V8SF_ROUND:
    case V16SI_FTYPE_V16SF_ROUND:
      return ix86_expand_sse_round (d, exp, target);
    case V4SI_FTYPE_V2DF_V2DF_ROUND:
    case V8SI_FTYPE_V4DF_V4DF_ROUND:
    case V16SI_FTYPE_V8DF_V8DF_ROUND:
      return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
    case INT_FTYPE_V8SF_V8SF_PTEST:
    case INT_FTYPE_V4DI_V4DI_PTEST:
    case INT_FTYPE_V4DF_V4DF_PTEST:
    case INT_FTYPE_V4SF_V4SF_PTEST:
    case INT_FTYPE_V2DI_V2DI_PTEST:
    case INT_FTYPE_V2DF_V2DF_PTEST:
      return ix86_expand_sse_ptest (d, exp, target);
    case FLOAT128_FTYPE_FLOAT128:
    case FLOAT_FTYPE_FLOAT:
    case UINT_FTYPE_UINT:
    case UINT16_FTYPE_UINT16:
    case UINT64_FTYPE_INT:
    case UINT64_FTYPE_UINT64:
    case INT64_FTYPE_INT64:
    case INT64_FTYPE_V4SF:
    case INT64_FTYPE_V2DF:
    case INT_FTYPE_V16QI:
    case INT_FTYPE_V8QI:
    case INT_FTYPE_V8SF:
    case INT_FTYPE_V4DF:
    case INT_FTYPE_V4SF:
    case INT_FTYPE_V2DF:
    case INT_FTYPE_V32QI:
    case V16QI_FTYPE_V16QI:
    case V8SI_FTYPE_V8SF:
    case V8SI_FTYPE_V4SI:
    case V8HI_FTYPE_V8HI:
    case V8HI_FTYPE_V16QI:
    case V8QI_FTYPE_V8QI:
    case V8SF_FTYPE_V8SF:
    case V8SF_FTYPE_V8SI:
    case V8SF_FTYPE_V4SF:
    case V8SF_FTYPE_V8HI:
    case V4SI_FTYPE_V4SI:
    case V4SI_FTYPE_V16QI:
    case V4SI_FTYPE_V4SF:
    case V4SI_FTYPE_V8SI:
    case V4SI_FTYPE_V8HI:
    case V4SI_FTYPE_V4DF:
    case V4SI_FTYPE_V2DF:
    case V4HI_FTYPE_V4HI:
    case V4DF_FTYPE_V4DF:
    case V4DF_FTYPE_V4SI:
    case V4DF_FTYPE_V4SF:
    case V4DF_FTYPE_V2DF:
    case V4SF_FTYPE_V4SF:
    case V4SF_FTYPE_V4SI:
    case V4SF_FTYPE_V8SF:
    case V4SF_FTYPE_V4DF:
    case V4SF_FTYPE_V8HI:
    case V4SF_FTYPE_V2DF:
    case V2DI_FTYPE_V2DI:
    case V2DI_FTYPE_V16QI:
    case V2DI_FTYPE_V8HI:
    case V2DI_FTYPE_V4SI:
    case V2DF_FTYPE_V2DF:
    case V2DF_FTYPE_V4SI:
    case V2DF_FTYPE_V4DF:
    case V2DF_FTYPE_V4SF:
    case V2DF_FTYPE_V2SI:
    case V2SI_FTYPE_V2SI:
    case V2SI_FTYPE_V4SF:
    case V2SI_FTYPE_V2SF:
    case V2SI_FTYPE_V2DF:
    case V2SF_FTYPE_V2SF:
    case V2SF_FTYPE_V2SI:
    case V32QI_FTYPE_V32QI:
    case V32QI_FTYPE_V16QI:
    case V16HI_FTYPE_V16HI:
    case V16HI_FTYPE_V8HI:
    case V8SI_FTYPE_V8SI:
    case V16HI_FTYPE_V16QI:
    case V8SI_FTYPE_V16QI:
    case V4DI_FTYPE_V16QI:
    case V8SI_FTYPE_V8HI:
    case V4DI_FTYPE_V8HI:
    case V4DI_FTYPE_V4SI:
    case V4DI_FTYPE_V2DI:
    case UHI_FTYPE_V16QI:
    case USI_FTYPE_V32QI:
    case UDI_FTYPE_V64QI:
    case V16QI_FTYPE_UHI:
    case V32QI_FTYPE_USI:
    case V64QI_FTYPE_UDI:
    case V8HI_FTYPE_UQI:
    case V16HI_FTYPE_UHI:
    case V32HI_FTYPE_USI:
    case V4SI_FTYPE_UQI:
    case V8SI_FTYPE_UQI:
    case V4SI_FTYPE_UHI:
    case V8SI_FTYPE_UHI:
    case UQI_FTYPE_V8HI:
    case UHI_FTYPE_V16HI:
    case USI_FTYPE_V32HI:
    case UQI_FTYPE_V4SI:
    case UQI_FTYPE_V8SI:
    case UHI_FTYPE_V16SI:
    case UQI_FTYPE_V2DI:
    case UQI_FTYPE_V4DI:
    case UQI_FTYPE_V8DI:
    case V16SI_FTYPE_UHI:
    case V2DI_FTYPE_UQI:
    case V4DI_FTYPE_UQI:
    case V16SI_FTYPE_INT:
    case V16SF_FTYPE_V8SF:
    case V16SI_FTYPE_V8SI:
    case V16SF_FTYPE_V4SF:
    case V16SI_FTYPE_V4SI:
    case V16SI_FTYPE_V16SF:
    case V16SI_FTYPE_V16SI:
    case V64QI_FTYPE_V64QI:
    case V32HI_FTYPE_V32HI:
    case V16SF_FTYPE_V16SF:
    case V8DI_FTYPE_UQI:
    case V8DI_FTYPE_V8DI:
    case V8DF_FTYPE_V4DF:
    case V8DF_FTYPE_V2DF:
    case V8DF_FTYPE_V8DF:
    case V4DI_FTYPE_V4DI:
    case V16HI_FTYPE_V16SF:
    case V8HI_FTYPE_V8SF:
    case V8HI_FTYPE_V4SF:
      nargs = 1;
      break;
    case V4SF_FTYPE_V4SF_VEC_MERGE:
    case V2DF_FTYPE_V2DF_VEC_MERGE:
      return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
    case FLOAT128_FTYPE_FLOAT128_FLOAT128:
    case V16QI_FTYPE_V16QI_V16QI:
    case V16QI_FTYPE_V8HI_V8HI:
    case V16SF_FTYPE_V16SF_V16SF:
    case V8QI_FTYPE_V8QI_V8QI:
    case V8QI_FTYPE_V4HI_V4HI:
    case V8HI_FTYPE_V8HI_V8HI:
    case V8HI_FTYPE_V16QI_V16QI:
    case V8HI_FTYPE_V4SI_V4SI:
    case V8SF_FTYPE_V8SF_V8SF:
    case V8SF_FTYPE_V8SF_V8SI:
    case V8DF_FTYPE_V8DF_V8DF:
    case V4SI_FTYPE_V4SI_V4SI:
    case V4SI_FTYPE_V8HI_V8HI:
    case V4SI_FTYPE_V2DF_V2DF:
    case V4HI_FTYPE_V4HI_V4HI:
    case V4HI_FTYPE_V8QI_V8QI:
    case V4HI_FTYPE_V2SI_V2SI:
    case V4DF_FTYPE_V4DF_V4DF:
    case V4DF_FTYPE_V4DF_V4DI:
    case V4SF_FTYPE_V4SF_V4SF:
    case V4SF_FTYPE_V4SF_V4SI:
    case V4SF_FTYPE_V4SF_V2SI:
    case V4SF_FTYPE_V4SF_V2DF:
    case V4SF_FTYPE_V4SF_UINT:
    case V4SF_FTYPE_V4SF_DI:
    case V4SF_FTYPE_V4SF_SI:
    case V2DI_FTYPE_V2DI_V2DI:
    case V2DI_FTYPE_V16QI_V16QI:
    case V2DI_FTYPE_V4SI_V4SI:
    case V2DI_FTYPE_V2DI_V16QI:
    case V2SI_FTYPE_V2SI_V2SI:
    case V2SI_FTYPE_V4HI_V4HI:
    case V2SI_FTYPE_V2SF_V2SF:
    case V2DF_FTYPE_V2DF_V2DF:
    case V2DF_FTYPE_V2DF_V4SF:
    case V2DF_FTYPE_V2DF_V2DI:
    case V2DF_FTYPE_V2DF_DI:
    case V2DF_FTYPE_V2DF_SI:
    case V2DF_FTYPE_V2DF_UINT:
    case V2SF_FTYPE_V2SF_V2SF:
    case V1DI_FTYPE_V1DI_V1DI:
    case V1DI_FTYPE_V8QI_V8QI:
    case V1DI_FTYPE_V2SI_V2SI:
    case V32QI_FTYPE_V16HI_V16HI:
    case V16HI_FTYPE_V8SI_V8SI:
    case V64QI_FTYPE_V64QI_V64QI:
    case V32QI_FTYPE_V32QI_V32QI:
    case V16HI_FTYPE_V32QI_V32QI:
    case V16HI_FTYPE_V16HI_V16HI:
    case V8SI_FTYPE_V4DF_V4DF:
    case V8SI_FTYPE_V8SI_V8SI:
    case V8SI_FTYPE_V16HI_V16HI:
    case V4DI_FTYPE_V4DI_V4DI:
    case V4DI_FTYPE_V8SI_V8SI:
    case V8DI_FTYPE_V64QI_V64QI:
      if (comparison == UNKNOWN)
	return ix86_expand_binop_builtin (icode, exp, target);
      nargs = 2;
      break;
    case V4SF_FTYPE_V4SF_V4SF_SWAP:
    case V2DF_FTYPE_V2DF_V2DF_SWAP:
      gcc_assert (comparison != UNKNOWN);
      nargs = 2;
      swap = true;
      break;
    case V16HI_FTYPE_V16HI_V8HI_COUNT:
    case V16HI_FTYPE_V16HI_SI_COUNT:
    case V8SI_FTYPE_V8SI_V4SI_COUNT:
    case V8SI_FTYPE_V8SI_SI_COUNT:
    case V4DI_FTYPE_V4DI_V2DI_COUNT:
    case V4DI_FTYPE_V4DI_INT_COUNT:
    case V8HI_FTYPE_V8HI_V8HI_COUNT:
    case V8HI_FTYPE_V8HI_SI_COUNT:
    case V4SI_FTYPE_V4SI_V4SI_COUNT:
    case V4SI_FTYPE_V4SI_SI_COUNT:
    case V4HI_FTYPE_V4HI_V4HI_COUNT:
    case V4HI_FTYPE_V4HI_SI_COUNT:
    case V2DI_FTYPE_V2DI_V2DI_COUNT:
    case V2DI_FTYPE_V2DI_SI_COUNT:
    case V2SI_FTYPE_V2SI_V2SI_COUNT:
    case V2SI_FTYPE_V2SI_SI_COUNT:
    case V1DI_FTYPE_V1DI_V1DI_COUNT:
    case V1DI_FTYPE_V1DI_SI_COUNT:
      nargs = 2;
      second_arg_count = true;
      break;
    case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
    case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
    case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
    case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
    case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
    case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
    case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
    case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
    case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
    case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
    case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
    case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
    case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
    case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
    case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
    case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
    case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
    case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
      nargs = 4;
      second_arg_count = true;
      break;
    case UINT64_FTYPE_UINT64_UINT64:
    case UINT_FTYPE_UINT_UINT
:
9206 case UINT_FTYPE_UINT_USHORT
:
9207 case UINT_FTYPE_UINT_UCHAR
:
9208 case UINT16_FTYPE_UINT16_INT
:
9209 case UINT8_FTYPE_UINT8_INT
:
9210 case UQI_FTYPE_UQI_UQI
:
9211 case UHI_FTYPE_UHI_UHI
:
9212 case USI_FTYPE_USI_USI
:
9213 case UDI_FTYPE_UDI_UDI
:
9214 case V16SI_FTYPE_V8DF_V8DF
:
9215 case V32HI_FTYPE_V16SF_V16SF
:
9216 case V16HI_FTYPE_V8SF_V8SF
:
9217 case V8HI_FTYPE_V4SF_V4SF
:
9218 case V16HI_FTYPE_V16SF_UHI
:
9219 case V8HI_FTYPE_V8SF_UQI
:
9220 case V8HI_FTYPE_V4SF_UQI
:
9223 case V2DI_FTYPE_V2DI_INT_CONVERT
:
9228 case V4DI_FTYPE_V4DI_INT_CONVERT
:
9233 case V8DI_FTYPE_V8DI_INT_CONVERT
:
9238 case V8HI_FTYPE_V8HI_INT
:
9239 case V8HI_FTYPE_V8SF_INT
:
9240 case V16HI_FTYPE_V16SF_INT
:
9241 case V8HI_FTYPE_V4SF_INT
:
9242 case V8SF_FTYPE_V8SF_INT
:
9243 case V4SF_FTYPE_V16SF_INT
:
9244 case V16SF_FTYPE_V16SF_INT
:
9245 case V4SI_FTYPE_V4SI_INT
:
9246 case V4SI_FTYPE_V8SI_INT
:
9247 case V4HI_FTYPE_V4HI_INT
:
9248 case V4DF_FTYPE_V4DF_INT
:
9249 case V4DF_FTYPE_V8DF_INT
:
9250 case V4SF_FTYPE_V4SF_INT
:
9251 case V4SF_FTYPE_V8SF_INT
:
9252 case V2DI_FTYPE_V2DI_INT
:
9253 case V2DF_FTYPE_V2DF_INT
:
9254 case V2DF_FTYPE_V4DF_INT
:
9255 case V16HI_FTYPE_V16HI_INT
:
9256 case V8SI_FTYPE_V8SI_INT
:
9257 case V16SI_FTYPE_V16SI_INT
:
9258 case V4SI_FTYPE_V16SI_INT
:
9259 case V4DI_FTYPE_V4DI_INT
:
9260 case V2DI_FTYPE_V4DI_INT
:
9261 case V4DI_FTYPE_V8DI_INT
:
9262 case UQI_FTYPE_UQI_UQI_CONST
:
9263 case UHI_FTYPE_UHI_UQI
:
9264 case USI_FTYPE_USI_UQI
:
9265 case UDI_FTYPE_UDI_UQI
:
9269 case V16QI_FTYPE_V16QI_V16QI_V16QI
:
9270 case V8SF_FTYPE_V8SF_V8SF_V8SF
:
9271 case V4DF_FTYPE_V4DF_V4DF_V4DF
:
9272 case V4SF_FTYPE_V4SF_V4SF_V4SF
:
9273 case V2DF_FTYPE_V2DF_V2DF_V2DF
:
9274 case V32QI_FTYPE_V32QI_V32QI_V32QI
:
9275 case UHI_FTYPE_V16SI_V16SI_UHI
:
9276 case UQI_FTYPE_V8DI_V8DI_UQI
:
9277 case V16HI_FTYPE_V16SI_V16HI_UHI
:
9278 case V16QI_FTYPE_V16SI_V16QI_UHI
:
9279 case V16QI_FTYPE_V8DI_V16QI_UQI
:
9280 case V16SF_FTYPE_V16SF_V16SF_UHI
:
9281 case V16SF_FTYPE_V4SF_V16SF_UHI
:
9282 case V16SI_FTYPE_SI_V16SI_UHI
:
9283 case V16SI_FTYPE_V16HI_V16SI_UHI
:
9284 case V16SI_FTYPE_V16QI_V16SI_UHI
:
9285 case V8SF_FTYPE_V4SF_V8SF_UQI
:
9286 case V4DF_FTYPE_V2DF_V4DF_UQI
:
9287 case V8SI_FTYPE_V4SI_V8SI_UQI
:
9288 case V8SI_FTYPE_SI_V8SI_UQI
:
9289 case V4SI_FTYPE_V4SI_V4SI_UQI
:
9290 case V4SI_FTYPE_SI_V4SI_UQI
:
9291 case V4DI_FTYPE_V2DI_V4DI_UQI
:
9292 case V4DI_FTYPE_DI_V4DI_UQI
:
9293 case V2DI_FTYPE_V2DI_V2DI_UQI
:
9294 case V2DI_FTYPE_DI_V2DI_UQI
:
9295 case V64QI_FTYPE_V64QI_V64QI_UDI
:
9296 case V64QI_FTYPE_V16QI_V64QI_UDI
:
9297 case V64QI_FTYPE_QI_V64QI_UDI
:
9298 case V32QI_FTYPE_V32QI_V32QI_USI
:
9299 case V32QI_FTYPE_V16QI_V32QI_USI
:
9300 case V32QI_FTYPE_QI_V32QI_USI
:
9301 case V16QI_FTYPE_V16QI_V16QI_UHI
:
9302 case V16QI_FTYPE_QI_V16QI_UHI
:
9303 case V32HI_FTYPE_V8HI_V32HI_USI
:
9304 case V32HI_FTYPE_HI_V32HI_USI
:
9305 case V16HI_FTYPE_V8HI_V16HI_UHI
:
9306 case V16HI_FTYPE_HI_V16HI_UHI
:
9307 case V8HI_FTYPE_V8HI_V8HI_UQI
:
9308 case V8HI_FTYPE_HI_V8HI_UQI
:
9309 case V8SF_FTYPE_V8HI_V8SF_UQI
:
9310 case V4SF_FTYPE_V8HI_V4SF_UQI
:
9311 case V8SI_FTYPE_V8SF_V8SI_UQI
:
9312 case V4SI_FTYPE_V4SF_V4SI_UQI
:
9313 case V4DI_FTYPE_V4SF_V4DI_UQI
:
9314 case V2DI_FTYPE_V4SF_V2DI_UQI
:
9315 case V4SF_FTYPE_V4DI_V4SF_UQI
:
9316 case V4SF_FTYPE_V2DI_V4SF_UQI
:
9317 case V4DF_FTYPE_V4DI_V4DF_UQI
:
9318 case V2DF_FTYPE_V2DI_V2DF_UQI
:
9319 case V16QI_FTYPE_V8HI_V16QI_UQI
:
9320 case V16QI_FTYPE_V16HI_V16QI_UHI
:
9321 case V16QI_FTYPE_V4SI_V16QI_UQI
:
9322 case V16QI_FTYPE_V8SI_V16QI_UQI
:
9323 case V8HI_FTYPE_V4SI_V8HI_UQI
:
9324 case V8HI_FTYPE_V8SI_V8HI_UQI
:
9325 case V16QI_FTYPE_V2DI_V16QI_UQI
:
9326 case V16QI_FTYPE_V4DI_V16QI_UQI
:
9327 case V8HI_FTYPE_V2DI_V8HI_UQI
:
9328 case V8HI_FTYPE_V4DI_V8HI_UQI
:
9329 case V4SI_FTYPE_V2DI_V4SI_UQI
:
9330 case V4SI_FTYPE_V4DI_V4SI_UQI
:
9331 case V32QI_FTYPE_V32HI_V32QI_USI
:
9332 case UHI_FTYPE_V16QI_V16QI_UHI
:
9333 case USI_FTYPE_V32QI_V32QI_USI
:
9334 case UDI_FTYPE_V64QI_V64QI_UDI
:
9335 case UQI_FTYPE_V8HI_V8HI_UQI
:
9336 case UHI_FTYPE_V16HI_V16HI_UHI
:
9337 case USI_FTYPE_V32HI_V32HI_USI
:
9338 case UQI_FTYPE_V4SI_V4SI_UQI
:
9339 case UQI_FTYPE_V8SI_V8SI_UQI
:
9340 case UQI_FTYPE_V2DI_V2DI_UQI
:
9341 case UQI_FTYPE_V4DI_V4DI_UQI
:
9342 case V4SF_FTYPE_V2DF_V4SF_UQI
:
9343 case V4SF_FTYPE_V4DF_V4SF_UQI
:
9344 case V16SI_FTYPE_V16SI_V16SI_UHI
:
9345 case V16SI_FTYPE_V4SI_V16SI_UHI
:
9346 case V2DI_FTYPE_V4SI_V2DI_UQI
:
9347 case V2DI_FTYPE_V8HI_V2DI_UQI
:
9348 case V2DI_FTYPE_V16QI_V2DI_UQI
:
9349 case V4DI_FTYPE_V4DI_V4DI_UQI
:
9350 case V4DI_FTYPE_V4SI_V4DI_UQI
:
9351 case V4DI_FTYPE_V8HI_V4DI_UQI
:
9352 case V4DI_FTYPE_V16QI_V4DI_UQI
:
9353 case V4DI_FTYPE_V4DF_V4DI_UQI
:
9354 case V2DI_FTYPE_V2DF_V2DI_UQI
:
9355 case V4SI_FTYPE_V4DF_V4SI_UQI
:
9356 case V4SI_FTYPE_V2DF_V4SI_UQI
:
9357 case V4SI_FTYPE_V8HI_V4SI_UQI
:
9358 case V4SI_FTYPE_V16QI_V4SI_UQI
:
9359 case V4DI_FTYPE_V4DI_V4DI_V4DI
:
9360 case V8DF_FTYPE_V2DF_V8DF_UQI
:
9361 case V8DF_FTYPE_V4DF_V8DF_UQI
:
9362 case V8DF_FTYPE_V8DF_V8DF_UQI
:
9363 case V8SF_FTYPE_V8SF_V8SF_UQI
:
9364 case V8SF_FTYPE_V8SI_V8SF_UQI
:
9365 case V4DF_FTYPE_V4DF_V4DF_UQI
:
9366 case V4SF_FTYPE_V4SF_V4SF_UQI
:
9367 case V2DF_FTYPE_V2DF_V2DF_UQI
:
9368 case V2DF_FTYPE_V4SF_V2DF_UQI
:
9369 case V2DF_FTYPE_V4SI_V2DF_UQI
:
9370 case V4SF_FTYPE_V4SI_V4SF_UQI
:
9371 case V4DF_FTYPE_V4SF_V4DF_UQI
:
9372 case V4DF_FTYPE_V4SI_V4DF_UQI
:
9373 case V8SI_FTYPE_V8SI_V8SI_UQI
:
9374 case V8SI_FTYPE_V8HI_V8SI_UQI
:
9375 case V8SI_FTYPE_V16QI_V8SI_UQI
:
9376 case V8DF_FTYPE_V8SI_V8DF_UQI
:
9377 case V8DI_FTYPE_DI_V8DI_UQI
:
9378 case V16SF_FTYPE_V8SF_V16SF_UHI
:
9379 case V16SI_FTYPE_V8SI_V16SI_UHI
:
9380 case V16HI_FTYPE_V16HI_V16HI_UHI
:
9381 case V8HI_FTYPE_V16QI_V8HI_UQI
:
9382 case V16HI_FTYPE_V16QI_V16HI_UHI
:
9383 case V32HI_FTYPE_V32HI_V32HI_USI
:
9384 case V32HI_FTYPE_V32QI_V32HI_USI
:
9385 case V8DI_FTYPE_V16QI_V8DI_UQI
:
9386 case V8DI_FTYPE_V2DI_V8DI_UQI
:
9387 case V8DI_FTYPE_V4DI_V8DI_UQI
:
9388 case V8DI_FTYPE_V8DI_V8DI_UQI
:
9389 case V8DI_FTYPE_V8HI_V8DI_UQI
:
9390 case V8DI_FTYPE_V8SI_V8DI_UQI
:
9391 case V8HI_FTYPE_V8DI_V8HI_UQI
:
9392 case V8SI_FTYPE_V8DI_V8SI_UQI
:
9393 case V4SI_FTYPE_V4SI_V4SI_V4SI
:
9394 case V16SI_FTYPE_V16SI_V16SI_V16SI
:
9395 case V8DI_FTYPE_V8DI_V8DI_V8DI
:
9396 case V32HI_FTYPE_V32HI_V32HI_V32HI
:
9397 case V2DI_FTYPE_V2DI_V2DI_V2DI
:
9398 case V16HI_FTYPE_V16HI_V16HI_V16HI
:
9399 case V8SI_FTYPE_V8SI_V8SI_V8SI
:
9400 case V8HI_FTYPE_V8HI_V8HI_V8HI
:
9401 case V32HI_FTYPE_V16SF_V16SF_USI
:
9402 case V16HI_FTYPE_V8SF_V8SF_UHI
:
9403 case V8HI_FTYPE_V4SF_V4SF_UQI
:
9404 case V16HI_FTYPE_V16SF_V16HI_UHI
:
9405 case V8HI_FTYPE_V8SF_V8HI_UQI
:
9406 case V8HI_FTYPE_V4SF_V8HI_UQI
:
9407 case V16SF_FTYPE_V16SF_V32HI_V32HI
:
9408 case V8SF_FTYPE_V8SF_V16HI_V16HI
:
9409 case V4SF_FTYPE_V4SF_V8HI_V8HI
:
9412 case V32QI_FTYPE_V32QI_V32QI_INT
:
9413 case V16HI_FTYPE_V16HI_V16HI_INT
:
9414 case V16QI_FTYPE_V16QI_V16QI_INT
:
9415 case V4DI_FTYPE_V4DI_V4DI_INT
:
9416 case V8HI_FTYPE_V8HI_V8HI_INT
:
9417 case V8SI_FTYPE_V8SI_V8SI_INT
:
9418 case V8SI_FTYPE_V8SI_V4SI_INT
:
9419 case V8SF_FTYPE_V8SF_V8SF_INT
:
9420 case V8SF_FTYPE_V8SF_V4SF_INT
:
9421 case V4SI_FTYPE_V4SI_V4SI_INT
:
9422 case V4DF_FTYPE_V4DF_V4DF_INT
:
9423 case V16SF_FTYPE_V16SF_V16SF_INT
:
9424 case V16SF_FTYPE_V16SF_V4SF_INT
:
9425 case V16SI_FTYPE_V16SI_V4SI_INT
:
9426 case V4DF_FTYPE_V4DF_V2DF_INT
:
9427 case V4SF_FTYPE_V4SF_V4SF_INT
:
9428 case V2DI_FTYPE_V2DI_V2DI_INT
:
9429 case V4DI_FTYPE_V4DI_V2DI_INT
:
9430 case V2DF_FTYPE_V2DF_V2DF_INT
:
9431 case UQI_FTYPE_V8DI_V8UDI_INT
:
9432 case UQI_FTYPE_V8DF_V8DF_INT
:
9433 case UQI_FTYPE_V2DF_V2DF_INT
:
9434 case UQI_FTYPE_V4SF_V4SF_INT
:
9435 case UHI_FTYPE_V16SI_V16SI_INT
:
9436 case UHI_FTYPE_V16SF_V16SF_INT
:
9437 case V64QI_FTYPE_V64QI_V64QI_INT
:
9438 case V32HI_FTYPE_V32HI_V32HI_INT
:
9439 case V16SI_FTYPE_V16SI_V16SI_INT
:
9440 case V8DI_FTYPE_V8DI_V8DI_INT
:
9444 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT
:
9449 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT
:
9454 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT
:
9459 case V2DI_FTYPE_V2DI_UINT_UINT
:
9463 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT
:
9468 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT
:
9474 case QI_FTYPE_V8DF_INT_UQI
:
9475 case QI_FTYPE_V4DF_INT_UQI
:
9476 case QI_FTYPE_V2DF_INT_UQI
:
9477 case HI_FTYPE_V16SF_INT_UHI
:
9478 case QI_FTYPE_V8SF_INT_UQI
:
9479 case QI_FTYPE_V4SF_INT_UQI
:
9480 case V4SI_FTYPE_V4SI_V4SI_UHI
:
9481 case V8SI_FTYPE_V8SI_V8SI_UHI
:
9486 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT
:
9492 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT
:
9498 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI
:
9499 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI
:
9500 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI
:
9501 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI
:
9502 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI
:
9503 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI
:
9504 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI
:
9505 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI
:
9506 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI
:
9507 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI
:
9508 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI
:
9509 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI
:
9510 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI
:
9511 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI
:
9512 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI
:
9513 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI
:
9514 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI
:
9515 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI
:
9516 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI
:
9517 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI
:
9518 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI
:
9519 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI
:
9520 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI
:
9521 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI
:
9522 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI
:
9523 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI
:
9524 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI
:
9525 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI
:
9526 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI
:
9527 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI
:
9528 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI
:
9529 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI
:
9530 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI
:
9531 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI
:
9532 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI
:
9533 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI
:
9534 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI
:
9535 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI
:
9536 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI
:
9537 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI
:
9538 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI
:
9539 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI
:
9540 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI
:
9541 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI
:
9542 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI
:
9543 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI
:
9544 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI
:
9545 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI
:
9546 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI
:
9547 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI
:
9548 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI
:
9549 case V32HI_FTYPE_V16SF_V16SF_V32HI_USI
:
9550 case V16HI_FTYPE_V8SF_V8SF_V16HI_UHI
:
9551 case V8HI_FTYPE_V4SF_V4SF_V8HI_UQI
:
9554 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT
:
9555 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT
:
9556 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT
:
9557 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT
:
9558 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT
:
9562 case UQI_FTYPE_V4DI_V4DI_INT_UQI
:
9563 case UQI_FTYPE_V8SI_V8SI_INT_UQI
:
9564 case QI_FTYPE_V4DF_V4DF_INT_UQI
:
9565 case QI_FTYPE_V8SF_V8SF_INT_UQI
:
9566 case UQI_FTYPE_V2DI_V2DI_INT_UQI
:
9567 case UQI_FTYPE_V4SI_V4SI_INT_UQI
:
9568 case UQI_FTYPE_V2DF_V2DF_INT_UQI
:
9569 case UQI_FTYPE_V4SF_V4SF_INT_UQI
:
9570 case UDI_FTYPE_V64QI_V64QI_INT_UDI
:
9571 case USI_FTYPE_V32QI_V32QI_INT_USI
:
9572 case UHI_FTYPE_V16QI_V16QI_INT_UHI
:
9573 case USI_FTYPE_V32HI_V32HI_INT_USI
:
9574 case UHI_FTYPE_V16HI_V16HI_INT_UHI
:
9575 case UQI_FTYPE_V8HI_V8HI_INT_UQI
:
9576 case V32HI_FTYPE_V32HI_V32HI_V32HI_INT
:
9577 case V16HI_FTYPE_V16HI_V16HI_V16HI_INT
:
9578 case V8HI_FTYPE_V8HI_V8HI_V8HI_INT
:
9579 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT
:
9580 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT
:
9581 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT
:
9582 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT
:
9583 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT
:
9584 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT
:
9589 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT
:
9593 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED
:
9594 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG
:
9595 case V16SF_FTYPE_V16SF_V32HI_V32HI_UHI
:
9596 case V8SF_FTYPE_V8SF_V16HI_V16HI_UQI
:
9597 case V4SF_FTYPE_V4SF_V8HI_V8HI_UQI
:
9600 case UQI_FTYPE_V8DI_V8DI_INT_UQI
:
9601 case UHI_FTYPE_V16SI_V16SI_INT_UHI
:
9606 case V8SF_FTYPE_V8SF_INT_V8SF_UQI
:
9607 case V4SF_FTYPE_V4SF_INT_V4SF_UQI
:
9608 case V2DF_FTYPE_V4DF_INT_V2DF_UQI
:
9609 case V2DI_FTYPE_V4DI_INT_V2DI_UQI
:
9610 case V8SF_FTYPE_V16SF_INT_V8SF_UQI
:
9611 case V8SI_FTYPE_V16SI_INT_V8SI_UQI
:
9612 case V2DF_FTYPE_V8DF_INT_V2DF_UQI
:
9613 case V2DI_FTYPE_V8DI_INT_V2DI_UQI
:
9614 case V4SF_FTYPE_V8SF_INT_V4SF_UQI
:
9615 case V4SI_FTYPE_V8SI_INT_V4SI_UQI
:
9616 case V8HI_FTYPE_V8SF_INT_V8HI_UQI
:
9617 case V8HI_FTYPE_V4SF_INT_V8HI_UQI
:
9618 case V32HI_FTYPE_V32HI_INT_V32HI_USI
:
9619 case V16HI_FTYPE_V16HI_INT_V16HI_UHI
:
9620 case V8HI_FTYPE_V8HI_INT_V8HI_UQI
:
9621 case V4DI_FTYPE_V4DI_INT_V4DI_UQI
:
9622 case V2DI_FTYPE_V2DI_INT_V2DI_UQI
:
9623 case V8SI_FTYPE_V8SI_INT_V8SI_UQI
:
9624 case V4SI_FTYPE_V4SI_INT_V4SI_UQI
:
9625 case V4DF_FTYPE_V4DF_INT_V4DF_UQI
:
9626 case V2DF_FTYPE_V2DF_INT_V2DF_UQI
:
9627 case V8DF_FTYPE_V8DF_INT_V8DF_UQI
:
9628 case V16SF_FTYPE_V16SF_INT_V16SF_UHI
:
9629 case V16HI_FTYPE_V16SF_INT_V16HI_UHI
:
9630 case V16SI_FTYPE_V16SI_INT_V16SI_UHI
:
9631 case V4SI_FTYPE_V16SI_INT_V4SI_UQI
:
9632 case V4DI_FTYPE_V8DI_INT_V4DI_UQI
:
9633 case V4DF_FTYPE_V8DF_INT_V4DF_UQI
:
9634 case V4SF_FTYPE_V16SF_INT_V4SF_UQI
:
9635 case V8DI_FTYPE_V8DI_INT_V8DI_UQI
:
9640 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI
:
9641 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI
:
9642 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI
:
9643 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI
:
9644 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI
:
9645 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI
:
9646 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI
:
9647 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI
:
9648 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI
:
9649 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI
:
9650 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI
:
9651 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI
:
9652 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI
:
9653 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI
:
9654 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI
:
9655 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI
:
9656 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI
:
9657 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI
:
9658 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI
:
9659 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI
:
9660 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI
:
9661 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI
:
9662 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI
:
9663 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI
:
9664 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI
:
9665 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI
:
9666 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI
:
9671 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI
:
9672 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI
:
9673 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI
:
9674 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI
:
9675 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI
:
9676 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI
:
9677 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI
:
9678 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI
:
9679 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI
:
9680 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI
:
9685 case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI
:
9686 case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI
:
9687 case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI
:
9688 case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT
:
9689 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT
:
9690 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT
:
9691 case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT
:
9692 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT
:
9693 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT
:
9694 case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT
:
9695 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT
:
9696 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT
:
9706 gcc_assert (nargs
<= ARRAY_SIZE (args
));
9708 if (comparison
!= UNKNOWN
)
9710 gcc_assert (nargs
== 2);
9711 return ix86_expand_sse_compare (d
, exp
, target
, swap
);
9714 if (rmode
== VOIDmode
|| rmode
== tmode
)
9718 || GET_MODE (target
) != tmode
9719 || !insn_p
->operand
[0].predicate (target
, tmode
))
9720 target
= gen_reg_rtx (tmode
);
9721 else if (memory_operand (target
, tmode
))
9723 real_target
= target
;
9727 real_target
= gen_reg_rtx (tmode
);
9728 target
= lowpart_subreg (rmode
, real_target
, tmode
);
9731 for (i
= 0; i
< nargs
; i
++)
9733 tree arg
= CALL_EXPR_ARG (exp
, i
);
9734 rtx op
= expand_normal (arg
);
9735 machine_mode mode
= insn_p
->operand
[i
+ 1].mode
;
9736 bool match
= insn_p
->operand
[i
+ 1].predicate (op
, mode
);
9738 if (second_arg_count
&& i
== 1)
9740 /* SIMD shift insns take either an 8-bit immediate or
9741 register as count. But builtin functions take int as
9742 count. If count doesn't match, we put it in register.
9743 The instructions are using 64-bit count, if op is just
9744 32-bit, zero-extend it, as negative shift counts
9745 are undefined behavior and zero-extension is more
9749 if (SCALAR_INT_MODE_P (GET_MODE (op
)))
9750 op
= convert_modes (mode
, GET_MODE (op
), op
, 1);
9752 op
= lowpart_subreg (mode
, op
, GET_MODE (op
));
9753 if (!insn_p
->operand
[i
+ 1].predicate (op
, mode
))
9754 op
= copy_to_reg (op
);
9757 else if ((mask_pos
&& (nargs
- i
- mask_pos
) == nargs_constant
) ||
9758 (!mask_pos
&& (nargs
- i
) <= nargs_constant
))
9763 case CODE_FOR_avx_vinsertf128v4di
:
9764 case CODE_FOR_avx_vextractf128v4di
:
9765 error ("the last argument must be an 1-bit immediate");
9768 case CODE_FOR_avx512f_cmpv8di3_mask
:
9769 case CODE_FOR_avx512f_cmpv16si3_mask
:
9770 case CODE_FOR_avx512f_ucmpv8di3_mask
:
9771 case CODE_FOR_avx512f_ucmpv16si3_mask
:
9772 case CODE_FOR_avx512vl_cmpv4di3_mask
:
9773 case CODE_FOR_avx512vl_cmpv8si3_mask
:
9774 case CODE_FOR_avx512vl_ucmpv4di3_mask
:
9775 case CODE_FOR_avx512vl_ucmpv8si3_mask
:
9776 case CODE_FOR_avx512vl_cmpv2di3_mask
:
9777 case CODE_FOR_avx512vl_cmpv4si3_mask
:
9778 case CODE_FOR_avx512vl_ucmpv2di3_mask
:
9779 case CODE_FOR_avx512vl_ucmpv4si3_mask
:
9780 error ("the last argument must be a 3-bit immediate");
9783 case CODE_FOR_sse4_1_roundsd
:
9784 case CODE_FOR_sse4_1_roundss
:
9786 case CODE_FOR_sse4_1_roundpd
:
9787 case CODE_FOR_sse4_1_roundps
:
9788 case CODE_FOR_avx_roundpd256
:
9789 case CODE_FOR_avx_roundps256
:
9791 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix
:
9792 case CODE_FOR_sse4_1_roundps_sfix
:
9793 case CODE_FOR_avx_roundpd_vec_pack_sfix256
:
9794 case CODE_FOR_avx_roundps_sfix256
:
9796 case CODE_FOR_sse4_1_blendps
:
9797 case CODE_FOR_avx_blendpd256
:
9798 case CODE_FOR_avx_vpermilv4df
:
9799 case CODE_FOR_avx_vpermilv4df_mask
:
9800 case CODE_FOR_avx512f_getmantv8df_mask
:
9801 case CODE_FOR_avx512f_getmantv16sf_mask
:
9802 case CODE_FOR_avx512vl_getmantv8sf_mask
:
9803 case CODE_FOR_avx512vl_getmantv4df_mask
:
9804 case CODE_FOR_avx512vl_getmantv4sf_mask
:
9805 case CODE_FOR_avx512vl_getmantv2df_mask
:
9806 case CODE_FOR_avx512dq_rangepv8df_mask_round
:
9807 case CODE_FOR_avx512dq_rangepv16sf_mask_round
:
9808 case CODE_FOR_avx512dq_rangepv4df_mask
:
9809 case CODE_FOR_avx512dq_rangepv8sf_mask
:
9810 case CODE_FOR_avx512dq_rangepv2df_mask
:
9811 case CODE_FOR_avx512dq_rangepv4sf_mask
:
9812 case CODE_FOR_avx_shufpd256_mask
:
9813 error ("the last argument must be a 4-bit immediate");
9816 case CODE_FOR_sha1rnds4
:
9817 case CODE_FOR_sse4_1_blendpd
:
9818 case CODE_FOR_avx_vpermilv2df
:
9819 case CODE_FOR_avx_vpermilv2df_mask
:
9820 case CODE_FOR_xop_vpermil2v2df3
:
9821 case CODE_FOR_xop_vpermil2v4sf3
:
9822 case CODE_FOR_xop_vpermil2v4df3
:
9823 case CODE_FOR_xop_vpermil2v8sf3
:
9824 case CODE_FOR_avx512f_vinsertf32x4_mask
:
9825 case CODE_FOR_avx512f_vinserti32x4_mask
:
9826 case CODE_FOR_avx512f_vextractf32x4_mask
:
9827 case CODE_FOR_avx512f_vextracti32x4_mask
:
9828 case CODE_FOR_sse2_shufpd
:
9829 case CODE_FOR_sse2_shufpd_mask
:
9830 case CODE_FOR_avx512dq_shuf_f64x2_mask
:
9831 case CODE_FOR_avx512dq_shuf_i64x2_mask
:
9832 case CODE_FOR_avx512vl_shuf_i32x4_mask
:
9833 case CODE_FOR_avx512vl_shuf_f32x4_mask
:
9834 error ("the last argument must be a 2-bit immediate");
9837 case CODE_FOR_avx_vextractf128v4df
:
9838 case CODE_FOR_avx_vextractf128v8sf
:
9839 case CODE_FOR_avx_vextractf128v8si
:
9840 case CODE_FOR_avx_vinsertf128v4df
:
9841 case CODE_FOR_avx_vinsertf128v8sf
:
9842 case CODE_FOR_avx_vinsertf128v8si
:
9843 case CODE_FOR_avx512f_vinsertf64x4_mask
:
9844 case CODE_FOR_avx512f_vinserti64x4_mask
:
9845 case CODE_FOR_avx512f_vextractf64x4_mask
:
9846 case CODE_FOR_avx512f_vextracti64x4_mask
:
9847 case CODE_FOR_avx512dq_vinsertf32x8_mask
:
9848 case CODE_FOR_avx512dq_vinserti32x8_mask
:
9849 case CODE_FOR_avx512vl_vinsertv4df
:
9850 case CODE_FOR_avx512vl_vinsertv4di
:
9851 case CODE_FOR_avx512vl_vinsertv8sf
:
9852 case CODE_FOR_avx512vl_vinsertv8si
:
9853 error ("the last argument must be a 1-bit immediate");
9856 case CODE_FOR_avx_vmcmpv2df3
:
9857 case CODE_FOR_avx_vmcmpv4sf3
:
9858 case CODE_FOR_avx_cmpv2df3
:
9859 case CODE_FOR_avx_cmpv4sf3
:
9860 case CODE_FOR_avx_cmpv4df3
:
9861 case CODE_FOR_avx_cmpv8sf3
:
9862 case CODE_FOR_avx512f_cmpv8df3_mask
:
9863 case CODE_FOR_avx512f_cmpv16sf3_mask
:
9864 case CODE_FOR_avx512f_vmcmpv2df3_mask
:
9865 case CODE_FOR_avx512f_vmcmpv4sf3_mask
:
9866 error ("the last argument must be a 5-bit immediate");
9870 switch (nargs_constant
)
9873 if ((mask_pos
&& (nargs
- i
- mask_pos
) == nargs_constant
) ||
9874 (!mask_pos
&& (nargs
- i
) == nargs_constant
))
9876 error ("the next to last argument must be an 8-bit immediate");
9881 error ("the last argument must be an 8-bit immediate");
9891 if (VECTOR_MODE_P (mode
))
9892 op
= safe_vector_operand (op
, mode
);
9894 /* If we aren't optimizing, only allow one memory operand to
9896 if (memory_operand (op
, mode
))
9899 op
= fixup_modeless_constant (op
, mode
);
9901 if (GET_MODE (op
) == mode
|| GET_MODE (op
) == VOIDmode
)
9903 if (optimize
|| !match
|| num_memory
> 1)
9904 op
= copy_to_mode_reg (mode
, op
);
9908 op
= copy_to_reg (op
);
9909 op
= lowpart_subreg (mode
, op
, GET_MODE (op
));
9914 args
[i
].mode
= mode
;
9920 pat
= GEN_FCN (icode
) (real_target
, args
[0].op
);
9923 pat
= GEN_FCN (icode
) (real_target
, args
[0].op
, args
[1].op
);
9926 pat
= GEN_FCN (icode
) (real_target
, args
[0].op
, args
[1].op
,
9930 pat
= GEN_FCN (icode
) (real_target
, args
[0].op
, args
[1].op
,
9931 args
[2].op
, args
[3].op
);
9934 pat
= GEN_FCN (icode
) (real_target
, args
[0].op
, args
[1].op
,
9935 args
[2].op
, args
[3].op
, args
[4].op
);
9938 pat
= GEN_FCN (icode
) (real_target
, args
[0].op
, args
[1].op
,
9939 args
[2].op
, args
[3].op
, args
[4].op
,
9953 /* Transform pattern of following layout:
9955 (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
9961 ix86_erase_embedded_rounding (rtx pat
)
9963 if (GET_CODE (pat
) == INSN
)
9964 pat
= PATTERN (pat
);
9966 gcc_assert (GET_CODE (pat
) == SET
);
9967 rtx src
= SET_SRC (pat
);
9968 gcc_assert (XVECLEN (src
, 0) == 2);
9969 rtx p0
= XVECEXP (src
, 0, 0);
9970 gcc_assert (GET_CODE (src
) == UNSPEC
9971 && XINT (src
, 1) == UNSPEC_EMBEDDED_ROUNDING
);
9972 rtx res
= gen_rtx_SET (SET_DEST (pat
), p0
);
9976 /* Subroutine of ix86_expand_round_builtin to take care of comi insns
9979 ix86_expand_sse_comi_round (const struct builtin_description
*d
,
9980 tree exp
, rtx target
)
9983 tree arg0
= CALL_EXPR_ARG (exp
, 0);
9984 tree arg1
= CALL_EXPR_ARG (exp
, 1);
9985 tree arg2
= CALL_EXPR_ARG (exp
, 2);
9986 tree arg3
= CALL_EXPR_ARG (exp
, 3);
9987 rtx op0
= expand_normal (arg0
);
9988 rtx op1
= expand_normal (arg1
);
9989 rtx op2
= expand_normal (arg2
);
9990 rtx op3
= expand_normal (arg3
);
9991 enum insn_code icode
= d
->icode
;
9992 const struct insn_data_d
*insn_p
= &insn_data
[icode
];
9993 machine_mode mode0
= insn_p
->operand
[0].mode
;
9994 machine_mode mode1
= insn_p
->operand
[1].mode
;
9996 /* See avxintrin.h for values. */
9997 static const enum rtx_code comparisons
[32] =
9999 EQ
, LT
, LE
, UNORDERED
, NE
, UNGE
, UNGT
, ORDERED
,
10000 UNEQ
, UNLT
, UNLE
, UNORDERED
, LTGT
, GE
, GT
, ORDERED
,
10001 EQ
, LT
, LE
, UNORDERED
, NE
, UNGE
, UNGT
, ORDERED
,
10002 UNEQ
, UNLT
, UNLE
, UNORDERED
, LTGT
, GE
, GT
, ORDERED
10004 static const bool ordereds
[32] =
10006 true, true, true, false, false, false, false, true,
10007 false, false, false, true, true, true, true, false,
10008 true, true, true, false, false, false, false, true,
10009 false, false, false, true, true, true, true, false
10011 static const bool non_signalings
[32] =
10013 true, false, false, true, true, false, false, true,
10014 true, false, false, true, true, false, false, true,
10015 false, true, true, false, false, true, true, false,
10016 false, true, true, false, false, true, true, false
10019 if (!CONST_INT_P (op2
))
10021 error ("the third argument must be comparison constant");
10024 if (INTVAL (op2
) < 0 || INTVAL (op2
) >= 32)
10026 error ("incorrect comparison mode");
10030 if (!insn_p
->operand
[2].predicate (op3
, SImode
))
10032 error ("incorrect rounding operand");
10036 if (VECTOR_MODE_P (mode0
))
10037 op0
= safe_vector_operand (op0
, mode0
);
10038 if (VECTOR_MODE_P (mode1
))
10039 op1
= safe_vector_operand (op1
, mode1
);
10041 enum rtx_code comparison
= comparisons
[INTVAL (op2
)];
10042 bool ordered
= ordereds
[INTVAL (op2
)];
10043 bool non_signaling
= non_signalings
[INTVAL (op2
)];
10044 rtx const_val
= const0_rtx
;
10046 bool check_unordered
= false;
10047 machine_mode mode
= CCFPmode
;
10048 switch (comparison
)
10053 /* NB: Use CCSmode/NE for _CMP_TRUE_UQ/_CMP_TRUE_US. */
10054 if (!non_signaling
)
10060 /* NB: Use CCPmode/NE for _CMP_ORD_Q/_CMP_ORD_S. */
10070 /* NB: Use CCSmode/EQ for _CMP_FALSE_OQ/_CMP_FALSE_OS. */
10077 /* NB: Use CCPmode/NE for _CMP_UNORD_Q/_CMP_UNORD_S. */
10078 if (!non_signaling
)
10085 case LE
: /* -> GE */
10086 case LT
: /* -> GT */
10087 case UNGE
: /* -> UNLE */
10088 case UNGT
: /* -> UNLT */
10089 std::swap (op0
, op1
);
10090 comparison
= swap_condition (comparison
);
10098 /* These are supported by CCFPmode. NB: Use ordered/signaling
10099 COMI or unordered/non-signaling UCOMI. Both set ZF, PF, CF
10100 with NAN operands. */
10101 if (ordered
== non_signaling
)
10102 ordered
= !ordered
;
10105 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
10106 _CMP_EQ_OQ/_CMP_EQ_OS. */
10107 check_unordered
= true;
10111 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
10112 _CMP_NEQ_UQ/_CMP_NEQ_US. */
10113 gcc_assert (!ordered
);
10114 check_unordered
= true;
10116 const_val
= const1_rtx
;
10119 gcc_unreachable ();
10122 target
= gen_reg_rtx (SImode
);
10123 emit_move_insn (target
, const_val
);
10124 target
= gen_rtx_SUBREG (QImode
, target
, 0);
10126 if ((optimize
&& !register_operand (op0
, mode0
))
10127 || !insn_p
->operand
[0].predicate (op0
, mode0
))
10128 op0
= copy_to_mode_reg (mode0
, op0
);
10129 if ((optimize
&& !register_operand (op1
, mode1
))
10130 || !insn_p
->operand
[1].predicate (op1
, mode1
))
10131 op1
= copy_to_mode_reg (mode1
, op1
);
10134 1. COMI: ordered and signaling.
10135 2. UCOMI: unordered and non-signaling.
10138 icode
= (icode
== CODE_FOR_sse_comi_round
10139 ? CODE_FOR_sse_ucomi_round
10140 : CODE_FOR_sse2_ucomi_round
);
10142 pat
= GEN_FCN (icode
) (op0
, op1
, op3
);
10146 /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
10147 if (INTVAL (op3
) == NO_ROUND
)
10149 pat
= ix86_erase_embedded_rounding (pat
);
10153 set_dst
= SET_DEST (pat
);
10157 gcc_assert (GET_CODE (pat
) == SET
);
10158 set_dst
= SET_DEST (pat
);
10163 rtx_code_label
*label
= NULL
;
10165 /* NB: For ordered EQ or unordered NE, check ZF alone isn't sufficient
10166 with NAN operands. */
10167 if (check_unordered
)
10169 gcc_assert (comparison
== EQ
|| comparison
== NE
);
10171 rtx flag
= gen_rtx_REG (CCFPmode
, FLAGS_REG
);
10172 label
= gen_label_rtx ();
10173 rtx tmp
= gen_rtx_fmt_ee (UNORDERED
, VOIDmode
, flag
, const0_rtx
);
10174 tmp
= gen_rtx_IF_THEN_ELSE (VOIDmode
, tmp
,
10175 gen_rtx_LABEL_REF (VOIDmode
, label
),
10177 emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp
));
10180 /* NB: Set CCFPmode and check a different CCmode which is in subset
10182 if (GET_MODE (set_dst
) != mode
)
10184 gcc_assert (mode
== CCAmode
|| mode
== CCCmode
10185 || mode
== CCOmode
|| mode
== CCPmode
10186 || mode
== CCSmode
|| mode
== CCZmode
);
10187 set_dst
= gen_rtx_REG (mode
, FLAGS_REG
);
10190 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode
, target
),
10191 gen_rtx_fmt_ee (comparison
, QImode
,
10196 emit_label (label
);
10198 return SUBREG_REG (target
);
10202 ix86_expand_round_builtin (const struct builtin_description
*d
,
10203 tree exp
, rtx target
)
10206 unsigned int i
, nargs
;
10212 enum insn_code icode
= d
->icode
;
10213 const struct insn_data_d
*insn_p
= &insn_data
[icode
];
10214 machine_mode tmode
= insn_p
->operand
[0].mode
;
10215 unsigned int nargs_constant
= 0;
10216 unsigned int redundant_embed_rnd
= 0;
10218 switch ((enum ix86_builtin_func_type
) d
->flag
)
10220 case UINT64_FTYPE_V2DF_INT
:
10221 case UINT64_FTYPE_V4SF_INT
:
10222 case UINT_FTYPE_V2DF_INT
:
10223 case UINT_FTYPE_V4SF_INT
:
10224 case INT64_FTYPE_V2DF_INT
:
10225 case INT64_FTYPE_V4SF_INT
:
10226 case INT_FTYPE_V2DF_INT
:
10227 case INT_FTYPE_V4SF_INT
:
10230 case V4SF_FTYPE_V4SF_UINT_INT
:
10231 case V4SF_FTYPE_V4SF_UINT64_INT
:
10232 case V2DF_FTYPE_V2DF_UINT64_INT
:
10233 case V4SF_FTYPE_V4SF_INT_INT
:
10234 case V4SF_FTYPE_V4SF_INT64_INT
:
10235 case V2DF_FTYPE_V2DF_INT64_INT
:
10236 case V4SF_FTYPE_V4SF_V4SF_INT
:
10237 case V2DF_FTYPE_V2DF_V2DF_INT
:
10238 case V4SF_FTYPE_V4SF_V2DF_INT
:
10239 case V2DF_FTYPE_V2DF_V4SF_INT
:
10242 case V8SF_FTYPE_V8DF_V8SF_QI_INT
:
10243 case V8DF_FTYPE_V8DF_V8DF_QI_INT
:
10244 case V8SI_FTYPE_V8DF_V8SI_QI_INT
:
10245 case V8DI_FTYPE_V8DF_V8DI_QI_INT
:
10246 case V8SF_FTYPE_V8DI_V8SF_QI_INT
:
10247 case V8DF_FTYPE_V8DI_V8DF_QI_INT
:
10248 case V16SF_FTYPE_V16SF_V16SF_HI_INT
:
10249 case V8DI_FTYPE_V8SF_V8DI_QI_INT
:
10250 case V16SF_FTYPE_V16SI_V16SF_HI_INT
:
10251 case V16SI_FTYPE_V16SF_V16SI_HI_INT
:
10252 case V8DF_FTYPE_V8SF_V8DF_QI_INT
:
10253 case V16SF_FTYPE_V16HI_V16SF_HI_INT
:
10254 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT
:
10255 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT
:
10258 case V4SF_FTYPE_V4SF_V4SF_INT_INT
:
10259 case V2DF_FTYPE_V2DF_V2DF_INT_INT
:
10260 nargs_constant
= 2;
10263 case INT_FTYPE_V4SF_V4SF_INT_INT
:
10264 case INT_FTYPE_V2DF_V2DF_INT_INT
:
10265 return ix86_expand_sse_comi_round (d
, exp
, target
);
10266 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT
:
10267 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT
:
10268 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT
:
10269 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT
:
10270 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT
:
10271 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT
:
10272 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT
:
10273 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT
:
10276 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT
:
10277 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT
:
10278 nargs_constant
= 4;
10281 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT
:
10282 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT
:
10283 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT
:
10284 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT
:
10285 nargs_constant
= 3;
10288 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT
:
10289 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT
:
10290 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT
:
10291 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT
:
10292 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT
:
10293 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT
:
10295 nargs_constant
= 4;
10297 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT
:
10298 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT
:
10299 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT
:
10300 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT
:
10302 nargs_constant
= 3;
10305 gcc_unreachable ();
10307 gcc_assert (nargs
<= ARRAY_SIZE (args
));
10311 || GET_MODE (target
) != tmode
10312 || !insn_p
->operand
[0].predicate (target
, tmode
))
10313 target
= gen_reg_rtx (tmode
);
10315 for (i
= 0; i
< nargs
; i
++)
10317 tree arg
= CALL_EXPR_ARG (exp
, i
);
10318 rtx op
= expand_normal (arg
);
10319 machine_mode mode
= insn_p
->operand
[i
+ 1].mode
;
10320 bool match
= insn_p
->operand
[i
+ 1].predicate (op
, mode
);
10322 if (i
== nargs
- nargs_constant
)
10328 case CODE_FOR_avx512f_getmantv8df_mask_round
:
10329 case CODE_FOR_avx512f_getmantv16sf_mask_round
:
10330 case CODE_FOR_avx512f_vgetmantv2df_round
:
10331 case CODE_FOR_avx512f_vgetmantv2df_mask_round
:
10332 case CODE_FOR_avx512f_vgetmantv4sf_round
:
10333 case CODE_FOR_avx512f_vgetmantv4sf_mask_round
:
10334 error ("the immediate argument must be a 4-bit immediate");
10336 case CODE_FOR_avx512f_cmpv8df3_mask_round
:
10337 case CODE_FOR_avx512f_cmpv16sf3_mask_round
:
10338 case CODE_FOR_avx512f_vmcmpv2df3_mask_round
:
10339 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round
:
10340 error ("the immediate argument must be a 5-bit immediate");
10343 error ("the immediate argument must be an 8-bit immediate");
10348 else if (i
== nargs
-1)
10350 if (!insn_p
->operand
[nargs
].predicate (op
, SImode
))
10352 error ("incorrect rounding operand");
10356 /* If there is no rounding use normal version of the pattern. */
10357 if (INTVAL (op
) == NO_ROUND
)
10358 redundant_embed_rnd
= 1;
10362 if (VECTOR_MODE_P (mode
))
10363 op
= safe_vector_operand (op
, mode
);
10365 op
= fixup_modeless_constant (op
, mode
);
10367 if (GET_MODE (op
) == mode
|| GET_MODE (op
) == VOIDmode
)
10369 if (optimize
|| !match
)
10370 op
= copy_to_mode_reg (mode
, op
);
10374 op
= copy_to_reg (op
);
10375 op
= lowpart_subreg (mode
, op
, GET_MODE (op
));
10380 args
[i
].mode
= mode
;
10386 pat
= GEN_FCN (icode
) (target
, args
[0].op
);
10389 pat
= GEN_FCN (icode
) (target
, args
[0].op
, args
[1].op
);
10392 pat
= GEN_FCN (icode
) (target
, args
[0].op
, args
[1].op
,
10396 pat
= GEN_FCN (icode
) (target
, args
[0].op
, args
[1].op
,
10397 args
[2].op
, args
[3].op
);
10400 pat
= GEN_FCN (icode
) (target
, args
[0].op
, args
[1].op
,
10401 args
[2].op
, args
[3].op
, args
[4].op
);
10404 pat
= GEN_FCN (icode
) (target
, args
[0].op
, args
[1].op
,
10405 args
[2].op
, args
[3].op
, args
[4].op
,
10409 gcc_unreachable ();
10415 if (redundant_embed_rnd
)
10416 pat
= ix86_erase_embedded_rounding (pat
);
10422 /* Subroutine of ix86_expand_builtin to take care of special insns
10423 with variable number of operands. */
10426 ix86_expand_special_args_builtin (const struct builtin_description
*d
,
10427 tree exp
, rtx target
)
10431 unsigned int i
, nargs
, arg_adjust
, memory
;
10432 bool aligned_mem
= false;
10438 enum insn_code icode
= d
->icode
;
10439 bool last_arg_constant
= false;
10440 const struct insn_data_d
*insn_p
= &insn_data
[icode
];
10441 machine_mode tmode
= insn_p
->operand
[0].mode
;
10442 enum { load
, store
} klass
;
10444 switch ((enum ix86_builtin_func_type
) d
->flag
)
10446 case VOID_FTYPE_VOID
:
10447 emit_insn (GEN_FCN (icode
) (target
));
10449 case VOID_FTYPE_UINT64
:
10450 case VOID_FTYPE_UNSIGNED
:
10456 case INT_FTYPE_VOID
:
10457 case USHORT_FTYPE_VOID
:
10458 case UINT64_FTYPE_VOID
:
10459 case UINT_FTYPE_VOID
:
10460 case UNSIGNED_FTYPE_VOID
:
10465 case UINT64_FTYPE_PUNSIGNED
:
10466 case V2DI_FTYPE_PV2DI
:
10467 case V4DI_FTYPE_PV4DI
:
10468 case V32QI_FTYPE_PCCHAR
:
10469 case V16QI_FTYPE_PCCHAR
:
10470 case V8SF_FTYPE_PCV4SF
:
10471 case V8SF_FTYPE_PCFLOAT
:
10472 case V4SF_FTYPE_PCFLOAT
:
10473 case V4DF_FTYPE_PCV2DF
:
10474 case V4DF_FTYPE_PCDOUBLE
:
10475 case V2DF_FTYPE_PCDOUBLE
:
10476 case VOID_FTYPE_PVOID
:
10477 case V8DI_FTYPE_PV8DI
:
10483 case CODE_FOR_sse4_1_movntdqa
:
10484 case CODE_FOR_avx2_movntdqa
:
10485 case CODE_FOR_avx512f_movntdqa
:
10486 aligned_mem
= true;
10492 case VOID_FTYPE_PV2SF_V4SF
:
10493 case VOID_FTYPE_PV8DI_V8DI
:
10494 case VOID_FTYPE_PV4DI_V4DI
:
10495 case VOID_FTYPE_PV2DI_V2DI
:
10496 case VOID_FTYPE_PCHAR_V32QI
:
10497 case VOID_FTYPE_PCHAR_V16QI
:
10498 case VOID_FTYPE_PFLOAT_V16SF
:
10499 case VOID_FTYPE_PFLOAT_V8SF
:
10500 case VOID_FTYPE_PFLOAT_V4SF
:
10501 case VOID_FTYPE_PDOUBLE_V8DF
:
10502 case VOID_FTYPE_PDOUBLE_V4DF
:
10503 case VOID_FTYPE_PDOUBLE_V2DF
:
10504 case VOID_FTYPE_PLONGLONG_LONGLONG
:
10505 case VOID_FTYPE_PULONGLONG_ULONGLONG
:
10506 case VOID_FTYPE_PUNSIGNED_UNSIGNED
:
10507 case VOID_FTYPE_PINT_INT
:
10510 /* Reserve memory operand for target. */
10511 memory
= ARRAY_SIZE (args
);
10514 /* These builtins and instructions require the memory
10515 to be properly aligned. */
10516 case CODE_FOR_avx_movntv4di
:
10517 case CODE_FOR_sse2_movntv2di
:
10518 case CODE_FOR_avx_movntv8sf
:
10519 case CODE_FOR_sse_movntv4sf
:
10520 case CODE_FOR_sse4a_vmmovntv4sf
:
10521 case CODE_FOR_avx_movntv4df
:
10522 case CODE_FOR_sse2_movntv2df
:
10523 case CODE_FOR_sse4a_vmmovntv2df
:
10524 case CODE_FOR_sse2_movntidi
:
10525 case CODE_FOR_sse_movntq
:
10526 case CODE_FOR_sse2_movntisi
:
10527 case CODE_FOR_avx512f_movntv16sf
:
10528 case CODE_FOR_avx512f_movntv8df
:
10529 case CODE_FOR_avx512f_movntv8di
:
10530 aligned_mem
= true;
10536 case VOID_FTYPE_PVOID_PCVOID
:
10542 case V4SF_FTYPE_V4SF_PCV2SF
:
10543 case V2DF_FTYPE_V2DF_PCDOUBLE
:
10548 case V8SF_FTYPE_PCV8SF_V8SI
:
10549 case V4DF_FTYPE_PCV4DF_V4DI
:
10550 case V4SF_FTYPE_PCV4SF_V4SI
:
10551 case V2DF_FTYPE_PCV2DF_V2DI
:
10552 case V8SI_FTYPE_PCV8SI_V8SI
:
10553 case V4DI_FTYPE_PCV4DI_V4DI
:
10554 case V4SI_FTYPE_PCV4SI_V4SI
:
10555 case V2DI_FTYPE_PCV2DI_V2DI
:
10556 case VOID_FTYPE_INT_INT64
:
10561 case VOID_FTYPE_PV8DF_V8DF_UQI
:
10562 case VOID_FTYPE_PV4DF_V4DF_UQI
:
10563 case VOID_FTYPE_PV2DF_V2DF_UQI
:
10564 case VOID_FTYPE_PV16SF_V16SF_UHI
:
10565 case VOID_FTYPE_PV8SF_V8SF_UQI
:
10566 case VOID_FTYPE_PV4SF_V4SF_UQI
:
10567 case VOID_FTYPE_PV8DI_V8DI_UQI
:
10568 case VOID_FTYPE_PV4DI_V4DI_UQI
:
10569 case VOID_FTYPE_PV2DI_V2DI_UQI
:
10570 case VOID_FTYPE_PV16SI_V16SI_UHI
:
10571 case VOID_FTYPE_PV8SI_V8SI_UQI
:
10572 case VOID_FTYPE_PV4SI_V4SI_UQI
:
10573 case VOID_FTYPE_PV64QI_V64QI_UDI
:
10574 case VOID_FTYPE_PV32HI_V32HI_USI
:
10575 case VOID_FTYPE_PV32QI_V32QI_USI
:
10576 case VOID_FTYPE_PV16QI_V16QI_UHI
:
10577 case VOID_FTYPE_PV16HI_V16HI_UHI
:
10578 case VOID_FTYPE_PV8HI_V8HI_UQI
:
10581 /* These builtins and instructions require the memory
10582 to be properly aligned. */
10583 case CODE_FOR_avx512f_storev16sf_mask
:
10584 case CODE_FOR_avx512f_storev16si_mask
:
10585 case CODE_FOR_avx512f_storev8df_mask
:
10586 case CODE_FOR_avx512f_storev8di_mask
:
10587 case CODE_FOR_avx512vl_storev8sf_mask
:
10588 case CODE_FOR_avx512vl_storev8si_mask
:
10589 case CODE_FOR_avx512vl_storev4df_mask
:
10590 case CODE_FOR_avx512vl_storev4di_mask
:
10591 case CODE_FOR_avx512vl_storev4sf_mask
:
10592 case CODE_FOR_avx512vl_storev4si_mask
:
10593 case CODE_FOR_avx512vl_storev2df_mask
:
10594 case CODE_FOR_avx512vl_storev2di_mask
:
10595 aligned_mem
= true;
10601 case VOID_FTYPE_PV8SF_V8SI_V8SF
:
10602 case VOID_FTYPE_PV4DF_V4DI_V4DF
:
10603 case VOID_FTYPE_PV4SF_V4SI_V4SF
:
10604 case VOID_FTYPE_PV2DF_V2DI_V2DF
:
10605 case VOID_FTYPE_PV8SI_V8SI_V8SI
:
10606 case VOID_FTYPE_PV4DI_V4DI_V4DI
:
10607 case VOID_FTYPE_PV4SI_V4SI_V4SI
:
10608 case VOID_FTYPE_PV2DI_V2DI_V2DI
:
10609 case VOID_FTYPE_PV8SI_V8DI_UQI
:
10610 case VOID_FTYPE_PV8HI_V8DI_UQI
:
10611 case VOID_FTYPE_PV16HI_V16SI_UHI
:
10612 case VOID_FTYPE_PV16QI_V8DI_UQI
:
10613 case VOID_FTYPE_PV16QI_V16SI_UHI
:
10614 case VOID_FTYPE_PV4SI_V4DI_UQI
:
10615 case VOID_FTYPE_PV4SI_V2DI_UQI
:
10616 case VOID_FTYPE_PV8HI_V4DI_UQI
:
10617 case VOID_FTYPE_PV8HI_V2DI_UQI
:
10618 case VOID_FTYPE_PV8HI_V8SI_UQI
:
10619 case VOID_FTYPE_PV8HI_V4SI_UQI
:
10620 case VOID_FTYPE_PV16QI_V4DI_UQI
:
10621 case VOID_FTYPE_PV16QI_V2DI_UQI
:
10622 case VOID_FTYPE_PV16QI_V8SI_UQI
:
10623 case VOID_FTYPE_PV16QI_V4SI_UQI
:
10624 case VOID_FTYPE_PCHAR_V64QI_UDI
:
10625 case VOID_FTYPE_PCHAR_V32QI_USI
:
10626 case VOID_FTYPE_PCHAR_V16QI_UHI
:
10627 case VOID_FTYPE_PSHORT_V32HI_USI
:
10628 case VOID_FTYPE_PSHORT_V16HI_UHI
:
10629 case VOID_FTYPE_PSHORT_V8HI_UQI
:
10630 case VOID_FTYPE_PINT_V16SI_UHI
:
10631 case VOID_FTYPE_PINT_V8SI_UQI
:
10632 case VOID_FTYPE_PINT_V4SI_UQI
:
10633 case VOID_FTYPE_PINT64_V8DI_UQI
:
10634 case VOID_FTYPE_PINT64_V4DI_UQI
:
10635 case VOID_FTYPE_PINT64_V2DI_UQI
:
10636 case VOID_FTYPE_PDOUBLE_V8DF_UQI
:
10637 case VOID_FTYPE_PDOUBLE_V4DF_UQI
:
10638 case VOID_FTYPE_PDOUBLE_V2DF_UQI
:
10639 case VOID_FTYPE_PFLOAT_V16SF_UHI
:
10640 case VOID_FTYPE_PFLOAT_V8SF_UQI
:
10641 case VOID_FTYPE_PFLOAT_V4SF_UQI
:
10642 case VOID_FTYPE_PV32QI_V32HI_USI
:
10643 case VOID_FTYPE_PV16QI_V16HI_UHI
:
10644 case VOID_FTYPE_PV8QI_V8HI_UQI
:
10647 /* Reserve memory operand for target. */
10648 memory
= ARRAY_SIZE (args
);
10650 case V4SF_FTYPE_PCV4SF_V4SF_UQI
:
10651 case V8SF_FTYPE_PCV8SF_V8SF_UQI
:
10652 case V16SF_FTYPE_PCV16SF_V16SF_UHI
:
10653 case V4SI_FTYPE_PCV4SI_V4SI_UQI
:
10654 case V8SI_FTYPE_PCV8SI_V8SI_UQI
:
10655 case V16SI_FTYPE_PCV16SI_V16SI_UHI
:
10656 case V2DF_FTYPE_PCV2DF_V2DF_UQI
:
10657 case V4DF_FTYPE_PCV4DF_V4DF_UQI
:
10658 case V8DF_FTYPE_PCV8DF_V8DF_UQI
:
10659 case V2DI_FTYPE_PCV2DI_V2DI_UQI
:
10660 case V4DI_FTYPE_PCV4DI_V4DI_UQI
:
10661 case V8DI_FTYPE_PCV8DI_V8DI_UQI
:
10662 case V64QI_FTYPE_PCV64QI_V64QI_UDI
:
10663 case V32HI_FTYPE_PCV32HI_V32HI_USI
:
10664 case V32QI_FTYPE_PCV32QI_V32QI_USI
:
10665 case V16QI_FTYPE_PCV16QI_V16QI_UHI
:
10666 case V16HI_FTYPE_PCV16HI_V16HI_UHI
:
10667 case V8HI_FTYPE_PCV8HI_V8HI_UQI
:
10670 /* These builtins and instructions require the memory
10671 to be properly aligned. */
10672 case CODE_FOR_avx512f_loadv16sf_mask
:
10673 case CODE_FOR_avx512f_loadv16si_mask
:
10674 case CODE_FOR_avx512f_loadv8df_mask
:
10675 case CODE_FOR_avx512f_loadv8di_mask
:
10676 case CODE_FOR_avx512vl_loadv8sf_mask
:
10677 case CODE_FOR_avx512vl_loadv8si_mask
:
10678 case CODE_FOR_avx512vl_loadv4df_mask
:
10679 case CODE_FOR_avx512vl_loadv4di_mask
:
10680 case CODE_FOR_avx512vl_loadv4sf_mask
:
10681 case CODE_FOR_avx512vl_loadv4si_mask
:
10682 case CODE_FOR_avx512vl_loadv2df_mask
:
10683 case CODE_FOR_avx512vl_loadv2di_mask
:
10684 case CODE_FOR_avx512bw_loadv64qi_mask
:
10685 case CODE_FOR_avx512vl_loadv32qi_mask
:
10686 case CODE_FOR_avx512vl_loadv16qi_mask
:
10687 case CODE_FOR_avx512bw_loadv32hi_mask
:
10688 case CODE_FOR_avx512vl_loadv16hi_mask
:
10689 case CODE_FOR_avx512vl_loadv8hi_mask
:
10690 aligned_mem
= true;
10696 case V64QI_FTYPE_PCCHAR_V64QI_UDI
:
10697 case V32QI_FTYPE_PCCHAR_V32QI_USI
:
10698 case V16QI_FTYPE_PCCHAR_V16QI_UHI
:
10699 case V32HI_FTYPE_PCSHORT_V32HI_USI
:
10700 case V16HI_FTYPE_PCSHORT_V16HI_UHI
:
10701 case V8HI_FTYPE_PCSHORT_V8HI_UQI
:
10702 case V16SI_FTYPE_PCINT_V16SI_UHI
:
10703 case V8SI_FTYPE_PCINT_V8SI_UQI
:
10704 case V4SI_FTYPE_PCINT_V4SI_UQI
:
10705 case V8DI_FTYPE_PCINT64_V8DI_UQI
:
10706 case V4DI_FTYPE_PCINT64_V4DI_UQI
:
10707 case V2DI_FTYPE_PCINT64_V2DI_UQI
:
10708 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI
:
10709 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI
:
10710 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI
:
10711 case V16SF_FTYPE_PCFLOAT_V16SF_UHI
:
10712 case V8SF_FTYPE_PCFLOAT_V8SF_UQI
:
10713 case V4SF_FTYPE_PCFLOAT_V4SF_UQI
:
10718 case VOID_FTYPE_UINT_UINT_UINT
:
10719 case VOID_FTYPE_UINT64_UINT_UINT
:
10720 case UCHAR_FTYPE_UINT_UINT_UINT
:
10721 case UCHAR_FTYPE_UINT64_UINT_UINT
:
10724 memory
= ARRAY_SIZE (args
);
10725 last_arg_constant
= true;
10728 gcc_unreachable ();
10731 gcc_assert (nargs
<= ARRAY_SIZE (args
));
10733 if (klass
== store
)
10735 arg
= CALL_EXPR_ARG (exp
, 0);
10736 op
= expand_normal (arg
);
10737 gcc_assert (target
== 0);
10740 op
= ix86_zero_extend_to_Pmode (op
);
10741 target
= gen_rtx_MEM (tmode
, op
);
10742 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
10743 on it. Try to improve it using get_pointer_alignment,
10744 and if the special builtin is one that requires strict
10745 mode alignment, also from it's GET_MODE_ALIGNMENT.
10746 Failure to do so could lead to ix86_legitimate_combined_insn
10747 rejecting all changes to such insns. */
10748 unsigned int align
= get_pointer_alignment (arg
);
10749 if (aligned_mem
&& align
< GET_MODE_ALIGNMENT (tmode
))
10750 align
= GET_MODE_ALIGNMENT (tmode
);
10751 if (MEM_ALIGN (target
) < align
)
10752 set_mem_align (target
, align
);
10755 target
= force_reg (tmode
, op
);
10763 || !register_operand (target
, tmode
)
10764 || GET_MODE (target
) != tmode
)
10765 target
= gen_reg_rtx (tmode
);
10768 for (i
= 0; i
< nargs
; i
++)
10770 machine_mode mode
= insn_p
->operand
[i
+ 1].mode
;
10773 arg
= CALL_EXPR_ARG (exp
, i
+ arg_adjust
);
10774 op
= expand_normal (arg
);
10775 match
= insn_p
->operand
[i
+ 1].predicate (op
, mode
);
10777 if (last_arg_constant
&& (i
+ 1) == nargs
)
10781 if (icode
== CODE_FOR_lwp_lwpvalsi3
10782 || icode
== CODE_FOR_lwp_lwpinssi3
10783 || icode
== CODE_FOR_lwp_lwpvaldi3
10784 || icode
== CODE_FOR_lwp_lwpinsdi3
)
10785 error ("the last argument must be a 32-bit immediate");
10787 error ("the last argument must be an 8-bit immediate");
10795 /* This must be the memory operand. */
10796 op
= ix86_zero_extend_to_Pmode (op
);
10797 op
= gen_rtx_MEM (mode
, op
);
10798 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
10799 on it. Try to improve it using get_pointer_alignment,
10800 and if the special builtin is one that requires strict
10801 mode alignment, also from it's GET_MODE_ALIGNMENT.
10802 Failure to do so could lead to ix86_legitimate_combined_insn
10803 rejecting all changes to such insns. */
10804 unsigned int align
= get_pointer_alignment (arg
);
10805 if (aligned_mem
&& align
< GET_MODE_ALIGNMENT (mode
))
10806 align
= GET_MODE_ALIGNMENT (mode
);
10807 if (MEM_ALIGN (op
) < align
)
10808 set_mem_align (op
, align
);
10812 /* This must be register. */
10813 if (VECTOR_MODE_P (mode
))
10814 op
= safe_vector_operand (op
, mode
);
10816 op
= fixup_modeless_constant (op
, mode
);
10818 if (GET_MODE (op
) == mode
|| GET_MODE (op
) == VOIDmode
)
10819 op
= copy_to_mode_reg (mode
, op
);
10822 op
= copy_to_reg (op
);
10823 op
= lowpart_subreg (mode
, op
, GET_MODE (op
));
10829 args
[i
].mode
= mode
;
10835 pat
= GEN_FCN (icode
) (target
);
10838 pat
= GEN_FCN (icode
) (target
, args
[0].op
);
10841 pat
= GEN_FCN (icode
) (target
, args
[0].op
, args
[1].op
);
10844 pat
= GEN_FCN (icode
) (target
, args
[0].op
, args
[1].op
, args
[2].op
);
10847 gcc_unreachable ();
10853 return klass
== store
? 0 : target
;
10856 /* Return the integer constant in ARG. Constrain it to be in the range
10857 of the subparts of VEC_TYPE; issue an error if not. */
10860 get_element_number (tree vec_type
, tree arg
)
10862 unsigned HOST_WIDE_INT elt
, max
= TYPE_VECTOR_SUBPARTS (vec_type
) - 1;
10864 if (!tree_fits_uhwi_p (arg
)
10865 || (elt
= tree_to_uhwi (arg
), elt
> max
))
10867 error ("selector must be an integer constant in the range "
10875 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
10876 ix86_expand_vector_init. We DO have language-level syntax for this, in
10877 the form of (type){ init-list }. Except that since we can't place emms
10878 instructions from inside the compiler, we can't allow the use of MMX
10879 registers unless the user explicitly asks for it. So we do *not* define
10880 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
10881 we have builtins invoked by mmintrin.h that gives us license to emit
10882 these sorts of instructions. */
10885 ix86_expand_vec_init_builtin (tree type
, tree exp
, rtx target
)
10887 machine_mode tmode
= TYPE_MODE (type
);
10888 machine_mode inner_mode
= GET_MODE_INNER (tmode
);
10889 int i
, n_elt
= GET_MODE_NUNITS (tmode
);
10890 rtvec v
= rtvec_alloc (n_elt
);
10892 gcc_assert (VECTOR_MODE_P (tmode
));
10893 gcc_assert (call_expr_nargs (exp
) == n_elt
);
10895 for (i
= 0; i
< n_elt
; ++i
)
10897 rtx x
= expand_normal (CALL_EXPR_ARG (exp
, i
));
10898 RTVEC_ELT (v
, i
) = gen_lowpart (inner_mode
, x
);
10901 if (!target
|| !register_operand (target
, tmode
))
10902 target
= gen_reg_rtx (tmode
);
10904 ix86_expand_vector_init (true, target
, gen_rtx_PARALLEL (tmode
, v
));
10908 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
10909 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
10910 had a language-level syntax for referencing vector elements. */
10913 ix86_expand_vec_ext_builtin (tree exp
, rtx target
)
10915 machine_mode tmode
, mode0
;
10920 arg0
= CALL_EXPR_ARG (exp
, 0);
10921 arg1
= CALL_EXPR_ARG (exp
, 1);
10923 op0
= expand_normal (arg0
);
10924 elt
= get_element_number (TREE_TYPE (arg0
), arg1
);
10926 tmode
= TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0
)));
10927 mode0
= TYPE_MODE (TREE_TYPE (arg0
));
10928 gcc_assert (VECTOR_MODE_P (mode0
));
10930 op0
= force_reg (mode0
, op0
);
10932 if (optimize
|| !target
|| !register_operand (target
, tmode
))
10933 target
= gen_reg_rtx (tmode
);
10935 ix86_expand_vector_extract (true, target
, op0
, elt
);
10940 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
10941 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
10942 a language-level syntax for referencing vector elements. */
10945 ix86_expand_vec_set_builtin (tree exp
)
10947 machine_mode tmode
, mode1
;
10948 tree arg0
, arg1
, arg2
;
10950 rtx op0
, op1
, target
;
10952 arg0
= CALL_EXPR_ARG (exp
, 0);
10953 arg1
= CALL_EXPR_ARG (exp
, 1);
10954 arg2
= CALL_EXPR_ARG (exp
, 2);
10956 tmode
= TYPE_MODE (TREE_TYPE (arg0
));
10957 mode1
= TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0
)));
10958 gcc_assert (VECTOR_MODE_P (tmode
));
10960 op0
= expand_expr (arg0
, NULL_RTX
, tmode
, EXPAND_NORMAL
);
10961 op1
= expand_expr (arg1
, NULL_RTX
, mode1
, EXPAND_NORMAL
);
10962 elt
= get_element_number (TREE_TYPE (arg0
), arg2
);
10964 if (GET_MODE (op1
) != mode1
&& GET_MODE (op1
) != VOIDmode
)
10965 op1
= convert_modes (mode1
, GET_MODE (op1
), op1
, true);
10967 op0
= force_reg (tmode
, op0
);
10968 op1
= force_reg (mode1
, op1
);
10970 /* OP0 is the source of these builtin functions and shouldn't be
10971 modified. Create a copy, use it and return it as target. */
10972 target
= gen_reg_rtx (tmode
);
10973 emit_move_insn (target
, op0
);
10974 ix86_expand_vector_set (true, target
, op1
, elt
);
10979 /* Expand an expression EXP that calls a built-in function,
10980 with result going to TARGET if that's convenient
10981 (and in mode MODE if that's convenient).
10982 SUBTARGET may be used as the target for computing one of EXP's operands.
10983 IGNORE is nonzero if the value is to be ignored. */
10986 ix86_expand_builtin (tree exp
, rtx target
, rtx subtarget
,
10987 machine_mode mode
, int ignore
)
10990 enum insn_code icode
, icode2
;
10991 tree fndecl
= TREE_OPERAND (CALL_EXPR_FN (exp
), 0);
10992 tree arg0
, arg1
, arg2
, arg3
, arg4
;
10993 rtx op0
, op1
, op2
, op3
, op4
, pat
, pat2
, insn
;
10994 machine_mode mode0
, mode1
, mode2
, mode3
, mode4
;
10995 unsigned int fcode
= DECL_FUNCTION_CODE (fndecl
);
10997 /* For CPU builtins that can be folded, fold first and expand the fold. */
11000 case IX86_BUILTIN_CPU_INIT
:
11002 /* Make it call __cpu_indicator_init in libgcc. */
11003 tree call_expr
, fndecl
, type
;
11004 type
= build_function_type_list (integer_type_node
, NULL_TREE
);
11005 fndecl
= build_fn_decl ("__cpu_indicator_init", type
);
11006 call_expr
= build_call_expr (fndecl
, 0);
11007 return expand_expr (call_expr
, target
, mode
, EXPAND_NORMAL
);
11009 case IX86_BUILTIN_CPU_IS
:
11010 case IX86_BUILTIN_CPU_SUPPORTS
:
11012 tree arg0
= CALL_EXPR_ARG (exp
, 0);
11013 tree fold_expr
= fold_builtin_cpu (fndecl
, &arg0
);
11014 gcc_assert (fold_expr
!= NULL_TREE
);
11015 return expand_expr (fold_expr
, target
, mode
, EXPAND_NORMAL
);
  HOST_WIDE_INT isa = ix86_isa_flags;
  HOST_WIDE_INT isa2 = ix86_isa_flags2;
  HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
  HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
  /* The general case is we require all the ISAs specified in bisa{,2}
     to be enabled.
     The exceptions are:
     OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
     OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
     OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
     where for each such pair it is sufficient if either of the ISAs is
     enabled, plus if it is ored with other options also those others.  */
  if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
       == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
      && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0)
    isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A);
  if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
       == (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
      && (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0)
    isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32);
  if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
       == (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
      && (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0)
    isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4);
  /* Use SSE/SSE2/SSSE3 to emulate MMX intrinsics in 64-bit mode when
     MMX is disabled.  NB: Since MMX intrinsics are marked with
     SSE/SSE2/SSSE3, enable them without SSE/SSE2/SSSE3 if MMX is
     enabled.  */
  if (TARGET_MMX || TARGET_MMX_WITH_SSE)
    {
      if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX))
	   == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX))
	  && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX)) != 0)
	isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX);
      if (((bisa & (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_MMX))
	   == (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_MMX))
	  && (isa & (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_MMX)) != 0)
	isa |= (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_MMX);
      if (((bisa & (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX))
	   == (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX))
	  && (isa & (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX)) != 0)
	isa |= (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX);
    }

  if ((bisa & isa) != bisa || (bisa2 & isa2) != bisa2)
    {
      bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT;
      if (TARGET_ABI_X32)
	bisa |= OPTION_MASK_ABI_X32;
      else
	bisa |= OPTION_MASK_ABI_64;
      char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
				       (enum fpmath_unit) 0, false, add_abi_p);
      if (!opts)
	error ("%qE needs unknown isa option", fndecl);
      else
	{
	  gcc_assert (opts != NULL);
	  error ("%qE needs isa option %s", fndecl, opts);
	  free (opts);
	}
      return expand_call (exp, target, ignore);
    }
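
  /* For illustration: a builtin whose descriptor carries
     OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 (presumably the
     crc32 builtins) is accepted under either -msse4.2 or -mcrc32 thanks
     to the pairing above, whereas a builtin marked with SSE4.2 alone
     still hard-errors without -msse4.2.  */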
  switch (fcode)
    {
    case IX86_BUILTIN_MASKMOVQ:
    case IX86_BUILTIN_MASKMOVDQU:
      icode = (fcode == IX86_BUILTIN_MASKMOVQ
	       ? CODE_FOR_mmx_maskmovq
	       : CODE_FOR_sse2_maskmovdqu);
      /* Note the arg order is different from the operand order.  */
      arg1 = CALL_EXPR_ARG (exp, 0);
      arg2 = CALL_EXPR_ARG (exp, 1);
      arg0 = CALL_EXPR_ARG (exp, 2);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      mode0 = insn_data[icode].operand[0].mode;
      mode1 = insn_data[icode].operand[1].mode;
      mode2 = insn_data[icode].operand[2].mode;

      op0 = ix86_zero_extend_to_Pmode (op0);
      op0 = gen_rtx_MEM (mode1, op0);

      if (!insn_data[icode].operand[0].predicate (op0, mode0))
	op0 = copy_to_mode_reg (mode0, op0);
      if (!insn_data[icode].operand[1].predicate (op1, mode1))
	op1 = copy_to_mode_reg (mode1, op1);
      if (!insn_data[icode].operand[2].predicate (op2, mode2))
	op2 = copy_to_mode_reg (mode2, op2);
      pat = GEN_FCN (icode) (op0, op1, op2);
      if (!pat)
	return 0;
      emit_insn (pat);
      return 0;
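
      /* For illustration: _mm_maskmoveu_si128 (d, m, p) arrives here as
	 __builtin_ia32_maskmovdqu (d, m, p), i.e. (data, mask, address),
	 while the insn pattern orders its operands (mem, data, mask);
	 that mismatch is what the arg0/arg1/arg2 shuffle above
	 resolves.  */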
    case IX86_BUILTIN_LDMXCSR:
      op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
      target = assign_386_stack_local (SImode, SLOT_TEMP);
      emit_move_insn (target, op0);
      emit_insn (gen_sse_ldmxcsr (target));
      return 0;

    case IX86_BUILTIN_STMXCSR:
      target = assign_386_stack_local (SImode, SLOT_TEMP);
      emit_insn (gen_sse_stmxcsr (target));
      return copy_to_mode_reg (SImode, target);

    case IX86_BUILTIN_CLFLUSH:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      icode = CODE_FOR_sse2_clflush;
      if (!insn_data[icode].operand[0].predicate (op0, Pmode))
	op0 = ix86_zero_extend_to_Pmode (op0);

      emit_insn (gen_sse2_clflush (op0));
      return 0;

    case IX86_BUILTIN_CLWB:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      icode = CODE_FOR_clwb;
      if (!insn_data[icode].operand[0].predicate (op0, Pmode))
	op0 = ix86_zero_extend_to_Pmode (op0);

      emit_insn (gen_clwb (op0));
      return 0;

    case IX86_BUILTIN_CLFLUSHOPT:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      icode = CODE_FOR_clflushopt;
      if (!insn_data[icode].operand[0].predicate (op0, Pmode))
	op0 = ix86_zero_extend_to_Pmode (op0);

      emit_insn (gen_clflushopt (op0));
      return 0;
    case IX86_BUILTIN_MONITOR:
    case IX86_BUILTIN_MONITORX:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      arg2 = CALL_EXPR_ARG (exp, 2);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      if (!REG_P (op0))
	op0 = ix86_zero_extend_to_Pmode (op0);
      if (!REG_P (op1))
	op1 = copy_to_mode_reg (SImode, op1);
      if (!REG_P (op2))
	op2 = copy_to_mode_reg (SImode, op2);

      emit_insn (fcode == IX86_BUILTIN_MONITOR
		 ? gen_sse3_monitor (Pmode, op0, op1, op2)
		 : gen_monitorx (Pmode, op0, op1, op2));
      return 0;

    case IX86_BUILTIN_MWAIT:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      if (!REG_P (op0))
	op0 = copy_to_mode_reg (SImode, op0);
      if (!REG_P (op1))
	op1 = copy_to_mode_reg (SImode, op1);
      emit_insn (gen_sse3_mwait (op0, op1));
      return 0;

    case IX86_BUILTIN_MWAITX:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      arg2 = CALL_EXPR_ARG (exp, 2);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      if (!REG_P (op0))
	op0 = copy_to_mode_reg (SImode, op0);
      if (!REG_P (op1))
	op1 = copy_to_mode_reg (SImode, op1);
      if (!REG_P (op2))
	op2 = copy_to_mode_reg (SImode, op2);
      emit_insn (gen_mwaitx (op0, op1, op2));
      return 0;
    case IX86_BUILTIN_UMONITOR:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);

      op0 = ix86_zero_extend_to_Pmode (op0);
      emit_insn (gen_umonitor (Pmode, op0));
      return 0;

    case IX86_BUILTIN_UMWAIT:
    case IX86_BUILTIN_TPAUSE:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);

      if (!REG_P (op0))
	op0 = copy_to_mode_reg (SImode, op0);

      op1 = force_reg (DImode, op1);

      if (TARGET_64BIT)
	{
	  op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
				     NULL, 1, OPTAB_DIRECT);
	  switch (fcode)
	    {
	    case IX86_BUILTIN_UMWAIT:
	      icode = CODE_FOR_umwait_rex64;
	      break;
	    case IX86_BUILTIN_TPAUSE:
	      icode = CODE_FOR_tpause_rex64;
	      break;
	    default:
	      gcc_unreachable ();
	    }

	  op2 = gen_lowpart (SImode, op2);
	  op1 = gen_lowpart (SImode, op1);
	  pat = GEN_FCN (icode) (op0, op1, op2);
	}
      else
	{
	  switch (fcode)
	    {
	    case IX86_BUILTIN_UMWAIT:
	      icode = CODE_FOR_umwait;
	      break;
	    case IX86_BUILTIN_TPAUSE:
	      icode = CODE_FOR_tpause;
	      break;
	    default:
	      gcc_unreachable ();
	    }
	  pat = GEN_FCN (icode) (op0, op1);
	}

      if (!pat)
	return 0;

      emit_insn (pat);

      if (target == 0
	  || !register_operand (target, QImode))
	target = gen_reg_rtx (QImode);

      pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
			const0_rtx);
      emit_insn (gen_rtx_SET (target, pat));
      return target;
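
      /* For illustration: umwait/tpause report in the carry flag how the
	 wait ended (e.g. the OS time limit expiring rather than the
	 monitored event); the comparison against the CCCmode flags
	 register above is what turns that carry bit into the builtin's
	 QImode return value.  */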
    case IX86_BUILTIN_CLZERO:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      if (!REG_P (op0))
	op0 = ix86_zero_extend_to_Pmode (op0);
      emit_insn (gen_clzero (Pmode, op0));
      return 0;

    case IX86_BUILTIN_CLDEMOTE:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      icode = CODE_FOR_cldemote;
      if (!insn_data[icode].operand[0].predicate (op0, Pmode))
	op0 = ix86_zero_extend_to_Pmode (op0);

      emit_insn (gen_cldemote (op0));
      return 0;

    case IX86_BUILTIN_VEC_INIT_V2SI:
    case IX86_BUILTIN_VEC_INIT_V4HI:
    case IX86_BUILTIN_VEC_INIT_V8QI:
      return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);

    case IX86_BUILTIN_VEC_EXT_V2DF:
    case IX86_BUILTIN_VEC_EXT_V2DI:
    case IX86_BUILTIN_VEC_EXT_V4SF:
    case IX86_BUILTIN_VEC_EXT_V4SI:
    case IX86_BUILTIN_VEC_EXT_V8HI:
    case IX86_BUILTIN_VEC_EXT_V2SI:
    case IX86_BUILTIN_VEC_EXT_V4HI:
    case IX86_BUILTIN_VEC_EXT_V16QI:
      return ix86_expand_vec_ext_builtin (exp, target);

    case IX86_BUILTIN_VEC_SET_V2DI:
    case IX86_BUILTIN_VEC_SET_V4SF:
    case IX86_BUILTIN_VEC_SET_V4SI:
    case IX86_BUILTIN_VEC_SET_V8HI:
    case IX86_BUILTIN_VEC_SET_V4HI:
    case IX86_BUILTIN_VEC_SET_V16QI:
      return ix86_expand_vec_set_builtin (exp);

    case IX86_BUILTIN_NANQ:
    case IX86_BUILTIN_NANSQ:
      return expand_call (exp, target, ignore);
    case IX86_BUILTIN_RDPID:

      op0 = gen_reg_rtx (word_mode);

      if (TARGET_64BIT)
	{
	  insn = gen_rdpid_rex64 (op0);
	  op0 = convert_to_mode (SImode, op0, 1);
	}
      else
	insn = gen_rdpid (op0);

      emit_insn (insn);

      if (target == 0
	  || !register_operand (target, SImode))
	target = gen_reg_rtx (SImode);

      emit_move_insn (target, op0);
      return target;
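
      /* For illustration: rdpid writes a full word-mode register (%rax
	 rather than %eax in 64-bit mode), while the builtin's result is
	 32 bits; that is why the 64-bit path converts OP0 down to SImode
	 before the final move into TARGET.  */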
    case IX86_BUILTIN_2INTERSECTD512:
    case IX86_BUILTIN_2INTERSECTQ512:
    case IX86_BUILTIN_2INTERSECTD256:
    case IX86_BUILTIN_2INTERSECTQ256:
    case IX86_BUILTIN_2INTERSECTD128:
    case IX86_BUILTIN_2INTERSECTQ128:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      arg2 = CALL_EXPR_ARG (exp, 2);
      arg3 = CALL_EXPR_ARG (exp, 3);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      op3 = expand_normal (arg3);

      if (!address_operand (op0, VOIDmode))
	{
	  op0 = convert_memory_address (Pmode, op0);
	  op0 = copy_addr_to_reg (op0);
	}
      if (!address_operand (op1, VOIDmode))
	{
	  op1 = convert_memory_address (Pmode, op1);
	  op1 = copy_addr_to_reg (op1);
	}

      switch (fcode)
	{
	case IX86_BUILTIN_2INTERSECTD512:
	  mode4 = P2HImode;
	  icode = CODE_FOR_avx512vp2intersect_2intersectv16si;
	  break;
	case IX86_BUILTIN_2INTERSECTQ512:
	  mode4 = P2QImode;
	  icode = CODE_FOR_avx512vp2intersect_2intersectv8di;
	  break;
	case IX86_BUILTIN_2INTERSECTD256:
	  mode4 = P2QImode;
	  icode = CODE_FOR_avx512vp2intersect_2intersectv8si;
	  break;
	case IX86_BUILTIN_2INTERSECTQ256:
	  mode4 = P2QImode;
	  icode = CODE_FOR_avx512vp2intersect_2intersectv4di;
	  break;
	case IX86_BUILTIN_2INTERSECTD128:
	  mode4 = P2QImode;
	  icode = CODE_FOR_avx512vp2intersect_2intersectv4si;
	  break;
	case IX86_BUILTIN_2INTERSECTQ128:
	  mode4 = P2QImode;
	  icode = CODE_FOR_avx512vp2intersect_2intersectv2di;
	  break;
	default:
	  gcc_unreachable ();
	}

      mode2 = insn_data[icode].operand[1].mode;
      mode3 = insn_data[icode].operand[2].mode;
      if (!insn_data[icode].operand[1].predicate (op2, mode2))
	op2 = copy_to_mode_reg (mode2, op2);
      if (!insn_data[icode].operand[2].predicate (op3, mode3))
	op3 = copy_to_mode_reg (mode3, op3);

      op4 = gen_reg_rtx (mode4);
      emit_insn (GEN_FCN (icode) (op4, op2, op3));
      mode0 = mode4 == P2HImode ? HImode : QImode;
      emit_move_insn (gen_rtx_MEM (mode0, op0),
		      gen_lowpart (mode0, op4));
      emit_move_insn (gen_rtx_MEM (mode0, op1),
		      gen_highpart (mode0, op4));

      return 0;
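
      /* For illustration: vp2intersect{d,q} writes an aligned *pair* of
	 mask registers at once; the pair is modelled as one P2HI/P2QI
	 pseudo (OP4), and the gen_lowpart/gen_highpart stores above are
	 what split it back into the two user-visible mask outputs.  */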
    case IX86_BUILTIN_RDPMC:
    case IX86_BUILTIN_RDTSC:
    case IX86_BUILTIN_RDTSCP:
    case IX86_BUILTIN_XGETBV:

      op0 = gen_reg_rtx (DImode);
      op1 = gen_reg_rtx (DImode);

      if (fcode == IX86_BUILTIN_RDPMC)
	{
	  arg0 = CALL_EXPR_ARG (exp, 0);
	  op2 = expand_normal (arg0);
	  if (!register_operand (op2, SImode))
	    op2 = copy_to_mode_reg (SImode, op2);

	  insn = (TARGET_64BIT
		  ? gen_rdpmc_rex64 (op0, op1, op2)
		  : gen_rdpmc (op0, op2));
	  emit_insn (insn);
	}
      else if (fcode == IX86_BUILTIN_XGETBV)
	{
	  arg0 = CALL_EXPR_ARG (exp, 0);
	  op2 = expand_normal (arg0);
	  if (!register_operand (op2, SImode))
	    op2 = copy_to_mode_reg (SImode, op2);

	  insn = (TARGET_64BIT
		  ? gen_xgetbv_rex64 (op0, op1, op2)
		  : gen_xgetbv (op0, op2));
	  emit_insn (insn);
	}
      else if (fcode == IX86_BUILTIN_RDTSC)
	{
	  insn = (TARGET_64BIT
		  ? gen_rdtsc_rex64 (op0, op1)
		  : gen_rdtsc (op0));
	  emit_insn (insn);
	}
      else
	{
	  op2 = gen_reg_rtx (SImode);

	  insn = (TARGET_64BIT
		  ? gen_rdtscp_rex64 (op0, op1, op2)
		  : gen_rdtscp (op0, op2));
	  emit_insn (insn);

	  arg0 = CALL_EXPR_ARG (exp, 0);
	  op4 = expand_normal (arg0);
	  if (!address_operand (op4, VOIDmode))
	    {
	      op4 = convert_memory_address (Pmode, op4);
	      op4 = copy_addr_to_reg (op4);
	    }
	  emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
	}

      if (target == 0
	  || !register_operand (target, DImode))
	target = gen_reg_rtx (DImode);

      if (TARGET_64BIT)
	{
	  op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
				     op1, 1, OPTAB_DIRECT);
	  op0 = expand_simple_binop (DImode, IOR, op0, op1,
				     op0, 1, OPTAB_DIRECT);
	}

      emit_move_insn (target, op0);
      return target;
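
      /* For illustration: in 64-bit mode rdtsc and friends deliver the
	 result as an edx:eax pair (low half in OP0, high half in OP1);
	 the ASHIFT/IOR pair above reassembles the full DImode value as
	 (OP1 << 32) | OP0 instead of relying on a wider pattern.  */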
    case IX86_BUILTIN_ENQCMD:
    case IX86_BUILTIN_ENQCMDS:
    case IX86_BUILTIN_MOVDIR64B:

      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);

      op0 = ix86_zero_extend_to_Pmode (op0);
      if (!address_operand (op1, VOIDmode))
	{
	  op1 = convert_memory_address (Pmode, op1);
	  op1 = copy_addr_to_reg (op1);
	}
      op1 = gen_rtx_MEM (XImode, op1);

      if (fcode == IX86_BUILTIN_MOVDIR64B)
	{
	  emit_insn (gen_movdir64b (Pmode, op0, op1));
	  return 0;
	}
      else
	{
	  target = gen_reg_rtx (SImode);
	  emit_move_insn (target, const0_rtx);
	  target = gen_rtx_SUBREG (QImode, target, 0);

	  if (fcode == IX86_BUILTIN_ENQCMD)
	    pat = gen_enqcmd (UNSPECV_ENQCMD, Pmode, op0, op1);
	  else
	    pat = gen_enqcmd (UNSPECV_ENQCMDS, Pmode, op0, op1);

	  emit_insn (pat);

	  emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
				  gen_rtx_fmt_ee (EQ, QImode,
						  SET_DEST (pat),
						  const0_rtx)));

	  return SUBREG_REG (target);
	}
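
      /* For illustration: enqcmd/enqcmds report whether the enqueue was
	 accepted via a flag bit, and the builtin returns that status as
	 an int; the STRICT_LOW_PART store of the flag comparison into
	 the QImode low byte of the zero-initialized SImode TARGET
	 materializes a zero-extended 0/1 result without a branch.  */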
    case IX86_BUILTIN_FXSAVE:
    case IX86_BUILTIN_FXRSTOR:
    case IX86_BUILTIN_FXSAVE64:
    case IX86_BUILTIN_FXRSTOR64:
    case IX86_BUILTIN_FNSTENV:
    case IX86_BUILTIN_FLDENV:
      mode0 = BLKmode;
      switch (fcode)
	{
	case IX86_BUILTIN_FXSAVE:
	  icode = CODE_FOR_fxsave;
	  break;
	case IX86_BUILTIN_FXRSTOR:
	  icode = CODE_FOR_fxrstor;
	  break;
	case IX86_BUILTIN_FXSAVE64:
	  icode = CODE_FOR_fxsave64;
	  break;
	case IX86_BUILTIN_FXRSTOR64:
	  icode = CODE_FOR_fxrstor64;
	  break;
	case IX86_BUILTIN_FNSTENV:
	  icode = CODE_FOR_fnstenv;
	  break;
	case IX86_BUILTIN_FLDENV:
	  icode = CODE_FOR_fldenv;
	  break;
	default:
	  gcc_unreachable ();
	}

      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);

      if (!address_operand (op0, VOIDmode))
	{
	  op0 = convert_memory_address (Pmode, op0);
	  op0 = copy_addr_to_reg (op0);
	}
      op0 = gen_rtx_MEM (mode0, op0);

      pat = GEN_FCN (icode) (op0);
      if (pat)
	emit_insn (pat);
      return 0;

    case IX86_BUILTIN_XSETBV:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);

      if (!REG_P (op0))
	op0 = copy_to_mode_reg (SImode, op0);

      op1 = force_reg (DImode, op1);

      if (TARGET_64BIT)
	{
	  op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
				     NULL, 1, OPTAB_DIRECT);

	  icode = CODE_FOR_xsetbv_rex64;

	  op2 = gen_lowpart (SImode, op2);
	  op1 = gen_lowpart (SImode, op1);
	  pat = GEN_FCN (icode) (op0, op1, op2);
	}
      else
	{
	  icode = CODE_FOR_xsetbv;

	  pat = GEN_FCN (icode) (op0, op1);
	}
      if (pat)
	emit_insn (pat);
      return 0;
    case IX86_BUILTIN_XSAVE:
    case IX86_BUILTIN_XRSTOR:
    case IX86_BUILTIN_XSAVE64:
    case IX86_BUILTIN_XRSTOR64:
    case IX86_BUILTIN_XSAVEOPT:
    case IX86_BUILTIN_XSAVEOPT64:
    case IX86_BUILTIN_XSAVES:
    case IX86_BUILTIN_XRSTORS:
    case IX86_BUILTIN_XSAVES64:
    case IX86_BUILTIN_XRSTORS64:
    case IX86_BUILTIN_XSAVEC:
    case IX86_BUILTIN_XSAVEC64:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);

      if (!address_operand (op0, VOIDmode))
	{
	  op0 = convert_memory_address (Pmode, op0);
	  op0 = copy_addr_to_reg (op0);
	}
      op0 = gen_rtx_MEM (BLKmode, op0);

      op1 = force_reg (DImode, op1);

      if (TARGET_64BIT)
	{
	  op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
				     NULL, 1, OPTAB_DIRECT);
	  switch (fcode)
	    {
	    case IX86_BUILTIN_XSAVE:
	      icode = CODE_FOR_xsave_rex64;
	      break;
	    case IX86_BUILTIN_XRSTOR:
	      icode = CODE_FOR_xrstor_rex64;
	      break;
	    case IX86_BUILTIN_XSAVE64:
	      icode = CODE_FOR_xsave64;
	      break;
	    case IX86_BUILTIN_XRSTOR64:
	      icode = CODE_FOR_xrstor64;
	      break;
	    case IX86_BUILTIN_XSAVEOPT:
	      icode = CODE_FOR_xsaveopt_rex64;
	      break;
	    case IX86_BUILTIN_XSAVEOPT64:
	      icode = CODE_FOR_xsaveopt64;
	      break;
	    case IX86_BUILTIN_XSAVES:
	      icode = CODE_FOR_xsaves_rex64;
	      break;
	    case IX86_BUILTIN_XRSTORS:
	      icode = CODE_FOR_xrstors_rex64;
	      break;
	    case IX86_BUILTIN_XSAVES64:
	      icode = CODE_FOR_xsaves64;
	      break;
	    case IX86_BUILTIN_XRSTORS64:
	      icode = CODE_FOR_xrstors64;
	      break;
	    case IX86_BUILTIN_XSAVEC:
	      icode = CODE_FOR_xsavec_rex64;
	      break;
	    case IX86_BUILTIN_XSAVEC64:
	      icode = CODE_FOR_xsavec64;
	      break;
	    default:
	      gcc_unreachable ();
	    }

	  op2 = gen_lowpart (SImode, op2);
	  op1 = gen_lowpart (SImode, op1);
	  pat = GEN_FCN (icode) (op0, op1, op2);
	}
      else
	{
	  switch (fcode)
	    {
	    case IX86_BUILTIN_XSAVE:
	      icode = CODE_FOR_xsave;
	      break;
	    case IX86_BUILTIN_XRSTOR:
	      icode = CODE_FOR_xrstor;
	      break;
	    case IX86_BUILTIN_XSAVEOPT:
	      icode = CODE_FOR_xsaveopt;
	      break;
	    case IX86_BUILTIN_XSAVES:
	      icode = CODE_FOR_xsaves;
	      break;
	    case IX86_BUILTIN_XRSTORS:
	      icode = CODE_FOR_xrstors;
	      break;
	    case IX86_BUILTIN_XSAVEC:
	      icode = CODE_FOR_xsavec;
	      break;
	    default:
	      gcc_unreachable ();
	    }
	  pat = GEN_FCN (icode) (op0, op1);
	}

      if (pat)
	emit_insn (pat);
      return 0;
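
      /* For illustration: the xsave-family patterns take the requested
	 feature bitmap in edx:eax, so on 64-bit targets the DImode mask
	 is split by the 32-bit LSHIFTRT above into high (OP2) and low
	 (OP1) SImode halves for the *_rex64 patterns, while the 32-bit
	 patterns consume the DImode value as a register pair
	 directly.  */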
    case IX86_BUILTIN_LLWPCB:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      icode = CODE_FOR_lwp_llwpcb;
      if (!insn_data[icode].operand[0].predicate (op0, Pmode))
	op0 = ix86_zero_extend_to_Pmode (op0);
      emit_insn (gen_lwp_llwpcb (op0));
      return 0;

    case IX86_BUILTIN_SLWPCB:
      icode = CODE_FOR_lwp_slwpcb;
      if (!target
	  || !insn_data[icode].operand[0].predicate (target, Pmode))
	target = gen_reg_rtx (Pmode);
      emit_insn (gen_lwp_slwpcb (target));
      return target;
    case IX86_BUILTIN_BEXTRI32:
    case IX86_BUILTIN_BEXTRI64:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      icode = (fcode == IX86_BUILTIN_BEXTRI32
	       ? CODE_FOR_tbm_bextri_si
	       : CODE_FOR_tbm_bextri_di);
      if (!CONST_INT_P (op1))
	{
	  error ("last argument must be an immediate");
	  return const0_rtx;
	}
      else
	{
	  unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
	  unsigned char lsb_index = INTVAL (op1) & 0xFF;
	  op1 = GEN_INT (length);
	  op2 = GEN_INT (lsb_index);

	  mode1 = insn_data[icode].operand[1].mode;
	  if (!insn_data[icode].operand[1].predicate (op0, mode1))
	    op0 = copy_to_mode_reg (mode1, op0);

	  mode0 = insn_data[icode].operand[0].mode;
	  if (target == 0
	      || !register_operand (target, mode0))
	    target = gen_reg_rtx (mode0);

	  pat = GEN_FCN (icode) (target, op0, op1, op2);
	  if (pat)
	    emit_insn (pat);
	  return target;
	}
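
      /* For illustration: the bextri immediate packs the field's
	 starting bit in bits 7:0 and its length in bits 15:8, so
	 __builtin_ia32_bextri_u32 (x, 0x0804) extracts 8 bits starting
	 at bit 4; the decode above turns the packed immediate into the
	 two separate operands the insn pattern expects.  */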
    case IX86_BUILTIN_RDRAND16_STEP:
      icode = CODE_FOR_rdrandhi_1;
      mode0 = HImode;
      goto rdrand_step;

    case IX86_BUILTIN_RDRAND32_STEP:
      icode = CODE_FOR_rdrandsi_1;
      mode0 = SImode;
      goto rdrand_step;

    case IX86_BUILTIN_RDRAND64_STEP:
      icode = CODE_FOR_rdranddi_1;
      mode0 = DImode;

    rdrand_step:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op1 = expand_normal (arg0);
      if (!address_operand (op1, VOIDmode))
	{
	  op1 = convert_memory_address (Pmode, op1);
	  op1 = copy_addr_to_reg (op1);
	}

      op0 = gen_reg_rtx (mode0);
      emit_insn (GEN_FCN (icode) (op0));

      emit_move_insn (gen_rtx_MEM (mode0, op1), op0);

      op1 = gen_reg_rtx (SImode);
      emit_move_insn (op1, CONST1_RTX (SImode));

      /* Emit SImode conditional move.  */
      if (mode0 == HImode)
	{
	  if (TARGET_ZERO_EXTEND_WITH_AND
	      && optimize_function_for_speed_p (cfun))
	    {
	      op2 = force_reg (SImode, const0_rtx);

	      emit_insn (gen_movstricthi
			 (gen_lowpart (HImode, op2), op0));
	    }
	  else
	    {
	      op2 = gen_reg_rtx (SImode);

	      emit_insn (gen_zero_extendhisi2 (op2, op0));
	    }
	}
      else if (mode0 == SImode)
	op2 = op0;
      else
	op2 = gen_rtx_SUBREG (SImode, op0, 0);

      if (target == 0
	  || !register_operand (target, SImode))
	target = gen_reg_rtx (SImode);

      pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
			 const0_rtx);
      emit_insn (gen_rtx_SET (target,
			      gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
      return target;
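
      /* For illustration: on failure rdrand clears CF and the hardware
	 also zeroes the destination register, so the conditional move
	 above (GEU on CCCmode selecting OP2 on failure, the constant 1
	 in OP1 on success) yields the documented 1/0 status without a
	 branch.  */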
    case IX86_BUILTIN_RDSEED16_STEP:
      icode = CODE_FOR_rdseedhi_1;
      mode0 = HImode;
      goto rdseed_step;

    case IX86_BUILTIN_RDSEED32_STEP:
      icode = CODE_FOR_rdseedsi_1;
      mode0 = SImode;
      goto rdseed_step;

    case IX86_BUILTIN_RDSEED64_STEP:
      icode = CODE_FOR_rdseeddi_1;
      mode0 = DImode;

    rdseed_step:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op1 = expand_normal (arg0);
      if (!address_operand (op1, VOIDmode))
	{
	  op1 = convert_memory_address (Pmode, op1);
	  op1 = copy_addr_to_reg (op1);
	}

      op0 = gen_reg_rtx (mode0);
      emit_insn (GEN_FCN (icode) (op0));

      emit_move_insn (gen_rtx_MEM (mode0, op1), op0);

      op2 = gen_reg_rtx (QImode);

      pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
			 const0_rtx);
      emit_insn (gen_rtx_SET (op2, pat));

      if (target == 0
	  || !register_operand (target, SImode))
	target = gen_reg_rtx (SImode);

      emit_insn (gen_zero_extendqisi2 (target, op2));
      return target;
    case IX86_BUILTIN_SBB32:
      icode = CODE_FOR_subborrowsi;
      icode2 = CODE_FOR_subborrowsi_0;
      mode0 = SImode;
      mode1 = DImode;
      mode2 = CCmode;
      goto handlecarry;

    case IX86_BUILTIN_SBB64:
      icode = CODE_FOR_subborrowdi;
      icode2 = CODE_FOR_subborrowdi_0;
      mode0 = DImode;
      mode1 = TImode;
      mode2 = CCmode;
      goto handlecarry;

    case IX86_BUILTIN_ADDCARRYX32:
      icode = CODE_FOR_addcarrysi;
      icode2 = CODE_FOR_addcarrysi_0;
      mode0 = SImode;
      mode1 = DImode;
      mode2 = CCCmode;
      goto handlecarry;

    case IX86_BUILTIN_ADDCARRYX64:
      icode = CODE_FOR_addcarrydi;
      icode2 = CODE_FOR_addcarrydi_0;
      mode0 = DImode;
      mode1 = TImode;
      mode2 = CCCmode;

    handlecarry:
      arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in.  */
      arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1.  */
      arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2.  */
      arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out.  */

      op1 = expand_normal (arg0);
      if (!integer_zerop (arg0))
	op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));

      op2 = expand_normal (arg1);
      if (!register_operand (op2, mode0))
	op2 = copy_to_mode_reg (mode0, op2);

      op3 = expand_normal (arg2);
      if (!register_operand (op3, mode0))
	op3 = copy_to_mode_reg (mode0, op3);

      op4 = expand_normal (arg3);
      if (!address_operand (op4, VOIDmode))
	{
	  op4 = convert_memory_address (Pmode, op4);
	  op4 = copy_addr_to_reg (op4);
	}

      op0 = gen_reg_rtx (mode0);
      if (integer_zerop (arg0))
	{
	  /* If arg0 is 0, optimize right away into add or sub
	     instruction that sets CCCmode flags.  */
	  op1 = gen_rtx_REG (mode2, FLAGS_REG);
	  emit_insn (GEN_FCN (icode2) (op0, op2, op3));
	}
      else
	{
	  /* Generate CF from input operand.  */
	  emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));

	  /* Generate instruction that consumes CF.  */
	  op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
	  pat = gen_rtx_LTU (mode1, op1, const0_rtx);
	  pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
	  emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
	}

      /* Return current CF value.  */
      if (target == 0)
	target = gen_reg_rtx (QImode);

      pat = gen_rtx_LTU (QImode, op1, const0_rtx);
      emit_insn (gen_rtx_SET (target, pat));

      /* Store the result.  */
      emit_move_insn (gen_rtx_MEM (mode0, op4), op0);

      return target;
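
      /* For illustration, a typical use of this path (intrinsic
	 spelling per GCC's adxintrin.h):

	   unsigned int sum;
	   unsigned char c = __builtin_ia32_addcarryx_u32 (0, a, b, &sum);

	 Because the carry-in is the literal 0, integer_zerop (arg0)
	 holds and only the plain flags-setting add (ICODE2) is emitted;
	 a non-constant carry-in is first turned back into CF via
	 addqi3_cconly_overflow and then consumed by ICODE.  */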
    case IX86_BUILTIN_READ_FLAGS:
      emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));

      if (optimize
	  || target == NULL_RTX
	  || !nonimmediate_operand (target, word_mode)
	  || GET_MODE (target) != word_mode)
	target = gen_reg_rtx (word_mode);

      emit_insn (gen_pop (target));
      return target;

    case IX86_BUILTIN_WRITE_FLAGS:

      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      if (!general_no_elim_operand (op0, word_mode))
	op0 = copy_to_mode_reg (word_mode, op0);

      emit_insn (gen_push (op0));
      emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
      return 0;
    case IX86_BUILTIN_KTESTC8:
      icode = CODE_FOR_ktestqi;
      mode3 = CCCmode;
      goto kortest;

    case IX86_BUILTIN_KTESTZ8:
      icode = CODE_FOR_ktestqi;
      mode3 = CCZmode;
      goto kortest;

    case IX86_BUILTIN_KTESTC16:
      icode = CODE_FOR_ktesthi;
      mode3 = CCCmode;
      goto kortest;

    case IX86_BUILTIN_KTESTZ16:
      icode = CODE_FOR_ktesthi;
      mode3 = CCZmode;
      goto kortest;

    case IX86_BUILTIN_KTESTC32:
      icode = CODE_FOR_ktestsi;
      mode3 = CCCmode;
      goto kortest;

    case IX86_BUILTIN_KTESTZ32:
      icode = CODE_FOR_ktestsi;
      mode3 = CCZmode;
      goto kortest;

    case IX86_BUILTIN_KTESTC64:
      icode = CODE_FOR_ktestdi;
      mode3 = CCCmode;
      goto kortest;

    case IX86_BUILTIN_KTESTZ64:
      icode = CODE_FOR_ktestdi;
      mode3 = CCZmode;
      goto kortest;

    case IX86_BUILTIN_KORTESTC8:
      icode = CODE_FOR_kortestqi;
      mode3 = CCCmode;
      goto kortest;

    case IX86_BUILTIN_KORTESTZ8:
      icode = CODE_FOR_kortestqi;
      mode3 = CCZmode;
      goto kortest;

    case IX86_BUILTIN_KORTESTC16:
      icode = CODE_FOR_kortesthi;
      mode3 = CCCmode;
      goto kortest;

    case IX86_BUILTIN_KORTESTZ16:
      icode = CODE_FOR_kortesthi;
      mode3 = CCZmode;
      goto kortest;

    case IX86_BUILTIN_KORTESTC32:
      icode = CODE_FOR_kortestsi;
      mode3 = CCCmode;
      goto kortest;

    case IX86_BUILTIN_KORTESTZ32:
      icode = CODE_FOR_kortestsi;
      mode3 = CCZmode;
      goto kortest;

    case IX86_BUILTIN_KORTESTC64:
      icode = CODE_FOR_kortestdi;
      mode3 = CCCmode;
      goto kortest;

    case IX86_BUILTIN_KORTESTZ64:
      icode = CODE_FOR_kortestdi;
      mode3 = CCZmode;

    kortest:
      arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1.  */
      arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2.  */
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);

      mode0 = insn_data[icode].operand[0].mode;
      mode1 = insn_data[icode].operand[1].mode;

      if (GET_MODE (op0) != VOIDmode)
	op0 = force_reg (GET_MODE (op0), op0);

      op0 = gen_lowpart (mode0, op0);

      if (!insn_data[icode].operand[0].predicate (op0, mode0))
	op0 = copy_to_mode_reg (mode0, op0);

      if (GET_MODE (op1) != VOIDmode)
	op1 = force_reg (GET_MODE (op1), op1);

      op1 = gen_lowpart (mode1, op1);

      if (!insn_data[icode].operand[1].predicate (op1, mode1))
	op1 = copy_to_mode_reg (mode1, op1);

      target = gen_reg_rtx (QImode);

      /* Emit kortest.  */
      emit_insn (GEN_FCN (icode) (op0, op1));
      /* And use setcc to return result from flags.  */
      ix86_expand_setcc (target, EQ,
			 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
      return target;
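
      /* For illustration: the Z variants set mode3 to CCZmode and the C
	 variants to CCCmode, so a single ktest/kortest insn feeds either
	 the "result is all zeros" (ZF) or "result is all ones" (CF)
	 answer through the same setcc; e.g. __builtin_ia32_kortestzhi,
	 used by _mm512_kortestz, is the ZF flavor.  */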
    case IX86_BUILTIN_GATHERSIV2DF:
      icode = CODE_FOR_avx2_gathersiv2df;
      goto gather_gen;
    case IX86_BUILTIN_GATHERSIV4DF:
      icode = CODE_FOR_avx2_gathersiv4df;
      goto gather_gen;
    case IX86_BUILTIN_GATHERDIV2DF:
      icode = CODE_FOR_avx2_gatherdiv2df;
      goto gather_gen;
    case IX86_BUILTIN_GATHERDIV4DF:
      icode = CODE_FOR_avx2_gatherdiv4df;
      goto gather_gen;
    case IX86_BUILTIN_GATHERSIV4SF:
      icode = CODE_FOR_avx2_gathersiv4sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHERSIV8SF:
      icode = CODE_FOR_avx2_gathersiv8sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHERDIV4SF:
      icode = CODE_FOR_avx2_gatherdiv4sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHERDIV8SF:
      icode = CODE_FOR_avx2_gatherdiv8sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHERSIV2DI:
      icode = CODE_FOR_avx2_gathersiv2di;
      goto gather_gen;
    case IX86_BUILTIN_GATHERSIV4DI:
      icode = CODE_FOR_avx2_gathersiv4di;
      goto gather_gen;
    case IX86_BUILTIN_GATHERDIV2DI:
      icode = CODE_FOR_avx2_gatherdiv2di;
      goto gather_gen;
    case IX86_BUILTIN_GATHERDIV4DI:
      icode = CODE_FOR_avx2_gatherdiv4di;
      goto gather_gen;
    case IX86_BUILTIN_GATHERSIV4SI:
      icode = CODE_FOR_avx2_gathersiv4si;
      goto gather_gen;
    case IX86_BUILTIN_GATHERSIV8SI:
      icode = CODE_FOR_avx2_gathersiv8si;
      goto gather_gen;
    case IX86_BUILTIN_GATHERDIV4SI:
      icode = CODE_FOR_avx2_gatherdiv4si;
      goto gather_gen;
    case IX86_BUILTIN_GATHERDIV8SI:
      icode = CODE_FOR_avx2_gatherdiv8si;
      goto gather_gen;
    case IX86_BUILTIN_GATHERALTSIV4DF:
      icode = CODE_FOR_avx2_gathersiv4df;
      goto gather_gen;
    case IX86_BUILTIN_GATHERALTDIV8SF:
      icode = CODE_FOR_avx2_gatherdiv8sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHERALTSIV4DI:
      icode = CODE_FOR_avx2_gathersiv4di;
      goto gather_gen;
    case IX86_BUILTIN_GATHERALTDIV8SI:
      icode = CODE_FOR_avx2_gatherdiv8si;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV16SF:
      icode = CODE_FOR_avx512f_gathersiv16sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV8DF:
      icode = CODE_FOR_avx512f_gathersiv8df;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV16SF:
      icode = CODE_FOR_avx512f_gatherdiv16sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV8DF:
      icode = CODE_FOR_avx512f_gatherdiv8df;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV16SI:
      icode = CODE_FOR_avx512f_gathersiv16si;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV8DI:
      icode = CODE_FOR_avx512f_gathersiv8di;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV16SI:
      icode = CODE_FOR_avx512f_gatherdiv16si;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV8DI:
      icode = CODE_FOR_avx512f_gatherdiv8di;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3ALTSIV8DF:
      icode = CODE_FOR_avx512f_gathersiv8df;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3ALTDIV16SF:
      icode = CODE_FOR_avx512f_gatherdiv16sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3ALTSIV8DI:
      icode = CODE_FOR_avx512f_gathersiv8di;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3ALTDIV16SI:
      icode = CODE_FOR_avx512f_gatherdiv16si;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV2DF:
      icode = CODE_FOR_avx512vl_gathersiv2df;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV4DF:
      icode = CODE_FOR_avx512vl_gathersiv4df;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV2DF:
      icode = CODE_FOR_avx512vl_gatherdiv2df;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV4DF:
      icode = CODE_FOR_avx512vl_gatherdiv4df;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV4SF:
      icode = CODE_FOR_avx512vl_gathersiv4sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV8SF:
      icode = CODE_FOR_avx512vl_gathersiv8sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV4SF:
      icode = CODE_FOR_avx512vl_gatherdiv4sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV8SF:
      icode = CODE_FOR_avx512vl_gatherdiv8sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV2DI:
      icode = CODE_FOR_avx512vl_gathersiv2di;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV4DI:
      icode = CODE_FOR_avx512vl_gathersiv4di;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV2DI:
      icode = CODE_FOR_avx512vl_gatherdiv2di;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV4DI:
      icode = CODE_FOR_avx512vl_gatherdiv4di;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV4SI:
      icode = CODE_FOR_avx512vl_gathersiv4si;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV8SI:
      icode = CODE_FOR_avx512vl_gathersiv8si;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV4SI:
      icode = CODE_FOR_avx512vl_gatherdiv4si;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV8SI:
      icode = CODE_FOR_avx512vl_gatherdiv8si;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3ALTSIV4DF:
      icode = CODE_FOR_avx512vl_gathersiv4df;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3ALTDIV8SF:
      icode = CODE_FOR_avx512vl_gatherdiv8sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3ALTSIV4DI:
      icode = CODE_FOR_avx512vl_gathersiv4di;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3ALTDIV8SI:
      icode = CODE_FOR_avx512vl_gatherdiv8si;
      goto gather_gen;
    case IX86_BUILTIN_SCATTERSIV16SF:
      icode = CODE_FOR_avx512f_scattersiv16sf;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV8DF:
      icode = CODE_FOR_avx512f_scattersiv8df;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV16SF:
      icode = CODE_FOR_avx512f_scatterdiv16sf;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV8DF:
      icode = CODE_FOR_avx512f_scatterdiv8df;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV16SI:
      icode = CODE_FOR_avx512f_scattersiv16si;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV8DI:
      icode = CODE_FOR_avx512f_scattersiv8di;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV16SI:
      icode = CODE_FOR_avx512f_scatterdiv16si;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV8DI:
      icode = CODE_FOR_avx512f_scatterdiv8di;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV8SF:
      icode = CODE_FOR_avx512vl_scattersiv8sf;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV4SF:
      icode = CODE_FOR_avx512vl_scattersiv4sf;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV4DF:
      icode = CODE_FOR_avx512vl_scattersiv4df;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV2DF:
      icode = CODE_FOR_avx512vl_scattersiv2df;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV8SF:
      icode = CODE_FOR_avx512vl_scatterdiv8sf;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV4SF:
      icode = CODE_FOR_avx512vl_scatterdiv4sf;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV4DF:
      icode = CODE_FOR_avx512vl_scatterdiv4df;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV2DF:
      icode = CODE_FOR_avx512vl_scatterdiv2df;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV8SI:
      icode = CODE_FOR_avx512vl_scattersiv8si;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV4SI:
      icode = CODE_FOR_avx512vl_scattersiv4si;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV4DI:
      icode = CODE_FOR_avx512vl_scattersiv4di;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV2DI:
      icode = CODE_FOR_avx512vl_scattersiv2di;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV8SI:
      icode = CODE_FOR_avx512vl_scatterdiv8si;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV4SI:
      icode = CODE_FOR_avx512vl_scatterdiv4si;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV4DI:
      icode = CODE_FOR_avx512vl_scatterdiv4di;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV2DI:
      icode = CODE_FOR_avx512vl_scatterdiv2di;
      goto scatter_gen;
    case IX86_BUILTIN_GATHERPFDPD:
      icode = CODE_FOR_avx512pf_gatherpfv8sidf;
      goto vec_prefetch_gen;
    case IX86_BUILTIN_SCATTERALTSIV8DF:
      icode = CODE_FOR_avx512f_scattersiv8df;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTDIV16SF:
      icode = CODE_FOR_avx512f_scatterdiv16sf;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTSIV8DI:
      icode = CODE_FOR_avx512f_scattersiv8di;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTDIV16SI:
      icode = CODE_FOR_avx512f_scatterdiv16si;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTSIV4DF:
      icode = CODE_FOR_avx512vl_scattersiv4df;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTDIV8SF:
      icode = CODE_FOR_avx512vl_scatterdiv8sf;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTSIV4DI:
      icode = CODE_FOR_avx512vl_scattersiv4di;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTDIV8SI:
      icode = CODE_FOR_avx512vl_scatterdiv8si;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTSIV2DF:
      icode = CODE_FOR_avx512vl_scattersiv2df;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTDIV4SF:
      icode = CODE_FOR_avx512vl_scatterdiv4sf;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTSIV2DI:
      icode = CODE_FOR_avx512vl_scattersiv2di;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTDIV4SI:
      icode = CODE_FOR_avx512vl_scatterdiv4si;
      goto scatter_gen;
    case IX86_BUILTIN_GATHERPFDPS:
      icode = CODE_FOR_avx512pf_gatherpfv16sisf;
      goto vec_prefetch_gen;
    case IX86_BUILTIN_GATHERPFQPD:
      icode = CODE_FOR_avx512pf_gatherpfv8didf;
      goto vec_prefetch_gen;
    case IX86_BUILTIN_GATHERPFQPS:
      icode = CODE_FOR_avx512pf_gatherpfv8disf;
      goto vec_prefetch_gen;
    case IX86_BUILTIN_SCATTERPFDPD:
      icode = CODE_FOR_avx512pf_scatterpfv8sidf;
      goto vec_prefetch_gen;
    case IX86_BUILTIN_SCATTERPFDPS:
      icode = CODE_FOR_avx512pf_scatterpfv16sisf;
      goto vec_prefetch_gen;
    case IX86_BUILTIN_SCATTERPFQPD:
      icode = CODE_FOR_avx512pf_scatterpfv8didf;
      goto vec_prefetch_gen;
    case IX86_BUILTIN_SCATTERPFQPS:
      icode = CODE_FOR_avx512pf_scatterpfv8disf;
      goto vec_prefetch_gen;
    gather_gen:
      rtx half;
      rtx (*gen) (rtx, rtx);

      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      arg2 = CALL_EXPR_ARG (exp, 2);
      arg3 = CALL_EXPR_ARG (exp, 3);
      arg4 = CALL_EXPR_ARG (exp, 4);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      op3 = expand_normal (arg3);
      op4 = expand_normal (arg4);
      /* Note the arg order is different from the operand order.  */
      mode0 = insn_data[icode].operand[1].mode;
      mode2 = insn_data[icode].operand[3].mode;
      mode3 = insn_data[icode].operand[4].mode;
      mode4 = insn_data[icode].operand[5].mode;

      if (target == NULL_RTX
	  || GET_MODE (target) != insn_data[icode].operand[0].mode
	  || !insn_data[icode].operand[0].predicate (target,
						     GET_MODE (target)))
	subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
      else
	subtarget = target;

      switch (fcode)
	{
	case IX86_BUILTIN_GATHER3ALTSIV8DF:
	case IX86_BUILTIN_GATHER3ALTSIV8DI:
	  half = gen_reg_rtx (V8SImode);
	  if (!nonimmediate_operand (op2, V16SImode))
	    op2 = copy_to_mode_reg (V16SImode, op2);
	  emit_insn (gen_vec_extract_lo_v16si (half, op2));
	  op2 = half;
	  break;
	case IX86_BUILTIN_GATHER3ALTSIV4DF:
	case IX86_BUILTIN_GATHER3ALTSIV4DI:
	case IX86_BUILTIN_GATHERALTSIV4DF:
	case IX86_BUILTIN_GATHERALTSIV4DI:
	  half = gen_reg_rtx (V4SImode);
	  if (!nonimmediate_operand (op2, V8SImode))
	    op2 = copy_to_mode_reg (V8SImode, op2);
	  emit_insn (gen_vec_extract_lo_v8si (half, op2));
	  op2 = half;
	  break;
	case IX86_BUILTIN_GATHER3ALTDIV16SF:
	case IX86_BUILTIN_GATHER3ALTDIV16SI:
	  half = gen_reg_rtx (mode0);
	  if (mode0 == V8SFmode)
	    gen = gen_vec_extract_lo_v16sf;
	  else
	    gen = gen_vec_extract_lo_v16si;
	  if (!nonimmediate_operand (op0, GET_MODE (op0)))
	    op0 = copy_to_mode_reg (GET_MODE (op0), op0);
	  emit_insn (gen (half, op0));
	  op0 = half;
	  op3 = lowpart_subreg (QImode, op3, HImode);
	  break;
	case IX86_BUILTIN_GATHER3ALTDIV8SF:
	case IX86_BUILTIN_GATHER3ALTDIV8SI:
	case IX86_BUILTIN_GATHERALTDIV8SF:
	case IX86_BUILTIN_GATHERALTDIV8SI:
	  half = gen_reg_rtx (mode0);
	  if (mode0 == V4SFmode)
	    gen = gen_vec_extract_lo_v8sf;
	  else
	    gen = gen_vec_extract_lo_v8si;
	  if (!nonimmediate_operand (op0, GET_MODE (op0)))
	    op0 = copy_to_mode_reg (GET_MODE (op0), op0);
	  emit_insn (gen (half, op0));
	  op0 = half;
	  if (VECTOR_MODE_P (GET_MODE (op3)))
	    {
	      half = gen_reg_rtx (mode0);
	      if (!nonimmediate_operand (op3, GET_MODE (op3)))
		op3 = copy_to_mode_reg (GET_MODE (op3), op3);
	      emit_insn (gen (half, op3));
	      op3 = half;
	    }
	  break;
	default:
	  break;
	}
      /* Force memory operand only with base register here.  But we
	 don't want to do it on memory operand for other builtin
	 functions.  */
      op1 = ix86_zero_extend_to_Pmode (op1);

      if (!insn_data[icode].operand[1].predicate (op0, mode0))
	op0 = copy_to_mode_reg (mode0, op0);
      if (!insn_data[icode].operand[2].predicate (op1, Pmode))
	op1 = copy_to_mode_reg (Pmode, op1);
      if (!insn_data[icode].operand[3].predicate (op2, mode2))
	op2 = copy_to_mode_reg (mode2, op2);

      op3 = fixup_modeless_constant (op3, mode3);

      if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
	{
	  if (!insn_data[icode].operand[4].predicate (op3, mode3))
	    op3 = copy_to_mode_reg (mode3, op3);
	}
      else
	{
	  op3 = copy_to_reg (op3);
	  op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
	}
      if (!insn_data[icode].operand[5].predicate (op4, mode4))
	{
	  error ("the last argument must be scale 1, 2, 4, 8");
	  return const0_rtx;
	}

      /* Optimize.  If mask is known to have all high bits set,
	 replace op0 with pc_rtx to signal that the instruction
	 overwrites the whole destination and doesn't use its
	 previous contents.  */
      if (optimize)
	{
	  if (TREE_CODE (arg3) == INTEGER_CST)
	    {
	      if (integer_all_onesp (arg3))
		op0 = pc_rtx;
	    }
	  else if (TREE_CODE (arg3) == VECTOR_CST)
	    {
	      unsigned int negative = 0;
	      for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
		{
		  tree cst = VECTOR_CST_ELT (arg3, i);
		  if (TREE_CODE (cst) == INTEGER_CST
		      && tree_int_cst_sign_bit (cst))
		    negative++;
		  else if (TREE_CODE (cst) == REAL_CST
			   && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
		    negative++;
		}
	      if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
		op0 = pc_rtx;
	    }
	  else if (TREE_CODE (arg3) == SSA_NAME
		   && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
	    {
	      /* Recognize also when mask is like:
		 __v2df src = _mm_setzero_pd ();
		 __v2df mask = _mm_cmpeq_pd (src, src);
		 or
		 __v8sf src = _mm256_setzero_ps ();
		 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
		 as that is a cheaper way to load all ones into
		 a register than having to load a constant from
		 memory.  */
	      gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
	      if (is_gimple_call (def_stmt))
		{
		  tree fndecl = gimple_call_fndecl (def_stmt);
		  if (fndecl
		      && fndecl_built_in_p (fndecl, BUILT_IN_MD))
		    switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
		      {
		      case IX86_BUILTIN_CMPPD:
		      case IX86_BUILTIN_CMPPS:
		      case IX86_BUILTIN_CMPPD256:
		      case IX86_BUILTIN_CMPPS256:
			if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
			  break;
			/* FALLTHRU */
		      case IX86_BUILTIN_CMPEQPD:
		      case IX86_BUILTIN_CMPEQPS:
			if (initializer_zerop (gimple_call_arg (def_stmt, 0))
			    && initializer_zerop (gimple_call_arg (def_stmt,
								   1)))
			  op0 = pc_rtx;
			break;
		      default:
			break;
		      }
		}
	    }
	}
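
      /* For illustration, the mask idiom recognized above:

	   __v2df src  = _mm_setzero_pd ();
	   __v2df mask = _mm_cmpeq_pd (src, src);   // all ones

	 lets OP0 become pc_rtx, so the gather is emitted as a full
	 overwrite of the destination instead of a merge with its
	 previous contents.  */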
      pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
      if (! pat)
	return const0_rtx;
      emit_insn (pat);

      switch (fcode)
	{
	case IX86_BUILTIN_GATHER3DIV16SF:
	  if (target == NULL_RTX)
	    target = gen_reg_rtx (V8SFmode);
	  emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
	  break;
	case IX86_BUILTIN_GATHER3DIV16SI:
	  if (target == NULL_RTX)
	    target = gen_reg_rtx (V8SImode);
	  emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
	  break;
	case IX86_BUILTIN_GATHER3DIV8SF:
	case IX86_BUILTIN_GATHERDIV8SF:
	  if (target == NULL_RTX)
	    target = gen_reg_rtx (V4SFmode);
	  emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
	  break;
	case IX86_BUILTIN_GATHER3DIV8SI:
	case IX86_BUILTIN_GATHERDIV8SI:
	  if (target == NULL_RTX)
	    target = gen_reg_rtx (V4SImode);
	  emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
	  break;
	default:
	  target = subtarget;
	  break;
	}
      return target;
    scatter_gen:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      arg2 = CALL_EXPR_ARG (exp, 2);
      arg3 = CALL_EXPR_ARG (exp, 3);
      arg4 = CALL_EXPR_ARG (exp, 4);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      op3 = expand_normal (arg3);
      op4 = expand_normal (arg4);
      mode1 = insn_data[icode].operand[1].mode;
      mode2 = insn_data[icode].operand[2].mode;
      mode3 = insn_data[icode].operand[3].mode;
      mode4 = insn_data[icode].operand[4].mode;

      /* Scatter instruction stores operand op3 to memory with
	 indices from op2 and scale from op4 under writemask op1.
	 If index operand op2 has more elements than source operand
	 op3, one needs to use only its low half.  And vice versa.  */
      switch (fcode)
	{
	case IX86_BUILTIN_SCATTERALTSIV8DF:
	case IX86_BUILTIN_SCATTERALTSIV8DI:
	  half = gen_reg_rtx (V8SImode);
	  if (!nonimmediate_operand (op2, V16SImode))
	    op2 = copy_to_mode_reg (V16SImode, op2);
	  emit_insn (gen_vec_extract_lo_v16si (half, op2));
	  op2 = half;
	  break;
	case IX86_BUILTIN_SCATTERALTDIV16SF:
	case IX86_BUILTIN_SCATTERALTDIV16SI:
	  half = gen_reg_rtx (mode3);
	  if (mode3 == V8SFmode)
	    gen = gen_vec_extract_lo_v16sf;
	  else
	    gen = gen_vec_extract_lo_v16si;
	  if (!nonimmediate_operand (op3, GET_MODE (op3)))
	    op3 = copy_to_mode_reg (GET_MODE (op3), op3);
	  emit_insn (gen (half, op3));
	  op3 = half;
	  break;
	case IX86_BUILTIN_SCATTERALTSIV4DF:
	case IX86_BUILTIN_SCATTERALTSIV4DI:
	  half = gen_reg_rtx (V4SImode);
	  if (!nonimmediate_operand (op2, V8SImode))
	    op2 = copy_to_mode_reg (V8SImode, op2);
	  emit_insn (gen_vec_extract_lo_v8si (half, op2));
	  op2 = half;
	  break;
	case IX86_BUILTIN_SCATTERALTDIV8SF:
	case IX86_BUILTIN_SCATTERALTDIV8SI:
	  half = gen_reg_rtx (mode3);
	  if (mode3 == V4SFmode)
	    gen = gen_vec_extract_lo_v8sf;
	  else
	    gen = gen_vec_extract_lo_v8si;
	  if (!nonimmediate_operand (op3, GET_MODE (op3)))
	    op3 = copy_to_mode_reg (GET_MODE (op3), op3);
	  emit_insn (gen (half, op3));
	  op3 = half;
	  break;
	case IX86_BUILTIN_SCATTERALTSIV2DF:
	case IX86_BUILTIN_SCATTERALTSIV2DI:
	  if (!nonimmediate_operand (op2, V4SImode))
	    op2 = copy_to_mode_reg (V4SImode, op2);
	  break;
	case IX86_BUILTIN_SCATTERALTDIV4SF:
	case IX86_BUILTIN_SCATTERALTDIV4SI:
	  if (!nonimmediate_operand (op3, GET_MODE (op3)))
	    op3 = copy_to_mode_reg (GET_MODE (op3), op3);
	  break;
	default:
	  break;
	}

      /* Force memory operand only with base register here.  But we
	 don't want to do it on memory operand for other builtin
	 functions.  */
      op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));

      if (!insn_data[icode].operand[0].predicate (op0, Pmode))
	op0 = copy_to_mode_reg (Pmode, op0);

      op1 = fixup_modeless_constant (op1, mode1);

      if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
	{
	  if (!insn_data[icode].operand[1].predicate (op1, mode1))
	    op1 = copy_to_mode_reg (mode1, op1);
	}
      else
	{
	  op1 = copy_to_reg (op1);
	  op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
	}

      if (!insn_data[icode].operand[2].predicate (op2, mode2))
	op2 = copy_to_mode_reg (mode2, op2);

      if (!insn_data[icode].operand[3].predicate (op3, mode3))
	op3 = copy_to_mode_reg (mode3, op3);

      if (!insn_data[icode].operand[4].predicate (op4, mode4))
	{
	  error ("the last argument must be scale 1, 2, 4, 8");
	  return const0_rtx;
	}

      pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
      if (! pat)
	return const0_rtx;

      emit_insn (pat);
      return 0;
    vec_prefetch_gen:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      arg2 = CALL_EXPR_ARG (exp, 2);
      arg3 = CALL_EXPR_ARG (exp, 3);
      arg4 = CALL_EXPR_ARG (exp, 4);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      op3 = expand_normal (arg3);
      op4 = expand_normal (arg4);
      mode0 = insn_data[icode].operand[0].mode;
      mode1 = insn_data[icode].operand[1].mode;
      mode3 = insn_data[icode].operand[3].mode;
      mode4 = insn_data[icode].operand[4].mode;

      op0 = fixup_modeless_constant (op0, mode0);

      if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
	{
	  if (!insn_data[icode].operand[0].predicate (op0, mode0))
	    op0 = copy_to_mode_reg (mode0, op0);
	}
      else
	{
	  op0 = copy_to_reg (op0);
	  op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
	}

      if (!insn_data[icode].operand[1].predicate (op1, mode1))
	op1 = copy_to_mode_reg (mode1, op1);

      /* Force memory operand only with base register here.  But we
	 don't want to do it on memory operand for other builtin
	 functions.  */
      op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));

      if (!insn_data[icode].operand[2].predicate (op2, Pmode))
	op2 = copy_to_mode_reg (Pmode, op2);

      if (!insn_data[icode].operand[3].predicate (op3, mode3))
	{
	  error ("the fourth argument must be scale 1, 2, 4, 8");
	  return const0_rtx;
	}

      if (!insn_data[icode].operand[4].predicate (op4, mode4))
	{
	  error ("incorrect hint operand");
	  return const0_rtx;
	}

      pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
      if (! pat)
	return const0_rtx;

      emit_insn (pat);

      return 0;
    case IX86_BUILTIN_XABORT:
      icode = CODE_FOR_xabort;
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      mode0 = insn_data[icode].operand[0].mode;
      if (!insn_data[icode].operand[0].predicate (op0, mode0))
	{
	  error ("the argument to %<xabort%> intrinsic must "
		 "be an 8-bit immediate");
	  return const0_rtx;
	}
      emit_insn (gen_xabort (op0));
      return 0;
    case IX86_BUILTIN_RSTORSSP:
    case IX86_BUILTIN_CLRSSBSY:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      icode = (fcode == IX86_BUILTIN_RSTORSSP
	       ? CODE_FOR_rstorssp
	       : CODE_FOR_clrssbsy);
      if (!address_operand (op0, VOIDmode))
	{
	  op1 = convert_memory_address (Pmode, op0);
	  op0 = copy_addr_to_reg (op1);
	}
      emit_insn (GEN_FCN (icode) (gen_rtx_MEM (Pmode, op0)));
      return 0;
    case IX86_BUILTIN_WRSSD:
    case IX86_BUILTIN_WRSSQ:
    case IX86_BUILTIN_WRUSSD:
    case IX86_BUILTIN_WRUSSQ:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      op1 = expand_normal (arg1);
      switch (fcode)
	{
	case IX86_BUILTIN_WRSSD:
	  icode = CODE_FOR_wrsssi;
	  mode = SImode;
	  break;
	case IX86_BUILTIN_WRSSQ:
	  icode = CODE_FOR_wrssdi;
	  mode = DImode;
	  break;
	case IX86_BUILTIN_WRUSSD:
	  icode = CODE_FOR_wrusssi;
	  mode = SImode;
	  break;
	case IX86_BUILTIN_WRUSSQ:
	  icode = CODE_FOR_wrussdi;
	  mode = DImode;
	  break;
	default:
	  gcc_unreachable ();
	}
      op0 = force_reg (mode, op0);
      if (!address_operand (op1, VOIDmode))
	{
	  op2 = convert_memory_address (Pmode, op1);
	  op1 = copy_addr_to_reg (op2);
	}
      emit_insn (GEN_FCN (icode) (op0, gen_rtx_MEM (mode, op1)));
      return 0;

    default:
      break;
    }
  if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
      && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
    {
      i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
      return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
					       target);
    }

  if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
      && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
    {
      i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
      rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
      rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
      rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
      int masked = 1;
      machine_mode mode, wide_mode, nar_mode;

      nar_mode  = V4SFmode;
      mode      = V16SFmode;
      wide_mode = V64SFmode;
      fcn_mask  = gen_avx5124fmaddps_4fmaddps_mask;
      fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;

      switch (fcode)
	{
	case IX86_BUILTIN_4FMAPS:
	  fcn = gen_avx5124fmaddps_4fmaddps;
	  masked = 0;
	  goto v4fma_expand;

	case IX86_BUILTIN_4DPWSSD:
	  nar_mode  = V4SImode;
	  mode      = V16SImode;
	  wide_mode = V64SImode;
	  fcn = gen_avx5124vnniw_vp4dpwssd;
	  masked = 0;
	  goto v4fma_expand;

	case IX86_BUILTIN_4DPWSSDS:
	  nar_mode  = V4SImode;
	  mode      = V16SImode;
	  wide_mode = V64SImode;
	  fcn = gen_avx5124vnniw_vp4dpwssds;
	  masked = 0;
	  goto v4fma_expand;

	case IX86_BUILTIN_4FNMAPS:
	  fcn = gen_avx5124fmaddps_4fnmaddps;
	  masked = 0;
	  goto v4fma_expand;

	case IX86_BUILTIN_4FNMAPS_MASK:
	  fcn_mask  = gen_avx5124fmaddps_4fnmaddps_mask;
	  fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
	  goto v4fma_expand;

	case IX86_BUILTIN_4DPWSSD_MASK:
	  nar_mode  = V4SImode;
	  mode      = V16SImode;
	  wide_mode = V64SImode;
	  fcn_mask  = gen_avx5124vnniw_vp4dpwssd_mask;
	  fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
	  goto v4fma_expand;

	case IX86_BUILTIN_4DPWSSDS_MASK:
	  nar_mode  = V4SImode;
	  mode      = V16SImode;
	  wide_mode = V64SImode;
	  fcn_mask  = gen_avx5124vnniw_vp4dpwssds_mask;
	  fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
	  goto v4fma_expand;

	case IX86_BUILTIN_4FMAPS_MASK:
	  {
	    tree args[4];
	    rtx ops[4];
	    rtx wide_reg;
	    rtx accum;
	    rtx addr;
	    rtx mem;

	  v4fma_expand:
	    wide_reg = gen_reg_rtx (wide_mode);
	    for (i = 0; i < 4; i++)
	      {
		args[i] = CALL_EXPR_ARG (exp, i);
		ops[i] = expand_normal (args[i]);

		emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
				ops[i]);
	      }

	    accum = expand_normal (CALL_EXPR_ARG (exp, 4));
	    accum = force_reg (mode, accum);

	    addr = expand_normal (CALL_EXPR_ARG (exp, 5));
	    addr = force_reg (Pmode, addr);

	    mem = gen_rtx_MEM (nar_mode, addr);

	    target = gen_reg_rtx (mode);

	    emit_move_insn (target, accum);

	    if (! masked)
	      emit_insn (fcn (target, accum, wide_reg, mem));
	    else
	      {
		rtx merge, mask;
		merge = expand_normal (CALL_EXPR_ARG (exp, 6));

		mask = expand_normal (CALL_EXPR_ARG (exp, 7));

		if (CONST_INT_P (mask))
		  mask = fixup_modeless_constant (mask, HImode);

		mask = force_reg (HImode, mask);

		if (GET_MODE (mask) != HImode)
		  mask = gen_rtx_SUBREG (HImode, mask, 0);

		/* If merge is 0 then we're about to emit z-masked variant.  */
		if (const0_operand (merge, mode))
		  emit_insn (fcn_maskz (target, accum, wide_reg, mem,
					merge, mask));
		/* If merge is the same as accum then emit merge-masked variant.  */
		else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
		  {
		    merge = force_reg (mode, merge);
		    emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
		  }
		/* Merge with something unknown might happen if we z-mask
		   w/ -O0.  */
		else
		  {
		    target = gen_reg_rtx (mode);
		    emit_move_insn (target, merge);
		    emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
		  }
	      }
	    return target;
	  }
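
	    /* For illustration: the 4fmaddps/4dpwssd patterns read four
	       512-bit sources that must sit in consecutive vector
	       registers; funneling all four operands through the single
	       V64SF/V64SI pseudo WIDE_REG via SUBREGs at offsets i * 64
	       is how that register-group constraint is conveyed to
	       register allocation.  */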
	case IX86_BUILTIN_4FNMASS:
	  fcn = gen_avx5124fmaddps_4fnmaddss;
	  masked = 0;
	  goto s4fma_expand;

	case IX86_BUILTIN_4FMASS:
	  fcn = gen_avx5124fmaddps_4fmaddss;
	  masked = 0;
	  goto s4fma_expand;

	case IX86_BUILTIN_4FNMASS_MASK:
	  fcn_mask  = gen_avx5124fmaddps_4fnmaddss_mask;
	  fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
	  goto s4fma_expand;

	case IX86_BUILTIN_4FMASS_MASK:
	  {
	    tree args[4];
	    rtx ops[4];
	    rtx wide_reg;
	    rtx accum;
	    rtx addr;
	    rtx mem;

	    fcn_mask  = gen_avx5124fmaddps_4fmaddss_mask;
	    fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;

	  s4fma_expand:
	    mode = V4SFmode;
	    wide_reg = gen_reg_rtx (V64SFmode);
	    for (i = 0; i < 4; i++)
	      {
		rtx tmp;
		args[i] = CALL_EXPR_ARG (exp, i);
		ops[i] = expand_normal (args[i]);

		tmp = gen_reg_rtx (SFmode);
		emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));

		emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
				gen_rtx_SUBREG (V16SFmode, tmp, 0));
	      }

	    accum = expand_normal (CALL_EXPR_ARG (exp, 4));
	    accum = force_reg (V4SFmode, accum);

	    addr = expand_normal (CALL_EXPR_ARG (exp, 5));
	    addr = force_reg (Pmode, addr);

	    mem = gen_rtx_MEM (V4SFmode, addr);

	    target = gen_reg_rtx (V4SFmode);

	    emit_move_insn (target, accum);

	    if (! masked)
	      emit_insn (fcn (target, accum, wide_reg, mem));
	    else
	      {
		rtx merge, mask;
		merge = expand_normal (CALL_EXPR_ARG (exp, 6));

		mask = expand_normal (CALL_EXPR_ARG (exp, 7));

		if (CONST_INT_P (mask))
		  mask = fixup_modeless_constant (mask, QImode);

		mask = force_reg (QImode, mask);

		if (GET_MODE (mask) != QImode)
		  mask = gen_rtx_SUBREG (QImode, mask, 0);

		/* If merge is 0 then we're about to emit z-masked variant.  */
		if (const0_operand (merge, mode))
		  emit_insn (fcn_maskz (target, accum, wide_reg, mem,
					merge, mask));
		/* If merge is the same as accum then emit merge-masked
		   variant.  */
		else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
		  {
		    merge = force_reg (mode, merge);
		    emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
		  }
		/* Merge with something unknown might happen if we z-mask
		   w/ -O0.  */
		else
		  {
		    target = gen_reg_rtx (mode);
		    emit_move_insn (target, merge);
		    emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
		  }
	      }
	    return target;
	  }
	case IX86_BUILTIN_RDPID:
	  return ix86_expand_special_args_builtin (bdesc_args + i, exp,
						   target);
	case IX86_BUILTIN_FABSQ:
	case IX86_BUILTIN_COPYSIGNQ:
	  if (!TARGET_SSE)
	    /* Emit a normal call if SSE isn't available.  */
	    return expand_call (exp, target, ignore);
	  /* FALLTHRU */
	default:
	  return ix86_expand_args_builtin (bdesc_args + i, exp, target);
	}
    }

  if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
      && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
    {
      i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
      return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
    }

  if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
      && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
    {
      i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
      return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
    }

  if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
      && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
    {
      i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
      return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
    }

  if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
      && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
    {
      i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
      return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
    }

  if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
      && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
    {
      i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
      const struct builtin_description *d = bdesc_multi_arg + i;
      return ix86_expand_multi_arg_builtin (d->icode, exp, target,
					    (enum ix86_builtin_func_type)
					    d->flag, d->comparison);
    }

  if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
      && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
    {
      i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
      return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
					       target);
    }

  if (fcode >= IX86_BUILTIN__BDESC_CET_NORMAL_FIRST
      && fcode <= IX86_BUILTIN__BDESC_CET_NORMAL_LAST)
    {
      i = fcode - IX86_BUILTIN__BDESC_CET_NORMAL_FIRST;
      return ix86_expand_special_args_builtin (bdesc_cet_rdssp + i, exp,
					       target);
    }

  gcc_unreachable ();
}
/* A subroutine of ix86_expand_vector_init_duplicate.  Tries to
   fill target with val via vec_duplicate.  */

static bool
ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
{
  bool ok;
  rtx_insn *insn;
  rtx dup;

  /* First attempt to recognize VAL as-is.  */
  dup = gen_vec_duplicate (mode, val);
  insn = emit_insn (gen_rtx_SET (target, dup));
  if (recog_memoized (insn) < 0)
    {
      rtx_insn *seq;
      machine_mode innermode = GET_MODE_INNER (mode);
      rtx reg;

      /* If that fails, force VAL into a register.  */

      start_sequence ();
      reg = force_reg (innermode, val);
      if (GET_MODE (reg) != innermode)
	reg = gen_lowpart (innermode, reg);
      SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
      seq = get_insns ();
      end_sequence ();
      if (seq)
	emit_insn_before (seq, insn);

      ok = recog_memoized (insn) >= 0;
      gcc_assert (ok);
    }
  return true;
}

/* Get a vector mode of the same size as the original but with elements
   twice as wide.  This is only guaranteed to apply to integral vectors.  */

static machine_mode
get_mode_wider_vector (machine_mode o)
{
  /* ??? Rely on the ordering that genmodes.c gives to vectors.  */
  machine_mode n = GET_MODE_WIDER_MODE (o).require ();
  gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
  gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
  return n;
}
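
/* For illustration: with the standard mode ordering this maps e.g.
   V16QImode to V8HImode and V8HImode to V4SImode; same 128-bit size,
   half as many units, each twice as wide.  */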
13211 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d
*d
);
13212 static bool expand_vec_perm_1 (struct expand_vec_perm_d
*d
);
/* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
   with all elements equal to VAR.  Return true if successful.  */

static bool
ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
				   rtx target, rtx val)
{
  bool ok;

  switch (mode)
    {
    case E_V2SImode:
    case E_V2SFmode:
      if (!mmx_ok)
	return false;
      /* FALLTHRU */

    case E_V4DFmode:
    case E_V4DImode:
    case E_V8SFmode:
    case E_V8SImode:
    case E_V2DFmode:
    case E_V2DImode:
    case E_V4SFmode:
    case E_V4SImode:
    case E_V16SImode:
    case E_V8DImode:
    case E_V16SFmode:
    case E_V8DFmode:
      return ix86_vector_duplicate_value (mode, target, val);

    case E_V4HImode:
      if (!mmx_ok)
	return false;
      if (TARGET_SSE || TARGET_3DNOW_A)
	{
	  rtx x;

	  val = gen_lowpart (SImode, val);
	  x = gen_rtx_TRUNCATE (HImode, val);
	  x = gen_rtx_VEC_DUPLICATE (mode, x);
	  emit_insn (gen_rtx_SET (target, x));
	  return true;
	}
      goto widen;

    case E_V8QImode:
      if (!mmx_ok)
	return false;
      goto widen;

    case E_V8HImode:
      if (TARGET_AVX2)
	return ix86_vector_duplicate_value (mode, target, val);

      if (TARGET_SSE2)
	{
	  struct expand_vec_perm_d dperm;
	  rtx tmp1, tmp2;

	permute:
	  memset (&dperm, 0, sizeof (dperm));
	  dperm.target = target;
	  dperm.vmode = mode;
	  dperm.nelt = GET_MODE_NUNITS (mode);
	  dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
	  dperm.one_operand_p = true;

	  /* Extend to SImode using a paradoxical SUBREG.  */
	  tmp1 = gen_reg_rtx (SImode);
	  emit_move_insn (tmp1, gen_lowpart (SImode, val));

	  /* Insert the SImode value as low element of a V4SImode vector.  */
	  tmp2 = gen_reg_rtx (V4SImode);
	  emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
	  emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));

	  ok = (expand_vec_perm_1 (&dperm)
		|| expand_vec_perm_broadcast_1 (&dperm));
	  gcc_assert (ok);
	  return ok;
	}
      goto widen;

    case E_V16QImode:
      if (TARGET_AVX2)
	return ix86_vector_duplicate_value (mode, target, val);

      if (TARGET_SSE2)
	goto permute;
      goto widen;

    widen:
      /* Replicate the value once into the next wider mode and recurse.  */
      {
	machine_mode smode, wsmode, wvmode;
	rtx x;

	smode = GET_MODE_INNER (mode);
	wvmode = get_mode_wider_vector (mode);
	wsmode = GET_MODE_INNER (wvmode);

	val = convert_modes (wsmode, smode, val, true);
	x = expand_simple_binop (wsmode, ASHIFT, val,
				 GEN_INT (GET_MODE_BITSIZE (smode)),
				 NULL_RTX, 1, OPTAB_LIB_WIDEN);
	val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);

	x = gen_reg_rtx (wvmode);
	ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
	gcc_assert (ok);
	emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
	return ok;
      }

    case E_V16HImode:
    case E_V32QImode:
      if (TARGET_AVX2)
	return ix86_vector_duplicate_value (mode, target, val);
      else
	{
	  machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
	  rtx x = gen_reg_rtx (hvmode);

	  ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
	  gcc_assert (ok);

	  x = gen_rtx_VEC_CONCAT (mode, x, x);
	  emit_insn (gen_rtx_SET (target, x));
	}
      return true;

    case E_V64QImode:
    case E_V32HImode:
      if (TARGET_AVX512BW)
	return ix86_vector_duplicate_value (mode, target, val);
      else
	{
	  machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode);
	  rtx x = gen_reg_rtx (hvmode);

	  ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
	  gcc_assert (ok);

	  x = gen_rtx_VEC_CONCAT (mode, x, x);
	  emit_insn (gen_rtx_SET (target, x));
	}
      return true;

    default:
      return false;
    }
}
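
/* Illustrative note on the "widen" strategy above: broadcasting an N-bit
   value V is the same as broadcasting the 2N-bit value (V << N) | V into a
   vector with half as many elements.  E.g. splatting the byte 0xAB into
   V8QImode is done by splatting the halfword 0xABAB into V4HImode, recursing
   until an element width with direct hardware support is reached.  */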
/* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
   whose ONE_VAR element is VAR, and other elements are zero.  Return true
   if successful.  */

static bool
ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
				     rtx target, rtx var, int one_var)
{
  machine_mode vsimode;
  rtx new_target;
  rtx x, tmp;
  bool use_vector_set = false;
  rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;

  switch (mode)
    {
    case E_V2DImode:
      /* For SSE4.1, we normally use vector set.  But if the second
	 element is zero and inter-unit moves are OK, we use movq
	 instead.  */
      use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
			&& !(TARGET_INTER_UNIT_MOVES_TO_VEC
			     && one_var == 0));
      break;
    case E_V16QImode:
    case E_V4SImode:
    case E_V4SFmode:
      use_vector_set = TARGET_SSE4_1;
      break;
    case E_V8HImode:
      use_vector_set = TARGET_SSE2;
      break;
    case E_V4HImode:
      use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
      break;
    case E_V32QImode:
    case E_V16HImode:
      use_vector_set = TARGET_AVX;
      break;
    case E_V8SImode:
      use_vector_set = TARGET_AVX;
      gen_vec_set_0 = gen_vec_setv8si_0;
      break;
    case E_V8SFmode:
      use_vector_set = TARGET_AVX;
      gen_vec_set_0 = gen_vec_setv8sf_0;
      break;
    case E_V4DFmode:
      use_vector_set = TARGET_AVX;
      gen_vec_set_0 = gen_vec_setv4df_0;
      break;
    case E_V4DImode:
      /* Use ix86_expand_vector_set in 64bit mode only.  */
      use_vector_set = TARGET_AVX && TARGET_64BIT;
      gen_vec_set_0 = gen_vec_setv4di_0;
      break;
    case E_V16SImode:
      use_vector_set = TARGET_AVX512F && one_var == 0;
      gen_vec_set_0 = gen_vec_setv16si_0;
      break;
    case E_V16SFmode:
      use_vector_set = TARGET_AVX512F && one_var == 0;
      gen_vec_set_0 = gen_vec_setv16sf_0;
      break;
    case E_V8DFmode:
      use_vector_set = TARGET_AVX512F && one_var == 0;
      gen_vec_set_0 = gen_vec_setv8df_0;
      break;
    case E_V8DImode:
      /* Use ix86_expand_vector_set in 64bit mode only.  */
      use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
      gen_vec_set_0 = gen_vec_setv8di_0;
      break;
    default:
      break;
    }

  if (use_vector_set)
    {
      if (gen_vec_set_0 && one_var == 0)
	{
	  var = force_reg (GET_MODE_INNER (mode), var);
	  emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
	  return true;
	}
      emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
      var = force_reg (GET_MODE_INNER (mode), var);
      ix86_expand_vector_set (mmx_ok, target, var, one_var);
      return true;
    }

  switch (mode)
    {
    case E_V2SFmode:
    case E_V2SImode:
      if (!mmx_ok)
	return false;
      /* FALLTHRU */

    case E_V2DFmode:
    case E_V2DImode:
      if (one_var != 0)
	return false;
      var = force_reg (GET_MODE_INNER (mode), var);
      x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
      emit_insn (gen_rtx_SET (target, x));
      return true;

    case E_V4SFmode:
    case E_V4SImode:
      if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
	new_target = gen_reg_rtx (mode);
      else
	new_target = target;
      var = force_reg (GET_MODE_INNER (mode), var);
      x = gen_rtx_VEC_DUPLICATE (mode, var);
      x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
      emit_insn (gen_rtx_SET (new_target, x));
      if (one_var != 0)
	{
	  /* We need to shuffle the value to the correct position, so
	     create a new pseudo to store the intermediate result.  */

	  /* With SSE2, we can use the integer shuffle insns.  */
	  if (mode != V4SFmode && TARGET_SSE2)
	    {
	      emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
					    const1_rtx,
					    GEN_INT (one_var == 1 ? 0 : 1),
					    GEN_INT (one_var == 2 ? 0 : 1),
					    GEN_INT (one_var == 3 ? 0 : 1)));
	      if (target != new_target)
		emit_move_insn (target, new_target);
	    }
	  else
	    {
	      /* Otherwise convert the intermediate result to V4SFmode and
		 use the SSE1 shuffle instructions.  */
	      if (mode != V4SFmode)
		{
		  tmp = gen_reg_rtx (V4SFmode);
		  emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
		}
	      else
		tmp = new_target;

	      emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
					      const1_rtx,
					      GEN_INT (one_var == 1 ? 0 : 1),
					      GEN_INT (one_var == 2 ? 0+4 : 1+4),
					      GEN_INT (one_var == 3 ? 0+4 : 1+4)));

	      if (mode != V4SFmode)
		emit_move_insn (target, gen_lowpart (V4SImode, tmp));
	      else if (tmp != target)
		emit_move_insn (target, tmp);
	    }
	}
      else if (target != new_target)
	emit_move_insn (target, new_target);
      return true;

    case E_V8HImode:
    case E_V16QImode:
      vsimode = V4SImode;
      goto widen;
    case E_V4HImode:
    case E_V8QImode:
      if (!mmx_ok)
	return false;
      vsimode = V2SImode;
      goto widen;
    widen:
      if (one_var != 0)
	return false;

      /* Zero extend the variable element to SImode and recurse.  */
      var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);

      x = gen_reg_rtx (vsimode);
      if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
						var, one_var))
	gcc_unreachable ();

      emit_move_insn (target, gen_lowpart (mode, x));
      return true;

    default:
      return false;
    }
}
/* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
   consisting of the values in VALS.  It is known that all elements
   except ONE_VAR are constants.  Return true if successful.  */

static bool
ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
				 rtx target, rtx vals, int one_var)
{
  rtx var = XVECEXP (vals, 0, one_var);
  machine_mode wmode;
  rtx const_vec, x;

  const_vec = copy_rtx (vals);
  XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
  const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));

  switch (mode)
    {
    case E_V2DFmode:
    case E_V2DImode:
    case E_V2SFmode:
    case E_V2SImode:
      /* For the two element vectors, it's just as easy to use
	 the general case.  */
      return false;

    case E_V4DImode:
      /* Use ix86_expand_vector_set in 64bit mode only.  */
      if (!TARGET_64BIT)
	return false;
      /* FALLTHRU */
    case E_V4DFmode:
    case E_V8SFmode:
    case E_V8SImode:
    case E_V16HImode:
    case E_V32QImode:
    case E_V4SFmode:
    case E_V4SImode:
    case E_V8HImode:
    case E_V4HImode:
      break;

    case E_V16QImode:
      if (TARGET_SSE4_1)
	break;
      wmode = V8HImode;
      goto widen;
    case E_V8QImode:
      wmode = V4HImode;
      goto widen;
    widen:
      /* There's no way to set one QImode entry easily.  Combine
	 the variable value with its adjacent constant value, and
	 promote to an HImode set.  */
      x = XVECEXP (vals, 0, one_var ^ 1);
      if (one_var & 1)
	{
	  var = convert_modes (HImode, QImode, var, true);
	  var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
				     NULL_RTX, 1, OPTAB_LIB_WIDEN);
	  x = GEN_INT (INTVAL (x) & 0xff);
	}
      else
	{
	  var = convert_modes (HImode, QImode, var, true);
	  x = gen_int_mode (UINTVAL (x) << 8, HImode);
	}
      if (x != const0_rtx)
	var = expand_simple_binop (HImode, IOR, var, x, var,
				   1, OPTAB_LIB_WIDEN);

      x = gen_reg_rtx (wmode);
      emit_move_insn (x, gen_lowpart (wmode, const_vec));
      ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);

      emit_move_insn (target, gen_lowpart (mode, x));
      return true;

    default:
      return false;
    }

  emit_move_insn (target, const_vec);
  ix86_expand_vector_set (mmx_ok, target, var, one_var);
  return true;
}
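
/* Illustrative note on the QImode "widen" path above: the variable byte and
   its constant neighbor are fused into one HImode value before the set.
   E.g. with one_var == 3 and a constant 0x12 in element 2, the emitted
   computation is var' = (var << 8) | 0x12, which is then stored as HImode
   element 1 (== one_var >> 1) of the V4HImode view of the vector.  */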
/* A subroutine of ix86_expand_vector_init_general.  Use vector
   concatenate to handle the most general case: all values variable,
   and none identical.  */

static void
ix86_expand_vector_init_concat (machine_mode mode,
				rtx target, rtx *ops, int n)
{
  machine_mode cmode, hmode = VOIDmode, gmode = VOIDmode;
  rtx first[16], second[8], third[4];
  rtvec v;
  int i, j;

  switch (n)
    {
    case 2:
      switch (mode)
	{
	case E_V16SImode:
	  cmode = V8SImode;
	  break;
	case E_V16SFmode:
	  cmode = V8SFmode;
	  break;
	case E_V8DImode:
	  cmode = V4DImode;
	  break;
	case E_V8DFmode:
	  cmode = V4DFmode;
	  break;
	case E_V8SImode:
	  cmode = V4SImode;
	  break;
	case E_V8SFmode:
	  cmode = V4SFmode;
	  break;
	case E_V4DImode:
	  cmode = V2DImode;
	  break;
	case E_V4DFmode:
	  cmode = V2DFmode;
	  break;
	case E_V4SImode:
	  cmode = V2SImode;
	  break;
	case E_V4SFmode:
	  cmode = V2SFmode;
	  break;
	case E_V2DImode:
	  cmode = DImode;
	  break;
	case E_V2SImode:
	  cmode = SImode;
	  break;
	case E_V2DFmode:
	  cmode = DFmode;
	  break;
	case E_V2SFmode:
	  cmode = SFmode;
	  break;
	default:
	  gcc_unreachable ();
	}

      if (!register_operand (ops[1], cmode))
	ops[1] = force_reg (cmode, ops[1]);
      if (!register_operand (ops[0], cmode))
	ops[0] = force_reg (cmode, ops[0]);
      emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
							  ops[1])));
      break;

    case 4:
      switch (mode)
	{
	case E_V4DImode:
	  cmode = V2DImode;
	  break;
	case E_V4DFmode:
	  cmode = V2DFmode;
	  break;
	case E_V4SImode:
	  cmode = V2SImode;
	  break;
	case E_V4SFmode:
	  cmode = V2SFmode;
	  break;
	default:
	  gcc_unreachable ();
	}
      goto half;

    case 8:
      switch (mode)
	{
	case E_V8DImode:
	  cmode = V2DImode;
	  hmode = V4DImode;
	  break;
	case E_V8DFmode:
	  cmode = V2DFmode;
	  hmode = V4DFmode;
	  break;
	case E_V8SImode:
	  cmode = V2SImode;
	  hmode = V4SImode;
	  break;
	case E_V8SFmode:
	  cmode = V2SFmode;
	  hmode = V4SFmode;
	  break;
	default:
	  gcc_unreachable ();
	}
      goto half;

    case 16:
      switch (mode)
	{
	case E_V16SImode:
	  cmode = V2SImode;
	  hmode = V4SImode;
	  gmode = V8SImode;
	  break;
	case E_V16SFmode:
	  cmode = V2SFmode;
	  hmode = V4SFmode;
	  gmode = V8SFmode;
	  break;
	default:
	  gcc_unreachable ();
	}
      goto half;

half:
      /* FIXME: We process inputs backward to help RA.  PR 36222.  */
      i = n - 1;
      j = (n >> 1) - 1;
      for (; i > 0; i -= 2, j--)
	{
	  first[j] = gen_reg_rtx (cmode);
	  v = gen_rtvec (2, ops[i - 1], ops[i]);
	  ix86_expand_vector_init (false, first[j],
				   gen_rtx_PARALLEL (cmode, v));
	}

      n >>= 1;
      if (n > 4)
	{
	  gcc_assert (hmode != VOIDmode);
	  gcc_assert (gmode != VOIDmode);
	  for (i = j = 0; i < n; i += 2, j++)
	    {
	      second[j] = gen_reg_rtx (hmode);
	      ix86_expand_vector_init_concat (hmode, second[j],
					      &first[i], 2);
	    }
	  n >>= 1;
	  for (i = j = 0; i < n; i += 2, j++)
	    {
	      third[j] = gen_reg_rtx (gmode);
	      ix86_expand_vector_init_concat (gmode, third[j],
					      &second[i], 2);
	    }
	  n >>= 1;
	  ix86_expand_vector_init_concat (mode, target, third, n);
	}
      else if (n > 2)
	{
	  gcc_assert (hmode != VOIDmode);
	  for (i = j = 0; i < n; i += 2, j++)
	    {
	      second[j] = gen_reg_rtx (hmode);
	      ix86_expand_vector_init_concat (hmode, second[j],
					      &first[i], 2);
	    }
	  n >>= 1;
	  ix86_expand_vector_init_concat (mode, target, second, n);
	}
      else
	ix86_expand_vector_init_concat (mode, target, first, n);
      break;

    default:
      gcc_unreachable ();
    }
}
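
/* Illustrative note: the concat strategy is a divide-and-conquer build.
   For n == 4 scalars a,b,c,d in V4SFmode, pairs are first combined into
   two V2SFmode registers (the c,d pair first, to help the register
   allocator, see PR 36222), and the halves are then glued with a single
   VEC_CONCAT:
     first[1] = {c,d};  first[0] = {a,b};  target = {first[0], first[1]}.  */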
/* A subroutine of ix86_expand_vector_init_general.  Use vector
   interleave to handle the most general case: all values variable,
   and none identical.  */

static void
ix86_expand_vector_init_interleave (machine_mode mode,
				    rtx target, rtx *ops, int n)
{
  machine_mode first_imode, second_imode, third_imode, inner_mode;
  int i, j;
  rtx op0, op1;
  rtx (*gen_load_even) (rtx, rtx, rtx);
  rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
  rtx (*gen_interleave_second_low) (rtx, rtx, rtx);

  switch (mode)
    {
    case E_V8HImode:
      gen_load_even = gen_vec_setv8hi;
      gen_interleave_first_low = gen_vec_interleave_lowv4si;
      gen_interleave_second_low = gen_vec_interleave_lowv2di;
      inner_mode = HImode;
      first_imode = V4SImode;
      second_imode = V2DImode;
      third_imode = VOIDmode;
      break;
    case E_V16QImode:
      gen_load_even = gen_vec_setv16qi;
      gen_interleave_first_low = gen_vec_interleave_lowv8hi;
      gen_interleave_second_low = gen_vec_interleave_lowv4si;
      inner_mode = QImode;
      first_imode = V8HImode;
      second_imode = V4SImode;
      third_imode = V2DImode;
      break;
    default:
      gcc_unreachable ();
    }

  for (i = 0; i < n; i++)
    {
      /* Extend the odd element to SImode using a paradoxical SUBREG.  */
      op0 = gen_reg_rtx (SImode);
      emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));

      /* Insert the SImode value as low element of V4SImode vector.  */
      op1 = gen_reg_rtx (V4SImode);
      op0 = gen_rtx_VEC_MERGE (V4SImode,
			       gen_rtx_VEC_DUPLICATE (V4SImode,
						      op0),
			       CONST0_RTX (V4SImode),
			       const1_rtx);
      emit_insn (gen_rtx_SET (op1, op0));

      /* Cast the V4SImode vector back to a vector in original mode.  */
      op0 = gen_reg_rtx (mode);
      emit_move_insn (op0, gen_lowpart (mode, op1));

      /* Load even elements into the second position.  */
      emit_insn (gen_load_even (op0,
				force_reg (inner_mode,
					   ops [i + i + 1]),
				const1_rtx));

      /* Cast vector to FIRST_IMODE vector.  */
      ops[i] = gen_reg_rtx (first_imode);
      emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
    }

  /* Interleave low FIRST_IMODE vectors.  */
  for (i = j = 0; i < n; i += 2, j++)
    {
      op0 = gen_reg_rtx (first_imode);
      emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));

      /* Cast FIRST_IMODE vector to SECOND_IMODE vector.  */
      ops[j] = gen_reg_rtx (second_imode);
      emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
    }

  /* Interleave low SECOND_IMODE vectors.  */
  switch (second_imode)
    {
    case E_V4SImode:
      for (i = j = 0; i < n / 2; i += 2, j++)
	{
	  op0 = gen_reg_rtx (second_imode);
	  emit_insn (gen_interleave_second_low (op0, ops[i],
						ops[i + 1]));

	  /* Cast the SECOND_IMODE vector to the THIRD_IMODE
	     vector.  */
	  ops[j] = gen_reg_rtx (third_imode);
	  emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
	}
      second_imode = V2DImode;
      gen_interleave_second_low = gen_vec_interleave_lowv2di;
      /* FALLTHRU */

    case E_V2DImode:
      op0 = gen_reg_rtx (second_imode);
      emit_insn (gen_interleave_second_low (op0, ops[0],
					    ops[1]));

      /* Cast the SECOND_IMODE vector back to a vector on original
	 mode.  */
      emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
      break;

    default:
      gcc_unreachable ();
    }
}
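
/* Illustrative note: the loop above first builds n partial vectors whose
   two lowest elements are the pair (ops[2k], ops[2k+1]); the following
   vec_interleave_low ladder then zips those pairs together at doubling
   granularity (16-bit, then 32-bit, then 64-bit chunks for V16QImode)
   until all elements land in a single register.  */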
/* A subroutine of ix86_expand_vector_init.  Handle the most general case:
   all values variable, and none identical.  */

static void
ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
				 rtx target, rtx vals)
{
  rtx ops[64], op0, op1, op2, op3, op4, op5;
  machine_mode half_mode = VOIDmode;
  machine_mode quarter_mode = VOIDmode;
  int n, i;

  switch (mode)
    {
    case E_V2SFmode:
    case E_V2SImode:
      if (!mmx_ok && !TARGET_SSE)
	break;
      /* FALLTHRU */

    case E_V8SFmode:
    case E_V8SImode:
    case E_V4DFmode:
    case E_V4DImode:
    case E_V4SFmode:
    case E_V4SImode:
    case E_V2DFmode:
    case E_V2DImode:
      n = GET_MODE_NUNITS (mode);
      for (i = 0; i < n; i++)
	ops[i] = XVECEXP (vals, 0, i);
      ix86_expand_vector_init_concat (mode, target, ops, n);
      return;

    case E_V2TImode:
      for (i = 0; i < 2; i++)
	ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
      op0 = gen_reg_rtx (V4DImode);
      ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
      emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
      return;

    case E_V4TImode:
      for (i = 0; i < 4; i++)
	ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
      ops[4] = gen_reg_rtx (V4DImode);
      ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
      ops[5] = gen_reg_rtx (V4DImode);
      ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
      op0 = gen_reg_rtx (V8DImode);
      ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
      emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
      return;

    case E_V32QImode:
      half_mode = V16QImode;
      goto half;

    case E_V16HImode:
      half_mode = V8HImode;
      goto half;

half:
      n = GET_MODE_NUNITS (mode);
      for (i = 0; i < n; i++)
	ops[i] = XVECEXP (vals, 0, i);
      op0 = gen_reg_rtx (half_mode);
      op1 = gen_reg_rtx (half_mode);
      ix86_expand_vector_init_interleave (half_mode, op0, ops,
					  n >> 2);
      ix86_expand_vector_init_interleave (half_mode, op1,
					  &ops [n >> 1], n >> 2);
      emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
      return;

    case E_V64QImode:
      quarter_mode = V16QImode;
      half_mode = V32QImode;
      goto quarter;

    case E_V32HImode:
      quarter_mode = V8HImode;
      half_mode = V16HImode;
      goto quarter;

quarter:
      n = GET_MODE_NUNITS (mode);
      for (i = 0; i < n; i++)
	ops[i] = XVECEXP (vals, 0, i);
      op0 = gen_reg_rtx (quarter_mode);
      op1 = gen_reg_rtx (quarter_mode);
      op2 = gen_reg_rtx (quarter_mode);
      op3 = gen_reg_rtx (quarter_mode);
      op4 = gen_reg_rtx (half_mode);
      op5 = gen_reg_rtx (half_mode);
      ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
					  n >> 3);
      ix86_expand_vector_init_interleave (quarter_mode, op1,
					  &ops [n >> 2], n >> 3);
      ix86_expand_vector_init_interleave (quarter_mode, op2,
					  &ops [n >> 1], n >> 3);
      ix86_expand_vector_init_interleave (quarter_mode, op3,
					  &ops [(n >> 1) | (n >> 2)], n >> 3);
      emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
      emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
      emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
      return;

    case E_V16QImode:
      if (!TARGET_SSE4_1)
	break;
      /* FALLTHRU */

    case E_V8HImode:
      if (!TARGET_SSE2)
	break;

      /* Don't use ix86_expand_vector_init_interleave if we can't
	 move from GPR to SSE register directly.  */
      if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
	break;

      n = GET_MODE_NUNITS (mode);
      for (i = 0; i < n; i++)
	ops[i] = XVECEXP (vals, 0, i);
      ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
      return;

    case E_V4HImode:
    case E_V8QImode:
      break;

    default:
      gcc_unreachable ();
    }

  {
    int i, j, n_elts, n_words, n_elt_per_word;
    machine_mode inner_mode;
    rtx words[4], shift;

    inner_mode = GET_MODE_INNER (mode);
    n_elts = GET_MODE_NUNITS (mode);
    n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
    n_elt_per_word = n_elts / n_words;
    shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));

    for (i = 0; i < n_words; ++i)
      {
	rtx word = NULL_RTX;

	for (j = 0; j < n_elt_per_word; ++j)
	  {
	    rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
	    elt = convert_modes (word_mode, inner_mode, elt, true);

	    if (j == 0)
	      word = elt;
	    else
	      {
		word = expand_simple_binop (word_mode, ASHIFT, word, shift,
					    word, 1, OPTAB_LIB_WIDEN);
		word = expand_simple_binop (word_mode, IOR, word, elt,
					    word, 1, OPTAB_LIB_WIDEN);
	      }
	  }

	words[i] = word;
      }

    if (n_words == 1)
      emit_move_insn (target, gen_lowpart (mode, words[0]));
    else if (n_words == 2)
      {
	rtx tmp = gen_reg_rtx (mode);
	emit_clobber (tmp);
	emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
	emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
	emit_move_insn (target, tmp);
      }
    else if (n_words == 4)
      {
	rtx tmp = gen_reg_rtx (V4SImode);
	gcc_assert (word_mode == SImode);
	vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
	ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
	emit_move_insn (target, gen_lowpart (mode, tmp));
      }
    else
      gcc_unreachable ();
  }
}
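
/* Illustrative note on the word-packing fallback above: elements are
   accumulated into integer words highest-element-first, so for a V4HImode
   vector {e0,e1,e2,e3} on a 64-bit word the computed value is
     word = (((((e3 << 16) | e2) << 16) | e1) << 16) | e0,
   leaving element 0 in the least significant bits, which matches the
   little-endian in-register vector layout.  */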
/* Initialize vector TARGET via VALS.  Suppress the use of MMX
   instructions unless MMX_OK is true.  */

void
ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
{
  machine_mode mode = GET_MODE (target);
  machine_mode inner_mode = GET_MODE_INNER (mode);
  int n_elts = GET_MODE_NUNITS (mode);
  int n_var = 0, one_var = -1;
  bool all_same = true, all_const_zero = true;
  int i;
  rtx x;

  /* Handle first initialization from vector elts.  */
  if (n_elts != XVECLEN (vals, 0))
    {
      rtx subtarget = target;
      x = XVECEXP (vals, 0, 0);
      gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
      if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
	{
	  rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
	  if (inner_mode == QImode || inner_mode == HImode)
	    {
	      unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
	      mode = mode_for_vector (SImode, n_bits / 4).require ();
	      inner_mode = mode_for_vector (SImode, n_bits / 8).require ();
	      ops[0] = gen_lowpart (inner_mode, ops[0]);
	      ops[1] = gen_lowpart (inner_mode, ops[1]);
	      subtarget = gen_reg_rtx (mode);
	    }
	  ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
	  if (subtarget != target)
	    emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
	  return;
	}
      gcc_unreachable ();
    }

  for (i = 0; i < n_elts; ++i)
    {
      x = XVECEXP (vals, 0, i);
      if (!(CONST_SCALAR_INT_P (x)
	    || CONST_DOUBLE_P (x)
	    || CONST_FIXED_P (x)))
	n_var++, one_var = i;
      else if (x != CONST0_RTX (inner_mode))
	all_const_zero = false;
      if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
	all_same = false;
    }

  /* Constants are best loaded from the constant pool.  */
  if (n_var == 0)
    {
      emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
      return;
    }

  /* If all values are identical, broadcast the value.  */
  if (all_same
      && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
					    XVECEXP (vals, 0, 0)))
    return;

  /* Values where only one field is non-constant are best loaded from
     the pool and overwritten via move later.  */
  if (n_var == 1)
    {
      if (all_const_zero
	  && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
						  XVECEXP (vals, 0, one_var),
						  one_var))
	return;

      if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals,
					   one_var))
	return;
    }

  ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
}
void
ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
{
  machine_mode mode = GET_MODE (target);
  machine_mode inner_mode = GET_MODE_INNER (mode);
  machine_mode half_mode;
  bool use_vec_merge = false;
  rtx tmp;
  static rtx (*gen_extract[6][2]) (rtx, rtx)
    = {
	{ gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
	{ gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
	{ gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
	{ gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
	{ gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
	{ gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
      };
  static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
    = {
	{ gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
	{ gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
	{ gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
	{ gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
	{ gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
	{ gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
      };
  machine_mode mmode = VOIDmode;
  rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
  int i, j, n;

  switch (mode)
    {
    case E_V2SFmode:
    case E_V2SImode:
      if (mmx_ok)
	{
	  tmp = gen_reg_rtx (GET_MODE_INNER (mode));
	  ix86_expand_vector_extract (true, tmp, target, 1 - elt);
	  if (elt == 0)
	    tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
	  else
	    tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
	  emit_insn (gen_rtx_SET (target, tmp));
	  return;
	}
      break;

    case E_V2DImode:
      use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
      if (use_vec_merge)
	break;

      tmp = gen_reg_rtx (GET_MODE_INNER (mode));
      ix86_expand_vector_extract (false, tmp, target, 1 - elt);
      if (elt == 0)
	tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
      else
	tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
      emit_insn (gen_rtx_SET (target, tmp));
      return;

    case E_V2DFmode:
      /* NB: For ELT == 0, use standard scalar operation patterns which
	 preserve the rest of the vector for combiner:

	 (vec_merge:V2DF
	   (vec_duplicate:V2DF (reg:DF))
	   (reg:V2DF)
	   (const_int 1))
       */
      if (elt == 0)
	goto do_vec_merge;

      {
	rtx op0, op1;

	/* For the two element vectors, we implement a VEC_CONCAT with
	   the extraction of the other element.  */

	tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
	tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);

	if (elt == 0)
	  op0 = val, op1 = tmp;
	else
	  op0 = tmp, op1 = val;

	tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
	emit_insn (gen_rtx_SET (target, tmp));
      }
      return;

    case E_V4SFmode:
      use_vec_merge = TARGET_SSE4_1;
      if (use_vec_merge)
	break;

      switch (elt)
	{
	case 0:
	  use_vec_merge = true;
	  break;

	case 1:
	  /* tmp = target = A B C D */
	  tmp = copy_to_reg (target);
	  /* target = A A B B */
	  emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
	  /* target = X A B B */
	  ix86_expand_vector_set (false, target, val, 0);
	  /* target = A X C D  */
	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
					  const1_rtx, const0_rtx,
					  GEN_INT (2+4), GEN_INT (3+4)));
	  return;

	case 2:
	  /* tmp = target = A B C D */
	  tmp = copy_to_reg (target);
	  /* tmp = X B C D */
	  ix86_expand_vector_set (false, tmp, val, 0);
	  /* target = A B X D */
	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
					  const0_rtx, const1_rtx,
					  GEN_INT (0+4), GEN_INT (3+4)));
	  return;

	case 3:
	  /* tmp = target = A B C D */
	  tmp = copy_to_reg (target);
	  /* tmp = X B C D */
	  ix86_expand_vector_set (false, tmp, val, 0);
	  /* target = A B C X */
	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
					  const0_rtx, const1_rtx,
					  GEN_INT (2+4), GEN_INT (0+4)));
	  return;

	default:
	  gcc_unreachable ();
	}
      break;

    case E_V4SImode:
      use_vec_merge = TARGET_SSE4_1;
      if (use_vec_merge)
	break;

      /* Element 0 handled by vec_merge below.  */
      if (elt == 0)
	{
	  use_vec_merge = true;
	  break;
	}

      if (TARGET_SSE2)
	{
	  /* With SSE2, use integer shuffles to swap element 0 and ELT,
	     store into element 0, then shuffle them back.  */

	  rtx order[4];

	  order[0] = GEN_INT (elt);
	  order[1] = const1_rtx;
	  order[2] = const2_rtx;
	  order[3] = GEN_INT (3);
	  order[elt] = const0_rtx;

	  emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
					order[1], order[2], order[3]));

	  ix86_expand_vector_set (false, target, val, 0);

	  emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
					order[1], order[2], order[3]));
	}
      else
	{
	  /* For SSE1, we have to reuse the V4SF code.  */
	  rtx t = gen_reg_rtx (V4SFmode);
	  emit_move_insn (t, gen_lowpart (V4SFmode, target));
	  ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
	  emit_move_insn (target, gen_lowpart (mode, t));
	}
      return;

    case E_V8HImode:
      use_vec_merge = TARGET_SSE2;
      break;
    case E_V4HImode:
      use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
      break;

    case E_V16QImode:
      use_vec_merge = TARGET_SSE4_1;
      break;

    case E_V8QImode:
      break;

    case E_V32QImode:
      half_mode = V16QImode;
      j = 0;
      n = 16;
      goto half;

    case E_V16HImode:
      half_mode = V8HImode;
      j = 1;
      n = 8;
      goto half;

    case E_V8SImode:
      half_mode = V4SImode;
      j = 2;
      n = 4;
      goto half;

    case E_V4DImode:
      half_mode = V2DImode;
      j = 3;
      n = 2;
      goto half;

    case E_V8SFmode:
      half_mode = V4SFmode;
      j = 4;
      n = 4;
      goto half;

    case E_V4DFmode:
      half_mode = V2DFmode;
      j = 5;
      n = 2;
      goto half;

half:
      /* Compute offset.  */
      i = elt / n;
      elt %= n;

      gcc_assert (i <= 1);

      /* Extract the half.  */
      tmp = gen_reg_rtx (half_mode);
      emit_insn (gen_extract[j][i] (tmp, target));

      /* Put val in tmp at elt.  */
      ix86_expand_vector_set (false, tmp, val, elt);

      /* Put it back.  */
      emit_insn (gen_insert[j][i] (target, target, tmp));
      return;

    case E_V8DFmode:
      if (TARGET_AVX512F)
	{
	  mmode = QImode;
	  gen_blendm = gen_avx512f_blendmv8df;
	}
      break;

    case E_V8DImode:
      if (TARGET_AVX512F)
	{
	  mmode = QImode;
	  gen_blendm = gen_avx512f_blendmv8di;
	}
      break;

    case E_V16SFmode:
      if (TARGET_AVX512F)
	{
	  mmode = HImode;
	  gen_blendm = gen_avx512f_blendmv16sf;
	}
      break;

    case E_V16SImode:
      if (TARGET_AVX512F)
	{
	  mmode = HImode;
	  gen_blendm = gen_avx512f_blendmv16si;
	}
      break;

    case E_V32HImode:
      if (TARGET_AVX512BW)
	{
	  mmode = SImode;
	  gen_blendm = gen_avx512bw_blendmv32hi;
	}
      else if (TARGET_AVX512F)
	{
	  half_mode = E_V8HImode;
	  n = 8;
	  goto quarter;
	}
      break;

    case E_V64QImode:
      if (TARGET_AVX512BW)
	{
	  mmode = DImode;
	  gen_blendm = gen_avx512bw_blendmv64qi;
	}
      else if (TARGET_AVX512F)
	{
	  half_mode = E_V16QImode;
	  n = 16;
	  goto quarter;
	}
      break;

quarter:
      /* Compute offset.  */
      i = elt / n;
      elt %= n;

      gcc_assert (i <= 3);

      {
	/* Extract the quarter.  */
	tmp = gen_reg_rtx (V4SImode);
	rtx tmp2 = gen_lowpart (V16SImode, target);
	rtx mask = gen_reg_rtx (QImode);

	emit_move_insn (mask, constm1_rtx);
	emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i),
						   tmp, mask));

	tmp2 = gen_reg_rtx (half_mode);
	emit_move_insn (tmp2, gen_lowpart (half_mode, tmp));
	tmp = tmp2;

	/* Put val in tmp at elt.  */
	ix86_expand_vector_set (false, tmp, val, elt);

	/* Put it back.  */
	tmp2 = gen_reg_rtx (V16SImode);
	rtx tmp3 = gen_lowpart (V16SImode, target);
	mask = gen_reg_rtx (HImode);
	emit_move_insn (mask, constm1_rtx);
	tmp = gen_lowpart (V4SImode, tmp);
	emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i),
						  tmp3, mask));
	emit_move_insn (target, gen_lowpart (mode, tmp2));
      }
      return;

    default:
      break;
    }

  if (mmode != VOIDmode)
    {
      tmp = gen_reg_rtx (mode);
      emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
      /* The avx512*_blendm<mode> expanders have different operand order
	 from VEC_MERGE.  In VEC_MERGE, the first input operand is used for
	 elements where the mask is set and second input operand otherwise,
	 in {sse,avx}*_*blend* the first input operand is used for elements
	 where the mask is clear and second input operand otherwise.  */
      emit_insn (gen_blendm (target, target, tmp,
			     force_reg (mmode,
					gen_int_mode (HOST_WIDE_INT_1U << elt,
						      mmode))));
    }
  else if (use_vec_merge)
    {
do_vec_merge:
      tmp = gen_rtx_VEC_DUPLICATE (mode, val);
      tmp = gen_rtx_VEC_MERGE (mode, tmp, target,
			       GEN_INT (HOST_WIDE_INT_1U << elt));
      emit_insn (gen_rtx_SET (target, tmp));
    }
  else
    {
      rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));

      emit_move_insn (mem, target);

      tmp = adjust_address (mem, inner_mode, elt * GET_MODE_SIZE (inner_mode));
      emit_move_insn (tmp, val);

      emit_move_insn (target, mem);
    }
}
void
ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
{
  machine_mode mode = GET_MODE (vec);
  machine_mode inner_mode = GET_MODE_INNER (mode);
  bool use_vec_extr = false;
  rtx tmp;

  switch (mode)
    {
    case E_V2SImode:
    case E_V2SFmode:
      if (!mmx_ok)
	break;
      /* FALLTHRU */

    case E_V2DFmode:
    case E_V2DImode:
    case E_V2TImode:
    case E_V4TImode:
      use_vec_extr = true;
      break;

    case E_V4SFmode:
      use_vec_extr = TARGET_SSE4_1;
      if (use_vec_extr)
	break;

      switch (elt)
	{
	case 0:
	  tmp = vec;
	  break;

	case 1:
	case 3:
	  tmp = gen_reg_rtx (mode);
	  emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
					  GEN_INT (elt), GEN_INT (elt),
					  GEN_INT (elt+4), GEN_INT (elt+4)));
	  break;

	case 2:
	  tmp = gen_reg_rtx (mode);
	  emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
	  break;

	default:
	  gcc_unreachable ();
	}
      vec = tmp;
      use_vec_extr = true;
      elt = 0;
      break;

    case E_V4SImode:
      use_vec_extr = TARGET_SSE4_1;
      if (use_vec_extr)
	break;

      if (TARGET_SSE2)
	{
	  switch (elt)
	    {
	    case 0:
	      tmp = vec;
	      break;

	    case 1:
	    case 3:
	      tmp = gen_reg_rtx (mode);
	      emit_insn (gen_sse2_pshufd_1 (tmp, vec,
					    GEN_INT (elt), GEN_INT (elt),
					    GEN_INT (elt), GEN_INT (elt)));
	      break;

	    case 2:
	      tmp = gen_reg_rtx (mode);
	      emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
	      break;

	    default:
	      gcc_unreachable ();
	    }
	  vec = tmp;
	  use_vec_extr = true;
	  elt = 0;
	}
      else
	{
	  /* For SSE1, we have to reuse the V4SF code.  */
	  ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
				      gen_lowpart (V4SFmode, vec), elt);
	  return;
	}
      break;

    case E_V8HImode:
      use_vec_extr = TARGET_SSE2;
      break;
    case E_V4HImode:
      use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
      break;

    case E_V16QImode:
      use_vec_extr = TARGET_SSE4_1;
      break;

    case E_V8SFmode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V4SFmode);
	  if (elt < 4)
	    emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 3);
	  return;
	}
      break;

    case E_V4DFmode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V2DFmode);
	  if (elt < 2)
	    emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 1);
	  return;
	}
      break;

    case E_V32QImode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V16QImode);
	  if (elt < 16)
	    emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 15);
	  return;
	}
      break;

    case E_V16HImode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V8HImode);
	  if (elt < 8)
	    emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 7);
	  return;
	}
      break;

    case E_V8SImode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V4SImode);
	  if (elt < 4)
	    emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 3);
	  return;
	}
      break;

    case E_V4DImode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V2DImode);
	  if (elt < 2)
	    emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 1);
	  return;
	}
      break;

    case E_V32HImode:
      if (TARGET_AVX512BW)
	{
	  tmp = gen_reg_rtx (V16HImode);
	  if (elt < 16)
	    emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 15);
	  return;
	}
      break;

    case E_V64QImode:
      if (TARGET_AVX512BW)
	{
	  tmp = gen_reg_rtx (V32QImode);
	  if (elt < 32)
	    emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 31);
	  return;
	}
      break;

    case E_V16SFmode:
      tmp = gen_reg_rtx (V8SFmode);
      if (elt < 8)
	emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
      else
	emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
      ix86_expand_vector_extract (false, target, tmp, elt & 7);
      return;

    case E_V8DFmode:
      tmp = gen_reg_rtx (V4DFmode);
      if (elt < 4)
	emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
      else
	emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
      ix86_expand_vector_extract (false, target, tmp, elt & 3);
      return;

    case E_V16SImode:
      tmp = gen_reg_rtx (V8SImode);
      if (elt < 8)
	emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
      else
	emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
      ix86_expand_vector_extract (false, target, tmp, elt & 7);
      return;

    case E_V8DImode:
      tmp = gen_reg_rtx (V4DImode);
      if (elt < 4)
	emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
      else
	emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
      ix86_expand_vector_extract (false, target, tmp, elt & 3);
      return;

    case E_V8QImode:
      /* ??? Could extract the appropriate HImode element and shift.  */
    default:
      break;
    }

  if (use_vec_extr)
    {
      tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
      tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);

      /* Let the rtl optimizers know about the zero extension performed.  */
      if (inner_mode == QImode || inner_mode == HImode)
	{
	  tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
	  target = gen_lowpart (SImode, target);
	}

      emit_insn (gen_rtx_SET (target, tmp));
    }
  else
    {
      rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));

      emit_move_insn (mem, vec);

      tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
      emit_move_insn (target, tmp);
    }
}
/* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
   to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
   The upper bits of DEST are undefined, though they shouldn't cause
   exceptions (some bits from src or all zeros are ok).  */

static void
emit_reduc_half (rtx dest, rtx src, int i)
{
  rtx tem, d = dest;
  switch (GET_MODE (src))
    {
    case E_V4SFmode:
      if (i == 128)
	tem = gen_sse_movhlps (dest, src, src);
      else
	tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
				   GEN_INT (1 + 4), GEN_INT (1 + 4));
      break;
    case E_V2DFmode:
      tem = gen_vec_interleave_highv2df (dest, src, src);
      break;
    case E_V16QImode:
    case E_V8HImode:
    case E_V4SImode:
    case E_V2DImode:
      d = gen_reg_rtx (V1TImode);
      tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
				GEN_INT (i / 2));
      break;
    case E_V8SFmode:
      if (i == 256)
	tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
      else
	tem = gen_avx_shufps256 (dest, src, src,
				 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
      break;
    case E_V4DFmode:
      if (i == 256)
	tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
      else
	tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
      break;
    case E_V32QImode:
    case E_V16HImode:
    case E_V8SImode:
    case E_V4DImode:
      if (i == 256)
	{
	  if (GET_MODE (dest) != V4DImode)
	    d = gen_reg_rtx (V4DImode);
	  tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
				   gen_lowpart (V4DImode, src),
				   const1_rtx);
	}
      else
	{
	  d = gen_reg_rtx (V2TImode);
	  tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
				    GEN_INT (i / 2));
	}
      break;
    case E_V64QImode:
    case E_V32HImode:
    case E_V16SImode:
    case E_V16SFmode:
    case E_V8DImode:
    case E_V8DFmode:
      if (i > 128)
	tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
					gen_lowpart (V16SImode, src),
					gen_lowpart (V16SImode, src),
					GEN_INT (0x4 + (i == 512 ? 4 : 0)),
					GEN_INT (0x5 + (i == 512 ? 4 : 0)),
					GEN_INT (0x6 + (i == 512 ? 4 : 0)),
					GEN_INT (0x7 + (i == 512 ? 4 : 0)),
					GEN_INT (0xC), GEN_INT (0xD),
					GEN_INT (0xE), GEN_INT (0xF),
					GEN_INT (0x10), GEN_INT (0x11),
					GEN_INT (0x12), GEN_INT (0x13),
					GEN_INT (0x14), GEN_INT (0x15),
					GEN_INT (0x16), GEN_INT (0x17));
      else
	tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
				    gen_lowpart (V16SImode, src),
				    GEN_INT (i == 128 ? 0x2 : 0x1),
				    GEN_INT (i == 128 ? 0x3 : 0x3),
				    GEN_INT (i == 128 ? 0x6 : 0x5),
				    GEN_INT (i == 128 ? 0x7 : 0x7),
				    GEN_INT (i == 128 ? 0xA : 0x9),
				    GEN_INT (i == 128 ? 0xB : 0xB),
				    GEN_INT (i == 128 ? 0xE : 0xD),
				    GEN_INT (i == 128 ? 0xF : 0xF));
      break;
    default:
      gcc_unreachable ();
    }
  emit_insn (tem);
  if (d != dest)
    emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
}
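
/* Illustrative note: for a V4SFmode SRC {a,b,c,d}, emit_reduc_half with
   i == 128 yields {c,d,?,?} (movhlps) and with i == 64 yields {b,b,?,?}
   (shufps selecting index 1), i.e. each call moves the upper i/2 bits of
   the vector down to bit 0, with the rest of DEST unspecified.  */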
/* Expand a vector reduction.  FN is the binary pattern to reduce;
   DEST is the destination; IN is the input vector.  */

void
ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
{
  rtx half, dst, vec = in;
  machine_mode mode = GET_MODE (in);
  int i;

  /* SSE4 has a special instruction for V8HImode UMIN reduction.  */
  if (TARGET_SSE4_1
      && mode == V8HImode
      && fn == gen_uminv8hi3)
    {
      emit_insn (gen_sse4_1_phminposuw (dest, in));
      return;
    }

  for (i = GET_MODE_BITSIZE (mode);
       i > GET_MODE_UNIT_BITSIZE (mode);
       i >>= 1)
    {
      half = gen_reg_rtx (mode);
      emit_reduc_half (half, vec, i);
      if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
	dst = dest;
      else
	dst = gen_reg_rtx (mode);
      emit_insn (fn (dst, half, vec));
      vec = dst;
    }
}
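
/* Illustrative note: the loop above is a log2(nelts) reduction ladder.
   For a MAX reduction over V4SImode {a,b,c,d} it emits
     half = {c,d,?,?};      t1  = max (half, vec)  ->  {max(a,c), max(b,d), ...}
     half = {t1[1],?,?,?};  dst = max (half, t1)   ->  element 0 holds the max
   after which the caller extracts element 0 of DEST.  */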
/* Output code to perform a conditional jump to LABEL, if C2 flag in
   FP status register is set.  */

static void
ix86_emit_fp_unordered_jump (rtx label)
{
  rtx reg = gen_reg_rtx (HImode);
  rtx_insn *insn;
  rtx temp;

  emit_insn (gen_x86_fnstsw_1 (reg));

  if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
    {
      emit_insn (gen_x86_sahf_1 (reg));

      temp = gen_rtx_REG (CCmode, FLAGS_REG);
      temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
    }
  else
    {
      emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));

      temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
      temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
    }

  temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
			       gen_rtx_LABEL_REF (VOIDmode, label),
			       pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
  predict_jump (REG_BR_PROB_BASE * 10 / 100);
  JUMP_LABEL (insn) = label;
}
/* Output code to perform a sinh XFmode calculation.  */

void ix86_emit_i387_sinh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx cst1, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = expm1 (|op1|) */
  emit_insn (gen_absxf2 (e2, op1));
  emit_insn (gen_expm1xf2 (e1, e2));

  /* e2 = e1 / (e1 + 1.0) + e1 */
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_addxf3 (e2, e1, cst1));
  emit_insn (gen_divxf3 (e2, e1, e2));
  emit_insn (gen_addxf3 (e2, e2, e1));

  /* flags = signbit (op1) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (flags) then e2 = -e2 */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  /* op0 = 0.5 * e2 */
  half = force_reg (XFmode, half);
  emit_insn (gen_mulxf3 (op0, e2, half));
}
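
/* For reference, the identity behind the sequence above: with
   e1 = expm1 (|x|) = e^|x| - 1,
     e1 + e1 / (e1 + 1.0) = (e^|x| - 1) + (1 - e^-|x|) = e^|x| - e^-|x|,
   so 0.5 * e2 = sinh (|x|); the fxam-driven sign flip then restores
   sinh (x) = -sinh (-x).  Using expm1 avoids the catastrophic
   cancellation that e^|x| - e^-|x| would suffer for small |x|.  */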
/* Output code to perform a cosh XFmode calculation.  */

void ix86_emit_i387_cosh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx cst1;

  /* e1 = exp (op1) */
  emit_insn (gen_expxf2 (e1, op1));

  /* e2 = e1 + 1.0 / e1 */
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_divxf3 (e2, cst1, e1));
  emit_insn (gen_addxf3 (e2, e1, e2));

  /* op0 = 0.5 * e2 */
  half = force_reg (XFmode, half);
  emit_insn (gen_mulxf3 (op0, e2, half));
}
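
/* For reference: with e1 = e^x, the sequence computes
   0.5 * (e1 + 1.0 / e1) = (e^x + e^-x) / 2 = cosh (x); no sign fixup is
   needed because cosh is an even function.  */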
/* Output code to perform a tanh XFmode calculation.  */

void ix86_emit_i387_tanh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx cst2, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = expm1 (-|2 * op1|) */
  emit_insn (gen_addxf3 (e2, op1, op1));
  emit_insn (gen_absxf2 (e2, e2));
  emit_insn (gen_negxf2 (e2, e2));
  emit_insn (gen_expm1xf2 (e1, e2));

  /* e2 = e1 / (e1 + 2.0) */
  cst2 = force_reg (XFmode, CONST2_RTX (XFmode));
  emit_insn (gen_addxf3 (e2, e1, cst2));
  emit_insn (gen_divxf3 (e2, e1, e2));

  /* flags = signbit (op1) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (!flags) then e2 = -e2 */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_NE (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  emit_move_insn (op0, e2);
}
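
/* For reference: with a = |x| and e1 = expm1 (-2a) = e^-2a - 1,
     e1 / (e1 + 2.0) = (e^-2a - 1) / (e^-2a + 1) = -tanh (a),
   so the conditional negation driven by fxam's sign bit yields tanh (x)
   for either sign of x.  */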
/* Output code to perform an asinh XFmode calculation.  */

void ix86_emit_i387_asinh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx cst1, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* e2 = sqrt (op1^2 + 1.0) + 1.0 */
  emit_insn (gen_mulxf3 (e1, op1, op1));
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_addxf3 (e2, e1, cst1));
  emit_insn (gen_sqrtxf2 (e2, e2));
  emit_insn (gen_addxf3 (e2, e2, cst1));

  /* e1 = e1 / e2 */
  emit_insn (gen_divxf3 (e1, e1, e2));

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = e1 + |op1| */
  emit_insn (gen_absxf2 (e2, op1));
  emit_insn (gen_addxf3 (e1, e1, e2));

  /* e2 = log1p (e1) */
  ix86_emit_i387_log1p (e2, e1);

  /* flags = signbit (op1) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (flags) then e2 = -e2 */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  emit_move_insn (op0, e2);
}
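
/* For reference: the code rewrites
     asinh (|x|) = log (|x| + sqrt (x*x + 1))
   as log1p (t) with
     t = x*x / (sqrt (x*x + 1) + 1) + |x| = sqrt (x*x + 1) - 1 + |x|,
   which stays accurate for small |x|; the sign is then restored from
   fxam since asinh is an odd function.  */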
/* Output code to perform an acosh XFmode calculation.  */

void ix86_emit_i387_acosh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode));

  /* e2 = sqrt (op1 + 1.0) */
  emit_insn (gen_addxf3 (e2, op1, cst1));
  emit_insn (gen_sqrtxf2 (e2, e2));

  /* e1 = sqrt (op1 - 1.0) */
  emit_insn (gen_subxf3 (e1, op1, cst1));
  emit_insn (gen_sqrtxf2 (e1, e1));

  /* e1 = e1 * e2 */
  emit_insn (gen_mulxf3 (e1, e1, e2));

  /* e1 = e1 + op1 */
  emit_insn (gen_addxf3 (e1, e1, op1));

  /* op0 = log (e1) */
  emit_insn (gen_logxf2 (op0, e1));
}
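
/* For reference: this is the textbook form
     acosh (x) = log (x + sqrt (x - 1) * sqrt (x + 1))
	       = log (x + sqrt (x*x - 1)),  x >= 1,
   with the product of two square roots used instead of sqrt (x*x - 1)
   so that x*x cannot overflow for large x.  */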
/* Output code to perform an atanh XFmode calculation.  */

void ix86_emit_i387_atanh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx cst1, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e2 = |op1| */
  emit_insn (gen_absxf2 (e2, op1));

  /* e1 = -(e2 + e2) / (e2 + 1.0) */
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_addxf3 (e1, e2, cst1));
  emit_insn (gen_addxf3 (e2, e2, e2));
  emit_insn (gen_negxf2 (e2, e2));
  emit_insn (gen_divxf3 (e1, e2, e1));

  /* e2 = log1p (e1) */
  ix86_emit_i387_log1p (e2, e1);

  /* flags = signbit (op1) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (!flags) then e2 = -e2 */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_NE (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  /* op0 = 0.5 * e2 */
  half = force_reg (XFmode, half);
  emit_insn (gen_mulxf3 (op0, e2, half));
}
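
/* For reference: with a = |x|,
     e1 = -2a / (a + 1.0)  and  log1p (e1) = log ((1 - a) / (1 + a))
	= -2 * atanh (a),
   so after the sign-dependent negation, 0.5 * e2 = atanh (x).  */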
/* Output code to perform a log1p XFmode calculation.  */

void ix86_emit_i387_log1p (rtx op0, rtx op1)
{
  rtx_code_label *label1 = gen_label_rtx ();
  rtx_code_label *label2 = gen_label_rtx ();

  rtx tmp = gen_reg_rtx (XFmode);
  rtx res = gen_reg_rtx (XFmode);
  rtx cst, cstln2, cst1;
  rtx_insn *insn;

  cst = const_double_from_real_value
    (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode);
  cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */

  emit_insn (gen_absxf2 (tmp, op1));

  cst = force_reg (XFmode, cst);
  ix86_expand_branch (GE, tmp, cst, label1);
  predict_jump (REG_BR_PROB_BASE * 10 / 100);
  insn = get_last_insn ();
  JUMP_LABEL (insn) = label1;

  emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2));
  emit_jump (label2);

  emit_label (label1);
  LABEL_NUSES (label1) = 1;

  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1)));
  emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2));

  emit_label (label2);
  LABEL_NUSES (label2) = 1;

  emit_move_insn (op0, res);
}
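
/* For reference: fyl2xp1 is only specified for |x| < 1 - sqrt(2)/2, and
   the branch constant 0.29289321881345247561810596348408353 above is
   exactly 1 - sqrt(2)/2.  Above that magnitude the code falls back to
   fyl2x (x + 1.0), where the explicit addition no longer loses
   significant bits.  */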
/* Emit code for round calculation.  */
void ix86_emit_i387_round (rtx op0, rtx op1)
{
  machine_mode inmode = GET_MODE (op1);
  machine_mode outmode = GET_MODE (op0);
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx res = gen_reg_rtx (outmode);
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx (*floor_insn) (rtx, rtx);
  rtx (*neg_insn) (rtx, rtx);
  rtx_insn *insn;
  rtx tmp;

  switch (inmode)
    {
    case E_SFmode:
    case E_DFmode:
      tmp = gen_reg_rtx (XFmode);

      emit_insn (gen_rtx_SET (tmp, gen_rtx_FLOAT_EXTEND (XFmode, op1)));
      op1 = tmp;
      break;
    case E_XFmode:
      break;
    default:
      gcc_unreachable ();
    }

  switch (outmode)
    {
    case E_SFmode:
      floor_insn = gen_frndintxf2_floor;
      neg_insn = gen_negsf2;
      break;
    case E_DFmode:
      floor_insn = gen_frndintxf2_floor;
      neg_insn = gen_negdf2;
      break;
    case E_XFmode:
      floor_insn = gen_frndintxf2_floor;
      neg_insn = gen_negxf2;
      break;
    case E_HImode:
      floor_insn = gen_lfloorxfhi2;
      neg_insn = gen_neghi2;
      break;
    case E_SImode:
      floor_insn = gen_lfloorxfsi2;
      neg_insn = gen_negsi2;
      break;
    case E_DImode:
      floor_insn = gen_lfloorxfdi2;
      neg_insn = gen_negdi2;
      break;
    default:
      gcc_unreachable ();
    }

  /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */

  /* scratch = fxam(op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = fabs(op1) */
  emit_insn (gen_absxf2 (e1, op1));

  /* e2 = e1 + 0.5 */
  half = force_reg (XFmode, half);
  emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (XFmode, e1, half)));

  /* res = floor(e2) */
  if (outmode == SFmode || outmode == DFmode)
    {
      tmp = gen_reg_rtx (XFmode);

      emit_insn (floor_insn (tmp, e2));
      emit_insn (gen_rtx_SET (res,
			      gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp),
					      UNSPEC_TRUNC_NOOP)));
    }
  else
    emit_insn (floor_insn (res, e2));

  /* flags = signbit(a) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (flags) then res = -res */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (neg_insn (res, res));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  emit_move_insn (op0, res);
}
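
/* For reference: the sequence implements
     round (a) = sgn (a) * floor (|a| + 0.5),
   i.e. halfway cases round away from zero, matching the C round ()
   semantics rather than the current i387 rounding mode.  */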
/* Output code to perform a Newton-Raphson approximation of a single precision
   floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm].  */

void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
{
  rtx x0, x1, e0, e1;

  x0 = gen_reg_rtx (mode);
  e0 = gen_reg_rtx (mode);
  e1 = gen_reg_rtx (mode);
  x1 = gen_reg_rtx (mode);

  /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */

  b = force_reg (mode, b);

  /* x0 = rcp(b) estimate */
  if (mode == V16SFmode || mode == V8DFmode)
    {
      if (TARGET_AVX512ER)
	{
	  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
						      UNSPEC_RCP28)));
	  /* res = a * x0 */
	  emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
	  return;
	}
      else
	emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
						    UNSPEC_RCP14)));
    }
  else
    emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
						UNSPEC_RCP)));

  /* e0 = x0 * b */
  emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));

  /* e0 = x0 * e0 */
  emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));

  /* e1 = x0 + x0 */
  emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));

  /* x1 = e1 - e0 */
  emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));

  /* res = a * x1 */
  emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
}
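
/* For reference: the tail of the function is one Newton-Raphson step for
   the reciprocal.  With x0 = rcp (b) an ~12-bit estimate, iterating
   f (x) = 1/x - b gives
     x1 = x0 * (2 - b * x0) = 2 * x0 - b * x0 * x0,
   which roughly doubles the number of correct bits; a / b is then
   approximated by a * x1.  */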
/* Output code to perform a Newton-Raphson approximation of a
   single precision floating point [reciprocal] square root.  */

void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
{
  rtx x0, e0, e1, e2, e3, mthree, mhalf;
  REAL_VALUE_TYPE r;
  int unspec;

  x0 = gen_reg_rtx (mode);
  e0 = gen_reg_rtx (mode);
  e1 = gen_reg_rtx (mode);
  e2 = gen_reg_rtx (mode);
  e3 = gen_reg_rtx (mode);

  if (TARGET_AVX512ER && mode == V16SFmode)
    {
      if (recip)
	/* res = rsqrt28(a) estimate */
	emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
						     UNSPEC_RSQRT28)));
      else
	{
	  /* x0 = rsqrt28(a) estimate */
	  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
						      UNSPEC_RSQRT28)));
	  /* res = rcp28(x0) estimate */
	  emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
						       UNSPEC_RCP28)));
	}
      return;
    }

  real_from_integer (&r, VOIDmode, -3, SIGNED);
  mthree = const_double_from_real_value (r, SFmode);

  real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
  mhalf = const_double_from_real_value (r, SFmode);
  unspec = UNSPEC_RSQRT;

  if (VECTOR_MODE_P (mode))
    {
      mthree = ix86_build_const_vector (mode, true, mthree);
      mhalf = ix86_build_const_vector (mode, true, mhalf);
      /* There is no 512-bit rsqrt.  There is however rsqrt14.  */
      if (GET_MODE_SIZE (mode) == 64)
	unspec = UNSPEC_RSQRT14;
    }

  /* sqrt(a)  = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
     rsqrt(a) = -0.5     * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */

  a = force_reg (mode, a);

  /* x0 = rsqrt(a) estimate */
  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
					      unspec)));

  /* If (a == 0.0) Filter out infinity to prevent NaN for sqrt(0.0).  */
  if (!recip)
    {
      rtx zero = force_reg (mode, CONST0_RTX (mode));
      rtx mask;

      /* Handle masked compare.  */
      if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
	{
	  mask = gen_reg_rtx (HImode);
	  /* Imm value 0x4 corresponds to not-equal comparison.  */
	  emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
	  emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
	}
      else
	{
	  mask = gen_reg_rtx (mode);
	  emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
	  emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
	}
    }

  /* e0 = x0 * a */
  emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
  /* e1 = e0 * x0 */
  emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));

  /* e2 = e1 - 3. */
  mthree = force_reg (mode, mthree);
  emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));

  mhalf = force_reg (mode, mhalf);
  if (recip)
    /* e3 = -.5 * x0 */
    emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
  else
    /* e3 = -.5 * e0 */
    emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
  /* ret = e2 * e3 */
  emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
}
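
/* For reference: the tail above is one Newton-Raphson step for the
   reciprocal square root.  With x0 = rsqrt (a),
     x1 = 0.5 * x0 * (3 - a * x0 * x0) = -0.5 * x0 * (a * x0 * x0 - 3),
   and sqrt (a) is obtained as a * x1 by folding the extra factor a into
   e0 = a * x0 before the final multiply.  */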
/* Expand fabs (OP0) and return a new rtx that holds the result.  The
   mask for masking out the sign-bit is stored in *SMASK, if that is
   non-null.  */

static rtx
ix86_expand_sse_fabs (rtx op0, rtx *smask)
{
  machine_mode vmode, mode = GET_MODE (op0);
  rtx xa, mask;

  xa = gen_reg_rtx (mode);
  if (mode == SFmode)
    vmode = V4SFmode;
  else if (mode == DFmode)
    vmode = V2DFmode;
  else
    vmode = mode;
  mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
  if (!VECTOR_MODE_P (mode))
    {
      /* We need to generate a scalar mode mask in this case.  */
      rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
      tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
      mask = gen_reg_rtx (mode);
      emit_insn (gen_rtx_SET (mask, tmp));
    }
  emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));

  if (smask)
    *smask = mask;

  return xa;
}
/* Expands a comparison of OP0 with OP1 using comparison code CODE,
   swapping the operands if SWAP_OPERANDS is true.  The expanded
   code is a forward jump to a newly created label in case the
   comparison is true.  The generated label rtx is returned.  */
static rtx_code_label *
ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
				  bool swap_operands)
{
  bool unordered_compare = ix86_unordered_fp_compare (code);
  rtx_code_label *label;
  rtx tmp, reg;

  if (swap_operands)
    std::swap (op0, op1);

  label = gen_label_rtx ();
  tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
  if (unordered_compare)
    tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
  reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
  emit_insn (gen_rtx_SET (reg, tmp));
  tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
			      gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
  tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  JUMP_LABEL (tmp) = label;

  return label;
}
/* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
   using comparison code CODE.  Operands are swapped for the comparison if
   SWAP_OPERANDS is true.  Returns a rtx for the generated mask.  */

static rtx
ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
			      bool swap_operands)
{
  rtx (*insn)(rtx, rtx, rtx, rtx);
  machine_mode mode = GET_MODE (op0);
  rtx mask = gen_reg_rtx (mode);

  if (swap_operands)
    std::swap (op0, op1);

  insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;

  emit_insn (insn (mask, op0, op1,
		   gen_rtx_fmt_ee (code, mode, op0, op1)));
  return mask;
}
/* Expand copysign from SIGN to the positive value ABS_VALUE
   storing in RESULT.  If MASK is non-null, it shall be a mask to mask out
   the sign-bit.  */

static void
ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
{
  machine_mode mode = GET_MODE (sign);
  rtx sgn = gen_reg_rtx (mode);
  if (mask == NULL_RTX)
    {
      machine_mode vmode;

      if (mode == SFmode)
	vmode = V4SFmode;
      else if (mode == DFmode)
	vmode = V2DFmode;
      else
	vmode = mode;

      mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
      if (!VECTOR_MODE_P (mode))
	{
	  /* We need to generate a scalar mode mask in this case.  */
	  rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
	  tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
	  mask = gen_reg_rtx (mode);
	  emit_insn (gen_rtx_SET (mask, tmp));
	}
    }
  else
    mask = gen_rtx_NOT (mode, mask);
  emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
  emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
}
/* Expand SSE sequence for computing lround from OP1 storing
   into OP0.  */

void
ix86_expand_lround (rtx op0, rtx op1)
{
  /* C code for the stuff we're doing below:
	tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
	op0 = (long) tmp;
   */
  machine_mode mode = GET_MODE (op1);
  const struct real_format *fmt;
  REAL_VALUE_TYPE pred_half, half_minus_pred_half;
  rtx adj;

  /* load nextafter (0.5, 0.0) */
  fmt = REAL_MODE_FORMAT (mode);
  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
  real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);

  /* adj = copysign (0.5, op1) */
  adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
  ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);

  /* adj = op1 + adj */
  adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);

  /* op0 = (imode)adj */
  expand_fix (op0, adj, 0);
}
/* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
   into OPERAND0.  */

void
ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
{
  /* C code for the stuff we're doing below (for do_floor):
	xi = (long)op1;
	xi -= (double)xi > op1 ? 1 : 0;
	return xi;
   */
  machine_mode fmode = GET_MODE (op1);
  machine_mode imode = GET_MODE (op0);
  rtx ireg, freg, tmp;
  rtx_code_label *label;

  /* reg = (long)op1 */
  ireg = gen_reg_rtx (imode);
  expand_fix (ireg, op1, 0);

  /* freg = (double)reg */
  freg = gen_reg_rtx (fmode);
  expand_float (freg, ireg, 0);

  /* ireg = (freg > op1) ? ireg - 1 : ireg */
  label = ix86_expand_sse_compare_and_jump (UNLE,
					    freg, op1, !do_floor);
  tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
			     ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
  emit_move_insn (ireg, tmp);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (op0, ireg);
}
/* Generate and return a rtx of mode MODE for 2**n where n is the number
   of bits of the mantissa of MODE, which must be one of DFmode or SFmode.  */

static rtx
ix86_gen_TWO52 (machine_mode mode)
{
  REAL_VALUE_TYPE TWO52r;
  rtx TWO52;

  real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
  TWO52 = const_double_from_real_value (TWO52r, mode);
  TWO52 = force_reg (mode, TWO52);

  return TWO52;
}
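
/* Why 2**52 (2**23 for SFmode): every IEEE double with magnitude of at
   least 2**52 is already an integer, and for 0 <= x < 2**52 the sum
   x + 2**52 pushes all fraction bits out of the significand, so the FPU
   rounds to an integer for free.  A minimal sketch of the idiom the
   expanders below build on (assumes binary64 and round-to-nearest-even):

     double rint_ref (double x)             // valid for 0 <= x < 2**52
     {
       double two52 = 4503599627370496.0;   // 2**52
       return (x + two52) - two52;
     }
*/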
/* Expand rint rounding OPERAND1 and storing the result in OPERAND0.  */
void
ix86_expand_rint (rtx operand0, rtx operand1)
{
  /* C code for the stuff we're doing below:
        xa = fabs (operand1);
        if (!isless (xa, 2**52))
          return operand1;
        two52 = 2**52;
        if (flag_rounding_math)
          {
            two52 = copysign (two52, operand1);
            xa = operand1;
          }
        xa = xa + two52 - two52;
        return copysign (xa, operand1);
   */
  machine_mode mode = GET_MODE (operand0);
  rtx res, xa, TWO52, two52, mask;
  rtx_code_label *label;

  res = gen_reg_rtx (mode);
  emit_move_insn (res, operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  TWO52 = ix86_gen_TWO52 (mode);
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  two52 = TWO52;
  if (flag_rounding_math)
    {
      two52 = gen_reg_rtx (mode);
      ix86_sse_copysign_to_positive (two52, TWO52, res, mask);
      xa = res;
    }

  xa = expand_simple_binop (mode, PLUS, xa, two52, NULL_RTX, 0, OPTAB_DIRECT);
  xa = expand_simple_binop (mode, MINUS, xa, two52, xa, 0, OPTAB_DIRECT);

  ix86_sse_copysign_to_positive (res, xa, res, mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
/* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
   into OPERAND0.  Sequence that works without relying on DImode truncation
   via cvttsd2siq that is only available on 64bit targets.  */
void
ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
{
  /* C code for the stuff we expand below.
        double xa = fabs (x), x2;
        if (!isless (xa, TWO52))
          return x;
        xa = xa + TWO52 - TWO52;
        x2 = copysign (xa, x);
     Compensate.  Floor:
        if (x2 > x)
          x2 -= 1;
     Compensate.  Ceil:
        if (x2 < x)
          x2 += 1;
        if (HONOR_SIGNED_ZEROS (mode))
          x2 = copysign (x2, x);
        return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, TWO52, tmp, one, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = gen_reg_rtx (mode);
  emit_move_insn (res, operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa = xa + TWO52 - TWO52; */
  xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
  xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);

  /* xa = copysign (xa, operand1) */
  ix86_sse_copysign_to_positive (xa, xa, res, mask);

  /* generate 1.0 */
  one = force_reg (mode, const_double_from_real_value (dconst1, mode));

  /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
  tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
  tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
                             xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
  if (!do_floor && HONOR_SIGNED_ZEROS (mode))
    ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
  emit_move_insn (res, tmp);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
/* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
   into OPERAND0.  */
void
ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
{
  /* C code for the stuff we expand below.
        double xa = fabs (x), x2;
        if (!isless (xa, TWO52))
          return x;
        x2 = (double)(long)x;
     Compensate.  Floor:
        if (x2 > x)
          x2 -= 1;
     Compensate.  Ceil:
        if (x2 < x)
          x2 += 1;
        if (HONOR_SIGNED_ZEROS (mode))
          return copysign (x2, x);
        return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, xi, TWO52, tmp, one, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = gen_reg_rtx (mode);
  emit_move_insn (res, operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa = (double)(long)x */
  xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
  expand_fix (xi, res, 0);
  expand_float (xa, xi, 0);

  /* generate 1.0 */
  one = force_reg (mode, const_double_from_real_value (dconst1, mode));

  /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
  tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
  tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
                             xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
  emit_move_insn (res, tmp);

  if (HONOR_SIGNED_ZEROS (mode))
    ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
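
/* The compensation step above in scalar form (illustrative sketch for
   the do_floor case):

     double floor_ref (double x)            // |x| < 2**52
     {
       double x2 = (double) (long) x;       // truncate via cvttsd2si
       return x2 - (x2 > x ? 1.0 : 0.0);    // fix up negative non-integers
     }

   The comparison produces an all-ones mask when x2 > x, so ANDing the
   mask with the constant 1.0 yields the 0.0/1.0 adjustment without a
   branch.  */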
/* Expand SSE sequence for computing round from OPERAND1 storing
   into OPERAND0.  Sequence that works without relying on DImode truncation
   via cvttsd2siq that is only available on 64bit targets.  */
void
ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
{
  /* C code for the stuff we expand below.
        double xa = fabs (x), xa2, x2;
        if (!isless (xa, TWO52))
          return x;
     Using the absolute value and copying back sign makes
     -0.0 -> -0.0 correct.
        xa2 = xa + TWO52 - TWO52;
     Compensate.
        dxa = xa2 - xa;
        if (dxa <= -0.5)
          xa2 += 1;
        else if (dxa > 0.5)
          xa2 -= 1;
        x2 = copysign (xa2, x);
        return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = gen_reg_rtx (mode);
  emit_move_insn (res, operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa2 = xa + TWO52 - TWO52; */
  xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
  xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);

  /* dxa = xa2 - xa; */
  dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);

  /* generate 0.5, 1.0 and -0.5 */
  half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
  one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
  mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
                               0, OPTAB_DIRECT);

  /* Compensate.  */
  /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
  tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
  xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
  /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
  tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
  xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);

  /* res = copysign (xa2, operand1) */
  ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
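
/* Worked example for the compensation above: for x = 2.5 we get
   xa2 = (2.5 + 2**52) - 2**52 = 2.0 under round-to-nearest-even, so
   dxa = -0.5 and the "dxa <= -0.5" mask adds 1.0 back, giving
   round (2.5) = 3.0 with halfway cases rounded away from zero.  For
   x = 2.25, xa2 = 2.0 and dxa = -0.25, so neither mask fires and the
   result stays 2.0.  */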
/* Expand SSE sequence for computing trunc from OPERAND1 storing
   into OPERAND0.  */
void
ix86_expand_trunc (rtx operand0, rtx operand1)
{
  /* C code for SSE variant we expand below.
        double xa = fabs (x), x2;
        if (!isless (xa, TWO52))
          return x;
        x2 = (double)(long)x;
        if (HONOR_SIGNED_ZEROS (mode))
          return copysign (x2, x);
        return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, xi, TWO52, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = gen_reg_rtx (mode);
  emit_move_insn (res, operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* x = (double)(long)x */
  xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
  expand_fix (xi, res, 0);
  expand_float (res, xi, 0);

  if (HONOR_SIGNED_ZEROS (mode))
    ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
/* Expand SSE sequence for computing trunc from OPERAND1 storing
   into OPERAND0.  Sequence that works without relying on DImode truncation
   via cvttsd2siq that is only available on 64bit targets.  */
void
ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
{
  machine_mode mode = GET_MODE (operand0);
  rtx xa, mask, TWO52, one, res, smask, tmp;
  rtx_code_label *label;

  /* C code for SSE variant we expand below.
        double xa = fabs (x), x2;
        if (!isless (xa, TWO52))
          return x;
        xa2 = xa + TWO52 - TWO52;
     Compensate:
        if (xa2 > xa)
          xa2 -= 1.0;
        x2 = copysign (xa2, x);
        return x2;
   */

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = gen_reg_rtx (mode);
  emit_move_insn (res, operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &smask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* res = xa + TWO52 - TWO52; */
  tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
  tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
  emit_move_insn (res, tmp);

  /* generate 1.0 */
  one = force_reg (mode, const_double_from_real_value (dconst1, mode));

  /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
  mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
  emit_insn (gen_rtx_SET (mask, gen_rtx_AND (mode, mask, one)));
  tmp = expand_simple_binop (mode, MINUS,
                             res, mask, NULL_RTX, 0, OPTAB_DIRECT);
  emit_move_insn (res, tmp);

  /* res = copysign (res, operand1) */
  ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
/* Expand SSE sequence for computing round from OPERAND1 storing
   into OPERAND0.  */
void
ix86_expand_round (rtx operand0, rtx operand1)
{
  /* C code for the stuff we're doing below:
        double xa = fabs (x);
        if (!isless (xa, TWO52))
          return x;
        xa = (double)(long)(xa + nextafter (0.5, 0.0));
        return copysign (xa, x);
   */
  machine_mode mode = GET_MODE (operand0);
  rtx res, TWO52, xa, xi, half, mask;
  rtx_code_label *label;
  const struct real_format *fmt;
  REAL_VALUE_TYPE pred_half, half_minus_pred_half;

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = gen_reg_rtx (mode);
  emit_move_insn (res, operand1);

  TWO52 = ix86_gen_TWO52 (mode);
  xa = ix86_expand_sse_fabs (res, &mask);
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* load nextafter (0.5, 0.0) */
  fmt = REAL_MODE_FORMAT (mode);
  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
  real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);

  /* xa = xa + 0.5 */
  half = force_reg (mode, const_double_from_real_value (pred_half, mode));
  xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);

  /* xa = (double)(int64_t)xa */
  xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
  expand_fix (xi, xa, 0);
  expand_float (xa, xi, 0);

  /* res = copysign (xa, operand1) */
  ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
/* Expand SSE sequence for computing round
   from OP1 storing into OP0 using sse4 round insn.  */
void
ix86_expand_round_sse4 (rtx op0, rtx op1)
{
  machine_mode mode = GET_MODE (op0);
  rtx e1, e2, res, half;
  const struct real_format *fmt;
  REAL_VALUE_TYPE pred_half, half_minus_pred_half;
  rtx (*gen_copysign) (rtx, rtx, rtx);
  rtx (*gen_round) (rtx, rtx, rtx);

  switch (mode)
    {
    case E_SFmode:
      gen_copysign = gen_copysignsf3;
      gen_round = gen_sse4_1_roundsf2;
      break;
    case E_DFmode:
      gen_copysign = gen_copysigndf3;
      gen_round = gen_sse4_1_rounddf2;
      break;
    default:
      gcc_unreachable ();
    }

  /* round (a) = trunc (a + copysign (0.5, a)) */

  /* load nextafter (0.5, 0.0) */
  fmt = REAL_MODE_FORMAT (mode);
  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
  real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
  half = const_double_from_real_value (pred_half, mode);

  /* e1 = copysign (0.5, op1) */
  e1 = gen_reg_rtx (mode);
  emit_insn (gen_copysign (e1, half, op1));

  /* e2 = op1 + e1 */
  e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);

  /* res = trunc (e2) */
  res = gen_reg_rtx (mode);
  emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));

  emit_move_insn (op0, res);
}
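
/* The same computation in intrinsics form (illustrative sketch assuming
   SSE4.1; sign_mask and pred_half are preloaded constants and the names
   are hypothetical -- _mm_round_sd with _MM_FROUND_TO_ZERO corresponds
   to the ROUND_TRUNC immediate used above):

     __m128d round_ref (__m128d x, __m128d sign_mask, __m128d pred_half)
     {
       __m128d e1 = _mm_or_pd (_mm_and_pd (sign_mask, x), pred_half);
       __m128d e2 = _mm_add_sd (x, e1);             // x + copysign (.5-, x)
       return _mm_round_sd (e2, e2, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
     }

   The copysign works because pred_half is positive, so OR-ing in the
   sign bit of x is enough.  */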
/* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
   insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
   insn every time.  */

static GTY(()) rtx_insn *vselect_insn;

/* Initialize vselect_insn.  */

static void
init_vselect_insn (void)
{
  unsigned i;
  rtx x;

  x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
  for (i = 0; i < MAX_VECT_LEN; ++i)
    XVECEXP (x, 0, i) = const0_rtx;
  x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
                                                        const0_rtx), x);
  x = gen_rtx_SET (const0_rtx, x);
  start_sequence ();
  vselect_insn = emit_insn (x);
  end_sequence ();
}
/* Construct (set target (vec_select op0 (parallel perm))) and
   return true if that's a valid instruction in the active ISA.  */

static bool
expand_vselect (rtx target, rtx op0, const unsigned char *perm,
                unsigned nelt, bool testing_p)
{
  unsigned int i;
  rtx x, save_vconcat;
  int icode;

  if (vselect_insn == NULL_RTX)
    init_vselect_insn ();

  x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
  PUT_NUM_ELEM (XVEC (x, 0), nelt);
  for (i = 0; i < nelt; ++i)
    XVECEXP (x, 0, i) = GEN_INT (perm[i]);
  save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
  XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
  PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
  SET_DEST (PATTERN (vselect_insn)) = target;
  icode = recog_memoized (vselect_insn);

  if (icode >= 0 && !testing_p)
    emit_insn (copy_rtx (PATTERN (vselect_insn)));

  SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
  XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
  INSN_CODE (vselect_insn) = -1;

  return icode >= 0;
}
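
/* For example, with TARGET and OP0 of mode V4SF and PERM = {2, 3, 0, 1},
   the recycled insn is recognized against

     (set (reg:V4SF target)
          (vec_select:V4SF (reg:V4SF op0)
                           (parallel [(const_int 2) (const_int 3)
                                      (const_int 0) (const_int 1)])))

   which matches the single-operand shuffle patterns in sse.md.  */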
/* Similar, but generate a vec_concat from op0 and op1 as well.  */

static bool
expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
                        const unsigned char *perm, unsigned nelt,
                        bool testing_p)
{
  machine_mode v2mode;
  rtx x;
  bool ok;

  if (vselect_insn == NULL_RTX)
    init_vselect_insn ();

  if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
    return false;
  x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
  PUT_MODE (x, v2mode);
  XEXP (x, 0) = op0;
  XEXP (x, 1) = op1;
  ok = expand_vselect (target, x, perm, nelt, testing_p);
  XEXP (x, 0) = const0_rtx;
  XEXP (x, 1) = const0_rtx;
  return ok;
}
/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
   using movss or movsd.  */

static bool
expand_vec_perm_movs (struct expand_vec_perm_d *d)
{
  machine_mode vmode = d->vmode;
  unsigned i, nelt = d->nelt;
  rtx x;

  if (d->one_operand_p)
    return false;

  if (!(TARGET_SSE && vmode == V4SFmode)
      && !(TARGET_SSE2 && vmode == V2DFmode))
    return false;

  /* Only the first element is changed.  */
  if (d->perm[0] != nelt && d->perm[0] != 0)
    return false;
  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != i + nelt - d->perm[0])
      return false;

  if (d->testing_p)
    return true;

  if (d->perm[0] == nelt)
    x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
  else
    x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));

  emit_insn (gen_rtx_SET (d->target, x));

  return true;
}
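
/* For example, in V2DFmode the permutation {2, 1} takes element 0 from
   op1 and element 1 from op0; the VEC_MERGE with mask 1 built above is
   exactly the form the movsd pattern matches, replacing only the low
   element.  */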
/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
   in terms of blendp[sd] / pblendw / pblendvb / vpblendd.  */

static bool
expand_vec_perm_blend (struct expand_vec_perm_d *d)
{
  machine_mode mmode, vmode = d->vmode;
  unsigned i, mask, nelt = d->nelt;
  rtx target, op0, op1, maskop, x;
  rtx rperm[32], vperm;

  if (d->one_operand_p)
    return false;
  if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
      && (TARGET_AVX512BW
          || GET_MODE_UNIT_SIZE (vmode) >= 4))
    ;
  else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
    ;
  else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
    ;
  else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
    ;
  else
    return false;

  /* This is a blend, not a permute.  Elements must stay in their
     respective lanes.  */
  for (i = 0; i < nelt; ++i)
    {
      unsigned e = d->perm[i];
      if (!(e == i || e == i + nelt))
        return false;
    }

  if (d->testing_p)
    return true;

  /* ??? Without SSE4.1, we could implement this with and/andn/or.  This
     decision should be extracted elsewhere, so that we only try that
     sequence once all budget==3 options have been tried.  */
  target = d->target;
  op0 = d->op0;
  op1 = d->op1;
  mask = 0;

  switch (vmode)
    {
    case E_V8DFmode:
    case E_V16SFmode:
    case E_V4DFmode:
    case E_V8SFmode:
    case E_V2DFmode:
    case E_V4SFmode:
    case E_V8HImode:
    case E_V8SImode:
    case E_V32HImode:
    case E_V64QImode:
    case E_V16SImode:
    case E_V8DImode:
      for (i = 0; i < nelt; ++i)
        mask |= (d->perm[i] >= nelt) << i;
      break;

    case E_V2DImode:
      for (i = 0; i < 2; ++i)
        mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
      vmode = V8HImode;
      goto do_subreg;

    case E_V4SImode:
      for (i = 0; i < 4; ++i)
        mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
      vmode = V8HImode;
      goto do_subreg;

    case E_V16QImode:
      /* See if bytes move in pairs so we can use pblendw with
         an immediate argument, rather than pblendvb with a vector
         argument.  */
      for (i = 0; i < 16; i += 2)
        if (d->perm[i] + 1 != d->perm[i + 1])
          {
          use_pblendvb:
            for (i = 0; i < nelt; ++i)
              rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);

          finish_pblendvb:
            vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
            vperm = force_reg (vmode, vperm);

            if (GET_MODE_SIZE (vmode) == 16)
              emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
            else
              emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
            if (target != d->target)
              emit_move_insn (d->target, gen_lowpart (d->vmode, target));
            return true;
          }

      for (i = 0; i < 8; ++i)
        mask |= (d->perm[i * 2] >= 16) << i;
      vmode = V8HImode;
      /* FALLTHRU */

    do_subreg:
      target = gen_reg_rtx (vmode);
      op0 = gen_lowpart (vmode, op0);
      op1 = gen_lowpart (vmode, op1);
      break;

    case E_V32QImode:
      /* See if bytes move in pairs.  If not, vpblendvb must be used.  */
      for (i = 0; i < 32; i += 2)
        if (d->perm[i] + 1 != d->perm[i + 1])
          goto use_pblendvb;
      /* See if bytes move in quadruplets.  If yes, vpblendd
         with immediate can be used.  */
      for (i = 0; i < 32; i += 4)
        if (d->perm[i] + 2 != d->perm[i + 2])
          break;
      if (i < 32)
        {
          /* See if bytes move the same in both lanes.  If yes,
             vpblendw with immediate can be used.  */
          for (i = 0; i < 16; i += 2)
            if (d->perm[i] + 16 != d->perm[i + 16])
              goto use_pblendvb;

          /* Use vpblendw.  */
          for (i = 0; i < 16; ++i)
            mask |= (d->perm[i * 2] >= 32) << i;
          vmode = V16HImode;
          goto do_subreg;
        }

      /* Use vpblendd.  */
      for (i = 0; i < 8; ++i)
        mask |= (d->perm[i * 4] >= 32) << i;
      vmode = V8SImode;
      goto do_subreg;

    case E_V16HImode:
      /* See if words move in pairs.  If yes, vpblendd can be used.  */
      for (i = 0; i < 16; i += 2)
        if (d->perm[i] + 1 != d->perm[i + 1])
          break;
      if (i < 16)
        {
          /* See if words move the same in both lanes.  If not,
             vpblendvb must be used.  */
          for (i = 0; i < 8; i++)
            if (d->perm[i] + 8 != d->perm[i + 8])
              {
                /* Use vpblendvb.  */
                for (i = 0; i < 32; ++i)
                  rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);

                vmode = V32QImode;
                nelt = 32;
                target = gen_reg_rtx (vmode);
                op0 = gen_lowpart (vmode, op0);
                op1 = gen_lowpart (vmode, op1);
                goto finish_pblendvb;
              }

          /* Use vpblendw.  */
          for (i = 0; i < 16; ++i)
            mask |= (d->perm[i] >= 16) << i;
          break;
        }

      /* Use vpblendd.  */
      for (i = 0; i < 8; ++i)
        mask |= (d->perm[i * 2] >= 16) << i;
      vmode = V8SImode;
      goto do_subreg;

    case E_V4DImode:
      /* Use vpblendd.  */
      for (i = 0; i < 4; ++i)
        mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
      vmode = V8SImode;
      goto do_subreg;

    default:
      gcc_unreachable ();
    }

  switch (vmode)
    {
    case E_V8DFmode:
    case E_V8DImode:
      mmode = QImode;
      break;
    case E_V16SFmode:
    case E_V16SImode:
      mmode = HImode;
      break;
    case E_V32HImode:
      mmode = SImode;
      break;
    case E_V64QImode:
      mmode = DImode;
      break;
    default:
      mmode = VOIDmode;
    }

  if (mmode != VOIDmode)
    maskop = force_reg (mmode, gen_int_mode (mask, mmode));
  else
    maskop = GEN_INT (mask);

  /* This matches five different patterns with the different modes.  */
  x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
  x = gen_rtx_SET (target, x);
  emit_insn (x);
  if (target != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, target));

  return true;
}
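
/* Worked example for the immediate path above: for V8HImode with
   perm = {0, 9, 10, 3, 4, 13, 6, 15}, bit i of MASK is set exactly when
   element i comes from op1, so mask = 0b10100110 = 0xa6 becomes the
   pblendw immediate.  */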
/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
   in terms of the variable form of vpermilps.

   Note that we will have already failed the immediate input vpermilps,
   which requires that the high and low part shuffle be identical; the
   variable form doesn't require that.  */

static bool
expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
{
  rtx rperm[8], vperm;
  unsigned i;

  if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
    return false;

  /* We can only permute within the 128-bit lane.  */
  for (i = 0; i < 8; ++i)
    {
      unsigned e = d->perm[i];
      if (i < 4 ? e >= 4 : e < 4)
        return false;
    }

  if (d->testing_p)
    return true;

  for (i = 0; i < 8; ++i)
    {
      unsigned e = d->perm[i];

      /* Within each 128-bit lane, the elements of op0 are numbered
         from 0 and the elements of op1 are numbered from 4.  */
      if (e >= 8 + 4)
        e -= (8 + 4);
      else if (e >= 4)
        e -= 4;

      rperm[i] = GEN_INT (e);
    }

  vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
  vperm = force_reg (V8SImode, vperm);
  emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));

  return true;
}
/* Return true if permutation D can be performed as VMODE permutation
   instead.  */

static bool
valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
{
  unsigned int i, j, chunk;

  if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
      || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
      || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
    return false;

  if (GET_MODE_NUNITS (vmode) >= d->nelt)
    return true;

  chunk = d->nelt / GET_MODE_NUNITS (vmode);
  for (i = 0; i < d->nelt; i += chunk)
    if (d->perm[i] & (chunk - 1))
      return false;
    else
      for (j = 1; j < chunk; ++j)
        if (d->perm[i] + j != d->perm[i + j])
          return false;

  return true;
}
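
/* For example, the V16QImode permutation {2, 3, 0, 1, 6, 7, 4, 5, ...}
   moves whole byte pairs, so it is also valid as the V8HImode
   permutation {1, 0, 3, 2, ...}: every chunk starts on a multiple of
   CHUNK (the "& (chunk - 1)" test) and its members are consecutive.  */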
/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
   in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128.  */

static bool
expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
{
  unsigned i, nelt, eltsz, mask;
  unsigned char perm[64];
  machine_mode vmode = V16QImode;
  rtx rperm[64], vperm, target, op0, op1;

  nelt = d->nelt;

  if (!d->one_operand_p)
    {
      if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
        {
          if (TARGET_AVX2
              && valid_perm_using_mode_p (V2TImode, d))
            {
              if (d->testing_p)
                return true;

              /* Use vperm2i128 insn.  The pattern uses
                 V4DImode instead of V2TImode.  */
              target = d->target;
              if (d->vmode != V4DImode)
                target = gen_reg_rtx (V4DImode);
              op0 = gen_lowpart (V4DImode, d->op0);
              op1 = gen_lowpart (V4DImode, d->op1);
              rperm[0]
                = GEN_INT ((d->perm[0] / (nelt / 2))
                           | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
              emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
              if (target != d->target)
                emit_move_insn (d->target, gen_lowpart (d->vmode, target));
              return true;
            }
          return false;
        }
    }
  else
    {
      if (GET_MODE_SIZE (d->vmode) == 16)
        {
          if (!TARGET_SSSE3)
            return false;
        }
      else if (GET_MODE_SIZE (d->vmode) == 32)
        {
          if (!TARGET_AVX2)
            return false;

          /* V4DImode should be already handled through
             expand_vselect by vpermq instruction.  */
          gcc_assert (d->vmode != V4DImode);

          vmode = V32QImode;
          if (d->vmode == V8SImode
              || d->vmode == V16HImode
              || d->vmode == V32QImode)
            {
              /* First see if vpermq can be used for
                 V8SImode/V16HImode/V32QImode.  */
              if (valid_perm_using_mode_p (V4DImode, d))
                {
                  for (i = 0; i < 4; i++)
                    perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
                  if (d->testing_p)
                    return true;
                  target = gen_reg_rtx (V4DImode);
                  if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
                                      perm, 4, false))
                    {
                      emit_move_insn (d->target,
                                      gen_lowpart (d->vmode, target));
                      return true;
                    }
                  return false;
                }

              /* Next see if vpermd can be used.  */
              if (valid_perm_using_mode_p (V8SImode, d))
                vmode = V8SImode;
            }
          /* Or if vpermps can be used.  */
          else if (d->vmode == V8SFmode)
            vmode = V8SImode;

          if (vmode == V32QImode)
            {
              /* vpshufb only works intra lanes, it is not
                 possible to shuffle bytes in between the lanes.  */
              for (i = 0; i < nelt; ++i)
                if ((d->perm[i] ^ i) & (nelt / 2))
                  return false;
            }
        }
      else if (GET_MODE_SIZE (d->vmode) == 64)
        {
          if (!TARGET_AVX512BW)
            return false;

          /* If vpermq didn't work, vpshufb won't work either.  */
          if (d->vmode == V8DFmode || d->vmode == V8DImode)
            return false;

          vmode = V64QImode;
          if (d->vmode == V16SImode
              || d->vmode == V32HImode
              || d->vmode == V64QImode)
            {
              /* First see if vpermq can be used for
                 V16SImode/V32HImode/V64QImode.  */
              if (valid_perm_using_mode_p (V8DImode, d))
                {
                  for (i = 0; i < 8; i++)
                    perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
                  if (d->testing_p)
                    return true;
                  target = gen_reg_rtx (V8DImode);
                  if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
                                      perm, 8, false))
                    {
                      emit_move_insn (d->target,
                                      gen_lowpart (d->vmode, target));
                      return true;
                    }
                  return false;
                }

              /* Next see if vpermd can be used.  */
              if (valid_perm_using_mode_p (V16SImode, d))
                vmode = V16SImode;
            }
          /* Or if vpermps can be used.  */
          else if (d->vmode == V16SFmode)
            vmode = V16SImode;
          if (vmode == V64QImode)
            {
              /* vpshufb only works intra lanes, it is not
                 possible to shuffle bytes in between the lanes.  */
              for (i = 0; i < nelt; ++i)
                if ((d->perm[i] ^ i) & (nelt / 4))
                  return false;
            }
        }
      else
        return false;
    }

  if (d->testing_p)
    return true;

  if (vmode == V8SImode)
    for (i = 0; i < 8; ++i)
      rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
  else if (vmode == V16SImode)
    for (i = 0; i < 16; ++i)
      rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
  else
    {
      eltsz = GET_MODE_UNIT_SIZE (d->vmode);
      if (!d->one_operand_p)
        mask = 2 * nelt - 1;
      else if (vmode == V16QImode)
        mask = nelt - 1;
      else if (vmode == V64QImode)
        mask = nelt / 4 - 1;
      else
        mask = nelt / 2 - 1;

      for (i = 0; i < nelt; ++i)
        {
          unsigned j, e = d->perm[i] & mask;
          for (j = 0; j < eltsz; ++j)
            rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
        }
    }

  vperm = gen_rtx_CONST_VECTOR (vmode,
                                gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
  vperm = force_reg (vmode, vperm);

  target = d->target;
  if (d->vmode != vmode)
    target = gen_reg_rtx (vmode);
  op0 = gen_lowpart (vmode, d->op0);
  if (d->one_operand_p)
    {
      if (vmode == V16QImode)
        emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
      else if (vmode == V32QImode)
        emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
      else if (vmode == V64QImode)
        emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
      else if (vmode == V8SFmode)
        emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
      else if (vmode == V8SImode)
        emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
      else if (vmode == V16SFmode)
        emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm));
      else if (vmode == V16SImode)
        emit_insn (gen_avx512f_permvarv16si (target, op0, vperm));
      else
        gcc_unreachable ();
    }
  else
    {
      op1 = gen_lowpart (vmode, d->op1);
      emit_insn (gen_xop_pperm (target, op0, op1, vperm));
    }
  if (target != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, target));

  return true;
}
/* For V*[QHS]Imode permutations, check if the same permutation
   can't be performed in a 2x, 4x or 8x wider inner mode.  */

static bool
canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
                              struct expand_vec_perm_d *nd)
{
  int i;
  machine_mode mode = VOIDmode;

  switch (d->vmode)
    {
    case E_V16QImode: mode = V8HImode; break;
    case E_V32QImode: mode = V16HImode; break;
    case E_V64QImode: mode = V32HImode; break;
    case E_V8HImode: mode = V4SImode; break;
    case E_V16HImode: mode = V8SImode; break;
    case E_V32HImode: mode = V16SImode; break;
    case E_V4SImode: mode = V2DImode; break;
    case E_V8SImode: mode = V4DImode; break;
    case E_V16SImode: mode = V8DImode; break;
    default: return false;
    }
  for (i = 0; i < d->nelt; i += 2)
    if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
      return false;
  nd->vmode = mode;
  nd->nelt = d->nelt / 2;
  for (i = 0; i < nd->nelt; i++)
    nd->perm[i] = d->perm[2 * i] / 2;
  if (GET_MODE_INNER (mode) != DImode)
    canonicalize_vector_int_perm (nd, nd);
  if (nd != d)
    {
      nd->one_operand_p = d->one_operand_p;
      nd->testing_p = d->testing_p;
      if (d->op0 == d->op1)
        nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
      else
        {
          nd->op0 = gen_lowpart (nd->vmode, d->op0);
          nd->op1 = gen_lowpart (nd->vmode, d->op1);
        }
      if (d->testing_p)
        nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
      else
        nd->target = gen_reg_rtx (nd->vmode);
    }
  return true;
}
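
/* For example, the V16QImode permutation {2, 3, 0, 1, ...} becomes the
   V8HImode permutation {1, 0, ...}; if that still moves even/odd pairs
   together it is narrowed again, stopping at DImode elements.  Working
   in the widest possible inner mode exposes the cheap pshufd/vpermq
   style patterns to expand_vec_perm_1.  */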
/* Try to expand one-operand permutation with constant mask.  */

static bool
ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
{
  machine_mode mode = GET_MODE (d->op0);
  machine_mode maskmode = mode;
  rtx (*gen) (rtx, rtx, rtx) = NULL;
  rtx target, op0, mask;
  rtx vec[64];

  if (!rtx_equal_p (d->op0, d->op1))
    return false;

  if (!TARGET_AVX512F)
    return false;

  switch (mode)
    {
    case E_V16SImode:
      gen = gen_avx512f_permvarv16si;
      break;
    case E_V16SFmode:
      gen = gen_avx512f_permvarv16sf;
      maskmode = V16SImode;
      break;
    case E_V8DImode:
      gen = gen_avx512f_permvarv8di;
      break;
    case E_V8DFmode:
      gen = gen_avx512f_permvarv8df;
      maskmode = V8DImode;
      break;
    default:
      return false;
    }

  target = d->target;
  op0 = d->op0;
  for (int i = 0; i < d->nelt; ++i)
    vec[i] = GEN_INT (d->perm[i]);
  mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
  emit_insn (gen (target, op0, force_reg (maskmode, mask)));
  return true;
}

static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to instantiate D
   in a single instruction.  */

static bool
expand_vec_perm_1 (struct expand_vec_perm_d *d)
{
  unsigned i, nelt = d->nelt;
  struct expand_vec_perm_d nd;

  /* Check plain VEC_SELECT first, because AVX has instructions that could
     match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
     input where SEL+CONCAT may not.  */
  if (d->one_operand_p)
    {
      int mask = nelt - 1;
      bool identity_perm = true;
      bool broadcast_perm = true;

      for (i = 0; i < nelt; i++)
        {
          nd.perm[i] = d->perm[i] & mask;
          if (nd.perm[i] != i)
            identity_perm = false;
          if (nd.perm[i])
            broadcast_perm = false;
        }

      if (identity_perm)
        {
          if (!d->testing_p)
            emit_move_insn (d->target, d->op0);
          return true;
        }
      else if (broadcast_perm && TARGET_AVX2)
        {
          /* Use vpbroadcast{b,w,d}.  */
          rtx (*gen) (rtx, rtx) = NULL;
          switch (d->vmode)
            {
            case E_V64QImode:
              if (TARGET_AVX512BW)
                gen = gen_avx512bw_vec_dupv64qi_1;
              break;
            case E_V32QImode:
              gen = gen_avx2_pbroadcastv32qi_1;
              break;
            case E_V32HImode:
              if (TARGET_AVX512BW)
                gen = gen_avx512bw_vec_dupv32hi_1;
              break;
            case E_V16HImode:
              gen = gen_avx2_pbroadcastv16hi_1;
              break;
            case E_V16SImode:
              if (TARGET_AVX512F)
                gen = gen_avx512f_vec_dupv16si_1;
              break;
            case E_V8SImode:
              gen = gen_avx2_pbroadcastv8si_1;
              break;
            case E_V16QImode:
              gen = gen_avx2_pbroadcastv16qi;
              break;
            case E_V8HImode:
              gen = gen_avx2_pbroadcastv8hi;
              break;
            case E_V8SFmode:
              gen = gen_avx2_vec_dupv8sf_1;
              break;
            case E_V16SFmode:
              if (TARGET_AVX512F)
                gen = gen_avx512f_vec_dupv16sf_1;
              break;
            case E_V8DFmode:
              if (TARGET_AVX512F)
                gen = gen_avx512f_vec_dupv8df_1;
              break;
            case E_V8DImode:
              if (TARGET_AVX512F)
                gen = gen_avx512f_vec_dupv8di_1;
              break;
            /* For other modes prefer other shuffles this function creates.  */
            default: break;
            }
          if (gen != NULL)
            {
              if (!d->testing_p)
                emit_insn (gen (d->target, d->op0));
              return true;
            }
        }

      if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
        return true;

      /* There are plenty of patterns in sse.md that are written for
         SEL+CONCAT and are not replicated for a single op.  Perhaps
         that should be changed, to avoid the nastiness here.  */

      /* Recognize interleave style patterns, which means incrementing
         every other permutation operand.  */
      for (i = 0; i < nelt; i += 2)
        {
          nd.perm[i] = d->perm[i] & mask;
          nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
        }
      if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
                                  d->testing_p))
        return true;

      /* Recognize shufps, which means adding {0, 0, nelt, nelt}.  */
      if (nelt >= 4)
        {
          for (i = 0; i < nelt; i += 4)
            {
              nd.perm[i + 0] = d->perm[i + 0] & mask;
              nd.perm[i + 1] = d->perm[i + 1] & mask;
              nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
              nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
            }

          if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
                                      d->testing_p))
            return true;
        }
    }

  /* Try movss/movsd instructions.  */
  if (expand_vec_perm_movs (d))
    return true;

  /* Finally, try the fully general two operand permute.  */
  if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
                              d->testing_p))
    return true;

  /* Recognize interleave style patterns with reversed operands.  */
  if (!d->one_operand_p)
    {
      for (i = 0; i < nelt; ++i)
        {
          unsigned e = d->perm[i];
          if (e >= nelt)
            e -= nelt;
          else
            e += nelt;
          nd.perm[i] = e;
        }

      if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
                                  d->testing_p))
        return true;
    }

  /* Try the SSE4.1 blend variable merge instructions.  */
  if (expand_vec_perm_blend (d))
    return true;

  /* Try one of the AVX vpermil variable permutations.  */
  if (expand_vec_perm_vpermil (d))
    return true;

  /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
     vpshufb, vpermd, vpermps or vpermq variable permutation.  */
  if (expand_vec_perm_pshufb (d))
    return true;

  /* Try the AVX2 vpalignr instruction.  */
  if (expand_vec_perm_palignr (d, true))
    return true;

  /* Try the AVX512F vperm{s,d} instructions.  */
  if (ix86_expand_vec_one_operand_perm_avx512 (d))
    return true;

  /* Try the AVX512F vpermt2/vpermi2 instructions.  */
  if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
    return true;

  /* See if we can get the same permutation in different vector integer
     mode.  */
  if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
    {
      if (!d->testing_p)
        emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
      return true;
    }
  return false;
}
/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
   in terms of a pair of pshuflw + pshufhw instructions.  */

static bool
expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
{
  unsigned char perm2[MAX_VECT_LEN];
  unsigned i;
  bool ok;

  if (d->vmode != V8HImode || !d->one_operand_p)
    return false;

  /* The two permutations only operate in 64-bit lanes.  */
  for (i = 0; i < 4; ++i)
    if (d->perm[i] >= 4)
      return false;
  for (i = 4; i < 8; ++i)
    if (d->perm[i] < 4)
      return false;

  if (d->testing_p)
    return true;

  /* Emit the pshuflw.  */
  memcpy (perm2, d->perm, 4);
  for (i = 4; i < 8; ++i)
    perm2[i] = i;
  ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
  gcc_assert (ok);

  /* Emit the pshufhw.  */
  memcpy (perm2 + 4, d->perm + 4, 4);
  for (i = 0; i < 4; ++i)
    perm2[i] = i;
  ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
  gcc_assert (ok);

  return true;
}
/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to simplify
   the permutation using the SSSE3 palignr instruction.  This succeeds
   when all of the elements in PERM fit within one vector and we merely
   need to shift them down so that a single vector permutation has a
   chance to succeed.  If SINGLE_INSN_ONLY_P, succeed if only
   the vpalignr instruction itself can perform the requested permutation.  */

static bool
expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
{
  unsigned i, nelt = d->nelt;
  unsigned min, max, minswap, maxswap;
  bool in_order, ok, swap = false;
  rtx shift, target;
  struct expand_vec_perm_d dcopy;

  /* Even with AVX, palignr only operates on 128-bit vectors,
     in AVX2 palignr operates on both 128-bit lanes.  */
  if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
      && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
    return false;

  min = 2 * nelt;
  max = 0;
  minswap = 2 * nelt;
  maxswap = 0;
  for (i = 0; i < nelt; ++i)
    {
      unsigned e = d->perm[i];
      unsigned eswap = d->perm[i] ^ nelt;
      if (GET_MODE_SIZE (d->vmode) == 32)
        {
          e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
          eswap = e ^ (nelt / 2);
        }
      if (e < min)
        min = e;
      if (e > max)
        max = e;
      if (eswap < minswap)
        minswap = eswap;
      if (eswap > maxswap)
        maxswap = eswap;
    }
  if (min == 0
      || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
    {
      if (d->one_operand_p
          || minswap == 0
          || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
                                   ? nelt / 2 : nelt))
        return false;
      swap = true;
      min = minswap;
      max = maxswap;
    }

  /* Given that we have SSSE3, we know we'll be able to implement the
     single operand permutation after the palignr with pshufb for
     128-bit vectors.  If SINGLE_INSN_ONLY_P, in_order has to be computed
     first.  */
  if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
    return true;

  dcopy = *d;
  if (swap)
    {
      dcopy.op0 = d->op1;
      dcopy.op1 = d->op0;
      for (i = 0; i < nelt; ++i)
        dcopy.perm[i] ^= nelt;
    }

  in_order = true;
  for (i = 0; i < nelt; ++i)
    {
      unsigned e = dcopy.perm[i];
      if (GET_MODE_SIZE (d->vmode) == 32
          && e >= nelt
          && (e & (nelt / 2 - 1)) < min)
        e = e - min - (nelt / 2);
      else
        e = e - min;
      if (e != i)
        in_order = false;
      dcopy.perm[i] = e;
    }
  dcopy.one_operand_p = true;

  if (single_insn_only_p && !in_order)
    return false;

  /* For AVX2, test whether we can permute the result in one instruction.  */
  if (d->testing_p)
    {
      if (in_order)
        return true;
      dcopy.op1 = dcopy.op0;
      return expand_vec_perm_1 (&dcopy);
    }

  shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
  if (GET_MODE_SIZE (d->vmode) == 16)
    {
      target = gen_reg_rtx (TImode);
      emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
                                      gen_lowpart (TImode, dcopy.op0), shift));
    }
  else
    {
      target = gen_reg_rtx (V2TImode);
      emit_insn (gen_avx2_palignrv2ti (target,
                                       gen_lowpart (V2TImode, dcopy.op1),
                                       gen_lowpart (V2TImode, dcopy.op0),
                                       shift));
    }

  dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);

  /* Test for the degenerate case where the alignment by itself
     produces the desired permutation.  */
  if (in_order)
    {
      emit_move_insn (d->target, dcopy.op0);
      return true;
    }

  ok = expand_vec_perm_1 (&dcopy);
  gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);

  return ok;
}
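
/* Worked example: for V16QImode with min = 5, the palignr above
   concatenates op1:op0 and shifts right by 5 * 8 = 40 bits, so former
   element 5 lands in element 0.  Afterwards dcopy.perm[i] =
   d->perm[i] - min is a single-operand permutation, which pshufb can
   always finish on 128-bit vectors.  */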
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   the permutation using the SSE4_1 pblendv instruction.  Potentially
   reduces permutation from 2 pshufb and or to 1 pshufb and pblendv.  */

static bool
expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
{
  unsigned i, which, nelt = d->nelt;
  struct expand_vec_perm_d dcopy, dcopy1;
  machine_mode vmode = d->vmode;
  bool ok;

  /* Use the same checks as in expand_vec_perm_blend.  */
  if (d->one_operand_p)
    return false;
  if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
    ;
  else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
    ;
  else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
    ;
  else
    return false;

  /* Figure out which permutation elements do not stay in their
     respective lanes.  */
  for (i = 0, which = 0; i < nelt; ++i)
    {
      unsigned e = d->perm[i];
      if (e != i)
        which |= (e < nelt ? 1 : 2);
    }
  /* We can pblend the part whose elements do not stay in their
     respective lanes only when these elements all come from the same
     half of the permutation.
     {0 1 8 3 4 5 9 7} is ok: 8 and 9 are not in their respective
     lanes, but both 8 and 9 >= 8.
     {0 1 8 3 4 5 2 7} is not ok: 2 and 8 are not in their respective
     lanes, and 8 >= 8 but 2 is not.  */
  if (which != 1 && which != 2)
    return false;
  if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
    return true;

  /* First we apply one operand permutation to the part whose elements
     do not stay in their respective lanes.  */
  dcopy = *d;
  if (which == 2)
    dcopy.op0 = dcopy.op1 = d->op1;
  else
    dcopy.op0 = dcopy.op1 = d->op0;
  if (!d->testing_p)
    dcopy.target = gen_reg_rtx (vmode);
  dcopy.one_operand_p = true;

  for (i = 0; i < nelt; ++i)
    dcopy.perm[i] = d->perm[i] & (nelt - 1);

  ok = expand_vec_perm_1 (&dcopy);
  if (GET_MODE_SIZE (vmode) != 16 && !ok)
    return false;
  else
    gcc_assert (ok);
  if (d->testing_p)
    return true;

  /* Next we put permuted elements into their positions.  */
  dcopy1 = *d;
  if (which == 2)
    dcopy1.op1 = dcopy.target;
  else
    dcopy1.op0 = dcopy.target;

  for (i = 0; i < nelt; ++i)
    dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);

  ok = expand_vec_perm_blend (&dcopy1);
  gcc_assert (ok);

  return true;
}

static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to simplify
   a two vector permutation into a single vector permutation by using
   an interleave operation to merge the vectors.  */

static bool
expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dremap, dfinal;
  unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
  unsigned HOST_WIDE_INT contents;
  unsigned char remap[2 * MAX_VECT_LEN];
  rtx_insn *seq;
  bool ok, same_halves = false;

  if (GET_MODE_SIZE (d->vmode) == 16)
    {
      if (d->one_operand_p)
        return false;
    }
  else if (GET_MODE_SIZE (d->vmode) == 32)
    {
      if (!TARGET_AVX)
        return false;
      /* For 32-byte modes allow even d->one_operand_p.
         The lack of cross-lane shuffling in some instructions
         might prevent a single insn shuffle.  */
      dfinal = *d;
      dfinal.testing_p = true;
      /* If expand_vec_perm_interleave3 can expand this into
         a 3 insn sequence, give up and let it be expanded as
         3 insn sequence.  While that is one insn longer,
         it doesn't need a memory operand and in the common
         case that both interleave low and high permutations
         with the same operands are adjacent needs 4 insns
         for both after CSE.  */
      if (expand_vec_perm_interleave3 (&dfinal))
        return false;
    }
  else
    return false;

  /* Examine from whence the elements come.  */
  contents = 0;
  for (i = 0; i < nelt; ++i)
    contents |= HOST_WIDE_INT_1U << d->perm[i];

  memset (remap, 0xff, sizeof (remap));
  dremap = *d;

  if (GET_MODE_SIZE (d->vmode) == 16)
    {
      unsigned HOST_WIDE_INT h1, h2, h3, h4;

      /* Split the two input vectors into 4 halves.  */
      h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
      h2 = h1 << nelt2;
      h3 = h2 << nelt2;
      h4 = h3 << nelt2;

      /* If the elements are all from the low halves, use interleave low,
         and similarly for interleave high.  If the elements are from
         mis-matched halves, we can use shufps for V4SF/V4SI or do a
         DImode shuffle.  */
      if ((contents & (h1 | h3)) == contents)
        {
          /* punpckl* */
          for (i = 0; i < nelt2; ++i)
            {
              remap[i] = i * 2;
              remap[i + nelt] = i * 2 + 1;
              dremap.perm[i * 2] = i;
              dremap.perm[i * 2 + 1] = i + nelt;
            }
          if (!TARGET_SSE2 && d->vmode == V4SImode)
            dremap.vmode = V4SFmode;
        }
      else if ((contents & (h2 | h4)) == contents)
        {
          /* punpckh* */
          for (i = 0; i < nelt2; ++i)
            {
              remap[i + nelt2] = i * 2;
              remap[i + nelt + nelt2] = i * 2 + 1;
              dremap.perm[i * 2] = i + nelt2;
              dremap.perm[i * 2 + 1] = i + nelt + nelt2;
            }
          if (!TARGET_SSE2 && d->vmode == V4SImode)
            dremap.vmode = V4SFmode;
        }
      else if ((contents & (h1 | h4)) == contents)
        {
          /* shufps */
          for (i = 0; i < nelt2; ++i)
            {
              remap[i] = i;
              remap[i + nelt + nelt2] = i + nelt2;
              dremap.perm[i] = i;
              dremap.perm[i + nelt2] = i + nelt + nelt2;
            }
          if (nelt != 4)
            {
              /* shufpd */
              dremap.vmode = V2DImode;
              dremap.nelt = 2;
              dremap.perm[0] = 0;
              dremap.perm[1] = 3;
            }
        }
      else if ((contents & (h2 | h3)) == contents)
        {
          /* shufps */
          for (i = 0; i < nelt2; ++i)
            {
              remap[i + nelt2] = i;
              remap[i + nelt] = i + nelt2;
              dremap.perm[i] = i + nelt2;
              dremap.perm[i + nelt2] = i + nelt;
            }
          if (nelt != 4)
            {
              /* shufpd */
              dremap.vmode = V2DImode;
              dremap.nelt = 2;
              dremap.perm[0] = 1;
              dremap.perm[1] = 2;
            }
        }
      else
        return false;
    }
  else
    {
      unsigned int nelt4 = nelt / 4, nzcnt = 0;
      unsigned HOST_WIDE_INT q[8];
      unsigned int nonzero_halves[4];

      /* Split the two input vectors into 8 quarters.  */
      q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
      for (i = 1; i < 8; ++i)
        q[i] = q[0] << (nelt4 * i);
      for (i = 0; i < 4; ++i)
        if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
          {
            nonzero_halves[nzcnt] = i;
            ++nzcnt;
          }

      if (nzcnt == 1)
        {
          gcc_assert (d->one_operand_p);
          nonzero_halves[1] = nonzero_halves[0];
          same_halves = true;
        }
      else if (d->one_operand_p)
        {
          gcc_assert (nonzero_halves[0] == 0);
          gcc_assert (nonzero_halves[1] == 1);
        }

      if (nzcnt <= 2)
        {
          if (d->perm[0] / nelt2 == nonzero_halves[1])
            {
              /* Attempt to increase the likelihood that dfinal
                 shuffle will be intra-lane.  */
              std::swap (nonzero_halves[0], nonzero_halves[1]);
            }

          /* vperm2f128 or vperm2i128.  */
          for (i = 0; i < nelt2; ++i)
            {
              remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
              remap[i + nonzero_halves[0] * nelt2] = i;
              dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
              dremap.perm[i] = i + nonzero_halves[0] * nelt2;
            }

          if (d->vmode != V8SFmode
              && d->vmode != V4DFmode
              && d->vmode != V8SImode)
            {
              dremap.vmode = V8SImode;
              dremap.nelt = 8;
              for (i = 0; i < 4; ++i)
                {
                  dremap.perm[i] = i + nonzero_halves[0] * 4;
                  dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
                }
            }
        }
      else if (d->one_operand_p)
        return false;
      else if (TARGET_AVX2
               && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
        {
          /* vpunpckl* */
          for (i = 0; i < nelt4; ++i)
            {
              remap[i] = i * 2;
              remap[i + nelt] = i * 2 + 1;
              remap[i + nelt2] = i * 2 + nelt2;
              remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
              dremap.perm[i * 2] = i;
              dremap.perm[i * 2 + 1] = i + nelt;
              dremap.perm[i * 2 + nelt2] = i + nelt2;
              dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
            }
        }
      else if (TARGET_AVX2
               && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
        {
          /* vpunpckh* */
          for (i = 0; i < nelt4; ++i)
            {
              remap[i + nelt4] = i * 2;
              remap[i + nelt + nelt4] = i * 2 + 1;
              remap[i + nelt2 + nelt4] = i * 2 + nelt2;
              remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
              dremap.perm[i * 2] = i + nelt4;
              dremap.perm[i * 2 + 1] = i + nelt + nelt4;
              dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
              dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
            }
        }
      else
        return false;
    }

  /* Use the remapping array set up above to move the elements from their
     swizzled locations into their final destinations.  */
  dfinal = *d;
  for (i = 0; i < nelt; ++i)
    {
      unsigned e = remap[d->perm[i]];
      gcc_assert (e < nelt);
      /* If same_halves is true, both halves of the remapped vector are the
         same.  Avoid cross-lane accesses if possible.  */
      if (same_halves && i >= nelt2)
        {
          gcc_assert (e < nelt2);
          dfinal.perm[i] = e + nelt2;
        }
      else
        dfinal.perm[i] = e;
    }
  if (!d->testing_p)
    {
      dremap.target = gen_reg_rtx (dremap.vmode);
      dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
    }
  dfinal.op1 = dfinal.op0;
  dfinal.one_operand_p = true;

  /* Test if the final remap can be done with a single insn.  For V4SFmode or
     V4SImode this *will* succeed.  For V8HImode or V16QImode it may not.  */
  start_sequence ();
  ok = expand_vec_perm_1 (&dfinal);
  seq = get_insns ();
  end_sequence ();

  if (!ok)
    return false;

  if (d->testing_p)
    return true;

  if (dremap.vmode != dfinal.vmode)
    {
      dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
      dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
    }

  ok = expand_vec_perm_1 (&dremap);
  gcc_assert (ok);

  emit_insn (seq);
  return true;
}
/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to simplify
   a single vector cross-lane permutation into vpermq followed
   by any of the single insn permutations.  */

static bool
expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dremap, dfinal;
  unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
  unsigned contents[2];
  bool ok;

  if (!(TARGET_AVX2
        && (d->vmode == V32QImode || d->vmode == V16HImode)
        && d->one_operand_p))
    return false;

  contents[0] = 0;
  contents[1] = 0;
  for (i = 0; i < nelt2; ++i)
    {
      contents[0] |= 1u << (d->perm[i] / nelt4);
      contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
    }

  for (i = 0; i < 2; ++i)
    {
      unsigned int cnt = 0;
      for (j = 0; j < 4; ++j)
        if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
          return false;
    }

  if (d->testing_p)
    return true;

  dremap = *d;
  dremap.vmode = V4DImode;
  dremap.nelt = 4;
  dremap.target = gen_reg_rtx (V4DImode);
  dremap.op0 = gen_lowpart (V4DImode, d->op0);
  dremap.op1 = dremap.op0;
  dremap.one_operand_p = true;
  for (i = 0; i < 2; ++i)
    {
      unsigned int cnt = 0;
      for (j = 0; j < 4; ++j)
        if ((contents[i] & (1u << j)) != 0)
          dremap.perm[2 * i + cnt++] = j;
      for (; cnt < 2; ++cnt)
        dremap.perm[2 * i + cnt] = 0;
    }

  dfinal = *d;
  dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
  dfinal.op1 = dfinal.op0;
  dfinal.one_operand_p = true;
  for (i = 0, j = 0; i < nelt; ++i)
    {
      if (i == nelt2)
        j = 2;
      dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
      if ((d->perm[i] / nelt4) == dremap.perm[j])
        ;
      else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
        dfinal.perm[i] |= nelt4;
      else
        gcc_unreachable ();
    }

  ok = expand_vec_perm_1 (&dremap);
  gcc_assert (ok);

  ok = expand_vec_perm_1 (&dfinal);
  gcc_assert (ok);

  return true;
}
static bool canonicalize_perm (struct expand_vec_perm_d *d);

/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to expand
   a vector permutation using two instructions, vperm2f128 resp.
   vperm2i128 followed by any single in-lane permutation.  */

static bool
expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dfirst, dsecond;
  unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
  bool ok;

  if (!TARGET_AVX
      || GET_MODE_SIZE (d->vmode) != 32
      || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
    return false;

  dsecond = *d;
  dsecond.one_operand_p = false;
  dsecond.testing_p = true;

  /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
     immediate.  For perm < 16 the second permutation uses
     d->op0 as first operand, for perm >= 16 it uses d->op1
     as first operand.  The second operand is the result of
     vperm2[fi]128.  */
  for (perm = 0; perm < 32; perm++)
    {
      /* Ignore permutations which do not move anything cross-lane.  */
      if (perm < 16)
        {
          /* The second shuffle for e.g. V4DFmode has
             0123 and ABCD operands.
             Ignore AB23, as 23 is already in the second lane
             of the first operand.  */
          if ((perm & 0xc) == (1 << 2)) continue;
          /* And 01CD, as 01 is in the first lane of the first
             operand.  */
          if ((perm & 3) == 0) continue;
          /* And 4567, as then the vperm2[fi]128 doesn't change
             anything on the original 4567 second operand.  */
          if ((perm & 0xf) == ((3 << 2) | 2)) continue;
        }
      else
        {
          /* The second shuffle for e.g. V4DFmode has
             4567 and ABCD operands.
             Ignore AB67, as 67 is already in the second lane
             of the first operand.  */
          if ((perm & 0xc) == (3 << 2)) continue;
          /* And 45CD, as 45 is in the first lane of the first
             operand.  */
          if ((perm & 3) == 2) continue;
          /* And 0123, as then the vperm2[fi]128 doesn't change
             anything on the original 0123 first operand.  */
          if ((perm & 0xf) == (1 << 2)) continue;
        }

      for (i = 0; i < nelt; i++)
        {
          j = d->perm[i] / nelt2;
          if (j == ((perm >> (2 * (i >= nelt2))) & 3))
            dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
          else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
            dsecond.perm[i] = d->perm[i] & (nelt - 1);
          else
            break;
        }

      if (i == nelt)
        {
          start_sequence ();
          ok = expand_vec_perm_1 (&dsecond);
          end_sequence ();
        }
      else
        ok = false;

      if (ok)
        {
          if (d->testing_p)
            return true;

          /* Found a usable second shuffle.  dfirst will be
             vperm2f128 on d->op0 and d->op1.  */
          dsecond.testing_p = false;
          dfirst = *d;
          dfirst.target = gen_reg_rtx (d->vmode);
          for (i = 0; i < nelt; i++)
            dfirst.perm[i] = (i & (nelt2 - 1))
                             + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;

          canonicalize_perm (&dfirst);
          ok = expand_vec_perm_1 (&dfirst);
          gcc_assert (ok);

          /* And dsecond is some single insn shuffle, taking
             d->op0 and result of vperm2f128 (if perm < 16) or
             d->op1 and result of vperm2f128 (otherwise).  */
          if (perm >= 16)
            dsecond.op0 = dsecond.op1;
          dsecond.op1 = dfirst.target;

          ok = expand_vec_perm_1 (&dsecond);
          gcc_assert (ok);

          return true;
        }

      /* For one operand, the only useful vperm2f128 permutation is 0x01
         aka lanes swap.  */
      if (d->one_operand_p)
        return false;
    }

  return false;
}
/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to simplify
   a two vector permutation using 2 intra-lane interleave insns
   and cross-lane shuffle for 32-byte vectors.  */

static bool
expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
{
  unsigned i, nelt;
  rtx (*gen) (rtx, rtx, rtx);

  if (d->one_operand_p)
    return false;
  if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
    ;
  else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
    ;
  else
    return false;

  nelt = d->nelt;
  if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
    return false;
  for (i = 0; i < nelt; i += 2)
    if (d->perm[i] != d->perm[0] + i / 2
        || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
      return false;

  if (d->testing_p)
    return true;

  switch (d->vmode)
    {
    case E_V32QImode:
      if (d->perm[0])
        gen = gen_vec_interleave_highv32qi;
      else
        gen = gen_vec_interleave_lowv32qi;
      break;
    case E_V16HImode:
      if (d->perm[0])
        gen = gen_vec_interleave_highv16hi;
      else
        gen = gen_vec_interleave_lowv16hi;
      break;
    case E_V8SImode:
      if (d->perm[0])
        gen = gen_vec_interleave_highv8si;
      else
        gen = gen_vec_interleave_lowv8si;
      break;
    case E_V4DImode:
      if (d->perm[0])
        gen = gen_vec_interleave_highv4di;
      else
        gen = gen_vec_interleave_lowv4di;
      break;
    case E_V8SFmode:
      if (d->perm[0])
        gen = gen_vec_interleave_highv8sf;
      else
        gen = gen_vec_interleave_lowv8sf;
      break;
    case E_V4DFmode:
      if (d->perm[0])
        gen = gen_vec_interleave_highv4df;
      else
        gen = gen_vec_interleave_lowv4df;
      break;
    default:
      gcc_unreachable ();
    }

  emit_insn (gen (d->target, d->op0, d->op1));
  return true;
}
/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement
   a single vector permutation using a single intra-lane vector
   permutation, vperm2f128 swapping the lanes and vblend* insn blending
   the non-swapped and swapped vectors together.  */

static bool
expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dfirst, dsecond;
  unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
  rtx_insn *seq;
  bool ok;
  rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;

  if (!TARGET_AVX
      || TARGET_AVX2
      || (d->vmode != V8SFmode && d->vmode != V4DFmode)
      || !d->one_operand_p)
    return false;

  dfirst = *d;
  for (i = 0; i < nelt; i++)
    dfirst.perm[i] = 0xff;
  for (i = 0, msk = 0; i < nelt; i++)
    {
      j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
      if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
        return false;
      dfirst.perm[j] = d->perm[i];
      if (j != i)
        msk |= (1 << i);
    }
  for (i = 0; i < nelt; i++)
    if (dfirst.perm[i] == 0xff)
      dfirst.perm[i] = i;

  if (!d->testing_p)
    dfirst.target = gen_reg_rtx (dfirst.vmode);

  start_sequence ();
  ok = expand_vec_perm_1 (&dfirst);
  seq = get_insns ();
  end_sequence ();

  if (!ok)
    return false;

  if (d->testing_p)
    return true;

  emit_insn (seq);

  dsecond = *d;
  dsecond.op0 = dfirst.target;
  dsecond.op1 = dfirst.target;
  dsecond.one_operand_p = true;
  dsecond.target = gen_reg_rtx (dsecond.vmode);
  for (i = 0; i < nelt; i++)
    dsecond.perm[i] = i ^ nelt2;

  ok = expand_vec_perm_1 (&dsecond);
  gcc_assert (ok);

  blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
  emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
  return true;
}
/* A subroutine of ix86_expand_vec_perm_builtin_1.  Implement a V4DF
   permutation using two vperm2f128, followed by a vshufpd insn blending
   the two vectors together.  */

static bool
expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dfirst, dsecond, dthird;
  bool ok;

  if (!TARGET_AVX || (d->vmode != V4DFmode))
    return false;

  if (d->testing_p)
    return true;

  dfirst = *d;
  dsecond = *d;
  dthird = *d;

  dfirst.perm[0] = (d->perm[0] & ~1);
  dfirst.perm[1] = (d->perm[0] & ~1) + 1;
  dfirst.perm[2] = (d->perm[2] & ~1);
  dfirst.perm[3] = (d->perm[2] & ~1) + 1;
  dsecond.perm[0] = (d->perm[1] & ~1);
  dsecond.perm[1] = (d->perm[1] & ~1) + 1;
  dsecond.perm[2] = (d->perm[3] & ~1);
  dsecond.perm[3] = (d->perm[3] & ~1) + 1;
  dthird.perm[0] = (d->perm[0] % 2);
  dthird.perm[1] = (d->perm[1] % 2) + 4;
  dthird.perm[2] = (d->perm[2] % 2) + 2;
  dthird.perm[3] = (d->perm[3] % 2) + 6;

  dfirst.target = gen_reg_rtx (dfirst.vmode);
  dsecond.target = gen_reg_rtx (dsecond.vmode);
  dthird.op0 = dfirst.target;
  dthird.op1 = dsecond.target;
  dthird.one_operand_p = false;

  canonicalize_perm (&dfirst);
  canonicalize_perm (&dsecond);

  ok = expand_vec_perm_1 (&dfirst)
       && expand_vec_perm_1 (&dsecond)
       && expand_vec_perm_1 (&dthird);

  gcc_assert (ok);

  return true;
}
/* A subroutine of expand_vec_perm_even_odd_1.  Implement the double-word
   permutation with two pshufb insns and an ior.  We should have already
   failed all two instruction sequences.  */

static bool
expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
{
  rtx rperm[2][16], vperm, l, h, op, m128;
  unsigned int i, nelt, eltsz;

  if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
    return false;
  gcc_assert (!d->one_operand_p);

  if (d->testing_p)
    return true;

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate two permutation masks.  If the required element is within
     the given vector it is shuffled into the proper lane.  If the required
     element is in the other vector, force a zero into the lane by setting
     bit 7 in the permutation mask.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < nelt; ++i)
    {
      unsigned j, e = d->perm[i];
      unsigned which = (e >= nelt);
      if (e >= nelt)
	e -= nelt;

      for (j = 0; j < eltsz; ++j)
	{
	  rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
	  rperm[1-which][i*eltsz + j] = m128;
	}
    }

  vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
  vperm = force_reg (V16QImode, vperm);

  l = gen_reg_rtx (V16QImode);
  op = gen_lowpart (V16QImode, d->op0);
  emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));

  vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
  vperm = force_reg (V16QImode, vperm);

  h = gen_reg_rtx (V16QImode);
  op = gen_lowpart (V16QImode, d->op1);
  emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));

  op = d->target;
  if (d->vmode != V16QImode)
    op = gen_reg_rtx (V16QImode);
  emit_insn (gen_iorv16qi3 (op, l, h));
  if (op != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}
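/* An illustration of the two masks: for an extract-even permutation of
   two V16QImode operands, d->perm = { 0, 2, ..., 30 }, the loop above
   produces
     rperm[0] = { 0, 2, ..., 14, -128, ..., -128 }
     rperm[1] = { -128, ..., -128, 0, 2, ..., 14 }
   so the first pshufb gathers the even bytes of op0 into the low half
   and zeroes the rest, the second gathers the even bytes of op1 into
   the high half, and the ior merges the two halves.  */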
/* Implement arbitrary permutation of one V32QImode or V16HImode operand
   with two vpshufb insns, vpermq and vpor.  We should have already failed
   all two or three instruction sequences.  */

static bool
expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
{
  rtx rperm[2][32], vperm, l, h, hp, op, m128;
  unsigned int i, nelt, eltsz;

  if (!TARGET_AVX2
      || !d->one_operand_p
      || (d->vmode != V32QImode && d->vmode != V16HImode))
    return false;

  if (d->testing_p)
    return true;

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate two permutation masks.  If the required element is within
     the same lane, it is shuffled in.  If the required element is from
     the other lane, force a zero by setting bit 7 in the permutation mask.
     The other mask has non-negative elements where an element is
     requested from the other lane, but also moved to the other lane,
     so that the result of vpshufb can have the two V2TImode halves
     swapped.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < nelt; ++i)
    {
      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
      unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;

      for (j = 0; j < eltsz; ++j)
	{
	  rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
	  rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
	}
    }

  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
  vperm = force_reg (V32QImode, vperm);

  h = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op0);
  emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));

  /* Swap the 128-bit lanes of h into hp.  */
  hp = gen_reg_rtx (V4DImode);
  op = gen_lowpart (V4DImode, h);
  emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
				  const1_rtx));

  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
  vperm = force_reg (V32QImode, vperm);

  l = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op0);
  emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));

  op = d->target;
  if (d->vmode != V32QImode)
    op = gen_reg_rtx (V32QImode);
  emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
  if (op != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}
/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   and extract-odd permutations of two V32QImode or V16HImode operands
   with two vpshufb insns, vpor and vpermq.  We should have already
   failed all two or three instruction sequences.  */

static bool
expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
{
  rtx rperm[2][32], vperm, l, h, ior, op, m128;
  unsigned int i, nelt, eltsz;

  if (!TARGET_AVX2
      || d->one_operand_p
      || (d->vmode != V32QImode && d->vmode != V16HImode))
    return false;

  for (i = 0; i < d->nelt; ++i)
    if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
      return false;

  if (d->testing_p)
    return true;

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate two permutation masks.  In the first permutation mask
     the first quarter will contain indexes for the first half
     of the op0, the second quarter will contain bit 7 set, third quarter
     will contain indexes for the second half of the op0 and the
     last quarter bit 7 set.  In the second permutation mask
     the first quarter will contain bit 7 set, the second quarter
     indexes for the first half of the op1, the third quarter bit 7 set
     and last quarter indexes for the second half of the op1.
     I.e. the first mask e.g. for V32QImode extract even will be:
     0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
     (all values masked with 0xf except for -128) and second mask
     for extract even will be
     -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < nelt; ++i)
    {
      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
      unsigned which = d->perm[i] >= nelt;
      unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;

      for (j = 0; j < eltsz; ++j)
	{
	  rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
	  rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
	}
    }

  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
  vperm = force_reg (V32QImode, vperm);

  l = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op0);
  emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));

  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
  vperm = force_reg (V32QImode, vperm);

  h = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op1);
  emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));

  ior = gen_reg_rtx (V32QImode);
  emit_insn (gen_iorv32qi3 (ior, l, h));

  /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation.  */
  op = gen_reg_rtx (V4DImode);
  ior = gen_lowpart (V4DImode, ior);
  emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
				  const1_rtx, GEN_INT (3)));
  emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}
/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
   with two "and" and "pack" or two "shift" and "pack" insns.  We should
   have already failed all two instruction sequences.  */

static bool
expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
{
  rtx op, dop0, dop1, t;
  unsigned i, odd, c, s, nelt = d->nelt;
  bool end_perm = false;
  machine_mode half_mode;
  rtx (*gen_and) (rtx, rtx, rtx);
  rtx (*gen_pack) (rtx, rtx, rtx);
  rtx (*gen_shift) (rtx, rtx, rtx);

  if (d->one_operand_p)
    return false;

  switch (d->vmode)
    {
    case E_V8HImode:
      /* Required for "pack".  */
      if (!TARGET_SSE4_1)
	return false;
      c = 0xffff;
      s = 16;
      half_mode = V4SImode;
      gen_and = gen_andv4si3;
      gen_pack = gen_sse4_1_packusdw;
      gen_shift = gen_lshrv4si3;
      break;
    case E_V16QImode:
      /* No check as all instructions are SSE2.  */
      c = 0xff;
      s = 8;
      half_mode = V8HImode;
      gen_and = gen_andv8hi3;
      gen_pack = gen_sse2_packuswb;
      gen_shift = gen_lshrv8hi3;
      break;
    case E_V16HImode:
      if (!TARGET_AVX2)
	return false;
      c = 0xffff;
      s = 16;
      half_mode = V8SImode;
      gen_and = gen_andv8si3;
      gen_pack = gen_avx2_packusdw;
      gen_shift = gen_lshrv8si3;
      end_perm = true;
      break;
    case E_V32QImode:
      if (!TARGET_AVX2)
	return false;
      c = 0xff;
      s = 8;
      half_mode = V16HImode;
      gen_and = gen_andv16hi3;
      gen_pack = gen_avx2_packuswb;
      gen_shift = gen_lshrv16hi3;
      end_perm = true;
      break;
    default:
      /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
	 general shuffles.  */
      return false;
    }

  /* Check that permutation is even or odd.  */
  odd = d->perm[0];
  if (odd != 0 && odd != 1)
    return false;

  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != 2 * i + odd)
      return false;

  if (d->testing_p)
    return true;

  dop0 = gen_reg_rtx (half_mode);
  dop1 = gen_reg_rtx (half_mode);
  if (odd == 0)
    {
      t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
      t = force_reg (half_mode, t);
      emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
      emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
    }
  else
    {
      emit_insn (gen_shift (dop0,
			    gen_lowpart (half_mode, d->op0),
			    GEN_INT (s)));
      emit_insn (gen_shift (dop1,
			    gen_lowpart (half_mode, d->op1),
			    GEN_INT (s)));
    }
  /* In AVX2 for 256 bit case we need to permute pack result.  */
  if (TARGET_AVX2 && end_perm)
    {
      op = gen_reg_rtx (d->vmode);
      t = gen_reg_rtx (V4DImode);
      emit_insn (gen_pack (op, dop0, dop1));
      emit_insn (gen_avx2_permv4di_1 (t,
				      gen_lowpart (V4DImode, op),
				      const0_rtx, const2_rtx,
				      const1_rtx, GEN_INT (3)));
      emit_move_insn (d->target, gen_lowpart (d->vmode, t));
    }
  else
    emit_insn (gen_pack (d->target, dop0, dop1));

  return true;
}
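/* An illustration for V16QImode: extract-even masks each V8HImode word
   of both operands with 0x00ff and merges the two results with
   packuswb; extract-odd instead shifts each word right by 8 before the
   pack.  Since every surviving value fits in a byte, the unsigned
   saturation of the pack never triggers.  */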
/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   and extract-odd permutations of two V64QI operands
   with two "shift", two "trunc" and one "concat" insns for "odd"
   and two "trunc" and one "concat" insn for "even".  We should have
   already failed all two instruction sequences.  */

static bool
expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
{
  rtx t1, t2, t3, t4;
  unsigned i, odd, nelt = d->nelt;

  if (!TARGET_AVX512BW
      || d->one_operand_p
      || d->vmode != V64QImode)
    return false;

  /* Check that permutation is even or odd.  */
  odd = d->perm[0];
  if (odd != 0 && odd != 1)
    return false;

  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != 2 * i + odd)
      return false;

  if (d->testing_p)
    return true;

  if (odd)
    {
      t1 = gen_reg_rtx (V32HImode);
      t2 = gen_reg_rtx (V32HImode);
      emit_insn (gen_lshrv32hi3 (t1,
				 gen_lowpart (V32HImode, d->op0),
				 GEN_INT (8)));
      emit_insn (gen_lshrv32hi3 (t2,
				 gen_lowpart (V32HImode, d->op1),
				 GEN_INT (8)));
    }
  else
    {
      t1 = gen_lowpart (V32HImode, d->op0);
      t2 = gen_lowpart (V32HImode, d->op1);
    }

  t3 = gen_reg_rtx (V32QImode);
  t4 = gen_reg_rtx (V32QImode);
  emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
  emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
  emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));

  return true;
}
/* A subroutine of ix86_expand_vec_perm_builtin_1.  Implement extract-even
   and extract-odd permutations.  */

static bool
expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
{
  rtx t1, t2, t3, t4, t5;

  switch (d->vmode)
    {
    case E_V4DFmode:
      if (d->testing_p)
	break;
      t1 = gen_reg_rtx (V4DFmode);
      t2 = gen_reg_rtx (V4DFmode);

      /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
      emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
      emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));

      /* Now an unpck[lh]pd will produce the result required.  */
      if (odd)
	t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
      else
	t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
      emit_insn (t3);
      break;

    case E_V8SFmode:
      {
	int mask = odd ? 0xdd : 0x88;

	if (d->testing_p)
	  break;
	t1 = gen_reg_rtx (V8SFmode);
	t2 = gen_reg_rtx (V8SFmode);
	t3 = gen_reg_rtx (V8SFmode);

	/* Shuffle within the 128-bit lanes to produce:
	   { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }.  */
	emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
				      GEN_INT (mask)));

	/* Shuffle the lanes around to produce:
	   { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }.  */
	emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
					    GEN_INT (0x3)));

	/* Shuffle within the 128-bit lanes to produce:
	   { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }.  */
	emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));

	/* Shuffle within the 128-bit lanes to produce:
	   { 8 a c e c e 8 a } | { 9 b d f d f 9 b }.  */
	emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));

	/* Shuffle the lanes around to produce:
	   { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }.  */
	emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
					    GEN_INT (0x20)));
      }
      break;

    case E_V2DFmode:
    case E_V4SFmode:
    case E_V2DImode:
    case E_V4SImode:
      /* These are always directly implementable by expand_vec_perm_1.  */
      gcc_unreachable ();

    case E_V8HImode:
      if (TARGET_SSE4_1)
	return expand_vec_perm_even_odd_pack (d);
      else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
	return expand_vec_perm_pshufb2 (d);
      else
	{
	  if (d->testing_p)
	    break;
	  /* We need 2*log2(N)-1 operations to achieve odd/even
	     with interleave.  */
	  t1 = gen_reg_rtx (V8HImode);
	  t2 = gen_reg_rtx (V8HImode);
	  emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
	  emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
	  emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
	  emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
	  if (odd)
	    t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
	  else
	    t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
	  emit_insn (t3);
	}
      break;

    case E_V16QImode:
      return expand_vec_perm_even_odd_pack (d);

    case E_V16HImode:
    case E_V32QImode:
      return expand_vec_perm_even_odd_pack (d);

    case E_V64QImode:
      return expand_vec_perm_even_odd_trunc (d);

    case E_V4DImode:
      if (!TARGET_AVX2)
	{
	  struct expand_vec_perm_d d_copy = *d;
	  d_copy.vmode = V4DFmode;
	  if (d->testing_p)
	    d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
	  else
	    d_copy.target = gen_reg_rtx (V4DFmode);
	  d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
	  d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
	  if (expand_vec_perm_even_odd_1 (&d_copy, odd))
	    {
	      if (!d->testing_p)
		emit_move_insn (d->target,
				gen_lowpart (V4DImode, d_copy.target));
	      return true;
	    }
	  return false;
	}

      if (d->testing_p)
	break;

      t1 = gen_reg_rtx (V4DImode);
      t2 = gen_reg_rtx (V4DImode);

      /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
      emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
      emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));

      /* Now an vpunpck[lh]qdq will produce the result required.  */
      if (odd)
	t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
      else
	t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
      emit_insn (t3);
      break;

    case E_V8SImode:
      if (!TARGET_AVX2)
	{
	  struct expand_vec_perm_d d_copy = *d;
	  d_copy.vmode = V8SFmode;
	  if (d->testing_p)
	    d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
	  else
	    d_copy.target = gen_reg_rtx (V8SFmode);
	  d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
	  d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
	  if (expand_vec_perm_even_odd_1 (&d_copy, odd))
	    {
	      if (!d->testing_p)
		emit_move_insn (d->target,
				gen_lowpart (V8SImode, d_copy.target));
	      return true;
	    }
	  return false;
	}

      if (d->testing_p)
	break;

      t1 = gen_reg_rtx (V8SImode);
      t2 = gen_reg_rtx (V8SImode);
      t3 = gen_reg_rtx (V4DImode);
      t4 = gen_reg_rtx (V4DImode);
      t5 = gen_reg_rtx (V4DImode);

      /* Shuffle the lanes around into
	 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }.  */
      emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
				    gen_lowpart (V4DImode, d->op1),
				    GEN_INT (0x20)));
      emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
				    gen_lowpart (V4DImode, d->op1),
				    GEN_INT (0x31)));

      /* Swap the 2nd and 3rd position in each lane into
	 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }.  */
      emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
				    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
      emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
				    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));

      /* Now an vpunpck[lh]qdq will produce
	 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }.  */
      if (odd)
	t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
					   gen_lowpart (V4DImode, t2));
      else
	t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
					  gen_lowpart (V4DImode, t2));
      emit_insn (t3);
      emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
      break;

    default:
      gcc_unreachable ();
    }

  return true;
}
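/* Tracing the V8HImode interleave fallback above with
   op0 = { 0 1 2 3 4 5 6 7 } and op1 = { 8 9 a b c d e f }: the first
   pair of interleaves yields { 0 8 1 9 2 a 3 b } and { 4 c 5 d 6 e 7 f },
   the second pair { 0 4 8 c 1 5 9 d } and { 2 6 a e 3 7 b f }, and the
   final interleave gives { 0 2 4 6 8 a c e } (even) or
   { 1 3 5 7 9 b d f } (odd) -- 2*log2(8)-1 = 5 insns in total.  */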
/* A subroutine of ix86_expand_vec_perm_builtin_1.  Pattern match
   extract-even and extract-odd permutations.  */

static bool
expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
{
  unsigned i, odd, nelt = d->nelt;

  odd = d->perm[0];
  if (odd != 0 && odd != 1)
    return false;

  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != 2 * i + odd)
      return false;

  return expand_vec_perm_even_odd_1 (d, odd);
}
/* A subroutine of ix86_expand_vec_perm_builtin_1.  Implement broadcast
   permutations.  We assume that expand_vec_perm_1 has already failed.  */

static bool
expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
{
  unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
  machine_mode vmode = d->vmode;
  unsigned char perm2[4];
  rtx op0 = d->op0, dest;
  bool ok;

  switch (vmode)
    {
    case E_V4DFmode:
    case E_V8SFmode:
      /* These are special-cased in sse.md so that we can optionally
	 use the vbroadcast instruction.  They expand to two insns
	 if the input happens to be in a register.  */
      gcc_unreachable ();

    case E_V2DFmode:
    case E_V2DImode:
    case E_V4SFmode:
    case E_V4SImode:
      /* These are always implementable using standard shuffle patterns.  */
      gcc_unreachable ();

    case E_V8HImode:
    case E_V16QImode:
      /* These can be implemented via interleave.  We save one insn by
	 stopping once we have promoted to V4SImode and then use pshufd.  */
      if (d->testing_p)
	return true;
      do
	{
	  rtx (*gen) (rtx, rtx, rtx)
	    = vmode == V16QImode ? gen_vec_interleave_lowv16qi
				 : gen_vec_interleave_lowv8hi;

	  if (elt >= nelt2)
	    {
	      gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
				       : gen_vec_interleave_highv8hi;
	      elt -= nelt2;
	    }
	  nelt2 /= 2;

	  dest = gen_reg_rtx (vmode);
	  emit_insn (gen (dest, op0, op0));
	  vmode = get_mode_wider_vector (vmode);
	  op0 = gen_lowpart (vmode, dest);
	}
      while (vmode != V4SImode);

      memset (perm2, elt, 4);
      dest = gen_reg_rtx (V4SImode);
      ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
      gcc_assert (ok);

      emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
      return true;

    case E_V64QImode:
    case E_V32QImode:
    case E_V16HImode:
    case E_V8SImode:
    case E_V4DImode:
      /* For AVX2 broadcasts of the first element vpbroadcast* or
	 vpermq should be used by expand_vec_perm_1.  */
      gcc_assert (!TARGET_AVX2 || d->perm[0]);
      return false;

    default:
      gcc_unreachable ();
    }
}
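/* An example of the interleave ladder above: to broadcast byte 5 of a
   V16QImode operand, punpcklbw duplicates the low bytes
   ({ b0 b0 b1 b1 ... b7 b7 }), placing the target byte in word 5;
   punpckhwd then duplicates the high words, leaving a V4SImode value
   whose dword 1 is b5 repeated four times; a final pshufd { 1, 1, 1, 1 }
   broadcasts it.  Stopping at V4SImode costs three insns instead of the
   four interleaves a full promotion would need.  */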
/* A subroutine of ix86_expand_vec_perm_builtin_1.  Pattern match
   broadcast permutations.  */

static bool
expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
{
  unsigned i, elt, nelt = d->nelt;

  if (!d->one_operand_p)
    return false;

  elt = d->perm[0];
  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != elt)
      return false;

  return expand_vec_perm_broadcast_1 (d);
}
/* Implement arbitrary permutations of two V64QImode operands
   with 2 vperm[it]2w, 2 vpshufb and one vpor instruction.  */

static bool
expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
{
  if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
    return false;

  if (d->testing_p)
    return true;

  struct expand_vec_perm_d ds[2];
  rtx rperm[128], vperm, target0, target1;
  unsigned int i, nelt;
  machine_mode vmode;

  nelt = d->nelt;
  vmode = V64QImode;

  for (i = 0; i < 2; i++)
    {
      ds[i] = *d;
      ds[i].vmode = V32HImode;
      ds[i].nelt = 32;
      ds[i].target = gen_reg_rtx (V32HImode);
      ds[i].op0 = gen_lowpart (V32HImode, d->op0);
      ds[i].op1 = gen_lowpart (V32HImode, d->op1);
    }

  /* Prepare permutations such that the first one takes care of
     putting the even bytes into the right positions or one higher
     positions (ds[0]) and the second one takes care of
     putting the odd bytes into the right positions or one below
     (ds[1]).  */

  for (i = 0; i < nelt; i++)
    {
      ds[i & 1].perm[i / 2] = d->perm[i] / 2;
      if (i & 1)
	{
	  rperm[i] = constm1_rtx;
	  rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
	}
      else
	{
	  rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
	  rperm[i + 64] = constm1_rtx;
	}
    }

  bool ok = expand_vec_perm_1 (&ds[0]);
  gcc_assert (ok);
  ds[0].target = gen_lowpart (V64QImode, ds[0].target);

  ok = expand_vec_perm_1 (&ds[1]);
  gcc_assert (ok);
  ds[1].target = gen_lowpart (V64QImode, ds[1].target);

  vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
  vperm = force_reg (vmode, vperm);
  target0 = gen_reg_rtx (V64QImode);
  emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));

  vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
  vperm = force_reg (vmode, vperm);
  target1 = gen_reg_rtx (V64QImode);
  emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));

  emit_insn (gen_iorv64qi3 (d->target, target0, target1));
  return true;
}
/* Implement arbitrary permutation of two V32QImode or V16HImode operands
   with 4 vpshufb insns, 2 vpermq and 3 vpor.  We should have already failed
   all the shorter instruction sequences.  */

static bool
expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
{
  rtx rperm[4][32], vperm, l[2], h[2], op, m128;
  unsigned int i, nelt, eltsz;
  bool used[4];

  if (!TARGET_AVX2
      || d->one_operand_p
      || (d->vmode != V32QImode && d->vmode != V16HImode))
    return false;

  if (d->testing_p)
    return true;

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate 4 permutation masks.  If the required element is within
     the same lane, it is shuffled in.  If the required element is from
     the other lane, force a zero by setting bit 7 in the permutation mask.
     The other mask has non-negative elements where an element is
     requested from the other lane, but also moved to the other lane,
     so that the result of vpshufb can have the two V2TImode halves
     swapped.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < 32; ++i)
    {
      rperm[0][i] = m128;
      rperm[1][i] = m128;
      rperm[2][i] = m128;
      rperm[3][i] = m128;
    }
  used[0] = false;
  used[1] = false;
  used[2] = false;
  used[3] = false;
  for (i = 0; i < nelt; ++i)
    {
      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
      unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
      unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);

      for (j = 0; j < eltsz; ++j)
	rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
      used[which] = true;
    }

  for (i = 0; i < 2; ++i)
    {
      if (!used[2 * i + 1])
	{
	  h[i] = NULL_RTX;
	  continue;
	}
      vperm = gen_rtx_CONST_VECTOR (V32QImode,
				    gen_rtvec_v (32, rperm[2 * i + 1]));
      vperm = force_reg (V32QImode, vperm);
      h[i] = gen_reg_rtx (V32QImode);
      op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
      emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
    }

  /* Swap the 128-bit lanes of h[X].  */
  for (i = 0; i < 2; ++i)
    {
      if (h[i] == NULL_RTX)
	continue;
      op = gen_reg_rtx (V4DImode);
      emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
				      const2_rtx, GEN_INT (3), const0_rtx,
				      const1_rtx));
      h[i] = gen_lowpart (V32QImode, op);
    }

  for (i = 0; i < 2; ++i)
    {
      if (!used[2 * i])
	{
	  l[i] = NULL_RTX;
	  continue;
	}
      vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
      vperm = force_reg (V32QImode, vperm);
      l[i] = gen_reg_rtx (V32QImode);
      op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
      emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
    }

  for (i = 0; i < 2; ++i)
    {
      if (h[i] && l[i])
	{
	  op = gen_reg_rtx (V32QImode);
	  emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
	  l[i] = op;
	}
      else if (h[i])
	l[i] = h[i];
    }

  gcc_assert (l[0] && l[1]);
  op = d->target;
  if (d->vmode != V32QImode)
    op = gen_reg_rtx (V32QImode);
  emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
  if (op != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}
/* The guts of ix86_vectorize_vec_perm_const.  With all of the interface bits
   taken care of, perform the expansion in D and return true on success.  */

static bool
ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
{
  /* Try a single instruction expansion.  */
  if (expand_vec_perm_1 (d))
    return true;

  /* Try sequences of two instructions.  */

  if (expand_vec_perm_pshuflw_pshufhw (d))
    return true;

  if (expand_vec_perm_palignr (d, false))
    return true;

  if (expand_vec_perm_interleave2 (d))
    return true;

  if (expand_vec_perm_broadcast (d))
    return true;

  if (expand_vec_perm_vpermq_perm_1 (d))
    return true;

  if (expand_vec_perm_vperm2f128 (d))
    return true;

  if (expand_vec_perm_pblendv (d))
    return true;

  /* Try sequences of three instructions.  */

  if (expand_vec_perm_even_odd_pack (d))
    return true;

  if (expand_vec_perm_2vperm2f128_vshuf (d))
    return true;

  if (expand_vec_perm_pshufb2 (d))
    return true;

  if (expand_vec_perm_interleave3 (d))
    return true;

  if (expand_vec_perm_vperm2f128_vblend (d))
    return true;

  /* Try sequences of four instructions.  */

  if (expand_vec_perm_even_odd_trunc (d))
    return true;

  if (expand_vec_perm_vpshufb2_vpermq (d))
    return true;

  if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
    return true;

  if (expand_vec_perm_vpermt2_vpshub2 (d))
    return true;

  /* ??? Look for narrow permutations whose element orderings would
     allow the promotion to a wider mode.  */

  /* ??? Look for sequences of interleave or a wider permute that place
     the data into the correct lanes for a half-vector shuffle like
     pshuf[lh]w or vpermilps.  */

  /* ??? Look for sequences of interleave that produce the desired results.
     The combinatorics of punpck[lh] get pretty ugly...  */

  if (expand_vec_perm_even_odd (d))
    return true;

  /* Even longer sequences.  */
  if (expand_vec_perm_vpshufb4_vpermq2 (d))
    return true;

  /* See if we can get the same permutation in a different vector integer
     mode.  */
  struct expand_vec_perm_d nd;
  if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
    {
      if (!d->testing_p)
	emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
      return true;
    }

  return false;
}
/* If a permutation only uses one operand, make it clear.  Returns true
   if the permutation references both operands.  */

static bool
canonicalize_perm (struct expand_vec_perm_d *d)
{
  int i, which, nelt = d->nelt;

  for (i = which = 0; i < nelt; ++i)
    which |= (d->perm[i] < nelt ? 1 : 2);

  d->one_operand_p = true;
  switch (which)
    {
    default:
      gcc_unreachable ();

    case 3:
      if (!rtx_equal_p (d->op0, d->op1))
	{
	  d->one_operand_p = false;
	  break;
	}
      /* The elements of PERM do not suggest that only the first operand
	 is used, but both operands are identical.  Allow easier matching
	 of the permutation by folding the permutation into the single
	 input vector.  */
      /* FALLTHRU */

    case 2:
      for (i = 0; i < nelt; ++i)
	d->perm[i] &= nelt - 1;
      d->op0 = d->op1;
      break;

    case 1:
      d->op1 = d->op0;
      break;
    }

  return (which == 3);
}
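/* For example, the selector { 4, 5, 6, 7 } on a V4SImode operand pair
   has which == 2 (only the second operand referenced), so the loop
   above masks each index with nelt - 1 to give { 0, 1, 2, 3 } and sets
   op0 = op1; later matchers then only ever see a one-operand
   permutation.  */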
/* Implement TARGET_VECTORIZE_VEC_PERM_CONST.  */

bool
ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
			       rtx op1, const vec_perm_indices &sel)
{
  struct expand_vec_perm_d d;
  unsigned char perm[MAX_VECT_LEN];
  unsigned int i, nelt, which;
  bool two_args;

  d.target = target;
  d.op0 = op0;
  d.op1 = op1;

  d.vmode = vmode;
  gcc_assert (VECTOR_MODE_P (d.vmode));
  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
  d.testing_p = !target;

  gcc_assert (sel.length () == nelt);
  gcc_checking_assert (sizeof (d.perm) == sizeof (perm));

  /* Given sufficient ISA support we can just return true here
     for selected vector modes.  */
  switch (d.vmode)
    {
    case E_V16SFmode:
    case E_V16SImode:
    case E_V8DImode:
    case E_V8DFmode:
      if (!TARGET_AVX512F)
	return false;
      /* All implementable with a single vperm[it]2 insn.  */
      if (d.testing_p)
	return true;
      break;
    case E_V32HImode:
      if (!TARGET_AVX512BW)
	return false;
      if (d.testing_p)
	/* All implementable with a single vperm[it]2 insn.  */
	return true;
      break;
    case E_V64QImode:
      if (!TARGET_AVX512BW)
	return false;
      if (d.testing_p)
	/* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn.  */
	return true;
      break;
    case E_V8SImode:
    case E_V8SFmode:
    case E_V4DFmode:
    case E_V4DImode:
      if (d.testing_p && TARGET_AVX512VL)
	/* All implementable with a single vperm[it]2 insn.  */
	return true;
      break;
    case E_V16HImode:
      if (d.testing_p && TARGET_AVX2)
	/* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns.  */
	return true;
      break;
    case E_V32QImode:
      if (d.testing_p && TARGET_AVX2)
	/* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns.  */
	return true;
      break;
    case E_V8HImode:
    case E_V16QImode:
      /* Fall through.  */
    case E_V4SImode:
    case E_V4SFmode:
      /* All implementable with a single vpperm insn.  */
      if (d.testing_p && TARGET_XOP)
	return true;
      /* All implementable with 2 pshufb + 1 ior.  */
      if (d.testing_p && TARGET_SSSE3)
	return true;
      break;
    case E_V2DImode:
    case E_V2DFmode:
      /* All implementable with shufpd or unpck[lh]pd.  */
      if (d.testing_p)
	return true;
      break;
    default:
      return false;
    }

  /* Extract the values from the vector CST into the permutation
     array in D.  */
  for (i = which = 0; i < nelt; ++i)
    {
      unsigned char e = sel[i];
      gcc_assert (e < 2 * nelt);
      d.perm[i] = e;
      perm[i] = e;
      which |= (e < nelt ? 1 : 2);
    }

  if (d.testing_p)
    {
      /* For all elements from second vector, fold the elements to first.  */
      if (which == 2)
	for (i = 0; i < nelt; ++i)
	  d.perm[i] -= nelt;

      /* Check whether the mask can be applied to the vector type.  */
      d.one_operand_p = (which != 3);

      /* Implementable with shufps or pshufd.  */
      if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
	return true;

      /* Otherwise we have to go through the motions and see if we can
	 figure out how to generate the requested permutation.  */
      d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
      d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
      if (!d.one_operand_p)
	d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);

      start_sequence ();
      bool ret = ix86_expand_vec_perm_const_1 (&d);
      end_sequence ();

      return ret;
    }

  two_args = canonicalize_perm (&d);

  if (ix86_expand_vec_perm_const_1 (&d))
    return true;

  /* If the selector says both arguments are needed, but the operands are the
     same, the above tried to expand with one_operand_p and flattened selector.
     If that didn't work, retry without one_operand_p; we succeeded with that
     during testing.  */
  if (two_args && d.one_operand_p)
    {
      d.one_operand_p = false;
      memcpy (d.perm, perm, sizeof (perm));
      return ix86_expand_vec_perm_const_1 (&d);
    }

  return false;
}
void
ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
{
  struct expand_vec_perm_d d;
  unsigned i, nelt;

  d.target = targ;
  d.op0 = op0;
  d.op1 = op1;
  d.vmode = GET_MODE (targ);
  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
  d.one_operand_p = false;
  d.testing_p = false;

  for (i = 0; i < nelt; ++i)
    d.perm[i] = i * 2 + odd;

  /* We'll either be able to implement the permutation directly...  */
  if (expand_vec_perm_1 (&d))
    return;

  /* ... or we use the special-case patterns.  */
  expand_vec_perm_even_odd_1 (&d, odd);
}
void
ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
{
  struct expand_vec_perm_d d;
  unsigned i, nelt, base;
  bool ok;

  d.target = targ;
  d.op0 = op0;
  d.op1 = op1;
  d.vmode = GET_MODE (targ);
  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
  d.one_operand_p = false;
  d.testing_p = false;

  base = high_p ? nelt / 2 : 0;
  for (i = 0; i < nelt / 2; ++i)
    {
      d.perm[i * 2] = i + base;
      d.perm[i * 2 + 1] = i + base + nelt;
    }

  /* Note that for AVX this isn't one instruction.  */
  ok = ix86_expand_vec_perm_const_1 (&d);
  gcc_assert (ok);
}
/* Expand a vector operation CODE for a V*QImode in terms of the
   same operation on V*HImode.  */

void
ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
{
  machine_mode qimode = GET_MODE (dest);
  machine_mode himode;
  rtx (*gen_il) (rtx, rtx, rtx);
  rtx (*gen_ih) (rtx, rtx, rtx);
  rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
  struct expand_vec_perm_d d;
  bool ok, full_interleave;
  bool uns_p = false;
  int i;

  switch (qimode)
    {
    case E_V16QImode:
      himode = V8HImode;
      gen_il = gen_vec_interleave_lowv16qi;
      gen_ih = gen_vec_interleave_highv16qi;
      break;
    case E_V32QImode:
      himode = V16HImode;
      gen_il = gen_avx2_interleave_lowv32qi;
      gen_ih = gen_avx2_interleave_highv32qi;
      break;
    case E_V64QImode:
      himode = V32HImode;
      gen_il = gen_avx512bw_interleave_lowv64qi;
      gen_ih = gen_avx512bw_interleave_highv64qi;
      break;
    default:
      gcc_unreachable ();
    }

  op2_l = op2_h = op2;
  switch (code)
    {
    case MULT:
      /* Unpack data such that we've got a source byte in each low byte of
	 each word.  We don't care what goes into the high byte of each word.
	 Rather than trying to get zero in there, most convenient is to let
	 it be a copy of the low byte.  */
      op2_l = gen_reg_rtx (qimode);
      op2_h = gen_reg_rtx (qimode);
      emit_insn (gen_il (op2_l, op2, op2));
      emit_insn (gen_ih (op2_h, op2, op2));

      op1_l = gen_reg_rtx (qimode);
      op1_h = gen_reg_rtx (qimode);
      emit_insn (gen_il (op1_l, op1, op1));
      emit_insn (gen_ih (op1_h, op1, op1));
      full_interleave = qimode == V16QImode;
      break;

    case ASHIFT:
    case LSHIFTRT:
      uns_p = true;
      /* FALLTHRU */
    case ASHIFTRT:
      op1_l = gen_reg_rtx (himode);
      op1_h = gen_reg_rtx (himode);
      ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
      ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
      full_interleave = true;
      break;
    default:
      gcc_unreachable ();
    }

  /* Perform the operation.  */
  res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
			       1, OPTAB_DIRECT);
  res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
			       1, OPTAB_DIRECT);
  gcc_assert (res_l && res_h);

  /* Merge the data back into the right place.  */
  d.target = dest;
  d.op0 = gen_lowpart (qimode, res_l);
  d.op1 = gen_lowpart (qimode, res_h);
  d.vmode = qimode;
  d.nelt = GET_MODE_NUNITS (qimode);
  d.one_operand_p = false;
  d.testing_p = false;

  if (full_interleave)
    {
      /* For SSE2, we used a full interleave, so the desired
	 results are in the even elements.  */
      for (i = 0; i < d.nelt; ++i)
	d.perm[i] = i * 2;
    }
  else
    {
      /* For AVX, the interleave used above was not cross-lane.  So the
	 extraction is evens but with the second and third quarter swapped.
	 Happily, that is even one insn shorter than even extraction.
	 For AVX512BW we have 4 lanes.  We extract evens from within a lane,
	 always first from the first and then from the second source operand,
	 the index bits above the low 4 bits remain the same.
	 Thus, for d.nelt == 32 we want permutation
	 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
	 and for d.nelt == 64 we want permutation
	 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
	 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126.  */
      for (i = 0; i < d.nelt; ++i)
	d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
    }

  ok = ix86_expand_vec_perm_const_1 (&d);
  gcc_assert (ok);

  set_unique_reg_note (get_last_insn (), REG_EQUAL,
		       gen_rtx_fmt_ee (code, qimode, op1, op2));
}
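/* Spot-checking the permutation formula above: for d.nelt == 32, i == 8
   gives ((16) & 14) + d.nelt + (8 & ~15) = 0 + 32 + 0 = 32, the ninth
   entry of the documented sequence, and i == 16 gives
   ((32) & 14) + 0 + 16 = 16, the start of the third quarter.  */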
/* Helper function of ix86_expand_mul_widen_evenodd.  Return true
   if op is CONST_VECTOR with all odd elements equal to their
   preceding element.  */

static bool
const_vector_equal_evenodd_p (rtx op)
{
  machine_mode mode = GET_MODE (op);
  int i, nunits = GET_MODE_NUNITS (mode);
  if (GET_CODE (op) != CONST_VECTOR
      || nunits != CONST_VECTOR_NUNITS (op))
    return false;
  for (i = 0; i < nunits; i += 2)
    if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
      return false;
  return true;
}
void
ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
			       bool uns_p, bool odd_p)
{
  machine_mode mode = GET_MODE (op1);
  machine_mode wmode = GET_MODE (dest);
  rtx x;
  rtx orig_op1 = op1, orig_op2 = op2;

  if (!nonimmediate_operand (op1, mode))
    op1 = force_reg (mode, op1);
  if (!nonimmediate_operand (op2, mode))
    op2 = force_reg (mode, op2);

  /* We only play even/odd games with vectors of SImode.  */
  gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);

  /* If we're looking for the odd results, shift those members down to
     the even slots.  For some cpus this is faster than a PSHUFD.  */
  if (odd_p)
    {
      /* For XOP use vpmacsdqh, but only for smult, as it is only
	 signed.  */
      if (TARGET_XOP && mode == V4SImode && !uns_p)
	{
	  x = force_reg (wmode, CONST0_RTX (wmode));
	  emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
	  return;
	}

      x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
      if (!const_vector_equal_evenodd_p (orig_op1))
	op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
			    x, NULL, 1, OPTAB_DIRECT);
      if (!const_vector_equal_evenodd_p (orig_op2))
	op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
			    x, NULL, 1, OPTAB_DIRECT);
      op1 = gen_lowpart (mode, op1);
      op2 = gen_lowpart (mode, op2);
    }

  if (mode == V16SImode)
    {
      if (uns_p)
	x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
      else
	x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
    }
  else if (mode == V8SImode)
    {
      if (uns_p)
	x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
      else
	x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
    }
  else if (uns_p)
    x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
  else if (TARGET_SSE4_1)
    x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
  else
    {
      rtx s1, s2, t0, t1, t2;

      /* The easiest way to implement this without PMULDQ is to go through
	 the motions as if we are performing a full 64-bit multiply.  With
	 the exception that we need to do less shuffling of the elements.  */

      /* Compute the sign-extension, aka highparts, of the two operands.  */
      s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
				op1, pc_rtx, pc_rtx);
      s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
				op2, pc_rtx, pc_rtx);

      /* Multiply LO(A) * HI(B), and vice-versa.  */
      t1 = gen_reg_rtx (wmode);
      t2 = gen_reg_rtx (wmode);
      emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
      emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));

      /* Multiply LO(A) * LO(B).  */
      t0 = gen_reg_rtx (wmode);
      emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));

      /* Combine and shift the highparts into place.  */
      t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
      t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
			 1, OPTAB_DIRECT);

      /* Combine high and low parts.  */
      force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
      return;
    }
  emit_insn (x);
}
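/* The identity behind the SSE2 fallback above: with per-element sign
   masks s1 = (a < 0 ? -1 : 0) and s2 = (b < 0 ? -1 : 0), which are
   exactly the high parts of the sign-extended operands,
     a * b = lo(a)*lo(b) + ((s1*lo(b) + s2*lo(a)) << 32)   (mod 2^64)
   so three unsigned PMULUDQ multiplies, one add and one shift suffice;
   the s1*s2 term would overflow entirely out of the 64-bit result.  */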
void
ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
			    bool uns_p, bool high_p)
{
  machine_mode wmode = GET_MODE (dest);
  machine_mode mode = GET_MODE (op1);
  rtx t1, t2, t3, t4, mask;

  switch (mode)
    {
    case E_V4SImode:
      t1 = gen_reg_rtx (mode);
      t2 = gen_reg_rtx (mode);
      if (TARGET_XOP && !uns_p)
	{
	  /* With XOP, we have pmacsdqh, aka mul_widen_odd.  In this case,
	     shuffle the elements once so that all elements are in the right
	     place for immediate use: { A C B D }.  */
	  emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
					const1_rtx, GEN_INT (3)));
	  emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
					const1_rtx, GEN_INT (3)));
	}
      else
	{
	  /* Put the elements into place for the multiply.  */
	  ix86_expand_vec_interleave (t1, op1, op1, high_p);
	  ix86_expand_vec_interleave (t2, op2, op2, high_p);
	  high_p = false;
	}
      ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
      break;

    case E_V8SImode:
      /* Shuffle the elements between the lanes.  After this we
	 have { A B E F | C D G H } for each operand.  */
      t1 = gen_reg_rtx (V4DImode);
      t2 = gen_reg_rtx (V4DImode);
      emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
				      const0_rtx, const2_rtx,
				      const1_rtx, GEN_INT (3)));
      emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
				      const0_rtx, const2_rtx,
				      const1_rtx, GEN_INT (3)));

      /* Shuffle the elements within the lanes.  After this we
	 have { A A B B | C C D D } or { E E F F | G G H H }.  */
      t3 = gen_reg_rtx (V8SImode);
      t4 = gen_reg_rtx (V8SImode);
      mask = GEN_INT (high_p
		      ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
		      : 0 + (0 << 2) + (1 << 4) + (1 << 6));
      emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
      emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));

      ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
      break;

    case E_V8HImode:
    case E_V16HImode:
      t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
			 uns_p, OPTAB_DIRECT);
      t2 = expand_binop (mode,
			 uns_p ? umul_highpart_optab : smul_highpart_optab,
			 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
      gcc_assert (t1 && t2);

      t3 = gen_reg_rtx (mode);
      ix86_expand_vec_interleave (t3, t1, t2, high_p);
      emit_move_insn (dest, gen_lowpart (wmode, t3));
      break;

    case E_V16QImode:
    case E_V32QImode:
    case E_V32HImode:
    case E_V64QImode:
      t1 = gen_reg_rtx (wmode);
      t2 = gen_reg_rtx (wmode);
      ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
      ix86_expand_sse_unpack (t2, op2, uns_p, high_p);

      emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
      break;

    default:
      gcc_unreachable ();
    }
}
void
ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
{
  rtx res_1, res_2, res_3, res_4;

  res_1 = gen_reg_rtx (V4SImode);
  res_2 = gen_reg_rtx (V4SImode);
  res_3 = gen_reg_rtx (V2DImode);
  res_4 = gen_reg_rtx (V2DImode);
  ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
  ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);

  /* Move the results in element 2 down to element 1; we don't care
     what goes in elements 2 and 3.  Then we can merge the parts
     back together with an interleave.

     Note that two other sequences were tried:
     (1) Use interleaves at the start instead of psrldq, which allows
     us to use a single shufps to merge things back at the end.
     (2) Use shufps here to combine the two vectors, then pshufd to
     put the elements in the correct order.
     In both cases the cost of the reformatting stall was too high
     and the overall sequence slower.  */

  emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
				const0_rtx, const2_rtx,
				const0_rtx, const0_rtx));
  emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
				const0_rtx, const2_rtx,
				const0_rtx, const0_rtx));
  res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));

  set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
}
void
ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
{
  machine_mode mode = GET_MODE (op0);
  rtx t1, t2, t3, t4, t5, t6;

  if (TARGET_AVX512DQ && mode == V8DImode)
    emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
  else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
    emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
  else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
    emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
  else if (TARGET_XOP && mode == V2DImode)
    {
      /* op1: A,B,C,D, op2: E,F,G,H */
      op1 = gen_lowpart (V4SImode, op1);
      op2 = gen_lowpart (V4SImode, op2);

      t1 = gen_reg_rtx (V4SImode);
      t2 = gen_reg_rtx (V4SImode);
      t3 = gen_reg_rtx (V2DImode);
      t4 = gen_reg_rtx (V2DImode);

      /* t1: B,A,D,C */
      emit_insn (gen_sse2_pshufd_1 (t1, op1,
				    GEN_INT (1),
				    GEN_INT (0),
				    GEN_INT (3),
				    GEN_INT (2)));

      /* t2: (B*E),(A*F),(D*G),(C*H) */
      emit_insn (gen_mulv4si3 (t2, t1, op2));

      /* t3: (B*E)+(A*F), (D*G)+(C*H) */
      emit_insn (gen_xop_phadddq (t3, t2));

      /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
      emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));

      /* Multiply lower parts and add all.  */
      t5 = gen_reg_rtx (V2DImode);
      emit_insn (gen_vec_widen_umult_even_v4si (t5,
						gen_lowpart (V4SImode, op1),
						gen_lowpart (V4SImode, op2)));
      op0 = expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
    }
  else
    {
      machine_mode nmode;
      rtx (*umul) (rtx, rtx, rtx);

      if (mode == V2DImode)
	{
	  umul = gen_vec_widen_umult_even_v4si;
	  nmode = V4SImode;
	}
      else if (mode == V4DImode)
	{
	  umul = gen_vec_widen_umult_even_v8si;
	  nmode = V8SImode;
	}
      else if (mode == V8DImode)
	{
	  umul = gen_vec_widen_umult_even_v16si;
	  nmode = V16SImode;
	}
      else
	gcc_unreachable ();

      /* Multiply low parts.  */
      t1 = gen_reg_rtx (mode);
      emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));

      /* Shift input vectors right 32 bits so we can multiply high parts.  */
      t6 = GEN_INT (32);
      t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
      t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);

      /* Multiply high parts by low parts.  */
      t4 = gen_reg_rtx (mode);
      t5 = gen_reg_rtx (mode);
      emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
      emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));

      /* Combine and shift the highparts back.  */
      t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
      t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);

      /* Combine high and low parts.  */
      force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
    }

  set_unique_reg_note (get_last_insn (), REG_EQUAL,
		       gen_rtx_MULT (mode, op1, op2));
}
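/* The generic path above implements, per double-word element,
     (hi1*2^32 + lo1) * (hi2*2^32 + lo2)
       = lo1*lo2 + ((hi1*lo2 + hi2*lo1) << 32)   (mod 2^64)
   using three unsigned 32x32->64 widening multiplies; the hi1*hi2 term
   overflows entirely out of the 64-bit result and is dropped.  */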
/* Return 1 if control transfer instruction INSN
   should be encoded with notrack prefix.  */

bool
ix86_notrack_prefixed_insn_p (rtx insn)
{
  if (!insn || !(flag_cf_protection & CF_BRANCH))
    return false;

  if (CALL_P (insn))
    {
      rtx call = get_call_rtx_from (insn);
      gcc_assert (call != NULL_RTX);
      rtx addr = XEXP (call, 0);

      /* Do not emit 'notrack' if it's not an indirect call.  */
      if (MEM_P (addr)
	  && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
	return false;
      else
	return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
    }

  if (JUMP_P (insn) && !flag_cet_switch)
    {
      rtx target = JUMP_LABEL (insn);
      if (target == NULL_RTX || ANY_RETURN_P (target))
	return false;

      /* Check the jump is a switch table.  */
      rtx_insn *label = as_a<rtx_insn *> (target);
      rtx_insn *table = next_insn (label);
      if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
	return false;
      else
	return true;
    }
  return false;
}
/* Calculate integer abs() using only SSE2 instructions.  */

void
ix86_expand_sse2_abs (rtx target, rtx input)
{
  machine_mode mode = GET_MODE (target);
  rtx tmp0, tmp1, x;

  switch (mode)
    {
    case E_V2DImode:
    case E_V4DImode:
      /* For 64-bit signed integer X, with SSE4.2 use
	 pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
	 Otherwise handle it similarly to V4SImode, except use 64 as W instead
	 of 32 and use logical instead of arithmetic right shift (which is
	 unimplemented) and subtract.  */
      if (TARGET_SSE4_2)
	{
	  tmp0 = gen_reg_rtx (mode);
	  tmp1 = gen_reg_rtx (mode);
	  emit_move_insn (tmp1, CONST0_RTX (mode));
	  if (mode == E_V2DImode)
	    emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input));
	  else
	    emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input));
	}
      else
	{
	  tmp0 = expand_simple_binop (mode, LSHIFTRT, input,
				      GEN_INT (GET_MODE_UNIT_BITSIZE (mode)
					       - 1), NULL, 0, OPTAB_DIRECT);
	  tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false);
	}

      tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
				  NULL, 0, OPTAB_DIRECT);
      x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
			       target, 0, OPTAB_DIRECT);
      break;

    case E_V4SImode:
      /* For 32-bit signed integer X, the best way to calculate the absolute
	 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)).  */
      tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
				  GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
				  NULL, 0, OPTAB_DIRECT);
      tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
				  NULL, 0, OPTAB_DIRECT);
      x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
			       target, 0, OPTAB_DIRECT);
      break;

    case E_V8HImode:
      /* For 16-bit signed integer X, the best way to calculate the absolute
	 value of X is max (X, -X), as SSE2 provides the PMAXSW insn.  */
      tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);

      x = expand_simple_binop (mode, SMAX, tmp0, input,
			       target, 0, OPTAB_DIRECT);
      break;

    case E_V16QImode:
      /* For 8-bit signed integer X, the best way to calculate the absolute
	 value of X is min ((unsigned char) X, (unsigned char) (-X)),
	 as SSE2 provides the PMINUB insn.  */
      tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);

      x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
			       target, 0, OPTAB_DIRECT);
      break;

    default:
      gcc_unreachable ();
    }

  if (x != target)
    emit_move_insn (target, x);
}
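/* A worked example of the shift/xor/subtract pattern: for x = -5 in
   V4SImode, tmp0 = x >> 31 = -1, tmp1 = x ^ -1 = 4 and
   tmp1 - tmp0 = 4 - (-1) = 5; for non-negative x, tmp0 = 0 and the
   expression reduces to x itself.  */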
/* Expand an extract from a vector register through pextr insn.
   Return true if successful.  */

bool
ix86_expand_pextr (rtx *operands)
{
  rtx dst = operands[0];
  rtx src = operands[1];

  unsigned int size = INTVAL (operands[2]);
  unsigned int pos = INTVAL (operands[3]);

  if (SUBREG_P (dst))
    {
      /* Reject non-lowpart subregs.  */
      if (SUBREG_BYTE (dst) > 0)
	return false;
      dst = SUBREG_REG (dst);
    }

  if (SUBREG_P (src))
    {
      pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
      src = SUBREG_REG (src);
    }

  switch (GET_MODE (src))
    {
    case E_V16QImode:
    case E_V8HImode:
    case E_V4SImode:
    case E_V2DImode:
    case E_V1TImode:
    case E_TImode:
      {
	machine_mode srcmode, dstmode;
	rtx d, pat;

	if (!int_mode_for_size (size, 0).exists (&dstmode))
	  return false;

	switch (dstmode)
	  {
	  case E_QImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    srcmode = V16QImode;
	    break;

	  case E_HImode:
	    if (!TARGET_SSE2)
	      return false;
	    srcmode = V8HImode;
	    break;

	  case E_SImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    srcmode = V4SImode;
	    break;

	  case E_DImode:
	    gcc_assert (TARGET_64BIT);
	    if (!TARGET_SSE4_1)
	      return false;
	    srcmode = V2DImode;
	    break;

	  default:
	    return false;
	  }

	/* Reject extractions from misaligned positions.  */
	if (pos & (size-1))
	  return false;

	if (GET_MODE (dst) == dstmode)
	  d = dst;
	else
	  d = gen_reg_rtx (dstmode);

	/* Construct insn pattern.  */
	pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
	pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);

	/* Let the rtl optimizers know about the zero extension performed.  */
	if (dstmode == QImode || dstmode == HImode)
	  {
	    pat = gen_rtx_ZERO_EXTEND (SImode, pat);
	    d = gen_lowpart (SImode, d);
	  }

	emit_insn (gen_rtx_SET (d, pat));

	if (d != dst)
	  emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
	return true;
      }

    default:
      return false;
    }
}
/* Expand an insert into a vector register through pinsr insn.
   Return true if successful.  */

bool
ix86_expand_pinsr (rtx *operands)
{
  rtx dst = operands[0];
  rtx src = operands[3];

  unsigned int size = INTVAL (operands[1]);
  unsigned int pos = INTVAL (operands[2]);

  if (SUBREG_P (dst))
    {
      pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
      dst = SUBREG_REG (dst);
    }

  switch (GET_MODE (dst))
    {
    case E_V16QImode:
    case E_V8HImode:
    case E_V4SImode:
    case E_V2DImode:
    case E_V1TImode:
    case E_TImode:
      {
	machine_mode srcmode, dstmode;
	rtx (*pinsr)(rtx, rtx, rtx, rtx);
	rtx d;

	if (!int_mode_for_size (size, 0).exists (&srcmode))
	  return false;

	switch (srcmode)
	  {
	  case E_QImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    dstmode = V16QImode;
	    pinsr = gen_sse4_1_pinsrb;
	    break;

	  case E_HImode:
	    if (!TARGET_SSE2)
	      return false;
	    dstmode = V8HImode;
	    pinsr = gen_sse2_pinsrw;
	    break;

	  case E_SImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    dstmode = V4SImode;
	    pinsr = gen_sse4_1_pinsrd;
	    break;

	  case E_DImode:
	    gcc_assert (TARGET_64BIT);
	    if (!TARGET_SSE4_1)
	      return false;
	    dstmode = V2DImode;
	    pinsr = gen_sse4_1_pinsrq;
	    break;

	  default:
	    return false;
	  }

	/* Reject insertions to misaligned positions.  */
	if (pos & (size-1))
	  return false;

	if (SUBREG_P (src))
	  {
	    unsigned int srcpos = SUBREG_BYTE (src);

	    if (srcpos > 0)
	      {
		rtx extr_ops[4];

		extr_ops[0] = gen_reg_rtx (srcmode);
		extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
		extr_ops[2] = GEN_INT (size);
		extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);

		if (!ix86_expand_pextr (extr_ops))
		  return false;

		src = extr_ops[0];
	      }
	    else
	      src = gen_lowpart (srcmode, SUBREG_REG (src));
	  }

	if (GET_MODE (dst) == dstmode)
	  d = dst;
	else
	  d = gen_reg_rtx (dstmode);

	emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
			  gen_lowpart (srcmode, src),
			  GEN_INT (1 << (pos / size))));
	if (d != dst)
	  emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
	return true;
      }

    default:
      return false;
    }
}
/* All CPUs prefer to avoid cross-lane operations so perform reductions
   upper against lower halves up to SSE reg size.  */

static machine_mode
ix86_split_reduction (machine_mode mode)
{
  /* Reduce lowpart against highpart until we reach SSE reg width to
     avoid cross-lane operations.  */
  switch (mode)
    {
    case E_V8DImode:
    case E_V4DImode:
      return V2DImode;
    case E_V16SImode:
    case E_V8SImode:
      return V4SImode;
    case E_V32HImode:
    case E_V16HImode:
      return V8HImode;
    case E_V64QImode:
    case E_V32QImode:
      return V16QImode;
    case E_V16SFmode:
    case E_V8SFmode:
      return V4SFmode;
    case E_V8DFmode:
    case E_V4DFmode:
      return V2DFmode;
    default:
      return mode;
    }
}
/* Generate call to __divmoddi4.  */

void
ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
			    rtx op0, rtx op1,
			    rtx *quot_p, rtx *rem_p)
{
  rtx rem = assign_386_stack_local (mode, SLOT_TEMP);

  rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
				      mode, op0, mode, op1, mode,
				      XEXP (rem, 0), Pmode);
  *quot_p = quot;
  *rem_p = rem;
}

#include "gt-i386-expand.h"