/* Copyright (C) 1988-2022 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#define IN_TARGET_CODE 1

#include "coretypes.h"
#include "stringpool.h"
#include "diagnostic.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "insn-attr.h"
#include "common/common-target.h"
#include "langhooks.h"
#include "tm-constrs.h"
#include "sched-int.h"
#include "tree-pass.h"
#include "pass_manager.h"
#include "target-globals.h"
#include "gimple-iterator.h"
#include "tree-vectorizer.h"
#include "shrink-wrap.h"
#include "tree-iterator.h"
#include "case-cfn-macros.h"
#include "fold-const-call.h"
#include "tree-ssanames.h"
#include "selftest-rtl.h"
#include "print-rtl.h"
#include "symbol-summary.h"
#include "ipa-fnsummary.h"
#include "wide-int-bitmask.h"
#include "tree-vector-builder.h"
#include "dwarf2out.h"
#include "i386-options.h"
#include "i386-builtins.h"
#include "i386-expand.h"
/* Split one or more double-mode RTL references into pairs of half-mode
   references.  The RTL can be REG, offsettable MEM, integer constant, or
   CONST_DOUBLE.  "operands" is a pointer to an array of double-mode RTLs to
   split and "num" is its length.  lo_half and hi_half are output arrays
   that parallel "operands".  */

void
split_double_mode (machine_mode mode, rtx operands[],
                   int num, rtx lo_half[], rtx hi_half[])
{
  machine_mode half_mode;
  unsigned int byte;
  rtx mem_op = NULL_RTX;
  int mem_num = 0;

  switch (mode)
    {
    case E_TImode:
      half_mode = DImode;
      break;
    case E_DImode:
      half_mode = SImode;
      break;
    default:
      gcc_unreachable ();
    }

  byte = GET_MODE_SIZE (half_mode);

  while (num--)
    {
      rtx op = operands[num];

      /* simplify_subreg refuses to split volatile memory addresses,
         but we still have to handle it.  */
      if (MEM_P (op))
        {
          if (mem_op && rtx_equal_p (op, mem_op))
            {
              lo_half[num] = lo_half[mem_num];
              hi_half[num] = hi_half[mem_num];
            }
          else
            {
              mem_op = op;
              mem_num = num;
              lo_half[num] = adjust_address (op, half_mode, 0);
              hi_half[num] = adjust_address (op, half_mode, byte);
            }
        }
      else
        {
          lo_half[num] = simplify_gen_subreg (half_mode, op,
                                              GET_MODE (op) == VOIDmode
                                              ? mode : GET_MODE (op), 0);

          rtx tmp = simplify_gen_subreg (half_mode, op,
                                         GET_MODE (op) == VOIDmode
                                         ? mode : GET_MODE (op), byte);
          /* simplify_gen_subreg will return NULL RTX for the
             high half of the paradoxical subreg.  */
          hi_half[num] = tmp ? tmp : gen_reg_rtx (half_mode);
        }
    }
}
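
/* Usage sketch (mirrors the double-word branch code further below):

     rtx lo[1], hi[1];
     split_double_mode (DImode, &x, 1, lo, hi);

   afterwards lo[0] holds the low SImode half of X and hi[0] the high
   half, whether X was a register, an offsettable MEM or a constant.  */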
/* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
   for the target.  */

void
ix86_expand_clear (rtx dest)
{
  rtx tmp;

  /* We play register width games, which are only valid after reload.  */
  gcc_assert (reload_completed);

  /* Avoid HImode and its attendant prefix byte.  */
  if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
    dest = gen_rtx_REG (SImode, REGNO (dest));
  tmp = gen_rtx_SET (dest, const0_rtx);

  if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
    {
      rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
    }

  emit_insn (tmp);
}
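
/* The parallel with the flags clobber is what later matches the
   "xorl %reg, %reg" alternative: xor is a 2-byte encoding versus
   5 bytes for "movl $0, %reg", but unlike mov it clobbers EFLAGS,
   so the clobber must be represented in the RTL.  */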
/* Return true if V can be broadcasted from an integer of WIDTH bits
   which is returned in VAL_BROADCAST.  Otherwise, return false.  */

static bool
ix86_broadcast (HOST_WIDE_INT v, unsigned int width,
                HOST_WIDE_INT &val_broadcast)
{
  wide_int val = wi::uhwi (v, HOST_BITS_PER_WIDE_INT);
  val_broadcast = wi::extract_uhwi (val, 0, width);
  for (unsigned int i = width; i < HOST_BITS_PER_WIDE_INT; i += width)
    {
      HOST_WIDE_INT each = wi::extract_uhwi (val, i, width);
      if (val_broadcast != each)
        return false;
    }
  val_broadcast = sext_hwi (val_broadcast, width);
  return true;
}
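
/* For example, v = 0x4242424242424242 succeeds for width 8 with
   val_broadcast = 0x42, since every 8-bit chunk of v is identical;
   v = 0x0000000100000001 fails for width 8 but succeeds for width 32
   with val_broadcast = 1.  */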
/* Convert the CONST_WIDE_INT operand OP to broadcast in MODE.  */

static rtx
ix86_convert_const_wide_int_to_broadcast (machine_mode mode, rtx op)
{
  /* Don't use integer vector broadcast if we can't move from GPR to SSE
     register directly.  */
  if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
    return nullptr;

  /* Convert CONST_WIDE_INT to a non-standard SSE constant integer
     broadcast only if vector broadcast is available.  */
  if (!TARGET_AVX
      || !CONST_WIDE_INT_P (op)
      || standard_sse_constant_p (op, mode))
    return nullptr;

  HOST_WIDE_INT val = CONST_WIDE_INT_ELT (op, 0);
  HOST_WIDE_INT val_broadcast;
  scalar_int_mode broadcast_mode;
  if (TARGET_AVX2
      && ix86_broadcast (val, GET_MODE_BITSIZE (QImode),
                         val_broadcast))
    broadcast_mode = QImode;
  else if (TARGET_AVX2
           && ix86_broadcast (val, GET_MODE_BITSIZE (HImode),
                              val_broadcast))
    broadcast_mode = HImode;
  else if (ix86_broadcast (val, GET_MODE_BITSIZE (SImode),
                           val_broadcast))
    broadcast_mode = SImode;
  else if (TARGET_64BIT
           && ix86_broadcast (val, GET_MODE_BITSIZE (DImode),
                              val_broadcast))
    broadcast_mode = DImode;
  else
    return nullptr;

  /* Check if OP can be broadcasted from VAL.  */
  for (int i = 1; i < CONST_WIDE_INT_NUNITS (op); i++)
    if (val != CONST_WIDE_INT_ELT (op, i))
      return nullptr;

  unsigned int nunits = (GET_MODE_SIZE (mode)
                         / GET_MODE_SIZE (broadcast_mode));
  machine_mode vector_mode;
  if (!mode_for_vector (broadcast_mode, nunits).exists (&vector_mode))
    gcc_unreachable ();
  rtx target = ix86_gen_scratch_sse_rtx (vector_mode);
  bool ok = ix86_expand_vector_init_duplicate (false, vector_mode,
                                               target,
                                               GEN_INT (val_broadcast));
  gcc_assert (ok);
  target = lowpart_subreg (mode, target, vector_mode);
  return target;
}
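
/* E.g. a V4DImode CONST_WIDE_INT whose four elements are all
   0x0101010101010101 is recognized as a QImode broadcast of 1 and,
   with AVX2, can be materialized by a register broadcast instead of
   a 32-byte constant-pool load.  */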
void
ix86_expand_move (machine_mode mode, rtx operands[])
{
  rtx op0, op1;
  rtx tmp, addend = NULL_RTX;
  enum tls_model model;

  op0 = operands[0];
  op1 = operands[1];

  /* Avoid complex sets of likely spilled hard registers before reload.  */
  if (!ix86_hardreg_mov_ok (op0, op1))
    {
      tmp = gen_reg_rtx (mode);
      operands[0] = tmp;
      ix86_expand_move (mode, operands);
      operands[0] = op0;
      operands[1] = tmp;
      op1 = tmp;
    }

  switch (GET_CODE (op1))
    {
    case CONST:
      tmp = XEXP (op1, 0);

      if (GET_CODE (tmp) != PLUS
          || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
        break;

      op1 = XEXP (tmp, 0);
      addend = XEXP (tmp, 1);
      /* FALLTHRU */

    case SYMBOL_REF:
      model = SYMBOL_REF_TLS_MODEL (op1);

      if (model)
        op1 = legitimize_tls_address (op1, model, true);
      else if (ix86_force_load_from_GOT_p (op1))
        {
          /* Load the external function address via GOT slot to avoid PLT.  */
          op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
                                UNSPEC_GOTPCREL);
          op1 = gen_rtx_CONST (Pmode, op1);
          op1 = gen_const_mem (Pmode, op1);
          set_mem_alias_set (op1, ix86_GOT_alias_set ());
        }
      else
        {
          tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
          if (tmp)
            {
              op1 = tmp;
              if (!addend)
                break;
            }
          else
            {
              op1 = operands[1];
              break;
            }
        }

      if (addend)
        {
          op1 = force_operand (op1, NULL_RTX);
          op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
                                     op0, 1, OPTAB_DIRECT);
        }
      else
        op1 = force_operand (op1, op0);

      if (op1 == op0)
        return;

      op1 = convert_to_mode (mode, op1, 1);

    default:
      break;
    }

  if ((flag_pic || MACHOPIC_INDIRECT)
      && symbolic_operand (op1, mode))
    {
      if (TARGET_MACHO && !TARGET_64BIT)
        {
#if TARGET_MACHO
          /* dynamic-no-pic */
          if (MACHOPIC_INDIRECT)
            {
              rtx temp = (op0 && REG_P (op0) && mode == Pmode)
                         ? op0 : gen_reg_rtx (Pmode);
              op1 = machopic_indirect_data_reference (op1, temp);
              if (MACHOPIC_PURE)
                op1 = machopic_legitimize_pic_address (op1, mode,
                                                       temp == op1 ? 0 : temp);
            }
          if (op0 != op1 && GET_CODE (op0) != MEM)
            {
              rtx insn = gen_rtx_SET (op0, op1);
              emit_insn (insn);
              return;
            }
          if (GET_CODE (op0) == MEM)
            op1 = force_reg (Pmode, op1);
          else
            {
              rtx temp = op0;
              if (GET_CODE (temp) != REG)
                temp = gen_reg_rtx (Pmode);
              temp = legitimize_pic_address (op1, temp);
              if (temp == op0)
                return;
              op1 = temp;
            }
          /* dynamic-no-pic */
#endif
        }
      else
        {
          if (MEM_P (op0))
            op1 = force_reg (mode, op1);
          else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
            {
              rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
              op1 = legitimize_pic_address (op1, reg);
              if (op0 == op1)
                return;
              op1 = convert_to_mode (mode, op1, 1);
            }
        }
    }
  else
    {
      if (MEM_P (op0)
          && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
              || !push_operand (op0, mode))
          && MEM_P (op1))
        op1 = force_reg (mode, op1);

      if (push_operand (op0, mode)
          && ! general_no_elim_operand (op1, mode))
        op1 = copy_to_mode_reg (mode, op1);

      /* Force large constants in 64bit compilation into register
         to get them CSEed.  */
      if (can_create_pseudo_p ()
          && (mode == DImode) && TARGET_64BIT
          && immediate_operand (op1, mode)
          && !x86_64_zext_immediate_operand (op1, VOIDmode)
          && !register_operand (op0, mode)
          && optimize)
        op1 = copy_to_mode_reg (mode, op1);

      if (can_create_pseudo_p ())
        {
          if (CONST_DOUBLE_P (op1))
            {
              /* If we are loading a floating point constant to a
                 register, force the value to memory now, since we'll
                 get better code out the back end.  */

              op1 = validize_mem (force_const_mem (mode, op1));
              if (!register_operand (op0, mode))
                {
                  rtx temp = gen_reg_rtx (mode);
                  emit_insn (gen_rtx_SET (temp, op1));
                  emit_move_insn (op0, temp);
                  return;
                }
            }
          else if (GET_MODE_SIZE (mode) >= 16)
            {
              rtx tmp = ix86_convert_const_wide_int_to_broadcast
                (GET_MODE (op0), op1);
              if (tmp != nullptr)
                op1 = tmp;
            }
        }
    }

  emit_insn (gen_rtx_SET (op0, op1));
}
/* OP is a memref of CONST_VECTOR, return scalar constant mem
   if CONST_VECTOR is a vec_duplicate, else return NULL.  */

static rtx
ix86_broadcast_from_constant (machine_mode mode, rtx op)
{
  int nunits = GET_MODE_NUNITS (mode);
  if (nunits < 2)
    return nullptr;

  /* Don't use integer vector broadcast if we can't move from GPR to SSE
     register directly.  */
  if (!TARGET_INTER_UNIT_MOVES_TO_VEC
      && INTEGRAL_MODE_P (mode))
    return nullptr;

  /* Convert CONST_VECTOR to a non-standard SSE constant integer
     broadcast only if vector broadcast is available.  */
  if (!(TARGET_AVX2
        || (TARGET_AVX
            && (GET_MODE_INNER (mode) == SImode
                || GET_MODE_INNER (mode) == DImode))
        || FLOAT_MODE_P (mode))
      || standard_sse_constant_p (op, mode))
    return nullptr;

  /* Don't broadcast from a 64-bit integer constant in 32-bit mode.
     We can still put 64-bit integer constant in memory when
     avx512 embed broadcast is available.  */
  if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT
      && (!TARGET_AVX512F
          || (GET_MODE_SIZE (mode) < 64 && !TARGET_AVX512VL)))
    return nullptr;

  if (GET_MODE_INNER (mode) == TImode)
    return nullptr;

  rtx constant = get_pool_constant (XEXP (op, 0));
  if (GET_CODE (constant) != CONST_VECTOR)
    return nullptr;

  /* There could be some rtx like
     (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
     but with "*.LC1" refer to V2DI constant vector.  */
  if (GET_MODE (constant) != mode)
    {
      constant = simplify_subreg (mode, constant, GET_MODE (constant),
                                  0);
      if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR)
        return nullptr;
    }

  rtx first = XVECEXP (constant, 0, 0);

  for (int i = 1; i < nunits; ++i)
    {
      rtx tmp = XVECEXP (constant, 0, i);
      /* Vector duplicate value.  */
      if (!rtx_equal_p (tmp, first))
        return nullptr;
    }

  return first;
}
void
ix86_expand_vector_move (machine_mode mode, rtx operands[])
{
  rtx op0 = operands[0], op1 = operands[1];
  /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
     psABI since the biggest alignment is 4 byte for IA MCU psABI.  */
  unsigned int align = (TARGET_IAMCU
                        ? GET_MODE_BITSIZE (mode)
                        : GET_MODE_ALIGNMENT (mode));

  if (push_operand (op0, VOIDmode))
    op0 = emit_move_resolve_push (mode, op0);

  /* Force constants other than zero into memory.  We do not know how
     the instructions used to build constants modify the upper 64 bits
     of the register, once we have that information we may be able
     to handle some of them more efficiently.  */
  if (can_create_pseudo_p ()
      && (CONSTANT_P (op1)
          || (SUBREG_P (op1)
              && CONSTANT_P (SUBREG_REG (op1))))
      && ((register_operand (op0, mode)
           && !standard_sse_constant_p (op1, mode))
          /* ix86_expand_vector_move_misalign() does not like constants.  */
          || (SSE_REG_MODE_P (mode)
              && MEM_P (op0)
              && MEM_ALIGN (op0) < align)))
    {
      if (SUBREG_P (op1))
        {
          machine_mode imode = GET_MODE (SUBREG_REG (op1));
          rtx r = force_const_mem (imode, SUBREG_REG (op1));
          if (r)
            r = validize_mem (r);
          else
            r = force_reg (imode, SUBREG_REG (op1));
          op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
        }
      else
        {
          machine_mode mode = GET_MODE (op0);
          rtx tmp = ix86_convert_const_wide_int_to_broadcast
            (mode, op1);
          if (tmp == nullptr)
            op1 = validize_mem (force_const_mem (mode, op1));
          else
            op1 = tmp;
        }
    }

  if (can_create_pseudo_p ()
      && GET_MODE_SIZE (mode) >= 16
      && VECTOR_MODE_P (mode)
      && (MEM_P (op1)
          && SYMBOL_REF_P (XEXP (op1, 0))
          && CONSTANT_POOL_ADDRESS_P (XEXP (op1, 0))))
    {
      rtx first = ix86_broadcast_from_constant (mode, op1);
      if (first != nullptr)
        {
          /* Broadcast to XMM/YMM/ZMM register from an integer
             constant or scalar mem.  */
          op1 = gen_reg_rtx (mode);
          if (FLOAT_MODE_P (mode)
              || (!TARGET_64BIT && GET_MODE_INNER (mode) == DImode))
            first = force_const_mem (GET_MODE_INNER (mode), first);
          bool ok = ix86_expand_vector_init_duplicate (false, mode,
                                                       op1, first);
          gcc_assert (ok);
          emit_move_insn (op0, op1);
          return;
        }
    }

  /* We need to check memory alignment for SSE mode since attribute
     can make operands unaligned.  */
  if (can_create_pseudo_p ()
      && SSE_REG_MODE_P (mode)
      && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
          || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
    {
      rtx tmp[2];

      /* ix86_expand_vector_move_misalign() does not like both
         arguments in memory.  */
      if (!register_operand (op0, mode)
          && !register_operand (op1, mode))
        {
          rtx scratch = ix86_gen_scratch_sse_rtx (mode);
          emit_move_insn (scratch, op1);
          op1 = scratch;
        }

      tmp[0] = op0; tmp[1] = op1;
      ix86_expand_vector_move_misalign (mode, tmp);
      return;
    }

  /* Special case TImode to V1TImode conversions, via V2DI.  */
  if (mode == V1TImode
      && SUBREG_P (op1)
      && GET_MODE (SUBREG_REG (op1)) == TImode
      && TARGET_64BIT && TARGET_SSE
      && can_create_pseudo_p ())
    {
      rtx tmp = gen_reg_rtx (V2DImode);
      rtx lo = gen_reg_rtx (DImode);
      rtx hi = gen_reg_rtx (DImode);
      emit_move_insn (lo, gen_lowpart (DImode, SUBREG_REG (op1)));
      emit_move_insn (hi, gen_highpart (DImode, SUBREG_REG (op1)));
      emit_insn (gen_vec_concatv2di (tmp, lo, hi));
      emit_move_insn (op0, gen_lowpart (V1TImode, tmp));
      return;
    }

  /* If operand0 is a hard register, make operand1 a pseudo.  */
  if (can_create_pseudo_p ()
      && !ix86_hardreg_mov_ok (op0, op1))
    {
      rtx tmp = gen_reg_rtx (GET_MODE (op0));
      emit_move_insn (tmp, op1);
      emit_move_insn (op0, tmp);
      return;
    }

  /* Make operand1 a register if it isn't already.  */
  if (can_create_pseudo_p ()
      && !register_operand (op0, mode)
      && !register_operand (op1, mode))
    {
      rtx tmp = ix86_gen_scratch_sse_rtx (GET_MODE (op0));
      emit_move_insn (tmp, op1);
      emit_move_insn (op0, tmp);
      return;
    }

  emit_insn (gen_rtx_SET (op0, op1));
}
/* Split 32-byte AVX unaligned load and store if needed.  */

static void
ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
{
  rtx m;
  rtx (*extract) (rtx, rtx, rtx);
  machine_mode mode;

  if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
      || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  rtx orig_op0 = NULL_RTX;
  mode = GET_MODE (op0);
  switch (GET_MODE_CLASS (mode))
    {
    case MODE_VECTOR_INT:
    case MODE_INT:
      if (mode != V32QImode)
        {
          if (!MEM_P (op0))
            {
              orig_op0 = op0;
              op0 = gen_reg_rtx (V32QImode);
            }
          else
            op0 = gen_lowpart (V32QImode, op0);
          op1 = gen_lowpart (V32QImode, op1);
          mode = V32QImode;
        }
      break;
    case MODE_VECTOR_FLOAT:
      break;
    default:
      gcc_unreachable ();
    }

  switch (mode)
    {
    default:
      gcc_unreachable ();
    case E_V32QImode:
      extract = gen_avx_vextractf128v32qi;
      mode = V16QImode;
      break;
    case E_V16HFmode:
      extract = gen_avx_vextractf128v16hf;
      mode = V8HFmode;
      break;
    case E_V8SFmode:
      extract = gen_avx_vextractf128v8sf;
      mode = V4SFmode;
      break;
    case E_V4DFmode:
      extract = gen_avx_vextractf128v4df;
      mode = V2DFmode;
      break;
    }

  if (MEM_P (op1))
    {
      rtx r = gen_reg_rtx (mode);
      m = adjust_address (op1, mode, 0);
      emit_move_insn (r, m);
      m = adjust_address (op1, mode, 16);
      r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
      emit_move_insn (op0, r);
    }
  else if (MEM_P (op0))
    {
      m = adjust_address (op0, mode, 0);
      emit_insn (extract (m, op1, const0_rtx));
      m = adjust_address (op0, mode, 16);
      emit_insn (extract (m, copy_rtx (op1), const1_rtx));
    }
  else
    gcc_unreachable ();

  if (orig_op0)
    emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
}
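
/* For a misaligned 32-byte load this emits two 16-byte accesses glued
   by a VEC_CONCAT, which the backend renders roughly as

     vmovups     (mem), %xmm0
     vinsertf128 $1, 16(mem), %ymm0, %ymm0

   (illustrative assembly only; the exact instructions depend on the
   mode, and the store case uses the vextractf128 variants selected
   above).  */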
/* Implement the movmisalign patterns for SSE.  Non-SSE modes go
   straight to ix86_expand_vector_move.  */

/* Code generation for scalar reg-reg moves of single and double precision data:
     if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
       movaps reg, reg
     else
       movss reg, reg
     if (x86_sse_partial_reg_dependency == true)
       movapd reg, reg
     else
       movsd reg, reg

   Code generation for scalar loads of double precision data:
     if (x86_sse_split_regs == true)
       movlpd mem, reg      (gas syntax)
     else
       movsd mem, reg

   Code generation for unaligned packed loads of single precision data
   (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
     if (x86_sse_unaligned_move_optimal)
       movups mem, reg

     if (x86_sse_partial_reg_dependency == true)
       {
         xorps  reg, reg
         movlps mem, reg
         movhps mem+8, reg
       }
     else
       {
         movlps mem, reg
         movhps mem+8, reg
       }

   Code generation for unaligned packed loads of double precision data
   (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
     if (x86_sse_unaligned_move_optimal)
       movupd mem, reg

     if (x86_sse_split_regs == true)
       {
         movlpd mem, reg
         movhpd mem+8, reg
       }
     else
       {
         movsd  mem, reg
         movhpd mem+8, reg
       }
 */
void
ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
{
  rtx op0, op1, m;

  op0 = operands[0];
  op1 = operands[1];

  /* Use unaligned load/store for AVX512 or when optimizing for size.  */
  if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  if (TARGET_AVX)
    {
      if (GET_MODE_SIZE (mode) == 32)
        ix86_avx256_split_vector_move_misalign (op0, op1);
      else
        /* Always use 128-bit mov<mode>_internal pattern for AVX.  */
        emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
      || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  /* ??? If we have typed data, then it would appear that using
     movdqu is the only way to get unaligned data loaded with
     integer type.  */
  if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  if (MEM_P (op1))
    {
      if (TARGET_SSE2 && mode == V2DFmode)
        {
          rtx zero;

          /* When SSE registers are split into halves, we can avoid
             writing to the top half twice.  */
          if (TARGET_SSE_SPLIT_REGS)
            {
              emit_clobber (op0);
              zero = op0;
            }
          else
            {
              /* ??? Not sure about the best option for the Intel chips.
                 The following would seem to satisfy; the register is
                 entirely cleared, breaking the dependency chain.  We
                 then store to the upper half, with a dependency depth
                 of one.  A rumor has it that Intel recommends two movsd
                 followed by an unpacklpd, but this is unconfirmed.  And
                 given that the dependency depth of the unpacklpd would
                 still be one, I'm not sure why this would be better.  */
              zero = CONST0_RTX (V2DFmode);
            }

          m = adjust_address (op1, DFmode, 0);
          emit_insn (gen_sse2_loadlpd (op0, zero, m));
          m = adjust_address (op1, DFmode, 8);
          emit_insn (gen_sse2_loadhpd (op0, op0, m));
        }
      else
        {
          rtx t;

          if (mode != V4SFmode)
            t = gen_reg_rtx (V4SFmode);
          else
            t = op0;

          if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
            emit_move_insn (t, CONST0_RTX (V4SFmode));
          else
            emit_clobber (t);

          m = adjust_address (op1, V2SFmode, 0);
          emit_insn (gen_sse_loadlps (t, t, m));
          m = adjust_address (op1, V2SFmode, 8);
          emit_insn (gen_sse_loadhps (t, t, m));
          if (mode != V4SFmode)
            emit_move_insn (op0, gen_lowpart (mode, t));
        }
    }
  else if (MEM_P (op0))
    {
      if (TARGET_SSE2 && mode == V2DFmode)
        {
          m = adjust_address (op0, DFmode, 0);
          emit_insn (gen_sse2_storelpd (m, op1));
          m = adjust_address (op0, DFmode, 8);
          emit_insn (gen_sse2_storehpd (m, op1));
        }
      else
        {
          if (mode != V4SFmode)
            op1 = gen_lowpart (V4SFmode, op1);

          m = adjust_address (op0, V2SFmode, 0);
          emit_insn (gen_sse_storelps (m, op1));
          m = adjust_address (op0, V2SFmode, 8);
          emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
        }
    }
  else
    gcc_unreachable ();
}
/* Move bits 64:95 to bits 32:63.  */

static void
ix86_move_vector_high_sse_to_mmx (rtx op)
{
  rtx mask = gen_rtx_PARALLEL (VOIDmode,
                               gen_rtvec (4, GEN_INT (0), GEN_INT (2),
                                          GEN_INT (0), GEN_INT (0)));
  rtx dest = lowpart_subreg (V4SImode, op, GET_MODE (op));
  op = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
  rtx insn = gen_rtx_SET (dest, op);
  emit_insn (insn);
}
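
/* The (0 2 0 0) selector keeps element 0, copies element 2 (bits
   64:95) into element 1 (bits 32:63), and leaves don't-care values in
   the upper two elements, which is all the MMX-sized halves below
   need.  */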
/* Split MMX pack with signed/unsigned saturation with SSE/SSE2.  */

void
ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
{
  rtx op0 = operands[0];
  rtx op1 = operands[1];
  rtx op2 = operands[2];

  machine_mode dmode = GET_MODE (op0);
  machine_mode smode = GET_MODE (op1);
  machine_mode inner_dmode = GET_MODE_INNER (dmode);
  machine_mode inner_smode = GET_MODE_INNER (smode);

  /* Get the corresponding SSE mode for destination.  */
  int nunits = 16 / GET_MODE_SIZE (inner_dmode);
  machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode),
                                            nunits).require ();
  machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode),
                                                 nunits / 2).require ();

  /* Get the corresponding SSE mode for source.  */
  nunits = 16 / GET_MODE_SIZE (inner_smode);
  machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode),
                                            nunits).require ();

  /* Generate SSE pack with signed/unsigned saturation.  */
  rtx dest = lowpart_subreg (sse_dmode, op0, GET_MODE (op0));
  op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
  op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));

  op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
  op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
  rtx insn = gen_rtx_SET (dest, gen_rtx_VEC_CONCAT (sse_dmode,
                                                    op1, op2));
  emit_insn (insn);

  ix86_move_vector_high_sse_to_mmx (op0);
}
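
/* E.g. a V4HI -> V8QI pack with SS_TRUNCATE is rewritten as an SSE2
   packsswb on the corresponding V8HI/V16QI subregs; only the low
   8 bytes of the 16-byte result are meaningful, and the helper above
   repositions bits 64:95 so the MMX-sized low half is correct.  */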
/* Split MMX punpcklXX/punpckhXX with SSE punpcklXX.  */

void
ix86_split_mmx_punpck (rtx operands[], bool high_p)
{
  rtx op0 = operands[0];
  rtx op1 = operands[1];
  rtx op2 = operands[2];
  machine_mode mode = GET_MODE (op0);
  rtx mask;
  /* The corresponding SSE mode.  */
  machine_mode sse_mode, double_sse_mode;

  switch (mode)
    {
    case E_V8QImode:
      sse_mode = V16QImode;
      double_sse_mode = V32QImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
                               gen_rtvec (16,
                                          GEN_INT (0), GEN_INT (16),
                                          GEN_INT (1), GEN_INT (17),
                                          GEN_INT (2), GEN_INT (18),
                                          GEN_INT (3), GEN_INT (19),
                                          GEN_INT (4), GEN_INT (20),
                                          GEN_INT (5), GEN_INT (21),
                                          GEN_INT (6), GEN_INT (22),
                                          GEN_INT (7), GEN_INT (23)));
      break;

    case E_V4HImode:
      sse_mode = V8HImode;
      double_sse_mode = V16HImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
                               gen_rtvec (8,
                                          GEN_INT (0), GEN_INT (8),
                                          GEN_INT (1), GEN_INT (9),
                                          GEN_INT (2), GEN_INT (10),
                                          GEN_INT (3), GEN_INT (11)));
      break;

    case E_V2SImode:
      sse_mode = V4SImode;
      double_sse_mode = V8SImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
                               gen_rtvec (4,
                                          GEN_INT (0), GEN_INT (4),
                                          GEN_INT (1), GEN_INT (5)));
      break;

    case E_V2SFmode:
      sse_mode = V4SFmode;
      double_sse_mode = V8SFmode;
      mask = gen_rtx_PARALLEL (VOIDmode,
                               gen_rtvec (4,
                                          GEN_INT (0), GEN_INT (4),
                                          GEN_INT (1), GEN_INT (5)));
      break;

    default:
      gcc_unreachable ();
    }

  /* Generate SSE punpcklXX.  */
  rtx dest = lowpart_subreg (sse_mode, op0, GET_MODE (op0));
  op1 = lowpart_subreg (sse_mode, op1, GET_MODE (op1));
  op2 = lowpart_subreg (sse_mode, op2, GET_MODE (op2));

  op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2);
  op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask);
  rtx insn = gen_rtx_SET (dest, op2);
  emit_insn (insn);

  /* Move high bits to low bits.  */
  if (high_p)
    {
      if (sse_mode == V4SFmode)
        {
          mask = gen_rtx_PARALLEL (VOIDmode,
                                   gen_rtvec (4, GEN_INT (2), GEN_INT (3),
                                              GEN_INT (4), GEN_INT (5)));
          op2 = gen_rtx_VEC_CONCAT (V8SFmode, dest, dest);
          op1 = gen_rtx_VEC_SELECT (V4SFmode, op2, mask);
        }
      else
        {
          int sz = GET_MODE_SIZE (mode);

          if (sz == 4)
            mask = gen_rtx_PARALLEL (VOIDmode,
                                     gen_rtvec (4, GEN_INT (1), GEN_INT (0),
                                                GEN_INT (0), GEN_INT (1)));
          else if (sz == 8)
            mask = gen_rtx_PARALLEL (VOIDmode,
                                     gen_rtvec (4, GEN_INT (2), GEN_INT (3),
                                                GEN_INT (0), GEN_INT (1)));
          else
            gcc_unreachable ();

          dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
          op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
        }

      insn = gen_rtx_SET (dest, op1);
      emit_insn (insn);
    }
}
/* Helper function of ix86_fixup_binary_operands to canonicalize
   operand order.  Returns true if the operands should be swapped.  */

static bool
ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
                             rtx operands[])
{
  rtx dst = operands[0];
  rtx src1 = operands[1];
  rtx src2 = operands[2];

  /* If the operation is not commutative, we can't do anything.  */
  if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
      && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
    return false;

  /* Highest priority is that src1 should match dst.  */
  if (rtx_equal_p (dst, src1))
    return false;
  if (rtx_equal_p (dst, src2))
    return true;

  /* Next highest priority is that immediate constants come second.  */
  if (immediate_operand (src2, mode))
    return false;
  if (immediate_operand (src1, mode))
    return true;

  /* Lowest priority is that memory references should come second.  */
  if (MEM_P (src2))
    return false;
  if (MEM_P (src1))
    return true;

  return false;
}
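
/* For instance, for the commutative (plus (mem) (reg)) with the
   destination equal to the register source, swapping yields
   dst = dst + mem, which matches the machine's two-operand add with a
   memory operand and avoids an extra copy.  */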
/* Fix up OPERANDS to satisfy ix86_binary_operator_ok.  Return the
   destination to use for the operation.  If different from the true
   destination in operands[0], a copy operation will be required.  */

rtx
ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
                            rtx operands[])
{
  rtx dst = operands[0];
  rtx src1 = operands[1];
  rtx src2 = operands[2];

  /* Canonicalize operand order.  */
  if (ix86_swap_binary_operands_p (code, mode, operands))
    {
      /* It is invalid to swap operands of different modes.  */
      gcc_assert (GET_MODE (src1) == GET_MODE (src2));

      std::swap (src1, src2);
    }

  /* Both source operands cannot be in memory.  */
  if (MEM_P (src1) && MEM_P (src2))
    {
      /* Optimization: Only read from memory once.  */
      if (rtx_equal_p (src1, src2))
        {
          src2 = force_reg (mode, src2);
          src1 = src2;
        }
      else if (rtx_equal_p (dst, src1))
        src2 = force_reg (mode, src2);
      else
        src1 = force_reg (mode, src1);
    }

  /* If the destination is memory, and we do not have matching source
     operands, do things in registers.  */
  if (MEM_P (dst) && !rtx_equal_p (dst, src1))
    dst = gen_reg_rtx (mode);

  /* Source 1 cannot be a constant.  */
  if (CONSTANT_P (src1))
    src1 = force_reg (mode, src1);

  /* Source 1 cannot be a non-matching memory.  */
  if (MEM_P (src1) && !rtx_equal_p (dst, src1))
    src1 = force_reg (mode, src1);

  /* Improve address combine.  */
  if (code == PLUS
      && GET_MODE_CLASS (mode) == MODE_INT
      && MEM_P (src2))
    src2 = force_reg (mode, src2);

  operands[1] = src1;
  operands[2] = src2;
  return dst;
}
/* Similarly, but assume that the destination has already been
   set up properly.  */

void
ix86_fixup_binary_operands_no_copy (enum rtx_code code,
                                    machine_mode mode, rtx operands[])
{
  rtx dst = ix86_fixup_binary_operands (code, mode, operands);
  gcc_assert (dst == operands[0]);
}
/* Attempt to expand a binary operator.  Make the expansion closer to the
   actual machine, than just general_operand, which will allow 3 separate
   memory references (one output, two input) in a single insn.  */

void
ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
                             rtx operands[])
{
  rtx src1, src2, dst, op, clob;

  dst = ix86_fixup_binary_operands (code, mode, operands);
  src1 = operands[1];
  src2 = operands[2];

  /* Emit the instruction.  */

  op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));

  if (reload_completed
      && code == PLUS
      && !rtx_equal_p (dst, src1))
    {
      /* This is going to be an LEA; avoid splitting it later.  */
      emit_insn (op);
    }
  else
    {
      clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
    }

  /* Fix up the destination if needed.  */
  if (dst != operands[0])
    emit_move_insn (operands[0], dst);
}
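
/* After reload, a three-operand PLUS whose destination differs from
   the first source (e.g. r1 = r2 + r3) is emitted without the flags
   clobber so it can become an lea, which does not touch EFLAGS; every
   other form uses the add/sub/... patterns and must clobber the
   flags.  */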
/* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
   the given OPERANDS.  */

void
ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
                                     rtx operands[])
{
  rtx op1 = NULL_RTX, op2 = NULL_RTX;
  if (SUBREG_P (operands[1]))
    {
      op1 = operands[1];
      op2 = operands[2];
    }
  else if (SUBREG_P (operands[2]))
    {
      op1 = operands[2];
      op2 = operands[1];
    }
  /* Optimize (__m128i) d | (__m128i) e and similar code
     when d and e are float vectors into float vector logical
     insn.  In C/C++ without using intrinsics there is no other way
     to express vector logical operation on float vectors than
     to cast them temporarily to integer vectors.  */
  if (op1
      && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
      && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
      && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
      && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
      && SUBREG_BYTE (op1) == 0
      && (GET_CODE (op2) == CONST_VECTOR
          || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
              && SUBREG_BYTE (op2) == 0))
      && can_create_pseudo_p ())
    {
      rtx dst;
      switch (GET_MODE (SUBREG_REG (op1)))
        {
        case E_V4SFmode:
        case E_V8SFmode:
        case E_V16SFmode:
        case E_V2DFmode:
        case E_V4DFmode:
        case E_V8DFmode:
          dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
          if (GET_CODE (op2) == CONST_VECTOR)
            {
              op2 = gen_lowpart (GET_MODE (dst), op2);
              op2 = force_reg (GET_MODE (dst), op2);
            }
          else
            {
              op1 = operands[1];
              op2 = SUBREG_REG (operands[2]);
              if (!vector_operand (op2, GET_MODE (dst)))
                op2 = force_reg (GET_MODE (dst), op2);
            }
          op1 = SUBREG_REG (op1);
          if (!vector_operand (op1, GET_MODE (dst)))
            op1 = force_reg (GET_MODE (dst), op1);
          emit_insn (gen_rtx_SET (dst,
                                  gen_rtx_fmt_ee (code, GET_MODE (dst),
                                                  op1, op2)));
          emit_move_insn (operands[0], gen_lowpart (mode, dst));
          return;
        default:
          break;
        }
    }
  if (!vector_operand (operands[1], mode))
    operands[1] = force_reg (mode, operands[1]);
  if (!vector_operand (operands[2], mode))
    operands[2] = force_reg (mode, operands[2]);
  ix86_fixup_binary_operands_no_copy (code, mode, operands);
  emit_insn (gen_rtx_SET (operands[0],
                          gen_rtx_fmt_ee (code, mode, operands[1],
                                          operands[2])));
}
/* Return TRUE or FALSE depending on whether the binary operator meets the
   appropriate constraints.  */

bool
ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
                         rtx operands[3])
{
  rtx dst = operands[0];
  rtx src1 = operands[1];
  rtx src2 = operands[2];

  /* Both source operands cannot be in memory.  */
  if ((MEM_P (src1) || bcst_mem_operand (src1, mode))
      && (MEM_P (src2) || bcst_mem_operand (src2, mode)))
    return false;

  /* Canonicalize operand order for commutative operators.  */
  if (ix86_swap_binary_operands_p (code, mode, operands))
    std::swap (src1, src2);

  /* If the destination is memory, we must have a matching source operand.  */
  if (MEM_P (dst) && !rtx_equal_p (dst, src1))
    return false;

  /* Source 1 cannot be a constant.  */
  if (CONSTANT_P (src1))
    return false;

  /* Source 1 cannot be a non-matching memory.  */
  if (MEM_P (src1) && !rtx_equal_p (dst, src1))
    /* Support "andhi/andsi/anddi" as a zero-extending move.  */
    return (code == AND
            && (mode == HImode
                || mode == SImode
                || (TARGET_64BIT && mode == DImode))
            && satisfies_constraint_L (src2));

  return true;
}
/* Attempt to expand a unary operator.  Make the expansion closer to the
   actual machine, than just general_operand, which will allow 2 separate
   memory references (one output, one input) in a single insn.  */

void
ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
                            rtx operands[])
{
  bool matching_memory = false;
  rtx src, dst, op, clob;

  dst = operands[0];
  src = operands[1];

  /* If the destination is memory, and we do not have matching source
     operands, do things in registers.  */
  if (MEM_P (dst))
    {
      if (rtx_equal_p (dst, src))
        matching_memory = true;
      else
        dst = gen_reg_rtx (mode);
    }

  /* When source operand is memory, destination must match.  */
  if (MEM_P (src) && !matching_memory)
    src = force_reg (mode, src);

  /* Emit the instruction.  */

  op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));

  if (code == NOT)
    emit_insn (op);
  else
    {
      clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
    }

  /* Fix up the destination if needed.  */
  if (dst != operands[0])
    emit_move_insn (operands[0], dst);
}
/* Predict just emitted jump instruction to be taken with probability PROB.  */

static void
predict_jump (int prob)
{
  rtx_insn *insn = get_last_insn ();
  gcc_assert (JUMP_P (insn));
  add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
}
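
/* Example: predict_jump (REG_BR_PROB_BASE * 50 / 100) right after
   emit_jump_insn marks the branch as 50% taken, as done by
   ix86_split_idivmod below.  */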
/* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
   divisor are within the range [0-255].  */

void
ix86_split_idivmod (machine_mode mode, rtx operands[],
                    bool unsigned_p)
{
  rtx_code_label *end_label, *qimode_label;
  rtx div, mod;
  rtx_insn *insn;
  rtx scratch, tmp0, tmp1, tmp2;
  rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);

  operands[2] = force_reg (mode, operands[2]);
  operands[3] = force_reg (mode, operands[3]);

  switch (mode)
    {
    case E_SImode:
      if (GET_MODE (operands[0]) == SImode)
        {
          if (GET_MODE (operands[1]) == SImode)
            gen_divmod4_1
              = unsigned_p ? gen_udivmodsi4_1 : gen_divmodsi4_1;
          else
            gen_divmod4_1
              = unsigned_p ? gen_udivmodsi4_zext_2 : gen_divmodsi4_zext_2;
        }
      else
        gen_divmod4_1
          = unsigned_p ? gen_udivmodsi4_zext_1 : gen_divmodsi4_zext_1;
      break;

    case E_DImode:
      gen_divmod4_1 = unsigned_p ? gen_udivmoddi4_1 : gen_divmoddi4_1;
      break;

    default:
      gcc_unreachable ();
    }

  end_label = gen_label_rtx ();
  qimode_label = gen_label_rtx ();

  scratch = gen_reg_rtx (mode);

  /* Use 8bit unsigned divmod if dividend and divisor are within
     the range [0-255].  */
  emit_move_insn (scratch, operands[2]);
  scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
                                 scratch, 1, OPTAB_DIRECT);
  emit_insn (gen_test_ccno_1 (mode, scratch, GEN_INT (-0x100)));
  tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
  tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
  tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
                               gen_rtx_LABEL_REF (VOIDmode, qimode_label),
                               pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = qimode_label;

  /* Generate original signed/unsigned divmod.  */
  emit_insn (gen_divmod4_1 (operands[0], operands[1],
                            operands[2], operands[3]));

  /* Branch to the end.  */
  emit_jump_insn (gen_jump (end_label));
  emit_barrier ();

  /* Generate 8bit unsigned divide.  */
  emit_label (qimode_label);
  /* Don't use operands[0] for result of 8bit divide since not all
     registers support QImode ZERO_EXTRACT.  */
  tmp0 = lowpart_subreg (HImode, scratch, mode);
  tmp1 = lowpart_subreg (HImode, operands[2], mode);
  tmp2 = lowpart_subreg (QImode, operands[3], mode);
  emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));

  if (unsigned_p)
    {
      div = gen_rtx_UDIV (mode, operands[2], operands[3]);
      mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
    }
  else
    {
      div = gen_rtx_DIV (mode, operands[2], operands[3]);
      mod = gen_rtx_MOD (mode, operands[2], operands[3]);
    }
  if (GET_MODE (operands[0]) != SImode)
    div = gen_rtx_ZERO_EXTEND (DImode, div);
  if (GET_MODE (operands[1]) != SImode)
    mod = gen_rtx_ZERO_EXTEND (DImode, mod);

  /* Extract remainder from AH.  */
  scratch = gen_lowpart (GET_MODE (operands[1]), scratch);
  tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]), scratch,
                               GEN_INT (8), GEN_INT (8));
  insn = emit_move_insn (operands[1], tmp1);
  set_unique_reg_note (insn, REG_EQUAL, mod);

  /* Zero extend quotient from AL.  */
  tmp1 = gen_lowpart (QImode, tmp0);
  insn = emit_insn (gen_extend_insn
                    (operands[0], tmp1,
                     GET_MODE (operands[0]), QImode, 1));
  set_unique_reg_note (insn, REG_EQUAL, div);

  emit_label (end_label);
}
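
/* The range test works on the IOR of the two operands: for dividend
   200 and divisor 13, (200 | 13) tested against -0x100 (i.e. ~0xff)
   sets ZF because no bit above bit 7 is set, so the branch takes the
   8-bit path, where div leaves the quotient in AL and the remainder
   in AH.  */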
/* Emit x86 binary operand CODE in mode MODE, where the first operand
   matches destination.  RTX includes clobber of FLAGS_REG.  */

void
ix86_emit_binop (enum rtx_code code, machine_mode mode,
                 rtx dst, rtx src)
{
  rtx op, clob;

  op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
  clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));

  emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
}
/* Return true if regno1 def is nearest to the insn.  */

static bool
find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
{
  rtx_insn *prev = insn;
  rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));

  if (insn == start)
    return false;
  while (prev && prev != start)
    {
      if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
        {
          prev = PREV_INSN (prev);
          continue;
        }
      if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
        return true;
      else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
        return false;
      prev = PREV_INSN (prev);
    }

  /* None of the regs is defined in the bb.  */
  return false;
}
/* INSN_UID of the last insn emitted by zero store peephole2s.  */
int ix86_last_zero_store_uid;
/* Split lea instructions into a sequence of instructions
   which are executed on ALU to avoid AGU stalls.
   It is assumed that it is allowed to clobber flags register
   at lea position.  */

void
ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
{
  unsigned int regno0, regno1, regno2;
  struct ix86_address parts;
  rtx target, tmp;
  int ok, adds;

  ok = ix86_decompose_address (operands[1], &parts);
  gcc_assert (ok);

  target = gen_lowpart (mode, operands[0]);

  regno0 = true_regnum (target);
  regno1 = INVALID_REGNUM;
  regno2 = INVALID_REGNUM;

  if (parts.base)
    {
      parts.base = gen_lowpart (mode, parts.base);
      regno1 = true_regnum (parts.base);
    }

  if (parts.index)
    {
      parts.index = gen_lowpart (mode, parts.index);
      regno2 = true_regnum (parts.index);
    }

  if (parts.disp)
    parts.disp = gen_lowpart (mode, parts.disp);

  if (parts.scale > 1)
    {
      /* Case r1 = r1 + ...  */
      if (regno1 == regno0)
        {
          /* If we have a case r1 = r1 + C * r2 then we
             should use multiplication which is very
             expensive.  Assume cost model is wrong if we
             have such case here.  */
          gcc_assert (regno2 != regno0);

          for (adds = parts.scale; adds > 0; adds--)
            ix86_emit_binop (PLUS, mode, target, parts.index);
        }
      else
        {
          /* r1 = r2 + r3 * C case.  Need to move r3 into r1.  */
          if (regno0 != regno2)
            emit_insn (gen_rtx_SET (target, parts.index));

          /* Use shift for scaling, but emit it as MULT instead
             to avoid it being immediately peephole2 optimized back
             into lea.  */
          ix86_emit_binop (MULT, mode, target, GEN_INT (parts.scale));

          if (parts.base)
            ix86_emit_binop (PLUS, mode, target, parts.base);

          if (parts.disp && parts.disp != const0_rtx)
            ix86_emit_binop (PLUS, mode, target, parts.disp);
        }
    }
  else if (!parts.base && !parts.index)
    {
      gcc_assert(parts.disp);
      emit_insn (gen_rtx_SET (target, parts.disp));
    }
  else
    {
      if (!parts.base)
        {
          if (regno0 != regno2)
            emit_insn (gen_rtx_SET (target, parts.index));
        }
      else if (!parts.index)
        {
          if (regno0 != regno1)
            emit_insn (gen_rtx_SET (target, parts.base));
        }
      else
        {
          if (regno0 == regno1)
            tmp = parts.index;
          else if (regno0 == regno2)
            tmp = parts.base;
          else
            {
              rtx tmp1;

              /* Find better operand for SET instruction, depending
                 on which definition is farther from the insn.  */
              if (find_nearest_reg_def (insn, regno1, regno2))
                tmp = parts.index, tmp1 = parts.base;
              else
                tmp = parts.base, tmp1 = parts.index;

              emit_insn (gen_rtx_SET (target, tmp));

              if (parts.disp && parts.disp != const0_rtx)
                ix86_emit_binop (PLUS, mode, target, parts.disp);

              ix86_emit_binop (PLUS, mode, target, tmp1);
              return;
            }

          ix86_emit_binop (PLUS, mode, target, tmp);
        }

      if (parts.disp && parts.disp != const0_rtx)
        ix86_emit_binop (PLUS, mode, target, parts.disp);
    }
}
/* Post-reload splitter for converting an SF or DFmode value in an
   SSE register into an unsigned SImode.  */

void
ix86_split_convert_uns_si_sse (rtx operands[])
{
  machine_mode vecmode;
  rtx value, large, zero_or_two31, input, two31, x;

  large = operands[1];
  zero_or_two31 = operands[2];
  input = operands[3];
  two31 = operands[4];
  vecmode = GET_MODE (large);
  value = gen_rtx_REG (vecmode, REGNO (operands[0]));

  /* Load up the value into the low element.  We must ensure that the other
     elements are valid floats -- zero is the easiest such value.  */
  if (MEM_P (input))
    {
      if (vecmode == V4SFmode)
        emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
      else
        emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
    }
  else
    {
      input = gen_rtx_REG (vecmode, REGNO (input));
      emit_move_insn (value, CONST0_RTX (vecmode));
      if (vecmode == V4SFmode)
        emit_insn (gen_sse_movss (value, value, input));
      else
        emit_insn (gen_sse2_movsd (value, value, input));
    }

  emit_move_insn (large, two31);
  emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);

  x = gen_rtx_fmt_ee (LE, vecmode, large, value);
  emit_insn (gen_rtx_SET (large, x));

  x = gen_rtx_AND (vecmode, zero_or_two31, large);
  emit_insn (gen_rtx_SET (zero_or_two31, x));

  x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
  emit_insn (gen_rtx_SET (value, x));

  large = gen_rtx_REG (V4SImode, REGNO (large));
  emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));

  x = gen_rtx_REG (V4SImode, REGNO (value));
  if (vecmode == V4SFmode)
    emit_insn (gen_fix_truncv4sfv4si2 (x, value));
  else
    emit_insn (gen_sse2_cvttpd2dq (x, value));
  value = x;

  emit_insn (gen_xorv4si3 (value, value, large));
}
static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
                                                 machine_mode mode, rtx target,
                                                 rtx var, int one_var);
/* Convert an unsigned DImode value into a DFmode, using only SSE.
   Expects the 64-bit DImode to be supplied in a pair of integral
   registers.  Requires SSE2; will use SSE3 if available.  For x86_32,
   -mfpmath=sse, !optimize_size only.  */

void
ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
  rtx int_xmm, fp_xmm;
  rtx biases, exponents;
  rtx x;

  int_xmm = gen_reg_rtx (V4SImode);
  if (TARGET_INTER_UNIT_MOVES_TO_VEC)
    emit_insn (gen_movdi_to_sse (int_xmm, input));
  else if (TARGET_SSE_SPLIT_REGS)
    {
      emit_clobber (int_xmm);
      emit_move_insn (gen_lowpart (DImode, int_xmm), input);
    }
  else
    {
      x = gen_reg_rtx (V2DImode);
      ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
      emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
    }

  x = gen_rtx_CONST_VECTOR (V4SImode,
                            gen_rtvec (4, GEN_INT (0x43300000UL),
                                       GEN_INT (0x45300000UL),
                                       const0_rtx, const0_rtx));
  exponents = validize_mem (force_const_mem (V4SImode, x));

  /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
  emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));

  /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
     yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
     Similarly (0x45300000UL ## fp_value_hi_xmm) yields
     (0x1.0p84 + double(fp_value_hi_xmm)).
     Note these exponents differ by 32.  */

  fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));

  /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
     in [0,2**32-1] and [0]+[2**32,2**64-1] respectively.  */
  real_ldexp (&bias_lo_rvt, &dconst1, 52);
  real_ldexp (&bias_hi_rvt, &dconst1, 84);
  biases = const_double_from_real_value (bias_lo_rvt, DFmode);
  x = const_double_from_real_value (bias_hi_rvt, DFmode);
  biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
  biases = validize_mem (force_const_mem (V2DFmode, biases));
  emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));

  /* Add the upper and lower DFmode values together.  */
  if (TARGET_SSE3)
    emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
  else
    {
      x = copy_to_mode_reg (V2DFmode, fp_xmm);
      emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
      emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
    }

  ix86_expand_vector_extract (false, target, fp_xmm, 0);
}
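
/* Worked example: for input N = hi * 2^32 + lo, the interleave forms
   the doubles (2^52 + lo) and (2^84 + hi * 2^32) by bit-juxtaposition;
   subtracting the 2^52 and 2^84 biases leaves exactly lo and
   hi * 2^32, whose sum is N, so no precision is lost before the final
   add.  */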
/* Not used, but eases macroization of patterns.  */
void
ix86_expand_convert_uns_sixf_sse (rtx, rtx)
{
  gcc_unreachable ();
}

static rtx ix86_expand_sse_fabs (rtx op0, rtx *smask);
/* Convert an unsigned SImode value into a DFmode.  Only currently used
   for SSE, but applicable anywhere.  */

void
ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE TWO31r;
  rtx x, fp;

  x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
                           NULL, 1, OPTAB_DIRECT);

  fp = gen_reg_rtx (DFmode);
  emit_insn (gen_floatsidf2 (fp, x));

  real_ldexp (&TWO31r, &dconst1, 31);
  x = const_double_from_real_value (TWO31r, DFmode);

  x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);

  /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
  if (HONOR_SIGNED_ZEROS (DFmode) && flag_rounding_math)
    x = ix86_expand_sse_fabs (x, NULL);

  if (x != target)
    emit_move_insn (target, x);
}
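
/* E.g. input 3000000000: the PLUS of -2^31 wraps to the signed value
   852516352, which converts exactly to DFmode; adding 2^31 back as a
   double yields 3000000000.0.  This is exact because every 32-bit
   integer is representable in DFmode.  */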
/* Convert a signed DImode value into a DFmode.  Only used for SSE in
   32-bit mode; otherwise we have a direct convert instruction.  */

void
ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE TWO32r;
  rtx fp_lo, fp_hi, x;

  fp_lo = gen_reg_rtx (DFmode);
  fp_hi = gen_reg_rtx (DFmode);

  emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));

  real_ldexp (&TWO32r, &dconst1, 32);
  x = const_double_from_real_value (TWO32r, DFmode);
  fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);

  ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));

  x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
                           0, OPTAB_DIRECT);

  if (x != target)
    emit_move_insn (target, x);
}
/* Convert an unsigned SImode value into a SFmode, using only SSE.
   For x86_32, -mfpmath=sse, !optimize_size only.  */

void
ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE ONE16r;
  rtx fp_hi, fp_lo, int_hi, int_lo, x;

  real_ldexp (&ONE16r, &dconst1, 16);
  x = const_double_from_real_value (ONE16r, SFmode);
  int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
                                NULL, 0, OPTAB_DIRECT);
  int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
                                NULL, 0, OPTAB_DIRECT);
  fp_hi = gen_reg_rtx (SFmode);
  fp_lo = gen_reg_rtx (SFmode);
  emit_insn (gen_floatsisf2 (fp_hi, int_hi));
  emit_insn (gen_floatsisf2 (fp_lo, int_lo));
  if (TARGET_FMA)
    {
      x = validize_mem (force_const_mem (SFmode, x));
      fp_hi = gen_rtx_FMA (SFmode, fp_hi, x, fp_lo);
      emit_move_insn (target, fp_hi);
    }
  else
    {
      fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
                                   0, OPTAB_DIRECT);
      fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
                                   0, OPTAB_DIRECT);
      if (!rtx_equal_p (target, fp_hi))
        emit_move_insn (target, fp_hi);
    }
}
/* floatunsv{4,8}siv{4,8}sf2 expander.  Expand code to convert
   a vector of unsigned ints VAL to vector of floats TARGET.  */

void
ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
{
  rtx tmp[8];
  REAL_VALUE_TYPE TWO16r;
  machine_mode intmode = GET_MODE (val);
  machine_mode fltmode = GET_MODE (target);
  rtx (*cvt) (rtx, rtx);

  if (intmode == V4SImode)
    cvt = gen_floatv4siv4sf2;
  else
    cvt = gen_floatv8siv8sf2;
  tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
  tmp[0] = force_reg (intmode, tmp[0]);
  tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
                                OPTAB_DIRECT);
  tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
                                NULL_RTX, 1, OPTAB_DIRECT);
  tmp[3] = gen_reg_rtx (fltmode);
  emit_insn (cvt (tmp[3], tmp[1]));
  tmp[4] = gen_reg_rtx (fltmode);
  emit_insn (cvt (tmp[4], tmp[2]));
  real_ldexp (&TWO16r, &dconst1, 16);
  tmp[5] = const_double_from_real_value (TWO16r, SFmode);
  tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
  if (TARGET_FMA)
    {
      tmp[6] = gen_rtx_FMA (fltmode, tmp[4], tmp[5], tmp[3]);
      emit_move_insn (target, tmp[6]);
    }
  else
    {
      tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5],
                                    NULL_RTX, 1, OPTAB_DIRECT);
      tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6],
                                    target, 1, OPTAB_DIRECT);
      if (tmp[7] != target)
        emit_move_insn (target, tmp[7]);
    }
}
/* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
   pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
   This is done by doing just signed conversion if < 0x1p31, and otherwise by
   subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards.  */

rtx
ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
{
  REAL_VALUE_TYPE TWO31r;
  rtx two31r, tmp[4];
  machine_mode mode = GET_MODE (val);
  machine_mode scalarmode = GET_MODE_INNER (mode);
  machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
  rtx (*cmp) (rtx, rtx, rtx, rtx);
  int i;

  for (i = 0; i < 3; i++)
    tmp[i] = gen_reg_rtx (mode);
  real_ldexp (&TWO31r, &dconst1, 31);
  two31r = const_double_from_real_value (TWO31r, scalarmode);
  two31r = ix86_build_const_vector (mode, 1, two31r);
  two31r = force_reg (mode, two31r);
  switch (mode)
    {
    case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
    case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
    case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
    case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
    default: gcc_unreachable ();
    }
  tmp[3] = gen_rtx_LE (mode, two31r, val);
  emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
  tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
                                0, OPTAB_DIRECT);
  if (intmode == V4SImode || TARGET_AVX2)
    *xorp = expand_simple_binop (intmode, ASHIFT,
                                 gen_lowpart (intmode, tmp[0]),
                                 GEN_INT (31), NULL_RTX, 0,
                                 OPTAB_DIRECT);
  else
    {
      rtx two31 = gen_int_mode (HOST_WIDE_INT_1U << 31, SImode);
      two31 = ix86_build_const_vector (intmode, 1, two31);
      *xorp = expand_simple_binop (intmode, AND,
                                   gen_lowpart (intmode, tmp[0]),
                                   two31, NULL_RTX, 0,
                                   OPTAB_DIRECT);
    }
  return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
                              0, OPTAB_DIRECT);
}
/* Generate code for floating point ABS or NEG.  */

void
ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
                                rtx operands[])
{
  rtx set, dst, src;
  bool use_sse = false;
  bool vector_mode = VECTOR_MODE_P (mode);
  machine_mode vmode = mode;
  rtvec par;

  if (vector_mode || mode == TFmode || mode == HFmode)
    {
      use_sse = true;
      if (mode == HFmode)
        vmode = V8HFmode;
    }
  else if (TARGET_SSE_MATH)
    {
      use_sse = SSE_FLOAT_MODE_P (mode);
      if (mode == SFmode)
        vmode = V4SFmode;
      else if (mode == DFmode)
        vmode = V2DFmode;
    }

  dst = operands[0];
  src = operands[1];

  set = gen_rtx_fmt_e (code, mode, src);
  set = gen_rtx_SET (dst, set);

  if (use_sse)
    {
      rtx mask, use, clob;

      /* NEG and ABS performed with SSE use bitwise mask operations.
         Create the appropriate mask now.  */
      mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
      use = gen_rtx_USE (VOIDmode, mask);
      if (vector_mode || mode == TFmode)
        par = gen_rtvec (2, set, use);
      else
        {
          clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
          par = gen_rtvec (3, set, use, clob);
        }
    }
  else
    {
      rtx clob;

      /* Changing of sign for FP values is doable using integer unit too.  */
      clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      par = gen_rtvec (2, set, clob);
    }

  emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
}
/* Deconstruct a floating point ABS or NEG operation
   with integer registers into integer operations.  */

void
ix86_split_fp_absneg_operator (enum rtx_code code, machine_mode mode,
                               rtx operands[])
{
  enum rtx_code absneg_op;
  rtx dst, set;

  gcc_assert (operands_match_p (operands[0], operands[1]));

  switch (mode)
    {
    case E_SFmode:
      dst = gen_lowpart (SImode, operands[0]);

      if (code == ABS)
        {
          set = gen_int_mode (0x7fffffff, SImode);
          absneg_op = AND;
        }
      else
        {
          set = gen_int_mode (0x80000000, SImode);
          absneg_op = XOR;
        }
      set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
      break;

    case E_DFmode:
      if (TARGET_64BIT)
        {
          dst = gen_lowpart (DImode, operands[0]);
          dst = gen_rtx_ZERO_EXTRACT (DImode, dst, const1_rtx, GEN_INT (63));

          if (code == ABS)
            set = const0_rtx;
          else
            set = gen_rtx_NOT (DImode, dst);
        }
      else
        {
          dst = gen_highpart (SImode, operands[0]);

          if (code == ABS)
            {
              set = gen_int_mode (0x7fffffff, SImode);
              absneg_op = AND;
            }
          else
            {
              set = gen_int_mode (0x80000000, SImode);
              absneg_op = XOR;
            }
          set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
        }
      break;

    case E_XFmode:
      dst = gen_rtx_REG (SImode,
                         REGNO (operands[0]) + (TARGET_64BIT ? 1 : 2));
      if (code == ABS)
        {
          set = GEN_INT (0x7fff);
          absneg_op = AND;
        }
      else
        {
          set = GEN_INT (0x8000);
          absneg_op = XOR;
        }
      set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
      break;

    default:
      gcc_unreachable ();
    }

  set = gen_rtx_SET (dst, set);

  rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
  rtvec par = gen_rtvec (2, set, clob);

  emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
}
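
/* E.g. a DFmode negation without SSE on a 32-bit target becomes
   "xorl $0x80000000, <highpart>": flipping the sign bit of the high
   SImode word negates the double without touching the mantissa.  */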
/* Expand a copysign operation.  Special case operand 0 being a constant.  */

void
ix86_expand_copysign (rtx operands[])
{
  machine_mode mode, vmode;
  rtx dest, vdest, op0, op1, mask, op2, op3;

  mode = GET_MODE (operands[0]);

  if (mode == HFmode)
    vmode = V8HFmode;
  else if (mode == SFmode)
    vmode = V4SFmode;
  else if (mode == DFmode)
    vmode = V2DFmode;
  else if (mode == TFmode)
    vmode = mode;
  else
    gcc_unreachable ();

  if (rtx_equal_p (operands[1], operands[2]))
    {
      emit_move_insn (operands[0], operands[1]);
      return;
    }

  dest = operands[0];
  vdest = lowpart_subreg (vmode, dest, mode);
  if (vdest == NULL_RTX)
    vdest = gen_reg_rtx (vmode);
  else
    dest = NULL_RTX;
  op1 = lowpart_subreg (vmode, force_reg (mode, operands[2]), mode);
  mask = ix86_build_signbit_mask (vmode, 0, 0);

  if (CONST_DOUBLE_P (operands[1]))
    {
      op0 = simplify_unary_operation (ABS, mode, operands[1], mode);
      /* Optimize for 0, simplify b = copy_signf (0.0f, a) to b = mask & a.  */
      if (op0 == CONST0_RTX (mode))
        {
          emit_move_insn (vdest, gen_rtx_AND (vmode, mask, op1));
          if (dest)
            emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
          return;
        }

      if (GET_MODE_SIZE (mode) < 16)
        op0 = ix86_build_const_vector (vmode, false, op0);
      op0 = force_reg (vmode, op0);
    }
  else
    op0 = lowpart_subreg (vmode, force_reg (mode, operands[1]), mode);

  op2 = gen_reg_rtx (vmode);
  op3 = gen_reg_rtx (vmode);
  emit_move_insn (op2, gen_rtx_AND (vmode,
                                    gen_rtx_NOT (vmode, mask),
                                    op0));
  emit_move_insn (op3, gen_rtx_AND (vmode, mask, op1));
  emit_move_insn (vdest, gen_rtx_IOR (vmode, op2, op3));
  if (dest)
    emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
}
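
/* In the general case this computes
     dest = (operands[1] & ~signmask) | (operands[2] & signmask)
   i.e. the magnitude of operands[1] combined with the sign of
   operands[2], using an and-not, an and and an or in the chosen
   vector mode.  */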
/* Expand an xorsign operation.  */

void
ix86_expand_xorsign (rtx operands[])
{
  machine_mode mode, vmode;
  rtx dest, vdest, op0, op1, mask, x, temp;

  dest = operands[0];
  op0 = operands[1];
  op1 = operands[2];

  mode = GET_MODE (dest);

  if (mode == HFmode)
    vmode = V8HFmode;
  else if (mode == SFmode)
    vmode = V4SFmode;
  else if (mode == DFmode)
    vmode = V2DFmode;
  else
    gcc_unreachable ();

  temp = gen_reg_rtx (vmode);
  mask = ix86_build_signbit_mask (vmode, 0, 0);

  op1 = lowpart_subreg (vmode, force_reg (mode, op1), mode);
  x = gen_rtx_AND (vmode, op1, mask);
  emit_insn (gen_rtx_SET (temp, x));

  op0 = lowpart_subreg (vmode, force_reg (mode, op0), mode);
  x = gen_rtx_XOR (vmode, temp, op0);

  vdest = lowpart_subreg (vmode, dest, mode);
  if (vdest == NULL_RTX)
    vdest = gen_reg_rtx (vmode);
  else
    dest = NULL_RTX;
  emit_insn (gen_rtx_SET (vdest, x));

  if (dest)
    emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
}
static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1);
void
ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
{
  machine_mode mode = GET_MODE (op0);
  rtx tmp;

  /* Handle special case - vector comparison with boolean result, transform
     it using ptest instruction.  */
  if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
    {
      rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
      machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;

      gcc_assert (code == EQ || code == NE);
      /* Generate XOR since we can't check that one operand is zero vector.  */
      tmp = gen_reg_rtx (mode);
      emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
      tmp = gen_lowpart (p_mode, tmp);
      emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
                              gen_rtx_UNSPEC (CCmode,
                                              gen_rtvec (2, tmp, tmp),
                                              UNSPEC_PTEST)));
      tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
                                  gen_rtx_LABEL_REF (VOIDmode, label),
                                  pc_rtx);
      emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      return;
    }

  switch (mode)
    {
    case E_SFmode:
    case E_DFmode:
    case E_XFmode:
    case E_QImode:
    case E_HImode:
    case E_SImode:
      simple:
      tmp = ix86_expand_compare (code, op0, op1);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
                                  gen_rtx_LABEL_REF (VOIDmode, label),
                                  pc_rtx);
      emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      return;

    case E_DImode:
      if (TARGET_64BIT)
        goto simple;
      /* For 32-bit target DI comparison may be performed on
         SSE registers.  To allow this we should avoid split
         to SI mode which is achieved by doing xor in DI mode
         and then comparing with zero (which is recognized by
         STV pass).  We don't compare using xor when optimizing
         for size.  */
      if (!optimize_insn_for_size_p ()
          && TARGET_STV
          && (code == EQ || code == NE))
        {
          op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
          op1 = const0_rtx;
        }
      /* FALLTHRU */
    case E_TImode:
      /* Expand DImode branch into multiple compare+branch.  */
      {
        rtx lo[2], hi[2];
        rtx_code_label *label2;
        enum rtx_code code1, code2, code3;
        machine_mode submode;

        if (CONSTANT_P (op0) && !CONSTANT_P (op1))
          {
            std::swap (op0, op1);
            code = swap_condition (code);
          }

        split_double_mode (mode, &op0, 1, lo+0, hi+0);
        split_double_mode (mode, &op1, 1, lo+1, hi+1);

        submode = mode == DImode ? SImode : DImode;

        /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
           avoid two branches.  This costs one extra insn, so disable when
           optimizing for size.  */

        if ((code == EQ || code == NE)
            && (!optimize_insn_for_size_p ()
                || hi[1] == const0_rtx || lo[1] == const0_rtx))
          {
            rtx xor0, xor1;

            xor1 = hi[0];
            if (hi[1] != const0_rtx)
              xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
                                   NULL_RTX, 0, OPTAB_WIDEN);

            xor0 = lo[0];
            if (lo[1] != const0_rtx)
              xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
                                   NULL_RTX, 0, OPTAB_WIDEN);

            tmp = expand_binop (submode, ior_optab, xor1, xor0,
                                NULL_RTX, 0, OPTAB_WIDEN);

            ix86_expand_branch (code, tmp, const0_rtx, label);
            return;
          }

        /* Otherwise, if we are doing less-than or greater-or-equal-than,
           op1 is a constant and the low word is zero, then we can just
           examine the high word.  Similarly for low word -1 and
           less-or-equal-than or greater-than.  */

        if (CONST_INT_P (hi[1]))
          switch (code)
            {
            case LT: case LTU: case GE: case GEU:
              if (lo[1] == const0_rtx)
                {
                  ix86_expand_branch (code, hi[0], hi[1], label);
                  return;
                }
              break;
            case LE: case LEU: case GT: case GTU:
              if (lo[1] == constm1_rtx)
                {
                  ix86_expand_branch (code, hi[0], hi[1], label);
                  return;
                }
              break;
            default:
              break;
            }

        /* Emulate comparisons that do not depend on Zero flag with
           double-word subtraction.  Note that only Overflow, Sign
           and Carry flags are valid, so swap arguments and condition
           of comparisons that would otherwise test Zero flag.  */

        switch (code)
          {
          case LE: case LEU: case GT: case GTU:
            std::swap (lo[0], lo[1]);
            std::swap (hi[0], hi[1]);
            code = swap_condition (code);
            /* FALLTHRU */

          case LT: case LTU: case GE: case GEU:
            {
              bool uns = (code == LTU || code == GEU);
              rtx (*sbb_insn) (machine_mode, rtx, rtx, rtx)
                = uns ? gen_sub3_carry_ccc : gen_sub3_carry_ccgz;

              if (!nonimmediate_operand (lo[0], submode))
                lo[0] = force_reg (submode, lo[0]);
              if (!x86_64_general_operand (lo[1], submode))
                lo[1] = force_reg (submode, lo[1]);

              if (!register_operand (hi[0], submode))
                hi[0] = force_reg (submode, hi[0]);
              if ((uns && !nonimmediate_operand (hi[1], submode))
                  || (!uns && !x86_64_general_operand (hi[1], submode)))
                hi[1] = force_reg (submode, hi[1]);

              emit_insn (gen_cmp_1 (submode, lo[0], lo[1]));

              tmp = gen_rtx_SCRATCH (submode);
              emit_insn (sbb_insn (submode, tmp, hi[0], hi[1]));

              tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
              ix86_expand_branch (code, tmp, const0_rtx, label);
              return;
            }

          default:
            break;
          }

        /* Otherwise, we need two or three jumps.  */

        label2 = gen_label_rtx ();

        code1 = code;
        code2 = swap_condition (code);
        code3 = unsigned_condition (code);

        switch (code)
          {
          case LT: case GT: case LTU: case GTU:
            break;

          case LE:   code1 = LT;  code2 = GT;  break;
          case GE:   code1 = GT;  code2 = LT;  break;
          case LEU:  code1 = LTU; code2 = GTU; break;
          case GEU:  code1 = GTU; code2 = LTU; break;

          case EQ:   code1 = UNKNOWN; code2 = NE;  break;
          case NE:   code2 = UNKNOWN; break;

          default:
            gcc_unreachable ();
          }

        /*
         * a < b =>
         *    if (hi(a) < hi(b)) goto true;
         *    if (hi(a) > hi(b)) goto false;
         *    if (lo(a) < lo(b)) goto true;
         *  false:
         */

        if (code1 != UNKNOWN)
          ix86_expand_branch (code1, hi[0], hi[1], label);
        if (code2 != UNKNOWN)
          ix86_expand_branch (code2, hi[0], hi[1], label2);

        ix86_expand_branch (code3, lo[0], lo[1], label);

        if (code2 != UNKNOWN)
          emit_label (label2);
        return;
      }

    default:
      gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
      goto simple;
    }
}
/* Figure out whether to use unordered fp comparisons.  */

static bool
ix86_unordered_fp_compare (enum rtx_code code)
{
  if (!TARGET_IEEE_FP)
    return false;

  switch (code)
    {
    case LT:
    case LE:
    case GT:
    case GE:
    case LTGT:
      return false;

    case EQ:
    case NE:

    case UNORDERED:
    case ORDERED:
    case UNLT:
    case UNLE:
    case UNGT:
    case UNGE:
    case UNEQ:
      return true;

    default:
      gcc_unreachable ();
    }
}
/* Return a comparison we can do and that it is equivalent to
   swap_condition (code) apart possibly from orderedness.
   But, never change orderedness if TARGET_IEEE_FP, returning
   UNKNOWN in that case if necessary.  */

static enum rtx_code
ix86_fp_swap_condition (enum rtx_code code)
{
  switch (code)
    {
    case GT:			/* GTU - CF=0 & ZF=0 */
      return TARGET_IEEE_FP ? UNKNOWN : UNLT;
    case GE:			/* GEU - CF=0 */
      return TARGET_IEEE_FP ? UNKNOWN : UNLE;
    case UNLT:			/* LTU - CF=1 */
      return TARGET_IEEE_FP ? UNKNOWN : GT;
    case UNLE:			/* LEU - CF=1 | ZF=1 */
      return TARGET_IEEE_FP ? UNKNOWN : GE;
    default:
      return swap_condition (code);
    }
}
/* Return cost of comparison CODE using the best strategy for performance.
   All following functions use the number of instructions as a cost metric.
   In future this should be tweaked to compute bytes for optimize_size and
   take into account performance of various instructions on various CPUs.  */

static int
ix86_fp_comparison_cost (enum rtx_code code)
{
  int arith_cost;

  /* The cost of code using bit-twiddling on %ah.  */
  switch (code)
    {
    case UNLE:
    case UNLT:
    case LTGT:
    case GT:
    case GE:
    case UNORDERED:
    case ORDERED:
    case UNEQ:
      arith_cost = 4;
      break;
    case LT:
    case NE:
    case EQ:
    case UNGE:
      arith_cost = TARGET_IEEE_FP ? 5 : 4;
      break;
    case LE:
    case UNGT:
      arith_cost = TARGET_IEEE_FP ? 6 : 4;
      break;
    default:
      gcc_unreachable ();
    }

  switch (ix86_fp_comparison_strategy (code))
    {
    case IX86_FPCMP_COMI:
      return arith_cost > 4 ? 3 : 2;
    case IX86_FPCMP_SAHF:
      return arith_cost > 4 ? 4 : 3;
    default:
      return arith_cost;
    }
}
/* Swap, force into registers, or otherwise massage the two operands
   to a fp comparison.  The operands are updated in place; the new
   comparison code is returned.  */

static enum rtx_code
ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
{
  bool unordered_compare = ix86_unordered_fp_compare (code);
  rtx op0 = *pop0, op1 = *pop1;
  machine_mode op_mode = GET_MODE (op0);
  bool is_sse = SSE_FLOAT_MODE_SSEMATH_OR_HF_P (op_mode);

  /* All of the unordered compare instructions only work on registers.
     The same is true of the fcomi compare instructions.  The XFmode
     compare instructions require registers except when comparing
     against zero or when converting operand 1 from fixed point to
     floating point.  */

  if (!is_sse
      && (unordered_compare
	  || (op_mode == XFmode
	      && ! (standard_80387_constant_p (op0) == 1
		    || standard_80387_constant_p (op1) == 1)
	      && GET_CODE (op1) != FLOAT)
	  || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
    {
      op0 = force_reg (op_mode, op0);
      op1 = force_reg (op_mode, op1);
    }
  else
    {
      /* %%% We only allow op1 in memory; op0 must be st(0).  So swap
	 things around if they appear profitable, otherwise force op0
	 into a register.  */

      if (standard_80387_constant_p (op0) == 0
	  || (MEM_P (op0)
	      && ! (standard_80387_constant_p (op1) == 0
		    || MEM_P (op1))))
	{
	  enum rtx_code new_code = ix86_fp_swap_condition (code);
	  if (new_code != UNKNOWN)
	    {
	      std::swap (op0, op1);
	      code = new_code;
	    }
	}

      if (!REG_P (op0))
	op0 = force_reg (op_mode, op0);

      if (CONSTANT_P (op1))
	{
	  int tmp = standard_80387_constant_p (op1);
	  if (tmp == 0)
	    op1 = validize_mem (force_const_mem (op_mode, op1));
	  else if (tmp == 1)
	    {
	      if (TARGET_CMOVE)
		op1 = force_reg (op_mode, op1);
	    }
	  else
	    op1 = force_reg (op_mode, op1);
	}
    }

  /* Try to rearrange the comparison to make it cheaper.  */
  if (ix86_fp_comparison_cost (code)
      > ix86_fp_comparison_cost (swap_condition (code))
      && (REG_P (op1) || can_create_pseudo_p ()))
    {
      std::swap (op0, op1);
      code = swap_condition (code);
      if (!REG_P (op0))
	op0 = force_reg (op_mode, op0);
    }

  *pop0 = op0;
  *pop1 = op1;
  return code;
}
/* Generate insn patterns to do a floating point compare of OPERANDS.  */

static rtx
ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)
{
  bool unordered_compare = ix86_unordered_fp_compare (code);
  machine_mode cmp_mode;
  rtx tmp, scratch;

  code = ix86_prepare_fp_compare_args (code, &op0, &op1);

  tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
  if (unordered_compare)
    tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);

  /* Do fcomi/sahf based test when profitable.  */
  switch (ix86_fp_comparison_strategy (code))
    {
    case IX86_FPCMP_COMI:
      cmp_mode = CCFPmode;
      emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
      break;

    case IX86_FPCMP_SAHF:
      cmp_mode = CCFPmode;
      tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
      scratch = gen_reg_rtx (HImode);
      emit_insn (gen_rtx_SET (scratch, tmp));
      emit_insn (gen_x86_sahf_1 (scratch));
      break;

    case IX86_FPCMP_ARITH:
      cmp_mode = CCNOmode;
      tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
      scratch = gen_reg_rtx (HImode);
      emit_insn (gen_rtx_SET (scratch, tmp));

      /* In the unordered case, we have to check C2 for NaN's, which
	 doesn't happen to work out to anything nice combination-wise.
	 So do some bit twiddling on the value we've got in AH to come
	 up with an appropriate set of condition codes.  */

      switch (code)
	{
	case GT:
	case UNGT:
	  if (code == GT || !TARGET_IEEE_FP)
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
	      code = EQ;
	    }
	  else
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
	      cmp_mode = CCmode;
	      code = GEU;
	    }
	  break;
	case LT:
	case UNLT:
	  if (code == LT && TARGET_IEEE_FP)
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
	      cmp_mode = CCmode;
	      code = EQ;
	    }
	  else
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
	      code = NE;
	    }
	  break;
	case GE:
	case UNGE:
	  if (code == GE || !TARGET_IEEE_FP)
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
	      code = EQ;
	    }
	  else
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
	      code = NE;
	    }
	  break;
	case LE:
	case UNLE:
	  if (code == LE && TARGET_IEEE_FP)
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
	      cmp_mode = CCmode;
	      code = LTU;
	    }
	  else
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
	      code = NE;
	    }
	  break;
	case EQ:
	case UNEQ:
	  if (code == EQ && TARGET_IEEE_FP)
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
	      cmp_mode = CCmode;
	      code = EQ;
	    }
	  else
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
	      code = NE;
	    }
	  break;
	case NE:
	case LTGT:
	  if (code == NE && TARGET_IEEE_FP)
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
					     GEN_INT (0x40)));
	      code = NE;
	    }
	  else
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
	      code = EQ;
	    }
	  break;

	case UNORDERED:
	  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
	  code = NE;
	  break;
	case ORDERED:
	  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
	  code = EQ;
	  break;

	default:
	  gcc_unreachable ();
	}
      break;

    default:
      gcc_unreachable ();
    }

  /* Return the test that should be put into the flags user, i.e.
     the bcc, scc, or cmov instruction.  */
  return gen_rtx_fmt_ee (code, VOIDmode,
			 gen_rtx_REG (cmp_mode, FLAGS_REG),
			 const0_rtx);
}
/* Generate insn patterns to do an integer compare of OPERANDS.  */

static rtx
ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
{
  machine_mode cmpmode;
  rtx tmp, flags;

  /* Swap operands to emit carry flag comparison.  */
  if ((code == GTU || code == LEU)
      && nonimmediate_operand (op1, VOIDmode))
    {
      std::swap (op0, op1);
      code = swap_condition (code);
    }

  cmpmode = SELECT_CC_MODE (code, op0, op1);
  flags = gen_rtx_REG (cmpmode, FLAGS_REG);

  /* This is very simple, but making the interface the same as in the
     FP case makes the rest of the code easier.  */
  tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
  emit_insn (gen_rtx_SET (flags, tmp));

  /* Return the test that should be put into the flags user, i.e.
     the bcc, scc, or cmov instruction.  */
  return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
}
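/* Expand the comparison CODE between OP0 and OP1 and return the RTX
   test for the flags consumer (bcc, setcc or cmov), dispatching to the
   FP or integer expander above as appropriate.  */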
static rtx
ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
{
  rtx ret;

  if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
    ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);

  else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
    {
      gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
      ret = ix86_expand_fp_compare (code, op0, op1);
    }
  else
    ret = ix86_expand_int_compare (code, op0, op1);

  return ret;
}
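/* Expand a setcc: store into the QImode register DEST the result of
   comparing OP0 and OP1 with condition CODE.  */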
void
ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
{
  rtx ret;

  gcc_assert (GET_MODE (dest) == QImode);

  ret = ix86_expand_compare (code, op0, op1);
  PUT_MODE (ret, QImode);
  emit_insn (gen_rtx_SET (dest, ret));
}
/* Expand floating point op0 <=> op1, i.e.
   dest = op0 == op1 ? 0 : op0 < op1 ? -1 : op0 > op1 ? 1 : 2.  */

void
ix86_expand_fp_spaceship (rtx dest, rtx op0, rtx op1)
{
  gcc_checking_assert (ix86_fp_comparison_strategy (GT) != IX86_FPCMP_ARITH);
  rtx gt = ix86_expand_fp_compare (GT, op0, op1);
  rtx l0 = gen_label_rtx ();
  rtx l1 = gen_label_rtx ();
  rtx l2 = TARGET_IEEE_FP ? gen_label_rtx () : NULL_RTX;
  rtx lend = gen_label_rtx ();
  rtx tmp;
  rtx_insn *jmp;
  if (l2)
    {
      rtx un = gen_rtx_fmt_ee (UNORDERED, VOIDmode,
			       gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, un,
				  gen_rtx_LABEL_REF (VOIDmode, l2), pc_rtx);
      jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      add_reg_br_prob_note (jmp, profile_probability::very_unlikely ());
    }
  rtx eq = gen_rtx_fmt_ee (UNEQ, VOIDmode,
			   gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx);
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, eq,
			      gen_rtx_LABEL_REF (VOIDmode, l0), pc_rtx);
  jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  add_reg_br_prob_note (jmp, profile_probability::unlikely ());
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, gt,
			      gen_rtx_LABEL_REF (VOIDmode, l1), pc_rtx);
  jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  add_reg_br_prob_note (jmp, profile_probability::even ());
  emit_move_insn (dest, constm1_rtx);
  emit_jump (lend);
  emit_label (l0);
  emit_move_insn (dest, const0_rtx);
  emit_jump (lend);
  emit_label (l1);
  emit_move_insn (dest, const1_rtx);
  if (l2)
    {
      emit_jump (lend);
      emit_label (l2);
      emit_move_insn (dest, const2_rtx);
    }
  emit_label (lend);
}
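/* Note the branch order above: UNORDERED is tested first (IEEE only),
   then UNEQ and GT, storing 2, 0 and 1 respectively, with -1 on the
   fall-through path.  */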
/* Expand comparison setting or clearing carry flag.  Return true when
   successful and set pop for the operation.  */

static bool
ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
{
  machine_mode mode
    = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);

  /* Do not handle double-mode compares that go through special path.  */
  if (mode == (TARGET_64BIT ? TImode : DImode))
    return false;

  if (SCALAR_FLOAT_MODE_P (mode))
    {
      rtx compare_op;
      rtx_insn *compare_seq;

      gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));

      /* Shortcut:  following common codes never translate
	 into carry flag compares.  */
      if (code == EQ || code == NE || code == UNEQ || code == LTGT
	  || code == ORDERED || code == UNORDERED)
	return false;

      /* These comparisons require zero flag; swap operands so they won't.  */
      if ((code == GT || code == UNLE || code == LE || code == UNGT)
	  && !TARGET_IEEE_FP)
	{
	  std::swap (op0, op1);
	  code = swap_condition (code);
	}

      /* Try to expand the comparison and verify that we end up with
	 carry flag based comparison.  This fails to be true only when
	 we decide to expand comparison using arithmetic that is not
	 too common scenario.  */
      start_sequence ();
      compare_op = ix86_expand_fp_compare (code, op0, op1);
      compare_seq = get_insns ();
      end_sequence ();

      if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
	code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
      else
	code = GET_CODE (compare_op);

      if (code != LTU && code != GEU)
	return false;

      emit_insn (compare_seq);
      *pop = compare_op;
      return true;
    }

  if (!INTEGRAL_MODE_P (mode))
    return false;

  switch (code)
    {
    case LTU:
    case GEU:
      break;

    /* Convert a==0 into (unsigned)a<1.  */
    case EQ:
    case NE:
      if (op1 != const0_rtx)
	return false;
      op1 = const1_rtx;
      code = (code == EQ ? LTU : GEU);
      break;

    /* Convert a>b into b<a or a>=b-1.  */
    case GTU:
    case LEU:
      if (CONST_INT_P (op1))
	{
	  op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
	  /* Bail out on overflow.  We still can swap operands but that
	     would force loading of the constant into register.  */
	  if (op1 == const0_rtx
	      || !x86_64_immediate_operand (op1, GET_MODE (op1)))
	    return false;
	  code = (code == GTU ? GEU : LTU);
	}
      else
	{
	  std::swap (op0, op1);
	  code = (code == GTU ? LTU : GEU);
	}
      break;

    /* Convert a>=0 into (unsigned)a<0x80000000.  */
    case LT:
    case GE:
      if (mode == DImode || op1 != const0_rtx)
	return false;
      op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
      code = (code == LT ? GEU : LTU);
      break;
    case LE:
    case GT:
      if (mode == DImode || op1 != constm1_rtx)
	return false;
      op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
      code = (code == LE ? GEU : LTU);
      break;

    default:
      return false;
    }
  /* Swapping operands may cause constant to appear as first operand.  */
  if (!nonimmediate_operand (op0, VOIDmode))
    {
      if (!can_create_pseudo_p ())
	return false;
      op0 = force_reg (mode, op0);
    }
  *pop = ix86_expand_compare (code, op0, op1);
  gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
  return true;
}
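/* The conversions above mean that e.g. "a == 0" is tested as
   "(unsigned) a < 1" and "a > const" as "a >= const + 1", so the
   resulting LTU/GEU test depends only on the carry flag.  */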
/* Expand conditional increment or decrement using adc/sbb instructions.
   The default case using setcc followed by the conditional move can be
   done by generic code.  */
bool
ix86_expand_int_addcc (rtx operands[])
{
  enum rtx_code code = GET_CODE (operands[1]);
  rtx flags;
  rtx (*insn) (machine_mode, rtx, rtx, rtx, rtx, rtx);
  rtx compare_op;
  rtx val = const0_rtx;
  bool fpcmp = false;
  machine_mode mode;
  rtx op0 = XEXP (operands[1], 0);
  rtx op1 = XEXP (operands[1], 1);

  if (operands[3] != const1_rtx
      && operands[3] != constm1_rtx)
    return false;
  if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
    return false;
  code = GET_CODE (compare_op);

  flags = XEXP (compare_op, 0);

  if (GET_MODE (flags) == CCFPmode)
    {
      fpcmp = true;
      code = ix86_fp_compare_code_to_integer (code);
    }

  if (code != LTU)
    {
      val = constm1_rtx;
      if (fpcmp)
	PUT_CODE (compare_op,
		  reverse_condition_maybe_unordered
		    (GET_CODE (compare_op)));
      else
	PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
    }

  mode = GET_MODE (operands[0]);

  /* Construct either adc or sbb insn.  */
  if ((code == LTU) == (operands[3] == constm1_rtx))
    insn = gen_sub3_carry;
  else
    insn = gen_add3_carry;

  emit_insn (insn (mode, operands[0], operands[2], val, flags, compare_op));

  return true;
}
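/* Expand an integer conditional move, operands[0] = operands[1]
   ? operands[2] : operands[3], using setcc/sbb arithmetic or a cmov.
   Return true if successful.  */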
bool
ix86_expand_int_movcc (rtx operands[])
{
  enum rtx_code code = GET_CODE (operands[1]), compare_code;
  rtx_insn *compare_seq;
  rtx compare_op;
  machine_mode mode = GET_MODE (operands[0]);
  bool sign_bit_compare_p = false;
  rtx op0 = XEXP (operands[1], 0);
  rtx op1 = XEXP (operands[1], 1);
  rtx op2 = operands[2];
  rtx op3 = operands[3];

  if (GET_MODE (op0) == TImode
      || (GET_MODE (op0) == DImode
	  && !TARGET_64BIT))
    return false;

  start_sequence ();
  compare_op = ix86_expand_compare (code, op0, op1);
  compare_seq = get_insns ();
  end_sequence ();

  compare_code = GET_CODE (compare_op);

  if ((op1 == const0_rtx && (code == GE || code == LT))
      || (op1 == constm1_rtx && (code == GT || code == LE)))
    sign_bit_compare_p = true;

  /* op0 == op1 ? op0 : op3 is equivalent to op0 == op1 ? op1 : op3,
     but if op1 is a constant, the latter form allows more optimizations,
     either through the last 2 ops being constant handling, or the one
     constant and one variable cases.  On the other side, for cmov the
     former might be better as we don't need to load the constant into
     another register.  */
  if (code == EQ && CONST_INT_P (op1) && rtx_equal_p (op0, op2))
    op2 = op1;
  /* Similarly for op0 != op1 ? op2 : op0 and op0 != op1 ? op2 : op1.  */
  else if (code == NE && CONST_INT_P (op1) && rtx_equal_p (op0, op3))
    op3 = op1;

  /* Don't attempt mode expansion here -- if we had to expand 5 or 6
     HImode insns, we'd be swallowed in word prefix ops.  */

  if ((mode != HImode || TARGET_FAST_PREFIX)
      && (mode != (TARGET_64BIT ? TImode : DImode))
      && CONST_INT_P (op2)
      && CONST_INT_P (op3))
    {
      rtx out = operands[0];
      HOST_WIDE_INT ct = INTVAL (op2);
      HOST_WIDE_INT cf = INTVAL (op3);
      HOST_WIDE_INT diff;

      diff = ct - cf;
      /* Sign bit compares are better done using shifts than by using
	 an sbb sequence.  */
      if (sign_bit_compare_p
	  || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
	{
	  /* Detect overlap between destination and compare sources.  */
	  rtx tmp = out;

	  if (!sign_bit_compare_p)
	    {
	      rtx flags;
	      bool fpcmp = false;

	      compare_code = GET_CODE (compare_op);

	      flags = XEXP (compare_op, 0);

	      if (GET_MODE (flags) == CCFPmode)
		{
		  fpcmp = true;
		  compare_code
		    = ix86_fp_compare_code_to_integer (compare_code);
		}

	      /* To simplify rest of code, restrict to the GEU case.  */
	      if (compare_code == LTU)
		{
		  std::swap (ct, cf);
		  compare_code = reverse_condition (compare_code);
		  code = reverse_condition (code);
		}
	      else
		{
		  if (fpcmp)
		    PUT_CODE (compare_op,
			      reverse_condition_maybe_unordered
			        (GET_CODE (compare_op)));
		  else
		    PUT_CODE (compare_op,
			      reverse_condition (GET_CODE (compare_op)));
		}
	      diff = ct - cf;

	      if (reg_overlap_mentioned_p (out, compare_op))
		tmp = gen_reg_rtx (mode);

	      if (mode == DImode)
		emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
	      else
		emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
						 flags, compare_op));
	    }
	  else
	    {
	      if (code == GT || code == GE)
		code = reverse_condition (code);
	      else
		{
		  std::swap (ct, cf);
		  diff = ct - cf;
		}
	      tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
	    }

	  if (diff == 1)
	    {
	      /*
	       * cmpl op0,op1
	       * sbbl dest,dest
	       * [addl dest, ct]
	       *
	       * Size 5 - 8.
	       */
	      if (ct)
		tmp = expand_simple_binop (mode, PLUS,
					   tmp, GEN_INT (ct),
					   copy_rtx (tmp), 1, OPTAB_DIRECT);
	    }
	  else if (cf == -1)
	    {
	      /*
	       * cmpl op0,op1
	       * sbbl dest,dest
	       * orl $ct, dest
	       *
	       * Size 8.
	       */
	      tmp = expand_simple_binop (mode, IOR,
					 tmp, GEN_INT (ct),
					 copy_rtx (tmp), 1, OPTAB_DIRECT);
	    }
	  else if (diff == -1 && ct)
	    {
	      /*
	       * cmpl op0,op1
	       * sbbl dest,dest
	       * notl dest
	       * [addl dest, cf]
	       *
	       * Size 8 - 11.
	       */
	      tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
	      if (cf)
		tmp = expand_simple_binop (mode, PLUS,
					   copy_rtx (tmp), GEN_INT (cf),
					   copy_rtx (tmp), 1, OPTAB_DIRECT);
	    }
	  else
	    {
	      /*
	       * cmpl op0,op1
	       * sbbl dest,dest
	       * [notl dest]
	       * andl cf - ct, dest
	       * [addl dest, ct]
	       *
	       * Size 8 - 11.
	       */

	      if (cf == 0)
		{
		  cf = ct;
		  ct = 0;
		  tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
		}

	      tmp = expand_simple_binop (mode, AND,
					 copy_rtx (tmp),
					 gen_int_mode (cf - ct, mode),
					 copy_rtx (tmp), 1, OPTAB_DIRECT);
	      if (ct)
		tmp = expand_simple_binop (mode, PLUS,
					   copy_rtx (tmp), GEN_INT (ct),
					   copy_rtx (tmp), 1, OPTAB_DIRECT);
	    }

	  if (!rtx_equal_p (tmp, out))
	    emit_move_insn (copy_rtx (out), copy_rtx (tmp));

	  return true;
	}

      if (diff < 0)
	{
	  machine_mode cmp_mode = GET_MODE (op0);
	  enum rtx_code new_code;

	  if (SCALAR_FLOAT_MODE_P (cmp_mode))
	    {
	      gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));

	      /* We may be reversing a non-trapping
		 comparison to a trapping comparison.  */
	      if (HONOR_NANS (cmp_mode) && flag_trapping_math
		  && code != EQ && code != NE
		  && code != ORDERED && code != UNORDERED)
		new_code = UNKNOWN;
	      else
		new_code = reverse_condition_maybe_unordered (code);
	    }
	  else
	    new_code = ix86_reverse_condition (code, cmp_mode);
	  if (new_code != UNKNOWN)
	    {
	      std::swap (ct, cf);
	      diff = -diff;
	      code = new_code;
	    }
	}

      compare_code = UNKNOWN;
      if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
	  && CONST_INT_P (op1))
	{
	  if (op1 == const0_rtx
	      && (code == LT || code == GE))
	    compare_code = code;
	  else if (op1 == constm1_rtx)
	    {
	      if (code == LE)
		compare_code = LT;
	      else if (code == GT)
		compare_code = GE;
	    }
	}

      /* Optimize dest = (op0 < 0) ? -1 : cf.  */
      if (compare_code != UNKNOWN
	  && GET_MODE (op0) == GET_MODE (out)
	  && (cf == -1 || ct == -1))
	{
	  /* If lea code below could be used, only optimize
	     if it results in a 2 insn sequence.  */

	  if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
		 || diff == 3 || diff == 5 || diff == 9)
	      || (compare_code == LT && ct == -1)
	      || (compare_code == GE && cf == -1))
	    {
	      /*
	       * notl op1	(if necessary)
	       * sarl $31, op1
	       * orl cf, op1
	       */
	      if (ct != -1)
		{
		  cf = ct;
		  ct = -1;
		  code = reverse_condition (code);
		}

	      out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);

	      out = expand_simple_binop (mode, IOR,
					 out, GEN_INT (cf),
					 out, 1, OPTAB_DIRECT);
	      if (out != operands[0])
		emit_move_insn (operands[0], out);

	      return true;
	    }
	}

      if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
	   || diff == 3 || diff == 5 || diff == 9)
	  && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
	  && (mode != DImode
	      || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
	{
	  /*
	   * xorl dest,dest
	   * cmpl op1,op2
	   * setcc dest
	   * lea cf(dest*(ct-cf)),dest
	   *
	   * Size 14.
	   *
	   * This also catches the degenerate setcc-only case.
	   */

	  rtx tmp;
	  int nops;

	  out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);

	  nops = 0;
	  /* On x86_64 the lea instruction operates on Pmode, so we need
	     to get arithmetics done in proper mode to match.  */
	  if (diff == 1)
	    tmp = copy_rtx (out);
	  else
	    {
	      rtx out1;
	      out1 = copy_rtx (out);
	      tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
	      nops++;
	      if (diff & 1)
		{
		  tmp = gen_rtx_PLUS (mode, tmp, out1);
		  nops++;
		}
	    }
	  if (cf != 0)
	    {
	      tmp = plus_constant (mode, tmp, cf);
	      nops++;
	    }
	  if (!rtx_equal_p (tmp, out))
	    {
	      if (nops == 1)
		out = force_operand (tmp, copy_rtx (out));
	      else
		emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
	    }
	  if (!rtx_equal_p (out, operands[0]))
	    emit_move_insn (operands[0], copy_rtx (out));

	  return true;
	}

      /*
       * General case:			Jumpful:
       *   xorl dest,dest		cmpl op1, op2
       *   cmpl op1, op2		movl ct, dest
       *   setcc dest			jcc 1f
       *   decl dest			movl cf, dest
       *   andl (cf-ct),dest		1:
       *   addl ct,dest
       *
       * Size 20.			Size 14.
       *
       * This is reasonably steep, but branch mispredict costs are
       * high on modern cpus, so consider failing only if optimizing
       * for space.
       */

      if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
	  && BRANCH_COST (optimize_insn_for_speed_p (),
			  false) >= 2)
	{
	  if (cf == 0)
	    {
	      machine_mode cmp_mode = GET_MODE (op0);
	      enum rtx_code new_code;

	      if (SCALAR_FLOAT_MODE_P (cmp_mode))
		{
		  gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));

		  /* We may be reversing a non-trapping
		     comparison to a trapping comparison.  */
		  if (HONOR_NANS (cmp_mode) && flag_trapping_math
		      && code != EQ && code != NE
		      && code != ORDERED && code != UNORDERED)
		    new_code = UNKNOWN;
		  else
		    new_code = reverse_condition_maybe_unordered (code);
		}
	      else
		{
		  new_code = ix86_reverse_condition (code, cmp_mode);
		  if (compare_code != UNKNOWN && new_code != UNKNOWN)
		    compare_code = reverse_condition (compare_code);
		}

	      if (new_code != UNKNOWN)
		{
		  cf = ct;
		  ct = 0;
		  code = new_code;
		}
	    }

	  if (compare_code != UNKNOWN)
	    {
	      /* notl op1	(if needed)
		 sarl $31, op1
		 andl (cf-ct), op1
		 addl ct, op1

		 For x < 0 (resp. x <= -1) there will be no notl,
		 so if possible swap the constants to get rid of the
		 complement.
		 True/false will be -1/0 while code below (store flag
		 followed by decrement) is 0/-1, so the constants need
		 to be exchanged once more.  */

	      if (compare_code == GE || !cf)
		{
		  code = reverse_condition (code);
		  compare_code = LT;
		}
	      else
		std::swap (ct, cf);

	      out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
	    }
	  else
	    {
	      out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);

	      out = expand_simple_binop (mode, PLUS, copy_rtx (out),
					 constm1_rtx,
					 copy_rtx (out), 1, OPTAB_DIRECT);
	    }

	  out = expand_simple_binop (mode, AND, copy_rtx (out),
				     gen_int_mode (cf - ct, mode),
				     copy_rtx (out), 1, OPTAB_DIRECT);
	  if (ct)
	    out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
				       copy_rtx (out), 1, OPTAB_DIRECT);
	  if (!rtx_equal_p (out, operands[0]))
	    emit_move_insn (operands[0], copy_rtx (out));

	  return true;
	}
    }

  if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
    {
      /* Try a few things more with specific constants and a variable.  */

      optab op;
      rtx var, orig_out, out, tmp;

      if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
	return false;

      /* If one of the two operands is an interesting constant, load a
	 constant with the above and mask it in with a logical operation.  */

      if (CONST_INT_P (operands[2]))
	{
	  var = operands[3];
	  if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
	    operands[3] = constm1_rtx, op = and_optab;
	  else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
	    operands[3] = const0_rtx, op = ior_optab;
	  else
	    return false;
	}
      else if (CONST_INT_P (operands[3]))
	{
	  var = operands[2];
	  if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
	    {
	      /* For smin (x, 0), expand as "x < 0 ? x : 0" instead of
		 "x <= 0 ? x : 0" to enable sign_bit_compare_p.  */
	      if (code == LE && op1 == const0_rtx && rtx_equal_p (op0, var))
		operands[1]
		  = simplify_gen_relational (LT, VOIDmode,
					     GET_MODE (op0),
					     op0, const0_rtx);

	      operands[2] = constm1_rtx;
	      op = and_optab;
	    }
	  else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
	    operands[2] = const0_rtx, op = ior_optab;
	  else
	    return false;
	}
      else
	return false;

      orig_out = operands[0];
      tmp = gen_reg_rtx (mode);
      operands[0] = tmp;

      /* Recurse to get the constant loaded.  */
      if (!ix86_expand_int_movcc (operands))
	return false;

      /* Mask in the interesting variable.  */
      out = expand_binop (mode, op, var, tmp, orig_out, 0,
			  OPTAB_WIDEN);
      if (!rtx_equal_p (out, orig_out))
	emit_move_insn (copy_rtx (orig_out), copy_rtx (out));

      return true;
    }

  /*
   * For comparison with above,
   *
   * movl cf,dest
   * movl ct,tmp
   * cmpl op1,op2
   * cmovcc tmp,dest
   *
   * Size 15.
   */

  if (! nonimmediate_operand (operands[2], mode))
    operands[2] = force_reg (mode, operands[2]);
  if (! nonimmediate_operand (operands[3], mode))
    operands[3] = force_reg (mode, operands[3]);

  if (! register_operand (operands[2], VOIDmode)
      && (mode == QImode
	  || ! register_operand (operands[3], VOIDmode)))
    operands[2] = force_reg (mode, operands[2]);

  if (mode == QImode
      && ! register_operand (operands[3], VOIDmode))
    operands[3] = force_reg (mode, operands[3]);

  emit_insn (compare_seq);
  emit_insn (gen_rtx_SET (operands[0],
			  gen_rtx_IF_THEN_ELSE (mode,
						compare_op, operands[2],
						operands[3])));
  return true;
}
/* Detect conditional moves that exactly match min/max operational
   semantics.  Note that this is IEEE safe, as long as we don't
   interchange the operands.

   Returns FALSE if this conditional move doesn't match a MIN/MAX,
   and TRUE if the operation is successful and instructions are emitted.  */

static bool
ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
			   rtx cmp_op1, rtx if_true, rtx if_false)
{
  machine_mode mode;
  bool is_min;
  rtx tmp;

  if (code == LT)
    ;
  else if (code == UNGE)
    std::swap (if_true, if_false);
  else
    return false;

  if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
    is_min = true;
  else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
    is_min = false;
  else
    return false;

  mode = GET_MODE (dest);

  /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
     but MODE may be a vector mode and thus not appropriate.  */
  if (!flag_finite_math_only || flag_signed_zeros)
    {
      int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
      rtvec v;

      if_true = force_reg (mode, if_true);
      v = gen_rtvec (2, if_true, if_false);
      tmp = gen_rtx_UNSPEC (mode, v, u);
    }
  else
    {
      code = is_min ? SMIN : SMAX;
      if (MEM_P (if_true) && MEM_P (if_false))
	if_true = force_reg (mode, if_true);
      tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
    }

  emit_insn (gen_rtx_SET (dest, tmp));
  return true;
}
/* Return true if MODE is valid for vector compare to mask register,
   same result for conditional vector move with mask register.  */

static bool
ix86_valid_mask_cmp_mode (machine_mode mode)
{
  /* XOP has its own vector conditional movement.  */
  if (TARGET_XOP && !TARGET_AVX512F)
    return false;

  /* HFmode only supports vcmpsh whose dest is mask register.  */
  if (TARGET_AVX512FP16 && mode == HFmode)
    return true;

  /* AVX512F is needed for mask operation.  */
  if (!(TARGET_AVX512F && VECTOR_MODE_P (mode)))
    return false;

  /* AVX512BW is needed for vector QI/HImode,
     AVX512VL is needed for 128/256-bit vector.  */
  machine_mode inner_mode = GET_MODE_INNER (mode);
  int vector_size = GET_MODE_SIZE (mode);
  if ((inner_mode == QImode || inner_mode == HImode) && !TARGET_AVX512BW)
    return false;

  return vector_size == 64 || TARGET_AVX512VL;
}
/* Return true if integer mask comparison should be used.  */

static bool
ix86_use_mask_cmp_p (machine_mode mode, machine_mode cmp_mode,
		     rtx op_true, rtx op_false)
{
  int vector_size = GET_MODE_SIZE (mode);

  if (cmp_mode == HFmode)
    return true;
  else if (vector_size < 16)
    return false;
  else if (vector_size == 64)
    return true;
  else if (GET_MODE_INNER (cmp_mode) == HFmode)
    return true;

  /* When op_true is NULL, op_false must be NULL, or vice versa.  */
  gcc_assert (!op_true == !op_false);

  /* When op_true/op_false is NULL or cmp_mode is not valid mask cmp mode,
     vector dest is required.  */
  if (!op_true || !ix86_valid_mask_cmp_mode (cmp_mode))
    return false;

  /* Exclude those that could be optimized in ix86_expand_sse_movcc.  */
  if (op_false == CONST0_RTX (mode)
      || op_true == CONST0_RTX (mode)
      || (INTEGRAL_MODE_P (mode)
	  && (op_true == CONSTM1_RTX (mode)
	      || op_false == CONSTM1_RTX (mode))))
    return false;

  return true;
}
/* Expand an SSE comparison.  Return the register with the result.  */

static rtx
ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
		     rtx op_true, rtx op_false)
{
  machine_mode mode = GET_MODE (dest);
  machine_mode cmp_ops_mode = GET_MODE (cmp_op0);

  /* In general case result of comparison can differ from operands' type.  */
  machine_mode cmp_mode;

  /* In AVX512F the result of comparison is an integer mask.  */
  bool maskcmp = false;
  rtx x;

  if (ix86_use_mask_cmp_p (mode, cmp_ops_mode, op_true, op_false))
    {
      unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
      maskcmp = true;
      cmp_mode = nbits > 8 ? int_mode_for_size (nbits, 0).require () : E_QImode;
    }
  else
    cmp_mode = cmp_ops_mode;

  cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);

  bool (*op1_predicate)(rtx, machine_mode)
    = VECTOR_MODE_P (cmp_ops_mode) ? vector_operand : nonimmediate_operand;

  if (!op1_predicate (cmp_op1, cmp_ops_mode))
    cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);

  if (optimize
      || (maskcmp && cmp_mode != mode)
      || (op_true && reg_overlap_mentioned_p (dest, op_true))
      || (op_false && reg_overlap_mentioned_p (dest, op_false)))
    dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);

  if (maskcmp)
    {
      bool ok = ix86_expand_mask_vec_cmp (dest, code, cmp_op0, cmp_op1);
      gcc_assert (ok);
      return dest;
    }

  x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);

  if (cmp_mode != mode)
    {
      x = force_reg (cmp_ops_mode, x);
      convert_move (dest, x, false);
    }
  else
    emit_insn (gen_rtx_SET (dest, x));

  return dest;
}
/* Emit x86 binary operand CODE in mode MODE for SSE vector
   instructions that can be performed using GP registers.  */

static void
ix86_emit_vec_binop (enum rtx_code code, machine_mode mode,
		     rtx dst, rtx src1, rtx src2)
{
  rtx tmp;

  tmp = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));

  if (GET_MODE_SIZE (mode) <= GET_MODE_SIZE (SImode)
      && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
    {
      rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
    }

  emit_insn (tmp);
}
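/* For small vector modes handled in GP registers the SET is wrapped in
   a PARALLEL with a flags clobber above, so the GP-register alternative
   of the logic patterns can match.  */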
/* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
   operations.  This is used for both scalar and vector conditional moves.  */

void
ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
{
  machine_mode mode = GET_MODE (dest);
  machine_mode cmpmode = GET_MODE (cmp);
  rtx x;

  /* Simplify trivial VEC_COND_EXPR to avoid ICE in pr97506.  */
  if (rtx_equal_p (op_true, op_false))
    {
      emit_move_insn (dest, op_true);
      return;
    }

  /* If we have an integer mask and FP value then we need
     to cast mask to FP mode.  */
  if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
    {
      cmp = force_reg (cmpmode, cmp);
      cmp = gen_rtx_SUBREG (mode, cmp, 0);
    }

  /* In AVX512F the result of comparison is an integer mask.  */
  if (mode != cmpmode
      && GET_MODE_CLASS (cmpmode) == MODE_INT)
    {
      gcc_assert (ix86_valid_mask_cmp_mode (mode));
      /* Using scalar/vector move with mask register.  */
      cmp = force_reg (cmpmode, cmp);
      /* Optimize for mask zero.  */
      op_true = (op_true != CONST0_RTX (mode)
		 ? force_reg (mode, op_true) : op_true);
      op_false = (op_false != CONST0_RTX (mode)
		  ? force_reg (mode, op_false) : op_false);
      if (op_true == CONST0_RTX (mode))
	{
	  if (cmpmode == E_DImode && !TARGET_64BIT)
	    {
	      x = gen_reg_rtx (cmpmode);
	      emit_insn (gen_knotdi (x, cmp));
	    }
	  else
	    x = expand_simple_unop (cmpmode, NOT, cmp, NULL, 1);
	  cmp = x;
	  /* Reverse op_true op_false.  */
	  std::swap (op_true, op_false);
	}

      if (mode == HFmode)
	emit_insn (gen_movhf_mask (dest, op_true, op_false, cmp));
      else
	emit_insn (gen_rtx_SET (dest,
				gen_rtx_VEC_MERGE (mode,
						   op_true, op_false, cmp)));
      return;
    }

  if (vector_all_ones_operand (op_true, mode)
      && op_false == CONST0_RTX (mode))
    {
      emit_move_insn (dest, cmp);
      return;
    }
  else if (op_false == CONST0_RTX (mode))
    {
      x = expand_simple_binop (mode, AND, cmp, op_true,
			       dest, 1, OPTAB_DIRECT);
      if (x != dest)
	emit_move_insn (dest, x);
      return;
    }
  else if (op_true == CONST0_RTX (mode))
    {
      op_false = force_reg (mode, op_false);
      x = gen_rtx_NOT (mode, cmp);
      ix86_emit_vec_binop (AND, mode, dest, x, op_false);
      return;
    }
  else if (vector_all_ones_operand (op_true, mode))
    {
      x = expand_simple_binop (mode, IOR, cmp, op_false,
			       dest, 1, OPTAB_DIRECT);
      if (x != dest)
	emit_move_insn (dest, x);
      return;
    }

  if (TARGET_XOP)
    {
      op_true = force_reg (mode, op_true);

      if (GET_MODE_SIZE (mode) < 16
	  || !nonimmediate_operand (op_false, mode))
	op_false = force_reg (mode, op_false);

      emit_insn (gen_rtx_SET (dest,
			      gen_rtx_IF_THEN_ELSE (mode, cmp,
						    op_true, op_false)));
      return;
    }

  rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
  machine_mode blend_mode = mode;

  if (GET_MODE_SIZE (mode) < 16
      || !vector_operand (op_true, mode))
    op_true = force_reg (mode, op_true);

  op_false = force_reg (mode, op_false);

  switch (mode)
    {
    case E_V2SFmode:
      if (TARGET_SSE4_1)
	gen = gen_mmx_blendvps;
      break;
    case E_V4SFmode:
      if (TARGET_SSE4_1)
	gen = gen_sse4_1_blendvps;
      break;
    case E_V2DFmode:
      if (TARGET_SSE4_1)
	gen = gen_sse4_1_blendvpd;
      break;
    case E_SFmode:
      if (TARGET_SSE4_1)
	gen = gen_sse4_1_blendvss;
      break;
    case E_DFmode:
      if (TARGET_SSE4_1)
	gen = gen_sse4_1_blendvsd;
      break;
    case E_V8QImode:
    case E_V4HImode:
    case E_V2SImode:
      if (TARGET_SSE4_1)
	{
	  gen = gen_mmx_pblendvb_v8qi;
	  blend_mode = V8QImode;
	}
      break;
    case E_V4QImode:
    case E_V2HImode:
      if (TARGET_SSE4_1)
	{
	  gen = gen_mmx_pblendvb_v4qi;
	  blend_mode = V4QImode;
	}
      break;
    case E_V2QImode:
      if (TARGET_SSE4_1)
	gen = gen_mmx_pblendvb_v2qi;
      break;
    case E_V16QImode:
    case E_V8HImode:
    case E_V8HFmode:
    case E_V4SImode:
    case E_V2DImode:
      if (TARGET_SSE4_1)
	{
	  gen = gen_sse4_1_pblendvb;
	  blend_mode = V16QImode;
	}
      break;
    case E_V8SFmode:
      if (TARGET_AVX)
	gen = gen_avx_blendvps256;
      break;
    case E_V4DFmode:
      if (TARGET_AVX)
	gen = gen_avx_blendvpd256;
      break;
    case E_V32QImode:
    case E_V16HImode:
    case E_V16HFmode:
    case E_V8SImode:
    case E_V4DImode:
      if (TARGET_AVX2)
	{
	  gen = gen_avx2_pblendvb;
	  blend_mode = V32QImode;
	}
      break;

    case E_V64QImode:
      gen = gen_avx512bw_blendmv64qi;
      break;
    case E_V32HImode:
      gen = gen_avx512bw_blendmv32hi;
      break;
    case E_V32HFmode:
      gen = gen_avx512bw_blendmv32hf;
      break;
    case E_V16SImode:
      gen = gen_avx512f_blendmv16si;
      break;
    case E_V8DImode:
      gen = gen_avx512f_blendmv8di;
      break;
    case E_V8DFmode:
      gen = gen_avx512f_blendmv8df;
      break;
    case E_V16SFmode:
      gen = gen_avx512f_blendmv16sf;
      break;

    default:
      break;
    }

  if (gen != NULL)
    {
      if (blend_mode == mode)
	x = dest;
      else
	{
	  x = gen_reg_rtx (blend_mode);
	  op_false = gen_lowpart (blend_mode, op_false);
	  op_true = gen_lowpart (blend_mode, op_true);
	  cmp = gen_lowpart (blend_mode, cmp);
	}

      emit_insn (gen (x, op_false, op_true, cmp));

      if (x != dest)
	emit_move_insn (dest, gen_lowpart (mode, x));
    }
  else
    {
      rtx t2, t3;

      t2 = expand_simple_binop (mode, AND, op_true, cmp,
				NULL, 1, OPTAB_DIRECT);

      t3 = gen_reg_rtx (mode);
      x = gen_rtx_NOT (mode, cmp);
      ix86_emit_vec_binop (AND, mode, t3, x, op_false);

      x = expand_simple_binop (mode, IOR, t3, t2,
			       dest, 1, OPTAB_DIRECT);
      if (x != dest)
	emit_move_insn (dest, x);
    }
}
/* Swap, force into registers, or otherwise massage the two operands
   to an sse comparison with a mask result.  Thus we differ a bit from
   ix86_prepare_fp_compare_args which expects to produce a flags result.

   The DEST operand exists to help determine whether to commute commutative
   operators.  The POP0/POP1 operands are updated in place.  The new
   comparison code is returned, or UNKNOWN if not implementable.  */

static enum rtx_code
ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
				  rtx *pop0, rtx *pop1)
{
  switch (code)
    {
    case LTGT:
    case UNEQ:
      /* AVX supports all the needed comparisons.  */
      if (TARGET_AVX)
	break;
      /* We have no LTGT as an operator.  We could implement it with
	 NE & ORDERED, but this requires an extra temporary.  It's
	 not clear that it's worth it.  */
      return UNKNOWN;

    case LT:
    case LE:
    case UNGT:
    case UNGE:
      /* These are supported directly.  */
      break;

    case EQ:
    case NE:
    case UNORDERED:
    case ORDERED:
      /* AVX has 3 operand comparisons, no need to swap anything.  */
      if (TARGET_AVX)
	break;
      /* For commutative operators, try to canonicalize the destination
	 operand to be first in the comparison - this helps reload to
	 avoid extra moves.  */
      if (!dest || !rtx_equal_p (dest, *pop1))
	break;
      /* FALLTHRU */

    case GE:
    case GT:
    case UNLE:
    case UNLT:
      /* These are not supported directly before AVX, and furthermore
	 ix86_expand_sse_fp_minmax only optimizes LT/UNGE.  Swap the
	 comparison operands to transform into something that is
	 supported.  */
      std::swap (*pop0, *pop1);
      code = swap_condition (code);
      break;

    default:
      gcc_unreachable ();
    }

  return code;
}
/* Expand a floating-point conditional move.  Return true if successful.  */

bool
ix86_expand_fp_movcc (rtx operands[])
{
  machine_mode mode = GET_MODE (operands[0]);
  enum rtx_code code = GET_CODE (operands[1]);
  rtx tmp, compare_op;
  rtx op0 = XEXP (operands[1], 0);
  rtx op1 = XEXP (operands[1], 1);

  if (SSE_FLOAT_MODE_SSEMATH_OR_HF_P (mode))
    {
      machine_mode cmode;

      /* Since we've no cmove for sse registers, don't force bad register
	 allocation just to gain access to it.  Deny movcc when the
	 comparison mode doesn't match the move mode.  */
      cmode = GET_MODE (op0);
      if (cmode == VOIDmode)
	cmode = GET_MODE (op1);
      if (cmode != mode)
	return false;

      code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
      if (code == UNKNOWN)
	return false;

      if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
				     operands[2], operands[3]))
	return true;

      tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
				 operands[2], operands[3]);
      ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
      return true;
    }

  if (GET_MODE (op0) == TImode
      || (GET_MODE (op0) == DImode
	  && !TARGET_64BIT))
    return false;

  /* The floating point conditional move instructions don't directly
     support conditions resulting from a signed integer comparison.  */

  compare_op = ix86_expand_compare (code, op0, op1);
  if (!fcmov_comparison_operator (compare_op, VOIDmode))
    {
      tmp = gen_reg_rtx (QImode);
      ix86_expand_setcc (tmp, code, op0, op1);

      compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
    }

  emit_insn (gen_rtx_SET (operands[0],
			  gen_rtx_IF_THEN_ELSE (mode, compare_op,
						operands[2], operands[3])));

  return true;
}
/* Helper for ix86_cmp_code_to_pcmp_immediate for int modes.  */

static int
ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
{
  switch (code)
    {
    case EQ:
      return 0x00;
    case LT:
    case LTU:
      return 0x01;
    case LE:
    case LEU:
      return 0x02;
    case NE:
      return 0x04;
    case GE:
    case GEU:
      return 0x05;
    case GT:
    case GTU:
      return 0x06;
    default:
      gcc_unreachable ();
    }
}

/* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes.  */

static int
ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)

/* Return immediate value to be used in UNSPEC_PCMP
   for comparison CODE in MODE.  */

static int
ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
{
  if (FLOAT_MODE_P (mode))
    return ix86_fp_cmp_code_to_pcmp_immediate (code);
  return ix86_int_cmp_code_to_pcmp_immediate (code);
}
/* Expand AVX-512 vector comparison.  */

bool
ix86_expand_mask_vec_cmp (rtx dest, enum rtx_code code, rtx cmp_op0,
			  rtx cmp_op1)
{
  machine_mode mask_mode = GET_MODE (dest);
  machine_mode cmp_mode = GET_MODE (cmp_op0);
  rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
  int unspec_code;
  rtx unspec;

  switch (code)
    {
    case LEU:
    case GTU:
    case GEU:
    case LTU:
      unspec_code = UNSPEC_UNSIGNED_PCMP;
      break;

    default:
      unspec_code = UNSPEC_PCMP;
    }

  unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, cmp_op0, cmp_op1, imm),
			   unspec_code);
  emit_insn (gen_rtx_SET (dest, unspec));

  return true;
}
/* Expand fp vector comparison.  */

bool
ix86_expand_fp_vec_cmp (rtx operands[])
{
  enum rtx_code code = GET_CODE (operands[1]);
  rtx cmp;

  code = ix86_prepare_sse_fp_compare_args (operands[0], code,
					   &operands[2], &operands[3]);
  if (code == UNKNOWN)
    {
      rtx temp;
      switch (GET_CODE (operands[1]))
	{
	case LTGT:
	  temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
				      operands[3], NULL, NULL);
	  cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
				     operands[3], NULL, NULL);
	  code = AND;
	  break;
	case UNEQ:
	  temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
				      operands[3], NULL, NULL);
	  cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
				     operands[3], NULL, NULL);
	  code = IOR;
	  break;
	default:
	  gcc_unreachable ();
	}
      cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
				 OPTAB_DIRECT);
    }
  else
    cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
			       NULL, NULL);

  if (operands[0] != cmp)
    emit_move_insn (operands[0], cmp);

  return true;
}
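/* Expand an integer vector comparison of COP0 and COP1 with CODE into
   a form the hardware supports, using min/max or saturating-subtraction
   tricks where needed; *NEGATE is set when the caller must invert the
   result.  Used by both the vec_cmp and vcond expanders.  */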
static rtx
ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
			 rtx op_true, rtx op_false, bool *negate)
{
  machine_mode data_mode = GET_MODE (dest);
  machine_mode mode = GET_MODE (cop0);
  rtx x;

  *negate = false;

  /* XOP supports all of the comparisons on all 128-bit vector int types.  */
  if (TARGET_XOP
      && GET_MODE_CLASS (mode) == MODE_VECTOR_INT
      && GET_MODE_SIZE (mode) <= 16)
    ;
  /* AVX512F supports all of the comparisons
     on all 128/256/512-bit vector int types.  */
  else if (ix86_use_mask_cmp_p (data_mode, mode, op_true, op_false))
    ;
  else
    {
      /* Canonicalize the comparison to EQ, GT, GTU.  */
      switch (code)
	{
	case EQ:
	case GT:
	case GTU:
	  break;

	case NE:
	case LE:
	case LEU:
	  code = reverse_condition (code);
	  *negate = true;
	  break;

	case GE:
	case GEU:
	  code = reverse_condition (swap_condition (code));
	  *negate = true;
	  /* FALLTHRU */

	case LT:
	case LTU:
	  std::swap (cop0, cop1);
	  code = swap_condition (code);
	  break;

	default:
	  gcc_unreachable ();
	}

      /* Only SSE4.1/SSE4.2 supports V2DImode.  */
      if (mode == V2DImode)
	{
	  switch (code)
	    {
	    case EQ:
	      /* SSE4.1 supports EQ.  */
	      if (!TARGET_SSE4_1)
		return NULL;
	      break;

	    case GT:
	    case GTU:
	      /* SSE4.2 supports GT/GTU.  */
	      if (!TARGET_SSE4_2)
		return NULL;
	      break;

	    default:
	      gcc_unreachable ();
	    }
	}

      rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode);
      rtx opfalse = op_false ? op_false : CONST0_RTX (data_mode);
      if (*negate)
	std::swap (optrue, opfalse);

      /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when
	 not using integer masks into min (x, y) == x ? -1 : 0 (i.e.
	 min (x, y) == x).  While we add one instruction (the minimum),
	 we remove the need for two instructions in the negation, as the
	 result is done this way.
	 When using masks, do it for SI/DImode element types, as it is shorter
	 than the two subtractions.  */
      if ((code == GT
	   && GET_MODE_SIZE (mode) != 64
	   && vector_all_ones_operand (opfalse, data_mode)
	   && optrue == CONST0_RTX (data_mode))
	  || (code == GTU
	      && GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4
	      /* Don't do it if not using integer masks and we'd end up with
		 the right values in the registers though.  */
	      && (GET_MODE_SIZE (mode) == 64
		  || !vector_all_ones_operand (optrue, data_mode)
		  || opfalse != CONST0_RTX (data_mode))))
	{
	  rtx (*gen) (rtx, rtx, rtx) = NULL;

	  switch (mode)
	    {
	    case E_V16SImode:
	      gen = (code == GTU) ? gen_uminv16si3 : gen_sminv16si3;
	      break;
	    case E_V8DImode:
	      gen = (code == GTU) ? gen_uminv8di3 : gen_sminv8di3;
	      cop0 = force_reg (mode, cop0);
	      cop1 = force_reg (mode, cop1);
	      break;
	    case E_V32QImode:
	      if (TARGET_AVX2)
		gen = (code == GTU) ? gen_uminv32qi3 : gen_sminv32qi3;
	      break;
	    case E_V16HImode:
	      if (TARGET_AVX2)
		gen = (code == GTU) ? gen_uminv16hi3 : gen_sminv16hi3;
	      break;
	    case E_V8SImode:
	      if (TARGET_AVX2)
		gen = (code == GTU) ? gen_uminv8si3 : gen_sminv8si3;
	      break;
	    case E_V4DImode:
	      if (TARGET_AVX512VL)
		{
		  gen = (code == GTU) ? gen_uminv4di3 : gen_sminv4di3;
		  cop0 = force_reg (mode, cop0);
		  cop1 = force_reg (mode, cop1);
		}
	      break;
	    case E_V16QImode:
	      if (code == GTU && TARGET_SSE2)
		gen = gen_uminv16qi3;
	      else if (code == GT && TARGET_SSE4_1)
		gen = gen_sminv16qi3;
	      break;
	    case E_V8QImode:
	      if (code == GTU && TARGET_SSE2)
		gen = gen_uminv8qi3;
	      else if (code == GT && TARGET_SSE4_1)
		gen = gen_sminv8qi3;
	      break;
	    case E_V4QImode:
	      if (code == GTU && TARGET_SSE2)
		gen = gen_uminv4qi3;
	      else if (code == GT && TARGET_SSE4_1)
		gen = gen_sminv4qi3;
	      break;
	    case E_V2QImode:
	      if (code == GTU && TARGET_SSE2)
		gen = gen_uminv2qi3;
	      else if (code == GT && TARGET_SSE4_1)
		gen = gen_sminv2qi3;
	      break;
	    case E_V8HImode:
	      if (code == GTU && TARGET_SSE4_1)
		gen = gen_uminv8hi3;
	      else if (code == GT && TARGET_SSE2)
		gen = gen_sminv8hi3;
	      break;
	    case E_V4HImode:
	      if (code == GTU && TARGET_SSE4_1)
		gen = gen_uminv4hi3;
	      else if (code == GT && TARGET_SSE2)
		gen = gen_sminv4hi3;
	      break;
	    case E_V2HImode:
	      if (code == GTU && TARGET_SSE4_1)
		gen = gen_uminv2hi3;
	      else if (code == GT && TARGET_SSE2)
		gen = gen_sminv2hi3;
	      break;
	    case E_V4SImode:
	      if (TARGET_SSE4_1)
		gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3;
	      break;
	    case E_V2SImode:
	      if (TARGET_SSE4_1)
		gen = (code == GTU) ? gen_uminv2si3 : gen_sminv2si3;
	      break;
	    case E_V2DImode:
	      if (TARGET_AVX512VL)
		{
		  gen = (code == GTU) ? gen_uminv2di3 : gen_sminv2di3;
		  cop0 = force_reg (mode, cop0);
		  cop1 = force_reg (mode, cop1);
		}
	      break;
	    default:
	      break;
	    }

	  if (gen)
	    {
	      rtx tem = gen_reg_rtx (mode);
	      if (!vector_operand (cop0, mode))
		cop0 = force_reg (mode, cop0);
	      if (!vector_operand (cop1, mode))
		cop1 = force_reg (mode, cop1);
	      *negate = !*negate;
	      emit_insn (gen (tem, cop0, cop1));
	      cop1 = tem;
	      code = EQ;
	    }
	}

      /* Unsigned parallel compare is not supported by the hardware.
	 Play some tricks to turn this into a signed comparison
	 against 0.  */
      if (code == GTU)
	{
	  cop0 = force_reg (mode, cop0);

	  switch (mode)
	    {
	    case E_V16SImode:
	    case E_V8DImode:
	    case E_V8SImode:
	    case E_V4DImode:
	    case E_V4SImode:
	    case E_V2SImode:
	    case E_V2DImode:
	      {
		rtx t1, t2, mask;

		/* Subtract (-(INT MAX) - 1) from both operands to make
		   them signed.  */
		mask = ix86_build_signbit_mask (mode, true, false);
		t1 = gen_reg_rtx (mode);
		emit_insn (gen_sub3_insn (t1, cop0, mask));

		t2 = gen_reg_rtx (mode);
		emit_insn (gen_sub3_insn (t2, cop1, mask));

		cop0 = t1;
		cop1 = t2;
		code = GT;
	      }
	      break;

	    case E_V64QImode:
	    case E_V32HImode:
	    case E_V32QImode:
	    case E_V16HImode:
	    case E_V16QImode:
	    case E_V8QImode:
	    case E_V4QImode:
	    case E_V2QImode:
	    case E_V8HImode:
	    case E_V4HImode:
	    case E_V2HImode:
	      {
		/* Perform a parallel unsigned saturating subtraction.  */
		x = gen_reg_rtx (mode);
		emit_insn (gen_rtx_SET
			   (x, gen_rtx_US_MINUS (mode, cop0, cop1)));
		cop0 = x;
		cop1 = CONST0_RTX (mode);
		code = EQ;
		*negate = !*negate;
	      }
	      break;

	    default:
	      gcc_unreachable ();
	    }
	}
    }

  if (*negate)
    std::swap (op_true, op_false);

  /* Allow the comparison to be done in one mode, but the movcc to
     happen in another mode.  */
  if (data_mode == mode)
    x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
			     op_true, op_false);
  else
    {
      gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
      x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
			       op_true, op_false);
      if (GET_MODE (x) == mode)
	x = gen_lowpart (data_mode, x);
    }

  return x;
}
/* Expand integer vector comparison.  */

bool
ix86_expand_int_vec_cmp (rtx operands[])
{
  rtx_code code = GET_CODE (operands[1]);
  bool negate = false;
  rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
				     operands[3], NULL, NULL, &negate);

  if (!cmp)
    return false;

  if (negate)
    cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
				   CONST0_RTX (GET_MODE (cmp)),
				   NULL, NULL, &negate);

  gcc_assert (!negate);

  if (operands[0] != cmp)
    emit_move_insn (operands[0], cmp);

  return true;
}
/* Expand a floating-point vector conditional move; a vcond operation
   rather than a movcc operation.  */

bool
ix86_expand_fp_vcond (rtx operands[])
{
  enum rtx_code code = GET_CODE (operands[3]);
  rtx cmp;

  code = ix86_prepare_sse_fp_compare_args (operands[0], code,
					   &operands[4], &operands[5]);
  if (code == UNKNOWN)
    {
      rtx temp;
      switch (GET_CODE (operands[3]))
	{
	case LTGT:
	  temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
				      operands[5], operands[0], operands[0]);
	  cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
				     operands[5], operands[1], operands[2]);
	  code = AND;
	  break;
	case UNEQ:
	  temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
				      operands[5], operands[0], operands[0]);
	  cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
				     operands[5], operands[1], operands[2]);
	  code = IOR;
	  break;
	default:
	  gcc_unreachable ();
	}
      cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
				 OPTAB_DIRECT);
      ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
      return true;
    }

  if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
				 operands[5], operands[1], operands[2]))
    return true;

  cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
			     operands[1], operands[2]);
  ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
  return true;
}
/* Expand a signed/unsigned integral vector conditional move.  */

bool
ix86_expand_int_vcond (rtx operands[])
{
  machine_mode data_mode = GET_MODE (operands[0]);
  machine_mode mode = GET_MODE (operands[4]);
  enum rtx_code code = GET_CODE (operands[3]);
  bool negate = false;
  rtx x, cop0, cop1;

  cop0 = operands[4];
  cop1 = operands[5];

  /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
     and x < 0 ? 1 : 0 into (unsigned) x >> 31.  */
  if ((code == LT || code == GE)
      && data_mode == mode
      && cop1 == CONST0_RTX (mode)
      && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
      && GET_MODE_UNIT_SIZE (data_mode) > 1
      && GET_MODE_UNIT_SIZE (data_mode) <= 8
      && (GET_MODE_SIZE (data_mode) == 16
	  || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
    {
      rtx negop = operands[2 - (code == LT)];
      int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
      if (negop == CONST1_RTX (data_mode))
	{
	  rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
					 operands[0], 1, OPTAB_DIRECT);
	  if (res != operands[0])
	    emit_move_insn (operands[0], res);
	  return true;
	}
      else if (GET_MODE_INNER (data_mode) != DImode
	       && vector_all_ones_operand (negop, data_mode))
	{
	  rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
					 operands[0], 0, OPTAB_DIRECT);
	  if (res != operands[0])
	    emit_move_insn (operands[0], res);
	  return true;
	}
    }

  if (!nonimmediate_operand (cop1, mode))
    cop1 = force_reg (mode, cop1);
  if (!general_operand (operands[1], data_mode))
    operands[1] = force_reg (data_mode, operands[1]);
  if (!general_operand (operands[2], data_mode))
    operands[2] = force_reg (data_mode, operands[2]);

  x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
			       operands[1], operands[2], &negate);

  if (!x)
    return false;

  ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
			 operands[2-negate]);
  return true;
}
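/* Try to expand a variable two-operand permutation with a single
   AVX-512 vpermt2 instruction; return true on success.  */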
static bool
ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
			      struct expand_vec_perm_d *d)
{
  /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
     expander, so args are either in d, or in op0, op1 etc.  */
  machine_mode mode = GET_MODE (d ? d->op0 : op0);
  machine_mode maskmode = mode;
  rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;

  switch (mode)
    {
    case E_V16QImode:
      if (TARGET_AVX512VL && TARGET_AVX512VBMI)
	gen = gen_avx512vl_vpermt2varv16qi3;
      break;
    case E_V32QImode:
      if (TARGET_AVX512VL && TARGET_AVX512VBMI)
	gen = gen_avx512vl_vpermt2varv32qi3;
      break;
    case E_V64QImode:
      if (TARGET_AVX512VBMI)
	gen = gen_avx512bw_vpermt2varv64qi3;
      break;
    case E_V8HImode:
      if (TARGET_AVX512VL && TARGET_AVX512BW)
	gen = gen_avx512vl_vpermt2varv8hi3;
      break;
    case E_V16HImode:
      if (TARGET_AVX512VL && TARGET_AVX512BW)
	gen = gen_avx512vl_vpermt2varv16hi3;
      break;
    case E_V32HImode:
      if (TARGET_AVX512BW)
	gen = gen_avx512bw_vpermt2varv32hi3;
      break;
    case E_V4SImode:
      if (TARGET_AVX512VL)
	gen = gen_avx512vl_vpermt2varv4si3;
      break;
    case E_V8SImode:
      if (TARGET_AVX512VL)
	gen = gen_avx512vl_vpermt2varv8si3;
      break;
    case E_V16SImode:
      gen = gen_avx512f_vpermt2varv16si3;
      break;
    case E_V4SFmode:
      if (TARGET_AVX512VL)
	{
	  gen = gen_avx512vl_vpermt2varv4sf3;
	  maskmode = V4SImode;
	}
      break;
    case E_V8SFmode:
      if (TARGET_AVX512VL)
	{
	  gen = gen_avx512vl_vpermt2varv8sf3;
	  maskmode = V8SImode;
	}
      break;
    case E_V16SFmode:
      gen = gen_avx512f_vpermt2varv16sf3;
      maskmode = V16SImode;
      break;
    case E_V2DImode:
      if (TARGET_AVX512VL)
	gen = gen_avx512vl_vpermt2varv2di3;
      break;
    case E_V4DImode:
      if (TARGET_AVX512VL)
	gen = gen_avx512vl_vpermt2varv4di3;
      break;
    case E_V8DImode:
      gen = gen_avx512f_vpermt2varv8di3;
      break;
    case E_V2DFmode:
      if (TARGET_AVX512VL)
	{
	  gen = gen_avx512vl_vpermt2varv2df3;
	  maskmode = V2DImode;
	}
      break;
    case E_V4DFmode:
      if (TARGET_AVX512VL)
	{
	  gen = gen_avx512vl_vpermt2varv4df3;
	  maskmode = V4DImode;
	}
      break;
    case E_V8DFmode:
      gen = gen_avx512f_vpermt2varv8df3;
      maskmode = V8DImode;
      break;
    default:
      break;
    }

  if (gen == NULL)
    return false;

  /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
     expander, so args are either in d, or in op0, op1 etc.  */
  if (d)
    {
      rtx vec[64];
      target = d->target;
      op0 = d->op0;
      op1 = d->op1;
      for (int i = 0; i < d->nelt; ++i)
	vec[i] = GEN_INT (d->perm[i]);
      mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
    }

  emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
  return true;
}
4944 /* Expand a variable vector permutation. */
4947 ix86_expand_vec_perm (rtx operands
[])
4949 rtx target
= operands
[0];
4950 rtx op0
= operands
[1];
4951 rtx op1
= operands
[2];
4952 rtx mask
= operands
[3];
4953 rtx t1
, t2
, t3
, t4
, t5
, t6
, t7
, t8
, vt
, vt2
, vec
[32];
4954 machine_mode mode
= GET_MODE (op0
);
4955 machine_mode maskmode
= GET_MODE (mask
);
4957 bool one_operand_shuffle
= rtx_equal_p (op0
, op1
);
4959 /* Number of elements in the vector. */
4960 w
= GET_MODE_NUNITS (mode
);
4961 e
= GET_MODE_UNIT_SIZE (mode
);
4962 gcc_assert (w
<= 64);
4964 /* For HF mode vector, convert it to HI using subreg. */
4965 if (GET_MODE_INNER (mode
) == HFmode
)
4967 machine_mode orig_mode
= mode
;
4968 mode
= mode_for_vector (HImode
, w
).require ();
4969 target
= lowpart_subreg (mode
, target
, orig_mode
);
4970 op0
= lowpart_subreg (mode
, op0
, orig_mode
);
4971 op1
= lowpart_subreg (mode
, op1
, orig_mode
);
4974 if (TARGET_AVX512F
&& one_operand_shuffle
)
4976 rtx (*gen
) (rtx
, rtx
, rtx
) = NULL
;
4980 gen
=gen_avx512f_permvarv16si
;
4983 gen
= gen_avx512f_permvarv16sf
;
4986 gen
= gen_avx512f_permvarv8di
;
4989 gen
= gen_avx512f_permvarv8df
;
4996 emit_insn (gen (target
, op0
, mask
));
5001 if (ix86_expand_vec_perm_vpermt2 (target
, mask
, op0
, op1
, NULL
))
5006 if (mode
== V4DImode
|| mode
== V4DFmode
|| mode
== V16HImode
)
5008 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
5009 an constant shuffle operand. With a tiny bit of effort we can
5010 use VPERMD instead. A re-interpretation stall for V4DFmode is
5011 unfortunate but there's no avoiding it.
5012 Similarly for V16HImode we don't have instructions for variable
5013 shuffling, while for V32QImode we can use after preparing suitable
5014 masks vpshufb; vpshufb; vpermq; vpor. */
5016 if (mode
== V16HImode
)
5018 maskmode
= mode
= V32QImode
;
5024 maskmode
= mode
= V8SImode
;
5028 t1
= gen_reg_rtx (maskmode
);
5030 /* Replicate the low bits of the V4DImode mask into V8SImode:
5032 t1 = { A A B B C C D D }. */
5033 for (i
= 0; i
< w
/ 2; ++i
)
5034 vec
[i
*2 + 1] = vec
[i
*2] = GEN_INT (i
* 2);
5035 vt
= gen_rtx_CONST_VECTOR (maskmode
, gen_rtvec_v (w
, vec
));
5036 vt
= force_reg (maskmode
, vt
);
5037 mask
= gen_lowpart (maskmode
, mask
);
5038 if (maskmode
== V8SImode
)
5039 emit_insn (gen_avx2_permvarv8si (t1
, mask
, vt
));
5041 emit_insn (gen_avx2_pshufbv32qi3 (t1
, mask
, vt
));
5043 /* Multiply the shuffle indicies by two. */
5044 t1
= expand_simple_binop (maskmode
, PLUS
, t1
, t1
, t1
, 1,
5047 /* Add one to the odd shuffle indicies:
5048 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
5049 for (i
= 0; i
< w
/ 2; ++i
)
5051 vec
[i
* 2] = const0_rtx
;
5052 vec
[i
* 2 + 1] = const1_rtx
;
5054 vt
= gen_rtx_CONST_VECTOR (maskmode
, gen_rtvec_v (w
, vec
));
5055 vt
= validize_mem (force_const_mem (maskmode
, vt
));
5056 t1
= expand_simple_binop (maskmode
, PLUS
, t1
, vt
, t1
, 1,
5059 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
5060 operands
[3] = mask
= t1
;
5061 target
= gen_reg_rtx (mode
);
5062 op0
= gen_lowpart (mode
, op0
);
5063 op1
= gen_lowpart (mode
, op1
);
      switch (mode)
	{
	case E_V8SImode:
	  /* The VPERMD and VPERMPS instructions already properly ignore
	     the high bits of the shuffle elements.  No need for us to
	     perform an AND ourselves.  */
	  if (one_operand_shuffle)
	    {
	      emit_insn (gen_avx2_permvarv8si (target, op0, mask));
	      if (target != operands[0])
		emit_move_insn (operands[0],
				gen_lowpart (GET_MODE (operands[0]), target));
	    }
	  else
	    {
	      t1 = gen_reg_rtx (V8SImode);
	      t2 = gen_reg_rtx (V8SImode);
	      emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
	      emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
	      goto merge_two;
	    }
	  return;

	case E_V8SFmode:
	  mask = gen_lowpart (V8SImode, mask);
	  if (one_operand_shuffle)
	    emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
	  else
	    {
	      t1 = gen_reg_rtx (V8SFmode);
	      t2 = gen_reg_rtx (V8SFmode);
	      emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
	      emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
	      goto merge_two;
	    }
	  return;

	case E_V4SImode:
	  /* By combining the two 128-bit input vectors into one 256-bit
	     input vector, we can use VPERMD and VPERMPS for the full
	     two-operand shuffle.  */
	  t1 = gen_reg_rtx (V8SImode);
	  t2 = gen_reg_rtx (V8SImode);
	  emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
	  emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
	  emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
	  emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
	  return;

	case E_V4SFmode:
	  t1 = gen_reg_rtx (V8SFmode);
	  t2 = gen_reg_rtx (V8SImode);
	  mask = gen_lowpart (V4SImode, mask);
	  emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
	  emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
	  emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
	  emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
	  return;

	case E_V32QImode:
	  t1 = gen_reg_rtx (V32QImode);
	  t2 = gen_reg_rtx (V32QImode);
	  t3 = gen_reg_rtx (V32QImode);
	  vt2 = GEN_INT (-128);
	  vt = gen_const_vec_duplicate (V32QImode, vt2);
	  vt = force_reg (V32QImode, vt);
	  for (i = 0; i < 32; i++)
	    vec[i] = i < 16 ? vt2 : const0_rtx;
	  vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
	  vt2 = force_reg (V32QImode, vt2);
	  /* From mask create two adjusted masks, which contain the same
	     bits as mask in the low 7 bits of each vector element.
	     The first mask will have the most significant bit clear
	     if it requests an element from the same 128-bit lane
	     and the MSB set if it requests an element from the other
	     128-bit lane.
	     The second mask will have the opposite values of the MSB,
	     and additionally will have its 128-bit lanes swapped.
	     E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
	     t1   { 07 92 9e 09 ... | 17 19 85 1f ... } and
	     t3   { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
	     stands for the other 12 bytes.  */
	  /* The bit that says whether an element is from the same lane
	     or the other lane is bit 4, so shift it up by 3 to the MSB
	     position.  */
	  t5 = gen_reg_rtx (V4DImode);
	  emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
				    GEN_INT (3)));
	  /* Clear MSB bits from the mask just in case it had them set.  */
	  emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
	  /* After this t1 will have MSB set for elements from other lane.  */
	  emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
	  /* Clear bits other than MSB.  */
	  emit_insn (gen_andv32qi3 (t1, t1, vt));
	  /* Or in the lower bits from mask into t3.  */
	  emit_insn (gen_iorv32qi3 (t3, t1, t2));
	  /* And invert MSB bits in t1, so MSB is set for elements from the
	     same lane.  */
	  emit_insn (gen_xorv32qi3 (t1, t1, vt));
	  /* Swap 128-bit lanes in t3.  */
	  t6 = gen_reg_rtx (V4DImode);
	  emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
					  const2_rtx, GEN_INT (3),
					  const0_rtx, const1_rtx));
	  /* And or in the lower bits from mask into t1.  */
	  emit_insn (gen_iorv32qi3 (t1, t1, t2));
	  if (one_operand_shuffle)
	    {
	      /* Each of these shuffles will put 0s in places where
		 an element from the other 128-bit lane is needed;
		 otherwise it shuffles in the requested value.  */
	      emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
						gen_lowpart (V32QImode, t6)));
	      emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
	      /* For t3 the 128-bit lanes are swapped again.  */
	      t7 = gen_reg_rtx (V4DImode);
	      emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
					      const2_rtx, GEN_INT (3),
					      const0_rtx, const1_rtx));
	      /* And oring both together leads to the result.  */
	      emit_insn (gen_iorv32qi3 (target, t1,
					gen_lowpart (V32QImode, t7)));
	      if (target != operands[0])
		emit_move_insn (operands[0],
				gen_lowpart (GET_MODE (operands[0]), target));
	      return;
	    }

	  t4 = gen_reg_rtx (V32QImode);
	  /* Similarly to the above one_operand_shuffle code,
	     just repeated twice for each operand.  The code after the
	     merge_two label will merge the two results together.  */
	  emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
					    gen_lowpart (V32QImode, t6)));
	  emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
					    gen_lowpart (V32QImode, t6)));
	  emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
	  emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
	  t7 = gen_reg_rtx (V4DImode);
	  emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
					  const2_rtx, GEN_INT (3),
					  const0_rtx, const1_rtx));
	  t8 = gen_reg_rtx (V4DImode);
	  emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
					  const2_rtx, GEN_INT (3),
					  const0_rtx, const1_rtx));
	  emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
	  emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
	  t1 = t4;
	  t2 = t3;
	  goto merge_two;

	default:
	  gcc_assert (GET_MODE_SIZE (mode) <= 16);
	  break;
	}
    }
  if (TARGET_XOP)
    {
      /* The XOP VPPERM insn supports three inputs.  By ignoring the
	 one_operand_shuffle special case, we avoid creating another
	 set of constant vectors in memory.  */
      one_operand_shuffle = false;

      /* mask = mask & {2*w-1, ...} */
      vt = GEN_INT (2*w - 1);
    }
  else
    {
      /* mask = mask & {w-1, ...} */
      vt = GEN_INT (w - 1);
    }

  vt = gen_const_vec_duplicate (maskmode, vt);
  mask = expand_simple_binop (maskmode, AND, mask, vt,
			      NULL_RTX, 0, OPTAB_DIRECT);

  /* For non-QImode operations, convert the word permutation control
     into a byte permutation control.  */
  if (mode != V16QImode)
    {
      mask = expand_simple_binop (maskmode, ASHIFT, mask,
				  GEN_INT (exact_log2 (e)),
				  NULL_RTX, 0, OPTAB_DIRECT);

      /* Convert mask to vector of chars.  */
      mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));

      /* Replicate each of the input bytes into byte positions:
	 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
	 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
	 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}.  */
      for (i = 0; i < 16; ++i)
	vec[i] = GEN_INT (i/e * e);
      vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
      vt = validize_mem (force_const_mem (V16QImode, vt));
      if (TARGET_XOP)
	emit_insn (gen_xop_pperm (mask, mask, mask, vt));
      else
	emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));

      /* Convert it into the byte positions by doing
	 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...}  */
      for (i = 0; i < 16; ++i)
	vec[i] = GEN_INT (i % e);
      vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
      vt = validize_mem (force_const_mem (V16QImode, vt));
      emit_insn (gen_addv16qi3 (mask, mask, vt));
    }

  /* The actual shuffle operations all operate on V16QImode.  */
  op0 = gen_lowpart (V16QImode, op0);
  op1 = gen_lowpart (V16QImode, op1);

  if (TARGET_XOP)
    {
      if (GET_MODE (target) != V16QImode)
	target = gen_reg_rtx (V16QImode);
      emit_insn (gen_xop_pperm (target, op0, op1, mask));
      if (target != operands[0])
	emit_move_insn (operands[0],
			gen_lowpart (GET_MODE (operands[0]), target));
    }
  else if (one_operand_shuffle)
    {
      if (GET_MODE (target) != V16QImode)
	target = gen_reg_rtx (V16QImode);
      emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
      if (target != operands[0])
	emit_move_insn (operands[0],
			gen_lowpart (GET_MODE (operands[0]), target));
    }
  else
    {
      rtx xops[6];
      bool ok;

      /* Shuffle the two input vectors independently.  */
      t1 = gen_reg_rtx (V16QImode);
      t2 = gen_reg_rtx (V16QImode);
      emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
      emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));

 merge_two:
      /* Then merge them together.  The key is whether any given control
	 element contained a bit set that indicates the second word.  */
      mask = operands[3];
      vt = GEN_INT (w);
      if (maskmode == V2DImode && !TARGET_SSE4_1)
	{
	  /* Without SSE4.1, we don't have V2DImode EQ.  Perform one
	     more shuffle to convert the V2DI input mask into a V4SI
	     input mask.  At that point the masking that expand_int_vcond
	     performs will work as desired.  */
	  rtx t3 = gen_reg_rtx (V4SImode);
	  emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
					const0_rtx, const0_rtx,
					const2_rtx, const2_rtx));
	  mask = t3;
	  maskmode = V4SImode;
	}

      vt = gen_const_vec_duplicate (maskmode, vt);
      vt = force_reg (maskmode, vt);
      mask = expand_simple_binop (maskmode, AND, mask, vt,
				  NULL_RTX, 0, OPTAB_DIRECT);

      if (GET_MODE (target) != mode)
	target = gen_reg_rtx (mode);
      xops[0] = target;
      xops[1] = gen_lowpart (mode, t2);
      xops[2] = gen_lowpart (mode, t1);
      xops[3] = gen_rtx_EQ (maskmode, mask, vt);
      xops[4] = mask;
      xops[5] = vt;
      ok = ix86_expand_int_vcond (xops);
      gcc_assert (ok);
      if (target != operands[0])
	emit_move_insn (operands[0],
			gen_lowpart (GET_MODE (operands[0]), target));
    }
}
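
/* As an illustration of the word-to-byte mask conversion above: for a
   V8HImode shuffle (e == 2), a word index k is first scaled to the byte
   index 2*k by the ASHIFT, replicated into both byte positions of its
   word by the pperm/pshufb with the {0,0, 2,2, ...} control, and then
   {0,1} is added per word, so a word control of { 3 ... } becomes the
   byte control { 6 7 ... } that the V16QImode pshufb expects.  */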
/* Unpack OP[1] into the next wider integer vector type.  UNSIGNED_P is
   true if we should do zero extension, else sign extension.  HIGH_P is
   true if we want the N/2 high elements, else the low elements.  */

void
ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
{
  machine_mode imode = GET_MODE (src);
  rtx tmp;

  if (TARGET_SSE4_1)
    {
      rtx (*unpack)(rtx, rtx);
      rtx (*extract)(rtx, rtx) = NULL;
      machine_mode halfmode = BLKmode;

      switch (imode)
	{
	case E_V64QImode:
	  if (unsigned_p)
	    unpack = gen_avx512bw_zero_extendv32qiv32hi2;
	  else
	    unpack = gen_avx512bw_sign_extendv32qiv32hi2;
	  halfmode = V32QImode;
	  extract
	    = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
	  break;
	case E_V32QImode:
	  if (unsigned_p)
	    unpack = gen_avx2_zero_extendv16qiv16hi2;
	  else
	    unpack = gen_avx2_sign_extendv16qiv16hi2;
	  halfmode = V16QImode;
	  extract
	    = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
	  break;
	case E_V32HImode:
	  if (unsigned_p)
	    unpack = gen_avx512f_zero_extendv16hiv16si2;
	  else
	    unpack = gen_avx512f_sign_extendv16hiv16si2;
	  halfmode = V16HImode;
	  extract
	    = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
	  break;
	case E_V16HImode:
	  if (unsigned_p)
	    unpack = gen_avx2_zero_extendv8hiv8si2;
	  else
	    unpack = gen_avx2_sign_extendv8hiv8si2;
	  halfmode = V8HImode;
	  extract
	    = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
	  break;
	case E_V16SImode:
	  if (unsigned_p)
	    unpack = gen_avx512f_zero_extendv8siv8di2;
	  else
	    unpack = gen_avx512f_sign_extendv8siv8di2;
	  halfmode = V8SImode;
	  extract
	    = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
	  break;
	case E_V8SImode:
	  if (unsigned_p)
	    unpack = gen_avx2_zero_extendv4siv4di2;
	  else
	    unpack = gen_avx2_sign_extendv4siv4di2;
	  halfmode = V4SImode;
	  extract
	    = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
	  break;
	case E_V16QImode:
	  if (unsigned_p)
	    unpack = gen_sse4_1_zero_extendv8qiv8hi2;
	  else
	    unpack = gen_sse4_1_sign_extendv8qiv8hi2;
	  break;
	case E_V8HImode:
	  if (unsigned_p)
	    unpack = gen_sse4_1_zero_extendv4hiv4si2;
	  else
	    unpack = gen_sse4_1_sign_extendv4hiv4si2;
	  break;
	case E_V4SImode:
	  if (unsigned_p)
	    unpack = gen_sse4_1_zero_extendv2siv2di2;
	  else
	    unpack = gen_sse4_1_sign_extendv2siv2di2;
	  break;
	case E_V8QImode:
	  if (unsigned_p)
	    unpack = gen_sse4_1_zero_extendv4qiv4hi2;
	  else
	    unpack = gen_sse4_1_sign_extendv4qiv4hi2;
	  break;
	case E_V4HImode:
	  if (unsigned_p)
	    unpack = gen_sse4_1_zero_extendv2hiv2si2;
	  else
	    unpack = gen_sse4_1_sign_extendv2hiv2si2;
	  break;
	case E_V4QImode:
	  if (unsigned_p)
	    unpack = gen_sse4_1_zero_extendv2qiv2hi2;
	  else
	    unpack = gen_sse4_1_sign_extendv2qiv2hi2;
	  break;
	default:
	  gcc_unreachable ();
	}

      if (GET_MODE_SIZE (imode) >= 32)
	{
	  tmp = gen_reg_rtx (halfmode);
	  emit_insn (extract (tmp, src));
	}
      else if (high_p)
	{
	  switch (GET_MODE_SIZE (imode))
	    {
	    case 16:
	      /* Shift higher 8 bytes to lower 8 bytes.  */
	      tmp = gen_reg_rtx (V1TImode);
	      emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
					     GEN_INT (64)));
	      break;
	    case 8:
	      /* Shift higher 4 bytes to lower 4 bytes.  */
	      tmp = gen_reg_rtx (V1DImode);
	      emit_insn (gen_mmx_lshrv1di3 (tmp, gen_lowpart (V1DImode, src),
					    GEN_INT (32)));
	      break;
	    case 4:
	      /* Shift higher 2 bytes to lower 2 bytes.  */
	      tmp = gen_reg_rtx (V1SImode);
	      emit_insn (gen_mmx_lshrv1si3 (tmp, gen_lowpart (V1SImode, src),
					    GEN_INT (16)));
	      break;
	    default:
	      gcc_unreachable ();
	    }
	  tmp = gen_lowpart (imode, tmp);
	}
      else
	tmp = src;

      emit_insn (unpack (dest, tmp));
    }
  else
    {
      rtx (*unpack)(rtx, rtx, rtx);

      switch (imode)
	{
	case E_V16QImode:
	  if (high_p)
	    unpack = gen_vec_interleave_highv16qi;
	  else
	    unpack = gen_vec_interleave_lowv16qi;
	  break;
	case E_V8HImode:
	  if (high_p)
	    unpack = gen_vec_interleave_highv8hi;
	  else
	    unpack = gen_vec_interleave_lowv8hi;
	  break;
	case E_V4SImode:
	  if (high_p)
	    unpack = gen_vec_interleave_highv4si;
	  else
	    unpack = gen_vec_interleave_lowv4si;
	  break;
	case E_V8QImode:
	  if (high_p)
	    unpack = gen_mmx_punpckhbw;
	  else
	    unpack = gen_mmx_punpcklbw;
	  break;
	case E_V4HImode:
	  if (high_p)
	    unpack = gen_mmx_punpckhwd;
	  else
	    unpack = gen_mmx_punpcklwd;
	  break;
	case E_V4QImode:
	  if (high_p)
	    unpack = gen_mmx_punpckhbw_low;
	  else
	    unpack = gen_mmx_punpcklbw_low;
	  break;
	default:
	  gcc_unreachable ();
	}

      if (unsigned_p)
	tmp = force_reg (imode, CONST0_RTX (imode));
      else
	tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
				   src, pc_rtx, pc_rtx);

      rtx tmp2 = gen_reg_rtx (imode);
      emit_insn (unpack (tmp2, src, tmp));
      emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
    }
}
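
/* For instance, sign-unpacking the low half of a V4SImode register
   { a, b, c, d } yields the V2DImode value { (int64) a, (int64) b }:
   with SSE4.1 this is a single pmovsxdq, while the fallback path
   interleaves the input with its own sign mask (the GT comparison
   above) to supply the high halves.  */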
/* Return true if MEM is a constant-pool reference that contains a
   CONST_VECTOR permutation index; if so, store the index into PERM.  */
bool
ix86_extract_perm_from_pool_constant (int *perm, rtx mem)
{
  machine_mode mode = GET_MODE (mem);
  int nelt = GET_MODE_NUNITS (mode);

  if (!INTEGRAL_MODE_P (mode))
    return false;

  /* Needs to be constant pool.  */
  if (!MEM_P (mem)
      || !SYMBOL_REF_P (XEXP (mem, 0))
      || !CONSTANT_POOL_ADDRESS_P (XEXP (mem, 0)))
    return false;

  rtx constant = get_pool_constant (XEXP (mem, 0));

  if (GET_CODE (constant) != CONST_VECTOR)
    return false;

  /* There could be some rtx like
     (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
     but with "*.LC1" referring to a V2DI constant vector.  */
  if (GET_MODE (constant) != mode)
    {
      constant = simplify_subreg (mode, constant, GET_MODE (constant), 0);
      if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR)
	return false;
    }

  for (int i = 0; i != nelt; i++)
    perm[i] = UINTVAL (XVECEXP (constant, 0, i));

  return true;
}
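
/* E.g. a V4SImode load from a pool entry holding { 2, 3, 0, 1 } fills
   PERM with {2,3,0,1}; a V16QImode reference to a V2DImode pool entry
   is first repunned to the requested mode by the simplify_subreg call
   above.  */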
/* Split operands 0 and 1 into half-mode parts.  Similar to split_double_mode,
   but works for floating point parameters and non-offsettable memories.
   For pushes, it returns just stack offsets; the values will be saved
   in the right order.  Maximally three parts are generated.  */

static int
ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
{
  int size;

  if (!TARGET_64BIT)
    size = mode == XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
  else
    size = (GET_MODE_SIZE (mode) + 4) / 8;

  gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
  gcc_assert (size >= 2 && size <= 4);

  /* Optimize constant pool reference to immediates.  This is used by fp
     moves, that force all constants to memory to allow combining.  */
  if (MEM_P (operand) && MEM_READONLY_P (operand))
    operand = avoid_constant_pool_reference (operand);

  if (MEM_P (operand) && !offsettable_memref_p (operand))
    {
      /* The only non-offsettable memories we handle are pushes.  */
      int ok = push_operand (operand, VOIDmode);

      gcc_assert (ok);

      operand = copy_rtx (operand);
      PUT_MODE (operand, word_mode);
      parts[0] = parts[1] = parts[2] = parts[3] = operand;
      return size;
    }

  if (GET_CODE (operand) == CONST_VECTOR)
    {
      scalar_int_mode imode = int_mode_for_mode (mode).require ();
      /* Caution: if we looked through a constant pool memory above,
	 the operand may actually have a different mode now.  That's
	 ok, since we want to pun this all the way back to an integer.  */
      operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
      gcc_assert (operand != NULL);
      mode = imode;
    }

  if (!TARGET_64BIT)
    {
      if (mode == DImode)
	split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
      else
	{
	  int i;

	  if (REG_P (operand))
	    {
	      gcc_assert (reload_completed);
	      for (i = 0; i < size; i++)
		parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
	    }
	  else if (offsettable_memref_p (operand))
	    {
	      operand = adjust_address (operand, SImode, 0);
	      parts[0] = operand;
	      for (i = 1; i < size; i++)
		parts[i] = adjust_address (operand, SImode, 4 * i);
	    }
	  else if (CONST_DOUBLE_P (operand))
	    {
	      const REAL_VALUE_TYPE *r;
	      long l[4];

	      r = CONST_DOUBLE_REAL_VALUE (operand);
	      switch (mode)
		{
		case E_TFmode:
		  real_to_target (l, r, mode);
		  parts[3] = gen_int_mode (l[3], SImode);
		  parts[2] = gen_int_mode (l[2], SImode);
		  break;
		case E_XFmode:
		  /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
		     long double may not be 80-bit.  */
		  real_to_target (l, r, mode);
		  parts[2] = gen_int_mode (l[2], SImode);
		  break;
		case E_DFmode:
		  REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
		  break;
		default:
		  gcc_unreachable ();
		}
	      parts[1] = gen_int_mode (l[1], SImode);
	      parts[0] = gen_int_mode (l[0], SImode);
	    }
	  else
	    gcc_unreachable ();
	}
    }
  else
    {
      if (mode == TImode)
	split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
      if (mode == XFmode || mode == TFmode)
	{
	  machine_mode upper_mode = mode == XFmode ? SImode : DImode;
	  if (REG_P (operand))
	    {
	      gcc_assert (reload_completed);
	      parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
	      parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
	    }
	  else if (offsettable_memref_p (operand))
	    {
	      operand = adjust_address (operand, DImode, 0);
	      parts[0] = operand;
	      parts[1] = adjust_address (operand, upper_mode, 8);
	    }
	  else if (CONST_DOUBLE_P (operand))
	    {
	      long l[4];

	      real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);

	      /* real_to_target puts 32-bit pieces in each long.  */
	      parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
				       | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
					  << 32), DImode);

	      if (upper_mode == SImode)
		parts[1] = gen_int_mode (l[2], SImode);
	      else
		parts[1]
		  = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
				  | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
				     << 32), DImode);
	    }
	  else
	    gcc_unreachable ();
	}
    }

  return size;
}
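
/* As a concrete example, on !TARGET_64BIT a DFmode constant such as 1.0
   comes back from REAL_VALUE_TO_TARGET_DOUBLE as two 32-bit images, so
   parts[] ends up as the two SImode immediates { 0x00000000, 0x3ff00000 }
   that the caller can move with two plain SImode stores.  */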
/* Emit insns to perform a move or push of DI, DF, XF, and TF values.
   Return false when normal moves are needed; true when all required
   insns have been emitted.  Operands 2-4 contain the input values
   in the correct order; operands 5-7 contain the output values.  */

void
ix86_split_long_move (rtx operands[])
{
  rtx part[2][4];
  int nparts, i, j;
  int push = 0;
  int collisions = 0;
  machine_mode mode = GET_MODE (operands[0]);
  bool collisionparts[4];

  /* The DFmode expanders may ask us to move double.
     For 64bit target this is single move.  By hiding the fact
     here we simplify i386.md splitters.  */
  if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
    {
      /* Optimize constant pool reference to immediates.  This is used by
	 fp moves, that force all constants to memory to allow combining.  */
      if (MEM_P (operands[1])
	  && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
	  && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
	operands[1] = get_pool_constant (XEXP (operands[1], 0));
      if (push_operand (operands[0], VOIDmode))
	{
	  operands[0] = copy_rtx (operands[0]);
	  PUT_MODE (operands[0], word_mode);
	}
      else
	operands[0] = gen_lowpart (DImode, operands[0]);
      operands[1] = gen_lowpart (DImode, operands[1]);
      emit_move_insn (operands[0], operands[1]);
      return;
    }

  /* The only non-offsettable memory we handle is push.  */
  if (push_operand (operands[0], VOIDmode))
    push = 1;
  else
    gcc_assert (!MEM_P (operands[0])
		|| offsettable_memref_p (operands[0]));

  nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
  ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));

  /* When emitting push, take care for source operands on the stack.  */
  if (push && MEM_P (operands[1])
      && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
    {
      rtx src_base = XEXP (part[1][nparts - 1], 0);

      /* Compensate for the stack decrement by 4.  */
      if (!TARGET_64BIT && nparts == 3
	  && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
	src_base = plus_constant (Pmode, src_base, 4);

      /* src_base refers to the stack pointer and is
	 automatically decreased by emitted push.  */
      for (i = 0; i < nparts; i++)
	part[1][i] = change_address (part[1][i],
				     GET_MODE (part[1][i]), src_base);
    }

  /* We need to do copy in the right order in case an address register
     of the source overlaps the destination.  */
  if (REG_P (part[0][0]) && MEM_P (part[1][0]))
    {
      rtx tmp;

      for (i = 0; i < nparts; i++)
	{
	  collisionparts[i]
	    = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
	  if (collisionparts[i])
	    collisions++;
	}

      /* Collision in the middle part can be handled by reordering.  */
      if (collisions == 1 && nparts == 3 && collisionparts[1])
	{
	  std::swap (part[0][1], part[0][2]);
	  std::swap (part[1][1], part[1][2]);
	}
      else if (collisions == 1
	       && nparts == 4
	       && (collisionparts[1] || collisionparts[2]))
	{
	  if (collisionparts[1])
	    {
	      std::swap (part[0][1], part[0][2]);
	      std::swap (part[1][1], part[1][2]);
	    }
	  else
	    {
	      std::swap (part[0][2], part[0][3]);
	      std::swap (part[1][2], part[1][3]);
	    }
	}

      /* If there are more collisions, we can't handle it by reordering.
	 Do an lea to the last part and use only one colliding move.  */
      else if (collisions > 1)
	{
	  rtx base, addr;

	  collisions = 1;

	  base = part[0][nparts - 1];

	  /* Handle the case when the last part isn't valid for lea.
	     Happens in 64-bit mode storing the 12-byte XFmode.  */
	  if (GET_MODE (base) != Pmode)
	    base = gen_rtx_REG (Pmode, REGNO (base));

	  addr = XEXP (part[1][0], 0);
	  if (TARGET_TLS_DIRECT_SEG_REFS)
	    {
	      struct ix86_address parts;
	      int ok = ix86_decompose_address (addr, &parts);
	      gcc_assert (ok);
	      /* It is not valid to use %gs: or %fs: in lea.  */
	      gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
	    }
	  emit_insn (gen_rtx_SET (base, addr));
	  part[1][0] = replace_equiv_address (part[1][0], base);
	  for (i = 1; i < nparts; i++)
	    {
	      tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
	      part[1][i] = replace_equiv_address (part[1][i], tmp);
	    }
	}
    }

  if (push)
    {
      if (!TARGET_64BIT)
	{
	  if (nparts == 3)
	    {
	      if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
		emit_insn (gen_add2_insn (stack_pointer_rtx, GEN_INT (-4)));
	      emit_move_insn (part[0][2], part[1][2]);
	    }
	  else if (nparts == 4)
	    {
	      emit_move_insn (part[0][3], part[1][3]);
	      emit_move_insn (part[0][2], part[1][2]);
	    }
	}
      else
	{
	  /* In 64bit mode we don't have 32bit push available.  In case this
	     is a register, it is OK - we will just use the larger
	     counterpart.  We also retype memory - these come from an attempt
	     to avoid the REX prefix on moving the second half of a TFmode
	     value.  */
	  if (GET_MODE (part[1][1]) == SImode)
	    {
	      switch (GET_CODE (part[1][1]))
		{
		case MEM:
		  part[1][1] = adjust_address (part[1][1], DImode, 0);
		  break;

		case REG:
		  part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
		  break;

		default:
		  gcc_unreachable ();
		}

	      if (GET_MODE (part[1][0]) == SImode)
		part[1][0] = part[1][1];
	    }
	}
      emit_move_insn (part[0][1], part[1][1]);
      emit_move_insn (part[0][0], part[1][0]);
      return;
    }

  /* Choose correct order to not overwrite the source before it is copied.  */
  if ((REG_P (part[0][0])
       && REG_P (part[1][1])
       && (REGNO (part[0][0]) == REGNO (part[1][1])
	   || (nparts == 3
	       && REGNO (part[0][0]) == REGNO (part[1][2]))
	   || (nparts == 4
	       && REGNO (part[0][0]) == REGNO (part[1][3]))))
      || (collisions > 0
	  && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
    {
      for (i = 0, j = nparts - 1; i < nparts; i++, j--)
	{
	  operands[2 + i] = part[0][j];
	  operands[6 + i] = part[1][j];
	}
    }
  else
    {
      for (i = 0; i < nparts; i++)
	{
	  operands[2 + i] = part[0][i];
	  operands[6 + i] = part[1][i];
	}
    }

  /* If optimizing for size, attempt to locally unCSE nonzero constants.  */
  if (optimize_insn_for_size_p ())
    {
      for (j = 0; j < nparts - 1; j++)
	if (CONST_INT_P (operands[6 + j])
	    && operands[6 + j] != const0_rtx
	    && REG_P (operands[2 + j]))
	  for (i = j; i < nparts - 1; i++)
	    if (CONST_INT_P (operands[7 + i])
		&& INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
	      operands[7 + i] = operands[2 + j];
    }

  for (i = 0; i < nparts; i++)
    emit_move_insn (operands[2 + i], operands[6 + i]);
}
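
/* For example, a DImode register-to-register move on ia32 becomes two
   SImode moves; if the destination's low word is the same register as
   the source's high word, the ordering code above emits the parts in
   reverse so the source is read before it is overwritten.  */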
/* Helper function of ix86_split_ashl used to generate an SImode/DImode
   left shift by a constant, either using a single shift or
   a sequence of add instructions.  */

static void
ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
{
  if (count == 1
      || (count * ix86_cost->add <= ix86_cost->shift_const
	  && !optimize_insn_for_size_p ()))
    {
      while (count-- > 0)
	emit_insn (gen_add2_insn (operand, operand));
    }
  else
    {
      rtx (*insn)(rtx, rtx, rtx);

      insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
      emit_insn (insn (operand, operand, GEN_INT (count)));
    }
}
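
/* E.g. a shift by 1 is always emitted as a single "add reg, reg"; small
   counts use a short add sequence whenever count * add_cost does not
   exceed the cost of a shift by constant and we are not optimizing for
   size.  */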
void
ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
{
  rtx (*gen_ashl3)(rtx, rtx, rtx);
  rtx (*gen_shld)(rtx, rtx, rtx);
  int half_width = GET_MODE_BITSIZE (mode) >> 1;
  machine_mode half_mode;

  rtx low[2], high[2];
  int count;

  if (CONST_INT_P (operands[2]))
    {
      split_double_mode (mode, operands, 2, low, high);
      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);

      if (count >= half_width)
	{
	  emit_move_insn (high[0], low[1]);
	  emit_move_insn (low[0], const0_rtx);

	  if (count > half_width)
	    ix86_expand_ashl_const (high[0], count - half_width, mode);
	}
      else
	{
	  gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;

	  if (!rtx_equal_p (operands[0], operands[1]))
	    emit_move_insn (operands[0], operands[1]);

	  emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
	  ix86_expand_ashl_const (low[0], count, mode);
	}
      return;
    }

  split_double_mode (mode, operands, 1, low, high);
  half_mode = mode == DImode ? SImode : DImode;

  gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;

  if (operands[1] == const1_rtx)
    {
      /* Assuming we've chosen QImode-capable registers, then 1 << N
	 can be done with two 32/64-bit shifts, no branches, no cmoves.  */
      if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
	{
	  rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);

	  ix86_expand_clear (low[0]);
	  ix86_expand_clear (high[0]);
	  emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));

	  d = gen_lowpart (QImode, low[0]);
	  d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
	  s = gen_rtx_EQ (QImode, flags, const0_rtx);
	  emit_insn (gen_rtx_SET (d, s));

	  d = gen_lowpart (QImode, high[0]);
	  d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
	  s = gen_rtx_NE (QImode, flags, const0_rtx);
	  emit_insn (gen_rtx_SET (d, s));
	}

      /* Otherwise, we can get the same results by manually performing
	 a bit extract operation on bit 5/6, and then performing the two
	 shifts.  The two methods of getting 0/1 into low/high are exactly
	 the same size.  Avoiding the shift in the bit extract case helps
	 pentium4 a bit; no one else seems to care much either way.  */
      else
	{
	  rtx (*gen_lshr3)(rtx, rtx, rtx);
	  rtx (*gen_and3)(rtx, rtx, rtx);
	  rtx (*gen_xor3)(rtx, rtx, rtx);
	  HOST_WIDE_INT bits;
	  rtx x;

	  if (mode == DImode)
	    {
	      gen_lshr3 = gen_lshrsi3;
	      gen_and3 = gen_andsi3;
	      gen_xor3 = gen_xorsi3;
	      bits = 5;
	    }
	  else
	    {
	      gen_lshr3 = gen_lshrdi3;
	      gen_and3 = gen_anddi3;
	      gen_xor3 = gen_xordi3;
	      bits = 6;
	    }

	  if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
	    x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
	  else
	    x = gen_lowpart (half_mode, operands[2]);
	  emit_insn (gen_rtx_SET (high[0], x));

	  emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
	  emit_insn (gen_and3 (high[0], high[0], const1_rtx));
	  emit_move_insn (low[0], high[0]);
	  emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
	}

      emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
      emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
      return;
    }

  if (operands[1] == constm1_rtx)
    {
      /* For -1 << N, we can avoid the shld instruction, because we
	 know that we're shifting 0...31/63 ones into a -1.  */
      emit_move_insn (low[0], constm1_rtx);
      if (optimize_insn_for_size_p ())
	emit_move_insn (high[0], low[0]);
      else
	emit_move_insn (high[0], constm1_rtx);
    }
  else
    {
      gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;

      if (!rtx_equal_p (operands[0], operands[1]))
	emit_move_insn (operands[0], operands[1]);

      split_double_mode (mode, operands, 1, low, high);
      emit_insn (gen_shld (high[0], low[0], operands[2]));
    }

  emit_insn (gen_ashl3 (low[0], low[0], operands[2]));

  if (TARGET_CMOVE && scratch)
    {
      ix86_expand_clear (scratch);
      emit_insn (gen_x86_shift_adj_1
		 (half_mode, high[0], low[0], operands[2], scratch));
    }
  else
    emit_insn (gen_x86_shift_adj_2 (half_mode, high[0], low[0], operands[2]));
}
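
/* So a 64-bit "x << 40" on ia32 with a constant count becomes roughly
   mov high, low; xor low, low; shl high, 8 (the count >= half_width
   path above), while a variable count uses shld plus the shift
   adjustment helpers to patch the result when bit 5 of the count is
   set.  */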
void
ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
{
  rtx (*gen_ashr3)(rtx, rtx, rtx)
    = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
  rtx (*gen_shrd)(rtx, rtx, rtx);
  int half_width = GET_MODE_BITSIZE (mode) >> 1;

  rtx low[2], high[2];
  int count;

  if (CONST_INT_P (operands[2]))
    {
      split_double_mode (mode, operands, 2, low, high);
      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);

      if (count == GET_MODE_BITSIZE (mode) - 1)
	{
	  emit_move_insn (high[0], high[1]);
	  emit_insn (gen_ashr3 (high[0], high[0],
				GEN_INT (half_width - 1)));
	  emit_move_insn (low[0], high[0]);
	}
      else if (count >= half_width)
	{
	  emit_move_insn (low[0], high[1]);
	  emit_move_insn (high[0], low[0]);
	  emit_insn (gen_ashr3 (high[0], high[0],
				GEN_INT (half_width - 1)));

	  if (count > half_width)
	    emit_insn (gen_ashr3 (low[0], low[0],
				  GEN_INT (count - half_width)));
	}
      else
	{
	  gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

	  if (!rtx_equal_p (operands[0], operands[1]))
	    emit_move_insn (operands[0], operands[1]);

	  emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
	  emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
	}
    }
  else
    {
      machine_mode half_mode;

      gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

      if (!rtx_equal_p (operands[0], operands[1]))
	emit_move_insn (operands[0], operands[1]);

      split_double_mode (mode, operands, 1, low, high);
      half_mode = mode == DImode ? SImode : DImode;

      emit_insn (gen_shrd (low[0], high[0], operands[2]));
      emit_insn (gen_ashr3 (high[0], high[0], operands[2]));

      if (TARGET_CMOVE && scratch)
	{
	  emit_move_insn (scratch, high[0]);
	  emit_insn (gen_ashr3 (scratch, scratch,
				GEN_INT (half_width - 1)));
	  emit_insn (gen_x86_shift_adj_1
		     (half_mode, low[0], high[0], operands[2], scratch));
	}
      else
	emit_insn (gen_x86_shift_adj_3
		   (half_mode, low[0], high[0], operands[2]));
    }
}
void
ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
{
  rtx (*gen_lshr3)(rtx, rtx, rtx)
    = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
  rtx (*gen_shrd)(rtx, rtx, rtx);
  int half_width = GET_MODE_BITSIZE (mode) >> 1;

  rtx low[2], high[2];
  int count;

  if (CONST_INT_P (operands[2]))
    {
      split_double_mode (mode, operands, 2, low, high);
      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);

      if (count >= half_width)
	{
	  emit_move_insn (low[0], high[1]);
	  ix86_expand_clear (high[0]);

	  if (count > half_width)
	    emit_insn (gen_lshr3 (low[0], low[0],
				  GEN_INT (count - half_width)));
	}
      else
	{
	  gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

	  if (!rtx_equal_p (operands[0], operands[1]))
	    emit_move_insn (operands[0], operands[1]);

	  emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
	  emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
	}
    }
  else
    {
      machine_mode half_mode;

      gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

      if (!rtx_equal_p (operands[0], operands[1]))
	emit_move_insn (operands[0], operands[1]);

      split_double_mode (mode, operands, 1, low, high);
      half_mode = mode == DImode ? SImode : DImode;

      emit_insn (gen_shrd (low[0], high[0], operands[2]));
      emit_insn (gen_lshr3 (high[0], high[0], operands[2]));

      if (TARGET_CMOVE && scratch)
	{
	  ix86_expand_clear (scratch);
	  emit_insn (gen_x86_shift_adj_1
		     (half_mode, low[0], high[0], operands[2], scratch));
	}
      else
	emit_insn (gen_x86_shift_adj_2
		   (half_mode, low[0], high[0], operands[2]));
    }
}
/* Expand move of V1TI mode register X to a new TI mode register.  */
static rtx
ix86_expand_v1ti_to_ti (rtx x)
{
  rtx result = gen_reg_rtx (TImode);
  if (TARGET_SSE2)
    {
      rtx temp = force_reg (V2DImode, gen_lowpart (V2DImode, x));
      rtx lo = gen_lowpart (DImode, result);
      emit_insn (gen_vec_extractv2didi (lo, temp, const0_rtx));
      rtx hi = gen_highpart (DImode, result);
      emit_insn (gen_vec_extractv2didi (hi, temp, const1_rtx));
    }
  else
    emit_move_insn (result, gen_lowpart (TImode, x));
  return result;
}

/* Expand move of TI mode register X to a new V1TI mode register.  */
static rtx
ix86_expand_ti_to_v1ti (rtx x)
{
  if (TARGET_SSE2)
    {
      rtx lo = gen_lowpart (DImode, x);
      rtx hi = gen_highpart (DImode, x);
      rtx tmp = gen_reg_rtx (V2DImode);
      emit_insn (gen_vec_concatv2di (tmp, lo, hi));
      return force_reg (V1TImode, gen_lowpart (V1TImode, tmp));
    }

  return force_reg (V1TImode, gen_lowpart (V1TImode, x));
}
/* Expand V1TI mode shift (of rtx_code CODE) by constant.  */
void
ix86_expand_v1ti_shift (enum rtx_code code, rtx operands[])
{
  rtx op1 = force_reg (V1TImode, operands[1]);

  if (!CONST_INT_P (operands[2]))
    {
      rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
      rtx tmp2 = gen_reg_rtx (TImode);
      rtx (*shift) (rtx, rtx, rtx)
	= (code == ASHIFT) ? gen_ashlti3 : gen_lshrti3;
      emit_insn (shift (tmp2, tmp1, operands[2]));
      rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
      emit_move_insn (operands[0], tmp3);
      return;
    }

  HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;

  if (bits == 0)
    {
      emit_move_insn (operands[0], op1);
      return;
    }

  if ((bits & 7) == 0)
    {
      rtx tmp = gen_reg_rtx (V1TImode);
      if (code == ASHIFT)
	emit_insn (gen_sse2_ashlv1ti3 (tmp, op1, GEN_INT (bits)));
      else
	emit_insn (gen_sse2_lshrv1ti3 (tmp, op1, GEN_INT (bits)));
      emit_move_insn (operands[0], tmp);
      return;
    }

  rtx tmp1 = gen_reg_rtx (V1TImode);
  if (code == ASHIFT)
    emit_insn (gen_sse2_ashlv1ti3 (tmp1, op1, GEN_INT (64)));
  else
    emit_insn (gen_sse2_lshrv1ti3 (tmp1, op1, GEN_INT (64)));

  /* tmp2 is operands[1] shifted by 64, in V2DImode.  */
  rtx tmp2 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));

  /* tmp3 will be the V2DImode result.  */
  rtx tmp3 = gen_reg_rtx (V2DImode);

  if (bits > 64)
    {
      if (code == ASHIFT)
	emit_insn (gen_ashlv2di3 (tmp3, tmp2, GEN_INT (bits - 64)));
      else
	emit_insn (gen_lshrv2di3 (tmp3, tmp2, GEN_INT (bits - 64)));
    }
  else
    {
      /* tmp4 is operands[1], in V2DImode.  */
      rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));

      rtx tmp5 = gen_reg_rtx (V2DImode);
      if (code == ASHIFT)
	emit_insn (gen_ashlv2di3 (tmp5, tmp4, GEN_INT (bits)));
      else
	emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));

      rtx tmp6 = gen_reg_rtx (V2DImode);
      if (code == ASHIFT)
	emit_insn (gen_lshrv2di3 (tmp6, tmp2, GEN_INT (64 - bits)));
      else
	emit_insn (gen_ashlv2di3 (tmp6, tmp2, GEN_INT (64 - bits)));

      emit_insn (gen_iorv2di3 (tmp3, tmp5, tmp6));
    }

  /* Convert the result back to V1TImode and store in operands[0].  */
  rtx tmp7 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
  emit_move_insn (operands[0], tmp7);
}
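
/* For instance, an ASHIFT by 24 is handled by the (bits & 7) == 0 case
   as a single pslldq of 3 bytes, while an ASHIFT by 5 takes the general
   path: the 64-bit halves are shifted left by 5 and combined with the
   bits that cross the 64-bit boundary, which come from the copy of the
   value that was pre-shifted by 64 bits.  */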
/* Expand V1TI mode rotate (of rtx_code CODE) by constant.  */
void
ix86_expand_v1ti_rotate (enum rtx_code code, rtx operands[])
{
  rtx op1 = force_reg (V1TImode, operands[1]);

  if (!CONST_INT_P (operands[2]))
    {
      rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
      rtx tmp2 = gen_reg_rtx (TImode);
      rtx (*rotate) (rtx, rtx, rtx)
	= (code == ROTATE) ? gen_rotlti3 : gen_rotrti3;
      emit_insn (rotate (tmp2, tmp1, operands[2]));
      rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
      emit_move_insn (operands[0], tmp3);
      return;
    }

  HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;

  if (bits == 0)
    {
      emit_move_insn (operands[0], op1);
      return;
    }

  if (code == ROTATERT)
    bits = 128 - bits;

  if ((bits & 31) == 0)
    {
      rtx tmp2 = gen_reg_rtx (V4SImode);
      rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
      if (bits == 32)
	emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x93)));
      else if (bits == 64)
	emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x4e)));
      else
	emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x39)));
      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp2));
      return;
    }

  if ((bits & 7) == 0)
    {
      rtx tmp1 = gen_reg_rtx (V1TImode);
      rtx tmp2 = gen_reg_rtx (V1TImode);
      rtx tmp3 = gen_reg_rtx (V1TImode);

      emit_insn (gen_sse2_ashlv1ti3 (tmp1, op1, GEN_INT (bits)));
      emit_insn (gen_sse2_lshrv1ti3 (tmp2, op1, GEN_INT (128 - bits)));
      emit_insn (gen_iorv1ti3 (tmp3, tmp1, tmp2));
      emit_move_insn (operands[0], tmp3);
      return;
    }

  rtx op1_v4si = force_reg (V4SImode, gen_lowpart (V4SImode, op1));

  rtx lobits;
  rtx hibits;

  switch (bits >> 5)
    {
    case 0:
      lobits = op1_v4si;
      hibits = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x93)));
      break;

    case 1:
      lobits = gen_reg_rtx (V4SImode);
      hibits = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x93)));
      emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x4e)));
      break;

    case 2:
      lobits = gen_reg_rtx (V4SImode);
      hibits = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x4e)));
      emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x39)));
      break;

    default:
      lobits = gen_reg_rtx (V4SImode);
      hibits = op1_v4si;
      emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x39)));
      break;
    }

  rtx tmp1 = gen_reg_rtx (V4SImode);
  rtx tmp2 = gen_reg_rtx (V4SImode);
  rtx tmp3 = gen_reg_rtx (V4SImode);

  emit_insn (gen_ashlv4si3 (tmp1, lobits, GEN_INT (bits & 31)));
  emit_insn (gen_lshrv4si3 (tmp2, hibits, GEN_INT (32 - (bits & 31))));
  emit_insn (gen_iorv4si3 (tmp3, tmp1, tmp2));

  emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp3));
}
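
/* The pshufd immediates used above encode dword permutations: 0x93 picks
   { src[3], src[0], src[1], src[2] }, i.e. a rotate left by 32 bits,
   0x4e swaps the 64-bit halves (rotate by 64), and 0x39 picks
   { src[1], src[2], src[3], src[0] }, a rotate left by 96.  */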
/* Expand V1TI mode ashiftrt by constant.  */
void
ix86_expand_v1ti_ashiftrt (rtx operands[])
{
  rtx op1 = force_reg (V1TImode, operands[1]);

  if (!CONST_INT_P (operands[2]))
    {
      rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
      rtx tmp2 = gen_reg_rtx (TImode);
      emit_insn (gen_ashrti3 (tmp2, tmp1, operands[2]));
      rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
      emit_move_insn (operands[0], tmp3);
      return;
    }

  HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;

  if (bits == 0)
    {
      emit_move_insn (operands[0], op1);
      return;
    }

  if (bits == 127)
    {
      /* Two operations.  */
      rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
      rtx tmp2 = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));

      rtx tmp3 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp3));
      return;
    }

  if (bits == 64)
    {
      /* Three operations.  */
      rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
      rtx tmp2 = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));

      rtx tmp3 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));

      rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
      rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
      rtx tmp6 = gen_reg_rtx (V2DImode);
      emit_insn (gen_vec_interleave_highv2di (tmp6, tmp4, tmp5));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
      return;
    }

  if (bits == 96)
    {
      /* Three operations.  */
      rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
      rtx tmp2 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (31)));

      rtx tmp3 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
      rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp2));
      rtx tmp5 = gen_reg_rtx (V2DImode);
      emit_insn (gen_vec_interleave_highv2di (tmp5, tmp3, tmp4));

      rtx tmp6 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp5));
      rtx tmp7 = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (tmp7, tmp6, GEN_INT (0xfd)));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp7));
      return;
    }

  if (bits >= 111)
    {
      /* Three operations.  */
      rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
      rtx tmp2 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits - 96)));

      rtx tmp3 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
      rtx tmp4 = gen_reg_rtx (V8HImode);
      emit_insn (gen_sse2_pshufhw (tmp4, tmp3, GEN_INT (0xfe)));

      rtx tmp5 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp4));
      rtx tmp6 = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (tmp6, tmp5, GEN_INT (0xfe)));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
      return;
    }

  if (TARGET_AVX2 || TARGET_SSE4_1)
    {
      /* Three operations.  */
      if (bits == 32)
	{
	  rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
	  rtx tmp2 = gen_reg_rtx (V4SImode);
	  emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (31)));

	  rtx tmp3 = gen_reg_rtx (V1TImode);
	  emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (32)));

	  if (TARGET_AVX2)
	    {
	      rtx tmp4 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp3));
	      rtx tmp5 = gen_reg_rtx (V4SImode);
	      emit_insn (gen_avx2_pblenddv4si (tmp5, tmp2, tmp4,
					       GEN_INT (7)));

	      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp5));
	    }
	  else
	    {
	      rtx tmp4 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
	      rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
	      rtx tmp6 = gen_reg_rtx (V8HImode);
	      emit_insn (gen_sse4_1_pblendw (tmp6, tmp4, tmp5,
					     GEN_INT (0x3f)));

	      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
	    }
	  return;
	}

      /* Three operations.  */
      if (bits == 8 || bits == 16 || bits == 24)
	{
	  rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
	  rtx tmp2 = gen_reg_rtx (V4SImode);
	  emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));

	  rtx tmp3 = gen_reg_rtx (V1TImode);
	  emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (bits)));

	  if (TARGET_AVX2)
	    {
	      rtx tmp4 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp3));
	      rtx tmp5 = gen_reg_rtx (V4SImode);
	      emit_insn (gen_avx2_pblenddv4si (tmp5, tmp2, tmp4,
					       GEN_INT (7)));

	      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp5));
	    }
	  else
	    {
	      rtx tmp4 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
	      rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
	      rtx tmp6 = gen_reg_rtx (V8HImode);
	      emit_insn (gen_sse4_1_pblendw (tmp6, tmp4, tmp5,
					     GEN_INT (0x3f)));

	      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
	    }
	  return;
	}
    }

  if (bits > 96)
    {
      /* Four operations.  */
      rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
      rtx tmp2 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits - 96)));

      rtx tmp3 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp3, tmp1, GEN_INT (31)));

      rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp2));
      rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
      rtx tmp6 = gen_reg_rtx (V2DImode);
      emit_insn (gen_vec_interleave_highv2di (tmp6, tmp4, tmp5));

      rtx tmp7 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp6));
      rtx tmp8 = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (tmp8, tmp7, GEN_INT (0xfd)));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp8));
      return;
    }

  if (TARGET_SSE4_1 && (bits == 48 || bits == 80))
    {
      /* Four operations.  */
      rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
      rtx tmp2 = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));

      rtx tmp3 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));

      rtx tmp4 = gen_reg_rtx (V1TImode);
      emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (bits)));

      rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
      rtx tmp6 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp4));
      rtx tmp7 = gen_reg_rtx (V8HImode);
      emit_insn (gen_sse4_1_pblendw (tmp7, tmp5, tmp6,
				     GEN_INT (bits == 48 ? 0x1f : 0x07)));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp7));
      return;
    }

  if ((bits & 7) == 0)
    {
      /* Five operations.  */
      rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
      rtx tmp2 = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));

      rtx tmp3 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));

      rtx tmp4 = gen_reg_rtx (V1TImode);
      emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (bits)));

      rtx tmp5 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
      rtx tmp6 = gen_reg_rtx (V1TImode);
      emit_insn (gen_sse2_ashlv1ti3 (tmp6, tmp5, GEN_INT (128 - bits)));

      rtx tmp7 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
      rtx tmp8 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp6));
      rtx tmp9 = gen_reg_rtx (V2DImode);
      emit_insn (gen_iorv2di3 (tmp9, tmp7, tmp8));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp9));
      return;
    }

  if (TARGET_AVX2 && bits < 32)
    {
      /* Six operations.  */
      rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
      rtx tmp2 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));

      rtx tmp3 = gen_reg_rtx (V1TImode);
      emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (64)));

      rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
      rtx tmp5 = gen_reg_rtx (V2DImode);
      emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));

      rtx tmp6 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
      rtx tmp7 = gen_reg_rtx (V2DImode);
      emit_insn (gen_ashlv2di3 (tmp7, tmp6, GEN_INT (64 - bits)));

      rtx tmp8 = gen_reg_rtx (V2DImode);
      emit_insn (gen_iorv2di3 (tmp8, tmp5, tmp7));

      rtx tmp9 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp8));
      rtx tmp10 = gen_reg_rtx (V4SImode);
      emit_insn (gen_avx2_pblenddv4si (tmp10, tmp2, tmp9, GEN_INT (7)));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp10));
      return;
    }

  if (TARGET_SSE4_1 && bits < 15)
    {
      /* Six operations.  */
      rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
      rtx tmp2 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));

      rtx tmp3 = gen_reg_rtx (V1TImode);
      emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (64)));

      rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
      rtx tmp5 = gen_reg_rtx (V2DImode);
      emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));

      rtx tmp6 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
      rtx tmp7 = gen_reg_rtx (V2DImode);
      emit_insn (gen_ashlv2di3 (tmp7, tmp6, GEN_INT (64 - bits)));

      rtx tmp8 = gen_reg_rtx (V2DImode);
      emit_insn (gen_iorv2di3 (tmp8, tmp5, tmp7));

      rtx tmp9 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
      rtx tmp10 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp8));
      rtx tmp11 = gen_reg_rtx (V8HImode);
      emit_insn (gen_sse4_1_pblendw (tmp11, tmp9, tmp10, GEN_INT (0x3f)));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp11));
      return;
    }

  if (bits == 1)
    {
      /* Eight operations.  */
      rtx tmp1 = gen_reg_rtx (V1TImode);
      emit_insn (gen_sse2_lshrv1ti3 (tmp1, op1, GEN_INT (64)));

      rtx tmp2 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
      rtx tmp3 = gen_reg_rtx (V2DImode);
      emit_insn (gen_lshrv2di3 (tmp3, tmp2, GEN_INT (1)));

      rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
      rtx tmp5 = gen_reg_rtx (V2DImode);
      emit_insn (gen_ashlv2di3 (tmp5, tmp4, GEN_INT (63)));

      rtx tmp6 = gen_reg_rtx (V2DImode);
      emit_insn (gen_iorv2di3 (tmp6, tmp3, tmp5));

      rtx tmp7 = gen_reg_rtx (V2DImode);
      emit_insn (gen_lshrv2di3 (tmp7, tmp2, GEN_INT (63)));

      rtx tmp8 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp7));
      rtx tmp9 = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (tmp9, tmp8, GEN_INT (0xbf)));

      rtx tmp10 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp9));
      rtx tmp11 = gen_reg_rtx (V2DImode);
      emit_insn (gen_ashlv2di3 (tmp11, tmp10, GEN_INT (31)));

      rtx tmp12 = gen_reg_rtx (V2DImode);
      emit_insn (gen_iorv2di3 (tmp12, tmp6, tmp11));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp12));
      return;
    }

  if (bits > 64)
    {
      /* Eight operations.  */
      rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
      rtx tmp2 = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));

      rtx tmp3 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));

      rtx tmp4 = gen_reg_rtx (V1TImode);
      emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (64)));

      rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
      rtx tmp6 = gen_reg_rtx (V2DImode);
      emit_insn (gen_lshrv2di3 (tmp6, tmp5, GEN_INT (bits - 64)));

      rtx tmp7 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
      rtx tmp8 = gen_reg_rtx (V1TImode);
      emit_insn (gen_sse2_ashlv1ti3 (tmp8, tmp7, GEN_INT (64)));

      rtx tmp9 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
      rtx tmp10 = gen_reg_rtx (V2DImode);
      emit_insn (gen_ashlv2di3 (tmp10, tmp9, GEN_INT (128 - bits)));

      rtx tmp11 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp8));
      rtx tmp12 = gen_reg_rtx (V2DImode);
      emit_insn (gen_iorv2di3 (tmp12, tmp10, tmp11));

      rtx tmp13 = gen_reg_rtx (V2DImode);
      emit_insn (gen_iorv2di3 (tmp13, tmp6, tmp12));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp13));
      return;
    }

  /* Nine operations.  */
  rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
  rtx tmp2 = gen_reg_rtx (V4SImode);
  emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));

  rtx tmp3 = gen_reg_rtx (V4SImode);
  emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));

  rtx tmp4 = gen_reg_rtx (V1TImode);
  emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (64)));

  rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
  rtx tmp6 = gen_reg_rtx (V2DImode);
  emit_insn (gen_lshrv2di3 (tmp6, tmp5, GEN_INT (bits)));

  rtx tmp7 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
  rtx tmp8 = gen_reg_rtx (V2DImode);
  emit_insn (gen_ashlv2di3 (tmp8, tmp7, GEN_INT (64 - bits)));

  rtx tmp9 = gen_reg_rtx (V2DImode);
  emit_insn (gen_iorv2di3 (tmp9, tmp6, tmp8));

  rtx tmp10 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
  rtx tmp11 = gen_reg_rtx (V1TImode);
  emit_insn (gen_sse2_ashlv1ti3 (tmp11, tmp10, GEN_INT (64)));

  rtx tmp12 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp11));
  rtx tmp13 = gen_reg_rtx (V2DImode);
  emit_insn (gen_ashlv2di3 (tmp13, tmp12, GEN_INT (64 - bits)));

  rtx tmp14 = gen_reg_rtx (V2DImode);
  emit_insn (gen_iorv2di3 (tmp14, tmp9, tmp13));

  emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp14));
}
/* Return mode for the memcpy/memset loop counter.  Prefer SImode over
   DImode for constant loop counts.  */
static machine_mode
counter_mode (rtx count_exp)
{
  if (GET_MODE (count_exp) != VOIDmode)
    return GET_MODE (count_exp);
  if (!CONST_INT_P (count_exp))
    return Pmode;
  if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
    return DImode;
  return SImode;
}
/* When ISSETMEM is FALSE, output a simple loop copying memory pointed to
   by SRCPTR to DESTPTR via chunks of MODE unrolled UNROLL times; the overall
   size is COUNT, specified in bytes.  When ISSETMEM is TRUE, output the
   equivalent loop to set memory by VALUE (supposed to be in MODE).

   The size is rounded down to a whole number of chunks moved at once.
   SRCMEM and DESTMEM provide MEM rtx to feed proper aliasing info.  */

static void
expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
			       rtx destptr, rtx srcptr, rtx value,
			       rtx count, machine_mode mode, int unroll,
			       int expected_size, bool issetmem)
{
  rtx_code_label *out_label, *top_label;
  rtx iter, tmp;
  machine_mode iter_mode = counter_mode (count);
  int piece_size_n = GET_MODE_SIZE (mode) * unroll;
  rtx piece_size = GEN_INT (piece_size_n);
  rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
  rtx size;
  int i;

  top_label = gen_label_rtx ();
  out_label = gen_label_rtx ();
  iter = gen_reg_rtx (iter_mode);

  size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
			      NULL, 1, OPTAB_DIRECT);
  /* Those two should combine.  */
  if (piece_size == const1_rtx)
    {
      emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
			       true, out_label);
      predict_jump (REG_BR_PROB_BASE * 10 / 100);
    }
  emit_move_insn (iter, const0_rtx);

  emit_label (top_label);

  tmp = convert_modes (Pmode, iter_mode, iter, true);

  /* This assert could be relaxed - in this case we'll need to compute
     the smallest power of two containing PIECE_SIZE_N and pass it to
     offset_address.  */
  gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
  destmem = offset_address (destmem, tmp, piece_size_n);
  destmem = adjust_address (destmem, mode, 0);

  if (!issetmem)
    {
      srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
      srcmem = adjust_address (srcmem, mode, 0);

      /* When unrolling for chips that reorder memory reads and writes,
	 we can save registers by using a single temporary.
	 Also using 4 temporaries is overkill in 32bit mode.  */
      if (!TARGET_64BIT && 0)
	{
	  for (i = 0; i < unroll; i++)
	    {
	      if (i)
		{
		  destmem = adjust_address (copy_rtx (destmem), mode,
					    GET_MODE_SIZE (mode));
		  srcmem = adjust_address (copy_rtx (srcmem), mode,
					   GET_MODE_SIZE (mode));
		}
	      emit_move_insn (destmem, srcmem);
	    }
	}
      else
	{
	  rtx tmpreg[4];
	  gcc_assert (unroll <= 4);
	  for (i = 0; i < unroll; i++)
	    {
	      tmpreg[i] = gen_reg_rtx (mode);
	      if (i)
		srcmem = adjust_address (copy_rtx (srcmem), mode,
					 GET_MODE_SIZE (mode));
	      emit_move_insn (tmpreg[i], srcmem);
	    }
	  for (i = 0; i < unroll; i++)
	    {
	      if (i)
		destmem = adjust_address (copy_rtx (destmem), mode,
					  GET_MODE_SIZE (mode));
	      emit_move_insn (destmem, tmpreg[i]);
	    }
	}
    }
  else
    for (i = 0; i < unroll; i++)
      {
	if (i)
	  destmem = adjust_address (copy_rtx (destmem), mode,
				    GET_MODE_SIZE (mode));
	emit_move_insn (destmem, value);
      }

  tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
			     true, OPTAB_LIB_WIDEN);
  if (tmp != iter)
    emit_move_insn (iter, tmp);

  emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
			   true, top_label);
  if (expected_size != -1)
    {
      expected_size /= GET_MODE_SIZE (mode) * unroll;
      if (expected_size == 0)
	predict_jump (0);
      else if (expected_size > REG_BR_PROB_BASE)
	predict_jump (REG_BR_PROB_BASE - 1);
      else
	predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2)
		      / expected_size);
    }
  else
    predict_jump (REG_BR_PROB_BASE * 80 / 100);
  iter = ix86_zero_extend_to_Pmode (iter);
  tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
			     true, OPTAB_LIB_WIDEN);
  if (tmp != destptr)
    emit_move_insn (destptr, tmp);
  if (!issetmem)
    {
      tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
				 true, OPTAB_LIB_WIDEN);
      if (tmp != srcptr)
	emit_move_insn (srcptr, tmp);
    }
  emit_label (out_label);
}
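
/* The emitted control flow is roughly:
       size = count & ~(piece_size - 1);
       iter = 0;
     top:
       copy (or set) piece_size bytes at dest + iter [, src + iter];
       iter += piece_size;
       if (iter < size) goto top;
       dest += iter;  [ src += iter; ]
   with the branch probabilities seeded from EXPECTED_SIZE.  */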
/* Divide COUNTREG by SCALE.  */
static rtx
scale_counter (rtx countreg, int scale)
{
  rtx sc;

  if (scale == 1)
    return countreg;
  if (CONST_INT_P (countreg))
    return GEN_INT (INTVAL (countreg) / scale);
  gcc_assert (REG_P (countreg));

  sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
			    GEN_INT (exact_log2 (scale)),
			    NULL, 1, OPTAB_DIRECT);
  return sc;
}
/* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
   When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
   When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
   For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
   ORIG_VALUE is the original value passed to memset to fill the memory with.
   Other arguments have the same meaning as for the previous function.  */
static void
expand_set_or_cpymem_via_rep (rtx destmem, rtx srcmem,
			      rtx destptr, rtx srcptr, rtx value,
			      rtx orig_value, rtx count,
			      machine_mode mode, bool issetmem)
{
  rtx destexp;
  rtx srcexp;
  rtx countreg;
  HOST_WIDE_INT rounded_count;

  /* If possible, it is shorter to use rep movs.
     TODO: Maybe it is better to move this logic to decide_alg.  */
  if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
      && !TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
      && (!issetmem || orig_value == const0_rtx))
    mode = SImode;

  if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
    destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);

  countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
						       GET_MODE_SIZE (mode)));
  if (mode != QImode)
    {
      destexp = gen_rtx_ASHIFT (Pmode, countreg,
				GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
      destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
    }
  else
    destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
  if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
    {
      rounded_count
	= ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
      destmem = shallow_copy_rtx (destmem);
      set_mem_size (destmem, rounded_count);
    }
  else if (MEM_SIZE_KNOWN_P (destmem))
    clear_mem_size (destmem);

  if (issetmem)
    {
      value = force_reg (mode, gen_lowpart (mode, value));
      emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
    }
  else
    {
      if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
	srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
      if (mode != QImode)
	{
	  srcexp = gen_rtx_ASHIFT (Pmode, countreg,
				   GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
	  srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
	}
      else
	srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
      if (CONST_INT_P (count))
	{
	  rounded_count
	    = ROUND_DOWN (INTVAL (count),
			  (HOST_WIDE_INT) GET_MODE_SIZE (mode));
	  srcmem = shallow_copy_rtx (srcmem);
	  set_mem_size (srcmem, rounded_count);
	}
      else if (MEM_SIZE_KNOWN_P (srcmem))
	clear_mem_size (srcmem);
      emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
			      destexp, srcexp));
    }
}
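/* Editorial sketch (not part of GCC): the "rep" expansion above has the
   semantics of the loops below; COUNTREG plays the role of n, and the
   destexp/srcexp expressions describe the final pointer values
   (base + n * chunk) for the benefit of alias analysis.  A hypothetical
   C model:  */
#if 0
#include <stddef.h>

static void
rep_stos_model (unsigned char *dst, unsigned char value, size_t n)
{
  while (n--)			/* rep stosb */
    *dst++ = value;
}

static void
rep_mov_model (unsigned char *dst, const unsigned char *src, size_t n)
{
  while (n--)			/* rep movsb */
    *dst++ = *src++;
}
#endif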
/* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
   DESTMEM.
   SRC is passed by pointer to be updated on return.
   Return value is updated DST.  */
static rtx
emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
	     HOST_WIDE_INT size_to_move)
{
  rtx dst = destmem, src = *srcmem, tempreg;
  enum insn_code code;
  machine_mode move_mode;
  int piece_size, i;

  /* Find the widest mode in which we could perform moves.
     Start with the biggest power of 2 less than SIZE_TO_MOVE and halve
     it until a move of such size is supported.  */
  piece_size = 1 << floor_log2 (size_to_move);
  while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
	 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
    {
      gcc_assert (piece_size > 1);
      piece_size >>= 1;
    }

  /* Find the corresponding vector mode with the same size as MOVE_MODE.
     MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.).  */
  if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
    {
      int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
      if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
	  || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
	{
	  move_mode = word_mode;
	  piece_size = GET_MODE_SIZE (move_mode);
	  code = optab_handler (mov_optab, move_mode);
	}
    }
  gcc_assert (code != CODE_FOR_nothing);

  dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
  src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);

  /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZE moves.  */
  gcc_assert (size_to_move % piece_size == 0);

  for (i = 0; i < size_to_move; i += piece_size)
    {
      /* We move from memory to memory, so we'll need to do it via
	 a temporary register.  */
      tempreg = gen_reg_rtx (move_mode);
      emit_insn (GEN_FCN (code) (tempreg, src));
      emit_insn (GEN_FCN (code) (dst, tempreg));

      emit_move_insn (destptr,
		      plus_constant (Pmode, copy_rtx (destptr), piece_size));
      emit_move_insn (srcptr,
		      plus_constant (Pmode, copy_rtx (srcptr), piece_size));

      dst = adjust_automodify_address_nv (dst, move_mode, destptr,
					  piece_size);
      src = adjust_automodify_address_nv (src, move_mode, srcptr,
					  piece_size);
    }

  /* Update DST and SRC rtx.  */
  *srcmem = src;
  return dst;
}
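/* Editorial sketch (not part of GCC): the mode-selection loop above starts
   from the largest power of two not exceeding SIZE_TO_MOVE and halves it
   until a supported move width is found.  Modeled standalone with a
   hypothetical supported-width predicate:  */
#if 0
static int
widest_piece_model (int size_to_move, int (*supported_p) (int))
{
  /* 1 << floor_log2 (size_to_move).  */
  int piece = 1;
  while (piece * 2 <= size_to_move)
    piece *= 2;
  /* Halve until a mov pattern of that width exists.  */
  while (piece > 1 && !supported_p (piece))
    piece /= 2;
  return piece;
}
#endif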
/* Helper function for the string operations below.  Test whether VARIABLE
   is aligned to VALUE bytes.  If true, jump to the label.  */

static rtx_code_label *
ix86_expand_aligntest (rtx variable, int value, bool epilogue)
{
  rtx_code_label *label = gen_label_rtx ();
  rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
  if (GET_MODE (variable) == DImode)
    emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
  else
    emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
  emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
			   1, label);
  if (epilogue)
    predict_jump (REG_BR_PROB_BASE * 50 / 100);
  else
    predict_jump (REG_BR_PROB_BASE * 90 / 100);
  return label;
}
/* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST.  */
static void
expand_cpymem_epilogue (rtx destmem, rtx srcmem,
			rtx destptr, rtx srcptr, rtx count, int max_size)
{
  rtx src, dest;
  if (CONST_INT_P (count))
    {
      HOST_WIDE_INT countval = INTVAL (count);
      HOST_WIDE_INT epilogue_size = countval % max_size;
      int i;

      /* For now MAX_SIZE should be a power of 2.  This assert could be
	 relaxed, but it'll require a bit more complicated epilogue
	 expanding.  */
      gcc_assert ((max_size & (max_size - 1)) == 0);
      for (i = max_size; i >= 1; i >>= 1)
	{
	  if (epilogue_size & i)
	    destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
	}
      return;
    }
  if (max_size > 8)
    {
      count = expand_simple_binop (GET_MODE (count), AND, count,
				   GEN_INT (max_size - 1),
				   count, 1, OPTAB_DIRECT);
      expand_set_or_cpymem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
				     count, QImode, 1, 4, false);
      return;
    }

  /* When there are stringops, we can cheaply increase dest and src pointers.
     Otherwise we save code size by maintaining offset (zero is readily
     available from preceding rep operation) and using x86 addressing modes.
   */
  if (TARGET_SINGLE_STRINGOP)
    {
      if (max_size > 4)
	{
	  rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
	  src = change_address (srcmem, SImode, srcptr);
	  dest = change_address (destmem, SImode, destptr);
	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	}
      if (max_size > 2)
	{
	  rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
	  src = change_address (srcmem, HImode, srcptr);
	  dest = change_address (destmem, HImode, destptr);
	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	}
      if (max_size > 1)
	{
	  rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
	  src = change_address (srcmem, QImode, srcptr);
	  dest = change_address (destmem, QImode, destptr);
	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	}
    }
  else
    {
      rtx offset = force_reg (Pmode, const0_rtx);
      rtx tmp;

      if (max_size > 4)
	{
	  rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
	  src = change_address (srcmem, SImode, srcptr);
	  dest = change_address (destmem, SImode, destptr);
	  emit_move_insn (dest, src);
	  tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
				     true, OPTAB_LIB_WIDEN);
	  if (tmp != offset)
	    emit_move_insn (offset, tmp);
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	}
      if (max_size > 2)
	{
	  rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
	  tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
	  src = change_address (srcmem, HImode, tmp);
	  tmp = gen_rtx_PLUS (Pmode, destptr, offset);
	  dest = change_address (destmem, HImode, tmp);
	  emit_move_insn (dest, src);
	  tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
				     true, OPTAB_LIB_WIDEN);
	  if (tmp != offset)
	    emit_move_insn (offset, tmp);
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	}
      if (max_size > 1)
	{
	  rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
	  tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
	  src = change_address (srcmem, QImode, tmp);
	  tmp = gen_rtx_PLUS (Pmode, destptr, offset);
	  dest = change_address (destmem, QImode, tmp);
	  emit_move_insn (dest, src);
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	}
    }
}
/* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
   with value PROMOTED_VAL.
   Return value is updated DST.  */
static rtx
emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
	     HOST_WIDE_INT size_to_move)
{
  rtx dst = destmem;
  enum insn_code code;
  machine_mode move_mode;
  int piece_size, i;

  /* Find the widest mode in which we could perform moves.
     Start with the biggest power of 2 less than SIZE_TO_MOVE and halve
     it until a move of such size is supported.  */
  move_mode = GET_MODE (promoted_val);
  if (move_mode == VOIDmode)
    move_mode = QImode;
  if (size_to_move < GET_MODE_SIZE (move_mode))
    {
      unsigned int move_bits = size_to_move * BITS_PER_UNIT;
      move_mode = int_mode_for_size (move_bits, 0).require ();
      promoted_val = gen_lowpart (move_mode, promoted_val);
    }
  piece_size = GET_MODE_SIZE (move_mode);
  code = optab_handler (mov_optab, move_mode);
  gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);

  dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);

  /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZE moves.  */
  gcc_assert (size_to_move % piece_size == 0);
  for (i = 0; i < size_to_move; i += piece_size)
    {
      if (piece_size <= GET_MODE_SIZE (word_mode))
	{
	  emit_insn (gen_strset (destptr, dst, promoted_val));
	  dst = adjust_automodify_address_nv (dst, move_mode, destptr,
					      piece_size);
	  continue;
	}

      emit_insn (GEN_FCN (code) (dst, promoted_val));

      emit_move_insn (destptr,
		      plus_constant (Pmode, copy_rtx (destptr), piece_size));

      dst = adjust_automodify_address_nv (dst, move_mode, destptr,
					  piece_size);
    }

  /* Update DST rtx.  */
  return dst;
}
/* Output code to set at most count & (max_size - 1) bytes starting by DEST.  */
static void
expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
				 rtx count, int max_size)
{
  count = expand_simple_binop (counter_mode (count), AND, count,
			       GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
  expand_set_or_cpymem_via_loop (destmem, NULL, destptr, NULL,
				 gen_lowpart (QImode, value), count, QImode,
				 1, max_size / 2, true);
}
/* Output code to set at most count & (max_size - 1) bytes starting by DEST.  */
static void
expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
			rtx count, int max_size)
{
  rtx dest;

  if (CONST_INT_P (count))
    {
      HOST_WIDE_INT countval = INTVAL (count);
      HOST_WIDE_INT epilogue_size = countval % max_size;
      int i;

      /* For now MAX_SIZE should be a power of 2.  This assert could be
	 relaxed, but it'll require a bit more complicated epilogue
	 expanding.  */
      gcc_assert ((max_size & (max_size - 1)) == 0);
      for (i = max_size; i >= 1; i >>= 1)
	{
	  if (epilogue_size & i)
	    {
	      if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
		destmem = emit_memset (destmem, destptr, vec_value, i);
	      else
		destmem = emit_memset (destmem, destptr, value, i);
	    }
	}
      return;
    }
  if (max_size > 32)
    {
      expand_setmem_epilogue_via_loop (destmem, destptr, value, count,
				       max_size);
      return;
    }
  if (max_size > 16)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
      if (TARGET_64BIT)
	{
	  dest = change_address (destmem, DImode, destptr);
	  emit_insn (gen_strset (destptr, dest, value));
	  dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
	  emit_insn (gen_strset (destptr, dest, value));
	}
      else
	{
	  dest = change_address (destmem, SImode, destptr);
	  emit_insn (gen_strset (destptr, dest, value));
	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
	  emit_insn (gen_strset (destptr, dest, value));
	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
	  emit_insn (gen_strset (destptr, dest, value));
	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
	  emit_insn (gen_strset (destptr, dest, value));
	}
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 8)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
      if (TARGET_64BIT)
	{
	  dest = change_address (destmem, DImode, destptr);
	  emit_insn (gen_strset (destptr, dest, value));
	}
      else
	{
	  dest = change_address (destmem, SImode, destptr);
	  emit_insn (gen_strset (destptr, dest, value));
	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
	  emit_insn (gen_strset (destptr, dest, value));
	}
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 4)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
      dest = change_address (destmem, SImode, destptr);
      emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 2)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
      dest = change_address (destmem, HImode, destptr);
      emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 1)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
      dest = change_address (destmem, QImode, destptr);
      emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
}
/* Adjust COUNTER by the VALUE.  */
static void
ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
{
  emit_insn (gen_add2_insn (countreg, GEN_INT (-value)));
}
/* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
   DESTMEM to align it to DESIRED_ALIGNMENT.  Original alignment is ALIGN.
   Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
   ignored.
   Return value is updated DESTMEM.  */
static rtx
expand_set_or_cpymem_prologue (rtx destmem, rtx srcmem,
			       rtx destptr, rtx srcptr, rtx value,
			       rtx vec_value, rtx count, int align,
			       int desired_alignment, bool issetmem)
{
  int i;
  for (i = 1; i < desired_alignment; i <<= 1)
    {
      if (align <= i)
	{
	  rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
	  if (issetmem)
	    {
	      if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
		destmem = emit_memset (destmem, destptr, vec_value, i);
	      else
		destmem = emit_memset (destmem, destptr, value, i);
	    }
	  else
	    destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
	  ix86_adjust_counter (count, i);
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	  set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
	}
    }
  return destmem;
}
/* Test if COUNT&SIZE is nonzero and if so, expand a cpymem
   or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
   and jump to DONE_LABEL.  */
static void
expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem,
			       rtx destptr, rtx srcptr,
			       rtx value, rtx vec_value,
			       rtx count, int size,
			       rtx done_label, bool issetmem)
{
  rtx_code_label *label = ix86_expand_aligntest (count, size, false);
  machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
  rtx modesize;
  int n;

  /* If we do not have a vector value to copy, we must reduce size.  */
  if (issetmem)
    {
      if (!vec_value)
	{
	  if (GET_MODE (value) == VOIDmode && size > 8)
	    mode = Pmode;
	  else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
	    mode = GET_MODE (value);
	}
      else
	mode = GET_MODE (vec_value), value = vec_value;
    }
  else
    {
      /* Choose appropriate vector mode.  */
      if (size >= 32)
	mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
      else if (size >= 16)
	mode = TARGET_SSE ? V16QImode : DImode;
      srcmem = change_address (srcmem, mode, srcptr);
    }
  destmem = change_address (destmem, mode, destptr);
  modesize = GEN_INT (GET_MODE_SIZE (mode));
  gcc_assert (GET_MODE_SIZE (mode) <= size);
  for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
    {
      if (issetmem)
	emit_move_insn (destmem, gen_lowpart (mode, value));
      else
	{
	  emit_move_insn (destmem, srcmem);
	  srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
	}
      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
    }

  destmem = offset_address (destmem, count, 1);
  destmem = offset_address (destmem, GEN_INT (-2 * size),
			    GET_MODE_SIZE (mode));
  if (!issetmem)
    {
      srcmem = offset_address (srcmem, count, 1);
      srcmem = offset_address (srcmem, GEN_INT (-2 * size),
			       GET_MODE_SIZE (mode));
    }
  for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
    {
      if (issetmem)
	emit_move_insn (destmem, gen_lowpart (mode, value));
      else
	{
	  emit_move_insn (destmem, srcmem);
	  srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
	}
      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
    }
  emit_jump_insn (gen_jump (done_label));
  emit_barrier ();

  emit_label (label);
  LABEL_NUSES (label) = 1;
}
/* Handle small memcpy (up to SIZE, which is supposed to be a small power of 2)
   and get ready for the main memcpy loop by copying the initial
   DESIRED_ALIGN-ALIGN bytes and the last SIZE bytes, adjusting
   DESTPTR/SRCPTR/COUNT in a way that lets us proceed with a loop copying SIZE
   bytes at once.  Do moves in MODE.
   DONE_LABEL is a label after the whole copying sequence.  The label is
   created on demand if *DONE_LABEL is NULL.
   MIN_SIZE is minimal size of block copied.  This value gets adjusted for new
   bounds after the initial copies.

   DESTMEM/SRCMEM are memory expressions pointing to the copied block,
   DESTPTR/SRCPTR are pointers to the block.  DYNAMIC_CHECK indicates whether
   we will dispatch to a library call for large blocks.

   In pseudocode we do:

   if (COUNT < SIZE)
     {
       Assume that SIZE is 4.  Bigger sizes are handled analogously.
       if (COUNT & 4)
	 {
	   copy 4 bytes from SRCPTR to DESTPTR
	   copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
	   goto done_label
	 }
       if (!COUNT)
	 goto done_label;
       copy 1 byte from SRCPTR to DESTPTR
       if (COUNT & 2)
	 {
	   copy 2 bytes from SRCPTR to DESTPTR
	   copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
	 }
     }
   else
     {
       copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
       copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT - SIZE

       OLD_DESTPTR = DESTPTR;
       Align DESTPTR up to DESIRED_ALIGN
       SRCPTR += DESTPTR - OLD_DESTPTR
       COUNT -= DESTPTR - OLD_DESTPTR
       if (DYNAMIC_CHECK)
	 Round COUNT down to multiple of SIZE
       << optional caller supplied zero size guard is here >>
       << optional caller supplied dynamic check is here >>
       << caller supplied main copy loop is here >>
     }
   done_label:
  */
static void
expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem,
							    rtx srcmem,
							    rtx *destptr,
							    rtx *srcptr,
							    machine_mode mode,
							    rtx value,
							    rtx vec_value,
							    rtx *count,
							    rtx_code_label
							      **done_label,
							    int size,
							    int desired_align,
							    int align,
							    unsigned
							      HOST_WIDE_INT
							      *min_size,
							    bool dynamic_check,
							    bool issetmem)
{
  rtx_code_label *loop_label = NULL, *label;
  int n;
  rtx modesize;
  int prolog_size = 0;
  rtx mode_value;

  /* Choose the proper value to copy.  */
  if (issetmem && VECTOR_MODE_P (mode))
    mode_value = vec_value;
  else
    mode_value = value;
  gcc_assert (GET_MODE_SIZE (mode) <= size);

  /* See if block is big or small, handle small blocks.  */
  if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
    {
      int size2 = size;
      loop_label = gen_label_rtx ();

      if (!*done_label)
	*done_label = gen_label_rtx ();

      emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0,
			       GET_MODE (*count), 1, loop_label);
      size2 >>= 1;

      /* Handle sizes > 3.  */
      for (;size2 > 2; size2 >>= 1)
	expand_small_cpymem_or_setmem (destmem, srcmem,
				       *destptr, *srcptr,
				       value, vec_value,
				       *count,
				       size2, *done_label, issetmem);
      /* Nothing to copy?  Jump to DONE_LABEL if so.  */
      emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
			       1, *done_label);

      /* Do a byte copy.  */
      destmem = change_address (destmem, QImode, *destptr);
      if (issetmem)
	emit_move_insn (destmem, gen_lowpart (QImode, value));
      else
	{
	  srcmem = change_address (srcmem, QImode, *srcptr);
	  emit_move_insn (destmem, srcmem);
	}

      /* Handle sizes 2 and 3.  */
      label = ix86_expand_aligntest (*count, 2, false);
      destmem = change_address (destmem, HImode, *destptr);
      destmem = offset_address (destmem, *count, 1);
      destmem = offset_address (destmem, GEN_INT (-2), 2);
      if (issetmem)
	emit_move_insn (destmem, gen_lowpart (HImode, value));
      else
	{
	  srcmem = change_address (srcmem, HImode, *srcptr);
	  srcmem = offset_address (srcmem, *count, 1);
	  srcmem = offset_address (srcmem, GEN_INT (-2), 2);
	  emit_move_insn (destmem, srcmem);
	}

      emit_label (label);
      LABEL_NUSES (label) = 1;
      emit_jump_insn (gen_jump (*done_label));
      emit_barrier ();
    }
  else
    gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
		|| UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);

  /* Start memcpy for COUNT >= SIZE.  */
  if (loop_label)
    {
      emit_label (loop_label);
      LABEL_NUSES (loop_label) = 1;
    }

  /* Copy first desired_align bytes.  */
  if (!issetmem)
    srcmem = change_address (srcmem, mode, *srcptr);
  destmem = change_address (destmem, mode, *destptr);
  modesize = GEN_INT (GET_MODE_SIZE (mode));
  for (n = 0; prolog_size < desired_align - align; n++)
    {
      if (issetmem)
	emit_move_insn (destmem, mode_value);
      else
	{
	  emit_move_insn (destmem, srcmem);
	  srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
	}
      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
      prolog_size += GET_MODE_SIZE (mode);
    }

  /* Copy last SIZE bytes.  */
  destmem = offset_address (destmem, *count, 1);
  destmem = offset_address (destmem,
			    GEN_INT (-size - prolog_size),
			    1);
  if (issetmem)
    emit_move_insn (destmem, mode_value);
  else
    {
      srcmem = offset_address (srcmem, *count, 1);
      srcmem = offset_address (srcmem,
			       GEN_INT (-size - prolog_size),
			       1);
      emit_move_insn (destmem, srcmem);
    }
  for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
    {
      destmem = offset_address (destmem, modesize, 1);
      if (issetmem)
	emit_move_insn (destmem, mode_value);
      else
	{
	  srcmem = offset_address (srcmem, modesize, 1);
	  emit_move_insn (destmem, srcmem);
	}
    }

  /* Align destination.  */
  if (desired_align > 1 && desired_align > align)
    {
      rtx saveddest = *destptr;

      gcc_assert (desired_align <= size);
      /* Align destptr up, place it to new register.  */
      *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
				      GEN_INT (prolog_size),
				      NULL_RTX, 1, OPTAB_DIRECT);
      if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
	REG_POINTER (*destptr) = 1;
      *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
				      GEN_INT (-desired_align),
				      *destptr, 1, OPTAB_DIRECT);
      /* See how many bytes we skipped.  */
      saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
				       *destptr,
				       saveddest, 1, OPTAB_DIRECT);
      /* Adjust srcptr and count.  */
      if (!issetmem)
	*srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
				       saveddest, *srcptr, 1, OPTAB_DIRECT);
      *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
				    saveddest, *count, 1, OPTAB_DIRECT);
      /* We copied at most size + prolog_size.  */
      if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
	*min_size
	  = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
      else
	*min_size = 0;

      /* Our loops always round down the block size, but for dispatch to
	 library we need the precise value.  */
      if (dynamic_check)
	*count = expand_simple_binop (GET_MODE (*count), AND, *count,
				      GEN_INT (-size), *count, 1,
				      OPTAB_DIRECT);
    }
  else
    {
      gcc_assert (prolog_size == 0);
      /* Decrease count, so we won't end up copying last word twice.  */
      if (!CONST_INT_P (*count))
	*count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
				      constm1_rtx, *count, 1, OPTAB_DIRECT);
      else
	*count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
				      (unsigned HOST_WIDE_INT)size));
      if (*min_size)
	*min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
    }
}
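/* Editorial sketch (not part of GCC): the key idea of the misaligned
   prologue/epilogue above is that for SIZE <= n <= 2*SIZE-1 bytes, one
   SIZE-byte move from the start plus one SIZE-byte move ending exactly at
   byte n cover the whole block, overlapping in the middle.  A hypothetical
   memcpy model for a fixed SIZE of 4:  */
#if 0
#include <string.h>

static void
small_copy_model (unsigned char *dst, const unsigned char *src, size_t n)
{
  /* Valid for 4 <= n <= 7.  */
  memcpy (dst, src, 4);			/* head  */
  memcpy (dst + n - 4, src + n - 4, 4);	/* tail, may overlap the head */
}
#endif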
/* This function is like the previous one, except here we know how many bytes
   need to be copied.  That allows us to update alignment not only of DST,
   which is returned, but also of SRC, which is passed as a pointer for that
   reason.  */
static rtx
expand_set_or_cpymem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
					rtx srcreg, rtx value, rtx vec_value,
					int desired_align, int align_bytes,
					bool issetmem)
{
  rtx src = NULL;
  rtx orig_dst = dst;
  rtx orig_src = NULL;
  int piece_size = 1;
  int copied_bytes = 0;

  if (!issetmem)
    {
      gcc_assert (srcp != NULL);
      src = *srcp;
      orig_src = src;
    }

  for (piece_size = 1;
       piece_size <= desired_align && copied_bytes < align_bytes;
       piece_size <<= 1)
    {
      if (align_bytes & piece_size)
	{
	  if (issetmem)
	    {
	      if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
		dst = emit_memset (dst, destreg, vec_value, piece_size);
	      else
		dst = emit_memset (dst, destreg, value, piece_size);
	    }
	  else
	    dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
	  copied_bytes += piece_size;
	}
    }
  if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
    set_mem_align (dst, desired_align * BITS_PER_UNIT);
  if (MEM_SIZE_KNOWN_P (orig_dst))
    set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);

  if (!issetmem)
    {
      int src_align_bytes = get_mem_align_offset (src, desired_align
						       * BITS_PER_UNIT);
      if (src_align_bytes >= 0)
	src_align_bytes = desired_align - src_align_bytes;
      if (src_align_bytes >= 0)
	{
	  unsigned int src_align;
	  for (src_align = desired_align; src_align >= 2; src_align >>= 1)
	    {
	      if ((src_align_bytes & (src_align - 1))
		  == (align_bytes & (src_align - 1)))
		break;
	    }
	  if (src_align > (unsigned int) desired_align)
	    src_align = desired_align;
	  if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
	    set_mem_align (src, src_align * BITS_PER_UNIT);
	}
      if (MEM_SIZE_KNOWN_P (orig_src))
	set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
      *srcp = src;
    }

  return dst;
}
/* Return true if ALG can be used in current context.
   Assume we expand memset if MEMSET is true.  */
static bool
alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
{
  if (alg == no_stringop)
    return false;
  if (alg == vector_loop)
    return TARGET_SSE || TARGET_AVX;
  /* Algorithms using the rep prefix want at least edi and ecx;
     additionally, memset wants eax and memcpy wants esi.  Don't
     consider such algorithms if the user has appropriated those
     registers for their own purposes, or if we have a non-default
     address space, since some string insns cannot override the segment.  */
  if (alg == rep_prefix_1_byte
      || alg == rep_prefix_4_byte
      || alg == rep_prefix_8_byte)
    {
      if (have_as)
	return false;
      if (fixed_regs[CX_REG]
	  || fixed_regs[DI_REG]
	  || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
	return false;
    }
  return true;
}
/* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation.  */
static enum stringop_alg
decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
	    unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
	    bool memset, bool zero_memset, bool have_as,
	    int *dynamic_check, bool *noalign, bool recur)
{
  const struct stringop_algs *algs;
  bool optimize_for_speed;
  int max = 0;
  const struct processor_costs *cost;
  int i;
  bool any_alg_usable_p = false;

  *noalign = false;
  *dynamic_check = -1;

  /* Even if the string operation call is cold, we still might spend a lot
     of time processing large blocks.  */
  if (optimize_function_for_size_p (cfun)
      || (optimize_insn_for_size_p ()
	  && (max_size < 256
	      || (expected_size != -1 && expected_size < 256))))
    optimize_for_speed = false;
  else
    optimize_for_speed = true;

  cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
  if (memset)
    algs = &cost->memset[TARGET_64BIT != 0];
  else
    algs = &cost->memcpy[TARGET_64BIT != 0];

  /* See maximal size for user defined algorithm.  */
  for (i = 0; i < MAX_STRINGOP_ALGS; i++)
    {
      enum stringop_alg candidate = algs->size[i].alg;
      bool usable = alg_usable_p (candidate, memset, have_as);
      any_alg_usable_p |= usable;

      if (candidate != libcall && candidate && usable)
	max = algs->size[i].max;
    }

  /* If expected size is not known but max size is small enough
     so inline version is a win, set expected size into
     the range.  */
  if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
      && expected_size == -1)
    expected_size = min_size / 2 + max_size / 2;

  /* If user specified the algorithm, honor it if possible.  */
  if (ix86_stringop_alg != no_stringop
      && alg_usable_p (ix86_stringop_alg, memset, have_as))
    return ix86_stringop_alg;
  /* rep; movq or rep; movl is the smallest variant.  */
  else if (!optimize_for_speed)
    {
      *noalign = true;
      if (!count || (count & 3) || (memset && !zero_memset))
	return alg_usable_p (rep_prefix_1_byte, memset, have_as)
	       ? rep_prefix_1_byte : loop_1_byte;
      else
	return alg_usable_p (rep_prefix_4_byte, memset, have_as)
	       ? rep_prefix_4_byte : loop;
    }
  /* Very tiny blocks are best handled via the loop; REP is expensive to
     set up.  */
  else if (expected_size != -1 && expected_size < 4)
    return loop_1_byte;
  else if (expected_size != -1)
    {
      enum stringop_alg alg = libcall;
      bool alg_noalign = false;
      for (i = 0; i < MAX_STRINGOP_ALGS; i++)
	{
	  /* We get here if the algorithms that were not libcall-based
	     were rep-prefix based and we are unable to use rep prefixes
	     based on global register usage.  Break out of the loop and
	     use the heuristic below.  */
	  if (algs->size[i].max == 0)
	    break;
	  if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
	    {
	      enum stringop_alg candidate = algs->size[i].alg;

	      if (candidate != libcall
		  && alg_usable_p (candidate, memset, have_as))
		{
		  alg = candidate;
		  alg_noalign = algs->size[i].noalign;
		}
	      /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
		 last non-libcall inline algorithm.  */
	      if (TARGET_INLINE_ALL_STRINGOPS)
		{
		  /* When the current size is best to be copied by a libcall,
		     but we are still forced to inline, run the heuristic below
		     that will pick code for medium sized blocks.  */
		  if (alg != libcall)
		    {
		      *noalign = alg_noalign;
		      return alg;
		    }
		  else if (!any_alg_usable_p)
		    break;
		}
	      else if (alg_usable_p (candidate, memset, have_as)
		       && !(TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
			    && candidate == rep_prefix_1_byte
			    /* NB: If min_size != max_size, size is
			       unknown.  */
			    && min_size != max_size))
		{
		  *noalign = algs->size[i].noalign;
		  return candidate;
		}
	    }
	}
    }
  /* When asked to inline the call anyway, try to pick a meaningful choice.
     We look for maximal size of block that is faster to copy by hand and
     take blocks of at most of that size, guessing that average size will
     be roughly half of the block.

     If this turns out to be bad, we might simply specify the preferred
     choice in ix86_costs.  */
  if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
      && (algs->unknown_size == libcall
	  || !alg_usable_p (algs->unknown_size, memset, have_as)))
    {
      enum stringop_alg alg;
      HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;

      /* If there aren't any usable algorithms or if recursing already,
	 then recursing on smaller sizes or same size isn't going to
	 find anything.  Just return the simple byte-at-a-time copy loop.  */
      if (!any_alg_usable_p || recur)
	{
	  /* Pick something reasonable.  */
	  if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
	    *dynamic_check = 128;
	  return loop_1_byte;
	}
      alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
			zero_memset, have_as, dynamic_check, noalign, true);
      gcc_assert (*dynamic_check == -1);
      if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
	*dynamic_check = max;
      else
	gcc_assert (alg != libcall);
      return alg;
    }
  return (alg_usable_p (algs->unknown_size, memset, have_as)
	  ? algs->unknown_size : libcall);
}
/* Decide on alignment.  We know that the operand is already aligned to ALIGN
   (ALIGN can be based on profile feedback and thus it is not 100%
   guaranteed).  */
static int
decide_alignment (int align,
		  enum stringop_alg alg,
		  int expected_size,
		  machine_mode move_mode)
{
  int desired_align = 0;

  gcc_assert (alg != no_stringop);

  if (move_mode == VOIDmode)
    return align;

  desired_align = GET_MODE_SIZE (move_mode);
  /* PentiumPro has special logic triggering for 8 byte aligned blocks,
     copying whole cache lines at once.  */
  if (TARGET_CPU_P (PENTIUMPRO)
      && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
    desired_align = 8;

  if (optimize_size)
    desired_align = 1;
  if (desired_align < align)
    desired_align = align;
  if (expected_size != -1 && expected_size < 4)
    desired_align = align;

  return desired_align;
}
/* Helper function for memcpy.  For QImode value 0xXY produce
   0xXYXYXYXY of the width specified by MODE.  This is essentially
   a * 0x10101010, but we can do slightly better than
   synth_mult by unwinding the sequence by hand on CPUs with
   slow multiply.  */
static rtx
promote_duplicated_reg (machine_mode mode, rtx val)
{
  machine_mode valmode = GET_MODE (val);
  rtx tmp;
  int nops = mode == DImode ? 3 : 2;

  gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
  if (val == const0_rtx)
    return copy_to_mode_reg (mode, CONST0_RTX (mode));
  if (CONST_INT_P (val))
    {
      HOST_WIDE_INT v = INTVAL (val) & 255;

      v |= v << 8;
      v |= v << 16;
      if (mode == DImode)
	v |= (v << 16) << 16;
      return copy_to_mode_reg (mode, gen_int_mode (v, mode));
    }

  if (valmode == VOIDmode)
    valmode = QImode;
  if (valmode != QImode)
    val = gen_lowpart (QImode, val);
  if (mode == QImode)
    return val;

  if (!TARGET_PARTIAL_REG_STALL)
    nops--;
  if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
      + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
      <= (ix86_cost->shift_const + ix86_cost->add) * nops
	  + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
    {
      rtx reg = convert_modes (mode, QImode, val, true);
      tmp = promote_duplicated_reg (mode, const1_rtx);
      return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
				  OPTAB_DIRECT);
    }
  else
    {
      rtx reg = convert_modes (mode, QImode, val, true);

      if (!TARGET_PARTIAL_REG_STALL)
	emit_insn (gen_insv_1 (mode, reg, reg));
      else
	{
	  tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
				     NULL, 1, OPTAB_DIRECT);
	  reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1,
				     OPTAB_DIRECT);
	}
      tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
				 NULL, 1, OPTAB_DIRECT);
      reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
      if (mode == SImode)
	return reg;
      tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
				 NULL, 1, OPTAB_DIRECT);
      reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
      return reg;
    }
}
/* Duplicate value VAL using promote_duplicated_reg into the maximal size that
   will be needed by the main loop copying SIZE_NEEDED chunks and the prologue
   getting alignment from ALIGN to DESIRED_ALIGN.  */
static rtx
promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
				int align)
{
  rtx promoted_val;

  if (TARGET_64BIT
      && (size_needed > 4 || (desired_align > align && desired_align > 4)))
    promoted_val = promote_duplicated_reg (DImode, val);
  else if (size_needed > 2 || (desired_align > align && desired_align > 2))
    promoted_val = promote_duplicated_reg (SImode, val);
  else if (size_needed > 1 || (desired_align > align && desired_align > 1))
    promoted_val = promote_duplicated_reg (HImode, val);
  else
    promoted_val = val;

  return promoted_val;
}
/* Copy the address to a Pmode register.  This is used for x32 to
   truncate a DImode TLS address to a SImode register.  */

static rtx
ix86_copy_addr_to_reg (rtx addr)
{
  rtx reg;
  if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
    {
      reg = copy_addr_to_reg (addr);
      REG_POINTER (reg) = 1;
      return reg;
    }
  else
    {
      gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
      reg = copy_to_mode_reg (DImode, addr);
      REG_POINTER (reg) = 1;
      return gen_rtx_SUBREG (SImode, reg, 0);
    }
}
/* Expand string move (memcpy) or store (memset) operation.  Use i386 string
   operations when profitable.  The code depends upon architecture, block size
   and alignment, but always has one of the following overall structures:

   Aligned move sequence:

   1) Prologue guard: Conditional that jumps up to epilogues for small
      blocks that can be handled by epilogue alone.  This is faster
      but also needed for correctness, since the prologue assumes the block
      is larger than the desired alignment.

      Optional dynamic check for size and libcall for large
      blocks is emitted here too, with -minline-stringops-dynamically.

   2) Prologue: copy first few bytes in order to get destination
      aligned to DESIRED_ALIGN.  It is emitted only when ALIGN is less
      than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
      copied.  We emit either a jump tree on power of two sized
      blocks, or a byte loop.

   3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
      with specified algorithm.

   4) Epilogue: code copying tail of the block that is too small to be
      handled by main body (or up to size guarded by prologue guard).

   Misaligned move sequence:

   1) misaligned move prologue/epilogue containing:
      a) Prologue handling small memory blocks and jumping to done_label
	 (skipped if blocks are known to be large enough)
      b) Single move copying first DESIRED_ALIGN-ALIGN bytes if alignment is
	 needed by single possibly misaligned move
	 (skipped if alignment is not needed)
      c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves

   2) Zero size guard dispatching to done_label, if needed

   3) dispatch to library call, if needed,

   4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
      with specified algorithm.  */
bool
ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
			   rtx align_exp, rtx expected_align_exp,
			   rtx expected_size_exp, rtx min_size_exp,
			   rtx max_size_exp, rtx probable_max_size_exp,
			   bool issetmem)
{
  rtx destreg;
  rtx srcreg = NULL;
  rtx_code_label *label = NULL;
  rtx tmp;
  rtx_code_label *jump_around_label = NULL;
  HOST_WIDE_INT align = 1;
  unsigned HOST_WIDE_INT count = 0;
  HOST_WIDE_INT expected_size = -1;
  int size_needed = 0, epilogue_size_needed;
  int desired_align = 0, align_bytes = 0;
  enum stringop_alg alg;
  rtx promoted_val = NULL;
  rtx vec_promoted_val = NULL;
  bool force_loopy_epilogue = false;
  int dynamic_check;
  bool need_zero_guard = false;
  bool noalign;
  machine_mode move_mode = VOIDmode;
  machine_mode wider_mode;
  int unroll_factor = 1;
  /* TODO: Once value ranges are available, fill in proper data.  */
  unsigned HOST_WIDE_INT min_size = 0;
  unsigned HOST_WIDE_INT max_size = -1;
  unsigned HOST_WIDE_INT probable_max_size = -1;
  bool misaligned_prologue_used = false;
  bool have_as;

  if (CONST_INT_P (align_exp))
    align = INTVAL (align_exp);
  /* i386 can do misaligned access on reasonably increased cost.  */
  if (CONST_INT_P (expected_align_exp)
      && INTVAL (expected_align_exp) > align)
    align = INTVAL (expected_align_exp);
  /* ALIGN is the minimum of destination and source alignment, but we care
     here just about destination alignment.  */
  else if (!issetmem
	   && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
    align = MEM_ALIGN (dst) / BITS_PER_UNIT;

  if (CONST_INT_P (count_exp))
    {
      min_size = max_size = probable_max_size = count = expected_size
	= INTVAL (count_exp);
      /* When COUNT is 0, there is nothing to do.  */
      if (!count)
	return true;
    }
  else
    {
      if (min_size_exp)
	min_size = INTVAL (min_size_exp);
      if (max_size_exp)
	max_size = INTVAL (max_size_exp);
      if (probable_max_size_exp)
	probable_max_size = INTVAL (probable_max_size_exp);
      if (CONST_INT_P (expected_size_exp))
	expected_size = INTVAL (expected_size_exp);
    }

  /* Make sure we don't need to care about overflow later on.  */
  if (count > (HOST_WIDE_INT_1U << 30))
    return false;

  have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
  if (!issetmem)
    have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));

  /* Step 0: Decide on preferred algorithm, desired alignment and
     size of chunks to be copied by main loop.  */
  alg = decide_alg (count, expected_size, min_size, probable_max_size,
		    issetmem,
		    issetmem && val_exp == const0_rtx, have_as,
		    &dynamic_check, &noalign, false);

  if (dump_file)
    fprintf (dump_file, "Selected stringop expansion strategy: %s\n",
	     stringop_alg_names[alg]);

  if (alg == libcall)
    return false;
  gcc_assert (alg != no_stringop);

  /* For now the vector version of memset is generated only for memory
     zeroing, as creating the promoted vector value is very cheap in this
     case.  */
  if (issetmem && alg == vector_loop && val_exp != const0_rtx)
    alg = unrolled_loop;

  if (!count)
    count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
  destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
  if (!issetmem)
    srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));

  unroll_factor = 1;
  move_mode = word_mode;
  switch (alg)
    {
    case libcall:
    case no_stringop:
    case last_alg:
      gcc_unreachable ();
    case loop_1_byte:
      need_zero_guard = true;
      move_mode = QImode;
      break;
    case loop:
      need_zero_guard = true;
      break;
    case unrolled_loop:
      need_zero_guard = true;
      unroll_factor = (TARGET_64BIT ? 4 : 2);
      break;
    case vector_loop:
      need_zero_guard = true;
      unroll_factor = 4;
      /* Find the widest supported mode.  */
      move_mode = word_mode;
      while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode)
	     && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
	move_mode = wider_mode;

      if (TARGET_AVX256_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 128)
	move_mode = TImode;

      /* Find the corresponding vector mode with the same size as MOVE_MODE.
	 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.).  */
      if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
	{
	  int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
	  if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
	      || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
	    move_mode = word_mode;
	}
      gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
      break;
    case rep_prefix_8_byte:
      move_mode = DImode;
      break;
    case rep_prefix_4_byte:
      move_mode = SImode;
      break;
    case rep_prefix_1_byte:
      move_mode = QImode;
      break;
    }
  size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
  epilogue_size_needed = size_needed;

  /* If we are going to call any library calls conditionally, make sure any
     pending stack adjustment happens before the first conditional branch,
     otherwise they will be emitted before the library call only and won't
     happen from the other branches.  */
  if (dynamic_check != -1)
    do_pending_stack_adjust ();

  desired_align = decide_alignment (align, alg, expected_size, move_mode);
  if (!TARGET_ALIGN_STRINGOPS || noalign)
    align = desired_align;

  /* Step 1: Prologue guard.  */

  /* Alignment code needs count to be in register.  */
  if (CONST_INT_P (count_exp) && desired_align > align)
    {
      if (INTVAL (count_exp) > desired_align
	  && INTVAL (count_exp) > size_needed)
	{
	  align_bytes
	    = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
	  if (align_bytes <= 0)
	    align_bytes = 0;
	  else
	    align_bytes = desired_align - align_bytes;
	}
      if (align_bytes == 0)
	count_exp = force_reg (counter_mode (count_exp), count_exp);
    }
  gcc_assert (desired_align >= 1 && align >= 1);

  /* Misaligned move sequences handle both prologue and epilogue at once.
     Default code generation results in a smaller code for large alignments
     and also avoids redundant job when sizes are known precisely.  */
  misaligned_prologue_used
    = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
       && MAX (desired_align, epilogue_size_needed) <= 32
       && desired_align <= epilogue_size_needed
       && ((desired_align > align && !align_bytes)
	   || (!count && epilogue_size_needed > 1)));

  /* Do the cheap promotion to allow better CSE across the
     main loop and epilogue (i.e. one load of the big constant in
     front of all code).
     For now the misaligned move sequences do not have a fast path
     without broadcasting.  */
  if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
    {
      if (alg == vector_loop)
	{
	  gcc_assert (val_exp == const0_rtx);
	  vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
	  promoted_val = promote_duplicated_reg_to_size (val_exp,
							 GET_MODE_SIZE (word_mode),
							 desired_align, align);
	}
      else
	promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
						       desired_align, align);
    }
  /* Misaligned move sequences handle both prologues and epilogues at once.
     Default code generation results in smaller code for large alignments and
     also avoids redundant job when sizes are known precisely.  */
  if (misaligned_prologue_used)
    {
      /* Misaligned move prologue handles small blocks by itself.  */
      expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves
	   (dst, src, &destreg, &srcreg,
	    move_mode, promoted_val, vec_promoted_val,
	    &count_exp,
	    &jump_around_label,
	    desired_align < align
	    ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
	    desired_align, align, &min_size, dynamic_check, issetmem);
      if (!issetmem)
	src = change_address (src, BLKmode, srcreg);
      dst = change_address (dst, BLKmode, destreg);
      set_mem_align (dst, desired_align * BITS_PER_UNIT);
      epilogue_size_needed = 0;
      if (need_zero_guard
	  && min_size < (unsigned HOST_WIDE_INT) size_needed)
	{
	  /* It is possible that we copied enough so the main loop will not
	     execute.  */
	  gcc_assert (size_needed > 1);
	  if (jump_around_label == NULL_RTX)
	    jump_around_label = gen_label_rtx ();
	  emit_cmp_and_jump_insns (count_exp,
				   GEN_INT (size_needed),
				   LTU, 0, counter_mode (count_exp), 1,
				   jump_around_label);
	  if (expected_size == -1
	      || expected_size < (desired_align - align) / 2 + size_needed)
	    predict_jump (REG_BR_PROB_BASE * 20 / 100);
	  else
	    predict_jump (REG_BR_PROB_BASE * 60 / 100);
	}
    }
  /* Ensure that alignment prologue won't copy past end of block.  */
  else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
    {
      epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
      /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
	 Make sure it is power of 2.  */
      epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);

      /* To improve performance of small blocks, we jump around the VAL
	 promoting mode.  This means that if the promoted VAL is not constant,
	 we might not use it in the epilogue and have to use the byte
	 loop variant.  */
      if (issetmem && epilogue_size_needed > 2 && !promoted_val)
	force_loopy_epilogue = true;
      if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
	  || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
	{
	  /* If main algorithm works on QImode, no epilogue is needed.
	     For small sizes just don't align anything.  */
	  if (size_needed == 1)
	    desired_align = align;
	  else
	    goto epilogue;
	}
      else if (!count
	       && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
	{
	  label = gen_label_rtx ();
	  emit_cmp_and_jump_insns (count_exp,
				   GEN_INT (epilogue_size_needed),
				   LTU, 0, counter_mode (count_exp), 1, label);
	  if (expected_size == -1 || expected_size < epilogue_size_needed)
	    predict_jump (REG_BR_PROB_BASE * 60 / 100);
	  else
	    predict_jump (REG_BR_PROB_BASE * 20 / 100);
	}
    }

  /* Emit code to decide on runtime whether library call or inline should be
     used.  */
  if (dynamic_check != -1)
    {
      if (!issetmem && CONST_INT_P (count_exp))
	{
	  if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
	    {
	      emit_block_copy_via_libcall (dst, src, count_exp);
	      count_exp = const0_rtx;
	      goto epilogue;
	    }
	}
      else
	{
	  rtx_code_label *hot_label = gen_label_rtx ();
	  if (jump_around_label == NULL_RTX)
	    jump_around_label = gen_label_rtx ();
	  emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
				   LEU, 0, counter_mode (count_exp),
				   1, hot_label);
	  predict_jump (REG_BR_PROB_BASE * 90 / 100);
	  if (issetmem)
	    set_storage_via_libcall (dst, count_exp, val_exp);
	  else
	    emit_block_copy_via_libcall (dst, src, count_exp);
	  emit_jump (jump_around_label);
	  emit_label (hot_label);
	}
    }

  /* Step 2: Alignment prologue.  */
  /* Do the expensive promotion once we branched off the small blocks.  */
  if (issetmem && !promoted_val)
    promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
						   desired_align, align);

  if (desired_align > align && !misaligned_prologue_used)
    {
      if (align_bytes == 0)
	{
	  /* Except for the first move in prologue, we no longer know
	     the constant offset in aliasing info.  It doesn't seem worth
	     the pain to maintain it for the first move, so throw away
	     the info early.  */
	  dst = change_address (dst, BLKmode, destreg);
	  if (!issetmem)
	    src = change_address (src, BLKmode, srcreg);
	  dst = expand_set_or_cpymem_prologue (dst, src, destreg, srcreg,
					       promoted_val, vec_promoted_val,
					       count_exp, align, desired_align,
					       issetmem);
	  /* At most desired_align - align bytes are copied.  */
	  if (min_size < (unsigned)(desired_align - align))
	    min_size = 0;
	  else
	    min_size -= desired_align - align;
	}
      else
	{
	  /* If we know how many bytes need to be stored before dst is
	     sufficiently aligned, maintain aliasing info accurately.  */
	  dst = expand_set_or_cpymem_constant_prologue (dst, &src, destreg,
							srcreg,
							promoted_val,
							vec_promoted_val,
							desired_align,
							align_bytes,
							issetmem);

	  count_exp = plus_constant (counter_mode (count_exp),
				     count_exp, -align_bytes);
	  count -= align_bytes;
	  min_size -= align_bytes;
	  max_size -= align_bytes;
	}
      if (need_zero_guard
	  && min_size < (unsigned HOST_WIDE_INT) size_needed
	  && (count < (unsigned HOST_WIDE_INT) size_needed
	      || (align_bytes == 0
		  && count < ((unsigned HOST_WIDE_INT) size_needed
			      + desired_align - align))))
	{
	  /* It is possible that we copied enough so the main loop will not
	     execute.  */
	  gcc_assert (size_needed > 1);
	  if (label == NULL_RTX)
	    label = gen_label_rtx ();
	  emit_cmp_and_jump_insns (count_exp,
				   GEN_INT (size_needed),
				   LTU, 0, counter_mode (count_exp), 1, label);
	  if (expected_size == -1
	      || expected_size < (desired_align - align) / 2 + size_needed)
	    predict_jump (REG_BR_PROB_BASE * 20 / 100);
	  else
	    predict_jump (REG_BR_PROB_BASE * 60 / 100);
	}
    }
  if (label && size_needed == 1)
    {
      emit_label (label);
      LABEL_NUSES (label) = 1;
      label = NULL;
      epilogue_size_needed = 1;
      if (issetmem)
	promoted_val = val_exp;
    }
  else if (label == NULL_RTX && !misaligned_prologue_used)
    epilogue_size_needed = size_needed;

  /* Step 3: Main loop.  */

  switch (alg)
    {
    case libcall:
    case no_stringop:
    case last_alg:
      gcc_unreachable ();
    case loop_1_byte:
    case loop:
    case unrolled_loop:
      expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg, promoted_val,
				     count_exp, move_mode, unroll_factor,
				     expected_size, issetmem);
      break;
    case vector_loop:
      expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg,
				     vec_promoted_val, count_exp, move_mode,
				     unroll_factor, expected_size, issetmem);
      break;
    case rep_prefix_8_byte:
    case rep_prefix_4_byte:
    case rep_prefix_1_byte:
      expand_set_or_cpymem_via_rep (dst, src, destreg, srcreg, promoted_val,
				    val_exp, count_exp, move_mode, issetmem);
      break;
    }
  /* Adjust properly the offset of src and dest memory for aliasing.  */
  if (CONST_INT_P (count_exp))
    {
      if (!issetmem)
	src = adjust_automodify_address_nv (src, BLKmode, srcreg,
					    (count / size_needed) * size_needed);
      dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
					  (count / size_needed) * size_needed);
    }
  else
    {
      if (!issetmem)
	src = change_address (src, BLKmode, srcreg);
      dst = change_address (dst, BLKmode, destreg);
    }

  /* Step 4: Epilogue to copy the remaining bytes.  */
 epilogue:
  if (label)
    {
      /* When the main loop is done, COUNT_EXP might hold the original count,
	 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
	 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
	 bytes.  Compensate if needed.  */

      if (size_needed < epilogue_size_needed)
	{
	  tmp = expand_simple_binop (counter_mode (count_exp), AND, count_exp,
				     GEN_INT (size_needed - 1), count_exp, 1,
				     OPTAB_DIRECT);
	  if (tmp != count_exp)
	    emit_move_insn (count_exp, tmp);
	}
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }

  if (count_exp != const0_rtx && epilogue_size_needed > 1)
    {
      if (force_loopy_epilogue)
	expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
					 epilogue_size_needed);
      else
	{
	  if (issetmem)
	    expand_setmem_epilogue (dst, destreg, promoted_val,
				    vec_promoted_val, count_exp,
				    epilogue_size_needed);
	  else
	    expand_cpymem_epilogue (dst, src, destreg, srcreg, count_exp,
				    epilogue_size_needed);
	}
    }
  if (jump_around_label)
    emit_label (jump_around_label);
  return true;
}
/* Expand cmpstrn or memcmp.  */
bool
ix86_expand_cmpstrn_or_cmpmem (rtx result, rtx src1, rtx src2,
			       rtx length, rtx align, bool is_cmpstrn)
{
  /* Expand strncmp and memcmp only with -minline-all-stringops since
     "repz cmpsb" can be much slower than strncmp and memcmp functions
     implemented with vector instructions, see

     https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43052
   */
  if (!TARGET_INLINE_ALL_STRINGOPS)
    return false;

  /* Can't use this if the user has appropriated ecx, esi or edi.  */
  if (fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG])
    return false;

  if (is_cmpstrn)
    {
      /* For strncmp, length is the maximum length, which can be larger
	 than actual string lengths.  We can expand the cmpstrn pattern
	 to "repz cmpsb" only if one of the strings is a constant so
	 that expand_builtin_strncmp() can write the length argument to
	 be the minimum of the const string length and the actual length
	 argument.  Otherwise, "repz cmpsb" may pass the 0 byte.  */
      tree t1 = MEM_EXPR (src1);
      tree t2 = MEM_EXPR (src2);
      if (!((t1 && TREE_CODE (t1) == MEM_REF
	     && TREE_CODE (TREE_OPERAND (t1, 0)) == ADDR_EXPR
	     && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t1, 0), 0))
		 == STRING_CST))
	    || (t2 && TREE_CODE (t2) == MEM_REF
		&& TREE_CODE (TREE_OPERAND (t2, 0)) == ADDR_EXPR
		&& (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t2, 0), 0))
		    == STRING_CST))))
	return false;
    }

  rtx addr1 = copy_addr_to_reg (XEXP (src1, 0));
  rtx addr2 = copy_addr_to_reg (XEXP (src2, 0));
  if (addr1 != XEXP (src1, 0))
    src1 = replace_equiv_address_nv (src1, addr1);
  if (addr2 != XEXP (src2, 0))
    src2 = replace_equiv_address_nv (src2, addr2);

  /* NB: Make a copy of the data length to avoid changing the original
     data length by cmpstrnqi patterns.  */
  length = ix86_zero_extend_to_Pmode (length);
  rtx lengthreg = gen_reg_rtx (Pmode);
  emit_move_insn (lengthreg, length);

  /* If we are testing strict equality, we can use known alignment to
     good advantage.  This may be possible with combine, particularly
     once cc0 is dead.  */
  if (CONST_INT_P (length))
    {
      if (length == const0_rtx)
	{
	  emit_move_insn (result, const0_rtx);
	  return true;
	}
      emit_insn (gen_cmpstrnqi_nz_1 (addr1, addr2, lengthreg, align,
				     src1, src2));
    }
  else
    {
      emit_insn (gen_cmp_1 (Pmode, lengthreg, lengthreg));
      emit_insn (gen_cmpstrnqi_1 (addr1, addr2, lengthreg, align,
				  src1, src2));
    }

  rtx out = gen_lowpart (QImode, result);
  emit_insn (gen_cmpintqi (out));
  emit_move_insn (result, gen_rtx_SIGN_EXTEND (SImode, out));

  return true;
}
/* Expand the appropriate insns for doing strlen if not just doing
   repnz; scasb

   out = result, initialized with the start address
   align_rtx = alignment of the address.
   scratch = scratch register, initialized with the start address when
	not aligned, otherwise undefined

   This is just the body.  It needs the initializations mentioned above and
   some address computing at the end.  These things are done in i386.md.  */

static void
ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
{
  int align;
  rtx tmp;
  rtx_code_label *align_2_label = NULL;
  rtx_code_label *align_3_label = NULL;
  rtx_code_label *align_4_label = gen_label_rtx ();
  rtx_code_label *end_0_label = gen_label_rtx ();
  rtx mem;
  rtx tmpreg = gen_reg_rtx (SImode);
  rtx scratch = gen_reg_rtx (SImode);
  rtx cmp;

  align = 0;
  if (CONST_INT_P (align_rtx))
    align = INTVAL (align_rtx);

  /* Loop to check 1..3 bytes for null to get an aligned pointer.  */

  /* Is there a known alignment and is it less than 4?  */
  if (align < 4)
    {
      rtx scratch1 = gen_reg_rtx (Pmode);
      emit_move_insn (scratch1, out);
      /* Is there a known alignment and is it not 2? */
      if (align != 2)
	{
	  align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
	  align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */

	  /* Leave just the 3 lower bits.  */
	  align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
				    NULL_RTX, 0, OPTAB_WIDEN);

	  emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
				   Pmode, 1, align_4_label);
	  emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
				   Pmode, 1, align_2_label);
	  emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
				   Pmode, 1, align_3_label);
	}
      else
	{
	  /* Since the alignment is 2, we have to check 2 or 0 bytes;
	     check if it is aligned to 4 bytes.  */

	  align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
				    NULL_RTX, 0, OPTAB_WIDEN);

	  emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
				   Pmode, 1, align_4_label);
	}

      mem = change_address (src, QImode, out);

      /* Now compare the bytes.  */

      /* Compare the first n unaligned bytes on a byte per byte basis.  */
      emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
			       QImode, 1, end_0_label);

      /* Increment the address.  */
      emit_insn (gen_add2_insn (out, const1_rtx));

      /* Not needed with an alignment of 2 */
      if (align != 2)
	{
	  emit_label (align_2_label);

	  emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
				   end_0_label);

	  emit_insn (gen_add2_insn (out, const1_rtx));

	  emit_label (align_3_label);
	}

      emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
			       end_0_label);

      emit_insn (gen_add2_insn (out, const1_rtx));
    }

  /* Generate the loop to check 4 bytes at a time.  It is not a good idea to
     align this loop.  It gives only huge programs, but does not help to
     speed up.  */
  emit_label (align_4_label);

  mem = change_address (src, SImode, out);
  emit_move_insn (scratch, mem);
  emit_insn (gen_add2_insn (out, GEN_INT (4)));

  /* This formula yields a nonzero result iff one of the bytes is zero.
     This saves three branches inside loop and many cycles.  */

  emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
  emit_insn (gen_one_cmplsi2 (scratch, scratch));
  emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
  emit_insn (gen_andsi3 (tmpreg, tmpreg,
			 gen_int_mode (0x80808080, SImode)));
  emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
			   align_4_label);

  if (TARGET_CMOVE)
    {
      rtx reg = gen_reg_rtx (SImode);
      rtx reg2 = gen_reg_rtx (Pmode);
      emit_move_insn (reg, tmpreg);
      emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));

      /* If zero is not in the first two bytes, move two bytes forward.  */
      emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
      tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
      tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
      emit_insn (gen_rtx_SET (tmpreg,
			      gen_rtx_IF_THEN_ELSE (SImode, tmp,
						    reg,
						    tmpreg)));
      /* Emit lea manually to avoid clobbering of flags.  */
      emit_insn (gen_rtx_SET (reg2, plus_constant (Pmode, out, 2)));

      tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
      tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
      emit_insn (gen_rtx_SET (out,
			      gen_rtx_IF_THEN_ELSE (Pmode, tmp,
						    reg2,
						    out)));
    }
  else
    {
      rtx_code_label *end_2_label = gen_label_rtx ();
      /* Is zero in the first two bytes? */

      emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
      tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
      tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
				  gen_rtx_LABEL_REF (VOIDmode, end_2_label),
				  pc_rtx);
      tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      JUMP_LABEL (tmp) = end_2_label;

      /* Not in the first two.  Move two bytes forward.  */
      emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
      emit_insn (gen_add2_insn (out, const2_rtx));

      emit_label (end_2_label);
    }

  /* Avoid branch in fixing the byte.  */
  tmpreg = gen_lowpart (QImode, tmpreg);
  emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
  tmp = gen_rtx_REG (CCmode, FLAGS_REG);
  cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
  emit_insn (gen_sub3_carry (Pmode, out, out, GEN_INT (3), tmp, cmp));

  emit_label (end_0_label);
}
/* Expand strlen.  */

bool
ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
{
  if (TARGET_UNROLL_STRLEN
      && TARGET_INLINE_ALL_STRINGOPS
      && eoschar == const0_rtx
      && optimize > 1)
    {
      /* The generic case of the strlen expander is long.  Avoid expanding
	 it unless TARGET_INLINE_ALL_STRINGOPS.  */
      rtx addr = force_reg (Pmode, XEXP (src, 0));
      /* Well, it seems that some optimizers do not combine a call like
	 foo (strlen (bar), strlen (bar));
	 when the move and the subtraction are done here.  They do compute
	 the length just once when these instructions are done inside of
	 output_strlen_unroll().  But since &bar[strlen (bar)] is often
	 used, and this uses one fewer register for the lifetime of
	 output_strlen_unroll(), this is better.  */

      emit_move_insn (out, addr);

      ix86_expand_strlensi_unroll_1 (out, src, align);

      /* strlensi_unroll_1 returns the address of the zero at the end of
	 the string, like memchr(), so compute the length by subtracting
	 the start address.  */
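      /* That is, out now holds &src[len] with src[len] == 0, so
	 out - addr is exactly the string length.  */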
      emit_insn (gen_sub2_insn (out, addr));
      return true;
    }
  else
    return false;
}
/* For a given symbol (function) construct code to compute the address
   of its PLT entry in the large x86-64 PIC model.  */

static rtx
construct_plt_address (rtx symbol)
{
  rtx tmp, unspec;

  gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
  gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
  gcc_assert (Pmode == DImode);

  tmp = gen_reg_rtx (Pmode);
  unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);

  emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
  emit_insn (gen_add2_insn (tmp, pic_offset_table_rtx));
  return tmp;
}
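/* As a sketch of the intent (exact registers are up to the allocator),
   the sequence built above amounts to
	movabsq	$func@PLTOFF, %r11
	addq	%r15, %r11
   with %r15 holding the GOT base, after which the call goes through
   %r11.  */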
/* Additional registers that are clobbered by SYSV calls.  */

static int const x86_64_ms_sysv_extra_clobbered_registers
		 [NUM_X86_64_MS_CLOBBERED_REGS] =
{
  SI_REG, DI_REG,
  XMM6_REG, XMM7_REG,
  XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
  XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
};
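/* These are exactly the registers that the Microsoft x64 ABI treats as
   callee-saved but the System V ABI does not, so an MS-ABI caller must
   assume that a SysV callee clobbers them.  */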
rtx_insn *
ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
		  rtx callarg2,
		  rtx pop, bool sibcall)
{
  rtx vec[3];
  rtx use = NULL, call;
  unsigned int vec_len = 0;
  tree fndecl;

  if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
    {
      fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
      if (fndecl
	  && (lookup_attribute ("interrupt",
				TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
	error ("interrupt service routine cannot be called directly");
    }
  else
    fndecl = NULL_TREE;

  if (pop == const0_rtx)
    pop = NULL;
  gcc_assert (!TARGET_64BIT || !pop);

  rtx addr = XEXP (fnaddr, 0);
  if (TARGET_MACHO && !TARGET_64BIT)
    {
#if TARGET_MACHO
      if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
	fnaddr = machopic_indirect_call_target (fnaddr);
#endif
    }
  else
    {
      /* Static functions and indirect calls don't need the pic register.
	 Also, check whether the PLT was explicitly avoided via no-plt or
	 the "noplt" attribute, making it an indirect call.  */
      if (flag_pic
	  && GET_CODE (addr) == SYMBOL_REF
	  && ix86_call_use_plt_p (addr))
	{
	  if (flag_plt
	      && (SYMBOL_REF_DECL (addr) == NULL_TREE
		  || !lookup_attribute ("noplt",
					DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
	    {
	      if (!TARGET_64BIT
		  || (ix86_cmodel == CM_LARGE_PIC
		      && DEFAULT_ABI != MS_ABI))
		{
		  use_reg (&use, gen_rtx_REG (Pmode,
					      REAL_PIC_OFFSET_TABLE_REGNUM));
		  if (ix86_use_pseudo_pic_reg ())
		    emit_move_insn (gen_rtx_REG (Pmode,
						 REAL_PIC_OFFSET_TABLE_REGNUM),
				    pic_offset_table_rtx);
		}
	    }
	  else if (!TARGET_PECOFF && !TARGET_MACHO)
	    {
	      if (TARGET_64BIT
		  && ix86_cmodel == CM_LARGE_PIC
		  && DEFAULT_ABI != MS_ABI)
		{
		  fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
					   UNSPEC_GOT);
		  fnaddr = gen_rtx_CONST (Pmode, fnaddr);
		  fnaddr = force_reg (Pmode, fnaddr);
		  fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, fnaddr);
		}
	      else if (TARGET_64BIT)
		{
		  fnaddr = gen_rtx_UNSPEC (Pmode,
					   gen_rtvec (1, addr),
					   UNSPEC_GOTPCREL);
		  fnaddr = gen_rtx_CONST (Pmode, fnaddr);
		}
	      else
		{
		  fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
					   UNSPEC_GOT);
		  fnaddr = gen_rtx_CONST (Pmode, fnaddr);
		  fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
					 fnaddr);
		}
	      fnaddr = gen_const_mem (Pmode, fnaddr);
	      /* Pmode may not be the same as word_mode for x32, which
		 doesn't support indirect branch via 32-bit memory slot.
		 Since x32 GOT slot is 64 bit with zero upper 32 bits,
		 indirect branch via x32 GOT slot is OK.  */
	      if (GET_MODE (fnaddr) != word_mode)
		fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
	      fnaddr = gen_rtx_MEM (QImode, fnaddr);
	    }
	}
    }

  /* Skip setting up the RAX register for -mskip-rax-setup when there are
     no parameters passed in vector registers.  */
  if (TARGET_64BIT
      && (INTVAL (callarg2) > 0
	  || (INTVAL (callarg2) == 0
	      && (TARGET_SSE || !flag_skip_rax_setup))))
    {
      rtx al = gen_rtx_REG (QImode, AX_REG);
      emit_move_insn (al, callarg2);
      use_reg (&use, al);
    }

  if (ix86_cmodel == CM_LARGE_PIC
      && !TARGET_PECOFF
      && MEM_P (fnaddr)
      && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
      && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
    fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
  /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
     branch via x32 GOT slot is OK.  */
  else if (!(TARGET_X32
	     && MEM_P (fnaddr)
	     && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
	     && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
	   && (sibcall
	       ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
	       : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
    {
      fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
      fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
    }

  call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);

  if (retval)
    call = gen_rtx_SET (retval, call);
  vec[vec_len++] = call;

  if (pop)
    {
      pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
      pop = gen_rtx_SET (stack_pointer_rtx, pop);
      vec[vec_len++] = pop;
    }

  if (cfun->machine->no_caller_saved_registers
      && (!fndecl
	  || (!TREE_THIS_VOLATILE (fndecl)
	      && !lookup_attribute ("no_caller_saved_registers",
				    TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
    {
      static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
      bool is_64bit_ms_abi = (TARGET_64BIT
			      && ix86_function_abi (fndecl) == MS_ABI);
      char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);

      /* If there are no caller-saved registers, add all registers
	 that are clobbered by the call which returns.  */
      for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
	if (!fixed_regs[i]
	    && (ix86_call_used_regs[i] == 1
		|| (ix86_call_used_regs[i] & c_mask))
	    && !STACK_REGNO_P (i)
	    && !MMX_REGNO_P (i))
	  clobber_reg (&use,
		       gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
    }
  else if (TARGET_64BIT_MS_ABI
	   && (!callarg2 || INTVAL (callarg2) != -2))
    {
      unsigned i;

      for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
	{
	  int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
	  machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;

	  clobber_reg (&use, gen_rtx_REG (mode, regno));
	}

      /* Set here, but it may get cleared later.  */
      if (TARGET_CALL_MS2SYSV_XLOGUES)
	{
	  if (!TARGET_SSE)
	    ;

	  /* Don't break hot-patched functions.  */
	  else if (ix86_function_ms_hook_prologue (current_function_decl))
	    ;

	  /* TODO: Cases not yet examined.  */
	  else if (flag_split_stack)
	    warn_once_call_ms2sysv_xlogues ("-fsplit-stack");

	  else
	    {
	      gcc_assert (!reload_completed);
	      cfun->machine->call_ms2sysv = true;
	    }
	}
    }

  if (TARGET_MACHO && TARGET_64BIT && !sibcall
      && ((GET_CODE (addr) == SYMBOL_REF && !SYMBOL_REF_LOCAL_P (addr))
	  || !fndecl || TREE_PUBLIC (fndecl)))
    {
      /* We allow public functions defined in a TU to bind locally for PIC
	 code (the default) on 64bit Mach-O.
	 If such functions are not inlined, we cannot tell at compile-time
	 whether they will be called via the lazy symbol resolver (this can
	 depend on options given at link-time).  Therefore, we must assume
	 that the lazy resolver could be used, which clobbers R11 and
	 R10.  */
      clobber_reg (&use, gen_rtx_REG (DImode, R11_REG));
      clobber_reg (&use, gen_rtx_REG (DImode, R10_REG));
    }

  if (vec_len > 1)
    call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
  rtx_insn *call_insn = emit_call_insn (call);
  if (use)
    CALL_INSN_FUNCTION_USAGE (call_insn) = use;

  return call_insn;
}
/* Split a simple return that pops POPC bytes from the stack into an
   indirect branch with a stack adjustment.  */

void
ix86_split_simple_return_pop_internal (rtx popc)
{
  struct machine_function *m = cfun->machine;
  rtx ecx = gen_rtx_REG (SImode, CX_REG);
  rtx_insn *insn;

  /* There is no "pascal" calling convention in any 64bit ABI.  */
  gcc_assert (!TARGET_64BIT);

  insn = emit_insn (gen_pop (ecx));
  m->fs.cfa_offset -= UNITS_PER_WORD;
  m->fs.sp_offset -= UNITS_PER_WORD;

  rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
  x = gen_rtx_SET (stack_pointer_rtx, x);
  add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
  add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
  RTX_FRAME_RELATED_P (insn) = 1;

  x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc);
  x = gen_rtx_SET (stack_pointer_rtx, x);
  insn = emit_insn (x);
  add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
  RTX_FRAME_RELATED_P (insn) = 1;

  /* Now the return address is in ECX.  */
  emit_jump_insn (gen_simple_return_indirect_internal (ecx));
}
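/* The sequence built above behaves roughly like
	popl	%ecx		; fetch the return address
	addl	$N, %esp	; drop the N callee-popped argument bytes
	jmp	*%ecx
   where the final indirect jump is the simple_return_indirect_internal
   pattern (a sketch of the intent, not literal output).  */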
/* Errors in the source file can cause expand_expr to return const0_rtx
   where we expect a vector.  To avoid crashing, use one of the vector
   clear instructions.  */

static rtx
safe_vector_operand (rtx x, machine_mode mode)
{
  if (x == const0_rtx)
    x = CONST0_RTX (mode);
  return x;
}
/* Subroutine of ix86_expand_builtin to take care of binop insns.  */

static rtx
ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  machine_mode tmode = insn_data[icode].operand[0].mode;
  machine_mode mode0 = insn_data[icode].operand[1].mode;
  machine_mode mode1 = insn_data[icode].operand[2].mode;

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);
  if (VECTOR_MODE_P (mode1))
    op1 = safe_vector_operand (op1, mode1);

  if (optimize || !target
      || GET_MODE (target) != tmode
      || !insn_data[icode].operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);
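  /* Some 128-bit shift builtins pass their count as a 32-bit int while
     the insn wants a TImode operand; load the int into the low element
     of a V4SI register and view that register as TImode.  */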
  if (GET_MODE (op1) == SImode && mode1 == TImode)
    {
      rtx x = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_loadd (x, op1));
      op1 = gen_lowpart (TImode, x);
    }

  if (!insn_data[icode].operand[1].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
  if (!insn_data[icode].operand[2].predicate (op1, mode1))
    op1 = copy_to_mode_reg (mode1, op1);

  pat = GEN_FCN (icode) (target, op0, op1);
  if (! pat)
    return 0;

  emit_insn (pat);

  return target;
}
/* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns.  */

static rtx
ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
			       enum ix86_builtin_func_type m_type,
			       enum rtx_code sub_code)
{
  rtx pat;
  unsigned int i, nargs;
  bool comparison_p = false;
  bool tf_p = false;
  bool last_arg_constant = false;
  int num_memory = 0;
  rtx xops[4];

  machine_mode tmode = insn_data[icode].operand[0].mode;

  switch (m_type)
    {
    case MULTI_ARG_4_DF2_DI_I:
    case MULTI_ARG_4_DF2_DI_I1:
    case MULTI_ARG_4_SF2_SI_I:
    case MULTI_ARG_4_SF2_SI_I1:
      nargs = 4;
      last_arg_constant = true;
      break;

    case MULTI_ARG_3_SF:
    case MULTI_ARG_3_DF:
    case MULTI_ARG_3_SF2:
    case MULTI_ARG_3_DF2:
    case MULTI_ARG_3_DI:
    case MULTI_ARG_3_SI:
    case MULTI_ARG_3_SI_DI:
    case MULTI_ARG_3_HI:
    case MULTI_ARG_3_HI_SI:
    case MULTI_ARG_3_QI:
    case MULTI_ARG_3_DI2:
    case MULTI_ARG_3_SI2:
    case MULTI_ARG_3_HI2:
    case MULTI_ARG_3_QI2:
      nargs = 3;
      break;

    case MULTI_ARG_2_SF:
    case MULTI_ARG_2_DF:
    case MULTI_ARG_2_DI:
    case MULTI_ARG_2_SI:
    case MULTI_ARG_2_HI:
    case MULTI_ARG_2_QI:
      nargs = 2;
      break;

    case MULTI_ARG_2_DI_IMM:
    case MULTI_ARG_2_SI_IMM:
    case MULTI_ARG_2_HI_IMM:
    case MULTI_ARG_2_QI_IMM:
      nargs = 2;
      last_arg_constant = true;
      break;

    case MULTI_ARG_1_SF:
    case MULTI_ARG_1_DF:
    case MULTI_ARG_1_SF2:
    case MULTI_ARG_1_DF2:
    case MULTI_ARG_1_DI:
    case MULTI_ARG_1_SI:
    case MULTI_ARG_1_HI:
    case MULTI_ARG_1_QI:
    case MULTI_ARG_1_SI_DI:
    case MULTI_ARG_1_HI_DI:
    case MULTI_ARG_1_HI_SI:
    case MULTI_ARG_1_QI_DI:
    case MULTI_ARG_1_QI_SI:
    case MULTI_ARG_1_QI_HI:
      nargs = 1;
      break;

    case MULTI_ARG_2_DI_CMP:
    case MULTI_ARG_2_SI_CMP:
    case MULTI_ARG_2_HI_CMP:
    case MULTI_ARG_2_QI_CMP:
      nargs = 2;
      comparison_p = true;
      break;

    case MULTI_ARG_2_SF_TF:
    case MULTI_ARG_2_DF_TF:
    case MULTI_ARG_2_DI_TF:
    case MULTI_ARG_2_SI_TF:
    case MULTI_ARG_2_HI_TF:
    case MULTI_ARG_2_QI_TF:
      nargs = 2;
      tf_p = true;
      break;

    default:
      gcc_unreachable ();
    }

  if (optimize || !target
      || GET_MODE (target) != tmode
      || !insn_data[icode].operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);
  else if (memory_operand (target, tmode))
    num_memory++;

  gcc_assert (nargs <= ARRAY_SIZE (xops));

  for (i = 0; i < nargs; i++)
    {
      tree arg = CALL_EXPR_ARG (exp, i);
      rtx op = expand_normal (arg);
      int adjust = (comparison_p) ? 1 : 0;
      machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;

      if (last_arg_constant && i == nargs - 1)
	{
	  if (!insn_data[icode].operand[i + 1].predicate (op, mode))
	    {
	      enum insn_code new_icode = icode;
	      switch (icode)
		{
		case CODE_FOR_xop_vpermil2v2df3:
		case CODE_FOR_xop_vpermil2v4sf3:
		case CODE_FOR_xop_vpermil2v4df3:
		case CODE_FOR_xop_vpermil2v8sf3:
		  error ("the last argument must be a 2-bit immediate");
		  return gen_reg_rtx (tmode);
		case CODE_FOR_xop_rotlv2di3:
		  new_icode = CODE_FOR_rotlv2di3;
		  goto xop_rotl;
		case CODE_FOR_xop_rotlv4si3:
		  new_icode = CODE_FOR_rotlv4si3;
		  goto xop_rotl;
		case CODE_FOR_xop_rotlv8hi3:
		  new_icode = CODE_FOR_rotlv8hi3;
		  goto xop_rotl;
		case CODE_FOR_xop_rotlv16qi3:
		  new_icode = CODE_FOR_rotlv16qi3;
		xop_rotl:
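		  /* Rotate counts wrap modulo the element width, so an
		     out-of-range immediate can simply be masked: e.g.
		     rotating 32-bit elements by 35 is the same as
		     rotating them by 3.  */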
		  if (CONST_INT_P (op))
		    {
		      int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
		      op = GEN_INT (INTVAL (op) & mask);
		      gcc_checking_assert
			(insn_data[icode].operand[i + 1].predicate (op, mode));
		    }
		  else
		    {
		      gcc_checking_assert
			(nargs == 2
			 && insn_data[new_icode].operand[0].mode == tmode
			 && insn_data[new_icode].operand[1].mode == tmode
			 && insn_data[new_icode].operand[2].mode == mode
			 && insn_data[new_icode].operand[0].predicate
			    == insn_data[icode].operand[0].predicate
			 && insn_data[new_icode].operand[1].predicate
			    == insn_data[icode].operand[1].predicate);
		      icode = new_icode;
		      goto non_constant;
		    }
		  break;
		default:
		  gcc_unreachable ();
		}
	    }
	}
      else
	{
	non_constant:
	  if (VECTOR_MODE_P (mode))
	    op = safe_vector_operand (op, mode);

	  /* If we aren't optimizing, only allow one memory operand to be
	     generated.  */
	  if (memory_operand (op, mode))
	    num_memory++;

	  gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);

	  if (optimize
	      || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
	      || num_memory > 1)
	    op = force_reg (mode, op);
	}

      xops[i] = op;
    }

  switch (nargs)
    {
    case 1:
      pat = GEN_FCN (icode) (target, xops[0]);
      break;

    case 2:
      if (tf_p)
	pat = GEN_FCN (icode) (target, xops[0], xops[1],
			       GEN_INT ((int)sub_code));
      else if (! comparison_p)
	pat = GEN_FCN (icode) (target, xops[0], xops[1]);
      else
	{
	  rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
				       xops[0], xops[1]);

	  pat = GEN_FCN (icode) (target, cmp_op, xops[0], xops[1]);
	}
      break;

    case 3:
      pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
      break;

    case 4:
      pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2], xops[3]);
      break;

    default:
      gcc_unreachable ();
    }

  if (! pat)
    return 0;

  emit_insn (pat);
  return target;
}
/* Subroutine of ix86_expand_args_builtin to take care of scalar unop
   insns with vec_merge.  */

static rtx
ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
				    rtx target)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  rtx op1, op0 = expand_normal (arg0);
  machine_mode tmode = insn_data[icode].operand[0].mode;
  machine_mode mode0 = insn_data[icode].operand[1].mode;

  if (optimize || !target
      || GET_MODE (target) != tmode
      || !insn_data[icode].operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);

  if ((optimize && !register_operand (op0, mode0))
      || !insn_data[icode].operand[1].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);

  op1 = op0;
  if (!insn_data[icode].operand[2].predicate (op1, mode0))
    op1 = copy_to_mode_reg (mode0, op1);

  pat = GEN_FCN (icode) (target, op0, op1);
  if (! pat)
    return 0;
  emit_insn (pat);
  return target;
}
/* Subroutine of ix86_expand_builtin to take care of comparison insns.  */

static rtx
ix86_expand_sse_compare (const struct builtin_description *d,
			 tree exp, rtx target, bool swap)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  rtx op2;
  machine_mode tmode = insn_data[d->icode].operand[0].mode;
  machine_mode mode0 = insn_data[d->icode].operand[1].mode;
  machine_mode mode1 = insn_data[d->icode].operand[2].mode;
  enum rtx_code comparison = d->comparison;

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);
  if (VECTOR_MODE_P (mode1))
    op1 = safe_vector_operand (op1, mode1);

  /* Swap operands if we have a comparison that isn't available in
     hardware.  */
  if (swap)
    std::swap (op0, op1);

  if (optimize || !target
      || GET_MODE (target) != tmode
      || !insn_data[d->icode].operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);

  if ((optimize && !register_operand (op0, mode0))
      || !insn_data[d->icode].operand[1].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
  if ((optimize && !register_operand (op1, mode1))
      || !insn_data[d->icode].operand[2].predicate (op1, mode1))
    op1 = copy_to_mode_reg (mode1, op1);

  op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
  pat = GEN_FCN (d->icode) (target, op0, op1, op2);
  if (! pat)
    return 0;
  emit_insn (pat);
  return target;
}
/* Subroutine of ix86_expand_builtin to take care of comi insns.  */

static rtx
ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
		      rtx target)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  machine_mode mode0 = insn_data[d->icode].operand[0].mode;
  machine_mode mode1 = insn_data[d->icode].operand[1].mode;
  enum rtx_code comparison = d->comparison;

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);
  if (VECTOR_MODE_P (mode1))
    op1 = safe_vector_operand (op1, mode1);
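  /* Materialize the flags comparison as a 0/1 value: clear a full
     SImode register, then write only its low byte via a strict_low_part
     set of a QImode subreg (in effect a setcc into the already-zeroed
     register).  */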
  target = gen_reg_rtx (SImode);
  emit_move_insn (target, const0_rtx);
  target = gen_rtx_SUBREG (QImode, target, 0);

  if ((optimize && !register_operand (op0, mode0))
      || !insn_data[d->icode].operand[0].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
  if ((optimize && !register_operand (op1, mode1))
      || !insn_data[d->icode].operand[1].predicate (op1, mode1))
    op1 = copy_to_mode_reg (mode1, op1);

  pat = GEN_FCN (d->icode) (op0, op1);
  if (! pat)
    return 0;
  emit_insn (pat);
  emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
			  gen_rtx_fmt_ee (comparison, QImode,
					  SET_DEST (pat),
					  const0_rtx)));

  return SUBREG_REG (target);
}
/* Subroutines of ix86_expand_args_builtin to take care of round insns.  */

static rtx
ix86_expand_sse_round (const struct builtin_description *d, tree exp,
		       rtx target)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  rtx op1, op0 = expand_normal (arg0);
  machine_mode tmode = insn_data[d->icode].operand[0].mode;
  machine_mode mode0 = insn_data[d->icode].operand[1].mode;

  if (optimize || target == 0
      || GET_MODE (target) != tmode
      || !insn_data[d->icode].operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);

  if ((optimize && !register_operand (op0, mode0))
      || !insn_data[d->icode].operand[0].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);

  op1 = GEN_INT (d->comparison);

  pat = GEN_FCN (d->icode) (target, op0, op1);
  if (! pat)
    return 0;
  emit_insn (pat);
  return target;
}
static rtx
ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
				     tree exp, rtx target)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  rtx op2;
  machine_mode tmode = insn_data[d->icode].operand[0].mode;
  machine_mode mode0 = insn_data[d->icode].operand[1].mode;
  machine_mode mode1 = insn_data[d->icode].operand[2].mode;

  if (optimize || target == 0
      || GET_MODE (target) != tmode
      || !insn_data[d->icode].operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);

  op0 = safe_vector_operand (op0, mode0);
  op1 = safe_vector_operand (op1, mode1);

  if ((optimize && !register_operand (op0, mode0))
      || !insn_data[d->icode].operand[0].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
  if ((optimize && !register_operand (op1, mode1))
      || !insn_data[d->icode].operand[1].predicate (op1, mode1))
    op1 = copy_to_mode_reg (mode1, op1);

  op2 = GEN_INT (d->comparison);

  pat = GEN_FCN (d->icode) (target, op0, op1, op2);
  if (! pat)
    return 0;
  emit_insn (pat);
  return target;
}
/* Subroutine of ix86_expand_builtin to take care of ptest insns.  */

static rtx
ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
		       rtx target)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  machine_mode mode0 = insn_data[d->icode].operand[0].mode;
  machine_mode mode1 = insn_data[d->icode].operand[1].mode;
  enum rtx_code comparison = d->comparison;

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);
  if (VECTOR_MODE_P (mode1))
    op1 = safe_vector_operand (op1, mode1);

  target = gen_reg_rtx (SImode);
  emit_move_insn (target, const0_rtx);
  target = gen_rtx_SUBREG (QImode, target, 0);

  if ((optimize && !register_operand (op0, mode0))
      || !insn_data[d->icode].operand[0].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
  if ((optimize && !register_operand (op1, mode1))
      || !insn_data[d->icode].operand[1].predicate (op1, mode1))
    op1 = copy_to_mode_reg (mode1, op1);

  pat = GEN_FCN (d->icode) (op0, op1);
  if (! pat)
    return 0;
  emit_insn (pat);
  emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
			  gen_rtx_fmt_ee (comparison, QImode,
					  SET_DEST (pat),
					  const0_rtx)));

  return SUBREG_REG (target);
}
/* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns.  */

static rtx
ix86_expand_sse_pcmpestr (const struct builtin_description *d,
			  tree exp, rtx target)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  tree arg2 = CALL_EXPR_ARG (exp, 2);
  tree arg3 = CALL_EXPR_ARG (exp, 3);
  tree arg4 = CALL_EXPR_ARG (exp, 4);
  rtx scratch0, scratch1;
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  rtx op2 = expand_normal (arg2);
  rtx op3 = expand_normal (arg3);
  rtx op4 = expand_normal (arg4);
  machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;

  tmode0 = insn_data[d->icode].operand[0].mode;
  tmode1 = insn_data[d->icode].operand[1].mode;
  modev2 = insn_data[d->icode].operand[2].mode;
  modei3 = insn_data[d->icode].operand[3].mode;
  modev4 = insn_data[d->icode].operand[4].mode;
  modei5 = insn_data[d->icode].operand[5].mode;
  modeimm = insn_data[d->icode].operand[6].mode;

  if (VECTOR_MODE_P (modev2))
    op0 = safe_vector_operand (op0, modev2);
  if (VECTOR_MODE_P (modev4))
    op2 = safe_vector_operand (op2, modev4);

  if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
    op0 = copy_to_mode_reg (modev2, op0);
  if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
    op1 = copy_to_mode_reg (modei3, op1);
  if ((optimize && !register_operand (op2, modev4))
      || !insn_data[d->icode].operand[4].predicate (op2, modev4))
    op2 = copy_to_mode_reg (modev4, op2);
  if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
    op3 = copy_to_mode_reg (modei5, op3);

  if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
    {
      error ("the fifth argument must be an 8-bit immediate");
      return const0_rtx;
    }
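  /* pcmpestri returns its result as an index in a GPR and pcmpestrm as
     a mask in xmm0; the remaining variants only consume a condition out
     of EFLAGS, hence the three expansion shapes below.  */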
  if (d->code == IX86_BUILTIN_PCMPESTRI128)
    {
      if (optimize || !target
	  || GET_MODE (target) != tmode0
	  || !insn_data[d->icode].operand[0].predicate (target, tmode0))
	target = gen_reg_rtx (tmode0);

      scratch1 = gen_reg_rtx (tmode1);

      pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
    }
  else if (d->code == IX86_BUILTIN_PCMPESTRM128)
    {
      if (optimize || !target
	  || GET_MODE (target) != tmode1
	  || !insn_data[d->icode].operand[1].predicate (target, tmode1))
	target = gen_reg_rtx (tmode1);

      scratch0 = gen_reg_rtx (tmode0);

      pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
    }
  else
    {
      gcc_assert (d->flag);

      scratch0 = gen_reg_rtx (tmode0);
      scratch1 = gen_reg_rtx (tmode1);

      pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
    }

  if (! pat)
    return 0;

  emit_insn (pat);

  if (d->flag)
    {
      target = gen_reg_rtx (SImode);
      emit_move_insn (target, const0_rtx);
      target = gen_rtx_SUBREG (QImode, target, 0);

      emit_insn
	(gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
		      gen_rtx_fmt_ee (EQ, QImode,
				      gen_rtx_REG ((machine_mode) d->flag,
						   FLAGS_REG),
				      const0_rtx)));
      return SUBREG_REG (target);
    }
  else
    return target;
}
/* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns.  */

static rtx
ix86_expand_sse_pcmpistr (const struct builtin_description *d,
			  tree exp, rtx target)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  tree arg2 = CALL_EXPR_ARG (exp, 2);
  rtx scratch0, scratch1;
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  rtx op2 = expand_normal (arg2);
  machine_mode tmode0, tmode1, modev2, modev3, modeimm;

  tmode0 = insn_data[d->icode].operand[0].mode;
  tmode1 = insn_data[d->icode].operand[1].mode;
  modev2 = insn_data[d->icode].operand[2].mode;
  modev3 = insn_data[d->icode].operand[3].mode;
  modeimm = insn_data[d->icode].operand[4].mode;

  if (VECTOR_MODE_P (modev2))
    op0 = safe_vector_operand (op0, modev2);
  if (VECTOR_MODE_P (modev3))
    op1 = safe_vector_operand (op1, modev3);

  if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
    op0 = copy_to_mode_reg (modev2, op0);
  if ((optimize && !register_operand (op1, modev3))
      || !insn_data[d->icode].operand[3].predicate (op1, modev3))
    op1 = copy_to_mode_reg (modev3, op1);

  if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
    {
      error ("the third argument must be an 8-bit immediate");
      return const0_rtx;
    }

  if (d->code == IX86_BUILTIN_PCMPISTRI128)
    {
      if (optimize || !target
	  || GET_MODE (target) != tmode0
	  || !insn_data[d->icode].operand[0].predicate (target, tmode0))
	target = gen_reg_rtx (tmode0);

      scratch1 = gen_reg_rtx (tmode1);

      pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
    }
  else if (d->code == IX86_BUILTIN_PCMPISTRM128)
    {
      if (optimize || !target
	  || GET_MODE (target) != tmode1
	  || !insn_data[d->icode].operand[1].predicate (target, tmode1))
	target = gen_reg_rtx (tmode1);

      scratch0 = gen_reg_rtx (tmode0);

      pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
    }
  else
    {
      gcc_assert (d->flag);

      scratch0 = gen_reg_rtx (tmode0);
      scratch1 = gen_reg_rtx (tmode1);

      pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
    }

  if (! pat)
    return 0;

  emit_insn (pat);

  if (d->flag)
    {
      target = gen_reg_rtx (SImode);
      emit_move_insn (target, const0_rtx);
      target = gen_rtx_SUBREG (QImode, target, 0);

      emit_insn
	(gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
		      gen_rtx_fmt_ee (EQ, QImode,
				      gen_rtx_REG ((machine_mode) d->flag,
						   FLAGS_REG),
				      const0_rtx)));
      return SUBREG_REG (target);
    }
  else
    return target;
}
/* Fixup modeless constants to fit the required mode.  */

static rtx
fixup_modeless_constant (rtx x, machine_mode mode)
{
  if (GET_MODE (x) == VOIDmode)
    x = convert_to_mode (mode, x, 1);
  return x;
}
/* Subroutine of ix86_expand_builtin to take care of insns with
   variable number of operands.  */

static rtx
ix86_expand_args_builtin (const struct builtin_description *d,
			  tree exp, rtx target)
{
  rtx pat, real_target;
  unsigned int i, nargs;
  unsigned int nargs_constant = 0;
  unsigned int mask_pos = 0;
  int num_memory = 0;
  rtx xops[6];
  bool second_arg_count = false;
  enum insn_code icode = d->icode;
  const struct insn_data_d *insn_p = &insn_data[icode];
  machine_mode tmode = insn_p->operand[0].mode;
  machine_mode rmode = VOIDmode;
  bool swap = false;
  enum rtx_code comparison = d->comparison;

  switch ((enum ix86_builtin_func_type) d->flag)
    {
    case V2DF_FTYPE_V2DF_ROUND:
    case V4DF_FTYPE_V4DF_ROUND:
    case V8DF_FTYPE_V8DF_ROUND:
    case V4SF_FTYPE_V4SF_ROUND:
    case V8SF_FTYPE_V8SF_ROUND:
    case V16SF_FTYPE_V16SF_ROUND:
    case V8HF_FTYPE_V8HF_ROUND:
    case V16HF_FTYPE_V16HF_ROUND:
    case V32HF_FTYPE_V32HF_ROUND:
    case V4SI_FTYPE_V4SF_ROUND:
    case V8SI_FTYPE_V8SF_ROUND:
    case V16SI_FTYPE_V16SF_ROUND:
      return ix86_expand_sse_round (d, exp, target);
    case V4SI_FTYPE_V2DF_V2DF_ROUND:
    case V8SI_FTYPE_V4DF_V4DF_ROUND:
    case V16SI_FTYPE_V8DF_V8DF_ROUND:
      return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
    case INT_FTYPE_V8SF_V8SF_PTEST:
    case INT_FTYPE_V4DI_V4DI_PTEST:
    case INT_FTYPE_V4DF_V4DF_PTEST:
    case INT_FTYPE_V4SF_V4SF_PTEST:
    case INT_FTYPE_V2DI_V2DI_PTEST:
    case INT_FTYPE_V2DF_V2DF_PTEST:
      return ix86_expand_sse_ptest (d, exp, target);
    case FLOAT128_FTYPE_FLOAT128:
    case FLOAT_FTYPE_FLOAT:
    case INT_FTYPE_INT:
    case UINT_FTYPE_UINT:
    case UINT16_FTYPE_UINT16:
    case UINT64_FTYPE_INT:
    case UINT64_FTYPE_UINT64:
    case INT64_FTYPE_INT64:
    case INT64_FTYPE_V4SF:
    case INT64_FTYPE_V2DF:
    case INT_FTYPE_V16QI:
    case INT_FTYPE_V8QI:
    case INT_FTYPE_V8SF:
    case INT_FTYPE_V4DF:
    case INT_FTYPE_V4SF:
    case INT_FTYPE_V2DF:
    case INT_FTYPE_V32QI:
    case V16QI_FTYPE_V16QI:
    case V8SI_FTYPE_V8SF:
    case V8SI_FTYPE_V4SI:
    case V8HI_FTYPE_V8HI:
    case V8HI_FTYPE_V16QI:
    case V8QI_FTYPE_V8QI:
    case V8SF_FTYPE_V8SF:
    case V8SF_FTYPE_V8SI:
    case V8SF_FTYPE_V4SF:
    case V8SF_FTYPE_V8HI:
    case V4SI_FTYPE_V4SI:
    case V4SI_FTYPE_V16QI:
    case V4SI_FTYPE_V4SF:
    case V4SI_FTYPE_V8SI:
    case V4SI_FTYPE_V8HI:
    case V4SI_FTYPE_V4DF:
    case V4SI_FTYPE_V2DF:
    case V4HI_FTYPE_V4HI:
    case V4DF_FTYPE_V4DF:
    case V4DF_FTYPE_V4SI:
    case V4DF_FTYPE_V4SF:
    case V4DF_FTYPE_V2DF:
    case V4SF_FTYPE_V4SF:
    case V4SF_FTYPE_V4SI:
    case V4SF_FTYPE_V8SF:
    case V4SF_FTYPE_V4DF:
    case V4SF_FTYPE_V8HI:
    case V4SF_FTYPE_V2DF:
    case V2DI_FTYPE_V2DI:
    case V2DI_FTYPE_V16QI:
    case V2DI_FTYPE_V8HI:
    case V2DI_FTYPE_V4SI:
    case V2DF_FTYPE_V2DF:
    case V2DF_FTYPE_V4SI:
    case V2DF_FTYPE_V4DF:
    case V2DF_FTYPE_V4SF:
    case V2DF_FTYPE_V2SI:
    case V2SI_FTYPE_V2SI:
    case V2SI_FTYPE_V4SF:
    case V2SI_FTYPE_V2SF:
    case V2SI_FTYPE_V2DF:
    case V2SF_FTYPE_V2SF:
    case V2SF_FTYPE_V2SI:
    case V32QI_FTYPE_V32QI:
    case V32QI_FTYPE_V16QI:
    case V16HI_FTYPE_V16HI:
    case V16HI_FTYPE_V8HI:
    case V8SI_FTYPE_V8SI:
    case V16HI_FTYPE_V16QI:
    case V8SI_FTYPE_V16QI:
    case V4DI_FTYPE_V16QI:
    case V8SI_FTYPE_V8HI:
    case V4DI_FTYPE_V8HI:
    case V4DI_FTYPE_V4SI:
    case V4DI_FTYPE_V2DI:
    case UQI_FTYPE_UQI:
    case UHI_FTYPE_UHI:
    case USI_FTYPE_USI:
    case USI_FTYPE_UQI:
    case USI_FTYPE_UHI:
    case UDI_FTYPE_UDI:
    case UHI_FTYPE_V16QI:
    case USI_FTYPE_V32QI:
    case UDI_FTYPE_V64QI:
    case V16QI_FTYPE_UHI:
    case V32QI_FTYPE_USI:
    case V64QI_FTYPE_UDI:
    case V8HI_FTYPE_UQI:
    case V16HI_FTYPE_UHI:
    case V32HI_FTYPE_USI:
    case V4SI_FTYPE_UQI:
    case V8SI_FTYPE_UQI:
    case V4SI_FTYPE_UHI:
    case V8SI_FTYPE_UHI:
    case UQI_FTYPE_V8HI:
    case UHI_FTYPE_V16HI:
    case USI_FTYPE_V32HI:
    case UQI_FTYPE_V4SI:
    case UQI_FTYPE_V8SI:
    case UHI_FTYPE_V16SI:
    case UQI_FTYPE_V2DI:
    case UQI_FTYPE_V4DI:
    case UQI_FTYPE_V8DI:
    case V16SI_FTYPE_UHI:
    case V2DI_FTYPE_UQI:
    case V4DI_FTYPE_UQI:
    case V16SI_FTYPE_INT:
    case V16SF_FTYPE_V8SF:
    case V16SI_FTYPE_V8SI:
    case V16SF_FTYPE_V4SF:
    case V16SI_FTYPE_V4SI:
    case V16SI_FTYPE_V16SF:
    case V16SI_FTYPE_V16SI:
    case V64QI_FTYPE_V64QI:
    case V32HI_FTYPE_V32HI:
    case V16SF_FTYPE_V16SF:
    case V8DI_FTYPE_UQI:
    case V8DI_FTYPE_V8DI:
    case V8DF_FTYPE_V4DF:
    case V8DF_FTYPE_V2DF:
    case V8DF_FTYPE_V8DF:
    case V4DI_FTYPE_V4DI:
    case V16HI_FTYPE_V16SF:
    case V8HI_FTYPE_V8SF:
    case V8HI_FTYPE_V4SF:
      nargs = 1;
      break;
    case V4SF_FTYPE_V4SF_VEC_MERGE:
    case V2DF_FTYPE_V2DF_VEC_MERGE:
      return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
    case FLOAT128_FTYPE_FLOAT128_FLOAT128:
    case V16QI_FTYPE_V16QI_V16QI:
    case V16QI_FTYPE_V8HI_V8HI:
    case V16HF_FTYPE_V16HF_V16HF:
    case V16SF_FTYPE_V16SF_V16SF:
    case V8QI_FTYPE_V8QI_V8QI:
    case V8QI_FTYPE_V4HI_V4HI:
    case V8HI_FTYPE_V8HI_V8HI:
    case V8HI_FTYPE_V16QI_V16QI:
    case V8HI_FTYPE_V4SI_V4SI:
    case V8HF_FTYPE_V8HF_V8HF:
    case V8SF_FTYPE_V8SF_V8SF:
    case V8SF_FTYPE_V8SF_V8SI:
    case V8DF_FTYPE_V8DF_V8DF:
    case V4SI_FTYPE_V4SI_V4SI:
    case V4SI_FTYPE_V8HI_V8HI:
    case V4SI_FTYPE_V2DF_V2DF:
    case V4HI_FTYPE_V4HI_V4HI:
    case V4HI_FTYPE_V8QI_V8QI:
    case V4HI_FTYPE_V2SI_V2SI:
    case V4DF_FTYPE_V4DF_V4DF:
    case V4DF_FTYPE_V4DF_V4DI:
    case V4SF_FTYPE_V4SF_V4SF:
    case V4SF_FTYPE_V4SF_V4SI:
    case V4SF_FTYPE_V4SF_V2SI:
    case V4SF_FTYPE_V4SF_V2DF:
    case V4SF_FTYPE_V4SF_UINT:
    case V4SF_FTYPE_V4SF_DI:
    case V4SF_FTYPE_V4SF_SI:
    case V2DI_FTYPE_V2DI_V2DI:
    case V2DI_FTYPE_V16QI_V16QI:
    case V2DI_FTYPE_V4SI_V4SI:
    case V2DI_FTYPE_V2DI_V16QI:
    case V2SI_FTYPE_V2SI_V2SI:
    case V2SI_FTYPE_V4HI_V4HI:
    case V2SI_FTYPE_V2SF_V2SF:
    case V2DF_FTYPE_V2DF_V2DF:
    case V2DF_FTYPE_V2DF_V4SF:
    case V2DF_FTYPE_V2DF_V2DI:
    case V2DF_FTYPE_V2DF_DI:
    case V2DF_FTYPE_V2DF_SI:
    case V2DF_FTYPE_V2DF_UINT:
    case V2SF_FTYPE_V2SF_V2SF:
    case V1DI_FTYPE_V1DI_V1DI:
    case V1DI_FTYPE_V8QI_V8QI:
    case V1DI_FTYPE_V2SI_V2SI:
    case V32QI_FTYPE_V16HI_V16HI:
    case V16HI_FTYPE_V8SI_V8SI:
    case V64QI_FTYPE_V64QI_V64QI:
    case V32QI_FTYPE_V32QI_V32QI:
    case V16HI_FTYPE_V32QI_V32QI:
    case V16HI_FTYPE_V16HI_V16HI:
    case V8SI_FTYPE_V4DF_V4DF:
    case V8SI_FTYPE_V8SI_V8SI:
    case V8SI_FTYPE_V16HI_V16HI:
    case V4DI_FTYPE_V4DI_V4DI:
    case V4DI_FTYPE_V8SI_V8SI:
    case V8DI_FTYPE_V64QI_V64QI:
      if (comparison == UNKNOWN)
	return ix86_expand_binop_builtin (icode, exp, target);
      nargs = 2;
      break;
    case V4SF_FTYPE_V4SF_V4SF_SWAP:
    case V2DF_FTYPE_V2DF_V2DF_SWAP:
      gcc_assert (comparison != UNKNOWN);
      nargs = 2;
      swap = true;
      break;
    case V16HI_FTYPE_V16HI_V8HI_COUNT:
    case V16HI_FTYPE_V16HI_SI_COUNT:
    case V8SI_FTYPE_V8SI_V4SI_COUNT:
    case V8SI_FTYPE_V8SI_SI_COUNT:
    case V4DI_FTYPE_V4DI_V2DI_COUNT:
    case V4DI_FTYPE_V4DI_INT_COUNT:
    case V8HI_FTYPE_V8HI_V8HI_COUNT:
    case V8HI_FTYPE_V8HI_SI_COUNT:
    case V4SI_FTYPE_V4SI_V4SI_COUNT:
    case V4SI_FTYPE_V4SI_SI_COUNT:
    case V4HI_FTYPE_V4HI_V4HI_COUNT:
    case V4HI_FTYPE_V4HI_SI_COUNT:
    case V2DI_FTYPE_V2DI_V2DI_COUNT:
    case V2DI_FTYPE_V2DI_SI_COUNT:
    case V2SI_FTYPE_V2SI_V2SI_COUNT:
    case V2SI_FTYPE_V2SI_SI_COUNT:
    case V1DI_FTYPE_V1DI_V1DI_COUNT:
    case V1DI_FTYPE_V1DI_SI_COUNT:
      nargs = 2;
      second_arg_count = true;
      break;
    case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
    case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
    case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
    case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
    case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
    case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
    case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
    case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
    case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
    case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
    case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
    case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
    case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
    case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
    case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
    case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
    case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
    case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
      nargs = 4;
      second_arg_count = true;
      break;
    case UINT64_FTYPE_UINT64_UINT64:
    case UINT_FTYPE_UINT_UINT:
    case UINT_FTYPE_UINT_USHORT:
    case UINT_FTYPE_UINT_UCHAR:
    case UINT16_FTYPE_UINT16_INT:
    case UINT8_FTYPE_UINT8_INT:
    case UQI_FTYPE_UQI_UQI:
    case UHI_FTYPE_UHI_UHI:
    case USI_FTYPE_USI_USI:
    case UDI_FTYPE_UDI_UDI:
    case V16SI_FTYPE_V8DF_V8DF:
    case V32HI_FTYPE_V16SF_V16SF:
    case V16HI_FTYPE_V8SF_V8SF:
    case V8HI_FTYPE_V4SF_V4SF:
    case V16HI_FTYPE_V16SF_UHI:
    case V8HI_FTYPE_V8SF_UQI:
    case V8HI_FTYPE_V4SF_UQI:
      nargs = 2;
      break;
    case V2DI_FTYPE_V2DI_INT_CONVERT:
      nargs = 2;
      rmode = V1TImode;
      nargs_constant = 1;
      break;
    case V4DI_FTYPE_V4DI_INT_CONVERT:
      nargs = 2;
      rmode = V2TImode;
      nargs_constant = 1;
      break;
    case V8DI_FTYPE_V8DI_INT_CONVERT:
      nargs = 2;
      rmode = V4TImode;
      nargs_constant = 1;
      break;
    case V8HI_FTYPE_V8HI_INT:
    case V8HI_FTYPE_V8SF_INT:
    case V16HI_FTYPE_V16SF_INT:
    case V8HI_FTYPE_V4SF_INT:
    case V8SF_FTYPE_V8SF_INT:
    case V4SF_FTYPE_V16SF_INT:
    case V16SF_FTYPE_V16SF_INT:
    case V4SI_FTYPE_V4SI_INT:
    case V4SI_FTYPE_V8SI_INT:
    case V4HI_FTYPE_V4HI_INT:
    case V4DF_FTYPE_V4DF_INT:
    case V4DF_FTYPE_V8DF_INT:
    case V4SF_FTYPE_V4SF_INT:
    case V4SF_FTYPE_V8SF_INT:
    case V2DI_FTYPE_V2DI_INT:
    case V2DF_FTYPE_V2DF_INT:
    case V2DF_FTYPE_V4DF_INT:
    case V16HI_FTYPE_V16HI_INT:
    case V8SI_FTYPE_V8SI_INT:
    case V16SI_FTYPE_V16SI_INT:
    case V4SI_FTYPE_V16SI_INT:
    case V4DI_FTYPE_V4DI_INT:
    case V2DI_FTYPE_V4DI_INT:
    case V4DI_FTYPE_V8DI_INT:
    case UQI_FTYPE_UQI_UQI_CONST:
    case UHI_FTYPE_UHI_UQI:
    case USI_FTYPE_USI_UQI:
    case UDI_FTYPE_UDI_UQI:
      nargs = 2;
      nargs_constant = 1;
      break;
    case V16QI_FTYPE_V16QI_V16QI_V16QI:
    case V8SF_FTYPE_V8SF_V8SF_V8SF:
    case V4DF_FTYPE_V4DF_V4DF_V4DF:
    case V4SF_FTYPE_V4SF_V4SF_V4SF:
    case V2DF_FTYPE_V2DF_V2DF_V2DF:
    case V32QI_FTYPE_V32QI_V32QI_V32QI:
    case UHI_FTYPE_V16SI_V16SI_UHI:
    case UQI_FTYPE_V8DI_V8DI_UQI:
    case V16HI_FTYPE_V16SI_V16HI_UHI:
    case V16QI_FTYPE_V16SI_V16QI_UHI:
    case V16QI_FTYPE_V8DI_V16QI_UQI:
    case V32HF_FTYPE_V32HF_V32HF_USI:
    case V16SF_FTYPE_V16SF_V16SF_UHI:
    case V16SF_FTYPE_V4SF_V16SF_UHI:
    case V16SI_FTYPE_SI_V16SI_UHI:
    case V16SI_FTYPE_V16HI_V16SI_UHI:
    case V16SI_FTYPE_V16QI_V16SI_UHI:
    case V8SF_FTYPE_V4SF_V8SF_UQI:
    case V4DF_FTYPE_V2DF_V4DF_UQI:
    case V8SI_FTYPE_V4SI_V8SI_UQI:
    case V8SI_FTYPE_SI_V8SI_UQI:
    case V4SI_FTYPE_V4SI_V4SI_UQI:
    case V4SI_FTYPE_SI_V4SI_UQI:
    case V4DI_FTYPE_V2DI_V4DI_UQI:
    case V4DI_FTYPE_DI_V4DI_UQI:
    case V2DI_FTYPE_V2DI_V2DI_UQI:
    case V2DI_FTYPE_DI_V2DI_UQI:
    case V64QI_FTYPE_V64QI_V64QI_UDI:
    case V64QI_FTYPE_V16QI_V64QI_UDI:
    case V64QI_FTYPE_QI_V64QI_UDI:
    case V32QI_FTYPE_V32QI_V32QI_USI:
    case V32QI_FTYPE_V16QI_V32QI_USI:
    case V32QI_FTYPE_QI_V32QI_USI:
    case V16QI_FTYPE_V16QI_V16QI_UHI:
    case V16QI_FTYPE_QI_V16QI_UHI:
    case V32HI_FTYPE_V8HI_V32HI_USI:
    case V32HI_FTYPE_HI_V32HI_USI:
    case V16HI_FTYPE_V8HI_V16HI_UHI:
    case V16HI_FTYPE_HI_V16HI_UHI:
    case V8HI_FTYPE_V8HI_V8HI_UQI:
    case V8HI_FTYPE_HI_V8HI_UQI:
    case V16HF_FTYPE_V16HF_V16HF_UHI:
    case V8SF_FTYPE_V8HI_V8SF_UQI:
    case V4SF_FTYPE_V8HI_V4SF_UQI:
    case V8SI_FTYPE_V8HF_V8SI_UQI:
    case V8SF_FTYPE_V8HF_V8SF_UQI:
    case V8SI_FTYPE_V8SF_V8SI_UQI:
    case V4SI_FTYPE_V4SF_V4SI_UQI:
    case V4SI_FTYPE_V8HF_V4SI_UQI:
    case V4SF_FTYPE_V8HF_V4SF_UQI:
    case V4DI_FTYPE_V8HF_V4DI_UQI:
    case V4DI_FTYPE_V4SF_V4DI_UQI:
    case V2DI_FTYPE_V8HF_V2DI_UQI:
    case V2DI_FTYPE_V4SF_V2DI_UQI:
    case V8HF_FTYPE_V8HF_V8HF_UQI:
    case V8HF_FTYPE_V8HF_V8HF_V8HF:
    case V8HF_FTYPE_V8HI_V8HF_UQI:
    case V8HF_FTYPE_V8SI_V8HF_UQI:
    case V8HF_FTYPE_V8SF_V8HF_UQI:
    case V8HF_FTYPE_V4SI_V8HF_UQI:
    case V8HF_FTYPE_V4SF_V8HF_UQI:
    case V8HF_FTYPE_V4DI_V8HF_UQI:
    case V8HF_FTYPE_V4DF_V8HF_UQI:
    case V8HF_FTYPE_V2DI_V8HF_UQI:
    case V8HF_FTYPE_V2DF_V8HF_UQI:
    case V4SF_FTYPE_V4DI_V4SF_UQI:
    case V4SF_FTYPE_V2DI_V4SF_UQI:
    case V4DF_FTYPE_V4DI_V4DF_UQI:
    case V4DF_FTYPE_V8HF_V4DF_UQI:
    case V2DF_FTYPE_V8HF_V2DF_UQI:
    case V2DF_FTYPE_V2DI_V2DF_UQI:
    case V16QI_FTYPE_V8HI_V16QI_UQI:
    case V16QI_FTYPE_V16HI_V16QI_UHI:
    case V16QI_FTYPE_V4SI_V16QI_UQI:
    case V16QI_FTYPE_V8SI_V16QI_UQI:
    case V8HI_FTYPE_V8HF_V8HI_UQI:
    case V8HI_FTYPE_V4SI_V8HI_UQI:
    case V8HI_FTYPE_V8SI_V8HI_UQI:
    case V16QI_FTYPE_V2DI_V16QI_UQI:
    case V16QI_FTYPE_V4DI_V16QI_UQI:
    case V8HI_FTYPE_V2DI_V8HI_UQI:
    case V8HI_FTYPE_V4DI_V8HI_UQI:
    case V4SI_FTYPE_V2DI_V4SI_UQI:
    case V4SI_FTYPE_V4DI_V4SI_UQI:
    case V32QI_FTYPE_V32HI_V32QI_USI:
    case UHI_FTYPE_V16QI_V16QI_UHI:
    case USI_FTYPE_V32QI_V32QI_USI:
    case UDI_FTYPE_V64QI_V64QI_UDI:
    case UQI_FTYPE_V8HI_V8HI_UQI:
    case UHI_FTYPE_V16HI_V16HI_UHI:
    case USI_FTYPE_V32HI_V32HI_USI:
    case UQI_FTYPE_V4SI_V4SI_UQI:
    case UQI_FTYPE_V8SI_V8SI_UQI:
    case UQI_FTYPE_V2DI_V2DI_UQI:
    case UQI_FTYPE_V4DI_V4DI_UQI:
    case V4SF_FTYPE_V2DF_V4SF_UQI:
    case V4SF_FTYPE_V4DF_V4SF_UQI:
    case V16SI_FTYPE_V16SI_V16SI_UHI:
    case V16SI_FTYPE_V4SI_V16SI_UHI:
    case V2DI_FTYPE_V4SI_V2DI_UQI:
    case V2DI_FTYPE_V8HI_V2DI_UQI:
    case V2DI_FTYPE_V16QI_V2DI_UQI:
    case V4DI_FTYPE_V4DI_V4DI_UQI:
    case V4DI_FTYPE_V4SI_V4DI_UQI:
    case V4DI_FTYPE_V8HI_V4DI_UQI:
    case V4DI_FTYPE_V16QI_V4DI_UQI:
    case V4DI_FTYPE_V4DF_V4DI_UQI:
    case V2DI_FTYPE_V2DF_V2DI_UQI:
    case V4SI_FTYPE_V4DF_V4SI_UQI:
    case V4SI_FTYPE_V2DF_V4SI_UQI:
    case V4SI_FTYPE_V8HI_V4SI_UQI:
    case V4SI_FTYPE_V16QI_V4SI_UQI:
    case V4DI_FTYPE_V4DI_V4DI_V4DI:
    case V8DF_FTYPE_V2DF_V8DF_UQI:
    case V8DF_FTYPE_V4DF_V8DF_UQI:
    case V8DF_FTYPE_V8DF_V8DF_UQI:
    case V8SF_FTYPE_V8SF_V8SF_UQI:
    case V8SF_FTYPE_V8SI_V8SF_UQI:
    case V4DF_FTYPE_V4DF_V4DF_UQI:
    case V4SF_FTYPE_V4SF_V4SF_UQI:
    case V2DF_FTYPE_V2DF_V2DF_UQI:
    case V2DF_FTYPE_V4SF_V2DF_UQI:
    case V2DF_FTYPE_V4SI_V2DF_UQI:
    case V4SF_FTYPE_V4SI_V4SF_UQI:
    case V4DF_FTYPE_V4SF_V4DF_UQI:
    case V4DF_FTYPE_V4SI_V4DF_UQI:
    case V8SI_FTYPE_V8SI_V8SI_UQI:
    case V8SI_FTYPE_V8HI_V8SI_UQI:
    case V8SI_FTYPE_V16QI_V8SI_UQI:
    case V8DF_FTYPE_V8SI_V8DF_UQI:
    case V8DI_FTYPE_DI_V8DI_UQI:
    case V16SF_FTYPE_V8SF_V16SF_UHI:
    case V16SI_FTYPE_V8SI_V16SI_UHI:
    case V16HF_FTYPE_V16HI_V16HF_UHI:
    case V16HF_FTYPE_V16HF_V16HF_V16HF:
    case V16HI_FTYPE_V16HF_V16HI_UHI:
    case V16HI_FTYPE_V16HI_V16HI_UHI:
    case V8HI_FTYPE_V16QI_V8HI_UQI:
    case V16HI_FTYPE_V16QI_V16HI_UHI:
    case V32HI_FTYPE_V32HI_V32HI_USI:
    case V32HI_FTYPE_V32QI_V32HI_USI:
    case V8DI_FTYPE_V16QI_V8DI_UQI:
    case V8DI_FTYPE_V2DI_V8DI_UQI:
    case V8DI_FTYPE_V4DI_V8DI_UQI:
    case V8DI_FTYPE_V8DI_V8DI_UQI:
    case V8DI_FTYPE_V8HI_V8DI_UQI:
    case V8DI_FTYPE_V8SI_V8DI_UQI:
    case V8HI_FTYPE_V8DI_V8HI_UQI:
    case V8SI_FTYPE_V8DI_V8SI_UQI:
    case V4SI_FTYPE_V4SI_V4SI_V4SI:
    case V16SI_FTYPE_V16SI_V16SI_V16SI:
    case V8DI_FTYPE_V8DI_V8DI_V8DI:
    case V32HI_FTYPE_V32HI_V32HI_V32HI:
    case V2DI_FTYPE_V2DI_V2DI_V2DI:
    case V16HI_FTYPE_V16HI_V16HI_V16HI:
    case V8SI_FTYPE_V8SI_V8SI_V8SI:
    case V8HI_FTYPE_V8HI_V8HI_V8HI:
    case V32HI_FTYPE_V16SF_V16SF_USI:
    case V16HI_FTYPE_V8SF_V8SF_UHI:
    case V8HI_FTYPE_V4SF_V4SF_UQI:
    case V16HI_FTYPE_V16SF_V16HI_UHI:
    case V8HI_FTYPE_V8SF_V8HI_UQI:
    case V8HI_FTYPE_V4SF_V8HI_UQI:
    case V16SF_FTYPE_V16SF_V32HI_V32HI:
    case V8SF_FTYPE_V8SF_V16HI_V16HI:
    case V4SF_FTYPE_V4SF_V8HI_V8HI:
      nargs = 3;
      break;
    case V32QI_FTYPE_V32QI_V32QI_INT:
    case V16HI_FTYPE_V16HI_V16HI_INT:
    case V16QI_FTYPE_V16QI_V16QI_INT:
    case V4DI_FTYPE_V4DI_V4DI_INT:
    case V8HI_FTYPE_V8HI_V8HI_INT:
    case V8SI_FTYPE_V8SI_V8SI_INT:
    case V8SI_FTYPE_V8SI_V4SI_INT:
    case V8SF_FTYPE_V8SF_V8SF_INT:
    case V8SF_FTYPE_V8SF_V4SF_INT:
    case V4SI_FTYPE_V4SI_V4SI_INT:
    case V4DF_FTYPE_V4DF_V4DF_INT:
    case V16SF_FTYPE_V16SF_V16SF_INT:
    case V16SF_FTYPE_V16SF_V4SF_INT:
    case V16SI_FTYPE_V16SI_V4SI_INT:
    case V4DF_FTYPE_V4DF_V2DF_INT:
    case V4SF_FTYPE_V4SF_V4SF_INT:
    case V2DI_FTYPE_V2DI_V2DI_INT:
    case V4DI_FTYPE_V4DI_V2DI_INT:
    case V2DF_FTYPE_V2DF_V2DF_INT:
    case UQI_FTYPE_V8DI_V8UDI_INT:
    case UQI_FTYPE_V8DF_V8DF_INT:
    case UQI_FTYPE_V2DF_V2DF_INT:
    case UQI_FTYPE_V4SF_V4SF_INT:
    case UHI_FTYPE_V16SI_V16SI_INT:
    case UHI_FTYPE_V16SF_V16SF_INT:
    case V64QI_FTYPE_V64QI_V64QI_INT:
    case V32HI_FTYPE_V32HI_V32HI_INT:
    case V16SI_FTYPE_V16SI_V16SI_INT:
    case V8DI_FTYPE_V8DI_V8DI_INT:
      nargs = 3;
      nargs_constant = 1;
      break;
    case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
      nargs = 3;
      rmode = V2TImode;
      nargs_constant = 1;
      break;
    case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
      nargs = 3;
      rmode = V1TImode;
      nargs_constant = 1;
      break;
    case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
      nargs = 3;
      rmode = V2DImode;
      nargs_constant = 1;
      break;
    case V2DI_FTYPE_V2DI_UINT_UINT:
      nargs = 3;
      nargs_constant = 2;
      break;
    case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
      nargs = 3;
      rmode = V4TImode;
      nargs_constant = 1;
      break;
    case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
      nargs = 5;
      rmode = V4TImode;
      mask_pos = 2;
      nargs_constant = 1;
      break;
    case QI_FTYPE_V8DF_INT_UQI:
    case QI_FTYPE_V4DF_INT_UQI:
    case QI_FTYPE_V2DF_INT_UQI:
    case HI_FTYPE_V16SF_INT_UHI:
    case QI_FTYPE_V8SF_INT_UQI:
    case QI_FTYPE_V4SF_INT_UQI:
    case QI_FTYPE_V8HF_INT_UQI:
    case HI_FTYPE_V16HF_INT_UHI:
    case SI_FTYPE_V32HF_INT_USI:
    case V4SI_FTYPE_V4SI_V4SI_UHI:
    case V8SI_FTYPE_V8SI_V8SI_UHI:
      nargs = 3;
      mask_pos = 1;
      nargs_constant = 1;
      break;
    case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
      nargs = 5;
      rmode = V2TImode;
      mask_pos = 2;
      nargs_constant = 1;
      break;
    case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
      nargs = 5;
      rmode = V1TImode;
      mask_pos = 2;
      nargs_constant = 1;
      break;
    case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
    case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
    case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
    case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
    case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
    case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
    case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
    case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
    case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
    case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
    case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
    case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
    case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
    case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
    case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
    case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
    case V32HF_FTYPE_V32HF_V32HF_V32HF_USI:
    case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
    case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
    case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
    case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
    case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
    case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
    case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
    case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
    case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
    case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
    case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
    case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
    case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
    case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
    case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
    case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
    case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
    case V16HF_FTYPE_V16HF_V16HF_V16HF_UQI:
    case V16HF_FTYPE_V16HF_V16HF_V16HF_UHI:
    case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
    case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
    case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
    case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
    case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
    case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
    case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
    case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI:
    case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
    case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
    case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
    case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
    case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
    case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
    case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
    case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
    case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
    case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
    case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
    case V32HI_FTYPE_V16SF_V16SF_V32HI_USI:
    case V16HI_FTYPE_V8SF_V8SF_V16HI_UHI:
    case V8HI_FTYPE_V4SF_V4SF_V8HI_UQI:
      nargs = 4;
      break;
    case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
    case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
    case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
    case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
    case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
      nargs = 4;
      nargs_constant = 1;
      break;
    case UQI_FTYPE_V4DI_V4DI_INT_UQI:
    case UQI_FTYPE_V8SI_V8SI_INT_UQI:
    case QI_FTYPE_V4DF_V4DF_INT_UQI:
    case QI_FTYPE_V8SF_V8SF_INT_UQI:
    case UHI_FTYPE_V16HF_V16HF_INT_UHI:
    case UQI_FTYPE_V2DI_V2DI_INT_UQI:
    case UQI_FTYPE_V4SI_V4SI_INT_UQI:
    case UQI_FTYPE_V2DF_V2DF_INT_UQI:
    case UQI_FTYPE_V4SF_V4SF_INT_UQI:
    case UQI_FTYPE_V8HF_V8HF_INT_UQI:
    case UDI_FTYPE_V64QI_V64QI_INT_UDI:
    case USI_FTYPE_V32QI_V32QI_INT_USI:
    case UHI_FTYPE_V16QI_V16QI_INT_UHI:
    case USI_FTYPE_V32HI_V32HI_INT_USI:
    case USI_FTYPE_V32HF_V32HF_INT_USI:
    case UHI_FTYPE_V16HI_V16HI_INT_UHI:
    case UQI_FTYPE_V8HI_V8HI_INT_UQI:
      nargs = 4;
      mask_pos = 1;
      nargs_constant = 1;
      break;
    case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
      nargs = 4;
      nargs_constant = 2;
      break;
    case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
    case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
    case V16SF_FTYPE_V16SF_V32HI_V32HI_UHI:
    case V8SF_FTYPE_V8SF_V16HI_V16HI_UQI:
    case V4SF_FTYPE_V4SF_V8HI_V8HI_UQI:
      nargs = 4;
      break;
    case UQI_FTYPE_V8DI_V8DI_INT_UQI:
    case UHI_FTYPE_V16SI_V16SI_INT_UHI:
      mask_pos = 1;
      nargs = 4;
      nargs_constant = 1;
      break;
    case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
    case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
    case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
    case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
    case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
    case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
    case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
    case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
    case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
    case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
    case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
    case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
    case V32HI_FTYPE_V32HI_INT_V32HI_USI:
    case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
    case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
    case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
    case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
    case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
    case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
    case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
    case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
    case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
    case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
    case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
    case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
    case V16HF_FTYPE_V16HF_INT_V16HF_UHI:
    case V8HF_FTYPE_V8HF_INT_V8HF_UQI:
    case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
    case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
    case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
    case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
    case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
      nargs = 4;
      mask_pos = 2;
      nargs_constant = 1;
      break;
    case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
    case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
    case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
    case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
    case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
    case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
    case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
    case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
    case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
    case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
    case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
    case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
    case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
    case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
    case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
    case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
    case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
    case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
    case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
    case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
    case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
    case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
    case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
    case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
    case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
    case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
    case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
      nargs = 5;
      mask_pos = 2;
      nargs_constant = 1;
      break;
    case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
    case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
    case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
    case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
    case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
    case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
    case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
    case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
    case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
    case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
      nargs = 5;
      mask_pos = 1;
      nargs_constant = 1;
      break;
    case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
    case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
    case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
    case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT:
    case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT:
    case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT:
    case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT:
    case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT:
    case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT:
    case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT:
    case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT:
    case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT:
      nargs = 5;
      mask_pos = 1;
      nargs_constant = 2;
      break;
    default:
      gcc_unreachable ();
    }
10944 gcc_assert (nargs
<= ARRAY_SIZE (xops
));
10946 if (comparison
!= UNKNOWN
)
10948 gcc_assert (nargs
== 2);
10949 return ix86_expand_sse_compare (d
, exp
, target
, swap
);
10952 if (rmode
== VOIDmode
|| rmode
== tmode
)
10956 || GET_MODE (target
) != tmode
10957 || !insn_p
->operand
[0].predicate (target
, tmode
))
10958 target
= gen_reg_rtx (tmode
);
10959 else if (memory_operand (target
, tmode
))
10961 real_target
= target
;
10965 real_target
= gen_reg_rtx (tmode
);
10966 target
= lowpart_subreg (rmode
, real_target
, tmode
);
10969 for (i
= 0; i
< nargs
; i
++)
10971 tree arg
= CALL_EXPR_ARG (exp
, i
);
10972 rtx op
= expand_normal (arg
);
10973 machine_mode mode
= insn_p
->operand
[i
+ 1].mode
;
10974 bool match
= insn_p
->operand
[i
+ 1].predicate (op
, mode
);
10976 if (second_arg_count
&& i
== 1)
10978 /* SIMD shift insns take either an 8-bit immediate or
10979 register as count. But builtin functions take int as
10980 count. If count doesn't match, we put it in register.
10981 The instructions are using 64-bit count, if op is just
10982 32-bit, zero-extend it, as negative shift counts
10983 are undefined behavior and zero-extension is more
10987 if (SCALAR_INT_MODE_P (GET_MODE (op
)))
10988 op
= convert_modes (mode
, GET_MODE (op
), op
, 1);
10990 op
= lowpart_subreg (mode
, op
, GET_MODE (op
));
10991 if (!insn_p
->operand
[i
+ 1].predicate (op
, mode
))
10992 op
= copy_to_reg (op
);
10995 else if ((mask_pos
&& (nargs
- i
- mask_pos
) == nargs_constant
) ||
10996 (!mask_pos
&& (nargs
- i
) <= nargs_constant
))
11001 case CODE_FOR_avx_vinsertf128v4di
:
11002 case CODE_FOR_avx_vextractf128v4di
:
11003 error ("the last argument must be an 1-bit immediate");
11006 case CODE_FOR_avx512f_cmpv8di3_mask
:
11007 case CODE_FOR_avx512f_cmpv16si3_mask
:
11008 case CODE_FOR_avx512f_ucmpv8di3_mask
:
11009 case CODE_FOR_avx512f_ucmpv16si3_mask
:
11010 case CODE_FOR_avx512vl_cmpv4di3_mask
:
11011 case CODE_FOR_avx512vl_cmpv8si3_mask
:
11012 case CODE_FOR_avx512vl_ucmpv4di3_mask
:
11013 case CODE_FOR_avx512vl_ucmpv8si3_mask
:
11014 case CODE_FOR_avx512vl_cmpv2di3_mask
:
11015 case CODE_FOR_avx512vl_cmpv4si3_mask
:
11016 case CODE_FOR_avx512vl_ucmpv2di3_mask
:
11017 case CODE_FOR_avx512vl_ucmpv4si3_mask
:
11018 error ("the last argument must be a 3-bit immediate");
11021 case CODE_FOR_sse4_1_roundsd
:
11022 case CODE_FOR_sse4_1_roundss
:
11024 case CODE_FOR_sse4_1_roundpd
:
11025 case CODE_FOR_sse4_1_roundps
:
11026 case CODE_FOR_avx_roundpd256
:
11027 case CODE_FOR_avx_roundps256
:
11029 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix
:
11030 case CODE_FOR_sse4_1_roundps_sfix
:
11031 case CODE_FOR_avx_roundpd_vec_pack_sfix256
:
11032 case CODE_FOR_avx_roundps_sfix256
:
11034 case CODE_FOR_sse4_1_blendps
:
11035 case CODE_FOR_avx_blendpd256
:
11036 case CODE_FOR_avx_vpermilv4df
:
11037 case CODE_FOR_avx_vpermilv4df_mask
:
11038 case CODE_FOR_avx512f_getmantv8df_mask
:
11039 case CODE_FOR_avx512f_getmantv16sf_mask
:
11040 case CODE_FOR_avx512vl_getmantv16hf_mask
:
11041 case CODE_FOR_avx512vl_getmantv8sf_mask
:
11042 case CODE_FOR_avx512vl_getmantv4df_mask
:
11043 case CODE_FOR_avx512fp16_getmantv8hf_mask
:
11044 case CODE_FOR_avx512vl_getmantv4sf_mask
:
11045 case CODE_FOR_avx512vl_getmantv2df_mask
:
11046 case CODE_FOR_avx512dq_rangepv8df_mask_round
:
11047 case CODE_FOR_avx512dq_rangepv16sf_mask_round
:
11048 case CODE_FOR_avx512dq_rangepv4df_mask
:
11049 case CODE_FOR_avx512dq_rangepv8sf_mask
:
11050 case CODE_FOR_avx512dq_rangepv2df_mask
:
11051 case CODE_FOR_avx512dq_rangepv4sf_mask
:
11052 case CODE_FOR_avx_shufpd256_mask
:
11053 error ("the last argument must be a 4-bit immediate");
11056 case CODE_FOR_sha1rnds4
:
11057 case CODE_FOR_sse4_1_blendpd
:
11058 case CODE_FOR_avx_vpermilv2df
:
11059 case CODE_FOR_avx_vpermilv2df_mask
:
11060 case CODE_FOR_xop_vpermil2v2df3
:
11061 case CODE_FOR_xop_vpermil2v4sf3
:
11062 case CODE_FOR_xop_vpermil2v4df3
:
11063 case CODE_FOR_xop_vpermil2v8sf3
:
11064 case CODE_FOR_avx512f_vinsertf32x4_mask
:
11065 case CODE_FOR_avx512f_vinserti32x4_mask
:
11066 case CODE_FOR_avx512f_vextractf32x4_mask
:
11067 case CODE_FOR_avx512f_vextracti32x4_mask
:
11068 case CODE_FOR_sse2_shufpd
:
11069 case CODE_FOR_sse2_shufpd_mask
:
11070 case CODE_FOR_avx512dq_shuf_f64x2_mask
:
11071 case CODE_FOR_avx512dq_shuf_i64x2_mask
:
11072 case CODE_FOR_avx512vl_shuf_i32x4_mask
:
11073 case CODE_FOR_avx512vl_shuf_f32x4_mask
:
11074 error ("the last argument must be a 2-bit immediate");
11077 case CODE_FOR_avx_vextractf128v4df
:
11078 case CODE_FOR_avx_vextractf128v8sf
:
11079 case CODE_FOR_avx_vextractf128v8si
:
11080 case CODE_FOR_avx_vinsertf128v4df
:
11081 case CODE_FOR_avx_vinsertf128v8sf
:
11082 case CODE_FOR_avx_vinsertf128v8si
:
11083 case CODE_FOR_avx512f_vinsertf64x4_mask
:
11084 case CODE_FOR_avx512f_vinserti64x4_mask
:
11085 case CODE_FOR_avx512f_vextractf64x4_mask
:
11086 case CODE_FOR_avx512f_vextracti64x4_mask
:
11087 case CODE_FOR_avx512dq_vinsertf32x8_mask
:
11088 case CODE_FOR_avx512dq_vinserti32x8_mask
:
11089 case CODE_FOR_avx512vl_vinsertv4df
:
11090 case CODE_FOR_avx512vl_vinsertv4di
:
11091 case CODE_FOR_avx512vl_vinsertv8sf
:
11092 case CODE_FOR_avx512vl_vinsertv8si
:
11093 error ("the last argument must be a 1-bit immediate");
11096 case CODE_FOR_avx_vmcmpv2df3
:
11097 case CODE_FOR_avx_vmcmpv4sf3
:
11098 case CODE_FOR_avx_cmpv2df3
:
11099 case CODE_FOR_avx_cmpv4sf3
:
11100 case CODE_FOR_avx_cmpv4df3
:
11101 case CODE_FOR_avx_cmpv8sf3
:
11102 case CODE_FOR_avx512f_cmpv8df3_mask
:
11103 case CODE_FOR_avx512f_cmpv16sf3_mask
:
11104 case CODE_FOR_avx512f_vmcmpv2df3_mask
:
11105 case CODE_FOR_avx512f_vmcmpv4sf3_mask
:
11106 case CODE_FOR_avx512bw_cmpv32hf3_mask
:
11107 case CODE_FOR_avx512vl_cmpv16hf3_mask
:
11108 case CODE_FOR_avx512fp16_cmpv8hf3_mask
:
11109 error ("the last argument must be a 5-bit immediate");
11113 switch (nargs_constant
)
11116 if ((mask_pos
&& (nargs
- i
- mask_pos
) == nargs_constant
) ||
11117 (!mask_pos
&& (nargs
- i
) == nargs_constant
))
11119 error ("the next to last argument must be an 8-bit immediate");
11124 error ("the last argument must be an 8-bit immediate");
11127 gcc_unreachable ();
11134 if (VECTOR_MODE_P (mode
))
11135 op
= safe_vector_operand (op
, mode
);
11137 /* If we aren't optimizing, only allow one memory operand to
11139 if (memory_operand (op
, mode
))
11142 op
= fixup_modeless_constant (op
, mode
);
11144 if (GET_MODE (op
) == mode
|| GET_MODE (op
) == VOIDmode
)
11146 if (optimize
|| !match
|| num_memory
> 1)
11147 op
= copy_to_mode_reg (mode
, op
);
11151 op
= copy_to_reg (op
);
11152 op
= lowpart_subreg (mode
, op
, GET_MODE (op
));
11162 pat
= GEN_FCN (icode
) (real_target
, xops
[0]);
11165 pat
= GEN_FCN (icode
) (real_target
, xops
[0], xops
[1]);
11168 pat
= GEN_FCN (icode
) (real_target
, xops
[0], xops
[1], xops
[2]);
11171 pat
= GEN_FCN (icode
) (real_target
, xops
[0], xops
[1],
11175 pat
= GEN_FCN (icode
) (real_target
, xops
[0], xops
[1],
11176 xops
[2], xops
[3], xops
[4]);
11179 pat
= GEN_FCN (icode
) (real_target
, xops
[0], xops
[1],
11180 xops
[2], xops
[3], xops
[4], xops
[5]);
11183 gcc_unreachable ();
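/* For example, a builtin of type V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI is
   classified above as nargs = 5, mask_pos = 2, nargs_constant = 1, so in
   the operand loop the test (nargs - i - mask_pos) == nargs_constant
   fires exactly at i = 2, the INT operand, which must then satisfy the
   immediate-width check for the insn.  */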
/* Transform a pattern of the following layout:
     (set A
	  (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
   into:
     (set A B)  */

static rtx
ix86_erase_embedded_rounding (rtx pat)
{
  if (GET_CODE (pat) == INSN)
    pat = PATTERN (pat);

  gcc_assert (GET_CODE (pat) == SET);
  rtx src = SET_SRC (pat);
  gcc_assert (XVECLEN (src, 0) == 2);
  rtx p0 = XVECEXP (src, 0, 0);
  gcc_assert (GET_CODE (src) == UNSPEC
	      && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);

  rtx res = gen_rtx_SET (SET_DEST (pat), p0);
  return res;
}
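/* For example, a set of the form

     (set (reg) (unspec [(compare ...) (const_int rounding)]
		UNSPEC_EMBEDDED_ROUNDING))

   is rewritten into

     (set (reg) (compare ...))

   dropping the now-redundant rounding operand.  */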
/* Subroutine of ix86_expand_round_builtin to take care of comi insns
   with embedded rounding.  */

static rtx
ix86_expand_sse_comi_round (const struct builtin_description *d,
			    tree exp, rtx target)
{
  rtx pat, set_dst;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  tree arg2 = CALL_EXPR_ARG (exp, 2);
  tree arg3 = CALL_EXPR_ARG (exp, 3);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  rtx op2 = expand_normal (arg2);
  rtx op3 = expand_normal (arg3);
  enum insn_code icode = d->icode;
  const struct insn_data_d *insn_p = &insn_data[icode];
  machine_mode mode0 = insn_p->operand[0].mode;
  machine_mode mode1 = insn_p->operand[1].mode;

  /* See avxintrin.h for values.  */
  static const enum rtx_code comparisons[32] =
    {
      EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
      UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED,
      EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
      UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED
    };
  static const bool ordereds[32] =
    {
      true,  true,  true,  false, false, false, false, true,
      false, false, false, true,  true,  true,  true,  false,
      true,  true,  true,  false, false, false, false, true,
      false, false, false, true,  true,  true,  true,  false
    };
  static const bool non_signalings[32] =
    {
      true,  false, false, true,  true,  false, false, true,
      true,  false, false, true,  true,  false, false, true,
      false, true,  true,  false, false, true,  true,  false,
      false, true,  true,  false, false, true,  true,  false
    };

  if (!CONST_INT_P (op2))
    {
      error ("the third argument must be a comparison constant");
      return const0_rtx;
    }
  if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
    {
      error ("incorrect comparison mode");
      return const0_rtx;
    }

  if (!insn_p->operand[2].predicate (op3, SImode))
    {
      error ("incorrect rounding operand");
      return const0_rtx;
    }

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);
  if (VECTOR_MODE_P (mode1))
    op1 = safe_vector_operand (op1, mode1);

  enum rtx_code comparison = comparisons[INTVAL (op2)];
  bool ordered = ordereds[INTVAL (op2)];
  bool non_signaling = non_signalings[INTVAL (op2)];
  rtx const_val = const0_rtx;

  bool check_unordered = false;
  machine_mode mode = CCFPmode;
  switch (comparison)
    {
    case ORDERED:
      if (!ordered)
	{
	  /* NB: Use CCSmode/NE for _CMP_TRUE_UQ/_CMP_TRUE_US.  */
	  if (!non_signaling)
	    ordered = true;
	  mode = CCSmode;
	}
      else
	{
	  /* NB: Use CCPmode/NE for _CMP_ORD_Q/_CMP_ORD_S.  */
	  if (!non_signaling)
	    ordered = false;
	  mode = CCPmode;
	}
      comparison = NE;
      break;
    case UNORDERED:
      if (ordered)
	{
	  /* NB: Use CCSmode/EQ for _CMP_FALSE_OQ/_CMP_FALSE_OS.  */
	  if (!non_signaling)
	    ordered = false;
	  mode = CCSmode;
	}
      else
	{
	  /* NB: Use CCPmode/NE for _CMP_UNORD_Q/_CMP_UNORD_S.  */
	  if (!non_signaling)
	    ordered = true;
	  mode = CCPmode;
	}
      comparison = EQ;
      break;

    case LE:	/* -> GE  */
    case LT:	/* -> GT  */
    case UNGE:	/* -> UNLE  */
    case UNGT:	/* -> UNLT  */
      std::swap (op0, op1);
      comparison = swap_condition (comparison);
      /* FALLTHRU */
    case GT:
    case GE:
    case UNEQ:
    case UNLT:
    case UNLE:
    case LTGT:
      /* These are supported by CCFPmode.  NB: Use ordered/signaling
	 COMI or unordered/non-signaling UCOMI.  Both set ZF, PF, CF
	 with NAN operands.  */
      if (ordered == non_signaling)
	ordered = !ordered;
      break;

    case EQ:
      /* NB: COMI/UCOMI will set ZF with NAN operands.  Use CCZmode for
	 _CMP_EQ_OQ/_CMP_EQ_OS.  */
      check_unordered = true;
      mode = CCZmode;
      break;
    case NE:
      /* NB: COMI/UCOMI will set ZF with NAN operands.  Use CCZmode for
	 _CMP_NEQ_UQ/_CMP_NEQ_US.  */
      gcc_assert (!ordered);
      check_unordered = true;
      mode = CCZmode;
      const_val = const1_rtx;
      break;
    default:
      gcc_unreachable ();
    }

  target = gen_reg_rtx (SImode);
  emit_move_insn (target, const_val);
  target = gen_rtx_SUBREG (QImode, target, 0);

  if ((optimize && !register_operand (op0, mode0))
      || !insn_p->operand[0].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
  if ((optimize && !register_operand (op1, mode1))
      || !insn_p->operand[1].predicate (op1, mode1))
    op1 = copy_to_mode_reg (mode1, op1);

  /* There are two kinds of comparison:
     1. COMI: ordered and signaling.
     2. UCOMI: unordered and non-signaling.  */
  if (non_signaling)
    icode = (icode == CODE_FOR_sse_comi_round
	     ? CODE_FOR_sse_ucomi_round
	     : CODE_FOR_sse2_ucomi_round);

  pat = GEN_FCN (icode) (op0, op1, op3);
  if (! pat)
    return 0;

  /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point.  */
  if (INTVAL (op3) == NO_ROUND)
    {
      pat = ix86_erase_embedded_rounding (pat);
      if (! pat)
	return 0;

      set_dst = SET_DEST (pat);
    }
  else
    {
      gcc_assert (GET_CODE (pat) == SET);
      set_dst = SET_DEST (pat);
    }

  emit_insn (pat);

  rtx_code_label *label = NULL;

  /* NB: For ordered EQ or unordered NE, check ZF alone isn't sufficient
     with NAN operands.  */
  if (check_unordered)
    {
      gcc_assert (comparison == EQ || comparison == NE);

      rtx flag = gen_rtx_REG (CCFPmode, FLAGS_REG);
      label = gen_label_rtx ();
      rtx tmp = gen_rtx_fmt_ee (UNORDERED, VOIDmode, flag, const0_rtx);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
				  gen_rtx_LABEL_REF (VOIDmode, label),
				  pc_rtx);
      emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
    }

  /* NB: Set CCFPmode and check a different CCmode which is in a subset
     of CCFPmode.  */
  if (GET_MODE (set_dst) != mode)
    {
      gcc_assert (mode == CCAmode || mode == CCCmode
		  || mode == CCOmode || mode == CCPmode
		  || mode == CCSmode || mode == CCZmode);
      set_dst = gen_rtx_REG (mode, FLAGS_REG);
    }

  emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
			  gen_rtx_fmt_ee (comparison, QImode,
					  set_dst,
					  const0_rtx)));

  if (label)
    emit_label (label);

  return SUBREG_REG (target);
}
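/* For the ordered-EQ case the sequence emitted above is therefore
   roughly

	comis[ds]  op1, op0
	jp	.Lunord		; unordered: keep the preloaded 0
	sete	%al
     .Lunord:

   which is why TARGET is seeded with CONST_VAL before the compare.  */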
static rtx
ix86_expand_round_builtin (const struct builtin_description *d,
			   tree exp, rtx target)
{
  rtx pat;
  unsigned int i, nargs;
  rtx xops[6];
  enum insn_code icode = d->icode;
  const struct insn_data_d *insn_p = &insn_data[icode];
  machine_mode tmode = insn_p->operand[0].mode;
  unsigned int nargs_constant = 0;
  unsigned int redundant_embed_rnd = 0;

  switch ((enum ix86_builtin_func_type) d->flag)
    {
    case UINT64_FTYPE_V2DF_INT:
    case UINT64_FTYPE_V4SF_INT:
    case UINT64_FTYPE_V8HF_INT:
    case UINT_FTYPE_V2DF_INT:
    case UINT_FTYPE_V4SF_INT:
    case UINT_FTYPE_V8HF_INT:
    case INT64_FTYPE_V2DF_INT:
    case INT64_FTYPE_V4SF_INT:
    case INT64_FTYPE_V8HF_INT:
    case INT_FTYPE_V2DF_INT:
    case INT_FTYPE_V4SF_INT:
    case INT_FTYPE_V8HF_INT:
      nargs = 2;
      break;
    case V32HF_FTYPE_V32HF_V32HF_INT:
    case V8HF_FTYPE_V8HF_V8HF_INT:
    case V8HF_FTYPE_V8HF_INT_INT:
    case V8HF_FTYPE_V8HF_UINT_INT:
    case V8HF_FTYPE_V8HF_INT64_INT:
    case V8HF_FTYPE_V8HF_UINT64_INT:
    case V4SF_FTYPE_V4SF_UINT_INT:
    case V4SF_FTYPE_V4SF_UINT64_INT:
    case V2DF_FTYPE_V2DF_UINT64_INT:
    case V4SF_FTYPE_V4SF_INT_INT:
    case V4SF_FTYPE_V4SF_INT64_INT:
    case V2DF_FTYPE_V2DF_INT64_INT:
    case V4SF_FTYPE_V4SF_V4SF_INT:
    case V2DF_FTYPE_V2DF_V2DF_INT:
    case V4SF_FTYPE_V4SF_V2DF_INT:
    case V2DF_FTYPE_V2DF_V4SF_INT:
      nargs = 3;
      break;
    case V8SF_FTYPE_V8DF_V8SF_QI_INT:
    case V8DF_FTYPE_V8DF_V8DF_QI_INT:
    case V32HI_FTYPE_V32HF_V32HI_USI_INT:
    case V8SI_FTYPE_V8DF_V8SI_QI_INT:
    case V8DI_FTYPE_V8HF_V8DI_UQI_INT:
    case V8DI_FTYPE_V8DF_V8DI_QI_INT:
    case V8SF_FTYPE_V8DI_V8SF_QI_INT:
    case V8DF_FTYPE_V8DI_V8DF_QI_INT:
    case V8DF_FTYPE_V8HF_V8DF_UQI_INT:
    case V16SF_FTYPE_V16HF_V16SF_UHI_INT:
    case V32HF_FTYPE_V32HI_V32HF_USI_INT:
    case V32HF_FTYPE_V32HF_V32HF_USI_INT:
    case V32HF_FTYPE_V32HF_V32HF_V32HF_INT:
    case V16SF_FTYPE_V16SF_V16SF_HI_INT:
    case V8DI_FTYPE_V8SF_V8DI_QI_INT:
    case V16SF_FTYPE_V16SI_V16SF_HI_INT:
    case V16SI_FTYPE_V16SF_V16SI_HI_INT:
    case V16SI_FTYPE_V16HF_V16SI_UHI_INT:
    case V16HF_FTYPE_V16SI_V16HF_UHI_INT:
    case V8DF_FTYPE_V8SF_V8DF_QI_INT:
    case V16SF_FTYPE_V16HI_V16SF_HI_INT:
    case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
    case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
    case V8HF_FTYPE_V8DI_V8HF_UQI_INT:
    case V8HF_FTYPE_V8DF_V8HF_UQI_INT:
    case V16HF_FTYPE_V16SF_V16HF_UHI_INT:
    case V8HF_FTYPE_V8HF_V8HF_V8HF_INT:
      nargs = 4;
      break;
    case V4SF_FTYPE_V4SF_V4SF_INT_INT:
    case V2DF_FTYPE_V2DF_V2DF_INT_INT:
      nargs_constant = 2;
      nargs = 4;
      break;
    case INT_FTYPE_V4SF_V4SF_INT_INT:
    case INT_FTYPE_V2DF_V2DF_INT_INT:
      return ix86_expand_sse_comi_round (d, exp, target);
    case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
    case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
    case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
    case V4SF_FTYPE_V8HF_V4SF_V4SF_UQI_INT:
    case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
    case V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT:
    case V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT:
    case V2DF_FTYPE_V8HF_V2DF_V2DF_UQI_INT:
    case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
    case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
    case V2DF_FTYPE_V2DF_V4SF_V2DF_UQI_INT:
    case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
    case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
    case V4SF_FTYPE_V4SF_V2DF_V4SF_UQI_INT:
    case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT:
    case V8HF_FTYPE_V2DF_V8HF_V8HF_UQI_INT:
    case V8HF_FTYPE_V4SF_V8HF_V8HF_UQI_INT:
      nargs = 5;
      break;
    case V32HF_FTYPE_V32HF_INT_V32HF_USI_INT:
    case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
    case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
    case V8DF_FTYPE_V8DF_INT_V8DF_UQI_INT:
    case V16SF_FTYPE_V16SF_INT_V16SF_UHI_INT:
      nargs_constant = 4;
      nargs = 5;
      break;
    case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
    case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
    case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
    case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
    case USI_FTYPE_V32HF_V32HF_INT_USI_INT:
    case UQI_FTYPE_V8HF_V8HF_INT_UQI_INT:
      nargs_constant = 3;
      nargs = 5;
      break;
    case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
    case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
    case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
    case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
    case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
    case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
    case V8HF_FTYPE_V8HF_V8HF_INT_V8HF_UQI_INT:
      nargs = 6;
      nargs_constant = 4;
      break;
    case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
    case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
    case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
    case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
      nargs = 6;
      nargs_constant = 3;
      break;
    default:
      gcc_unreachable ();
    }
  gcc_assert (nargs <= ARRAY_SIZE (xops));

  if (optimize
      || target == 0
      || GET_MODE (target) != tmode
      || !insn_p->operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);

  for (i = 0; i < nargs; i++)
    {
      tree arg = CALL_EXPR_ARG (exp, i);
      rtx op = expand_normal (arg);
      machine_mode mode = insn_p->operand[i + 1].mode;
      bool match = insn_p->operand[i + 1].predicate (op, mode);

      if (i == nargs - nargs_constant)
	{
	  if (!match)
	    {
	      switch (icode)
		{
		case CODE_FOR_avx512f_getmantv8df_mask_round:
		case CODE_FOR_avx512f_getmantv16sf_mask_round:
		case CODE_FOR_avx512bw_getmantv32hf_mask_round:
		case CODE_FOR_avx512f_vgetmantv2df_round:
		case CODE_FOR_avx512f_vgetmantv2df_mask_round:
		case CODE_FOR_avx512f_vgetmantv4sf_round:
		case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
		case CODE_FOR_avx512f_vgetmantv8hf_mask_round:
		  error ("the immediate argument must be a 4-bit immediate");
		  return const0_rtx;
		case CODE_FOR_avx512f_cmpv8df3_mask_round:
		case CODE_FOR_avx512f_cmpv16sf3_mask_round:
		case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
		case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
		case CODE_FOR_avx512f_vmcmpv8hf3_mask_round:
		case CODE_FOR_avx512bw_cmpv32hf3_mask_round:
		  error ("the immediate argument must be a 5-bit immediate");
		  return const0_rtx;
		default:
		  error ("the immediate argument must be an 8-bit immediate");
		  return const0_rtx;
		}
	    }
	}
      else if (i == nargs - 1)
	{
	  if (!insn_p->operand[nargs].predicate (op, SImode))
	    {
	      error ("incorrect rounding operand");
	      return const0_rtx;
	    }

	  /* If there is no rounding use normal version of the pattern.  */
	  if (INTVAL (op) == NO_ROUND)
	    {
	      /* Skip erasing embedded rounding for the expanders below
		 that generate multiple insns.  In ix86_erase_embedded_rounding
		 the pattern will be transformed to a single set, and emit_insn
		 appends the set instead of inserting it into the chain, so
		 the insns emitted inside the define_expand would be
		 ignored.  */
	      switch (icode)
		{
		case CODE_FOR_avx512bw_fmaddc_v32hf_mask1_round:
		case CODE_FOR_avx512bw_fcmaddc_v32hf_mask1_round:
		case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask1_round:
		case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask1_round:
		case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask3_round:
		case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask3_round:
		  redundant_embed_rnd = 0;
		  break;
		default:
		  redundant_embed_rnd = 1;
		  break;
		}
	    }
	}
      else
	{
	  if (VECTOR_MODE_P (mode))
	    op = safe_vector_operand (op, mode);

	  op = fixup_modeless_constant (op, mode);

	  if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
	    {
	      if (optimize || !match)
		op = copy_to_mode_reg (mode, op);
	    }
	  else
	    {
	      op = copy_to_reg (op);
	      op = lowpart_subreg (mode, op, GET_MODE (op));
	    }
	}

      xops[i] = op;
    }

  switch (nargs)
    {
    case 1:
      pat = GEN_FCN (icode) (target, xops[0]);
      break;
    case 2:
      pat = GEN_FCN (icode) (target, xops[0], xops[1]);
      break;
    case 3:
      pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
      break;
    case 4:
      pat = GEN_FCN (icode) (target, xops[0], xops[1],
			     xops[2], xops[3]);
      break;
    case 5:
      pat = GEN_FCN (icode) (target, xops[0], xops[1],
			     xops[2], xops[3], xops[4]);
      break;
    case 6:
      pat = GEN_FCN (icode) (target, xops[0], xops[1],
			     xops[2], xops[3], xops[4], xops[5]);
      break;
    default:
      gcc_unreachable ();
    }

  if (!pat)
    return 0;

  if (redundant_embed_rnd)
    pat = ix86_erase_embedded_rounding (pat);

  emit_insn (pat);
  return target;
}
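/* NB: the last argument of every *_round builtin handled above is the
   rounding immediate.  When it is NO_ROUND (the value the intrinsics
   pass for _MM_FROUND_CUR_DIRECTION), the embedded-rounding unspec is
   erased again, so the builtin expands to the normal, non-rounding
   form of the pattern.  */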
/* Subroutine of ix86_expand_builtin to take care of special insns
   with variable number of operands.  */

static rtx
ix86_expand_special_args_builtin (const struct builtin_description *d,
				  tree exp, rtx target)
{
  tree arg;
  rtx pat, op;
  unsigned int i, nargs, arg_adjust, memory;
  bool aligned_mem = false;
  rtx xops[3];
  enum insn_code icode = d->icode;
  const struct insn_data_d *insn_p = &insn_data[icode];
  machine_mode tmode = insn_p->operand[0].mode;
  enum { load, store } klass;

  switch ((enum ix86_builtin_func_type) d->flag)
    {
    case VOID_FTYPE_VOID:
      emit_insn (GEN_FCN (icode) (target));
      return 0;
    case VOID_FTYPE_UINT64:
    case VOID_FTYPE_UNSIGNED:
      nargs = 0;
      klass = store;
      memory = 0;
      break;

    case INT_FTYPE_VOID:
    case USHORT_FTYPE_VOID:
    case UINT64_FTYPE_VOID:
    case UINT_FTYPE_VOID:
    case UINT8_FTYPE_VOID:
    case UNSIGNED_FTYPE_VOID:
      nargs = 0;
      klass = load;
      memory = 0;
      break;
    case UINT64_FTYPE_PUNSIGNED:
    case V2DI_FTYPE_PV2DI:
    case V4DI_FTYPE_PV4DI:
    case V32QI_FTYPE_PCCHAR:
    case V16QI_FTYPE_PCCHAR:
    case V8SF_FTYPE_PCV4SF:
    case V8SF_FTYPE_PCFLOAT:
    case V4SF_FTYPE_PCFLOAT:
    case V4DF_FTYPE_PCV2DF:
    case V4DF_FTYPE_PCDOUBLE:
    case V2DF_FTYPE_PCDOUBLE:
    case VOID_FTYPE_PVOID:
    case V8DI_FTYPE_PV8DI:
      nargs = 1;
      klass = load;
      memory = 0;
      switch (icode)
	{
	case CODE_FOR_sse4_1_movntdqa:
	case CODE_FOR_avx2_movntdqa:
	case CODE_FOR_avx512f_movntdqa:
	  aligned_mem = true;
	  break;
	default:
	  break;
	}
      break;
    case VOID_FTYPE_PV2SF_V4SF:
    case VOID_FTYPE_PV8DI_V8DI:
    case VOID_FTYPE_PV4DI_V4DI:
    case VOID_FTYPE_PV2DI_V2DI:
    case VOID_FTYPE_PCHAR_V32QI:
    case VOID_FTYPE_PCHAR_V16QI:
    case VOID_FTYPE_PFLOAT_V16SF:
    case VOID_FTYPE_PFLOAT_V8SF:
    case VOID_FTYPE_PFLOAT_V4SF:
    case VOID_FTYPE_PDOUBLE_V8DF:
    case VOID_FTYPE_PDOUBLE_V4DF:
    case VOID_FTYPE_PDOUBLE_V2DF:
    case VOID_FTYPE_PLONGLONG_LONGLONG:
    case VOID_FTYPE_PULONGLONG_ULONGLONG:
    case VOID_FTYPE_PUNSIGNED_UNSIGNED:
    case VOID_FTYPE_PINT_INT:
      nargs = 1;
      klass = store;
      /* Reserve memory operand for target.  */
      memory = ARRAY_SIZE (xops);
      switch (icode)
	{
	/* These builtins and instructions require the memory
	   to be properly aligned.  */
	case CODE_FOR_avx_movntv4di:
	case CODE_FOR_sse2_movntv2di:
	case CODE_FOR_avx_movntv8sf:
	case CODE_FOR_sse_movntv4sf:
	case CODE_FOR_sse4a_vmmovntv4sf:
	case CODE_FOR_avx_movntv4df:
	case CODE_FOR_sse2_movntv2df:
	case CODE_FOR_sse4a_vmmovntv2df:
	case CODE_FOR_sse2_movntidi:
	case CODE_FOR_sse_movntq:
	case CODE_FOR_sse2_movntisi:
	case CODE_FOR_avx512f_movntv16sf:
	case CODE_FOR_avx512f_movntv8df:
	case CODE_FOR_avx512f_movntv8di:
	  aligned_mem = true;
	  break;
	default:
	  break;
	}
      break;
    case VOID_FTYPE_PVOID_PCVOID:
      nargs = 1;
      klass = store;
      memory = 0;
      break;
    case V4SF_FTYPE_V4SF_PCV2SF:
    case V2DF_FTYPE_V2DF_PCDOUBLE:
      nargs = 2;
      klass = load;
      memory = 1;
      break;
    case V8SF_FTYPE_PCV8SF_V8SI:
    case V4DF_FTYPE_PCV4DF_V4DI:
    case V4SF_FTYPE_PCV4SF_V4SI:
    case V2DF_FTYPE_PCV2DF_V2DI:
    case V8SI_FTYPE_PCV8SI_V8SI:
    case V4DI_FTYPE_PCV4DI_V4DI:
    case V4SI_FTYPE_PCV4SI_V4SI:
    case V2DI_FTYPE_PCV2DI_V2DI:
    case VOID_FTYPE_INT_INT64:
      nargs = 2;
      klass = load;
      memory = 0;
      break;
    case VOID_FTYPE_PV8DF_V8DF_UQI:
    case VOID_FTYPE_PV4DF_V4DF_UQI:
    case VOID_FTYPE_PV2DF_V2DF_UQI:
    case VOID_FTYPE_PV16SF_V16SF_UHI:
    case VOID_FTYPE_PV8SF_V8SF_UQI:
    case VOID_FTYPE_PV4SF_V4SF_UQI:
    case VOID_FTYPE_PV8DI_V8DI_UQI:
    case VOID_FTYPE_PV4DI_V4DI_UQI:
    case VOID_FTYPE_PV2DI_V2DI_UQI:
    case VOID_FTYPE_PV16SI_V16SI_UHI:
    case VOID_FTYPE_PV8SI_V8SI_UQI:
    case VOID_FTYPE_PV4SI_V4SI_UQI:
    case VOID_FTYPE_PV64QI_V64QI_UDI:
    case VOID_FTYPE_PV32HI_V32HI_USI:
    case VOID_FTYPE_PV32QI_V32QI_USI:
    case VOID_FTYPE_PV16QI_V16QI_UHI:
    case VOID_FTYPE_PV16HI_V16HI_UHI:
    case VOID_FTYPE_PV8HI_V8HI_UQI:
      switch (icode)
	{
	/* These builtins and instructions require the memory
	   to be properly aligned.  */
	case CODE_FOR_avx512f_storev16sf_mask:
	case CODE_FOR_avx512f_storev16si_mask:
	case CODE_FOR_avx512f_storev8df_mask:
	case CODE_FOR_avx512f_storev8di_mask:
	case CODE_FOR_avx512vl_storev8sf_mask:
	case CODE_FOR_avx512vl_storev8si_mask:
	case CODE_FOR_avx512vl_storev4df_mask:
	case CODE_FOR_avx512vl_storev4di_mask:
	case CODE_FOR_avx512vl_storev4sf_mask:
	case CODE_FOR_avx512vl_storev4si_mask:
	case CODE_FOR_avx512vl_storev2df_mask:
	case CODE_FOR_avx512vl_storev2di_mask:
	  aligned_mem = true;
	  break;
	default:
	  break;
	}
      /* FALLTHRU */
    case VOID_FTYPE_PV8SF_V8SI_V8SF:
    case VOID_FTYPE_PV4DF_V4DI_V4DF:
    case VOID_FTYPE_PV4SF_V4SI_V4SF:
    case VOID_FTYPE_PV2DF_V2DI_V2DF:
    case VOID_FTYPE_PV8SI_V8SI_V8SI:
    case VOID_FTYPE_PV4DI_V4DI_V4DI:
    case VOID_FTYPE_PV4SI_V4SI_V4SI:
    case VOID_FTYPE_PV2DI_V2DI_V2DI:
    case VOID_FTYPE_PV8SI_V8DI_UQI:
    case VOID_FTYPE_PV8HI_V8DI_UQI:
    case VOID_FTYPE_PV16HI_V16SI_UHI:
    case VOID_FTYPE_PUDI_V8DI_UQI:
    case VOID_FTYPE_PV16QI_V16SI_UHI:
    case VOID_FTYPE_PV4SI_V4DI_UQI:
    case VOID_FTYPE_PUDI_V2DI_UQI:
    case VOID_FTYPE_PUDI_V4DI_UQI:
    case VOID_FTYPE_PUSI_V2DI_UQI:
    case VOID_FTYPE_PV8HI_V8SI_UQI:
    case VOID_FTYPE_PUDI_V4SI_UQI:
    case VOID_FTYPE_PUSI_V4DI_UQI:
    case VOID_FTYPE_PUHI_V2DI_UQI:
    case VOID_FTYPE_PUDI_V8SI_UQI:
    case VOID_FTYPE_PUSI_V4SI_UQI:
    case VOID_FTYPE_PCHAR_V64QI_UDI:
    case VOID_FTYPE_PCHAR_V32QI_USI:
    case VOID_FTYPE_PCHAR_V16QI_UHI:
    case VOID_FTYPE_PSHORT_V32HI_USI:
    case VOID_FTYPE_PSHORT_V16HI_UHI:
    case VOID_FTYPE_PSHORT_V8HI_UQI:
    case VOID_FTYPE_PINT_V16SI_UHI:
    case VOID_FTYPE_PINT_V8SI_UQI:
    case VOID_FTYPE_PINT_V4SI_UQI:
    case VOID_FTYPE_PINT64_V8DI_UQI:
    case VOID_FTYPE_PINT64_V4DI_UQI:
    case VOID_FTYPE_PINT64_V2DI_UQI:
    case VOID_FTYPE_PDOUBLE_V8DF_UQI:
    case VOID_FTYPE_PDOUBLE_V4DF_UQI:
    case VOID_FTYPE_PDOUBLE_V2DF_UQI:
    case VOID_FTYPE_PFLOAT_V16SF_UHI:
    case VOID_FTYPE_PFLOAT_V8SF_UQI:
    case VOID_FTYPE_PFLOAT_V4SF_UQI:
    case VOID_FTYPE_PCFLOAT16_V8HF_UQI:
    case VOID_FTYPE_PV32QI_V32HI_USI:
    case VOID_FTYPE_PV16QI_V16HI_UHI:
    case VOID_FTYPE_PUDI_V8HI_UQI:
      nargs = 2;
      klass = store;
      /* Reserve memory operand for target.  */
      memory = ARRAY_SIZE (xops);
      break;
    case V4SF_FTYPE_PCV4SF_V4SF_UQI:
    case V8SF_FTYPE_PCV8SF_V8SF_UQI:
    case V16SF_FTYPE_PCV16SF_V16SF_UHI:
    case V4SI_FTYPE_PCV4SI_V4SI_UQI:
    case V8SI_FTYPE_PCV8SI_V8SI_UQI:
    case V16SI_FTYPE_PCV16SI_V16SI_UHI:
    case V2DF_FTYPE_PCV2DF_V2DF_UQI:
    case V4DF_FTYPE_PCV4DF_V4DF_UQI:
    case V8DF_FTYPE_PCV8DF_V8DF_UQI:
    case V2DI_FTYPE_PCV2DI_V2DI_UQI:
    case V4DI_FTYPE_PCV4DI_V4DI_UQI:
    case V8DI_FTYPE_PCV8DI_V8DI_UQI:
    case V64QI_FTYPE_PCV64QI_V64QI_UDI:
    case V32HI_FTYPE_PCV32HI_V32HI_USI:
    case V32QI_FTYPE_PCV32QI_V32QI_USI:
    case V16QI_FTYPE_PCV16QI_V16QI_UHI:
    case V16HI_FTYPE_PCV16HI_V16HI_UHI:
    case V8HI_FTYPE_PCV8HI_V8HI_UQI:
      switch (icode)
	{
	/* These builtins and instructions require the memory
	   to be properly aligned.  */
	case CODE_FOR_avx512f_loadv16sf_mask:
	case CODE_FOR_avx512f_loadv16si_mask:
	case CODE_FOR_avx512f_loadv8df_mask:
	case CODE_FOR_avx512f_loadv8di_mask:
	case CODE_FOR_avx512vl_loadv8sf_mask:
	case CODE_FOR_avx512vl_loadv8si_mask:
	case CODE_FOR_avx512vl_loadv4df_mask:
	case CODE_FOR_avx512vl_loadv4di_mask:
	case CODE_FOR_avx512vl_loadv4sf_mask:
	case CODE_FOR_avx512vl_loadv4si_mask:
	case CODE_FOR_avx512vl_loadv2df_mask:
	case CODE_FOR_avx512vl_loadv2di_mask:
	case CODE_FOR_avx512bw_loadv64qi_mask:
	case CODE_FOR_avx512vl_loadv32qi_mask:
	case CODE_FOR_avx512vl_loadv16qi_mask:
	case CODE_FOR_avx512bw_loadv32hi_mask:
	case CODE_FOR_avx512vl_loadv16hi_mask:
	case CODE_FOR_avx512vl_loadv8hi_mask:
	  aligned_mem = true;
	  break;
	default:
	  break;
	}
      /* FALLTHRU */
    case V64QI_FTYPE_PCCHAR_V64QI_UDI:
    case V32QI_FTYPE_PCCHAR_V32QI_USI:
    case V16QI_FTYPE_PCCHAR_V16QI_UHI:
    case V32HI_FTYPE_PCSHORT_V32HI_USI:
    case V16HI_FTYPE_PCSHORT_V16HI_UHI:
    case V8HI_FTYPE_PCSHORT_V8HI_UQI:
    case V16SI_FTYPE_PCINT_V16SI_UHI:
    case V8SI_FTYPE_PCINT_V8SI_UQI:
    case V4SI_FTYPE_PCINT_V4SI_UQI:
    case V8DI_FTYPE_PCINT64_V8DI_UQI:
    case V4DI_FTYPE_PCINT64_V4DI_UQI:
    case V2DI_FTYPE_PCINT64_V2DI_UQI:
    case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
    case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
    case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
    case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
    case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
    case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
    case V8HF_FTYPE_PCFLOAT16_V8HF_UQI:
      nargs = 3;
      klass = load;
      memory = 0;
      break;
    default:
      gcc_unreachable ();
    }

  gcc_assert (nargs <= ARRAY_SIZE (xops));

  if (klass == store)
    {
      arg = CALL_EXPR_ARG (exp, 0);
      op = expand_normal (arg);
      gcc_assert (target == 0);
      if (memory)
	{
	  op = ix86_zero_extend_to_Pmode (op);
	  target = gen_rtx_MEM (tmode, op);
	  /* target at this point has just BITS_PER_UNIT MEM_ALIGN
	     on it.  Try to improve it using get_pointer_alignment,
	     and if the special builtin is one that requires strict
	     mode alignment, also from its GET_MODE_ALIGNMENT.
	     Failure to do so could lead to ix86_legitimate_combined_insn
	     rejecting all changes to such insns.  */
	  unsigned int align = get_pointer_alignment (arg);
	  if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
	    align = GET_MODE_ALIGNMENT (tmode);
	  if (MEM_ALIGN (target) < align)
	    set_mem_align (target, align);
	}
      else
	target = force_reg (tmode, op);
      arg_adjust = 1;
    }
  else
    {
      arg_adjust = 0;
      if (optimize
	  || target == 0
	  || !register_operand (target, tmode)
	  || GET_MODE (target) != tmode)
	target = gen_reg_rtx (tmode);
    }

  for (i = 0; i < nargs; i++)
    {
      machine_mode mode = insn_p->operand[i + 1].mode;

      arg = CALL_EXPR_ARG (exp, i + arg_adjust);
      op = expand_normal (arg);

      if (i == memory)
	{
	  /* This must be the memory operand.  */
	  op = ix86_zero_extend_to_Pmode (op);
	  op = gen_rtx_MEM (mode, op);
	  /* op at this point has just BITS_PER_UNIT MEM_ALIGN
	     on it.  Try to improve it using get_pointer_alignment,
	     and if the special builtin is one that requires strict
	     mode alignment, also from its GET_MODE_ALIGNMENT.
	     Failure to do so could lead to ix86_legitimate_combined_insn
	     rejecting all changes to such insns.  */
	  unsigned int align = get_pointer_alignment (arg);
	  if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
	    align = GET_MODE_ALIGNMENT (mode);
	  if (MEM_ALIGN (op) < align)
	    set_mem_align (op, align);
	}
      else
	{
	  /* This must be a register.  */
	  if (VECTOR_MODE_P (mode))
	    op = safe_vector_operand (op, mode);

	  op = fixup_modeless_constant (op, mode);

	  /* NB: a 3-operand load implies a mask load or v{p}expand*,
	     and the mask operand should be the last one.
	     Keep an all-ones mask, which will be simplified by the
	     expander.  */
	  if (nargs == 3 && i == 2 && klass == load
	      && constm1_operand (op, mode)
	      && insn_p->operand[i].predicate (op, mode))
	    ;
	  else if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
	    op = copy_to_mode_reg (mode, op);
	  else
	    {
	      op = copy_to_reg (op);
	      op = lowpart_subreg (mode, op, GET_MODE (op));
	    }
	}

      xops[i] = op;
    }

  switch (nargs)
    {
    case 0:
      pat = GEN_FCN (icode) (target);
      break;
    case 1:
      pat = GEN_FCN (icode) (target, xops[0]);
      break;
    case 2:
      pat = GEN_FCN (icode) (target, xops[0], xops[1]);
      break;
    case 3:
      pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
      break;
    default:
      gcc_unreachable ();
    }

  if (! pat)
    return 0;

  emit_insn (pat);
  return klass == store ? 0 : target;
}
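/* NB: for the non-temporal stores and masked loads/stores flagged as
   aligned_mem above, raising MEM_ALIGN up to GET_MODE_ALIGNMENT is not
   an optimization but a correctness matter: those patterns only accept
   properly aligned memory, and an under-aligned MEM would make
   ix86_legitimate_combined_insn reject all changes to such insns.  */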
/* Return the integer constant in ARG.  Constrain it to be in the range
   of the subparts of VEC_TYPE; issue an error if not.  */

static unsigned HOST_WIDE_INT
get_element_number (tree vec_type, tree arg)
{
  unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;

  if (!tree_fits_uhwi_p (arg)
      || (elt = tree_to_uhwi (arg), elt > max))
    {
      error ("selector must be an integer constant in the range "
	     "[0, %wu]", max);
      return 0;
    }

  return elt;
}
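/* For example, for a V4SF vector TYPE_VECTOR_SUBPARTS is 4, so any
   selector outside [0, 3] is diagnosed and 0 is returned as a safe
   fallback.  */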
/* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
   ix86_expand_vector_init.  We DO have language-level syntax for this, in
   the form of (type){ init-list }.  Except that since we can't place emms
   instructions from inside the compiler, we can't allow the use of MMX
   registers unless the user explicitly asks for it.  So we do *not* define
   vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md.  Instead
   we have builtins invoked by mmintrin.h that give us license to emit
   these sorts of instructions.  */

static rtx
ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
{
  machine_mode tmode = TYPE_MODE (type);
  machine_mode inner_mode = GET_MODE_INNER (tmode);
  int i, n_elt = GET_MODE_NUNITS (tmode);
  rtvec v = rtvec_alloc (n_elt);

  gcc_assert (VECTOR_MODE_P (tmode));
  gcc_assert (call_expr_nargs (exp) == n_elt);

  for (i = 0; i < n_elt; ++i)
    {
      rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
      RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
    }

  if (!target || !register_operand (target, tmode))
    target = gen_reg_rtx (tmode);

  ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
  return target;
}
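/* A typical consumer is mmintrin.h's _mm_set_pi32, which expands to
   __builtin_ia32_vec_init_v2si and arrives here with one call argument
   per vector element.  */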
/* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
   ix86_expand_vector_extract.  They would be redundant (for non-MMX) if we
   had a language-level syntax for referencing vector elements.  */

static rtx
ix86_expand_vec_ext_builtin (tree exp, rtx target)
{
  machine_mode tmode, mode0;
  tree arg0, arg1;
  int elt;
  rtx op0;

  arg0 = CALL_EXPR_ARG (exp, 0);
  arg1 = CALL_EXPR_ARG (exp, 1);

  op0 = expand_normal (arg0);
  elt = get_element_number (TREE_TYPE (arg0), arg1);

  tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
  mode0 = TYPE_MODE (TREE_TYPE (arg0));
  gcc_assert (VECTOR_MODE_P (mode0));

  op0 = force_reg (mode0, op0);

  if (optimize || !target || !register_operand (target, tmode))
    target = gen_reg_rtx (tmode);

  ix86_expand_vector_extract (true, target, op0, elt);

  return target;
}
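/* mmintrin.h reaches this through builtins such as
   __builtin_ia32_vec_ext_v4hi (used by _mm_extract_pi16); the selector
   argument is validated by get_element_number.  */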
/* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
   ix86_expand_vector_set.  They would be redundant (for non-MMX) if we had
   a language-level syntax for referencing vector elements.  */

static rtx
ix86_expand_vec_set_builtin (tree exp)
{
  machine_mode tmode, mode1;
  tree arg0, arg1, arg2;
  int elt;
  rtx op0, op1, target;

  arg0 = CALL_EXPR_ARG (exp, 0);
  arg1 = CALL_EXPR_ARG (exp, 1);
  arg2 = CALL_EXPR_ARG (exp, 2);

  tmode = TYPE_MODE (TREE_TYPE (arg0));
  mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
  gcc_assert (VECTOR_MODE_P (tmode));

  op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
  op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
  elt = get_element_number (TREE_TYPE (arg0), arg2);

  if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
    op1 = convert_modes (mode1, GET_MODE (op1), op1, true);

  op0 = force_reg (tmode, op0);
  op1 = force_reg (mode1, op1);

  /* OP0 is the source of these builtin functions and shouldn't be
     modified.  Create a copy, use it and return it as target.  */
  target = gen_reg_rtx (tmode);
  emit_move_insn (target, op0);
  ix86_expand_vector_set (true, target, op1, elt);

  return target;
}
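/* NB: unlike the extract case, the input vector is copied first, so
   e.g. _mm_insert_pi16 (v, s, 0) leaves V itself unmodified and the
   updated copy becomes the return value.  */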
/* Return true if the necessary isa options for this builtin exist,
   else false.
   fcode = DECL_MD_FUNCTION_CODE (fndecl);  */
bool
ix86_check_builtin_isa_match (unsigned int fcode,
			      HOST_WIDE_INT* pbisa,
			      HOST_WIDE_INT* pbisa2)
{
  HOST_WIDE_INT isa = ix86_isa_flags;
  HOST_WIDE_INT isa2 = ix86_isa_flags2;
  HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
  HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;

  /* The general case is we require all the ISAs specified in bisa{,2}
     to be enabled.
     The exceptions are:
     OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
     OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
     OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
     (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL) or
       OPTION_MASK_ISA2_AVXVNNI
     where for each such pair it is sufficient if either of the ISAs is
     enabled, plus if it is ored with other options also those others.
     OPTION_MASK_ISA_MMX in bisa is satisfied also if TARGET_MMX_WITH_SSE.  */
  if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
       == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
      && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0)
    isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A);

  if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
       == (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
      && (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0)
    isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32);

  if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
       == (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
      && (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0)
    isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4);

  if ((((bisa & (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
	== (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
       || (bisa2 & OPTION_MASK_ISA2_AVXVNNI) != 0)
      && (((isa & (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
	   == (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
	  || (isa2 & OPTION_MASK_ISA2_AVXVNNI) != 0))
    {
      isa |= OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL;
      isa2 |= OPTION_MASK_ISA2_AVXVNNI;
    }

  if ((bisa & OPTION_MASK_ISA_MMX) && !TARGET_MMX && TARGET_MMX_WITH_SSE
      /* __builtin_ia32_maskmovq requires MMX registers.  */
      && fcode != IX86_BUILTIN_MASKMOVQ)
    {
      bisa &= ~OPTION_MASK_ISA_MMX;
      bisa |= OPTION_MASK_ISA_SSE2;
    }

  if (pbisa)
    *pbisa = bisa;
  if (pbisa2)
    *pbisa2 = bisa2;

  return (bisa & isa) == bisa && (bisa2 & isa2) == bisa2;
}
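/* For example, a builtin whose bisa requires both
   OPTION_MASK_ISA_AVX512VNNI and OPTION_MASK_ISA_AVX512VL is also
   accepted on a target that only enables AVXVNNI, since either side of
   that pair (plus whatever else is ored in) is deemed sufficient by
   the checks above.  */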
/* Expand an expression EXP that calls a built-in function,
   with result going to TARGET if that's convenient
   (and in mode MODE if that's convenient).
   SUBTARGET may be used as the target for computing one of EXP's operands.
   IGNORE is nonzero if the value is to be ignored.  */

rtx
ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
		     machine_mode mode, int ignore)
{
  size_t i;
  enum insn_code icode, icode2;
  tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
  tree arg0, arg1, arg2, arg3, arg4;
  rtx op0, op1, op2, op3, op4, pat, pat2, insn;
  machine_mode mode0, mode1, mode2, mode3, mode4;
  unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
  HOST_WIDE_INT bisa, bisa2;

  /* For CPU builtins that can be folded, fold first and expand the fold.  */
  switch (fcode)
    {
    case IX86_BUILTIN_CPU_INIT:
      {
	/* Make it call __cpu_indicator_init in libgcc.  */
	tree call_expr, fndecl, type;
	type = build_function_type_list (integer_type_node, NULL_TREE);
	fndecl = build_fn_decl ("__cpu_indicator_init", type);
	call_expr = build_call_expr (fndecl, 0);
	return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
      }
    case IX86_BUILTIN_CPU_IS:
    case IX86_BUILTIN_CPU_SUPPORTS:
      {
	tree arg0 = CALL_EXPR_ARG (exp, 0);
	tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
	gcc_assert (fold_expr != NULL_TREE);
	return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
      }
    default:
      break;
    }

  if (!ix86_check_builtin_isa_match (fcode, &bisa, &bisa2))
    {
      bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT;
      if (TARGET_ABI_X32)
	bisa |= OPTION_MASK_ABI_X32;
      else
	bisa |= OPTION_MASK_ABI_64;
      char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
				       (enum fpmath_unit) 0,
				       (enum prefer_vector_width) 0,
				       PVW_NONE, PVW_NONE,
				       false, add_abi_p);
      if (!opts)
	error ("%qE needs unknown isa option", fndecl);
      else
	{
	  gcc_assert (opts != NULL);
	  error ("%qE needs isa option %s", fndecl, opts);
	  free (opts);
	}
      return expand_call (exp, target, ignore);
    }
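  /* The builtins below need more than the table-driven expanders can
     express (fixed hard registers, flags outputs, memory side effects),
     so each one gets hand-written expansion code.  */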
  switch (fcode)
    {
    case IX86_BUILTIN_MASKMOVQ:
    case IX86_BUILTIN_MASKMOVDQU:
      icode = (fcode == IX86_BUILTIN_MASKMOVQ
	       ? CODE_FOR_mmx_maskmovq
	       : CODE_FOR_sse2_maskmovdqu);
      /* Note the arg order is different from the operand order.  */
      arg1 = CALL_EXPR_ARG (exp, 0);
      arg2 = CALL_EXPR_ARG (exp, 1);
      arg0 = CALL_EXPR_ARG (exp, 2);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      mode0 = insn_data[icode].operand[0].mode;
      mode1 = insn_data[icode].operand[1].mode;
      mode2 = insn_data[icode].operand[2].mode;

      op0 = ix86_zero_extend_to_Pmode (op0);
      op0 = gen_rtx_MEM (mode1, op0);

      if (!insn_data[icode].operand[0].predicate (op0, mode0))
	op0 = copy_to_mode_reg (mode0, op0);
      if (!insn_data[icode].operand[1].predicate (op1, mode1))
	op1 = copy_to_mode_reg (mode1, op1);
      if (!insn_data[icode].operand[2].predicate (op2, mode2))
	op2 = copy_to_mode_reg (mode2, op2);
      pat = GEN_FCN (icode) (op0, op1, op2);
      if (! pat)
	return 0;
      emit_insn (pat);
      return 0;

    case IX86_BUILTIN_LDMXCSR:
      op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
      target = assign_386_stack_local (SImode, SLOT_TEMP);
      emit_move_insn (target, op0);
      emit_insn (gen_sse_ldmxcsr (target));
      return 0;

    case IX86_BUILTIN_STMXCSR:
      target = assign_386_stack_local (SImode, SLOT_TEMP);
      emit_insn (gen_sse_stmxcsr (target));
      return copy_to_mode_reg (SImode, target);

    case IX86_BUILTIN_CLFLUSH:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      icode = CODE_FOR_sse2_clflush;
      if (!insn_data[icode].operand[0].predicate (op0, Pmode))
	op0 = ix86_zero_extend_to_Pmode (op0);

      emit_insn (gen_sse2_clflush (op0));
      return 0;

    case IX86_BUILTIN_CLWB:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      icode = CODE_FOR_clwb;
      if (!insn_data[icode].operand[0].predicate (op0, Pmode))
	op0 = ix86_zero_extend_to_Pmode (op0);

      emit_insn (gen_clwb (op0));
      return 0;

    case IX86_BUILTIN_CLFLUSHOPT:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      icode = CODE_FOR_clflushopt;
      if (!insn_data[icode].operand[0].predicate (op0, Pmode))
	op0 = ix86_zero_extend_to_Pmode (op0);

      emit_insn (gen_clflushopt (op0));
      return 0;

    case IX86_BUILTIN_MONITOR:
    case IX86_BUILTIN_MONITORX:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      arg2 = CALL_EXPR_ARG (exp, 2);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      if (!REG_P (op0))
	op0 = ix86_zero_extend_to_Pmode (op0);
      if (!REG_P (op1))
	op1 = copy_to_mode_reg (SImode, op1);
      if (!REG_P (op2))
	op2 = copy_to_mode_reg (SImode, op2);

      emit_insn (fcode == IX86_BUILTIN_MONITOR
		 ? gen_sse3_monitor (Pmode, op0, op1, op2)
		 : gen_monitorx (Pmode, op0, op1, op2));
      return 0;

    case IX86_BUILTIN_MWAIT:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      if (!REG_P (op0))
	op0 = copy_to_mode_reg (SImode, op0);
      if (!REG_P (op1))
	op1 = copy_to_mode_reg (SImode, op1);
      emit_insn (gen_sse3_mwait (op0, op1));
      return 0;

    case IX86_BUILTIN_MWAITX:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      arg2 = CALL_EXPR_ARG (exp, 2);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      if (!REG_P (op0))
	op0 = copy_to_mode_reg (SImode, op0);
      if (!REG_P (op1))
	op1 = copy_to_mode_reg (SImode, op1);
      if (!REG_P (op2))
	op2 = copy_to_mode_reg (SImode, op2);
      emit_insn (gen_mwaitx (op0, op1, op2));
      return 0;

    case IX86_BUILTIN_UMONITOR:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);

      op0 = ix86_zero_extend_to_Pmode (op0);
      emit_insn (gen_umonitor (Pmode, op0));
      return 0;

    case IX86_BUILTIN_UMWAIT:
    case IX86_BUILTIN_TPAUSE:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);

      if (!REG_P (op0))
	op0 = copy_to_mode_reg (SImode, op0);

      op1 = force_reg (DImode, op1);

      if (TARGET_64BIT)
	{
	  op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
				     NULL, 1, OPTAB_DIRECT);
	  switch (fcode)
	    {
	    case IX86_BUILTIN_UMWAIT:
	      icode = CODE_FOR_umwait_rex64;
	      break;
	    case IX86_BUILTIN_TPAUSE:
	      icode = CODE_FOR_tpause_rex64;
	      break;
	    default:
	      gcc_unreachable ();
	    }

	  op2 = gen_lowpart (SImode, op2);
	  op1 = gen_lowpart (SImode, op1);
	  pat = GEN_FCN (icode) (op0, op1, op2);
	}
      else
	{
	  switch (fcode)
	    {
	    case IX86_BUILTIN_UMWAIT:
	      icode = CODE_FOR_umwait;
	      break;
	    case IX86_BUILTIN_TPAUSE:
	      icode = CODE_FOR_tpause;
	      break;
	    default:
	      gcc_unreachable ();
	    }
	  pat = GEN_FCN (icode) (op0, op1);
	}

      if (!pat)
	return 0;

      emit_insn (pat);

      if (target == 0
	  || !register_operand (target, QImode))
	target = gen_reg_rtx (QImode);

      pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
			const0_rtx);
      emit_insn (gen_rtx_SET (target, pat));

      return target;

    case IX86_BUILTIN_TESTUI:
      emit_insn (gen_testui ());

      if (target == 0
	  || !register_operand (target, QImode))
	target = gen_reg_rtx (QImode);

      pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
			 const0_rtx);
      emit_insn (gen_rtx_SET (target, pat));

      return target;
:
12588 arg0
= CALL_EXPR_ARG (exp
, 0);
12589 op0
= expand_normal (arg0
);
12591 op0
= ix86_zero_extend_to_Pmode (op0
);
12592 emit_insn (gen_clzero (Pmode
, op0
));
12595 case IX86_BUILTIN_CLDEMOTE
:
12596 arg0
= CALL_EXPR_ARG (exp
, 0);
12597 op0
= expand_normal (arg0
);
12598 icode
= CODE_FOR_cldemote
;
12599 if (!insn_data
[icode
].operand
[0].predicate (op0
, Pmode
))
12600 op0
= ix86_zero_extend_to_Pmode (op0
);
12602 emit_insn (gen_cldemote (op0
));
12605 case IX86_BUILTIN_LOADIWKEY
:
12607 arg0
= CALL_EXPR_ARG (exp
, 0);
12608 arg1
= CALL_EXPR_ARG (exp
, 1);
12609 arg2
= CALL_EXPR_ARG (exp
, 2);
12610 arg3
= CALL_EXPR_ARG (exp
, 3);
12612 op0
= expand_normal (arg0
);
12613 op1
= expand_normal (arg1
);
12614 op2
= expand_normal (arg2
);
12615 op3
= expand_normal (arg3
);
12618 op0
= copy_to_mode_reg (V2DImode
, op0
);
12620 op1
= copy_to_mode_reg (V2DImode
, op1
);
12622 op2
= copy_to_mode_reg (V2DImode
, op2
);
12624 op3
= copy_to_mode_reg (SImode
, op3
);
12626 emit_insn (gen_loadiwkey (op0
, op1
, op2
, op3
));
12631 case IX86_BUILTIN_AESDEC128KLU8
:
12632 icode
= CODE_FOR_aesdec128klu8
;
12633 goto aesdecenc_expand
;
12635 case IX86_BUILTIN_AESDEC256KLU8
:
12636 icode
= CODE_FOR_aesdec256klu8
;
12637 goto aesdecenc_expand
;
12639 case IX86_BUILTIN_AESENC128KLU8
:
12640 icode
= CODE_FOR_aesenc128klu8
;
12641 goto aesdecenc_expand
;
12643 case IX86_BUILTIN_AESENC256KLU8
:
12644 icode
= CODE_FOR_aesenc256klu8
;
12648 arg0
= CALL_EXPR_ARG (exp
, 0); // __m128i *odata
12649 arg1
= CALL_EXPR_ARG (exp
, 1); // __m128i idata
12650 arg2
= CALL_EXPR_ARG (exp
, 2); // const void *p
12652 op0
= expand_normal (arg0
);
12653 op1
= expand_normal (arg1
);
12654 op2
= expand_normal (arg2
);
12656 if (!address_operand (op0
, V2DImode
))
12658 op0
= convert_memory_address (Pmode
, op0
);
12659 op0
= copy_addr_to_reg (op0
);
12661 op0
= gen_rtx_MEM (V2DImode
, op0
);
12664 op1
= copy_to_mode_reg (V2DImode
, op1
);
12666 if (!address_operand (op2
, VOIDmode
))
12668 op2
= convert_memory_address (Pmode
, op2
);
12669 op2
= copy_addr_to_reg (op2
);
12671 op2
= gen_rtx_MEM (BLKmode
, op2
);
12673 emit_insn (GEN_FCN (icode
) (op1
, op1
, op2
));
12676 target
= gen_reg_rtx (QImode
);
12678 /* NB: For aesenc/aesdec keylocker insn, ZF will be set when runtime
12679 error occurs. Then the output should be cleared for safety. */
12680 rtx_code_label
*ok_label
;
12683 tmp
= gen_rtx_REG (CCZmode
, FLAGS_REG
);
12684 pat
= gen_rtx_EQ (QImode
, tmp
, const0_rtx
);
12685 ok_label
= gen_label_rtx ();
12686 emit_cmp_and_jump_insns (tmp
, const0_rtx
, NE
, 0, GET_MODE (tmp
),
12688 /* Usually the runtime error seldom occur, so predict OK path as
12689 hotspot to optimize it as fallthrough block. */
12690 predict_jump (REG_BR_PROB_BASE
* 90 / 100);
12692 emit_insn (gen_rtx_SET (op1
, const0_rtx
));
12694 emit_label (ok_label
);
12695 emit_insn (gen_rtx_SET (target
, pat
));
12696 emit_insn (gen_rtx_SET (op0
, op1
));
12700 case IX86_BUILTIN_AESDECWIDE128KLU8
:
12701 icode
= CODE_FOR_aesdecwide128klu8
;
12702 goto wideaesdecenc_expand
;
12704 case IX86_BUILTIN_AESDECWIDE256KLU8
:
12705 icode
= CODE_FOR_aesdecwide256klu8
;
12706 goto wideaesdecenc_expand
;
12708 case IX86_BUILTIN_AESENCWIDE128KLU8
:
12709 icode
= CODE_FOR_aesencwide128klu8
;
12710 goto wideaesdecenc_expand
;
12712 case IX86_BUILTIN_AESENCWIDE256KLU8
:
12713 icode
= CODE_FOR_aesencwide256klu8
;
12715 wideaesdecenc_expand
:
12720 arg0
= CALL_EXPR_ARG (exp
, 0); // __m128i * odata
12721 arg1
= CALL_EXPR_ARG (exp
, 1); // const __m128i * idata
12722 arg2
= CALL_EXPR_ARG (exp
, 2); // const void *p
12724 op0
= expand_normal (arg0
);
12725 op1
= expand_normal (arg1
);
12726 op2
= expand_normal (arg2
);
12728 if (!address_operand (op2
, VOIDmode
))
12730 op2
= convert_memory_address (Pmode
, op2
);
12731 op2
= copy_addr_to_reg (op2
);
12733 op2
= gen_rtx_MEM (BLKmode
, op2
);
12735 for (i
= 0; i
< 8; i
++)
12737 xmm_regs
[i
] = gen_rtx_REG (V2DImode
, GET_SSE_REGNO (i
));
12739 op
= gen_rtx_MEM (V2DImode
,
12740 plus_constant (Pmode
, op1
, (i
* 16)));
12742 emit_move_insn (xmm_regs
[i
], op
);
12745 emit_insn (GEN_FCN (icode
) (op2
));
12748 target
= gen_reg_rtx (QImode
);
12750 tmp
= gen_rtx_REG (CCZmode
, FLAGS_REG
);
12751 pat
= gen_rtx_EQ (QImode
, tmp
, const0_rtx
);
12752 ok_label
= gen_label_rtx ();
12753 emit_cmp_and_jump_insns (tmp
, const0_rtx
, NE
, 0, GET_MODE (tmp
),
12755 predict_jump (REG_BR_PROB_BASE
* 90 / 100);
12757 for (i
= 0; i
< 8; i
++)
12758 emit_insn (gen_rtx_SET (xmm_regs
[i
], const0_rtx
));
12760 emit_label (ok_label
);
12761 emit_insn (gen_rtx_SET (target
, pat
));
12763 for (i
= 0; i
< 8; i
++)
12765 op
= gen_rtx_MEM (V2DImode
,
12766 plus_constant (Pmode
, op0
, (i
* 16)));
12767 emit_move_insn (op
, xmm_regs
[i
]);
12772 case IX86_BUILTIN_ENCODEKEY128U32
:
12774 rtx op
, xmm_regs
[7];
12776 arg0
= CALL_EXPR_ARG (exp
, 0); // unsigned int htype
12777 arg1
= CALL_EXPR_ARG (exp
, 1); // __m128i key
12778 arg2
= CALL_EXPR_ARG (exp
, 2); // void *h
12780 op0
= expand_normal (arg0
);
12781 op1
= expand_normal (arg1
);
12782 op2
= expand_normal (arg2
);
12785 op0
= copy_to_mode_reg (SImode
, op0
);
12787 op
= gen_rtx_REG (V2DImode
, GET_SSE_REGNO (0));
12788 emit_move_insn (op
, op1
);
12790 for (i
= 0; i
< 3; i
++)
12791 xmm_regs
[i
] = gen_rtx_REG (V2DImode
, GET_SSE_REGNO (i
));
12794 target
= gen_reg_rtx (SImode
);
12796 emit_insn (gen_encodekey128u32 (target
, op0
));
12798 for (i
= 0; i
< 3; i
++)
12800 op
= gen_rtx_MEM (V2DImode
,
12801 plus_constant (Pmode
, op2
, (i
* 16)));
12802 emit_move_insn (op
, xmm_regs
[i
]);
12807 case IX86_BUILTIN_ENCODEKEY256U32
:
12809 rtx op
, xmm_regs
[7];
12811 arg0
= CALL_EXPR_ARG (exp
, 0); // unsigned int htype
12812 arg1
= CALL_EXPR_ARG (exp
, 1); // __m128i keylow
12813 arg2
= CALL_EXPR_ARG (exp
, 2); // __m128i keyhi
12814 arg3
= CALL_EXPR_ARG (exp
, 3); // void *h
12816 op0
= expand_normal (arg0
);
12817 op1
= expand_normal (arg1
);
12818 op2
= expand_normal (arg2
);
12819 op3
= expand_normal (arg3
);
12822 op0
= copy_to_mode_reg (SImode
, op0
);
12824 /* Force to use xmm0, xmm1 for keylow, keyhi*/
12825 op
= gen_rtx_REG (V2DImode
, GET_SSE_REGNO (0));
12826 emit_move_insn (op
, op1
);
12827 op
= gen_rtx_REG (V2DImode
, GET_SSE_REGNO (1));
12828 emit_move_insn (op
, op2
);
12830 for (i
= 0; i
< 4; i
++)
12831 xmm_regs
[i
] = gen_rtx_REG (V2DImode
, GET_SSE_REGNO (i
));
12834 target
= gen_reg_rtx (SImode
);
12836 emit_insn (gen_encodekey256u32 (target
, op0
));
12838 for (i
= 0; i
< 4; i
++)
12840 op
= gen_rtx_MEM (V2DImode
,
12841 plus_constant (Pmode
, op3
, (i
* 16)));
12842 emit_move_insn (op
, xmm_regs
[i
]);
12848 case IX86_BUILTIN_VEC_INIT_V2SI
:
12849 case IX86_BUILTIN_VEC_INIT_V4HI
:
12850 case IX86_BUILTIN_VEC_INIT_V8QI
:
12851 return ix86_expand_vec_init_builtin (TREE_TYPE (exp
), exp
, target
);
12853 case IX86_BUILTIN_VEC_EXT_V2DF
:
12854 case IX86_BUILTIN_VEC_EXT_V2DI
:
12855 case IX86_BUILTIN_VEC_EXT_V4SF
:
12856 case IX86_BUILTIN_VEC_EXT_V4SI
:
12857 case IX86_BUILTIN_VEC_EXT_V8HI
:
12858 case IX86_BUILTIN_VEC_EXT_V2SI
:
12859 case IX86_BUILTIN_VEC_EXT_V4HI
:
12860 case IX86_BUILTIN_VEC_EXT_V16QI
:
12861 return ix86_expand_vec_ext_builtin (exp
, target
);
12863 case IX86_BUILTIN_VEC_SET_V2DI
:
12864 case IX86_BUILTIN_VEC_SET_V4SF
:
12865 case IX86_BUILTIN_VEC_SET_V4SI
:
12866 case IX86_BUILTIN_VEC_SET_V8HI
:
12867 case IX86_BUILTIN_VEC_SET_V4HI
:
12868 case IX86_BUILTIN_VEC_SET_V16QI
:
12869 return ix86_expand_vec_set_builtin (exp
);
12871 case IX86_BUILTIN_NANQ
:
12872 case IX86_BUILTIN_NANSQ
:
12873 return expand_call (exp
, target
, ignore
);
12875 case IX86_BUILTIN_RDPID
:
12877 op0
= gen_reg_rtx (word_mode
);
12881 insn
= gen_rdpid_rex64 (op0
);
12882 op0
= convert_to_mode (SImode
, op0
, 1);
12885 insn
= gen_rdpid (op0
);
12890 || !register_operand (target
, SImode
))
12891 target
= gen_reg_rtx (SImode
);
12893 emit_move_insn (target
, op0
);
    case IX86_BUILTIN_2INTERSECTD512:
    case IX86_BUILTIN_2INTERSECTQ512:
    case IX86_BUILTIN_2INTERSECTD256:
    case IX86_BUILTIN_2INTERSECTQ256:
    case IX86_BUILTIN_2INTERSECTD128:
    case IX86_BUILTIN_2INTERSECTQ128:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      arg2 = CALL_EXPR_ARG (exp, 2);
      arg3 = CALL_EXPR_ARG (exp, 3);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      op3 = expand_normal (arg3);

      if (!address_operand (op0, VOIDmode))
	{
	  op0 = convert_memory_address (Pmode, op0);
	  op0 = copy_addr_to_reg (op0);
	}
      if (!address_operand (op1, VOIDmode))
	{
	  op1 = convert_memory_address (Pmode, op1);
	  op1 = copy_addr_to_reg (op1);
	}

      switch (fcode)
	{
	case IX86_BUILTIN_2INTERSECTD512:
	  mode4 = P2HImode;
	  icode = CODE_FOR_avx512vp2intersect_2intersectv16si;
	  break;
	case IX86_BUILTIN_2INTERSECTQ512:
	  mode4 = P2QImode;
	  icode = CODE_FOR_avx512vp2intersect_2intersectv8di;
	  break;
	case IX86_BUILTIN_2INTERSECTD256:
	  mode4 = P2QImode;
	  icode = CODE_FOR_avx512vp2intersect_2intersectv8si;
	  break;
	case IX86_BUILTIN_2INTERSECTQ256:
	  mode4 = P2QImode;
	  icode = CODE_FOR_avx512vp2intersect_2intersectv4di;
	  break;
	case IX86_BUILTIN_2INTERSECTD128:
	  mode4 = P2QImode;
	  icode = CODE_FOR_avx512vp2intersect_2intersectv4si;
	  break;
	case IX86_BUILTIN_2INTERSECTQ128:
	  mode4 = P2QImode;
	  icode = CODE_FOR_avx512vp2intersect_2intersectv2di;
	  break;
	default:
	  gcc_unreachable ();
	}

      mode2 = insn_data[icode].operand[1].mode;
      mode3 = insn_data[icode].operand[2].mode;
      if (!insn_data[icode].operand[1].predicate (op2, mode2))
	op2 = copy_to_mode_reg (mode2, op2);
      if (!insn_data[icode].operand[2].predicate (op3, mode3))
	op3 = copy_to_mode_reg (mode3, op3);

      op4 = gen_reg_rtx (mode4);
      emit_insn (GEN_FCN (icode) (op4, op2, op3));
      mode0 = mode4 == P2HImode ? HImode : QImode;
      emit_move_insn (gen_rtx_MEM (mode0, op0),
		      gen_lowpart (mode0, op4));
      emit_move_insn (gen_rtx_MEM (mode0, op1),
		      gen_highpart (mode0, op4));

      return 0;
    case IX86_BUILTIN_RDPMC:
    case IX86_BUILTIN_RDTSC:
    case IX86_BUILTIN_RDTSCP:
    case IX86_BUILTIN_XGETBV:

      op0 = gen_reg_rtx (DImode);
      op1 = gen_reg_rtx (DImode);

      if (fcode == IX86_BUILTIN_RDPMC)
	{
	  arg0 = CALL_EXPR_ARG (exp, 0);
	  op2 = expand_normal (arg0);
	  if (!register_operand (op2, SImode))
	    op2 = copy_to_mode_reg (SImode, op2);

	  insn = (TARGET_64BIT
		  ? gen_rdpmc_rex64 (op0, op1, op2)
		  : gen_rdpmc (op0, op2));
	  emit_insn (insn);
	}
      else if (fcode == IX86_BUILTIN_XGETBV)
	{
	  arg0 = CALL_EXPR_ARG (exp, 0);
	  op2 = expand_normal (arg0);
	  if (!register_operand (op2, SImode))
	    op2 = copy_to_mode_reg (SImode, op2);

	  insn = (TARGET_64BIT
		  ? gen_xgetbv_rex64 (op0, op1, op2)
		  : gen_xgetbv (op0, op2));
	  emit_insn (insn);
	}
      else if (fcode == IX86_BUILTIN_RDTSC)
	{
	  insn = (TARGET_64BIT
		  ? gen_rdtsc_rex64 (op0, op1)
		  : gen_rdtsc (op0));
	  emit_insn (insn);
	}
      else
	{
	  op2 = gen_reg_rtx (SImode);

	  insn = (TARGET_64BIT
		  ? gen_rdtscp_rex64 (op0, op1, op2)
		  : gen_rdtscp (op0, op2));
	  emit_insn (insn);

	  arg0 = CALL_EXPR_ARG (exp, 0);
	  op4 = expand_normal (arg0);
	  if (!address_operand (op4, VOIDmode))
	    {
	      op4 = convert_memory_address (Pmode, op4);
	      op4 = copy_addr_to_reg (op4);
	    }
	  emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
	}

      if (target == 0
	  || !register_operand (target, DImode))
	target = gen_reg_rtx (DImode);

      if (TARGET_64BIT)
	{
	  op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
				     op1, 1, OPTAB_DIRECT);
	  op0 = expand_simple_binop (DImode, IOR, op0, op1,
				     op0, 1, OPTAB_DIRECT);
	}

      emit_move_insn (target, op0);
      return target;
    case IX86_BUILTIN_ENQCMD:
    case IX86_BUILTIN_ENQCMDS:
    case IX86_BUILTIN_MOVDIR64B:

      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);

      op0 = ix86_zero_extend_to_Pmode (op0);
      if (!address_operand (op1, VOIDmode))
	{
	  op1 = convert_memory_address (Pmode, op1);
	  op1 = copy_addr_to_reg (op1);
	}
      op1 = gen_rtx_MEM (XImode, op1);

      if (fcode == IX86_BUILTIN_MOVDIR64B)
	{
	  emit_insn (gen_movdir64b (Pmode, op0, op1));
	  return 0;
	}
      else
	{
	  if (target == 0
	      || !register_operand (target, SImode))
	    target = gen_reg_rtx (SImode);

	  emit_move_insn (target, const0_rtx);
	  target = gen_rtx_SUBREG (QImode, target, 0);

	  int unspecv = (fcode == IX86_BUILTIN_ENQCMD
			 ? UNSPECV_ENQCMD
			 : UNSPECV_ENQCMDS);
	  icode = code_for_enqcmd (unspecv, Pmode);
	  emit_insn (GEN_FCN (icode) (op0, op1));

	  emit_insn
	    (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
			  gen_rtx_fmt_ee (EQ, QImode,
					  gen_rtx_REG (CCZmode, FLAGS_REG),
					  const0_rtx)));

	  return SUBREG_REG (target);
	}
    case IX86_BUILTIN_FXSAVE:
    case IX86_BUILTIN_FXRSTOR:
    case IX86_BUILTIN_FXSAVE64:
    case IX86_BUILTIN_FXRSTOR64:
    case IX86_BUILTIN_FNSTENV:
    case IX86_BUILTIN_FLDENV:
      mode0 = BLKmode;
      switch (fcode)
	{
	case IX86_BUILTIN_FXSAVE:
	  icode = CODE_FOR_fxsave;
	  break;
	case IX86_BUILTIN_FXRSTOR:
	  icode = CODE_FOR_fxrstor;
	  break;
	case IX86_BUILTIN_FXSAVE64:
	  icode = CODE_FOR_fxsave64;
	  break;
	case IX86_BUILTIN_FXRSTOR64:
	  icode = CODE_FOR_fxrstor64;
	  break;
	case IX86_BUILTIN_FNSTENV:
	  icode = CODE_FOR_fnstenv;
	  break;
	case IX86_BUILTIN_FLDENV:
	  icode = CODE_FOR_fldenv;
	  break;
	default:
	  gcc_unreachable ();
	}

      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);

      if (!address_operand (op0, VOIDmode))
	{
	  op0 = convert_memory_address (Pmode, op0);
	  op0 = copy_addr_to_reg (op0);
	}
      op0 = gen_rtx_MEM (mode0, op0);

      pat = GEN_FCN (icode) (op0);
      if (pat)
	emit_insn (pat);
      return 0;
    case IX86_BUILTIN_XSETBV:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);

      if (!REG_P (op0))
	op0 = copy_to_mode_reg (SImode, op0);

      op1 = force_reg (DImode, op1);

      if (TARGET_64BIT)
	{
	  op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
				     NULL, 1, OPTAB_DIRECT);

	  icode = CODE_FOR_xsetbv_rex64;

	  op2 = gen_lowpart (SImode, op2);
	  op1 = gen_lowpart (SImode, op1);
	  pat = GEN_FCN (icode) (op0, op1, op2);
	}
      else
	{
	  icode = CODE_FOR_xsetbv;

	  pat = GEN_FCN (icode) (op0, op1);
	}
      if (pat)
	emit_insn (pat);
      return 0;
    case IX86_BUILTIN_XSAVE:
    case IX86_BUILTIN_XRSTOR:
    case IX86_BUILTIN_XSAVE64:
    case IX86_BUILTIN_XRSTOR64:
    case IX86_BUILTIN_XSAVEOPT:
    case IX86_BUILTIN_XSAVEOPT64:
    case IX86_BUILTIN_XSAVES:
    case IX86_BUILTIN_XRSTORS:
    case IX86_BUILTIN_XSAVES64:
    case IX86_BUILTIN_XRSTORS64:
    case IX86_BUILTIN_XSAVEC:
    case IX86_BUILTIN_XSAVEC64:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);

      if (!address_operand (op0, VOIDmode))
	{
	  op0 = convert_memory_address (Pmode, op0);
	  op0 = copy_addr_to_reg (op0);
	}
      op0 = gen_rtx_MEM (BLKmode, op0);

      op1 = force_reg (DImode, op1);

      if (TARGET_64BIT)
	{
	  op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
				     NULL, 1, OPTAB_DIRECT);
	  switch (fcode)
	    {
	    case IX86_BUILTIN_XSAVE:
	      icode = CODE_FOR_xsave_rex64;
	      break;
	    case IX86_BUILTIN_XRSTOR:
	      icode = CODE_FOR_xrstor_rex64;
	      break;
	    case IX86_BUILTIN_XSAVE64:
	      icode = CODE_FOR_xsave64;
	      break;
	    case IX86_BUILTIN_XRSTOR64:
	      icode = CODE_FOR_xrstor64;
	      break;
	    case IX86_BUILTIN_XSAVEOPT:
	      icode = CODE_FOR_xsaveopt_rex64;
	      break;
	    case IX86_BUILTIN_XSAVEOPT64:
	      icode = CODE_FOR_xsaveopt64;
	      break;
	    case IX86_BUILTIN_XSAVES:
	      icode = CODE_FOR_xsaves_rex64;
	      break;
	    case IX86_BUILTIN_XRSTORS:
	      icode = CODE_FOR_xrstors_rex64;
	      break;
	    case IX86_BUILTIN_XSAVES64:
	      icode = CODE_FOR_xsaves64;
	      break;
	    case IX86_BUILTIN_XRSTORS64:
	      icode = CODE_FOR_xrstors64;
	      break;
	    case IX86_BUILTIN_XSAVEC:
	      icode = CODE_FOR_xsavec_rex64;
	      break;
	    case IX86_BUILTIN_XSAVEC64:
	      icode = CODE_FOR_xsavec64;
	      break;
	    default:
	      gcc_unreachable ();
	    }

	  op2 = gen_lowpart (SImode, op2);
	  op1 = gen_lowpart (SImode, op1);
	  pat = GEN_FCN (icode) (op0, op1, op2);
	}
      else
	{
	  switch (fcode)
	    {
	    case IX86_BUILTIN_XSAVE:
	      icode = CODE_FOR_xsave;
	      break;
	    case IX86_BUILTIN_XRSTOR:
	      icode = CODE_FOR_xrstor;
	      break;
	    case IX86_BUILTIN_XSAVEOPT:
	      icode = CODE_FOR_xsaveopt;
	      break;
	    case IX86_BUILTIN_XSAVES:
	      icode = CODE_FOR_xsaves;
	      break;
	    case IX86_BUILTIN_XRSTORS:
	      icode = CODE_FOR_xrstors;
	      break;
	    case IX86_BUILTIN_XSAVEC:
	      icode = CODE_FOR_xsavec;
	      break;
	    default:
	      gcc_unreachable ();
	    }
	  pat = GEN_FCN (icode) (op0, op1);
	}

      if (pat)
	emit_insn (pat);
      return 0;
    case IX86_BUILTIN_LLWPCB:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);

      if (!register_operand (op0, Pmode))
	op0 = ix86_zero_extend_to_Pmode (op0);
      emit_insn (gen_lwp_llwpcb (Pmode, op0));
      return 0;

    case IX86_BUILTIN_SLWPCB:
      if (!target
	  || !register_operand (target, Pmode))
	target = gen_reg_rtx (Pmode);
      emit_insn (gen_lwp_slwpcb (Pmode, target));
      return target;
    case IX86_BUILTIN_LWPVAL32:
    case IX86_BUILTIN_LWPVAL64:
    case IX86_BUILTIN_LWPINS32:
    case IX86_BUILTIN_LWPINS64:
      mode = ((fcode == IX86_BUILTIN_LWPVAL32
	       || fcode == IX86_BUILTIN_LWPINS32)
	      ? SImode : DImode);

      if (fcode == IX86_BUILTIN_LWPVAL32
	  || fcode == IX86_BUILTIN_LWPVAL64)
	icode = code_for_lwp_lwpval (mode);
      else
	icode = code_for_lwp_lwpins (mode);

      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      arg2 = CALL_EXPR_ARG (exp, 2);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      mode0 = insn_data[icode].operand[0].mode;

      if (!insn_data[icode].operand[0].predicate (op0, mode0))
	op0 = copy_to_mode_reg (mode0, op0);
      if (!insn_data[icode].operand[1].predicate (op1, SImode))
	op1 = copy_to_mode_reg (SImode, op1);

      if (!CONST_INT_P (op2))
	{
	  error ("the last argument must be a 32-bit immediate");
	  return const0_rtx;
	}

      emit_insn (GEN_FCN (icode) (op0, op1, op2));

      if (fcode == IX86_BUILTIN_LWPINS32
	  || fcode == IX86_BUILTIN_LWPINS64)
	{
	  if (target == 0
	      || !nonimmediate_operand (target, QImode))
	    target = gen_reg_rtx (QImode);

	  pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
			    const0_rtx);
	  emit_insn (gen_rtx_SET (target, pat));

	  return target;
	}
      else
	return 0;
    case IX86_BUILTIN_BEXTRI32:
    case IX86_BUILTIN_BEXTRI64:
      mode = (fcode == IX86_BUILTIN_BEXTRI32 ? SImode : DImode);

      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);

      if (!CONST_INT_P (op1))
	{
	  error ("last argument must be an immediate");
	  return const0_rtx;
	}
      else
	{
	  unsigned char lsb_index = UINTVAL (op1);
	  unsigned char length = UINTVAL (op1) >> 8;

	  unsigned char bitsize = GET_MODE_BITSIZE (mode);

	  icode = code_for_tbm_bextri (mode);

	  mode1 = insn_data[icode].operand[1].mode;
	  if (!insn_data[icode].operand[1].predicate (op0, mode1))
	    op0 = copy_to_mode_reg (mode1, op0);

	  mode0 = insn_data[icode].operand[0].mode;
	  if (target == 0
	      || !register_operand (target, mode0))
	    target = gen_reg_rtx (mode0);

	  if (length == 0 || lsb_index >= bitsize)
	    {
	      emit_move_insn (target, const0_rtx);
	      return target;
	    }

	  if (length + lsb_index > bitsize)
	    length = bitsize - lsb_index;

	  op1 = GEN_INT (length);
	  op2 = GEN_INT (lsb_index);

	  emit_insn (GEN_FCN (icode) (target, op0, op1, op2));
	  return target;
	}
    case IX86_BUILTIN_RDRAND16_STEP:
      mode = HImode;
      goto rdrand_step;

    case IX86_BUILTIN_RDRAND32_STEP:
      mode = SImode;
      goto rdrand_step;

    case IX86_BUILTIN_RDRAND64_STEP:
      mode = DImode;

    rdrand_step:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op1 = expand_normal (arg0);
      if (!address_operand (op1, VOIDmode))
	{
	  op1 = convert_memory_address (Pmode, op1);
	  op1 = copy_addr_to_reg (op1);
	}

      op0 = gen_reg_rtx (mode);
      emit_insn (gen_rdrand (mode, op0));

      emit_move_insn (gen_rtx_MEM (mode, op1), op0);

      op1 = force_reg (SImode, const1_rtx);

      /* Emit SImode conditional move.  */
      if (mode == HImode)
	{
	  if (TARGET_ZERO_EXTEND_WITH_AND
	      && optimize_function_for_speed_p (cfun))
	    {
	      op2 = force_reg (SImode, const0_rtx);

	      emit_insn (gen_movstricthi
			 (gen_lowpart (HImode, op2), op0));
	    }
	  else
	    {
	      op2 = gen_reg_rtx (SImode);

	      emit_insn (gen_zero_extendhisi2 (op2, op0));
	    }
	}
      else if (mode == SImode)
	op2 = op0;
      else
	op2 = gen_rtx_SUBREG (SImode, op0, 0);

      if (target == 0
	  || !register_operand (target, SImode))
	target = gen_reg_rtx (SImode);

      pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
			 const0_rtx);
      emit_insn (gen_rtx_SET (target,
			      gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
      return target;
    case IX86_BUILTIN_RDSEED16_STEP:
      mode = HImode;
      goto rdseed_step;

    case IX86_BUILTIN_RDSEED32_STEP:
      mode = SImode;
      goto rdseed_step;

    case IX86_BUILTIN_RDSEED64_STEP:
      mode = DImode;

    rdseed_step:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op1 = expand_normal (arg0);
      if (!address_operand (op1, VOIDmode))
	{
	  op1 = convert_memory_address (Pmode, op1);
	  op1 = copy_addr_to_reg (op1);
	}

      op0 = gen_reg_rtx (mode);
      emit_insn (gen_rdseed (mode, op0));

      emit_move_insn (gen_rtx_MEM (mode, op1), op0);

      op2 = gen_reg_rtx (QImode);

      pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
			 const0_rtx);
      emit_insn (gen_rtx_SET (op2, pat));

      if (target == 0
	  || !register_operand (target, SImode))
	target = gen_reg_rtx (SImode);

      emit_insn (gen_zero_extendqisi2 (target, op2));
      return target;
    case IX86_BUILTIN_SBB32:
      icode = CODE_FOR_subborrowsi;
      icode2 = CODE_FOR_subborrowsi_0;
      mode0 = SImode;
      mode1 = DImode;
      mode2 = CCmode;
      goto handlecarry;

    case IX86_BUILTIN_SBB64:
      icode = CODE_FOR_subborrowdi;
      icode2 = CODE_FOR_subborrowdi_0;
      mode0 = DImode;
      mode1 = TImode;
      mode2 = CCmode;
      goto handlecarry;

    case IX86_BUILTIN_ADDCARRYX32:
      icode = CODE_FOR_addcarrysi;
      icode2 = CODE_FOR_addcarrysi_0;
      mode0 = SImode;
      mode1 = DImode;
      mode2 = CCCmode;
      goto handlecarry;

    case IX86_BUILTIN_ADDCARRYX64:
      icode = CODE_FOR_addcarrydi;
      icode2 = CODE_FOR_addcarrydi_0;
      mode0 = DImode;
      mode1 = TImode;
      mode2 = CCCmode;

    handlecarry:
      arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in.  */
      arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1.  */
      arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2.  */
      arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out.  */

      op1 = expand_normal (arg0);
      if (!integer_zerop (arg0))
	op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));

      op2 = expand_normal (arg1);
      if (!register_operand (op2, mode0))
	op2 = copy_to_mode_reg (mode0, op2);

      op3 = expand_normal (arg2);
      if (!register_operand (op3, mode0))
	op3 = copy_to_mode_reg (mode0, op3);

      op4 = expand_normal (arg3);
      if (!address_operand (op4, VOIDmode))
	{
	  op4 = convert_memory_address (Pmode, op4);
	  op4 = copy_addr_to_reg (op4);
	}

      op0 = gen_reg_rtx (mode0);
      if (integer_zerop (arg0))
	{
	  /* If arg0 is 0, optimize right away into add or sub
	     instruction that sets CCCmode flags.  */
	  op1 = gen_rtx_REG (mode2, FLAGS_REG);
	  emit_insn (GEN_FCN (icode2) (op0, op2, op3));
	}
      else
	{
	  /* Generate CF from input operand.  */
	  emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));

	  /* Generate instruction that consumes CF.  */
	  op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
	  pat = gen_rtx_LTU (mode1, op1, const0_rtx);
	  pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
	  emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
	}

      /* Return current CF value.  */
      if (target == 0)
	target = gen_reg_rtx (QImode);

      pat = gen_rtx_LTU (QImode, op1, const0_rtx);
      emit_insn (gen_rtx_SET (target, pat));

      /* Store the result.  */
      emit_move_insn (gen_rtx_MEM (mode0, op4), op0);

      return target;
    case IX86_BUILTIN_READ_FLAGS:
      if (ignore)
	return const0_rtx;

      emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));

      if (optimize
	  || target == NULL_RTX
	  || !nonimmediate_operand (target, word_mode)
	  || GET_MODE (target) != word_mode)
	target = gen_reg_rtx (word_mode);

      emit_insn (gen_pop (target));
      return target;

    case IX86_BUILTIN_WRITE_FLAGS:

      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      if (!general_no_elim_operand (op0, word_mode))
	op0 = copy_to_mode_reg (word_mode, op0);

      emit_insn (gen_push (op0));
      emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
      return 0;
    case IX86_BUILTIN_KTESTC8:
      icode = CODE_FOR_ktestqi;
      mode3 = CCCmode;
      goto kortest;

    case IX86_BUILTIN_KTESTZ8:
      icode = CODE_FOR_ktestqi;
      mode3 = CCZmode;
      goto kortest;

    case IX86_BUILTIN_KTESTC16:
      icode = CODE_FOR_ktesthi;
      mode3 = CCCmode;
      goto kortest;

    case IX86_BUILTIN_KTESTZ16:
      icode = CODE_FOR_ktesthi;
      mode3 = CCZmode;
      goto kortest;

    case IX86_BUILTIN_KTESTC32:
      icode = CODE_FOR_ktestsi;
      mode3 = CCCmode;
      goto kortest;

    case IX86_BUILTIN_KTESTZ32:
      icode = CODE_FOR_ktestsi;
      mode3 = CCZmode;
      goto kortest;

    case IX86_BUILTIN_KTESTC64:
      icode = CODE_FOR_ktestdi;
      mode3 = CCCmode;
      goto kortest;

    case IX86_BUILTIN_KTESTZ64:
      icode = CODE_FOR_ktestdi;
      mode3 = CCZmode;
      goto kortest;

    case IX86_BUILTIN_KORTESTC8:
      icode = CODE_FOR_kortestqi;
      mode3 = CCCmode;
      goto kortest;

    case IX86_BUILTIN_KORTESTZ8:
      icode = CODE_FOR_kortestqi;
      mode3 = CCZmode;
      goto kortest;

    case IX86_BUILTIN_KORTESTC16:
      icode = CODE_FOR_kortesthi;
      mode3 = CCCmode;
      goto kortest;

    case IX86_BUILTIN_KORTESTZ16:
      icode = CODE_FOR_kortesthi;
      mode3 = CCZmode;
      goto kortest;

    case IX86_BUILTIN_KORTESTC32:
      icode = CODE_FOR_kortestsi;
      mode3 = CCCmode;
      goto kortest;

    case IX86_BUILTIN_KORTESTZ32:
      icode = CODE_FOR_kortestsi;
      mode3 = CCZmode;
      goto kortest;

    case IX86_BUILTIN_KORTESTC64:
      icode = CODE_FOR_kortestdi;
      mode3 = CCCmode;
      goto kortest;

    case IX86_BUILTIN_KORTESTZ64:
      icode = CODE_FOR_kortestdi;
      mode3 = CCZmode;

    kortest:
      arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1.  */
      arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2.  */
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);

      mode0 = insn_data[icode].operand[0].mode;
      mode1 = insn_data[icode].operand[1].mode;

      if (GET_MODE (op0) != VOIDmode)
	op0 = force_reg (GET_MODE (op0), op0);

      op0 = gen_lowpart (mode0, op0);

      if (!insn_data[icode].operand[0].predicate (op0, mode0))
	op0 = copy_to_mode_reg (mode0, op0);

      if (GET_MODE (op1) != VOIDmode)
	op1 = force_reg (GET_MODE (op1), op1);

      op1 = gen_lowpart (mode1, op1);

      if (!insn_data[icode].operand[1].predicate (op1, mode1))
	op1 = copy_to_mode_reg (mode1, op1);

      target = gen_reg_rtx (QImode);

      /* Emit kortest.  */
      emit_insn (GEN_FCN (icode) (op0, op1));
      /* And use setcc to return result from flags.  */
      ix86_expand_setcc (target, EQ,
			 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
      return target;
    case IX86_BUILTIN_GATHERSIV2DF:
      icode = CODE_FOR_avx2_gathersiv2df;
      goto gather_gen;
    case IX86_BUILTIN_GATHERSIV4DF:
      icode = CODE_FOR_avx2_gathersiv4df;
      goto gather_gen;
    case IX86_BUILTIN_GATHERDIV2DF:
      icode = CODE_FOR_avx2_gatherdiv2df;
      goto gather_gen;
    case IX86_BUILTIN_GATHERDIV4DF:
      icode = CODE_FOR_avx2_gatherdiv4df;
      goto gather_gen;
    case IX86_BUILTIN_GATHERSIV4SF:
      icode = CODE_FOR_avx2_gathersiv4sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHERSIV8SF:
      icode = CODE_FOR_avx2_gathersiv8sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHERDIV4SF:
      icode = CODE_FOR_avx2_gatherdiv4sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHERDIV8SF:
      icode = CODE_FOR_avx2_gatherdiv8sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHERSIV2DI:
      icode = CODE_FOR_avx2_gathersiv2di;
      goto gather_gen;
    case IX86_BUILTIN_GATHERSIV4DI:
      icode = CODE_FOR_avx2_gathersiv4di;
      goto gather_gen;
    case IX86_BUILTIN_GATHERDIV2DI:
      icode = CODE_FOR_avx2_gatherdiv2di;
      goto gather_gen;
    case IX86_BUILTIN_GATHERDIV4DI:
      icode = CODE_FOR_avx2_gatherdiv4di;
      goto gather_gen;
    case IX86_BUILTIN_GATHERSIV4SI:
      icode = CODE_FOR_avx2_gathersiv4si;
      goto gather_gen;
    case IX86_BUILTIN_GATHERSIV8SI:
      icode = CODE_FOR_avx2_gathersiv8si;
      goto gather_gen;
    case IX86_BUILTIN_GATHERDIV4SI:
      icode = CODE_FOR_avx2_gatherdiv4si;
      goto gather_gen;
    case IX86_BUILTIN_GATHERDIV8SI:
      icode = CODE_FOR_avx2_gatherdiv8si;
      goto gather_gen;
    case IX86_BUILTIN_GATHERALTSIV4DF:
      icode = CODE_FOR_avx2_gathersiv4df;
      goto gather_gen;
    case IX86_BUILTIN_GATHERALTDIV8SF:
      icode = CODE_FOR_avx2_gatherdiv8sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHERALTSIV4DI:
      icode = CODE_FOR_avx2_gathersiv4di;
      goto gather_gen;
    case IX86_BUILTIN_GATHERALTDIV8SI:
      icode = CODE_FOR_avx2_gatherdiv8si;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV16SF:
      icode = CODE_FOR_avx512f_gathersiv16sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV8DF:
      icode = CODE_FOR_avx512f_gathersiv8df;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV16SF:
      icode = CODE_FOR_avx512f_gatherdiv16sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV8DF:
      icode = CODE_FOR_avx512f_gatherdiv8df;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV16SI:
      icode = CODE_FOR_avx512f_gathersiv16si;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV8DI:
      icode = CODE_FOR_avx512f_gathersiv8di;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV16SI:
      icode = CODE_FOR_avx512f_gatherdiv16si;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV8DI:
      icode = CODE_FOR_avx512f_gatherdiv8di;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3ALTSIV8DF:
      icode = CODE_FOR_avx512f_gathersiv8df;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3ALTDIV16SF:
      icode = CODE_FOR_avx512f_gatherdiv16sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3ALTSIV8DI:
      icode = CODE_FOR_avx512f_gathersiv8di;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3ALTDIV16SI:
      icode = CODE_FOR_avx512f_gatherdiv16si;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV2DF:
      icode = CODE_FOR_avx512vl_gathersiv2df;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV4DF:
      icode = CODE_FOR_avx512vl_gathersiv4df;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV2DF:
      icode = CODE_FOR_avx512vl_gatherdiv2df;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV4DF:
      icode = CODE_FOR_avx512vl_gatherdiv4df;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV4SF:
      icode = CODE_FOR_avx512vl_gathersiv4sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV8SF:
      icode = CODE_FOR_avx512vl_gathersiv8sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV4SF:
      icode = CODE_FOR_avx512vl_gatherdiv4sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV8SF:
      icode = CODE_FOR_avx512vl_gatherdiv8sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV2DI:
      icode = CODE_FOR_avx512vl_gathersiv2di;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV4DI:
      icode = CODE_FOR_avx512vl_gathersiv4di;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV2DI:
      icode = CODE_FOR_avx512vl_gatherdiv2di;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV4DI:
      icode = CODE_FOR_avx512vl_gatherdiv4di;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV4SI:
      icode = CODE_FOR_avx512vl_gathersiv4si;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV8SI:
      icode = CODE_FOR_avx512vl_gathersiv8si;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV4SI:
      icode = CODE_FOR_avx512vl_gatherdiv4si;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV8SI:
      icode = CODE_FOR_avx512vl_gatherdiv8si;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3ALTSIV4DF:
      icode = CODE_FOR_avx512vl_gathersiv4df;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3ALTDIV8SF:
      icode = CODE_FOR_avx512vl_gatherdiv8sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3ALTSIV4DI:
      icode = CODE_FOR_avx512vl_gathersiv4di;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3ALTDIV8SI:
      icode = CODE_FOR_avx512vl_gatherdiv8si;
      goto gather_gen;
    case IX86_BUILTIN_SCATTERSIV16SF:
      icode = CODE_FOR_avx512f_scattersiv16sf;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV8DF:
      icode = CODE_FOR_avx512f_scattersiv8df;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV16SF:
      icode = CODE_FOR_avx512f_scatterdiv16sf;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV8DF:
      icode = CODE_FOR_avx512f_scatterdiv8df;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV16SI:
      icode = CODE_FOR_avx512f_scattersiv16si;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV8DI:
      icode = CODE_FOR_avx512f_scattersiv8di;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV16SI:
      icode = CODE_FOR_avx512f_scatterdiv16si;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV8DI:
      icode = CODE_FOR_avx512f_scatterdiv8di;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV8SF:
      icode = CODE_FOR_avx512vl_scattersiv8sf;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV4SF:
      icode = CODE_FOR_avx512vl_scattersiv4sf;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV4DF:
      icode = CODE_FOR_avx512vl_scattersiv4df;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV2DF:
      icode = CODE_FOR_avx512vl_scattersiv2df;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV8SF:
      icode = CODE_FOR_avx512vl_scatterdiv8sf;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV4SF:
      icode = CODE_FOR_avx512vl_scatterdiv4sf;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV4DF:
      icode = CODE_FOR_avx512vl_scatterdiv4df;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV2DF:
      icode = CODE_FOR_avx512vl_scatterdiv2df;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV8SI:
      icode = CODE_FOR_avx512vl_scattersiv8si;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV4SI:
      icode = CODE_FOR_avx512vl_scattersiv4si;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV4DI:
      icode = CODE_FOR_avx512vl_scattersiv4di;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV2DI:
      icode = CODE_FOR_avx512vl_scattersiv2di;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV8SI:
      icode = CODE_FOR_avx512vl_scatterdiv8si;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV4SI:
      icode = CODE_FOR_avx512vl_scatterdiv4si;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV4DI:
      icode = CODE_FOR_avx512vl_scatterdiv4di;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV2DI:
      icode = CODE_FOR_avx512vl_scatterdiv2di;
      goto scatter_gen;
    case IX86_BUILTIN_GATHERPFDPD:
      icode = CODE_FOR_avx512pf_gatherpfv8sidf;
      goto vec_prefetch_gen;
    case IX86_BUILTIN_SCATTERALTSIV8DF:
      icode = CODE_FOR_avx512f_scattersiv8df;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTDIV16SF:
      icode = CODE_FOR_avx512f_scatterdiv16sf;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTSIV8DI:
      icode = CODE_FOR_avx512f_scattersiv8di;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTDIV16SI:
      icode = CODE_FOR_avx512f_scatterdiv16si;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTSIV4DF:
      icode = CODE_FOR_avx512vl_scattersiv4df;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTDIV8SF:
      icode = CODE_FOR_avx512vl_scatterdiv8sf;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTSIV4DI:
      icode = CODE_FOR_avx512vl_scattersiv4di;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTDIV8SI:
      icode = CODE_FOR_avx512vl_scatterdiv8si;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTSIV2DF:
      icode = CODE_FOR_avx512vl_scattersiv2df;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTDIV4SF:
      icode = CODE_FOR_avx512vl_scatterdiv4sf;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTSIV2DI:
      icode = CODE_FOR_avx512vl_scattersiv2di;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTDIV4SI:
      icode = CODE_FOR_avx512vl_scatterdiv4si;
      goto scatter_gen;
    case IX86_BUILTIN_GATHERPFDPS:
      icode = CODE_FOR_avx512pf_gatherpfv16sisf;
      goto vec_prefetch_gen;
    case IX86_BUILTIN_GATHERPFQPD:
      icode = CODE_FOR_avx512pf_gatherpfv8didf;
      goto vec_prefetch_gen;
    case IX86_BUILTIN_GATHERPFQPS:
      icode = CODE_FOR_avx512pf_gatherpfv8disf;
      goto vec_prefetch_gen;
    case IX86_BUILTIN_SCATTERPFDPD:
      icode = CODE_FOR_avx512pf_scatterpfv8sidf;
      goto vec_prefetch_gen;
    case IX86_BUILTIN_SCATTERPFDPS:
      icode = CODE_FOR_avx512pf_scatterpfv16sisf;
      goto vec_prefetch_gen;
    case IX86_BUILTIN_SCATTERPFQPD:
      icode = CODE_FOR_avx512pf_scatterpfv8didf;
      goto vec_prefetch_gen;
    case IX86_BUILTIN_SCATTERPFQPS:
      icode = CODE_FOR_avx512pf_scatterpfv8disf;
      goto vec_prefetch_gen;
    gather_gen:
      rtx (*gen) (rtx, rtx);

      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      arg2 = CALL_EXPR_ARG (exp, 2);
      arg3 = CALL_EXPR_ARG (exp, 3);
      arg4 = CALL_EXPR_ARG (exp, 4);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      op3 = expand_normal (arg3);
      op4 = expand_normal (arg4);
      /* Note the arg order is different from the operand order.  */
      mode0 = insn_data[icode].operand[1].mode;
      mode2 = insn_data[icode].operand[3].mode;
      mode3 = insn_data[icode].operand[4].mode;
      mode4 = insn_data[icode].operand[5].mode;

      if (target == NULL_RTX
	  || GET_MODE (target) != insn_data[icode].operand[0].mode
	  || !insn_data[icode].operand[0].predicate (target,
						     GET_MODE (target)))
	subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
      else
	subtarget = target;
      switch (fcode)
	{
	case IX86_BUILTIN_GATHER3ALTSIV8DF:
	case IX86_BUILTIN_GATHER3ALTSIV8DI:
	  half = gen_reg_rtx (V8SImode);
	  if (!nonimmediate_operand (op2, V16SImode))
	    op2 = copy_to_mode_reg (V16SImode, op2);
	  emit_insn (gen_vec_extract_lo_v16si (half, op2));
	  op2 = half;
	  break;
	case IX86_BUILTIN_GATHER3ALTSIV4DF:
	case IX86_BUILTIN_GATHER3ALTSIV4DI:
	case IX86_BUILTIN_GATHERALTSIV4DF:
	case IX86_BUILTIN_GATHERALTSIV4DI:
	  half = gen_reg_rtx (V4SImode);
	  if (!nonimmediate_operand (op2, V8SImode))
	    op2 = copy_to_mode_reg (V8SImode, op2);
	  emit_insn (gen_vec_extract_lo_v8si (half, op2));
	  op2 = half;
	  break;
	case IX86_BUILTIN_GATHER3ALTDIV16SF:
	case IX86_BUILTIN_GATHER3ALTDIV16SI:
	  half = gen_reg_rtx (mode0);
	  if (mode0 == V8SFmode)
	    gen = gen_vec_extract_lo_v16sf;
	  else
	    gen = gen_vec_extract_lo_v16si;
	  if (!nonimmediate_operand (op0, GET_MODE (op0)))
	    op0 = copy_to_mode_reg (GET_MODE (op0), op0);
	  emit_insn (gen (half, op0));
	  op0 = half;
	  op3 = lowpart_subreg (QImode, op3, HImode);
	  break;
	case IX86_BUILTIN_GATHER3ALTDIV8SF:
	case IX86_BUILTIN_GATHER3ALTDIV8SI:
	case IX86_BUILTIN_GATHERALTDIV8SF:
	case IX86_BUILTIN_GATHERALTDIV8SI:
	  half = gen_reg_rtx (mode0);
	  if (mode0 == V4SFmode)
	    gen = gen_vec_extract_lo_v8sf;
	  else
	    gen = gen_vec_extract_lo_v8si;
	  if (!nonimmediate_operand (op0, GET_MODE (op0)))
	    op0 = copy_to_mode_reg (GET_MODE (op0), op0);
	  emit_insn (gen (half, op0));
	  op0 = half;
	  if (VECTOR_MODE_P (GET_MODE (op3)))
	    {
	      half = gen_reg_rtx (mode0);
	      if (!nonimmediate_operand (op3, GET_MODE (op3)))
		op3 = copy_to_mode_reg (GET_MODE (op3), op3);
	      emit_insn (gen (half, op3));
	      op3 = half;
	    }
	  break;
	default:
	  break;
	}
      /* Force memory operand only with base register here.  But we
	 don't want to do it on memory operand for other builtin
	 functions.  */
      op1 = ix86_zero_extend_to_Pmode (op1);

      if (!insn_data[icode].operand[1].predicate (op0, mode0))
	op0 = copy_to_mode_reg (mode0, op0);
      if (!insn_data[icode].operand[2].predicate (op1, Pmode))
	op1 = copy_to_mode_reg (Pmode, op1);
      if (!insn_data[icode].operand[3].predicate (op2, mode2))
	op2 = copy_to_mode_reg (mode2, op2);

      op3 = fixup_modeless_constant (op3, mode3);

      if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
	{
	  if (!insn_data[icode].operand[4].predicate (op3, mode3))
	    op3 = copy_to_mode_reg (mode3, op3);
	}
      else
	{
	  op3 = copy_to_reg (op3);
	  op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
	}
      if (!insn_data[icode].operand[5].predicate (op4, mode4))
	{
	  error ("the last argument must be scale 1, 2, 4, 8");
	  return const0_rtx;
	}

      /* Optimize.  If mask is known to have all high bits set,
	 replace op0 with pc_rtx to signal that the instruction
	 overwrites the whole destination and doesn't use its
	 previous contents.  */
      if (optimize)
	{
	  if (TREE_CODE (arg3) == INTEGER_CST)
	    {
	      if (integer_all_onesp (arg3))
		op0 = pc_rtx;
	    }
	  else if (TREE_CODE (arg3) == VECTOR_CST)
	    {
	      unsigned int negative = 0;
	      for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
		{
		  tree cst = VECTOR_CST_ELT (arg3, i);
		  if (TREE_CODE (cst) == INTEGER_CST
		      && tree_int_cst_sign_bit (cst))
		    negative++;
		  else if (TREE_CODE (cst) == REAL_CST
			   && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
		    negative++;
		}
	      if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
		op0 = pc_rtx;
	    }
	  else if (TREE_CODE (arg3) == SSA_NAME
		   && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
	    {
	      /* Recognize also when mask is like:
		 __v2df src = _mm_setzero_pd ();
		 __v2df mask = _mm_cmpeq_pd (src, src);
		 or
		 __v8sf src = _mm256_setzero_ps ();
		 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
		 as that is a cheaper way to load all ones into
		 a register than having to load a constant from
		 memory.  */
	      gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
	      if (is_gimple_call (def_stmt))
		{
		  tree fndecl = gimple_call_fndecl (def_stmt);
		  if (fndecl
		      && fndecl_built_in_p (fndecl, BUILT_IN_MD))
		    switch (DECL_MD_FUNCTION_CODE (fndecl))
		      {
		      case IX86_BUILTIN_CMPPD:
		      case IX86_BUILTIN_CMPPS:
		      case IX86_BUILTIN_CMPPD256:
		      case IX86_BUILTIN_CMPPS256:
			if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
			  break;
			/* FALLTHRU */
		      case IX86_BUILTIN_CMPEQPD:
		      case IX86_BUILTIN_CMPEQPS:
			if (initializer_zerop (gimple_call_arg (def_stmt, 0))
			    && initializer_zerop (gimple_call_arg (def_stmt,
								   1)))
			  op0 = pc_rtx;
			break;
		      default:
			break;
		      }
		}
	    }
	}

      pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
      if (! pat)
	return const0_rtx;
      emit_insn (pat);
      switch (fcode)
	{
	case IX86_BUILTIN_GATHER3DIV16SF:
	  if (target == NULL_RTX)
	    target = gen_reg_rtx (V8SFmode);
	  emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
	  break;
	case IX86_BUILTIN_GATHER3DIV16SI:
	  if (target == NULL_RTX)
	    target = gen_reg_rtx (V8SImode);
	  emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
	  break;
	case IX86_BUILTIN_GATHER3DIV8SF:
	case IX86_BUILTIN_GATHERDIV8SF:
	  if (target == NULL_RTX)
	    target = gen_reg_rtx (V4SFmode);
	  emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
	  break;
	case IX86_BUILTIN_GATHER3DIV8SI:
	case IX86_BUILTIN_GATHERDIV8SI:
	  if (target == NULL_RTX)
	    target = gen_reg_rtx (V4SImode);
	  emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
	  break;
	default:
	  target = subtarget;
	  break;
	}
      return target;
    scatter_gen:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      arg2 = CALL_EXPR_ARG (exp, 2);
      arg3 = CALL_EXPR_ARG (exp, 3);
      arg4 = CALL_EXPR_ARG (exp, 4);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      op3 = expand_normal (arg3);
      op4 = expand_normal (arg4);
      mode1 = insn_data[icode].operand[1].mode;
      mode2 = insn_data[icode].operand[2].mode;
      mode3 = insn_data[icode].operand[3].mode;
      mode4 = insn_data[icode].operand[4].mode;

      /* Scatter instruction stores operand op3 to memory with
	 indices from op2 and scale from op4 under writemask op1.
	 If index operand op2 has more elements than source operand
	 op3, one needs to use only its low half.  And vice versa.  */
      switch (fcode)
	{
	case IX86_BUILTIN_SCATTERALTSIV8DF:
	case IX86_BUILTIN_SCATTERALTSIV8DI:
	  half = gen_reg_rtx (V8SImode);
	  if (!nonimmediate_operand (op2, V16SImode))
	    op2 = copy_to_mode_reg (V16SImode, op2);
	  emit_insn (gen_vec_extract_lo_v16si (half, op2));
	  op2 = half;
	  break;
	case IX86_BUILTIN_SCATTERALTDIV16SF:
	case IX86_BUILTIN_SCATTERALTDIV16SI:
	  half = gen_reg_rtx (mode3);
	  if (mode3 == V8SFmode)
	    gen = gen_vec_extract_lo_v16sf;
	  else
	    gen = gen_vec_extract_lo_v16si;
	  if (!nonimmediate_operand (op3, GET_MODE (op3)))
	    op3 = copy_to_mode_reg (GET_MODE (op3), op3);
	  emit_insn (gen (half, op3));
	  op3 = half;
	  break;
	case IX86_BUILTIN_SCATTERALTSIV4DF:
	case IX86_BUILTIN_SCATTERALTSIV4DI:
	  half = gen_reg_rtx (V4SImode);
	  if (!nonimmediate_operand (op2, V8SImode))
	    op2 = copy_to_mode_reg (V8SImode, op2);
	  emit_insn (gen_vec_extract_lo_v8si (half, op2));
	  op2 = half;
	  break;
	case IX86_BUILTIN_SCATTERALTDIV8SF:
	case IX86_BUILTIN_SCATTERALTDIV8SI:
	  half = gen_reg_rtx (mode3);
	  if (mode3 == V4SFmode)
	    gen = gen_vec_extract_lo_v8sf;
	  else
	    gen = gen_vec_extract_lo_v8si;
	  if (!nonimmediate_operand (op3, GET_MODE (op3)))
	    op3 = copy_to_mode_reg (GET_MODE (op3), op3);
	  emit_insn (gen (half, op3));
	  op3 = half;
	  break;
	case IX86_BUILTIN_SCATTERALTSIV2DF:
	case IX86_BUILTIN_SCATTERALTSIV2DI:
	  if (!nonimmediate_operand (op2, V4SImode))
	    op2 = copy_to_mode_reg (V4SImode, op2);
	  break;
	case IX86_BUILTIN_SCATTERALTDIV4SF:
	case IX86_BUILTIN_SCATTERALTDIV4SI:
	  if (!nonimmediate_operand (op3, GET_MODE (op3)))
	    op3 = copy_to_mode_reg (GET_MODE (op3), op3);
	  break;
	default:
	  break;
	}

      /* Force memory operand only with base register here.  But we
	 don't want to do it on memory operand for other builtin
	 functions.  */
      op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));

      if (!insn_data[icode].operand[0].predicate (op0, Pmode))
	op0 = copy_to_mode_reg (Pmode, op0);

      op1 = fixup_modeless_constant (op1, mode1);

      if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
	{
	  if (!insn_data[icode].operand[1].predicate (op1, mode1))
	    op1 = copy_to_mode_reg (mode1, op1);
	}
      else
	{
	  op1 = copy_to_reg (op1);
	  op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
	}

      if (!insn_data[icode].operand[2].predicate (op2, mode2))
	op2 = copy_to_mode_reg (mode2, op2);

      if (!insn_data[icode].operand[3].predicate (op3, mode3))
	op3 = copy_to_mode_reg (mode3, op3);

      if (!insn_data[icode].operand[4].predicate (op4, mode4))
	{
	  error ("the last argument must be scale 1, 2, 4, 8");
	  return const0_rtx;
	}

      pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
      if (! pat)
	return const0_rtx;

      emit_insn (pat);
      return 0;
    vec_prefetch_gen:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      arg2 = CALL_EXPR_ARG (exp, 2);
      arg3 = CALL_EXPR_ARG (exp, 3);
      arg4 = CALL_EXPR_ARG (exp, 4);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      op3 = expand_normal (arg3);
      op4 = expand_normal (arg4);
      mode0 = insn_data[icode].operand[0].mode;
      mode1 = insn_data[icode].operand[1].mode;
      mode3 = insn_data[icode].operand[3].mode;
      mode4 = insn_data[icode].operand[4].mode;

      op0 = fixup_modeless_constant (op0, mode0);

      if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
	{
	  if (!insn_data[icode].operand[0].predicate (op0, mode0))
	    op0 = copy_to_mode_reg (mode0, op0);
	}
      else
	{
	  op0 = copy_to_reg (op0);
	  op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
	}

      if (!insn_data[icode].operand[1].predicate (op1, mode1))
	op1 = copy_to_mode_reg (mode1, op1);

      /* Force memory operand only with base register here.  But we
	 don't want to do it on memory operand for other builtin
	 functions.  */
      op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));

      if (!insn_data[icode].operand[2].predicate (op2, Pmode))
	op2 = copy_to_mode_reg (Pmode, op2);

      if (!insn_data[icode].operand[3].predicate (op3, mode3))
	{
	  error ("the fourth argument must be scale 1, 2, 4, 8");
	  return const0_rtx;
	}

      if (!insn_data[icode].operand[4].predicate (op4, mode4))
	{
	  error ("incorrect hint operand");
	  return const0_rtx;
	}

      pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
      if (! pat)
	return const0_rtx;

      emit_insn (pat);

      return 0;
    case IX86_BUILTIN_XABORT:
      icode = CODE_FOR_xabort;
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      mode0 = insn_data[icode].operand[0].mode;
      if (!insn_data[icode].operand[0].predicate (op0, mode0))
	{
	  error ("the argument to %<xabort%> intrinsic must "
		 "be an 8-bit immediate");
	  return const0_rtx;
	}
      emit_insn (gen_xabort (op0));
      return 0;
    case IX86_BUILTIN_RDSSPD:
    case IX86_BUILTIN_RDSSPQ:
      mode = (fcode == IX86_BUILTIN_RDSSPD ? SImode : DImode);

      if (target == 0
	  || !register_operand (target, mode))
	target = gen_reg_rtx (mode);

      op0 = force_reg (mode, const0_rtx);

      emit_insn (gen_rdssp (mode, target, op0));
      return target;

    case IX86_BUILTIN_INCSSPD:
    case IX86_BUILTIN_INCSSPQ:
      mode = (fcode == IX86_BUILTIN_INCSSPD ? SImode : DImode);

      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);

      op0 = force_reg (mode, op0);

      emit_insn (gen_incssp (mode, op0));
      return 0;
    case IX86_BUILTIN_HRESET:
      icode = CODE_FOR_hreset;
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      op0 = force_reg (SImode, op0);
      emit_insn (gen_hreset (op0));
      return 0;
    case IX86_BUILTIN_RSTORSSP:
    case IX86_BUILTIN_CLRSSBSY:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      icode = (fcode == IX86_BUILTIN_RSTORSSP
	       ? CODE_FOR_rstorssp
	       : CODE_FOR_clrssbsy);

      if (!address_operand (op0, VOIDmode))
	{
	  op0 = convert_memory_address (Pmode, op0);
	  op0 = copy_addr_to_reg (op0);
	}
      emit_insn (GEN_FCN (icode) (gen_rtx_MEM (DImode, op0)));
      return 0;
    case IX86_BUILTIN_WRSSD:
    case IX86_BUILTIN_WRSSQ:
    case IX86_BUILTIN_WRUSSD:
    case IX86_BUILTIN_WRUSSQ:
      mode = ((fcode == IX86_BUILTIN_WRSSD
	       || fcode == IX86_BUILTIN_WRUSSD)
	      ? SImode : DImode);

      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      op1 = expand_normal (arg1);

      op0 = force_reg (mode, op0);

      if (!address_operand (op1, VOIDmode))
	{
	  op1 = convert_memory_address (Pmode, op1);
	  op1 = copy_addr_to_reg (op1);
	}
      op1 = gen_rtx_MEM (mode, op1);

      icode = ((fcode == IX86_BUILTIN_WRSSD
		|| fcode == IX86_BUILTIN_WRSSQ)
	       ? code_for_wrss (mode)
	       : code_for_wruss (mode));
      emit_insn (GEN_FCN (icode) (op0, op1));

      return 0;

    default:
      break;
    }
  if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
      && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
    {
      i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
      return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
					       target);
    }

  if (fcode >= IX86_BUILTIN__BDESC_PURE_ARGS_FIRST
      && fcode <= IX86_BUILTIN__BDESC_PURE_ARGS_LAST)
    {
      i = fcode - IX86_BUILTIN__BDESC_PURE_ARGS_FIRST;
      return ix86_expand_special_args_builtin (bdesc_pure_args + i, exp,
					       target);
    }
  if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
      && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
    {
      i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
      rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
      rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
      rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
      int masked = 1;
      machine_mode mode, wide_mode, nar_mode;

      nar_mode  = V4SFmode;
      mode      = V16SFmode;
      wide_mode = V64SFmode;
      fcn_mask  = gen_avx5124fmaddps_4fmaddps_mask;
      fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;

      switch (fcode)
	{
	case IX86_BUILTIN_4FMAPS:
	  fcn = gen_avx5124fmaddps_4fmaddps;
	  masked = 0;
	  goto v4fma_expand;

	case IX86_BUILTIN_4DPWSSD:
	  nar_mode  = V4SImode;
	  mode      = V16SImode;
	  wide_mode = V64SImode;
	  fcn = gen_avx5124vnniw_vp4dpwssd;
	  masked = 0;
	  goto v4fma_expand;

	case IX86_BUILTIN_4DPWSSDS:
	  nar_mode  = V4SImode;
	  mode      = V16SImode;
	  wide_mode = V64SImode;
	  fcn = gen_avx5124vnniw_vp4dpwssds;
	  masked = 0;
	  goto v4fma_expand;

	case IX86_BUILTIN_4FNMAPS:
	  fcn = gen_avx5124fmaddps_4fnmaddps;
	  masked = 0;
	  goto v4fma_expand;

	case IX86_BUILTIN_4FNMAPS_MASK:
	  fcn_mask  = gen_avx5124fmaddps_4fnmaddps_mask;
	  fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
	  goto v4fma_expand;

	case IX86_BUILTIN_4DPWSSD_MASK:
	  nar_mode  = V4SImode;
	  mode      = V16SImode;
	  wide_mode = V64SImode;
	  fcn_mask  = gen_avx5124vnniw_vp4dpwssd_mask;
	  fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
	  goto v4fma_expand;

	case IX86_BUILTIN_4DPWSSDS_MASK:
	  nar_mode  = V4SImode;
	  mode      = V16SImode;
	  wide_mode = V64SImode;
	  fcn_mask  = gen_avx5124vnniw_vp4dpwssds_mask;
	  fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
	  goto v4fma_expand;

	case IX86_BUILTIN_4FMAPS_MASK:
	  {
	    tree args[4];
	    rtx ops[4];
	    rtx wide_reg;
	    rtx accum;
	    rtx addr;
	    rtx mem;

	  v4fma_expand:
	    wide_reg = gen_reg_rtx (wide_mode);
	    for (i = 0; i < 4; i++)
	      {
		args[i] = CALL_EXPR_ARG (exp, i);
		ops[i] = expand_normal (args[i]);

		emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
				ops[i]);
	      }

	    accum = expand_normal (CALL_EXPR_ARG (exp, 4));
	    accum = force_reg (mode, accum);

	    addr = expand_normal (CALL_EXPR_ARG (exp, 5));
	    addr = force_reg (Pmode, addr);

	    mem = gen_rtx_MEM (nar_mode, addr);

	    target = gen_reg_rtx (mode);

	    emit_move_insn (target, accum);

	    if (! masked)
	      emit_insn (fcn (target, accum, wide_reg, mem));
	    else
	      {
		rtx merge, mask;
		merge = expand_normal (CALL_EXPR_ARG (exp, 6));

		mask = expand_normal (CALL_EXPR_ARG (exp, 7));

		if (CONST_INT_P (mask))
		  mask = fixup_modeless_constant (mask, HImode);

		mask = force_reg (HImode, mask);

		if (GET_MODE (mask) != HImode)
		  mask = gen_rtx_SUBREG (HImode, mask, 0);

		/* If merge is 0 then we're about to emit z-masked variant.  */
		if (const0_operand (merge, mode))
		  emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
		/* If merge is the same as accum then emit merge-masked variant.  */
		else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
		  {
		    merge = force_reg (mode, merge);
		    emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
		  }
		/* Merge with something unknown might happen if we z-mask w/ -O0.  */
		else
		  {
		    target = gen_reg_rtx (mode);
		    emit_move_insn (target, merge);
		    emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
		  }
	      }
	    return target;
	  }

	case IX86_BUILTIN_4FNMASS:
	  fcn = gen_avx5124fmaddps_4fnmaddss;
	  masked = 0;
	  goto s4fma_expand;

	case IX86_BUILTIN_4FMASS:
	  fcn = gen_avx5124fmaddps_4fmaddss;
	  masked = 0;
	  goto s4fma_expand;

	case IX86_BUILTIN_4FNMASS_MASK:
	  fcn_mask  = gen_avx5124fmaddps_4fnmaddss_mask;
	  fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
	  goto s4fma_expand;

	case IX86_BUILTIN_4FMASS_MASK:
	  {
	    tree args[4];
	    rtx ops[4];
	    rtx wide_reg;
	    rtx accum;
	    rtx addr;
	    rtx mem;

	    fcn_mask  = gen_avx5124fmaddps_4fmaddss_mask;
	    fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;

	  s4fma_expand:
	    mode = V4SFmode;
	    wide_reg = gen_reg_rtx (V64SFmode);
	    for (i = 0; i < 4; i++)
	      {
		rtx tmp;
		args[i] = CALL_EXPR_ARG (exp, i);
		ops[i] = expand_normal (args[i]);

		tmp = gen_reg_rtx (SFmode);
		emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));

		emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
				gen_rtx_SUBREG (V16SFmode, tmp, 0));
	      }

	    accum = expand_normal (CALL_EXPR_ARG (exp, 4));
	    accum = force_reg (V4SFmode, accum);

	    addr = expand_normal (CALL_EXPR_ARG (exp, 5));
	    addr = force_reg (Pmode, addr);

	    mem = gen_rtx_MEM (V4SFmode, addr);

	    target = gen_reg_rtx (V4SFmode);

	    emit_move_insn (target, accum);

	    if (! masked)
	      emit_insn (fcn (target, accum, wide_reg, mem));
	    else
	      {
		rtx merge, mask;
		merge = expand_normal (CALL_EXPR_ARG (exp, 6));

		mask = expand_normal (CALL_EXPR_ARG (exp, 7));

		if (CONST_INT_P (mask))
		  mask = fixup_modeless_constant (mask, QImode);

		mask = force_reg (QImode, mask);

		if (GET_MODE (mask) != QImode)
		  mask = gen_rtx_SUBREG (QImode, mask, 0);

		/* If merge is 0 then we're about to emit z-masked variant.  */
		if (const0_operand (merge, mode))
		  emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
		/* If merge is the same as accum then emit merge-masked
		   variant.  */
		else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
		  {
		    merge = force_reg (mode, merge);
		    emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
		  }
		/* Merge with something unknown might happen if we z-mask
		   w/ -O0.  */
		else
		  {
		    target = gen_reg_rtx (mode);
		    emit_move_insn (target, merge);
		    emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
		  }
	      }
	    return target;
	  }
	case IX86_BUILTIN_RDPID:
	  return ix86_expand_special_args_builtin (bdesc_args + i, exp,
						   target);
	case IX86_BUILTIN_FABSQ:
	case IX86_BUILTIN_COPYSIGNQ:
	  if (!TARGET_SSE)
	    /* Emit a normal call if SSE isn't available.  */
	    return expand_call (exp, target, ignore);
	  /* FALLTHRU */
	default:
	  return ix86_expand_args_builtin (bdesc_args + i, exp, target);
	}
    }
  if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
      && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
    {
      i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
      return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
    }

  if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
      && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
    {
      i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
      return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
    }

  if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
      && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
    {
      i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
      return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
    }

  if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
      && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
    {
      i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
      return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
    }

  if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
      && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
    {
      i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
      const struct builtin_description *d = bdesc_multi_arg + i;
      return ix86_expand_multi_arg_builtin (d->icode, exp, target,
					    (enum ix86_builtin_func_type)
					    d->flag, d->comparison);
    }

  if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
      && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
    {
      i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
      return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
					       target);
    }

  gcc_unreachable ();
}
/* A subroutine of ix86_expand_vector_init_duplicate.  Tries to
   fill target with val via vec_duplicate.  */

static bool
ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
{
  bool ok;
  rtx_insn *insn;
  rtx dup;

  /* First attempt to recognize VAL as-is.  */
  dup = gen_vec_duplicate (mode, val);
  insn = emit_insn (gen_rtx_SET (target, dup));
  if (recog_memoized (insn) < 0)
    {
      rtx_insn *seq;
      machine_mode innermode = GET_MODE_INNER (mode);
      rtx reg;

      /* If that fails, force VAL into a register.  */

      start_sequence ();
      reg = force_reg (innermode, val);
      if (GET_MODE (reg) != innermode)
	reg = gen_lowpart (innermode, reg);
      SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
      seq = get_insns ();
      end_sequence ();
      if (seq)
	emit_insn_before (seq, insn);

      ok = recog_memoized (insn) >= 0;
      gcc_assert (ok);
    }
  return true;
}
/* Get a vector mode of the same size as the original but with elements
   twice as wide.  This is only guaranteed to apply to integral vectors.  */

static machine_mode
get_mode_wider_vector (machine_mode o)
{
  /* ??? Rely on the ordering that genmodes.cc gives to vectors.  */
  machine_mode n = GET_MODE_WIDER_MODE (o).require ();
  gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
  gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
  return n;
}
static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
/* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
   with all elements equal to VAR.  Return true if successful.  */

static bool
ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
				   rtx target, rtx val)
{
  bool ok;

  switch (mode)
    {
    case E_V2SImode:
    case E_V2SFmode:
      if (!mmx_ok)
	return false;
      /* FALLTHRU */

    case E_V4DFmode:
    case E_V4DImode:
    case E_V8SFmode:
    case E_V8SImode:
    case E_V2DFmode:
    case E_V2DImode:
    case E_V4SFmode:
    case E_V4SImode:
    case E_V16SImode:
    case E_V8DImode:
    case E_V16SFmode:
    case E_V8DFmode:
      return ix86_vector_duplicate_value (mode, target, val);

    case E_V4HImode:
      if (!mmx_ok)
	return false;
      if (TARGET_SSE || TARGET_3DNOW_A)
	{
	  rtx x;

	  val = gen_lowpart (SImode, val);
	  x = gen_rtx_TRUNCATE (HImode, val);
	  x = gen_rtx_VEC_DUPLICATE (mode, x);
	  emit_insn (gen_rtx_SET (target, x));
	  return true;
	}
      goto widen;

    case E_V2HImode:
      if (TARGET_SSE2)
	{
	  rtx x;

	  val = gen_lowpart (SImode, val);
	  x = gen_rtx_TRUNCATE (HImode, val);
	  x = gen_rtx_VEC_DUPLICATE (mode, x);
	  emit_insn (gen_rtx_SET (target, x));
	  return true;
	}
      return false;

    case E_V8QImode:
    case E_V4QImode:
      if (!mmx_ok)
	return false;
      goto widen;

    case E_V8HImode:
    case E_V8HFmode:
      if (TARGET_AVX2)
	return ix86_vector_duplicate_value (mode, target, val);

      if (TARGET_SSE2)
	{
	  struct expand_vec_perm_d dperm;
	  rtx tmp1, tmp2;

	permute:
	  memset (&dperm, 0, sizeof (dperm));
	  dperm.target = target;
	  dperm.vmode = mode;
	  dperm.nelt = GET_MODE_NUNITS (mode);
	  dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
	  dperm.one_operand_p = true;

	  if (mode == V8HFmode)
	    {
	      tmp1 = force_reg (HFmode, val);
	      tmp2 = gen_reg_rtx (mode);
	      emit_insn (gen_vec_setv8hf_0 (tmp2, CONST0_RTX (mode), tmp1));
	      tmp1 = gen_lowpart (mode, tmp2);
	    }
	  else
	    {
	      /* Extend to SImode using a paradoxical SUBREG.  */
	      tmp1 = gen_reg_rtx (SImode);
	      emit_move_insn (tmp1, gen_lowpart (SImode, val));

	      /* Insert the SImode value as
		 low element of a V4SImode vector.  */
	      tmp2 = gen_reg_rtx (V4SImode);
	      emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode),
					    tmp1));
	      tmp1 = gen_lowpart (mode, tmp2);
	    }

	  emit_move_insn (dperm.op0, tmp1);
	  ok = (expand_vec_perm_1 (&dperm)
		|| expand_vec_perm_broadcast_1 (&dperm));
	  gcc_assert (ok);
	  return ok;
	}
      goto widen;

    case E_V16QImode:
      if (TARGET_AVX2)
	return ix86_vector_duplicate_value (mode, target, val);

      if (TARGET_SSE2)
	goto permute;
      goto widen;

    widen:
      /* Replicate the value once into the next wider mode and recurse.  */
      {
	machine_mode smode, wsmode, wvmode;
	rtx x;

	smode = GET_MODE_INNER (mode);
	wvmode = get_mode_wider_vector (mode);
	wsmode = GET_MODE_INNER (wvmode);

	val = convert_modes (wsmode, smode, val, true);

	if (smode == QImode && !TARGET_PARTIAL_REG_STALL)
	  emit_insn (gen_insv_1 (wsmode, val, val));
	else
	  {
	    x = expand_simple_binop (wsmode, ASHIFT, val,
				     GEN_INT (GET_MODE_BITSIZE (smode)),
				     NULL_RTX, 1, OPTAB_LIB_WIDEN);
	    val = expand_simple_binop (wsmode, IOR, val, x, x, 1,
				       OPTAB_LIB_WIDEN);
	  }

	x = gen_reg_rtx (wvmode);
	ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
	gcc_assert (ok);
	emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
	return ok;
      }

    case E_V16HImode:
    case E_V16HFmode:
    case E_V32QImode:
      if (TARGET_AVX2)
	return ix86_vector_duplicate_value (mode, target, val);
      else
	{
	  machine_mode hvmode = (mode == V16HImode ? V8HImode
				 : mode == V16HFmode ? V8HFmode
				 : V16QImode);
	  rtx x = gen_reg_rtx (hvmode);

	  ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
	  gcc_assert (ok);

	  x = gen_rtx_VEC_CONCAT (mode, x, x);
	  emit_insn (gen_rtx_SET (target, x));
	}
      return true;

    case E_V32HImode:
    case E_V32HFmode:
    case E_V64QImode:
      if (TARGET_AVX512BW)
	return ix86_vector_duplicate_value (mode, target, val);
      else
	{
	  machine_mode hvmode = (mode == V32HImode ? V16HImode
				 : mode == V32HFmode ? V16HFmode
				 : V32QImode);
	  rtx x = gen_reg_rtx (hvmode);

	  ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
	  gcc_assert (ok);

	  x = gen_rtx_VEC_CONCAT (mode, x, x);
	  emit_insn (gen_rtx_SET (target, x));
	}
      return true;

    default:
      return false;
    }
}
/* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
   whose ONE_VAR element is VAR, and other elements are zero.  Return true
   if successful.  */

static bool
ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
				     rtx target, rtx var, int one_var)
{
  machine_mode vsimode;
  rtx new_target;
  rtx x, tmp;
  bool use_vector_set = false;
  rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;

  switch (mode)
    {
    case E_V2DImode:
      /* For SSE4.1, we normally use vector set.  But if the second
	 element is zero and inter-unit moves are OK, we use movq
	 instead.  */
      use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
			&& !(TARGET_INTER_UNIT_MOVES_TO_VEC
			     && one_var == 0));
      break;
    case E_V16QImode:
    case E_V4SImode:
    case E_V4SFmode:
      use_vector_set = TARGET_SSE4_1;
      break;
    case E_V8HImode:
      use_vector_set = TARGET_SSE2;
      gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0
	? gen_vec_setv8hi_0 : NULL;
      break;
    case E_V8QImode:
      use_vector_set = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
      break;
    case E_V4HImode:
      use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
      break;
    case E_V4QImode:
      use_vector_set = TARGET_SSE4_1;
      break;
    case E_V32QImode:
      use_vector_set = TARGET_AVX;
      break;
    case E_V16HImode:
      use_vector_set = TARGET_AVX;
      gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0
	? gen_vec_setv16hi_0 : NULL;
      break;
    case E_V8SImode:
      use_vector_set = TARGET_AVX;
      gen_vec_set_0 = gen_vec_setv8si_0;
      break;
    case E_V8SFmode:
      use_vector_set = TARGET_AVX;
      gen_vec_set_0 = gen_vec_setv8sf_0;
      break;
    case E_V4DFmode:
      use_vector_set = TARGET_AVX;
      gen_vec_set_0 = gen_vec_setv4df_0;
      break;
    case E_V4DImode:
      /* Use ix86_expand_vector_set in 64bit mode only.  */
      use_vector_set = TARGET_AVX && TARGET_64BIT;
      gen_vec_set_0 = gen_vec_setv4di_0;
      break;
    case E_V16SImode:
      use_vector_set = TARGET_AVX512F && one_var == 0;
      gen_vec_set_0 = gen_vec_setv16si_0;
      break;
    case E_V16SFmode:
      use_vector_set = TARGET_AVX512F && one_var == 0;
      gen_vec_set_0 = gen_vec_setv16sf_0;
      break;
    case E_V8DFmode:
      use_vector_set = TARGET_AVX512F && one_var == 0;
      gen_vec_set_0 = gen_vec_setv8df_0;
      break;
    case E_V8DImode:
      /* Use ix86_expand_vector_set in 64bit mode only.  */
      use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
      gen_vec_set_0 = gen_vec_setv8di_0;
      break;
    case E_V8HFmode:
      use_vector_set = TARGET_AVX512FP16 && one_var == 0;
      gen_vec_set_0 = gen_vec_setv8hf_0;
      break;
    case E_V16HFmode:
      use_vector_set = TARGET_AVX512FP16 && one_var == 0;
      gen_vec_set_0 = gen_vec_setv16hf_0;
      break;
    case E_V32HFmode:
      use_vector_set = TARGET_AVX512FP16 && one_var == 0;
      gen_vec_set_0 = gen_vec_setv32hf_0;
      break;
    case E_V32HImode:
      use_vector_set = TARGET_AVX512FP16 && one_var == 0;
      gen_vec_set_0 = gen_vec_setv32hi_0;
      break;
    default:
      break;
    }

  if (use_vector_set)
    {
      if (gen_vec_set_0 && one_var == 0)
	{
	  var = force_reg (GET_MODE_INNER (mode), var);
	  emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
	  return true;
	}
      emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
      var = force_reg (GET_MODE_INNER (mode), var);
      ix86_expand_vector_set (mmx_ok, target, var, one_var);
      return true;
    }

  switch (mode)
    {
    case E_V2SFmode:
    case E_V2SImode:
      if (!mmx_ok)
	return false;
      /* FALLTHRU */

    case E_V2DFmode:
    case E_V2DImode:
      if (one_var != 0)
	return false;
      var = force_reg (GET_MODE_INNER (mode), var);
      x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
      emit_insn (gen_rtx_SET (target, x));
      return true;

    case E_V4SFmode:
    case E_V4SImode:
      if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
	new_target = gen_reg_rtx (mode);
      else
	new_target = target;
      var = force_reg (GET_MODE_INNER (mode), var);
      x = gen_rtx_VEC_DUPLICATE (mode, var);
      x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
      emit_insn (gen_rtx_SET (new_target, x));
      if (one_var != 0)
	{
	  /* We need to shuffle the value to the correct position, so
	     create a new pseudo to store the intermediate result.  */

	  /* With SSE2, we can use the integer shuffle insns.  */
	  if (mode != V4SFmode && TARGET_SSE2)
	    {
	      emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
					    const1_rtx,
					    GEN_INT (one_var == 1 ? 0 : 1),
					    GEN_INT (one_var == 2 ? 0 : 1),
					    GEN_INT (one_var == 3 ? 0 : 1)));
	      if (target != new_target)
		emit_move_insn (target, new_target);
	      return true;
	    }

	  /* Otherwise convert the intermediate result to V4SFmode and
	     use the SSE1 shuffle instructions.  */
	  if (mode != V4SFmode)
	    {
	      tmp = gen_reg_rtx (V4SFmode);
	      emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
	    }
	  else
	    tmp = new_target;

	  emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
					  const1_rtx,
					  GEN_INT (one_var == 1 ? 0 : 1),
					  GEN_INT (one_var == 2 ? 0+4 : 1+4),
					  GEN_INT (one_var == 3 ? 0+4 : 1+4)));

	  if (mode != V4SFmode)
	    emit_move_insn (target, gen_lowpart (V4SImode, tmp));
	  else if (tmp != target)
	    emit_move_insn (target, tmp);
	}
      else if (target != new_target)
	emit_move_insn (target, new_target);
      return true;

    case E_V8HImode:
    case E_V16QImode:
      vsimode = V4SImode;
      goto widen;
    case E_V4HImode:
    case E_V8QImode:
      if (!mmx_ok)
	return false;
      vsimode = V2SImode;
      goto widen;
    widen:
      if (one_var != 0)
	return false;

      /* Zero extend the variable element to SImode and recurse.  */
      var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);

      x = gen_reg_rtx (vsimode);
      if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
						var, one_var))
	gcc_unreachable ();

      emit_move_insn (target, gen_lowpart (mode, x));
      return true;

    default:
      return false;
    }
}
/* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
   consisting of the values in VALS.  It is known that all elements
   except ONE_VAR are constants.  Return true if successful.  */

static bool
ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
				 rtx target, rtx vals, int one_var)
{
  rtx var = XVECEXP (vals, 0, one_var);
  machine_mode wmode;
  rtx const_vec, x;

  const_vec = copy_rtx (vals);
  XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
  const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));

  switch (mode)
    {
    case E_V2DFmode:
    case E_V2DImode:
    case E_V2SFmode:
    case E_V2SImode:
      /* For the two element vectors, it's just as easy to use
	 the general case.  */
      return false;

    case E_V4DImode:
      /* Use ix86_expand_vector_set in 64bit mode only.  */
      if (!TARGET_64BIT)
	return false;
      /* FALLTHRU */
    case E_V8HFmode:
    case E_V16HFmode:
    case E_V4DFmode:
    case E_V8SFmode:
    case E_V8SImode:
    case E_V16HImode:
    case E_V32QImode:
    case E_V4SFmode:
    case E_V4SImode:
    case E_V8HImode:
    case E_V4HImode:
      break;

    case E_V16QImode:
      if (TARGET_SSE4_1)
	break;
      wmode = V8HImode;
      goto widen;

    case E_V8QImode:
      if (TARGET_MMX_WITH_SSE && TARGET_SSE4_1)
	break;
      wmode = V4HImode;
      goto widen;

    widen:
      /* There's no way to set one QImode entry easily.  Combine
	 the variable value with its adjacent constant value, and
	 promote to an HImode set.  */
      x = XVECEXP (vals, 0, one_var ^ 1);
      if (one_var & 1)
	{
	  var = convert_modes (HImode, QImode, var, true);
	  var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
				     NULL_RTX, 1, OPTAB_LIB_WIDEN);
	  x = GEN_INT (INTVAL (x) & 0xff);
	}
      else
	{
	  var = convert_modes (HImode, QImode, var, true);
	  x = gen_int_mode (UINTVAL (x) << 8, HImode);
	}
      if (x != const0_rtx)
	var = expand_simple_binop (HImode, IOR, var, x, var,
				   1, OPTAB_LIB_WIDEN);

      x = gen_reg_rtx (wmode);
      emit_move_insn (x, gen_lowpart (wmode, const_vec));
      ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);

      emit_move_insn (target, gen_lowpart (mode, x));
      return true;

    default:
      return false;
    }

  emit_move_insn (target, const_vec);
  ix86_expand_vector_set (mmx_ok, target, var, one_var);
  return true;
}
/* A subroutine of ix86_expand_vector_init_general.  Use vector
   concatenation to handle the most general case: all values variable,
   and none identical.  */

static void
ix86_expand_vector_init_concat (machine_mode mode,
                                rtx target, rtx *ops, int n)
{
  machine_mode half_mode = VOIDmode;
  rtx half[2];
  rtvec v;
  int i, j;

  switch (n)
    {
    case 2:
      switch (mode)
        {
        case E_V32HFmode:
          half_mode = V16HFmode;
          break;
        case E_V16SImode:
          half_mode = V8SImode;
          break;
        case E_V16SFmode:
          half_mode = V8SFmode;
          break;
        case E_V8DImode:
          half_mode = V4DImode;
          break;
        case E_V8DFmode:
          half_mode = V4DFmode;
          break;
        case E_V16HFmode:
          half_mode = V8HFmode;
          break;
        case E_V8SImode:
          half_mode = V4SImode;
          break;
        case E_V8SFmode:
          half_mode = V4SFmode;
          break;
        case E_V4DImode:
          half_mode = V2DImode;
          break;
        case E_V4DFmode:
          half_mode = V2DFmode;
          break;
        case E_V4SImode:
          half_mode = V2SImode;
          break;
        case E_V4SFmode:
          half_mode = V2SFmode;
          break;
        case E_V2DImode:
          half_mode = DImode;
          break;
        case E_V2SImode:
          half_mode = SImode;
          break;
        case E_V2DFmode:
          half_mode = DFmode;
          break;
        case E_V2SFmode:
          half_mode = SFmode;
          break;
        default:
          gcc_unreachable ();
        }

      if (!register_operand (ops[1], half_mode))
        ops[1] = force_reg (half_mode, ops[1]);
      if (!register_operand (ops[0], half_mode))
        ops[0] = force_reg (half_mode, ops[0]);
      emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
                                                          ops[1])));
      break;

    case 4:
      switch (mode)
        {
        case E_V4DImode:
          half_mode = V2DImode;
          break;
        case E_V4DFmode:
          half_mode = V2DFmode;
          break;
        case E_V4SImode:
          half_mode = V2SImode;
          break;
        case E_V4SFmode:
          half_mode = V2SFmode;
          break;
        default:
          gcc_unreachable ();
        }
      goto half;

    case 8:
      switch (mode)
        {
        case E_V8DImode:
          half_mode = V4DImode;
          break;
        case E_V8DFmode:
          half_mode = V4DFmode;
          break;
        case E_V8SImode:
          half_mode = V4SImode;
          break;
        case E_V8SFmode:
          half_mode = V4SFmode;
          break;
        default:
          gcc_unreachable ();
        }
      goto half;

    case 16:
      switch (mode)
        {
        case E_V16SImode:
          half_mode = V8SImode;
          break;
        case E_V16SFmode:
          half_mode = V8SFmode;
          break;
        default:
          gcc_unreachable ();
        }
      goto half;

    half:
      /* FIXME: We process inputs backward to help RA.  PR 36222.  */
      i = n - 1;
      for (j = 1; j != -1; j--)
        {
          half[j] = gen_reg_rtx (half_mode);
          switch (n >> 1)
            {
            case 2:
              v = gen_rtvec (2, ops[i-1], ops[i]);
              i -= 2;
              break;
            case 4:
              v = gen_rtvec (4, ops[i-3], ops[i-2], ops[i-1], ops[i]);
              i -= 4;
              break;
            case 8:
              v = gen_rtvec (8, ops[i-7], ops[i-6], ops[i-5], ops[i-4],
                             ops[i-3], ops[i-2], ops[i-1], ops[i]);
              i -= 8;
              break;
            default:
              gcc_unreachable ();
            }
          ix86_expand_vector_init (false, half[j],
                                   gen_rtx_PARALLEL (half_mode, v));
        }

      ix86_expand_vector_init_concat (mode, target, half, 2);
      break;

    default:
      gcc_unreachable ();
    }
}
/* A subroutine of ix86_expand_vector_init_general.  Use vector
   interleave to handle the most general case: all values variable,
   and none identical.  */

static void
ix86_expand_vector_init_interleave (machine_mode mode,
                                    rtx target, rtx *ops, int n)
{
  machine_mode first_imode, second_imode, third_imode, inner_mode;
  int i, j;
  rtx op, op0, op1;
  rtx (*gen_load_even) (rtx, rtx, rtx);
  rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
  rtx (*gen_interleave_second_low) (rtx, rtx, rtx);

  switch (mode)
    {
    case E_V8HFmode:
      gen_load_even = gen_vec_interleave_lowv8hf;
      gen_interleave_first_low = gen_vec_interleave_lowv4si;
      gen_interleave_second_low = gen_vec_interleave_lowv2di;
      inner_mode = HFmode;
      first_imode = V4SImode;
      second_imode = V2DImode;
      third_imode = VOIDmode;
      break;
    case E_V8HImode:
      gen_load_even = gen_vec_setv8hi;
      gen_interleave_first_low = gen_vec_interleave_lowv4si;
      gen_interleave_second_low = gen_vec_interleave_lowv2di;
      inner_mode = HImode;
      first_imode = V4SImode;
      second_imode = V2DImode;
      third_imode = VOIDmode;
      break;
    case E_V16QImode:
      gen_load_even = gen_vec_setv16qi;
      gen_interleave_first_low = gen_vec_interleave_lowv8hi;
      gen_interleave_second_low = gen_vec_interleave_lowv4si;
      inner_mode = QImode;
      first_imode = V8HImode;
      second_imode = V4SImode;
      third_imode = V2DImode;
      break;
    default:
      gcc_unreachable ();
    }

  for (i = 0; i < n; i++)
    {
      op = ops [i + i];
      if (inner_mode == HFmode)
        {
          rtx even, odd;
          /* Use vpuncklwd to pack 2 HFmodes.  */
          op0 = gen_reg_rtx (V8HFmode);
          even = lowpart_subreg (V8HFmode, force_reg (HFmode, op), HFmode);
          odd = lowpart_subreg (V8HFmode,
                                force_reg (HFmode, ops [i + i + 1]),
                                HFmode);
          emit_insn (gen_load_even (op0, even, odd));
        }
      else
        {
          /* Extend the odd element to SImode using a paradoxical SUBREG.  */
          op0 = gen_reg_rtx (SImode);
          emit_move_insn (op0, gen_lowpart (SImode, op));

          /* Insert the SImode value as low element of V4SImode vector.  */
          op1 = gen_reg_rtx (V4SImode);
          op0 = gen_rtx_VEC_MERGE (V4SImode,
                                   gen_rtx_VEC_DUPLICATE (V4SImode,
                                                          op0),
                                   CONST0_RTX (V4SImode),
                                   const1_rtx);
          emit_insn (gen_rtx_SET (op1, op0));

          /* Cast the V4SImode vector back to a vector in original mode.  */
          op0 = gen_reg_rtx (mode);
          emit_move_insn (op0, gen_lowpart (mode, op1));

          /* Load even elements into the second position.  */
          emit_insn (gen_load_even (op0,
                                    force_reg (inner_mode,
                                               ops [i + i + 1]),
                                    const1_rtx));
        }

      /* Cast vector to FIRST_IMODE vector.  */
      ops[i] = gen_reg_rtx (first_imode);
      emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
    }

  /* Interleave low FIRST_IMODE vectors.  */
  for (i = j = 0; i < n; i += 2, j++)
    {
      op0 = gen_reg_rtx (first_imode);
      emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));

      /* Cast FIRST_IMODE vector to SECOND_IMODE vector.  */
      ops[j] = gen_reg_rtx (second_imode);
      emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
    }

  /* Interleave low SECOND_IMODE vectors.  */
  switch (second_imode)
    {
    case E_V4SImode:
      for (i = j = 0; i < n / 2; i += 2, j++)
        {
          op0 = gen_reg_rtx (second_imode);
          emit_insn (gen_interleave_second_low (op0, ops[i],
                                                ops[i + 1]));

          /* Cast the SECOND_IMODE vector to the THIRD_IMODE
             vector.  */
          ops[j] = gen_reg_rtx (third_imode);
          emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
        }
      second_imode = V2DImode;
      gen_interleave_second_low = gen_vec_interleave_lowv2di;
      /* FALLTHRU */

    case E_V2DImode:
      op0 = gen_reg_rtx (second_imode);
      emit_insn (gen_interleave_second_low (op0, ops[0],
                                            ops[1]));

      /* Cast the SECOND_IMODE vector back to a vector on original
         mode.  */
      emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
      break;

    default:
      gcc_unreachable ();
    }
}
/* A subroutine of ix86_expand_vector_init.  Handle the most general case:
   all values variable, and none identical.  */

static void
ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
                                 rtx target, rtx vals)
{
  rtx ops[64], op0, op1, op2, op3, op4, op5;
  machine_mode half_mode = VOIDmode;
  machine_mode quarter_mode = VOIDmode;
  int n, i;

  switch (mode)
    {
    case E_V2SFmode:
    case E_V2SImode:
      if (!mmx_ok && !TARGET_SSE)
        break;
      /* FALLTHRU */

    case E_V16SImode:
    case E_V16SFmode:
    case E_V8DFmode:
    case E_V8DImode:
    case E_V8SFmode:
    case E_V8SImode:
    case E_V4DFmode:
    case E_V4DImode:
    case E_V4SFmode:
    case E_V4SImode:
    case E_V2DFmode:
    case E_V2DImode:
      n = GET_MODE_NUNITS (mode);
      for (i = 0; i < n; i++)
        ops[i] = XVECEXP (vals, 0, i);
      ix86_expand_vector_init_concat (mode, target, ops, n);
      return;

    case E_V2TImode:
      for (i = 0; i < 2; i++)
        ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
      op0 = gen_reg_rtx (V4DImode);
      ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
      emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
      return;

    case E_V4TImode:
      for (i = 0; i < 4; i++)
        ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
      ops[4] = gen_reg_rtx (V4DImode);
      ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
      ops[5] = gen_reg_rtx (V4DImode);
      ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
      op0 = gen_reg_rtx (V8DImode);
      ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
      emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
      return;

    case E_V32QImode:
      half_mode = V16QImode;
      goto half;

    case E_V16HImode:
      half_mode = V8HImode;
      goto half;

    case E_V16HFmode:
      half_mode = V8HFmode;
      goto half;

    half:
      n = GET_MODE_NUNITS (mode);
      for (i = 0; i < n; i++)
        ops[i] = XVECEXP (vals, 0, i);
      op0 = gen_reg_rtx (half_mode);
      op1 = gen_reg_rtx (half_mode);
      ix86_expand_vector_init_interleave (half_mode, op0, ops,
                                          n >> 2);
      ix86_expand_vector_init_interleave (half_mode, op1,
                                          &ops [n >> 1], n >> 2);
      emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
      return;

    case E_V64QImode:
      quarter_mode = V16QImode;
      half_mode = V32QImode;
      goto quarter;

    case E_V32HImode:
      quarter_mode = V8HImode;
      half_mode = V16HImode;
      goto quarter;

    case E_V32HFmode:
      quarter_mode = V8HFmode;
      half_mode = V16HFmode;
      goto quarter;

    quarter:
      n = GET_MODE_NUNITS (mode);
      for (i = 0; i < n; i++)
        ops[i] = XVECEXP (vals, 0, i);
      op0 = gen_reg_rtx (quarter_mode);
      op1 = gen_reg_rtx (quarter_mode);
      op2 = gen_reg_rtx (quarter_mode);
      op3 = gen_reg_rtx (quarter_mode);
      op4 = gen_reg_rtx (half_mode);
      op5 = gen_reg_rtx (half_mode);
      ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
                                          n >> 3);
      ix86_expand_vector_init_interleave (quarter_mode, op1,
                                          &ops [n >> 2], n >> 3);
      ix86_expand_vector_init_interleave (quarter_mode, op2,
                                          &ops [n >> 1], n >> 3);
      ix86_expand_vector_init_interleave (quarter_mode, op3,
                                          &ops [(n >> 1) | (n >> 2)], n >> 3);
      emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
      emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
      emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
      return;

    case E_V16QImode:
      if (!TARGET_SSE4_1)
        break;
      /* FALLTHRU */

    case E_V8HImode:
      if (!TARGET_SSE2)
        break;

      /* Don't use ix86_expand_vector_init_interleave if we can't
         move from GPR to SSE register directly.  */
      if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
        break;
      /* FALLTHRU */

    case E_V8HFmode:
      n = GET_MODE_NUNITS (mode);
      for (i = 0; i < n; i++)
        ops[i] = XVECEXP (vals, 0, i);
      ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
      return;

    case E_V4HImode:
    case E_V8QImode:
    case E_V2HImode:
    case E_V4QImode:
      break;

    default:
      gcc_unreachable ();
    }

    {
      int i, j, n_elts, n_words, n_elt_per_word;
      machine_mode tmp_mode, inner_mode;
      rtx words[4], shift;

      tmp_mode = (GET_MODE_SIZE (mode) < UNITS_PER_WORD) ? SImode : word_mode;

      inner_mode = GET_MODE_INNER (mode);
      n_elts = GET_MODE_NUNITS (mode);
      n_words = GET_MODE_SIZE (mode) / GET_MODE_SIZE (tmp_mode);
      n_elt_per_word = n_elts / n_words;
      shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));

      for (i = 0; i < n_words; ++i)
        {
          rtx word = NULL_RTX;

          for (j = 0; j < n_elt_per_word; ++j)
            {
              rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
              elt = convert_modes (tmp_mode, inner_mode, elt, true);

              if (j == 0)
                word = elt;
              else
                {
                  word = expand_simple_binop (tmp_mode, ASHIFT, word, shift,
                                              NULL_RTX, 1, OPTAB_LIB_WIDEN);
                  word = expand_simple_binop (tmp_mode, IOR, word, elt,
                                              NULL_RTX, 1, OPTAB_LIB_WIDEN);
                }
            }

          words[i] = word;
        }

      if (n_words == 1)
        emit_move_insn (target, gen_lowpart (mode, words[0]));
      else if (n_words == 2)
        {
          rtx tmp = gen_reg_rtx (mode);
          emit_clobber (tmp);
          emit_move_insn (gen_lowpart (tmp_mode, tmp), words[0]);
          emit_move_insn (gen_highpart (tmp_mode, tmp), words[1]);
          emit_move_insn (target, tmp);
        }
      else if (n_words == 4)
        {
          rtx tmp = gen_reg_rtx (V4SImode);
          gcc_assert (tmp_mode == SImode);
          vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
          ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
          emit_move_insn (target, gen_lowpart (mode, tmp));
        }
      else
        gcc_unreachable ();
    }
}
/* Initialize vector TARGET via VALS.  Suppress the use of MMX
   instructions unless MMX_OK is true.  */

void
ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
{
  machine_mode mode = GET_MODE (target);
  machine_mode inner_mode = GET_MODE_INNER (mode);
  int n_elts = GET_MODE_NUNITS (mode);
  int n_var = 0, one_var = -1;
  bool all_same = true, all_const_zero = true;
  int i;
  rtx x;

  /* Handle first initialization from vector elts.  */
  if (n_elts != XVECLEN (vals, 0))
    {
      rtx subtarget = target;
      x = XVECEXP (vals, 0, 0);
      gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
      if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
        {
          rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
          if (inner_mode == QImode
              || inner_mode == HImode
              || inner_mode == TImode
              || inner_mode == HFmode)
            {
              unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
              scalar_mode elt_mode = inner_mode == TImode ? DImode : SImode;
              n_bits /= GET_MODE_SIZE (elt_mode);
              mode = mode_for_vector (elt_mode, n_bits).require ();
              inner_mode = mode_for_vector (elt_mode, n_bits / 2).require ();
              ops[0] = gen_lowpart (inner_mode, ops[0]);
              ops[1] = gen_lowpart (inner_mode, ops[1]);
              subtarget = gen_reg_rtx (mode);
            }
          ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
          if (subtarget != target)
            emit_move_insn (target, gen_lowpart (GET_MODE (target),
                                                 subtarget));
          return;
        }
      gcc_unreachable ();
    }

  for (i = 0; i < n_elts; ++i)
    {
      x = XVECEXP (vals, 0, i);
      if (!(CONST_SCALAR_INT_P (x)
            || CONST_DOUBLE_P (x)
            || CONST_FIXED_P (x)))
        n_var++, one_var = i;
      else if (x != CONST0_RTX (inner_mode))
        all_const_zero = false;
      if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
        all_same = false;
    }

  /* Constants are best loaded from the constant pool.  */
  if (n_var == 0)
    {
      emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
      return;
    }

  /* If all values are identical, broadcast the value.  */
  if (all_same
      && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
                                            XVECEXP (vals, 0, 0)))
    return;

  /* Values where only one field is non-constant are best loaded from
     the pool and overwritten via move later.  */
  if (n_var == 1)
    {
      if (all_const_zero
          && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
                                                  XVECEXP (vals, 0, one_var),
                                                  one_var))
        return;

      if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals,
                                           one_var))
        return;
    }

  ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
}
/* Implemented as
   V setg (V v, int idx, T val)
   {
     V idxv = (V){idx, idx, idx, idx, idx, idx, idx, idx};
     V valv = (V){val, val, val, val, val, val, val, val};
     V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == idxv);
     v = (v & ~mask) | (valv & mask);
     return v;
   }.  */
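/* E.g. (illustrative values): with eight elements, idx == 2 and
   val == 9, idxv is {2,2,2,2,2,2,2,2} and the comparison against
   {0,1,2,3,4,5,6,7} yields mask {0,0,-1,0,0,0,0,0}, so the blend
   replaces only element 2 of V with 9.  */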
void
ix86_expand_vector_set_var (rtx target, rtx val, rtx idx)
{
  machine_mode mode = GET_MODE (target);
  machine_mode cmp_mode = mode;
  int n_elts = GET_MODE_NUNITS (mode);
  rtx valv, idxv, constv, idx_tmp;
  bool ok = false;
  rtx vec[64];

  /* 512-bits vector byte/word broadcast and comparison only available
     under TARGET_AVX512BW, break 512-bits vector into two 256-bits vector
     when without TARGET_AVX512BW.  */
  if ((mode == V32HImode || mode == V32HFmode || mode == V64QImode)
      && !TARGET_AVX512BW)
    {
      gcc_assert (TARGET_AVX512F);
      rtx vhi, vlo, idx_hi;
      machine_mode half_mode;
      rtx (*extract_hi)(rtx, rtx);
      rtx (*extract_lo)(rtx, rtx);

      if (mode == V32HImode)
        {
          half_mode = V16HImode;
          extract_hi = gen_vec_extract_hi_v32hi;
          extract_lo = gen_vec_extract_lo_v32hi;
        }
      else if (mode == V32HFmode)
        {
          half_mode = V16HFmode;
          extract_hi = gen_vec_extract_hi_v32hf;
          extract_lo = gen_vec_extract_lo_v32hf;
        }
      else
        {
          half_mode = V32QImode;
          extract_hi = gen_vec_extract_hi_v64qi;
          extract_lo = gen_vec_extract_lo_v64qi;
        }
      vhi = gen_reg_rtx (half_mode);
      vlo = gen_reg_rtx (half_mode);
      idx_hi = gen_reg_rtx (GET_MODE (idx));
      emit_insn (extract_hi (vhi, target));
      emit_insn (extract_lo (vlo, target));
      vec[0] = idx_hi;
      vec[1] = idx;
      vec[2] = GEN_INT (n_elts/2);
      ix86_expand_binary_operator (MINUS, GET_MODE (idx), vec);
      ix86_expand_vector_set_var (vhi, val, idx_hi);
      ix86_expand_vector_set_var (vlo, val, idx);
      emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, vlo, vhi)));
      return;
    }

  if (FLOAT_MODE_P (GET_MODE_INNER (mode)))
    {
      switch (mode)
        {
        case E_V2DFmode:
          cmp_mode = V2DImode;
          break;
        case E_V4DFmode:
          cmp_mode = V4DImode;
          break;
        case E_V8DFmode:
          cmp_mode = V8DImode;
          break;
        case E_V2SFmode:
          cmp_mode = V2SImode;
          break;
        case E_V4SFmode:
          cmp_mode = V4SImode;
          break;
        case E_V8SFmode:
          cmp_mode = V8SImode;
          break;
        case E_V16SFmode:
          cmp_mode = V16SImode;
          break;
        case E_V8HFmode:
          cmp_mode = V8HImode;
          break;
        case E_V16HFmode:
          cmp_mode = V16HImode;
          break;
        case E_V32HFmode:
          cmp_mode = V32HImode;
          break;
        default:
          gcc_unreachable ();
        }
    }

  for (int i = 0; i != n_elts; i++)
    vec[i] = GEN_INT (i);
  constv = gen_rtx_CONST_VECTOR (cmp_mode, gen_rtvec_v (n_elts, vec));
  valv = gen_reg_rtx (mode);
  idxv = gen_reg_rtx (cmp_mode);
  idx_tmp = convert_to_mode (GET_MODE_INNER (cmp_mode), idx, 1);

  ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE,
                                          mode, valv, val);
  gcc_assert (ok);
  ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE,
                                          cmp_mode, idxv, idx_tmp);
  gcc_assert (ok);
  vec[0] = target;
  vec[1] = valv;
  vec[2] = target;
  vec[3] = gen_rtx_EQ (mode, idxv, constv);
  vec[4] = idxv;
  vec[5] = constv;
  ok = ix86_expand_int_vcond (vec);
  gcc_assert (ok);
}
void
ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
{
  machine_mode mode = GET_MODE (target);
  machine_mode inner_mode = GET_MODE_INNER (mode);
  machine_mode half_mode;
  bool use_vec_merge = false;
  bool blendm_const = false;
  rtx tmp;
  static rtx (*gen_extract[7][2]) (rtx, rtx)
    = {
        { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
        { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
        { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
        { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
        { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
        { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df },
        { gen_vec_extract_lo_v16hf, gen_vec_extract_hi_v16hf }
      };
  static rtx (*gen_insert[7][2]) (rtx, rtx, rtx)
    = {
        { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
        { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
        { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
        { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
        { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
        { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df },
        { gen_vec_set_lo_v16hf, gen_vec_set_hi_v16hf },
      };
  machine_mode mmode = VOIDmode;
  rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
  int i, j, n;

  switch (mode)
    {
    case E_V2SImode:
      use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
      if (use_vec_merge)
        break;
      /* FALLTHRU */

    case E_V2SFmode:
      if (mmx_ok)
        {
          tmp = gen_reg_rtx (GET_MODE_INNER (mode));
          ix86_expand_vector_extract (true, tmp, target, 1 - elt);
          if (elt == 0)
            tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
          else
            tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
          emit_insn (gen_rtx_SET (target, tmp));
          return;
        }
      break;

    case E_V2DImode:
      use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
      if (use_vec_merge)
        break;

      tmp = gen_reg_rtx (GET_MODE_INNER (mode));
      ix86_expand_vector_extract (false, tmp, target, 1 - elt);
      if (elt == 0)
        tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
      else
        tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
      emit_insn (gen_rtx_SET (target, tmp));
      return;

    case E_V2DFmode:
      /* NB: For ELT == 0, use standard scalar operation patterns which
         preserve the rest of the vector for combiner:

         (vec_merge:V2DF
           (vec_duplicate:V2DF (reg:DF))
           (reg:V2DF)
           (const_int 1))
       */
      if (elt == 0)
        {
          use_vec_merge = true;
          break;
        }

      {
        rtx op0, op1;

        /* For the two element vectors, we implement a VEC_CONCAT with
           the extraction of the other element.  */

        tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
        tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);

        if (elt == 0)
          op0 = val, op1 = tmp;
        else
          op0 = tmp, op1 = val;

        tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
        emit_insn (gen_rtx_SET (target, tmp));
      }
      return;

    case E_V4SFmode:
      use_vec_merge = TARGET_SSE4_1;
      if (use_vec_merge)
        break;

      switch (elt)
        {
        case 0:
          use_vec_merge = true;
          break;

        case 1:
          /* tmp = target = A B C D */
          tmp = copy_to_reg (target);
          /* target = A A B B */
          emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
          /* target = X A B B */
          ix86_expand_vector_set (false, target, val, 0);
          /* target = A X C D */
          emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
                                          const1_rtx, const0_rtx,
                                          GEN_INT (2+4), GEN_INT (3+4)));
          return;

        case 2:
          /* tmp = target = A B C D */
          tmp = copy_to_reg (target);
          /* tmp = X B C D */
          ix86_expand_vector_set (false, tmp, val, 0);
          /* target = A B X D */
          emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
                                          const0_rtx, const1_rtx,
                                          GEN_INT (0+4), GEN_INT (3+4)));
          return;

        case 3:
          /* tmp = target = A B C D */
          tmp = copy_to_reg (target);
          /* tmp = X B C D */
          ix86_expand_vector_set (false, tmp, val, 0);
          /* target = A B C X */
          emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
                                          const0_rtx, const1_rtx,
                                          GEN_INT (2+4), GEN_INT (0+4)));
          return;

        default:
          gcc_unreachable ();
        }
      break;

    case E_V4SImode:
      use_vec_merge = TARGET_SSE4_1;
      if (use_vec_merge)
        break;

      /* Element 0 handled by vec_merge below.  */
      if (elt == 0)
        {
          use_vec_merge = true;
          break;
        }

      if (TARGET_SSE2)
        {
          /* With SSE2, use integer shuffles to swap element 0 and ELT,
             store into element 0, then shuffle them back.  */
          rtx order[4];

          order[0] = GEN_INT (elt);
          order[1] = const1_rtx;
          order[2] = const2_rtx;
          order[3] = GEN_INT (3);
          order[elt] = const0_rtx;

          emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
                                        order[1], order[2], order[3]));

          ix86_expand_vector_set (false, target, val, 0);

          emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
                                        order[1], order[2], order[3]));
        }
      else
        {
          /* For SSE1, we have to reuse the V4SF code.  */
          rtx t = gen_reg_rtx (V4SFmode);
          emit_move_insn (t, gen_lowpart (V4SFmode, target));
          ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
          emit_move_insn (target, gen_lowpart (mode, t));
        }
      return;

    case E_V8HImode:
    case E_V8HFmode:
    case E_V2HImode:
      use_vec_merge = TARGET_SSE2;
      break;

    case E_V4HImode:
      use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
      break;

    case E_V16QImode:
    case E_V4QImode:
      use_vec_merge = TARGET_SSE4_1;
      break;

    case E_V8QImode:
      use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
      break;

    case E_V32QImode:
      half_mode = V16QImode;
      j = 0;
      n = 16;
      goto half;

    case E_V16HFmode:
      /* For ELT == 0, vec_setv8hf_0 can save 1 vpbroadcastw.  */
      if (TARGET_AVX2 && elt != 0)
        {
          mmode = SImode;
          gen_blendm = gen_avx2_pblendph_1;
          blendm_const = true;
          break;
        }
      else
        {
          half_mode = V8HFmode;
          j = 6;
          n = 8;
          goto half;
        }

    case E_V16HImode:
      half_mode = V8HImode;
      j = 1;
      n = 8;
      goto half;

    case E_V8SImode:
      half_mode = V4SImode;
      j = 2;
      n = 4;
      goto half;

    case E_V4DImode:
      half_mode = V2DImode;
      j = 3;
      n = 2;
      goto half;

    case E_V8SFmode:
      half_mode = V4SFmode;
      j = 4;
      n = 4;
      goto half;

    case E_V4DFmode:
      half_mode = V2DFmode;
      j = 5;
      n = 2;
      goto half;

    half:
      /* Compute offset.  */
      i = elt / n;
      elt %= n;

      gcc_assert (i <= 1);

      /* Extract the half.  */
      tmp = gen_reg_rtx (half_mode);
      emit_insn (gen_extract[j][i] (tmp, target));

      /* Put val in tmp at elt.  */
      ix86_expand_vector_set (false, tmp, val, elt);

      /* Put it back.  */
      emit_insn (gen_insert[j][i] (target, target, tmp));
      return;

    case E_V8DFmode:
      if (TARGET_AVX512F)
        {
          mmode = QImode;
          gen_blendm = gen_avx512f_blendmv8df;
        }
      break;

    case E_V8DImode:
      if (TARGET_AVX512F)
        {
          mmode = QImode;
          gen_blendm = gen_avx512f_blendmv8di;
        }
      break;

    case E_V16SFmode:
      if (TARGET_AVX512F)
        {
          mmode = HImode;
          gen_blendm = gen_avx512f_blendmv16sf;
        }
      break;

    case E_V16SImode:
      if (TARGET_AVX512F)
        {
          mmode = HImode;
          gen_blendm = gen_avx512f_blendmv16si;
        }
      break;

    case E_V32HFmode:
      if (TARGET_AVX512BW)
        {
          mmode = SImode;
          gen_blendm = gen_avx512bw_blendmv32hf;
        }
      break;

    case E_V32HImode:
      if (TARGET_AVX512BW)
        {
          mmode = SImode;
          gen_blendm = gen_avx512bw_blendmv32hi;
        }
      else if (TARGET_AVX512F)
        {
          half_mode = E_V8HImode;
          n = 8;
          goto quarter;
        }
      break;

    case E_V64QImode:
      if (TARGET_AVX512BW)
        {
          mmode = DImode;
          gen_blendm = gen_avx512bw_blendmv64qi;
        }
      else if (TARGET_AVX512F)
        {
          half_mode = E_V16QImode;
          n = 16;
          goto quarter;
        }
      break;

    quarter:
      /* Compute offset.  */
      i = elt / n;
      elt %= n;

      gcc_assert (i <= 3);

      {
        /* Extract the quarter.  */
        tmp = gen_reg_rtx (V4SImode);
        rtx tmp2 = gen_lowpart (V16SImode, target);
        rtx mask = gen_reg_rtx (QImode);

        emit_move_insn (mask, constm1_rtx);
        emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i),
                                                   tmp, mask));

        tmp2 = gen_reg_rtx (half_mode);
        emit_move_insn (tmp2, gen_lowpart (half_mode, tmp));
        tmp = tmp2;

        /* Put val in tmp at elt.  */
        ix86_expand_vector_set (false, tmp, val, elt);

        /* Put it back.  */
        tmp2 = gen_reg_rtx (V16SImode);
        rtx tmp3 = gen_lowpart (V16SImode, target);
        mask = gen_reg_rtx (HImode);
        emit_move_insn (mask, constm1_rtx);
        tmp = gen_lowpart (V4SImode, tmp);
        emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i),
                                                  tmp3, mask));
        emit_move_insn (target, gen_lowpart (mode, tmp2));
      }
      return;

    default:
      break;
    }

  if (mmode != VOIDmode)
    {
      tmp = gen_reg_rtx (mode);
      emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
      rtx merge_mask = gen_int_mode (HOST_WIDE_INT_1U << elt, mmode);
      /* The avx512*_blendm<mode> expanders have different operand order
         from VEC_MERGE.  In VEC_MERGE, the first input operand is used for
         elements where the mask is set and second input operand otherwise,
         in {sse,avx}*_*blend* the first input operand is used for elements
         where the mask is clear and second input operand otherwise.  */
      if (!blendm_const)
        merge_mask = force_reg (mmode, merge_mask);
      emit_insn (gen_blendm (target, target, tmp, merge_mask));
    }
  else if (use_vec_merge)
    {
      tmp = gen_rtx_VEC_DUPLICATE (mode, val);
      tmp = gen_rtx_VEC_MERGE (mode, tmp, target,
                               GEN_INT (HOST_WIDE_INT_1U << elt));
      emit_insn (gen_rtx_SET (target, tmp));
    }
  else
    {
      rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));

      emit_move_insn (mem, target);

      tmp = adjust_address (mem, inner_mode, elt * GET_MODE_SIZE (inner_mode));
      emit_move_insn (tmp, val);

      emit_move_insn (target, mem);
    }
}
void
ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
{
  machine_mode mode = GET_MODE (vec);
  machine_mode inner_mode = GET_MODE_INNER (mode);
  bool use_vec_extr = false;
  rtx tmp;

  switch (mode)
    {
    case E_V2SImode:
      use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
      if (use_vec_extr)
        break;
      /* FALLTHRU */

    case E_V2SFmode:
      if (!mmx_ok)
        break;
      /* FALLTHRU */

    case E_V2DFmode:
    case E_V2DImode:
      use_vec_extr = true;
      break;

    case E_V4SFmode:
      use_vec_extr = TARGET_SSE4_1;
      if (use_vec_extr)
        break;

      switch (elt)
        {
        case 0:
          tmp = vec;
          break;

        case 1:
        case 3:
          tmp = gen_reg_rtx (mode);
          emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
                                          GEN_INT (elt), GEN_INT (elt),
                                          GEN_INT (elt+4), GEN_INT (elt+4)));
          break;

        case 2:
          tmp = gen_reg_rtx (mode);
          emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
          break;

        default:
          gcc_unreachable ();
        }
      vec = tmp;
      use_vec_extr = true;
      elt = 0;
      break;

    case E_V4SImode:
      use_vec_extr = TARGET_SSE4_1;
      if (use_vec_extr)
        break;

      if (TARGET_SSE2)
        {
          switch (elt)
            {
            case 0:
              tmp = vec;
              break;

            case 1:
            case 3:
              tmp = gen_reg_rtx (mode);
              emit_insn (gen_sse2_pshufd_1 (tmp, vec,
                                            GEN_INT (elt), GEN_INT (elt),
                                            GEN_INT (elt), GEN_INT (elt)));
              break;

            case 2:
              tmp = gen_reg_rtx (mode);
              emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
              break;

            default:
              gcc_unreachable ();
            }
          vec = tmp;
          use_vec_extr = true;
          elt = 0;
        }
      else
        {
          /* For SSE1, we have to reuse the V4SF code.  */
          ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
                                      gen_lowpart (V4SFmode, vec), elt);
          return;
        }
      break;

    case E_V8HImode:
    case E_V8HFmode:
    case E_V2HImode:
      use_vec_extr = TARGET_SSE2;
      break;

    case E_V4HImode:
      use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
      break;

    case E_V16QImode:
      use_vec_extr = TARGET_SSE4_1;
      if (!use_vec_extr
          && TARGET_SSE2
          && elt == 0
          && (optimize_insn_for_size_p () || TARGET_INTER_UNIT_MOVES_FROM_VEC))
        {
          tmp = gen_reg_rtx (SImode);
          ix86_expand_vector_extract (false, tmp, gen_lowpart (V4SImode, vec),
                                      0);
          emit_insn (gen_rtx_SET (target, gen_lowpart (QImode, tmp)));
          return;
        }
      break;

    case E_V4QImode:
      use_vec_extr = TARGET_SSE4_1;
      break;

    case E_V8SFmode:
      if (TARGET_AVX)
        {
          tmp = gen_reg_rtx (V4SFmode);
          if (elt < 4)
            emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
          else
            emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
          ix86_expand_vector_extract (false, target, tmp, elt & 3);
          return;
        }
      break;

    case E_V4DFmode:
      if (TARGET_AVX)
        {
          tmp = gen_reg_rtx (V2DFmode);
          if (elt < 2)
            emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
          else
            emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
          ix86_expand_vector_extract (false, target, tmp, elt & 1);
          return;
        }
      break;

    case E_V32QImode:
      if (TARGET_AVX)
        {
          tmp = gen_reg_rtx (V16QImode);
          if (elt < 16)
            emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
          else
            emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
          ix86_expand_vector_extract (false, target, tmp, elt & 15);
          return;
        }
      break;

    case E_V16HImode:
      if (TARGET_AVX)
        {
          tmp = gen_reg_rtx (V8HImode);
          if (elt < 8)
            emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
          else
            emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
          ix86_expand_vector_extract (false, target, tmp, elt & 7);
          return;
        }
      break;

    case E_V8SImode:
      if (TARGET_AVX)
        {
          tmp = gen_reg_rtx (V4SImode);
          if (elt < 4)
            emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
          else
            emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
          ix86_expand_vector_extract (false, target, tmp, elt & 3);
          return;
        }
      break;

    case E_V4DImode:
      if (TARGET_AVX)
        {
          tmp = gen_reg_rtx (V2DImode);
          if (elt < 2)
            emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
          else
            emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
          ix86_expand_vector_extract (false, target, tmp, elt & 1);
          return;
        }
      break;

    case E_V32HImode:
      if (TARGET_AVX512BW)
        {
          tmp = gen_reg_rtx (V16HImode);
          if (elt < 16)
            emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
          else
            emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
          ix86_expand_vector_extract (false, target, tmp, elt & 15);
          return;
        }
      break;

    case E_V64QImode:
      if (TARGET_AVX512BW)
        {
          tmp = gen_reg_rtx (V32QImode);
          if (elt < 32)
            emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
          else
            emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
          ix86_expand_vector_extract (false, target, tmp, elt & 31);
          return;
        }
      break;

    case E_V16SFmode:
      tmp = gen_reg_rtx (V8SFmode);
      if (elt < 8)
        emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
      else
        emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
      ix86_expand_vector_extract (false, target, tmp, elt & 7);
      return;

    case E_V8DFmode:
      tmp = gen_reg_rtx (V4DFmode);
      if (elt < 4)
        emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
      else
        emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
      ix86_expand_vector_extract (false, target, tmp, elt & 3);
      return;

    case E_V16SImode:
      tmp = gen_reg_rtx (V8SImode);
      if (elt < 8)
        emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
      else
        emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
      ix86_expand_vector_extract (false, target, tmp, elt & 7);
      return;

    case E_V8DImode:
      tmp = gen_reg_rtx (V4DImode);
      if (elt < 4)
        emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
      else
        emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
      ix86_expand_vector_extract (false, target, tmp, elt & 3);
      return;

    case E_V32HFmode:
      if (TARGET_AVX512BW)
        {
          tmp = gen_reg_rtx (V16HFmode);
          if (elt < 16)
            emit_insn (gen_vec_extract_lo_v32hf (tmp, vec));
          else
            emit_insn (gen_vec_extract_hi_v32hf (tmp, vec));
          ix86_expand_vector_extract (false, target, tmp, elt & 15);
          return;
        }
      break;

    case E_V16HFmode:
      if (TARGET_AVX)
        {
          tmp = gen_reg_rtx (V8HFmode);
          if (elt < 8)
            emit_insn (gen_vec_extract_lo_v16hf (tmp, vec));
          else
            emit_insn (gen_vec_extract_hi_v16hf (tmp, vec));
          ix86_expand_vector_extract (false, target, tmp, elt & 7);
          return;
        }
      break;

    case E_V8QImode:
      use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
      /* ??? Could extract the appropriate HImode element and shift.  */
      break;

    default:
      break;
    }

  if (use_vec_extr)
    {
      tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
      tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);

      /* Let the rtl optimizers know about the zero extension performed.  */
      if (inner_mode == QImode || inner_mode == HImode)
        {
          rtx reg = gen_reg_rtx (SImode);
          tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
          emit_move_insn (reg, tmp);
          tmp = gen_lowpart (inner_mode, reg);
          SUBREG_PROMOTED_VAR_P (tmp) = 1;
          SUBREG_PROMOTED_SET (tmp, 1);
        }

      emit_move_insn (target, tmp);
    }
  else
    {
      rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));

      emit_move_insn (mem, vec);

      tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
      emit_move_insn (target, tmp);
    }
}
/* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
   to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
   The upper bits of DEST are undefined, though they shouldn't cause
   exceptions (some bits from src or all zeros are ok).  */
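/* E.g. for a V8SFmode SRC and I == 256, a vperm2f128 moves the high
   128 bits of SRC into the low half of DEST; for I == 128 a shuffle
   moving bits 64..127 down is used instead, halving the number of
   meaningful lanes at each step.  */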
static void
emit_reduc_half (rtx dest, rtx src, int i)
{
  rtx tem, d = dest;
  switch (GET_MODE (src))
    {
    case E_V4SFmode:
      if (i == 128)
        tem = gen_sse_movhlps (dest, src, src);
      else
        tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
                                   GEN_INT (1 + 4), GEN_INT (1 + 4));
      break;
    case E_V2DFmode:
      tem = gen_vec_interleave_highv2df (dest, src, src);
      break;
    case E_V4QImode:
      d = gen_reg_rtx (V1SImode);
      tem = gen_mmx_lshrv1si3 (d, gen_lowpart (V1SImode, src),
                               GEN_INT (i / 2));
      break;
    case E_V8QImode:
    case E_V4HImode:
      d = gen_reg_rtx (V1DImode);
      tem = gen_mmx_lshrv1di3 (d, gen_lowpart (V1DImode, src),
                               GEN_INT (i / 2));
      break;
    case E_V16QImode:
    case E_V8HImode:
    case E_V8HFmode:
    case E_V4SImode:
    case E_V2DImode:
      d = gen_reg_rtx (V1TImode);
      tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
                                GEN_INT (i / 2));
      break;
    case E_V8SFmode:
      if (i == 256)
        tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
      else
        tem = gen_avx_shufps256 (dest, src, src,
                                 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
      break;
    case E_V4DFmode:
      if (i == 256)
        tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
      else
        tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
      break;
    case E_V32QImode:
    case E_V16HImode:
    case E_V16HFmode:
    case E_V8SImode:
    case E_V4DImode:
      if (i == 256)
        {
          if (GET_MODE (dest) != V4DImode)
            d = gen_reg_rtx (V4DImode);
          tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
                                   gen_lowpart (V4DImode, src),
                                   const1_rtx);
        }
      else
        {
          d = gen_reg_rtx (V2TImode);
          tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
                                    GEN_INT (i / 2));
        }
      break;
    case E_V64QImode:
    case E_V32HImode:
    case E_V32HFmode:
      if (i < 64)
        {
          d = gen_reg_rtx (V4TImode);
          tem = gen_avx512bw_lshrv4ti3 (d, gen_lowpart (V4TImode, src),
                                        GEN_INT (i / 2));
          break;
        }
      /* FALLTHRU */
    case E_V16SImode:
    case E_V16SFmode:
    case E_V8DImode:
    case E_V8DFmode:
      if (i > 128)
        tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
                                        gen_lowpart (V16SImode, src),
                                        gen_lowpart (V16SImode, src),
                                        GEN_INT (0x4 + (i == 512 ? 4 : 0)),
                                        GEN_INT (0x5 + (i == 512 ? 4 : 0)),
                                        GEN_INT (0x6 + (i == 512 ? 4 : 0)),
                                        GEN_INT (0x7 + (i == 512 ? 4 : 0)),
                                        GEN_INT (0xC), GEN_INT (0xD),
                                        GEN_INT (0xE), GEN_INT (0xF),
                                        GEN_INT (0x10), GEN_INT (0x11),
                                        GEN_INT (0x12), GEN_INT (0x13),
                                        GEN_INT (0x14), GEN_INT (0x15),
                                        GEN_INT (0x16), GEN_INT (0x17));
      else
        tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
                                    gen_lowpart (V16SImode, src),
                                    GEN_INT (i == 128 ? 0x2 : 0x1),
                                    GEN_INT (0x3),
                                    GEN_INT (i == 128 ? 0x6 : 0x5),
                                    GEN_INT (0x7),
                                    GEN_INT (i == 128 ? 0xA : 0x9),
                                    GEN_INT (0xB),
                                    GEN_INT (i == 128 ? 0xE : 0xD),
                                    GEN_INT (0xF));
      break;
    default:
      gcc_unreachable ();
    }
  emit_insn (tem);
  if (d != dest)
    emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
}
/* Expand a vector reduction.  FN is the binary pattern to reduce;
   DEST is the destination; IN is the input vector.  */
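/* E.g. for a V4SFmode input {a,b,c,d} with FN an addition pattern,
   the halving loop below emits roughly:
     half = {c,d,.,.};   vec  = half + vec  ->  {a+c, b+d, ., .}
     half = {b+d,.,.,.}; dest = half + vec  ->  {a+b+c+d, ., ., .}
   so only the lowest element of DEST is meaningful afterwards.  */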
void
ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
{
  rtx half, dst, vec = in;
  machine_mode mode = GET_MODE (in);
  int i;

  /* SSE4 has a special instruction for V8HImode UMIN reduction.  */
  if (TARGET_SSE4_1
      && mode == V8HImode
      && fn == gen_uminv8hi3)
    {
      emit_insn (gen_sse4_1_phminposuw (dest, in));
      return;
    }

  for (i = GET_MODE_BITSIZE (mode);
       i > GET_MODE_UNIT_BITSIZE (mode);
       i >>= 1)
    {
      half = gen_reg_rtx (mode);
      emit_reduc_half (half, vec, i);
      if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
        dst = dest;
      else
        dst = gen_reg_rtx (mode);
      emit_insn (fn (dst, half, vec));
      vec = dst;
    }
}
/* Output code to perform a conditional jump to LABEL, if C2 flag in
   FP status register is set.  */

static void
ix86_emit_fp_unordered_jump (rtx label)
{
  rtx reg = gen_reg_rtx (HImode);
  rtx_insn *insn;
  rtx temp;

  emit_insn (gen_x86_fnstsw_1 (reg));

  if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
    {
      emit_insn (gen_x86_sahf_1 (reg));

      temp = gen_rtx_REG (CCmode, FLAGS_REG);
      temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
    }
  else
    {
      emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));

      temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
      temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
    }

  temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
                               gen_rtx_LABEL_REF (VOIDmode, label),
                               pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
  predict_jump (REG_BR_PROB_BASE * 10 / 100);
  JUMP_LABEL (insn) = label;
}
/* Output code to perform a sinh XFmode calculation.  */
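/* With u = expm1 (|x|) = e^|x| - 1, the identity
     sinh (|x|) = (e^|x| - e^-|x|) / 2 = (u + u / (u + 1)) / 2
   avoids the cancellation of evaluating (exp (x) - exp (-x)) / 2
   directly for small |x|; the sign of the argument is reapplied via
   the fxam signbit test below.  */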
void
ix86_emit_i387_sinh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx cst1, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = expm1 (|op1|) */
  emit_insn (gen_absxf2 (e2, op1));
  emit_insn (gen_expm1xf2 (e1, e2));

  /* e2 = e1 / (e1 + 1.0) + e1 */
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_addxf3 (e2, e1, cst1));
  emit_insn (gen_divxf3 (e2, e1, e2));
  emit_insn (gen_addxf3 (e2, e2, e1));

  /* flags = signbit (op1) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (flags) then e2 = -e2 */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
                              gen_rtx_EQ (VOIDmode, flags, const0_rtx),
                              gen_rtx_LABEL_REF (VOIDmode, jump_label),
                              pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  /* op0 = 0.5 * e2 */
  half = force_reg (XFmode, half);
  emit_insn (gen_mulxf3 (op0, e2, half));
}
/* Output code to perform a cosh XFmode calculation.  */
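/* Uses cosh (x) = (e^x + 1 / e^x) / 2 with a single exponential;
   no sign handling is needed because cosh is an even function.  */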
void
ix86_emit_i387_cosh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx cst1;

  /* e1 = exp (op1) */
  emit_insn (gen_expxf2 (e1, op1));

  /* e2 = e1 + 1.0 / e1 */
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_divxf3 (e2, cst1, e1));
  emit_insn (gen_addxf3 (e2, e1, e2));

  /* op0 = 0.5 * e2 */
  half = force_reg (XFmode, half);
  emit_insn (gen_mulxf3 (op0, e2, half));
}
/* Output code to perform a tanh XFmode calculation.  */
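/* With u = expm1 (-2|x|), the identity
     tanh (|x|) = (1 - e^-2|x|) / (1 + e^-2|x|) = -u / (u + 2)
   needs just one exponential; the sign of the argument is restored
   by the conditional negation below.  */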
void
ix86_emit_i387_tanh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx cst2, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = expm1 (-|2 * op1|) */
  emit_insn (gen_addxf3 (e2, op1, op1));
  emit_insn (gen_absxf2 (e2, e2));
  emit_insn (gen_negxf2 (e2, e2));
  emit_insn (gen_expm1xf2 (e1, e2));

  /* e2 = e1 / (e1 + 2.0) */
  cst2 = force_reg (XFmode, CONST2_RTX (XFmode));
  emit_insn (gen_addxf3 (e2, e1, cst2));
  emit_insn (gen_divxf3 (e2, e1, e2));

  /* flags = signbit (op1) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (!flags) then e2 = -e2 */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
                              gen_rtx_NE (VOIDmode, flags, const0_rtx),
                              gen_rtx_LABEL_REF (VOIDmode, jump_label),
                              pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  emit_move_insn (op0, e2);
}
/* Output code to perform an asinh XFmode calculation.  */
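/* Uses asinh (|x|) = log1p (|x| + x^2 / (1 + sqrt (x^2 + 1))):
   e2 below becomes sqrt (x^2 + 1) + 1 and e1 becomes
   x^2 / e2 + |x|, whose log1p is asinh (|x|); the sign of the
   argument is reapplied by the conditional negation below.  */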
void
ix86_emit_i387_asinh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx cst1, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* e2 = sqrt (op1^2 + 1.0) + 1.0 */
  emit_insn (gen_mulxf3 (e1, op1, op1));
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_addxf3 (e2, e1, cst1));
  emit_insn (gen_sqrtxf2 (e2, e2));
  emit_insn (gen_addxf3 (e2, e2, cst1));

  /* e1 = e1 / e2 */
  emit_insn (gen_divxf3 (e1, e1, e2));

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = e1 + |op1| */
  emit_insn (gen_absxf2 (e2, op1));
  emit_insn (gen_addxf3 (e1, e1, e2));

  /* e2 = log1p (e1) */
  ix86_emit_i387_log1p (e2, e1);

  /* flags = signbit (op1) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (flags) then e2 = -e2 */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
                              gen_rtx_EQ (VOIDmode, flags, const0_rtx),
                              gen_rtx_LABEL_REF (VOIDmode, jump_label),
                              pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  emit_move_insn (op0, e2);
}
/* Output code to perform an acosh XFmode calculation.  */
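/* Uses acosh (x) = log (x + sqrt (x - 1) * sqrt (x + 1)) for x >= 1;
   sqrt (x - 1) * sqrt (x + 1) equals sqrt (x^2 - 1) but evaluates
   more accurately for x close to 1.  */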
void
ix86_emit_i387_acosh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode));

  /* e2 = sqrt (op1 + 1.0) */
  emit_insn (gen_addxf3 (e2, op1, cst1));
  emit_insn (gen_sqrtxf2 (e2, e2));

  /* e1 = sqrt (op1 - 1.0) */
  emit_insn (gen_subxf3 (e1, op1, cst1));
  emit_insn (gen_sqrtxf2 (e1, e1));

  /* e1 = e1 * e2 */
  emit_insn (gen_mulxf3 (e1, e1, e2));

  /* e1 = e1 + op1 */
  emit_insn (gen_addxf3 (e1, e1, op1));

  /* op0 = log (e1) */
  emit_insn (gen_logxf2 (op0, e1));
}
/* Output code to perform an atanh XFmode calculation.  */
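/* With u = -2|x| / (|x| + 1), log1p (u) = log ((1 - |x|) / (1 + |x|))
   = -2 * atanh (|x|), so atanh (|x|) = -log1p (u) / 2; the sign of
   the argument is restored by the conditional negation below, and the
   final multiply by 0.5 completes the identity.  */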
void
ix86_emit_i387_atanh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx cst1, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e2 = |op1| */
  emit_insn (gen_absxf2 (e2, op1));

  /* e1 = -(e2 + e2) / (e2 + 1.0) */
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_addxf3 (e1, e2, cst1));
  emit_insn (gen_addxf3 (e2, e2, e2));
  emit_insn (gen_negxf2 (e2, e2));
  emit_insn (gen_divxf3 (e1, e2, e1));

  /* e2 = log1p (e1) */
  ix86_emit_i387_log1p (e2, e1);

  /* flags = signbit (op1) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (!flags) then e2 = -e2 */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
                              gen_rtx_NE (VOIDmode, flags, const0_rtx),
                              gen_rtx_LABEL_REF (VOIDmode, jump_label),
                              pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  /* op0 = 0.5 * e2 */
  half = force_reg (XFmode, half);
  emit_insn (gen_mulxf3 (op0, e2, half));
}
/* Output code to perform a log1p XFmode calculation.  */
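/* fyl2xp1 is only accurate for |x| < 1 - sqrt(2)/2; the constant
   0.29289321881... compared against below is exactly 1 - sqrt(2)/2.
   Above that threshold the code falls back to computing
   log (1 + x) directly via fyl2x.  */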
void
ix86_emit_i387_log1p (rtx op0, rtx op1)
{
  rtx_code_label *label1 = gen_label_rtx ();
  rtx_code_label *label2 = gen_label_rtx ();

  rtx tmp = gen_reg_rtx (XFmode);
  rtx res = gen_reg_rtx (XFmode);
  rtx cst, cstln2, cst1;
  rtx_insn *insn;

  /* The emit_jump call emits pending stack adjust, make sure it is emitted
     before the conditional jump, otherwise the stack adjustment will be
     only conditional.  */
  do_pending_stack_adjust ();

  cst = const_double_from_real_value
    (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
     XFmode);
  cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */

  emit_insn (gen_absxf2 (tmp, op1));

  cst = force_reg (XFmode, cst);
  ix86_expand_branch (GE, tmp, cst, label1);
  predict_jump (REG_BR_PROB_BASE * 10 / 100);
  insn = get_last_insn ();
  JUMP_LABEL (insn) = label1;

  emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2));
  emit_jump (label2);

  emit_label (label1);
  LABEL_NUSES (label1) = 1;

  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1)));
  emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2));

  emit_label (label2);
  LABEL_NUSES (label2) = 1;

  emit_move_insn (op0, res);
}
/* Emit code for round calculation.  */
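/* Expands round (a) as sgn (a) * floor (fabs (a) + 0.5), i.e. rounding
   halfway cases away from zero: e.g. round (2.5) == 3.0 and
   round (-2.5) == -3.0.  */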
void
ix86_emit_i387_round (rtx op0, rtx op1)
{
  machine_mode inmode = GET_MODE (op1);
  machine_mode outmode = GET_MODE (op0);
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx res = gen_reg_rtx (outmode);
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx (*floor_insn) (rtx, rtx);
  rtx (*neg_insn) (rtx, rtx);
  rtx_insn *insn;
  rtx tmp;

  switch (inmode)
    {
    case E_SFmode:
    case E_DFmode:
      tmp = gen_reg_rtx (XFmode);

      emit_insn (gen_rtx_SET (tmp, gen_rtx_FLOAT_EXTEND (XFmode, op1)));
      op1 = tmp;
      break;
    case E_XFmode:
      break;
    default:
      gcc_unreachable ();
    }

  switch (outmode)
    {
    case E_SFmode:
      floor_insn = gen_frndintxf2_floor;
      neg_insn = gen_negsf2;
      break;
    case E_DFmode:
      floor_insn = gen_frndintxf2_floor;
      neg_insn = gen_negdf2;
      break;
    case E_XFmode:
      floor_insn = gen_frndintxf2_floor;
      neg_insn = gen_negxf2;
      break;
    case E_HImode:
      floor_insn = gen_lfloorxfhi2;
      neg_insn = gen_neghi2;
      break;
    case E_SImode:
      floor_insn = gen_lfloorxfsi2;
      neg_insn = gen_negsi2;
      break;
    case E_DImode:
      floor_insn = gen_lfloorxfdi2;
      neg_insn = gen_negdi2;
      break;
    default:
      gcc_unreachable ();
    }

  /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */

  /* scratch = fxam(op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = fabs(op1) */
  emit_insn (gen_absxf2 (e1, op1));

  /* e2 = e1 + 0.5 */
  half = force_reg (XFmode, half);
  emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (XFmode, e1, half)));

  /* res = floor(e2) */
  switch (outmode)
    {
    case E_SFmode:
    case E_DFmode:
      {
        tmp = gen_reg_rtx (XFmode);

        emit_insn (floor_insn (tmp, e2));
        emit_insn (gen_rtx_SET (res,
                                gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp),
                                                UNSPEC_TRUNC_NOOP)));
      }
      break;
    default:
      emit_insn (floor_insn (res, e2));
    }

  /* flags = signbit(a) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (flags) then res = -res */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
                              gen_rtx_EQ (VOIDmode, flags, const0_rtx),
                              gen_rtx_LABEL_REF (VOIDmode, jump_label),
                              pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (neg_insn (res, res));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  emit_move_insn (op0, res);
}
/* Output code to perform a Newton-Raphson approximation of a single precision
   floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm].  */
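/* A single Newton-Raphson iteration for 1/b refines an estimate x0 via
   x1 = x0 * (2 - b * x0) = (x0 + x0) - (b * x0 * x0), which is exactly
   the e0/e1/x1 sequence emitted below; each iteration roughly doubles
   the number of correct bits of the hardware rcp estimate, and the
   quotient is then approximated as a * x1.  */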
void
ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
{
  rtx x0, x1, e0, e1;

  x0 = gen_reg_rtx (mode);
  e0 = gen_reg_rtx (mode);
  e1 = gen_reg_rtx (mode);
  x1 = gen_reg_rtx (mode);

  /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */

  b = force_reg (mode, b);

  /* x0 = rcp(b) estimate */
  if (mode == V16SFmode || mode == V8DFmode)
    {
      if (TARGET_AVX512ER)
        {
          emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
                                                      UNSPEC_RCP28)));
          /* res = a * x0 */
          emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
          return;
        }
      emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
                                                  UNSPEC_RCP14)));
    }
  else
    emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
                                                UNSPEC_RCP)));

  /* e0 = x0 * b */
  emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));

  /* e0 = x0 * e0 */
  emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));

  /* e1 = x0 + x0 */
  emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));

  /* x1 = e1 - e0 */
  emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));

  /* res = a * x1 */
  emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
}
/* Output code to perform a Newton-Raphson approximation of a
   single precision floating point [reciprocal] square root.  */
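/* A single Newton-Raphson iteration for 1/sqrt(a) refines an estimate
   x0 via x1 = 0.5 * x0 * (3 - a * x0 * x0).  It is emitted below as
   e2 * e3 with e2 = a*x0*x0 - 3 and e3 = -0.5 * x0 (or -0.5 * a * x0
   when the square root itself is wanted), so the whole step folds into
   multiplies and, when available, one FMA.  */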
void
ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
{
  rtx x0, e0, e1, e2, e3, mthree, mhalf;
  REAL_VALUE_TYPE r;
  int unspec;

  x0 = gen_reg_rtx (mode);
  e0 = gen_reg_rtx (mode);
  e1 = gen_reg_rtx (mode);
  e2 = gen_reg_rtx (mode);
  e3 = gen_reg_rtx (mode);

  if (TARGET_AVX512ER && mode == V16SFmode)
    {
      if (recip)
        /* res = rsqrt28(a) estimate */
        emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
                                                     UNSPEC_RSQRT28)));
      else
        {
          /* x0 = rsqrt28(a) estimate */
          emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
                                                      UNSPEC_RSQRT28)));
          /* res = rcp28(x0) estimate */
          emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode,
                                                       gen_rtvec (1, x0),
                                                       UNSPEC_RCP28)));
        }
      return;
    }

  real_from_integer (&r, VOIDmode, -3, SIGNED);
  mthree = const_double_from_real_value (r, SFmode);

  real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
  mhalf = const_double_from_real_value (r, SFmode);
  unspec = UNSPEC_RSQRT;

  if (VECTOR_MODE_P (mode))
    {
      mthree = ix86_build_const_vector (mode, true, mthree);
      mhalf = ix86_build_const_vector (mode, true, mhalf);
      /* There is no 512-bit rsqrt.  There is however rsqrt14.  */
      if (GET_MODE_SIZE (mode) == 64)
        unspec = UNSPEC_RSQRT14;
    }

  /* sqrt(a)  = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
     rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */

  a = force_reg (mode, a);

  /* x0 = rsqrt(a) estimate */
  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
                                              unspec)));

  /* If (a == 0.0) Filter out infinity to prevent NaN for sqrt(0.0).  */
  if (!recip)
    {
      rtx zero = force_reg (mode, CONST0_RTX (mode));
      rtx mask;

      /* Handle masked compare.  */
      if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
        {
          mask = gen_reg_rtx (HImode);
          /* Imm value 0x4 corresponds to not-equal comparison.  */
          emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
          emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
        }
      else
        {
          mask = gen_reg_rtx (mode);
          emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
          emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
        }
    }

  mthree = force_reg (mode, mthree);

  /* e0 = x0 * a */
  emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));

  unsigned vector_size = GET_MODE_SIZE (mode);
  if (TARGET_FMA
      || (TARGET_AVX512F && vector_size == 64)
      || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
    emit_insn (gen_rtx_SET (e2,
                            gen_rtx_FMA (mode, e0, x0, mthree)));
  else
    {
      /* e1 = e0 * x0 */
      emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));

      /* e2 = e1 - 3.0 */
      emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
    }

  mhalf = force_reg (mode, mhalf);
  if (recip)
    /* e3 = -.5 * x0 */
    emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
  else
    /* e3 = -.5 * e0 */
    emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
  /* ret = e2 * e3 */
  emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
}
/* Expand fabs (OP0) and return a new rtx that holds the result.  The
   mask for masking out the sign-bit is stored in *SMASK, if that is
   non-null.  */

static rtx
ix86_expand_sse_fabs (rtx op0, rtx *smask)
{
  machine_mode vmode, mode = GET_MODE (op0);
  rtx xa, mask;

  xa = gen_reg_rtx (mode);
  if (mode == SFmode)
    vmode = V4SFmode;
  else if (mode == DFmode)
    vmode = V2DFmode;
  else
    vmode = mode;
  mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
  if (!VECTOR_MODE_P (mode))
    {
      /* We need to generate a scalar mode mask in this case.  */
      rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
      tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
      mask = gen_reg_rtx (mode);
      emit_insn (gen_rtx_SET (mask, tmp));
    }
  emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));

  if (smask)
    *smask = mask;

  return xa;
}
/* Expands a comparison of OP0 with OP1 using comparison code CODE,
   swapping the operands if SWAP_OPERANDS is true.  The expanded
   code is a forward jump to a newly created label in case the
   comparison is true.  The generated label rtx is returned.  */
static rtx_code_label *
ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
                                  bool swap_operands)
{
  bool unordered_compare = ix86_unordered_fp_compare (code);
  rtx_code_label *label;
  rtx tmp, reg;

  if (swap_operands)
    std::swap (op0, op1);

  label = gen_label_rtx ();
  tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
  if (unordered_compare)
    tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
  reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
  emit_insn (gen_rtx_SET (reg, tmp));
  tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
                              gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
  tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  JUMP_LABEL (tmp) = label;

  return label;
}
/* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
   using comparison code CODE.  Operands are swapped for the comparison if
   SWAP_OPERANDS is true.  Returns a rtx for the generated mask.  */

static rtx
ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
                              bool swap_operands)
{
  rtx (*insn)(rtx, rtx, rtx, rtx);
  machine_mode mode = GET_MODE (op0);
  rtx mask = gen_reg_rtx (mode);

  if (swap_operands)
    std::swap (op0, op1);

  insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;

  emit_insn (insn (mask, op0, op1,
                   gen_rtx_fmt_ee (code, mode, op0, op1)));
  return mask;
}
/* Expand copysign from SIGN to the positive value ABS_VALUE
   storing in RESULT.  If MASK is non-null, it shall be a mask to mask out
   the sign-bit.  */

static void
ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
{
  machine_mode mode = GET_MODE (sign);
  rtx sgn = gen_reg_rtx (mode);
  if (mask == NULL_RTX)
    {
      machine_mode vmode;

      if (mode == SFmode)
        vmode = V4SFmode;
      else if (mode == DFmode)
        vmode = V2DFmode;
      else
        vmode = mode;

      mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
      if (!VECTOR_MODE_P (mode))
        {
          /* We need to generate a scalar mode mask in this case.  */
          rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
          tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
          mask = gen_reg_rtx (mode);
          emit_insn (gen_rtx_SET (mask, tmp));
        }
    }
  else
    mask = gen_rtx_NOT (mode, mask);
  emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
  emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
}
/* Expand SSE sequence for computing lround from OP1 storing
   into OP0.  */

void
ix86_expand_lround (rtx op0, rtx op1)
{
  /* C code for the stuff we're doing below:
        tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
        op0 = (long)tmp
   */
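  /* nextafter (0.5, 0.0) is used instead of 0.5 itself so that inputs
     just below 0.5 (e.g. the largest double smaller than 0.5) do not
     round up: adding a full 0.5 to such values would round to exactly
     1.0 in the FP addition, giving lround == 1 instead of 0.  */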
  machine_mode mode = GET_MODE (op1);
  const struct real_format *fmt;
  REAL_VALUE_TYPE pred_half, half_minus_pred_half;
  rtx adj;

  /* load nextafter (0.5, 0.0) */
  fmt = REAL_MODE_FORMAT (mode);
  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
  real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);

  /* adj = copysign (0.5, op1) */
  adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
  ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);

  /* adj = op1 + adj */
  adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);

  /* op0 = (imode)adj */
  expand_fix (op0, adj, 0);
}
/* Expand SSE2 sequence for computing lfloor or lceil
   from OPERAND1 storing into OPERAND0.  */

void
ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
{
  /* C code for the stuff we're doing below (for do_floor):
        xi = (long)op1;
        xi -= (double)xi > op1 ? 1 : 0;
        op0 = xi;
   */
  machine_mode fmode = GET_MODE (op1);
  machine_mode imode = GET_MODE (op0);
  rtx ireg, freg, tmp;
  rtx_code_label *label;

  /* reg = (long)op1 */
  ireg = gen_reg_rtx (imode);
  expand_fix (ireg, op1, 0);

  /* freg = (double)reg */
  freg = gen_reg_rtx (fmode);
  expand_float (freg, ireg, 0);

  /* ireg = (freg > op1) ? ireg - 1 : ireg */
  label = ix86_expand_sse_compare_and_jump (UNLE,
                                            freg, op1, !do_floor);
  tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
                             ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
  emit_move_insn (ireg, tmp);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (op0, ireg);
}
/* Generate and return a rtx of mode MODE for 2**n where n is the number
   of bits of the mantissa of MODE, which must be one of DFmode or SFmode.  */
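/* Adding this constant (2**52 for DFmode, 2**23 for SFmode) to a
   nonnegative value below it leaves no room for fraction bits, so the
   FPU rounds the sum to an integer in the current rounding mode;
   subtracting it again yields the rounded value.  The rint/floor/ceil
   expansions below build on this.  */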
17820 ix86_gen_TWO52 (machine_mode mode
)
17822 const struct real_format
*fmt
;
17823 REAL_VALUE_TYPE TWO52r
;
17826 fmt
= REAL_MODE_FORMAT (mode
);
17827 real_2expN (&TWO52r
, fmt
->p
- 1, mode
);
17828 TWO52
= const_double_from_real_value (TWO52r
, mode
);
17829 TWO52
= force_reg (mode
, TWO52
);
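/* A worked example of the TWO52 trick the expanders below rely on, for
   DFmode where fmt->p == 53 and TWO52 == 2**52: under the default
   round-to-nearest mode

     3.3 + 2**52 == 4503599627370499.0   (the fraction bits are rounded
       - 2**52   == 3.0                   away, since the 53-bit
                                          significand holds only the
                                          integer part at that scale)

   so xa + TWO52 - TWO52 rounds xa to an integer without a conversion
   insn.  Values with |x| >= 2**52 are already integral, hence the
   isless guard emitted around every use.  */
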
/* Expand rint rounding OPERAND1 and storing the result in OPERAND0.  */

void
ix86_expand_rint (rtx operand0, rtx operand1)
{
  /* C code for the stuff we're doing below:
        xa = fabs (operand1);
        if (!isless (xa, 2**52))
          return operand1;
        two52 = 2**52;
        if (flag_rounding_math)
          {
            two52 = copysign (two52, operand1);
            xa = operand1;
          }
        xa = xa + two52 - two52;
        return copysign (xa, operand1);
   */
  machine_mode mode = GET_MODE (operand0);
  rtx res, xa, TWO52, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  if (flag_rounding_math)
    {
      ix86_sse_copysign_to_positive (TWO52, TWO52, res, mask);
      xa = res;
    }

  xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
  xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);

  /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
  if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
    xa = ix86_expand_sse_fabs (xa, NULL);

  ix86_sse_copysign_to_positive (res, xa, res, mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}

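/* Example of why TWO52 gets operand1's sign when flag_rounding_math:
   under FE_DOWNWARD with operand1 == -1.5, computing with |x| would
   give 1.5 + 2**52 - 2**52 == 1.0 and re-attach the sign for -1.0, but
   rint (-1.5) in that mode is -2.0.  With two52 == -2**52 and
   xa == -1.5 the additions themselves round downward and produce -2.0
   directly (a sketch; the RTL above follows the C comment).  */
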
/* Expand SSE2 sequence for computing floor or ceil
   from OPERAND1 storing into OPERAND0.  */

void
ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
{
  /* C code for the stuff we expand below.
        double xa = fabs (x), x2;
        if (!isless (xa, TWO52))
          return x;
        x2 = (double)(long)x;

        Compensate.  Floor:
        if (x2 > x)
          x2 -= 1;
        Compensate.  Ceil:
        if (x2 < x)
          x2 += 1;

        if (HONOR_SIGNED_ZEROS (mode))
          return copysign (x2, x);
        return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, xi, TWO52, tmp, one, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa = (double)(long)x */
  xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
  expand_fix (xi, res, 0);
  expand_float (xa, xi, 0);

  /* generate 1.0 */
  one = force_reg (mode, const_double_from_real_value (dconst1, mode));

  /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
  tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
  tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
                             xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
  if (HONOR_SIGNED_ZEROS (mode))
    {
      /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
      if (do_floor && flag_rounding_math)
        tmp = ix86_expand_sse_fabs (tmp, NULL);

      ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
    }
  emit_move_insn (res, tmp);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}

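/* A worked example of the mask-and-subtract compensation above, for
   do_floor with x == -2.5: the truncating conversion gives xa == -2.0,
   the UNGT compare of xa against x produces an all-ones mask (because
   -2.0 > -2.5), ANDing that mask with 1.0 yields exactly 1.0, and
   xa - 1.0 == -3.0 == floor (-2.5).  Where the compare is false the
   mask is zero, the AND gives 0.0 and the subtraction is a no-op.  */
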
/* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
   into OPERAND0 without relying on DImode truncation via cvttsd2siq
   that is only available on 64bit targets.  */

void
ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
{
  /* C code for the stuff we expand below.
        double xa = fabs (x), x2;
        if (!isless (xa, TWO52))
          return x;
        xa = xa + TWO52 - TWO52;
        x2 = copysign (xa, x);

        Compensate.  Floor:
        if (x2 > x)
          x2 -= 1;
        Compensate.  Ceil:
        if (x2 < x)
          x2 += 1;

        if (HONOR_SIGNED_ZEROS (mode))
          x2 = copysign (x2, x);
        return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, TWO52, tmp, one, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa = xa + TWO52 - TWO52; */
  xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
  xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);

  /* xa = copysign (xa, operand1) */
  ix86_sse_copysign_to_positive (xa, xa, res, mask);

  /* generate 1.0 */
  one = force_reg (mode, const_double_from_real_value (dconst1, mode));

  /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
  tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
  tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
                             xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
  if (HONOR_SIGNED_ZEROS (mode))
    {
      /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
      if (do_floor && flag_rounding_math)
        tmp = ix86_expand_sse_fabs (tmp, NULL);

      ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
    }
  emit_move_insn (res, tmp);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}

/* Expand SSE sequence for computing trunc
   from OPERAND1 storing into OPERAND0.  */

void
ix86_expand_trunc (rtx operand0, rtx operand1)
{
  /* C code for SSE variant we expand below.
        double xa = fabs (x), x2;
        if (!isless (xa, TWO52))
          return x;
        x2 = (double)(long)x;
        if (HONOR_SIGNED_ZEROS (mode))
          return copysign (x2, x);
        return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, xi, TWO52, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa = (double)(long)x */
  xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
  expand_fix (xi, res, 0);
  expand_float (xa, xi, 0);

  if (HONOR_SIGNED_ZEROS (mode))
    ix86_sse_copysign_to_positive (xa, xa, res, mask);

  emit_move_insn (res, xa);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}

/* Expand SSE sequence for computing trunc from OPERAND1 storing
   into OPERAND0 without relying on DImode truncation via cvttsd2siq
   that is only available on 64bit targets.  */

void
ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
{
  machine_mode mode = GET_MODE (operand0);
  rtx xa, xa2, TWO52, tmp, one, res, mask;
  rtx_code_label *label;

  /* C code for SSE variant we expand below.
        double xa = fabs (x), x2;
        if (!isless (xa, TWO52))
          return x;
        xa2 = xa + TWO52 - TWO52;
        Compensate:
        if (xa2 > xa)
          xa2 -= 1.0;
        x2 = copysign (xa2, x);
        return x2;
   */

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa2 = xa + TWO52 - TWO52; */
  xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
  xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);

  /* generate 1.0 */
  one = force_reg (mode, const_double_from_real_value (dconst1, mode));

  /* Compensate: xa2 = xa2 - (xa2 > xa ? 1 : 0) */
  tmp = ix86_expand_sse_compare_mask (UNGT, xa2, xa, false);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
  tmp = expand_simple_binop (mode, MINUS,
                             xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
  /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
  if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
    tmp = ix86_expand_sse_fabs (tmp, NULL);

  /* res = copysign (xa2, operand1) */
  ix86_sse_copysign_to_positive (res, tmp, res, mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}

/* Expand SSE sequence for computing round
   from OPERAND1 storing into OPERAND0.  */

void
ix86_expand_round (rtx operand0, rtx operand1)
{
  /* C code for the stuff we're doing below:
        double xa = fabs (x);
        if (!isless (xa, TWO52))
          return x;
        xa = (double)(long)(xa + nextafter (0.5, 0.0));
        return copysign (xa, x);
   */
  machine_mode mode = GET_MODE (operand0);
  rtx res, TWO52, xa, xi, half, mask;
  rtx_code_label *label;
  const struct real_format *fmt;
  REAL_VALUE_TYPE pred_half, half_minus_pred_half;

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  TWO52 = ix86_gen_TWO52 (mode);
  xa = ix86_expand_sse_fabs (res, &mask);
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* load nextafter (0.5, 0.0) */
  fmt = REAL_MODE_FORMAT (mode);
  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
  real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);

  /* xa = xa + 0.5 */
  half = force_reg (mode, const_double_from_real_value (pred_half, mode));
  xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);

  /* xa = (double)(int64_t)xa */
  xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
  expand_fix (xi, xa, 0);
  expand_float (xa, xi, 0);

  /* res = copysign (xa, operand1) */
  ix86_sse_copysign_to_positive (res, xa, res, mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}

/* Expand SSE sequence for computing round from OPERAND1 storing
   into OPERAND0 without relying on DImode truncation via cvttsd2siq
   that is only available on 64bit targets.  */

void
ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
{
  /* C code for the stuff we expand below.
        double xa = fabs (x), xa2, x2;
        if (!isless (xa, TWO52))
          return x;
        Using the absolute value and copying back sign makes
        -0.0 -> -0.0 correct.
        xa2 = xa + TWO52 - TWO52;
        Compensate.
        dxa = xa2 - xa;
        if (dxa <= -0.5)
          xa2 += 1;
        else if (dxa > 0.5)
          xa2 -= 1;
        x2 = copysign (xa2, x);
        return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa2 = xa + TWO52 - TWO52; */
  xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
  xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);

  /* dxa = xa2 - xa; */
  dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);

  /* generate 0.5, 1.0 and -0.5 */
  half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
  one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0,
                             OPTAB_DIRECT);
  mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
                               0, OPTAB_DIRECT);

  /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
  tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
  xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);

  /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
  tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
  xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);

  /* res = copysign (xa2, operand1) */
  ix86_sse_copysign_to_positive (res, xa2, res, mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}

/* Expand SSE sequence for computing round
   from OP1 storing into OP0 using sse4 round insn.  */

void
ix86_expand_round_sse4 (rtx op0, rtx op1)
{
  machine_mode mode = GET_MODE (op0);
  rtx e1, e2, res, half;
  const struct real_format *fmt;
  REAL_VALUE_TYPE pred_half, half_minus_pred_half;
  rtx (*gen_copysign) (rtx, rtx, rtx);
  rtx (*gen_round) (rtx, rtx, rtx);

  switch (mode)
    {
    case E_SFmode:
      gen_copysign = gen_copysignsf3;
      gen_round = gen_sse4_1_roundsf2;
      break;
    case E_DFmode:
      gen_copysign = gen_copysigndf3;
      gen_round = gen_sse4_1_rounddf2;
      break;
    default:
      gcc_unreachable ();
    }

  /* round (a) = trunc (a + copysign (0.5, a)) */

  /* load nextafter (0.5, 0.0) */
  fmt = REAL_MODE_FORMAT (mode);
  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
  real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
  half = const_double_from_real_value (pred_half, mode);

  /* e1 = copysign (0.5, op1) */
  e1 = gen_reg_rtx (mode);
  emit_insn (gen_copysign (e1, half, op1));

  /* e2 = op1 + e1 */
  e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);

  /* res = trunc (e2) */
  res = gen_reg_rtx (mode);
  emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));

  emit_move_insn (op0, res);
}

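/* Schematically (a sketch, not the exact insn stream) this is a
   copysign, an addition and one round{ss,sd} whose ROUND_TRUNC
   immediate requests truncation toward zero independently of the
   dynamic MXCSR rounding mode, e.g.

     round (2.5)  ->  trunc (2.5 + 0.49999999999999994)
                  ->  trunc (3.0)  ->  3.0  */
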
/* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
   insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
   insn every time.  */

static GTY(()) rtx_insn *vselect_insn;

/* Initialize vselect_insn.  */

static void
init_vselect_insn (void)
{
  unsigned i;
  rtx x;

  x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
  for (i = 0; i < MAX_VECT_LEN; ++i)
    XVECEXP (x, 0, i) = const0_rtx;
  x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
                                                        const0_rtx), x);
  x = gen_rtx_SET (const0_rtx, x);
  start_sequence ();
  vselect_insn = emit_insn (x);
  end_sequence ();
}

/* Construct (set target (vec_select op0 (parallel perm))) and
   return true if that's a valid instruction in the active ISA.  */

static bool
expand_vselect (rtx target, rtx op0, const unsigned char *perm,
                unsigned nelt, bool testing_p)
{
  unsigned int i;
  rtx x, save_vconcat;
  int icode;

  if (vselect_insn == NULL_RTX)
    init_vselect_insn ();

  x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
  PUT_NUM_ELEM (XVEC (x, 0), nelt);
  for (i = 0; i < nelt; ++i)
    XVECEXP (x, 0, i) = GEN_INT (perm[i]);
  save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
  XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
  PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
  SET_DEST (PATTERN (vselect_insn)) = target;
  icode = recog_memoized (vselect_insn);

  if (icode >= 0 && !testing_p)
    emit_insn (copy_rtx (PATTERN (vselect_insn)));

  SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
  XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
  INSN_CODE (vselect_insn) = -1;

  return icode >= 0;
}

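/* Usage sketch: with TARGET and OP0 both V2DFmode and perm = {1, 0},
   the pattern built in the cached insn is

     (set (reg:V2DF target)
          (vec_select:V2DF (reg:V2DF op0)
                           (parallel [(const_int 1) (const_int 0)])))

   which recog_memoized matches as the shufpd swap pattern; with
   TESTING_P the insn is only recognized, never emitted.  */
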
/* Similar, but generate a vec_concat from op0 and op1 as well.  */

static bool
expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
                        const unsigned char *perm, unsigned nelt,
                        bool testing_p)
{
  machine_mode v2mode;
  rtx x;
  bool ok;

  if (vselect_insn == NULL_RTX)
    init_vselect_insn ();

  if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
    return false;
  x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
  PUT_MODE (x, v2mode);
  XEXP (x, 0) = op0;
  XEXP (x, 1) = op1;
  ok = expand_vselect (target, x, perm, nelt, testing_p);
  XEXP (x, 0) = const0_rtx;
  XEXP (x, 1) = const0_rtx;
  return ok;
}

/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   using movss or movsd.  */

static bool
expand_vec_perm_movs (struct expand_vec_perm_d *d)
{
  machine_mode vmode = d->vmode;
  unsigned i, nelt = d->nelt;
  rtx x;

  if (d->one_operand_p)
    return false;

  if (!(TARGET_SSE && vmode == V4SFmode)
      && !(TARGET_MMX_WITH_SSE && vmode == V2SFmode)
      && !(TARGET_SSE2 && vmode == V2DFmode))
    return false;

  /* Only the first element is changed.  */
  if (d->perm[0] != nelt && d->perm[0] != 0)
    return false;
  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != i + nelt - d->perm[0])
      return false;

  if (d->testing_p)
    return true;

  if (d->perm[0] == nelt)
    x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
  else
    x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));

  emit_insn (gen_rtx_SET (d->target, x));

  return true;
}

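/* Example of a permutation this accepts: V2DFmode with perm = {2, 1},
   i.e. element 0 from op1 and element 1 from op0.  perm[0] == nelt and
   perm[1] == 1 + nelt - perm[0] == 1, so the vec_merge (vmode, op1,
   op0, 1) emitted above is exactly what movsd computes.  A perm such
   as {2, 0} fails the consistency loop and is left to the other
   strategies.  */
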
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   in terms of blendp[sd] / pblendw / pblendvb / vpblendd.  */

static bool
expand_vec_perm_blend (struct expand_vec_perm_d *d)
{
  machine_mode mmode, vmode = d->vmode;
  unsigned i, nelt = d->nelt;
  unsigned HOST_WIDE_INT mask;
  rtx target, op0, op1, maskop, x;
  rtx rperm[32], vperm;

  if (d->one_operand_p)
    return false;
  if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
      && (TARGET_AVX512BW
          || GET_MODE_UNIT_SIZE (vmode) >= 4))
    ;
  else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
    ;
  else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
    ;
  else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 16
                             || GET_MODE_SIZE (vmode) == 8
                             || GET_MODE_SIZE (vmode) == 4))
    ;
  else
    return false;

  /* This is a blend, not a permute.  Elements must stay in their
     respective lanes.  */
  for (i = 0; i < nelt; ++i)
    {
      unsigned e = d->perm[i];
      if (!(e == i || e == i + nelt))
        return false;
    }

  if (d->testing_p)
    return true;

  /* ??? Without SSE4.1, we could implement this with and/andn/or.  This
     decision should be extracted elsewhere, so that we only try that
     sequence once all budget==3 options have been tried.  */
  target = d->target;
  op0 = d->op0;
  op1 = d->op1;
  mask = 0;

  switch (vmode)
    {
    case E_V8DFmode:
    case E_V16SFmode:
    case E_V4DFmode:
    case E_V8SFmode:
    case E_V2DFmode:
    case E_V4SFmode:
    case E_V4HImode:
    case E_V8HImode:
    case E_V8SImode:
    case E_V32HImode:
    case E_V64QImode:
    case E_V16SImode:
    case E_V8DImode:
      for (i = 0; i < nelt; ++i)
        mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
      break;

    case E_V2DImode:
      for (i = 0; i < 2; ++i)
        mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
      vmode = V8HImode;
      goto do_subreg;

    case E_V2SImode:
      for (i = 0; i < 2; ++i)
        mask |= (d->perm[i] >= 2 ? 3 : 0) << (i * 2);
      vmode = V4HImode;
      goto do_subreg;

    case E_V4SImode:
      for (i = 0; i < 4; ++i)
        mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
      vmode = V8HImode;
      goto do_subreg;

    case E_V16QImode:
      /* See if bytes move in pairs so we can use pblendw with
         an immediate argument, rather than pblendvb with a vector
         argument.  */
      for (i = 0; i < 16; i += 2)
        if (d->perm[i] + 1 != d->perm[i + 1])
          {
          use_pblendvb:
            for (i = 0; i < nelt; ++i)
              rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);

          finish_pblendvb:
            vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
            vperm = force_reg (vmode, vperm);

            if (GET_MODE_SIZE (vmode) == 4)
              emit_insn (gen_mmx_pblendvb_v4qi (target, op0, op1, vperm));
            else if (GET_MODE_SIZE (vmode) == 8)
              emit_insn (gen_mmx_pblendvb_v8qi (target, op0, op1, vperm));
            else if (GET_MODE_SIZE (vmode) == 16)
              emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
            else
              emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
            if (target != d->target)
              emit_move_insn (d->target, gen_lowpart (d->vmode, target));
            return true;
          }

      for (i = 0; i < 8; ++i)
        mask |= (d->perm[i * 2] >= 16) << i;
      vmode = V8HImode;
      /* FALLTHRU */

    do_subreg:
      target = gen_reg_rtx (vmode);
      op0 = gen_lowpart (vmode, op0);
      op1 = gen_lowpart (vmode, op1);
      break;

    case E_V8QImode:
      /* See if bytes move in pairs.  */
      for (i = 0; i < 8; i += 2)
        if (d->perm[i] + 1 != d->perm[i + 1])
          goto use_pblendvb;

      for (i = 0; i < 4; ++i)
        mask |= (d->perm[i * 2] >= 8) << i;
      vmode = V4HImode;
      goto do_subreg;

    case E_V4QImode:
      /* See if bytes move in pairs.  */
      for (i = 0; i < 4; i += 2)
        if (d->perm[i] + 1 != d->perm[i + 1])
          goto use_pblendvb;

      for (i = 0; i < 2; ++i)
        mask |= (d->perm[i * 2] >= 4) << i;
      vmode = V2HImode;
      goto do_subreg;

    case E_V32QImode:
      /* See if bytes move in pairs.  If not, vpblendvb must be used.  */
      for (i = 0; i < 32; i += 2)
        if (d->perm[i] + 1 != d->perm[i + 1])
          goto use_pblendvb;
      /* See if bytes move in quadruplets.  If yes, vpblendd
         with immediate can be used.  */
      for (i = 0; i < 32; i += 4)
        if (d->perm[i] + 2 != d->perm[i + 2])
          break;
      if (i < 32)
        {
          /* See if bytes move the same in both lanes.  If yes,
             vpblendw with immediate can be used.  */
          for (i = 0; i < 16; i += 2)
            if (d->perm[i] + 16 != d->perm[i + 16])
              goto use_pblendvb;

          /* Use vpblendw.  */
          for (i = 0; i < 16; ++i)
            mask |= (d->perm[i * 2] >= 32) << i;
          vmode = V16HImode;
          goto do_subreg;
        }

      /* Use vpblendd.  */
      for (i = 0; i < 8; ++i)
        mask |= (d->perm[i * 4] >= 32) << i;
      vmode = V8SImode;
      goto do_subreg;

    case E_V16HImode:
      /* See if words move in pairs.  If yes, vpblendd can be used.  */
      for (i = 0; i < 16; i += 2)
        if (d->perm[i] + 1 != d->perm[i + 1])
          break;
      if (i < 16)
        {
          /* See if words move the same in both lanes.  If not,
             vpblendvb must be used.  */
          for (i = 0; i < 8; i++)
            if (d->perm[i] + 8 != d->perm[i + 8])
              {
                /* Use vpblendvb.  */
                for (i = 0; i < 32; ++i)
                  rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);

                vmode = V32QImode;
                nelt = 32;
                target = gen_reg_rtx (vmode);
                op0 = gen_lowpart (vmode, op0);
                op1 = gen_lowpart (vmode, op1);
                goto finish_pblendvb;
              }

          /* Use vpblendw.  */
          for (i = 0; i < 16; ++i)
            mask |= (d->perm[i] >= 16) << i;
          break;
        }

      /* Use vpblendd.  */
      for (i = 0; i < 8; ++i)
        mask |= (d->perm[i * 2] >= 16) << i;
      vmode = V8SImode;
      goto do_subreg;

    case E_V4DImode:
      /* Use vpblendd.  */
      for (i = 0; i < 4; ++i)
        mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
      vmode = V8SImode;
      goto do_subreg;

    default:
      gcc_unreachable ();
    }

  switch (vmode)
    {
    case E_V8DFmode:
    case E_V8DImode:
      mmode = QImode;
      break;
    case E_V16SFmode:
    case E_V16SImode:
      mmode = HImode;
      break;
    case E_V32HImode:
      mmode = SImode;
      break;
    case E_V64QImode:
      mmode = DImode;
      break;
    default:
      mmode = VOIDmode;
    }

  if (mmode != VOIDmode)
    maskop = force_reg (mmode, gen_int_mode (mask, mmode));
  else
    maskop = GEN_INT (mask);

  /* This matches five different patterns with the different modes.  */
  x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
  x = gen_rtx_SET (target, x);
  emit_insn (x);
  if (target != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, target));

  return true;
}

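/* Example of the mask computations above: a V8HImode blend with
   perm = {0, 9, 2, 11, 4, 13, 6, 15} takes the odd elements from op1,
   so mask == 0xaa and the vec_merge matches pblendw $0xaa.  For
   V4SImode each element widens to two mask bits, e.g. {0, 5, 2, 7}
   gives 0b11001100 once the permutation is rewritten as a V8HImode
   blend.  */
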
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   in terms of the variable form of vpermilps.

   Note that we will have already failed the immediate input vpermilps,
   which requires that the high and low part shuffle be identical; the
   variable form doesn't require that.  */

static bool
expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
{
  rtx rperm[8], vperm;
  unsigned i;

  if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
    return false;

  /* We can only permute within the 128-bit lane.  */
  for (i = 0; i < 8; ++i)
    {
      unsigned e = d->perm[i];
      if (i < 4 ? e >= 4 : e < 4)
        return false;
    }

  if (d->testing_p)
    return true;

  for (i = 0; i < 8; ++i)
    {
      unsigned e = d->perm[i];

      /* Within each 128-bit lane, the elements of op0 are numbered
         from 0 and the elements of op1 are numbered from 4.  */
      if (e >= 8 + 4)
        e -= (8 + 4);
      else if (e >= 4)
        e -= 4;

      rperm[i] = GEN_INT (e);
    }

  vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
  vperm = force_reg (V8SImode, vperm);
  emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));

  return true;
}

/* For V*[QHS]Imode permutations, check if the same permutation
   can't be performed in a 2x, 4x or 8x wider inner mode.  */

static bool
canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
                              struct expand_vec_perm_d *nd)
{
  int i;
  machine_mode mode = VOIDmode;

  switch (d->vmode)
    {
    case E_V8QImode: mode = V4HImode; break;
    case E_V16QImode: mode = V8HImode; break;
    case E_V32QImode: mode = V16HImode; break;
    case E_V64QImode: mode = V32HImode; break;
    case E_V4HImode: mode = V2SImode; break;
    case E_V8HImode: mode = V4SImode; break;
    case E_V16HImode: mode = V8SImode; break;
    case E_V32HImode: mode = V16SImode; break;
    case E_V4SImode: mode = V2DImode; break;
    case E_V8SImode: mode = V4DImode; break;
    case E_V16SImode: mode = V8DImode; break;
    default: return false;
    }
  for (i = 0; i < d->nelt; i += 2)
    if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
      return false;
  nd->vmode = mode;
  nd->nelt = d->nelt / 2;
  for (i = 0; i < nd->nelt; i++)
    nd->perm[i] = d->perm[2 * i] / 2;
  if (GET_MODE_INNER (mode) != DImode)
    canonicalize_vector_int_perm (nd, nd);
  if (nd != d)
    {
      nd->one_operand_p = d->one_operand_p;
      nd->testing_p = d->testing_p;
      if (d->op0 == d->op1)
        nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
      else
        {
          nd->op0 = gen_lowpart (nd->vmode, d->op0);
          nd->op1 = gen_lowpart (nd->vmode, d->op1);
        }
      if (d->testing_p)
        nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
      else
        nd->target = gen_reg_rtx (nd->vmode);
    }
  return true;
}

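/* Example: the V16QImode permutation {2, 3, 0, 1, 6, 7, 4, 5, ...}
   moves whole aligned byte pairs, so it narrows to the V8HImode
   permutation {1, 0, 3, 2, ...}, and the recursion then tries to
   narrow further towards V4SImode and V2DImode.  A perm starting
   {1, 2, ...} pairs an odd index and is rejected at once.  */
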
/* Return true if permutation D can be performed as VMODE permutation
   instead.  */

static bool
valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
{
  unsigned int i, j, chunk;

  if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
      || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
      || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
    return false;

  if (GET_MODE_NUNITS (vmode) >= d->nelt)
    return true;

  chunk = d->nelt / GET_MODE_NUNITS (vmode);
  for (i = 0; i < d->nelt; i += chunk)
    if (d->perm[i] & (chunk - 1))
      return false;
    else
      for (j = 1; j < chunk; ++j)
        if (d->perm[i] + j != d->perm[i + j])
          return false;

  return true;
}

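/* Example: for a V16QImode permutation tested against vmode ==
   V4SImode, chunk is 4, so the perm passes only if every group of four
   byte indices starts on a multiple of 4 and is consecutive, e.g.
   {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11} - precisely the
   permutations expressible as a dword shuffle.  */
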
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128.  */

static bool
expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
{
  unsigned i, nelt, eltsz, mask;
  unsigned char perm[64];
  machine_mode vmode;
  struct expand_vec_perm_d nd;
  rtx rperm[64], vperm, target, op0, op1;

  nelt = d->nelt;

  if (!d->one_operand_p)
    switch (GET_MODE_SIZE (d->vmode))
      {
      case 4:
        if (!TARGET_XOP)
          return false;
        vmode = V4QImode;
        break;

      case 8:
        if (!TARGET_XOP)
          return false;
        vmode = V8QImode;
        break;

      case 16:
        if (!TARGET_XOP)
          return false;
        vmode = V16QImode;
        break;

      case 32:
        if (!TARGET_AVX2)
          return false;

        if (valid_perm_using_mode_p (V2TImode, d))
          {
            if (d->testing_p)
              return true;

            /* Use vperm2i128 insn.  The pattern uses
               V4DImode instead of V2TImode.  */
            target = d->target;
            if (d->vmode != V4DImode)
              target = gen_reg_rtx (V4DImode);
            op0 = gen_lowpart (V4DImode, d->op0);
            op1 = gen_lowpart (V4DImode, d->op1);
            rperm[0]
              = GEN_INT ((d->perm[0] / (nelt / 2))
                         | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
            emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
            if (target != d->target)
              emit_move_insn (d->target, gen_lowpart (d->vmode, target));
            return true;
          }
        /* FALLTHRU */

      default:
        return false;
      }
  else
    switch (GET_MODE_SIZE (d->vmode))
      {
      case 4:
        if (!TARGET_SSSE3)
          return false;
        vmode = V4QImode;
        break;

      case 8:
        if (!TARGET_SSSE3)
          return false;
        vmode = V8QImode;
        break;

      case 16:
        if (!TARGET_SSSE3)
          return false;
        vmode = V16QImode;
        break;

      case 32:
        if (!TARGET_AVX2)
          return false;

        /* V4DImode should be already handled through
           expand_vselect by vpermq instruction.  */
        gcc_assert (d->vmode != V4DImode);

        vmode = V32QImode;
        if (d->vmode == V8SImode
            || d->vmode == V16HImode
            || d->vmode == V32QImode)
          {
            /* First see if vpermq can be used for
               V8SImode/V16HImode/V32QImode.  */
            if (valid_perm_using_mode_p (V4DImode, d))
              {
                for (i = 0; i < 4; i++)
                  perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
                if (d->testing_p)
                  return true;
                target = gen_reg_rtx (V4DImode);
                if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
                                    perm, 4, false))
                  {
                    emit_move_insn (d->target,
                                    gen_lowpart (d->vmode, target));
                    return true;
                  }
                return false;
              }

            /* Next see if vpermd can be used.  */
            if (valid_perm_using_mode_p (V8SImode, d))
              vmode = V8SImode;
          }
        /* Or if vpermps can be used.  */
        else if (d->vmode == V8SFmode)
          vmode = V8SFmode;

        if (vmode == V32QImode)
          {
            /* vpshufb only works intra lanes, it is not
               possible to shuffle bytes in between the lanes.  */
            for (i = 0; i < nelt; ++i)
              if ((d->perm[i] ^ i) & (nelt / 2))
                return false;
          }
        break;

      case 64:
        if (!TARGET_AVX512BW)
          return false;

        /* If vpermq didn't work, vpshufb won't work either.  */
        if (d->vmode == V8DFmode || d->vmode == V8DImode)
          return false;

        vmode = V64QImode;
        if (d->vmode == V16SImode
            || d->vmode == V32HImode
            || d->vmode == V64QImode)
          {
            /* First see if vpermq can be used for
               V16SImode/V32HImode/V64QImode.  */
            if (valid_perm_using_mode_p (V8DImode, d))
              {
                for (i = 0; i < 8; i++)
                  perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
                if (d->testing_p)
                  return true;
                target = gen_reg_rtx (V8DImode);
                if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
                                    perm, 8, false))
                  {
                    emit_move_insn (d->target,
                                    gen_lowpart (d->vmode, target));
                    return true;
                  }
                return false;
              }

            /* Next see if vpermd can be used.  */
            if (valid_perm_using_mode_p (V16SImode, d))
              vmode = V16SImode;
          }
        /* Or if vpermps can be used.  */
        else if (d->vmode == V16SFmode)
          vmode = V16SFmode;

        if (vmode == V64QImode)
          {
            /* vpshufb only works intra lanes, it is not
               possible to shuffle bytes in between the lanes.  */
            for (i = 0; i < nelt; ++i)
              if ((d->perm[i] ^ i) & (3 * nelt / 4))
                return false;
          }
        break;

      default:
        return false;
      }

  if (d->testing_p)
    return true;

  /* Try to avoid variable permutation instruction.  */
  if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
    {
      emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
      return true;
    }

  if (vmode == V8SImode || vmode == V8SFmode)
    for (i = 0; i < 8; ++i)
      rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
  else if (vmode == V16SImode || vmode == V16SFmode)
    for (i = 0; i < 16; ++i)
      rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
  else
    {
      eltsz = GET_MODE_UNIT_SIZE (d->vmode);
      if (!d->one_operand_p)
        mask = 2 * nelt - 1;
      else if (vmode == V64QImode)
        mask = nelt / 4 - 1;
      else if (vmode == V32QImode)
        mask = nelt / 2 - 1;
      else
        mask = nelt - 1;

      for (i = 0; i < nelt; ++i)
        {
          unsigned j, e = d->perm[i] & mask;
          for (j = 0; j < eltsz; ++j)
            rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
        }
    }

  machine_mode vpmode = vmode;

  /* The selector of vpermps resp. vpermd is an integer vector.  */
  if (vmode == V8SFmode)
    vpmode = V8SImode;
  else if (vmode == V16SFmode)
    vpmode = V16SImode;

  nelt = GET_MODE_SIZE (vmode);

  /* Emulate narrow modes with V16QI instructions.  */
  if (nelt < 16)
    {
      rtx m128 = GEN_INT (-128);

      /* Remap elements from the second operand, as we have to
         account for inactive top elements from the first operand.  */
      if (!d->one_operand_p)
        for (i = 0; i < nelt; ++i)
          {
            unsigned ival = UINTVAL (rperm[i]);
            if (ival >= nelt)
              rperm[i] = GEN_INT (ival + 16 - nelt);
          }

      /* Fill inactive elements in the top positions with zeros.  */
      for (i = nelt; i < 16; ++i)
        rperm[i] = m128;

      vpmode = V16QImode;
    }

  vperm = gen_rtx_CONST_VECTOR (vpmode,
                                gen_rtvec_v (GET_MODE_NUNITS (vpmode), rperm));
  vperm = force_reg (vpmode, vperm);

  if (vmode == d->vmode)
    target = d->target;
  else
    target = gen_reg_rtx (vmode);

  op0 = gen_lowpart (vmode, d->op0);

  if (d->one_operand_p)
    {
      rtx (*gen) (rtx, rtx, rtx);

      if (vmode == V4QImode)
        gen = gen_mmx_pshufbv4qi3;
      else if (vmode == V8QImode)
        gen = gen_mmx_pshufbv8qi3;
      else if (vmode == V16QImode)
        gen = gen_ssse3_pshufbv16qi3;
      else if (vmode == V32QImode)
        gen = gen_avx2_pshufbv32qi3;
      else if (vmode == V64QImode)
        gen = gen_avx512bw_pshufbv64qi3;
      else if (vmode == V8SFmode)
        gen = gen_avx2_permvarv8sf;
      else if (vmode == V8SImode)
        gen = gen_avx2_permvarv8si;
      else if (vmode == V16SFmode)
        gen = gen_avx512f_permvarv16sf;
      else if (vmode == V16SImode)
        gen = gen_avx512f_permvarv16si;
      else
        gcc_unreachable ();

      emit_insn (gen (target, op0, vperm));
    }
  else
    {
      rtx (*gen) (rtx, rtx, rtx, rtx);

      op1 = gen_lowpart (vmode, d->op1);

      if (vmode == V4QImode)
        gen = gen_mmx_ppermv32;
      else if (vmode == V8QImode)
        gen = gen_mmx_ppermv64;
      else if (vmode == V16QImode)
        gen = gen_xop_pperm;
      else
        gcc_unreachable ();

      emit_insn (gen (target, op0, op1, vperm));
    }

  if (target != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, target));

  return true;
}

/* Try to expand one-operand permutation with constant mask.  */

static bool
ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
{
  machine_mode mode = GET_MODE (d->op0);
  machine_mode maskmode = mode;
  unsigned inner_size = GET_MODE_SIZE (GET_MODE_INNER (mode));
  rtx (*gen) (rtx, rtx, rtx) = NULL;
  rtx target, op0, mask;
  rtx vec[64];

  if (!rtx_equal_p (d->op0, d->op1))
    return false;

  if (!TARGET_AVX512F)
    return false;

  /* Accept VNxHImode and VNxQImode now.  */
  if (!TARGET_AVX512VL && GET_MODE_SIZE (mode) < 64)
    return false;

  /* vpermw.  */
  if (!TARGET_AVX512BW && inner_size == 2)
    return false;

  /* vpermb.  */
  if (!TARGET_AVX512VBMI && inner_size == 1)
    return false;

  switch (mode)
    {
    case E_V16SImode:
      gen = gen_avx512f_permvarv16si;
      break;
    case E_V16SFmode:
      gen = gen_avx512f_permvarv16sf;
      maskmode = V16SImode;
      break;
    case E_V8DImode:
      gen = gen_avx512f_permvarv8di;
      break;
    case E_V8DFmode:
      gen = gen_avx512f_permvarv8df;
      maskmode = V8DImode;
      break;
    case E_V32HImode:
      gen = gen_avx512bw_permvarv32hi;
      break;
    case E_V16HImode:
      gen = gen_avx512vl_permvarv16hi;
      break;
    case E_V8HImode:
      gen = gen_avx512vl_permvarv8hi;
      break;
    case E_V64QImode:
      gen = gen_avx512bw_permvarv64qi;
      break;
    case E_V32QImode:
      gen = gen_avx512vl_permvarv32qi;
      break;
    case E_V16QImode:
      gen = gen_avx512vl_permvarv16qi;
      break;
    default:
      return false;
    }

  if (d->testing_p)
    return true;

  target = d->target;
  op0 = d->op0;
  for (int i = 0; i < d->nelt; ++i)
    vec[i] = GEN_INT (d->perm[i]);
  mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
  emit_insn (gen (target, op0, force_reg (maskmode, mask)));
  return true;
}

static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);

/* A subroutine of ix86_expand_vec_perm_const_1.  Try to instantiate D
   in a single instruction.  */

static bool
expand_vec_perm_1 (struct expand_vec_perm_d *d)
{
  unsigned i, nelt = d->nelt;
  struct expand_vec_perm_d nd;

  /* Check plain VEC_SELECT first, because AVX has instructions that could
     match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
     input where SEL+CONCAT may not.  */
  if (d->one_operand_p)
    {
      int mask = nelt - 1;
      bool identity_perm = true;
      bool broadcast_perm = true;

      for (i = 0; i < nelt; i++)
        {
          nd.perm[i] = d->perm[i] & mask;
          if (nd.perm[i] != i)
            identity_perm = false;
          if (nd.perm[i])
            broadcast_perm = false;
        }

      if (identity_perm)
        {
          if (!d->testing_p)
            emit_move_insn (d->target, d->op0);
          return true;
        }
      else if (broadcast_perm && TARGET_AVX2)
        {
          /* Use vpbroadcast{b,w,d}.  */
          rtx (*gen) (rtx, rtx) = NULL;
          switch (d->vmode)
            {
            case E_V64QImode:
              if (TARGET_AVX512BW)
                gen = gen_avx512bw_vec_dupv64qi_1;
              break;
            case E_V32QImode:
              gen = gen_avx2_pbroadcastv32qi_1;
              break;
            case E_V32HImode:
              if (TARGET_AVX512BW)
                gen = gen_avx512bw_vec_dupv32hi_1;
              break;
            case E_V16HImode:
              gen = gen_avx2_pbroadcastv16hi_1;
              break;
            case E_V16SImode:
              if (TARGET_AVX512F)
                gen = gen_avx512f_vec_dupv16si_1;
              break;
            case E_V8SImode:
              gen = gen_avx2_pbroadcastv8si_1;
              break;
            case E_V16QImode:
              gen = gen_avx2_pbroadcastv16qi;
              break;
            case E_V8HImode:
              gen = gen_avx2_pbroadcastv8hi;
              break;
            case E_V16SFmode:
              if (TARGET_AVX512F)
                gen = gen_avx512f_vec_dupv16sf_1;
              break;
            case E_V8SFmode:
              gen = gen_avx2_vec_dupv8sf_1;
              break;
            case E_V8DFmode:
              if (TARGET_AVX512F)
                gen = gen_avx512f_vec_dupv8df_1;
              break;
            case E_V8DImode:
              if (TARGET_AVX512F)
                gen = gen_avx512f_vec_dupv8di_1;
              break;
            /* For other modes prefer other shuffles this function creates.  */
            default: break;
            }
          if (gen != NULL)
            {
              if (!d->testing_p)
                emit_insn (gen (d->target, d->op0));
              return true;
            }
        }

      if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
        return true;

      /* There are plenty of patterns in sse.md that are written for
         SEL+CONCAT and are not replicated for a single op.  Perhaps
         that should be changed, to avoid the nastiness here.  */

      /* Recognize interleave style patterns, which means incrementing
         every other permutation operand.  */
      for (i = 0; i < nelt; i += 2)
        {
          nd.perm[i] = d->perm[i] & mask;
          nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
        }
      if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
                                  d->testing_p))
        return true;

      /* Recognize shufps, which means adding {0, 0, nelt, nelt}.  */
      if (nelt >= 4)
        {
          for (i = 0; i < nelt; i += 4)
            {
              nd.perm[i + 0] = d->perm[i + 0] & mask;
              nd.perm[i + 1] = d->perm[i + 1] & mask;
              nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
              nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
            }

          if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm,
                                      nelt, d->testing_p))
            return true;
        }
    }

  /* Try movss/movsd instructions.  */
  if (expand_vec_perm_movs (d))
    return true;

  /* Finally, try the fully general two operand permute.  */
  if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
                              d->testing_p))
    return true;

  /* Recognize interleave style patterns with reversed operands.  */
  if (!d->one_operand_p)
    {
      for (i = 0; i < nelt; ++i)
        {
          unsigned e = d->perm[i];
          if (e >= nelt)
            e -= nelt;
          else
            e += nelt;
          nd.perm[i] = e;
        }

      if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
                                  d->testing_p))
        return true;
    }

  /* Try the SSE4.1 blend variable merge instructions.  */
  if (expand_vec_perm_blend (d))
    return true;

  /* Try one of the AVX vpermil variable permutations.  */
  if (expand_vec_perm_vpermil (d))
    return true;

  /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
     vpshufb, vpermd, vpermps or vpermq variable permutation.  */
  if (expand_vec_perm_pshufb (d))
    return true;

  /* Try the AVX2 vpalignr instruction.  */
  if (expand_vec_perm_palignr (d, true))
    return true;

  /* Try the AVX512F vperm{w,b,s,d} instructions  */
  if (ix86_expand_vec_one_operand_perm_avx512 (d))
    return true;

  /* Try the AVX512F vpermt2/vpermi2 instructions.  */
  if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
    return true;

  /* See if we can get the same permutation in different vector integer
     mode.  */
  if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
    {
      if (!d->testing_p)
        emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
      return true;
    }
  return false;
}

/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   in terms of a pair of pshuflw + pshufhw instructions.  */

static bool
expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
{
  unsigned char perm2[MAX_VECT_LEN];
  unsigned i;
  bool ok;

  if (d->vmode != V8HImode || !d->one_operand_p)
    return false;

  /* The two permutations only operate in 64-bit lanes.  */
  for (i = 0; i < 4; ++i)
    if (d->perm[i] >= 4)
      return false;
  for (i = 4; i < 8; ++i)
    if (d->perm[i] < 4)
      return false;

  if (d->testing_p)
    return true;

  /* Emit the pshuflw.  */
  memcpy (perm2, d->perm, 4);
  for (i = 4; i < 8; ++i)
    perm2[i] = i;
  ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
  gcc_assert (ok);

  /* Emit the pshufhw.  */
  memcpy (perm2 + 4, d->perm + 4, 4);
  for (i = 0; i < 4; ++i)
    perm2[i] = i;
  ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
  gcc_assert (ok);

  return true;
}

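/* Example: the V8HImode permutation {3, 1, 2, 0, 7, 5, 6, 4} keeps
   every index inside its 64-bit half, so it is emitted as pshuflw with
   {3, 1, 2, 0, 4, 5, 6, 7} followed by pshufhw with
   {0, 1, 2, 3, 7, 5, 6, 4}; any index crossing the halves, e.g.
   perm[0] == 4, is rejected up front.  */
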
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   the permutation using the SSSE3 palignr instruction.  This succeeds
   when all of the elements in PERM fit within one vector and we merely
   need to shift them down so that a single vector permutation has a
   chance to succeed.  If SINGLE_INSN_ONLY_P, succeed if only
   the vpalignr instruction itself can perform the requested permutation.  */

static bool
expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
{
  unsigned i, nelt = d->nelt;
  unsigned min, max, minswap, maxswap;
  bool in_order, ok, swap = false;
  rtx shift, target;
  struct expand_vec_perm_d dcopy;

  /* Even with AVX, palignr only operates on 128-bit vectors,
     in AVX2 palignr operates on both 128-bit lanes.  */
  if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
      && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
    return false;

  min = 2 * nelt;
  max = 0;
  minswap = 2 * nelt;
  maxswap = 0;
  for (i = 0; i < nelt; ++i)
    {
      unsigned e = d->perm[i];
      unsigned eswap = d->perm[i] ^ nelt;
      if (GET_MODE_SIZE (d->vmode) == 32)
        {
          e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
          eswap = e ^ (nelt / 2);
        }
      if (e < min)
        min = e;
      if (e > max)
        max = e;
      if (eswap < minswap)
        minswap = eswap;
      if (eswap > maxswap)
        maxswap = eswap;
    }
  if (min == 0
      || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
    {
      if (d->one_operand_p
          || minswap == 0
          || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
                                   ? nelt / 2 : nelt))
        return false;
      swap = true;
      min = minswap;
      max = maxswap;
    }

  /* Given that we have SSSE3, we know we'll be able to implement the
     single operand permutation after the palignr with pshufb for
     128-bit vectors.  If SINGLE_INSN_ONLY_P, in_order has to be computed
     first.  */
  if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
    return true;

  dcopy = *d;
  if (swap)
    {
      dcopy.op0 = d->op1;
      dcopy.op1 = d->op0;
      for (i = 0; i < nelt; ++i)
        dcopy.perm[i] ^= nelt;
    }

  in_order = true;
  for (i = 0; i < nelt; ++i)
    {
      unsigned e = dcopy.perm[i];
      if (GET_MODE_SIZE (d->vmode) == 32
          && e >= nelt
          && (e & (nelt / 2 - 1)) < min)
        e = e - min - (nelt / 2);
      else
        e = e - min;
      if (e != i)
        in_order = false;
      dcopy.perm[i] = e;
    }
  dcopy.one_operand_p = true;

  if (single_insn_only_p && !in_order)
    return false;

  /* For AVX2, test whether we can permute the result in one instruction.  */
  if (d->testing_p)
    {
      if (in_order)
        return true;
      dcopy.op1 = dcopy.op0;
      return expand_vec_perm_1 (&dcopy);
    }

  shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
  if (GET_MODE_SIZE (d->vmode) == 16)
    {
      target = gen_reg_rtx (TImode);
      emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
                                      gen_lowpart (TImode, dcopy.op0), shift));
    }
  else
    {
      target = gen_reg_rtx (V2TImode);
      emit_insn (gen_avx2_palignrv2ti (target,
                                       gen_lowpart (V2TImode, dcopy.op1),
                                       gen_lowpart (V2TImode, dcopy.op0),
                                       shift));
    }

  dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);

  /* Test for the degenerate case where the alignment by itself
     produces the desired permutation.  */
  if (in_order)
    {
      emit_move_insn (d->target, dcopy.op0);
      return true;
    }

  ok = expand_vec_perm_1 (&dcopy);
  gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);

  return ok;
}

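/* A worked example: for V16QImode with perm = {3, 4, ..., 18} all
   elements fall inside one 16-byte window starting at min == 3, so a
   single palignr $3 of {op1, op0} realizes the permutation and the
   residual perm becomes the identity (in_order).  If the residual were
   not the identity, the recursive expand_vec_perm_1 call above would
   finish the job with a single-operand shuffle such as pshufb.  */
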
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   the permutation using the SSE4_1 pblendv instruction.  Potentially
   reduces permutation from 2 pshufb and or to 1 pshufb and pblendv.  */

static bool
expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
{
  unsigned i, which, nelt = d->nelt;
  struct expand_vec_perm_d dcopy, dcopy1;
  machine_mode vmode = d->vmode;
  bool ok;

  /* Use the same checks as in expand_vec_perm_blend.  */
  if (d->one_operand_p)
    return false;
  if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
    ;
  else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
    ;
  else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 4
                             || GET_MODE_SIZE (vmode) == 8
                             || GET_MODE_SIZE (vmode) == 16))
    ;
  else
    return false;

  /* Figure out where permutation elements stay not in their
     respective lanes.  */
  for (i = 0, which = 0; i < nelt; ++i)
    {
      unsigned e = d->perm[i];
      if (e != i)
        which |= (e < nelt ? 1 : 2);
    }
  /* We can pblend the part where elements stay not in their
     respective lanes only when these elements are all in one
     half of a permutation.
     {0 1 8 3 4 5 9 7} is ok as 8, 9 are at not at their respective
     lanes, but both 8 and 9 >= 8
     {0 1 8 3 4 5 2 7} is not ok as 2 and 8 are not at their
     respective lanes and 8 >= 8, but 2 not.  */
  if (which != 1 && which != 2)
    return false;
  if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
    return true;

  /* First we apply one operand permutation to the part where
     elements stay not in their respective lanes.  */
  dcopy = *d;
  if (which == 2)
    dcopy.op0 = dcopy.op1 = d->op1;
  else
    dcopy.op0 = dcopy.op1 = d->op0;
  dcopy.target = gen_reg_rtx (vmode);
  dcopy.one_operand_p = true;

  for (i = 0; i < nelt; ++i)
    dcopy.perm[i] = d->perm[i] & (nelt - 1);

  ok = expand_vec_perm_1 (&dcopy);
  if (GET_MODE_SIZE (vmode) != 16 && !ok)
    return false;
  else
    gcc_assert (ok);
  if (d->testing_p)
    return true;

  /* Next we put permuted elements into their positions.  */
  dcopy1 = *d;
  if (which == 2)
    dcopy1.op1 = dcopy.target;
  else
    dcopy1.op0 = dcopy.target;

  for (i = 0; i < nelt; ++i)
    dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);

  ok = expand_vec_perm_blend (&dcopy1);
  gcc_assert (ok);

  return true;
}

static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);

/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   a two vector permutation into a single vector permutation by using
   an interleave operation to merge the vectors.  */

static bool
expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dremap, dfinal;
  unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
  unsigned HOST_WIDE_INT contents;
  unsigned char remap[2 * MAX_VECT_LEN];
  rtx_insn *seq;
  bool ok, same_halves = false;

  if (GET_MODE_SIZE (d->vmode) == 4
      || GET_MODE_SIZE (d->vmode) == 8
      || GET_MODE_SIZE (d->vmode) == 16)
    {
      if (d->one_operand_p)
        return false;
    }
  else if (GET_MODE_SIZE (d->vmode) == 32)
    {
      if (!TARGET_AVX)
        return false;
      /* For 32-byte modes allow even d->one_operand_p.
         The lack of cross-lane shuffling in some instructions
         might prevent a single insn shuffle.  */
      dfinal = *d;
      dfinal.testing_p = true;
      /* If expand_vec_perm_interleave3 can expand this into
         a 3 insn sequence, give up and let it be expanded as
         3 insn sequence.  While that is one insn longer,
         it doesn't need a memory operand and in the common
         case that both interleave low and high permutations
         with the same operands are adjacent needs 4 insns
         for both after CSE.  */
      if (expand_vec_perm_interleave3 (&dfinal))
        return false;
    }
  else
    return false;

  /* Examine from whence the elements come.  */
  contents = 0;
  for (i = 0; i < nelt; ++i)
    contents |= HOST_WIDE_INT_1U << d->perm[i];

  memset (remap, 0xff, sizeof (remap));
  dremap = *d;

  if (GET_MODE_SIZE (d->vmode) == 4
      || GET_MODE_SIZE (d->vmode) == 8)
    {
      unsigned HOST_WIDE_INT h1, h2, h3, h4;

      /* Split the two input vectors into 4 halves.  */
      h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
      h2 = h1 << nelt2;
      h3 = h2 << nelt2;
      h4 = h3 << nelt2;

      /* If the elements from the low halves use interleave low,
         and similarly for interleave high.  */
      if ((contents & (h1 | h3)) == contents)
        {
          /* punpckl* */
          for (i = 0; i < nelt2; ++i)
            {
              remap[i] = i * 2;
              remap[i + nelt] = i * 2 + 1;
              dremap.perm[i * 2] = i;
              dremap.perm[i * 2 + 1] = i + nelt;
            }
        }
      else if ((contents & (h2 | h4)) == contents)
        {
          /* punpckh* */
          for (i = 0; i < nelt2; ++i)
            {
              remap[i + nelt2] = i * 2;
              remap[i + nelt + nelt2] = i * 2 + 1;
              dremap.perm[i * 2] = i + nelt2;
              dremap.perm[i * 2 + 1] = i + nelt + nelt2;
            }
        }
      else
        return false;
    }
  else if (GET_MODE_SIZE (d->vmode) == 16)
    {
      unsigned HOST_WIDE_INT h1, h2, h3, h4;

      /* Split the two input vectors into 4 halves.  */
      h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
      h2 = h1 << nelt2;
      h3 = h2 << nelt2;
      h4 = h3 << nelt2;

      /* If the elements from the low halves use interleave low, and similarly
         for interleave high.  If the elements are from mis-matched halves, we
         can use shufps for V4SF/V4SI or do a DImode shuffle.  */
      if ((contents & (h1 | h3)) == contents)
        {
          /* punpckl* */
          for (i = 0; i < nelt2; ++i)
            {
              remap[i] = i * 2;
              remap[i + nelt] = i * 2 + 1;
              dremap.perm[i * 2] = i;
              dremap.perm[i * 2 + 1] = i + nelt;
            }
          if (!TARGET_SSE2 && d->vmode == V4SImode)
            dremap.vmode = V4SFmode;
        }
      else if ((contents & (h2 | h4)) == contents)
        {
          /* punpckh* */
          for (i = 0; i < nelt2; ++i)
            {
              remap[i + nelt2] = i * 2;
              remap[i + nelt + nelt2] = i * 2 + 1;
              dremap.perm[i * 2] = i + nelt2;
              dremap.perm[i * 2 + 1] = i + nelt + nelt2;
            }
          if (!TARGET_SSE2 && d->vmode == V4SImode)
            dremap.vmode = V4SFmode;
        }
      else if ((contents & (h1 | h4)) == contents)
        {
          /* shufps */
          for (i = 0; i < nelt2; ++i)
            {
              remap[i] = i;
              remap[i + nelt + nelt2] = i + nelt2;
              dremap.perm[i] = i;
              dremap.perm[i + nelt2] = i + nelt + nelt2;
            }
          if (nelt != 4)
            {
              /* shufpd */
              dremap.vmode = V2DImode;
              dremap.nelt = 2;
              dremap.perm[0] = 0;
              dremap.perm[1] = 3;
            }
        }
      else if ((contents & (h2 | h3)) == contents)
        {
          /* shufps */
          for (i = 0; i < nelt2; ++i)
            {
              remap[i + nelt2] = i;
              remap[i + nelt] = i + nelt2;
              dremap.perm[i] = i + nelt2;
              dremap.perm[i + nelt2] = i + nelt;
            }
          if (nelt != 4)
            {
              /* shufpd */
              dremap.vmode = V2DImode;
              dremap.nelt = 2;
              dremap.perm[0] = 1;
              dremap.perm[1] = 2;
            }
        }
      else
        return false;
    }
  else
    {
      unsigned int nelt4 = nelt / 4, nzcnt = 0;
      unsigned HOST_WIDE_INT q[8];
      unsigned int nonzero_halves[4];

      /* Split the two input vectors into 8 quarters.  */
      q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
      for (i = 1; i < 8; ++i)
        q[i] = q[0] << (nelt4 * i);
      for (i = 0; i < 4; ++i)
        if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
          {
            nonzero_halves[nzcnt] = i;
            ++nzcnt;
          }

      if (nzcnt == 1)
        {
          gcc_assert (d->one_operand_p);
          nonzero_halves[1] = nonzero_halves[0];
          same_halves = true;
        }
      else if (d->one_operand_p)
        {
          gcc_assert (nonzero_halves[0] == 0);
          gcc_assert (nonzero_halves[1] == 1);
        }

      if (nzcnt <= 2)
        {
          if (d->perm[0] / nelt2 == nonzero_halves[1])
            {
              /* Attempt to increase the likelihood that dfinal
                 shuffle will be intra-lane.  */
              std::swap (nonzero_halves[0], nonzero_halves[1]);
            }

          /* vperm2f128 or vperm2i128.  */
          for (i = 0; i < nelt2; ++i)
            {
              remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
              remap[i + nonzero_halves[0] * nelt2] = i;
              dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
              dremap.perm[i] = i + nonzero_halves[0] * nelt2;
            }

          if (d->vmode != V8SFmode
              && d->vmode != V4DFmode
              && d->vmode != V8SImode)
            {
              dremap.vmode = V8SImode;
              dremap.nelt = 8;
              for (i = 0; i < 4; ++i)
                {
                  dremap.perm[i] = i + nonzero_halves[0] * 4;
                  dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
                }
            }
        }
      else if (d->one_operand_p)
        return false;
      else if (TARGET_AVX2
               && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
        {
          /* vpunpckl* */
          for (i = 0; i < nelt4; ++i)
            {
              remap[i] = i * 2;
              remap[i + nelt] = i * 2 + 1;
              remap[i + nelt2] = i * 2 + nelt2;
              remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
              dremap.perm[i * 2] = i;
              dremap.perm[i * 2 + 1] = i + nelt;
              dremap.perm[i * 2 + nelt2] = i + nelt2;
              dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
            }
        }
      else if (TARGET_AVX2
               && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
        {
          /* vpunpckh* */
          for (i = 0; i < nelt4; ++i)
            {
              remap[i + nelt4] = i * 2;
              remap[i + nelt + nelt4] = i * 2 + 1;
              remap[i + nelt2 + nelt4] = i * 2 + nelt2;
              remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
              dremap.perm[i * 2] = i + nelt4;
              dremap.perm[i * 2 + 1] = i + nelt + nelt4;
              dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
              dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
            }
        }
      else
        return false;
    }

  /* Use the remapping array set up above to move the elements from their
     swizzled locations into their final destinations.  */
  dfinal = *d;
  for (i = 0; i < nelt; ++i)
    {
      unsigned e = remap[d->perm[i]];
      gcc_assert (e < nelt);
      /* If same_halves is true, both halves of the remapped vector are the
         same.  Avoid cross-lane accesses if possible.  */
      if (same_halves && i >= nelt2)
        {
          gcc_assert (e < nelt2);
          dfinal.perm[i] = e + nelt2;
        }
      else
        dfinal.perm[i] = e;
    }
  if (!d->testing_p)
    {
      dremap.target = gen_reg_rtx (dremap.vmode);
      dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
    }
  dfinal.op1 = dfinal.op0;
  dfinal.one_operand_p = true;

  /* Test if the final remap can be done with a single insn.  For V4SFmode or
     V4SImode this *will* succeed.  For V8HImode or V16QImode it may not.  */
  start_sequence ();
  ok = expand_vec_perm_1 (&dfinal);
  seq = get_insns ();
  end_sequence ();

  if (!ok)
    return false;

  if (d->testing_p)
    return true;

  if (dremap.vmode != dfinal.vmode)
    {
      dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
      dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
    }

  ok = expand_vec_perm_1 (&dremap);
  gcc_assert (ok);

  emit_insn (seq);
  return true;
}

/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   a single vector cross-lane permutation into vpermq followed
   by any of the single insn permutations.  */

static bool
expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dremap, dfinal;
  unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
  unsigned contents[2];
  bool ok;

  if (!(TARGET_AVX2
        && (d->vmode == V32QImode || d->vmode == V16HImode)
        && d->one_operand_p))
    return false;

  contents[0] = 0;
  contents[1] = 0;
  for (i = 0; i < nelt2; ++i)
    {
      contents[0] |= 1u << (d->perm[i] / nelt4);
      contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
    }

  for (i = 0; i < 2; ++i)
    {
      unsigned int cnt = 0;
      for (j = 0; j < 4; ++j)
        if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
          return false;
    }

  if (d->testing_p)
    return true;

  dremap = *d;
  dremap.vmode = V4DImode;
  dremap.nelt = 4;
  dremap.target = gen_reg_rtx (V4DImode);
  dremap.op0 = gen_lowpart (V4DImode, d->op0);
  dremap.op1 = dremap.op0;
  dremap.one_operand_p = true;
  for (i = 0; i < 2; ++i)
    {
      unsigned int cnt = 0;
      for (j = 0; j < 4; ++j)
        if ((contents[i] & (1u << j)) != 0)
          dremap.perm[2 * i + cnt++] = j;
      for (; cnt < 2; ++cnt)
        dremap.perm[2 * i + cnt] = 0;
    }

  dfinal = *d;
  dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
  dfinal.op1 = dfinal.op0;
  dfinal.one_operand_p = true;
  for (i = 0, j = 0; i < nelt; ++i)
    {
      if (i == nelt2)
        j = 2;
      dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
      if ((d->perm[i] / nelt4) == dremap.perm[j])
        ;
      else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
        dfinal.perm[i] |= nelt4;
      else
        gcc_unreachable ();
    }

  ok = expand_vec_perm_1 (&dremap);
  gcc_assert (ok);

  ok = expand_vec_perm_1 (&dfinal);
  gcc_assert (ok);

  return true;
}

static bool canonicalize_perm (struct expand_vec_perm_d *d);

/* A subroutine of ix86_expand_vec_perm_const_1.  Try to expand
   a vector permutation using two instructions, vperm2f128 resp.
   vperm2i128 followed by any single in-lane permutation.  */

static bool
expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dfirst, dsecond;
  unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
  bool ok;

  if (!TARGET_AVX
      || GET_MODE_SIZE (d->vmode) != 32
      || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
    return false;

  dsecond = *d;
  dsecond.one_operand_p = false;
  dsecond.testing_p = true;

  /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
     immediate.  For perm < 16 the second permutation uses
     d->op0 as first operand, for perm >= 16 it uses d->op1
     as first operand.  The second operand is the result of
     vperm2[fi]128.  */
  for (perm = 0; perm < 32; perm++)
    {
      /* Ignore permutations which do not move anything cross-lane.  */
      if (perm < 16)
        {
          /* The second shuffle for e.g. V4DFmode has
             0123 and ABCD operands.
             Ignore AB23, as 23 is already in the second lane
             of the first operand.  */
          if ((perm & 0xc) == (1 << 2)) continue;
          /* And 01CD, as 01 is in the first lane of the first
             operand.  */
          if ((perm & 3) == 0) continue;
          /* And 4567, as then the vperm2[fi]128 doesn't change
             anything on the original 4567 second operand.  */
          if ((perm & 0xf) == ((3 << 2) | 2)) continue;
        }
      else
        {
          /* The second shuffle for e.g. V4DFmode has
             4567 and ABCD operands.
             Ignore AB67, as 67 is already in the second lane
             of the first operand.  */
          if ((perm & 0xc) == (3 << 2)) continue;
          /* And 45CD, as 45 is in the first lane of the first
             operand.  */
          if ((perm & 3) == 2) continue;
          /* And 0123, as then the vperm2[fi]128 doesn't change
             anything on the original 0123 first operand.  */
          if ((perm & 0xf) == (1 << 2)) continue;
        }

      for (i = 0; i < nelt; i++)
        {
          j = d->perm[i] / nelt2;
          if (j == ((perm >> (2 * (i >= nelt2))) & 3))
            dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
          else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
            dsecond.perm[i] = d->perm[i] & (nelt - 1);
          else
            break;
        }

      if (i == nelt)
        {
          start_sequence ();
          ok = expand_vec_perm_1 (&dsecond);
          end_sequence ();
        }
      else
        ok = false;

      if (ok)
        {
          if (d->testing_p)
            return true;

          /* Found a usable second shuffle.  dfirst will be
             vperm2f128 on d->op0 and d->op1.  */
          dsecond.testing_p = false;
          dfirst = *d;
          dfirst.target = gen_reg_rtx (d->vmode);
          for (i = 0; i < nelt; i++)
            dfirst.perm[i] = (i & (nelt2 - 1))
                             + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;

          canonicalize_perm (&dfirst);
          ok = expand_vec_perm_1 (&dfirst);
          gcc_assert (ok);

          /* And dsecond is some single insn shuffle, taking
             d->op0 and result of vperm2f128 (if perm < 16) or
             d->op1 and result of vperm2f128 (otherwise).  */
          if (perm >= 16)
            dsecond.op0 = dsecond.op1;
          dsecond.op1 = dfirst.target;

          ok = expand_vec_perm_1 (&dsecond);
          gcc_assert (ok);

          return true;
        }

      /* For one operand, the only useful vperm2f128 permutation is 0x01
         aka lanes swap.  */
      if (d->one_operand_p && perm == 0x01)
        return false;
    }

  return false;
}

/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   a two vector permutation using 2 intra-lane interleave insns
   and cross-lane shuffle for 32-byte vectors.  */

static bool
expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
{
  unsigned i, nelt;
  rtx (*gen) (rtx, rtx, rtx);

  if (d->one_operand_p)
    return false;
  if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
    ;
  else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
    ;
  else
    return false;

  nelt = d->nelt;
  if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
    return false;
  for (i = 0; i < nelt; i += 2)
    if (d->perm[i] != d->perm[0] + i / 2
        || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
      return false;

  if (d->testing_p)
    return true;

  switch (d->vmode)
    {
    case E_V32QImode:
      if (d->perm[0])
        gen = gen_vec_interleave_highv32qi;
      else
        gen = gen_vec_interleave_lowv32qi;
      break;
    case E_V16HImode:
      if (d->perm[0])
        gen = gen_vec_interleave_highv16hi;
      else
        gen = gen_vec_interleave_lowv16hi;
      break;
    case E_V8SImode:
      if (d->perm[0])
        gen = gen_vec_interleave_highv8si;
      else
        gen = gen_vec_interleave_lowv8si;
      break;
    case E_V4DImode:
      if (d->perm[0])
        gen = gen_vec_interleave_highv4di;
      else
        gen = gen_vec_interleave_lowv4di;
      break;
    case E_V8SFmode:
      if (d->perm[0])
        gen = gen_vec_interleave_highv8sf;
      else
        gen = gen_vec_interleave_lowv8sf;
      break;
    case E_V4DFmode:
      if (d->perm[0])
        gen = gen_vec_interleave_highv4df;
      else
        gen = gen_vec_interleave_lowv4df;
      break;
    default:
      gcc_unreachable ();
    }

  emit_insn (gen (d->target, d->op0, d->op1));
  return true;
}

/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
   a single vector permutation using a single intra-lane vector
   permutation, vperm2f128 swapping the lanes and vblend* insn blending
   the non-swapped and swapped vectors together.  */

static bool
expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dfirst, dsecond;
  unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
  rtx_insn *seq;
  bool ok;
  rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;

  if (!TARGET_AVX
      || TARGET_AVX2
      || (d->vmode != V8SFmode && d->vmode != V4DFmode)
      || !d->one_operand_p)
    return false;

  dfirst = *d;
  for (i = 0; i < nelt; i++)
    dfirst.perm[i] = 0xff;
  for (i = 0, msk = 0; i < nelt; i++)
    {
      j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
      if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
	return false;
      dfirst.perm[j] = d->perm[i];
      if (j != i)
	msk |= (1 << i);
    }
  for (i = 0; i < nelt; i++)
    if (dfirst.perm[i] == 0xff)
      dfirst.perm[i] = i;

  if (!d->testing_p)
    dfirst.target = gen_reg_rtx (dfirst.vmode);

  start_sequence ();
  ok = expand_vec_perm_1 (&dfirst);
  seq = get_insns ();
  end_sequence ();

  if (!ok)
    return false;

  if (d->testing_p)
    return true;

  emit_insn (seq);

  dsecond = *d;
  dsecond.op0 = dfirst.target;
  dsecond.op1 = dfirst.target;
  dsecond.one_operand_p = true;
  dsecond.target = gen_reg_rtx (dsecond.vmode);
  for (i = 0; i < nelt; i++)
    dsecond.perm[i] = i ^ nelt2;

  ok = expand_vec_perm_1 (&dsecond);
  gcc_assert (ok);

  blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
  emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
  return true;
}
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
   a two vector permutation using two single vector permutations and
   {,v}{,p}unpckl{ps,pd,bw,wd,dq}.  If two_insn, succeed only if one
   of dfirst or dsecond is identity permutation.  */

static bool
expand_vec_perm_2perm_interleave (struct expand_vec_perm_d *d, bool two_insn)
{
  unsigned i, nelt = d->nelt, nelt2 = nelt / 2, lane = nelt;
  struct expand_vec_perm_d dfirst, dsecond, dfinal;
  bool ident1 = true, ident2 = true;

  if (d->one_operand_p)
    return false;

  if (GET_MODE_SIZE (d->vmode) == 16)
    {
      if (!TARGET_SSE)
	return false;
      if (d->vmode != V4SFmode && d->vmode != V2DFmode && !TARGET_SSE2)
	return false;
    }
  else if (GET_MODE_SIZE (d->vmode) == 32)
    {
      if (!TARGET_AVX)
	return false;
      if (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2)
	return false;
      lane = nelt2;
    }
  else
    return false;

  for (i = 1; i < nelt; i++)
    if ((d->perm[i] >= nelt) != ((d->perm[0] >= nelt) ^ (i & 1)))
      return false;

  dfirst = *d;
  dsecond = *d;
  dfinal = *d;
  dfirst.op1 = dfirst.op0;
  dfirst.one_operand_p = true;
  dsecond.op0 = dsecond.op1;
  dsecond.one_operand_p = true;

  for (i = 0; i < nelt; i++)
    if (d->perm[i] >= nelt)
      {
	dsecond.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i] - nelt;
	if (d->perm[i] - nelt != i / 2 + (i >= lane ? lane / 2 : 0))
	  ident2 = false;
	dsecond.perm[i / 2 + (i >= lane ? lane : lane / 2)]
	  = d->perm[i] - nelt;
      }
    else
      {
	dfirst.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i];
	if (d->perm[i] != i / 2 + (i >= lane ? lane / 2 : 0))
	  ident1 = false;
	dfirst.perm[i / 2 + (i >= lane ? lane : lane / 2)] = d->perm[i];
      }

  if (two_insn && !ident1 && !ident2)
    return false;

  if (!d->testing_p)
    {
      if (!ident1)
	dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
      if (!ident2)
	dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
      if (d->perm[0] >= nelt)
	std::swap (dfinal.op0, dfinal.op1);
    }

  bool ok;
  rtx_insn *seq1 = NULL, *seq2 = NULL;

  if (!ident1)
    {
      start_sequence ();
      ok = expand_vec_perm_1 (&dfirst);
      seq1 = get_insns ();
      end_sequence ();

      if (!ok)
	return false;
    }

  if (!ident2)
    {
      start_sequence ();
      ok = expand_vec_perm_1 (&dsecond);
      seq2 = get_insns ();
      end_sequence ();

      if (!ok)
	return false;
    }

  if (d->testing_p)
    return true;

  for (i = 0; i < nelt; i++)
    {
      dfinal.perm[i] = i / 2;
      if (i >= lane)
	dfinal.perm[i] += lane / 2;
      if ((i & 1) != 0)
	dfinal.perm[i] += nelt;
    }
  emit_insn (seq1);
  emit_insn (seq2);
  ok = expand_vselect_vconcat (dfinal.target, dfinal.op0, dfinal.op1,
			       dfinal.perm, dfinal.nelt, false);
  gcc_assert (ok);
  return true;
}
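/* Illustrative sketch, not part of the original source: the decomposition
   above, collapsed into one scalar pass.  It assumes the selector takes
   even positions from op0 and odd positions from op1 (the swapped case is
   handled above via std::swap).  The name below is hypothetical.  */

static inline void
sketch_2perm_interleave (const int *op0, const int *op1, int *dst,
			 const unsigned *perm, unsigned nelt)
{
  /* dfirst pre-permutes op0, dsecond pre-permutes op1, and a final
     unpck-style interleave alternates the two intermediate results.  */
  for (unsigned i = 0; i < nelt; i += 2)
    {
      dst[i] = op0[perm[i]];		    /* via dfirst  */
      dst[i + 1] = op1[perm[i + 1] - nelt]; /* via dsecond */
    }
}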
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   the permutation using two single vector permutations and the SSE4_1 pblendv
   instruction.  If two_insn, succeed only if one of dfirst or dsecond is
   identity permutation.  */

static bool
expand_vec_perm_2perm_pblendv (struct expand_vec_perm_d *d, bool two_insn)
{
  unsigned i, nelt = d->nelt;
  struct expand_vec_perm_d dfirst, dsecond, dfinal;
  machine_mode vmode = d->vmode;
  bool ident1 = true, ident2 = true;

  /* Use the same checks as in expand_vec_perm_blend.  */
  if (d->one_operand_p)
    return false;
  if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
    ;
  else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
    ;
  else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 16
			     || GET_MODE_SIZE (vmode) == 8
			     || GET_MODE_SIZE (vmode) == 4))
    ;
  else
    return false;

  dfirst = *d;
  dsecond = *d;
  dfinal = *d;
  dfirst.op1 = dfirst.op0;
  dfirst.one_operand_p = true;
  dsecond.op0 = dsecond.op1;
  dsecond.one_operand_p = true;

  for (i = 0; i < nelt; ++i)
    if (d->perm[i] >= nelt)
      {
	dfirst.perm[i] = 0xff;
	dsecond.perm[i] = d->perm[i] - nelt;
	if (d->perm[i] != i + nelt)
	  ident2 = false;
      }
    else
      {
	dsecond.perm[i] = 0xff;
	dfirst.perm[i] = d->perm[i];
	if (d->perm[i] != i)
	  ident1 = false;
      }

  if (two_insn && !ident1 && !ident2)
    return false;

  /* For now.  Ideally treat 0xff as a wildcard.  */
  for (i = 0; i < nelt; ++i)
    if (dfirst.perm[i] == 0xff)
      {
	if (GET_MODE_SIZE (vmode) == 32
	    && dfirst.perm[i ^ (nelt / 2)] != 0xff)
	  dfirst.perm[i] = dfirst.perm[i ^ (nelt / 2)] ^ (nelt / 2);
	else
	  dfirst.perm[i] = i;
      }
    else
      {
	if (GET_MODE_SIZE (vmode) == 32
	    && dsecond.perm[i ^ (nelt / 2)] != 0xff)
	  dsecond.perm[i] = dsecond.perm[i ^ (nelt / 2)] ^ (nelt / 2);
	else
	  dsecond.perm[i] = i;
      }

  if (!d->testing_p)
    {
      if (!ident1)
	dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
      if (!ident2)
	dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
    }

  bool ok;
  rtx_insn *seq1 = NULL, *seq2 = NULL;

  if (!ident1)
    {
      start_sequence ();
      ok = expand_vec_perm_1 (&dfirst);
      seq1 = get_insns ();
      end_sequence ();

      if (!ok)
	return false;
    }

  if (!ident2)
    {
      start_sequence ();
      ok = expand_vec_perm_1 (&dsecond);
      seq2 = get_insns ();
      end_sequence ();

      if (!ok)
	return false;
    }

  if (d->testing_p)
    return true;

  for (i = 0; i < nelt; ++i)
    dfinal.perm[i] = (d->perm[i] >= nelt ? i + nelt : i);

  emit_insn (seq1);
  emit_insn (seq2);
  ok = expand_vec_perm_blend (&dfinal);
  gcc_assert (ok);
  return true;
}
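/* Illustrative sketch, not part of the original source: the same
   decomposition with a blend instead of an interleave.  Each input is
   permuted in place and a per-element select (pblendv) picks from the
   second result exactly where the selector references op1.  The name
   below is hypothetical.  */

static inline void
sketch_2perm_pblendv (const int *op0, const int *op1, int *dst,
		      const unsigned *perm, unsigned nelt)
{
  for (unsigned i = 0; i < nelt; i++)
    dst[i] = perm[i] < nelt ? op0[perm[i]]	 /* dfirst lane  */
			    : op1[perm[i] - nelt]; /* dsecond lane */
}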
/* A subroutine of ix86_expand_vec_perm_const_1.  Implement a V4DF
   permutation using two vperm2f128, followed by a vshufpd insn blending
   the two vectors together.  */

static bool
expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dfirst, dsecond, dthird;
  bool ok;

  if (!TARGET_AVX || (d->vmode != V4DFmode))
    return false;

  if (d->testing_p)
    return true;

  dfirst = *d;
  dsecond = *d;
  dthird = *d;

  dfirst.perm[0] = (d->perm[0] & ~1);
  dfirst.perm[1] = (d->perm[0] & ~1) + 1;
  dfirst.perm[2] = (d->perm[2] & ~1);
  dfirst.perm[3] = (d->perm[2] & ~1) + 1;
  dsecond.perm[0] = (d->perm[1] & ~1);
  dsecond.perm[1] = (d->perm[1] & ~1) + 1;
  dsecond.perm[2] = (d->perm[3] & ~1);
  dsecond.perm[3] = (d->perm[3] & ~1) + 1;
  dthird.perm[0] = (d->perm[0] % 2);
  dthird.perm[1] = (d->perm[1] % 2) + 4;
  dthird.perm[2] = (d->perm[2] % 2) + 2;
  dthird.perm[3] = (d->perm[3] % 2) + 6;

  dfirst.target = gen_reg_rtx (dfirst.vmode);
  dsecond.target = gen_reg_rtx (dsecond.vmode);
  dthird.op0 = dfirst.target;
  dthird.op1 = dsecond.target;
  dthird.one_operand_p = false;

  canonicalize_perm (&dfirst);
  canonicalize_perm (&dsecond);

  ok = expand_vec_perm_1 (&dfirst)
       && expand_vec_perm_1 (&dsecond)
       && expand_vec_perm_1 (&dthird);

  gcc_assert (ok);

  return true;
}
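/* Illustrative check, not part of the original source: a scalar model of
   the three-step V4DF decomposition above.  dfirst/dsecond each gather a
   whole even-aligned pair per lane, then dthird (vshufpd) picks the low
   or high double of each pair.  The name below is hypothetical.  */

static inline void
sketch_2vperm2f128_vshufpd (const double src[8] /* op0:op1 */,
			    double dst[4], const unsigned perm[4])
{
  double first[4], second[4];
  for (unsigned i = 0; i < 2; i++)
    {
      first[2 * i] = src[perm[2 * i] & ~1u];
      first[2 * i + 1] = src[(perm[2 * i] & ~1u) + 1];
      second[2 * i] = src[perm[2 * i + 1] & ~1u];
      second[2 * i + 1] = src[(perm[2 * i + 1] & ~1u) + 1];
    }
  for (unsigned i = 0; i < 4; i++)
    dst[i] = (i & 1) ? second[(i & ~1u) | (perm[i] & 1)]
		     : first[(i & ~1u) | (perm[i] & 1)];
}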
static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *);
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
   a two vector permutation using two intra-lane vector
   permutations, vperm2f128 swapping the lanes and vblend* insn blending
   the non-swapped and swapped vectors together.  */

static bool
expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dfirst, dsecond, dthird;
  unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2, which1 = 0, which2 = 0;
  rtx_insn *seq1, *seq2;
  bool ok;
  rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;

  if (!TARGET_AVX
      || TARGET_AVX2
      || (d->vmode != V8SFmode && d->vmode != V4DFmode)
      || d->one_operand_p)
    return false;

  dfirst = *d;
  dsecond = *d;
  for (i = 0; i < nelt; i++)
    {
      dfirst.perm[i] = 0xff;
      dsecond.perm[i] = 0xff;
    }
  for (i = 0, msk = 0; i < nelt; i++)
    {
      j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
      if (j == i)
	{
	  dfirst.perm[j] = d->perm[i];
	  which1 |= (d->perm[i] < nelt ? 1 : 2);
	}
      else
	{
	  dsecond.perm[j] = d->perm[i];
	  which2 |= (d->perm[i] < nelt ? 1 : 2);
	  msk |= (1U << i);
	}
    }
  if (msk == 0 || msk == (1U << nelt) - 1)
    return false;

  if (!d->testing_p)
    {
      dfirst.target = gen_reg_rtx (dfirst.vmode);
      dsecond.target = gen_reg_rtx (dsecond.vmode);
    }

  for (i = 0; i < nelt; i++)
    {
      if (dfirst.perm[i] == 0xff)
	dfirst.perm[i] = (which1 == 2 ? i + nelt : i);
      if (dsecond.perm[i] == 0xff)
	dsecond.perm[i] = (which2 == 2 ? i + nelt : i);
    }

  canonicalize_perm (&dfirst);
  start_sequence ();
  ok = ix86_expand_vec_perm_const_1 (&dfirst);
  seq1 = get_insns ();
  end_sequence ();

  if (!ok)
    return false;

  canonicalize_perm (&dsecond);
  start_sequence ();
  ok = ix86_expand_vec_perm_const_1 (&dsecond);
  seq2 = get_insns ();
  end_sequence ();

  if (!ok)
    return false;

  if (d->testing_p)
    return true;

  emit_insn (seq1);
  emit_insn (seq2);

  dthird = *d;
  dthird.op0 = dsecond.target;
  dthird.op1 = dsecond.target;
  dthird.one_operand_p = true;
  dthird.target = gen_reg_rtx (dthird.vmode);
  for (i = 0; i < nelt; i++)
    dthird.perm[i] = i ^ nelt2;

  ok = expand_vec_perm_1 (&dthird);
  gcc_assert (ok);

  blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
  emit_insn (blend (d->target, dfirst.target, dthird.target, GEN_INT (msk)));
  return true;
}
/* A subroutine of expand_vec_perm_even_odd_1.  Implement the double-word
   permutation with two pshufb insns and an ior.  We should have already
   failed all two instruction sequences.  */

static bool
expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
{
  rtx rperm[2][16], vperm, l, h, op, m128;
  unsigned int i, nelt, eltsz;
  machine_mode mode;
  rtx (*gen) (rtx, rtx, rtx);

  if (!TARGET_SSSE3 || (GET_MODE_SIZE (d->vmode) != 16
			&& GET_MODE_SIZE (d->vmode) != 8
			&& GET_MODE_SIZE (d->vmode) != 4))
    return false;
  gcc_assert (!d->one_operand_p);

  if (d->testing_p)
    return true;

  switch (GET_MODE_SIZE (d->vmode))
    {
    case 4:
      mode = V4QImode;
      gen = gen_mmx_pshufbv4qi3;
      break;
    case 8:
      mode = V8QImode;
      gen = gen_mmx_pshufbv8qi3;
      break;
    case 16:
      mode = V16QImode;
      gen = gen_ssse3_pshufbv16qi3;
      break;
    default:
      gcc_unreachable ();
    }

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate two permutation masks.  If the required element is within
     the given vector it is shuffled into the proper lane.  If the required
     element is in the other vector, force a zero into the lane by setting
     bit 7 in the permutation mask.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < nelt; ++i)
    {
      unsigned j, k, e = d->perm[i];
      unsigned which = (e >= nelt);
      if (e >= nelt)
	e -= nelt;

      for (j = 0; j < eltsz; ++j)
	{
	  rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
	  rperm[1-which][i*eltsz + j] = m128;
	}

      for (k = i*eltsz + j; k < 16; ++k)
	rperm[0][k] = rperm[1][k] = m128;
    }

  vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
  vperm = force_reg (V16QImode, vperm);

  l = gen_reg_rtx (mode);
  op = gen_lowpart (mode, d->op0);
  emit_insn (gen (l, op, vperm));

  vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
  vperm = force_reg (V16QImode, vperm);

  h = gen_reg_rtx (mode);
  op = gen_lowpart (mode, d->op1);
  emit_insn (gen (h, op, vperm));

  op = d->target;
  if (d->vmode != mode)
    op = gen_reg_rtx (mode);
  ix86_emit_vec_binop (IOR, mode, op, l, h);
  if (op != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}
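/* Illustrative sketch, not part of the original source: a scalar model of
   the two-mask pshufb + por sequence above.  Each mask selects bytes from
   one operand and forces zero (bit 7 set) where the byte comes from the
   other operand, so or-ing the two shuffles yields the full two-operand
   permutation.  The name below is hypothetical.  */

static inline void
sketch_pshufb2_por (const unsigned char *op0, const unsigned char *op1,
		    unsigned char *dst, const unsigned *perm, unsigned n)
{
  for (unsigned i = 0; i < n; i++)
    {
      unsigned e = perm[i];
      unsigned char lo = e < n ? op0[e] : 0;	  /* first pshufb  */
      unsigned char hi = e < n ? 0 : op1[e - n];  /* second pshufb */
      dst[i] = lo | hi;				  /* por           */
    }
}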
/* Implement arbitrary permutation of one V32QImode and V16QImode operand
   with two vpshufb insns, vpermq and vpor.  We should have already failed
   all two or three instruction sequences.  */

static bool
expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
{
  rtx rperm[2][32], vperm, l, h, hp, op, m128;
  unsigned int i, nelt, eltsz;

  if (!TARGET_AVX2
      || !d->one_operand_p
      || (d->vmode != V32QImode && d->vmode != V16HImode))
    return false;

  if (d->testing_p)
    return true;

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate two permutation masks.  If the required element is within
     the same lane, it is shuffled in.  If the required element is from
     the other lane, force a zero by setting bit 7 in the permutation mask.
     The other mask has a non-negative element wherever the element is
     requested from the other lane; that element is also moved to the
     other lane, so that the result of vpshufb can have its two V2TImode
     halves swapped.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < nelt; ++i)
    {
      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
      unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;

      for (j = 0; j < eltsz; ++j)
	{
	  rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
	  rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
	}
    }

  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
  vperm = force_reg (V32QImode, vperm);

  h = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op0);
  emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));

  /* Swap the 128-bit lanes of h into hp.  */
  hp = gen_reg_rtx (V4DImode);
  op = gen_lowpart (V4DImode, h);
  emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
				  const1_rtx));

  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
  vperm = force_reg (V32QImode, vperm);

  l = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op0);
  emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));

  op = d->target;
  if (d->vmode != V32QImode)
    op = gen_reg_rtx (V32QImode);
  emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
  if (op != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}
/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   and extract-odd permutations of two V32QImode and V16QImode operands
   with two vpshufb insns, vpor and vpermq.  We should have already
   failed all two or three instruction sequences.  */

static bool
expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
{
  rtx rperm[2][32], vperm, l, h, ior, op, m128;
  unsigned int i, nelt, eltsz;

  if (!TARGET_AVX2
      || d->one_operand_p
      || (d->vmode != V32QImode && d->vmode != V16HImode))
    return false;

  for (i = 0; i < d->nelt; ++i)
    if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
      return false;

  if (d->testing_p)
    return true;

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate two permutation masks.  In the first permutation mask
     the first quarter will contain indexes for the first half
     of the op0, the second quarter will contain bit 7 set, third quarter
     will contain indexes for the second half of the op0 and the
     last quarter bit 7 set.  In the second permutation mask
     the first quarter will contain bit 7 set, the second quarter
     indexes for the first half of the op1, the third quarter bit 7 set
     and last quarter indexes for the second half of the op1.
     I.e. the first mask e.g. for V32QImode extract even will be:
     0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
     (all values masked with 0xf except for -128) and second mask
     for extract even will be
     -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < nelt; ++i)
    {
      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
      unsigned which = d->perm[i] >= nelt;
      unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;

      for (j = 0; j < eltsz; ++j)
	{
	  rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
	  rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
	}
    }

  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
  vperm = force_reg (V32QImode, vperm);

  l = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op0);
  emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));

  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
  vperm = force_reg (V32QImode, vperm);

  h = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op1);
  emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));

  ior = gen_reg_rtx (V32QImode);
  emit_insn (gen_iorv32qi3 (ior, l, h));

  /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation.  */
  op = gen_reg_rtx (V4DImode);
  ior = gen_lowpart (V4DImode, ior);
  emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
				  const1_rtx, GEN_INT (3)));
  emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}
/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   and extract-odd permutations of two V8QI, V8HI, V16QI, V16HI or V32QI
   operands with two "and" and "pack" or two "shift" and "pack" insns.
   We should have already failed all two instruction sequences.  */

static bool
expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
{
  rtx op, dop0, dop1, t;
  unsigned i, odd, c, s, nelt = d->nelt;
  bool end_perm = false;
  machine_mode half_mode;
  rtx (*gen_and) (rtx, rtx, rtx);
  rtx (*gen_pack) (rtx, rtx, rtx);
  rtx (*gen_shift) (rtx, rtx, rtx);

  if (d->one_operand_p)
    return false;

  switch (d->vmode)
    {
    case E_V4HImode:
      /* Required for "pack".  */
      if (!TARGET_SSE4_1)
	return false;
      c = 0xffff;
      s = 16;
      half_mode = V2SImode;
      gen_and = gen_andv2si3;
      gen_pack = gen_mmx_packusdw;
      gen_shift = gen_lshrv2si3;
      break;
    case E_V8HImode:
      /* Required for "pack".  */
      if (!TARGET_SSE4_1)
	return false;
      c = 0xffff;
      s = 16;
      half_mode = V4SImode;
      gen_and = gen_andv4si3;
      gen_pack = gen_sse4_1_packusdw;
      gen_shift = gen_lshrv4si3;
      break;
    case E_V8QImode:
      /* No check as all instructions are SSE2.  */
      c = 0xff;
      s = 8;
      half_mode = V4HImode;
      gen_and = gen_andv4hi3;
      gen_pack = gen_mmx_packuswb;
      gen_shift = gen_lshrv4hi3;
      break;
    case E_V16QImode:
      /* No check as all instructions are SSE2.  */
      c = 0xff;
      s = 8;
      half_mode = V8HImode;
      gen_and = gen_andv8hi3;
      gen_pack = gen_sse2_packuswb;
      gen_shift = gen_lshrv8hi3;
      break;
    case E_V16HImode:
      if (!TARGET_AVX2)
	return false;
      c = 0xffff;
      s = 16;
      half_mode = V8SImode;
      gen_and = gen_andv8si3;
      gen_pack = gen_avx2_packusdw;
      gen_shift = gen_lshrv8si3;
      end_perm = true;
      break;
    case E_V32QImode:
      if (!TARGET_AVX2)
	return false;
      c = 0xff;
      s = 8;
      half_mode = V16HImode;
      gen_and = gen_andv16hi3;
      gen_pack = gen_avx2_packuswb;
      gen_shift = gen_lshrv16hi3;
      end_perm = true;
      break;
    default:
      /* Only V4HI, V8QI, V8HI, V16QI, V16HI and V32QI modes
	 are more profitable than general shuffles.  */
      return false;
    }

  /* Check that permutation is even or odd.  */
  odd = d->perm[0];
  if (odd > 1)
    return false;

  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != 2 * i + odd)
      return false;

  if (d->testing_p)
    return true;

  dop0 = gen_reg_rtx (half_mode);
  dop1 = gen_reg_rtx (half_mode);
  if (odd == 0)
    {
      t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
      t = force_reg (half_mode, t);
      emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
      emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
    }
  else
    {
      emit_insn (gen_shift (dop0,
			    gen_lowpart (half_mode, d->op0),
			    GEN_INT (s)));
      emit_insn (gen_shift (dop1,
			    gen_lowpart (half_mode, d->op1),
			    GEN_INT (s)));
    }
  /* For the 256-bit case under AVX2 we need to permute the pack result.  */
  if (TARGET_AVX2 && end_perm)
    {
      op = gen_reg_rtx (d->vmode);
      t = gen_reg_rtx (V4DImode);
      emit_insn (gen_pack (op, dop0, dop1));
      emit_insn (gen_avx2_permv4di_1 (t,
				      gen_lowpart (V4DImode, op),
				      const0_rtx, const2_rtx,
				      const1_rtx, GEN_INT (3)));
      emit_move_insn (d->target, gen_lowpart (d->vmode, t));
    }
  else
    emit_insn (gen_pack (d->target, dop0, dop1));

  return true;
}
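/* Illustrative sketch, not part of the original source: the and/shift plus
   pack trick above, modeled on byte elements in plain C.  "Even" keeps the
   low byte of each word via AND, "odd" moves the high byte down via a
   logical shift; an unsigned-saturating pack then drops the high halves
   (the 256-bit variant additionally needs the cross-lane vpermq fixup
   handled above).  The name below is hypothetical.  */

static inline void
sketch_even_odd_pack (const unsigned short *w0, const unsigned short *w1,
		      unsigned char *dst, unsigned nwords, int odd)
{
  for (unsigned i = 0; i < 2 * nwords; i++)
    {
      unsigned short w = i < nwords ? w0[i] : w1[i - nwords];
      w = odd ? (unsigned short) (w >> 8) : (unsigned short) (w & 0xff);
      dst[i] = (unsigned char) w;	/* packuswb: values already fit  */
    }
}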
/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   and extract-odd permutations of two V64QI operands
   with two "shifts", two "truncs" and one "concat" insns for "odd"
   and two "truncs" and one "concat" insn for "even".
   We should have already failed all two instruction sequences.  */

static bool
expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
{
  rtx t1, t2, t3, t4;
  unsigned i, odd, nelt = d->nelt;

  if (!TARGET_AVX512BW
      || d->one_operand_p
      || d->vmode != V64QImode)
    return false;

  /* Check that permutation is even or odd.  */
  odd = d->perm[0];
  if (odd > 1)
    return false;

  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != 2 * i + odd)
      return false;

  if (d->testing_p)
    return true;

  if (odd)
    {
      t1 = gen_reg_rtx (V32HImode);
      t2 = gen_reg_rtx (V32HImode);
      emit_insn (gen_lshrv32hi3 (t1,
				 gen_lowpart (V32HImode, d->op0),
				 GEN_INT (8)));
      emit_insn (gen_lshrv32hi3 (t2,
				 gen_lowpart (V32HImode, d->op1),
				 GEN_INT (8)));
    }
  else
    {
      t1 = gen_lowpart (V32HImode, d->op0);
      t2 = gen_lowpart (V32HImode, d->op1);
    }

  t3 = gen_reg_rtx (V32QImode);
  t4 = gen_reg_rtx (V32QImode);
  emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
  emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
  emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));

  return true;
}
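/* Illustrative sketch, not part of the original source: the truncation
   variant used above for V64QI.  vpmovwb keeps the low byte of every
   word, so "odd" first shifts the high byte down.  The name below is
   hypothetical.  */

static inline void
sketch_even_odd_trunc (const unsigned short *words, unsigned char *dst,
		       unsigned nwords, int odd)
{
  for (unsigned i = 0; i < nwords; i++)
    dst[i] = (unsigned char) (odd ? words[i] >> 8 : words[i]);
}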
/* A subroutine of ix86_expand_vec_perm_const_1.  Implement extract-even
   and extract-odd permutations.  */

static bool
expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
{
  rtx t1, t2, t3, t4, t5;

  switch (d->vmode)
    {
    case E_V4DFmode:
      if (d->testing_p)
	break;
      t1 = gen_reg_rtx (V4DFmode);
      t2 = gen_reg_rtx (V4DFmode);

      /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
      emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
      emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));

      /* Now an unpck[lh]pd will produce the result required.  */
      if (odd)
	t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
      else
	t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
      emit_insn (t3);
      break;

    case E_V8SFmode:
      {
	int mask = odd ? 0xdd : 0x88;

	if (d->testing_p)
	  break;
	t1 = gen_reg_rtx (V8SFmode);
	t2 = gen_reg_rtx (V8SFmode);
	t3 = gen_reg_rtx (V8SFmode);

	/* Shuffle within the 128-bit lanes to produce:
	   { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }.  */
	emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
				      GEN_INT (mask)));

	/* Shuffle the lanes around to produce:
	   { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }.  */
	emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
					    GEN_INT (0x3)));

	/* Shuffle within the 128-bit lanes to produce:
	   { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }.  */
	emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));

	/* Shuffle within the 128-bit lanes to produce:
	   { 8 a c e c e 8 a } | { 9 b d f d f 9 b }.  */
	emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));

	/* Shuffle the lanes around to produce:
	   { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }.  */
	emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
					    GEN_INT (0x20)));
      }
      break;

    case E_V2DFmode:
    case E_V4SFmode:
    case E_V2DImode:
    case E_V2SImode:
    case E_V4SImode:
    case E_V2HImode:
      /* These are always directly implementable by expand_vec_perm_1.  */
      gcc_unreachable ();

    case E_V2SFmode:
      gcc_assert (TARGET_MMX_WITH_SSE);
      /* We have no suitable instructions.  */
      if (d->testing_p)
	return false;
      break;

    case E_V4QImode:
      if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
	return expand_vec_perm_pshufb2 (d);
      else
	{
	  if (d->testing_p)
	    break;
	  /* We need 2*log2(N)-1 operations to achieve odd/even
	     with interleave.  */
	  t1 = gen_reg_rtx (V4QImode);
	  emit_insn (gen_mmx_punpckhbw_low (t1, d->op0, d->op1));
	  emit_insn (gen_mmx_punpcklbw_low (d->target, d->op0, d->op1));
	  if (odd)
	    t2 = gen_mmx_punpckhbw_low (d->target, d->target, t1);
	  else
	    t2 = gen_mmx_punpcklbw_low (d->target, d->target, t1);
	  emit_insn (t2);
	}
      break;

    case E_V4HImode:
      if (TARGET_SSE4_1)
	return expand_vec_perm_even_odd_pack (d);
      else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
	return expand_vec_perm_pshufb2 (d);
      else
	{
	  if (d->testing_p)
	    break;
	  /* We need 2*log2(N)-1 operations to achieve odd/even
	     with interleave.  */
	  t1 = gen_reg_rtx (V4HImode);
	  emit_insn (gen_mmx_punpckhwd (t1, d->op0, d->op1));
	  emit_insn (gen_mmx_punpcklwd (d->target, d->op0, d->op1));
	  if (odd)
	    t2 = gen_mmx_punpckhwd (d->target, d->target, t1);
	  else
	    t2 = gen_mmx_punpcklwd (d->target, d->target, t1);
	  emit_insn (t2);
	}
      break;

    case E_V8HImode:
      if (TARGET_SSE4_1)
	return expand_vec_perm_even_odd_pack (d);
      else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
	return expand_vec_perm_pshufb2 (d);
      else
	{
	  if (d->testing_p)
	    break;
	  /* We need 2*log2(N)-1 operations to achieve odd/even
	     with interleave.  */
	  t1 = gen_reg_rtx (V8HImode);
	  t2 = gen_reg_rtx (V8HImode);
	  emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
	  emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
	  emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
	  emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
	  if (odd)
	    t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
	  else
	    t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
	  emit_insn (t3);
	}
      break;

    case E_V8QImode:
    case E_V16QImode:
      return expand_vec_perm_even_odd_pack (d);

    case E_V16HImode:
    case E_V32QImode:
      return expand_vec_perm_even_odd_pack (d);

    case E_V64QImode:
      return expand_vec_perm_even_odd_trunc (d);

    case E_V4DImode:
      if (!TARGET_AVX2)
	{
	  struct expand_vec_perm_d d_copy = *d;
	  d_copy.vmode = V4DFmode;
	  if (d->testing_p)
	    d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
	  else
	    d_copy.target = gen_reg_rtx (V4DFmode);
	  d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
	  d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
	  if (expand_vec_perm_even_odd_1 (&d_copy, odd))
	    {
	      if (!d->testing_p)
		emit_move_insn (d->target,
				gen_lowpart (V4DImode, d_copy.target));
	      return true;
	    }
	  return false;
	}

      if (d->testing_p)
	break;

      t1 = gen_reg_rtx (V4DImode);
      t2 = gen_reg_rtx (V4DImode);

      /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
      emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
      emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));

      /* Now a vpunpck[lh]qdq will produce the result required.  */
      if (odd)
	t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
      else
	t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
      emit_insn (t3);
      break;

    case E_V8SImode:
      if (!TARGET_AVX2)
	{
	  struct expand_vec_perm_d d_copy = *d;
	  d_copy.vmode = V8SFmode;
	  if (d->testing_p)
	    d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
	  else
	    d_copy.target = gen_reg_rtx (V8SFmode);
	  d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
	  d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
	  if (expand_vec_perm_even_odd_1 (&d_copy, odd))
	    {
	      if (!d->testing_p)
		emit_move_insn (d->target,
				gen_lowpart (V8SImode, d_copy.target));
	      return true;
	    }
	  return false;
	}

      if (d->testing_p)
	break;

      t1 = gen_reg_rtx (V8SImode);
      t2 = gen_reg_rtx (V8SImode);
      t3 = gen_reg_rtx (V4DImode);
      t4 = gen_reg_rtx (V4DImode);
      t5 = gen_reg_rtx (V4DImode);

      /* Shuffle the lanes around into
	 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }.  */
      emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
				    gen_lowpart (V4DImode, d->op1),
				    GEN_INT (0x20)));
      emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
				    gen_lowpart (V4DImode, d->op1),
				    GEN_INT (0x31)));

      /* Swap the 2nd and 3rd position in each lane into
	 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }.  */
      emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
				    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
      emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
				    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));

      /* Now a vpunpck[lh]qdq will produce
	 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }.  */
      if (odd)
	t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
					   gen_lowpart (V4DImode, t2));
      else
	t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
					  gen_lowpart (V4DImode, t2));
      emit_insn (t3);
      emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
      break;

    default:
      gcc_unreachable ();
    }

  return true;
}
/* A subroutine of ix86_expand_vec_perm_const_1.  Pattern match
   extract-even and extract-odd permutations.  */

static bool
expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
{
  unsigned i, odd, nelt = d->nelt;

  odd = d->perm[0];
  if (odd != 0 && odd != 1)
    return false;

  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != 2 * i + odd)
      return false;

  if (d->vmode == E_V32HImode
      && d->testing_p
      && !TARGET_AVX512BW)
    return false;

  return expand_vec_perm_even_odd_1 (d, odd);
}
/* A subroutine of ix86_expand_vec_perm_const_1.  Implement broadcast
   permutations.  We assume that expand_vec_perm_1 has already failed.  */

static bool
expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
{
  unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
  machine_mode vmode = d->vmode;
  rtx (*gen) (rtx, rtx, rtx);
  unsigned char perm2[4];
  rtx op0 = d->op0, dest;
  bool ok;

  switch (vmode)
    {
    case E_V4DFmode:
    case E_V8SFmode:
      /* These are special-cased in sse.md so that we can optionally
	 use the vbroadcast instruction.  They expand to two insns
	 if the input happens to be in a register.  */
      gcc_unreachable ();

    case E_V2DFmode:
    case E_V2SFmode:
    case E_V4SFmode:
    case E_V2DImode:
    case E_V2SImode:
    case E_V4SImode:
      /* These are always implementable using standard shuffle patterns.  */
      gcc_unreachable ();

    case E_V4QImode:
      /* This can be implemented via interleave and pshuflw.  */
      if (d->testing_p)
	return true;

      if (elt >= nelt2)
	{
	  gen = gen_mmx_punpckhbw_low;
	  elt -= nelt2;
	}
      else
	gen = gen_mmx_punpcklbw_low;

      dest = gen_reg_rtx (vmode);
      emit_insn (gen (dest, op0, op0));
      vmode = get_mode_wider_vector (vmode);
      op0 = gen_lowpart (vmode, dest);

      memset (perm2, elt, 2);
      dest = gen_reg_rtx (vmode);
      ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
      gcc_assert (ok);

      emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
      return true;

    case E_V8QImode:
    case E_V4HImode:
      /* This can be implemented via interleave.  We save one insn by
	 stopping once we have promoted to V2SImode and then use pshufd.  */
      if (d->testing_p)
	return true;
      do
	{
	  if (elt >= nelt2)
	    {
	      gen = vmode == V8QImode ? gen_mmx_punpckhbw
				      : gen_mmx_punpckhwd;
	      elt -= nelt2;
	    }
	  else
	    gen = vmode == V8QImode ? gen_mmx_punpcklbw
				    : gen_mmx_punpcklwd;
	  nelt2 /= 2;

	  dest = gen_reg_rtx (vmode);
	  emit_insn (gen (dest, op0, op0));
	  vmode = get_mode_wider_vector (vmode);
	  op0 = gen_lowpart (vmode, dest);
	}
      while (vmode != V2SImode);

      memset (perm2, elt, 2);
      dest = gen_reg_rtx (vmode);
      ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
      gcc_assert (ok);

      emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
      return true;

    case E_V8HImode:
    case E_V16QImode:
      /* These can be implemented via interleave.  We save one insn by
	 stopping once we have promoted to V4SImode and then use pshufd.  */
      if (d->testing_p)
	return true;
      do
	{
	  if (elt >= nelt2)
	    {
	      gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
				       : gen_vec_interleave_highv8hi;
	      elt -= nelt2;
	    }
	  else
	    gen = vmode == V16QImode ? gen_vec_interleave_lowv16qi
				     : gen_vec_interleave_lowv8hi;
	  nelt2 /= 2;

	  dest = gen_reg_rtx (vmode);
	  emit_insn (gen (dest, op0, op0));
	  vmode = get_mode_wider_vector (vmode);
	  op0 = gen_lowpart (vmode, dest);
	}
      while (vmode != V4SImode);

      memset (perm2, elt, 4);
      dest = gen_reg_rtx (vmode);
      ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
      gcc_assert (ok);

      emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
      return true;

    case E_V8HFmode:
      /* This can be implemented via interleave and pshufd.  */
      if (d->testing_p)
	return true;

      if (elt >= nelt2)
	{
	  gen = gen_vec_interleave_highv8hf;
	  elt -= nelt2;
	}
      else
	gen = gen_vec_interleave_lowv8hf;

      dest = gen_reg_rtx (vmode);
      emit_insn (gen (dest, op0, op0));

      vmode = V4SImode;
      op0 = gen_lowpart (vmode, dest);

      memset (perm2, elt, 4);
      dest = gen_reg_rtx (vmode);
      ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
      gcc_assert (ok);

      emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
      return true;

    case E_V32QImode:
    case E_V16HImode:
    case E_V8SImode:
    case E_V4DImode:
      /* For AVX2 broadcasts of the first element vpbroadcast* or
	 vpermq should be used by expand_vec_perm_1.  */
      gcc_assert (!TARGET_AVX2 || d->perm[0]);
      return false;

    case E_V64QImode:
      gcc_assert (!TARGET_AVX512BW || d->perm[0]);
      return false;

    case E_V32HImode:
      gcc_assert (!TARGET_AVX512BW);
      return false;

    default:
      gcc_unreachable ();
    }
}
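/* Illustrative sketch, not part of the original source: how the broadcast
   index evolves in the do-while loops above.  Each self-interleave pairs
   the chosen element up, so viewed as wider elements the vector halves and
   the index folds into the low half; a final 2- or 4-wide splat (pshufd or
   pshuflw) finishes the job.  The name below is hypothetical.  */

static inline unsigned
sketch_broadcast_steps (unsigned nelt, unsigned elt)
{
  unsigned steps = 0;
  while (nelt > 4)
    {
      if (elt >= nelt / 2)	/* punpckh...: take the high half  */
	elt -= nelt / 2;
      nelt /= 2;		/* a pair now acts as one wider element  */
      steps++;
    }
  return steps;			/* interleaves needed before the splat  */
}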
/* A subroutine of ix86_expand_vec_perm_const_1.  Pattern match
   broadcast permutations.  */

static bool
expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
{
  unsigned i, elt, nelt = d->nelt;

  if (!d->one_operand_p)
    return false;

  elt = d->perm[0];
  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != elt)
      return false;

  return expand_vec_perm_broadcast_1 (d);
}
/* Implement arbitrary permutations of two V64QImode operands
   with 2 vperm[it]2w, 2 vpshufb and one vpor instruction.  */
static bool
expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
{
  if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
    return false;

  if (d->testing_p)
    return true;

  struct expand_vec_perm_d ds[2];
  rtx rperm[128], vperm, target0, target1;
  unsigned int i, nelt;
  machine_mode vmode;

  nelt = d->nelt;
  vmode = V64QImode;

  for (i = 0; i < 2; i++)
    {
      ds[i] = *d;
      ds[i].vmode = V32HImode;
      ds[i].nelt = 32;
      ds[i].target = gen_reg_rtx (V32HImode);
      ds[i].op0 = gen_lowpart (V32HImode, d->op0);
      ds[i].op1 = gen_lowpart (V32HImode, d->op1);
    }

  /* Prepare permutations such that the first one takes care of
     putting the even bytes into the right positions or one higher
     positions (ds[0]) and the second one takes care of
     putting the odd bytes into the right positions or one below
     (ds[1]).  */

  for (i = 0; i < nelt; i++)
    {
      ds[i & 1].perm[i / 2] = d->perm[i] / 2;
      if (i & 1)
	{
	  rperm[i] = constm1_rtx;
	  rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
	}
      else
	{
	  rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
	  rperm[i + 64] = constm1_rtx;
	}
    }

  bool ok = expand_vec_perm_1 (&ds[0]);
  gcc_assert (ok);
  ds[0].target = gen_lowpart (V64QImode, ds[0].target);

  ok = expand_vec_perm_1 (&ds[1]);
  gcc_assert (ok);
  ds[1].target = gen_lowpart (V64QImode, ds[1].target);

  vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
  vperm = force_reg (vmode, vperm);
  target0 = gen_reg_rtx (V64QImode);
  emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));

  vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
  vperm = force_reg (vmode, vperm);
  target1 = gen_reg_rtx (V64QImode);
  emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));

  emit_insn (gen_iorv64qi3 (d->target, target0, target1));
  return true;
}
/* Implement arbitrary permutation of two V32QImode and V16QImode operands
   with 4 vpshufb insns, 2 vpermq and 3 vpor.  We should have already failed
   all the shorter instruction sequences.  */

static bool
expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
{
  rtx rperm[4][32], vperm, l[2], h[2], op, m128;
  unsigned int i, nelt, eltsz;
  bool used[4];

  if (!TARGET_AVX2
      || d->one_operand_p
      || (d->vmode != V32QImode && d->vmode != V16HImode))
    return false;

  if (d->testing_p)
    return true;

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate 4 permutation masks.  If the required element is within
     the same lane, it is shuffled in.  If the required element is from
     the other lane, force a zero by setting bit 7 in the permutation mask.
     The other mask has a non-negative element wherever the element is
     requested from the other lane; that element is also moved to the
     other lane, so that the result of vpshufb can have its two V2TImode
     halves swapped.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < 32; ++i)
    {
      rperm[0][i] = m128;
      rperm[1][i] = m128;
      rperm[2][i] = m128;
      rperm[3][i] = m128;
    }
  used[0] = false;
  used[1] = false;
  used[2] = false;
  used[3] = false;
  for (i = 0; i < nelt; ++i)
    {
      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
      unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
      unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);

      for (j = 0; j < eltsz; ++j)
	rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
      used[which] = true;
    }

  for (i = 0; i < 2; ++i)
    {
      if (!used[2 * i + 1])
	{
	  h[i] = NULL_RTX;
	  continue;
	}
      vperm = gen_rtx_CONST_VECTOR (V32QImode,
				    gen_rtvec_v (32, rperm[2 * i + 1]));
      vperm = force_reg (V32QImode, vperm);
      h[i] = gen_reg_rtx (V32QImode);
      op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
      emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
    }

  /* Swap the 128-bit lanes of h[X].  */
  for (i = 0; i < 2; ++i)
    {
      if (h[i] == NULL_RTX)
	continue;
      op = gen_reg_rtx (V4DImode);
      emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
				      const2_rtx, GEN_INT (3), const0_rtx,
				      const1_rtx));
      h[i] = gen_lowpart (V32QImode, op);
    }

  for (i = 0; i < 2; ++i)
    {
      if (!used[2 * i])
	{
	  l[i] = NULL_RTX;
	  continue;
	}
      vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
      vperm = force_reg (V32QImode, vperm);
      l[i] = gen_reg_rtx (V32QImode);
      op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
      emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
    }

  for (i = 0; i < 2; ++i)
    {
      if (h[i] && l[i])
	{
	  op = gen_reg_rtx (V32QImode);
	  emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
	  l[i] = op;
	}
      else if (h[i])
	l[i] = h[i];
    }

  gcc_assert (l[0] && l[1]);
  op = d->target;
  if (d->vmode != V32QImode)
    op = gen_reg_rtx (V32QImode);
  emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
  if (op != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}
/* The guts of ix86_vectorize_vec_perm_const.  With all of the interface bits
   taken care of, perform the expansion in D and return true on success.  */

static bool
ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
{
  /* Try a single instruction expansion.  */
  if (expand_vec_perm_1 (d))
    return true;

  /* Try sequences of two instructions.  */

  if (expand_vec_perm_pshuflw_pshufhw (d))
    return true;

  if (expand_vec_perm_palignr (d, false))
    return true;

  if (expand_vec_perm_interleave2 (d))
    return true;

  if (expand_vec_perm_broadcast (d))
    return true;

  if (expand_vec_perm_vpermq_perm_1 (d))
    return true;

  if (expand_vec_perm_vperm2f128 (d))
    return true;

  if (expand_vec_perm_pblendv (d))
    return true;

  if (expand_vec_perm_2perm_interleave (d, true))
    return true;

  if (expand_vec_perm_2perm_pblendv (d, true))
    return true;

  /* Try sequences of three instructions.  */

  if (expand_vec_perm_even_odd_pack (d))
    return true;

  if (expand_vec_perm_2vperm2f128_vshuf (d))
    return true;

  if (expand_vec_perm_pshufb2 (d))
    return true;

  if (expand_vec_perm_interleave3 (d))
    return true;

  if (expand_vec_perm_vperm2f128_vblend (d))
    return true;

  if (expand_vec_perm_2perm_interleave (d, false))
    return true;

  if (expand_vec_perm_2perm_pblendv (d, false))
    return true;

  /* Try sequences of four instructions.  */

  if (expand_vec_perm_even_odd_trunc (d))
    return true;
  if (expand_vec_perm_vpshufb2_vpermq (d))
    return true;

  if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
    return true;

  if (expand_vec_perm_vpermt2_vpshub2 (d))
    return true;

  /* ??? Look for narrow permutations whose element orderings would
     allow the promotion to a wider mode.  */

  /* ??? Look for sequences of interleave or a wider permute that place
     the data into the correct lanes for a half-vector shuffle like
     pshuf[lh]w or vpermilps.  */

  /* ??? Look for sequences of interleave that produce the desired results.
     The combinatorics of punpck[lh] get pretty ugly... */

  if (expand_vec_perm_even_odd (d))
    return true;

  /* Even longer sequences.  */
  if (expand_vec_perm_vpshufb4_vpermq2 (d))
    return true;

  /* See if we can get the same permutation in different vector integer
     mode.  */
  struct expand_vec_perm_d nd;
  if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
    {
      if (!d->testing_p)
	emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
      return true;
    }

  /* Even longer, including recursion to ix86_expand_vec_perm_const_1.  */
  if (expand_vec_perm2_vperm2f128_vblend (d))
    return true;

  return false;
}
/* If a permutation only uses one operand, make it clear.  Returns true
   if the permutation references both operands.  */

static bool
canonicalize_perm (struct expand_vec_perm_d *d)
{
  int i, which, nelt = d->nelt;

  for (i = which = 0; i < nelt; ++i)
    which |= (d->perm[i] < nelt ? 1 : 2);

  d->one_operand_p = true;
  switch (which)
    {
    default:
      gcc_unreachable ();

    case 3:
      if (!rtx_equal_p (d->op0, d->op1))
	{
	  d->one_operand_p = false;
	  break;
	}
      /* The elements of PERM do not suggest that only the first operand
	 is used, but both operands are identical.  Allow easier matching
	 of the permutation by folding the permutation into the single
	 input vector.  */
      /* FALLTHRU */

    case 2:
      for (i = 0; i < nelt; ++i)
	d->perm[i] &= nelt - 1;
      d->op0 = d->op1;
      break;

    case 1:
      d->op1 = d->op0;
      break;
    }

  return (which == 3);
}
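/* Illustrative sketch, not part of the original source: the selector
   folding performed by canonicalize_perm, modeled on plain arrays.  The
   name below is hypothetical.  */

static inline int
sketch_canonicalize (unsigned *perm, unsigned nelt, int ops_identical)
{
  unsigned which = 0;
  for (unsigned i = 0; i < nelt; i++)
    which |= perm[i] < nelt ? 1 : 2;
  if (which != 3 || ops_identical)
    /* Only one distinct input: mask indices into the first operand.  */
    for (unsigned i = 0; i < nelt; i++)
      perm[i] &= nelt - 1;
  return which == 3;	/* true iff both operands are really referenced  */
}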
/* Implement TARGET_VECTORIZE_VEC_PERM_CONST.  */

static bool
ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
			       rtx op1, const vec_perm_indices &sel)
{
  struct expand_vec_perm_d d;
  unsigned char perm[MAX_VECT_LEN];
  unsigned int i, nelt, which;
  bool two_args;

  /* For HF mode vector, convert it to HI using subreg.  */
  if (GET_MODE_INNER (vmode) == HFmode)
    {
      machine_mode orig_mode = vmode;
      vmode = mode_for_vector (HImode,
			       GET_MODE_NUNITS (vmode)).require ();
      if (target)
	target = lowpart_subreg (vmode, target, orig_mode);
      if (op0)
	op0 = lowpart_subreg (vmode, op0, orig_mode);
      if (op1)
	op1 = lowpart_subreg (vmode, op1, orig_mode);
    }

  d.target = target;
  d.op0 = op0;
  d.op1 = op1;

  d.vmode = vmode;
  gcc_assert (VECTOR_MODE_P (d.vmode));
  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
  d.testing_p = !target;

  gcc_assert (sel.length () == nelt);
  gcc_checking_assert (sizeof (d.perm) == sizeof (perm));

  /* Given sufficient ISA support we can just return true here
     for selected vector modes.  */
  switch (d.vmode)
    {
    case E_V16SFmode:
    case E_V16SImode:
    case E_V8DImode:
    case E_V8DFmode:
      if (!TARGET_AVX512F)
	return false;
      /* All implementable with a single vperm[it]2 insn.  */
      if (d.testing_p)
	return true;
      break;
    case E_V32HImode:
      if (!TARGET_AVX512F)
	return false;
      if (d.testing_p && TARGET_AVX512BW)
	/* All implementable with a single vperm[it]2 insn.  */
	return true;
      break;
    case E_V64QImode:
      if (!TARGET_AVX512F)
	return false;
      if (d.testing_p && TARGET_AVX512BW)
	/* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn.  */
	return true;
      break;
    case E_V8SImode:
    case E_V8SFmode:
    case E_V4DFmode:
    case E_V4DImode:
      if (!TARGET_AVX)
	return false;
      if (d.testing_p && TARGET_AVX512VL)
	/* All implementable with a single vperm[it]2 insn.  */
	return true;
      break;
    case E_V16HImode:
      if (!TARGET_SSE2)
	return false;
      if (d.testing_p && TARGET_AVX2)
	/* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns.  */
	return true;
      break;
    case E_V32QImode:
      if (!TARGET_SSE2)
	return false;
      if (d.testing_p && TARGET_AVX2)
	/* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns.  */
	return true;
      break;
    case E_V8HImode:
    case E_V16QImode:
      if (!TARGET_SSE2)
	return false;
      /* Fall through.  */
    case E_V4SImode:
    case E_V4SFmode:
      /* All implementable with a single vpperm insn.  */
      if (d.testing_p && TARGET_XOP)
	return true;
      /* All implementable with 2 pshufb + 1 ior.  */
      if (d.testing_p && TARGET_SSSE3)
	return true;
      break;
    case E_V2SFmode:
    case E_V2SImode:
    case E_V4HImode:
    case E_V8QImode:
      if (!TARGET_MMX_WITH_SSE)
	return false;
      break;
    case E_V2HImode:
      if (!TARGET_SSE2)
	return false;
      /* All implementable with *punpckwd.  */
      if (d.testing_p)
	return true;
      break;
    case E_V2DImode:
    case E_V2DFmode:
      /* All implementable with shufpd or unpck[lh]pd.  */
      if (d.testing_p)
	return true;
      break;
    default:
      return false;
    }

  for (i = which = 0; i < nelt; ++i)
    {
      unsigned char e = sel[i];
      gcc_assert (e < 2 * nelt);
      d.perm[i] = e;
      perm[i] = e;
      which |= (e < nelt ? 1 : 2);
    }

  if (d.testing_p)
    {
      /* For all elements from second vector, fold the elements to first.  */
      if (which == 2)
	for (i = 0; i < nelt; ++i)
	  d.perm[i] -= nelt;

      /* Check whether the mask can be applied to the vector type.  */
      d.one_operand_p = (which != 3);

      /* Implementable with shufps, pshufd or pshuflw.  */
      if (d.one_operand_p
	  && (d.vmode == V4SFmode || d.vmode == V2SFmode
	      || d.vmode == V4SImode || d.vmode == V2SImode
	      || d.vmode == V4HImode || d.vmode == V2HImode))
	return true;

      /* Otherwise we have to go through the motions and see if we can
	 figure out how to generate the requested permutation.  */
      d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
      d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
      if (!d.one_operand_p)
	d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);

      start_sequence ();
      bool ret = ix86_expand_vec_perm_const_1 (&d);
      end_sequence ();

      return ret;
    }

  two_args = canonicalize_perm (&d);

  /* If one of the operands is a zero vector, try to match pmovzx.  */
  if (two_args && (d.op0 == CONST0_RTX (vmode) || d.op1 == CONST0_RTX (vmode)))
    {
      struct expand_vec_perm_d dzero = d;
      if (d.op0 == CONST0_RTX (vmode))
	{
	  d.op1 = dzero.op1 = force_reg (vmode, d.op1);
	  std::swap (dzero.op0, dzero.op1);
	  for (i = 0; i < nelt; ++i)
	    dzero.perm[i] ^= nelt;
	}
      else
	d.op0 = dzero.op0 = force_reg (vmode, d.op0);

      if (expand_vselect_vconcat (dzero.target, dzero.op0, dzero.op1,
				  dzero.perm, nelt, dzero.testing_p))
	return true;
    }

  /* Force operands into registers.  */
  rtx nop0 = force_reg (vmode, d.op0);
  if (d.op0 == d.op1)
    d.op1 = nop0;
  d.op0 = nop0;
  d.op1 = force_reg (vmode, d.op1);

  if (ix86_expand_vec_perm_const_1 (&d))
    return true;

  /* If the selector says both arguments are needed, but the operands are the
     same, the above tried to expand with one_operand_p and flattened selector.
     If that didn't work, retry without one_operand_p; we succeeded with that
     during testing.  */
  if (two_args && d.one_operand_p)
    {
      d.one_operand_p = false;
      memcpy (d.perm, perm, sizeof (perm));
      return ix86_expand_vec_perm_const_1 (&d);
    }

  return false;
}
void
ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
{
  struct expand_vec_perm_d d;
  unsigned i, nelt;

  d.target = targ;
  d.op0 = op0;
  d.op1 = op1;
  d.vmode = GET_MODE (targ);
  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
  d.one_operand_p = false;
  d.testing_p = false;

  for (i = 0; i < nelt; ++i)
    d.perm[i] = i * 2 + odd;

  /* We'll either be able to implement the permutation directly...  */
  if (expand_vec_perm_1 (&d))
    return;

  /* ... or we use the special-case patterns.  */
  expand_vec_perm_even_odd_1 (&d, odd);
}
static void
ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
{
  struct expand_vec_perm_d d;
  unsigned i, nelt, base;
  bool ok;

  d.target = targ;
  d.op0 = op0;
  d.op1 = op1;
  d.vmode = GET_MODE (targ);
  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
  d.one_operand_p = false;
  d.testing_p = false;

  base = high_p ? nelt / 2 : 0;
  for (i = 0; i < nelt / 2; ++i)
    {
      d.perm[i * 2] = i + base;
      d.perm[i * 2 + 1] = i + base + nelt;
    }

  /* Note that for AVX this isn't one instruction.  */
  ok = ix86_expand_vec_perm_const_1 (&d);
  gcc_assert (ok);
}
/* This function is similar to ix86_expand_vecop_qihi,
   but optimized under AVX512BW by using vpmovwb.
   For example, optimize vector MUL generation like

   vpmovzxbw ymm2, xmm0
   vpmovzxbw ymm3, xmm1
   vpmullw ymm4, ymm2, ymm3
   vpmovwb xmm0, ymm4

   which takes fewer instructions than ix86_expand_vecop_qihi.
   Return true if success.  */

static bool
ix86_expand_vecop_qihi2 (enum rtx_code code, rtx dest, rtx op1, rtx op2)
{
  machine_mode himode, qimode = GET_MODE (dest);
  rtx hop1, hop2, hdest;
  rtx (*gen_extend)(rtx, rtx);
  rtx (*gen_truncate)(rtx, rtx);
  bool uns_p = (code == ASHIFTRT) ? false : true;

  /* There's no V64HImode multiplication instruction.  */
  if (qimode == E_V64QImode)
    return false;

  /* vpmovwb only available under AVX512BW.  */
  if (!TARGET_AVX512BW)
    return false;
  if ((qimode == V8QImode || qimode == V16QImode)
      && !TARGET_AVX512VL)
    return false;
  /* Don't generate a zmm instruction when a 128/256-bit vector width
     is preferred.  */
  if (qimode == V32QImode
      && (TARGET_PREFER_AVX128 || TARGET_PREFER_AVX256))
    return false;

  switch (qimode)
    {
    case E_V8QImode:
      himode = V8HImode;
      gen_extend = uns_p ? gen_zero_extendv8qiv8hi2 : gen_extendv8qiv8hi2;
      gen_truncate = gen_truncv8hiv8qi2;
      break;
    case E_V16QImode:
      himode = V16HImode;
      gen_extend = uns_p ? gen_zero_extendv16qiv16hi2 : gen_extendv16qiv16hi2;
      gen_truncate = gen_truncv16hiv16qi2;
      break;
    case E_V32QImode:
      himode = V32HImode;
      gen_extend = uns_p ? gen_zero_extendv32qiv32hi2 : gen_extendv32qiv32hi2;
      gen_truncate = gen_truncv32hiv32qi2;
      break;
    default:
      gcc_unreachable ();
    }

  hop1 = gen_reg_rtx (himode);
  hop2 = gen_reg_rtx (himode);
  hdest = gen_reg_rtx (himode);
  emit_insn (gen_extend (hop1, op1));
  emit_insn (gen_extend (hop2, op2));
  emit_insn (gen_rtx_SET (hdest, simplify_gen_binary (code, himode,
						      hop1, hop2)));
  emit_insn (gen_truncate (dest, hdest));
  return true;
}
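/* Illustrative sketch, not part of the original source: the
   widen-operate-truncate pattern above applied to a single byte.  The QI
   operation is carried out in HI precision and the low byte of the result
   is kept, which is exact for multiplication and shifts.  The name below
   is hypothetical.  */

static inline unsigned char
sketch_qi_mul_via_hi (unsigned char a, unsigned char b)
{
  unsigned short wa = a, wb = b;			/* vpmovzxbw */
  unsigned short wr = (unsigned short) (wa * wb);	/* vpmullw   */
  return (unsigned char) wr;				/* vpmovwb   */
}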
/* Expand a vector operation shift by constant for a V*QImode in terms of the
   same operation on V*HImode.  Return true if success.  */
static bool
ix86_expand_vec_shift_qihi_constant (enum rtx_code code,
				     rtx dest, rtx op1, rtx op2)
{
  machine_mode qimode, himode;
  HOST_WIDE_INT and_constant, xor_constant;
  HOST_WIDE_INT shift_amount;
  rtx vec_const_and, vec_const_xor;
  rtx tmp, op1_subreg;
  rtx (*gen_shift) (rtx, rtx, rtx);
  rtx (*gen_and) (rtx, rtx, rtx);
  rtx (*gen_xor) (rtx, rtx, rtx);
  rtx (*gen_sub) (rtx, rtx, rtx);

  /* Only optimize shift by constant.  */
  if (!CONST_INT_P (op2))
    return false;

  qimode = GET_MODE (dest);
  shift_amount = INTVAL (op2);
  /* Do nothing when the shift amount is greater than or equal to 8.  */
  if (shift_amount > 7)
    return false;

  gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
  /* Record sign bit.  */
  xor_constant = 1 << (8 - shift_amount - 1);

  /* Zero upper/lower bits shifted in from the adjacent element.  */
  and_constant
    = (code == ASHIFT ? 256 - (1 << shift_amount)
       : (1 << (8 - shift_amount)) - 1);

  switch (qimode)
    {
    case E_V16QImode:
      himode = V8HImode;
      gen_shift = (code == ASHIFT ? gen_ashlv8hi3
		   : (code == ASHIFTRT) ? gen_ashrv8hi3 : gen_lshrv8hi3);
      gen_and = gen_andv16qi3;
      gen_xor = gen_xorv16qi3;
      gen_sub = gen_subv16qi3;
      break;
    case E_V32QImode:
      himode = V16HImode;
      gen_shift = (code == ASHIFT ? gen_ashlv16hi3
		   : (code == ASHIFTRT) ? gen_ashrv16hi3 : gen_lshrv16hi3);
      gen_and = gen_andv32qi3;
      gen_xor = gen_xorv32qi3;
      gen_sub = gen_subv32qi3;
      break;
    case E_V64QImode:
      himode = V32HImode;
      gen_shift = (code == ASHIFT ? gen_ashlv32hi3
		   : (code == ASHIFTRT) ? gen_ashrv32hi3 : gen_lshrv32hi3);
      gen_and = gen_andv64qi3;
      gen_xor = gen_xorv64qi3;
      gen_sub = gen_subv64qi3;
      break;
    default:
      gcc_unreachable ();
    }

  tmp = gen_reg_rtx (himode);
  vec_const_and = gen_reg_rtx (qimode);
  op1_subreg = lowpart_subreg (himode, op1, qimode);

  /* For ASHIFT and LSHIFTRT, perform operation like
     vpsllw/vpsrlw $shift_amount, %op1, %dest.
     vpand %vec_const_and, %dest.  */
  emit_insn (gen_shift (tmp, op1_subreg, op2));
  emit_move_insn (dest, simplify_gen_subreg (qimode, tmp, himode, 0));
  emit_move_insn (vec_const_and,
		  ix86_build_const_vector (qimode, true,
					   gen_int_mode (and_constant,
							 QImode)));
  emit_insn (gen_and (dest, dest, vec_const_and));

  /* For ASHIFTRT, perform extra operation like
     vpxor %vec_const_xor, %dest, %dest
     vpsubb %vec_const_xor, %dest, %dest  */
  if (code == ASHIFTRT)
    {
      vec_const_xor = gen_reg_rtx (qimode);
      emit_move_insn (vec_const_xor,
		      ix86_build_const_vector (qimode, true,
					       gen_int_mode (xor_constant,
							     QImode)));
      emit_insn (gen_xor (dest, dest, vec_const_xor));
      emit_insn (gen_sub (dest, dest, vec_const_xor));
    }

  return true;
}
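/* Illustrative sketch, not part of the original source: the word-shift
   plus mask trick above applied to one byte with 0 < shift < 8.  A V*HI
   shift leaks bits across byte boundaries; AND removes them, and for
   arithmetic right shifts the xor/sub pair re-extends the sign bit.  The
   name below is hypothetical.  */

static inline unsigned char
sketch_byte_ashiftrt (unsigned char b, unsigned shift)
{
  unsigned and_constant = (1u << (8 - shift)) - 1;
  unsigned xor_constant = 1u << (8 - shift - 1);
  unsigned r = (b >> shift) & and_constant;	/* vpsrlw + vpand */
  r = (r ^ xor_constant) - xor_constant;	/* vpxor + vpsubb */
  return (unsigned char) r;
}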
/* Expand a vector operation CODE for a V*QImode in terms of the
   same operation on V*HImode.  */

void
ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
{
  machine_mode qimode = GET_MODE (dest);
  machine_mode himode;
  rtx (*gen_il) (rtx, rtx, rtx);
  rtx (*gen_ih) (rtx, rtx, rtx);
  rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
  struct expand_vec_perm_d d;
  bool ok, full_interleave;
  bool uns_p = false;
  unsigned i;

  if (CONST_INT_P (op2)
      && (code == ASHIFT || code == LSHIFTRT || code == ASHIFTRT)
      && ix86_expand_vec_shift_qihi_constant (code, dest, op1, op2))
    return;

  if (TARGET_AVX512BW
      && VECTOR_MODE_P (GET_MODE (op2))
      && ix86_expand_vecop_qihi2 (code, dest, op1, op2))
    return;

  switch (qimode)
    {
    case E_V16QImode:
      himode = V8HImode;
      gen_il = gen_vec_interleave_lowv16qi;
      gen_ih = gen_vec_interleave_highv16qi;
      break;
    case E_V32QImode:
      himode = V16HImode;
      gen_il = gen_avx2_interleave_lowv32qi;
      gen_ih = gen_avx2_interleave_highv32qi;
      break;
    case E_V64QImode:
      himode = V32HImode;
      gen_il = gen_avx512bw_interleave_lowv64qi;
      gen_ih = gen_avx512bw_interleave_highv64qi;
      break;
    default:
      gcc_unreachable ();
    }

  switch (code)
    {
    case MULT:
      /* Unpack data such that we've got a source byte in each low byte of
	 each word.  We don't care what goes into the high byte of each word.
	 Rather than trying to get zero in there, most convenient is to let
	 it be a copy of the low byte.  */
      op2_l = gen_reg_rtx (qimode);
      op2_h = gen_reg_rtx (qimode);
      emit_insn (gen_il (op2_l, op2, op2));
      emit_insn (gen_ih (op2_h, op2, op2));

      op1_l = gen_reg_rtx (qimode);
      op1_h = gen_reg_rtx (qimode);
      emit_insn (gen_il (op1_l, op1, op1));
      emit_insn (gen_ih (op1_h, op1, op1));
      full_interleave = qimode == V16QImode;
      break;

    case ASHIFT:
    case LSHIFTRT:
      uns_p = true;
      /* FALLTHRU */
    case ASHIFTRT:
      op1_l = gen_reg_rtx (himode);
      op1_h = gen_reg_rtx (himode);
      ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
      ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
      /* vashr/vlshr/vashl  */
      if (GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT)
	{
	  rtx tmp = force_reg (qimode, op2);
	  op2_l = gen_reg_rtx (himode);
	  op2_h = gen_reg_rtx (himode);
	  ix86_expand_sse_unpack (op2_l, tmp, uns_p, false);
	  ix86_expand_sse_unpack (op2_h, tmp, uns_p, true);
	}
      else
	op2_l = op2_h = op2;

      full_interleave = true;
      break;
    default:
      gcc_unreachable ();
    }

  /* Perform vashr/vlshr/vashl.  */
  if (code != MULT
      && GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT)
    {
      res_l = gen_reg_rtx (himode);
      res_h = gen_reg_rtx (himode);
      emit_insn (gen_rtx_SET (res_l,
			      simplify_gen_binary (code, himode,
						   op1_l, op2_l)));
      emit_insn (gen_rtx_SET (res_h,
			      simplify_gen_binary (code, himode,
						   op1_h, op2_h)));
    }
  /* Perform mult/ashr/lshr/ashl.  */
  else
    {
      res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
				   1, OPTAB_DIRECT);
      res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
				   1, OPTAB_DIRECT);
    }

  gcc_assert (res_l && res_h);

  /* Merge the data back into the right place.  */
  d.target = dest;
  d.op0 = gen_lowpart (qimode, res_l);
  d.op1 = gen_lowpart (qimode, res_h);
  d.vmode = qimode;
  d.nelt = GET_MODE_NUNITS (qimode);
  d.one_operand_p = false;
  d.testing_p = false;

  if (full_interleave)
    {
      /* For SSE2, we used a full interleave, so the desired
	 results are in the even elements.  */
      for (i = 0; i < d.nelt; ++i)
	d.perm[i] = i * 2;
    }
  else
    {
      /* For AVX, the interleave used above was not cross-lane.  So the
	 extraction is evens but with the second and third quarter swapped.
	 Happily, that is even one insn shorter than even extraction.
	 For AVX512BW we have 4 lanes.  We extract evens from within a lane,
	 always first from the first and then from the second source operand,
	 the index bits above the low 4 bits remains the same.
	 Thus, for d.nelt == 32 we want permutation
	 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
	 and for d.nelt == 64 we want permutation
	 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
	 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126.  */
      for (i = 0; i < d.nelt; ++i)
	d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
    }

  ok = ix86_expand_vec_perm_const_1 (&d);
  gcc_assert (ok);

  set_unique_reg_note (get_last_insn (), REG_EQUAL,
		       gen_rtx_fmt_ee (code, qimode, op1, op2));
}
/* Helper function of ix86_expand_mul_widen_evenodd.  Return true
   if op is CONST_VECTOR with all odd elements equal to their
   preceding element.  */

static bool
const_vector_equal_evenodd_p (rtx op)
{
  machine_mode mode = GET_MODE (op);
  int i, nunits = GET_MODE_NUNITS (mode);
  if (GET_CODE (op) != CONST_VECTOR
      || nunits != CONST_VECTOR_NUNITS (op))
    return false;
  for (i = 0; i < nunits; i += 2)
    if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
      return false;
  return true;
}
void
ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
			       bool uns_p, bool odd_p)
{
  machine_mode mode = GET_MODE (op1);
  machine_mode wmode = GET_MODE (dest);
  rtx x;
  rtx orig_op1 = op1, orig_op2 = op2;

  if (!nonimmediate_operand (op1, mode))
    op1 = force_reg (mode, op1);
  if (!nonimmediate_operand (op2, mode))
    op2 = force_reg (mode, op2);

  /* We only play even/odd games with vectors of SImode.  */
  gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);

  /* If we're looking for the odd results, shift those members down to
     the even slots.  For some cpus this is faster than a PSHUFD.  */
  if (odd_p)
    {
      /* For XOP use vpmacsdqh, but only for smult, as it is only
	 signed.  */
      if (TARGET_XOP && mode == V4SImode && !uns_p)
	{
	  x = force_reg (wmode, CONST0_RTX (wmode));
	  emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
	  return;
	}

      x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
      if (!const_vector_equal_evenodd_p (orig_op1))
	op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
			    x, NULL, 1, OPTAB_DIRECT);
      if (!const_vector_equal_evenodd_p (orig_op2))
	op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
			    x, NULL, 1, OPTAB_DIRECT);
      op1 = gen_lowpart (mode, op1);
      op2 = gen_lowpart (mode, op2);
    }

  if (mode == V16SImode)
    {
      if (uns_p)
	x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
      else
	x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
    }
  else if (mode == V8SImode)
    {
      if (uns_p)
	x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
      else
	x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
    }
  else if (uns_p)
    x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
  else if (TARGET_SSE4_1)
    x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
  else
    {
      rtx s1, s2, t0, t1, t2;

      /* The easiest way to implement this without PMULDQ is to go through
	 the motions as if we are performing a full 64-bit multiply.  With
	 the exception that we need to do less shuffling of the elements.  */

      /* Compute the sign-extension, aka highparts, of the two operands.  */
      s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
				op1, pc_rtx, pc_rtx);
      s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
				op2, pc_rtx, pc_rtx);

      /* Multiply LO(A) * HI(B), and vice-versa.  */
      t1 = gen_reg_rtx (wmode);
      t2 = gen_reg_rtx (wmode);
      emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
      emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));

      /* Multiply LO(A) * LO(B).  */
      t0 = gen_reg_rtx (wmode);
      emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));

      /* Combine and shift the highparts into place.  */
      t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
      t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
			 1, OPTAB_DIRECT);

      /* Combine high and low parts.  */
      force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
      return;
    }

  emit_insn (x);
}
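/* Illustrative note (added; a sketch of the arithmetic, not compiled
   code): the PMULDQ-free branch above relies on the identity, modulo
   2^64,

     (s64) a * (s64) b == (u64) a * (u64) b
			  - ((a < 0 ? (u64) b : 0) << 32)
			  - ((b < 0 ? (u64) a : 0) << 32)

   for 32-bit lanes a and b.  s1 and s2 are all-ones (2^32 - 1) exactly
   in the negative lanes, and ((2^32 - 1) * x) << 32 == -(x << 32)
   modulo 2^64, so adding t1 and t2 and shifting left by 32 applies
   both correction terms at once.  */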
void
ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
			    bool uns_p, bool high_p)
{
  machine_mode wmode = GET_MODE (dest);
  machine_mode mode = GET_MODE (op1);
  rtx t1, t2, t3, t4, mask;

  switch (mode)
    {
    case E_V4SImode:
      t1 = gen_reg_rtx (mode);
      t2 = gen_reg_rtx (mode);
      if (TARGET_XOP && !uns_p)
	{
	  /* With XOP, we have pmacsdqh, aka mul_widen_odd.  In this case,
	     shuffle the elements once so that all elements are in the right
	     place for immediate use: { A C B D }.  */
	  emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
					const1_rtx, GEN_INT (3)));
	  emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
					const1_rtx, GEN_INT (3)));
	}
      else
	{
	  /* Put the elements into place for the multiply.  */
	  ix86_expand_vec_interleave (t1, op1, op1, high_p);
	  ix86_expand_vec_interleave (t2, op2, op2, high_p);
	  high_p = false;
	}

      ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
      break;

    case E_V8SImode:
      /* Shuffle the elements between the lanes.  After this we
	 have { A B E F | C D G H } for each operand.  */
      t1 = gen_reg_rtx (V4DImode);
      t2 = gen_reg_rtx (V4DImode);
      emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
				      const0_rtx, const2_rtx,
				      const1_rtx, GEN_INT (3)));
      emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
				      const0_rtx, const2_rtx,
				      const1_rtx, GEN_INT (3)));

      /* Shuffle the elements within the lanes.  After this we
	 have { A A B B | C C D D } or { E E F F | G G H H }.  */
      t3 = gen_reg_rtx (V8SImode);
      t4 = gen_reg_rtx (V8SImode);
      mask = GEN_INT (high_p
		      ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
		      : 0 + (0 << 2) + (1 << 4) + (1 << 6));
      emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
      emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));

      ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
      break;

    case E_V8HImode:
    case E_V16HImode:
      t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
			 uns_p, OPTAB_DIRECT);
      t2 = expand_binop (mode,
			 uns_p ? umul_highpart_optab : smul_highpart_optab,
			 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
      gcc_assert (t1 && t2);

      t3 = gen_reg_rtx (mode);
      ix86_expand_vec_interleave (t3, t1, t2, high_p);
      emit_move_insn (dest, gen_lowpart (wmode, t3));
      break;

    case E_V16QImode:
    case E_V32QImode:
    case E_V32HImode:
    case E_V64QImode:
      t1 = gen_reg_rtx (wmode);
      t2 = gen_reg_rtx (wmode);
      ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
      ix86_expand_sse_unpack (t2, op2, uns_p, high_p);

      emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
      break;

    default:
      gcc_unreachable ();
    }
}
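/* Illustrative note (added): mask above is an ordinary pshufd immediate
   made of four 2-bit selectors.  For high_p it evaluates to
   2 + 8 + 48 + 192 == 0xfa, selecting { 2, 2, 3, 3 } within each
   128-bit lane; the !high_p value is 0x50, selecting { 0, 0, 1, 1 }.  */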
void
ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
{
  rtx res_1, res_2, res_3, res_4;

  res_1 = gen_reg_rtx (V4SImode);
  res_2 = gen_reg_rtx (V4SImode);
  res_3 = gen_reg_rtx (V2DImode);
  res_4 = gen_reg_rtx (V2DImode);
  ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
  ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);

  /* Move the results in element 2 down to element 1; we don't care
     what goes in elements 2 and 3.  Then we can merge the parts
     back together with an interleave.

     Note that two other sequences were tried:
     (1) Use interleaves at the start instead of psrldq, which allows
     us to use a single shufps to merge things back at the end.
     (2) Use shufps here to combine the two vectors, then pshufd to
     put the elements in the correct order.
     In both cases the cost of the reformatting stall was too high
     and the overall sequence slower.  */

  emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
				const0_rtx, const2_rtx,
				const0_rtx, const0_rtx));
  emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
				const0_rtx, const2_rtx,
				const0_rtx, const0_rtx));
  res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));

  set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
}
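/* Illustrative note (added): res_3 holds the full 64-bit products of
   elements 0 and 2, res_4 those of elements 1 and 3.  Only the low 32
   bits of each product belong in the V4SImode result; the two pshufd
   copies gather those low halves into elements 0 and 1, and the final
   punpckldq interleave restores the original element order.  */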
void
ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
{
  machine_mode mode = GET_MODE (op0);
  rtx t1, t2, t3, t4, t5, t6;

  if (TARGET_AVX512DQ && mode == V8DImode)
    emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
  else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
    emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
  else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
    emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
  else if (TARGET_XOP && mode == V2DImode)
    {
      /* op1: A,B,C,D, op2: E,F,G,H */
      op1 = gen_lowpart (V4SImode, op1);
      op2 = gen_lowpart (V4SImode, op2);

      t1 = gen_reg_rtx (V4SImode);
      t2 = gen_reg_rtx (V4SImode);
      t3 = gen_reg_rtx (V2DImode);
      t4 = gen_reg_rtx (V2DImode);

      /* t1: B,A,D,C */
      emit_insn (gen_sse2_pshufd_1 (t1, op1,
				    GEN_INT (1),
				    GEN_INT (0),
				    GEN_INT (3),
				    GEN_INT (2)));

      /* t2: (B*E),(A*F),(D*G),(C*H) */
      emit_insn (gen_mulv4si3 (t2, t1, op2));

      /* t3: (B*E)+(A*F), (D*G)+(C*H) */
      emit_insn (gen_xop_phadddq (t3, t2));

      /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
      emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));

      /* Multiply lower parts and add all.  */
      t5 = gen_reg_rtx (V2DImode);
      emit_insn (gen_vec_widen_umult_even_v4si (t5,
						gen_lowpart (V4SImode, op1),
						gen_lowpart (V4SImode, op2)));
      force_expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
    }
  else
    {
      machine_mode nmode;
      rtx (*umul) (rtx, rtx, rtx);

      if (mode == V2DImode)
	{
	  umul = gen_vec_widen_umult_even_v4si;
	  nmode = V4SImode;
	}
      else if (mode == V4DImode)
	{
	  umul = gen_vec_widen_umult_even_v8si;
	  nmode = V8SImode;
	}
      else if (mode == V8DImode)
	{
	  umul = gen_vec_widen_umult_even_v16si;
	  nmode = V16SImode;
	}
      else
	gcc_unreachable ();

      /* Multiply low parts.  */
      t1 = gen_reg_rtx (mode);
      emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));

      /* Shift input vectors right 32 bits so we can multiply high parts.  */
      t6 = GEN_INT (32);
      t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
      t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);

      /* Multiply high parts by low parts.  */
      t4 = gen_reg_rtx (mode);
      t5 = gen_reg_rtx (mode);
      emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
      emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));

      /* Combine and shift the highparts back.  */
      t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
      t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);

      /* Combine high and low parts.  */
      force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
    }

  set_unique_reg_note (get_last_insn (), REG_EQUAL,
		       gen_rtx_MULT (mode, op1, op2));
}
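/* Illustrative note (added): with hypothetical helpers lo()/hi() that
   return the low/high 32 bits of a 64-bit lane, the generic branch
   above computes

     low64 (a * b) == lo(a)*lo(b) + ((lo(a)*hi(b) + hi(a)*lo(b)) << 32)

   where the hi(a)*hi(b) term is dropped because it only affects bits
   64 and above of the full 128-bit product.  */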
/* Return 1 if control transfer instruction INSN
   should be encoded with notrack prefix.  */

bool
ix86_notrack_prefixed_insn_p (rtx_insn *insn)
{
  if (!insn || !((flag_cf_protection & CF_BRANCH)))
    return false;

  if (CALL_P (insn))
    {
      rtx call = get_call_rtx_from (insn);
      gcc_assert (call != NULL_RTX);
      rtx addr = XEXP (call, 0);

      /* Do not emit 'notrack' if it's not an indirect call.  */
      if (MEM_P (addr)
	  && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
	return false;
      else
	return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
    }

  if (JUMP_P (insn) && !flag_cet_switch)
    {
      rtx target = JUMP_LABEL (insn);
      if (target == NULL_RTX || ANY_RETURN_P (target))
	return false;

      /* Check the jump is a switch table.  */
      rtx_insn *label = as_a<rtx_insn *> (target);
      rtx_insn *table = next_insn (label);
      if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
	return false;
      else
	return true;
    }
  return false;
}
/* Calculate integer abs() using only SSE2 instructions.  */

void
ix86_expand_sse2_abs (rtx target, rtx input)
{
  machine_mode mode = GET_MODE (target);
  rtx tmp0, tmp1, x;

  switch (mode)
    {
    case E_V2DImode:
    case E_V4DImode:
      /* For 64-bit signed integer X, with SSE4.2 use
	 pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
	 Otherwise handle it similarly to V4SImode, except use 64 as W
	 instead of 32 and use logical instead of arithmetic right shift
	 (which is unimplemented) and subtract.  */
      if (TARGET_SSE4_2)
	{
	  tmp0 = gen_reg_rtx (mode);
	  tmp1 = gen_reg_rtx (mode);
	  emit_move_insn (tmp1, CONST0_RTX (mode));
	  if (mode == E_V2DImode)
	    emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input));
	  else
	    emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input));
	}
      else
	{
	  tmp0 = expand_simple_binop (mode, LSHIFTRT, input,
				      GEN_INT (GET_MODE_UNIT_BITSIZE (mode)
					       - 1), NULL, 0, OPTAB_DIRECT);
	  tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false);
	}

      tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
				  NULL, 0, OPTAB_DIRECT);
      x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
			       target, 0, OPTAB_DIRECT);
      break;

    case E_V4SImode:
      /* For 32-bit signed integer X, the best way to calculate the absolute
	 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)).  */
      tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
				  GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
				  NULL, 0, OPTAB_DIRECT);
      tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
				  NULL, 0, OPTAB_DIRECT);
      x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
			       target, 0, OPTAB_DIRECT);
      break;

    case E_V8HImode:
      /* For 16-bit signed integer X, the best way to calculate the absolute
	 value of X is max (X, -X), as SSE2 provides the PMAXSW insn.  */
      tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);

      x = expand_simple_binop (mode, SMAX, tmp0, input,
			       target, 0, OPTAB_DIRECT);
      break;

    case E_V16QImode:
      /* For 8-bit signed integer X, the best way to calculate the absolute
	 value of X is min ((unsigned char) X, (unsigned char) (-X)),
	 as SSE2 provides the PMINUB insn.  */
      tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);

      x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
			       target, 0, OPTAB_DIRECT);
      break;

    default:
      gcc_unreachable ();
    }

  if (x != target)
    emit_move_insn (target, x);
}
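/* Illustrative note (added): the scalar form of the V4SImode identity
   used above is

     int32_t iabs (int32_t x)
     {
       int32_t m = x >> 31;	/* Arithmetic shift: 0 or -1.  */
       return (x ^ m) - m;	/* Identity for m == 0, negation for -1.  */
     }

   The vector sequence applies the same shift/xor/subtract per lane.  */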
/* Expand an extract from a vector register through pextr insn.
   Return true if successful.  */

bool
ix86_expand_pextr (rtx *operands)
{
  rtx dst = operands[0];
  rtx src = operands[1];

  unsigned int size = INTVAL (operands[2]);
  unsigned int pos = INTVAL (operands[3]);

  if (SUBREG_P (dst))
    {
      /* Reject non-lowpart subregs.  */
      if (SUBREG_BYTE (dst) > 0)
	return false;
      dst = SUBREG_REG (dst);
    }

  if (SUBREG_P (src))
    {
      pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
      src = SUBREG_REG (src);
    }

  switch (GET_MODE (src))
    {
    case E_V16QImode:
    case E_V8HImode:
    case E_V4SImode:
    case E_V2DImode:
    case E_V1TImode:
      {
	machine_mode srcmode, dstmode;
	rtx d, pat;

	if (!int_mode_for_size (size, 0).exists (&dstmode))
	  return false;

	switch (dstmode)
	  {
	  case E_QImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    srcmode = V16QImode;
	    break;

	  case E_HImode:
	    srcmode = V8HImode;
	    break;

	  case E_SImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    srcmode = V4SImode;
	    break;

	  case E_DImode:
	    gcc_assert (TARGET_64BIT);
	    if (!TARGET_SSE4_1)
	      return false;
	    srcmode = V2DImode;
	    break;

	  default:
	    return false;
	  }

	/* Reject extractions from misaligned positions.  */
	if (pos & (size - 1))
	  return false;

	if (GET_MODE (dst) == dstmode)
	  d = dst;
	else
	  d = gen_reg_rtx (dstmode);

	/* Construct insn pattern.  */
	pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
	pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);

	/* Let the rtl optimizers know about the zero extension performed.  */
	if (dstmode == QImode || dstmode == HImode)
	  {
	    pat = gen_rtx_ZERO_EXTEND (SImode, pat);
	    d = gen_lowpart (SImode, d);
	  }

	emit_insn (gen_rtx_SET (d, pat));

	if (d != dst)
	  emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
	return true;
      }

    default:
      return false;
    }
}
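/* Illustrative example (added): extracting the 16 bits at bit position
   48 from a V8HImode source gives size == 16 and pos == 48, so dstmode
   is HImode and the vec_select index is pos / size == 3, i.e. a single
   pextrw with immediate 3.  */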
/* Expand an insert into a vector register through pinsr insn.
   Return true if successful.  */

bool
ix86_expand_pinsr (rtx *operands)
{
  rtx dst = operands[0];
  rtx src = operands[3];

  unsigned int size = INTVAL (operands[1]);
  unsigned int pos = INTVAL (operands[2]);

  if (SUBREG_P (dst))
    {
      pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
      dst = SUBREG_REG (dst);
    }

  switch (GET_MODE (dst))
    {
    case E_V16QImode:
    case E_V8HImode:
    case E_V4SImode:
    case E_V2DImode:
    case E_V1TImode:
      {
	machine_mode srcmode, dstmode;
	rtx (*pinsr)(rtx, rtx, rtx, rtx);
	rtx d;

	if (!int_mode_for_size (size, 0).exists (&srcmode))
	  return false;

	switch (srcmode)
	  {
	  case E_QImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    dstmode = V16QImode;
	    pinsr = gen_sse4_1_pinsrb;
	    break;

	  case E_HImode:
	    dstmode = V8HImode;
	    pinsr = gen_sse2_pinsrw;
	    break;

	  case E_SImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    dstmode = V4SImode;
	    pinsr = gen_sse4_1_pinsrd;
	    break;

	  case E_DImode:
	    gcc_assert (TARGET_64BIT);
	    if (!TARGET_SSE4_1)
	      return false;
	    dstmode = V2DImode;
	    pinsr = gen_sse4_1_pinsrq;
	    break;

	  default:
	    return false;
	  }

	/* Reject insertions to misaligned positions.  */
	if (pos & (size - 1))
	  return false;

	if (SUBREG_P (src))
	  {
	    unsigned int srcpos = SUBREG_BYTE (src);

	    if (srcpos > 0)
	      {
		rtx extr_ops[4];

		extr_ops[0] = gen_reg_rtx (srcmode);
		extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
		extr_ops[2] = GEN_INT (size);
		extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);

		if (!ix86_expand_pextr (extr_ops))
		  return false;

		src = extr_ops[0];
	      }
	    else
	      src = gen_lowpart (srcmode, SUBREG_REG (src));
	  }

	if (GET_MODE (dst) == dstmode)
	  d = dst;
	else
	  d = gen_reg_rtx (dstmode);

	emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
			  gen_lowpart (srcmode, src),
			  GEN_INT (1 << (pos / size))));

	if (d != dst)
	  emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
	return true;
      }

    default:
      return false;
    }
}
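/* Illustrative example (added): inserting 16 bits at bit position 48
   of a V8HImode destination targets element pos / size == 3; the pinsr
   patterns take a one-hot vec_merge mask, hence the immediate
   GEN_INT (1 << 3) == 8.  */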
/* All CPUs prefer to avoid cross-lane operations so perform reductions
   upper against lower halves up to SSE reg size.  */

machine_mode
ix86_split_reduction (machine_mode mode)
{
  /* Reduce lowpart against highpart until we reach SSE reg width to
     avoid cross-lane operations.  */
  switch (mode)
    {
    case E_V8DImode:
    case E_V4DImode:
      return V2DImode;
    case E_V16SImode:
    case E_V8SImode:
      return V4SImode;
    case E_V32HImode:
    case E_V16HImode:
      return V8HImode;
    case E_V64QImode:
    case E_V32QImode:
      return V16QImode;
    case E_V16SFmode:
    case E_V8SFmode:
      return V4SFmode;
    case E_V8DFmode:
    case E_V4DFmode:
      return V2DFmode;
    default:
      return mode;
    }
}
23232 avoid cross-lane operations. */
23258 /* Generate call to __divmoddi4. */
23261 ix86_expand_divmod_libfunc (rtx libfunc
, machine_mode mode
,
23263 rtx
*quot_p
, rtx
*rem_p
)
23265 rtx rem
= assign_386_stack_local (mode
, SLOT_TEMP
);
23267 rtx quot
= emit_library_call_value (libfunc
, NULL_RTX
, LCT_NORMAL
,
23268 mode
, op0
, mode
, op1
, mode
,
23269 XEXP (rem
, 0), Pmode
);
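/* Illustrative note (added): the libgcc routine reached through
   LIBFUNC has the C signature

     long long __divmoddi4 (long long a, long long b, long long *rem);

   so the quotient arrives in the return value while the remainder is
   stored through the address of the stack slot.  */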
void
ix86_expand_atomic_fetch_op_loop (rtx target, rtx mem, rtx val,
				  enum rtx_code code, bool after,
				  bool doubleword)
{
  rtx old_reg, new_reg, old_mem, success;
  machine_mode mode = GET_MODE (target);
  rtx_code_label *loop_label = NULL;

  old_reg = gen_reg_rtx (mode);
  new_reg = old_reg;
  old_mem = copy_to_reg (mem);
  loop_label = gen_label_rtx ();
  emit_label (loop_label);
  emit_move_insn (old_reg, old_mem);

  /* return value for atomic_fetch_op.  */
  if (!after)
    emit_move_insn (target, old_reg);

  if (code == NOT)
    {
      new_reg = expand_simple_binop (mode, AND, new_reg, val, NULL_RTX,
				     true, OPTAB_LIB_WIDEN);
      new_reg = expand_simple_unop (mode, code, new_reg, NULL_RTX, true);
    }
  else
    new_reg = expand_simple_binop (mode, code, new_reg, val, NULL_RTX,
				   true, OPTAB_LIB_WIDEN);

  /* return value for atomic_op_fetch.  */
  if (after)
    emit_move_insn (target, new_reg);

  success = NULL_RTX;

  ix86_expand_cmpxchg_loop (&success, old_mem, mem, old_reg, new_reg,
			    gen_int_mode (MEMMODEL_SYNC_SEQ_CST,
					  SImode),
			    doubleword, loop_label);
}
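/* Illustrative sketch (added; T, OP and compare_exchange are
   placeholders, not real identifiers):

     T atomic_fetch_op (T *mem, T val)
     {
       T old = *mem;	/* Atomic load, done once before the loop.  */
       for (;;)
	 {
	   T tmp = OP (old, val);
	   if (compare_exchange (mem, &old, tmp))  /* Refreshes old.  */
	     return old;
	   pause ();
	 }
     }

   The RTL version branches back to loop_label from inside
   ix86_expand_cmpxchg_loop instead of using a structured loop.  */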
/* Relax cmpxchg instruction, param loop_label indicates whether
   the instruction should be relaxed with a pause loop.  If not,
   it will be relaxed to an atomic load + compare, and skip
   cmpxchg instruction if mem != exp_input.  */

void
ix86_expand_cmpxchg_loop (rtx *ptarget_bool, rtx target_val,
			  rtx mem, rtx exp_input, rtx new_input,
			  rtx mem_model, bool doubleword,
			  rtx_code_label *loop_label)
{
  rtx_code_label *cmp_label = NULL;
  rtx_code_label *done_label = NULL;
  rtx target_bool = NULL_RTX, new_mem = NULL_RTX;
  rtx (*gen) (rtx, rtx, rtx, rtx, rtx) = NULL;
  rtx (*gendw) (rtx, rtx, rtx, rtx, rtx, rtx) = NULL;
  machine_mode mode = GET_MODE (target_val), hmode = mode;

  if (*ptarget_bool == NULL)
    target_bool = gen_reg_rtx (QImode);
  else
    target_bool = *ptarget_bool;

  cmp_label = gen_label_rtx ();
  done_label = gen_label_rtx ();

  new_mem = gen_reg_rtx (mode);
  /* Load memory first.  */
  expand_atomic_load (new_mem, mem, MEMMODEL_SEQ_CST);

  switch (mode)
    {
    case E_TImode:
      gendw = gen_atomic_compare_and_swapti_doubleword;
      hmode = DImode;
      break;
    case E_DImode:
      if (doubleword)
	{
	  gendw = gen_atomic_compare_and_swapdi_doubleword;
	  hmode = SImode;
	}
      else
	gen = gen_atomic_compare_and_swapdi_1;
      break;
    case E_SImode:
      gen = gen_atomic_compare_and_swapsi_1;
      break;
    case E_HImode:
      gen = gen_atomic_compare_and_swaphi_1;
      break;
    case E_QImode:
      gen = gen_atomic_compare_and_swapqi_1;
      break;
    default:
      gcc_unreachable ();
    }

  /* Compare mem value with expected value.  */
  if (doubleword)
    {
      rtx low_new_mem = gen_lowpart (hmode, new_mem);
      rtx low_exp_input = gen_lowpart (hmode, exp_input);
      rtx high_new_mem = gen_highpart (hmode, new_mem);
      rtx high_exp_input = gen_highpart (hmode, exp_input);
      emit_cmp_and_jump_insns (low_new_mem, low_exp_input, NE, NULL_RTX,
			       hmode, 1, cmp_label,
			       profile_probability::guessed_never ());
      emit_cmp_and_jump_insns (high_new_mem, high_exp_input, NE, NULL_RTX,
			       hmode, 1, cmp_label,
			       profile_probability::guessed_never ());
    }
  else
    emit_cmp_and_jump_insns (new_mem, exp_input, NE, NULL_RTX,
			     GET_MODE (exp_input), 1, cmp_label,
			     profile_probability::guessed_never ());

  /* Directly emits cmpxchg here.  */
  if (doubleword)
    emit_insn (gendw (target_val, mem, exp_input,
		      gen_lowpart (hmode, new_input),
		      gen_highpart (hmode, new_input),
		      mem_model));
  else
    emit_insn (gen (target_val, mem, exp_input, new_input, mem_model));

  if (!loop_label)
    {
      emit_jump_insn (gen_jump (done_label));
      emit_barrier ();

      emit_label (cmp_label);
      emit_move_insn (target_val, new_mem);
      emit_label (done_label);
      ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
			 const0_rtx);
    }
  else
    {
      ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
			 const0_rtx);
      emit_cmp_and_jump_insns (target_bool, const0_rtx, EQ, const0_rtx,
			       GET_MODE (target_bool), 1, loop_label,
			       profile_probability::guessed_never ());
      emit_jump_insn (gen_jump (done_label));
      emit_barrier ();

      /* If mem is not expected, pause and loop back.  */
      emit_label (cmp_label);
      emit_move_insn (target_val, new_mem);
      emit_insn (gen_pause ());
      emit_jump_insn (gen_jump (loop_label));
      emit_barrier ();
      emit_label (done_label);
    }

  *ptarget_bool = target_bool;
}
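/* Illustrative note (added): performing the atomic load and compare
   before the cmpxchg lets a contended thread fail on a shared read of
   the cache line instead of acquiring it exclusively only to fail the
   comparison; the pause-and-retry path is emitted only when the caller
   supplied loop_label.  */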
23434 #include "gt-i386-expand.h"