-/* Copyright (C) 1988-2022 Free Software Foundation, Inc.
+/* Copyright (C) 1988-2023 Free Software Foundation, Inc.
This file is part of GCC.
#include "pass_manager.h"
#include "target-globals.h"
#include "gimple-iterator.h"
-#include "tree-vectorizer.h"
#include "shrink-wrap.h"
#include "builtins.h"
#include "rtl-iter.h"
#include "i386-options.h"
#include "i386-builtins.h"
#include "i386-expand.h"
+#include "asan.h"
/* Split one or more double-mode RTL references into pairs of half-mode
references. The RTL can be REG, offsettable MEM, integer constant, or
rtx dlo, dhi;
int deleted_move_count = 0;
split_double_mode (mode, &dst, 1, &dlo, &dhi);
+ /* Constraints ensure that if both lo and hi are MEMs, then
+ dst has early-clobber and thus addresses of MEMs don't use
+ dlo/dhi registers. Otherwise if at least one of lo and hi are MEMs,
+ dlo/dhi are registers. */
+ if (MEM_P (lo)
+ && rtx_equal_p (dlo, hi)
+ && reg_overlap_mentioned_p (dhi, lo))
+ {
+ /* If dlo is same as hi and lo's address uses dhi register,
+ code below would first emit_move_insn (dhi, hi)
+ and then emit_move_insn (dlo, lo). But the former
+ would invalidate lo's address. Load into dhi first,
+ then swap. */
+ emit_move_insn (dhi, lo);
+ lo = dhi;
+ }
+ else if (MEM_P (hi)
+ && !MEM_P (lo)
+ && !rtx_equal_p (dlo, lo)
+ && reg_overlap_mentioned_p (dlo, hi))
+ {
+ /* In this case, code below would first emit_move_insn (dlo, lo)
+ and then emit_move_insn (dhi, hi). But the former would
+ invalidate hi's address. */
+ if (rtx_equal_p (dhi, lo))
+ {
+ /* We can't load into dhi first, so load into dlo
+ first and we'll swap. */
+ emit_move_insn (dlo, hi);
+ hi = dlo;
+ }
+ else
+ {
+ /* Load into dhi first. */
+ emit_move_insn (dhi, hi);
+ hi = dhi;
+ }
+ }
if (!rtx_equal_p (dlo, hi))
{
if (!rtx_equal_p (dlo, lo))
broadcast only if vector broadcast is available. */
if (!TARGET_AVX
|| !CONST_WIDE_INT_P (op)
- || standard_sse_constant_p (op, mode))
+ || standard_sse_constant_p (op, mode)
+ || (CONST_WIDE_INT_NUNITS (op) * HOST_BITS_PER_WIDE_INT
+ != GET_MODE_BITSIZE (mode)))
return nullptr;
HOST_WIDE_INT val = CONST_WIDE_INT_ELT (op, 0);
machine_mode vector_mode;
if (!mode_for_vector (broadcast_mode, nunits).exists (&vector_mode))
gcc_unreachable ();
- rtx target = ix86_gen_scratch_sse_rtx (vector_mode);
+ rtx target = gen_reg_rtx (vector_mode);
bool ok = ix86_expand_vector_init_duplicate (false, vector_mode,
target,
GEN_INT (val_broadcast));
default:
break;
+
+ case SUBREG:
+ /* Transform TImode paradoxical SUBREG into zero_extendditi2. */
+ if (TARGET_64BIT
+ && mode == TImode
+ && SUBREG_P (op1)
+ && GET_MODE (SUBREG_REG (op1)) == DImode
+ && SUBREG_BYTE (op1) == 0)
+ op1 = gen_rtx_ZERO_EXTEND (TImode, SUBREG_REG (op1));
+ break;
}
if ((flag_pic || MACHOPIC_INDIRECT)
return;
}
}
- else if (GET_MODE_SIZE (mode) >= 16)
+ else if (CONST_WIDE_INT_P (op1)
+ && GET_MODE_SIZE (mode) >= 16)
{
rtx tmp = ix86_convert_const_wide_int_to_broadcast
(GET_MODE (op0), op1);
}
}
+ /* Special case inserting 64-bit values into a TImode register. */
+ if (TARGET_64BIT
+ /* Disable for -O0 (see PR110587) unless naked (PR110533). */
+ && (optimize || ix86_function_naked (current_function_decl))
+ && (mode == DImode || mode == DFmode)
+ && SUBREG_P (op0)
+ && GET_MODE (SUBREG_REG (op0)) == TImode
+ && REG_P (SUBREG_REG (op0))
+ && REG_P (op1))
+ {
+ /* Use *insvti_lowpart_1 to set lowpart. */
+ if (SUBREG_BYTE (op0) == 0)
+ {
+ wide_int mask = wi::mask (64, true, 128);
+ rtx tmp = immed_wide_int_const (mask, TImode);
+ op0 = SUBREG_REG (op0);
+ tmp = gen_rtx_AND (TImode, copy_rtx (op0), tmp);
+ if (mode == DFmode)
+ op1 = gen_lowpart (DImode, op1);
+ op1 = gen_rtx_ZERO_EXTEND (TImode, op1);
+ op1 = gen_rtx_IOR (TImode, tmp, op1);
+ }
+ /* Use *insvti_highpart_1 to set highpart. */
+ else if (SUBREG_BYTE (op0) == 8)
+ {
+ wide_int mask = wi::mask (64, false, 128);
+ rtx tmp = immed_wide_int_const (mask, TImode);
+ op0 = SUBREG_REG (op0);
+ tmp = gen_rtx_AND (TImode, copy_rtx (op0), tmp);
+ if (mode == DFmode)
+ op1 = gen_lowpart (DImode, op1);
+ op1 = gen_rtx_ZERO_EXTEND (TImode, op1);
+ op1 = gen_rtx_ASHIFT (TImode, op1, GEN_INT (64));
+ op1 = gen_rtx_IOR (TImode, tmp, op1);
+ }
+ }
+
emit_insn (gen_rtx_SET (op0, op1));
}
if (!register_operand (op0, mode)
&& !register_operand (op1, mode))
{
- rtx scratch = ix86_gen_scratch_sse_rtx (mode);
+ rtx scratch = gen_reg_rtx (mode);
emit_move_insn (scratch, op1);
op1 = scratch;
}
return;
}
- /* Special case TImode to V1TImode conversions, via V2DI. */
- if (mode == V1TImode
+ /* Special case TImode to 128-bit vector conversions via V2DI. */
+ if (VECTOR_MODE_P (mode)
+ && GET_MODE_SIZE (mode) == 16
&& SUBREG_P (op1)
&& GET_MODE (SUBREG_REG (op1)) == TImode
&& TARGET_64BIT && TARGET_SSE
emit_move_insn (lo, gen_lowpart (DImode, SUBREG_REG (op1)));
emit_move_insn (hi, gen_highpart (DImode, SUBREG_REG (op1)));
emit_insn (gen_vec_concatv2di (tmp, lo, hi));
- emit_move_insn (op0, gen_lowpart (V1TImode, tmp));
+ emit_move_insn (op0, gen_lowpart (mode, tmp));
return;
}
&& !register_operand (op0, mode)
&& !register_operand (op1, mode))
{
- rtx tmp = ix86_gen_scratch_sse_rtx (GET_MODE (op0));
+ rtx tmp = gen_reg_rtx (GET_MODE (op0));
emit_move_insn (tmp, op1);
emit_move_insn (op0, tmp);
return;
rtx op0 = operands[0];
rtx op1 = operands[1];
rtx op2 = operands[2];
+ rtx src;
machine_mode dmode = GET_MODE (op0);
machine_mode smode = GET_MODE (op1);
op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));
- op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
- op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
- rtx insn = gen_rtx_SET (dest, gen_rtx_VEC_CONCAT (sse_dmode,
- op1, op2));
- emit_insn (insn);
+ /* packusdw/packuswb does unsigned saturation of a signed source
+ which is different from generic us_truncate RTX. */
+ if (code == US_TRUNCATE)
+ src = gen_rtx_UNSPEC (sse_dmode,
+ gen_rtvec (2, op1, op2),
+ UNSPEC_US_TRUNCATE);
+ else
+ {
+ op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
+ op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
+ src = gen_rtx_VEC_CONCAT (sse_dmode, op1, op2);
+ }
+
+ emit_move_insn (dest, src);
ix86_move_vector_high_sse_to_mmx (op0);
}
switch (mode)
{
- case E_V4QImode:
case E_V8QImode:
+ case E_V4QImode:
+ case E_V2QImode:
sse_mode = V16QImode;
double_sse_mode = V32QImode;
mask = gen_rtx_PARALLEL (VOIDmode,
input = gen_rtx_REG (vecmode, REGNO (input));
emit_move_insn (value, CONST0_RTX (vecmode));
if (vecmode == V4SFmode)
- emit_insn (gen_sse_movss (value, value, input));
+ emit_insn (gen_sse_movss_v4sf (value, value, input));
else
- emit_insn (gen_sse2_movsd (value, value, input));
+ emit_insn (gen_sse2_movsd_v2df (value, value, input));
}
emit_move_insn (large, two31);
}
/* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
- pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
+ pattern can be used on it instead of fixuns_trunc*.
This is done by doing just signed conversion if < 0x1p31, and otherwise by
subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
else
dest = NULL_RTX;
op1 = lowpart_subreg (vmode, force_reg (mode, operands[2]), mode);
- mask = ix86_build_signbit_mask (vmode, 0, 0);
+ mask = ix86_build_signbit_mask (vmode, TARGET_AVX512F && mode != HFmode, 0);
if (CONST_DOUBLE_P (operands[1]))
{
/* Handle special case - vector comparsion with boolean result, transform
it using ptest instruction. */
if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
+ || (mode == TImode && !TARGET_64BIT)
|| mode == OImode)
{
rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
gcc_assert (code == EQ || code == NE);
- if (mode == OImode)
+ if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
{
op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
tmp = gen_reg_rtx (mode);
emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
tmp = gen_lowpart (p_mode, tmp);
- emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
- gen_rtx_UNSPEC (CCmode,
+ emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
+ gen_rtx_UNSPEC (CCZmode,
gen_rtvec (2, tmp, tmp),
UNSPEC_PTEST)));
tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
machine_mode op_mode = GET_MODE (op0);
bool is_sse = SSE_FLOAT_MODE_SSEMATH_OR_HF_P (op_mode);
+ if (op_mode == BFmode)
+ {
+ rtx op = gen_lowpart (HImode, op0);
+ if (CONST_INT_P (op))
+ op = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
+ op0, BFmode);
+ else
+ {
+ rtx t1 = gen_reg_rtx (SImode);
+ emit_insn (gen_zero_extendhisi2 (t1, op));
+ emit_insn (gen_ashlsi3 (t1, t1, GEN_INT (16)));
+ op = gen_lowpart (SFmode, t1);
+ }
+ *pop0 = op;
+ op = gen_lowpart (HImode, op1);
+ if (CONST_INT_P (op))
+ op = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
+ op1, BFmode);
+ else
+ {
+ rtx t1 = gen_reg_rtx (SImode);
+ emit_insn (gen_zero_extendhisi2 (t1, op));
+ emit_insn (gen_ashlsi3 (t1, t1, GEN_INT (16)));
+ op = gen_lowpart (SFmode, t1);
+ }
+ *pop1 = op;
+ return ix86_prepare_fp_compare_args (code, pop0, pop1);
+ }
+
/* All of the unordered compare instructions only work on registers.
The same is true of the fcomi compare instructions. The XFmode
compare instructions require registers except when comparing
cmpmode = SELECT_CC_MODE (code, op0, op1);
flags = gen_rtx_REG (cmpmode, FLAGS_REG);
+ /* Attempt to use PTEST, if available, when testing vector modes for
+ equality/inequality against zero. */
+ if (op1 == const0_rtx
+ && SUBREG_P (op0)
+ && cmpmode == CCZmode
+ && SUBREG_BYTE (op0) == 0
+ && REG_P (SUBREG_REG (op0))
+ && VECTOR_MODE_P (GET_MODE (SUBREG_REG (op0)))
+ && TARGET_SSE4_1
+ && GET_MODE (op0) == TImode
+ && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op0))) == 16)
+ {
+ tmp = SUBREG_REG (op0);
+ tmp = gen_rtx_UNSPEC (CCZmode, gen_rtvec (2, tmp, tmp), UNSPEC_PTEST);
+ }
+ else
+ tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
+
/* This is very simple, but making the interface the same as in the
FP case makes the rest of the code easier. */
- tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
emit_insn (gen_rtx_SET (flags, tmp));
/* Return the test that should be put into the flags user, i.e.
&& !TARGET_64BIT))
return false;
+ if (GET_MODE (op0) == BFmode
+ && !ix86_fp_comparison_operator (operands[1], VOIDmode))
+ return false;
+
start_sequence ();
compare_op = ix86_expand_compare (code, op0, op1);
compare_seq = get_insns ();
rtx op0 = XEXP (operands[1], 0);
rtx op1 = XEXP (operands[1], 1);
+ if (GET_MODE (op0) == BFmode
+ && !ix86_fp_comparison_operator (operands[1], VOIDmode))
+ return false;
+
if (SSE_FLOAT_MODE_SSEMATH_OR_HF_P (mode))
{
machine_mode cmode;
case GTU:
break;
- case NE:
case LE:
case LEU:
+ /* x <= cst can be handled as x < cst + 1 unless there is
+ wrap around in cst + 1. */
+ if (GET_CODE (cop1) == CONST_VECTOR
+ && GET_MODE_INNER (mode) != TImode)
+ {
+ unsigned int n_elts = GET_MODE_NUNITS (mode), i;
+ machine_mode eltmode = GET_MODE_INNER (mode);
+	  for (i = 0; i < n_elts; ++i)
+	    {
+	      rtx elt = CONST_VECTOR_ELT (cop1, i);
+	      if (!CONST_INT_P (elt))
+		break;
+	      /* Only LE or LEU can reach this case; punt (break early)
+		 whenever cst + 1 would wrap around for some element.  */
+	      if (code == LE)
+		{
+		  /* For LE punt if some element is signed maximum.  */
+		  if ((INTVAL (elt) & (GET_MODE_MASK (eltmode) >> 1))
+		      == (GET_MODE_MASK (eltmode) >> 1))
+		    break;
+		}
+	      /* For LEU punt if some element is unsigned maximum.  */
+	      else if (elt == constm1_rtx)
+		break;
+	    }
+ if (i == n_elts)
+ {
+ rtvec v = rtvec_alloc (n_elts);
+ for (i = 0; i < n_elts; ++i)
+ RTVEC_ELT (v, i)
+ = gen_int_mode (INTVAL (CONST_VECTOR_ELT (cop1, i)) + 1,
+ eltmode);
+ cop1 = gen_rtx_CONST_VECTOR (mode, v);
+ std::swap (cop0, cop1);
+ code = code == LE ? GT : GTU;
+ break;
+ }
+ }
+ /* FALLTHRU */
+ case NE:
code = reverse_condition (code);
*negate = true;
break;
case GE:
case GEU:
+ /* x >= cst can be handled as x > cst - 1 unless there is
+ wrap around in cst - 1. */
+ if (GET_CODE (cop1) == CONST_VECTOR
+ && GET_MODE_INNER (mode) != TImode)
+ {
+ unsigned int n_elts = GET_MODE_NUNITS (mode), i;
+ machine_mode eltmode = GET_MODE_INNER (mode);
+ for (i = 0; i < n_elts; ++i)
+ {
+ rtx elt = CONST_VECTOR_ELT (cop1, i);
+ if (!CONST_INT_P (elt))
+ break;
+ if (code == GE)
+ {
+ /* For GE punt if some element is signed minimum. */
+ if (INTVAL (elt) < 0
+ && ((INTVAL (elt) & (GET_MODE_MASK (eltmode) >> 1))
+ == 0))
+ break;
+ }
+ /* For GEU punt if some element is zero. */
+ else if (elt == const0_rtx)
+ break;
+ }
+ if (i == n_elts)
+ {
+ rtvec v = rtvec_alloc (n_elts);
+ for (i = 0; i < n_elts; ++i)
+ RTVEC_ELT (v, i)
+ = gen_int_mode (INTVAL (CONST_VECTOR_ELT (cop1, i)) - 1,
+ eltmode);
+ cop1 = gen_rtx_CONST_VECTOR (mode, v);
+ code = code == GE ? GT : GTU;
+ break;
+ }
+ }
code = reverse_condition (code);
*negate = true;
/* FALLTHRU */
}
}
+ if (GET_CODE (cop0) == CONST_VECTOR)
+ cop0 = force_reg (mode, cop0);
+ else if (GET_CODE (cop1) == CONST_VECTOR)
+ cop1 = force_reg (mode, cop1);
+
rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode);
rtx opfalse = op_false ? op_false : CONST0_RTX (data_mode);
if (*negate)
if (*negate)
std::swap (op_true, op_false);
+ if (GET_CODE (cop1) == CONST_VECTOR)
+ cop1 = force_reg (mode, cop1);
+
/* Allow the comparison to be done in one mode, but the movcc to
happen in another mode. */
if (data_mode == mode)
- {
- x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
- op_true, op_false);
- }
+ x = ix86_expand_sse_cmp (dest, code, cop0, cop1, op_true, op_false);
else
{
gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
}
}
-/* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is
+/* Extend SRC into next wider integer vector type. UNSIGNED_P is
+ true if we should do zero extension, else sign extension. */
+
+void
+ix86_expand_sse_extend (rtx dest, rtx src, bool unsigned_p)
+{
+  machine_mode imode = GET_MODE (src);
+  rtx ops[3];
+
+  /* Only the narrow partial-vector integer modes are supported here;
+     any other source mode is a caller error.  */
+  switch (imode)
+    {
+    case E_V8QImode:
+    case E_V4QImode:
+    case E_V2QImode:
+    case E_V4HImode:
+    case E_V2HImode:
+    case E_V2SImode:
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  /* ops[0] receives the interleaved (widened) result below.  */
+  ops[0] = gen_reg_rtx (imode);
+
+  ops[1] = force_reg (imode, src);
+
+  /* The extension is formed by interleaving SRC with a second vector
+     supplying the high halves: zero for zero extension, or the result
+     of 0 > SRC (all-ones lanes where SRC is negative) for sign
+     extension.  */
+  if (unsigned_p)
+    ops[2] = force_reg (imode, CONST0_RTX (imode));
+  else
+    ops[2] = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
+				  ops[1], pc_rtx, pc_rtx);
+
+  /* NOTE(review): the false argument presumably selects the low-half
+     punpck variant - confirm against ix86_split_mmx_punpck.  */
+  ix86_split_mmx_punpck (ops, false);
+  emit_move_insn (dest, lowpart_subreg (GET_MODE (dest), ops[0], imode));
+}
+
+/* Unpack SRC into the next wider integer vector type. UNSIGNED_P is
true if we should do zero extension, else sign extension. HIGH_P is
true if we want the N/2 high elements, else the low elements. */
if (count >= half_width)
{
emit_move_insn (high[0], low[1]);
- emit_move_insn (low[0], const0_rtx);
+ ix86_expand_clear (low[0]);
if (count > half_width)
ix86_expand_ashl_const (high[0], count - half_width, mode);
}
}
+/* Replace all occurrences of REG FROM with REG TO in X, including
+ occurrences with different modes. */
+
+rtx
+ix86_replace_reg_with_reg (rtx x, rtx from, rtx to)
+{
+  gcc_checking_assert (REG_P (from)
+		       && REG_P (to)
+		       && GET_MODE (from) == GET_MODE (to));
+  /* Cheap exit: FROM's register is not mentioned anywhere in X.  */
+  if (!reg_overlap_mentioned_p (from, x))
+    return x;
+  /* Substitute on a copy so the original X stays untouched.  */
+  rtx ret = copy_rtx (x);
+  subrtx_ptr_iterator::array_type array;
+  FOR_EACH_SUBRTX_PTR (iter, array, &ret, NONCONST)
+    {
+      rtx *loc = *iter;
+      x = *loc;
+      if (REG_P (x) && REGNO (x) == REGNO (from))
+	{
+	  if (x == from)
+	    /* Identical rtx (hence same mode): substitute TO directly.  */
+	    *loc = to;
+	  else
+	    {
+	      /* Occurrence with a different mode: build a REG carrying
+		 TO's register number but this occurrence's mode.  Only
+		 valid when the occurrence spans a single register.  */
+	      gcc_checking_assert (REG_NREGS (x) == 1);
+	      *loc = gen_rtx_REG (GET_MODE (x), REGNO (to));
+	    }
+	}
+    }
+  return ret;
+}
+
/* Return mode for the memcpy/memset loop counter. Prefer SImode over
DImode for constant loop counts. */
{
if (alg == no_stringop)
return false;
+ /* It is not possible to use a library call if we have non-default
+ address space. We can do better than the generic byte-at-a-time
+ loop, used as a fallback. */
+ if (alg == libcall && have_as)
+ return false;
if (alg == vector_loop)
return TARGET_SSE || TARGET_AVX;
/* Algorithms using the rep prefix want at least edi and ecx;
gcc_assert (alg != libcall);
return alg;
}
+
+ /* Try to use some reasonable fallback algorithm. Note that for
+ non-default address spaces we default to a loop instead of
+ a libcall. */
return (alg_usable_p (algs->unknown_size, memset, have_as)
- ? algs->unknown_size : libcall);
+ ? algs->unknown_size : have_as ? loop : libcall);
}
/* Decide on alignment. We know that the operand is already aligned to ALIGN
if (TARGET_AVX256_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 128)
move_mode = TImode;
+ if (TARGET_AVX512_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 256)
+ move_mode = OImode;
/* Find the corresponding vector mode with the same size as MOVE_MODE.
MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
}
+ /* PR100665: Hwasan may tag code pointer which is not supported by LAM,
+ mask off code pointers here.
+ TODO: also need to handle indirect jump. */
+ if (ix86_memtag_can_tag_addresses () && !fndecl
+ && sanitize_flags_p (SANITIZE_HWADDRESS))
+ {
+ rtx untagged_addr = ix86_memtag_untagged_pointer (XEXP (fnaddr, 0),
+ NULL_RTX);
+ fnaddr = gen_rtx_MEM (QImode, untagged_addr);
+ }
+
call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
if (retval)
machine_mode mode1 = insn_data[d->icode].operand[1].mode;
enum rtx_code comparison = d->comparison;
+ /* ptest reg, reg sets the carry flag. */
+ if (comparison == LTU
+ && (d->code == IX86_BUILTIN_PTESTC
+ || d->code == IX86_BUILTIN_PTESTC256)
+ && rtx_equal_p (op0, op1))
+ {
+ if (!target)
+ target = gen_reg_rtx (SImode);
+ emit_move_insn (target, const1_rtx);
+ return target;
+ }
+
if (VECTOR_MODE_P (mode0))
op0 = safe_vector_operand (op0, mode0);
if (VECTOR_MODE_P (mode1))
return ix86_expand_sse_ptest (d, exp, target);
case FLOAT128_FTYPE_FLOAT128:
case FLOAT_FTYPE_FLOAT:
+ case FLOAT_FTYPE_BFLOAT16:
case INT_FTYPE_INT:
case UINT_FTYPE_UINT:
case UINT16_FTYPE_UINT16:
case V8DF_FTYPE_V2DF:
case V8DF_FTYPE_V8DF:
case V4DI_FTYPE_V4DI:
- case V16HI_FTYPE_V16SF:
- case V8HI_FTYPE_V8SF:
- case V8HI_FTYPE_V4SF:
+ case V16BF_FTYPE_V16SF:
+ case V8BF_FTYPE_V8SF:
+ case V8BF_FTYPE_V4SF:
nargs = 1;
break;
case V4SF_FTYPE_V4SF_VEC_MERGE:
case V4SF_FTYPE_V4SF_UINT:
case V4SF_FTYPE_V4SF_DI:
case V4SF_FTYPE_V4SF_SI:
+ case V4DI_FTYPE_V4DI_V2DI:
case V2DI_FTYPE_V2DI_V2DI:
case V2DI_FTYPE_V16QI_V16QI:
case V2DI_FTYPE_V4SI_V4SI:
case USI_FTYPE_USI_USI:
case UDI_FTYPE_UDI_UDI:
case V16SI_FTYPE_V8DF_V8DF:
- case V32HI_FTYPE_V16SF_V16SF:
- case V16HI_FTYPE_V8SF_V8SF:
- case V8HI_FTYPE_V4SF_V4SF:
- case V16HI_FTYPE_V16SF_UHI:
- case V8HI_FTYPE_V8SF_UQI:
- case V8HI_FTYPE_V4SF_UQI:
+ case V32BF_FTYPE_V16SF_V16SF:
+ case V16BF_FTYPE_V8SF_V8SF:
+ case V8BF_FTYPE_V4SF_V4SF:
+ case V16BF_FTYPE_V16SF_UHI:
+ case V8BF_FTYPE_V8SF_UQI:
+ case V8BF_FTYPE_V4SF_UQI:
nargs = 2;
break;
case V2DI_FTYPE_V2DI_INT_CONVERT:
case V8HI_FTYPE_V8DI_V8HI_UQI:
case V8SI_FTYPE_V8DI_V8SI_UQI:
case V4SI_FTYPE_V4SI_V4SI_V4SI:
+ case V4DI_FTYPE_V4DI_V4DI_V2DI:
case V16SI_FTYPE_V16SI_V16SI_V16SI:
case V8DI_FTYPE_V8DI_V8DI_V8DI:
case V32HI_FTYPE_V32HI_V32HI_V32HI:
case V16HI_FTYPE_V16HI_V16HI_V16HI:
case V8SI_FTYPE_V8SI_V8SI_V8SI:
case V8HI_FTYPE_V8HI_V8HI_V8HI:
- case V32HI_FTYPE_V16SF_V16SF_USI:
- case V16HI_FTYPE_V8SF_V8SF_UHI:
- case V8HI_FTYPE_V4SF_V4SF_UQI:
- case V16HI_FTYPE_V16SF_V16HI_UHI:
- case V8HI_FTYPE_V8SF_V8HI_UQI:
- case V8HI_FTYPE_V4SF_V8HI_UQI:
- case V16SF_FTYPE_V16SF_V32HI_V32HI:
- case V8SF_FTYPE_V8SF_V16HI_V16HI:
- case V4SF_FTYPE_V4SF_V8HI_V8HI:
+ case V32BF_FTYPE_V16SF_V16SF_USI:
+ case V16BF_FTYPE_V8SF_V8SF_UHI:
+ case V8BF_FTYPE_V4SF_V4SF_UQI:
+ case V16BF_FTYPE_V16SF_V16BF_UHI:
+ case V8BF_FTYPE_V8SF_V8BF_UQI:
+ case V8BF_FTYPE_V4SF_V8BF_UQI:
+ case V16SF_FTYPE_V16SF_V32BF_V32BF:
+ case V8SF_FTYPE_V8SF_V16BF_V16BF:
+ case V4SF_FTYPE_V4SF_V8BF_V8BF:
nargs = 3;
break;
case V32QI_FTYPE_V32QI_V32QI_INT:
case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
- case V32HI_FTYPE_V16SF_V16SF_V32HI_USI:
- case V16HI_FTYPE_V8SF_V8SF_V16HI_UHI:
- case V8HI_FTYPE_V4SF_V4SF_V8HI_UQI:
+ case V32BF_FTYPE_V16SF_V16SF_V32BF_USI:
+ case V16BF_FTYPE_V8SF_V8SF_V16BF_UHI:
+ case V8BF_FTYPE_V4SF_V4SF_V8BF_UQI:
nargs = 4;
break;
case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
+ case V4SI_FTYPE_V4SI_V4SI_V4SI_INT:
nargs = 4;
nargs_constant = 1;
break;
break;
case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
- case V16SF_FTYPE_V16SF_V32HI_V32HI_UHI:
- case V8SF_FTYPE_V8SF_V16HI_V16HI_UQI:
- case V4SF_FTYPE_V4SF_V8HI_V8HI_UQI:
+ case V16SF_FTYPE_V16SF_V32BF_V32BF_UHI:
+ case V8SF_FTYPE_V8SF_V16BF_V16BF_UQI:
+ case V4SF_FTYPE_V4SF_V8BF_V8BF_UQI:
nargs = 4;
break;
case UQI_FTYPE_V8DI_V8DI_INT_UQI:
tree arg;
rtx pat, op;
unsigned int i, nargs, arg_adjust, memory;
+ unsigned int constant = 100;
bool aligned_mem = false;
- rtx xops[3];
+ rtx xops[4];
enum insn_code icode = d->icode;
const struct insn_data_d *insn_p = &insn_data[icode];
machine_mode tmode = insn_p->operand[0].mode;
case V8SF_FTYPE_PCV4SF:
case V8SF_FTYPE_PCFLOAT:
case V4SF_FTYPE_PCFLOAT:
+ case V4SF_FTYPE_PCFLOAT16:
+ case V4SF_FTYPE_PCBFLOAT16:
+ case V4SF_FTYPE_PCV8BF:
+ case V4SF_FTYPE_PCV8HF:
+ case V8SF_FTYPE_PCFLOAT16:
+ case V8SF_FTYPE_PCBFLOAT16:
+ case V8SF_FTYPE_PCV16HF:
+ case V8SF_FTYPE_PCV16BF:
case V4DF_FTYPE_PCV2DF:
case V4DF_FTYPE_PCDOUBLE:
case V2DF_FTYPE_PCDOUBLE:
klass = load;
memory = 0;
break;
+ case INT_FTYPE_PINT_INT_INT_INT:
+ case LONGLONG_FTYPE_PLONGLONG_LONGLONG_LONGLONG_INT:
+ nargs = 4;
+ klass = load;
+ memory = 0;
+ constant = 3;
+ break;
default:
gcc_unreachable ();
}
if (MEM_ALIGN (op) < align)
set_mem_align (op, align);
}
+ else if (i == constant)
+ {
+ /* This must be the constant. */
+ if (!insn_p->operand[nargs].predicate(op, SImode))
+ {
+ error ("the fourth argument must be one of enum %qs", "_CMPCCX_ENUM");
+ return const0_rtx;
+ }
+ }
else
{
/* This must be register. */
case 3:
pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
break;
+ case 4:
+ pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2], xops[3]);
+ break;
default:
gcc_unreachable ();
}
op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
elt = get_element_number (TREE_TYPE (arg0), arg2);
- if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
+ if (GET_MODE (op1) != mode1)
op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
op0 = force_reg (tmode, op0);
HOST_WIDE_INT isa2 = ix86_isa_flags2;
HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
+ HOST_WIDE_INT tmp_isa = isa, tmp_isa2 = isa2;
/* The general case is we require all the ISAs specified in bisa{,2}
to be enabled.
The exceptions are:
OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
(OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL) or
OPTION_MASK_ISA2_AVXVNNI
+ (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL) or
+ OPTION_MASK_ISA2_AVXIFMA
+ (OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA2_AVX512BF16) or
+ OPTION_MASK_ISA2_AVXNECONVERT
+ OPTION_MASK_ISA_AES or (OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA2_VAES)
where for each such pair it is sufficient if either of the ISAs is
enabled, plus if it is ored with other options also those others.
OPTION_MASK_ISA_MMX in bisa is satisfied also if TARGET_MMX_WITH_SSE. */
- if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
- == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
- && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0)
- isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A);
-
- if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
- == (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
- && (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0)
- isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32);
- if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
- == (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
- && (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0)
- isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4);
-
- if ((((bisa & (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
- == (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
- || (bisa2 & OPTION_MASK_ISA2_AVXVNNI) != 0)
- && (((isa & (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
- == (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
- || (isa2 & OPTION_MASK_ISA2_AVXVNNI) != 0))
- {
- isa |= OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL;
- isa2 |= OPTION_MASK_ISA2_AVXVNNI;
- }
+#define SHARE_BUILTIN(A1, A2, B1, B2) \
+ if ((((bisa & (A1)) == (A1) && (bisa2 & (A2)) == (A2)) \
+ && ((bisa & (B1)) == (B1) && (bisa2 & (B2)) == (B2))) \
+ && (((isa & (A1)) == (A1) && (isa2 & (A2)) == (A2)) \
+ || ((isa & (B1)) == (B1) && (isa2 & (B2)) == (B2)))) \
+ { \
+ tmp_isa |= (A1) | (B1); \
+ tmp_isa2 |= (A2) | (B2); \
+ }
+
+ SHARE_BUILTIN (OPTION_MASK_ISA_SSE, 0, OPTION_MASK_ISA_3DNOW_A, 0);
+ SHARE_BUILTIN (OPTION_MASK_ISA_SSE4_2, 0, OPTION_MASK_ISA_CRC32, 0);
+ SHARE_BUILTIN (OPTION_MASK_ISA_FMA, 0, OPTION_MASK_ISA_FMA4, 0);
+ SHARE_BUILTIN (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, 0,
+ OPTION_MASK_ISA2_AVXVNNI);
+ SHARE_BUILTIN (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL, 0, 0,
+ OPTION_MASK_ISA2_AVXIFMA);
+ SHARE_BUILTIN (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512BF16, 0,
+ OPTION_MASK_ISA2_AVXNECONVERT);
+ SHARE_BUILTIN (OPTION_MASK_ISA_AES, 0, OPTION_MASK_ISA_AVX512VL,
+ OPTION_MASK_ISA2_VAES);
+ isa = tmp_isa;
+ isa2 = tmp_isa2;
if ((bisa & OPTION_MASK_ISA_MMX) && !TARGET_MMX && TARGET_MMX_WITH_SSE
/* __builtin_ia32_maskmovq requires MMX registers. */
return (bisa & isa) == bisa && (bisa2 & isa2) == bisa2;
}
+/* Emit instructions to set the carry flag from ARG. */
+
+void
+ix86_expand_carry (rtx arg)
+{
+  if (!CONST_INT_P (arg) || arg == const0_rtx)
+    {
+      /* Runtime value (or literal zero): materialize ARG in a QImode
+	 register and add -1 (0xff unsigned), which produces a carry
+	 exactly when ARG is nonzero.  Only the flags result is used.  */
+      arg = convert_to_mode (QImode, arg, 1);
+      arg = copy_to_mode_reg (QImode, arg);
+      emit_insn (gen_addqi3_cconly_overflow (arg, constm1_rtx));
+    }
+  else
+    /* Known-nonzero constant: set the carry flag directly with STC.  */
+    emit_insn (gen_x86_stc ());
+}
+
/* Expand an expression EXP that calls a built-in function,
with result going to TARGET if that's convenient
(and in mode MODE if that's convenient).
return target;
}
+ case IX86_BUILTIN_PREFETCH:
+ {
+ arg0 = CALL_EXPR_ARG (exp, 0); // const void *
+ arg1 = CALL_EXPR_ARG (exp, 1); // const int
+ arg2 = CALL_EXPR_ARG (exp, 2); // const int
+ arg3 = CALL_EXPR_ARG (exp, 3); // const int
+
+ op0 = expand_normal (arg0);
+ op1 = expand_normal (arg1);
+ op2 = expand_normal (arg2);
+ op3 = expand_normal (arg3);
+
+ if (!CONST_INT_P (op1) || !CONST_INT_P (op2) || !CONST_INT_P (op3))
+ {
+ error ("second, third and fourth argument must be a const");
+ return const0_rtx;
+ }
+
+ if (INTVAL (op3) == 1)
+ {
+ if (INTVAL (op2) < 2 || INTVAL (op2) > 3)
+ {
+ error ("invalid third argument");
+ return const0_rtx;
+ }
+
+ if (TARGET_64BIT && TARGET_PREFETCHI
+ && local_func_symbolic_operand (op0, GET_MODE (op0)))
+ emit_insn (gen_prefetchi (op0, op2));
+ else
+ {
+ warning (0, "instruction prefetch applies when in 64-bit mode"
+ " with RIP-relative addressing and"
+ " option %<-mprefetchi%>;"
+ " they stay NOPs otherwise");
+ emit_insn (gen_nop ());
+ }
+ }
+ else
+ {
+ if (!address_operand (op0, VOIDmode))
+ {
+ op0 = convert_memory_address (Pmode, op0);
+ op0 = copy_addr_to_reg (op0);
+ }
+
+ if (INTVAL (op2) < 0 || INTVAL (op2) > 3)
+ {
+ warning (0, "invalid third argument to %<__builtin_ia32_prefetch%>; using zero");
+ op2 = const0_rtx;
+ }
+
+ if (TARGET_3DNOW || TARGET_PREFETCH_SSE
+ || TARGET_PRFCHW || TARGET_PREFETCHWT1)
+ emit_insn (gen_prefetch (op0, op1, op2));
+ else if (!MEM_P (op0) && side_effects_p (op0))
+ /* Don't do anything with direct references to volatile memory,
+ but generate code to handle other side effects. */
+ emit_insn (op0);
+ }
+
+ return 0;
+ }
+
+ case IX86_BUILTIN_PREFETCHI:
+ {
+ arg0 = CALL_EXPR_ARG (exp, 0); // const void *
+ arg1 = CALL_EXPR_ARG (exp, 1); // const int
+
+ op0 = expand_normal (arg0);
+ op1 = expand_normal (arg1);
+
+ if (!CONST_INT_P (op1))
+ {
+ error ("second argument must be a const");
+ return const0_rtx;
+ }
+
+ /* GOT/PLT_PIC should not be available for instruction prefetch.
+ It must be real instruction address. */
+ if (TARGET_64BIT
+ && local_func_symbolic_operand (op0, GET_MODE (op0)))
+ emit_insn (gen_prefetchi (op0, op1));
+ else
+ {
+ /* Ignore the hint. */
+ warning (0, "instruction prefetch applies when in 64-bit mode"
+ " with RIP-relative addressing and"
+ " option %<-mprefetchi%>;"
+ " they stay NOPs otherwise");
+ emit_insn (gen_nop ());
+ }
+
+ return 0;
+ }
+
case IX86_BUILTIN_VEC_INIT_V2SI:
case IX86_BUILTIN_VEC_INIT_V4HI:
case IX86_BUILTIN_VEC_INIT_V8QI:
arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
op1 = expand_normal (arg0);
- if (!integer_zerop (arg0))
- op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
op2 = expand_normal (arg1);
if (!register_operand (op2, mode0))
}
op0 = gen_reg_rtx (mode0);
- if (integer_zerop (arg0))
+ if (op1 == const0_rtx)
{
/* If arg0 is 0, optimize right away into add or sub
instruction that sets CCCmode flags. */
else
{
/* Generate CF from input operand. */
- emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
+ ix86_expand_carry (op1);
/* Generate instruction that consumes CF. */
op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
op0 = pc_rtx;
}
else if (TREE_CODE (arg3) == SSA_NAME
- && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
+ && VECTOR_TYPE_P (TREE_TYPE (arg3)))
{
/* Recognize also when mask is like:
__v2df src = _mm_setzero_pd ();
bool ok;
rtx_insn *insn;
rtx dup;
+ /* Save/restore recog_data in case this is called from splitters
+ or other routines where recog_data needs to stay valid across
+ force_reg. See PR106577. */
+ recog_data_d recog_data_save = recog_data;
/* First attempt to recognize VAL as-is. */
dup = gen_vec_duplicate (mode, val);
ok = recog_memoized (insn) >= 0;
gcc_assert (ok);
}
+ recog_data = recog_data_save;
return true;
}
get_mode_wider_vector (machine_mode o)
{
/* ??? Rely on the ordering that genmodes.cc gives to vectors. */
- machine_mode n = GET_MODE_WIDER_MODE (o).require ();
+ machine_mode n = GET_MODE_NEXT_MODE (o).require ();
gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
return n;
{
tmp1 = force_reg (GET_MODE_INNER (mode), val);
tmp2 = gen_reg_rtx (mode);
- emit_insn (maybe_gen_vec_set_0 (mode, tmp2,
- CONST0_RTX (mode), tmp1));
+ emit_insn (gen_vec_set_0 (mode, tmp2, CONST0_RTX (mode), tmp1));
tmp1 = gen_lowpart (mode, tmp2);
}
else
emit_move_insn (target, gen_lowpart (mode, words[0]));
else if (n_words == 2)
{
- rtx tmp = gen_reg_rtx (mode);
- emit_clobber (tmp);
- emit_move_insn (gen_lowpart (tmp_mode, tmp), words[0]);
- emit_move_insn (gen_highpart (tmp_mode, tmp), words[1]);
- emit_move_insn (target, tmp);
+ gcc_assert (tmp_mode == DImode || tmp_mode == SImode);
+ machine_mode concat_mode = tmp_mode == DImode ? V2DImode : V2SImode;
+ rtx tmp = gen_reg_rtx (concat_mode);
+ vals = gen_rtx_PARALLEL (concat_mode, gen_rtvec_v (2, words));
+ ix86_expand_vector_init_general (mmx_ok, concat_mode, tmp, vals);
+ emit_move_insn (target, gen_lowpart (mode, tmp));
}
else if (n_words == 4)
{
? gen_reg_rtx (V16HFmode)
: gen_reg_rtx (V16BFmode));
if (elt < 16)
- emit_insn (maybe_gen_vec_extract_lo (mode, tmp, vec));
+ emit_insn (gen_vec_extract_lo (mode, tmp, vec));
else
- emit_insn (maybe_gen_vec_extract_hi (mode, tmp, vec));
+ emit_insn (gen_vec_extract_hi (mode, tmp, vec));
ix86_expand_vector_extract (false, target, tmp, elt & 15);
return;
}
? gen_reg_rtx (V8HFmode)
: gen_reg_rtx (V8BFmode));
if (elt < 8)
- emit_insn (maybe_gen_vec_extract_lo (mode, tmp, vec));
+ emit_insn (gen_vec_extract_lo (mode, tmp, vec));
else
- emit_insn (maybe_gen_vec_extract_hi (mode, tmp, vec));
+ emit_insn (gen_vec_extract_hi (mode, tmp, vec));
ix86_expand_vector_extract (false, target, tmp, elt & 7);
return;
}
if (d->one_operand_p)
return false;
- if (!(TARGET_SSE && vmode == V4SFmode)
- && !(TARGET_MMX_WITH_SSE && vmode == V2SFmode)
- && !(TARGET_SSE2 && vmode == V2DFmode))
+ if (!(TARGET_SSE && (vmode == V4SFmode || vmode == V4SImode))
+ && !(TARGET_MMX_WITH_SSE && (vmode == V2SFmode || vmode == V2SImode))
+ && !(TARGET_SSE2 && (vmode == V2DFmode || vmode == V2DImode)))
return false;
/* Only the first element is changed. */
return true;
}
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
+ using insertps. */
+static bool
+expand_vec_perm_insertps (struct expand_vec_perm_d *d)
+{
+ machine_mode vmode = d->vmode;
+ unsigned i, cnt_s, nelt = d->nelt;
+ int cnt_d = -1;
+ rtx src, dst;
+
+ if (d->one_operand_p)
+ return false;
+
+ if (!(TARGET_SSE4_1
+ && (vmode == V4SFmode || vmode == V4SImode
+ || (TARGET_MMX_WITH_SSE
+ && (vmode == V2SFmode || vmode == V2SImode)))))
+ return false;
+
+ for (i = 0; i < nelt; ++i)
+ {
+ if (d->perm[i] == i)
+ continue;
+ if (cnt_d != -1)
+ {
+ cnt_d = -1;
+ break;
+ }
+ cnt_d = i;
+ }
+
+ if (cnt_d == -1)
+ {
+ for (i = 0; i < nelt; ++i)
+ {
+ if (d->perm[i] == i + nelt)
+ continue;
+ if (cnt_d != -1)
+ return false;
+ cnt_d = i;
+ }
+
+ if (cnt_d == -1)
+ return false;
+ }
+
+ if (d->testing_p)
+ return true;
+
+ gcc_assert (cnt_d != -1);
+
+ cnt_s = d->perm[cnt_d];
+ if (cnt_s < nelt)
+ {
+ src = d->op0;
+ dst = d->op1;
+ }
+ else
+ {
+ cnt_s -= nelt;
+ src = d->op1;
+ dst = d->op0;
+ }
+ gcc_assert (cnt_s < nelt);
+
+ rtx x = gen_sse4_1_insertps (vmode, d->target, dst, src,
+ GEN_INT (cnt_s << 6 | cnt_d << 4));
+ emit_insn (x);
+
+ return true;
+}
+
/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
;
else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
;
- else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 16
- || GET_MODE_SIZE (vmode) == 8
- || GET_MODE_SIZE (vmode) == 4))
+ else if (TARGET_SSE4_1
+ && (GET_MODE_SIZE (vmode) == 16
+ || (TARGET_MMX_WITH_SSE && GET_MODE_SIZE (vmode) == 8)
+ || GET_MODE_SIZE (vmode) == 4))
;
else
return false;
case E_V8SFmode:
case E_V2DFmode:
case E_V4SFmode:
+ case E_V2SFmode:
+ case E_V2HImode:
case E_V4HImode:
case E_V8HImode:
case E_V8SImode:
goto do_subreg;
case E_V4SImode:
- for (i = 0; i < 4; ++i)
- mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
- vmode = V8HImode;
- goto do_subreg;
+ if (TARGET_AVX2)
+ {
+ /* Use vpblendd instead of vpblendw. */
+ for (i = 0; i < nelt; ++i)
+ mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
+ break;
+ }
+ else
+ {
+ for (i = 0; i < 4; ++i)
+ mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
+ vmode = V8HImode;
+ goto do_subreg;
+ }
case E_V16QImode:
/* See if bytes move in pairs so we can use pblendw with
mmode = VOIDmode;
}
+ /* Canonicalize vec_merge. */
+ if (swap_commutative_operands_p (op1, op0)
+ /* If the two operands have the same precedence, then
+ the first bit of the mask selects the first operand. */
+ || (!swap_commutative_operands_p (op0, op1)
+ && !(mask & 1)))
+ {
+ unsigned n_elts = GET_MODE_NUNITS (vmode);
+ std::swap (op0, op1);
+ unsigned HOST_WIDE_INT mask_all = HOST_WIDE_INT_1U;
+ if (n_elts == HOST_BITS_PER_WIDE_INT)
+ mask_all = -1;
+ else
+ mask_all = (HOST_WIDE_INT_1U << n_elts) - 1;
+ mask = ~mask & mask_all;
+ }
+
if (mmode != VOIDmode)
maskop = force_reg (mmode, gen_int_mode (mask, mmode));
else
}
}
+ /* Try the SSE4.1 blend variable merge instructions. */
+ if (expand_vec_perm_blend (d))
+ return true;
+
/* Try movss/movsd instructions. */
if (expand_vec_perm_movs (d))
return true;
- /* Finally, try the fully general two operand permute. */
+ /* Try the SSE4.1 insertps instruction. */
+ if (expand_vec_perm_insertps (d))
+ return true;
+
+ /* Try the fully general two operand permute. */
if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
d->testing_p))
return true;
return true;
}
- /* Try the SSE4.1 blend variable merge instructions. */
- if (expand_vec_perm_blend (d))
- return true;
-
/* Try one of the AVX vpermil variable permutations. */
if (expand_vec_perm_vpermil (d))
return true;
return false;
}
+/* Canonicalize the vec_perm index so that the first index
+ always comes from the first vector. */
+static void
+ix86_vec_perm_index_canon (struct expand_vec_perm_d *d)
+{
+ unsigned nelt = d->nelt;
+ if (d->perm[0] < nelt)
+ return;
+
+ for (unsigned i = 0; i != nelt; i++)
+ d->perm[i] = (d->perm[i] + nelt) % (2 * nelt);
+
+ std::swap (d->op0, d->op1);
+ return;
+}
+
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
+ in terms of a pair of shufps + shufps/pshufd instructions. */
+static bool
+expand_vec_perm_shufps_shufps (struct expand_vec_perm_d *d)
+{
+ unsigned char perm1[4];
+ machine_mode vmode = d->vmode;
+ bool ok;
+ unsigned i, j, k, count = 0;
+
+ if (d->one_operand_p
+ || (vmode != V4SImode && vmode != V4SFmode))
+ return false;
+
+ if (d->testing_p)
+ return true;
+
+ ix86_vec_perm_index_canon (d);
+ for (i = 0; i < 4; ++i)
+ count += d->perm[i] > 3 ? 1 : 0;
+
+ gcc_assert (count & 3);
+
+ rtx tmp = gen_reg_rtx (vmode);
+ /* 2 from op0 and 2 from op1. */
+ if (count == 2)
+ {
+ unsigned char perm2[4];
+ for (i = 0, j = 0, k = 2; i < 4; ++i)
+ if (d->perm[i] & 4)
+ {
+ perm1[k++] = d->perm[i];
+ perm2[i] = k - 1;
+ }
+ else
+ {
+ perm1[j++] = d->perm[i];
+ perm2[i] = j - 1;
+ }
+
+ /* shufps. */
+ ok = expand_vselect_vconcat (tmp, d->op0, d->op1,
+ perm1, d->nelt, false);
+ gcc_assert (ok);
+ if (vmode == V4SImode && TARGET_SSE2)
+ /* pshufd. */
+ ok = expand_vselect (d->target, tmp,
+ perm2, d->nelt, false);
+ else
+ {
+ /* shufps. */
+ perm2[2] += 4;
+ perm2[3] += 4;
+ ok = expand_vselect_vconcat (d->target, tmp, tmp,
+ perm2, d->nelt, false);
+ }
+ gcc_assert (ok);
+ }
+ /* 3 from one op and 1 from another. */
+ else
+ {
+ unsigned pair_idx = 8, lone_idx = 8, shift;
+
+ /* Find the lone index. */
+ for (i = 0; i < 4; ++i)
+ if ((d->perm[i] > 3 && count == 1)
+ || (d->perm[i] < 4 && count == 3))
+ lone_idx = i;
+
+ /* When lone_idx is not 0, it must be from the second op (count == 1). */
+ gcc_assert (count == (lone_idx ? 1 : 3));
+
+ /* Find the pair index that sits in the same half as the lone index. */
+ shift = lone_idx & 2;
+ pair_idx = 1 - lone_idx + 2 * shift;
+
+ /* First permute the lone index and pair index into the same vector as
+ [ lone, lone, pair, pair ]. */
+ perm1[1] = perm1[0]
+ = (count == 3) ? d->perm[lone_idx] : d->perm[lone_idx] - 4;
+ perm1[3] = perm1[2]
+ = (count == 3) ? d->perm[pair_idx] : d->perm[pair_idx] + 4;
+
+ /* Always put the vector containing the lone index first. */
+ if (count == 1)
+ std::swap (d->op0, d->op1);
+
+ /* shufps. */
+ ok = expand_vselect_vconcat (tmp, d->op0, d->op1,
+ perm1, d->nelt, false);
+ gcc_assert (ok);
+
+ /* Map the lone and pair indices back to their original order. */
+ perm1[shift] = lone_idx << 1;
+ perm1[shift + 1] = pair_idx << 1;
+
+ /* Select the remaining 2 elements in another vector. */
+ for (i = 2 - shift; i < 4 - shift; ++i)
+ perm1[i] = lone_idx == 1 ? d->perm[i] + 4 : d->perm[i];
+
+ /* Adjust to original selector. */
+ if (lone_idx > 1)
+ std::swap (tmp, d->op1);
+
+ /* shufps. */
+ ok = expand_vselect_vconcat (d->target, tmp, d->op1,
+ perm1, d->nelt, false);
+
+ gcc_assert (ok);
+ }
+
+ return true;
+}
+
/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
in terms of a pair of pshuflw + pshufhw instructions. */
;
else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
;
- else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 4
- || GET_MODE_SIZE (vmode) == 8
- || GET_MODE_SIZE (vmode) == 16))
+ else if (TARGET_SSE4_1
+ && (GET_MODE_SIZE (vmode) == 16
+ || (TARGET_MMX_WITH_SSE && GET_MODE_SIZE (vmode) == 8)
+ || GET_MODE_SIZE (vmode) == 4))
;
else
return false;
;
else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
;
- else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 16
- || GET_MODE_SIZE (vmode) == 8
- || GET_MODE_SIZE (vmode) == 4))
+ else if (TARGET_SSE4_1
+ && (GET_MODE_SIZE (vmode) == 16
+ || (TARGET_MMX_WITH_SSE && GET_MODE_SIZE (vmode) == 8)
+ || GET_MODE_SIZE (vmode) == 4))
;
else
return false;
if (d->testing_p)
return true;
- rtx (*maybe_gen) (machine_mode, int, rtx, rtx, rtx);
+ rtx (*gen_interleave) (machine_mode, int, rtx, rtx, rtx);
if (elt >= nelt2)
{
- maybe_gen = maybe_gen_vec_interleave_high;
+ gen_interleave = gen_vec_interleave_high;
elt -= nelt2;
}
else
- maybe_gen = maybe_gen_vec_interleave_low;
+ gen_interleave = gen_vec_interleave_low;
nelt2 /= 2;
dest = gen_reg_rtx (vmode);
- emit_insn (maybe_gen (vmode, 1, dest, op0, op0));
+ emit_insn (gen_interleave (vmode, 1, dest, op0, op0));
vmode = V4SImode;
op0 = gen_lowpart (vmode, dest);
if (expand_vec_perm_2perm_pblendv (d, true))
return true;
+ if (expand_vec_perm_shufps_shufps (d))
+ return true;
+
/* Try sequences of three instructions. */
if (expand_vec_perm_even_odd_pack (d))
gcc_assert (ok);
}
-/* This function is similar as ix86_expand_vecop_qihi,
- but optimized under AVX512BW by using vpmovwb.
- For example, optimize vector MUL generation like
-
- vpmovzxbw ymm2, xmm0
- vpmovzxbw ymm3, xmm1
- vpmullw ymm4, ymm2, ymm3
- vpmovwb xmm0, ymm4
-
- it would take less instructions than ix86_expand_vecop_qihi.
- Return true if success. */
-
-static bool
-ix86_expand_vecop_qihi2 (enum rtx_code code, rtx dest, rtx op1, rtx op2)
-{
- machine_mode himode, qimode = GET_MODE (dest);
- rtx hop1, hop2, hdest;
- rtx (*gen_extend)(rtx, rtx);
- rtx (*gen_truncate)(rtx, rtx);
- bool uns_p = (code == ASHIFTRT) ? false : true;
-
- /* There's no V64HImode multiplication instruction. */
- if (qimode == E_V64QImode)
- return false;
-
- /* vpmovwb only available under AVX512BW. */
- if (!TARGET_AVX512BW)
- return false;
- if ((qimode == V8QImode || qimode == V16QImode)
- && !TARGET_AVX512VL)
- return false;
- /* Not generate zmm instruction when prefer 128/256 bit vector width. */
- if (qimode == V32QImode
- && (TARGET_PREFER_AVX128 || TARGET_PREFER_AVX256))
- return false;
-
- switch (qimode)
- {
- case E_V8QImode:
- himode = V8HImode;
- gen_extend = uns_p ? gen_zero_extendv8qiv8hi2 : gen_extendv8qiv8hi2;
- gen_truncate = gen_truncv8hiv8qi2;
- break;
- case E_V16QImode:
- himode = V16HImode;
- gen_extend = uns_p ? gen_zero_extendv16qiv16hi2 : gen_extendv16qiv16hi2;
- gen_truncate = gen_truncv16hiv16qi2;
- break;
- case E_V32QImode:
- himode = V32HImode;
- gen_extend = uns_p ? gen_zero_extendv32qiv32hi2 : gen_extendv32qiv32hi2;
- gen_truncate = gen_truncv32hiv32qi2;
- break;
- default:
- gcc_unreachable ();
- }
-
- hop1 = gen_reg_rtx (himode);
- hop2 = gen_reg_rtx (himode);
- hdest = gen_reg_rtx (himode);
- emit_insn (gen_extend (hop1, op1));
- emit_insn (gen_extend (hop2, op2));
- emit_insn (gen_rtx_SET (hdest, simplify_gen_binary (code, himode,
- hop1, hop2)));
- emit_insn (gen_truncate (dest, hdest));
- return true;
-}
-
/* Expand a vector operation shift by constant for a V*QImode in terms of the
same operation on V*HImode. Return true if success. */
static bool
return true;
}
+void
+ix86_expand_vecop_qihi_partial (enum rtx_code code, rtx dest, rtx op1, rtx op2)
+{
+ machine_mode qimode = GET_MODE (dest);
+ rtx qop1, qop2, hop1, hop2, qdest, hdest;
+ bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
+ bool uns_p = code != ASHIFTRT;
+
+ switch (qimode)
+ {
+ case E_V4QImode:
+ case E_V8QImode:
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ qop1 = lowpart_subreg (V16QImode, force_reg (qimode, op1), qimode);
+
+ if (op2vec)
+ qop2 = lowpart_subreg (V16QImode, force_reg (qimode, op2), qimode);
+ else
+ qop2 = op2;
+
+ qdest = gen_reg_rtx (V16QImode);
+
+ if (CONST_INT_P (op2)
+ && (code == ASHIFT || code == LSHIFTRT || code == ASHIFTRT)
+ && ix86_expand_vec_shift_qihi_constant (code, qdest, qop1, qop2))
+ {
+ emit_move_insn (dest, gen_lowpart (qimode, qdest));
+ return;
+ }
+
+ switch (code)
+ {
+ case MULT:
+ gcc_assert (op2vec);
+ if (!TARGET_SSE4_1)
+ {
+ /* Unpack data such that we've got a source byte in each low byte
+ of each word. We don't care what goes into the high byte of
+ each word. Rather than trying to get zero in there, most
+ convenient is to let it be a copy of the low byte. */
+ hop1 = copy_to_reg (qop1);
+ hop2 = copy_to_reg (qop2);
+ emit_insn (gen_vec_interleave_lowv16qi (hop1, hop1, hop1));
+ emit_insn (gen_vec_interleave_lowv16qi (hop2, hop2, hop2));
+ break;
+ }
+ /* FALLTHRU */
+ case ASHIFT:
+ case ASHIFTRT:
+ case LSHIFTRT:
+ hop1 = gen_reg_rtx (V8HImode);
+ ix86_expand_sse_unpack (hop1, qop1, uns_p, false);
+ /* mult/vashr/vlshr/vashl */
+ if (op2vec)
+ {
+ hop2 = gen_reg_rtx (V8HImode);
+ ix86_expand_sse_unpack (hop2, qop2, uns_p, false);
+ }
+ else
+ hop2 = qop2;
+
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ if (code != MULT && op2vec)
+ {
+ /* Expand vashr/vlshr/vashl. */
+ hdest = gen_reg_rtx (V8HImode);
+ emit_insn (gen_rtx_SET (hdest,
+ simplify_gen_binary (code, V8HImode,
+ hop1, hop2)));
+ }
+ else
+ /* Expand mult/ashr/lshr/ashl. */
+ hdest = expand_simple_binop (V8HImode, code, hop1, hop2,
+ NULL_RTX, 1, OPTAB_DIRECT);
+
+ if (TARGET_AVX512BW && TARGET_AVX512VL)
+ {
+ if (qimode == V8QImode)
+ qdest = dest;
+ else
+ qdest = gen_reg_rtx (V8QImode);
+
+ emit_insn (gen_truncv8hiv8qi2 (qdest, hdest));
+ }
+ else
+ {
+ struct expand_vec_perm_d d;
+ rtx qres = gen_lowpart (V16QImode, hdest);
+ bool ok;
+ int i;
+
+ /* Merge the data back into the right place. */
+ d.target = qdest;
+ d.op0 = d.op1 = qres;
+ d.vmode = V16QImode;
+ d.nelt = 16;
+ d.one_operand_p = false;
+ d.testing_p = false;
+
+ for (i = 0; i < d.nelt; ++i)
+ d.perm[i] = i * 2;
+
+ ok = ix86_expand_vec_perm_const_1 (&d);
+ gcc_assert (ok);
+ }
+
+ if (qdest != dest)
+ emit_move_insn (dest, gen_lowpart (qimode, qdest));
+}
+
+/* Emit the instruction in a 2x wider mode. For example, optimize
+ vector MUL generation like
+
+ vpmovzxbw ymm2, xmm0
+ vpmovzxbw ymm3, xmm1
+ vpmullw ymm4, ymm2, ymm3
+ vpmovwb xmm0, ymm4
+
+ it would take less instructions than ix86_expand_vecop_qihi.
+ Return true if success. */
+
+static bool
+ix86_expand_vecop_qihi2 (enum rtx_code code, rtx dest, rtx op1, rtx op2)
+{
+ machine_mode himode, qimode = GET_MODE (dest);
+ machine_mode wqimode;
+ rtx qop1, qop2, hop1, hop2, hdest;
+ rtx (*gen_truncate)(rtx, rtx) = NULL;
+ bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
+ bool uns_p = code != ASHIFTRT;
+
+ if ((qimode == V16QImode && !TARGET_AVX2)
+ || (qimode == V32QImode && !TARGET_AVX512BW)
+ /* There are no V64HImode instructions. */
+ || qimode == V64QImode)
+ return false;
+
+ /* Do not generate ymm/zmm instructions when
+ target prefers 128/256 bit vector width. */
+ if ((qimode == V16QImode && TARGET_PREFER_AVX128)
+ || (qimode == V32QImode && TARGET_PREFER_AVX256))
+ return false;
+
+ switch (qimode)
+ {
+ case E_V16QImode:
+ himode = V16HImode;
+ if (TARGET_AVX512VL && TARGET_AVX512BW)
+ gen_truncate = gen_truncv16hiv16qi2;
+ break;
+ case E_V32QImode:
+ himode = V32HImode;
+ gen_truncate = gen_truncv32hiv32qi2;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ wqimode = GET_MODE_2XWIDER_MODE (qimode).require ();
+ qop1 = lowpart_subreg (wqimode, force_reg (qimode, op1), qimode);
+
+ if (op2vec)
+ qop2 = lowpart_subreg (wqimode, force_reg (qimode, op2), qimode);
+ else
+ qop2 = op2;
+
+ hop1 = gen_reg_rtx (himode);
+ ix86_expand_sse_unpack (hop1, qop1, uns_p, false);
+
+ if (op2vec)
+ {
+ hop2 = gen_reg_rtx (himode);
+ ix86_expand_sse_unpack (hop2, qop2, uns_p, false);
+ }
+ else
+ hop2 = qop2;
+
+ if (code != MULT && op2vec)
+ {
+ /* Expand vashr/vlshr/vashl. */
+ hdest = gen_reg_rtx (himode);
+ emit_insn (gen_rtx_SET (hdest,
+ simplify_gen_binary (code, himode,
+ hop1, hop2)));
+ }
+ else
+ /* Expand mult/ashr/lshr/ashl. */
+ hdest = expand_simple_binop (himode, code, hop1, hop2,
+ NULL_RTX, 1, OPTAB_DIRECT);
+
+ if (gen_truncate)
+ emit_insn (gen_truncate (dest, hdest));
+ else
+ {
+ struct expand_vec_perm_d d;
+ rtx wqdest = gen_reg_rtx (wqimode);
+ rtx wqres = gen_lowpart (wqimode, hdest);
+ bool ok;
+ int i;
+
+ /* Merge the data back into the right place. */
+ d.target = wqdest;
+ d.op0 = d.op1 = wqres;
+ d.vmode = wqimode;
+ d.nelt = GET_MODE_NUNITS (wqimode);
+ d.one_operand_p = false;
+ d.testing_p = false;
+
+ for (i = 0; i < d.nelt; ++i)
+ d.perm[i] = i * 2;
+
+ ok = ix86_expand_vec_perm_const_1 (&d);
+ gcc_assert (ok);
+
+ emit_move_insn (dest, gen_lowpart (qimode, wqdest));
+ }
+
+ return true;
+}
+
/* Expand a vector operation CODE for a V*QImode in terms of the
same operation on V*HImode. */
rtx (*gen_il) (rtx, rtx, rtx);
rtx (*gen_ih) (rtx, rtx, rtx);
rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
+ bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
struct expand_vec_perm_d d;
- bool ok, full_interleave;
- bool uns_p = false;
+ bool full_interleave = true;
+ bool uns_p = code != ASHIFTRT;
+ bool ok;
int i;
if (CONST_INT_P (op2)
&& ix86_expand_vec_shift_qihi_constant (code, dest, op1, op2))
return;
- if (TARGET_AVX512BW
- && VECTOR_MODE_P (GET_MODE (op2))
- && ix86_expand_vecop_qihi2 (code, dest, op1, op2))
+ if (ix86_expand_vecop_qihi2 (code, dest, op1, op2))
return;
switch (qimode)
{
case E_V16QImode:
himode = V8HImode;
- gen_il = gen_vec_interleave_lowv16qi;
- gen_ih = gen_vec_interleave_highv16qi;
break;
case E_V32QImode:
himode = V16HImode;
- gen_il = gen_avx2_interleave_lowv32qi;
- gen_ih = gen_avx2_interleave_highv32qi;
break;
case E_V64QImode:
himode = V32HImode;
- gen_il = gen_avx512bw_interleave_lowv64qi;
- gen_ih = gen_avx512bw_interleave_highv64qi;
break;
default:
gcc_unreachable ();
switch (code)
{
case MULT:
+ gcc_assert (op2vec);
/* Unpack data such that we've got a source byte in each low byte of
each word. We don't care what goes into the high byte of each word.
Rather than trying to get zero in there, most convenient is to let
it be a copy of the low byte. */
+ switch (qimode)
+ {
+ case E_V16QImode:
+ gen_il = gen_vec_interleave_lowv16qi;
+ gen_ih = gen_vec_interleave_highv16qi;
+ break;
+ case E_V32QImode:
+ gen_il = gen_avx2_interleave_lowv32qi;
+ gen_ih = gen_avx2_interleave_highv32qi;
+ full_interleave = false;
+ break;
+ case E_V64QImode:
+ gen_il = gen_avx512bw_interleave_lowv64qi;
+ gen_ih = gen_avx512bw_interleave_highv64qi;
+ full_interleave = false;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
op2_l = gen_reg_rtx (qimode);
op2_h = gen_reg_rtx (qimode);
emit_insn (gen_il (op2_l, op2, op2));
op1_h = gen_reg_rtx (qimode);
emit_insn (gen_il (op1_l, op1, op1));
emit_insn (gen_ih (op1_h, op1, op1));
- full_interleave = qimode == V16QImode;
break;
case ASHIFT:
- case LSHIFTRT:
- uns_p = true;
- /* FALLTHRU */
case ASHIFTRT:
+ case LSHIFTRT:
op1_l = gen_reg_rtx (himode);
op1_h = gen_reg_rtx (himode);
ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
/* vashr/vlshr/vashl */
- if (GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT)
+ if (op2vec)
{
rtx tmp = force_reg (qimode, op2);
op2_l = gen_reg_rtx (himode);
else
op2_l = op2_h = op2;
- full_interleave = true;
break;
default:
gcc_unreachable ();
}
- /* Perform vashr/vlshr/vashl. */
- if (code != MULT
- && GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT)
+ if (code != MULT && op2vec)
{
+ /* Expand vashr/vlshr/vashl. */
res_l = gen_reg_rtx (himode);
res_h = gen_reg_rtx (himode);
emit_insn (gen_rtx_SET (res_l,
simplify_gen_binary (code, himode,
op1_h, op2_h)));
}
- /* Performance mult/ashr/lshr/ashl. */
else
{
+ /* Expand mult/ashr/lshr/ashl. */
res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
1, OPTAB_DIRECT);
res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
if (full_interleave)
{
- /* For SSE2, we used an full interleave, so the desired
+ /* We used the full interleave, so the desired
results are in the even elements. */
for (i = 0; i < d.nelt; ++i)
d.perm[i] = i * 2;
ok = ix86_expand_vec_perm_const_1 (&d);
gcc_assert (ok);
-
- set_unique_reg_note (get_last_insn (), REG_EQUAL,
- gen_rtx_fmt_ee (code, qimode, op1, op2));
}
/* Helper function of ix86_expand_mul_widen_evenodd. Return true
*ptarget_bool = target_bool;
}
+/* Convert a BFmode VAL to SFmode without signaling sNaNs.
+ This is done by returning SF SUBREG of ((HI SUBREG) (VAL)) << 16. */
+
+rtx
+ix86_expand_fast_convert_bf_to_sf (rtx val)
+{
+ rtx op = gen_lowpart (HImode, val), ret;
+ if (CONST_INT_P (op))
+ {
+ ret = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
+ val, BFmode);
+ if (ret)
+ return ret;
+ /* FLOAT_EXTEND simplification will fail if VAL is a sNaN. */
+ ret = gen_reg_rtx (SImode);
+ emit_move_insn (ret, GEN_INT (INTVAL (op) & 0xffff));
+ emit_insn (gen_ashlsi3 (ret, ret, GEN_INT (16)));
+ return gen_lowpart (SFmode, ret);
+ }
+
+ ret = gen_reg_rtx (SFmode);
+ emit_insn (gen_extendbfsf2_1 (ret, force_reg (BFmode, val)));
+ return ret;
+}
+
#include "gt-i386-expand.h"