#include "pass_manager.h"
#include "target-globals.h"
#include "gimple-iterator.h"
-#include "tree-vectorizer.h"
#include "shrink-wrap.h"
#include "builtins.h"
#include "rtl-iter.h"
/* Special case inserting 64-bit values into a TImode register. */
if (TARGET_64BIT
+ /* Disable for -O0 (see PR110587) unless naked (PR110533). */
+ && (optimize || ix86_function_naked (current_function_decl))
&& (mode == DImode || mode == DFmode)
&& SUBREG_P (op0)
&& GET_MODE (SUBREG_REG (op0)) == TImode
op0 = SUBREG_REG (op0);
tmp = gen_rtx_AND (TImode, copy_rtx (op0), tmp);
if (mode == DFmode)
- op1 = force_reg (DImode, gen_lowpart (DImode, op1));
+ op1 = gen_lowpart (DImode, op1);
op1 = gen_rtx_ZERO_EXTEND (TImode, op1);
op1 = gen_rtx_IOR (TImode, tmp, op1);
}
op0 = SUBREG_REG (op0);
tmp = gen_rtx_AND (TImode, copy_rtx (op0), tmp);
if (mode == DFmode)
- op1 = force_reg (DImode, gen_lowpart (DImode, op1));
+ op1 = gen_lowpart (DImode, op1);
op1 = gen_rtx_ZERO_EXTEND (TImode, op1);
op1 = gen_rtx_ASHIFT (TImode, op1, GEN_INT (64));
op1 = gen_rtx_IOR (TImode, tmp, op1);
switch (mode)
{
- case E_V4QImode:
case E_V8QImode:
+ case E_V4QImode:
+ case E_V2QImode:
sse_mode = V16QImode;
double_sse_mode = V32QImode;
mask = gen_rtx_PARALLEL (VOIDmode,
}
}
-/* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is
+/* Extend SRC into the next wider integer vector type.  UNSIGNED_P is
+ true if we should do zero extension, else sign extension. */
+
+void
+ix86_expand_sse_extend (rtx dest, rtx src, bool unsigned_p)
+{
+  machine_mode imode = GET_MODE (src);
+  rtx ops[3];
+
+  /* Only the small (<= 64-bit) integer vector modes are handled;
+     anything else is a caller bug.  */
+  switch (imode)
+    {
+    case E_V8QImode:
+    case E_V4QImode:
+    case E_V2QImode:
+    case E_V4HImode:
+    case E_V2HImode:
+    case E_V2SImode:
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  /* ops[0] receives the interleaved (widened) result.  */
+  ops[0] = gen_reg_rtx (imode);
+
+  ops[1] = force_reg (imode, src);
+
+  if (unsigned_p)
+    /* Zero extension: interleave SRC with zeros.  */
+    ops[2] = force_reg (imode, CONST0_RTX (imode));
+  else
+    /* Sign extension: interleave SRC with its per-element sign mask,
+       computed as 0 > SRC (all-ones lanes where SRC is negative).  */
+    ops[2] = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
+				  ops[1], pc_rtx, pc_rtx);
+
+  /* Interleave the low elements (HIGH_P == false) of ops[1] with ops[2]
+     via a punpck; result lands in ops[0].  */
+  ix86_split_mmx_punpck (ops, false);
+  /* Reinterpret the interleaved bits in DEST's (wider-element) mode.  */
+  emit_move_insn (dest, lowpart_subreg (GET_MODE (dest), ops[0], imode));
+}
+
+/* Unpack SRC into the next wider integer vector type. UNSIGNED_P is
true if we should do zero extension, else sign extension. HIGH_P is
true if we want the N/2 high elements, else the low elements. */
{
if (alg == no_stringop)
return false;
+ /* It is not possible to use a library call if we have non-default
+ address space. We can do better than the generic byte-at-a-time
+ loop, used as a fallback. */
+ if (alg == libcall && have_as)
+ return false;
if (alg == vector_loop)
return TARGET_SSE || TARGET_AVX;
/* Algorithms using the rep prefix want at least edi and ecx;
gcc_assert (alg != libcall);
return alg;
}
+
+ /* Try to use some reasonable fallback algorithm. Note that for
+ non-default address spaces we default to a loop instead of
+ a libcall. */
return (alg_usable_p (algs->unknown_size, memset, have_as)
- ? algs->unknown_size : libcall);
+ ? algs->unknown_size : have_as ? loop : libcall);
}
/* Decide on alignment. We know that the operand is already aligned to ALIGN
case V4SF_FTYPE_V4SF_UINT:
case V4SF_FTYPE_V4SF_DI:
case V4SF_FTYPE_V4SF_SI:
+ case V4DI_FTYPE_V4DI_V2DI:
case V2DI_FTYPE_V2DI_V2DI:
case V2DI_FTYPE_V16QI_V16QI:
case V2DI_FTYPE_V4SI_V4SI:
case V8HI_FTYPE_V8DI_V8HI_UQI:
case V8SI_FTYPE_V8DI_V8SI_UQI:
case V4SI_FTYPE_V4SI_V4SI_V4SI:
+ case V4DI_FTYPE_V4DI_V4DI_V2DI:
case V16SI_FTYPE_V16SI_V16SI_V16SI:
case V8DI_FTYPE_V8DI_V8DI_V8DI:
case V32HI_FTYPE_V32HI_V32HI_V32HI:
case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
+ case V4SI_FTYPE_V4SI_V4SI_V4SI_INT:
nargs = 4;
nargs_constant = 1;
break;
{
tmp1 = force_reg (GET_MODE_INNER (mode), val);
tmp2 = gen_reg_rtx (mode);
- emit_insn (maybe_gen_vec_set_0 (mode, tmp2,
- CONST0_RTX (mode), tmp1));
+ emit_insn (gen_vec_set_0 (mode, tmp2, CONST0_RTX (mode), tmp1));
tmp1 = gen_lowpart (mode, tmp2);
}
else
? gen_reg_rtx (V16HFmode)
: gen_reg_rtx (V16BFmode));
if (elt < 16)
- emit_insn (maybe_gen_vec_extract_lo (mode, tmp, vec));
+ emit_insn (gen_vec_extract_lo (mode, tmp, vec));
else
- emit_insn (maybe_gen_vec_extract_hi (mode, tmp, vec));
+ emit_insn (gen_vec_extract_hi (mode, tmp, vec));
ix86_expand_vector_extract (false, target, tmp, elt & 15);
return;
}
? gen_reg_rtx (V8HFmode)
: gen_reg_rtx (V8BFmode));
if (elt < 8)
- emit_insn (maybe_gen_vec_extract_lo (mode, tmp, vec));
+ emit_insn (gen_vec_extract_lo (mode, tmp, vec));
else
- emit_insn (maybe_gen_vec_extract_hi (mode, tmp, vec));
+ emit_insn (gen_vec_extract_hi (mode, tmp, vec));
ix86_expand_vector_extract (false, target, tmp, elt & 7);
return;
}
mmode = VOIDmode;
}
+ /* Canonicalize vec_merge. */
+ if (swap_commutative_operands_p (op1, op0)
+ /* Two operands have same precedence, then
+ first bit of mask select first operand. */
+ || (!swap_commutative_operands_p (op0, op1)
+ && !(mask & 1)))
+ {
+ unsigned n_elts = GET_MODE_NUNITS (vmode);
+ std::swap (op0, op1);
+ unsigned HOST_WIDE_INT mask_all = HOST_WIDE_INT_1U;
+ if (n_elts == HOST_BITS_PER_WIDE_INT)
+ mask_all = -1;
+ else
+ mask_all = (HOST_WIDE_INT_1U << n_elts) - 1;
+ mask = ~mask & mask_all;
+ }
+
if (mmode != VOIDmode)
maskop = force_reg (mmode, gen_int_mode (mask, mmode));
else
if (d->testing_p)
return true;
- rtx (*maybe_gen) (machine_mode, int, rtx, rtx, rtx);
+ rtx (*gen_interleave) (machine_mode, int, rtx, rtx, rtx);
if (elt >= nelt2)
{
- maybe_gen = maybe_gen_vec_interleave_high;
+ gen_interleave = gen_vec_interleave_high;
elt -= nelt2;
}
else
- maybe_gen = maybe_gen_vec_interleave_low;
+ gen_interleave = gen_vec_interleave_low;
nelt2 /= 2;
dest = gen_reg_rtx (vmode);
- emit_insn (maybe_gen (vmode, 1, dest, op0, op0));
+ emit_insn (gen_interleave (vmode, 1, dest, op0, op0));
vmode = V4SImode;
op0 = gen_lowpart (vmode, dest);