#include "pass_manager.h"
#include "target-globals.h"
#include "gimple-iterator.h"
-#include "tree-vectorizer.h"
#include "shrink-wrap.h"
#include "builtins.h"
#include "rtl-iter.h"
default:
break;
+
+ case SUBREG:
+ /* Transform TImode paradoxical SUBREG into zero_extendditi2. */
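+ I.e. (subreg:TI (reg:DI) 0) is rewritten as (zero_extend:TI (reg:DI)),
+ which the zero_extendditi2 pattern then matches. */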
+ if (TARGET_64BIT
+ && mode == TImode
+ && SUBREG_P (op1)
+ && GET_MODE (SUBREG_REG (op1)) == DImode
+ && SUBREG_BYTE (op1) == 0)
+ op1 = gen_rtx_ZERO_EXTEND (TImode, SUBREG_REG (op1));
+ break;
}
if ((flag_pic || MACHOPIC_INDIRECT)
return;
}
}
- else if (GET_MODE_SIZE (mode) >= 16)
+ else if (CONST_WIDE_INT_P (op1)
+ && GET_MODE_SIZE (mode) >= 16)
{
rtx tmp = ix86_convert_const_wide_int_to_broadcast
(GET_MODE (op0), op1);
}
}
+ /* Special case inserting 64-bit values into a TImode register. */
+ if (TARGET_64BIT
+ /* Disable for -O0 (see PR110587) unless naked (PR110533). */
+ && (optimize || ix86_function_naked (current_function_decl))
+ && (mode == DImode || mode == DFmode)
+ && SUBREG_P (op0)
+ && GET_MODE (SUBREG_REG (op0)) == TImode
+ && REG_P (SUBREG_REG (op0))
+ && REG_P (op1))
+ {
+ /* Use *insvti_lowpart_1 to set lowpart. */
+ if (SUBREG_BYTE (op0) == 0)
+ {
+ wide_int mask = wi::mask (64, true, 128);
+ rtx tmp = immed_wide_int_const (mask, TImode);
+ op0 = SUBREG_REG (op0);
+ tmp = gen_rtx_AND (TImode, copy_rtx (op0), tmp);
+ if (mode == DFmode)
+ op1 = gen_lowpart (DImode, op1);
+ op1 = gen_rtx_ZERO_EXTEND (TImode, op1);
+ op1 = gen_rtx_IOR (TImode, tmp, op1);
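+ /* op1 is now (ior:TI (and:TI op0 <mask of bits 64..127>)
+ (zero_extend:TI op1)): the old high half of op0 is kept and
+ the new value fills the low 64 bits. */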
+ }
+ /* Use *insvti_highpart_1 to set highpart. */
+ else if (SUBREG_BYTE (op0) == 8)
+ {
+ wide_int mask = wi::mask (64, false, 128);
+ rtx tmp = immed_wide_int_const (mask, TImode);
+ op0 = SUBREG_REG (op0);
+ tmp = gen_rtx_AND (TImode, copy_rtx (op0), tmp);
+ if (mode == DFmode)
+ op1 = gen_lowpart (DImode, op1);
+ op1 = gen_rtx_ZERO_EXTEND (TImode, op1);
+ op1 = gen_rtx_ASHIFT (TImode, op1, GEN_INT (64));
+ op1 = gen_rtx_IOR (TImode, tmp, op1);
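+ /* Here the low 64 bits of op0 are kept and the new value,
+ shifted left by 64, fills the high half. */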
+ }
+ }
+
emit_insn (gen_rtx_SET (op0, op1));
}
return;
}
- /* Special case TImode to V1TImode conversions, via V2DI. */
- if (mode == V1TImode
+ /* Special case TImode to 128-bit vector conversions via V2DI. */
+ if (VECTOR_MODE_P (mode)
+ && GET_MODE_SIZE (mode) == 16
&& SUBREG_P (op1)
&& GET_MODE (SUBREG_REG (op1)) == TImode
&& TARGET_64BIT && TARGET_SSE
emit_move_insn (lo, gen_lowpart (DImode, SUBREG_REG (op1)));
emit_move_insn (hi, gen_highpart (DImode, SUBREG_REG (op1)));
emit_insn (gen_vec_concatv2di (tmp, lo, hi));
- emit_move_insn (op0, gen_lowpart (V1TImode, tmp));
+ emit_move_insn (op0, gen_lowpart (mode, tmp));
return;
}
rtx op0 = operands[0];
rtx op1 = operands[1];
rtx op2 = operands[2];
+ rtx src;
machine_mode dmode = GET_MODE (op0);
machine_mode smode = GET_MODE (op1);
op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));
- op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
- op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
- rtx insn = gen_rtx_SET (dest, gen_rtx_VEC_CONCAT (sse_dmode,
- op1, op2));
- emit_insn (insn);
+ /* packusdw/packuswb does unsigned saturation of a signed source,
+ which is different from the generic us_truncate RTX. */
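+ /* E.g. packuswb converts the signed word -1 (0xffff) to 0, whereas
+ us_truncate of the unsigned value 0xffff would give 0xff. */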
+ if (code == US_TRUNCATE)
+ src = gen_rtx_UNSPEC (sse_dmode,
+ gen_rtvec (2, op1, op2),
+ UNSPEC_US_TRUNCATE);
+ else
+ {
+ op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
+ op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
+ src = gen_rtx_VEC_CONCAT (sse_dmode, op1, op2);
+ }
+
+ emit_move_insn (dest, src);
ix86_move_vector_high_sse_to_mmx (op0);
}
switch (mode)
{
- case E_V4QImode:
case E_V8QImode:
+ case E_V4QImode:
+ case E_V2QImode:
sse_mode = V16QImode;
double_sse_mode = V32QImode;
mask = gen_rtx_PARALLEL (VOIDmode,
else
dest = NULL_RTX;
op1 = lowpart_subreg (vmode, force_reg (mode, operands[2]), mode);
- mask = ix86_build_signbit_mask (vmode, 0, 0);
+ mask = ix86_build_signbit_mask (vmode, TARGET_AVX512F && mode != HFmode, 0);
if (CONST_DOUBLE_P (operands[1]))
{
/* Handle special case - vector comparison with boolean result, transform
it using ptest instruction. */
if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
+ || (mode == TImode && !TARGET_64BIT)
|| mode == OImode)
{
rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
gcc_assert (code == EQ || code == NE);
- if (mode == OImode)
+ if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
{
op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
tmp = gen_reg_rtx (mode);
emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
tmp = gen_lowpart (p_mode, tmp);
- emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
- gen_rtx_UNSPEC (CCmode,
+ emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
+ gen_rtx_UNSPEC (CCZmode,
gen_rtvec (2, tmp, tmp),
UNSPEC_PTEST)));
tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
cmpmode = SELECT_CC_MODE (code, op0, op1);
flags = gen_rtx_REG (cmpmode, FLAGS_REG);
+ /* Attempt to use PTEST, if available, when testing vector modes for
+ equality/inequality against zero. */
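+ /* ptest x, x sets ZF exactly when x is all zeros, so this is only
+ valid for the CCZmode comparison against const0_rtx. */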
+ if (op1 == const0_rtx
+ && SUBREG_P (op0)
+ && cmpmode == CCZmode
+ && SUBREG_BYTE (op0) == 0
+ && REG_P (SUBREG_REG (op0))
+ && VECTOR_MODE_P (GET_MODE (SUBREG_REG (op0)))
+ && TARGET_SSE4_1
+ && GET_MODE (op0) == TImode
+ && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op0))) == 16)
+ {
+ tmp = SUBREG_REG (op0);
+ tmp = gen_rtx_UNSPEC (CCZmode, gen_rtvec (2, tmp, tmp), UNSPEC_PTEST);
+ }
+ else
+ tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
+
/* This is very simple, but making the interface the same as in the
FP case makes the rest of the code easier. */
- tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
emit_insn (gen_rtx_SET (flags, tmp));
/* Return the test that should be put into the flags user, i.e.
}
}
-/* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is
+/* Extend SRC into the next wider integer vector type. UNSIGNED_P is
+ true if we should do zero extension, else sign extension. */
+
+void
+ix86_expand_sse_extend (rtx dest, rtx src, bool unsigned_p)
+{
+ machine_mode imode = GET_MODE (src);
+ rtx ops[3];
+
+ switch (imode)
+ {
+ case E_V8QImode:
+ case E_V4QImode:
+ case E_V2QImode:
+ case E_V4HImode:
+ case E_V2HImode:
+ case E_V2SImode:
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ ops[0] = gen_reg_rtx (imode);
+
+ ops[1] = force_reg (imode, src);
+
+ if (unsigned_p)
+ ops[2] = force_reg (imode, CONST0_RTX (imode));
+ else
+ ops[2] = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
+ ops[1], pc_rtx, pc_rtx);
+
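+ /* Interleaving each element with zero (unsigned) or with its sign
+ mask (signed) in the low punpck yields that element extended to
+ twice its width. */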
+ ix86_split_mmx_punpck (ops, false);
+ emit_move_insn (dest, lowpart_subreg (GET_MODE (dest), ops[0], imode));
+}
+
+/* Unpack SRC into the next wider integer vector type. UNSIGNED_P is
true if we should do zero extension, else sign extension. HIGH_P is
true if we want the N/2 high elements, else the low elements. */
{
if (alg == no_stringop)
return false;
+ /* It is not possible to use a library call if we have non-default
+ address space. We can do better than the generic byte-at-a-time
+ loop, used as a fallback. */
+ if (alg == libcall && have_as)
+ return false;
if (alg == vector_loop)
return TARGET_SSE || TARGET_AVX;
/* Algorithms using the rep prefix want at least edi and ecx;
gcc_assert (alg != libcall);
return alg;
}
+
+ /* Try to use some reasonable fallback algorithm. Note that for
+ non-default address spaces we default to a loop instead of
+ a libcall. */
return (alg_usable_p (algs->unknown_size, memset, have_as)
- ? algs->unknown_size : libcall);
+ ? algs->unknown_size : have_as ? loop : libcall);
}
/* Decide on alignment. We know that the operand is already aligned to ALIGN
machine_mode mode1 = insn_data[d->icode].operand[1].mode;
enum rtx_code comparison = d->comparison;
+ /* ptest reg, reg sets the carry flag. */
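+ /* CF is set when one operand ANDed with the complement of the other
+ is zero, which always holds when both operands are the same
+ register, so the result folds to the constant 1. */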
+ if (comparison == LTU
+ && (d->code == IX86_BUILTIN_PTESTC
+ || d->code == IX86_BUILTIN_PTESTC256)
+ && rtx_equal_p (op0, op1))
+ {
+ if (!target)
+ target = gen_reg_rtx (SImode);
+ emit_move_insn (target, const1_rtx);
+ return target;
+ }
+
if (VECTOR_MODE_P (mode0))
op0 = safe_vector_operand (op0, mode0);
if (VECTOR_MODE_P (mode1))
case V4SF_FTYPE_V4SF_UINT:
case V4SF_FTYPE_V4SF_DI:
case V4SF_FTYPE_V4SF_SI:
+ case V4DI_FTYPE_V4DI_V2DI:
case V2DI_FTYPE_V2DI_V2DI:
case V2DI_FTYPE_V16QI_V16QI:
case V2DI_FTYPE_V4SI_V4SI:
case V8HI_FTYPE_V8DI_V8HI_UQI:
case V8SI_FTYPE_V8DI_V8SI_UQI:
case V4SI_FTYPE_V4SI_V4SI_V4SI:
+ case V4DI_FTYPE_V4DI_V4DI_V2DI:
case V16SI_FTYPE_V16SI_V16SI_V16SI:
case V8DI_FTYPE_V8DI_V8DI_V8DI:
case V32HI_FTYPE_V32HI_V32HI_V32HI:
case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
+ case V4SI_FTYPE_V4SI_V4SI_V4SI_INT:
nargs = 4;
nargs_constant = 1;
break;
OPTION_MASK_ISA2_AVXIFMA
(OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA2_AVX512BF16) or
OPTION_MASK_ISA2_AVXNECONVERT
+ OPTION_MASK_ISA_AES or (OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA2_VAES)
where for each such pair it is sufficient if either of the ISAs is
enabled, plus if it is ored with other options also those others.
OPTION_MASK_ISA_MMX in bisa is satisfied also if TARGET_MMX_WITH_SSE. */
OPTION_MASK_ISA2_AVXIFMA);
SHARE_BUILTIN (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512BF16, 0,
OPTION_MASK_ISA2_AVXNECONVERT);
- SHARE_BUILTIN (OPTION_MASK_ISA_AES, 0, 0, OPTION_MASK_ISA2_VAES);
+ SHARE_BUILTIN (OPTION_MASK_ISA_AES, 0, OPTION_MASK_ISA_AVX512VL,
+ OPTION_MASK_ISA2_VAES);
isa = tmp_isa;
isa2 = tmp_isa2;
return (bisa & isa) == bisa && (bisa2 & isa2) == bisa2;
}
+/* Emit instructions to set the carry flag from ARG. */
+
+void
+ix86_expand_carry (rtx arg)
+{
+ if (!CONST_INT_P (arg) || arg == const0_rtx)
+ {
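+ /* arg + 0xff sets the carry flag exactly when arg is nonzero
+ (and leaves it clear for const0_rtx). */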
+ arg = convert_to_mode (QImode, arg, 1);
+ arg = copy_to_mode_reg (QImode, arg);
+ emit_insn (gen_addqi3_cconly_overflow (arg, constm1_rtx));
+ }
+ else
+ emit_insn (gen_x86_stc ());
+}
+
/* Expand an expression EXP that calls a built-in function,
with result going to TARGET if that's convenient
(and in mode MODE if that's convenient).
arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
op1 = expand_normal (arg0);
- if (!integer_zerop (arg0))
- op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
op2 = expand_normal (arg1);
if (!register_operand (op2, mode0))
}
op0 = gen_reg_rtx (mode0);
- if (integer_zerop (arg0))
+ if (op1 == const0_rtx)
{
/* If arg0 is 0, optimize right away into add or sub
instruction that sets CCCmode flags. */
else
{
/* Generate CF from input operand. */
- emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
+ ix86_expand_carry (op1);
/* Generate instruction that consumes CF. */
op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
{
tmp1 = force_reg (GET_MODE_INNER (mode), val);
tmp2 = gen_reg_rtx (mode);
- emit_insn (maybe_gen_vec_set_0 (mode, tmp2,
- CONST0_RTX (mode), tmp1));
+ emit_insn (gen_vec_set_0 (mode, tmp2, CONST0_RTX (mode), tmp1));
tmp1 = gen_lowpart (mode, tmp2);
}
else
emit_move_insn (target, gen_lowpart (mode, words[0]));
else if (n_words == 2)
{
- rtx tmp = gen_reg_rtx (mode);
- emit_clobber (tmp);
- emit_move_insn (gen_lowpart (tmp_mode, tmp), words[0]);
- emit_move_insn (gen_highpart (tmp_mode, tmp), words[1]);
- emit_move_insn (target, tmp);
+ gcc_assert (tmp_mode == DImode || tmp_mode == SImode);
+ machine_mode concat_mode = tmp_mode == DImode ? V2DImode : V2SImode;
+ rtx tmp = gen_reg_rtx (concat_mode);
+ vals = gen_rtx_PARALLEL (concat_mode, gen_rtvec_v (2, words));
+ ix86_expand_vector_init_general (mmx_ok, concat_mode, tmp, vals);
+ emit_move_insn (target, gen_lowpart (mode, tmp));
}
else if (n_words == 4)
{
? gen_reg_rtx (V16HFmode)
: gen_reg_rtx (V16BFmode));
if (elt < 16)
- emit_insn (maybe_gen_vec_extract_lo (mode, tmp, vec));
+ emit_insn (gen_vec_extract_lo (mode, tmp, vec));
else
- emit_insn (maybe_gen_vec_extract_hi (mode, tmp, vec));
+ emit_insn (gen_vec_extract_hi (mode, tmp, vec));
ix86_expand_vector_extract (false, target, tmp, elt & 15);
return;
}
? gen_reg_rtx (V8HFmode)
: gen_reg_rtx (V8BFmode));
if (elt < 8)
- emit_insn (maybe_gen_vec_extract_lo (mode, tmp, vec));
+ emit_insn (gen_vec_extract_lo (mode, tmp, vec));
else
- emit_insn (maybe_gen_vec_extract_hi (mode, tmp, vec));
+ emit_insn (gen_vec_extract_hi (mode, tmp, vec));
ix86_expand_vector_extract (false, target, tmp, elt & 7);
return;
}
mmode = VOIDmode;
}
+ /* Canonicalize vec_merge. */
+ if (swap_commutative_operands_p (op1, op0)
+ /* If the two operands have the same precedence, the first bit
+ of the mask selects the first operand. */
+ || (!swap_commutative_operands_p (op0, op1)
+ && !(mask & 1)))
+ {
+ unsigned n_elts = GET_MODE_NUNITS (vmode);
+ std::swap (op0, op1);
+ unsigned HOST_WIDE_INT mask_all = HOST_WIDE_INT_1U;
+ if (n_elts == HOST_BITS_PER_WIDE_INT)
+ mask_all = -1;
+ else
+ mask_all = (HOST_WIDE_INT_1U << n_elts) - 1;
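+ /* Swapping the vec_merge operands requires complementing the mask
+ (restricted to the low n_elts bits). */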
+ mask = ~mask & mask_all;
+ }
+
if (mmode != VOIDmode)
maskop = force_reg (mmode, gen_int_mode (mask, mmode));
else
if (d->testing_p)
return true;
- rtx (*maybe_gen) (machine_mode, int, rtx, rtx, rtx);
+ rtx (*gen_interleave) (machine_mode, int, rtx, rtx, rtx);
if (elt >= nelt2)
{
- maybe_gen = maybe_gen_vec_interleave_high;
+ gen_interleave = gen_vec_interleave_high;
elt -= nelt2;
}
else
- maybe_gen = maybe_gen_vec_interleave_low;
+ gen_interleave = gen_vec_interleave_low;
nelt2 /= 2;
dest = gen_reg_rtx (vmode);
- emit_insn (maybe_gen (vmode, 1, dest, op0, op0));
+ emit_insn (gen_interleave (vmode, 1, dest, op0, op0));
vmode = V4SImode;
op0 = gen_lowpart (vmode, dest);
gcc_assert (ok);
}
-/* This function is similar as ix86_expand_vecop_qihi,
- but optimized under AVX512BW by using vpmovwb.
- For example, optimize vector MUL generation like
-
- vpmovzxbw ymm2, xmm0
- vpmovzxbw ymm3, xmm1
- vpmullw ymm4, ymm2, ymm3
- vpmovwb xmm0, ymm4
-
- it would take less instructions than ix86_expand_vecop_qihi.
- Return true if success. */
-
-static bool
-ix86_expand_vecop_qihi2 (enum rtx_code code, rtx dest, rtx op1, rtx op2)
-{
- machine_mode himode, qimode = GET_MODE (dest);
- rtx hop1, hop2, hdest;
- rtx (*gen_truncate)(rtx, rtx);
- bool uns_p = (code == ASHIFTRT) ? false : true;
-
- /* There are no V64HImode instructions. */
- if (qimode == V64QImode)
- return false;
-
- /* vpmovwb only available under AVX512BW. */
- if (!TARGET_AVX512BW)
- return false;
- if ((qimode == V8QImode || qimode == V16QImode)
- && !TARGET_AVX512VL)
- return false;
- /* Do not generate ymm/zmm instructions when
- target prefers 128/256 bit vector width. */
- if ((qimode == V16QImode && TARGET_PREFER_AVX128)
- || (qimode == V32QImode && TARGET_PREFER_AVX256))
- return false;
-
- switch (qimode)
- {
- case E_V8QImode:
- himode = V8HImode;
- gen_truncate = gen_truncv8hiv8qi2;
- break;
- case E_V16QImode:
- himode = V16HImode;
- gen_truncate = gen_truncv16hiv16qi2;
- break;
- case E_V32QImode:
- himode = V32HImode;
- gen_truncate = gen_truncv32hiv32qi2;
- break;
- default:
- gcc_unreachable ();
- }
-
- hop1 = gen_reg_rtx (himode);
- hop2 = gen_reg_rtx (himode);
- hdest = gen_reg_rtx (himode);
- emit_insn (gen_extend_insn (hop1, op1, himode, qimode, uns_p));
- emit_insn (gen_extend_insn (hop2, op2, himode, qimode, uns_p));
- emit_insn (gen_rtx_SET (hdest, simplify_gen_binary (code, himode,
- hop1, hop2)));
- emit_insn (gen_truncate (dest, hdest));
- return true;
-}
-
/* Expand a vector operation shift by constant for a V*QImode in terms of the
same operation on V*HImode. Return true if success. */
static bool
ix86_expand_vecop_qihi_partial (enum rtx_code code, rtx dest, rtx op1, rtx op2)
{
machine_mode qimode = GET_MODE (dest);
- rtx qop1, qop2, hop1, hop2, qdest, hres;
+ rtx qop1, qop2, hop1, hop2, qdest, hdest;
bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
- bool uns_p = true;
+ bool uns_p = code != ASHIFTRT;
switch (qimode)
{
else
qop2 = op2;
+ qdest = gen_reg_rtx (V16QImode);
+
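+ /* Constant shift counts can sometimes be handled directly by
+ ix86_expand_vec_shift_qihi_constant; try that before the generic
+ unpack/pack sequence below. */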
+ if (CONST_INT_P (op2)
+ && (code == ASHIFT || code == LSHIFTRT || code == ASHIFTRT)
+ && ix86_expand_vec_shift_qihi_constant (code, qdest, qop1, qop2))
+ {
+ emit_move_insn (dest, gen_lowpart (qimode, qdest));
+ return;
+ }
+
switch (code)
{
case MULT:
gcc_assert (op2vec);
- /* Unpack data such that we've got a source byte in each low byte of
- each word. We don't care what goes into the high byte of each word.
- Rather than trying to get zero in there, most convenient is to let
- it be a copy of the low byte. */
- hop1 = copy_to_reg (qop1);
- hop2 = copy_to_reg (qop2);
- emit_insn (gen_vec_interleave_lowv16qi (hop1, hop1, hop1));
- emit_insn (gen_vec_interleave_lowv16qi (hop2, hop2, hop2));
- break;
-
- case ASHIFTRT:
- uns_p = false;
+ if (!TARGET_SSE4_1)
+ {
+ /* Unpack data such that we've got a source byte in each low byte
+ of each word. We don't care what goes into the high byte of
+ each word. Rather than trying to get zero in there, most
+ convenient is to let it be a copy of the low byte. */
+ hop1 = copy_to_reg (qop1);
+ hop2 = copy_to_reg (qop2);
+ emit_insn (gen_vec_interleave_lowv16qi (hop1, hop1, hop1));
+ emit_insn (gen_vec_interleave_lowv16qi (hop2, hop2, hop2));
+ break;
+ }
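+ /* With SSE4.1 the multiply operands are instead widened like the
+ shift cases below. */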
/* FALLTHRU */
case ASHIFT:
+ case ASHIFTRT:
case LSHIFTRT:
hop1 = gen_reg_rtx (V8HImode);
ix86_expand_sse_unpack (hop1, qop1, uns_p, false);
- /* vashr/vlshr/vashl */
+ /* mult/vashr/vlshr/vashl */
if (op2vec)
{
hop2 = gen_reg_rtx (V8HImode);
if (code != MULT && op2vec)
{
/* Expand vashr/vlshr/vashl. */
- hres = gen_reg_rtx (V8HImode);
- emit_insn (gen_rtx_SET (hres,
+ hdest = gen_reg_rtx (V8HImode);
+ emit_insn (gen_rtx_SET (hdest,
simplify_gen_binary (code, V8HImode,
hop1, hop2)));
}
else
/* Expand mult/ashr/lshr/ashl. */
- hres = expand_simple_binop (V8HImode, code, hop1, hop2,
+ hdest = expand_simple_binop (V8HImode, code, hop1, hop2,
NULL_RTX, 1, OPTAB_DIRECT);
if (TARGET_AVX512BW && TARGET_AVX512VL)
else
qdest = gen_reg_rtx (V8QImode);
- emit_insn (gen_truncv8hiv8qi2 (qdest, hres));
+ emit_insn (gen_truncv8hiv8qi2 (qdest, hdest));
}
else
{
struct expand_vec_perm_d d;
- rtx qres = gen_lowpart (V16QImode, hres);
+ rtx qres = gen_lowpart (V16QImode, hdest);
bool ok;
int i;
- qdest = gen_reg_rtx (V16QImode);
-
/* Merge the data back into the right place. */
d.target = qdest;
- d.op0 = qres;
- d.op1 = qres;
+ d.op0 = d.op1 = qres;
d.vmode = V16QImode;
d.nelt = 16;
d.one_operand_p = false;
emit_move_insn (dest, gen_lowpart (qimode, qdest));
}
+/* Emit the operation in a 2x wider mode. For example, optimize
+ vector MUL generation like
+
+ vpmovzxbw ymm2, xmm0
+ vpmovzxbw ymm3, xmm1
+ vpmullw ymm4, ymm2, ymm3
+ vpmovwb xmm0, ymm4
+
+ which takes fewer instructions than ix86_expand_vecop_qihi.
+ Return true if success. */
+
+static bool
+ix86_expand_vecop_qihi2 (enum rtx_code code, rtx dest, rtx op1, rtx op2)
+{
+ machine_mode himode, qimode = GET_MODE (dest);
+ machine_mode wqimode;
+ rtx qop1, qop2, hop1, hop2, hdest;
+ rtx (*gen_truncate)(rtx, rtx) = NULL;
+ bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
+ bool uns_p = code != ASHIFTRT;
+
+ if ((qimode == V16QImode && !TARGET_AVX2)
+ || (qimode == V32QImode && !TARGET_AVX512BW)
+ /* There are no V64HImode instructions. */
+ || qimode == V64QImode)
+ return false;
+
+ /* Do not generate ymm/zmm instructions when
+ target prefers 128/256 bit vector width. */
+ if ((qimode == V16QImode && TARGET_PREFER_AVX128)
+ || (qimode == V32QImode && TARGET_PREFER_AVX256))
+ return false;
+
+ switch (qimode)
+ {
+ case E_V16QImode:
+ himode = V16HImode;
+ if (TARGET_AVX512VL && TARGET_AVX512BW)
+ gen_truncate = gen_truncv16hiv16qi2;
+ break;
+ case E_V32QImode:
+ himode = V32HImode;
+ gen_truncate = gen_truncv32hiv32qi2;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ wqimode = GET_MODE_2XWIDER_MODE (qimode).require ();
+ qop1 = lowpart_subreg (wqimode, force_reg (qimode, op1), qimode);
+
+ if (op2vec)
+ qop2 = lowpart_subreg (wqimode, force_reg (qimode, op2), qimode);
+ else
+ qop2 = op2;
+
+ hop1 = gen_reg_rtx (himode);
+ ix86_expand_sse_unpack (hop1, qop1, uns_p, false);
+
+ if (op2vec)
+ {
+ hop2 = gen_reg_rtx (himode);
+ ix86_expand_sse_unpack (hop2, qop2, uns_p, false);
+ }
+ else
+ hop2 = qop2;
+
+ if (code != MULT && op2vec)
+ {
+ /* Expand vashr/vlshr/vashl. */
+ hdest = gen_reg_rtx (himode);
+ emit_insn (gen_rtx_SET (hdest,
+ simplify_gen_binary (code, himode,
+ hop1, hop2)));
+ }
+ else
+ /* Expand mult/ashr/lshr/ashl. */
+ hdest = expand_simple_binop (himode, code, hop1, hop2,
+ NULL_RTX, 1, OPTAB_DIRECT);
+
+ if (gen_truncate)
+ emit_insn (gen_truncate (dest, hdest));
+ else
+ {
+ struct expand_vec_perm_d d;
+ rtx wqdest = gen_reg_rtx (wqimode);
+ rtx wqres = gen_lowpart (wqimode, hdest);
+ bool ok;
+ int i;
+
+ /* Merge the data back into the right place. */
+ d.target = wqdest;
+ d.op0 = d.op1 = wqres;
+ d.vmode = wqimode;
+ d.nelt = GET_MODE_NUNITS (wqimode);
+ d.one_operand_p = false;
+ d.testing_p = false;
+
+ for (i = 0; i < d.nelt; ++i)
+ d.perm[i] = i * 2;
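+ /* The even byte positions are the low bytes of each HImode result,
+ i.e. the truncated values. */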
+
+ ok = ix86_expand_vec_perm_const_1 (&d);
+ gcc_assert (ok);
+
+ emit_move_insn (dest, gen_lowpart (qimode, wqdest));
+ }
+
+ return true;
+}
+
/* Expand a vector operation CODE for a V*QImode in terms of the
same operation on V*HImode. */
bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
struct expand_vec_perm_d d;
bool full_interleave = true;
- bool uns_p = true;
+ bool uns_p = code != ASHIFTRT;
bool ok;
int i;
&& ix86_expand_vec_shift_qihi_constant (code, dest, op1, op2))
return;
- if (TARGET_AVX512BW
- && VECTOR_MODE_P (GET_MODE (op2))
- && ix86_expand_vecop_qihi2 (code, dest, op1, op2))
+ if (ix86_expand_vecop_qihi2 (code, dest, op1, op2))
return;
switch (qimode)
emit_insn (gen_ih (op1_h, op1, op1));
break;
- case ASHIFTRT:
- uns_p = false;
- /* FALLTHRU */
case ASHIFT:
+ case ASHIFTRT:
case LSHIFTRT:
op1_l = gen_reg_rtx (himode);
op1_h = gen_reg_rtx (himode);