#include "pass_manager.h"
#include "target-globals.h"
#include "gimple-iterator.h"
-#include "tree-vectorizer.h"
#include "shrink-wrap.h"
#include "builtins.h"
#include "rtl-iter.h"
default:
break;
+
+ case SUBREG:
+ /* Transform TImode paradoxical SUBREG into zero_extendditi2. */
+ if (TARGET_64BIT
+ && mode == TImode
+ && SUBREG_P (op1)
+ && GET_MODE (SUBREG_REG (op1)) == DImode
+ && SUBREG_BYTE (op1) == 0)
+ op1 = gen_rtx_ZERO_EXTEND (TImode, SUBREG_REG (op1));
+ break;
}
if ((flag_pic || MACHOPIC_INDIRECT)
return;
}
}
- else if (GET_MODE_SIZE (mode) >= 16)
+ else if (CONST_WIDE_INT_P (op1)
+ && GET_MODE_SIZE (mode) >= 16)
{
rtx tmp = ix86_convert_const_wide_int_to_broadcast
(GET_MODE (op0), op1);
}
}
+ /* Special case inserting 64-bit values into a TImode register. */
+ if (TARGET_64BIT
+ /* Disable for -O0 (see PR110587) unless naked (PR110533). */
+ && (optimize || ix86_function_naked (current_function_decl))
+ && (mode == DImode || mode == DFmode)
+ && SUBREG_P (op0)
+ && GET_MODE (SUBREG_REG (op0)) == TImode
+ && REG_P (SUBREG_REG (op0))
+ && REG_P (op1))
+ {
+ /* Use *insvti_lowpart_1 to set lowpart. */
+ if (SUBREG_BYTE (op0) == 0)
+ {
+ wide_int mask = wi::mask (64, true, 128);
+ rtx tmp = immed_wide_int_const (mask, TImode);
+ op0 = SUBREG_REG (op0);
+ tmp = gen_rtx_AND (TImode, copy_rtx (op0), tmp);
+ if (mode == DFmode)
+ op1 = gen_lowpart (DImode, op1);
+ op1 = gen_rtx_ZERO_EXTEND (TImode, op1);
+ op1 = gen_rtx_IOR (TImode, tmp, op1);
+ }
+ /* Use *insvti_highpart_1 to set highpart. */
+ else if (SUBREG_BYTE (op0) == 8)
+ {
+ wide_int mask = wi::mask (64, false, 128);
+ rtx tmp = immed_wide_int_const (mask, TImode);
+ op0 = SUBREG_REG (op0);
+ tmp = gen_rtx_AND (TImode, copy_rtx (op0), tmp);
+ if (mode == DFmode)
+ op1 = gen_lowpart (DImode, op1);
+ op1 = gen_rtx_ZERO_EXTEND (TImode, op1);
+ op1 = gen_rtx_ASHIFT (TImode, op1, GEN_INT (64));
+ op1 = gen_rtx_IOR (TImode, tmp, op1);
+ }
+ }
+
emit_insn (gen_rtx_SET (op0, op1));
}
return;
}
- /* Special case TImode to V1TImode conversions, via V2DI. */
- if (mode == V1TImode
+ /* Special case TImode to 128-bit vector conversions via V2DI. */
+ if (VECTOR_MODE_P (mode)
+ && GET_MODE_SIZE (mode) == 16
&& SUBREG_P (op1)
&& GET_MODE (SUBREG_REG (op1)) == TImode
&& TARGET_64BIT && TARGET_SSE
emit_move_insn (lo, gen_lowpart (DImode, SUBREG_REG (op1)));
emit_move_insn (hi, gen_highpart (DImode, SUBREG_REG (op1)));
emit_insn (gen_vec_concatv2di (tmp, lo, hi));
- emit_move_insn (op0, gen_lowpart (V1TImode, tmp));
+ emit_move_insn (op0, gen_lowpart (mode, tmp));
return;
}
rtx op0 = operands[0];
rtx op1 = operands[1];
rtx op2 = operands[2];
+ rtx src;
machine_mode dmode = GET_MODE (op0);
machine_mode smode = GET_MODE (op1);
op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));
- op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
- op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
- rtx insn = gen_rtx_SET (dest, gen_rtx_VEC_CONCAT (sse_dmode,
- op1, op2));
- emit_insn (insn);
+	  /* packusdw/packuswb does unsigned saturation of a signed source
+	     which is different from generic us_truncate RTX.  */
+ if (code == US_TRUNCATE)
+ src = gen_rtx_UNSPEC (sse_dmode,
+ gen_rtvec (2, op1, op2),
+ UNSPEC_US_TRUNCATE);
+ else
+ {
+ op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
+ op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
+ src = gen_rtx_VEC_CONCAT (sse_dmode, op1, op2);
+ }
+
+ emit_move_insn (dest, src);
ix86_move_vector_high_sse_to_mmx (op0);
}
switch (mode)
{
- case E_V4QImode:
case E_V8QImode:
+ case E_V4QImode:
+ case E_V2QImode:
sse_mode = V16QImode;
double_sse_mode = V32QImode;
mask = gen_rtx_PARALLEL (VOIDmode,
else
dest = NULL_RTX;
op1 = lowpart_subreg (vmode, force_reg (mode, operands[2]), mode);
- mask = ix86_build_signbit_mask (vmode, 0, 0);
+ mask = ix86_build_signbit_mask (vmode, TARGET_AVX512F && mode != HFmode, 0);
if (CONST_DOUBLE_P (operands[1]))
{
/* Handle special case - vector comparison with boolean result, transform
it using ptest instruction. */
if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
+ || (mode == TImode && !TARGET_64BIT)
|| mode == OImode)
{
rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
gcc_assert (code == EQ || code == NE);
- if (mode == OImode)
+ if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
{
op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
tmp = gen_reg_rtx (mode);
emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
tmp = gen_lowpart (p_mode, tmp);
- emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
- gen_rtx_UNSPEC (CCmode,
+ emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
+ gen_rtx_UNSPEC (CCZmode,
gen_rtvec (2, tmp, tmp),
UNSPEC_PTEST)));
tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
cmpmode = SELECT_CC_MODE (code, op0, op1);
flags = gen_rtx_REG (cmpmode, FLAGS_REG);
+ /* Attempt to use PTEST, if available, when testing vector modes for
+ equality/inequality against zero. */
+ if (op1 == const0_rtx
+ && SUBREG_P (op0)
+ && cmpmode == CCZmode
+ && SUBREG_BYTE (op0) == 0
+ && REG_P (SUBREG_REG (op0))
+ && VECTOR_MODE_P (GET_MODE (SUBREG_REG (op0)))
+ && TARGET_SSE4_1
+ && GET_MODE (op0) == TImode
+ && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op0))) == 16)
+ {
+ tmp = SUBREG_REG (op0);
+ tmp = gen_rtx_UNSPEC (CCZmode, gen_rtvec (2, tmp, tmp), UNSPEC_PTEST);
+ }
+ else
+ tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
+
/* This is very simple, but making the interface the same as in the
FP case makes the rest of the code easier. */
- tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
emit_insn (gen_rtx_SET (flags, tmp));
/* Return the test that should be put into the flags user, i.e.
}
}
-/* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is
+/* Extend SRC into next wider integer vector type. UNSIGNED_P is
+ true if we should do zero extension, else sign extension. */
+
+void
+ix86_expand_sse_extend (rtx dest, rtx src, bool unsigned_p)
+{
+  machine_mode imode = GET_MODE (src);
+  rtx ops[3];
+
+  /* Only the partial-vector QI/HI/SI modes handled by the MMX-with-SSE
+     punpck splitter are supported here.  */
+  switch (imode)
+    {
+    case E_V8QImode:
+    case E_V4QImode:
+    case E_V2QImode:
+    case E_V4HImode:
+    case E_V2HImode:
+    case E_V2SImode:
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  /* ops[0] receives the interleaved (widened) result.  */
+  ops[0] = gen_reg_rtx (imode);
+
+  /* ops[1] is the low-part source being extended.  */
+  ops[1] = force_reg (imode, src);
+
+  /* ops[2] supplies the high half of each widened element: zero for
+     zero extension, otherwise a per-element sign mask computed as
+     (0 > ops[1]), i.e. all-ones where the source element is negative.  */
+  if (unsigned_p)
+    ops[2] = force_reg (imode, CONST0_RTX (imode));
+  else
+    ops[2] = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
+				  ops[1], pc_rtx, pc_rtx);
+
+  /* Interleave low elements of ops[1] with ops[2] to form the widened
+     vector, then copy it into DEST's mode.  */
+  ix86_split_mmx_punpck (ops, false);
+  emit_move_insn (dest, lowpart_subreg (GET_MODE (dest), ops[0], imode));
+}
+
+/* Unpack SRC into the next wider integer vector type. UNSIGNED_P is
true if we should do zero extension, else sign extension. HIGH_P is
true if we want the N/2 high elements, else the low elements. */
{
if (alg == no_stringop)
return false;
+ /* It is not possible to use a library call if we have non-default
+ address space. We can do better than the generic byte-at-a-time
+ loop, used as a fallback. */
+ if (alg == libcall && have_as)
+ return false;
if (alg == vector_loop)
return TARGET_SSE || TARGET_AVX;
/* Algorithms using the rep prefix want at least edi and ecx;
gcc_assert (alg != libcall);
return alg;
}
+
+ /* Try to use some reasonable fallback algorithm. Note that for
+ non-default address spaces we default to a loop instead of
+ a libcall. */
return (alg_usable_p (algs->unknown_size, memset, have_as)
- ? algs->unknown_size : libcall);
+ ? algs->unknown_size : have_as ? loop : libcall);
}
/* Decide on alignment. We know that the operand is already aligned to ALIGN
machine_mode mode1 = insn_data[d->icode].operand[1].mode;
enum rtx_code comparison = d->comparison;
+ /* ptest reg, reg sets the carry flag. */
+ if (comparison == LTU
+ && (d->code == IX86_BUILTIN_PTESTC
+ || d->code == IX86_BUILTIN_PTESTC256)
+ && rtx_equal_p (op0, op1))
+ {
+ if (!target)
+ target = gen_reg_rtx (SImode);
+ emit_move_insn (target, const1_rtx);
+ return target;
+ }
+
if (VECTOR_MODE_P (mode0))
op0 = safe_vector_operand (op0, mode0);
if (VECTOR_MODE_P (mode1))
case V4SF_FTYPE_V4SF_UINT:
case V4SF_FTYPE_V4SF_DI:
case V4SF_FTYPE_V4SF_SI:
+ case V4DI_FTYPE_V4DI_V2DI:
case V2DI_FTYPE_V2DI_V2DI:
case V2DI_FTYPE_V16QI_V16QI:
case V2DI_FTYPE_V4SI_V4SI:
case V8HI_FTYPE_V8DI_V8HI_UQI:
case V8SI_FTYPE_V8DI_V8SI_UQI:
case V4SI_FTYPE_V4SI_V4SI_V4SI:
+ case V4DI_FTYPE_V4DI_V4DI_V2DI:
case V16SI_FTYPE_V16SI_V16SI_V16SI:
case V8DI_FTYPE_V8DI_V8DI_V8DI:
case V32HI_FTYPE_V32HI_V32HI_V32HI:
case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
+ case V4SI_FTYPE_V4SI_V4SI_V4SI_INT:
nargs = 4;
nargs_constant = 1;
break;
HOST_WIDE_INT isa2 = ix86_isa_flags2;
HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
+ HOST_WIDE_INT tmp_isa = isa, tmp_isa2 = isa2;
/* The general case is we require all the ISAs specified in bisa{,2}
to be enabled.
The exceptions are:
OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
(OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL) or
OPTION_MASK_ISA2_AVXVNNI
- (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512IFMA) or
+ (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL) or
OPTION_MASK_ISA2_AVXIFMA
- (OPTION_MASK_ISA_AVXNECONVERT | OPTION_MASK_ISA2_AVX512BF16) or
+ (OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA2_AVX512BF16) or
OPTION_MASK_ISA2_AVXNECONVERT
+ OPTION_MASK_ISA_AES or (OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA2_VAES)
where for each such pair it is sufficient if either of the ISAs is
enabled, plus if it is ored with other options also those others.
OPTION_MASK_ISA_MMX in bisa is satisfied also if TARGET_MMX_WITH_SSE. */
- if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
- == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
- && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0)
- isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A);
-
- if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
- == (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
- && (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0)
- isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32);
-
- if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
- == (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
- && (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0)
- isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4);
-
- if ((((bisa & (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
- == (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
- || (bisa2 & OPTION_MASK_ISA2_AVXVNNI) != 0)
- && (((isa & (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
- == (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
- || (isa2 & OPTION_MASK_ISA2_AVXVNNI) != 0))
- {
- isa |= OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL;
- isa2 |= OPTION_MASK_ISA2_AVXVNNI;
- }
-
- if ((((bisa & (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL))
- == (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL))
- || (bisa2 & OPTION_MASK_ISA2_AVXIFMA) != 0)
- && (((isa & (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL))
- == (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL))
- || (isa2 & OPTION_MASK_ISA2_AVXIFMA) != 0))
- {
- isa |= OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL;
- isa2 |= OPTION_MASK_ISA2_AVXIFMA;
- }
- if ((((bisa & OPTION_MASK_ISA_AVX512VL) != 0
- && (bisa2 & OPTION_MASK_ISA2_AVX512BF16) != 0)
- && (bisa2 & OPTION_MASK_ISA2_AVXNECONVERT) != 0)
- && (((isa & OPTION_MASK_ISA_AVX512VL) != 0
- && (isa2 & OPTION_MASK_ISA2_AVX512BF16) != 0)
- || (isa2 & OPTION_MASK_ISA2_AVXNECONVERT) != 0))
- {
- isa |= OPTION_MASK_ISA_AVX512VL;
- isa2 |= OPTION_MASK_ISA2_AVXNECONVERT | OPTION_MASK_ISA2_AVX512BF16;
- }
+#define SHARE_BUILTIN(A1, A2, B1, B2) \
+ if ((((bisa & (A1)) == (A1) && (bisa2 & (A2)) == (A2)) \
+ && ((bisa & (B1)) == (B1) && (bisa2 & (B2)) == (B2))) \
+ && (((isa & (A1)) == (A1) && (isa2 & (A2)) == (A2)) \
+ || ((isa & (B1)) == (B1) && (isa2 & (B2)) == (B2)))) \
+ { \
+ tmp_isa |= (A1) | (B1); \
+ tmp_isa2 |= (A2) | (B2); \
+ }
+
+ SHARE_BUILTIN (OPTION_MASK_ISA_SSE, 0, OPTION_MASK_ISA_3DNOW_A, 0);
+ SHARE_BUILTIN (OPTION_MASK_ISA_SSE4_2, 0, OPTION_MASK_ISA_CRC32, 0);
+ SHARE_BUILTIN (OPTION_MASK_ISA_FMA, 0, OPTION_MASK_ISA_FMA4, 0);
+ SHARE_BUILTIN (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, 0,
+ OPTION_MASK_ISA2_AVXVNNI);
+ SHARE_BUILTIN (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL, 0, 0,
+ OPTION_MASK_ISA2_AVXIFMA);
+ SHARE_BUILTIN (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512BF16, 0,
+ OPTION_MASK_ISA2_AVXNECONVERT);
+ SHARE_BUILTIN (OPTION_MASK_ISA_AES, 0, OPTION_MASK_ISA_AVX512VL,
+ OPTION_MASK_ISA2_VAES);
+ isa = tmp_isa;
+ isa2 = tmp_isa2;
if ((bisa & OPTION_MASK_ISA_MMX) && !TARGET_MMX && TARGET_MMX_WITH_SSE
/* __builtin_ia32_maskmovq requires MMX registers. */
return (bisa & isa) == bisa && (bisa2 & isa2) == bisa2;
}
+/* Emit instructions to set the carry flag from ARG. */
+
+void
+ix86_expand_carry (rtx arg)
+{
+  if (!CONST_INT_P (arg) || arg == const0_rtx)
+    {
+      /* Non-constant (or zero) ARG: widen it to QImode and add -1,
+	 which sets the carry flag exactly when ARG was nonzero.  */
+      arg = convert_to_mode (QImode, arg, 1);
+      arg = copy_to_mode_reg (QImode, arg);
+      emit_insn (gen_addqi3_cconly_overflow (arg, constm1_rtx));
+    }
+  else
+    /* ARG is a known nonzero constant: set the carry flag directly
+       with the stc instruction.  */
+    emit_insn (gen_x86_stc ());
+}
+
/* Expand an expression EXP that calls a built-in function,
with result going to TARGET if that's convenient
(and in mode MODE if that's convenient).
arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
op1 = expand_normal (arg0);
- if (!integer_zerop (arg0))
- op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
op2 = expand_normal (arg1);
if (!register_operand (op2, mode0))
}
op0 = gen_reg_rtx (mode0);
- if (integer_zerop (arg0))
+ if (op1 == const0_rtx)
{
/* If arg0 is 0, optimize right away into add or sub
instruction that sets CCCmode flags. */
else
{
/* Generate CF from input operand. */
- emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
+ ix86_expand_carry (op1);
/* Generate instruction that consumes CF. */
op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
op0 = pc_rtx;
}
else if (TREE_CODE (arg3) == SSA_NAME
- && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
+ && VECTOR_TYPE_P (TREE_TYPE (arg3)))
{
/* Recognize also when mask is like:
__v2df src = _mm_setzero_pd ();
{
tmp1 = force_reg (GET_MODE_INNER (mode), val);
tmp2 = gen_reg_rtx (mode);
- emit_insn (maybe_gen_vec_set_0 (mode, tmp2,
- CONST0_RTX (mode), tmp1));
+ emit_insn (gen_vec_set_0 (mode, tmp2, CONST0_RTX (mode), tmp1));
tmp1 = gen_lowpart (mode, tmp2);
}
else
emit_move_insn (target, gen_lowpart (mode, words[0]));
else if (n_words == 2)
{
- rtx tmp = gen_reg_rtx (mode);
- emit_clobber (tmp);
- emit_move_insn (gen_lowpart (tmp_mode, tmp), words[0]);
- emit_move_insn (gen_highpart (tmp_mode, tmp), words[1]);
- emit_move_insn (target, tmp);
+ gcc_assert (tmp_mode == DImode || tmp_mode == SImode);
+ machine_mode concat_mode = tmp_mode == DImode ? V2DImode : V2SImode;
+ rtx tmp = gen_reg_rtx (concat_mode);
+ vals = gen_rtx_PARALLEL (concat_mode, gen_rtvec_v (2, words));
+ ix86_expand_vector_init_general (mmx_ok, concat_mode, tmp, vals);
+ emit_move_insn (target, gen_lowpart (mode, tmp));
}
else if (n_words == 4)
{
? gen_reg_rtx (V16HFmode)
: gen_reg_rtx (V16BFmode));
if (elt < 16)
- emit_insn (maybe_gen_vec_extract_lo (mode, tmp, vec));
+ emit_insn (gen_vec_extract_lo (mode, tmp, vec));
else
- emit_insn (maybe_gen_vec_extract_hi (mode, tmp, vec));
+ emit_insn (gen_vec_extract_hi (mode, tmp, vec));
ix86_expand_vector_extract (false, target, tmp, elt & 15);
return;
}
? gen_reg_rtx (V8HFmode)
: gen_reg_rtx (V8BFmode));
if (elt < 8)
- emit_insn (maybe_gen_vec_extract_lo (mode, tmp, vec));
+ emit_insn (gen_vec_extract_lo (mode, tmp, vec));
else
- emit_insn (maybe_gen_vec_extract_hi (mode, tmp, vec));
+ emit_insn (gen_vec_extract_hi (mode, tmp, vec));
ix86_expand_vector_extract (false, target, tmp, elt & 7);
return;
}
return true;
}
+/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
+   using insertps.  */
+static bool
+expand_vec_perm_insertps (struct expand_vec_perm_d *d)
+{
+  machine_mode vmode = d->vmode;
+  unsigned i, cnt_s, nelt = d->nelt;
+  int cnt_d = -1;
+  rtx src, dst;
+
+  /* insertps merges two operands; a one-operand permute is handled
+     elsewhere.  */
+  if (d->one_operand_p)
+    return false;
+
+  /* Requires SSE4.1 and a 4-element 32-bit vector mode (or a 2-element
+     one when MMX operations are done in SSE registers).  */
+  if (!(TARGET_SSE4_1
+	&& (vmode == V4SFmode || vmode == V4SImode
+	    || (TARGET_MMX_WITH_SSE
+		&& (vmode == V2SFmode || vmode == V2SImode)))))
+    return false;
+
+  /* Check whether the permutation is the identity on the first operand
+     except for exactly one element; CNT_D records its position.  */
+  for (i = 0; i < nelt; ++i)
+    {
+      if (d->perm[i] == i)
+	continue;
+      if (cnt_d != -1)
+	{
+	  /* A second mismatching element: not an insertps pattern
+	     relative to op0.  */
+	  cnt_d = -1;
+	  break;
+	}
+      cnt_d = i;
+    }
+
+  if (cnt_d == -1)
+    {
+      /* Otherwise check for the identity on the second operand with
+	 exactly one element replaced.  */
+      for (i = 0; i < nelt; ++i)
+	{
+	  if (d->perm[i] == i + nelt)
+	    continue;
+	  if (cnt_d != -1)
+	    return false;
+	  cnt_d = i;
+	}
+
+      if (cnt_d == -1)
+	return false;
+    }
+
+  if (d->testing_p)
+    return true;
+
+  gcc_assert (cnt_d != -1);
+
+  /* Work out which operand supplies the single inserted element (SRC)
+     and which supplies the background vector (DST); permutation indices
+     >= nelt refer to the second operand.  */
+  cnt_s = d->perm[cnt_d];
+  if (cnt_s < nelt)
+    {
+      src = d->op0;
+      dst = d->op1;
+    }
+  else
+    {
+      cnt_s -= nelt;
+      src = d->op1;
+      dst = d->op0;
+    }
+  gcc_assert (cnt_s < nelt);
+
+  /* insertps immediate: bits 7:6 select the source element, bits 5:4
+     the destination slot.  */
+  rtx x = gen_sse4_1_insertps (vmode, d->target, dst, src,
+			       GEN_INT (cnt_s << 6 | cnt_d << 4));
+  emit_insn (x);
+
+  return true;
+}
+
/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
mmode = VOIDmode;
}
+ /* Canonicalize vec_merge. */
+ if (swap_commutative_operands_p (op1, op0)
+ /* Two operands have same precedence, then
+ first bit of mask select first operand. */
+ || (!swap_commutative_operands_p (op0, op1)
+ && !(mask & 1)))
+ {
+ unsigned n_elts = GET_MODE_NUNITS (vmode);
+ std::swap (op0, op1);
+ unsigned HOST_WIDE_INT mask_all = HOST_WIDE_INT_1U;
+ if (n_elts == HOST_BITS_PER_WIDE_INT)
+ mask_all = -1;
+ else
+ mask_all = (HOST_WIDE_INT_1U << n_elts) - 1;
+ mask = ~mask & mask_all;
+ }
+
if (mmode != VOIDmode)
maskop = force_reg (mmode, gen_int_mode (mask, mmode));
else
if (expand_vec_perm_movs (d))
return true;
+ /* Try the SSE4.1 insertps instruction. */
+ if (expand_vec_perm_insertps (d))
+ return true;
+
/* Try the fully general two operand permute. */
if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
d->testing_p))
if (d->testing_p)
return true;
- rtx (*maybe_gen) (machine_mode, int, rtx, rtx, rtx);
+ rtx (*gen_interleave) (machine_mode, int, rtx, rtx, rtx);
if (elt >= nelt2)
{
- maybe_gen = maybe_gen_vec_interleave_high;
+ gen_interleave = gen_vec_interleave_high;
elt -= nelt2;
}
else
- maybe_gen = maybe_gen_vec_interleave_low;
+ gen_interleave = gen_vec_interleave_low;
nelt2 /= 2;
dest = gen_reg_rtx (vmode);
- emit_insn (maybe_gen (vmode, 1, dest, op0, op0));
+ emit_insn (gen_interleave (vmode, 1, dest, op0, op0));
vmode = V4SImode;
op0 = gen_lowpart (vmode, dest);
gcc_assert (ok);
}
-/* This function is similar as ix86_expand_vecop_qihi,
- but optimized under AVX512BW by using vpmovwb.
- For example, optimize vector MUL generation like
-
- vpmovzxbw ymm2, xmm0
- vpmovzxbw ymm3, xmm1
- vpmullw ymm4, ymm2, ymm3
- vpmovwb xmm0, ymm4
-
- it would take less instructions than ix86_expand_vecop_qihi.
- Return true if success. */
-
-static bool
-ix86_expand_vecop_qihi2 (enum rtx_code code, rtx dest, rtx op1, rtx op2)
-{
- machine_mode himode, qimode = GET_MODE (dest);
- rtx hop1, hop2, hdest;
- rtx (*gen_extend)(rtx, rtx);
- rtx (*gen_truncate)(rtx, rtx);
- bool uns_p = (code == ASHIFTRT) ? false : true;
-
- /* There's no V64HImode multiplication instruction. */
- if (qimode == E_V64QImode)
- return false;
-
- /* vpmovwb only available under AVX512BW. */
- if (!TARGET_AVX512BW)
- return false;
- if ((qimode == V8QImode || qimode == V16QImode)
- && !TARGET_AVX512VL)
- return false;
- /* Not generate zmm instruction when prefer 128/256 bit vector width. */
- if (qimode == V32QImode
- && (TARGET_PREFER_AVX128 || TARGET_PREFER_AVX256))
- return false;
-
- switch (qimode)
- {
- case E_V8QImode:
- himode = V8HImode;
- gen_extend = uns_p ? gen_zero_extendv8qiv8hi2 : gen_extendv8qiv8hi2;
- gen_truncate = gen_truncv8hiv8qi2;
- break;
- case E_V16QImode:
- himode = V16HImode;
- gen_extend = uns_p ? gen_zero_extendv16qiv16hi2 : gen_extendv16qiv16hi2;
- gen_truncate = gen_truncv16hiv16qi2;
- break;
- case E_V32QImode:
- himode = V32HImode;
- gen_extend = uns_p ? gen_zero_extendv32qiv32hi2 : gen_extendv32qiv32hi2;
- gen_truncate = gen_truncv32hiv32qi2;
- break;
- default:
- gcc_unreachable ();
- }
-
- hop1 = gen_reg_rtx (himode);
- hop2 = gen_reg_rtx (himode);
- hdest = gen_reg_rtx (himode);
- emit_insn (gen_extend (hop1, op1));
- emit_insn (gen_extend (hop2, op2));
- emit_insn (gen_rtx_SET (hdest, simplify_gen_binary (code, himode,
- hop1, hop2)));
- emit_insn (gen_truncate (dest, hdest));
- return true;
-}
-
/* Expand a vector operation shift by constant for a V*QImode in terms of the
same operation on V*HImode. Return true if success. */
static bool
return true;
}
+/* Expand a vector operation CODE on a partial QImode vector (V4QI or
+   V8QI) DEST/OP1/OP2 by widening into a full V16QI/V8HI SSE register,
+   performing the operation there, and narrowing the result back.  */
+void
+ix86_expand_vecop_qihi_partial (enum rtx_code code, rtx dest, rtx op1, rtx op2)
+{
+  machine_mode qimode = GET_MODE (dest);
+  rtx qop1, qop2, hop1, hop2, qdest, hdest;
+  /* OP2 may be a vector (e.g. variable shift counts) or a scalar.  */
+  bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
+  bool uns_p = code != ASHIFTRT;
+
+  switch (qimode)
+    {
+    case E_V4QImode:
+    case E_V8QImode:
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  /* Promote the partial vectors into full V16QImode registers.  */
+  qop1 = lowpart_subreg (V16QImode, force_reg (qimode, op1), qimode);
+
+  if (op2vec)
+    qop2 = lowpart_subreg (V16QImode, force_reg (qimode, op2), qimode);
+  else
+    qop2 = op2;
+
+  qdest = gen_reg_rtx (V16QImode);
+
+  /* A constant shift count can sometimes be done directly in QImode.  */
+  if (CONST_INT_P (op2)
+      && (code == ASHIFT || code == LSHIFTRT || code == ASHIFTRT)
+      && ix86_expand_vec_shift_qihi_constant (code, qdest, qop1, qop2))
+    {
+      emit_move_insn (dest, gen_lowpart (qimode, qdest));
+      return;
+    }
+
+  switch (code)
+    {
+    case MULT:
+      gcc_assert (op2vec);
+      if (!TARGET_SSE4_1)
+	{
+	  /* Unpack data such that we've got a source byte in each low byte
+	     of each word.  We don't care what goes into the high byte of
+	     each word.  Rather than trying to get zero in there, most
+	     convenient is to let it be a copy of the low byte.  */
+	  hop1 = copy_to_reg (qop1);
+	  hop2 = copy_to_reg (qop2);
+	  emit_insn (gen_vec_interleave_lowv16qi (hop1, hop1, hop1));
+	  emit_insn (gen_vec_interleave_lowv16qi (hop2, hop2, hop2));
+	  break;
+	}
+      /* FALLTHRU */
+    case ASHIFT:
+    case ASHIFTRT:
+    case LSHIFTRT:
+      /* Widen the operands to V8HImode (zero- or sign-extending
+	 according to UNS_P).  */
+      hop1 = gen_reg_rtx (V8HImode);
+      ix86_expand_sse_unpack (hop1, qop1, uns_p, false);
+      /* mult/vashr/vlshr/vashl */
+      if (op2vec)
+	{
+	  hop2 = gen_reg_rtx (V8HImode);
+	  ix86_expand_sse_unpack (hop2, qop2, uns_p, false);
+	}
+      else
+	hop2 = qop2;
+
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  if (code != MULT && op2vec)
+    {
+      /* Expand vashr/vlshr/vashl.  */
+      hdest = gen_reg_rtx (V8HImode);
+      emit_insn (gen_rtx_SET (hdest,
+			      simplify_gen_binary (code, V8HImode,
+						   hop1, hop2)));
+    }
+  else
+    /* Expand mult/ashr/lshr/ashl.  */
+    hdest = expand_simple_binop (V8HImode, code, hop1, hop2,
+				 NULL_RTX, 1, OPTAB_DIRECT);
+
+  if (TARGET_AVX512BW && TARGET_AVX512VL)
+    {
+      /* With AVX512BW/VL, narrow back with a single vpmovwb.  */
+      if (qimode == V8QImode)
+	qdest = dest;
+      else
+	qdest = gen_reg_rtx (V8QImode);
+
+      emit_insn (gen_truncv8hiv8qi2 (qdest, hdest));
+    }
+  else
+    {
+      /* Otherwise select the even bytes of the V8HImode result with a
+	 constant permutation.  */
+      struct expand_vec_perm_d d;
+      rtx qres = gen_lowpart (V16QImode, hdest);
+      bool ok;
+      int i;
+
+      /* Merge the data back into the right place.  */
+      d.target = qdest;
+      d.op0 = d.op1 = qres;
+      d.vmode = V16QImode;
+      d.nelt = 16;
+      d.one_operand_p = false;
+      d.testing_p = false;
+
+      for (i = 0; i < d.nelt; ++i)
+	d.perm[i] = i * 2;
+
+      ok = ix86_expand_vec_perm_const_1 (&d);
+      gcc_assert (ok);
+    }
+
+  if (qdest != dest)
+    emit_move_insn (dest, gen_lowpart (qimode, qdest));
+}
+
+/* Emit instruction in 2x wider mode. For example, optimize
+ vector MUL generation like
+
+ vpmovzxbw ymm2, xmm0
+ vpmovzxbw ymm3, xmm1
+ vpmullw ymm4, ymm2, ymm3
+ vpmovwb xmm0, ymm4
+
+ it would take less instructions than ix86_expand_vecop_qihi.
+ Return true if success. */
+
+static bool
+ix86_expand_vecop_qihi2 (enum rtx_code code, rtx dest, rtx op1, rtx op2)
+{
+  machine_mode himode, qimode = GET_MODE (dest);
+  machine_mode wqimode;
+  rtx qop1, qop2, hop1, hop2, hdest;
+  /* GEN_TRUNCATE stays NULL when no single-instruction narrowing
+     (vpmovwb) is available; a permute is used instead.  */
+  rtx (*gen_truncate)(rtx, rtx) = NULL;
+  /* OP2 may be a vector (e.g. variable shift counts) or a scalar.  */
+  bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
+  bool uns_p = code != ASHIFTRT;
+
+  if ((qimode == V16QImode && !TARGET_AVX2)
+      || (qimode == V32QImode && !TARGET_AVX512BW)
+      /* There are no V64HImode instructions.  */
+      || qimode == V64QImode)
+    return false;
+
+  /* Do not generate ymm/zmm instructions when
+     target prefers 128/256 bit vector width.  */
+  if ((qimode == V16QImode && TARGET_PREFER_AVX128)
+      || (qimode == V32QImode && TARGET_PREFER_AVX256))
+    return false;
+
+  switch (qimode)
+    {
+    case E_V16QImode:
+      himode = V16HImode;
+      /* vpmovwb on a 256-bit source needs AVX512VL as well as BW.  */
+      if (TARGET_AVX512VL && TARGET_AVX512BW)
+	gen_truncate = gen_truncv16hiv16qi2;
+      break;
+    case E_V32QImode:
+      himode = V32HImode;
+      gen_truncate = gen_truncv32hiv32qi2;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  /* Place the QImode operands in registers of twice the width so the
+     unpack below can widen them to HImode elements.  */
+  wqimode = GET_MODE_2XWIDER_MODE (qimode).require ();
+  qop1 = lowpart_subreg (wqimode, force_reg (qimode, op1), qimode);
+
+  if (op2vec)
+    qop2 = lowpart_subreg (wqimode, force_reg (qimode, op2), qimode);
+  else
+    qop2 = op2;
+
+  hop1 = gen_reg_rtx (himode);
+  ix86_expand_sse_unpack (hop1, qop1, uns_p, false);
+
+  if (op2vec)
+    {
+      hop2 = gen_reg_rtx (himode);
+      ix86_expand_sse_unpack (hop2, qop2, uns_p, false);
+    }
+  else
+    hop2 = qop2;
+
+  if (code != MULT && op2vec)
+    {
+      /* Expand vashr/vlshr/vashl.  */
+      hdest = gen_reg_rtx (himode);
+      emit_insn (gen_rtx_SET (hdest,
+			      simplify_gen_binary (code, himode,
+						   hop1, hop2)));
+    }
+  else
+    /* Expand mult/ashr/lshr/ashl.  */
+    hdest = expand_simple_binop (himode, code, hop1, hop2,
+				 NULL_RTX, 1, OPTAB_DIRECT);
+
+  if (gen_truncate)
+    emit_insn (gen_truncate (dest, hdest));
+  else
+    {
+      /* No vpmovwb: select the even bytes of the HImode result with a
+	 constant permutation instead.  */
+      struct expand_vec_perm_d d;
+      rtx wqdest = gen_reg_rtx (wqimode);
+      rtx wqres = gen_lowpart (wqimode, hdest);
+      bool ok;
+      int i;
+
+      /* Merge the data back into the right place.  */
+      d.target = wqdest;
+      d.op0 = d.op1 = wqres;
+      d.vmode = wqimode;
+      d.nelt = GET_MODE_NUNITS (wqimode);
+      d.one_operand_p = false;
+      d.testing_p = false;
+
+      for (i = 0; i < d.nelt; ++i)
+	d.perm[i] = i * 2;
+
+      ok = ix86_expand_vec_perm_const_1 (&d);
+      gcc_assert (ok);
+
+      emit_move_insn (dest, gen_lowpart (qimode, wqdest));
+    }
+
+  return true;
+}
+
/* Expand a vector operation CODE for a V*QImode in terms of the
same operation on V*HImode. */
rtx (*gen_il) (rtx, rtx, rtx);
rtx (*gen_ih) (rtx, rtx, rtx);
rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
+ bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
struct expand_vec_perm_d d;
- bool ok, full_interleave;
- bool uns_p = false;
+ bool full_interleave = true;
+ bool uns_p = code != ASHIFTRT;
+ bool ok;
int i;
if (CONST_INT_P (op2)
&& ix86_expand_vec_shift_qihi_constant (code, dest, op1, op2))
return;
- if (TARGET_AVX512BW
- && VECTOR_MODE_P (GET_MODE (op2))
- && ix86_expand_vecop_qihi2 (code, dest, op1, op2))
+ if (ix86_expand_vecop_qihi2 (code, dest, op1, op2))
return;
switch (qimode)
{
case E_V16QImode:
himode = V8HImode;
- gen_il = gen_vec_interleave_lowv16qi;
- gen_ih = gen_vec_interleave_highv16qi;
break;
case E_V32QImode:
himode = V16HImode;
- gen_il = gen_avx2_interleave_lowv32qi;
- gen_ih = gen_avx2_interleave_highv32qi;
break;
case E_V64QImode:
himode = V32HImode;
- gen_il = gen_avx512bw_interleave_lowv64qi;
- gen_ih = gen_avx512bw_interleave_highv64qi;
break;
default:
gcc_unreachable ();
switch (code)
{
case MULT:
+ gcc_assert (op2vec);
/* Unpack data such that we've got a source byte in each low byte of
each word. We don't care what goes into the high byte of each word.
Rather than trying to get zero in there, most convenient is to let
it be a copy of the low byte. */
+ switch (qimode)
+ {
+ case E_V16QImode:
+ gen_il = gen_vec_interleave_lowv16qi;
+ gen_ih = gen_vec_interleave_highv16qi;
+ break;
+ case E_V32QImode:
+ gen_il = gen_avx2_interleave_lowv32qi;
+ gen_ih = gen_avx2_interleave_highv32qi;
+ full_interleave = false;
+ break;
+ case E_V64QImode:
+ gen_il = gen_avx512bw_interleave_lowv64qi;
+ gen_ih = gen_avx512bw_interleave_highv64qi;
+ full_interleave = false;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
op2_l = gen_reg_rtx (qimode);
op2_h = gen_reg_rtx (qimode);
emit_insn (gen_il (op2_l, op2, op2));
op1_h = gen_reg_rtx (qimode);
emit_insn (gen_il (op1_l, op1, op1));
emit_insn (gen_ih (op1_h, op1, op1));
- full_interleave = qimode == V16QImode;
break;
case ASHIFT:
- case LSHIFTRT:
- uns_p = true;
- /* FALLTHRU */
case ASHIFTRT:
+ case LSHIFTRT:
op1_l = gen_reg_rtx (himode);
op1_h = gen_reg_rtx (himode);
ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
/* vashr/vlshr/vashl */
- if (GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT)
+ if (op2vec)
{
rtx tmp = force_reg (qimode, op2);
op2_l = gen_reg_rtx (himode);
else
op2_l = op2_h = op2;
- full_interleave = true;
break;
default:
gcc_unreachable ();
}
- /* Perform vashr/vlshr/vashl. */
- if (code != MULT
- && GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT)
+ if (code != MULT && op2vec)
{
+ /* Expand vashr/vlshr/vashl. */
res_l = gen_reg_rtx (himode);
res_h = gen_reg_rtx (himode);
emit_insn (gen_rtx_SET (res_l,
simplify_gen_binary (code, himode,
op1_h, op2_h)));
}
- /* Performance mult/ashr/lshr/ashl. */
else
{
+ /* Expand mult/ashr/lshr/ashl. */
res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
1, OPTAB_DIRECT);
res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
if (full_interleave)
{
- /* For SSE2, we used an full interleave, so the desired
+ /* We used the full interleave, the desired
results are in the even elements. */
for (i = 0; i < d.nelt; ++i)
d.perm[i] = i * 2;
ok = ix86_expand_vec_perm_const_1 (&d);
gcc_assert (ok);
-
- set_unique_reg_note (get_last_insn (), REG_EQUAL,
- gen_rtx_fmt_ee (code, qimode, op1, op2));
}
/* Helper function of ix86_expand_mul_widen_evenodd. Return true