i386: Improve memory copy from named address space [PR111657]

[thirdparty/gcc.git] / gcc / config / i386 / i386-expand.cc
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc

index 50d9d34ebcb68168dcf81d66e94069abaf637282..9a988347200ced7ee91c5a56dedeb5c4077a93f3 100644 (file)
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -66,7 +66,6 @@ along with GCC; see the file COPYING3.  If not see
  #include "pass_manager.h"
  #include "target-globals.h"
  #include "gimple-iterator.h"
-#include "tree-vectorizer.h"
  #include "shrink-wrap.h"
  #include "builtins.h"
  #include "rtl-iter.h"
@@ -429,6 +428,16 @@ ix86_expand_move (machine_mode mode, rtx operands[])
  
      default:
        break;
+
+    case SUBREG:
+      /* Transform TImode paradoxical SUBREG into zero_extendditi2.  */
+      if (TARGET_64BIT
+         && mode == TImode
+         && SUBREG_P (op1)
+         && GET_MODE (SUBREG_REG (op1)) == DImode
+         && SUBREG_BYTE (op1) == 0)
+       op1 = gen_rtx_ZERO_EXTEND (TImode, SUBREG_REG (op1));
+      break;
      }
  
    if ((flag_pic || MACHOPIC_INDIRECT)
@@ -521,7 +530,8 @@ ix86_expand_move (machine_mode mode, rtx operands[])
                   return;
                 }
             }
-         else if (GET_MODE_SIZE (mode) >= 16)
+         else if (CONST_WIDE_INT_P (op1)
+                  && GET_MODE_SIZE (mode) >= 16)
             {
               rtx tmp = ix86_convert_const_wide_int_to_broadcast
                 (GET_MODE (op0), op1);
@@ -531,6 +541,43 @@ ix86_expand_move (machine_mode mode, rtx operands[])
         }
      }
  
+  /* Special case inserting 64-bit values into a TImode register.  */
+  if (TARGET_64BIT
+      /* Disable for -O0 (see PR110587) unless naked (PR110533).  */
+      && (optimize || ix86_function_naked (current_function_decl))
+      && (mode == DImode || mode == DFmode)
+      && SUBREG_P (op0)
+      && GET_MODE (SUBREG_REG (op0)) == TImode
+      && REG_P (SUBREG_REG (op0))
+      && REG_P (op1))
+    {
+      /* Use *insvti_lowpart_1 to set lowpart.  */
+      if (SUBREG_BYTE (op0) == 0)
+       {
+         wide_int mask = wi::mask (64, true, 128);
+         rtx tmp = immed_wide_int_const (mask, TImode);
+         op0 = SUBREG_REG (op0);
+         tmp = gen_rtx_AND (TImode, copy_rtx (op0), tmp);
+         if (mode == DFmode)
+           op1 = gen_lowpart (DImode, op1);
+         op1 = gen_rtx_ZERO_EXTEND (TImode, op1);
+         op1 = gen_rtx_IOR (TImode, tmp, op1);
+       }
+      /* Use *insvti_highpart_1 to set highpart.  */
+      else if (SUBREG_BYTE (op0) == 8)
+       {
+         wide_int mask = wi::mask (64, false, 128);
+         rtx tmp = immed_wide_int_const (mask, TImode);
+         op0 = SUBREG_REG (op0);
+         tmp = gen_rtx_AND (TImode, copy_rtx (op0), tmp);
+         if (mode == DFmode)
+           op1 = gen_lowpart (DImode, op1);
+         op1 = gen_rtx_ZERO_EXTEND (TImode, op1);
+         op1 = gen_rtx_ASHIFT (TImode, op1, GEN_INT (64));
+         op1 = gen_rtx_IOR (TImode, tmp, op1);
+       }
+    }
+
    emit_insn (gen_rtx_SET (op0, op1));
  }
  
@@ -696,8 +743,9 @@ ix86_expand_vector_move (machine_mode mode, rtx operands[])
        return;
      }
  
-  /* Special case TImode to V1TImode conversions, via V2DI.  */
-  if (mode == V1TImode
+  /* Special case TImode to 128-bit vector conversions via V2DI.  */
+  if (VECTOR_MODE_P (mode)
+      && GET_MODE_SIZE (mode) == 16
        && SUBREG_P (op1)
        && GET_MODE (SUBREG_REG (op1)) == TImode
        && TARGET_64BIT && TARGET_SSE
@@ -709,7 +757,7 @@ ix86_expand_vector_move (machine_mode mode, rtx operands[])
        emit_move_insn (lo, gen_lowpart (DImode, SUBREG_REG (op1)));
        emit_move_insn (hi, gen_highpart (DImode, SUBREG_REG (op1)));
        emit_insn (gen_vec_concatv2di (tmp, lo, hi));
-      emit_move_insn (op0, gen_lowpart (V1TImode, tmp));
+      emit_move_insn (op0, gen_lowpart (mode, tmp));
        return;
      }
  
@@ -1019,6 +1067,7 @@ ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
    rtx op0 = operands[0];
    rtx op1 = operands[1];
    rtx op2 = operands[2];
+  rtx src;
  
    machine_mode dmode = GET_MODE (op0);
    machine_mode smode = GET_MODE (op1);
@@ -1042,11 +1091,20 @@ ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
    op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
    op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));
  
-  op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
-  op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
-  rtx insn = gen_rtx_SET (dest, gen_rtx_VEC_CONCAT (sse_dmode,
-                                                   op1, op2));
-  emit_insn (insn);
+  /* packusdw/packuswb do unsigned saturation of a signed source,
+     which is different from the generic us_truncate RTX.  */
+  if (code == US_TRUNCATE)
+    src = gen_rtx_UNSPEC (sse_dmode,
+                         gen_rtvec (2, op1, op2),
+                         UNSPEC_US_TRUNCATE);
+  else
+    {
+      op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
+      op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
+      src = gen_rtx_VEC_CONCAT (sse_dmode, op1, op2);
+    }
+
+  emit_move_insn (dest, src);
  
    ix86_move_vector_high_sse_to_mmx (op0);
  }
@@ -1066,8 +1124,9 @@ ix86_split_mmx_punpck (rtx operands[], bool high_p)
  
    switch (mode)
      {
-    case E_V4QImode:
      case E_V8QImode:
+    case E_V4QImode:
+    case E_V2QImode:
        sse_mode = V16QImode;
        double_sse_mode = V32QImode;
        mask = gen_rtx_PARALLEL (VOIDmode,
@@ -2266,7 +2325,7 @@ ix86_expand_copysign (rtx operands[])
    else
      dest = NULL_RTX;
    op1 = lowpart_subreg (vmode, force_reg (mode, operands[2]), mode);
-  mask = ix86_build_signbit_mask (vmode, 0, 0);
+  mask = ix86_build_signbit_mask (vmode, TARGET_AVX512F && mode != HFmode, 0);
  
    if (CONST_DOUBLE_P (operands[1]))
      {
@@ -2353,6 +2412,7 @@ ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
    /* Handle special case - vector comparsion with boolean result, transform
       it using ptest instruction.  */
    if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
+      || (mode == TImode && !TARGET_64BIT)
        || mode == OImode)
      {
        rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
@@ -2360,7 +2420,7 @@ ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
  
        gcc_assert (code == EQ || code == NE);
  
-      if (mode == OImode)
+      if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
         {
           op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
           op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
@@ -2370,8 +2430,8 @@ ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
        tmp = gen_reg_rtx (mode);
        emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
        tmp = gen_lowpart (p_mode, tmp);
-      emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
-                             gen_rtx_UNSPEC (CCmode,
+      emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
+                             gen_rtx_UNSPEC (CCZmode,
                                               gen_rtvec (2, tmp, tmp),
                                               UNSPEC_PTEST)));
        tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
@@ -2946,9 +3006,26 @@ ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
    cmpmode = SELECT_CC_MODE (code, op0, op1);
    flags = gen_rtx_REG (cmpmode, FLAGS_REG);
  
+  /* Attempt to use PTEST, if available, when testing vector modes for
+     equality/inequality against zero.  */
+  if (op1 == const0_rtx
+      && SUBREG_P (op0)
+      && cmpmode == CCZmode
+      && SUBREG_BYTE (op0) == 0
+      && REG_P (SUBREG_REG (op0))
+      && VECTOR_MODE_P (GET_MODE (SUBREG_REG (op0)))
+      && TARGET_SSE4_1
+      && GET_MODE (op0) == TImode
+      && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op0))) == 16)
+    {
+      tmp = SUBREG_REG (op0);
+      tmp = gen_rtx_UNSPEC (CCZmode, gen_rtvec (2, tmp, tmp), UNSPEC_PTEST);
+    }
+  else
+    tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
+
    /* This is very simple, but making the interface the same as in the
       FP case makes the rest of the code easier.  */
-  tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
    emit_insn (gen_rtx_SET (flags, tmp));
  
    /* Return the test that should be put into the flags user, i.e.
@@ -5560,7 +5637,43 @@ ix86_expand_vec_perm (rtx operands[])
      }
  }
  
-/* Unpack OP[1] into the next wider integer vector type.  UNSIGNED_P is
+/* Extend SRC into the next wider integer vector type.  UNSIGNED_P is
+   true if we should do zero extension, else sign extension.  */
+
+void
+ix86_expand_sse_extend (rtx dest, rtx src, bool unsigned_p)
+{
+  machine_mode imode = GET_MODE (src);
+  rtx ops[3];
+
+  switch (imode)
+    {
+    case E_V8QImode:
+    case E_V4QImode:
+    case E_V2QImode:
+    case E_V4HImode:
+    case E_V2HImode:
+    case E_V2SImode:
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  ops[0] = gen_reg_rtx (imode);
+
+  ops[1] = force_reg (imode, src);
+
+  if (unsigned_p)
+    ops[2] = force_reg (imode, CONST0_RTX (imode));
+  else
+    ops[2] = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
+                                 ops[1], pc_rtx, pc_rtx);
+
+  ix86_split_mmx_punpck (ops, false);
+  emit_move_insn (dest, lowpart_subreg (GET_MODE (dest), ops[0], imode));
+}
+
+/* Unpack SRC into the next wider integer vector type.  UNSIGNED_P is
     true if we should do zero extension, else sign extension.  HIGH_P is
     true if we want the N/2 high elements, else the low elements.  */
  
@@ -8207,6 +8320,11 @@ alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
  {
    if (alg == no_stringop)
      return false;
+  /* It is not possible to use a library call if we have a non-default
+     address space.  We can do better than the generic byte-at-a-time
+     loop, used as a fallback.  */
+  if (alg == libcall && have_as)
+    return false;
    if (alg == vector_loop)
      return TARGET_SSE || TARGET_AVX;
    /* Algorithms using the rep prefix want at least edi and ecx;
@@ -8381,8 +8499,12 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
         gcc_assert (alg != libcall);
        return alg;
      }
+
+  /* Try to use some reasonable fallback algorithm.  Note that for
+     non-default address spaces we default to a loop instead of
+     a libcall.  */
    return (alg_usable_p (algs->unknown_size, memset, have_as)
-         ? algs->unknown_size : libcall);
+         ? algs->unknown_size : have_as ? loop : libcall);
  }
  
  /* Decide on alignment.  We know that the operand is already aligned to ALIGN
@@ -10222,6 +10344,18 @@ ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
    machine_mode mode1 = insn_data[d->icode].operand[1].mode;
    enum rtx_code comparison = d->comparison;
  
+  /* ptest reg, reg sets the carry flag.  */
+  if (comparison == LTU
+      && (d->code == IX86_BUILTIN_PTESTC
+         || d->code == IX86_BUILTIN_PTESTC256)
+      && rtx_equal_p (op0, op1))
+    {
+      if (!target)
+       target = gen_reg_rtx (SImode);
+      emit_move_insn (target, const1_rtx);
+      return target;
+    }
+
    if (VECTOR_MODE_P (mode0))
      op0 = safe_vector_operand (op0, mode0);
    if (VECTOR_MODE_P (mode1))
@@ -10663,6 +10797,7 @@ ix86_expand_args_builtin (const struct builtin_description *d,
      case V4SF_FTYPE_V4SF_UINT:
      case V4SF_FTYPE_V4SF_DI:
      case V4SF_FTYPE_V4SF_SI:
+    case V4DI_FTYPE_V4DI_V2DI:
      case V2DI_FTYPE_V2DI_V2DI:
      case V2DI_FTYPE_V16QI_V16QI:
      case V2DI_FTYPE_V4SI_V4SI:
@@ -10960,6 +11095,7 @@ ix86_expand_args_builtin (const struct builtin_description *d,
      case V8HI_FTYPE_V8DI_V8HI_UQI:
      case V8SI_FTYPE_V8DI_V8SI_UQI:
      case V4SI_FTYPE_V4SI_V4SI_V4SI:
+    case V4DI_FTYPE_V4DI_V4DI_V2DI:
      case V16SI_FTYPE_V16SI_V16SI_V16SI:
      case V8DI_FTYPE_V8DI_V8DI_V8DI:
      case V32HI_FTYPE_V32HI_V32HI_V32HI:
@@ -11132,6 +11268,7 @@ ix86_expand_args_builtin (const struct builtin_description *d,
      case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
      case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
      case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
+    case V4SI_FTYPE_V4SI_V4SI_V4SI_INT:
        nargs = 4;
        nargs_constant = 1;
        break;
@@ -12601,6 +12738,7 @@ ix86_check_builtin_isa_match (unsigned int fcode,
         OPTION_MASK_ISA2_AVXIFMA
       (OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA2_AVX512BF16) or
         OPTION_MASK_ISA2_AVXNECONVERT
+     OPTION_MASK_ISA_AES or (OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA2_VAES)
       where for each such pair it is sufficient if either of the ISAs is
       enabled, plus if it is ored with other options also those others.
       OPTION_MASK_ISA_MMX in bisa is satisfied also if TARGET_MMX_WITH_SSE.  */
@@ -12624,7 +12762,8 @@ ix86_check_builtin_isa_match (unsigned int fcode,
                  OPTION_MASK_ISA2_AVXIFMA);
    SHARE_BUILTIN (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512BF16, 0,
                  OPTION_MASK_ISA2_AVXNECONVERT);
-  SHARE_BUILTIN (OPTION_MASK_ISA_AES, 0, 0, OPTION_MASK_ISA2_VAES);
+  SHARE_BUILTIN (OPTION_MASK_ISA_AES, 0, OPTION_MASK_ISA_AVX512VL,
+                OPTION_MASK_ISA2_VAES);
    isa = tmp_isa;
    isa2 = tmp_isa2;
  
@@ -12644,6 +12783,21 @@ ix86_check_builtin_isa_match (unsigned int fcode,
    return (bisa & isa) == bisa && (bisa2 & isa2) == bisa2;
  }
  
+/* Emit instructions to set the carry flag from ARG.  */
+
+void
+ix86_expand_carry (rtx arg)
+{
+  if (!CONST_INT_P (arg) || arg == const0_rtx)
+    {
+      arg = convert_to_mode (QImode, arg, 1);
+      arg = copy_to_mode_reg (QImode, arg);
+      emit_insn (gen_addqi3_cconly_overflow (arg, constm1_rtx));
+    }
+  else
+    emit_insn (gen_x86_stc ());
+}
+
  /* Expand an expression EXP that calls a built-in function,
     with result going to TARGET if that's convenient
     (and in mode MODE if that's convenient).
@@ -13948,8 +14102,6 @@ rdseed_step:
        arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out.  */
  
        op1 = expand_normal (arg0);
-      if (!integer_zerop (arg0))
-       op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
  
        op2 = expand_normal (arg1);
        if (!register_operand (op2, mode0))
@@ -13967,7 +14119,7 @@ rdseed_step:
         }
  
        op0 = gen_reg_rtx (mode0);
-      if (integer_zerop (arg0))
+      if (op1 == const0_rtx)
         {
           /* If arg0 is 0, optimize right away into add or sub
              instruction that sets CCCmode flags.  */
@@ -13977,7 +14129,7 @@ rdseed_step:
        else
         {
           /* Generate CF from input operand.  */
-         emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
+         ix86_expand_carry (op1);
  
           /* Generate instruction that consumes CF.  */
           op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
@@ -15371,8 +15523,7 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
             {
               tmp1 = force_reg (GET_MODE_INNER (mode), val);
               tmp2 = gen_reg_rtx (mode);
-             emit_insn (maybe_gen_vec_set_0 (mode, tmp2,
-                                             CONST0_RTX (mode), tmp1));
+             emit_insn (gen_vec_set_0 (mode, tmp2, CONST0_RTX (mode), tmp1));
               tmp1 = gen_lowpart (mode, tmp2);
             }
           else
@@ -16367,11 +16518,12 @@ quarter:
         emit_move_insn (target, gen_lowpart (mode, words[0]));
        else if (n_words == 2)
         {
-         rtx tmp = gen_reg_rtx (mode);
-         emit_clobber (tmp);
-         emit_move_insn (gen_lowpart (tmp_mode, tmp), words[0]);
-         emit_move_insn (gen_highpart (tmp_mode, tmp), words[1]);
-         emit_move_insn (target, tmp);
+         gcc_assert (tmp_mode == DImode || tmp_mode == SImode);
+         machine_mode concat_mode = tmp_mode == DImode ? V2DImode : V2SImode;
+         rtx tmp = gen_reg_rtx (concat_mode);
+         vals = gen_rtx_PARALLEL (concat_mode, gen_rtvec_v (2, words));
+         ix86_expand_vector_init_general (mmx_ok, concat_mode, tmp, vals);
+         emit_move_insn (target, gen_lowpart (mode, tmp));
         }
        else if (n_words == 4)
         {
@@ -17333,9 +17485,9 @@ ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
                  ? gen_reg_rtx (V16HFmode)
                  : gen_reg_rtx (V16BFmode));
           if (elt < 16)
-           emit_insn (maybe_gen_vec_extract_lo (mode, tmp, vec));
+           emit_insn (gen_vec_extract_lo (mode, tmp, vec));
           else
-           emit_insn (maybe_gen_vec_extract_hi (mode, tmp, vec));
+           emit_insn (gen_vec_extract_hi (mode, tmp, vec));
           ix86_expand_vector_extract (false, target, tmp, elt & 15);
           return;
         }
@@ -17349,9 +17501,9 @@ ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
                  ? gen_reg_rtx (V8HFmode)
                  : gen_reg_rtx (V8BFmode));
           if (elt < 8)
-           emit_insn (maybe_gen_vec_extract_lo (mode, tmp, vec));
+           emit_insn (gen_vec_extract_lo (mode, tmp, vec));
           else
-           emit_insn (maybe_gen_vec_extract_hi (mode, tmp, vec));
+           emit_insn (gen_vec_extract_hi (mode, tmp, vec));
           ix86_expand_vector_extract (false, target, tmp, elt & 7);
           return;
         }
@@ -19290,6 +19442,23 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
        mmode = VOIDmode;
      }
  
+  /* Canonicalize vec_merge.  */
+  if (swap_commutative_operands_p (op1, op0)
+      /* If the two operands have the same precedence, the first bit
+        of the mask selects the first operand.  */
+      || (!swap_commutative_operands_p (op0, op1)
+         && !(mask & 1)))
+    {
+      unsigned n_elts = GET_MODE_NUNITS (vmode);
+      std::swap (op0, op1);
+      unsigned HOST_WIDE_INT mask_all = HOST_WIDE_INT_1U;
+      if (n_elts == HOST_BITS_PER_WIDE_INT)
+       mask_all  = -1;
+      else
+       mask_all = (HOST_WIDE_INT_1U << n_elts) - 1;
+      mask = ~mask & mask_all;
+    }
+
    if (mmode != VOIDmode)
      maskop = force_reg (mmode, gen_int_mode (mask, mmode));
    else
@@ -22415,18 +22584,18 @@ expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
        if (d->testing_p)
         return true;
  
-      rtx (*maybe_gen) (machine_mode, int, rtx, rtx, rtx);
+      rtx (*gen_interleave) (machine_mode, int, rtx, rtx, rtx);
        if (elt >= nelt2)
         {
-         maybe_gen = maybe_gen_vec_interleave_high;
+         gen_interleave = gen_vec_interleave_high;
           elt -= nelt2;
         }
        else
-       maybe_gen = maybe_gen_vec_interleave_low;
+       gen_interleave = gen_vec_interleave_low;
        nelt2 /= 2;
  
        dest = gen_reg_rtx (vmode);
-      emit_insn (maybe_gen (vmode, 1, dest, op0, op0));
+      emit_insn (gen_interleave (vmode, 1, dest, op0, op0));
  
        vmode = V4SImode;
        op0 = gen_lowpart (vmode, dest);
@@ -23105,71 +23274,6 @@ ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
    gcc_assert (ok);
  }
  
-/* This function is similar as ix86_expand_vecop_qihi,
-   but optimized under AVX512BW by using vpmovwb.
-   For example, optimize vector MUL generation like
-
-   vpmovzxbw ymm2, xmm0
-   vpmovzxbw ymm3, xmm1
-   vpmullw   ymm4, ymm2, ymm3
-   vpmovwb   xmm0, ymm4
-
-   it would take less instructions than ix86_expand_vecop_qihi.
-   Return true if success.  */
-
-static bool
-ix86_expand_vecop_qihi2 (enum rtx_code code, rtx dest, rtx op1, rtx op2)
-{
-  machine_mode himode, qimode = GET_MODE (dest);
-  rtx hop1, hop2, hdest;
-  rtx (*gen_truncate)(rtx, rtx);
-  bool uns_p = (code == ASHIFTRT) ? false : true;
-
-  /* There are no V64HImode instructions.  */
-  if (qimode == V64QImode)
-    return false;
-
-  /* vpmovwb only available under AVX512BW.  */
-  if (!TARGET_AVX512BW)
-    return false;
-  if ((qimode == V8QImode || qimode == V16QImode)
-      && !TARGET_AVX512VL)
-    return false;
-  /* Do not generate ymm/zmm instructions when
-     target prefers 128/256 bit vector width.  */
-  if ((qimode == V16QImode && TARGET_PREFER_AVX128)
-      || (qimode == V32QImode && TARGET_PREFER_AVX256))
-    return false;
-
-  switch (qimode)
-    {
-    case E_V8QImode:
-      himode = V8HImode;
-      gen_truncate = gen_truncv8hiv8qi2;
-      break;
-    case E_V16QImode:
-      himode = V16HImode;
-      gen_truncate = gen_truncv16hiv16qi2;
-      break;
-    case E_V32QImode:
-      himode = V32HImode;
-      gen_truncate = gen_truncv32hiv32qi2;
-      break;
-    default:
-      gcc_unreachable ();
-    }
-
-  hop1 = gen_reg_rtx (himode);
-  hop2 = gen_reg_rtx (himode);
-  hdest = gen_reg_rtx (himode);
-  emit_insn (gen_extend_insn (hop1, op1, himode, qimode, uns_p));
-  emit_insn (gen_extend_insn (hop2, op2, himode, qimode, uns_p));
-  emit_insn (gen_rtx_SET (hdest, simplify_gen_binary (code, himode,
-                                                     hop1, hop2)));
-  emit_insn (gen_truncate (dest, hdest));
-  return true;
-}
-
  /* Expand a vector operation shift by constant for a V*QImode in terms of the
     same operation on V*HImode. Return true if success. */
  static bool
@@ -23274,9 +23378,9 @@ void
  ix86_expand_vecop_qihi_partial (enum rtx_code code, rtx dest, rtx op1, rtx op2)
  {
    machine_mode qimode = GET_MODE (dest);
-  rtx qop1, qop2, hop1, hop2, qdest, hres;
+  rtx qop1, qop2, hop1, hop2, qdest, hdest;
    bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
-  bool uns_p = true;
+  bool uns_p = code != ASHIFTRT;
  
    switch (qimode)
      {
@@ -23294,28 +23398,39 @@ ix86_expand_vecop_qihi_partial (enum rtx_code code, rtx dest, rtx op1, rtx op2)
    else
      qop2 = op2;
  
+  qdest = gen_reg_rtx (V16QImode);
+
+  if (CONST_INT_P (op2)
+      && (code == ASHIFT || code == LSHIFTRT || code == ASHIFTRT)
+      && ix86_expand_vec_shift_qihi_constant (code, qdest, qop1, qop2))
+    {
+      emit_move_insn (dest, gen_lowpart (qimode, qdest));
+      return;
+    }
+
    switch (code)
      {
      case MULT:
        gcc_assert (op2vec);
-      /* Unpack data such that we've got a source byte in each low byte of
-        each word.  We don't care what goes into the high byte of each word.
-        Rather than trying to get zero in there, most convenient is to let
-        it be a copy of the low byte.  */
-      hop1 = copy_to_reg (qop1);
-      hop2 = copy_to_reg (qop2);
-      emit_insn (gen_vec_interleave_lowv16qi (hop1, hop1, hop1));
-      emit_insn (gen_vec_interleave_lowv16qi (hop2, hop2, hop2));
-      break;
-
-    case ASHIFTRT:
-      uns_p = false;
+      if (!TARGET_SSE4_1)
+       {
+         /* Unpack data such that we've got a source byte in each low byte
+            of each word.  We don't care what goes into the high byte of
+            each word.  Rather than trying to get zero in there, most
+            convenient is to let it be a copy of the low byte.  */
+         hop1 = copy_to_reg (qop1);
+         hop2 = copy_to_reg (qop2);
+         emit_insn (gen_vec_interleave_lowv16qi (hop1, hop1, hop1));
+         emit_insn (gen_vec_interleave_lowv16qi (hop2, hop2, hop2));
+         break;
+       }
        /* FALLTHRU */
      case ASHIFT:
+    case ASHIFTRT:
      case LSHIFTRT:
        hop1 = gen_reg_rtx (V8HImode);
        ix86_expand_sse_unpack (hop1, qop1, uns_p, false);
-      /* vashr/vlshr/vashl  */
+      /* mult/vashr/vlshr/vashl  */
        if (op2vec)
         {
           hop2 = gen_reg_rtx (V8HImode);
@@ -23332,14 +23447,14 @@ ix86_expand_vecop_qihi_partial (enum rtx_code code, rtx dest, rtx op1, rtx op2)
    if (code != MULT && op2vec)
      {
        /* Expand vashr/vlshr/vashl.  */
-      hres = gen_reg_rtx (V8HImode);
-      emit_insn (gen_rtx_SET (hres,
+      hdest = gen_reg_rtx (V8HImode);
+      emit_insn (gen_rtx_SET (hdest,
                               simplify_gen_binary (code, V8HImode,
                                                    hop1, hop2)));
      }
    else
      /* Expand mult/ashr/lshr/ashl.  */
-    hres = expand_simple_binop (V8HImode, code, hop1, hop2,
+    hdest = expand_simple_binop (V8HImode, code, hop1, hop2,
                                 NULL_RTX, 1, OPTAB_DIRECT);
  
    if (TARGET_AVX512BW && TARGET_AVX512VL)
@@ -23349,21 +23464,18 @@ ix86_expand_vecop_qihi_partial (enum rtx_code code, rtx dest, rtx op1, rtx op2)
        else
         qdest = gen_reg_rtx (V8QImode);
  
-      emit_insn (gen_truncv8hiv8qi2 (qdest, hres));
+      emit_insn (gen_truncv8hiv8qi2 (qdest, hdest));
      }
    else
      {
        struct expand_vec_perm_d d;
-      rtx qres = gen_lowpart (V16QImode, hres);
+      rtx qres = gen_lowpart (V16QImode, hdest);
        bool ok;
        int i;
  
-      qdest = gen_reg_rtx (V16QImode);
-
        /* Merge the data back into the right place.  */
        d.target = qdest;
-      d.op0 = qres;
-      d.op1 = qres;
+      d.op0 = d.op1 = qres;
        d.vmode = V16QImode;
        d.nelt = 16;
        d.one_operand_p = false;
@@ -23380,6 +23492,116 @@ ix86_expand_vecop_qihi_partial (enum rtx_code code, rtx dest, rtx op1, rtx op2)
      emit_move_insn (dest, gen_lowpart (qimode, qdest));
  }
  
+/* Emit instruction in 2x wider mode.  For example, optimize
+   vector MUL generation like
+
+   vpmovzxbw ymm2, xmm0
+   vpmovzxbw ymm3, xmm1
+   vpmullw   ymm4, ymm2, ymm3
+   vpmovwb   xmm0, ymm4
+
+   which takes fewer instructions than ix86_expand_vecop_qihi.
+   Return true on success.  */
+
+static bool
+ix86_expand_vecop_qihi2 (enum rtx_code code, rtx dest, rtx op1, rtx op2)
+{
+  machine_mode himode, qimode = GET_MODE (dest);
+  machine_mode wqimode;
+  rtx qop1, qop2, hop1, hop2, hdest;
+  rtx (*gen_truncate)(rtx, rtx) = NULL;
+  bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
+  bool uns_p = code != ASHIFTRT;
+
+  if ((qimode == V16QImode && !TARGET_AVX2)
+      || (qimode == V32QImode && !TARGET_AVX512BW)
+      /* There are no V64HImode instructions.  */
+      || qimode == V64QImode)
+     return false;
+
+  /* Do not generate ymm/zmm instructions when
+     target prefers 128/256 bit vector width.  */
+  if ((qimode == V16QImode && TARGET_PREFER_AVX128)
+      || (qimode == V32QImode && TARGET_PREFER_AVX256))
+    return false;
+
+  switch (qimode)
+    {
+    case E_V16QImode:
+      himode = V16HImode;
+      if (TARGET_AVX512VL && TARGET_AVX512BW)
+       gen_truncate = gen_truncv16hiv16qi2;
+      break;
+    case E_V32QImode:
+      himode = V32HImode;
+      gen_truncate = gen_truncv32hiv32qi2;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  wqimode = GET_MODE_2XWIDER_MODE (qimode).require ();
+  qop1 = lowpart_subreg (wqimode, force_reg (qimode, op1), qimode);
+
+  if (op2vec)
+    qop2 = lowpart_subreg (wqimode, force_reg (qimode, op2), qimode);
+  else
+    qop2 = op2;
+
+  hop1 = gen_reg_rtx (himode);
+  ix86_expand_sse_unpack (hop1, qop1, uns_p, false);
+
+  if (op2vec)
+    {
+      hop2 = gen_reg_rtx (himode);
+      ix86_expand_sse_unpack (hop2, qop2, uns_p, false);
+    }
+  else
+    hop2 = qop2;
+
+  if (code != MULT && op2vec)
+    {
+      /* Expand vashr/vlshr/vashl.  */
+      hdest = gen_reg_rtx (himode);
+      emit_insn (gen_rtx_SET (hdest,
+                             simplify_gen_binary (code, himode,
+                                                  hop1, hop2)));
+    }
+  else
+    /* Expand mult/ashr/lshr/ashl.  */
+    hdest = expand_simple_binop (himode, code, hop1, hop2,
+                                NULL_RTX, 1, OPTAB_DIRECT);
+
+  if (gen_truncate)
+    emit_insn (gen_truncate (dest, hdest));
+  else
+    {
+      struct expand_vec_perm_d d;
+      rtx wqdest = gen_reg_rtx (wqimode);
+      rtx wqres = gen_lowpart (wqimode, hdest);
+      bool ok;
+      int i;
+
+      /* Merge the data back into the right place.  */
+      d.target = wqdest;
+      d.op0 = d.op1 = wqres;
+      d.vmode = wqimode;
+      d.nelt = GET_MODE_NUNITS (wqimode);
+      d.one_operand_p = false;
+      d.testing_p = false;
+
+      for (i = 0; i < d.nelt; ++i)
+       d.perm[i] = i * 2;
+
+      ok = ix86_expand_vec_perm_const_1 (&d);
+      gcc_assert (ok);
+
+      emit_move_insn (dest, gen_lowpart (qimode, wqdest));
+    }
+
+  return true;
+}
+
  /* Expand a vector operation CODE for a V*QImode in terms of the
     same operation on V*HImode.  */
  
@@ -23394,7 +23616,7 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
    bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
    struct expand_vec_perm_d d;
    bool full_interleave = true;
-  bool uns_p = true;
+  bool uns_p = code != ASHIFTRT;
    bool ok;
    int i;
  
@@ -23403,9 +23625,7 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
        && ix86_expand_vec_shift_qihi_constant (code, dest, op1, op2))
      return;
  
-  if (TARGET_AVX512BW
-      && VECTOR_MODE_P (GET_MODE (op2))
-      && ix86_expand_vecop_qihi2 (code, dest, op1, op2))
+  if (ix86_expand_vecop_qihi2 (code, dest, op1, op2))
      return;
  
    switch (qimode)
@@ -23462,10 +23682,8 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
        emit_insn (gen_ih (op1_h, op1, op1));
        break;
  
-    case ASHIFTRT:
-      uns_p = false;
-      /* FALLTHRU */
      case ASHIFT:
+    case ASHIFTRT:
      case LSHIFTRT:
        op1_l = gen_reg_rtx (himode);
        op1_h = gen_reg_rtx (himode);