i386: Improve memory copy from named address space [PR111657]

[thirdparty/gcc.git] / gcc / config / i386 / i386-expand.cc
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc

index 5334363e235bd37bd352edf11cbcdbd5363e8863..9a988347200ced7ee91c5a56dedeb5c4077a93f3 100644 (file)
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -1,4 +1,4 @@
-/* Copyright (C) 1988-2022 Free Software Foundation, Inc.
+/* Copyright (C) 1988-2023 Free Software Foundation, Inc.
  
  This file is part of GCC.
  
@@ -66,7 +66,6 @@ along with GCC; see the file COPYING3.  If not see
  #include "pass_manager.h"
  #include "target-globals.h"
  #include "gimple-iterator.h"
-#include "tree-vectorizer.h"
  #include "shrink-wrap.h"
  #include "builtins.h"
  #include "rtl-iter.h"
@@ -92,6 +91,7 @@ along with GCC; see the file COPYING3.  If not see
  #include "i386-options.h"
  #include "i386-builtins.h"
  #include "i386-expand.h"
+#include "asan.h"
  
  /* Split one or more double-mode RTL references into pairs of half-mode
     references.  The RTL can be REG, offsettable MEM, integer constant, or
@@ -173,6 +173,44 @@ split_double_concat (machine_mode mode, rtx dst, rtx lo, rtx hi)
    rtx dlo, dhi;
    int deleted_move_count = 0;
    split_double_mode (mode, &dst, 1, &dlo, &dhi);
+  /* Constraints ensure that if both lo and hi are MEMs, then
+     dst has early-clobber and thus addresses of MEMs don't use
+     dlo/dhi registers.  Otherwise if at least one of lo and hi are MEMs,
+     dlo/dhi are registers.  */
+  if (MEM_P (lo)
+      && rtx_equal_p (dlo, hi)
+      && reg_overlap_mentioned_p (dhi, lo))
+    {
+      /* If dlo is same as hi and lo's address uses dhi register,
+        code below would first emit_move_insn (dhi, hi)
+        and then emit_move_insn (dlo, lo).  But the former
+        would invalidate lo's address.  Load into dhi first,
+        then swap.  */
+      emit_move_insn (dhi, lo);
+      lo = dhi;
+    }
+  else if (MEM_P (hi)
+          && !MEM_P (lo)
+          && !rtx_equal_p (dlo, lo)
+          && reg_overlap_mentioned_p (dlo, hi))
+    {
+      /* In this case, code below would first emit_move_insn (dlo, lo)
+        and then emit_move_insn (dhi, hi).  But the former would
+        invalidate hi's address.  */
+      if (rtx_equal_p (dhi, lo))
+       {
+         /* We can't load into dhi first, so load into dlo
+            first and we'll swap.  */
+         emit_move_insn (dlo, hi);
+         hi = dlo;
+       }
+      else
+       {
+         /* Load into dhi first.  */
+         emit_move_insn (dhi, hi);
+         hi = dhi;
+       }
+    }
    if (!rtx_equal_p (dlo, hi))
      {
        if (!rtx_equal_p (dlo, lo))
@@ -263,7 +301,9 @@ ix86_convert_const_wide_int_to_broadcast (machine_mode mode, rtx op)
       broadcast only if vector broadcast is available.  */
    if (!TARGET_AVX
        || !CONST_WIDE_INT_P (op)
-      || standard_sse_constant_p (op, mode))
+      || standard_sse_constant_p (op, mode)
+      || (CONST_WIDE_INT_NUNITS (op) * HOST_BITS_PER_WIDE_INT
+         != GET_MODE_BITSIZE (mode)))
      return nullptr;
  
    HOST_WIDE_INT val = CONST_WIDE_INT_ELT (op, 0);
@@ -297,7 +337,7 @@ ix86_convert_const_wide_int_to_broadcast (machine_mode mode, rtx op)
    machine_mode vector_mode;
    if (!mode_for_vector (broadcast_mode, nunits).exists (&vector_mode))
      gcc_unreachable ();
-  rtx target = ix86_gen_scratch_sse_rtx (vector_mode);
+  rtx target = gen_reg_rtx (vector_mode);
    bool ok = ix86_expand_vector_init_duplicate (false, vector_mode,
                                                target,
                                                GEN_INT (val_broadcast));
@@ -388,6 +428,16 @@ ix86_expand_move (machine_mode mode, rtx operands[])
  
      default:
        break;
+
+    case SUBREG:
+      /* Transform TImode paradoxical SUBREG into zero_extendditi2.  */
+      if (TARGET_64BIT
+         && mode == TImode
+         && SUBREG_P (op1)
+         && GET_MODE (SUBREG_REG (op1)) == DImode
+         && SUBREG_BYTE (op1) == 0)
+       op1 = gen_rtx_ZERO_EXTEND (TImode, SUBREG_REG (op1));
+      break;
      }
  
    if ((flag_pic || MACHOPIC_INDIRECT)
@@ -480,7 +530,8 @@ ix86_expand_move (machine_mode mode, rtx operands[])
                   return;
                 }
             }
-         else if (GET_MODE_SIZE (mode) >= 16)
+         else if (CONST_WIDE_INT_P (op1)
+                  && GET_MODE_SIZE (mode) >= 16)
             {
               rtx tmp = ix86_convert_const_wide_int_to_broadcast
                 (GET_MODE (op0), op1);
@@ -490,6 +541,43 @@ ix86_expand_move (machine_mode mode, rtx operands[])
         }
      }
  
+  /* Special case inserting 64-bit values into a TImode register.  */
+  if (TARGET_64BIT
+      /* Disable for -O0 (see PR110587) unless naked (PR110533).  */
+      && (optimize || ix86_function_naked (current_function_decl))
+      && (mode == DImode || mode == DFmode)
+      && SUBREG_P (op0)
+      && GET_MODE (SUBREG_REG (op0)) == TImode
+      && REG_P (SUBREG_REG (op0))
+      && REG_P (op1))
+    {
+      /* Use *insvti_lowpart_1 to set lowpart.  */
+      if (SUBREG_BYTE (op0) == 0)
+       {
+         wide_int mask = wi::mask (64, true, 128);
+         rtx tmp = immed_wide_int_const (mask, TImode);
+         op0 = SUBREG_REG (op0);
+         tmp = gen_rtx_AND (TImode, copy_rtx (op0), tmp);
+         if (mode == DFmode)
+           op1 = gen_lowpart (DImode, op1);
+         op1 = gen_rtx_ZERO_EXTEND (TImode, op1);
+         op1 = gen_rtx_IOR (TImode, tmp, op1);
+       }
+      /* Use *insvti_highpart_1 to set highpart.  */
+      else if (SUBREG_BYTE (op0) == 8)
+       {
+         wide_int mask = wi::mask (64, false, 128);
+         rtx tmp = immed_wide_int_const (mask, TImode);
+         op0 = SUBREG_REG (op0);
+         tmp = gen_rtx_AND (TImode, copy_rtx (op0), tmp);
+         if (mode == DFmode)
+           op1 = gen_lowpart (DImode, op1);
+         op1 = gen_rtx_ZERO_EXTEND (TImode, op1);
+         op1 = gen_rtx_ASHIFT (TImode, op1, GEN_INT (64));
+         op1 = gen_rtx_IOR (TImode, tmp, op1);
+       }
+    }
+
    emit_insn (gen_rtx_SET (op0, op1));
  }
  
@@ -645,7 +733,7 @@ ix86_expand_vector_move (machine_mode mode, rtx operands[])
        if (!register_operand (op0, mode)
           && !register_operand (op1, mode))
         {
-         rtx scratch = ix86_gen_scratch_sse_rtx (mode);
+         rtx scratch = gen_reg_rtx (mode);
           emit_move_insn (scratch, op1);
           op1 = scratch;
         }
@@ -655,8 +743,9 @@ ix86_expand_vector_move (machine_mode mode, rtx operands[])
        return;
      }
  
-  /* Special case TImode to V1TImode conversions, via V2DI.  */
-  if (mode == V1TImode
+  /* Special case TImode to 128-bit vector conversions via V2DI.  */
+  if (VECTOR_MODE_P (mode)
+      && GET_MODE_SIZE (mode) == 16
        && SUBREG_P (op1)
        && GET_MODE (SUBREG_REG (op1)) == TImode
        && TARGET_64BIT && TARGET_SSE
@@ -668,7 +757,7 @@ ix86_expand_vector_move (machine_mode mode, rtx operands[])
        emit_move_insn (lo, gen_lowpart (DImode, SUBREG_REG (op1)));
        emit_move_insn (hi, gen_highpart (DImode, SUBREG_REG (op1)));
        emit_insn (gen_vec_concatv2di (tmp, lo, hi));
-      emit_move_insn (op0, gen_lowpart (V1TImode, tmp));
+      emit_move_insn (op0, gen_lowpart (mode, tmp));
        return;
      }
  
@@ -687,7 +776,7 @@ ix86_expand_vector_move (machine_mode mode, rtx operands[])
        && !register_operand (op0, mode)
        && !register_operand (op1, mode))
      {
-      rtx tmp = ix86_gen_scratch_sse_rtx (GET_MODE (op0));
+      rtx tmp = gen_reg_rtx (GET_MODE (op0));
        emit_move_insn (tmp, op1);
        emit_move_insn (op0, tmp);
        return;
@@ -978,6 +1067,7 @@ ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
    rtx op0 = operands[0];
    rtx op1 = operands[1];
    rtx op2 = operands[2];
+  rtx src;
  
    machine_mode dmode = GET_MODE (op0);
    machine_mode smode = GET_MODE (op1);
@@ -1001,11 +1091,20 @@ ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
    op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
    op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));
  
-  op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
-  op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
-  rtx insn = gen_rtx_SET (dest, gen_rtx_VEC_CONCAT (sse_dmode,
-                                                   op1, op2));
-  emit_insn (insn);
+  /* packusdw/packuswb does unsigned saturation of a signed source
+     which is different from generic us_truncate RTX.  */
+  if (code == US_TRUNCATE)
+    src = gen_rtx_UNSPEC (sse_dmode,
+                         gen_rtvec (2, op1, op2),
+                         UNSPEC_US_TRUNCATE);
+  else
+    {
+      op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
+      op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
+      src = gen_rtx_VEC_CONCAT (sse_dmode, op1, op2);
+    }
+
+  emit_move_insn (dest, src);
  
    ix86_move_vector_high_sse_to_mmx (op0);
  }
@@ -1025,8 +1124,9 @@ ix86_split_mmx_punpck (rtx operands[], bool high_p)
  
    switch (mode)
      {
-    case E_V4QImode:
      case E_V8QImode:
+    case E_V4QImode:
+    case E_V2QImode:
        sse_mode = V16QImode;
        double_sse_mode = V32QImode;
        mask = gen_rtx_PARALLEL (VOIDmode,
@@ -1746,9 +1846,9 @@ ix86_split_convert_uns_si_sse (rtx operands[])
        input = gen_rtx_REG (vecmode, REGNO (input));
        emit_move_insn (value, CONST0_RTX (vecmode));
        if (vecmode == V4SFmode)
-       emit_insn (gen_sse_movss (value, value, input));
+       emit_insn (gen_sse_movss_v4sf (value, value, input));
        else
-       emit_insn (gen_sse2_movsd (value, value, input));
+       emit_insn (gen_sse2_movsd_v2df (value, value, input));
      }
  
    emit_move_insn (large, two31);
@@ -1992,7 +2092,7 @@ ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
  }
  
  /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
-   pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
+   pattern can be used on it instead of fixuns_trunc*.
     This is done by doing just signed conversion if < 0x1p31, and otherwise by
     subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards.  */
  
@@ -2225,7 +2325,7 @@ ix86_expand_copysign (rtx operands[])
    else
      dest = NULL_RTX;
    op1 = lowpart_subreg (vmode, force_reg (mode, operands[2]), mode);
-  mask = ix86_build_signbit_mask (vmode, 0, 0);
+  mask = ix86_build_signbit_mask (vmode, TARGET_AVX512F && mode != HFmode, 0);
  
    if (CONST_DOUBLE_P (operands[1]))
      {
@@ -2312,6 +2412,7 @@ ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
    /* Handle special case - vector comparsion with boolean result, transform
       it using ptest instruction.  */
    if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
+      || (mode == TImode && !TARGET_64BIT)
        || mode == OImode)
      {
        rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
@@ -2319,7 +2420,7 @@ ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
  
        gcc_assert (code == EQ || code == NE);
  
-      if (mode == OImode)
+      if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
         {
           op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
           op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
@@ -2329,8 +2430,8 @@ ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
        tmp = gen_reg_rtx (mode);
        emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
        tmp = gen_lowpart (p_mode, tmp);
-      emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
-                             gen_rtx_UNSPEC (CCmode,
+      emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
+                             gen_rtx_UNSPEC (CCZmode,
                                               gen_rtvec (2, tmp, tmp),
                                               UNSPEC_PTEST)));
        tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
@@ -2626,6 +2727,35 @@ ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
    machine_mode op_mode = GET_MODE (op0);
    bool is_sse = SSE_FLOAT_MODE_SSEMATH_OR_HF_P (op_mode);
  
+  if (op_mode == BFmode)
+    {
+      rtx op = gen_lowpart (HImode, op0);
+      if (CONST_INT_P (op))
+       op = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
+                                            op0, BFmode);
+      else
+       {
+         rtx t1 = gen_reg_rtx (SImode);
+         emit_insn (gen_zero_extendhisi2 (t1, op));
+         emit_insn (gen_ashlsi3 (t1, t1, GEN_INT (16)));
+         op = gen_lowpart (SFmode, t1);
+       }
+      *pop0 = op;
+      op = gen_lowpart (HImode, op1);
+      if (CONST_INT_P (op))
+       op = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
+                                            op1, BFmode);
+      else
+       {
+         rtx t1 = gen_reg_rtx (SImode);
+         emit_insn (gen_zero_extendhisi2 (t1, op));
+         emit_insn (gen_ashlsi3 (t1, t1, GEN_INT (16)));
+         op = gen_lowpart (SFmode, t1);
+       }
+      *pop1 = op;
+      return ix86_prepare_fp_compare_args (code, pop0, pop1);
+    }
+
    /* All of the unordered compare instructions only work on registers.
       The same is true of the fcomi compare instructions.  The XFmode
       compare instructions require registers except when comparing
@@ -2876,9 +3006,26 @@ ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
    cmpmode = SELECT_CC_MODE (code, op0, op1);
    flags = gen_rtx_REG (cmpmode, FLAGS_REG);
  
+  /* Attempt to use PTEST, if available, when testing vector modes for
+     equality/inequality against zero.  */
+  if (op1 == const0_rtx
+      && SUBREG_P (op0)
+      && cmpmode == CCZmode
+      && SUBREG_BYTE (op0) == 0
+      && REG_P (SUBREG_REG (op0))
+      && VECTOR_MODE_P (GET_MODE (SUBREG_REG (op0)))
+      && TARGET_SSE4_1
+      && GET_MODE (op0) == TImode
+      && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op0))) == 16)
+    {
+      tmp = SUBREG_REG (op0);
+      tmp = gen_rtx_UNSPEC (CCZmode, gen_rtvec (2, tmp, tmp), UNSPEC_PTEST);
+    }
+  else
+    tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
+
    /* This is very simple, but making the interface the same as in the
       FP case makes the rest of the code easier.  */
-  tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
    emit_insn (gen_rtx_SET (flags, tmp));
  
    /* Return the test that should be put into the flags user, i.e.
@@ -3164,6 +3311,10 @@ ix86_expand_int_movcc (rtx operands[])
           && !TARGET_64BIT))
      return false;
  
+  if (GET_MODE (op0) == BFmode
+      && !ix86_fp_comparison_operator (operands[1], VOIDmode))
+    return false;
+
    start_sequence ();
    compare_op = ix86_expand_compare (code, op0, op1);
    compare_seq = get_insns ();
@@ -4238,6 +4389,10 @@ ix86_expand_fp_movcc (rtx operands[])
    rtx op0 = XEXP (operands[1], 0);
    rtx op1 = XEXP (operands[1], 1);
  
+  if (GET_MODE (op0) == BFmode
+      && !ix86_fp_comparison_operator (operands[1], VOIDmode))
+    return false;
+
    if (SSE_FLOAT_MODE_SSEMATH_OR_HF_P (mode))
      {
        machine_mode cmode;
@@ -4473,15 +4628,88 @@ ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
         case GTU:
           break;
  
-       case NE:
         case LE:
         case LEU:
+         /* x <= cst can be handled as x < cst + 1 unless there is
+            wrap around in cst + 1.  */
+         if (GET_CODE (cop1) == CONST_VECTOR
+             && GET_MODE_INNER (mode) != TImode)
+           {
+             unsigned int n_elts = GET_MODE_NUNITS (mode), i;
+             machine_mode eltmode = GET_MODE_INNER (mode);
+             for (i = 0; i < n_elts; ++i)
+               {
+                 rtx elt = CONST_VECTOR_ELT (cop1, i);
+                 if (!CONST_INT_P (elt))
+                   break;
+                 if (code == GE)
+                   {
+                     /* For LE punt if some element is signed maximum.  */
+                     if ((INTVAL (elt) & (GET_MODE_MASK (eltmode) >> 1))
+                         == (GET_MODE_MASK (eltmode) >> 1))
+                       break;
+                   }
+                 /* For LEU punt if some element is unsigned maximum.  */
+                 else if (elt == constm1_rtx)
+                   break;
+               }
+             if (i == n_elts)
+               {
+                 rtvec v = rtvec_alloc (n_elts);
+                 for (i = 0; i < n_elts; ++i)
+                   RTVEC_ELT (v, i)
+                     = gen_int_mode (INTVAL (CONST_VECTOR_ELT (cop1, i)) + 1,
+                                     eltmode);
+                 cop1 = gen_rtx_CONST_VECTOR (mode, v);
+                 std::swap (cop0, cop1);
+                 code = code == LE ? GT : GTU;
+                 break;
+               }
+           }
+         /* FALLTHRU */
+       case NE:
           code = reverse_condition (code);
           *negate = true;
           break;
  
         case GE:
         case GEU:
+         /* x >= cst can be handled as x > cst - 1 unless there is
+            wrap around in cst - 1.  */
+         if (GET_CODE (cop1) == CONST_VECTOR
+             && GET_MODE_INNER (mode) != TImode)
+           {
+             unsigned int n_elts = GET_MODE_NUNITS (mode), i;
+             machine_mode eltmode = GET_MODE_INNER (mode);
+             for (i = 0; i < n_elts; ++i)
+               {
+                 rtx elt = CONST_VECTOR_ELT (cop1, i);
+                 if (!CONST_INT_P (elt))
+                   break;
+                 if (code == GE)
+                   {
+                     /* For GE punt if some element is signed minimum.  */
+                     if (INTVAL (elt) < 0
+                         && ((INTVAL (elt) & (GET_MODE_MASK (eltmode) >> 1))
+                             == 0))
+                       break;
+                   }
+                 /* For GEU punt if some element is zero.  */
+                 else if (elt == const0_rtx)
+                   break;
+               }
+             if (i == n_elts)
+               {
+                 rtvec v = rtvec_alloc (n_elts);
+                 for (i = 0; i < n_elts; ++i)
+                   RTVEC_ELT (v, i)
+                     = gen_int_mode (INTVAL (CONST_VECTOR_ELT (cop1, i)) - 1,
+                                     eltmode);
+                 cop1 = gen_rtx_CONST_VECTOR (mode, v);
+                 code = code == GE ? GT : GTU;
+                 break;
+               }
+           }
           code = reverse_condition (code);
           *negate = true;
           /* FALLTHRU */
@@ -4519,6 +4747,11 @@ ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
             }
         }
  
+      if (GET_CODE (cop0) == CONST_VECTOR)
+       cop0 = force_reg (mode, cop0);
+      else if (GET_CODE (cop1) == CONST_VECTOR)
+       cop1 = force_reg (mode, cop1);
+
        rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode);
        rtx opfalse = op_false ? op_false : CONST0_RTX (data_mode);
        if (*negate)
@@ -4715,13 +4948,13 @@ ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
    if (*negate)
      std::swap (op_true, op_false);
  
+  if (GET_CODE (cop1) == CONST_VECTOR)
+    cop1 = force_reg (mode, cop1);
+
    /* Allow the comparison to be done in one mode, but the movcc to
       happen in another mode.  */
    if (data_mode == mode)
-    {
-      x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
-                              op_true, op_false);
-    }
+    x = ix86_expand_sse_cmp (dest, code, cop0, cop1, op_true, op_false);
    else
      {
        gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
@@ -5404,7 +5637,43 @@ ix86_expand_vec_perm (rtx operands[])
      }
  }
  
-/* Unpack OP[1] into the next wider integer vector type.  UNSIGNED_P is
+/* Extend SRC into next wider integer vector type.  UNSIGNED_P is
+   true if we should do zero extension, else sign extension.
+
+   SRC must have one of the modes V8QI, V4QI, V2QI, V4HI, V2HI or V2SI;
+   any other mode reaches gcc_unreachable.  The extension is produced by
+   ix86_split_mmx_punpck (called with HIGH_P false) into a fresh pseudo
+   of SRC's mode, which is then moved into DEST through a lowpart subreg
+   in DEST's mode.  */
+
+void
+ix86_expand_sse_extend (rtx dest, rtx src, bool unsigned_p)
+{
+  machine_mode imode = GET_MODE (src);
+  rtx ops[3];
+
+  /* Only the modes listed here are supported.  */
+  switch (imode)
+    {
+    case E_V8QImode:
+    case E_V4QImode:
+    case E_V2QImode:
+    case E_V4HImode:
+    case E_V2HImode:
+    case E_V2SImode:
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  /* ops[0] receives the punpck result, in SRC's mode.  */
+  ops[0] = gen_reg_rtx (imode);
+
+  ops[1] = force_reg (imode, src);
+
+  /* ops[2] is the second punpck input: an all-zero vector for zero
+     extension, otherwise the result of the element-wise comparison
+     0 > SRC (presumably an all-ones mask for each negative element,
+     i.e. the replicated sign -- confirm against ix86_expand_sse_cmp).  */
+  if (unsigned_p)
+    ops[2] = force_reg (imode, CONST0_RTX (imode));
+  else
+    ops[2] = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
+                                 ops[1], pc_rtx, pc_rtx);
+
+  ix86_split_mmx_punpck (ops, false);
+  /* Reinterpret the result in DEST's mode and store it.  */
+  emit_move_insn (dest, lowpart_subreg (GET_MODE (dest), ops[0], imode));
+}
+
+/* Unpack SRC into the next wider integer vector type.  UNSIGNED_P is
     true if we should do zero extension, else sign extension.  HIGH_P is
     true if we want the N/2 high elements, else the low elements.  */
  
@@ -6068,7 +6337,7 @@ ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
        if (count >= half_width)
         {
           emit_move_insn (high[0], low[1]);
-         emit_move_insn (low[0], const0_rtx);
+         ix86_expand_clear (low[0]);
  
           if (count > half_width)
             ix86_expand_ashl_const (high[0], count - half_width, mode);
@@ -6948,6 +7217,37 @@ ix86_expand_v1ti_ashiftrt (rtx operands[])
      }
  }
  
+/* Replace all occurrences of REG FROM with REG TO in X, including
+   occurrences with different modes.
+
+   FROM and TO must be REGs of the same mode (verified by
+   gcc_checking_assert).  If FROM is not mentioned in X, X itself is
+   returned; otherwise the replacement is performed on a copy of X
+   made with copy_rtx and that copy is returned.  */
+
+rtx
+ix86_replace_reg_with_reg (rtx x, rtx from, rtx to)
+{
+  gcc_checking_assert (REG_P (from)
+                      && REG_P (to)
+                      && GET_MODE (from) == GET_MODE (to));
+  /* Nothing to replace: hand back the original rtx.  */
+  if (!reg_overlap_mentioned_p (from, x))
+    return x;
+  rtx ret = copy_rtx (x);
+  subrtx_ptr_iterator::array_type array;
+  FOR_EACH_SUBRTX_PTR (iter, array, &ret, NONCONST)
+    {
+      rtx *loc = *iter;
+      /* From here on X is reused to hold the sub-rtx being visited.  */
+      x = *loc;
+      if (REG_P (x) && REGNO (x) == REGNO (from))
+       {
+         /* The very same rtx object as FROM: substitute TO directly.  */
+         if (x == from)
+           *loc = to;
+         else
+           {
+             /* Same register number but a different rtx: build a new
+                REG in that rtx's own mode using TO's register number.
+                Only single-register references are expected here.  */
+             gcc_checking_assert (REG_NREGS (x) == 1);
+             *loc = gen_rtx_REG (GET_MODE (x), REGNO (to));
+           }
+       }
+    }
+  return ret;
+}
+
  /* Return mode for the memcpy/memset loop counter.  Prefer SImode over
     DImode for constant loop counts.  */
  
@@ -8020,6 +8320,11 @@ alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
  {
    if (alg == no_stringop)
      return false;
+  /* It is not possible to use a library call if we have non-default
+     address space.  We can do better than the generic byte-at-a-time
+     loop, used as a fallback.  */
+  if (alg == libcall && have_as)
+    return false;
    if (alg == vector_loop)
      return TARGET_SSE || TARGET_AVX;
    /* Algorithms using the rep prefix want at least edi and ecx;
@@ -8194,8 +8499,12 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
         gcc_assert (alg != libcall);
        return alg;
      }
+
+  /* Try to use some reasonable fallback algorithm.  Note that for
+     non-default address spaces we default to a loop instead of
+     a libcall.  */
    return (alg_usable_p (algs->unknown_size, memset, have_as)
-         ? algs->unknown_size : libcall);
+         ? algs->unknown_size : have_as ? loop : libcall);
  }
  
  /* Decide on alignment.  We know that the operand is already aligned to ALIGN
@@ -8518,6 +8827,8 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
  
        if (TARGET_AVX256_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 128)
         move_mode = TImode;
+      if (TARGET_AVX512_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 256)
+       move_mode = OImode;
  
        /* Find the corresponding vector mode with the same size as MOVE_MODE.
          MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.).  */
@@ -9323,6 +9634,17 @@ ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
        fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
      }
  
+  /* PR100665: Hwasan may tag code pointer which is not supported by LAM,
+     mask off code pointers here.
+     TODO: also need to handle indirect jump.  */
+  if (ix86_memtag_can_tag_addresses () && !fndecl
+      && sanitize_flags_p (SANITIZE_HWADDRESS))
+    {
+      rtx untagged_addr = ix86_memtag_untagged_pointer (XEXP (fnaddr, 0),
+                                                       NULL_RTX);
+      fnaddr = gen_rtx_MEM (QImode, untagged_addr);
+    }
+
    call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
  
    if (retval)
@@ -10022,6 +10344,18 @@ ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
    machine_mode mode1 = insn_data[d->icode].operand[1].mode;
    enum rtx_code comparison = d->comparison;
  
+  /* ptest reg, reg sets the carry flag.  */
+  if (comparison == LTU
+      && (d->code == IX86_BUILTIN_PTESTC
+         || d->code == IX86_BUILTIN_PTESTC256)
+      && rtx_equal_p (op0, op1))
+    {
+      if (!target)
+       target = gen_reg_rtx (SImode);
+      emit_move_insn (target, const1_rtx);
+      return target;
+    }
+
    if (VECTOR_MODE_P (mode0))
      op0 = safe_vector_operand (op0, mode0);
    if (VECTOR_MODE_P (mode1))
@@ -10308,6 +10642,7 @@ ix86_expand_args_builtin (const struct builtin_description *d,
        return ix86_expand_sse_ptest (d, exp, target);
      case FLOAT128_FTYPE_FLOAT128:
      case FLOAT_FTYPE_FLOAT:
+    case FLOAT_FTYPE_BFLOAT16:
      case INT_FTYPE_INT:
      case UINT_FTYPE_UINT:
      case UINT16_FTYPE_UINT16:
@@ -10425,9 +10760,9 @@ ix86_expand_args_builtin (const struct builtin_description *d,
      case V8DF_FTYPE_V2DF:
      case V8DF_FTYPE_V8DF:
      case V4DI_FTYPE_V4DI:
-    case V16HI_FTYPE_V16SF:
-    case V8HI_FTYPE_V8SF:
-    case V8HI_FTYPE_V4SF:
+    case V16BF_FTYPE_V16SF:
+    case V8BF_FTYPE_V8SF:
+    case V8BF_FTYPE_V4SF:
        nargs = 1;
        break;
      case V4SF_FTYPE_V4SF_VEC_MERGE:
@@ -10462,6 +10797,7 @@ ix86_expand_args_builtin (const struct builtin_description *d,
      case V4SF_FTYPE_V4SF_UINT:
      case V4SF_FTYPE_V4SF_DI:
      case V4SF_FTYPE_V4SF_SI:
+    case V4DI_FTYPE_V4DI_V2DI:
      case V2DI_FTYPE_V2DI_V2DI:
      case V2DI_FTYPE_V16QI_V16QI:
      case V2DI_FTYPE_V4SI_V4SI:
@@ -10555,12 +10891,12 @@ ix86_expand_args_builtin (const struct builtin_description *d,
      case USI_FTYPE_USI_USI:
      case UDI_FTYPE_UDI_UDI:
      case V16SI_FTYPE_V8DF_V8DF:
-    case V32HI_FTYPE_V16SF_V16SF:
-    case V16HI_FTYPE_V8SF_V8SF:
-    case V8HI_FTYPE_V4SF_V4SF:
-    case V16HI_FTYPE_V16SF_UHI:
-    case V8HI_FTYPE_V8SF_UQI:
-    case V8HI_FTYPE_V4SF_UQI:
+    case V32BF_FTYPE_V16SF_V16SF:
+    case V16BF_FTYPE_V8SF_V8SF:
+    case V8BF_FTYPE_V4SF_V4SF:
+    case V16BF_FTYPE_V16SF_UHI:
+    case V8BF_FTYPE_V8SF_UQI:
+    case V8BF_FTYPE_V4SF_UQI:
        nargs = 2;
        break;
      case V2DI_FTYPE_V2DI_INT_CONVERT:
@@ -10759,6 +11095,7 @@ ix86_expand_args_builtin (const struct builtin_description *d,
      case V8HI_FTYPE_V8DI_V8HI_UQI:
      case V8SI_FTYPE_V8DI_V8SI_UQI:
      case V4SI_FTYPE_V4SI_V4SI_V4SI:
+    case V4DI_FTYPE_V4DI_V4DI_V2DI:
      case V16SI_FTYPE_V16SI_V16SI_V16SI:
      case V8DI_FTYPE_V8DI_V8DI_V8DI:
      case V32HI_FTYPE_V32HI_V32HI_V32HI:
@@ -10766,15 +11103,15 @@ ix86_expand_args_builtin (const struct builtin_description *d,
      case V16HI_FTYPE_V16HI_V16HI_V16HI:
      case V8SI_FTYPE_V8SI_V8SI_V8SI:
      case V8HI_FTYPE_V8HI_V8HI_V8HI:
-    case V32HI_FTYPE_V16SF_V16SF_USI:
-    case V16HI_FTYPE_V8SF_V8SF_UHI:
-    case V8HI_FTYPE_V4SF_V4SF_UQI:
-    case V16HI_FTYPE_V16SF_V16HI_UHI:
-    case V8HI_FTYPE_V8SF_V8HI_UQI:
-    case V8HI_FTYPE_V4SF_V8HI_UQI:
-    case V16SF_FTYPE_V16SF_V32HI_V32HI:
-    case V8SF_FTYPE_V8SF_V16HI_V16HI:
-    case V4SF_FTYPE_V4SF_V8HI_V8HI:
+    case V32BF_FTYPE_V16SF_V16SF_USI:
+    case V16BF_FTYPE_V8SF_V8SF_UHI:
+    case V8BF_FTYPE_V4SF_V4SF_UQI:
+    case V16BF_FTYPE_V16SF_V16BF_UHI:
+    case V8BF_FTYPE_V8SF_V8BF_UQI:
+    case V8BF_FTYPE_V4SF_V8BF_UQI:
+    case V16SF_FTYPE_V16SF_V32BF_V32BF:
+    case V8SF_FTYPE_V8SF_V16BF_V16BF:
+    case V4SF_FTYPE_V4SF_V8BF_V8BF:
        nargs = 3;
        break;
      case V32QI_FTYPE_V32QI_V32QI_INT:
@@ -10921,9 +11258,9 @@ ix86_expand_args_builtin (const struct builtin_description *d,
      case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
      case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
      case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
-    case V32HI_FTYPE_V16SF_V16SF_V32HI_USI:
-    case V16HI_FTYPE_V8SF_V8SF_V16HI_UHI:
-    case V8HI_FTYPE_V4SF_V4SF_V8HI_UQI:
+    case V32BF_FTYPE_V16SF_V16SF_V32BF_USI:
+    case V16BF_FTYPE_V8SF_V8SF_V16BF_UHI:
+    case V8BF_FTYPE_V4SF_V4SF_V8BF_UQI:
        nargs = 4;
        break;
      case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
@@ -10931,6 +11268,7 @@ ix86_expand_args_builtin (const struct builtin_description *d,
      case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
      case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
      case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
+    case V4SI_FTYPE_V4SI_V4SI_V4SI_INT:
        nargs = 4;
        nargs_constant = 1;
        break;
@@ -10961,9 +11299,9 @@ ix86_expand_args_builtin (const struct builtin_description *d,
        break;
      case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
      case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
-    case V16SF_FTYPE_V16SF_V32HI_V32HI_UHI:
-    case V8SF_FTYPE_V8SF_V16HI_V16HI_UQI:
-    case V4SF_FTYPE_V4SF_V8HI_V8HI_UQI:
+    case V16SF_FTYPE_V16SF_V32BF_V32BF_UHI:
+    case V8SF_FTYPE_V8SF_V16BF_V16BF_UQI:
+    case V4SF_FTYPE_V4SF_V8BF_V8BF_UQI:
        nargs = 4;
        break;
      case UQI_FTYPE_V8DI_V8DI_INT_UQI:
@@ -11823,8 +12161,9 @@ ix86_expand_special_args_builtin (const struct builtin_description *d,
    tree arg;
    rtx pat, op;
    unsigned int i, nargs, arg_adjust, memory;
+  unsigned int constant = 100;
    bool aligned_mem = false;
-  rtx xops[3];
+  rtx xops[4];
    enum insn_code icode = d->icode;
    const struct insn_data_d *insn_p = &insn_data[icode];
    machine_mode tmode = insn_p->operand[0].mode;
@@ -11860,6 +12199,14 @@ ix86_expand_special_args_builtin (const struct builtin_description *d,
      case V8SF_FTYPE_PCV4SF:
      case V8SF_FTYPE_PCFLOAT:
      case V4SF_FTYPE_PCFLOAT:
+    case V4SF_FTYPE_PCFLOAT16:
+    case V4SF_FTYPE_PCBFLOAT16:
+    case V4SF_FTYPE_PCV8BF:
+    case V4SF_FTYPE_PCV8HF:
+    case V8SF_FTYPE_PCFLOAT16:
+    case V8SF_FTYPE_PCBFLOAT16:
+    case V8SF_FTYPE_PCV16HF:
+    case V8SF_FTYPE_PCV16BF:
      case V4DF_FTYPE_PCV2DF:
      case V4DF_FTYPE_PCDOUBLE:
      case V2DF_FTYPE_PCDOUBLE:
@@ -12107,6 +12454,13 @@ ix86_expand_special_args_builtin (const struct builtin_description *d,
        klass = load;
        memory = 0;
        break;
+    case INT_FTYPE_PINT_INT_INT_INT:
+    case LONGLONG_FTYPE_PLONGLONG_LONGLONG_LONGLONG_INT:
+      nargs = 4;
+      klass = load;
+      memory = 0;
+      constant = 3;
+      break;
      default:
        gcc_unreachable ();
      }
@@ -12172,6 +12526,15 @@ ix86_expand_special_args_builtin (const struct builtin_description *d,
           if (MEM_ALIGN (op) < align)
             set_mem_align (op, align);
         }
+      else if (i == constant)
+       {
+         /* This must be the constant.  */
+         if (!insn_p->operand[nargs].predicate(op, SImode))
+           {
+             error ("the fourth argument must be one of enum %qs", "_CMPCCX_ENUM");
+             return const0_rtx;
+           }
+       }
        else
         {
           /* This must be register.  */
@@ -12213,6 +12576,9 @@ ix86_expand_special_args_builtin (const struct builtin_description *d,
      case 3:
        pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
        break;
+    case 4:
+      pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2], xops[3]);
+      break;
      default:
        gcc_unreachable ();
      }
@@ -12332,7 +12698,7 @@ ix86_expand_vec_set_builtin (tree exp)
    op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
    elt = get_element_number (TREE_TYPE (arg0), arg2);
  
-  if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
+  if (GET_MODE (op1) != mode1)
      op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
  
    op0 = force_reg (tmode, op0);
@@ -12359,6 +12725,7 @@ ix86_check_builtin_isa_match (unsigned int fcode,
    HOST_WIDE_INT isa2 = ix86_isa_flags2;
    HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
    HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
+  HOST_WIDE_INT tmp_isa = isa, tmp_isa2 = isa2;
    /* The general case is we require all the ISAs specified in bisa{,2}
       to be enabled.
       The exceptions are:
@@ -12367,34 +12734,38 @@ ix86_check_builtin_isa_match (unsigned int fcode,
       OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
       (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL) or
         OPTION_MASK_ISA2_AVXVNNI
+     (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL) or
+       OPTION_MASK_ISA2_AVXIFMA
+     (OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA2_AVX512BF16) or
+       OPTION_MASK_ISA2_AVXNECONVERT
+     OPTION_MASK_ISA_AES or (OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA2_VAES)
       where for each such pair it is sufficient if either of the ISAs is
       enabled, plus if it is ored with other options also those others.
       OPTION_MASK_ISA_MMX in bisa is satisfied also if TARGET_MMX_WITH_SSE.  */
-  if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
-       == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
-      && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0)
-    isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A);
-
-  if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
-       == (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
-      && (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0)
-    isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32);
  
-  if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
-       == (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
-      && (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0)
-    isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4);
-
-  if ((((bisa & (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
-       == (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
-       || (bisa2 & OPTION_MASK_ISA2_AVXVNNI) != 0)
-      && (((isa & (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
-          == (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
-         || (isa2 & OPTION_MASK_ISA2_AVXVNNI) != 0))
-    {
-      isa |= OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL;
-      isa2 |= OPTION_MASK_ISA2_AVXVNNI;
-    }
+#define SHARE_BUILTIN(A1, A2, B1, B2) \
+  if ((((bisa & (A1)) == (A1) && (bisa2 & (A2)) == (A2)) \
+       && ((bisa & (B1)) == (B1) && (bisa2 & (B2)) == (B2))) \
+      && (((isa & (A1)) == (A1) && (isa2 & (A2)) == (A2)) \
+         || ((isa & (B1)) == (B1) && (isa2 & (B2)) == (B2)))) \
+    { \
+      tmp_isa |= (A1) | (B1); \
+      tmp_isa2 |= (A2) | (B2); \
+    }
+
+  SHARE_BUILTIN (OPTION_MASK_ISA_SSE, 0, OPTION_MASK_ISA_3DNOW_A, 0);
+  SHARE_BUILTIN (OPTION_MASK_ISA_SSE4_2, 0, OPTION_MASK_ISA_CRC32, 0);
+  SHARE_BUILTIN (OPTION_MASK_ISA_FMA, 0, OPTION_MASK_ISA_FMA4, 0);
+  SHARE_BUILTIN (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, 0,
+                OPTION_MASK_ISA2_AVXVNNI);
+  SHARE_BUILTIN (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL, 0, 0,
+                OPTION_MASK_ISA2_AVXIFMA);
+  SHARE_BUILTIN (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512BF16, 0,
+                OPTION_MASK_ISA2_AVXNECONVERT);
+  SHARE_BUILTIN (OPTION_MASK_ISA_AES, 0, OPTION_MASK_ISA_AVX512VL,
+                OPTION_MASK_ISA2_VAES);
+  isa = tmp_isa;
+  isa2 = tmp_isa2;
  
    if ((bisa & OPTION_MASK_ISA_MMX) && !TARGET_MMX && TARGET_MMX_WITH_SSE
        /* __builtin_ia32_maskmovq requires MMX registers.  */
@@ -12412,6 +12783,21 @@ ix86_check_builtin_isa_match (unsigned int fcode,
    return (bisa & isa) == bisa && (bisa2 & isa2) == bisa2;
  }
  
+/* Emit instructions to set the carry flag from ARG.  If ARG is not a
+   CONST_INT, or is const0_rtx, it is converted to QImode, copied into
+   a QImode register and the addqi3_cconly_overflow pattern is emitted
+   with that register and constm1_rtx as operands.  For any other
+   (i.e. nonzero) CONST_INT the x86_stc pattern is emitted instead.  */
+
+void
+ix86_expand_carry (rtx arg)
+{
+  if (!CONST_INT_P (arg) || arg == const0_rtx)
+    {
+      arg = convert_to_mode (QImode, arg, 1);
+      arg = copy_to_mode_reg (QImode, arg);
+      emit_insn (gen_addqi3_cconly_overflow (arg, constm1_rtx));
+    }
+  else
+    emit_insn (gen_x86_stc ());
+}
+
  /* Expand an expression EXP that calls a built-in function,
     with result going to TARGET if that's convenient
     (and in mode MODE if that's convenient).
@@ -12944,6 +13330,102 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
         return target;
        }
  
+    case IX86_BUILTIN_PREFETCH:
+      {
+       arg0 = CALL_EXPR_ARG (exp, 0); // const void *
+       arg1 = CALL_EXPR_ARG (exp, 1); // const int
+       arg2 = CALL_EXPR_ARG (exp, 2); // const int
+       arg3 = CALL_EXPR_ARG (exp, 3); // const int
+
+       op0 = expand_normal (arg0);
+       op1 = expand_normal (arg1);
+       op2 = expand_normal (arg2);
+       op3 = expand_normal (arg3);
+
+       if (!CONST_INT_P (op1) || !CONST_INT_P (op2) || !CONST_INT_P (op3))
+         {
+           error ("second, third and fourth argument must be a const");
+           return const0_rtx;
+         }
+
+       if (INTVAL (op3) == 1)
+         {
+           if (INTVAL (op2) < 2 || INTVAL (op2) > 3)
+             {
+               error ("invalid third argument");
+               return const0_rtx;
+             }
+
+           if (TARGET_64BIT && TARGET_PREFETCHI
+               && local_func_symbolic_operand (op0, GET_MODE (op0)))
+             emit_insn (gen_prefetchi (op0, op2));
+           else
+             {
+               warning (0, "instruction prefetch applies when in 64-bit mode"
+                           " with RIP-relative addressing and"
+                           " option %<-mprefetchi%>;"
+                           " they stay NOPs otherwise");
+               emit_insn (gen_nop ());
+             }
+         }
+       else
+         {
+           if (!address_operand (op0, VOIDmode))
+             {
+               op0 = convert_memory_address (Pmode, op0);
+               op0 = copy_addr_to_reg (op0);
+             }
+
+           if (INTVAL (op2) < 0 || INTVAL (op2) > 3)
+             {
+               warning (0, "invalid third argument to %<__builtin_ia32_prefetch%>; using zero");
+               op2 = const0_rtx;
+             }
+
+           if (TARGET_3DNOW || TARGET_PREFETCH_SSE
+               || TARGET_PRFCHW || TARGET_PREFETCHWT1)
+             emit_insn (gen_prefetch (op0, op1, op2));
+           else if (!MEM_P (op0) && side_effects_p (op0))
+             /* Don't do anything with direct references to volatile memory,
+                but generate code to handle other side effects.  */
+             emit_insn (op0);
+         }
+
+       return 0;
+      }
+
+    case IX86_BUILTIN_PREFETCHI:
+      {
+       arg0 = CALL_EXPR_ARG (exp, 0); // const void *
+       arg1 = CALL_EXPR_ARG (exp, 1); // const int
+
+       op0 = expand_normal (arg0);
+       op1 = expand_normal (arg1);
+
+       if (!CONST_INT_P (op1))
+         {
+           error ("second argument must be a const");
+           return const0_rtx;
+         }
+
+       /* GOT/PLT_PIC should not be available for instruction prefetch.
+          It must be real instruction address.  */
+       if (TARGET_64BIT
+           && local_func_symbolic_operand (op0, GET_MODE (op0)))
+         emit_insn (gen_prefetchi (op0, op1));
+       else
+         {
+           /* Ignore the hint.  */
+           warning (0, "instruction prefetch applies when in 64-bit mode"
+                       " with RIP-relative addressing and"
+                       " option %<-mprefetchi%>;"
+                       " they stay NOPs otherwise");
+           emit_insn (gen_nop ());
+         }
+
+       return 0;
+      }
+
      case IX86_BUILTIN_VEC_INIT_V2SI:
      case IX86_BUILTIN_VEC_INIT_V4HI:
      case IX86_BUILTIN_VEC_INIT_V8QI:
@@ -13620,8 +14102,6 @@ rdseed_step:
        arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out.  */
  
        op1 = expand_normal (arg0);
-      if (!integer_zerop (arg0))
-       op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
  
        op2 = expand_normal (arg1);
        if (!register_operand (op2, mode0))
@@ -13639,7 +14119,7 @@ rdseed_step:
         }
  
        op0 = gen_reg_rtx (mode0);
-      if (integer_zerop (arg0))
+      if (op1 == const0_rtx)
         {
           /* If arg0 is 0, optimize right away into add or sub
              instruction that sets CCCmode flags.  */
@@ -13649,7 +14129,7 @@ rdseed_step:
        else
         {
           /* Generate CF from input operand.  */
-         emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
+         ix86_expand_carry (op1);
  
           /* Generate instruction that consumes CF.  */
           op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
@@ -14243,7 +14723,7 @@ rdseed_step:
                 op0 = pc_rtx;
             }
           else if (TREE_CODE (arg3) == SSA_NAME
-                  && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
+                  && VECTOR_TYPE_P (TREE_TYPE (arg3)))
             {
               /* Recognize also when mask is like:
                  __v2df src = _mm_setzero_pd ();
@@ -14906,6 +15386,10 @@ ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
    bool ok;
    rtx_insn *insn;
    rtx dup;
+  /* Save/restore recog_data in case this is called from splitters
+     or other routines where recog_data needs to stay valid across
+     force_reg.  See PR106577.  */
+  recog_data_d recog_data_save = recog_data;
  
    /* First attempt to recognize VAL as-is.  */
    dup = gen_vec_duplicate (mode, val);
@@ -14931,6 +15415,7 @@ ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
        ok = recog_memoized (insn) >= 0;
        gcc_assert (ok);
      }
+  recog_data = recog_data_save;
    return true;
  }
  
@@ -14941,7 +15426,7 @@ static machine_mode
  get_mode_wider_vector (machine_mode o)
  {
    /* ??? Rely on the ordering that genmodes.cc gives to vectors.  */
-  machine_mode n = GET_MODE_WIDER_MODE (o).require ();
+  machine_mode n = GET_MODE_NEXT_MODE (o).require ();
    gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
    gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
    return n;
@@ -15038,8 +15523,7 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
             {
               tmp1 = force_reg (GET_MODE_INNER (mode), val);
               tmp2 = gen_reg_rtx (mode);
-             emit_insn (maybe_gen_vec_set_0 (mode, tmp2,
-                                             CONST0_RTX (mode), tmp1));
+             emit_insn (gen_vec_set_0 (mode, tmp2, CONST0_RTX (mode), tmp1));
               tmp1 = gen_lowpart (mode, tmp2);
             }
           else
@@ -16034,11 +16518,12 @@ quarter:
         emit_move_insn (target, gen_lowpart (mode, words[0]));
        else if (n_words == 2)
         {
-         rtx tmp = gen_reg_rtx (mode);
-         emit_clobber (tmp);
-         emit_move_insn (gen_lowpart (tmp_mode, tmp), words[0]);
-         emit_move_insn (gen_highpart (tmp_mode, tmp), words[1]);
-         emit_move_insn (target, tmp);
+         gcc_assert (tmp_mode == DImode || tmp_mode == SImode);
+         machine_mode concat_mode = tmp_mode == DImode ? V2DImode : V2SImode;
+         rtx tmp = gen_reg_rtx (concat_mode);
+         vals = gen_rtx_PARALLEL (concat_mode, gen_rtvec_v (2, words));
+         ix86_expand_vector_init_general (mmx_ok, concat_mode, tmp, vals);
+         emit_move_insn (target, gen_lowpart (mode, tmp));
         }
        else if (n_words == 4)
         {
@@ -17000,9 +17485,9 @@ ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
                  ? gen_reg_rtx (V16HFmode)
                  : gen_reg_rtx (V16BFmode));
           if (elt < 16)
-           emit_insn (maybe_gen_vec_extract_lo (mode, tmp, vec));
+           emit_insn (gen_vec_extract_lo (mode, tmp, vec));
           else
-           emit_insn (maybe_gen_vec_extract_hi (mode, tmp, vec));
+           emit_insn (gen_vec_extract_hi (mode, tmp, vec));
           ix86_expand_vector_extract (false, target, tmp, elt & 15);
           return;
         }
@@ -17016,9 +17501,9 @@ ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
                  ? gen_reg_rtx (V8HFmode)
                  : gen_reg_rtx (V8BFmode));
           if (elt < 8)
-           emit_insn (maybe_gen_vec_extract_lo (mode, tmp, vec));
+           emit_insn (gen_vec_extract_lo (mode, tmp, vec));
           else
-           emit_insn (maybe_gen_vec_extract_hi (mode, tmp, vec));
+           emit_insn (gen_vec_extract_hi (mode, tmp, vec));
           ix86_expand_vector_extract (false, target, tmp, elt & 7);
           return;
         }
@@ -18604,9 +19089,9 @@ expand_vec_perm_movs (struct expand_vec_perm_d *d)
    if (d->one_operand_p)
      return false;
  
-  if (!(TARGET_SSE && vmode == V4SFmode)
-      && !(TARGET_MMX_WITH_SSE && vmode == V2SFmode)
-      && !(TARGET_SSE2 && vmode == V2DFmode))
+  if (!(TARGET_SSE && (vmode == V4SFmode || vmode == V4SImode))
+      && !(TARGET_MMX_WITH_SSE && (vmode == V2SFmode || vmode == V2SImode))
+      && !(TARGET_SSE2 && (vmode == V2DFmode || vmode == V2DImode)))
      return false;
  
    /* Only the first element is changed.  */
@@ -18629,6 +19114,78 @@ expand_vec_perm_movs (struct expand_vec_perm_d *d)
    return true;
  }
  
+/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
+   using insertps.  Only two-operand permutations in V4SFmode or
+   V4SImode (or V2SFmode/V2SImode with TARGET_MMX_WITH_SSE) are
+   considered, and only with TARGET_SSE4_1.  The selector must be the
+   identity on one of the two operands in all but exactly one position;
+   that position (cnt_d) receives element cnt_s of the source operand,
+   and the immediate is built as cnt_s << 6 | cnt_d << 4.
+   Return true on a match (the insn is emitted unless D->testing_p),
+   false otherwise.
+   NOTE(review): the src/dst choice keys only on whether D->perm[cnt_d]
+   is below nelt; this presumably relies on the odd element always
+   coming from the other operand -- confirm against callers.  */
+static bool
+expand_vec_perm_insertps (struct expand_vec_perm_d *d)
+{
+  machine_mode vmode = d->vmode;
+  unsigned i, cnt_s, nelt = d->nelt;
+  int cnt_d = -1;
+  rtx src, dst;
+
+  if (d->one_operand_p)
+    return false;
+
+  if (!(TARGET_SSE4_1
+       && (vmode == V4SFmode || vmode == V4SImode
+           || (TARGET_MMX_WITH_SSE
+               && (vmode == V2SFmode || vmode == V2SImode)))))
+    return false;
+
+  for (i = 0; i < nelt; ++i)
+    {
+      if (d->perm[i] == i)
+       continue;
+      if (cnt_d != -1)
+       {
+         cnt_d = -1;
+         break;
+       }
+      cnt_d = i;
+    }
+
+  if (cnt_d == -1)
+    {
+      for (i = 0; i < nelt; ++i)
+       {
+         if (d->perm[i] == i + nelt)
+           continue;
+         if (cnt_d != -1)
+           return false;
+         cnt_d = i;
+       }
+
+      if (cnt_d == -1)
+       return false;
+    }
+
+  if (d->testing_p)
+    return true;
+
+  gcc_assert (cnt_d != -1);
+
+  cnt_s = d->perm[cnt_d];
+  if (cnt_s < nelt)
+    {
+      src = d->op0;
+      dst = d->op1;
+    }
+  else
+    {
+      cnt_s -= nelt;
+      src = d->op1;
+      dst = d->op0;
+     }
+  gcc_assert (cnt_s < nelt);
+
+  rtx x = gen_sse4_1_insertps (vmode, d->target, dst, src,
+                              GEN_INT (cnt_s << 6 | cnt_d << 4));
+  emit_insn (x);
+
+  return true;
+}
+
  /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
     in terms of blendp[sd] / pblendw / pblendvb / vpblendd.  */
  
@@ -18651,9 +19208,10 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
      ;
    else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
      ;
-  else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 16
-                            || GET_MODE_SIZE (vmode) == 8
-                            || GET_MODE_SIZE (vmode) == 4))
+  else if (TARGET_SSE4_1
+          && (GET_MODE_SIZE (vmode) == 16
+              || (TARGET_MMX_WITH_SSE && GET_MODE_SIZE (vmode) == 8)
+              || GET_MODE_SIZE (vmode) == 4))
      ;
    else
      return false;
@@ -18686,6 +19244,8 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
      case E_V8SFmode:
      case E_V2DFmode:
      case E_V4SFmode:
+    case E_V2SFmode:
+    case E_V2HImode:
      case E_V4HImode:
      case E_V8HImode:
      case E_V8SImode:
@@ -18710,10 +19270,20 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
        goto do_subreg;
  
      case E_V4SImode:
-      for (i = 0; i < 4; ++i)
-       mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
-      vmode = V8HImode;
-      goto do_subreg;
+      if (TARGET_AVX2)
+       {
+         /* Use vpblendd instead of vpblendw.  */
+         for (i = 0; i < nelt; ++i)
+           mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
+         break;
+       }
+      else
+       {
+         for (i = 0; i < 4; ++i)
+           mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
+         vmode = V8HImode;
+         goto do_subreg;
+       }
  
      case E_V16QImode:
        /* See if bytes move in pairs so we can use pblendw with
@@ -18872,6 +19442,23 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
        mmode = VOIDmode;
      }
  
+  /* Canonicalize vec_merge.  */
+  if (swap_commutative_operands_p (op1, op0)
+      /* Two operands have same precedence, then
+        first bit of mask select first operand.  */
+      || (!swap_commutative_operands_p (op0, op1)
+         && !(mask & 1)))
+    {
+      unsigned n_elts = GET_MODE_NUNITS (vmode);
+      std::swap (op0, op1);
+      unsigned HOST_WIDE_INT mask_all = HOST_WIDE_INT_1U;
+      if (n_elts == HOST_BITS_PER_WIDE_INT)
+       mask_all  = -1;
+      else
+       mask_all = (HOST_WIDE_INT_1U << n_elts) - 1;
+      mask = ~mask & mask_all;
+    }
+
    if (mmode != VOIDmode)
      maskop = force_reg (mmode, gen_int_mode (mask, mmode));
    else
@@ -19541,11 +20128,19 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
         }
      }
  
+  /* Try the SSE4.1 blend variable merge instructions.  */
+  if (expand_vec_perm_blend (d))
+    return true;
+
    /* Try movss/movsd instructions.  */
    if (expand_vec_perm_movs (d))
      return true;
  
-  /* Finally, try the fully general two operand permute.  */
+  /* Try the SSE4.1 insertps instruction.  */
+  if (expand_vec_perm_insertps (d))
+    return true;
+
+  /* Try the fully general two operand permute.  */
    if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
                               d->testing_p))
      return true;
@@ -19568,10 +20163,6 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
         return true;
      }
  
-  /* Try the SSE4.1 blend variable merge instructions.  */
-  if (expand_vec_perm_blend (d))
-    return true;
-
    /* Try one of the AVX vpermil variable permutations.  */
    if (expand_vec_perm_vpermil (d))
      return true;
@@ -19604,6 +20195,136 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
    return false;
  }
  
+/* Canonicalize the vec_perm selector of D so that its first index
+   always comes from the first vector.  Nothing is done when
+   D->perm[0] is already below D->nelt.  Otherwise nelt is added to
+   each of the first nelt indices modulo 2 * nelt and D->op0 and D->op1
+   are swapped, so every index keeps naming the same element.  */
+static void
+ix86_vec_perm_index_canon (struct expand_vec_perm_d *d)
+{
+  unsigned nelt = d->nelt;
+  if (d->perm[0] < nelt)
+    return;
+
+  for (unsigned i = 0; i != nelt; i++)
+    d->perm[i] = (d->perm[i] + nelt) % (2 * nelt);
+
+  std::swap (d->op0, d->op1);
+  return;
+}
+
+/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
+   in terms of a pair of shufps + shufps/pshufd instructions.  Only
+   two-operand V4SImode and V4SFmode permutations are accepted; false
+   is returned for anything else.  When not merely testing, D->perm,
+   D->op0 and D->op1 may be rewritten (canonicalized and swapped)
+   before the two shuffles are emitted through a fresh temporary.  */
+static bool
+expand_vec_perm_shufps_shufps (struct expand_vec_perm_d *d)
+{
+  unsigned char perm1[4];
+  machine_mode vmode = d->vmode;
+  bool ok;
+  unsigned i, j, k, count = 0;
+
+  if (d->one_operand_p
+      || (vmode != V4SImode && vmode != V4SFmode))
+    return false;
+
+  if (d->testing_p)
+    return true;
+
+  ix86_vec_perm_index_canon (d);
+  for (i = 0; i < 4; ++i)
+    count += d->perm[i] > 3 ? 1 : 0;
+
+  gcc_assert (count & 3); /* Rules out a count of 0 or 4.  */
+
+  rtx tmp = gen_reg_rtx (vmode);
+  /* 2 from op0 and 2 from op1.  */
+  if (count == 2)
+    {
+      unsigned char perm2[4];
+      for (i = 0, j = 0, k = 2; i < 4; ++i)
+       if (d->perm[i] & 4)
+         {
+           perm1[k++] = d->perm[i];
+           perm2[i] = k - 1;
+         }
+       else
+         {
+           perm1[j++] = d->perm[i];
+           perm2[i] = j - 1;
+         }
+
+      /* shufps.  */
+      ok = expand_vselect_vconcat (tmp, d->op0, d->op1,
+                                 perm1, d->nelt, false);
+      gcc_assert (ok);
+      if (vmode == V4SImode && TARGET_SSE2)
+      /* pshufd.  */
+       ok = expand_vselect (d->target, tmp,
+                            perm2, d->nelt, false);
+      else
+       {
+         /* shufps.  */
+         perm2[2] += 4;
+         perm2[3] += 4;
+         ok = expand_vselect_vconcat (d->target, tmp, tmp,
+                                      perm2, d->nelt, false);
+       }
+      gcc_assert (ok);
+    }
+  /* 3 from one op and 1 from another.  */
+  else
+    {
+      unsigned pair_idx = 8, lone_idx = 8, shift;
+
+      /* Find the lone index.  */
+      for (i = 0; i < 4; ++i)
+       if ((d->perm[i] > 3 && count == 1)
+           || (d->perm[i] < 4 && count == 3))
+         lone_idx = i;
+
+      /* When lone_idx is not 0, it must come from the second op
+        (count == 1).  */
+      gcc_assert (count == (lone_idx ? 1 : 3));
+
+      /* Find the pair index that sits in the same half as the lone index.  */
+      shift = lone_idx & 2;
+      pair_idx = 1 - lone_idx + 2 * shift;
+
+      /* First permute the lone index and the pair index into the same
+        vector as [ lone, lone, pair, pair ].  */
+      perm1[1] = perm1[0]
+       = (count == 3) ? d->perm[lone_idx] : d->perm[lone_idx] - 4;
+      perm1[3] = perm1[2]
+       = (count == 3) ? d->perm[pair_idx] : d->perm[pair_idx] + 4;
+
+      /* Always put the vector that contains the lone index first.  */
+      if (count == 1)
+       std::swap (d->op0, d->op1);
+
+      /* shufps.  */
+      ok = expand_vselect_vconcat (tmp, d->op0, d->op1,
+                                  perm1, d->nelt, false);
+      gcc_assert (ok);
+
+      /* Refine lone and pair index to original order.  */
+      perm1[shift] = lone_idx << 1;
+      perm1[shift + 1] = pair_idx << 1;
+
+      /* Select the remaining 2 elements in another vector.  */
+      for (i = 2 - shift; i < 4 - shift; ++i)
+       perm1[i] = lone_idx == 1 ? d->perm[i] + 4 : d->perm[i];
+
+      /* Adjust to original selector.  */
+      if (lone_idx > 1)
+       std::swap (tmp, d->op1);
+
+      /* shufps.  */
+      ok = expand_vselect_vconcat (d->target, tmp, d->op1,
+                                  perm1, d->nelt, false);
+
+      gcc_assert (ok);
+    }
+
+  return true;
+}
+
  /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
     in terms of a pair of pshuflw + pshufhw instructions.  */
  
@@ -19799,9 +20520,10 @@ expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
      ;
    else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
      ;
-  else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 4
-                            || GET_MODE_SIZE (vmode) == 8
-                            || GET_MODE_SIZE (vmode) == 16))
+  else if (TARGET_SSE4_1
+          && (GET_MODE_SIZE (vmode) == 16
+              || (TARGET_MMX_WITH_SSE && GET_MODE_SIZE (vmode) == 8)
+              || GET_MODE_SIZE (vmode) == 4))
      ;
    else
      return false;
@@ -20665,9 +21387,10 @@ expand_vec_perm_2perm_pblendv (struct expand_vec_perm_d *d, bool two_insn)
      ;
    else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
      ;
-  else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 16
-                            || GET_MODE_SIZE (vmode) == 8
-                            || GET_MODE_SIZE (vmode) == 4))
+  else if (TARGET_SSE4_1
+          && (GET_MODE_SIZE (vmode) == 16
+              || (TARGET_MMX_WITH_SSE && GET_MODE_SIZE (vmode) == 8)
+              || GET_MODE_SIZE (vmode) == 4))
      ;
    else
      return false;
@@ -21861,18 +22584,18 @@ expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
        if (d->testing_p)
         return true;
  
-      rtx (*maybe_gen) (machine_mode, int, rtx, rtx, rtx);
+      rtx (*gen_interleave) (machine_mode, int, rtx, rtx, rtx);
        if (elt >= nelt2)
         {
-         maybe_gen = maybe_gen_vec_interleave_high;
+         gen_interleave = gen_vec_interleave_high;
           elt -= nelt2;
         }
        else
-       maybe_gen = maybe_gen_vec_interleave_low;
+       gen_interleave = gen_vec_interleave_low;
        nelt2 /= 2;
  
        dest = gen_reg_rtx (vmode);
-      emit_insn (maybe_gen (vmode, 1, dest, op0, op0));
+      emit_insn (gen_interleave (vmode, 1, dest, op0, op0));
  
        vmode = V4SImode;
        op0 = gen_lowpart (vmode, dest);
@@ -22152,6 +22875,9 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
    if (expand_vec_perm_2perm_pblendv (d, true))
      return true;
  
+  if (expand_vec_perm_shufps_shufps (d))
+    return true;
+
    /* Try sequences of three instructions.  */
  
    if (expand_vec_perm_even_odd_pack (d))
@@ -22548,74 +23274,6 @@ ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
    gcc_assert (ok);
  }
  
-/* This function is similar as ix86_expand_vecop_qihi,
-   but optimized under AVX512BW by using vpmovwb.
-   For example, optimize vector MUL generation like
-
-   vpmovzxbw ymm2, xmm0
-   vpmovzxbw ymm3, xmm1
-   vpmullw   ymm4, ymm2, ymm3
-   vpmovwb   xmm0, ymm4
-
-   it would take less instructions than ix86_expand_vecop_qihi.
-   Return true if success.  */
-
-static bool
-ix86_expand_vecop_qihi2 (enum rtx_code code, rtx dest, rtx op1, rtx op2)
-{
-  machine_mode himode, qimode = GET_MODE (dest);
-  rtx hop1, hop2, hdest;
-  rtx (*gen_extend)(rtx, rtx);
-  rtx (*gen_truncate)(rtx, rtx);
-  bool uns_p = (code == ASHIFTRT) ? false : true;
-
-  /* There's no V64HImode multiplication instruction.  */
-  if (qimode == E_V64QImode)
-    return false;
-
-  /* vpmovwb only available under AVX512BW.  */
-  if (!TARGET_AVX512BW)
-    return false;
-  if ((qimode == V8QImode || qimode == V16QImode)
-      && !TARGET_AVX512VL)
-    return false;
-  /* Not generate zmm instruction when prefer 128/256 bit vector width.  */
-  if (qimode == V32QImode
-      && (TARGET_PREFER_AVX128 || TARGET_PREFER_AVX256))
-    return false;
-
-  switch (qimode)
-    {
-    case E_V8QImode:
-      himode = V8HImode;
-      gen_extend = uns_p ? gen_zero_extendv8qiv8hi2 : gen_extendv8qiv8hi2;
-      gen_truncate = gen_truncv8hiv8qi2;
-      break;
-    case E_V16QImode:
-      himode = V16HImode;
-      gen_extend = uns_p ? gen_zero_extendv16qiv16hi2 : gen_extendv16qiv16hi2;
-      gen_truncate = gen_truncv16hiv16qi2;
-      break;
-    case E_V32QImode:
-      himode = V32HImode;
-      gen_extend = uns_p ? gen_zero_extendv32qiv32hi2 : gen_extendv32qiv32hi2;
-      gen_truncate = gen_truncv32hiv32qi2;
-      break;
-    default:
-      gcc_unreachable ();
-    }
-
-  hop1 = gen_reg_rtx (himode);
-  hop2 = gen_reg_rtx (himode);
-  hdest = gen_reg_rtx (himode);
-  emit_insn (gen_extend (hop1, op1));
-  emit_insn (gen_extend (hop2, op2));
-  emit_insn (gen_rtx_SET (hdest, simplify_gen_binary (code, himode,
-                                                     hop1, hop2)));
-  emit_insn (gen_truncate (dest, hdest));
-  return true;
-}
-
  /* Expand a vector operation shift by constant for a V*QImode in terms of the
     same operation on V*HImode. Return true if success. */
  static bool
@@ -22716,6 +23374,234 @@ ix86_expand_vec_shift_qihi_constant (enum rtx_code code,
    return true;
  }
  
+void
+ix86_expand_vecop_qihi_partial (enum rtx_code code, rtx dest, rtx op1, rtx op2)
+{ /* Expand vector operation CODE (MULT, ASHIFT, ASHIFTRT or LSHIFTRT;
+     any other code reaches gcc_unreachable) for a DEST in V4QImode or
+     V8QImode.  OP1, and OP2 when it has a vector integer mode, are
+     forced into registers and viewed as V16QImode; the bytes are
+     spread into 16-bit lanes (unpacked with uns_p false only for
+     ASHIFTRT), CODE is performed in V8HImode, and the result is
+     narrowed again before its low part is moved into DEST.  A
+     CONST_INT shift count is first offered to
+     ix86_expand_vec_shift_qihi_constant.  */
+  machine_mode qimode = GET_MODE (dest);
+  rtx qop1, qop2, hop1, hop2, qdest, hdest;
+  bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
+  bool uns_p = code != ASHIFTRT;
+
+  switch (qimode)
+    {
+    case E_V4QImode:
+    case E_V8QImode:
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  qop1 = lowpart_subreg (V16QImode, force_reg (qimode, op1), qimode);
+
+  if (op2vec)
+    qop2 = lowpart_subreg (V16QImode, force_reg (qimode, op2), qimode);
+  else
+    qop2 = op2;
+
+  qdest = gen_reg_rtx (V16QImode);
+
+  if (CONST_INT_P (op2)
+      && (code == ASHIFT || code == LSHIFTRT || code == ASHIFTRT)
+      && ix86_expand_vec_shift_qihi_constant (code, qdest, qop1, qop2))
+    {
+      emit_move_insn (dest, gen_lowpart (qimode, qdest));
+      return;
+    }
+
+  switch (code)
+    {
+    case MULT:
+      gcc_assert (op2vec);
+      if (!TARGET_SSE4_1)
+       {
+         /* Unpack data such that we've got a source byte in each low byte
+            of each word.  We don't care what goes into the high byte of
+            each word.  Rather than trying to get zero in there, most
+            convenient is to let it be a copy of the low byte.  */
+         hop1 = copy_to_reg (qop1);
+         hop2 = copy_to_reg (qop2);
+         emit_insn (gen_vec_interleave_lowv16qi (hop1, hop1, hop1));
+         emit_insn (gen_vec_interleave_lowv16qi (hop2, hop2, hop2));
+         break;
+       }
+      /* FALLTHRU */
+    case ASHIFT:
+    case ASHIFTRT:
+    case LSHIFTRT:
+      hop1 = gen_reg_rtx (V8HImode);
+      ix86_expand_sse_unpack (hop1, qop1, uns_p, false);
+      /* mult/vashr/vlshr/vashl  */
+      if (op2vec)
+       {
+         hop2 = gen_reg_rtx (V8HImode);
+         ix86_expand_sse_unpack (hop2, qop2, uns_p, false);
+       }
+      else
+       hop2 = qop2;
+
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  if (code != MULT && op2vec)
+    {
+      /* Expand vashr/vlshr/vashl.  */
+      hdest = gen_reg_rtx (V8HImode);
+      emit_insn (gen_rtx_SET (hdest,
+                             simplify_gen_binary (code, V8HImode,
+                                                  hop1, hop2)));
+    }
+  else
+    /* Expand mult/ashr/lshr/ashl.  */
+    hdest = expand_simple_binop (V8HImode, code, hop1, hop2,
+                               NULL_RTX, 1, OPTAB_DIRECT);
+
+  if (TARGET_AVX512BW && TARGET_AVX512VL)
+    {
+      if (qimode == V8QImode)
+       qdest = dest;
+      else
+       qdest = gen_reg_rtx (V8QImode);
+
+      emit_insn (gen_truncv8hiv8qi2 (qdest, hdest));
+    }
+  else
+    {
+      struct expand_vec_perm_d d;
+      rtx qres = gen_lowpart (V16QImode, hdest);
+      bool ok;
+      int i;
+
+      /* Merge the data back into the right place: select every
+        even-numbered byte of the V16QImode view of the result.  */
+      d.target = qdest;
+      d.op0 = d.op1 = qres;
+      d.vmode = V16QImode;
+      d.nelt = 16;
+      d.one_operand_p = false;
+      d.testing_p = false;
+
+      for (i = 0; i < d.nelt; ++i)
+       d.perm[i] = i * 2;
+
+      ok = ix86_expand_vec_perm_const_1 (&d);
+      gcc_assert (ok);
+    }
+
+  if (qdest != dest)
+    emit_move_insn (dest, gen_lowpart (qimode, qdest));
+}
+
+/* Emit instruction in 2x wider mode.  For example, optimize
+   vector MUL generation like
+
+   vpmovzxbw ymm2, xmm0
+   vpmovzxbw ymm3, xmm1
+   vpmullw   ymm4, ymm2, ymm3
+   vpmovwb   xmm0, ymm4
+
+   it would take fewer instructions than ix86_expand_vecop_qihi.
+   Only a V16QImode DEST (needs TARGET_AVX2, rejected when
+   TARGET_PREFER_AVX128) and a V32QImode DEST (needs TARGET_AVX512BW,
+   rejected when TARGET_PREFER_AVX256) are expanded; V64QImode is
+   always rejected and any other mode reaches gcc_unreachable.
+   Return true if success, false if nothing was emitted and the
+   operation has to be expanded some other way.  */
+
+static bool
+ix86_expand_vecop_qihi2 (enum rtx_code code, rtx dest, rtx op1, rtx op2)
+{
+  machine_mode himode, qimode = GET_MODE (dest);
+  machine_mode wqimode;
+  rtx qop1, qop2, hop1, hop2, hdest;
+  rtx (*gen_truncate)(rtx, rtx) = NULL;
+  bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
+  bool uns_p = code != ASHIFTRT;
+
+  if ((qimode == V16QImode && !TARGET_AVX2)
+      || (qimode == V32QImode && !TARGET_AVX512BW)
+      /* There are no V64HImode instructions.  */
+      || qimode == V64QImode)
+     return false;
+
+  /* Do not generate ymm/zmm instructions when
+     target prefers 128/256 bit vector width.  */
+  if ((qimode == V16QImode && TARGET_PREFER_AVX128)
+      || (qimode == V32QImode && TARGET_PREFER_AVX256))
+    return false;
+
+  switch (qimode)
+    {
+    case E_V16QImode:
+      himode = V16HImode;
+      if (TARGET_AVX512VL && TARGET_AVX512BW)
+       gen_truncate = gen_truncv16hiv16qi2;
+      break;
+    case E_V32QImode:
+      himode = V32HImode;
+      gen_truncate = gen_truncv32hiv32qi2;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  wqimode = GET_MODE_2XWIDER_MODE (qimode).require ();
+  qop1 = lowpart_subreg (wqimode, force_reg (qimode, op1), qimode);
+
+  if (op2vec)
+    qop2 = lowpart_subreg (wqimode, force_reg (qimode, op2), qimode);
+  else
+    qop2 = op2;
+
+  hop1 = gen_reg_rtx (himode);
+  ix86_expand_sse_unpack (hop1, qop1, uns_p, false);
+
+  if (op2vec)
+    {
+      hop2 = gen_reg_rtx (himode);
+      ix86_expand_sse_unpack (hop2, qop2, uns_p, false);
+    }
+  else
+    hop2 = qop2;
+
+  if (code != MULT && op2vec)
+    {
+      /* Expand vashr/vlshr/vashl.  */
+      hdest = gen_reg_rtx (himode);
+      emit_insn (gen_rtx_SET (hdest,
+                             simplify_gen_binary (code, himode,
+                                                  hop1, hop2)));
+    }
+  else
+    /* Expand mult/ashr/lshr/ashl.  */
+    hdest = expand_simple_binop (himode, code, hop1, hop2,
+                                NULL_RTX, 1, OPTAB_DIRECT);
+
+  if (gen_truncate)
+    emit_insn (gen_truncate (dest, hdest));
+  else
+    {
+      struct expand_vec_perm_d d;
+      rtx wqdest = gen_reg_rtx (wqimode);
+      rtx wqres = gen_lowpart (wqimode, hdest);
+      bool ok;
+      int i;
+
+      /* Merge the data back into the right place: without a truncate
+        pattern, select every even-numbered byte of the wide result
+        and take the low part of that.  */
+      d.target = wqdest;
+      d.op0 = d.op1 = wqres;
+      d.vmode = wqimode;
+      d.nelt = GET_MODE_NUNITS (wqimode);
+      d.one_operand_p = false;
+      d.testing_p = false;
+
+      for (i = 0; i < d.nelt; ++i)
+       d.perm[i] = i * 2;
+
+      ok = ix86_expand_vec_perm_const_1 (&d);
+      gcc_assert (ok);
+
+      emit_move_insn (dest, gen_lowpart (qimode, wqdest));
+    }
+
+  return true;
+}
+
  /* Expand a vector operation CODE for a V*QImode in terms of the
     same operation on V*HImode.  */
  
@@ -22727,9 +23613,11 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
    rtx (*gen_il) (rtx, rtx, rtx);
    rtx (*gen_ih) (rtx, rtx, rtx);
    rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
+  bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
    struct expand_vec_perm_d d;
-  bool ok, full_interleave;
-  bool uns_p = false;
+  bool full_interleave = true;
+  bool uns_p = code != ASHIFTRT;
+  bool ok;
    int i;
  
    if (CONST_INT_P (op2)
@@ -22737,27 +23625,19 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
        && ix86_expand_vec_shift_qihi_constant (code, dest, op1, op2))
      return;
  
-  if (TARGET_AVX512BW
-      && VECTOR_MODE_P (GET_MODE (op2))
-      && ix86_expand_vecop_qihi2 (code, dest, op1, op2))
+  if (ix86_expand_vecop_qihi2 (code, dest, op1, op2))
      return;
  
    switch (qimode)
      {
      case E_V16QImode:
        himode = V8HImode;
-      gen_il = gen_vec_interleave_lowv16qi;
-      gen_ih = gen_vec_interleave_highv16qi;
        break;
      case E_V32QImode:
        himode = V16HImode;
-      gen_il = gen_avx2_interleave_lowv32qi;
-      gen_ih = gen_avx2_interleave_highv32qi;
        break;
      case E_V64QImode:
        himode = V32HImode;
-      gen_il = gen_avx512bw_interleave_lowv64qi;
-      gen_ih = gen_avx512bw_interleave_highv64qi;
        break;
      default:
        gcc_unreachable ();
@@ -22766,10 +23646,31 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
    switch (code)
      {
      case MULT:
+      gcc_assert (op2vec);
        /* Unpack data such that we've got a source byte in each low byte of
          each word.  We don't care what goes into the high byte of each word.
          Rather than trying to get zero in there, most convenient is to let
          it be a copy of the low byte.  */
+      switch (qimode)
+       {
+       case E_V16QImode:
+         gen_il = gen_vec_interleave_lowv16qi;
+         gen_ih = gen_vec_interleave_highv16qi;
+         break;
+       case E_V32QImode:
+         gen_il = gen_avx2_interleave_lowv32qi;
+         gen_ih = gen_avx2_interleave_highv32qi;
+         full_interleave = false;
+         break;
+       case E_V64QImode:
+         gen_il = gen_avx512bw_interleave_lowv64qi;
+         gen_ih = gen_avx512bw_interleave_highv64qi;
+         full_interleave = false;
+         break;
+       default:
+         gcc_unreachable ();
+       }
+
        op2_l = gen_reg_rtx (qimode);
        op2_h = gen_reg_rtx (qimode);
        emit_insn (gen_il (op2_l, op2, op2));
@@ -22779,20 +23680,17 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
        op1_h = gen_reg_rtx (qimode);
        emit_insn (gen_il (op1_l, op1, op1));
        emit_insn (gen_ih (op1_h, op1, op1));
-      full_interleave = qimode == V16QImode;
        break;
  
      case ASHIFT:
-    case LSHIFTRT:
-      uns_p = true;
-      /* FALLTHRU */
      case ASHIFTRT:
+    case LSHIFTRT:
        op1_l = gen_reg_rtx (himode);
        op1_h = gen_reg_rtx (himode);
        ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
        ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
        /* vashr/vlshr/vashl  */
-      if (GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT)
+      if (op2vec)
         {
           rtx tmp = force_reg (qimode, op2);
           op2_l = gen_reg_rtx (himode);
@@ -22803,16 +23701,14 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
        else
         op2_l = op2_h = op2;
  
-      full_interleave = true;
        break;
      default:
        gcc_unreachable ();
      }
  
-  /* Perform vashr/vlshr/vashl.  */
-  if (code != MULT
-      && GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT)
+  if (code != MULT && op2vec)
      {
+      /* Expand vashr/vlshr/vashl.  */
        res_l = gen_reg_rtx (himode);
        res_h = gen_reg_rtx (himode);
        emit_insn (gen_rtx_SET (res_l,
@@ -22822,9 +23718,9 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
                               simplify_gen_binary (code, himode,
                                                    op1_h, op2_h)));
      }
-  /* Performance mult/ashr/lshr/ashl.  */
    else
      {
+      /* Expand mult/ashr/lshr/ashl.  */
        res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
                                    1, OPTAB_DIRECT);
        res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
@@ -22844,7 +23740,7 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
  
    if (full_interleave)
      {
-      /* For SSE2, we used an full interleave, so the desired
+      /* We used the full interleave, so the desired
          results are in the even elements.  */
        for (i = 0; i < d.nelt; ++i)
         d.perm[i] = i * 2;
@@ -22868,9 +23764,6 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
  
    ok = ix86_expand_vec_perm_const_1 (&d);
    gcc_assert (ok);
-
-  set_unique_reg_note (get_last_insn (), REG_EQUAL,
-                      gen_rtx_fmt_ee (code, qimode, op1, op2));
  }
  
  /* Helper function of ix86_expand_mul_widen_evenodd.  Return true
@@ -23752,4 +24645,29 @@ ix86_expand_cmpxchg_loop (rtx *ptarget_bool, rtx target_val,
    *ptarget_bool = target_bool;
  }
  
+/* Extend the BFmode value VAL to SFmode without signaling on sNaNs.  The
+   result is the SFmode subreg of the HImode bits of VAL shifted left by 16.  */
+
+rtx
+ix86_expand_fast_convert_bf_to_sf (rtx val)
+{
+  rtx bits = gen_lowpart (HImode, val);
+  if (!CONST_INT_P (bits))
+    {
+      rtx sf = gen_reg_rtx (SFmode);
+      emit_insn (gen_extendbfsf2_1 (sf, force_reg (BFmode, val)));
+      return sf;
+    }
+
+  rtx folded
+    = simplify_const_unary_operation (FLOAT_EXTEND, SFmode, val, BFmode);
+  if (folded)
+    return folded;
+  /* Folding FLOAT_EXTEND fails when VAL is a sNaN; shift the bits by hand.  */
+  rtx si = gen_reg_rtx (SImode);
+  emit_move_insn (si, GEN_INT (INTVAL (bits) & 0xffff));
+  emit_insn (gen_ashlsi3 (si, si, GEN_INT (16)));
+  return gen_lowpart (SFmode, si);
+}
+
  #include "gt-i386-expand.h"