]> git.ipfire.org Git - thirdparty/gcc.git/blobdiff - gcc/config/i386/sse.md
* doc/md.texi: Document vec_shl_<mode> pattern.
[thirdparty/gcc.git] / gcc / config / i386 / sse.md
index 5020c058d648c95488c3a66d086997a5da794f86..26309aef5c951a7abcf6842dd8b82f424b1f027d 100644 (file)
@@ -1,5 +1,5 @@
 ;; GCC machine description for SSE instructions
-;; Copyright (C) 2005-2018 Free Software Foundation, Inc.
+;; Copyright (C) 2005-2019 Free Software Foundation, Inc.
 ;;
 ;; This file is part of GCC.
 ;;
@@ -21,6 +21,9 @@
   ;; SSE
   UNSPEC_MOVNT
 
+  ;; SSE2
+  UNSPEC_MOVDI_TO_SSE
+
   ;; SSE3
   UNSPEC_LDDQU
 
 
   ;; For AVX512BITALG support
   UNSPEC_VPSHUFBIT
+
+  ;; For AVX512BF16 support
+  UNSPEC_VCVTNE2PS2BF16
+  UNSPEC_VCVTNEPS2BF16
+  UNSPEC_VDPBF16PS
 ])
 
 (define_c_enum "unspecv" [
 (define_mode_iterator VF1
   [(V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF])
 
+(define_mode_iterator VF1_AVX2
+  [(V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX2") V4SF])
+
 ;; 128- and 256-bit SF vector modes
 (define_mode_iterator VF1_128_256
   [(V8SF "TARGET_AVX") V4SF])
 (define_mode_attr ssequarterinsnmode
   [(V16SF "V4SF") (V8DF "V2DF") (V16SI "TI") (V8DI "TI")])
 
+(define_mode_attr vecmemsuffix
+  [(V16SF "{z}") (V8SF "{y}") (V4SF "{x}")
+   (V8DF "{z}") (V4DF "{y}") (V2DF "{x}")])
+
 (define_mode_attr ssedoublemodelower
   [(V16QI "v16hi") (V32QI "v32hi") (V64QI "v64hi")
    (V8HI "v8si")   (V16HI "v16si") (V32HI "v32si")
   [(V4SF "V8SF") (V8SF "V16SF") (V16SF "V32SF")
    (V2DF "V4DF") (V4DF "V8DF") (V8DF "V16DF")
    (V16QI "V16HI") (V32QI "V32HI") (V64QI "V64HI")
-   (V4HI "V4SI") (V8HI "V8SI") (V16HI "V16SI") (V32HI "V32SI")
+   (V8HI "V8SI") (V16HI "V16SI") (V32HI "V32SI")
    (V4SI "V4DI") (V8SI "V16SI") (V16SI "V32SI")
    (V4DI "V8DI") (V8DI "V16DI")])
 
 (define_mode_attr ssebytemode
-  [(V8DI "V64QI") (V4DI "V32QI") (V2DI "V16QI")])
+  [(V8DI "V64QI") (V4DI "V32QI") (V2DI "V16QI")
+   (V16SI "V64QI") (V8SI "V32QI") (V4SI "V16QI")])
 
 ;; All 128bit vector integer modes
 (define_mode_iterator VI_128 [V16QI V8HI V4SI V2DI])
    (V16SF "hi") (V8SF  "qi") (V4SF  "qi")
    (V8DF  "qi") (V4DF  "qi") (V2DF  "qi")])
 
+;; Mapping of vector modes to corresponding mask half size
+(define_mode_attr avx512fmaskhalfmode
+  [(V64QI "SI") (V32QI "HI") (V16QI "QI")
+   (V32HI "HI") (V16HI "QI") (V8HI  "QI") (V4HI "QI")
+   (V16SI "QI") (V8SI  "QI") (V4SI  "QI")
+   (V8DI  "QI") (V4DI  "QI") (V2DI  "QI")
+   (V16SF "QI") (V8SF  "QI") (V4SF  "QI")
+   (V8DF  "QI") (V4DF  "QI") (V2DF  "QI")])
+
 ;; Mapping of vector float modes to an integer mode of the same size
 (define_mode_attr sseintvecmode
   [(V16SF "V16SI") (V8DF  "V8DI")
    (set_attr "memory" "none,load")
    (set_attr "mode" "<sseinsnmode>")])
 
+(define_insn "avx512f_mov<ssescalarmodelower>_mask"
+  [(set (match_operand:VF_128 0 "register_operand" "=v")
+       (vec_merge:VF_128
+         (vec_merge:VF_128
+           (match_operand:VF_128 2 "register_operand" "v")
+           (match_operand:VF_128 3 "nonimm_or_0_operand" "0C")
+           (match_operand:QI 4 "register_operand" "Yk"))
+         (match_operand:VF_128 1 "register_operand" "v")
+         (const_int 1)))]
+  "TARGET_AVX512F"
+  "vmov<ssescalarmodesuffix>\t{%2, %1, %0%{%4%}%N3|%0%{%4%}%N3, %1, %2}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<ssescalarmode>")])
+
+(define_expand "avx512f_load<mode>_mask"
+  [(set (match_operand:<ssevecmode> 0 "register_operand")
+       (vec_merge:<ssevecmode>
+         (vec_merge:<ssevecmode>
+           (vec_duplicate:<ssevecmode>
+             (match_operand:MODEF 1 "memory_operand"))
+           (match_operand:<ssevecmode> 2 "nonimm_or_0_operand")
+           (match_operand:QI 3 "register_operand"))
+         (match_dup 4)
+         (const_int 1)))]
+  "TARGET_AVX512F"
+  "operands[4] = CONST0_RTX (<ssevecmode>mode);")
+
+(define_insn "*avx512f_load<mode>_mask"
+  [(set (match_operand:<ssevecmode> 0 "register_operand" "=v")
+       (vec_merge:<ssevecmode>
+         (vec_merge:<ssevecmode>
+           (vec_duplicate:<ssevecmode>
+             (match_operand:MODEF 1 "memory_operand" "m"))
+           (match_operand:<ssevecmode> 2 "nonimm_or_0_operand" "0C")
+           (match_operand:QI 3 "register_operand" "Yk"))
+         (match_operand:<ssevecmode> 4 "const0_operand" "C")
+         (const_int 1)))]
+  "TARGET_AVX512F"
+  "vmov<ssescalarmodesuffix>\t{%1, %0%{%3%}%N2|%0%{3%}%N2, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "evex")
+   (set_attr "memory" "load")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "avx512f_store<mode>_mask"
+  [(set (match_operand:MODEF 0 "memory_operand" "=m")
+       (if_then_else:MODEF
+         (and:QI (match_operand:QI 2 "register_operand" "Yk")
+                (const_int 1))
+         (vec_select:MODEF
+           (match_operand:<ssevecmode> 1 "register_operand" "v")
+           (parallel [(const_int 0)]))
+         (match_dup 0)))]
+  "TARGET_AVX512F"
+  "vmov<ssescalarmodesuffix>\t{%1, %0%{%2%}|%0%{%2%}, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "evex")
+   (set_attr "memory" "store")
+   (set_attr "mode" "<MODE>")])
+
 (define_insn "<avx512>_blendm<mode>"
   [(set (match_operand:V48_AVX512VL 0 "register_operand" "=v")
        (vec_merge:V48_AVX512VL
 ;; from there.
 
 (define_insn_and_split "movdi_to_sse"
-  [(parallel
-    [(set (match_operand:V4SI 0 "register_operand" "=?x,x")
-         (subreg:V4SI (match_operand:DI 1 "nonimmediate_operand" "r,m") 0))
-     (clobber (match_scratch:V4SI 2 "=&x,X"))])]
+  [(set (match_operand:V4SI 0 "register_operand" "=x,x,?x")
+       (unspec:V4SI [(match_operand:DI 1 "nonimmediate_operand" "r,m,r")]
+                    UNSPEC_MOVDI_TO_SSE))
+     (clobber (match_scratch:V4SI 2 "=X,X,&x"))]
   "!TARGET_64BIT && TARGET_SSE2 && TARGET_INTER_UNIT_MOVES_TO_VEC"
   "#"
   "&& reload_completed"
         Assemble the 64-bit DImode value in an xmm register.  */
       emit_insn (gen_sse2_loadld (operands[0], CONST0_RTX (V4SImode),
                                  gen_lowpart (SImode, operands[1])));
-      emit_insn (gen_sse2_loadld (operands[2], CONST0_RTX (V4SImode),
-                                 gen_highpart (SImode, operands[1])));
-      emit_insn (gen_vec_interleave_lowv4si (operands[0], operands[0],
-                                            operands[2]));
-   }
+      if (TARGET_SSE4_1)
+        emit_insn (gen_sse4_1_pinsrd (operands[0], operands[0],
+                                     gen_highpart (SImode, operands[1]),
+                                     GEN_INT (2)));
+      else
+       {
+         emit_insn (gen_sse2_loadld (operands[2], CONST0_RTX (V4SImode),
+                                     gen_highpart (SImode, operands[1])));
+         emit_insn (gen_vec_interleave_lowv4si (operands[0], operands[0],
+                                                operands[2]));
+       }
+  }
  else if (memory_operand (operands[1], DImode))
    emit_insn (gen_vec_concatv2di (gen_lowpart (V2DImode, operands[0]),
                                  operands[1], const0_rtx));
  else
    gcc_unreachable ();
  DONE;
-})
+}
+  [(set_attr "isa" "sse4,*,*")])
 
 (define_split
   [(set (match_operand:V4SF 0 "register_operand")
   "TARGET_SSE"
   "ix86_expand_fp_absneg_operator (<CODE>, <MODE>mode, operands); DONE;")
 
-(define_insn_and_split "*absneg<mode>2"
+(define_insn_and_split "*<code><mode>2"
   [(set (match_operand:VF 0 "register_operand" "=x,x,v,v")
-       (match_operator:VF 3 "absneg_operator"
-         [(match_operand:VF 1 "vector_operand" "0,  xBm,v, m")]))
+       (absneg:VF
+         (match_operand:VF 1 "vector_operand" "0,  xBm,v, m")))
    (use (match_operand:VF 2 "vector_operand"    "xBm,0,  vm,v"))]
   "TARGET_SSE"
   "#"
   "&& reload_completed"
-  [(const_int 0)]
+  [(set (match_dup 0) (match_dup 3))]
 {
-  enum rtx_code absneg_op;
-  rtx op1, op2;
-  rtx t;
+  enum rtx_code absneg_op = <CODE> == ABS ? AND : XOR;
 
   if (TARGET_AVX)
     {
       if (MEM_P (operands[1]))
-       op1 = operands[2], op2 = operands[1];
-      else
-       op1 = operands[1], op2 = operands[2];
+        std::swap (operands[1], operands[2]);
     }
   else
+   {
+     if (operands_match_p (operands[0], operands[2]))
+       std::swap (operands[1], operands[2]);
+   }
+
+  operands[3]
+    = gen_rtx_fmt_ee (absneg_op, <MODE>mode, operands[1], operands[2]);
+}
+  [(set_attr "isa" "noavx,noavx,avx,avx")])
+
+(define_insn_and_split "*nabs<mode>2"
+  [(set (match_operand:VF 0 "register_operand" "=x,x,v,v")
+       (neg:VF
+         (abs:VF
+           (match_operand:VF 1 "vector_operand" "0,xBm,v,m"))))
+   (use (match_operand:VF 2 "vector_operand"    "xBm,0,vm,v"))]
+  "TARGET_SSE"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (match_dup 3))]
+{
+  if (TARGET_AVX)
     {
-      op1 = operands[0];
-      if (rtx_equal_p (operands[0], operands[1]))
-       op2 = operands[2];
-      else
-       op2 = operands[1];
+      if (MEM_P (operands[1]))
+        std::swap (operands[1], operands[2]);
     }
+  else
+   {
+     if (operands_match_p (operands[0], operands[2]))
+       std::swap (operands[1], operands[2]);
+   }
 
-  absneg_op = GET_CODE (operands[3]) == NEG ? XOR : AND;
-  t = gen_rtx_fmt_ee (absneg_op, <MODE>mode, op1, op2);
-  t = gen_rtx_SET (operands[0], t);
-  emit_insn (t);
-  DONE;
+  operands[3]
+    = gen_rtx_fmt_ee (IOR, <MODE>mode, operands[1], operands[2]);
 }
   [(set_attr "isa" "noavx,noavx,avx,avx")])
 
   [(set (match_operand:VF1_128_256 0 "register_operand")
        (unspec:VF1_128_256
          [(match_operand:VF1_128_256 1 "vector_operand")] UNSPEC_RSQRT))]
-  "TARGET_SSE_MATH"
+  "TARGET_SSE && TARGET_SSE_MATH"
 {
   ix86_emit_swsqrtsf (operands[0], operands[1], <MODE>mode, true);
   DONE;
        (unspec:V16SF
          [(match_operand:V16SF 1 "vector_operand")]
          UNSPEC_RSQRT28))]
-  "TARGET_SSE_MATH && TARGET_AVX512ER"
+  "TARGET_AVX512ER && TARGET_SSE_MATH"
 {
   ix86_emit_swsqrtsf (operands[0], operands[1], V16SFmode, true);
   DONE;
    (V8HI "const_0_to_7_operand")    (V16QI "const_0_to_7_operand")])
 
 (define_insn "<avx512>_cmp<mode>3<mask_scalar_merge_name><round_saeonly_name>"
-  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=Yk")
+  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
        (unspec:<avx512fmaskmode>
          [(match_operand:V48_AVX512VL 1 "register_operand" "v")
           (match_operand:V48_AVX512VL 2 "nonimmediate_operand" "<round_saeonly_constraint>")
    (set_attr "mode" "<sseinsnmode>")])
 
 (define_insn "<avx512>_cmp<mode>3<mask_scalar_merge_name>"
-  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=Yk")
+  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
        (unspec:<avx512fmaskmode>
          [(match_operand:VI12_AVX512VL 1 "register_operand" "v")
           (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm")
    (set_attr "mode" "<sseinsnmode>")])
 
 (define_insn "<avx512>_ucmp<mode>3<mask_scalar_merge_name>"
-  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=Yk")
+  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
        (unspec:<avx512fmaskmode>
          [(match_operand:VI12_AVX512VL 1 "register_operand" "v")
           (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm")
    (set_attr "mode" "<sseinsnmode>")])
 
 (define_insn "<avx512>_ucmp<mode>3<mask_scalar_merge_name>"
-  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=Yk")
+  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
        (unspec:<avx512fmaskmode>
          [(match_operand:VI48_AVX512VL 1 "register_operand" "v")
           (match_operand:VI48_AVX512VL 2 "nonimmediate_operand" "vm")
    (set_attr "mode" "<sseinsnmode>")])
 
 (define_insn "avx512f_vmcmp<mode>3<round_saeonly_name>"
-  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=Yk")
+  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
        (and:<avx512fmaskmode>
          (unspec:<avx512fmaskmode>
            [(match_operand:VF_128 1 "register_operand" "v")
    (set_attr "mode" "<ssescalarmode>")])
 
 (define_insn "avx512f_vmcmp<mode>3_mask<round_saeonly_name>"
-  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=Yk")
+  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
        (and:<avx512fmaskmode>
          (unspec:<avx512fmaskmode>
            [(match_operand:VF_128 1 "register_operand" "v")
    (set_attr "mode" "<ssescalarmode>")])
 
 (define_insn "avx512f_maskcmp<mode>3"
-  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=Yk")
+  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
        (match_operator:<avx512fmaskmode> 3 "sse_comparison_operator"
          [(match_operand:VF 1 "register_operand" "v")
           (match_operand:VF 2 "nonimmediate_operand" "vm")]))]
          (match_operand:VF_128_256 2 "vector_operand" "xBm,xm,vm,vm")))]
   "TARGET_SSE && <mask_avx512vl_condition>"
 {
-  static char buf[128];
+  char buf[128];
   const char *ops;
   const char *suffix;
 
     }
 
   snprintf (buf, sizeof (buf), ops, suffix);
-  return buf;
+  output_asm_insn (buf, operands);
+  return "";
 }
   [(set_attr "isa" "noavx,avx,avx512dq,avx512f")
    (set_attr "type" "sselog")
          (match_operand:VF_512 2 "nonimmediate_operand" "vm")))]
   "TARGET_AVX512F"
 {
-  static char buf[128];
+  char buf[128];
   const char *ops;
   const char *suffix;
 
   snprintf (buf, sizeof (buf),
            "v%sandn%s\t{%%2, %%1, %%0<mask_operand3_1>|%%0<mask_operand3_1>, %%1, %%2}",
            ops, suffix);
-  return buf;
+  output_asm_insn (buf, operands);
+  return "";
 }
   [(set_attr "type" "sselog")
    (set_attr "prefix" "evex")
   "TARGET_SSE && <mask_avx512vl_condition>
    && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
 {
-  static char buf[128];
+  char buf[128];
   const char *ops;
   const char *suffix;
 
     }
 
   snprintf (buf, sizeof (buf), ops, suffix);
-  return buf;
+  output_asm_insn (buf, operands);
+  return "";
 }
   [(set_attr "isa" "noavx,avx,avx512dq,avx512f")
    (set_attr "type" "sselog")
          (match_operand:VF_512 2 "nonimmediate_operand" "vm")))]
   "TARGET_AVX512F && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
 {
-  static char buf[128];
+  char buf[128];
   const char *ops;
   const char *suffix;
 
   snprintf (buf, sizeof (buf),
           "v%s<logic>%s\t{%%2, %%1, %%0<mask_operand3_1>|%%0<mask_operand3_1>, %%1, %%2}",
           ops, suffix);
-  return buf;
+  output_asm_insn (buf, operands);
+  return "";
 }
   [(set_attr "type" "sselog")
    (set_attr "prefix" "evex")
   operands[5] = gen_reg_rtx (<MODE>mode);
 })
 
+(define_expand "xorsign<mode>3"
+  [(set (match_dup 4)
+       (and:VF (match_dup 3)
+               (match_operand:VF 2 "vector_operand")))
+   (set (match_operand:VF 0 "register_operand")
+       (xor:VF (match_dup 4)
+               (match_operand:VF 1 "vector_operand")))]
+  "TARGET_SSE"
+{
+  operands[3] = ix86_build_signbit_mask (<MODE>mode, 1, 0);
+
+  operands[4] = gen_reg_rtx (<MODE>mode);
+})
+
+(define_expand "signbit<mode>2"
+  [(set (match_operand:<sseintvecmode> 0 "register_operand")
+       (lshiftrt:<sseintvecmode>
+         (subreg:<sseintvecmode>
+           (match_operand:VF1_AVX2 1 "register_operand") 0)
+         (match_dup 2)))]
+  "TARGET_SSE2"
+  "operands[2] = GEN_INT (GET_MODE_UNIT_BITSIZE (<MODE>mode)-1);")
+
 ;; Also define scalar versions.  These are used for abs, neg, and
 ;; conditional move.  Using subregs into vector modes causes register
 ;; allocation lossage.  These patterns do not allow memory operands
            (match_operand:MODEF 2 "register_operand" "x,x,v,v")))]
   "SSE_FLOAT_MODE_P (<MODE>mode)"
 {
-  static char buf[128];
+  char buf[128];
   const char *ops;
   const char *suffix
     = (get_attr_mode (insn) == MODE_V4SF) ? "ps" : "<ssevecmodesuffix>";
     }
 
   snprintf (buf, sizeof (buf), ops, suffix);
-  return buf;
+  output_asm_insn (buf, operands);
+  return "";
 }
   [(set_attr "isa" "noavx,avx,avx512vl,avx512f")
    (set_attr "type" "sselog")
          (match_operand:TF 2 "vector_operand" "xBm,xm,vm,v")))]
   "TARGET_SSE"
 {
-  static char buf[128];
+  char buf[128];
   const char *ops;
   const char *tmp
     = (which_alternative >= 2 ? "pandnq"
     }
 
   snprintf (buf, sizeof (buf), ops, tmp);
-  return buf;
+  output_asm_insn (buf, operands);
+  return "";
 }
   [(set_attr "isa" "noavx,avx,avx512vl,avx512f")
    (set_attr "type" "sselog")
          (match_operand:MODEF 2 "register_operand" "x,x,v,v")))]
   "SSE_FLOAT_MODE_P (<MODE>mode)"
 {
-  static char buf[128];
+  char buf[128];
   const char *ops;
   const char *suffix
     = (get_attr_mode (insn) == MODE_V4SF) ? "ps" : "<ssevecmodesuffix>";
     }
 
   snprintf (buf, sizeof (buf), ops, suffix);
-  return buf;
+  output_asm_insn (buf, operands);
+  return "";
 }
   [(set_attr "isa" "noavx,avx,avx512vl,avx512f")
    (set_attr "type" "sselog")
          (match_operand:TF 2 "vector_operand" "xBm,xm,vm,v")))]
   "TARGET_SSE && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
 {
-  static char buf[128];
+  char buf[128];
   const char *ops;
   const char *tmp
     = (which_alternative >= 2 ? "p<logic>q"
     }
 
   snprintf (buf, sizeof (buf), ops, tmp);
-  return buf;
+  output_asm_insn (buf, operands);
+  return "";
 }
   [(set_attr "isa" "noavx,avx,avx512vl,avx512f")
    (set_attr "type" "sselog")
        (vec_merge:VF_AVX512VL
          (fma:VF_AVX512VL
            (match_operand:VF_AVX512VL 1 "register_operand" "0,0")
-           (match_operand:VF_AVX512VL 2 "nonimmediate_operand" "<round_constraint>,v")
-           (match_operand:VF_AVX512VL 3 "nonimmediate_operand" "v,<round_constraint>"))
+           (match_operand:VF_AVX512VL 2 "<round_nimm_predicate>" "<round_constraint>,v")
+           (match_operand:VF_AVX512VL 3 "<round_nimm_predicate>" "v,<round_constraint>"))
          (match_dup 1)
          (match_operand:<avx512fmaskmode> 4 "register_operand" "Yk,Yk")))]
   "TARGET_AVX512F && <round_mode512bit_condition>"
   [(set (match_operand:VF_AVX512VL 0 "register_operand" "=v")
        (vec_merge:VF_AVX512VL
          (fma:VF_AVX512VL
-           (match_operand:VF_AVX512VL 1 "register_operand" "v")
-           (match_operand:VF_AVX512VL 2 "nonimmediate_operand" "<round_constraint>")
+           (match_operand:VF_AVX512VL 1 "<round_nimm_predicate>" "%v")
+           (match_operand:VF_AVX512VL 2 "<round_nimm_predicate>" "<round_constraint>")
            (match_operand:VF_AVX512VL 3 "register_operand" "0"))
          (match_dup 3)
          (match_operand:<avx512fmaskmode> 4 "register_operand" "Yk")))]
        (vec_merge:VF_AVX512VL
          (fma:VF_AVX512VL
            (match_operand:VF_AVX512VL 1 "register_operand" "0,0")
-           (match_operand:VF_AVX512VL 2 "nonimmediate_operand" "<round_constraint>,v")
+           (match_operand:VF_AVX512VL 2 "<round_nimm_predicate>" "<round_constraint>,v")
            (neg:VF_AVX512VL
-             (match_operand:VF_AVX512VL 3 "nonimmediate_operand" "v,<round_constraint>")))
+             (match_operand:VF_AVX512VL 3 "<round_nimm_predicate>" "v,<round_constraint>")))
          (match_dup 1)
          (match_operand:<avx512fmaskmode> 4 "register_operand" "Yk,Yk")))]
   "TARGET_AVX512F"
   [(set (match_operand:VF_AVX512VL 0 "register_operand" "=v")
        (vec_merge:VF_AVX512VL
          (fma:VF_AVX512VL
-           (match_operand:VF_AVX512VL 1 "register_operand" "v")
-           (match_operand:VF_AVX512VL 2 "nonimmediate_operand" "<round_constraint>")
+           (match_operand:VF_AVX512VL 1 "<round_nimm_predicate>" "%v")
+           (match_operand:VF_AVX512VL 2 "<round_nimm_predicate>" "<round_constraint>")
            (neg:VF_AVX512VL
              (match_operand:VF_AVX512VL 3 "register_operand" "0")))
          (match_dup 3)
          (fma:VF_AVX512VL
            (neg:VF_AVX512VL
              (match_operand:VF_AVX512VL 1 "register_operand" "0,0"))
-           (match_operand:VF_AVX512VL 2 "nonimmediate_operand" "<round_constraint>,v")
-           (match_operand:VF_AVX512VL 3 "nonimmediate_operand" "v,<round_constraint>"))
+           (match_operand:VF_AVX512VL 2 "<round_nimm_predicate>" "<round_constraint>,v")
+           (match_operand:VF_AVX512VL 3 "<round_nimm_predicate>" "v,<round_constraint>"))
          (match_dup 1)
          (match_operand:<avx512fmaskmode> 4 "register_operand" "Yk,Yk")))]
   "TARGET_AVX512F && <round_mode512bit_condition>"
        (vec_merge:VF_AVX512VL
          (fma:VF_AVX512VL
            (neg:VF_AVX512VL
-             (match_operand:VF_AVX512VL 1 "register_operand" "v"))
-           (match_operand:VF_AVX512VL 2 "nonimmediate_operand" "<round_constraint>")
+             (match_operand:VF_AVX512VL 1 "<round_nimm_predicate>" "%v"))
+           (match_operand:VF_AVX512VL 2 "<round_nimm_predicate>" "<round_constraint>")
            (match_operand:VF_AVX512VL 3 "register_operand" "0"))
          (match_dup 3)
          (match_operand:<avx512fmaskmode> 4 "register_operand" "Yk")))]
          (fma:VF_AVX512VL
            (neg:VF_AVX512VL
              (match_operand:VF_AVX512VL 1 "register_operand" "0,0"))
-           (match_operand:VF_AVX512VL 2 "nonimmediate_operand" "<round_constraint>,v")
+           (match_operand:VF_AVX512VL 2 "<round_nimm_predicate>" "<round_constraint>,v")
            (neg:VF_AVX512VL
-             (match_operand:VF_AVX512VL 3 "nonimmediate_operand" "v,<round_constraint>")))
+             (match_operand:VF_AVX512VL 3 "<round_nimm_predicate>" "v,<round_constraint>")))
          (match_dup 1)
          (match_operand:<avx512fmaskmode> 4 "register_operand" "Yk,Yk")))]
   "TARGET_AVX512F && <round_mode512bit_condition>"
        (vec_merge:VF_AVX512VL
          (fma:VF_AVX512VL
            (neg:VF_AVX512VL
-             (match_operand:VF_AVX512VL 1 "register_operand" "v"))
-           (match_operand:VF_AVX512VL 2 "nonimmediate_operand" "<round_constraint>")
+             (match_operand:VF_AVX512VL 1 "<round_nimm_predicate>" "%v"))
+           (match_operand:VF_AVX512VL 2 "<round_nimm_predicate>" "<round_constraint>")
            (neg:VF_AVX512VL
              (match_operand:VF_AVX512VL 3 "register_operand" "0")))
          (match_dup 3)
        (vec_merge:VF_AVX512VL
          (unspec:VF_AVX512VL
            [(match_operand:VF_AVX512VL 1 "register_operand" "0,0")
-            (match_operand:VF_AVX512VL 2 "nonimmediate_operand" "<round_constraint>,v")
-            (match_operand:VF_AVX512VL 3 "nonimmediate_operand" "v,<round_constraint>")]
+            (match_operand:VF_AVX512VL 2 "<round_nimm_predicate>" "<round_constraint>,v")
+            (match_operand:VF_AVX512VL 3 "<round_nimm_predicate>" "v,<round_constraint>")]
            UNSPEC_FMADDSUB)
          (match_dup 1)
          (match_operand:<avx512fmaskmode> 4 "register_operand" "Yk,Yk")))]
        (vec_merge:VF_AVX512VL
          (unspec:VF_AVX512VL
            [(match_operand:VF_AVX512VL 1 "register_operand" "v")
-            (match_operand:VF_AVX512VL 2 "nonimmediate_operand" "<round_constraint>")
+            (match_operand:VF_AVX512VL 2 "<round_nimm_predicate>" "<round_constraint>")
             (match_operand:VF_AVX512VL 3 "register_operand" "0")]
            UNSPEC_FMADDSUB)
          (match_dup 3)
        (vec_merge:VF_AVX512VL
          (unspec:VF_AVX512VL
            [(match_operand:VF_AVX512VL 1 "register_operand" "0,0")
-            (match_operand:VF_AVX512VL 2 "nonimmediate_operand" "<round_constraint>,v")
+            (match_operand:VF_AVX512VL 2 "<round_nimm_predicate>" "<round_constraint>,v")
             (neg:VF_AVX512VL
-              (match_operand:VF_AVX512VL 3 "nonimmediate_operand" "v,<round_constraint>"))]
+              (match_operand:VF_AVX512VL 3 "<round_nimm_predicate>" "v,<round_constraint>"))]
            UNSPEC_FMADDSUB)
          (match_dup 1)
          (match_operand:<avx512fmaskmode> 4 "register_operand" "Yk,Yk")))]
        (vec_merge:VF_AVX512VL
          (unspec:VF_AVX512VL
            [(match_operand:VF_AVX512VL 1 "register_operand" "v")
-            (match_operand:VF_AVX512VL 2 "nonimmediate_operand" "<round_constraint>")
+            (match_operand:VF_AVX512VL 2 "<round_nimm_predicate>" "<round_constraint>")
             (neg:VF_AVX512VL
               (match_operand:VF_AVX512VL 3 "register_operand" "0"))]
            UNSPEC_FMADDSUB)
   [(set (match_operand:VF_128 0 "register_operand")
        (vec_merge:VF_128
          (fma:VF_128
-           (match_operand:VF_128 1 "<round_nimm_predicate>")
+           (match_operand:VF_128 1 "register_operand")
            (match_operand:VF_128 2 "<round_nimm_predicate>")
            (match_operand:VF_128 3 "<round_nimm_predicate>"))
          (match_dup 1)
   [(set (match_operand:VF_128 0 "register_operand")
        (vec_merge:VF_128
          (fma:VF_128
-           (match_operand:VF_128 1 "<round_nimm_predicate>")
+           (match_operand:VF_128 1 "register_operand")
            (match_operand:VF_128 2 "<round_nimm_predicate>")
            (neg:VF_128
              (match_operand:VF_128 3 "<round_nimm_predicate>")))
          (fma:VF_128
            (neg:VF_128
              (match_operand:VF_128 2 "<round_nimm_predicate>"))
-           (match_operand:VF_128 1 "<round_nimm_predicate>")
+           (match_operand:VF_128 1 "register_operand")
            (match_operand:VF_128 3 "<round_nimm_predicate>"))
          (match_dup 1)
          (const_int 1)))]
          (fma:VF_128
            (neg:VF_128
              (match_operand:VF_128 2 "<round_nimm_predicate>"))
-           (match_operand:VF_128 1 "<round_nimm_predicate>")
+           (match_operand:VF_128 1 "register_operand")
            (neg:VF_128
              (match_operand:VF_128 3 "<round_nimm_predicate>")))
          (match_dup 1)
   [(set (match_operand:VF_128 0 "register_operand" "=v,v")
         (vec_merge:VF_128
          (fma:VF_128
-           (match_operand:VF_128 1 "<round_nimm_predicate>" " 0, 0")
+           (match_operand:VF_128 1 "register_operand" "0,0")
            (match_operand:VF_128 2 "<round_nimm_predicate>" "<round_constraint>, v")
-           (match_operand:VF_128 3 "<round_nimm_predicate>" " v,<round_constraint>"))
+           (match_operand:VF_128 3 "<round_nimm_predicate>" "v,<round_constraint>"))
          (match_dup 1)
          (const_int 1)))]
   "TARGET_FMA || TARGET_AVX512F"
   [(set (match_operand:VF_128 0 "register_operand" "=v,v")
         (vec_merge:VF_128
          (fma:VF_128
-           (match_operand:VF_128   1 "<round_nimm_predicate>" "0,0")
+           (match_operand:VF_128   1 "register_operand" "0,0")
            (match_operand:VF_128   2 "<round_nimm_predicate>" "<round_constraint>,v")
            (neg:VF_128
-             (match_operand:VF_128 3 "<round_nimm_predicate>" " v,<round_constraint>")))
+             (match_operand:VF_128 3 "<round_nimm_predicate>" "v,<round_constraint>")))
          (match_dup 1)
          (const_int 1)))]
   "TARGET_FMA || TARGET_AVX512F"
          (fma:VF_128
            (neg:VF_128
              (match_operand:VF_128 2 "<round_nimm_predicate>" "<round_constraint>,v"))
-           (match_operand:VF_128   1 "<round_nimm_predicate>" "0,0")
+           (match_operand:VF_128   1 "register_operand" "0,0")
            (match_operand:VF_128   3 "<round_nimm_predicate>" "v,<round_constraint>"))
          (match_dup 1)
          (const_int 1)))]
         (vec_merge:VF_128
          (fma:VF_128
            (neg:VF_128
-             (match_operand:VF_128 2 "<round_nimm_predicate>" "<round_constraint>, v"))
-           (match_operand:VF_128   1 "<round_nimm_predicate>" " 0, 0")
+             (match_operand:VF_128 2 "<round_nimm_predicate>" "<round_constraint>,v"))
+           (match_operand:VF_128   1 "register_operand" "0,0")
            (neg:VF_128
-             (match_operand:VF_128 3 "<round_nimm_predicate>" " v,<round_constraint>")))
+             (match_operand:VF_128 3 "<round_nimm_predicate>" "v,<round_constraint>")))
          (match_dup 1)
          (const_int 1)))]
   "TARGET_FMA || TARGET_AVX512F"
   [(set_attr "type" "ssemuladd")
    (set_attr "mode" "<MODE>")])
 
+(define_insn "avx512f_vmfmadd_<mode>_mask<round_name>"
+  [(set (match_operand:VF_128 0 "register_operand" "=v,v")
+       (vec_merge:VF_128
+         (vec_merge:VF_128
+           (fma:VF_128
+             (match_operand:VF_128 1 "register_operand" "0,0")
+             (match_operand:VF_128 2 "<round_nimm_predicate>" "<round_constraint>,v")
+             (match_operand:VF_128 3 "<round_nimm_predicate>" "v,<round_constraint>"))
+           (match_dup 1)
+           (match_operand:QI 4 "register_operand" "Yk,Yk"))
+         (match_dup 1)
+         (const_int 1)))]
+  "TARGET_AVX512F"
+  "@
+   vfmadd132<ssescalarmodesuffix>\t{<round_op5>%2, %3, %0%{%4%}|%0%{%4%}, %<iptr>3, %<iptr>2<round_op5>}
+   vfmadd213<ssescalarmodesuffix>\t{<round_op5>%3, %2, %0%{%4%}|%0%{%4%}, %<iptr>2, %<iptr>3<round_op5>}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "avx512f_vmfmadd_<mode>_mask3<round_name>"
+  [(set (match_operand:VF_128 0 "register_operand" "=v")
+       (vec_merge:VF_128
+         (vec_merge:VF_128
+           (fma:VF_128
+             (match_operand:VF_128 1 "<round_nimm_predicate>" "%v")
+             (match_operand:VF_128 2 "<round_nimm_predicate>" "<round_constraint>")
+             (match_operand:VF_128 3 "register_operand" "0"))
+           (match_dup 3)
+           (match_operand:QI 4 "register_operand" "Yk"))
+         (match_dup 3)
+         (const_int 1)))]
+  "TARGET_AVX512F"
+  "vfmadd231<ssescalarmodesuffix>\t{<round_op5>%2, %1, %0%{%4%}|%0%{%4%}, %<iptr>3, %<iptr>2<round_op5>}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+(define_expand "avx512f_vmfmadd_<mode>_maskz<round_expand_name>"
+  [(match_operand:VF_128 0 "register_operand")
+   (match_operand:VF_128 1 "<round_expand_nimm_predicate>")
+   (match_operand:VF_128 2 "<round_expand_nimm_predicate>")
+   (match_operand:VF_128 3 "<round_expand_nimm_predicate>")
+   (match_operand:QI 4 "register_operand")]
+  "TARGET_AVX512F"
+{
+  emit_insn (gen_avx512f_vmfmadd_<mode>_maskz_1<round_expand_name> (
+    operands[0], operands[1], operands[2], operands[3],
+    CONST0_RTX (<MODE>mode), operands[4]<round_expand_operand>));
+  DONE;
+})
+
+(define_insn "avx512f_vmfmadd_<mode>_maskz_1<round_name>"
+  [(set (match_operand:VF_128 0 "register_operand" "=v,v")
+       (vec_merge:VF_128
+         (vec_merge:VF_128
+           (fma:VF_128
+             (match_operand:VF_128 1 "register_operand" "0,0")
+             (match_operand:VF_128 2 "<round_nimm_predicate>" "<round_constraint>,v")
+             (match_operand:VF_128 3 "<round_nimm_predicate>" "v,<round_constraint>"))
+           (match_operand:VF_128 4 "const0_operand" "C,C")
+           (match_operand:QI 5 "register_operand" "Yk,Yk"))
+         (match_dup 1)
+         (const_int 1)))]
+  "TARGET_AVX512F"
+  "@
+   vfmadd132<ssescalarmodesuffix>\t{<round_op6>%2, %3, %0%{%5%}%{z%}|%0%{%5%}%{z%}, %<iptr>3, %<iptr>2<round_op6>}
+   vfmadd213<ssescalarmodesuffix>\t{<round_op6>%3, %2, %0%{%5%}%{z%}|%0%{%5%}%{z%}, %<iptr>2, %<iptr>3<round_op6>}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*avx512f_vmfmsub_<mode>_mask<round_name>"
+  [(set (match_operand:VF_128 0 "register_operand" "=v,v")
+       (vec_merge:VF_128
+         (vec_merge:VF_128
+           (fma:VF_128
+             (match_operand:VF_128 1 "register_operand" "0,0")
+             (match_operand:VF_128 2 "<round_nimm_predicate>" "<round_constraint>,v")
+             (neg:VF_128
+               (match_operand:VF_128 3 "<round_nimm_predicate>" "v,<round_constraint>")))
+           (match_dup 1)
+           (match_operand:QI 4 "register_operand" "Yk,Yk"))
+         (match_dup 1)
+         (const_int 1)))]
+  "TARGET_AVX512F"
+  "@
+   vfmsub132<ssescalarmodesuffix>\t{<round_op5>%2, %3, %0%{%4%}|%0%{%4%}, %<iptr>3, %<iptr>2<round_op5>}
+   vfmsub213<ssescalarmodesuffix>\t{<round_op5>%3, %2, %0%{%4%}|%0%{%4%}, %<iptr>2, %<iptr>3<round_op5>}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "avx512f_vmfmsub_<mode>_mask3<round_name>"
+  [(set (match_operand:VF_128 0 "register_operand" "=v")
+       (vec_merge:VF_128
+         (vec_merge:VF_128
+           (fma:VF_128
+             (match_operand:VF_128 1 "<round_nimm_predicate>" "%v")
+             (match_operand:VF_128 2 "<round_nimm_predicate>" "<round_constraint>")
+             (neg:VF_128
+               (match_operand:VF_128 3 "register_operand" "0")))
+           (match_dup 3)
+           (match_operand:QI 4 "register_operand" "Yk"))
+         (match_dup 3)
+         (const_int 1)))]
+  "TARGET_AVX512F"
+  "vfmsub231<ssescalarmodesuffix>\t{<round_op5>%2, %1, %0%{%4%}|%0%{%4%}, %<iptr>3, %<iptr>2<round_op5>}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*avx512f_vmfmsub_<mode>_maskz_1<round_name>"
+  [(set (match_operand:VF_128 0 "register_operand" "=v,v")
+       (vec_merge:VF_128
+         (vec_merge:VF_128
+           (fma:VF_128
+             (match_operand:VF_128 1 "register_operand" "0,0")
+             (match_operand:VF_128 2 "<round_nimm_predicate>" "<round_constraint>,v")
+             (neg:VF_128
+               (match_operand:VF_128 3 "<round_nimm_predicate>" "v,<round_constraint>")))
+           (match_operand:VF_128 4 "const0_operand" "C,C")
+           (match_operand:QI 5 "register_operand" "Yk,Yk"))
+         (match_dup 1)
+         (const_int 1)))]
+  "TARGET_AVX512F"
+  "@
+   vfmsub132<ssescalarmodesuffix>\t{<round_op6>%2, %3, %0%{%5%}%{z%}|%0%{%5%}%{z%}, %<iptr>3, %<iptr>2<round_op6>}
+   vfmsub213<ssescalarmodesuffix>\t{<round_op6>%3, %2, %0%{%5%}%{z%}|%0%{%5%}%{z%}, %<iptr>2, %<iptr>3<round_op6>}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*avx512f_vmfnmadd_<mode>_mask<round_name>"
+  [(set (match_operand:VF_128 0 "register_operand" "=v,v")
+       (vec_merge:VF_128
+         (vec_merge:VF_128
+           (fma:VF_128
+             (neg:VF_128
+               (match_operand:VF_128 2 "<round_nimm_predicate>" "<round_constraint>,v"))
+             (match_operand:VF_128 1 "register_operand" "0,0")
+             (match_operand:VF_128 3 "<round_nimm_predicate>" "v,<round_constraint>"))
+           (match_dup 1)
+           (match_operand:QI 4 "register_operand" "Yk,Yk"))
+         (match_dup 1)
+         (const_int 1)))]
+  "TARGET_AVX512F"
+  "@
+   vfnmadd132<ssescalarmodesuffix>\t{<round_op5>%2, %3, %0%{%4%}|%0%{%4%}, %<iptr>3, %<iptr>2<round_op5>}
+   vfnmadd213<ssescalarmodesuffix>\t{<round_op5>%3, %2, %0%{%4%}|%0%{%4%}, %<iptr>2, %<iptr>3<round_op5>}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*avx512f_vmfnmadd_<mode>_mask3<round_name>"
+  [(set (match_operand:VF_128 0 "register_operand" "=v")
+       (vec_merge:VF_128
+         (vec_merge:VF_128
+           (fma:VF_128
+             (neg:VF_128
+               (match_operand:VF_128 2 "<round_nimm_predicate>" "<round_constraint>"))
+             (match_operand:VF_128 1 "<round_nimm_predicate>" "%v")
+             (match_operand:VF_128 3 "register_operand" "0"))
+           (match_dup 3)
+           (match_operand:QI 4 "register_operand" "Yk"))
+         (match_dup 3)
+         (const_int 1)))]
+  "TARGET_AVX512F"
+  "vfnmadd231<ssescalarmodesuffix>\t{<round_op5>%2, %1, %0%{%4%}|%0%{%4%}, %<iptr>3, %<iptr>2<round_op5>}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*avx512f_vmfnmadd_<mode>_maskz_1<round_name>"
+  [(set (match_operand:VF_128 0 "register_operand" "=v,v")
+       (vec_merge:VF_128
+         (vec_merge:VF_128
+           (fma:VF_128
+             (neg:VF_128
+               (match_operand:VF_128 2 "<round_nimm_predicate>" "<round_constraint>,v"))
+             (match_operand:VF_128 1 "register_operand" "0,0")
+             (match_operand:VF_128 3 "<round_nimm_predicate>" "v,<round_constraint>"))
+           (match_operand:VF_128 4 "const0_operand" "C,C")
+           (match_operand:QI 5 "register_operand" "Yk,Yk"))
+         (match_dup 1)
+         (const_int 1)))]
+  "TARGET_AVX512F"
+  "@
+   vfnmadd132<ssescalarmodesuffix>\t{<round_op6>%2, %3, %0%{%5%}%{z%}|%0%{%5%}%{z%}, %<iptr>3, %<iptr>2<round_op6>}
+   vfnmadd213<ssescalarmodesuffix>\t{<round_op6>%3, %2, %0%{%5%}%{z%}|%0%{%5%}%{z%}, %<iptr>2, %<iptr>3<round_op6>}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*avx512f_vmfnmsub_<mode>_mask<round_name>"
+  [(set (match_operand:VF_128 0 "register_operand" "=v,v")
+       (vec_merge:VF_128
+         (vec_merge:VF_128
+           (fma:VF_128
+             (neg:VF_128
+               (match_operand:VF_128 2 "<round_nimm_predicate>" "<round_constraint>,v"))
+             (match_operand:VF_128 1 "register_operand" "0,0")
+             (neg:VF_128
+               (match_operand:VF_128 3 "<round_nimm_predicate>" "v,<round_constraint>")))
+           (match_dup 1)
+           (match_operand:QI 4 "register_operand" "Yk,Yk"))
+         (match_dup 1)
+         (const_int 1)))]
+  "TARGET_AVX512F"
+  "@
+   vfnmsub132<ssescalarmodesuffix>\t{<round_op5>%2, %3, %0%{%4%}|%0%{%4%}, %<iptr>3, %<iptr>2<round_op5>}
+   vfnmsub213<ssescalarmodesuffix>\t{<round_op5>%3, %2, %0%{%4%}|%0%{%4%}, %<iptr>2, %<iptr>3<round_op5>}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*avx512f_vmfnmsub_<mode>_mask3<round_name>"
+  [(set (match_operand:VF_128 0 "register_operand" "=v")
+       (vec_merge:VF_128
+         (vec_merge:VF_128
+           (fma:VF_128
+             (neg:VF_128
+               (match_operand:VF_128 2 "<round_nimm_predicate>" "<round_constraint>"))
+             (match_operand:VF_128 1 "<round_nimm_predicate>" "%v")
+             (neg:VF_128
+               (match_operand:VF_128 3 "register_operand" "0")))
+           (match_dup 3)
+           (match_operand:QI 4 "register_operand" "Yk"))
+         (match_dup 3)
+         (const_int 1)))]
+  "TARGET_AVX512F"
+  "vfnmsub231<ssescalarmodesuffix>\t{<round_op5>%2, %1, %0%{%4%}|%0%{%4%}, %<iptr>3, %<iptr>2<round_op5>}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*avx512f_vmfnmsub_<mode>_maskz_1<round_name>"
+  [(set (match_operand:VF_128 0 "register_operand" "=v,v")
+       (vec_merge:VF_128
+         (vec_merge:VF_128
+           (fma:VF_128
+             (neg:VF_128
+               (match_operand:VF_128 2 "<round_nimm_predicate>" "<round_constraint>,v"))
+             (match_operand:VF_128 1 "register_operand" "0,0")
+             (neg:VF_128
+               (match_operand:VF_128 3 "<round_nimm_predicate>" "v,<round_constraint>")))
+           (match_operand:VF_128 4 "const0_operand" "C,C")
+           (match_operand:QI 5 "register_operand" "Yk,Yk"))
+         (match_dup 1)
+         (const_int 1)))]
+  "TARGET_AVX512F"
+  "@
+   vfnmsub132<ssescalarmodesuffix>\t{<round_op6>%2, %3, %0%{%5%}%{z%}|%0%{%5%}%{z%}, %<iptr>3, %<iptr>2<round_op6>}
+   vfnmsub213<ssescalarmodesuffix>\t{<round_op6>%3, %2, %0%{%5%}%{z%}|%0%{%5%}%{z%}, %<iptr>2, %<iptr>3<round_op6>}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
 ;; FMA4 floating point scalar intrinsics.  These write the
 ;; entire destination register, with the high-order elements zeroed.
 
 ;;
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
-(define_insn "sse_cvtpi2ps"
-  [(set (match_operand:V4SF 0 "register_operand" "=x")
+(define_insn_and_split "sse_cvtpi2ps"
+  [(set (match_operand:V4SF 0 "register_operand" "=x,x,Yv")
        (vec_merge:V4SF
          (vec_duplicate:V4SF
-           (float:V2SF (match_operand:V2SI 2 "nonimmediate_operand" "ym")))
-         (match_operand:V4SF 1 "register_operand" "0")
-         (const_int 3)))]
-  "TARGET_SSE"
-  "cvtpi2ps\t{%2, %0|%0, %2}"
-  [(set_attr "type" "ssecvt")
+           (float:V2SF (match_operand:V2SI 2 "register_mmxmem_operand" "ym,x,Yv")))
+         (match_operand:V4SF 1 "register_operand" "0,0,Yv")
+         (const_int 3)))
+   (clobber (match_scratch:V4SF 3 "=X,x,Yv"))]
+  "(TARGET_MMX || TARGET_MMX_WITH_SSE) && TARGET_SSE"
+  "@
+   cvtpi2ps\t{%2, %0|%0, %2}
+   #
+   #"
+  "TARGET_MMX_WITH_SSE && reload_completed"
+  [(const_int 0)]
+{
+  rtx op2 = lowpart_subreg (V4SImode, operands[2],
+                           GET_MODE (operands[2]));
+  /* Generate SSE2 cvtdq2ps.  */
+  emit_insn (gen_floatv4siv4sf2 (operands[3], op2));
+
+  /* Merge operands[3] with operands[0].  */
+  rtx mask, op1;
+  if (TARGET_AVX)
+    {
+      mask = gen_rtx_PARALLEL (VOIDmode,
+                              gen_rtvec (4, GEN_INT (0), GEN_INT (1),
+                                         GEN_INT (6), GEN_INT (7)));
+      op1 = gen_rtx_VEC_CONCAT (V8SFmode, operands[3], operands[1]);
+      op2 = gen_rtx_VEC_SELECT (V4SFmode, op1, mask);
+      emit_insn (gen_rtx_SET (operands[0], op2));
+    }
+  else
+    {
+      /* NB: SSE can only concatenate OP0 and OP3 to OP0.  */
+      mask = gen_rtx_PARALLEL (VOIDmode,
+                              gen_rtvec (4, GEN_INT (2), GEN_INT (3),
+                                         GEN_INT (4), GEN_INT (5)));
+      op1 = gen_rtx_VEC_CONCAT (V8SFmode, operands[0], operands[3]);
+      op2 = gen_rtx_VEC_SELECT (V4SFmode, op1, mask);
+      emit_insn (gen_rtx_SET (operands[0], op2));
+
+      /* Swap bits 0:63 with bits 64:127.  */
+      mask = gen_rtx_PARALLEL (VOIDmode,
+                              gen_rtvec (4, GEN_INT (2), GEN_INT (3),
+                                         GEN_INT (0), GEN_INT (1)));
+      rtx dest = lowpart_subreg (V4SImode, operands[0],
+                                GET_MODE (operands[0]));
+      op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
+      emit_insn (gen_rtx_SET (dest, op1));
+    }
+  DONE;
+}
+  [(set_attr "mmx_isa" "native,x64_noavx,x64_avx")
+   (set_attr "type" "ssecvt")
    (set_attr "mode" "V4SF")])
 
 (define_insn "sse_cvtps2pi"
-  [(set (match_operand:V2SI 0 "register_operand" "=y")
+  [(set (match_operand:V2SI 0 "register_operand" "=y,Yv")
        (vec_select:V2SI
-         (unspec:V4SI [(match_operand:V4SF 1 "nonimmediate_operand" "xm")]
+         (unspec:V4SI [(match_operand:V4SF 1 "register_mmxmem_operand" "xm,YvBm")]
                       UNSPEC_FIX_NOTRUNC)
          (parallel [(const_int 0) (const_int 1)])))]
-  "TARGET_SSE"
-  "cvtps2pi\t{%1, %0|%0, %q1}"
-  [(set_attr "type" "ssecvt")
-   (set_attr "unit" "mmx")
+  "(TARGET_MMX || TARGET_MMX_WITH_SSE) && TARGET_SSE"
+  "@
+   cvtps2pi\t{%1, %0|%0, %q1}
+   %vcvtps2dq\t{%1, %0|%0, %1}"
+  [(set_attr "mmx_isa" "native,x64")
+   (set_attr "type" "ssecvt")
+   (set_attr "unit" "mmx,*")
    (set_attr "mode" "DI")])
 
 (define_insn "sse_cvttps2pi"
-  [(set (match_operand:V2SI 0 "register_operand" "=y")
+  [(set (match_operand:V2SI 0 "register_operand" "=y,Yv")
        (vec_select:V2SI
-         (fix:V4SI (match_operand:V4SF 1 "nonimmediate_operand" "xm"))
+         (fix:V4SI (match_operand:V4SF 1 "register_mmxmem_operand" "xm,YvBm"))
          (parallel [(const_int 0) (const_int 1)])))]
-  "TARGET_SSE"
-  "cvttps2pi\t{%1, %0|%0, %q1}"
-  [(set_attr "type" "ssecvt")
-   (set_attr "unit" "mmx")
+  "(TARGET_MMX || TARGET_MMX_WITH_SSE) && TARGET_SSE"
+  "@
+   cvttps2pi\t{%1, %0|%0, %q1}
+   %vcvttps2dq\t{%1, %0|%0, %1}"
+  [(set_attr "mmx_isa" "native,x64")
+   (set_attr "type" "ssecvt")
+   (set_attr "unit" "mmx,*")
    (set_attr "prefix_rep" "0")
    (set_attr "mode" "SF")])
 
          (match_operand:VF_128 1 "register_operand" "v")
          (const_int 1)))]
   "TARGET_AVX512F && <round_modev4sf_condition>"
-  "vcvtusi2<ssescalarmodesuffix>\t{%2, <round_op3>%1, %0|%0, %1<round_op3>, %2}"
+  "vcvtusi2<ssescalarmodesuffix>{l}\t{%2, <round_op3>%1, %0|%0, %1<round_op3>, %2}"
   [(set_attr "type" "sseicvt")
    (set_attr "prefix" "evex")
    (set_attr "mode" "<ssescalarmode>")])
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (define_insn "sse2_cvtpi2pd"
-  [(set (match_operand:V2DF 0 "register_operand" "=x,x")
-       (float:V2DF (match_operand:V2SI 1 "nonimmediate_operand" "y,m")))]
+  [(set (match_operand:V2DF 0 "register_operand" "=v,x")
+       (float:V2DF (match_operand:V2SI 1 "nonimmediate_operand" "vBm,?!y")))]
   "TARGET_SSE2"
-  "cvtpi2pd\t{%1, %0|%0, %1}"
-  [(set_attr "type" "ssecvt")
-   (set_attr "unit" "mmx,*")
-   (set_attr "prefix_data16" "1,*")
+  "@
+   %vcvtdq2pd\t{%1, %0|%0, %1}
+   cvtpi2pd\t{%1, %0|%0, %1}"
+  [(set_attr "mmx_isa" "*,native")
+   (set_attr "type" "ssecvt")
+   (set_attr "unit" "*,mmx")
+   (set_attr "prefix_data16" "*,1")
+   (set_attr "prefix" "maybe_vex,*")
    (set_attr "mode" "V2DF")])
 
 (define_insn "sse2_cvtpd2pi"
-  [(set (match_operand:V2SI 0 "register_operand" "=y")
-       (unspec:V2SI [(match_operand:V2DF 1 "nonimmediate_operand" "xm")]
+  [(set (match_operand:V2SI 0 "register_operand" "=v,?!y")
+       (unspec:V2SI [(match_operand:V2DF 1 "nonimmediate_operand" "vBm,xm")]
                     UNSPEC_FIX_NOTRUNC))]
   "TARGET_SSE2"
-  "cvtpd2pi\t{%1, %0|%0, %1}"
-  [(set_attr "type" "ssecvt")
-   (set_attr "unit" "mmx")
+  "@
+   * return TARGET_AVX ? \"vcvtpd2dq{x}\t{%1, %0|%0, %1}\" : \"cvtpd2dq\t{%1, %0|%0, %1}\";
+   cvtpd2pi\t{%1, %0|%0, %1}"
+  [(set_attr "mmx_isa" "*,native")
+   (set_attr "type" "ssecvt")
+   (set_attr "unit" "*,mmx")
+   (set_attr "amdfam10_decode" "double")
+   (set_attr "athlon_decode" "vector")
    (set_attr "bdver1_decode" "double")
-   (set_attr "btver2_decode" "direct")
-   (set_attr "prefix_data16" "1")
-   (set_attr "mode" "DI")])
+   (set_attr "prefix_data16" "*,1")
+   (set_attr "prefix" "maybe_vex,*")
+   (set_attr "mode" "TI")])
 
 (define_insn "sse2_cvttpd2pi"
-  [(set (match_operand:V2SI 0 "register_operand" "=y")
-       (fix:V2SI (match_operand:V2DF 1 "nonimmediate_operand" "xm")))]
+  [(set (match_operand:V2SI 0 "register_operand" "=v,?!y")
+       (fix:V2SI (match_operand:V2DF 1 "nonimmediate_operand" "vBm,xm")))]
   "TARGET_SSE2"
-  "cvttpd2pi\t{%1, %0|%0, %1}"
-  [(set_attr "type" "ssecvt")
-   (set_attr "unit" "mmx")
+  "@
+   * return TARGET_AVX ? \"vcvttpd2dq{x}\t{%1, %0|%0, %1}\" : \"cvttpd2dq\t{%1, %0|%0, %1}\";
+   cvttpd2pi\t{%1, %0|%0, %1}"
+  [(set_attr "mmx_isa" "*,native")
+   (set_attr "type" "ssecvt")
+   (set_attr "unit" "*,mmx")
+   (set_attr "amdfam10_decode" "double")
+   (set_attr "athlon_decode" "vector")
    (set_attr "bdver1_decode" "double")
-   (set_attr "prefix_data16" "1")
+   (set_attr "prefix_data16" "*,1")
+   (set_attr "prefix" "maybe_vex,*")
    (set_attr "mode" "TI")])
 
 (define_insn "sse2_cvtsi2sd"
          (const_int 1)))]
   "TARGET_SSE2"
   "@
-   cvtsi2sd\t{%2, %0|%0, %2}
-   cvtsi2sd\t{%2, %0|%0, %2}
-   vcvtsi2sd\t{%2, %1, %0|%0, %1, %2}"
+   cvtsi2sd{l}\t{%2, %0|%0, %2}
+   cvtsi2sd{l}\t{%2, %0|%0, %2}
+   vcvtsi2sd{l}\t{%2, %1, %0|%0, %1, %2}"
   [(set_attr "isa" "noavx,noavx,avx")
    (set_attr "type" "sseicvt")
    (set_attr "athlon_decode" "double,direct,*")
          (const_int 1)))]
   "TARGET_SSE2 && TARGET_64BIT"
   "@
-   cvtsi2sdq\t{%2, %0|%0, %2}
-   cvtsi2sdq\t{%2, %0|%0, %2}
-   vcvtsi2sdq\t{%2, <round_op3>%1, %0|%0, %1<round_op3>, %2}"
+   cvtsi2sd{q}\t{%2, %0|%0, %2}
+   cvtsi2sd{q}\t{%2, %0|%0, %2}
+   vcvtsi2sd{q}\t{%2, <round_op3>%1, %0|%0, %1<round_op3>, %2}"
   [(set_attr "isa" "noavx,noavx,avx")
    (set_attr "type" "sseicvt")
    (set_attr "athlon_decode" "double,direct,*")
    (set_attr "prefix" "evex")
    (set_attr "mode" "<MODE>")])
 
-(define_insn "float<floatunssuffix>v2div2sf2"
+(define_expand "float<floatunssuffix>v2div2sf2"
   [(set (match_operand:V4SF 0 "register_operand" "=v")
        (vec_concat:V4SF
            (any_float:V2SF (match_operand:V2DI 1 "nonimmediate_operand" "vm"))
-           (const_vector:V2SF [(const_int 0) (const_int 0)])))]
+           (match_dup 2)))]
+  "TARGET_AVX512DQ && TARGET_AVX512VL"
+  "operands[2] = CONST0_RTX (V2SFmode);")
+
+(define_insn "*float<floatunssuffix>v2div2sf2"
+  [(set (match_operand:V4SF 0 "register_operand" "=v")
+       (vec_concat:V4SF
+           (any_float:V2SF (match_operand:V2DI 1 "nonimmediate_operand" "vm"))
+           (match_operand:V2SF 2 "const0_operand" "C")))]
   "TARGET_AVX512DQ && TARGET_AVX512VL"
   "vcvt<floatsuffix>qq2ps{x}\t{%1, %0|%0, %1}"
   [(set_attr "type" "ssecvt")
   DONE;
 })
 
-(define_insn "float<floatunssuffix>v2div2sf2_mask"
+(define_expand "float<floatunssuffix>v2div2sf2_mask"
   [(set (match_operand:V4SF 0 "register_operand" "=v")
     (vec_concat:V4SF
         (vec_merge:V2SF
-               (any_float:V2SF (match_operand:V2DI 1 "nonimmediate_operand" "vm"))
+           (any_float:V2SF (match_operand:V2DI 1 "nonimmediate_operand" "vm"))
             (vec_select:V2SF
                 (match_operand:V4SF 2 "nonimm_or_0_operand" "0C")
                 (parallel [(const_int 0) (const_int 1)]))
             (match_operand:QI 3 "register_operand" "Yk"))
-           (const_vector:V2SF [(const_int 0) (const_int 0)])))]
+           (match_dup 4)))]
+  "TARGET_AVX512DQ && TARGET_AVX512VL"
+  "operands[4] = CONST0_RTX (V2SFmode);")
+
+(define_insn "*float<floatunssuffix>v2div2sf2_mask"
+  [(set (match_operand:V4SF 0 "register_operand" "=v")
+    (vec_concat:V4SF
+        (vec_merge:V2SF
+           (any_float:V2SF (match_operand:V2DI 1 "nonimmediate_operand" "vm"))
+            (vec_select:V2SF
+                (match_operand:V4SF 2 "nonimm_or_0_operand" "0C")
+                (parallel [(const_int 0) (const_int 1)]))
+            (match_operand:QI 3 "register_operand" "Yk"))
+           (match_operand:V2SF 4 "const0_operand" "C")))]
   "TARGET_AVX512DQ && TARGET_AVX512VL"
   "vcvt<floatsuffix>qq2ps{x}\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}"
   [(set_attr "type" "ssecvt")
        (vec_merge:V2SF
                (any_float:V2SF (match_operand:V2DI 1
                                  "nonimmediate_operand" "vm"))
-           (const_vector:V2SF [(const_int 0) (const_int 0)])
+           (match_operand:V2SF 3 "const0_operand" "C")
            (match_operand:QI 2 "register_operand" "Yk"))
-           (const_vector:V2SF [(const_int 0) (const_int 0)])))]
+           (match_operand:V2SF 4 "const0_operand" "C")))]
   "TARGET_AVX512DQ && TARGET_AVX512VL"
   "vcvt<floatsuffix>qq2ps{x}\t{%1, %0%{%2%}%{z%}|%0%{%2%}%{z%}, %1}"
   [(set_attr "type" "ssecvt")
    (set_attr "mode" "V8DF")])
 
 (define_insn "<avx512>_cvt<ssemodesuffix>2mask<mode>"
-  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=Yk")
+  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
        (unspec:<avx512fmaskmode>
         [(match_operand:VI12_AVX512VL 1 "register_operand" "v")]
         UNSPEC_CVTINT2MASK))]
    (set_attr "mode" "<sseinsnmode>")])
 
 (define_insn "<avx512>_cvt<ssemodesuffix>2mask<mode>"
-  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=Yk")
+  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
        (unspec:<avx512fmaskmode>
         [(match_operand:VI48_AVX512VL 1 "register_operand" "v")]
         UNSPEC_CVTINT2MASK))]
        (vec_merge:VI12_AVX512VL
          (match_operand:VI12_AVX512VL 2 "vector_all_ones_operand")
          (match_operand:VI12_AVX512VL 3 "const0_operand")
-         (match_operand:<avx512fmaskmode> 1 "register_operand" "Yk")))]
+         (match_operand:<avx512fmaskmode> 1 "register_operand" "k")))]
   "TARGET_AVX512BW"
   "vpmovm2<ssemodesuffix>\t{%1, %0|%0, %1}"
   [(set_attr "prefix" "evex")
        (vec_merge:VI48_AVX512VL
          (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand")
          (match_operand:VI48_AVX512VL 3 "const0_operand")
-         (match_operand:<avx512fmaskmode> 1 "register_operand" "Yk")))]
+         (match_operand:<avx512fmaskmode> 1 "register_operand" "k")))]
   "TARGET_AVX512DQ"
   "vpmovm2<ssemodesuffix>\t{%1, %0|%0, %1}"
   [(set_attr "prefix" "evex")
              (const_string "mmxmov")
           ]
           (const_string "sselog")))
+   (set (attr "mmx_isa")
+     (if_then_else (eq_attr "alternative" "7,8")
+                  (const_string "native")
+                  (const_string "*")))
    (set (attr "prefix_data16")
      (if_then_else (eq_attr "alternative" "3,4")
                   (const_string "1")
    movss\t{%1, %0|%0, %1}
    punpckldq\t{%2, %0|%0, %2}
    movd\t{%1, %0|%0, %1}"
-  [(set_attr "type" "sselog,ssemov,mmxcvt,mmxmov")
+  [(set_attr "mmx_isa" "*,*,native,native")
+   (set_attr "type" "sselog,ssemov,mmxcvt,mmxmov")
    (set_attr "mode" "V4SF,SF,DI,DI")])
 
 (define_insn "*vec_concatv4sf"
    (set_attr "prefix" "orig,maybe_evex,orig,maybe_evex")
    (set_attr "mode" "V4SF,V4SF,V2SF,V2SF")])
 
+(define_insn "*vec_concatv4sf_0"
+  [(set (match_operand:V4SF 0 "register_operand"       "=v")
+       (vec_concat:V4SF
+         (match_operand:V2SF 1 "nonimmediate_operand" "xm")
+         (match_operand:V2SF 2 "const0_operand"       " C")))]
+  "TARGET_SSE2"
+  "%vmovq\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "DF")])
+
 ;; Avoid combining registers from different units in a single alternative,
 ;; see comment above inline_secondary_memory_needed function in i386.c
 (define_insn "vec_set<mode>_0"
   [(set (match_operand:VI4F_256_512 0 "register_operand" "=v,v,v")
        (vec_merge:VI4F_256_512
          (vec_duplicate:VI4F_256_512
-           (match_operand:<ssescalarmode> 2 "general_operand" "v,m,r"))
+           (match_operand:<ssescalarmode> 2 "nonimmediate_operand" "v,m,r"))
          (match_operand:VI4F_256_512 1 "const0_operand" "C,C,C")
          (const_int 1)))]
   "TARGET_AVX"
 
 (define_expand "vec_extract<mode><ssehalfvecmodelower>"
   [(match_operand:<ssehalfvecmode> 0 "nonimmediate_operand")
-   (match_operand:V_512 1 "register_operand")
+   (match_operand:V_256_512 1 "register_operand")
    (match_operand 2 "const_0_to_1_operand")]
-  "TARGET_AVX512F"
+  "TARGET_AVX"
 {
   if (INTVAL (operands[2]))
     emit_insn (gen_vec_extract_hi_<mode> (operands[0], operands[1]));
 (define_expand "<avx512>_fixupimm<mode>_maskz<round_saeonly_expand_name>"
   [(match_operand:VF_AVX512VL 0 "register_operand")
    (match_operand:VF_AVX512VL 1 "register_operand")
-   (match_operand:<sseintvecmode> 2 "<round_saeonly_expand_nimm_predicate>")
-   (match_operand:SI 3 "const_0_to_255_operand")
-   (match_operand:<avx512fmaskmode> 4 "register_operand")]
+   (match_operand:VF_AVX512VL 2 "register_operand")
+   (match_operand:<sseintvecmode> 3 "<round_saeonly_expand_nimm_predicate>")
+   (match_operand:SI 4 "const_0_to_255_operand")
+   (match_operand:<avx512fmaskmode> 5 "register_operand")]
   "TARGET_AVX512F"
 {
-  emit_insn (gen_<avx512>_fixupimm<mode>_mask<round_saeonly_expand_name> (
+  emit_insn (gen_<avx512>_fixupimm<mode>_maskz_1<round_saeonly_expand_name> (
        operands[0], operands[1], operands[2], operands[3],
-       CONST0_RTX (<MODE>mode), operands[4]
-       <round_saeonly_expand_operand5>));
+       operands[4], CONST0_RTX (<MODE>mode), operands[5]
+       <round_saeonly_expand_operand6>));
   DONE;
 })
 
-(define_insn "<avx512>_fixupimm<mode><mask_name><round_saeonly_name>"
+(define_insn "<avx512>_fixupimm<mode><sd_maskz_name><round_saeonly_name>"
   [(set (match_operand:VF_AVX512VL 0 "register_operand" "=v")
         (unspec:VF_AVX512VL
-          [(match_operand:VF_AVX512VL 1 "register_operand" "v")
-           (match_operand:<sseintvecmode> 2 "nonimmediate_operand" "<round_saeonly_constraint>")
-           (match_operand:SI 3 "const_0_to_255_operand")]
+          [(match_operand:VF_AVX512VL 1 "register_operand" "0")
+          (match_operand:VF_AVX512VL 2 "register_operand" "v")
+           (match_operand:<sseintvecmode> 3 "nonimmediate_operand" "<round_saeonly_constraint>")
+           (match_operand:SI 4 "const_0_to_255_operand")]
            UNSPEC_FIXUPIMM))]
   "TARGET_AVX512F"
-  "vfixupimm<ssemodesuffix>\t{%3, <round_saeonly_mask_op4>%2, %1, %0<mask_operand4>|%0<mask_operand4>, %1, %2<round_saeonly_mask_op4>, %3}";
+  "vfixupimm<ssemodesuffix>\t{%4, <round_saeonly_sd_mask_op5>%3, %2, %0<sd_mask_op5>|%0<sd_mask_op5>, %2, %3<round_saeonly_sd_mask_op5>, %4}";
+  [(set_attr "prefix" "evex")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "<avx512>_fixupimm<mode>_mask<round_saeonly_name>"
+  [(set (match_operand:VF_AVX512VL 0 "register_operand" "=v")
+       (vec_merge:VF_AVX512VL
+          (unspec:VF_AVX512VL
+            [(match_operand:VF_AVX512VL 1 "register_operand" "0")
+            (match_operand:VF_AVX512VL 2 "register_operand" "v")
+             (match_operand:<sseintvecmode> 3 "nonimmediate_operand" "<round_saeonly_constraint>")
+             (match_operand:SI 4 "const_0_to_255_operand")]
+             UNSPEC_FIXUPIMM)
+         (match_dup 1)
+         (match_operand:<avx512fmaskmode> 5 "register_operand" "Yk")))]
+  "TARGET_AVX512F"
+  "vfixupimm<ssemodesuffix>\t{%4, <round_saeonly_op6>%3, %2, %0%{%5%}|%0%{%5%}, %2, %3<round_saeonly_op6>, %4}";
   [(set_attr "prefix" "evex")
    (set_attr "mode" "<MODE>")])
 
 (define_expand "avx512f_sfixupimm<mode>_maskz<round_saeonly_expand_name>"
   [(match_operand:VF_128 0 "register_operand")
    (match_operand:VF_128 1 "register_operand")
-   (match_operand:<sseintvecmode> 2 "<round_saeonly_expand_nimm_predicate>")
-   (match_operand:SI 3 "const_0_to_255_operand")
-   (match_operand:<avx512fmaskmode> 4 "register_operand")]
+   (match_operand:VF_128 2 "register_operand")
+   (match_operand:<sseintvecmode> 3 "<round_saeonly_expand_nimm_predicate>")
+   (match_operand:SI 4 "const_0_to_255_operand")
+   (match_operand:<avx512fmaskmode> 5 "register_operand")]
   "TARGET_AVX512F"
 {
-  emit_insn (gen_avx512f_sfixupimm<mode>_mask<round_saeonly_expand_name> (
+  emit_insn (gen_avx512f_sfixupimm<mode>_maskz_1<round_saeonly_expand_name> (
        operands[0], operands[1], operands[2], operands[3],
-       CONST0_RTX (<MODE>mode), operands[4]
-       <round_saeonly_expand_operand5>));
+       operands[4], CONST0_RTX (<MODE>mode), operands[5]
+       <round_saeonly_expand_operand6>));
   DONE;
 })
 
-(define_insn "avx512f_sfixupimm<mode><mask_name><round_saeonly_name>"
+(define_insn "avx512f_sfixupimm<mode><sd_maskz_name><round_saeonly_name>"
   [(set (match_operand:VF_128 0 "register_operand" "=v")
-        (unspec:VF_128
-          [(match_operand:VF_128 1 "register_operand" "v")
-            (match_operand:<sseintvecmode> 2 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")
-            (match_operand:SI 3 "const_0_to_255_operand")]
-           UNSPEC_FIXUPIMM))]
+       (vec_merge:VF_128
+          (unspec:VF_128
+            [(match_operand:VF_128 1 "register_operand" "0")
+            (match_operand:VF_128 2 "register_operand" "v")
+            (match_operand:<sseintvecmode> 3 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")
+            (match_operand:SI 4 "const_0_to_255_operand")]
+           UNSPEC_FIXUPIMM)
+         (match_dup 1)
+         (const_int 1)))]
    "TARGET_AVX512F"
-  "vfixupimm<ssescalarmodesuffix>\t{%3, <round_saeonly_mask_op4>%2, %1, %0<mask_operand4>|%0<mask_operand4>, %1, %<iptr>2<round_saeonly_mask_op4>, %3}";
+   "vfixupimm<ssescalarmodesuffix>\t{%4, <round_saeonly_sd_mask_op5>%3, %2, %0<sd_mask_op5>|%0<sd_mask_op5>, %2, %<iptr>3<round_saeonly_sd_mask_op5>, %4}";
+   [(set_attr "prefix" "evex")
+   (set_attr "mode" "<ssescalarmode>")])
+
+(define_insn "avx512f_sfixupimm<mode>_mask<round_saeonly_name>"
+  [(set (match_operand:VF_128 0 "register_operand" "=v")
+       (vec_merge:VF_128
+         (vec_merge:VF_128
+           (unspec:VF_128
+              [(match_operand:VF_128 1 "register_operand" "0")
+               (match_operand:VF_128 2 "register_operand" "v")
+               (match_operand:<sseintvecmode> 3 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")
+               (match_operand:SI 4 "const_0_to_255_operand")]
+              UNSPEC_FIXUPIMM)
+           (match_dup 1)
+           (const_int 1))
+         (match_dup 1)
+         (match_operand:<avx512fmaskmode> 5 "register_operand" "Yk")))]
+  "TARGET_AVX512F"
+  "vfixupimm<ssescalarmodesuffix>\t{%4, <round_saeonly_op6>%3, %2, %0%{%5%}|%0%{%5%}, %2, %<iptr>3<round_saeonly_op6>, %4}";
   [(set_attr "prefix" "evex")
    (set_attr "mode" "<ssescalarmode>")])
 
 (define_insn "vec_concatv2df"
   [(set (match_operand:V2DF 0 "register_operand"     "=x,x,v,x,v,x,x, v,x,x")
        (vec_concat:V2DF
-         (match_operand:DF 1 "nonimmediate_operand" " 0,x,v,m,m,0,x,xm,0,0")
+         (match_operand:DF 1 "nonimmediate_operand" " 0,x,v,m,m,0,x,vm,0,0")
          (match_operand:DF 2 "nonimm_or_0_operand"  " x,x,v,1,1,m,m, C,x,m")))]
   "TARGET_SSE
    && (!(MEM_P (operands[1]) && MEM_P (operands[2]))
   [(set (match_operand:VF2_512_256 0 "register_operand" "=v")
        (vec_merge:VF2_512_256
          (vec_duplicate:VF2_512_256
-           (match_operand:<ssescalarmode> 2 "general_operand" "xm"))
+           (match_operand:<ssescalarmode> 2 "nonimmediate_operand" "xm"))
          (match_operand:VF2_512_256 1 "const0_operand" "C")
          (const_int 1)))]
   "TARGET_AVX"
    (set_attr "mode" "<sseinsnmode>")])
 
 
+(define_expand "vec_shl_<mode>"
+  [(set (match_dup 3)
+       (ashift:V1TI
+        (match_operand:VI_128 1 "register_operand")
+        (match_operand:SI 2 "const_0_to_255_mul_8_operand")))
+   (set (match_operand:VI_128 0 "register_operand") (match_dup 4))]
+  "TARGET_SSE2"
+{
+  operands[1] = gen_lowpart (V1TImode, operands[1]);
+  operands[3] = gen_reg_rtx (V1TImode);
+  operands[4] = gen_lowpart (<MODE>mode, operands[3]);
+})
+
 (define_expand "vec_shr_<mode>"
   [(set (match_dup 3)
        (lshiftrt:V1TI
   "ix86_fixup_binary_operands_no_copy (EQ, <MODE>mode, operands);")
 
 (define_insn "<avx512>_eq<mode>3<mask_scalar_merge_name>_1"
-  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=Yk,Yk")
+  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k,k")
        (unspec:<avx512fmaskmode>
          [(match_operand:VI12_AVX512VL 1 "nonimm_or_0_operand" "%v,v")
           (match_operand:VI12_AVX512VL 2 "nonimm_or_0_operand" "vm,C")]
    (set_attr "mode" "<sseinsnmode>")])
 
 (define_insn "<avx512>_eq<mode>3<mask_scalar_merge_name>_1"
-  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=Yk,Yk")
+  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k,k")
        (unspec:<avx512fmaskmode>
          [(match_operand:VI48_AVX512VL 1 "nonimm_or_0_operand" "%v,v")
           (match_operand:VI48_AVX512VL 2 "nonimm_or_0_operand" "vm,C")]
    (set_attr "mode" "OI")])
 
 (define_insn "<avx512>_gt<mode>3<mask_scalar_merge_name>"
-  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=Yk")
+  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
        (unspec:<avx512fmaskmode>
          [(match_operand:VI48_AVX512VL 1 "register_operand" "v")
           (match_operand:VI48_AVX512VL 2 "nonimmediate_operand" "vm")] UNSPEC_MASKED_GT))]
    (set_attr "mode" "<sseinsnmode>")])
 
 (define_insn "<avx512>_gt<mode>3<mask_scalar_merge_name>"
-  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=Yk")
+  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
        (unspec:<avx512fmaskmode>
          [(match_operand:VI12_AVX512VL 1 "register_operand" "v")
           (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm")] UNSPEC_MASKED_GT))]
          (match_operand:VI 2 "vector_operand" "xBm,xm,vm")))]
   "TARGET_SSE"
 {
-  static char buf[64];
+  char buf[64];
   const char *ops;
   const char *tmp;
   const char *ssesuffix;
     }
 
   snprintf (buf, sizeof (buf), ops, tmp, ssesuffix);
-  return buf;
+  output_asm_insn (buf, operands);
+  return "";
 }
   [(set_attr "isa" "noavx,avx,avx")
    (set_attr "type" "sselog")
   "TARGET_SSE && <mask_mode512bit_condition>
    && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
 {
-  static char buf[64];
+  char buf[64];
   const char *ops;
   const char *tmp;
   const char *ssesuffix;
     }
 
   snprintf (buf, sizeof (buf), ops, tmp, ssesuffix);
-  return buf;
+  output_asm_insn (buf, operands);
+  return "";
 }
   [(set_attr "isa" "noavx,avx,avx")
    (set_attr "type" "sselog")
          (match_operand:VI12_AVX_AVX512F 2 "vector_operand" "xBm,xm,vm")))]
   "TARGET_SSE && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
 {
-  static char buf[64];
+  char buf[64];
   const char *ops;
   const char *tmp;
   const char *ssesuffix;
     }
 
   snprintf (buf, sizeof (buf), ops, tmp, ssesuffix);
-  return buf;
+  output_asm_insn (buf, operands);
+  return "";
 }
   [(set_attr "isa" "noavx,avx,avx")
    (set_attr "type" "sselog")
    (set_attr "prefix" "evex")
    (set_attr "mode" "<sseinsnmode>")])
 
-(define_insn "<avx512>_testm<mode>3<mask_scalar_merge_name>"
-  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=Yk")
-       (unspec:<avx512fmaskmode>
-        [(match_operand:VI12_AVX512VL 1 "register_operand" "v")
-         (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm")]
-        UNSPEC_TESTM))]
-  "TARGET_AVX512BW"
-  "vptestm<ssemodesuffix>\t{%2, %1, %0<mask_scalar_merge_operand3>|%0<mask_scalar_merge_operand3>, %1, %2}"
-  [(set_attr "prefix" "evex")
-   (set_attr "mode"  "<sseinsnmode>")])
+(define_mode_iterator VI1248_AVX512VLBW
+  [(V64QI "TARGET_AVX512BW") (V32QI "TARGET_AVX512VL && TARGET_AVX512BW")
+   (V16QI "TARGET_AVX512VL && TARGET_AVX512BW")
+   (V32HI "TARGET_AVX512BW") (V16HI "TARGET_AVX512VL && TARGET_AVX512BW")
+   (V8HI "TARGET_AVX512VL && TARGET_AVX512BW")
+   V16SI (V8SI "TARGET_AVX512VL") (V4SI "TARGET_AVX512VL")
+   V8DI (V4DI "TARGET_AVX512VL") (V2DI "TARGET_AVX512VL")])
+
+(define_mode_iterator AVX512ZEXTMASK
+  [(DI "TARGET_AVX512BW") (SI "TARGET_AVX512BW") HI])
 
 (define_insn "<avx512>_testm<mode>3<mask_scalar_merge_name>"
-  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=Yk")
+  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
        (unspec:<avx512fmaskmode>
-        [(match_operand:VI48_AVX512VL 1 "register_operand" "v")
-         (match_operand:VI48_AVX512VL 2 "nonimmediate_operand" "vm")]
+        [(match_operand:VI1248_AVX512VLBW 1 "register_operand" "v")
+         (match_operand:VI1248_AVX512VLBW 2 "nonimmediate_operand" "vm")]
         UNSPEC_TESTM))]
   "TARGET_AVX512F"
   "vptestm<ssemodesuffix>\t{%2, %1, %0<mask_scalar_merge_operand3>|%0<mask_scalar_merge_operand3>, %1, %2}"
    (set_attr "mode"  "<sseinsnmode>")])
 
 (define_insn "<avx512>_testnm<mode>3<mask_scalar_merge_name>"
-  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=Yk")
+  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
        (unspec:<avx512fmaskmode>
-        [(match_operand:VI12_AVX512VL 1 "register_operand" "v")
-         (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm")]
+        [(match_operand:VI1248_AVX512VLBW 1 "register_operand" "v")
+         (match_operand:VI1248_AVX512VLBW 2 "nonimmediate_operand" "vm")]
         UNSPEC_TESTNM))]
-  "TARGET_AVX512BW"
+  "TARGET_AVX512F"
   "vptestnm<ssemodesuffix>\t{%2, %1, %0<mask_scalar_merge_operand3>|%0<mask_scalar_merge_operand3>, %1, %2}"
   [(set_attr "prefix" "evex")
    (set_attr "mode"  "<sseinsnmode>")])
 
-(define_insn "<avx512>_testnm<mode>3<mask_scalar_merge_name>"
-  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=Yk")
-       (unspec:<avx512fmaskmode>
-        [(match_operand:VI48_AVX512VL 1 "register_operand" "v")
-         (match_operand:VI48_AVX512VL 2 "nonimmediate_operand" "vm")]
-        UNSPEC_TESTNM))]
-  "TARGET_AVX512F"
-  "vptestnm<ssemodesuffix>\t{%2, %1, %0<mask_scalar_merge_operand3>|%0<mask_scalar_merge_operand3>, %1, %2}"
+(define_insn "*<avx512>_testm<VI1248_AVX512VLBW:mode>3_zext"
+  [(set (match_operand:AVX512ZEXTMASK 0 "register_operand" "=k")
+       (zero_extend:AVX512ZEXTMASK
+         (unspec:<VI1248_AVX512VLBW:avx512fmaskmode>
+          [(match_operand:VI1248_AVX512VLBW 1 "register_operand" "v")
+           (match_operand:VI1248_AVX512VLBW 2 "nonimmediate_operand" "vm")]
+          UNSPEC_TESTM)))]
+  "TARGET_AVX512BW
+   && (<AVX512ZEXTMASK:MODE_SIZE>
+       > GET_MODE_SIZE (<VI1248_AVX512VLBW:avx512fmaskmode>mode))"
+  "vptestm<VI1248_AVX512VLBW:ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
   [(set_attr "prefix" "evex")
-   (set_attr "mode"  "<sseinsnmode>")])
+   (set_attr "mode"  "<VI1248_AVX512VLBW:sseinsnmode>")])
+
+(define_insn "*<avx512>_testm<VI1248_AVX512VLBW:mode>3_zext_mask"
+  [(set (match_operand:AVX512ZEXTMASK 0 "register_operand" "=k")
+       (zero_extend:AVX512ZEXTMASK
+         (and:<VI1248_AVX512VLBW:avx512fmaskmode>
+           (unspec:<VI1248_AVX512VLBW:avx512fmaskmode>
+            [(match_operand:VI1248_AVX512VLBW 1 "register_operand" "v")
+             (match_operand:VI1248_AVX512VLBW 2 "nonimmediate_operand" "vm")]
+            UNSPEC_TESTM)
+           (match_operand:<VI1248_AVX512VLBW:avx512fmaskmode> 3 "register_operand" "Yk"))))]
+  "TARGET_AVX512BW
+   && (<AVX512ZEXTMASK:MODE_SIZE>
+       > GET_MODE_SIZE (<VI1248_AVX512VLBW:avx512fmaskmode>mode))"
+  "vptestm<VI1248_AVX512VLBW:ssemodesuffix>\t{%2, %1, %0%{%3%}|%0%{%3%}, %1, %2}"
+  [(set_attr "prefix" "evex")
+   (set_attr "mode"  "<VI1248_AVX512VLBW:sseinsnmode>")])
+
+(define_insn "*<avx512>_testnm<VI1248_AVX512VLBW:mode>3_zext"
+  [(set (match_operand:AVX512ZEXTMASK 0 "register_operand" "=k")
+       (zero_extend:AVX512ZEXTMASK
+         (unspec:<VI1248_AVX512VLBW:avx512fmaskmode>
+          [(match_operand:VI1248_AVX512VLBW 1 "register_operand" "v")
+           (match_operand:VI1248_AVX512VLBW 2 "nonimmediate_operand" "vm")]
+          UNSPEC_TESTNM)))]
+  "TARGET_AVX512BW
+   && (<AVX512ZEXTMASK:MODE_SIZE>
+       > GET_MODE_SIZE (<VI1248_AVX512VLBW:avx512fmaskmode>mode))"
+  "vptestnm<VI1248_AVX512VLBW:ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "prefix" "evex")
+   (set_attr "mode"  "<VI1248_AVX512VLBW:sseinsnmode>")])
+
+(define_insn "*<avx512>_testnm<VI1248_AVX512VLBW:mode>3_zext_mask"
+  [(set (match_operand:AVX512ZEXTMASK 0 "register_operand" "=k")
+       (zero_extend:AVX512ZEXTMASK
+         (and:<VI1248_AVX512VLBW:avx512fmaskmode>
+           (unspec:<VI1248_AVX512VLBW:avx512fmaskmode>
+            [(match_operand:VI1248_AVX512VLBW 1 "register_operand" "v")
+             (match_operand:VI1248_AVX512VLBW 2 "nonimmediate_operand" "vm")]
+            UNSPEC_TESTNM)
+           (match_operand:<VI1248_AVX512VLBW:avx512fmaskmode> 3 "register_operand" "Yk"))))]
+  "TARGET_AVX512BW
+   && (<AVX512ZEXTMASK:MODE_SIZE>
+       > GET_MODE_SIZE (<VI1248_AVX512VLBW:avx512fmaskmode>mode))"
+  "vptestnm<VI1248_AVX512VLBW:ssemodesuffix>\t{%2, %1, %0%{%3%}|%0%{%3%}, %1, %2}"
+  [(set_attr "prefix" "evex")
+   (set_attr "mode"  "<VI1248_AVX512VLBW:sseinsnmode>")])
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;
 })
 
 (define_expand "vec_pack_trunc_qi"
-  [(set (match_operand:HI 0 ("register_operand"))
-        (ior:HI (ashift:HI (zero_extend:HI (match_operand:QI 2 ("register_operand")))
+  [(set (match_operand:HI 0 "register_operand")
+       (ior:HI (ashift:HI (zero_extend:HI (match_operand:QI 2 "register_operand"))
                            (const_int 8))
-                (zero_extend:HI (match_operand:QI 1 ("register_operand")))))]
+               (zero_extend:HI (match_operand:QI 1 "register_operand"))))]
   "TARGET_AVX512F")
 
 (define_expand "vec_pack_trunc_<mode>"
-  [(set (match_operand:<DOUBLEMASKMODE> 0 ("register_operand"))
-        (ior:<DOUBLEMASKMODE> (ashift:<DOUBLEMASKMODE> (zero_extend:<DOUBLEMASKMODE> (match_operand:SWI24 2 ("register_operand")))
-                           (match_dup 3))
-                (zero_extend:<DOUBLEMASKMODE> (match_operand:SWI24 1 ("register_operand")))))]
+  [(set (match_operand:<DOUBLEMASKMODE> 0 "register_operand")
+       (ior:<DOUBLEMASKMODE>
+         (ashift:<DOUBLEMASKMODE>
+           (zero_extend:<DOUBLEMASKMODE>
+             (match_operand:SWI24 2 "register_operand"))
+           (match_dup 3))
+         (zero_extend:<DOUBLEMASKMODE>
+           (match_operand:SWI24 1 "register_operand"))))]
   "TARGET_AVX512BW"
 {
   operands[3] = GEN_INT (GET_MODE_BITSIZE (<MODE>mode));
 })
 
+(define_expand "vec_pack_sbool_trunc_qi"
+  [(match_operand:QI 0 "register_operand")
+   (match_operand:QI 1 "register_operand")
+   (match_operand:QI 2 "register_operand")
+   (match_operand:QI 3 "const_int_operand")]
+  "TARGET_AVX512F"
+{
+  HOST_WIDE_INT nunits = INTVAL (operands[3]);
+  rtx mask, tem1, tem2;
+  if (nunits != 8 && nunits != 4)
+    FAIL;
+  mask = gen_reg_rtx (QImode);
+  emit_move_insn (mask, GEN_INT ((1 << (nunits / 2)) - 1));
+  tem1 = gen_reg_rtx (QImode);
+  emit_insn (gen_kandqi (tem1, operands[1], mask));
+  if (TARGET_AVX512DQ)
+    {
+      tem2 = gen_reg_rtx (QImode);
+      emit_insn (gen_kashiftqi (tem2, operands[2],
+                               GEN_INT (nunits / 2)));
+    }
+  else
+    {
+      tem2 = gen_reg_rtx (HImode);
+      emit_insn (gen_kashifthi (tem2, lowpart_subreg (HImode, operands[2],
+                                                     QImode),
+                               GEN_INT (nunits / 2)));
+      tem2 = lowpart_subreg (QImode, tem2, HImode);
+    }
+  emit_insn (gen_kiorqi (operands[0], tem1, tem2));
+  DONE;
+})
+
 (define_insn "<sse2_avx2>_packsswb<mask_name>"
   [(set (match_operand:VI1_AVX512 0 "register_operand" "=x,x,v")
        (vec_concat:VI1_AVX512
          (vec_concat:<ssedoublemode>
            (match_operand:VI8F_256 1 "register_operand" "v")
            (match_operand:VI8F_256 2 "nonimmediate_operand" "vm"))
-         (parallel [(match_operand 3  "const_0_to_3_operand")
-                    (match_operand 4  "const_0_to_3_operand")
-                    (match_operand 5  "const_4_to_7_operand")
-                    (match_operand 6  "const_4_to_7_operand")])))]
+         (parallel [(match_operand 3 "const_0_to_3_operand")
+                    (match_operand 4 "const_0_to_3_operand")
+                    (match_operand 5 "const_4_to_7_operand")
+                    (match_operand 6 "const_4_to_7_operand")])))]
   "TARGET_AVX512VL
-   && (INTVAL (operands[3]) == (INTVAL (operands[4]) - 1)
-       && INTVAL (operands[5]) == (INTVAL (operands[6]) - 1))"
+   && (INTVAL (operands[3]) & 1) == 0
+   && INTVAL (operands[3]) == INTVAL (operands[4]) - 1
+   && (INTVAL (operands[5]) & 1) == 0
+   && INTVAL (operands[5]) == INTVAL (operands[6]) - 1"
 {
   int mask;
   mask = INTVAL (operands[3]) / 2;
          (vec_concat:<ssedoublemode>
            (match_operand:V8FI 1 "register_operand" "v")
            (match_operand:V8FI 2 "nonimmediate_operand" "vm"))
-         (parallel [(match_operand 3  "const_0_to_7_operand")
-                    (match_operand 4  "const_0_to_7_operand")
-                    (match_operand 5  "const_0_to_7_operand")
-                    (match_operand 6  "const_0_to_7_operand")
-                    (match_operand 7  "const_8_to_15_operand")
-                    (match_operand 8  "const_8_to_15_operand")
-                    (match_operand 9  "const_8_to_15_operand")
-                    (match_operand 10  "const_8_to_15_operand")])))]
+         (parallel [(match_operand 3 "const_0_to_7_operand")
+                    (match_operand 4 "const_0_to_7_operand")
+                    (match_operand 5 "const_0_to_7_operand")
+                    (match_operand 6 "const_0_to_7_operand")
+                    (match_operand 7 "const_8_to_15_operand")
+                    (match_operand 8 "const_8_to_15_operand")
+                    (match_operand 9 "const_8_to_15_operand")
+                    (match_operand 10 "const_8_to_15_operand")])))]
   "TARGET_AVX512F
-   && (INTVAL (operands[3]) == (INTVAL (operands[4]) - 1)
-       && INTVAL (operands[5]) == (INTVAL (operands[6]) - 1)
-       && INTVAL (operands[7]) == (INTVAL (operands[8]) - 1)
-       && INTVAL (operands[9]) == (INTVAL (operands[10]) - 1))"
+   && (INTVAL (operands[3]) & 1) == 0
+   && INTVAL (operands[3]) == INTVAL (operands[4]) - 1
+   && (INTVAL (operands[5]) & 1) == 0
+   && INTVAL (operands[5]) == INTVAL (operands[6]) - 1
+   && (INTVAL (operands[7]) & 1) == 0
+   && INTVAL (operands[7]) == INTVAL (operands[8]) - 1
+   && (INTVAL (operands[9]) & 1) == 0
+   && INTVAL (operands[9]) == INTVAL (operands[10]) - 1"
 {
   int mask;
   mask = INTVAL (operands[3]) / 2;
          (vec_concat:<ssedoublemode>
            (match_operand:VI4F_256 1 "register_operand" "v")
            (match_operand:VI4F_256 2 "nonimmediate_operand" "vm"))
-         (parallel [(match_operand 3  "const_0_to_7_operand")
-                    (match_operand 4  "const_0_to_7_operand")
-                    (match_operand 5  "const_0_to_7_operand")
-                    (match_operand 6  "const_0_to_7_operand")
-                    (match_operand 7  "const_8_to_15_operand")
-                    (match_operand 8  "const_8_to_15_operand")
-                    (match_operand 9  "const_8_to_15_operand")
+         (parallel [(match_operand 3 "const_0_to_7_operand")
+                    (match_operand 4 "const_0_to_7_operand")
+                    (match_operand 5 "const_0_to_7_operand")
+                    (match_operand 6 "const_0_to_7_operand")
+                    (match_operand 7 "const_8_to_15_operand")
+                    (match_operand 8 "const_8_to_15_operand")
+                    (match_operand 9 "const_8_to_15_operand")
                     (match_operand 10 "const_8_to_15_operand")])))]
   "TARGET_AVX512VL
-   && (INTVAL (operands[3]) == (INTVAL (operands[4]) - 1)
-       && INTVAL (operands[3]) == (INTVAL (operands[5]) - 2)
-       && INTVAL (operands[3]) == (INTVAL (operands[6]) - 3)
-       && INTVAL (operands[7]) == (INTVAL (operands[8]) - 1)
-       && INTVAL (operands[7]) == (INTVAL (operands[9]) - 2)
-       && INTVAL (operands[7]) == (INTVAL (operands[10]) - 3))"
+   && (INTVAL (operands[3]) & 3) == 0
+   && INTVAL (operands[3]) == INTVAL (operands[4]) - 1
+   && INTVAL (operands[3]) == INTVAL (operands[5]) - 2
+   && INTVAL (operands[3]) == INTVAL (operands[6]) - 3
+   && (INTVAL (operands[7]) & 3) == 0
+   && INTVAL (operands[7]) == INTVAL (operands[8]) - 1
+   && INTVAL (operands[7]) == INTVAL (operands[9]) - 2
+   && INTVAL (operands[7]) == INTVAL (operands[10]) - 3"
 {
   int mask;
   mask = INTVAL (operands[3]) / 4;
          (vec_concat:<ssedoublemode>
            (match_operand:V16FI 1 "register_operand" "v")
            (match_operand:V16FI 2 "nonimmediate_operand" "vm"))
-         (parallel [(match_operand 3  "const_0_to_15_operand")
-                    (match_operand 4  "const_0_to_15_operand")
-                    (match_operand 5  "const_0_to_15_operand")
-                    (match_operand 6  "const_0_to_15_operand")
-                    (match_operand 7  "const_0_to_15_operand")
-                    (match_operand 8  "const_0_to_15_operand")
-                    (match_operand 9  "const_0_to_15_operand")
-                    (match_operand 10  "const_0_to_15_operand")
-                    (match_operand 11  "const_16_to_31_operand")
-                    (match_operand 12  "const_16_to_31_operand")
-                    (match_operand 13  "const_16_to_31_operand")
-                    (match_operand 14  "const_16_to_31_operand")
-                    (match_operand 15  "const_16_to_31_operand")
-                    (match_operand 16  "const_16_to_31_operand")
-                    (match_operand 17  "const_16_to_31_operand")
-                    (match_operand 18  "const_16_to_31_operand")])))]
+         (parallel [(match_operand 3 "const_0_to_15_operand")
+                    (match_operand 4 "const_0_to_15_operand")
+                    (match_operand 5 "const_0_to_15_operand")
+                    (match_operand 6 "const_0_to_15_operand")
+                    (match_operand 7 "const_0_to_15_operand")
+                    (match_operand 8 "const_0_to_15_operand")
+                    (match_operand 9 "const_0_to_15_operand")
+                    (match_operand 10 "const_0_to_15_operand")
+                    (match_operand 11 "const_16_to_31_operand")
+                    (match_operand 12 "const_16_to_31_operand")
+                    (match_operand 13 "const_16_to_31_operand")
+                    (match_operand 14 "const_16_to_31_operand")
+                    (match_operand 15 "const_16_to_31_operand")
+                    (match_operand 16 "const_16_to_31_operand")
+                    (match_operand 17 "const_16_to_31_operand")
+                    (match_operand 18 "const_16_to_31_operand")])))]
   "TARGET_AVX512F
-   && (INTVAL (operands[3]) == (INTVAL (operands[4]) - 1)
-       && INTVAL (operands[3]) == (INTVAL (operands[5]) - 2)
-       && INTVAL (operands[3]) == (INTVAL (operands[6]) - 3)
-       && INTVAL (operands[7]) == (INTVAL (operands[8]) - 1)
-       && INTVAL (operands[7]) == (INTVAL (operands[9]) - 2)
-       && INTVAL (operands[7]) == (INTVAL (operands[10]) - 3)
-       && INTVAL (operands[11]) == (INTVAL (operands[12]) - 1)
-       && INTVAL (operands[11]) == (INTVAL (operands[13]) - 2)
-       && INTVAL (operands[11]) == (INTVAL (operands[14]) - 3)
-       && INTVAL (operands[15]) == (INTVAL (operands[16]) - 1)
-       && INTVAL (operands[15]) == (INTVAL (operands[17]) - 2)
-       && INTVAL (operands[15]) == (INTVAL (operands[18]) - 3))"
+   && (INTVAL (operands[3]) & 3) == 0
+   && INTVAL (operands[3]) == INTVAL (operands[4]) - 1
+   && INTVAL (operands[3]) == INTVAL (operands[5]) - 2
+   && INTVAL (operands[3]) == INTVAL (operands[6]) - 3
+   && (INTVAL (operands[7]) & 3) == 0
+   && INTVAL (operands[7]) == INTVAL (operands[8]) - 1
+   && INTVAL (operands[7]) == INTVAL (operands[9]) - 2
+   && INTVAL (operands[7]) == INTVAL (operands[10]) - 3
+   && (INTVAL (operands[11]) & 3) == 0
+   && INTVAL (operands[11]) == INTVAL (operands[12]) - 1
+   && INTVAL (operands[11]) == INTVAL (operands[13]) - 2
+   && INTVAL (operands[11]) == INTVAL (operands[14]) - 3
+   && (INTVAL (operands[15]) & 3) == 0
+   && INTVAL (operands[15]) == INTVAL (operands[16]) - 1
+   && INTVAL (operands[15]) == INTVAL (operands[17]) - 2
+   && INTVAL (operands[15]) == INTVAL (operands[18]) - 3"
 {
   int mask;
   mask = INTVAL (operands[3]) / 4;
           (symbol_ref "true")))])
 
 (define_insn "*vec_extractv2di_0_sse"
-  [(set (match_operand:DI 0 "nonimmediate_operand"     "=v,m")
+  [(set (match_operand:DI 0 "nonimmediate_operand"     "=r,x ,m")
        (vec_select:DI
-         (match_operand:V2DI 1 "nonimmediate_operand" "vm,v")
+         (match_operand:V2DI 1 "nonimmediate_operand" " x,xm,x")
          (parallel [(const_int 0)])))]
   "TARGET_SSE && !TARGET_64BIT
    && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
-  "#")
+  "#"
+  [(set_attr "isa" "sse4,*,*")
+   (set (attr "preferred_for_speed")
+     (cond [(eq_attr "alternative" "0")
+             (symbol_ref "TARGET_INTER_UNIT_MOVES_FROM_VEC")
+          ]
+          (symbol_ref "true")))])
+
+(define_split
+  [(set (match_operand:DI 0 "general_reg_operand")
+       (vec_select:DI
+         (match_operand:V2DI 1 "register_operand")
+         (parallel [(const_int 0)])))]
+  "TARGET_SSE4_1 && !TARGET_64BIT
+   && reload_completed"
+  [(set (match_dup 2) (match_dup 4))
+   (set (match_dup 3)
+       (vec_select:SI
+         (match_dup 5)
+         (parallel [(const_int 1)])))]
+{
+  operands[4] = gen_lowpart (SImode, operands[1]);
+  operands[5] = gen_lowpart (V4SImode, operands[1]);
+  split_double_mode (DImode, &operands[0], 1, &operands[2], &operands[3]);
+})
 
 (define_split
   [(set (match_operand:SWI48x 0 "nonimmediate_operand")
    punpckldq\t{%2, %0|%0, %2}
    movd\t{%1, %0|%0, %1}"
   [(set_attr "isa" "noavx,noavx,avx,avx512dq,noavx,noavx,avx,*,*,*")
+   (set (attr "mmx_isa")
+     (if_then_else (eq_attr "alternative" "8,9")
+                  (const_string "native")
+                  (const_string "*")))
    (set (attr "type")
      (cond [(eq_attr "alternative" "7")
              (const_string "ssemov")
 ;; nonimmediate_operand for operand 2 and *not* allowing memory for the SSE
 ;; alternatives pretty much forces the MMX alternative to be chosen.
 (define_insn "*vec_concatv2si"
-  [(set (match_operand:V2SI 0 "register_operand"     "=x,x ,*y,x,x,*y,*y")
+  [(set (match_operand:V2SI 0 "register_operand"     "=x,x ,x,x,*y,*y")
        (vec_concat:V2SI
-         (match_operand:SI 1 "nonimmediate_operand" " 0,rm,rm,0,m, 0,*rm")
-         (match_operand:SI 2 "reg_or_0_operand"     " x,C ,C, x,C,*y,C")))]
+         (match_operand:SI 1 "nonimmediate_operand" " 0,rm,0,m, 0,rm")
+         (match_operand:SI 2 "reg_or_0_operand"     " x,C ,x,C,*y,C")))]
   "TARGET_SSE && !TARGET_SSE4_1"
   "@
    punpckldq\t{%2, %0|%0, %2}
    movd\t{%1, %0|%0, %1}
-   movd\t{%1, %0|%0, %1}
    unpcklps\t{%2, %0|%0, %2}
    movss\t{%1, %0|%0, %1}
    punpckldq\t{%2, %0|%0, %2}
    movd\t{%1, %0|%0, %1}"
-  [(set_attr "isa" "sse2,sse2,sse2,*,*,*,*")
-   (set_attr "type" "sselog,ssemov,mmxmov,sselog,ssemov,mmxcvt,mmxmov")
-   (set_attr "mode" "TI,TI,DI,V4SF,SF,DI,DI")])
+  [(set_attr "isa" "sse2,sse2,*,*,*,*")
+   (set_attr "mmx_isa" "*,*,*,*,native,native")
+   (set_attr "type" "sselog,ssemov,sselog,ssemov,mmxcvt,mmxmov")
+   (set_attr "mode" "TI,TI,V4SF,SF,DI,DI")])
 
 (define_insn "*vec_concatv4si"
   [(set (match_operand:V4SI 0 "register_operand"       "=x,v,x,x,v")
    (set_attr "prefix" "orig,maybe_evex,orig,orig,maybe_evex")
    (set_attr "mode" "TI,TI,V4SF,V2SF,V2SF")])
 
-;; movd instead of movq is required to handle broken assemblers.
+(define_insn "*vec_concatv4si_0"
+  [(set (match_operand:V4SI 0 "register_operand"       "=v,x")
+       (vec_concat:V4SI
+         (match_operand:V2SI 1 "nonimmediate_operand" "vm,?!*y")
+         (match_operand:V2SI 2 "const0_operand"       " C,C")))]
+  "TARGET_SSE2"
+  "@
+   %vmovq\t{%1, %0|%0, %1}
+   movq2dq\t{%1, %0|%0, %1}"
+  [(set_attr "mmx_isa" "*,native")
+   (set_attr "type" "ssemov")
+   (set_attr "prefix" "maybe_vex,orig")
+   (set_attr "mode" "TI")])
+
 (define_insn "vec_concatv2di"
   [(set (match_operand:V2DI 0 "register_operand"
-         "=Yr,*x,x ,v ,v,v ,x   ,x,v ,x,x,v")
+         "=Yr,*x,x ,v ,x,v ,x,x,v")
        (vec_concat:V2DI
-         (match_operand:DI 1 "nonimmediate_operand"
-         "  0, 0,x ,Yv,r,vm,?!*y,0,Yv,0,0,v")
-         (match_operand:DI 2 "nonimm_or_0_operand"
-         " rm,rm,rm,rm,C ,C ,C ,x,Yv,x,m,m")))]
+         (match_operand:DI 1 "register_operand"
+         "  0, 0,x ,Yv,0,Yv,0,0,v")
+         (match_operand:DI 2 "nonimmediate_operand"
+         " rm,rm,rm,rm,x,Yv,x,m,m")))]
   "TARGET_SSE"
   "@
    pinsrq\t{$1, %2, %0|%0, %2, 1}
    pinsrq\t{$1, %2, %0|%0, %2, 1}
    vpinsrq\t{$1, %2, %1, %0|%0, %1, %2, 1}
    vpinsrq\t{$1, %2, %1, %0|%0, %1, %2, 1}
-   * return HAVE_AS_IX86_INTERUNIT_MOVQ ? \"%vmovq\t{%1, %0|%0, %1}\" : \"%vmovd\t{%1, %0|%0, %1}\";
-   %vmovq\t{%1, %0|%0, %1}
-   movq2dq\t{%1, %0|%0, %1}
    punpcklqdq\t{%2, %0|%0, %2}
    vpunpcklqdq\t{%2, %1, %0|%0, %1, %2}
    movlhps\t{%2, %0|%0, %2}
            (eq_attr "alternative" "3")
              (const_string "x64_avx512dq")
            (eq_attr "alternative" "4")
-             (const_string "x64_sse2")
-           (eq_attr "alternative" "5,6")
-             (const_string "sse2")
-           (eq_attr "alternative" "7")
              (const_string "sse2_noavx")
-           (eq_attr "alternative" "8,11")
+           (eq_attr "alternative" "5,8")
              (const_string "avx")
           ]
           (const_string "noavx")))
    (set (attr "type")
      (if_then_else
-       (eq_attr "alternative" "0,1,2,3,7,8")
+       (eq_attr "alternative" "0,1,2,3,4,5")
        (const_string "sselog")
        (const_string "ssemov")))
    (set (attr "prefix_rex")
-     (if_then_else (eq_attr "alternative" "0,1,2,3,4")
+     (if_then_else (eq_attr "alternative" "0,1,2,3")
                   (const_string "1")
                   (const_string "*")))
    (set (attr "prefix_extra")
              (const_string "vex")
            (eq_attr "alternative" "3")
              (const_string "evex")
-           (eq_attr "alternative" "4,5")
-             (const_string "maybe_vex")
-           (eq_attr "alternative" "8,11")
+           (eq_attr "alternative" "5,8")
              (const_string "maybe_evex")
           ]
           (const_string "orig")))
-   (set_attr "mode" "TI,TI,TI,TI,TI,TI,TI,TI,TI,V4SF,V2SF,V2SF")
+   (set_attr "mode" "TI,TI,TI,TI,TI,TI,V4SF,V2SF,V2SF")])
+
+(define_insn "*vec_concatv2di_0"
+  [(set (match_operand:V2DI 0 "register_operand"     "=v,v ,x")
+       (vec_concat:V2DI
+         (match_operand:DI 1 "nonimmediate_operand" " r,vm,?!*y")
+         (match_operand:DI 2 "const0_operand"       " C,C ,C")))]
+  "TARGET_SSE2"
+  "@
+   * return HAVE_AS_IX86_INTERUNIT_MOVQ ? \"%vmovq\t{%1, %0|%0, %1}\" : \"%vmovd\t{%1, %0|%0, %1}\";
+   %vmovq\t{%1, %0|%0, %1}
+   movq2dq\t{%1, %0|%0, %1}"
+  [(set_attr "isa" "x64,*,*")
+   (set_attr "mmx_isa" "*,*,native")
+   (set_attr "type" "ssemov")
+   (set_attr "prefix_rex" "1,*,*")
+   (set_attr "prefix" "maybe_vex,maybe_vex,orig")
+   (set_attr "mode" "TI")
    (set (attr "preferred_for_speed")
-     (cond [(eq_attr "alternative" "4")
+     (cond [(eq_attr "alternative" "0")
              (symbol_ref "TARGET_INTER_UNIT_MOVES_TO_VEC")
-           (eq_attr "alternative" "6")
-             (symbol_ref "TARGET_INTER_UNIT_MOVES_FROM_VEC")
           ]
           (symbol_ref "true")))])
 
   [(set (match_operand:VI8_AVX_AVX512F 0 "register_operand" "=v,v")
        (vec_merge:VI8_AVX_AVX512F
          (vec_duplicate:VI8_AVX_AVX512F
-           (match_operand:<ssescalarmode> 2 "general_operand" "r,vm"))
+           (match_operand:<ssescalarmode> 2 "nonimmediate_operand" "r,vm"))
          (match_operand:VI8_AVX_AVX512F 1 "const0_operand" "C,C")
          (const_int 1)))]
   "TARGET_AVX"
   "TARGET_SSE2"
   "ix86_expand_sse_unpack (operands[0], operands[1], true, false); DONE;")
 
+(define_expand "vec_unpacks_sbool_lo_qi"
+  [(match_operand:QI 0 "register_operand")
+   (match_operand:QI 1 "register_operand")
+   (match_operand:QI 2 "const_int_operand")]
+  "TARGET_AVX512F"
+{
+  if (INTVAL (operands[2]) != 8 && INTVAL (operands[2]) != 4)
+    FAIL;
+  emit_move_insn (operands[0], operands[1]);
+  DONE;
+})
+
 (define_expand "vec_unpacks_lo_hi"
   [(set (subreg:HI (match_operand:QI 0 "register_operand") 0)
         (match_operand:HI 1 "register_operand"))]
   "TARGET_SSE2"
   "ix86_expand_sse_unpack (operands[0], operands[1], true, true); DONE;")
 
+(define_expand "vec_unpacks_sbool_hi_qi"
+  [(match_operand:QI 0 "register_operand")
+   (match_operand:QI 1 "register_operand")
+   (match_operand:QI 2 "const_int_operand")]
+  "TARGET_AVX512F"
+{
+  HOST_WIDE_INT nunits = INTVAL (operands[2]);
+  if (nunits != 8 && nunits != 4)
+    FAIL;
+  if (TARGET_AVX512DQ)
+    emit_insn (gen_klshiftrtqi (operands[0], operands[1],
+                               GEN_INT (nunits / 2)));
+  else
+    {
+      rtx tem = gen_reg_rtx (HImode);
+      emit_insn (gen_klshiftrthi (tem, lowpart_subreg (HImode, operands[1],
+                                                      QImode),
+                                 GEN_INT (nunits / 2)));
+      emit_move_insn (operands[0], lowpart_subreg (QImode, tem, HImode));
+    }
+  DONE;
+})
+
 (define_expand "vec_unpacks_hi_hi"
   [(parallel
      [(set (subreg:HI (match_operand:QI 0 "register_operand") 0)
    (set_attr "prefix" "maybe_vex")
    (set_attr "mode" "<MODE>")])
 
+(define_insn_and_split "*<sse>_movmsk<ssemodesuffix><avxsizesuffix>_lt"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+       (unspec:SI
+         [(lt:VF_128_256
+            (match_operand:<sseintvecmode> 1 "register_operand" "x")
+            (match_operand:<sseintvecmode> 2 "const0_operand" "C"))]
+         UNSPEC_MOVMSK))]
+  "TARGET_SSE"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0)
+       (unspec:SI [(match_dup 1)] UNSPEC_MOVMSK))]
+  "operands[1] = gen_lowpart (<MODE>mode, operands[1]);"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn_and_split "*<sse>_movmsk<ssemodesuffix><avxsizesuffix>_zext_lt"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+       (zero_extend:DI
+         (unspec:SI
+           [(lt:VF_128_256
+              (match_operand:<sseintvecmode> 1 "register_operand" "x")
+              (match_operand:<sseintvecmode> 2 "const0_operand" "C"))]
+           UNSPEC_MOVMSK)))]
+  "TARGET_64BIT && TARGET_SSE"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0)
+       (zero_extend:DI (unspec:SI [(match_dup 1)] UNSPEC_MOVMSK)))]
+  "operands[1] = gen_lowpart (<MODE>mode, operands[1]);"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn_and_split "*<sse>_movmsk<ssemodesuffix><avxsizesuffix>_shift"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+       (unspec:SI
+         [(subreg:VF_128_256
+            (ashiftrt:<sseintvecmode>
+              (match_operand:<sseintvecmode> 1 "register_operand" "x")
+              (match_operand:QI 2 "const_int_operand" "n")) 0)]
+         UNSPEC_MOVMSK))]
+  "TARGET_SSE"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0)
+       (unspec:SI [(match_dup 1)] UNSPEC_MOVMSK))]
+  "operands[1] = gen_lowpart (<MODE>mode, operands[1]);"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn_and_split "*<sse>_movmsk<ssemodesuffix><avxsizesuffix>_zext_shift"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+       (zero_extend:DI
+         (unspec:SI
+           [(subreg:VF_128_256
+              (ashiftrt:<sseintvecmode>
+                (match_operand:<sseintvecmode> 1 "register_operand" "x")
+              (match_operand:QI 2 "const_int_operand" "n")) 0)]
+           UNSPEC_MOVMSK)))]
+  "TARGET_64BIT && TARGET_SSE"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0)
+       (zero_extend:DI (unspec:SI [(match_dup 1)] UNSPEC_MOVMSK)))]
+  "operands[1] = gen_lowpart (<MODE>mode, operands[1]);"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "<MODE>")])
+
 (define_insn "<sse2_avx2>_pmovmskb"
   [(set (match_operand:SI 0 "register_operand" "=r")
        (unspec:SI
    (set_attr "prefix" "maybe_vex")
    (set_attr "mode" "SI")])
 
+(define_insn_and_split "*<sse2_avx2>_pmovmskb_lt"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+       (unspec:SI
+         [(lt:VI1_AVX2 (match_operand:VI1_AVX2 1 "register_operand" "x")
+                       (match_operand:VI1_AVX2 2 "const0_operand" "C"))]
+         UNSPEC_MOVMSK))]
+  "TARGET_SSE2"
+  "#"
+  ""
+  [(set (match_dup 0)
+       (unspec:SI [(match_dup 1)] UNSPEC_MOVMSK))]
+  ""
+  [(set_attr "type" "ssemov")
+   (set (attr "prefix_data16")
+     (if_then_else
+       (match_test "TARGET_AVX")
+     (const_string "*")
+     (const_string "1")))
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "SI")])
+
+(define_insn_and_split "*<sse2_avx2>_pmovmskb_zext_lt"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+       (zero_extend:DI
+         (unspec:SI
+           [(lt:VI1_AVX2 (match_operand:VI1_AVX2 1 "register_operand" "x")
+                         (match_operand:VI1_AVX2 2 "const0_operand" "C"))]
+           UNSPEC_MOVMSK)))]
+  "TARGET_64BIT && TARGET_SSE2"
+  "#"
+  ""
+  [(set (match_dup 0)
+       (zero_extend:DI (unspec:SI [(match_dup 1)] UNSPEC_MOVMSK)))]
+  ""
+  [(set_attr "type" "ssemov")
+   (set (attr "prefix_data16")
+     (if_then_else
+       (match_test "TARGET_AVX")
+     (const_string "*")
+     (const_string "1")))
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "SI")])
+
 (define_expand "sse2_maskmovdqu"
   [(set (match_operand:V16QI 0 "memory_operand")
        (unspec:V16QI [(match_operand:V16QI 1 "register_operand")
   "mwait"
   [(set_attr "length" "3")])
 
-(define_insn "sse3_monitor_<mode>"
+(define_insn "@sse3_monitor_<mode>"
   [(unspec_volatile [(match_operand:P 0 "register_operand" "a")
                     (match_operand:SI 1 "register_operand" "c")
                     (match_operand:SI 2 "register_operand" "d")]
    (set_attr "prefix" "orig,vex")
    (set_attr "mode" "TI")])
 
-(define_insn "ssse3_ph<plusminus_mnemonic>wv4hi3"
-  [(set (match_operand:V4HI 0 "register_operand" "=y")
+(define_insn_and_split "ssse3_ph<plusminus_mnemonic>wv4hi3"
+  [(set (match_operand:V4HI 0 "register_operand" "=y,x,Yv")
        (vec_concat:V4HI
          (vec_concat:V2HI
            (ssse3_plusminus:HI
              (vec_select:HI
-               (match_operand:V4HI 1 "register_operand" "0")
+               (match_operand:V4HI 1 "register_operand" "0,0,Yv")
                (parallel [(const_int 0)]))
              (vec_select:HI (match_dup 1) (parallel [(const_int 1)])))
            (ssse3_plusminus:HI
          (vec_concat:V2HI
            (ssse3_plusminus:HI
              (vec_select:HI
-               (match_operand:V4HI 2 "nonimmediate_operand" "ym")
+               (match_operand:V4HI 2 "register_mmxmem_operand" "ym,x,Yv")
                (parallel [(const_int 0)]))
              (vec_select:HI (match_dup 2) (parallel [(const_int 1)])))
            (ssse3_plusminus:HI
              (vec_select:HI (match_dup 2) (parallel [(const_int 2)]))
              (vec_select:HI (match_dup 2) (parallel [(const_int 3)]))))))]
-  "TARGET_SSSE3"
-  "ph<plusminus_mnemonic>w\t{%2, %0|%0, %2}"
-  [(set_attr "type" "sseiadd")
+  "(TARGET_MMX || TARGET_MMX_WITH_SSE) && TARGET_SSSE3"
+  "@
+   ph<plusminus_mnemonic>w\t{%2, %0|%0, %2}
+   #
+   #"
+  "TARGET_MMX_WITH_SSE && reload_completed"
+  [(const_int 0)]
+{
+  /* Generate SSE version of the operation.  */
+  rtx op0 = lowpart_subreg (V8HImode, operands[0],
+                           GET_MODE (operands[0]));
+  rtx op1 = lowpart_subreg (V8HImode, operands[1],
+                           GET_MODE (operands[1]));
+  rtx op2 = lowpart_subreg (V8HImode, operands[2],
+                           GET_MODE (operands[2]));
+  emit_insn (gen_ssse3_ph<plusminus_mnemonic>wv8hi3 (op0, op1, op2));
+  ix86_move_vector_high_sse_to_mmx (op0);
+  DONE;
+}
+  [(set_attr "mmx_isa" "native,x64_noavx,x64_avx")
+   (set_attr "type" "sseiadd")
    (set_attr "atom_unit" "complex")
    (set_attr "prefix_extra" "1")
    (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)"))
-   (set_attr "mode" "DI")])
+   (set_attr "mode" "DI,TI,TI")])
 
 (define_insn "avx2_ph<plusminus_mnemonic>dv8si3"
   [(set (match_operand:V8SI 0 "register_operand" "=x")
    (set_attr "prefix" "orig,vex")
    (set_attr "mode" "TI")])
 
-(define_insn "ssse3_ph<plusminus_mnemonic>dv2si3"
-  [(set (match_operand:V2SI 0 "register_operand" "=y")
+(define_insn_and_split "ssse3_ph<plusminus_mnemonic>dv2si3"
+  [(set (match_operand:V2SI 0 "register_operand" "=y,x,Yv")
        (vec_concat:V2SI
          (plusminus:SI
            (vec_select:SI
-             (match_operand:V2SI 1 "register_operand" "0")
+             (match_operand:V2SI 1 "register_operand" "0,0,Yv")
              (parallel [(const_int 0)]))
            (vec_select:SI (match_dup 1) (parallel [(const_int 1)])))
          (plusminus:SI
            (vec_select:SI
-             (match_operand:V2SI 2 "nonimmediate_operand" "ym")
+             (match_operand:V2SI 2 "register_mmxmem_operand" "ym,x,Yv")
              (parallel [(const_int 0)]))
            (vec_select:SI (match_dup 2) (parallel [(const_int 1)])))))]
-  "TARGET_SSSE3"
-  "ph<plusminus_mnemonic>d\t{%2, %0|%0, %2}"
-  [(set_attr "type" "sseiadd")
+  "(TARGET_MMX || TARGET_MMX_WITH_SSE) && TARGET_SSSE3"
+  "@
+   ph<plusminus_mnemonic>d\t{%2, %0|%0, %2}
+   #
+   #"
+  "TARGET_MMX_WITH_SSE && reload_completed"
+  [(const_int 0)]
+{
+  /* Generate SSE version of the operation.  */
+  rtx op0 = lowpart_subreg (V4SImode, operands[0],
+                           GET_MODE (operands[0]));
+  rtx op1 = lowpart_subreg (V4SImode, operands[1],
+                           GET_MODE (operands[1]));
+  rtx op2 = lowpart_subreg (V4SImode, operands[2],
+                           GET_MODE (operands[2]));
+  emit_insn (gen_ssse3_ph<plusminus_mnemonic>dv4si3 (op0, op1, op2));
+  ix86_move_vector_high_sse_to_mmx (op0);
+  DONE;
+}
+  [(set_attr "mmx_isa" "native,x64_noavx,x64_avx")
+   (set_attr "type" "sseiadd")
    (set_attr "atom_unit" "complex")
    (set_attr "prefix_extra" "1")
    (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)"))
-   (set_attr "mode" "DI")])
+   (set_attr "mode" "DI,TI,TI")])
 
 (define_insn "avx2_pmaddubsw256"
   [(set (match_operand:V16HI 0 "register_operand" "=x,v")
    (set_attr "mode" "TI")])
 
 (define_insn "ssse3_pmaddubsw"
-  [(set (match_operand:V4HI 0 "register_operand" "=y")
+  [(set (match_operand:V4HI 0 "register_operand" "=y,x,Yv")
        (ss_plus:V4HI
          (mult:V4HI
            (zero_extend:V4HI
              (vec_select:V4QI
-               (match_operand:V8QI 1 "register_operand" "0")
+               (match_operand:V8QI 1 "register_operand" "0,0,Yv")
                (parallel [(const_int 0) (const_int 2)
                           (const_int 4) (const_int 6)])))
            (sign_extend:V4HI
              (vec_select:V4QI
-               (match_operand:V8QI 2 "nonimmediate_operand" "ym")
+               (match_operand:V8QI 2 "register_mmxmem_operand" "ym,x,Yv")
                (parallel [(const_int 0) (const_int 2)
                           (const_int 4) (const_int 6)]))))
          (mult:V4HI
              (vec_select:V4QI (match_dup 2)
                (parallel [(const_int 1) (const_int 3)
                           (const_int 5) (const_int 7)]))))))]
-  "TARGET_SSSE3"
-  "pmaddubsw\t{%2, %0|%0, %2}"
-  [(set_attr "type" "sseiadd")
+  "(TARGET_MMX || TARGET_MMX_WITH_SSE) && TARGET_SSSE3"
+  "@
+   pmaddubsw\t{%2, %0|%0, %2}
+   pmaddubsw\t{%2, %0|%0, %2}
+   vpmaddubsw\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "mmx_isa" "native,x64_noavx,x64_avx")
+   (set_attr "type" "sseiadd")
    (set_attr "atom_unit" "simul")
    (set_attr "prefix_extra" "1")
    (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)"))
-   (set_attr "mode" "DI")])
+   (set_attr "mode" "DI,TI,TI")])
 
 (define_mode_iterator PMULHRSW
-  [V4HI V8HI (V16HI "TARGET_AVX2")])
+  [V8HI (V16HI "TARGET_AVX2")])
 
 (define_expand "<ssse3_avx2>_pmulhrsw<mode>3_mask"
   [(set (match_operand:PMULHRSW 0 "register_operand")
                (const_int 14))
              (match_dup 3))
            (const_int 1))))]
-  "TARGET_AVX2"
+  "TARGET_SSSE3"
 {
   operands[3] = CONST1_RTX(<MODE>mode);
   ix86_fixup_binary_operands_no_copy (MULT, <MODE>mode, operands);
    (set_attr "prefix" "orig,maybe_evex,evex")
    (set_attr "mode" "<sseinsnmode>")])
 
+(define_expand "ssse3_pmulhrswv4hi3"
+  [(set (match_operand:V4HI 0 "register_operand")
+       (truncate:V4HI
+         (lshiftrt:V4SI
+           (plus:V4SI
+             (lshiftrt:V4SI
+               (mult:V4SI
+                 (sign_extend:V4SI
+                   (match_operand:V4HI 1 "register_mmxmem_operand"))
+                 (sign_extend:V4SI
+                   (match_operand:V4HI 2 "register_mmxmem_operand")))
+               (const_int 14))
+             (match_dup 3))
+           (const_int 1))))]
+  "(TARGET_MMX || TARGET_MMX_WITH_SSE) && TARGET_SSSE3"
+{
+  operands[3] = CONST1_RTX(V4HImode);
+  ix86_fixup_binary_operands_no_copy (MULT, V4HImode, operands);
+})
+
 (define_insn "*ssse3_pmulhrswv4hi3"
-  [(set (match_operand:V4HI 0 "register_operand" "=y")
+  [(set (match_operand:V4HI 0 "register_operand" "=y,x,Yv")
        (truncate:V4HI
          (lshiftrt:V4SI
            (plus:V4SI
              (lshiftrt:V4SI
                (mult:V4SI
                  (sign_extend:V4SI
-                   (match_operand:V4HI 1 "nonimmediate_operand" "%0"))
+                   (match_operand:V4HI 1 "register_mmxmem_operand" "%0,0,Yv"))
                  (sign_extend:V4SI
-                   (match_operand:V4HI 2 "nonimmediate_operand" "ym")))
+                   (match_operand:V4HI 2 "register_mmxmem_operand" "ym,x,Yv")))
                (const_int 14))
              (match_operand:V4HI 3 "const1_operand"))
            (const_int 1))))]
-  "TARGET_SSSE3 && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
-  "pmulhrsw\t{%2, %0|%0, %2}"
-  [(set_attr "type" "sseimul")
+  "(TARGET_MMX || TARGET_MMX_WITH_SSE)
+   && TARGET_SSSE3
+   && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
+  "@
+   pmulhrsw\t{%2, %0|%0, %2}
+   pmulhrsw\t{%2, %0|%0, %2}
+   vpmulhrsw\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "mmx_isa" "native,x64_noavx,x64_avx")
+   (set_attr "type" "sseimul")
    (set_attr "prefix_extra" "1")
    (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)"))
-   (set_attr "mode" "DI")])
+   (set_attr "mode" "DI,TI,TI")])
 
 (define_insn "<ssse3_avx2>_pshufb<mode>3<mask_name>"
   [(set (match_operand:VI1_AVX512 0 "register_operand" "=x,x,v")
    (set_attr "btver2_decode" "vector")
    (set_attr "mode" "<sseinsnmode>")])
 
-(define_insn "ssse3_pshufbv8qi3"
-  [(set (match_operand:V8QI 0 "register_operand" "=y")
-       (unspec:V8QI [(match_operand:V8QI 1 "register_operand" "0")
-                     (match_operand:V8QI 2 "nonimmediate_operand" "ym")]
-                    UNSPEC_PSHUFB))]
-  "TARGET_SSSE3"
-  "pshufb\t{%2, %0|%0, %2}";
-  [(set_attr "type" "sselog1")
+(define_insn_and_split "ssse3_pshufbv8qi3"
+  [(set (match_operand:V8QI 0 "register_operand" "=y,x,Yv")
+       (unspec:V8QI [(match_operand:V8QI 1 "register_operand" "0,0,Yv")
+                     (match_operand:V8QI 2 "register_mmxmem_operand" "ym,x,Yv")]
+                    UNSPEC_PSHUFB))
+   (clobber (match_scratch:V4SI 3 "=X,x,Yv"))]
+  "(TARGET_MMX || TARGET_MMX_WITH_SSE) && TARGET_SSSE3"
+  "@
+   pshufb\t{%2, %0|%0, %2}
+   #
+   #"
+  "TARGET_MMX_WITH_SSE && reload_completed"
+  [(set (match_dup 3) (match_dup 5))
+   (set (match_dup 3)
+       (and:V4SI (match_dup 3) (match_dup 2)))
+   (set (match_dup 0)
+       (unspec:V16QI [(match_dup 1) (match_dup 4)] UNSPEC_PSHUFB))]
+{
+  /* Emulate MMX version of pshufb with SSE version by masking out the
+     bit 3 of the shuffle control byte.  */
+  operands[0] = lowpart_subreg (V16QImode, operands[0],
+                               GET_MODE (operands[0]));
+  operands[1] = lowpart_subreg (V16QImode, operands[1],
+                               GET_MODE (operands[1]));
+  operands[2] = lowpart_subreg (V4SImode, operands[2],
+                               GET_MODE (operands[2]));
+  operands[4] = lowpart_subreg (V16QImode, operands[3],
+                               GET_MODE (operands[3]));
+  rtvec par = gen_rtvec (4, GEN_INT (0xf7f7f7f7),
+                        GEN_INT (0xf7f7f7f7),
+                        GEN_INT (0xf7f7f7f7),
+                        GEN_INT (0xf7f7f7f7));
+  rtx vec_const = gen_rtx_CONST_VECTOR (V4SImode, par);
+  operands[5] = force_const_mem (V4SImode, vec_const);
+}
+  [(set_attr "mmx_isa" "native,x64_noavx,x64_avx")
    (set_attr "prefix_extra" "1")
    (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)"))
-   (set_attr "mode" "DI")])
+   (set_attr "mode" "DI,TI,TI")])
 
 (define_insn "<ssse3_avx2>_psign<mode>3"
   [(set (match_operand:VI124_AVX2 0 "register_operand" "=x,x")
    (set_attr "mode" "<sseinsnmode>")])
 
 (define_insn "ssse3_psign<mode>3"
-  [(set (match_operand:MMXMODEI 0 "register_operand" "=y")
+  [(set (match_operand:MMXMODEI 0 "register_operand" "=y,x,Yv")
        (unspec:MMXMODEI
-         [(match_operand:MMXMODEI 1 "register_operand" "0")
-          (match_operand:MMXMODEI 2 "nonimmediate_operand" "ym")]
+         [(match_operand:MMXMODEI 1 "register_operand" "0,0,Yv")
+          (match_operand:MMXMODEI 2 "register_mmxmem_operand" "ym,x,Yv")]
          UNSPEC_PSIGN))]
-  "TARGET_SSSE3"
-  "psign<mmxvecsize>\t{%2, %0|%0, %2}";
-  [(set_attr "type" "sselog1")
+  "(TARGET_MMX || TARGET_MMX_WITH_SSE) && TARGET_SSSE3"
+  "@
+   psign<mmxvecsize>\t{%2, %0|%0, %2}
+   psign<mmxvecsize>\t{%2, %0|%0, %2}
+   vpsign<mmxvecsize>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "mmx_isa" "native,x64_noavx,x64_avx")
+   (set_attr "type" "sselog1")
    (set_attr "prefix_extra" "1")
    (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)"))
-   (set_attr "mode" "DI")])
+   (set_attr "mode" "DI,TI,TI")])
 
 (define_insn "<ssse3_avx2>_palignr<mode>_mask"
   [(set (match_operand:VI1_AVX512 0 "register_operand" "=v")
    (set_attr "prefix" "orig,vex,evex")
    (set_attr "mode" "<sseinsnmode>")])
 
-(define_insn "ssse3_palignrdi"
-  [(set (match_operand:DI 0 "register_operand" "=y")
-       (unspec:DI [(match_operand:DI 1 "register_operand" "0")
-                   (match_operand:DI 2 "nonimmediate_operand" "ym")
-                   (match_operand:SI 3 "const_0_to_255_mul_8_operand" "n")]
+(define_insn_and_split "ssse3_palignrdi"
+  [(set (match_operand:DI 0 "register_operand" "=y,x,Yv")
+       (unspec:DI [(match_operand:DI 1 "register_operand" "0,0,Yv")
+                   (match_operand:DI 2 "register_mmxmem_operand" "ym,x,Yv")
+                   (match_operand:SI 3 "const_0_to_255_mul_8_operand" "n,n,n")]
                   UNSPEC_PALIGNR))]
-  "TARGET_SSSE3"
+  "(TARGET_MMX || TARGET_MMX_WITH_SSE) && TARGET_SSSE3"
 {
-  operands[3] = GEN_INT (INTVAL (operands[3]) / 8);
-  return "palignr\t{%3, %2, %0|%0, %2, %3}";
+  switch (which_alternative)
+    {
+    case 0:
+      operands[3] = GEN_INT (INTVAL (operands[3]) / 8);
+      return "palignr\t{%3, %2, %0|%0, %2, %3}";
+    case 1:
+    case 2:
+      return "#";
+    default:
+      gcc_unreachable ();
+    }
 }
-  [(set_attr "type" "sseishft")
+  "TARGET_MMX_WITH_SSE && reload_completed"
+  [(set (match_dup 0)
+       (lshiftrt:V1TI (match_dup 0) (match_dup 3)))]
+{
+  /* Emulate MMX palignrdi with SSE psrldq.  */
+  rtx op0 = lowpart_subreg (V2DImode, operands[0],
+                           GET_MODE (operands[0]));
+  if (TARGET_AVX)
+    emit_insn (gen_vec_concatv2di (op0, operands[2], operands[1]));
+  else
+    {
+      /* NB: SSE can only concatenate OP0 and OP1 to OP0.  */
+      emit_insn (gen_vec_concatv2di (op0, operands[1], operands[2]));
+      /* Swap bits 0:63 with bits 64:127.  */
+      rtx mask = gen_rtx_PARALLEL (VOIDmode,
+                                  gen_rtvec (4, GEN_INT (2),
+                                             GEN_INT (3),
+                                             GEN_INT (0),
+                                             GEN_INT (1)));
+      rtx op1 = lowpart_subreg (V4SImode, op0, GET_MODE (op0));
+      rtx op2 = gen_rtx_VEC_SELECT (V4SImode, op1, mask);
+      emit_insn (gen_rtx_SET (op1, op2));
+    }
+  operands[0] = lowpart_subreg (V1TImode, op0, GET_MODE (op0));
+}
+  [(set_attr "mmx_isa" "native,x64_noavx,x64_avx")
+   (set_attr "type" "sseishft")
    (set_attr "atom_unit" "sishuf")
    (set_attr "prefix_extra" "1")
    (set_attr "length_immediate" "1")
    (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)"))
-   (set_attr "mode" "DI")])
+   (set_attr "mode" "DI,TI,TI")])
 
 ;; Mode iterator to handle singularity w/ absence of V2DI and V4DI
 ;; modes for abs instruction on pre AVX-512 targets.
 })
 
 (define_insn "abs<mode>2"
-  [(set (match_operand:MMXMODEI 0 "register_operand" "=y")
+  [(set (match_operand:MMXMODEI 0 "register_operand" "=y,Yv")
        (abs:MMXMODEI
-         (match_operand:MMXMODEI 1 "nonimmediate_operand" "ym")))]
-  "TARGET_SSSE3"
-  "pabs<mmxvecsize>\t{%1, %0|%0, %1}";
-  [(set_attr "type" "sselog1")
+         (match_operand:MMXMODEI 1 "register_mmxmem_operand" "ym,Yv")))]
+  "(TARGET_MMX || TARGET_MMX_WITH_SSE) && TARGET_SSSE3"
+  "@
+   pabs<mmxvecsize>\t{%1, %0|%0, %1}
+   %vpabs<mmxvecsize>\t{%1, %0|%0, %1}"
+  [(set_attr "mmx_isa" "native,x64")
+   (set_attr "type" "sselog1")
    (set_attr "prefix_rep" "0")
    (set_attr "prefix_extra" "1")
    (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)"))
-   (set_attr "mode" "DI")])
+   (set_attr "mode" "DI,TI")])
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;
    (set_attr "prefix_data16" "1,1,*")
    (set_attr "prefix_extra" "1")
    (set_attr "prefix" "orig,orig,vex")
-   (set_attr "mode" "<MODE>")])
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "<sse4_1>_blendv<ssemodesuffix><avxsizesuffix>"
+  [(set (match_operand:VF_128_256 0 "register_operand" "=Yr,*x,x")
+       (unspec:VF_128_256
+         [(match_operand:VF_128_256 1 "register_operand" "0,0,x")
+          (match_operand:VF_128_256 2 "vector_operand" "YrBm,*xBm,xm")
+          (match_operand:VF_128_256 3 "register_operand" "Yz,Yz,x")]
+         UNSPEC_BLENDV))]
+  "TARGET_SSE4_1"
+  "@
+   blendv<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3}
+   blendv<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3}
+   vblendv<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "isa" "noavx,noavx,avx")
+   (set_attr "type" "ssemov")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix_data16" "1,1,*")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "orig,orig,vex")
+   (set_attr "btver2_decode" "vector,vector,vector") 
+   (set_attr "mode" "<MODE>")])
+
+;; Also define scalar versions.  These are used for conditional move.
+;; Using subregs into vector modes causes register allocation lossage.
+;; These patterns do not allow memory operands because the native
+;; instructions read the full 128-bits.
+
+(define_insn "sse4_1_blendv<ssemodesuffix>"
+  [(set (match_operand:MODEF 0 "register_operand" "=Yr,*x,x")
+       (unspec:MODEF
+         [(match_operand:MODEF 1 "register_operand" "0,0,x")
+          (match_operand:MODEF 2 "register_operand" "Yr,*x,x")
+          (match_operand:MODEF 3 "register_operand" "Yz,Yz,x")]
+         UNSPEC_BLENDV))]
+  "TARGET_SSE4_1"
+{
+  if (get_attr_mode (insn) == MODE_V4SF)
+    return (which_alternative == 2
+           ? "vblendvps\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+           : "blendvps\t{%3, %2, %0|%0, %2, %3}");
+  else
+    return (which_alternative == 2
+           ? "vblendv<ssevecmodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+           : "blendv<ssevecmodesuffix>\t{%3, %2, %0|%0, %2, %3}");
+}
+  [(set_attr "isa" "noavx,noavx,avx")
+   (set_attr "type" "ssemov")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix_data16" "1,1,*")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "orig,orig,vex")
+   (set_attr "btver2_decode" "vector,vector,vector") 
+   (set (attr "mode")
+       (cond [(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
+                (const_string "V4SF")
+              (match_test "TARGET_AVX")
+                (const_string "<ssevecmode>")
+              (match_test "optimize_function_for_size_p (cfun)")
+                (const_string "V4SF")
+              ]
+              (const_string "<ssevecmode>")))])
 
-(define_insn "<sse4_1>_blendv<ssemodesuffix><avxsizesuffix>"
+(define_insn_and_split "*<sse4_1>_blendv<ssemodesuffix><avxsizesuffix>_lt"
   [(set (match_operand:VF_128_256 0 "register_operand" "=Yr,*x,x")
        (unspec:VF_128_256
          [(match_operand:VF_128_256 1 "register_operand" "0,0,x")
           (match_operand:VF_128_256 2 "vector_operand" "YrBm,*xBm,xm")
-          (match_operand:VF_128_256 3 "register_operand" "Yz,Yz,x")]
+          (subreg:VF_128_256
+            (lt:<sseintvecmode>
+              (match_operand:<sseintvecmode> 3 "register_operand" "Yz,Yz,x")
+              (match_operand:<sseintvecmode> 4 "const0_operand" "C,C,C")) 0)]
          UNSPEC_BLENDV))]
   "TARGET_SSE4_1"
-  "@
-   blendv<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3}
-   blendv<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3}
-   vblendv<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0)
+       (unspec:VF_128_256
+        [(match_dup 1) (match_dup 2) (match_dup 3)] UNSPEC_BLENDV))]
+  "operands[3] = gen_lowpart (<MODE>mode, operands[3]);"
   [(set_attr "isa" "noavx,noavx,avx")
    (set_attr "type" "ssemov")
    (set_attr "length_immediate" "1")
    (set_attr "btver2_decode" "vector,vector,vector") 
    (set_attr "mode" "<MODE>")])
 
+(define_mode_attr ssefltmodesuffix
+  [(V2DI "pd") (V4DI "pd") (V4SI "ps") (V8SI "ps")])
+
+(define_mode_attr ssefltvecmode
+  [(V2DI "V2DF") (V4DI "V4DF") (V4SI "V4SF") (V8SI "V8SF")])
+
+(define_insn_and_split "*<sse4_1>_blendv<ssefltmodesuffix><avxsizesuffix>_ltint"
+  [(set (match_operand:<ssebytemode> 0 "register_operand" "=Yr,*x,x")
+       (unspec:<ssebytemode>
+         [(match_operand:<ssebytemode> 1 "register_operand" "0,0,x")
+          (match_operand:<ssebytemode> 2 "vector_operand" "YrBm,*xBm,xm")
+          (subreg:<ssebytemode>
+            (lt:VI48_AVX
+              (match_operand:VI48_AVX 3 "register_operand" "Yz,Yz,x")
+              (match_operand:VI48_AVX 4 "const0_operand" "C,C,C")) 0)]
+         UNSPEC_BLENDV))]
+  "TARGET_SSE4_1"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0)
+       (unspec:<ssefltvecmode>
+        [(match_dup 1) (match_dup 2) (match_dup 3)] UNSPEC_BLENDV))]
+{
+  operands[0] = gen_lowpart (<ssefltvecmode>mode, operands[0]);
+  operands[1] = gen_lowpart (<ssefltvecmode>mode, operands[1]);
+  operands[2] = gen_lowpart (<ssefltvecmode>mode, operands[2]);
+  operands[3] = gen_lowpart (<ssefltvecmode>mode, operands[3]);
+}
+  [(set_attr "isa" "noavx,noavx,avx")
+   (set_attr "type" "ssemov")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix_data16" "1,1,*")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "orig,orig,vex")
+   (set_attr "btver2_decode" "vector,vector,vector") 
+   (set_attr "mode" "<ssefltvecmode>")])
+
 (define_insn "<sse4_1>_dp<ssemodesuffix><avxsizesuffix>"
   [(set (match_operand:VF_128_256 0 "register_operand" "=Yr,*x,x")
        (unspec:VF_128_256
    (set_attr "btver2_decode" "vector,vector,vector")
    (set_attr "mode" "<sseinsnmode>")])
 
+(define_insn_and_split "*<sse4_1_avx2>_pblendvb_lt"
+  [(set (match_operand:VI1_AVX2 0 "register_operand" "=Yr,*x,x")
+       (unspec:VI1_AVX2
+         [(match_operand:VI1_AVX2 1 "register_operand"  "0,0,x")
+          (match_operand:VI1_AVX2 2 "vector_operand" "YrBm,*xBm,xm")
+          (lt:VI1_AVX2 (match_operand:VI1_AVX2 3 "register_operand" "Yz,Yz,x")
+                       (match_operand:VI1_AVX2 4 "const0_operand" "C,C,C"))]
+         UNSPEC_BLENDV))]
+  "TARGET_SSE4_1"
+  "#"
+  ""
+  [(set (match_dup 0)
+       (unspec:VI1_AVX2
+        [(match_dup 1) (match_dup 2) (match_dup 3)] UNSPEC_BLENDV))]
+  ""
+  [(set_attr "isa" "noavx,noavx,avx")
+   (set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "length_immediate" "*,*,1")
+   (set_attr "prefix" "orig,orig,vex")
+   (set_attr "btver2_decode" "vector,vector,vector")
+   (set_attr "mode" "<sseinsnmode>")])
+
 (define_insn "sse4_1_pblendw"
   [(set (match_operand:V8HI 0 "register_operand" "=Yr,*x,x")
        (vec_merge:V8HI
   [(set (match_operand:V8HI 0 "register_operand" "=Yr,*x,v")
        (any_extend:V8HI
          (vec_select:V8QI
-           (match_operand:V16QI 1 "nonimmediate_operand" "Yrm,*xm,vm")
+           (match_operand:V16QI 1 "register_operand" "Yr,*x,v")
            (parallel [(const_int 0) (const_int 1)
                       (const_int 2) (const_int 3)
                       (const_int 4) (const_int 5)
                       (const_int 6) (const_int 7)]))))]
   "TARGET_SSE4_1 && <mask_avx512bw_condition> && <mask_avx512vl_condition>"
-  "%vpmov<extsuffix>bw\t{%1, %0<mask_operand2>|%0<mask_operand2>, %q1}"
+  "%vpmov<extsuffix>bw\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"
+  [(set_attr "isa" "noavx,noavx,avx")
+   (set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "orig,orig,maybe_evex")
+   (set_attr "mode" "TI")])
+
+(define_insn "*sse4_1_<code>v8qiv8hi2<mask_name>_1"
+  [(set (match_operand:V8HI 0 "register_operand" "=Yr,*x,v")
+       (any_extend:V8HI
+         (match_operand:V8QI 1 "memory_operand" "m,m,m")))]
+  "TARGET_SSE4_1 && <mask_avx512bw_condition> && <mask_avx512vl_condition>"
+  "%vpmov<extsuffix>bw\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"
   [(set_attr "isa" "noavx,noavx,avx")
    (set_attr "type" "ssemov")
    (set_attr "prefix_extra" "1")
    (set_attr "prefix" "orig,orig,maybe_evex")
    (set_attr "mode" "TI")])
 
+(define_insn_and_split "*sse4_1_<code>v8qiv8hi2<mask_name>_2"
+  [(set (match_operand:V8HI 0 "register_operand")
+       (any_extend:V8HI
+         (vec_select:V8QI
+           (subreg:V16QI
+             (vec_concat:V2DI
+               (match_operand:DI 1 "memory_operand")
+               (const_int 0)) 0)
+           (parallel [(const_int 0) (const_int 1)
+                      (const_int 2) (const_int 3)
+                      (const_int 4) (const_int 5)
+                      (const_int 6) (const_int 7)]))))]
+  "TARGET_SSE4_1 && <mask_avx512bw_condition> && <mask_avx512vl_condition>
+   && can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+       (any_extend:V8HI (match_dup 1)))]
+  "operands[1] = adjust_address_nv (operands[1], V8QImode, 0);")
+
 (define_insn "<mask_codefor>avx512f_<code>v16qiv16si2<mask_name>"
   [(set (match_operand:V16SI 0 "register_operand" "=v")
        (any_extend:V16SI
   [(set (match_operand:V8SI 0 "register_operand" "=v")
        (any_extend:V8SI
          (vec_select:V8QI
-           (match_operand:V16QI 1 "nonimmediate_operand" "vm")
+           (match_operand:V16QI 1 "register_operand" "v")
            (parallel [(const_int 0) (const_int 1)
                       (const_int 2) (const_int 3)
                       (const_int 4) (const_int 5)
                       (const_int 6) (const_int 7)]))))]
   "TARGET_AVX2 && <mask_avx512vl_condition>"
-  "vpmov<extsuffix>bd\t{%1, %0<mask_operand2>|%0<mask_operand2>, %q1}"
+  "vpmov<extsuffix>bd\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"
   [(set_attr "type" "ssemov")
    (set_attr "prefix_extra" "1")
    (set_attr "prefix" "maybe_evex")
    (set_attr "mode" "OI")])
 
+(define_insn "*avx2_<code>v8qiv8si2<mask_name>_1"
+  [(set (match_operand:V8SI 0 "register_operand" "=v")
+       (any_extend:V8SI
+         (match_operand:V8QI 1 "memory_operand" "m")))]
+  "TARGET_AVX2 && <mask_avx512vl_condition>"
+  "%vpmov<extsuffix>bd\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "maybe_evex")
+   (set_attr "mode" "OI")])
+
+(define_insn_and_split "*avx2_<code>v8qiv8si2<mask_name>_2"
+  [(set (match_operand:V8SI 0 "register_operand")
+       (any_extend:V8SI
+         (vec_select:V8QI
+           (subreg:V16QI
+             (vec_concat:V2DI
+               (match_operand:DI 1 "memory_operand")
+               (const_int 0)) 0)
+           (parallel [(const_int 0) (const_int 1)
+                      (const_int 2) (const_int 3)
+                      (const_int 4) (const_int 5)
+                      (const_int 6) (const_int 7)]))))]
+  "TARGET_AVX2 && <mask_avx512vl_condition>
+   && can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+       (any_extend:V8SI (match_dup 1)))]
+  "operands[1] = adjust_address_nv (operands[1], V8QImode, 0);")
+
 (define_insn "sse4_1_<code>v4qiv4si2<mask_name>"
   [(set (match_operand:V4SI 0 "register_operand" "=Yr,*x,v")
        (any_extend:V4SI
          (vec_select:V4QI
-           (match_operand:V16QI 1 "nonimmediate_operand" "Yrm,*xm,vm")
+           (match_operand:V16QI 1 "register_operand" "Yr,*x,v")
            (parallel [(const_int 0) (const_int 1)
                       (const_int 2) (const_int 3)]))))]
   "TARGET_SSE4_1 && <mask_avx512vl_condition>"
-  "%vpmov<extsuffix>bd\t{%1, %0<mask_operand2>|%0<mask_operand2>, %k1}"
+  "%vpmov<extsuffix>bd\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"
+  [(set_attr "isa" "noavx,noavx,avx")
+   (set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "orig,orig,maybe_evex")
+   (set_attr "mode" "TI")])
+
+(define_insn "*sse4_1_<code>v4qiv4si2<mask_name>_1"
+  [(set (match_operand:V4SI 0 "register_operand" "=Yr,*x,v")
+       (any_extend:V4SI
+         (match_operand:V4QI 1 "memory_operand" "m,m,m")))]
+  "TARGET_SSE4_1 && <mask_avx512vl_condition>"
+  "%vpmov<extsuffix>bd\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"
   [(set_attr "isa" "noavx,noavx,avx")
    (set_attr "type" "ssemov")
    (set_attr "prefix_extra" "1")
    (set_attr "prefix" "orig,orig,maybe_evex")
    (set_attr "mode" "TI")])
 
+(define_insn_and_split "*sse4_1_<code>v4qiv4si2<mask_name>_2"
+  [(set (match_operand:V4SI 0 "register_operand")
+       (any_extend:V4SI
+         (vec_select:V4QI
+           (subreg:V16QI
+             (vec_merge:V4SI
+               (vec_duplicate:V4SI
+                 (match_operand:SI 1 "memory_operand"))
+               (const_vector:V4SI
+                  [(const_int 0) (const_int 0)
+                   (const_int 0) (const_int 0)])
+               (const_int 1)) 0)
+           (parallel [(const_int 0) (const_int 1)
+                      (const_int 2) (const_int 3)]))))]
+  "TARGET_SSE4_1 && <mask_avx512vl_condition>
+   && can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+       (any_extend:V4SI (match_dup 1)))]
+  "operands[1] = adjust_address_nv (operands[1], V4QImode, 0);")
+
 (define_insn "avx512f_<code>v16hiv16si2<mask_name>"
   [(set (match_operand:V16SI 0 "register_operand" "=v")
        (any_extend:V16SI
   [(set (match_operand:V4SI 0 "register_operand" "=Yr,*x,v")
        (any_extend:V4SI
          (vec_select:V4HI
-           (match_operand:V8HI 1 "nonimmediate_operand" "Yrm,*xm,vm")
+           (match_operand:V8HI 1 "register_operand" "Yr,*x,v")
            (parallel [(const_int 0) (const_int 1)
                       (const_int 2) (const_int 3)]))))]
   "TARGET_SSE4_1 && <mask_avx512vl_condition>"
-  "%vpmov<extsuffix>wd\t{%1, %0<mask_operand2>|%0<mask_operand2>, %q1}"
+  "%vpmov<extsuffix>wd\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"
+  [(set_attr "isa" "noavx,noavx,avx")
+   (set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "orig,orig,maybe_evex")
+   (set_attr "mode" "TI")])
+
+(define_insn "*sse4_1_<code>v4hiv4si2<mask_name>_1"
+  [(set (match_operand:V4SI 0 "register_operand" "=Yr,*x,v")
+       (any_extend:V4SI
+         (match_operand:V4HI 1 "memory_operand" "m,m,m")))]
+  "TARGET_SSE4_1 && <mask_avx512vl_condition>"
+  "%vpmov<extsuffix>wd\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"
   [(set_attr "isa" "noavx,noavx,avx")
    (set_attr "type" "ssemov")
    (set_attr "prefix_extra" "1")
    (set_attr "prefix" "orig,orig,maybe_evex")
    (set_attr "mode" "TI")])
 
+(define_insn_and_split "*sse4_1_<code>v4hiv4si2<mask_name>_2"
+  [(set (match_operand:V4SI 0 "register_operand")
+       (any_extend:V4SI
+         (vec_select:V4HI
+           (subreg:V8HI
+             (vec_concat:V2DI
+               (match_operand:DI 1 "memory_operand")
+               (const_int 0)) 0)
+           (parallel [(const_int 0) (const_int 1)
+                      (const_int 2) (const_int 3)]))))]
+  "TARGET_SSE4_1 && <mask_avx512vl_condition>
+   && can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+       (any_extend:V4SI (match_dup 1)))]
+  "operands[1] = adjust_address_nv (operands[1], V4HImode, 0);")
+
 (define_insn "avx512f_<code>v8qiv8di2<mask_name>"
   [(set (match_operand:V8DI 0 "register_operand" "=v")
        (any_extend:V8DI
          (vec_select:V8QI
-           (match_operand:V16QI 1 "nonimmediate_operand" "vm")
+           (match_operand:V16QI 1 "register_operand" "v")
            (parallel [(const_int 0) (const_int 1)
                       (const_int 2) (const_int 3)
                       (const_int 4) (const_int 5)
                       (const_int 6) (const_int 7)]))))]
   "TARGET_AVX512F"
-  "vpmov<extsuffix>bq\t{%1, %0<mask_operand2>|%0<mask_operand2>, %k1}"
+  "vpmov<extsuffix>bq\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "XI")])
+
+(define_insn "*avx512f_<code>v8qiv8di2<mask_name>_1"
+  [(set (match_operand:V8DI 0 "register_operand" "=v")
+       (any_extend:V8DI
+         (match_operand:V8QI 1 "memory_operand" "m")))]
+  "TARGET_AVX512F"
+  "vpmov<extsuffix>bq\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"
   [(set_attr "type" "ssemov")
    (set_attr "prefix" "evex")
    (set_attr "mode" "XI")])
 
+(define_insn_and_split "*avx512f_<code>v8qiv8di2<mask_name>_2"
+  [(set (match_operand:V8DI 0 "register_operand")
+       (any_extend:V8DI
+         (vec_select:V8QI
+           (subreg:V16QI
+             (vec_concat:V2DI
+               (match_operand:DI 1 "memory_operand")
+               (const_int 0)) 0)
+           (parallel [(const_int 0) (const_int 1)
+                      (const_int 2) (const_int 3)
+                      (const_int 4) (const_int 5)
+                      (const_int 6) (const_int 7)]))))]
+  "TARGET_AVX512F && can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+       (any_extend:V8DI (match_dup 1)))]
+  "operands[1] = adjust_address_nv (operands[1], V8QImode, 0);")
+
 (define_insn "avx2_<code>v4qiv4di2<mask_name>"
   [(set (match_operand:V4DI 0 "register_operand" "=v")
        (any_extend:V4DI
          (vec_select:V4QI
-           (match_operand:V16QI 1 "nonimmediate_operand" "vm")
+           (match_operand:V16QI 1 "register_operand" "v")
            (parallel [(const_int 0) (const_int 1)
                       (const_int 2) (const_int 3)]))))]
   "TARGET_AVX2 && <mask_avx512vl_condition>"
-  "vpmov<extsuffix>bq\t{%1, %0<mask_operand2>|%0<mask_operand2>, %k1}"
+  "vpmov<extsuffix>bq\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "maybe_evex")
+   (set_attr "mode" "OI")])
+
+(define_insn "*avx2_<code>v4qiv4di2<mask_name>_1"
+  [(set (match_operand:V4DI 0 "register_operand" "=v")
+       (any_extend:V4DI
+         (match_operand:V4QI 1 "memory_operand" "m")))]
+  "TARGET_AVX2 && <mask_avx512vl_condition>"
+  "vpmov<extsuffix>bq\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"
   [(set_attr "type" "ssemov")
    (set_attr "prefix_extra" "1")
    (set_attr "prefix" "maybe_evex")
    (set_attr "mode" "OI")])
 
+(define_insn_and_split "*avx2_<code>v4qiv4di2<mask_name>_2"
+  [(set (match_operand:V4DI 0 "register_operand")
+       (any_extend:V4DI
+         (vec_select:V4QI
+           (subreg:V16QI
+             (vec_merge:V4SI
+               (vec_duplicate:V4SI
+                 (match_operand:SI 1 "memory_operand"))
+               (const_vector:V4SI
+                  [(const_int 0) (const_int 0)
+                   (const_int 0) (const_int 0)])
+               (const_int 1)) 0)
+           (parallel [(const_int 0) (const_int 1)
+                      (const_int 2) (const_int 3)]))))]
+  "TARGET_AVX2 && <mask_avx512vl_condition>
+   && can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+       (any_extend:V4DI (match_dup 1)))]
+  "operands[1] = adjust_address_nv (operands[1], V4QImode, 0);")
+
 (define_insn "sse4_1_<code>v2qiv2di2<mask_name>"
   [(set (match_operand:V2DI 0 "register_operand" "=Yr,*x,v")
        (any_extend:V2DI
          (vec_select:V2QI
-           (match_operand:V16QI 1 "nonimmediate_operand" "Yrm,*xm,vm")
+           (match_operand:V16QI 1 "register_operand" "Yr,*x,v")
            (parallel [(const_int 0) (const_int 1)]))))]
   "TARGET_SSE4_1 && <mask_avx512vl_condition>"
-  "%vpmov<extsuffix>bq\t{%1, %0<mask_operand2>|%0<mask_operand2>, %w1}"
+  "%vpmov<extsuffix>bq\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"
   [(set_attr "isa" "noavx,noavx,avx")
    (set_attr "type" "ssemov")
    (set_attr "prefix_extra" "1")
   [(set (match_operand:V4DI 0 "register_operand" "=v")
        (any_extend:V4DI
          (vec_select:V4HI
-           (match_operand:V8HI 1 "nonimmediate_operand" "vm")
+           (match_operand:V8HI 1 "register_operand" "v")
            (parallel [(const_int 0) (const_int 1)
                       (const_int 2) (const_int 3)]))))]
   "TARGET_AVX2 && <mask_avx512vl_condition>"
-  "vpmov<extsuffix>wq\t{%1, %0<mask_operand2>|%0<mask_operand2>, %q1}"
+  "vpmov<extsuffix>wq\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "maybe_evex")
+   (set_attr "mode" "OI")])
+
+(define_insn "*avx2_<code>v4hiv4di2<mask_name>_1"
+  [(set (match_operand:V4DI 0 "register_operand" "=v")
+       (any_extend:V4DI
+         (match_operand:V4HI 1 "memory_operand" "m")))]
+  "TARGET_AVX2 && <mask_avx512vl_condition>"
+  "vpmov<extsuffix>wq\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"
   [(set_attr "type" "ssemov")
    (set_attr "prefix_extra" "1")
    (set_attr "prefix" "maybe_evex")
    (set_attr "mode" "OI")])
 
+(define_insn_and_split "*avx2_<code>v4hiv4di2<mask_name>_2"
+  [(set (match_operand:V4DI 0 "register_operand")
+       (any_extend:V4DI
+         (vec_select:V4HI
+           (subreg:V8HI
+             (vec_concat:V2DI
+               (match_operand:DI 1 "memory_operand")
+               (const_int 0)) 0)
+           (parallel [(const_int 0) (const_int 1)
+                      (const_int 2) (const_int 3)]))))]
+  "TARGET_AVX2 && <mask_avx512vl_condition>
+   && can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+       (any_extend:V4DI (match_dup 1)))]
+  "operands[1] = adjust_address_nv (operands[1], V4HImode, 0);")
+
 (define_insn "sse4_1_<code>v2hiv2di2<mask_name>"
   [(set (match_operand:V2DI 0 "register_operand" "=Yr,*x,v")
        (any_extend:V2DI
          (vec_select:V2HI
-           (match_operand:V8HI 1 "nonimmediate_operand" "Yrm,*xm,vm")
+           (match_operand:V8HI 1 "register_operand" "Yr,*x,v")
            (parallel [(const_int 0) (const_int 1)]))))]
   "TARGET_SSE4_1 && <mask_avx512vl_condition>"
-  "%vpmov<extsuffix>wq\t{%1, %0<mask_operand2>|%0<mask_operand2>, %k1}"
+  "%vpmov<extsuffix>wq\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"
+  [(set_attr "isa" "noavx,noavx,avx")
+   (set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "orig,orig,maybe_evex")
+   (set_attr "mode" "TI")])
+
+(define_insn "*sse4_1_<code>v2hiv2di2<mask_name>_1"
+  [(set (match_operand:V2DI 0 "register_operand" "=Yr,*x,v")
+       (any_extend:V2DI
+         (match_operand:V2HI 1 "memory_operand" "m,m,m")))]
+  "TARGET_SSE4_1 && <mask_avx512vl_condition>"
+  "%vpmov<extsuffix>wq\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"
   [(set_attr "isa" "noavx,noavx,avx")
    (set_attr "type" "ssemov")
    (set_attr "prefix_extra" "1")
    (set_attr "prefix" "orig,orig,maybe_evex")
    (set_attr "mode" "TI")])
 
+(define_insn_and_split "*sse4_1_<code>v2hiv2di2<mask_name>_2"
+  [(set (match_operand:V2DI 0 "register_operand")
+       (any_extend:V2DI
+         (vec_select:V2HI
+           (subreg:V8HI
+             (vec_merge:V4SI
+               (vec_duplicate:V4SI
+                 (match_operand:SI 1 "memory_operand"))
+               (const_vector:V4SI
+                  [(const_int 0) (const_int 0)
+                   (const_int 0) (const_int 0)])
+               (const_int 1)) 0)
+           (parallel [(const_int 0) (const_int 1)]))))]
+  "TARGET_SSE4_1 && <mask_avx512vl_condition>
+   && can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+       (any_extend:V2DI (match_dup 1)))]
+  "operands[1] = adjust_address_nv (operands[1], V2HImode, 0);")
+
 (define_insn "avx512f_<code>v8siv8di2<mask_name>"
   [(set (match_operand:V8DI 0 "register_operand" "=v")
        (any_extend:V8DI
   [(set (match_operand:V2DI 0 "register_operand" "=Yr,*x,v")
        (any_extend:V2DI
          (vec_select:V2SI
-           (match_operand:V4SI 1 "nonimmediate_operand" "Yrm,*xm,vm")
+           (match_operand:V4SI 1 "register_operand" "Yr,*x,v")
            (parallel [(const_int 0) (const_int 1)]))))]
   "TARGET_SSE4_1 && <mask_avx512vl_condition>"
-  "%vpmov<extsuffix>dq\t{%1, %0<mask_operand2>|%0<mask_operand2>, %q1}"
+  "%vpmov<extsuffix>dq\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"
+  [(set_attr "isa" "noavx,noavx,avx")
+   (set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "orig,orig,maybe_evex")
+   (set_attr "mode" "TI")])
+
+(define_insn "*sse4_1_<code>v2siv2di2<mask_name>_1"
+  [(set (match_operand:V2DI 0 "register_operand" "=Yr,*x,v")
+       (any_extend:V2DI
+         (match_operand:V2SI 1 "memory_operand" "m,m,m")))]
+  "TARGET_SSE4_1 && <mask_avx512vl_condition>"
+  "%vpmov<extsuffix>dq\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"
   [(set_attr "isa" "noavx,noavx,avx")
    (set_attr "type" "ssemov")
    (set_attr "prefix_extra" "1")
    (set_attr "prefix" "orig,orig,maybe_evex")
    (set_attr "mode" "TI")])
 
+(define_insn_and_split "*sse4_1_<code>v2siv2di2<mask_name>_2"
+  [(set (match_operand:V2DI 0 "register_operand")
+       (any_extend:V2DI
+         (vec_select:V2SI
+           (subreg:V4SI
+             (vec_concat:V2DI
+               (match_operand:DI 1 "memory_operand")
+               (const_int 0)) 0)
+           (parallel [(const_int 0) (const_int 1)]))))]
+  "TARGET_SSE4_1 && <mask_avx512vl_condition>
+   && can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+       (any_extend:V2DI (match_dup 1)))]
+  "operands[1] = adjust_address_nv (operands[1], V2SImode, 0);")
+
 ;; ptestps/ptestpd are very similar to comiss and ucomiss when
 ;; setting FLAGS_REG. But it is not a really compare instruction.
 (define_insn "avx_vtest<ssemodesuffix><avxsizesuffix>"
   switch (INTVAL (operands[4]))
     {
     case 3:
-      return "vgatherpf0<ssemodesuffix>ps\t{%5%{%0%}|%5%{%0%}}";
+      /* %X5 so that we don't emit any *WORD PTR for -masm=intel, as
+        gas changed what it requires incompatibly.  */
+      return "%M2vgatherpf0<ssemodesuffix>ps\t{%5%{%0%}|%X5%{%0%}}";
     case 2:
-      return "vgatherpf1<ssemodesuffix>ps\t{%5%{%0%}|%5%{%0%}}";
+      return "%M2vgatherpf1<ssemodesuffix>ps\t{%5%{%0%}|%X5%{%0%}}";
     default:
       gcc_unreachable ();
     }
   switch (INTVAL (operands[4]))
     {
     case 3:
-      return "vgatherpf0<ssemodesuffix>pd\t{%5%{%0%}|%5%{%0%}}";
+      /* %X5 so that we don't emit any *WORD PTR for -masm=intel, as
+        gas changed what it requires incompatibly.  */
+      return "%M2vgatherpf0<ssemodesuffix>pd\t{%5%{%0%}|%X5%{%0%}}";
     case 2:
-      return "vgatherpf1<ssemodesuffix>pd\t{%5%{%0%}|%5%{%0%}}";
+      return "%M2vgatherpf1<ssemodesuffix>pd\t{%5%{%0%}|%X5%{%0%}}";
     default:
       gcc_unreachable ();
     }
     {
     case 3:
     case 7:
-      return "vscatterpf0<ssemodesuffix>ps\t{%5%{%0%}|%5%{%0%}}";
+      /* %X5 so that we don't emit any *WORD PTR for -masm=intel, as
+        gas changed what it requires incompatibly.  */
+      return "%M2vscatterpf0<ssemodesuffix>ps\t{%5%{%0%}|%X5%{%0%}}";
     case 2:
     case 6:
-      return "vscatterpf1<ssemodesuffix>ps\t{%5%{%0%}|%5%{%0%}}";
+      return "%M2vscatterpf1<ssemodesuffix>ps\t{%5%{%0%}|%X5%{%0%}}";
     default:
       gcc_unreachable ();
     }
     {
     case 3:
     case 7:
-      return "vscatterpf0<ssemodesuffix>pd\t{%5%{%0%}|%5%{%0%}}";
+      /* %X5 so that we don't emit any *WORD PTR for -masm=intel, as
+        gas changed what it requires incompatibly.  */
+      return "%M2vscatterpf0<ssemodesuffix>pd\t{%5%{%0%}|%X5%{%0%}}";
     case 2:
     case 6:
-      return "vscatterpf1<ssemodesuffix>pd\t{%5%{%0%}|%5%{%0%}}";
+      return "%M2vscatterpf1<ssemodesuffix>pd\t{%5%{%0%}|%X5%{%0%}}";
     default:
       gcc_unreachable ();
     }
   [(set (match_operand:VI8_AVX512VL 0 "register_operand" "=v")
        (vec_duplicate:VI8_AVX512VL
          (zero_extend:DI
-           (match_operand:QI 1 "register_operand" "Yk"))))]
+           (match_operand:QI 1 "register_operand" "k"))))]
   "TARGET_AVX512CD"
   "vpbroadcastmb2q\t{%1, %0|%0, %1}"
   [(set_attr "type" "mskmov")
   [(set (match_operand:VI4_AVX512VL 0 "register_operand" "=v")
        (vec_duplicate:VI4_AVX512VL
          (zero_extend:SI
-           (match_operand:HI 1 "register_operand" "Yk"))))]
+           (match_operand:HI 1 "register_operand" "k"))))]
   "TARGET_AVX512CD"
   "vpbroadcastmw2d\t{%1, %0|%0, %1}"
   [(set_attr "type" "mskmov")
          UNSPEC_GATHER))
    (clobber (match_scratch:VEC_GATHER_MODE 1 "=&x"))]
   "TARGET_AVX2"
-  "v<sseintprefix>gatherd<ssemodesuffix>\t{%1, %7, %0|%0, %7, %1}"
+  "%M3v<sseintprefix>gatherd<ssemodesuffix>\t{%1, %7, %0|%0, %7, %1}"
   [(set_attr "type" "ssemov")
    (set_attr "prefix" "vex")
    (set_attr "mode" "<sseinsnmode>")])
          UNSPEC_GATHER))
    (clobber (match_scratch:VEC_GATHER_MODE 1 "=&x"))]
   "TARGET_AVX2"
-  "v<sseintprefix>gatherd<ssemodesuffix>\t{%1, %6, %0|%0, %6, %1}"
+  "%M2v<sseintprefix>gatherd<ssemodesuffix>\t{%1, %6, %0|%0, %6, %1}"
   [(set_attr "type" "ssemov")
    (set_attr "prefix" "vex")
    (set_attr "mode" "<sseinsnmode>")])
          UNSPEC_GATHER))
    (clobber (match_scratch:VEC_GATHER_MODE 1 "=&x"))]
   "TARGET_AVX2"
-  "v<sseintprefix>gatherq<ssemodesuffix>\t{%5, %7, %2|%2, %7, %5}"
+  "%M3v<sseintprefix>gatherq<ssemodesuffix>\t{%5, %7, %2|%2, %7, %5}"
   [(set_attr "type" "ssemov")
    (set_attr "prefix" "vex")
    (set_attr "mode" "<sseinsnmode>")])
   "TARGET_AVX2"
 {
   if (<MODE>mode != <VEC_GATHER_SRCDI>mode)
-    return "v<sseintprefix>gatherq<ssemodesuffix>\t{%4, %6, %x0|%x0, %6, %4}";
-  return "v<sseintprefix>gatherq<ssemodesuffix>\t{%4, %6, %0|%0, %6, %4}";
+    return "%M2v<sseintprefix>gatherq<ssemodesuffix>\t{%4, %6, %x0|%x0, %6, %4}";
+  return "%M2v<sseintprefix>gatherq<ssemodesuffix>\t{%4, %6, %0|%0, %6, %4}";
 }
   [(set_attr "type" "ssemov")
    (set_attr "prefix" "vex")
                     (const_int 2) (const_int 3)])))
    (clobber (match_scratch:VI4F_256 1 "=&x"))]
   "TARGET_AVX2"
-  "v<sseintprefix>gatherq<ssemodesuffix>\t{%5, %7, %0|%0, %7, %5}"
+  "%M3v<sseintprefix>gatherq<ssemodesuffix>\t{%5, %7, %0|%0, %7, %5}"
   [(set_attr "type" "ssemov")
    (set_attr "prefix" "vex")
    (set_attr "mode" "<sseinsnmode>")])
                     (const_int 2) (const_int 3)])))
    (clobber (match_scratch:VI4F_256 1 "=&x"))]
   "TARGET_AVX2"
-  "v<sseintprefix>gatherq<ssemodesuffix>\t{%4, %6, %0|%0, %6, %4}"
+  "%M2v<sseintprefix>gatherq<ssemodesuffix>\t{%4, %6, %0|%0, %6, %4}"
   [(set_attr "type" "ssemov")
    (set_attr "prefix" "vex")
    (set_attr "mode" "<sseinsnmode>")])
 
-;; Memory operand override for -masm=intel of the v*gatherq* patterns.
-(define_mode_attr gatherq_mode
-  [(V4SI "q") (V2DI "x") (V4SF "q") (V2DF "x")
-   (V8SI "x") (V4DI "t") (V8SF "x") (V4DF "t")
-   (V16SI "t") (V8DI "g") (V16SF "t") (V8DF "g")])
-
 (define_expand "<avx512>_gathersi<mode>"
   [(parallel [(set (match_operand:VI48F 0 "register_operand")
                   (unspec:VI48F
          UNSPEC_GATHER))
    (clobber (match_scratch:<avx512fmaskmode> 2 "=&Yk"))]
   "TARGET_AVX512F"
-  "v<sseintprefix>gatherd<ssemodesuffix>\t{%6, %0%{%2%}|%0%{%2%}, %<xtg_mode>6}"
+;; %X6 so that we don't emit any *WORD PTR for -masm=intel, as
+;; gas changed what it requires incompatibly.
+  "%M4v<sseintprefix>gatherd<ssemodesuffix>\t{%6, %0%{%2%}|%0%{%2%}, %X6}"
   [(set_attr "type" "ssemov")
    (set_attr "prefix" "evex")
    (set_attr "mode" "<sseinsnmode>")])
          UNSPEC_GATHER))
    (clobber (match_scratch:<avx512fmaskmode> 1 "=&Yk"))]
   "TARGET_AVX512F"
-  "v<sseintprefix>gatherd<ssemodesuffix>\t{%5, %0%{%1%}|%0%{%1%}, %<xtg_mode>5}"
+;; %X5 so that we don't emit any *WORD PTR for -masm=intel, as
+;; gas changed what it requires incompatibly.
+  "%M3v<sseintprefix>gatherd<ssemodesuffix>\t{%5, %0%{%1%}|%0%{%1%}, %X5}"
   [(set_attr "type" "ssemov")
    (set_attr "prefix" "evex")
    (set_attr "mode" "<sseinsnmode>")])
          UNSPEC_GATHER))
    (clobber (match_scratch:QI 2 "=&Yk"))]
   "TARGET_AVX512F"
-{
-  return "v<sseintprefix>gatherq<ssemodesuffix>\t{%6, %1%{%2%}|%1%{%2%}, %<gatherq_mode>6}";
-}
+;; %X6 so that we don't emit any *WORD PTR for -masm=intel, as
+;; gas changed what it requires incompatibly.
+  "%M4v<sseintprefix>gatherq<ssemodesuffix>\t{%6, %1%{%2%}|%1%{%2%}, %X6}"
   [(set_attr "type" "ssemov")
    (set_attr "prefix" "evex")
    (set_attr "mode" "<sseinsnmode>")])
    (clobber (match_scratch:QI 1 "=&Yk"))]
   "TARGET_AVX512F"
 {
+  /* %X5 so that we don't emit any *WORD PTR for -masm=intel, as
+     gas changed what it requires incompatibly.  */
   if (<MODE>mode != <VEC_GATHER_SRCDI>mode)
     {
       if (<MODE_SIZE> != 64)
-       return "v<sseintprefix>gatherq<ssemodesuffix>\t{%5, %x0%{%1%}|%x0%{%1%}, %<gatherq_mode>5}";
+       return "%M3v<sseintprefix>gatherq<ssemodesuffix>\t{%5, %x0%{%1%}|%x0%{%1%}, %X5}";
       else
-       return "v<sseintprefix>gatherq<ssemodesuffix>\t{%5, %t0%{%1%}|%t0%{%1%}, %t5}";
+       return "%M3v<sseintprefix>gatherq<ssemodesuffix>\t{%5, %t0%{%1%}|%t0%{%1%}, %X5}";
     }
-  return "v<sseintprefix>gatherq<ssemodesuffix>\t{%5, %0%{%1%}|%0%{%1%}, %<gatherq_mode>5}";
+  return "%M3v<sseintprefix>gatherq<ssemodesuffix>\t{%5, %0%{%1%}|%0%{%1%}, %X5}";
 }
   [(set_attr "type" "ssemov")
    (set_attr "prefix" "evex")
          UNSPEC_SCATTER))
    (clobber (match_scratch:<avx512fmaskmode> 1 "=&Yk"))]
   "TARGET_AVX512F"
-  "v<sseintprefix>scatterd<ssemodesuffix>\t{%3, %5%{%1%}|%5%{%1%}, %3}"
+;; %X5 so that we don't emit any *WORD PTR for -masm=intel, as
+;; gas changed what it requires incompatibly.
+  "%M0v<sseintprefix>scatterd<ssemodesuffix>\t{%3, %5%{%1%}|%X5%{%1%}, %3}"
   [(set_attr "type" "ssemov")
    (set_attr "prefix" "evex")
    (set_attr "mode" "<sseinsnmode>")])
          UNSPEC_SCATTER))
    (clobber (match_scratch:QI 1 "=&Yk"))]
   "TARGET_AVX512F"
-{
-  if (GET_MODE_SIZE (GET_MODE_INNER (<MODE>mode)) == 8)
-    return "v<sseintprefix>scatterq<ssemodesuffix>\t{%3, %5%{%1%}|%5%{%1%}, %3}";
-  return "v<sseintprefix>scatterq<ssemodesuffix>\t{%3, %5%{%1%}|%t5%{%1%}, %3}";
-}
+;; %X5 so that we don't emit any *WORD PTR for -masm=intel, as
+;; gas changed what it requires incompatibly.
+  "%M0v<sseintprefix>scatterq<ssemodesuffix>\t{%3, %5%{%1%}|%X5%{%1%}, %3}"
   [(set_attr "type" "ssemov")
    (set_attr "prefix" "evex")
    (set_attr "mode" "<sseinsnmode>")])
    (set_attr "mode" "<MODE>")])
 
 (define_insn "avx512dq_fpclass<mode><mask_scalar_merge_name>"
-  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=Yk")
+  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
           (unspec:<avx512fmaskmode>
-            [(match_operand:VF_AVX512VL 1 "register_operand" "v")
+            [(match_operand:VF_AVX512VL 1 "vector_operand" "vm")
              (match_operand:QI 2 "const_0_to_255_operand" "n")]
              UNSPEC_FPCLASS))]
    "TARGET_AVX512DQ"
-   "vfpclass<ssemodesuffix>\t{%2, %1, %0<mask_scalar_merge_operand3>|%0<mask_scalar_merge_operand3>, %1, %2}";
+   "vfpclass<ssemodesuffix><vecmemsuffix>\t{%2, %1, %0<mask_scalar_merge_operand3>|%0<mask_scalar_merge_operand3>, %1, %2}";
   [(set_attr "type" "sse")
    (set_attr "length_immediate" "1")
    (set_attr "prefix" "evex")
    (set_attr "mode" "<MODE>")])
 
-(define_insn "avx512dq_vmfpclass<mode>"
-  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=Yk")
+(define_insn "avx512dq_vmfpclass<mode><mask_scalar_merge_name>"
+  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
        (and:<avx512fmaskmode>
          (unspec:<avx512fmaskmode>
-           [(match_operand:VF_128 1 "register_operand" "v")
+           [(match_operand:VF_128 1 "vector_operand" "vm")
              (match_operand:QI 2 "const_0_to_255_operand" "n")]
            UNSPEC_FPCLASS)
          (const_int 1)))]
    "TARGET_AVX512DQ"
-   "vfpclass<ssescalarmodesuffix>\t{%2, %1, %0|%0, %1, %2}";
+   "vfpclass<ssescalarmodesuffix>\t{%2, %1, %0<mask_scalar_merge_operand3>|%0<mask_scalar_merge_operand3>, %1, %2}";
   [(set_attr "type" "sse")
    (set_attr "length_immediate" "1")
    (set_attr "prefix" "evex")
   "vpopcnt<ssemodesuffix>\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}")
 
 ;; Save multiple registers out-of-line.
-(define_insn "save_multiple<mode>"
+(define_insn "*save_multiple<mode>"
   [(match_parallel 0 "save_multiple"
     [(use (match_operand:P 1 "symbol_operand"))])]
   "TARGET_SSE && TARGET_64BIT"
   "call\t%P1")
 
 ;; Restore multiple registers out-of-line.
-(define_insn "restore_multiple<mode>"
+(define_insn "*restore_multiple<mode>"
   [(match_parallel 0 "restore_multiple"
     [(use (match_operand:P 1 "symbol_operand"))])]
   "TARGET_SSE && TARGET_64BIT"
   "call\t%P1")
 
 ;; Restore multiple registers out-of-line and return.
-(define_insn "restore_multiple_and_return<mode>"
+(define_insn "*restore_multiple_and_return<mode>"
   [(match_parallel 0 "restore_multiple"
     [(return)
      (use (match_operand:P 1 "symbol_operand"))
 
 ;; Restore multiple registers out-of-line when hard frame pointer is used,
 ;; perform the leave operation prior to returning (from the function).
-(define_insn "restore_multiple_leave_return<mode>"
+(define_insn "*restore_multiple_leave_return<mode>"
   [(match_parallel 0 "restore_multiple"
     [(return)
      (use (match_operand:P 1 "symbol_operand"))
   [(set (match_operand:VI1_AVX512VL_F 0 "register_operand" "=v")
        (unspec:VI1_AVX512VL_F
          [(match_operand:VI1_AVX512VL_F 1 "register_operand" "v")
-          (match_operand:VI1_AVX512VL_F 2 "vector_operand" "v")]
+          (match_operand:VI1_AVX512VL_F 2 "vector_operand" "vm")]
          UNSPEC_VAESDEC))]
   "TARGET_VAES"
   "vaesdec\t{%2, %1, %0|%0, %1, %2}"
   [(set (match_operand:VI1_AVX512VL_F 0 "register_operand" "=v")
        (unspec:VI1_AVX512VL_F
          [(match_operand:VI1_AVX512VL_F 1 "register_operand" "v")
-          (match_operand:VI1_AVX512VL_F 2 "vector_operand" "v")]
+          (match_operand:VI1_AVX512VL_F 2 "vector_operand" "vm")]
          UNSPEC_VAESDECLAST))]
   "TARGET_VAES"
   "vaesdeclast\t{%2, %1, %0|%0, %1, %2}"
   [(set_attr "mode" "DI")])
 
 (define_insn "avx512vl_vpshufbitqmb<mode><mask_scalar_merge_name>"
-  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=Yk")
+  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
        (unspec:<avx512fmaskmode>
          [(match_operand:VI1_AVX512VLBW 1 "register_operand" "v")
           (match_operand:VI1_AVX512VLBW 2 "nonimmediate_operand" "vm")]
   "vpshufbitqmb\t{%2, %1, %0<mask_scalar_merge_operand3>|%0<mask_scalar_merge_operand3>, %1, %2}"
   [(set_attr "prefix" "evex")
    (set_attr "mode" "<sseinsnmode>")])
+
+(define_mode_iterator BF16 [V32HI (V16HI "TARGET_AVX512VL") (V8HI "TARGET_AVX512VL")])
+;; Converting from BF to SF
+(define_mode_attr bf16_cvt_2sf
+  [(V32HI  "V16SF") (V16HI  "V8SF") (V8HI  "V4SF")])
+;; Converting from SF to BF
+(define_mode_attr sf_cvt_bf16
+  [(V4SF  "V8HI") (V8SF  "V8HI") (V16SF  "V16HI")])
+;; Mapping from BF to SF
+(define_mode_attr sf_bf16
+  [(V4SF  "V8HI") (V8SF  "V16HI") (V16SF  "V32HI")])
+
+(define_expand "avx512f_cvtne2ps2bf16_<mode>_maskz"
+  [(match_operand:BF16 0 "register_operand")
+   (match_operand:<bf16_cvt_2sf> 1 "register_operand")
+   (match_operand:<bf16_cvt_2sf> 2 "register_operand")
+   (match_operand:<avx512fmaskmode> 3 "register_operand")]
+  "TARGET_AVX512BF16"
+{
+  emit_insn (gen_avx512f_cvtne2ps2bf16_<mode>_mask(operands[0], operands[1],
+    operands[2], CONST0_RTX(<MODE>mode), operands[3]));
+  DONE;
+})
+
+(define_insn "avx512f_cvtne2ps2bf16_<mode><mask_name>"
+  [(set (match_operand:BF16 0 "register_operand" "=v")
+       (unspec:BF16
+         [(match_operand:<bf16_cvt_2sf> 1 "register_operand" "v")
+          (match_operand:<bf16_cvt_2sf> 2 "register_operand" "v")]
+        UNSPEC_VCVTNE2PS2BF16))]
+  "TARGET_AVX512BF16"
+  "vcvtne2ps2bf16\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}")
+
+(define_expand "avx512f_cvtneps2bf16_<mode>_maskz"
+  [(match_operand:<sf_cvt_bf16> 0 "register_operand")
+   (match_operand:VF1_AVX512VL 1 "register_operand")
+   (match_operand:<avx512fmaskmode> 2 "register_operand")]
+  "TARGET_AVX512BF16"
+{
+  emit_insn (gen_avx512f_cvtneps2bf16_<mode>_mask(operands[0], operands[1],
+    CONST0_RTX(<sf_cvt_bf16>mode), operands[2]));
+  DONE;
+})
+
+(define_insn "avx512f_cvtneps2bf16_<mode><mask_name>"
+  [(set (match_operand:<sf_cvt_bf16> 0 "register_operand" "=v")
+       (unspec:<sf_cvt_bf16>
+         [(match_operand:VF1_AVX512VL 1 "register_operand" "v")]
+        UNSPEC_VCVTNEPS2BF16))]
+  "TARGET_AVX512BF16"
+  "vcvtneps2bf16\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}")
+
+(define_expand "avx512f_dpbf16ps_<mode>_maskz"
+  [(match_operand:VF1_AVX512VL 0 "register_operand")
+   (match_operand:VF1_AVX512VL 1 "register_operand")
+   (match_operand:<sf_bf16> 2 "register_operand")
+   (match_operand:<sf_bf16> 3 "register_operand")
+   (match_operand:<avx512fmaskhalfmode> 4 "register_operand")]
+  "TARGET_AVX512BF16"
+{
+  emit_insn (gen_avx512f_dpbf16ps_<mode>_maskz_1(operands[0], operands[1],
+    operands[2], operands[3], CONST0_RTX(<MODE>mode), operands[4]));
+  DONE;
+})
+
+(define_insn "avx512f_dpbf16ps_<mode><maskz_half_name>"
+  [(set (match_operand:VF1_AVX512VL 0 "register_operand" "=v")
+       (unspec:VF1_AVX512VL
+         [(match_operand:VF1_AVX512VL 1 "register_operand" "0")
+          (match_operand:<sf_bf16> 2 "register_operand" "v")
+          (match_operand:<sf_bf16> 3 "register_operand" "v")]
+        UNSPEC_VDPBF16PS))]
+  "TARGET_AVX512BF16"
+  "vdpbf16ps\t{%3, %2, %0<maskz_half_operand4>|%0<maskz_half_operand4>, %2, %3}")
+
+(define_insn "avx512f_dpbf16ps_<mode>_mask"
+  [(set (match_operand:VF1_AVX512VL 0 "register_operand" "=v")
+       (vec_merge:VF1_AVX512VL
+         (unspec:VF1_AVX512VL
+           [(match_operand:VF1_AVX512VL 1 "register_operand" "0")
+            (match_operand:<sf_bf16> 2 "register_operand" "v")
+            (match_operand:<sf_bf16> 3 "register_operand" "v")]
+             UNSPEC_VDPBF16PS)
+          (match_dup 1)
+          (match_operand:<avx512fmaskhalfmode> 4 "register_operand" "Yk")))]
+  "TARGET_AVX512BF16"
+  "vdpbf16ps\t{%3, %2, %0%{%4%}|%0%{%4%}, %2, %3}")